Sizzing committed on
Commit
0f8f2c1
·
verified ·
1 Parent(s): eea2be5

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. Dockerfile +8 -5
  2. Makefile +6 -5
  3. README.md +369 -166
  4. __init__.py +7 -2
  5. aws_infra/aws_infra/app.py +122 -0
  6. aws_infra/aws_infra/services/acm.py +15 -0
  7. aws_infra/aws_infra/services/alb.py +32 -0
  8. aws_infra/aws_infra/services/apigateway.py +12 -0
  9. aws_infra/aws_infra/services/apigateway_v1.py +21 -0
  10. aws_infra/aws_infra/services/athena.py +23 -0
  11. aws_infra/aws_infra/services/cloudformation/__init__.py +27 -0
  12. aws_infra/aws_infra/services/cloudwatch.py +24 -0
  13. aws_infra/aws_infra/services/cloudwatch_logs.py +23 -0
  14. aws_infra/aws_infra/services/cognito.py +43 -0
  15. aws_infra/aws_infra/services/dynamodb.py +23 -0
  16. aws_infra/aws_infra/services/ec2.py +68 -0
  17. aws_infra/aws_infra/services/ecs.py +20 -0
  18. aws_infra/aws_infra/services/efs.py +32 -0
  19. aws_infra/aws_infra/services/elasticache.py +26 -0
  20. aws_infra/aws_infra/services/emr.py +31 -0
  21. aws_infra/aws_infra/services/eventbridge.py +23 -0
  22. aws_infra/aws_infra/services/firehose.py +14 -0
  23. aws_infra/aws_infra/services/glue.py +30 -0
  24. aws_infra/aws_infra/services/iam_sts.py +37 -0
  25. aws_infra/aws_infra/services/kinesis.py +19 -0
  26. aws_infra/aws_infra/services/lambda_svc.py +30 -0
  27. aws_infra/aws_infra/services/rds.py +28 -0
  28. aws_infra/aws_infra/services/route53.py +22 -0
  29. aws_infra/aws_infra/services/s3.py +33 -0
  30. aws_infra/aws_infra/services/secretsmanager.py +17 -0
  31. aws_infra/aws_infra/services/ses.py +22 -0
  32. aws_infra/aws_infra/services/ses_v2.py +17 -0
  33. aws_infra/aws_infra/services/sns.py +21 -0
  34. aws_infra/aws_infra/services/sqs.py +17 -0
  35. aws_infra/aws_infra/services/ssm.py +15 -0
  36. aws_infra/aws_infra/services/stepfunctions.py +19 -0
  37. aws_infra/aws_infra/services/waf.py +21 -0
  38. client.py +15 -6
  39. inference-complete.py +13 -14
  40. inference.py +6 -0
  41. models.py +124 -8
  42. pyproject.toml +5 -0
  43. server/app.py +23 -0
  44. server/aws_rl_env_environment.py +122 -23
  45. server/services/aws_backend.py +57 -1
  46. server/services/chaos_engine.py +168 -0
  47. server/services/curriculum.py +58 -18
  48. server/services/drift_engine.py +67 -0
  49. server/services/environment_designer.py +10 -1
  50. server/services/episode_tracker.py +96 -0
Dockerfile CHANGED
@@ -42,16 +42,16 @@ RUN if ! command -v uv >/dev/null 2>&1; then \
42
  # If uv.lock exists, use it; otherwise resolve on the fly
43
  RUN --mount=type=cache,target=/root/.cache/uv \
44
  if [ -f uv.lock ]; then \
45
- uv sync --frozen --no-install-project --no-editable; \
46
  else \
47
- uv sync --no-install-project --no-editable; \
48
  fi
49
 
50
  RUN --mount=type=cache,target=/root/.cache/uv \
51
  if [ -f uv.lock ]; then \
52
- uv sync --frozen --no-editable; \
53
  else \
54
- uv sync --no-editable; \
55
  fi
56
 
57
  # Final runtime stage
@@ -90,7 +90,10 @@ ENV PYTHONPATH="/app/env:$PYTHONPATH"
90
 
91
 
92
  # DEV_MODE=1 enables live reload via --reload flag
93
- ENV DEV_MODE=0
 
 
 
94
 
95
  # Entrypoint: start aws_infra in background, then run the FastAPI server
96
  CMD ["sh", "-c", "aws_infra -d & sleep 2 && uvicorn server.app:app --host 0.0.0.0 --port 8000 $([ \"$DEV_MODE\" = '1' ] && echo '--reload --reload-dir /app/env')"]
 
42
  # If uv.lock exists, use it; otherwise resolve on the fly
43
  RUN --mount=type=cache,target=/root/.cache/uv \
44
  if [ -f uv.lock ]; then \
45
+ uv sync --frozen --extra dev --no-install-project --no-editable; \
46
  else \
47
+ uv sync --extra dev --no-install-project --no-editable; \
48
  fi
49
 
50
  RUN --mount=type=cache,target=/root/.cache/uv \
51
  if [ -f uv.lock ]; then \
52
+ uv sync --frozen --extra dev --no-editable; \
53
  else \
54
+ uv sync --extra dev --no-editable; \
55
  fi
56
 
57
  # Final runtime stage
 
90
 
91
 
92
  # DEV_MODE=1 enables live reload via --reload flag
93
+ ENV DEV_MODE=1
94
+
95
+ ENV API_BASE_URL=https://router.huggingface.co/v1
96
+ ENV MODEL_NAME=Qwen/Qwen2.5-72B-Instruct
97
 
98
  # Entrypoint: start aws_infra in background, then run the FastAPI server
99
  CMD ["sh", "-c", "aws_infra -d & sleep 2 && uvicorn server.app:app --host 0.0.0.0 --port 8000 $([ \"$DEV_MODE\" = '1' ] && echo '--reload --reload-dir /app/env')"]
Makefile CHANGED
@@ -21,9 +21,6 @@ install: ## Install project dependencies
21
  install-dev: ## Install project with dev dependencies
22
  $(UV) sync --frozen --extra dev
23
 
24
- .PHONY: install-train
25
- install-train: ## Install project with training dependencies (trl, torch, peft, etc.)
26
- $(UV) sync --frozen --extra training
27
 
28
  .PHONY: install-all
29
  install-all: ## Install project with all dependencies (dev + training)
@@ -39,7 +36,7 @@ lock: ## Update the lockfile
39
 
40
  .PHONY: run
41
  run: ## Run with MiniStack + FastAPI server (mirrors Docker CMD)
42
- ministack & sleep 2 && $(UV) run uvicorn server.app:app --host $(SERVER_HOST) --port $(SERVER_PORT)
43
 
44
  # ──────────────────────────────────────────────
45
  # Code Quality
@@ -82,7 +79,7 @@ docker-run-dev: ## Run Docker container in dev mode with live reload
82
 
83
  .PHONY: docker-run-detach
84
  docker-run-detach: ## Run Docker container in background
85
- docker run -d --rm -p $(SERVER_PORT):8000 --name $(DOCKER_IMAGE) $(DOCKER_IMAGE):$(DOCKER_TAG)
86
 
87
  .PHONY: docker-stop
88
  docker-stop: ## Stop the running Docker container
@@ -100,6 +97,10 @@ docker-shell: ## Open a shell in the running Docker container
100
  docker-clean: ## Stop and remove all running containers for this image
101
  @docker ps -q --filter ancestor=$(DOCKER_IMAGE):$(DOCKER_TAG) | xargs -r docker rm -f
102
 
 
 
 
 
103
  .PHONY: docker-health
104
  docker-health: ## Check health of the running container
105
  @curl -sf http://localhost:$(SERVER_PORT)/health && echo " OK" || echo " FAIL"
 
21
  install-dev: ## Install project with dev dependencies
22
  $(UV) sync --frozen --extra dev
23
 
 
 
 
24
 
25
  .PHONY: install-all
26
  install-all: ## Install project with all dependencies (dev + training)
 
36
 
37
  .PHONY: run
38
  run: ## Run with MiniStack + FastAPI server (mirrors Docker CMD)
39
+ aws_infra -d & sleep 2 && $(UV) run uvicorn server.app:app --host $(SERVER_HOST) --port $(SERVER_PORT) --reload
40
 
41
  # ──────────────────────────────────────────────
42
  # Code Quality
 
79
 
80
  .PHONY: docker-run-detach
81
  docker-run-detach: ## Run Docker container in background
82
+ docker run -d --rm --name $(DOCKER_IMAGE) -p $(SERVER_PORT):8000 -v $(PWD):/app/env -v /app/env/.venv -e DEV_MODE=1 $(DOCKER_IMAGE):$(DOCKER_TAG)
83
 
84
  .PHONY: docker-stop
85
  docker-stop: ## Stop the running Docker container
 
97
  docker-clean: ## Stop and remove all running containers for this image
98
  @docker ps -q --filter ancestor=$(DOCKER_IMAGE):$(DOCKER_TAG) | xargs -r docker rm -f
99
 
100
+ .PHONY: docker-test
101
+ docker-test: ## Run tests inside the running Docker container
102
+ docker exec $(DOCKER_IMAGE) python -m pytest env/tests -v
103
+
104
  .PHONY: docker-health
105
  docker-health: ## Check health of the running container
106
  @curl -sf http://localhost:$(SERVER_PORT)/health && echo " OK" || echo " FAIL"
README.md CHANGED
@@ -11,19 +11,242 @@ tags:
11
  - openenv
12
  ---
13
 
14
- # AWS RL Environment
15
 
16
- A **Gymnasium-style RL environment** for training LLM agents on real-world AWS cloud operations. The agent sends AWS CLI commands as actions, receives structured observations, and progresses through a **curriculum of 21 tasks** across 5 difficulty tiers — from basic listing to SRE incident response.
17
 
18
- The environment runs a **vendored MiniStack emulator** (34 AWS services, in-memory, zero-cost) inside the same Docker container, so no AWS account is needed.
19
 
20
- ## Key Innovations
21
 
22
- - **Priority-queue curriculum** — Tasks are selected by weakness, novelty, and spaced-repetition schedules instead of random or round-robin sampling
23
- - **Spaced repetition** — Graduated tasks resurface at exponentially increasing intervals (3 -> 6 -> 12 -> ... -> 48 episodes) to prevent catastrophic forgetting
24
- - **Anti-reward-hacking** — Grading verifies ground-truth state in MiniStack, not agent output; partial credit is capped at 0.99; monotonic progress prevents manipulation
25
- - **SRE incident tasks** — Expert-tier tasks provision broken infrastructure, then require the agent to diagnose and fix it
26
- - **Shaped rewards** — Dense reward signals (progress bonuses, failure penalties) in [0.0, 1.0] guide exploration without enabling gaming
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
  ## Quick Start
29
 
@@ -46,76 +269,83 @@ result = env.reset()
46
  result = env.step(AwsRlAction(command="aws s3 ls"))
47
  ```
48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  ---
50
 
51
  ## Architecture
52
 
53
  ```
54
- ┌─────────────────────────────────────────────────────────
55
- │ Docker Container
56
-
57
  │ ┌─────────────────────┐ ┌────────────────────┐ │
58
- │ │ FastAPI RL Server │ │ MiniStack │ │
59
- │ │ (port 8000) │─────>│ (port 4566) │ │
60
- │ │ │ │ 34 AWS services │ │
61
- │ │ - Environment │ │ In-memory state │ │
62
- │ │ - Curriculum │ │ Reset API │ │
63
- │ │ - Grading Engine │ │ │ │
64
- │ │ - Episode Tracker │ │ │ │
 
65
  │ └─────────────────────┘ └────────────────────┘ │
66
- │ ^ ^
67
- │ | OpenEnv HTTP/WS | AWS CLI calls
68
- └──────────┼─────────────────────────────┼────────────────
69
  | |
70
- RL Agent (client) (internal only)
71
  ```
72
 
73
  ### Episode Lifecycle
74
 
75
- 1. **`reset()`** -- Wipes MiniStack state, selects next task from curriculum, provisions setup commands (if any), returns initial observation
76
- 2. **`step(action)`** -- Validates command (`aws` prefix only), executes against MiniStack, records in tracker, grades with shaped reward, returns observation
77
- 3. **Terminates** when `task_achieved == True` or max steps reached
 
78
 
79
  ---
80
 
 
81
  ## Core Classes
82
 
83
  ### `AwsRlEnvironment`
84
 
85
- [server/aws_rl_env_environment.py](server/aws_rl_env_environment.py) -- Implements the OpenEnv `Environment` interface. Orchestrates all services.
86
 
87
  | Method | Description |
88
  |--------|-------------|
89
  | `reset()` | Wipe infra, select task, provision setup, return initial observation |
90
- | `step(action)` | Execute command, grade, update curriculum, return observation |
91
 
92
  ### `Curriculum`
93
 
94
- [server/services/curriculum.py](server/services/curriculum.py) -- Priority-queue-based task selection with progressive difficulty.
95
 
96
  Selects the next task using a **max-heap scored by**:
97
 
98
  ```
99
  score = (
100
  novelty_bonus # +100 if never attempted (explore first)
101
- + weakness_weight # +50 * (1 - task_success_rate) -- worse tasks get higher priority
102
  + spaced_rep_bonus # +30 if graduated task is "due" for re-test
103
  - recency_penalty # -20 if attempted in last 2 episodes (ensure variety)
104
  )
105
  ```
106
 
107
- | Feature | Detail |
108
- |---------|--------|
109
- | **Per-task mastery** | Sliding-window success rate with exponential decay (0.85^i weighting) |
110
- | **Graduation** | Task is "graduated" when success rate >= mastery_threshold in window |
111
- | **Spaced repetition** | Graduated tasks resurface at doubling intervals (3 -> 6 -> ... -> 48 episodes) |
112
- | **Tier progression** | Advance when tier success rate >= advance_rate after min_episodes |
113
- | **Fast-track** | Skip min_episodes wait after 3 consecutive episodes at >= 90% success |
114
- | **Skill profile** | `get_stats()` returns per-task success rates, weak spots, and due re-tests |
115
-
116
  ### `TaskGrader`
117
 
118
- [server/services/task_grader.py](server/services/task_grader.py) -- Evaluates task completion using a dispatcher pattern. Rewards are always in [0.0, 1.0].
119
 
120
  **Grading strategies by tier:**
121
 
@@ -127,71 +357,101 @@ score = (
127
  | Advanced | Multi-step + services | All steps completed AND all required services touched |
128
  | Expert | State checks | Runs arbitrary AWS CLI commands to assert end-state (ground truth) |
129
 
130
- **Reward shaping:**
131
 
132
- ```
133
- if task_achieved: reward = 1.0
134
- else:
135
- reward = partial_progress * 0.8 # base: scaled to [0.0, 0.8]
136
- if progress_increased: reward += 0.1 # dense signal for advancing
137
- if command_failed: reward *= 0.5 # penalty for errors
138
- reward = clamp(reward, 0.0, 0.99) # never 1.0 without completion
139
- ```
140
 
141
  ### `EpisodeTracker`
142
 
143
- [server/services/episode_tracker.py](server/services/episode_tracker.py) -- Maintains per-episode step history. Parses AWS CLI commands to extract (service, operation, resource) tuples. Tracks credited operations for deduplication and monotonic progress.
144
 
145
  ### `ResourceVerifier`
146
 
147
- [server/services/resource_verifier.py](server/services/resource_verifier.py) -- Queries MiniStack directly to verify ground-truth resource state. Service-specific checks for S3, DynamoDB, Lambda, SQS, SNS, IAM, and API Gateway. Also evaluates `StateCheck` assertions (substring match, JSON path extraction).
148
 
149
  ### `EnvironmentDesigner`
150
 
151
- [server/services/environment_designer.py](server/services/environment_designer.py) -- Provisions initial AWS state via setup commands before the agent acts. Used by SRE/expert tasks to create broken infrastructure the agent must fix.
152
 
153
  ### `AwsBackend`
154
 
155
- [server/services/aws_backend.py](server/services/aws_backend.py) -- Executes AWS CLI commands against MiniStack (`AWS_ENDPOINT_URL=http://localhost:4566`). Provides `reset_environment()` via MiniStack's `/_ministack/reset` endpoint.
156
 
157
  ### `AwsRlEnv` (Client)
158
 
159
- [client.py](client.py) -- OpenEnv HTTP/WebSocket client. Wraps `reset()` and `step()` calls to the server.
160
 
161
  ---
162
 
163
  ## Data Models
164
 
165
- [models.py](models.py) -- All Pydantic models and type aliases.
166
 
167
- ### Action & Observation
168
 
169
  ```python
170
  class AwsRlAction(Action):
171
  command: str # AWS CLI command, e.g. "aws s3 ls"
 
172
 
 
 
 
173
  class AwsRlObservation(Observation):
174
  episode_id: EpisodeID
175
  step_count: StepCount
176
  command_success: bool
177
  command_output: str # stdout from AWS CLI
178
  error: str # stderr if failed
179
- resources: dict[AwsService, dict | list | str]
180
- task: Task | None # current task definition
181
  task_achieved: bool
182
- done: bool
183
- reward: float # shaped reward in [0.0, 1.0]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184
  ```
185
 
186
  ### Task Definitions
187
 
188
  ```python
189
  class Task:
190
- task_id: TaskID # 0-20
191
  difficulty: TaskDifficulty # warmup | beginner | intermediate | advanced | expert
192
  description: str # human-readable goal
193
  success_criteria: SuccessCriteria
194
- setup_commands: list[SetupCommand] # pre-provision for SRE tasks
 
 
 
 
 
 
 
 
 
195
 
196
  class SuccessCriteria:
197
  command_contains: str | None # warmup/beginner
@@ -211,6 +471,7 @@ class TierConfig:
211
  mastery_window: int # sliding window size (default: 10)
212
  mastery_threshold: float # per-task graduation threshold (default: 0.7)
213
  fast_track_rate: float # early promotion threshold (default: 0.9)
 
214
 
215
  class SpacedRepState:
216
  interval: int # episodes until next re-test (3 -> 48)
@@ -219,88 +480,6 @@ class SpacedRepState:
219
 
220
  ---
221
 
222
- ## Task Catalog (21 Tasks)
223
-
224
- ### Warmup (6 tasks) -- Simple listing operations
225
-
226
- | ID | Description | Service |
227
- |----|-------------|---------|
228
- | 0 | List all S3 buckets | S3 |
229
- | 1 | Describe EC2 instances | EC2 |
230
- | 2 | List DynamoDB tables | DynamoDB |
231
- | 3 | List Lambda functions | Lambda |
232
- | 4 | List SQS queues | SQS |
233
- | 5 | List SNS topics | SNS |
234
-
235
- ### Beginner (5 tasks) -- Single-resource creation with verification
236
-
237
- | ID | Description | Verified Resource |
238
- |----|-------------|-------------------|
239
- | 6 | Create an S3 bucket | Bucket exists in MiniStack |
240
- | 7 | Create a DynamoDB table | Table exists |
241
- | 8 | Create an SQS queue | Queue URL resolvable |
242
- | 9 | Create an SNS topic | Topic ARN in list |
243
- | 10 | Create a Lambda function | Function exists |
244
-
245
- ### Intermediate (4 tasks) -- Multi-step workflows
246
-
247
- | ID | Description | Steps |
248
- |----|-------------|-------|
249
- | 11 | Create S3 bucket + upload file | create-bucket, put-object |
250
- | 12 | Create DynamoDB table + insert item | create-table, put-item |
251
- | 13 | Create SNS topic + SQS queue + subscribe | create-topic, create-queue, subscribe |
252
- | 14 | Create IAM role + attach policy | create-role, attach-role-policy |
253
-
254
- ### Advanced (3 tasks) -- Cross-service architectures
255
-
256
- | ID | Description | Services | Steps |
257
- |----|-------------|----------|-------|
258
- | 15 | Lambda + SQS event source pipeline | Lambda, SQS, IAM | 4-5 steps |
259
- | 16 | Serverless API (DynamoDB + Lambda + API Gateway) | DynamoDB, Lambda, API Gateway, IAM | 7 steps |
260
- | 17 | Fan-out notification system (SNS + SQS) | SNS, SQS | 5 steps |
261
-
262
- ### Expert (3 tasks) -- SRE incident response
263
-
264
- | ID | Description | Setup | Fix Required |
265
- |----|-------------|-------|-------------|
266
- | 18 | Fix Lambda missing SQS permissions | Broken role + Lambda + queue | Attach SQS policy, create event source |
267
- | 19 | Enable S3 versioning + lifecycle | Bucket + object | Enable versioning, add lifecycle rule |
268
- | 20 | Fix DynamoDB throttling + alerting | Under-provisioned table + SNS | Scale to 50 RCU/WCU, subscribe SQS |
269
-
270
- Expert tasks use **state checks** (ground-truth AWS CLI assertions) to verify the fix, not just command matching.
271
-
272
- ---
273
-
274
- ## Anti-Reward-Hacking Measures
275
-
276
- | Defense | How it works |
277
- |---------|-------------|
278
- | **Ground-truth verification** | Grader queries MiniStack directly -- agent cannot fake resource state |
279
- | **Deduplication** | `EpisodeTracker.has_executed_operation()` prevents re-earning credit for repeated commands |
280
- | **Invisible grading** | Verification commands run server-side, invisible to the agent's observations |
281
- | **Command allowlisting** | Only commands starting with `aws` are executed; pipes and shell escape are rejected |
282
- | **No credit for read-only** | Running a `state_check` command earns no progress; only mutating `steps` earn credit |
283
- | **Monotonic progress** | `partial_progress` can only increase within an episode |
284
- | **Exact resource names** | `resource_exists` checks the exact name, not just any resource of that type |
285
- | **State checks verify final state** | Expert tasks run actual CLI commands against MiniStack at grading time |
286
-
287
- ---
288
-
289
- ## Supported AWS Services (34)
290
-
291
- | Category | Services |
292
- |----------|----------|
293
- | **Storage & DB** | S3, DynamoDB, RDS, ElastiCache, EFS |
294
- | **Compute** | Lambda, ECS, EC2, Step Functions |
295
- | **Messaging** | SQS, SNS, Kinesis, EventBridge, Firehose |
296
- | **API** | API Gateway v1/v2, ALB/ELBv2 |
297
- | **Security** | IAM, STS, Cognito, ACM, WAF v2, Secrets Manager |
298
- | **Monitoring** | CloudWatch, CloudWatch Logs, SSM |
299
- | **Infrastructure** | CloudFormation, Route53 |
300
- | **Other** | SES, Athena, Glue, EMR |
301
-
302
- ---
303
-
304
  ## Project Structure
305
 
306
  ```
@@ -309,22 +488,48 @@ aws-rl-env/
309
  ├── models.py # Pydantic data models & type aliases
310
  ├── client.py # AwsRlEnv OpenEnv client
311
  ├── inference.py # LLM agent inference script
 
312
  ├── server/
313
  │ ├── app.py # FastAPI application + web UI endpoints
314
  │ ├── aws_rl_env_environment.py # Core RL environment (reset/step)
 
 
 
 
 
315
  │ └── services/
316
  │ ├── aws_backend.py # MiniStack command executor
317
  │ ├── task_grader.py # Grading engine with reward shaping
318
  │ ├── curriculum.py # Curriculum learning manager
319
- │ ├── episode_tracker.py # Per-episode step history
320
  │ ├── resource_verifier.py # Ground-truth state verification
321
  │ ├── environment_designer.py # Setup provisioning for SRE tasks
 
 
 
 
322
  │ └── tasks/
323
- │ ├── warmup.yaml # 6 listing tasks
324
- │ ├── beginner.yaml # 5 creation tasks
325
- │ ├── intermediate.yaml # 4 multi-step tasks
326
- │ ├── advanced.yaml # 3 architecture tasks
327
- ── expert.yaml # 3 SRE incident tasks
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
328
  ├── aws_infra/ # Vendored MiniStack emulator
329
  │ └── aws_infra/
330
  │ ├── app.py # MiniStack ASGI router
@@ -351,19 +556,7 @@ make docker-health # Health check
351
 
352
  ### Local (without Docker)
353
 
354
- ```bash
355
- # Terminal 1: Start MiniStack
356
- pip install ministack
357
- ministack # port 4566
358
-
359
- # Terminal 2: Start RL server
360
- export AWS_ENDPOINT_URL=http://localhost:4566
361
- export AWS_ACCESS_KEY_ID=test
362
- export AWS_SECRET_ACCESS_KEY=test
363
- uv run uvicorn server.app:app --reload --host 0.0.0.0 --port 8000
364
- ```
365
-
366
- Or use the combined Makefile target:
367
 
368
  ```bash
369
  make run # Starts MiniStack + server
@@ -388,9 +581,9 @@ make openenv-push # Push to HuggingFace Spaces
388
  | `AWS_SECRET_ACCESS_KEY` | `test` | AWS credentials (any value works) |
389
  | `AWS_DEFAULT_REGION` | `us-east-1` | AWS region |
390
  | `MAX_STEPS` | `15` | Max steps per episode |
391
- | `API_BASE_URL` | -- | LLM API endpoint (for inference.py) |
392
- | `MODEL_NAME` | -- | LLM model name (for inference.py) |
393
- | `HF_TOKEN` | -- | HuggingFace token (for inference.py) |
394
  | `TEMPERATURE` | `0.7` | LLM sampling temperature |
395
 
396
  ---
@@ -413,3 +606,13 @@ curriculum.get_stats()
413
  # "avg_reward_last_10": 0.65
414
  # }
415
  ```
 
 
 
 
 
 
 
 
 
 
 
11
  - openenv
12
  ---
13
 
14
+ # AWS Cloud CLI and SRE Reinforcement Learning Environment
15
 
16
+ An **OpenEnv RL environment** for training AI agents on real-world AWS cloud operations. The agent sends AWS CLI commands as actions, receives structured observations, and progresses through a **curriculum of 120+ tasks** across 5 difficulty tiers — from basic listing to SRE incident response and security posture auditing.
17
 
18
+ Agents interact with a **realistic AWS shell simulator** — a vendored MiniStack emulator (34 AWS services, in-memory, zero-cost) inside the same Docker container. Every executed command returns the same response as production AWS. The grading system evaluates rewards and penalties based on the **actual AWS infrastructure state** instead of static metrics. No AWS account needed.
19
 
20
+ > **[Try the Playground](https://sizzing-aws-rl-env.hf.space/web)** | **[API Docs](https://sizzing-aws-rl-env.hf.space/docs)** | **[Hugging Face Space](https://huggingface.co/spaces/Sizzing/aws_rl_env)**
21
 
22
+
23
+ ## Task Tiers (100+ Tasks)
24
+
25
+ ### Warmup — 20 tasks
26
+ > List resources — single read-only commands
27
+
28
+ - Run one AWS CLI command to list or describe a resource type
29
+ - S3 buckets, EC2 instances, DynamoDB tables, Lambda functions, RDS, EBS volumes
30
+ - Graded by **command_match** — checks operation + service pair
31
+ - No setup required, no state mutations
32
+
33
+ ### Beginner — 20 tasks
34
+ > Create single resources with verification
35
+
36
+ - Create an S3 bucket, DynamoDB table, SQS queue, or Lambda function
37
+ - Graded by **resource_creation** — verifies the exact resource exists in the AWS Infrastructure Simulator
38
+ - Introduces resource name validation — "my-bucket-2" won't satisfy a check for "my-bucket"
39
+ - First tier where idempotency bonus (+0.02) can be earned
40
+
41
+ ### Intermediate — 20 tasks
42
+ > Multi-step workflows — create, configure, connect
43
+
44
+ - Ordered sequences: create a bucket then enable versioning, create a table then add an item
45
+ - Graded by **multi_step** — validates each step was completed in order
46
+ - Chaos injection begins at **10% probability** — resources may be silently mutated mid-episode
47
+ - Rollback penalty (-0.1) starts to matter with multi-step create/delete patterns
48
+
49
+ ### Advanced — 20 tasks
50
+ > Cross-service architectures spanning multiple AWS services
51
+
52
+ - Wire Lambda to SQS, configure API Gateway with integrations, build event-driven pipelines
53
+ - Graded by **multi_step + services** — all required services must be configured
54
+ - Chaos injection escalates to **20% probability** — DynamoDB throughput, Lambda configs may change
55
+ - Hints cost more: 3 hints = only 61% of max reward (0.85³ decay)
56
+
57
+ ### Expert — 20 tasks
58
+ > SRE incidents, drift detection & security posture audits
59
+
60
+ - Fix overly permissive S3 policies, replace broad IAM inline policies, repair broken infrastructure
61
+ - Graded by **state_checks** — actual CLI commands run against MiniStack at grading time
62
+ - Chaos injection at **30% probability** — maximum perturbation frequency
63
+ - **6 drift detection tasks** — correct infra is provisioned, then 2-3 random mutations applied from a pool
64
+ - Agent must audit environment, discover which resources drifted, and fix only those
65
+ - Drift is randomized per episode — prevents memorization of fix sequences
66
+
67
+ ---
68
+
69
+ ## Features
70
+
71
+ ### 1. Curriculum & Training
72
+
73
+ Adaptive learning system that tracks mastery and selects optimal tasks.
74
+
75
+ #### Progressive Difficulty
76
+ - **What:** The environment organizes 120+ tasks across 5 tiers: Warmup, Beginner, Intermediate, Advanced, and Expert. Tasks progress from simple listing operations to complex SRE incident response and drift detection scenarios.
77
+ - **Why:** Prevents the agent from being overwhelmed by complex tasks early on. Scaffolded difficulty ensures the agent builds foundational skills before tackling multi-service architectures.
78
+ - **How:** The `CurriculumManager` maintains per-agent tier state. Promotion requires meeting a minimum episode count and success rate threshold. A fast-track mechanism allows agents scoring 90%+ on 3 consecutive episodes to skip the minimum wait.
79
+ - **Metrics:** 5 Difficulty Tiers | 120+ Total Tasks | 90% Fast-track Threshold
80
+
81
+ #### Mastery Tracking
82
+ - **What:** Each task independently tracks the agent's performance using a weighted success rate over a sliding window. Tasks "graduate" when performance exceeds the mastery threshold consistently.
83
+ - **Why:** Ensures the agent truly masters a skill before moving on. Prevents lucky single completions from being treated as mastery. Un-graduation catches skill decay.
84
+ - **How:** A `mastery_window` of 10 episodes and `mastery_threshold` of 0.7 (70% success). Minimum 3 attempts required before graduation. Recent results are weighted more heavily using exponential decay (factor 0.85). Graduated tasks can un-graduate if performance drops.
85
+ - **Metrics:** 70% Mastery Threshold | 10 Window Size | 0.85 Decay Factor
86
+
87
+ #### Spaced Repetition
88
+ - **What:** Graduated tasks don't disappear — they resurface at exponentially increasing intervals (3, 6, 12, 24, 48 episodes) for re-testing, earning a +30 priority bonus when due.
89
+ - **Why:** Prevents catastrophic forgetting. The agent must retain skills even as it learns new ones. Exponential spacing is the most efficient retention schedule, borrowed from cognitive science.
90
+ - **How:** Each task tracks a `spaced_rep_interval` starting at 3 episodes. When re-tested and passes, the interval doubles (up to 48). If it fails, the interval resets. `_is_spaced_rep_due()` checks elapsed episodes against the interval.
91
+ - **Metrics:** +30 Spaced Rep Bonus | 3→48 Interval Range | 2x Interval Growth
92
+
93
+ #### Priority Selection
94
+ - **What:** Tasks are ranked by a composite score combining novelty, weakness, spaced repetition due dates, and recency. The highest-scoring task is selected for each episode.
95
+ - **Why:** Optimizes the training curriculum by ensuring the agent explores new tasks, practices weak areas, revisits graduated skills, and maintains variety — all balanced automatically.
96
+ - **How:** `score = novelty_bonus (+100 if never attempted) + weakness_weight (+50 × (1 - success_rate)) + spaced_rep_bonus (+30 if due) - recency_penalty (-20 if attempted in last 2 episodes)`. Uses exponential decay (0.85) to emphasize recent performance.
97
+ - **Metrics:** +100 Novelty Bonus | +50 Max Weakness Weight | -20 Recency Penalty
98
+
99
+ #### Tier Progression
100
+ - **What:** Agents advance through tiers via standard promotion (minimum episodes + success rate) or fast-track (3 consecutive high-scoring episodes). Tiers gate access to increasingly complex task pools.
101
+ - **Why:** Provides structure to the learning process. Standard promotion ensures sufficient exposure; fast-track rewards agents that demonstrate immediate competence.
102
+ - **How:** Standard: complete `min_episodes` at current tier with `success_rate >= advance_rate`. Fast-track: 3 consecutive episodes at >= 90% success bypasses the minimum episode requirement. Un-promotion is not supported — agents cannot drop tiers.
103
+ - **Metrics:** 3 Fast-track Streak | 90% Fast-track Rate | 5 Total Tiers
104
+
105
+ ### 2. Reward Shaping
106
+
107
+ Dense reward signals that encourage operational discipline and real progress.
108
+
109
+ ```
110
+ if task_achieved: reward = 1.0
111
+ else:
112
+ reward = partial_progress * 0.8 # base: scaled to [0.0, 0.8]
113
+ if progress_increased: reward += 0.1 # dense signal for advancing
114
+ if command_failed: reward *= 0.5 # penalty for errors
115
+ reward = clamp(reward, 0.0, 0.99) # never 1.0 without completion
116
+ reward *= 0.85 ** hints_used # hint decay
117
+ if survived_chaos: reward *= 1.05 # chaos survival bonus
118
+ ```
119
+
120
+ #### Rollback Penalty & Idempotency Bonus
121
+ - **What:** Detects create→delete pairs on the same resource (rollbacks) and penalizes them (-0.1 each). Rewards graceful "already exists" handling (+0.02) where the agent retries idempotently.
122
+ - **Why:** First RL environment rewarding operational discipline. In production, create-then-delete cycles are wasteful. Handling "already exists" gracefully is a sign of robust automation.
123
+ - **How:** `EpisodeTracker.detect_rollbacks()` scans command history for paired create/delete operations on the same resource. Idempotency detection looks for commands that fail with "already exists" patterns (BucketAlreadyExists, ResourceInUseException, etc.) followed by successful continuation.
124
+ - **Metrics:** -0.1 Rollback Penalty | +0.02 Idempotency Bonus | Per-pair Detection
125
+
126
+ #### Shaped Reward System
127
+ - **What:** Rewards are carefully shaped: 1.0 for full completion, 0.0-0.8 for partial progress, +0.1 progress bonus for advancing, ×0.5 for failures, capped at 0.99 without completion. Chaos bonus (×1.05) and hint decay (×0.85^n) layer on top.
128
+ - **Why:** Dense reward signal prevents sparse-reward stagnation. The agent gets meaningful feedback on every step, not just at episode end. Capping at 0.99 ensures only real completion earns full credit.
129
+ - **How:** `TaskGrader` dispatches to 5 strategies by tier: `command_match` (warmup), `resource_creation` (beginner), `multi_step` (intermediate), `multi_step+services` (advanced), and `state_checks` (expert). Each returns `partial_progress` which is converted to reward with bonuses/penalties applied.
130
+ - **Metrics:** 1.0 Max Reward | 0.99 Progress Cap | ×1.05 Chaos Bonus
131
+
132
+ #### Multi-Strategy Grading
133
+ - **What:** Five distinct grading strategies, one per tier: `command_match` checks operation+service pairs, `resource_creation` verifies resources exist, `multi_step` validates ordered sequences, advanced adds service coverage, and expert runs `state_checks` against MiniStack.
134
+ - **Why:** Each tier tests fundamentally different skills. A single grading strategy would either be too lenient for beginners or miss the nuance needed for expert SRE tasks.
135
+ - **How:** `TaskGrader.grade()` dispatches based on the task's `grading_strategy` field. Each strategy returns a `GradeResult` with `partial_progress` (0.0-1.0), `completed` flag, and details. Grading is deterministic and fully automated.
136
+ - **Metrics:** 5 Grading Strategies | 100% Automated | Per-tier Selection
137
+
138
+ ### 3. Resilience & Adaptability
139
+
140
+ Features that test agent robustness under unpredictable conditions.
141
+
142
+ #### Progressive Hint System
143
+ - **What:** A 3-level hint system where each level reveals progressively more detail: Level 1 names the AWS services, Level 2 describes the operations, Level 3 gives near-complete command structure. Each hint reduces the final reward by ×0.85.
144
+ - **Why:** Creates an information-reward tradeoff unique in RL. The agent learns to wean off hints over time — initially relying on them for unfamiliar tasks, then solving independently for maximum reward. From GRPO perspective, it creates a natural exploration/exploitation axis within a single episode.
145
+ - **How:** Agent issues special command `aws help --task-hint` as its action (intercepted before reaching MiniStack). Hints auto-generated from `SuccessCriteria` fields (services, steps, operations). Reward decay: `final_reward *= 0.85 ^ hints_used` — 0 hints: 1.0×, 1 hint: 0.85×, 2 hints: 0.72×, 3 hints: 0.61×. Curriculum naturally penalizes hint-dependent agents: lower rewards → slower graduation.
146
+ - **Metrics:** 3 Hint Levels | ×0.85 Decay Per Hint | ~61% Reward with 3 Hints
147
+
148
+ #### Chaos Injection Engine
149
+ - **What:** Silently mutates AWS resource state mid-episode to test agent resilience. Perturbations are scoped to services the current task uses. If the agent completes despite chaos, it earns a ×1.05 bonus.
150
+ - **Why:** Tests whether the agent can handle unexpected state changes — a critical SRE skill. Prevents brittle memorization of exact command sequences. Probability scales with tier difficulty.
151
+ - **How:** `ChaosEngine` selects perturbation templates specific to the services in use (S3 policy changes, DynamoDB throughput modifications, Lambda config alterations, etc.). Resource names are extracted from successful commands via regex. Chaos probability: 10% (Intermediate), 20% (Advanced), 30% (Expert).
152
+ - **Metrics:** ×1.05 Chaos Survival Bonus | 10-30% Probability by Tier | 5 Service Templates
153
+
154
+ #### Drift Detection Tasks
155
+ - **What:** 6 expert-tier tasks where infrastructure is provisioned correctly, then 2-3 random mutations are applied from a pool. The agent must audit, discover drifted resources, and fix only those — without knowing which drifted.
156
+ - **Why:** Randomized per episode, preventing memorization. Tests real SRE audit skills: the agent must reason about desired vs. actual state, not just follow a script.
157
+ - **How:** `DriftEngine` randomly selects 2-3 mutations from a task's `possible_drifts` pool and applies them after setup. Each task defines a `desired_state_spec` (natural language) and `state_checks` (ground truth CLI commands). Examples: S3 versioning/encryption drift, DynamoDB throughput changes, SNS subscription modifications.
158
+ - **Metrics:** 6 Drift Tasks | 2-3 Mutations Per Episode | Random Selection Per Run
159
+
160
+ ### 4. Security Posture Audit
161
+
162
+ Tests *reasoning about configuration state* — the agent must READ and ANALYZE existing infrastructure, not just build things. Unlike SRE tasks (broken functionality), these have *working but insecure* infrastructure.
163
+
164
+ #### Public S3 Bucket Lockdown
165
+ - **What:** A pre-provisioned S3 bucket "public-assets" has an overly permissive bucket policy granting access to any principal (`Principal: *`). The agent must read the policy, identify the vulnerability, and replace it with a restrictive policy allowing only a specific IAM role.
166
+ - **Why:** Tests security reasoning — the infrastructure is functional but insecure. Unlike SRE tasks where things are broken, here the agent must understand what "correct" security posture looks like and make the right judgment call.
167
+ - **How:** Setup creates the bucket with a wide-open policy. State checks verify the new policy denies `Principal: *` and only allows the `app-role` principal to perform `s3:GetObject`.
168
+ - **Metrics:** S3 Target Service | Policy Attack Surface | Expert Tier
169
+
170
+ #### IAM Least Privilege
171
+ - **What:** An IAM role "app-role" has an inline policy with `Action: *` and `Resource: *` — full admin access. The agent must replace it with a least-privilege policy allowing only `dynamodb:GetItem` and `dynamodb:PutItem` on the users table.
172
+ - **Why:** IAM misconfiguration is the #1 cloud security risk. This task tests whether the agent understands permission scoping and can reason about what access an application actually needs vs. what it currently has.
173
+ - **How:** Setup creates the role with a wildcard policy. The agent must craft a replacement policy document with specific actions and resource ARN. State checks verify the policy document matches the expected least-privilege permissions.
174
+ - **Metrics:** IAM Target Service | 2 Allowed Actions | Expert Tier
175
+
176
+ #### Secrets in Lambda Environment
177
+ - **What:** A Lambda function "data-processor" has a database password stored as a plaintext environment variable (`DB_PASSWORD=hunter2`). The agent must create a secret in Secrets Manager, update the Lambda to reference the secret ARN, and remove the plaintext variable.
178
+ - **Why:** Plaintext secrets in environment variables is a critical security anti-pattern. This task combines multiple services (Lambda + Secrets Manager) and tests the agent's ability to perform a safe credential rotation without breaking the function.
179
+ - **How:** Setup creates the Lambda with the plaintext env var. The agent must: (1) create a secret in Secrets Manager, (2) add `SECRET_ARN` env var to Lambda, (3) remove `DB_PASSWORD`. State checks verify all three conditions.
180
+ - **Metrics:** 2 Services Involved | 3 Required Steps | Expert Tier
181
+
182
+ ### 5. Anti-Reward-Hacking (8 Defense Layers)
183
+
184
+ 8 defense layers that prevent the agent from gaming the reward system.
185
+
186
+ #### 1. Ground-Truth Verification via MiniStack
187
+ - **What:** The grader never trusts agent command output. It independently queries MiniStack (the simulated AWS backend) to verify resource state for 20+ services. Even if the agent crafts fake-looking stdout, the grader checks actual state.
188
+ - **Why:** Prevents reward hacking through output fabrication. The agent cannot game the system by producing convincing but fake CLI output — ground truth is always checked server-side.
189
+ - **How:** `ResourceVerifier` has per-service verification methods that query MiniStack directly. For expert tasks, `StateCheck` assertions run actual AWS CLI commands against MiniStack at grading time, checking either `output_contains` (substring) or `json_path` extraction with expected values.
190
+ - **Metrics:** 20+ Verified Services | 100% Server-side | 0 Agent Visibility
191
+
192
+ #### 2. Deduplication
193
+ - **What:** `EpisodeTracker.has_executed_operation()` tracks which (operation, resource) pairs have been credited. Running the same successful command twice does NOT increase `partial_progress`. Progress can only increase, never re-earn.
194
+ - **Why:** Prevents the agent from gaming the reward system by repeating the same command to accumulate credit. Each unique operation earns credit exactly once.
195
+ - **How:** `credit_operation()` records each (operation, resource) pair. Before granting credit, `is_operation_already_credited()` checks if this exact pair was already rewarded. The check is deterministic and happens at grading time.
196
+ - **Metrics:** 1x Credit Per Operation | Exact Match Type | (op, res) Tracking Granularity
197
+
198
+ #### 3. Grader Invisibility
199
+ - **What:** The verification commands run by `ResourceVerifier` are NOT returned in the observation's `command_output`. They happen server-side during grading. The agent cannot observe or mimic them.
200
+ - **Why:** If the agent could see which verification commands the grader runs, it could learn to craft fake outputs that match expected patterns. Keeping grader logic invisible forces the agent to actually perform the task.
201
+ - **How:** `ResourceVerifier` executes AWS CLI commands against MiniStack in a separate execution context. Results are consumed internally by the grading pipeline. The observation returned to the agent only contains output from the agent's own commands.
202
+ - **Metrics:** 0 Grader Cmds Exposed | Server Execution Context | 20+ Hidden Verifications
203
+
204
+ #### 4. Command Allowlisting
205
+ - **What:** Only commands starting with `aws` are executed. Any attempt to run shell commands, pipe to other tools, use redirects, or escape the sandbox is rejected with `success=False`.
206
+ - **Why:** Prevents the agent from escaping the AWS CLI sandbox. Without this, the agent could potentially execute arbitrary shell commands, access the filesystem, or interfere with the environment.
207
+ - **How:** The environment's `step()` method validates the command before execution. Commands not starting with `aws` are immediately rejected.
208
+ - **Metrics:** `aws *` Allowed Pattern | 0 Shell Access | Instant Rejection
209
+
210
+ #### 5. No Verification Reward
211
+ - **What:** If the agent runs a command that matches a `state_check` command exactly (e.g., `aws s3api get-bucket-versioning --bucket app-config-store`), it gets no progress credit. Progress is only earned through `steps` operations (mutating commands), not read-only queries.
212
+ - **Why:** Prevents the agent from gaming progress by running the same verification commands the grader uses. The agent can run read commands to understand state, but only mutation commands earn progress.
213
+ - **How:** During grading, the `TaskGrader` checks if the agent's command matches any `state_check` command. Matching commands are flagged as verification-only and excluded from credit. Only commands matching `steps` operations (create, put, update, delete) earn `partial_progress`.
214
+ - **Metrics:** 0 Credit for Reads | Mutate Rewarded Actions | Exact Match Detection
215
+
216
+ #### 6. Monotonic Progress
217
+ - **What:** `partial_progress` can only increase within an episode. It is clamped to [0.0, 0.99] — reaching 1.0 requires actual task completion. The agent cannot lose progress, but also cannot re-earn it.
218
+ - **Why:** Prevents cycling strategies where the agent creates and destroys resources repeatedly. Combined with deduplication, this ensures steady forward progress.
219
+ - **How:** In `TaskGrader`, `previous_progress` tracks the highest progress seen. New progress is always `max(previous, current)`. Reward is clamped at 0.99 for partial completion, reserving 1.0 exclusively for verified full completion.
220
+ - **Metrics:** 0.99 Max Without Completion | 1.0 Requires Full Completion | max() Progress Function
221
+
222
+ #### 7. Resource Name Validation
223
+ - **What:** For `resource_exists` checks, the verifier matches the exact resource name, not just any resource of that type. Creating "my-test-bucket-2" doesn't satisfy a check for "my-test-bucket".
224
+ - **Why:** Prevents the agent from creating arbitrarily named resources to game the verification system. Forces precise execution of the task requirements.
225
+ - **How:** `ResourceVerifier`'s per-service methods (`verify_s3_bucket`, `verify_dynamodb_table`, etc.) compare against the exact expected resource name from the task definition. Each of the 20+ supported services has its own verification logic.
226
+ - **Metrics:** Exact Name Matching | 20+ Verified Services | 0 Partial Matches
227
+
228
+ #### 8. State Checks Verify Final State
229
+ - **What:** For expert SRE tasks, `state_checks` run actual AWS CLI commands against MiniStack at grading time. The grader verifies the final infrastructure state — not the commands the agent ran.
230
+ - **Why:** The agent cannot fake the state. MiniStack is the ground truth. This decouples "what the agent did" from "what was actually achieved", making reward hacking extremely difficult.
231
+ - **How:** Each expert task defines `state_checks` with command + assertion pairs. Assertions support `output_contains` (substring match on CLI output) and `json_path + expected` (JSON extraction). The grader runs these checks against the live MiniStack state independently of the agent.
232
+ - **Metrics:** CLI Verification Method | 2 Assertion Types | Live State Source
233
+
234
+ ---
235
+
236
+ ## Supported AWS Services (34)
237
+
238
+ | Category | Services |
239
+ |----------|----------|
240
+ | **Storage & DB** | S3, DynamoDB, RDS, ElastiCache, EFS |
241
+ | **Compute** | Lambda, ECS, EC2, Step Functions |
242
+ | **Messaging** | SQS, SNS, Kinesis, EventBridge, Firehose |
243
+ | **API** | API Gateway v1/v2, ALB/ELBv2 |
244
+ | **Security** | IAM, STS, Cognito, ACM, WAF v2, Secrets Manager |
245
+ | **Monitoring** | CloudWatch, CloudWatch Logs, SSM |
246
+ | **Infrastructure** | CloudFormation, Route53 |
247
+ | **Other** | SES, Athena, Glue, EMR |
248
+
249
+ ---
250
 
251
  ## Quick Start
252
 
 
269
  result = env.step(AwsRlAction(command="aws s3 ls"))
270
  ```
271
 
272
+ WebSocket API:
273
+
274
+ ```python
275
+ import websockets, json
276
+
277
+ async with websockets.connect("wss://sizzing-aws-rl-env.hf.space/ws") as ws:
278
+ await ws.send(json.dumps({"type": "reset"}))
279
+ obs = json.loads(await ws.recv())
280
+
281
+ await ws.send(json.dumps({"type": "step", "data": {"command": "aws s3 ls"}}))
282
+ obs = json.loads(await ws.recv())
283
+ ```
284
+
285
  ---
286
 
287
  ## Architecture
288
 
289
  ```
290
+ ┌─────────────────────────────────────────────────────────┐
291
+ │ Docker Container
292
+
293
  │ ┌─────────────────────┐ ┌────────────────────┐ │
294
+ │ │ FastAPI RL Server │ │ AWS Simulator │ │
295
+ │ │ (port 8000) │─────>│ (port 4566) │ │
296
+ │ │ │ │ 34 AWS services │ │
297
+ │ │ - Environment │ │ In-memory state │ │
298
+ │ │ - Curriculum │ │ Reset API │ │
299
+ │ │ - Grading Engine │ │ (Ministack) │ │
300
+ │ │ - Episode Tracker │ │ │ │
301
+ │ │ - Hint Provider │ │ │ │
302
  │ └─────────────────────┘ └────────────────────┘ │
303
+ │ ^ ^
304
+ │ | OpenEnv HTTP/WS | AWS CLI calls
305
+ └──────────┼─────────────────────────────┼────────────────┘
306
  | |
307
+ RL Agent (client, External) (internal only)
308
  ```
309
 
310
  ### Episode Lifecycle
311
 
312
+ 1. **`reset()`** Wipes AWS infrastructure state, selects next task from curriculum, provisions setup commands (if any), returns initial observation
313
+ 2. **`step(action)`** Validates command (`aws` prefix only), executes against MiniStack, records in tracker, grades with shaped reward, returns observation
314
+ 3. **Hint request** Agent sends `aws help --task-hint` to get a progressive hint (costs reward)
315
+ 4. **Terminates** when `task_achieved == True` or max steps reached
316
 
317
  ---
318
 
319
+
320
  ## Core Classes
321
 
322
  ### `AwsRlEnvironment`
323
 
324
+ [server/aws_rl_env_environment.py](server/aws_rl_env_environment.py) Implements the OpenEnv `Environment` interface. Orchestrates all services.
325
 
326
  | Method | Description |
327
  |--------|-------------|
328
  | `reset()` | Wipe infra, select task, provision setup, return initial observation |
329
+ | `step(action)` | Execute command (or intercept hint request), grade, update curriculum, return observation |
330
 
331
  ### `Curriculum`
332
 
333
+ [server/services/curriculum.py](server/services/curriculum.py) Priority-queue-based task selection with progressive difficulty.
334
 
335
  Selects the next task using a **max-heap scored by**:
336
 
337
  ```
338
  score = (
339
  novelty_bonus # +100 if never attempted (explore first)
340
+ + weakness_weight # +50 * (1 - task_success_rate) worse tasks get higher priority
341
  + spaced_rep_bonus # +30 if graduated task is "due" for re-test
342
  - recency_penalty # -20 if attempted in last 2 episodes (ensure variety)
343
  )
344
  ```
345
 
 
 
 
 
 
 
 
 
 
346
  ### `TaskGrader`
347
 
348
+ [server/services/task_grader.py](server/services/task_grader.py) Evaluates task completion using a dispatcher pattern. Rewards are always in [0.0, 1.0].
349
 
350
  **Grading strategies by tier:**
351
 
 
357
  | Advanced | Multi-step + services | All steps completed AND all required services touched |
358
  | Expert | State checks | Runs arbitrary AWS CLI commands to assert end-state (ground truth) |
359
 
360
+ ### `HintProvider`
361
 
362
+ [server/services/hint_provider.py](server/services/hint_provider.py) — Generates progressive hints from `SuccessCriteria` fields.
363
+
364
+ | Hint Level | What it reveals | Example |
365
+ |-----------|----------------|---------|
366
+ | Level 1 | Which AWS services to use | "You'll need IAM and Lambda" |
367
+ | Level 2 | Which operations | "Start with create-role, then put-role-policy" |
368
+ | Level 3 | Near-complete command structure | "Use: aws iam create-role --role-name ..." |
 
369
 
370
  ### `EpisodeTracker`
371
 
372
+ [server/services/episode_tracker.py](server/services/episode_tracker.py) Maintains per-episode step history. Parses AWS CLI commands to extract (service, operation, resource) tuples. Tracks credited operations for deduplication, monotonic progress, and hint usage.
373
 
374
  ### `ResourceVerifier`
375
 
376
+ [server/services/resource_verifier.py](server/services/resource_verifier.py) Queries MiniStack directly to verify ground-truth resource state. Service-specific checks for S3, DynamoDB, Lambda, SQS, SNS, IAM, Secrets Manager, and API Gateway. Also evaluates `StateCheck` assertions (substring match, JSON path extraction).
377
 
378
  ### `EnvironmentDesigner`
379
 
380
+ [server/services/environment_designer.py](server/services/environment_designer.py) Provisions initial AWS state via setup commands before the agent acts. Used by SRE/expert tasks to create broken or insecure infrastructure the agent must fix.
381
 
382
  ### `AwsBackend`
383
 
384
+ [server/services/aws_backend.py](server/services/aws_backend.py) Executes AWS CLI commands against MiniStack (`AWS_ENDPOINT_URL=http://localhost:4566`). Provides `reset_environment()` via MiniStack's `/_ministack/reset` endpoint.
385
 
386
  ### `AwsRlEnv` (Client)
387
 
388
+ [client.py](client.py) OpenEnv HTTP/WebSocket client. Wraps `reset()` and `step()` calls to the server.
389
 
390
  ---
391
 
392
  ## Data Models
393
 
394
+ [models.py](models.py) All Pydantic models and type aliases.
395
 
396
+ ### Action
397
 
398
  ```python
399
  class AwsRlAction(Action):
400
  command: str # AWS CLI command, e.g. "aws s3 ls"
401
+ ```
402
 
403
+ ### Observation
404
+
405
+ ```python
406
  class AwsRlObservation(Observation):
407
  episode_id: EpisodeID
408
  step_count: StepCount
409
  command_success: bool
410
  command_output: str # stdout from AWS CLI
411
  error: str # stderr if failed
412
+ task: TaskInfo | None # masked task definition (hides success criteria)
 
413
  task_achieved: bool
414
+ partial_progress: float # current task progress in [0.0, 1.0]
415
+ hints_used: int # number of hints requested this episode
416
+ hint_text: str # most recent hint text (if any)
417
+ ```
418
+
419
+ ### Environment State
420
+
421
+ ```python
422
+ class AwsRlState(State):
423
+ current_task: Task | None # full task assigned for the episode
424
+ tracker: TrackerState # episode tracker snapshot
425
+ infra_state: dict # AWS infrastructure state keyed by service name
426
+ chaos_occurred: bool # whether chaos was injected this episode
427
+ current_tier: str # agent's current difficulty tier
428
+
429
+ class TrackerState:
430
+ step_count: int # steps taken this episode
431
+ hints_used: int # hints requested this episode
432
+ progress: float # current partial progress [0.0, 1.0]
433
+ commands_executed: list[str] # commands executed this episode
434
+ credited_operations: list[str] # (operation, resource) pairs that earned credit
435
  ```
436
 
437
  ### Task Definitions
438
 
439
  ```python
440
  class Task:
441
+ task_id: TaskID
442
  difficulty: TaskDifficulty # warmup | beginner | intermediate | advanced | expert
443
  description: str # human-readable goal
444
  success_criteria: SuccessCriteria
445
+ setup_commands: list[SetupCommand] # pre-provision for SRE tasks
446
+ desired_state_spec: str | None # natural-language desired end state (drift tasks)
447
+ possible_drifts: list[SetupCommand] # pool of mutations for DriftEngine
448
+
449
+ class TaskInfo:
450
+ """Agent-visible subset of Task — masks success_criteria, setup_commands, and possible_drifts."""
451
+ task_id: TaskID
452
+ difficulty: TaskDifficulty
453
+ description: str
454
+ desired_state_spec: str | None
455
 
456
  class SuccessCriteria:
457
  command_contains: str | None # warmup/beginner
 
471
  mastery_window: int # sliding window size (default: 10)
472
  mastery_threshold: float # per-task graduation threshold (default: 0.7)
473
  fast_track_rate: float # early promotion threshold (default: 0.9)
474
+ chaos_probability: float # probability of chaos injection per step (default: 0.0)
475
 
476
  class SpacedRepState:
477
  interval: int # episodes until next re-test (3 -> 48)
 
480
 
481
  ---
482
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
483
  ## Project Structure
484
 
485
  ```
 
488
  ├── models.py # Pydantic data models & type aliases
489
  ├── client.py # AwsRlEnv OpenEnv client
490
  ├── inference.py # LLM agent inference script
491
+ ├── inference-complete.py # Full inference pipeline with curriculum
492
  ├── server/
493
  │ ├── app.py # FastAPI application + web UI endpoints
494
  │ ├── aws_rl_env_environment.py # Core RL environment (reset/step)
495
+ │ ├── templates/
496
+ │ │ └── index.html # Web playground UI
497
+ │ ├── static/
498
+ │ │ ├── css/style.css # Playground styles
499
+ │ │ └── js/app.js # Playground frontend logic
500
  │ └── services/
501
  │ ├── aws_backend.py # MiniStack command executor
502
  │ ├── task_grader.py # Grading engine with reward shaping
503
  │ ├── curriculum.py # Curriculum learning manager
504
+ │ ├── episode_tracker.py # Per-episode step history & hints
505
  │ ├── resource_verifier.py # Ground-truth state verification
506
  │ ├── environment_designer.py # Setup provisioning for SRE tasks
507
+ │ ├── hint_provider.py # Progressive hint generator
508
+ │ ├── chaos_engine.py # Chaos injection engine
509
+ │ ├── drift_engine.py # Drift detection engine
510
+ │ ├── task_solutions.py # Reference solutions for tasks
511
  │ └── tasks/
512
+ │ ├── warmup.yaml # 20 listing tasks
513
+ │ ├── beginner.yaml # 20 creation tasks
514
+ │ ├── intermediate.yaml # 20 multi-step tasks
515
+ │ ├── advanced.yaml # 20 architecture tasks
516
+ │ ├── expert.yaml # 20 SRE/security tasks
517
+ │ └── drift.yaml # Drift detection tasks
518
+ ├── tests/ # Unit tests for core services
519
+ │ ├── test_aws_rl_env_environment.py
520
+ │ ├── test_drift_engine.py
521
+ │ ├── test_environment_designer.py
522
+ │ ├── test_episode_tracker.py
523
+ │ ├── test_hint_provider.py
524
+ │ ├── test_resource_verifier.py
525
+ │ └── test_task_grader.py
526
+ ├── tests_tasks/ # Integration tests per task tier
527
+ │ ├── test_warmup_tasks.py
528
+ │ ├── test_beginner_tasks.py
529
+ │ ├── test_intermediate_tasks.py
530
+ │ ├── test_advanced_tasks.py
531
+ │ ├── test_expert_tasks.py
532
+ │ └── test_drift_tasks.py
533
  ├── aws_infra/ # Vendored MiniStack emulator
534
  │ └── aws_infra/
535
  │ ├── app.py # MiniStack ASGI router
 
556
 
557
  ### Local (without Docker)
558
 
559
+ Use the combined Makefile target:
 
 
 
 
 
 
 
 
 
 
 
 
560
 
561
  ```bash
562
  make run # Starts MiniStack + server
 
581
  | `AWS_SECRET_ACCESS_KEY` | `test` | AWS credentials (any value works) |
582
  | `AWS_DEFAULT_REGION` | `us-east-1` | AWS region |
583
  | `MAX_STEPS` | `15` | Max steps per episode |
584
+ | `API_BASE_URL` | | LLM API endpoint (for inference.py) |
585
+ | `MODEL_NAME` | | LLM model name (for inference.py) |
586
+ | `HF_TOKEN` | | HuggingFace token (for inference.py) |
587
  | `TEMPERATURE` | `0.7` | LLM sampling temperature |
588
 
589
  ---
 
606
  # "avg_reward_last_10": 0.65
607
  # }
608
  ```
609
+
610
+ ---
611
+
612
+ ## Links
613
+
614
+ - **GitHub**: [github.com/udaykiranpadhy/aws-rl-env](https://github.com/udaykiranpadhy/aws-rl-env)
615
+ - **Hugging Face Space**: [huggingface.co/spaces/Sizzing/aws_rl_env](https://huggingface.co/spaces/Sizzing/aws_rl_env)
616
+ - **API Reference**: [/docs](https://sizzing-aws-rl-env.hf.space/docs)
617
+ - **ReDoc**: [/redoc](https://sizzing-aws-rl-env.hf.space/redoc)
618
+ - **Portfolio**: [portfolio.udaykp.dev](https://portfolio.udaykp.dev)
__init__.py CHANGED
@@ -6,8 +6,13 @@
6
 
7
  """Aws Rl Env Environment."""
8
 
9
- from .client import AwsRlEnv
10
- from .models import AwsRlAction, AwsRlObservation
 
 
 
 
 
11
 
12
  __all__ = [
13
  "AwsRlAction",
 
6
 
7
  """Aws Rl Env Environment."""
8
 
9
+ try:
10
+ from .client import AwsRlEnv
11
+ from .models import AwsRlAction, AwsRlObservation
12
+ except ImportError:
13
+ # When imported directly (e.g. by pytest from rootdir) rather than as
14
+ # part of the aws_rl_env package, relative imports are unavailable.
15
+ pass
16
 
17
  __all__ = [
18
  "AwsRlAction",
aws_infra/aws_infra/app.py CHANGED
@@ -235,6 +235,29 @@ async def app(scope, receive, send):
235
  json.dumps({"reset": "ok"}).encode())
236
  return
237
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
238
  if path == "/_ministack/config" and method == "POST":
239
  _ALLOWED_CONFIG_KEYS = {
240
  "athena.ATHENA_ENGINE", "athena.ATHENA_DATA_DIR",
@@ -570,6 +593,105 @@ def _run_init_scripts():
570
  logger.error("Failed to execute init script %s: %s", script_path, e)
571
 
572
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
573
  def _reset_all_state():
574
  """Wipe all in-memory state across every service module, and persisted files if enabled."""
575
  import shutil
 
235
  json.dumps({"reset": "ok"}).encode())
236
  return
237
 
238
+ if path == "/_ministack/state" and method == "GET":
239
+ state = _get_all_state()
240
+ await _send_response(send, 200, {"Content-Type": "application/json"},
241
+ json.dumps(state).encode())
242
+ return
243
+
244
+ if path == "/_ministack/handlers" and method == "GET":
245
+ handlers = _get_all_handlers()
246
+ await _send_response(send, 200, {"Content-Type": "application/json"},
247
+ json.dumps(handlers).encode())
248
+ return
249
+
250
+ if path.startswith("/_ministack/handlers/") and method == "GET":
251
+ service_name = path[len("/_ministack/handlers/"):].strip("/")
252
+ info = _get_service_info(service_name)
253
+ if info is None:
254
+ await _send_response(send, 404, {"Content-Type": "application/json"},
255
+ json.dumps({"error": f"Unknown service: {service_name}"}).encode())
256
+ else:
257
+ await _send_response(send, 200, {"Content-Type": "application/json"},
258
+ json.dumps(info).encode())
259
+ return
260
+
261
  if path == "/_ministack/config" and method == "POST":
262
  _ALLOWED_CONFIG_KEYS = {
263
  "athena.ATHENA_ENGINE", "athena.ATHENA_DATA_DIR",
 
593
  logger.error("Failed to execute init script %s: %s", script_path, e)
594
 
595
 
596
+ def _service_modules() -> list:
597
+ """Return list of (canonical_name, module) for all service modules."""
598
+ from aws_infra.services import iam_sts
599
+ return [
600
+ ("s3", s3), ("sqs", sqs), ("sns", sns), ("dynamodb", dynamodb),
601
+ ("lambda", lambda_svc), ("iam", iam_sts), ("secretsmanager", secretsmanager),
602
+ ("logs", cloudwatch_logs), ("ssm", ssm), ("events", eventbridge),
603
+ ("kinesis", kinesis), ("monitoring", cloudwatch), ("ses", ses),
604
+ ("ses_v2", ses_v2), ("acm", acm), ("wafv2", waf),
605
+ ("states", stepfunctions), ("ecs", ecs), ("rds", rds),
606
+ ("elasticache", elasticache), ("glue", glue), ("athena", athena),
607
+ ("apigateway", apigateway), ("apigateway_v1", apigateway_v1),
608
+ ("firehose", firehose), ("route53", route53), ("cognito", cognito),
609
+ ("ec2", ec2), ("elasticmapreduce", emr), ("elasticloadbalancing", alb),
610
+ ("elasticfilesystem", efs), ("cloudformation", cloudformation),
611
+ ]
612
+
613
+
614
+ # Extra aliases for the /_ministack/handlers/<service> endpoint so users can
615
+ # look up services using common short names (e.g. "lambda", "stepfunctions").
616
+ _HANDLER_LOOKUP_ALIASES = {
617
+ **SERVICE_NAME_ALIASES,
618
+ "lambda": "lambda",
619
+ "iam": "iam",
620
+ "sts": "iam",
621
+ "ses-v2": "ses_v2",
622
+ "sesv2": "ses_v2",
623
+ "apigateway-v1": "apigateway_v1",
624
+ "apigatewayv1": "apigateway_v1",
625
+ "logs": "logs",
626
+ "emr": "elasticmapreduce",
627
+ "alb": "elasticloadbalancing",
628
+ "efs": "elasticfilesystem",
629
+ "cfn": "cloudformation",
630
+ "sf": "states",
631
+ "sfn": "states",
632
+ "cw": "monitoring",
633
+ "cwl": "logs",
634
+ "sm": "secretsmanager",
635
+ "eb": "events",
636
+ "ddb": "dynamodb",
637
+ }
638
+
639
+
640
+ def _resolve_service_module(service_name: str):
641
+ """Resolve a service name (or alias) to its (canonical_name, module) pair."""
642
+ name = service_name.lower().strip()
643
+ canonical = _HANDLER_LOOKUP_ALIASES.get(name, name)
644
+ for svc_name, mod in _service_modules():
645
+ if svc_name == canonical:
646
+ return svc_name, mod
647
+ return None, None
648
+
649
+
650
+ def _get_all_state() -> dict:
651
+ """Collect summary state from every service module."""
652
+ state = {}
653
+ for name, mod in _service_modules():
654
+ if name not in SERVICE_HANDLERS and name not in ("iam", "ses_v2", "apigateway_v1"):
655
+ continue
656
+ try:
657
+ state[name] = mod.get_state()
658
+ except Exception as e:
659
+ logger.warning("get_state() failed for %s: %s", name, e)
660
+ state[name] = {"error": str(e)}
661
+ return {"services": state}
662
+
663
+
664
+ def _get_all_handlers() -> dict:
665
+ """Collect SUPPORTED_ACTIONS from every service module."""
666
+ handlers = {}
667
+ for name, mod in _service_modules():
668
+ if name not in SERVICE_HANDLERS and name not in ("iam", "ses_v2", "apigateway_v1"):
669
+ continue
670
+ actions = getattr(mod, "SUPPORTED_ACTIONS", [])
671
+ handlers[name] = {"actions": actions, "count": len(actions)}
672
+ return {"services": handlers}
673
+
674
+
675
+ def _get_service_info(service_name: str) -> dict | None:
676
+ """Return detailed info for a single service: docstring, actions, and current state."""
677
+ name, mod = _resolve_service_module(service_name)
678
+ if mod is None:
679
+ return None
680
+ docstring = (mod.__doc__ or "").strip()
681
+ actions = getattr(mod, "SUPPORTED_ACTIONS", [])
682
+ try:
683
+ state = mod.get_state()
684
+ except Exception:
685
+ state = {}
686
+ return {
687
+ "service": name,
688
+ "description": docstring,
689
+ "supported_actions": actions,
690
+ "action_count": len(actions),
691
+ "state": state,
692
+ }
693
+
694
+
695
  def _reset_all_state():
696
  """Wipe all in-memory state across every service module, and persisted files if enabled."""
697
  import shutil
aws_infra/aws_infra/services/acm.py CHANGED
@@ -234,5 +234,20 @@ def _resend_validation_email(data):
234
  return json_response({})
235
 
236
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
237
  def reset():
238
  _certificates.clear()
 
234
  return json_response({})
235
 
236
 
237
+ SUPPORTED_ACTIONS = [
238
+ "RequestCertificate", "DescribeCertificate", "ListCertificates",
239
+ "DeleteCertificate", "GetCertificate", "ImportCertificate",
240
+ "AddTagsToCertificate", "RemoveTagsFromCertificate",
241
+ "ListTagsForCertificate", "UpdateCertificateOptions",
242
+ "RenewCertificate", "ResendValidationEmail",
243
+ ]
244
+
245
+
246
+ def get_state() -> dict:
247
+ return {
248
+ "certificates": {"count": len(_certificates), "ids": list(_certificates.keys())},
249
+ }
250
+
251
+
252
  def reset():
253
  _certificates.clear()
aws_infra/aws_infra/services/alb.py CHANGED
@@ -1044,6 +1044,38 @@ async def dispatch_request(lb, method, path, headers, body, query_params, port=8
1044
  json.dumps({"message": "No matching ALB rule found"}).encode())
1045
 
1046
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1047
  def reset():
1048
  _lbs.clear()
1049
  _tgs.clear()
 
1044
  json.dumps({"message": "No matching ALB rule found"}).encode())
1045
 
1046
 
1047
+ # ---------------------------------------------------------------------------
1048
+ # Supported Actions
1049
+ # ---------------------------------------------------------------------------
1050
+
1051
+ SUPPORTED_ACTIONS = [
1052
+ "CreateLoadBalancer", "DeleteLoadBalancer", "DescribeLoadBalancers",
1053
+ "ModifyLoadBalancerAttributes", "AddTags", "RemoveTags", "DescribeTags",
1054
+ "CreateTargetGroup", "DeleteTargetGroup", "DescribeTargetGroups",
1055
+ "ModifyTargetGroup", "ModifyTargetGroupAttributes", "CreateListener",
1056
+ "DeleteListener", "DescribeListeners", "ModifyListener", "CreateRule",
1057
+ "DeleteRule", "DescribeRules", "ModifyRule", "RegisterTargets",
1058
+ "DeregisterTargets", "DescribeTargetHealth", "SetRulePriorities",
1059
+ ]
1060
+
1061
+
1062
+ # ---------------------------------------------------------------------------
1063
+ # State
1064
+ # ---------------------------------------------------------------------------
1065
+
1066
+ def get_state() -> dict:
1067
+ return {
1068
+ "load_balancers": {"count": len(_lbs), "names": list(_lbs.keys())},
1069
+ "target_groups": {"count": len(_tgs), "names": list(_tgs.keys())},
1070
+ "listeners": {"count": len(_listeners), "ids": list(_listeners.keys())},
1071
+ "rules": {"count": len(_rules), "ids": list(_rules.keys())},
1072
+ "targets": {"count": sum(len(tgts) for tgts in _targets.values())},
1073
+ "tags": {"count": sum(len(tags) for tags in _tags.values())},
1074
+ "load_balancer_attributes": {"count": sum(len(attrs) for attrs in _lb_attrs.values())},
1075
+ "target_group_attributes": {"count": sum(len(attrs) for attrs in _tg_attrs.values())},
1076
+ }
1077
+
1078
+
1079
  def reset():
1080
  _lbs.clear()
1081
  _tgs.clear()
aws_infra/aws_infra/services/apigateway.py CHANGED
@@ -83,6 +83,18 @@ def _api_arn(api_id: str) -> str:
83
  return f"arn:aws:apigateway:{REGION}::/apis/{api_id}"
84
 
85
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  # ---- Persistence hooks ----
87
 
88
  def get_state() -> dict:
 
83
  return f"arn:aws:apigateway:{REGION}::/apis/{api_id}"
84
 
85
 
86
+ SUPPORTED_ACTIONS = [
87
+ "CreateApi", "GetApis", "GetApi", "UpdateApi", "DeleteApi",
88
+ "CreateRoute", "GetRoutes", "GetRoute", "UpdateRoute", "DeleteRoute",
89
+ "CreateIntegration", "GetIntegrations", "GetIntegration",
90
+ "UpdateIntegration", "DeleteIntegration", "CreateStage", "GetStages",
91
+ "GetStage", "UpdateStage", "DeleteStage", "CreateDeployment",
92
+ "GetDeployments", "GetDeployment", "DeleteDeployment", "GetTags",
93
+ "TagResource", "UntagResource", "CreateAuthorizer", "GetAuthorizers",
94
+ "GetAuthorizer", "UpdateAuthorizer", "DeleteAuthorizer",
95
+ ]
96
+
97
+
98
  # ---- Persistence hooks ----
99
 
100
  def get_state() -> dict:
aws_infra/aws_infra/services/apigateway_v1.py CHANGED
@@ -245,6 +245,27 @@ async def _call_lambda(func_name, event):
245
  return {"statusCode": 200, "body": "Mock response"}, None
246
 
247
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
248
  # ---- Persistence hooks ----
249
 
250
  def get_state():
 
245
  return {"statusCode": 200, "body": "Mock response"}, None
246
 
247
 
248
+ SUPPORTED_ACTIONS = [
249
+ "CreateRestApi", "GetRestApis", "GetRestApi", "UpdateRestApi",
250
+ "DeleteRestApi", "GetResources", "GetResource", "CreateResource",
251
+ "UpdateResource", "DeleteResource", "PutMethod", "GetMethod",
252
+ "DeleteMethod", "PutMethodResponse", "GetMethodResponse",
253
+ "DeleteMethodResponse", "PutIntegration", "GetIntegration",
254
+ "DeleteIntegration", "PutIntegrationResponse", "GetIntegrationResponse",
255
+ "DeleteIntegrationResponse", "CreateDeployment", "GetDeployments",
256
+ "GetDeployment", "UpdateDeployment", "DeleteDeployment", "CreateStage",
257
+ "GetStages", "GetStage", "UpdateStage", "DeleteStage",
258
+ "CreateAuthorizer", "GetAuthorizers", "GetAuthorizer",
259
+ "UpdateAuthorizer", "DeleteAuthorizer", "CreateModel", "GetModels",
260
+ "GetModel", "DeleteModel", "GetApiKeys", "CreateApiKey", "GetApiKey",
261
+ "DeleteApiKey", "GetUsagePlans", "CreateUsagePlan", "GetUsagePlan",
262
+ "DeleteUsagePlan", "GetUsagePlanKeys", "CreateUsagePlanKey",
263
+ "DeleteUsagePlanKey", "GetDomainNames", "CreateDomainName",
264
+ "GetDomainName", "DeleteDomainName", "GetTags", "TagResource",
265
+ "UntagResource",
266
+ ]
267
+
268
+
269
  # ---- Persistence hooks ----
270
 
271
  def get_state():
aws_infra/aws_infra/services/athena.py CHANGED
@@ -853,6 +853,29 @@ def _execution_out(ex):
853
  return {k: v for k, v in ex.items() if not k.startswith("_")}
854
 
855
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
856
  def reset():
857
  import time as _time
858
 
 
853
  return {k: v for k, v in ex.items() if not k.startswith("_")}
854
 
855
 
856
+ SUPPORTED_ACTIONS = [
857
+ "StartQueryExecution", "GetQueryExecution", "GetQueryResults", "StopQueryExecution",
858
+ "ListQueryExecutions", "CreateWorkGroup", "DeleteWorkGroup", "GetWorkGroup",
859
+ "ListWorkGroups", "UpdateWorkGroup", "CreateNamedQuery", "DeleteNamedQuery",
860
+ "GetNamedQuery", "ListNamedQueries", "BatchGetNamedQuery", "BatchGetQueryExecution",
861
+ "CreateDataCatalog", "GetDataCatalog", "ListDataCatalogs", "DeleteDataCatalog",
862
+ "UpdateDataCatalog", "CreatePreparedStatement", "GetPreparedStatement",
863
+ "DeletePreparedStatement", "ListPreparedStatements", "GetTableMetadata",
864
+ "ListTableMetadata", "TagResource", "UntagResource", "ListTagsForResource",
865
+ ]
866
+
867
+
868
+ def get_state() -> dict:
869
+ return {
870
+ "workgroups": {"count": len(_workgroups), "names": list(_workgroups.keys())},
871
+ "named_queries": {"count": len(_named_queries), "ids": list(_named_queries.keys())},
872
+ "data_catalogs": {"count": len(_data_catalogs), "names": list(_data_catalogs.keys())},
873
+ "executions": {"count": len(_executions), "ids": list(_executions.keys())},
874
+ "prepared_statements": {"count": len(_prepared_statements), "keys": list(_prepared_statements.keys())},
875
+ "tags": {"count": len(_tags), "arns": list(_tags.keys())},
876
+ }
877
+
878
+
879
  def reset():
880
  import time as _time
881
 
aws_infra/aws_infra/services/cloudformation/__init__.py CHANGED
@@ -54,6 +54,33 @@ async def handle_request(method: str, path: str, headers: dict,
54
  return handler(params)
55
 
56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  def reset():
58
  _stacks.clear()
59
  _stack_events.clear()
 
54
  return handler(params)
55
 
56
 
57
+ # ---------------------------------------------------------------------------
58
+ # Supported Actions
59
+ # ---------------------------------------------------------------------------
60
+
61
+ SUPPORTED_ACTIONS = [
62
+ "CreateStack", "UpdateStack", "DeleteStack", "DescribeStacks",
63
+ "ListStacks", "DescribeStackEvents", "DescribeStackResource",
64
+ "DescribeStackResources", "GetTemplate", "ValidateTemplate",
65
+ "ListExports", "CreateChangeSet", "DescribeChangeSet",
66
+ "ExecuteChangeSet", "DeleteChangeSet", "ListChangeSets",
67
+ "GetTemplateSummary",
68
+ ]
69
+
70
+
71
+ # ---------------------------------------------------------------------------
72
+ # State
73
+ # ---------------------------------------------------------------------------
74
+
75
+ def get_state() -> dict:
76
+ return {
77
+ "stacks": {"count": len(_stacks), "names": list(_stacks.keys())},
78
+ "change_sets": {"count": len(_change_sets), "ids": list(_change_sets.keys())},
79
+ "stack_events": {"count": len(_stack_events), "ids": list(_stack_events.keys())},
80
+ "exports": {"count": len(_exports), "names": list(_exports.keys())},
81
+ }
82
+
83
+
84
  def reset():
85
  _stacks.clear()
86
  _stack_events.clear()
aws_infra/aws_infra/services/cloudwatch.py CHANGED
@@ -1382,8 +1382,32 @@ def _error(code, message, status, use_json=False):
1382
  return status, {"Content-Type": "application/xml"}, body
1383
 
1384
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1385
  def reset():
1386
  _alarms.clear()
1387
  _composite_alarms.clear()
1388
  _alarm_history.clear()
1389
  _resource_tags.clear()
 
 
 
1382
  return status, {"Content-Type": "application/xml"}, body
1383
 
1384
 
1385
+ SUPPORTED_ACTIONS = [
1386
+ "PutMetricData", "GetMetricStatistics", "GetMetricData", "ListMetrics",
1387
+ "PutMetricAlarm", "PutCompositeAlarm", "DescribeAlarms",
1388
+ "DescribeAlarmsForMetric", "DescribeAlarmHistory", "DeleteAlarms",
1389
+ "EnableAlarmActions", "DisableAlarmActions", "SetAlarmState",
1390
+ "TagResource", "UntagResource", "ListTagsForResource",
1391
+ "PutDashboard", "GetDashboard", "DeleteDashboards", "ListDashboards",
1392
+ ]
1393
+
1394
+
1395
+ def get_state() -> dict:
1396
+ return {
1397
+ "metrics": {"count": len(_metrics), "names": [f"{ns}:{mn}" for (ns, mn, _), _ in _metrics.items()]},
1398
+ "alarms": {"count": len(_alarms), "names": list(_alarms.keys())},
1399
+ "composite_alarms": {"count": len(_composite_alarms), "names": list(_composite_alarms.keys())},
1400
+ "dashboards": {"count": len(_dashboards), "names": list(_dashboards.keys())},
1401
+ "alarm_history": {"count": len(_alarm_history)},
1402
+ "resource_tags": {"count": len(_resource_tags), "arns": list(_resource_tags.keys())},
1403
+
1404
+ }
1405
+
1406
+
1407
  def reset():
1408
  _alarms.clear()
1409
  _composite_alarms.clear()
1410
  _alarm_history.clear()
1411
  _resource_tags.clear()
1412
+ _dashboards.clear()
1413
+ _metrics.clear()
aws_infra/aws_infra/services/cloudwatch_logs.py CHANGED
@@ -848,6 +848,29 @@ def _stop_query(data):
848
  return json_response({"success": True})
849
 
850
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
851
  def reset():
852
  _log_groups.clear()
853
  _destinations.clear()
 
848
  return json_response({"success": True})
849
 
850
 
851
+ SUPPORTED_ACTIONS = [
852
+ "CreateLogGroup", "DeleteLogGroup", "DescribeLogGroups",
853
+ "CreateLogStream", "DeleteLogStream", "DescribeLogStreams",
854
+ "PutLogEvents", "GetLogEvents", "FilterLogEvents",
855
+ "PutRetentionPolicy", "DeleteRetentionPolicy",
856
+ "PutSubscriptionFilter", "DeleteSubscriptionFilter", "DescribeSubscriptionFilters",
857
+ "TagLogGroup", "UntagLogGroup", "ListTagsLogGroup",
858
+ "TagResource", "UntagResource", "ListTagsForResource",
859
+ "PutDestination", "DeleteDestination", "DescribeDestinations", "PutDestinationPolicy",
860
+ "PutMetricFilter", "DeleteMetricFilter", "DescribeMetricFilters",
861
+ "StartQuery", "GetQueryResults", "StopQuery",
862
+ ]
863
+
864
+
865
+ def get_state() -> dict:
866
+ return {
867
+ "log_groups": {"count": len(_log_groups), "names": list(_log_groups.keys())},
868
+ "destinations": {"count": len(_destinations), "names": list(_destinations.keys())},
869
+ "metric_filters": {"count": len(_metric_filters), "keys": list(_metric_filters.keys())},
870
+ "queries": {"count": len(_queries), "ids": list(_queries.keys())},
871
+ }
872
+
873
+
874
  def reset():
875
  _log_groups.clear()
876
  _destinations.clear()
aws_infra/aws_infra/services/cognito.py CHANGED
@@ -1904,6 +1904,49 @@ def _apply_user_filter(users: list, filter_str: str) -> list:
1904
  return result
1905
 
1906
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1907
  # ===========================================================================
1908
  # RESET
1909
  # ===========================================================================
 
1904
  return result
1905
 
1906
 
1907
+ # ===========================================================================
1908
+ # SUPPORTED ACTIONS
1909
+ # ===========================================================================
1910
+
1911
+ SUPPORTED_ACTIONS = [
1912
+ "CreateUserPool", "DeleteUserPool", "DescribeUserPool", "ListUserPools",
1913
+ "UpdateUserPool", "CreateUserPoolClient", "DeleteUserPoolClient",
1914
+ "DescribeUserPoolClient", "ListUserPoolClients", "UpdateUserPoolClient",
1915
+ "AdminCreateUser", "AdminDeleteUser", "AdminGetUser", "ListUsers",
1916
+ "AdminSetUserPassword", "AdminUpdateUserAttributes", "AdminInitiateAuth",
1917
+ "AdminRespondToAuthChallenge", "InitiateAuth", "RespondToAuthChallenge",
1918
+ "SignUp", "ConfirmSignUp", "ForgotPassword", "ConfirmForgotPassword",
1919
+ "ChangePassword", "GetUser", "UpdateUserAttributes", "DeleteUser",
1920
+ "AdminAddUserToGroup", "AdminRemoveUserFromGroup",
1921
+ "AdminListGroupsForUser", "AdminListUserAuthEvents", "CreateGroup",
1922
+ "DeleteGroup", "GetGroup", "ListGroups", "AdminConfirmSignUp",
1923
+ "AdminDisableUser", "AdminEnableUser", "AdminResetUserPassword",
1924
+ "AdminUserGlobalSignOut", "GlobalSignOut", "RevokeToken",
1925
+ "CreateUserPoolDomain", "DeleteUserPoolDomain", "DescribeUserPoolDomain",
1926
+ "GetUserPoolMfaConfig", "SetUserPoolMfaConfig", "AssociateSoftwareToken",
1927
+ "VerifySoftwareToken", "TagResource", "UntagResource",
1928
+ "ListTagsForResource", "CreateIdentityPool", "DeleteIdentityPool",
1929
+ "DescribeIdentityPool", "ListIdentityPools", "UpdateIdentityPool",
1930
+ "GetId", "GetCredentialsForIdentity", "GetOpenIdToken",
1931
+ "SetIdentityPoolRoles", "GetIdentityPoolRoles", "ListIdentities",
1932
+ "DescribeIdentity", "MergeDeveloperIdentities",
1933
+ "UnlinkDeveloperIdentity", "UnlinkIdentity",
1934
+ ]
1935
+
1936
+
1937
+ # ===========================================================================
1938
+ # STATE
1939
+ # ===========================================================================
1940
+
1941
+ def get_state() -> dict:
1942
+ return {
1943
+ "user_pools": {"count": len(_user_pools), "ids": list(_user_pools.keys())},
1944
+ "identity_pools": {"count": len(_identity_pools), "ids": list(_identity_pools.keys())},
1945
+ "pool_domain_map": {"count": len(_pool_domain_map), "domains": list(_pool_domain_map.keys())},
1946
+ "identity_tags": {"count": len(_identity_tags), "arns": list(_identity_tags.keys())},
1947
+ }
1948
+
1949
+
1950
  # ===========================================================================
1951
  # RESET
1952
  # ===========================================================================
aws_infra/aws_infra/services/dynamodb.py CHANGED
@@ -1801,6 +1801,29 @@ def _diff_attributes(old_item, new_item, return_old=True):
1801
  return result
1802
 
1803
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1804
  def reset():
1805
  with _lock:
1806
  _tables.clear()
 
1801
  return result
1802
 
1803
 
1804
+ SUPPORTED_ACTIONS = [
1805
+ "CreateTable", "DeleteTable", "DescribeTable", "ListTables", "UpdateTable",
1806
+ "PutItem", "GetItem", "DeleteItem", "UpdateItem",
1807
+ "Query", "Scan",
1808
+ "BatchWriteItem", "BatchGetItem",
1809
+ "TransactWriteItems", "TransactGetItems",
1810
+ "DescribeTimeToLive", "UpdateTimeToLive",
1811
+ "DescribeContinuousBackups", "UpdateContinuousBackups",
1812
+ "DescribeEndpoints",
1813
+ "TagResource", "UntagResource", "ListTagsOfResource",
1814
+ ]
1815
+
1816
+
1817
+ def get_state() -> dict:
1818
+ return {
1819
+ "tables": {"count": len(_tables), "names": list(_tables.keys())},
1820
+ "tags": {"count": len(_tags), "names": list(_tags.keys())},
1821
+ "ttl_settings": {"count": len(_ttl_settings), "names": list(_ttl_settings.keys())},
1822
+ "pitr_settings": {"count": len(_pitr_settings), "names": list(_pitr_settings.keys())},
1823
+ "stream_records": {"count": len(_stream_records), "names": list(_stream_records.keys())},
1824
+ }
1825
+
1826
+
1827
  def reset():
1828
  with _lock:
1829
  _tables.clear()
aws_infra/aws_infra/services/ec2.py CHANGED
@@ -2224,6 +2224,74 @@ def _delete_egress_only_igw(params):
2224
  return _xml(200, "DeleteEgressOnlyInternetGatewayResponse", "<returnCode>true</returnCode>")
2225
 
2226
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2227
  # ---------------------------------------------------------------------------
2228
  # Reset
2229
  # ---------------------------------------------------------------------------
 
2224
  return _xml(200, "DeleteEgressOnlyInternetGatewayResponse", "<returnCode>true</returnCode>")
2225
 
2226
 
2227
+ # ---------------------------------------------------------------------------
2228
+ # Supported Actions
2229
+ # ---------------------------------------------------------------------------
2230
+
2231
+ SUPPORTED_ACTIONS = [
2232
+ "RunInstances", "TerminateInstances", "DescribeInstances", "StartInstances",
2233
+ "StopInstances", "RebootInstances", "DescribeImages", "CreateSecurityGroup",
2234
+ "DeleteSecurityGroup", "DescribeSecurityGroups",
2235
+ "AuthorizeSecurityGroupIngress", "RevokeSecurityGroupIngress",
2236
+ "AuthorizeSecurityGroupEgress", "RevokeSecurityGroupEgress",
2237
+ "CreateKeyPair", "DeleteKeyPair", "DescribeKeyPairs", "ImportKeyPair",
2238
+ "DescribeVpcs", "DescribeSubnets", "DescribeAvailabilityZones",
2239
+ "CreateVpc", "DeleteVpc", "CreateSubnet", "DeleteSubnet",
2240
+ "CreateInternetGateway", "DeleteInternetGateway",
2241
+ "DescribeInternetGateways", "AttachInternetGateway",
2242
+ "DetachInternetGateway", "AllocateAddress", "ReleaseAddress",
2243
+ "AssociateAddress", "DisassociateAddress", "DescribeAddresses",
2244
+ "CreateTags", "DeleteTags", "DescribeTags", "ModifyVpcAttribute",
2245
+ "ModifySubnetAttribute", "CreateRouteTable", "DeleteRouteTable",
2246
+ "DescribeRouteTables", "AssociateRouteTable", "DisassociateRouteTable",
2247
+ "CreateRoute", "ReplaceRoute", "DeleteRoute", "CreateNetworkInterface",
2248
+ "DeleteNetworkInterface", "DescribeNetworkInterfaces",
2249
+ "AttachNetworkInterface", "DetachNetworkInterface", "CreateVpcEndpoint",
2250
+ "DeleteVpcEndpoints", "DescribeVpcEndpoints", "CreateVolume",
2251
+ "DeleteVolume", "DescribeVolumes", "DescribeVolumeStatus", "AttachVolume",
2252
+ "DetachVolume", "ModifyVolume", "DescribeVolumesModifications",
2253
+ "EnableVolumeIO", "ModifyVolumeAttribute", "DescribeVolumeAttribute",
2254
+ "CreateSnapshot", "DeleteSnapshot", "DescribeSnapshots",
2255
+ "ModifySnapshotAttribute", "DescribeSnapshotAttribute", "CopySnapshot",
2256
+ "CreateNatGateway", "DescribeNatGateways", "DeleteNatGateway",
2257
+ "CreateNetworkAcl", "DescribeNetworkAcls", "DeleteNetworkAcl",
2258
+ "CreateNetworkAclEntry", "DeleteNetworkAclEntry",
2259
+ "ReplaceNetworkAclEntry", "ReplaceNetworkAclAssociation",
2260
+ "CreateFlowLogs", "DescribeFlowLogs", "DeleteFlowLogs",
2261
+ "CreateVpcPeeringConnection", "AcceptVpcPeeringConnection",
2262
+ "DescribeVpcPeeringConnections", "DeleteVpcPeeringConnection",
2263
+ "CreateDhcpOptions", "AssociateDhcpOptions", "DescribeDhcpOptions",
2264
+ "DeleteDhcpOptions", "CreateEgressOnlyInternetGateway",
2265
+ "DescribeEgressOnlyInternetGateways", "DeleteEgressOnlyInternetGateway",
2266
+ ]
2267
+
2268
+
2269
+ # ---------------------------------------------------------------------------
2270
+ # State
2271
+ # ---------------------------------------------------------------------------
2272
+
2273
+ def get_state() -> dict:
2274
+ return {
2275
+ "instances": {"count": len(_instances), "ids": list(_instances.keys())},
2276
+ "security_groups": {"count": len(_security_groups), "ids": list(_security_groups.keys())},
2277
+ "vpcs": {"count": len(_vpcs), "ids": list(_vpcs.keys())},
2278
+ "subnets": {"count": len(_subnets), "ids": list(_subnets.keys())},
2279
+ "volumes": {"count": len(_volumes), "ids": list(_volumes.keys())},
2280
+ "key_pairs": {"count": len(_key_pairs), "names": list(_key_pairs.keys())},
2281
+ "internet_gateways": {"count": len(_internet_gateways), "ids": list(_internet_gateways.keys())},
2282
+ "nat_gateways": {"count": len(_nat_gateways), "ids": list(_nat_gateways.keys())},
2283
+ "route_tables": {"count": len(_route_tables), "ids": list(_route_tables.keys())},
2284
+ "network_interfaces": {"count": len(_network_interfaces), "ids": list(_network_interfaces.keys())},
2285
+ "vpc_endpoints": {"count": len(_vpc_endpoints), "ids": list(_vpc_endpoints.keys())},
2286
+ "snapshots": {"count": len(_snapshots), "ids": list(_snapshots.keys())},
2287
+ "network_acls": {"count": len(_network_acls), "ids": list(_network_acls.keys())},
2288
+ "flow_logs": {"count": len(_flow_logs), "ids": list(_flow_logs.keys())},
2289
+ "vpc_peering": {"count": len(_vpc_peering), "ids": list(_vpc_peering.keys())},
2290
+ "dhcp_options": {"count": len(_dhcp_options), "ids": list(_dhcp_options.keys())},
2291
+ "egress_igws": {"count": len(_egress_igws), "ids": list(_egress_igws.keys())},
2292
+ }
2293
+
2294
+
2295
  # ---------------------------------------------------------------------------
2296
  # Reset
2297
  # ---------------------------------------------------------------------------
aws_infra/aws_infra/services/ecs.py CHANGED
@@ -1229,6 +1229,26 @@ _ACTION_MAP = {
1229
  }
1230
 
1231
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1232
  def reset():
1233
  docker_client = _get_docker()
1234
  if docker_client:
 
1229
  }
1230
 
1231
 
1232
+ SUPPORTED_ACTIONS = [
1233
+ "CreateCluster", "DeleteCluster", "DescribeClusters", "ListClusters", "UpdateCluster",
1234
+ "UpdateClusterSettings", "RegisterTaskDefinition", "DeregisterTaskDefinition",
1235
+ "DescribeTaskDefinition", "ListTaskDefinitions", "CreateService", "DeleteService",
1236
+ "DescribeServices", "UpdateService", "ListServices", "RunTask", "StopTask",
1237
+ "DescribeTasks", "ListTasks", "TagResource", "UntagResource", "ListTagsForResource",
1238
+ "ExecuteCommand", "ListAccountSettings", "PutAccountSetting", "CreateCapacityProvider",
1239
+ "DeleteCapacityProvider", "DescribeCapacityProviders", "PutClusterCapacityProviders",
1240
+ ]
1241
+
1242
+
1243
+ def get_state() -> dict:
1244
+ return {
1245
+ "clusters": {"count": len(_clusters), "names": list(_clusters.keys())},
1246
+ "task_definitions": {"count": len(_task_defs), "names": list(_task_defs.keys())},
1247
+ "services": {"count": len(_services), "names": list(_services.keys())},
1248
+ "tasks": {"count": len(_tasks), "ids": list(_tasks.keys())},
1249
+ }
1250
+
1251
+
1252
  def reset():
1253
  docker_client = _get_docker()
1254
  if docker_client:
aws_infra/aws_infra/services/efs.py CHANGED
@@ -497,6 +497,38 @@ def _error(status, code, message):
497
  return status, {"Content-Type": "application/json", "x-amzn-errortype": code}, body
498
 
499
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
500
  # ---------------------------------------------------------------------------
501
  # Reset
502
  # ---------------------------------------------------------------------------
 
497
  return status, {"Content-Type": "application/json", "x-amzn-errortype": code}, body
498
 
499
 
500
+ # ---------------------------------------------------------------------------
501
+ # Supported Actions
502
+ # ---------------------------------------------------------------------------
503
+
504
+ SUPPORTED_ACTIONS = [
505
+ "CreateFileSystem", "DeleteFileSystem", "DescribeFileSystems",
506
+ "DescribeFileSystemPolicy", "PutFileSystemPolicy",
507
+ "DeleteFileSystemPolicy", "CreateMountTarget", "DeleteMountTarget",
508
+ "DescribeMountTargets", "ModifyMountTargetSecurityGroups",
509
+ "CreateAccessPoint", "DeleteAccessPoint", "DescribeAccessPoints",
510
+ "TagResource", "UntagResource", "ListTagsForResource",
511
+ "CreateReplicationConfiguration", "DeleteReplicationConfiguration",
512
+ "DescribeReplicationConfigurations", "PutLifecycleConfiguration",
513
+ "GetLifecycleConfiguration", "PutBackupPolicy", "GetBackupPolicy",
514
+ "DescribeAccountPreferences", "PutAccountPreferences",
515
+ ]
516
+
517
+
518
+ # ---------------------------------------------------------------------------
519
+ # State
520
+ # ---------------------------------------------------------------------------
521
+
522
+ def get_state() -> dict:
523
+ return {
524
+ "file_systems": {"count": len(_file_systems), "ids": list(_file_systems.keys())},
525
+ "mount_targets": {"count": len(_mount_targets), "ids": list(_mount_targets.keys())},
526
+ "access_points": {"count": len(_access_points), "ids": list(_access_points.keys())},
527
+ "lifecycle_configs": {"count": len(_lifecycle_configs), "file_systems": list(_lifecycle_configs.keys())},
528
+ "backup_policies": {"count": len(_backup_policies), "file_systems": list(_backup_policies.keys())},
529
+ }
530
+
531
+
532
  # ---------------------------------------------------------------------------
533
  # Reset
534
  # ---------------------------------------------------------------------------
aws_infra/aws_infra/services/elasticache.py CHANGED
@@ -1266,6 +1266,32 @@ def _error(code, message, status):
1266
  return status, {"Content-Type": "application/xml"}, body
1267
 
1268
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1269
  def reset():
1270
  docker_client = _get_docker()
1271
  if docker_client:
 
1266
  return status, {"Content-Type": "application/xml"}, body
1267
 
1268
 
1269
+ SUPPORTED_ACTIONS = [
1270
+ "CreateCacheCluster", "DeleteCacheCluster", "DescribeCacheClusters", "ModifyCacheCluster",
1271
+ "RebootCacheCluster", "CreateReplicationGroup", "DeleteReplicationGroup",
1272
+ "DescribeReplicationGroups", "ModifyReplicationGroup", "IncreaseReplicaCount",
1273
+ "DecreaseReplicaCount", "CreateCacheSubnetGroup", "DescribeCacheSubnetGroups",
1274
+ "DeleteCacheSubnetGroup", "ModifyCacheSubnetGroup", "CreateCacheParameterGroup",
1275
+ "DescribeCacheParameterGroups", "DeleteCacheParameterGroup", "DescribeCacheParameters",
1276
+ "ModifyCacheParameterGroup", "ResetCacheParameterGroup", "CreateUser", "DescribeUsers",
1277
+ "DeleteUser", "ModifyUser", "CreateUserGroup", "DescribeUserGroups", "DeleteUserGroup",
1278
+ "ModifyUserGroup", "DescribeCacheEngineVersions", "ListTagsForResource",
1279
+ "AddTagsToResource", "RemoveTagsFromResource", "CreateSnapshot", "DeleteSnapshot",
1280
+ "DescribeSnapshots", "DescribeEvents",
1281
+ ]
1282
+
1283
+
1284
+ def get_state() -> dict:
1285
+ return {
1286
+ "clusters": {"count": len(_clusters), "ids": list(_clusters.keys())},
1287
+ "replication_groups": {"count": len(_replication_groups), "ids": list(_replication_groups.keys())},
1288
+ "users": {"count": len(_users), "ids": list(_users.keys())},
1289
+ "subnet_groups": {"count": len(_subnet_groups), "ids": list(_subnet_groups.keys())},
1290
+ "parameter_groups": {"count": len(_param_groups), "ids": list(_param_groups.keys())},
1291
+ "snapshots": {"count": len(_snapshots), "ids": list(_snapshots.keys())},
1292
+ }
1293
+
1294
+
1295
  def reset():
1296
  docker_client = _get_docker()
1297
  if docker_client:
aws_infra/aws_infra/services/emr.py CHANGED
@@ -568,6 +568,37 @@ async def handle_request(method, path, headers, body, query_params):
568
  return handler(data)
569
 
570
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
571
  # ---------------------------------------------------------------------------
572
  # Reset
573
  # ---------------------------------------------------------------------------
 
568
  return handler(data)
569
 
570
 
571
+ # ---------------------------------------------------------------------------
572
+ # Supported Actions
573
+ # ---------------------------------------------------------------------------
574
+
575
+ SUPPORTED_ACTIONS = [
576
+ "CreateCluster", "DescribeCluster", "ListClusters", "TerminateJobFlows",
577
+ "SetTerminationProtection", "AddJobFlowSteps", "DescribeStep",
578
+ "ListSteps", "ModifyInstanceGroups",
579
+ "GetBlockPublicAccessConfiguration",
580
+ "PutBlockPublicAccessConfiguration", "ListInstances",
581
+ "DescribeInstance", "ListBootstrapActions", "GetAutoScalingPolicy",
582
+ "PutAutoScalingPolicy", "RemoveAutoScalingPolicy",
583
+ "ListSecurityConfigurations", "CreateSecurityConfiguration",
584
+ "DeleteSecurityConfiguration", "DescribeSecurityConfiguration",
585
+ "ListStudios", "CreateStudio", "DeleteStudio", "DescribeStudio",
586
+ "ListStudioSessions", "CreateStudioSession", "DeleteStudioSession",
587
+ "GetStudioSessionMapping", "CreateStudioSessionMapping",
588
+ "UpdateStudioSessionMapping", "DeleteStudioSessionMapping",
589
+ ]
590
+
591
+
592
+ # ---------------------------------------------------------------------------
593
+ # State
594
+ # ---------------------------------------------------------------------------
595
+
596
+ def get_state() -> dict:
597
+ return {
598
+ "clusters": {"count": len(_clusters), "ids": list(_clusters.keys())},
599
+ }
600
+
601
+
602
  # ---------------------------------------------------------------------------
603
  # Reset
604
  # ---------------------------------------------------------------------------
aws_infra/aws_infra/services/eventbridge.py CHANGED
@@ -991,6 +991,29 @@ def _update_api_destination(data):
991
  })
992
 
993
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
994
  def reset():
995
  global _event_buses
996
  _rules.clear()
 
991
  })
992
 
993
 
994
+ SUPPORTED_ACTIONS = [
995
+ "CreateEventBus", "DeleteEventBus", "ListEventBuses", "DescribeEventBus",
996
+ "PutRule", "DeleteRule", "ListRules", "DescribeRule", "EnableRule", "DisableRule",
997
+ "PutTargets", "RemoveTargets", "ListTargetsByRule", "PutEvents",
998
+ "TagResource", "UntagResource", "ListTagsForResource",
999
+ "CreateArchive", "DeleteArchive", "DescribeArchive", "ListArchives",
1000
+ "PutPermission", "RemovePermission",
1001
+ "CreateConnection", "DescribeConnection", "DeleteConnection", "ListConnections",
1002
+ "UpdateConnection", "CreateApiDestination", "DescribeApiDestination",
1003
+ "DeleteApiDestination", "ListApiDestinations", "UpdateApiDestination",
1004
+ ]
1005
+
1006
+
1007
+ def get_state() -> dict:
1008
+ return {
1009
+ "event_buses": {"count": len(_event_buses), "names": list(_event_buses.keys())},
1010
+ "rules": {"count": len(_rules), "names": list(_rules.keys())},
1011
+ "archives": {"count": len(_archives), "names": list(_archives.keys())},
1012
+ "connections": {"count": len(_connections), "names": list(_connections.keys())},
1013
+ "api_destinations": {"count": len(_api_destinations), "names": list(_api_destinations.keys())},
1014
+ }
1015
+
1016
+
1017
  def reset():
1018
  global _event_buses
1019
  _rules.clear()
aws_infra/aws_infra/services/firehose.py CHANGED
@@ -41,6 +41,20 @@ _lock = threading.Lock()
41
  _dest_counter = 0
42
 
43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  def reset():
45
  global _streams, _dest_counter
46
  with _lock:
 
41
  _dest_counter = 0
42
 
43
 
44
+ SUPPORTED_ACTIONS = [
45
+ "CreateDeliveryStream", "DeleteDeliveryStream", "DescribeDeliveryStream",
46
+ "ListDeliveryStreams", "PutRecord", "PutRecordBatch", "UpdateDestination",
47
+ "StartDeliveryStreamEncryption", "StopDeliveryStreamEncryption",
48
+ "ListTagsForResource", "TagResource", "UntagResource",
49
+ ]
50
+
51
+
52
+ def get_state() -> dict:
53
+ return {
54
+ "delivery_streams": {"count": len(_streams), "names": list(_streams.keys())},
55
+ }
56
+
57
+
58
  def reset():
59
  global _streams, _dest_counter
60
  with _lock:
aws_infra/aws_infra/services/glue.py CHANGED
@@ -1074,6 +1074,36 @@ def _simple_glob_match(pattern, name):
1074
  return fnmatch.fnmatch(name, pattern)
1075
 
1076
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1077
  def reset():
1078
  _databases.clear()
1079
  _tables.clear()
 
1074
  return fnmatch.fnmatch(name, pattern)
1075
 
1076
 
1077
+ SUPPORTED_ACTIONS = [
1078
+ "CreateDatabase", "DeleteDatabase", "GetDatabase", "GetDatabases", "UpdateDatabase",
1079
+ "CreateTable", "DeleteTable", "GetTable", "GetTables", "UpdateTable", "BatchDeleteTable",
1080
+ "CreatePartition", "DeletePartition", "GetPartition", "GetPartitions",
1081
+ "BatchCreatePartition", "BatchGetPartition", "CreatePartitionIndex", "GetPartitionIndexes",
1082
+ "CreateConnection", "DeleteConnection", "GetConnection", "GetConnections",
1083
+ "CreateCrawler", "DeleteCrawler", "GetCrawler", "GetCrawlers", "UpdateCrawler",
1084
+ "StartCrawler", "StopCrawler", "GetCrawlerMetrics", "CreateJob", "DeleteJob", "GetJob",
1085
+ "GetJobs", "UpdateJob", "StartJobRun", "GetJobRun", "GetJobRuns", "BatchStopJobRun",
1086
+ "CreateSecurityConfiguration", "DeleteSecurityConfiguration", "GetSecurityConfiguration",
1087
+ "GetSecurityConfigurations", "ListSecurityConfigurations", "CreateClassifier",
1088
+ "DeleteClassifier", "GetClassifier", "GetClassifiers", "UpdateClassifier",
1089
+ "CreateTrigger", "DeleteTrigger", "GetTrigger", "GetTriggers", "UpdateTrigger",
1090
+ "StartTrigger", "StopTrigger", "CreateWorkflow", "DeleteWorkflow", "GetWorkflow",
1091
+ "GetWorkflows", "UpdateWorkflow", "StartWorkflowRun", "GetWorkflowRun",
1092
+ "GetWorkflowRuns", "GetWorkflowRunProperties", "TagResource", "UntagResource",
1093
+ "ListTagsForResource",
1094
+ ]
1095
+
1096
+
1097
+ def get_state() -> dict:
1098
+ return {
1099
+ "databases": {"count": len(_databases), "names": list(_databases.keys())},
1100
+ "crawlers": {"count": len(_crawlers), "names": list(_crawlers.keys())},
1101
+ "jobs": {"count": len(_jobs), "names": list(_jobs.keys())},
1102
+ "connections": {"count": len(_connections), "names": list(_connections.keys())},
1103
+ "workflows": {"count": len(_workflows), "names": list(_workflows.keys())},
1104
+ }
1105
+
1106
+
1107
  def reset():
1108
  _databases.clear()
1109
  _tables.clear()
aws_infra/aws_infra/services/iam_sts.py CHANGED
@@ -1532,6 +1532,43 @@ _IAM_HANDLERS = {
1532
  }
1533
 
1534
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1535
  def reset():
1536
  _users.clear()
1537
  _roles.clear()
 
1532
  }
1533
 
1534
 
1535
+ SUPPORTED_ACTIONS = [
1536
+ "CreateUser", "GetUser", "ListUsers", "DeleteUser",
1537
+ "CreateRole", "GetRole", "ListRoles", "DeleteRole",
1538
+ "CreatePolicy", "GetPolicy", "GetPolicyVersion", "ListPolicyVersions",
1539
+ "ListPolicies", "DeletePolicy", "CreatePolicyVersion", "DeletePolicyVersion",
1540
+ "AttachRolePolicy", "DetachRolePolicy", "ListAttachedRolePolicies",
1541
+ "PutRolePolicy", "GetRolePolicy", "DeleteRolePolicy", "ListRolePolicies",
1542
+ "AttachUserPolicy", "DetachUserPolicy", "ListAttachedUserPolicies",
1543
+ "PutUserPolicy", "GetUserPolicy", "DeleteUserPolicy", "ListUserPolicies",
1544
+ "CreateAccessKey", "ListAccessKeys", "DeleteAccessKey",
1545
+ "CreateInstanceProfile", "DeleteInstanceProfile", "GetInstanceProfile",
1546
+ "AddRoleToInstanceProfile", "RemoveRoleFromInstanceProfile",
1547
+ "ListInstanceProfiles", "ListInstanceProfilesForRole",
1548
+ "UpdateAssumeRolePolicy",
1549
+ "CreateGroup", "GetGroup", "DeleteGroup", "ListGroups",
1550
+ "AddUserToGroup", "RemoveUserFromGroup", "ListGroupsForUser",
1551
+ "CreateServiceLinkedRole",
1552
+ "CreateOpenIDConnectProvider", "GetOpenIDConnectProvider", "DeleteOpenIDConnectProvider",
1553
+ "TagRole", "UntagRole", "ListRoleTags",
1554
+ "TagUser", "UntagUser", "ListUserTags",
1555
+ "TagPolicy", "UntagPolicy", "ListPolicyTags",
1556
+ "SimulatePrincipalPolicy", "SimulateCustomPolicy",
1557
+ "GetCallerIdentity", "AssumeRole", "GetSessionToken",
1558
+ ]
1559
+
1560
+
1561
+ def get_state() -> dict:
1562
+ return {
1563
+ "users": {"count": len(_users), "names": list(_users.keys())},
1564
+ "roles": {"count": len(_roles), "names": list(_roles.keys())},
1565
+ "policies": {"count": len(_policies), "names": list(_policies.keys())},
1566
+ "instance_profiles": {"count": len(_instance_profiles), "names": list(_instance_profiles.keys())},
1567
+ "groups": {"count": len(_groups), "names": list(_groups.keys())},
1568
+ "oidc_providers": {"count": len(_oidc_providers), "names": list(_oidc_providers.keys())},
1569
+ }
1570
+
1571
+
1572
  def reset():
1573
  _users.clear()
1574
  _roles.clear()
aws_infra/aws_infra/services/kinesis.py CHANGED
@@ -899,6 +899,25 @@ def _stream_desc(stream, shard_ids=None):
899
  }
900
 
901
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
902
  def reset():
903
  _streams.clear()
904
  _shard_iterators.clear()
 
899
  }
900
 
901
 
902
+ SUPPORTED_ACTIONS = [
903
+ "CreateStream", "DeleteStream", "DescribeStream", "DescribeStreamSummary",
904
+ "ListStreams", "PutRecord", "PutRecords", "GetShardIterator", "GetRecords",
905
+ "MergeShards", "SplitShard", "UpdateShardCount", "ListShards",
906
+ "IncreaseStreamRetentionPeriod", "DecreaseStreamRetentionPeriod",
907
+ "AddTagsToStream", "RemoveTagsFromStream", "ListTagsForStream",
908
+ "RegisterStreamConsumer", "DeregisterStreamConsumer", "ListStreamConsumers",
909
+ "DescribeStreamConsumer", "StartStreamEncryption", "StopStreamEncryption",
910
+ "EnableEnhancedMonitoring", "DisableEnhancedMonitoring",
911
+ ]
912
+
913
+
914
+ def get_state() -> dict:
915
+ return {
916
+ "streams": {"count": len(_streams), "names": list(_streams.keys())},
917
+ "consumers": {"count": len(_consumers), "names": list(_consumers.keys())},
918
+ }
919
+
920
+
921
  def reset():
922
  _streams.clear()
923
  _shard_iterators.clear()
aws_infra/aws_infra/services/lambda_svc.py CHANGED
@@ -2531,6 +2531,36 @@ def _list_function_url_configs(func_name: str, query_params: dict):
2531
  return json_response({"FunctionUrlConfigs": configs})
2532
 
2533
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2534
  def reset():
2535
  from aws_infra.core import lambda_runtime
2536
 
 
2531
  return json_response({"FunctionUrlConfigs": configs})
2532
 
2533
 
2534
+ SUPPORTED_ACTIONS = [
2535
+ "CreateFunction", "DeleteFunction", "GetFunction", "GetFunctionConfiguration",
2536
+ "ListFunctions", "Invoke",
2537
+ "UpdateFunctionCode", "UpdateFunctionConfiguration",
2538
+ "PublishVersion", "ListVersionsByFunction",
2539
+ "CreateAlias", "GetAlias", "UpdateAlias", "DeleteAlias", "ListAliases",
2540
+ "AddPermission", "RemovePermission", "GetPolicy",
2541
+ "ListTags", "TagResource", "UntagResource",
2542
+ "PublishLayerVersion", "GetLayerVersion", "GetLayerVersionByArn",
2543
+ "ListLayerVersions", "DeleteLayerVersion", "ListLayers",
2544
+ "AddLayerVersionPermission", "RemoveLayerVersionPermission", "GetLayerVersionPolicy",
2545
+ "CreateEventSourceMapping", "DeleteEventSourceMapping",
2546
+ "GetEventSourceMapping", "ListEventSourceMappings", "UpdateEventSourceMapping",
2547
+ "GetFunctionEventInvokeConfig", "PutFunctionEventInvokeConfig",
2548
+ "PutFunctionConcurrency", "GetFunctionConcurrency", "DeleteFunctionConcurrency",
2549
+ "GetFunctionCodeSigningConfig",
2550
+ "CreateFunctionUrlConfig", "GetFunctionUrlConfig",
2551
+ "UpdateFunctionUrlConfig", "DeleteFunctionUrlConfig", "ListFunctionUrlConfigs",
2552
+ ]
2553
+
2554
+
2555
+ def get_state() -> dict:
2556
+ return {
2557
+ "functions": {"count": len(_functions), "names": list(_functions.keys())},
2558
+ "layers": {"count": len(_layers), "names": list(_layers.keys())},
2559
+ "event_source_mappings": {"count": len(_esms), "ids": list(_esms.keys())},
2560
+ "function_urls": {"count": len(_function_urls), "keys": list(_function_urls.keys())},
2561
+ }
2562
+
2563
+
2564
  def reset():
2565
  from aws_infra.core import lambda_runtime
2566
 
aws_infra/aws_infra/services/rds.py CHANGED
@@ -1972,6 +1972,34 @@ _ACTION_MAP = {
1972
  }
1973
 
1974
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1975
  def reset():
1976
  docker_client = _get_docker()
1977
  if docker_client:
 
1972
  }
1973
 
1974
 
1975
+ SUPPORTED_ACTIONS = [
1976
+ "CreateDBInstance", "DeleteDBInstance", "DescribeDBInstances", "ModifyDBInstance",
1977
+ "StartDBInstance", "StopDBInstance", "RebootDBInstance", "CreateDBCluster",
1978
+ "DeleteDBCluster", "DescribeDBClusters", "ModifyDBCluster", "StartDBCluster",
1979
+ "StopDBCluster", "CreateDBSubnetGroup", "DeleteDBSubnetGroup", "DescribeDBSubnetGroups",
1980
+ "ModifyDBSubnetGroup", "CreateDBParameterGroup", "DeleteDBParameterGroup",
1981
+ "DescribeDBParameterGroups", "DescribeDBParameters", "ModifyDBParameterGroup",
1982
+ "CreateDBClusterParameterGroup", "DescribeDBClusterParameterGroups",
1983
+ "DeleteDBClusterParameterGroup", "DescribeDBClusterParameters",
1984
+ "ModifyDBClusterParameterGroup", "CreateDBSnapshot", "DeleteDBSnapshot",
1985
+ "DescribeDBSnapshots", "CreateDBClusterSnapshot", "DescribeDBClusterSnapshots",
1986
+ "DeleteDBClusterSnapshot", "CreateOptionGroup", "DeleteOptionGroup",
1987
+ "DescribeOptionGroups", "DescribeOptionGroupOptions", "CreateDBInstanceReadReplica",
1988
+ "RestoreDBInstanceFromDBSnapshot", "ListTagsForResource", "AddTagsToResource",
1989
+ "RemoveTagsFromResource", "DescribeDBEngineVersions", "DescribeOrderableDBInstanceOptions",
1990
+ ]
1991
+
1992
+
1993
+ def get_state() -> dict:
1994
+ return {
1995
+ "instances": {"count": len(_instances), "ids": list(_instances.keys())},
1996
+ "clusters": {"count": len(_clusters), "ids": list(_clusters.keys())},
1997
+ "subnet_groups": {"count": len(_subnet_groups), "names": list(_subnet_groups.keys())},
1998
+ "snapshots": {"count": len(_snapshots), "ids": list(_snapshots.keys())},
1999
+ "db_cluster_snapshots": {"count": len(_db_cluster_snapshots), "ids": list(_db_cluster_snapshots.keys())},
2000
+ }
2001
+
2002
+
2003
  def reset():
2004
  docker_client = _get_docker()
2005
  if docker_client:
aws_infra/aws_infra/services/route53.py CHANGED
@@ -45,6 +45,28 @@ _hc_caller_refs: dict = {} # caller_reference -> hc_id
45
  _lock = threading.Lock()
46
 
47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  def reset():
49
  global _zones, _records, _changes, _health_checks, _tags, _caller_refs, _hc_caller_refs
50
  with _lock:
 
45
  _lock = threading.Lock()
46
 
47
 
48
+ SUPPORTED_ACTIONS = [
49
+ "CreateHostedZone", "DeleteHostedZone", "ListHostedZones", "GetHostedZone",
50
+ "UpdateHostedZoneComment", "GetChange", "ListResourceRecordSets",
51
+ "ChangeResourceRecordSets", "GetHostedZoneCount", "GetDNSSEC", "CreateHealthCheck",
52
+ "DeleteHealthCheck", "GetHealthCheck", "ListHealthChecks", "UpdateHealthCheckComment",
53
+ "GetHealthCheckStatus", "GetHealthCheckCount", "ChangeTagsForResource",
54
+ "ListTagsForResource", "ListTagsForResources", "CreateQueryLoggingConfig",
55
+ "DeleteQueryLoggingConfig", "ListQueryLoggingConfigs", "GetQueryLoggingConfig",
56
+ "ListHostedZonesByName", "CreateReusableDelegationSet", "DeleteReusableDelegationSet",
57
+ "ListReusableDelegationSets", "GetReusableDelegationSet",
58
+ ]
59
+
60
+
61
+ def get_state() -> dict:
62
+ return {
63
+ "hosted_zones": {"count": len(_zones), "ids": list(_zones.keys())},
64
+ "health_checks": {"count": len(_health_checks), "ids": list(_health_checks.keys())},
65
+ "tags": {"count": len(_tags), "resources": list(_tags.keys())},
66
+ "record_sets": {"count": sum(len(recs) for recs in _records.values())},
67
+ }
68
+
69
+
70
  def reset():
71
  global _zones, _records, _changes, _health_checks, _tags, _caller_refs, _hc_caller_refs
72
  with _lock:
aws_infra/aws_infra/services/s3.py CHANGED
@@ -2718,6 +2718,39 @@ def _load_persisted_data():
2718
  _load_persisted_data()
2719
 
2720
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2721
  def reset():
2722
  """Wipe all in-memory state (used by /_ministack/reset)."""
2723
  global _buckets, _bucket_policies, _bucket_notifications, _bucket_tags
 
2718
  _load_persisted_data()
2719
 
2720
 
2721
+ SUPPORTED_ACTIONS = [
2722
+ "CreateBucket", "DeleteBucket", "ListBuckets", "HeadBucket",
2723
+ "PutObject", "GetObject", "DeleteObject", "HeadObject", "CopyObject",
2724
+ "ListObjectsV1", "ListObjectsV2", "DeleteObjects",
2725
+ "PutObjectTagging", "GetObjectTagging", "DeleteObjectTagging",
2726
+ "ListObjectVersions", "PutBucketVersioning", "GetBucketVersioning",
2727
+ "PutBucketPolicy", "GetBucketPolicy", "DeleteBucketPolicy",
2728
+ "PutBucketNotificationConfiguration", "GetBucketNotificationConfiguration",
2729
+ "PutBucketEncryption", "GetBucketEncryption", "DeleteBucketEncryption",
2730
+ "PutBucketLifecycleConfiguration", "GetBucketLifecycleConfiguration", "DeleteBucketLifecycle",
2731
+ "PutBucketCors", "GetBucketCors", "DeleteBucketCors",
2732
+ "PutBucketAcl", "GetBucketAcl",
2733
+ "PutBucketWebsite", "GetBucketWebsite", "DeleteBucketWebsite",
2734
+ "PutBucketLogging", "GetBucketLogging",
2735
+ "PutBucketAccelerateConfiguration", "GetBucketAccelerateConfiguration",
2736
+ "PutBucketRequestPayment", "GetBucketRequestPayment",
2737
+ "PutObjectLockConfiguration", "GetObjectLockConfiguration",
2738
+ "PutObjectRetention", "GetObjectRetention",
2739
+ "PutObjectLegalHold", "GetObjectLegalHold",
2740
+ "PutBucketReplication", "GetBucketReplication", "DeleteBucketReplication",
2741
+ "CreateMultipartUpload", "UploadPart", "CompleteMultipartUpload",
2742
+ "AbortMultipartUpload", "ListMultipartUploads",
2743
+ "GetBucketLocation",
2744
+ "GetBucketTagging", "PutBucketTagging", "DeleteBucketTagging",
2745
+ ]
2746
+
2747
+
2748
+ def get_state() -> dict:
2749
+ return {
2750
+ "buckets": {"count": len(_buckets), "names": list(_buckets.keys())},
2751
+ }
2752
+
2753
+
2754
  def reset():
2755
  """Wipe all in-memory state (used by /_ministack/reset)."""
2756
  global _buckets, _bucket_policies, _bucket_notifications, _bucket_tags
aws_infra/aws_infra/services/secretsmanager.py CHANGED
@@ -708,6 +708,23 @@ def _validate_resource_policy(data):
708
  })
709
 
710
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
711
  def reset():
712
  _secrets.clear()
713
  _resource_policies.clear()
 
708
  })
709
 
710
 
711
+ SUPPORTED_ACTIONS = [
712
+ "CreateSecret", "GetSecretValue", "ListSecrets", "DeleteSecret",
713
+ "RestoreSecret", "UpdateSecret", "DescribeSecret", "PutSecretValue",
714
+ "TagResource", "UntagResource", "ListSecretVersionIds",
715
+ "RotateSecret", "GetRandomPassword", "ReplicateSecretToRegions",
716
+ "PutResourcePolicy", "GetResourcePolicy", "DeleteResourcePolicy",
717
+ "ValidateResourcePolicy",
718
+ ]
719
+
720
+
721
+ def get_state() -> dict:
722
+ return {
723
+ "secrets": {"count": len(_secrets), "names": list(_secrets.keys())},
724
+ "resource_policies": {"count": len(_resource_policies), "arns": list(_resource_policies.keys())},
725
+ }
726
+
727
+
728
  def reset():
729
  _secrets.clear()
730
  _resource_policies.clear()
aws_infra/aws_infra/services/ses.py CHANGED
@@ -1002,6 +1002,28 @@ def _json_error(code, message, status):
1002
  return _json_response(status, {"__type": code, "message": message})
1003
 
1004
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1005
  def reset():
1006
  _identities.clear()
1007
  _sent_emails.clear()
 
1002
  return _json_response(status, {"__type": code, "message": message})
1003
 
1004
 
1005
+ SUPPORTED_ACTIONS = [
1006
+ "SendEmail", "SendRawEmail", "SendTemplatedEmail", "SendBulkTemplatedEmail",
1007
+ "VerifyEmailIdentity", "VerifyEmailAddress", "VerifyDomainIdentity",
1008
+ "VerifyDomainDkim", "ListIdentities", "GetIdentityVerificationAttributes",
1009
+ "DeleteIdentity", "GetSendQuota", "GetSendStatistics",
1010
+ "ListVerifiedEmailAddresses", "CreateConfigurationSet",
1011
+ "DeleteConfigurationSet", "DescribeConfigurationSet", "ListConfigurationSets",
1012
+ "CreateTemplate", "GetTemplate", "DeleteTemplate", "ListTemplates",
1013
+ "UpdateTemplate", "GetIdentityDkimAttributes", "SetIdentityNotificationTopic",
1014
+ "SetIdentityFeedbackForwardingEnabled",
1015
+ ]
1016
+
1017
+
1018
+ def get_state() -> dict:
1019
+ return {
1020
+ "identities": {"count": len(_identities), "names": list(_identities.keys())},
1021
+ "templates": {"count": len(_templates), "names": list(_templates.keys())},
1022
+ "configuration_sets": {"count": len(_configuration_sets), "names": list(_configuration_sets.keys())},
1023
+ "sent_emails": {"count": len(_sent_emails)},
1024
+ }
1025
+
1026
+
1027
  def reset():
1028
  _identities.clear()
1029
  _sent_emails.clear()
aws_infra/aws_infra/services/ses_v2.py CHANGED
@@ -156,6 +156,23 @@ async def handle_request(method, path, headers, body, query_params):
156
  return _json_err("NotFoundException", f"Unknown SES v2 path: {method} {path}", 404)
157
 
158
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
  def reset():
160
  _identities.clear()
161
  _config_sets.clear()
 
156
  return _json_err("NotFoundException", f"Unknown SES v2 path: {method} {path}", 404)
157
 
158
 
159
+ SUPPORTED_ACTIONS = [
160
+ "SendEmail", "CreateEmailIdentity", "GetEmailIdentity", "DeleteEmailIdentity",
161
+ "ListEmailIdentities", "CreateConfigurationSet", "GetConfigurationSet",
162
+ "DeleteConfigurationSet", "ListConfigurationSets", "GetAccount",
163
+ "ListSuppressedDestinations", "PutAccountSuppressionAttributes",
164
+ "TagResource", "UntagResource", "ListTagsForResource",
165
+ ]
166
+
167
+
168
+ def get_state() -> dict:
169
+ return {
170
+ "identities": {"count": len(_identities), "names": list(_identities.keys())},
171
+ "configuration_sets": {"count": len(_config_sets), "names": list(_config_sets.keys())},
172
+ "tags": {"count": len(_ses_tags), "resources": list(_ses_tags.keys())},
173
+ }
174
+
175
+
176
  def reset():
177
  _identities.clear()
178
  _config_sets.clear()
aws_infra/aws_infra/services/sns.py CHANGED
@@ -950,6 +950,27 @@ def _build_envelope(topic_arn: str, msg_id: str, message: str, subject: str,
950
  return json.dumps({k: v for k, v in envelope.items() if v is not None})
951
 
952
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
953
  def reset():
954
  _topics.clear()
955
  _sub_arn_to_topic.clear()
 
950
  return json.dumps({k: v for k, v in envelope.items() if v is not None})
951
 
952
 
953
+ SUPPORTED_ACTIONS = [
954
+ "CreateTopic", "DeleteTopic", "ListTopics",
955
+ "GetTopicAttributes", "SetTopicAttributes",
956
+ "Subscribe", "Unsubscribe", "ConfirmSubscription",
957
+ "ListSubscriptions", "ListSubscriptionsByTopic",
958
+ "GetSubscriptionAttributes", "SetSubscriptionAttributes",
959
+ "Publish", "PublishBatch",
960
+ "ListTagsForResource", "TagResource", "UntagResource",
961
+ "CreatePlatformApplication", "CreatePlatformEndpoint",
962
+ ]
963
+
964
+
965
+ def get_state() -> dict:
966
+ return {
967
+ "topics": {"count": len(_topics), "names": list(_topics.keys())},
968
+ "platform_applications": {"count": len(_platform_applications), "names": list(_platform_applications.keys())},
969
+ "platform_endpoints": {"count": len(_platform_endpoints), "names": list(_platform_endpoints.keys())},
970
+ "subscriptions": {"count": len(_sub_arn_to_topic), "sub_arn_to_topic": dict(_sub_arn_to_topic.items())},
971
+ }
972
+
973
+
974
  def reset():
975
  _topics.clear()
976
  _sub_arn_to_topic.clear()
aws_infra/aws_infra/services/sqs.py CHANGED
@@ -1231,6 +1231,23 @@ def _url_from_path(path: str) -> str:
1231
  return ""
1232
 
1233
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1234
  def reset():
1235
  _queues.clear()
1236
  _queue_name_to_url.clear()
 
1231
  return ""
1232
 
1233
 
1234
+ SUPPORTED_ACTIONS = [
1235
+ "CreateQueue", "DeleteQueue", "ListQueues", "GetQueueUrl",
1236
+ "GetQueueAttributes", "SetQueueAttributes", "PurgeQueue",
1237
+ "SendMessage", "ReceiveMessage", "DeleteMessage",
1238
+ "ChangeMessageVisibility", "ChangeMessageVisibilityBatch",
1239
+ "SendMessageBatch", "DeleteMessageBatch",
1240
+ "ListQueueTags", "TagQueue", "UntagQueue",
1241
+ ]
1242
+
1243
+
1244
+ def get_state() -> dict:
1245
+ return {
1246
+ "queues": {"count": len(_queues), "names": list(_queues.keys())},
1247
+ "queue_name_to_url": dict(_queue_name_to_url),
1248
+ }
1249
+
1250
+
1251
  def reset():
1252
  _queues.clear()
1253
  _queue_name_to_url.clear()
aws_infra/aws_infra/services/ssm.py CHANGED
@@ -488,6 +488,21 @@ def _param_out(param, with_decryption=False):
488
  return out
489
 
490
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
491
  def reset():
492
  _parameters.clear()
493
  _parameter_history.clear()
 
488
  return out
489
 
490
 
491
+ SUPPORTED_ACTIONS = [
492
+ "PutParameter", "GetParameter", "GetParameters", "GetParametersByPath",
493
+ "DeleteParameter", "DeleteParameters", "DescribeParameters",
494
+ "GetParameterHistory", "LabelParameterVersion", "AddTagsToResource",
495
+ "RemoveTagsFromResource", "ListTagsForResource",
496
+ ]
497
+
498
+
499
+ def get_state() -> dict:
500
+ return {
501
+ "parameters": {"count": len(_parameters), "names": list(_parameters.keys())},
502
+ "tags": {"count": len(_tags), "arns": list(_tags.keys())},
503
+ }
504
+
505
+
506
  def reset():
507
  _parameters.clear()
508
  _parameter_history.clear()
aws_infra/aws_infra/services/stepfunctions.py CHANGED
@@ -1786,6 +1786,25 @@ _SERVICE_DISPATCH = {
1786
  }
1787
 
1788
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1789
  def reset():
1790
  _state_machines.clear()
1791
  _executions.clear()
 
1786
  }
1787
 
1788
 
1789
+ SUPPORTED_ACTIONS = [
1790
+ "CreateStateMachine", "DeleteStateMachine", "DescribeStateMachine", "UpdateStateMachine",
1791
+ "ListStateMachines", "StartExecution", "StartSyncExecution", "StopExecution",
1792
+ "DescribeExecution", "DescribeStateMachineForExecution", "ListExecutions",
1793
+ "GetExecutionHistory", "SendTaskSuccess", "SendTaskFailure", "SendTaskHeartbeat",
1794
+ "CreateActivity", "DeleteActivity", "DescribeActivity", "ListActivities",
1795
+ "GetActivityTask", "TagResource", "UntagResource", "ListTagsForResource",
1796
+ ]
1797
+
1798
+
1799
+ def get_state() -> dict:
1800
+ return {
1801
+ "state_machines": {"count": len(_state_machines), "names": list(_state_machines.keys())},
1802
+ "executions": {"count": len(_executions), "arns": list(_executions.keys())},
1803
+ "activities": {"count": len(_activities), "names": list(_activities.keys())},
1804
+ "tags": {"count": len(_tags), "resources": list(_tags.keys())},
1805
+ }
1806
+
1807
+
1808
  def reset():
1809
  _state_machines.clear()
1810
  _executions.clear()
aws_infra/aws_infra/services/waf.py CHANGED
@@ -358,6 +358,27 @@ def _describe_managed_rule_group(data):
358
  })
359
 
360
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
361
  def reset():
362
  _web_acls.clear()
363
  _ip_sets.clear()
 
358
  })
359
 
360
 
361
+ SUPPORTED_ACTIONS = [
362
+ "CreateWebACL", "GetWebACL", "UpdateWebACL", "DeleteWebACL", "ListWebACLs",
363
+ "AssociateWebACL", "DisassociateWebACL", "GetWebACLForResource",
364
+ "ListResourcesForWebACL", "CreateIPSet", "GetIPSet", "UpdateIPSet",
365
+ "DeleteIPSet", "ListIPSets", "CreateRuleGroup", "GetRuleGroup",
366
+ "UpdateRuleGroup", "DeleteRuleGroup", "ListRuleGroups",
367
+ "TagResource", "UntagResource", "ListTagsForResource",
368
+ "CheckCapacity", "DescribeManagedRuleGroup",
369
+ ]
370
+
371
+
372
+ def get_state() -> dict:
373
+ return {
374
+ "web_acls": {"count": len(_web_acls), "ids": list(_web_acls.keys())},
375
+ "ip_sets": {"count": len(_ip_sets), "ids": list(_ip_sets.keys())},
376
+ "rule_groups": {"count": len(_rule_groups), "ids": list(_rule_groups.keys())},
377
+ "associations": {"count": len(_associations), "resources": list(_associations.keys())},
378
+ "waf_tags": {"count": len(_waf_tags), "resources": list(_waf_tags.keys())},
379
+ }
380
+
381
+
382
  def reset():
383
  _web_acls.clear()
384
  _ip_sets.clear()
client.py CHANGED
@@ -10,12 +10,11 @@ from typing import Dict
10
 
11
  from openenv.core import EnvClient
12
  from openenv.core.client_types import StepResult
13
- from openenv.core.env_server.types import State
14
 
15
- from models import AwsRlAction, AwsRlObservation, EpisodeID, StepCount
16
 
17
 
18
- class AwsRlEnv(EnvClient[AwsRlAction, AwsRlObservation, State]):
19
  """
20
  Client for the Aws Rl Env Environment.
21
 
@@ -65,9 +64,19 @@ class AwsRlEnv(EnvClient[AwsRlAction, AwsRlObservation, State]):
65
  done=payload.get("done", False),
66
  )
67
 
68
- def _parse_state(self, payload: Dict) -> State:
69
- """Parse server response into State object."""
70
- return State(
 
 
 
 
 
71
  episode_id=payload.get("episode_id"),
72
  step_count=payload.get("step_count", 0),
 
 
 
 
 
73
  )
 
10
 
11
  from openenv.core import EnvClient
12
  from openenv.core.client_types import StepResult
 
13
 
14
+ from models import AwsRlAction, AwsRlObservation, EpisodeID, StepCount, AwsRlState
15
 
16
 
17
+ class AwsRlEnv(EnvClient[AwsRlAction, AwsRlObservation, AwsRlState]):
18
  """
19
  Client for the Aws Rl Env Environment.
20
 
 
64
  done=payload.get("done", False),
65
  )
66
 
67
+ def _parse_state(self, payload: Dict) -> AwsRlState:
68
+ """Parse server response into AwsRlState object."""
69
+ from models import TrackerState, Task
70
+
71
+ tracker_data = payload.get("tracker", {})
72
+ task_data = payload.get("current_task")
73
+
74
+ return AwsRlState(
75
  episode_id=payload.get("episode_id"),
76
  step_count=payload.get("step_count", 0),
77
+ current_task=Task(**task_data) if task_data else None,
78
+ tracker=TrackerState(**tracker_data) if tracker_data else TrackerState(),
79
+ infra_state=payload.get("infra_state", {}),
80
+ chaos_occurred=payload.get("chaos_occurred", False),
81
+ current_tier=payload.get("current_tier", "warmup"),
82
  )
inference-complete.py CHANGED
@@ -89,6 +89,8 @@ SYSTEM_PROMPT = textwrap.dedent(
89
  - Only send AWS CLI commands (e.g. 'aws s3 ls', 'aws dynamodb create-table ...')
90
  - One command per turn — no pipes, no shell syntax, no chaining
91
  - Reply with ONLY the command, nothing else — no explanations, no quotes
 
 
92
  """
93
  ).strip()
94
 
@@ -165,10 +167,7 @@ def get_model_command(
165
  # ---------------------------------------------------------------------------
166
 
167
 
168
- async def run_episode(
169
- env: AwsRlEnv,
170
- llm_client: OpenAI
171
- ) -> Optional[dict]:
172
  """Run a single episode: reset -> step loop -> return results."""
173
  result = await env.reset()
174
  obs = result.observation
@@ -182,9 +181,9 @@ async def run_episode(
182
  task_desc = task.description
183
  task_id = int(task.task_id)
184
 
185
- print(f"\n{'='*60}")
186
  print(f"Episode {episode_num} -- Task {task_id}: {task_desc} (tier: {tier})")
187
- print(f"\n{'='*60}")
188
 
189
  history: List[str] = []
190
  last_output = obs.command_output
@@ -206,7 +205,6 @@ async def run_episode(
206
  last_reward,
207
  history,
208
  )
209
-
210
 
211
  result = await env.step(AwsRlAction(command=command))
212
  obs = result.observation
@@ -214,21 +212,22 @@ async def run_episode(
214
  reward = result.reward or 0.0
215
  success = obs.command_success
216
  task_achieved = obs.task_achieved
217
- done = result.done
218
 
219
  rewards.append(reward)
220
 
221
  print()
222
- print(f"\n{'-'*60}")
223
  print(
224
- f" [Step {step}] cmd=\"{command}\" command_output={obs.command_output!r} "
225
  f"reward={reward:.2f} command_success={success} achieved={task_achieved}"
226
  )
227
- print(f"\n{'-'*60}")
228
  print()
229
 
230
  status = "OK" if success else "FAIL"
231
- history.append(f"Step {step} [{status}]: {command} [command_output]={obs.command_output!r} [error]={obs.error!r} -> reward={reward:.2f}")
 
 
232
  last_output = obs.command_output
233
  last_error = obs.error
234
  last_reward = reward
@@ -299,9 +298,9 @@ def print_summary(tier_results: dict[str, list]) -> None:
299
  total_passed = 0
300
  total_tasks = 0
301
 
302
- print(f"\n{'='*60}")
303
  print("FINAL RESULTS")
304
- print(f"{'='*60}")
305
 
306
  for tier in ALL_TIERS:
307
  results = tier_results.get(tier, [])
 
89
  - Only send AWS CLI commands (e.g. 'aws s3 ls', 'aws dynamodb create-table ...')
90
  - One command per turn — no pipes, no shell syntax, no chaining
91
  - Reply with ONLY the command, nothing else — no explanations, no quotes
92
+ - If unsure, use 'aws help' to get unstuck, but try to be specific to the service if possible (e.g. 'aws s3 help')
93
+ - When ever you need a hint, use 'aws help --task-hint' to get a task-specific hint (you can use this multiple times for more hints, but hints reduce your reward)
94
  """
95
  ).strip()
96
 
 
167
  # ---------------------------------------------------------------------------
168
 
169
 
170
+ async def run_episode(env: AwsRlEnv, llm_client: OpenAI) -> Optional[dict]:
 
 
 
171
  """Run a single episode: reset -> step loop -> return results."""
172
  result = await env.reset()
173
  obs = result.observation
 
181
  task_desc = task.description
182
  task_id = int(task.task_id)
183
 
184
+ print(f"\n{'=' * 60}")
185
  print(f"Episode {episode_num} -- Task {task_id}: {task_desc} (tier: {tier})")
186
+ print(f"\n{'=' * 60}")
187
 
188
  history: List[str] = []
189
  last_output = obs.command_output
 
205
  last_reward,
206
  history,
207
  )
 
208
 
209
  result = await env.step(AwsRlAction(command=command))
210
  obs = result.observation
 
212
  reward = result.reward or 0.0
213
  success = obs.command_success
214
  task_achieved = obs.task_achieved
 
215
 
216
  rewards.append(reward)
217
 
218
  print()
219
+ print(f"\n{'-' * 60}")
220
  print(
221
+ f' [Step {step}] cmd="{command}" command_output={obs.command_output!r} '
222
  f"reward={reward:.2f} command_success={success} achieved={task_achieved}"
223
  )
224
+ print(f"\n{'-' * 60}")
225
  print()
226
 
227
  status = "OK" if success else "FAIL"
228
+ history.append(
229
+ f"Step {step} [{status}]: {command} [command_output]={obs.command_output!r} [error]={obs.error!r} -> reward={reward:.2f}"
230
+ )
231
  last_output = obs.command_output
232
  last_error = obs.error
233
  last_reward = reward
 
298
  total_passed = 0
299
  total_tasks = 0
300
 
301
+ print(f"\n{'=' * 60}")
302
  print("FINAL RESULTS")
303
+ print(f"{'=' * 60}")
304
 
305
  for tier in ALL_TIERS:
306
  results = tier_results.get(tier, [])
inference.py CHANGED
@@ -54,6 +54,10 @@ load_dotenv() # Load variables from .env file if present
54
 
55
  API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
56
  MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
 
 
 
 
57
  HF_TOKEN = os.getenv("HF_TOKEN")
58
  API_KEY = os.getenv("API_KEY") # Optional if using HF_TOKEN
59
 
@@ -77,6 +81,8 @@ SYSTEM_PROMPT = textwrap.dedent(
77
  - Only send AWS CLI commands (e.g. 'aws s3 ls', 'aws dynamodb create-table ...')
78
  - One command per turn — no pipes, no shell syntax, no chaining
79
  - Reply with ONLY the command, nothing else — no explanations, no quotes
 
 
80
  """
81
  ).strip()
82
 
 
54
 
55
  API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
56
  MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
57
+ if not API_BASE_URL:
58
+ API_BASE_URL = "https://router.huggingface.co/v1"
59
+ if not MODEL_NAME:
60
+ MODEL_NAME = "Qwen/Qwen2.5-72B-Instruct"
61
  HF_TOKEN = os.getenv("HF_TOKEN")
62
  API_KEY = os.getenv("API_KEY") # Optional if using HF_TOKEN
63
 
 
81
  - Only send AWS CLI commands (e.g. 'aws s3 ls', 'aws dynamodb create-table ...')
82
  - One command per turn — no pipes, no shell syntax, no chaining
83
  - Reply with ONLY the command, nothing else — no explanations, no quotes
84
+ - If unsure, use 'aws help' to get unstuck, but try to be specific to the service if possible (e.g. 'aws s3 help')
85
+ - When ever you need a hint, use 'aws help --task-hint' to get a task-specific hint (you can use this multiple times for more hints, but hints reduce your reward)
86
  """
87
  ).strip()
88
 
models.py CHANGED
@@ -3,9 +3,9 @@ Data models for the Aws Rl Env Environment.
3
  """
4
 
5
  from enum import Enum
6
- from typing import NewType, Union
7
 
8
- from openenv.core.env_server.types import Action, Observation
9
  from pydantic import BaseModel, Field
10
 
11
  # ---------------------------------------------------------------------------
@@ -18,6 +18,7 @@ StepCount = NewType("StepCount", int)
18
 
19
 
20
  class AwsService(str, Enum):
 
21
  S3 = "s3"
22
  EC2 = "ec2"
23
  DYNAMODB = "dynamodb"
@@ -26,6 +27,31 @@ class AwsService(str, Enum):
26
  SNS = "sns"
27
  IAM = "iam"
28
  APIGATEWAY = "apigateway"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
 
31
  # ---------------------------------------------------------------------------
@@ -62,6 +88,12 @@ class TierConfig(BaseModel):
62
  le=1.0,
63
  description="Success rate for early promotion after 3 episodes",
64
  )
 
 
 
 
 
 
65
 
66
 
67
  class SpacedRepState(BaseModel):
@@ -169,6 +201,82 @@ class Task(BaseModel):
169
  default_factory=list,
170
  description="Commands to run during reset to set up initial state (e.g. for SRE tasks)",
171
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
 
173
 
174
  # ---------------------------------------------------------------------------
@@ -199,13 +307,21 @@ class AwsRlObservation(Observation):
199
  default="", description="Stdout from the executed AWS CLI command"
200
  )
201
  error: str = Field(default="", description="Stderr if the command failed")
202
- resources: dict[AwsService, Union[dict, list, str]] = Field(
203
- default_factory=dict,
204
- description="Current resource state from MiniStack, keyed by service name",
205
- )
206
- task: Task | None = Field(
207
- default=None, description="The task the agent is trying to accomplish"
208
  )
209
  task_achieved: bool = Field(
210
  default=False, description="Whether the task has been achieved"
211
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  """
4
 
5
  from enum import Enum
6
+ from typing import NewType
7
 
8
+ from openenv.core.env_server.types import Action, Observation, State
9
  from pydantic import BaseModel, Field
10
 
11
  # ---------------------------------------------------------------------------
 
18
 
19
 
20
  class AwsService(str, Enum):
21
+ # Core services
22
  S3 = "s3"
23
  EC2 = "ec2"
24
  DYNAMODB = "dynamodb"
 
27
  SNS = "sns"
28
  IAM = "iam"
29
  APIGATEWAY = "apigateway"
30
+ SECRETSMANAGER = "secretsmanager"
31
+ # Compute & containers
32
+ ECS = "ecs"
33
+ # Data & analytics
34
+ RDS = "rds"
35
+ ELASTICACHE = "elasticache"
36
+ ATHENA = "athena"
37
+ GLUE = "glue"
38
+ FIREHOSE = "firehose"
39
+ EMR = "emr"
40
+ # Networking & routing
41
+ APIGATEWAYV2 = "apigatewayv2"
42
+ ROUTE53 = "route53"
43
+ ELBV2 = "elbv2"
44
+ # Storage
45
+ EBS = "ebs"
46
+ EFS = "efs"
47
+ # Identity & config
48
+ COGNITO = "cognito-idp"
49
+ SSM = "ssm"
50
+ EVENTBRIDGE = "events"
51
+ # Monitoring
52
+ CLOUDWATCH = "cloudwatch"
53
+ # Infrastructure as code
54
+ CLOUDFORMATION = "cloudformation"
55
 
56
 
57
  # ---------------------------------------------------------------------------
 
88
  le=1.0,
89
  description="Success rate for early promotion after 3 episodes",
90
  )
91
+ chaos_probability: float = Field(
92
+ default=0.0,
93
+ ge=0.0,
94
+ le=1.0,
95
+ description="Probability of chaos injection per step",
96
+ )
97
 
98
 
99
  class SpacedRepState(BaseModel):
 
201
  default_factory=list,
202
  description="Commands to run during reset to set up initial state (e.g. for SRE tasks)",
203
  )
204
+ desired_state_spec: str | None = Field(
205
+ default=None,
206
+ description="Natural-language specification of the desired end state (shown to agent for drift tasks)",
207
+ )
208
+ possible_drifts: list[SetupCommand] = Field(
209
+ default_factory=list,
210
+ description="Pool of mutations the DriftEngine may randomly apply after setup",
211
+ )
212
+
213
+
214
+ class TaskInfo(BaseModel):
215
+ """Agent-visible subset of Task — masks success_criteria, setup_commands, and possible_drifts."""
216
+
217
+ task_id: TaskID = Field(..., ge=0, description="Unique task identifier")
218
+ difficulty: TaskDifficulty = Field(
219
+ default=TaskDifficulty.WARMUP, description="Task difficulty level"
220
+ )
221
+ description: str = Field(..., description="Human-readable task description")
222
+ desired_state_spec: str | None = Field(
223
+ default=None,
224
+ description="Natural-language specification of the desired end state (shown to agent for drift tasks)",
225
+ )
226
+
227
+ @classmethod
228
+ def from_task(cls, task: Task) -> "TaskInfo":
229
+ """Create a masked TaskInfo from a full Task."""
230
+ return cls(
231
+ task_id=task.task_id,
232
+ difficulty=task.difficulty,
233
+ description=task.description,
234
+ desired_state_spec=task.desired_state_spec,
235
+ )
236
+
237
+
238
+ # ---------------------------------------------------------------------------
239
+ # Environment State
240
+ # ---------------------------------------------------------------------------
241
+
242
+
243
+ class TrackerState(BaseModel):
244
+ """Serializable snapshot of the EpisodeTracker."""
245
+
246
+ step_count: int = Field(default=0, ge=0, description="Steps taken this episode")
247
+ hints_used: int = Field(default=0, ge=0, description="Hints requested this episode")
248
+ progress: float = Field(
249
+ default=0.0, ge=0.0, le=1.0, description="Current partial progress"
250
+ )
251
+ commands_executed: list[str] = Field(
252
+ default_factory=list, description="Commands executed this episode"
253
+ )
254
+ credited_operations: list[str] = Field(
255
+ default_factory=list,
256
+ description="(operation, resource) pairs that earned credit",
257
+ )
258
+
259
+
260
+ class AwsRlState(State):
261
+ """Full environment state including task, tracker, and infrastructure."""
262
+
263
+ current_task: Task | None = Field(
264
+ default=None, description="The task assigned for this episode"
265
+ )
266
+ tracker: TrackerState = Field(
267
+ default_factory=TrackerState,
268
+ description="Episode tracker snapshot",
269
+ )
270
+ infra_state: dict = Field(
271
+ default_factory=dict,
272
+ description="AWS infrastructure state keyed by service name",
273
+ )
274
+ chaos_occurred: bool = Field(
275
+ default=False, description="Whether chaos was injected this episode"
276
+ )
277
+ current_tier: str = Field(
278
+ default="warmup", description="Agent's current difficulty tier"
279
+ )
280
 
281
 
282
  # ---------------------------------------------------------------------------
 
307
  default="", description="Stdout from the executed AWS CLI command"
308
  )
309
  error: str = Field(default="", description="Stderr if the command failed")
310
+ task: TaskInfo | None = Field(
311
+ default=None, description="The task the agent is trying to accomplish (masked)"
 
 
 
 
312
  )
313
  task_achieved: bool = Field(
314
  default=False, description="Whether the task has been achieved"
315
  )
316
+ partial_progress: float = Field(
317
+ default=0.0,
318
+ ge=0.0,
319
+ le=1.0,
320
+ description="Current task progress (0.0 to 1.0)",
321
+ )
322
+ hints_used: int = Field(
323
+ default=0, ge=0, description="Number of hints requested this episode"
324
+ )
325
+ hint_text: str = Field(
326
+ default="", description="Text of the most recently requested hint"
327
+ )
pyproject.toml CHANGED
@@ -49,6 +49,11 @@ include-package-data = true
49
  packages = ["aws_rl_env", "aws_rl_env.server"]
50
  package-dir = { "aws_rl_env" = ".", "aws_rl_env.server" = "server" }
51
 
 
 
 
 
 
52
  [tool.ruff]
53
  exclude = ["aws_infra/"]
54
 
 
49
  packages = ["aws_rl_env", "aws_rl_env.server"]
50
  package-dir = { "aws_rl_env" = ".", "aws_rl_env.server" = "server" }
51
 
52
+ [tool.pytest.ini_options]
53
+ addopts = "--import-mode=importlib"
54
+ testpaths = ["tests"]
55
+ pythonpath = ["."]
56
+
57
  [tool.ruff]
58
  exclude = ["aws_infra/"]
59
 
server/app.py CHANGED
@@ -83,6 +83,29 @@ async def web_reset():
83
  }
84
 
85
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  @app.post("/web/step", include_in_schema=False)
87
  async def web_step(request: WebStepRequest = Body(...)):
88
  action = AwsRlAction(**request.action)
 
83
  }
84
 
85
 
86
+ @app.get("/web/solution", include_in_schema=False)
87
+ async def web_solution():
88
+ """Return the next solution command for the current task step."""
89
+ if not _env._current_task:
90
+ return {"command": None, "error": "No active task. Start a new episode first."}
91
+
92
+ from server.services.task_solutions import get_next_solution
93
+
94
+ result = get_next_solution(
95
+ task_id=_env._current_task.task_id,
96
+ backend=_env._backend,
97
+ tracker=_env._tracker,
98
+ )
99
+ result["task_id"] = _env._current_task.task_id
100
+ return result
101
+
102
+
103
+ @app.get("/web/state", include_in_schema=False)
104
+ async def web_state():
105
+ """Return the full AwsRlState for the web UI."""
106
+ return _env.state.model_dump()
107
+
108
+
109
  @app.post("/web/step", include_in_schema=False)
110
  async def web_step(request: WebStepRequest = Body(...)):
111
  action = AwsRlAction(**request.action)
server/aws_rl_env_environment.py CHANGED
@@ -18,31 +18,59 @@ from typing import Any, Optional
18
  from uuid import uuid4
19
 
20
  from openenv.core.env_server.interfaces import Environment
21
- from openenv.core.env_server.types import State
22
 
23
- from models import AwsRlAction, AwsRlObservation, EpisodeID, StepCount, Task
 
 
 
 
 
 
 
 
 
24
  from server.services.aws_backend import AwsBackend
 
25
  from server.services.curriculum import Curriculum
26
  from server.services.environment_designer import EnvironmentDesigner
27
  from server.services.episode_tracker import EpisodeTracker
 
28
  from server.services.task_grader import TaskGrader
29
 
30
  logger = logging.getLogger(__name__)
31
 
32
 
33
- class AwsRlEnvironment(Environment[AwsRlAction, AwsRlObservation, State]):
34
  SUPPORTS_CONCURRENT_SESSIONS: bool = True
35
 
36
  def __init__(self) -> None:
37
  print("Initializing AWS RL Environment...")
38
- self._state = State(episode_id=str(uuid4()), step_count=0)
39
  self._backend = AwsBackend()
40
  self._curriculum = Curriculum()
41
  self._grader = TaskGrader(self._backend)
42
  self._designer = EnvironmentDesigner(self._backend)
43
  self._tracker = EpisodeTracker()
 
 
44
  self._current_task: Task | None = None
45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  def reset(
47
  self,
48
  seed: Optional[int] = None,
@@ -50,33 +78,29 @@ class AwsRlEnvironment(Environment[AwsRlAction, AwsRlObservation, State]):
50
  **kwargs: Any,
51
  ) -> AwsRlObservation:
52
  self._backend.reset_environment()
53
- self._state = State(episode_id=episode_id or str(uuid4()), step_count=0)
54
  self._tracker.reset()
 
55
  self._current_task = self._curriculum.next_task()
56
 
57
  self._designer.apply(self._current_task)
 
58
 
59
  return AwsRlObservation(
60
  episode_id=EpisodeID(self._state.episode_id or ""),
61
  step_count=StepCount(self._state.step_count),
62
  command_success=True,
63
  command_output="Environment reset. Infra state wiped.",
64
- task=self._current_task,
65
  done=False,
66
  reward=0.0,
67
  )
68
 
69
- def step(
70
- self,
71
- action: AwsRlAction,
72
- timeout_s: Optional[float] = None,
73
- **kwargs: Any,
74
- ) -> AwsRlObservation:
75
- assert self._current_task is not None, "Call reset() before step()"
76
- self._state.step_count += 1
77
 
78
- # Anti-hack: only allow AWS CLI commands
79
- command = action.command.strip()
80
  if not command.startswith("aws "):
81
  return AwsRlObservation(
82
  episode_id=EpisodeID(self._state.episode_id or ""),
@@ -84,22 +108,86 @@ class AwsRlEnvironment(Environment[AwsRlAction, AwsRlObservation, State]):
84
  command_success=False,
85
  command_output="",
86
  error="Only AWS CLI commands (starting with 'aws') are allowed.",
87
- task=self._current_task,
 
 
88
  task_achieved=False,
89
  done=False,
90
  reward=0.0,
91
  )
92
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  success, stdout, stderr = self._backend.execute_command(command)
94
 
95
  # Record in tracker
96
  latest_step = self._tracker.record_step(command, success, stdout, stderr)
97
 
98
- # Grade the task
99
- task_achieved = False
100
-
101
  grade_result = self._grader.grade(
102
- self._current_task, self._tracker, latest_step
 
 
 
 
103
  )
104
  task_achieved = grade_result.task_achieved
105
  reward = grade_result.reward
@@ -109,18 +197,29 @@ class AwsRlEnvironment(Environment[AwsRlAction, AwsRlObservation, State]):
109
  self._current_task, achieved=True, reward=reward
110
  )
111
 
 
 
 
 
 
 
 
 
 
112
  return AwsRlObservation(
113
  episode_id=EpisodeID(self._state.episode_id or ""),
114
  step_count=StepCount(self._state.step_count),
115
  command_success=success,
116
  command_output=stdout,
117
  error=stderr,
118
- task=self._current_task,
119
  task_achieved=task_achieved,
 
120
  done=task_achieved,
121
  reward=reward,
 
122
  )
123
 
124
  @property
125
- def state(self) -> State:
126
  return self._state
 
18
  from uuid import uuid4
19
 
20
  from openenv.core.env_server.interfaces import Environment
 
21
 
22
+ from models import (
23
+ AwsRlAction,
24
+ AwsRlObservation,
25
+ AwsRlState,
26
+ EpisodeID,
27
+ StepCount,
28
+ Task,
29
+ TaskInfo,
30
+ TrackerState,
31
+ )
32
  from server.services.aws_backend import AwsBackend
33
+ from server.services.chaos_engine import ChaosEngine
34
  from server.services.curriculum import Curriculum
35
  from server.services.environment_designer import EnvironmentDesigner
36
  from server.services.episode_tracker import EpisodeTracker
37
+ from server.services.hint_provider import HintProvider, MAX_HINT_LEVEL
38
  from server.services.task_grader import TaskGrader
39
 
40
  logger = logging.getLogger(__name__)
41
 
42
 
43
+ class AwsRlEnvironment(Environment[AwsRlAction, AwsRlObservation, AwsRlState]):
44
  SUPPORTS_CONCURRENT_SESSIONS: bool = True
45
 
46
  def __init__(self) -> None:
47
  print("Initializing AWS RL Environment...")
48
+ self._state = AwsRlState(episode_id=str(uuid4()), step_count=0)
49
  self._backend = AwsBackend()
50
  self._curriculum = Curriculum()
51
  self._grader = TaskGrader(self._backend)
52
  self._designer = EnvironmentDesigner(self._backend)
53
  self._tracker = EpisodeTracker()
54
+ self._chaos_engine = ChaosEngine(self._backend)
55
+ self._hint_provider = HintProvider()
56
  self._current_task: Task | None = None
57
 
58
+ def _sync_state(self) -> None:
59
+ """Sync internal state to the AwsRlState object."""
60
+ self._state.current_task = self._current_task
61
+ self._state.tracker = TrackerState(
62
+ step_count=self._tracker.step_count,
63
+ hints_used=self._tracker.hints_used,
64
+ progress=self._tracker.previous_progress,
65
+ commands_executed=[s.command for s in self._tracker.command_history],
66
+ credited_operations=[
67
+ f"{op}:{res}" for op, res in self._tracker._credited_operations
68
+ ],
69
+ )
70
+ self._state.chaos_occurred = self._chaos_engine.chaos_occurred
71
+ self._state.current_tier = self._curriculum.current_difficulty.value
72
+ self._state.infra_state = self._backend.get_infra_state()
73
+
74
  def reset(
75
  self,
76
  seed: Optional[int] = None,
 
78
  **kwargs: Any,
79
  ) -> AwsRlObservation:
80
  self._backend.reset_environment()
81
+ self._state = AwsRlState(episode_id=episode_id or str(uuid4()), step_count=0)
82
  self._tracker.reset()
83
+ self._chaos_engine.reset()
84
  self._current_task = self._curriculum.next_task()
85
 
86
  self._designer.apply(self._current_task)
87
+ self._sync_state()
88
 
89
  return AwsRlObservation(
90
  episode_id=EpisodeID(self._state.episode_id or ""),
91
  step_count=StepCount(self._state.step_count),
92
  command_success=True,
93
  command_output="Environment reset. Infra state wiped.",
94
+ task=TaskInfo.from_task(self._current_task) if self._current_task else None,
95
  done=False,
96
  reward=0.0,
97
  )
98
 
99
+ def _intercept_command(self, command: str) -> AwsRlObservation | None:
100
+ """Handle anti-hack validation, hint requests, and help commands.
 
 
 
 
 
 
101
 
102
+ Returns an observation if the command was intercepted, None otherwise.
103
+ """
104
  if not command.startswith("aws "):
105
  return AwsRlObservation(
106
  episode_id=EpisodeID(self._state.episode_id or ""),
 
108
  command_success=False,
109
  command_output="",
110
  error="Only AWS CLI commands (starting with 'aws') are allowed.",
111
+ task=TaskInfo.from_task(self._current_task)
112
+ if self._current_task
113
+ else None,
114
  task_achieved=False,
115
  done=False,
116
  reward=0.0,
117
  )
118
 
119
+ if command == "aws help --task-hint":
120
+ hint_level = self._tracker.record_hint()
121
+ clamped_level = min(hint_level, MAX_HINT_LEVEL)
122
+ assert self._current_task is not None
123
+ hint_text = self._hint_provider.get_hint(self._current_task, clamped_level)
124
+ return AwsRlObservation(
125
+ episode_id=EpisodeID(self._state.episode_id or ""),
126
+ step_count=StepCount(self._state.step_count),
127
+ command_success=True,
128
+ command_output=hint_text,
129
+ task=TaskInfo.from_task(self._current_task)
130
+ if self._current_task
131
+ else None,
132
+ task_achieved=False,
133
+ done=False,
134
+ reward=0.0,
135
+ hints_used=self._tracker.hints_used,
136
+ hint_text=hint_text,
137
+ )
138
+
139
+ parts = command.split()
140
+ if len(parts) == 3 and parts[0] == "aws":
141
+ service_name = None
142
+ if parts[2] == "help":
143
+ service_name = parts[1]
144
+ elif parts[1] == "help":
145
+ service_name = parts[2]
146
+
147
+ if service_name is not None:
148
+ svc_success, help_text = self._backend.get_service_help(service_name)
149
+ return AwsRlObservation(
150
+ episode_id=EpisodeID(self._state.episode_id or ""),
151
+ step_count=StepCount(self._state.step_count),
152
+ command_success=svc_success,
153
+ command_output=help_text if svc_success else "",
154
+ error="" if svc_success else help_text,
155
+ task=TaskInfo.from_task(self._current_task)
156
+ if self._current_task
157
+ else None,
158
+ task_achieved=False,
159
+ done=False,
160
+ reward=0.0,
161
+ )
162
+
163
+ return None
164
+
165
+ def step(
166
+ self,
167
+ action: AwsRlAction,
168
+ timeout_s: Optional[float] = None,
169
+ **kwargs: Any,
170
+ ) -> AwsRlObservation:
171
+ assert self._current_task is not None, "Call reset() before step()"
172
+ self._state.step_count += 1
173
+
174
+ command = action.command.strip()
175
+ intercepted = self._intercept_command(command)
176
+ if intercepted is not None:
177
+ return intercepted
178
+
179
  success, stdout, stderr = self._backend.execute_command(command)
180
 
181
  # Record in tracker
182
  latest_step = self._tracker.record_step(command, success, stdout, stderr)
183
 
184
+ # Grade the task (pass cumulative chaos flag and hint count)
 
 
185
  grade_result = self._grader.grade(
186
+ self._current_task,
187
+ self._tracker,
188
+ latest_step,
189
+ chaos_occurred=self._chaos_engine.chaos_occurred,
190
+ hints_used=self._tracker.hints_used,
191
  )
192
  task_achieved = grade_result.task_achieved
193
  reward = grade_result.reward
 
197
  self._current_task, achieved=True, reward=reward
198
  )
199
 
200
+ # Inject chaos AFTER grading — disrupts state for future steps
201
+ self._chaos_engine.maybe_inject(
202
+ self._current_task,
203
+ self._tracker,
204
+ self._curriculum.chaos_probability,
205
+ )
206
+
207
+ self._sync_state()
208
+
209
  return AwsRlObservation(
210
  episode_id=EpisodeID(self._state.episode_id or ""),
211
  step_count=StepCount(self._state.step_count),
212
  command_success=success,
213
  command_output=stdout,
214
  error=stderr,
215
+ task=TaskInfo.from_task(self._current_task) if self._current_task else None,
216
  task_achieved=task_achieved,
217
+ partial_progress=self._tracker.previous_progress,
218
  done=task_achieved,
219
  reward=reward,
220
+ hints_used=self._tracker.hints_used,
221
  )
222
 
223
  @property
224
+ def state(self) -> AwsRlState:
225
  return self._state
server/services/aws_backend.py CHANGED
@@ -2,6 +2,7 @@
2
 
3
  import logging
4
  import os
 
5
  import subprocess
6
 
7
  import httpx
@@ -27,6 +28,61 @@ class AwsBackend:
27
  logger.warning("Failed to reset MiniStack state: %s", e)
28
  raise
29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  def execute_command(self, command: str) -> tuple[bool, str, str]:
31
  """Execute an AWS CLI command against MiniStack.
32
 
@@ -46,7 +102,7 @@ class AwsBackend:
46
 
47
  try:
48
  result = subprocess.run(
49
- command.split(),
50
  capture_output=True,
51
  text=True,
52
  timeout=30,
 
2
 
3
  import logging
4
  import os
5
+ import shlex
6
  import subprocess
7
 
8
  import httpx
 
28
  logger.warning("Failed to reset MiniStack state: %s", e)
29
  raise
30
 
31
+ def get_infra_state(self) -> dict:
32
+ """Fetch current infrastructure state from MiniStack via GET /_ministack/state."""
33
+ try:
34
+ resp = httpx.get(f"{self._aws_infra_url}/_ministack/state", timeout=10)
35
+ resp.raise_for_status()
36
+ return resp.json()
37
+ except httpx.HTTPError as e:
38
+ logger.warning("Failed to fetch MiniStack state: %s", e)
39
+ return {}
40
+
41
+ def get_service_help(self, service_name: str) -> tuple[bool, str]:
42
+ """Fetch service info from MiniStack via GET /_ministack/handlers/<service>.
43
+
44
+ Returns:
45
+ Tuple of (success, formatted_help_text)
46
+ """
47
+ try:
48
+ resp = httpx.get(
49
+ f"{self._aws_infra_url}/_ministack/handlers/{service_name}",
50
+ timeout=10,
51
+ )
52
+ resp.raise_for_status()
53
+ data = resp.json()
54
+ lines = [
55
+ f"SERVICE: {data['service']}",
56
+ "",
57
+ "DESCRIPTION",
58
+ data.get("description", "No description available."),
59
+ "",
60
+ f"AVAILABLE ACTIONS ({data['action_count']}):",
61
+ "",
62
+ ]
63
+ for action in data.get("supported_actions", []):
64
+ lines.append(f" - {action}")
65
+ state = data.get("state", {})
66
+ if state:
67
+ lines.append("")
68
+ lines.append("CURRENT STATE:")
69
+ for resource, info in state.items():
70
+ count = info.get("count", 0)
71
+ names = info.get("names", info.get("ids", info.get("arns", [])))
72
+ lines.append(f" {resource}: {count}")
73
+ if names:
74
+ for n in names[:20]:
75
+ lines.append(f" - {n}")
76
+ if len(names) > 20:
77
+ lines.append(f" ... and {len(names) - 20} more")
78
+ return True, "\n".join(lines)
79
+ except httpx.HTTPStatusError as e:
80
+ if e.response.status_code == 404:
81
+ return False, f"Unknown service: {service_name}"
82
+ return False, f"Failed to fetch service help: {e}"
83
+ except httpx.HTTPError as e:
84
+ return False, f"Failed to fetch service help: {e}"
85
+
86
  def execute_command(self, command: str) -> tuple[bool, str, str]:
87
  """Execute an AWS CLI command against MiniStack.
88
 
 
102
 
103
  try:
104
  result = subprocess.run(
105
+ shlex.split(command),
106
  capture_output=True,
107
  text=True,
108
  timeout=30,
server/services/chaos_engine.py ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Chaos Injection Engine.
3
+
4
+ Silently mutates AWS state mid-episode to test agent resilience and
5
+ situational awareness. Perturbations are scoped to services the current
6
+ task uses and are selected from a per-service catalog of destructive
7
+ AWS CLI commands.
8
+ """
9
+
10
+ import logging
11
+ import os
12
+ import random
13
+ import re
14
+
15
+ from models import AwsService, Task
16
+ from server.services.aws_backend import AwsBackend
17
+ from server.services.episode_tracker import EpisodeTracker
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+ # ---------------------------------------------------------------------------
22
+ # Resource-name extraction patterns (from successful AWS CLI commands)
23
+ # ---------------------------------------------------------------------------
24
+
25
+ _RESOURCE_PATTERNS: dict[AwsService, list[re.Pattern[str]]] = {
26
+ AwsService.S3: [
27
+ re.compile(r"aws\s+s3\s+mb\s+s3://([^\s]+)"),
28
+ re.compile(r"aws\s+s3api\s+create-bucket\s+--bucket\s+([^\s]+)"),
29
+ ],
30
+ AwsService.DYNAMODB: [
31
+ re.compile(r"aws\s+dynamodb\s+create-table\s+.*--table-name\s+([^\s]+)"),
32
+ ],
33
+ AwsService.LAMBDA: [
34
+ re.compile(r"aws\s+lambda\s+create-function\s+.*--function-name\s+([^\s]+)"),
35
+ ],
36
+ AwsService.SQS: [
37
+ re.compile(r"aws\s+sqs\s+create-queue\s+.*--queue-name\s+([^\s]+)"),
38
+ ],
39
+ AwsService.IAM: [
40
+ re.compile(
41
+ r"aws\s+iam\s+attach-role-policy\s+.*--role-name\s+([^\s]+)"
42
+ r"\s+.*--policy-arn\s+([^\s]+)"
43
+ ),
44
+ re.compile(
45
+ r"aws\s+iam\s+attach-role-policy\s+.*--policy-arn\s+([^\s]+)"
46
+ r"\s+.*--role-name\s+([^\s]+)"
47
+ ),
48
+ ],
49
+ }
50
+
51
+ # ---------------------------------------------------------------------------
52
+ # Perturbation templates per service
53
+ # ---------------------------------------------------------------------------
54
+
55
+ _PERTURBATION_TEMPLATES: dict[AwsService, list[str]] = {
56
+ AwsService.S3: [
57
+ "aws s3 rb s3://{name} --force",
58
+ ],
59
+ AwsService.DYNAMODB: [
60
+ "aws dynamodb delete-table --table-name {name}",
61
+ ],
62
+ AwsService.LAMBDA: [
63
+ "aws lambda delete-function --function-name {name}",
64
+ ],
65
+ AwsService.SQS: [
66
+ "aws sqs delete-queue --queue-url {name}",
67
+ ],
68
+ AwsService.IAM: [
69
+ "aws iam detach-role-policy --role-name {name} --policy-arn {arn}",
70
+ ],
71
+ }
72
+
73
+
74
+ class ChaosEngine:
75
+ """Silently mutates AWS state mid-episode to test agent resilience."""
76
+
77
+ def __init__(self, backend: AwsBackend) -> None:
78
+ self._backend = backend
79
+ self._enabled = os.environ.get("ENABLE_CHAOS", "true").lower() == "true"
80
+ self._chaos_occurred = False
81
+
82
+ def reset(self) -> None:
83
+ """Reset per-episode chaos state."""
84
+ self._chaos_occurred = False
85
+
86
+ @property
87
+ def chaos_occurred(self) -> bool:
88
+ """Whether chaos was injected at any point during this episode."""
89
+ return self._chaos_occurred
90
+
91
+ def maybe_inject(
92
+ self,
93
+ task: Task,
94
+ tracker: EpisodeTracker,
95
+ probability: float,
96
+ ) -> bool:
97
+ """Roll dice and, if triggered, execute a task-relevant perturbation.
98
+
99
+ Returns True if a perturbation was actually executed.
100
+ """
101
+ if not self._enabled or probability <= 0.0:
102
+ return False
103
+
104
+ if random.random() >= probability:
105
+ return False
106
+
107
+ perturbation = self._select_perturbation(task, tracker)
108
+ if perturbation is None:
109
+ return False
110
+
111
+ logger.info("Chaos injection: %s", perturbation)
112
+ self._backend.execute_command(perturbation)
113
+ self._chaos_occurred = True
114
+ return True
115
+
116
+ # -- Private helpers ------------------------------------------------------
117
+
118
+ def _select_perturbation(
119
+ self,
120
+ task: Task,
121
+ tracker: EpisodeTracker,
122
+ ) -> str | None:
123
+ """Pick a concrete perturbation command scoped to services the task uses."""
124
+ task_services = set(task.success_criteria.services)
125
+ if not task_services:
126
+ return None
127
+
128
+ # Collect all candidate (service, rendered_command) pairs
129
+ candidates: list[str] = []
130
+
131
+ for step in tracker.command_history:
132
+ if not step.success:
133
+ continue
134
+ for service in task_services:
135
+ for pattern in _RESOURCE_PATTERNS.get(service, []):
136
+ match = pattern.search(step.command)
137
+ if not match:
138
+ continue
139
+ templates = _PERTURBATION_TEMPLATES.get(service, [])
140
+ for template in templates:
141
+ rendered = self._render_template(template, match, service)
142
+ if rendered:
143
+ candidates.append(rendered)
144
+
145
+ if not candidates:
146
+ return None
147
+
148
+ return random.choice(candidates)
149
+
150
+ @staticmethod
151
+ def _render_template(
152
+ template: str,
153
+ match: re.Match[str],
154
+ service: AwsService,
155
+ ) -> str | None:
156
+ """Fill a perturbation template from regex match groups."""
157
+ groups = match.groups()
158
+ if not groups:
159
+ return None
160
+
161
+ if service == AwsService.IAM and len(groups) >= 2:
162
+ # IAM patterns capture (role_name, policy_arn) or vice-versa
163
+ # The first pattern has role first, second has arn first
164
+ if "role-name" in template and "policy-arn" in template:
165
+ return template.format(name=groups[0], arn=groups[1])
166
+ return None
167
+
168
+ return template.format(name=groups[0])
server/services/curriculum.py CHANGED
@@ -17,6 +17,7 @@ import logging
17
  import random
18
  from collections import defaultdict
19
  from pathlib import Path
 
20
 
21
  import yaml
22
 
@@ -59,6 +60,7 @@ TIER_CONFIGS: dict[TaskDifficulty, TierConfig] = {
59
  mastery_window=10,
60
  mastery_threshold=0.7,
61
  fast_track_rate=0.9,
 
62
  ),
63
  TaskDifficulty.ADVANCED: TierConfig(
64
  min_episodes=10,
@@ -66,6 +68,7 @@ TIER_CONFIGS: dict[TaskDifficulty, TierConfig] = {
66
  mastery_window=10,
67
  mastery_threshold=0.7,
68
  fast_track_rate=0.9,
 
69
  ),
70
  TaskDifficulty.EXPERT: TierConfig(
71
  min_episodes=0,
@@ -73,6 +76,7 @@ TIER_CONFIGS: dict[TaskDifficulty, TierConfig] = {
73
  mastery_window=10,
74
  mastery_threshold=0.7,
75
  fast_track_rate=1.0,
 
76
  ),
77
  }
78
 
@@ -85,6 +89,11 @@ _TIER_FILES: dict[TaskDifficulty, str] = {
85
  TaskDifficulty.EXPERT: "expert.yaml",
86
  }
87
 
 
 
 
 
 
88
  # ---------------------------------------------------------------------------
89
  # Priority score tuning constants
90
  # ---------------------------------------------------------------------------
@@ -109,8 +118,34 @@ _FAST_TRACK_MIN_EPISODES = 3
109
  # ---------------------------------------------------------------------------
110
 
111
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
  def load_tier(difficulty: TaskDifficulty, tasks_dir: Path = TASKS_DIR) -> list[Task]:
113
- """Load tasks for a single difficulty tier from its YAML file."""
114
  filename = _TIER_FILES.get(difficulty)
115
  if filename is None:
116
  logger.warning("No file mapping for difficulty: %s", difficulty.value)
@@ -124,24 +159,25 @@ def load_tier(difficulty: TaskDifficulty, tasks_dir: Path = TASKS_DIR) -> list[T
124
  with open(filepath) as f:
125
  entries = yaml.safe_load(f) or []
126
 
127
- tasks = [
128
- Task(
129
- task_id=TaskID(entry["task_id"]),
130
- difficulty=difficulty,
131
- description=entry["description"],
132
- success_criteria=SuccessCriteria(**entry.get("success_criteria", {})),
133
- setup_commands=[
134
- SetupCommand(command=cmd)
135
- if isinstance(cmd, str)
136
- else SetupCommand(**cmd)
137
- for cmd in entry.get("setup_commands", [])
138
- ],
 
 
 
 
139
  )
140
- for entry in entries
141
- ]
142
- logger.info(
143
- "Loaded %d %s tasks from %s", len(tasks), difficulty.value, filepath.name
144
- )
145
  return tasks
146
 
147
 
@@ -237,6 +273,10 @@ class Curriculum:
237
  def is_warmup(self) -> bool:
238
  return self.current_difficulty == TaskDifficulty.WARMUP
239
 
 
 
 
 
240
  # -- Public API -----------------------------------------------------------
241
 
242
  def next_task(self) -> Task:
 
17
  import random
18
  from collections import defaultdict
19
  from pathlib import Path
20
+ from typing import Any
21
 
22
  import yaml
23
 
 
60
  mastery_window=10,
61
  mastery_threshold=0.7,
62
  fast_track_rate=0.9,
63
+ chaos_probability=0.1,
64
  ),
65
  TaskDifficulty.ADVANCED: TierConfig(
66
  min_episodes=10,
 
68
  mastery_window=10,
69
  mastery_threshold=0.7,
70
  fast_track_rate=0.9,
71
+ chaos_probability=0.2,
72
  ),
73
  TaskDifficulty.EXPERT: TierConfig(
74
  min_episodes=0,
 
76
  mastery_window=10,
77
  mastery_threshold=0.7,
78
  fast_track_rate=1.0,
79
+ chaos_probability=0.3,
80
  ),
81
  }
82
 
 
89
  TaskDifficulty.EXPERT: "expert.yaml",
90
  }
91
 
92
# Supplementary task files merged into an existing tier.
# Each entry maps a difficulty to extra YAML files (relative to the tasks
# directory) whose tasks are appended when that tier is loaded.
_SUPPLEMENTARY_FILES: dict[TaskDifficulty, list[str]] = {
    TaskDifficulty.EXPERT: ["drift.yaml"],
}
96
+
97
  # ---------------------------------------------------------------------------
98
  # Priority score tuning constants
99
  # ---------------------------------------------------------------------------
 
118
  # ---------------------------------------------------------------------------
119
 
120
 
121
def _parse_task_entries(
    entries: list[dict[str, Any]], difficulty: TaskDifficulty
) -> list[Task]:
    """Convert raw YAML entries into Task models."""

    def as_setup(raw: Any) -> SetupCommand:
        # YAML entries may be either a bare command string or a mapping
        # of SetupCommand fields.
        if isinstance(raw, str):
            return SetupCommand(command=raw)
        return SetupCommand(**raw)

    tasks: list[Task] = []
    for entry in entries:
        tasks.append(
            Task(
                task_id=TaskID(entry["task_id"]),
                difficulty=difficulty,
                description=entry["description"],
                success_criteria=SuccessCriteria(**entry.get("success_criteria", {})),
                setup_commands=[as_setup(c) for c in entry.get("setup_commands", [])],
                desired_state_spec=entry.get("desired_state_spec"),
                possible_drifts=[as_setup(d) for d in entry.get("possible_drifts", [])],
            )
        )
    return tasks
145
+
146
+
147
  def load_tier(difficulty: TaskDifficulty, tasks_dir: Path = TASKS_DIR) -> list[Task]:
148
+ """Load tasks for a single difficulty tier from its YAML file(s)."""
149
  filename = _TIER_FILES.get(difficulty)
150
  if filename is None:
151
  logger.warning("No file mapping for difficulty: %s", difficulty.value)
 
159
  with open(filepath) as f:
160
  entries = yaml.safe_load(f) or []
161
 
162
+ tasks = _parse_task_entries(entries, difficulty)
163
+
164
+ # Load supplementary task files for this tier
165
+ for extra_file in _SUPPLEMENTARY_FILES.get(difficulty, []):
166
+ extra_path = tasks_dir / extra_file
167
+ if not extra_path.exists():
168
+ continue
169
+ with open(extra_path) as f:
170
+ extra_entries = yaml.safe_load(f) or []
171
+ extra_tasks = _parse_task_entries(extra_entries, difficulty)
172
+ tasks.extend(extra_tasks)
173
+ logger.info(
174
+ "Loaded %d supplementary %s tasks from %s",
175
+ len(extra_tasks),
176
+ difficulty.value,
177
+ extra_file,
178
  )
179
+
180
+ logger.info("Loaded %d %s tasks total", len(tasks), difficulty.value)
 
 
 
181
  return tasks
182
 
183
 
 
273
  def is_warmup(self) -> bool:
274
  return self.current_difficulty == TaskDifficulty.WARMUP
275
 
276
    @property
    def chaos_probability(self) -> float:
        """Chaos-injection probability configured for the current tier."""
        return self.tier_config.chaos_probability
279
+
280
  # -- Public API -----------------------------------------------------------
281
 
282
  def next_task(self) -> Task:
server/services/drift_engine.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Configuration Drift Engine.
3
+
4
+ Randomly applies a subset of a task's possible mutations after the correct
5
+ state has been provisioned. This forces the agent to audit and discover
6
+ which resources drifted rather than memorising a fixed solution path.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import logging
12
+ import random
13
+
14
+ from models import Task
15
+ from server.services.aws_backend import AwsBackend
16
+
17
logger = logging.getLogger(__name__)

# Default range for how many drifts to apply (inclusive).
_MIN_DRIFTS = 2
_MAX_DRIFTS = 3


class DriftEngine:
    """Selects and applies random configuration drifts for a task."""

    def __init__(self, backend: AwsBackend) -> None:
        self._backend = backend

    def apply_drift(self, task: Task) -> list[str]:
        """Randomly select and execute K of N possible drifts.

        Args:
            task: A task whose ``possible_drifts`` list defines the
                candidate mutations.

        Returns:
            Human-readable descriptions of the drifts that were applied
            (empty list if none).
        """
        candidates = task.possible_drifts
        if not candidates:
            return []

        chosen = random.sample(candidates, self._pick_count(len(candidates)))

        descriptions: list[str] = []
        for mutation in chosen:
            ok, _out, err = self._backend.execute_command(mutation.command)
            label = mutation.description or mutation.command
            if not ok:
                # Best-effort: a failed mutation is logged, not raised.
                logger.warning("Drift command failed: %s — %s", mutation.command, err)
                continue
            logger.info("Drift applied: %s", label)
            descriptions.append(label)

        return descriptions

    @staticmethod
    def _pick_count(pool_size: int) -> int:
        """Determine how many drifts to apply given the pool size."""
        if pool_size <= 1:
            return pool_size
        return random.randint(min(_MIN_DRIFTS, pool_size), min(_MAX_DRIFTS, pool_size))
server/services/environment_designer.py CHANGED
@@ -14,6 +14,7 @@ from pydantic import BaseModel, Field
14
 
15
  from models import SetupCommand, Task
16
  from server.services.aws_backend import AwsBackend
 
17
 
18
  logger = logging.getLogger(__name__)
19
 
@@ -47,6 +48,7 @@ class EnvironmentDesigner:
47
 
48
  def __init__(self, backend: AwsBackend) -> None:
49
  self._backend = backend
 
50
 
51
  def apply(self, task: Task) -> ProvisionResult:
52
  """Apply the task's environment setup to MiniStack.
@@ -61,7 +63,14 @@ class EnvironmentDesigner:
61
  if not task.setup_commands:
62
  return ProvisionResult(resources_created=0)
63
 
64
- return self._apply_cli_commands(task.setup_commands)
 
 
 
 
 
 
 
65
 
66
  # -- Provisioning strategies ----------------------------------------------
67
 
 
14
 
15
  from models import SetupCommand, Task
16
  from server.services.aws_backend import AwsBackend
17
+ from server.services.drift_engine import DriftEngine
18
 
19
  logger = logging.getLogger(__name__)
20
 
 
48
 
49
    def __init__(self, backend: AwsBackend) -> None:
        self._backend = backend
        # Drift engine shares the same backend, so mutations target the
        # same stack the setup commands provisioned.
        self._drift_engine = DriftEngine(backend)
52
 
53
  def apply(self, task: Task) -> ProvisionResult:
54
  """Apply the task's environment setup to MiniStack.
 
63
  if not task.setup_commands:
64
  return ProvisionResult(resources_created=0)
65
 
66
+ result = self._apply_cli_commands(task.setup_commands)
67
+
68
+ # Apply random configuration drifts after provisioning correct state
69
+ if task.possible_drifts:
70
+ applied = self._drift_engine.apply_drift(task)
71
+ logger.info("Applied %d configuration drifts", len(applied))
72
+
73
+ return result
74
 
75
  # -- Provisioning strategies ----------------------------------------------
76
 
server/services/episode_tracker.py CHANGED
@@ -63,6 +63,44 @@ def _command_mentions_resource(command: str, resource: str) -> bool:
63
  return False
64
 
65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  class EpisodeTracker:
67
  """Tracks command history within a single episode for grading."""
68
 
@@ -72,12 +110,14 @@ class EpisodeTracker:
72
  self._previous_progress: float = 0.0
73
  # Track which (operation, resource) pairs have been credited
74
  self._credited_operations: set[tuple[str, str | None]] = set()
 
75
 
76
  def reset(self) -> None:
77
  self._history.clear()
78
  self._step_counter = 0
79
  self._previous_progress = 0.0
80
  self._credited_operations.clear()
 
81
 
82
  def record_step(
83
  self, command: str, success: bool, stdout: str, stderr: str
@@ -136,6 +176,15 @@ class EpisodeTracker:
136
  def step_count(self) -> int:
137
  return self._step_counter
138
 
 
 
 
 
 
 
 
 
 
139
  @property
140
  def previous_progress(self) -> float:
141
  return self._previous_progress
@@ -143,3 +192,50 @@ class EpisodeTracker:
143
  @previous_progress.setter
144
  def previous_progress(self, value: float) -> None:
145
  self._previous_progress = value
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  return False
64
 
65
 
66
# Maps create operations to their corresponding delete operations.
# Used by EpisodeTracker.detect_rollbacks to spot a resource that was
# created and later torn down within the same episode.
_CREATE_DELETE_PAIRS: dict[str, str] = {
    "create-bucket": "delete-bucket",
    "create-table": "delete-table",
    "create-function": "delete-function",
    "create-queue": "delete-queue",
    "create-topic": "delete-topic",
    "create-role": "delete-role",
    "create-rest-api": "delete-rest-api",
    "create-secret": "delete-secret",
    "put-bucket-policy": "delete-bucket-policy",
    "attach-role-policy": "detach-role-policy",
}

# stderr substrings indicating a create failed only because the resource
# already exists (consumed by EpisodeTracker.detect_idempotent_retries).
_ALREADY_EXISTS_PATTERNS: list[str] = [
    "already exists",
    "BucketAlreadyExists",
    "BucketAlreadyOwnedByYou",
    "ResourceInUseException",
    "ResourceConflictException",
    "EntityAlreadyExists",
    "QueueNameExists",
    "TopicAlreadyExists",
]
90
+
91
+
92
def _extract_resource_name(command: str) -> str | None:
    """Extract the primary resource name from an AWS CLI command.

    Handles both the ``--flag value`` and ``--flag=value`` spellings.

    Returns:
        The resource name, or None when no known resource flag is present
        (including the empty-command case).
    """
    parts = command.strip().split()
    for i, part in enumerate(parts):
        # "--flag value" form: the name is the following token.
        if part in _RESOURCE_FLAGS and i + 1 < len(parts):
            return parts[i + 1]
        # "--flag=value" form: the name is embedded in the same token.
        # Bug fix: this check previously ran *after* the loop, so it only
        # inspected the final token and raised NameError on empty input.
        for flag in _RESOURCE_FLAGS:
            if part.startswith(f"{flag}="):
                return part.split("=", 1)[1]
    return None
102
+
103
+
104
  class EpisodeTracker:
105
  """Tracks command history within a single episode for grading."""
106
 
 
110
  self._previous_progress: float = 0.0
111
  # Track which (operation, resource) pairs have been credited
112
  self._credited_operations: set[tuple[str, str | None]] = set()
113
+ self._hints_used: int = 0
114
 
115
  def reset(self) -> None:
116
  self._history.clear()
117
  self._step_counter = 0
118
  self._previous_progress = 0.0
119
  self._credited_operations.clear()
120
+ self._hints_used = 0
121
 
122
  def record_step(
123
  self, command: str, success: bool, stdout: str, stderr: str
 
176
  def step_count(self) -> int:
177
  return self._step_counter
178
 
179
+ def record_hint(self) -> int:
180
+ """Record that a hint was used. Returns the new hint level (1-indexed)."""
181
+ self._hints_used += 1
182
+ return self._hints_used
183
+
184
    @property
    def hints_used(self) -> int:
        """Number of hints recorded so far this episode (see record_hint)."""
        return self._hints_used
187
+
188
    @property
    def previous_progress(self) -> float:
        """Most recently stored progress value for this episode (0.0 after reset)."""
        return self._previous_progress
 
192
    @previous_progress.setter
    def previous_progress(self, value: float) -> None:
        # Stored as-is; reset() returns it to 0.0 at episode start.
        self._previous_progress = value
195
+
196
+ def detect_rollbacks(self) -> int:
197
+ """Count create→delete pairs on the same resource (wasteful rollbacks)."""
198
+ # Build a set of (operation, resource) for successful create commands
199
+ creates: list[tuple[str, str]] = []
200
+ for record in self._history:
201
+ if not record.success:
202
+ continue
203
+ _, op = _parse_aws_command(record.command)
204
+ if op is None or op not in _CREATE_DELETE_PAIRS:
205
+ continue
206
+ resource = _extract_resource_name(record.command)
207
+ if resource is not None:
208
+ creates.append((op, resource))
209
+
210
+ rollback_count = 0
211
+ for create_op, resource in creates:
212
+ delete_op = _CREATE_DELETE_PAIRS[create_op]
213
+ for record in self._history:
214
+ if not record.success:
215
+ continue
216
+ _, op = _parse_aws_command(record.command)
217
+ if op == delete_op and _command_mentions_resource(
218
+ record.command, resource
219
+ ):
220
+ rollback_count += 1
221
+ break
222
+
223
+ return rollback_count
224
+
225
+ def detect_idempotent_retries(self) -> int:
226
+ """Count create failures with 'already exists' followed by a successful next step."""
227
+ count = 0
228
+ for i, record in enumerate(self._history):
229
+ if record.success:
230
+ continue
231
+ _, op = _parse_aws_command(record.command)
232
+ if op is None or not op.startswith("create"):
233
+ continue
234
+ # Check stderr for "already exists" patterns
235
+ if not any(pat in record.stderr for pat in _ALREADY_EXISTS_PATTERNS):
236
+ continue
237
+ # Next step must exist and be successful
238
+ if i + 1 < len(self._history) and self._history[i + 1].success:
239
+ count += 1
240
+
241
+ return count