Sizzing commited on
Commit
2073b3f
·
verified ·
1 Parent(s): b626a01

Upload folder using huggingface_hub

Browse files
Dockerfile ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Multi-stage build using openenv-base
# This Dockerfile is flexible and works for both:
# - In-repo environments (with local OpenEnv sources)
# - Standalone environments (with openenv from PyPI/Git)
# The build script (openenv build) handles context detection and sets appropriate build args.

ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
FROM ${BASE_IMAGE} AS builder

WORKDIR /app

# Ensure git is available (required for installing dependencies from VCS)
RUN apt-get update && \
    apt-get install -y --no-install-recommends git && \
    rm -rf /var/lib/apt/lists/*

# Build argument to control whether we're building standalone or in-repo
ARG BUILD_MODE=in-repo
ARG ENV_NAME=aws_rl_env

# Copy environment code (always at root of build context)
COPY . /app/env

# For in-repo builds, openenv is already vendored in the build context
# For standalone builds, openenv will be installed via pyproject.toml
WORKDIR /app/env

# Ensure uv is available (for local builds where base image lacks it)
RUN if ! command -v uv >/dev/null 2>&1; then \
        curl -LsSf https://astral.sh/uv/install.sh | sh && \
        mv /root/.local/bin/uv /usr/local/bin/uv && \
        mv /root/.local/bin/uvx /usr/local/bin/uvx; \
    fi

# Install dependencies using uv sync
# If uv.lock exists, use it; otherwise resolve on the fly
RUN --mount=type=cache,target=/root/.cache/uv \
    if [ -f uv.lock ]; then \
        uv sync --frozen --no-install-project --no-editable; \
    else \
        uv sync --no-install-project --no-editable; \
    fi

RUN --mount=type=cache,target=/root/.cache/uv \
    if [ -f uv.lock ]; then \
        uv sync --frozen --no-editable; \
    else \
        uv sync --no-editable; \
    fi

# Final runtime stage
FROM ${BASE_IMAGE}

WORKDIR /app

# Copy the uv-managed Python interpreter from builder
COPY --from=builder /root/.local/share/uv/python /root/.local/share/uv/python

# Copy the virtual environment from builder
# NOTE(review): the venv was created at /app/env/.venv but is relocated to
# /app/.venv — confirm venv scripts/shebangs tolerate the moved path.
COPY --from=builder /app/env/.venv /app/.venv

# Copy the environment code
COPY --from=builder /app/env /app/env

# Install AWS CLI
RUN apt-get update && \
    apt-get install -y --no-install-recommends awscli && \
    rm -rf /var/lib/apt/lists/*

# Configure AWS CLI to point to MiniStack (dummy "test" credentials — the
# emulator does not validate them)
RUN mkdir -p /root/.aws && \
    printf '[default]\nregion = us-east-1\noutput = json\n' > /root/.aws/config && \
    printf '[default]\naws_access_key_id = test\naws_secret_access_key = test\n' > /root/.aws/credentials
ENV AWS_ENDPOINT_URL=http://localhost:4566

# Enable the web interface for OpenEnv (if applicable)
# ENV ENABLE_WEB_INTERFACE=true

# Set PATH to use the virtual environment
ENV PATH="/app/.venv/bin:$PATH"

# Set PYTHONPATH so imports work correctly
ENV PYTHONPATH="/app/env:$PYTHONPATH"

# DEV_MODE=1 enables live reload via --reload flag
ENV DEV_MODE=0

# Entrypoint: start MiniStack in background, then run the FastAPI server.
# NOTE(review): the fixed "sleep 2" is a startup race — if MiniStack takes
# longer to bind port 4566, early AWS calls will fail; consider a readiness poll.
CMD ["sh", "-c", "ministack & sleep 2 && uvicorn server.app:app --host 0.0.0.0 --port 8000 $([ \"$DEV_MODE\" = '1' ] && echo '--reload --reload-dir /app/env')"]
Makefile ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Project settings — UPPER_SNAKE knobs, overridable from the command line
# (e.g. `make SERVER_PORT=9000 docker-run`).
PROJECT_NAME := openenv-aws_rl_env
PYTHON := python3
UV := uv
DOCKER_IMAGE := aws-rl-env
DOCKER_TAG := latest
SERVER_HOST := 0.0.0.0
SERVER_PORT := 8000

.DEFAULT_GOAL := help

# ──────────────────────────────────────────────
# Setup & Dependencies
# ──────────────────────────────────────────────

.PHONY: install
install: ## Install project dependencies
	$(UV) sync --frozen

.PHONY: install-dev
install-dev: ## Install project with dev dependencies
	$(UV) sync --frozen --all-extras

.PHONY: lock
lock: ## Update the lockfile
	$(UV) lock

# ──────────────────────────────────────────────
# Development
# ──────────────────────────────────────────────

.PHONY: run
run: ## Run with MiniStack + FastAPI server (mirrors Docker CMD)
	# MiniStack is backgrounded; it keeps running after uvicorn exits.
	ministack & sleep 2 && $(UV) run uvicorn server.app:app --host $(SERVER_HOST) --port $(SERVER_PORT)

# ──────────────────────────────────────────────
# Code Quality
# ──────────────────────────────────────────────

.PHONY: format
format: ## Format code with ruff
	$(UV) run ruff format .

.PHONY: lint
lint: ## Lint code with ruff
	$(UV) run ruff check .

.PHONY: lint-fix
lint-fix: ## Lint and auto-fix code with ruff
	$(UV) run ruff check --fix .

.PHONY: typecheck
typecheck: ## Run type checking with mypy
	$(UV) run mypy

.PHONY: check
check: lint typecheck ## Run lint and typecheck

# ──────────────────────────────────────────────
# Docker
# ──────────────────────────────────────────────

.PHONY: docker-build
docker-build: ## Build Docker image
	docker build -t $(DOCKER_IMAGE):$(DOCKER_TAG) .

.PHONY: docker-run
docker-run: ## Run Docker container
	docker run --rm -p $(SERVER_PORT):8000 $(DOCKER_IMAGE):$(DOCKER_TAG)

.PHONY: docker-run-dev
docker-run-dev: ## Run Docker container in dev mode with live reload
	# $(CURDIR) (not $$PWD) so the mount is correct under `make -C`.
	docker run --rm -p $(SERVER_PORT):8000 -v $(CURDIR):/app/env -v /app/env/.venv -e DEV_MODE=1 $(DOCKER_IMAGE):$(DOCKER_TAG)

.PHONY: docker-run-detach
docker-run-detach: ## Run Docker container in background
	docker run -d --rm -p $(SERVER_PORT):8000 --name $(DOCKER_IMAGE) $(DOCKER_IMAGE):$(DOCKER_TAG)

.PHONY: docker-stop
docker-stop: ## Stop the running Docker container
	docker stop $(DOCKER_IMAGE)

.PHONY: docker-logs
docker-logs: ## Tail logs from the running Docker container
	docker logs -f $(DOCKER_IMAGE)

.PHONY: docker-shell
docker-shell: ## Open a shell in the running Docker container
	docker exec -it $(DOCKER_IMAGE) /bin/bash

.PHONY: docker-clean
docker-clean: ## Stop and remove all running containers for this image
	@docker ps -q --filter ancestor=$(DOCKER_IMAGE):$(DOCKER_TAG) | xargs -r docker rm -f

.PHONY: docker-health
docker-health: ## Check health of the running container
	@curl -sf http://localhost:$(SERVER_PORT)/health && echo " OK" || echo " FAIL"

# ──────────────────────────────────────────────
# OpenEnv
# ──────────────────────────────────────────────

.PHONY: openenv-validate
openenv-validate: ## Validate the OpenEnv configuration
	openenv validate

.PHONY: openenv-build
openenv-build: ## Build the environment using OpenEnv CLI
	openenv build

.PHONY: openenv-push
openenv-push: ## Push the environment to Hugging Face Spaces
	openenv push

# ──────────────────────────────────────────────
# Cleanup
# ──────────────────────────────────────────────

.PHONY: clean
clean: ## Remove build artifacts and caches
	rm -rf build/ dist/ *.egg-info .eggs/
	rm -rf .pytest_cache/ .mypy_cache/ .ruff_cache/
	rm -rf htmlcov/ .coverage coverage.xml
	find . -type d -name __pycache__ -exec rm -rf {} + 2>/dev/null || true
	find . -type f -name '*.pyc' -delete 2>/dev/null || true

.PHONY: clean-all
clean-all: clean ## Remove all artifacts including venv
	rm -rf .venv/

# ──────────────────────────────────────────────
# Help
# ──────────────────────────────────────────────

.PHONY: help
help: ## Show this help message
	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | \
	awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}'
README.md CHANGED
@@ -1,10 +1,217 @@
1
  ---
2
- title: Aws Rl Env
3
- emoji: 📚
4
- colorFrom: indigo
5
  colorTo: pink
6
  sdk: docker
7
  pinned: false
 
 
 
 
8
  ---
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: AWS RL Environment Server
3
+ emoji: 🥇
4
+ colorFrom: pink
5
  colorTo: pink
6
  sdk: docker
7
  pinned: false
8
+ app_port: 8000
9
+ base_path: /web
10
+ tags:
11
+ - openenv
12
  ---
13
 
14
+ # AWS RL Environment
15
+
16
+ An RL environment backed by a **simulated AWS cloud** powered by [MiniStack](https://github.com/Nahuel990/ministack). The agent sends AWS API calls as actions and receives API responses as observations. MiniStack runs inside the same Docker container, emulating 34 AWS services locally.
17
+
18
+ ## Quick Start
19
+
20
+ ```python
21
+ from aws_rl_env import AwsRlAction, AwsRlEnv
22
+
23
+ try:
24
+ # Create environment from Docker image
25
+ env = AwsRlEnv.from_docker_image("aws_rl_env-env:latest")
26
+
27
+ # Reset
28
+ result = env.reset()
29
+ print(f"Episode: {result.observation.episode_id}")
30
+
31
+ # Create an S3 bucket
32
+ result = env.step(AwsRlAction(command="aws s3 mb s3://my-rl-bucket"))
33
+ print(f"Create bucket success: {result.observation.command_success}")
34
+ print(f"Output: {result.observation.command_output}")
35
+
36
+ # Upload a file to the bucket
37
+ result = env.step(AwsRlAction(command="aws s3 cp hello.txt s3://my-rl-bucket/"))
38
+ print(f"Upload success: {result.observation.command_success}")
39
+
40
+ # List buckets
41
+ result = env.step(AwsRlAction(command="aws s3 ls"))
42
+ print(f"Buckets: {result.observation.command_output}")
43
+
44
+ # Describe EC2 instances
45
+ result = env.step(AwsRlAction(command="aws ec2 describe-instances"))
46
+ print(f"EC2 output: {result.observation.command_output}")
47
+
48
+ # Check current task and resource state
49
+ print(f"Task: {result.observation.task}")
50
+ print(f"Task achieved: {result.observation.task_achieved}")
51
+ print(f"Resources: {result.observation.resources}")
52
+
53
+ finally:
54
+ env.close()
55
+ ```
56
+
57
+ ## Supported AWS Services
58
+
59
+ The environment supports **34 AWS services** via MiniStack:
60
+
61
+ | Category | Services |
62
+ |----------|----------|
63
+ | **Storage & DB** | S3, DynamoDB, RDS, ElastiCache, EFS |
64
+ | **Compute** | Lambda, ECS, EC2, Step Functions |
65
+ | **Messaging** | SQS, SNS, Kinesis, EventBridge, Firehose |
66
+ | **API** | API Gateway v1/v2, ALB/ELBv2 |
67
+ | **Security** | IAM, STS, Cognito, ACM, WAF v2, Secrets Manager |
68
+ | **Monitoring** | CloudWatch, CloudWatch Logs, SSM |
69
+ | **Infrastructure** | CloudFormation, Route53 |
70
+ | **Other** | SES, Athena, Glue, EMR |
71
+
72
+ ## Building the Docker Image
73
+
74
+ ```bash
75
+ docker build -t aws_rl_env-env:latest -f Dockerfile .
76
+ ```
77
+
78
+ The Docker image bundles:
79
+ - The RL environment server (port 8000)
80
+ - MiniStack AWS emulator (port 4566)
81
+ - boto3 for AWS SDK access
82
+ - All MiniStack dependencies
83
+
84
+ ## Environment Details
85
+
86
+ ### Core Types
87
+
88
+ - `TaskID` — Unique task identifier (int)
89
+ - `EpisodeID` — Unique episode identifier (str)
90
+ - `StepCount` — Step counter within an episode (int)
91
+ - `AwsService` — Supported AWS services: `s3`, `ec2`, `dynamodb`, `lambda`
92
+
93
+ ### Task
94
+
95
+ **Task**: Defines what the RL agent must accomplish
96
+
97
+ - `task_id` (TaskID) — Unique task identifier
98
+ - `difficulty` (TaskDifficulty) — One of: `warmup`, `beginner`, `intermediate`, `advanced`, `expert`
99
+ - `description` (str) — Human-readable task description
100
+ - `success_criteria` (dict) — Machine-readable criteria to evaluate task completion
101
+
102
+ ### Action
103
+
104
+ **AwsRlAction**: An AWS CLI command to execute against MiniStack
105
+
106
+ - `command` (str) — AWS CLI command to execute, e.g. `"aws s3 ls"`, `"aws ec2 describe-instances"`
107
+
108
+ ### Observation
109
+
110
+ **AwsRlObservation**: The result returned after each step
111
+
112
+ - `episode_id` (EpisodeID) — Unique identifier for the episode
113
+ - `step_count` (StepCount) — Current step count in the episode
114
+ - `command_success` (bool) — Whether the CLI command executed successfully
115
+ - `command_output` (str) — Stdout from the executed AWS CLI command
116
+ - `error` (str) — Stderr if the command failed
117
+ - `resources` (dict[AwsService, dict | list | str]) — Current resource state from MiniStack, keyed by service name
118
+ - `task` (Task | None) — The task the agent is trying to accomplish
119
+ - `task_achieved` (bool) — Whether the task has been achieved
120
+
121
+ ## Architecture
122
+
123
+ ```
124
+ ┌─────────────────────────────────────────┐
125
+ │ Docker Container │
126
+ │ │
127
+ │ ┌──────────────┐ ┌───────────────┐ │
128
+ │ │ RL Server │ │ MiniStack │ │
129
+ │ │ (port 8000) │──▶│ (port 4566) │ │
130
+ │ │ FastAPI + │ │ 34 AWS │ │
131
+ │ │ WebSocket │ │ services │ │
132
+ │ └──────────────┘ └───────────────┘ │
133
+ │ │ │ │
134
+ │ │ boto3 calls │ │
135
+ │ └──��─────────────────┘ │
136
+ └─────────────────────────────────────────┘
137
+
138
+ │ WebSocket / HTTP
139
+
140
+ RL Agent (client)
141
+ ```
142
+
143
+ ## Advanced Usage
144
+
145
+ ### Connecting to an Existing Server
146
+
147
+ ```python
148
+ from aws_rl_env import AwsRlAction, AwsRlEnv
149
+
150
+ env = AwsRlEnv(base_url="http://localhost:8000")
151
+ result = env.reset()
152
+
153
+ # Create a DynamoDB table
154
+ result = env.step(AwsRlAction(
155
+ command="aws dynamodb create-table --table-name my-table --key-schema AttributeName=id,KeyType=HASH --attribute-definitions AttributeName=id,AttributeType=S --billing-mode PAY_PER_REQUEST"
156
+ ))
157
+ print(f"Table created: {result.observation.command_success}")
158
+ print(f"Output: {result.observation.command_output}")
159
+ ```
160
+
161
+ ### Concurrent Sessions
162
+
163
+ ```python
164
+ from aws_rl_env import AwsRlAction, AwsRlEnv
165
+ from concurrent.futures import ThreadPoolExecutor
166
+
167
+ def run_episode(client_id: int):
168
+ with AwsRlEnv(base_url="http://localhost:8000") as env:
169
+ result = env.reset()
170
+ for i in range(10):
171
+ result = env.step(AwsRlAction(
172
+ command=f"aws s3api put-object --bucket client-{client_id} --key step-{i}.txt --body 'data from step {i}'"
173
+ ))
174
+ return client_id, result.observation.command_success
175
+
176
+ with ThreadPoolExecutor(max_workers=4) as executor:
177
+ results = list(executor.map(run_episode, range(4)))
178
+ ```
179
+
180
+ ### Running Locally (without Docker)
181
+
182
+ Start MiniStack and the RL server separately:
183
+
184
+ ```bash
185
+ # Terminal 1: Start MiniStack
186
+ pip install ministack
187
+ ministack # Runs on port 4566
188
+
189
+ # Terminal 2: Start RL server
190
+ export AWS_ENDPOINT_URL=http://localhost:4566
191
+ export AWS_ACCESS_KEY_ID=test
192
+ export AWS_SECRET_ACCESS_KEY=test
193
+ uvicorn server.app:app --reload --host 0.0.0.0 --port 8000
194
+ ```
195
+
196
+ ## Project Structure
197
+
198
+ ```
199
+ aws_rl_env/
200
+ ├── __init__.py # Module exports
201
+ ├── README.md # This file
202
+ ├── Dockerfile # Container image (bundles RL server + MiniStack)
203
+ ├── entrypoint.sh # Starts MiniStack then RL server
204
+ ├── openenv.yaml # OpenEnv manifest
205
+ ├── pyproject.toml # Project metadata and dependencies
206
+ ├── uv.lock # Locked dependencies
207
+ ├── client.py # AwsRlEnv client
208
+ ├── models.py # AwsRlAction and AwsRlObservation models
209
+ ├── ministack/ # MiniStack AWS emulator (bundled)
210
+ │ ├── app.py # MiniStack ASGI application
211
+ │ ├── core/ # Routing, persistence, responses
212
+ │ └── services/ # 34 AWS service implementations
213
+ └── server/
214
+ ├── __init__.py
215
+ ├── aws_rl_env_environment.py # Core RL environment (uses boto3 → MiniStack)
216
+ └── app.py # FastAPI application
217
+ ```
__init__.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""Aws Rl Env Environment.

Re-exports the client and its action/observation models so callers can write
``from aws_rl_env import AwsRlEnv, AwsRlAction, AwsRlObservation``.
"""

from .client import AwsRlEnv
from .models import AwsRlAction, AwsRlObservation

__all__ = [
    "AwsRlAction",
    "AwsRlObservation",
    "AwsRlEnv",
]
client.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""Aws Rl Env Environment Client."""

from typing import Dict

from openenv.core import EnvClient
from openenv.core.client_types import StepResult
from openenv.core.env_server.types import State

# NOTE(review): absolute import mirrors inference.py's script-style usage;
# when imported as a package via __init__.py this resolves only if the package
# directory is on sys.path (PYTHONPATH=/app/env in the Dockerfile) — confirm.
from models import AwsRlAction, AwsRlObservation, EpisodeID, StepCount


class AwsRlEnv(EnvClient[AwsRlAction, AwsRlObservation, State]):
    """
    Client for the Aws Rl Env Environment.

    This client maintains a persistent WebSocket connection to the environment
    server, enabling efficient multi-step interactions with lower latency.
    Each client instance has its own dedicated environment session on the server.

    Example:
        >>> with AwsRlEnv(base_url="http://localhost:8000") as client:
        ...     result = client.reset()
        ...     print(result.observation.command_output)
        ...
        ...     result = client.step(AwsRlAction(command="aws s3 ls"))
        ...     print(result.observation.command_output)

    Example with Docker:
        >>> client = AwsRlEnv.from_docker_image("aws_rl_env-env:latest")
        >>> try:
        ...     result = client.reset()
        ...     result = client.step(AwsRlAction(command="aws s3 ls"))
        ... finally:
        ...     client.close()
    """

    def _step_payload(self, action: AwsRlAction) -> Dict:
        """Convert an AwsRlAction into the JSON payload for a step message."""
        return {"command": action.command}

    def _parse_result(self, payload: Dict) -> StepResult[AwsRlObservation]:
        """Parse a server response into StepResult[AwsRlObservation].

        Missing observation fields fall back to neutral defaults, so a partial
        payload never raises here.
        """
        obs_data = payload.get("observation", {})
        # NOTE(review): the 'resources' field documented in the README
        # observation schema is not forwarded here — confirm whether
        # AwsRlObservation defines it and whether it should be parsed.
        observation = AwsRlObservation(
            episode_id=EpisodeID(obs_data.get("episode_id", "")),
            step_count=StepCount(obs_data.get("step_count", 0)),
            command_success=obs_data.get("command_success", False),
            command_output=obs_data.get("command_output", ""),
            error=obs_data.get("error", ""),
            task=obs_data.get("task"),
            task_achieved=obs_data.get("task_achieved", False),
            # done/reward are duplicated on the observation in addition to the
            # StepResult below; kept for backward compatibility.
            done=payload.get("done", False),
            reward=payload.get("reward", 0.0),
        )

        return StepResult(
            observation=observation,
            reward=payload.get("reward", 0.0),
            done=payload.get("done", False),
        )

    def _parse_state(self, payload: Dict) -> State:
        """Parse a server response into a State object."""
        return State(
            episode_id=payload.get("episode_id"),
            step_count=payload.get("step_count", 0),
        )
inference.py ADDED
@@ -0,0 +1,241 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
Inference Script Example
===================================
MANDATORY
- Before submitting, ensure the following variables are defined in your environment configuration:
    API_BASE_URL       The API endpoint for the LLM.
    MODEL_NAME         The model identifier to use for inference.
    HF_TOKEN           Your Hugging Face / API key.
    LOCAL_IMAGE_NAME   The name of the local image to use for the environment if you are using
                       the from_docker_image() method

- Defaults are set only for API_BASE_URL and MODEL_NAME
  (and should reflect your active inference setup):
    API_BASE_URL = os.getenv("API_BASE_URL", "<your-active-endpoint>")
    MODEL_NAME = os.getenv("MODEL_NAME", "<your-active-model>")

- The inference script must be named `inference.py` and placed in the root directory of the project
- Participants must use OpenAI Client for all LLM calls using above variables

STDOUT FORMAT
- The script must emit exactly three line types to stdout, in this order:

    [START] task=<task_name> env=<benchmark> model=<model_name>
    [STEP] step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
    [END] success=<true|false> steps=<n> score=<0.000> rewards=<r1,r2,...,rn>

Rules:
- One [START] line at episode begin.
- One [STEP] line per step, immediately after env.step() returns.
- One [END] line after env.close(), always emitted (even on exception).
- reward and rewards are formatted to 2 decimal places; score to 3.
- done and success are lowercase booleans: true or false.
- error is the raw last_action_error string, or null if none.
- All fields on a single line with no newlines within a line.

Example:
    [START] task=create-s3-bucket env=aws_rl_env model=Qwen2.5-72B-Instruct
    [STEP] step=1 action=aws s3api create-bucket --bucket my-test-bucket reward=1.00 done=false error=null
    [END] success=true steps=1 score=1.000 rewards=1.00
"""

import asyncio
import os
import textwrap
from typing import List, Optional

from dotenv import load_dotenv
from openai import OpenAI

from client import AwsRlEnv
from models import AwsRlAction

load_dotenv()  # Load variables from .env file if present

# Docker image for AwsRlEnv.from_docker_image(). The docstring mandates
# LOCAL_IMAGE_NAME; also honor the legacy IMAGE_NAME for backward compatibility.
IMAGE_NAME = os.getenv("LOCAL_IMAGE_NAME") or os.getenv("IMAGE_NAME")
API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY")

API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct"
BENCHMARK = os.getenv("BENCHMARK", "aws_rl_env")
MAX_STEPS = int(os.getenv("MAX_STEPS", "15"))
TEMPERATURE = 0.7
MAX_TOKENS = 512
SUCCESS_SCORE_THRESHOLD = 1.0  # task_achieved yields reward=1.0

SYSTEM_PROMPT = textwrap.dedent(
    """
    You are an AWS cloud engineer interacting with a real AWS environment via CLI.
    Each turn you must send exactly ONE valid AWS CLI command (starting with 'aws').

    You will be given a task to accomplish. Read the task description carefully.
    Use the command output and error messages to guide your next action.

    Rules:
    - Only send AWS CLI commands (e.g. 'aws s3 ls', 'aws dynamodb create-table ...')
    - One command per turn — no pipes, no shell syntax, no chaining
    - Reply with ONLY the command, nothing else — no explanations, no quotes
    """
).strip()
80
+
81
+
82
def log_start(task: str, env: str, model: str) -> None:
    """Emit the mandatory single [START] line marking episode begin."""
    print(f"[START] task={task} env={env} model={model}", flush=True)
84
+
85
+
86
def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
    """Emit one [STEP] line: reward to 2 decimals, lowercase booleans,
    and 'null' when there is no error string."""
    error_val = error if error else "null"
    done_val = str(done).lower()
    print(
        f"[STEP] step={step} action={action} reward={reward:.2f} done={done_val} error={error_val}",
        flush=True,
    )
93
+
94
+
95
def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
    """Emit the final [END] line: lowercase success, score to 3 decimals,
    rewards comma-joined at 2 decimals each."""
    rewards_str = ",".join(f"{r:.2f}" for r in rewards)
    print(f"[END] success={str(success).lower()} steps={steps} score={score:.3f} rewards={rewards_str}", flush=True)
98
+
99
+
100
def build_user_prompt(
    task_description: str,
    step: int,
    last_output: str,
    last_error: str,
    last_reward: float,
    history: List[str],
) -> str:
    """Build the per-turn user prompt for the model.

    Fix: the original interpolated a multi-line history block into a
    textwrap.dedent() template; history lines at column 0 emptied the common
    indent prefix, so prompts with multi-entry history kept 8-space
    indentation. Building the prompt by joining explicit lines avoids that.

    Only the last 6 history entries are included to bound prompt size.
    """
    history_block = "\n".join(history[-6:]) if history else "None"
    lines = [
        f"TASK: {task_description}",
        "",
        f"Step: {step}",
        f"Last command output: {last_output!r}",
        f"Last error: {last_error!r}",
        f"Last reward: {last_reward:.2f}",
        "",
        "Previous steps:",
        history_block,
        "",
        "Send your next AWS CLI command.",
    ]
    return "\n".join(lines)
124
+
125
+
126
def get_model_command(
    client: OpenAI,
    task_description: str,
    step: int,
    last_output: str,
    last_error: str,
    last_reward: float,
    history: List[str],
) -> str:
    """Ask the LLM for the next AWS CLI command.

    Returns the model's reply (markdown code fences stripped). Any reply that
    does not start with 'aws ' — and any request failure — degrades to the
    safe no-op command 'aws help' so the episode loop never raises here.
    """
    user_prompt = build_user_prompt(
        task_description, step, last_output, last_error, last_reward, history
    )
    try:
        completion = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_prompt},
            ],
            temperature=TEMPERATURE,
            max_tokens=MAX_TOKENS,
            stream=False,
        )
        text = (completion.choices[0].message.content or "").strip()
        # Strip markdown code fences if the model wraps the command
        if text.startswith("```"):
            lines = text.split("\n")
            text = "\n".join(
                line for line in lines if not line.startswith("```")
            ).strip()
        return text if text.startswith("aws ") else "aws help"
    except Exception as exc:
        print(f"[DEBUG] Model request failed: {exc}", flush=True)
        return "aws help"
160
+
161
+
162
async def main() -> None:
    """Run one episode against the AWS RL environment, emitting the
    [START]/[STEP]/[END] protocol lines on stdout.

    The [END] line is always emitted (finally block), even on exception,
    per the contract in the module docstring.
    """
    client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)

    # NOTE(review): from_docker_image/reset/step/close are awaited here, but
    # client.py's AwsRlEnv docstring shows synchronous usage (no await) —
    # confirm which openenv client API (sync vs async) this script targets.
    env = await AwsRlEnv.from_docker_image(IMAGE_NAME)

    history: List[str] = []
    rewards: List[float] = []
    steps_taken = 0
    score = 0.0
    success = False
    task_name = "unknown"
    task_description = ""

    try:
        result = await env.reset()  # OpenENV.reset()
        obs = result.observation

        # Extract task info from the first observation
        if obs.task is not None:
            task_name = f"task-{obs.task.task_id}"
            task_description = obs.task.description
        else:
            task_description = "No task assigned."

        log_start(task=task_name, env=BENCHMARK, model=MODEL_NAME)

        last_output = obs.command_output
        last_error = ""
        last_reward = 0.0

        for step in range(1, MAX_STEPS + 1):
            if result.done:
                break

            command = get_model_command(
                client, task_description, step,
                last_output, last_error, last_reward, history,
            )

            result = await env.step(AwsRlAction(command=command))
            obs = result.observation

            reward = result.reward or 0.0
            done = result.done
            error = obs.error if obs.error else None

            rewards.append(reward)
            steps_taken = step
            last_output = obs.command_output
            last_error = obs.error
            last_reward = reward

            log_step(step=step, action=command, reward=reward, done=done, error=error)

            status = "OK" if obs.command_success else "FAIL"
            history.append(f"Step {step} [{status}]: {command} -> reward={reward:.2f}")

            # Task achieved — episode success
            if obs.task_achieved:
                success = True
                break

            if done:
                break

        # Episode score = best single-step reward, clamped to [0, 1].
        score = max(rewards) if rewards else 0.0
        score = min(max(score, 0.0), 1.0)  # clamp to [0, 1]
        if not success:
            success = score >= SUCCESS_SCORE_THRESHOLD

    finally:
        # Best-effort container cleanup; never let close() mask the episode result.
        try:
            await env.close()
        except Exception as e:
            print(f"[DEBUG] env.close() error (container cleanup): {e}", flush=True)
        log_end(success=success, steps=steps_taken, score=score, rewards=rewards)


if __name__ == "__main__":
    asyncio.run(main())
models.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
Data models for the Aws Rl Env Environment.
"""

from enum import Enum
from typing import NewType, Union

from openenv.core.env_server.types import Action, Observation
from pydantic import BaseModel, Field

# ---------------------------------------------------------------------------
# Core Types
# ---------------------------------------------------------------------------

# Distinct aliases over plain int/str so signatures are self-documenting and
# type-checkers can flag accidental mix-ups.
TaskID = NewType("TaskID", int)
EpisodeID = NewType("EpisodeID", str)
StepCount = NewType("StepCount", int)
19
+
20
class AwsService(str, Enum):
    """AWS service identifiers tracked by the environment.

    str-mixin so members compare equal to their plain string values and
    serialize naturally in JSON payloads.
    """

    S3 = "s3"
    EC2 = "ec2"
    DYNAMODB = "dynamodb"
    LAMBDA = "lambda"
    SQS = "sqs"
    SNS = "sns"
    IAM = "iam"
    APIGATEWAY = "apigateway"
29
+
30
+
31
+ # ---------------------------------------------------------------------------
32
+ # RL Task Definition
33
+ # ---------------------------------------------------------------------------
34
+
35
+
36
class TaskDifficulty(str, Enum):
    """Difficulty tiers for RL tasks, ordered easiest to hardest."""

    WARMUP = "warmup"
    BEGINNER = "beginner"
    INTERMEDIATE = "intermediate"
    ADVANCED = "advanced"
    EXPERT = "expert"
42
+
43
+
44
class TierConfig(BaseModel):
    """Configuration for a single difficulty tier's promotion and mastery rules."""

    min_episodes: int = Field(..., ge=0, description="Minimum episodes before promotion eligible")
    advance_rate: float = Field(..., ge=0.0, le=1.0, description="Tier success rate to advance")
    mastery_window: int = Field(default=10, ge=1, description="Sliding window size for success rate")
    mastery_threshold: float = Field(
        default=0.7, ge=0.0, le=1.0, description="Per-task graduation threshold"
    )
    fast_track_rate: float = Field(
        default=0.9, ge=0.0, le=1.0,
        description="Success rate for early promotion after 3 episodes",
    )


class SpacedRepState(BaseModel):
    """Tracks spaced repetition schedule for a graduated task."""

    interval: int = Field(default=3, ge=1, description="Episodes until next re-test")
    last_graduated_episode: int = Field(
        default=0, ge=0, description="Episode number when task was last graduated"
    )


class SetupCommand(BaseModel):
    """A single AWS CLI command executed during environment setup before the agent acts."""

    command: str = Field(..., description="AWS CLI command to execute")
    description: str | None = Field(
        default=None, description="Human-readable explanation of what this command sets up"
    )
    ignore_failure: bool = Field(
        default=False,
        description="If True, continue setup even if this command fails",
    )


class ResourceExistsCheck(BaseModel):
    """Checks that a specific named resource exists in MiniStack."""

    service: AwsService = Field(..., description="AWS service to verify the resource in")
    name: str = Field(..., description="Exact resource name to verify")


class StepCriteria(BaseModel):
    """A single required step in a multi-step task."""

    operation: str = Field(..., description="AWS CLI operation, e.g. 'create-bucket'")
    resource: str | None = Field(
        default=None, description="Resource name the operation must target"
    )


class StateCheck(BaseModel):
    """An assertion about the environment's end-state, evaluated via AWS CLI."""

    command: str = Field(..., description="AWS CLI command to run for verification")
    output_contains: str | None = Field(
        default=None, description="Substring that must appear in stdout"
    )
    json_path: str | None = Field(
        default=None, description="JSON path to extract from stdout, e.g. '$.Table.Name'"
    )
    expected: int | float | str | bool | None = Field(
        default=None, description="Expected value at json_path"
    )
110
+
111
+
112
+ class SuccessCriteria(BaseModel):
113
+ """Machine-readable criteria to evaluate task completion.
114
+
115
+ Different tiers populate different fields:
116
+ - Warmup: command_contains + operation
117
+ - Beginner: command_contains + operation + resource_exists
118
+ - Intermediate: steps
119
+ - Advanced: services + steps
120
+ - Expert: services + state_checks + steps (optional)
121
+ """
122
+
123
+ command_contains: str | None = Field(
124
+ default=None, description="Substring the agent's command must contain"
125
+ )
126
+ operation: str | None = Field(
127
+ default=None, description="AWS CLI operation the agent must invoke"
128
+ )
129
+ resource_exists: ResourceExistsCheck | None = Field(
130
+ default=None, description="Resource that must exist after the agent acts"
131
+ )
132
+ steps: list[StepCriteria] = Field(
133
+ default_factory=list, description="Ordered sequence of required operations"
134
+ )
135
+ services: list[AwsService] = Field(
136
+ default_factory=list, description="AWS services the agent must interact with"
137
+ )
138
+ state_checks: list[StateCheck] = Field(
139
+ default_factory=list,
140
+ description="End-state assertions — source of truth for expert/SRE tasks",
141
+ )
142
+
143
+
144
+ class Task(BaseModel):
145
+ """Defines a task the RL agent must accomplish in the AWS environment."""
146
+
147
+ task_id: TaskID = Field(..., ge=0, description="Unique task identifier")
148
+ difficulty: TaskDifficulty = Field(
149
+ default=TaskDifficulty.WARMUP, description="Task difficulty level"
150
+ )
151
+ description: str = Field(..., description="Human-readable task description")
152
+ success_criteria: SuccessCriteria = Field(
153
+ default_factory=SuccessCriteria,
154
+ description="Machine-readable criteria to evaluate task completion",
155
+ )
156
+ setup_commands: list[SetupCommand] = Field(
157
+ default_factory=list,
158
+ description="Commands to run during reset to set up initial state (e.g. for SRE tasks)",
159
+ )
160
+
161
+
162
+ # ---------------------------------------------------------------------------
163
+ # Action & Observation
164
+ # ---------------------------------------------------------------------------
165
+
166
+
167
+ class AwsRlAction(Action):
168
+ """Action for the Aws Rl Env environment — an AWS CLI command to execute against MiniStack."""
169
+
170
+ command: str = Field(
171
+ ...,
172
+ description="AWS CLI command to execute, e.g. 'aws s3 ls', 'aws ec2 describe-instances'",
173
+ )
174
+
175
+
176
+ class AwsRlObservation(Observation):
177
+ """Observation returned after each step in the AWS RL environment."""
178
+
179
+ episode_id: EpisodeID = Field(..., description="Unique identifier for the episode")
180
+ step_count: StepCount = Field(..., ge=0, description="Current step count in the episode")
181
+ command_success: bool = Field(
182
+ ..., description="Whether the CLI command executed successfully"
183
+ )
184
+ command_output: str = Field(
185
+ default="", description="Stdout from the executed AWS CLI command"
186
+ )
187
+ error: str = Field(default="", description="Stderr if the command failed")
188
+ resources: dict[AwsService, Union[dict, list, str]] = Field(
189
+ default_factory=dict,
190
+ description="Current resource state from MiniStack, keyed by service name",
191
+ )
192
+ task: Task | None = Field(
193
+ default=None, description="The task the agent is trying to accomplish"
194
+ )
195
+ task_achieved: bool = Field(
196
+ default=False, description="Whether the task has been achieved"
197
+ )
openenv.yaml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ spec_version: 1
2
+ name: aws_rl_env
3
+ type: space
4
+ runtime: fastapi
5
+ app: server.app:app
6
+ port: 8000
7
+
openenv_aws_rl_env.egg-info/PKG-INFO ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Metadata-Version: 2.4
2
+ Name: openenv-aws_rl_env
3
+ Version: 0.1.0
4
+ Summary: Aws Rl Env environment for OpenEnv
5
+ Requires-Python: >=3.12
6
+ Requires-Dist: openenv-core[core]>=0.2.2
7
+ Requires-Dist: ministack>=1.1.24
8
+ Requires-Dist: python-dotenv>=1.0.0
9
+ Provides-Extra: dev
10
+ Requires-Dist: pytest>=8.0.0; extra == "dev"
11
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
12
+ Requires-Dist: ruff>=0.4.0; extra == "dev"
13
+ Requires-Dist: mypy>=1.10.0; extra == "dev"
14
+ Requires-Dist: types-PyYAML>=6.0.0; extra == "dev"
openenv_aws_rl_env.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ README.md
2
+ __init__.py
3
+ client.py
4
+ inference.py
5
+ models.py
6
+ pyproject.toml
7
+ ./__init__.py
8
+ ./client.py
9
+ ./inference.py
10
+ ./models.py
11
+ openenv_aws_rl_env.egg-info/PKG-INFO
12
+ openenv_aws_rl_env.egg-info/SOURCES.txt
13
+ openenv_aws_rl_env.egg-info/dependency_links.txt
14
+ openenv_aws_rl_env.egg-info/entry_points.txt
15
+ openenv_aws_rl_env.egg-info/requires.txt
16
+ openenv_aws_rl_env.egg-info/top_level.txt
17
+ server/__init__.py
18
+ server/app.py
19
+ server/aws_rl_env_environment.py
openenv_aws_rl_env.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
 
 
1
+
openenv_aws_rl_env.egg-info/entry_points.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ [console_scripts]
2
+ server = aws_rl_env.server.app:main
openenv_aws_rl_env.egg-info/requires.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ openenv-core[core]>=0.2.2
2
+ ministack>=1.1.24
3
+ python-dotenv>=1.0.0
4
+
5
+ [dev]
6
+ pytest>=8.0.0
7
+ pytest-cov>=4.0.0
8
+ ruff>=0.4.0
9
+ mypy>=1.10.0
10
+ types-PyYAML>=6.0.0
openenv_aws_rl_env.egg-info/top_level.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ aws_rl_env
pyproject.toml ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ [build-system]
8
+ requires = ["setuptools>=45", "wheel"]
9
+ build-backend = "setuptools.build_meta"
10
+
11
+ [project]
12
+ name = "openenv-aws_rl_env"
13
+ version = "0.1.0"
14
+ description = "Aws Rl Env environment for OpenEnv"
15
+ requires-python = ">=3.12"
16
+ dependencies = [
17
+ # Core OpenEnv runtime (provides FastAPI server + HTTP client types)
18
+ # install from github
19
+ # "openenv-core[core] @ git+https://github.com/meta-pytorch/OpenEnv.git",
20
+ "openenv-core[core]>=0.2.2",
21
+ # Environment-specific dependencies
22
+ # Add all dependencies needed for your environment here
23
+ # Examples:
24
+ # "numpy>=1.19.0",
25
+ # "torch>=2.0.0",
26
+ # "gymnasium>=0.29.0",
27
+ # "openspiel>=1.0.0",
28
+ # "smolagents>=1.22.0,<2",
29
+ "ministack>=1.1.24",
30
+ "python-dotenv>=1.0.0",
31
+ ]
32
+
33
+ [project.optional-dependencies]
34
+ dev = [
35
+ "pytest>=8.0.0",
36
+ "pytest-cov>=4.0.0",
37
+ "ruff>=0.4.0",
38
+ "mypy>=1.10.0",
39
+ "types-PyYAML>=6.0.0",
40
+ ]
41
+
42
+ [project.scripts]
43
+ # Server entry point - enables running via: uv run --project . server
44
+ # or: python -m aws_rl_env.server.app
45
+ server = "aws_rl_env.server.app:main"
46
+
47
+ [tool.setuptools]
48
+ include-package-data = true
49
+ packages = ["aws_rl_env", "aws_rl_env.server"]
50
+ package-dir = { "aws_rl_env" = ".", "aws_rl_env.server" = "server" }
51
+
52
+ [tool.mypy]
53
+ files = ["*.py", "server/"]
54
+ ignore_missing_imports = true
55
+ namespace_packages = true
56
+ explicit_package_bases = true
57
+ mypy_path = "."
server/__init__.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """Aws Rl Env environment server components."""
8
+
9
+ from .aws_rl_env_environment import AwsRlEnvironment
10
+
11
+ __all__ = ["AwsRlEnvironment"]
server/app.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ FastAPI application for the Aws Rl Env Environment.
9
+
10
+ This module creates an HTTP server that exposes the AwsRlEnvironment
11
+ over HTTP and WebSocket endpoints, compatible with EnvClient.
12
+
13
+ Endpoints:
14
+ - POST /reset: Reset the environment
15
+ - POST /step: Execute an action
16
+ - GET /state: Get current environment state
17
+ - GET /schema: Get action/observation schemas
18
+ - WS /ws: WebSocket endpoint for persistent sessions
19
+
20
+ Usage:
21
+ # Development (with auto-reload):
22
+ uvicorn server.app:app --reload --host 0.0.0.0 --port 8000
23
+
24
+ # Production:
25
+ uvicorn server.app:app --host 0.0.0.0 --port 8000 --workers 4
26
+
27
+ # Or run directly:
28
+ python -m server.app
29
+ """
30
+
31
+ try:
32
+ from openenv.core.env_server.http_server import create_app
33
+ except Exception as e: # pragma: no cover
34
+ raise ImportError(
35
+ "openenv is required for the web interface. Install dependencies with '\n uv sync\n'"
36
+ ) from e
37
+
38
+
39
+ from models import AwsRlAction, AwsRlObservation
40
+ from server.aws_rl_env_environment import AwsRlEnvironment
41
+
42
+
43
+ # Create the app with web interface and README integration
44
+ app = create_app(
45
+ AwsRlEnvironment,
46
+ AwsRlAction,
47
+ AwsRlObservation,
48
+ env_name="aws_rl_env",
49
+ max_concurrent_envs=1, # increase this number to allow more concurrent WebSocket sessions
50
+ )
51
+
52
+
53
+ def main(host: str = "0.0.0.0", port: int = 8000):
54
+ """
55
+ Entry point for direct execution via uv run or python -m.
56
+
57
+ This function enables running the server without Docker:
58
+ uv run --project . server
59
+ uv run --project . server --port 8001
60
+ python -m aws_rl_env.server.app
61
+
62
+ Args:
63
+ host: Host address to bind to (default: "0.0.0.0")
64
+ port: Port number to listen on (default: 8000)
65
+
66
+ For production deployments, consider using uvicorn directly with
67
+ multiple workers:
68
+ uvicorn aws_rl_env.server.app:app --workers 4
69
+ """
70
+ import uvicorn
71
+
72
+ uvicorn.run(app, host=host, port=port)
73
+
74
+
75
+ if __name__ == "__main__":
76
+ main()
server/aws_rl_env_environment.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ Aws Rl Env Environment Implementation.
9
+
10
+ An RL environment backed by a simulated AWS cloud powered by MiniStack.
11
+ The agent sends AWS CLI commands as actions and receives CLI output plus
12
+ the current resource state as observations.
13
+ """
14
+
15
+ import logging
16
+
17
+ from typing import Any, Optional
18
+ from uuid import uuid4
19
+
20
+ from openenv.core.env_server.interfaces import Environment
21
+ from openenv.core.env_server.types import State
22
+
23
+ from models import AwsRlAction, AwsRlObservation, EpisodeID, StepCount, Task
24
+ from server.services.aws_backend import AwsBackend
25
+ from server.services.curriculum import Curriculum
26
+ from server.services.environment_designer import EnvironmentDesigner
27
+ from server.services.episode_tracker import EpisodeTracker
28
+ from server.services.task_grader import TaskGrader
29
+
30
+ logger = logging.getLogger(__name__)
31
+
32
+
33
+ class AwsRlEnvironment(Environment[AwsRlAction, AwsRlObservation, State]):
34
+ SUPPORTS_CONCURRENT_SESSIONS: bool = True
35
+
36
+ def __init__(self) -> None:
37
+ print("Initializing AWS RL Environment...")
38
+ self._state = State(episode_id=str(uuid4()), step_count=0)
39
+ self._backend = AwsBackend()
40
+ self._curriculum = Curriculum()
41
+ self._grader = TaskGrader(self._backend)
42
+ self._designer = EnvironmentDesigner(self._backend)
43
+ self._tracker = EpisodeTracker()
44
+ self._current_task: Task | None = None
45
+
46
+ def reset(
47
+ self,
48
+ seed: Optional[int] = None,
49
+ episode_id: Optional[str] = None,
50
+ **kwargs: Any,
51
+ ) -> AwsRlObservation:
52
+ self._backend.reset_environment()
53
+ self._state = State(episode_id=episode_id or str(uuid4()), step_count=0)
54
+ self._tracker.reset()
55
+ self._current_task = self._curriculum.next_task()
56
+
57
+ self._designer.apply(self._current_task)
58
+
59
+ return AwsRlObservation(
60
+ episode_id=EpisodeID(self._state.episode_id or ""),
61
+ step_count=StepCount(self._state.step_count),
62
+ command_success=True,
63
+ command_output="Environment reset. MiniStack state wiped.",
64
+ task=self._current_task,
65
+ done=False,
66
+ reward=0.0,
67
+ )
68
+
69
+ def step(
70
+ self,
71
+ action: AwsRlAction,
72
+ timeout_s: Optional[float] = None,
73
+ **kwargs: Any,
74
+ ) -> AwsRlObservation:
75
+ assert self._current_task is not None, "Call reset() before step()"
76
+ self._state.step_count += 1
77
+
78
+ # Anti-hack: only allow AWS CLI commands
79
+ command = action.command.strip()
80
+ if not command.startswith("aws "):
81
+ return AwsRlObservation(
82
+ episode_id=EpisodeID(self._state.episode_id or ""),
83
+ step_count=StepCount(self._state.step_count),
84
+ command_success=False,
85
+ command_output="",
86
+ error="Only AWS CLI commands (starting with 'aws') are allowed.",
87
+ task=self._current_task,
88
+ task_achieved=False,
89
+ done=False,
90
+ reward=0.0,
91
+ )
92
+
93
+ success, stdout, stderr = self._backend.execute_command(command)
94
+
95
+ # Record in tracker
96
+ latest_step = self._tracker.record_step(command, success, stdout, stderr)
97
+
98
+ # Grade the task
99
+ task_achieved = False
100
+
101
+ grade_result = self._grader.grade(
102
+ self._current_task, self._tracker, latest_step
103
+ )
104
+ task_achieved = grade_result.task_achieved
105
+ reward = grade_result.reward
106
+
107
+ if task_achieved:
108
+ self._curriculum.record_result(
109
+ self._current_task, achieved=True, reward=reward
110
+ )
111
+
112
+ return AwsRlObservation(
113
+ episode_id=EpisodeID(self._state.episode_id or ""),
114
+ step_count=StepCount(self._state.step_count),
115
+ command_success=success,
116
+ command_output=stdout,
117
+ error=stderr,
118
+ task=self._current_task,
119
+ task_achieved=task_achieved,
120
+ done=task_achieved,
121
+ reward=reward,
122
+ )
123
+
124
+ @property
125
+ def state(self) -> State:
126
+ return self._state
server/requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ openenv[core]>=0.2.0
2
+ fastapi>=0.115.0
3
+ uvicorn>=0.24.0
4
+
5
+
6
+
server/services/__init__.py ADDED
File without changes
server/services/aws_backend.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Backend service for managing AWS interactions via MiniStack."""
2
+
3
+ import logging
4
+ import os
5
+ import subprocess
6
+
7
+ import httpx
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+ MINISTACK_URL = os.getenv("MINISTACK_URL", "http://localhost:4566")
12
+
13
+
14
+ class AwsBackend:
15
+ """Backend service for executing AWS CLI commands against MiniStack."""
16
+
17
+ def __init__(self, ministack_url: str = MINISTACK_URL) -> None:
18
+ self._ministack_url = ministack_url
19
+
20
+ def reset_environment(self) -> None:
21
+ """Wipe all MiniStack service state via POST /_ministack/reset."""
22
+ try:
23
+ resp = httpx.post(
24
+ f"{self._ministack_url}/_ministack/reset", timeout=10
25
+ )
26
+ resp.raise_for_status()
27
+ logger.info("MiniStack state reset successfully")
28
+ except httpx.HTTPError as e:
29
+ logger.warning("Failed to reset MiniStack state: %s", e)
30
+ raise
31
+
32
+ def execute_command(self, command: str) -> tuple[bool, str, str]:
33
+ """Execute an AWS CLI command against MiniStack.
34
+
35
+ Args:
36
+ command: Raw AWS CLI command, e.g. 'aws s3 ls'
37
+
38
+ Returns:
39
+ Tuple of (success, stdout, stderr)
40
+ """
41
+ env = {
42
+ **os.environ,
43
+ "AWS_ENDPOINT_URL": self._ministack_url,
44
+ "AWS_ACCESS_KEY_ID": "test",
45
+ "AWS_SECRET_ACCESS_KEY": "test",
46
+ "AWS_DEFAULT_REGION": "us-east-1",
47
+ }
48
+
49
+ try:
50
+ result = subprocess.run(
51
+ command.split(),
52
+ capture_output=True,
53
+ text=True,
54
+ timeout=30,
55
+ env=env,
56
+ )
57
+ return (
58
+ result.returncode == 0,
59
+ result.stdout,
60
+ result.stderr,
61
+ )
62
+ except subprocess.TimeoutExpired:
63
+ return False, "", "Command timed out after 30s"
64
+ except Exception as e:
65
+ return False, "", str(e)
server/services/curriculum.py ADDED
@@ -0,0 +1,471 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Curriculum manager for progressive LLM training in the AWS RL environment.
2
+
3
+ Training flow:
4
+ 1. Agent starts at the warmup tier with simple listing tasks.
5
+ 2. A priority queue selects the next task based on weakness, novelty,
6
+ spaced repetition, and recency — replacing blind round-robin.
7
+ 3. Per-task mastery tracking graduates individual tasks once the agent
8
+ demonstrates sustained competence.
9
+ 4. Graduated tasks resurface via spaced repetition at exponentially
10
+ increasing intervals to prevent catastrophic forgetting.
11
+ 5. Fast-track promotion lets strong agents skip minimum episode waits.
12
+ 6. Exponential decay on history ensures recent results matter more.
13
+ """
14
+
15
+ import heapq
16
+ import logging
17
+ import random
18
+ from collections import defaultdict
19
+ from pathlib import Path
20
+
21
+ import yaml
22
+
23
+ from models import (
24
+ SetupCommand,
25
+ SpacedRepState,
26
+ SuccessCriteria,
27
+ Task,
28
+ TaskDifficulty,
29
+ TaskID,
30
+ TierConfig,
31
+ )
32
+
33
+ logger = logging.getLogger(__name__)
34
+
35
+ TASKS_DIR = Path(__file__).parent / "tasks"
36
+
37
+ # ---------------------------------------------------------------------------
38
+ # Per-tier configuration
39
+ # ---------------------------------------------------------------------------
40
+
41
+ TIER_CONFIGS: dict[TaskDifficulty, TierConfig] = {
42
+ TaskDifficulty.WARMUP: TierConfig(
43
+ min_episodes=5, advance_rate=0.6, mastery_window=10,
44
+ mastery_threshold=0.7, fast_track_rate=0.9,
45
+ ),
46
+ TaskDifficulty.BEGINNER: TierConfig(
47
+ min_episodes=5, advance_rate=0.6, mastery_window=10,
48
+ mastery_threshold=0.7, fast_track_rate=0.9,
49
+ ),
50
+ TaskDifficulty.INTERMEDIATE: TierConfig(
51
+ min_episodes=8, advance_rate=0.65, mastery_window=10,
52
+ mastery_threshold=0.7, fast_track_rate=0.9,
53
+ ),
54
+ TaskDifficulty.ADVANCED: TierConfig(
55
+ min_episodes=10, advance_rate=0.7, mastery_window=10,
56
+ mastery_threshold=0.7, fast_track_rate=0.9,
57
+ ),
58
+ TaskDifficulty.EXPERT: TierConfig(
59
+ min_episodes=0, advance_rate=1.0, mastery_window=10,
60
+ mastery_threshold=0.7, fast_track_rate=1.0,
61
+ ),
62
+ }
63
+
64
+ # Map YAML filenames to difficulty tiers
65
+ _TIER_FILES: dict[TaskDifficulty, str] = {
66
+ TaskDifficulty.WARMUP: "warmup.yaml",
67
+ TaskDifficulty.BEGINNER: "beginner.yaml",
68
+ TaskDifficulty.INTERMEDIATE: "intermediate.yaml",
69
+ TaskDifficulty.ADVANCED: "advanced.yaml",
70
+ TaskDifficulty.EXPERT: "expert.yaml",
71
+ }
72
+
73
+ # ---------------------------------------------------------------------------
74
+ # Priority score tuning constants
75
+ # ---------------------------------------------------------------------------
76
+
77
+ _NOVELTY_BONUS = 100 # untried tasks — explore first
78
+ _WEAKNESS_WEIGHT = 50 # multiplied by (1 - success_rate)
79
+ _SPACED_REP_BONUS = 30 # graduated task due for re-test
80
+ _RECENCY_PENALTY = 20 # attempted in last 2 episodes
81
+
82
+ # Exponential decay factor for weighted success rate
83
+ _DECAY_FACTOR = 0.85
84
+
85
+ # Minimum attempts before a task can be graduated
86
+ _MIN_ATTEMPTS_FOR_MASTERY = 3
87
+
88
+ # Fast-track requires at least this many episodes in the tier
89
+ _FAST_TRACK_MIN_EPISODES = 3
90
+
91
+
92
+ # ---------------------------------------------------------------------------
93
+ # YAML loader
94
+ # ---------------------------------------------------------------------------
95
+
96
+
97
+ def load_tier(difficulty: TaskDifficulty, tasks_dir: Path = TASKS_DIR) -> list[Task]:
98
+ """Load tasks for a single difficulty tier from its YAML file."""
99
+ filename = _TIER_FILES.get(difficulty)
100
+ if filename is None:
101
+ logger.warning("No file mapping for difficulty: %s", difficulty.value)
102
+ return []
103
+
104
+ filepath = tasks_dir / filename
105
+ if not filepath.exists():
106
+ logger.warning("Task file not found: %s", filepath)
107
+ return []
108
+
109
+ with open(filepath) as f:
110
+ entries = yaml.safe_load(f) or []
111
+
112
+ tasks = [
113
+ Task(
114
+ task_id=TaskID(entry["task_id"]),
115
+ difficulty=difficulty,
116
+ description=entry["description"],
117
+ success_criteria=SuccessCriteria(**entry.get("success_criteria", {})),
118
+ setup_commands=[
119
+ SetupCommand(command=cmd) if isinstance(cmd, str) else SetupCommand(**cmd)
120
+ for cmd in entry.get("setup_commands", [])
121
+ ],
122
+ )
123
+ for entry in entries
124
+ ]
125
+ logger.info("Loaded %d %s tasks from %s", len(tasks), difficulty.value, filepath.name)
126
+ return tasks
127
+
128
+
129
+ # ---------------------------------------------------------------------------
130
+ # Helpers
131
+ # ---------------------------------------------------------------------------
132
+
133
+
134
+ def _weighted_success_rate(results: list[bool], decay: float = _DECAY_FACTOR) -> float:
135
+ """Compute success rate with exponential decay — recent results matter more."""
136
+ if not results:
137
+ return 0.0
138
+ weights = [decay ** i for i in range(len(results) - 1, -1, -1)]
139
+ total_weight = sum(weights)
140
+ return sum(w * float(r) for w, r in zip(weights, results)) / total_weight
141
+
142
+
143
+ # ---------------------------------------------------------------------------
144
+ # Curriculum
145
+ # ---------------------------------------------------------------------------
146
+
147
+
148
+ class Curriculum:
149
+ """Manages progressive task assignment with priority-queue-based selection.
150
+
151
+ Features:
152
+ - Priority queue task selection (novelty, weakness, spaced rep, recency)
153
+ - Per-task mastery tracking with graduation
154
+ - Spaced repetition for graduated tasks (prevents catastrophic forgetting)
155
+ - Fast-track tier promotion for strong agents
156
+ - Exponential decay on success history
157
+ - Rich observability via get_stats()
158
+ """
159
+
160
+ def __init__(
161
+ self,
162
+ tier_configs: dict[TaskDifficulty, TierConfig] | None = None,
163
+ tasks_dir: Path = TASKS_DIR,
164
+ ) -> None:
165
+ self._tier_configs = tier_configs or TIER_CONFIGS
166
+ self._tasks_dir = tasks_dir
167
+
168
+ # Ordered difficulty progression
169
+ self._levels = list(TaskDifficulty)
170
+
171
+ # Tier tracking
172
+ self._current_level_idx: int = 0
173
+ self._tier_episodes: int = 0
174
+ self._tier_results: list[bool] = [] # results within current tier
175
+
176
+ # Per-task tracking
177
+ self._task_history: dict[TaskID, list[bool]] = defaultdict(list)
178
+ self._task_attempt_count: dict[TaskID, int] = defaultdict(int)
179
+ self._last_attempted_episode: dict[TaskID, int] = {}
180
+ self._graduated_tasks: set[TaskID] = set()
181
+ self._spaced_rep: dict[TaskID, SpacedRepState] = {}
182
+
183
+ # Global counters
184
+ self._episode_count: int = 0
185
+ self._episode_rewards: list[float] = []
186
+
187
+ # Load starting tier
188
+ self._current_tasks: list[Task] = load_tier(
189
+ self.current_difficulty, self._tasks_dir
190
+ )
191
+ self._task_map: dict[TaskID, Task] = {t.task_id: t for t in self._current_tasks}
192
+
193
+ # Priority queue: list of (-score, random_tiebreaker, task_id)
194
+ self._priority_queue: list[tuple[float, float, TaskID]] = []
195
+ self._rebuild_priority_queue()
196
+
197
+ logger.info(
198
+ "Curriculum initialised — starting at %s with %d tasks",
199
+ self.current_difficulty.value,
200
+ len(self._current_tasks),
201
+ )
202
+
203
+ # -- Properties -----------------------------------------------------------
204
+
205
+ @property
206
+ def current_difficulty(self) -> TaskDifficulty:
207
+ return self._levels[self._current_level_idx]
208
+
209
+ @property
210
+ def tier_config(self) -> TierConfig:
211
+ return self._tier_configs[self.current_difficulty]
212
+
213
+ @property
214
+ def current_level_success_rate(self) -> float:
215
+ return _weighted_success_rate(self._tier_results)
216
+
217
+ @property
218
+ def is_warmup(self) -> bool:
219
+ return self.current_difficulty == TaskDifficulty.WARMUP
220
+
221
+ # -- Public API -----------------------------------------------------------
222
+
223
+ def next_task(self) -> Task:
224
+ """Select the highest-priority task from the current tier."""
225
+ if not self._current_tasks:
226
+ self._current_tasks = load_tier(
227
+ self.current_difficulty, self._tasks_dir
228
+ )
229
+ self._task_map = {t.task_id: t for t in self._current_tasks}
230
+ self._rebuild_priority_queue()
231
+
232
+ if not self._priority_queue:
233
+ self._rebuild_priority_queue()
234
+
235
+ # Pop highest priority (most negative = highest score)
236
+ _, _, task_id = heapq.heappop(self._priority_queue)
237
+ task = self._task_map[task_id]
238
+
239
+ # If queue is now empty, rebuild for next call
240
+ if not self._priority_queue:
241
+ self._rebuild_priority_queue()
242
+
243
+ return task
244
+
245
+ def record_result(
246
+ self, task: Task, achieved: bool, reward: float = 0.0
247
+ ) -> None:
248
+ """Record episode outcome, update mastery, check promotion."""
249
+ self._episode_count += 1
250
+ self._tier_episodes += 1
251
+ self._episode_rewards.append(reward)
252
+
253
+ # Per-tier results
254
+ self._tier_results.append(achieved)
255
+
256
+ # Per-task results
257
+ self._task_history[task.task_id].append(achieved)
258
+ self._task_attempt_count[task.task_id] += 1
259
+ self._last_attempted_episode[task.task_id] = self._episode_count
260
+
261
+ # Check mastery
262
+ self._check_mastery(task.task_id)
263
+
264
+ # Check tier promotion
265
+ self._maybe_promote()
266
+
267
+ # Rebuild priority queue with updated scores
268
+ self._rebuild_priority_queue()
269
+
270
+ logger.info(
271
+ "Episode %d: task=%d difficulty=%s achieved=%s tier_rate=%.2f",
272
+ self._episode_count,
273
+ task.task_id,
274
+ task.difficulty.value,
275
+ achieved,
276
+ self.current_level_success_rate,
277
+ )
278
+
279
+ def reset(self) -> None:
280
+ """Reset curriculum back to warmup (full training restart)."""
281
+ self._current_level_idx = 0
282
+ self._tier_episodes = 0
283
+ self._tier_results.clear()
284
+ self._task_history.clear()
285
+ self._task_attempt_count.clear()
286
+ self._last_attempted_episode.clear()
287
+ self._graduated_tasks.clear()
288
+ self._spaced_rep.clear()
289
+ self._episode_count = 0
290
+ self._episode_rewards.clear()
291
+ self._current_tasks = load_tier(self.current_difficulty, self._tasks_dir)
292
+ self._task_map = {t.task_id: t for t in self._current_tasks}
293
+ self._rebuild_priority_queue()
294
+ logger.info("Curriculum reset to %s", self.current_difficulty.value)
295
+
296
+ # -- Observability --------------------------------------------------------
297
+
298
+ def get_skill_profile(self) -> dict[TaskID, float]:
299
+ """Weighted success rate per task over recent history."""
300
+ config = self.tier_config
301
+ return {
302
+ task_id: round(
303
+ _weighted_success_rate(results[-config.mastery_window:]), 2
304
+ )
305
+ for task_id, results in self._task_history.items()
306
+ if results
307
+ }
308
+
309
+ def get_weak_spots(self) -> list[TaskID]:
310
+ """Tasks in the current tier below mastery threshold."""
311
+ config = self.tier_config
312
+ profile = self.get_skill_profile()
313
+ return [
314
+ task_id
315
+ for task_id in self._task_map
316
+ if profile.get(task_id, 0.0) < config.mastery_threshold
317
+ and task_id not in self._graduated_tasks
318
+ ]
319
+
320
+ def get_stats(self) -> dict:
321
+ """Full curriculum state for logging/debugging."""
322
+ return {
323
+ "episode_count": self._episode_count,
324
+ "tier": self.current_difficulty.value,
325
+ "tier_episodes": self._tier_episodes,
326
+ "tier_success_rate": round(self.current_level_success_rate, 3),
327
+ "graduated_tasks": sorted(self._graduated_tasks),
328
+ "weak_spots": self.get_weak_spots(),
329
+ "skill_profile": self.get_skill_profile(),
330
+ "spaced_rep_due": [
331
+ int(tid)
332
+ for tid in self._task_map
333
+ if self._is_spaced_rep_due(tid)
334
+ ],
335
+ "avg_reward_last_10": round(
336
+ sum(self._episode_rewards[-10:])
337
+ / max(1, len(self._episode_rewards[-10:])),
338
+ 3,
339
+ ),
340
+ }
341
+
342
+ # -- Priority queue -------------------------------------------------------
343
+
344
+ def _compute_priority(self, task_id: TaskID) -> float:
345
+ """Compute composite priority score for a task. Higher = selected sooner."""
346
+ config = self.tier_config
347
+ score = 0.0
348
+
349
+ attempts = self._task_attempt_count.get(task_id, 0)
350
+
351
+ # Novelty: never attempted → explore first
352
+ if attempts == 0:
353
+ score += _NOVELTY_BONUS
354
+ return score # no other signals available yet
355
+
356
+ # Weakness: worse tasks get higher priority
357
+ results = self._task_history.get(task_id, [])
358
+ task_rate = _weighted_success_rate(results[-config.mastery_window:])
359
+ score += _WEAKNESS_WEIGHT * (1.0 - task_rate)
360
+
361
+ # Spaced repetition: graduated task due for re-test
362
+ if task_id in self._graduated_tasks and self._is_spaced_rep_due(task_id):
363
+ score += _SPACED_REP_BONUS
364
+
365
+ # Recency penalty: attempted in last 2 episodes
366
+ last_ep = self._last_attempted_episode.get(task_id, -100)
367
+ if self._episode_count - last_ep <= 2:
368
+ score -= _RECENCY_PENALTY
369
+
370
+ return score
371
+
372
+ def _rebuild_priority_queue(self) -> None:
373
+ """Recompute priorities for all current-tier tasks and rebuild the heap."""
374
+ self._priority_queue.clear()
375
+ for task in self._current_tasks:
376
+ score = self._compute_priority(task.task_id)
377
+ # heapq is a min-heap, so negate score for max-priority-first
378
+ # random tiebreaker prevents deterministic ordering among equal scores
379
+ heapq.heappush(
380
+ self._priority_queue,
381
+ (-score, random.random(), task.task_id),
382
+ )
383
+
384
+ # -- Mastery & spaced repetition ------------------------------------------
385
+
386
+ def _check_mastery(self, task_id: TaskID) -> None:
387
+ """Check if a task should be graduated or un-graduated."""
388
+ config = self.tier_config
389
+ results = self._task_history.get(task_id, [])
390
+ recent = results[-config.mastery_window:]
391
+
392
+ if len(recent) < _MIN_ATTEMPTS_FOR_MASTERY:
393
+ return
394
+
395
+ rate = _weighted_success_rate(recent)
396
+
397
+ if rate >= config.mastery_threshold:
398
+ if task_id not in self._graduated_tasks:
399
+ self._graduated_tasks.add(task_id)
400
+ self._spaced_rep[task_id] = SpacedRepState(
401
+ interval=3,
402
+ last_graduated_episode=self._episode_count,
403
+ )
404
+ logger.info(
405
+ "Task %d GRADUATED (rate=%.2f) — scheduling spaced repetition",
406
+ task_id,
407
+ rate,
408
+ )
409
+ else:
410
+ # Un-graduate if performance dropped
411
+ if task_id in self._graduated_tasks:
412
+ self._graduated_tasks.discard(task_id)
413
+ self._spaced_rep.pop(task_id, None)
414
+ logger.info(
415
+ "Task %d UN-GRADUATED (rate=%.2f) — resetting to active",
416
+ task_id,
417
+ rate,
418
+ )
419
+
420
+ def _is_spaced_rep_due(self, task_id: TaskID) -> bool:
421
+ """Check if a graduated task is due for a re-test."""
422
+ state = self._spaced_rep.get(task_id)
423
+ if state is None:
424
+ return False
425
+ episodes_since = self._episode_count - state.last_graduated_episode
426
+ return episodes_since >= state.interval
427
+
428
+ def _advance_spaced_rep(self, task_id: TaskID) -> None:
429
+ """Double the interval after a successful re-test."""
430
+ state = self._spaced_rep.get(task_id)
431
+ if state is not None:
432
+ state.interval = min(state.interval * 2, 48) # cap at 48 episodes
433
+ state.last_graduated_episode = self._episode_count
434
+
435
+ # -- Tier promotion -------------------------------------------------------
436
+
437
    def _maybe_promote(self) -> None:
        """Advance to the next difficulty tier if the agent is ready.

        Two paths lead to promotion, and both additionally require the tier
        success rate to reach ``config.advance_rate``:

        * normal: at least ``config.min_episodes`` episodes spent in the tier;
        * fast-track: at least ``_FAST_TRACK_MIN_EPISODES`` episodes with a
          success rate of ``config.fast_track_rate`` or better.

        On promotion the per-tier counters are reset and the next tier's
        tasks are loaded and queued.
        """
        if self._current_level_idx >= len(self._levels) - 1:
            return  # already at max tier

        config = self.tier_config
        rate = self.current_level_success_rate

        # Fast-track: high success rate after minimum 3 episodes
        fast_track = (
            self._tier_episodes >= _FAST_TRACK_MIN_EPISODES
            and rate >= config.fast_track_rate
        )

        # Normal path requires the full min_episodes dwell time; fast-track
        # bypasses that gate but not the advance_rate gate below.
        if not fast_track and self._tier_episodes < config.min_episodes:
            return

        if rate < config.advance_rate:
            return

        # Capture outgoing tier/rate for logging before mutating state.
        prev_tier = self.current_difficulty.value
        prev_rate = rate
        self._current_level_idx += 1
        self._tier_episodes = 0
        self._tier_results.clear()
        # Load the new tier's task set and rebuild the selection queue.
        self._current_tasks = load_tier(self.current_difficulty, self._tasks_dir)
        self._task_map = {t.task_id: t for t in self._current_tasks}
        self._rebuild_priority_queue()
        logger.info(
            "PROMOTED from %s to %s (rate=%.2f%s)",
            prev_tier,
            self.current_difficulty.value,
            prev_rate,
            ", FAST-TRACK" if fast_track else "",
        )
server/services/environment_designer.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Environment designer — provisions initial AWS state for each task.
2
+
3
+ Currently supports raw AWS CLI setup commands. Designed to be extended
4
+ with CloudFormation YAML template support so that each difficulty level
5
+ can declaratively define its starting infrastructure.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import logging
11
+ from enum import Enum
12
+
13
+ from pydantic import BaseModel, Field
14
+
15
+ from models import SetupCommand, Task
16
+ from server.services.aws_backend import AwsBackend
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
class ProvisionMethod(str, Enum):
    """How the initial environment state is provisioned."""

    # Raw AWS CLI setup commands executed one by one.
    CLI_COMMANDS = "cli_commands"
    # Declarative CloudFormation template (extension point — see module
    # docstring; not dispatched by EnvironmentDesigner.apply yet).
    CLOUDFORMATION = "cloudformation"
26
+
27
+
28
class ProvisionResult(BaseModel):
    """Outcome of provisioning the environment for a task."""

    # True when no non-ignored setup command failed.
    success: bool = True
    # Which provisioning strategy produced this result.
    method: ProvisionMethod = ProvisionMethod.CLI_COMMANDS
    # Number of setup commands that succeeded (proxy for resources created).
    resources_created: int = 0
    # Error messages for failed, non-ignored setup commands.
    errors: list[str] = Field(default_factory=list)
35
+
36
+
37
class EnvironmentDesigner:
    """Provisions the initial AWS state required by a task before the agent acts.

    Usage::

        designer = EnvironmentDesigner(backend)
        result = designer.apply(task)
        if not result.success:
            logger.error("Failed to set up environment: %s", result.errors)
    """

    def __init__(self, backend: AwsBackend) -> None:
        self._backend = backend

    def apply(self, task: Task) -> ProvisionResult:
        """Apply the task's environment setup to MiniStack.

        Dispatches to the appropriate provisioning method based on what the
        task defines. Currently supports ``setup_commands``; CloudFormation
        support can be added by extending this method.

        Returns:
            A ``ProvisionResult`` summarising what happened.
        """
        setup = task.setup_commands
        if not setup:
            # Nothing to provision — trivially successful.
            return ProvisionResult(resources_created=0)
        return self._apply_cli_commands(setup)

    # -- Provisioning strategies ----------------------------------------------

    def _apply_cli_commands(
        self, commands: list[SetupCommand]
    ) -> ProvisionResult:
        """Execute a list of setup commands against MiniStack."""
        created = 0
        failures: list[str] = []

        for item in commands:
            ok, _stdout, stderr = self._backend.execute_command(item.command)
            if ok:
                created += 1
                continue
            msg = f"Setup command failed: {item.command} — {stderr}"
            if item.ignore_failure:
                # Best-effort command — log and move on without recording
                # an error.
                logger.info("Ignoring failed setup command: %s", msg)
            else:
                logger.warning(msg)
                failures.append(msg)

        return ProvisionResult(
            success=not failures,
            method=ProvisionMethod.CLI_COMMANDS,
            resources_created=created,
            errors=failures,
        )
server/services/episode_tracker.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Per-episode command history tracker for multi-step task evaluation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ import re
7
+
8
+ from pydantic import BaseModel, Field
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
# AWS CLI flags whose argument names a specific resource; used by
# _command_mentions_resource to detect references to a resource by name.
_RESOURCE_FLAGS: list[str] = [
    "--bucket",
    "--table-name",
    "--function-name",
    "--queue-name",
    "--topic-name",
    "--role-name",
    "--rest-api-id",
    "--name",
    "--resource",
]
24
+
25
+
26
class StepRecord(BaseModel):
    """A single command executed within an episode."""

    # The raw AWS CLI command string as issued.
    command: str
    # Whether the command executed successfully.
    success: bool
    stdout: str = ""
    stderr: str = ""
    # 0-based position of this command within the episode.
    step_number: int = Field(ge=0)
34
+
35
+
36
+ def _parse_aws_command(command: str) -> tuple[str | None, str | None]:
37
+ """Extract (service, operation) from an AWS CLI command.
38
+
39
+ Example: 'aws s3api create-bucket --bucket foo' -> ('s3api', 'create-bucket')
40
+ """
41
+ parts = command.strip().split()
42
+ if len(parts) < 3 or parts[0] != "aws":
43
+ return None, None
44
+ return parts[1], parts[2]
45
+
46
+
47
def _command_mentions_resource(command: str, resource: str) -> bool:
    """Check if the command references a specific resource name."""
    tokens = command.strip().split()

    # Flag-then-value form: `--bucket foo`.
    for idx, token in enumerate(tokens[:-1]):
        if token in _RESOURCE_FLAGS and tokens[idx + 1] == resource:
            return True

    # key=value form: `--table-name=orders`.
    for token in tokens:
        for flag in _RESOURCE_FLAGS:
            prefix = f"{flag}="
            if token.startswith(prefix) and token[len(prefix):] == resource:
                return True

    # Fallback: resource name appears anywhere as a whole word
    # (covers ARN-like patterns and bare positional arguments).
    return re.search(rf"\b{re.escape(resource)}\b", command) is not None
64
+
65
+
66
class EpisodeTracker:
    """Tracks command history within a single episode for grading."""

    def __init__(self) -> None:
        self._history: list[StepRecord] = []
        self._step_counter: int = 0
        self._previous_progress: float = 0.0
        # (operation, resource) pairs that have already been given credit.
        self._credited_operations: set[tuple[str, str | None]] = set()

    def reset(self) -> None:
        """Clear all per-episode state ahead of a new episode."""
        self._history.clear()
        self._step_counter = 0
        self._previous_progress = 0.0
        self._credited_operations.clear()

    def record_step(
        self, command: str, success: bool, stdout: str, stderr: str
    ) -> StepRecord:
        """Append one command execution to the history and return its record."""
        record = StepRecord(
            command=command,
            success=success,
            stdout=stdout,
            stderr=stderr,
            step_number=self._step_counter,
        )
        self._history.append(record)
        self._step_counter += 1
        return record

    def has_executed_operation(
        self, operation: str, resource: str | None = None
    ) -> bool:
        """Check if a successful command matching (operation, resource) exists in history."""
        for rec in self._history:
            if not rec.success:
                continue
            if _parse_aws_command(rec.command)[1] != operation:
                continue
            # A None resource means "any resource" for this operation.
            if resource is None or _command_mentions_resource(rec.command, resource):
                return True
        return False

    def has_used_service(self, service: str) -> bool:
        """Check if any successful command targeted the given AWS service.

        Substring match, so e.g. 's3' also matches 's3api'.
        """
        for rec in self._history:
            if not rec.success:
                continue
            svc = _parse_aws_command(rec.command)[0]
            if svc is not None and service in svc:
                return True
        return False

    def is_operation_already_credited(
        self, operation: str, resource: str | None
    ) -> bool:
        """True when this (operation, resource) pair was already credited."""
        return (operation, resource) in self._credited_operations

    def credit_operation(self, operation: str, resource: str | None) -> None:
        """Mark an (operation, resource) pair as credited."""
        self._credited_operations.add((operation, resource))

    @property
    def command_history(self) -> list[StepRecord]:
        """Defensive copy of the step records recorded so far."""
        return list(self._history)

    @property
    def step_count(self) -> int:
        """Number of commands recorded this episode."""
        return self._step_counter

    @property
    def previous_progress(self) -> float:
        """Last stored partial-progress value (updated externally via the setter)."""
        return self._previous_progress

    @previous_progress.setter
    def previous_progress(self, value: float) -> None:
        self._previous_progress = value
server/services/resource_verifier.py ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Resource verification service — queries MiniStack for ground-truth state."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import logging
7
+ from typing import Any
8
+
9
+ from server.services.aws_backend import AwsBackend
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
+ def _extract_json_path(data: Any, path: str) -> Any:
15
+ """Simple JSON path extractor supporting dot notation and array indexing.
16
+
17
+ Supports paths like: $.Table.ProvisionedThroughput.ReadCapacityUnits
18
+ $.Rules[0].Expiration.Days
19
+ $.Buckets[].Name
20
+ """
21
+ parts = path.lstrip("$").lstrip(".").split(".")
22
+ current = data
23
+ for part in parts:
24
+ if current is None:
25
+ return None
26
+ # Handle array index like Rules[0]
27
+ if "[" in part:
28
+ key, idx_str = part.split("[", 1)
29
+ idx_str = idx_str.rstrip("]")
30
+ if key:
31
+ current = current.get(key) if isinstance(current, dict) else None
32
+ if current is None:
33
+ return None
34
+ if idx_str == "":
35
+ # Wildcard — return list of values
36
+ if isinstance(current, list):
37
+ remaining = ".".join(parts[parts.index(part) + 1 :])
38
+ if remaining:
39
+ return [
40
+ _extract_json_path(item, f"$.{remaining}")
41
+ for item in current
42
+ ]
43
+ return current
44
+ return None
45
+ try:
46
+ current = current[int(idx_str)]
47
+ except (IndexError, TypeError):
48
+ return None
49
+ else:
50
+ current = current.get(part) if isinstance(current, dict) else None
51
+ return current
52
+
53
+
54
class ResourceVerifier:
    """Verifies resource state by querying MiniStack via AWS CLI."""

    def __init__(self, backend: AwsBackend) -> None:
        self._backend = backend

    def resource_exists(self, service: str, name: str) -> bool:
        """Check if a specific resource exists in MiniStack.

        Dispatches to a service-specific check that verifies the exact
        resource name (not just any resource of that type). Unknown
        services log a warning and report False.
        """
        handlers = {
            "s3": self._check_s3_bucket,
            "dynamodb": self._check_dynamodb_table,
            "lambda": self._check_lambda_function,
            "sqs": self._check_sqs_queue,
            "sns": self._check_sns_topic,
            "iam": self._check_iam_role,
            "apigateway": self._check_apigateway,
        }
        handler = handlers.get(service.lower())
        if handler is None:
            logger.warning("No verifier for service: %s", service)
            return False
        return handler(name)

    def check_state(self, state_check: dict[str, Any]) -> bool:
        """Run an arbitrary command and assert on its output.

        Supported assertions:
          - ``output_contains``: substring check on stdout
          - ``json_path`` + ``expected``: extract a value from JSON stdout
            and compare (stringified) with the expected value
        """
        command = state_check.get("command", "")
        if not command:
            return False

        ok, stdout, _ = self._backend.execute_command(command)
        if not ok:
            return False

        if "output_contains" in state_check:
            if state_check["output_contains"] not in stdout:
                return False

        if "json_path" in state_check and "expected" in state_check:
            try:
                payload = json.loads(stdout)
                actual = _extract_json_path(payload, state_check["json_path"])
                # Stringified comparison tolerates int/str mismatches.
                if str(actual) != str(state_check["expected"]):
                    return False
            except (json.JSONDecodeError, KeyError, TypeError):
                return False

        return True

    # -- Service-specific verifiers -------------------------------------------

    def _command_succeeds(self, command: str) -> bool:
        """Run a command and report only whether it exited successfully."""
        ok, _, _ = self._backend.execute_command(command)
        return ok

    def _check_s3_bucket(self, name: str) -> bool:
        ok, stdout, _ = self._backend.execute_command(
            "aws s3api list-buckets --output json"
        )
        if not ok:
            return False
        try:
            entries = json.loads(stdout).get("Buckets", [])
            return any(entry.get("Name") == name for entry in entries)
        except (json.JSONDecodeError, TypeError):
            return False

    def _check_dynamodb_table(self, name: str) -> bool:
        # describe-table exits non-zero when the table does not exist.
        return self._command_succeeds(
            f"aws dynamodb describe-table --table-name {name}"
        )

    def _check_lambda_function(self, name: str) -> bool:
        return self._command_succeeds(
            f"aws lambda get-function --function-name {name}"
        )

    def _check_sqs_queue(self, name: str) -> bool:
        return self._command_succeeds(
            f"aws sqs get-queue-url --queue-name {name}"
        )

    def _check_sns_topic(self, name: str) -> bool:
        ok, stdout, _ = self._backend.execute_command(
            "aws sns list-topics --output json"
        )
        if not ok:
            return False
        try:
            # Topic names are matched as substrings of the ARN.
            topics = json.loads(stdout).get("Topics", [])
            return any(name in topic.get("TopicArn", "") for topic in topics)
        except (json.JSONDecodeError, TypeError):
            return False

    def _check_iam_role(self, name: str) -> bool:
        return self._command_succeeds(
            f"aws iam get-role --role-name {name}"
        )

    def _check_apigateway(self, name: str) -> bool:
        ok, stdout, _ = self._backend.execute_command(
            "aws apigateway get-rest-apis --output json"
        )
        if not ok:
            return False
        try:
            apis = json.loads(stdout).get("items", [])
            return any(api.get("name") == name for api in apis)
        except (json.JSONDecodeError, TypeError):
            return False
server/services/task_grader.py ADDED
@@ -0,0 +1,250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Task grading engine — evaluates task completion and computes shaped rewards.
2
+
3
+ All rewards are in the [0.0, 1.0] range. Only full task completion yields 1.0.
4
+ Includes anti-reward-hacking defenses.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import logging
10
+
11
+ from pydantic import BaseModel, Field
12
+
13
+ from models import SuccessCriteria, Task
14
+ from server.services.aws_backend import AwsBackend
15
+ from server.services.episode_tracker import EpisodeTracker, StepRecord
16
+ from server.services.resource_verifier import ResourceVerifier
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
class GradeResult(BaseModel):
    """Outcome of grading a single step."""

    # True only when the task's success criteria are fully satisfied.
    task_achieved: bool = False
    # Fraction of the task completed so far, in [0, 1].
    partial_progress: float = Field(default=0.0, ge=0.0, le=1.0)
    # Shaped reward for this step, in [0, 1]; reaches 1.0 only on full
    # completion (see TaskGrader._compute_reward).
    reward: float = Field(default=0.0, ge=0.0, le=1.0)
    # Human-readable explanation of how the grade was derived.
    reason: str = ""
28
+
29
+
30
class TaskGrader:
    """Evaluates task completion and computes shaped rewards.

    Dispatches to different grading strategies based on which fields
    are populated on the task's ``SuccessCriteria``.
    """

    def __init__(self, backend: AwsBackend) -> None:
        # All ground-truth checks go through the verifier, which queries
        # MiniStack via the AWS backend.
        self._verifier = ResourceVerifier(backend)

    def grade(
        self,
        task: Task,
        tracker: EpisodeTracker,
        latest_step: StepRecord,
    ) -> GradeResult:
        """Grade the episode after the latest step.

        Args:
            task: The task being attempted (provides ``success_criteria``).
            tracker: Per-episode command history for multi-step grading.
            latest_step: The command just executed.

        Returns:
            A ``GradeResult`` with achievement flag, partial progress,
            shaped reward, and a human-readable reason.
        """
        criteria = task.success_criteria

        # Dispatch based on populated criteria fields
        # (priority: state_checks > steps > resource_exists > command_contains).
        if criteria.state_checks:
            result = self._grade_state_checks(criteria, tracker)
        elif criteria.steps:
            result = self._grade_multi_step(criteria, tracker)
        elif criteria.resource_exists is not None:
            result = self._grade_resource_creation(criteria, latest_step)
        elif criteria.command_contains is not None:
            result = self._grade_command_match(criteria, latest_step)
        else:
            result = GradeResult(reason="no recognised success_criteria fields")

        # Compute shaped reward. NOTE: must run BEFORE the previous_progress
        # update below — the reward's delta bonus compares new progress
        # against the value carried over from the previous step.
        result.reward = self._compute_reward(result, latest_step, tracker)

        # Update tracker's previous progress (monotonic — never decrease)
        if result.partial_progress > tracker.previous_progress:
            tracker.previous_progress = result.partial_progress

        return result

    # -- Grading strategies ---------------------------------------------------

    def _grade_command_match(
        self, criteria: SuccessCriteria, latest_step: StepRecord
    ) -> GradeResult:
        """Warmup: check the latest command matches expected service + operation."""
        cmd = latest_step.command.lower()
        contains = (criteria.command_contains or "").lower()
        operation = (criteria.operation or "").lower()

        # Empty criteria strings never match (avoids trivially-true "" in cmd).
        contains_ok = contains != "" and contains in cmd
        operation_ok = operation != "" and operation in cmd
        succeeded = latest_step.success
        achieved = contains_ok and operation_ok and succeeded

        return GradeResult(
            task_achieved=achieved,
            partial_progress=1.0 if achieved else 0.0,
            reason=(
                f"command_match: contains={contains_ok}, "
                f"op={operation_ok}, success={succeeded}"
            ),
        )

    def _grade_resource_creation(
        self,
        criteria: SuccessCriteria,
        latest_step: StepRecord,
    ) -> GradeResult:
        """Beginner: verify the resource actually exists in MiniStack."""
        re_spec = criteria.resource_exists
        assert re_spec is not None  # guaranteed by grade()'s dispatch order
        service = re_spec.service
        name = re_spec.name

        # Ground truth: query MiniStack for the named resource.
        exists = self._verifier.resource_exists(service, name)

        # Command matching gives partial credit (0.5)
        contains = (criteria.command_contains or "").lower()
        operation = (criteria.operation or "").lower()
        cmd = latest_step.command.lower()
        cmd_ok = contains in cmd and operation in cmd and latest_step.success

        if exists:
            progress = 1.0
        elif cmd_ok:
            progress = 0.5
        else:
            progress = 0.0

        return GradeResult(
            task_achieved=exists,
            partial_progress=progress,
            reason=(
                f"resource_creation: exists={exists}, "
                f"cmd_ok={cmd_ok}, service={service}, name={name}"
            ),
        )

    def _grade_multi_step(
        self, criteria: SuccessCriteria, tracker: EpisodeTracker
    ) -> GradeResult:
        """Intermediate/Advanced: check ordered step completion."""
        steps = criteria.steps
        if not steps:
            return GradeResult(reason="empty steps list")

        # Count completed steps in declared order; stop at the first gap so
        # skipping ahead earns no credit.
        completed = 0
        for step in steps:
            if tracker.has_executed_operation(step.operation, step.resource):
                completed += 1
            else:
                break  # ordered — stop at first incomplete step

        total = len(steps)
        progress = completed / total if total > 0 else 0.0

        # For advanced tasks with services requirement, also check services
        # (all() is vacuously True when no services are required).
        services_required = criteria.services
        services_met = all(
            tracker.has_used_service(svc) for svc in services_required
        )

        achieved = completed == total and (not services_required or services_met)

        return GradeResult(
            task_achieved=achieved,
            partial_progress=progress,
            reason=(
                f"multi_step: {completed}/{total} steps, "
                f"services_met={services_met if services_required else 'n/a'}"
            ),
        )

    def _grade_state_checks(
        self, criteria: SuccessCriteria, tracker: EpisodeTracker
    ) -> GradeResult:
        """Expert/SRE: verify end-state via arbitrary commands.

        state_checks are the source of truth for task completion.
        steps (if present) provide partial progress signals only.
        """
        state_checks = criteria.state_checks
        steps = criteria.steps

        # Evaluate state checks (ground truth)
        checks_passed = 0
        for check in state_checks:
            # exclude_none keeps the dict to only the populated fields,
            # matching what ResourceVerifier.check_state looks for.
            check_dict = check.model_dump(exclude_none=True)
            if self._verifier.check_state(check_dict):
                checks_passed += 1

        total_checks = len(state_checks)
        all_checks_pass = checks_passed == total_checks and total_checks > 0

        # Evaluate steps for partial progress signal (ordered, as in
        # _grade_multi_step: stop at the first incomplete step).
        steps_completed = 0
        for step in steps:
            if tracker.has_executed_operation(step.operation, step.resource):
                steps_completed += 1
            else:
                break

        # Progress combines steps (for dense signal) and state checks
        total_steps = len(steps)
        if total_steps > 0:
            step_progress = steps_completed / total_steps
        else:
            step_progress = 0.0

        # Weight: steps give up to 0.7, state checks give the remaining 0.3
        if total_checks > 0:
            check_progress = checks_passed / total_checks
            progress = step_progress * 0.7 + check_progress * 0.3
        else:
            progress = step_progress

        # Check services requirement
        services_required = criteria.services
        services_met = all(
            tracker.has_used_service(svc) for svc in services_required
        )

        # Task achieved only when ALL state checks pass
        achieved = all_checks_pass and (not services_required or services_met)

        return GradeResult(
            task_achieved=achieved,
            partial_progress=min(progress, 1.0),
            reason=(
                f"state_checks: {checks_passed}/{total_checks} passed, "
                f"steps: {steps_completed}/{total_steps}, "
                f"services_met={services_met if services_required else 'n/a'}"
            ),
        )

    # -- Reward shaping -------------------------------------------------------

    def _compute_reward(
        self,
        result: GradeResult,
        latest_step: StepRecord,
        tracker: EpisodeTracker,
    ) -> float:
        """Compute a shaped reward in [0.0, 1.0].

        Full completion is the only way to earn 1.0; all partial rewards
        are capped at 0.99 (anti-reward-hacking).
        """
        if result.task_achieved:
            return 1.0

        # Base: partial progress scaled to 0.0–0.8 range
        progress_reward = result.partial_progress * 0.8

        # Bonus for advancing progress (dense signal); compares against the
        # previous step's progress, which grade() has not yet updated here.
        progress_delta = result.partial_progress - tracker.previous_progress
        if progress_delta > 0:
            progress_reward += 0.1

        # Penalty for failed commands
        if not latest_step.success:
            progress_reward *= 0.5

        # Clamp to [0.0, 0.99] — never reach 1.0 without achieving
        return min(max(progress_reward, 0.0), 0.99)
server/services/tasks/advanced.yaml ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ - task_id: 15
2
+ description: >
3
+ Create a Lambda function 'processor' with an IAM execution role,
4
+ then create an SQS queue 'work-items' and configure it as an
5
+ event source for the Lambda function.
6
+ success_criteria:
7
+ services:
8
+ - iam
9
+ - lambda
10
+ - sqs
11
+ steps:
12
+ - operation: create-role
13
+ - operation: create-function
14
+ resource: processor
15
+ - operation: create-queue
16
+ resource: work-items
17
+ - operation: create-event-source-mapping
18
+
19
+ - task_id: 16
20
+ description: >
21
+ Deploy a serverless API: create a DynamoDB table 'products',
22
+ create an IAM role for Lambda, create a Lambda function 'product-api',
23
+ and set up an API Gateway REST API with a GET method on /products
24
+ integrated with the Lambda.
25
+ success_criteria:
26
+ services:
27
+ - dynamodb
28
+ - iam
29
+ - lambda
30
+ - apigateway
31
+ steps:
32
+ - operation: create-table
33
+ resource: products
34
+ - operation: create-role
35
+ - operation: create-function
36
+ resource: product-api
37
+ - operation: create-rest-api
38
+ - operation: create-resource
39
+ - operation: put-method
40
+ - operation: put-integration
41
+
42
+ - task_id: 17
43
+ description: >
44
+ Build a fan-out notification system: create an SNS topic 'order-events',
45
+ create two SQS queues 'shipping-queue' and 'billing-queue',
46
+ subscribe both queues to the SNS topic, then publish a test message.
47
+ success_criteria:
48
+ services:
49
+ - sns
50
+ - sqs
51
+ steps:
52
+ - operation: create-topic
53
+ resource: order-events
54
+ - operation: create-queue
55
+ resource: shipping-queue
56
+ - operation: create-queue
57
+ resource: billing-queue
58
+ - operation: subscribe
59
+ - operation: subscribe
60
+ - operation: publish
server/services/tasks/beginner.yaml ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ - task_id: 6
2
+ description: Create an S3 bucket named 'my-test-bucket'.
3
+ success_criteria:
4
+ command_contains: s3api
5
+ operation: create-bucket
6
+ resource_exists:
7
+ service: s3
8
+ name: my-test-bucket
9
+
10
+ - task_id: 7
11
+ description: Create a DynamoDB table named 'users' with a partition key 'user_id' (String type).
12
+ success_criteria:
13
+ command_contains: dynamodb
14
+ operation: create-table
15
+ resource_exists:
16
+ service: dynamodb
17
+ name: users
18
+
19
+ - task_id: 8
20
+ description: Create an SQS queue named 'task-queue'.
21
+ success_criteria:
22
+ command_contains: sqs
23
+ operation: create-queue
24
+ resource_exists:
25
+ service: sqs
26
+ name: task-queue
27
+
28
+ - task_id: 9
29
+ description: Create an SNS topic named 'notifications'.
30
+ success_criteria:
31
+ command_contains: sns
32
+ operation: create-topic
33
+ resource_exists:
34
+ service: sns
35
+ name: notifications
36
+
37
+ - task_id: 10
38
+ description: Create a Lambda function named 'hello-world' using the python3.12 runtime.
39
+ success_criteria:
40
+ command_contains: lambda
41
+ operation: create-function
42
+ resource_exists:
43
+ service: lambda
44
+ name: hello-world
server/services/tasks/expert.yaml ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ - task_id: 18
2
+ description: >
3
+ SRE Incident: A Lambda function 'order-processor' exists but its IAM role
4
+ is missing the required SQS permissions. The function's event source mapping
5
+ to the 'incoming-orders' SQS queue is failing. Diagnose the issue, attach
6
+ the correct SQS policy to the role, and create the event source mapping.
7
+ setup_commands:
8
+ - >-
9
+ aws iam create-role --role-name broken-lambda-role
10
+ --assume-role-policy-document '{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Principal":{"Service":"lambda.amazonaws.com"},"Action":"sts:AssumeRole"}]}'
11
+ - >-
12
+ aws iam attach-role-policy --role-name broken-lambda-role
13
+ --policy-arn arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole
14
+ - >-
15
+ aws lambda create-function --function-name order-processor
16
+ --runtime python3.12 --handler index.handler
17
+ --role arn:aws:iam::000000000000:role/broken-lambda-role
18
+ --zip-file fileb:///tmp/dummy.zip
19
+ - aws sqs create-queue --queue-name incoming-orders
20
+ success_criteria:
21
+ services:
22
+ - iam
23
+ - lambda
24
+ - sqs
25
+ state_checks:
26
+ - command: aws iam list-attached-role-policies --role-name broken-lambda-role
27
+ output_contains: "SQS"
28
+ - command: aws lambda list-event-source-mappings --function-name order-processor
29
+ output_contains: "incoming-orders"
30
+ steps:
31
+ - operation: attach-role-policy
32
+ resource: broken-lambda-role
33
+ - operation: create-event-source-mapping
34
+
35
+ - task_id: 19
36
+ description: >
37
+ SRE Incident: An S3 bucket 'app-config-store' was created to host
38
+ configuration files, but versioning was never enabled. A recent
39
+ accidental overwrite lost critical config. Enable versioning on the
40
+ bucket and add a lifecycle rule named 'cleanup-old-versions' that
41
+ expires non-current object versions after 30 days.
42
+ setup_commands:
43
+ - aws s3api create-bucket --bucket app-config-store
44
+ - >-
45
+ aws s3api put-object --bucket app-config-store
46
+ --key config/app.json --body /dev/null
47
+ success_criteria:
48
+ services:
49
+ - s3
50
+ state_checks:
51
+ - command: aws s3api get-bucket-versioning --bucket app-config-store
52
+ output_contains: "Enabled"
53
+ - command: aws s3api get-bucket-lifecycle-configuration --bucket app-config-store
54
+ output_contains: "cleanup-old-versions"
55
+ steps:
56
+ - operation: put-bucket-versioning
57
+ resource: app-config-store
58
+ - operation: put-bucket-lifecycle-configuration
59
+ resource: app-config-store
60
+
61
+ - task_id: 20
62
+ description: >
63
+ SRE Incident: A DynamoDB table 'session-store' is experiencing throttling
64
+ because it was provisioned with only 1 RCU and 1 WCU. An SNS topic
65
+ 'ops-alerts' exists but has no subscriptions, so no one is being notified.
66
+ Fix the table by updating its throughput to 50 RCU and 50 WCU, then create
67
+ an SQS queue 'ops-alert-inbox' and subscribe it to the SNS topic.
68
+ setup_commands:
69
+ - >-
70
+ aws dynamodb create-table --table-name session-store
71
+ --attribute-definitions AttributeName=session_id,AttributeType=S
72
+ --key-schema AttributeName=session_id,KeyType=HASH
73
+ --provisioned-throughput ReadCapacityUnits=1,WriteCapacityUnits=1
74
+ - aws sns create-topic --name ops-alerts
75
+ success_criteria:
76
+ services:
77
+ - dynamodb
78
+ - sns
79
+ - sqs
80
+ state_checks:
81
+ - command: aws dynamodb describe-table --table-name session-store
82
+ json_path: "$.Table.ProvisionedThroughput.ReadCapacityUnits"
83
+ expected: 50
84
+ - command: aws dynamodb describe-table --table-name session-store
85
+ json_path: "$.Table.ProvisionedThroughput.WriteCapacityUnits"
86
+ expected: 50
87
+ - command: >-
88
+ aws sns list-subscriptions-by-topic
89
+ --topic-arn arn:aws:sns:us-east-1:000000000000:ops-alerts
90
+ output_contains: "sqs"
91
+ steps:
92
+ - operation: update-table
93
+ resource: session-store
94
+ - operation: create-queue
95
+ resource: ops-alert-inbox
96
+ - operation: subscribe
97
+ resource: ops-alerts
server/services/tasks/intermediate.yaml ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ - task_id: 11
2
+ description: Create an S3 bucket named 'data-pipeline' and upload a file to it.
3
+ success_criteria:
4
+ steps:
5
+ - operation: create-bucket
6
+ resource: data-pipeline
7
+ - operation: put-object
8
+ resource: data-pipeline
9
+
10
+ - task_id: 12
11
+ description: >
12
+ Create a DynamoDB table named 'orders' with partition key 'order_id' (S),
13
+ then insert an item with order_id '001' and status 'pending'.
14
+ success_criteria:
15
+ steps:
16
+ - operation: create-table
17
+ resource: orders
18
+ - operation: put-item
19
+ resource: orders
20
+
21
+ - task_id: 13
22
+ description: >
23
+ Create an SNS topic named 'alerts', then create an SQS queue named
24
+ 'alert-inbox' and subscribe the queue to the topic.
25
+ success_criteria:
26
+ steps:
27
+ - operation: create-topic
28
+ resource: alerts
29
+ - operation: create-queue
30
+ resource: alert-inbox
31
+ - operation: subscribe
32
+ resource: alerts
33
+
34
+ - task_id: 14
35
+ description: >
36
+ Create an IAM role named 'lambda-exec-role' with an assume-role policy
37
+ for Lambda, then attach the AWSLambdaBasicExecutionRole managed policy to it.
38
+ success_criteria:
39
+ steps:
40
+ - operation: create-role
41
+ resource: lambda-exec-role
42
+ - operation: attach-role-policy
43
+ resource: lambda-exec-role
server/services/tasks/warmup.yaml ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ - task_id: 0
2
+ description: List all S3 buckets in the environment.
3
+ success_criteria:
4
+ command_contains: s3
5
+ operation: ls
6
+
7
+ - task_id: 1
8
+ description: Describe all EC2 instances in the environment.
9
+ success_criteria:
10
+ command_contains: ec2
11
+ operation: describe-instances
12
+
13
+ - task_id: 2
14
+ description: List all DynamoDB tables.
15
+ success_criteria:
16
+ command_contains: dynamodb
17
+ operation: list-tables
18
+
19
+ - task_id: 3
20
+ description: List all Lambda functions.
21
+ success_criteria:
22
+ command_contains: lambda
23
+ operation: list-functions
24
+
25
+ - task_id: 4
26
+ description: List all SQS queues in the environment.
27
+ success_criteria:
28
+ command_contains: sqs
29
+ operation: list-queues
30
+
31
+ - task_id: 5
32
+ description: List all SNS topics in the environment.
33
+ success_criteria:
34
+ command_contains: sns
35
+ operation: list-topics
uv.lock ADDED
The diff for this file is too large to render. See raw diff