Sizzing commited on
Commit
2073b3f
·
verified ·
1 Parent(s): b626a01

Upload folder using huggingface_hub

Browse files
Dockerfile ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Multi-stage build using openenv-base
# This Dockerfile is flexible and works for both:
# - In-repo environments (with local OpenEnv sources)
# - Standalone environments (with openenv from PyPI/Git)
# The build script (openenv build) handles context detection and sets appropriate build args.

ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
FROM ${BASE_IMAGE} AS builder

WORKDIR /app

# Ensure git is available (required for installing dependencies from VCS)
RUN apt-get update && \
    apt-get install -y --no-install-recommends git && \
    rm -rf /var/lib/apt/lists/*

# Build argument to control whether we're building standalone or in-repo
ARG BUILD_MODE=in-repo
ARG ENV_NAME=aws_rl_env

# Copy environment code (always at root of build context)
COPY . /app/env

# For in-repo builds, openenv is already vendored in the build context
# For standalone builds, openenv will be installed via pyproject.toml
WORKDIR /app/env

# Ensure uv is available (for local builds where base image lacks it)
RUN if ! command -v uv >/dev/null 2>&1; then \
        curl -LsSf https://astral.sh/uv/install.sh | sh && \
        mv /root/.local/bin/uv /usr/local/bin/uv && \
        mv /root/.local/bin/uvx /usr/local/bin/uvx; \
    fi

# Install dependencies using uv sync
# If uv.lock exists, use it; otherwise resolve on the fly
RUN --mount=type=cache,target=/root/.cache/uv \
    if [ -f uv.lock ]; then \
        uv sync --frozen --no-install-project --no-editable; \
    else \
        uv sync --no-install-project --no-editable; \
    fi

RUN --mount=type=cache,target=/root/.cache/uv \
    if [ -f uv.lock ]; then \
        uv sync --frozen --no-editable; \
    else \
        uv sync --no-editable; \
    fi

# Final runtime stage
FROM ${BASE_IMAGE}

WORKDIR /app

# Copy the uv-managed Python interpreter from builder
COPY --from=builder /root/.local/share/uv/python /root/.local/share/uv/python

# Copy the virtual environment from builder
# NOTE(review): the venv was created at /app/env/.venv but is relocated to
# /app/.venv — confirm venv scripts/shebangs tolerate the moved path.
COPY --from=builder /app/env/.venv /app/.venv

# Copy the environment code
COPY --from=builder /app/env /app/env

# Install AWS CLI
RUN apt-get update && \
    apt-get install -y --no-install-recommends awscli && \
    rm -rf /var/lib/apt/lists/*

# Configure AWS CLI to point to MiniStack (dummy "test" credentials — the
# emulator does not validate them)
RUN mkdir -p /root/.aws && \
    printf '[default]\nregion = us-east-1\noutput = json\n' > /root/.aws/config && \
    printf '[default]\naws_access_key_id = test\naws_secret_access_key = test\n' > /root/.aws/credentials
ENV AWS_ENDPOINT_URL=http://localhost:4566

# Enable the web interface for OpenEnv (if applicable)
# ENV ENABLE_WEB_INTERFACE=true

# Set PATH to use the virtual environment
ENV PATH="/app/.venv/bin:$PATH"

# Set PYTHONPATH so imports work correctly
ENV PYTHONPATH="/app/env:$PYTHONPATH"

# DEV_MODE=1 enables live reload via --reload flag
ENV DEV_MODE=0

# Entrypoint: start MiniStack in background, then run the FastAPI server.
# NOTE(review): the fixed "sleep 2" is a startup race — if MiniStack takes
# longer to bind port 4566, early AWS calls will fail; consider a readiness poll.
CMD ["sh", "-c", "ministack & sleep 2 && uvicorn server.app:app --host 0.0.0.0 --port 8000 $([ \"$DEV_MODE\" = '1' ] && echo '--reload --reload-dir /app/env')"]
Makefile ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Project settings — UPPER_SNAKE knobs, overridable from the command line
# (e.g. `make SERVER_PORT=9000 docker-run`).
PROJECT_NAME := openenv-aws_rl_env
PYTHON := python3
UV := uv
DOCKER_IMAGE := aws-rl-env
DOCKER_TAG := latest
SERVER_HOST := 0.0.0.0
SERVER_PORT := 8000

.DEFAULT_GOAL := help

# ──────────────────────────────────────────────
# Setup & Dependencies
# ──────────────────────────────────────────────

.PHONY: install
install: ## Install project dependencies
	$(UV) sync --frozen

.PHONY: install-dev
install-dev: ## Install project with dev dependencies
	$(UV) sync --frozen --all-extras

.PHONY: lock
lock: ## Update the lockfile
	$(UV) lock

# ──────────────────────────────────────────────
# Development
# ──────────────────────────────────────────────

.PHONY: run
run: ## Run with MiniStack + FastAPI server (mirrors Docker CMD)
	# MiniStack is backgrounded; it keeps running after uvicorn exits.
	ministack & sleep 2 && $(UV) run uvicorn server.app:app --host $(SERVER_HOST) --port $(SERVER_PORT)

# ──────────────────────────────────────────────
# Code Quality
# ──────────────────────────────────────────────

.PHONY: format
format: ## Format code with ruff
	$(UV) run ruff format .

.PHONY: lint
lint: ## Lint code with ruff
	$(UV) run ruff check .

.PHONY: lint-fix
lint-fix: ## Lint and auto-fix code with ruff
	$(UV) run ruff check --fix .

.PHONY: typecheck
typecheck: ## Run type checking with mypy
	$(UV) run mypy

.PHONY: check
check: lint typecheck ## Run lint and typecheck

# ──────────────────────────────────────────────
# Docker
# ──────────────────────────────────────────────

.PHONY: docker-build
docker-build: ## Build Docker image
	docker build -t $(DOCKER_IMAGE):$(DOCKER_TAG) .

.PHONY: docker-run
docker-run: ## Run Docker container
	docker run --rm -p $(SERVER_PORT):8000 $(DOCKER_IMAGE):$(DOCKER_TAG)

.PHONY: docker-run-dev
docker-run-dev: ## Run Docker container in dev mode with live reload
	# $(CURDIR) (not $$PWD) so the mount is correct under `make -C`.
	docker run --rm -p $(SERVER_PORT):8000 -v $(CURDIR):/app/env -v /app/env/.venv -e DEV_MODE=1 $(DOCKER_IMAGE):$(DOCKER_TAG)

.PHONY: docker-run-detach
docker-run-detach: ## Run Docker container in background
	docker run -d --rm -p $(SERVER_PORT):8000 --name $(DOCKER_IMAGE) $(DOCKER_IMAGE):$(DOCKER_TAG)

.PHONY: docker-stop
docker-stop: ## Stop the running Docker container
	docker stop $(DOCKER_IMAGE)

.PHONY: docker-logs
docker-logs: ## Tail logs from the running Docker container
	docker logs -f $(DOCKER_IMAGE)

.PHONY: docker-shell
docker-shell: ## Open a shell in the running Docker container
	docker exec -it $(DOCKER_IMAGE) /bin/bash

.PHONY: docker-clean
docker-clean: ## Stop and remove all running containers for this image
	@docker ps -q --filter ancestor=$(DOCKER_IMAGE):$(DOCKER_TAG) | xargs -r docker rm -f

.PHONY: docker-health
docker-health: ## Check health of the running container
	@curl -sf http://localhost:$(SERVER_PORT)/health && echo " OK" || echo " FAIL"

# ──────────────────────────────────────────────
# OpenEnv
# ──────────────────────────────────────────────

.PHONY: openenv-validate
openenv-validate: ## Validate the OpenEnv configuration
	openenv validate

.PHONY: openenv-build
openenv-build: ## Build the environment using OpenEnv CLI
	openenv build

.PHONY: openenv-push
openenv-push: ## Push the environment to Hugging Face Spaces
	openenv push

# ──────────────────────────────────────────────
# Cleanup
# ──────────────────────────────────────────────

.PHONY: clean
clean: ## Remove build artifacts and caches
	rm -rf build/ dist/ *.egg-info .eggs/
	rm -rf .pytest_cache/ .mypy_cache/ .ruff_cache/
	rm -rf htmlcov/ .coverage coverage.xml
	find . -type d -name __pycache__ -exec rm -rf {} + 2>/dev/null || true
	find . -type f -name '*.pyc' -delete 2>/dev/null || true

.PHONY: clean-all
clean-all: clean ## Remove all artifacts including venv
	rm -rf .venv/

# ──────────────────────────────────────────────
# Help
# ──────────────────────────────────────────────

.PHONY: help
help: ## Show this help message
	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | \
	awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}'
README.md CHANGED
@@ -1,10 +1,217 @@
1
  ---
2
- title: Aws Rl Env
3
- emoji: 📚
4
- colorFrom: indigo
5
  colorTo: pink
6
  sdk: docker
7
  pinned: false
 
 
 
 
8
  ---
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: AWS RL Environment Server
3
+ emoji: 🥇
4
+ colorFrom: pink
5
  colorTo: pink
6
  sdk: docker
7
  pinned: false
8
+ app_port: 8000
9
+ base_path: /web
10
+ tags:
11
+ - openenv
12
  ---
13
 
14
+ # AWS RL Environment
15
+
16
+ An RL environment backed by a **simulated AWS cloud** powered by [MiniStack](https://github.com/Nahuel990/ministack). The agent sends AWS API calls as actions and receives API responses as observations. MiniStack runs inside the same Docker container, emulating 34 AWS services locally.
17
+
18
+ ## Quick Start
19
+
20
+ ```python
21
+ from aws_rl_env import AwsRlAction, AwsRlEnv
22
+
23
+ try:
24
+ # Create environment from Docker image
25
+ env = AwsRlEnv.from_docker_image("aws_rl_env-env:latest")
26
+
27
+ # Reset
28
+ result = env.reset()
29
+ print(f"Episode: {result.observation.episode_id}")
30
+
31
+ # Create an S3 bucket
32
+ result = env.step(AwsRlAction(command="aws s3 mb s3://my-rl-bucket"))
33
+ print(f"Create bucket success: {result.observation.command_success}")
34
+ print(f"Output: {result.observation.command_output}")
35
+
36
+ # Upload a file to the bucket
37
+ result = env.step(AwsRlAction(command="aws s3 cp hello.txt s3://my-rl-bucket/"))
38
+ print(f"Upload success: {result.observation.command_success}")
39
+
40
+ # List buckets
41
+ result = env.step(AwsRlAction(command="aws s3 ls"))
42
+ print(f"Buckets: {result.observation.command_output}")
43
+
44
+ # Describe EC2 instances
45
+ result = env.step(AwsRlAction(command="aws ec2 describe-instances"))
46
+ print(f"EC2 output: {result.observation.command_output}")
47
+
48
+ # Check current task and resource state
49
+ print(f"Task: {result.observation.task}")
50
+ print(f"Task achieved: {result.observation.task_achieved}")
51
+ print(f"Resources: {result.observation.resources}")
52
+
53
+ finally:
54
+ env.close()
55
+ ```
56
+
57
+ ## Supported AWS Services
58
+
59
+ The environment supports **34 AWS services** via MiniStack:
60
+
61
+ | Category | Services |
62
+ |----------|----------|
63
+ | **Storage & DB** | S3, DynamoDB, RDS, ElastiCache, EFS |
64
+ | **Compute** | Lambda, ECS, EC2, Step Functions |
65
+ | **Messaging** | SQS, SNS, Kinesis, EventBridge, Firehose |
66
+ | **API** | API Gateway v1/v2, ALB/ELBv2 |
67
+ | **Security** | IAM, STS, Cognito, ACM, WAF v2, Secrets Manager |
68
+ | **Monitoring** | CloudWatch, CloudWatch Logs, SSM |
69
+ | **Infrastructure** | CloudFormation, Route53 |
70
+ | **Other** | SES, Athena, Glue, EMR |
71
+
72
+ ## Building the Docker Image
73
+
74
+ ```bash
75
+ docker build -t aws_rl_env-env:latest -f Dockerfile .
76
+ ```
77
+
78
+ The Docker image bundles:
79
+ - The RL environment server (port 8000)
80
+ - MiniStack AWS emulator (port 4566)
81
+ - boto3 for AWS SDK access
82
+ - All MiniStack dependencies
83
+
84
+ ## Environment Details
85
+
86
+ ### Core Types
87
+
88
+ - `TaskID` — Unique task identifier (int)
89
+ - `EpisodeID` — Unique episode identifier (str)
90
+ - `StepCount` — Step counter within an episode (int)
91
+ - `AwsService` — Supported AWS services: `s3`, `ec2`, `dynamodb`, `lambda`
92
+
93
+ ### Task
94
+
95
+ **Task**: Defines what the RL agent must accomplish
96
+
97
+ - `task_id` (TaskID) — Unique task identifier
98
+ - `difficulty` (TaskDifficulty) — One of: `warmup`, `beginner`, `intermediate`, `advanced`, `expert`
99
+ - `description` (str) — Human-readable task description
100
+ - `success_criteria` (dict) — Machine-readable criteria to evaluate task completion
101
+
102
+ ### Action
103
+
104
+ **AwsRlAction**: An AWS CLI command to execute against MiniStack
105
+
106
+ - `command` (str) — AWS CLI command to execute, e.g. `"aws s3 ls"`, `"aws ec2 describe-instances"`
107
+
108
+ ### Observation
109
+
110
+ **AwsRlObservation**: The result returned after each step
111
+
112
+ - `episode_id` (EpisodeID) — Unique identifier for the episode
113
+ - `step_count` (StepCount) — Current step count in the episode
114
+ - `command_success` (bool) — Whether the CLI command executed successfully
115
+ - `command_output` (str) — Stdout from the executed AWS CLI command
116
+ - `error` (str) — Stderr if the command failed
117
+ - `resources` (dict[AwsService, dict | list | str]) — Current resource state from MiniStack, keyed by service name
118
+ - `task` (Task | None) — The task the agent is trying to accomplish
119
+ - `task_achieved` (bool) — Whether the task has been achieved
120
+
121
+ ## Architecture
122
+
123
+ ```
124
+ ┌─────────────────────────────────────────┐
125
+ │ Docker Container │
126
+ │ │
127
+ │ ┌──────────────┐ ┌───────────────┐ │
128
+ │ │ RL Server │ │ MiniStack │ │
129
+ │ │ (port 8000) │──▶│ (port 4566) │ │
130
+ │ │ FastAPI + │ │ 34 AWS │ │
131
+ │ │ WebSocket │ │ services │ │
132
+ │ └──────────────┘ └───────────────┘ │
133
+ │ │ │ │
134
+ │ │ boto3 calls │ │
135
+ │ └──��─────────────────┘ │
136
+ └─────────────────────────────────────────┘
137
+
138
+ │ WebSocket / HTTP
139
+
140
+ RL Agent (client)
141
+ ```
142
+
143
+ ## Advanced Usage
144
+
145
+ ### Connecting to an Existing Server
146
+
147
+ ```python
148
+ from aws_rl_env import AwsRlAction, AwsRlEnv
149
+
150
+ env = AwsRlEnv(base_url="http://localhost:8000")
151
+ result = env.reset()
152
+
153
+ # Create a DynamoDB table
154
+ result = env.step(AwsRlAction(
155
+ command="aws dynamodb create-table --table-name my-table --key-schema AttributeName=id,KeyType=HASH --attribute-definitions AttributeName=id,AttributeType=S --billing-mode PAY_PER_REQUEST"
156
+ ))
157
+ print(f"Table created: {result.observation.command_success}")
158
+ print(f"Output: {result.observation.command_output}")
159
+ ```
160
+
161
+ ### Concurrent Sessions
162
+
163
+ ```python
164
+ from aws_rl_env import AwsRlAction, AwsRlEnv
165
+ from concurrent.futures import ThreadPoolExecutor
166
+
167
+ def run_episode(client_id: int):
168
+ with AwsRlEnv(base_url="http://localhost:8000") as env:
169
+ result = env.reset()
170
+ for i in range(10):
171
+ result = env.step(AwsRlAction(
172
+ command=f"aws s3api put-object --bucket client-{client_id} --key step-{i}.txt --body 'data from step {i}'"
173
+ ))
174
+ return client_id, result.observation.command_success
175
+
176
+ with ThreadPoolExecutor(max_workers=4) as executor:
177
+ results = list(executor.map(run_episode, range(4)))
178
+ ```
179
+
180
+ ### Running Locally (without Docker)
181
+
182
+ Start MiniStack and the RL server separately:
183
+
184
+ ```bash
185
+ # Terminal 1: Start MiniStack
186
+ pip install ministack
187
+ ministack # Runs on port 4566
188
+
189
+ # Terminal 2: Start RL server
190
+ export AWS_ENDPOINT_URL=http://localhost:4566
191
+ export AWS_ACCESS_KEY_ID=test
192
+ export AWS_SECRET_ACCESS_KEY=test
193
+ uvicorn server.app:app --reload --host 0.0.0.0 --port 8000
194
+ ```
195
+
196
+ ## Project Structure
197
+
198
+ ```
199
+ aws_rl_env/
200
+ ├── __init__.py # Module exports
201
+ ├── README.md # This file
202
+ ├── Dockerfile # Container image (bundles RL server + MiniStack)
203
+ ├── entrypoint.sh # Starts MiniStack then RL server
204
+ ├── openenv.yaml # OpenEnv manifest
205
+ ├── pyproject.toml # Project metadata and dependencies
206
+ ├── uv.lock # Locked dependencies
207
+ ├── client.py # AwsRlEnv client
208
+ ├── models.py # AwsRlAction and AwsRlObservation models
209
+ ├── ministack/ # MiniStack AWS emulator (bundled)
210
+ │ ├── app.py # MiniStack ASGI application
211
+ │ ├── core/ # Routing, persistence, responses
212
+ │ └── services/ # 34 AWS service implementations
213
+ └── server/
214
+ ├── __init__.py
215
+ ├── aws_rl_env_environment.py # Core RL environment (uses boto3 → MiniStack)
216
+ └── app.py # FastAPI application
217
+ ```
__init__.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""Aws Rl Env Environment.

Re-exports the client and its action/observation models so callers can write
``from aws_rl_env import AwsRlEnv, AwsRlAction, AwsRlObservation``.
"""

from .client import AwsRlEnv
from .models import AwsRlAction, AwsRlObservation

__all__ = [
    "AwsRlAction",
    "AwsRlObservation",
    "AwsRlEnv",
]
client.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""Aws Rl Env Environment Client."""

from typing import Dict

from openenv.core import EnvClient
from openenv.core.client_types import StepResult
from openenv.core.env_server.types import State

# NOTE(review): absolute import mirrors inference.py's script-style usage;
# when imported as a package via __init__.py this resolves only if the package
# directory is on sys.path (PYTHONPATH=/app/env in the Dockerfile) — confirm.
from models import AwsRlAction, AwsRlObservation, EpisodeID, StepCount


class AwsRlEnv(EnvClient[AwsRlAction, AwsRlObservation, State]):
    """
    Client for the Aws Rl Env Environment.

    This client maintains a persistent WebSocket connection to the environment
    server, enabling efficient multi-step interactions with lower latency.
    Each client instance has its own dedicated environment session on the server.

    Example:
        >>> with AwsRlEnv(base_url="http://localhost:8000") as client:
        ...     result = client.reset()
        ...     print(result.observation.command_output)
        ...
        ...     result = client.step(AwsRlAction(command="aws s3 ls"))
        ...     print(result.observation.command_output)

    Example with Docker:
        >>> client = AwsRlEnv.from_docker_image("aws_rl_env-env:latest")
        >>> try:
        ...     result = client.reset()
        ...     result = client.step(AwsRlAction(command="aws s3 ls"))
        ... finally:
        ...     client.close()
    """

    def _step_payload(self, action: AwsRlAction) -> Dict:
        """Convert an AwsRlAction into the JSON payload for a step message."""
        return {"command": action.command}

    def _parse_result(self, payload: Dict) -> StepResult[AwsRlObservation]:
        """Parse a server response into StepResult[AwsRlObservation].

        Missing observation fields fall back to neutral defaults, so a partial
        payload never raises here.
        """
        obs_data = payload.get("observation", {})
        # NOTE(review): the 'resources' field documented in the README
        # observation schema is not forwarded here — confirm whether
        # AwsRlObservation defines it and whether it should be parsed.
        observation = AwsRlObservation(
            episode_id=EpisodeID(obs_data.get("episode_id", "")),
            step_count=StepCount(obs_data.get("step_count", 0)),
            command_success=obs_data.get("command_success", False),
            command_output=obs_data.get("command_output", ""),
            error=obs_data.get("error", ""),
            task=obs_data.get("task"),
            task_achieved=obs_data.get("task_achieved", False),
            # done/reward are duplicated on the observation in addition to the
            # StepResult below; kept for backward compatibility.
            done=payload.get("done", False),
            reward=payload.get("reward", 0.0),
        )

        return StepResult(
            observation=observation,
            reward=payload.get("reward", 0.0),
            done=payload.get("done", False),
        )

    def _parse_state(self, payload: Dict) -> State:
        """Parse a server response into a State object."""
        return State(
            episode_id=payload.get("episode_id"),
            step_count=payload.get("step_count", 0),
        )
inference.py ADDED
@@ -0,0 +1,241 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
Inference Script Example
===================================
MANDATORY
- Before submitting, ensure the following variables are defined in your environment configuration:
    API_BASE_URL       The API endpoint for the LLM.
    MODEL_NAME         The model identifier to use for inference.
    HF_TOKEN           Your Hugging Face / API key.
    LOCAL_IMAGE_NAME   The name of the local image to use for the environment if you are using
                       the from_docker_image() method

- Defaults are set only for API_BASE_URL and MODEL_NAME
  (and should reflect your active inference setup):
    API_BASE_URL = os.getenv("API_BASE_URL", "<your-active-endpoint>")
    MODEL_NAME = os.getenv("MODEL_NAME", "<your-active-model>")

- The inference script must be named `inference.py` and placed in the root directory of the project
- Participants must use OpenAI Client for all LLM calls using above variables

STDOUT FORMAT
- The script must emit exactly three line types to stdout, in this order:

    [START] task=<task_name> env=<benchmark> model=<model_name>
    [STEP] step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
    [END] success=<true|false> steps=<n> score=<0.000> rewards=<r1,r2,...,rn>

Rules:
- One [START] line at episode begin.
- One [STEP] line per step, immediately after env.step() returns.
- One [END] line after env.close(), always emitted (even on exception).
- reward and rewards are formatted to 2 decimal places; score to 3.
- done and success are lowercase booleans: true or false.
- error is the raw last_action_error string, or null if none.
- All fields on a single line with no newlines within a line.

Example:
    [START] task=create-s3-bucket env=aws_rl_env model=Qwen2.5-72B-Instruct
    [STEP] step=1 action=aws s3api create-bucket --bucket my-test-bucket reward=1.00 done=false error=null
    [END] success=true steps=1 score=1.000 rewards=1.00
"""

import asyncio
import os
import textwrap
from typing import List, Optional

from dotenv import load_dotenv
from openai import OpenAI

from client import AwsRlEnv
from models import AwsRlAction

load_dotenv()  # Load variables from .env file if present

# Docker image for AwsRlEnv.from_docker_image(). The docstring mandates
# LOCAL_IMAGE_NAME; also honor the legacy IMAGE_NAME for backward compatibility.
IMAGE_NAME = os.getenv("LOCAL_IMAGE_NAME") or os.getenv("IMAGE_NAME")
API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY")

API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct"
BENCHMARK = os.getenv("BENCHMARK", "aws_rl_env")
MAX_STEPS = int(os.getenv("MAX_STEPS", "15"))
TEMPERATURE = 0.7
MAX_TOKENS = 512
SUCCESS_SCORE_THRESHOLD = 1.0  # task_achieved yields reward=1.0

SYSTEM_PROMPT = textwrap.dedent(
    """
    You are an AWS cloud engineer interacting with a real AWS environment via CLI.
    Each turn you must send exactly ONE valid AWS CLI command (starting with 'aws').

    You will be given a task to accomplish. Read the task description carefully.
    Use the command output and error messages to guide your next action.

    Rules:
    - Only send AWS CLI commands (e.g. 'aws s3 ls', 'aws dynamodb create-table ...')
    - One command per turn — no pipes, no shell syntax, no chaining
    - Reply with ONLY the command, nothing else — no explanations, no quotes
    """
).strip()
80
+
81
+
82
def log_start(task: str, env: str, model: str) -> None:
    """Emit the mandatory single [START] line marking episode begin."""
    print(f"[START] task={task} env={env} model={model}", flush=True)
84
+
85
+
86
def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
    """Emit one [STEP] line: reward to 2 decimals, lowercase booleans,
    and 'null' when there is no error string."""
    error_val = error if error else "null"
    done_val = str(done).lower()
    print(
        f"[STEP] step={step} action={action} reward={reward:.2f} done={done_val} error={error_val}",
        flush=True,
    )
93
+
94
+
95
def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
    """Emit the final [END] line: lowercase success, score to 3 decimals,
    rewards comma-joined at 2 decimals each."""
    rewards_str = ",".join(f"{r:.2f}" for r in rewards)
    print(f"[END] success={str(success).lower()} steps={steps} score={score:.3f} rewards={rewards_str}", flush=True)
98
+
99
+
100
def build_user_prompt(
    task_description: str,
    step: int,
    last_output: str,
    last_error: str,
    last_reward: float,
    history: List[str],
) -> str:
    """Build the per-turn user prompt for the model.

    Fix: the original interpolated a multi-line history block into a
    textwrap.dedent() template; history lines at column 0 emptied the common
    indent prefix, so prompts with multi-entry history kept 8-space
    indentation. Building the prompt by joining explicit lines avoids that.

    Only the last 6 history entries are included to bound prompt size.
    """
    history_block = "\n".join(history[-6:]) if history else "None"
    lines = [
        f"TASK: {task_description}",
        "",
        f"Step: {step}",
        f"Last command output: {last_output!r}",
        f"Last error: {last_error!r}",
        f"Last reward: {last_reward:.2f}",
        "",
        "Previous steps:",
        history_block,
        "",
        "Send your next AWS CLI command.",
    ]
    return "\n".join(lines)
124
+
125
+
126
def get_model_command(
    client: OpenAI,
    task_description: str,
    step: int,
    last_output: str,
    last_error: str,
    last_reward: float,
    history: List[str],
) -> str:
    """Ask the LLM for the next AWS CLI command.

    Returns the model's reply (markdown code fences stripped). Any reply that
    does not start with 'aws ' — and any request failure — degrades to the
    safe no-op command 'aws help' so the episode loop never raises here.
    """
    user_prompt = build_user_prompt(
        task_description, step, last_output, last_error, last_reward, history
    )
    try:
        completion = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_prompt},
            ],
            temperature=TEMPERATURE,
            max_tokens=MAX_TOKENS,
            stream=False,
        )
        text = (completion.choices[0].message.content or "").strip()
        # Strip markdown code fences if the model wraps the command
        if text.startswith("```"):
            lines = text.split("\n")
            text = "\n".join(
                line for line in lines if not line.startswith("```")
            ).strip()
        return text if text.startswith("aws ") else "aws help"
    except Exception as exc:
        print(f"[DEBUG] Model request failed: {exc}", flush=True)
        return "aws help"
160
+
161
+
162
async def main() -> None:
    """Run one episode against the AWS RL environment, emitting the
    [START]/[STEP]/[END] protocol lines on stdout.

    The [END] line is always emitted (finally block), even on exception,
    per the contract in the module docstring.
    """
    client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)

    # NOTE(review): from_docker_image/reset/step/close are awaited here, but
    # client.py's AwsRlEnv docstring shows synchronous usage (no await) —
    # confirm which openenv client API (sync vs async) this script targets.
    env = await AwsRlEnv.from_docker_image(IMAGE_NAME)

    history: List[str] = []
    rewards: List[float] = []
    steps_taken = 0
    score = 0.0
    success = False
    task_name = "unknown"
    task_description = ""

    try:
        result = await env.reset()  # OpenENV.reset()
        obs = result.observation

        # Extract task info from the first observation
        if obs.task is not None:
            task_name = f"task-{obs.task.task_id}"
            task_description = obs.task.description
        else:
            task_description = "No task assigned."

        log_start(task=task_name, env=BENCHMARK, model=MODEL_NAME)

        last_output = obs.command_output
        last_error = ""
        last_reward = 0.0

        for step in range(1, MAX_STEPS + 1):
            if result.done:
                break

            command = get_model_command(
                client, task_description, step,
                last_output, last_error, last_reward, history,
            )

            result = await env.step(AwsRlAction(command=command))
            obs = result.observation

            reward = result.reward or 0.0
            done = result.done
            error = obs.error if obs.error else None

            rewards.append(reward)
            steps_taken = step
            last_output = obs.command_output
            last_error = obs.error
            last_reward = reward

            log_step(step=step, action=command, reward=reward, done=done, error=error)

            status = "OK" if obs.command_success else "FAIL"
            history.append(f"Step {step} [{status}]: {command} -> reward={reward:.2f}")

            # Task achieved — episode success
            if obs.task_achieved:
                success = True
                break

            if done:
                break

        # Episode score = best single-step reward, clamped to [0, 1].
        score = max(rewards) if rewards else 0.0
        score = min(max(score, 0.0), 1.0)  # clamp to [0, 1]
        if not success:
            success = score >= SUCCESS_SCORE_THRESHOLD

    finally:
        # Best-effort container cleanup; never let close() mask the episode result.
        try:
            await env.close()
        except Exception as e:
            print(f"[DEBUG] env.close() error (container cleanup): {e}", flush=True)
        log_end(success=success, steps=steps_taken, score=score, rewards=rewards)


if __name__ == "__main__":
    asyncio.run(main())
models.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
Data models for the Aws Rl Env Environment.
"""

from enum import Enum
from typing import NewType, Union

from openenv.core.env_server.types import Action, Observation
from pydantic import BaseModel, Field

# ---------------------------------------------------------------------------
# Core Types
# ---------------------------------------------------------------------------

# Distinct aliases over plain int/str so signatures are self-documenting and
# type-checkers can flag accidental mix-ups.
TaskID = NewType("TaskID", int)
EpisodeID = NewType("EpisodeID", str)
StepCount = NewType("StepCount", int)
19
+
20
class AwsService(str, Enum):
    """AWS service identifiers tracked by the environment.

    str-mixin so members compare equal to their plain string values and
    serialize naturally in JSON payloads.
    """

    S3 = "s3"
    EC2 = "ec2"
    DYNAMODB = "dynamodb"
    LAMBDA = "lambda"
    SQS = "sqs"
    SNS = "sns"
    IAM = "iam"
    APIGATEWAY = "apigateway"
29
+
30
+
31
+ # ---------------------------------------------------------------------------
32
+ # RL Task Definition
33
+ # ---------------------------------------------------------------------------
34
+
35
+
36
class TaskDifficulty(str, Enum):
    """Difficulty tiers for RL tasks, ordered easiest to hardest."""

    WARMUP = "warmup"
    BEGINNER = "beginner"
    INTERMEDIATE = "intermediate"
    ADVANCED = "advanced"
    EXPERT = "expert"
42
+
43
+
44
class TierConfig(BaseModel):
    """Configuration for a single difficulty tier's promotion and mastery rules."""

    min_episodes: int = Field(..., ge=0, description="Minimum episodes before promotion eligible")
    advance_rate: float = Field(..., ge=0.0, le=1.0, description="Tier success rate to advance")
    mastery_window: int = Field(default=10, ge=1, description="Sliding window size for success rate")
    mastery_threshold: float = Field(
        default=0.7, ge=0.0, le=1.0, description="Per-task graduation threshold"
    )
    fast_track_rate: float = Field(
        default=0.9, ge=0.0, le=1.0,
        description="Success rate for early promotion after 3 episodes",
    )


class SpacedRepState(BaseModel):
    """Tracks spaced repetition schedule for a graduated task."""

    interval: int = Field(default=3, ge=1, description="Episodes until next re-test")
    last_graduated_episode: int = Field(
        default=0, ge=0, description="Episode number when task was last graduated"
    )


class SetupCommand(BaseModel):
    """A single AWS CLI command executed during environment setup before the agent acts."""

    command: str = Field(..., description="AWS CLI command to execute")
    description: str | None = Field(
        default=None, description="Human-readable explanation of what this command sets up"
    )
    ignore_failure: bool = Field(
        default=False,
        description="If True, continue setup even if this command fails",
    )


class ResourceExistsCheck(BaseModel):
    """Checks that a specific named resource exists in MiniStack."""

    service: AwsService = Field(..., description="AWS service to verify the resource in")
    name: str = Field(..., description="Exact resource name to verify")


class StepCriteria(BaseModel):
    """A single required step in a multi-step task."""

    operation: str = Field(..., description="AWS CLI operation, e.g. 'create-bucket'")
    resource: str | None = Field(
        default=None, description="Resource name the operation must target"
    )


class StateCheck(BaseModel):
    """An assertion about the environment's end-state, evaluated via AWS CLI."""

    command: str = Field(..., description="AWS CLI command to run for verification")
    output_contains: str | None = Field(
        default=None, description="Substring that must appear in stdout"
    )
    json_path: str | None = Field(
        default=None, description="JSON path to extract from stdout, e.g. '$.Table.Name'"
    )
    expected: int | float | str | bool | None = Field(
        default=None, description="Expected value at json_path"
    )
110
+
111
+
112
+ class SuccessCriteria(BaseModel):
113
+ """Machine-readable criteria to evaluate task completion.
114
+
115
+ Different tiers populate different fields:
116
+ - Warmup: command_contains + operation
117
+ - Beginner: command_contains + operation + resource_exists
118
+ - Intermediate: steps
119
+ - Advanced: services + steps
120
+ - Expert: services + state_checks + steps (optional)
121
+ """
122
+
123
+ command_contains: str | None = Field(
124
+ default=None, description="Substring the agent's command must contain"
125
+ )
126
+ operation: str | None = Field(
127
+ default=None, description="AWS CLI operation the agent must invoke"
128
+ )
129
+ resource_exists: ResourceExistsCheck | None = Field(
130
+ default=None, description="Resource that must exist after the agent acts"
131
+ )
132
+ steps: list[StepCriteria] = Field(
133
+ default_factory=list, description="Ordered sequence of required operations"
134
+ )
135
+ services: list[AwsService] = Field(
136
+ default_factory=list, description="AWS services the agent must interact with"
137
+ )
138
+ state_checks: list[StateCheck] = Field(
139
+ default_factory=list,
140
+ description="End-state assertions — source of truth for expert/SRE tasks",
141
+ )
142
+
143
+
144
+ class Task(BaseModel):
145
+ """Defines a task the RL agent must accomplish in the AWS environment."""
146
+
147
+ task_id: TaskID = Field(..., ge=0, description="Unique task identifier")
148
+ difficulty: TaskDifficulty = Field(
149
+ default=TaskDifficulty.WARMUP, description="Task difficulty level"
150
+ )
151
+ description: str = Field(..., description="Human-readable task description")
152
+ success_criteria: SuccessCriteria = Field(
153
+ default_factory=SuccessCriteria,
154
+ description="Machine-readable criteria to evaluate task completion",
155
+ )
156
+ setup_commands: list[SetupCommand] = Field(
157
+ default_factory=list,
158
+ description="Commands to run during reset to set up initial state (e.g. for SRE tasks)",
159
+ )
160
+
161
+
162
+ # ---------------------------------------------------------------------------
163
+ # Action & Observation
164
+ # ---------------------------------------------------------------------------
165
+
166
+
167
+ class AwsRlAction(Action):
168
+ """Action for the Aws Rl Env environment — an AWS CLI command to execute against MiniStack."""
169
+
170
+ command: str = Field(
171
+ ...,
172
+ description="AWS CLI command to execute, e.g. 'aws s3 ls', 'aws ec2 describe-instances'",
173
+ )
174
+
175
+
176
+ class AwsRlObservation(Observation):
177
+ """Observation returned after each step in the AWS RL environment."""
178
+
179
+ episode_id: EpisodeID = Field(..., description="Unique identifier for the episode")
180
+ step_count: StepCount = Field(..., ge=0, description="Current step count in the episode")
181
+ command_success: bool = Field(
182
+ ..., description="Whether the CLI command executed successfully"
183
+ )
184
+ command_output: str = Field(
185
+ default="", description="Stdout from the executed AWS CLI command"
186
+ )
187
+ error: str = Field(default="", description="Stderr if the command failed")
188
+ resources: dict[AwsService, Union[dict, list, str]] = Field(
189
+ default_factory=dict,
190
+ description="Current resource state from MiniStack, keyed by service name",
191
+ )
192
+ task: Task | None = Field(
193
+ default=None, description="The task the agent is trying to accomplish"
194
+ )
195
+ task_achieved: bool = Field(
196
+ default=False, description="Whether the task has been achieved"
197
+ )
openenv.yaml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ spec_version: 1
2
+ name: aws_rl_env
3
+ type: space
4
+ runtime: fastapi
5
+ app: server.app:app
6
+ port: 8000
7
+
openenv_aws_rl_env.egg-info/PKG-INFO ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Metadata-Version: 2.4
2
+ Name: openenv-aws_rl_env
3
+ Version: 0.1.0
4
+ Summary: Aws Rl Env environment for OpenEnv
5
+ Requires-Python: >=3.12
6
+ Requires-Dist: openenv-core[core]>=0.2.2
7
+ Requires-Dist: ministack>=1.1.24
8
+ Requires-Dist: python-dotenv>=1.0.0
9
+ Provides-Extra: dev
10
+ Requires-Dist: pytest>=8.0.0; extra == "dev"
11
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
12
+ Requires-Dist: ruff>=0.4.0; extra == "dev"
13
+ Requires-Dist: mypy>=1.10.0; extra == "dev"
14
+ Requires-Dist: types-PyYAML>=6.0.0; extra == "dev"
openenv_aws_rl_env.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ README.md
2
+ __init__.py
3
+ client.py
4
+ inference.py
5
+ models.py
6
+ pyproject.toml
7
+ ./__init__.py
8
+ ./client.py
9
+ ./inference.py
10
+ ./models.py
11
+ openenv_aws_rl_env.egg-info/PKG-INFO
12
+ openenv_aws_rl_env.egg-info/SOURCES.txt
13
+ openenv_aws_rl_env.egg-info/dependency_links.txt
14
+ openenv_aws_rl_env.egg-info/entry_points.txt
15
+ openenv_aws_rl_env.egg-info/requires.txt
16
+ openenv_aws_rl_env.egg-info/top_level.txt
17
+ server/__init__.py
18
+ server/app.py
19
+ server/aws_rl_env_environment.py
openenv_aws_rl_env.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
 
 
1
+
openenv_aws_rl_env.egg-info/entry_points.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ [console_scripts]
2
+ server = aws_rl_env.server.app:main
openenv_aws_rl_env.egg-info/requires.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ openenv-core[core]>=0.2.2
2
+ ministack>=1.1.24
3
+ python-dotenv>=1.0.0
4
+
5
+ [dev]
6
+ pytest>=8.0.0
7
+ pytest-cov>=4.0.0
8
+ ruff>=0.4.0
9
+ mypy>=1.10.0
10
+ types-PyYAML>=6.0.0
openenv_aws_rl_env.egg-info/top_level.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ aws_rl_env
pyproject.toml ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ [build-system]
8
+ requires = ["setuptools>=45", "wheel"]
9
+ build-backend = "setuptools.build_meta"
10
+
11
+ [project]
12
+ name = "openenv-aws_rl_env"
13
+ version = "0.1.0"
14
+ description = "Aws Rl Env environment for OpenEnv"
15
+ requires-python = ">=3.12"
16
+ dependencies = [
17
+ # Core OpenEnv runtime (provides FastAPI server + HTTP client types)
18
+ # install from github
19
+ # "openenv-core[core] @ git+https://github.com/meta-pytorch/OpenEnv.git",
20
+ "openenv-core[core]>=0.2.2",
21
+ # Environment-specific dependencies
22
+ # Add all dependencies needed for your environment here
23
+ # Examples:
24
+ # "numpy>=1.19.0",
25
+ # "torch>=2.0.0",
26
+ # "gymnasium>=0.29.0",
27
+ # "openspiel>=1.0.0",
28
+ # "smolagents>=1.22.0,<2",
29
+ "ministack>=1.1.24",
30
+ "python-dotenv>=1.0.0",
31
+ ]
32
+
33
+ [project.optional-dependencies]
34
+ dev = [
35
+ "pytest>=8.0.0",
36
+ "pytest-cov>=4.0.0",
37
+ "ruff>=0.4.0",
38
+ "mypy>=1.10.0",
39
+ "types-PyYAML>=6.0.0",
40
+ ]
41
+
42
+ [project.scripts]
43
+ # Server entry point - enables running via: uv run --project . server
44
+ # or: python -m aws_rl_env.server.app
45
+ server = "aws_rl_env.server.app:main"
46
+
47
+ [tool.setuptools]
48
+ include-package-data = true
49
+ packages = ["aws_rl_env", "aws_rl_env.server"]
50
+ package-dir = { "aws_rl_env" = ".", "aws_rl_env.server" = "server" }
51
+
52
+ [tool.mypy]
53
+ files = ["*.py", "server/"]
54
+ ignore_missing_imports = true
55
+ namespace_packages = true
56
+ explicit_package_bases = true
57
+ mypy_path = "."
server/__init__.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """Aws Rl Env environment server components."""
8
+
9
+ from .aws_rl_env_environment import AwsRlEnvironment
10
+
11
+ __all__ = ["AwsRlEnvironment"]
server/app.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ FastAPI application for the Aws Rl Env Environment.
9
+
10
+ This module creates an HTTP server that exposes the AwsRlEnvironment
11
+ over HTTP and WebSocket endpoints, compatible with EnvClient.
12
+
13
+ Endpoints:
14
+ - POST /reset: Reset the environment
15
+ - POST /step: Execute an action
16
+ - GET /state: Get current environment state
17
+ - GET /schema: Get action/observation schemas
18
+ - WS /ws: WebSocket endpoint for persistent sessions
19
+
20
+ Usage:
21
+ # Development (with auto-reload):
22
+ uvicorn server.app:app --reload --host 0.0.0.0 --port 8000
23
+
24
+ # Production:
25
+ uvicorn server.app:app --host 0.0.0.0 --port 8000 --workers 4
26
+
27
+ # Or run directly:
28
+ python -m server.app
29
+ """
30
+
31
+ try:
32
+ from openenv.core.env_server.http_server import create_app
33
+ except Exception as e: # pragma: no cover
34
+ raise ImportError(
35
+ "openenv is required for the web interface. Install dependencies with '\n uv sync\n'"
36
+ ) from e
37
+
38
+
39
+ from models import AwsRlAction, AwsRlObservation
40
+ from server.aws_rl_env_environment import AwsRlEnvironment
41
+
42
+
43
+ # Create the app with web interface and README integration
44
+ app = create_app(
45
+ AwsRlEnvironment,
46
+ AwsRlAction,
47
+ AwsRlObservation,
48
+ env_name="aws_rl_env",
49
+ max_concurrent_envs=1, # increase this number to allow more concurrent WebSocket sessions
50
+ )
51
+
52
+
53
+ def main(host: str = "0.0.0.0", port: int = 8000):
54
+ """
55
+ Entry point for direct execution via uv run or python -m.
56
+
57
+ This function enables running the server without Docker:
58
+ uv run --project . server
59
+ uv run --project . server --port 8001
60
+ python -m aws_rl_env.server.app
61
+
62
+ Args:
63
+ host: Host address to bind to (default: "0.0.0.0")
64
+ port: Port number to listen on (default: 8000)
65
+
66
+ For production deployments, consider using uvicorn directly with
67
+ multiple workers:
68
+ uvicorn aws_rl_env.server.app:app --workers 4
69
+ """
70
+ import uvicorn
71
+
72
+ uvicorn.run(app, host=host, port=port)
73
+
74
+
75
+ if __name__ == "__main__":
76
+ main()
server/aws_rl_env_environment.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ Aws Rl Env Environment Implementation.
9
+
10
+ An RL environment backed by a simulated AWS cloud powered by MiniStack.
11
+ The agent sends AWS CLI commands as actions and receives CLI output plus
12
+ the current resource state as observations.
13
+ """
14
+
15
+ import logging
16
+
17
+ from typing import Any, Optional
18
+ from uuid import uuid4
19
+
20
+ from openenv.core.env_server.interfaces import Environment
21
+ from openenv.core.env_server.types import State
22
+
23
+ from models import AwsRlAction, AwsRlObservation, EpisodeID, StepCount, Task
24
+ from server.services.aws_backend import AwsBackend
25
+ from server.services.curriculum import Curriculum
26
+ from server.services.environment_designer import EnvironmentDesigner
27
+ from server.services.episode_tracker import EpisodeTracker
28
+ from server.services.task_grader import TaskGrader
29
+
30
+ logger = logging.getLogger(__name__)
31
+
32
+
33
+ class AwsRlEnvironment(Environment[AwsRlAction, AwsRlObservation, State]):
34
+ SUPPORTS_CONCURRENT_SESSIONS: bool = True
35
+
36
+ def __init__(self) -> None:
37
+ print("Initializing AWS RL Environment...")
38
+ self._state = State(episode_id=str(uuid4()), step_count=0)
39
+ self._backend = AwsBackend()
40
+ self._curriculum = Curriculum()
41
+ self._grader = TaskGrader(self._backend)
42
+ self._designer = EnvironmentDesigner(self._backend)
43
+ self._tracker = EpisodeTracker()
44
+ self._current_task: Task | None = None
45
+
46
+ def reset(
47
+ self,
48
+ seed: Optional[int] = None,
49
+ episode_id: Optional[str] = None,
50
+ **kwargs: Any,
51
+ ) -> AwsRlObservation:
52
+ self._backend.reset_environment()
53
+ self._state = State(episode_id=episode_id or str(uuid4()), step_count=0)
54
+ self._tracker.reset()
55
+ self._current_task = self._curriculum.next_task()
56
+
57
+ self._designer.apply(self._current_task)
58
+
59
+ return AwsRlObservation(
60
+ episode_id=EpisodeID(self._state.episode_id or ""),
61
+ step_count=StepCount(self._state.step_count),
62
+ command_success=True,
63
+ command_output="Environment reset. MiniStack state wiped.",
64
+ task=self._current_task,
65
+ done=False,
66
+ reward=0.0,
67
+ )
68
+
69
+ def step(
70
+ self,
71
+ action: AwsRlAction,
72
+ timeout_s: Optional[float] = None,
73
+ **kwargs: Any,
74
+ ) -> AwsRlObservation:
75
+ assert self._current_task is not None, "Call reset() before step()"
76
+ self._state.step_count += 1
77
+
78
+ # Anti-hack: only allow AWS CLI commands
79
+ command = action.command.strip()
80
+ if not command.startswith("aws "):
81
+ return AwsRlObservation(
82
+ episode_id=EpisodeID(self._state.episode_id or ""),
83
+ step_count=StepCount(self._state.step_count),
84
+ command_success=False,
85
+ command_output="",
86
+ error="Only AWS CLI commands (starting with 'aws') are allowed.",
87
+ task=self._current_task,
88
+ task_achieved=False,
89
+ done=False,
90
+ reward=0.0,
91
+ )
92
+
93
+ success, stdout, stderr = self._backend.execute_command(command)
94
+
95
+ # Record in tracker
96
+ latest_step = self._tracker.record_step(command, success, stdout, stderr)
97
+
98
+ # Grade the task
99
+ task_achieved = False
100
+
101
+ grade_result = self._grader.grade(
102
+ self._current_task, self._tracker, latest_step
103
+ )
104
+ task_achieved = grade_result.task_achieved
105
+ reward = grade_result.reward
106
+
107
+ if task_achieved:
108
+ self._curriculum.record_result(
109
+ self._current_task, achieved=True, reward=reward
110
+ )
111
+
112
+ return AwsRlObservation(
113
+ episode_id=EpisodeID(self._state.episode_id or ""),
114
+ step_count=StepCount(self._state.step_count),
115
+ command_success=success,
116
+ command_output=stdout,
117
+ error=stderr,
118
+ task=self._current_task,
119
+ task_achieved=task_achieved,
120
+ done=task_achieved,
121
+ reward=reward,
122
+ )
123
+
124
+ @property
125
+ def state(self) -> State:
126
+ return self._state
server/requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ openenv[core]>=0.2.0
2
+ fastapi>=0.115.0
3
+ uvicorn>=0.24.0
4
+
5
+
6
+
server/services/__init__.py ADDED
File without changes
server/services/aws_backend.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Backend service for managing AWS interactions via MiniStack."""
2
+
3
+ import logging
4
+ import os
5
+ import subprocess
6
+
7
+ import httpx
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+ MINISTACK_URL = os.getenv("MINISTACK_URL", "http://localhost:4566")
12
+
13
+
14
+ class AwsBackend:
15
+ """Backend service for executing AWS CLI commands against MiniStack."""
16
+
17
+ def __init__(self, ministack_url: str = MINISTACK_URL) -> None:
18
+ self._ministack_url = ministack_url
19
+
20
+ def reset_environment(self) -> None:
21
+ """Wipe all MiniStack service state via POST /_ministack/reset."""
22
+ try:
23
+ resp = httpx.post(
24
+ f"{self._ministack_url}/_ministack/reset", timeout=10
25
+ )
26
+ resp.raise_for_status()
27
+ logger.info("MiniStack state reset successfully")
28
+ except httpx.HTTPError as e:
29
+ logger.warning("Failed to reset MiniStack state: %s", e)
30
+ raise
31
+
32
+ def execute_command(self, command: str) -> tuple[bool, str, str]:
33
+ """Execute an AWS CLI command against MiniStack.
34
+
35
+ Args:
36
+ command: Raw AWS CLI command, e.g. 'aws s3 ls'
37
+
38
+ Returns:
39
+ Tuple of (success, stdout, stderr)
40
+ """
41
+ env = {
42
+ **os.environ,
43
+ "AWS_ENDPOINT_URL": self._ministack_url,
44
+ "AWS_ACCESS_KEY_ID": "test",
45
+ "AWS_SECRET_ACCESS_KEY": "test",
46
+ "AWS_DEFAULT_REGION": "us-east-1",
47
+ }
48
+
49
+ try:
50
+ result = subprocess.run(
51
+ command.split(),
52
+ capture_output=True,
53
+ text=True,
54
+ timeout=30,
55
+ env=env,
56
+ )
57
+ return (
58
+ result.returncode == 0,
59
+ result.stdout,
60
+ result.stderr,
61
+ )
62
+ except subprocess.TimeoutExpired:
63
+ return False, "", "Command timed out after 30s"
64
+ except Exception as e:
65
+ return False, "", str(e)
server/services/curriculum.py ADDED
@@ -0,0 +1,471 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Curriculum manager for progressive LLM training in the AWS RL environment.
2
+
3
+ Training flow:
4
+ 1. Agent starts at the warmup tier with simple listing tasks.
5
+ 2. A priority queue selects the next task based on weakness, novelty,
6
+ spaced repetition, and recency — replacing blind round-robin.
7
+ 3. Per-task mastery tracking graduates individual tasks once the agent
8
+ demonstrates sustained competence.
9
+ 4. Graduated tasks resurface via spaced repetition at exponentially
10
+ increasing intervals to prevent catastrophic forgetting.
11
+ 5. Fast-track promotion lets strong agents skip minimum episode waits.
12
+ 6. Exponential decay on history ensures recent results matter more.
13
+ """
14
+
15
+ import heapq
16
+ import logging
17
+ import random
18
+ from collections import defaultdict
19
+ from pathlib import Path
20
+
21
+ import yaml
22
+
23
+ from models import (
24
+ SetupCommand,
25
+ SpacedRepState,
26
+ SuccessCriteria,
27
+ Task,
28
+ TaskDifficulty,
29
+ TaskID,
30
+ TierConfig,
31
+ )
32
+
33
+ logger = logging.getLogger(__name__)
34
+
35
+ TASKS_DIR = Path(__file__).parent / "tasks"
36
+
37
+ # ---------------------------------------------------------------------------
38
+ # Per-tier configuration
39
+ # ---------------------------------------------------------------------------
40
+
41
+ TIER_CONFIGS: dict[TaskDifficulty, TierConfig] = {
42
+ TaskDifficulty.WARMUP: TierConfig(
43
+ min_episodes=5, advance_rate=0.6, mastery_window=10,
44
+ mastery_threshold=0.7, fast_track_rate=0.9,
45
+ ),
46
+ TaskDifficulty.BEGINNER: TierConfig(
47
+ min_episodes=5, advance_rate=0.6, mastery_window=10,
48
+ mastery_threshold=0.7, fast_track_rate=0.9,
49
+ ),
50
+ TaskDifficulty.INTERMEDIATE: TierConfig(
51
+ min_episodes=8, advance_rate=0.65, mastery_window=10,
52
+ mastery_threshold=0.7, fast_track_rate=0.9,
53
+ ),
54
+ TaskDifficulty.ADVANCED: TierConfig(
55
+ min_episodes=10, advance_rate=0.7, mastery_window=10,
56
+ mastery_threshold=0.7, fast_track_rate=0.9,
57
+ ),
58
+ TaskDifficulty.EXPERT: TierConfig(
59
+ min_episodes=0, advance_rate=1.0, mastery_window=10,
60
+ mastery_threshold=0.7, fast_track_rate=1.0,
61
+ ),
62
+ }
63
+
64
+ # Map YAML filenames to difficulty tiers
65
+ _TIER_FILES: dict[TaskDifficulty, str] = {
66
+ TaskDifficulty.WARMUP: "warmup.yaml",
67
+ TaskDifficulty.BEGINNER: "beginner.yaml",
68
+ TaskDifficulty.INTERMEDIATE: "intermediate.yaml",
69
+ TaskDifficulty.ADVANCED: "advanced.yaml",
70
+ TaskDifficulty.EXPERT: "expert.yaml",
71
+ }
72
+
73
+ # ---------------------------------------------------------------------------
74
+ # Priority score tuning constants
75
+ # ---------------------------------------------------------------------------
76
+
77
+ _NOVELTY_BONUS = 100 # untried tasks — explore first
78
+ _WEAKNESS_WEIGHT = 50 # multiplied by (1 - success_rate)
79
+ _SPACED_REP_BONUS = 30 # graduated task due for re-test
80
+ _RECENCY_PENALTY = 20 # attempted in last 2 episodes
81
+
82
+ # Exponential decay factor for weighted success rate
83
+ _DECAY_FACTOR = 0.85
84
+
85
+ # Minimum attempts before a task can be graduated
86
+ _MIN_ATTEMPTS_FOR_MASTERY = 3
87
+
88
+ # Fast-track requires at least this many episodes in the tier
89
+ _FAST_TRACK_MIN_EPISODES = 3
90
+
91
+
92
+ # ---------------------------------------------------------------------------
93
+ # YAML loader
94
+ # ---------------------------------------------------------------------------
95
+
96
+
97
+ def load_tier(difficulty: TaskDifficulty, tasks_dir: Path = TASKS_DIR) -> list[Task]:
98
+ """Load tasks for a single difficulty tier from its YAML file."""
99
+ filename = _TIER_FILES.get(difficulty)
100
+ if filename is None:
101
+ logger.warning("No file mapping for difficulty: %s", difficulty.value)
102
+ return []
103
+
104
+ filepath = tasks_dir / filename
105
+ if not filepath.exists():
106
+ logger.warning("Task file not found: %s", filepath)
107
+ return []
108
+
109
+ with open(filepath) as f:
110
+ entries = yaml.safe_load(f) or []
111
+
112
+ tasks = [
113
+ Task(
114
+ task_id=TaskID(entry["task_id"]),
115
+ difficulty=difficulty,
116
+ description=entry["description"],
117
+ success_criteria=SuccessCriteria(**entry.get("success_criteria", {})),
118
+ setup_commands=[
119
+ SetupCommand(command=cmd) if isinstance(cmd, str) else SetupCommand(**cmd)
120
+ for cmd in entry.get("setup_commands", [])
121
+ ],
122
+ )
123
+ for entry in entries
124
+ ]
125
+ logger.info("Loaded %d %s tasks from %s", len(tasks), difficulty.value, filepath.name)
126
+ return tasks
127
+
128
+
129
+ # ---------------------------------------------------------------------------
130
+ # Helpers
131
+ # ---------------------------------------------------------------------------
132
+
133
+
134
+ def _weighted_success_rate(results: list[bool], decay: float = _DECAY_FACTOR) -> float:
135
+ """Compute success rate with exponential decay — recent results matter more."""
136
+ if not results:
137
+ return 0.0
138
+ weights = [decay ** i for i in range(len(results) - 1, -1, -1)]
139
+ total_weight = sum(weights)
140
+ return sum(w * float(r) for w, r in zip(weights, results)) / total_weight
141
+
142
+
143
+ # ---------------------------------------------------------------------------
144
+ # Curriculum
145
+ # ---------------------------------------------------------------------------
146
+
147
+
148
+ class Curriculum:
149
+ """Manages progressive task assignment with priority-queue-based selection.
150
+
151
+ Features:
152
+ - Priority queue task selection (novelty, weakness, spaced rep, recency)
153
+ - Per-task mastery tracking with graduation
154
+ - Spaced repetition for graduated tasks (prevents catastrophic forgetting)
155
+ - Fast-track tier promotion for strong agents
156
+ - Exponential decay on success history
157
+ - Rich observability via get_stats()
158
+ """
159
+
160
+ def __init__(
161
+ self,
162
+ tier_configs: dict[TaskDifficulty, TierConfig] | None = None,
163
+ tasks_dir: Path = TASKS_DIR,
164
+ ) -> None:
165
+ self._tier_configs = tier_configs or TIER_CONFIGS
166
+ self._tasks_dir = tasks_dir
167
+
168
+ # Ordered difficulty progression
169
+ self._levels = list(TaskDifficulty)
170
+
171
+ # Tier tracking
172
+ self._current_level_idx: int = 0
173
+ self._tier_episodes: int = 0
174
+ self._tier_results: list[bool] = [] # results within current tier
175
+
176
+ # Per-task tracking
177
+ self._task_history: dict[TaskID, list[bool]] = defaultdict(list)
178
+ self._task_attempt_count: dict[TaskID, int] = defaultdict(int)
179
+ self._last_attempted_episode: dict[TaskID, int] = {}
180
+ self._graduated_tasks: set[TaskID] = set()
181
+ self._spaced_rep: dict[TaskID, SpacedRepState] = {}
182
+
183
+ # Global counters
184
+ self._episode_count: int = 0
185
+ self._episode_rewards: list[float] = []
186
+
187
+ # Load starting tier
188
+ self._current_tasks: list[Task] = load_tier(
189
+ self.current_difficulty, self._tasks_dir
190
+ )
191
+ self._task_map: dict[TaskID, Task] = {t.task_id: t for t in self._current_tasks}
192
+
193
+ # Priority queue: list of (-score, random_tiebreaker, task_id)
194
+ self._priority_queue: list[tuple[float, float, TaskID]] = []
195
+ self._rebuild_priority_queue()
196
+
197
+ logger.info(
198
+ "Curriculum initialised — starting at %s with %d tasks",
199
+ self.current_difficulty.value,
200
+ len(self._current_tasks),
201
+ )
202
+
203
+ # -- Properties -----------------------------------------------------------
204
+
205
+ @property
206
+ def current_difficulty(self) -> TaskDifficulty:
207
+ return self._levels[self._current_level_idx]
208
+
209
+ @property
210
+ def tier_config(self) -> TierConfig:
211
+ return self._tier_configs[self.current_difficulty]
212
+
213
+ @property
214
+ def current_level_success_rate(self) -> float:
215
+ return _weighted_success_rate(self._tier_results)
216
+
217
+ @property
218
+ def is_warmup(self) -> bool:
219
+ return self.current_difficulty == TaskDifficulty.WARMUP
220
+
221
+ # -- Public API -----------------------------------------------------------
222
+
223
+ def next_task(self) -> Task:
224
+ """Select the highest-priority task from the current tier."""
225
+ if not self._current_tasks:
226
+ self._current_tasks = load_tier(
227
+ self.current_difficulty, self._tasks_dir
228
+ )
229
+ self._task_map = {t.task_id: t for t in self._current_tasks}
230
+ self._rebuild_priority_queue()
231
+
232
+ if not self._priority_queue:
233
+ self._rebuild_priority_queue()
234
+
235
+ # Pop highest priority (most negative = highest score)
236
+ _, _, task_id = heapq.heappop(self._priority_queue)
237
+ task = self._task_map[task_id]
238
+
239
+ # If queue is now empty, rebuild for next call
240
+ if not self._priority_queue:
241
+ self._rebuild_priority_queue()
242
+
243
+ return task
244
+
245
+ def record_result(
246
+ self, task: Task, achieved: bool, reward: float = 0.0
247
+ ) -> None:
248
+ """Record episode outcome, update mastery, check promotion."""
249
+ self._episode_count += 1
250
+ self._tier_episodes += 1
251
+ self._episode_rewards.append(reward)
252
+
253
+ # Per-tier results
254
+ self._tier_results.append(achieved)
255
+
256
+ # Per-task results
257
+ self._task_history[task.task_id].append(achieved)
258
+ self._task_attempt_count[task.task_id] += 1
259
+ self._last_attempted_episode[task.task_id] = self._episode_count
260
+
261
+ # Check mastery
262
+ self._check_mastery(task.task_id)
263
+
264
+ # Check tier promotion
265
+ self._maybe_promote()
266
+
267
+ # Rebuild priority queue with updated scores
268
+ self._rebuild_priority_queue()
269
+
270
+ logger.info(
271
+ "Episode %d: task=%d difficulty=%s achieved=%s tier_rate=%.2f",
272
+ self._episode_count,
273
+ task.task_id,
274
+ task.difficulty.value,
275
+ achieved,
276
+ self.current_level_success_rate,
277
+ )
278
+
279
+ def reset(self) -> None:
280
+ """Reset curriculum back to warmup (full training restart)."""
281
+ self._current_level_idx = 0
282
+ self._tier_episodes = 0
283
+ self._tier_results.clear()
284
+ self._task_history.clear()
285
+ self._task_attempt_count.clear()
286
+ self._last_attempted_episode.clear()
287
+ self._graduated_tasks.clear()
288
+ self._spaced_rep.clear()
289
+ self._episode_count = 0
290
+ self._episode_rewards.clear()
291
+ self._current_tasks = load_tier(self.current_difficulty, self._tasks_dir)
292
+ self._task_map = {t.task_id: t for t in self._current_tasks}
293
+ self._rebuild_priority_queue()
294
+ logger.info("Curriculum reset to %s", self.current_difficulty.value)
295
+
296
+ # -- Observability --------------------------------------------------------
297
+
298
+ def get_skill_profile(self) -> dict[TaskID, float]:
299
+ """Weighted success rate per task over recent history."""
300
+ config = self.tier_config
301
+ return {
302
+ task_id: round(
303
+ _weighted_success_rate(results[-config.mastery_window:]), 2
304
+ )
305
+ for task_id, results in self._task_history.items()
306
+ if results
307
+ }
308
+
309
+ def get_weak_spots(self) -> list[TaskID]:
310
+ """Tasks in the current tier below mastery threshold."""
311
+ config = self.tier_config
312
+ profile = self.get_skill_profile()
313
+ return [
314
+ task_id
315
+ for task_id in self._task_map
316
+ if profile.get(task_id, 0.0) < config.mastery_threshold
317
+ and task_id not in self._graduated_tasks
318
+ ]
319
+
320
+ def get_stats(self) -> dict:
321
+ """Full curriculum state for logging/debugging."""
322
+ return {
323
+ "episode_count": self._episode_count,
324
+ "tier": self.current_difficulty.value,
325
+ "tier_episodes": self._tier_episodes,
326
+ "tier_success_rate": round(self.current_level_success_rate, 3),
327
+ "graduated_tasks": sorted(self._graduated_tasks),
328
+ "weak_spots": self.get_weak_spots(),
329
+ "skill_profile": self.get_skill_profile(),
330
+ "spaced_rep_due": [
331
+ int(tid)
332
+ for tid in self._task_map
333
+ if self._is_spaced_rep_due(tid)
334
+ ],
335
+ "avg_reward_last_10": round(
336
+ sum(self._episode_rewards[-10:])
337
+ / max(1, len(self._episode_rewards[-10:])),
338
+ 3,
339
+ ),
340
+ }
341
+
342
+ # -- Priority queue -------------------------------------------------------
343
+
344
+ def _compute_priority(self, task_id: TaskID) -> float:
345
+ """Compute composite priority score for a task. Higher = selected sooner."""
346
+ config = self.tier_config
347
+ score = 0.0
348
+
349
+ attempts = self._task_attempt_count.get(task_id, 0)
350
+
351
+ # Novelty: never attempted → explore first
352
+ if attempts == 0:
353
+ score += _NOVELTY_BONUS
354
+ return score # no other signals available yet
355
+
356
+ # Weakness: worse tasks get higher priority
357
+ results = self._task_history.get(task_id, [])
358
+ task_rate = _weighted_success_rate(results[-config.mastery_window:])
359
+ score += _WEAKNESS_WEIGHT * (1.0 - task_rate)
360
+
361
+ # Spaced repetition: graduated task due for re-test
362
+ if task_id in self._graduated_tasks and self._is_spaced_rep_due(task_id):
363
+ score += _SPACED_REP_BONUS
364
+
365
+ # Recency penalty: attempted in last 2 episodes
366
+ last_ep = self._last_attempted_episode.get(task_id, -100)
367
+ if self._episode_count - last_ep <= 2:
368
+ score -= _RECENCY_PENALTY
369
+
370
+ return score
371
+
372
+ def _rebuild_priority_queue(self) -> None:
373
+ """Recompute priorities for all current-tier tasks and rebuild the heap."""
374
+ self._priority_queue.clear()
375
+ for task in self._current_tasks:
376
+ score = self._compute_priority(task.task_id)
377
+ # heapq is a min-heap, so negate score for max-priority-first
378
+ # random tiebreaker prevents deterministic ordering among equal scores
379
+ heapq.heappush(
380
+ self._priority_queue,
381
+ (-score, random.random(), task.task_id),
382
+ )
383
+
384
+ # -- Mastery & spaced repetition ------------------------------------------
385
+
386
+ def _check_mastery(self, task_id: TaskID) -> None:
387
+ """Check if a task should be graduated or un-graduated."""
388
+ config = self.tier_config
389
+ results = self._task_history.get(task_id, [])
390
+ recent = results[-config.mastery_window:]
391
+
392
+ if len(recent) < _MIN_ATTEMPTS_FOR_MASTERY:
393
+ return
394
+
395
+ rate = _weighted_success_rate(recent)
396
+
397
+ if rate >= config.mastery_threshold:
398
+ if task_id not in self._graduated_tasks:
399
+ self._graduated_tasks.add(task_id)
400
+ self._spaced_rep[task_id] = SpacedRepState(
401
+ interval=3,
402
+ last_graduated_episode=self._episode_count,
403
+ )
404
+ logger.info(
405
+ "Task %d GRADUATED (rate=%.2f) — scheduling spaced repetition",
406
+ task_id,
407
+ rate,
408
+ )
409
+ else:
410
+ # Un-graduate if performance dropped
411
+ if task_id in self._graduated_tasks:
412
+ self._graduated_tasks.discard(task_id)
413
+ self._spaced_rep.pop(task_id, None)
414
+ logger.info(
415
+ "Task %d UN-GRADUATED (rate=%.2f) — resetting to active",
416
+ task_id,
417
+ rate,
418
+ )
419
+
420
+ def _is_spaced_rep_due(self, task_id: TaskID) -> bool:
421
+ """Check if a graduated task is due for a re-test."""
422
+ state = self._spaced_rep.get(task_id)
423
+ if state is None:
424
+ return False
425
+ episodes_since = self._episode_count - state.last_graduated_episode
426
+ return episodes_since >= state.interval
427
+
428
+ def _advance_spaced_rep(self, task_id: TaskID) -> None:
429
+ """Double the interval after a successful re-test."""
430
+ state = self._spaced_rep.get(task_id)
431
+ if state is not None:
432
+ state.interval = min(state.interval * 2, 48) # cap at 48 episodes
433
+ state.last_graduated_episode = self._episode_count
434
+
435
+ # -- Tier promotion -------------------------------------------------------
436
+
437
    def _maybe_promote(self) -> None:
        """Advance to the next difficulty tier if the agent is ready.

        Two paths lead to promotion, and both additionally require the tier
        success rate to reach ``config.advance_rate``:

        * normal: at least ``config.min_episodes`` episodes spent in the tier;
        * fast-track: at least ``_FAST_TRACK_MIN_EPISODES`` episodes with a
          success rate of ``config.fast_track_rate`` or better.

        On promotion the per-tier counters are reset and the next tier's
        tasks are loaded and queued.
        """
        if self._current_level_idx >= len(self._levels) - 1:
            return  # already at max tier

        config = self.tier_config
        rate = self.current_level_success_rate

        # Fast-track: high success rate after minimum 3 episodes
        fast_track = (
            self._tier_episodes >= _FAST_TRACK_MIN_EPISODES
            and rate >= config.fast_track_rate
        )

        # Normal path requires the full min_episodes dwell time; fast-track
        # bypasses that gate but not the advance_rate gate below.
        if not fast_track and self._tier_episodes < config.min_episodes:
            return

        if rate < config.advance_rate:
            return

        # Capture outgoing tier/rate for logging before mutating state.
        prev_tier = self.current_difficulty.value
        prev_rate = rate
        self._current_level_idx += 1
        self._tier_episodes = 0
        self._tier_results.clear()
        # Load the new tier's task set and rebuild the selection queue.
        self._current_tasks = load_tier(self.current_difficulty, self._tasks_dir)
        self._task_map = {t.task_id: t for t in self._current_tasks}
        self._rebuild_priority_queue()
        logger.info(
            "PROMOTED from %s to %s (rate=%.2f%s)",
            prev_tier,
            self.current_difficulty.value,
            prev_rate,
            ", FAST-TRACK" if fast_track else "",
        )
server/services/environment_designer.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Environment designer — provisions initial AWS state for each task.
2
+
3
+ Currently supports raw AWS CLI setup commands. Designed to be extended
4
+ with CloudFormation YAML template support so that each difficulty level
5
+ can declaratively define its starting infrastructure.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import logging
11
+ from enum import Enum
12
+
13
+ from pydantic import BaseModel, Field
14
+
15
+ from models import SetupCommand, Task
16
+ from server.services.aws_backend import AwsBackend
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
class ProvisionMethod(str, Enum):
    """How the initial environment state is provisioned."""

    # Raw AWS CLI setup commands executed one by one.
    CLI_COMMANDS = "cli_commands"
    # Declarative CloudFormation template (extension point — see module
    # docstring; not dispatched by EnvironmentDesigner.apply yet).
    CLOUDFORMATION = "cloudformation"
26
+
27
+
28
class ProvisionResult(BaseModel):
    """Outcome of provisioning the environment for a task."""

    # True when no non-ignored setup command failed.
    success: bool = True
    # Which provisioning strategy produced this result.
    method: ProvisionMethod = ProvisionMethod.CLI_COMMANDS
    # Number of setup commands that succeeded (proxy for resources created).
    resources_created: int = 0
    # Error messages for failed, non-ignored setup commands.
    errors: list[str] = Field(default_factory=list)
35
+
36
+
37
class EnvironmentDesigner:
    """Provisions the initial AWS state required by a task before the agent acts.

    Usage::

        designer = EnvironmentDesigner(backend)
        result = designer.apply(task)
        if not result.success:
            logger.error("Failed to set up environment: %s", result.errors)
    """

    def __init__(self, backend: AwsBackend) -> None:
        self._backend = backend

    def apply(self, task: Task) -> ProvisionResult:
        """Apply the task's environment setup to MiniStack.

        Dispatches to the appropriate provisioning method based on what the
        task defines. Currently supports ``setup_commands``; CloudFormation
        support can be added by extending this method.

        Returns:
            A ``ProvisionResult`` summarising what happened.
        """
        setup = task.setup_commands
        if not setup:
            # Nothing to provision — trivially successful.
            return ProvisionResult(resources_created=0)
        return self._apply_cli_commands(setup)

    # -- Provisioning strategies ----------------------------------------------

    def _apply_cli_commands(
        self, commands: list[SetupCommand]
    ) -> ProvisionResult:
        """Execute a list of setup commands against MiniStack."""
        created = 0
        failures: list[str] = []

        for item in commands:
            ok, _stdout, stderr = self._backend.execute_command(item.command)
            if ok:
                created += 1
                continue
            msg = f"Setup command failed: {item.command} — {stderr}"
            if item.ignore_failure:
                # Best-effort command — log and move on without recording
                # an error.
                logger.info("Ignoring failed setup command: %s", msg)
            else:
                logger.warning(msg)
                failures.append(msg)

        return ProvisionResult(
            success=not failures,
            method=ProvisionMethod.CLI_COMMANDS,
            resources_created=created,
            errors=failures,
        )
server/services/episode_tracker.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Per-episode command history tracker for multi-step task evaluation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ import re
7
+
8
+ from pydantic import BaseModel, Field
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
# AWS CLI flags whose argument names a specific resource; used by
# _command_mentions_resource to detect references to a resource by name.
_RESOURCE_FLAGS: list[str] = [
    "--bucket",
    "--table-name",
    "--function-name",
    "--queue-name",
    "--topic-name",
    "--role-name",
    "--rest-api-id",
    "--name",
    "--resource",
]
24
+
25
+
26
class StepRecord(BaseModel):
    """A single command executed within an episode."""

    # The raw AWS CLI command string as issued.
    command: str
    # Whether the command executed successfully.
    success: bool
    stdout: str = ""
    stderr: str = ""
    # 0-based position of this command within the episode.
    step_number: int = Field(ge=0)
34
+
35
+
36
+ def _parse_aws_command(command: str) -> tuple[str | None, str | None]:
37
+ """Extract (service, operation) from an AWS CLI command.
38
+
39
+ Example: 'aws s3api create-bucket --bucket foo' -> ('s3api', 'create-bucket')
40
+ """
41
+ parts = command.strip().split()
42
+ if len(parts) < 3 or parts[0] != "aws":
43
+ return None, None
44
+ return parts[1], parts[2]
45
+
46
+
47
def _command_mentions_resource(command: str, resource: str) -> bool:
    """Check if the command references a specific resource name."""
    tokens = command.strip().split()

    # Flag-then-value form: `--bucket foo`.
    for idx, token in enumerate(tokens[:-1]):
        if token in _RESOURCE_FLAGS and tokens[idx + 1] == resource:
            return True

    # key=value form: `--table-name=orders`.
    for token in tokens:
        for flag in _RESOURCE_FLAGS:
            prefix = f"{flag}="
            if token.startswith(prefix) and token[len(prefix):] == resource:
                return True

    # Fallback: resource name appears anywhere as a whole word
    # (covers ARN-like patterns and bare positional arguments).
    return re.search(rf"\b{re.escape(resource)}\b", command) is not None
64
+
65
+
66
class EpisodeTracker:
    """Tracks command history within a single episode for grading."""

    def __init__(self) -> None:
        self._history: list[StepRecord] = []
        self._step_counter: int = 0
        self._previous_progress: float = 0.0
        # (operation, resource) pairs that have already been given credit.
        self._credited_operations: set[tuple[str, str | None]] = set()

    def reset(self) -> None:
        """Clear all per-episode state ahead of a new episode."""
        self._history.clear()
        self._step_counter = 0
        self._previous_progress = 0.0
        self._credited_operations.clear()

    def record_step(
        self, command: str, success: bool, stdout: str, stderr: str
    ) -> StepRecord:
        """Append one command execution to the history and return its record."""
        record = StepRecord(
            command=command,
            success=success,
            stdout=stdout,
            stderr=stderr,
            step_number=self._step_counter,
        )
        self._history.append(record)
        self._step_counter += 1
        return record

    def has_executed_operation(
        self, operation: str, resource: str | None = None
    ) -> bool:
        """Check if a successful command matching (operation, resource) exists in history."""
        for rec in self._history:
            if not rec.success:
                continue
            if _parse_aws_command(rec.command)[1] != operation:
                continue
            # A None resource means "any resource" for this operation.
            if resource is None or _command_mentions_resource(rec.command, resource):
                return True
        return False

    def has_used_service(self, service: str) -> bool:
        """Check if any successful command targeted the given AWS service.

        Substring match, so e.g. 's3' also matches 's3api'.
        """
        for rec in self._history:
            if not rec.success:
                continue
            svc = _parse_aws_command(rec.command)[0]
            if svc is not None and service in svc:
                return True
        return False

    def is_operation_already_credited(
        self, operation: str, resource: str | None
    ) -> bool:
        """True when this (operation, resource) pair was already credited."""
        return (operation, resource) in self._credited_operations

    def credit_operation(self, operation: str, resource: str | None) -> None:
        """Mark an (operation, resource) pair as credited."""
        self._credited_operations.add((operation, resource))

    @property
    def command_history(self) -> list[StepRecord]:
        """Defensive copy of the step records recorded so far."""
        return list(self._history)

    @property
    def step_count(self) -> int:
        """Number of commands recorded this episode."""
        return self._step_counter

    @property
    def previous_progress(self) -> float:
        """Last stored partial-progress value (updated externally via the setter)."""
        return self._previous_progress

    @previous_progress.setter
    def previous_progress(self, value: float) -> None:
        self._previous_progress = value
server/services/resource_verifier.py ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Resource verification service — queries MiniStack for ground-truth state."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import logging
7
+ from typing import Any
8
+
9
+ from server.services.aws_backend import AwsBackend
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
+ def _extract_json_path(data: Any, path: str) -> Any:
15
+ """Simple JSON path extractor supporting dot notation and array indexing.
16
+
17
+ Supports paths like: $.Table.ProvisionedThroughput.ReadCapacityUnits
18
+ $.Rules[0].Expiration.Days
19
+ $.Buckets[].Name
20
+ """
21
+ parts = path.lstrip("$").lstrip(".").split(".")
22
+ current = data
23
+ for part in parts:
24
+ if current is None:
25
+ return None
26
+ # Handle array index like Rules[0]
27
+ if "[" in part:
28
+ key, idx_str = part.split("[", 1)
29
+ idx_str = idx_str.rstrip("]")
30
+ if key:
31
+ current = current.get(key) if isinstance(current, dict) else None
32
+ if current is None:
33
+ return None
34
+ if idx_str == "":
35
+ # Wildcard — return list of values
36
+ if isinstance(current, list):
37
+ remaining = ".".join(parts[parts.index(part) + 1 :])
38
+ if remaining:
39
+ return [
40
+ _extract_json_path(item, f"$.{remaining}")
41
+ for item in current
42
+ ]
43
+ return current
44
+ return None
45
+ try:
46
+ current = current[int(idx_str)]
47
+ except (IndexError, TypeError):
48
+ return None
49
+ else:
50
+ current = current.get(part) if isinstance(current, dict) else None
51
+ return current
52
+
53
+
54
class ResourceVerifier:
    """Verifies resource state by querying MiniStack via AWS CLI."""

    def __init__(self, backend: AwsBackend) -> None:
        self._backend = backend

    def resource_exists(self, service: str, name: str) -> bool:
        """Check if a specific resource exists in MiniStack.

        Dispatches to a service-specific check that verifies the exact
        resource name (not just any resource of that type). Unknown
        services log a warning and report False.
        """
        handlers = {
            "s3": self._check_s3_bucket,
            "dynamodb": self._check_dynamodb_table,
            "lambda": self._check_lambda_function,
            "sqs": self._check_sqs_queue,
            "sns": self._check_sns_topic,
            "iam": self._check_iam_role,
            "apigateway": self._check_apigateway,
        }
        handler = handlers.get(service.lower())
        if handler is None:
            logger.warning("No verifier for service: %s", service)
            return False
        return handler(name)

    def check_state(self, state_check: dict[str, Any]) -> bool:
        """Run an arbitrary command and assert on its output.

        Supported assertions:
          - ``output_contains``: substring check on stdout
          - ``json_path`` + ``expected``: extract a value from JSON stdout
            and compare (stringified) with the expected value
        """
        command = state_check.get("command", "")
        if not command:
            return False

        ok, stdout, _ = self._backend.execute_command(command)
        if not ok:
            return False

        if "output_contains" in state_check:
            if state_check["output_contains"] not in stdout:
                return False

        if "json_path" in state_check and "expected" in state_check:
            try:
                payload = json.loads(stdout)
                actual = _extract_json_path(payload, state_check["json_path"])
                # Stringified comparison tolerates int/str mismatches.
                if str(actual) != str(state_check["expected"]):
                    return False
            except (json.JSONDecodeError, KeyError, TypeError):
                return False

        return True

    # -- Service-specific verifiers -------------------------------------------

    def _command_succeeds(self, command: str) -> bool:
        """Run a command and report only whether it exited successfully."""
        ok, _, _ = self._backend.execute_command(command)
        return ok

    def _check_s3_bucket(self, name: str) -> bool:
        ok, stdout, _ = self._backend.execute_command(
            "aws s3api list-buckets --output json"
        )
        if not ok:
            return False
        try:
            entries = json.loads(stdout).get("Buckets", [])
            return any(entry.get("Name") == name for entry in entries)
        except (json.JSONDecodeError, TypeError):
            return False

    def _check_dynamodb_table(self, name: str) -> bool:
        # describe-table exits non-zero when the table does not exist.
        return self._command_succeeds(
            f"aws dynamodb describe-table --table-name {name}"
        )

    def _check_lambda_function(self, name: str) -> bool:
        return self._command_succeeds(
            f"aws lambda get-function --function-name {name}"
        )

    def _check_sqs_queue(self, name: str) -> bool:
        return self._command_succeeds(
            f"aws sqs get-queue-url --queue-name {name}"
        )

    def _check_sns_topic(self, name: str) -> bool:
        ok, stdout, _ = self._backend.execute_command(
            "aws sns list-topics --output json"
        )
        if not ok:
            return False
        try:
            # Topic names are matched as substrings of the ARN.
            topics = json.loads(stdout).get("Topics", [])
            return any(name in topic.get("TopicArn", "") for topic in topics)
        except (json.JSONDecodeError, TypeError):
            return False

    def _check_iam_role(self, name: str) -> bool:
        return self._command_succeeds(
            f"aws iam get-role --role-name {name}"
        )

    def _check_apigateway(self, name: str) -> bool:
        ok, stdout, _ = self._backend.execute_command(
            "aws apigateway get-rest-apis --output json"
        )
        if not ok:
            return False
        try:
            apis = json.loads(stdout).get("items", [])
            return any(api.get("name") == name for api in apis)
        except (json.JSONDecodeError, TypeError):
            return False
server/services/task_grader.py ADDED
@@ -0,0 +1,250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Task grading engine — evaluates task completion and computes shaped rewards.
2
+
3
+ All rewards are in the [0.0, 1.0] range. Only full task completion yields 1.0.
4
+ Includes anti-reward-hacking defenses.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import logging
10
+
11
+ from pydantic import BaseModel, Field
12
+
13
+ from models import SuccessCriteria, Task
14
+ from server.services.aws_backend import AwsBackend
15
+ from server.services.episode_tracker import EpisodeTracker, StepRecord
16
+ from server.services.resource_verifier import ResourceVerifier
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
class GradeResult(BaseModel):
    """Outcome of grading a single step."""

    # True only when the task's success criteria are fully satisfied.
    task_achieved: bool = False
    # Fraction of the task completed so far, in [0, 1].
    partial_progress: float = Field(default=0.0, ge=0.0, le=1.0)
    # Shaped reward for this step, in [0, 1]; reaches 1.0 only on full
    # completion (see TaskGrader._compute_reward).
    reward: float = Field(default=0.0, ge=0.0, le=1.0)
    # Human-readable explanation of how the grade was derived.
    reason: str = ""
28
+
29
+
30
class TaskGrader:
    """Evaluates task completion and computes shaped rewards.

    Dispatches to different grading strategies based on which fields
    are populated on the task's ``SuccessCriteria``.
    """

    def __init__(self, backend: AwsBackend) -> None:
        # All ground-truth checks go through the verifier, which queries
        # MiniStack via the AWS backend.
        self._verifier = ResourceVerifier(backend)

    def grade(
        self,
        task: Task,
        tracker: EpisodeTracker,
        latest_step: StepRecord,
    ) -> GradeResult:
        """Grade the episode after the latest step.

        Args:
            task: The task being attempted (provides ``success_criteria``).
            tracker: Per-episode command history for multi-step grading.
            latest_step: The command just executed.

        Returns:
            A ``GradeResult`` with achievement flag, partial progress,
            shaped reward, and a human-readable reason.
        """
        criteria = task.success_criteria

        # Dispatch based on populated criteria fields
        # (priority: state_checks > steps > resource_exists > command_contains).
        if criteria.state_checks:
            result = self._grade_state_checks(criteria, tracker)
        elif criteria.steps:
            result = self._grade_multi_step(criteria, tracker)
        elif criteria.resource_exists is not None:
            result = self._grade_resource_creation(criteria, latest_step)
        elif criteria.command_contains is not None:
            result = self._grade_command_match(criteria, latest_step)
        else:
            result = GradeResult(reason="no recognised success_criteria fields")

        # Compute shaped reward. NOTE: must run BEFORE the previous_progress
        # update below — the reward's delta bonus compares new progress
        # against the value carried over from the previous step.
        result.reward = self._compute_reward(result, latest_step, tracker)

        # Update tracker's previous progress (monotonic — never decrease)
        if result.partial_progress > tracker.previous_progress:
            tracker.previous_progress = result.partial_progress

        return result

    # -- Grading strategies ---------------------------------------------------

    def _grade_command_match(
        self, criteria: SuccessCriteria, latest_step: StepRecord
    ) -> GradeResult:
        """Warmup: check the latest command matches expected service + operation."""
        cmd = latest_step.command.lower()
        contains = (criteria.command_contains or "").lower()
        operation = (criteria.operation or "").lower()

        # Empty criteria strings never match (avoids trivially-true "" in cmd).
        contains_ok = contains != "" and contains in cmd
        operation_ok = operation != "" and operation in cmd
        succeeded = latest_step.success
        achieved = contains_ok and operation_ok and succeeded

        return GradeResult(
            task_achieved=achieved,
            partial_progress=1.0 if achieved else 0.0,
            reason=(
                f"command_match: contains={contains_ok}, "
                f"op={operation_ok}, success={succeeded}"
            ),
        )

    def _grade_resource_creation(
        self,
        criteria: SuccessCriteria,
        latest_step: StepRecord,
    ) -> GradeResult:
        """Beginner: verify the resource actually exists in MiniStack."""
        re_spec = criteria.resource_exists
        assert re_spec is not None  # guaranteed by grade()'s dispatch order
        service = re_spec.service
        name = re_spec.name

        # Ground truth: query MiniStack for the named resource.
        exists = self._verifier.resource_exists(service, name)

        # Command matching gives partial credit (0.5)
        contains = (criteria.command_contains or "").lower()
        operation = (criteria.operation or "").lower()
        cmd = latest_step.command.lower()
        cmd_ok = contains in cmd and operation in cmd and latest_step.success

        if exists:
            progress = 1.0
        elif cmd_ok:
            progress = 0.5
        else:
            progress = 0.0

        return GradeResult(
            task_achieved=exists,
            partial_progress=progress,
            reason=(
                f"resource_creation: exists={exists}, "
                f"cmd_ok={cmd_ok}, service={service}, name={name}"
            ),
        )

    def _grade_multi_step(
        self, criteria: SuccessCriteria, tracker: EpisodeTracker
    ) -> GradeResult:
        """Intermediate/Advanced: check ordered step completion."""
        steps = criteria.steps
        if not steps:
            return GradeResult(reason="empty steps list")

        # Count completed steps in declared order; stop at the first gap so
        # skipping ahead earns no credit.
        completed = 0
        for step in steps:
            if tracker.has_executed_operation(step.operation, step.resource):
                completed += 1
            else:
                break  # ordered — stop at first incomplete step

        total = len(steps)
        progress = completed / total if total > 0 else 0.0

        # For advanced tasks with services requirement, also check services
        # (all() is vacuously True when no services are required).
        services_required = criteria.services
        services_met = all(
            tracker.has_used_service(svc) for svc in services_required
        )

        achieved = completed == total and (not services_required or services_met)

        return GradeResult(
            task_achieved=achieved,
            partial_progress=progress,
            reason=(
                f"multi_step: {completed}/{total} steps, "
                f"services_met={services_met if services_required else 'n/a'}"
            ),
        )

    def _grade_state_checks(
        self, criteria: SuccessCriteria, tracker: EpisodeTracker
    ) -> GradeResult:
        """Expert/SRE: verify end-state via arbitrary commands.

        state_checks are the source of truth for task completion.
        steps (if present) provide partial progress signals only.
        """
        state_checks = criteria.state_checks
        steps = criteria.steps

        # Evaluate state checks (ground truth)
        checks_passed = 0
        for check in state_checks:
            # exclude_none keeps the dict to only the populated fields,
            # matching what ResourceVerifier.check_state looks for.
            check_dict = check.model_dump(exclude_none=True)
            if self._verifier.check_state(check_dict):
                checks_passed += 1

        total_checks = len(state_checks)
        all_checks_pass = checks_passed == total_checks and total_checks > 0

        # Evaluate steps for partial progress signal (ordered, as in
        # _grade_multi_step: stop at the first incomplete step).
        steps_completed = 0
        for step in steps:
            if tracker.has_executed_operation(step.operation, step.resource):
                steps_completed += 1
            else:
                break

        # Progress combines steps (for dense signal) and state checks
        total_steps = len(steps)
        if total_steps > 0:
            step_progress = steps_completed / total_steps
        else:
            step_progress = 0.0

        # Weight: steps give up to 0.7, state checks give the remaining 0.3
        if total_checks > 0:
            check_progress = checks_passed / total_checks
            progress = step_progress * 0.7 + check_progress * 0.3
        else:
            progress = step_progress

        # Check services requirement
        services_required = criteria.services
        services_met = all(
            tracker.has_used_service(svc) for svc in services_required
        )

        # Task achieved only when ALL state checks pass
        achieved = all_checks_pass and (not services_required or services_met)

        return GradeResult(
            task_achieved=achieved,
            partial_progress=min(progress, 1.0),
            reason=(
                f"state_checks: {checks_passed}/{total_checks} passed, "
                f"steps: {steps_completed}/{total_steps}, "
                f"services_met={services_met if services_required else 'n/a'}"
            ),
        )

    # -- Reward shaping -------------------------------------------------------

    def _compute_reward(
        self,
        result: GradeResult,
        latest_step: StepRecord,
        tracker: EpisodeTracker,
    ) -> float:
        """Compute a shaped reward in [0.0, 1.0].

        Full completion is the only way to earn 1.0; all partial rewards
        are capped at 0.99 (anti-reward-hacking).
        """
        if result.task_achieved:
            return 1.0

        # Base: partial progress scaled to 0.0–0.8 range
        progress_reward = result.partial_progress * 0.8

        # Bonus for advancing progress (dense signal); compares against the
        # previous step's progress, which grade() has not yet updated here.
        progress_delta = result.partial_progress - tracker.previous_progress
        if progress_delta > 0:
            progress_reward += 0.1

        # Penalty for failed commands
        if not latest_step.success:
            progress_reward *= 0.5

        # Clamp to [0.0, 0.99] — never reach 1.0 without achieving
        return min(max(progress_reward, 0.0), 0.99)
server/services/tasks/advanced.yaml ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ - task_id: 15
2
+ description: >
3
+ Create a Lambda function 'processor' with an IAM execution role,
4
+ then create an SQS queue 'work-items' and configure it as an
5
+ event source for the Lambda function.
6
+ success_criteria:
7
+ services:
8
+ - iam
9
+ - lambda
10
+ - sqs
11
+ steps:
12
+ - operation: create-role
13
+ - operation: create-function
14
+ resource: processor
15
+ - operation: create-queue
16
+ resource: work-items
17
+ - operation: create-event-source-mapping
18
+
19
+ - task_id: 16
20
+ description: >
21
+ Deploy a serverless API: create a DynamoDB table 'products',
22
+ create an IAM role for Lambda, create a Lambda function 'product-api',
23
+ and set up an API Gateway REST API with a GET method on /products
24
+ integrated with the Lambda.
25
+ success_criteria:
26
+ services:
27
+ - dynamodb
28
+ - iam
29
+ - lambda
30
+ - apigateway
31
+ steps:
32
+ - operation: create-table
33
+ resource: products
34
+ - operation: create-role
35
+ - operation: create-function
36
+ resource: product-api
37
+ - operation: create-rest-api
38
+ - operation: create-resource
39
+ - operation: put-method
40
+ - operation: put-integration
41
+
42
+ - task_id: 17
43
+ description: >
44
+ Build a fan-out notification system: create an SNS topic 'order-events',
45
+ create two SQS queues 'shipping-queue' and 'billing-queue',
46
+ subscribe both queues to the SNS topic, then publish a test message.
47
+ success_criteria:
48
+ services:
49
+ - sns
50
+ - sqs
51
+ steps:
52
+ - operation: create-topic
53
+ resource: order-events
54
+ - operation: create-queue
55
+ resource: shipping-queue
56
+ - operation: create-queue
57
+ resource: billing-queue
58
+ - operation: subscribe
59
+ - operation: subscribe
60
+ - operation: publish
server/services/tasks/beginner.yaml ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ - task_id: 6
2
+ description: Create an S3 bucket named 'my-test-bucket'.
3
+ success_criteria:
4
+ command_contains: s3api
5
+ operation: create-bucket
6
+ resource_exists:
7
+ service: s3
8
+ name: my-test-bucket
9
+
10
+ - task_id: 7
11
+ description: Create a DynamoDB table named 'users' with a partition key 'user_id' (String type).
12
+ success_criteria:
13
+ command_contains: dynamodb
14
+ operation: create-table
15
+ resource_exists:
16
+ service: dynamodb
17
+ name: users
18
+
19
+ - task_id: 8
20
+ description: Create an SQS queue named 'task-queue'.
21
+ success_criteria:
22
+ command_contains: sqs
23
+ operation: create-queue
24
+ resource_exists:
25
+ service: sqs
26
+ name: task-queue
27
+
28
+ - task_id: 9
29
+ description: Create an SNS topic named 'notifications'.
30
+ success_criteria:
31
+ command_contains: sns
32
+ operation: create-topic
33
+ resource_exists:
34
+ service: sns
35
+ name: notifications
36
+
37
+ - task_id: 10
38
+ description: Create a Lambda function named 'hello-world' using the python3.12 runtime.
39
+ success_criteria:
40
+ command_contains: lambda
41
+ operation: create-function
42
+ resource_exists:
43
+ service: lambda
44
+ name: hello-world
server/services/tasks/expert.yaml ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ - task_id: 18
2
+ description: >
3
+ SRE Incident: A Lambda function 'order-processor' exists but its IAM role
4
+ is missing the required SQS permissions. The function's event source mapping
5
+ to the 'incoming-orders' SQS queue is failing. Diagnose the issue, attach
6
+ the correct SQS policy to the role, and create the event source mapping.
7
+ setup_commands:
8
+ - >-
9
+ aws iam create-role --role-name broken-lambda-role
10
+ --assume-role-policy-document '{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Principal":{"Service":"lambda.amazonaws.com"},"Action":"sts:AssumeRole"}]}'
11
+ - >-
12
+ aws iam attach-role-policy --role-name broken-lambda-role
13
+ --policy-arn arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole
14
+ - >-
15
+ aws lambda create-function --function-name order-processor
16
+ --runtime python3.12 --handler index.handler
17
+ --role arn:aws:iam::000000000000:role/broken-lambda-role
18
+ --zip-file fileb:///tmp/dummy.zip
19
+ - aws sqs create-queue --queue-name incoming-orders
20
+ success_criteria:
21
+ services:
22
+ - iam
23
+ - lambda
24
+ - sqs
25
+ state_checks:
26
+ - command: aws iam list-attached-role-policies --role-name broken-lambda-role
27
+ output_contains: "SQS"
28
+ - command: aws lambda list-event-source-mappings --function-name order-processor
29
+ output_contains: "incoming-orders"
30
+ steps:
31
+ - operation: attach-role-policy
32
+ resource: broken-lambda-role
33
+ - operation: create-event-source-mapping
34
+
35
+ - task_id: 19
36
+ description: >
37
+ SRE Incident: An S3 bucket 'app-config-store' was created to host
38
+ configuration files, but versioning was never enabled. A recent
39
+ accidental overwrite lost critical config. Enable versioning on the
40
+ bucket and add a lifecycle rule named 'cleanup-old-versions' that
41
+ expires non-current object versions after 30 days.
42
+ setup_commands:
43
+ - aws s3api create-bucket --bucket app-config-store
44
+ - >-
45
+ aws s3api put-object --bucket app-config-store
46
+ --key config/app.json --body /dev/null
47
+ success_criteria:
48
+ services:
49
+ - s3
50
+ state_checks:
51
+ - command: aws s3api get-bucket-versioning --bucket app-config-store
52
+ output_contains: "Enabled"
53
+ - command: aws s3api get-bucket-lifecycle-configuration --bucket app-config-store
54
+ output_contains: "cleanup-old-versions"
55
+ steps:
56
+ - operation: put-bucket-versioning
57
+ resource: app-config-store
58
+ - operation: put-bucket-lifecycle-configuration
59
+ resource: app-config-store
60
+
61
+ - task_id: 20
62
+ description: >
63
+ SRE Incident: A DynamoDB table 'session-store' is experiencing throttling
64
+ because it was provisioned with only 1 RCU and 1 WCU. An SNS topic
65
+ 'ops-alerts' exists but has no subscriptions, so no one is being notified.
66
+ Fix the table by updating its throughput to 50 RCU and 50 WCU, then create
67
+ an SQS queue 'ops-alert-inbox' and subscribe it to the SNS topic.
68
+ setup_commands:
69
+ - >-
70
+ aws dynamodb create-table --table-name session-store
71
+ --attribute-definitions AttributeName=session_id,AttributeType=S
72
+ --key-schema AttributeName=session_id,KeyType=HASH
73
+ --provisioned-throughput ReadCapacityUnits=1,WriteCapacityUnits=1
74
+ - aws sns create-topic --name ops-alerts
75
+ success_criteria:
76
+ services:
77
+ - dynamodb
78
+ - sns
79
+ - sqs
80
+ state_checks:
81
+ - command: aws dynamodb describe-table --table-name session-store
82
+ json_path: "$.Table.ProvisionedThroughput.ReadCapacityUnits"
83
+ expected: 50
84
+ - command: aws dynamodb describe-table --table-name session-store
85
+ json_path: "$.Table.ProvisionedThroughput.WriteCapacityUnits"
86
+ expected: 50
87
+ - command: >-
88
+ aws sns list-subscriptions-by-topic
89
+ --topic-arn arn:aws:sns:us-east-1:000000000000:ops-alerts
90
+ output_contains: "sqs"
91
+ steps:
92
+ - operation: update-table
93
+ resource: session-store
94
+ - operation: create-queue
95
+ resource: ops-alert-inbox
96
+ - operation: subscribe
97
+ resource: ops-alerts
server/services/tasks/intermediate.yaml ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ - task_id: 11
2
+ description: Create an S3 bucket named 'data-pipeline' and upload a file to it.
3
+ success_criteria:
4
+ steps:
5
+ - operation: create-bucket
6
+ resource: data-pipeline
7
+ - operation: put-object
8
+ resource: data-pipeline
9
+
10
+ - task_id: 12
11
+ description: >
12
+ Create a DynamoDB table named 'orders' with partition key 'order_id' (S),
13
+ then insert an item with order_id '001' and status 'pending'.
14
+ success_criteria:
15
+ steps:
16
+ - operation: create-table
17
+ resource: orders
18
+ - operation: put-item
19
+ resource: orders
20
+
21
+ - task_id: 13
22
+ description: >
23
+ Create an SNS topic named 'alerts', then create an SQS queue named
24
+ 'alert-inbox' and subscribe the queue to the topic.
25
+ success_criteria:
26
+ steps:
27
+ - operation: create-topic
28
+ resource: alerts
29
+ - operation: create-queue
30
+ resource: alert-inbox
31
+ - operation: subscribe
32
+ resource: alerts
33
+
34
+ - task_id: 14
35
+ description: >
36
+ Create an IAM role named 'lambda-exec-role' with an assume-role policy
37
+ for Lambda, then attach the AWSLambdaBasicExecutionRole managed policy to it.
38
+ success_criteria:
39
+ steps:
40
+ - operation: create-role
41
+ resource: lambda-exec-role
42
+ - operation: attach-role-policy
43
+ resource: lambda-exec-role
server/services/tasks/warmup.yaml ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ - task_id: 0
2
+ description: List all S3 buckets in the environment.
3
+ success_criteria:
4
+ command_contains: s3
5
+ operation: ls
6
+
7
+ - task_id: 1
8
+ description: Describe all EC2 instances in the environment.
9
+ success_criteria:
10
+ command_contains: ec2
11
+ operation: describe-instances
12
+
13
+ - task_id: 2
14
+ description: List all DynamoDB tables.
15
+ success_criteria:
16
+ command_contains: dynamodb
17
+ operation: list-tables
18
+
19
+ - task_id: 3
20
+ description: List all Lambda functions.
21
+ success_criteria:
22
+ command_contains: lambda
23
+ operation: list-functions
24
+
25
+ - task_id: 4
26
+ description: List all SQS queues in the environment.
27
+ success_criteria:
28
+ command_contains: sqs
29
+ operation: list-queues
30
+
31
+ - task_id: 5
32
+ description: List all SNS topics in the environment.
33
+ success_criteria:
34
+ command_contains: sns
35
+ operation: list-topics
uv.lock ADDED
The diff for this file is too large to render. See raw diff