Spaces:
Running
Running
Upload folder using huggingface_hub
Browse files- Dockerfile +96 -0
- Makefile +138 -0
- README.md +211 -4
- __init__.py +16 -0
- client.py +73 -0
- inference.py +241 -0
- models.py +197 -0
- openenv.yaml +7 -0
- openenv_aws_rl_env.egg-info/PKG-INFO +14 -0
- openenv_aws_rl_env.egg-info/SOURCES.txt +19 -0
- openenv_aws_rl_env.egg-info/dependency_links.txt +1 -0
- openenv_aws_rl_env.egg-info/entry_points.txt +2 -0
- openenv_aws_rl_env.egg-info/requires.txt +10 -0
- openenv_aws_rl_env.egg-info/top_level.txt +1 -0
- pyproject.toml +57 -0
- server/__init__.py +11 -0
- server/app.py +76 -0
- server/aws_rl_env_environment.py +126 -0
- server/requirements.txt +6 -0
- server/services/__init__.py +0 -0
- server/services/aws_backend.py +65 -0
- server/services/curriculum.py +471 -0
- server/services/environment_designer.py +94 -0
- server/services/episode_tracker.py +145 -0
- server/services/resource_verifier.py +179 -0
- server/services/task_grader.py +250 -0
- server/services/tasks/advanced.yaml +60 -0
- server/services/tasks/beginner.yaml +44 -0
- server/services/tasks/expert.yaml +97 -0
- server/services/tasks/intermediate.yaml +43 -0
- server/services/tasks/warmup.yaml +35 -0
- uv.lock +0 -0
Dockerfile
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
# Multi-stage build using openenv-base
|
| 8 |
+
# This Dockerfile is flexible and works for both:
|
| 9 |
+
# - In-repo environments (with local OpenEnv sources)
|
| 10 |
+
# - Standalone environments (with openenv from PyPI/Git)
|
| 11 |
+
# The build script (openenv build) handles context detection and sets appropriate build args.
|
| 12 |
+
|
| 13 |
+
ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
|
| 14 |
+
FROM ${BASE_IMAGE} AS builder
|
| 15 |
+
|
| 16 |
+
WORKDIR /app
|
| 17 |
+
|
| 18 |
+
# Ensure git is available (required for installing dependencies from VCS)
|
| 19 |
+
RUN apt-get update && \
|
| 20 |
+
apt-get install -y --no-install-recommends git && \
|
| 21 |
+
rm -rf /var/lib/apt/lists/*
|
| 22 |
+
|
| 23 |
+
# Build argument to control whether we're building standalone or in-repo
|
| 24 |
+
ARG BUILD_MODE=in-repo
|
| 25 |
+
ARG ENV_NAME=aws_rl_env
|
| 26 |
+
|
| 27 |
+
# Copy environment code (always at root of build context)
|
| 28 |
+
COPY . /app/env
|
| 29 |
+
|
| 30 |
+
# For in-repo builds, openenv is already vendored in the build context
|
| 31 |
+
# For standalone builds, openenv will be installed via pyproject.toml
|
| 32 |
+
WORKDIR /app/env
|
| 33 |
+
|
| 34 |
+
# Ensure uv is available (for local builds where base image lacks it)
|
| 35 |
+
RUN if ! command -v uv >/dev/null 2>&1; then \
|
| 36 |
+
curl -LsSf https://astral.sh/uv/install.sh | sh && \
|
| 37 |
+
mv /root/.local/bin/uv /usr/local/bin/uv && \
|
| 38 |
+
mv /root/.local/bin/uvx /usr/local/bin/uvx; \
|
| 39 |
+
fi
|
| 40 |
+
|
| 41 |
+
# Install dependencies using uv sync
|
| 42 |
+
# If uv.lock exists, use it; otherwise resolve on the fly
|
| 43 |
+
RUN --mount=type=cache,target=/root/.cache/uv \
|
| 44 |
+
if [ -f uv.lock ]; then \
|
| 45 |
+
uv sync --frozen --no-install-project --no-editable; \
|
| 46 |
+
else \
|
| 47 |
+
uv sync --no-install-project --no-editable; \
|
| 48 |
+
fi
|
| 49 |
+
|
| 50 |
+
RUN --mount=type=cache,target=/root/.cache/uv \
|
| 51 |
+
if [ -f uv.lock ]; then \
|
| 52 |
+
uv sync --frozen --no-editable; \
|
| 53 |
+
else \
|
| 54 |
+
uv sync --no-editable; \
|
| 55 |
+
fi
|
| 56 |
+
|
| 57 |
+
# Final runtime stage
|
| 58 |
+
FROM ${BASE_IMAGE}
|
| 59 |
+
|
| 60 |
+
WORKDIR /app
|
| 61 |
+
|
| 62 |
+
# Copy the uv-managed Python interpreter from builder
|
| 63 |
+
COPY --from=builder /root/.local/share/uv/python /root/.local/share/uv/python
|
| 64 |
+
|
| 65 |
+
# Copy the virtual environment from builder
|
| 66 |
+
COPY --from=builder /app/env/.venv /app/.venv
|
| 67 |
+
|
| 68 |
+
# Copy the environment code
|
| 69 |
+
COPY --from=builder /app/env /app/env
|
| 70 |
+
|
| 71 |
+
# Install AWS CLI
|
| 72 |
+
RUN apt-get update && \
|
| 73 |
+
apt-get install -y --no-install-recommends awscli && \
|
| 74 |
+
rm -rf /var/lib/apt/lists/*
|
| 75 |
+
|
| 76 |
+
# Configure AWS CLI to point to MiniStack
|
| 77 |
+
RUN mkdir -p /root/.aws && \
|
| 78 |
+
printf '[default]\nregion = us-east-1\noutput = json\n' > /root/.aws/config && \
|
| 79 |
+
printf '[default]\naws_access_key_id = test\naws_secret_access_key = test\n' > /root/.aws/credentials
|
| 80 |
+
ENV AWS_ENDPOINT_URL=http://localhost:4566
|
| 81 |
+
|
| 82 |
+
# Enable the web interface for OpenEnv (if applicable)
|
| 83 |
+
# ENV ENABLE_WEB_INTERFACE=true
|
| 84 |
+
|
| 85 |
+
# Set PATH to use the virtual environment
|
| 86 |
+
ENV PATH="/app/.venv/bin:$PATH"
|
| 87 |
+
|
| 88 |
+
# Set PYTHONPATH so imports work correctly
|
| 89 |
+
ENV PYTHONPATH="/app/env:$PYTHONPATH"
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
# DEV_MODE=1 enables live reload via --reload flag
|
| 93 |
+
ENV DEV_MODE=0
|
| 94 |
+
|
| 95 |
+
# Entrypoint: start MiniStack in background, then run the FastAPI server
|
| 96 |
+
CMD ["sh", "-c", "ministack & sleep 2 && uvicorn server.app:app --host 0.0.0.0 --port 8000 $([ \"$DEV_MODE\" = '1' ] && echo '--reload --reload-dir /app/env')"]
|
Makefile
ADDED
|
@@ -0,0 +1,138 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Project settings
|
| 2 |
+
PROJECT_NAME := openenv-aws_rl_env
|
| 3 |
+
PYTHON := python3
|
| 4 |
+
UV := uv
|
| 5 |
+
DOCKER_IMAGE := aws-rl-env
|
| 6 |
+
DOCKER_TAG := latest
|
| 7 |
+
SERVER_HOST := 0.0.0.0
|
| 8 |
+
SERVER_PORT := 8000
|
| 9 |
+
|
| 10 |
+
.DEFAULT_GOAL := help
|
| 11 |
+
|
| 12 |
+
# ──────────────────────────────────────────────
|
| 13 |
+
# Setup & Dependencies
|
| 14 |
+
# ──────────────────────────────────────────────
|
| 15 |
+
|
| 16 |
+
.PHONY: install
|
| 17 |
+
install: ## Install project dependencies
|
| 18 |
+
$(UV) sync --frozen
|
| 19 |
+
|
| 20 |
+
.PHONY: install-dev
|
| 21 |
+
install-dev: ## Install project with dev dependencies
|
| 22 |
+
$(UV) sync --frozen --all-extras
|
| 23 |
+
|
| 24 |
+
.PHONY: lock
|
| 25 |
+
lock: ## Update the lockfile
|
| 26 |
+
$(UV) lock
|
| 27 |
+
|
| 28 |
+
# ──────────────────────────────────────────────
|
| 29 |
+
# Development
|
| 30 |
+
# ──────────────────────────────────────────────
|
| 31 |
+
|
| 32 |
+
.PHONY: run
|
| 33 |
+
run: ## Run with MiniStack + FastAPI server (mirrors Docker CMD)
|
| 34 |
+
ministack & sleep 2 && $(UV) run uvicorn server.app:app --host $(SERVER_HOST) --port $(SERVER_PORT)
|
| 35 |
+
|
| 36 |
+
# ──────────────────────────────────────────────
|
| 37 |
+
# Code Quality
|
| 38 |
+
# ──────────────────────────────────────────────
|
| 39 |
+
|
| 40 |
+
.PHONY: format
|
| 41 |
+
format: ## Format code with ruff
|
| 42 |
+
$(UV) run ruff format .
|
| 43 |
+
|
| 44 |
+
.PHONY: lint
|
| 45 |
+
lint: ## Lint code with ruff
|
| 46 |
+
$(UV) run ruff check .
|
| 47 |
+
|
| 48 |
+
.PHONY: lint-fix
|
| 49 |
+
lint-fix: ## Lint and auto-fix code with ruff
|
| 50 |
+
$(UV) run ruff check --fix .
|
| 51 |
+
|
| 52 |
+
.PHONY: typecheck
|
| 53 |
+
typecheck: ## Run type checking with mypy
|
| 54 |
+
$(UV) run mypy
|
| 55 |
+
|
| 56 |
+
.PHONY: check
|
| 57 |
+
check: lint typecheck
|
| 58 |
+
|
| 59 |
+
# ──────────────────────────────────────────────
|
| 60 |
+
# Docker
|
| 61 |
+
# ──────────────────────────────────────────────
|
| 62 |
+
|
| 63 |
+
.PHONY: docker-build
|
| 64 |
+
docker-build: ## Build Docker image
|
| 65 |
+
docker build -t $(DOCKER_IMAGE):$(DOCKER_TAG) .
|
| 66 |
+
|
| 67 |
+
.PHONY: docker-run
|
| 68 |
+
docker-run: ## Run Docker container
|
| 69 |
+
docker run --rm -p $(SERVER_PORT):8000 $(DOCKER_IMAGE):$(DOCKER_TAG)
|
| 70 |
+
|
| 71 |
+
.PHONY: docker-run-dev
|
| 72 |
+
docker-run-dev: ## Run Docker container in dev mode with live reload
|
| 73 |
+
docker run --rm -p $(SERVER_PORT):8000 -v $(PWD):/app/env -v /app/env/.venv -e DEV_MODE=1 $(DOCKER_IMAGE):$(DOCKER_TAG)
|
| 74 |
+
|
| 75 |
+
.PHONY: docker-run-detach
|
| 76 |
+
docker-run-detach: ## Run Docker container in background
|
| 77 |
+
docker run -d --rm -p $(SERVER_PORT):8000 --name $(DOCKER_IMAGE) $(DOCKER_IMAGE):$(DOCKER_TAG)
|
| 78 |
+
|
| 79 |
+
.PHONY: docker-stop
|
| 80 |
+
docker-stop: ## Stop the running Docker container
|
| 81 |
+
docker stop $(DOCKER_IMAGE)
|
| 82 |
+
|
| 83 |
+
.PHONY: docker-logs
|
| 84 |
+
docker-logs: ## Tail logs from the running Docker container
|
| 85 |
+
docker logs -f $(DOCKER_IMAGE)
|
| 86 |
+
|
| 87 |
+
.PHONY: docker-shell
|
| 88 |
+
docker-shell: ## Open a shell in the running Docker container
|
| 89 |
+
docker exec -it $(DOCKER_IMAGE) /bin/bash
|
| 90 |
+
|
| 91 |
+
.PHONY: docker-clean
|
| 92 |
+
docker-clean: ## Stop and remove all running containers for this image
|
| 93 |
+
@docker ps -q --filter ancestor=$(DOCKER_IMAGE):$(DOCKER_TAG) | xargs -r docker rm -f
|
| 94 |
+
|
| 95 |
+
.PHONY: docker-health
|
| 96 |
+
docker-health: ## Check health of the running container
|
| 97 |
+
@curl -sf http://localhost:$(SERVER_PORT)/health && echo " OK" || echo " FAIL"
|
| 98 |
+
|
| 99 |
+
# ──────────────────────────────────────────────
|
| 100 |
+
# OpenEnv
|
| 101 |
+
# ──────────────────────────────────────────────
|
| 102 |
+
|
| 103 |
+
.PHONY: openenv-validate
|
| 104 |
+
openenv-validate: ## Validate the OpenEnv configuration
|
| 105 |
+
openenv validate
|
| 106 |
+
|
| 107 |
+
.PHONY: openenv-build
|
| 108 |
+
openenv-build: ## Build the environment using OpenEnv CLI
|
| 109 |
+
openenv build
|
| 110 |
+
|
| 111 |
+
.PHONY: openenv-push
|
| 112 |
+
openenv-push: ## Push the environment to Hugging Face Spaces
|
| 113 |
+
openenv push
|
| 114 |
+
|
| 115 |
+
# ──────────────────────────────────────────────
|
| 116 |
+
# Cleanup
|
| 117 |
+
# ──────────────────────────────────────────────
|
| 118 |
+
|
| 119 |
+
.PHONY: clean
|
| 120 |
+
clean: ## Remove build artifacts and caches
|
| 121 |
+
rm -rf build/ dist/ *.egg-info .eggs/
|
| 122 |
+
rm -rf .pytest_cache/ .mypy_cache/ .ruff_cache/
|
| 123 |
+
rm -rf htmlcov/ .coverage coverage.xml
|
| 124 |
+
find . -type d -name __pycache__ -exec rm -rf {} + 2>/dev/null || true
|
| 125 |
+
find . -type f -name '*.pyc' -delete 2>/dev/null || true
|
| 126 |
+
|
| 127 |
+
.PHONY: clean-all
|
| 128 |
+
clean-all: clean ## Remove all artifacts including venv
|
| 129 |
+
rm -rf .venv/
|
| 130 |
+
|
| 131 |
+
# ──────────────────────────────────────────────
|
| 132 |
+
# Help
|
| 133 |
+
# ──────────────────────���───────────────────────
|
| 134 |
+
|
| 135 |
+
.PHONY: help
|
| 136 |
+
help: ## Show this help message
|
| 137 |
+
@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | \
|
| 138 |
+
awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}'
|
README.md
CHANGED
|
@@ -1,10 +1,217 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
colorTo: pink
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
---
|
| 9 |
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: AWS RL Environment Server
|
| 3 |
+
emoji: 🥇
|
| 4 |
+
colorFrom: pink
|
| 5 |
colorTo: pink
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
+
app_port: 8000
|
| 9 |
+
base_path: /web
|
| 10 |
+
tags:
|
| 11 |
+
- openenv
|
| 12 |
---
|
| 13 |
|
| 14 |
+
# AWS RL Environment
|
| 15 |
+
|
| 16 |
+
An RL environment backed by a **simulated AWS cloud** powered by [MiniStack](https://github.com/Nahuel990/ministack). The agent sends AWS API calls as actions and receives API responses as observations. MiniStack runs inside the same Docker container, emulating 34 AWS services locally.
|
| 17 |
+
|
| 18 |
+
## Quick Start
|
| 19 |
+
|
| 20 |
+
```python
|
| 21 |
+
from aws_rl_env import AwsRlAction, AwsRlEnv
|
| 22 |
+
|
| 23 |
+
try:
|
| 24 |
+
# Create environment from Docker image
|
| 25 |
+
env = AwsRlEnv.from_docker_image("aws_rl_env-env:latest")
|
| 26 |
+
|
| 27 |
+
# Reset
|
| 28 |
+
result = env.reset()
|
| 29 |
+
print(f"Episode: {result.observation.episode_id}")
|
| 30 |
+
|
| 31 |
+
# Create an S3 bucket
|
| 32 |
+
result = env.step(AwsRlAction(command="aws s3 mb s3://my-rl-bucket"))
|
| 33 |
+
print(f"Create bucket success: {result.observation.command_success}")
|
| 34 |
+
print(f"Output: {result.observation.command_output}")
|
| 35 |
+
|
| 36 |
+
# Upload a file to the bucket
|
| 37 |
+
result = env.step(AwsRlAction(command="aws s3 cp hello.txt s3://my-rl-bucket/"))
|
| 38 |
+
print(f"Upload success: {result.observation.command_success}")
|
| 39 |
+
|
| 40 |
+
# List buckets
|
| 41 |
+
result = env.step(AwsRlAction(command="aws s3 ls"))
|
| 42 |
+
print(f"Buckets: {result.observation.command_output}")
|
| 43 |
+
|
| 44 |
+
# Describe EC2 instances
|
| 45 |
+
result = env.step(AwsRlAction(command="aws ec2 describe-instances"))
|
| 46 |
+
print(f"EC2 output: {result.observation.command_output}")
|
| 47 |
+
|
| 48 |
+
# Check current task and resource state
|
| 49 |
+
print(f"Task: {result.observation.task}")
|
| 50 |
+
print(f"Task achieved: {result.observation.task_achieved}")
|
| 51 |
+
print(f"Resources: {result.observation.resources}")
|
| 52 |
+
|
| 53 |
+
finally:
|
| 54 |
+
env.close()
|
| 55 |
+
```
|
| 56 |
+
|
| 57 |
+
## Supported AWS Services
|
| 58 |
+
|
| 59 |
+
The environment supports **34 AWS services** via MiniStack:
|
| 60 |
+
|
| 61 |
+
| Category | Services |
|
| 62 |
+
|----------|----------|
|
| 63 |
+
| **Storage & DB** | S3, DynamoDB, RDS, ElastiCache, EFS |
|
| 64 |
+
| **Compute** | Lambda, ECS, EC2, Step Functions |
|
| 65 |
+
| **Messaging** | SQS, SNS, Kinesis, EventBridge, Firehose |
|
| 66 |
+
| **API** | API Gateway v1/v2, ALB/ELBv2 |
|
| 67 |
+
| **Security** | IAM, STS, Cognito, ACM, WAF v2, Secrets Manager |
|
| 68 |
+
| **Monitoring** | CloudWatch, CloudWatch Logs, SSM |
|
| 69 |
+
| **Infrastructure** | CloudFormation, Route53 |
|
| 70 |
+
| **Other** | SES, Athena, Glue, EMR |
|
| 71 |
+
|
| 72 |
+
## Building the Docker Image
|
| 73 |
+
|
| 74 |
+
```bash
|
| 75 |
+
docker build -t aws_rl_env-env:latest -f Dockerfile .
|
| 76 |
+
```
|
| 77 |
+
|
| 78 |
+
The Docker image bundles:
|
| 79 |
+
- The RL environment server (port 8000)
|
| 80 |
+
- MiniStack AWS emulator (port 4566)
|
| 81 |
+
- boto3 for AWS SDK access
|
| 82 |
+
- All MiniStack dependencies
|
| 83 |
+
|
| 84 |
+
## Environment Details
|
| 85 |
+
|
| 86 |
+
### Core Types
|
| 87 |
+
|
| 88 |
+
- `TaskID` — Unique task identifier (int)
|
| 89 |
+
- `EpisodeID` — Unique episode identifier (str)
|
| 90 |
+
- `StepCount` — Step counter within an episode (int)
|
| 91 |
+
- `AwsService` — Supported AWS services: `s3`, `ec2`, `dynamodb`, `lambda`
|
| 92 |
+
|
| 93 |
+
### Task
|
| 94 |
+
|
| 95 |
+
**Task**: Defines what the RL agent must accomplish
|
| 96 |
+
|
| 97 |
+
- `task_id` (TaskID) — Unique task identifier
|
| 98 |
+
- `difficulty` (TaskDifficulty) — One of: `warmup`, `beginner`, `intermediate`, `advanced`, `expert`
|
| 99 |
+
- `description` (str) — Human-readable task description
|
| 100 |
+
- `success_criteria` (dict) — Machine-readable criteria to evaluate task completion
|
| 101 |
+
|
| 102 |
+
### Action
|
| 103 |
+
|
| 104 |
+
**AwsRlAction**: An AWS CLI command to execute against MiniStack
|
| 105 |
+
|
| 106 |
+
- `command` (str) — AWS CLI command to execute, e.g. `"aws s3 ls"`, `"aws ec2 describe-instances"`
|
| 107 |
+
|
| 108 |
+
### Observation
|
| 109 |
+
|
| 110 |
+
**AwsRlObservation**: The result returned after each step
|
| 111 |
+
|
| 112 |
+
- `episode_id` (EpisodeID) — Unique identifier for the episode
|
| 113 |
+
- `step_count` (StepCount) — Current step count in the episode
|
| 114 |
+
- `command_success` (bool) — Whether the CLI command executed successfully
|
| 115 |
+
- `command_output` (str) — Stdout from the executed AWS CLI command
|
| 116 |
+
- `error` (str) — Stderr if the command failed
|
| 117 |
+
- `resources` (dict[AwsService, dict | list | str]) — Current resource state from MiniStack, keyed by service name
|
| 118 |
+
- `task` (Task | None) — The task the agent is trying to accomplish
|
| 119 |
+
- `task_achieved` (bool) — Whether the task has been achieved
|
| 120 |
+
|
| 121 |
+
## Architecture
|
| 122 |
+
|
| 123 |
+
```
|
| 124 |
+
┌─────────────────────────────────────────┐
|
| 125 |
+
│ Docker Container │
|
| 126 |
+
│ │
|
| 127 |
+
│ ┌──────────────┐ ┌───────────────┐ │
|
| 128 |
+
│ │ RL Server │ │ MiniStack │ │
|
| 129 |
+
│ │ (port 8000) │──▶│ (port 4566) │ │
|
| 130 |
+
│ │ FastAPI + │ │ 34 AWS │ │
|
| 131 |
+
│ │ WebSocket │ │ services │ │
|
| 132 |
+
│ └──────────────┘ └───────────────┘ │
|
| 133 |
+
│ │ │ │
|
| 134 |
+
│ │ boto3 calls │ │
|
| 135 |
+
│ └──��─────────────────┘ │
|
| 136 |
+
└─────────────────────────────────────────┘
|
| 137 |
+
▲
|
| 138 |
+
│ WebSocket / HTTP
|
| 139 |
+
│
|
| 140 |
+
RL Agent (client)
|
| 141 |
+
```
|
| 142 |
+
|
| 143 |
+
## Advanced Usage
|
| 144 |
+
|
| 145 |
+
### Connecting to an Existing Server
|
| 146 |
+
|
| 147 |
+
```python
|
| 148 |
+
from aws_rl_env import AwsRlAction, AwsRlEnv
|
| 149 |
+
|
| 150 |
+
env = AwsRlEnv(base_url="http://localhost:8000")
|
| 151 |
+
result = env.reset()
|
| 152 |
+
|
| 153 |
+
# Create a DynamoDB table
|
| 154 |
+
result = env.step(AwsRlAction(
|
| 155 |
+
command="aws dynamodb create-table --table-name my-table --key-schema AttributeName=id,KeyType=HASH --attribute-definitions AttributeName=id,AttributeType=S --billing-mode PAY_PER_REQUEST"
|
| 156 |
+
))
|
| 157 |
+
print(f"Table created: {result.observation.command_success}")
|
| 158 |
+
print(f"Output: {result.observation.command_output}")
|
| 159 |
+
```
|
| 160 |
+
|
| 161 |
+
### Concurrent Sessions
|
| 162 |
+
|
| 163 |
+
```python
|
| 164 |
+
from aws_rl_env import AwsRlAction, AwsRlEnv
|
| 165 |
+
from concurrent.futures import ThreadPoolExecutor
|
| 166 |
+
|
| 167 |
+
def run_episode(client_id: int):
|
| 168 |
+
with AwsRlEnv(base_url="http://localhost:8000") as env:
|
| 169 |
+
result = env.reset()
|
| 170 |
+
for i in range(10):
|
| 171 |
+
result = env.step(AwsRlAction(
|
| 172 |
+
command=f"aws s3api put-object --bucket client-{client_id} --key step-{i}.txt --body 'data from step {i}'"
|
| 173 |
+
))
|
| 174 |
+
return client_id, result.observation.command_success
|
| 175 |
+
|
| 176 |
+
with ThreadPoolExecutor(max_workers=4) as executor:
|
| 177 |
+
results = list(executor.map(run_episode, range(4)))
|
| 178 |
+
```
|
| 179 |
+
|
| 180 |
+
### Running Locally (without Docker)
|
| 181 |
+
|
| 182 |
+
Start MiniStack and the RL server separately:
|
| 183 |
+
|
| 184 |
+
```bash
|
| 185 |
+
# Terminal 1: Start MiniStack
|
| 186 |
+
pip install ministack
|
| 187 |
+
ministack # Runs on port 4566
|
| 188 |
+
|
| 189 |
+
# Terminal 2: Start RL server
|
| 190 |
+
export AWS_ENDPOINT_URL=http://localhost:4566
|
| 191 |
+
export AWS_ACCESS_KEY_ID=test
|
| 192 |
+
export AWS_SECRET_ACCESS_KEY=test
|
| 193 |
+
uvicorn server.app:app --reload --host 0.0.0.0 --port 8000
|
| 194 |
+
```
|
| 195 |
+
|
| 196 |
+
## Project Structure
|
| 197 |
+
|
| 198 |
+
```
|
| 199 |
+
aws_rl_env/
|
| 200 |
+
├── __init__.py # Module exports
|
| 201 |
+
├── README.md # This file
|
| 202 |
+
├── Dockerfile # Container image (bundles RL server + MiniStack)
|
| 203 |
+
├── entrypoint.sh # Starts MiniStack then RL server
|
| 204 |
+
├── openenv.yaml # OpenEnv manifest
|
| 205 |
+
├── pyproject.toml # Project metadata and dependencies
|
| 206 |
+
├── uv.lock # Locked dependencies
|
| 207 |
+
├── client.py # AwsRlEnv client
|
| 208 |
+
├── models.py # AwsRlAction and AwsRlObservation models
|
| 209 |
+
├── ministack/ # MiniStack AWS emulator (bundled)
|
| 210 |
+
│ ├── app.py # MiniStack ASGI application
|
| 211 |
+
│ ├── core/ # Routing, persistence, responses
|
| 212 |
+
│ └── services/ # 34 AWS service implementations
|
| 213 |
+
└── server/
|
| 214 |
+
├── __init__.py
|
| 215 |
+
├── aws_rl_env_environment.py # Core RL environment (uses boto3 → MiniStack)
|
| 216 |
+
└── app.py # FastAPI application
|
| 217 |
+
```
|
__init__.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""Aws Rl Env Environment."""
|
| 8 |
+
|
| 9 |
+
from .client import AwsRlEnv
|
| 10 |
+
from .models import AwsRlAction, AwsRlObservation
|
| 11 |
+
|
| 12 |
+
__all__ = [
|
| 13 |
+
"AwsRlAction",
|
| 14 |
+
"AwsRlObservation",
|
| 15 |
+
"AwsRlEnv",
|
| 16 |
+
]
|
client.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""Aws Rl Env Environment Client."""
|
| 8 |
+
|
| 9 |
+
from typing import Dict
|
| 10 |
+
|
| 11 |
+
from openenv.core import EnvClient
|
| 12 |
+
from openenv.core.client_types import StepResult
|
| 13 |
+
from openenv.core.env_server.types import State
|
| 14 |
+
|
| 15 |
+
from models import AwsRlAction, AwsRlObservation, EpisodeID, StepCount
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class AwsRlEnv(EnvClient[AwsRlAction, AwsRlObservation, State]):
|
| 19 |
+
"""
|
| 20 |
+
Client for the Aws Rl Env Environment.
|
| 21 |
+
|
| 22 |
+
This client maintains a persistent WebSocket connection to the environment server,
|
| 23 |
+
enabling efficient multi-step interactions with lower latency.
|
| 24 |
+
Each client instance has its own dedicated environment session on the server.
|
| 25 |
+
|
| 26 |
+
Example:
|
| 27 |
+
>>> with AwsRlEnv(base_url="http://localhost:8000") as client:
|
| 28 |
+
... result = client.reset()
|
| 29 |
+
... print(result.observation.command_output)
|
| 30 |
+
...
|
| 31 |
+
... result = client.step(AwsRlAction(command="aws s3 ls"))
|
| 32 |
+
... print(result.observation.command_output)
|
| 33 |
+
|
| 34 |
+
Example with Docker:
|
| 35 |
+
>>> client = AwsRlEnv.from_docker_image("aws_rl_env-env:latest")
|
| 36 |
+
>>> try:
|
| 37 |
+
... result = client.reset()
|
| 38 |
+
... result = client.step(AwsRlAction(command="aws s3 ls"))
|
| 39 |
+
... finally:
|
| 40 |
+
... client.close()
|
| 41 |
+
"""
|
| 42 |
+
|
| 43 |
+
def _step_payload(self, action: AwsRlAction) -> Dict:
|
| 44 |
+
"""Convert AwsRlAction to JSON payload for step message."""
|
| 45 |
+
return {"command": action.command}
|
| 46 |
+
|
| 47 |
+
def _parse_result(self, payload: Dict) -> StepResult[AwsRlObservation]:
|
| 48 |
+
"""Parse server response into StepResult[AwsRlObservation]."""
|
| 49 |
+
obs_data = payload.get("observation", {})
|
| 50 |
+
observation = AwsRlObservation(
|
| 51 |
+
episode_id=EpisodeID(obs_data.get("episode_id", "")),
|
| 52 |
+
step_count=StepCount(obs_data.get("step_count", 0)),
|
| 53 |
+
command_success=obs_data.get("command_success", False),
|
| 54 |
+
command_output=obs_data.get("command_output", ""),
|
| 55 |
+
error=obs_data.get("error", ""),
|
| 56 |
+
task=obs_data.get("task"),
|
| 57 |
+
task_achieved=obs_data.get("task_achieved", False),
|
| 58 |
+
done=payload.get("done", False),
|
| 59 |
+
reward=payload.get("reward", 0.0),
|
| 60 |
+
)
|
| 61 |
+
|
| 62 |
+
return StepResult(
|
| 63 |
+
observation=observation,
|
| 64 |
+
reward=payload.get("reward", 0.0),
|
| 65 |
+
done=payload.get("done", False),
|
| 66 |
+
)
|
| 67 |
+
|
| 68 |
+
def _parse_state(self, payload: Dict) -> State:
|
| 69 |
+
"""Parse server response into State object."""
|
| 70 |
+
return State(
|
| 71 |
+
episode_id=payload.get("episode_id"),
|
| 72 |
+
step_count=payload.get("step_count", 0),
|
| 73 |
+
)
|
inference.py
ADDED
|
@@ -0,0 +1,241 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Inference Script Example
|
| 3 |
+
===================================
|
| 4 |
+
MANDATORY
|
| 5 |
+
- Before submitting, ensure the following variables are defined in your environment configuration:
|
| 6 |
+
API_BASE_URL The API endpoint for the LLM.
|
| 7 |
+
MODEL_NAME The model identifier to use for inference.
|
| 8 |
+
HF_TOKEN Your Hugging Face / API key.
|
| 9 |
+
LOCAL_IMAGE_NAME The name of the local image to use for the environment if you are using from_docker_image()
|
| 10 |
+
method
|
| 11 |
+
|
| 12 |
+
- Defaults are set only for API_BASE_URL and MODEL_NAME
|
| 13 |
+
(and should reflect your active inference setup):
|
| 14 |
+
API_BASE_URL = os.getenv("API_BASE_URL", "<your-active-endpoint>")
|
| 15 |
+
MODEL_NAME = os.getenv("MODEL_NAME", "<your-active-model>")
|
| 16 |
+
|
| 17 |
+
- The inference script must be named `inference.py` and placed in the root directory of the project
|
| 18 |
+
- Participants must use OpenAI Client for all LLM calls using above variables
|
| 19 |
+
|
| 20 |
+
STDOUT FORMAT
|
| 21 |
+
- The script must emit exactly three line types to stdout, in this order:
|
| 22 |
+
|
| 23 |
+
[START] task=<task_name> env=<benchmark> model=<model_name>
|
| 24 |
+
[STEP] step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
|
| 25 |
+
[END] success=<true|false> steps=<n> rewards=<r1,r2,...,rn>
|
| 26 |
+
|
| 27 |
+
Rules:
|
| 28 |
+
- One [START] line at episode begin.
|
| 29 |
+
- One [STEP] line per step, immediately after env.step() returns.
|
| 30 |
+
- One [END] line after env.close(), always emitted (even on exception).
|
| 31 |
+
- reward and rewards are formatted to 2 decimal places.
|
| 32 |
+
- done and success are lowercase booleans: true or false.
|
| 33 |
+
- error is the raw last_action_error string, or null if none.
|
| 34 |
+
- All fields on a single line with no newlines within a line.
|
| 35 |
+
|
| 36 |
+
Example:
|
| 37 |
+
[START] task=create-s3-bucket env=aws_rl_env model=Qwen2.5-72B-Instruct
|
| 38 |
+
[STEP] step=1 action=aws s3api create-bucket --bucket my-test-bucket reward=1.00 done=false error=null
|
| 39 |
+
[END] success=true steps=1 rewards=1.00
|
| 40 |
+
"""
|
| 41 |
+
|
| 42 |
+
import asyncio
|
| 43 |
+
import os
|
| 44 |
+
import textwrap
|
| 45 |
+
from typing import List, Optional
|
| 46 |
+
|
| 47 |
+
from dotenv import load_dotenv
|
| 48 |
+
from openai import OpenAI
|
| 49 |
+
|
| 50 |
+
from client import AwsRlEnv
|
| 51 |
+
from models import AwsRlAction
|
| 52 |
+
|
| 53 |
+
load_dotenv() # Load variables from .env file if present
|
| 54 |
+
|
| 55 |
+
IMAGE_NAME = os.getenv("IMAGE_NAME") # If you are using docker image
|
| 56 |
+
API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
|
| 57 |
+
|
| 58 |
+
API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
|
| 59 |
+
MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct"
|
| 60 |
+
BENCHMARK = os.getenv("BENCHMARK", "aws_rl_env")
|
| 61 |
+
MAX_STEPS = int(os.getenv("MAX_STEPS", "15"))
|
| 62 |
+
TEMPERATURE = 0.7
|
| 63 |
+
MAX_TOKENS = 512
|
| 64 |
+
SUCCESS_SCORE_THRESHOLD = 1.0 # task_achieved yields reward=1.0
|
| 65 |
+
|
| 66 |
+
SYSTEM_PROMPT = textwrap.dedent(
|
| 67 |
+
"""
|
| 68 |
+
You are an AWS cloud engineer interacting with a real AWS environment via CLI.
|
| 69 |
+
Each turn you must send exactly ONE valid AWS CLI command (starting with 'aws').
|
| 70 |
+
|
| 71 |
+
You will be given a task to accomplish. Read the task description carefully.
|
| 72 |
+
Use the command output and error messages to guide your next action.
|
| 73 |
+
|
| 74 |
+
Rules:
|
| 75 |
+
- Only send AWS CLI commands (e.g. 'aws s3 ls', 'aws dynamodb create-table ...')
|
| 76 |
+
- One command per turn — no pipes, no shell syntax, no chaining
|
| 77 |
+
- Reply with ONLY the command, nothing else — no explanations, no quotes
|
| 78 |
+
"""
|
| 79 |
+
).strip()
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def log_start(task: str, env: str, model: str) -> None:
    """Emit the [START] marker line announcing an episode run."""
    header = f"[START] task={task} env={env} model={model}"
    print(header, flush=True)
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
    """Emit one [STEP] marker line describing a single environment step."""
    fields = [
        f"step={step}",
        f"action={action}",
        f"reward={reward:.2f}",
        f"done={str(done).lower()}",
        f"error={error or 'null'}",
    ]
    print("[STEP] " + " ".join(fields), flush=True)
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
    """Emit the final [END] marker line summarising the whole episode."""
    joined = ",".join(format(r, ".2f") for r in rewards)
    flag = "true" if success else "false"
    print(f"[END] success={flag} steps={steps} score={score:.3f} rewards={joined}", flush=True)
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
def build_user_prompt(
    task_description: str,
    step: int,
    last_output: str,
    last_error: str,
    last_reward: float,
    history: List[str],
) -> str:
    """Build the per-turn user prompt shown to the model.

    Includes the task, the previous command's output/error/reward, and a
    short window of recent history so the model can self-correct.
    """
    # Only the last 6 history entries are included to bound prompt size.
    history_block = "\n".join(history[-6:]) if history else "None"
    # NOTE(review): when history_block is multi-line, its inserted lines carry
    # no leading indent, which prevents textwrap.dedent from finding a common
    # prefix — confirm the rendered prompt looks as intended in that case.
    return textwrap.dedent(
        f"""
        TASK: {task_description}

        Step: {step}
        Last command output: {last_output!r}
        Last error: {last_error!r}
        Last reward: {last_reward:.2f}

        Previous steps:
        {history_block}

        Send your next AWS CLI command.
        """
    ).strip()
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
def get_model_command(
    client: OpenAI,
    task_description: str,
    step: int,
    last_output: str,
    last_error: str,
    last_reward: float,
    history: List[str],
) -> str:
    """Query the model for the next AWS CLI command.

    Returns the model's command when it looks like an AWS CLI invocation;
    otherwise (bad output or request failure) falls back to 'aws help'.
    """
    prompt = build_user_prompt(
        task_description, step, last_output, last_error, last_reward, history
    )
    try:
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": prompt},
            ],
            temperature=TEMPERATURE,
            max_tokens=MAX_TOKENS,
            stream=False,
        )
        raw = (response.choices[0].message.content or "").strip()
        # Drop markdown code fences if the model wrapped its answer.
        if raw.startswith("```"):
            kept = [ln for ln in raw.split("\n") if not ln.startswith("```")]
            raw = "\n".join(kept).strip()
        if raw.startswith("aws "):
            return raw
        return "aws help"
    except Exception as exc:
        print(f"[DEBUG] Model request failed: {exc}", flush=True)
        return "aws help"
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
async def main() -> None:
    """Run a single episode: reset the env, loop model→env steps, log results.

    Emits [START]/[STEP]/[END] marker lines (parsed downstream), and always
    closes the environment container, even on error.
    """
    client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)

    # Spins up the environment container from the configured image.
    env = await AwsRlEnv.from_docker_image(IMAGE_NAME)

    history: List[str] = []
    rewards: List[float] = []
    steps_taken = 0
    score = 0.0
    success = False
    task_name = "unknown"
    task_description = ""

    try:
        result = await env.reset()  # OpenENV.reset()
        obs = result.observation

        # Extract task info from the first observation
        if obs.task is not None:
            task_name = f"task-{obs.task.task_id}"
            task_description = obs.task.description
        else:
            task_description = "No task assigned."

        log_start(task=task_name, env=BENCHMARK, model=MODEL_NAME)

        # Seed the feedback loop with the reset observation.
        last_output = obs.command_output
        last_error = ""
        last_reward = 0.0

        for step in range(1, MAX_STEPS + 1):
            if result.done:
                break

            command = get_model_command(
                client, task_description, step,
                last_output, last_error, last_reward, history,
            )

            result = await env.step(AwsRlAction(command=command))
            obs = result.observation

            reward = result.reward or 0.0
            done = result.done
            error = obs.error if obs.error else None

            rewards.append(reward)
            steps_taken = step
            # Carry this step's outcome into the next prompt.
            last_output = obs.command_output
            last_error = obs.error
            last_reward = reward

            log_step(step=step, action=command, reward=reward, done=done, error=error)

            status = "OK" if obs.command_success else "FAIL"
            history.append(f"Step {step} [{status}]: {command} -> reward={reward:.2f}")

            # Task achieved — episode success
            if obs.task_achieved:
                success = True
                break

            if done:
                break

        # Episode score is the best single-step reward, clamped to [0, 1].
        score = max(rewards) if rewards else 0.0
        score = min(max(score, 0.0), 1.0)  # clamp to [0, 1]
        if not success:
            success = score >= SUCCESS_SCORE_THRESHOLD

    finally:
        # Best-effort container cleanup; never mask the original exception.
        try:
            await env.close()
        except Exception as e:
            print(f"[DEBUG] env.close() error (container cleanup): {e}", flush=True)
        log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
|
| 238 |
+
|
| 239 |
+
|
| 240 |
+
if __name__ == "__main__":
|
| 241 |
+
asyncio.run(main())
|
models.py
ADDED
|
@@ -0,0 +1,197 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Data models for the Aws Rl Env Environment.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from enum import Enum
|
| 6 |
+
from typing import NewType, Union
|
| 7 |
+
|
| 8 |
+
from openenv.core.env_server.types import Action, Observation
|
| 9 |
+
from pydantic import BaseModel, Field
|
| 10 |
+
|
| 11 |
+
# ---------------------------------------------------------------------------
|
| 12 |
+
# Core Types
|
| 13 |
+
# ---------------------------------------------------------------------------
|
| 14 |
+
|
| 15 |
+
# Semantic aliases over primitives — zero runtime cost, clearer signatures.
TaskID = NewType("TaskID", int)
EpisodeID = NewType("EpisodeID", str)
StepCount = NewType("StepCount", int)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class AwsService(str, Enum):
    """AWS services available in the environment; str-valued so members
    compare equal to their plain string names (e.g. in JSON/YAML configs)."""

    S3 = "s3"
    EC2 = "ec2"
    DYNAMODB = "dynamodb"
    LAMBDA = "lambda"
    SQS = "sqs"
    SNS = "sns"
    IAM = "iam"
    APIGATEWAY = "apigateway"
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
# ---------------------------------------------------------------------------
|
| 32 |
+
# RL Task Definition
|
| 33 |
+
# ---------------------------------------------------------------------------
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class TaskDifficulty(str, Enum):
    """Curriculum tiers, listed from easiest to hardest."""

    WARMUP = "warmup"
    BEGINNER = "beginner"
    INTERMEDIATE = "intermediate"
    ADVANCED = "advanced"
    EXPERT = "expert"
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
class TierConfig(BaseModel):
    """Configuration for a single difficulty tier's promotion and mastery rules."""

    min_episodes: int = Field(..., ge=0, description="Minimum episodes before promotion eligible")
    advance_rate: float = Field(..., ge=0.0, le=1.0, description="Tier success rate to advance")
    # Success rate is computed over the most recent `mastery_window` episodes.
    mastery_window: int = Field(default=10, ge=1, description="Sliding window size for success rate")
    mastery_threshold: float = Field(
        default=0.7, ge=0.0, le=1.0, description="Per-task graduation threshold"
    )
    fast_track_rate: float = Field(
        default=0.9, ge=0.0, le=1.0,
        description="Success rate for early promotion after 3 episodes",
    )
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
class SpacedRepState(BaseModel):
    """Tracks spaced repetition schedule for a graduated task."""

    # Measured in episodes; presumably widened on success — confirm in Curriculum.
    interval: int = Field(default=3, ge=1, description="Episodes until next re-test")
    last_graduated_episode: int = Field(
        default=0, ge=0, description="Episode number when task was last graduated"
    )
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
class SetupCommand(BaseModel):
    """A single AWS CLI command executed during environment setup before the agent acts."""

    command: str = Field(..., description="AWS CLI command to execute")
    description: str | None = Field(
        default=None, description="Human-readable explanation of what this command sets up"
    )
    # Useful for idempotent setup (e.g. "delete if exists" steps).
    ignore_failure: bool = Field(
        default=False,
        description="If True, continue setup even if this command fails",
    )
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
class ResourceExistsCheck(BaseModel):
    """Checks that a specific named resource exists in MiniStack."""

    service: AwsService = Field(..., description="AWS service to verify the resource in")
    # Matching is by exact name, not prefix or pattern.
    name: str = Field(..., description="Exact resource name to verify")
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
class StepCriteria(BaseModel):
    """A single required step in a multi-step task."""

    operation: str = Field(..., description="AWS CLI operation, e.g. 'create-bucket'")
    # When None, any target resource satisfies the step.
    resource: str | None = Field(
        default=None, description="Resource name the operation must target"
    )
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
class StateCheck(BaseModel):
    """An assertion about the environment's end-state, evaluated via AWS CLI.

    Either `output_contains` (substring match) or `json_path` + `expected`
    (structured match) is populated — presumably not both; confirm in TaskGrader.
    """

    command: str = Field(..., description="AWS CLI command to run for verification")
    output_contains: str | None = Field(
        default=None, description="Substring that must appear in stdout"
    )
    json_path: str | None = Field(
        default=None, description="JSON path to extract from stdout, e.g. '$.Table.Name'"
    )
    expected: int | float | str | bool | None = Field(
        default=None, description="Expected value at json_path"
    )
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
class SuccessCriteria(BaseModel):
    """Machine-readable criteria to evaluate task completion.

    Different tiers populate different fields:
    - Warmup: command_contains + operation
    - Beginner: command_contains + operation + resource_exists
    - Intermediate: steps
    - Advanced: services + steps
    - Expert: services + state_checks + steps (optional)
    """

    command_contains: str | None = Field(
        default=None, description="Substring the agent's command must contain"
    )
    operation: str | None = Field(
        default=None, description="AWS CLI operation the agent must invoke"
    )
    resource_exists: ResourceExistsCheck | None = Field(
        default=None, description="Resource that must exist after the agent acts"
    )
    # Ordered: the grader presumably requires operations in sequence — confirm.
    steps: list[StepCriteria] = Field(
        default_factory=list, description="Ordered sequence of required operations"
    )
    services: list[AwsService] = Field(
        default_factory=list, description="AWS services the agent must interact with"
    )
    state_checks: list[StateCheck] = Field(
        default_factory=list,
        description="End-state assertions — source of truth for expert/SRE tasks",
    )
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
class Task(BaseModel):
    """Defines a task the RL agent must accomplish in the AWS environment."""

    task_id: TaskID = Field(..., ge=0, description="Unique task identifier")
    difficulty: TaskDifficulty = Field(
        default=TaskDifficulty.WARMUP, description="Task difficulty level"
    )
    description: str = Field(..., description="Human-readable task description")
    # Defaults to an empty SuccessCriteria (no constraints) if unspecified.
    success_criteria: SuccessCriteria = Field(
        default_factory=SuccessCriteria,
        description="Machine-readable criteria to evaluate task completion",
    )
    setup_commands: list[SetupCommand] = Field(
        default_factory=list,
        description="Commands to run during reset to set up initial state (e.g. for SRE tasks)",
    )
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
# ---------------------------------------------------------------------------
|
| 163 |
+
# Action & Observation
|
| 164 |
+
# ---------------------------------------------------------------------------
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
class AwsRlAction(Action):
    """Action for the Aws Rl Env environment — an AWS CLI command to execute against MiniStack."""

    # The server rejects anything not starting with 'aws ' (anti-hack guard).
    command: str = Field(
        ...,
        description="AWS CLI command to execute, e.g. 'aws s3 ls', 'aws ec2 describe-instances'",
    )
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
class AwsRlObservation(Observation):
    """Observation returned after each step in the AWS RL environment."""

    episode_id: EpisodeID = Field(..., description="Unique identifier for the episode")
    step_count: StepCount = Field(..., ge=0, description="Current step count in the episode")
    command_success: bool = Field(
        ..., description="Whether the CLI command executed successfully"
    )
    command_output: str = Field(
        default="", description="Stdout from the executed AWS CLI command"
    )
    error: str = Field(default="", description="Stderr if the command failed")
    resources: dict[AwsService, Union[dict, list, str]] = Field(
        default_factory=dict,
        description="Current resource state from MiniStack, keyed by service name",
    )
    task: Task | None = Field(
        default=None, description="The task the agent is trying to accomplish"
    )
    # When True the episode is done (mirrored into Observation.done by the server).
    task_achieved: bool = Field(
        default=False, description="Whether the task has been achieved"
    )
|
openenv.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
spec_version: 1
|
| 2 |
+
name: aws_rl_env
|
| 3 |
+
type: space
|
| 4 |
+
runtime: fastapi
|
| 5 |
+
app: server.app:app
|
| 6 |
+
port: 8000
|
| 7 |
+
|
openenv_aws_rl_env.egg-info/PKG-INFO
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Metadata-Version: 2.4
|
| 2 |
+
Name: openenv-aws_rl_env
|
| 3 |
+
Version: 0.1.0
|
| 4 |
+
Summary: Aws Rl Env environment for OpenEnv
|
| 5 |
+
Requires-Python: >=3.12
|
| 6 |
+
Requires-Dist: openenv-core[core]>=0.2.2
|
| 7 |
+
Requires-Dist: ministack>=1.1.24
|
| 8 |
+
Requires-Dist: python-dotenv>=1.0.0
|
| 9 |
+
Provides-Extra: dev
|
| 10 |
+
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
| 11 |
+
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
| 12 |
+
Requires-Dist: ruff>=0.4.0; extra == "dev"
|
| 13 |
+
Requires-Dist: mypy>=1.10.0; extra == "dev"
|
| 14 |
+
Requires-Dist: types-PyYAML>=6.0.0; extra == "dev"
|
openenv_aws_rl_env.egg-info/SOURCES.txt
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
README.md
|
| 2 |
+
__init__.py
|
| 3 |
+
client.py
|
| 4 |
+
inference.py
|
| 5 |
+
models.py
|
| 6 |
+
pyproject.toml
|
| 7 |
+
./__init__.py
|
| 8 |
+
./client.py
|
| 9 |
+
./inference.py
|
| 10 |
+
./models.py
|
| 11 |
+
openenv_aws_rl_env.egg-info/PKG-INFO
|
| 12 |
+
openenv_aws_rl_env.egg-info/SOURCES.txt
|
| 13 |
+
openenv_aws_rl_env.egg-info/dependency_links.txt
|
| 14 |
+
openenv_aws_rl_env.egg-info/entry_points.txt
|
| 15 |
+
openenv_aws_rl_env.egg-info/requires.txt
|
| 16 |
+
openenv_aws_rl_env.egg-info/top_level.txt
|
| 17 |
+
server/__init__.py
|
| 18 |
+
server/app.py
|
| 19 |
+
server/aws_rl_env_environment.py
|
openenv_aws_rl_env.egg-info/dependency_links.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
|
openenv_aws_rl_env.egg-info/entry_points.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[console_scripts]
|
| 2 |
+
server = aws_rl_env.server.app:main
|
openenv_aws_rl_env.egg-info/requires.txt
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
openenv-core[core]>=0.2.2
|
| 2 |
+
ministack>=1.1.24
|
| 3 |
+
python-dotenv>=1.0.0
|
| 4 |
+
|
| 5 |
+
[dev]
|
| 6 |
+
pytest>=8.0.0
|
| 7 |
+
pytest-cov>=4.0.0
|
| 8 |
+
ruff>=0.4.0
|
| 9 |
+
mypy>=1.10.0
|
| 10 |
+
types-PyYAML>=6.0.0
|
openenv_aws_rl_env.egg-info/top_level.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
aws_rl_env
|
pyproject.toml
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
[build-system]
|
| 8 |
+
requires = ["setuptools>=45", "wheel"]
|
| 9 |
+
build-backend = "setuptools.build_meta"
|
| 10 |
+
|
| 11 |
+
[project]
|
| 12 |
+
name = "openenv-aws_rl_env"
|
| 13 |
+
version = "0.1.0"
|
| 14 |
+
description = "Aws Rl Env environment for OpenEnv"
|
| 15 |
+
requires-python = ">=3.12"
|
| 16 |
+
dependencies = [
|
| 17 |
+
# Core OpenEnv runtime (provides FastAPI server + HTTP client types)
|
| 18 |
+
# install from github
|
| 19 |
+
# "openenv-core[core] @ git+https://github.com/meta-pytorch/OpenEnv.git",
|
| 20 |
+
"openenv-core[core]>=0.2.2",
|
| 21 |
+
# Environment-specific dependencies
|
| 22 |
+
# Add all dependencies needed for your environment here
|
| 23 |
+
# Examples:
|
| 24 |
+
# "numpy>=1.19.0",
|
| 25 |
+
# "torch>=2.0.0",
|
| 26 |
+
# "gymnasium>=0.29.0",
|
| 27 |
+
# "openspiel>=1.0.0",
|
| 28 |
+
# "smolagents>=1.22.0,<2",
|
| 29 |
+
"ministack>=1.1.24",
|
| 30 |
+
"python-dotenv>=1.0.0",
|
| 31 |
+
]
|
| 32 |
+
|
| 33 |
+
[project.optional-dependencies]
|
| 34 |
+
dev = [
|
| 35 |
+
"pytest>=8.0.0",
|
| 36 |
+
"pytest-cov>=4.0.0",
|
| 37 |
+
"ruff>=0.4.0",
|
| 38 |
+
"mypy>=1.10.0",
|
| 39 |
+
"types-PyYAML>=6.0.0",
|
| 40 |
+
]
|
| 41 |
+
|
| 42 |
+
[project.scripts]
|
| 43 |
+
# Server entry point - enables running via: uv run --project . server
|
| 44 |
+
# or: python -m aws_rl_env.server.app
|
| 45 |
+
server = "aws_rl_env.server.app:main"
|
| 46 |
+
|
| 47 |
+
[tool.setuptools]
|
| 48 |
+
include-package-data = true
|
| 49 |
+
packages = ["aws_rl_env", "aws_rl_env.server"]
|
| 50 |
+
package-dir = { "aws_rl_env" = ".", "aws_rl_env.server" = "server" }
|
| 51 |
+
|
| 52 |
+
[tool.mypy]
|
| 53 |
+
files = ["*.py", "server/"]
|
| 54 |
+
ignore_missing_imports = true
|
| 55 |
+
namespace_packages = true
|
| 56 |
+
explicit_package_bases = true
|
| 57 |
+
mypy_path = "."
|
server/__init__.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""Aws Rl Env environment server components."""
|
| 8 |
+
|
| 9 |
+
from .aws_rl_env_environment import AwsRlEnvironment
|
| 10 |
+
|
| 11 |
+
__all__ = ["AwsRlEnvironment"]
|
server/app.py
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""
|
| 8 |
+
FastAPI application for the Aws Rl Env Environment.
|
| 9 |
+
|
| 10 |
+
This module creates an HTTP server that exposes the AwsRlEnvironment
|
| 11 |
+
over HTTP and WebSocket endpoints, compatible with EnvClient.
|
| 12 |
+
|
| 13 |
+
Endpoints:
|
| 14 |
+
- POST /reset: Reset the environment
|
| 15 |
+
- POST /step: Execute an action
|
| 16 |
+
- GET /state: Get current environment state
|
| 17 |
+
- GET /schema: Get action/observation schemas
|
| 18 |
+
- WS /ws: WebSocket endpoint for persistent sessions
|
| 19 |
+
|
| 20 |
+
Usage:
|
| 21 |
+
# Development (with auto-reload):
|
| 22 |
+
uvicorn server.app:app --reload --host 0.0.0.0 --port 8000
|
| 23 |
+
|
| 24 |
+
# Production:
|
| 25 |
+
uvicorn server.app:app --host 0.0.0.0 --port 8000 --workers 4
|
| 26 |
+
|
| 27 |
+
# Or run directly:
|
| 28 |
+
python -m server.app
|
| 29 |
+
"""
|
| 30 |
+
|
| 31 |
+
try:
|
| 32 |
+
from openenv.core.env_server.http_server import create_app
|
| 33 |
+
except Exception as e: # pragma: no cover
|
| 34 |
+
raise ImportError(
|
| 35 |
+
"openenv is required for the web interface. Install dependencies with '\n uv sync\n'"
|
| 36 |
+
) from e
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
from models import AwsRlAction, AwsRlObservation
|
| 40 |
+
from server.aws_rl_env_environment import AwsRlEnvironment
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
# Create the app with web interface and README integration
|
| 44 |
+
app = create_app(
|
| 45 |
+
AwsRlEnvironment,
|
| 46 |
+
AwsRlAction,
|
| 47 |
+
AwsRlObservation,
|
| 48 |
+
env_name="aws_rl_env",
|
| 49 |
+
max_concurrent_envs=1, # increase this number to allow more concurrent WebSocket sessions
|
| 50 |
+
)
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def main(host: str = "0.0.0.0", port: int = 8000):
    """Serve the environment app directly, without Docker.

    Invoked via the `server` console script (`uv run --project . server`)
    or `python -m aws_rl_env.server.app`.

    Args:
        host: Host address to bind to (default: "0.0.0.0")
        port: Port number to listen on (default: 8000)

    For production, prefer uvicorn with multiple workers:
        uvicorn aws_rl_env.server.app:app --workers 4
    """
    # Imported lazily so merely importing this module never requires uvicorn.
    import uvicorn

    uvicorn.run(app, host=host, port=port)
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
if __name__ == "__main__":
|
| 76 |
+
main()
|
server/aws_rl_env_environment.py
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""
|
| 8 |
+
Aws Rl Env Environment Implementation.
|
| 9 |
+
|
| 10 |
+
An RL environment backed by a simulated AWS cloud powered by MiniStack.
|
| 11 |
+
The agent sends AWS CLI commands as actions and receives CLI output plus
|
| 12 |
+
the current resource state as observations.
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
import logging
|
| 16 |
+
|
| 17 |
+
from typing import Any, Optional
|
| 18 |
+
from uuid import uuid4
|
| 19 |
+
|
| 20 |
+
from openenv.core.env_server.interfaces import Environment
|
| 21 |
+
from openenv.core.env_server.types import State
|
| 22 |
+
|
| 23 |
+
from models import AwsRlAction, AwsRlObservation, EpisodeID, StepCount, Task
|
| 24 |
+
from server.services.aws_backend import AwsBackend
|
| 25 |
+
from server.services.curriculum import Curriculum
|
| 26 |
+
from server.services.environment_designer import EnvironmentDesigner
|
| 27 |
+
from server.services.episode_tracker import EpisodeTracker
|
| 28 |
+
from server.services.task_grader import TaskGrader
|
| 29 |
+
|
| 30 |
+
logger = logging.getLogger(__name__)
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class AwsRlEnvironment(Environment[AwsRlAction, AwsRlObservation, State]):
    """RL environment over a simulated AWS cloud (MiniStack).

    Actions are raw AWS CLI commands; observations carry the CLI output,
    the active curriculum task, and whether that task has been achieved.
    """

    # Each session gets its own instance, so instances may run concurrently.
    SUPPORTS_CONCURRENT_SESSIONS: bool = True

    def __init__(self) -> None:
        print("Initializing AWS RL Environment...")
        self._state = State(episode_id=str(uuid4()), step_count=0)
        self._backend = AwsBackend()
        self._curriculum = Curriculum()
        # Grader and designer both act through the same backend instance.
        self._grader = TaskGrader(self._backend)
        self._designer = EnvironmentDesigner(self._backend)
        self._tracker = EpisodeTracker()
        self._current_task: Task | None = None

    def reset(
        self,
        seed: Optional[int] = None,
        episode_id: Optional[str] = None,
        **kwargs: Any,
    ) -> AwsRlObservation:
        """Wipe MiniStack state, pick the next curriculum task, and apply its setup.

        Note: `seed` is currently unused by this implementation.
        """
        self._backend.reset_environment()
        self._state = State(episode_id=episode_id or str(uuid4()), step_count=0)
        self._tracker.reset()
        self._current_task = self._curriculum.next_task()

        # Runs the task's setup commands (e.g. pre-created resources for SRE tasks).
        self._designer.apply(self._current_task)

        return AwsRlObservation(
            episode_id=EpisodeID(self._state.episode_id or ""),
            step_count=StepCount(self._state.step_count),
            command_success=True,
            command_output="Environment reset. MiniStack state wiped.",
            task=self._current_task,
            done=False,
            reward=0.0,
        )

    def step(
        self,
        action: AwsRlAction,
        timeout_s: Optional[float] = None,
        **kwargs: Any,
    ) -> AwsRlObservation:
        """Execute one AWS CLI command, grade progress, and return the observation.

        Note: `timeout_s` is currently unused; the backend enforces its own timeout.
        """
        assert self._current_task is not None, "Call reset() before step()"
        self._state.step_count += 1

        # Anti-hack: only allow AWS CLI commands
        command = action.command.strip()
        if not command.startswith("aws "):
            return AwsRlObservation(
                episode_id=EpisodeID(self._state.episode_id or ""),
                step_count=StepCount(self._state.step_count),
                command_success=False,
                command_output="",
                error="Only AWS CLI commands (starting with 'aws') are allowed.",
                task=self._current_task,
                task_achieved=False,
                done=False,
                reward=0.0,
            )

        success, stdout, stderr = self._backend.execute_command(command)

        # Record in tracker
        latest_step = self._tracker.record_step(command, success, stdout, stderr)

        # Grade the task
        # NOTE(review): this initial value is immediately overwritten below.
        task_achieved = False

        grade_result = self._grader.grade(
            self._current_task, self._tracker, latest_step
        )
        task_achieved = grade_result.task_achieved
        reward = grade_result.reward

        # NOTE(review): only successes are reported to the curriculum here;
        # confirm whether failed episodes should also call record_result.
        if task_achieved:
            self._curriculum.record_result(
                self._current_task, achieved=True, reward=reward
            )

        return AwsRlObservation(
            episode_id=EpisodeID(self._state.episode_id or ""),
            step_count=StepCount(self._state.step_count),
            command_success=success,
            command_output=stdout,
            error=stderr,
            task=self._current_task,
            task_achieved=task_achieved,
            done=task_achieved,
            reward=reward,
        )

    @property
    def state(self) -> State:
        """Current episode bookkeeping (episode_id, step_count)."""
        return self._state
|
server/requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
openenv[core]>=0.2.0
|
| 2 |
+
fastapi>=0.115.0
|
| 3 |
+
uvicorn>=0.24.0
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
|
server/services/__init__.py
ADDED
|
File without changes
|
server/services/aws_backend.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Backend service for managing AWS interactions via MiniStack."""
|
| 2 |
+
|
| 3 |
+
import logging
|
| 4 |
+
import os
|
| 5 |
+
import subprocess
|
| 6 |
+
|
| 7 |
+
import httpx
|
| 8 |
+
|
| 9 |
+
logger = logging.getLogger(__name__)
|
| 10 |
+
|
| 11 |
+
MINISTACK_URL = os.getenv("MINISTACK_URL", "http://localhost:4566")
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class AwsBackend:
    """Backend service for executing AWS CLI commands against MiniStack."""

    def __init__(self, ministack_url: str = MINISTACK_URL) -> None:
        # Base URL of the MiniStack emulator (default from MINISTACK_URL env var).
        self._ministack_url = ministack_url

    def reset_environment(self) -> None:
        """Wipe all MiniStack service state via POST /_ministack/reset.

        Raises:
            httpx.HTTPError: if the reset endpoint is unreachable or errors.
        """
        try:
            resp = httpx.post(
                f"{self._ministack_url}/_ministack/reset", timeout=10
            )
            resp.raise_for_status()
            logger.info("MiniStack state reset successfully")
        except httpx.HTTPError as e:
            logger.warning("Failed to reset MiniStack state: %s", e)
            raise

    def execute_command(self, command: str) -> tuple[bool, str, str]:
        """Execute an AWS CLI command against MiniStack.

        Args:
            command: Raw AWS CLI command, e.g. 'aws s3 ls'

        Returns:
            Tuple of (success, stdout, stderr)
        """
        # Local import keeps this file's module-level imports unchanged.
        import shlex

        # Dummy credentials: MiniStack accepts anything, but the AWS CLI
        # refuses to run without them being set.
        env = {
            **os.environ,
            "AWS_ENDPOINT_URL": self._ministack_url,
            "AWS_ACCESS_KEY_ID": "test",
            "AWS_SECRET_ACCESS_KEY": "test",
            "AWS_DEFAULT_REGION": "us-east-1",
        }

        try:
            # BUGFIX: str.split() broke quoted arguments such as
            # --item '{"id": {"S": "1"}}' or --key-schema 'AttributeName=...';
            # shlex.split keeps each quoted argument as a single argv entry.
            argv = shlex.split(command)
            result = subprocess.run(
                argv,
                capture_output=True,
                text=True,
                timeout=30,
                env=env,
            )
            return (
                result.returncode == 0,
                result.stdout,
                result.stderr,
            )
        except subprocess.TimeoutExpired:
            return False, "", "Command timed out after 30s"
        except ValueError as e:
            # shlex.split raises ValueError on unbalanced quotes — report it
            # as a failed command instead of crashing the server.
            return False, "", f"Could not parse command: {e}"
        except Exception as e:
            return False, "", str(e)
|
server/services/curriculum.py
ADDED
|
@@ -0,0 +1,471 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Curriculum manager for progressive LLM training in the AWS RL environment.
|
| 2 |
+
|
| 3 |
+
Training flow:
|
| 4 |
+
1. Agent starts at the warmup tier with simple listing tasks.
|
| 5 |
+
2. A priority queue selects the next task based on weakness, novelty,
|
| 6 |
+
spaced repetition, and recency — replacing blind round-robin.
|
| 7 |
+
3. Per-task mastery tracking graduates individual tasks once the agent
|
| 8 |
+
demonstrates sustained competence.
|
| 9 |
+
4. Graduated tasks resurface via spaced repetition at exponentially
|
| 10 |
+
increasing intervals to prevent catastrophic forgetting.
|
| 11 |
+
5. Fast-track promotion lets strong agents skip minimum episode waits.
|
| 12 |
+
6. Exponential decay on history ensures recent results matter more.
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
import heapq
|
| 16 |
+
import logging
|
| 17 |
+
import random
|
| 18 |
+
from collections import defaultdict
|
| 19 |
+
from pathlib import Path
|
| 20 |
+
|
| 21 |
+
import yaml
|
| 22 |
+
|
| 23 |
+
from models import (
|
| 24 |
+
SetupCommand,
|
| 25 |
+
SpacedRepState,
|
| 26 |
+
SuccessCriteria,
|
| 27 |
+
Task,
|
| 28 |
+
TaskDifficulty,
|
| 29 |
+
TaskID,
|
| 30 |
+
TierConfig,
|
| 31 |
+
)
|
| 32 |
+
|
| 33 |
+
logger = logging.getLogger(__name__)
|
| 34 |
+
|
| 35 |
+
# Directory holding the per-tier task definition YAML files.
TASKS_DIR = Path(__file__).parent / "tasks"

# ---------------------------------------------------------------------------
# Per-tier configuration
# ---------------------------------------------------------------------------

# min_episodes: episodes required in a tier before normal promotion;
# advance_rate: weighted success rate needed to promote;
# mastery_window: how many recent per-task results feed mastery checks;
# mastery_threshold: per-task rate required to graduate a task;
# fast_track_rate: rate that allows skipping the min_episodes wait.
TIER_CONFIGS: dict[TaskDifficulty, TierConfig] = {
    TaskDifficulty.WARMUP: TierConfig(
        min_episodes=5, advance_rate=0.6, mastery_window=10,
        mastery_threshold=0.7, fast_track_rate=0.9,
    ),
    TaskDifficulty.BEGINNER: TierConfig(
        min_episodes=5, advance_rate=0.6, mastery_window=10,
        mastery_threshold=0.7, fast_track_rate=0.9,
    ),
    TaskDifficulty.INTERMEDIATE: TierConfig(
        min_episodes=8, advance_rate=0.65, mastery_window=10,
        mastery_threshold=0.7, fast_track_rate=0.9,
    ),
    TaskDifficulty.ADVANCED: TierConfig(
        min_episodes=10, advance_rate=0.7, mastery_window=10,
        mastery_threshold=0.7, fast_track_rate=0.9,
    ),
    # Terminal tier: advance_rate=1.0 / fast_track_rate=1.0 effectively
    # disables further promotion.
    TaskDifficulty.EXPERT: TierConfig(
        min_episodes=0, advance_rate=1.0, mastery_window=10,
        mastery_threshold=0.7, fast_track_rate=1.0,
    ),
}

# Map YAML filenames to difficulty tiers
_TIER_FILES: dict[TaskDifficulty, str] = {
    TaskDifficulty.WARMUP: "warmup.yaml",
    TaskDifficulty.BEGINNER: "beginner.yaml",
    TaskDifficulty.INTERMEDIATE: "intermediate.yaml",
    TaskDifficulty.ADVANCED: "advanced.yaml",
    TaskDifficulty.EXPERT: "expert.yaml",
}

# ---------------------------------------------------------------------------
# Priority score tuning constants
# ---------------------------------------------------------------------------

_NOVELTY_BONUS = 100  # untried tasks — explore first
_WEAKNESS_WEIGHT = 50  # multiplied by (1 - success_rate)
_SPACED_REP_BONUS = 30  # graduated task due for re-test
_RECENCY_PENALTY = 20  # attempted in last 2 episodes

# Exponential decay factor for weighted success rate
_DECAY_FACTOR = 0.85

# Minimum attempts before a task can be graduated
_MIN_ATTEMPTS_FOR_MASTERY = 3

# Fast-track requires at least this many episodes in the tier
_FAST_TRACK_MIN_EPISODES = 3
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
# ---------------------------------------------------------------------------
|
| 93 |
+
# YAML loader
|
| 94 |
+
# ---------------------------------------------------------------------------
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
def load_tier(difficulty: TaskDifficulty, tasks_dir: Path = TASKS_DIR) -> list[Task]:
    """Load tasks for a single difficulty tier from its YAML file.

    Returns an empty list (with a warning) when the tier has no file
    mapping or the file is missing on disk.
    """
    filename = _TIER_FILES.get(difficulty)
    if filename is None:
        logger.warning("No file mapping for difficulty: %s", difficulty.value)
        return []

    filepath = tasks_dir / filename
    if not filepath.exists():
        logger.warning("Task file not found: %s", filepath)
        return []

    with open(filepath) as fh:
        raw_entries = yaml.safe_load(fh) or []

    tasks: list[Task] = []
    for raw in raw_entries:
        # Setup commands may be bare strings or mappings with extra options.
        setup: list[SetupCommand] = []
        for cmd in raw.get("setup_commands", []):
            if isinstance(cmd, str):
                setup.append(SetupCommand(command=cmd))
            else:
                setup.append(SetupCommand(**cmd))
        tasks.append(
            Task(
                task_id=TaskID(raw["task_id"]),
                difficulty=difficulty,
                description=raw["description"],
                success_criteria=SuccessCriteria(**raw.get("success_criteria", {})),
                setup_commands=setup,
            )
        )

    logger.info("Loaded %d %s tasks from %s", len(tasks), difficulty.value, filepath.name)
    return tasks
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
# ---------------------------------------------------------------------------
|
| 130 |
+
# Helpers
|
| 131 |
+
# ---------------------------------------------------------------------------
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
def _weighted_success_rate(results: list[bool], decay: float = _DECAY_FACTOR) -> float:
|
| 135 |
+
"""Compute success rate with exponential decay — recent results matter more."""
|
| 136 |
+
if not results:
|
| 137 |
+
return 0.0
|
| 138 |
+
weights = [decay ** i for i in range(len(results) - 1, -1, -1)]
|
| 139 |
+
total_weight = sum(weights)
|
| 140 |
+
return sum(w * float(r) for w, r in zip(weights, results)) / total_weight
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
# ---------------------------------------------------------------------------
|
| 144 |
+
# Curriculum
|
| 145 |
+
# ---------------------------------------------------------------------------
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
class Curriculum:
    """Manages progressive task assignment with priority-queue-based selection.

    Features:
    - Priority queue task selection (novelty, weakness, spaced rep, recency)
    - Per-task mastery tracking with graduation
    - Spaced repetition for graduated tasks (prevents catastrophic forgetting)
    - Fast-track tier promotion for strong agents
    - Exponential decay on success history
    - Rich observability via get_stats()
    """

    def __init__(
        self,
        tier_configs: dict[TaskDifficulty, TierConfig] | None = None,
        tasks_dir: Path = TASKS_DIR,
    ) -> None:
        self._tier_configs = tier_configs or TIER_CONFIGS
        self._tasks_dir = tasks_dir

        # Ordered difficulty progression
        self._levels = list(TaskDifficulty)

        # Tier tracking
        self._current_level_idx: int = 0
        self._tier_episodes: int = 0
        self._tier_results: list[bool] = []  # results within current tier

        # Per-task tracking
        self._task_history: dict[TaskID, list[bool]] = defaultdict(list)
        self._task_attempt_count: dict[TaskID, int] = defaultdict(int)
        self._last_attempted_episode: dict[TaskID, int] = {}
        self._graduated_tasks: set[TaskID] = set()
        self._spaced_rep: dict[TaskID, SpacedRepState] = {}

        # Global counters
        self._episode_count: int = 0
        self._episode_rewards: list[float] = []

        # Load starting tier
        self._current_tasks: list[Task] = load_tier(
            self.current_difficulty, self._tasks_dir
        )
        self._task_map: dict[TaskID, Task] = {t.task_id: t for t in self._current_tasks}

        # Priority queue: list of (-score, random_tiebreaker, task_id)
        self._priority_queue: list[tuple[float, float, TaskID]] = []
        self._rebuild_priority_queue()

        logger.info(
            "Curriculum initialised — starting at %s with %d tasks",
            self.current_difficulty.value,
            len(self._current_tasks),
        )

    # -- Properties -----------------------------------------------------------

    @property
    def current_difficulty(self) -> TaskDifficulty:
        return self._levels[self._current_level_idx]

    @property
    def tier_config(self) -> TierConfig:
        return self._tier_configs[self.current_difficulty]

    @property
    def current_level_success_rate(self) -> float:
        return _weighted_success_rate(self._tier_results)

    @property
    def is_warmup(self) -> bool:
        return self.current_difficulty == TaskDifficulty.WARMUP

    # -- Public API -----------------------------------------------------------

    def next_task(self) -> Task:
        """Select the highest-priority task from the current tier."""
        if not self._current_tasks:
            self._current_tasks = load_tier(
                self.current_difficulty, self._tasks_dir
            )
            self._task_map = {t.task_id: t for t in self._current_tasks}
            self._rebuild_priority_queue()

        if not self._priority_queue:
            self._rebuild_priority_queue()

        # Pop highest priority (most negative = highest score)
        _, _, task_id = heapq.heappop(self._priority_queue)
        task = self._task_map[task_id]

        # If queue is now empty, rebuild for next call
        if not self._priority_queue:
            self._rebuild_priority_queue()

        return task

    def record_result(
        self, task: Task, achieved: bool, reward: float = 0.0
    ) -> None:
        """Record episode outcome, update mastery, check promotion."""
        self._episode_count += 1
        self._tier_episodes += 1
        self._episode_rewards.append(reward)

        # Capture spaced-repetition due-ness BEFORE this episode's result is
        # folded into the mastery state, so a fresh graduation this episode
        # does not count as a re-test of itself.
        was_due_retest = (
            task.task_id in self._graduated_tasks
            and self._is_spaced_rep_due(task.task_id)
        )

        # Per-tier results
        self._tier_results.append(achieved)

        # Per-task results
        self._task_history[task.task_id].append(achieved)
        self._task_attempt_count[task.task_id] += 1
        self._last_attempted_episode[task.task_id] = self._episode_count

        # Fix: a successful re-test of a graduated task must advance its
        # spaced-repetition interval. Previously _advance_spaced_rep was dead
        # code — never called — so once a graduated task's first interval
        # elapsed it stayed permanently "due" and kept collecting the re-test
        # priority bonus on every queue rebuild.
        if was_due_retest and achieved:
            self._advance_spaced_rep(task.task_id)

        # Check mastery
        self._check_mastery(task.task_id)

        # Check tier promotion
        self._maybe_promote()

        # Rebuild priority queue with updated scores
        self._rebuild_priority_queue()

        logger.info(
            "Episode %d: task=%d difficulty=%s achieved=%s tier_rate=%.2f",
            self._episode_count,
            task.task_id,
            task.difficulty.value,
            achieved,
            self.current_level_success_rate,
        )

    def reset(self) -> None:
        """Reset curriculum back to warmup (full training restart)."""
        self._current_level_idx = 0
        self._tier_episodes = 0
        self._tier_results.clear()
        self._task_history.clear()
        self._task_attempt_count.clear()
        self._last_attempted_episode.clear()
        self._graduated_tasks.clear()
        self._spaced_rep.clear()
        self._episode_count = 0
        self._episode_rewards.clear()
        self._current_tasks = load_tier(self.current_difficulty, self._tasks_dir)
        self._task_map = {t.task_id: t for t in self._current_tasks}
        self._rebuild_priority_queue()
        logger.info("Curriculum reset to %s", self.current_difficulty.value)

    # -- Observability --------------------------------------------------------

    def get_skill_profile(self) -> dict[TaskID, float]:
        """Weighted success rate per task over recent history."""
        config = self.tier_config
        return {
            task_id: round(
                _weighted_success_rate(results[-config.mastery_window:]), 2
            )
            for task_id, results in self._task_history.items()
            if results
        }

    def get_weak_spots(self) -> list[TaskID]:
        """Tasks in the current tier below mastery threshold."""
        config = self.tier_config
        profile = self.get_skill_profile()
        return [
            task_id
            for task_id in self._task_map
            if profile.get(task_id, 0.0) < config.mastery_threshold
            and task_id not in self._graduated_tasks
        ]

    def get_stats(self) -> dict:
        """Full curriculum state for logging/debugging."""
        return {
            "episode_count": self._episode_count,
            "tier": self.current_difficulty.value,
            "tier_episodes": self._tier_episodes,
            "tier_success_rate": round(self.current_level_success_rate, 3),
            "graduated_tasks": sorted(self._graduated_tasks),
            "weak_spots": self.get_weak_spots(),
            "skill_profile": self.get_skill_profile(),
            "spaced_rep_due": [
                int(tid)
                for tid in self._task_map
                if self._is_spaced_rep_due(tid)
            ],
            "avg_reward_last_10": round(
                sum(self._episode_rewards[-10:])
                / max(1, len(self._episode_rewards[-10:])),
                3,
            ),
        }

    # -- Priority queue -------------------------------------------------------

    def _compute_priority(self, task_id: TaskID) -> float:
        """Compute composite priority score for a task. Higher = selected sooner."""
        config = self.tier_config
        score = 0.0

        attempts = self._task_attempt_count.get(task_id, 0)

        # Novelty: never attempted → explore first
        if attempts == 0:
            score += _NOVELTY_BONUS
            return score  # no other signals available yet

        # Weakness: worse tasks get higher priority
        results = self._task_history.get(task_id, [])
        task_rate = _weighted_success_rate(results[-config.mastery_window:])
        score += _WEAKNESS_WEIGHT * (1.0 - task_rate)

        # Spaced repetition: graduated task due for re-test
        if task_id in self._graduated_tasks and self._is_spaced_rep_due(task_id):
            score += _SPACED_REP_BONUS

        # Recency penalty: attempted in last 2 episodes
        last_ep = self._last_attempted_episode.get(task_id, -100)
        if self._episode_count - last_ep <= 2:
            score -= _RECENCY_PENALTY

        return score

    def _rebuild_priority_queue(self) -> None:
        """Recompute priorities for all current-tier tasks and rebuild the heap."""
        self._priority_queue.clear()
        for task in self._current_tasks:
            score = self._compute_priority(task.task_id)
            # heapq is a min-heap, so negate score for max-priority-first
            # random tiebreaker prevents deterministic ordering among equal scores
            heapq.heappush(
                self._priority_queue,
                (-score, random.random(), task.task_id),
            )

    # -- Mastery & spaced repetition ------------------------------------------

    def _check_mastery(self, task_id: TaskID) -> None:
        """Check if a task should be graduated or un-graduated."""
        config = self.tier_config
        results = self._task_history.get(task_id, [])
        recent = results[-config.mastery_window:]

        if len(recent) < _MIN_ATTEMPTS_FOR_MASTERY:
            return

        rate = _weighted_success_rate(recent)

        if rate >= config.mastery_threshold:
            if task_id not in self._graduated_tasks:
                self._graduated_tasks.add(task_id)
                self._spaced_rep[task_id] = SpacedRepState(
                    interval=3,
                    last_graduated_episode=self._episode_count,
                )
                logger.info(
                    "Task %d GRADUATED (rate=%.2f) — scheduling spaced repetition",
                    task_id,
                    rate,
                )
        else:
            # Un-graduate if performance dropped
            if task_id in self._graduated_tasks:
                self._graduated_tasks.discard(task_id)
                self._spaced_rep.pop(task_id, None)
                logger.info(
                    "Task %d UN-GRADUATED (rate=%.2f) — resetting to active",
                    task_id,
                    rate,
                )

    def _is_spaced_rep_due(self, task_id: TaskID) -> bool:
        """Check if a graduated task is due for a re-test."""
        state = self._spaced_rep.get(task_id)
        if state is None:
            return False
        episodes_since = self._episode_count - state.last_graduated_episode
        return episodes_since >= state.interval

    def _advance_spaced_rep(self, task_id: TaskID) -> None:
        """Double the interval after a successful re-test."""
        state = self._spaced_rep.get(task_id)
        if state is not None:
            state.interval = min(state.interval * 2, 48)  # cap at 48 episodes
            state.last_graduated_episode = self._episode_count

    # -- Tier promotion -------------------------------------------------------

    def _maybe_promote(self) -> None:
        """Advance to the next difficulty tier if the agent is ready."""
        if self._current_level_idx >= len(self._levels) - 1:
            return  # already at max tier

        config = self.tier_config
        rate = self.current_level_success_rate

        # Fast-track: high success rate after minimum 3 episodes
        fast_track = (
            self._tier_episodes >= _FAST_TRACK_MIN_EPISODES
            and rate >= config.fast_track_rate
        )

        if not fast_track and self._tier_episodes < config.min_episodes:
            return

        if rate < config.advance_rate:
            return

        prev_tier = self.current_difficulty.value
        prev_rate = rate
        self._current_level_idx += 1
        self._tier_episodes = 0
        self._tier_results.clear()
        self._current_tasks = load_tier(self.current_difficulty, self._tasks_dir)
        self._task_map = {t.task_id: t for t in self._current_tasks}
        self._rebuild_priority_queue()
        logger.info(
            "PROMOTED from %s to %s (rate=%.2f%s)",
            prev_tier,
            self.current_difficulty.value,
            prev_rate,
            ", FAST-TRACK" if fast_track else "",
        )
|
server/services/environment_designer.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Environment designer — provisions initial AWS state for each task.
|
| 2 |
+
|
| 3 |
+
Currently supports raw AWS CLI setup commands. Designed to be extended
|
| 4 |
+
with CloudFormation YAML template support so that each difficulty level
|
| 5 |
+
can declaratively define its starting infrastructure.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
import logging
|
| 11 |
+
from enum import Enum
|
| 12 |
+
|
| 13 |
+
from pydantic import BaseModel, Field
|
| 14 |
+
|
| 15 |
+
from models import SetupCommand, Task
|
| 16 |
+
from server.services.aws_backend import AwsBackend
|
| 17 |
+
|
| 18 |
+
logger = logging.getLogger(__name__)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class ProvisionMethod(str, Enum):
    """How the initial environment state is provisioned."""

    # Run a list of raw AWS CLI setup commands — the only strategy
    # EnvironmentDesigner currently dispatches to.
    CLI_COMMANDS = "cli_commands"
    # Reserved for declarative CloudFormation-template provisioning.
    CLOUDFORMATION = "cloudformation"
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
class ProvisionResult(BaseModel):
    """Outcome of provisioning the environment for a task."""

    # True when every non-ignored setup step succeeded.
    success: bool = True
    # Provisioning strategy that produced this result.
    method: ProvisionMethod = ProvisionMethod.CLI_COMMANDS
    # Count of setup commands that completed successfully.
    resources_created: int = 0
    # Error messages from failed, non-ignored setup commands.
    errors: list[str] = Field(default_factory=list)
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
class EnvironmentDesigner:
    """Provisions the initial AWS state required by a task before the agent acts.

    Usage::

        designer = EnvironmentDesigner(backend)
        result = designer.apply(task)
        if not result.success:
            logger.error("Failed to set up environment: %s", result.errors)
    """

    def __init__(self, backend: AwsBackend) -> None:
        self._backend = backend

    def apply(self, task: Task) -> ProvisionResult:
        """Apply the task's environment setup to MiniStack.

        Dispatches on what the task defines. Only ``setup_commands`` is
        handled today; extend this method to add CloudFormation support.

        Returns:
            A ``ProvisionResult`` summarising what happened.
        """
        commands = task.setup_commands
        if not commands:
            # Nothing to provision — trivially successful.
            return ProvisionResult(resources_created=0)
        return self._apply_cli_commands(commands)

    # -- Provisioning strategies ----------------------------------------------

    def _apply_cli_commands(
        self, commands: list[SetupCommand]
    ) -> ProvisionResult:
        """Execute a list of setup commands against MiniStack."""
        failures: list[str] = []
        created = 0

        for cmd in commands:
            ok, _out, err = self._backend.execute_command(cmd.command)
            if ok:
                created += 1
                continue
            message = f"Setup command failed: {cmd.command} — {err}"
            if cmd.ignore_failure:
                # Best-effort step: log and keep going without recording an error.
                logger.info("Ignoring failed setup command: %s", message)
            else:
                logger.warning(message)
                failures.append(message)

        return ProvisionResult(
            success=not failures,
            method=ProvisionMethod.CLI_COMMANDS,
            resources_created=created,
            errors=failures,
        )
|
server/services/episode_tracker.py
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Per-episode command history tracker for multi-step task evaluation."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import logging
|
| 6 |
+
import re
|
| 7 |
+
|
| 8 |
+
from pydantic import BaseModel, Field
|
| 9 |
+
|
| 10 |
+
logger = logging.getLogger(__name__)
|
| 11 |
+
|
| 12 |
+
# Maps common AWS CLI flag names to resource identifiers
# (flags whose immediately-following value names the resource being acted on).
_RESOURCE_FLAGS: list[str] = [
    "--bucket",
    "--table-name",
    "--function-name",
    "--queue-name",
    "--topic-name",
    "--role-name",
    "--rest-api-id",
    "--name",
    "--resource",
]
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class StepRecord(BaseModel):
    """A single command executed within an episode."""

    # Raw AWS CLI command line as issued by the agent.
    command: str
    # Whether the command reported success.
    success: bool
    stdout: str = ""
    stderr: str = ""
    # Zero-based position of this command within the episode.
    step_number: int = Field(ge=0)
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def _parse_aws_command(command: str) -> tuple[str | None, str | None]:
|
| 37 |
+
"""Extract (service, operation) from an AWS CLI command.
|
| 38 |
+
|
| 39 |
+
Example: 'aws s3api create-bucket --bucket foo' -> ('s3api', 'create-bucket')
|
| 40 |
+
"""
|
| 41 |
+
parts = command.strip().split()
|
| 42 |
+
if len(parts) < 3 or parts[0] != "aws":
|
| 43 |
+
return None, None
|
| 44 |
+
return parts[1], parts[2]
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def _command_mentions_resource(command: str, resource: str) -> bool:
|
| 48 |
+
"""Check if the command references a specific resource name."""
|
| 49 |
+
parts = command.strip().split()
|
| 50 |
+
for i, part in enumerate(parts):
|
| 51 |
+
if part in _RESOURCE_FLAGS and i + 1 < len(parts):
|
| 52 |
+
if parts[i + 1] == resource:
|
| 53 |
+
return True
|
| 54 |
+
# Also match if the resource appears as a value in key=value flags
|
| 55 |
+
# e.g. --table-name=orders
|
| 56 |
+
for part in parts:
|
| 57 |
+
for flag in _RESOURCE_FLAGS:
|
| 58 |
+
if part.startswith(f"{flag}=") and part.split("=", 1)[1] == resource:
|
| 59 |
+
return True
|
| 60 |
+
# Match resource in ARN-like patterns or bare arguments
|
| 61 |
+
if re.search(rf"\b{re.escape(resource)}\b", command):
|
| 62 |
+
return True
|
| 63 |
+
return False
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
class EpisodeTracker:
    """Tracks command history within a single episode for grading."""

    def __init__(self) -> None:
        self._history: list[StepRecord] = []
        self._step_counter: int = 0
        self._previous_progress: float = 0.0
        # (operation, resource) pairs already credited this episode
        self._credited_operations: set[tuple[str, str | None]] = set()

    def reset(self) -> None:
        """Wipe all per-episode state ahead of a new episode."""
        self._history.clear()
        self._step_counter = 0
        self._previous_progress = 0.0
        self._credited_operations.clear()

    def record_step(
        self, command: str, success: bool, stdout: str, stderr: str
    ) -> StepRecord:
        """Append one executed command to the episode history and return its record."""
        entry = StepRecord(
            command=command,
            success=success,
            stdout=stdout,
            stderr=stderr,
            step_number=self._step_counter,
        )
        self._history.append(entry)
        self._step_counter += 1
        return entry

    def has_executed_operation(
        self, operation: str, resource: str | None = None
    ) -> bool:
        """Check if a successful command matching (operation, resource) exists in history."""
        for entry in self._history:
            if not entry.success:
                continue
            if _parse_aws_command(entry.command)[1] != operation:
                continue
            # resource=None means "any resource"
            if resource is None or _command_mentions_resource(entry.command, resource):
                return True
        return False

    def has_used_service(self, service: str) -> bool:
        """Check if any successful command targeted the given AWS service."""
        for entry in self._history:
            if not entry.success:
                continue
            svc = _parse_aws_command(entry.command)[0]
            if svc is not None and service in svc:
                return True
        return False

    def is_operation_already_credited(
        self, operation: str, resource: str | None
    ) -> bool:
        """True when this (operation, resource) pair has already been credited."""
        return (operation, resource) in self._credited_operations

    def credit_operation(self, operation: str, resource: str | None) -> None:
        """Mark an (operation, resource) pair as credited."""
        self._credited_operations.add((operation, resource))

    @property
    def command_history(self) -> list[StepRecord]:
        """Shallow copy of the step records so far (internal list stays private)."""
        return self._history[:]

    @property
    def step_count(self) -> int:
        return self._step_counter

    @property
    def previous_progress(self) -> float:
        return self._previous_progress

    @previous_progress.setter
    def previous_progress(self, value: float) -> None:
        self._previous_progress = value
|
server/services/resource_verifier.py
ADDED
|
@@ -0,0 +1,179 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Resource verification service — queries MiniStack for ground-truth state."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import json
|
| 6 |
+
import logging
|
| 7 |
+
from typing import Any
|
| 8 |
+
|
| 9 |
+
from server.services.aws_backend import AwsBackend
|
| 10 |
+
|
| 11 |
+
logger = logging.getLogger(__name__)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def _extract_json_path(data: Any, path: str) -> Any:
|
| 15 |
+
"""Simple JSON path extractor supporting dot notation and array indexing.
|
| 16 |
+
|
| 17 |
+
Supports paths like: $.Table.ProvisionedThroughput.ReadCapacityUnits
|
| 18 |
+
$.Rules[0].Expiration.Days
|
| 19 |
+
$.Buckets[].Name
|
| 20 |
+
"""
|
| 21 |
+
parts = path.lstrip("$").lstrip(".").split(".")
|
| 22 |
+
current = data
|
| 23 |
+
for part in parts:
|
| 24 |
+
if current is None:
|
| 25 |
+
return None
|
| 26 |
+
# Handle array index like Rules[0]
|
| 27 |
+
if "[" in part:
|
| 28 |
+
key, idx_str = part.split("[", 1)
|
| 29 |
+
idx_str = idx_str.rstrip("]")
|
| 30 |
+
if key:
|
| 31 |
+
current = current.get(key) if isinstance(current, dict) else None
|
| 32 |
+
if current is None:
|
| 33 |
+
return None
|
| 34 |
+
if idx_str == "":
|
| 35 |
+
# Wildcard — return list of values
|
| 36 |
+
if isinstance(current, list):
|
| 37 |
+
remaining = ".".join(parts[parts.index(part) + 1 :])
|
| 38 |
+
if remaining:
|
| 39 |
+
return [
|
| 40 |
+
_extract_json_path(item, f"$.{remaining}")
|
| 41 |
+
for item in current
|
| 42 |
+
]
|
| 43 |
+
return current
|
| 44 |
+
return None
|
| 45 |
+
try:
|
| 46 |
+
current = current[int(idx_str)]
|
| 47 |
+
except (IndexError, TypeError):
|
| 48 |
+
return None
|
| 49 |
+
else:
|
| 50 |
+
current = current.get(part) if isinstance(current, dict) else None
|
| 51 |
+
return current
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
class ResourceVerifier:
    """Verifies resource state by querying MiniStack via AWS CLI."""

    # Lowercase service name -> name of the bound method that verifies it.
    _SERVICE_CHECKS = {
        "s3": "_check_s3_bucket",
        "dynamodb": "_check_dynamodb_table",
        "lambda": "_check_lambda_function",
        "sqs": "_check_sqs_queue",
        "sns": "_check_sns_topic",
        "iam": "_check_iam_role",
        "apigateway": "_check_apigateway",
    }

    def __init__(self, backend: AwsBackend) -> None:
        self._backend = backend

    def resource_exists(self, service: str, name: str) -> bool:
        """Check if a specific resource exists in MiniStack.

        Uses service-specific verification commands and checks for the
        exact resource name (not just any resource of that type).
        """
        method_name = self._SERVICE_CHECKS.get(service.lower())
        if method_name is None:
            logger.warning("No verifier for service: %s", service)
            return False
        return getattr(self, method_name)(name)

    def check_state(self, state_check: dict[str, Any]) -> bool:
        """Run an arbitrary command and assert on its output.

        Supports:
        - output_contains: substring check on stdout
        - json_path + expected: extract value from JSON stdout and compare
        """
        command = state_check.get("command", "")
        if not command:
            return False

        ok, stdout, _stderr = self._backend.execute_command(command)
        if not ok:
            return False

        # Substring assertion on raw stdout.
        if "output_contains" in state_check:
            if state_check["output_contains"] not in stdout:
                return False

        # JSON-path assertion on parsed stdout.
        if "json_path" in state_check and "expected" in state_check:
            try:
                payload = json.loads(stdout)
                actual = _extract_json_path(payload, state_check["json_path"])
                # Compared as strings so YAML int/str mismatches still match.
                if str(actual) != str(state_check["expected"]):
                    return False
            except (json.JSONDecodeError, KeyError, TypeError):
                return False

        return True

    # -- Shared helpers -------------------------------------------------------

    def _probe(self, command: str) -> bool:
        """Run *command* and report whether it exited successfully."""
        ok, _, _ = self._backend.execute_command(command)
        return ok

    def _run_json(self, command: str) -> Any:
        """Run *command* and return its parsed-JSON stdout, or None on failure."""
        ok, stdout, _ = self._backend.execute_command(command)
        if not ok:
            return None
        try:
            return json.loads(stdout)
        except (json.JSONDecodeError, TypeError):
            return None

    # -- Service-specific verifiers -------------------------------------------

    def _check_s3_bucket(self, name: str) -> bool:
        data = self._run_json("aws s3api list-buckets --output json")
        if data is None:
            return False
        try:
            return any(b.get("Name") == name for b in data.get("Buckets", []))
        except TypeError:
            return False

    def _check_dynamodb_table(self, name: str) -> bool:
        return self._probe(f"aws dynamodb describe-table --table-name {name}")

    def _check_lambda_function(self, name: str) -> bool:
        return self._probe(f"aws lambda get-function --function-name {name}")

    def _check_sqs_queue(self, name: str) -> bool:
        return self._probe(f"aws sqs get-queue-url --queue-name {name}")

    def _check_sns_topic(self, name: str) -> bool:
        data = self._run_json("aws sns list-topics --output json")
        if data is None:
            return False
        try:
            # Topic names only appear inside ARNs, hence the substring match.
            return any(name in t.get("TopicArn", "") for t in data.get("Topics", []))
        except TypeError:
            return False

    def _check_iam_role(self, name: str) -> bool:
        return self._probe(f"aws iam get-role --role-name {name}")

    def _check_apigateway(self, name: str) -> bool:
        data = self._run_json("aws apigateway get-rest-apis --output json")
        if data is None:
            return False
        try:
            return any(i.get("name") == name for i in data.get("items", []))
        except TypeError:
            return False
|
server/services/task_grader.py
ADDED
|
@@ -0,0 +1,250 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Task grading engine — evaluates task completion and computes shaped rewards.
|
| 2 |
+
|
| 3 |
+
All rewards are in the [0.0, 1.0] range. Only full task completion yields 1.0.
|
| 4 |
+
Includes anti-reward-hacking defenses.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
import logging
|
| 10 |
+
|
| 11 |
+
from pydantic import BaseModel, Field
|
| 12 |
+
|
| 13 |
+
from models import SuccessCriteria, Task
|
| 14 |
+
from server.services.aws_backend import AwsBackend
|
| 15 |
+
from server.services.episode_tracker import EpisodeTracker, StepRecord
|
| 16 |
+
from server.services.resource_verifier import ResourceVerifier
|
| 17 |
+
|
| 18 |
+
logger = logging.getLogger(__name__)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class GradeResult(BaseModel):
    """Outcome of grading a single step."""

    # True only when the task's success criteria are fully satisfied.
    task_achieved: bool = False
    # Fraction of the task completed so far; pydantic enforces [0.0, 1.0].
    partial_progress: float = Field(default=0.0, ge=0.0, le=1.0)
    # Shaped reward for this step; only full completion yields 1.0.
    reward: float = Field(default=0.0, ge=0.0, le=1.0)
    # Human-readable explanation of how the grade was computed.
    reason: str = ""
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class TaskGrader:
    """Evaluates task completion and computes shaped rewards.

    Dispatches to different grading strategies based on which fields
    are populated on the task's ``SuccessCriteria``:

    - ``state_checks``    -> expert/SRE end-state verification
    - ``steps``           -> ordered multi-step completion
    - ``resource_exists`` -> single-resource creation
    - ``command_contains``-> warmup command matching
    """

    def __init__(self, backend: AwsBackend) -> None:
        # The verifier queries MiniStack for ground-truth resource state.
        self._verifier = ResourceVerifier(backend)

    def grade(
        self,
        task: Task,
        tracker: EpisodeTracker,
        latest_step: StepRecord,
    ) -> GradeResult:
        """Grade the episode after *latest_step* and return a GradeResult.

        Also computes the shaped reward and advances the tracker's
        monotonic progress watermark as side effects.
        """
        criteria = task.success_criteria

        # Dispatch based on populated criteria fields (most specific first)
        if criteria.state_checks:
            result = self._grade_state_checks(criteria, tracker)
        elif criteria.steps:
            result = self._grade_multi_step(criteria, tracker)
        elif criteria.resource_exists is not None:
            result = self._grade_resource_creation(criteria, latest_step)
        elif criteria.command_contains is not None:
            result = self._grade_command_match(criteria, latest_step)
        else:
            result = GradeResult(reason="no recognised success_criteria fields")

        # Compute shaped reward
        result.reward = self._compute_reward(result, latest_step, tracker)

        # Update tracker's previous progress (monotonic — never decrease)
        if result.partial_progress > tracker.previous_progress:
            tracker.previous_progress = result.partial_progress

        return result

    # -- Grading strategies ---------------------------------------------------

    def _grade_command_match(
        self, criteria: SuccessCriteria, latest_step: StepRecord
    ) -> GradeResult:
        """Warmup: check the latest command matches expected service + operation."""
        cmd = latest_step.command.lower()
        contains = (criteria.command_contains or "").lower()
        operation = (criteria.operation or "").lower()

        # Empty criteria strings never match — an unset field must not be
        # satisfied vacuously.
        contains_ok = contains != "" and contains in cmd
        operation_ok = operation != "" and operation in cmd
        succeeded = latest_step.success
        achieved = contains_ok and operation_ok and succeeded

        return GradeResult(
            task_achieved=achieved,
            partial_progress=1.0 if achieved else 0.0,
            reason=(
                f"command_match: contains={contains_ok}, "
                f"op={operation_ok}, success={succeeded}"
            ),
        )

    def _grade_resource_creation(
        self,
        criteria: SuccessCriteria,
        latest_step: StepRecord,
    ) -> GradeResult:
        """Beginner: verify the resource actually exists in MiniStack."""
        re_spec = criteria.resource_exists
        assert re_spec is not None  # guaranteed by the dispatch in grade()
        service = re_spec.service
        name = re_spec.name

        # Ground truth: does the named resource exist in the backend?
        exists = self._verifier.resource_exists(service, name)

        # Command matching gives partial credit (0.5). Only criteria fields
        # that are actually populated must match; if neither is populated no
        # partial credit is awarded. (Previously "" in cmd was vacuously
        # True, so ANY successful command earned 0.5 when the criteria
        # strings were unset — a reward-hacking vector, and inconsistent
        # with the explicit != "" guards in _grade_command_match.)
        cmd = latest_step.command.lower()
        required = [
            c.lower()
            for c in (criteria.command_contains, criteria.operation)
            if c
        ]
        cmd_ok = (
            bool(required)
            and all(token in cmd for token in required)
            and latest_step.success
        )

        if exists:
            progress = 1.0
        elif cmd_ok:
            progress = 0.5
        else:
            progress = 0.0

        return GradeResult(
            task_achieved=exists,
            partial_progress=progress,
            reason=(
                f"resource_creation: exists={exists}, "
                f"cmd_ok={cmd_ok}, service={service}, name={name}"
            ),
        )

    def _grade_multi_step(
        self, criteria: SuccessCriteria, tracker: EpisodeTracker
    ) -> GradeResult:
        """Intermediate/Advanced: check ordered step completion.

        Progress counts the contiguous prefix of completed steps; a later
        step completed out of order earns nothing until its predecessors
        are done.
        """
        steps = criteria.steps
        if not steps:
            return GradeResult(reason="empty steps list")

        completed = 0
        for step in steps:
            if tracker.has_executed_operation(step.operation, step.resource):
                completed += 1
            else:
                break  # ordered — stop at first incomplete step

        total = len(steps)
        progress = completed / total if total > 0 else 0.0

        # For advanced tasks with a services requirement, also check that
        # every required service has been used by a successful command.
        services_required = criteria.services
        services_met = all(
            tracker.has_used_service(svc) for svc in services_required
        )

        achieved = completed == total and (not services_required or services_met)

        return GradeResult(
            task_achieved=achieved,
            partial_progress=progress,
            reason=(
                f"multi_step: {completed}/{total} steps, "
                f"services_met={services_met if services_required else 'n/a'}"
            ),
        )

    def _grade_state_checks(
        self, criteria: SuccessCriteria, tracker: EpisodeTracker
    ) -> GradeResult:
        """Expert/SRE: verify end-state via arbitrary commands.

        state_checks are the source of truth for task completion.
        steps (if present) provide partial progress signals only.
        """
        state_checks = criteria.state_checks
        steps = criteria.steps

        # Evaluate state checks (ground truth)
        checks_passed = 0
        for check in state_checks:
            check_dict = check.model_dump(exclude_none=True)
            if self._verifier.check_state(check_dict):
                checks_passed += 1

        total_checks = len(state_checks)
        all_checks_pass = checks_passed == total_checks and total_checks > 0

        # Evaluate the ordered step prefix for a dense progress signal
        steps_completed = 0
        for step in steps:
            if tracker.has_executed_operation(step.operation, step.resource):
                steps_completed += 1
            else:
                break

        total_steps = len(steps)
        if total_steps > 0:
            step_progress = steps_completed / total_steps
        else:
            step_progress = 0.0

        # Weight: steps give up to 0.7, state checks give the remaining 0.3
        if total_checks > 0:
            check_progress = checks_passed / total_checks
            progress = step_progress * 0.7 + check_progress * 0.3
        else:
            progress = step_progress

        # Check services requirement
        services_required = criteria.services
        services_met = all(
            tracker.has_used_service(svc) for svc in services_required
        )

        # Task achieved only when ALL state checks pass
        achieved = all_checks_pass and (not services_required or services_met)

        return GradeResult(
            task_achieved=achieved,
            partial_progress=min(progress, 1.0),
            reason=(
                f"state_checks: {checks_passed}/{total_checks} passed, "
                f"steps: {steps_completed}/{total_steps}, "
                f"services_met={services_met if services_required else 'n/a'}"
            ),
        )

    # -- Reward shaping -------------------------------------------------------

    def _compute_reward(
        self,
        result: GradeResult,
        latest_step: StepRecord,
        tracker: EpisodeTracker,
    ) -> float:
        """Compute a shaped reward in [0.0, 1.0].

        Full completion yields exactly 1.0. Otherwise: partial progress is
        scaled into [0.0, 0.8], a +0.1 bonus is added when progress newly
        advanced, a failed command halves the total, and the result is
        clamped below 1.0 so only true completion can reach it.
        """
        if result.task_achieved:
            return 1.0

        # Base: partial progress scaled to 0.0–0.8 range
        progress_reward = result.partial_progress * 0.8

        # Bonus for advancing progress (dense signal)
        progress_delta = result.partial_progress - tracker.previous_progress
        if progress_delta > 0:
            progress_reward += 0.1

        # Penalty for failed commands
        if not latest_step.success:
            progress_reward *= 0.5

        # Clamp to [0.0, 0.99] — never reach 1.0 without achieving
        return min(max(progress_reward, 0.0), 0.99)
|
server/services/tasks/advanced.yaml
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
- task_id: 15
|
| 2 |
+
description: >
|
| 3 |
+
Create a Lambda function 'processor' with an IAM execution role,
|
| 4 |
+
then create an SQS queue 'work-items' and configure it as an
|
| 5 |
+
event source for the Lambda function.
|
| 6 |
+
success_criteria:
|
| 7 |
+
services:
|
| 8 |
+
- iam
|
| 9 |
+
- lambda
|
| 10 |
+
- sqs
|
| 11 |
+
steps:
|
| 12 |
+
- operation: create-role
|
| 13 |
+
- operation: create-function
|
| 14 |
+
resource: processor
|
| 15 |
+
- operation: create-queue
|
| 16 |
+
resource: work-items
|
| 17 |
+
- operation: create-event-source-mapping
|
| 18 |
+
|
| 19 |
+
- task_id: 16
|
| 20 |
+
description: >
|
| 21 |
+
Deploy a serverless API: create a DynamoDB table 'products',
|
| 22 |
+
create an IAM role for Lambda, create a Lambda function 'product-api',
|
| 23 |
+
and set up an API Gateway REST API with a GET method on /products
|
| 24 |
+
integrated with the Lambda.
|
| 25 |
+
success_criteria:
|
| 26 |
+
services:
|
| 27 |
+
- dynamodb
|
| 28 |
+
- iam
|
| 29 |
+
- lambda
|
| 30 |
+
- apigateway
|
| 31 |
+
steps:
|
| 32 |
+
- operation: create-table
|
| 33 |
+
resource: products
|
| 34 |
+
- operation: create-role
|
| 35 |
+
- operation: create-function
|
| 36 |
+
resource: product-api
|
| 37 |
+
- operation: create-rest-api
|
| 38 |
+
- operation: create-resource
|
| 39 |
+
- operation: put-method
|
| 40 |
+
- operation: put-integration
|
| 41 |
+
|
| 42 |
+
- task_id: 17
|
| 43 |
+
description: >
|
| 44 |
+
Build a fan-out notification system: create an SNS topic 'order-events',
|
| 45 |
+
create two SQS queues 'shipping-queue' and 'billing-queue',
|
| 46 |
+
subscribe both queues to the SNS topic, then publish a test message.
|
| 47 |
+
success_criteria:
|
| 48 |
+
services:
|
| 49 |
+
- sns
|
| 50 |
+
- sqs
|
| 51 |
+
steps:
|
| 52 |
+
- operation: create-topic
|
| 53 |
+
resource: order-events
|
| 54 |
+
- operation: create-queue
|
| 55 |
+
resource: shipping-queue
|
| 56 |
+
- operation: create-queue
|
| 57 |
+
resource: billing-queue
|
| 58 |
+
- operation: subscribe
|
| 59 |
+
- operation: subscribe
|
| 60 |
+
- operation: publish
|
server/services/tasks/beginner.yaml
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
- task_id: 6
|
| 2 |
+
description: Create an S3 bucket named 'my-test-bucket'.
|
| 3 |
+
success_criteria:
|
| 4 |
+
command_contains: s3api
|
| 5 |
+
operation: create-bucket
|
| 6 |
+
resource_exists:
|
| 7 |
+
service: s3
|
| 8 |
+
name: my-test-bucket
|
| 9 |
+
|
| 10 |
+
- task_id: 7
|
| 11 |
+
description: Create a DynamoDB table named 'users' with a partition key 'user_id' (String type).
|
| 12 |
+
success_criteria:
|
| 13 |
+
command_contains: dynamodb
|
| 14 |
+
operation: create-table
|
| 15 |
+
resource_exists:
|
| 16 |
+
service: dynamodb
|
| 17 |
+
name: users
|
| 18 |
+
|
| 19 |
+
- task_id: 8
|
| 20 |
+
description: Create an SQS queue named 'task-queue'.
|
| 21 |
+
success_criteria:
|
| 22 |
+
command_contains: sqs
|
| 23 |
+
operation: create-queue
|
| 24 |
+
resource_exists:
|
| 25 |
+
service: sqs
|
| 26 |
+
name: task-queue
|
| 27 |
+
|
| 28 |
+
- task_id: 9
|
| 29 |
+
description: Create an SNS topic named 'notifications'.
|
| 30 |
+
success_criteria:
|
| 31 |
+
command_contains: sns
|
| 32 |
+
operation: create-topic
|
| 33 |
+
resource_exists:
|
| 34 |
+
service: sns
|
| 35 |
+
name: notifications
|
| 36 |
+
|
| 37 |
+
- task_id: 10
|
| 38 |
+
description: Create a Lambda function named 'hello-world' using the python3.12 runtime.
|
| 39 |
+
success_criteria:
|
| 40 |
+
command_contains: lambda
|
| 41 |
+
operation: create-function
|
| 42 |
+
resource_exists:
|
| 43 |
+
service: lambda
|
| 44 |
+
name: hello-world
|
server/services/tasks/expert.yaml
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
- task_id: 18
|
| 2 |
+
description: >
|
| 3 |
+
SRE Incident: A Lambda function 'order-processor' exists but its IAM role
|
| 4 |
+
is missing the required SQS permissions. The function's event source mapping
|
| 5 |
+
to the 'incoming-orders' SQS queue is failing. Diagnose the issue, attach
|
| 6 |
+
the correct SQS policy to the role, and create the event source mapping.
|
| 7 |
+
setup_commands:
|
| 8 |
+
- >-
|
| 9 |
+
aws iam create-role --role-name broken-lambda-role
|
| 10 |
+
--assume-role-policy-document '{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Principal":{"Service":"lambda.amazonaws.com"},"Action":"sts:AssumeRole"}]}'
|
| 11 |
+
- >-
|
| 12 |
+
aws iam attach-role-policy --role-name broken-lambda-role
|
| 13 |
+
--policy-arn arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole
|
| 14 |
+
- >-
|
| 15 |
+
aws lambda create-function --function-name order-processor
|
| 16 |
+
--runtime python3.12 --handler index.handler
|
| 17 |
+
--role arn:aws:iam::000000000000:role/broken-lambda-role
|
| 18 |
+
--zip-file fileb:///tmp/dummy.zip
|
| 19 |
+
- aws sqs create-queue --queue-name incoming-orders
|
| 20 |
+
success_criteria:
|
| 21 |
+
services:
|
| 22 |
+
- iam
|
| 23 |
+
- lambda
|
| 24 |
+
- sqs
|
| 25 |
+
state_checks:
|
| 26 |
+
- command: aws iam list-attached-role-policies --role-name broken-lambda-role
|
| 27 |
+
output_contains: "SQS"
|
| 28 |
+
- command: aws lambda list-event-source-mappings --function-name order-processor
|
| 29 |
+
output_contains: "incoming-orders"
|
| 30 |
+
steps:
|
| 31 |
+
- operation: attach-role-policy
|
| 32 |
+
resource: broken-lambda-role
|
| 33 |
+
- operation: create-event-source-mapping
|
| 34 |
+
|
| 35 |
+
- task_id: 19
|
| 36 |
+
description: >
|
| 37 |
+
SRE Incident: An S3 bucket 'app-config-store' was created to host
|
| 38 |
+
configuration files, but versioning was never enabled. A recent
|
| 39 |
+
accidental overwrite lost critical config. Enable versioning on the
|
| 40 |
+
bucket and add a lifecycle rule named 'cleanup-old-versions' that
|
| 41 |
+
expires non-current object versions after 30 days.
|
| 42 |
+
setup_commands:
|
| 43 |
+
- aws s3api create-bucket --bucket app-config-store
|
| 44 |
+
- >-
|
| 45 |
+
aws s3api put-object --bucket app-config-store
|
| 46 |
+
--key config/app.json --body /dev/null
|
| 47 |
+
success_criteria:
|
| 48 |
+
services:
|
| 49 |
+
- s3
|
| 50 |
+
state_checks:
|
| 51 |
+
- command: aws s3api get-bucket-versioning --bucket app-config-store
|
| 52 |
+
output_contains: "Enabled"
|
| 53 |
+
- command: aws s3api get-bucket-lifecycle-configuration --bucket app-config-store
|
| 54 |
+
output_contains: "cleanup-old-versions"
|
| 55 |
+
steps:
|
| 56 |
+
- operation: put-bucket-versioning
|
| 57 |
+
resource: app-config-store
|
| 58 |
+
- operation: put-bucket-lifecycle-configuration
|
| 59 |
+
resource: app-config-store
|
| 60 |
+
|
| 61 |
+
- task_id: 20
|
| 62 |
+
description: >
|
| 63 |
+
SRE Incident: A DynamoDB table 'session-store' is experiencing throttling
|
| 64 |
+
because it was provisioned with only 1 RCU and 1 WCU. An SNS topic
|
| 65 |
+
'ops-alerts' exists but has no subscriptions, so no one is being notified.
|
| 66 |
+
Fix the table by updating its throughput to 50 RCU and 50 WCU, then create
|
| 67 |
+
an SQS queue 'ops-alert-inbox' and subscribe it to the SNS topic.
|
| 68 |
+
setup_commands:
|
| 69 |
+
- >-
|
| 70 |
+
aws dynamodb create-table --table-name session-store
|
| 71 |
+
--attribute-definitions AttributeName=session_id,AttributeType=S
|
| 72 |
+
--key-schema AttributeName=session_id,KeyType=HASH
|
| 73 |
+
--provisioned-throughput ReadCapacityUnits=1,WriteCapacityUnits=1
|
| 74 |
+
- aws sns create-topic --name ops-alerts
|
| 75 |
+
success_criteria:
|
| 76 |
+
services:
|
| 77 |
+
- dynamodb
|
| 78 |
+
- sns
|
| 79 |
+
- sqs
|
| 80 |
+
state_checks:
|
| 81 |
+
- command: aws dynamodb describe-table --table-name session-store
|
| 82 |
+
json_path: "$.Table.ProvisionedThroughput.ReadCapacityUnits"
|
| 83 |
+
expected: 50
|
| 84 |
+
- command: aws dynamodb describe-table --table-name session-store
|
| 85 |
+
json_path: "$.Table.ProvisionedThroughput.WriteCapacityUnits"
|
| 86 |
+
expected: 50
|
| 87 |
+
- command: >-
|
| 88 |
+
aws sns list-subscriptions-by-topic
|
| 89 |
+
--topic-arn arn:aws:sns:us-east-1:000000000000:ops-alerts
|
| 90 |
+
output_contains: "sqs"
|
| 91 |
+
steps:
|
| 92 |
+
- operation: update-table
|
| 93 |
+
resource: session-store
|
| 94 |
+
- operation: create-queue
|
| 95 |
+
resource: ops-alert-inbox
|
| 96 |
+
- operation: subscribe
|
| 97 |
+
resource: ops-alerts
|
server/services/tasks/intermediate.yaml
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
- task_id: 11
|
| 2 |
+
description: Create an S3 bucket named 'data-pipeline' and upload a file to it.
|
| 3 |
+
success_criteria:
|
| 4 |
+
steps:
|
| 5 |
+
- operation: create-bucket
|
| 6 |
+
resource: data-pipeline
|
| 7 |
+
- operation: put-object
|
| 8 |
+
resource: data-pipeline
|
| 9 |
+
|
| 10 |
+
- task_id: 12
|
| 11 |
+
description: >
|
| 12 |
+
Create a DynamoDB table named 'orders' with partition key 'order_id' (S),
|
| 13 |
+
then insert an item with order_id '001' and status 'pending'.
|
| 14 |
+
success_criteria:
|
| 15 |
+
steps:
|
| 16 |
+
- operation: create-table
|
| 17 |
+
resource: orders
|
| 18 |
+
- operation: put-item
|
| 19 |
+
resource: orders
|
| 20 |
+
|
| 21 |
+
- task_id: 13
|
| 22 |
+
description: >
|
| 23 |
+
Create an SNS topic named 'alerts', then create an SQS queue named
|
| 24 |
+
'alert-inbox' and subscribe the queue to the topic.
|
| 25 |
+
success_criteria:
|
| 26 |
+
steps:
|
| 27 |
+
- operation: create-topic
|
| 28 |
+
resource: alerts
|
| 29 |
+
- operation: create-queue
|
| 30 |
+
resource: alert-inbox
|
| 31 |
+
- operation: subscribe
|
| 32 |
+
resource: alerts
|
| 33 |
+
|
| 34 |
+
- task_id: 14
|
| 35 |
+
description: >
|
| 36 |
+
Create an IAM role named 'lambda-exec-role' with an assume-role policy
|
| 37 |
+
for Lambda, then attach the AWSLambdaBasicExecutionRole managed policy to it.
|
| 38 |
+
success_criteria:
|
| 39 |
+
steps:
|
| 40 |
+
- operation: create-role
|
| 41 |
+
resource: lambda-exec-role
|
| 42 |
+
- operation: attach-role-policy
|
| 43 |
+
resource: lambda-exec-role
|
server/services/tasks/warmup.yaml
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
- task_id: 0
|
| 2 |
+
description: List all S3 buckets in the environment.
|
| 3 |
+
success_criteria:
|
| 4 |
+
command_contains: s3
|
| 5 |
+
operation: ls
|
| 6 |
+
|
| 7 |
+
- task_id: 1
|
| 8 |
+
description: Describe all EC2 instances in the environment.
|
| 9 |
+
success_criteria:
|
| 10 |
+
command_contains: ec2
|
| 11 |
+
operation: describe-instances
|
| 12 |
+
|
| 13 |
+
- task_id: 2
|
| 14 |
+
description: List all DynamoDB tables.
|
| 15 |
+
success_criteria:
|
| 16 |
+
command_contains: dynamodb
|
| 17 |
+
operation: list-tables
|
| 18 |
+
|
| 19 |
+
- task_id: 3
|
| 20 |
+
description: List all Lambda functions.
|
| 21 |
+
success_criteria:
|
| 22 |
+
command_contains: lambda
|
| 23 |
+
operation: list-functions
|
| 24 |
+
|
| 25 |
+
- task_id: 4
|
| 26 |
+
description: List all SQS queues in the environment.
|
| 27 |
+
success_criteria:
|
| 28 |
+
command_contains: sqs
|
| 29 |
+
operation: list-queues
|
| 30 |
+
|
| 31 |
+
- task_id: 5
|
| 32 |
+
description: List all SNS topics in the environment.
|
| 33 |
+
success_criteria:
|
| 34 |
+
command_contains: sns
|
| 35 |
+
operation: list-topics
|
uv.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|