final
- .claude/settings.json +19 -0
- Dockerfile +2 -21
- Pre_Validation_Script.sh +185 -0
- README.md +103 -139
- Sample_Inference_Script.py +187 -0
- app.py +21 -25
- baseline.py +0 -309
- data.py +5 -82
- environment.py +149 -347
- graders.py +225 -172
- inference.py +262 -323
- models.py +54 -28
- openenv.yaml +0 -8
- requirements.txt +0 -1
- test_integration.py +92 -96
- tests_new.py +175 -164
.claude/settings.json
ADDED
@@ -0,0 +1,19 @@
+{
+  "permissions": {
+    "allow": [
+      "Bash(python -m pytest tests_new.py -v)",
+      "Bash(pip install:*)",
+      "Bash(pytest tests_new.py -v)",
+      "Bash(/var/data/python/bin/pytest tests_new.py -v)",
+      "Bash(python test_integration.py)",
+      "Bash(docker build:*)",
+      "Bash(/var/data/python/bin/pytest tests_new.py -v --tb=short)",
+      "WebFetch(domain:exploring-solver-openenv-solvor.hf.space)",
+      "Bash(curl -s -o /tmp/reset_empty.json -w 'HTTP %{http_code}' -X POST -H 'Content-Type: application/json' -d '{}' https://exploring-solver-openenv-solvor.hf.space/reset --max-time 30)",
+      "Read(//tmp/**)",
+      "Bash(curl -s -X POST -H 'Content-Type: application/json' -d '{\"task_id\":\"task1\"}' https://exploring-solver-openenv-solvor.hf.space/reset --max-time 30)",
+      "Bash(python3 -m json.tool)",
+      "Bash(python3 test_integration.py)"
+    ]
+  }
+}
Dockerfile
CHANGED
@@ -1,42 +1,23 @@
 # ---------------------------------------------------------------
-#
+# SupportEnv — Hugging Face Spaces Docker container
 # Space SDK: Docker | Port: 7860
 # ---------------------------------------------------------------
 FROM python:3.11-slim
 
-# Install system utilities for DevOps tasks
-RUN apt-get update && apt-get install -y --no-install-recommends \
-    nginx \
-    docker.io \
-    systemctl \
-    curl \
-    git \
-    vim \
-    && rm -rf /var/lib/apt/lists/*
-
-# Create non-root user for Hugging Face Spaces
 RUN useradd -m -u 1000 appuser
 
 WORKDIR /app
 
-# Install Python dependencies first (layer caching)
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 
-# Copy application code
 COPY --chown=appuser:appuser . .
 
-# HF Spaces compatibility
-RUN chmod +x /app/app.py 2>/dev/null || true
-
 USER appuser
 
-# Expose the port HF Spaces expects
 EXPOSE 7860
 
-# Health check
 HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
     CMD python -c "import requests; requests.get('http://localhost:7860/health')" || exit 1
 
-
-CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1", "--reload"]
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]
Pre_Validation_Script.sh
ADDED
@@ -0,0 +1,185 @@
+#!/usr/bin/env bash
+#
+# validate-submission.sh — OpenEnv Submission Validator
+#
+# Checks that your HF Space is live, Docker image builds, and openenv validate passes.
+#
+# Prerequisites:
+#   - Docker: https://docs.docker.com/get-docker/
+#   - openenv-core: pip install openenv-core
+#   - curl (usually pre-installed)
+#
+# Run:
+#   curl -fsSL https://raw.githubusercontent.com/<owner>/<repo>/main/scripts/validate-submission.sh | bash -s -- <ping_url> [repo_dir]
+#
+# Or download and run locally:
+#   chmod +x validate-submission.sh
+#   ./validate-submission.sh <ping_url> [repo_dir]
+#
+# Arguments:
+#   ping_url   Your HuggingFace Space URL (e.g. https://your-space.hf.space)
+#   repo_dir   Path to your repo (default: current directory)
+#
+# Examples:
+#   ./validate-submission.sh https://my-team.hf.space
+#   ./validate-submission.sh https://my-team.hf.space ./my-repo
+#
+
+set -uo pipefail
+
+DOCKER_BUILD_TIMEOUT=600
+if [ -t 1 ]; then
+  RED='\033[0;31m'
+  GREEN='\033[0;32m'
+  YELLOW='\033[1;33m'
+  BOLD='\033[1m'
+  NC='\033[0m'
+else
+  RED='' GREEN='' YELLOW='' BOLD='' NC=''
+fi
+
+run_with_timeout() {
+  local secs="$1"; shift
+  if command -v timeout &>/dev/null; then
+    timeout "$secs" "$@"
+  elif command -v gtimeout &>/dev/null; then
+    gtimeout "$secs" "$@"
+  else
+    "$@" &
+    local pid=$!
+    ( sleep "$secs" && kill "$pid" 2>/dev/null ) &
+    local watcher=$!
+    wait "$pid" 2>/dev/null
+    local rc=$?
+    kill "$watcher" 2>/dev/null
+    wait "$watcher" 2>/dev/null
+    return $rc
+  fi
+}
+
+portable_mktemp() {
+  local prefix="${1:-validate}"
+  mktemp "${TMPDIR:-/tmp}/${prefix}-XXXXXX" 2>/dev/null || mktemp
+}
+
+CLEANUP_FILES=()
+cleanup() { rm -f "${CLEANUP_FILES[@]+"${CLEANUP_FILES[@]}"}"; }
+trap cleanup EXIT
+
+PING_URL="${1:-}"
+REPO_DIR="${2:-.}"
+
+if [ -z "$PING_URL" ]; then
+  printf "Usage: %s <ping_url> [repo_dir]\n" "$0"
+  printf "\n"
+  printf "  ping_url   Your HuggingFace Space URL (e.g. https://your-space.hf.space)\n"
+  printf "  repo_dir   Path to your repo (default: current directory)\n"
+  exit 1
+fi
+
+if ! REPO_DIR="$(cd "$REPO_DIR" 2>/dev/null && pwd)"; then
+  printf "Error: directory '%s' not found\n" "${2:-.}"
+  exit 1
+fi
+PING_URL="${PING_URL%/}"
+export PING_URL
+PASS=0
+
+log()  { printf "[%s] %b\n" "$(date -u +%H:%M:%S)" "$*"; }
+pass() { log "${GREEN}PASSED${NC} -- $1"; PASS=$((PASS + 1)); }
+fail() { log "${RED}FAILED${NC} -- $1"; }
+hint() { printf "  ${YELLOW}Hint:${NC} %b\n" "$1"; }
+stop_at() {
+  printf "\n"
+  printf "${RED}${BOLD}Validation stopped at %s.${NC} Fix the above before continuing.\n" "$1"
+  exit 1
+}
+
+printf "\n"
+printf "${BOLD}========================================${NC}\n"
+printf "${BOLD}  OpenEnv Submission Validator${NC}\n"
+printf "${BOLD}========================================${NC}\n"
+log "Repo:     $REPO_DIR"
+log "Ping URL: $PING_URL"
+printf "\n"
+
+log "${BOLD}Step 1/3: Pinging HF Space${NC} ($PING_URL/reset) ..."
+
+CURL_OUTPUT=$(portable_mktemp "validate-curl")
+CLEANUP_FILES+=("$CURL_OUTPUT")
+HTTP_CODE=$(curl -s -o "$CURL_OUTPUT" -w "%{http_code}" -X POST \
+  -H "Content-Type: application/json" -d '{}' \
+  "$PING_URL/reset" --max-time 30 2>"$CURL_OUTPUT" || printf "000")
+
+if [ "$HTTP_CODE" = "200" ]; then
+  pass "HF Space is live and responds to /reset"
+elif [ "$HTTP_CODE" = "000" ]; then
+  fail "HF Space not reachable (connection failed or timed out)"
+  hint "Check your network connection and that the Space is running."
+  hint "Try: curl -s -o /dev/null -w '%%{http_code}' -X POST $PING_URL/reset"
+  stop_at "Step 1"
+else
+  fail "HF Space /reset returned HTTP $HTTP_CODE (expected 200)"
+  hint "Make sure your Space is running and the URL is correct."
+  hint "Try opening $PING_URL in your browser first."
+  stop_at "Step 1"
+fi
+
+log "${BOLD}Step 2/3: Running docker build${NC} ..."
+
+if ! command -v docker &>/dev/null; then
+  fail "docker command not found"
+  hint "Install Docker: https://docs.docker.com/get-docker/"
+  stop_at "Step 2"
+fi
+
+if [ -f "$REPO_DIR/Dockerfile" ]; then
+  DOCKER_CONTEXT="$REPO_DIR"
+elif [ -f "$REPO_DIR/server/Dockerfile" ]; then
+  DOCKER_CONTEXT="$REPO_DIR/server"
+else
+  fail "No Dockerfile found in repo root or server/ directory"
+  stop_at "Step 2"
+fi
+
+log "  Found Dockerfile in $DOCKER_CONTEXT"
+
+BUILD_OK=false
+BUILD_OUTPUT=$(run_with_timeout "$DOCKER_BUILD_TIMEOUT" docker build "$DOCKER_CONTEXT" 2>&1) && BUILD_OK=true
+
+if [ "$BUILD_OK" = true ]; then
+  pass "Docker build succeeded"
+else
+  fail "Docker build failed (timeout=${DOCKER_BUILD_TIMEOUT}s)"
+  printf "%s\n" "$BUILD_OUTPUT" | tail -20
+  stop_at "Step 2"
+fi
+
+log "${BOLD}Step 3/3: Running openenv validate${NC} ..."
+
+if ! command -v openenv &>/dev/null; then
+  fail "openenv command not found"
+  hint "Install it: pip install openenv-core"
+  stop_at "Step 3"
+fi
+
+VALIDATE_OK=false
+VALIDATE_OUTPUT=$(cd "$REPO_DIR" && openenv validate 2>&1) && VALIDATE_OK=true
+
+if [ "$VALIDATE_OK" = true ]; then
+  pass "openenv validate passed"
+  [ -n "$VALIDATE_OUTPUT" ] && log "  $VALIDATE_OUTPUT"
+else
+  fail "openenv validate failed"
+  printf "%s\n" "$VALIDATE_OUTPUT"
+  stop_at "Step 3"
+fi
+
+printf "\n"
+printf "${BOLD}========================================${NC}\n"
+printf "${GREEN}${BOLD}  All 3/3 checks passed!${NC}\n"
+printf "${GREEN}${BOLD}  Your submission is ready to submit.${NC}\n"
+printf "${BOLD}========================================${NC}\n"
+printf "\n"
+
+exit 0
README.md
CHANGED
@@ -1,161 +1,125 @@
-title:
-emoji:
+---
+title: SupportEnv
+emoji: 🎫
 colorFrom: blue
-colorTo:
+colorTo: indigo
 sdk: docker
 app_port: 7860
 tags:
 - openenv
+- customer-support
+- nlp
+- ticket-triage
 - agent-evaluation
 pinned: false
 ---

-#
-Invoke-WebRequest -Uri "http://127.0.0.1:7860/health" -UseBasicParsing
-If working, response includes status: healthy.
-### 4) Run built-in integration test
-python test_integration.py
-If working, you should see all 3 tasks run and a final success message.
-## Minimal API Example (Normal)
-PowerShell example:
-$reset = Invoke-WebRequest -Uri "http://127.0.0.1:7860/reset" -Method POST -ContentType "application/json" -Body '{"task_id":"task1"}' | Select-Object -ExpandProperty Content | ConvertFrom-Json
-$episodeId = $reset.episode_id
-$step = @{
-    episode_id = $episodeId
-    action = @{
-        action_type = "bash_cmd"
-        command = "systemctl restart nginx"
-    }
-}
-python inference.py --task task1 --model gpt-4o-mini
-$env:OPENAI_BASE_URL = "https://generativelanguage.googleapis.com/v1beta/openai/"
-python inference.py --task task1 --model gemini-2.5-flash
-docker
-Run:
-docker run -p 7860:7860 devopsenv
-Then open:
-- http://127.0.0.1:7860/health
-- http://127.0.0.1:7860/docs
-## Project Files
-- app.py: FastAPI API
-- environment.py: episode logic and simulator
-- graders.py: deterministic scoring
-- data.py: task metadata
-- models.py: Pydantic schemas
-- inference.py: LLM baseline runner
-- test_integration.py: local end-to-end check
-## Troubleshooting
-- Port already in use:
-  - change server port or stop old process.
-- 400/404 from API:
-  - check episode_id and task_id values.
-- LLM errors:
-  - verify API key, model name, and OPENAI_BASE_URL for Gemini.

+# SupportEnv
+
+SupportEnv is an OpenEnv-compliant environment for evaluating LLM agents on customer support ticket triage. Each episode presents a realistic support ticket and asks the agent to classify, extract, or resolve it — scored deterministically against ground-truth labels.
+
+## Tasks
+
+| Task | Difficulty | Action | Max Steps |
+|------|-----------|--------|-----------|
+| Task 1 — Ticket Classification | Easy | `classify` | 3 |
+| Task 2 — Information Extraction | Medium | `extract` | 5 |
+| Task 3 — Resolution Generation | Hard | `respond` | 8 |
+
+**Task 1 — Ticket Classification (Easy)**
+Assign a `category` (billing / technical / account / feature_request / complaint / general) and `priority` (low / medium / high / critical) to each ticket.
+
+**Task 2 — Information Extraction (Medium)**
+Extract structured entities (IDs, names, amounts, dates) and identify the list of required resolution actions.
+
+**Task 3 — Resolution Generation (Hard)**
+Write a professional customer-facing response and an ordered list of internal resolution steps. Graded on keyword coverage, step completeness, tone adherence, and minimum length.
+
+## API
+
+| Method | Endpoint | Description |
+|--------|----------|-------------|
+| `POST` | `/reset` | Start a new episode |
+| `POST` | `/step` | Submit an action |
+| `GET` | `/state` | Get current episode state |
+| `POST` | `/grader` | Grade a finished episode |
+| `GET` | `/tasks` | List all tasks |
+| `GET` | `/health` | Liveness check |
+| `GET` | `/docs` | OpenAPI docs |
+
+### Reset
+```json
+POST /reset
+{"task_id": "task1", "ticket_index": 0}
+```
+
+### Step — Task 1 (classify)
+```json
+POST /step
+{
+  "episode_id": "<id>",
+  "action": {"action_type": "classify", "category": "billing", "priority": "high"}
+}
+```
+
+### Step — Task 2 (extract)
+```json
+POST /step
+{
+  "episode_id": "<id>",
+  "action": {
+    "action_type": "extract",
+    "extracted_entities": {"customer_name": "Alice", "invoice_number": "INV-001"},
+    "required_actions": ["issue_refund", "send_corrected_invoice"]
+  }
+}
+```
+
+### Step — Task 3 (respond)
+```json
+POST /step
+{
+  "episode_id": "<id>",
+  "action": {
+    "action_type": "respond",
+    "response_text": "Dear customer, we sincerely apologize...",
+    "resolution_steps": ["verify_account", "issue_refund", "send_confirmation"]
+  }
+}
+```
+
+### Submit
+```json
+POST /step
+{"episode_id": "<id>", "action": {"action_type": "submit"}}
+```
+
+## Scoring
+
+**Task 1:** category match (0.50) + priority match (0.40) + efficiency (0.10)
+
+**Task 2:** entity coverage (0.60) + action coverage (0.30) + no hallucination (0.10)
+
+**Task 3:** keyword coverage (0.30) + step coverage (0.30) + tone compliance (0.25) + length adequate (0.10) + non-empty steps (0.05)
+
+## Running Locally
+
+```bash
+pip install -r requirements.txt
+uvicorn app:app --host 0.0.0.0 --port 7860
+```
+
+## Running the Baseline Agent
+
+```bash
+export HF_TOKEN=your_token_here
+export MODEL_NAME=Qwen/Qwen2.5-72B-Instruct
+python inference.py
+```
+
+## Docker
+
+```bash
+docker build -t supportenv .
+docker run -p 7860:7860 supportenv
+```
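The Task 1 scoring weights listed in the README above combine linearly into one episode score. A minimal sketch of that arithmetic, assuming an illustrative efficiency term that decays with extra steps — the function name and the exact efficiency formula are assumptions, not the environment's actual grader code:

```python
# Illustrative only: combines the Task 1 weights from the README
# (category 0.50, priority 0.40, efficiency 0.10) into a single score.
def task1_score(category_correct: bool, priority_correct: bool,
                steps_used: int, max_steps: int = 3) -> float:
    # Assumed efficiency definition: full credit for submitting on step 1,
    # decaying linearly with each extra step.
    efficiency = max(0.0, 1.0 - (steps_used - 1) / max_steps)
    return round(0.50 * category_correct + 0.40 * priority_correct + 0.10 * efficiency, 4)

# Example: correct category and priority, submitted after 2 steps.
print(task1_score(True, True, steps_used=2))  # roughly 0.97 under this efficiency assumption
```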
Sample_Inference_Script.py
ADDED
@@ -0,0 +1,187 @@
+"""
+Inference Script Example
+===================================
+MANDATORY
+- Before submitting, ensure the following variables are defined in your environment configuration:
+      API_BASE_URL       The API endpoint for the LLM.
+      MODEL_NAME         The model identifier to use for inference.
+      HF_TOKEN           Your Hugging Face / API key.
+      LOCAL_IMAGE_NAME   The name of the local image to use for the environment if you are using from_docker_image()
+                         method
+
+- Defaults are set only for API_BASE_URL and MODEL_NAME
+  (and should reflect your active inference setup):
+      API_BASE_URL = os.getenv("API_BASE_URL", "<your-active-endpoint>")
+      MODEL_NAME   = os.getenv("MODEL_NAME", "<your-active-model>")
+
+- The inference script must be named `inference.py` and placed in the root directory of the project
+- Participants must use OpenAI Client for all LLM calls using above variables
+
+STDOUT FORMAT
+- The script must emit exactly three line types to stdout, in this order:
+
+    [START] task=<task_name> env=<benchmark> model=<model_name>
+    [STEP] step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
+    [END] success=<true|false> steps=<n> rewards=<r1,r2,...,rn>
+
+  Rules:
+  - One [START] line at episode begin.
+  - One [STEP] line per step, immediately after env.step() returns.
+  - One [END] line after env.close(), always emitted (even on exception).
+  - reward and rewards are formatted to 2 decimal places.
+  - done and success are lowercase booleans: true or false.
+  - error is the raw last_action_error string, or null if none.
+  - All fields on a single line with no newlines within a line.
+
+  Example:
+    [START] task=click-test env=miniwob model=Qwen3-VL-30B
+    [STEP] step=1 action=click('123') reward=0.00 done=false error=null
+    [STEP] step=2 action=fill('456','text') reward=0.00 done=false error=null
+    [STEP] step=3 action=click('789') reward=1.00 done=true error=null
+    [END] success=true steps=3 rewards=0.00,0.00,1.00
+"""
+
+import asyncio
+import os
+import textwrap
+from typing import List, Optional
+
+from openai import OpenAI
+
+from my_env_v4 import MyEnvV4Action, MyEnvV4Env
+
+IMAGE_NAME = os.getenv("IMAGE_NAME")  # If you are using docker image
+API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
+
+API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
+MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct"
+TASK_NAME = os.getenv("MY_ENV_V4_TASK", "echo")
+BENCHMARK = os.getenv("MY_ENV_V4_BENCHMARK", "my_env_v4")
+MAX_STEPS = 8
+TEMPERATURE = 0.7
+MAX_TOKENS = 150
+SUCCESS_SCORE_THRESHOLD = 0.1  # normalized score in [0, 1]
+
+# Max possible reward: each token contributes 0.1, across all steps
+_MAX_REWARD_PER_STEP = MAX_TOKENS * 0.1
+MAX_TOTAL_REWARD = MAX_STEPS * _MAX_REWARD_PER_STEP
+
+SYSTEM_PROMPT = textwrap.dedent(
+    """
+    You are interacting with a simple echo environment.
+    Each turn you must send a message. The environment will echo it back.
+    Reward is proportional to message length: reward = len(message) * 0.1
+    Your goal is to maximize total reward by sending meaningful, substantive messages.
+    Reply with exactly one message string — no quotes, no prefixes, just the message text.
+    """
+).strip()
+
+
+def log_start(task: str, env: str, model: str) -> None:
+    print(f"[START] task={task} env={env} model={model}", flush=True)
+
+
+def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
+    error_val = error if error else "null"
+    done_val = str(done).lower()
+    print(
+        f"[STEP] step={step} action={action} reward={reward:.2f} done={done_val} error={error_val}",
+        flush=True,
+    )
+
+
+def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
+    rewards_str = ",".join(f"{r:.2f}" for r in rewards)
+    print(f"[END] success={str(success).lower()} steps={steps} score={score:.3f} rewards={rewards_str}", flush=True)
+
+
+def build_user_prompt(step: int, last_echoed: str, last_reward: float, history: List[str]) -> str:
+    history_block = "\n".join(history[-4:]) if history else "None"
+    return textwrap.dedent(
+        f"""
+        Step: {step}
+        Last echoed message: {last_echoed!r}
+        Last reward: {last_reward:.2f}
+        Previous steps:
+        {history_block}
+        Send your next message.
+        """
+    ).strip()
+
+
+def get_model_message(client: OpenAI, step: int, last_echoed: str, last_reward: float, history: List[str]) -> str:
+    user_prompt = build_user_prompt(step, last_echoed, last_reward, history)
+    try:
+        completion = client.chat.completions.create(
+            model=MODEL_NAME,
+            messages=[
+                {"role": "system", "content": SYSTEM_PROMPT},
+                {"role": "user", "content": user_prompt},
+            ],
+            temperature=TEMPERATURE,
+            max_tokens=MAX_TOKENS,
+            stream=False,
+        )
+        text = (completion.choices[0].message.content or "").strip()
+        return text if text else "hello"
+    except Exception as exc:
+        print(f"[DEBUG] Model request failed: {exc}", flush=True)
+        return "hello"
+
+
+async def main() -> None:
+    client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
+
+    env = await MyEnvV4Env.from_docker_image(IMAGE_NAME)
+
+    history: List[str] = []
+    rewards: List[float] = []
+    steps_taken = 0
+    score = 0.0
+    success = False
+
+    log_start(task=TASK_NAME, env=BENCHMARK, model=MODEL_NAME)
+
+    try:
+        result = await env.reset()  # OpenENV.reset()
+        last_echoed = result.observation.echoed_message
+        last_reward = 0.0
+
+        for step in range(1, MAX_STEPS + 1):
+            if result.done:
+                break
+
+            message = get_model_message(client, step, last_echoed, last_reward, history)
+
+            result = await env.step(MyEnvV4Action(message=message))
+            obs = result.observation
+
+            reward = result.reward or 0.0
+            done = result.done
+            error = None
+
+            rewards.append(reward)
+            steps_taken = step
+            last_echoed = obs.echoed_message
+            last_reward = reward
+
+            log_step(step=step, action=message, reward=reward, done=done, error=error)
+
+            history.append(f"Step {step}: {message!r} -> reward {reward:+.2f}")
+
+            if done:
+                break
+
+        score = sum(rewards) / MAX_TOTAL_REWARD if MAX_TOTAL_REWARD > 0 else 0.0
+        score = min(max(score, 0.0), 1.0)  # clamp to [0, 1]
+        success = score >= SUCCESS_SCORE_THRESHOLD
+
+    finally:
+        try:
+            await env.close()
+        except Exception as e:
+            print(f"[DEBUG] env.close() error (container cleanup): {e}", flush=True)
+        log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
app.py
CHANGED
@@ -1,23 +1,20 @@
 """
-FastAPI server for
+FastAPI server for SupportEnv — Customer Support Ticket Triage.
 
 Endpoints:
-    POST /
-    GET /
-    GET /
-    GET /          Info / spec link
+    POST /reset    Create a new episode
+    POST /step     Advance the episode
+    GET  /state    Current episode state
+    GET  /tasks    List tasks and action schema
+    POST /grader   Grade a finished episode
+    GET  /health   Liveness check
+    GET  /         Info / spec link
 """
 from __future__ import annotations
 
 import os
-import uuid
-import json
-from typing import Any, Dict, List, Optional
 from datetime import datetime
+from typing import Optional
 
 from fastapi import FastAPI, HTTPException, Query
 from fastapi.middleware.cors import CORSMiddleware

@@ -27,17 +24,13 @@ import environment as env
 from data import TASK_META
 from models import (
     Action,
-    Observation,
-    State,
-    StepResult,
-    TaskInfo,
-    Reward,
     GraderResponse,
+    TaskInfo,
 )
 
 app = FastAPI(
-    title="
-    description="An OpenEnv-compliant
+    title="SupportEnv",
+    description="An OpenEnv-compliant customer support ticket triage environment.",
     version="1.0.0",
     docs_url="/docs",
     redoc_url="/redoc",

@@ -51,8 +44,13 @@ app.add_middleware(
 )
 
 
+# ---------------------------------------------------------------------------
+# Request schemas
+# ---------------------------------------------------------------------------
+
 class ResetRequest(BaseModel):
-    task_id: str
+    task_id: str = "task1"
+    ticket_index: Optional[int] = 0
 
 
 class StepRequest(BaseModel):

@@ -71,10 +69,9 @@ class GraderRequest(BaseModel):
 @app.get("/", tags=["meta"])
 def root():
     return {
-        "name": "
+        "name": "SupportEnv",
         "version": "1.0.0",
-        "description": "OpenEnv
-        "openenv_spec": "https://github.com/meta-pytorch/OpenEnv",
+        "description": "OpenEnv customer support ticket triage environment",
         "tasks": list(TASK_META.keys()),
         "endpoints": {
             "reset": "POST /reset",

@@ -112,7 +109,7 @@ def tasks():
 @app.post("/reset", tags=["control"])
 def reset(req: ResetRequest):
     try:
-        obs = env.reset(req.task_id)
+        obs = env.reset(req.task_id, ticket_index=req.ticket_index or 0)
         return obs.model_dump()
     except ValueError as e:
         raise HTTPException(status_code=400, detail=str(e))

@@ -155,4 +152,3 @@ if __name__ == "__main__":
     import uvicorn
     port = int(os.environ.get("PORT", 7860))
     uvicorn.run(app, host="0.0.0.0", port=port, workers=1)
-
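The reworked `/reset` handler above now accepts an optional `ticket_index` alongside `task_id`. A minimal sketch of exercising it against a locally running server on port 7860 — the `requests` dependency and the exact response fields are assumptions beyond what the diff shows, though `episode_id` is the handle the README passes to `/step`:

```python
# Illustrative local check of the new /reset signature (server on port 7860).
import requests

resp = requests.post(
    "http://127.0.0.1:7860/reset",
    json={"task_id": "task2", "ticket_index": 1},  # ticket_index is 0-indexed
    timeout=30,
)
resp.raise_for_status()
print(resp.status_code, resp.json().get("episode_id"))
```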
baseline.py
DELETED
@@ -1,309 +0,0 @@
-"""
-SupportEnv — FastAPI server
-
-Endpoints
----------
-POST /reset      Create a new episode
-POST /step       Advance the episode
-GET  /state      Current episode state
-GET  /tasks      List tasks and action schema
-POST /grader     Grade a finished episode
-POST /baseline   Run the built-in baseline agent on all tasks
-GET  /health     Liveness check
-GET  /           Info / spec link
-"""
-from __future__ import annotations
-
-import os
-import subprocess
-import sys
-import tempfile
-from typing import Any, Dict, List, Optional
-
-from fastapi import FastAPI, HTTPException, Query
-from fastapi.middleware.cors import CORSMiddleware
-from pydantic import BaseModel
-
-import environment as env
-from data import TASK_META
-from models import (
-    Action,
-    BaselineResult,
-    GraderResponse,
-    Observation,
-    State,
-    StepResult,
-    TaskInfo,
-)
-
-app = FastAPI(
-    title="SupportEnv",
-    description=(
-        "An OpenEnv-compliant customer-support triage environment. "
-        "Agents learn to classify, extract information from, and resolve "
-        "real-world SaaS support tickets."
-    ),
-    version="1.0.0",
-    docs_url="/docs",
-    redoc_url="/redoc",
-)
-
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
-
-
-# ---------------------------------------------------------------------------
-# Request / response shapes for endpoints not covered by models.py
-# ---------------------------------------------------------------------------
-
-class ResetRequest(BaseModel):
-    task_id: str
-    ticket_index: Optional[int] = None
-
-
-class StepRequest(BaseModel):
-    episode_id: str
-    action: Action
-
-
-class GraderRequest(BaseModel):
-    episode_id: str
-
-
-# ---------------------------------------------------------------------------
-# Endpoints
-# ---------------------------------------------------------------------------
-
-@app.get("/", tags=["meta"])
-def root():
-    return {
-        "name": "SupportEnv",
-        "version": "1.0.0",
-        "description": "OpenEnv customer-support ticket triage environment",
-        "openenv_spec": "https://github.com/openenv/openenv",
-        "tasks": list(TASK_META.keys()),
-        "endpoints": {
-            "reset": "POST /reset",
-            "step": "POST /step",
-            "state": "GET /state?episode_id=...",
-            "tasks": "GET /tasks",
-            "grader": "POST /grader",
-            "baseline": "POST /baseline",
-            "health": "GET /health",
-            "docs": "GET /docs",
-        },
-    }
-
-
-@app.get("/health", tags=["meta"])
-def health():
-    return {"status": "ok"}
-
-
-# ---------------------------------------------------------------------------
-# Core OpenEnv endpoints
-# ---------------------------------------------------------------------------
-
-@app.post("/reset", response_model=Observation, tags=["openenv"])
-def reset(request: ResetRequest) -> Observation:
-    """
-    Start a new episode.
-
-    - **task_id**: `task1` | `task2` | `task3`
-    - **ticket_index**: 0-indexed ticket to use (optional; default 0)
-    """
-    try:
-        return env.reset(request.task_id, request.ticket_index)
-    except ValueError as e:
-        raise HTTPException(status_code=400, detail=str(e))
-
-
-@app.post("/step", response_model=StepResult, tags=["openenv"])
-def step(request: StepRequest) -> StepResult:
-    """
-    Submit an action and advance the episode.
-
-    The `action` object must include `action_type` and the fields relevant
-    to that action type (see GET /tasks for the schema).
-    """
-    try:
-        return env.step(request.episode_id, request.action)
-    except KeyError:
-        raise HTTPException(
-            status_code=404,
-            detail=f"Episode '{request.episode_id}' not found. Call POST /reset first.",
-        )
-    except ValueError as e:
-        raise HTTPException(status_code=400, detail=str(e))
-
-
-@app.get("/state", response_model=State, tags=["openenv"])
-def state(episode_id: str = Query(..., description="Episode UUID from POST /reset")) -> State:
-    """Return the current state of an episode."""
-    try:
-        return env.state(episode_id)
-    except KeyError:
-        raise HTTPException(
-            status_code=404,
-            detail=f"Episode '{episode_id}' not found.",
-        )
-
-
-# ---------------------------------------------------------------------------
-# /tasks — task listing + action schema
-# ---------------------------------------------------------------------------
-
-# JSON Schema for the Action model (subset used in each task)
-_BASE_ACTION_SCHEMA = {
-    "type": "object",
-    "required": ["action_type"],
-    "properties": {
-        "action_type": {
-            "type": "string",
-            "description": "One of the available_actions listed in the Observation",
-        },
-    },
-}
-
-_ACTION_SCHEMAS: Dict[str, Dict[str, Any]] = {
-    "task1": {
-        **_BASE_ACTION_SCHEMA,
-        "description": "classify action: set category + priority; then submit",
-        "properties": {
-            **_BASE_ACTION_SCHEMA["properties"],
-            "category": {
-                "type": "string",
-                "enum": [
-                    "billing", "technical", "account",
-                    "feature_request", "complaint", "general",
-                ],
-            },
-            "priority": {
-                "type": "string",
-                "enum": ["low", "medium", "high", "critical"],
-            },
-        },
-    },
-    "task2": {
-        **_BASE_ACTION_SCHEMA,
-        "description": "extract action: populate extracted_entities + required_actions; then submit",
-        "properties": {
-            **_BASE_ACTION_SCHEMA["properties"],
-            "extracted_entities": {
-                "type": "object",
-                "additionalProperties": True,
-                "description": "Key-value pairs extracted from the ticket text",
-            },
-            "required_actions": {
-                "type": "array",
-                "items": {"type": "string"},
-                "description": "List of action identifiers (snake_case) needed to close the ticket",
-            },
-        },
-    },
-    "task3": {
-        **_BASE_ACTION_SCHEMA,
-        "description": (
-            "respond or resolve action: write response_text + resolution_steps; "
-            "optionally escalate; then submit"
-        ),
-        "properties": {
-            **_BASE_ACTION_SCHEMA["properties"],
-            "response_text": {
-                "type": "string",
-                "description": "Full professional response to send to the customer",
-            },
-            "resolution_steps": {
-                "type": "array",
-                "items": {"type": "string"},
-                "description": "Ordered steps for support staff to resolve the ticket",
-            },
-            "escalation_team": {
-                "type": "string",
-                "enum": ["billing_team", "engineering", "account_management", "legal"],
-            },
-            "escalation_reason": {"type": "string"},
-        },
-    },
-}
-
-
-@app.get("/tasks", response_model=List[TaskInfo], tags=["openenv"])
-def list_tasks() -> List[TaskInfo]:
-    """Return metadata and action schema for all tasks."""
-    result = []
-    for task_id, meta in TASK_META.items():
-        result.append(
-            TaskInfo(
-                task_id=task_id,
-                name=meta["name"],
-                description=meta["description"],
-                difficulty=meta["difficulty"],
-                max_steps=meta["max_steps"],
-                action_schema=_ACTION_SCHEMAS[task_id],
-            )
-        )
-    return result
-
-
-# ---------------------------------------------------------------------------
-# /grader — grade a finished episode
-# ---------------------------------------------------------------------------
-
-@app.post("/grader", response_model=GraderResponse, tags=["openenv"])
-def grader(request: GraderRequest) -> GraderResponse:
-    """
-    Grade a finished episode.
-
-    The episode must have reached `done=True` (either via a `submit` action
-    or by exhausting `max_steps`).
-    """
-    try:
-        return env.grade(request.episode_id)
-    except KeyError:
-        raise HTTPException(
-            status_code=404,
-            detail=f"Episode '{request.episode_id}' not found.",
-        )
-    except ValueError as e:
-        raise HTTPException(status_code=400, detail=str(e))
-
-
-# ---------------------------------------------------------------------------
-# /baseline — run the built-in baseline agent
-# ---------------------------------------------------------------------------
-
-class BaselineRequest(BaseModel):
-    model: str = "gpt-4o-mini"
-    ticket_index: Optional[int] = 0
-
-
-@app.post("/baseline", response_model=BaselineResult, tags=["openenv"])
-def run_baseline(request: BaselineRequest) -> BaselineResult:
-    """
-    Run the heuristic baseline agent against all three tasks.
-
-    The built-in baseline does NOT require an OpenAI key — it uses the
-    deterministic heuristic baseline from `baseline.py`.
-    If you want to run the LLM baseline, call `baseline.py` directly.
-    """
-    try:
-        from baseline import run_heuristic_baseline
-        scores = run_heuristic_baseline(
-            ticket_index=request.ticket_index or 0
-        )
-        avg = round(sum(s["score"] for s in scores) / len(scores), 4)
-        return BaselineResult(
-            model="heuristic-baseline",
-            scores=[
-                {"task_id": s["task_id"], "score": s["score"], "details": s}
-                for s in scores
-            ],
-            average_score=avg,
-        )
-    except Exception as exc:
-        raise HTTPException(status_code=500, detail=str(exc))
data.py
CHANGED
@@ -1,92 +1,15 @@
 """
+SupportEnv — Customer Support Ticket Triage data.
 
-Task 1 (easy) —
-Task 2 (medium) —
-Task 3 (hard) —
+Task 1 (easy) — Ticket Classification
+Task 2 (medium) — Information Extraction
+Task 3 (hard) — Resolution Generation
 """
 from __future__ import annotations
 from typing import Any, Dict, List
 
-# Nginx service configuration
-NGINX_CONFIG_PATH = "/etc/nginx/nginx.conf"
-NGINX_SYSTEMD_PATH = "/etc/systemd/system/nginx.service"
-
-# Docker configuration
-DOCKER_COMPOSE_PATH = "/srv/docker-compose.yml"
-
-# Mock API code path
-MOCK_API_PATH = "/opt/mockapi/app.py"
-
 # ---------------------------------------------------------------------------
-# TASK
-# ---------------------------------------------------------------------------
-
-TASK_META: Dict[str, Dict[str, Any]] = {
-    "task1": {
-        "name": "Restart Nginx Service",
-        "description": (
-            "Production Nginx service has crashed. Restart the service, "
-            "verify the configuration syntax, and ensure the server "
-            "returns HTTP 200 on port 80. Failing checklist:\n"
-            "1. Restart nginx (systemctl restart nginx)\n"
-            "2. Verify config syntax (nginx -t)\n"
-            "3. Confirm service is running (systemctl status nginx)\n"
-            "4. Check HTTP 200 response (curl http://localhost:80)"
-        ),
-        "difficulty": "easy",
-        "max_steps": 10,
-        "available_actions": ["bash_cmd", "submit"],
-        "passing_conditions": [
-            "nginx_running",
-            "config_valid",
-            "http_200_response",
-        ],
-    },
-    "task2": {
-        "name": "Fix Docker Container Configuration",
-        "description": (
-            "A critical microservice container is misconfigured. The port "
-            "mapping in docker-compose.yml is broken. Fix the configuration, "
-            "redeploy the container, and verify it's accessible on the "
-            "correct port.\n"
-            "1. Edit docker-compose.yml (fix port mapping)\n"
-            "2. Restart containers (docker-compose up -d)\n"
-            "3. Verify container is running\n"
-            "4. Check service responds on mapped port"
-        ),
-        "difficulty": "medium",
-        "max_steps": 15,
-        "available_actions": ["bash_cmd", "file_edit", "submit"],
-        "passing_conditions": [
-            "docker_compose_valid",
-            "container_running",
-            "port_accessible",
-        ],
-    },
-    "task3": {
-        "name": "Find and Fix Memory Leak in Mock API",
-        "description": (
-            "The Python API service is leaking memory and consuming excessive "
-            "resources. Diagnose the memory leak in /opt/mockapi/app.py, fix "
-            "the offending code, and restart the service without root access.\n"
-            "1. Identify the memory leak (check processes, logs)\n"
-            "2. Kill the runaway process\n"
-            "3. Fix the code in app.py (patch the leak)\n"
-            "4. Restart the service as appuser\n"
-            "5. Verify memory usage is normal"
-        ),
-        "difficulty": "hard",
-        "max_steps": 20,
-        "available_actions": ["bash_cmd", "file_edit", "submit"],
-        "passing_conditions": [
-            "process_killed",
-            "code_fixed",
-            "service_restarted",
-            "memory_normal",
-        ],
-    },
-}
+# TASK 1 — Ticket Classification
 # Agent must choose: category + priority
 # Categories: billing | technical | account | feature_request | complaint | general
 # Priorities: low | medium | high | critical
environment.py
CHANGED
|
@@ -1,25 +1,23 @@
|
|
| 1 |
"""
|
| 2 |
-
Core
|
| 3 |
|
| 4 |
-
Simulates a
|
| 5 |
-
- Task 1:
|
| 6 |
-
- Task 2:
|
| 7 |
-
- Task 3:
|
| 8 |
|
| 9 |
Manages episode lifecycle:
|
| 10 |
-
reset() → Observation
|
| 11 |
-
step(action)
|
| 12 |
-
get_state()
|
| 13 |
-
grade()
|
| 14 |
"""
|
| 15 |
from __future__ import annotations
|
| 16 |
|
| 17 |
import uuid
|
| 18 |
-
import
|
| 19 |
-
import re
|
| 20 |
-
from typing import Any, Dict, Optional, Tuple, List
|
| 21 |
|
| 22 |
-
from data import TASK_META
|
| 23 |
from graders import grade_task
|
| 24 |
from models import (
|
| 25 |
Action,
|
|
@@ -27,350 +25,75 @@ from models import (
      Reward,
      State,
      StepResult,
-     SystemState,
  )

- # In-memory store: episode_id →
  _EPISODES: Dict[str, Dict[str, Any]] = {}


  # ---------------------------------------------------------------------------
- #
  # ---------------------------------------------------------------------------

- def _create_initial_state_task1() -> Dict[str, Any]:
-
-     return {
-         "running_processes": [
-             {"pid": 100, "name": "systemd"},
-             {"pid": 105, "name": "sshd"},
-             # nginx NOT running
-         ],
-         "service_status": {
-             "nginx": "inactive",
-             "docker": "active",
-             "mockapi": "active",
-         },
-         "http_ports_open": [8080],  # 80 is down
-         "docker_containers": [],
-         "logs": "2026-03-29 01:30:00 nginx crashed\nCore dump detected.\n",
-         "files": {
-             NGINX_CONFIG_PATH: """
- user nginx;
- worker_processes auto;
- error_log /var/log/nginx/error.log warn;
- pid /var/run/nginx.pid;
-
- events {
-     worker_connections 1024;
- }
-
- http {
-     include /etc/nginx/mime.types;
-     default_type application/octet-stream;
-     sendfile on;
-     keepalive_timeout 65;
-
-     server {
-         listen 80 default_server;
-         server_name _;
-         location / {
-             return 200 "OK\\n";
-         }
-     }
- }""",
-             "/etc/systemd/system/nginx.service": """
- [Unit]
- Description=The NGINX HTTP and reverse proxy server
- After=network.target
-
- [Service]
- Type=forking
- PIDFile=/var/run/nginx.pid
- ExecStartPre=/usr/sbin/nginx -t
- ExecStart=/usr/sbin/nginx
- ExecReload=/bin/kill -s HUP $MAINPID
- ExecStop=/bin/kill -s QUIT $MAINPID
- PrivateTmp=true
-
- [Install]
- WantedBy=multi-user.target""",
-         },
-         "cpu_usage": 45.2,
-         "memory_usage_mb": 256,
-     }


- def _create_initial_state_task2() -> Dict[str, Any]:
-     """Task 2: Docker misconfigured."""
-     return {
-         "running_processes": [
-             {"pid": 100, "name": "systemd"},
-             {"pid": 105, "name": "sshd"},
-             {"pid": 200, "name": "dockerd"},
-         ],
-         "service_status": {
-             "nginx": "active",
-             "docker": "active",
-             "mockapi": "inactive",
-         },
-         "http_ports_open": [80],
-         "docker_containers": [
-             {"id": "abc123", "name": "mockapi-svc", "status": "running", "ports": "8000->3000/tcp"}
-         ],
-         "logs": "docker: port 3000 already in use\n",
-         "files": {
-             "/srv/docker-compose.yml": """
- version: '3.8'
- services:
-   mockapi:
-     image: mockapi:latest
-     ports:
-       - "3000:3000"
-     environment:
-       - PORT=3000
-     volumes:
-       - ./app.py:/app/app.py""",
-         },
-         "cpu_usage": 62.0,
-         "memory_usage_mb": 1024,
-     }


- def _create_initial_state_task3() -> Dict[str, Any]:
-     """Task 3: Memory leak in mock API."""
-     return {
-         "running_processes": [
-             {"pid": 100, "name": "systemd"},
-             {"pid": 105, "name": "sshd"},
-             {"pid": 300, "name": "python3", "rss_mb": 2048, "user": "appuser"},  # MEMORY LEAK
-         ],
-         "service_status": {
-             "nginx": "active",
-             "docker": "active",
-             "mockapi": "active",
-         },
-         "http_ports_open": [80, 5000],
-         "docker_containers": [],
-         "logs": (
-             "2026-03-29 01:45:00 mockapi started\n"
-             "2026-03-29 01:46:00 memory usage: 512 MB\n"
-             "2026-03-29 01:47:00 memory usage: 1024 MB\n"
-             "2026-03-29 01:48:00 memory usage: 1536 MB (WARNING: HIGH)\n"
-             "2026-03-29 01:49:00 memory usage: 2048 MB (CRITICAL)\n"
-         ),
-         "files": {
-             "/opt/mockapi/app.py": """
- import json
- from flask import Flask
-
- app = Flask(__name__)
-
- # BUG: This list grows unbounded
- request_cache = []
-
- @app.route('/api/data', methods=['GET'])
- def get_data():
-     data = {"timestamp": 123456, "value": 42}
-     request_cache.append(data)  # MEMORY LEAK!
-     return json.dumps(data)
-
- if __name__ == '__main__':
-     app.run(host='0.0.0.0', port=5000)
- """,
-         },
-         "cpu_usage": 85.5,
-         "memory_usage_mb": 2048,
-     }


- NGINX_CONFIG_PATH = "/etc/nginx/nginx.conf"
- DOCKER_COMPOSE_PATH = "/srv/docker-compose.yml"
- MOCK_API_PATH = "/opt/mockapi/app.py"


- def _build_system_state(task_id: str, ep_state: Dict[str, Any]) -> SystemState:
-     """Build a SystemState object from episode state."""
-     state_dict = ep_state["system_state"]
-     return SystemState(
-         task_id=task_id,
-         available_commands=["systemctl", "nginx", "docker", "curl", "ps", "cat", "vim"],
-         filesystem_snapshot=json.dumps({
-             k: v for k, v in state_dict.get("files", {}).items()
-         }),
-         running_processes=state_dict.get("running_processes", []),
-         service_status=state_dict.get("service_status", {}),
-         logs=state_dict.get("logs", ""),
-         http_ports_open=state_dict.get("http_ports_open", []),
-         docker_containers=state_dict.get("docker_containers", []),
-         cpu_usage=state_dict.get("cpu_usage", 0.0),
-         memory_usage_mb=state_dict.get("memory_usage_mb", 0),
-     )


  # ---------------------------------------------------------------------------
- #
  # ---------------------------------------------------------------------------

- def _simulate_bash_cmd(cmd: str, task_id: str, ep_state: Dict[str, Any]) -> str:
-     """
-     state_dict = ep_state["system_state"]
-     lower_cmd = cmd.lower()
-
-     # Task 1: Nginx commands
-     if task_id == "task1":
-         if "systemctl restart nginx" in lower_cmd or "systemctl start nginx" in lower_cmd:
-             state_dict["service_status"]["nginx"] = "active"
-             state_dict["running_processes"].append({"pid": 999, "name": "nginx"})
-             state_dict["http_ports_open"] = [80]
-             return "Job for nginx.service started successfully."
-         elif "systemctl status nginx" in lower_cmd:
-             if state_dict["service_status"]["nginx"] == "active":
-                 return "● nginx.service - NGINX HTTP Server\n Loaded: loaded (/etc/systemd/system/nginx.service)\n Active: active (running)"
-             return "● nginx.service - NGINX HTTP Server\n Active: inactive (dead)"
-         elif "nginx -t" in lower_cmd:
-             return "nginx: the configuration file /etc/nginx/nginx.conf syntax is ok\nnginx: configuration file /etc/nginx/nginx.conf test is successful"
-         elif "curl http://localhost:80" in lower_cmd or "curl http://localhost" in lower_cmd:
-             if 80 in state_dict["http_ports_open"]:
-                 return "OK"
-             return "curl: (7) Failed to connect to localhost port 80: Connection refused"
-
-     # Task 2: Docker commands
-     elif task_id == "task2":
-         if "docker-compose up -d" in lower_cmd:
-             if DOCKER_COMPOSE_PATH in state_dict["files"]:
-                 compose_content = state_dict["files"][DOCKER_COMPOSE_PATH]
-                 # Check if port is now correct
-                 if "3000:3000" in compose_content:
-                     state_dict["docker_containers"] = [
-                         {"id": "xyz789", "name": "mockapi-svc", "status": "running", "ports": "3000:3000/tcp"}
-                     ]
-                     state_dict["service_status"]["mockapi"] = "active"
-                     return "Creating mockapi ... done"
-             return "ERROR: docker-compose.yml not found or invalid"
-         elif "docker ps" in lower_cmd:
-             if state_dict["docker_containers"]:
-                 return "\n".join([f"{c['id']} {c['name']} {c['status']}" for c in state_dict["docker_containers"]])
-             return "No containers running"
-
-     # Task 3: Process/memory commands
-     elif task_id == "task3":
-         if "ps aux" in lower_cmd or "ps aux grep python" in lower_cmd:
-             output = ""
-             for proc in state_dict["running_processes"]:
-                 if proc.get("name") == "python3":
-                     output += f"appuser {proc['pid']} 85.5 {proc.get('rss_mb', 512)} python3 /opt/mockapi/app.py\n"
-             return output if output else "No python processes found"
-         elif "kill" in lower_cmd:
-             if "300" in lower_cmd or "python" in lower_cmd:
-                 state_dict["running_processes"] = [p for p in state_dict["running_processes"] if p.get("name") != "python3"]
-                 state_dict["service_status"]["mockapi"] = "inactive"
-                 return "Process killed"
-             return "Process not found"
-         elif "python3 /opt/mockapi/app.py &" in lower_cmd or "python3 /opt/mockapi/app.py" in lower_cmd:
-             state_dict["running_processes"].append({"pid": 301, "name": "python3", "rss_mb": 128, "user": "appuser"})
-             state_dict["service_status"]["mockapi"] = "active"
-             state_dict["http_ports_open"] = [80, 5000]
-             return "Application started"
-
-     return f"Command '{cmd}' executed (simulated)"
-
-
- def _simulate_file_edit(file_path: str, new_content: str, ep_state: Dict[str, Any]) -> str:
-     """Simulate file editing."""
-     state_dict = ep_state["system_state"]
-
-     if file_path not in state_dict.get("files", {}):
-         return f"ERROR: File {file_path} not found"
-
-     # Detect task 2: Check docker-compose.yml fix
-     if file_path == DOCKER_COMPOSE_PATH and "3000:3000" in new_content:
-         state_dict["files"][file_path] = new_content
-         return f"File {file_path} updated successfully"
-
-     # Detect task 3: Check mock API fix
-     elif file_path == MOCK_API_PATH and "request_cache = []" not in new_content:
-         # Verify fix removes the memory leak
-         state_dict["files"][file_path] = new_content
-         return f"File {file_path} patched successfully"
-
-     state_dict["files"][file_path] = new_content
-     return f"File {file_path} edited"


- # ---------------------------------------------------------------------------
- # Reward calculation
- # ---------------------------------------------------------------------------

- def _calculate_step_reward(task_id: str, action: Action, ep_state: Dict[str, Any]) -> Tuple[float, str]:
-     """Calculate reward based on action and task."""
-     base_step_cost = -0.01
-     reward = base_step_cost
-
-     if action.action_type == "bash_cmd":
-         cmd = action.command or ""
-         reward += 0.05
-         explanation = f"Executed: {cmd[:50]}"
-         return reward, explanation
-
-     elif action.action_type == "file_edit":
-         reward += 0.03
-         explanation = f"Edited: {action.file_path}"
-         return reward, explanation
-
-     elif action.action_type == "submit":
-         reward += 0.1
-         explanation = "Episode submitted for grading"
-         return reward, explanation
-
-     return reward, "Step taken"


- # ---------------------------------------------------------------------------
- # Core API functions
- # ---------------------------------------------------------------------------

- def reset(task_id: str) -> Observation:
-     """Create a new episode for the given task."""
      if task_id not in TASK_META:
          raise ValueError(f"Unknown task_id {task_id!r}. Valid: {list(TASK_META)}")

      meta = TASK_META[task_id]
-
-
-     if
-
-
-
-
-
-
-     initial_sys_state = {}

      episode_id = str(uuid.uuid4())
      _EPISODES[episode_id] = {
          "task_id": task_id,
          "step_number": 0,
          "max_steps": meta["max_steps"],
          "done": False,
          "total_reward": 0.0,
          "action_history": [],
          "final_score": None,
-         "system_state": initial_sys_state,
      }

-

      return Observation(
          task_id=task_id,
-         task_description=
          episode_id=episode_id,
-
          thread_history=[],
-         available_actions=
          step_number=0,
          max_steps=meta["max_steps"],
-         hint=
      )

@@ -379,24 +102,14 @@ def step(episode_id: str, action: Action) -> StepResult:
      ep = _EPISODES.get(episode_id)
      if ep is None:
          raise KeyError(f"Episode {episode_id} not found")
-
      if ep["done"]:
          raise ValueError(f"Episode {episode_id} is already done.")

      task_id = ep["task_id"]
-     meta = TASK_META[task_id]

      ep["step_number"] += 1
      ep["action_history"].append(action.model_dump())

-     # Execute action
-     if action.action_type == "bash_cmd":
-         cmd_output = _simulate_bash_cmd(action.command or "", task_id, ep)
-         ep["action_history"][-1]["output"] = cmd_output
-     elif action.action_type == "file_edit":
-         edit_result = _simulate_file_edit(action.file_path or "", action.file_content or "", ep)
-         ep["action_history"][-1]["result"] = edit_result
-
      # Determine if done
      done = False
      if action.action_type == "submit":
@@ -404,16 +117,21 @@ def step(episode_id: str, action: Action) -> StepResult:
      elif ep["step_number"] >= ep["max_steps"]:
          done = True

-     # Calculate reward
-     step_reward, explanation = _calculate_step_reward(task_id, action, ep)

-     # Apply grader bonus
      if done:
-         final_score,
          ep["final_score"] = final_score
-
-         step_reward +=
-         explanation += f" | Grader score: {final_score:.3f}
      else:
          final_score = None

@@ -421,21 +139,34 @@ def step(episode_id: str, action: Action) -> StepResult:
      ep["done"] = done

      # Build observation
-
      thread_history = [
-         {"role": "agent", "content":
      ]

      obs = Observation(
          task_id=task_id,
-         task_description=
          episode_id=episode_id,
-
          thread_history=thread_history,
-         available_actions=
          step_number=ep["step_number"],
          max_steps=ep["max_steps"],
-         hint=None if done else
      )

      reward = Reward(
@@ -444,7 +175,7 @@ def step(episode_id: str, action: Action) -> StepResult:
          explanation=explanation,
      )

-     info = {"step": ep["step_number"]}
      if done:
          info["final_score"] = final_score

@@ -474,13 +205,84 @@ def grade(episode_id: str) -> Tuple[float, Dict[str, float], str]:
      ep = _EPISODES.get(episode_id)
      if ep is None:
          raise KeyError(f"Episode {episode_id} not found")
-
      if not ep.get("done"):
          raise ValueError(f"Episode {episode_id} is not done yet")

      task_id = ep["task_id"]
      score, breakdown, feedback = grade_task(task_id, ep)
      ep["final_score"] = score
-
      return score, breakdown, feedback

  """
+ Core SupportEnv environment logic.

+ Simulates a customer support ticket triage workflow:
+ - Task 1 (easy): Ticket Classification — assign category + priority
+ - Task 2 (medium): Information Extraction — pull entities + required actions
+ - Task 3 (hard): Resolution Generation — write response + resolution steps

  Manages episode lifecycle:
+     reset(task_id, ticket_index) → Observation
+     step(episode_id, action) → StepResult
+     get_state(episode_id) → State
+     grade(episode_id) → (score, breakdown, feedback)
  """
  from __future__ import annotations

  import uuid
+ from typing import Any, Dict, Optional, Tuple

+ from data import TASK_META, get_task_meta, get_tickets
  from graders import grade_task
  from models import (
      Action,
      Observation,
      Reward,
      State,
      StepResult,
+     TicketInfo,
  )

+ # In-memory store: episode_id → episode dict
  _EPISODES: Dict[str, Dict[str, Any]] = {}


  # ---------------------------------------------------------------------------
+ # Reward constants (match openenv.yaml)
  # ---------------------------------------------------------------------------

+ STEP_COST = -0.02
+ SUBMIT_BONUS = 0.05
+ MAX_STEP_PENALTY = -0.10


  # ---------------------------------------------------------------------------
+ # Core API
  # ---------------------------------------------------------------------------

+ def reset(task_id: str, ticket_index: int = 0) -> Observation:
+     """Create a new episode for the given task and ticket."""
      if task_id not in TASK_META:
          raise ValueError(f"Unknown task_id {task_id!r}. Valid: {list(TASK_META)}")

      meta = TASK_META[task_id]
+     tickets = get_tickets(task_id)
+
+     if ticket_index < 0 or ticket_index >= len(tickets):
+         raise ValueError(
+             f"ticket_index {ticket_index} out of range [0, {len(tickets) - 1}]"
+         )
+
+     ticket_data = tickets[ticket_index]
+     safe_meta = get_task_meta(task_id)

      episode_id = str(uuid.uuid4())
      _EPISODES[episode_id] = {
          "task_id": task_id,
+         "ticket_index": ticket_index,
+         "ticket_data": ticket_data,
          "step_number": 0,
          "max_steps": meta["max_steps"],
          "done": False,
          "total_reward": 0.0,
          "action_history": [],
          "final_score": None,
      }

+     ticket_info = TicketInfo(
+         ticket_id=ticket_data["ticket_id"],
+         subject=ticket_data["subject"],
+         body=ticket_data["body"],
+         customer_tier=ticket_data["customer_tier"],
+         account_age_days=ticket_data["account_age_days"],
+         previous_tickets=ticket_data["previous_tickets"],
+         attachments=ticket_data.get("attachments", []),
+     )

      return Observation(
          task_id=task_id,
+         task_description=safe_meta["description"],
          episode_id=episode_id,
+         ticket=ticket_info,
          thread_history=[],
+         available_actions=safe_meta["available_actions"],
          step_number=0,
          max_steps=meta["max_steps"],
+         hint=_get_hint(task_id, 0),
      )


      ep = _EPISODES.get(episode_id)
      if ep is None:
          raise KeyError(f"Episode {episode_id} not found")
      if ep["done"]:
          raise ValueError(f"Episode {episode_id} is already done.")

      task_id = ep["task_id"]

      ep["step_number"] += 1
      ep["action_history"].append(action.model_dump())

      # Determine if done
      done = False
      if action.action_type == "submit":
      elif ep["step_number"] >= ep["max_steps"]:
          done = True

+     # Calculate step reward
+     step_reward, explanation = _calculate_step_reward(task_id, action, ep, done)

+     # Apply grader bonus on terminal step
      if done:
+         final_score, _breakdown, _feedback = grade_task(task_id, ep)
          ep["final_score"] = final_score
+         # Grader score is the terminal bonus (0–1)
+         step_reward += final_score
+         explanation += f" | Grader score: {final_score:.3f}"
+
+         # Penalty for running out of steps without submitting
+         if action.action_type != "submit" and ep["step_number"] >= ep["max_steps"]:
+             step_reward += MAX_STEP_PENALTY
+             explanation += f" | Max-step penalty: {MAX_STEP_PENALTY}"
      else:
          final_score = None

      ep["done"] = done

      # Build observation
+     ticket_data = ep["ticket_data"]
+     safe_meta = get_task_meta(task_id)
+
+     ticket_info = TicketInfo(
+         ticket_id=ticket_data["ticket_id"],
+         subject=ticket_data["subject"],
+         body=ticket_data["body"],
+         customer_tier=ticket_data["customer_tier"],
+         account_age_days=ticket_data["account_age_days"],
+         previous_tickets=ticket_data["previous_tickets"],
+         attachments=ticket_data.get("attachments", []),
+     )
+
      thread_history = [
+         {"role": "agent", "content": _summarize_action(a)}
+         for a in ep["action_history"]
      ]

      obs = Observation(
          task_id=task_id,
+         task_description=safe_meta["description"],
          episode_id=episode_id,
+         ticket=ticket_info,
          thread_history=thread_history,
+         available_actions=safe_meta["available_actions"] if not done else [],
          step_number=ep["step_number"],
          max_steps=ep["max_steps"],
+         hint=None if done else _get_hint(task_id, ep["step_number"]),
      )

      reward = Reward(
          explanation=explanation,
      )

+     info: Dict[str, Any] = {"step": ep["step_number"]}
      if done:
          info["final_score"] = final_score

      ep = _EPISODES.get(episode_id)
      if ep is None:
          raise KeyError(f"Episode {episode_id} not found")
      if not ep.get("done"):
          raise ValueError(f"Episode {episode_id} is not done yet")

      task_id = ep["task_id"]
      score, breakdown, feedback = grade_task(task_id, ep)
      ep["final_score"] = score
      return score, breakdown, feedback

+
+ # ---------------------------------------------------------------------------
+ # Helpers
+ # ---------------------------------------------------------------------------
+
+ def _calculate_step_reward(
+     task_id: str, action: Action, ep: Dict[str, Any], done: bool
+ ) -> Tuple[float, str]:
+     """Dense per-step reward."""
+     reward = STEP_COST  # small cost per step
+
+     if action.action_type == "submit":
+         reward += SUBMIT_BONUS
+         return reward, "Submitted for grading"
+
+     # Partial-progress signals based on task
+     if task_id == "task1":
+         if action.action_type == "classify":
+             if action.category:
+                 reward += 0.02
+             if action.priority:
+                 reward += 0.02
+             return reward, f"Classified: category={action.category}, priority={action.priority}"
+
+     elif task_id == "task2":
+         if action.action_type == "extract":
+             n_entities = len(action.extracted_entities) if action.extracted_entities else 0
+             n_actions = len(action.required_actions) if action.required_actions else 0
+             reward += min(n_entities * 0.005, 0.04)
+             reward += min(n_actions * 0.005, 0.02)
+             return reward, f"Extracted {n_entities} entities, {n_actions} actions"
+
+     elif task_id == "task3":
+         if action.action_type == "respond":
+             text_len = len(action.response_text or "")
+             n_steps = len(action.resolution_steps) if action.resolution_steps else 0
+             if text_len > 0:
+                 reward += min(text_len * 0.0001, 0.03)
+             if n_steps > 0:
+                 reward += min(n_steps * 0.005, 0.02)
+             return reward, f"Response ({text_len} chars), {n_steps} resolution steps"
+
+     return reward, "Step taken"
+
+
+ def _summarize_action(action_dict: Dict[str, Any]) -> str:
+     """One-line summary of an action for thread_history."""
+     atype = action_dict.get("action_type", "unknown")
+     if atype == "classify":
+         return f"classify(category={action_dict.get('category')}, priority={action_dict.get('priority')})"
+     elif atype == "extract":
+         ents = action_dict.get("extracted_entities") or {}
+         acts = action_dict.get("required_actions") or []
+         return f"extract(entities={list(ents.keys())}, actions={acts})"
+     elif atype == "respond":
+         text = (action_dict.get("response_text") or "")[:60]
+         steps = action_dict.get("resolution_steps") or []
+         return f"respond(text='{text}...', steps={len(steps)})"
+     elif atype == "submit":
+         return "submit()"
+     return f"{atype}()"
+
+
+ def _get_hint(task_id: str, step: int) -> Optional[str]:
+     """Contextual hints to guide the agent."""
+     if step == 0:
+         hints = {
+             "task1": "Read the ticket carefully and classify by category and priority.",
+             "task2": "Extract all entities (IDs, names, amounts) and identify required actions.",
+             "task3": "Write a professional response and list resolution steps.",
+         }
+         return hints.get(task_id)
+     return None
graders.py
CHANGED
@@ -1,191 +1,244 @@
  """
-

-
  """
- from


-     Returns (score, breakdown, feedback)
-     """
      if task_id == "task1":
-         return
      elif task_id == "task2":
-         return
      elif task_id == "task3":
-         return
-

- def
      """
-     - nginx config is valid (30%)
-     - HTTP 200 response on port 80 (40%)
      """
-     breakdown = {
      }
- def grade_task2(episode_state: Dict[str, Any]) -> Tuple[float, Dict[str, float], str]:
      """
-     - docker-compose up -d was successful (25%)
-     - Container is running (25%)
-     - Service accessible on correct port (25%)
      """
-         "container_running": 0.0,
-         "port_accessible": 0.0,
      }
      """
-     - Memory usage decreased (25%)
      """
-         "memory_reduced": 0.0,
      }

  """
+ Deterministic graders for SupportEnv tasks.

+ Each grader inspects the agent's action_history against ground-truth data
+ and returns (score, breakdown, feedback) where score is in [0.0, 1.0].
+
+ Task 1 — Classification: category match (0.50) + priority match (0.40) + efficiency (0.10)
+ Task 2 — Extraction: entity coverage (0.60) + action coverage (0.30) + no hallucination (0.10)
+ Task 3 — Resolution: keyword coverage (0.30) + step coverage (0.30) + tone (0.25) +
+                      length (0.10) + non-empty steps (0.05)
  """
+ from __future__ import annotations

+ from typing import Any, Dict, List, Optional, Tuple

+
+ def grade_task(
+     task_id: str, episode_state: Dict[str, Any]
+ ) -> Tuple[float, Dict[str, float], str]:
      if task_id == "task1":
+         return _grade_classification(episode_state)
      elif task_id == "task2":
+         return _grade_extraction(episode_state)
      elif task_id == "task3":
+         return _grade_resolution(episode_state)
+     return 0.0, {}, "Unknown task"
+
+
+ # ---------------------------------------------------------------------------
+ # Helpers
+ # ---------------------------------------------------------------------------
+
+ def _last_action_of_type(
+     history: List[Dict[str, Any]], action_type: str
+ ) -> Optional[Dict[str, Any]]:
+     """Return the last action matching *action_type*, or None."""
+     for action in reversed(history):
+         if action.get("action_type") == action_type:
+             return action
+     return None
+
+
+ def _normalize(s: Any) -> str:
+     return str(s).strip().lower() if s is not None else ""
+

+ # ---------------------------------------------------------------------------
+ # Task 1 — Classification
+ # ---------------------------------------------------------------------------

+ def _grade_classification(ep: Dict[str, Any]) -> Tuple[float, Dict[str, float], str]:
      """
+     Score breakdown:
+       category_correct   0.50 — exact match
+       priority_correct   0.40 — exact match
+       efficiency         0.10 — 1 step = full, degrades linearly
      """
+     gt = ep["ticket_data"]["ground_truth"]
+     history = ep.get("action_history", [])
+
+     breakdown: Dict[str, float] = {
+         "category_correct": 0.0,
+         "priority_correct": 0.0,
+         "efficiency": 0.0,
      }
+
+     classify_action = _last_action_of_type(history, "classify")
+     if classify_action is None:
+         return 0.0, breakdown, "No classify action found."
+
+     # Category
+     if _normalize(classify_action.get("category")) == _normalize(gt["category"]):
+         breakdown["category_correct"] = 0.50
+
+     # Priority
+     if _normalize(classify_action.get("priority")) == _normalize(gt["priority"]):
+         breakdown["priority_correct"] = 0.40
+
+     # Efficiency: full marks if classified in 1 step, degrades linearly
+     max_steps = ep.get("max_steps", 3)
+     steps_used = ep.get("step_number", max_steps)
+     if steps_used <= 1:
+         breakdown["efficiency"] = 0.10
+     else:
+         breakdown["efficiency"] = round(max(0.0, 0.10 * (1 - (steps_used - 1) / max_steps)), 4)
+
+     score = round(min(sum(breakdown.values()), 1.0), 4)
+     parts = ", ".join(f"{k}={v:.2f}" for k, v in breakdown.items())
+     return score, breakdown, f"Task 1: {parts}"
+
+
+ # ---------------------------------------------------------------------------
+ # Task 2 — Information Extraction
+ # ---------------------------------------------------------------------------
+
+ def _grade_extraction(ep: Dict[str, Any]) -> Tuple[float, Dict[str, float], str]:
      """
+     Score breakdown:
+       entity_coverage    0.60 — fraction of ground-truth entities matched
+       action_coverage    0.30 — fraction of required actions matched
+       no_hallucination   0.10 — penalty for extra entities not in ground truth
      """
+     gt = ep["ticket_data"]["ground_truth"]
+     history = ep.get("action_history", [])
+
+     breakdown: Dict[str, float] = {
+         "entity_coverage": 0.0,
+         "action_coverage": 0.0,
+         "no_hallucination": 0.10,  # start with full marks, deduct
      }
+
+     extract_action = _last_action_of_type(history, "extract")
+     if extract_action is None:
+         breakdown["no_hallucination"] = 0.0
+         return 0.0, breakdown, "No extract action found."
+
+     # --- Entity coverage ---
+     gt_entities: Dict[str, Any] = gt.get("entities", {})
+     pred_entities: Dict[str, Any] = extract_action.get("extracted_entities") or {}
+
+     if gt_entities:
+         matched = 0
+         for key, gt_val in gt_entities.items():
+             pred_val = pred_entities.get(key)
+             if pred_val is not None and _entity_matches(gt_val, pred_val):
+                 matched += 1
+         breakdown["entity_coverage"] = round(0.60 * matched / len(gt_entities), 4)
+
+     # --- Action coverage ---
+     gt_actions: List[str] = gt.get("required_actions", [])
+     pred_actions: List[str] = extract_action.get("required_actions") or []
+     pred_actions_lower = [_normalize(a) for a in pred_actions]
+
+     if gt_actions:
+         matched_actions = sum(
+             1 for ga in gt_actions if _normalize(ga) in pred_actions_lower
+         )
+         breakdown["action_coverage"] = round(0.30 * matched_actions / len(gt_actions), 4)
+
+     # --- No hallucination ---
+     if pred_entities and gt_entities:
+         extra_keys = set(pred_entities.keys()) - set(gt_entities.keys())
+         if extra_keys:
+             penalty = min(len(extra_keys) * 0.02, 0.10)
+             breakdown["no_hallucination"] = round(max(0.0, 0.10 - penalty), 4)
+
+     score = round(min(sum(breakdown.values()), 1.0), 4)
+     parts = ", ".join(f"{k}={v:.2f}" for k, v in breakdown.items())
+     return score, breakdown, f"Task 2: {parts}"
+
+
+ def _entity_matches(gt_val: Any, pred_val: Any) -> bool:
+     """Flexible entity comparison — handles strings, lists, and numbers."""
+     if isinstance(gt_val, list) and isinstance(pred_val, list):
+         gt_set = {_normalize(v) for v in gt_val}
+         pred_set = {_normalize(v) for v in pred_val}
+         return gt_set == pred_set
+     return _normalize(gt_val) == _normalize(pred_val)
+
+
+ # ---------------------------------------------------------------------------
+ # Task 3 — Resolution Generation
+ # ---------------------------------------------------------------------------
+
+ def _grade_resolution(ep: Dict[str, Any]) -> Tuple[float, Dict[str, float], str]:
      """
+     Score breakdown:
+       keyword_coverage   0.30 — fraction of required keywords found in response
+       step_coverage      0.30 — fraction of required resolution steps matched
+       tone_compliance    0.25 — apology / urgency / timeline adherence
+       length_adequate    0.10 — response meets minimum length
+       no_empty_steps     0.05 — all resolution steps are non-empty
      """
+     gt = ep["ticket_data"]["ground_truth"]
+     history = ep.get("action_history", [])
+
+     breakdown: Dict[str, float] = {
+         "keyword_coverage": 0.0,
+         "step_coverage": 0.0,
+         "tone_compliance": 0.0,
+         "length_adequate": 0.0,
+         "no_empty_steps": 0.05,  # assume pass unless empty steps found
      }
+
+     respond_action = _last_action_of_type(history, "respond")
+     if respond_action is None:
+         breakdown["no_empty_steps"] = 0.0
+         return 0.0, breakdown, "No respond action found."
+
+     response_text: str = respond_action.get("response_text") or ""
+     resolution_steps: List[str] = respond_action.get("resolution_steps") or []
+     response_lower = response_text.lower()
+
+     # --- Keyword coverage ---
+     required_keywords: List[str] = gt.get("required_keywords", [])
+     if required_keywords:
+         matched_kw = sum(1 for kw in required_keywords if kw.lower() in response_lower)
+         breakdown["keyword_coverage"] = round(0.30 * matched_kw / len(required_keywords), 4)
+
+     # --- Step coverage ---
+     gt_steps: List[str] = gt.get("required_resolution_steps", [])
+     if gt_steps:
+         pred_steps_lower = [_normalize(s) for s in resolution_steps]
+         matched_steps = sum(
+             1 for gs in gt_steps if _normalize(gs) in pred_steps_lower
+         )
+         breakdown["step_coverage"] = round(0.30 * matched_steps / len(gt_steps), 4)
+
+     # --- Tone compliance ---
+     tone_req = gt.get("tone_requirements", {})
+     tone_checks = 0
+     tone_pass = 0
+     if tone_req.get("must_apologize"):
+         tone_checks += 1
+         apology_words = ["apolog", "sorry", "regret", "sincerely"]
+         if any(w in response_lower for w in apology_words):
+             tone_pass += 1
+     if tone_req.get("must_acknowledge_urgency"):
+         tone_checks += 1
+         urgency_words = ["urgent", "immediately", "priority", "asap", "right away", "as soon as"]
+         if any(w in response_lower for w in urgency_words):
+             tone_pass += 1
+     if tone_req.get("must_provide_timeline"):
+         tone_checks += 1
+         timeline_words = ["within", "hours", "minutes", "by end of", "shortly", "today", "tomorrow", "timeline", "expect"]
+         if any(w in response_lower for w in timeline_words):
+             tone_pass += 1
+     if tone_checks > 0:
+         breakdown["tone_compliance"] = round(0.25 * tone_pass / tone_checks, 4)
+     else:
+         breakdown["tone_compliance"] = 0.25  # no tone requirements = full marks
+
+     # --- Length adequate ---
+     min_len = gt.get("expected_response_length_min", 80)
+     if len(response_text) >= min_len:
+         breakdown["length_adequate"] = 0.10
+
+     # --- Non-empty steps ---
+     if not resolution_steps or any(not s.strip() for s in resolution_steps):
+         breakdown["no_empty_steps"] = 0.0
+
+     score = round(min(sum(breakdown.values()), 1.0), 4)
+     parts = ", ".join(f"{k}={v:.2f}" for k, v in breakdown.items())
+     return score, breakdown, f"Task 3: {parts}"
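
As a quick sanity check of the Task 1 scoring above, the snippet below builds a minimal episode dict by hand and runs it through grade_task. It is illustrative only; real ground_truth values come from data.py, which is not shown in this diff.

# Worked example of the Task 1 grader (illustrative episode dict).
from graders import grade_task

episode = {
    "ticket_data": {"ground_truth": {"category": "billing", "priority": "high"}},
    "action_history": [
        {"action_type": "classify", "category": "Billing", "priority": "high"},
        {"action_type": "submit"},
    ],
    "step_number": 2,
    "max_steps": 3,
}

score, breakdown, feedback = grade_task("task1", episode)
# category_correct=0.50 and priority_correct=0.40 (matching is case-insensitive via _normalize);
# efficiency = 0.10 * (1 - 1/3) ≈ 0.0667, so score ≈ 0.9667.
print(score, breakdown, feedback)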
inference.py
CHANGED
@@ -1,351 +1,290 @@
  """
- Baseline

-
-

-
-
  """
  import os
  import sys
- import json
- import argparse
  import time
- from
- from typing import Optional

  import requests
- from google import genai
  from openai import OpenAI

- #
-     """Create an OpenAI-compatible client for OpenAI-style chat completions."""
-     api_key = os.environ.get("OPENAI_API_KEY", "sk-test")
-     client_kwargs = {"api_key": api_key}
-     if OPENAI_BASE_URL:
-         client_kwargs["base_url"] = OPENAI_BASE_URL
-     return OpenAI(**client_kwargs)
-
-
- def _get_gemini_client() -> genai.Client:
-     """Create a Gemini client using the official google-genai SDK."""
-     api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("OPENAI_API_KEY", "")
-     if not api_key:
-         raise ValueError("GEMINI_API_KEY is required for Gemini models")
-     return genai.Client(api_key=api_key)
-
-
- def _is_gemini_model(model: str) -> bool:
-     """Detect whether the requested model should use the Gemini SDK path."""
-     m = (model or "").lower()
-     return "gemini" in m
-
-
- def _resolve_gemini_model(model: str) -> str:
-     """Map shorthand Gemini model names to concrete model IDs."""
-     m = (model or "").strip()
-     if not m or m.lower() == "gemini":
-         return GEMINI_DEFAULT_MODEL
-     return m
-
-
- def _generate_action_text(
-     model: str,
-     system_prompt: str,
-     user_prompt: str,
-     openai_client: Optional[OpenAI],
-     gemini_client: Optional[genai.Client],
- ) -> str:
-     """Generate model output text using Gemini SDK or OpenAI-compatible chat."""
-     if _is_gemini_model(model):
-         if gemini_client is None:
-             raise ValueError("Gemini client was not initialized")
-         gemini_model = _resolve_gemini_model(model)
-         combined_prompt = (
-             f"System instructions:\n{system_prompt}\n\n"
-             f"User request:\n{user_prompt}"
-         )
-         response = gemini_client.models.generate_content(
-             model=gemini_model,
-             contents=combined_prompt,
-         )
-         return response.text or ""
-
-     if openai_client is None:
-         raise ValueError("OpenAI client was not initialized")
-
-     response = openai_client.chat.completions.create(
-         model=model,
-         messages=[
-             {"role": "system", "content": system_prompt},
-             {"role": "user", "content": user_prompt},
-         ],
-         temperature=0.3,
-         max_tokens=1000,
      )
-
-     # Prepare prompt for LLM
-     system_prompt = """You are an expert Linux DevOps engineer/SRE.
- Your job is to diagnose and fix broken systems using bash commands and file edits.
- You are interacting with a simulated Linux environment.
-
- Available actions:
- 1. bash_cmd: Execute a bash command
- 2. file_edit: Edit a file
- 3. submit: Submit when the task is complete
-
- Respond in JSON format with this structure:
- {
-     "action_type": "bash_cmd" | "file_edit" | "submit",
-     "command": "command to execute" (if bash_cmd),
-     "file_path": "/path/to/file" (if file_edit),
-     "file_content": "new file content" (if file_edit),
-     "summary": "why you're taking this action"
  }

- Be strategic:
- - Start by diagnosing the system
- - Use ps, systemctl, curl, etc. to understand issues
- - Fix the root cause
- - Submit when done
- """
-
-     user_prompt = f"""
- Current system state:
- - Task: {obs['task_description']}
- - Step: {state['step_number']}/{state['max_steps']}
- - Reward so far: {state['total_reward']:.3f}

-
-

- Previous actions: {len(state['history'])} taken so far

-
-

-
-     """
          "episode_id": episode_id,
-         "action":
      })
-
-
-
-     done =
-
-     print(f"Reward: {reward['step_reward']:+.3f} (total: {total_reward:.3f})")
-     print(f"Info: {reward['explanation'][:100]}")
-
-     if done:
-         print(f"\n{'='*60}")
-         print("EPISODE COMPLETE!")
-         print(f"Final Reward: {total_reward:.3f}")
-         print(f"Steps taken: {step_count}")
-         print(f"{'='*60}\n")
-         break
-
-     except Exception as e:
-         print(f"Step error: {e}")
-         break
-
-     # Small delay to avoid rate limiting
-     time.sleep(0.5)
-
-     # Grade the episode
-     try:
-         grade_result = send_request("POST", "/grader", json={"episode_id": episode_id})
-         print(f"\nGrader Results:")
-         print(f"Score: {grade_result['score']:.3f}/1.0")
-         print(f"Breakdown: {json.dumps(grade_result['breakdown'], indent=2)}")
-         print(f"Feedback: {grade_result['feedback']}")
-     except Exception as e:
-         print(f"Grading error: {e}")
-
      return {
          "task_id": task_id,
      }


- def
-     )


  if __name__ == "__main__":

  """
+ Baseline inference script for SupportEnv.

+ Runs an LLM agent against all 3 tasks (5 tickets each) and emits the
+ mandatory [START]/[STEP]/[END] stdout format.

+ Environment variables:
+   API_BASE_URL      LLM endpoint (default: https://router.huggingface.co/v1)
+   MODEL_NAME        Model identifier (default: Qwen/Qwen2.5-72B-Instruct)
+   HF_TOKEN          API key
+   API_BASE_URL_ENV  SupportEnv server URL (default: http://localhost:7860)
  """
+ import json
  import os
  import sys
  import time
+ from typing import Any, Dict, List, Optional

  import requests
  from openai import OpenAI

+ # ---------------------------------------------------------------------------
+ # Config from environment
+ # ---------------------------------------------------------------------------
+
+ API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
+ MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
+ HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("API_KEY", "")
+ ENV_BASE_URL = os.getenv("API_BASE_URL_ENV", "http://localhost:7860")
+
+ TEMPERATURE = 0.3
+ MAX_TOKENS = 1024
+ BENCHMARK = "supportenv"
+
+ TASKS = [
+     {"task_id": "task1", "name": "Ticket Classification", "tickets": 5},
+     {"task_id": "task2", "name": "Information Extraction", "tickets": 5},
+     {"task_id": "task3", "name": "Resolution Generation", "tickets": 5},
+ ]
+
+
+ # ---------------------------------------------------------------------------
+ # Logging helpers (mandatory format)
+ # ---------------------------------------------------------------------------
+
+ def log_start(task: str, env: str, model: str) -> None:
+     print(f"[START] task={task} env={env} model={model}", flush=True)
+
+
+ def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
+     error_val = error if error else "null"
+     done_val = str(done).lower()
+     print(
+         f"[STEP] step={step} action={action} reward={reward:.2f} done={done_val} error={error_val}",
+         flush=True,
+     )
+
+
+ def log_end(success: bool, steps: int, rewards: List[float]) -> None:
+     rewards_str = ",".join(f"{r:.2f}" for r in rewards)
+     print(
+         f"[END] success={str(success).lower()} steps={steps} rewards={rewards_str}",
+         flush=True,
      )
+
+
+ # ---------------------------------------------------------------------------
+ # Environment HTTP helpers
+ # ---------------------------------------------------------------------------
+
+ def env_request(method: str, endpoint: str, **kwargs) -> Dict[str, Any]:
+     url = f"{ENV_BASE_URL}{endpoint}"
+     resp = requests.request(method, url, timeout=30, **kwargs)
+     resp.raise_for_status()
+     return resp.json()
+
+
+ # ---------------------------------------------------------------------------
+ # LLM prompts per task
+ # ---------------------------------------------------------------------------
+
+ SYSTEM_PROMPTS = {
+     "task1": (
+         "You are an expert customer support triage agent.\n"
+         "Given a support ticket, classify it by:\n"
+         "  category: one of billing | technical | account | feature_request | complaint | general\n"
+         "  priority: one of low | medium | high | critical\n\n"
+         "Respond with ONLY valid JSON:\n"
+         '{"action_type": "classify", "category": "<category>", "priority": "<priority>"}'
+     ),
+     "task2": (
+         "You are an expert information extraction agent for customer support.\n"
+         "Given a support ticket, extract ALL structured entities and identify required actions.\n\n"
+         "Respond with ONLY valid JSON:\n"
+         '{"action_type": "extract", "extracted_entities": {"key": "value", ...}, '
+         '"required_actions": ["action1", "action2", ...]}'
+     ),
+     "task3": (
+         "You are an expert customer support resolution agent.\n"
+         "Given a support ticket, write a professional customer-facing response and "
+         "list the internal resolution steps.\n\n"
+         "Requirements:\n"
+         "- response_text: Professional, empathetic response (80+ chars)\n"
+         "- resolution_steps: Ordered list of internal action identifiers\n"
+         "- If the ticket is urgent, acknowledge urgency and provide a timeline\n"
+         "- If appropriate, include an apology\n\n"
+         "Respond with ONLY valid JSON:\n"
+         '{"action_type": "respond", "response_text": "...", '
+         '"resolution_steps": ["step1", "step2", ...]}'
+     ),
  }

+ def build_user_prompt(task_id: str, ticket: Dict[str, Any]) -> str:
+     parts = [
+         f"Ticket ID: {ticket['ticket_id']}",
+         f"Subject: {ticket['subject']}",
+         f"Body: {ticket['body']}",
+         f"Customer Tier: {ticket['customer_tier']}",
+         f"Account Age: {ticket['account_age_days']} days",
+         f"Previous Tickets: {ticket['previous_tickets']}",
+     ]
+     if ticket.get("attachments"):
+         parts.append(f"Attachments: {', '.join(ticket['attachments'])}")
+     return "\n".join(parts)

+ # ---------------------------------------------------------------------------
+ # LLM call
+ # ---------------------------------------------------------------------------

+ def call_llm(client: OpenAI, task_id: str, ticket: Dict[str, Any]) -> Dict[str, Any]:
+     """Call the LLM and parse its JSON response into an action dict."""
+     system_prompt = SYSTEM_PROMPTS[task_id]
+     user_prompt = build_user_prompt(task_id, ticket)
+
+     try:
+         completion = client.chat.completions.create(
+             model=MODEL_NAME,
+             messages=[
+                 {"role": "system", "content": system_prompt},
+                 {"role": "user", "content": user_prompt},
+             ],
+             temperature=TEMPERATURE,
+             max_tokens=MAX_TOKENS,
+         )
+         text = (completion.choices[0].message.content or "").strip()
+         return _parse_json(text, task_id)
+     except Exception as exc:
+         print(f"[DEBUG] LLM error: {exc}", file=sys.stderr, flush=True)
+         return _fallback_action(task_id)
+
+
+ def _parse_json(text: str, task_id: str) -> Dict[str, Any]:
+     """Extract JSON from model output, handling markdown fences."""
+     if "```json" in text:
+         text = text.split("```json")[1].split("```")[0]
+     elif "```" in text:
+         text = text.split("```")[1].split("```")[0]
+     try:
+         return json.loads(text.strip())
+     except json.JSONDecodeError:
+         print(f"[DEBUG] JSON parse failed: {text[:120]}", file=sys.stderr, flush=True)
+         return _fallback_action(task_id)
+
+
+ def _fallback_action(task_id: str) -> Dict[str, Any]:
+     """Deterministic fallback when LLM fails."""
+     if task_id == "task1":
+         return {"action_type": "classify", "category": "general", "priority": "medium"}
+     elif task_id == "task2":
+         return {"action_type": "extract", "extracted_entities": {}, "required_actions": []}
+     return {"action_type": "respond", "response_text": "Thank you for contacting support. We are looking into this.", "resolution_steps": []}
+
+
+ # ---------------------------------------------------------------------------
+ # Run one episode
+ # ---------------------------------------------------------------------------
+
+ def run_episode(
+     client: OpenAI, task_id: str, task_name: str, ticket_index: int
+ ) -> Dict[str, Any]:
+     """Run a single episode: reset → action → submit → grade."""
+     log_start(task=f"{task_name}-ticket{ticket_index}", env=BENCHMARK, model=MODEL_NAME)
+
+     rewards: List[float] = []
+     steps_taken = 0
+     success = False
+     error_msg: Optional[str] = None
+
+     try:
+         # Reset
+         obs = env_request("POST", "/reset", json={
+             "task_id": task_id, "ticket_index": ticket_index
+         })
+         episode_id = obs["episode_id"]
+         ticket = obs["ticket"]
+
+         # Step 1: LLM generates the action
+         action_data = call_llm(client, task_id, ticket)
+         result = env_request("POST", "/step", json={
+             "episode_id": episode_id, "action": action_data
+         })
+         steps_taken = 1
+         reward_val = result["reward"]["step_reward"]
+         rewards.append(reward_val)
+         done = result["done"]
+         action_summary = _action_summary(action_data)
+         log_step(step=1, action=action_summary, reward=reward_val, done=done, error=error_msg)
+
+         # Step 2: Submit if not already done
+         if not done:
+             submit_result = env_request("POST", "/step", json={
+                 "episode_id": episode_id,
+                 "action": {"action_type": "submit"},
+             })
+             steps_taken = 2
+             reward_val = submit_result["reward"]["step_reward"]
+             rewards.append(reward_val)
+             done = submit_result["done"]
+             log_step(step=2, action="submit()", reward=reward_val, done=done, error=None)
+
+         # Grade
+         grade = env_request("POST", "/grader", json={"episode_id": episode_id})
+         final_score = grade["score"]
+         success = final_score >= 0.5
+
+     except Exception as exc:
+         error_msg = str(exc)
+         print(f"[DEBUG] Episode error: {exc}", file=sys.stderr, flush=True)
+
+     log_end(success=success, steps=steps_taken, rewards=rewards)
+
      return {
          "task_id": task_id,
+         "ticket_index": ticket_index,
+
"steps": steps_taken,
|
| 238 |
+
"rewards": rewards,
|
| 239 |
+
"success": success,
|
| 240 |
}
|
| 241 |
|
| 242 |
|
| 243 |
+
def _action_summary(action: Dict[str, Any]) -> str:
|
| 244 |
+
atype = action.get("action_type", "unknown")
|
| 245 |
+
if atype == "classify":
|
| 246 |
+
return f"classify({action.get('category')},{action.get('priority')})"
|
| 247 |
+
elif atype == "extract":
|
| 248 |
+
ents = action.get("extracted_entities") or {}
|
| 249 |
+
acts = action.get("required_actions") or []
|
| 250 |
+
return f"extract({len(ents)}ents,{len(acts)}acts)"
|
| 251 |
+
elif atype == "respond":
|
| 252 |
+
tlen = len(action.get("response_text") or "")
|
| 253 |
+
slen = len(action.get("resolution_steps") or [])
|
| 254 |
+
return f"respond({tlen}chars,{slen}steps)"
|
| 255 |
+
return atype
|
| 256 |
+
|
| 257 |
+
|
| 258 |
+
# ---------------------------------------------------------------------------
|
| 259 |
+
# Main
|
| 260 |
+
# ---------------------------------------------------------------------------
|
| 261 |
+
|
| 262 |
+
def main() -> None:
|
| 263 |
+
client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
|
| 264 |
+
|
| 265 |
+
results = []
|
| 266 |
+
for task_info in TASKS:
|
| 267 |
+
task_id = task_info["task_id"]
|
| 268 |
+
task_name = task_info["name"]
|
| 269 |
+
num_tickets = task_info["tickets"]
|
| 270 |
+
|
| 271 |
+
for ticket_idx in range(num_tickets):
|
| 272 |
+
result = run_episode(client, task_id, task_name, ticket_idx)
|
| 273 |
+
results.append(result)
|
| 274 |
+
time.sleep(0.5) # rate-limit courtesy
|
| 275 |
+
|
| 276 |
+
# Summary
|
| 277 |
+
print("\n" + "=" * 60, flush=True)
|
| 278 |
+
print("BASELINE RESULTS SUMMARY", flush=True)
|
| 279 |
+
print("=" * 60, flush=True)
|
| 280 |
+
for r in results:
|
| 281 |
+
status = "PASS" if r["success"] else "FAIL"
|
| 282 |
+
total_r = sum(r["rewards"])
|
| 283 |
+
print(
|
| 284 |
+
f" {r['task_id']} ticket={r['ticket_index']} "
|
| 285 |
+
f"steps={r['steps']} reward={total_r:.2f} {status}",
|
| 286 |
+
flush=True,
|
| 287 |
+
)
|
| 288 |
|
| 289 |
|
| 290 |
if __name__ == "__main__":
|
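For reference, the same reset → step → submit → grade loop can also be driven with plain HTTP calls against the Space. Below is a minimal sketch using requests; the base URL is a placeholder you must fill in, and the payload and response fields mirror only what the script above reads (episode_id, ticket, reward.step_reward, done, score).

import requests

BASE_URL = "https://<your-space>.hf.space"  # placeholder, set to your deployed Space

def run_manual_episode(task_id: str = "task1", ticket_index: int = 0) -> float:
    # Reset: start an episode and receive the ticket to triage
    obs = requests.post(f"{BASE_URL}/reset",
                        json={"task_id": task_id, "ticket_index": ticket_index},
                        timeout=30).json()
    episode_id = obs["episode_id"]

    # Step: send a hand-written classify action (task1 schema)
    step = requests.post(f"{BASE_URL}/step",
                         json={"episode_id": episode_id,
                               "action": {"action_type": "classify",
                                          "category": "billing",
                                          "priority": "high"}},
                         timeout=30).json()

    # Submit if the episode is still open
    if not step["done"]:
        requests.post(f"{BASE_URL}/step",
                      json={"episode_id": episode_id,
                            "action": {"action_type": "submit"}},
                      timeout=30)

    # Grade: final score in [0, 1]
    grade = requests.post(f"{BASE_URL}/grader",
                          json={"episode_id": episode_id},
                          timeout=30).json()
    return grade["score"]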
models.py
CHANGED

@@ -1,7 +1,8 @@
"""
Pydantic models for SupportEnv — Customer Support Ticket Triage.

Domain: SaaS customer support automation
Tasks: classification, information extraction, resolution generation
"""
from __future__ import annotations

@@ -10,21 +11,18 @@ from pydantic import BaseModel, Field

# ---------------------------------------------------------------------------
# Ticket Info (what the agent sees)
# ---------------------------------------------------------------------------

class TicketInfo(BaseModel):
    """A customer support ticket presented to the agent."""
    ticket_id: str
    subject: str
    body: str
    customer_tier: str = Field(description="free | pro | enterprise")
    account_age_days: int
    previous_tickets: int
    attachments: List[str] = Field(default_factory=list)


# ---------------------------------------------------------------------------

@@ -34,17 +32,17 @@ class SystemState(BaseModel):
class Observation(BaseModel):
    """Everything the agent sees at each step."""
    task_id: str = Field(description="task1 | task2 | task3")
    task_description: str
    episode_id: str
    ticket: TicketInfo
    thread_history: List[Dict[str, str]] = Field(
        default_factory=list,
        description="Ordered list of {'role': 'agent'|'system', 'content': str}",
    )
    available_actions: List[str]
    step_number: int
    max_steps: int
    hint: Optional[str] = None


# ---------------------------------------------------------------------------

@@ -52,12 +50,40 @@ class Observation(BaseModel):
# ---------------------------------------------------------------------------

class Action(BaseModel):
    """Agent action for support ticket processing."""
    action_type: str = Field(
        description="classify | extract | respond | resolve | escalate | submit"
    )
    # Task 1: Classification
    category: Optional[str] = Field(
        default=None,
        description="billing | technical | account | feature_request | complaint | general",
    )
    priority: Optional[str] = Field(
        default=None,
        description="low | medium | high | critical",
    )
    # Task 2: Extraction
    extracted_entities: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Key-value pairs extracted from the ticket",
    )
    required_actions: Optional[List[str]] = Field(
        default=None,
        description="List of actions needed to resolve the ticket",
    )
    # Task 3: Resolution
    response_text: Optional[str] = Field(
        default=None,
        description="Customer-facing response text",
    )
    resolution_steps: Optional[List[str]] = Field(
        default=None,
        description="Ordered list of internal resolution steps",
    )
    # Escalation
    escalation_team: Optional[str] = Field(default=None)
    escalation_reason: Optional[str] = Field(default=None)


# ---------------------------------------------------------------------------

@@ -94,7 +120,7 @@ class State(BaseModel):
    done: bool
    total_reward: float
    history: List[Dict[str, Any]] = Field(default_factory=list)
    final_score: Optional[float] = None


# ---------------------------------------------------------------------------

@@ -128,4 +154,4 @@ class BaselineResult(BaseModel):
    final_score: float
    step_count: int
    total_reward: float
    actions: List[Dict[str, Any]]
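Since this single Action schema serves all three tasks, here is a minimal sketch of how each task populates it. The field values are illustrative only, and model_dump assumes Pydantic v2 (use .dict(exclude_none=True) on v1).

from models import Action

# Task 1: classification only needs category + priority
classify = Action(action_type="classify", category="billing", priority="high")

# Task 2: extraction fills the entity/action fields instead
extract = Action(
    action_type="extract",
    extracted_entities={"account_id": "ACC-78234"},
    required_actions=["issue_refund"],
)

# Task 3: resolution supplies customer-facing text plus internal steps
respond = Action(
    action_type="respond",
    response_text="We sincerely apologize and will resolve this within the hour.",
    resolution_steps=["verify_account", "issue_refund"],
)

# Unused optional fields stay None and can be dropped when serializing
payload = classify.model_dump(exclude_none=True)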
openenv.yaml
CHANGED

@@ -57,14 +57,6 @@ interface:
      episode_id: string
    response: GraderResponse

-  baseline:
-    method: POST
-    path: /baseline
-    request:
-      model: string          # optional, default heuristic
-      ticket_index: integer  # optional, default 0
-    response: BaselineResult
-
  health:
    method: GET
    path: /health
requirements.txt
CHANGED

@@ -5,4 +5,3 @@ openai>=1.35.0
httpx>=0.27.0
python-multipart>=0.0.9
requests>=2.31.0
-google-genai>=1.15.0
test_integration.py
CHANGED

@@ -1,116 +1,112 @@
"""
Integration test for SupportEnv.

Runs a full episode for each task and prints results.
Usage: python test_integration.py
"""
import environment as env
from models import Action


def test_task(task_id: str) -> bool:
    """Run a full episode for a task. Returns True if passed."""
    print(f"\n{'='*60}")
    print(f"Testing {task_id}")
    print(f"{'='*60}")

    # Reset
    print("1. reset()...")
    obs = env.reset(task_id, ticket_index=0)
    episode_id = obs.episode_id
    print(f"   [OK] episode_id={episode_id[:8]}...")
    print(f"   ticket_id={obs.ticket.ticket_id} subject={obs.ticket.subject[:50]}")
    print(f"   max_steps={obs.max_steps} hint={obs.hint}")

    # Take a relevant action
    print("2. step() with task action...")
    if task_id == "task1":
        action = Action(action_type="classify", category="billing", priority="high")
    elif task_id == "task2":
        action = Action(
            action_type="extract",
            extracted_entities={"customer_name": "Robert Chen", "account_id": "ACC-78234"},
            required_actions=["issue_refund"],
        )
    else:  # task3
        action = Action(
            action_type="respond",
            response_text=(
                "We sincerely apologize for the inconvenience with your password reset. "
                "We will manually reset your password and send a new email immediately. "
                "Please check your spam folder and whitelist our domain. "
                "We will resolve this within the next 30 minutes."
            ),
            resolution_steps=[
                "verify_email_delivery",
                "check_spam_filters",
                "manual_password_reset",
                "follow_up_confirmation",
            ],
        )

    result = env.step(episode_id, action)
    print(f"   [OK] step_reward={result.reward.step_reward:+.4f} done={result.done}")

    # Submit
    print("3. step() submit...")
    if not result.done:
        result = env.step(episode_id, Action(action_type="submit"))
    print(f"   [OK] done={result.done} total_reward={result.reward.total_reward:.4f}")

    # State
    print("4. get_state()...")
    state = env.get_state(episode_id)
    print(f"   [OK] steps={state.step_number} history_len={len(state.history)}")

    # Grade
    print("5. grade()...")
    score, breakdown, feedback = env.grade(episode_id)
    print(f"   [OK] score={score:.4f}/1.0")
    print(f"   breakdown: {', '.join(f'{k}={v:.2f}' for k, v in breakdown.items())}")
    print(f"   feedback: {feedback}")

    passed = score >= 0.0  # just verify pipeline works
    return passed


def main():
    print("SupportEnv Integration Test")
    print("=" * 60)

    results = []
    for task_id in ["task1", "task2", "task3"]:
        try:
            ok = test_task(task_id)
            results.append((task_id, ok, None))
        except Exception as exc:
            import traceback
            traceback.print_exc()
            results.append((task_id, False, str(exc)))
        finally:
            env._EPISODES.clear()

    print(f"\n{'='*60}")
    print("SUMMARY")
    print("=" * 60)
    all_ok = True
    for task_id, ok, err in results:
        status = "[PASS]" if ok else "[FAIL]"
        print(f"  {status} {task_id}" + (f" — {err}" if err else ""))
        if not ok:
            all_ok = False

    if all_ok:
        print("\n[OK] All integration tests passed!")
    else:
        print("\n[FAIL] Some tests failed.")
    return 0 if all_ok else 1


if __name__ == "__main__":
    import sys
    sys.exit(main())
tests_new.py
CHANGED

@@ -1,11 +1,9 @@
"""
Comprehensive tests for SupportEnv.

Run with: pytest tests_new.py -v
"""
import pytest

import environment as env
from models import Action, Observation, StepResult, State

@@ -14,232 +12,245 @@ from data import TASK_META

class TestReset:
    """Test episode reset functionality."""

    def test_reset_task1(self):
        obs = env.reset("task1")
        assert isinstance(obs, Observation)
        assert obs.task_id == "task1"
        assert obs.episode_id is not None
        assert obs.ticket is not None
        assert obs.step_number == 0
        assert obs.max_steps == TASK_META["task1"]["max_steps"]

    def test_reset_task2(self):
        obs = env.reset("task2")
        assert obs.task_id == "task2"
        assert obs.ticket.ticket_id.startswith("T2-")

    def test_reset_task3(self):
        obs = env.reset("task3")
        assert obs.task_id == "task3"
        assert obs.ticket.ticket_id.startswith("T3-")

    def test_reset_with_ticket_index(self):
        obs = env.reset("task1", ticket_index=2)
        assert obs.ticket.ticket_id == "T1-003"

    def test_reset_invalid_task(self):
        with pytest.raises(ValueError):
            env.reset("task_unknown")

    def test_reset_invalid_ticket_index(self):
        with pytest.raises(ValueError):
            env.reset("task1", ticket_index=99)

    def test_reset_creates_episode(self):
        obs = env.reset("task1")
        assert obs.episode_id in env._EPISODES
        ep = env._EPISODES[obs.episode_id]
        assert ep["task_id"] == "task1"
        assert ep["step_number"] == 0
        assert ep["done"] is False

    def test_reset_hint_on_first_step(self):
        obs = env.reset("task1")
        assert obs.hint is not None


class TestStep:
    """Test step execution."""

    def test_step_classify(self):
        obs = env.reset("task1")
        action = Action(action_type="classify", category="billing", priority="high")
        result = env.step(obs.episode_id, action)
        assert isinstance(result, StepResult)
        assert result.observation.step_number == 1
        assert result.reward.step_reward is not None

    def test_step_extract(self):
        obs = env.reset("task2")
        action = Action(
            action_type="extract",
            extracted_entities={"customer_name": "Alice"},
            required_actions=["issue_refund"],
        )
        result = env.step(obs.episode_id, action)
        assert result.observation.step_number == 1

    def test_step_respond(self):
        obs = env.reset("task3")
        action = Action(
            action_type="respond",
            response_text="Thank you for reaching out. We sincerely apologize for the inconvenience and will resolve this immediately.",
            resolution_steps=["verify_account", "issue_refund"],
        )
        result = env.step(obs.episode_id, action)
        assert result.observation.step_number == 1

    def test_step_submit_marks_done(self):
        obs = env.reset("task1")
        result = env.step(obs.episode_id, Action(action_type="submit"))
        assert result.done is True
        assert result.observation.available_actions == []

    def test_step_invalid_episode(self):
        with pytest.raises(KeyError):
            env.step("nonexistent-id", Action(action_type="submit"))

    def test_step_after_done_raises(self):
        obs = env.reset("task1")
        env.step(obs.episode_id, Action(action_type="submit"))
        with pytest.raises(ValueError):
            env.step(obs.episode_id, Action(action_type="submit"))

    def test_step_max_steps_ends_episode(self):
        obs = env.reset("task1")
        max_steps = obs.max_steps
        for i in range(max_steps):
            action = Action(action_type="classify", category="general", priority="low")
            result = env.step(obs.episode_id, action)
        assert result.done is True

    def test_thread_history_grows(self):
        obs = env.reset("task1")
        env.step(obs.episode_id, Action(action_type="classify", category="billing", priority="high"))
        result = env.step(obs.episode_id, Action(action_type="submit"))
        assert len(result.observation.thread_history) == 2


class TestState:
    """Test state retrieval."""

    def test_get_state_initial(self):
        obs = env.reset("task1")
        state = env.get_state(obs.episode_id)
        assert isinstance(state, State)
        assert state.episode_id == obs.episode_id
        assert state.task_id == "task1"
        assert state.step_number == 0
        assert state.done is False

    def test_get_state_invalid(self):
        with pytest.raises(KeyError):
            env.get_state("bad-id")

    def test_state_history_after_step(self):
        obs = env.reset("task2")
        env.step(obs.episode_id, Action(action_type="extract", extracted_entities={}, required_actions=[]))
        state = env.get_state(obs.episode_id)
        assert len(state.history) == 1
        assert state.history[0]["action_type"] == "extract"


class TestGraders:
    """Test grading for each task."""

    def test_grade_task1_perfect(self):
        """Correct category + priority on ticket 0 (billing/high)."""
        obs = env.reset("task1", ticket_index=0)
        env.step(obs.episode_id, Action(action_type="classify", category="billing", priority="high"))
        env.step(obs.episode_id, Action(action_type="submit"))
        score, breakdown, feedback = env.grade(obs.episode_id)
        assert score >= 0.9
        assert breakdown["category_correct"] == 0.50
        assert breakdown["priority_correct"] == 0.40

    def test_grade_task1_wrong_category(self):
        obs = env.reset("task1", ticket_index=0)
        env.step(obs.episode_id, Action(action_type="classify", category="technical", priority="high"))
        env.step(obs.episode_id, Action(action_type="submit"))
        score, breakdown, _ = env.grade(obs.episode_id)
        assert breakdown["category_correct"] == 0.0
        assert breakdown["priority_correct"] == 0.40

    def test_grade_task1_no_classify_action(self):
        obs = env.reset("task1")
        env.step(obs.episode_id, Action(action_type="submit"))
        score, _, _ = env.grade(obs.episode_id)
        assert score == 0.0

    def test_grade_task2_entities(self):
        obs = env.reset("task2", ticket_index=0)
        env.step(obs.episode_id, Action(
            action_type="extract",
            extracted_entities={
                "customer_name": "Robert Chen",
                "account_id": "ACC-78234",
                "invoice_number": "INV-20240312",
                "incorrect_amount": "199.00",
                "correct_amount": "99.00",
                "refund_amount": "100.00",
            },
            required_actions=["issue_refund", "send_corrected_invoice"],
        ))
        env.step(obs.episode_id, Action(action_type="submit"))
        score, breakdown, _ = env.grade(obs.episode_id)
        assert breakdown["entity_coverage"] == pytest.approx(0.60, abs=0.01)
        assert breakdown["action_coverage"] == pytest.approx(0.30, abs=0.01)

    def test_grade_task3_keywords_and_steps(self):
        obs = env.reset("task3", ticket_index=0)
        env.step(obs.episode_id, Action(
            action_type="respond",
            response_text=(
                "We sincerely apologize for the password reset issue. "
                "We will send a new reset email and ask you to check your spam folder "
                "and whitelist our domain. We will have this resolved within the hour."
            ),
            resolution_steps=[
                "verify_email_delivery",
                "check_spam_filters",
                "manual_password_reset",
                "follow_up_confirmation",
            ],
        ))
        env.step(obs.episode_id, Action(action_type="submit"))
        score, breakdown, _ = env.grade(obs.episode_id)
        assert score >= 0.7
        assert breakdown["length_adequate"] == 0.10
        assert breakdown["no_empty_steps"] == 0.05

    def test_grade_not_done_raises(self):
        obs = env.reset("task1")
        with pytest.raises(ValueError):
            env.grade(obs.episode_id)

    def test_grade_invalid_episode_raises(self):
        with pytest.raises(KeyError):
            env.grade("bad-id")

    def test_score_in_range(self):
        for task_id in ["task1", "task2", "task3"]:
            obs = env.reset(task_id)
            env.step(obs.episode_id, Action(action_type="submit"))
            score, _, _ = env.grade(obs.episode_id)
            assert 0.0 <= score <= 1.0


class TestRewards:
    """Test reward signals."""

    def test_step_reward_is_float(self):
        obs = env.reset("task1")
        result = env.step(obs.episode_id, Action(action_type="classify", category="billing", priority="high"))
        assert isinstance(result.reward.step_reward, float)

    def test_total_reward_accumulates(self):
        obs = env.reset("task2")
        r1 = env.step(obs.episode_id, Action(action_type="extract", extracted_entities={}, required_actions=[]))
        r2 = env.step(obs.episode_id, Action(action_type="submit"))
        assert r2.reward.total_reward != r1.reward.total_reward

    def test_submit_bonus_applied(self):
        obs = env.reset("task1")
        result = env.step(obs.episode_id, Action(action_type="submit"))
        # submit_bonus=0.05 minus step_cost=0.02 = +0.03 base before grader
        assert result.reward.step_reward > 0.0


@pytest.fixture(autouse=True)
def cleanup():
    yield
    env._EPISODES.clear()
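As a small sanity check of the step-reward arithmetic referenced in test_submit_bonus_applied, here is a minimal sketch of the bookkeeping that comment implies. The constant names are taken from the comment and this is only an illustration, not the environment's actual implementation.

STEP_COST = 0.02     # charged on every step, per the test comment
SUBMIT_BONUS = 0.05  # granted when the agent submits

def base_step_reward(is_submit: bool) -> float:
    """Step reward before any grader contribution is added."""
    return (SUBMIT_BONUS if is_submit else 0.0) - STEP_COST

# Matches the +0.03 base reward asserted for a bare submit,
# and a small negative cost for every other step.
assert abs(base_step_reward(is_submit=True) - 0.03) < 1e-9
assert abs(base_step_reward(is_submit=False) + 0.02) < 1e-9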