Shivoo29 committed
Commit ceec48c · 1 Parent(s): 5b64237
Files changed (16)
  1. .claude/settings.json +19 -0
  2. Dockerfile +2 -21
  3. Pre_Validation_Script.sh +185 -0
  4. README.md +103 -139
  5. Sample_Inference_Script.py +187 -0
  6. app.py +21 -25
  7. baseline.py +0 -309
  8. data.py +5 -82
  9. environment.py +149 -347
  10. graders.py +225 -172
  11. inference.py +262 -323
  12. models.py +54 -28
  13. openenv.yaml +0 -8
  14. requirements.txt +0 -1
  15. test_integration.py +92 -96
  16. tests_new.py +175 -164
.claude/settings.json ADDED
@@ -0,0 +1,19 @@
+ {
+   "permissions": {
+     "allow": [
+       "Bash(python -m pytest tests_new.py -v)",
+       "Bash(pip install:*)",
+       "Bash(pytest tests_new.py -v)",
+       "Bash(/var/data/python/bin/pytest tests_new.py -v)",
+       "Bash(python test_integration.py)",
+       "Bash(docker build:*)",
+       "Bash(/var/data/python/bin/pytest tests_new.py -v --tb=short)",
+       "WebFetch(domain:exploring-solver-openenv-solvor.hf.space)",
+       "Bash(curl -s -o /tmp/reset_empty.json -w 'HTTP %{http_code}' -X POST -H 'Content-Type: application/json' -d '{}' https://exploring-solver-openenv-solvor.hf.space/reset --max-time 30)",
+       "Read(//tmp/**)",
+       "Bash(curl -s -X POST -H 'Content-Type: application/json' -d '{\"task_id\":\"task1\"}' https://exploring-solver-openenv-solvor.hf.space/reset --max-time 30)",
+       "Bash(python3 -m json.tool)",
+       "Bash(python3 test_integration.py)"
+     ]
+   }
+ }
Dockerfile CHANGED
@@ -1,42 +1,23 @@
  # ---------------------------------------------------------------
- # DevOpsEnv — Hugging Face Spaces Docker container
  # Space SDK: Docker | Port: 7860
  # ---------------------------------------------------------------
  FROM python:3.11-slim

- # Install system utilities for DevOps tasks
- RUN apt-get update && apt-get install -y --no-install-recommends \
-     nginx \
-     docker.io \
-     systemctl \
-     curl \
-     git \
-     vim \
-     && rm -rf /var/lib/apt/lists/*
-
- # Create non-root user for Hugging Face Spaces
  RUN useradd -m -u 1000 appuser

  WORKDIR /app

- # Install Python dependencies first (layer caching)
  COPY requirements.txt .
  RUN pip install --no-cache-dir -r requirements.txt

- # Copy application code
  COPY --chown=appuser:appuser . .

- # HF Spaces compatibility
- RUN chmod +x /app/app.py 2>/dev/null || true
-
  USER appuser

- # Expose the port HF Spaces expects
  EXPOSE 7860

- # Health check
  HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
      CMD python -c "import requests; requests.get('http://localhost:7860/health')" || exit 1

- # Start the FastAPI server
- CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1", "--reload"]

  # ---------------------------------------------------------------
+ # SupportEnv — Hugging Face Spaces Docker container
  # Space SDK: Docker | Port: 7860
  # ---------------------------------------------------------------
  FROM python:3.11-slim

  RUN useradd -m -u 1000 appuser

  WORKDIR /app

  COPY requirements.txt .
  RUN pip install --no-cache-dir -r requirements.txt

  COPY --chown=appuser:appuser . .

  USER appuser

  EXPOSE 7860

  HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
      CMD python -c "import requests; requests.get('http://localhost:7860/health')" || exit 1

+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]
 
Pre_Validation_Script.sh ADDED
@@ -0,0 +1,185 @@
+ #!/usr/bin/env bash
+ #
+ # validate-submission.sh — OpenEnv Submission Validator
+ #
+ # Checks that your HF Space is live, Docker image builds, and openenv validate passes.
+ #
+ # Prerequisites:
+ #   - Docker: https://docs.docker.com/get-docker/
+ #   - openenv-core: pip install openenv-core
+ #   - curl (usually pre-installed)
+ #
+ # Run:
+ #   curl -fsSL https://raw.githubusercontent.com/<owner>/<repo>/main/scripts/validate-submission.sh | bash -s -- <ping_url> [repo_dir]
+ #
+ # Or download and run locally:
+ #   chmod +x validate-submission.sh
+ #   ./validate-submission.sh <ping_url> [repo_dir]
+ #
+ # Arguments:
+ #   ping_url   Your HuggingFace Space URL (e.g. https://your-space.hf.space)
+ #   repo_dir   Path to your repo (default: current directory)
+ #
+ # Examples:
+ #   ./validate-submission.sh https://my-team.hf.space
+ #   ./validate-submission.sh https://my-team.hf.space ./my-repo
+ #
+
+ set -uo pipefail
+
+ DOCKER_BUILD_TIMEOUT=600
+ if [ -t 1 ]; then
+     RED='\033[0;31m'
+     GREEN='\033[0;32m'
+     YELLOW='\033[1;33m'
+     BOLD='\033[1m'
+     NC='\033[0m'
+ else
+     RED='' GREEN='' YELLOW='' BOLD='' NC=''
+ fi
+
+ run_with_timeout() {
+     local secs="$1"; shift
+     if command -v timeout &>/dev/null; then
+         timeout "$secs" "$@"
+     elif command -v gtimeout &>/dev/null; then
+         gtimeout "$secs" "$@"
+     else
+         "$@" &
+         local pid=$!
+         ( sleep "$secs" && kill "$pid" 2>/dev/null ) &
+         local watcher=$!
+         wait "$pid" 2>/dev/null
+         local rc=$?
+         kill "$watcher" 2>/dev/null
+         wait "$watcher" 2>/dev/null
+         return $rc
+     fi
+ }
+
+ portable_mktemp() {
+     local prefix="${1:-validate}"
+     mktemp "${TMPDIR:-/tmp}/${prefix}-XXXXXX" 2>/dev/null || mktemp
+ }
+
+ CLEANUP_FILES=()
+ cleanup() { rm -f "${CLEANUP_FILES[@]+"${CLEANUP_FILES[@]}"}"; }
+ trap cleanup EXIT
+
+ PING_URL="${1:-}"
+ REPO_DIR="${2:-.}"
+
+ if [ -z "$PING_URL" ]; then
+     printf "Usage: %s <ping_url> [repo_dir]\n" "$0"
+     printf "\n"
+     printf "  ping_url   Your HuggingFace Space URL (e.g. https://your-space.hf.space)\n"
+     printf "  repo_dir   Path to your repo (default: current directory)\n"
+     exit 1
+ fi
+
+ if ! REPO_DIR="$(cd "$REPO_DIR" 2>/dev/null && pwd)"; then
+     printf "Error: directory '%s' not found\n" "${2:-.}"
+     exit 1
+ fi
+ PING_URL="${PING_URL%/}"
+ export PING_URL
+ PASS=0
+
+ log() { printf "[%s] %b\n" "$(date -u +%H:%M:%S)" "$*"; }
+ pass() { log "${GREEN}PASSED${NC} -- $1"; PASS=$((PASS + 1)); }
+ fail() { log "${RED}FAILED${NC} -- $1"; }
+ hint() { printf "  ${YELLOW}Hint:${NC} %b\n" "$1"; }
+ stop_at() {
+     printf "\n"
+     printf "${RED}${BOLD}Validation stopped at %s.${NC} Fix the above before continuing.\n" "$1"
+     exit 1
+ }
+
+ printf "\n"
+ printf "${BOLD}========================================${NC}\n"
+ printf "${BOLD}  OpenEnv Submission Validator${NC}\n"
+ printf "${BOLD}========================================${NC}\n"
+ log "Repo: $REPO_DIR"
+ log "Ping URL: $PING_URL"
+ printf "\n"
+
+ log "${BOLD}Step 1/3: Pinging HF Space${NC} ($PING_URL/reset) ..."
+
+ CURL_OUTPUT=$(portable_mktemp "validate-curl")
+ CLEANUP_FILES+=("$CURL_OUTPUT")
+ HTTP_CODE=$(curl -s -o "$CURL_OUTPUT" -w "%{http_code}" -X POST \
+     -H "Content-Type: application/json" -d '{}' \
+     "$PING_URL/reset" --max-time 30 2>/dev/null) || HTTP_CODE="000"
+
+ if [ "$HTTP_CODE" = "200" ]; then
+     pass "HF Space is live and responds to /reset"
+ elif [ "$HTTP_CODE" = "000" ]; then
+     fail "HF Space not reachable (connection failed or timed out)"
+     hint "Check your network connection and that the Space is running."
+     hint "Try: curl -s -o /dev/null -w '%{http_code}' -X POST $PING_URL/reset"
+     stop_at "Step 1"
+ else
+     fail "HF Space /reset returned HTTP $HTTP_CODE (expected 200)"
+     hint "Make sure your Space is running and the URL is correct."
+     hint "Try opening $PING_URL in your browser first."
+     stop_at "Step 1"
+ fi
+
+ log "${BOLD}Step 2/3: Running docker build${NC} ..."
+
+ if ! command -v docker &>/dev/null; then
+     fail "docker command not found"
+     hint "Install Docker: https://docs.docker.com/get-docker/"
+     stop_at "Step 2"
+ fi
+
+ if [ -f "$REPO_DIR/Dockerfile" ]; then
+     DOCKER_CONTEXT="$REPO_DIR"
+ elif [ -f "$REPO_DIR/server/Dockerfile" ]; then
+     DOCKER_CONTEXT="$REPO_DIR/server"
+ else
+     fail "No Dockerfile found in repo root or server/ directory"
+     stop_at "Step 2"
+ fi
+
+ log "  Found Dockerfile in $DOCKER_CONTEXT"
+
+ BUILD_OK=false
+ BUILD_OUTPUT=$(run_with_timeout "$DOCKER_BUILD_TIMEOUT" docker build "$DOCKER_CONTEXT" 2>&1) && BUILD_OK=true
+
+ if [ "$BUILD_OK" = true ]; then
+     pass "Docker build succeeded"
+ else
+     fail "Docker build failed (timeout=${DOCKER_BUILD_TIMEOUT}s)"
+     printf "%s\n" "$BUILD_OUTPUT" | tail -20
+     stop_at "Step 2"
+ fi
+
+ log "${BOLD}Step 3/3: Running openenv validate${NC} ..."
+
+ if ! command -v openenv &>/dev/null; then
+     fail "openenv command not found"
+     hint "Install it: pip install openenv-core"
+     stop_at "Step 3"
+ fi
+
+ VALIDATE_OK=false
+ VALIDATE_OUTPUT=$(cd "$REPO_DIR" && openenv validate 2>&1) && VALIDATE_OK=true
+
+ if [ "$VALIDATE_OK" = true ]; then
+     pass "openenv validate passed"
+     [ -n "$VALIDATE_OUTPUT" ] && log "  $VALIDATE_OUTPUT"
+ else
+     fail "openenv validate failed"
+     printf "%s\n" "$VALIDATE_OUTPUT"
+     stop_at "Step 3"
+ fi
+
+ printf "\n"
+ printf "${BOLD}========================================${NC}\n"
+ printf "${GREEN}${BOLD}  All 3/3 checks passed!${NC}\n"
+ printf "${GREEN}${BOLD}  Your submission is ready to submit.${NC}\n"
+ printf "${BOLD}========================================${NC}\n"
+ printf "\n"
+
+ exit 0
README.md CHANGED
@@ -1,161 +1,125 @@
- ---
- title: DevOpsEnv
- emoji: 🛠️
  colorFrom: blue
- colorTo: green
  sdk: docker
  app_port: 7860
  tags:
  - openenv
- - devops
- - sre
- - troubleshooting
  - agent-evaluation
  pinned: false
  ---

- # DevOpsEnv
-
- DevOpsEnv is a practice environment where an agent acts like a junior SRE.
-
- In each episode, the agent gets a broken Linux-like system and must fix it by:
- - Running shell commands
- - Editing files
- - Submitting when the fix is done
-
- The server gives rewards during the episode and a final score at the end.
-
- ## What It Simulates (Simple)
-
- There are 3 tasks:
- - Task 1: Nginx is down. Bring service back and verify HTTP is OK.
- - Task 2: Docker compose port mapping is wrong. Fix and redeploy.
- - Task 3: Python API has memory leak behavior. Diagnose and reduce memory usage.
-
- ## How It Works
-
- Step by step:
- 1. Call POST /reset with task_id.
- 2. You get episode_id plus current system_state.
- 3. Call POST /step with an action.
- 4. Repeat steps until done, or send action_type submit.
- 5. Call POST /grader to get final score and breakdown.
-
- Main endpoints:
- - GET /health
- - GET /tasks
- - POST /reset
- - POST /step
- - GET /state
- - POST /grader
-
- ## Action Types
-
- - bash_cmd: Run a command like systemctl status nginx
- - file_edit: Replace content of a file path
- - submit: End episode and grade
-
- ## Quick Start (Normal)
-
- ### 1) Install
-
- Windows PowerShell:
-
- python -m pip install -r requirements.txt
-
- ### 2) Start server
-
- python -m uvicorn app:app --host 0.0.0.0 --port 7860
-
- ### 3) Check health
-
- In another terminal:
-
- Invoke-WebRequest -Uri "http://127.0.0.1:7860/health" -UseBasicParsing
-
- If working, response includes status: healthy.
-
- ### 4) Run built-in integration test
-
- python test_integration.py
-
- If working, you should see all 3 tasks run and a final success message.
-
- ## Minimal API Example (Normal)
-
- PowerShell example:
-
- $reset = Invoke-WebRequest -Uri "http://127.0.0.1:7860/reset" -Method POST -ContentType "application/json" -Body '{"task_id":"task1"}' | Select-Object -ExpandProperty Content | ConvertFrom-Json
- $episodeId = $reset.episode_id
-
- $step = @{
-     episode_id = $episodeId
-     action = @{
-         action_type = "bash_cmd"
-         command = "systemctl restart nginx"
      }
- } | ConvertTo-Json -Depth 5
-
- Invoke-WebRequest -Uri "http://127.0.0.1:7860/step" -Method POST -ContentType "application/json" -Body $step
-
- ## Test With LLM (OpenAI Key)
-
- 1) Keep API server running.
- 2) Set key and run inference:

- PowerShell:

- $env:OPENAI_API_KEY = "your-openai-key"
- python inference.py --task task1 --model gpt-4o-mini

- You should see step logs, rewards, and a grader score.

- ## Test With Gemini API Key

- inference.py now supports OpenAI-compatible base URLs.

- Use Gemini via OpenAI-compatible endpoint:

- PowerShell:

- $env:GEMINI_API_KEY = "your-gemini-key"
- $env:OPENAI_BASE_URL = "https://generativelanguage.googleapis.com/v1beta/openai/"
- python inference.py --task task1 --model gemini-2.5-flash

- Notes:
- - You can also use OPENAI_API_KEY instead of GEMINI_API_KEY.
- - If your model name is unavailable, switch to a Gemini model enabled on your key.
- - Keep the environment server running at http://127.0.0.1:7860 (or pass --api-url).

  ## Docker

- Build:
-
- docker build -t devopsenv .
-
- Run:
-
- docker run -p 7860:7860 devopsenv
-
- Then open:
- - http://127.0.0.1:7860/health
- - http://127.0.0.1:7860/docs
-
- ## Project Files
-
- - app.py: FastAPI API
- - environment.py: episode logic and simulator
- - graders.py: deterministic scoring
- - data.py: task metadata
- - models.py: Pydantic schemas
- - inference.py: LLM baseline runner
- - test_integration.py: local end-to-end check
-
- ## Troubleshooting
-
- - Port already in use:
-   - change server port or stop old process.
- - 400/404 from API:
-   - check episode_id and task_id values.
- - LLM errors:
-   - verify API key, model name, and OPENAI_BASE_URL for Gemini.
 
+ ---
+ title: SupportEnv
+ emoji: 🎫
  colorFrom: blue
+ colorTo: indigo
  sdk: docker
  app_port: 7860
  tags:
  - openenv
+ - customer-support
+ - nlp
+ - ticket-triage
  - agent-evaluation
  pinned: false
  ---

+ # SupportEnv
+
+ SupportEnv is an OpenEnv-compliant environment for evaluating LLM agents on customer support ticket triage. Each episode presents a realistic support ticket and asks the agent to classify, extract, or resolve it — scored deterministically against ground-truth labels.
+
+ ## Tasks
+
+ | Task | Difficulty | Action | Max Steps |
+ |------|-----------|--------|-----------|
+ | Task 1 — Ticket Classification | Easy | `classify` | 3 |
+ | Task 2 — Information Extraction | Medium | `extract` | 5 |
+ | Task 3 — Resolution Generation | Hard | `respond` | 8 |
+
+ **Task 1 — Ticket Classification (Easy)**
+ Assign a `category` (billing / technical / account / feature_request / complaint / general) and `priority` (low / medium / high / critical) to each ticket.
+
+ **Task 2 — Information Extraction (Medium)**
+ Extract structured entities (IDs, names, amounts, dates) and identify the list of required resolution actions.
+
+ **Task 3 — Resolution Generation (Hard)**
+ Write a professional customer-facing response and an ordered list of internal resolution steps. Graded on keyword coverage, step completeness, tone adherence, and minimum length.
+
+ ## API
+
+ | Method | Endpoint | Description |
+ |--------|----------|-------------|
+ | `POST` | `/reset` | Start a new episode |
+ | `POST` | `/step` | Submit an action |
+ | `GET` | `/state` | Get current episode state |
+ | `POST` | `/grader` | Grade a finished episode |
+ | `GET` | `/tasks` | List all tasks |
+ | `GET` | `/health` | Liveness check |
+ | `GET` | `/docs` | OpenAPI docs |
+
+ ### Reset
+ ```json
+ POST /reset
+ {"task_id": "task1", "ticket_index": 0}
+ ```
+
+ ### Step — Task 1 (classify)
+ ```json
+ POST /step
+ {
+   "episode_id": "<id>",
+   "action": {"action_type": "classify", "category": "billing", "priority": "high"}
+ }
+ ```
+
+ ### Step — Task 2 (extract)
+ ```json
+ POST /step
+ {
+   "episode_id": "<id>",
+   "action": {
+     "action_type": "extract",
+     "extracted_entities": {"customer_name": "Alice", "invoice_number": "INV-001"},
+     "required_actions": ["issue_refund", "send_corrected_invoice"]
  }
+ }
+ ```
+
+ ### Step — Task 3 (respond)
+ ```json
+ POST /step
+ {
+   "episode_id": "<id>",
+   "action": {
+     "action_type": "respond",
+     "response_text": "Dear customer, we sincerely apologize...",
+     "resolution_steps": ["verify_account", "issue_refund", "send_confirmation"]
+   }
+ }
+ ```

+ ### Submit
+ ```json
+ POST /step
+ {"episode_id": "<id>", "action": {"action_type": "submit"}}
+ ```
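A full episode is reset, one or more steps, submit, then grade. A minimal sketch of that loop, assuming a local server on port 7860 and the `requests` package (field names as in the examples above):

```python
import requests

BASE = "http://127.0.0.1:7860"

# Start a Task 1 episode on the first ticket.
reset = requests.post(f"{BASE}/reset", json={"task_id": "task1", "ticket_index": 0}).json()
episode_id = reset["episode_id"]

# Classify the ticket, then end the episode.
requests.post(f"{BASE}/step", json={
    "episode_id": episode_id,
    "action": {"action_type": "classify", "category": "billing", "priority": "high"},
})
requests.post(f"{BASE}/step", json={"episode_id": episode_id, "action": {"action_type": "submit"}})

# Grade the finished episode and print the breakdown.
print(requests.post(f"{BASE}/grader", json={"episode_id": episode_id}).json())
```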

+ ## Scoring
+
+ **Task 1:** category match (0.50) + priority match (0.40) + efficiency (0.10)
+
+ **Task 2:** entity coverage (0.60) + action coverage (0.30) + no hallucination (0.10)
+
+ **Task 3:** keyword coverage (0.30) + step coverage (0.30) + tone compliance (0.25) + length adequate (0.10) + non-empty steps (0.05)
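As a worked example of the Task 1 weights: a correct category with a wrong priority and full efficiency credit comes out at 0.50 + 0.00 + 0.10 = 0.60. A sketch of the arithmetic (illustrative only; the authoritative partial-credit rules live in `graders.py`):

```python
# Task 1 weights: category 0.50, priority 0.40, efficiency 0.10.
category_ok, priority_ok, efficiency = 1.0, 0.0, 1.0
score = 0.50 * category_ok + 0.40 * priority_ok + 0.10 * efficiency
print(score)  # 0.6
```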
 
+ ## Running Locally
+
+ ```bash
+ pip install -r requirements.txt
+ uvicorn app:app --host 0.0.0.0 --port 7860
+ ```
+
+ ## Running the Baseline Agent
+
+ ```bash
+ export HF_TOKEN=your_token_here
+ export MODEL_NAME=Qwen/Qwen2.5-72B-Instruct
+ python inference.py
+ ```

  ## Docker

+ ```bash
+ docker build -t supportenv .
+ docker run -p 7860:7860 supportenv
+ ```
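Once the container is up, a quick smoke test against the endpoints listed above (hypothetical local run):

```bash
curl http://127.0.0.1:7860/health
curl -X POST http://127.0.0.1:7860/reset \
  -H "Content-Type: application/json" \
  -d '{"task_id": "task1", "ticket_index": 0}'
```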
Sample_Inference_Script.py ADDED
@@ -0,0 +1,187 @@
+ """
+ Inference Script Example
+ ===================================
+ MANDATORY
+ - Before submitting, ensure the following variables are defined in your environment configuration:
+       API_BASE_URL      The API endpoint for the LLM.
+       MODEL_NAME        The model identifier to use for inference.
+       HF_TOKEN          Your Hugging Face / API key.
+       LOCAL_IMAGE_NAME  The name of the local image to use for the environment if you are using the
+                         from_docker_image() method.
+
+ - Defaults are set only for API_BASE_URL and MODEL_NAME
+   (and should reflect your active inference setup):
+       API_BASE_URL = os.getenv("API_BASE_URL", "<your-active-endpoint>")
+       MODEL_NAME = os.getenv("MODEL_NAME", "<your-active-model>")
+
+ - The inference script must be named `inference.py` and placed in the root directory of the project.
+ - Participants must use the OpenAI client for all LLM calls, using the variables above.
+
+ STDOUT FORMAT
+ - The script must emit exactly three line types to stdout, in this order:
+
+       [START] task=<task_name> env=<benchmark> model=<model_name>
+       [STEP] step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
+       [END] success=<true|false> steps=<n> rewards=<r1,r2,...,rn>
+
+ Rules:
+ - One [START] line at episode begin.
+ - One [STEP] line per step, immediately after env.step() returns.
+ - One [END] line after env.close(), always emitted (even on exception).
+ - reward and rewards are formatted to 2 decimal places.
+ - done and success are lowercase booleans: true or false.
+ - error is the raw last_action_error string, or null if none.
+ - All fields on a single line with no newlines within a line.
+
+ Example:
+     [START] task=click-test env=miniwob model=Qwen3-VL-30B
+     [STEP] step=1 action=click('123') reward=0.00 done=false error=null
+     [STEP] step=2 action=fill('456','text') reward=0.00 done=false error=null
+     [STEP] step=3 action=click('789') reward=1.00 done=true error=null
+     [END] success=true steps=3 rewards=0.00,0.00,1.00
+ """
+
+ import asyncio
+ import os
+ import textwrap
+ from typing import List, Optional
+
+ from openai import OpenAI
+
+ from my_env_v4 import MyEnvV4Action, MyEnvV4Env
+
+ IMAGE_NAME = os.getenv("IMAGE_NAME")  # If you are using a docker image
+ API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
+
+ API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
+ MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct"
+ TASK_NAME = os.getenv("MY_ENV_V4_TASK", "echo")
+ BENCHMARK = os.getenv("MY_ENV_V4_BENCHMARK", "my_env_v4")
+ MAX_STEPS = 8
+ TEMPERATURE = 0.7
+ MAX_TOKENS = 150
+ SUCCESS_SCORE_THRESHOLD = 0.1  # normalized score in [0, 1]
+
+ # Max possible reward: each token contributes 0.1, across all steps
+ _MAX_REWARD_PER_STEP = MAX_TOKENS * 0.1
+ MAX_TOTAL_REWARD = MAX_STEPS * _MAX_REWARD_PER_STEP
+
+ SYSTEM_PROMPT = textwrap.dedent(
+     """
+     You are interacting with a simple echo environment.
+     Each turn you must send a message. The environment will echo it back.
+     Reward is proportional to message length: reward = len(message) * 0.1
+     Your goal is to maximize total reward by sending meaningful, substantive messages.
+     Reply with exactly one message string — no quotes, no prefixes, just the message text.
+     """
+ ).strip()
+
+
+ def log_start(task: str, env: str, model: str) -> None:
+     print(f"[START] task={task} env={env} model={model}", flush=True)
+
+
+ def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
+     error_val = error if error else "null"
+     done_val = str(done).lower()
+     print(
+         f"[STEP] step={step} action={action} reward={reward:.2f} done={done_val} error={error_val}",
+         flush=True,
+     )
+
+
+ def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
+     rewards_str = ",".join(f"{r:.2f}" for r in rewards)
+     print(f"[END] success={str(success).lower()} steps={steps} score={score:.3f} rewards={rewards_str}", flush=True)
+
+
+ def build_user_prompt(step: int, last_echoed: str, last_reward: float, history: List[str]) -> str:
+     history_block = "\n".join(history[-4:]) if history else "None"
+     return textwrap.dedent(
+         f"""
+         Step: {step}
+         Last echoed message: {last_echoed!r}
+         Last reward: {last_reward:.2f}
+         Previous steps:
+         {history_block}
+         Send your next message.
+         """
+     ).strip()
+
+
+ def get_model_message(client: OpenAI, step: int, last_echoed: str, last_reward: float, history: List[str]) -> str:
+     user_prompt = build_user_prompt(step, last_echoed, last_reward, history)
+     try:
+         completion = client.chat.completions.create(
+             model=MODEL_NAME,
+             messages=[
+                 {"role": "system", "content": SYSTEM_PROMPT},
+                 {"role": "user", "content": user_prompt},
+             ],
+             temperature=TEMPERATURE,
+             max_tokens=MAX_TOKENS,
+             stream=False,
+         )
+         text = (completion.choices[0].message.content or "").strip()
+         return text if text else "hello"
+     except Exception as exc:
+         print(f"[DEBUG] Model request failed: {exc}", flush=True)
+         return "hello"
+
+
+ async def main() -> None:
+     client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
+
+     env = await MyEnvV4Env.from_docker_image(IMAGE_NAME)
+
+     history: List[str] = []
+     rewards: List[float] = []
+     steps_taken = 0
+     score = 0.0
+     success = False
+
+     log_start(task=TASK_NAME, env=BENCHMARK, model=MODEL_NAME)
+
+     try:
+         result = await env.reset()  # OpenEnv.reset()
+         last_echoed = result.observation.echoed_message
+         last_reward = 0.0
+
+         for step in range(1, MAX_STEPS + 1):
+             if result.done:
+                 break
+
+             message = get_model_message(client, step, last_echoed, last_reward, history)
+
+             result = await env.step(MyEnvV4Action(message=message))
+             obs = result.observation
+
+             reward = result.reward or 0.0
+             done = result.done
+             error = None
+
+             rewards.append(reward)
+             steps_taken = step
+             last_echoed = obs.echoed_message
+             last_reward = reward
+
+             log_step(step=step, action=message, reward=reward, done=done, error=error)
+
+             history.append(f"Step {step}: {message!r} -> reward {reward:+.2f}")
+
+             if done:
+                 break
+
+         score = sum(rewards) / MAX_TOTAL_REWARD if MAX_TOTAL_REWARD > 0 else 0.0
+         score = min(max(score, 0.0), 1.0)  # clamp to [0, 1]
+         success = score >= SUCCESS_SCORE_THRESHOLD
+
+     finally:
+         try:
+             await env.close()
+         except Exception as e:
+             print(f"[DEBUG] env.close() error (container cleanup): {e}", flush=True)
+         log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
+
+
+ if __name__ == "__main__":
+     asyncio.run(main())
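A sketch of how this sample would be invoked, assuming a locally built environment image and the variables read at the top of the script (the token and image tag below are placeholders):

```bash
export HF_TOKEN=hf_xxx              # picked up as API_KEY
export IMAGE_NAME=my-env-v4:latest  # passed to MyEnvV4Env.from_docker_image()
python Sample_Inference_Script.py
```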
app.py CHANGED
@@ -1,23 +1,20 @@
  """
- FastAPI server for DevOpsEnv - Linux DevOps/SRE troubleshooting environment.

  Endpoints:
- ---------
- POST /reset    Create a new episode
- POST /step     Advance the episode
- GET  /state    Current episode state
- GET  /tasks    List tasks and action schema
- POST /grader   Grade a finished episode
- GET  /health   Liveness check
- GET  /         Info / spec link
  """
  from __future__ import annotations

  import os
- import uuid
- import json
- from typing import Any, Dict, List, Optional
  from datetime import datetime

  from fastapi import FastAPI, HTTPException, Query
  from fastapi.middleware.cors import CORSMiddleware
@@ -27,17 +24,13 @@ import environment as env
  from data import TASK_META
  from models import (
      Action,
-     Observation,
-     State,
-     StepResult,
-     TaskInfo,
-     Reward,
      GraderResponse,
  )

  app = FastAPI(
-     title="DevOpsEnv",
-     description="An OpenEnv-compliant Linux DevOps/SRE troubleshooting environment.",
      version="1.0.0",
      docs_url="/docs",
      redoc_url="/redoc",
@@ -51,8 +44,13 @@ app.add_middleware(
  )


  class ResetRequest(BaseModel):
-     task_id: str


  class StepRequest(BaseModel):
@@ -71,10 +69,9 @@ class GraderRequest(BaseModel):
  @app.get("/", tags=["meta"])
  def root():
      return {
-         "name": "DevOpsEnv",
          "version": "1.0.0",
-         "description": "OpenEnv DevOps/SRE troubleshooting environment",
-         "openenv_spec": "https://github.com/meta-pytorch/OpenEnv",
          "tasks": list(TASK_META.keys()),
          "endpoints": {
              "reset": "POST /reset",
@@ -112,7 +109,7 @@ def tasks():
  @app.post("/reset", tags=["control"])
  def reset(req: ResetRequest):
      try:
-         obs = env.reset(req.task_id)
          return obs.model_dump()
      except ValueError as e:
          raise HTTPException(status_code=400, detail=str(e))
@@ -155,4 +152,3 @@ if __name__ == "__main__":
      import uvicorn
      port = int(os.environ.get("PORT", 7860))
      uvicorn.run(app, host="0.0.0.0", port=port, workers=1)
-
 
  """
+ FastAPI server for SupportEnv Customer Support Ticket Triage.

  Endpoints:
+ POST /reset    Create a new episode
+ POST /step     Advance the episode
+ GET  /state    Current episode state
+ GET  /tasks    List tasks and action schema
+ POST /grader   Grade a finished episode
+ GET  /health   Liveness check
+ GET  /         Info / spec link
  """
  from __future__ import annotations

  import os
  from datetime import datetime
+ from typing import Optional

  from fastapi import FastAPI, HTTPException, Query
  from fastapi.middleware.cors import CORSMiddleware

  from data import TASK_META
  from models import (
      Action,
      GraderResponse,
+     TaskInfo,
  )

  app = FastAPI(
+     title="SupportEnv",
+     description="An OpenEnv-compliant customer support ticket triage environment.",
      version="1.0.0",
      docs_url="/docs",
      redoc_url="/redoc",

  )


+ # ---------------------------------------------------------------------------
+ # Request schemas
+ # ---------------------------------------------------------------------------
+
  class ResetRequest(BaseModel):
+     task_id: str = "task1"
+     ticket_index: Optional[int] = 0


  class StepRequest(BaseModel):

  @app.get("/", tags=["meta"])
  def root():
      return {
+         "name": "SupportEnv",
          "version": "1.0.0",
+         "description": "OpenEnv customer support ticket triage environment",
          "tasks": list(TASK_META.keys()),
          "endpoints": {
              "reset": "POST /reset",

  @app.post("/reset", tags=["control"])
  def reset(req: ResetRequest):
      try:
+         obs = env.reset(req.task_id, ticket_index=req.ticket_index or 0)
          return obs.model_dump()
      except ValueError as e:
          raise HTTPException(status_code=400, detail=str(e))

      import uvicorn
      port = int(os.environ.get("PORT", 7860))
      uvicorn.run(app, host="0.0.0.0", port=port, workers=1)
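With the new `ResetRequest` defaults, both fields are optional and `/reset` can target a specific ticket. A hypothetical call against a local server:

```bash
curl -X POST http://127.0.0.1:7860/reset \
  -H "Content-Type: application/json" \
  -d '{"task_id": "task2", "ticket_index": 1}'
```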
 
baseline.py DELETED
@@ -1,309 +0,0 @@
- """
- SupportEnv — FastAPI server
-
- Endpoints
- ---------
- POST /reset      Create a new episode
- POST /step       Advance the episode
- GET  /state      Current episode state
- GET  /tasks      List tasks and action schema
- POST /grader     Grade a finished episode
- POST /baseline   Run the built-in baseline agent on all tasks
- GET  /health     Liveness check
- GET  /           Info / spec link
- """
- from __future__ import annotations
-
- import os
- import subprocess
- import sys
- import tempfile
- from typing import Any, Dict, List, Optional
-
- from fastapi import FastAPI, HTTPException, Query
- from fastapi.middleware.cors import CORSMiddleware
- from pydantic import BaseModel
-
- import environment as env
- from data import TASK_META
- from models import (
-     Action,
-     BaselineResult,
-     GraderResponse,
-     Observation,
-     State,
-     StepResult,
-     TaskInfo,
- )
-
- app = FastAPI(
-     title="SupportEnv",
-     description=(
-         "An OpenEnv-compliant customer-support triage environment. "
-         "Agents learn to classify, extract information from, and resolve "
-         "real-world SaaS support tickets."
-     ),
-     version="1.0.0",
-     docs_url="/docs",
-     redoc_url="/redoc",
- )
-
- app.add_middleware(
-     CORSMiddleware,
-     allow_origins=["*"],
-     allow_methods=["*"],
-     allow_headers=["*"],
- )
-
-
- # ---------------------------------------------------------------------------
- # Request / response shapes for endpoints not covered by models.py
- # ---------------------------------------------------------------------------
-
- class ResetRequest(BaseModel):
-     task_id: str
-     ticket_index: Optional[int] = None
-
-
- class StepRequest(BaseModel):
-     episode_id: str
-     action: Action
-
-
- class GraderRequest(BaseModel):
-     episode_id: str
-
-
- # ---------------------------------------------------------------------------
- # Endpoints
- # ---------------------------------------------------------------------------
-
- @app.get("/", tags=["meta"])
- def root():
-     return {
-         "name": "SupportEnv",
-         "version": "1.0.0",
-         "description": "OpenEnv customer-support ticket triage environment",
-         "openenv_spec": "https://github.com/openenv/openenv",
-         "tasks": list(TASK_META.keys()),
-         "endpoints": {
-             "reset": "POST /reset",
-             "step": "POST /step",
-             "state": "GET /state?episode_id=...",
-             "tasks": "GET /tasks",
-             "grader": "POST /grader",
-             "baseline": "POST /baseline",
-             "health": "GET /health",
-             "docs": "GET /docs",
-         },
-     }
-
-
- @app.get("/health", tags=["meta"])
- def health():
-     return {"status": "ok"}
-
-
- # ---------------------------------------------------------------------------
- # Core OpenEnv endpoints
- # ---------------------------------------------------------------------------
-
- @app.post("/reset", response_model=Observation, tags=["openenv"])
- def reset(request: ResetRequest) -> Observation:
-     """
-     Start a new episode.
-
-     - **task_id**: `task1` | `task2` | `task3`
-     - **ticket_index**: 0-indexed ticket to use (optional; default 0)
-     """
-     try:
-         return env.reset(request.task_id, request.ticket_index)
-     except ValueError as e:
-         raise HTTPException(status_code=400, detail=str(e))
-
-
- @app.post("/step", response_model=StepResult, tags=["openenv"])
- def step(request: StepRequest) -> StepResult:
-     """
-     Submit an action and advance the episode.
-
-     The `action` object must include `action_type` and the fields relevant
-     to that action type (see GET /tasks for the schema).
-     """
-     try:
-         return env.step(request.episode_id, request.action)
-     except KeyError:
-         raise HTTPException(
-             status_code=404,
-             detail=f"Episode '{request.episode_id}' not found. Call POST /reset first.",
-         )
-     except ValueError as e:
-         raise HTTPException(status_code=400, detail=str(e))
-
-
- @app.get("/state", response_model=State, tags=["openenv"])
- def state(episode_id: str = Query(..., description="Episode UUID from POST /reset")) -> State:
-     """Return the current state of an episode."""
-     try:
-         return env.state(episode_id)
-     except KeyError:
-         raise HTTPException(
-             status_code=404,
-             detail=f"Episode '{episode_id}' not found.",
-         )
-
-
- # ---------------------------------------------------------------------------
- # /tasks — task listing + action schema
- # ---------------------------------------------------------------------------
-
- # JSON Schema for the Action model (subset used in each task)
- _BASE_ACTION_SCHEMA = {
-     "type": "object",
-     "required": ["action_type"],
-     "properties": {
-         "action_type": {
-             "type": "string",
-             "description": "One of the available_actions listed in the Observation",
-         },
-     },
- }
-
- _ACTION_SCHEMAS: Dict[str, Dict[str, Any]] = {
-     "task1": {
-         **_BASE_ACTION_SCHEMA,
-         "description": "classify action: set category + priority; then submit",
-         "properties": {
-             **_BASE_ACTION_SCHEMA["properties"],
-             "category": {
-                 "type": "string",
-                 "enum": [
-                     "billing", "technical", "account",
-                     "feature_request", "complaint", "general",
-                 ],
-             },
-             "priority": {
-                 "type": "string",
-                 "enum": ["low", "medium", "high", "critical"],
-             },
-         },
-     },
-     "task2": {
-         **_BASE_ACTION_SCHEMA,
-         "description": "extract action: populate extracted_entities + required_actions; then submit",
-         "properties": {
-             **_BASE_ACTION_SCHEMA["properties"],
-             "extracted_entities": {
-                 "type": "object",
-                 "additionalProperties": True,
-                 "description": "Key-value pairs extracted from the ticket text",
-             },
-             "required_actions": {
-                 "type": "array",
-                 "items": {"type": "string"},
-                 "description": "List of action identifiers (snake_case) needed to close the ticket",
-             },
-         },
-     },
-     "task3": {
-         **_BASE_ACTION_SCHEMA,
-         "description": (
-             "respond or resolve action: write response_text + resolution_steps; "
-             "optionally escalate; then submit"
-         ),
-         "properties": {
-             **_BASE_ACTION_SCHEMA["properties"],
-             "response_text": {
-                 "type": "string",
-                 "description": "Full professional response to send to the customer",
-             },
-             "resolution_steps": {
-                 "type": "array",
-                 "items": {"type": "string"},
-                 "description": "Ordered steps for support staff to resolve the ticket",
-             },
-             "escalation_team": {
-                 "type": "string",
-                 "enum": ["billing_team", "engineering", "account_management", "legal"],
-             },
-             "escalation_reason": {"type": "string"},
-         },
-     },
- }
-
-
- @app.get("/tasks", response_model=List[TaskInfo], tags=["openenv"])
- def list_tasks() -> List[TaskInfo]:
-     """Return metadata and action schema for all tasks."""
-     result = []
-     for task_id, meta in TASK_META.items():
-         result.append(
-             TaskInfo(
-                 task_id=task_id,
-                 name=meta["name"],
-                 description=meta["description"],
-                 difficulty=meta["difficulty"],
-                 max_steps=meta["max_steps"],
-                 action_schema=_ACTION_SCHEMAS[task_id],
-             )
-         )
-     return result
-
-
- # ---------------------------------------------------------------------------
- # /grader — grade a finished episode
- # ---------------------------------------------------------------------------
-
- @app.post("/grader", response_model=GraderResponse, tags=["openenv"])
- def grader(request: GraderRequest) -> GraderResponse:
-     """
-     Grade a finished episode.
-
-     The episode must have reached `done=True` (either via a `submit` action
-     or by exhausting `max_steps`).
-     """
-     try:
-         return env.grade(request.episode_id)
-     except KeyError:
-         raise HTTPException(
-             status_code=404,
-             detail=f"Episode '{request.episode_id}' not found.",
-         )
-     except ValueError as e:
-         raise HTTPException(status_code=400, detail=str(e))
-
-
- # ---------------------------------------------------------------------------
- # /baseline — run the built-in baseline agent
- # ---------------------------------------------------------------------------
-
- class BaselineRequest(BaseModel):
-     model: str = "gpt-4o-mini"
-     ticket_index: Optional[int] = 0
-
-
- @app.post("/baseline", response_model=BaselineResult, tags=["openenv"])
- def run_baseline(request: BaselineRequest) -> BaselineResult:
-     """
-     Run the heuristic baseline agent against all three tasks.
-
-     The built-in baseline does NOT require an OpenAI key — it uses the
-     deterministic heuristic baseline from `baseline.py`.
-     If you want to run the LLM baseline, call `baseline.py` directly.
-     """
-     try:
-         from baseline import run_heuristic_baseline
-         scores = run_heuristic_baseline(
-             ticket_index=request.ticket_index or 0
-         )
-         avg = round(sum(s["score"] for s in scores) / len(scores), 4)
-         return BaselineResult(
-             model="heuristic-baseline",
-             scores=[
-                 {"task_id": s["task_id"], "score": s["score"], "details": s}
-                 for s in scores
-             ],
-             average_score=avg,
-         )
-     except Exception as exc:
-         raise HTTPException(status_code=500, detail=str(exc))
data.py CHANGED
@@ -1,92 +1,15 @@
  """
- Linux DevOps tasks for SRE troubleshooting environment.

- Task 1 (easy) — Restart crashed Nginx service
- Task 2 (medium) — Fix Docker container misconfiguration
- Task 3 (hard) — Debug and fix memory leak in mock API
  """
  from __future__ import annotations
  from typing import Any, Dict, List

- # Nginx service configuration
- NGINX_CONFIG_PATH = "/etc/nginx/nginx.conf"
- NGINX_SYSTEMD_PATH = "/etc/systemd/system/nginx.service"
-
- # Docker configuration
- DOCKER_COMPOSE_PATH = "/srv/docker-compose.yml"
-
- # Mock API code path
- MOCK_API_PATH = "/opt/mockapi/app.py"
-
  # ---------------------------------------------------------------------------
- # TASK DEFINITIONS
- # ---------------------------------------------------------------------------
-
- TASK_META: Dict[str, Dict[str, Any]] = {
-     "task1": {
-         "name": "Restart Nginx Service",
-         "description": (
-             "Production Nginx service has crashed. Restart the service, "
-             "verify the configuration syntax, and ensure the server "
-             "returns HTTP 200 on port 80. Failing checklist:\n"
-             "1. Restart nginx (systemctl restart nginx)\n"
-             "2. Verify config syntax (nginx -t)\n"
-             "3. Confirm service is running (systemctl status nginx)\n"
-             "4. Check HTTP 200 response (curl http://localhost:80)"
-         ),
-         "difficulty": "easy",
-         "max_steps": 10,
-         "available_actions": ["bash_cmd", "submit"],
-         "passing_conditions": [
-             "nginx_running",
-             "config_valid",
-             "http_200_response",
-         ],
-     },
-     "task2": {
-         "name": "Fix Docker Container Configuration",
-         "description": (
-             "A critical microservice container is misconfigured. The port "
-             "mapping in docker-compose.yml is broken. Fix the configuration, "
-             "redeploy the container, and verify it's accessible on the "
-             "correct port.\n"
-             "1. Edit docker-compose.yml (fix port mapping)\n"
-             "2. Restart containers (docker-compose up -d)\n"
-             "3. Verify container is running\n"
-             "4. Check service responds on mapped port"
-         ),
-         "difficulty": "medium",
-         "max_steps": 15,
-         "available_actions": ["bash_cmd", "file_edit", "submit"],
-         "passing_conditions": [
-             "docker_compose_valid",
-             "container_running",
-             "port_accessible",
-         ],
-     },
-     "task3": {
-         "name": "Find and Fix Memory Leak in Mock API",
-         "description": (
-             "The Python API service is leaking memory and consuming excessive "
-             "resources. Diagnose the memory leak in /opt/mockapi/app.py, fix "
-             "the offending code, and restart the service without root access.\n"
-             "1. Identify the memory leak (check processes, logs)\n"
-             "2. Kill the runaway process\n"
-             "3. Fix the code in app.py (patch the leak)\n"
-             "4. Restart the service as appuser\n"
-             "5. Verify memory usage is normal"
-         ),
-         "difficulty": "hard",
-         "max_steps": 20,
-         "available_actions": ["bash_cmd", "file_edit", "submit"],
-         "passing_conditions": [
-             "process_killed",
-             "code_fixed",
-             "service_restarted",
-             "memory_normal",
-         ],
-     },
- }
  # Agent must choose: category + priority
  # Categories: billing | technical | account | feature_request | complaint | general
  # Priorities: low | medium | high | critical
 
  """
+ SupportEnv Customer Support Ticket Triage data.

+ Task 1 (easy) — Ticket Classification
+ Task 2 (medium) — Information Extraction
+ Task 3 (hard) — Resolution Generation
  """
  from __future__ import annotations
  from typing import Any, Dict, List

  # ---------------------------------------------------------------------------
+ # TASK 1 — Ticket Classification
  # Agent must choose: category + priority
  # Categories: billing | technical | account | feature_request | complaint | general
  # Priorities: low | medium | high | critical
environment.py CHANGED
@@ -1,25 +1,23 @@
1
  """
2
- Core DevOpsEnv environment logic.
3
 
4
- Simulates a broken Linux server with:
5
- - Task 1: Crashed Nginx service needing restart
6
- - Task 2: Misconfigured Docker container
7
- - Task 3: Memory leak in Python mock API
8
 
9
  Manages episode lifecycle:
10
- reset() → Observation
11
- step(action) → StepResult
12
- get_state() → State
13
- grade() → (score, breakdown, feedback)
14
  """
15
  from __future__ import annotations
16
 
17
  import uuid
18
- import json
19
- import re
20
- from typing import Any, Dict, Optional, Tuple, List
21
 
22
- from data import TASK_META
23
  from graders import grade_task
24
  from models import (
25
  Action,
@@ -27,350 +25,75 @@ from models import (
27
  Reward,
28
  State,
29
  StepResult,
30
- SystemState,
31
  )
32
 
33
- # In-memory store: episode_id → EpisodeState dict
34
  _EPISODES: Dict[str, Dict[str, Any]] = {}
35
 
36
 
37
  # ---------------------------------------------------------------------------
38
- # Mock filesystem and system state
39
  # ---------------------------------------------------------------------------
40
 
41
- def _create_initial_state_task1() -> Dict[str, Any]:
42
- """Task 1: Nginx is crashed."""
43
- return {
44
- "running_processes": [
45
- {"pid": 100, "name": "systemd"},
46
- {"pid": 105, "name": "sshd"},
47
- # nginx NOT running
48
- ],
49
- "service_status": {
50
- "nginx": "inactive",
51
- "docker": "active",
52
- "mockapi": "active",
53
- },
54
- "http_ports_open": [8080], # 80 is down
55
- "docker_containers": [],
56
- "logs": "2026-03-29 01:30:00 nginx crashed\nCore dump detected.\n",
57
- "files": {
58
- NGINX_CONFIG_PATH: """
59
- user nginx;
60
- worker_processes auto;
61
- error_log /var/log/nginx/error.log warn;
62
- pid /var/run/nginx.pid;
63
-
64
- events {
65
- worker_connections 1024;
66
- }
67
-
68
- http {
69
- include /etc/nginx/mime.types;
70
- default_type application/octet-stream;
71
- sendfile on;
72
- keepalive_timeout 65;
73
-
74
- server {
75
- listen 80 default_server;
76
- server_name _;
77
- location / {
78
- return 200 "OK\\n";
79
- }
80
- }
81
- }""",
82
- "/etc/systemd/system/nginx.service": """
83
- [Unit]
84
- Description=The NGINX HTTP and reverse proxy server
85
- After=network.target
86
-
87
- [Service]
88
- Type=forking
89
- PIDFile=/var/run/nginx.pid
90
- ExecStartPre=/usr/sbin/nginx -t
91
- ExecStart=/usr/sbin/nginx
92
- ExecReload=/bin/kill -s HUP $MAINPID
93
- ExecStop=/bin/kill -s QUIT $MAINPID
94
- PrivateTmp=true
95
-
96
- [Install]
97
- WantedBy=multi-user.target""",
98
- },
99
- "cpu_usage": 45.2,
100
- "memory_usage_mb": 256,
101
- }
102
-
103
-
104
- def _create_initial_state_task2() -> Dict[str, Any]:
105
- """Task 2: Docker misconfigured."""
106
- return {
107
- "running_processes": [
108
- {"pid": 100, "name": "systemd"},
109
- {"pid": 105, "name": "sshd"},
110
- {"pid": 200, "name": "dockerd"},
111
- ],
112
- "service_status": {
113
- "nginx": "active",
114
- "docker": "active",
115
- "mockapi": "inactive",
116
- },
117
- "http_ports_open": [80],
118
- "docker_containers": [
119
- {"id": "abc123", "name": "mockapi-svc", "status": "running", "ports": "8000->3000/tcp"}
120
- ],
121
- "logs": "docker: port 3000 already in use\n",
122
- "files": {
123
- "/srv/docker-compose.yml": """
124
- version: '3.8'
125
- services:
126
- mockapi:
127
- image: mockapi:latest
128
- ports:
129
- - "3000:3000"
130
- environment:
131
- - PORT=3000
132
- volumes:
133
- - ./app.py:/app/app.py""",
134
- },
135
- "cpu_usage": 62.0,
136
- "memory_usage_mb": 1024,
137
- }
138
-
139
-
140
- def _create_initial_state_task3() -> Dict[str, Any]:
141
- """Task 3: Memory leak in mock API."""
142
- return {
143
- "running_processes": [
144
- {"pid": 100, "name": "systemd"},
145
- {"pid": 105, "name": "sshd"},
146
- {"pid": 300, "name": "python3", "rss_mb": 2048, "user": "appuser"}, # MEMORY LEAK
147
- ],
148
- "service_status": {
149
- "nginx": "active",
150
- "docker": "active",
151
- "mockapi": "active",
152
- },
153
- "http_ports_open": [80, 5000],
154
- "docker_containers": [],
155
- "logs": (
156
- "2026-03-29 01:45:00 mockapi started\n"
157
- "2026-03-29 01:46:00 memory usage: 512 MB\n"
158
- "2026-03-29 01:47:00 memory usage: 1024 MB\n"
159
- "2026-03-29 01:48:00 memory usage: 1536 MB (WARNING: HIGH)\n"
160
- "2026-03-29 01:49:00 memory usage: 2048 MB (CRITICAL)\n"
161
- ),
162
- "files": {
163
- "/opt/mockapi/app.py": """
164
- import json
165
- from flask import Flask
166
-
167
- app = Flask(__name__)
168
-
169
- # BUG: This list grows unbounded
170
- request_cache = []
171
-
172
- @app.route('/api/data', methods=['GET'])
173
- def get_data():
174
- data = {"timestamp": 123456, "value": 42}
175
- request_cache.append(data) # MEMORY LEAK!
176
- return json.dumps(data)
177
-
178
- if __name__ == '__main__':
179
- app.run(host='0.0.0.0', port=5000)
180
- """,
181
- },
182
- "cpu_usage": 85.5,
183
- "memory_usage_mb": 2048,
184
- }
185
-
186
-
187
- NGINX_CONFIG_PATH = "/etc/nginx/nginx.conf"
188
- DOCKER_COMPOSE_PATH = "/srv/docker-compose.yml"
189
- MOCK_API_PATH = "/opt/mockapi/app.py"
190
-
191
-
192
- def _build_system_state(task_id: str, ep_state: Dict[str, Any]) -> SystemState:
193
- """Build a SystemState object from episode state."""
194
- state_dict = ep_state["system_state"]
195
- return SystemState(
196
- task_id=task_id,
197
- available_commands=["systemctl", "nginx", "docker", "curl", "ps", "cat", "vim"],
198
- filesystem_snapshot=json.dumps({
199
- k: v for k, v in state_dict.get("files", {}).items()
200
- }),
201
- running_processes=state_dict.get("running_processes", []),
202
- service_status=state_dict.get("service_status", {}),
203
- logs=state_dict.get("logs", ""),
204
- http_ports_open=state_dict.get("http_ports_open", []),
205
- docker_containers=state_dict.get("docker_containers", []),
206
- cpu_usage=state_dict.get("cpu_usage", 0.0),
207
- memory_usage_mb=state_dict.get("memory_usage_mb", 0),
208
- )
209
 
210
 
211
  # ---------------------------------------------------------------------------
212
- # Dynamic execution simulation
213
  # ---------------------------------------------------------------------------
214
 
215
- def _simulate_bash_cmd(cmd: str, task_id: str, ep_state: Dict[str, Any]) -> str:
216
- """Simulate bash command execution."""
217
- state_dict = ep_state["system_state"]
218
- lower_cmd = cmd.lower()
219
-
220
- # Task 1: Nginx commands
221
- if task_id == "task1":
222
- if "systemctl restart nginx" in lower_cmd or "systemctl start nginx" in lower_cmd:
223
- state_dict["service_status"]["nginx"] = "active"
224
- state_dict["running_processes"].append({"pid": 999, "name": "nginx"})
225
- state_dict["http_ports_open"] = [80]
226
- return "Job for nginx.service started successfully."
227
- elif "systemctl status nginx" in lower_cmd:
228
- if state_dict["service_status"]["nginx"] == "active":
229
- return "● nginx.service - NGINX HTTP Server\n Loaded: loaded (/etc/systemd/system/nginx.service)\n Active: active (running)"
230
- return "● nginx.service - NGINX HTTP Server\n Active: inactive (dead)"
231
- elif "nginx -t" in lower_cmd:
232
- return "nginx: the configuration file /etc/nginx/nginx.conf syntax is ok\nnginx: configuration file /etc/nginx/nginx.conf test is successful"
233
- elif "curl http://localhost:80" in lower_cmd or "curl http://localhost" in lower_cmd:
234
- if 80 in state_dict["http_ports_open"]:
235
- return "OK"
236
- return "curl: (7) Failed to connect to localhost port 80: Connection refused"
237
-
238
- # Task 2: Docker commands
239
- elif task_id == "task2":
240
- if "docker-compose up -d" in lower_cmd:
241
- if DOCKER_COMPOSE_PATH in state_dict["files"]:
242
- compose_content = state_dict["files"][DOCKER_COMPOSE_PATH]
243
- # Check if port is now correct
244
- if "3000:3000" in compose_content:
245
- state_dict["docker_containers"] = [
246
- {"id": "xyz789", "name": "mockapi-svc", "status": "running", "ports": "3000:3000/tcp"}
247
- ]
248
- state_dict["service_status"]["mockapi"] = "active"
249
- return "Creating mockapi ... done"
250
- return "ERROR: docker-compose.yml not found or invalid"
251
- elif "docker ps" in lower_cmd:
252
- if state_dict["docker_containers"]:
253
- return "\n".join([f"{c['id']} {c['name']} {c['status']}" for c in state_dict["docker_containers"]])
254
- return "No containers running"
255
-
256
- # Task 3: Process/memory commands
257
- elif task_id == "task3":
258
- if "ps aux" in lower_cmd or "ps aux grep python" in lower_cmd:
259
- output = ""
260
- for proc in state_dict["running_processes"]:
261
- if proc.get("name") == "python3":
262
- output += f"appuser {proc['pid']} 85.5 {proc.get('rss_mb', 512)} python3 /opt/mockapi/app.py\n"
263
- return output if output else "No python processes found"
264
- elif "kill" in lower_cmd:
265
- if "300" in lower_cmd or "python" in lower_cmd:
266
- state_dict["running_processes"] = [p for p in state_dict["running_processes"] if p.get("name") != "python3"]
267
- state_dict["service_status"]["mockapi"] = "inactive"
268
- return "Process killed"
269
- return "Process not found"
270
- elif "python3 /opt/mockapi/app.py &" in lower_cmd or "python3 /opt/mockapi/app.py" in lower_cmd:
271
- state_dict["running_processes"].append({"pid": 301, "name": "python3", "rss_mb": 128, "user": "appuser"})
272
- state_dict["service_status"]["mockapi"] = "active"
273
- state_dict["http_ports_open"] = [80, 5000]
274
- return "Application started"
275
-
276
- return f"Command '{cmd}' executed (simulated)"
277
-
278
-
279
- def _simulate_file_edit(file_path: str, new_content: str, ep_state: Dict[str, Any]) -> str:
280
- """Simulate file editing."""
281
- state_dict = ep_state["system_state"]
282
-
283
- if file_path not in state_dict.get("files", {}):
284
- return f"ERROR: File {file_path} not found"
285
-
286
- # Detect task 2: Check docker-compose.yml fix
287
- if file_path == DOCKER_COMPOSE_PATH and "3000:3000" in new_content:
288
- state_dict["files"][file_path] = new_content
289
- return f"File {file_path} updated successfully"
290
-
291
- # Detect task 3: Check mock API fix
292
- elif file_path == MOCK_API_PATH and "request_cache = []" not in new_content:
293
- # Verify fix removes the memory leak
294
- state_dict["files"][file_path] = new_content
295
- return f"File {file_path} patched successfully"
296
-
297
- state_dict["files"][file_path] = new_content
298
- return f"File {file_path} edited"
299
-
300
-
301
- # ---------------------------------------------------------------------------
302
- # Reward calculation
303
- # ---------------------------------------------------------------------------
304
-
305
- def _calculate_step_reward(task_id: str, action: Action, ep_state: Dict[str, Any]) -> Tuple[float, str]:
306
- """Calculate reward based on action and task."""
307
- base_step_cost = -0.01
308
- reward = base_step_cost
309
-
310
- if action.action_type == "bash_cmd":
311
- cmd = action.command or ""
312
- reward += 0.05
313
- explanation = f"Executed: {cmd[:50]}"
314
- return reward, explanation
315
-
316
- elif action.action_type == "file_edit":
317
- reward += 0.03
318
- explanation = f"Edited: {action.file_path}"
319
- return reward, explanation
320
-
321
- elif action.action_type == "submit":
322
- reward += 0.1
323
- explanation = "Episode submitted for grading"
324
- return reward, explanation
325
-
326
- return reward, "Step taken"
327
-
328
-
329
- # ---------------------------------------------------------------------------
330
- # Core API functions
331
- # ---------------------------------------------------------------------------
332
-
333
- def reset(task_id: str) -> Observation:
334
- """Create a new episode for the given task."""
335
  if task_id not in TASK_META:
336
  raise ValueError(f"Unknown task_id {task_id!r}. Valid: {list(TASK_META)}")
337
 
338
  meta = TASK_META[task_id]
339
-
340
- # Initialize system state based on task
341
- if task_id == "task1":
342
- initial_sys_state = _create_initial_state_task1()
343
- elif task_id == "task2":
344
- initial_sys_state = _create_initial_state_task2()
345
- elif task_id == "task3":
346
- initial_sys_state = _create_initial_state_task3()
347
- else:
348
- initial_sys_state = {}
349
 
350
  episode_id = str(uuid.uuid4())
351
  _EPISODES[episode_id] = {
352
  "task_id": task_id,
 
 
353
  "step_number": 0,
354
  "max_steps": meta["max_steps"],
355
  "done": False,
356
  "total_reward": 0.0,
357
  "action_history": [],
358
  "final_score": None,
359
- "system_state": initial_sys_state,
360
  }
361
 
362
- system_state = _build_system_state(task_id, _EPISODES[episode_id])
 
 
 
 
 
 
 
 
363
 
364
  return Observation(
365
  task_id=task_id,
366
- task_description=meta["description"],
367
  episode_id=episode_id,
368
- system_state=system_state,
369
  thread_history=[],
370
- available_actions=meta["available_actions"],
371
  step_number=0,
372
  max_steps=meta["max_steps"],
373
- hint="Start by diagnosing the system state with basic commands.",
374
  )
375
 
376
 
@@ -379,24 +102,14 @@ def step(episode_id: str, action: Action) -> StepResult:
     ep = _EPISODES.get(episode_id)
     if ep is None:
         raise KeyError(f"Episode {episode_id} not found")
-
     if ep["done"]:
         raise ValueError(f"Episode {episode_id} is already done.")
 
     task_id = ep["task_id"]
-    meta = TASK_META[task_id]
 
     ep["step_number"] += 1
     ep["action_history"].append(action.model_dump())
 
-    # Execute action
-    if action.action_type == "bash_cmd":
-        cmd_output = _simulate_bash_cmd(action.command or "", task_id, ep)
-        ep["action_history"][-1]["output"] = cmd_output
-    elif action.action_type == "file_edit":
-        edit_result = _simulate_file_edit(action.file_path or "", action.file_content or "", ep)
-        ep["action_history"][-1]["result"] = edit_result
-
     # Determine if done
     done = False
     if action.action_type == "submit":
@@ -404,16 +117,21 @@ def step(episode_id: str, action: Action) -> StepResult:
     elif ep["step_number"] >= ep["max_steps"]:
         done = True
 
-    # Calculate reward
-    step_reward, explanation = _calculate_step_reward(task_id, action, ep)
 
-    # Apply grader bonus when done
     if done:
-        final_score, breakdown, grader_feedback = grade_task(task_id, ep)
         ep["final_score"] = final_score
-        bonus = final_score * 0.5
-        step_reward += bonus
-        explanation += f" | Grader score: {final_score:.3f} (+{bonus:.3f} bonus)"
     else:
         final_score = None
 
@@ -421,21 +139,34 @@ def step(episode_id: str, action: Action) -> StepResult:
     ep["done"] = done
 
     # Build observation
-    system_state = _build_system_state(task_id, ep)
     thread_history = [
-        {"role": "agent", "content": str(a)} for a in ep["action_history"]
     ]
 
     obs = Observation(
         task_id=task_id,
-        task_description=meta["description"],
         episode_id=episode_id,
-        system_state=system_state,
         thread_history=thread_history,
-        available_actions=meta["available_actions"] if not done else [],
         step_number=ep["step_number"],
         max_steps=ep["max_steps"],
-        hint=None if done else "Continue diagnosing and fixing the issue.",
     )
 
     reward = Reward(
@@ -444,7 +175,7 @@ def step(episode_id: str, action: Action) -> StepResult:
         explanation=explanation,
     )
 
-    info = {"step": ep["step_number"]}
     if done:
         info["final_score"] = final_score
 
@@ -474,13 +205,84 @@ def grade(episode_id: str) -> Tuple[float, Dict[str, float], str]:
     ep = _EPISODES.get(episode_id)
     if ep is None:
         raise KeyError(f"Episode {episode_id} not found")
-
     if not ep.get("done"):
         raise ValueError(f"Episode {episode_id} is not done yet")
 
     task_id = ep["task_id"]
     score, breakdown, feedback = grade_task(task_id, ep)
     ep["final_score"] = score
-
    return score, breakdown, feedback
 
 """
+Core SupportEnv environment logic.
 
+Simulates a customer support ticket triage workflow:
+- Task 1 (easy): Ticket Classification — assign category + priority
+- Task 2 (medium): Information Extraction — pull entities + required actions
+- Task 3 (hard): Resolution Generation — write response + resolution steps
 
 Manages episode lifecycle:
+    reset(task_id, ticket_index) → Observation
+    step(episode_id, action)     → StepResult
+    get_state(episode_id)        → State
+    grade(episode_id)            → (score, breakdown, feedback)
 """
 from __future__ import annotations
 
 import uuid
+from typing import Any, Dict, Optional, Tuple
 
+from data import TASK_META, get_task_meta, get_tickets
 from graders import grade_task
 from models import (
     Action,
     Observation,
     Reward,
     State,
     StepResult,
+    TicketInfo,
 )
 
+# In-memory store: episode_id → episode dict
 _EPISODES: Dict[str, Dict[str, Any]] = {}
 
 
 # ---------------------------------------------------------------------------
+# Reward constants (match openenv.yaml)
 # ---------------------------------------------------------------------------
 
+STEP_COST = -0.02
+SUBMIT_BONUS = 0.05
+MAX_STEP_PENALTY = -0.10
 
 
 # ---------------------------------------------------------------------------
+# Core API
 # ---------------------------------------------------------------------------
 
+def reset(task_id: str, ticket_index: int = 0) -> Observation:
+    """Create a new episode for the given task and ticket."""
     if task_id not in TASK_META:
         raise ValueError(f"Unknown task_id {task_id!r}. Valid: {list(TASK_META)}")
 
     meta = TASK_META[task_id]
+    tickets = get_tickets(task_id)
+
+    if ticket_index < 0 or ticket_index >= len(tickets):
+        raise ValueError(
+            f"ticket_index {ticket_index} out of range [0, {len(tickets) - 1}]"
+        )
+
+    ticket_data = tickets[ticket_index]
+    safe_meta = get_task_meta(task_id)
 
     episode_id = str(uuid.uuid4())
     _EPISODES[episode_id] = {
         "task_id": task_id,
+        "ticket_index": ticket_index,
+        "ticket_data": ticket_data,
         "step_number": 0,
         "max_steps": meta["max_steps"],
         "done": False,
         "total_reward": 0.0,
         "action_history": [],
         "final_score": None,
     }
 
+    ticket_info = TicketInfo(
+        ticket_id=ticket_data["ticket_id"],
+        subject=ticket_data["subject"],
+        body=ticket_data["body"],
+        customer_tier=ticket_data["customer_tier"],
+        account_age_days=ticket_data["account_age_days"],
+        previous_tickets=ticket_data["previous_tickets"],
+        attachments=ticket_data.get("attachments", []),
+    )
 
     return Observation(
         task_id=task_id,
+        task_description=safe_meta["description"],
         episode_id=episode_id,
+        ticket=ticket_info,
         thread_history=[],
+        available_actions=safe_meta["available_actions"],
         step_number=0,
         max_steps=meta["max_steps"],
+        hint=_get_hint(task_id, 0),
     )
 
 
     ep = _EPISODES.get(episode_id)
     if ep is None:
         raise KeyError(f"Episode {episode_id} not found")
     if ep["done"]:
         raise ValueError(f"Episode {episode_id} is already done.")
 
     task_id = ep["task_id"]
 
     ep["step_number"] += 1
     ep["action_history"].append(action.model_dump())
 
     # Determine if done
     done = False
     if action.action_type == "submit":
         done = True
     elif ep["step_number"] >= ep["max_steps"]:
         done = True
 
+    # Calculate step reward
+    step_reward, explanation = _calculate_step_reward(task_id, action, ep, done)
 
+    # Apply grader bonus on terminal step
     if done:
+        final_score, _breakdown, _feedback = grade_task(task_id, ep)
         ep["final_score"] = final_score
+        # Grader score is the terminal bonus (0–1)
+        step_reward += final_score
+        explanation += f" | Grader score: {final_score:.3f}"
+
+        # Penalty for running out of steps without submitting
+        if action.action_type != "submit" and ep["step_number"] >= ep["max_steps"]:
+            step_reward += MAX_STEP_PENALTY
+            explanation += f" | Max-step penalty: {MAX_STEP_PENALTY}"
     else:
         final_score = None
 
     ep["done"] = done
 
     # Build observation
+    ticket_data = ep["ticket_data"]
+    safe_meta = get_task_meta(task_id)
+
+    ticket_info = TicketInfo(
+        ticket_id=ticket_data["ticket_id"],
+        subject=ticket_data["subject"],
+        body=ticket_data["body"],
+        customer_tier=ticket_data["customer_tier"],
+        account_age_days=ticket_data["account_age_days"],
+        previous_tickets=ticket_data["previous_tickets"],
+        attachments=ticket_data.get("attachments", []),
+    )
+
     thread_history = [
+        {"role": "agent", "content": _summarize_action(a)}
+        for a in ep["action_history"]
     ]
 
     obs = Observation(
         task_id=task_id,
+        task_description=safe_meta["description"],
         episode_id=episode_id,
+        ticket=ticket_info,
         thread_history=thread_history,
+        available_actions=safe_meta["available_actions"] if not done else [],
         step_number=ep["step_number"],
         max_steps=ep["max_steps"],
+        hint=None if done else _get_hint(task_id, ep["step_number"]),
     )
 
     reward = Reward(
         explanation=explanation,
     )
 
+    info: Dict[str, Any] = {"step": ep["step_number"]}
     if done:
         info["final_score"] = final_score
 
 
     ep = _EPISODES.get(episode_id)
     if ep is None:
         raise KeyError(f"Episode {episode_id} not found")
     if not ep.get("done"):
         raise ValueError(f"Episode {episode_id} is not done yet")
 
     task_id = ep["task_id"]
     score, breakdown, feedback = grade_task(task_id, ep)
     ep["final_score"] = score
     return score, breakdown, feedback
 
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def _calculate_step_reward(
+    task_id: str, action: Action, ep: Dict[str, Any], done: bool
+) -> Tuple[float, str]:
+    """Dense per-step reward."""
+    reward = STEP_COST  # small cost per step
+
+    if action.action_type == "submit":
+        reward += SUBMIT_BONUS
+        return reward, "Submitted for grading"
+
+    # Partial-progress signals based on task
+    if task_id == "task1":
+        if action.action_type == "classify":
+            if action.category:
+                reward += 0.02
+            if action.priority:
+                reward += 0.02
+            return reward, f"Classified: category={action.category}, priority={action.priority}"
+
+    elif task_id == "task2":
+        if action.action_type == "extract":
+            n_entities = len(action.extracted_entities) if action.extracted_entities else 0
+            n_actions = len(action.required_actions) if action.required_actions else 0
+            reward += min(n_entities * 0.005, 0.04)
+            reward += min(n_actions * 0.005, 0.02)
+            return reward, f"Extracted {n_entities} entities, {n_actions} actions"
+
+    elif task_id == "task3":
+        if action.action_type == "respond":
+            text_len = len(action.response_text or "")
+            n_steps = len(action.resolution_steps) if action.resolution_steps else 0
+            if text_len > 0:
+                reward += min(text_len * 0.0001, 0.03)
+            if n_steps > 0:
+                reward += min(n_steps * 0.005, 0.02)
+            return reward, f"Response ({text_len} chars), {n_steps} resolution steps"
+
+    return reward, "Step taken"
+
+
+def _summarize_action(action_dict: Dict[str, Any]) -> str:
+    """One-line summary of an action for thread_history."""
+    atype = action_dict.get("action_type", "unknown")
+    if atype == "classify":
+        return f"classify(category={action_dict.get('category')}, priority={action_dict.get('priority')})"
+    elif atype == "extract":
+        ents = action_dict.get("extracted_entities") or {}
+        acts = action_dict.get("required_actions") or []
+        return f"extract(entities={list(ents.keys())}, actions={acts})"
+    elif atype == "respond":
+        text = (action_dict.get("response_text") or "")[:60]
+        steps = action_dict.get("resolution_steps") or []
+        return f"respond(text='{text}...', steps={len(steps)})"
+    elif atype == "submit":
+        return "submit()"
+    return f"{atype}()"
+
+
+def _get_hint(task_id: str, step: int) -> Optional[str]:
+    """Contextual hints to guide the agent."""
+    if step == 0:
+        hints = {
+            "task1": "Read the ticket carefully and classify by category and priority.",
+            "task2": "Extract all entities (IDs, names, amounts) and identify required actions.",
+            "task3": "Write a professional response and list resolution steps.",
+        }
+        return hints.get(task_id)
+    return None
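
For orientation, the new episode lifecycle can be exercised in-process. The following is a minimal sketch, not part of the commit: it assumes this module imports as `environment`, that `data.py` ships at least one task1 ticket, and that `step()` returns a `StepResult` exposing `done`; the classify values are illustrative guesses, not ground truth.

# Sketch: drive one task1 episode end to end (assumptions noted above).
from environment import reset, step, grade
from models import Action

obs = reset(task_id="task1", ticket_index=0)
print(obs.ticket.subject, "|", obs.hint)

# Illustrative classification; a real agent derives these from the ticket.
result = step(obs.episode_id, Action(action_type="classify",
                                     category="billing", priority="high"))

if not result.done:
    # submit ends the episode; step() folds the grader score into the reward
    result = step(obs.episode_id, Action(action_type="submit"))

score, breakdown, feedback = grade(obs.episode_id)
print(f"score={score:.3f} :: {feedback}")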
graders.py CHANGED
@@ -1,191 +1,244 @@
 """
-Graders for DevOpsEnv tasks.
 
-Deterministic scoring based on system state changes and action validity.
 """
-from typing import Any, Dict, Tuple
 
 
-def grade_task(task_id: str, episode_state: Dict[str, Any]) -> Tuple[float, Dict[str, float], str]:
-    """
-    Grade a completed task episode.
-
-    Returns (score, breakdown, feedback)
-    """
     if task_id == "task1":
-        return grade_task1(episode_state)
     elif task_id == "task2":
-        return grade_task2(episode_state)
     elif task_id == "task3":
-        return grade_task3(episode_state)
-    else:
-        return 0.0, {}, "Unknown task"
 
 
-def grade_task1(episode_state: Dict[str, Any]) -> Tuple[float, Dict[str, float], str]:
     """
-    Grade Task 1: Restart Nginx.
-
-    Success criteria:
-    - nginx service is running (30%)
-    - nginx config is valid (30%)
-    - HTTP 200 response on port 80 (40%)
     """
-    state_dict = episode_state.get("system_state", {})
-    action_history = episode_state.get("action_history", [])
-
-    breakdown = {
-        "nginx_running": 0.0,
-        "config_valid": 0.0,
-        "http_200": 0.0,
     }
-
-    # Check if nginx is running
-    service_status = state_dict.get("service_status", {})
-    if service_status.get("nginx") == "active":
-        breakdown["nginx_running"] = 0.3
-
-    # Check if config validation was attempted and passed
-    config_valid = False
-    for action in action_history:
-        output = action.get("output", "")
-        if output and ("syntax is ok" in str(output).lower() or "test is successful" in str(output).lower()):
-            config_valid = True
-            breakdown["config_valid"] = 0.3
-            break
-
-    # Check if HTTP 200 response was achieved
-    http_ports = state_dict.get("http_ports_open", [])
-    if 80 in http_ports:
-        # Verify http 200 response was confirmed
-        for action in action_history:
-            output = action.get("output", "")
-            cmd = action.get("command", "")
-            if output and cmd and "OK" in str(output) and "curl" in str(cmd).lower():
-                breakdown["http_200"] = 0.4
-                break
-
-    score = sum(breakdown.values())
-    feedback = f"Task 1 Grading: nginx_running={breakdown['nginx_running']:.1f}, config_valid={breakdown['config_valid']:.1f}, http_200={breakdown['http_200']:.1f}"
-
-    return min(score, 1.0), breakdown, feedback
-
-
-def grade_task2(episode_state: Dict[str, Any]) -> Tuple[float, Dict[str, float], str]:
     """
-    Grade Task 2: Fix Docker configuration.
-
-    Success criteria:
-    - docker-compose.yml was edited (25%)
-    - docker-compose up -d was successful (25%)
-    - Container is running (25%)
-    - Service accessible on correct port (25%)
     """
-    state_dict = episode_state.get("system_state", {})
-    action_history = episode_state.get("action_history", [])
-    files = state_dict.get("files", {})
-
-    breakdown = {
-        "file_edited": 0.0,
-        "compose_ran": 0.0,
-        "container_running": 0.0,
-        "port_accessible": 0.0,
     }
-
-    # Check if docker-compose.yml was edited correctly
-    compose_file = "/srv/docker-compose.yml"
-    if compose_file in files:
-        content = files[compose_file]
-        if content and "3000:3000" in str(content):
-            breakdown["file_edited"] = 0.25
-
-    # Check if docker-compose up -d was run
-    for action in action_history:
-        cmd = action.get("command")
-        if cmd and "docker-compose up -d" in str(cmd):
-            output = action.get("output", "")
-            if output and ("done" in str(output).lower() or "created" in str(output).lower()):
-                breakdown["compose_ran"] = 0.25
-                break
-
-    # Check if container is running
-    containers = state_dict.get("docker_containers", [])
-    if containers:
-        for container in containers:
-            if container.get("status") == "running" and "mockapi" in str(container.get("name", "")):
-                breakdown["container_running"] = 0.25
-                break
-
-    # Check if port is correctly mapped
-    if containers:
-        for container in containers:
-            if "3000:3000" in str(container.get("ports", "")):
-                breakdown["port_accessible"] = 0.25
-                break
-
-    score = sum(breakdown.values())
-    feedback = f"Task 2 Grading: file_edited={breakdown['file_edited']:.2f}, compose_ran={breakdown['compose_ran']:.2f}, container_running={breakdown['container_running']:.2f}, port_accessible={breakdown['port_accessible']:.2f}"
-
-    return min(score, 1.0), breakdown, feedback
-
-
-def grade_task3(episode_state: Dict[str, Any]) -> Tuple[float, Dict[str, float], str]:
     """
-    Grade Task 3: Fix memory leak.
-
-    Success criteria:
-    - Process was killed (25%)
-    - Code was fixed (removing the leak) (25%)
-    - Service was restarted (25%)
-    - Memory usage decreased (25%)
     """
-    state_dict = episode_state.get("system_state", {})
-    action_history = episode_state.get("action_history", [])
-    files = state_dict.get("files", {})
-    processes = state_dict.get("running_processes", [])
-
-    breakdown = {
-        "process_killed": 0.0,
-        "code_fixed": 0.0,
-        "service_restarted": 0.0,
-        "memory_reduced": 0.0,
     }
-
-    # Check if python process was killed
-    has_python_leak = False
-    if processes:
-        has_python_leak = any(p.get("name") == "python3" and p.get("rss_mb", 512) > 1024 for p in processes)
-    if not has_python_leak:
-        # Process was killed
-        breakdown["process_killed"] = 0.25
-
-    # Check if code was fixed (removed the memory leak)
-    app_file = "/opt/mockapi/app.py"
-    if app_file in files:
-        content = files[app_file]
-        # Memory leak is the unbounded list append - check if it is fixed
-        if content and ("request_cache.append" not in str(content) or "request_cache = []" not in str(content)):
-            # If it has been removed or replaced with something better
-            if "request_cache" not in str(content) or "# " in str(content):
-                breakdown["code_fixed"] = 0.25
-
-    # Check if service was restarted
-    service_status = state_dict.get("service_status", {})
-    if service_status.get("mockapi") == "active":
-        # And there is a newer process
-        for action in action_history:
-            cmd = action.get("command", "")
-            if cmd and "python3" in str(cmd) and ("start" in str(cmd) or "&" in str(cmd)):
-                breakdown["service_restarted"] = 0.25
-                break
-
-    # Check if memory usage decreased
-    initial_memory = 2048
-    current_memory = state_dict.get("memory_usage_mb", 2048)
-    if current_memory < initial_memory * 0.75:  # At least 25% improvement
-        breakdown["memory_reduced"] = 0.25
-
-    score = sum(breakdown.values())
-    feedback = f"Task 3 Grading: process_killed={breakdown['process_killed']:.2f}, code_fixed={breakdown['code_fixed']:.2f}, service_restarted={breakdown['service_restarted']:.2f}, memory_reduced={breakdown['memory_reduced']:.2f}"
-
-    return min(score, 1.0), breakdown, feedback
 """
+Deterministic graders for SupportEnv tasks.
 
+Each grader inspects the agent's action_history against ground-truth data
+and returns (score, breakdown, feedback) where score is in [0.0, 1.0].
+
+Task 1 — Classification: category match (0.50) + priority match (0.40) + efficiency (0.10)
+Task 2 — Extraction: entity coverage (0.60) + action coverage (0.30) + no hallucination (0.10)
+Task 3 — Resolution: keyword coverage (0.30) + step coverage (0.30) + tone (0.25) +
+                     length (0.10) + non-empty steps (0.05)
 """
+from __future__ import annotations
 
+from typing import Any, Dict, List, Optional, Tuple
 
+
+def grade_task(
+    task_id: str, episode_state: Dict[str, Any]
+) -> Tuple[float, Dict[str, float], str]:
     if task_id == "task1":
+        return _grade_classification(episode_state)
     elif task_id == "task2":
+        return _grade_extraction(episode_state)
     elif task_id == "task3":
+        return _grade_resolution(episode_state)
+    return 0.0, {}, "Unknown task"
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def _last_action_of_type(
+    history: List[Dict[str, Any]], action_type: str
+) -> Optional[Dict[str, Any]]:
+    """Return the last action matching *action_type*, or None."""
+    for action in reversed(history):
+        if action.get("action_type") == action_type:
+            return action
+    return None
+
+
+def _normalize(s: Any) -> str:
+    return str(s).strip().lower() if s is not None else ""
+
 
+# ---------------------------------------------------------------------------
+# Task 1 — Classification
+# ---------------------------------------------------------------------------
 
+def _grade_classification(ep: Dict[str, Any]) -> Tuple[float, Dict[str, float], str]:
     """
+    Score breakdown:
+        category_correct   0.50 — exact match
+        priority_correct   0.40 — exact match
+        efficiency         0.10 — 1 step = full, degrades linearly
     """
+    gt = ep["ticket_data"]["ground_truth"]
+    history = ep.get("action_history", [])
+
+    breakdown: Dict[str, float] = {
+        "category_correct": 0.0,
+        "priority_correct": 0.0,
+        "efficiency": 0.0,
     }
+
+    classify_action = _last_action_of_type(history, "classify")
+    if classify_action is None:
+        return 0.0, breakdown, "No classify action found."
+
+    # Category
+    if _normalize(classify_action.get("category")) == _normalize(gt["category"]):
+        breakdown["category_correct"] = 0.50
+
+    # Priority
+    if _normalize(classify_action.get("priority")) == _normalize(gt["priority"]):
+        breakdown["priority_correct"] = 0.40
+
+    # Efficiency: full marks if classified in 1 step, degrades linearly
+    max_steps = ep.get("max_steps", 3)
+    steps_used = ep.get("step_number", max_steps)
+    if steps_used <= 1:
+        breakdown["efficiency"] = 0.10
+    else:
+        breakdown["efficiency"] = round(max(0.0, 0.10 * (1 - (steps_used - 1) / max_steps)), 4)
+
+    score = round(min(sum(breakdown.values()), 1.0), 4)
+    parts = ", ".join(f"{k}={v:.2f}" for k, v in breakdown.items())
+    return score, breakdown, f"Task 1: {parts}"
+
+
+# ---------------------------------------------------------------------------
+# Task 2 — Information Extraction
+# ---------------------------------------------------------------------------
+
+def _grade_extraction(ep: Dict[str, Any]) -> Tuple[float, Dict[str, float], str]:
     """
+    Score breakdown:
+        entity_coverage    0.60 — fraction of ground-truth entities matched
+        action_coverage    0.30 — fraction of required actions matched
+        no_hallucination   0.10 — penalty for extra entities not in ground truth
     """
+    gt = ep["ticket_data"]["ground_truth"]
+    history = ep.get("action_history", [])
+
+    breakdown: Dict[str, float] = {
+        "entity_coverage": 0.0,
+        "action_coverage": 0.0,
+        "no_hallucination": 0.10,  # start with full marks, deduct
     }
+
+    extract_action = _last_action_of_type(history, "extract")
+    if extract_action is None:
+        breakdown["no_hallucination"] = 0.0
+        return 0.0, breakdown, "No extract action found."
+
+    # --- Entity coverage ---
+    gt_entities: Dict[str, Any] = gt.get("entities", {})
+    pred_entities: Dict[str, Any] = extract_action.get("extracted_entities") or {}
+
+    if gt_entities:
+        matched = 0
+        for key, gt_val in gt_entities.items():
+            pred_val = pred_entities.get(key)
+            if pred_val is not None and _entity_matches(gt_val, pred_val):
+                matched += 1
+        breakdown["entity_coverage"] = round(0.60 * matched / len(gt_entities), 4)
+
+    # --- Action coverage ---
+    gt_actions: List[str] = gt.get("required_actions", [])
+    pred_actions: List[str] = extract_action.get("required_actions") or []
+    pred_actions_lower = [_normalize(a) for a in pred_actions]
+
+    if gt_actions:
+        matched_actions = sum(
+            1 for ga in gt_actions if _normalize(ga) in pred_actions_lower
+        )
+        breakdown["action_coverage"] = round(0.30 * matched_actions / len(gt_actions), 4)
+
+    # --- No hallucination ---
+    if pred_entities and gt_entities:
+        extra_keys = set(pred_entities.keys()) - set(gt_entities.keys())
+        if extra_keys:
+            penalty = min(len(extra_keys) * 0.02, 0.10)
+            breakdown["no_hallucination"] = round(max(0.0, 0.10 - penalty), 4)
+
+    score = round(min(sum(breakdown.values()), 1.0), 4)
+    parts = ", ".join(f"{k}={v:.2f}" for k, v in breakdown.items())
+    return score, breakdown, f"Task 2: {parts}"
+
+
+def _entity_matches(gt_val: Any, pred_val: Any) -> bool:
+    """Flexible entity comparison — handles strings, lists, and numbers."""
+    if isinstance(gt_val, list) and isinstance(pred_val, list):
+        gt_set = {_normalize(v) for v in gt_val}
+        pred_set = {_normalize(v) for v in pred_val}
+        return gt_set == pred_set
+    return _normalize(gt_val) == _normalize(pred_val)
+
+
+# ---------------------------------------------------------------------------
+# Task 3 — Resolution Generation
+# ---------------------------------------------------------------------------
+
+def _grade_resolution(ep: Dict[str, Any]) -> Tuple[float, Dict[str, float], str]:
     """
+    Score breakdown:
+        keyword_coverage   0.30 — fraction of required keywords found in response
+        step_coverage      0.30 — fraction of required resolution steps matched
+        tone_compliance    0.25 — apology / urgency / timeline adherence
+        length_adequate    0.10 — response meets minimum length
+        no_empty_steps     0.05 — all resolution steps are non-empty
     """
+    gt = ep["ticket_data"]["ground_truth"]
+    history = ep.get("action_history", [])
+
+    breakdown: Dict[str, float] = {
+        "keyword_coverage": 0.0,
+        "step_coverage": 0.0,
+        "tone_compliance": 0.0,
+        "length_adequate": 0.0,
+        "no_empty_steps": 0.05,  # assume pass unless empty steps found
     }
+
+    respond_action = _last_action_of_type(history, "respond")
+    if respond_action is None:
+        breakdown["no_empty_steps"] = 0.0
+        return 0.0, breakdown, "No respond action found."
+
+    response_text: str = respond_action.get("response_text") or ""
+    resolution_steps: List[str] = respond_action.get("resolution_steps") or []
+    response_lower = response_text.lower()
+
+    # --- Keyword coverage ---
+    required_keywords: List[str] = gt.get("required_keywords", [])
+    if required_keywords:
+        matched_kw = sum(1 for kw in required_keywords if kw.lower() in response_lower)
+        breakdown["keyword_coverage"] = round(0.30 * matched_kw / len(required_keywords), 4)
+
+    # --- Step coverage ---
+    gt_steps: List[str] = gt.get("required_resolution_steps", [])
+    if gt_steps:
+        pred_steps_lower = [_normalize(s) for s in resolution_steps]
+        matched_steps = sum(
+            1 for gs in gt_steps if _normalize(gs) in pred_steps_lower
+        )
+        breakdown["step_coverage"] = round(0.30 * matched_steps / len(gt_steps), 4)
+
+    # --- Tone compliance ---
+    tone_req = gt.get("tone_requirements", {})
+    tone_checks = 0
+    tone_pass = 0
+    if tone_req.get("must_apologize"):
+        tone_checks += 1
+        apology_words = ["apolog", "sorry", "regret", "sincerely"]
+        if any(w in response_lower for w in apology_words):
+            tone_pass += 1
+    if tone_req.get("must_acknowledge_urgency"):
+        tone_checks += 1
+        urgency_words = ["urgent", "immediately", "priority", "asap", "right away", "as soon as"]
+        if any(w in response_lower for w in urgency_words):
+            tone_pass += 1
+    if tone_req.get("must_provide_timeline"):
+        tone_checks += 1
+        timeline_words = ["within", "hours", "minutes", "by end of", "shortly", "today", "tomorrow", "timeline", "expect"]
+        if any(w in response_lower for w in timeline_words):
+            tone_pass += 1
+    if tone_checks > 0:
+        breakdown["tone_compliance"] = round(0.25 * tone_pass / tone_checks, 4)
+    else:
+        breakdown["tone_compliance"] = 0.25  # no tone requirements = full marks
+
+    # --- Length adequate ---
+    min_len = gt.get("expected_response_length_min", 80)
+    if len(response_text) >= min_len:
+        breakdown["length_adequate"] = 0.10
+
+    # --- Non-empty steps ---
+    if not resolution_steps or any(not s.strip() for s in resolution_steps):
+        breakdown["no_empty_steps"] = 0.0
+
+    score = round(min(sum(breakdown.values()), 1.0), 4)
+    parts = ", ".join(f"{k}={v:.2f}" for k, v in breakdown.items())
+    return score, breakdown, f"Task 3: {parts}"
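
To make the Task 1 arithmetic concrete, here is a small worked sketch (the episode dict mimics the internal shape environment.py stores; the ground-truth values are invented for illustration):

from graders import grade_task

ep = {
    "ticket_data": {"ground_truth": {"category": "billing", "priority": "high"}},
    "action_history": [
        {"action_type": "classify", "category": "Billing", "priority": "high"},
        {"action_type": "submit"},
    ],
    "max_steps": 3,
    "step_number": 2,  # classify + submit
}

score, breakdown, feedback = grade_task("task1", ep)
# category_correct = 0.50 (matching is case-insensitive via _normalize),
# priority_correct = 0.40,
# efficiency = 0.10 * (1 - (2 - 1) / 3) ≈ 0.0667, so score ≈ 0.9667
print(score, feedback)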
inference.py CHANGED
@@ -1,351 +1,290 @@
 """
-Baseline LLM inference agent for DevOpsEnv.
 
-This script reads an OpenEnv environment's state() and uses an LLM to generate
-actions that solve the DevOps tasks.
 
-Usage:
-    python inference.py --task task1 --model gpt-4 --hf-token <token>
 """
 import os
 import sys
-import json
-import argparse
 import time
-from pathlib import Path
-from typing import Optional
 
 import requests
-from google import genai
 from openai import OpenAI
 
-# Load .env values from current folder (if present) before reading config.
-def _load_dotenv_from_workspace() -> None:
-    """Load KEY=VALUE pairs from .env into os.environ without overriding existing vars."""
-    dotenv_path = Path(__file__).resolve().parent / ".env"
-    if not dotenv_path.exists():
-        return
-
-    for raw_line in dotenv_path.read_text(encoding="utf-8").splitlines():
-        line = raw_line.strip()
-        if not line or line.startswith("#"):
-            continue
-        if line.startswith("export "):
-            line = line[7:].strip()
-        if "=" not in line:
-            continue
-
-        key, value = line.split("=", 1)
-        key = key.strip()
-        value = value.strip()
-        if not key:
-            continue
-
-        # Remove surrounding quotes if present.
-        if (value.startswith('"') and value.endswith('"')) or (
-            value.startswith("'") and value.endswith("'")
-        ):
-            value = value[1:-1]
-
-        os.environ.setdefault(key, value)
-
-
-_load_dotenv_from_workspace()
-
-# Read config from environment/.env
-API_BASE_URL = os.environ.get("API_BASE_URL", "http://localhost:7860")
-MODEL_NAME = os.environ.get("MODEL_NAME", "gpt-4")
-HF_TOKEN = os.environ.get("HF_TOKEN", "")
-OPENAI_BASE_URL = os.environ.get("OPENAI_BASE_URL", "")
-GEMINI_DEFAULT_MODEL = os.environ.get("GEMINI_MODEL", "gemini-3-flash-preview")
-
-
-def _get_openai_client() -> OpenAI:
-    """Create an OpenAI-compatible client for OpenAI-style chat completions."""
-    api_key = os.environ.get("OPENAI_API_KEY", "sk-test")
-    client_kwargs = {"api_key": api_key}
-    if OPENAI_BASE_URL:
-        client_kwargs["base_url"] = OPENAI_BASE_URL
-    return OpenAI(**client_kwargs)
-
-
-def _get_gemini_client() -> genai.Client:
-    """Create a Gemini client using the official google-genai SDK."""
-    api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("OPENAI_API_KEY", "")
-    if not api_key:
-        raise ValueError("GEMINI_API_KEY is required for Gemini models")
-    return genai.Client(api_key=api_key)
-
-
-def _is_gemini_model(model: str) -> bool:
-    """Detect whether the requested model should use the Gemini SDK path."""
-    m = (model or "").lower()
-    return "gemini" in m
-
-
-def _resolve_gemini_model(model: str) -> str:
-    """Map shorthand Gemini model names to concrete model IDs."""
-    m = (model or "").strip()
-    if not m or m.lower() == "gemini":
-        return GEMINI_DEFAULT_MODEL
-    return m
-
-
-def _generate_action_text(
-    model: str,
-    system_prompt: str,
-    user_prompt: str,
-    openai_client: Optional[OpenAI],
-    gemini_client: Optional[genai.Client],
-) -> str:
-    """Generate model output text using Gemini SDK or OpenAI-compatible chat."""
-    if _is_gemini_model(model):
-        if gemini_client is None:
-            raise ValueError("Gemini client was not initialized")
-        gemini_model = _resolve_gemini_model(model)
-        combined_prompt = (
-            f"System instructions:\n{system_prompt}\n\n"
-            f"User request:\n{user_prompt}"
-        )
-        response = gemini_client.models.generate_content(
-            model=gemini_model,
-            contents=combined_prompt,
-        )
-        return response.text or ""
-
-    if openai_client is None:
-        raise ValueError("OpenAI client was not initialized")
-
-    response = openai_client.chat.completions.create(
-        model=model,
-        messages=[
-            {"role": "system", "content": system_prompt},
-            {"role": "user", "content": user_prompt},
-        ],
-        temperature=0.3,
-        max_tokens=1000,
     )
-    return response.choices[0].message.content or ""
-
-
-def send_request(method: str, endpoint: str, **kwargs):
-    """Send HTTP request to the environment server."""
-    url = f"{API_BASE_URL}{endpoint}"
-    response = requests.request(method, url, timeout=10, **kwargs)
-    response.raise_for_status()
-    return response.json()
-
-
-def run_agent(task_id: str, max_steps: int = 20, model: Optional[str] = None) -> dict:
-    """Run the agent on a specific task."""
-    model = model or MODEL_NAME
-    if _is_gemini_model(model):
-        model = _resolve_gemini_model(model)
-    openai_client: Optional[OpenAI] = None
-    gemini_client: Optional[genai.Client] = None
-    if _is_gemini_model(model):
-        gemini_client = _get_gemini_client()
-    else:
-        openai_client = _get_openai_client()
-
-    # Initialize episode
-    print(f"\n{'='*60}")
-    print(f"Starting task: {task_id}")
-    print(f"Model: {model}")
-    print(f"{'='*60}\n")
-
-    obs = send_request("POST", "/reset", json={"task_id": task_id})
-    episode_id = obs["episode_id"]
-    max_steps = obs["max_steps"]
-
-    print(f"Episode ID: {episode_id}")
-    print(f"Max Steps: {max_steps}")
-    print(f"\nTask: {obs['task_description']}\n")
-
-    step_count = 0
-    total_reward = 0.0
-    actions_taken = []
-
-    while step_count < max_steps:
-        step_count += 1
-
-        # Get current state
-        state = send_request("GET", f"/state?episode_id={episode_id}")
-
-        # Prepare prompt for LLM
-        system_prompt = """You are an expert Linux DevOps engineer/SRE.
-Your job is to diagnose and fix broken systems using bash commands and file edits.
-You are interacting with a simulated Linux environment.
-
-Available actions:
-1. bash_cmd: Execute a bash command
-2. file_edit: Edit a file
-3. submit: Submit when the task is complete
-
-Respond in JSON format with this structure:
-{
-    "action_type": "bash_cmd" | "file_edit" | "submit",
-    "command": "command to execute" (if bash_cmd),
-    "file_path": "/path/to/file" (if file_edit),
-    "file_content": "new file content" (if file_edit),
-    "summary": "why you're taking this action"
 }
 
-Be strategic:
-- Start by diagnosing the system
-- Use ps, systemctl, curl, etc. to understand issues
-- Fix the root cause
-- Submit when done
-"""
-
-        user_prompt = f"""
-Current system state:
-- Task: {obs['task_description']}
-- Step: {state['step_number']}/{state['max_steps']}
-- Reward so far: {state['total_reward']:.3f}
 
-System status:
-{json.dumps(obs['system_state'], indent=2)}
 
-Previous actions: {len(state['history'])} taken so far
 
-History of commands:
-{json.dumps(state['history'][-3:], indent=2) if state['history'] else 'None yet'}
 
-What should I do next? Think step-by-step about what the issue is and how to fix it.
-"""
-
-        try:
-            # Call LLM (Gemini SDK or OpenAI-compatible chat)
-            response_text = _generate_action_text(
-                model=model,
-                system_prompt=system_prompt,
-                user_prompt=user_prompt,
-                openai_client=openai_client,
-                gemini_client=gemini_client,
-            )
-            try:
-                # Try to extract JSON from response
-                if "```json" in response_text:
-                    json_str = response_text.split("```json")[1].split("```")[0]
-                elif "```" in response_text:
-                    json_str = response_text.split("```")[1].split("```")[0]
-                else:
-                    json_str = response_text
-
-                action_data = json.loads(json_str)
-            except (json.JSONDecodeError, IndexError):
-                print(f"Failed to parse LLM response: {response_text[:100]}")
-                # Fallback to simple diagnosis
-                action_data = {"action_type": "bash_cmd", "command": "ps aux"}
-
-        except Exception as e:
-            print(f"LLM error: {e}. Falling back to heuristic...")
-            # Fallback heuristic actions
-            if step_count == 1:
-                action_data = {"action_type": "bash_cmd", "command": "systemctl status nginx"}
-            else:
-                action_data = {"action_type": "submit", "summary": "Diagnostics complete"}
-
-        # Step in environment
-        try:
-            result = send_request("POST", "/step", json={
                "episode_id": episode_id,
-                "action": action_data
            })
-
-            obs = result["observation"]
-            reward = result["reward"]
-            done = result["done"]
-
-            step_count = obs["step_number"]
-            total_reward = reward["total_reward"]
-
-            actions_taken.append(action_data)
-
-            print(f"\nStep {step_count}/{max_steps}")
-            print(f"Action: {action_data['action_type']}")
-            if action_data.get("command"):
-                print(f"Command: {action_data['command']}")
-            elif action_data.get("file_path"):
-                print(f"File: {action_data['file_path']}")
-
-            print(f"Reward: {reward['step_reward']:+.3f} (total: {total_reward:.3f})")
-            print(f"Info: {reward['explanation'][:100]}")
-
-            if done:
-                print(f"\n{'='*60}")
-                print("EPISODE COMPLETE!")
-                print(f"Final Reward: {total_reward:.3f}")
-                print(f"Steps taken: {step_count}")
-                print(f"{'='*60}\n")
-                break
-
-        except Exception as e:
-            print(f"Step error: {e}")
-            break
-
-        # Small delay to avoid rate limiting
-        time.sleep(0.5)
-
-    # Grade the episode
-    try:
-        grade_result = send_request("POST", "/grader", json={"episode_id": episode_id})
-        print(f"\nGrader Results:")
-        print(f"Score: {grade_result['score']:.3f}/1.0")
-        print(f"Breakdown: {json.dumps(grade_result['breakdown'], indent=2)}")
-        print(f"Feedback: {grade_result['feedback']}")
-    except Exception as e:
-        print(f"Grading error: {e}")
-
    return {
        "task_id": task_id,
-        "episode_id": episode_id,
-        "final_reward": total_reward,
-        "step_count": step_count,
-        "actions": actions_taken,
    }
 
 
-def main():
-    parser = argparse.ArgumentParser(description="Run DevOpsEnv baseline agent")
-    parser.add_argument("--task", default="task1", help="Task ID (task1, task2, or task3)")
-    parser.add_argument(
-        "--model",
-        default=None,
-        help=(
-            "Model name (default: env var MODEL_NAME). "
-            "For Gemini, pass a real model ID like gemini-3-flash-preview "
-            "or use --model gemini to auto-resolve to GEMINI_MODEL."
-        ),
-    )
-    parser.add_argument("--api-url", default=None, help="API URL (default: env var API_BASE_URL)")
-    parser.add_argument("--hf-token", default=None, help="HF token (default: env var HF_TOKEN)")
-    parser.add_argument(
-        "--openai-base-url",
-        default=None,
-        help="OpenAI-compatible base URL for non-OpenAI providers (for example Gemini OpenAI API)",
-    )
-
-    args = parser.parse_args()
-
-    # Override env variables if provided
-    global API_BASE_URL, MODEL_NAME, HF_TOKEN, OPENAI_BASE_URL
-    if args.api_url:
-        API_BASE_URL = args.api_url
-    if args.model:
-        MODEL_NAME = args.model
-    if args.hf_token:
-        HF_TOKEN = args.hf_token
-    if args.openai_base_url:
-        OPENAI_BASE_URL = args.openai_base_url
-
-    try:
-        result = run_agent(args.task, model=MODEL_NAME)
-        print(json.dumps(result, indent=2))
-    except Exception as e:
-        print(f"Fatal error: {e}", file=sys.stderr)
-        sys.exit(1)
 
 
 if __name__ == "__main__":
 
 """
+Baseline inference script for SupportEnv.
 
+Runs an LLM agent against all 3 tasks (5 tickets each) and emits the
+mandatory [START]/[STEP]/[END] stdout format.
 
+Environment variables:
+    API_BASE_URL       LLM endpoint (default: https://router.huggingface.co/v1)
+    MODEL_NAME         Model identifier (default: Qwen/Qwen2.5-72B-Instruct)
+    HF_TOKEN           API key
+    API_BASE_URL_ENV   SupportEnv server URL (default: http://localhost:7860)
 """
+import json
 import os
 import sys
 import time
+from typing import Any, Dict, List, Optional
 
 import requests
 from openai import OpenAI
 
+# ---------------------------------------------------------------------------
+# Config from environment
+# ---------------------------------------------------------------------------
+
+API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
+MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
+HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("API_KEY", "")
+ENV_BASE_URL = os.getenv("API_BASE_URL_ENV", "http://localhost:7860")
+
+TEMPERATURE = 0.3
+MAX_TOKENS = 1024
+BENCHMARK = "supportenv"
+
+TASKS = [
+    {"task_id": "task1", "name": "Ticket Classification", "tickets": 5},
+    {"task_id": "task2", "name": "Information Extraction", "tickets": 5},
+    {"task_id": "task3", "name": "Resolution Generation", "tickets": 5},
+]
+
+
+# ---------------------------------------------------------------------------
+# Logging helpers (mandatory format)
+# ---------------------------------------------------------------------------
+
+def log_start(task: str, env: str, model: str) -> None:
+    print(f"[START] task={task} env={env} model={model}", flush=True)
+
+
+def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
+    error_val = error if error else "null"
+    done_val = str(done).lower()
+    print(
+        f"[STEP] step={step} action={action} reward={reward:.2f} done={done_val} error={error_val}",
+        flush=True,
+    )
+
+
+def log_end(success: bool, steps: int, rewards: List[float]) -> None:
+    rewards_str = ",".join(f"{r:.2f}" for r in rewards)
+    print(
+        f"[END] success={str(success).lower()} steps={steps} rewards={rewards_str}",
+        flush=True,
     )
+
+
+# ---------------------------------------------------------------------------
+# Environment HTTP helpers
+# ---------------------------------------------------------------------------
+
+def env_request(method: str, endpoint: str, **kwargs) -> Dict[str, Any]:
+    url = f"{ENV_BASE_URL}{endpoint}"
+    resp = requests.request(method, url, timeout=30, **kwargs)
+    resp.raise_for_status()
+    return resp.json()
+
+
+# ---------------------------------------------------------------------------
+# LLM prompts per task
+# ---------------------------------------------------------------------------
+
+SYSTEM_PROMPTS = {
+    "task1": (
+        "You are an expert customer support triage agent.\n"
+        "Given a support ticket, classify it by:\n"
+        "  category: one of billing | technical | account | feature_request | complaint | general\n"
+        "  priority: one of low | medium | high | critical\n\n"
+        "Respond with ONLY valid JSON:\n"
+        '{"action_type": "classify", "category": "<category>", "priority": "<priority>"}'
+    ),
+    "task2": (
+        "You are an expert information extraction agent for customer support.\n"
+        "Given a support ticket, extract ALL structured entities and identify required actions.\n\n"
+        "Respond with ONLY valid JSON:\n"
+        '{"action_type": "extract", "extracted_entities": {"key": "value", ...}, '
+        '"required_actions": ["action1", "action2", ...]}'
+    ),
+    "task3": (
+        "You are an expert customer support resolution agent.\n"
+        "Given a support ticket, write a professional customer-facing response and "
+        "list the internal resolution steps.\n\n"
+        "Requirements:\n"
+        "- response_text: Professional, empathetic response (80+ chars)\n"
+        "- resolution_steps: Ordered list of internal action identifiers\n"
+        "- If the ticket is urgent, acknowledge urgency and provide a timeline\n"
+        "- If appropriate, include an apology\n\n"
+        "Respond with ONLY valid JSON:\n"
+        '{"action_type": "respond", "response_text": "...", '
+        '"resolution_steps": ["step1", "step2", ...]}'
+    ),
 }
 
 
+def build_user_prompt(task_id: str, ticket: Dict[str, Any]) -> str:
+    parts = [
+        f"Ticket ID: {ticket['ticket_id']}",
+        f"Subject: {ticket['subject']}",
+        f"Body: {ticket['body']}",
+        f"Customer Tier: {ticket['customer_tier']}",
+        f"Account Age: {ticket['account_age_days']} days",
+        f"Previous Tickets: {ticket['previous_tickets']}",
+    ]
+    if ticket.get("attachments"):
+        parts.append(f"Attachments: {', '.join(ticket['attachments'])}")
+    return "\n".join(parts)
 
 
+# ---------------------------------------------------------------------------
+# LLM call
+# ---------------------------------------------------------------------------
+
+def call_llm(client: OpenAI, task_id: str, ticket: Dict[str, Any]) -> Dict[str, Any]:
+    """Call the LLM and parse its JSON response into an action dict."""
+    system_prompt = SYSTEM_PROMPTS[task_id]
+    user_prompt = build_user_prompt(task_id, ticket)
+
+    try:
+        completion = client.chat.completions.create(
+            model=MODEL_NAME,
+            messages=[
+                {"role": "system", "content": system_prompt},
+                {"role": "user", "content": user_prompt},
+            ],
+            temperature=TEMPERATURE,
+            max_tokens=MAX_TOKENS,
+        )
+        text = (completion.choices[0].message.content or "").strip()
+        return _parse_json(text, task_id)
+    except Exception as exc:
+        print(f"[DEBUG] LLM error: {exc}", file=sys.stderr, flush=True)
+        return _fallback_action(task_id)
+
+
+def _parse_json(text: str, task_id: str) -> Dict[str, Any]:
+    """Extract JSON from model output, handling markdown fences."""
+    if "```json" in text:
+        text = text.split("```json")[1].split("```")[0]
+    elif "```" in text:
+        text = text.split("```")[1].split("```")[0]
+    try:
+        return json.loads(text.strip())
+    except json.JSONDecodeError:
+        print(f"[DEBUG] JSON parse failed: {text[:120]}", file=sys.stderr, flush=True)
+        return _fallback_action(task_id)
+
+
+def _fallback_action(task_id: str) -> Dict[str, Any]:
+    """Deterministic fallback when LLM fails."""
+    if task_id == "task1":
+        return {"action_type": "classify", "category": "general", "priority": "medium"}
+    elif task_id == "task2":
+        return {"action_type": "extract", "extracted_entities": {}, "required_actions": []}
+    return {"action_type": "respond", "response_text": "Thank you for contacting support. We are looking into this.", "resolution_steps": []}
+
+
+# ---------------------------------------------------------------------------
+# Run one episode
+# ---------------------------------------------------------------------------
+
+def run_episode(
+    client: OpenAI, task_id: str, task_name: str, ticket_index: int
+) -> Dict[str, Any]:
+    """Run a single episode: reset → action → submit → grade."""
+    log_start(task=f"{task_name}-ticket{ticket_index}", env=BENCHMARK, model=MODEL_NAME)
+
+    rewards: List[float] = []
+    steps_taken = 0
+    success = False
+    error_msg: Optional[str] = None
+
+    try:
+        # Reset
+        obs = env_request("POST", "/reset", json={
+            "task_id": task_id, "ticket_index": ticket_index
+        })
+        episode_id = obs["episode_id"]
+        ticket = obs["ticket"]
+
+        # Step 1: LLM generates the action
+        action_data = call_llm(client, task_id, ticket)
+        result = env_request("POST", "/step", json={
+            "episode_id": episode_id, "action": action_data
+        })
+        steps_taken = 1
+        reward_val = result["reward"]["step_reward"]
+        rewards.append(reward_val)
+        done = result["done"]
+        action_summary = _action_summary(action_data)
+        log_step(step=1, action=action_summary, reward=reward_val, done=done, error=error_msg)
+
+        # Step 2: Submit if not already done
+        if not done:
+            submit_result = env_request("POST", "/step", json={
                "episode_id": episode_id,
+                "action": {"action_type": "submit"},
            })
+            steps_taken = 2
+            reward_val = submit_result["reward"]["step_reward"]
+            rewards.append(reward_val)
+            done = submit_result["done"]
+            log_step(step=2, action="submit()", reward=reward_val, done=done, error=None)
+
+        # Grade
+        grade = env_request("POST", "/grader", json={"episode_id": episode_id})
+        final_score = grade["score"]
+        success = final_score >= 0.5
+
+    except Exception as exc:
+        error_msg = str(exc)
+        print(f"[DEBUG] Episode error: {exc}", file=sys.stderr, flush=True)
+
+    log_end(success=success, steps=steps_taken, rewards=rewards)
+
    return {
        "task_id": task_id,
+        "ticket_index": ticket_index,
+        "steps": steps_taken,
+        "rewards": rewards,
+        "success": success,
    }
 
 
+def _action_summary(action: Dict[str, Any]) -> str:
+    atype = action.get("action_type", "unknown")
+    if atype == "classify":
+        return f"classify({action.get('category')},{action.get('priority')})"
+    elif atype == "extract":
+        ents = action.get("extracted_entities") or {}
+        acts = action.get("required_actions") or []
+        return f"extract({len(ents)}ents,{len(acts)}acts)"
+    elif atype == "respond":
+        tlen = len(action.get("response_text") or "")
+        slen = len(action.get("resolution_steps") or [])
+        return f"respond({tlen}chars,{slen}steps)"
+    return atype
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+def main() -> None:
+    client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
+
+    results = []
+    for task_info in TASKS:
+        task_id = task_info["task_id"]
+        task_name = task_info["name"]
+        num_tickets = task_info["tickets"]
+
+        for ticket_idx in range(num_tickets):
+            result = run_episode(client, task_id, task_name, ticket_idx)
+            results.append(result)
+            time.sleep(0.5)  # rate-limit courtesy
+
+    # Summary
+    print("\n" + "=" * 60, flush=True)
+    print("BASELINE RESULTS SUMMARY", flush=True)
+    print("=" * 60, flush=True)
+    for r in results:
+        status = "PASS" if r["success"] else "FAIL"
+        total_r = sum(r["rewards"])
+        print(
+            f"  {r['task_id']} ticket={r['ticket_index']} "
+            f"steps={r['steps']} reward={total_r:.2f} {status}",
+            flush=True,
+        )
 
 
 if __name__ == "__main__":
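
For reference, a passing task1 episode run through this script would emit stdout along these lines (values are illustrative; the grader score lands in the submit step's reward):

[START] task=Ticket Classification-ticket0 env=supportenv model=Qwen/Qwen2.5-72B-Instruct
[STEP] step=1 action=classify(billing,high) reward=0.02 done=false error=null
[STEP] step=2 action=submit() reward=1.00 done=true error=null
[END] success=true steps=2 rewards=0.02,1.00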
models.py CHANGED
@@ -1,7 +1,8 @@
 """
-Pydantic models for DevOpsEnv OpenEnv environment.
 
-Domain: Linux DevOps & SRE Troubleshooting
 """
 from __future__ import annotations
 
@@ -10,21 +11,18 @@ from pydantic import BaseModel, Field
 
 
 # ---------------------------------------------------------------------------
-# System State Models
 # ---------------------------------------------------------------------------
 
-class SystemState(BaseModel):
-    """Current state of the mock Linux server."""
-    task_id: str
-    available_commands: List[str]
-    filesystem_snapshot: str
-    running_processes: List[Dict[str, Any]]
-    service_status: Dict[str, str]
-    logs: str
-    http_ports_open: List[int]
-    docker_containers: List[Dict[str, str]]
-    cpu_usage: float
-    memory_usage_mb: int
 
 
 # ---------------------------------------------------------------------------
@@ -34,17 +32,17 @@ class SystemState(BaseModel):
 class Observation(BaseModel):
     """Everything the agent sees at each step."""
     task_id: str = Field(description="task1 | task2 | task3")
-    task_description: str = Field(description="Human-readable task description")
-    episode_id: str = Field(description="Unique episode UUID")
-    system_state: SystemState
     thread_history: List[Dict[str, str]] = Field(
         default_factory=list,
-        description="Ordered list of {'role': 'agent'|'system', 'content': str}"
     )
     available_actions: List[str]
     step_number: int
     max_steps: int
-    hint: Optional[str] = Field(default=None)
 
 
 # ---------------------------------------------------------------------------
@@ -52,12 +50,40 @@ class Observation(BaseModel):
 # ---------------------------------------------------------------------------
 
 class Action(BaseModel):
-    """Agent action: run a bash command, edit a file, or submit."""
-    action_type: str = Field(description="bash_cmd | file_edit | submit")
-    command: Optional[str] = Field(default=None, description="Bash command to execute")
-    file_path: Optional[str] = Field(default=None, description="Absolute path to file to edit")
-    file_content: Optional[str] = Field(default=None, description="New full content for the file")
-    summary: Optional[str] = Field(default=None, description="Final summary of actions taken")
 
 
# ---------------------------------------------------------------------------
@@ -94,7 +120,7 @@ class State(BaseModel):
     done: bool
     total_reward: float
     history: List[Dict[str, Any]] = Field(default_factory=list)
-    final_score: Optional[float] = Field(default=None)
 
 
 # ---------------------------------------------------------------------------
@@ -128,4 +154,4 @@ class BaselineResult(BaseModel):
     final_score: float
     step_count: int
     total_reward: float
-    actions: List[Dict[str, Any]]
1
  """
2
+ Pydantic models for SupportEnv Customer Support Ticket Triage.
3
 
4
+ Domain: SaaS customer support automation
5
+ Tasks: classification, information extraction, resolution generation
6
  """
7
  from __future__ import annotations
8
 
 
11
 
12
 
13
  # ---------------------------------------------------------------------------
14
+ # Ticket Info (what the agent sees)
15
  # ---------------------------------------------------------------------------
16
 
17
+ class TicketInfo(BaseModel):
18
+ """A customer support ticket presented to the agent."""
19
+ ticket_id: str
20
+ subject: str
21
+ body: str
22
+ customer_tier: str = Field(description="free | pro | enterprise")
23
+ account_age_days: int
24
+ previous_tickets: int
25
+ attachments: List[str] = Field(default_factory=list)
 
 
 
26
 
27
 
28
  # ---------------------------------------------------------------------------
 
32
  class Observation(BaseModel):
33
  """Everything the agent sees at each step."""
34
  task_id: str = Field(description="task1 | task2 | task3")
35
+ task_description: str
36
+ episode_id: str
37
+ ticket: TicketInfo
38
  thread_history: List[Dict[str, str]] = Field(
39
  default_factory=list,
40
+ description="Ordered list of {'role': 'agent'|'system', 'content': str}",
41
  )
42
  available_actions: List[str]
43
  step_number: int
44
  max_steps: int
45
+ hint: Optional[str] = None
46
 
47
 
48
  # ---------------------------------------------------------------------------
 
50
  # ---------------------------------------------------------------------------
51
 
52
  class Action(BaseModel):
53
+ """Agent action for support ticket processing."""
54
+ action_type: str = Field(
55
+ description="classify | extract | respond | resolve | escalate | submit"
56
+ )
57
+ # Task 1: Classification
58
+ category: Optional[str] = Field(
59
+ default=None,
60
+ description="billing | technical | account | feature_request | complaint | general",
61
+ )
62
+ priority: Optional[str] = Field(
63
+ default=None,
64
+ description="low | medium | high | critical",
65
+ )
66
+ # Task 2: Extraction
67
+ extracted_entities: Optional[Dict[str, Any]] = Field(
68
+ default=None,
69
+ description="Key-value pairs extracted from the ticket",
70
+ )
71
+ required_actions: Optional[List[str]] = Field(
72
+ default=None,
73
+ description="List of actions needed to resolve the ticket",
74
+ )
75
+ # Task 3: Resolution
76
+ response_text: Optional[str] = Field(
77
+ default=None,
78
+ description="Customer-facing response text",
79
+ )
80
+ resolution_steps: Optional[List[str]] = Field(
81
+ default=None,
82
+ description="Ordered list of internal resolution steps",
83
+ )
84
+ # Escalation
85
+ escalation_team: Optional[str] = Field(default=None)
86
+ escalation_reason: Optional[str] = Field(default=None)
87
 
88
 
89
  # ---------------------------------------------------------------------------
 
120
  done: bool
121
  total_reward: float
122
  history: List[Dict[str, Any]] = Field(default_factory=list)
123
+ final_score: Optional[float] = None
124
 
125
 
126
  # ---------------------------------------------------------------------------
 
154
  final_score: float
155
  step_count: int
156
  total_reward: float
157
+ actions: List[Dict[str, Any]]
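For orientation, a short sketch of how an agent might populate `Action` for each task. Field names come straight from the model above; the values are illustrative placeholders, not canonical answers:

    from models import Action

    # Task 1: classify the ticket
    a1 = Action(action_type="classify", category="billing", priority="high")

    # Task 2: extract entities and the actions needed to resolve
    a2 = Action(
        action_type="extract",
        extracted_entities={"account_id": "ACC-78234"},
        required_actions=["issue_refund"],
    )

    # Task 3: draft a customer-facing resolution
    a3 = Action(
        action_type="respond",
        response_text="We apologize for the billing error and will refund the difference.",
        resolution_steps=["verify_invoice", "issue_refund"],
    )

    # Escalate or finish the episode at any point
    a4 = Action(action_type="escalate", escalation_team="billing-ops",
                escalation_reason="refund above limit")
    a5 = Action(action_type="submit")

Because every task-specific field is `Optional` with a `None` default, a single `Action` schema serves all six `action_type` values; the environment is expected to read only the fields relevant to the chosen type.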
openenv.yaml CHANGED
@@ -57,14 +57,6 @@ interface:
       episode_id: string
     response: GraderResponse
 
-  baseline:
-    method: POST
-    path: /baseline
-    request:
-      model: string          # optional, default heuristic
-      ticket_index: integer  # optional, default 0
-    response: BaselineResult
-
   health:
     method: GET
     path: /health
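A quick liveness probe against the `/health` route declared above (a sketch; the hostname is a placeholder for your deployed Space URL):

    import requests

    # Placeholder host; substitute your own Space URL.
    resp = requests.get("https://<your-space>.hf.space/health", timeout=10)
    print(resp.status_code)  # 200 expected when the server is up

This mirrors the container HEALTHCHECK, which hits the same endpoint from inside the image.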
requirements.txt CHANGED
@@ -5,4 +5,3 @@ openai>=1.35.0
 httpx>=0.27.0
 python-multipart>=0.0.9
 requests>=2.31.0
-google-genai>=1.15.0
test_integration.py CHANGED
@@ -1,116 +1,112 @@
 """
-Quick integration test for DevOpsEnv.
+Integration test for SupportEnv.
 
-This runs a full episode for each task to verify everything works.
+Runs a full episode for each task and prints results.
+Usage: python test_integration.py
 """
 import environment as env
 from models import Action
 
 
-def test_task(task_id: str, max_test_steps: int = 5):
-    """Test a single task."""
+def test_task(task_id: str) -> bool:
+    """Run a full episode for a task. Returns True if passed."""
     print(f"\n{'='*60}")
     print(f"Testing {task_id}")
     print(f"{'='*60}")
-
+
     # Reset
-    print("1. Calling reset()...")
-    obs = env.reset(task_id)
+    print("1. reset()...")
+    obs = env.reset(task_id, ticket_index=0)
     episode_id = obs.episode_id
-    print(f"[OK] Episode created: {episode_id}")
-    print(f"   Task: {obs.task_description[:80]}...")
-    print(f"   Max steps: {obs.max_steps}")
-    print(f"   System state: cpu={obs.system_state.cpu_usage:.1f}%, mem={obs.system_state.memory_usage_mb}MB")
-
-    # Take steps
-    print(f"\n2. Taking {max_test_steps} steps...")
-    for i in range(max_test_steps):
-        if task_id == "task1":
-            if i == 0:
-                action = Action(action_type="bash_cmd", command="systemctl status nginx")
-            elif i == 1:
-                action = Action(action_type="bash_cmd", command="systemctl try-restart nginx")
-            elif i == 2:
-                action = Action(action_type="bash_cmd", command="nginx -t")
-            else:
-                action = Action(action_type="bash_cmd", command="curl http://localhost")
-        elif task_id == "task2":
-            if i == 0:
-                action = Action(action_type="bash_cmd", command="cat /srv/docker-compose.yml")
-            elif i == 1:
-                action = Action(
-                    action_type="file_edit",
-                    file_path="/srv/docker-compose.yml",
-                    file_content="version: '3.8'\nservices:\n  mockapi:\n    image: mockapi:latest\n    ports:\n      - \"3000:3000\""
-                )
-            elif i == 2:
-                action = Action(action_type="bash_cmd", command="docker-compose up -d")
-            else:
-                action = Action(action_type="bash_cmd", command="docker ps")
-        else:  # task3
-            if i == 0:
-                action = Action(action_type="bash_cmd", command="ps aux | grep python")
-            elif i == 1:
-                action = Action(action_type="bash_cmd", command="kill 300")
-            elif i == 2:
-                action = Action(
-                    action_type="file_edit",
-                    file_path="/opt/mockapi/app.py",
-                    file_content="import json\nfrom flask import Flask\n\napp = Flask(__name__)\n\n@app.route('/api/data')\ndef get_data():\n    return json.dumps({'status': 'ok'})\n\nif __name__ == '__main__':\n    app.run()\n"
-                )
-            else:
-                action = Action(action_type="bash_cmd", command="python3 /opt/mockapi/app.py &")
-
-        try:
-            result = env.step(episode_id, action)
-            print(f"   Step {i+1}: {action.action_type} - Reward: {result.reward.step_reward:+.3f}")
-            if result.done:
-                print(f"   -> Episode completed early")
-                break
-        except Exception as e:
-            print(f"   Step {i+1} ERROR: {e}")
-            break
-
-    # Check state
-    print(f"\n3. Calling get_state()...")
-    state = env.get_state(episode_id)
-    print(f"[OK] State: step_number={state.step_number}, total_reward={state.total_reward:.3f}, done={state.done}")
-
-    # Finish episode if not already done
-    if not state.done:
-        print(f"\n4. Calling submit()...")
+    print(f"   [OK] episode_id={episode_id[:8]}...")
+    print(f"   ticket_id={obs.ticket.ticket_id} subject={obs.ticket.subject[:50]}")
+    print(f"   max_steps={obs.max_steps} hint={obs.hint}")
+
+    # Take a relevant action
+    print("2. step() with task action...")
+    if task_id == "task1":
+        action = Action(action_type="classify", category="billing", priority="high")
+    elif task_id == "task2":
+        action = Action(
+            action_type="extract",
+            extracted_entities={"customer_name": "Robert Chen", "account_id": "ACC-78234"},
+            required_actions=["issue_refund"],
+        )
+    else:  # task3
+        action = Action(
+            action_type="respond",
+            response_text=(
+                "We sincerely apologize for the inconvenience with your password reset. "
+                "We will manually reset your password and send a new email immediately. "
+                "Please check your spam folder and whitelist our domain. "
+                "We will resolve this within the next 30 minutes."
+            ),
+            resolution_steps=[
+                "verify_email_delivery",
+                "check_spam_filters",
+                "manual_password_reset",
+                "follow_up_confirmation",
+            ],
+        )
+
+    result = env.step(episode_id, action)
+    print(f"   [OK] step_reward={result.reward.step_reward:+.4f} done={result.done}")
+
+    # Submit
+    print("3. step() submit...")
+    if not result.done:
         result = env.step(episode_id, Action(action_type="submit"))
-        print(f"[OK] Episode submitted, done={result.done}")
-
+    print(f"   [OK] done={result.done} total_reward={result.reward.total_reward:.4f}")
+
+    # State
+    print("4. get_state()...")
+    state = env.get_state(episode_id)
+    print(f"   [OK] steps={state.step_number} history_len={len(state.history)}")
+
     # Grade
-    print(f"\n5. Calling grade()...")
-    try:
-        score, breakdown, feedback = env.grade(episode_id)
-        print(f"[OK] Score: {score:.3f}/1.0")
-        print(f"   Breakdown: {breakdown}")
-        print(f"   Feedback: {feedback}")
-    except Exception as e:
-        print(f"[ERROR] Grading error: {e}")
+    print("5. grade()...")
+    score, breakdown, feedback = env.grade(episode_id)
+    print(f"   [OK] score={score:.4f}/1.0")
+    print(f"   breakdown: {', '.join(f'{k}={v:.2f}' for k, v in breakdown.items())}")
+    print(f"   feedback: {feedback}")
+
+    passed = score >= 0.0  # just verify pipeline works
+    return passed
 
 
 def main():
-    """Run all tests."""
-    print("DevOpsEnv Integration Test")
-    print("="*60)
-
-    try:
-        test_task("task1", max_test_steps=5)
-        test_task("task2", max_test_steps=5)
-        test_task("task3", max_test_steps=5)
-
-        print(f"\n{'='*60}")
-        print("[OK] All tests completed successfully!")
-        print(f"{'='*60}")
-    except Exception as e:
-        print(f"\n[ERROR] Test failed: {e}")
-        import traceback
-        traceback.print_exc()
+    print("SupportEnv Integration Test")
+    print("=" * 60)
+
+    results = []
+    for task_id in ["task1", "task2", "task3"]:
+        try:
+            ok = test_task(task_id)
+            results.append((task_id, ok, None))
+        except Exception as exc:
+            import traceback
+            traceback.print_exc()
+            results.append((task_id, False, str(exc)))
+        finally:
+            env._EPISODES.clear()
+
+    print(f"\n{'='*60}")
+    print("SUMMARY")
+    print("=" * 60)
+    all_ok = True
+    for task_id, ok, err in results:
+        status = "[PASS]" if ok else "[FAIL]"
+        print(f"  {status} {task_id}" + (f" — {err}" if err else ""))
+        if not ok:
+            all_ok = False
+
+    if all_ok:
+        print("\n[OK] All integration tests passed!")
+    else:
+        print("\n[FAIL] Some tests failed.")
+    return 0 if all_ok else 1
 
 
 if __name__ == "__main__":
-    main()
+    import sys
+    sys.exit(main())
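Condensed, the five-call episode lifecycle this integration test exercises looks like the sketch below (in-process, using only functions that appear in the diff above; the classify values are illustrative):

    import environment as env
    from models import Action

    obs = env.reset("task1", ticket_index=0)                    # 1. new episode
    env.step(obs.episode_id, Action(action_type="classify",
                                    category="billing",
                                    priority="high"))           # 2. act on the ticket
    env.step(obs.episode_id, Action(action_type="submit"))      # 3. finish the episode
    state = env.get_state(obs.episode_id)                       # 4. inspect accumulated state
    score, breakdown, feedback = env.grade(obs.episode_id)      # 5. grade the finished episode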
tests_new.py CHANGED
@@ -1,11 +1,9 @@
 """
-Comprehensive tests for DevOpsEnv.
+Comprehensive tests for SupportEnv.
 
-Run with: pytest tests/test_environment.py -v
+Run with: pytest tests_new.py -v
 """
 import pytest
-import json
-from unittest.mock import patch
 
 import environment as env
 from models import Action, Observation, StepResult, State
@@ -14,232 +12,245 @@ from data import TASK_META
 
 class TestReset:
     """Test episode reset functionality."""
-
-    def test_reset_valid_task(self):
-        """Reset creates a valid episode."""
+
+    def test_reset_task1(self):
         obs = env.reset("task1")
-
         assert isinstance(obs, Observation)
         assert obs.task_id == "task1"
         assert obs.episode_id is not None
-        assert len(obs.episode_id) > 0
-        assert obs.system_state is not None
+        assert obs.ticket is not None
         assert obs.step_number == 0
         assert obs.max_steps == TASK_META["task1"]["max_steps"]
-
+
+    def test_reset_task2(self):
+        obs = env.reset("task2")
+        assert obs.task_id == "task2"
+        assert obs.ticket.ticket_id.startswith("T2-")
+
+    def test_reset_task3(self):
+        obs = env.reset("task3")
+        assert obs.task_id == "task3"
+        assert obs.ticket.ticket_id.startswith("T3-")
+
+    def test_reset_with_ticket_index(self):
+        obs = env.reset("task1", ticket_index=2)
+        assert obs.ticket.ticket_id == "T1-003"
+
     def test_reset_invalid_task(self):
-        """Reset raises error for unknown task."""
         with pytest.raises(ValueError):
-            env.reset("invalid_task")
-
-    def test_reset_creates_episode_state(self):
-        """Reset creates episode in internal state."""
-        obs = env.reset("task2")
-
+            env.reset("task_unknown")
+
+    def test_reset_invalid_ticket_index(self):
+        with pytest.raises(ValueError):
+            env.reset("task1", ticket_index=99)
+
+    def test_reset_creates_episode(self):
+        obs = env.reset("task1")
         assert obs.episode_id in env._EPISODES
         ep = env._EPISODES[obs.episode_id]
-        assert ep["task_id"] == "task2"
+        assert ep["task_id"] == "task1"
         assert ep["step_number"] == 0
         assert ep["done"] is False
 
+    def test_reset_hint_on_first_step(self):
+        obs = env.reset("task1")
+        assert obs.hint is not None
+
 
 class TestStep:
     """Test step execution."""
-
-    def test_step_bash_command(self):
-        """Step handles bash_cmd action."""
+
+    def test_step_classify(self):
         obs = env.reset("task1")
-
-        action = Action(action_type="bash_cmd", command="systemctl status nginx")
+        action = Action(action_type="classify", category="billing", priority="high")
         result = env.step(obs.episode_id, action)
-
         assert isinstance(result, StepResult)
         assert result.observation.step_number == 1
-        assert result.reward.step_reward != 0
-        assert result.done is False
-
-    def test_step_file_edit(self):
-        """Step handles file_edit action."""
+        assert result.reward.step_reward is not None
+
+    def test_step_extract(self):
         obs = env.reset("task2")
-
         action = Action(
-            action_type="file_edit",
-            file_path="/srv/docker-compose.yml",
-            file_content="version: '3.8'\nservices:\n  test: {}"
+            action_type="extract",
+            extracted_entities={"customer_name": "Alice"},
+            required_actions=["issue_refund"],
         )
         result = env.step(obs.episode_id, action)
-
-        assert isinstance(result, StepResult)
         assert result.observation.step_number == 1
-
-    def test_step_submit(self):
-        """Step with submit action marks episode done."""
-        obs = env.reset("task1")
-
-        action = Action(action_type="submit", summary="Done")
+
+    def test_step_respond(self):
+        obs = env.reset("task3")
+        action = Action(
+            action_type="respond",
+            response_text="Thank you for reaching out. We sincerely apologize for the inconvenience and will resolve this immediately.",
+            resolution_steps=["verify_account", "issue_refund"],
+        )
         result = env.step(obs.episode_id, action)
-
+        assert result.observation.step_number == 1
+
+    def test_step_submit_marks_done(self):
+        obs = env.reset("task1")
+        result = env.step(obs.episode_id, Action(action_type="submit"))
         assert result.done is True
         assert result.observation.available_actions == []
-
+
     def test_step_invalid_episode(self):
-        """Step raises error for invalid episode."""
-        action = Action(action_type="bash_cmd", command="ls")
-
         with pytest.raises(KeyError):
-            env.step("invalid_episode_id", action)
-
-    def test_step_after_done(self):
-        """Step raises error after episode is done."""
+            env.step("nonexistent-id", Action(action_type="submit"))
+
+    def test_step_after_done_raises(self):
         obs = env.reset("task1")
-
-        # End the episode
-        action1 = Action(action_type="submit")
-        env.step(obs.episode_id, action1)
-
-        # Try to step again
+        env.step(obs.episode_id, Action(action_type="submit"))
         with pytest.raises(ValueError):
-            env.step(obs.episode_id, action1)
-
-    def test_step_max_steps_limit(self):
-        """Episode ends after max_steps."""
+            env.step(obs.episode_id, Action(action_type="submit"))
+
+    def test_step_max_steps_ends_episode(self):
         obs = env.reset("task1")
         max_steps = obs.max_steps
-
         for i in range(max_steps):
-            action = Action(action_type="bash_cmd", command="ps aux")
+            action = Action(action_type="classify", category="general", priority="low")
             result = env.step(obs.episode_id, action)
-
-            if i < max_steps - 1:
-                assert result.done is False
-            else:
-                assert result.done is True
+        assert result.done is True
+
+    def test_thread_history_grows(self):
+        obs = env.reset("task1")
+        env.step(obs.episode_id, Action(action_type="classify", category="billing", priority="high"))
+        result = env.step(obs.episode_id, Action(action_type="submit"))
+        assert len(result.observation.thread_history) == 2
 
 
 class TestState:
     """Test state retrieval."""
-
-    def test_get_state(self):
-        """get_state returns current episode state."""
-        obs = env.reset("task3")
-
+
+    def test_get_state_initial(self):
+        obs = env.reset("task1")
         state = env.get_state(obs.episode_id)
-
         assert isinstance(state, State)
         assert state.episode_id == obs.episode_id
-        assert state.task_id == "task3"
+        assert state.task_id == "task1"
         assert state.step_number == 0
         assert state.done is False
-
-    def test_get_state_invalid_episode(self):
-        """get_state raises error for invalid episode."""
+
+    def test_get_state_invalid(self):
         with pytest.raises(KeyError):
-            env.get_state("invalid_id")
-
-    def test_state_history(self):
-        """State includes action history."""
-        obs = env.reset("task1")
-
-        # Take Actions
-        action1 = Action(action_type="bash_cmd", command="ps aux")
-        env.step(obs.episode_id, action1)
-
+            env.get_state("bad-id")
+
+    def test_state_history_after_step(self):
+        obs = env.reset("task2")
+        env.step(obs.episode_id, Action(action_type="extract", extracted_entities={}, required_actions=[]))
         state = env.get_state(obs.episode_id)
-
         assert len(state.history) == 1
-        assert state.history[0]["action_type"] == "bash_cmd"
+        assert state.history[0]["action_type"] == "extract"
 
 
-class TestGrading:
-    """Test episode grading."""
-
-    def test_grade_task1_nginx_running(self):
-        """Task 1 grades based on nginx status."""
-        obs = env.reset("task1")
-
-        # Run commands to fix nginx
-        env.step(obs.episode_id, Action(action_type="bash_cmd", command="systemctl restart nginx"))
-        env.step(obs.episode_id, Action(action_type="bash_cmd", command="nginx -t"))
-        env.step(obs.episode_id, Action(action_type="bash_cmd", command="curl http://localhost"))
+class TestGraders:
+    """Test grading for each task."""
+
+    def test_grade_task1_perfect(self):
+        """Correct category + priority on ticket 0 (billing/high)."""
+        obs = env.reset("task1", ticket_index=0)
+        env.step(obs.episode_id, Action(action_type="classify", category="billing", priority="high"))
         env.step(obs.episode_id, Action(action_type="submit"))
-
         score, breakdown, feedback = env.grade(obs.episode_id)
-
-        assert 0.0 <= score <= 1.0
-        assert "nginx_running" in breakdown
-        assert "config_valid" in breakdown
-        assert "http_200" in breakdown
-
-    def test_grade_invalid_episode(self):
-        """grade raises error for invalid episode."""
-        with pytest.raises(KeyError):
-            env.grade("invalid_id")
-
-    def test_grade_not_done(self):
-        """grade raises error if episode not done."""
+        assert score >= 0.9
+        assert breakdown["category_correct"] == 0.50
+        assert breakdown["priority_correct"] == 0.40
+
+    def test_grade_task1_wrong_category(self):
+        obs = env.reset("task1", ticket_index=0)
+        env.step(obs.episode_id, Action(action_type="classify", category="technical", priority="high"))
+        env.step(obs.episode_id, Action(action_type="submit"))
+        score, breakdown, _ = env.grade(obs.episode_id)
+        assert breakdown["category_correct"] == 0.0
+        assert breakdown["priority_correct"] == 0.40
+
+    def test_grade_task1_no_classify_action(self):
+        obs = env.reset("task1")
+        env.step(obs.episode_id, Action(action_type="submit"))
+        score, _, _ = env.grade(obs.episode_id)
+        assert score == 0.0
+
+    def test_grade_task2_entities(self):
+        obs = env.reset("task2", ticket_index=0)
+        env.step(obs.episode_id, Action(
+            action_type="extract",
+            extracted_entities={
+                "customer_name": "Robert Chen",
+                "account_id": "ACC-78234",
+                "invoice_number": "INV-20240312",
+                "incorrect_amount": "199.00",
+                "correct_amount": "99.00",
+                "refund_amount": "100.00",
+            },
+            required_actions=["issue_refund", "send_corrected_invoice"],
+        ))
+        env.step(obs.episode_id, Action(action_type="submit"))
+        score, breakdown, _ = env.grade(obs.episode_id)
+        assert breakdown["entity_coverage"] == pytest.approx(0.60, abs=0.01)
+        assert breakdown["action_coverage"] == pytest.approx(0.30, abs=0.01)
+
+    def test_grade_task3_keywords_and_steps(self):
+        obs = env.reset("task3", ticket_index=0)
+        env.step(obs.episode_id, Action(
+            action_type="respond",
+            response_text=(
+                "We sincerely apologize for the password reset issue. "
+                "We will send a new reset email and ask you to check your spam folder "
+                "and whitelist our domain. We will have this resolved within the hour."
+            ),
+            resolution_steps=[
+                "verify_email_delivery",
+                "check_spam_filters",
+                "manual_password_reset",
+                "follow_up_confirmation",
+            ],
+        ))
+        env.step(obs.episode_id, Action(action_type="submit"))
+        score, breakdown, _ = env.grade(obs.episode_id)
+        assert score >= 0.7
+        assert breakdown["length_adequate"] == 0.10
+        assert breakdown["no_empty_steps"] == 0.05
+
+    def test_grade_not_done_raises(self):
        obs = env.reset("task1")
-        # Don't finish the episode
-
         with pytest.raises(ValueError):
             env.grade(obs.episode_id)
 
+    def test_grade_invalid_episode_raises(self):
+        with pytest.raises(KeyError):
+            env.grade("bad-id")
 
-class TestSystemSimulation:
-    """Test mock system state simulation."""
-
-    def test_task1_initial_state(self):
-        """Task 1 initializes with nginx crashed."""
-        obs = env.reset("task1")
-
-        assert obs.system_state.service_status.get("nginx") == "inactive"
-        assert 80 not in obs.system_state.http_ports_open
-
-    def test_task2_initial_state(self):
-        """Task 2 initializes with docker misconfigured."""
-        obs = env.reset("task2")
-
-        assert obs.system_state.service_status.get("docker") == "active"
-        assert 80 in obs.system_state.http_ports_open
-
-    def test_task3_initial_state(self):
-        """Task 3 initializes with memory leak."""
-        obs = env.reset("task3")
-
-        assert obs.system_state.service_status.get("mockapi") == "active"
-        # Should have high memory usage
-        assert obs.system_state.memory_usage_mb > 1024
+    def test_score_in_range(self):
+        for task_id in ["task1", "task2", "task3"]:
+            obs = env.reset(task_id)
+            env.step(obs.episode_id, Action(action_type="submit"))
+            score, _, _ = env.grade(obs.episode_id)
+            assert 0.0 <= score <= 1.0
 
 
 class TestRewards:
-    """Test reward calculation."""
-
-    def test_step_reward_positive(self):
-        """Taking actions yields positive reward."""
+    """Test reward signals."""
+
+    def test_step_reward_is_float(self):
         obs = env.reset("task1")
-
-        action = Action(action_type="bash_cmd", command="ps aux")
-        result = env.step(obs.episode_id, action)
-
-        assert result.reward.step_reward > -1.0  # Not all negative
-
-    def test_total_reward_accumulation(self):
-        """Total reward accumulates across steps."""
+        result = env.step(obs.episode_id, Action(action_type="classify", category="billing", priority="high"))
+        assert isinstance(result.reward.step_reward, float)
+
+    def test_total_reward_accumulates(self):
+        obs = env.reset("task2")
+        r1 = env.step(obs.episode_id, Action(action_type="extract", extracted_entities={}, required_actions=[]))
+        r2 = env.step(obs.episode_id, Action(action_type="submit"))
+        assert r2.reward.total_reward != r1.reward.total_reward
+
+    def test_submit_bonus_applied(self):
         obs = env.reset("task1")
-
-        env.step(obs.episode_id, Action(action_type="bash_cmd", command="ps aux"))
-        result1 = env.step(obs.episode_id, Action(action_type="bash_cmd", command="ls"))
-        total1 = result1.reward.total_reward
-
-        result2 = env.step(obs.episode_id, Action(action_type="bash_cmd", command="pwd"))
-        total2 = result2.reward.total_reward
-
-        # Total reward should accumulate
-        assert total2 >= total1 or total2 < total1  # Can go either way depending on grader
+        result = env.step(obs.episode_id, Action(action_type="submit"))
+        # submit_bonus=0.05 minus step_cost=0.02 = +0.03 base before grader
+        assert result.reward.step_reward > 0.0
 
 
 @pytest.fixture(autouse=True)
 def cleanup():
-    """Clean up episodes after each test."""
     yield
     env._EPISODES.clear()
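While iterating on a single grader, the suite can be narrowed to one class or test with pytest's standard selectors; a convenience sketch using the programmatic entry point:

    import pytest

    # Equivalent to: pytest tests_new.py::TestGraders -v
    pytest.main(["tests_new.py::TestGraders", "-v"])

The autouse `cleanup` fixture clears `env._EPISODES` after every test, so tests stay independent regardless of which subset runs.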