Shivoo29 committed
Commit ceec48c · 1 Parent(s): 5b64237
Files changed (16)
  1. .claude/settings.json +19 -0
  2. Dockerfile +2 -21
  3. Pre_Validation_Script.sh +185 -0
  4. README.md +103 -139
  5. Sample_Inference_Script.py +187 -0
  6. app.py +21 -25
  7. baseline.py +0 -309
  8. data.py +5 -82
  9. environment.py +149 -347
  10. graders.py +225 -172
  11. inference.py +262 -323
  12. models.py +54 -28
  13. openenv.yaml +0 -8
  14. requirements.txt +0 -1
  15. test_integration.py +92 -96
  16. tests_new.py +175 -164
.claude/settings.json ADDED
@@ -0,0 +1,19 @@
+ {
+   "permissions": {
+     "allow": [
+       "Bash(python -m pytest tests_new.py -v)",
+       "Bash(pip install:*)",
+       "Bash(pytest tests_new.py -v)",
+       "Bash(/var/data/python/bin/pytest tests_new.py -v)",
+       "Bash(python test_integration.py)",
+       "Bash(docker build:*)",
+       "Bash(/var/data/python/bin/pytest tests_new.py -v --tb=short)",
+       "WebFetch(domain:exploring-solver-openenv-solvor.hf.space)",
+       "Bash(curl -s -o /tmp/reset_empty.json -w 'HTTP %{http_code}' -X POST -H 'Content-Type: application/json' -d '{}' https://exploring-solver-openenv-solvor.hf.space/reset --max-time 30)",
+       "Read(//tmp/**)",
+       "Bash(curl -s -X POST -H 'Content-Type: application/json' -d '{\"task_id\":\"task1\"}' https://exploring-solver-openenv-solvor.hf.space/reset --max-time 30)",
+       "Bash(python3 -m json.tool)",
+       "Bash(python3 test_integration.py)"
+     ]
+   }
+ }
Dockerfile CHANGED
@@ -1,42 +1,23 @@
  # ---------------------------------------------------------------
- # DevOpsEnv — Hugging Face Spaces Docker container
  # Space SDK: Docker | Port: 7860
  # ---------------------------------------------------------------
  FROM python:3.11-slim

- # Install system utilities for DevOps tasks
- RUN apt-get update && apt-get install -y --no-install-recommends \
-     nginx \
-     docker.io \
-     systemctl \
-     curl \
-     git \
-     vim \
-     && rm -rf /var/lib/apt/lists/*
-
- # Create non-root user for Hugging Face Spaces
  RUN useradd -m -u 1000 appuser

  WORKDIR /app

- # Install Python dependencies first (layer caching)
  COPY requirements.txt .
  RUN pip install --no-cache-dir -r requirements.txt

- # Copy application code
  COPY --chown=appuser:appuser . .

- # HF Spaces compatibility
- RUN chmod +x /app/app.py 2>/dev/null || true
-
  USER appuser

- # Expose the port HF Spaces expects
  EXPOSE 7860

- # Health check
  HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
      CMD python -c "import requests; requests.get('http://localhost:7860/health')" || exit 1

- # Start the FastAPI server
- CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1", "--reload"]

  # ---------------------------------------------------------------
+ # SupportEnv — Hugging Face Spaces Docker container
  # Space SDK: Docker | Port: 7860
  # ---------------------------------------------------------------
  FROM python:3.11-slim

  RUN useradd -m -u 1000 appuser

  WORKDIR /app

  COPY requirements.txt .
  RUN pip install --no-cache-dir -r requirements.txt

  COPY --chown=appuser:appuser . .

  USER appuser

  EXPOSE 7860

  HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
      CMD python -c "import requests; requests.get('http://localhost:7860/health')" || exit 1

+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]
 
Pre_Validation_Script.sh ADDED
@@ -0,0 +1,185 @@
+ #!/usr/bin/env bash
+ #
+ # validate-submission.sh — OpenEnv Submission Validator
+ #
+ # Checks that your HF Space is live, Docker image builds, and openenv validate passes.
+ #
+ # Prerequisites:
+ #   - Docker: https://docs.docker.com/get-docker/
+ #   - openenv-core: pip install openenv-core
+ #   - curl (usually pre-installed)
+ #
+ # Run:
+ #   curl -fsSL https://raw.githubusercontent.com/<owner>/<repo>/main/scripts/validate-submission.sh | bash -s -- <ping_url> [repo_dir]
+ #
+ # Or download and run locally:
+ #   chmod +x validate-submission.sh
+ #   ./validate-submission.sh <ping_url> [repo_dir]
+ #
+ # Arguments:
+ #   ping_url   Your HuggingFace Space URL (e.g. https://your-space.hf.space)
+ #   repo_dir   Path to your repo (default: current directory)
+ #
+ # Examples:
+ #   ./validate-submission.sh https://my-team.hf.space
+ #   ./validate-submission.sh https://my-team.hf.space ./my-repo
+ #
+
+ set -uo pipefail
+
+ DOCKER_BUILD_TIMEOUT=600
+ if [ -t 1 ]; then
+     RED='\033[0;31m'
+     GREEN='\033[0;32m'
+     YELLOW='\033[1;33m'
+     BOLD='\033[1m'
+     NC='\033[0m'
+ else
+     RED='' GREEN='' YELLOW='' BOLD='' NC=''
+ fi
+
+ run_with_timeout() {
+     local secs="$1"; shift
+     if command -v timeout &>/dev/null; then
+         timeout "$secs" "$@"
+     elif command -v gtimeout &>/dev/null; then
+         gtimeout "$secs" "$@"
+     else
+         "$@" &
+         local pid=$!
+         ( sleep "$secs" && kill "$pid" 2>/dev/null ) &
+         local watcher=$!
+         wait "$pid" 2>/dev/null
+         local rc=$?
+         kill "$watcher" 2>/dev/null
+         wait "$watcher" 2>/dev/null
+         return $rc
+     fi
+ }
+
+ portable_mktemp() {
+     local prefix="${1:-validate}"
+     mktemp "${TMPDIR:-/tmp}/${prefix}-XXXXXX" 2>/dev/null || mktemp
+ }
+
+ CLEANUP_FILES=()
+ cleanup() { rm -f "${CLEANUP_FILES[@]+"${CLEANUP_FILES[@]}"}"; }
+ trap cleanup EXIT
+
+ PING_URL="${1:-}"
+ REPO_DIR="${2:-.}"
+
+ if [ -z "$PING_URL" ]; then
+     printf "Usage: %s <ping_url> [repo_dir]\n" "$0"
+     printf "\n"
+     printf "  ping_url   Your HuggingFace Space URL (e.g. https://your-space.hf.space)\n"
+     printf "  repo_dir   Path to your repo (default: current directory)\n"
+     exit 1
+ fi
+
+ if ! REPO_DIR="$(cd "$REPO_DIR" 2>/dev/null && pwd)"; then
+     printf "Error: directory '%s' not found\n" "${2:-.}"
+     exit 1
+ fi
+ PING_URL="${PING_URL%/}"
+ export PING_URL
+ PASS=0
+
+ log() { printf "[%s] %b\n" "$(date -u +%H:%M:%S)" "$*"; }
+ pass() { log "${GREEN}PASSED${NC} -- $1"; PASS=$((PASS + 1)); }
+ fail() { log "${RED}FAILED${NC} -- $1"; }
+ hint() { printf "  ${YELLOW}Hint:${NC} %b\n" "$1"; }
+ stop_at() {
+     printf "\n"
+     printf "${RED}${BOLD}Validation stopped at %s.${NC} Fix the above before continuing.\n" "$1"
+     exit 1
+ }
+
+ printf "\n"
+ printf "${BOLD}========================================${NC}\n"
+ printf "${BOLD}  OpenEnv Submission Validator${NC}\n"
+ printf "${BOLD}========================================${NC}\n"
+ log "Repo: $REPO_DIR"
+ log "Ping URL: $PING_URL"
+ printf "\n"
+
+ log "${BOLD}Step 1/3: Pinging HF Space${NC} ($PING_URL/reset) ..."
+
+ CURL_OUTPUT=$(portable_mktemp "validate-curl")
+ CLEANUP_FILES+=("$CURL_OUTPUT")
+ HTTP_CODE=$(curl -s -o "$CURL_OUTPUT" -w "%{http_code}" -X POST \
+     -H "Content-Type: application/json" -d '{}' \
+     "$PING_URL/reset" --max-time 30 2>/dev/null) || HTTP_CODE="000"
+
+ if [ "$HTTP_CODE" = "200" ]; then
+     pass "HF Space is live and responds to /reset"
+ elif [ "$HTTP_CODE" = "000" ]; then
+     fail "HF Space not reachable (connection failed or timed out)"
+     hint "Check your network connection and that the Space is running."
+     hint "Try: curl -s -o /dev/null -w '%{http_code}' -X POST $PING_URL/reset"
+     stop_at "Step 1"
+ else
+     fail "HF Space /reset returned HTTP $HTTP_CODE (expected 200)"
+     hint "Make sure your Space is running and the URL is correct."
+     hint "Try opening $PING_URL in your browser first."
+     stop_at "Step 1"
+ fi
+
+ log "${BOLD}Step 2/3: Running docker build${NC} ..."
+
+ if ! command -v docker &>/dev/null; then
+     fail "docker command not found"
+     hint "Install Docker: https://docs.docker.com/get-docker/"
+     stop_at "Step 2"
+ fi
+
+ if [ -f "$REPO_DIR/Dockerfile" ]; then
+     DOCKER_CONTEXT="$REPO_DIR"
+ elif [ -f "$REPO_DIR/server/Dockerfile" ]; then
+     DOCKER_CONTEXT="$REPO_DIR/server"
+ else
+     fail "No Dockerfile found in repo root or server/ directory"
+     stop_at "Step 2"
+ fi
+
+ log "  Found Dockerfile in $DOCKER_CONTEXT"
+
+ BUILD_OK=false
+ BUILD_OUTPUT=$(run_with_timeout "$DOCKER_BUILD_TIMEOUT" docker build "$DOCKER_CONTEXT" 2>&1) && BUILD_OK=true
+
+ if [ "$BUILD_OK" = true ]; then
+     pass "Docker build succeeded"
+ else
+     fail "Docker build failed (timeout=${DOCKER_BUILD_TIMEOUT}s)"
+     printf "%s\n" "$BUILD_OUTPUT" | tail -20
+     stop_at "Step 2"
+ fi
+
+ log "${BOLD}Step 3/3: Running openenv validate${NC} ..."
+
+ if ! command -v openenv &>/dev/null; then
+     fail "openenv command not found"
+     hint "Install it: pip install openenv-core"
+     stop_at "Step 3"
+ fi
+
+ VALIDATE_OK=false
+ VALIDATE_OUTPUT=$(cd "$REPO_DIR" && openenv validate 2>&1) && VALIDATE_OK=true
+
+ if [ "$VALIDATE_OK" = true ]; then
+     pass "openenv validate passed"
+     [ -n "$VALIDATE_OUTPUT" ] && log "  $VALIDATE_OUTPUT"
+ else
+     fail "openenv validate failed"
+     printf "%s\n" "$VALIDATE_OUTPUT"
+     stop_at "Step 3"
+ fi
+
+ printf "\n"
+ printf "${BOLD}========================================${NC}\n"
+ printf "${GREEN}${BOLD}  All 3/3 checks passed!${NC}\n"
+ printf "${GREEN}${BOLD}  Your submission is ready to submit.${NC}\n"
+ printf "${BOLD}========================================${NC}\n"
+ printf "\n"
+
+ exit 0
README.md CHANGED
@@ -1,161 +1,125 @@
- ---
- title: DevOpsEnv
- emoji: 🛠️
  colorFrom: blue
- colorTo: green
  sdk: docker
  app_port: 7860
  tags:
  - openenv
- - devops
- - sre
- - troubleshooting
  - agent-evaluation
  pinned: false
  ---

- # DevOpsEnv
-
- DevOpsEnv is a practice environment where an agent acts like a junior SRE.
-
- In each episode, the agent gets a broken Linux-like system and must fix it by:
- - Running shell commands
- - Editing files
- - Submitting when the fix is done
-
- The server gives rewards during the episode and a final score at the end.
-
- ## What It Simulates (Simple)
-
- There are 3 tasks:
- - Task 1: Nginx is down. Bring service back and verify HTTP is OK.
- - Task 2: Docker compose port mapping is wrong. Fix and redeploy.
- - Task 3: Python API has memory leak behavior. Diagnose and reduce memory usage.
-
- ## How It Works
-
- Step by step:
- 1. Call POST /reset with task_id.
- 2. You get episode_id plus current system_state.
- 3. Call POST /step with an action.
- 4. Repeat steps until done, or send action_type submit.
- 5. Call POST /grader to get final score and breakdown.
-
- Main endpoints:
- - GET /health
- - GET /tasks
- - POST /reset
- - POST /step
- - GET /state
- - POST /grader
-
- ## Action Types
-
- - bash_cmd: Run a command like systemctl status nginx
- - file_edit: Replace content of a file path
- - submit: End episode and grade
-
- ## Quick Start (Normal)
-
- ### 1) Install
-
- Windows PowerShell:
-
- python -m pip install -r requirements.txt
-
- ### 2) Start server
-
- python -m uvicorn app:app --host 0.0.0.0 --port 7860
-
- ### 3) Check health
-
- In another terminal:
-
- Invoke-WebRequest -Uri "http://127.0.0.1:7860/health" -UseBasicParsing
-
- If working, response includes status: healthy.
-
- ### 4) Run built-in integration test
-
- python test_integration.py
-
- If working, you should see all 3 tasks run and a final success message.
-
- ## Minimal API Example (Normal)
-
- PowerShell example:
-
- $reset = Invoke-WebRequest -Uri "http://127.0.0.1:7860/reset" -Method POST -ContentType "application/json" -Body '{"task_id":"task1"}' | Select-Object -ExpandProperty Content | ConvertFrom-Json
- $episodeId = $reset.episode_id
-
- $step = @{
-     episode_id = $episodeId
-     action = @{
-         action_type = "bash_cmd"
-         command = "systemctl restart nginx"
      }
- } | ConvertTo-Json -Depth 5
-
- Invoke-WebRequest -Uri "http://127.0.0.1:7860/step" -Method POST -ContentType "application/json" -Body $step
-
- ## Test With LLM (OpenAI Key)
-
- 1) Keep API server running.
- 2) Set key and run inference:

- PowerShell:

- $env:OPENAI_API_KEY = "your-openai-key"
- python inference.py --task task1 --model gpt-4o-mini

- You should see step logs, rewards, and a grader score.

- ## Test With Gemini API Key

- inference.py now supports OpenAI-compatible base URLs.

- Use Gemini via OpenAI-compatible endpoint:

- PowerShell:

- $env:GEMINI_API_KEY = "your-gemini-key"
- $env:OPENAI_BASE_URL = "https://generativelanguage.googleapis.com/v1beta/openai/"
- python inference.py --task task1 --model gemini-2.5-flash

- Notes:
- - You can also use OPENAI_API_KEY instead of GEMINI_API_KEY.
- - If your model name is unavailable, switch to a Gemini model enabled on your key.
- - Keep the environment server running at http://127.0.0.1:7860 (or pass --api-url).

  ## Docker

- Build:
-
- docker build -t devopsenv .
-
- Run:
-
- docker run -p 7860:7860 devopsenv
-
- Then open:
- - http://127.0.0.1:7860/health
- - http://127.0.0.1:7860/docs
-
- ## Project Files
-
- - app.py: FastAPI API
- - environment.py: episode logic and simulator
- - graders.py: deterministic scoring
- - data.py: task metadata
- - models.py: Pydantic schemas
- - inference.py: LLM baseline runner
- - test_integration.py: local end-to-end check
-
- ## Troubleshooting
-
- - Port already in use:
-   - change server port or stop old process.
- - 400/404 from API:
-   - check episode_id and task_id values.
- - LLM errors:
-   - verify API key, model name, and OPENAI_BASE_URL for Gemini.
 
+ ---
+ title: SupportEnv
+ emoji: 🎫
  colorFrom: blue
+ colorTo: indigo
  sdk: docker
  app_port: 7860
  tags:
  - openenv
+ - customer-support
+ - nlp
+ - ticket-triage
  - agent-evaluation
  pinned: false
  ---

+ # SupportEnv
+
+ SupportEnv is an OpenEnv-compliant environment for evaluating LLM agents on customer support ticket triage. Each episode presents a realistic support ticket and asks the agent to classify, extract, or resolve it — scored deterministically against ground-truth labels.
+
+ ## Tasks
+
+ | Task | Difficulty | Action | Max Steps |
+ |------|-----------|--------|-----------|
+ | Task 1 — Ticket Classification | Easy | `classify` | 3 |
+ | Task 2 — Information Extraction | Medium | `extract` | 5 |
+ | Task 3 — Resolution Generation | Hard | `respond` | 8 |
+
+ **Task 1 — Ticket Classification (Easy)**
+ Assign a `category` (billing / technical / account / feature_request / complaint / general) and `priority` (low / medium / high / critical) to each ticket.
+
+ **Task 2 — Information Extraction (Medium)**
+ Extract structured entities (IDs, names, amounts, dates) and identify the list of required resolution actions.
+
+ **Task 3 — Resolution Generation (Hard)**
+ Write a professional customer-facing response and an ordered list of internal resolution steps. Graded on keyword coverage, step completeness, tone adherence, and minimum length.
+
+ ## API
+
+ | Method | Endpoint | Description |
+ |--------|----------|-------------|
+ | `POST` | `/reset` | Start a new episode |
+ | `POST` | `/step` | Submit an action |
+ | `GET` | `/state` | Get current episode state |
+ | `POST` | `/grader` | Grade a finished episode |
+ | `GET` | `/tasks` | List all tasks |
+ | `GET` | `/health` | Liveness check |
+ | `GET` | `/docs` | OpenAPI docs |
+
+ ### Reset
+ ```json
+ POST /reset
+ {"task_id": "task1", "ticket_index": 0}
+ ```
+
+ ### Step — Task 1 (classify)
+ ```json
+ POST /step
+ {
+   "episode_id": "<id>",
+   "action": {"action_type": "classify", "category": "billing", "priority": "high"}
+ }
+ ```
+
+ ### Step — Task 2 (extract)
+ ```json
+ POST /step
+ {
+   "episode_id": "<id>",
+   "action": {
+     "action_type": "extract",
+     "extracted_entities": {"customer_name": "Alice", "invoice_number": "INV-001"},
+     "required_actions": ["issue_refund", "send_corrected_invoice"]
  }
+ }
+ ```
+
+ ### Step — Task 3 (respond)
+ ```json
+ POST /step
+ {
+   "episode_id": "<id>",
+   "action": {
+     "action_type": "respond",
+     "response_text": "Dear customer, we sincerely apologize...",
+     "resolution_steps": ["verify_account", "issue_refund", "send_confirmation"]
+   }
+ }
+ ```

+ ### Submit
+ ```json
+ POST /step
+ {"episode_id": "<id>", "action": {"action_type": "submit"}}
+ ```
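A full episode is reset, one or more steps, submit, then grade. A minimal sketch of that loop, assuming a local server on port 7860 and the `requests` package (field names as in the examples above):

```python
import requests

BASE = "http://127.0.0.1:7860"

# Start a Task 1 episode on the first ticket.
reset = requests.post(f"{BASE}/reset", json={"task_id": "task1", "ticket_index": 0}).json()
episode_id = reset["episode_id"]

# Classify the ticket, then end the episode.
requests.post(f"{BASE}/step", json={
    "episode_id": episode_id,
    "action": {"action_type": "classify", "category": "billing", "priority": "high"},
})
requests.post(f"{BASE}/step", json={"episode_id": episode_id, "action": {"action_type": "submit"}})

# Grade the finished episode and print the breakdown.
print(requests.post(f"{BASE}/grader", json={"episode_id": episode_id}).json())
```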

+ ## Scoring
+
+ **Task 1:** category match (0.50) + priority match (0.40) + efficiency (0.10)
+
+ **Task 2:** entity coverage (0.60) + action coverage (0.30) + no hallucination (0.10)
+
+ **Task 3:** keyword coverage (0.30) + step coverage (0.30) + tone compliance (0.25) + length adequate (0.10) + non-empty steps (0.05)
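As a worked example of the Task 1 weights: a correct category with a wrong priority and full efficiency credit comes out at 0.50 + 0.00 + 0.10 = 0.60. A sketch of the arithmetic (illustrative only; the authoritative partial-credit rules live in `graders.py`):

```python
# Task 1 weights: category 0.50, priority 0.40, efficiency 0.10.
category_ok, priority_ok, efficiency = 1.0, 0.0, 1.0
score = 0.50 * category_ok + 0.40 * priority_ok + 0.10 * efficiency
print(score)  # 0.6
```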
 
+ ## Running Locally
+
+ ```bash
+ pip install -r requirements.txt
+ uvicorn app:app --host 0.0.0.0 --port 7860
+ ```
+
+ ## Running the Baseline Agent
+
+ ```bash
+ export HF_TOKEN=your_token_here
+ export MODEL_NAME=Qwen/Qwen2.5-72B-Instruct
+ python inference.py
+ ```

  ## Docker

+ ```bash
+ docker build -t supportenv .
+ docker run -p 7860:7860 supportenv
+ ```
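Once the container is up, a quick smoke test against the endpoints listed above (hypothetical local run):

```bash
curl http://127.0.0.1:7860/health
curl -X POST http://127.0.0.1:7860/reset \
  -H "Content-Type: application/json" \
  -d '{"task_id": "task1", "ticket_index": 0}'
```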
Sample_Inference_Script.py ADDED
@@ -0,0 +1,187 @@
+ """
+ Inference Script Example
+ ===================================
+ MANDATORY
+ - Before submitting, ensure the following variables are defined in your environment configuration:
+       API_BASE_URL      The API endpoint for the LLM.
+       MODEL_NAME        The model identifier to use for inference.
+       HF_TOKEN          Your Hugging Face / API key.
+       LOCAL_IMAGE_NAME  The name of the local image to use for the environment if you are using the
+                         from_docker_image() method.
+
+ - Defaults are set only for API_BASE_URL and MODEL_NAME
+   (and should reflect your active inference setup):
+       API_BASE_URL = os.getenv("API_BASE_URL", "<your-active-endpoint>")
+       MODEL_NAME = os.getenv("MODEL_NAME", "<your-active-model>")
+
+ - The inference script must be named `inference.py` and placed in the root directory of the project.
+ - Participants must use the OpenAI client for all LLM calls, using the variables above.
+
+ STDOUT FORMAT
+ - The script must emit exactly three line types to stdout, in this order:
+
+       [START] task=<task_name> env=<benchmark> model=<model_name>
+       [STEP] step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
+       [END] success=<true|false> steps=<n> rewards=<r1,r2,...,rn>
+
+ Rules:
+ - One [START] line at episode begin.
+ - One [STEP] line per step, immediately after env.step() returns.
+ - One [END] line after env.close(), always emitted (even on exception).
+ - reward and rewards are formatted to 2 decimal places.
+ - done and success are lowercase booleans: true or false.
+ - error is the raw last_action_error string, or null if none.
+ - All fields on a single line with no newlines within a line.
+
+ Example:
+     [START] task=click-test env=miniwob model=Qwen3-VL-30B
+     [STEP] step=1 action=click('123') reward=0.00 done=false error=null
+     [STEP] step=2 action=fill('456','text') reward=0.00 done=false error=null
+     [STEP] step=3 action=click('789') reward=1.00 done=true error=null
+     [END] success=true steps=3 rewards=0.00,0.00,1.00
+ """
+
+ import asyncio
+ import os
+ import textwrap
+ from typing import List, Optional
+
+ from openai import OpenAI
+
+ from my_env_v4 import MyEnvV4Action, MyEnvV4Env
+
+ IMAGE_NAME = os.getenv("IMAGE_NAME")  # If you are using a docker image
+ API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
+
+ API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
+ MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct"
+ TASK_NAME = os.getenv("MY_ENV_V4_TASK", "echo")
+ BENCHMARK = os.getenv("MY_ENV_V4_BENCHMARK", "my_env_v4")
+ MAX_STEPS = 8
+ TEMPERATURE = 0.7
+ MAX_TOKENS = 150
+ SUCCESS_SCORE_THRESHOLD = 0.1  # normalized score in [0, 1]
+
+ # Max possible reward: each token contributes 0.1, across all steps
+ _MAX_REWARD_PER_STEP = MAX_TOKENS * 0.1
+ MAX_TOTAL_REWARD = MAX_STEPS * _MAX_REWARD_PER_STEP
+
+ SYSTEM_PROMPT = textwrap.dedent(
+     """
+     You are interacting with a simple echo environment.
+     Each turn you must send a message. The environment will echo it back.
+     Reward is proportional to message length: reward = len(message) * 0.1
+     Your goal is to maximize total reward by sending meaningful, substantive messages.
+     Reply with exactly one message string — no quotes, no prefixes, just the message text.
+     """
+ ).strip()
+
+
+ def log_start(task: str, env: str, model: str) -> None:
+     print(f"[START] task={task} env={env} model={model}", flush=True)
+
+
+ def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
+     error_val = error if error else "null"
+     done_val = str(done).lower()
+     print(
+         f"[STEP] step={step} action={action} reward={reward:.2f} done={done_val} error={error_val}",
+         flush=True,
+     )
+
+
+ def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
+     rewards_str = ",".join(f"{r:.2f}" for r in rewards)
+     print(f"[END] success={str(success).lower()} steps={steps} score={score:.3f} rewards={rewards_str}", flush=True)
+
+
+ def build_user_prompt(step: int, last_echoed: str, last_reward: float, history: List[str]) -> str:
+     history_block = "\n".join(history[-4:]) if history else "None"
+     return textwrap.dedent(
+         f"""
+         Step: {step}
+         Last echoed message: {last_echoed!r}
+         Last reward: {last_reward:.2f}
+         Previous steps:
+         {history_block}
+         Send your next message.
+         """
+     ).strip()
+
+
+ def get_model_message(client: OpenAI, step: int, last_echoed: str, last_reward: float, history: List[str]) -> str:
+     user_prompt = build_user_prompt(step, last_echoed, last_reward, history)
+     try:
+         completion = client.chat.completions.create(
+             model=MODEL_NAME,
+             messages=[
+                 {"role": "system", "content": SYSTEM_PROMPT},
+                 {"role": "user", "content": user_prompt},
+             ],
+             temperature=TEMPERATURE,
+             max_tokens=MAX_TOKENS,
+             stream=False,
+         )
+         text = (completion.choices[0].message.content or "").strip()
+         return text if text else "hello"
+     except Exception as exc:
+         print(f"[DEBUG] Model request failed: {exc}", flush=True)
+         return "hello"
+
+
+ async def main() -> None:
+     client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
+
+     env = await MyEnvV4Env.from_docker_image(IMAGE_NAME)
+
+     history: List[str] = []
+     rewards: List[float] = []
+     steps_taken = 0
+     score = 0.0
+     success = False
+
+     log_start(task=TASK_NAME, env=BENCHMARK, model=MODEL_NAME)
+
+     try:
+         result = await env.reset()  # OpenEnv.reset()
+         last_echoed = result.observation.echoed_message
+         last_reward = 0.0
+
+         for step in range(1, MAX_STEPS + 1):
+             if result.done:
+                 break
+
+             message = get_model_message(client, step, last_echoed, last_reward, history)
+
+             result = await env.step(MyEnvV4Action(message=message))
+             obs = result.observation
+
+             reward = result.reward or 0.0
+             done = result.done
+             error = None
+
+             rewards.append(reward)
+             steps_taken = step
+             last_echoed = obs.echoed_message
+             last_reward = reward
+
+             log_step(step=step, action=message, reward=reward, done=done, error=error)
+
+             history.append(f"Step {step}: {message!r} -> reward {reward:+.2f}")
+
+             if done:
+                 break
+
+         score = sum(rewards) / MAX_TOTAL_REWARD if MAX_TOTAL_REWARD > 0 else 0.0
+         score = min(max(score, 0.0), 1.0)  # clamp to [0, 1]
+         success = score >= SUCCESS_SCORE_THRESHOLD
+
+     finally:
+         try:
+             await env.close()
+         except Exception as e:
+             print(f"[DEBUG] env.close() error (container cleanup): {e}", flush=True)
+         log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
+
+
+ if __name__ == "__main__":
+     asyncio.run(main())
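A sketch of how this sample would be invoked, assuming a locally built environment image and the variables read at the top of the script (the token and image tag below are placeholders):

```bash
export HF_TOKEN=hf_xxx              # picked up as API_KEY
export IMAGE_NAME=my-env-v4:latest  # passed to MyEnvV4Env.from_docker_image()
python Sample_Inference_Script.py
```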
app.py CHANGED
@@ -1,23 +1,20 @@
  """
- FastAPI server for DevOpsEnv - Linux DevOps/SRE troubleshooting environment.

  Endpoints:
- ---------
- POST /reset    Create a new episode
- POST /step     Advance the episode
- GET  /state    Current episode state
- GET  /tasks    List tasks and action schema
- POST /grader   Grade a finished episode
- GET  /health   Liveness check
- GET  /         Info / spec link
  """
  from __future__ import annotations

  import os
- import uuid
- import json
- from typing import Any, Dict, List, Optional
  from datetime import datetime

  from fastapi import FastAPI, HTTPException, Query
  from fastapi.middleware.cors import CORSMiddleware
@@ -27,17 +24,13 @@ import environment as env
  from data import TASK_META
  from models import (
      Action,
-     Observation,
-     State,
-     StepResult,
-     TaskInfo,
-     Reward,
      GraderResponse,
  )

  app = FastAPI(
-     title="DevOpsEnv",
-     description="An OpenEnv-compliant Linux DevOps/SRE troubleshooting environment.",
      version="1.0.0",
      docs_url="/docs",
      redoc_url="/redoc",
@@ -51,8 +44,13 @@ app.add_middleware(
  )


  class ResetRequest(BaseModel):
-     task_id: str


  class StepRequest(BaseModel):
@@ -71,10 +69,9 @@ class GraderRequest(BaseModel):
  @app.get("/", tags=["meta"])
  def root():
      return {
-         "name": "DevOpsEnv",
          "version": "1.0.0",
-         "description": "OpenEnv DevOps/SRE troubleshooting environment",
-         "openenv_spec": "https://github.com/meta-pytorch/OpenEnv",
          "tasks": list(TASK_META.keys()),
          "endpoints": {
              "reset": "POST /reset",
@@ -112,7 +109,7 @@ def tasks():
  @app.post("/reset", tags=["control"])
  def reset(req: ResetRequest):
      try:
-         obs = env.reset(req.task_id)
          return obs.model_dump()
      except ValueError as e:
          raise HTTPException(status_code=400, detail=str(e))
@@ -155,4 +152,3 @@ if __name__ == "__main__":
      import uvicorn
      port = int(os.environ.get("PORT", 7860))
      uvicorn.run(app, host="0.0.0.0", port=port, workers=1)
-
 
  """
+ FastAPI server for SupportEnv Customer Support Ticket Triage.

  Endpoints:
+ POST /reset    Create a new episode
+ POST /step     Advance the episode
+ GET  /state    Current episode state
+ GET  /tasks    List tasks and action schema
+ POST /grader   Grade a finished episode
+ GET  /health   Liveness check
+ GET  /         Info / spec link
  """
  from __future__ import annotations

  import os
  from datetime import datetime
+ from typing import Optional

  from fastapi import FastAPI, HTTPException, Query
  from fastapi.middleware.cors import CORSMiddleware

  from data import TASK_META
  from models import (
      Action,
      GraderResponse,
+     TaskInfo,
  )

  app = FastAPI(
+     title="SupportEnv",
+     description="An OpenEnv-compliant customer support ticket triage environment.",
      version="1.0.0",
      docs_url="/docs",
      redoc_url="/redoc",

  )


+ # ---------------------------------------------------------------------------
+ # Request schemas
+ # ---------------------------------------------------------------------------
+
  class ResetRequest(BaseModel):
+     task_id: str = "task1"
+     ticket_index: Optional[int] = 0


  class StepRequest(BaseModel):

  @app.get("/", tags=["meta"])
  def root():
      return {
+         "name": "SupportEnv",
          "version": "1.0.0",
+         "description": "OpenEnv customer support ticket triage environment",
          "tasks": list(TASK_META.keys()),
          "endpoints": {
              "reset": "POST /reset",

  @app.post("/reset", tags=["control"])
  def reset(req: ResetRequest):
      try:
+         obs = env.reset(req.task_id, ticket_index=req.ticket_index or 0)
          return obs.model_dump()
      except ValueError as e:
          raise HTTPException(status_code=400, detail=str(e))

      import uvicorn
      port = int(os.environ.get("PORT", 7860))
      uvicorn.run(app, host="0.0.0.0", port=port, workers=1)
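With the new `ResetRequest` defaults, both fields are optional and `/reset` can target a specific ticket. A hypothetical call against a local server:

```bash
curl -X POST http://127.0.0.1:7860/reset \
  -H "Content-Type: application/json" \
  -d '{"task_id": "task2", "ticket_index": 1}'
```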
 
baseline.py DELETED
@@ -1,309 +0,0 @@
- """
- SupportEnv — FastAPI server
-
- Endpoints
- ---------
- POST /reset      Create a new episode
- POST /step       Advance the episode
- GET  /state      Current episode state
- GET  /tasks      List tasks and action schema
- POST /grader     Grade a finished episode
- POST /baseline   Run the built-in baseline agent on all tasks
- GET  /health     Liveness check
- GET  /           Info / spec link
- """
- from __future__ import annotations
-
- import os
- import subprocess
- import sys
- import tempfile
- from typing import Any, Dict, List, Optional
-
- from fastapi import FastAPI, HTTPException, Query
- from fastapi.middleware.cors import CORSMiddleware
- from pydantic import BaseModel
-
- import environment as env
- from data import TASK_META
- from models import (
-     Action,
-     BaselineResult,
-     GraderResponse,
-     Observation,
-     State,
-     StepResult,
-     TaskInfo,
- )
-
- app = FastAPI(
-     title="SupportEnv",
-     description=(
-         "An OpenEnv-compliant customer-support triage environment. "
-         "Agents learn to classify, extract information from, and resolve "
-         "real-world SaaS support tickets."
-     ),
-     version="1.0.0",
-     docs_url="/docs",
-     redoc_url="/redoc",
- )
-
- app.add_middleware(
-     CORSMiddleware,
-     allow_origins=["*"],
-     allow_methods=["*"],
-     allow_headers=["*"],
- )
-
-
- # ---------------------------------------------------------------------------
- # Request / response shapes for endpoints not covered by models.py
- # ---------------------------------------------------------------------------
-
- class ResetRequest(BaseModel):
-     task_id: str
-     ticket_index: Optional[int] = None
-
-
- class StepRequest(BaseModel):
-     episode_id: str
-     action: Action
-
-
- class GraderRequest(BaseModel):
-     episode_id: str
-
-
- # ---------------------------------------------------------------------------
- # Endpoints
- # ---------------------------------------------------------------------------
-
- @app.get("/", tags=["meta"])
- def root():
-     return {
-         "name": "SupportEnv",
-         "version": "1.0.0",
-         "description": "OpenEnv customer-support ticket triage environment",
-         "openenv_spec": "https://github.com/openenv/openenv",
-         "tasks": list(TASK_META.keys()),
-         "endpoints": {
-             "reset": "POST /reset",
-             "step": "POST /step",
-             "state": "GET /state?episode_id=...",
-             "tasks": "GET /tasks",
-             "grader": "POST /grader",
-             "baseline": "POST /baseline",
-             "health": "GET /health",
-             "docs": "GET /docs",
-         },
-     }
-
-
- @app.get("/health", tags=["meta"])
- def health():
-     return {"status": "ok"}
-
-
- # ---------------------------------------------------------------------------
- # Core OpenEnv endpoints
- # ---------------------------------------------------------------------------
-
- @app.post("/reset", response_model=Observation, tags=["openenv"])
- def reset(request: ResetRequest) -> Observation:
-     """
-     Start a new episode.
-
-     - **task_id**: `task1` | `task2` | `task3`
-     - **ticket_index**: 0-indexed ticket to use (optional; default 0)
-     """
-     try:
-         return env.reset(request.task_id, request.ticket_index)
-     except ValueError as e:
-         raise HTTPException(status_code=400, detail=str(e))
-
-
- @app.post("/step", response_model=StepResult, tags=["openenv"])
- def step(request: StepRequest) -> StepResult:
-     """
-     Submit an action and advance the episode.
-
-     The `action` object must include `action_type` and the fields relevant
-     to that action type (see GET /tasks for the schema).
-     """
-     try:
-         return env.step(request.episode_id, request.action)
-     except KeyError:
-         raise HTTPException(
-             status_code=404,
-             detail=f"Episode '{request.episode_id}' not found. Call POST /reset first.",
-         )
-     except ValueError as e:
-         raise HTTPException(status_code=400, detail=str(e))
-
-
- @app.get("/state", response_model=State, tags=["openenv"])
- def state(episode_id: str = Query(..., description="Episode UUID from POST /reset")) -> State:
-     """Return the current state of an episode."""
-     try:
-         return env.state(episode_id)
-     except KeyError:
-         raise HTTPException(
-             status_code=404,
-             detail=f"Episode '{episode_id}' not found.",
-         )
-
-
- # ---------------------------------------------------------------------------
- # /tasks — task listing + action schema
- # ---------------------------------------------------------------------------
-
- # JSON Schema for the Action model (subset used in each task)
- _BASE_ACTION_SCHEMA = {
-     "type": "object",
-     "required": ["action_type"],
-     "properties": {
-         "action_type": {
-             "type": "string",
-             "description": "One of the available_actions listed in the Observation",
-         },
-     },
- }
-
- _ACTION_SCHEMAS: Dict[str, Dict[str, Any]] = {
-     "task1": {
-         **_BASE_ACTION_SCHEMA,
-         "description": "classify action: set category + priority; then submit",
-         "properties": {
-             **_BASE_ACTION_SCHEMA["properties"],
-             "category": {
-                 "type": "string",
-                 "enum": [
-                     "billing", "technical", "account",
-                     "feature_request", "complaint", "general",
-                 ],
-             },
-             "priority": {
-                 "type": "string",
-                 "enum": ["low", "medium", "high", "critical"],
-             },
-         },
-     },
-     "task2": {
-         **_BASE_ACTION_SCHEMA,
-         "description": "extract action: populate extracted_entities + required_actions; then submit",
-         "properties": {
-             **_BASE_ACTION_SCHEMA["properties"],
-             "extracted_entities": {
-                 "type": "object",
-                 "additionalProperties": True,
-                 "description": "Key-value pairs extracted from the ticket text",
-             },
-             "required_actions": {
-                 "type": "array",
-                 "items": {"type": "string"},
-                 "description": "List of action identifiers (snake_case) needed to close the ticket",
-             },
-         },
-     },
-     "task3": {
-         **_BASE_ACTION_SCHEMA,
-         "description": (
-             "respond or resolve action: write response_text + resolution_steps; "
-             "optionally escalate; then submit"
-         ),
-         "properties": {
-             **_BASE_ACTION_SCHEMA["properties"],
-             "response_text": {
-                 "type": "string",
-                 "description": "Full professional response to send to the customer",
-             },
-             "resolution_steps": {
-                 "type": "array",
-                 "items": {"type": "string"},
-                 "description": "Ordered steps for support staff to resolve the ticket",
-             },
-             "escalation_team": {
-                 "type": "string",
-                 "enum": ["billing_team", "engineering", "account_management", "legal"],
-             },
-             "escalation_reason": {"type": "string"},
-         },
-     },
- }
-
-
- @app.get("/tasks", response_model=List[TaskInfo], tags=["openenv"])
- def list_tasks() -> List[TaskInfo]:
-     """Return metadata and action schema for all tasks."""
-     result = []
-     for task_id, meta in TASK_META.items():
-         result.append(
-             TaskInfo(
-                 task_id=task_id,
-                 name=meta["name"],
-                 description=meta["description"],
-                 difficulty=meta["difficulty"],
-                 max_steps=meta["max_steps"],
-                 action_schema=_ACTION_SCHEMAS[task_id],
-             )
-         )
-     return result
-
-
- # ---------------------------------------------------------------------------
- # /grader — grade a finished episode
- # ---------------------------------------------------------------------------
-
- @app.post("/grader", response_model=GraderResponse, tags=["openenv"])
- def grader(request: GraderRequest) -> GraderResponse:
-     """
-     Grade a finished episode.
-
-     The episode must have reached `done=True` (either via a `submit` action
-     or by exhausting `max_steps`).
-     """
-     try:
-         return env.grade(request.episode_id)
-     except KeyError:
-         raise HTTPException(
-             status_code=404,
-             detail=f"Episode '{request.episode_id}' not found.",
-         )
-     except ValueError as e:
-         raise HTTPException(status_code=400, detail=str(e))
-
-
- # ---------------------------------------------------------------------------
- # /baseline — run the built-in baseline agent
- # ---------------------------------------------------------------------------
-
- class BaselineRequest(BaseModel):
-     model: str = "gpt-4o-mini"
-     ticket_index: Optional[int] = 0
-
-
- @app.post("/baseline", response_model=BaselineResult, tags=["openenv"])
- def run_baseline(request: BaselineRequest) -> BaselineResult:
-     """
-     Run the heuristic baseline agent against all three tasks.
-
-     The built-in baseline does NOT require an OpenAI key — it uses the
-     deterministic heuristic baseline from `baseline.py`.
-     If you want to run the LLM baseline, call `baseline.py` directly.
-     """
-     try:
-         from baseline import run_heuristic_baseline
-         scores = run_heuristic_baseline(
-             ticket_index=request.ticket_index or 0
-         )
-         avg = round(sum(s["score"] for s in scores) / len(scores), 4)
-         return BaselineResult(
-             model="heuristic-baseline",
-             scores=[
-                 {"task_id": s["task_id"], "score": s["score"], "details": s}
-                 for s in scores
-             ],
-             average_score=avg,
-         )
-     except Exception as exc:
-         raise HTTPException(status_code=500, detail=str(exc))
data.py CHANGED
@@ -1,92 +1,15 @@
  """
- Linux DevOps tasks for SRE troubleshooting environment.

- Task 1 (easy) — Restart crashed Nginx service
- Task 2 (medium) — Fix Docker container misconfiguration
- Task 3 (hard) — Debug and fix memory leak in mock API
  """
  from __future__ import annotations
  from typing import Any, Dict, List

- # Nginx service configuration
- NGINX_CONFIG_PATH = "/etc/nginx/nginx.conf"
- NGINX_SYSTEMD_PATH = "/etc/systemd/system/nginx.service"
-
- # Docker configuration
- DOCKER_COMPOSE_PATH = "/srv/docker-compose.yml"
-
- # Mock API code path
- MOCK_API_PATH = "/opt/mockapi/app.py"
-
  # ---------------------------------------------------------------------------
- # TASK DEFINITIONS
- # ---------------------------------------------------------------------------
-
- TASK_META: Dict[str, Dict[str, Any]] = {
-     "task1": {
-         "name": "Restart Nginx Service",
-         "description": (
-             "Production Nginx service has crashed. Restart the service, "
-             "verify the configuration syntax, and ensure the server "
-             "returns HTTP 200 on port 80. Failing checklist:\n"
-             "1. Restart nginx (systemctl restart nginx)\n"
-             "2. Verify config syntax (nginx -t)\n"
-             "3. Confirm service is running (systemctl status nginx)\n"
-             "4. Check HTTP 200 response (curl http://localhost:80)"
-         ),
-         "difficulty": "easy",
-         "max_steps": 10,
-         "available_actions": ["bash_cmd", "submit"],
-         "passing_conditions": [
-             "nginx_running",
-             "config_valid",
-             "http_200_response",
-         ],
-     },
-     "task2": {
-         "name": "Fix Docker Container Configuration",
-         "description": (
-             "A critical microservice container is misconfigured. The port "
-             "mapping in docker-compose.yml is broken. Fix the configuration, "
-             "redeploy the container, and verify it's accessible on the "
-             "correct port.\n"
-             "1. Edit docker-compose.yml (fix port mapping)\n"
-             "2. Restart containers (docker-compose up -d)\n"
-             "3. Verify container is running\n"
-             "4. Check service responds on mapped port"
-         ),
-         "difficulty": "medium",
-         "max_steps": 15,
-         "available_actions": ["bash_cmd", "file_edit", "submit"],
-         "passing_conditions": [
-             "docker_compose_valid",
-             "container_running",
-             "port_accessible",
-         ],
-     },
-     "task3": {
-         "name": "Find and Fix Memory Leak in Mock API",
-         "description": (
-             "The Python API service is leaking memory and consuming excessive "
-             "resources. Diagnose the memory leak in /opt/mockapi/app.py, fix "
-             "the offending code, and restart the service without root access.\n"
-             "1. Identify the memory leak (check processes, logs)\n"
-             "2. Kill the runaway process\n"
-             "3. Fix the code in app.py (patch the leak)\n"
-             "4. Restart the service as appuser\n"
-             "5. Verify memory usage is normal"
-         ),
-         "difficulty": "hard",
-         "max_steps": 20,
-         "available_actions": ["bash_cmd", "file_edit", "submit"],
-         "passing_conditions": [
-             "process_killed",
-             "code_fixed",
-             "service_restarted",
-             "memory_normal",
-         ],
-     },
- }
  # Agent must choose: category + priority
  # Categories: billing | technical | account | feature_request | complaint | general
  # Priorities: low | medium | high | critical
 
  """
+ SupportEnv Customer Support Ticket Triage data.

+ Task 1 (easy) — Ticket Classification
+ Task 2 (medium) — Information Extraction
+ Task 3 (hard) — Resolution Generation
  """
  from __future__ import annotations
  from typing import Any, Dict, List

  # ---------------------------------------------------------------------------
+ # TASK 1 — Ticket Classification
  # Agent must choose: category + priority
  # Categories: billing | technical | account | feature_request | complaint | general
  # Priorities: low | medium | high | critical
environment.py CHANGED
@@ -1,25 +1,23 @@
1
  """
2
- Core DevOpsEnv environment logic.
3
 
4
- Simulates a broken Linux server with:
5
- - Task 1: Crashed Nginx service needing restart
6
- - Task 2: Misconfigured Docker container
7
- - Task 3: Memory leak in Python mock API
8
 
9
  Manages episode lifecycle:
10
- reset() → Observation
11
- step(action) → StepResult
12
- get_state() → State
13
- grade() → (score, breakdown, feedback)
14
  """
15
  from __future__ import annotations
16
 
17
  import uuid
18
- import json
19
- import re
20
- from typing import Any, Dict, Optional, Tuple, List
21
 
22
- from data import TASK_META
23
  from graders import grade_task
24
  from models import (
25
  Action,
@@ -27,350 +25,75 @@ from models import (
27
  Reward,
28
  State,
29
  StepResult,
30
- SystemState,
31
  )
32
 
33
- # In-memory store: episode_id → EpisodeState dict
34
  _EPISODES: Dict[str, Dict[str, Any]] = {}
35
 
36
 
37
  # ---------------------------------------------------------------------------
38
- # Mock filesystem and system state
39
  # ---------------------------------------------------------------------------
40
 
41
- def _create_initial_state_task1() -> Dict[str, Any]:
42
- """Task 1: Nginx is crashed."""
43
- return {
44
- "running_processes": [
45
- {"pid": 100, "name": "systemd"},
46
- {"pid": 105, "name": "sshd"},
47
- # nginx NOT running
48
- ],
49
- "service_status": {
50
- "nginx": "inactive",
51
- "docker": "active",
52
- "mockapi": "active",
53
- },
54
- "http_ports_open": [8080], # 80 is down
55
- "docker_containers": [],
56
- "logs": "2026-03-29 01:30:00 nginx crashed\nCore dump detected.\n",
57
- "files": {
58
- NGINX_CONFIG_PATH: """
59
- user nginx;
60
- worker_processes auto;
61
- error_log /var/log/nginx/error.log warn;
62
- pid /var/run/nginx.pid;
63
-
64
- events {
65
- worker_connections 1024;
66
- }
67
-
68
- http {
69
- include /etc/nginx/mime.types;
70
- default_type application/octet-stream;
71
- sendfile on;
72
- keepalive_timeout 65;
73
-
74
- server {
75
- listen 80 default_server;
76
- server_name _;
77
- location / {
78
- return 200 "OK\\n";
79
- }
80
- }
81
- }""",
82
- "/etc/systemd/system/nginx.service": """
83
- [Unit]
84
- Description=The NGINX HTTP and reverse proxy server
85
- After=network.target
86
-
87
- [Service]
88
- Type=forking
89
- PIDFile=/var/run/nginx.pid
90
- ExecStartPre=/usr/sbin/nginx -t
91
- ExecStart=/usr/sbin/nginx
92
- ExecReload=/bin/kill -s HUP $MAINPID
93
- ExecStop=/bin/kill -s QUIT $MAINPID
94
- PrivateTmp=true
95
-
96
- [Install]
97
- WantedBy=multi-user.target""",
98
- },
99
- "cpu_usage": 45.2,
100
- "memory_usage_mb": 256,
101
- }
102
-
103
-
104
- def _create_initial_state_task2() -> Dict[str, Any]:
105
- """Task 2: Docker misconfigured."""
106
- return {
107
- "running_processes": [
108
- {"pid": 100, "name": "systemd"},
109
- {"pid": 105, "name": "sshd"},
110
- {"pid": 200, "name": "dockerd"},
111
- ],
112
- "service_status": {
113
- "nginx": "active",
114
- "docker": "active",
115
- "mockapi": "inactive",
116
- },
117
- "http_ports_open": [80],
118
- "docker_containers": [
119
- {"id": "abc123", "name": "mockapi-svc", "status": "running", "ports": "8000->3000/tcp"}
120
- ],
121
- "logs": "docker: port 3000 already in use\n",
122
- "files": {
123
- "/srv/docker-compose.yml": """
124
- version: '3.8'
125
- services:
126
- mockapi:
127
- image: mockapi:latest
128
- ports:
129
- - "3000:3000"
130
- environment:
131
- - PORT=3000
132
- volumes:
133
- - ./app.py:/app/app.py""",
134
- },
135
- "cpu_usage": 62.0,
136
- "memory_usage_mb": 1024,
137
- }
138
-
139
-
140
- def _create_initial_state_task3() -> Dict[str, Any]:
141
- """Task 3: Memory leak in mock API."""
142
- return {
143
- "running_processes": [
144
- {"pid": 100, "name": "systemd"},
145
- {"pid": 105, "name": "sshd"},
146
- {"pid": 300, "name": "python3", "rss_mb": 2048, "user": "appuser"}, # MEMORY LEAK
147
- ],
148
- "service_status": {
149
- "nginx": "active",
150
- "docker": "active",
151
- "mockapi": "active",
152
- },
153
- "http_ports_open": [80, 5000],
154
- "docker_containers": [],
155
- "logs": (
156
- "2026-03-29 01:45:00 mockapi started\n"
157
- "2026-03-29 01:46:00 memory usage: 512 MB\n"
158
- "2026-03-29 01:47:00 memory usage: 1024 MB\n"
159
- "2026-03-29 01:48:00 memory usage: 1536 MB (WARNING: HIGH)\n"
160
- "2026-03-29 01:49:00 memory usage: 2048 MB (CRITICAL)\n"
161
- ),
162
- "files": {
163
- "/opt/mockapi/app.py": """
164
- import json
165
- from flask import Flask
166
-
167
- app = Flask(__name__)
168
-
169
- # BUG: This list grows unbounded
170
- request_cache = []
171
-
172
- @app.route('/api/data', methods=['GET'])
173
- def get_data():
174
- data = {"timestamp": 123456, "value": 42}
175
- request_cache.append(data) # MEMORY LEAK!
176
- return json.dumps(data)
177
-
178
- if __name__ == '__main__':
179
- app.run(host='0.0.0.0', port=5000)
180
- """,
181
- },
182
- "cpu_usage": 85.5,
183
- "memory_usage_mb": 2048,
184
- }
185
-
186
-
187
- NGINX_CONFIG_PATH = "/etc/nginx/nginx.conf"
188
- DOCKER_COMPOSE_PATH = "/srv/docker-compose.yml"
189
- MOCK_API_PATH = "/opt/mockapi/app.py"
190
-
191
-
192
- def _build_system_state(task_id: str, ep_state: Dict[str, Any]) -> SystemState:
193
- """Build a SystemState object from episode state."""
194
- state_dict = ep_state["system_state"]
195
- return SystemState(
196
- task_id=task_id,
197
- available_commands=["systemctl", "nginx", "docker", "curl", "ps", "cat", "vim"],
198
- filesystem_snapshot=json.dumps({
199
- k: v for k, v in state_dict.get("files", {}).items()
200
- }),
201
- running_processes=state_dict.get("running_processes", []),
202
- service_status=state_dict.get("service_status", {}),
203
- logs=state_dict.get("logs", ""),
204
- http_ports_open=state_dict.get("http_ports_open", []),
205
- docker_containers=state_dict.get("docker_containers", []),
206
- cpu_usage=state_dict.get("cpu_usage", 0.0),
207
- memory_usage_mb=state_dict.get("memory_usage_mb", 0),
208
- )
209
 
210
 
211
  # ---------------------------------------------------------------------------
212
- # Dynamic execution simulation
213
  # ---------------------------------------------------------------------------
214
 
215
- def _simulate_bash_cmd(cmd: str, task_id: str, ep_state: Dict[str, Any]) -> str:
216
- """Simulate bash command execution."""
217
- state_dict = ep_state["system_state"]
218
- lower_cmd = cmd.lower()
219
-
220
- # Task 1: Nginx commands
221
- if task_id == "task1":
222
- if "systemctl restart nginx" in lower_cmd or "systemctl start nginx" in lower_cmd:
223
- state_dict["service_status"]["nginx"] = "active"
224
- state_dict["running_processes"].append({"pid": 999, "name": "nginx"})
225
- state_dict["http_ports_open"] = [80]
226
- return "Job for nginx.service started successfully."
227
- elif "systemctl status nginx" in lower_cmd:
228
- if state_dict["service_status"]["nginx"] == "active":
229
- return "● nginx.service - NGINX HTTP Server\n Loaded: loaded (/etc/systemd/system/nginx.service)\n Active: active (running)"
230
- return "● nginx.service - NGINX HTTP Server\n Active: inactive (dead)"
231
- elif "nginx -t" in lower_cmd:
232
- return "nginx: the configuration file /etc/nginx/nginx.conf syntax is ok\nnginx: configuration file /etc/nginx/nginx.conf test is successful"
233
- elif "curl http://localhost:80" in lower_cmd or "curl http://localhost" in lower_cmd:
234
- if 80 in state_dict["http_ports_open"]:
235
- return "OK"
236
- return "curl: (7) Failed to connect to localhost port 80: Connection refused"
237
-
238
- # Task 2: Docker commands
239
- elif task_id == "task2":
240
- if "docker-compose up -d" in lower_cmd:
241
- if DOCKER_COMPOSE_PATH in state_dict["files"]:
242
- compose_content = state_dict["files"][DOCKER_COMPOSE_PATH]
243
- # Check if port is now correct
244
- if "3000:3000" in compose_content:
245
- state_dict["docker_containers"] = [
246
- {"id": "xyz789", "name": "mockapi-svc", "status": "running", "ports": "3000:3000/tcp"}
247
- ]
248
- state_dict["service_status"]["mockapi"] = "active"
249
- return "Creating mockapi ... done"
250
- return "ERROR: docker-compose.yml not found or invalid"
251
- elif "docker ps" in lower_cmd:
252
- if state_dict["docker_containers"]:
253
- return "\n".join([f"{c['id']} {c['name']} {c['status']}" for c in state_dict["docker_containers"]])
254
- return "No containers running"
255
-
256
- # Task 3: Process/memory commands
257
- elif task_id == "task3":
258
- if "ps aux" in lower_cmd or "ps aux grep python" in lower_cmd:
259
- output = ""
260
- for proc in state_dict["running_processes"]:
261
- if proc.get("name") == "python3":
262
- output += f"appuser {proc['pid']} 85.5 {proc.get('rss_mb', 512)} python3 /opt/mockapi/app.py\n"
263
- return output if output else "No python processes found"
264
- elif "kill" in lower_cmd:
265
- if "300" in lower_cmd or "python" in lower_cmd:
266
- state_dict["running_processes"] = [p for p in state_dict["running_processes"] if p.get("name") != "python3"]
267
- state_dict["service_status"]["mockapi"] = "inactive"
268
- return "Process killed"
269
- return "Process not found"
270
- elif "python3 /opt/mockapi/app.py &" in lower_cmd or "python3 /opt/mockapi/app.py" in lower_cmd:
271
- state_dict["running_processes"].append({"pid": 301, "name": "python3", "rss_mb": 128, "user": "appuser"})
272
- state_dict["service_status"]["mockapi"] = "active"
273
- state_dict["http_ports_open"] = [80, 5000]
274
- return "Application started"
275
-
276
- return f"Command '{cmd}' executed (simulated)"
277
-
278
-
279
- def _simulate_file_edit(file_path: str, new_content: str, ep_state: Dict[str, Any]) -> str:
280
- """Simulate file editing."""
281
- state_dict = ep_state["system_state"]
282
-
283
- if file_path not in state_dict.get("files", {}):
284
- return f"ERROR: File {file_path} not found"
285
-
286
- # Detect task 2: Check docker-compose.yml fix
287
- if file_path == DOCKER_COMPOSE_PATH and "3000:3000" in new_content:
288
- state_dict["files"][file_path] = new_content
289
- return f"File {file_path} updated successfully"
290
-
291
- # Detect task 3: Check mock API fix
292
- elif file_path == MOCK_API_PATH and "request_cache = []" not in new_content:
293
- # Verify fix removes the memory leak
294
- state_dict["files"][file_path] = new_content
295
- return f"File {file_path} patched successfully"
296
-
297
- state_dict["files"][file_path] = new_content
298
- return f"File {file_path} edited"
299
-
300
-
301
- # ---------------------------------------------------------------------------
302
- # Reward calculation
303
- # ---------------------------------------------------------------------------
304
-
305
- def _calculate_step_reward(task_id: str, action: Action, ep_state: Dict[str, Any]) -> Tuple[float, str]:
306
- """Calculate reward based on action and task."""
307
- base_step_cost = -0.01
308
- reward = base_step_cost
309
-
310
- if action.action_type == "bash_cmd":
311
- cmd = action.command or ""
312
- reward += 0.05
313
- explanation = f"Executed: {cmd[:50]}"
314
- return reward, explanation
315
-
316
- elif action.action_type == "file_edit":
317
- reward += 0.03
318
- explanation = f"Edited: {action.file_path}"
319
- return reward, explanation
320
-
321
- elif action.action_type == "submit":
322
- reward += 0.1
323
- explanation = "Episode submitted for grading"
324
- return reward, explanation
325
-
326
- return reward, "Step taken"
327
-
328
-
329
- # ---------------------------------------------------------------------------
330
- # Core API functions
331
- # ---------------------------------------------------------------------------
332
-
333
- def reset(task_id: str) -> Observation:
334
- """Create a new episode for the given task."""
335
  if task_id not in TASK_META:
336
  raise ValueError(f"Unknown task_id {task_id!r}. Valid: {list(TASK_META)}")
337
 
338
  meta = TASK_META[task_id]
339
-
340
- # Initialize system state based on task
341
- if task_id == "task1":
342
- initial_sys_state = _create_initial_state_task1()
343
- elif task_id == "task2":
344
- initial_sys_state = _create_initial_state_task2()
345
- elif task_id == "task3":
346
- initial_sys_state = _create_initial_state_task3()
347
- else:
348
- initial_sys_state = {}
349
 
350
  episode_id = str(uuid.uuid4())
351
  _EPISODES[episode_id] = {
352
  "task_id": task_id,
 
 
353
  "step_number": 0,
354
  "max_steps": meta["max_steps"],
355
  "done": False,
356
  "total_reward": 0.0,
357
  "action_history": [],
358
  "final_score": None,
359
- "system_state": initial_sys_state,
360
  }
361
 
362
- system_state = _build_system_state(task_id, _EPISODES[episode_id])
 
 
 
 
 
 
 
 
363
 
364
  return Observation(
365
  task_id=task_id,
366
- task_description=meta["description"],
367
  episode_id=episode_id,
368
- system_state=system_state,
369
  thread_history=[],
370
- available_actions=meta["available_actions"],
371
  step_number=0,
372
  max_steps=meta["max_steps"],
373
- hint="Start by diagnosing the system state with basic commands.",
374
  )
375
 
376
 
@@ -379,24 +102,14 @@ def step(episode_id: str, action: Action) -> StepResult:
     ep = _EPISODES.get(episode_id)
     if ep is None:
         raise KeyError(f"Episode {episode_id} not found")
-
     if ep["done"]:
         raise ValueError(f"Episode {episode_id} is already done.")
 
     task_id = ep["task_id"]
-    meta = TASK_META[task_id]
 
     ep["step_number"] += 1
     ep["action_history"].append(action.model_dump())
 
-    # Execute action
-    if action.action_type == "bash_cmd":
-        cmd_output = _simulate_bash_cmd(action.command or "", task_id, ep)
-        ep["action_history"][-1]["output"] = cmd_output
-    elif action.action_type == "file_edit":
-        edit_result = _simulate_file_edit(action.file_path or "", action.file_content or "", ep)
-        ep["action_history"][-1]["result"] = edit_result
-
     # Determine if done
     done = False
     if action.action_type == "submit":
@@ -404,16 +117,21 @@ def step(episode_id: str, action: Action) -> StepResult:
     elif ep["step_number"] >= ep["max_steps"]:
         done = True
 
-    # Calculate reward
-    step_reward, explanation = _calculate_step_reward(task_id, action, ep)
 
-    # Apply grader bonus when done
     if done:
-        final_score, breakdown, grader_feedback = grade_task(task_id, ep)
         ep["final_score"] = final_score
-        bonus = final_score * 0.5
-        step_reward += bonus
-        explanation += f" | Grader score: {final_score:.3f} (+{bonus:.3f} bonus)"
     else:
         final_score = None
 
@@ -421,21 +139,34 @@ def step(episode_id: str, action: Action) -> StepResult:
     ep["done"] = done
 
     # Build observation
-    system_state = _build_system_state(task_id, ep)
     thread_history = [
-        {"role": "agent", "content": str(a)} for a in ep["action_history"]
     ]
 
     obs = Observation(
         task_id=task_id,
-        task_description=meta["description"],
         episode_id=episode_id,
-        system_state=system_state,
         thread_history=thread_history,
-        available_actions=meta["available_actions"] if not done else [],
         step_number=ep["step_number"],
         max_steps=ep["max_steps"],
-        hint=None if done else "Continue diagnosing and fixing the issue.",
     )
 
     reward = Reward(
@@ -444,7 +175,7 @@ def step(episode_id: str, action: Action) -> StepResult:
         explanation=explanation,
     )
 
-    info = {"step": ep["step_number"]}
     if done:
         info["final_score"] = final_score
 
@@ -474,13 +205,84 @@ def grade(episode_id: str) -> Tuple[float, Dict[str, float], str]:
     ep = _EPISODES.get(episode_id)
     if ep is None:
         raise KeyError(f"Episode {episode_id} not found")
-
     if not ep.get("done"):
         raise ValueError(f"Episode {episode_id} is not done yet")
 
     task_id = ep["task_id"]
     score, breakdown, feedback = grade_task(task_id, ep)
     ep["final_score"] = score
-
    return score, breakdown, feedback
 
 """
+Core SupportEnv environment logic.
 
+Simulates a customer support ticket triage workflow:
+- Task 1 (easy): Ticket Classification — assign category + priority
+- Task 2 (medium): Information Extraction — pull entities + required actions
+- Task 3 (hard): Resolution Generation — write response + resolution steps
 
 Manages episode lifecycle:
+    reset(task_id, ticket_index) → Observation
+    step(episode_id, action)     → StepResult
+    get_state(episode_id)        → State
+    grade(episode_id)            → (score, breakdown, feedback)
 """
 from __future__ import annotations
 
 import uuid
+from typing import Any, Dict, Optional, Tuple
 
+from data import TASK_META, get_task_meta, get_tickets
 from graders import grade_task
 from models import (
     Action,
     Observation,
     Reward,
     State,
     StepResult,
+    TicketInfo,
 )
 
+# In-memory store: episode_id → episode dict
 _EPISODES: Dict[str, Dict[str, Any]] = {}
 
 
 # ---------------------------------------------------------------------------
+# Reward constants (match openenv.yaml)
 # ---------------------------------------------------------------------------
 
+STEP_COST = -0.02
+SUBMIT_BONUS = 0.05
+MAX_STEP_PENALTY = -0.10
 
 
 # ---------------------------------------------------------------------------
+# Core API
 # ---------------------------------------------------------------------------
 
+def reset(task_id: str, ticket_index: int = 0) -> Observation:
+    """Create a new episode for the given task and ticket."""
     if task_id not in TASK_META:
         raise ValueError(f"Unknown task_id {task_id!r}. Valid: {list(TASK_META)}")
 
     meta = TASK_META[task_id]
+    tickets = get_tickets(task_id)
+
+    if ticket_index < 0 or ticket_index >= len(tickets):
+        raise ValueError(
+            f"ticket_index {ticket_index} out of range [0, {len(tickets) - 1}]"
+        )
+
+    ticket_data = tickets[ticket_index]
+    safe_meta = get_task_meta(task_id)
 
     episode_id = str(uuid.uuid4())
     _EPISODES[episode_id] = {
         "task_id": task_id,
+        "ticket_index": ticket_index,
+        "ticket_data": ticket_data,
         "step_number": 0,
         "max_steps": meta["max_steps"],
         "done": False,
         "total_reward": 0.0,
         "action_history": [],
         "final_score": None,
     }
 
+    ticket_info = TicketInfo(
+        ticket_id=ticket_data["ticket_id"],
+        subject=ticket_data["subject"],
+        body=ticket_data["body"],
+        customer_tier=ticket_data["customer_tier"],
+        account_age_days=ticket_data["account_age_days"],
+        previous_tickets=ticket_data["previous_tickets"],
+        attachments=ticket_data.get("attachments", []),
+    )
 
     return Observation(
         task_id=task_id,
+        task_description=safe_meta["description"],
         episode_id=episode_id,
+        ticket=ticket_info,
         thread_history=[],
+        available_actions=safe_meta["available_actions"],
         step_number=0,
         max_steps=meta["max_steps"],
+        hint=_get_hint(task_id, 0),
     )
 
 
     ep = _EPISODES.get(episode_id)
     if ep is None:
         raise KeyError(f"Episode {episode_id} not found")
     if ep["done"]:
         raise ValueError(f"Episode {episode_id} is already done.")
 
     task_id = ep["task_id"]
 
     ep["step_number"] += 1
     ep["action_history"].append(action.model_dump())
 
     # Determine if done
     done = False
     if action.action_type == "submit":
         done = True
     elif ep["step_number"] >= ep["max_steps"]:
         done = True
 
+    # Calculate step reward
+    step_reward, explanation = _calculate_step_reward(task_id, action, ep, done)
 
+    # Apply grader bonus on terminal step
     if done:
+        final_score, _breakdown, _feedback = grade_task(task_id, ep)
         ep["final_score"] = final_score
+        # Grader score is the terminal bonus (0–1)
+        step_reward += final_score
+        explanation += f" | Grader score: {final_score:.3f}"
+
+        # Penalty for running out of steps without submitting
+        if action.action_type != "submit" and ep["step_number"] >= ep["max_steps"]:
+            step_reward += MAX_STEP_PENALTY
+            explanation += f" | Max-step penalty: {MAX_STEP_PENALTY}"
     else:
         final_score = None
 
     ep["done"] = done
 
     # Build observation
+    ticket_data = ep["ticket_data"]
+    safe_meta = get_task_meta(task_id)
+
+    ticket_info = TicketInfo(
+        ticket_id=ticket_data["ticket_id"],
+        subject=ticket_data["subject"],
+        body=ticket_data["body"],
+        customer_tier=ticket_data["customer_tier"],
+        account_age_days=ticket_data["account_age_days"],
+        previous_tickets=ticket_data["previous_tickets"],
+        attachments=ticket_data.get("attachments", []),
+    )
+
     thread_history = [
+        {"role": "agent", "content": _summarize_action(a)}
+        for a in ep["action_history"]
     ]
 
     obs = Observation(
         task_id=task_id,
+        task_description=safe_meta["description"],
         episode_id=episode_id,
+        ticket=ticket_info,
         thread_history=thread_history,
+        available_actions=safe_meta["available_actions"] if not done else [],
         step_number=ep["step_number"],
         max_steps=ep["max_steps"],
+        hint=None if done else _get_hint(task_id, ep["step_number"]),
     )
 
     reward = Reward(
         explanation=explanation,
     )
 
+    info: Dict[str, Any] = {"step": ep["step_number"]}
     if done:
         info["final_score"] = final_score
 
 
     ep = _EPISODES.get(episode_id)
     if ep is None:
         raise KeyError(f"Episode {episode_id} not found")
     if not ep.get("done"):
         raise ValueError(f"Episode {episode_id} is not done yet")
 
     task_id = ep["task_id"]
     score, breakdown, feedback = grade_task(task_id, ep)
     ep["final_score"] = score
     return score, breakdown, feedback
 
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def _calculate_step_reward(
+    task_id: str, action: Action, ep: Dict[str, Any], done: bool
+) -> Tuple[float, str]:
+    """Dense per-step reward."""
+    reward = STEP_COST  # small cost per step
+
+    if action.action_type == "submit":
+        reward += SUBMIT_BONUS
+        return reward, "Submitted for grading"
+
+    # Partial-progress signals based on task
+    if task_id == "task1":
+        if action.action_type == "classify":
+            if action.category:
+                reward += 0.02
+            if action.priority:
+                reward += 0.02
+            return reward, f"Classified: category={action.category}, priority={action.priority}"
+
+    elif task_id == "task2":
+        if action.action_type == "extract":
+            n_entities = len(action.extracted_entities) if action.extracted_entities else 0
+            n_actions = len(action.required_actions) if action.required_actions else 0
+            reward += min(n_entities * 0.005, 0.04)
+            reward += min(n_actions * 0.005, 0.02)
+            return reward, f"Extracted {n_entities} entities, {n_actions} actions"
+
+    elif task_id == "task3":
+        if action.action_type == "respond":
+            text_len = len(action.response_text or "")
+            n_steps = len(action.resolution_steps) if action.resolution_steps else 0
+            if text_len > 0:
+                reward += min(text_len * 0.0001, 0.03)
+            if n_steps > 0:
+                reward += min(n_steps * 0.005, 0.02)
+            return reward, f"Response ({text_len} chars), {n_steps} resolution steps"
+
+    return reward, "Step taken"
+
+
+def _summarize_action(action_dict: Dict[str, Any]) -> str:
+    """One-line summary of an action for thread_history."""
+    atype = action_dict.get("action_type", "unknown")
+    if atype == "classify":
+        return f"classify(category={action_dict.get('category')}, priority={action_dict.get('priority')})"
+    elif atype == "extract":
+        ents = action_dict.get("extracted_entities") or {}
+        acts = action_dict.get("required_actions") or []
+        return f"extract(entities={list(ents.keys())}, actions={acts})"
+    elif atype == "respond":
+        text = (action_dict.get("response_text") or "")[:60]
+        steps = action_dict.get("resolution_steps") or []
+        return f"respond(text='{text}...', steps={len(steps)})"
+    elif atype == "submit":
+        return "submit()"
+    return f"{atype}()"
+
+
+def _get_hint(task_id: str, step: int) -> Optional[str]:
+    """Contextual hints to guide the agent."""
+    if step == 0:
+        hints = {
+            "task1": "Read the ticket carefully and classify by category and priority.",
+            "task2": "Extract all entities (IDs, names, amounts) and identify required actions.",
+            "task3": "Write a professional response and list resolution steps.",
+        }
+        return hints.get(task_id)
+    return None
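
For orientation, the new episode lifecycle can be exercised in-process. The following is a minimal sketch, not part of the commit: it assumes this module imports as `environment`, that `data.py` ships at least one task1 ticket, and that `step()` returns a `StepResult` exposing `done`; the classify values are illustrative guesses, not ground truth.

# Sketch: drive one task1 episode end to end (assumptions noted above).
from environment import reset, step, grade
from models import Action

obs = reset(task_id="task1", ticket_index=0)
print(obs.ticket.subject, "|", obs.hint)

# Illustrative classification; a real agent derives these from the ticket.
result = step(obs.episode_id, Action(action_type="classify",
                                     category="billing", priority="high"))

if not result.done:
    # submit ends the episode; step() folds the grader score into the reward
    result = step(obs.episode_id, Action(action_type="submit"))

score, breakdown, feedback = grade(obs.episode_id)
print(f"score={score:.3f} :: {feedback}")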
graders.py CHANGED
@@ -1,191 +1,244 @@
 """
-Graders for DevOpsEnv tasks.
 
-Deterministic scoring based on system state changes and action validity.
 """
-from typing import Any, Dict, Tuple
 
 
-def grade_task(task_id: str, episode_state: Dict[str, Any]) -> Tuple[float, Dict[str, float], str]:
-    """
-    Grade a completed task episode.
-
-    Returns (score, breakdown, feedback)
-    """
     if task_id == "task1":
-        return grade_task1(episode_state)
     elif task_id == "task2":
-        return grade_task2(episode_state)
     elif task_id == "task3":
-        return grade_task3(episode_state)
-    else:
-        return 0.0, {}, "Unknown task"
 
 
-def grade_task1(episode_state: Dict[str, Any]) -> Tuple[float, Dict[str, float], str]:
     """
-    Grade Task 1: Restart Nginx.
-
-    Success criteria:
-    - nginx service is running (30%)
-    - nginx config is valid (30%)
-    - HTTP 200 response on port 80 (40%)
     """
-    state_dict = episode_state.get("system_state", {})
-    action_history = episode_state.get("action_history", [])
-
-    breakdown = {
-        "nginx_running": 0.0,
-        "config_valid": 0.0,
-        "http_200": 0.0,
     }
-
-    # Check if nginx is running
-    service_status = state_dict.get("service_status", {})
-    if service_status.get("nginx") == "active":
-        breakdown["nginx_running"] = 0.3
-
-    # Check if config validation was attempted and passed
-    config_valid = False
-    for action in action_history:
-        output = action.get("output", "")
-        if output and ("syntax is ok" in str(output).lower() or "test is successful" in str(output).lower()):
-            config_valid = True
-            breakdown["config_valid"] = 0.3
-            break
-
-    # Check if HTTP 200 response was achieved
-    http_ports = state_dict.get("http_ports_open", [])
-    if 80 in http_ports:
-        # Verify http 200 response was confirmed
-        for action in action_history:
-            output = action.get("output", "")
-            cmd = action.get("command", "")
-            if output and cmd and "OK" in str(output) and "curl" in str(cmd).lower():
-                breakdown["http_200"] = 0.4
-                break
-
-    score = sum(breakdown.values())
-    feedback = f"Task 1 Grading: nginx_running={breakdown['nginx_running']:.1f}, config_valid={breakdown['config_valid']:.1f}, http_200={breakdown['http_200']:.1f}"
-
-    return min(score, 1.0), breakdown, feedback
-
-
-def grade_task2(episode_state: Dict[str, Any]) -> Tuple[float, Dict[str, float], str]:
     """
-    Grade Task 2: Fix Docker configuration.
-
-    Success criteria:
-    - docker-compose.yml was edited (25%)
-    - docker-compose up -d was successful (25%)
-    - Container is running (25%)
-    - Service accessible on correct port (25%)
     """
-    state_dict = episode_state.get("system_state", {})
-    action_history = episode_state.get("action_history", [])
-    files = state_dict.get("files", {})
-
-    breakdown = {
-        "file_edited": 0.0,
-        "compose_ran": 0.0,
-        "container_running": 0.0,
-        "port_accessible": 0.0,
     }
-
-    # Check if docker-compose.yml was edited correctly
-    compose_file = "/srv/docker-compose.yml"
-    if compose_file in files:
-        content = files[compose_file]
-        if content and "3000:3000" in str(content):
-            breakdown["file_edited"] = 0.25
-
-    # Check if docker-compose up -d was run
-    for action in action_history:
-        cmd = action.get("command")
-        if cmd and "docker-compose up -d" in str(cmd):
-            output = action.get("output", "")
-            if output and ("done" in str(output).lower() or "created" in str(output).lower()):
-                breakdown["compose_ran"] = 0.25
-                break
-
-    # Check if container is running
-    containers = state_dict.get("docker_containers", [])
-    if containers:
-        for container in containers:
-            if container.get("status") == "running" and "mockapi" in str(container.get("name", "")):
-                breakdown["container_running"] = 0.25
-                break
-
-    # Check if port is correctly mapped
-    if containers:
-        for container in containers:
-            if "3000:3000" in str(container.get("ports", "")):
-                breakdown["port_accessible"] = 0.25
-                break
-
-    score = sum(breakdown.values())
-    feedback = f"Task 2 Grading: file_edited={breakdown['file_edited']:.2f}, compose_ran={breakdown['compose_ran']:.2f}, container_running={breakdown['container_running']:.2f}, port_accessible={breakdown['port_accessible']:.2f}"
-
-    return min(score, 1.0), breakdown, feedback
-
-
-def grade_task3(episode_state: Dict[str, Any]) -> Tuple[float, Dict[str, float], str]:
     """
-    Grade Task 3: Fix memory leak.
-
-    Success criteria:
-    - Process was killed (25%)
-    - Code was fixed (removing the leak) (25%)
-    - Service was restarted (25%)
-    - Memory usage decreased (25%)
     """
-    state_dict = episode_state.get("system_state", {})
-    action_history = episode_state.get("action_history", [])
-    files = state_dict.get("files", {})
-    processes = state_dict.get("running_processes", [])
-
-    breakdown = {
-        "process_killed": 0.0,
-        "code_fixed": 0.0,
-        "service_restarted": 0.0,
-        "memory_reduced": 0.0,
     }
-
-    # Check if python process was killed
-    has_python_leak = False
-    if processes:
-        has_python_leak = any(p.get("name") == "python3" and p.get("rss_mb", 512) > 1024 for p in processes)
-    if not has_python_leak:
-        # Process was killed
-        breakdown["process_killed"] = 0.25
-
-    # Check if code was fixed (removed the memory leak)
-    app_file = "/opt/mockapi/app.py"
-    if app_file in files:
-        content = files[app_file]
-        # Memory leak is the unbounded list append - check if it is fixed
-        if content and ("request_cache.append" not in str(content) or "request_cache = []" not in str(content)):
-            # If it has been removed or replaced with something better
-            if "request_cache" not in str(content) or "# " in str(content):
-                breakdown["code_fixed"] = 0.25
-
-    # Check if service was restarted
-    service_status = state_dict.get("service_status", {})
-    if service_status.get("mockapi") == "active":
-        # And there is a newer process
-        for action in action_history:
-            cmd = action.get("command", "")
-            if cmd and "python3" in str(cmd) and ("start" in str(cmd) or "&" in str(cmd)):
-                breakdown["service_restarted"] = 0.25
-                break
-
-    # Check if memory usage decreased
-    initial_memory = 2048
-    current_memory = state_dict.get("memory_usage_mb", 2048)
-    if current_memory < initial_memory * 0.75:  # At least 25% improvement
-        breakdown["memory_reduced"] = 0.25
-
-    score = sum(breakdown.values())
-    feedback = f"Task 3 Grading: process_killed={breakdown['process_killed']:.2f}, code_fixed={breakdown['code_fixed']:.2f}, service_restarted={breakdown['service_restarted']:.2f}, memory_reduced={breakdown['memory_reduced']:.2f}"
-
-    return min(score, 1.0), breakdown, feedback
 """
+Deterministic graders for SupportEnv tasks.
 
+Each grader inspects the agent's action_history against ground-truth data
+and returns (score, breakdown, feedback) where score is in [0.0, 1.0].
+
+Task 1 — Classification: category match (0.50) + priority match (0.40) + efficiency (0.10)
+Task 2 — Extraction: entity coverage (0.60) + action coverage (0.30) + no hallucination (0.10)
+Task 3 — Resolution: keyword coverage (0.30) + step coverage (0.30) + tone (0.25) +
+                     length (0.10) + non-empty steps (0.05)
 """
+from __future__ import annotations
 
+from typing import Any, Dict, List, Optional, Tuple
 
+
+def grade_task(
+    task_id: str, episode_state: Dict[str, Any]
+) -> Tuple[float, Dict[str, float], str]:
     if task_id == "task1":
+        return _grade_classification(episode_state)
     elif task_id == "task2":
+        return _grade_extraction(episode_state)
     elif task_id == "task3":
+        return _grade_resolution(episode_state)
+    return 0.0, {}, "Unknown task"
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def _last_action_of_type(
+    history: List[Dict[str, Any]], action_type: str
+) -> Optional[Dict[str, Any]]:
+    """Return the last action matching *action_type*, or None."""
+    for action in reversed(history):
+        if action.get("action_type") == action_type:
+            return action
+    return None
+
+
+def _normalize(s: Any) -> str:
+    return str(s).strip().lower() if s is not None else ""
+
 
+# ---------------------------------------------------------------------------
+# Task 1 — Classification
+# ---------------------------------------------------------------------------
 
+def _grade_classification(ep: Dict[str, Any]) -> Tuple[float, Dict[str, float], str]:
     """
+    Score breakdown:
+        category_correct   0.50 — exact match
+        priority_correct   0.40 — exact match
+        efficiency         0.10 — 1 step = full, degrades linearly
     """
+    gt = ep["ticket_data"]["ground_truth"]
+    history = ep.get("action_history", [])
+
+    breakdown: Dict[str, float] = {
+        "category_correct": 0.0,
+        "priority_correct": 0.0,
+        "efficiency": 0.0,
     }
+
+    classify_action = _last_action_of_type(history, "classify")
+    if classify_action is None:
+        return 0.0, breakdown, "No classify action found."
+
+    # Category
+    if _normalize(classify_action.get("category")) == _normalize(gt["category"]):
+        breakdown["category_correct"] = 0.50
+
+    # Priority
+    if _normalize(classify_action.get("priority")) == _normalize(gt["priority"]):
+        breakdown["priority_correct"] = 0.40
+
+    # Efficiency: full marks if classified in 1 step, degrades linearly
+    max_steps = ep.get("max_steps", 3)
+    steps_used = ep.get("step_number", max_steps)
+    if steps_used <= 1:
+        breakdown["efficiency"] = 0.10
+    else:
+        breakdown["efficiency"] = round(max(0.0, 0.10 * (1 - (steps_used - 1) / max_steps)), 4)
+
+    score = round(min(sum(breakdown.values()), 1.0), 4)
+    parts = ", ".join(f"{k}={v:.2f}" for k, v in breakdown.items())
+    return score, breakdown, f"Task 1: {parts}"
+
+
+# ---------------------------------------------------------------------------
+# Task 2 — Information Extraction
+# ---------------------------------------------------------------------------
+
+def _grade_extraction(ep: Dict[str, Any]) -> Tuple[float, Dict[str, float], str]:
     """
+    Score breakdown:
+        entity_coverage    0.60 — fraction of ground-truth entities matched
+        action_coverage    0.30 — fraction of required actions matched
+        no_hallucination   0.10 — penalty for extra entities not in ground truth
     """
+    gt = ep["ticket_data"]["ground_truth"]
+    history = ep.get("action_history", [])
+
+    breakdown: Dict[str, float] = {
+        "entity_coverage": 0.0,
+        "action_coverage": 0.0,
+        "no_hallucination": 0.10,  # start with full marks, deduct
     }
+
+    extract_action = _last_action_of_type(history, "extract")
+    if extract_action is None:
+        breakdown["no_hallucination"] = 0.0
+        return 0.0, breakdown, "No extract action found."
+
+    # --- Entity coverage ---
+    gt_entities: Dict[str, Any] = gt.get("entities", {})
+    pred_entities: Dict[str, Any] = extract_action.get("extracted_entities") or {}
+
+    if gt_entities:
+        matched = 0
+        for key, gt_val in gt_entities.items():
+            pred_val = pred_entities.get(key)
+            if pred_val is not None and _entity_matches(gt_val, pred_val):
+                matched += 1
+        breakdown["entity_coverage"] = round(0.60 * matched / len(gt_entities), 4)
+
+    # --- Action coverage ---
+    gt_actions: List[str] = gt.get("required_actions", [])
+    pred_actions: List[str] = extract_action.get("required_actions") or []
+    pred_actions_lower = [_normalize(a) for a in pred_actions]
+
+    if gt_actions:
+        matched_actions = sum(
+            1 for ga in gt_actions if _normalize(ga) in pred_actions_lower
+        )
+        breakdown["action_coverage"] = round(0.30 * matched_actions / len(gt_actions), 4)
+
+    # --- No hallucination ---
+    if pred_entities and gt_entities:
+        extra_keys = set(pred_entities.keys()) - set(gt_entities.keys())
+        if extra_keys:
+            penalty = min(len(extra_keys) * 0.02, 0.10)
+            breakdown["no_hallucination"] = round(max(0.0, 0.10 - penalty), 4)
+
+    score = round(min(sum(breakdown.values()), 1.0), 4)
+    parts = ", ".join(f"{k}={v:.2f}" for k, v in breakdown.items())
+    return score, breakdown, f"Task 2: {parts}"
+
+
+def _entity_matches(gt_val: Any, pred_val: Any) -> bool:
+    """Flexible entity comparison — handles strings, lists, and numbers."""
+    if isinstance(gt_val, list) and isinstance(pred_val, list):
+        gt_set = {_normalize(v) for v in gt_val}
+        pred_set = {_normalize(v) for v in pred_val}
+        return gt_set == pred_set
+    return _normalize(gt_val) == _normalize(pred_val)
+
+
+# ---------------------------------------------------------------------------
+# Task 3 — Resolution Generation
+# ---------------------------------------------------------------------------
+
+def _grade_resolution(ep: Dict[str, Any]) -> Tuple[float, Dict[str, float], str]:
     """
+    Score breakdown:
+        keyword_coverage   0.30 — fraction of required keywords found in response
+        step_coverage      0.30 — fraction of required resolution steps matched
+        tone_compliance    0.25 — apology / urgency / timeline adherence
+        length_adequate    0.10 — response meets minimum length
+        no_empty_steps     0.05 — all resolution steps are non-empty
     """
+    gt = ep["ticket_data"]["ground_truth"]
+    history = ep.get("action_history", [])
+
+    breakdown: Dict[str, float] = {
+        "keyword_coverage": 0.0,
+        "step_coverage": 0.0,
+        "tone_compliance": 0.0,
+        "length_adequate": 0.0,
+        "no_empty_steps": 0.05,  # assume pass unless empty steps found
     }
+
+    respond_action = _last_action_of_type(history, "respond")
+    if respond_action is None:
+        breakdown["no_empty_steps"] = 0.0
+        return 0.0, breakdown, "No respond action found."
+
+    response_text: str = respond_action.get("response_text") or ""
+    resolution_steps: List[str] = respond_action.get("resolution_steps") or []
+    response_lower = response_text.lower()
+
+    # --- Keyword coverage ---
+    required_keywords: List[str] = gt.get("required_keywords", [])
+    if required_keywords:
+        matched_kw = sum(1 for kw in required_keywords if kw.lower() in response_lower)
+        breakdown["keyword_coverage"] = round(0.30 * matched_kw / len(required_keywords), 4)
+
+    # --- Step coverage ---
+    gt_steps: List[str] = gt.get("required_resolution_steps", [])
+    if gt_steps:
+        pred_steps_lower = [_normalize(s) for s in resolution_steps]
+        matched_steps = sum(
+            1 for gs in gt_steps if _normalize(gs) in pred_steps_lower
+        )
+        breakdown["step_coverage"] = round(0.30 * matched_steps / len(gt_steps), 4)
+
+    # --- Tone compliance ---
+    tone_req = gt.get("tone_requirements", {})
+    tone_checks = 0
+    tone_pass = 0
+    if tone_req.get("must_apologize"):
+        tone_checks += 1
+        apology_words = ["apolog", "sorry", "regret", "sincerely"]
+        if any(w in response_lower for w in apology_words):
+            tone_pass += 1
+    if tone_req.get("must_acknowledge_urgency"):
+        tone_checks += 1
+        urgency_words = ["urgent", "immediately", "priority", "asap", "right away", "as soon as"]
+        if any(w in response_lower for w in urgency_words):
+            tone_pass += 1
+    if tone_req.get("must_provide_timeline"):
+        tone_checks += 1
+        timeline_words = ["within", "hours", "minutes", "by end of", "shortly", "today", "tomorrow", "timeline", "expect"]
+        if any(w in response_lower for w in timeline_words):
+            tone_pass += 1
+    if tone_checks > 0:
+        breakdown["tone_compliance"] = round(0.25 * tone_pass / tone_checks, 4)
+    else:
+        breakdown["tone_compliance"] = 0.25  # no tone requirements = full marks
+
+    # --- Length adequate ---
+    min_len = gt.get("expected_response_length_min", 80)
+    if len(response_text) >= min_len:
+        breakdown["length_adequate"] = 0.10
+
+    # --- Non-empty steps ---
+    if not resolution_steps or any(not s.strip() for s in resolution_steps):
+        breakdown["no_empty_steps"] = 0.0
+
+    score = round(min(sum(breakdown.values()), 1.0), 4)
+    parts = ", ".join(f"{k}={v:.2f}" for k, v in breakdown.items())
+    return score, breakdown, f"Task 3: {parts}"
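
To make the Task 1 arithmetic concrete, here is a small worked sketch (the episode dict mimics the internal shape environment.py stores; the ground-truth values are invented for illustration):

from graders import grade_task

ep = {
    "ticket_data": {"ground_truth": {"category": "billing", "priority": "high"}},
    "action_history": [
        {"action_type": "classify", "category": "Billing", "priority": "high"},
        {"action_type": "submit"},
    ],
    "max_steps": 3,
    "step_number": 2,  # classify + submit
}

score, breakdown, feedback = grade_task("task1", ep)
# category_correct = 0.50 (matching is case-insensitive via _normalize),
# priority_correct = 0.40,
# efficiency = 0.10 * (1 - (2 - 1) / 3) ≈ 0.0667, so score ≈ 0.9667
print(score, feedback)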
inference.py CHANGED
@@ -1,351 +1,290 @@
 """
-Baseline LLM inference agent for DevOpsEnv.
 
-This script reads an OpenEnv environment's state() and uses an LLM to generate
-actions that solve the DevOps tasks.
 
-Usage:
-    python inference.py --task task1 --model gpt-4 --hf-token <token>
 """
 import os
 import sys
-import json
-import argparse
 import time
-from pathlib import Path
-from typing import Optional
 
 import requests
-from google import genai
 from openai import OpenAI
 
-# Load .env values from current folder (if present) before reading config.
-def _load_dotenv_from_workspace() -> None:
-    """Load KEY=VALUE pairs from .env into os.environ without overriding existing vars."""
-    dotenv_path = Path(__file__).resolve().parent / ".env"
-    if not dotenv_path.exists():
-        return
-
-    for raw_line in dotenv_path.read_text(encoding="utf-8").splitlines():
-        line = raw_line.strip()
-        if not line or line.startswith("#"):
-            continue
-        if line.startswith("export "):
-            line = line[7:].strip()
-        if "=" not in line:
-            continue
-
-        key, value = line.split("=", 1)
-        key = key.strip()
-        value = value.strip()
-        if not key:
-            continue
-
-        # Remove surrounding quotes if present.
-        if (value.startswith('"') and value.endswith('"')) or (
-            value.startswith("'") and value.endswith("'")
-        ):
-            value = value[1:-1]
-
-        os.environ.setdefault(key, value)
-
-
-_load_dotenv_from_workspace()
-
-# Read config from environment/.env
-API_BASE_URL = os.environ.get("API_BASE_URL", "http://localhost:7860")
-MODEL_NAME = os.environ.get("MODEL_NAME", "gpt-4")
-HF_TOKEN = os.environ.get("HF_TOKEN", "")
-OPENAI_BASE_URL = os.environ.get("OPENAI_BASE_URL", "")
-GEMINI_DEFAULT_MODEL = os.environ.get("GEMINI_MODEL", "gemini-3-flash-preview")
-
-
-def _get_openai_client() -> OpenAI:
-    """Create an OpenAI-compatible client for OpenAI-style chat completions."""
-    api_key = os.environ.get("OPENAI_API_KEY", "sk-test")
-    client_kwargs = {"api_key": api_key}
-    if OPENAI_BASE_URL:
-        client_kwargs["base_url"] = OPENAI_BASE_URL
-    return OpenAI(**client_kwargs)
-
-
-def _get_gemini_client() -> genai.Client:
-    """Create a Gemini client using the official google-genai SDK."""
-    api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("OPENAI_API_KEY", "")
-    if not api_key:
-        raise ValueError("GEMINI_API_KEY is required for Gemini models")
-    return genai.Client(api_key=api_key)
-
-
-def _is_gemini_model(model: str) -> bool:
-    """Detect whether the requested model should use the Gemini SDK path."""
-    m = (model or "").lower()
-    return "gemini" in m
-
-
-def _resolve_gemini_model(model: str) -> str:
-    """Map shorthand Gemini model names to concrete model IDs."""
-    m = (model or "").strip()
-    if not m or m.lower() == "gemini":
-        return GEMINI_DEFAULT_MODEL
-    return m
-
-
-def _generate_action_text(
-    model: str,
-    system_prompt: str,
-    user_prompt: str,
-    openai_client: Optional[OpenAI],
-    gemini_client: Optional[genai.Client],
-) -> str:
-    """Generate model output text using Gemini SDK or OpenAI-compatible chat."""
-    if _is_gemini_model(model):
-        if gemini_client is None:
-            raise ValueError("Gemini client was not initialized")
-        gemini_model = _resolve_gemini_model(model)
-        combined_prompt = (
-            f"System instructions:\n{system_prompt}\n\n"
-            f"User request:\n{user_prompt}"
-        )
-        response = gemini_client.models.generate_content(
-            model=gemini_model,
-            contents=combined_prompt,
-        )
-        return response.text or ""
-
-    if openai_client is None:
-        raise ValueError("OpenAI client was not initialized")
-
-    response = openai_client.chat.completions.create(
-        model=model,
-        messages=[
-            {"role": "system", "content": system_prompt},
-            {"role": "user", "content": user_prompt},
-        ],
-        temperature=0.3,
-        max_tokens=1000,
     )
-    return response.choices[0].message.content or ""
-
-
-def send_request(method: str, endpoint: str, **kwargs):
-    """Send HTTP request to the environment server."""
-    url = f"{API_BASE_URL}{endpoint}"
-    response = requests.request(method, url, timeout=10, **kwargs)
-    response.raise_for_status()
-    return response.json()
-
-
-def run_agent(task_id: str, max_steps: int = 20, model: Optional[str] = None) -> dict:
-    """Run the agent on a specific task."""
-    model = model or MODEL_NAME
-    if _is_gemini_model(model):
-        model = _resolve_gemini_model(model)
-    openai_client: Optional[OpenAI] = None
-    gemini_client: Optional[genai.Client] = None
-    if _is_gemini_model(model):
-        gemini_client = _get_gemini_client()
-    else:
-        openai_client = _get_openai_client()
-
-    # Initialize episode
-    print(f"\n{'='*60}")
-    print(f"Starting task: {task_id}")
-    print(f"Model: {model}")
-    print(f"{'='*60}\n")
-
-    obs = send_request("POST", "/reset", json={"task_id": task_id})
-    episode_id = obs["episode_id"]
-    max_steps = obs["max_steps"]
-
-    print(f"Episode ID: {episode_id}")
-    print(f"Max Steps: {max_steps}")
-    print(f"\nTask: {obs['task_description']}\n")
-
-    step_count = 0
-    total_reward = 0.0
-    actions_taken = []
-
-    while step_count < max_steps:
-        step_count += 1
-
-        # Get current state
-        state = send_request("GET", f"/state?episode_id={episode_id}")
-
-        # Prepare prompt for LLM
-        system_prompt = """You are an expert Linux DevOps engineer/SRE.
-Your job is to diagnose and fix broken systems using bash commands and file edits.
-You are interacting with a simulated Linux environment.
-
-Available actions:
-1. bash_cmd: Execute a bash command
-2. file_edit: Edit a file
-3. submit: Submit when the task is complete
-
-Respond in JSON format with this structure:
-{
-    "action_type": "bash_cmd" | "file_edit" | "submit",
-    "command": "command to execute" (if bash_cmd),
-    "file_path": "/path/to/file" (if file_edit),
-    "file_content": "new file content" (if file_edit),
-    "summary": "why you're taking this action"
 }
 
-Be strategic:
-- Start by diagnosing the system
-- Use ps, systemctl, curl, etc. to understand issues
-- Fix the root cause
-- Submit when done
-"""
-
-        user_prompt = f"""
-Current system state:
-- Task: {obs['task_description']}
-- Step: {state['step_number']}/{state['max_steps']}
-- Reward so far: {state['total_reward']:.3f}
 
-System status:
-{json.dumps(obs['system_state'], indent=2)}
 
-Previous actions: {len(state['history'])} taken so far
 
-History of commands:
-{json.dumps(state['history'][-3:], indent=2) if state['history'] else 'None yet'}
 
-What should I do next? Think step-by-step about what the issue is and how to fix it.
-"""
-
-        try:
-            # Call LLM (Gemini SDK or OpenAI-compatible chat)
-            response_text = _generate_action_text(
-                model=model,
-                system_prompt=system_prompt,
-                user_prompt=user_prompt,
-                openai_client=openai_client,
-                gemini_client=gemini_client,
-            )
-            try:
-                # Try to extract JSON from response
-                if "```json" in response_text:
-                    json_str = response_text.split("```json")[1].split("```")[0]
-                elif "```" in response_text:
-                    json_str = response_text.split("```")[1].split("```")[0]
-                else:
-                    json_str = response_text
-
-                action_data = json.loads(json_str)
-            except (json.JSONDecodeError, IndexError):
-                print(f"Failed to parse LLM response: {response_text[:100]}")
-                # Fallback to simple diagnosis
-                action_data = {"action_type": "bash_cmd", "command": "ps aux"}
-
-        except Exception as e:
-            print(f"LLM error: {e}. Falling back to heuristic...")
-            # Fallback heuristic actions
-            if step_count == 1:
-                action_data = {"action_type": "bash_cmd", "command": "systemctl status nginx"}
-            else:
-                action_data = {"action_type": "submit", "summary": "Diagnostics complete"}
-
-        # Step in environment
-        try:
-            result = send_request("POST", "/step", json={
                "episode_id": episode_id,
-                "action": action_data
            })
-
-            obs = result["observation"]
-            reward = result["reward"]
-            done = result["done"]
-
-            step_count = obs["step_number"]
-            total_reward = reward["total_reward"]
-
-            actions_taken.append(action_data)
-
-            print(f"\nStep {step_count}/{max_steps}")
-            print(f"Action: {action_data['action_type']}")
-            if action_data.get("command"):
-                print(f"Command: {action_data['command']}")
-            elif action_data.get("file_path"):
-                print(f"File: {action_data['file_path']}")
-
-            print(f"Reward: {reward['step_reward']:+.3f} (total: {total_reward:.3f})")
-            print(f"Info: {reward['explanation'][:100]}")
-
-            if done:
-                print(f"\n{'='*60}")
-                print("EPISODE COMPLETE!")
-                print(f"Final Reward: {total_reward:.3f}")
-                print(f"Steps taken: {step_count}")
-                print(f"{'='*60}\n")
-                break
-
-        except Exception as e:
-            print(f"Step error: {e}")
-            break
-
-        # Small delay to avoid rate limiting
-        time.sleep(0.5)
-
-    # Grade the episode
-    try:
-        grade_result = send_request("POST", "/grader", json={"episode_id": episode_id})
-        print(f"\nGrader Results:")
-        print(f"Score: {grade_result['score']:.3f}/1.0")
-        print(f"Breakdown: {json.dumps(grade_result['breakdown'], indent=2)}")
-        print(f"Feedback: {grade_result['feedback']}")
-    except Exception as e:
-        print(f"Grading error: {e}")
-
    return {
        "task_id": task_id,
-        "episode_id": episode_id,
-        "final_reward": total_reward,
-        "step_count": step_count,
-        "actions": actions_taken,
    }
 
 
-def main():
-    parser = argparse.ArgumentParser(description="Run DevOpsEnv baseline agent")
-    parser.add_argument("--task", default="task1", help="Task ID (task1, task2, or task3)")
-    parser.add_argument(
-        "--model",
-        default=None,
-        help=(
-            "Model name (default: env var MODEL_NAME). "
-            "For Gemini, pass a real model ID like gemini-3-flash-preview "
-            "or use --model gemini to auto-resolve to GEMINI_MODEL."
-        ),
-    )
-    parser.add_argument("--api-url", default=None, help="API URL (default: env var API_BASE_URL)")
-    parser.add_argument("--hf-token", default=None, help="HF token (default: env var HF_TOKEN)")
-    parser.add_argument(
-        "--openai-base-url",
-        default=None,
-        help="OpenAI-compatible base URL for non-OpenAI providers (for example Gemini OpenAI API)",
-    )
-
-    args = parser.parse_args()
-
-    # Override env variables if provided
-    global API_BASE_URL, MODEL_NAME, HF_TOKEN, OPENAI_BASE_URL
-    if args.api_url:
-        API_BASE_URL = args.api_url
-    if args.model:
-        MODEL_NAME = args.model
-    if args.hf_token:
-        HF_TOKEN = args.hf_token
-    if args.openai_base_url:
-        OPENAI_BASE_URL = args.openai_base_url
-
-    try:
-        result = run_agent(args.task, model=MODEL_NAME)
-        print(json.dumps(result, indent=2))
-    except Exception as e:
-        print(f"Fatal error: {e}", file=sys.stderr)
-        sys.exit(1)
 
 
 if __name__ == "__main__":
 
 """
+Baseline inference script for SupportEnv.
 
+Runs an LLM agent against all 3 tasks (5 tickets each) and emits the
+mandatory [START]/[STEP]/[END] stdout format.
 
+Environment variables:
+    API_BASE_URL       LLM endpoint (default: https://router.huggingface.co/v1)
+    MODEL_NAME         Model identifier (default: Qwen/Qwen2.5-72B-Instruct)
+    HF_TOKEN           API key
+    API_BASE_URL_ENV   SupportEnv server URL (default: http://localhost:7860)
 """
+import json
 import os
 import sys
 import time
+from typing import Any, Dict, List, Optional
 
 import requests
 from openai import OpenAI
 
+# ---------------------------------------------------------------------------
+# Config from environment
+# ---------------------------------------------------------------------------
+
+API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
+MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
+HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("API_KEY", "")
+ENV_BASE_URL = os.getenv("API_BASE_URL_ENV", "http://localhost:7860")
+
+TEMPERATURE = 0.3
+MAX_TOKENS = 1024
+BENCHMARK = "supportenv"
+
+TASKS = [
+    {"task_id": "task1", "name": "Ticket Classification", "tickets": 5},
+    {"task_id": "task2", "name": "Information Extraction", "tickets": 5},
+    {"task_id": "task3", "name": "Resolution Generation", "tickets": 5},
+]
+
+
+# ---------------------------------------------------------------------------
+# Logging helpers (mandatory format)
+# ---------------------------------------------------------------------------
+
+def log_start(task: str, env: str, model: str) -> None:
+    print(f"[START] task={task} env={env} model={model}", flush=True)
+
+
+def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
+    error_val = error if error else "null"
+    done_val = str(done).lower()
+    print(
+        f"[STEP] step={step} action={action} reward={reward:.2f} done={done_val} error={error_val}",
+        flush=True,
+    )
+
+
+def log_end(success: bool, steps: int, rewards: List[float]) -> None:
+    rewards_str = ",".join(f"{r:.2f}" for r in rewards)
+    print(
+        f"[END] success={str(success).lower()} steps={steps} rewards={rewards_str}",
+        flush=True,
     )
+
+
+# ---------------------------------------------------------------------------
+# Environment HTTP helpers
+# ---------------------------------------------------------------------------
+
+def env_request(method: str, endpoint: str, **kwargs) -> Dict[str, Any]:
+    url = f"{ENV_BASE_URL}{endpoint}"
+    resp = requests.request(method, url, timeout=30, **kwargs)
+    resp.raise_for_status()
+    return resp.json()
+
+
+# ---------------------------------------------------------------------------
+# LLM prompts per task
+# ---------------------------------------------------------------------------
+
+SYSTEM_PROMPTS = {
+    "task1": (
+        "You are an expert customer support triage agent.\n"
+        "Given a support ticket, classify it by:\n"
+        "  category: one of billing | technical | account | feature_request | complaint | general\n"
+        "  priority: one of low | medium | high | critical\n\n"
+        "Respond with ONLY valid JSON:\n"
+        '{"action_type": "classify", "category": "<category>", "priority": "<priority>"}'
+    ),
+    "task2": (
+        "You are an expert information extraction agent for customer support.\n"
+        "Given a support ticket, extract ALL structured entities and identify required actions.\n\n"
+        "Respond with ONLY valid JSON:\n"
+        '{"action_type": "extract", "extracted_entities": {"key": "value", ...}, '
+        '"required_actions": ["action1", "action2", ...]}'
+    ),
+    "task3": (
+        "You are an expert customer support resolution agent.\n"
+        "Given a support ticket, write a professional customer-facing response and "
+        "list the internal resolution steps.\n\n"
+        "Requirements:\n"
+        "- response_text: Professional, empathetic response (80+ chars)\n"
+        "- resolution_steps: Ordered list of internal action identifiers\n"
+        "- If the ticket is urgent, acknowledge urgency and provide a timeline\n"
+        "- If appropriate, include an apology\n\n"
+        "Respond with ONLY valid JSON:\n"
+        '{"action_type": "respond", "response_text": "...", '
+        '"resolution_steps": ["step1", "step2", ...]}'
+    ),
 }
 
 
+def build_user_prompt(task_id: str, ticket: Dict[str, Any]) -> str:
+    parts = [
+        f"Ticket ID: {ticket['ticket_id']}",
+        f"Subject: {ticket['subject']}",
+        f"Body: {ticket['body']}",
+        f"Customer Tier: {ticket['customer_tier']}",
+        f"Account Age: {ticket['account_age_days']} days",
+        f"Previous Tickets: {ticket['previous_tickets']}",
+    ]
+    if ticket.get("attachments"):
+        parts.append(f"Attachments: {', '.join(ticket['attachments'])}")
+    return "\n".join(parts)
 
 
+# ---------------------------------------------------------------------------
+# LLM call
+# ---------------------------------------------------------------------------
+
+def call_llm(client: OpenAI, task_id: str, ticket: Dict[str, Any]) -> Dict[str, Any]:
+    """Call the LLM and parse its JSON response into an action dict."""
+    system_prompt = SYSTEM_PROMPTS[task_id]
+    user_prompt = build_user_prompt(task_id, ticket)
+
+    try:
+        completion = client.chat.completions.create(
+            model=MODEL_NAME,
+            messages=[
+                {"role": "system", "content": system_prompt},
+                {"role": "user", "content": user_prompt},
+            ],
+            temperature=TEMPERATURE,
+            max_tokens=MAX_TOKENS,
+        )
+        text = (completion.choices[0].message.content or "").strip()
+        return _parse_json(text, task_id)
+    except Exception as exc:
+        print(f"[DEBUG] LLM error: {exc}", file=sys.stderr, flush=True)
+        return _fallback_action(task_id)
+
+
+def _parse_json(text: str, task_id: str) -> Dict[str, Any]:
+    """Extract JSON from model output, handling markdown fences."""
+    if "```json" in text:
+        text = text.split("```json")[1].split("```")[0]
+    elif "```" in text:
+        text = text.split("```")[1].split("```")[0]
+    try:
+        return json.loads(text.strip())
+    except json.JSONDecodeError:
+        print(f"[DEBUG] JSON parse failed: {text[:120]}", file=sys.stderr, flush=True)
+        return _fallback_action(task_id)
+
+
+def _fallback_action(task_id: str) -> Dict[str, Any]:
+    """Deterministic fallback when LLM fails."""
+    if task_id == "task1":
+        return {"action_type": "classify", "category": "general", "priority": "medium"}
+    elif task_id == "task2":
+        return {"action_type": "extract", "extracted_entities": {}, "required_actions": []}
+    return {"action_type": "respond", "response_text": "Thank you for contacting support. We are looking into this.", "resolution_steps": []}
+
+
+# ---------------------------------------------------------------------------
+# Run one episode
+# ---------------------------------------------------------------------------
+
+def run_episode(
+    client: OpenAI, task_id: str, task_name: str, ticket_index: int
+) -> Dict[str, Any]:
+    """Run a single episode: reset → action → submit → grade."""
+    log_start(task=f"{task_name}-ticket{ticket_index}", env=BENCHMARK, model=MODEL_NAME)
+
+    rewards: List[float] = []
+    steps_taken = 0
+    success = False
+    error_msg: Optional[str] = None
+
+    try:
+        # Reset
+        obs = env_request("POST", "/reset", json={
+            "task_id": task_id, "ticket_index": ticket_index
+        })
+        episode_id = obs["episode_id"]
+        ticket = obs["ticket"]
+
+        # Step 1: LLM generates the action
+        action_data = call_llm(client, task_id, ticket)
+        result = env_request("POST", "/step", json={
+            "episode_id": episode_id, "action": action_data
+        })
+        steps_taken = 1
+        reward_val = result["reward"]["step_reward"]
+        rewards.append(reward_val)
+        done = result["done"]
+        action_summary = _action_summary(action_data)
+        log_step(step=1, action=action_summary, reward=reward_val, done=done, error=error_msg)
+
+        # Step 2: Submit if not already done
+        if not done:
+            submit_result = env_request("POST", "/step", json={
                "episode_id": episode_id,
+                "action": {"action_type": "submit"},
            })
+            steps_taken = 2
+            reward_val = submit_result["reward"]["step_reward"]
+            rewards.append(reward_val)
+            done = submit_result["done"]
+            log_step(step=2, action="submit()", reward=reward_val, done=done, error=None)
+
+        # Grade
+        grade = env_request("POST", "/grader", json={"episode_id": episode_id})
+        final_score = grade["score"]
+        success = final_score >= 0.5
+
+    except Exception as exc:
+        error_msg = str(exc)
+        print(f"[DEBUG] Episode error: {exc}", file=sys.stderr, flush=True)
+
+    log_end(success=success, steps=steps_taken, rewards=rewards)
+
    return {
        "task_id": task_id,
+        "ticket_index": ticket_index,
+        "steps": steps_taken,
+        "rewards": rewards,
+        "success": success,
    }
 
 
+def _action_summary(action: Dict[str, Any]) -> str:
+    atype = action.get("action_type", "unknown")
+    if atype == "classify":
+        return f"classify({action.get('category')},{action.get('priority')})"
+    elif atype == "extract":
+        ents = action.get("extracted_entities") or {}
+        acts = action.get("required_actions") or []
+        return f"extract({len(ents)}ents,{len(acts)}acts)"
+    elif atype == "respond":
+        tlen = len(action.get("response_text") or "")
+        slen = len(action.get("resolution_steps") or [])
+        return f"respond({tlen}chars,{slen}steps)"
+    return atype
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+def main() -> None:
+    client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
+
+    results = []
+    for task_info in TASKS:
+        task_id = task_info["task_id"]
+        task_name = task_info["name"]
+        num_tickets = task_info["tickets"]
+
+        for ticket_idx in range(num_tickets):
+            result = run_episode(client, task_id, task_name, ticket_idx)
+            results.append(result)
+            time.sleep(0.5)  # rate-limit courtesy
+
+    # Summary
+    print("\n" + "=" * 60, flush=True)
+    print("BASELINE RESULTS SUMMARY", flush=True)
+    print("=" * 60, flush=True)
+    for r in results:
+        status = "PASS" if r["success"] else "FAIL"
+        total_r = sum(r["rewards"])
+        print(
+            f"  {r['task_id']} ticket={r['ticket_index']} "
+            f"steps={r['steps']} reward={total_r:.2f} {status}",
+            flush=True,
+        )
 
 
 if __name__ == "__main__":
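
For reference, a passing task1 episode run through this script would emit stdout along these lines (values are illustrative; the grader score lands in the submit step's reward):

[START] task=Ticket Classification-ticket0 env=supportenv model=Qwen/Qwen2.5-72B-Instruct
[STEP] step=1 action=classify(billing,high) reward=0.02 done=false error=null
[STEP] step=2 action=submit() reward=1.00 done=true error=null
[END] success=true steps=2 rewards=0.02,1.00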
models.py CHANGED
@@ -1,7 +1,8 @@
 """
-Pydantic models for DevOpsEnv OpenEnv environment.
 
-Domain: Linux DevOps & SRE Troubleshooting
 """
 from __future__ import annotations
 
@@ -10,21 +11,18 @@ from pydantic import BaseModel, Field
 
 
 # ---------------------------------------------------------------------------
-# System State Models
 # ---------------------------------------------------------------------------
 
-class SystemState(BaseModel):
-    """Current state of the mock Linux server."""
-    task_id: str
-    available_commands: List[str]
-    filesystem_snapshot: str
-    running_processes: List[Dict[str, Any]]
-    service_status: Dict[str, str]
-    logs: str
-    http_ports_open: List[int]
-    docker_containers: List[Dict[str, str]]
-    cpu_usage: float
-    memory_usage_mb: int
 
 
 # ---------------------------------------------------------------------------
@@ -34,17 +32,17 @@ class SystemState(BaseModel):
 class Observation(BaseModel):
     """Everything the agent sees at each step."""
     task_id: str = Field(description="task1 | task2 | task3")
-    task_description: str = Field(description="Human-readable task description")
-    episode_id: str = Field(description="Unique episode UUID")
-    system_state: SystemState
     thread_history: List[Dict[str, str]] = Field(
         default_factory=list,
-        description="Ordered list of {'role': 'agent'|'system', 'content': str}"
     )
     available_actions: List[str]
     step_number: int
     max_steps: int
-    hint: Optional[str] = Field(default=None)
 
 
 # ---------------------------------------------------------------------------
@@ -52,12 +50,40 @@ class Observation(BaseModel):
 # ---------------------------------------------------------------------------
 
 class Action(BaseModel):
-    """Agent action: run a bash command, edit a file, or submit."""
-    action_type: str = Field(description="bash_cmd | file_edit | submit")
-    command: Optional[str] = Field(default=None, description="Bash command to execute")
-    file_path: Optional[str] = Field(default=None, description="Absolute path to file to edit")
-    file_content: Optional[str] = Field(default=None, description="New full content for the file")
-    summary: Optional[str] = Field(default=None, description="Final summary of actions taken")
 
 
# ---------------------------------------------------------------------------
@@ -94,7 +120,7 @@ class State(BaseModel):
     done: bool
     total_reward: float
     history: List[Dict[str, Any]] = Field(default_factory=list)
-    final_score: Optional[float] = Field(default=None)
 
 
 # ---------------------------------------------------------------------------
@@ -128,4 +154,4 @@ class BaselineResult(BaseModel):
     final_score: float
     step_count: int
     total_reward: float
-    actions: List[Dict[str, Any]]
1
  """
2
+ Pydantic models for SupportEnv Customer Support Ticket Triage.
3
 
4
+ Domain: SaaS customer support automation
5
+ Tasks: classification, information extraction, resolution generation
6
  """
7
  from __future__ import annotations
8
 
 
11
 
12
 
13
  # ---------------------------------------------------------------------------
14
+ # Ticket Info (what the agent sees)
15
  # ---------------------------------------------------------------------------
16
 
17
+ class TicketInfo(BaseModel):
18
+ """A customer support ticket presented to the agent."""
19
+ ticket_id: str
20
+ subject: str
21
+ body: str
22
+ customer_tier: str = Field(description="free | pro | enterprise")
23
+ account_age_days: int
24
+ previous_tickets: int
25
+ attachments: List[str] = Field(default_factory=list)
 
 
 
26
 
27
 
28
  # ---------------------------------------------------------------------------
 
32
  class Observation(BaseModel):
33
  """Everything the agent sees at each step."""
34
  task_id: str = Field(description="task1 | task2 | task3")
35
+ task_description: str
36
+ episode_id: str
37
+ ticket: TicketInfo
38
  thread_history: List[Dict[str, str]] = Field(
39
  default_factory=list,
40
+ description="Ordered list of {'role': 'agent'|'system', 'content': str}",
41
  )
42
  available_actions: List[str]
43
  step_number: int
44
  max_steps: int
45
+ hint: Optional[str] = None
46
 
47
 
48
  # ---------------------------------------------------------------------------
 
50
  # ---------------------------------------------------------------------------
51
 
52
  class Action(BaseModel):
53
+ """Agent action for support ticket processing."""
54
+ action_type: str = Field(
55
+ description="classify | extract | respond | resolve | escalate | submit"
56
+ )
57
+ # Task 1: Classification
58
+ category: Optional[str] = Field(
59
+ default=None,
60
+ description="billing | technical | account | feature_request | complaint | general",
61
+ )
62
+ priority: Optional[str] = Field(
63
+ default=None,
64
+ description="low | medium | high | critical",
65
+ )
66
+ # Task 2: Extraction
67
+ extracted_entities: Optional[Dict[str, Any]] = Field(
68
+ default=None,
69
+ description="Key-value pairs extracted from the ticket",
70
+ )
71
+ required_actions: Optional[List[str]] = Field(
72
+ default=None,
73
+ description="List of actions needed to resolve the ticket",
74
+ )
75
+ # Task 3: Resolution
76
+ response_text: Optional[str] = Field(
77
+ default=None,
78
+ description="Customer-facing response text",
79
+ )
80
+ resolution_steps: Optional[List[str]] = Field(
81
+ default=None,
82
+ description="Ordered list of internal resolution steps",
83
+ )
84
+ # Escalation
85
+ escalation_team: Optional[str] = Field(default=None)
86
+ escalation_reason: Optional[str] = Field(default=None)
87
 
88
 
89
  # ---------------------------------------------------------------------------
 
120
  done: bool
121
  total_reward: float
122
  history: List[Dict[str, Any]] = Field(default_factory=list)
123
+ final_score: Optional[float] = None
124
 
125
 
126
  # ---------------------------------------------------------------------------
 
154
  final_score: float
155
  step_count: int
156
  total_reward: float
157
+ actions: List[Dict[str, Any]]
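For orientation, a short sketch of how an agent might populate `Action` for each task. Field names come straight from the model above; the values are illustrative placeholders, not canonical answers:

    from models import Action

    # Task 1: classify the ticket
    a1 = Action(action_type="classify", category="billing", priority="high")

    # Task 2: extract entities and the actions needed to resolve
    a2 = Action(
        action_type="extract",
        extracted_entities={"account_id": "ACC-78234"},
        required_actions=["issue_refund"],
    )

    # Task 3: draft a customer-facing resolution
    a3 = Action(
        action_type="respond",
        response_text="We apologize for the billing error and will refund the difference.",
        resolution_steps=["verify_invoice", "issue_refund"],
    )

    # Escalate or finish the episode at any point
    a4 = Action(action_type="escalate", escalation_team="billing-ops",
                escalation_reason="refund above limit")
    a5 = Action(action_type="submit")

Because every task-specific field is `Optional` with a `None` default, a single `Action` schema serves all six `action_type` values; the environment is expected to read only the fields relevant to the chosen type.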
openenv.yaml CHANGED
@@ -57,14 +57,6 @@ interface:
       episode_id: string
     response: GraderResponse
 
-  baseline:
-    method: POST
-    path: /baseline
-    request:
-      model: string          # optional, default heuristic
-      ticket_index: integer  # optional, default 0
-    response: BaselineResult
-
   health:
     method: GET
     path: /health
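A quick liveness probe against the `/health` route declared above (a sketch; the hostname is a placeholder for your deployed Space URL):

    import requests

    # Placeholder host; substitute your own Space URL.
    resp = requests.get("https://<your-space>.hf.space/health", timeout=10)
    print(resp.status_code)  # 200 expected when the server is up

This mirrors the container HEALTHCHECK, which hits the same endpoint from inside the image.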
requirements.txt CHANGED
@@ -5,4 +5,3 @@ openai>=1.35.0
 httpx>=0.27.0
 python-multipart>=0.0.9
 requests>=2.31.0
-google-genai>=1.15.0
test_integration.py CHANGED
@@ -1,116 +1,112 @@
 """
-Quick integration test for DevOpsEnv.
+Integration test for SupportEnv.
 
-This runs a full episode for each task to verify everything works.
+Runs a full episode for each task and prints results.
+Usage: python test_integration.py
 """
 import environment as env
 from models import Action
 
 
-def test_task(task_id: str, max_test_steps: int = 5):
-    """Test a single task."""
+def test_task(task_id: str) -> bool:
+    """Run a full episode for a task. Returns True if passed."""
     print(f"\n{'='*60}")
     print(f"Testing {task_id}")
     print(f"{'='*60}")
-
+
     # Reset
-    print("1. Calling reset()...")
-    obs = env.reset(task_id)
+    print("1. reset()...")
+    obs = env.reset(task_id, ticket_index=0)
     episode_id = obs.episode_id
-    print(f"[OK] Episode created: {episode_id}")
-    print(f"   Task: {obs.task_description[:80]}...")
-    print(f"   Max steps: {obs.max_steps}")
-    print(f"   System state: cpu={obs.system_state.cpu_usage:.1f}%, mem={obs.system_state.memory_usage_mb}MB")
-
-    # Take steps
-    print(f"\n2. Taking {max_test_steps} steps...")
-    for i in range(max_test_steps):
-        if task_id == "task1":
-            if i == 0:
-                action = Action(action_type="bash_cmd", command="systemctl status nginx")
-            elif i == 1:
-                action = Action(action_type="bash_cmd", command="systemctl try-restart nginx")
-            elif i == 2:
-                action = Action(action_type="bash_cmd", command="nginx -t")
-            else:
-                action = Action(action_type="bash_cmd", command="curl http://localhost")
-        elif task_id == "task2":
-            if i == 0:
-                action = Action(action_type="bash_cmd", command="cat /srv/docker-compose.yml")
-            elif i == 1:
-                action = Action(
-                    action_type="file_edit",
-                    file_path="/srv/docker-compose.yml",
-                    file_content="version: '3.8'\nservices:\n  mockapi:\n    image: mockapi:latest\n    ports:\n      - \"3000:3000\""
-                )
-            elif i == 2:
-                action = Action(action_type="bash_cmd", command="docker-compose up -d")
-            else:
-                action = Action(action_type="bash_cmd", command="docker ps")
-        else:  # task3
-            if i == 0:
-                action = Action(action_type="bash_cmd", command="ps aux | grep python")
-            elif i == 1:
-                action = Action(action_type="bash_cmd", command="kill 300")
-            elif i == 2:
-                action = Action(
-                    action_type="file_edit",
-                    file_path="/opt/mockapi/app.py",
-                    file_content="import json\nfrom flask import Flask\n\napp = Flask(__name__)\n\n@app.route('/api/data')\ndef get_data():\n    return json.dumps({'status': 'ok'})\n\nif __name__ == '__main__':\n    app.run()\n"
-                )
-            else:
-                action = Action(action_type="bash_cmd", command="python3 /opt/mockapi/app.py &")
-
-        try:
-            result = env.step(episode_id, action)
-            print(f"   Step {i+1}: {action.action_type} - Reward: {result.reward.step_reward:+.3f}")
-            if result.done:
-                print(f"   -> Episode completed early")
-                break
-        except Exception as e:
-            print(f"   Step {i+1} ERROR: {e}")
-            break
-
-    # Check state
-    print(f"\n3. Calling get_state()...")
-    state = env.get_state(episode_id)
-    print(f"[OK] State: step_number={state.step_number}, total_reward={state.total_reward:.3f}, done={state.done}")
-
-    # Finish episode if not already done
-    if not state.done:
-        print(f"\n4. Calling submit()...")
+    print(f"   [OK] episode_id={episode_id[:8]}...")
+    print(f"   ticket_id={obs.ticket.ticket_id} subject={obs.ticket.subject[:50]}")
+    print(f"   max_steps={obs.max_steps} hint={obs.hint}")
+
+    # Take a relevant action
+    print("2. step() with task action...")
+    if task_id == "task1":
+        action = Action(action_type="classify", category="billing", priority="high")
+    elif task_id == "task2":
+        action = Action(
+            action_type="extract",
+            extracted_entities={"customer_name": "Robert Chen", "account_id": "ACC-78234"},
+            required_actions=["issue_refund"],
+        )
+    else:  # task3
+        action = Action(
+            action_type="respond",
+            response_text=(
+                "We sincerely apologize for the inconvenience with your password reset. "
+                "We will manually reset your password and send a new email immediately. "
+                "Please check your spam folder and whitelist our domain. "
+                "We will resolve this within the next 30 minutes."
+            ),
+            resolution_steps=[
+                "verify_email_delivery",
+                "check_spam_filters",
+                "manual_password_reset",
+                "follow_up_confirmation",
+            ],
+        )
+
+    result = env.step(episode_id, action)
+    print(f"   [OK] step_reward={result.reward.step_reward:+.4f} done={result.done}")
+
+    # Submit
+    print("3. step() submit...")
+    if not result.done:
         result = env.step(episode_id, Action(action_type="submit"))
-        print(f"[OK] Episode submitted, done={result.done}")
-
+    print(f"   [OK] done={result.done} total_reward={result.reward.total_reward:.4f}")
+
+    # State
+    print("4. get_state()...")
+    state = env.get_state(episode_id)
+    print(f"   [OK] steps={state.step_number} history_len={len(state.history)}")
+
     # Grade
-    print(f"\n5. Calling grade()...")
-    try:
-        score, breakdown, feedback = env.grade(episode_id)
-        print(f"[OK] Score: {score:.3f}/1.0")
-        print(f"   Breakdown: {breakdown}")
-        print(f"   Feedback: {feedback}")
-    except Exception as e:
-        print(f"[ERROR] Grading error: {e}")
+    print("5. grade()...")
+    score, breakdown, feedback = env.grade(episode_id)
+    print(f"   [OK] score={score:.4f}/1.0")
+    print(f"   breakdown: {', '.join(f'{k}={v:.2f}' for k, v in breakdown.items())}")
+    print(f"   feedback: {feedback}")
+
+    passed = score >= 0.0  # just verify pipeline works
+    return passed
 
 
 def main():
-    """Run all tests."""
-    print("DevOpsEnv Integration Test")
-    print("="*60)
-
-    try:
-        test_task("task1", max_test_steps=5)
-        test_task("task2", max_test_steps=5)
-        test_task("task3", max_test_steps=5)
-
-        print(f"\n{'='*60}")
-        print("[OK] All tests completed successfully!")
-        print(f"{'='*60}")
-    except Exception as e:
-        print(f"\n[ERROR] Test failed: {e}")
-        import traceback
-        traceback.print_exc()
+    print("SupportEnv Integration Test")
+    print("=" * 60)
+
+    results = []
+    for task_id in ["task1", "task2", "task3"]:
+        try:
+            ok = test_task(task_id)
+            results.append((task_id, ok, None))
+        except Exception as exc:
+            import traceback
+            traceback.print_exc()
+            results.append((task_id, False, str(exc)))
+        finally:
+            env._EPISODES.clear()
+
+    print(f"\n{'='*60}")
+    print("SUMMARY")
+    print("=" * 60)
+    all_ok = True
+    for task_id, ok, err in results:
+        status = "[PASS]" if ok else "[FAIL]"
+        print(f"  {status} {task_id}" + (f" — {err}" if err else ""))
+        if not ok:
+            all_ok = False
+
+    if all_ok:
+        print("\n[OK] All integration tests passed!")
+    else:
+        print("\n[FAIL] Some tests failed.")
+    return 0 if all_ok else 1
 
 
 if __name__ == "__main__":
-    main()
+    import sys
+    sys.exit(main())
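Condensed, the five-call episode lifecycle this integration test exercises looks like the sketch below (in-process, using only functions that appear in the diff above; the classify values are illustrative):

    import environment as env
    from models import Action

    obs = env.reset("task1", ticket_index=0)                    # 1. new episode
    env.step(obs.episode_id, Action(action_type="classify",
                                    category="billing",
                                    priority="high"))           # 2. act on the ticket
    env.step(obs.episode_id, Action(action_type="submit"))      # 3. finish the episode
    state = env.get_state(obs.episode_id)                       # 4. inspect accumulated state
    score, breakdown, feedback = env.grade(obs.episode_id)      # 5. grade the finished episode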
tests_new.py CHANGED
@@ -1,11 +1,9 @@
 """
-Comprehensive tests for DevOpsEnv.
+Comprehensive tests for SupportEnv.
 
-Run with: pytest tests/test_environment.py -v
+Run with: pytest tests_new.py -v
 """
 import pytest
-import json
-from unittest.mock import patch
 
 import environment as env
 from models import Action, Observation, StepResult, State
@@ -14,232 +12,245 @@ from data import TASK_META
 
 class TestReset:
     """Test episode reset functionality."""
-
-    def test_reset_valid_task(self):
-        """Reset creates a valid episode."""
+
+    def test_reset_task1(self):
         obs = env.reset("task1")
-
         assert isinstance(obs, Observation)
         assert obs.task_id == "task1"
         assert obs.episode_id is not None
-        assert len(obs.episode_id) > 0
-        assert obs.system_state is not None
+        assert obs.ticket is not None
         assert obs.step_number == 0
         assert obs.max_steps == TASK_META["task1"]["max_steps"]
-
+
+    def test_reset_task2(self):
+        obs = env.reset("task2")
+        assert obs.task_id == "task2"
+        assert obs.ticket.ticket_id.startswith("T2-")
+
+    def test_reset_task3(self):
+        obs = env.reset("task3")
+        assert obs.task_id == "task3"
+        assert obs.ticket.ticket_id.startswith("T3-")
+
+    def test_reset_with_ticket_index(self):
+        obs = env.reset("task1", ticket_index=2)
+        assert obs.ticket.ticket_id == "T1-003"
+
     def test_reset_invalid_task(self):
-        """Reset raises error for unknown task."""
         with pytest.raises(ValueError):
-            env.reset("invalid_task")
-
-    def test_reset_creates_episode_state(self):
-        """Reset creates episode in internal state."""
-        obs = env.reset("task2")
-
+            env.reset("task_unknown")
+
+    def test_reset_invalid_ticket_index(self):
+        with pytest.raises(ValueError):
+            env.reset("task1", ticket_index=99)
+
+    def test_reset_creates_episode(self):
+        obs = env.reset("task1")
         assert obs.episode_id in env._EPISODES
         ep = env._EPISODES[obs.episode_id]
-        assert ep["task_id"] == "task2"
+        assert ep["task_id"] == "task1"
         assert ep["step_number"] == 0
         assert ep["done"] is False
 
+    def test_reset_hint_on_first_step(self):
+        obs = env.reset("task1")
+        assert obs.hint is not None
+
 
 class TestStep:
     """Test step execution."""
-
-    def test_step_bash_command(self):
-        """Step handles bash_cmd action."""
+
+    def test_step_classify(self):
         obs = env.reset("task1")
-
-        action = Action(action_type="bash_cmd", command="systemctl status nginx")
+        action = Action(action_type="classify", category="billing", priority="high")
         result = env.step(obs.episode_id, action)
-
         assert isinstance(result, StepResult)
         assert result.observation.step_number == 1
-        assert result.reward.step_reward != 0
-        assert result.done is False
-
-    def test_step_file_edit(self):
-        """Step handles file_edit action."""
+        assert result.reward.step_reward is not None
+
+    def test_step_extract(self):
         obs = env.reset("task2")
-
         action = Action(
-            action_type="file_edit",
-            file_path="/srv/docker-compose.yml",
-            file_content="version: '3.8'\nservices:\n  test: {}"
+            action_type="extract",
+            extracted_entities={"customer_name": "Alice"},
+            required_actions=["issue_refund"],
         )
         result = env.step(obs.episode_id, action)
-
-        assert isinstance(result, StepResult)
         assert result.observation.step_number == 1
-
-    def test_step_submit(self):
-        """Step with submit action marks episode done."""
-        obs = env.reset("task1")
-
-        action = Action(action_type="submit", summary="Done")
+
+    def test_step_respond(self):
+        obs = env.reset("task3")
+        action = Action(
+            action_type="respond",
+            response_text="Thank you for reaching out. We sincerely apologize for the inconvenience and will resolve this immediately.",
+            resolution_steps=["verify_account", "issue_refund"],
+        )
         result = env.step(obs.episode_id, action)
-
+        assert result.observation.step_number == 1
+
+    def test_step_submit_marks_done(self):
+        obs = env.reset("task1")
+        result = env.step(obs.episode_id, Action(action_type="submit"))
         assert result.done is True
         assert result.observation.available_actions == []
-
+
     def test_step_invalid_episode(self):
-        """Step raises error for invalid episode."""
-        action = Action(action_type="bash_cmd", command="ls")
-
         with pytest.raises(KeyError):
-            env.step("invalid_episode_id", action)
-
-    def test_step_after_done(self):
-        """Step raises error after episode is done."""
+            env.step("nonexistent-id", Action(action_type="submit"))
+
+    def test_step_after_done_raises(self):
         obs = env.reset("task1")
-
-        # End the episode
-        action1 = Action(action_type="submit")
-        env.step(obs.episode_id, action1)
-
-        # Try to step again
+        env.step(obs.episode_id, Action(action_type="submit"))
         with pytest.raises(ValueError):
-            env.step(obs.episode_id, action1)
-
-    def test_step_max_steps_limit(self):
-        """Episode ends after max_steps."""
+            env.step(obs.episode_id, Action(action_type="submit"))
+
+    def test_step_max_steps_ends_episode(self):
         obs = env.reset("task1")
         max_steps = obs.max_steps
-
         for i in range(max_steps):
-            action = Action(action_type="bash_cmd", command="ps aux")
+            action = Action(action_type="classify", category="general", priority="low")
             result = env.step(obs.episode_id, action)
-
-            if i < max_steps - 1:
-                assert result.done is False
-            else:
-                assert result.done is True
+        assert result.done is True
+
+    def test_thread_history_grows(self):
+        obs = env.reset("task1")
+        env.step(obs.episode_id, Action(action_type="classify", category="billing", priority="high"))
+        result = env.step(obs.episode_id, Action(action_type="submit"))
+        assert len(result.observation.thread_history) == 2
 
 
 class TestState:
     """Test state retrieval."""
-
-    def test_get_state(self):
-        """get_state returns current episode state."""
-        obs = env.reset("task3")
-
+
+    def test_get_state_initial(self):
+        obs = env.reset("task1")
         state = env.get_state(obs.episode_id)
-
         assert isinstance(state, State)
         assert state.episode_id == obs.episode_id
-        assert state.task_id == "task3"
+        assert state.task_id == "task1"
         assert state.step_number == 0
         assert state.done is False
-
-    def test_get_state_invalid_episode(self):
-        """get_state raises error for invalid episode."""
+
+    def test_get_state_invalid(self):
         with pytest.raises(KeyError):
-            env.get_state("invalid_id")
-
-    def test_state_history(self):
-        """State includes action history."""
-        obs = env.reset("task1")
-
-        # Take Actions
-        action1 = Action(action_type="bash_cmd", command="ps aux")
-        env.step(obs.episode_id, action1)
-
+            env.get_state("bad-id")
+
+    def test_state_history_after_step(self):
+        obs = env.reset("task2")
+        env.step(obs.episode_id, Action(action_type="extract", extracted_entities={}, required_actions=[]))
         state = env.get_state(obs.episode_id)
-
         assert len(state.history) == 1
-        assert state.history[0]["action_type"] == "bash_cmd"
+        assert state.history[0]["action_type"] == "extract"
 
 
-class TestGrading:
-    """Test episode grading."""
-
-    def test_grade_task1_nginx_running(self):
-        """Task 1 grades based on nginx status."""
-        obs = env.reset("task1")
-
-        # Run commands to fix nginx
-        env.step(obs.episode_id, Action(action_type="bash_cmd", command="systemctl restart nginx"))
-        env.step(obs.episode_id, Action(action_type="bash_cmd", command="nginx -t"))
-        env.step(obs.episode_id, Action(action_type="bash_cmd", command="curl http://localhost"))
+class TestGraders:
+    """Test grading for each task."""
+
+    def test_grade_task1_perfect(self):
+        """Correct category + priority on ticket 0 (billing/high)."""
+        obs = env.reset("task1", ticket_index=0)
+        env.step(obs.episode_id, Action(action_type="classify", category="billing", priority="high"))
         env.step(obs.episode_id, Action(action_type="submit"))
-
         score, breakdown, feedback = env.grade(obs.episode_id)
-
-        assert 0.0 <= score <= 1.0
-        assert "nginx_running" in breakdown
-        assert "config_valid" in breakdown
-        assert "http_200" in breakdown
-
-    def test_grade_invalid_episode(self):
-        """grade raises error for invalid episode."""
-        with pytest.raises(KeyError):
-            env.grade("invalid_id")
-
-    def test_grade_not_done(self):
-        """grade raises error if episode not done."""
+        assert score >= 0.9
+        assert breakdown["category_correct"] == 0.50
+        assert breakdown["priority_correct"] == 0.40
+
+    def test_grade_task1_wrong_category(self):
+        obs = env.reset("task1", ticket_index=0)
+        env.step(obs.episode_id, Action(action_type="classify", category="technical", priority="high"))
+        env.step(obs.episode_id, Action(action_type="submit"))
+        score, breakdown, _ = env.grade(obs.episode_id)
+        assert breakdown["category_correct"] == 0.0
+        assert breakdown["priority_correct"] == 0.40
+
+    def test_grade_task1_no_classify_action(self):
+        obs = env.reset("task1")
+        env.step(obs.episode_id, Action(action_type="submit"))
+        score, _, _ = env.grade(obs.episode_id)
+        assert score == 0.0
+
+    def test_grade_task2_entities(self):
+        obs = env.reset("task2", ticket_index=0)
+        env.step(obs.episode_id, Action(
+            action_type="extract",
+            extracted_entities={
+                "customer_name": "Robert Chen",
+                "account_id": "ACC-78234",
+                "invoice_number": "INV-20240312",
+                "incorrect_amount": "199.00",
+                "correct_amount": "99.00",
+                "refund_amount": "100.00",
+            },
+            required_actions=["issue_refund", "send_corrected_invoice"],
+        ))
+        env.step(obs.episode_id, Action(action_type="submit"))
+        score, breakdown, _ = env.grade(obs.episode_id)
+        assert breakdown["entity_coverage"] == pytest.approx(0.60, abs=0.01)
+        assert breakdown["action_coverage"] == pytest.approx(0.30, abs=0.01)
+
+    def test_grade_task3_keywords_and_steps(self):
+        obs = env.reset("task3", ticket_index=0)
+        env.step(obs.episode_id, Action(
+            action_type="respond",
+            response_text=(
+                "We sincerely apologize for the password reset issue. "
+                "We will send a new reset email and ask you to check your spam folder "
+                "and whitelist our domain. We will have this resolved within the hour."
+            ),
+            resolution_steps=[
+                "verify_email_delivery",
+                "check_spam_filters",
+                "manual_password_reset",
+                "follow_up_confirmation",
+            ],
+        ))
+        env.step(obs.episode_id, Action(action_type="submit"))
+        score, breakdown, _ = env.grade(obs.episode_id)
+        assert score >= 0.7
+        assert breakdown["length_adequate"] == 0.10
+        assert breakdown["no_empty_steps"] == 0.05
+
+    def test_grade_not_done_raises(self):
        obs = env.reset("task1")
-        # Don't finish the episode
-
         with pytest.raises(ValueError):
             env.grade(obs.episode_id)
 
+    def test_grade_invalid_episode_raises(self):
+        with pytest.raises(KeyError):
+            env.grade("bad-id")
 
-class TestSystemSimulation:
-    """Test mock system state simulation."""
-
-    def test_task1_initial_state(self):
-        """Task 1 initializes with nginx crashed."""
-        obs = env.reset("task1")
-
-        assert obs.system_state.service_status.get("nginx") == "inactive"
-        assert 80 not in obs.system_state.http_ports_open
-
-    def test_task2_initial_state(self):
-        """Task 2 initializes with docker misconfigured."""
-        obs = env.reset("task2")
-
-        assert obs.system_state.service_status.get("docker") == "active"
-        assert 80 in obs.system_state.http_ports_open
-
-    def test_task3_initial_state(self):
-        """Task 3 initializes with memory leak."""
-        obs = env.reset("task3")
-
-        assert obs.system_state.service_status.get("mockapi") == "active"
-        # Should have high memory usage
-        assert obs.system_state.memory_usage_mb > 1024
+    def test_score_in_range(self):
+        for task_id in ["task1", "task2", "task3"]:
+            obs = env.reset(task_id)
+            env.step(obs.episode_id, Action(action_type="submit"))
+            score, _, _ = env.grade(obs.episode_id)
+            assert 0.0 <= score <= 1.0
 
 
 class TestRewards:
-    """Test reward calculation."""
-
-    def test_step_reward_positive(self):
-        """Taking actions yields positive reward."""
+    """Test reward signals."""
+
+    def test_step_reward_is_float(self):
         obs = env.reset("task1")
-
-        action = Action(action_type="bash_cmd", command="ps aux")
-        result = env.step(obs.episode_id, action)
-
-        assert result.reward.step_reward > -1.0  # Not all negative
-
-    def test_total_reward_accumulation(self):
-        """Total reward accumulates across steps."""
+        result = env.step(obs.episode_id, Action(action_type="classify", category="billing", priority="high"))
+        assert isinstance(result.reward.step_reward, float)
+
+    def test_total_reward_accumulates(self):
+        obs = env.reset("task2")
+        r1 = env.step(obs.episode_id, Action(action_type="extract", extracted_entities={}, required_actions=[]))
+        r2 = env.step(obs.episode_id, Action(action_type="submit"))
+        assert r2.reward.total_reward != r1.reward.total_reward
+
+    def test_submit_bonus_applied(self):
         obs = env.reset("task1")
-
-        env.step(obs.episode_id, Action(action_type="bash_cmd", command="ps aux"))
-        result1 = env.step(obs.episode_id, Action(action_type="bash_cmd", command="ls"))
-        total1 = result1.reward.total_reward
-
-        result2 = env.step(obs.episode_id, Action(action_type="bash_cmd", command="pwd"))
-        total2 = result2.reward.total_reward
-
-        # Total reward should accumulate
-        assert total2 >= total1 or total2 < total1  # Can go either way depending on grader
+        result = env.step(obs.episode_id, Action(action_type="submit"))
+        # submit_bonus=0.05 minus step_cost=0.02 = +0.03 base before grader
+        assert result.reward.step_reward > 0.0
 
 
 @pytest.fixture(autouse=True)
 def cleanup():
-    """Clean up episodes after each test."""
     yield
     env._EPISODES.clear()
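While iterating on a single grader, the suite can be narrowed to one class or test with pytest's standard selectors; a convenience sketch using the programmatic entry point:

    import pytest

    # Equivalent to: pytest tests_new.py::TestGraders -v
    pytest.main(["tests_new.py::TestGraders", "-v"])

The autouse `cleanup` fixture clears `env._EPISODES` after every test, so tests stay independent regardless of which subset runs.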