Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- Dockerfile +14 -0
- README.md +254 -10
- __init__.py +3 -0
- client.py +3 -0
- examples/rl/train_q_agent.py +278 -0
- graders.py +3 -0
- inference.py +151 -0
- models.py +3 -0
- openenv.yaml +3 -0
- pyproject.toml +29 -0
- pytest-cache-files-0ad_osqx/CACHEDIR.TAG +4 -0
- pytest-cache-files-0ad_osqx/README.md +8 -0
- pytest-cache-files-7cu1ajqk/CACHEDIR.TAG +4 -0
- pytest-cache-files-7cu1ajqk/README.md +8 -0
- pytest-cache-files-8nb5ix7f/CACHEDIR.TAG +4 -0
- pytest-cache-files-8nb5ix7f/README.md +8 -0
- pytest-cache-files-fy9fhtya/CACHEDIR.TAG +4 -0
- pytest-cache-files-fy9fhtya/README.md +8 -0
- pytest-cache-files-i3h6kocm/CACHEDIR.TAG +4 -0
- pytest-cache-files-i3h6kocm/README.md +8 -0
- pytest-cache-files-l0dcjql5/CACHEDIR.TAG +4 -0
- pytest-cache-files-l0dcjql5/README.md +8 -0
- pytest-cache-files-ofsx67d1/CACHEDIR.TAG +4 -0
- pytest-cache-files-ofsx67d1/README.md +8 -0
- requirements.txt +1 -0
- scripts/validate-submission.sh +28 -0
- server/__init__.py +1 -0
- server/app.py +33 -0
- server/supportdesk_environment.py +3 -0
- supportdesk_env/__init__.py +46 -0
- supportdesk_env/client.py +31 -0
- supportdesk_env/graders.py +131 -0
- supportdesk_env/models.py +103 -0
- supportdesk_env/openenv_compat.py +76 -0
- supportdesk_env/policies.py +72 -0
- supportdesk_env/server/__init__.py +1 -0
- supportdesk_env/server/app.py +33 -0
- supportdesk_env/server/supportdesk_environment.py +241 -0
- supportdesk_env/tasks.py +281 -0
- tasks.py +3 -0
- tests/test_supportdesk.py +58 -0
- uv.lock +0 -0
Dockerfile
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim
|
| 2 |
+
|
| 3 |
+
ENV PYTHONDONTWRITEBYTECODE=1
|
| 4 |
+
ENV PYTHONUNBUFFERED=1
|
| 5 |
+
|
| 6 |
+
WORKDIR /app
|
| 7 |
+
|
| 8 |
+
COPY . /app
|
| 9 |
+
RUN pip install --no-cache-dir --upgrade pip && pip install --no-cache-dir -r /app/requirements.txt
|
| 10 |
+
|
| 11 |
+
EXPOSE 8000
|
| 12 |
+
|
| 13 |
+
ENV ENABLE_WEB_INTERFACE=true
|
| 14 |
+
CMD ["python", "-m", "supportdesk_env.server.app"]
|
README.md
CHANGED
|
@@ -1,10 +1,254 @@
|
|
| 1 |
-
---
|
| 2 |
-
title:
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
--
|
| 9 |
-
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: SupportDesk OpenEnv Environment
|
| 3 |
+
sdk: docker
|
| 4 |
+
app_port: 8000
|
| 5 |
+
tags:
|
| 6 |
+
- openenv
|
| 7 |
+
- reinforcement-learning
|
| 8 |
+
- customer-support
|
| 9 |
+
base_path: /web
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
# SupportDesk OpenEnv Environment
|
| 13 |
+
|
| 14 |
+
SupportDesk is best thought of as an enterprise operations-desk environment, not a generic support classifier.
|
| 15 |
+
|
| 16 |
+
SupportDesk is a real-world RL environment for enterprise support operations. The agent receives a realistic inbound ticket, a small internal knowledge base, and the live case state. It must route the case, set the right priority, decide whether to request more information, draft the customer response, add an internal note, and submit the case with the correct final status.
|
| 17 |
+
|
| 18 |
+
This environment is intentionally built around work humans actually do every day in B2B SaaS support queues. It is not a toy chat task and it is not a game. The environment includes enterprise mechanics such as SLA countdowns, business-impact context, and distracting secondary concerns, so the agent has to prioritize the primary operational issue instead of just pattern-matching keywords.
|
| 19 |
+
|
| 20 |
+
## Environment Description and Motivation
|
| 21 |
+
|
| 22 |
+
The goal of this environment is to model a real operational gap in agent evaluation: many support benchmarks only test whether a model can produce a plausible reply, but real support work also requires correct routing, escalation, information gathering, and final disposition decisions. SupportDesk is designed to evaluate whether an agent can handle enterprise support operations end to end rather than just generate support-sounding text.
|
| 23 |
+
|
| 24 |
+
This makes the environment useful for both:
|
| 25 |
+
|
| 26 |
+
- training agents to improve multi-step support operations behavior
|
| 27 |
+
- evaluating whether an agent can make safe and business-correct support decisions under pressure
|
| 28 |
+
|
| 29 |
+
## Why this should score well
|
| 30 |
+
|
| 31 |
+
- Real-world utility: customer support triage is a real production workflow with immediate evaluation value.
|
| 32 |
+
- Deterministic grading: every task has an explicit gold queue, priority, issue type, required follow-up fields, reply markers, note markers, status, and resolution code.
|
| 33 |
+
- Dense rewards: each step gets rewarded from the delta in the deterministic grader, which gives partial progress rather than only a binary terminal signal.
|
| 34 |
+
- Reproducible baseline: `inference.py` runs all tasks in a fixed order and falls back to a deterministic heuristic policy if model credentials are unavailable.
|
| 35 |
+
- Novel mechanics: observations expose SLA pressure, business impact, and secondary concerns, which makes the environment closer to an enterprise operations desk than a plain support classifier.
|
| 36 |
+
|
| 37 |
+
## Why this is more novel than a standard support benchmark
|
| 38 |
+
|
| 39 |
+
- It is not just routing or intent classification. The agent has to combine queueing, urgency, customer communication, internal notes, and final disposition in one trajectory.
|
| 40 |
+
- It models primary-vs-secondary issue prioritization. The hardest task includes a tempting compliance side-question that should not override the live outage.
|
| 41 |
+
- It encodes enterprise pressure directly in the observation through SLA countdowns, affected-user counts, and business-impact context.
|
| 42 |
+
- It evaluates operational judgment, not just answer quality. A polished reply with the wrong queue, wrong escalation choice, or premature resolution still scores poorly.
|
| 43 |
+
- It is built specifically for OpenEnv-style agent learning and evaluation, where the same environment can be used for baseline runs, external agents, and RL experiments.
|
| 44 |
+
|
| 45 |
+
## Action Space
|
| 46 |
+
|
| 47 |
+
Each `step()` takes a typed `SupportDeskAction` with:
|
| 48 |
+
|
| 49 |
+
- `operation`: one of `classify`, `request_info`, `draft_reply`, `add_internal_note`, `submit`
|
| 50 |
+
- `queue`
|
| 51 |
+
- `priority`
|
| 52 |
+
- `issue_type`
|
| 53 |
+
- `status`
|
| 54 |
+
- `resolution_code`
|
| 55 |
+
- `requested_fields`
|
| 56 |
+
- `reply`
|
| 57 |
+
- `internal_note`
|
| 58 |
+
|
| 59 |
+
The environment allows the agent to update multiple fields in one structured action, which keeps the workflow realistic and helps training.
|
| 60 |
+
|
| 61 |
+
## Observation Space
|
| 62 |
+
|
| 63 |
+
Each observation contains:
|
| 64 |
+
|
| 65 |
+
- `task_id`, `difficulty`, and the agent objective
|
| 66 |
+
- the inbound `ticket`
|
| 67 |
+
- ticket-level urgency metadata such as `affected_users`, `sla_minutes_remaining`, `business_impact`, and `secondary_concerns`
|
| 68 |
+
- `knowledge_base` policy snippets
|
| 69 |
+
- allowed queues, priorities, statuses, and issue types
|
| 70 |
+
- the mutable `case` snapshot
|
| 71 |
+
- `action_history`
|
| 72 |
+
- `feedback`
|
| 73 |
+
- `remaining_steps`
|
| 74 |
+
- the standard OpenEnv `reward` and `done`
|
| 75 |
+
|
| 76 |
+
## OpenEnv Interface
|
| 77 |
+
|
| 78 |
+
The environment implements the standard OpenEnv API:
|
| 79 |
+
|
| 80 |
+
- `reset()` returns the initial typed observation for a new case
|
| 81 |
+
- `step(action)` returns the next typed observation together with reward and done status
|
| 82 |
+
- `state()` returns the current typed environment state
|
| 83 |
+
- `openenv.yaml` provides environment metadata used by validators and deployment tooling
|
| 84 |
+
|
| 85 |
+
The implementation uses typed Pydantic models for action, observation, and state.
|
| 86 |
+
|
| 87 |
+
## Task Descriptions with Expected Difficulty
|
| 88 |
+
|
| 89 |
+
1. `billing_refund_easy` — Expected difficulty: easy
|
| 90 |
+
Duplicate-charge billing ticket. The correct path is immediate billing routing, a refund confirmation, and case resolution.
|
| 91 |
+
2. `account_takeover_medium` — Expected difficulty: medium
|
| 92 |
+
Suspicious-login security ticket. The agent must escalate to trust and safety, request verification details, and keep the case waiting on the customer.
|
| 93 |
+
3. `api_incident_hard` — Expected difficulty: hard
|
| 94 |
+
Enterprise production API incident with a distracting compliance mention. The agent must escalate to platform engineering, request the right diagnostics, and open the incident instead of resolving it.
|
| 95 |
+
|
| 96 |
+
What makes these tasks less generic than ordinary support-routing demos:
|
| 97 |
+
|
| 98 |
+
- They mix queueing, priority, customer communication, internal note-taking, and close-vs-escalate decisions in one trajectory.
|
| 99 |
+
- They include operational context like customer tier, affected-user count, SLA pressure, and business impact.
|
| 100 |
+
- The harder tasks contain conflicting or distracting signals, so a frontier model has to identify the primary issue instead of treating every mention as equally important.
|
| 101 |
+
|
| 102 |
+
## Deterministic Graders
|
| 103 |
+
|
| 104 |
+
The final task score is a weighted total in `[0.0, 1.0]`:
|
| 105 |
+
|
| 106 |
+
- Queue correctness: `0.15`
|
| 107 |
+
- Priority correctness: `0.10`
|
| 108 |
+
- Issue-type correctness: `0.10`
|
| 109 |
+
- Requested-fields correctness: `0.15`
|
| 110 |
+
- Reply coverage: `0.25`
|
| 111 |
+
- Internal-note coverage: `0.10`
|
| 112 |
+
- Final status: `0.10`
|
| 113 |
+
- Resolution code: `0.05`
|
| 114 |
+
|
| 115 |
+
The same grader also drives dense reward shaping during the episode by comparing the current score to the previous score and then subtracting small penalties for no-op or low-signal actions.
|
| 116 |
+
|
| 117 |
+
## Project Layout
|
| 118 |
+
|
| 119 |
+
```text
|
| 120 |
+
.
|
| 121 |
+
|-- inference.py
|
| 122 |
+
|-- openenv.yaml
|
| 123 |
+
|-- pyproject.toml
|
| 124 |
+
|-- requirements.txt
|
| 125 |
+
|-- supportdesk_env
|
| 126 |
+
| |-- __init__.py
|
| 127 |
+
| |-- client.py
|
| 128 |
+
| |-- graders.py
|
| 129 |
+
| |-- models.py
|
| 130 |
+
| |-- tasks.py
|
| 131 |
+
| `-- server
|
| 132 |
+
| |-- app.py
|
| 133 |
+
| `-- supportdesk_environment.py
|
| 134 |
+
|-- tests
|
| 135 |
+
| `-- test_supportdesk.py
|
| 136 |
+
`-- uv.lock
|
| 137 |
+
```
|
| 138 |
+
|
| 139 |
+
## Local Setup
|
| 140 |
+
|
| 141 |
+
```bash
|
| 142 |
+
pip install -r requirements.txt
|
| 143 |
+
```
|
| 144 |
+
|
| 145 |
+
Or with uv:
|
| 146 |
+
|
| 147 |
+
```bash
|
| 148 |
+
uv sync
|
| 149 |
+
```
|
| 150 |
+
|
| 151 |
+
Optional environment variables for the baseline:
|
| 152 |
+
|
| 153 |
+
```bash
|
| 154 |
+
export API_BASE_URL="https://router.huggingface.co/v1"
|
| 155 |
+
export MODEL_NAME="openai/gpt-oss-120b"
|
| 156 |
+
export OPENAI_API_KEY="sk-..." # Or use HF_TOKEN with a compatible router
|
| 157 |
+
export HF_TOKEN="hf_..."
|
| 158 |
+
```
|
| 159 |
+
|
| 160 |
+
The baseline uses the OpenAI Python client and supports both `OPENAI_API_KEY` and `HF_TOKEN`.
|
| 161 |
+
|
| 162 |
+
## Setup and Usage Instructions
|
| 163 |
+
|
| 164 |
+
Typical local workflow:
|
| 165 |
+
|
| 166 |
+
```bash
|
| 167 |
+
pip install -r requirements.txt
|
| 168 |
+
python -m openenv.cli validate .
|
| 169 |
+
python inference.py
|
| 170 |
+
python -m supportdesk_env.server.app
|
| 171 |
+
```
|
| 172 |
+
|
| 173 |
+
## Local RL Playground
|
| 174 |
+
|
| 175 |
+
If you want to import the package directly and train against the local environment without going through the HTTP server, use the tabular Q-learning example:
|
| 176 |
+
|
| 177 |
+
```bash
|
| 178 |
+
python examples/rl/train_q_agent.py
|
| 179 |
+
```
|
| 180 |
+
|
| 181 |
+
This script imports the package, instantiates `SupportDeskEnvironment` directly, trains a tiny Q-learning agent over a compact discrete action library, and then prints greedy evaluation results for all three tasks. It is meant as a local experimentation playground, not as the official submission baseline.
|
| 182 |
+
|
| 183 |
+
## Run the Server
|
| 184 |
+
|
| 185 |
+
```bash
|
| 186 |
+
python -m supportdesk_env.server.app
|
| 187 |
+
```
|
| 188 |
+
|
| 189 |
+
Or with the OpenEnv entrypoint:
|
| 190 |
+
|
| 191 |
+
```bash
|
| 192 |
+
server
|
| 193 |
+
```
|
| 194 |
+
|
| 195 |
+
## Run the Baseline
|
| 196 |
+
|
| 197 |
+
```bash
|
| 198 |
+
python inference.py
|
| 199 |
+
```
|
| 200 |
+
|
| 201 |
+
When model credentials are present, the script uses the OpenAI client against `API_BASE_URL` and `MODEL_NAME`. If credentials are missing or a request fails, it falls back to a deterministic heuristic policy so the script still completes and prints reproducible scores.
|
| 202 |
+
|
| 203 |
+
## Docker
|
| 204 |
+
|
| 205 |
+
```bash
|
| 206 |
+
docker build -t supportdesk-env .
|
| 207 |
+
docker run -p 8000:8000 supportdesk-env
|
| 208 |
+
```
|
| 209 |
+
|
| 210 |
+
## Hugging Face Space Deployment
|
| 211 |
+
|
| 212 |
+
Deploy this repo as a Docker Space and keep it public for submission. The Space should include the `openenv` tag and the following environment configuration values:
|
| 213 |
+
|
| 214 |
+
- `API_BASE_URL`
|
| 215 |
+
- `MODEL_NAME`
|
| 216 |
+
- `HF_TOKEN`
|
| 217 |
+
|
| 218 |
+
If the OpenEnv CLI is installed, deployment can be done with:
|
| 219 |
+
|
| 220 |
+
```bash
|
| 221 |
+
openenv push --repo-id your-username/hyperbrick-support-ops-env
|
| 222 |
+
```
|
| 223 |
+
|
| 224 |
+
## Validation
|
| 225 |
+
|
| 226 |
+
```bash
|
| 227 |
+
openenv validate .
|
| 228 |
+
```
|
| 229 |
+
|
| 230 |
+
For a full pre-submission pass against a deployed Space:
|
| 231 |
+
|
| 232 |
+
```bash
|
| 233 |
+
./scripts/validate-submission.sh https://your-space.hf.space .
|
| 234 |
+
```
|
| 235 |
+
|
| 236 |
+
## Submission Checklist
|
| 237 |
+
|
| 238 |
+
- Public GitHub repository with this codebase
|
| 239 |
+
- Root `inference.py`
|
| 240 |
+
- Working Docker build
|
| 241 |
+
- Deployed Hugging Face Docker Space tagged `openenv`
|
| 242 |
+
- Space secrets configured: `API_BASE_URL`, `MODEL_NAME`, `HF_TOKEN`
|
| 243 |
+
- README present with environment overview, action/observation definitions, tasks, setup, and baseline scores
|
| 244 |
+
|
| 245 |
+
## Baseline Scores
|
| 246 |
+
|
| 247 |
+
Expected deterministic fallback baseline:
|
| 248 |
+
|
| 249 |
+
- `billing_refund_easy`: `1.00`
|
| 250 |
+
- `account_takeover_medium`: `1.00`
|
| 251 |
+
- `api_incident_hard`: `1.00`
|
| 252 |
+
- Average: `1.00`
|
| 253 |
+
|
| 254 |
+
These scores are deliberately reproducible because the fallback policy follows the gold workflow exactly. A model-backed run will typically be lower unless the prompt or model is improved, which makes the environment useful for both training and evaluation.
|
__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Compatibility wrapper for the real supportdesk_env package."""
|
| 2 |
+
|
| 3 |
+
from supportdesk_env import * # noqa: F401,F403
|
client.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Compatibility wrapper for the real supportdesk_env package."""
|
| 2 |
+
|
| 3 |
+
from supportdesk_env.client import * # noqa: F401,F403
|
examples/rl/train_q_agent.py
ADDED
|
@@ -0,0 +1,278 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Train a simple tabular Q-learning agent against the local SupportDesk env.
|
| 2 |
+
|
| 3 |
+
This is an extra playground script for local experimentation. It is not part of
|
| 4 |
+
the hackathon submission baseline and intentionally uses a compact, hand-built
|
| 5 |
+
discrete action library so that plain Python Q-learning can train quickly.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
import argparse
|
| 11 |
+
import random
|
| 12 |
+
import sys
|
| 13 |
+
from dataclasses import dataclass
|
| 14 |
+
from pathlib import Path
|
| 15 |
+
|
| 16 |
+
REPO_ROOT = Path(__file__).resolve().parents[2]
|
| 17 |
+
if str(REPO_ROOT) not in sys.path:
|
| 18 |
+
sys.path.insert(0, str(REPO_ROOT))
|
| 19 |
+
|
| 20 |
+
from supportdesk_env import (
|
| 21 |
+
SupportDeskAction,
|
| 22 |
+
get_task,
|
| 23 |
+
grade_case,
|
| 24 |
+
list_task_ids,
|
| 25 |
+
)
|
| 26 |
+
from supportdesk_env.policies import default_note, default_reply
|
| 27 |
+
from supportdesk_env.server.supportdesk_environment import SupportDeskEnvironment
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
@dataclass
|
| 31 |
+
class EvalResult:
|
| 32 |
+
"""Compact report for a greedy evaluation episode."""
|
| 33 |
+
|
| 34 |
+
task_id: str
|
| 35 |
+
score: float
|
| 36 |
+
reward: float
|
| 37 |
+
steps: int
|
| 38 |
+
actions: list[str]
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def build_action_library(task_id: str) -> list[SupportDeskAction]:
|
| 42 |
+
"""Return a small discrete action set for a task."""
|
| 43 |
+
|
| 44 |
+
task = get_task(task_id)
|
| 45 |
+
wrong_queue = next(queue for queue in ("general_support", "billing_ops", "trust_and_safety", "platform_engineering") if queue != task.gold_queue)
|
| 46 |
+
wrong_priority = next(priority for priority in ("low", "normal", "high", "urgent") if priority != task.gold_priority)
|
| 47 |
+
wrong_issue = next(issue for issue in ("general_question", "duplicate_charge", "account_compromise", "production_incident") if issue != task.gold_issue_type)
|
| 48 |
+
|
| 49 |
+
partial_fields = list(task.required_requested_fields[:1])
|
| 50 |
+
if not partial_fields:
|
| 51 |
+
partial_fields = ["billing_email"]
|
| 52 |
+
|
| 53 |
+
if task.required_requested_fields:
|
| 54 |
+
good_request = SupportDeskAction(
|
| 55 |
+
operation="request_info",
|
| 56 |
+
requested_fields=list(task.required_requested_fields),
|
| 57 |
+
status=task.gold_status,
|
| 58 |
+
reply=default_reply(task_id),
|
| 59 |
+
)
|
| 60 |
+
else:
|
| 61 |
+
good_request = SupportDeskAction(
|
| 62 |
+
operation="request_info",
|
| 63 |
+
requested_fields=["billing_email"],
|
| 64 |
+
status="waiting_on_customer",
|
| 65 |
+
reply="Please confirm the billing email on the account so we can continue.",
|
| 66 |
+
)
|
| 67 |
+
partial_request = SupportDeskAction(
|
| 68 |
+
operation="request_info",
|
| 69 |
+
requested_fields=partial_fields,
|
| 70 |
+
status="waiting_on_customer",
|
| 71 |
+
reply="Please share more details so we can investigate.",
|
| 72 |
+
)
|
| 73 |
+
|
| 74 |
+
return [
|
| 75 |
+
SupportDeskAction(
|
| 76 |
+
operation="classify",
|
| 77 |
+
queue=task.gold_queue,
|
| 78 |
+
priority=task.gold_priority,
|
| 79 |
+
issue_type=task.gold_issue_type,
|
| 80 |
+
),
|
| 81 |
+
SupportDeskAction(
|
| 82 |
+
operation="classify",
|
| 83 |
+
queue=wrong_queue,
|
| 84 |
+
priority=wrong_priority,
|
| 85 |
+
issue_type=wrong_issue,
|
| 86 |
+
),
|
| 87 |
+
good_request,
|
| 88 |
+
partial_request,
|
| 89 |
+
SupportDeskAction(operation="draft_reply", reply=default_reply(task_id)),
|
| 90 |
+
SupportDeskAction(operation="draft_reply", reply="Thanks for reaching out. We are checking this now."),
|
| 91 |
+
SupportDeskAction(operation="add_internal_note", internal_note=default_note(task_id)),
|
| 92 |
+
SupportDeskAction(operation="add_internal_note", internal_note="Customer contacted support with a problem."),
|
| 93 |
+
SupportDeskAction(
|
| 94 |
+
operation="submit",
|
| 95 |
+
status=task.gold_status,
|
| 96 |
+
resolution_code=task.gold_resolution_code,
|
| 97 |
+
),
|
| 98 |
+
SupportDeskAction(
|
| 99 |
+
operation="submit",
|
| 100 |
+
status="resolved",
|
| 101 |
+
resolution_code="closed_generic",
|
| 102 |
+
),
|
| 103 |
+
]
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def state_key(task_id: str, observation) -> tuple:
|
| 107 |
+
"""Compress the observation into a tabular Q-learning state."""
|
| 108 |
+
|
| 109 |
+
case = observation.case
|
| 110 |
+
return (
|
| 111 |
+
task_id,
|
| 112 |
+
case.queue or "_",
|
| 113 |
+
case.priority or "_",
|
| 114 |
+
case.issue_type or "_",
|
| 115 |
+
case.status,
|
| 116 |
+
case.resolution_code or "_",
|
| 117 |
+
tuple(case.requested_fields),
|
| 118 |
+
bool(case.reply),
|
| 119 |
+
bool(case.internal_note),
|
| 120 |
+
observation.remaining_steps,
|
| 121 |
+
)
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def action_label(action: SupportDeskAction) -> str:
|
| 125 |
+
"""Human-readable action label for debug output."""
|
| 126 |
+
|
| 127 |
+
parts = [action.operation]
|
| 128 |
+
if action.queue:
|
| 129 |
+
parts.append(action.queue)
|
| 130 |
+
if action.priority:
|
| 131 |
+
parts.append(action.priority)
|
| 132 |
+
if action.issue_type:
|
| 133 |
+
parts.append(action.issue_type)
|
| 134 |
+
if action.status:
|
| 135 |
+
parts.append(action.status)
|
| 136 |
+
if action.resolution_code:
|
| 137 |
+
parts.append(action.resolution_code)
|
| 138 |
+
if action.requested_fields:
|
| 139 |
+
parts.append(",".join(action.requested_fields))
|
| 140 |
+
if action.reply:
|
| 141 |
+
parts.append("reply")
|
| 142 |
+
if action.internal_note:
|
| 143 |
+
parts.append("note")
|
| 144 |
+
return " | ".join(parts)
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
def choose_action(q_values: dict[tuple, list[float]], state: tuple, num_actions: int, epsilon: float) -> int:
|
| 148 |
+
"""Epsilon-greedy action selection."""
|
| 149 |
+
|
| 150 |
+
if state not in q_values:
|
| 151 |
+
q_values[state] = [0.0] * num_actions
|
| 152 |
+
|
| 153 |
+
if random.random() < epsilon:
|
| 154 |
+
return random.randrange(num_actions)
|
| 155 |
+
|
| 156 |
+
best_value = max(q_values[state])
|
| 157 |
+
best_indices = [index for index, value in enumerate(q_values[state]) if value == best_value]
|
| 158 |
+
return random.choice(best_indices)
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
def train_q_agent(
|
| 162 |
+
episodes_per_task: int,
|
| 163 |
+
alpha: float,
|
| 164 |
+
gamma: float,
|
| 165 |
+
epsilon: float,
|
| 166 |
+
epsilon_decay: float,
|
| 167 |
+
min_epsilon: float,
|
| 168 |
+
seed: int,
|
| 169 |
+
) -> tuple[dict[tuple, list[float]], dict[str, list[SupportDeskAction]]]:
|
| 170 |
+
"""Train a small tabular Q-learning agent over all tasks."""
|
| 171 |
+
|
| 172 |
+
random.seed(seed)
|
| 173 |
+
q_values: dict[tuple, list[float]] = {}
|
| 174 |
+
action_libraries = {task_id: build_action_library(task_id) for task_id in list_task_ids()}
|
| 175 |
+
|
| 176 |
+
for _ in range(episodes_per_task):
|
| 177 |
+
for task_id in list_task_ids():
|
| 178 |
+
env = SupportDeskEnvironment(task_id=task_id)
|
| 179 |
+
observation = env.reset()
|
| 180 |
+
actions = action_libraries[task_id]
|
| 181 |
+
|
| 182 |
+
try:
|
| 183 |
+
while not observation.done:
|
| 184 |
+
state = state_key(task_id, observation)
|
| 185 |
+
action_index = choose_action(q_values, state, len(actions), epsilon)
|
| 186 |
+
next_observation = env.step(actions[action_index])
|
| 187 |
+
|
| 188 |
+
next_state = state_key(task_id, next_observation)
|
| 189 |
+
if next_state not in q_values:
|
| 190 |
+
q_values[next_state] = [0.0] * len(actions)
|
| 191 |
+
|
| 192 |
+
td_target = next_observation.reward + gamma * (0.0 if next_observation.done else max(q_values[next_state]))
|
| 193 |
+
td_error = td_target - q_values[state][action_index]
|
| 194 |
+
q_values[state][action_index] += alpha * td_error
|
| 195 |
+
observation = next_observation
|
| 196 |
+
finally:
|
| 197 |
+
env.close()
|
| 198 |
+
|
| 199 |
+
epsilon = max(min_epsilon, epsilon * epsilon_decay)
|
| 200 |
+
|
| 201 |
+
return q_values, action_libraries
|
| 202 |
+
|
| 203 |
+
|
| 204 |
+
def evaluate_policy(
|
| 205 |
+
q_values: dict[tuple, list[float]],
|
| 206 |
+
action_libraries: dict[str, list[SupportDeskAction]],
|
| 207 |
+
) -> list[EvalResult]:
|
| 208 |
+
"""Run a greedy evaluation episode for each task."""
|
| 209 |
+
|
| 210 |
+
results: list[EvalResult] = []
|
| 211 |
+
|
| 212 |
+
for task_id in list_task_ids():
|
| 213 |
+
env = SupportDeskEnvironment(task_id=task_id)
|
| 214 |
+
observation = env.reset()
|
| 215 |
+
actions = action_libraries[task_id]
|
| 216 |
+
chosen_actions: list[str] = []
|
| 217 |
+
|
| 218 |
+
try:
|
| 219 |
+
while not observation.done:
|
| 220 |
+
state = state_key(task_id, observation)
|
| 221 |
+
q_values.setdefault(state, [0.0] * len(actions))
|
| 222 |
+
action_index = max(range(len(actions)), key=lambda idx: q_values[state][idx])
|
| 223 |
+
action = actions[action_index]
|
| 224 |
+
chosen_actions.append(action_label(action))
|
| 225 |
+
observation = env.step(action)
|
| 226 |
+
|
| 227 |
+
results.append(
|
| 228 |
+
EvalResult(
|
| 229 |
+
task_id=task_id,
|
| 230 |
+
score=grade_case(get_task(task_id), env.state.case).total_score,
|
| 231 |
+
reward=env.state.reward,
|
| 232 |
+
steps=env.state.step_count,
|
| 233 |
+
actions=chosen_actions,
|
| 234 |
+
)
|
| 235 |
+
)
|
| 236 |
+
finally:
|
| 237 |
+
env.close()
|
| 238 |
+
|
| 239 |
+
return results
|
| 240 |
+
|
| 241 |
+
|
| 242 |
+
def main() -> None:
|
| 243 |
+
parser = argparse.ArgumentParser(description="Train a simple tabular Q-learning agent on SupportDesk.")
|
| 244 |
+
parser.add_argument("--episodes-per-task", type=int, default=250)
|
| 245 |
+
parser.add_argument("--alpha", type=float, default=0.45)
|
| 246 |
+
parser.add_argument("--gamma", type=float, default=0.92)
|
| 247 |
+
parser.add_argument("--epsilon", type=float, default=0.35)
|
| 248 |
+
parser.add_argument("--epsilon-decay", type=float, default=0.99)
|
| 249 |
+
parser.add_argument("--min-epsilon", type=float, default=0.03)
|
| 250 |
+
parser.add_argument("--seed", type=int, default=7)
|
| 251 |
+
args = parser.parse_args()
|
| 252 |
+
|
| 253 |
+
q_values, action_libraries = train_q_agent(
|
| 254 |
+
episodes_per_task=args.episodes_per_task,
|
| 255 |
+
alpha=args.alpha,
|
| 256 |
+
gamma=args.gamma,
|
| 257 |
+
epsilon=args.epsilon,
|
| 258 |
+
epsilon_decay=args.epsilon_decay,
|
| 259 |
+
min_epsilon=args.min_epsilon,
|
| 260 |
+
seed=args.seed,
|
| 261 |
+
)
|
| 262 |
+
|
| 263 |
+
results = evaluate_policy(q_values, action_libraries)
|
| 264 |
+
average_score = sum(result.score for result in results) / len(results)
|
| 265 |
+
|
| 266 |
+
print("Tabular Q-learning evaluation")
|
| 267 |
+
print("============================")
|
| 268 |
+
for result in results:
|
| 269 |
+
print(
|
| 270 |
+
f"{result.task_id}: score={result.score:.2f} reward={result.reward:.2f} "
|
| 271 |
+
f"steps={result.steps}"
|
| 272 |
+
)
|
| 273 |
+
print(f" actions: {' -> '.join(result.actions)}")
|
| 274 |
+
print(f"average_score={average_score:.3f}")
|
| 275 |
+
|
| 276 |
+
|
| 277 |
+
if __name__ == "__main__":
|
| 278 |
+
main()
|
graders.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Compatibility wrapper for the real supportdesk_env package."""
|
| 2 |
+
|
| 3 |
+
from supportdesk_env.graders import * # noqa: F401,F403
|
inference.py
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Baseline inference script for the SupportDesk OpenEnv submission."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import json
|
| 6 |
+
import os
|
| 7 |
+
import re
|
| 8 |
+
from statistics import mean
|
| 9 |
+
|
| 10 |
+
try:
|
| 11 |
+
from openai import OpenAI
|
| 12 |
+
except ImportError: # pragma: no cover - local fallback mode
|
| 13 |
+
OpenAI = None # type: ignore[assignment]
|
| 14 |
+
|
| 15 |
+
from supportdesk_env.graders import grade_case
|
| 16 |
+
from supportdesk_env.models import SupportDeskAction, SupportDeskObservation
|
| 17 |
+
from supportdesk_env.policies import heuristic_action
|
| 18 |
+
from supportdesk_env.server.supportdesk_environment import SupportDeskEnvironment
|
| 19 |
+
from supportdesk_env.tasks import get_task, list_task_ids
|
| 20 |
+
|
| 21 |
+
SYSTEM_PROMPT = """You are a support operations agent solving one triage ticket.
|
| 22 |
+
Return exactly one JSON object with this schema:
|
| 23 |
+
{
|
| 24 |
+
"operation": "classify|request_info|draft_reply|add_internal_note|submit",
|
| 25 |
+
"queue": string or null,
|
| 26 |
+
"priority": string or null,
|
| 27 |
+
"issue_type": string or null,
|
| 28 |
+
"status": string or null,
|
| 29 |
+
"resolution_code": string or null,
|
| 30 |
+
"requested_fields": [string],
|
| 31 |
+
"reply": string or null,
|
| 32 |
+
"internal_note": string or null
|
| 33 |
+
}
|
| 34 |
+
|
| 35 |
+
Use the policy snippets in the observation. Keep customer replies short, precise, and professional.
|
| 36 |
+
"""
|
| 37 |
+
|
| 38 |
+
MODEL_NAME = os.getenv("MODEL_NAME", "gpt-4.1-mini")
|
| 39 |
+
API_BASE_URL = os.getenv("API_BASE_URL")
|
| 40 |
+
API_KEY = os.getenv("OPENAI_API_KEY") or os.getenv("HF_TOKEN") or "not-set"
|
| 41 |
+
MAX_STEPS = int(os.getenv("MAX_STEPS", "6"))
|
| 42 |
+
TEMPERATURE = float(os.getenv("TEMPERATURE", "0"))
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def _build_client() -> OpenAI | None:
    """Create an OpenAI-compatible chat client from environment settings.

    Returns None when the SDK is unavailable or no API key was configured,
    which signals callers to fall back to the deterministic heuristic policy.
    """
    if OpenAI is None or API_KEY == "not-set":
        return None
    client_kwargs = {"api_key": API_KEY}
    if API_BASE_URL:
        # Optional override, e.g. to point at a local or proxy endpoint.
        client_kwargs["base_url"] = API_BASE_URL
    return OpenAI(**client_kwargs)
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def _extract_json(text: str) -> dict:
|
| 57 |
+
try:
|
| 58 |
+
return json.loads(text)
|
| 59 |
+
except json.JSONDecodeError:
|
| 60 |
+
match = re.search(r"\{.*\}", text, flags=re.DOTALL)
|
| 61 |
+
if not match:
|
| 62 |
+
raise
|
| 63 |
+
return json.loads(match.group(0))
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def _observation_prompt(observation: SupportDeskObservation) -> str:
    """Render the observation into the user-message prompt for the chat model.

    The layout is a plain-text template: ticket facts, knowledge-base
    snippets, current case state, last feedback, and the action history.
    """
    # One bullet per knowledge-base article.
    kb_lines = "\n".join(
        f"- {snippet.article_id}: {snippet.title}: {snippet.content}" for snippet in observation.knowledge_base
    )
    # One bullet per past action with its signed reward delta; "- none" when empty.
    history_lines = "\n".join(
        f"- step {entry.step}: {entry.summary} ({entry.reward_delta:+.2f})"
        for entry in observation.action_history
    ) or "- none"

    return f"""Task: {observation.task_id} ({observation.difficulty})
Objective: {observation.objective}
Ticket subject: {observation.ticket.subject}
Ticket body: {observation.ticket.body}
Customer tier: {observation.ticket.customer_tier}
Region: {observation.ticket.region}
Affected users: {observation.ticket.affected_users}
SLA minutes remaining: {observation.ticket.sla_minutes_remaining}
Business impact: {observation.ticket.business_impact}
Secondary concerns: {observation.ticket.secondary_concerns}

Knowledge base:
{kb_lines}

Current case state:
- queue: {observation.case.queue}
- priority: {observation.case.priority}
- issue_type: {observation.case.issue_type}
- status: {observation.case.status}
- resolution_code: {observation.case.resolution_code}
- requested_fields: {observation.case.requested_fields}
- reply: {observation.case.reply}
- internal_note: {observation.case.internal_note}

Feedback: {observation.feedback}
Remaining steps: {observation.remaining_steps}

History:
{history_lines}
"""
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
def _model_action(client: OpenAI | None, observation: SupportDeskObservation) -> SupportDeskAction:
    """Ask the chat model for one action, degrading to the heuristic policy.

    Any failure (API error, malformed JSON, schema mismatch) is treated as
    best-effort and falls back to ``heuristic_action`` so an episode never
    aborts mid-rollout.
    """
    if client is None:
        return heuristic_action(observation)

    try:
        response = client.chat.completions.create(
            model=MODEL_NAME,
            temperature=TEMPERATURE,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": _observation_prompt(observation)},
            ],
        )
        raw = response.choices[0].message.content or ""
        return SupportDeskAction(**_extract_json(raw))
    except Exception:
        # Deliberate broad catch: the heuristic is the safety net for every
        # transport, parsing, and validation failure mode.
        return heuristic_action(observation)
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
def run_task(task_id: str, client: OpenAI | None) -> float:
    """Roll out one episode for *task_id* and return its final graded score.

    The environment is always closed, even when stepping or grading raises.
    """
    env = SupportDeskEnvironment(task_id=task_id)
    observation = env.reset()

    try:
        for _ in range(MAX_STEPS):
            observation = env.step(_model_action(client, observation))
            if observation.done:
                break
        # Grade the terminal case state against the task's gold solution.
        breakdown = grade_case(get_task(task_id), env.state.case)
        print(f"{task_id}: score={breakdown.total_score:.2f} reward={env.state.reward:.2f}")
        return breakdown.total_score
    finally:
        env.close()
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
def main() -> None:
    """Evaluate every registered task once and print the mean score."""
    client = _build_client()
    scores = []
    for task_id in list_task_ids():
        scores.append(run_task(task_id, client))
    print(f"average_score={mean(scores):.3f}")
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
if __name__ == "__main__":
|
| 151 |
+
main()
|
models.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Compatibility wrapper for the real supportdesk_env package."""
|
| 2 |
+
|
| 3 |
+
from supportdesk_env.models import * # noqa: F401,F403
|
openenv.yaml
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: HyperBrickCaseOps
|
| 2 |
+
env_name: supportdesk_env
|
| 3 |
+
description: Enterprise support operations environment with SLA pressure, business-impact aware triage, and primary-vs-secondary issue prioritization.
|
pyproject.toml
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
|
| 2 |
+
name = "supportdesk-env"
|
| 3 |
+
version = "0.1.0"
|
| 4 |
+
description = "A real-world OpenEnv environment for customer support triage and escalation."
|
| 5 |
+
authors = [{ name = "HyperBrick" }]
|
| 6 |
+
dependencies = [
|
| 7 |
+
"fastapi>=0.115.0",
|
| 8 |
+
"openai>=1.54.0",
|
| 9 |
+
"openenv-core>=0.2.0",
|
| 10 |
+
"pydantic>=2.9.0",
|
| 11 |
+
"requests>=2.32.0",
|
| 12 |
+
"uvicorn>=0.30.0",
|
| 13 |
+
]
|
| 14 |
+
requires-python = ">=3.10"
|
| 15 |
+
|
| 16 |
+
[project.optional-dependencies]
|
| 17 |
+
dev = [
|
| 18 |
+
"pytest>=8.3.0",
|
| 19 |
+
]
|
| 20 |
+
|
| 21 |
+
[project.scripts]
|
| 22 |
+
server = "supportdesk_env.server.app:main"
|
| 23 |
+
|
| 24 |
+
[build-system]
|
| 25 |
+
requires = ["setuptools"]
|
| 26 |
+
build-backend = "setuptools.build_meta"
|
| 27 |
+
|
| 28 |
+
[tool.setuptools]
|
| 29 |
+
packages = ["supportdesk_env", "supportdesk_env.server"]
|
pytest-cache-files-0ad_osqx/CACHEDIR.TAG
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Signature: 8a477f597d28d172789f06886806bc55
|
| 2 |
+
# This file is a cache directory tag created by pytest.
|
| 3 |
+
# For information about cache directory tags, see:
|
| 4 |
+
# https://bford.info/cachedir/spec.html
|
pytest-cache-files-0ad_osqx/README.md
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# pytest cache directory #
|
| 2 |
+
|
| 3 |
+
This directory contains data from the pytest's cache plugin,
|
| 4 |
+
which provides the `--lf` and `--ff` options, as well as the `cache` fixture.
|
| 5 |
+
|
| 6 |
+
**Do not** commit this to version control.
|
| 7 |
+
|
| 8 |
+
See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information.
|
pytest-cache-files-7cu1ajqk/CACHEDIR.TAG
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Signature: 8a477f597d28d172789f06886806bc55
|
| 2 |
+
# This file is a cache directory tag created by pytest.
|
| 3 |
+
# For information about cache directory tags, see:
|
| 4 |
+
# https://bford.info/cachedir/spec.html
|
pytest-cache-files-7cu1ajqk/README.md
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# pytest cache directory #
|
| 2 |
+
|
| 3 |
+
This directory contains data from the pytest's cache plugin,
|
| 4 |
+
which provides the `--lf` and `--ff` options, as well as the `cache` fixture.
|
| 5 |
+
|
| 6 |
+
**Do not** commit this to version control.
|
| 7 |
+
|
| 8 |
+
See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information.
|
pytest-cache-files-8nb5ix7f/CACHEDIR.TAG
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Signature: 8a477f597d28d172789f06886806bc55
|
| 2 |
+
# This file is a cache directory tag created by pytest.
|
| 3 |
+
# For information about cache directory tags, see:
|
| 4 |
+
# https://bford.info/cachedir/spec.html
|
pytest-cache-files-8nb5ix7f/README.md
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# pytest cache directory #
|
| 2 |
+
|
| 3 |
+
This directory contains data from the pytest's cache plugin,
|
| 4 |
+
which provides the `--lf` and `--ff` options, as well as the `cache` fixture.
|
| 5 |
+
|
| 6 |
+
**Do not** commit this to version control.
|
| 7 |
+
|
| 8 |
+
See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information.
|
pytest-cache-files-fy9fhtya/CACHEDIR.TAG
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Signature: 8a477f597d28d172789f06886806bc55
|
| 2 |
+
# This file is a cache directory tag created by pytest.
|
| 3 |
+
# For information about cache directory tags, see:
|
| 4 |
+
# https://bford.info/cachedir/spec.html
|
pytest-cache-files-fy9fhtya/README.md
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# pytest cache directory #
|
| 2 |
+
|
| 3 |
+
This directory contains data from the pytest's cache plugin,
|
| 4 |
+
which provides the `--lf` and `--ff` options, as well as the `cache` fixture.
|
| 5 |
+
|
| 6 |
+
**Do not** commit this to version control.
|
| 7 |
+
|
| 8 |
+
See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information.
|
pytest-cache-files-i3h6kocm/CACHEDIR.TAG
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Signature: 8a477f597d28d172789f06886806bc55
|
| 2 |
+
# This file is a cache directory tag created by pytest.
|
| 3 |
+
# For information about cache directory tags, see:
|
| 4 |
+
# https://bford.info/cachedir/spec.html
|
pytest-cache-files-i3h6kocm/README.md
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# pytest cache directory #
|
| 2 |
+
|
| 3 |
+
This directory contains data from the pytest's cache plugin,
|
| 4 |
+
which provides the `--lf` and `--ff` options, as well as the `cache` fixture.
|
| 5 |
+
|
| 6 |
+
**Do not** commit this to version control.
|
| 7 |
+
|
| 8 |
+
See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information.
|
pytest-cache-files-l0dcjql5/CACHEDIR.TAG
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Signature: 8a477f597d28d172789f06886806bc55
|
| 2 |
+
# This file is a cache directory tag created by pytest.
|
| 3 |
+
# For information about cache directory tags, see:
|
| 4 |
+
# https://bford.info/cachedir/spec.html
|
pytest-cache-files-l0dcjql5/README.md
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# pytest cache directory #
|
| 2 |
+
|
| 3 |
+
This directory contains data from the pytest's cache plugin,
|
| 4 |
+
which provides the `--lf` and `--ff` options, as well as the `cache` fixture.
|
| 5 |
+
|
| 6 |
+
**Do not** commit this to version control.
|
| 7 |
+
|
| 8 |
+
See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information.
|
pytest-cache-files-ofsx67d1/CACHEDIR.TAG
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Signature: 8a477f597d28d172789f06886806bc55
|
| 2 |
+
# This file is a cache directory tag created by pytest.
|
| 3 |
+
# For information about cache directory tags, see:
|
| 4 |
+
# https://bford.info/cachedir/spec.html
|
pytest-cache-files-ofsx67d1/README.md
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# pytest cache directory #
|
| 2 |
+
|
| 3 |
+
This directory contains data from the pytest's cache plugin,
|
| 4 |
+
which provides the `--lf` and `--ff` options, as well as the `cache` fixture.
|
| 5 |
+
|
| 6 |
+
**Do not** commit this to version control.
|
| 7 |
+
|
| 8 |
+
See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information.
|
requirements.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
-e .
|
scripts/validate-submission.sh
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Validate a SupportDesk submission end to end:
#   1. openenv project-structure check
#   2. local Docker image build
#   3. health + OpenAPI probes against the deployed Space
#   4. a POST /reset smoke test
# Usage: ./scripts/validate-submission.sh <space_url> [repo_dir]
set -euo pipefail

# Positional args: the public Space URL (required) and the repo dir (default ".").
PING_URL="${1:-}"
REPO_DIR="${2:-.}"
IMAGE_TAG="supportdesk-env-validate"

if [ -z "$PING_URL" ]; then
  echo "Usage: ./scripts/validate-submission.sh <space_url> [repo_dir]"
  exit 1
fi

echo "[1/4] Validating local OpenEnv project structure"
# Subshells keep the caller's working directory unchanged.
(cd "$REPO_DIR" && openenv validate .)

echo "[2/4] Building Docker image"
(cd "$REPO_DIR" && docker build -t "$IMAGE_TAG" .)

echo "[3/4] Checking public Space health"
# -f makes curl fail on HTTP errors so set -e aborts the script.
curl -fsSL "$PING_URL/health" >/dev/null
curl -fsSL "$PING_URL/openapi.json" >/dev/null

echo "[4/4] Checking public reset endpoint"
curl -fsSL -X POST "$PING_URL/reset" \
  -H "Content-Type: application/json" \
  -d '{}' >/dev/null

echo "Validation checks completed successfully."
|
server/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Server package for the SupportDesk OpenEnv environment."""
|
server/app.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""FastAPI app entrypoint for the SupportDesk environment."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
|
| 7 |
+
import uvicorn
|
| 8 |
+
|
| 9 |
+
try:
|
| 10 |
+
from openenv.core.env_server.http_server import create_app
|
| 11 |
+
except ImportError: # pragma: no cover - package name differs across releases
|
| 12 |
+
from openenv_core.env_server.http_server import create_app
|
| 13 |
+
|
| 14 |
+
from supportdesk_env.models import SupportDeskAction, SupportDeskObservation
|
| 15 |
+
from supportdesk_env.server.supportdesk_environment import SupportDeskEnvironment
|
| 16 |
+
|
| 17 |
+
app = create_app(
|
| 18 |
+
SupportDeskEnvironment,
|
| 19 |
+
action_cls=SupportDeskAction,
|
| 20 |
+
observation_cls=SupportDeskObservation,
|
| 21 |
+
env_name="supportdesk_env",
|
| 22 |
+
)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def main() -> None:
    """Run the local HTTP server."""
    # PORT is set by hosting platforms (e.g. HF Spaces); default to 8000 locally.
    listen_port = int(os.getenv("PORT", "8000"))
    uvicorn.run("supportdesk_env.server.app:app", host="0.0.0.0", port=listen_port)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
if __name__ == "__main__":
|
| 33 |
+
main()
|
server/supportdesk_environment.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Compatibility wrapper for the real supportdesk_env package."""
|
| 2 |
+
|
| 3 |
+
from supportdesk_env.server.supportdesk_environment import * # noqa: F401,F403
|
supportdesk_env/__init__.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""SupportDesk OpenEnv environment package."""
|
| 2 |
+
|
| 3 |
+
from supportdesk_env.graders import GradeBreakdown, grade_case, grade_task_id
|
| 4 |
+
from supportdesk_env.models import (
|
| 5 |
+
ActionHistoryEntry,
|
| 6 |
+
KnowledgeSnippet,
|
| 7 |
+
SupportCaseProgress,
|
| 8 |
+
SupportDeskAction,
|
| 9 |
+
SupportDeskObservation,
|
| 10 |
+
SupportDeskState,
|
| 11 |
+
SupportTicket,
|
| 12 |
+
)
|
| 13 |
+
from supportdesk_env.policies import default_note, default_reply, heuristic_action
|
| 14 |
+
from supportdesk_env.tasks import TASKS, SupportTaskSpec, get_task, list_task_ids
|
| 15 |
+
|
| 16 |
+
try:
|
| 17 |
+
from supportdesk_env.client import SupportDeskEnv
|
| 18 |
+
except ImportError: # pragma: no cover - local unit tests can run without openenv-core
|
| 19 |
+
SupportDeskEnv = None # type: ignore[assignment]
|
| 20 |
+
|
| 21 |
+
try:
|
| 22 |
+
from supportdesk_env.server.supportdesk_environment import SupportDeskEnvironment
|
| 23 |
+
except ImportError: # pragma: no cover - guarded for partial local setups
|
| 24 |
+
SupportDeskEnvironment = None # type: ignore[assignment]
|
| 25 |
+
|
| 26 |
+
__all__ = [
|
| 27 |
+
"ActionHistoryEntry",
|
| 28 |
+
"GradeBreakdown",
|
| 29 |
+
"KnowledgeSnippet",
|
| 30 |
+
"SupportCaseProgress",
|
| 31 |
+
"SupportDeskAction",
|
| 32 |
+
"SupportDeskEnv",
|
| 33 |
+
"SupportDeskEnvironment",
|
| 34 |
+
"SupportDeskObservation",
|
| 35 |
+
"SupportDeskState",
|
| 36 |
+
"SupportTaskSpec",
|
| 37 |
+
"SupportTicket",
|
| 38 |
+
"TASKS",
|
| 39 |
+
"default_note",
|
| 40 |
+
"default_reply",
|
| 41 |
+
"get_task",
|
| 42 |
+
"grade_case",
|
| 43 |
+
"grade_task_id",
|
| 44 |
+
"heuristic_action",
|
| 45 |
+
"list_task_ids",
|
| 46 |
+
]
|
supportdesk_env/client.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""HTTP client for interacting with a deployed SupportDesk environment."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from supportdesk_env.models import SupportDeskAction, SupportDeskObservation, SupportDeskState
|
| 6 |
+
from supportdesk_env.openenv_compat import EnvClient, StepResult
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def _validate(model_cls, payload):
|
| 10 |
+
if hasattr(model_cls, "model_validate"):
|
| 11 |
+
return model_cls.model_validate(payload)
|
| 12 |
+
return model_cls(**payload) # pragma: no cover - pydantic v1 fallback
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class SupportDeskEnv(EnvClient[SupportDeskAction, SupportDeskObservation, SupportDeskState]):
    """Typed client for a locally running or deployed OpenEnv server."""

    def _parse_state(self, payload) -> SupportDeskState:
        """Deserialize a ``/state`` response body."""
        return _validate(SupportDeskState, payload)

    def _parse_reset(self, payload) -> SupportDeskObservation:
        """Deserialize a ``/reset`` response body."""
        return _validate(SupportDeskObservation, payload)

    def _parse_result(self, payload) -> StepResult[SupportDeskObservation]:
        """Deserialize a ``/step`` response into a typed StepResult."""
        parsed_observation = _validate(SupportDeskObservation, payload["observation"])
        return StepResult(
            observation=parsed_observation,
            done=payload["done"],
            reward=payload["reward"],
            info=payload.get("info", {}),
        )
|
supportdesk_env/graders.py
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Deterministic graders and reward helpers for SupportDesk."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import re
|
| 6 |
+
from dataclasses import dataclass
|
| 7 |
+
|
| 8 |
+
from supportdesk_env.models import SupportCaseProgress
|
| 9 |
+
from supportdesk_env.tasks import SupportTaskSpec, get_task
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
@dataclass(frozen=True)
class GradeBreakdown:
    """A scored view of how close a case is to the gold solution."""

    # Weighted aggregate of all components, rounded to 4 decimal places.
    total_score: float
    # Exact-match components: 1.0 when the case field equals the gold value, else 0.0.
    queue_score: float
    priority_score: float
    issue_type_score: float
    # Fractional overlap with the task's required requested fields (extras penalized).
    requested_fields_score: float
    # Marker-group coverage of the customer reply, minus any forbidden-marker penalty.
    reply_score: float
    # Marker-group coverage of the internal note.
    note_score: float
    status_score: float
    resolution_score: float
    # Names of components that were fully achieved, in a fixed order.
    completed_milestones: tuple[str, ...]
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def _normalize(text: str | None) -> str:
|
| 29 |
+
if not text:
|
| 30 |
+
return ""
|
| 31 |
+
normalized = text.lower().replace("-", " ")
|
| 32 |
+
return re.sub(r"[^a-z0-9\s]", " ", normalized)
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def _marker_group_score(text: str | None, marker_groups: tuple[tuple[str, ...], ...]) -> float:
    """Return the fraction of marker groups satisfied by *text*.

    A group is satisfied when any one of its markers appears in the
    normalized text. No groups at all means a perfect 1.0; empty text
    with groups present scores 0.0.
    """
    if len(marker_groups) == 0:
        return 1.0

    haystack = _normalize(text)
    if not haystack:
        return 0.0

    satisfied = sum(
        1
        for group in marker_groups
        if any(_normalize(candidate) in haystack for candidate in group)
    )
    return satisfied / len(marker_groups)
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def _requested_fields_score(case: SupportCaseProgress, task: SupportTaskSpec) -> float:
|
| 51 |
+
required = set(task.required_requested_fields)
|
| 52 |
+
requested = set(case.requested_fields)
|
| 53 |
+
|
| 54 |
+
if not required:
|
| 55 |
+
return 1.0 if not requested else 0.0
|
| 56 |
+
if not requested:
|
| 57 |
+
return 0.0
|
| 58 |
+
|
| 59 |
+
matched = len(required.intersection(requested))
|
| 60 |
+
extras = len(requested.difference(required))
|
| 61 |
+
raw = matched / len(required)
|
| 62 |
+
penalty = min(0.25, extras * 0.05)
|
| 63 |
+
return max(0.0, raw - penalty)
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def _reply_penalty(case: SupportCaseProgress, task: SupportTaskSpec) -> float:
    """Return 0.5 when the reply contains any forbidden marker, else 0.0.

    An empty or missing reply carries no penalty.
    """
    reply_text = _normalize(case.reply)
    if not reply_text:
        return 0.0
    for marker in task.forbidden_reply_markers:
        if _normalize(marker) in reply_text:
            return 0.5
    return 0.0
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def grade_case(task: SupportTaskSpec, case: SupportCaseProgress) -> GradeBreakdown:
    """Score a case from 0.0 to 1.0 using deterministic task rules."""

    def _exact(actual, expected) -> float:
        # 1.0 only on exact equality with the gold value.
        return 1.0 if actual == expected else 0.0

    queue_score = _exact(case.queue, task.gold_queue)
    priority_score = _exact(case.priority, task.gold_priority)
    issue_type_score = _exact(case.issue_type, task.gold_issue_type)
    status_score = _exact(case.status, task.gold_status)
    resolution_score = _exact(case.resolution_code, task.gold_resolution_code)
    requested_fields_score = _requested_fields_score(case, task)
    # Reply coverage is reduced by the forbidden-marker penalty, floored at 0.
    reply_score = max(
        0.0,
        _marker_group_score(case.reply, task.required_reply_markers) - _reply_penalty(case, task),
    )
    note_score = _marker_group_score(case.internal_note, task.required_note_markers)

    # Fixed weights summing to 1.0; the customer-facing reply dominates (0.25).
    weighted_total = (
        0.15 * queue_score
        + 0.10 * priority_score
        + 0.10 * issue_type_score
        + 0.15 * requested_fields_score
        + 0.25 * reply_score
        + 0.10 * note_score
        + 0.10 * status_score
        + 0.05 * resolution_score
    )

    # Milestones record fully-achieved components. Fractional scores count
    # only at >= 0.99 so near-misses are excluded; exact-match components
    # count whenever they scored.
    milestone_checks = (
        ("queue", queue_score >= 1.0),
        ("priority", priority_score >= 1.0),
        ("issue_type", issue_type_score >= 1.0),
        ("requested_fields", requested_fields_score >= 0.99),
        ("reply", reply_score >= 0.99),
        ("internal_note", note_score >= 0.99),
        ("status", status_score >= 1.0),
        ("resolution_code", resolution_score >= 1.0),
    )
    completed = tuple(name for name, achieved in milestone_checks if achieved)

    return GradeBreakdown(
        total_score=round(weighted_total, 4),
        queue_score=queue_score,
        priority_score=priority_score,
        issue_type_score=issue_type_score,
        requested_fields_score=round(requested_fields_score, 4),
        reply_score=round(reply_score, 4),
        note_score=round(note_score, 4),
        status_score=status_score,
        resolution_score=resolution_score,
        completed_milestones=completed,
    )
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
def grade_task_id(task_id: str, case: SupportCaseProgress) -> GradeBreakdown:
    """Convenience wrapper used by tests and evaluation scripts."""

    # Resolves the task spec by id, then delegates to grade_case.
    return grade_case(get_task(task_id), case)
|
supportdesk_env/models.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Typed models for the SupportDesk OpenEnv environment."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from typing import Literal
|
| 6 |
+
|
| 7 |
+
from pydantic import BaseModel, Field
|
| 8 |
+
|
| 9 |
+
from supportdesk_env.openenv_compat import Action, Observation, State
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class KnowledgeSnippet(BaseModel):
|
| 13 |
+
"""A policy or runbook excerpt the agent can use during triage."""
|
| 14 |
+
|
| 15 |
+
article_id: str
|
| 16 |
+
title: str
|
| 17 |
+
content: str
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class SupportTicket(BaseModel):
    """Static task input representing the inbound support ticket."""

    customer_name: str
    # Closed set of subscription tiers accepted by validation.
    customer_tier: Literal["free", "pro", "enterprise"]
    company: str
    subject: str
    body: str
    region: str
    # Optional triage context; None when the ticket does not report it.
    affected_users: int | None = None
    sla_minutes_remaining: int | None = None
    business_impact: str | None = None
    # default_factory keeps list defaults per-instance (never shared).
    secondary_concerns: list[str] = Field(default_factory=list)
    attachments: list[str] = Field(default_factory=list)
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class ActionHistoryEntry(BaseModel):
|
| 37 |
+
"""A concise trace entry used in observations and state dumps."""
|
| 38 |
+
|
| 39 |
+
step: int
|
| 40 |
+
operation: str
|
| 41 |
+
summary: str
|
| 42 |
+
reward_delta: float = 0.0
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
class SupportCaseProgress(BaseModel):
|
| 46 |
+
"""Mutable case state that graders score against."""
|
| 47 |
+
|
| 48 |
+
queue: str | None = None
|
| 49 |
+
priority: str | None = None
|
| 50 |
+
issue_type: str | None = None
|
| 51 |
+
status: str = "new"
|
| 52 |
+
resolution_code: str | None = None
|
| 53 |
+
requested_fields: list[str] = Field(default_factory=list)
|
| 54 |
+
reply: str | None = None
|
| 55 |
+
internal_note: str | None = None
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
class SupportDeskAction(Action):
    """One structured action the agent can take at each step."""

    # The operation selects how the remaining optional fields are used.
    operation: Literal["classify", "request_info", "draft_reply", "add_internal_note", "submit"]
    queue: str | None = None
    priority: str | None = None
    issue_type: str | None = None
    status: str | None = None
    resolution_code: str | None = None
    # Fields to ask the customer for (used by request_info).
    requested_fields: list[str] = Field(default_factory=list)
    # Customer-facing reply text (used by draft_reply).
    reply: str | None = None
    # Agent-only note text (used by add_internal_note).
    internal_note: str | None = None
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
class SupportDeskObservation(Observation):
    """Observation emitted to the agent after reset and each step."""

    task_id: str
    difficulty: Literal["easy", "medium", "hard"]
    objective: str
    # Static inputs for this episode: the ticket plus policy snippets.
    ticket: SupportTicket
    knowledge_base: list[KnowledgeSnippet]
    # Valid values the agent may choose from for each classification field.
    available_queues: list[str]
    available_priorities: list[str]
    available_statuses: list[str]
    available_issue_types: list[str]
    # Mutable case state accumulated by the agent's actions so far.
    case: SupportCaseProgress
    action_history: list[ActionHistoryEntry] = Field(default_factory=list)
    # Textual feedback from the environment about the last action.
    feedback: str = ""
    remaining_steps: int = 0
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
class SupportDeskState(State):
|
| 91 |
+
"""Current environment state returned by the OpenEnv state() API."""
|
| 92 |
+
|
| 93 |
+
task_id: str
|
| 94 |
+
difficulty: Literal["easy", "medium", "hard"]
|
| 95 |
+
step_count: int = 0
|
| 96 |
+
reward: float = 0.0
|
| 97 |
+
done: bool = False
|
| 98 |
+
current_score: float = 0.0
|
| 99 |
+
max_steps: int = 0
|
| 100 |
+
case: SupportCaseProgress
|
| 101 |
+
action_history: list[ActionHistoryEntry] = Field(default_factory=list)
|
| 102 |
+
completed_milestones: list[str] = Field(default_factory=list)
|
| 103 |
+
last_feedback: str = ""
|
supportdesk_env/openenv_compat.py
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Compatibility helpers for environments where openenv-core is not installed."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from dataclasses import dataclass, field
|
| 6 |
+
from typing import Any, Generic, TypeVar
|
| 7 |
+
|
| 8 |
+
from pydantic import BaseModel
|
| 9 |
+
|
| 10 |
+
# Type variables for the Action / Observation / State generics below.
A = TypeVar("A")
O = TypeVar("O")
S = TypeVar("S")


# True when a real OpenEnv distribution is importable; flipped to False only
# when both known package layouts fail to import and the fallbacks are used.
OPENENV_AVAILABLE = True

try:
    # Preferred layout: the `openenv` namespace package.
    from openenv.core.client_types import StepResult  # type: ignore
    from openenv.core.env_client import EnvClient  # type: ignore
    from openenv.core.env_server.interfaces import Environment  # type: ignore
    from openenv.core.env_server.types import Action, Observation, State  # type: ignore
    from openenv.core.env_server.types import EnvironmentMetadata  # type: ignore
except ImportError:
    try:
        # Older layout: the flat `openenv_core` package (client class has a
        # different name, so it is aliased to EnvClient here).
        from openenv_core.client_types import StepResult  # type: ignore
        from openenv_core.http_env_client import HTTPEnvClient as EnvClient  # type: ignore
        from openenv_core.env_server.interfaces import Environment  # type: ignore
        from openenv_core.env_server.types import Action, Observation, State  # type: ignore
        from openenv_core.env_server.types import EnvironmentMetadata  # type: ignore
    except ImportError:
        OPENENV_AVAILABLE = False

        class Action(BaseModel):
            """Fallback Action base type for local import-only workflows."""

        class Observation(BaseModel):
            """Fallback Observation base type for local import-only workflows."""

            # Mirrors the OpenEnv observation contract: per-step reward and
            # an episode-termination flag.
            reward: float = 0.0
            done: bool = False

        class State(BaseModel):
            """Fallback State base type for local import-only workflows."""

        class Environment(Generic[A, O, S]):
            """Minimal base class used for local unit tests and import-based demos."""

            def __init__(self) -> None:
                super().__init__()

        class EnvironmentMetadata(BaseModel):
            """Fallback metadata model used when OpenEnv is absent."""

            name: str
            description: str
            readme_content: str | None = None
            version: str | None = None
            author: str | None = None

        @dataclass
        class StepResult(Generic[O]):
            """Fallback step result for local-only client compatibility."""

            observation: O
            reward: float
            done: bool
            info: dict[str, Any] = field(default_factory=dict)

        class EnvClient(Generic[A, O, S]):
            """Placeholder client that fails only when actually used."""

            def __init__(self, *args, **kwargs) -> None:
                # Defer the failure from import time to construction time so
                # the package can be imported without openenv-core installed.
                raise ImportError(
                    "SupportDeskEnv requires openenv-core to be installed. "
                    "Run `py -3 -m pip install openenv-core` to use the HTTP client."
                )
|
supportdesk_env/policies.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Reusable policy helpers for local baselines and training examples."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from supportdesk_env.models import SupportDeskAction, SupportDeskObservation
|
| 6 |
+
from supportdesk_env.tasks import get_task
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def default_reply(task_id: str) -> str:
    """Return a task-specific high-signal customer reply.

    Known task ids get a tailored message; any other id falls back to the
    incident-style reply used by the hard API-outage task.
    """

    tailored = {
        "billing_refund_easy": (
            "Thanks for flagging the duplicate charge. I have started the refund for the extra "
            "charge, and the funds usually appear within 5-7 business days."
        ),
        "account_takeover_medium": (
            "We have escalated this to our trust team. Please reset your password, scan your "
            "device for malware, and reply with your workspace_id, last successful login time, "
            "and billing email so we can verify the account safely."
        ),
    }
    incident_fallback = (
        "We are treating this as an active incident and our on-call engineering team is engaged. "
        "Please send the affected request IDs, UTC timestamps, and the impacted region so we can "
        "speed up the investigation."
    )
    return tailored.get(task_id, incident_fallback)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def default_note(task_id: str) -> str:
    """Return a task-specific internal note.

    Unknown task ids receive the incident-context note used by the hard task.
    """

    notes = {
        "billing_refund_easy": "Duplicate charge confirmed from attached invoice; refund approved.",
        "account_takeover_medium": "Suspicious login alert reported and customer is locked out.",
    }
    return notes.get(
        task_id,
        "EU data residency rollout hit intermittent HTTP 500s and the customer launches tonight.",
    )
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def heuristic_action(observation: SupportDeskObservation) -> SupportDeskAction:
    """Deterministic high-performing policy used by the baseline.

    Works through the case in a fixed order: classify, request required info,
    draft the customer reply, leave an internal note, then submit.
    """

    spec = get_task(observation.task_id)
    progress = observation.case

    # Step 1: classify the ticket while any triage field is still unset.
    needs_triage = (
        progress.queue is None
        or progress.priority is None
        or progress.issue_type is None
    )
    if needs_triage:
        return SupportDeskAction(
            operation="classify",
            queue=spec.gold_queue,
            priority=spec.gold_priority,
            issue_type=spec.gold_issue_type,
        )

    # Step 2: request the verification fields if the task requires them and the
    # case does not yet list exactly that set (order-insensitive comparison).
    if spec.required_requested_fields:
        if sorted(progress.requested_fields) != sorted(spec.required_requested_fields):
            return SupportDeskAction(
                operation="request_info",
                requested_fields=list(spec.required_requested_fields),
                status=spec.gold_status,
                reply=default_reply(observation.task_id),
            )

    # Step 3: send the customer-facing reply once triage is done.
    if not progress.reply:
        return SupportDeskAction(operation="draft_reply", reply=default_reply(observation.task_id))

    # Step 4: record the internal handoff note.
    if not progress.internal_note:
        return SupportDeskAction(
            operation="add_internal_note",
            internal_note=default_note(observation.task_id),
        )

    # Step 5: everything is in place — submit with the gold disposition.
    return SupportDeskAction(
        operation="submit",
        status=spec.gold_status,
        resolution_code=spec.gold_resolution_code,
    )
|
supportdesk_env/server/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""SupportDesk server package."""
|
supportdesk_env/server/app.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""FastAPI app entrypoint for the SupportDesk environment."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
|
| 7 |
+
import uvicorn
|
| 8 |
+
|
| 9 |
+
try:
|
| 10 |
+
from openenv.core.env_server.http_server import create_app
|
| 11 |
+
except ImportError: # pragma: no cover - package name differs across releases
|
| 12 |
+
from openenv_core.env_server.http_server import create_app
|
| 13 |
+
|
| 14 |
+
from supportdesk_env.models import SupportDeskAction, SupportDeskObservation
|
| 15 |
+
from supportdesk_env.server.supportdesk_environment import SupportDeskEnvironment
|
| 16 |
+
|
| 17 |
+
app = create_app(
|
| 18 |
+
SupportDeskEnvironment,
|
| 19 |
+
action_cls=SupportDeskAction,
|
| 20 |
+
observation_cls=SupportDeskObservation,
|
| 21 |
+
env_name="supportdesk_env",
|
| 22 |
+
)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def main() -> None:
|
| 26 |
+
"""Run the local HTTP server."""
|
| 27 |
+
|
| 28 |
+
port = int(os.getenv("PORT", "8000"))
|
| 29 |
+
uvicorn.run("supportdesk_env.server.app:app", host="0.0.0.0", port=port)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
if __name__ == "__main__":
|
| 33 |
+
main()
|
supportdesk_env/server/supportdesk_environment.py
ADDED
|
@@ -0,0 +1,241 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""SupportDesk environment implementation."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
from supportdesk_env.graders import grade_case
|
| 9 |
+
from supportdesk_env.models import (
|
| 10 |
+
ActionHistoryEntry,
|
| 11 |
+
SupportCaseProgress,
|
| 12 |
+
SupportDeskAction,
|
| 13 |
+
SupportDeskObservation,
|
| 14 |
+
SupportDeskState,
|
| 15 |
+
)
|
| 16 |
+
from supportdesk_env.openenv_compat import Environment, EnvironmentMetadata
|
| 17 |
+
from supportdesk_env.tasks import (
|
| 18 |
+
ALL_ISSUE_TYPES,
|
| 19 |
+
ALL_PRIORITIES,
|
| 20 |
+
ALL_QUEUES,
|
| 21 |
+
ALL_STATUSES,
|
| 22 |
+
SupportTaskSpec,
|
| 23 |
+
get_task,
|
| 24 |
+
list_task_ids,
|
| 25 |
+
)
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
class SupportDeskEnvironment(
    Environment[SupportDeskAction, SupportDeskObservation, SupportDeskState]
):
    """A realistic customer support triage environment with dense rewards.

    Reward shaping: each step's reward is the delta of the deterministic
    grade computed by ``grade_case`` before vs. after the action, plus small
    fixed penalties for unproductive or malformed actions. Episodes end on
    an explicit ``submit`` action or when the task's step budget runs out.
    """

    def __init__(self, task_id: str | None = None):
        super().__init__()
        # Task selection precedence: explicit argument > env var > first registered task.
        requested_task = task_id or os.getenv("SUPPORTDESK_TASK_ID") or list_task_ids()[0]
        self.task: SupportTaskSpec = get_task(requested_task)
        self._max_steps = self.task.max_steps
        self._step_count = 0
        self._reward_total = 0.0
        self._done = False
        self._last_feedback = ""
        self._history: list[ActionHistoryEntry] = []
        self._case = SupportCaseProgress()
        # Grade the empty case up front so score/milestones match what reset() produces.
        initial_grade = grade_case(self.task, self._case)
        self._score = initial_grade.total_score
        self._completed_milestones = list(initial_grade.completed_milestones)

    @property
    def state(self) -> SupportDeskState:
        # Deep-copy mutable members so callers cannot mutate internal state
        # through the returned snapshot.
        return SupportDeskState(
            task_id=self.task.task_id,
            difficulty=self.task.difficulty,
            step_count=self._step_count,
            reward=round(self._reward_total, 4),
            done=self._done,
            current_score=round(self._score, 4),
            max_steps=self._max_steps,
            case=self._case.model_copy(deep=True),
            action_history=[entry.model_copy(deep=True) for entry in self._history],
            completed_milestones=list(self._completed_milestones),
            last_feedback=self._last_feedback,
        )

    def reset(
        self,
        seed: int | None = None,
        episode_id: str | None = None,
        **kwargs,
    ) -> SupportDeskObservation:
        """Start a fresh episode on the same task and return the first observation.

        ``seed`` and ``episode_id`` are accepted for interface compatibility;
        the environment is fully deterministic, so they are unused here.
        """
        self._step_count = 0
        self._reward_total = 0.0
        self._done = False
        self._last_feedback = "New case loaded. Review the ticket and policy snippets before acting."
        self._history = []
        self._case = SupportCaseProgress()
        initial_grade = grade_case(self.task, self._case)
        self._score = initial_grade.total_score
        self._completed_milestones = list(initial_grade.completed_milestones)
        return self._build_observation(reward=0.0, done=False)

    def step(
        self,
        action: SupportDeskAction,
        timeout_s: float | None = None,
        **kwargs,
    ) -> SupportDeskObservation:
        """Apply one agent action, compute the shaped reward, and advance the episode."""
        if self._done:
            # NOTE(review): this -0.05 penalty is reported in the observation but is
            # not folded into self._reward_total or the history — confirm intended.
            return self._build_observation(
                reward=-0.05,
                done=True,
                feedback="Episode already finished. Call reset() before taking more actions.",
            )

        # Dense reward = grade delta caused by this action, plus shaping penalties.
        previous_grade = grade_case(self.task, self._case)
        self._apply_action(action)
        self._step_count += 1

        current_grade = grade_case(self.task, self._case)
        reward = current_grade.total_score - previous_grade.total_score
        reward += self._action_penalty(action, current_grade.total_score, previous_grade.total_score)
        reward = round(reward, 4)

        self._score = current_grade.total_score
        self._completed_milestones = list(current_grade.completed_milestones)

        # Terminal conditions: explicit submit, or exhausting the step budget.
        if action.operation == "submit":
            self._done = True
            self._last_feedback = (
                "Case submitted. Final deterministic grade is "
                f"{current_grade.total_score:.2f}."
            )
        elif self._step_count >= self._max_steps:
            self._done = True
            self._last_feedback = (
                f"Reached max steps ({self._max_steps}). Final deterministic grade is "
                f"{current_grade.total_score:.2f}."
            )
        else:
            self._last_feedback = self._build_feedback(current_grade, reward)

        self._reward_total = round(self._reward_total + reward, 4)
        self._history.append(
            ActionHistoryEntry(
                step=self._step_count,
                operation=action.operation,
                summary=self._summarize_action(action),
                reward_delta=reward,
            )
        )

        return self._build_observation(reward=reward, done=self._done)

    def close(self) -> None:
        """No-op close hook for compatibility with local scripts."""

    def get_metadata(self) -> EnvironmentMetadata:
        """Return richer metadata for docs, validators, and HF Space UI."""

        # README is expected two directories above this module (the repo root).
        readme_path = Path(__file__).resolve().parents[2] / "README.md"
        readme_content = readme_path.read_text(encoding="utf-8") if readme_path.exists() else None
        return EnvironmentMetadata(
            name="supportdesk_env",
            description=(
                "A policy-heavy enterprise support operations environment with deterministic "
                "grading, dense rewards, SLA pressure, and escalating ticket difficulty."
            ),
            readme_content=readme_content,
            version="0.1.0",
            author="HyperBrick",
        )

    def _apply_action(self, action: SupportDeskAction) -> None:
        """Merge the action's populated fields into the case-in-progress.

        Only attributes set on the action overwrite case state;
        ``requested_fields`` accumulate across actions rather than replace.
        """
        if action.queue is not None:
            self._case.queue = action.queue
        if action.priority is not None:
            self._case.priority = action.priority
        if action.issue_type is not None:
            self._case.issue_type = action.issue_type
        if action.status is not None:
            self._case.status = action.status
        if action.resolution_code is not None:
            self._case.resolution_code = action.resolution_code
        if action.reply is not None:
            self._case.reply = action.reply
        if action.internal_note is not None:
            self._case.internal_note = action.internal_note
        if action.requested_fields:
            # Union with any previously requested fields; stored sorted for determinism.
            merged = {item for item in self._case.requested_fields}
            merged.update(action.requested_fields)
            self._case.requested_fields = sorted(merged)

    def _action_penalty(
        self,
        action: SupportDeskAction,
        current_score: float,
        previous_score: float,
    ) -> float:
        """Return a (non-positive) shaping penalty for unproductive or empty actions."""
        penalty = 0.0
        # No grade progress at all: discourage churn.
        if current_score <= previous_score:
            penalty -= 0.03
        # Operation-specific penalties for actions missing their payload.
        if action.operation == "draft_reply" and not action.reply:
            penalty -= 0.03
        if action.operation == "request_info" and not action.requested_fields:
            penalty -= 0.03
        if action.operation == "add_internal_note" and not action.internal_note:
            penalty -= 0.03
        if action.operation == "classify" and not any(
            [action.queue, action.priority, action.issue_type, action.status, action.resolution_code]
        ):
            penalty -= 0.03
        return round(penalty, 4)

    def _build_feedback(self, grade, reward: float) -> str:
        # `grade` is whatever grade_case returns; only total_score and
        # completed_milestones are read here.
        return (
            f"Reward delta {reward:+.2f}. Current score {grade.total_score:.2f}. "
            f"Completed milestones: {', '.join(grade.completed_milestones) or 'none yet'}."
        )

    def _summarize_action(self, action: SupportDeskAction) -> str:
        """Build a compact one-line summary of the action for the history log."""
        parts = [action.operation]
        if action.queue:
            parts.append(f"queue={action.queue}")
        if action.priority:
            parts.append(f"priority={action.priority}")
        if action.issue_type:
            parts.append(f"issue_type={action.issue_type}")
        if action.status:
            parts.append(f"status={action.status}")
        if action.resolution_code:
            parts.append(f"resolution={action.resolution_code}")
        if action.requested_fields:
            parts.append(f"requested={','.join(action.requested_fields)}")
        if action.reply:
            parts.append("reply=yes")
        if action.internal_note:
            parts.append("note=yes")
        return " | ".join(parts)

    def _build_observation(
        self,
        reward: float,
        done: bool,
        feedback: str | None = None,
    ) -> SupportDeskObservation:
        """Assemble the full observation, deep-copying mutable internal state."""
        return SupportDeskObservation(
            task_id=self.task.task_id,
            difficulty=self.task.difficulty,
            objective=self.task.objective,
            ticket=self.task.ticket,
            knowledge_base=list(self.task.knowledge_base),
            available_queues=list(ALL_QUEUES),
            available_priorities=list(ALL_PRIORITIES),
            available_statuses=list(ALL_STATUSES),
            available_issue_types=list(ALL_ISSUE_TYPES),
            case=self._case.model_copy(deep=True),
            action_history=[entry.model_copy(deep=True) for entry in self._history],
            feedback=feedback or self._last_feedback,
            remaining_steps=max(self._max_steps - self._step_count, 0),
            reward=reward,
            done=done,
        )
|
supportdesk_env/tasks.py
ADDED
|
@@ -0,0 +1,281 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Task registry for the SupportDesk environment."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from dataclasses import dataclass
|
| 6 |
+
from typing import Literal
|
| 7 |
+
|
| 8 |
+
from supportdesk_env.models import KnowledgeSnippet, SupportTicket
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
ALL_QUEUES = ["billing_ops", "trust_and_safety", "platform_engineering", "general_support"]
|
| 12 |
+
ALL_PRIORITIES = ["low", "normal", "high", "urgent"]
|
| 13 |
+
ALL_STATUSES = ["new", "waiting_on_customer", "resolved", "escalated"]
|
| 14 |
+
ALL_ISSUE_TYPES = [
|
| 15 |
+
"duplicate_charge",
|
| 16 |
+
"account_compromise",
|
| 17 |
+
"production_incident",
|
| 18 |
+
"general_question",
|
| 19 |
+
]
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
@dataclass(frozen=True)
class SupportTaskSpec:
    """Immutable definition of a single support triage task."""

    task_id: str
    difficulty: Literal["easy", "medium", "hard"]
    title: str
    # Human-readable goal surfaced to the agent in observations.
    objective: str
    ticket: SupportTicket
    knowledge_base: tuple[KnowledgeSnippet, ...]
    # "Gold" values: the graded-correct classification and disposition.
    gold_queue: str
    gold_priority: str
    gold_issue_type: str
    gold_status: str
    gold_resolution_code: str
    # Fields the agent must request from the customer (empty when none are needed).
    required_requested_fields: tuple[str, ...]
    # Each inner tuple is a group of alternative phrasings; presumably the reply
    # must contain at least one marker from every group — confirm against the grader.
    required_reply_markers: tuple[tuple[str, ...], ...]
    required_note_markers: tuple[tuple[str, ...], ...]
    # Phrases that must NOT appear in the customer reply.
    forbidden_reply_markers: tuple[str, ...] = ()
    # Episode step budget before the environment force-terminates.
    max_steps: int = 6
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
TASKS: dict[str, SupportTaskSpec] = {
|
| 45 |
+
"billing_refund_easy": SupportTaskSpec(
|
| 46 |
+
task_id="billing_refund_easy",
|
| 47 |
+
difficulty="easy",
|
| 48 |
+
title="Duplicate charge refund triage",
|
| 49 |
+
objective=(
|
| 50 |
+
"Triage a duplicate-charge billing ticket, send the correct customer response, "
|
| 51 |
+
"and close the case only if no further customer information is required."
|
| 52 |
+
),
|
| 53 |
+
ticket=SupportTicket(
|
| 54 |
+
customer_name="Riya Shah",
|
| 55 |
+
customer_tier="pro",
|
| 56 |
+
company="PixelNorth Studio",
|
| 57 |
+
subject="Charged twice after I canceled",
|
| 58 |
+
body=(
|
| 59 |
+
"I canceled our Pro annual workspace yesterday, but my card was charged again "
|
| 60 |
+
"this morning and I still see the old invoice. We only had one workspace, "
|
| 61 |
+
"so this looks like a duplicate charge. Please fix it quickly."
|
| 62 |
+
),
|
| 63 |
+
region="ap-south-1",
|
| 64 |
+
affected_users=12,
|
| 65 |
+
sla_minutes_remaining=240,
|
| 66 |
+
business_impact="Finance ops are blocked from closing the monthly books until the duplicate invoice is fixed.",
|
| 67 |
+
secondary_concerns=["The customer also wants confirmation that the canceled workspace will stay deactivated."],
|
| 68 |
+
attachments=["invoice_7741.pdf"],
|
| 69 |
+
),
|
| 70 |
+
knowledge_base=(
|
| 71 |
+
KnowledgeSnippet(
|
| 72 |
+
article_id="KB-101",
|
| 73 |
+
title="Duplicate charges and same-day cancellations",
|
| 74 |
+
content=(
|
| 75 |
+
"If a customer reports a duplicate charge and the subscription is already "
|
| 76 |
+
"canceled, route the ticket to billing_ops with high priority. Billing can "
|
| 77 |
+
"approve the refund immediately without requesting extra information when an "
|
| 78 |
+
"invoice is attached."
|
| 79 |
+
),
|
| 80 |
+
),
|
| 81 |
+
KnowledgeSnippet(
|
| 82 |
+
article_id="KB-102",
|
| 83 |
+
title="Refund communication checklist",
|
| 84 |
+
content=(
|
| 85 |
+
"Customer replies for approved duplicate-charge refunds must confirm that a "
|
| 86 |
+
"refund is being processed, mention the duplicate charge, and set the "
|
| 87 |
+
"expectation that funds typically appear within 5-7 business days."
|
| 88 |
+
),
|
| 89 |
+
),
|
| 90 |
+
KnowledgeSnippet(
|
| 91 |
+
article_id="KB-103",
|
| 92 |
+
title="When to close a billing case",
|
| 93 |
+
content=(
|
| 94 |
+
"Close the case as resolved only after the refund path is clear and no more "
|
| 95 |
+
"customer details are needed."
|
| 96 |
+
),
|
| 97 |
+
),
|
| 98 |
+
),
|
| 99 |
+
gold_queue="billing_ops",
|
| 100 |
+
gold_priority="high",
|
| 101 |
+
gold_issue_type="duplicate_charge",
|
| 102 |
+
gold_status="resolved",
|
| 103 |
+
gold_resolution_code="refund_approved",
|
| 104 |
+
required_requested_fields=(),
|
| 105 |
+
required_reply_markers=(
|
| 106 |
+
("refund", "refunded", "reimburse"),
|
| 107 |
+
("duplicate charge", "charged twice", "double charge"),
|
| 108 |
+
("5-7 business days", "5 to 7 business days", "within 7 business days"),
|
| 109 |
+
),
|
| 110 |
+
required_note_markers=(
|
| 111 |
+
("duplicate charge", "double charge"),
|
| 112 |
+
("refund", "refund approved"),
|
| 113 |
+
),
|
| 114 |
+
forbidden_reply_markers=("chargeback", "security team"),
|
| 115 |
+
),
|
| 116 |
+
"account_takeover_medium": SupportTaskSpec(
|
| 117 |
+
task_id="account_takeover_medium",
|
| 118 |
+
difficulty="medium",
|
| 119 |
+
title="Suspicious login recovery triage",
|
| 120 |
+
objective=(
|
| 121 |
+
"Handle a potential account-compromise case, request the missing verification "
|
| 122 |
+
"details, communicate safe next steps, and keep the case open until the customer replies. "
|
| 123 |
+
"The agent must protect account safety without promising an unsafe immediate unlock."
|
| 124 |
+
),
|
| 125 |
+
ticket=SupportTicket(
|
| 126 |
+
customer_name="Marcus Lee",
|
| 127 |
+
customer_tier="pro",
|
| 128 |
+
company="Northline Analytics",
|
| 129 |
+
subject="Locked out after strange login alert",
|
| 130 |
+
body=(
|
| 131 |
+
"Our workspace admin got a login alert from a country none of us have visited, "
|
| 132 |
+
"and now I can't get back into the account. Please unlock it ASAP. The billing "
|
| 133 |
+
"email is still ours, but I'm worried someone got in."
|
| 134 |
+
),
|
| 135 |
+
region="us-east-1",
|
| 136 |
+
affected_users=34,
|
| 137 |
+
sla_minutes_remaining=90,
|
| 138 |
+
business_impact="The admin is locked out of the analytics workspace ahead of the Monday executive review.",
|
| 139 |
+
secondary_concerns=["The customer wants the account unlocked immediately, but the verification flow cannot be skipped."],
|
| 140 |
+
attachments=[],
|
| 141 |
+
),
|
| 142 |
+
knowledge_base=(
|
| 143 |
+
KnowledgeSnippet(
|
| 144 |
+
article_id="SEC-201",
|
| 145 |
+
title="Account compromise routing",
|
| 146 |
+
content=(
|
| 147 |
+
"Potential account-takeover reports route to trust_and_safety with urgent "
|
| 148 |
+
"priority. Do not resolve the case immediately."
|
| 149 |
+
),
|
| 150 |
+
),
|
| 151 |
+
KnowledgeSnippet(
|
| 152 |
+
article_id="SEC-202",
|
| 153 |
+
title="Verification details before unlock",
|
| 154 |
+
content=(
|
| 155 |
+
"Before access can be restored, ask the customer for the workspace_id, the "
|
| 156 |
+
"last successful login time, and the billing email on file. Keep the status "
|
| 157 |
+
"waiting_on_customer until the details arrive."
|
| 158 |
+
),
|
| 159 |
+
),
|
| 160 |
+
KnowledgeSnippet(
|
| 161 |
+
article_id="SEC-203",
|
| 162 |
+
title="Customer response checklist",
|
| 163 |
+
content=(
|
| 164 |
+
"Security replies should tell the customer to reset their password, scan "
|
| 165 |
+
"their device for malware, and explain that the trust team is reviewing the case."
|
| 166 |
+
),
|
| 167 |
+
),
|
| 168 |
+
),
|
| 169 |
+
gold_queue="trust_and_safety",
|
| 170 |
+
gold_priority="urgent",
|
| 171 |
+
gold_issue_type="account_compromise",
|
| 172 |
+
gold_status="waiting_on_customer",
|
| 173 |
+
gold_resolution_code="verification_needed",
|
| 174 |
+
required_requested_fields=("workspace_id", "last_successful_login", "billing_email"),
|
| 175 |
+
required_reply_markers=(
|
| 176 |
+
("reset your password", "change your password"),
|
| 177 |
+
("scan", "malware", "device check"),
|
| 178 |
+
("trust team", "security team", "trust and safety"),
|
| 179 |
+
),
|
| 180 |
+
required_note_markers=(
|
| 181 |
+
("suspicious login", "strange login"),
|
| 182 |
+
("locked out", "can’t get back", "cannot get back"),
|
| 183 |
+
),
|
| 184 |
+
),
|
| 185 |
+
"api_incident_hard": SupportTaskSpec(
|
| 186 |
+
task_id="api_incident_hard",
|
| 187 |
+
difficulty="hard",
|
| 188 |
+
title="Production API incident escalation",
|
| 189 |
+
objective=(
|
| 190 |
+
"Triage a high-pressure enterprise incident, ask for the right diagnostics, notify "
|
| 191 |
+
"the customer that engineering is engaged, and escalate instead of resolving. "
|
| 192 |
+
"The agent must prioritize the outage over a tempting secondary compliance question."
|
| 193 |
+
),
|
| 194 |
+
ticket=SupportTicket(
|
| 195 |
+
customer_name="Asha Verma",
|
| 196 |
+
customer_tier="enterprise",
|
| 197 |
+
company="Kairo Health",
|
| 198 |
+
subject="EU rollout blocked by intermittent 500s",
|
| 199 |
+
body=(
|
| 200 |
+
"We're launching our EU workspace tonight. Since enabling EU data residency we "
|
| 201 |
+
"see intermittent HTTP 500 responses from /v1/exports in production. Our "
|
| 202 |
+
"compliance lead is also asking whether this affects the audit trail, but the "
|
| 203 |
+
"main issue is the outage. We need help immediately."
|
| 204 |
+
),
|
| 205 |
+
region="eu-west-1",
|
| 206 |
+
affected_users=1800,
|
| 207 |
+
sla_minutes_remaining=25,
|
| 208 |
+
business_impact="A production launch and a customer-facing compliance review are both at risk tonight if the outage persists.",
|
| 209 |
+
secondary_concerns=["The compliance lead is asking whether audit trails are affected, but the live outage is the primary incident."],
|
| 210 |
+
attachments=["error_screenshot.png"],
|
| 211 |
+
),
|
| 212 |
+
knowledge_base=(
|
| 213 |
+
KnowledgeSnippet(
|
| 214 |
+
article_id="INC-301",
|
| 215 |
+
title="Production availability incidents",
|
| 216 |
+
content=(
|
| 217 |
+
"Any active production 5xx incident for a paying customer routes to "
|
| 218 |
+
"platform_engineering with urgent priority and should be escalated, not resolved."
|
| 219 |
+
),
|
| 220 |
+
),
|
| 221 |
+
KnowledgeSnippet(
|
| 222 |
+
article_id="INC-302",
|
| 223 |
+
title="Minimum diagnostics for API incidents",
|
| 224 |
+
content=(
|
| 225 |
+
"Before engineering can investigate, request concrete examples including "
|
| 226 |
+
"request_ids, UTC timestamps, and the affected region."
|
| 227 |
+
),
|
| 228 |
+
),
|
| 229 |
+
KnowledgeSnippet(
|
| 230 |
+
article_id="INC-303",
|
| 231 |
+
title="Customer communication during an incident",
|
| 232 |
+
content=(
|
| 233 |
+
"The reply should acknowledge an incident, say the on-call engineering team "
|
| 234 |
+
"is engaged, and ask for the diagnostics needed to speed investigation."
|
| 235 |
+
),
|
| 236 |
+
),
|
| 237 |
+
KnowledgeSnippet(
|
| 238 |
+
article_id="INC-304",
|
| 239 |
+
title="Primary issue triage rule",
|
| 240 |
+
content=(
|
| 241 |
+
"When a production outage appears alongside a secondary compliance or audit "
|
| 242 |
+
"question, resolve the live outage first and avoid treating the secondary "
|
| 243 |
+
"question as the primary queue-driving issue."
|
| 244 |
+
),
|
| 245 |
+
),
|
| 246 |
+
),
|
| 247 |
+
gold_queue="platform_engineering",
|
| 248 |
+
gold_priority="urgent",
|
| 249 |
+
gold_issue_type="production_incident",
|
| 250 |
+
gold_status="escalated",
|
| 251 |
+
gold_resolution_code="incident_opened",
|
| 252 |
+
required_requested_fields=("request_ids", "timestamp_utc", "region"),
|
| 253 |
+
required_reply_markers=(
|
| 254 |
+
("incident", "outage", "investigating"),
|
| 255 |
+
("on-call", "engineering team", "engineering is engaged"),
|
| 256 |
+
("request id", "request_ids"),
|
| 257 |
+
("utc", "timestamp"),
|
| 258 |
+
),
|
| 259 |
+
required_note_markers=(
|
| 260 |
+
("eu data residency", "eu rollout"),
|
| 261 |
+
("500", "http 500"),
|
| 262 |
+
("launch tonight", "tonight"),
|
| 263 |
+
),
|
| 264 |
+
),
|
| 265 |
+
}
|
| 266 |
+
|
| 267 |
+
|
| 268 |
+
def get_task(task_id: str) -> SupportTaskSpec:
|
| 269 |
+
"""Return a task definition or raise a helpful error."""
|
| 270 |
+
|
| 271 |
+
try:
|
| 272 |
+
return TASKS[task_id]
|
| 273 |
+
except KeyError as exc: # pragma: no cover - defensive
|
| 274 |
+
valid = ", ".join(sorted(TASKS))
|
| 275 |
+
raise ValueError(f"Unknown task_id '{task_id}'. Valid task ids: {valid}") from exc
|
| 276 |
+
|
| 277 |
+
|
| 278 |
+
def list_task_ids() -> list[str]:
|
| 279 |
+
"""List tasks in a stable evaluation order."""
|
| 280 |
+
|
| 281 |
+
return list(TASKS)
|
tasks.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Compatibility wrapper for the real supportdesk_env package."""
|
| 2 |
+
|
| 3 |
+
from supportdesk_env.tasks import * # noqa: F401,F403
|
tests/test_supportdesk.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Smoke tests for the SupportDesk environment."""
|
| 2 |
+
|
| 3 |
+
from supportdesk_env.graders import grade_case
|
| 4 |
+
from supportdesk_env.models import SupportDeskAction
|
| 5 |
+
from supportdesk_env.server.supportdesk_environment import SupportDeskEnvironment
|
| 6 |
+
from supportdesk_env.tasks import get_task, list_task_ids
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def test_all_tasks_are_registered():
|
| 10 |
+
assert list_task_ids() == [
|
| 11 |
+
"billing_refund_easy",
|
| 12 |
+
"account_takeover_medium",
|
| 13 |
+
"api_incident_hard",
|
| 14 |
+
]
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def test_environment_reset_and_state():
|
| 18 |
+
env = SupportDeskEnvironment(task_id="billing_refund_easy")
|
| 19 |
+
observation = env.reset()
|
| 20 |
+
assert observation.task_id == "billing_refund_easy"
|
| 21 |
+
assert env.state.step_count == 0
|
| 22 |
+
assert env.state.current_score == 0.15
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def test_perfect_solution_grades_full_score():
|
| 26 |
+
task = get_task("billing_refund_easy")
|
| 27 |
+
env = SupportDeskEnvironment(task_id=task.task_id)
|
| 28 |
+
env.reset()
|
| 29 |
+
env.step(
|
| 30 |
+
SupportDeskAction(
|
| 31 |
+
operation="classify",
|
| 32 |
+
queue=task.gold_queue,
|
| 33 |
+
priority=task.gold_priority,
|
| 34 |
+
issue_type=task.gold_issue_type,
|
| 35 |
+
)
|
| 36 |
+
)
|
| 37 |
+
env.step(
|
| 38 |
+
SupportDeskAction(
|
| 39 |
+
operation="draft_reply",
|
| 40 |
+
reply="Refund approved for the duplicate charge and it should arrive within 5-7 business days.",
|
| 41 |
+
)
|
| 42 |
+
)
|
| 43 |
+
env.step(
|
| 44 |
+
SupportDeskAction(
|
| 45 |
+
operation="add_internal_note",
|
| 46 |
+
internal_note="Duplicate charge verified and refund approved.",
|
| 47 |
+
)
|
| 48 |
+
)
|
| 49 |
+
env.step(
|
| 50 |
+
SupportDeskAction(
|
| 51 |
+
operation="submit",
|
| 52 |
+
status=task.gold_status,
|
| 53 |
+
resolution_code=task.gold_resolution_code,
|
| 54 |
+
)
|
| 55 |
+
)
|
| 56 |
+
|
| 57 |
+
breakdown = grade_case(task, env.state.case)
|
| 58 |
+
assert breakdown.total_score == 1.0
|
uv.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|