Spaces:
Sleeping
Sleeping
eeshwar143 commited on
Commit ·
e4accbb
0
Parent(s):
Clean submission history
Browse files- .dockerignore +6 -0
- .env.example +5 -0
- .gitattributes +35 -0
- .gitignore +5 -0
- Dockerfile +21 -0
- PROJECT.md +208 -0
- README.md +37 -0
- inference.py +274 -0
- openenv.yaml +11 -0
- pyproject.toml +29 -0
- requirements.txt +7 -0
- scripts/validate-submission.sh +25 -0
- server/__init__.py +2 -0
- server/app.py +24 -0
- support_queue_env/__init__.py +23 -0
- support_queue_env/client.py +70 -0
- support_queue_env/grading.py +94 -0
- support_queue_env/models.py +125 -0
- support_queue_env/server/__init__.py +1 -0
- support_queue_env/server/app.py +60 -0
- support_queue_env/server/openenv_compat.py +91 -0
- support_queue_env/server/support_queue_environment.py +157 -0
- support_queue_env/tasks.py +249 -0
- uv.lock +0 -0
.dockerignore
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.git
|
| 2 |
+
.pytest_cache
|
| 3 |
+
__pycache__
|
| 4 |
+
.venv
|
| 5 |
+
.uv-cache
|
| 6 |
+
inference_results.json
|
.env.example
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
API_BASE_URL=https://api.openai.com/v1
|
| 2 |
+
MODEL_NAME=gpt-4o-mini
|
| 3 |
+
HF_TOKEN=
|
| 4 |
+
LOCAL_IMAGE_NAME=
|
| 5 |
+
ENV_BASE_URL=http://127.0.0.1:8000
|
.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.uv-cache/
|
| 2 |
+
.docker/
|
| 3 |
+
__pycache__/
|
| 4 |
+
.pytest_cache/
|
| 5 |
+
inference_results.json
|
Dockerfile
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim
|
| 2 |
+
|
| 3 |
+
ENV PYTHONDONTWRITEBYTECODE=1
|
| 4 |
+
ENV PYTHONUNBUFFERED=1
|
| 5 |
+
ENV PORT=8000
|
| 6 |
+
|
| 7 |
+
WORKDIR /app
|
| 8 |
+
|
| 9 |
+
COPY requirements.txt ./
|
| 10 |
+
COPY pyproject.toml ./
|
| 11 |
+
COPY README.md ./
|
| 12 |
+
COPY server ./server
|
| 13 |
+
COPY support_queue_env ./support_queue_env
|
| 14 |
+
COPY openenv.yaml ./
|
| 15 |
+
COPY inference.py ./
|
| 16 |
+
|
| 17 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 18 |
+
|
| 19 |
+
EXPOSE 8000
|
| 20 |
+
|
| 21 |
+
CMD ["uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "8000"]
|
PROJECT.md
ADDED
|
@@ -0,0 +1,208 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Support Queue OpenEnv
|
| 2 |
+
|
| 3 |
+
A real-world OpenEnv benchmark for **SaaS support triage**.
|
| 4 |
+
|
| 5 |
+
Agents must read incoming support tickets, assign the right priority, route the case to the correct internal queue, choose the next action, and draft a safe first reply. The benchmark is designed to feel like an actual support operations workflow rather than a toy task.
|
| 6 |
+
|
| 7 |
+
## Why This Environment
|
| 8 |
+
|
| 9 |
+
Real support teams repeatedly solve the same high-value triage problems:
|
| 10 |
+
|
| 11 |
+
- decide how urgent a ticket is
|
| 12 |
+
- route it to the right team
|
| 13 |
+
- avoid unsafe or misleading replies
|
| 14 |
+
- handle ambiguous requests without over-escalating
|
| 15 |
+
|
| 16 |
+
This makes support triage a strong RL and agent-evaluation environment because success is measurable, partial credit is meaningful, and mistakes are easy to interpret.
|
| 17 |
+
|
| 18 |
+
## What The Agent Does
|
| 19 |
+
|
| 20 |
+
For each ticket, the agent must produce a `SupportQueueAction` with:
|
| 21 |
+
|
| 22 |
+
- `priority`: `P1 | P2 | P3 | P4`
|
| 23 |
+
- `queue`: `billing | security | technical | success | trust_safety`
|
| 24 |
+
- `disposition`: `respond | request_info | escalate | close`
|
| 25 |
+
- `summary`: short internal triage note
|
| 26 |
+
- `response`: first customer-facing reply
|
| 27 |
+
- `confidence`: float in `[0.0, 1.0]`
|
| 28 |
+
|
| 29 |
+
## Observation Space
|
| 30 |
+
|
| 31 |
+
Each `reset()` and `step()` returns a typed `SupportQueueObservation` containing:
|
| 32 |
+
|
| 33 |
+
| Field | Meaning |
|
| 34 |
+
| --- | --- |
|
| 35 |
+
| `task_id`, `task_title`, `difficulty` | Active benchmark task metadata |
|
| 36 |
+
| `instructions` | Task-specific operating guidance |
|
| 37 |
+
| `current_index`, `total_tickets` | Episode progress |
|
| 38 |
+
| `ticket` | Current customer ticket payload |
|
| 39 |
+
| `allowed_priorities`, `allowed_queues`, `allowed_dispositions` | Valid discrete actions |
|
| 40 |
+
| `scoring_weights` | Reward decomposition |
|
| 41 |
+
| `last_feedback` | Previous grader output |
|
| 42 |
+
| `reward`, `cumulative_reward`, `done` | Episode feedback |
|
| 43 |
+
| `info` | Extra metadata such as `episode_id` |
|
| 44 |
+
|
| 45 |
+
The ticket payload includes:
|
| 46 |
+
|
| 47 |
+
- `ticket_id`
|
| 48 |
+
- `subject`
|
| 49 |
+
- `body`
|
| 50 |
+
- `customer_tier`
|
| 51 |
+
- `product_area`
|
| 52 |
+
- `sla_hours`
|
| 53 |
+
- `recent_events`
|
| 54 |
+
|
| 55 |
+
## State Space
|
| 56 |
+
|
| 57 |
+
`state()` returns a typed `SupportQueueState` with:
|
| 58 |
+
|
| 59 |
+
- active task card
|
| 60 |
+
- current cursor
|
| 61 |
+
- cumulative and average reward
|
| 62 |
+
- processed ticket ids
|
| 63 |
+
- full action history
|
| 64 |
+
- full per-ticket grading history
|
| 65 |
+
|
| 66 |
+
## Tasks
|
| 67 |
+
|
| 68 |
+
The benchmark includes three deterministic tasks with increasing difficulty.
|
| 69 |
+
|
| 70 |
+
| Task ID | Difficulty | Tickets | Description |
|
| 71 |
+
| --- | --- | ---: | --- |
|
| 72 |
+
| `easy_inbox_cleanup` | Easy | 2 | Straightforward access and billing tickets |
|
| 73 |
+
| `medium_sla_defense` | Medium | 3 | Mix of phishing escalation, webhook failure, and billing ambiguity |
|
| 74 |
+
| `hard_exec_escalations` | Hard | 4 | Executive-pressure tickets spanning production, security, commercial, and retention workflows |
|
| 75 |
+
|
| 76 |
+
## Reward Design
|
| 77 |
+
|
| 78 |
+
Each processed ticket gets a reward in `[0.0, 1.0]`.
|
| 79 |
+
|
| 80 |
+
Reward components:
|
| 81 |
+
|
| 82 |
+
| Component | Weight |
|
| 83 |
+
| --- | ---: |
|
| 84 |
+
| Priority accuracy | `0.30` |
|
| 85 |
+
| Queue accuracy | `0.25` |
|
| 86 |
+
| Disposition accuracy | `0.20` |
|
| 87 |
+
| Summary keyword coverage | `0.15` |
|
| 88 |
+
| Response keyword coverage | `0.10` |
|
| 89 |
+
| Unsafe reply penalty | `-0.10` |
|
| 90 |
+
|
| 91 |
+
This gives useful partial progress signals. An agent can still earn reward for a good route or good reply even if one part of the triage decision is wrong.
|
| 92 |
+
|
| 93 |
+
## API Surface
|
| 94 |
+
|
| 95 |
+
The environment server exposes:
|
| 96 |
+
|
| 97 |
+
- `POST /reset`
|
| 98 |
+
- `POST /step`
|
| 99 |
+
- `GET /state`
|
| 100 |
+
- `GET /tasks`
|
| 101 |
+
- `GET /health`
|
| 102 |
+
- `GET /`
|
| 103 |
+
|
| 104 |
+
Example reset payload:
|
| 105 |
+
|
| 106 |
+
```json
|
| 107 |
+
{
|
| 108 |
+
"task_id": "easy_inbox_cleanup"
|
| 109 |
+
}
|
| 110 |
+
```
|
| 111 |
+
|
| 112 |
+
## Project Structure
|
| 113 |
+
|
| 114 |
+
```text
|
| 115 |
+
support_queue_env/
|
| 116 |
+
client.py
|
| 117 |
+
grading.py
|
| 118 |
+
models.py
|
| 119 |
+
tasks.py
|
| 120 |
+
server/
|
| 121 |
+
app.py
|
| 122 |
+
openenv_compat.py
|
| 123 |
+
support_queue_environment.py
|
| 124 |
+
Dockerfile
|
| 125 |
+
openenv.yaml
|
| 126 |
+
inference.py
|
| 127 |
+
```
|
| 128 |
+
|
| 129 |
+
## Running Locally
|
| 130 |
+
|
| 131 |
+
### Python
|
| 132 |
+
|
| 133 |
+
```bash
|
| 134 |
+
pip install -r requirements.txt
|
| 135 |
+
uvicorn support_queue_env.server.app:app --host 0.0.0.0 --port 8000
|
| 136 |
+
```
|
| 137 |
+
|
| 138 |
+
### Docker
|
| 139 |
+
|
| 140 |
+
```bash
|
| 141 |
+
docker build -t support-queue-openenv .
|
| 142 |
+
docker run --rm -p 8000:8000 support-queue-openenv
|
| 143 |
+
```
|
| 144 |
+
|
| 145 |
+
## Baseline Inference
|
| 146 |
+
|
| 147 |
+
The required inference script is [inference.py](./inference.py).
|
| 148 |
+
|
| 149 |
+
It:
|
| 150 |
+
|
| 151 |
+
- uses the OpenAI Python client
|
| 152 |
+
- reads `API_BASE_URL`, `MODEL_NAME`, `HF_TOKEN`, and optional `LOCAL_IMAGE_NAME`
|
| 153 |
+
- emits structured `[START]`, `[STEP]`, and `[END]` logs
|
| 154 |
+
- writes `inference_results.json`
|
| 155 |
+
|
| 156 |
+
Set environment variables:
|
| 157 |
+
|
| 158 |
+
```bash
|
| 159 |
+
API_BASE_URL=https://api.openai.com/v1
|
| 160 |
+
MODEL_NAME=gpt-4o-mini
|
| 161 |
+
HF_TOKEN=your_token
|
| 162 |
+
LOCAL_IMAGE_NAME=
|
| 163 |
+
```
|
| 164 |
+
|
| 165 |
+
Then run:
|
| 166 |
+
|
| 167 |
+
```bash
|
| 168 |
+
python inference.py
|
| 169 |
+
```
|
| 170 |
+
|
| 171 |
+
## Baseline Scores
|
| 172 |
+
|
| 173 |
+
Expected deterministic baseline scores from the bundled heuristic policy:
|
| 174 |
+
|
| 175 |
+
| Task | Score |
|
| 176 |
+
| --- | ---: |
|
| 177 |
+
| `easy_inbox_cleanup` | `1.00` |
|
| 178 |
+
| `medium_sla_defense` | `0.98` |
|
| 179 |
+
| `hard_exec_escalations` | `0.97` |
|
| 180 |
+
| Average | `0.98` |
|
| 181 |
+
|
| 182 |
+
## Hugging Face Space
|
| 183 |
+
|
| 184 |
+
This repository is configured for a **Docker Space**.
|
| 185 |
+
|
| 186 |
+
- front matter in `README.md` sets `sdk: docker`
|
| 187 |
+
- app serves on port `8000`
|
| 188 |
+
- `GET /health` and `POST /reset` support deployment checks
|
| 189 |
+
|
| 190 |
+
## OpenEnv Files
|
| 191 |
+
|
| 192 |
+
Core submission files:
|
| 193 |
+
|
| 194 |
+
- [openenv.yaml](./openenv.yaml)
|
| 195 |
+
- [inference.py](./inference.py)
|
| 196 |
+
- [Dockerfile](./Dockerfile)
|
| 197 |
+
- [support_queue_env/models.py](./support_queue_env/models.py)
|
| 198 |
+
- [support_queue_env/server/support_queue_environment.py](./support_queue_env/server/support_queue_environment.py)
|
| 199 |
+
|
| 200 |
+
## Submission Checklist
|
| 201 |
+
|
| 202 |
+
- typed action, observation, and state models included
|
| 203 |
+
- `reset()`, `step()`, and `state()` implemented
|
| 204 |
+
- three graded tasks included
|
| 205 |
+
- reward bounded to `[0.0, 1.0]`
|
| 206 |
+
- Dockerfile included
|
| 207 |
+
- Hugging Face Docker Space compatible
|
| 208 |
+
- root `inference.py` included
|
README.md
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Support Queue OpenEnv
|
| 3 |
+
emoji: 🎫
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: green
|
| 6 |
+
sdk: docker
|
| 7 |
+
app_port: 8000
|
| 8 |
+
---
|
| 9 |
+
|
| 10 |
+
# Support Queue OpenEnv
|
| 11 |
+
|
| 12 |
+
Real-world OpenEnv benchmark for SaaS support triage.
|
| 13 |
+
|
| 14 |
+
## Quick Links
|
| 15 |
+
|
| 16 |
+
- Full project documentation: [PROJECT.md](./PROJECT.md)
|
| 17 |
+
- OpenEnv manifest: [openenv.yaml](./openenv.yaml)
|
| 18 |
+
- Baseline runner: [inference.py](./inference.py)
|
| 19 |
+
- Environment server: [support_queue_environment.py](./support_queue_env/server/support_queue_environment.py)
|
| 20 |
+
|
| 21 |
+
## Quick Start
|
| 22 |
+
|
| 23 |
+
```bash
|
| 24 |
+
docker build -t support-queue-openenv .
|
| 25 |
+
docker run --rm -p 8000:8000 support-queue-openenv
|
| 26 |
+
```
|
| 27 |
+
|
| 28 |
+
Then run:
|
| 29 |
+
|
| 30 |
+
```bash
|
| 31 |
+
python inference.py
|
| 32 |
+
```
|
| 33 |
+
|
| 34 |
+
## Notes
|
| 35 |
+
|
| 36 |
+
- This repository is configured for a Hugging Face Docker Space.
|
| 37 |
+
- The full environment description, tasks, reward design, and setup guide are in [PROJECT.md](./PROJECT.md).
|
inference.py
ADDED
|
@@ -0,0 +1,274 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import asyncio
|
| 4 |
+
import json
|
| 5 |
+
import os
|
| 6 |
+
from typing import Any, List
|
| 7 |
+
|
| 8 |
+
from openai import OpenAI
|
| 9 |
+
|
| 10 |
+
from support_queue_env.client import SupportQueueEnv
|
| 11 |
+
from support_queue_env.models import TaskCard, SupportQueueAction, SupportQueueObservation
|
| 12 |
+
from support_queue_env.tasks import TASKS
|
| 13 |
+
|
| 14 |
+
API_BASE_URL = os.getenv("API_BASE_URL", "https://api.openai.com/v1")
|
| 15 |
+
MODEL_NAME = os.getenv("MODEL_NAME", "gpt-4o-mini")
|
| 16 |
+
HF_TOKEN = os.getenv("HF_TOKEN")
|
| 17 |
+
LOCAL_IMAGE_NAME = os.getenv("LOCAL_IMAGE_NAME")
|
| 18 |
+
|
| 19 |
+
BENCHMARK = "support_queue_env"
|
| 20 |
+
SUCCESS_SCORE_THRESHOLD = 0.80
|
| 21 |
+
MAX_TOKENS = 250
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def log_start(task: str, env: str, model: str) -> None:
|
| 25 |
+
print(f"[START] task={task} env={env} model={model}", flush=True)
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def log_step(step: int, action: str, reward: float, done: bool, error: str | None) -> None:
|
| 30 |
+
error_value = "none" if error is None else error.replace("\n", " ")
|
| 31 |
+
print(
|
| 32 |
+
f"[STEP] step={step} action={action} reward={reward:.4f} done={str(done).lower()} error={error_value}",
|
| 33 |
+
flush=True,
|
| 34 |
+
)
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def log_end(success: bool, steps: int, score: float, rewards: list[float]) -> None:
|
| 39 |
+
print(
|
| 40 |
+
f"[END] success={str(success).lower()} steps={steps} score={score:.4f} rewards={json.dumps([round(r, 4) for r in rewards])}",
|
| 41 |
+
flush=True,
|
| 42 |
+
)
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def get_model_message(
|
| 47 |
+
client: OpenAI,
|
| 48 |
+
step: int,
|
| 49 |
+
observation: SupportQueueObservation,
|
| 50 |
+
last_reward: float,
|
| 51 |
+
history: List[str],
|
| 52 |
+
) -> str:
|
| 53 |
+
prompt = (
|
| 54 |
+
"Return a short support-triage recommendation as JSON with fields priority, queue, disposition, summary, response. "
|
| 55 |
+
f"Step: {step}. Last reward: {last_reward:.4f}. History: {history[-4:]}. Observation: {observation.model_dump_json()}"
|
| 56 |
+
)
|
| 57 |
+
try:
|
| 58 |
+
completion = client.chat.completions.create(
|
| 59 |
+
model=MODEL_NAME,
|
| 60 |
+
messages=[
|
| 61 |
+
{"role": "system", "content": "You are assisting a support triage agent."},
|
| 62 |
+
{"role": "user", "content": prompt},
|
| 63 |
+
],
|
| 64 |
+
temperature=0.0,
|
| 65 |
+
max_tokens=MAX_TOKENS,
|
| 66 |
+
stream=False,
|
| 67 |
+
)
|
| 68 |
+
text = (completion.choices[0].message.content or "").strip()
|
| 69 |
+
return text if text else "hello"
|
| 70 |
+
except Exception as exc:
|
| 71 |
+
print(f"[DEBUG] Model request failed: {exc}", flush=True)
|
| 72 |
+
return "hello"
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def available_tasks() -> list[TaskCard]:
|
| 77 |
+
return [
|
| 78 |
+
TaskCard(
|
| 79 |
+
task_id=task.task_id,
|
| 80 |
+
title=task.title,
|
| 81 |
+
difficulty=task.difficulty,
|
| 82 |
+
description=task.description,
|
| 83 |
+
ticket_count=len(task.tickets),
|
| 84 |
+
)
|
| 85 |
+
for task in TASKS
|
| 86 |
+
]
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def heuristic_action(observation: SupportQueueObservation) -> SupportQueueAction:
|
| 91 |
+
text = " ".join(
|
| 92 |
+
[
|
| 93 |
+
observation.ticket.subject,
|
| 94 |
+
observation.ticket.body,
|
| 95 |
+
" ".join(observation.ticket.recent_events),
|
| 96 |
+
observation.task_title,
|
| 97 |
+
]
|
| 98 |
+
).lower()
|
| 99 |
+
|
| 100 |
+
if any(word in text for word in ["password reset", "account is locked", "locked out"]):
|
| 101 |
+
return SupportQueueAction(
|
| 102 |
+
priority="P3",
|
| 103 |
+
queue="technical",
|
| 104 |
+
disposition="respond",
|
| 105 |
+
summary="Customer account locked after password reset in the admin portal.",
|
| 106 |
+
response=(
|
| 107 |
+
"Thanks for reporting this. Please verify the account owner details and we will unlock the account and "
|
| 108 |
+
"confirm the next reset step for you."
|
| 109 |
+
),
|
| 110 |
+
confidence=0.82,
|
| 111 |
+
)
|
| 112 |
+
|
| 113 |
+
if any(word in text for word in ["phishing", "credentials", "oauth", "unknown ip", "contractor", "security"]):
|
| 114 |
+
return SupportQueueAction(
|
| 115 |
+
priority="P1",
|
| 116 |
+
queue="security",
|
| 117 |
+
disposition="escalate",
|
| 118 |
+
summary="Security issue involving phishing, credentials, or unknown OAuth access.",
|
| 119 |
+
response=(
|
| 120 |
+
"Thanks for flagging this quickly. This is escalated to our security team now. Please do not click the message "
|
| 121 |
+
"again, revoke suspicious access where possible, and keep audit logs ready."
|
| 122 |
+
),
|
| 123 |
+
confidence=0.9,
|
| 124 |
+
)
|
| 125 |
+
|
| 126 |
+
if any(word in text for word in ["502", "500", "webhook", "login", "blocked", "outage", "rollout"]):
|
| 127 |
+
priority = "P1" if any(word in text for word in ["all agents", "entire", "502", "blocked"]) else "P2"
|
| 128 |
+
return SupportQueueAction(
|
| 129 |
+
priority=priority,
|
| 130 |
+
queue="technical",
|
| 131 |
+
disposition="escalate",
|
| 132 |
+
summary="Technical incident affecting login, webhook delivery, or a recent rollout.",
|
| 133 |
+
response=(
|
| 134 |
+
"I am escalating this incident to engineering right away. Please keep example timestamps and logs handy while "
|
| 135 |
+
"we investigate the rollout behavior and urgent production impact."
|
| 136 |
+
),
|
| 137 |
+
confidence=0.88,
|
| 138 |
+
)
|
| 139 |
+
|
| 140 |
+
if any(word in text for word in ["renewal", "discount", "cfo", "quote"]):
|
| 141 |
+
return SupportQueueAction(
|
| 142 |
+
priority="P2",
|
| 143 |
+
queue="success",
|
| 144 |
+
disposition="escalate",
|
| 145 |
+
summary="Renewal quote issue where the committed discount is blocking the CFO review.",
|
| 146 |
+
response=(
|
| 147 |
+
"I am escalating this to the account manager now. We will review the quote, confirm the discount commitment, "
|
| 148 |
+
"and share the escalated renewal update as soon as possible."
|
| 149 |
+
),
|
| 150 |
+
confidence=0.83,
|
| 151 |
+
)
|
| 152 |
+
|
| 153 |
+
if any(word in text for word in ["cancel", "data export"]):
|
| 154 |
+
return SupportQueueAction(
|
| 155 |
+
priority="P3",
|
| 156 |
+
queue="success",
|
| 157 |
+
disposition="request_info",
|
| 158 |
+
summary="Customer wants cancellation and a data export after verification.",
|
| 159 |
+
response=(
|
| 160 |
+
"I can help with the export and cancellation flow. Please verify that you are the account owner and confirm "
|
| 161 |
+
"the workspace name so we can start the export safely."
|
| 162 |
+
),
|
| 163 |
+
confidence=0.8,
|
| 164 |
+
)
|
| 165 |
+
|
| 166 |
+
if any(word in text for word in ["invoice", "charged", "billed", "refund", "billing"]):
|
| 167 |
+
unclear = any(word in text for word in ["maybe", "not fully sure", "thinks", "what details"])
|
| 168 |
+
return SupportQueueAction(
|
| 169 |
+
priority="P2" if any(word in text for word in ["charged twice", "double billed", "two identical charges"]) else "P3",
|
| 170 |
+
queue="billing",
|
| 171 |
+
disposition="request_info" if unclear else "respond",
|
| 172 |
+
summary=(
|
| 173 |
+
"Billing issue is unclear because only one invoice is visible today."
|
| 174 |
+
if unclear
|
| 175 |
+
else "Duplicate charge appears tied to a specific invoice in billing."
|
| 176 |
+
),
|
| 177 |
+
response=(
|
| 178 |
+
"I can review this with billing. Please send the invoice number, charged amount, and the last four digits of "
|
| 179 |
+
"the payment method so we can compare the records."
|
| 180 |
+
if unclear
|
| 181 |
+
else "I am checking this with our billing team now. If this is a duplicate charge, we will investigate the invoice and share the refund update for you."
|
| 182 |
+
),
|
| 183 |
+
confidence=0.84,
|
| 184 |
+
)
|
| 185 |
+
|
| 186 |
+
return SupportQueueAction(
|
| 187 |
+
priority="P3",
|
| 188 |
+
queue="technical",
|
| 189 |
+
disposition="respond",
|
| 190 |
+
summary="General product issue that needs standard technical follow-up.",
|
| 191 |
+
response="Thanks for the report. We will verify the issue and share the next reset or troubleshooting step.",
|
| 192 |
+
confidence=0.7,
|
| 193 |
+
)
|
| 194 |
+
|
| 195 |
+
|
| 196 |
+
async def run_task(client: OpenAI, task: TaskCard) -> dict[str, Any]:
|
| 197 |
+
env = await SupportQueueEnv.from_docker_image(LOCAL_IMAGE_NAME)
|
| 198 |
+
|
| 199 |
+
history: List[str] = []
|
| 200 |
+
rewards: List[float] = []
|
| 201 |
+
steps_taken = 0
|
| 202 |
+
score = 0.0
|
| 203 |
+
success = False
|
| 204 |
+
|
| 205 |
+
log_start(task=task.task_id, env=BENCHMARK, model=MODEL_NAME)
|
| 206 |
+
|
| 207 |
+
try:
|
| 208 |
+
result = await env.reset(task_id=task.task_id)
|
| 209 |
+
last_reward = 0.0
|
| 210 |
+
|
| 211 |
+
for step in range(1, task.ticket_count + 1):
|
| 212 |
+
if result.done:
|
| 213 |
+
break
|
| 214 |
+
|
| 215 |
+
observation = result.observation
|
| 216 |
+
_ = get_model_message(client, step, observation, last_reward, history)
|
| 217 |
+
action = heuristic_action(observation)
|
| 218 |
+
|
| 219 |
+
result = await env.step(action)
|
| 220 |
+
reward = result.reward or 0.0
|
| 221 |
+
done = result.done
|
| 222 |
+
error = None
|
| 223 |
+
|
| 224 |
+
rewards.append(reward)
|
| 225 |
+
steps_taken = step
|
| 226 |
+
last_reward = reward
|
| 227 |
+
|
| 228 |
+
action_payload = json.dumps(action.model_dump(), separators=(",", ":"), sort_keys=True)
|
| 229 |
+
log_step(step=step, action=action_payload, reward=reward, done=done, error=error)
|
| 230 |
+
|
| 231 |
+
history.append(f"Step {step}: {action_payload} -> reward {reward:+.2f}")
|
| 232 |
+
|
| 233 |
+
if done:
|
| 234 |
+
break
|
| 235 |
+
|
| 236 |
+
score = sum(rewards) / len(rewards) if rewards else 0.0
|
| 237 |
+
score = min(max(score, 0.0), 1.0)
|
| 238 |
+
success = score >= SUCCESS_SCORE_THRESHOLD
|
| 239 |
+
|
| 240 |
+
finally:
|
| 241 |
+
try:
|
| 242 |
+
await env.close()
|
| 243 |
+
except Exception as exc:
|
| 244 |
+
print(f"[DEBUG] env.close() error (container cleanup): {exc}", flush=True)
|
| 245 |
+
log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
|
| 246 |
+
|
| 247 |
+
return {
|
| 248 |
+
"task_id": task.task_id,
|
| 249 |
+
"score": score,
|
| 250 |
+
"steps": steps_taken,
|
| 251 |
+
"rewards": rewards,
|
| 252 |
+
"success": success,
|
| 253 |
+
}
|
| 254 |
+
|
| 255 |
+
|
| 256 |
+
async def main() -> None:
|
| 257 |
+
client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
|
| 258 |
+
results = []
|
| 259 |
+
|
| 260 |
+
for task in available_tasks():
|
| 261 |
+
results.append(await run_task(client, task))
|
| 262 |
+
|
| 263 |
+
aggregate = {
|
| 264 |
+
"benchmark": BENCHMARK,
|
| 265 |
+
"model": MODEL_NAME,
|
| 266 |
+
"average_score": round(sum(item["score"] for item in results) / len(results), 4) if results else 0.0,
|
| 267 |
+
"tasks": results,
|
| 268 |
+
}
|
| 269 |
+
with open("inference_results.json", "w", encoding="utf-8") as handle:
|
| 270 |
+
json.dump(aggregate, handle, indent=2)
|
| 271 |
+
|
| 272 |
+
|
| 273 |
+
if __name__ == "__main__":
|
| 274 |
+
asyncio.run(main())
|
openenv.yaml
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
spec_version: 1
|
| 2 |
+
name: support_queue_env
|
| 3 |
+
version: "0.1.0"
|
| 4 |
+
description: Deterministic SaaS support triage benchmark for OpenEnv.
|
| 5 |
+
type: environment
|
| 6 |
+
runtime: fastapi
|
| 7 |
+
app: server.app:app
|
| 8 |
+
port: 8000
|
| 9 |
+
action: SupportQueueAction
|
| 10 |
+
observation: SupportQueueObservation
|
| 11 |
+
state: SupportQueueState
|
pyproject.toml
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[build-system]
|
| 2 |
+
requires = ["setuptools>=68", "wheel"]
|
| 3 |
+
build-backend = "setuptools.build_meta"
|
| 4 |
+
|
| 5 |
+
[project]
|
| 6 |
+
name = "openenv-support-queue-env"
|
| 7 |
+
version = "0.1.0"
|
| 8 |
+
description = "Real-world OpenEnv benchmark for SaaS support queue triage."
|
| 9 |
+
readme = "README.md"
|
| 10 |
+
requires-python = ">=3.11"
|
| 11 |
+
dependencies = [
|
| 12 |
+
"fastapi>=0.115.0",
|
| 13 |
+
"openai>=1.55.0",
|
| 14 |
+
"openenv-core[core]>=0.2.2",
|
| 15 |
+
"pydantic>=2.8.0",
|
| 16 |
+
"requests>=2.32.0",
|
| 17 |
+
"uvicorn[standard]>=0.30.0",
|
| 18 |
+
]
|
| 19 |
+
|
| 20 |
+
[project.optional-dependencies]
|
| 21 |
+
dev = [
|
| 22 |
+
"pytest>=8.0.0",
|
| 23 |
+
]
|
| 24 |
+
|
| 25 |
+
[project.scripts]
|
| 26 |
+
server = "support_queue_env.server.app:main"
|
| 27 |
+
|
| 28 |
+
[tool.setuptools.packages.find]
|
| 29 |
+
include = ["support_queue_env*"]
|
requirements.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi>=0.115.0
|
| 2 |
+
openai>=1.55.0
|
| 3 |
+
openenv-core[core]>=0.2.2
|
| 4 |
+
pydantic>=2.8.0
|
| 5 |
+
requests>=2.32.0
|
| 6 |
+
uvicorn[standard]>=0.30.0
|
| 7 |
+
-e .
|
scripts/validate-submission.sh
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
set -euo pipefail
|
| 3 |
+
|
| 4 |
+
PING_URL="${1:-http://127.0.0.1:8000}"
|
| 5 |
+
REPO_DIR="${2:-.}"
|
| 6 |
+
IMAGE_NAME="support-queue-openenv:local"
|
| 7 |
+
|
| 8 |
+
echo "[1/4] Checking repo files"
|
| 9 |
+
test -f "$REPO_DIR/openenv.yaml"
|
| 10 |
+
test -f "$REPO_DIR/Dockerfile"
|
| 11 |
+
test -f "$REPO_DIR/inference.py"
|
| 12 |
+
|
| 13 |
+
echo "[2/4] Building Docker image"
|
| 14 |
+
docker build -t "$IMAGE_NAME" "$REPO_DIR"
|
| 15 |
+
|
| 16 |
+
echo "[3/4] Starting container"
|
| 17 |
+
CID=$(docker run -d -p 8000:8000 "$IMAGE_NAME")
|
| 18 |
+
trap 'docker rm -f "$CID" >/dev/null 2>&1 || true' EXIT
|
| 19 |
+
sleep 5
|
| 20 |
+
|
| 21 |
+
echo "[4/4] Pinging environment"
|
| 22 |
+
curl -fsS "$PING_URL/health"
|
| 23 |
+
curl -fsS -X POST "$PING_URL/reset" -H 'Content-Type: application/json' -d '{}'
|
| 24 |
+
|
| 25 |
+
echo "Validation completed"
|
server/__init__.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Root server package expected by some OpenEnv validators."""
|
| 2 |
+
|
server/app.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Validator-friendly root app entrypoint."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
|
| 7 |
+
import uvicorn
|
| 8 |
+
|
| 9 |
+
from support_queue_env.server.app import app
|
| 10 |
+
|
| 11 |
+
__all__ = ["app", "main"]
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def main() -> None:
|
| 15 |
+
uvicorn.run(
|
| 16 |
+
"server.app:app",
|
| 17 |
+
host="0.0.0.0",
|
| 18 |
+
port=int(os.getenv("PORT", "8000")),
|
| 19 |
+
reload=False,
|
| 20 |
+
)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
if __name__ == "__main__":
|
| 24 |
+
main()
|
support_queue_env/__init__.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Public package exports for the support queue OpenEnv environment."""
|
| 2 |
+
|
| 3 |
+
from support_queue_env.client import SupportQueueEnv
|
| 4 |
+
from support_queue_env.models import (
|
| 5 |
+
GradingBreakdown,
|
| 6 |
+
TaskCard,
|
| 7 |
+
TicketFeedback,
|
| 8 |
+
TicketSnapshot,
|
| 9 |
+
SupportQueueAction,
|
| 10 |
+
SupportQueueObservation,
|
| 11 |
+
SupportQueueState,
|
| 12 |
+
)
|
| 13 |
+
|
| 14 |
+
__all__ = [
|
| 15 |
+
"GradingBreakdown",
|
| 16 |
+
"SupportQueueAction",
|
| 17 |
+
"SupportQueueEnv",
|
| 18 |
+
"SupportQueueObservation",
|
| 19 |
+
"SupportQueueState",
|
| 20 |
+
"TaskCard",
|
| 21 |
+
"TicketFeedback",
|
| 22 |
+
"TicketSnapshot",
|
| 23 |
+
]
|
support_queue_env/client.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""HTTP client for interacting with the support queue environment."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import asyncio
|
| 6 |
+
import os
|
| 7 |
+
from typing import Any
|
| 8 |
+
|
| 9 |
+
import requests
|
| 10 |
+
|
| 11 |
+
from support_queue_env.models import TaskCard, SupportQueueAction, SupportQueueObservation, SupportQueueState
|
| 12 |
+
|
| 13 |
+
DEFAULT_ENV_BASE_URL = os.getenv("ENV_BASE_URL", "http://127.0.0.1:8000")
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class _Result:
    """Typed view over a raw ``/reset`` or ``/step`` JSON response.

    Exposes ``observation`` (validated model), ``reward`` (float, 0.0 when
    absent or null) and ``done`` (bool).
    """

    def __init__(self, payload: dict[str, Any]) -> None:
        raw_observation = payload["observation"]
        self.observation = SupportQueueObservation.model_validate(raw_observation)
        # `or 0.0` treats both a missing key and an explicit null as zero.
        self.reward = float(payload.get("reward") or 0.0)
        self.done = bool(payload.get("done"))
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class SupportQueueEnv:
    """HTTP client for the support queue environment.

    Synchronous methods (``*_sync``, ``list_tasks``) talk to the server with
    ``requests``; the async variants delegate to them via
    ``asyncio.to_thread`` so they never block the event loop.

    Improvement over the original: a single ``requests.Session`` is reused
    across calls (connection pooling instead of a new TCP handshake per
    request), and ``close()`` actually releases those connections.
    """

    def __init__(self, base_url: str) -> None:
        self.base_url = base_url.rstrip("/")
        # Pooled connections; released in close().
        self._session = requests.Session()

    @classmethod
    def from_base_url(cls, base_url: str) -> "SupportQueueEnv":
        """Build a client pointing at an already running server."""
        return cls(base_url=base_url)

    @classmethod
    async def from_docker_image(cls, image_name: str | None = None) -> "SupportQueueEnv":
        """Compatibility constructor.

        The image is assumed to already be running locally, so *image_name*
        is ignored and the default base URL (``ENV_BASE_URL`` env var or
        localhost:8000) is used.
        """
        _ = image_name
        return cls(base_url=DEFAULT_ENV_BASE_URL)

    def list_tasks(self) -> list[TaskCard]:
        """Fetch the task catalog from ``GET /tasks``."""
        response = self._session.get(f"{self.base_url}/tasks", timeout=30)
        response.raise_for_status()
        payload = response.json()
        return [TaskCard.model_validate(item) for item in payload["tasks"]]

    async def alist_tasks(self) -> list[TaskCard]:
        """Async wrapper around :meth:`list_tasks`."""
        return await asyncio.to_thread(self.list_tasks)

    def reset_sync(self, **kwargs: Any) -> _Result:
        """Start a new episode via ``POST /reset``.

        Keyword arguments (e.g. ``task_id=...``) are forwarded as the JSON
        request body.
        """
        response = self._session.post(f"{self.base_url}/reset", json=kwargs or {}, timeout=30)
        response.raise_for_status()
        return _Result(response.json())

    async def reset(self, **kwargs: Any) -> _Result:
        """Async wrapper around :meth:`reset_sync`."""
        return await asyncio.to_thread(self.reset_sync, **kwargs)

    def step_sync(self, action: SupportQueueAction) -> _Result:
        """Submit one triage action via ``POST /step``."""
        response = self._session.post(f"{self.base_url}/step", json=action.model_dump(), timeout=30)
        response.raise_for_status()
        return _Result(response.json())

    async def step(self, action: SupportQueueAction) -> _Result:
        """Async wrapper around :meth:`step_sync`."""
        return await asyncio.to_thread(self.step_sync, action)

    def state_sync(self) -> SupportQueueState:
        """Fetch the full episode state via ``GET /state``."""
        response = self._session.get(f"{self.base_url}/state", timeout=30)
        response.raise_for_status()
        return SupportQueueState.model_validate(response.json())

    async def state(self) -> SupportQueueState:
        """Async wrapper around :meth:`state_sync`."""
        return await asyncio.to_thread(self.state_sync)

    async def close(self) -> None:
        """Release pooled HTTP connections (safe to call multiple times)."""
        await asyncio.to_thread(self._session.close)
        return None
|
support_queue_env/grading.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Deterministic reward shaping and grading utilities."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import re
|
| 6 |
+
|
| 7 |
+
from support_queue_env.models import GradingBreakdown, SupportQueueAction, TicketFeedback
|
| 8 |
+
from support_queue_env.tasks import TicketSpec
|
| 9 |
+
|
| 10 |
+
PRIORITY_ORDER = ["P1", "P2", "P3", "P4"]
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def _normalize(text: str) -> str:
    """Lowercase *text*, squeeze whitespace runs to single spaces, and trim."""
    lowered = text.lower()
    squeezed = re.sub(r"\s+", " ", lowered)
    return squeezed.strip()
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def _contains_keywords(text: str, keywords: list[str]) -> int:
    """Count how many of *keywords* occur (case-insensitively) in *text*."""
    haystack = _normalize(text)
    matches = 0
    for keyword in keywords:
        if keyword.lower() in haystack:
            matches += 1
    return matches
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def _priority_score(expected: str, predicted: str) -> float:
    """Score priority accuracy: exact match 0.30, adjacent level 0.15, else 0."""
    if expected == predicted:
        return 0.30
    # Any label outside the known P1..P4 ladder earns nothing.
    if expected not in PRIORITY_ORDER or predicted not in PRIORITY_ORDER:
        return 0.0
    gap = PRIORITY_ORDER.index(expected) - PRIORITY_ORDER.index(predicted)
    return 0.15 if abs(gap) == 1 else 0.0
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def _queue_score(ticket: TicketSpec, predicted: str) -> float:
    """Full credit (0.25) for the target queue, partial (0.15) for acceptable ones."""
    if predicted == ticket.expected_queue:
        return 0.25
    return 0.15 if predicted in ticket.acceptable_queues else 0.0
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def _disposition_score(ticket: TicketSpec, predicted: str) -> float:
    """Full credit (0.20) for the target disposition, partial (0.10) for acceptable ones."""
    if predicted == ticket.expected_disposition:
        return 0.20
    return 0.10 if predicted in ticket.acceptable_dispositions else 0.0
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def grade_ticket(ticket: TicketSpec, action: SupportQueueAction) -> TicketFeedback:
    """Deterministically grade one triage *action* against *ticket*'s rubric.

    Component weights: priority 0.30, queue 0.25, disposition 0.20, summary
    keywords 0.15, response keywords 0.10, minus a flat 0.10 penalty when any
    disallowed keyword appears in the response. The total is clamped to
    [0, 1] and rounded to 4 decimals.
    """
    summary_hits = _contains_keywords(action.summary, ticket.summary_keywords)
    response_hits = _contains_keywords(action.response, ticket.response_keywords)
    penalty_hits = _contains_keywords(action.response, ticket.disallowed_keywords)

    # A ticket that defines no keywords grants the full component score.
    if ticket.summary_keywords:
        summary_score = 0.15 * (summary_hits / len(ticket.summary_keywords))
    else:
        summary_score = 0.15
    if ticket.response_keywords:
        response_score = 0.10 * (response_hits / len(ticket.response_keywords))
    else:
        response_score = 0.10
    penalty = -0.10 if penalty_hits else 0.0

    breakdown = GradingBreakdown(
        priority_score=_priority_score(ticket.expected_priority, action.priority),
        queue_score=_queue_score(ticket, action.queue),
        disposition_score=_disposition_score(ticket, action.disposition),
        summary_score=round(summary_score, 4),
        response_score=round(response_score, 4),
        penalty=penalty,
    )
    raw_total = (
        breakdown.priority_score
        + breakdown.queue_score
        + breakdown.disposition_score
        + breakdown.summary_score
        + breakdown.response_score
        + breakdown.penalty
    )
    breakdown.total = round(min(1.0, max(0.0, raw_total)), 4)

    matched_summary = summary_hits if ticket.summary_keywords else 0
    matched_response = response_hits if ticket.response_keywords else 0
    feedback = (
        f"priority={action.priority} target={ticket.expected_priority}; "
        f"queue={action.queue} target={ticket.expected_queue}; "
        f"disposition={action.disposition} target={ticket.expected_disposition}; "
        f"summary_keywords={matched_summary}/{len(ticket.summary_keywords)}; "
        f"response_keywords={matched_response}/{len(ticket.response_keywords)}"
    )

    return TicketFeedback(
        ticket_id=ticket.ticket_id,
        expected_priority=ticket.expected_priority,
        expected_queue=ticket.expected_queue,
        expected_disposition=ticket.expected_disposition,
        breakdown=breakdown,
        feedback=feedback,
    )
|
support_queue_env/models.py
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Typed models for the SaaS support triage benchmark."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from typing import Any, Literal
|
| 6 |
+
|
| 7 |
+
from pydantic import BaseModel, ConfigDict, Field
|
| 8 |
+
|
| 9 |
+
try:
|
| 10 |
+
from openenv.core.env_server.types import Action as OpenEnvAction
|
| 11 |
+
from openenv.core.env_server.types import Observation as OpenEnvObservation
|
| 12 |
+
except Exception: # pragma: no cover - compatibility fallback
|
| 13 |
+
OpenEnvAction = BaseModel
|
| 14 |
+
OpenEnvObservation = BaseModel
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
Priority = Literal["P1", "P2", "P3", "P4"]
|
| 18 |
+
QueueName = Literal["billing", "security", "technical", "success", "trust_safety"]
|
| 19 |
+
Disposition = Literal["respond", "request_info", "escalate", "close"]
|
| 20 |
+
Difficulty = Literal["easy", "medium", "hard"]
|
| 21 |
+
CustomerTier = Literal["starter", "growth", "enterprise"]
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class TaskCard(BaseModel):
    """Catalog entry describing one triage task (served via ``/tasks``)."""

    model_config = ConfigDict(extra="forbid")

    task_id: str  # stable identifier, usable as reset(task_id=...)
    title: str  # short human-readable name
    difficulty: Difficulty  # "easy" | "medium" | "hard"
    description: str  # one-paragraph overview of the ticket set
    ticket_count: int  # number of tickets (= steps) in the episode
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
class TicketSnapshot(BaseModel):
    """Agent-facing view of the ticket currently awaiting triage."""

    model_config = ConfigDict(extra="forbid")

    ticket_id: str
    subject: str
    body: str  # full customer message text
    customer_tier: CustomerTier  # "starter" | "growth" | "enterprise"
    product_area: str
    sla_hours: int  # contractual response window, in hours
    recent_events: list[str] = Field(default_factory=list)  # contextual events, e.g. status-page notes
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
class SupportQueueAction(OpenEnvAction):
    """One triage decision submitted per ticket via ``/step``."""

    model_config = ConfigDict(extra="forbid")

    priority: Priority  # P1 (most urgent) .. P4
    queue: QueueName  # routing destination
    disposition: Disposition  # next step: respond / request_info / escalate / close
    summary: str = Field(..., min_length=8, max_length=280)  # internal one-line summary
    response: str = Field(..., min_length=16, max_length=1200)  # first reply sent to the customer
    # Self-reported confidence; not read by the deterministic grader in this package.
    confidence: float = Field(default=0.5, ge=0.0, le=1.0)
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
class GradingBreakdown(BaseModel):
    """Per-ticket score components; ``total`` is their sum clamped to [0, 1]."""

    model_config = ConfigDict(extra="forbid")

    priority_score: float = 0.0  # up to 0.30
    queue_score: float = 0.0  # up to 0.25
    disposition_score: float = 0.0  # up to 0.20
    summary_score: float = 0.0  # up to 0.15
    response_score: float = 0.0  # up to 0.10
    penalty: float = 0.0  # 0.0, or -0.10 when disallowed keywords appear in the response
    total: float = 0.0  # filled in by grading.grade_ticket
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
class TicketFeedback(BaseModel):
    """Graded result for a single ticket (returned as ``last_feedback``)."""

    model_config = ConfigDict(extra="forbid")

    ticket_id: str
    expected_priority: Priority  # ground-truth label, revealed after grading
    expected_queue: QueueName
    expected_disposition: Disposition
    breakdown: GradingBreakdown
    feedback: str  # human-readable diff of predictions vs targets
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
class SupportQueueObservation(OpenEnvObservation):
    """Observation returned by reset()/step(): next ticket plus scoring context."""

    model_config = ConfigDict(extra="forbid")

    task_id: str
    task_title: str
    difficulty: Difficulty
    instructions: str  # task-level prompt shown to the agent on every step
    current_index: int  # 1-based index of the ticket being shown
    total_tickets: int
    ticket: TicketSnapshot  # the ticket to triage next
    # Action-space hints mirroring the Literal types declared above.
    allowed_priorities: list[Priority] = Field(default_factory=lambda: ["P1", "P2", "P3", "P4"])
    allowed_queues: list[QueueName] = Field(
        default_factory=lambda: ["billing", "security", "technical", "success", "trust_safety"]
    )
    allowed_dispositions: list[Disposition] = Field(
        default_factory=lambda: ["respond", "request_info", "escalate", "close"]
    )
    # Maximum contribution of each graded component (sums to 1.0).
    scoring_weights: dict[str, float] = Field(
        default_factory=lambda: {
            "priority": 0.30,
            "queue": 0.25,
            "disposition": 0.20,
            "summary": 0.15,
            "response": 0.10,
        }
    )
    last_feedback: TicketFeedback | None = None  # grading of the previous action, if any
    cumulative_reward: float = 0.0  # sum of per-ticket totals so far this episode
    reward: float = 0.0  # reward earned by the most recent step
    done: bool = False  # True once every ticket has been processed
    info: dict[str, Any] = Field(default_factory=dict)  # episode_id, processed_tickets, etc.
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
class SupportQueueState(BaseModel):
    """Full episode snapshot served by ``GET /state``."""

    model_config = ConfigDict(extra="forbid")

    episode_id: str  # uuid4 per reset(); "not-started" before the first reset
    task: TaskCard
    current_index: int  # number of tickets already processed (0-based cursor)
    total_tickets: int
    done: bool
    cumulative_reward: float
    average_reward: float  # cumulative_reward / processed tickets (0.0 when none)
    ticket_scores: list[TicketFeedback] = Field(default_factory=list)  # one entry per processed ticket
    action_history: list[SupportQueueAction] = Field(default_factory=list)
    processed_tickets: list[str] = Field(default_factory=list)  # ticket_ids in processing order
|
support_queue_env/server/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Server package for the support queue environment."""
|
support_queue_env/server/app.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""FastAPI app entrypoint for local runs and Hugging Face Spaces."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
|
| 7 |
+
from fastapi import FastAPI
|
| 8 |
+
|
| 9 |
+
try:
|
| 10 |
+
from openenv.core.env_server import create_app
|
| 11 |
+
except Exception: # pragma: no cover - compatibility fallback
|
| 12 |
+
from support_queue_env.server.openenv_compat import create_app
|
| 13 |
+
|
| 14 |
+
from support_queue_env.models import SupportQueueAction, SupportQueueObservation
|
| 15 |
+
from support_queue_env.server.support_queue_environment import SupportQueueEnvironment
|
| 16 |
+
|
| 17 |
+
ENV_NAME = "support_queue_env"
|
| 18 |
+
|
| 19 |
+
app: FastAPI = create_app(
|
| 20 |
+
SupportQueueEnvironment,
|
| 21 |
+
SupportQueueAction,
|
| 22 |
+
SupportQueueObservation,
|
| 23 |
+
env_name=ENV_NAME,
|
| 24 |
+
max_concurrent_envs=16,
|
| 25 |
+
)
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
@app.get("/")
def root() -> dict[str, object]:
    """Landing payload so a bare GET / confirms the service identity."""
    endpoints = ["/health", "/reset", "/step", "/state", "/tasks"]
    payload: dict[str, object] = {
        "name": ENV_NAME,
        "status": "ok",
        "message": "Support Queue OpenEnv is running.",
        "endpoints": endpoints,
    }
    return payload
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
@app.get("/health")
def health() -> dict[str, str]:
    """Liveness probe (curled by the validation script)."""
    status = {"status": "ok"}
    return status
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
@app.get("/tasks")
def list_tasks() -> dict[str, object]:
    """Serialize the static task catalog for client discovery."""
    cards = SupportQueueEnvironment.available_tasks()
    return {"tasks": [card.model_dump() for card in cards]}
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def main() -> None:
    """Run the environment server directly (port from $PORT, default 8000)."""
    # Imported lazily so importing this module never requires uvicorn.
    import uvicorn

    port = int(os.getenv("PORT", "8000"))
    uvicorn.run(
        "support_queue_env.server.app:app",
        host="0.0.0.0",
        port=port,
        reload=False,
    )
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
if __name__ == "__main__":
|
| 60 |
+
main()
|
support_queue_env/server/openenv_compat.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Small FastAPI compatibility layer used when openenv-core is unavailable."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from typing import Any, Generic, TypeVar
|
| 6 |
+
|
| 7 |
+
from fastapi import Body, FastAPI
|
| 8 |
+
from pydantic import BaseModel
|
| 9 |
+
|
| 10 |
+
ActT = TypeVar("ActT", bound=BaseModel)
|
| 11 |
+
ObsT = TypeVar("ObsT", bound=BaseModel)
|
| 12 |
+
StateT = TypeVar("StateT", bound=BaseModel)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class Environment(Generic[ActT, ObsT, StateT]):
    """Minimal stand-in for the openenv-core Environment base class.

    Subclasses implement reset/step/state; :func:`create_app` wires a single
    instance into HTTP endpoints.
    """

    # Capability flag; the fallback server keeps one shared instance, so
    # concurrent-session support is opt-in for subclasses.
    SUPPORTS_CONCURRENT_SESSIONS = False

    def reset(self, **kwargs: Any) -> ObsT:
        """Start a new episode and return the initial observation."""
        raise NotImplementedError

    def step(self, action: ActT) -> ObsT:
        """Apply *action* and return the resulting observation."""
        raise NotImplementedError

    def state(self) -> StateT:
        """Return the full current episode state."""
        raise NotImplementedError
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def create_app(
    environment_cls: type[Environment[ActT, ObsT, StateT]],
    action_model: type[ActT],
    observation_model: type[ObsT],
    env_name: str,
    **_: Any,
) -> FastAPI:
    """Build a FastAPI app exposing one environment instance over HTTP.

    Fallback for ``openenv.core.env_server.create_app`` when openenv-core is
    not installed. Extra keyword arguments (e.g. ``max_concurrent_envs``) are
    accepted for signature compatibility but ignored.

    Endpoints: ``/`` (identity), ``/health``, ``/metadata``, ``/schema``
    (JSON schemas of action/observation), ``/reset``, ``/step``, ``/state``.
    """
    app = FastAPI(title=env_name)
    # One shared environment instance serves all requests (no per-session envs).
    app.state.environment = environment_cls()

    @app.get("/")
    def root() -> dict[str, Any]:
        # Identity payload; "/tasks" is listed for parity with the outer app
        # even though this factory does not register it itself.
        return {
            "name": env_name,
            "status": "ok",
            "endpoints": ["/health", "/reset", "/step", "/state", "/tasks", "/metadata", "/schema"],
        }

    @app.get("/health")
    def health() -> dict[str, str]:
        # Liveness probe.
        return {"status": "ok"}

    @app.get("/metadata")
    def metadata() -> dict[str, Any]:
        # Static capability description for validators.
        return {
            "name": env_name,
            "supports_state": True,
            "supports_tasks": True,
            "transport": "http",
        }

    @app.get("/schema")
    def schema() -> dict[str, Any]:
        # Pydantic JSON schemas of the action and observation models.
        return {
            "action": action_model.model_json_schema(),
            "observation": observation_model.model_json_schema(),
        }

    @app.post("/reset")
    def reset(payload: dict[str, Any] | None = Body(default=None)) -> dict[str, Any]:
        # Body keys (e.g. task_id) are forwarded as reset() kwargs.
        observation = app.state.environment.reset(**(payload or {}))
        data = observation.model_dump()
        # `or 0.0` maps a null reward to zero.
        return {
            "observation": data,
            "reward": float(data.get("reward") or 0.0),
            "done": bool(data.get("done", False)),
        }

    @app.post("/step")
    def step(payload: dict[str, Any]) -> dict[str, Any]:
        # Validation errors from model_validate surface as FastAPI 500s here;
        # openenv-core may handle this differently — confirm if parity matters.
        action = action_model.model_validate(payload)
        observation = app.state.environment.step(action)
        data = observation.model_dump()
        return {
            "observation": data,
            "reward": float(data.get("reward") or 0.0),
            "done": bool(data.get("done", False)),
        }

    @app.get("/state")
    def state() -> dict[str, Any]:
        # Full episode snapshot, serialized.
        return app.state.environment.state().model_dump()

    return app
|
support_queue_env/server/support_queue_environment.py
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Environment implementation for SaaS support queue triage."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from itertools import cycle
|
| 6 |
+
from threading import Lock
|
| 7 |
+
from typing import Any
|
| 8 |
+
from uuid import uuid4
|
| 9 |
+
|
| 10 |
+
try:
|
| 11 |
+
from openenv.core.env_server import Environment
|
| 12 |
+
except Exception: # pragma: no cover - compatibility fallback
|
| 13 |
+
from support_queue_env.server.openenv_compat import Environment
|
| 14 |
+
|
| 15 |
+
from support_queue_env.grading import grade_ticket
|
| 16 |
+
from support_queue_env.models import TaskCard, TicketSnapshot, SupportQueueAction, SupportQueueObservation, SupportQueueState
|
| 17 |
+
from support_queue_env.tasks import TASK_INDEX, TASKS, TaskSpec
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class SupportQueueEnvironment(Environment[SupportQueueAction, SupportQueueObservation, SupportQueueState]):
    """Episodic environment: the agent triages each ticket of a task in order.

    Each step() grades exactly one ticket; the episode finishes after the
    task's last ticket. Rewards are the per-ticket grading totals.
    """

    SUPPORTS_CONCURRENT_SESSIONS = True
    # Class-level round-robin over the task catalog so successive reset()
    # calls without an explicit task_id rotate through all tasks; the lock
    # guards the shared iterator across concurrent sessions.
    _task_cycle = cycle(task.task_id for task in TASKS)
    _cycle_lock = Lock()

    def __init__(self) -> None:
        self.episode_id = ""  # uuid4 string assigned on reset()
        self.task: TaskSpec = TASKS[0]  # placeholder until reset() selects a task
        self.current_index = 0  # 0-based cursor of the next unprocessed ticket
        self.cumulative_reward = 0.0
        self.ticket_scores = []  # TicketFeedback per processed ticket
        self.action_history = []  # SupportQueueAction per processed ticket
        self.processed_tickets = []  # ticket_ids in processing order
        self.done = False

    @classmethod
    def available_tasks(cls) -> list[TaskCard]:
        """Expose the static task catalog as serializable cards."""
        return [
            TaskCard(
                task_id=task.task_id,
                title=task.title,
                difficulty=task.difficulty,
                description=task.description,
                ticket_count=len(task.tickets),
            )
            for task in TASKS
        ]

    @classmethod
    def next_default_task_id(cls) -> str:
        """Thread-safely advance the shared round-robin task cycle."""
        with cls._cycle_lock:
            return next(cls._task_cycle)

    def reset(self, task_id: str | None = None, **_: Any) -> SupportQueueObservation:
        """Start a new episode.

        An unknown *task_id* silently falls back to TASKS[0]; omitting it
        takes the next task from the round-robin cycle.
        """
        selected_task_id = task_id or self.next_default_task_id()
        self.task = TASK_INDEX.get(selected_task_id, TASKS[0])
        self.episode_id = str(uuid4())
        self.current_index = 0
        self.cumulative_reward = 0.0
        self.ticket_scores = []
        self.action_history = []
        self.processed_tickets = []
        self.done = False
        return self._build_observation(reward=0.0, done=False, feedback=None)

    def step(self, action: SupportQueueAction) -> SupportQueueObservation:
        """Grade *action* against the current ticket and advance the queue."""
        if self.done:
            # Stepping a finished episode is a zero-reward no-op.
            return self._terminal_observation("Episode already finished. Call reset() to start a new task.")

        ticket = self.task.tickets[self.current_index]
        feedback = grade_ticket(ticket, action)

        self.action_history.append(action)
        self.ticket_scores.append(feedback)
        self.processed_tickets.append(ticket.ticket_id)
        # Rounded at every accumulation to keep reported totals stable.
        self.cumulative_reward = round(self.cumulative_reward + feedback.breakdown.total, 4)
        self.current_index += 1
        self.done = self.current_index >= len(self.task.tickets)

        if self.done:
            # Last ticket: the terminal observation still carries its reward
            # and feedback.
            return self._terminal_observation(feedback.feedback, reward=feedback.breakdown.total, feedback=feedback)

        return self._build_observation(reward=feedback.breakdown.total, done=False, feedback=feedback)

    def state(self) -> SupportQueueState:
        """Return the full episode snapshot served by GET /state."""
        average_reward = self.cumulative_reward / len(self.ticket_scores) if self.ticket_scores else 0.0
        return SupportQueueState(
            episode_id=self.episode_id or "not-started",
            task=TaskCard(
                task_id=self.task.task_id,
                title=self.task.title,
                difficulty=self.task.difficulty,
                description=self.task.description,
                ticket_count=len(self.task.tickets),
            ),
            current_index=self.current_index,
            total_tickets=len(self.task.tickets),
            done=self.done,
            cumulative_reward=round(self.cumulative_reward, 4),
            average_reward=round(average_reward, 4),
            ticket_scores=self.ticket_scores,
            action_history=self.action_history,
            processed_tickets=self.processed_tickets,
        )

    def _current_ticket(self) -> TicketSnapshot:
        """Snapshot of the ticket at the cursor (agent-visible fields only)."""
        # Clamp so a finished episode still yields the last ticket as a
        # placeholder for terminal observations.
        ticket = self.task.tickets[min(self.current_index, len(self.task.tickets) - 1)]
        return TicketSnapshot(
            ticket_id=ticket.ticket_id,
            subject=ticket.subject,
            body=ticket.body,
            customer_tier=ticket.customer_tier,
            product_area=ticket.product_area,
            sla_hours=ticket.sla_hours,
            recent_events=ticket.recent_events,
        )

    def _build_observation(self, reward: float, done: bool, feedback) -> SupportQueueObservation:
        """Assemble a mid-episode observation.

        ``feedback`` is a TicketFeedback or None (annotation omitted because
        TicketFeedback is not imported in this module).
        """
        average_reward = self.cumulative_reward / len(self.ticket_scores) if self.ticket_scores else 0.0
        return SupportQueueObservation(
            task_id=self.task.task_id,
            task_title=self.task.title,
            difficulty=self.task.difficulty,
            instructions=self.task.instructions,
            current_index=self.current_index + 1,  # observations are 1-based
            total_tickets=len(self.task.tickets),
            ticket=self._current_ticket(),
            last_feedback=feedback,
            cumulative_reward=round(self.cumulative_reward, 4),
            reward=round(reward, 4),
            done=done,
            info={
                "episode_id": self.episode_id,
                "processed_tickets": list(self.processed_tickets),
                "average_reward": round(average_reward, 4),
            },
        )

    def _terminal_observation(self, message: str, reward: float = 0.0, feedback=None) -> SupportQueueObservation:
        """Assemble the end-of-episode observation; *message* lands in info."""
        placeholder_ticket = self._current_ticket()
        return SupportQueueObservation(
            task_id=self.task.task_id,
            task_title=self.task.title,
            difficulty=self.task.difficulty,
            instructions=f"{self.task.instructions} Episode complete.",
            current_index=len(self.task.tickets),
            total_tickets=len(self.task.tickets),
            ticket=placeholder_ticket,
            last_feedback=feedback,
            cumulative_reward=round(self.cumulative_reward, 4),
            reward=round(reward, 4),
            done=True,
            info={
                "episode_id": self.episode_id,
                "processed_tickets": list(self.processed_tickets),
                "message": message,
            },
        )
|
support_queue_env/tasks.py
ADDED
|
@@ -0,0 +1,249 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Deterministic task catalog for the support triage environment."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from pydantic import BaseModel, ConfigDict, Field
|
| 6 |
+
|
| 7 |
+
from support_queue_env.models import Difficulty, Disposition, Priority, QueueName
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class TicketSpec(BaseModel):
    """Ground-truth definition of one support ticket plus its grading rubric."""

    model_config = ConfigDict(extra="forbid")

    # Fields surfaced to the agent (mirrored into models.TicketSnapshot).
    ticket_id: str
    subject: str
    body: str
    customer_tier: str  # free-form here; models.CustomerTier constrains the snapshot
    product_area: str
    sla_hours: int
    recent_events: list[str] = Field(default_factory=list)
    # Grading labels — not included in the agent-facing snapshot.
    expected_priority: Priority
    expected_queue: QueueName
    expected_disposition: Disposition
    acceptable_queues: list[QueueName] = Field(default_factory=list)  # earn partial credit
    acceptable_dispositions: list[Disposition] = Field(default_factory=list)  # earn partial credit
    summary_keywords: list[str] = Field(default_factory=list)  # expected in action.summary
    response_keywords: list[str] = Field(default_factory=list)  # expected in action.response
    disallowed_keywords: list[str] = Field(default_factory=list)  # trigger the -0.10 penalty
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class TaskSpec(BaseModel):
    """A deterministic triage task: agent-facing instructions plus a fixed
    set of tickets to classify.

    Unknown fields are rejected (`extra="forbid"`) so task definitions in
    this module must match the schema exactly.
    """

    model_config = ConfigDict(extra="forbid")

    task_id: str          # stable identifier, used as the key in TASK_INDEX
    title: str
    difficulty: Difficulty
    description: str      # short human-readable summary of the task
    # Prompt text telling the agent how to triage the tickets below.
    instructions: str
    tickets: list[TicketSpec]
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
# Deterministic task catalog, ordered by difficulty (easy -> medium -> hard).
# Each entry is a fixed batch of tickets with ground-truth triage labels.
TASKS: list[TaskSpec] = [
    # Easy: two unambiguous tickets (account access + duplicate billing).
    TaskSpec(
        task_id="easy_inbox_cleanup",
        title="Inbox Cleanup",
        difficulty="easy",
        description="Two straightforward tickets covering access and billing triage.",
        instructions=(
            "You are a SaaS support triage agent. For each ticket, choose priority, routing queue, "
            "and the next best disposition. Write a short internal summary plus the first reply you "
            "would send to the customer."
        ),
        tickets=[
            TicketSpec(
                ticket_id="E-101",
                subject="Locked out after password reset",
                body=(
                    "I reset my password this morning and now the admin portal says my account is locked. "
                    "We need to finish payroll before noon."
                ),
                customer_tier="starter",
                product_area="auth",
                sla_hours=24,
                recent_events=["Password reset completed 2 hours ago", "No prior incidents on status page"],
                expected_priority="P3",
                expected_queue="technical",
                expected_disposition="respond",
                summary_keywords=["account", "locked", "password"],
                response_keywords=["verify", "unlock", "reset"],
                disallowed_keywords=["refund", "breach"],
            ),
            TicketSpec(
                ticket_id="E-102",
                subject="We were charged twice for March",
                body=(
                    "Our card shows two identical charges from your company for invoice INV-4481. "
                    "Please confirm whether one of them will be refunded."
                ),
                customer_tier="growth",
                product_area="billing",
                sla_hours=8,
                recent_events=["Invoice INV-4481 marked paid yesterday"],
                expected_priority="P2",
                expected_queue="billing",
                expected_disposition="respond",
                summary_keywords=["duplicate", "charge", "invoice"],
                response_keywords=["refund", "investigate", "billing"],
                disallowed_keywords=["ignore", "security incident"],
            ),
        ],
    ),
    # Medium: mixes a clear security escalation, a regression escalation, and
    # an ambiguous billing ticket that should trigger an info request.
    TaskSpec(
        task_id="medium_sla_defense",
        title="SLA Defense",
        difficulty="medium",
        description="Three tickets that mix urgent escalation with an ambiguity check.",
        instructions=(
            "Prioritize by customer impact and risk. Security events and broad service degradation should "
            "be escalated immediately. If the customer has not given enough evidence to act safely, ask for "
            "the minimum details needed to proceed."
        ),
        tickets=[
            TicketSpec(
                ticket_id="M-201",
                subject="Suspicious email asking admins to re-enter credentials",
                body=(
                    "Several admins received an email that looks like your login page and asks us to "
                    "re-authenticate. One teammate clicked it but says they closed the tab before typing anything."
                ),
                customer_tier="enterprise",
                product_area="security",
                sla_hours=1,
                recent_events=["Customer SSO is enabled", "No status page incident posted"],
                expected_priority="P1",
                expected_queue="security",
                expected_disposition="escalate",
                summary_keywords=["phishing", "credentials", "admins"],
                response_keywords=["security", "escalated", "do not click"],
                disallowed_keywords=["send password", "share secrets"],
            ),
            TicketSpec(
                ticket_id="M-202",
                subject="Webhook deliveries are failing after yesterday's rollout",
                body=(
                    "Every webhook call since 06:15 UTC has returned HTTP 500. This is blocking our downstream "
                    "fulfillment pipeline. Can you investigate urgently?"
                ),
                customer_tier="growth",
                product_area="integrations",
                sla_hours=4,
                recent_events=["Customer is on API version 2025-11", "Platform release went out last night"],
                expected_priority="P2",
                expected_queue="technical",
                expected_disposition="escalate",
                acceptable_queues=["success"],
                summary_keywords=["webhook", "500", "rollout"],
                response_keywords=["engineering", "logs", "investigate"],
                disallowed_keywords=["duplicate charge", "unsubscribe"],
            ),
            TicketSpec(
                ticket_id="M-203",
                subject="Maybe double charged? Not fully sure",
                body=(
                    "My finance teammate thinks we were double billed, but I can only find one invoice in the portal. "
                    "Could you explain what happened and what details you need from me?"
                ),
                customer_tier="growth",
                product_area="billing",
                sla_hours=12,
                recent_events=["One paid invoice visible in portal", "No payment failures recorded"],
                expected_priority="P3",
                expected_queue="billing",
                expected_disposition="request_info",
                acceptable_dispositions=["respond"],
                summary_keywords=["billing", "unclear", "invoice"],
                response_keywords=["invoice", "last four", "amount"],
                disallowed_keywords=["breach", "status page"],
            ),
        ],
    ),
    # Hard: four high-stakes tickets (outage, security, commercial escalation,
    # and a cancellation/export flow that needs identity verification first).
    TaskSpec(
        task_id="hard_exec_escalations",
        title="Executive Escalations",
        difficulty="hard",
        description="Four high-stakes tickets that require precise triage under pressure.",
        instructions=(
            "You are covering an executive escalation queue during a busy incident window. Optimize for "
            "business continuity, account safety, and clean handoffs. Use P1 only for severe production or "
            "security impact. Ask for more detail only when it materially changes the next safe action."
        ),
        tickets=[
            TicketSpec(
                ticket_id="H-301",
                subject="All agents see 502 during login",
                body=(
                    "Our entire support floor is blocked from logging in. Every browser gets a 502 after "
                    "submitting the sign-in form. The public status page still says operational."
                ),
                customer_tier="enterprise",
                product_area="auth",
                sla_hours=1,
                recent_events=["50+ seats on account", "Issue started 18 minutes ago"],
                expected_priority="P1",
                expected_queue="technical",
                expected_disposition="escalate",
                summary_keywords=["login", "502", "all agents"],
                response_keywords=["incident", "engineering", "urgent"],
                disallowed_keywords=["refund only", "close ticket"],
            ),
            TicketSpec(
                ticket_id="H-302",
                subject="Unknown OAuth app connected after employee departure",
                body=(
                    "An OAuth app named 'SyncFast' appeared in our workspace this morning from an IP we don't recognize. "
                    "The only recent account change is that one contractor left yesterday."
                ),
                customer_tier="enterprise",
                product_area="security",
                sla_hours=1,
                recent_events=["Customer has audit logs enabled", "Former contractor account was deactivated yesterday"],
                expected_priority="P1",
                expected_queue="security",
                expected_disposition="escalate",
                summary_keywords=["oauth", "unknown", "contractor"],
                response_keywords=["security", "revoke", "escalated"],
                disallowed_keywords=["share api key", "ignore"],
            ),
            TicketSpec(
                ticket_id="H-303",
                subject="Renewal quote lost our committed discount",
                body=(
                    "Our renewal quote is missing the 18% discount your sales team committed in writing. "
                    "Our CFO will freeze procurement tomorrow if this isn't corrected."
                ),
                customer_tier="enterprise",
                product_area="commercial",
                sla_hours=6,
                recent_events=["Renewal date in 2 days", "Account owner is on PTO"],
                expected_priority="P2",
                expected_queue="success",
                expected_disposition="escalate",
                acceptable_queues=["billing"],
                summary_keywords=["renewal", "discount", "cfo"],
                response_keywords=["account manager", "quote", "escalated"],
                disallowed_keywords=["security breach", "reset password"],
            ),
            TicketSpec(
                ticket_id="H-304",
                subject="Need cancellation plus data export",
                body=(
                    "We're planning to cancel next month for budget reasons, but first I need a data export for "
                    "our records. Please tell me exactly what you need from me to start."
                ),
                customer_tier="starter",
                product_area="retention",
                sla_hours=24,
                recent_events=["No open invoices", "Account is owner-managed"],
                expected_priority="P3",
                expected_queue="success",
                expected_disposition="request_info",
                acceptable_queues=["billing"],
                summary_keywords=["cancel", "data export", "verification"],
                response_keywords=["verify", "export", "owner"],
                disallowed_keywords=["breach", "status page"],
            ),
        ],
    ),
]
|
| 248 |
+
|
| 249 |
+
# O(1) lookup from task_id to its TaskSpec. With duplicate task_ids the last
# entry would silently win; the catalog above keeps ids unique.
TASK_INDEX = {spec.task_id: spec for spec in TASKS}
|
uv.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|