Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- graders/__init__.py +169 -0
- pyproject.toml +1 -1
- server/app.py +51 -4
- tasks/__init__.py +43 -0
- tasks/account_takeover_medium.py +63 -0
- tasks/api_incident_hard.py +68 -0
- tasks/base.py +64 -0
- tasks/billing_refund_easy.py +66 -0
- tasks/regulated_export_exception_hard.py +66 -0
graders/__init__.py
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Task-specific graders for the SupportDesk environment."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from dataclasses import dataclass
|
| 6 |
+
from typing import Optional
|
| 7 |
+
|
| 8 |
+
from models import SupportCaseProgress, SupportDeskObservation
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def _as_case(obj) -> SupportCaseProgress:
    """Coerce *obj* into the ``SupportCaseProgress`` the graders operate on.

    Args:
        obj: Either a ``SupportCaseProgress`` instance, or any object that
            exposes a ``case`` attribute (the attribute's type is not
            checked here).

    Returns:
        ``obj`` itself when it already is a ``SupportCaseProgress``,
        otherwise ``obj.case``.

    Raises:
        TypeError: If *obj* is not a case and has no ``case`` attribute.
    """
    if not isinstance(obj, SupportCaseProgress):
        if not hasattr(obj, "case"):
            raise TypeError(f"Unsupported object for grading: {type(obj)}")
        return obj.case  # type: ignore[attr-defined]
    return obj
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
@dataclass
class GradeBreakdown:
    """Outcome of grading a single case.

    Fields are positional in the order ``score``, ``message``,
    ``penalties``, ``completed_milestones``.
    """

    # Final score as computed by the grader that built this object.
    score: float
    # Short human-readable label for the evaluation.
    message: str
    # Penalty name -> amount that was subtracted for it.
    penalties: dict[str, float]
    # ``None`` (the default) is replaced by a fresh empty list in
    # ``__post_init__`` so that instances never share one list object.
    completed_milestones: Optional[list[str]] = None

    @property
    def total_score(self) -> float:
        """Alias for ``score``."""
        return self.score

    def __post_init__(self) -> None:
        """Normalize a missing ``completed_milestones`` to an empty list."""
        # The dataclass is not frozen, so a plain attribute assignment is
        # enough; no ``object.__setattr__`` workaround is required.
        if self.completed_milestones is None:
            self.completed_milestones = []
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def _clamp(v: float) -> float:
|
| 37 |
+
return max(0.01, min(0.99, v))
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def grade_task_id(task_id: str, observation: SupportDeskObservation | SupportCaseProgress) -> GradeBreakdown:
    """Grade *observation* with the grader registered for *task_id*.

    Args:
        task_id: Identifier of the task whose grader should be used.
        observation: A case, or an object carrying one in ``.case``.

    Returns:
        The selected grader's ``GradeBreakdown``. An unrecognized *task_id*
        yields a breakdown with score 0.01, message ``"Unknown task"`` and
        a single ``unknown_task`` penalty.

    Raises:
        TypeError: Propagated from ``_as_case`` when *observation* cannot
            be converted to a case (checked before the task id is looked at).
    """
    case = _as_case(observation)

    # Ordered (task id, grader class) pairs; scanned with ``==`` so the
    # comparison semantics match a plain if-chain.
    known_graders = (
        ("billing_refund_easy", BillingRefundEasyGrader),
        ("account_takeover_medium", AccountTakeoverMediumGrader),
        ("api_incident_hard", ApiIncidentHardGrader),
        ("regulated_export_exception_hard", RegulatedExportExceptionHardGrader),
    )
    for known_id, grader_cls in known_graders:
        if task_id == known_id:
            return grader_cls().score(case)

    return GradeBreakdown(0.01, "Unknown task", {"unknown_task": 1.0})
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def grade_case(task_or_id, observation) -> GradeBreakdown:
    """Grade *observation* for a task given as an object or as an id.

    Args:
        task_or_id: An object with a ``task_id`` attribute, or any value
            whose ``str()`` is the task id.
        observation: A case, or an object carrying one in ``.case``.

    Returns:
        The ``GradeBreakdown`` produced by ``grade_task_id``.
    """
    if hasattr(task_or_id, "task_id"):
        task_id = task_or_id.task_id
    else:
        task_id = str(task_or_id)
    return grade_task_id(task_id, _as_case(observation))
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
class BillingRefundEasyGrader:
    """Keyword-based grader for the ``billing_refund_easy`` task."""

    def score(self, case: SupportCaseProgress) -> GradeBreakdown:
        """Score *case* by subtracting penalties from a base of 1.0.

        Penalties (matching is case-insensitive):
          * ``no_reply`` (0.55) when the reply is empty, otherwise
            ``missing_refund`` (0.25) when it lacks "refund";
          * ``no_note`` (0.2) when the internal note is empty, otherwise
            ``note_missing_duplicate`` (0.2) when it lacks "duplicate";
          * ``status_not_resolved`` (0.1) when the status is not
            ``"resolved"``.

        The result is rounded to two decimals and clamped by ``_clamp``.
        """
        reply_text = (case.reply or "").lower()
        note_text = (case.internal_note or "").lower()
        deductions: dict[str, float] = {}

        if not reply_text:
            deductions["no_reply"] = 0.55
        elif "refund" not in reply_text:
            deductions["missing_refund"] = 0.25

        if not note_text:
            deductions["no_note"] = 0.2
        elif "duplicate" not in note_text:
            deductions["note_missing_duplicate"] = 0.2

        if case.status != "resolved":
            deductions["status_not_resolved"] = 0.1

        remaining = round(1.0 - sum(deductions.values()), 2)
        return GradeBreakdown(_clamp(remaining), "Billing refund evaluation", deductions)

    def grade(self, case: SupportCaseProgress) -> float:
        """Return only the numeric score of ``score(case)``."""
        breakdown = self.score(case)
        return breakdown.score
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
class AccountTakeoverMediumGrader:
    """Keyword-based grader for the ``account_takeover_medium`` task."""

    def score(self, case: SupportCaseProgress) -> GradeBreakdown:
        """Score *case* by subtracting penalties from a base of 0.2.

        Penalties (matching is case-insensitive):
          * ``no_reply`` (0.4) when the reply is empty; otherwise 0.2 each
            for a reply lacking "lock", "verify" or "ownership";
          * ``wrong_status`` (0.2) when the status is neither
            ``"escalated"`` nor ``"waiting_on_customer"``.

        The result is rounded to two decimals and clamped by ``_clamp``.
        """
        # NOTE(review): the base is 0.2 while every penalty is >= 0.2, so any
        # single penalty drives the result to the 0.01 floor. Confirm that
        # this cap is intended (BillingRefundEasyGrader starts from 1.0).
        reply_text = (case.reply or "").lower()
        deductions: dict[str, float] = {}

        if reply_text:
            keyword_penalties = (
                ("lock", "missing_lock"),
                ("verify", "missing_verify"),
                ("ownership", "missing_ownership"),
            )
            for keyword, penalty_name in keyword_penalties:
                if keyword not in reply_text:
                    deductions[penalty_name] = 0.2
        else:
            deductions["no_reply"] = 0.4

        if case.status not in ("escalated", "waiting_on_customer"):
            deductions["wrong_status"] = 0.2

        remaining = round(0.2 - sum(deductions.values()), 2)
        return GradeBreakdown(_clamp(remaining), "Account takeover evaluation", deductions)

    def grade(self, case: SupportCaseProgress) -> float:
        """Return only the numeric score of ``score(case)``."""
        breakdown = self.score(case)
        return breakdown.score
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
class ApiIncidentHardGrader:
    """Keyword-based grader for the ``api_incident_hard`` task."""

    def score(self, case: SupportCaseProgress) -> GradeBreakdown:
        """Score *case* by subtracting penalties from a base of 0.2.

        Penalties (matching is case-insensitive):
          * ``no_reply`` (0.4) when the reply is empty; otherwise
            ``missing_status_page`` (0.15) when it lacks "status",
            ``missing_request_ids`` (0.2) when it lacks "request" or a
            standalone "id"/"ids" token, and ``missing_escalation`` (0.2)
            when it lacks "escalat";
          * ``wrong_queue`` (0.15) when the queue is not
            ``"platform_engineering"``.

        The result is rounded to two decimals and clamped by ``_clamp``.
        """
        # Imported locally so this block does not depend on a change to the
        # module's import section.
        import re

        # NOTE(review): with a base of 0.2, only a penalty-free case reaches
        # 0.2 and a single 0.15 penalty leaves 0.05 -- confirm this cap is
        # intended (BillingRefundEasyGrader starts from 1.0).
        penalties: dict[str, float] = {}
        score = 0.2

        reply = (case.reply or "").lower()
        if reply:
            if "status" not in reply:
                penalties["missing_status_page"] = 0.15
            # A bare ``"id" in reply`` test is satisfied by words such as
            # "incident", "provide" or "valid", which made the check
            # vacuous. Require "id"/"ids" as its own token instead; a
            # neighbouring underscore is allowed so "request_ids" matches.
            mentions_ids = re.search(r"(?<![a-z])ids?(?![a-z])", reply) is not None
            if "request" not in reply or not mentions_ids:
                penalties["missing_request_ids"] = 0.2
            if "escalat" not in reply:
                penalties["missing_escalation"] = 0.2
        else:
            penalties["no_reply"] = 0.4

        if case.queue != "platform_engineering":
            penalties["wrong_queue"] = 0.15

        score -= sum(penalties.values())
        score = round(score, 2)
        return GradeBreakdown(_clamp(score), "API incident evaluation", penalties)

    def grade(self, case: SupportCaseProgress) -> float:
        """Return only the numeric score of ``score(case)``."""
        return self.score(case).score
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
class RegulatedExportExceptionHardGrader:
    """Keyword-based grader for the ``regulated_export_exception_hard`` task."""

    def score(self, case: SupportCaseProgress) -> GradeBreakdown:
        """Score *case* by subtracting penalties from a base of 0.2.

        Penalties (matching is case-insensitive):
          * ``no_reply`` (0.4) when the reply is empty; otherwise
            ``missing_compliance`` (0.2) when it lacks "compliance",
            ``missing_no_promise`` (0.2) when it contains neither
            "cannot promise" nor "not promise", and ``missing_recipient``
            (0.15) unless it contains both "recipient" and "identity";
          * ``wrong_status`` (0.15) when the status is not
            ``"waiting_on_customer"``.

        The result is rounded to two decimals and clamped by ``_clamp``.
        """
        # NOTE(review): with a base of 0.2, only a penalty-free case reaches
        # 0.2 and a single 0.15 penalty leaves 0.05 -- confirm this cap is
        # intended (BillingRefundEasyGrader starts from 1.0).
        reply_text = (case.reply or "").lower()
        deductions: dict[str, float] = {}

        if reply_text:
            if "compliance" not in reply_text:
                deductions["missing_compliance"] = 0.2
            no_promise_phrases = ("cannot promise", "not promise")
            if not any(phrase in reply_text for phrase in no_promise_phrases):
                deductions["missing_no_promise"] = 0.2
            if not ("recipient" in reply_text and "identity" in reply_text):
                deductions["missing_recipient"] = 0.15
        else:
            deductions["no_reply"] = 0.4

        if case.status != "waiting_on_customer":
            deductions["wrong_status"] = 0.15

        remaining = round(0.2 - sum(deductions.values()), 2)
        return GradeBreakdown(_clamp(remaining), "Regulated export evaluation", deductions)

    def grade(self, case: SupportCaseProgress) -> float:
        """Return only the numeric score of ``score(case)``."""
        breakdown = self.score(case)
        return breakdown.score
|
pyproject.toml
CHANGED
|
@@ -38,4 +38,4 @@ server = "server.app:main"
|
|
| 38 |
|
| 39 |
[tool.setuptools]
|
| 40 |
include-package-data = true
|
| 41 |
-
packages = ["server"]
|
|
|
|
| 38 |
|
| 39 |
[tool.setuptools]
|
| 40 |
include-package-data = true
|
| 41 |
+
packages = ["server", "tasks", "graders"]
|
server/app.py
CHANGED
|
@@ -14,10 +14,57 @@ try:
|
|
| 14 |
except ImportError:
|
| 15 |
try:
|
| 16 |
from openenv_core.env_server import http_server as openenv_http_server
|
| 17 |
-
except Exception
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
from models import SupportDeskAction, SupportDeskObservation, SupportDeskState
|
| 23 |
from server.supportdesk_environment import SupportDeskEnvironment
|
|
|
|
| 14 |
except ImportError:
|
| 15 |
try:
|
| 16 |
from openenv_core.env_server import http_server as openenv_http_server
|
| 17 |
+
except Exception:
|
| 18 |
+
# Minimal fallback for test runs when openenv is unavailable.
|
| 19 |
+
from pydantic import BaseModel, ValidationError as _PydValidationError
|
| 20 |
+
from fastapi import FastAPI
|
| 21 |
+
|
| 22 |
+
class _ResetRequest(BaseModel):
|
| 23 |
+
seed: int | None = None
|
| 24 |
+
episode_id: str | None = None
|
| 25 |
+
task_id: str | None = None
|
| 26 |
+
timeout_s: float | None = None
|
| 27 |
+
|
| 28 |
+
class _StepRequest(BaseModel):
|
| 29 |
+
action: dict
|
| 30 |
+
timeout_s: float | None = None
|
| 31 |
+
episode_id: str | None = None
|
| 32 |
+
|
| 33 |
+
def _deserialize_action(data, ActionCls):
|
| 34 |
+
return ActionCls.model_validate(data)
|
| 35 |
+
|
| 36 |
+
def _create_app(env_cls, action_cls, obs_cls, env_name: str = "env", max_concurrent_envs: int = 1):
|
| 37 |
+
app = FastAPI()
|
| 38 |
+
|
| 39 |
+
@app.post("/reset")
|
| 40 |
+
def _reset(req: _ResetRequest = _ResetRequest()):
|
| 41 |
+
env = env_cls()
|
| 42 |
+
kwargs = req.model_dump(exclude_none=True)
|
| 43 |
+
obs = env.reset(**kwargs)
|
| 44 |
+
return {"observation": obs.model_dump(), "reward": obs.reward, "done": obs.done}
|
| 45 |
+
|
| 46 |
+
@app.post("/step")
|
| 47 |
+
def _step(req: _StepRequest):
|
| 48 |
+
env = env_cls()
|
| 49 |
+
action = _deserialize_action(req.action, action_cls)
|
| 50 |
+
obs = env.step(action, timeout_s=req.timeout_s, episode_id=req.episode_id)
|
| 51 |
+
return {"observation": obs.model_dump(), "reward": obs.reward, "done": obs.done}
|
| 52 |
+
|
| 53 |
+
@app.get("/state")
|
| 54 |
+
def _state():
|
| 55 |
+
env = env_cls()
|
| 56 |
+
return env.state.model_dump()
|
| 57 |
+
|
| 58 |
+
return app
|
| 59 |
+
|
| 60 |
+
class _Shim:
|
| 61 |
+
ResetRequest = _ResetRequest
|
| 62 |
+
StepRequest = _StepRequest
|
| 63 |
+
ValidationError = _PydValidationError
|
| 64 |
+
deserialize_action = staticmethod(_deserialize_action)
|
| 65 |
+
create_app = staticmethod(_create_app)
|
| 66 |
+
|
| 67 |
+
openenv_http_server = _Shim()
|
| 68 |
|
| 69 |
from models import SupportDeskAction, SupportDeskObservation, SupportDeskState
|
| 70 |
from server.supportdesk_environment import SupportDeskEnvironment
|
tasks/__init__.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Task registry for the SupportDesk environment (per-task modules)."""
|
| 2 |
+
|
| 3 |
+
from tasks.base import (
|
| 4 |
+
ALL_ISSUE_TYPES,
|
| 5 |
+
ALL_PRIORITIES,
|
| 6 |
+
ALL_QUEUES,
|
| 7 |
+
ALL_STATUSES,
|
| 8 |
+
SupportTaskSpec,
|
| 9 |
+
)
|
| 10 |
+
from tasks.billing_refund_easy import TASK as BILLING_REFUND_EASY
|
| 11 |
+
from tasks.account_takeover_medium import TASK as ACCOUNT_TAKEOVER_MEDIUM
|
| 12 |
+
from tasks.api_incident_hard import TASK as API_INCIDENT_HARD
|
| 13 |
+
from tasks.regulated_export_exception_hard import TASK as REGULATED_EXPORT_EXCEPTION_HARD
|
| 14 |
+
|
| 15 |
+
# Every task spec shipped with this package, in registration order.
_TASK_SPECS = (
    BILLING_REFUND_EASY,
    ACCOUNT_TAKEOVER_MEDIUM,
    API_INCIDENT_HARD,
    REGULATED_EXPORT_EXCEPTION_HARD,
)

# Registry keyed by each spec's own ``task_id``.
TASKS: dict[str, SupportTaskSpec] = {spec.task_id: spec for spec in _TASK_SPECS}
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def get_task(task_id: str) -> SupportTaskSpec:
    """Return the task spec registered under *task_id*.

    Args:
        task_id: Key to look up in ``TASKS``.

    Returns:
        The matching ``SupportTaskSpec``.

    Raises:
        KeyError: If *task_id* is not registered; the message lists the
            known task ids.
    """
    try:
        return TASKS[task_id]
    except KeyError:
        known_ids = ", ".join(sorted(TASKS))
        # ``from None`` drops the bare lookup error so the traceback shows
        # only the descriptive message.
        raise KeyError(
            f"Unknown task_id {task_id!r}; known task ids: {known_ids}"
        ) from None
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def list_task_ids() -> list[str]:
    """Return the registered task ids as a new list, in registry order."""
    return list(TASKS)
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
__all__ = [
|
| 35 |
+
"SupportTaskSpec",
|
| 36 |
+
"ALL_QUEUES",
|
| 37 |
+
"ALL_PRIORITIES",
|
| 38 |
+
"ALL_STATUSES",
|
| 39 |
+
"ALL_ISSUE_TYPES",
|
| 40 |
+
"TASKS",
|
| 41 |
+
"get_task",
|
| 42 |
+
"list_task_ids",
|
| 43 |
+
]
|
tasks/account_takeover_medium.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from tasks.base import SupportTaskSpec, KnowledgeSnippet, SupportTicket
|
| 4 |
+
|
| 5 |
+
# Inbound ticket presented for this task.
_TICKET = SupportTicket(
    customer_name="Anjali Menon",
    customer_tier="enterprise",
    company="Saffron Logistics",
    subject="Someone changed our admin email",
    body=(
        "Our main admin email was changed and we lost access. "
        "We’re seeing unfamiliar login alerts from overseas. Urgent help needed."
    ),
    region="us-east-1",
    affected_users=240,
    sla_minutes_remaining=120,
)

# Knowledge-base articles attached to this task.
_KNOWLEDGE_BASE = (
    KnowledgeSnippet(
        article_id="kb-compromise-protocol",
        title="Account compromise protocol",
        content=(
            "Immediately lock the account, invalidate active sessions, require admin re-verification, "
            "and collect proof of ownership (billing zip, last 4 of card, admin ID)."
        ),
    ),
    KnowledgeSnippet(
        article_id="kb-enterprise-sla",
        title="Enterprise SLA",
        content="Enterprise incidents must be contained within 2 hours. Provide status updates every 30 minutes.",
    ),
)

# Spec for the medium-difficulty account-takeover task.
TASK = SupportTaskSpec(
    task_id="account_takeover_medium",
    difficulty="medium",
    title="Account takeover recovery",
    objective=(
        "Identify an account compromise, lock the account, start recovery, "
        "and communicate next steps without leaking security-sensitive details."
    ),
    ticket=_TICKET,
    knowledge_base=_KNOWLEDGE_BASE,
    gold_queue="trust_and_safety",
    gold_priority="urgent",
    gold_issue_type="account_compromise",
    gold_status="escalated",
    gold_resolution_code="security_lock_and_verify",
    required_requested_fields=("ownership_proof", "billing_zip", "last4_card"),
    required_reply_markers=(
        ("locked", "account"),
        ("invalidate", "sessions"),
        ("verify", "ownership"),
        ("sla", "updates"),
    ),
    required_note_markers=(("account locked",), ("ownership proof requested",)),
    risk_flags=("security_incident", "sla_breach"),
    follow_up_outcome="partial",
    follow_up_message="Customer provided billing zip but not card last4 yet.",
    follow_up_provided_fields=("billing_zip",),
    # NOTE(review): this says "card_last4" while required_requested_fields
    # above uses "last4_card" -- confirm whether the two are meant to match.
    follow_up_wrong_fields=("card_last4",),
    sla_step_cost=18,
    over_escalation_queues=("security_ops",),
    under_escalation_deadline_step=4,
    max_steps=7,
)
|
tasks/api_incident_hard.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from tasks.base import SupportTaskSpec, KnowledgeSnippet, SupportTicket
|
| 4 |
+
|
| 5 |
+
# Inbound ticket presented for this task.
_TICKET = SupportTicket(
    customer_name="Marco Alvarez",
    customer_tier="enterprise",
    company="Northwind Labs",
    subject="API timeouts for createOrder",
    body=(
        "Since 3 hours ago, createOrder calls are timing out or returning 500s across regions. "
        "We rolled back our last deploy and still see issues. Need RCA and mitigation ASAP."
    ),
    region="us-west-2",
    affected_users=4200,
    sla_minutes_remaining=90,
)

# Knowledge-base articles attached to this task.
_KNOWLEDGE_BASE = (
    KnowledgeSnippet(
        article_id="kb-api-runbook",
        title="API latency/5xx runbook",
        content=(
            "Capture request IDs, time window, regions, and payload samples. "
            "Check current status page and incident channel. "
            "If multiple regions impacted, escalate to platform_engineering and set customer expectations."
        ),
    ),
    KnowledgeSnippet(
        article_id="kb-status-page",
        title="Status page policy",
        content="If 2+ enterprise customers report API errors, post a preliminary status within 15 minutes.",
    ),
)

# Spec for the hard-difficulty production API incident task.
TASK = SupportTaskSpec(
    task_id="api_incident_hard",
    difficulty="hard",
    title="Production API incident triage",
    objective=(
        "Triage a production API latency/5xx incident affecting multiple customers; "
        "collect diagnostics, apply runbook mitigations, and escalate to platform engineering appropriately."
    ),
    ticket=_TICKET,
    knowledge_base=_KNOWLEDGE_BASE,
    gold_queue="platform_engineering",
    gold_priority="urgent",
    gold_issue_type="production_incident",
    gold_status="escalated",
    gold_resolution_code="runbook_investigation",
    required_requested_fields=("request_ids", "time_window", "regions", "payload_sample"),
    required_reply_markers=(
        ("acknowledge", "incident"),
        ("collect", "request ids"),
        ("status", "page"),
        ("escalate", "platform"),
    ),
    required_note_markers=(
        ("status page",),
        ("platform escalation",),
        ("request ids",),
    ),
    risk_flags=("sla_breach", "p1_incident"),
    follow_up_outcome="partial",
    follow_up_message="Platform team investigating elevated DB latency; ETA 20 minutes.",
    follow_up_provided_fields=("request_ids", "time_window"),
    follow_up_wrong_fields=("payload_sample",),
    sla_step_cost=20,
    over_escalation_queues=(),
    under_escalation_deadline_step=3,
    max_steps=8,
)
|
tasks/base.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Shared task structures for SupportDesk."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from dataclasses import dataclass
|
| 6 |
+
from typing import Literal
|
| 7 |
+
|
| 8 |
+
from models import KnowledgeSnippet, SupportTicket
|
| 9 |
+
|
| 10 |
+
# Queue names used by the task specs.
ALL_QUEUES = ["billing_ops", "trust_and_safety", "platform_engineering", "compliance_ops", "general_support"]

# Priority labels, listed from "low" to "urgent".
ALL_PRIORITIES = [
    "low",
    "normal",
    "high",
    "urgent",
]

# Case status labels.
ALL_STATUSES = [
    "new",
    "waiting_on_customer",
    "resolved",
    "escalated",
]

# Issue-type labels.
ALL_ISSUE_TYPES = ["duplicate_charge", "account_compromise", "production_incident", "regulated_exception", "general_question"]
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
@dataclass(frozen=True)
class SupportTaskSpec:
    """Immutable description of one SupportDesk task.

    The first fourteen fields are required; the rest have defaults. How
    each field is consumed is decided by code outside this module, so the
    notes below that go beyond the declared types are assumptions to
    confirm.
    """

    # --- identity and framing -------------------------------------------
    task_id: str
    difficulty: Literal["easy", "medium", "hard"]
    title: str
    objective: str

    # --- scenario content -----------------------------------------------
    ticket: SupportTicket
    knowledge_base: tuple[KnowledgeSnippet, ...]

    # --- expected ("gold") values -- presumably what a correct handling
    # of the ticket should end up with; verify against the environment ---
    gold_queue: str
    gold_priority: str
    gold_issue_type: str
    gold_status: str
    gold_resolution_code: str

    # --- required content; each marker entry is a tuple of strings ------
    required_requested_fields: tuple[str, ...]
    required_reply_markers: tuple[tuple[str, ...], ...]
    required_note_markers: tuple[tuple[str, ...], ...]

    # --- optional settings ----------------------------------------------
    forbidden_reply_markers: tuple[str, ...] = ()
    risk_flags: tuple[str, ...] = ()
    follow_up_outcome: Literal["none", "partial", "complete", "incorrect"] = "none"
    follow_up_message: str = ""
    follow_up_provided_fields: tuple[str, ...] = ()
    follow_up_wrong_fields: tuple[str, ...] = ()
    sla_step_cost: int = 15
    over_escalation_queues: tuple[str, ...] = ()
    under_escalation_deadline_step: int | None = None
    max_steps: int = 6
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
__all__ = [
|
| 57 |
+
"SupportTaskSpec",
|
| 58 |
+
"KnowledgeSnippet",
|
| 59 |
+
"SupportTicket",
|
| 60 |
+
"ALL_QUEUES",
|
| 61 |
+
"ALL_PRIORITIES",
|
| 62 |
+
"ALL_STATUSES",
|
| 63 |
+
"ALL_ISSUE_TYPES",
|
| 64 |
+
]
|
tasks/billing_refund_easy.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from tasks.base import SupportTaskSpec, KnowledgeSnippet, SupportTicket
|
| 4 |
+
|
| 5 |
+
# Inbound ticket presented for this task.
_TICKET = SupportTicket(
    customer_name="Riya Shah",
    customer_tier="pro",
    company="PixelNorth Studio",
    subject="Charged twice after I canceled",
    body=(
        "I canceled our Pro annual workspace yesterday, but my card was charged again "
        "this morning and I still see the old invoice. We only had one workspace, "
        "so this looks like a duplicate charge. Please fix it quickly."
    ),
    region="ap-south-1",
    affected_users=12,
    sla_minutes_remaining=240,
)

# Knowledge-base articles attached to this task.
_KNOWLEDGE_BASE = (
    KnowledgeSnippet(
        article_id="kb-refund-dup",
        title="Refund duplicate charges",
        content=(
            "If a customer is double-charged, verify the duplicate invoice IDs, refund the extra charge, "
            "and send a confirmation summarizing the refund amount and timeline."
        ),
    ),
    KnowledgeSnippet(
        article_id="kb-pro-refund-policy",
        title="Pro tier refund policy",
        content=(
            "Pro annual refunds are prorated to the current billing month; processing takes 5-7 business days. "
            "Provide the refund reference ID in your reply."
        ),
    ),
)

# Spec for the easy-difficulty duplicate-charge refund task.
TASK = SupportTaskSpec(
    task_id="billing_refund_easy",
    difficulty="easy",
    title="Duplicate charge refund triage",
    objective=(
        "Triage a duplicate-charge billing ticket, send the correct customer response, "
        "and close the case only if no further customer information is required."
    ),
    ticket=_TICKET,
    knowledge_base=_KNOWLEDGE_BASE,
    gold_queue="billing_ops",
    gold_priority="high",
    gold_issue_type="duplicate_charge",
    gold_status="resolved",
    gold_resolution_code="refund_processed",
    required_requested_fields=("invoice_ids",),
    required_reply_markers=(
        ("refund", "processed"),
        ("reference", "id"),
        ("timeline",),
    ),
    required_note_markers=(("duplicate charge",), ("refund issued",)),
    risk_flags=("sla_breach", "finance_close_risk"),
    follow_up_outcome="complete",
    follow_up_message="Customer confirms the refund hit their card.",
    follow_up_provided_fields=("refund_reference",),
    follow_up_wrong_fields=(),
    sla_step_cost=12,
    over_escalation_queues=(),
    under_escalation_deadline_step=None,
    max_steps=6,
)
|
tasks/regulated_export_exception_hard.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from tasks.base import SupportTaskSpec, KnowledgeSnippet, SupportTicket
|
| 4 |
+
|
| 5 |
+
# Inbound ticket presented for this task.
_TICKET = SupportTicket(
    customer_name="Lena Fischer",
    customer_tier="pro",
    company="Helio Analytics",
    subject="Export of model outputs to restricted country",
    body=(
        "We need to send model outputs to a contractor in a restricted country for 2 weeks. "
        "Requesting an exception. We will use VPN + access controls."
    ),
    region="eu-central-1",
    affected_users=35,
    sla_minutes_remaining=180,
)

# Knowledge-base articles attached to this task.
_KNOWLEDGE_BASE = (
    KnowledgeSnippet(
        article_id="kb-export-policy",
        title="Export control policy",
        content=(
            "Exceptions require compliance review. Collect justification, data sensitivity, duration, "
            "recipient identity, and technical safeguards. Do not promise approval."
        ),
    ),
    KnowledgeSnippet(
        article_id="kb-temp-access",
        title="Temporary access rules",
        content="Temporary access must be time-bounded and audited; VPN alone is insufficient without logging.",
    ),
)

# Spec for the hard-difficulty regulated export exception task.
TASK = SupportTaskSpec(
    task_id="regulated_export_exception_hard",
    difficulty="hard",
    title="Regulated export exception review",
    objective=(
        "Handle a compliance exception request for export-controlled data; gather required attestations, "
        "route to compliance, and avoid promising approval."
    ),
    ticket=_TICKET,
    knowledge_base=_KNOWLEDGE_BASE,
    gold_queue="compliance_ops",
    gold_priority="high",
    gold_issue_type="regulated_exception",
    gold_status="waiting_on_customer",
    gold_resolution_code="compliance_review_pending",
    required_requested_fields=("justification", "data_type", "recipient_identity", "duration", "safeguards"),
    required_reply_markers=(
        ("cannot promise", "approval"),
        ("compliance", "review"),
        ("collect", "recipient"),
        ("time-bound", "access"),
    ),
    required_note_markers=(
        ("exception request",),
        ("awaiting compliance",),
    ),
    risk_flags=("legal",),
    follow_up_outcome="incorrect",
    follow_up_message="Customer insists VPN is sufficient and did not provide recipient identity.",
    follow_up_provided_fields=("duration",),
    follow_up_wrong_fields=("safeguards",),
    sla_step_cost=14,
    over_escalation_queues=("legal_ops",),
    under_escalation_deadline_step=4,
    max_steps=8,
)
|