modelbuilderhq committed on
Commit
4f129c9
·
verified ·
1 Parent(s): 6f6f46e

Upload folder using huggingface_hub

Browse files
Files changed (17) hide show
  1. Dockerfile +1 -1
  2. README.md +11 -9
  3. __init__.py +49 -2
  4. client.py +37 -2
  5. graders.py +168 -11
  6. inference.py +6 -6
  7. main.py +1 -1
  8. models.py +121 -2
  9. openenv.yaml +1 -1
  10. openenv_compat.py +76 -0
  11. policies.py +84 -0
  12. pyproject.toml +3 -3
  13. server/__init__.py +5 -1
  14. server/app.py +185 -13
  15. server/supportdesk_environment.py +544 -2
  16. tasks.py +404 -2
  17. tests/test_supportdesk.py +9 -9
Dockerfile CHANGED
@@ -80,4 +80,4 @@ HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
80
 
81
  # Run the FastAPI server
82
  # The module path is constructed to work with this repo's package layout.
83
- CMD ["sh", "-c", "cd /app/env && uvicorn supportdesk_env.server.app:app --host 0.0.0.0 --port 8000"]
 
80
 
81
  # Run the FastAPI server
82
  # The module path is constructed to work with this repo's package layout.
83
+ CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 8000"]
README.md CHANGED
@@ -303,15 +303,17 @@ Examples:
303
  |-- pyproject.toml
304
  |-- Dockerfile
305
  |-- uv.lock
306
- |-- supportdesk_env
 
 
 
 
 
 
 
307
  | |-- __init__.py
308
- | |-- graders.py
309
- | |-- models.py
310
- | |-- policies.py
311
- | |-- tasks.py
312
- | `-- server
313
- | |-- app.py
314
- | `-- supportdesk_environment.py
315
  |-- tests
316
  | `-- test_supportdesk.py
317
  `-- examples
@@ -344,7 +346,7 @@ python -m openenv.cli validate .
344
  Start the local server:
345
 
346
  ```bash
347
- python -m supportdesk_env.server.app
348
  ```
349
 
350
  Or use the entrypoint:
 
303
  |-- pyproject.toml
304
  |-- Dockerfile
305
  |-- uv.lock
306
+ |-- __init__.py
307
+ |-- client.py
308
+ |-- graders.py
309
+ |-- models.py
310
+ |-- openenv_compat.py
311
+ |-- policies.py
312
+ |-- tasks.py
313
+ |-- server
314
  | |-- __init__.py
315
+ | |-- app.py
316
+ | `-- supportdesk_environment.py
 
 
 
 
 
317
  |-- tests
318
  | `-- test_supportdesk.py
319
  `-- examples
 
346
  Start the local server:
347
 
348
  ```bash
349
+ python -m server.app
350
  ```
351
 
352
  Or use the entrypoint:
__init__.py CHANGED
@@ -1,3 +1,50 @@
1
- """Compatibility wrapper for the real supportdesk_env package."""
2
 
3
- from supportdesk_env import * # noqa: F401,F403
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """SupportDesk OpenEnv environment package (flat layout)."""
2
 
3
+ from client import SupportDeskEnv
4
+ from graders import (
5
+ AccountTakeoverMediumGrader,
6
+ ApiIncidentHardGrader,
7
+ BillingRefundEasyGrader,
8
+ GradeBreakdown,
9
+ RegulatedExportExceptionHardGrader,
10
+ grade_case,
11
+ grade_task_id,
12
+ )
13
+ from models import (
14
+ ActionHistoryEntry,
15
+ KnowledgeSnippet,
16
+ SupportCaseProgress,
17
+ SupportDeskAction,
18
+ SupportDeskObservation,
19
+ SupportDeskState,
20
+ SupportTicket,
21
+ )
22
+ from policies import default_note, default_reply, heuristic_action
23
+ from server.supportdesk_environment import SupportDeskEnvironment
24
+ from tasks import TASKS, SupportTaskSpec, get_task, list_task_ids
25
+
26
+ __all__ = [
27
+ "ActionHistoryEntry",
28
+ "GradeBreakdown",
29
+ "KnowledgeSnippet",
30
+ "SupportCaseProgress",
31
+ "SupportDeskAction",
32
+ "SupportDeskEnv",
33
+ "SupportDeskEnvironment",
34
+ "SupportDeskObservation",
35
+ "SupportDeskState",
36
+ "SupportTaskSpec",
37
+ "SupportTicket",
38
+ "TASKS",
39
+ "default_note",
40
+ "default_reply",
41
+ "get_task",
42
+ "grade_case",
43
+ "grade_task_id",
44
+ "heuristic_action",
45
+ "list_task_ids",
46
+ "AccountTakeoverMediumGrader",
47
+ "ApiIncidentHardGrader",
48
+ "BillingRefundEasyGrader",
49
+ "RegulatedExportExceptionHardGrader",
50
+ ]
client.py CHANGED
@@ -1,3 +1,38 @@
1
- """Compatibility wrapper for the real supportdesk_env package."""
2
 
3
- from supportdesk_env.client import * # noqa: F401,F403
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """HTTP client for interacting with a deployed SupportDesk environment."""
2
 
3
+ from __future__ import annotations
4
+
5
+ from models import SupportDeskAction, SupportDeskObservation, SupportDeskState
6
+ from openenv_compat import EnvClient, StepResult
7
+
8
+
9
+ def _validate(model_cls, payload):
10
+ if hasattr(model_cls, "model_validate"):
11
+ return model_cls.model_validate(payload)
12
+ return model_cls(**payload) # pragma: no cover - pydantic v1 fallback
13
+
14
+
15
+ class SupportDeskEnv(EnvClient[SupportDeskAction, SupportDeskObservation, SupportDeskState]):
16
+ """Typed client for a locally running or deployed OpenEnv server."""
17
+
18
+ def _step_payload(self, action: SupportDeskAction) -> dict:
19
+ """Convert a typed action into the JSON payload expected by the server."""
20
+
21
+ if hasattr(action, "model_dump"):
22
+ return action.model_dump()
23
+ return action.dict()
24
+
25
+ def _parse_state(self, payload) -> SupportDeskState:
26
+ return _validate(SupportDeskState, payload)
27
+
28
+ def _parse_reset(self, payload) -> SupportDeskObservation:
29
+ return _validate(SupportDeskObservation, payload)
30
+
31
+ def _parse_result(self, payload) -> StepResult[SupportDeskObservation]:
32
+ observation = _validate(SupportDeskObservation, payload["observation"])
33
+ # OpenEnv StepResult only accepts observation/reward/done in this runtime.
34
+ return StepResult(
35
+ observation=observation,
36
+ reward=payload["reward"],
37
+ done=payload["done"],
38
+ )
graders.py CHANGED
@@ -1,14 +1,171 @@
1
- """Compatibility wrapper exposing task graders from the repo root."""
2
-
3
- from supportdesk_env.graders import (
4
- AccountTakeoverMediumGrader,
5
- ApiIncidentHardGrader,
6
- BillingRefundEasyGrader,
7
- GradeBreakdown,
8
- RegulatedExportExceptionHardGrader,
9
- grade_case,
10
- grade_task_id,
11
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
  __all__ = [
14
  "AccountTakeoverMediumGrader",
 
1
+ """Deterministic graders and reward helpers for SupportDesk."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ from dataclasses import dataclass
7
+
8
+ from models import SupportCaseProgress
9
+ from tasks import SupportTaskSpec, get_task
10
+
11
+ STRICT_SCORE_EPSILON = 0.01
12
+
13
+
14
+ @dataclass(frozen=True)
15
+ class GradeBreakdown:
16
+ """A scored view of how close a case is to the gold solution."""
17
+
18
+ total_score: float
19
+ queue_score: float
20
+ priority_score: float
21
+ issue_type_score: float
22
+ requested_fields_score: float
23
+ reply_score: float
24
+ note_score: float
25
+ status_score: float
26
+ resolution_score: float
27
+ completed_milestones: tuple[str, ...]
28
+
29
+
30
+ def _normalize(text: str | None) -> str:
31
+ if not text:
32
+ return ""
33
+ normalized = text.lower().replace("-", " ")
34
+ return re.sub(r"[^a-z0-9\s]", " ", normalized)
35
+
36
+
37
+ def _marker_group_score(text: str | None, marker_groups: tuple[tuple[str, ...], ...]) -> float:
38
+ if not marker_groups:
39
+ return 1.0
40
+
41
+ normalized = _normalize(text)
42
+ if not normalized:
43
+ return 0.0
44
+
45
+ matches = 0
46
+ for group in marker_groups:
47
+ if any(_normalize(marker) in normalized for marker in group):
48
+ matches += 1
49
+ return matches / len(marker_groups)
50
+
51
+
52
+ def _requested_fields_score(case: SupportCaseProgress, task: SupportTaskSpec) -> float:
53
+ required = set(task.required_requested_fields)
54
+ requested = set(case.requested_fields)
55
+
56
+ if not required:
57
+ return 1.0 if not requested else 0.0
58
+ if not requested:
59
+ return 0.0
60
+
61
+ matched = len(required.intersection(requested))
62
+ extras = len(requested.difference(required))
63
+ raw = matched / len(required)
64
+ penalty = min(0.25, extras * 0.05)
65
+ return max(0.0, raw - penalty)
66
+
67
+
68
+ def _reply_penalty(case: SupportCaseProgress, task: SupportTaskSpec) -> float:
69
+ text = _normalize(case.reply)
70
+ if not text:
71
+ return 0.0
72
+ return 0.0 if not any(_normalize(marker) in text for marker in task.forbidden_reply_markers) else 0.5
73
+
74
+
75
+ def _strict_open_unit_interval(score: float) -> float:
76
+ """Keep final task scores strictly within (0, 1) for evaluator compatibility."""
77
+
78
+ return min(1.0 - STRICT_SCORE_EPSILON, max(STRICT_SCORE_EPSILON, score))
79
+
80
+
81
+ def grade_case(task: SupportTaskSpec, case: SupportCaseProgress) -> GradeBreakdown:
82
+ """Score a case deterministically with total_score strictly inside (0, 1)."""
83
+
84
+ queue_score = 1.0 if case.queue == task.gold_queue else 0.0
85
+ priority_score = 1.0 if case.priority == task.gold_priority else 0.0
86
+ issue_type_score = 1.0 if case.issue_type == task.gold_issue_type else 0.0
87
+ requested_fields_score = _requested_fields_score(case, task)
88
+ reply_score = max(0.0, _marker_group_score(case.reply, task.required_reply_markers) - _reply_penalty(case, task))
89
+ note_score = _marker_group_score(case.internal_note, task.required_note_markers)
90
+ status_score = 1.0 if case.status == task.gold_status else 0.0
91
+ resolution_score = 1.0 if case.resolution_code == task.gold_resolution_code else 0.0
92
+
93
+ weighted_total = (
94
+ queue_score * 0.15
95
+ + priority_score * 0.10
96
+ + issue_type_score * 0.10
97
+ + requested_fields_score * 0.15
98
+ + reply_score * 0.25
99
+ + note_score * 0.10
100
+ + status_score * 0.10
101
+ + resolution_score * 0.05
102
+ )
103
+
104
+ milestones: list[str] = []
105
+ if queue_score:
106
+ milestones.append("queue")
107
+ if priority_score:
108
+ milestones.append("priority")
109
+ if issue_type_score:
110
+ milestones.append("issue_type")
111
+ if requested_fields_score >= 0.99:
112
+ milestones.append("requested_fields")
113
+ if reply_score >= 0.99:
114
+ milestones.append("reply")
115
+ if note_score >= 0.99:
116
+ milestones.append("internal_note")
117
+ if status_score:
118
+ milestones.append("status")
119
+ if resolution_score:
120
+ milestones.append("resolution_code")
121
+
122
+ return GradeBreakdown(
123
+ total_score=round(_strict_open_unit_interval(weighted_total), 4),
124
+ queue_score=queue_score,
125
+ priority_score=priority_score,
126
+ issue_type_score=issue_type_score,
127
+ requested_fields_score=round(requested_fields_score, 4),
128
+ reply_score=round(reply_score, 4),
129
+ note_score=round(note_score, 4),
130
+ status_score=status_score,
131
+ resolution_score=resolution_score,
132
+ completed_milestones=tuple(milestones),
133
+ )
134
+
135
+
136
+ def grade_task_id(task_id: str, case: SupportCaseProgress) -> GradeBreakdown:
137
+ """Convenience wrapper used by tests and evaluation scripts."""
138
+
139
+ return grade_case(get_task(task_id), case)
140
+
141
+
142
+ class _TaskSpecificGrader:
143
+ """Importable task-specific grader wrapper for validator task discovery."""
144
+
145
+ task_id: str = ""
146
+
147
+ def grade(self, case: SupportCaseProgress) -> float:
148
+ return grade_task_id(self.task_id, case).total_score
149
+
150
+ def __call__(self, case: SupportCaseProgress) -> float:
151
+ return self.grade(case)
152
+
153
+
154
+ class BillingRefundEasyGrader(_TaskSpecificGrader):
155
+ task_id = "billing_refund_easy"
156
+
157
+
158
+ class AccountTakeoverMediumGrader(_TaskSpecificGrader):
159
+ task_id = "account_takeover_medium"
160
+
161
+
162
+ class ApiIncidentHardGrader(_TaskSpecificGrader):
163
+ task_id = "api_incident_hard"
164
+
165
+
166
+ class RegulatedExportExceptionHardGrader(_TaskSpecificGrader):
167
+ task_id = "regulated_export_exception_hard"
168
+
169
 
170
  __all__ = [
171
  "AccountTakeoverMediumGrader",
inference.py CHANGED
@@ -14,12 +14,12 @@ try:
14
  except ImportError: # pragma: no cover - local fallback mode
15
  OpenAI = None # type: ignore[assignment]
16
 
17
- from supportdesk_env.client import SupportDeskEnv
18
- from supportdesk_env.graders import grade_case
19
- from supportdesk_env.models import SupportDeskAction, SupportDeskObservation
20
- from supportdesk_env.policies import heuristic_action
21
- from supportdesk_env.server.supportdesk_environment import SupportDeskEnvironment
22
- from supportdesk_env.tasks import get_task, list_task_ids
23
 
24
  API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
25
  MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct"
 
14
  except ImportError: # pragma: no cover - local fallback mode
15
  OpenAI = None # type: ignore[assignment]
16
 
17
+ from client import SupportDeskEnv
18
+ from graders import grade_case
19
+ from models import SupportDeskAction, SupportDeskObservation
20
+ from policies import heuristic_action
21
+ from server.supportdesk_environment import SupportDeskEnvironment
22
+ from tasks import get_task, list_task_ids
23
 
24
  API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
25
  MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct"
main.py CHANGED
@@ -2,7 +2,7 @@
2
 
3
  from __future__ import annotations
4
 
5
- from supportdesk_env.server.app import app, main as _run_server
6
 
7
 
8
  def main() -> None:
 
2
 
3
  from __future__ import annotations
4
 
5
+ from server.app import app, main as _run_server
6
 
7
 
8
  def main() -> None:
models.py CHANGED
@@ -1,3 +1,122 @@
1
- """Compatibility wrapper for the real supportdesk_env package."""
2
 
3
- from supportdesk_env.models import * # noqa: F401,F403
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Typed models for the SupportDesk OpenEnv environment."""
2
 
3
+ from __future__ import annotations
4
+
5
+ from typing import Literal
6
+
7
+ from pydantic import BaseModel, Field
8
+
9
+ from openenv_compat import Action, Observation, State
10
+
11
+
12
+ class KnowledgeSnippet(BaseModel):
13
+ """A policy or runbook excerpt the agent can use during triage."""
14
+
15
+ article_id: str
16
+ title: str
17
+ content: str
18
+
19
+
20
+ class SupportTicket(BaseModel):
21
+ """Static task input representing the inbound support ticket."""
22
+
23
+ customer_name: str
24
+ customer_tier: Literal["free", "pro", "enterprise"]
25
+ company: str
26
+ subject: str
27
+ body: str
28
+ region: str
29
+ affected_users: int | None = None
30
+ sla_minutes_remaining: int | None = None
31
+ business_impact: str | None = None
32
+ secondary_concerns: list[str] = Field(default_factory=list)
33
+ attachments: list[str] = Field(default_factory=list)
34
+
35
+
36
+ class ActionHistoryEntry(BaseModel):
37
+ """A concise trace entry used in observations and state dumps."""
38
+
39
+ step: int
40
+ operation: str
41
+ summary: str
42
+ reward_delta: float = 0.0
43
+
44
+
45
+ class CustomerFollowUp(BaseModel):
46
+ """A scripted customer response that arrives after a request for more information."""
47
+
48
+ status: Literal["none", "pending", "partial", "complete", "incorrect"] = "none"
49
+ message: str | None = None
50
+ provided_fields: list[str] = Field(default_factory=list)
51
+ wrong_fields: list[str] = Field(default_factory=list)
52
+
53
+
54
+ class SupportCaseProgress(BaseModel):
55
+ """Mutable case state that graders score against."""
56
+
57
+ queue: str | None = None
58
+ priority: str | None = None
59
+ issue_type: str | None = None
60
+ status: str = "new"
61
+ resolution_code: str | None = None
62
+ requested_fields: list[str] = Field(default_factory=list)
63
+ reply: str | None = None
64
+ internal_note: str | None = None
65
+ customer_follow_up: CustomerFollowUp = Field(default_factory=CustomerFollowUp)
66
+
67
+
68
+ class SupportDeskAction(Action):
69
+ """One structured action the agent can take at each step."""
70
+
71
+ operation: Literal["classify", "request_info", "draft_reply", "add_internal_note", "submit", "wait"]
72
+ queue: str | None = None
73
+ priority: str | None = None
74
+ issue_type: str | None = None
75
+ status: str | None = None
76
+ resolution_code: str | None = None
77
+ requested_fields: list[str] = Field(default_factory=list)
78
+ reply: str | None = None
79
+ internal_note: str | None = None
80
+
81
+
82
+ class SupportDeskObservation(Observation):
83
+ """Observation emitted to the agent after reset and each step."""
84
+
85
+ task_id: str
86
+ difficulty: Literal["easy", "medium", "hard"]
87
+ objective: str
88
+ ticket: SupportTicket
89
+ knowledge_base: list[KnowledgeSnippet]
90
+ available_queues: list[str]
91
+ available_priorities: list[str]
92
+ available_statuses: list[str]
93
+ available_issue_types: list[str]
94
+ case: SupportCaseProgress
95
+ current_sla_minutes_remaining: int | None = None
96
+ workflow_stage: str
97
+ required_next_actions: list[str] = Field(default_factory=list)
98
+ risk_flags: list[str] = Field(default_factory=list)
99
+ action_history: list[ActionHistoryEntry] = Field(default_factory=list)
100
+ feedback: str = ""
101
+ remaining_steps: int = 0
102
+
103
+
104
+ class SupportDeskState(State):
105
+ """Current environment state returned by the OpenEnv state() API."""
106
+
107
+ episode_id: str | None = None
108
+ task_id: str
109
+ difficulty: Literal["easy", "medium", "hard"]
110
+ step_count: int = 0
111
+ reward: float = 0.0
112
+ done: bool = False
113
+ current_score: float = 0.0
114
+ max_steps: int = 0
115
+ case: SupportCaseProgress
116
+ current_sla_minutes_remaining: int | None = None
117
+ workflow_stage: str
118
+ required_next_actions: list[str] = Field(default_factory=list)
119
+ risk_flags: list[str] = Field(default_factory=list)
120
+ action_history: list[ActionHistoryEntry] = Field(default_factory=list)
121
+ completed_milestones: list[str] = Field(default_factory=list)
122
+ last_feedback: str = ""
openenv.yaml CHANGED
@@ -3,7 +3,7 @@ name: HyperBrickCaseOps
3
  env_name: supportdesk_env
4
  type: space
5
  runtime: fastapi
6
- app: supportdesk_env.server.app:app
7
  port: 8000
8
  description: Enterprise support operations environment with SLA pressure, business-impact aware triage, and primary-vs-secondary issue prioritization.
9
  tasks:
 
3
  env_name: supportdesk_env
4
  type: space
5
  runtime: fastapi
6
+ app: server.app:app
7
  port: 8000
8
  description: Enterprise support operations environment with SLA pressure, business-impact aware triage, and primary-vs-secondary issue prioritization.
9
  tasks:
openenv_compat.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Compatibility helpers for environments where openenv-core is not installed."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+ from typing import Any, Generic, TypeVar
7
+
8
+ from pydantic import BaseModel
9
+
10
+ A = TypeVar("A")
11
+ O = TypeVar("O")
12
+ S = TypeVar("S")
13
+
14
+
15
+ OPENENV_AVAILABLE = True
16
+
17
+ try:
18
+ from openenv.core.client_types import StepResult # type: ignore
19
+ from openenv.core.env_client import EnvClient # type: ignore
20
+ from openenv.core.env_server.interfaces import Environment # type: ignore
21
+ from openenv.core.env_server.types import Action, Observation, State # type: ignore
22
+ from openenv.core.env_server.types import EnvironmentMetadata # type: ignore
23
+ except ImportError:
24
+ try:
25
+ from openenv_core.client_types import StepResult # type: ignore
26
+ from openenv_core.http_env_client import HTTPEnvClient as EnvClient # type: ignore
27
+ from openenv_core.env_server.interfaces import Environment # type: ignore
28
+ from openenv_core.env_server.types import Action, Observation, State # type: ignore
29
+ from openenv_core.env_server.types import EnvironmentMetadata # type: ignore
30
+ except ImportError:
31
+ OPENENV_AVAILABLE = False
32
+
33
+ class Action(BaseModel):
34
+ """Fallback Action base type for local import-only workflows."""
35
+
36
+ class Observation(BaseModel):
37
+ """Fallback Observation base type for local import-only workflows."""
38
+
39
+ reward: float = 0.0
40
+ done: bool = False
41
+
42
+ class State(BaseModel):
43
+ """Fallback State base type for local import-only workflows."""
44
+
45
+ class Environment(Generic[A, O, S]):
46
+ """Minimal base class used for local unit tests and import-based demos."""
47
+
48
+ def __init__(self) -> None:
49
+ super().__init__()
50
+
51
+ class EnvironmentMetadata(BaseModel):
52
+ """Fallback metadata model used when OpenEnv is absent."""
53
+
54
+ name: str
55
+ description: str
56
+ readme_content: str | None = None
57
+ version: str | None = None
58
+ author: str | None = None
59
+
60
+ @dataclass
61
+ class StepResult(Generic[O]):
62
+ """Fallback step result for local-only client compatibility."""
63
+
64
+ observation: O
65
+ reward: float
66
+ done: bool
67
+ info: dict[str, Any] = field(default_factory=dict)
68
+
69
+ class EnvClient(Generic[A, O, S]):
70
+ """Placeholder client that fails only when actually used."""
71
+
72
+ def __init__(self, *args, **kwargs) -> None:
73
+ raise ImportError(
74
+ "SupportDeskEnv requires openenv-core to be installed. "
75
+ "Run `py -3 -m pip install openenv-core` to use the HTTP client."
76
+ )
policies.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Reusable policy helpers for local baselines and training examples."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from models import SupportDeskAction, SupportDeskObservation
6
+ from tasks import get_task
7
+
8
+
9
+ def default_reply(task_id: str) -> str:
10
+ """Return a task-specific high-signal customer reply."""
11
+
12
+ if task_id == "billing_refund_easy":
13
+ return (
14
+ "Thanks for flagging the duplicate charge. I have started the refund for the extra "
15
+ "charge, and the funds usually appear within 5-7 business days."
16
+ )
17
+ if task_id == "account_takeover_medium":
18
+ return (
19
+ "We have escalated this to our trust team. Please reset your password, scan your "
20
+ "device for malware, and reply with your workspace_id, last successful login time, "
21
+ "and billing email so we can verify the account safely."
22
+ )
23
+ if task_id == "regulated_export_exception_hard":
24
+ return (
25
+ "We cannot provide a bypass or temporary unlock yet. Our compliance team is running "
26
+ "a compliance review, and we need your tenant_region, dpa_amendment_id, and "
27
+ "legal_contact_email to continue that review."
28
+ )
29
+ return (
30
+ "We are treating this as an active incident and our on-call engineering team is engaged. "
31
+ "Please send the affected request IDs, UTC timestamps, and the impacted region so we can "
32
+ "speed up the investigation."
33
+ )
34
+
35
+
36
+ def default_note(task_id: str) -> str:
37
+ """Return a task-specific internal note."""
38
+
39
+ if task_id == "billing_refund_easy":
40
+ return "Duplicate charge confirmed from attached invoice; refund approved."
41
+ if task_id == "account_takeover_medium":
42
+ return "Suspicious login alert reported and customer is locked out."
43
+ if task_id == "regulated_export_exception_hard":
44
+ return (
45
+ "Audit-driven export exception request tied to an EU residency policy block; "
46
+ "customer asked for a manual bypass before legal approval."
47
+ )
48
+ return "EU data residency rollout hit intermittent HTTP 500s and the customer launches tonight."
49
+
50
+
51
+ def heuristic_action(observation: SupportDeskObservation) -> SupportDeskAction:
52
+ """Deterministic high-performing policy used by the baseline."""
53
+
54
+ task = get_task(observation.task_id)
55
+ case = observation.case
56
+
57
+ if case.queue is None or case.priority is None or case.issue_type is None:
58
+ return SupportDeskAction(
59
+ operation="classify",
60
+ queue=task.gold_queue,
61
+ priority=task.gold_priority,
62
+ issue_type=task.gold_issue_type,
63
+ )
64
+
65
+ if task.required_requested_fields and sorted(case.requested_fields) != sorted(task.required_requested_fields):
66
+ return SupportDeskAction(
67
+ operation="request_info",
68
+ requested_fields=list(task.required_requested_fields),
69
+ )
70
+
71
+ if case.customer_follow_up.status == "pending":
72
+ return SupportDeskAction(operation="wait")
73
+
74
+ if not case.reply:
75
+ return SupportDeskAction(operation="draft_reply", reply=default_reply(observation.task_id))
76
+
77
+ if not case.internal_note:
78
+ return SupportDeskAction(operation="add_internal_note", internal_note=default_note(observation.task_id))
79
+
80
+ return SupportDeskAction(
81
+ operation="submit",
82
+ status=task.gold_status,
83
+ resolution_code=task.gold_resolution_code,
84
+ )
pyproject.toml CHANGED
@@ -33,9 +33,9 @@ dev = [
33
 
34
  [project.scripts]
35
  # Server entry point - enables running via: uv run --project . server
36
- # or: python -m supportdesk_env.server.app
37
- server = "supportdesk_env.server.app:main"
38
 
39
  [tool.setuptools]
40
  include-package-data = true
41
- packages = ["supportdesk_env", "supportdesk_env.server"]
 
33
 
34
  [project.scripts]
35
  # Server entry point - enables running via: uv run --project . server
36
+ # or: python -m server.app
37
+ server = "server.app:main"
38
 
39
  [tool.setuptools]
40
  include-package-data = true
41
+ packages = ["server"]
server/__init__.py CHANGED
@@ -1 +1,5 @@
1
- """Server package for the SupportDesk OpenEnv environment."""
 
 
 
 
 
1
+ """Server package for the SupportDesk OpenEnv environment."""
2
+
3
+ from server.supportdesk_environment import SupportDeskEnvironment
4
+
5
+ __all__ = ["SupportDeskEnvironment"]
server/app.py CHANGED
@@ -1,33 +1,205 @@
1
- """FastAPI app entrypoint for the SupportDesk environment."""
2
 
3
  from __future__ import annotations
4
 
5
  import os
 
6
 
7
  import uvicorn
 
 
8
 
9
  try:
10
- from openenv.core.env_server.http_server import create_app
11
- except ImportError: # pragma: no cover - package name differs across releases
12
- from openenv_core.env_server.http_server import create_app
 
 
 
 
 
13
 
14
- from supportdesk_env.models import SupportDeskAction, SupportDeskObservation
15
- from supportdesk_env.server.supportdesk_environment import SupportDeskEnvironment
 
16
 
 
 
 
 
 
17
  app = create_app(
18
  SupportDeskEnvironment,
19
- action_cls=SupportDeskAction,
20
- observation_cls=SupportDeskObservation,
21
  env_name="supportdesk_env",
 
22
  )
23
 
24
 
25
- def main() -> None:
26
- """Run the local HTTP server."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
- port = int(os.getenv("PORT", "8000"))
29
- uvicorn.run("supportdesk_env.server.app:app", host="0.0.0.0", port=port)
30
 
31
 
32
- if __name__ == "__main__":
33
  main()
 
1
+ """FastAPI application for the SupportDesk environment."""
2
 
3
  from __future__ import annotations
4
 
5
  import os
6
+ from typing import Any
7
 
8
  import uvicorn
9
+ from fastapi import Body, HTTPException
10
+ from fastapi.routing import APIRoute
11
 
12
  try:
13
+ from openenv.core.env_server import http_server as openenv_http_server
14
+ except ImportError:
15
+ try:
16
+ from openenv_core.env_server import http_server as openenv_http_server
17
+ except Exception as e: # pragma: no cover
18
+ raise ImportError(
19
+ "openenv is required for the web interface. Install dependencies with '\n uv sync\n'"
20
+ ) from e
21
 
22
+ from models import SupportDeskAction, SupportDeskObservation, SupportDeskState
23
+ from server.supportdesk_environment import SupportDeskEnvironment
24
+ from tasks import TASKS
25
 
26
+ # Bind the default OpenEnv /state route to the full typed state model.
27
+ openenv_http_server.State = SupportDeskState
28
+ create_app = openenv_http_server.create_app
29
+
30
+ # Create the app with web interface and README integration.
31
  app = create_app(
32
  SupportDeskEnvironment,
33
+ SupportDeskAction,
34
+ SupportDeskObservation,
35
  env_name="supportdesk_env",
36
+ max_concurrent_envs=1, # increase this number to allow more concurrent WebSocket sessions
37
  )
38
 
39
 
40
+ TASK_GRADER_PATHS = {
41
+ "billing_refund_easy": "graders:BillingRefundEasyGrader",
42
+ "account_takeover_medium": "graders:AccountTakeoverMediumGrader",
43
+ "api_incident_hard": "graders:ApiIncidentHardGrader",
44
+ "regulated_export_exception_hard": "graders:RegulatedExportExceptionHardGrader",
45
+ }
46
+
47
+
48
+ def _replace_route(path: str, methods: set[str]) -> None:
49
+ """Remove a generated route so we can register a score-aware replacement."""
50
+
51
+ app.router.routes = [
52
+ route
53
+ for route in app.router.routes
54
+ if not (
55
+ isinstance(route, APIRoute)
56
+ and route.path == path
57
+ and methods.issubset(set(route.methods or set()))
58
+ )
59
+ ]
60
+
61
+
62
+ def _score_response(env: SupportDeskEnvironment, observation: SupportDeskObservation) -> dict[str, Any]:
63
+ """Return the standard OpenEnv shape plus an explicit top-level score."""
64
+
65
+ return {
66
+ "observation": observation.model_dump(),
67
+ "reward": observation.reward,
68
+ "done": observation.done,
69
+ "score": env.state.current_score,
70
+ }
71
+
72
+
73
+ _replace_route("/reset", {"POST"})
74
+ _replace_route("/step", {"POST"})
75
+
76
+
77
+ @app.post("/reset")
78
+ async def reset_with_score(
79
+ request: openenv_http_server.ResetRequest = Body(default_factory=openenv_http_server.ResetRequest),
80
+ ) -> dict[str, Any]:
81
+ """Reset the environment and expose the initial deterministic score at top level."""
82
+
83
+ env = SupportDeskEnvironment()
84
+ try:
85
+ kwargs = request.model_dump(exclude_unset=True)
86
+ observation = env.reset(**kwargs)
87
+ return _score_response(env, observation)
88
+ finally:
89
+ env.close()
90
+
91
+
92
+ @app.post("/step")
93
+ async def step_with_score(request: openenv_http_server.StepRequest) -> dict[str, Any]:
94
+ """Execute a step and expose the current deterministic score at top level."""
95
+
96
+ action_data = request.action
97
+ try:
98
+ action = openenv_http_server.deserialize_action(action_data, SupportDeskAction)
99
+ except openenv_http_server.ValidationError as exc:
100
+ raise HTTPException(status_code=422, detail=exc.errors()) from exc
101
+
102
+ env = SupportDeskEnvironment()
103
+ try:
104
+ kwargs = request.model_dump(exclude_unset=True, exclude={"action"})
105
+ observation = env.step(action, **kwargs)
106
+ return _score_response(env, observation)
107
+ finally:
108
+ env.close()
109
+
110
+
111
+ @app.get("/tasks")
112
+ def list_tasks() -> dict[str, Any]:
113
+ """Expose a stable task catalog for UI, debugging, and pre-submit checks."""
114
+
115
+ return {
116
+ "environment": {
117
+ "name": "supportdesk_env",
118
+ "version": "0.1.0",
119
+ "grader_type": "deterministic",
120
+ "score_range": [0.0, 1.0],
121
+ },
122
+ "total_tasks": len(TASKS),
123
+ "tasks": [
124
+ {
125
+ "task_id": task.task_id,
126
+ "grader": TASK_GRADER_PATHS[task.task_id],
127
+ "title": task.title,
128
+ "difficulty": task.difficulty,
129
+ "objective": task.objective,
130
+ "max_steps": task.max_steps,
131
+ "gold_issue_type": task.gold_issue_type,
132
+ "gold_queue": task.gold_queue,
133
+ "gold_priority": task.gold_priority,
134
+ "ticket_context": {
135
+ "customer_tier": task.ticket.customer_tier,
136
+ "region": task.ticket.region,
137
+ "affected_users": task.ticket.affected_users,
138
+ "sla_minutes_remaining": task.ticket.sla_minutes_remaining,
139
+ },
140
+ }
141
+ for task in TASKS.values()
142
+ ],
143
+ }
144
+
145
+
146
+ @app.get("/episodes/{episode_id}/state", response_model=SupportDeskState)
147
+ def get_episode_state(episode_id: str) -> SupportDeskState:
148
+ """Optional explicit state helper for robust episode-addressable inspection."""
149
+
150
+ try:
151
+ return SupportDeskEnvironment.state_for_episode(episode_id)
152
+ except ValueError as exc:
153
+ raise HTTPException(status_code=404, detail=str(exc)) from exc
154
+
155
+
156
+ @app.post("/episodes/{episode_id}/step")
157
+ def step_episode(
158
+ episode_id: str,
159
+ payload: dict[str, Any] = Body(...),
160
+ ) -> dict[str, Any]:
161
+ """Optional explicit step helper that does not require sticky request context."""
162
+
163
+ action_payload = payload.get("action")
164
+ if not isinstance(action_payload, dict):
165
+ raise HTTPException(status_code=422, detail="Request body must include an 'action' object.")
166
+
167
+ timeout_s = payload.get("timeout_s")
168
+ try:
169
+ action = SupportDeskAction.model_validate(action_payload)
170
+ env = SupportDeskEnvironment()
171
+ observation = env.step(action, timeout_s=timeout_s, episode_id=episode_id)
172
+ except ValueError as exc:
173
+ raise HTTPException(status_code=404, detail=str(exc)) from exc
174
+
175
+ return {
176
+ "observation": observation.model_dump(),
177
+ "reward": observation.reward,
178
+ "done": observation.done,
179
+ "score": SupportDeskEnvironment.state_for_episode(episode_id).current_score,
180
+ }
181
+
182
+
183
+ def main(host: str = "0.0.0.0", port: int = 8000) -> None:
184
+ """
185
+ Entry point for direct execution via uv run or python -m.
186
+
187
+ This function enables running the server without Docker:
188
+ uv run --project . server
189
+ uv run --project . server --port 8001
190
+ python -m server.app
191
+
192
+ Args:
193
+ host: Host address to bind to (default: "0.0.0.0")
194
+ port: Port number to listen on (default: 8000)
195
+
196
+ For production deployments, consider using uvicorn directly with
197
+ multiple workers:
198
+ uvicorn server.app:app --workers 4
199
+ """
200
 
201
+ uvicorn.run("server.app:app", host=host, port=port)
 
202
 
203
 
204
+ if __name__ == '__main__':
205
  main()
server/supportdesk_environment.py CHANGED
@@ -1,3 +1,545 @@
1
- """Compatibility wrapper for the real supportdesk_env package."""
2
 
3
- from supportdesk_env.server.supportdesk_environment import * # noqa: F401,F403
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """SupportDesk environment implementation."""
2
 
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ import threading
7
+ import uuid
8
+ from pathlib import Path
9
+ from typing import ClassVar
10
+
11
+ from graders import grade_case
12
+ from models import (
13
+ ActionHistoryEntry,
14
+ CustomerFollowUp,
15
+ SupportCaseProgress,
16
+ SupportDeskAction,
17
+ SupportDeskObservation,
18
+ SupportDeskState,
19
+ )
20
+ from openenv_compat import Environment, EnvironmentMetadata
21
+ from tasks import (
22
+ ALL_ISSUE_TYPES,
23
+ ALL_PRIORITIES,
24
+ ALL_QUEUES,
25
+ ALL_STATUSES,
26
+ SupportTaskSpec,
27
+ get_task,
28
+ list_task_ids,
29
+ )
30
+
31
+
32
+ class SupportDeskEnvironment(
33
+ Environment[SupportDeskAction, SupportDeskObservation, SupportDeskState]
34
+ ):
35
+ """A realistic customer support triage environment with dense rewards."""
36
+
37
+ _state_lock: ClassVar[threading.RLock] = threading.RLock()
38
+ _episode_store: ClassVar[dict[str, SupportDeskState]] = {}
39
+ _episode_task_ids: ClassVar[dict[str, str]] = {}
40
+ _latest_episode_id: ClassVar[str | None] = None
41
+ _shared_reset_counter: ClassVar[int] = 0
42
+
43
def __init__(self, task_id: str | None = None):
    """Create an environment instance bound to an initial task.

    Args:
        task_id: Task to pin this instance to. When omitted (or empty), the
            ``SUPPORTDESK_TASK_ID`` environment variable is consulted; when
            that is also unset or empty, the first registered task is used
            and the instance is considered unpinned.
    """
    super().__init__()
    # Treat empty strings as "not provided". Otherwise the instance would be
    # flagged as explicitly pinned while actually falling back to the default
    # task, which silently switches off task rotation in reset().
    pinned_task_id = task_id or os.getenv("SUPPORTDESK_TASK_ID") or None
    self._explicit_task_id = pinned_task_id is not None
    self.task: SupportTaskSpec = get_task(pinned_task_id or list_task_ids()[0])

    # Per-episode bookkeeping; overwritten whenever an episode is loaded.
    self._max_steps = self.task.max_steps
    self._step_count = 0
    self._reward_total = 0.0
    self._done = False
    self._last_feedback = ""
    self._history: list[ActionHistoryEntry] = []
    self._case = SupportCaseProgress()
    self._episode_id: str | None = None
    self._current_sla_minutes_remaining = self.task.ticket.sla_minutes_remaining

    # Grade the empty case so score and milestones start from the grader's
    # own baseline.
    initial_grade = grade_case(self.task, self._case)
    self._score = initial_grade.total_score
    self._completed_milestones = list(initial_grade.completed_milestones)
61
+
62
@classmethod
def _build_initial_state(cls, task: SupportTaskSpec, episode_id: str) -> SupportDeskState:
    """Build the stored snapshot for a brand-new episode of ``task``.

    The snapshot starts from an empty ``SupportCaseProgress``; its score and
    milestones come from grading that empty case. Stage, next actions and
    risk flags are written as fixed initial values rather than computed.
    """
    blank_case = SupportCaseProgress()
    baseline = grade_case(task, blank_case)
    snapshot_fields = dict(
        episode_id=episode_id,
        task_id=task.task_id,
        difficulty=task.difficulty,
        step_count=0,
        reward=0.0,
        done=False,
        current_score=baseline.total_score,
        max_steps=task.max_steps,
        case=blank_case,
        current_sla_minutes_remaining=task.ticket.sla_minutes_remaining,
        workflow_stage="intake",
        required_next_actions=["classify"],
        risk_flags=[],
        action_history=[],
        completed_milestones=list(baseline.completed_milestones),
        last_feedback="New case loaded. Review the ticket and policy snippets before acting.",
    )
    return SupportDeskState(**snapshot_fields)
84
+
85
+ @classmethod
86
+ def _extract_episode_id(cls, episode_id: str | None = None, **kwargs) -> str | None:
87
+ if episode_id:
88
+ return episode_id
89
+ for key in ("episode_id", "request_id"):
90
+ value = kwargs.get(key)
91
+ if isinstance(value, str) and value:
92
+ return value
93
+ return None
94
+
95
def _load_episode(self, episode_id: str | None = None, **kwargs) -> None:
    """Hydrate this instance from a stored episode snapshot.

    The episode to load is resolved in this order:
      1. an explicit ``episode_id`` argument (or ``request_id`` keyword),
      2. the episode this instance is already bound to,
      3. the most recently persisted episode of any instance.

    Step 2 keeps an instance that owns an episode from being redirected to
    another instance's episode just because that one was touched last; a
    freshly constructed instance still falls through to step 3. When nothing
    resolves, the instance is left untouched.

    Raises:
        ValueError: If the resolved id has no stored snapshot.
    """
    registry = self.__class__
    resolved_episode_id = (
        self._extract_episode_id(episode_id, **kwargs)
        or self._episode_id
        or registry._latest_episode_id
    )
    if not resolved_episode_id:
        return

    episode_state = registry._episode_store.get(resolved_episode_id)
    if episode_state is None:
        raise ValueError(
            f"Unknown episode_id '{resolved_episode_id}'. Call reset() first or provide a valid episode_id."
        )

    # Prefer the recorded task id; fall back to the one inside the snapshot.
    task = get_task(registry._episode_task_ids.get(resolved_episode_id, episode_state.task_id))
    self.task = task
    self._max_steps = episode_state.max_steps
    self._step_count = episode_state.step_count
    self._reward_total = episode_state.reward
    self._done = episode_state.done
    self._last_feedback = episode_state.last_feedback
    # Deep copies keep later in-place edits from leaking into the store.
    self._history = [entry.model_copy(deep=True) for entry in episode_state.action_history]
    self._case = episode_state.case.model_copy(deep=True)
    self._episode_id = resolved_episode_id
    self._score = episode_state.current_score
    self._completed_milestones = list(episode_state.completed_milestones)
    self._current_sla_minutes_remaining = episode_state.current_sla_minutes_remaining
119
+
120
def _persist_episode(self) -> None:
    """Write this instance's episode back to the class-level store.

    Does nothing while the instance has no episode id. Otherwise the stored
    snapshot is replaced, the episode's task id is recorded, and the episode
    becomes the class-wide "latest" one.
    """
    episode_id = self._episode_id
    if episode_id is None:
        return

    snapshot = SupportDeskState(
        episode_id=episode_id,
        task_id=self.task.task_id,
        difficulty=self.task.difficulty,
        step_count=self._step_count,
        reward=round(self._reward_total, 4),
        done=self._done,
        current_score=round(self._score, 4),
        max_steps=self._max_steps,
        case=self._case.model_copy(deep=True),
        current_sla_minutes_remaining=self._current_sla_minutes_remaining,
        workflow_stage=self._workflow_stage(),
        required_next_actions=self._required_next_actions(),
        risk_flags=self._risk_flags(),
        action_history=[entry.model_copy(deep=True) for entry in self._history],
        completed_milestones=list(self._completed_milestones),
        last_feedback=self._last_feedback,
    )

    registry = self.__class__
    registry._episode_store[episode_id] = snapshot
    registry._episode_task_ids[episode_id] = self.task.task_id
    registry._latest_episode_id = episode_id
143
+
144
@property
def state(self) -> SupportDeskState:
    """Return a fresh snapshot of the episode after reloading it from the store.

    The reload and the snapshot construction both happen under the
    class-level lock. Case and history entries are deep-copied into the
    returned object.
    """
    with self.__class__._state_lock:
        self._load_episode()
        history_copy = [entry.model_copy(deep=True) for entry in self._history]
        case_copy = self._case.model_copy(deep=True)
        return SupportDeskState(
            episode_id=self._episode_id,
            task_id=self.task.task_id,
            difficulty=self.task.difficulty,
            step_count=self._step_count,
            reward=round(self._reward_total, 4),
            done=self._done,
            current_score=round(self._score, 4),
            max_steps=self._max_steps,
            case=case_copy,
            current_sla_minutes_remaining=self._current_sla_minutes_remaining,
            workflow_stage=self._workflow_stage(),
            required_next_actions=self._required_next_actions(),
            risk_flags=self._risk_flags(),
            action_history=history_copy,
            completed_milestones=list(self._completed_milestones),
            last_feedback=self._last_feedback,
        )
166
+
167
def reset(
    self,
    seed: int | None = None,
    episode_id: str | None = None,
    **kwargs,
) -> SupportDeskObservation:
    """Start a new episode and return its first observation.

    Instances that were not pinned to a task rotate through the registered
    tasks using a counter shared by all instances.

    Args:
        seed: Accepted for interface compatibility; not used here.
        episode_id: Identifier to store the episode under. A fresh id of the
            form ``<task_id>-<8 hex chars>`` is generated when omitted.
        **kwargs: Ignored.

    Returns:
        The initial observation, with reward 0.0 and ``done`` False.
    """
    registry = self.__class__
    with registry._state_lock:
        if not self._explicit_task_id:
            task_ids = list_task_ids()
            next_task_id = task_ids[registry._shared_reset_counter % len(task_ids)]
            registry._shared_reset_counter += 1
            self.task = get_task(next_task_id)
        self._max_steps = self.task.max_steps

        self._episode_id = episode_id or f"{self.task.task_id}-{uuid.uuid4().hex[:8]}"
        registry._episode_store[self._episode_id] = registry._build_initial_state(
            self.task, self._episode_id
        )
        registry._episode_task_ids[self._episode_id] = self.task.task_id
        registry._latest_episode_id = self._episode_id
        self._load_episode(self._episode_id)

        # The initial snapshot carries fixed placeholder values (notably an
        # empty risk-flag list). Persist once so the stored state reports the
        # same computed stage, next actions and risk flags as the observation
        # returned below.
        self._persist_episode()
        return self._build_observation(reward=0.0, done=False)
187
+
188
def step(
    self,
    action: SupportDeskAction,
    timeout_s: float | None = None,
    episode_id: str | None = None,
    **kwargs,
) -> SupportDeskObservation:
    """Apply one action to the episode and return the resulting observation.

    The reward is the change in graded score plus the process bonus and the
    (non-positive) action penalty, rounded to four decimals. The episode
    ends on a ``submit`` action or when the step budget is exhausted. A step
    on an already finished episode changes nothing and yields a -0.05 reward.

    Args:
        action: Action to apply.
        timeout_s: Accepted for interface compatibility; not used here.
        episode_id: Episode to act on; forwarded to the episode loader.
        **kwargs: Forwarded to the episode loader.
    """
    with self.__class__._state_lock:
        self._load_episode(episode_id, **kwargs)

        if self._done:
            return self._build_observation(
                reward=-0.05,
                done=True,
                feedback="Episode already finished. Call reset() before taking more actions.",
            )

        # Capture the "before" picture, then mutate in the established order:
        # action, step counter, external events, SLA clock.
        grade_before = grade_case(self.task, self._case)
        stage_before = self._workflow_stage()
        self._apply_action(action)
        self._step_count += 1
        self._advance_external_events(action)
        self._degrade_sla()

        grade_after = grade_case(self.task, self._case)
        reward = grade_after.total_score - grade_before.total_score
        reward += self._process_bonus(action, stage_before, grade_after.total_score)
        reward += self._action_penalty(
            action,
            grade_after.total_score,
            grade_before.total_score,
        )
        reward = round(reward, 4)

        self._score = grade_after.total_score
        self._completed_milestones = list(grade_after.completed_milestones)

        if action.operation == "submit":
            self._done = True
            self._last_feedback = (
                "Case submitted. Final deterministic grade is "
                f"{grade_after.total_score:.2f}."
            )
        elif self._step_count >= self._max_steps:
            self._done = True
            self._last_feedback = (
                f"Reached max steps ({self._max_steps}). Final deterministic grade is "
                f"{grade_after.total_score:.2f}."
            )
        else:
            self._last_feedback = self._build_feedback(grade_after, reward)

        self._reward_total = round(self._reward_total + reward, 4)
        history_entry = ActionHistoryEntry(
            step=self._step_count,
            operation=action.operation,
            summary=self._summarize_action(action),
            reward_delta=reward,
        )
        self._history.append(history_entry)
        self._persist_episode()

        return self._build_observation(reward=reward, done=self._done)
252
+
253
@classmethod
def state_for_episode(cls, episode_id: str) -> SupportDeskState:
    """Return a deep copy of the stored snapshot for ``episode_id``.

    Raises:
        ValueError: If no snapshot is stored under that id.
    """
    with cls._state_lock:
        stored = cls._episode_store.get(episode_id)
        if stored is not None:
            return stored.model_copy(deep=True)
        raise ValueError(f"Unknown episode_id '{episode_id}'. Call reset() first.")
260
+
261
def close(self) -> None:
    """Do nothing; exists so local scripts can call ``close()`` uniformly."""
    return None
263
+
264
def get_metadata(self) -> EnvironmentMetadata:
    """Describe the environment for docs, validators, and the HF Space UI.

    The README that sits one directory above this file is attached as
    ``readme_content`` when it exists; otherwise that field is ``None``.
    """
    readme_path = Path(__file__).resolve().parents[1] / "README.md"
    readme_content = None
    if readme_path.exists():
        readme_content = readme_path.read_text(encoding="utf-8")

    description = (
        "A policy-heavy enterprise operations desk with deterministic grading, delayed "
        "customer follow-ups, SLA pressure, escalation tradeoffs, and sharper cross-functional triage."
    )
    return EnvironmentMetadata(
        name="supportdesk_env",
        description=description,
        readme_content=readme_content,
        version="0.1.0",
        author="HyperBrick",
    )
279
+
280
+ def _apply_action(self, action: SupportDeskAction) -> None:
281
+ if action.operation == "classify":
282
+ if action.queue is not None:
283
+ self._case.queue = action.queue
284
+ if action.priority is not None:
285
+ self._case.priority = action.priority
286
+ if action.issue_type is not None:
287
+ self._case.issue_type = action.issue_type
288
+ return
289
+
290
+ if action.operation == "request_info":
291
+ if action.requested_fields:
292
+ merged = {item for item in self._case.requested_fields}
293
+ merged.update(action.requested_fields)
294
+ self._case.requested_fields = sorted(merged)
295
+ if self.task.follow_up_outcome != "none" and self._case.customer_follow_up.status == "none":
296
+ self._case.customer_follow_up = CustomerFollowUp(status="pending")
297
+ return
298
+
299
+ if action.operation == "draft_reply":
300
+ if action.reply is not None:
301
+ self._case.reply = action.reply
302
+ return
303
+
304
+ if action.operation == "add_internal_note":
305
+ if action.internal_note is not None:
306
+ self._case.internal_note = action.internal_note
307
+ return
308
+
309
+ if action.operation == "submit":
310
+ if action.status is not None:
311
+ self._case.status = action.status
312
+ if action.resolution_code is not None:
313
+ self._case.resolution_code = action.resolution_code
314
+
315
+ def _advance_external_events(self, action: SupportDeskAction) -> None:
316
+ if self._case.customer_follow_up.status == "pending" and action.operation == "wait":
317
+ self._case.customer_follow_up = CustomerFollowUp(
318
+ status=self.task.follow_up_outcome,
319
+ message=self.task.follow_up_message or None,
320
+ provided_fields=list(self.task.follow_up_provided_fields),
321
+ wrong_fields=list(self.task.follow_up_wrong_fields),
322
+ )
323
+
324
+ def _degrade_sla(self) -> None:
325
+ if self._current_sla_minutes_remaining is None:
326
+ return
327
+ self._current_sla_minutes_remaining = max(
328
+ 0,
329
+ self._current_sla_minutes_remaining - self.task.sla_step_cost,
330
+ )
331
+
332
+ def _action_penalty(
333
+ self,
334
+ action: SupportDeskAction,
335
+ current_score: float,
336
+ previous_score: float,
337
+ ) -> float:
338
+ penalty = 0.0
339
+ if current_score <= previous_score:
340
+ penalty -= 0.03
341
+ penalty -= self._mixed_action_penalty(action)
342
+ penalty -= self._escalation_tradeoff_penalty()
343
+ if action.operation == "draft_reply" and not action.reply:
344
+ penalty -= 0.03
345
+ if action.operation == "request_info" and not action.requested_fields:
346
+ penalty -= 0.03
347
+ if action.operation == "add_internal_note" and not action.internal_note:
348
+ penalty -= 0.03
349
+ if action.operation == "classify" and not any(
350
+ [action.queue, action.priority, action.issue_type, action.status, action.resolution_code]
351
+ ):
352
+ penalty -= 0.03
353
+ if action.operation == "wait" and self._case.customer_follow_up.status != "pending":
354
+ penalty -= 0.02
355
+ if action.operation == "submit" and self._required_next_actions():
356
+ penalty -= 0.08
357
+ if (
358
+ self.task.under_escalation_deadline_step is not None
359
+ and self._step_count >= self.task.under_escalation_deadline_step
360
+ and (self._case.queue != self.task.gold_queue or self._case.priority != self.task.gold_priority)
361
+ ):
362
+ penalty -= 0.04
363
+ if self._current_sla_minutes_remaining is not None and self._current_sla_minutes_remaining <= 15:
364
+ penalty -= 0.02
365
+ return round(penalty, 4)
366
+
367
+ def _build_feedback(self, grade, reward: float) -> str:
368
+ return (
369
+ f"Reward delta {reward:+.2f}. Current score {grade.total_score:.2f}. "
370
+ f"SLA remaining: {self._current_sla_minutes_remaining if self._current_sla_minutes_remaining is not None else 'n/a'} minutes. "
371
+ f"Stage: {self._workflow_stage()}. "
372
+ f"Customer follow-up: {self._case.customer_follow_up.status}. "
373
+ f"Next actions: {', '.join(self._required_next_actions()) or 'none'}. "
374
+ f"Completed milestones: {', '.join(grade.completed_milestones) or 'none yet'}."
375
+ )
376
+
377
+ def _summarize_action(self, action: SupportDeskAction) -> str:
378
+ parts = [action.operation]
379
+ if action.queue:
380
+ parts.append(f"queue={action.queue}")
381
+ if action.priority:
382
+ parts.append(f"priority={action.priority}")
383
+ if action.issue_type:
384
+ parts.append(f"issue_type={action.issue_type}")
385
+ if action.status:
386
+ parts.append(f"status={action.status}")
387
+ if action.resolution_code:
388
+ parts.append(f"resolution={action.resolution_code}")
389
+ if action.requested_fields:
390
+ parts.append(f"requested={','.join(action.requested_fields)}")
391
+ if action.reply:
392
+ parts.append("reply=yes")
393
+ if action.internal_note:
394
+ parts.append("note=yes")
395
+ return " | ".join(parts)
396
+
397
def _build_observation(
    self,
    reward: float,
    done: bool,
    feedback: str | None = None,
) -> SupportDeskObservation:
    """Assemble the observation handed back to the agent.

    Args:
        reward: Reward to report for the step that produced this observation.
        done: Whether the episode has finished.
        feedback: Message to show; the last stored feedback is used when
            this is empty or omitted.
    """
    task = self.task
    message = feedback or self._last_feedback
    steps_left = max(self._max_steps - self._step_count, 0)
    return SupportDeskObservation(
        task_id=task.task_id,
        difficulty=task.difficulty,
        objective=task.objective,
        ticket=task.ticket,
        knowledge_base=list(task.knowledge_base),
        available_queues=list(ALL_QUEUES),
        available_priorities=list(ALL_PRIORITIES),
        available_statuses=list(ALL_STATUSES),
        available_issue_types=list(ALL_ISSUE_TYPES),
        case=self._case.model_copy(deep=True),
        current_sla_minutes_remaining=self._current_sla_minutes_remaining,
        workflow_stage=self._workflow_stage(),
        required_next_actions=self._required_next_actions(),
        risk_flags=self._risk_flags(),
        action_history=[entry.model_copy(deep=True) for entry in self._history],
        feedback=message,
        remaining_steps=steps_left,
        reward=reward,
        done=done,
    )
424
+
425
+ def _workflow_stage(self) -> str:
426
+ if self._done:
427
+ return "closed"
428
+ if self._case.queue is None or self._case.priority is None or self._case.issue_type is None:
429
+ return "intake"
430
+ if self.task.required_requested_fields and sorted(self._case.requested_fields) != sorted(self.task.required_requested_fields):
431
+ return "verification"
432
+ if self._case.customer_follow_up.status == "pending":
433
+ return "awaiting_customer"
434
+ if self._case.customer_follow_up.status in {"partial", "incorrect"}:
435
+ return "follow_up_review"
436
+ if not self._case.reply:
437
+ return "customer_communication"
438
+ if not self._case.internal_note:
439
+ return "internal_handoff"
440
+ if self._case.status != self.task.gold_status or self._case.resolution_code != self.task.gold_resolution_code:
441
+ return "final_resolution"
442
+ return "ready_to_submit"
443
+
444
+ def _required_next_actions(self) -> list[str]:
445
+ if self._case.queue is None or self._case.priority is None or self._case.issue_type is None:
446
+ return ["classify"]
447
+ if self.task.required_requested_fields and sorted(self._case.requested_fields) != sorted(self.task.required_requested_fields):
448
+ return ["request_info"]
449
+ if self._case.customer_follow_up.status == "pending":
450
+ return ["wait"]
451
+ needed: list[str] = []
452
+ if not self._case.reply:
453
+ needed.append("draft_reply")
454
+ if not self._case.internal_note:
455
+ needed.append("add_internal_note")
456
+ if self._case.status != self.task.gold_status or self._case.resolution_code != self.task.gold_resolution_code:
457
+ needed.append("submit")
458
+ return needed
459
+
460
+ def _risk_flags(self) -> list[str]:
461
+ flags = list(self.task.risk_flags)
462
+ if self._current_sla_minutes_remaining is not None and self._current_sla_minutes_remaining <= 30:
463
+ flags.append("sla_breach_risk")
464
+ if self.task.ticket.affected_users and self.task.ticket.affected_users >= 1000:
465
+ flags.append("high_customer_impact")
466
+ if self.task.ticket.secondary_concerns:
467
+ flags.append("secondary_issue_present")
468
+ if self._case.customer_follow_up.status == "partial":
469
+ flags.append("customer_reply_incomplete")
470
+ if self._case.customer_follow_up.status == "incorrect":
471
+ flags.append("customer_reply_irrelevant")
472
+ return sorted(set(flags))
473
+
474
+ def _process_bonus(
475
+ self,
476
+ action: SupportDeskAction,
477
+ previous_stage: str,
478
+ current_score: float,
479
+ ) -> float:
480
+ bonus = 0.0
481
+ stage_rank = {
482
+ "intake": 0,
483
+ "verification": 1,
484
+ "awaiting_customer": 2,
485
+ "follow_up_review": 3,
486
+ "customer_communication": 4,
487
+ "internal_handoff": 5,
488
+ "final_resolution": 6,
489
+ "ready_to_submit": 7,
490
+ "closed": 8,
491
+ }
492
+ current_stage = self._workflow_stage()
493
+ if stage_rank.get(current_stage, 0) > stage_rank.get(previous_stage, 0):
494
+ bonus += 0.02
495
+ if action.operation == "classify" and self._step_count == 1:
496
+ if self._case.queue == self.task.gold_queue and self._case.priority == self.task.gold_priority:
497
+ bonus += 0.03
498
+ if action.operation == "request_info" and current_score > 0 and self.task.required_requested_fields:
499
+ bonus += 0.02
500
+ if action.operation == "wait" and self._case.customer_follow_up.status in {"partial", "complete", "incorrect"}:
501
+ bonus += 0.02
502
+ if action.operation == "submit" and not self._required_next_actions():
503
+ bonus += 0.03
504
+ if self._current_sla_minutes_remaining is not None and self._current_sla_minutes_remaining > 0:
505
+ if self.task.gold_priority == "urgent" and self._step_count <= 2 and self._case.queue == self.task.gold_queue:
506
+ bonus += 0.02
507
+ return round(bonus, 4)
508
+
509
+ def _mixed_action_penalty(self, action: SupportDeskAction) -> float:
510
+ allowed_fields = {
511
+ "classify": {"queue", "priority", "issue_type"},
512
+ "request_info": {"requested_fields"},
513
+ "draft_reply": {"reply"},
514
+ "add_internal_note": {"internal_note"},
515
+ "submit": {"status", "resolution_code"},
516
+ "wait": set(),
517
+ }
518
+ populated_fields = {
519
+ "queue": action.queue,
520
+ "priority": action.priority,
521
+ "issue_type": action.issue_type,
522
+ "status": action.status,
523
+ "resolution_code": action.resolution_code,
524
+ "requested_fields": action.requested_fields,
525
+ "reply": action.reply,
526
+ "internal_note": action.internal_note,
527
+ }
528
+ extras = 0
529
+ for field_name, value in populated_fields.items():
530
+ if field_name in allowed_fields[action.operation]:
531
+ continue
532
+ if value is None:
533
+ continue
534
+ if isinstance(value, list) and not value:
535
+ continue
536
+ if isinstance(value, str) and not value:
537
+ continue
538
+ extras += 1
539
+ return min(0.06, extras * 0.02)
540
+
541
+ def _escalation_tradeoff_penalty(self) -> float:
542
+ penalty = 0.0
543
+ if self._case.queue in self.task.over_escalation_queues and self._case.queue != self.task.gold_queue:
544
+ penalty += 0.06
545
+ return round(penalty, 4)
tasks.py CHANGED
@@ -1,3 +1,405 @@
1
- """Compatibility wrapper for the real supportdesk_env package."""
2
 
3
- from supportdesk_env.tasks import * # noqa: F401,F403
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Task registry for the SupportDesk environment."""
2
 
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from typing import Literal
7
+
8
+ from models import KnowledgeSnippet, SupportTicket
9
+
10
+
11
# Option lists advertised to the agent in every observation.

# Queues a case can be routed to.
ALL_QUEUES = [
    "billing_ops",
    "trust_and_safety",
    "platform_engineering",
    "compliance_ops",
    "general_support",
]
# Priority levels, listed from lowest to highest.
ALL_PRIORITIES = ["low", "normal", "high", "urgent"]
# Statuses a case can be given.
ALL_STATUSES = ["new", "waiting_on_customer", "resolved", "escalated"]
# Issue categories available when classifying a ticket.
ALL_ISSUE_TYPES = [
    "duplicate_charge",
    "account_compromise",
    "production_incident",
    "regulated_exception",
    "general_question",
]
27
+
28
+
29
@dataclass(frozen=True)
class SupportTaskSpec:
    """Immutable definition of a single support triage task.

    Bundles what the agent sees (ticket, knowledge base, objective), the
    expected ("gold") outcome, and the per-task tuning knobs read by the
    environment.
    """

    # Identity and presentation.
    task_id: str
    difficulty: Literal["easy", "medium", "hard"]
    title: str
    objective: str
    ticket: SupportTicket
    knowledge_base: tuple[KnowledgeSnippet, ...]

    # Expected outcome the case is compared against.
    gold_queue: str
    gold_priority: str
    gold_issue_type: str
    gold_status: str
    gold_resolution_code: str

    # Fields the agent is expected to request from the customer
    # (empty when no information request is needed).
    required_requested_fields: tuple[str, ...]
    # Each inner tuple presumably lists alternative phrasings of one required
    # point in the reply / internal note - TODO confirm against the grader.
    required_reply_markers: tuple[tuple[str, ...], ...]
    required_note_markers: tuple[tuple[str, ...], ...]
    # Presumably phrases that must not appear in the customer reply -
    # TODO confirm against the grader.
    forbidden_reply_markers: tuple[str, ...] = ()

    # Static risk flags attached to the task.
    risk_flags: tuple[str, ...] = ()

    # Scripted customer follow-up; "none" means the task has no follow-up.
    follow_up_outcome: Literal["none", "partial", "complete", "incorrect"] = "none"
    follow_up_message: str = ""
    follow_up_provided_fields: tuple[str, ...] = ()
    follow_up_wrong_fields: tuple[str, ...] = ()

    # SLA minutes consumed by every step.
    sla_step_cost: int = 15
    # Queues that count as over-escalation when they are not the gold queue.
    over_escalation_queues: tuple[str, ...] = ()
    # Step number from which wrong queue/priority is penalized; None disables it.
    under_escalation_deadline_step: int | None = None
    # Step budget for one episode.
    max_steps: int = 6
57
+
58
+
59
+ TASKS: dict[str, SupportTaskSpec] = {
60
+ "billing_refund_easy": SupportTaskSpec(
61
+ task_id="billing_refund_easy",
62
+ difficulty="easy",
63
+ title="Duplicate charge refund triage",
64
+ objective=(
65
+ "Triage a duplicate-charge billing ticket, send the correct customer response, "
66
+ "and close the case only if no further customer information is required."
67
+ ),
68
+ ticket=SupportTicket(
69
+ customer_name="Riya Shah",
70
+ customer_tier="pro",
71
+ company="PixelNorth Studio",
72
+ subject="Charged twice after I canceled",
73
+ body=(
74
+ "I canceled our Pro annual workspace yesterday, but my card was charged again "
75
+ "this morning and I still see the old invoice. We only had one workspace, "
76
+ "so this looks like a duplicate charge. Please fix it quickly."
77
+ ),
78
+ region="ap-south-1",
79
+ affected_users=12,
80
+ sla_minutes_remaining=240,
81
+ business_impact="Finance ops are blocked from closing the monthly books until the duplicate invoice is fixed.",
82
+ secondary_concerns=["The customer also wants confirmation that the canceled workspace will stay deactivated."],
83
+ attachments=["invoice_7741.pdf"],
84
+ ),
85
+ knowledge_base=(
86
+ KnowledgeSnippet(
87
+ article_id="KB-101",
88
+ title="Duplicate charges and same-day cancellations",
89
+ content=(
90
+ "If a customer reports a duplicate charge and the subscription is already "
91
+ "canceled, route the ticket to billing_ops with high priority. Billing can "
92
+ "approve the refund immediately without requesting extra information when an "
93
+ "invoice is attached."
94
+ ),
95
+ ),
96
+ KnowledgeSnippet(
97
+ article_id="KB-102",
98
+ title="Refund communication checklist",
99
+ content=(
100
+ "Customer replies for approved duplicate-charge refunds must confirm that a "
101
+ "refund is being processed, mention the duplicate charge, and set the "
102
+ "expectation that funds typically appear within 5-7 business days."
103
+ ),
104
+ ),
105
+ KnowledgeSnippet(
106
+ article_id="KB-103",
107
+ title="When to close a billing case",
108
+ content=(
109
+ "Close the case as resolved only after the refund path is clear and no more "
110
+ "customer details are needed."
111
+ ),
112
+ ),
113
+ ),
114
+ gold_queue="billing_ops",
115
+ gold_priority="high",
116
+ gold_issue_type="duplicate_charge",
117
+ gold_status="resolved",
118
+ gold_resolution_code="refund_approved",
119
+ required_requested_fields=(),
120
+ required_reply_markers=(
121
+ ("refund", "refunded", "reimburse"),
122
+ ("duplicate charge", "charged twice", "double charge"),
123
+ ("5-7 business days", "5 to 7 business days", "within 7 business days"),
124
+ ),
125
+ required_note_markers=(
126
+ ("duplicate charge", "double charge"),
127
+ ("refund", "refund approved"),
128
+ ),
129
+ forbidden_reply_markers=("chargeback", "security team"),
130
+ risk_flags=("finance_close_risk", "avoid_unnecessary_back_and_forth"),
131
+ over_escalation_queues=("trust_and_safety", "platform_engineering", "compliance_ops"),
132
+ sla_step_cost=10,
133
+ max_steps=6,
134
+ ),
135
+ "account_takeover_medium": SupportTaskSpec(
136
+ task_id="account_takeover_medium",
137
+ difficulty="medium",
138
+ title="Suspicious login recovery triage",
139
+ objective=(
140
+ "Handle a potential account-compromise case, request the missing verification "
141
+ "details, communicate safe next steps, and keep the case open until the customer replies. "
142
+ "The agent must protect account safety without promising an unsafe immediate unlock."
143
+ ),
144
+ ticket=SupportTicket(
145
+ customer_name="Marcus Lee",
146
+ customer_tier="pro",
147
+ company="Northline Analytics",
148
+ subject="Locked out after strange login alert",
149
+ body=(
150
+ "Our workspace admin got a login alert from a country none of us have visited, "
151
+ "and now I can't get back into the account. Please unlock it ASAP. The billing "
152
+ "email is still ours, but I'm worried someone got in."
153
+ ),
154
+ region="us-east-1",
155
+ affected_users=34,
156
+ sla_minutes_remaining=90,
157
+ business_impact="The admin is locked out of the analytics workspace ahead of the Monday executive review.",
158
+ secondary_concerns=["The customer wants the account unlocked immediately, but the verification flow cannot be skipped."],
159
+ attachments=[],
160
+ ),
161
+ knowledge_base=(
162
+ KnowledgeSnippet(
163
+ article_id="SEC-201",
164
+ title="Account compromise routing",
165
+ content=(
166
+ "Potential account-takeover reports route to trust_and_safety with urgent "
167
+ "priority. Do not resolve the case immediately."
168
+ ),
169
+ ),
170
+ KnowledgeSnippet(
171
+ article_id="SEC-202",
172
+ title="Verification details before unlock",
173
+ content=(
174
+ "Before access can be restored, ask the customer for the workspace_id, the "
175
+ "last successful login time, and the billing email on file. Keep the status "
176
+ "waiting_on_customer until the details arrive."
177
+ ),
178
+ ),
179
+ KnowledgeSnippet(
180
+ article_id="SEC-203",
181
+ title="Customer response checklist",
182
+ content=(
183
+ "Security replies should tell the customer to reset their password, scan "
184
+ "their device for malware, and explain that the trust team is reviewing the case."
185
+ ),
186
+ ),
187
+ ),
188
+ gold_queue="trust_and_safety",
189
+ gold_priority="urgent",
190
+ gold_issue_type="account_compromise",
191
+ gold_status="waiting_on_customer",
192
+ gold_resolution_code="verification_needed",
193
+ required_requested_fields=("workspace_id", "last_successful_login", "billing_email"),
194
+ required_reply_markers=(
195
+ ("reset your password", "change your password"),
196
+ ("scan", "malware", "device check"),
197
+ ("trust team", "security team", "trust and safety"),
198
+ ),
199
+ required_note_markers=(
200
+ ("suspicious login", "strange login"),
201
+ ("locked out", "can't get back", "cannot get back"),
202
+ ),
203
+ risk_flags=("unsafe_unlock_request", "identity_verification_required"),
204
+ follow_up_outcome="partial",
205
+ follow_up_message=(
206
+ "Customer follow-up: workspace_id=ws_9021 and billing email confirmed, "
207
+ "but they could not provide the last successful login time yet."
208
+ ),
209
+ follow_up_provided_fields=("workspace_id", "billing_email"),
210
+ sla_step_cost=18,
211
+ under_escalation_deadline_step=2,
212
+ max_steps=7,
213
+ ),
214
+ "api_incident_hard": SupportTaskSpec(
215
+ task_id="api_incident_hard",
216
+ difficulty="hard",
217
+ title="Production API incident escalation",
218
+ objective=(
219
+ "Triage a high-pressure enterprise incident, ask for the right diagnostics, notify "
220
+ "the customer that engineering is engaged, and escalate instead of resolving. "
221
+ "The agent must prioritize the outage over a tempting secondary compliance question."
222
+ ),
223
+ ticket=SupportTicket(
224
+ customer_name="Asha Verma",
225
+ customer_tier="enterprise",
226
+ company="Kairo Health",
227
+ subject="EU rollout blocked by intermittent 500s",
228
+ body=(
229
+ "We're launching our EU workspace tonight. Since enabling EU data residency we "
230
+ "see intermittent HTTP 500 responses from /v1/exports in production. Our "
231
+ "compliance lead is also asking whether this affects the audit trail, but the "
232
+ "main issue is the outage. We need help immediately."
233
+ ),
234
+ region="eu-west-1",
235
+ affected_users=1800,
236
+ sla_minutes_remaining=25,
237
+ business_impact="A production launch and a customer-facing compliance review are both at risk tonight if the outage persists.",
238
+ secondary_concerns=["The compliance lead is asking whether audit trails are affected, but the live outage is the primary incident."],
239
+ attachments=["error_screenshot.png"],
240
+ ),
241
+ knowledge_base=(
242
+ KnowledgeSnippet(
243
+ article_id="INC-301",
244
+ title="Production availability incidents",
245
+ content=(
246
+ "Any active production 5xx incident for a paying customer routes to "
247
+ "platform_engineering with urgent priority and should be escalated, not resolved."
248
+ ),
249
+ ),
250
+ KnowledgeSnippet(
251
+ article_id="INC-302",
252
+ title="Minimum diagnostics for API incidents",
253
+ content=(
254
+ "Before engineering can investigate, request concrete examples including "
255
+ "request_ids, UTC timestamps, and the affected region."
256
+ ),
257
+ ),
258
+ KnowledgeSnippet(
259
+ article_id="INC-303",
260
+ title="Customer communication during an incident",
261
+ content=(
262
+ "The reply should acknowledge an incident, say the on-call engineering team "
263
+ "is engaged, and ask for the diagnostics needed to speed investigation."
264
+ ),
265
+ ),
266
+ KnowledgeSnippet(
267
+ article_id="INC-304",
268
+ title="Primary issue triage rule",
269
+ content=(
270
+ "When a production outage appears alongside a secondary compliance or audit "
271
+ "question, resolve the live outage first and avoid treating the secondary "
272
+ "question as the primary queue-driving issue."
273
+ ),
274
+ ),
275
+ ),
276
+ gold_queue="platform_engineering",
277
+ gold_priority="urgent",
278
+ gold_issue_type="production_incident",
279
+ gold_status="escalated",
280
+ gold_resolution_code="incident_opened",
281
+ required_requested_fields=("request_ids", "timestamp_utc", "region"),
282
+ required_reply_markers=(
283
+ ("incident", "outage", "investigating"),
284
+ ("on-call", "engineering team", "engineering is engaged"),
285
+ ("request id", "request_ids"),
286
+ ("utc", "timestamp"),
287
+ ),
288
+ required_note_markers=(
289
+ ("eu data residency", "eu rollout"),
290
+ ("500", "http 500"),
291
+ ("launch tonight", "tonight"),
292
+ ),
293
+ risk_flags=("sev1_launch_risk", "secondary_issue_distraction", "engineering_escalation_required"),
294
+ follow_up_outcome="complete",
295
+ follow_up_message=(
296
+ "Customer follow-up: request_ids=req_991, req_998; UTC timestamps=2026-04-01T19:20Z, "
297
+ "2026-04-01T19:27Z; region=eu-west-1."
298
+ ),
299
+ follow_up_provided_fields=("request_ids", "timestamp_utc", "region"),
300
+ sla_step_cost=25,
301
+ under_escalation_deadline_step=2,
302
+ max_steps=8,
303
+ ),
304
+ "regulated_export_exception_hard": SupportTaskSpec(
305
+ task_id="regulated_export_exception_hard",
306
+ difficulty="hard",
307
+ title="Regulated export exception triage",
308
+ objective=(
309
+ "Handle a regulated export-access request where the customer wants a manual bypass. "
310
+ "The agent must avoid escalating to engineering just because an export error is mentioned, "
311
+ "route to compliance operations, request the right legal details, and keep the case open "
312
+ "until the required approvals are supplied."
313
+ ),
314
+ ticket=SupportTicket(
315
+ customer_name="Dr. Elena Kovacs",
316
+ customer_tier="enterprise",
317
+ company="NorthSea Clinics",
318
+ subject="Need urgent export unlock before audit review",
319
+ body=(
320
+ "Our audit team needs tonight's export package, but after enabling EU residency the "
321
+ "export workflow now shows a policy block. Can you temporarily disable the restriction "
322
+ "or give us a manual bypass just for this audit? We are under pressure and can share "
323
+ "screenshots if needed."
324
+ ),
325
+ region="eu-central-1",
326
+ affected_users=240,
327
+ sla_minutes_remaining=55,
328
+ business_impact="A regulated healthcare audit review may slip tonight if the export path remains blocked.",
329
+ secondary_concerns=[
330
+ "The customer suggests an engineering workaround, but legal approval is the real gating factor."
331
+ ],
332
+ attachments=["policy_block.png"],
333
+ ),
334
+ knowledge_base=(
335
+ KnowledgeSnippet(
336
+ article_id="CMP-401",
337
+ title="Export policy exceptions",
338
+ content=(
339
+ "Requests to bypass region or export restrictions route to compliance_ops. "
340
+ "Do not promise a manual engineering workaround before approvals are verified."
341
+ ),
342
+ ),
343
+ KnowledgeSnippet(
344
+ article_id="CMP-402",
345
+ title="Minimum legal details for export review",
346
+ content=(
347
+ "Ask for the tenant_region, dpa_amendment_id, and legal_contact_email before "
348
+ "compliance can review an export exception."
349
+ ),
350
+ ),
351
+ KnowledgeSnippet(
352
+ article_id="CMP-403",
353
+ title="Customer response rules for regulated exceptions",
354
+ content=(
355
+ "Replies must explain that no temporary bypass can be granted yet, mention the "
356
+ "compliance review, and request the required legal approval details."
357
+ ),
358
+ ),
359
+ ),
360
+ gold_queue="compliance_ops",
361
+ gold_priority="high",
362
+ gold_issue_type="regulated_exception",
363
+ gold_status="waiting_on_customer",
364
+ gold_resolution_code="legal_approval_required",
365
+ required_requested_fields=("tenant_region", "dpa_amendment_id", "legal_contact_email"),
366
+ required_reply_markers=(
367
+ ("no temporary bypass", "cannot provide a bypass", "can’t provide a bypass"),
368
+ ("compliance review", "compliance team"),
369
+ ("tenant_region", "tenant region"),
370
+ ("dpa_amendment_id", "dpa amendment", "amendment id"),
371
+ ),
372
+ required_note_markers=(
373
+ ("audit", "audit review"),
374
+ ("eu residency", "policy block"),
375
+ ("manual bypass", "workaround"),
376
+ ),
377
+ forbidden_reply_markers=("engineering workaround", "disable the restriction", "temporary unlock approved"),
378
+ risk_flags=("regulated_data_risk", "unsafe_shortcut_pressure", "over_escalation_risk"),
379
+ follow_up_outcome="incorrect",
380
+ follow_up_message=(
381
+ "Customer follow-up: sent a screenshot and export job ID, but did not include the DPA "
382
+ "amendment ID or legal contact."
383
+ ),
384
+ follow_up_wrong_fields=("screenshot", "job_id"),
385
+ sla_step_cost=16,
386
+ over_escalation_queues=("platform_engineering",),
387
+ max_steps=8,
388
+ ),
389
+ }
390
+
391
+
392
def get_task(task_id: str) -> SupportTaskSpec:
    """Look up the task specification registered under ``task_id``.

    Args:
        task_id: Key of the desired entry in the module-level ``TASKS`` mapping.

    Returns:
        The ``SupportTaskSpec`` stored under ``task_id``.

    Raises:
        ValueError: If ``task_id`` is not a key of ``TASKS``. The message lists
            every registered id in sorted order, and the originating
            ``KeyError`` is chained as the cause.
    """

    try:
        spec = TASKS[task_id]
    except KeyError as missing:  # pragma: no cover - defensive
        # Sort the ids so the error text is deterministic regardless of
        # the order in which tasks were registered.
        known_ids = ", ".join(sorted(TASKS))
        message = f"Unknown task_id '{task_id}'. Valid task ids: {known_ids}"
        raise ValueError(message) from missing
    return spec
400
+
401
+
402
def list_task_ids() -> list[str]:
    """Return the ids of all registered tasks as a new list.

    The ids come back in the iteration order of ``TASKS``, which gives
    callers a stable evaluation order from run to run.
    """

    return [task_id for task_id in TASKS]
tests/test_supportdesk.py CHANGED
@@ -10,10 +10,10 @@ try:
10
  except RuntimeError:
11
  TestClient = None # type: ignore[assignment]
12
 
13
- from supportdesk_env.graders import grade_case
14
- from supportdesk_env.models import SupportCaseProgress, SupportDeskAction
15
- from supportdesk_env.server.supportdesk_environment import SupportDeskEnvironment
16
- from supportdesk_env.tasks import get_task, list_task_ids
17
 
18
 
19
  def test_all_tasks_are_registered():
@@ -90,13 +90,13 @@ def test_grade_is_bounded_between_zero_and_one():
90
 
91
 
92
  def test_task_specific_graders_are_importable_and_clamped():
93
- from supportdesk_env.graders import (
94
  AccountTakeoverMediumGrader,
95
  ApiIncidentHardGrader,
96
  BillingRefundEasyGrader,
97
  RegulatedExportExceptionHardGrader,
98
  )
99
- from supportdesk_env.models import SupportCaseProgress
100
 
101
  case = SupportCaseProgress()
102
  scores = [
@@ -176,7 +176,7 @@ def test_follow_up_arrives_after_wait():
176
 
177
  @pytest.mark.skipif(TestClient is None, reason="httpx is not installed for FastAPI TestClient")
178
  def test_http_reset_step_state_are_session_consistent():
179
- from supportdesk_env.server.app import app
180
 
181
  client = TestClient(app)
182
 
@@ -219,7 +219,7 @@ def test_http_reset_step_state_are_session_consistent():
219
 
220
  @pytest.mark.skipif(TestClient is None, reason="httpx is not installed for FastAPI TestClient")
221
  def test_http_explicit_episode_helpers_work():
222
- from supportdesk_env.server.app import app
223
 
224
  client = TestClient(app)
225
 
@@ -256,7 +256,7 @@ def test_http_explicit_episode_helpers_work():
256
 
257
  @pytest.mark.skipif(TestClient is None, reason="httpx is not installed for FastAPI TestClient")
258
  def test_http_tasks_include_truthy_grader_field():
259
- from supportdesk_env.server.app import app
260
 
261
  client = TestClient(app)
262
 
 
10
  except RuntimeError:
11
  TestClient = None # type: ignore[assignment]
12
 
13
+ from graders import grade_case
14
+ from models import SupportCaseProgress, SupportDeskAction
15
+ from server.supportdesk_environment import SupportDeskEnvironment
16
+ from tasks import get_task, list_task_ids
17
 
18
 
19
  def test_all_tasks_are_registered():
 
90
 
91
 
92
  def test_task_specific_graders_are_importable_and_clamped():
93
+ from graders import (
94
  AccountTakeoverMediumGrader,
95
  ApiIncidentHardGrader,
96
  BillingRefundEasyGrader,
97
  RegulatedExportExceptionHardGrader,
98
  )
99
+ from models import SupportCaseProgress
100
 
101
  case = SupportCaseProgress()
102
  scores = [
 
176
 
177
  @pytest.mark.skipif(TestClient is None, reason="httpx is not installed for FastAPI TestClient")
178
  def test_http_reset_step_state_are_session_consistent():
179
+ from server.app import app
180
 
181
  client = TestClient(app)
182
 
 
219
 
220
  @pytest.mark.skipif(TestClient is None, reason="httpx is not installed for FastAPI TestClient")
221
  def test_http_explicit_episode_helpers_work():
222
+ from server.app import app
223
 
224
  client = TestClient(app)
225
 
 
256
 
257
  @pytest.mark.skipif(TestClient is None, reason="httpx is not installed for FastAPI TestClient")
258
  def test_http_tasks_include_truthy_grader_field():
259
+ from server.app import app
260
 
261
  client = TestClient(app)
262