Spaces:

modelbuilderhq
/

HyperBrickCaseOps

Sleeping

App Files Files Community

modelbuilderhq committed 27 days ago

Commit

4f129c9

verified ·

1 Parent(s): 6f6f46e

Upload folder using huggingface_hub

Browse files

Files changed (17) hide show

Dockerfile +1 -1
README.md +11 -9
__init__.py +49 -2
client.py +37 -2
graders.py +168 -11
inference.py +6 -6
main.py +1 -1
models.py +121 -2
openenv.yaml +1 -1
openenv_compat.py +76 -0
policies.py +84 -0
pyproject.toml +3 -3
server/__init__.py +5 -1
server/app.py +185 -13
server/supportdesk_environment.py +544 -2
tasks.py +404 -2
tests/test_supportdesk.py +9 -9

Dockerfile CHANGED Viewed

@@ -80,4 +80,4 @@ HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
 # Run the FastAPI server
 # The module path is constructed to work with this repo's package layout.
-CMD ["sh", "-c", "cd /app/env && uvicorn supportdesk_env.server.app:app --host 0.0.0.0 --port 8000"]

 # Run the FastAPI server
 # The module path is constructed to work with this repo's package layout.
+CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 8000"]

README.md CHANGED Viewed

@@ -303,15 +303,17 @@ Examples:
 |-- pyproject.toml
 |-- Dockerfile
 |-- uv.lock
-|-- supportdesk_env
 |   |-- __init__.py
-|   |-- graders.py
-|   |-- models.py
-|   |-- policies.py
-|   |-- tasks.py
-|   `-- server
-|       |-- app.py
-|       `-- supportdesk_environment.py
 |-- tests
 |   `-- test_supportdesk.py
 `-- examples
@@ -344,7 +346,7 @@ python -m openenv.cli validate .
 Start the local server:
 ```bash
-python -m supportdesk_env.server.app
 ```
 Or use the entrypoint:

 |-- pyproject.toml
 |-- Dockerfile
 |-- uv.lock
+|-- __init__.py
+|-- client.py
+|-- graders.py
+|-- models.py
+|-- openenv_compat.py
+|-- policies.py
+|-- tasks.py
+|-- server
 |   |-- __init__.py
+|   |-- app.py
+|   `-- supportdesk_environment.py
 |-- tests
 |   `-- test_supportdesk.py
 `-- examples
 Start the local server:
 ```bash
+python -m server.app
 ```
 Or use the entrypoint:

__init__.py CHANGED Viewed

@@ -1,3 +1,50 @@
-"""Compatibility wrapper for the real supportdesk_env package."""
-from supportdesk_env import *  # noqa: F401,F403

+"""SupportDesk OpenEnv environment package (flat layout)."""
+from client import SupportDeskEnv
+from graders import (
+    AccountTakeoverMediumGrader,
+    ApiIncidentHardGrader,
+    BillingRefundEasyGrader,
+    GradeBreakdown,
+    RegulatedExportExceptionHardGrader,
+    grade_case,
+    grade_task_id,
+)
+from models import (
+    ActionHistoryEntry,
+    KnowledgeSnippet,
+    SupportCaseProgress,
+    SupportDeskAction,
+    SupportDeskObservation,
+    SupportDeskState,
+    SupportTicket,
+)
+from policies import default_note, default_reply, heuristic_action
+from server.supportdesk_environment import SupportDeskEnvironment
+from tasks import TASKS, SupportTaskSpec, get_task, list_task_ids
+__all__ = [
+    "ActionHistoryEntry",
+    "GradeBreakdown",
+    "KnowledgeSnippet",
+    "SupportCaseProgress",
+    "SupportDeskAction",
+    "SupportDeskEnv",
+    "SupportDeskEnvironment",
+    "SupportDeskObservation",
+    "SupportDeskState",
+    "SupportTaskSpec",
+    "SupportTicket",
+    "TASKS",
+    "default_note",
+    "default_reply",
+    "get_task",
+    "grade_case",
+    "grade_task_id",
+    "heuristic_action",
+    "list_task_ids",
+    "AccountTakeoverMediumGrader",
+    "ApiIncidentHardGrader",
+    "BillingRefundEasyGrader",
+    "RegulatedExportExceptionHardGrader",
+]

client.py CHANGED Viewed

@@ -1,3 +1,38 @@
-"""Compatibility wrapper for the real supportdesk_env package."""
-from supportdesk_env.client import *  # noqa: F401,F403

+"""HTTP client for interacting with a deployed SupportDesk environment."""
+from __future__ import annotations
+from models import SupportDeskAction, SupportDeskObservation, SupportDeskState
+from openenv_compat import EnvClient, StepResult
+def _validate(model_cls, payload):
+    """Instantiate ``model_cls`` from a decoded JSON ``payload``.
+    Uses ``model_cls.model_validate`` when the class provides it and falls
+    back to keyword construction otherwise.
+    """
+    if not hasattr(model_cls, "model_validate"):
+        return model_cls(**payload)  # pragma: no cover - pydantic v1 fallback
+    return model_cls.model_validate(payload)
+class SupportDeskEnv(EnvClient[SupportDeskAction, SupportDeskObservation, SupportDeskState]):
+    """Typed client for a locally running or deployed OpenEnv server.
+    The hooks below translate between the typed SupportDesk models and the
+    JSON payloads exchanged with the server.
+    """
+    def _step_payload(self, action: SupportDeskAction) -> dict:
+        """Serialize ``action`` into the JSON payload expected by the server."""
+        # Prefer pydantic v2's ``model_dump``; fall back to ``dict`` otherwise.
+        dump = action.model_dump if hasattr(action, "model_dump") else action.dict
+        return dump()
+    def _parse_state(self, payload) -> SupportDeskState:
+        """Validate a state payload into a ``SupportDeskState``."""
+        return _validate(SupportDeskState, payload)
+    def _parse_reset(self, payload) -> SupportDeskObservation:
+        """Validate a reset payload into a ``SupportDeskObservation``."""
+        return _validate(SupportDeskObservation, payload)
+    def _parse_result(self, payload) -> StepResult[SupportDeskObservation]:
+        """Build a ``StepResult`` from the ``observation``/``reward``/``done`` keys."""
+        # OpenEnv StepResult only accepts observation/reward/done in this runtime.
+        return StepResult(
+            observation=_validate(SupportDeskObservation, payload["observation"]),
+            reward=payload["reward"],
+            done=payload["done"],
+        )

graders.py CHANGED Viewed

@@ -1,14 +1,171 @@
-"""Compatibility wrapper exposing task graders from the repo root."""
-from supportdesk_env.graders import (
-    AccountTakeoverMediumGrader,
-    ApiIncidentHardGrader,
-    BillingRefundEasyGrader,
-    GradeBreakdown,
-    RegulatedExportExceptionHardGrader,
-    grade_case,
-    grade_task_id,
-)
 __all__ = [
     "AccountTakeoverMediumGrader",

+"""Deterministic graders and reward helpers for SupportDesk."""
+from __future__ import annotations
+import re
+from dataclasses import dataclass
+from models import SupportCaseProgress
+from tasks import SupportTaskSpec, get_task
+STRICT_SCORE_EPSILON = 0.01
+@dataclass(frozen=True)
+class GradeBreakdown:
+    """A scored view of how close a case is to the gold solution.
+    Produced by ``grade_case``. Instances are immutable (frozen dataclass).
+    """
+    # Weighted sum of the component scores, clamped away from 0 and 1 and rounded.
+    total_score: float
+    # Exact-match components: 1.0 when the case value equals the task's gold value, else 0.0.
+    queue_score: float
+    priority_score: float
+    issue_type_score: float
+    # Graded components: fractional values between 0.0 and 1.0.
+    requested_fields_score: float
+    reply_score: float
+    note_score: float
+    # Exact-match components for the final disposition.
+    status_score: float
+    resolution_score: float
+    # Names of the components considered complete (e.g. "queue", "reply").
+    completed_milestones: tuple[str, ...]
+def _normalize(text: str | None) -> str:
+    """Return ``text`` lowercased, punctuation-free and whitespace-collapsed.
+    Every character outside ``a-z``, ``0-9`` and whitespace (hyphens included)
+    becomes a separator, and runs of separators collapse to a single space.
+    Without the collapse, "refund, approved" or a phrase wrapped over two
+    lines would not contain the marker "refund approved" as a substring.
+    Returns "" for ``None`` or empty input.
+    """
+    if not text:
+        return ""
+    separated = re.sub(r"[^a-z0-9\s]", " ", text.lower())
+    # split()/join collapses all whitespace runs and trims both ends.
+    return " ".join(separated.split())
+def _marker_group_score(text: str | None, marker_groups: tuple[tuple[str, ...], ...]) -> float:
+    """Return the fraction of marker groups that ``text`` satisfies.
+    A group is satisfied when at least one of its markers, after normalization,
+    is a substring of the normalized text. With no groups the score is 1.0;
+    with groups but empty text it is 0.0.
+    """
+    if not marker_groups:
+        return 1.0
+    haystack = _normalize(text)
+    if not haystack:
+        return 0.0
+    satisfied = sum(
+        1
+        for group in marker_groups
+        if any(_normalize(marker) in haystack for marker in group)
+    )
+    return satisfied / len(marker_groups)
+def _requested_fields_score(case: SupportCaseProgress, task: SupportTaskSpec) -> float:
+    """Score how well the requested fields match the task's required fields.
+    When the task requires nothing, the score is 1.0 only if nothing was
+    requested. Otherwise it is the covered fraction of required fields minus
+    0.05 per extra field (penalty capped at 0.25), floored at 0.0.
+    """
+    required = set(task.required_requested_fields)
+    requested = set(case.requested_fields)
+    if not required:
+        return 0.0 if requested else 1.0
+    if not requested:
+        return 0.0
+    coverage = len(required & requested) / len(required)
+    surplus_penalty = min(0.25, len(requested - required) * 0.05)
+    return max(0.0, coverage - surplus_penalty)
+def _reply_penalty(case: SupportCaseProgress, task: SupportTaskSpec) -> float:
+    """Return 0.5 when the reply contains any forbidden marker, else 0.0.
+    Both the reply and the markers are normalized before the substring check;
+    an empty reply is never penalized.
+    """
+    reply_text = _normalize(case.reply)
+    if not reply_text:
+        return 0.0
+    for marker in task.forbidden_reply_markers:
+        if _normalize(marker) in reply_text:
+            return 0.5
+    return 0.0
+def _strict_open_unit_interval(score: float) -> float:
+    """Clamp ``score`` into [epsilon, 1 - epsilon] so it never equals 0 or 1.
+    Kept for evaluator compatibility; epsilon is ``STRICT_SCORE_EPSILON``.
+    """
+    raised = max(STRICT_SCORE_EPSILON, score)
+    return min(1.0 - STRICT_SCORE_EPSILON, raised)
+def grade_case(task: SupportTaskSpec, case: SupportCaseProgress) -> GradeBreakdown:
+    """Score a case deterministically with total_score strictly inside (0, 1).
+    Eight components are scored between 0.0 and 1.0, combined with fixed
+    weights that sum to 1.0, clamped by ``_strict_open_unit_interval`` and
+    rounded to four decimals.
+    Args:
+        task: Task specification holding the gold values and marker groups.
+        case: Current case progress to evaluate.
+    Returns:
+        A ``GradeBreakdown`` with the total, each component score and the
+        names of the components considered complete.
+    """
+    # Exact-match components: all-or-nothing against the gold values.
+    queue_score = 1.0 if case.queue == task.gold_queue else 0.0
+    priority_score = 1.0 if case.priority == task.gold_priority else 0.0
+    issue_type_score = 1.0 if case.issue_type == task.gold_issue_type else 0.0
+    requested_fields_score = _requested_fields_score(case, task)
+    # Reply: marker coverage minus the forbidden-marker penalty, floored at 0.
+    reply_score = max(0.0, _marker_group_score(case.reply, task.required_reply_markers) - _reply_penalty(case, task))
+    note_score = _marker_group_score(case.internal_note, task.required_note_markers)
+    status_score = 1.0 if case.status == task.gold_status else 0.0
+    resolution_score = 1.0 if case.resolution_code == task.gold_resolution_code else 0.0
+    # Weights sum to 1.0; the reply carries the largest share.
+    weighted_total = (
+        queue_score * 0.15
+        + priority_score * 0.10
+        + issue_type_score * 0.10
+        + requested_fields_score * 0.15
+        + reply_score * 0.25
+        + note_score * 0.10
+        + status_score * 0.10
+        + resolution_score * 0.05
+    )
+    # Milestones: exact-match components count when non-zero; graded
+    # components count once they reach 0.99.
+    milestones: list[str] = []
+    if queue_score:
+        milestones.append("queue")
+    if priority_score:
+        milestones.append("priority")
+    if issue_type_score:
+        milestones.append("issue_type")
+    if requested_fields_score >= 0.99:
+        milestones.append("requested_fields")
+    if reply_score >= 0.99:
+        milestones.append("reply")
+    if note_score >= 0.99:
+        milestones.append("internal_note")
+    if status_score:
+        milestones.append("status")
+    if resolution_score:
+        milestones.append("resolution_code")
+    return GradeBreakdown(
+        total_score=round(_strict_open_unit_interval(weighted_total), 4),
+        queue_score=queue_score,
+        priority_score=priority_score,
+        issue_type_score=issue_type_score,
+        requested_fields_score=round(requested_fields_score, 4),
+        reply_score=round(reply_score, 4),
+        note_score=round(note_score, 4),
+        status_score=status_score,
+        resolution_score=resolution_score,
+        completed_milestones=tuple(milestones),
+    )
+def grade_task_id(task_id: str, case: SupportCaseProgress) -> GradeBreakdown:
+    """Look up the task spec for ``task_id`` and grade ``case`` against it.
+    Convenience wrapper used by tests and evaluation scripts.
+    """
+    task_spec = get_task(task_id)
+    return grade_case(task_spec, case)
+class _TaskSpecificGrader:
+    """Importable task-specific grader wrapper for validator task discovery.
+    Subclasses only set ``task_id``; instances can be used through ``grade``
+    or called directly.
+    """
+    # Overridden by each subclass with the id of the task it grades.
+    task_id: str = ""
+    def grade(self, case: SupportCaseProgress) -> float:
+        """Return the total score of ``case`` for this grader's task."""
+        return grade_task_id(self.task_id, case).total_score
+    def __call__(self, case: SupportCaseProgress) -> float:
+        """Allow the grader instance to be called like a function; delegates to ``grade``."""
+        return self.grade(case)
+class BillingRefundEasyGrader(_TaskSpecificGrader):
+    """Grader bound to the ``billing_refund_easy`` task."""
+    task_id = "billing_refund_easy"
+class AccountTakeoverMediumGrader(_TaskSpecificGrader):
+    """Grader bound to the ``account_takeover_medium`` task."""
+    task_id = "account_takeover_medium"
+class ApiIncidentHardGrader(_TaskSpecificGrader):
+    """Grader bound to the ``api_incident_hard`` task."""
+    task_id = "api_incident_hard"
+class RegulatedExportExceptionHardGrader(_TaskSpecificGrader):
+    """Grader bound to the ``regulated_export_exception_hard`` task."""
+    task_id = "regulated_export_exception_hard"
 __all__ = [
     "AccountTakeoverMediumGrader",

inference.py CHANGED Viewed

@@ -14,12 +14,12 @@ try:
 except ImportError:  # pragma: no cover - local fallback mode
     OpenAI = None  # type: ignore[assignment]
-from supportdesk_env.client import SupportDeskEnv
-from supportdesk_env.graders import grade_case
-from supportdesk_env.models import SupportDeskAction, SupportDeskObservation
-from supportdesk_env.policies import heuristic_action
-from supportdesk_env.server.supportdesk_environment import SupportDeskEnvironment
-from supportdesk_env.tasks import get_task, list_task_ids
 API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
 MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct"

 except ImportError:  # pragma: no cover - local fallback mode
     OpenAI = None  # type: ignore[assignment]
+from client import SupportDeskEnv
+from graders import grade_case
+from models import SupportDeskAction, SupportDeskObservation
+from policies import heuristic_action
+from server.supportdesk_environment import SupportDeskEnvironment
+from tasks import get_task, list_task_ids
 API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
 MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct"

main.py CHANGED Viewed

@@ -2,7 +2,7 @@
 from __future__ import annotations
-from supportdesk_env.server.app import app, main as _run_server
 def main() -> None:

 from __future__ import annotations
+from server.app import app, main as _run_server
 def main() -> None:

models.py CHANGED Viewed

@@ -1,3 +1,122 @@
-"""Compatibility wrapper for the real supportdesk_env package."""
-from supportdesk_env.models import *  # noqa: F401,F403

+"""Typed models for the SupportDesk OpenEnv environment."""
+from __future__ import annotations
+from typing import Literal
+from pydantic import BaseModel, Field
+from openenv_compat import Action, Observation, State
+class KnowledgeSnippet(BaseModel):
+    """A policy or runbook excerpt the agent can use during triage.
+    All three fields are required.
+    """
+    article_id: str
+    title: str
+    content: str
+class SupportTicket(BaseModel):
+    """Static task input representing the inbound support ticket."""
+    customer_name: str
+    # Restricted to the three listed tiers; any other value fails validation.
+    customer_tier: Literal["free", "pro", "enterprise"]
+    company: str
+    subject: str
+    body: str
+    region: str
+    # Optional context; ``None`` means the ticket does not carry the value.
+    affected_users: int | None = None
+    sla_minutes_remaining: int | None = None
+    business_impact: str | None = None
+    # List fields default to a fresh empty list per instance.
+    secondary_concerns: list[str] = Field(default_factory=list)
+    attachments: list[str] = Field(default_factory=list)
+class ActionHistoryEntry(BaseModel):
+    """A concise trace entry used in observations and state dumps."""
+    step: int
+    operation: str
+    summary: str
+    # Defaults to 0.0 when no reward change is recorded for the entry.
+    reward_delta: float = 0.0
+class CustomerFollowUp(BaseModel):
+    """A scripted customer response that arrives after a request for more information."""
+    # Starts as "none"; only the five listed values pass validation.
+    status: Literal["none", "pending", "partial", "complete", "incorrect"] = "none"
+    message: str | None = None
+    # Both lists default to empty.
+    provided_fields: list[str] = Field(default_factory=list)
+    wrong_fields: list[str] = Field(default_factory=list)
+class SupportCaseProgress(BaseModel):
+    """Mutable case state that graders score against.
+    A fresh instance has every classification field unset, status "new",
+    no requested fields, no reply or note, and a default follow-up.
+    """
+    queue: str | None = None
+    priority: str | None = None
+    issue_type: str | None = None
+    status: str = "new"
+    resolution_code: str | None = None
+    requested_fields: list[str] = Field(default_factory=list)
+    reply: str | None = None
+    internal_note: str | None = None
+    customer_follow_up: CustomerFollowUp = Field(default_factory=CustomerFollowUp)
+class SupportDeskAction(Action):
+    """One structured action the agent can take at each step.
+    ``operation`` is required; every other field is optional and defaults to
+    ``None`` (or an empty list for ``requested_fields``).
+    """
+    # Only the six listed operations pass validation.
+    operation: Literal["classify", "request_info", "draft_reply", "add_internal_note", "submit", "wait"]
+    queue: str | None = None
+    priority: str | None = None
+    issue_type: str | None = None
+    status: str | None = None
+    resolution_code: str | None = None
+    requested_fields: list[str] = Field(default_factory=list)
+    reply: str | None = None
+    internal_note: str | None = None
+class SupportDeskObservation(Observation):
+    """Observation emitted to the agent after reset and each step.
+    Extends the ``Observation`` base type imported from ``openenv_compat``.
+    """
+    # Required task context.
+    task_id: str
+    difficulty: Literal["easy", "medium", "hard"]
+    objective: str
+    ticket: SupportTicket
+    knowledge_base: list[KnowledgeSnippet]
+    # Required option lists offered to the agent.
+    available_queues: list[str]
+    available_priorities: list[str]
+    available_statuses: list[str]
+    available_issue_types: list[str]
+    # Required snapshot of the case being worked.
+    case: SupportCaseProgress
+    current_sla_minutes_remaining: int | None = None
+    workflow_stage: str
+    # Optional guidance and trace fields; lists default to empty.
+    required_next_actions: list[str] = Field(default_factory=list)
+    risk_flags: list[str] = Field(default_factory=list)
+    action_history: list[ActionHistoryEntry] = Field(default_factory=list)
+    feedback: str = ""
+    remaining_steps: int = 0
+class SupportDeskState(State):
+    """Current environment state returned by the OpenEnv state() API.
+    ``task_id``, ``difficulty``, ``case`` and ``workflow_stage`` are required;
+    every other field has a default.
+    """
+    episode_id: str | None = None
+    task_id: str
+    difficulty: Literal["easy", "medium", "hard"]
+    # Progress counters and flags; all start at zero / False.
+    step_count: int = 0
+    reward: float = 0.0
+    done: bool = False
+    current_score: float = 0.0
+    max_steps: int = 0
+    case: SupportCaseProgress
+    current_sla_minutes_remaining: int | None = None
+    workflow_stage: str
+    # List fields default to a fresh empty list per instance.
+    required_next_actions: list[str] = Field(default_factory=list)
+    risk_flags: list[str] = Field(default_factory=list)
+    action_history: list[ActionHistoryEntry] = Field(default_factory=list)
+    completed_milestones: list[str] = Field(default_factory=list)
+    last_feedback: str = ""

openenv.yaml CHANGED Viewed

@@ -3,7 +3,7 @@ name: HyperBrickCaseOps
 env_name: supportdesk_env
 type: space
 runtime: fastapi
-app: supportdesk_env.server.app:app
 port: 8000
 description: Enterprise support operations environment with SLA pressure, business-impact aware triage, and primary-vs-secondary issue prioritization.
 tasks:

 env_name: supportdesk_env
 type: space
 runtime: fastapi
+app: server.app:app
 port: 8000
 description: Enterprise support operations environment with SLA pressure, business-impact aware triage, and primary-vs-secondary issue prioritization.
 tasks:

openenv_compat.py ADDED Viewed

	@@ -0,0 +1,76 @@

+"""Compatibility helpers for environments where openenv-core is not installed."""
+from __future__ import annotations
+from dataclasses import dataclass, field
+from typing import Any, Generic, TypeVar
+from pydantic import BaseModel
+A = TypeVar("A")
+O = TypeVar("O")
+S = TypeVar("S")
+OPENENV_AVAILABLE = True
+# Resolve the OpenEnv base types in three steps: the ``openenv.core`` layout,
+# then the ``openenv_core`` layout, then local stand-ins defined below.
+try:
+    from openenv.core.client_types import StepResult  # type: ignore
+    from openenv.core.env_client import EnvClient  # type: ignore
+    from openenv.core.env_server.interfaces import Environment  # type: ignore
+    from openenv.core.env_server.types import Action, Observation, State  # type: ignore
+    from openenv.core.env_server.types import EnvironmentMetadata  # type: ignore
+except ImportError:
+    try:
+        from openenv_core.client_types import StepResult  # type: ignore
+        # This layout names the client HTTPEnvClient; alias it to EnvClient.
+        from openenv_core.http_env_client import HTTPEnvClient as EnvClient  # type: ignore
+        from openenv_core.env_server.interfaces import Environment  # type: ignore
+        from openenv_core.env_server.types import Action, Observation, State  # type: ignore
+        from openenv_core.env_server.types import EnvironmentMetadata  # type: ignore
+    except ImportError:
+        # Neither layout is importable: record that and define minimal stand-ins.
+        OPENENV_AVAILABLE = False
+        class Action(BaseModel):
+            """Fallback Action base type for local import-only workflows."""
+        class Observation(BaseModel):
+            """Fallback Observation base type for local import-only workflows."""
+            reward: float = 0.0
+            done: bool = False
+        class State(BaseModel):
+            """Fallback State base type for local import-only workflows."""
+        class Environment(Generic[A, O, S]):
+            """Minimal base class used for local unit tests and import-based demos."""
+            def __init__(self) -> None:
+                super().__init__()
+        class EnvironmentMetadata(BaseModel):
+            """Fallback metadata model used when OpenEnv is absent."""
+            name: str
+            description: str
+            readme_content: str | None = None
+            version: str | None = None
+            author: str | None = None
+        @dataclass
+        class StepResult(Generic[O]):
+            """Fallback step result for local-only client compatibility."""
+            observation: O
+            reward: float
+            done: bool
+            info: dict[str, Any] = field(default_factory=dict)
+        class EnvClient(Generic[A, O, S]):
+            """Placeholder client that fails only when actually used."""
+            def __init__(self, *args, **kwargs) -> None:
+                # Importing and subclassing stay possible; only instantiation raises.
+                # NOTE(review): `py -3` is the Windows launcher; on other platforms
+                # the equivalent is `python -m pip` - consider rewording the hint.
+                raise ImportError(
+                    "SupportDeskEnv requires openenv-core to be installed. "
+                    "Run `py -3 -m pip install openenv-core` to use the HTTP client."
+                )

policies.py ADDED Viewed

	@@ -0,0 +1,84 @@

+"""Reusable policy helpers for local baselines and training examples."""
+from __future__ import annotations
+from models import SupportDeskAction, SupportDeskObservation
+from tasks import get_task
+def default_reply(task_id: str) -> str:
+    """Return the canned customer-facing reply for ``task_id``.
+    Three task ids have a dedicated reply; any other id gets the
+    active-incident reply.
+    """
+    if task_id == "billing_refund_easy":
+        reply = (
+            "Thanks for flagging the duplicate charge. I have started the refund for the extra "
+            "charge, and the funds usually appear within 5-7 business days."
+        )
+    elif task_id == "account_takeover_medium":
+        reply = (
+            "We have escalated this to our trust team. Please reset your password, scan your "
+            "device for malware, and reply with your workspace_id, last successful login time, "
+            "and billing email so we can verify the account safely."
+        )
+    elif task_id == "regulated_export_exception_hard":
+        reply = (
+            "We cannot provide a bypass or temporary unlock yet. Our compliance team is running "
+            "a compliance review, and we need your tenant_region, dpa_amendment_id, and "
+            "legal_contact_email to continue that review."
+        )
+    else:
+        reply = (
+            "We are treating this as an active incident and our on-call engineering team is engaged. "
+            "Please send the affected request IDs, UTC timestamps, and the impacted region so we can "
+            "speed up the investigation."
+        )
+    return reply
+def default_note(task_id: str) -> str:
+    """Return the canned internal note for ``task_id``.
+    Three task ids have a dedicated note; any other id gets the
+    data-residency incident note.
+    """
+    if task_id == "billing_refund_easy":
+        note = "Duplicate charge confirmed from attached invoice; refund approved."
+    elif task_id == "account_takeover_medium":
+        note = "Suspicious login alert reported and customer is locked out."
+    elif task_id == "regulated_export_exception_hard":
+        note = (
+            "Audit-driven export exception request tied to an EU residency policy block; "
+            "customer asked for a manual bypass before legal approval."
+        )
+    else:
+        note = "EU data residency rollout hit intermittent HTTP 500s and the customer launches tonight."
+    return note
+def heuristic_action(observation: SupportDeskObservation) -> SupportDeskAction:
+    """Pick the next baseline action from the task's gold values.
+    The first unmet step wins, in this order: classify, request the required
+    fields, wait for a pending follow-up, draft the reply, add the internal
+    note, submit.
+    """
+    task_id = observation.task_id
+    spec = get_task(task_id)
+    progress = observation.case
+    # Classify until queue, priority and issue type are all set.
+    if any(value is None for value in (progress.queue, progress.priority, progress.issue_type)):
+        return SupportDeskAction(
+            operation="classify",
+            queue=spec.gold_queue,
+            priority=spec.gold_priority,
+            issue_type=spec.gold_issue_type,
+        )
+    # Request info while the requested set differs from the required set.
+    wanted_fields = spec.required_requested_fields
+    if wanted_fields and sorted(progress.requested_fields) != sorted(wanted_fields):
+        return SupportDeskAction(
+            operation="request_info",
+            requested_fields=list(wanted_fields),
+        )
+    if progress.customer_follow_up.status == "pending":
+        return SupportDeskAction(operation="wait")
+    if not progress.reply:
+        return SupportDeskAction(operation="draft_reply", reply=default_reply(task_id))
+    if not progress.internal_note:
+        return SupportDeskAction(operation="add_internal_note", internal_note=default_note(task_id))
+    return SupportDeskAction(
+        operation="submit",
+        status=spec.gold_status,
+        resolution_code=spec.gold_resolution_code,
+    )

pyproject.toml CHANGED Viewed

@@ -33,9 +33,9 @@ dev = [
 [project.scripts]
 # Server entry point - enables running via: uv run --project . server
-# or: python -m supportdesk_env.server.app
-server = "supportdesk_env.server.app:main"
 [tool.setuptools]
 include-package-data = true
-packages = ["supportdesk_env", "supportdesk_env.server"]

 [project.scripts]
 # Server entry point - enables running via: uv run --project . server
+# or: python -m server.app
+server = "server.app:main"
 [tool.setuptools]
 include-package-data = true
+packages = ["server"]
+# The flat layout keeps these modules at the repo root; server.app imports them,
+# so they must be installed alongside the `server` package for the console script.
+py-modules = ["client", "graders", "models", "openenv_compat", "policies", "tasks"]

server/__init__.py CHANGED Viewed

	@@ -1 +1,5 @@
1	- """Server package for the SupportDesk OpenEnv environment."""

+"""Server package for the SupportDesk OpenEnv environment."""
+from server.supportdesk_environment import SupportDeskEnvironment
+__all__ = ["SupportDeskEnvironment"]

server/app.py CHANGED Viewed

@@ -1,33 +1,205 @@
-"""FastAPI app entrypoint for the SupportDesk environment."""
 from __future__ import annotations
 import os
 import uvicorn
 try:
-    from openenv.core.env_server.http_server import create_app
-except ImportError:  # pragma: no cover - package name differs across releases
-    from openenv_core.env_server.http_server import create_app
-from supportdesk_env.models import SupportDeskAction, SupportDeskObservation
-from supportdesk_env.server.supportdesk_environment import SupportDeskEnvironment
 app = create_app(
     SupportDeskEnvironment,
-    action_cls=SupportDeskAction,
-    observation_cls=SupportDeskObservation,
     env_name="supportdesk_env",
 )
-def main() -> None:
-    """Run the local HTTP server."""
-    port = int(os.getenv("PORT", "8000"))
-    uvicorn.run("supportdesk_env.server.app:app", host="0.0.0.0", port=port)
-if __name__ == "__main__":
     main()

+"""FastAPI application for the SupportDesk environment."""
 from __future__ import annotations
 import os
+from typing import Any
 import uvicorn
+from fastapi import Body, HTTPException
+from fastapi.routing import APIRoute
 try:
+    from openenv.core.env_server import http_server as openenv_http_server
+except ImportError:
+    try:
+        from openenv_core.env_server import http_server as openenv_http_server
+    except Exception as e:  # pragma: no cover
+        raise ImportError(
+            "openenv is required for the web interface. Install dependencies with '\n    uv sync\n'"
+        ) from e
+from models import SupportDeskAction, SupportDeskObservation, SupportDeskState
+from server.supportdesk_environment import SupportDeskEnvironment
+from tasks import TASKS
+# Bind the default OpenEnv /state route to the full typed state model.
+openenv_http_server.State = SupportDeskState
+create_app = openenv_http_server.create_app
+# Create the app with web interface and README integration.
 app = create_app(
     SupportDeskEnvironment,
+    SupportDeskAction,
+    SupportDeskObservation,
     env_name="supportdesk_env",
+    max_concurrent_envs=1,  # increase this number to allow more concurrent WebSocket sessions
 )
+# Maps each task id to the "module:ClassName" path of its grader class.
+# The /tasks handler indexes this dict directly, so every task in TASKS needs an entry here.
+TASK_GRADER_PATHS = {
+    "billing_refund_easy": "graders:BillingRefundEasyGrader",
+    "account_takeover_medium": "graders:AccountTakeoverMediumGrader",
+    "api_incident_hard": "graders:ApiIncidentHardGrader",
+    "regulated_export_exception_hard": "graders:RegulatedExportExceptionHardGrader",
+}
+def _replace_route(path: str, methods: set[str]) -> None:
+    """Drop every generated ``APIRoute`` at ``path`` that serves all of ``methods``.
+    Only removes routes; the caller registers the score-aware replacement afterwards.
+    """
+    def _is_target(route) -> bool:
+        # Non-APIRoute entries are always kept.
+        if not isinstance(route, APIRoute):
+            return False
+        if route.path != path:
+            return False
+        return methods.issubset(set(route.methods or set()))
+    app.router.routes = [route for route in app.router.routes if not _is_target(route)]
+def _score_response(env: SupportDeskEnvironment, observation: SupportDeskObservation) -> dict[str, Any]:
+    """Build the response body: observation, reward, done, plus a top-level score.
+    ``score`` is read from ``env.state.current_score``; the other three values
+    come from the observation.
+    """
+    response: dict[str, Any] = {"observation": observation.model_dump()}
+    response["reward"] = observation.reward
+    response["done"] = observation.done
+    response["score"] = env.state.current_score
+    return response
+_replace_route("/reset", {"POST"})
+_replace_route("/step", {"POST"})
+@app.post("/reset")
+async def reset_with_score(
+    request: openenv_http_server.ResetRequest = Body(default_factory=openenv_http_server.ResetRequest),
+) -> dict[str, Any]:
+    """Reset the environment and expose the initial deterministic score at top level.
+    The body is optional; when omitted, a default ``ResetRequest`` is used.
+    """
+    # A fresh environment instance is created per request and closed in ``finally``.
+    # NOTE(review): the environment class declares a class-level ``_episode_store``;
+    # presumably that is what lets episode state outlive this instance - confirm.
+    env = SupportDeskEnvironment()
+    try:
+        # Forward only the fields the client actually sent.
+        kwargs = request.model_dump(exclude_unset=True)
+        observation = env.reset(**kwargs)
+        return _score_response(env, observation)
+    finally:
+        env.close()
+@app.post("/step")
+async def step_with_score(request: openenv_http_server.StepRequest) -> dict[str, Any]:
+    """Execute a step and expose the current deterministic score at top level.
+    Raises:
+        HTTPException: 422 when the action payload fails validation.
+    """
+    action_data = request.action
+    try:
+        action = openenv_http_server.deserialize_action(action_data, SupportDeskAction)
+    # NOTE(review): relies on the http_server module exposing ``ValidationError``
+    # as an attribute - confirm for every supported OpenEnv version.
+    except openenv_http_server.ValidationError as exc:
+        raise HTTPException(status_code=422, detail=exc.errors()) from exc
+    # A fresh environment instance is created per request and closed in ``finally``.
+    env = SupportDeskEnvironment()
+    try:
+        # Forward only the fields the client sent, minus the action itself.
+        kwargs = request.model_dump(exclude_unset=True, exclude={"action"})
+        observation = env.step(action, **kwargs)
+        return _score_response(env, observation)
+    finally:
+        env.close()
+def _task_catalog_entry(task) -> dict[str, Any]:
+    """Return the public catalog entry for one task spec."""
+    ticket = task.ticket
+    return {
+        "task_id": task.task_id,
+        "grader": TASK_GRADER_PATHS[task.task_id],
+        "title": task.title,
+        "difficulty": task.difficulty,
+        "objective": task.objective,
+        "max_steps": task.max_steps,
+        "gold_issue_type": task.gold_issue_type,
+        "gold_queue": task.gold_queue,
+        "gold_priority": task.gold_priority,
+        "ticket_context": {
+            "customer_tier": ticket.customer_tier,
+            "region": ticket.region,
+            "affected_users": ticket.affected_users,
+            "sla_minutes_remaining": ticket.sla_minutes_remaining,
+        },
+    }
+@app.get("/tasks")
+def list_tasks() -> dict[str, Any]:
+    """Return the task catalog: environment metadata, task count and one entry per task.
+    Intended for UI, debugging, and pre-submit checks.
+    """
+    environment_info = {
+        "name": "supportdesk_env",
+        "version": "0.1.0",
+        "grader_type": "deterministic",
+        "score_range": [0.0, 1.0],
+    }
+    return {
+        "environment": environment_info,
+        "total_tasks": len(TASKS),
+        "tasks": [_task_catalog_entry(task) for task in TASKS.values()],
+    }
+@app.get("/episodes/{episode_id}/state", response_model=SupportDeskState)
+def get_episode_state(episode_id: str) -> SupportDeskState:
+    """Return the stored state for ``episode_id``.
+    Raises:
+        HTTPException: 404 when the state lookup raises ``ValueError``.
+    """
+    try:
+        episode_state = SupportDeskEnvironment.state_for_episode(episode_id)
+    except ValueError as exc:
+        raise HTTPException(status_code=404, detail=str(exc)) from exc
+    return episode_state
+@app.post("/episodes/{episode_id}/step")
+def step_episode(
+    episode_id: str,
+    payload: dict[str, Any] = Body(...),
+) -> dict[str, Any]:
+    """Step the episode addressed by ``episode_id`` without sticky request context.
+    Args:
+        episode_id: Identifier of the episode to advance.
+        payload: JSON body with a required ``action`` object and an optional
+            ``timeout_s`` value.
+    Returns:
+        The observation, reward and done flag, plus the episode's current score.
+    Raises:
+        HTTPException: 422 when ``action`` is missing or fails validation;
+            404 when the environment raises ``ValueError`` for the episode.
+    """
+    action_payload = payload.get("action")
+    if not isinstance(action_payload, dict):
+        raise HTTPException(status_code=422, detail="Request body must include an 'action' object.")
+    # Validate on its own: pydantic's ValidationError subclasses ValueError, so
+    # sharing the handler below would report a malformed action as a 404.
+    try:
+        action = SupportDeskAction.model_validate(action_payload)
+    except openenv_http_server.ValidationError as exc:
+        raise HTTPException(status_code=422, detail=exc.errors()) from exc
+    env = SupportDeskEnvironment()
+    try:
+        observation = env.step(action, timeout_s=payload.get("timeout_s"), episode_id=episode_id)
+        score = SupportDeskEnvironment.state_for_episode(episode_id).current_score
+    except ValueError as exc:
+        raise HTTPException(status_code=404, detail=str(exc)) from exc
+    finally:
+        # Release the per-request instance, as the /reset and /step handlers do.
+        env.close()
+    return {
+        "observation": observation.model_dump(),
+        "reward": observation.reward,
+        "done": observation.done,
+        "score": score,
+    }
+def main(host: str = "0.0.0.0", port: int = 8000) -> None:
+    """
+    Run the FastAPI app under uvicorn.
+    Entry point for direct execution without Docker:
+        uv run --project . server
+        python -m server.app
+    Both paths call ``main()`` with no arguments, so the defaults below apply.
+    This function does not parse command-line flags and does not read the PORT
+    environment variable; to use another port, call ``main(port=...)`` or run
+    uvicorn directly.
+    Args:
+        host: Host address to bind to (default: "0.0.0.0")
+        port: Port number to listen on (default: 8000)
+    For production deployments, consider using uvicorn directly with
+    multiple workers:
+        uvicorn server.app:app --workers 4
+    """
+    uvicorn.run("server.app:app", host=host, port=port)
+if __name__ == '__main__':
     main()

server/supportdesk_environment.py CHANGED Viewed

@@ -1,3 +1,545 @@
-"""Compatibility wrapper for the real supportdesk_env package."""
-from supportdesk_env.server.supportdesk_environment import *  # noqa: F401,F403

+"""SupportDesk environment implementation."""
+from __future__ import annotations
+import os
+import threading
+import uuid
+from pathlib import Path
+from typing import ClassVar
+from graders import grade_case
+from models import (
+    ActionHistoryEntry,
+    CustomerFollowUp,
+    SupportCaseProgress,
+    SupportDeskAction,
+    SupportDeskObservation,
+    SupportDeskState,
+)
+from openenv_compat import Environment, EnvironmentMetadata
+from tasks import (
+    ALL_ISSUE_TYPES,
+    ALL_PRIORITIES,
+    ALL_QUEUES,
+    ALL_STATUSES,
+    SupportTaskSpec,
+    get_task,
+    list_task_ids,
+)
+class SupportDeskEnvironment(
+    Environment[SupportDeskAction, SupportDeskObservation, SupportDeskState]
+):
+    """A realistic customer support triage environment with dense rewards.
+
+    Episode state is kept in class-level stores shared by every instance, so a
+    freshly constructed environment can load and continue an episode that a
+    different instance created. Instance attributes are only a working copy
+    that is loaded from, and persisted back to, those stores.
+    """
+    # Re-entrant lock held by state, reset(), step() and state_for_episode()
+    # while they touch the shared stores below.
+    _state_lock: ClassVar[threading.RLock] = threading.RLock()
+    # episode_id -> last persisted state snapshot for that episode.
+    _episode_store: ClassVar[dict[str, SupportDeskState]] = {}
+    # episode_id -> task_id the episode was started with.
+    _episode_task_ids: ClassVar[dict[str, str]] = {}
+    # Most recently reset/persisted episode; used when a call names no episode.
+    _latest_episode_id: ClassVar[str | None] = None
+    # Number of reset() calls that rotated tasks (round-robin over task ids).
+    _shared_reset_counter: ClassVar[int] = 0
+    def __init__(self, task_id: str | None = None):
+        super().__init__()
+        env_task_id = os.getenv("SUPPORTDESK_TASK_ID")
+        self._explicit_task_id = task_id is not None or env_task_id is not None
+        requested_task = task_id or env_task_id or list_task_ids()[0]
+        self.task: SupportTaskSpec = get_task(requested_task)
+        self._max_steps = self.task.max_steps
+        self._step_count = 0
+        self._reward_total = 0.0
+        self._done = False
+        self._last_feedback = ""
+        self._history: list[ActionHistoryEntry] = []
+        self._case = SupportCaseProgress()
+        self._episode_id: str | None = None
+        self._current_sla_minutes_remaining = self.task.ticket.sla_minutes_remaining
+        initial_grade = grade_case(self.task, self._case)
+        self._score = initial_grade.total_score
+        self._completed_milestones = list(initial_grade.completed_milestones)
+    @classmethod
+    def _build_initial_state(cls, task: SupportTaskSpec, episode_id: str) -> SupportDeskState:
+        initial_case = SupportCaseProgress()
+        initial_grade = grade_case(task, initial_case)
+        return SupportDeskState(
+            episode_id=episode_id,
+            task_id=task.task_id,
+            difficulty=task.difficulty,
+            step_count=0,
+            reward=0.0,
+            done=False,
+            current_score=initial_grade.total_score,
+            max_steps=task.max_steps,
+            case=initial_case,
+            current_sla_minutes_remaining=task.ticket.sla_minutes_remaining,
+            workflow_stage="intake",
+            required_next_actions=["classify"],
+            risk_flags=[],
+            action_history=[],
+            completed_milestones=list(initial_grade.completed_milestones),
+            last_feedback="New case loaded. Review the ticket and policy snippets before acting.",
+        )
+    @classmethod
+    def _extract_episode_id(cls, episode_id: str | None = None, **kwargs) -> str | None:
+        if episode_id:
+            return episode_id
+        for key in ("episode_id", "request_id"):
+            value = kwargs.get(key)
+            if isinstance(value, str) and value:
+                return value
+        return None
+    def _load_episode(self, episode_id: str | None = None, **kwargs) -> None:
+        resolved_episode_id = self._extract_episode_id(episode_id, **kwargs) or self.__class__._latest_episode_id
+        if not resolved_episode_id:
+            return
+        episode_state = self.__class__._episode_store.get(resolved_episode_id)
+        if episode_state is None:
+            raise ValueError(
+                f"Unknown episode_id '{resolved_episode_id}'. Call reset() first or provide a valid episode_id."
+            )
+        task = get_task(self.__class__._episode_task_ids.get(resolved_episode_id, episode_state.task_id))
+        self.task = task
+        self._max_steps = episode_state.max_steps
+        self._step_count = episode_state.step_count
+        self._reward_total = episode_state.reward
+        self._done = episode_state.done
+        self._last_feedback = episode_state.last_feedback
+        self._history = [entry.model_copy(deep=True) for entry in episode_state.action_history]
+        self._case = episode_state.case.model_copy(deep=True)
+        self._episode_id = resolved_episode_id
+        self._score = episode_state.current_score
+        self._completed_milestones = list(episode_state.completed_milestones)
+        self._current_sla_minutes_remaining = episode_state.current_sla_minutes_remaining
+    def _persist_episode(self) -> None:
+        if self._episode_id is None:
+            return
+        self.__class__._episode_store[self._episode_id] = SupportDeskState(
+            episode_id=self._episode_id,
+            task_id=self.task.task_id,
+            difficulty=self.task.difficulty,
+            step_count=self._step_count,
+            reward=round(self._reward_total, 4),
+            done=self._done,
+            current_score=round(self._score, 4),
+            max_steps=self._max_steps,
+            case=self._case.model_copy(deep=True),
+            current_sla_minutes_remaining=self._current_sla_minutes_remaining,
+            workflow_stage=self._workflow_stage(),
+            required_next_actions=self._required_next_actions(),
+            risk_flags=self._risk_flags(),
+            action_history=[entry.model_copy(deep=True) for entry in self._history],
+            completed_milestones=list(self._completed_milestones),
+            last_feedback=self._last_feedback,
+        )
+        self.__class__._episode_task_ids[self._episode_id] = self.task.task_id
+        self.__class__._latest_episode_id = self._episode_id
+    @property
+    def state(self) -> SupportDeskState:
+        with self.__class__._state_lock:
+            self._load_episode()
+            return SupportDeskState(
+                episode_id=self._episode_id,
+                task_id=self.task.task_id,
+                difficulty=self.task.difficulty,
+                step_count=self._step_count,
+                reward=round(self._reward_total, 4),
+                done=self._done,
+                current_score=round(self._score, 4),
+                max_steps=self._max_steps,
+                case=self._case.model_copy(deep=True),
+                current_sla_minutes_remaining=self._current_sla_minutes_remaining,
+                workflow_stage=self._workflow_stage(),
+                required_next_actions=self._required_next_actions(),
+                risk_flags=self._risk_flags(),
+                action_history=[entry.model_copy(deep=True) for entry in self._history],
+                completed_milestones=list(self._completed_milestones),
+                last_feedback=self._last_feedback,
+            )
+    def reset(
+        self,
+        seed: int | None = None,
+        episode_id: str | None = None,
+        **kwargs,
+    ) -> SupportDeskObservation:
+        with self.__class__._state_lock:
+            if not self._explicit_task_id:
+                task_ids = list_task_ids()
+                next_task_id = task_ids[self.__class__._shared_reset_counter % len(task_ids)]
+                self.__class__._shared_reset_counter += 1
+                self.task = get_task(next_task_id)
+                self._max_steps = self.task.max_steps
+            self._episode_id = episode_id or f"{self.task.task_id}-{uuid.uuid4().hex[:8]}"
+            initial_state = self.__class__._build_initial_state(self.task, self._episode_id)
+            self.__class__._episode_store[self._episode_id] = initial_state
+            self.__class__._episode_task_ids[self._episode_id] = self.task.task_id
+            self.__class__._latest_episode_id = self._episode_id
+            self._load_episode(self._episode_id)
+            return self._build_observation(reward=0.0, done=False)
+    def step(
+        self,
+        action: SupportDeskAction,
+        timeout_s: float | None = None,
+        episode_id: str | None = None,
+        **kwargs,
+    ) -> SupportDeskObservation:
+        """Apply one action to an episode and return the resulting observation.
+
+        The step reward is the change in the deterministic grade plus a process
+        bonus plus an action penalty (a non-positive value), rounded to 4
+        decimals. The episode ends on a ``submit`` action or once the step count
+        reaches the step limit.
+
+        Args:
+            action: The operation to apply to the case.
+            timeout_s: Accepted for interface compatibility; not used here.
+            episode_id: Episode to act on; when omitted the episode is resolved
+                from ``kwargs`` or falls back to the latest episode.
+
+        Returns:
+            The observation after the action. For an already finished episode a
+            ``done`` observation with reward -0.05 is returned and nothing is
+            changed or persisted.
+
+        Raises:
+            ValueError: if the resolved episode id is unknown.
+        """
+        with self.__class__._state_lock:
+            self._load_episode(episode_id, **kwargs)
+            if self._done:
+                return self._build_observation(
+                    reward=-0.05,
+                    done=True,
+                    feedback="Episode already finished. Call reset() before taking more actions.",
+                )
+            # Capture grade and stage before mutating the case so deltas can be computed.
+            previous_grade = grade_case(self.task, self._case)
+            previous_stage = self._workflow_stage()
+            self._apply_action(action)
+            # Counter is bumped before the bonus/penalty helpers run, so they
+            # see the post-action step number and the decayed SLA budget.
+            self._step_count += 1
+            self._advance_external_events(action)
+            self._degrade_sla()
+            current_grade = grade_case(self.task, self._case)
+            reward = current_grade.total_score - previous_grade.total_score
+            reward += self._process_bonus(action, previous_stage, current_grade.total_score)
+            reward += self._action_penalty(
+                action,
+                current_grade.total_score,
+                previous_grade.total_score,
+            )
+            reward = round(reward, 4)
+            self._score = current_grade.total_score
+            self._completed_milestones = list(current_grade.completed_milestones)
+            # The done flag is set only after the reward helpers have run.
+            if action.operation == "submit":
+                self._done = True
+                self._last_feedback = (
+                    "Case submitted. Final deterministic grade is "
+                    f"{current_grade.total_score:.2f}."
+                )
+            elif self._step_count >= self._max_steps:
+                self._done = True
+                self._last_feedback = (
+                    f"Reached max steps ({self._max_steps}). Final deterministic grade is "
+                    f"{current_grade.total_score:.2f}."
+                )
+            else:
+                self._last_feedback = self._build_feedback(current_grade, reward)
+            self._reward_total = round(self._reward_total + reward, 4)
+            self._history.append(
+                ActionHistoryEntry(
+                    step=self._step_count,
+                    operation=action.operation,
+                    summary=self._summarize_action(action),
+                    reward_delta=reward,
+                )
+            )
+            # Publish the updated working state so other instances can continue it.
+            self._persist_episode()
+            return self._build_observation(reward=reward, done=self._done)
+    @classmethod
+    def state_for_episode(cls, episode_id: str) -> SupportDeskState:
+        with cls._state_lock:
+            state = cls._episode_store.get(episode_id)
+            if state is None:
+                raise ValueError(f"Unknown episode_id '{episode_id}'. Call reset() first.")
+            return state.model_copy(deep=True)
+    def close(self) -> None:
+        """No-op close hook for compatibility with local scripts.
+
+        Nothing is released here; episodes already persisted remain in the
+        class-level stores.
+        """
+    def get_metadata(self) -> EnvironmentMetadata:
+        """Return richer metadata for docs, validators, and HF Space UI."""
+        readme_path = Path(__file__).resolve().parents[1] / "README.md"
+        readme_content = readme_path.read_text(encoding="utf-8") if readme_path.exists() else None
+        return EnvironmentMetadata(
+            name="supportdesk_env",
+            description=(
+                "A policy-heavy enterprise operations desk with deterministic grading, delayed "
+                "customer follow-ups, SLA pressure, escalation tradeoffs, and sharper cross-functional triage."
+            ),
+            readme_content=readme_content,
+            version="0.1.0",
+            author="HyperBrick",
+        )
+    def _apply_action(self, action: SupportDeskAction) -> None:
+        if action.operation == "classify":
+            if action.queue is not None:
+                self._case.queue = action.queue
+            if action.priority is not None:
+                self._case.priority = action.priority
+            if action.issue_type is not None:
+                self._case.issue_type = action.issue_type
+            return
+        if action.operation == "request_info":
+            if action.requested_fields:
+                merged = {item for item in self._case.requested_fields}
+                merged.update(action.requested_fields)
+                self._case.requested_fields = sorted(merged)
+                if self.task.follow_up_outcome != "none" and self._case.customer_follow_up.status == "none":
+                    self._case.customer_follow_up = CustomerFollowUp(status="pending")
+            return
+        if action.operation == "draft_reply":
+            if action.reply is not None:
+                self._case.reply = action.reply
+            return
+        if action.operation == "add_internal_note":
+            if action.internal_note is not None:
+                self._case.internal_note = action.internal_note
+            return
+        if action.operation == "submit":
+            if action.status is not None:
+                self._case.status = action.status
+            if action.resolution_code is not None:
+                self._case.resolution_code = action.resolution_code
+    def _advance_external_events(self, action: SupportDeskAction) -> None:
+        if self._case.customer_follow_up.status == "pending" and action.operation == "wait":
+            self._case.customer_follow_up = CustomerFollowUp(
+                status=self.task.follow_up_outcome,
+                message=self.task.follow_up_message or None,
+                provided_fields=list(self.task.follow_up_provided_fields),
+                wrong_fields=list(self.task.follow_up_wrong_fields),
+            )
+    def _degrade_sla(self) -> None:
+        if self._current_sla_minutes_remaining is None:
+            return
+        self._current_sla_minutes_remaining = max(
+            0,
+            self._current_sla_minutes_remaining - self.task.sla_step_cost,
+        )
+    def _action_penalty(
+        self,
+        action: SupportDeskAction,
+        current_score: float,
+        previous_score: float,
+    ) -> float:
+        """Sum the penalties earned by this step.
+
+        Called after the action was applied, the step counter incremented and
+        the SLA budget reduced, so all checks see the post-action state.
+
+        Returns:
+            A value <= 0, rounded to 4 decimals, to be added to the step reward.
+        """
+        penalty = 0.0
+        # No grade improvement this step.
+        if current_score <= previous_score:
+            penalty -= 0.03
+        # Fields that do not belong to the chosen operation.
+        penalty -= self._mixed_action_penalty(action)
+        # Case currently routed to an over-escalation queue.
+        penalty -= self._escalation_tradeoff_penalty()
+        # Operations sent without the payload they exist to carry.
+        if action.operation == "draft_reply" and not action.reply:
+            penalty -= 0.03
+        if action.operation == "request_info" and not action.requested_fields:
+            penalty -= 0.03
+        if action.operation == "add_internal_note" and not action.internal_note:
+            penalty -= 0.03
+        # NOTE(review): status/resolution_code count as "non-empty" here although
+        # _apply_action ignores them for classify - confirm this is intended.
+        if action.operation == "classify" and not any(
+            [action.queue, action.priority, action.issue_type, action.status, action.resolution_code]
+        ):
+            penalty -= 0.03
+        # Waiting while no customer follow-up is pending.
+        if action.operation == "wait" and self._case.customer_follow_up.status != "pending":
+            penalty -= 0.02
+        # Submitting while required actions are still outstanding.
+        if action.operation == "submit" and self._required_next_actions():
+            penalty -= 0.08
+        # Past the task's escalation deadline with wrong queue or priority.
+        if (
+            self.task.under_escalation_deadline_step is not None
+            and self._step_count >= self.task.under_escalation_deadline_step
+            and (self._case.queue != self.task.gold_queue or self._case.priority != self.task.gold_priority)
+        ):
+            penalty -= 0.04
+        # SLA budget at or below 15 minutes.
+        if self._current_sla_minutes_remaining is not None and self._current_sla_minutes_remaining <= 15:
+            penalty -= 0.02
+        return round(penalty, 4)
+    def _build_feedback(self, grade, reward: float) -> str:
+        return (
+            f"Reward delta {reward:+.2f}. Current score {grade.total_score:.2f}. "
+            f"SLA remaining: {self._current_sla_minutes_remaining if self._current_sla_minutes_remaining is not None else 'n/a'} minutes. "
+            f"Stage: {self._workflow_stage()}. "
+            f"Customer follow-up: {self._case.customer_follow_up.status}. "
+            f"Next actions: {', '.join(self._required_next_actions()) or 'none'}. "
+            f"Completed milestones: {', '.join(grade.completed_milestones) or 'none yet'}."
+        )
+    def _summarize_action(self, action: SupportDeskAction) -> str:
+        parts = [action.operation]
+        if action.queue:
+            parts.append(f"queue={action.queue}")
+        if action.priority:
+            parts.append(f"priority={action.priority}")
+        if action.issue_type:
+            parts.append(f"issue_type={action.issue_type}")
+        if action.status:
+            parts.append(f"status={action.status}")
+        if action.resolution_code:
+            parts.append(f"resolution={action.resolution_code}")
+        if action.requested_fields:
+            parts.append(f"requested={','.join(action.requested_fields)}")
+        if action.reply:
+            parts.append("reply=yes")
+        if action.internal_note:
+            parts.append("note=yes")
+        return " | ".join(parts)
+    def _build_observation(
+        self,
+        reward: float,
+        done: bool,
+        feedback: str | None = None,
+    ) -> SupportDeskObservation:
+        return SupportDeskObservation(
+            task_id=self.task.task_id,
+            difficulty=self.task.difficulty,
+            objective=self.task.objective,
+            ticket=self.task.ticket,
+            knowledge_base=list(self.task.knowledge_base),
+            available_queues=list(ALL_QUEUES),
+            available_priorities=list(ALL_PRIORITIES),
+            available_statuses=list(ALL_STATUSES),
+            available_issue_types=list(ALL_ISSUE_TYPES),
+            case=self._case.model_copy(deep=True),
+            current_sla_minutes_remaining=self._current_sla_minutes_remaining,
+            workflow_stage=self._workflow_stage(),
+            required_next_actions=self._required_next_actions(),
+            risk_flags=self._risk_flags(),
+            action_history=[entry.model_copy(deep=True) for entry in self._history],
+            feedback=feedback or self._last_feedback,
+            remaining_steps=max(self._max_steps - self._step_count, 0),
+            reward=reward,
+            done=done,
+        )
+    def _workflow_stage(self) -> str:
+        if self._done:
+            return "closed"
+        if self._case.queue is None or self._case.priority is None or self._case.issue_type is None:
+            return "intake"
+        if self.task.required_requested_fields and sorted(self._case.requested_fields) != sorted(self.task.required_requested_fields):
+            return "verification"
+        if self._case.customer_follow_up.status == "pending":
+            return "awaiting_customer"
+        if self._case.customer_follow_up.status in {"partial", "incorrect"}:
+            return "follow_up_review"
+        if not self._case.reply:
+            return "customer_communication"
+        if not self._case.internal_note:
+            return "internal_handoff"
+        if self._case.status != self.task.gold_status or self._case.resolution_code != self.task.gold_resolution_code:
+            return "final_resolution"
+        return "ready_to_submit"
+    def _required_next_actions(self) -> list[str]:
+        if self._case.queue is None or self._case.priority is None or self._case.issue_type is None:
+            return ["classify"]
+        if self.task.required_requested_fields and sorted(self._case.requested_fields) != sorted(self.task.required_requested_fields):
+            return ["request_info"]
+        if self._case.customer_follow_up.status == "pending":
+            return ["wait"]
+        needed: list[str] = []
+        if not self._case.reply:
+            needed.append("draft_reply")
+        if not self._case.internal_note:
+            needed.append("add_internal_note")
+        if self._case.status != self.task.gold_status or self._case.resolution_code != self.task.gold_resolution_code:
+            needed.append("submit")
+        return needed
+    def _risk_flags(self) -> list[str]:
+        flags = list(self.task.risk_flags)
+        if self._current_sla_minutes_remaining is not None and self._current_sla_minutes_remaining <= 30:
+            flags.append("sla_breach_risk")
+        if self.task.ticket.affected_users and self.task.ticket.affected_users >= 1000:
+            flags.append("high_customer_impact")
+        if self.task.ticket.secondary_concerns:
+            flags.append("secondary_issue_present")
+        if self._case.customer_follow_up.status == "partial":
+            flags.append("customer_reply_incomplete")
+        if self._case.customer_follow_up.status == "incorrect":
+            flags.append("customer_reply_irrelevant")
+        return sorted(set(flags))
+    def _process_bonus(
+        self,
+        action: SupportDeskAction,
+        previous_stage: str,
+        current_score: float,
+    ) -> float:
+        bonus = 0.0
+        stage_rank = {
+            "intake": 0,
+            "verification": 1,
+            "awaiting_customer": 2,
+            "follow_up_review": 3,
+            "customer_communication": 4,
+            "internal_handoff": 5,
+            "final_resolution": 6,
+            "ready_to_submit": 7,
+            "closed": 8,
+        }
+        current_stage = self._workflow_stage()
+        if stage_rank.get(current_stage, 0) > stage_rank.get(previous_stage, 0):
+            bonus += 0.02
+        if action.operation == "classify" and self._step_count == 1:
+            if self._case.queue == self.task.gold_queue and self._case.priority == self.task.gold_priority:
+                bonus += 0.03
+        if action.operation == "request_info" and current_score > 0 and self.task.required_requested_fields:
+            bonus += 0.02
+        if action.operation == "wait" and self._case.customer_follow_up.status in {"partial", "complete", "incorrect"}:
+            bonus += 0.02
+        if action.operation == "submit" and not self._required_next_actions():
+            bonus += 0.03
+        if self._current_sla_minutes_remaining is not None and self._current_sla_minutes_remaining > 0:
+            if self.task.gold_priority == "urgent" and self._step_count <= 2 and self._case.queue == self.task.gold_queue:
+                bonus += 0.02
+        return round(bonus, 4)
+    def _mixed_action_penalty(self, action: SupportDeskAction) -> float:
+        allowed_fields = {
+            "classify": {"queue", "priority", "issue_type"},
+            "request_info": {"requested_fields"},
+            "draft_reply": {"reply"},
+            "add_internal_note": {"internal_note"},
+            "submit": {"status", "resolution_code"},
+            "wait": set(),
+        }
+        populated_fields = {
+            "queue": action.queue,
+            "priority": action.priority,
+            "issue_type": action.issue_type,
+            "status": action.status,
+            "resolution_code": action.resolution_code,
+            "requested_fields": action.requested_fields,
+            "reply": action.reply,
+            "internal_note": action.internal_note,
+        }
+        extras = 0
+        for field_name, value in populated_fields.items():
+            if field_name in allowed_fields[action.operation]:
+                continue
+            if value is None:
+                continue
+            if isinstance(value, list) and not value:
+                continue
+            if isinstance(value, str) and not value:
+                continue
+            extras += 1
+        return min(0.06, extras * 0.02)
+    def _escalation_tradeoff_penalty(self) -> float:
+        penalty = 0.0
+        if self._case.queue in self.task.over_escalation_queues and self._case.queue != self.task.gold_queue:
+            penalty += 0.06
+        return round(penalty, 4)

tasks.py CHANGED Viewed

@@ -1,3 +1,405 @@
-"""Compatibility wrapper for the real supportdesk_env package."""
-from supportdesk_env.tasks import *  # noqa: F401,F403

+"""Task registry for the SupportDesk environment."""
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Literal
+from models import KnowledgeSnippet, SupportTicket
+# Closed vocabularies exposed to the agent in every observation
+# (available_queues / available_priorities / available_statuses /
+# available_issue_types).
+# Routing destinations a case can be assigned to.
+ALL_QUEUES = [
+    "billing_ops",
+    "trust_and_safety",
+    "platform_engineering",
+    "compliance_ops",
+    "general_support",
+]
+# Priority levels; presumably listed from least to most urgent.
+ALL_PRIORITIES = ["low", "normal", "high", "urgent"]
+# Case statuses an agent can set when submitting.
+ALL_STATUSES = ["new", "waiting_on_customer", "resolved", "escalated"]
+# Issue categories used when classifying a ticket.
+ALL_ISSUE_TYPES = [
+    "duplicate_charge",
+    "account_compromise",
+    "production_incident",
+    "regulated_exception",
+    "general_question",
+]
+@dataclass(frozen=True)
+class SupportTaskSpec:
+    """Immutable definition of a single support triage task.
+
+    Bundles what the agent sees (ticket, knowledge base, objective) with the
+    gold answers and the episode dynamics (follow-up script, SLA cost per step,
+    escalation rules, step limit).
+    """
+    # Identity and presentation.
+    task_id: str
+    difficulty: Literal["easy", "medium", "hard"]
+    title: str
+    objective: str
+    ticket: SupportTicket
+    knowledge_base: tuple[KnowledgeSnippet, ...]
+    # Gold answers the case is compared against.
+    gold_queue: str
+    gold_priority: str
+    gold_issue_type: str
+    gold_status: str
+    gold_resolution_code: str
+    # Fields the agent must request from the customer; empty means none.
+    required_requested_fields: tuple[str, ...]
+    # Groups of phrases for the reply and the internal note. Presumably one
+    # phrase from each group must appear - verify against the grader.
+    required_reply_markers: tuple[tuple[str, ...], ...]
+    required_note_markers: tuple[tuple[str, ...], ...]
+    # Phrases that presumably must not appear in the reply - verify against the grader.
+    forbidden_reply_markers: tuple[str, ...] = ()
+    # Static risk flags always reported for this task.
+    risk_flags: tuple[str, ...] = ()
+    # Scripted customer follow-up delivered after a request and a wait;
+    # "none" disables the follow-up.
+    follow_up_outcome: Literal["none", "partial", "complete", "incorrect"] = "none"
+    follow_up_message: str = ""
+    follow_up_provided_fields: tuple[str, ...] = ()
+    follow_up_wrong_fields: tuple[str, ...] = ()
+    # SLA minutes consumed by every step.
+    sla_step_cost: int = 15
+    # Queues that count as over-escalation when chosen instead of gold_queue.
+    over_escalation_queues: tuple[str, ...] = ()
+    # Step from which a wrong queue or priority is penalised; None disables it.
+    under_escalation_deadline_step: int | None = None
+    # Maximum number of steps before the episode ends automatically.
+    max_steps: int = 6
+TASKS: dict[str, SupportTaskSpec] = {
+    "billing_refund_easy": SupportTaskSpec(
+        task_id="billing_refund_easy",
+        difficulty="easy",
+        title="Duplicate charge refund triage",
+        objective=(
+            "Triage a duplicate-charge billing ticket, send the correct customer response, "
+            "and close the case only if no further customer information is required."
+        ),
+        ticket=SupportTicket(
+            customer_name="Riya Shah",
+            customer_tier="pro",
+            company="PixelNorth Studio",
+            subject="Charged twice after I canceled",
+            body=(
+                "I canceled our Pro annual workspace yesterday, but my card was charged again "
+                "this morning and I still see the old invoice. We only had one workspace, "
+                "so this looks like a duplicate charge. Please fix it quickly."
+            ),
+            region="ap-south-1",
+            affected_users=12,
+            sla_minutes_remaining=240,
+            business_impact="Finance ops are blocked from closing the monthly books until the duplicate invoice is fixed.",
+            secondary_concerns=["The customer also wants confirmation that the canceled workspace will stay deactivated."],
+            attachments=["invoice_7741.pdf"],
+        ),
+        knowledge_base=(
+            KnowledgeSnippet(
+                article_id="KB-101",
+                title="Duplicate charges and same-day cancellations",
+                content=(
+                    "If a customer reports a duplicate charge and the subscription is already "
+                    "canceled, route the ticket to billing_ops with high priority. Billing can "
+                    "approve the refund immediately without requesting extra information when an "
+                    "invoice is attached."
+                ),
+            ),
+            KnowledgeSnippet(
+                article_id="KB-102",
+                title="Refund communication checklist",
+                content=(
+                    "Customer replies for approved duplicate-charge refunds must confirm that a "
+                    "refund is being processed, mention the duplicate charge, and set the "
+                    "expectation that funds typically appear within 5-7 business days."
+                ),
+            ),
+            KnowledgeSnippet(
+                article_id="KB-103",
+                title="When to close a billing case",
+                content=(
+                    "Close the case as resolved only after the refund path is clear and no more "
+                    "customer details are needed."
+                ),
+            ),
+        ),
+        gold_queue="billing_ops",
+        gold_priority="high",
+        gold_issue_type="duplicate_charge",
+        gold_status="resolved",
+        gold_resolution_code="refund_approved",
+        required_requested_fields=(),
+        required_reply_markers=(
+            ("refund", "refunded", "reimburse"),
+            ("duplicate charge", "charged twice", "double charge"),
+            ("5-7 business days", "5 to 7 business days", "within 7 business days"),
+        ),
+        required_note_markers=(
+            ("duplicate charge", "double charge"),
+            ("refund", "refund approved"),
+        ),
+        forbidden_reply_markers=("chargeback", "security team"),
+        risk_flags=("finance_close_risk", "avoid_unnecessary_back_and_forth"),
+        over_escalation_queues=("trust_and_safety", "platform_engineering", "compliance_ops"),
+        sla_step_cost=10,
+        max_steps=6,
+    ),
+    "account_takeover_medium": SupportTaskSpec(
+        task_id="account_takeover_medium",
+        difficulty="medium",
+        title="Suspicious login recovery triage",
+        objective=(
+            "Handle a potential account-compromise case, request the missing verification "
+            "details, communicate safe next steps, and keep the case open until the customer replies. "
+            "The agent must protect account safety without promising an unsafe immediate unlock."
+        ),
+        ticket=SupportTicket(
+            customer_name="Marcus Lee",
+            customer_tier="pro",
+            company="Northline Analytics",
+            subject="Locked out after strange login alert",
+            body=(
+                "Our workspace admin got a login alert from a country none of us have visited, "
+                "and now I can't get back into the account. Please unlock it ASAP. The billing "
+                "email is still ours, but I'm worried someone got in."
+            ),
+            region="us-east-1",
+            affected_users=34,
+            sla_minutes_remaining=90,
+            business_impact="The admin is locked out of the analytics workspace ahead of the Monday executive review.",
+            secondary_concerns=["The customer wants the account unlocked immediately, but the verification flow cannot be skipped."],
+            attachments=[],
+        ),
+        knowledge_base=(
+            KnowledgeSnippet(
+                article_id="SEC-201",
+                title="Account compromise routing",
+                content=(
+                    "Potential account-takeover reports route to trust_and_safety with urgent "
+                    "priority. Do not resolve the case immediately."
+                ),
+            ),
+            KnowledgeSnippet(
+                article_id="SEC-202",
+                title="Verification details before unlock",
+                content=(
+                    "Before access can be restored, ask the customer for the workspace_id, the "
+                    "last successful login time, and the billing email on file. Keep the status "
+                    "waiting_on_customer until the details arrive."
+                ),
+            ),
+            KnowledgeSnippet(
+                article_id="SEC-203",
+                title="Customer response checklist",
+                content=(
+                    "Security replies should tell the customer to reset their password, scan "
+                    "their device for malware, and explain that the trust team is reviewing the case."
+                ),
+            ),
+        ),
+        gold_queue="trust_and_safety",
+        gold_priority="urgent",
+        gold_issue_type="account_compromise",
+        gold_status="waiting_on_customer",
+        gold_resolution_code="verification_needed",
+        required_requested_fields=("workspace_id", "last_successful_login", "billing_email"),
+        required_reply_markers=(
+            ("reset your password", "change your password"),
+            ("scan", "malware", "device check"),
+            ("trust team", "security team", "trust and safety"),
+        ),
+        required_note_markers=(
+            ("suspicious login", "strange login"),
+            ("locked out", "can't get back", "cannot get back"),
+        ),
+        risk_flags=("unsafe_unlock_request", "identity_verification_required"),
+        follow_up_outcome="partial",
+        follow_up_message=(
+            "Customer follow-up: workspace_id=ws_9021 and billing email confirmed, "
+            "but they could not provide the last successful login time yet."
+        ),
+        follow_up_provided_fields=("workspace_id", "billing_email"),
+        sla_step_cost=18,
+        under_escalation_deadline_step=2,
+        max_steps=7,
+    ),
+    "api_incident_hard": SupportTaskSpec(
+        task_id="api_incident_hard",
+        difficulty="hard",
+        title="Production API incident escalation",
+        objective=(
+            "Triage a high-pressure enterprise incident, ask for the right diagnostics, notify "
+            "the customer that engineering is engaged, and escalate instead of resolving. "
+            "The agent must prioritize the outage over a tempting secondary compliance question."
+        ),
+        ticket=SupportTicket(
+            customer_name="Asha Verma",
+            customer_tier="enterprise",
+            company="Kairo Health",
+            subject="EU rollout blocked by intermittent 500s",
+            body=(
+                "We're launching our EU workspace tonight. Since enabling EU data residency we "
+                "see intermittent HTTP 500 responses from /v1/exports in production. Our "
+                "compliance lead is also asking whether this affects the audit trail, but the "
+                "main issue is the outage. We need help immediately."
+            ),
+            region="eu-west-1",
+            affected_users=1800,
+            sla_minutes_remaining=25,
+            business_impact="A production launch and a customer-facing compliance review are both at risk tonight if the outage persists.",
+            secondary_concerns=["The compliance lead is asking whether audit trails are affected, but the live outage is the primary incident."],
+            attachments=["error_screenshot.png"],
+        ),
+        knowledge_base=(
+            KnowledgeSnippet(
+                article_id="INC-301",
+                title="Production availability incidents",
+                content=(
+                    "Any active production 5xx incident for a paying customer routes to "
+                    "platform_engineering with urgent priority and should be escalated, not resolved."
+                ),
+            ),
+            KnowledgeSnippet(
+                article_id="INC-302",
+                title="Minimum diagnostics for API incidents",
+                content=(
+                    "Before engineering can investigate, request concrete examples including "
+                    "request_ids, UTC timestamps, and the affected region."
+                ),
+            ),
+            KnowledgeSnippet(
+                article_id="INC-303",
+                title="Customer communication during an incident",
+                content=(
+                    "The reply should acknowledge an incident, say the on-call engineering team "
+                    "is engaged, and ask for the diagnostics needed to speed investigation."
+                ),
+            ),
+            KnowledgeSnippet(
+                article_id="INC-304",
+                title="Primary issue triage rule",
+                content=(
+                    "When a production outage appears alongside a secondary compliance or audit "
+                    "question, resolve the live outage first and avoid treating the secondary "
+                    "question as the primary queue-driving issue."
+                ),
+            ),
+        ),
+        gold_queue="platform_engineering",
+        gold_priority="urgent",
+        gold_issue_type="production_incident",
+        gold_status="escalated",
+        gold_resolution_code="incident_opened",
+        required_requested_fields=("request_ids", "timestamp_utc", "region"),
+        required_reply_markers=(
+            ("incident", "outage", "investigating"),
+            ("on-call", "engineering team", "engineering is engaged"),
+            ("request id", "request_ids"),
+            ("utc", "timestamp"),
+        ),
+        required_note_markers=(
+            ("eu data residency", "eu rollout"),
+            ("500", "http 500"),
+            ("launch tonight", "tonight"),
+        ),
+        risk_flags=("sev1_launch_risk", "secondary_issue_distraction", "engineering_escalation_required"),
+        follow_up_outcome="complete",
+        follow_up_message=(
+            "Customer follow-up: request_ids=req_991, req_998; UTC timestamps=2026-04-01T19:20Z, "
+            "2026-04-01T19:27Z; region=eu-west-1."
+        ),
+        follow_up_provided_fields=("request_ids", "timestamp_utc", "region"),
+        sla_step_cost=25,
+        under_escalation_deadline_step=2,
+        max_steps=8,
+    ),
+    "regulated_export_exception_hard": SupportTaskSpec(
+        task_id="regulated_export_exception_hard",
+        difficulty="hard",
+        title="Regulated export exception triage",
+        objective=(
+            "Handle a regulated export-access request where the customer wants a manual bypass. "
+            "The agent must avoid escalating to engineering just because an export error is mentioned, "
+            "route to compliance operations, request the right legal details, and keep the case open "
+            "until the required approvals are supplied."
+        ),
+        ticket=SupportTicket(
+            customer_name="Dr. Elena Kovacs",
+            customer_tier="enterprise",
+            company="NorthSea Clinics",
+            subject="Need urgent export unlock before audit review",
+            body=(
+                "Our audit team needs tonight's export package, but after enabling EU residency the "
+                "export workflow now shows a policy block. Can you temporarily disable the restriction "
+                "or give us a manual bypass just for this audit? We are under pressure and can share "
+                "screenshots if needed."
+            ),
+            region="eu-central-1",
+            affected_users=240,
+            sla_minutes_remaining=55,
+            business_impact="A regulated healthcare audit review may slip tonight if the export path remains blocked.",
+            secondary_concerns=[
+                "The customer suggests an engineering workaround, but legal approval is the real gating factor."
+            ],
+            attachments=["policy_block.png"],
+        ),
+        knowledge_base=(
+            KnowledgeSnippet(
+                article_id="CMP-401",
+                title="Export policy exceptions",
+                content=(
+                    "Requests to bypass region or export restrictions route to compliance_ops. "
+                    "Do not promise a manual engineering workaround before approvals are verified."
+                ),
+            ),
+            KnowledgeSnippet(
+                article_id="CMP-402",
+                title="Minimum legal details for export review",
+                content=(
+                    "Ask for the tenant_region, dpa_amendment_id, and legal_contact_email before "
+                    "compliance can review an export exception."
+                ),
+            ),
+            KnowledgeSnippet(
+                article_id="CMP-403",
+                title="Customer response rules for regulated exceptions",
+                content=(
+                    "Replies must explain that no temporary bypass can be granted yet, mention the "
+                    "compliance review, and request the required legal approval details."
+                ),
+            ),
+        ),
+        gold_queue="compliance_ops",
+        gold_priority="high",
+        gold_issue_type="regulated_exception",
+        gold_status="waiting_on_customer",
+        gold_resolution_code="legal_approval_required",
+        required_requested_fields=("tenant_region", "dpa_amendment_id", "legal_contact_email"),
+        required_reply_markers=(
+            ("no temporary bypass", "cannot provide a bypass", "can’t provide a bypass"),
+            ("compliance review", "compliance team"),
+            ("tenant_region", "tenant region"),
+            ("dpa_amendment_id", "dpa amendment", "amendment id"),
+        ),
+        required_note_markers=(
+            ("audit", "audit review"),
+            ("eu residency", "policy block"),
+            ("manual bypass", "workaround"),
+        ),
+        forbidden_reply_markers=("engineering workaround", "disable the restriction", "temporary unlock approved"),
+        risk_flags=("regulated_data_risk", "unsafe_shortcut_pressure", "over_escalation_risk"),
+        follow_up_outcome="incorrect",
+        follow_up_message=(
+            "Customer follow-up: sent a screenshot and export job ID, but did not include the DPA "
+            "amendment ID or legal contact."
+        ),
+        follow_up_wrong_fields=("screenshot", "job_id"),
+        sla_step_cost=16,
+        over_escalation_queues=("platform_engineering",),
+        max_steps=8,
+    ),
+}
+def get_task(task_id: str) -> SupportTaskSpec:
+    """Look up the task spec registered under ``task_id``.
+
+    Raises:
+        ValueError: If ``task_id`` is not a key of ``TASKS``; the message
+            lists every valid task id in sorted order.
+    """
+    try:
+        spec = TASKS[task_id]
+    except KeyError as missing:  # pragma: no cover - defensive
+        known_ids = ", ".join(sorted(TASKS))
+        message = f"Unknown task_id '{task_id}'. Valid task ids: {known_ids}"
+        raise ValueError(message) from missing
+    return spec
+def list_task_ids() -> list[str]:
+    """Return every registered task id in ``TASKS`` insertion order."""
+    return [task_id for task_id in TASKS]

tests/test_supportdesk.py CHANGED Viewed

@@ -10,10 +10,10 @@ try:
 except RuntimeError:
     TestClient = None  # type: ignore[assignment]
-from supportdesk_env.graders import grade_case
-from supportdesk_env.models import SupportCaseProgress, SupportDeskAction
-from supportdesk_env.server.supportdesk_environment import SupportDeskEnvironment
-from supportdesk_env.tasks import get_task, list_task_ids
 def test_all_tasks_are_registered():
@@ -90,13 +90,13 @@ def test_grade_is_bounded_between_zero_and_one():
 def test_task_specific_graders_are_importable_and_clamped():
-    from supportdesk_env.graders import (
         AccountTakeoverMediumGrader,
         ApiIncidentHardGrader,
         BillingRefundEasyGrader,
         RegulatedExportExceptionHardGrader,
     )
-    from supportdesk_env.models import SupportCaseProgress
     case = SupportCaseProgress()
     scores = [
@@ -176,7 +176,7 @@ def test_follow_up_arrives_after_wait():
 @pytest.mark.skipif(TestClient is None, reason="httpx is not installed for FastAPI TestClient")
 def test_http_reset_step_state_are_session_consistent():
-    from supportdesk_env.server.app import app
     client = TestClient(app)
@@ -219,7 +219,7 @@ def test_http_reset_step_state_are_session_consistent():
 @pytest.mark.skipif(TestClient is None, reason="httpx is not installed for FastAPI TestClient")
 def test_http_explicit_episode_helpers_work():
-    from supportdesk_env.server.app import app
     client = TestClient(app)
@@ -256,7 +256,7 @@ def test_http_explicit_episode_helpers_work():
 @pytest.mark.skipif(TestClient is None, reason="httpx is not installed for FastAPI TestClient")
 def test_http_tasks_include_truthy_grader_field():
-    from supportdesk_env.server.app import app
     client = TestClient(app)

 except RuntimeError:
     TestClient = None  # type: ignore[assignment]
+from graders import grade_case
+from models import SupportCaseProgress, SupportDeskAction
+from server.supportdesk_environment import SupportDeskEnvironment
+from tasks import get_task, list_task_ids
 def test_all_tasks_are_registered():
 def test_task_specific_graders_are_importable_and_clamped():
+    from graders import (
         AccountTakeoverMediumGrader,
         ApiIncidentHardGrader,
         BillingRefundEasyGrader,
         RegulatedExportExceptionHardGrader,
     )
+    from models import SupportCaseProgress
     case = SupportCaseProgress()
     scores = [
 @pytest.mark.skipif(TestClient is None, reason="httpx is not installed for FastAPI TestClient")
 def test_http_reset_step_state_are_session_consistent():
+    from server.app import app
     client = TestClient(app)
 @pytest.mark.skipif(TestClient is None, reason="httpx is not installed for FastAPI TestClient")
 def test_http_explicit_episode_helpers_work():
+    from server.app import app
     client = TestClient(app)
 @pytest.mark.skipif(TestClient is None, reason="httpx is not installed for FastAPI TestClient")
 def test_http_tasks_include_truthy_grader_field():
+    from server.app import app
     client = TestClient(app)