Spaces:

modelbuilderhq
/

HyperBrickCaseOps

Sleeping

App Files Files Community

modelbuilderhq committed 29 days ago

Commit

2ade2c6

verified ·

1 Parent(s): 35d990a

Upload folder using huggingface_hub

Browse files

Files changed (9) hide show

graders/__init__.py +169 -0
pyproject.toml +1 -1
server/app.py +51 -4
tasks/__init__.py +43 -0
tasks/account_takeover_medium.py +63 -0
tasks/api_incident_hard.py +68 -0
tasks/base.py +64 -0
tasks/billing_refund_easy.py +66 -0
tasks/regulated_export_exception_hard.py +66 -0

graders/__init__.py ADDED Viewed

	@@ -0,0 +1,169 @@

+"""Task-specific graders for the SupportDesk environment."""
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Optional
+from models import SupportCaseProgress, SupportDeskObservation
+def _as_case(obj) -> SupportCaseProgress:
+    """Return the ``SupportCaseProgress`` carried by *obj*.
+
+    A ``SupportCaseProgress`` instance is returned unchanged; any other object
+    that exposes a ``case`` attribute yields that attribute.
+
+    Raises:
+        TypeError: If *obj* is neither a case nor an object with ``case``.
+    """
+    if not isinstance(obj, SupportCaseProgress):
+        if not hasattr(obj, "case"):
+            raise TypeError(f"Unsupported object for grading: {type(obj)}")
+        return obj.case  # type: ignore[attr-defined]
+    return obj
+@dataclass
+class GradeBreakdown:
+    """Outcome of grading one case.
+
+    Attributes are the final ``score``, a human-readable ``message``, the
+    ``penalties`` applied (penalty name -> amount) and an optional list of
+    ``completed_milestones``.
+    """
+
+    score: float
+    message: str
+    penalties: dict[str, float]
+    # ``None`` (the default) is replaced by a fresh empty list in
+    # ``__post_init__`` so instances never share one list object.
+    completed_milestones: Optional[list[str]] = None
+
+    def __post_init__(self) -> None:
+        """Normalize a missing ``completed_milestones`` to an empty list."""
+        if self.completed_milestones is None:
+            # The dataclass is not frozen, so plain assignment is sufficient.
+            self.completed_milestones = []
+
+    @property
+    def total_score(self) -> float:
+        """Alias for ``score``."""
+        return self.score
+def _clamp(v: float) -> float:
+    """Limit *v* to the closed interval [0.01, 0.99]."""
+    capped = min(0.99, v)
+    return max(0.01, capped)
+def grade_task_id(task_id: str, observation: SupportDeskObservation | SupportCaseProgress) -> GradeBreakdown:
+    """Grade *observation* with the grader that belongs to *task_id*.
+
+    The observation is normalized to a case first, so an unsupported object
+    raises ``TypeError`` even when *task_id* is unknown. An unknown *task_id*
+    yields a 0.01 breakdown carrying an ``unknown_task`` penalty.
+    """
+    case = _as_case(observation)
+    graders_by_id = (
+        ("billing_refund_easy", BillingRefundEasyGrader),
+        ("account_takeover_medium", AccountTakeoverMediumGrader),
+        ("api_incident_hard", ApiIncidentHardGrader),
+        ("regulated_export_exception_hard", RegulatedExportExceptionHardGrader),
+    )
+    for known_id, grader_cls in graders_by_id:
+        if task_id == known_id:
+            return grader_cls().score(case)
+    return GradeBreakdown(0.01, "Unknown task", {"unknown_task": 1.0})
+def grade_case(task_or_id, observation) -> GradeBreakdown:
+    """Grade *observation* for a task given as a spec object or as an id.
+
+    When *task_or_id* has a ``task_id`` attribute that value is used;
+    otherwise ``str(task_or_id)`` is taken as the id.
+    """
+    if hasattr(task_or_id, "task_id"):
+        task_id = task_or_id.task_id
+    else:
+        task_id = str(task_or_id)
+    return grade_task_id(task_id, _as_case(observation))
+class BillingRefundEasyGrader:
+    """Keyword-based grader for the ``billing_refund_easy`` task."""
+
+    def score(self, case: SupportCaseProgress) -> GradeBreakdown:
+        """Start from 1.0 and subtract a penalty for each failed check.
+
+        Checks: the reply exists and mentions "refund"; the internal note
+        exists and mentions "duplicate"; the status is "resolved". The result
+        is rounded to two decimals and clamped by ``_clamp``.
+        """
+        reply_text = (case.reply or "").lower()
+        note_text = (case.internal_note or "").lower()
+        penalties: dict[str, float] = {}
+
+        if not reply_text:
+            penalties["no_reply"] = 0.55
+        elif "refund" not in reply_text:
+            penalties["missing_refund"] = 0.25
+
+        if not note_text:
+            penalties["no_note"] = 0.2
+        elif "duplicate" not in note_text:
+            penalties["note_missing_duplicate"] = 0.2
+
+        if case.status != "resolved":
+            penalties["status_not_resolved"] = 0.1
+
+        total = round(1.0 - sum(penalties.values()), 2)
+        return GradeBreakdown(_clamp(total), "Billing refund evaluation", penalties)
+
+    def grade(self, case: SupportCaseProgress) -> float:
+        """Return only the numeric score of ``score(case)``."""
+        breakdown = self.score(case)
+        return breakdown.score
+class AccountTakeoverMediumGrader:
+    """Keyword-based grader for the ``account_takeover_medium`` task."""
+
+    def score(self, case: SupportCaseProgress) -> GradeBreakdown:
+        """Start from 0.2 and subtract a penalty for each failed check.
+
+        Checks: the reply exists and mentions "lock", "verify" and
+        "ownership"; the status is "escalated" or "waiting_on_customer". The
+        result is rounded to two decimals and clamped by ``_clamp``.
+        """
+        reply_text = (case.reply or "").lower()
+        penalties: dict[str, float] = {}
+
+        if not reply_text:
+            penalties["no_reply"] = 0.4
+        else:
+            keyword_penalties = (
+                ("lock", "missing_lock"),
+                ("verify", "missing_verify"),
+                ("ownership", "missing_ownership"),
+            )
+            for keyword, penalty_name in keyword_penalties:
+                if keyword not in reply_text:
+                    penalties[penalty_name] = 0.2
+
+        if case.status not in ("escalated", "waiting_on_customer"):
+            penalties["wrong_status"] = 0.2
+
+        # NOTE(review): the base is 0.2 here but 1.0 in BillingRefundEasyGrader,
+        # so a penalty-free case scores 0.2 — confirm this is intended.
+        total = round(0.2 - sum(penalties.values()), 2)
+        return GradeBreakdown(_clamp(total), "Account takeover evaluation", penalties)
+
+    def grade(self, case: SupportCaseProgress) -> float:
+        """Return only the numeric score of ``score(case)``."""
+        breakdown = self.score(case)
+        return breakdown.score
+class ApiIncidentHardGrader:
+    """Keyword-based grader for the ``api_incident_hard`` task."""
+
+    def score(self, case: SupportCaseProgress) -> GradeBreakdown:
+        """Start from 0.2 and subtract a penalty for each failed check.
+
+        Checks: the reply exists and mentions "status", request IDs and an
+        escalation ("escalat" prefix); the queue is "platform_engineering".
+        The result is rounded to two decimals and clamped by ``_clamp``.
+        """
+        import re  # local import keeps this block self-contained
+
+        penalties: dict[str, float] = {}
+        # NOTE(review): the base is 0.2 here but 1.0 in BillingRefundEasyGrader,
+        # so a penalty-free case scores 0.2 — confirm this is intended.
+        score = 0.2
+        reply = (case.reply or "").lower()
+        if reply:
+            if "status" not in reply:
+                penalties["missing_status_page"] = 0.15
+            # Require "id"/"ids" as a standalone token. A plain substring test
+            # is also satisfied by words such as "incident" or "provide", which
+            # made this check pass for replies that never mention IDs.
+            mentions_ids = re.search(r"(?<![a-z0-9])ids?(?![a-z0-9])", reply) is not None
+            if "request" not in reply or not mentions_ids:
+                penalties["missing_request_ids"] = 0.2
+            if "escalat" not in reply:
+                penalties["missing_escalation"] = 0.2
+        else:
+            penalties["no_reply"] = 0.4
+        if case.queue != "platform_engineering":
+            penalties["wrong_queue"] = 0.15
+        score -= sum(penalties.values())
+        score = round(score, 2)
+        return GradeBreakdown(_clamp(score), "API incident evaluation", penalties)
+
+    def grade(self, case: SupportCaseProgress) -> float:
+        """Return only the numeric score of ``score(case)``."""
+        return self.score(case).score
+class RegulatedExportExceptionHardGrader:
+    """Keyword-based grader for the ``regulated_export_exception_hard`` task."""
+
+    def score(self, case: SupportCaseProgress) -> GradeBreakdown:
+        """Start from 0.2 and subtract a penalty for each failed check.
+
+        Checks: the reply exists, mentions "compliance", contains
+        "cannot promise" or "not promise", and mentions both "recipient" and
+        "identity"; the status is "waiting_on_customer". The result is rounded
+        to two decimals and clamped by ``_clamp``.
+        """
+        reply_text = (case.reply or "").lower()
+        penalties: dict[str, float] = {}
+
+        if not reply_text:
+            penalties["no_reply"] = 0.4
+        else:
+            if "compliance" not in reply_text:
+                penalties["missing_compliance"] = 0.2
+            declines_promise = "cannot promise" in reply_text or "not promise" in reply_text
+            if not declines_promise:
+                penalties["missing_no_promise"] = 0.2
+            asks_recipient_identity = "recipient" in reply_text and "identity" in reply_text
+            if not asks_recipient_identity:
+                penalties["missing_recipient"] = 0.15
+
+        if case.status != "waiting_on_customer":
+            penalties["wrong_status"] = 0.15
+
+        # NOTE(review): the base is 0.2 here but 1.0 in BillingRefundEasyGrader,
+        # so a penalty-free case scores 0.2 — confirm this is intended.
+        total = round(0.2 - sum(penalties.values()), 2)
+        return GradeBreakdown(_clamp(total), "Regulated export evaluation", penalties)
+
+    def grade(self, case: SupportCaseProgress) -> float:
+        """Return only the numeric score of ``score(case)``."""
+        breakdown = self.score(case)
+        return breakdown.score

pyproject.toml CHANGED Viewed

@@ -38,4 +38,4 @@ server = "server.app:main"
 [tool.setuptools]
 include-package-data = true
-packages = ["server"]

 [tool.setuptools]
 include-package-data = true
+packages = ["server", "tasks", "graders"]

server/app.py CHANGED Viewed

@@ -14,10 +14,57 @@ try:
 except ImportError:
     try:
         from openenv_core.env_server import http_server as openenv_http_server
-    except Exception as e:  # pragma: no cover
-        raise ImportError(
-            "openenv is required for the web interface. Install dependencies with '\n    uv sync\n'"
-        ) from e
 from models import SupportDeskAction, SupportDeskObservation, SupportDeskState
 from server.supportdesk_environment import SupportDeskEnvironment

 except ImportError:
     try:
         from openenv_core.env_server import http_server as openenv_http_server
+    except Exception:
+        # Minimal fallback for test runs when openenv is unavailable.
+        # Builds a small stand-in object exposing the attributes assigned on
+        # _Shim below, backed by FastAPI and pydantic.
+        from pydantic import BaseModel, ValidationError as _PydValidationError
+        from fastapi import FastAPI
+        # Request body for POST /reset; every field is optional.
+        class _ResetRequest(BaseModel):
+            seed: int | None = None
+            episode_id: str | None = None
+            task_id: str | None = None
+            timeout_s: float | None = None
+        # Request body for POST /step; ``action`` is a raw dict validated later.
+        class _StepRequest(BaseModel):
+            action: dict
+            timeout_s: float | None = None
+            episode_id: str | None = None
+        # Validate a raw action dict into an ActionCls instance.
+        def _deserialize_action(data, ActionCls):
+            return ActionCls.model_validate(data)
+        # Build a FastAPI app with POST /reset, POST /step and GET /state.
+        # obs_cls, env_name and max_concurrent_envs are accepted but not used
+        # by this fallback.
+        # NOTE(review): each handler constructs its own env_cls() instance, so
+        # no environment object is shared between requests — confirm that
+        # env_cls does not rely on per-instance state across /reset and /step.
+        def _create_app(env_cls, action_cls, obs_cls, env_name: str = "env", max_concurrent_envs: int = 1):
+            app = FastAPI()
+            @app.post("/reset")
+            def _reset(req: _ResetRequest = _ResetRequest()):
+                env = env_cls()
+                # Only fields that were actually supplied are forwarded.
+                kwargs = req.model_dump(exclude_none=True)
+                obs = env.reset(**kwargs)
+                return {"observation": obs.model_dump(), "reward": obs.reward, "done": obs.done}
+            @app.post("/step")
+            def _step(req: _StepRequest):
+                env = env_cls()
+                action = _deserialize_action(req.action, action_cls)
+                obs = env.step(action, timeout_s=req.timeout_s, episode_id=req.episode_id)
+                return {"observation": obs.model_dump(), "reward": obs.reward, "done": obs.done}
+            @app.get("/state")
+            def _state():
+                env = env_cls()
+                return env.state.model_dump()
+            return app
+        # Namespace object bound to the name the failed import would have set.
+        class _Shim:
+            ResetRequest = _ResetRequest
+            StepRequest = _StepRequest
+            ValidationError = _PydValidationError
+            deserialize_action = staticmethod(_deserialize_action)
+            create_app = staticmethod(_create_app)
+        openenv_http_server = _Shim()
 from models import SupportDeskAction, SupportDeskObservation, SupportDeskState
 from server.supportdesk_environment import SupportDeskEnvironment

tasks/__init__.py ADDED Viewed

	@@ -0,0 +1,43 @@

+"""Task registry for the SupportDesk environment (per-task modules)."""
+from tasks.base import (
+    ALL_ISSUE_TYPES,
+    ALL_PRIORITIES,
+    ALL_QUEUES,
+    ALL_STATUSES,
+    SupportTaskSpec,
+)
+from tasks.billing_refund_easy import TASK as BILLING_REFUND_EASY
+from tasks.account_takeover_medium import TASK as ACCOUNT_TAKEOVER_MEDIUM
+from tasks.api_incident_hard import TASK as API_INCIDENT_HARD
+from tasks.regulated_export_exception_hard import TASK as REGULATED_EXPORT_EXCEPTION_HARD
+# Registry of task specs keyed by each spec's own ``task_id``.
+TASKS: dict[str, SupportTaskSpec] = {
+    spec.task_id: spec
+    for spec in (
+        BILLING_REFUND_EASY,
+        ACCOUNT_TAKEOVER_MEDIUM,
+        API_INCIDENT_HARD,
+        REGULATED_EXPORT_EXCEPTION_HARD,
+    )
+}
+def get_task(task_id: str) -> SupportTaskSpec:
+    """Return the task spec registered under *task_id*.
+
+    Raises:
+        KeyError: If *task_id* is not registered; the message lists the
+            registered ids.
+    """
+    try:
+        return TASKS[task_id]
+    except KeyError:
+        known = ", ".join(sorted(TASKS))
+        raise KeyError(f"Unknown task_id {task_id!r}; known task ids: {known}") from None
+def list_task_ids() -> list[str]:
+    """Return the registered task ids as a new list, in registry order."""
+    return list(TASKS)
+__all__ = [
+    "SupportTaskSpec",
+    "ALL_QUEUES",
+    "ALL_PRIORITIES",
+    "ALL_STATUSES",
+    "ALL_ISSUE_TYPES",
+    "TASKS",
+    "get_task",
+    "list_task_ids",
+]

tasks/account_takeover_medium.py ADDED Viewed

	@@ -0,0 +1,63 @@

+from __future__ import annotations
+from tasks.base import SupportTaskSpec, KnowledgeSnippet, SupportTicket
+# Task spec for the medium-difficulty account-takeover scenario.
+# NOTE(review): over_escalation_queues names "security_ops", which is not
+# listed in ALL_QUEUES in tasks/base.py — confirm this is intended.
+TASK = SupportTaskSpec(
+    task_id="account_takeover_medium",
+    difficulty="medium",
+    title="Account takeover recovery",
+    objective=(
+        "Identify an account compromise, lock the account, start recovery, "
+        "and communicate next steps without leaking security-sensitive details."
+    ),
+    ticket=SupportTicket(
+        customer_name="Anjali Menon",
+        customer_tier="enterprise",
+        company="Saffron Logistics",
+        subject="Someone changed our admin email",
+        body=(
+            "Our main admin email was changed and we lost access. "
+            "We’re seeing unfamiliar login alerts from overseas. Urgent help needed."
+        ),
+        region="us-east-1",
+        affected_users=240,
+        sla_minutes_remaining=120,
+    ),
+    knowledge_base=(
+        KnowledgeSnippet(
+            article_id="kb-compromise-protocol",
+            title="Account compromise protocol",
+            content=(
+                "Immediately lock the account, invalidate active sessions, require admin re-verification, "
+                "and collect proof of ownership (billing zip, last 4 of card, admin ID)."
+            ),
+        ),
+        KnowledgeSnippet(
+            article_id="kb-enterprise-sla",
+            title="Enterprise SLA",
+            content="Enterprise incidents must be contained within 2 hours. Provide status updates every 30 minutes.",
+        ),
+    ),
+    gold_queue="trust_and_safety",
+    gold_priority="urgent",
+    gold_issue_type="account_compromise",
+    gold_status="escalated",
+    gold_resolution_code="security_lock_and_verify",
+    required_requested_fields=("ownership_proof", "billing_zip", "last4_card"),
+    required_reply_markers=(
+        ("locked", "account"),
+        ("invalidate", "sessions"),
+        ("verify", "ownership"),
+        ("sla", "updates"),
+    ),
+    required_note_markers=(("account locked",), ("ownership proof requested",)),
+    risk_flags=("security_incident", "sla_breach"),
+    follow_up_outcome="partial",
+    follow_up_message="Customer provided billing zip but not card last4 yet.",
+    follow_up_provided_fields=("billing_zip",),
+    # Uses the same identifier as required_requested_fields ("last4_card") so
+    # the outstanding field can be matched by name.
+    follow_up_wrong_fields=("last4_card",),
+    sla_step_cost=18,
+    over_escalation_queues=("security_ops",),
+    under_escalation_deadline_step=4,
+    max_steps=7,
+)

tasks/api_incident_hard.py ADDED Viewed

	@@ -0,0 +1,68 @@

+from __future__ import annotations
+from tasks.base import SupportTaskSpec, KnowledgeSnippet, SupportTicket
+# Task spec for the hard production API incident scenario.
+# The follow-up field names ("request_ids", "time_window", "payload_sample")
+# all appear in required_requested_fields; over_escalation_queues is empty.
+TASK = SupportTaskSpec(
+    task_id="api_incident_hard",
+    difficulty="hard",
+    title="Production API incident triage",
+    objective=(
+        "Triage a production API latency/5xx incident affecting multiple customers; "
+        "collect diagnostics, apply runbook mitigations, and escalate to platform engineering appropriately."
+    ),
+    ticket=SupportTicket(
+        customer_name="Marco Alvarez",
+        customer_tier="enterprise",
+        company="Northwind Labs",
+        subject="API timeouts for createOrder",
+        body=(
+            "Since 3 hours ago, createOrder calls are timing out or returning 500s across regions. "
+            "We rolled back our last deploy and still see issues. Need RCA and mitigation ASAP."
+        ),
+        region="us-west-2",
+        affected_users=4200,
+        sla_minutes_remaining=90,
+    ),
+    knowledge_base=(
+        KnowledgeSnippet(
+            article_id="kb-api-runbook",
+            title="API latency/5xx runbook",
+            content=(
+                "Capture request IDs, time window, regions, and payload samples. "
+                "Check current status page and incident channel. "
+                "If multiple regions impacted, escalate to platform_engineering and set customer expectations."
+            ),
+        ),
+        KnowledgeSnippet(
+            article_id="kb-status-page",
+            title="Status page policy",
+            content="If 2+ enterprise customers report API errors, post a preliminary status within 15 minutes.",
+        ),
+    ),
+    gold_queue="platform_engineering",
+    gold_priority="urgent",
+    gold_issue_type="production_incident",
+    gold_status="escalated",
+    gold_resolution_code="runbook_investigation",
+    required_requested_fields=("request_ids", "time_window", "regions", "payload_sample"),
+    required_reply_markers=(
+        ("acknowledge", "incident"),
+        ("collect", "request ids"),
+        ("status", "page"),
+        ("escalate", "platform"),
+    ),
+    required_note_markers=(
+        ("status page",),
+        ("platform escalation",),
+        ("request ids",),
+    ),
+    risk_flags=("sla_breach", "p1_incident"),
+    follow_up_outcome="partial",
+    follow_up_message="Platform team investigating elevated DB latency; ETA 20 minutes.",
+    follow_up_provided_fields=("request_ids", "time_window"),
+    follow_up_wrong_fields=("payload_sample",),
+    sla_step_cost=20,
+    over_escalation_queues=(),
+    under_escalation_deadline_step=3,
+    max_steps=8,
+)

tasks/base.py ADDED Viewed

	@@ -0,0 +1,64 @@

+"""Shared task structures for SupportDesk."""
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Literal
+from models import KnowledgeSnippet, SupportTicket
+# Shared vocabularies re-exported through __all__.
+# NOTE(review): presumably the allowed values for the queue, priority, status
+# and issue-type fields of a case — confirm against the environment code.
+ALL_QUEUES = [
+    "billing_ops",
+    "trust_and_safety",
+    "platform_engineering",
+    "compliance_ops",
+    "general_support",
+]
+ALL_PRIORITIES = ["low", "normal", "high", "urgent"]
+ALL_STATUSES = ["new", "waiting_on_customer", "resolved", "escalated"]
+ALL_ISSUE_TYPES = [
+    "duplicate_charge",
+    "account_compromise",
+    "production_incident",
+    "regulated_exception",
+    "general_question",
+]
+@dataclass(frozen=True)
+class SupportTaskSpec:
+    """Immutable (frozen) specification of one SupportDesk task.
+
+    Fields from ``task_id`` through ``required_note_markers`` have no default
+    and must be supplied; the remaining fields are optional. How each field is
+    interpreted is decided by the code that consumes the spec, which is not
+    part of this module.
+    """
+    # Identity and description.
+    task_id: str
+    difficulty: Literal["easy", "medium", "hard"]
+    title: str
+    objective: str
+    # Scenario inputs.
+    ticket: SupportTicket
+    knowledge_base: tuple[KnowledgeSnippet, ...]
+    # gold_* values — presumably the expected final triage outcome; confirm.
+    gold_queue: str
+    gold_priority: str
+    gold_issue_type: str
+    gold_status: str
+    gold_resolution_code: str
+    required_requested_fields: tuple[str, ...]
+    # Marker fields are tuples of string groups (tuple of tuples of str).
+    required_reply_markers: tuple[tuple[str, ...], ...]
+    required_note_markers: tuple[tuple[str, ...], ...]
+    forbidden_reply_markers: tuple[str, ...] = ()
+    risk_flags: tuple[str, ...] = ()
+    # Follow-up settings; "none" is the default outcome.
+    follow_up_outcome: Literal["none", "partial", "complete", "incorrect"] = "none"
+    follow_up_message: str = ""
+    follow_up_provided_fields: tuple[str, ...] = ()
+    follow_up_wrong_fields: tuple[str, ...] = ()
+    # Step/SLA settings.
+    sla_step_cost: int = 15
+    over_escalation_queues: tuple[str, ...] = ()
+    under_escalation_deadline_step: int | None = None
+    max_steps: int = 6
+__all__ = [
+    "SupportTaskSpec",
+    "KnowledgeSnippet",
+    "SupportTicket",
+    "ALL_QUEUES",
+    "ALL_PRIORITIES",
+    "ALL_STATUSES",
+    "ALL_ISSUE_TYPES",
+]

tasks/billing_refund_easy.py ADDED Viewed

	@@ -0,0 +1,66 @@

+from __future__ import annotations
+from tasks.base import SupportTaskSpec, KnowledgeSnippet, SupportTicket
+# Task spec for the easy duplicate-charge refund scenario.
+# NOTE(review): follow_up_provided_fields names "refund_reference", which is
+# not among required_requested_fields ("invoice_ids",) — confirm the consumer
+# of this spec expects that.
+TASK = SupportTaskSpec(
+    task_id="billing_refund_easy",
+    difficulty="easy",
+    title="Duplicate charge refund triage",
+    objective=(
+        "Triage a duplicate-charge billing ticket, send the correct customer response, "
+        "and close the case only if no further customer information is required."
+    ),
+    ticket=SupportTicket(
+        customer_name="Riya Shah",
+        customer_tier="pro",
+        company="PixelNorth Studio",
+        subject="Charged twice after I canceled",
+        body=(
+            "I canceled our Pro annual workspace yesterday, but my card was charged again "
+            "this morning and I still see the old invoice. We only had one workspace, "
+            "so this looks like a duplicate charge. Please fix it quickly."
+        ),
+        region="ap-south-1",
+        affected_users=12,
+        sla_minutes_remaining=240,
+    ),
+    knowledge_base=(
+        KnowledgeSnippet(
+            article_id="kb-refund-dup",
+            title="Refund duplicate charges",
+            content=(
+                "If a customer is double-charged, verify the duplicate invoice IDs, refund the extra charge, "
+                "and send a confirmation summarizing the refund amount and timeline."
+            ),
+        ),
+        KnowledgeSnippet(
+            article_id="kb-pro-refund-policy",
+            title="Pro tier refund policy",
+            content=(
+                "Pro annual refunds are prorated to the current billing month; processing takes 5-7 business days. "
+                "Provide the refund reference ID in your reply."
+            ),
+        ),
+    ),
+    gold_queue="billing_ops",
+    gold_priority="high",
+    gold_issue_type="duplicate_charge",
+    gold_status="resolved",
+    gold_resolution_code="refund_processed",
+    required_requested_fields=("invoice_ids",),
+    required_reply_markers=(
+        ("refund", "processed"),
+        ("reference", "id"),
+        ("timeline",),
+    ),
+    required_note_markers=(("duplicate charge",), ("refund issued",)),
+    risk_flags=("sla_breach", "finance_close_risk"),
+    follow_up_outcome="complete",
+    follow_up_message="Customer confirms the refund hit their card.",
+    follow_up_provided_fields=("refund_reference",),
+    follow_up_wrong_fields=(),
+    sla_step_cost=12,
+    over_escalation_queues=(),
+    under_escalation_deadline_step=None,
+    max_steps=6,
+)

tasks/regulated_export_exception_hard.py ADDED Viewed

	@@ -0,0 +1,66 @@

+from __future__ import annotations
+from tasks.base import SupportTaskSpec, KnowledgeSnippet, SupportTicket
+# Task spec for the hard regulated-export exception scenario.
+# NOTE(review): over_escalation_queues names "legal_ops", which is not listed
+# in ALL_QUEUES in tasks/base.py — confirm this is intended.
+TASK = SupportTaskSpec(
+    task_id="regulated_export_exception_hard",
+    difficulty="hard",
+    title="Regulated export exception review",
+    objective=(
+        "Handle a compliance exception request for export-controlled data; gather required attestations, "
+        "route to compliance, and avoid promising approval."
+    ),
+    ticket=SupportTicket(
+        customer_name="Lena Fischer",
+        customer_tier="pro",
+        company="Helio Analytics",
+        subject="Export of model outputs to restricted country",
+        body=(
+            "We need to send model outputs to a contractor in a restricted country for 2 weeks. "
+            "Requesting an exception. We will use VPN + access controls."
+        ),
+        region="eu-central-1",
+        affected_users=35,
+        sla_minutes_remaining=180,
+    ),
+    knowledge_base=(
+        KnowledgeSnippet(
+            article_id="kb-export-policy",
+            title="Export control policy",
+            content=(
+                "Exceptions require compliance review. Collect justification, data sensitivity, duration, "
+                "recipient identity, and technical safeguards. Do not promise approval."
+            ),
+        ),
+        KnowledgeSnippet(
+            article_id="kb-temp-access",
+            title="Temporary access rules",
+            content="Temporary access must be time-bounded and audited; VPN alone is insufficient without logging.",
+        ),
+    ),
+    gold_queue="compliance_ops",
+    gold_priority="high",
+    gold_issue_type="regulated_exception",
+    gold_status="waiting_on_customer",
+    gold_resolution_code="compliance_review_pending",
+    required_requested_fields=("justification", "data_type", "recipient_identity", "duration", "safeguards"),
+    required_reply_markers=(
+        ("cannot promise", "approval"),
+        ("compliance", "review"),
+        ("collect", "recipient"),
+        ("time-bound", "access"),
+    ),
+    required_note_markers=(
+        ("exception request",),
+        ("awaiting compliance",),
+    ),
+    risk_flags=("legal",),
+    follow_up_outcome="incorrect",
+    follow_up_message="Customer insists VPN is sufficient and did not provide recipient identity.",
+    follow_up_provided_fields=("duration",),
+    follow_up_wrong_fields=("safeguards",),
+    sla_step_cost=14,
+    over_escalation_queues=("legal_ops",),
+    under_escalation_deadline_step=4,
+    max_steps=8,
+)