Spaces:

XcodeAddy
/

sentinel-env

Running

App Files Files Community

XcodeAddy committed 20 days ago

Commit

aad7819

1 Parent(s): 6d6dbaf

Harden backend session and reward constants

Browse files

Files changed (17) hide show

Dockerfile +1 -0
README.md +14 -4
app.py +98 -9
environment.py +27 -5
graders.py +4 -3
inference.py +5 -2
openenv.yaml +16 -1
scripts/backend_walkthrough.py +3 -2
sentinel_config.py +26 -0
specialists.py +61 -14
task_graph.py +12 -6
tests/test_app.py +35 -0
tests/test_environment.py +18 -0
tests/test_specialists.py +9 -0
training/evaluate.py +7 -2
training/train.py +6 -1
trust_ledger.py +1 -3

Dockerfile CHANGED Viewed

@@ -27,6 +27,7 @@ COPY trust_ledger.py .
 COPY task_graph.py .
 COPY comms_bus.py .
 COPY mission_context.py .
 COPY scenarios.py .
 COPY openenv.yaml .
 COPY inference.py .

 COPY task_graph.py .
 COPY comms_bus.py .
 COPY mission_context.py .
+COPY sentinel_config.py .
 COPY scenarios.py .
 COPY openenv.yaml .
 COPY inference.py .

README.md CHANGED Viewed

@@ -70,6 +70,9 @@ curl "http://localhost:7860/mission?task_type=task3"
 - Specialists: 5 scripted FSM agents with shuffled hidden profiles
 - Rewards: per-step reward plus terminal score, normalized to `0.0-1.0`
 - Dataset: 120 abstract multi-agent scenarios
 ## Live Submission Targets
@@ -85,10 +88,10 @@ curl "http://localhost:7860/mission?task_type=task3"
 Hidden profiles:
-- `AccurateSlow`: 90 percent accurate, costs extra steps.
 - `OverconfidentFast`: quick and confident, wrong 40 percent of the time.
 - `DomainBound`: strong on analysis/verification, weak elsewhere.
-- `Adversarial`: benign at low stakes, poisons high-stakes steps.
 - `Degrading`: strong early, gets worse with reuse.
 ## Tasks
@@ -103,6 +106,13 @@ Hidden profiles:
 Rewards are deterministic and boundary-exclusive per step: `(0.01, 0.99)`.
 Task 3 terminal score:
 ```text
@@ -112,7 +122,7 @@ Task 3 terminal score:
 + 0.10 * efficiency
 ```
-The episode `score` exposed in `info` and inference logs is normalized to `0.0-1.0`.
 ## API
@@ -242,7 +252,7 @@ pip install pytest
 Run checks:
 ```bash
-python -m py_compile app.py server/app.py environment.py models.py graders.py specialists.py trust_ledger.py task_graph.py scenarios.py inference.py comms_bus.py mission_context.py training/evaluate.py training/train.py scripts/backend_walkthrough.py
 python -m pytest -q
 python inference.py
 python training/evaluate.py --episodes 20 --task all --plot outputs/baseline_comparison.png

 - Specialists: 5 scripted FSM agents with shuffled hidden profiles
 - Rewards: per-step reward plus terminal score, normalized to `0.0-1.0`
 - Dataset: 120 abstract multi-agent scenarios
+- Session store: single-process memory with TTL/LRU cleanup
+Deployment contract: run one server worker for the submitted Space. Active `SentinelEnv` objects live in process memory, so multi-worker deployments need sticky sessions or a shared store such as Redis. The Dockerfile intentionally starts uvicorn with `--workers 1`.
 ## Live Submission Targets
 Hidden profiles:
+- `AccurateSlow`: 90 percent accurate, costs 2 steps.
 - `OverconfidentFast`: quick and confident, wrong 40 percent of the time.
 - `DomainBound`: strong on analysis/verification, weak elsewhere.
+- `Adversarial`: benign below stakes `0.70`, poisons at stakes `>=0.70`.
 - `Degrading`: strong early, gets worse with reuse.
 ## Tasks
 Rewards are deterministic and boundary-exclusive per step: `(0.01, 0.99)`.
+Shared threshold/cost constants live in `sentinel_config.py`:
+- `ADVERSARIAL_TRIGGER_STAKES = 0.70`: adversarial specialist starts poisoning.
+- `ADVERSARIAL_AWARENESS_STAKES = 0.70`: graders reward verification/evasion.
+- `CRITICAL_POISON_STAKES = 0.85`: unchecked poison ends the episode.
+- `VERIFY_EXTRA_STEP_COST = 1`: verify cost is specialist step cost plus one.
 Task 3 terminal score:
 ```text
 + 0.10 * efficiency
 ```
+The episode `score` exposed in `info` and inference logs is the mean reward over emitted grading events, normalized to `0.0-1.0`. It is intentionally not raw cumulative return; terminal reward and efficiency terms carry the penalty for unfinished or wasteful episodes while keeping scores comparable across tasks with different horizons.
 ## API
 Run checks:
 ```bash
+python -m py_compile app.py server/app.py environment.py models.py graders.py specialists.py trust_ledger.py task_graph.py scenarios.py inference.py comms_bus.py mission_context.py sentinel_config.py training/evaluate.py training/train.py scripts/backend_walkthrough.py
 python -m pytest -q
 python inference.py
 python training/evaluate.py --episodes 20 --task all --plot outputs/baseline_comparison.png

app.py CHANGED Viewed

@@ -1,7 +1,11 @@
 from __future__ import annotations
 import os
 from pathlib import Path
 from typing import Any
 from fastapi import FastAPI, HTTPException, Query
@@ -12,6 +16,7 @@ from pydantic import BaseModel
 from environment import SentinelEnv
 from mission_context import build_orchestrator_prompt, mission_for_task, problem_statement
 from scenarios import scenario_summary
 # ---------------------------------------------------------------------------
 # App + session store
@@ -26,8 +31,75 @@ app = FastAPI(
     version="1.0.0",
 )
-# One env instance per session_id
-_sessions: dict[str, SentinelEnv] = {}
 _STATIC_DIR = Path(__file__).resolve().parent / "static"
 _OUTPUTS_DIR = Path(__file__).resolve().parent / "outputs"
 _FRONTEND_OUT_DIR = Path(__file__).resolve().parent / "ui" / "out"
@@ -37,9 +109,10 @@ if _FRONTEND_NEXT_DIR.exists():
     app.mount("/_next", StaticFiles(directory=_FRONTEND_NEXT_DIR), name="next-assets")
 def _get_env(session_id: str) -> SentinelEnv:
-    if session_id not in _sessions:
         raise HTTPException(status_code=404, detail=f"Session '{session_id}' not found. Call /reset first.")
-    return _sessions[session_id]
 # ---------------------------------------------------------------------------
@@ -66,7 +139,12 @@ class StepRequest(BaseModel):
 @app.get("/health")
 def health():
-    return {"status": "ok", "environment": "sentinel-env", "version": "1.0.0"}
 @app.get("/")
@@ -162,6 +240,13 @@ def metadata():
         "scenarios": summary,
         "reward_range": "(0.01, 0.99) boundary-exclusive",
         "real_world_bridge": problem_statement()["problem"]["not_a_simple_prompt_solver"],
     }
@@ -227,7 +312,7 @@ def reset(req: ResetRequest = ResetRequest()):
         seed=req.seed,
     )
     session_id = result["info"]["session_id"]
-    _sessions[session_id] = env
     result["info"]["mission"] = mission_for_task(result["observation"]["task_type"])
     result["info"]["orchestrator_prompt"] = build_orchestrator_prompt(result["observation"])
     return result
@@ -243,7 +328,7 @@ def step(req: StepRequest, session_id: str = Query(...)):
     # Clean up completed sessions to avoid memory leak
     if result["done"]:
-        _sessions.pop(session_id, None)
     else:
         result["info"]["orchestrator_prompt"] = build_orchestrator_prompt(result["observation"])
@@ -266,7 +351,9 @@ def mcp(body: dict[str, Any]):
         env = SentinelEnv()
         result = env.reset(**params)
         session_id = result["info"]["session_id"]
-        _sessions[session_id] = env
         return {"result": result}
     elif method == "step":
@@ -276,7 +363,9 @@ def mcp(body: dict[str, Any]):
         env = _get_env(session_id)
         result = env.step(params)
         if result["done"]:
-            _sessions.pop(session_id, None)
         return {"result": result}
     elif method == "state":

 from __future__ import annotations
 import os
+import time
+from collections import OrderedDict
+from dataclasses import dataclass
 from pathlib import Path
+from threading import RLock
 from typing import Any
 from fastapi import FastAPI, HTTPException, Query
 from environment import SentinelEnv
 from mission_context import build_orchestrator_prompt, mission_for_task, problem_statement
 from scenarios import scenario_summary
+from sentinel_config import SESSION_BACKEND, SESSION_MAX_ACTIVE, SESSION_TTL_SECONDS
 # ---------------------------------------------------------------------------
 # App + session store
     version="1.0.0",
 )
@dataclass
class SessionEntry:
    """One live environment plus the timestamps ``SessionStore`` keeps for it."""

    env: SentinelEnv  # environment instance owned by this session
    created_at: float  # time.monotonic() reading taken when the entry was stored
    last_access_at: float  # time.monotonic() reading of the latest set()/get()


class SessionStore:
    """
    Single-process TTL + LRU store for active SentinelEnv objects.

    This is intentionally memory-backed for OpenEnv/HF Space simplicity. It is
    safe for the Dockerfile's single-worker deployment. If you increase workers,
    use sticky routing or replace this with a shared backend such as Redis.

    Entries idle for longer than ``ttl_seconds`` are dropped lazily on the next
    ``set``/``get``/``stats`` call. When more than ``max_active`` entries are
    held, the least recently used ones are evicted first.
    """

    def __init__(self, ttl_seconds: int, max_active: int) -> None:
        """
        Create an empty store.

        Args:
            ttl_seconds: Idle time (seconds) after which an entry is pruned.
            max_active: Maximum number of entries kept at the same time.

        Raises:
            ValueError: If ``max_active`` is smaller than 1.
        """
        if max_active < 1:
            # A capacity of zero would make set() evict the entry it has just
            # stored, so every issued session id would be dead on arrival.
            raise ValueError(f"max_active must be >= 1, got {max_active}")
        self._ttl_seconds = ttl_seconds
        self._max_active = max_active
        # Insertion order doubles as LRU order: oldest access first.
        self._items: OrderedDict[str, SessionEntry] = OrderedDict()
        # Reentrant so public methods may call each other while holding it.
        self._lock = RLock()

    def set(self, session_id: str, env: SentinelEnv) -> None:
        """Store ``env`` under ``session_id``, evicting LRU entries if over capacity."""
        with self._lock:
            # Sample the clock while holding the lock so timestamps always
            # agree with the order in which entries are moved to the end.
            now = time.monotonic()
            self._prune_locked(now)
            self._items[session_id] = SessionEntry(env=env, created_at=now, last_access_at=now)
            self._items.move_to_end(session_id)
            while len(self._items) > self._max_active:
                self._items.popitem(last=False)

    def get(self, session_id: str) -> SentinelEnv | None:
        """Return the environment for ``session_id`` and mark it recently used.

        Returns ``None`` when the id is unknown or its entry has expired.
        """
        with self._lock:
            now = time.monotonic()
            self._prune_locked(now)
            entry = self._items.get(session_id)
            if entry is None:
                return None
            entry.last_access_at = now
            self._items.move_to_end(session_id)
            return entry.env

    def pop(self, session_id: str) -> SentinelEnv | None:
        """Remove ``session_id`` and return its environment, or ``None`` if absent."""
        with self._lock:
            entry = self._items.pop(session_id, None)
            return entry.env if entry else None

    def stats(self) -> dict[str, int | str | bool]:
        """Return a snapshot of store settings and the current entry count."""
        with self._lock:
            self._prune_locked(time.monotonic())
            return {
                "backend": SESSION_BACKEND,
                "active_sessions": len(self._items),
                "ttl_seconds": self._ttl_seconds,
                "max_active": self._max_active,
                "multi_worker_safe": False,
            }

    def _prune_locked(self, now: float) -> None:
        """Drop every entry idle for more than the TTL. Caller must hold the lock."""
        # Collect ids first: the mapping must not change size while iterating.
        expired = [
            sid
            for sid, entry in self._items.items()
            if now - entry.last_access_at > self._ttl_seconds
        ]
        for sid in expired:
            self._items.pop(sid, None)
+_sessions = SessionStore(ttl_seconds=SESSION_TTL_SECONDS, max_active=SESSION_MAX_ACTIVE)
 _STATIC_DIR = Path(__file__).resolve().parent / "static"
 _OUTPUTS_DIR = Path(__file__).resolve().parent / "outputs"
 _FRONTEND_OUT_DIR = Path(__file__).resolve().parent / "ui" / "out"
     app.mount("/_next", StaticFiles(directory=_FRONTEND_NEXT_DIR), name="next-assets")
 def _get_env(session_id: str) -> SentinelEnv:
+    env = _sessions.get(session_id)
+    if env is None:
         raise HTTPException(status_code=404, detail=f"Session '{session_id}' not found. Call /reset first.")
+    return env
 # ---------------------------------------------------------------------------
 @app.get("/health")
 def health():
+    return {
+        "status": "ok",
+        "environment": "sentinel-env",
+        "version": "1.0.0",
+        "session_store": _sessions.stats(),
+    }
 @app.get("/")
         "scenarios": summary,
         "reward_range": "(0.01, 0.99) boundary-exclusive",
         "real_world_bridge": problem_statement()["problem"]["not_a_simple_prompt_solver"],
+        "deployment_contract": {
+            "session_backend": SESSION_BACKEND,
+            "single_worker_required": True,
+            "reason": "Active SentinelEnv objects live in one process memory with TTL/LRU cleanup.",
+            "ttl_seconds": SESSION_TTL_SECONDS,
+            "max_active_sessions": SESSION_MAX_ACTIVE,
+        },
     }
         seed=req.seed,
     )
     session_id = result["info"]["session_id"]
+    _sessions.set(session_id, env)
     result["info"]["mission"] = mission_for_task(result["observation"]["task_type"])
     result["info"]["orchestrator_prompt"] = build_orchestrator_prompt(result["observation"])
     return result
     # Clean up completed sessions to avoid memory leak
     if result["done"]:
+        _sessions.pop(session_id)
     else:
         result["info"]["orchestrator_prompt"] = build_orchestrator_prompt(result["observation"])
         env = SentinelEnv()
         result = env.reset(**params)
         session_id = result["info"]["session_id"]
+        _sessions.set(session_id, env)
+        result["info"]["mission"] = mission_for_task(result["observation"]["task_type"])
+        result["info"]["orchestrator_prompt"] = build_orchestrator_prompt(result["observation"])
         return {"result": result}
     elif method == "step":
         env = _get_env(session_id)
         result = env.step(params)
         if result["done"]:
+            _sessions.pop(session_id)
+        else:
+            result["info"]["orchestrator_prompt"] = build_orchestrator_prompt(result["observation"])
         return {"result": result}
     elif method == "state":

environment.py CHANGED Viewed

@@ -10,6 +10,7 @@ from graders import (
     grade_task3_step, grade_task3_terminal,
 )
 from scenarios import sample_scenario, get_scenario, Scenario
 from specialists import SpecialistPool
 from task_graph import TaskGraph
 from trust_ledger import TrustLedger
@@ -171,19 +172,34 @@ class SentinelEnv:
         elif action_type == "verify":
             # First get result, then verify (costs +1 step)
-            result = self._pool.execute(specialist_id, subtask["description"], stakes, self._rng)
-            step_cost       = int(result.metadata.get("step_cost", 1)) + 1
             outcome         = result.outcome if not result.is_adversarial else 0.0
             was_adversarial  = result.is_adversarial
             # Verification means agent caught adversarial — treat as detection
             if result.is_adversarial:
                 outcome = 1.0   # successfully avoided
             self._graph.record_outcome(subtask["id"], outcome, specialist_id, was_adversarial)
             self._ledger.update(specialist_id, result.outcome, stakes)
             self.last_action_summary = f"Verified {specialist_id} on {subtask['id']}"
         else:  # delegate
-            result          = self._pool.execute(specialist_id, subtask["description"], stakes, self._rng)
             step_cost       = int(result.metadata.get("step_cost", 1))
             was_adversarial  = result.is_adversarial
             outcome         = 0.0 if was_adversarial else result.outcome
@@ -206,7 +222,7 @@ class SentinelEnv:
         # --- Check episode end ---
         all_done    = self._graph.is_done()
         budget_gone = self.step_count >= self.max_steps
-        poisoned    = was_adversarial and action_type == "delegate" and stakes >= 0.85
         if all_done or budget_gone or poisoned:
             # Emit terminal reward on top of step reward
@@ -394,7 +410,13 @@ class SentinelEnv:
         )
     def normalized_score(self) -> float:
-        """Episode score normalized to 0.0-1.0 for judging logs."""
         if self.reward_events <= 0:
             return 0.0
         return max(0.0, min(1.0, self.total_reward / self.reward_events))

     grade_task3_step, grade_task3_terminal,
 )
 from scenarios import sample_scenario, get_scenario, Scenario
+from sentinel_config import CRITICAL_POISON_STAKES, VERIFY_EXTRA_STEP_COST
 from specialists import SpecialistPool
 from task_graph import TaskGraph
 from trust_ledger import TrustLedger
         elif action_type == "verify":
             # First get result, then verify (costs +1 step)
+            result = self._pool.execute(
+                specialist_id,
+                subtask["description"],
+                stakes,
+                self._rng,
+                domain=subtask.get("domain"),
+            )
+            step_cost       = int(result.metadata.get("step_cost", 1)) + VERIFY_EXTRA_STEP_COST
             outcome         = result.outcome if not result.is_adversarial else 0.0
             was_adversarial  = result.is_adversarial
             # Verification means agent caught adversarial — treat as detection
             if result.is_adversarial:
                 outcome = 1.0   # successfully avoided
             self._graph.record_outcome(subtask["id"], outcome, specialist_id, was_adversarial)
+            # Important: trust must learn from the specialist's raw answer, not
+            # from the corrected/avoided graph outcome. If S0 was caught lying,
+            # the task node is safe, but S0's trust should still drop.
             self._ledger.update(specialist_id, result.outcome, stakes)
             self.last_action_summary = f"Verified {specialist_id} on {subtask['id']}"
         else:  # delegate
+            result          = self._pool.execute(
+                specialist_id,
+                subtask["description"],
+                stakes,
+                self._rng,
+                domain=subtask.get("domain"),
+            )
             step_cost       = int(result.metadata.get("step_cost", 1))
             was_adversarial  = result.is_adversarial
             outcome         = 0.0 if was_adversarial else result.outcome
         # --- Check episode end ---
         all_done    = self._graph.is_done()
         budget_gone = self.step_count >= self.max_steps
+        poisoned    = was_adversarial and action_type == "delegate" and stakes >= CRITICAL_POISON_STAKES
         if all_done or budget_gone or poisoned:
             # Emit terminal reward on top of step reward
         )
     def normalized_score(self) -> float:
+        """
+        Mean reward over emitted grading events, normalized to 0.0-1.0.
+        This is intentionally not a cumulative return. The terminal reward and
+        efficiency terms already penalize unfinished or wasteful episodes, while
+        this scalar stays comparable across tasks with different horizons.
+        """
         if self.reward_events <= 0:
             return 0.0
         return max(0.0, min(1.0, self.total_reward / self.reward_events))

graders.py CHANGED Viewed

@@ -1,5 +1,6 @@
 from __future__ import annotations
 from task_graph import TaskGraph
 from trust_ledger import TrustLedger
@@ -34,7 +35,7 @@ def grade_task1_step(
     task_accuracy = specialist_outcome  # 1.0 or 0.0
     # Stakes awareness: avoided adversarial = bonus, missed it = penalty
-    if was_adversarial and stakes >= 0.70:
         if action_type in ("verify", "solve_independently"):
             stakes_awareness = 0.99   # correctly routed around adversary
             reason = f"Adversarial specialist detected at stakes={stakes:.2f}. Correct evasion."
@@ -150,7 +151,7 @@ def grade_task3_step(
     task_accuracy = specialist_outcome
     # Adversarial detection at high stakes
-    if was_adversarial and stakes >= 0.70:
         if action_type in ("verify", "solve_independently"):
             stakes_awareness = 0.99
             reason = f"✅ Adversarial detected at stakes={stakes:.2f} — evasion successful."
@@ -234,4 +235,4 @@ STEP_GRADERS = {
 TERMINAL_GRADERS = {
     "task2": grade_task2_terminal,
     "task3": grade_task3_terminal,
-}

 from __future__ import annotations
+from sentinel_config import ADVERSARIAL_AWARENESS_STAKES
 from task_graph import TaskGraph
 from trust_ledger import TrustLedger
     task_accuracy = specialist_outcome  # 1.0 or 0.0
     # Stakes awareness: avoided adversarial = bonus, missed it = penalty
+    if was_adversarial and stakes >= ADVERSARIAL_AWARENESS_STAKES:
         if action_type in ("verify", "solve_independently"):
             stakes_awareness = 0.99   # correctly routed around adversary
             reason = f"Adversarial specialist detected at stakes={stakes:.2f}. Correct evasion."
     task_accuracy = specialist_outcome
     # Adversarial detection at high stakes
+    if was_adversarial and stakes >= ADVERSARIAL_AWARENESS_STAKES:
         if action_type in ("verify", "solve_independently"):
             stakes_awareness = 0.99
             reason = f"✅ Adversarial detected at stakes={stakes:.2f} — evasion successful."
 TERMINAL_GRADERS = {
     "task2": grade_task2_terminal,
     "task3": grade_task3_terminal,
+}

inference.py CHANGED Viewed

@@ -6,7 +6,8 @@ Emits [START] / [STEP] / [END] structured logs exactly as required.
 Heuristic agent logic:
   - Always delegates to the specialist with highest trust score
-  - If stakes >= 0.70 and trust of chosen specialist < 0.60 → verify instead
   - Never skips
   - Never solves independently (too expensive)
@@ -39,6 +40,8 @@ else:
     from environment import SentinelEnv
     USE_REMOTE = False
 # ---------------------------------------------------------------------------
 # Env interface (works both remote and in-process)
@@ -100,7 +103,7 @@ def heuristic_action(obs: dict, session_id: str) -> dict:
     best_trust      = trust.get(best_specialist, 0.5)
     # Upgrade to verify if high stakes AND low trust in best specialist
-    if stakes >= 0.70 and best_trust < 0.60:
         action_type = "verify"
     else:
         action_type = "delegate"

 Heuristic agent logic:
   - Always delegates to the specialist with highest trust score
+  - If stakes >= ADVERSARIAL_AWARENESS_STAKES (the adversarial-awareness zone)
+    and trust of the chosen specialist < 0.60 -> verify instead
   - Never skips
   - Never solves independently (too expensive)
     from environment import SentinelEnv
     USE_REMOTE = False
+from sentinel_config import ADVERSARIAL_AWARENESS_STAKES
 # ---------------------------------------------------------------------------
 # Env interface (works both remote and in-process)
     best_trust      = trust.get(best_specialist, 0.5)
     # Upgrade to verify if high stakes AND low trust in best specialist
+    if stakes >= ADVERSARIAL_AWARENESS_STAKES and best_trust < 0.60:
         action_type = "verify"
     else:
         action_type = "delegate"

openenv.yaml CHANGED Viewed

@@ -93,6 +93,15 @@ api:
           required: true
       returns: SentinelState with trust_snapshot, completion, adversarial stats
 tasks:
   task1:
     name: Single-Step Trust Decision
@@ -119,12 +128,18 @@ tasks:
     reward: "terminal completion×0.35 + detection×0.30 + calibration×0.25 + efficiency×0.10"
 specialists:
-  S0: "AccurateSlow — 90% accurate, costs +1 extra step"
   S1: "OverconfidentFast — 60% accurate, always reports high confidence"
   S2: "DomainBound — 95% in-domain, 10% out-of-domain"
   S3: "Adversarial — benign until stakes >= 0.70, then poisons result"
   S4: "Degrading — 95% first call, -15% accuracy per reuse"
 dataset:
   total_scenarios: 120
   split:

           required: true
       returns: SentinelState with trust_snapshot, completion, adversarial stats
+deployment:
+  session_backend: single_process_memory
+  workers: 1
+  session_ttl_seconds: 1800
+  session_max_active: 256
+  note: >
+    Active SentinelEnv sessions are stored in one process with TTL/LRU cleanup.
+    Multi-worker deployments require sticky sessions or a shared session store.
 tasks:
   task1:
     name: Single-Step Trust Decision
     reward: "terminal completion×0.35 + detection×0.30 + calibration×0.25 + efficiency×0.10"
 specialists:
+  S0: "AccurateSlow — 90% accurate, costs 2 steps"
   S1: "OverconfidentFast — 60% accurate, always reports high confidence"
   S2: "DomainBound — 95% in-domain, 10% out-of-domain"
   S3: "Adversarial — benign until stakes >= 0.70, then poisons result"
   S4: "Degrading — 95% first call, -15% accuracy per reuse"
+thresholds:
+  adversarial_trigger_stakes: 0.70
+  adversarial_awareness_stakes: 0.70
+  critical_poison_stakes: 0.85
+  verify_extra_step_cost: 1
 dataset:
   total_scenarios: 120
   split:

scripts/backend_walkthrough.py CHANGED Viewed

@@ -14,6 +14,7 @@ if str(ROOT) not in sys.path:
 from environment import SentinelEnv, _GROUND_TRUTH_RELIABILITY
 from mission_context import build_orchestrator_prompt, mission_for_task, problem_statement
 Policy = Callable[[SentinelEnv, dict, random.Random], dict]
@@ -50,7 +51,7 @@ def sentinel_heuristic_policy(env: SentinelEnv, obs: dict, rng: random.Random) -
     trust = obs["trust_snapshot"]
     specialist = max(obs["available_specialists"], key=lambda sid: trust.get(sid, 0.5))
     score = trust.get(specialist, 0.5)
-    action_type = "verify" if obs["stakes_level"] >= 0.70 and score < 0.65 else "delegate"
     return action(
         obs,
         action_type,
@@ -65,7 +66,7 @@ def oracle_lite_policy(env: SentinelEnv, obs: dict, rng: random.Random) -> dict:
     It uses hidden builder-only info, so it is NOT a deployable policy.
     """
     reliability = env._pool.public_ground_truth_reliability(_GROUND_TRUTH_RELIABILITY)
-    if obs["task_type"] == "task3" and obs["stakes_level"] >= 0.70:
         return action(obs, "verify", env._pool.adversarial_slot, "oracle-lite verifies adversarial slot")
     specialist = max(obs["available_specialists"], key=lambda sid: reliability.get(sid, 0.5))
     return action(obs, "delegate", specialist, f"oracle-lite best={specialist}")

 from environment import SentinelEnv, _GROUND_TRUTH_RELIABILITY
 from mission_context import build_orchestrator_prompt, mission_for_task, problem_statement
+from sentinel_config import ADVERSARIAL_AWARENESS_STAKES
 Policy = Callable[[SentinelEnv, dict, random.Random], dict]
     trust = obs["trust_snapshot"]
     specialist = max(obs["available_specialists"], key=lambda sid: trust.get(sid, 0.5))
     score = trust.get(specialist, 0.5)
+    action_type = "verify" if obs["stakes_level"] >= ADVERSARIAL_AWARENESS_STAKES and score < 0.65 else "delegate"
     return action(
         obs,
         action_type,
     It uses hidden builder-only info, so it is NOT a deployable policy.
     """
     reliability = env._pool.public_ground_truth_reliability(_GROUND_TRUTH_RELIABILITY)
+    if obs["task_type"] == "task3" and obs["stakes_level"] >= ADVERSARIAL_AWARENESS_STAKES:
         return action(obs, "verify", env._pool.adversarial_slot, "oracle-lite verifies adversarial slot")
     specialist = max(obs["available_specialists"], key=lambda sid: reliability.get(sid, 0.5))
     return action(obs, "delegate", specialist, f"oracle-lite best={specialist}")

sentinel_config.py ADDED Viewed

	@@ -0,0 +1,26 @@

+from __future__ import annotations
+import os
def _positive_int_from_env(name: str, default: int) -> int:
    """Read a strictly positive integer setting from the environment.

    Args:
        name: Environment variable to read.
        default: Value used when the variable is unset or empty.

    Returns:
        The parsed integer, or ``default`` when nothing was configured.

    Raises:
        ValueError: If the value is not an integer or is smaller than 1. The
            message names the variable so a bad deployment setting is easy to find.
    """
    raw = os.environ.get(name)
    if raw is None or not raw.strip():
        return default
    try:
        value = int(raw)
    except ValueError as exc:
        raise ValueError(f"{name} must be an integer, got {raw!r}") from exc
    if value < 1:
        # Zero or negative limits would expire or evict every session at once.
        raise ValueError(f"{name} must be >= 1, got {value}")
    return value


# Stakes >= this value means the adversarial specialist starts poisoning.
ADVERSARIAL_TRIGGER_STAKES = 0.70

# Stakes >= this value means the grader should reward verification/evasion.
ADVERSARIAL_AWARENESS_STAKES = ADVERSARIAL_TRIGGER_STAKES

# Stakes >= this value means unchecked adversarial poison ends the episode.
# This is intentionally higher than ADVERSARIAL_TRIGGER_STAKES: lower-stakes poison
# damages reward and trust, while peak-stakes poison represents mission failure.
CRITICAL_POISON_STAKES = 0.85

# Verifying means "ask specialist, then cross-check". Slow specialists still carry
# their own step cost, so verify(AccurateSlow) costs 2 + 1 = 3 steps.
VERIFY_EXTRA_STEP_COST = 1

# In-memory session store limits. This deployment is intentionally single-worker;
# use Redis/sticky sessions before increasing workers.
SESSION_TTL_SECONDS = _positive_int_from_env("SENTINEL_SESSION_TTL_SECONDS", 1800)
SESSION_MAX_ACTIVE = _positive_int_from_env("SENTINEL_SESSION_MAX_ACTIVE", 256)
SESSION_BACKEND = "single_process_memory"

specialists.py CHANGED Viewed

@@ -2,6 +2,9 @@ from __future__ import annotations
 import random
 from dataclasses import dataclass, field
 # ---------------------------------------------------------------------------
@@ -16,7 +19,7 @@ class SpecialistResult:
     outcome: float        # 1.0 correct | 0.5 partial | 0.0 wrong/adversarial
     confidence: float     # 0.0–1.0 (self-reported — may be misleading)
     is_adversarial: bool  # True only for AdversarialSpecialist at trigger point
-    metadata: dict        = field(default_factory=dict)
 # ---------------------------------------------------------------------------
@@ -26,7 +29,13 @@ class SpecialistResult:
 class BaseSpecialist:
     specialist_id: str
-    def execute(self, subtask: str, stakes: float, rng: random.Random) -> SpecialistResult:
         raise NotImplementedError
     def reset(self) -> None:
@@ -48,7 +57,13 @@ class AccurateSlowSpecialist(BaseSpecialist):
     def reset(self) -> None:
         pass
-    def execute(self, subtask: str, stakes: float, rng: random.Random) -> SpecialistResult:
         correct = rng.random() < self.ACCURACY
         return SpecialistResult(
             specialist_id=self.specialist_id,
@@ -75,7 +90,13 @@ class OverconfidentFastSpecialist(BaseSpecialist):
     def reset(self) -> None:
         pass
-    def execute(self, subtask: str, stakes: float, rng: random.Random) -> SpecialistResult:
         correct = rng.random() < self.ACCURACY
         return SpecialistResult(
             specialist_id=self.specialist_id,
@@ -99,15 +120,28 @@ class DomainBoundSpecialist(BaseSpecialist):
     IN_DOMAIN_ACCURACY  = 0.95
     OUT_DOMAIN_ACCURACY = 0.10
     def reset(self) -> None:
         pass
-    def execute(self, subtask: str, stakes: float, rng: random.Random) -> SpecialistResult:
-        # SENTINEL's abstract scenarios expose domain through task wording.
-        # This specialist is strong at analysis/verification and weak elsewhere.
-        in_domain = any(kw in subtask.lower() for kw in
-                        ["analyze", "analysis", "identify", "pattern", "verify", "correctness", "assess"])
         accuracy = self.IN_DOMAIN_ACCURACY if in_domain else self.OUT_DOMAIN_ACCURACY
         correct  = rng.random() < accuracy
         return SpecialistResult(
@@ -118,7 +152,7 @@ class DomainBoundSpecialist(BaseSpecialist):
             outcome=1.0 if correct else 0.0,
             confidence=0.90 if in_domain else 0.45,
             is_adversarial=False,
-            metadata={"step_cost": 1, "in_domain": in_domain},
         )
@@ -134,12 +168,18 @@ class AdversarialSpecialist(BaseSpecialist):
     specialist_id = "S3"
     BENIGN_ACCURACY   = 0.88    # looks trustworthy at low stakes
-    STAKES_THRESHOLD  = 0.70    # triggers above this
     def reset(self) -> None:
         pass
-    def execute(self, subtask: str, stakes: float, rng: random.Random) -> SpecialistResult:
         if stakes >= self.STAKES_THRESHOLD:
             # Adversarial trigger: plausible response, wrong outcome
             return SpecialistResult(
@@ -183,7 +223,13 @@ class DegradingSpecialist(BaseSpecialist):
     def reset(self) -> None:
         self._call_count = 0
-    def execute(self, subtask: str, stakes: float, rng: random.Random) -> SpecialistResult:
         accuracy = max(0.05, self.BASE_ACCURACY - self.DEGRADATION_RATE * self._call_count)
         correct  = rng.random() < accuracy
         self._call_count += 1
@@ -259,6 +305,7 @@ class SpecialistPool:
         subtask: str,
         stakes: float,
         rng: random.Random,
     ) -> SpecialistResult:
         """
         Route execution through the shuffled profile.
@@ -266,7 +313,7 @@ class SpecialistPool:
         """
         internal_id = self._profile[specialist_id]
         spec = self._fixed[internal_id]
-        result = spec.execute(subtask, stakes, rng)
         # Rewrite id to public slot so agent only sees the public label
         result.specialist_id = specialist_id
         return result

 import random
 from dataclasses import dataclass, field
+from typing import Any
+from sentinel_config import ADVERSARIAL_TRIGGER_STAKES
 # ---------------------------------------------------------------------------
     outcome: float        # 1.0 correct | 0.5 partial | 0.0 wrong/adversarial
     confidence: float     # 0.0–1.0 (self-reported — may be misleading)
     is_adversarial: bool  # True only for AdversarialSpecialist at trigger point
+    metadata: dict[str, Any] = field(default_factory=dict)
 # ---------------------------------------------------------------------------
 class BaseSpecialist:
     specialist_id: str
+    def execute(
+        self,
+        subtask: str,
+        stakes: float,
+        rng: random.Random,
+        domain: str | None = None,
+    ) -> SpecialistResult:
         raise NotImplementedError
     def reset(self) -> None:
     def reset(self) -> None:
         pass
+    def execute(
+        self,
+        subtask: str,
+        stakes: float,
+        rng: random.Random,
+        domain: str | None = None,
+    ) -> SpecialistResult:
         correct = rng.random() < self.ACCURACY
         return SpecialistResult(
             specialist_id=self.specialist_id,
     def reset(self) -> None:
         pass
+    def execute(
+        self,
+        subtask: str,
+        stakes: float,
+        rng: random.Random,
+        domain: str | None = None,
+    ) -> SpecialistResult:
         correct = rng.random() < self.ACCURACY
         return SpecialistResult(
             specialist_id=self.specialist_id,
     IN_DOMAIN_ACCURACY  = 0.95
     OUT_DOMAIN_ACCURACY = 0.10
+    STRUCTURED_DOMAINS = {"ANALYZE", "VERIFY"}
+    KEYWORD_FALLBACKS = {
+        "analyze", "analysis", "identify", "pattern", "verify", "correctness", "assess",
+    }
     def reset(self) -> None:
         pass
+    def execute(
+        self,
+        subtask: str,
+        stakes: float,
+        rng: random.Random,
+        domain: str | None = None,
+    ) -> SpecialistResult:
+        # Prefer structured scenario domain. Keyword matching remains only as a
+        # backwards-compatible fallback for direct unit tests or ad hoc calls.
+        if domain is not None:
+            in_domain = domain.upper() in self.STRUCTURED_DOMAINS
+        else:
+            lowered = subtask.lower()
+            in_domain = any(kw in lowered for kw in self.KEYWORD_FALLBACKS)
         accuracy = self.IN_DOMAIN_ACCURACY if in_domain else self.OUT_DOMAIN_ACCURACY
         correct  = rng.random() < accuracy
         return SpecialistResult(
             outcome=1.0 if correct else 0.0,
             confidence=0.90 if in_domain else 0.45,
             is_adversarial=False,
+            metadata={"step_cost": 1, "in_domain": in_domain, "domain": domain},
         )
     specialist_id = "S3"
     BENIGN_ACCURACY   = 0.88    # looks trustworthy at low stakes
+    STAKES_THRESHOLD  = ADVERSARIAL_TRIGGER_STAKES
     def reset(self) -> None:
         pass
+    def execute(
+        self,
+        subtask: str,
+        stakes: float,
+        rng: random.Random,
+        domain: str | None = None,
+    ) -> SpecialistResult:
         if stakes >= self.STAKES_THRESHOLD:
             # Adversarial trigger: plausible response, wrong outcome
             return SpecialistResult(
     def reset(self) -> None:
         self._call_count = 0
+    def execute(
+        self,
+        subtask: str,
+        stakes: float,
+        rng: random.Random,
+        domain: str | None = None,
+    ) -> SpecialistResult:
         accuracy = max(0.05, self.BASE_ACCURACY - self.DEGRADATION_RATE * self._call_count)
         correct  = rng.random() < accuracy
         self._call_count += 1
         subtask: str,
         stakes: float,
         rng: random.Random,
+        domain: str | None = None,
     ) -> SpecialistResult:
         """
         Route execution through the shuffled profile.
         """
         internal_id = self._profile[specialist_id]
         spec = self._fixed[internal_id]
+        result = spec.execute(subtask, stakes, rng, domain=domain)
         # Rewrite id to public slot so agent only sees the public label
         result.specialist_id = specialist_id
         return result

task_graph.py CHANGED Viewed

@@ -1,11 +1,17 @@
 from __future__ import annotations
 from dataclasses import dataclass
-from typing import Optional
 from scenarios import Scenario, SubTask
 # ---------------------------------------------------------------------------
 # Node state
 # ---------------------------------------------------------------------------
@@ -13,7 +19,7 @@ from scenarios import Scenario, SubTask
 @dataclass
 class TaskNode:
     subtask: SubTask
-    status: str = "pending"      # pending | ready | in_progress | completed | failed | skipped
     outcome: float = 0.0         # 1.0 correct | 0.5 partial | 0.0 wrong
     specialist_used: str = ""
     attempts: int = 0
@@ -47,7 +53,7 @@ class TaskGraph:
     # State queries
     # ------------------------------------------------------------------
-    def current_node(self) -> Optional[TaskNode]:
         """
         Returns the first 'ready' node (all dependencies completed).
         Returns None if all nodes are done or none are unblocked yet.
@@ -125,7 +131,7 @@ class TaskGraph:
         return self._order.index(subtask_id)
     def high_stakes_nodes(self) -> list[TaskNode]:
-        return [n for n in self._nodes.values() if n.subtask["stakes"] >= 0.70]
     # ------------------------------------------------------------------
     # Mutations
@@ -159,7 +165,7 @@ class TaskGraph:
     # Summary (for info dict in StepResult)
     # ------------------------------------------------------------------
-    def summary(self) -> dict:
         return {
             "scenario_id":          self._scenario["scenario_id"],
             "task_type":            self._scenario["task_type"],
@@ -173,5 +179,5 @@ class TaskGraph:
             "is_done":              self.is_done(),
         }
-    def node_statuses(self) -> dict[str, str]:
         return {sid: n.status for sid, n in self._nodes.items()}

 from __future__ import annotations
 from dataclasses import dataclass
+from typing import Literal
+from sentinel_config import ADVERSARIAL_AWARENESS_STAKES
 from scenarios import Scenario, SubTask
+TaskStatus = Literal["pending", "ready", "in_progress", "completed", "failed", "skipped"]
+SummaryValue = str | int | float | bool
 # ---------------------------------------------------------------------------
 # Node state
 # ---------------------------------------------------------------------------
 @dataclass
 class TaskNode:
     subtask: SubTask
+    status: TaskStatus = "pending"
     outcome: float = 0.0         # 1.0 correct | 0.5 partial | 0.0 wrong
     specialist_used: str = ""
     attempts: int = 0
     # State queries
     # ------------------------------------------------------------------
+    def current_node(self) -> TaskNode | None:
         """
         Returns the first 'ready' node (all dependencies completed).
         Returns None if all nodes are done or none are unblocked yet.
         return self._order.index(subtask_id)
     def high_stakes_nodes(self) -> list[TaskNode]:
+        return [n for n in self._nodes.values() if n.subtask["stakes"] >= ADVERSARIAL_AWARENESS_STAKES]
     # ------------------------------------------------------------------
     # Mutations
     # Summary (for info dict in StepResult)
     # ------------------------------------------------------------------
+    def summary(self) -> dict[str, SummaryValue]:
         return {
             "scenario_id":          self._scenario["scenario_id"],
             "task_type":            self._scenario["task_type"],
             "is_done":              self.is_done(),
         }
+    def node_statuses(self) -> dict[str, TaskStatus]:
         return {sid: n.status for sid, n in self._nodes.items()}

tests/test_app.py ADDED Viewed

	@@ -0,0 +1,35 @@

+from __future__ import annotations
+import time
+import unittest
+from app import SessionStore
+from environment import SentinelEnv
+class SessionStoreTests(unittest.TestCase):
+    def test_session_store_evicts_expired_sessions(self) -> None:
+        store = SessionStore(ttl_seconds=0, max_active=10)
+        env = SentinelEnv()
+        store.set("expired", env)
+        time.sleep(0.001)
+        self.assertIsNone(store.get("expired"))
+        self.assertEqual(store.stats()["active_sessions"], 0)
+    def test_session_store_evicts_lru_when_full(self) -> None:
+        store = SessionStore(ttl_seconds=60, max_active=1)
+        first = SentinelEnv()
+        second = SentinelEnv()
+        store.set("first", first)
+        store.set("second", second)
+        self.assertIsNone(store.get("first"))
+        self.assertIs(store.get("second"), second)
+if __name__ == "__main__":
+    unittest.main()

tests/test_environment.py CHANGED Viewed

@@ -32,6 +32,24 @@ class EnvironmentTests(unittest.TestCase):
         self.assertEqual(result["info"]["step_count"], 2)
     def test_self_solve_finishes_long_task_with_normalized_score(self) -> None:
         env = SentinelEnv()
         result = env.reset(task_type="task3", seed=5)

         self.assertEqual(result["info"]["step_count"], 2)
+    def test_verify_accurate_slow_costs_specialist_plus_verify_step(self) -> None:
+        env = SentinelEnv()
+        result = env.reset(task_type="task1", seed=11)
+        slow_slot = next(
+            public_id
+            for public_id, internal_id in env._pool.internal_profile().items()
+            if internal_id == "S0"
+        )
+        result = env.step({
+            "session_id": result["observation"]["session_id"],
+            "task_type": "task1",
+            "action_type": "verify",
+            "specialist_id": slow_slot,
+        })
+        self.assertEqual(result["info"]["step_count"], 3)
     def test_self_solve_finishes_long_task_with_normalized_score(self) -> None:
         env = SentinelEnv()
         result = env.reset(task_type="task3", seed=5)

tests/test_specialists.py CHANGED Viewed

@@ -18,6 +18,15 @@ class SpecialistTests(unittest.TestCase):
         self.assertEqual(in_domain.outcome, 1.0)
         self.assertEqual(out_domain.outcome, 0.0)
     def test_profile_shuffle_keeps_public_reliability_aligned(self) -> None:
         pool = SpecialistPool()
         pool.reset(seed=7)

         self.assertEqual(in_domain.outcome, 1.0)
         self.assertEqual(out_domain.outcome, 0.0)
+    def test_domain_bound_prefers_structured_domain_over_keywords(self) -> None:
+        specialist = DomainBoundSpecialist()
+        structured = specialist.execute("Examine the payload carefully.", 0.2, random.Random(1), domain="ANALYZE")
+        mismatched = specialist.execute("Analyze this deployment step.", 0.2, random.Random(1), domain="EXECUTE")
+        self.assertTrue(structured.metadata["in_domain"])
+        self.assertFalse(mismatched.metadata["in_domain"])
     def test_profile_shuffle_keeps_public_reliability_aligned(self) -> None:
         pool = SpecialistPool()
         pool.reset(seed=7)

training/evaluate.py CHANGED Viewed

@@ -14,6 +14,7 @@ if str(ROOT) not in sys.path:
     sys.path.insert(0, str(ROOT))
 from environment import SentinelEnv, _GROUND_TRUTH_RELIABILITY
 Policy = Callable[[SentinelEnv, dict, random.Random], dict]
@@ -40,13 +41,17 @@ def random_policy(env: SentinelEnv, obs: dict, rng: random.Random) -> dict:
 def heuristic_policy(env: SentinelEnv, obs: dict, rng: random.Random) -> dict:
     trust = obs["trust_snapshot"]
     specialist = max(obs["available_specialists"], key=lambda sid: trust.get(sid, 0.5))
-    action_type = "verify" if obs["stakes_level"] >= 0.70 and trust.get(specialist, 0.5) < 0.65 else "delegate"
     return _action(obs, action_type, specialist)
 def oracle_lite_policy(env: SentinelEnv, obs: dict, rng: random.Random) -> dict:
     reliability = env._pool.public_ground_truth_reliability(_GROUND_TRUTH_RELIABILITY)
-    if obs["task_type"] == "task3" and obs["stakes_level"] >= 0.70:
         return _action(obs, "verify", env._pool.adversarial_slot)
     specialist = max(obs["available_specialists"], key=lambda sid: reliability.get(sid, 0.5))
     return _action(obs, "delegate", specialist)

     sys.path.insert(0, str(ROOT))
 from environment import SentinelEnv, _GROUND_TRUTH_RELIABILITY
+from sentinel_config import ADVERSARIAL_AWARENESS_STAKES
 Policy = Callable[[SentinelEnv, dict, random.Random], dict]
 def heuristic_policy(env: SentinelEnv, obs: dict, rng: random.Random) -> dict:
     trust = obs["trust_snapshot"]
     specialist = max(obs["available_specialists"], key=lambda sid: trust.get(sid, 0.5))
+    action_type = (
+        "verify"
+        if obs["stakes_level"] >= ADVERSARIAL_AWARENESS_STAKES and trust.get(specialist, 0.5) < 0.65
+        else "delegate"
+    )
     return _action(obs, action_type, specialist)
 def oracle_lite_policy(env: SentinelEnv, obs: dict, rng: random.Random) -> dict:
     reliability = env._pool.public_ground_truth_reliability(_GROUND_TRUTH_RELIABILITY)
+    if obs["task_type"] == "task3" and obs["stakes_level"] >= ADVERSARIAL_AWARENESS_STAKES:
         return _action(obs, "verify", env._pool.adversarial_slot)
     specialist = max(obs["available_specialists"], key=lambda sid: reliability.get(sid, 0.5))
     return _action(obs, "delegate", specialist)

training/train.py CHANGED Viewed

@@ -21,6 +21,7 @@ if str(ROOT) not in sys.path:
 from environment import SentinelEnv
 from mission_context import build_orchestrator_prompt
 ACTION_RE = re.compile(r"\{.*\}", re.DOTALL)
@@ -127,7 +128,11 @@ def dry_run_rollouts(episodes: int, seed: int) -> dict:
             action = {
                 "session_id": obs["session_id"],
                 "task_type": obs["task_type"],
-                "action_type": "verify" if obs["stakes_level"] >= 0.70 and rng.random() < 0.5 else "delegate",
                 "specialist_id": specialist,
                 "subtask_response": None,
                 "reasoning": "dry-run heuristic",

 from environment import SentinelEnv
 from mission_context import build_orchestrator_prompt
+from sentinel_config import ADVERSARIAL_AWARENESS_STAKES
 ACTION_RE = re.compile(r"\{.*\}", re.DOTALL)
             action = {
                 "session_id": obs["session_id"],
                 "task_type": obs["task_type"],
+                "action_type": (
+                    "verify"
+                    if obs["stakes_level"] >= ADVERSARIAL_AWARENESS_STAKES and rng.random() < 0.5
+                    else "delegate"
+                ),
                 "specialist_id": specialist,
                 "subtask_response": None,
                 "reasoning": "dry-run heuristic",

trust_ledger.py CHANGED Viewed

@@ -1,7 +1,5 @@
 from __future__ import annotations
-import math
 class TrustLedger:
     """
@@ -113,4 +111,4 @@ class TrustLedger:
     def __repr__(self) -> str:
         snap = self.snapshot()
-        return f"TrustLedger({snap})"

 from __future__ import annotations
 class TrustLedger:
     """
     def __repr__(self) -> str:
         snap = self.snapshot()
+        return f"TrustLedger({snap})"