XcodeAddy committed on
Commit
aad7819
·
1 Parent(s): 6d6dbaf

Harden backend session and reward constants

Browse files
Dockerfile CHANGED
@@ -27,6 +27,7 @@ COPY trust_ledger.py .
27
  COPY task_graph.py .
28
  COPY comms_bus.py .
29
  COPY mission_context.py .
 
30
  COPY scenarios.py .
31
  COPY openenv.yaml .
32
  COPY inference.py .
 
27
  COPY task_graph.py .
28
  COPY comms_bus.py .
29
  COPY mission_context.py .
30
+ COPY sentinel_config.py .
31
  COPY scenarios.py .
32
  COPY openenv.yaml .
33
  COPY inference.py .
README.md CHANGED
@@ -70,6 +70,9 @@ curl "http://localhost:7860/mission?task_type=task3"
70
  - Specialists: 5 scripted FSM agents with shuffled hidden profiles
71
  - Rewards: per-step reward plus terminal score, normalized to `0.0-1.0`
72
  - Dataset: 120 abstract multi-agent scenarios
 
 
 
73
 
74
  ## Live Submission Targets
75
 
@@ -85,10 +88,10 @@ curl "http://localhost:7860/mission?task_type=task3"
85
 
86
  Hidden profiles:
87
 
88
- - `AccurateSlow`: 90 percent accurate, costs extra steps.
89
  - `OverconfidentFast`: quick and confident, wrong 40 percent of the time.
90
  - `DomainBound`: strong on analysis/verification, weak elsewhere.
91
- - `Adversarial`: benign at low stakes, poisons high-stakes steps.
92
  - `Degrading`: strong early, gets worse with reuse.
93
 
94
  ## Tasks
@@ -103,6 +106,13 @@ Hidden profiles:
103
 
104
  Rewards are deterministic and boundary-exclusive per step: `(0.01, 0.99)`.
105
 
 
 
 
 
 
 
 
106
  Task 3 terminal score:
107
 
108
  ```text
@@ -112,7 +122,7 @@ Task 3 terminal score:
112
  + 0.10 * efficiency
113
  ```
114
 
115
- The episode `score` exposed in `info` and inference logs is normalized to `0.0-1.0`.
116
 
117
  ## API
118
 
@@ -242,7 +252,7 @@ pip install pytest
242
  Run checks:
243
 
244
  ```bash
245
- python -m py_compile app.py server/app.py environment.py models.py graders.py specialists.py trust_ledger.py task_graph.py scenarios.py inference.py comms_bus.py mission_context.py training/evaluate.py training/train.py scripts/backend_walkthrough.py
246
  python -m pytest -q
247
  python inference.py
248
  python training/evaluate.py --episodes 20 --task all --plot outputs/baseline_comparison.png
 
70
  - Specialists: 5 scripted FSM agents with shuffled hidden profiles
71
  - Rewards: per-step reward plus terminal score, normalized to `0.0-1.0`
72
  - Dataset: 120 abstract multi-agent scenarios
73
+ - Session store: single-process memory with TTL/LRU cleanup
74
+
75
+ Deployment contract: run one server worker for the submitted Space. Active `SentinelEnv` objects live in process memory, so multi-worker deployments need sticky sessions or a shared store such as Redis. The Dockerfile intentionally starts uvicorn with `--workers 1`.
76
 
77
  ## Live Submission Targets
78
 
 
88
 
89
  Hidden profiles:
90
 
91
+ - `AccurateSlow`: 90 percent accurate, costs 2 steps.
92
  - `OverconfidentFast`: quick and confident, wrong 40 percent of the time.
93
  - `DomainBound`: strong on analysis/verification, weak elsewhere.
94
+ - `Adversarial`: benign below stakes `0.70`, poisons at stakes `>=0.70`.
95
  - `Degrading`: strong early, gets worse with reuse.
96
 
97
  ## Tasks
 
106
 
107
  Rewards are deterministic and boundary-exclusive per step: `(0.01, 0.99)`.
108
 
109
+ Shared threshold/cost constants live in `sentinel_config.py`:
110
+
111
+ - `ADVERSARIAL_TRIGGER_STAKES = 0.70`: adversarial specialist starts poisoning.
112
+ - `ADVERSARIAL_AWARENESS_STAKES = 0.70`: graders reward verification/evasion.
113
+ - `CRITICAL_POISON_STAKES = 0.85`: unchecked poison ends the episode.
114
+ - `VERIFY_EXTRA_STEP_COST = 1`: verify cost is specialist step cost plus one.
115
+
116
  Task 3 terminal score:
117
 
118
  ```text
 
122
  + 0.10 * efficiency
123
  ```
124
 
125
+ The episode `score` exposed in `info` and inference logs is the mean reward over emitted grading events, normalized to `0.0-1.0`. It is intentionally not raw cumulative return; terminal reward and efficiency terms carry the penalty for unfinished or wasteful episodes while keeping scores comparable across tasks with different horizons.
126
 
127
  ## API
128
 
 
252
  Run checks:
253
 
254
  ```bash
255
+ python -m py_compile app.py server/app.py environment.py models.py graders.py specialists.py trust_ledger.py task_graph.py scenarios.py inference.py comms_bus.py mission_context.py sentinel_config.py training/evaluate.py training/train.py scripts/backend_walkthrough.py
256
  python -m pytest -q
257
  python inference.py
258
  python training/evaluate.py --episodes 20 --task all --plot outputs/baseline_comparison.png
app.py CHANGED
@@ -1,7 +1,11 @@
1
  from __future__ import annotations
2
 
3
  import os
 
 
 
4
  from pathlib import Path
 
5
  from typing import Any
6
 
7
  from fastapi import FastAPI, HTTPException, Query
@@ -12,6 +16,7 @@ from pydantic import BaseModel
12
  from environment import SentinelEnv
13
  from mission_context import build_orchestrator_prompt, mission_for_task, problem_statement
14
  from scenarios import scenario_summary
 
15
 
16
  # ---------------------------------------------------------------------------
17
  # App + session store
@@ -26,8 +31,75 @@ app = FastAPI(
26
  version="1.0.0",
27
  )
28
 
29
- # One env instance per session_id
30
- _sessions: dict[str, SentinelEnv] = {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  _STATIC_DIR = Path(__file__).resolve().parent / "static"
32
  _OUTPUTS_DIR = Path(__file__).resolve().parent / "outputs"
33
  _FRONTEND_OUT_DIR = Path(__file__).resolve().parent / "ui" / "out"
@@ -37,9 +109,10 @@ if _FRONTEND_NEXT_DIR.exists():
37
  app.mount("/_next", StaticFiles(directory=_FRONTEND_NEXT_DIR), name="next-assets")
38
 
39
  def _get_env(session_id: str) -> SentinelEnv:
40
- if session_id not in _sessions:
 
41
  raise HTTPException(status_code=404, detail=f"Session '{session_id}' not found. Call /reset first.")
42
- return _sessions[session_id]
43
 
44
 
45
  # ---------------------------------------------------------------------------
@@ -66,7 +139,12 @@ class StepRequest(BaseModel):
66
 
67
  @app.get("/health")
68
  def health():
69
- return {"status": "ok", "environment": "sentinel-env", "version": "1.0.0"}
 
 
 
 
 
70
 
71
 
72
  @app.get("/")
@@ -162,6 +240,13 @@ def metadata():
162
  "scenarios": summary,
163
  "reward_range": "(0.01, 0.99) boundary-exclusive",
164
  "real_world_bridge": problem_statement()["problem"]["not_a_simple_prompt_solver"],
 
 
 
 
 
 
 
165
  }
166
 
167
 
@@ -227,7 +312,7 @@ def reset(req: ResetRequest = ResetRequest()):
227
  seed=req.seed,
228
  )
229
  session_id = result["info"]["session_id"]
230
- _sessions[session_id] = env
231
  result["info"]["mission"] = mission_for_task(result["observation"]["task_type"])
232
  result["info"]["orchestrator_prompt"] = build_orchestrator_prompt(result["observation"])
233
  return result
@@ -243,7 +328,7 @@ def step(req: StepRequest, session_id: str = Query(...)):
243
 
244
  # Clean up completed sessions to avoid memory leak
245
  if result["done"]:
246
- _sessions.pop(session_id, None)
247
  else:
248
  result["info"]["orchestrator_prompt"] = build_orchestrator_prompt(result["observation"])
249
 
@@ -266,7 +351,9 @@ def mcp(body: dict[str, Any]):
266
  env = SentinelEnv()
267
  result = env.reset(**params)
268
  session_id = result["info"]["session_id"]
269
- _sessions[session_id] = env
 
 
270
  return {"result": result}
271
 
272
  elif method == "step":
@@ -276,7 +363,9 @@ def mcp(body: dict[str, Any]):
276
  env = _get_env(session_id)
277
  result = env.step(params)
278
  if result["done"]:
279
- _sessions.pop(session_id, None)
 
 
280
  return {"result": result}
281
 
282
  elif method == "state":
 
1
  from __future__ import annotations
2
 
3
  import os
4
+ import time
5
+ from collections import OrderedDict
6
+ from dataclasses import dataclass
7
  from pathlib import Path
8
+ from threading import RLock
9
  from typing import Any
10
 
11
  from fastapi import FastAPI, HTTPException, Query
 
16
  from environment import SentinelEnv
17
  from mission_context import build_orchestrator_prompt, mission_for_task, problem_statement
18
  from scenarios import scenario_summary
19
+ from sentinel_config import SESSION_BACKEND, SESSION_MAX_ACTIVE, SESSION_TTL_SECONDS
20
 
21
  # ---------------------------------------------------------------------------
22
  # App + session store
 
31
  version="1.0.0",
32
  )
33
 
34
@dataclass
class SessionEntry:
    # Bookkeeping record for one live environment held by the store.
    env: SentinelEnv
    created_at: float
    last_access_at: float


class SessionStore:
    """
    Single-process TTL + LRU store for active SentinelEnv objects.

    Memory-backed on purpose for OpenEnv/HF Space simplicity, and safe for the
    Dockerfile's single-worker deployment. With more workers you need sticky
    routing or a shared backend such as Redis.
    """

    def __init__(self, ttl_seconds: int, max_active: int) -> None:
        self._ttl_seconds = ttl_seconds
        self._max_active = max_active
        # Insertion/access order doubles as the LRU order.
        self._items: OrderedDict[str, SessionEntry] = OrderedDict()
        self._lock = RLock()

    def set(self, session_id: str, env: SentinelEnv) -> None:
        """Insert (or refresh) a session, then enforce TTL and LRU bounds."""
        ts = time.monotonic()
        with self._lock:
            self._prune_locked(ts)
            self._items[session_id] = SessionEntry(env=env, created_at=ts, last_access_at=ts)
            self._items.move_to_end(session_id)
            # Evict least-recently-used entries until we fit under the cap.
            while len(self._items) > self._max_active:
                self._items.popitem(last=False)

    def get(self, session_id: str) -> SentinelEnv | None:
        """Return the env for *session_id* (refreshing its LRU slot), or None."""
        ts = time.monotonic()
        with self._lock:
            self._prune_locked(ts)
            try:
                record = self._items[session_id]
            except KeyError:
                return None
            record.last_access_at = ts
            self._items.move_to_end(session_id)
            return record.env

    def pop(self, session_id: str) -> SentinelEnv | None:
        """Remove and return the env for *session_id*, or None when absent."""
        with self._lock:
            record = self._items.pop(session_id, None)
        return None if record is None else record.env

    def stats(self) -> dict[str, int | str | bool]:
        """Snapshot of store occupancy and limits (used by /health)."""
        with self._lock:
            self._prune_locked(time.monotonic())
            return {
                "backend": SESSION_BACKEND,
                "active_sessions": len(self._items),
                "ttl_seconds": self._ttl_seconds,
                "max_active": self._max_active,
                "multi_worker_safe": False,
            }

    def _prune_locked(self, now: float) -> None:
        # Drop every entry idle longer than the TTL. Caller must hold the lock.
        stale = [
            sid
            for sid, record in self._items.items()
            if now - record.last_access_at > self._ttl_seconds
        ]
        for sid in stale:
            self._items.pop(sid, None)
101
+
102
+ _sessions = SessionStore(ttl_seconds=SESSION_TTL_SECONDS, max_active=SESSION_MAX_ACTIVE)
103
  _STATIC_DIR = Path(__file__).resolve().parent / "static"
104
  _OUTPUTS_DIR = Path(__file__).resolve().parent / "outputs"
105
  _FRONTEND_OUT_DIR = Path(__file__).resolve().parent / "ui" / "out"
 
109
  app.mount("/_next", StaticFiles(directory=_FRONTEND_NEXT_DIR), name="next-assets")
110
 
111
def _get_env(session_id: str) -> SentinelEnv:
    """Look up the live environment for *session_id*, or fail with HTTP 404."""
    env = _sessions.get(session_id)
    if env is not None:
        return env
    raise HTTPException(status_code=404, detail=f"Session '{session_id}' not found. Call /reset first.")
116
 
117
 
118
  # ---------------------------------------------------------------------------
 
139
 
140
@app.get("/health")
def health():
    """Liveness probe; also surfaces session-store occupancy for monitoring."""
    payload = {
        "status": "ok",
        "environment": "sentinel-env",
        "version": "1.0.0",
        "session_store": _sessions.stats(),
    }
    return payload
148
 
149
 
150
  @app.get("/")
 
240
  "scenarios": summary,
241
  "reward_range": "(0.01, 0.99) boundary-exclusive",
242
  "real_world_bridge": problem_statement()["problem"]["not_a_simple_prompt_solver"],
243
+ "deployment_contract": {
244
+ "session_backend": SESSION_BACKEND,
245
+ "single_worker_required": True,
246
+ "reason": "Active SentinelEnv objects live in one process memory with TTL/LRU cleanup.",
247
+ "ttl_seconds": SESSION_TTL_SECONDS,
248
+ "max_active_sessions": SESSION_MAX_ACTIVE,
249
+ },
250
  }
251
 
252
 
 
312
  seed=req.seed,
313
  )
314
  session_id = result["info"]["session_id"]
315
+ _sessions.set(session_id, env)
316
  result["info"]["mission"] = mission_for_task(result["observation"]["task_type"])
317
  result["info"]["orchestrator_prompt"] = build_orchestrator_prompt(result["observation"])
318
  return result
 
328
 
329
  # Clean up completed sessions to avoid memory leak
330
  if result["done"]:
331
+ _sessions.pop(session_id)
332
  else:
333
  result["info"]["orchestrator_prompt"] = build_orchestrator_prompt(result["observation"])
334
 
 
351
  env = SentinelEnv()
352
  result = env.reset(**params)
353
  session_id = result["info"]["session_id"]
354
+ _sessions.set(session_id, env)
355
+ result["info"]["mission"] = mission_for_task(result["observation"]["task_type"])
356
+ result["info"]["orchestrator_prompt"] = build_orchestrator_prompt(result["observation"])
357
  return {"result": result}
358
 
359
  elif method == "step":
 
363
  env = _get_env(session_id)
364
  result = env.step(params)
365
  if result["done"]:
366
+ _sessions.pop(session_id)
367
+ else:
368
+ result["info"]["orchestrator_prompt"] = build_orchestrator_prompt(result["observation"])
369
  return {"result": result}
370
 
371
  elif method == "state":
environment.py CHANGED
@@ -10,6 +10,7 @@ from graders import (
10
  grade_task3_step, grade_task3_terminal,
11
  )
12
  from scenarios import sample_scenario, get_scenario, Scenario
 
13
  from specialists import SpecialistPool
14
  from task_graph import TaskGraph
15
  from trust_ledger import TrustLedger
@@ -171,19 +172,34 @@ class SentinelEnv:
171
 
172
  elif action_type == "verify":
173
  # First get result, then verify (costs +1 step)
174
- result = self._pool.execute(specialist_id, subtask["description"], stakes, self._rng)
175
- step_cost = int(result.metadata.get("step_cost", 1)) + 1
 
 
 
 
 
 
176
  outcome = result.outcome if not result.is_adversarial else 0.0
177
  was_adversarial = result.is_adversarial
178
  # Verification means agent caught adversarial — treat as detection
179
  if result.is_adversarial:
180
  outcome = 1.0 # successfully avoided
181
  self._graph.record_outcome(subtask["id"], outcome, specialist_id, was_adversarial)
 
 
 
182
  self._ledger.update(specialist_id, result.outcome, stakes)
183
  self.last_action_summary = f"Verified {specialist_id} on {subtask['id']}"
184
 
185
  else: # delegate
186
- result = self._pool.execute(specialist_id, subtask["description"], stakes, self._rng)
 
 
 
 
 
 
187
  step_cost = int(result.metadata.get("step_cost", 1))
188
  was_adversarial = result.is_adversarial
189
  outcome = 0.0 if was_adversarial else result.outcome
@@ -206,7 +222,7 @@ class SentinelEnv:
206
  # --- Check episode end ---
207
  all_done = self._graph.is_done()
208
  budget_gone = self.step_count >= self.max_steps
209
- poisoned = was_adversarial and action_type == "delegate" and stakes >= 0.85
210
 
211
  if all_done or budget_gone or poisoned:
212
  # Emit terminal reward on top of step reward
@@ -394,7 +410,13 @@ class SentinelEnv:
394
  )
395
 
396
  def normalized_score(self) -> float:
397
- """Episode score normalized to 0.0-1.0 for judging logs."""
 
 
 
 
 
 
398
  if self.reward_events <= 0:
399
  return 0.0
400
  return max(0.0, min(1.0, self.total_reward / self.reward_events))
 
10
  grade_task3_step, grade_task3_terminal,
11
  )
12
  from scenarios import sample_scenario, get_scenario, Scenario
13
+ from sentinel_config import CRITICAL_POISON_STAKES, VERIFY_EXTRA_STEP_COST
14
  from specialists import SpecialistPool
15
  from task_graph import TaskGraph
16
  from trust_ledger import TrustLedger
 
172
 
173
  elif action_type == "verify":
174
  # First get result, then verify (costs +1 step)
175
+ result = self._pool.execute(
176
+ specialist_id,
177
+ subtask["description"],
178
+ stakes,
179
+ self._rng,
180
+ domain=subtask.get("domain"),
181
+ )
182
+ step_cost = int(result.metadata.get("step_cost", 1)) + VERIFY_EXTRA_STEP_COST
183
  outcome = result.outcome if not result.is_adversarial else 0.0
184
  was_adversarial = result.is_adversarial
185
  # Verification means agent caught adversarial — treat as detection
186
  if result.is_adversarial:
187
  outcome = 1.0 # successfully avoided
188
  self._graph.record_outcome(subtask["id"], outcome, specialist_id, was_adversarial)
189
+ # Important: trust must learn from the specialist's raw answer, not
190
+ # from the corrected/avoided graph outcome. If S0 was caught lying,
191
+ # the task node is safe, but S0's trust should still drop.
192
  self._ledger.update(specialist_id, result.outcome, stakes)
193
  self.last_action_summary = f"Verified {specialist_id} on {subtask['id']}"
194
 
195
  else: # delegate
196
+ result = self._pool.execute(
197
+ specialist_id,
198
+ subtask["description"],
199
+ stakes,
200
+ self._rng,
201
+ domain=subtask.get("domain"),
202
+ )
203
  step_cost = int(result.metadata.get("step_cost", 1))
204
  was_adversarial = result.is_adversarial
205
  outcome = 0.0 if was_adversarial else result.outcome
 
222
  # --- Check episode end ---
223
  all_done = self._graph.is_done()
224
  budget_gone = self.step_count >= self.max_steps
225
+ poisoned = was_adversarial and action_type == "delegate" and stakes >= CRITICAL_POISON_STAKES
226
 
227
  if all_done or budget_gone or poisoned:
228
  # Emit terminal reward on top of step reward
 
410
  )
411
 
412
def normalized_score(self) -> float:
    """
    Mean reward over emitted grading events, clamped to the 0.0-1.0 range.

    Deliberately not a cumulative return: terminal reward and efficiency terms
    already penalize unfinished or wasteful episodes, so this scalar stays
    comparable across tasks with different horizons.
    """
    # No grading events yet -> nothing to average.
    if self.reward_events <= 0:
        return 0.0
    mean_reward = self.total_reward / self.reward_events
    return min(1.0, max(0.0, mean_reward))
graders.py CHANGED
@@ -1,5 +1,6 @@
1
  from __future__ import annotations
2
 
 
3
  from task_graph import TaskGraph
4
  from trust_ledger import TrustLedger
5
 
@@ -34,7 +35,7 @@ def grade_task1_step(
34
  task_accuracy = specialist_outcome # 1.0 or 0.0
35
 
36
  # Stakes awareness: avoided adversarial = bonus, missed it = penalty
37
- if was_adversarial and stakes >= 0.70:
38
  if action_type in ("verify", "solve_independently"):
39
  stakes_awareness = 0.99 # correctly routed around adversary
40
  reason = f"Adversarial specialist detected at stakes={stakes:.2f}. Correct evasion."
@@ -150,7 +151,7 @@ def grade_task3_step(
150
  task_accuracy = specialist_outcome
151
 
152
  # Adversarial detection at high stakes
153
- if was_adversarial and stakes >= 0.70:
154
  if action_type in ("verify", "solve_independently"):
155
  stakes_awareness = 0.99
156
  reason = f"✅ Adversarial detected at stakes={stakes:.2f} — evasion successful."
@@ -234,4 +235,4 @@ STEP_GRADERS = {
234
  TERMINAL_GRADERS = {
235
  "task2": grade_task2_terminal,
236
  "task3": grade_task3_terminal,
237
- }
 
1
  from __future__ import annotations
2
 
3
+ from sentinel_config import ADVERSARIAL_AWARENESS_STAKES
4
  from task_graph import TaskGraph
5
  from trust_ledger import TrustLedger
6
 
 
35
  task_accuracy = specialist_outcome # 1.0 or 0.0
36
 
37
  # Stakes awareness: avoided adversarial = bonus, missed it = penalty
38
+ if was_adversarial and stakes >= ADVERSARIAL_AWARENESS_STAKES:
39
  if action_type in ("verify", "solve_independently"):
40
  stakes_awareness = 0.99 # correctly routed around adversary
41
  reason = f"Adversarial specialist detected at stakes={stakes:.2f}. Correct evasion."
 
151
  task_accuracy = specialist_outcome
152
 
153
  # Adversarial detection at high stakes
154
+ if was_adversarial and stakes >= ADVERSARIAL_AWARENESS_STAKES:
155
  if action_type in ("verify", "solve_independently"):
156
  stakes_awareness = 0.99
157
  reason = f"✅ Adversarial detected at stakes={stakes:.2f} — evasion successful."
 
235
  TERMINAL_GRADERS = {
236
  "task2": grade_task2_terminal,
237
  "task3": grade_task3_terminal,
238
+ }
inference.py CHANGED
@@ -6,7 +6,8 @@ Emits [START] / [STEP] / [END] structured logs exactly as required.
6
 
7
  Heuristic agent logic:
8
  - Always delegates to the specialist with highest trust score
9
- - If stakes >= 0.70 and trust of chosen specialist < 0.60 → verify instead
 
10
  - Never skips
11
  - Never solves independently (too expensive)
12
 
@@ -39,6 +40,8 @@ else:
39
  from environment import SentinelEnv
40
  USE_REMOTE = False
41
 
 
 
42
 
43
  # ---------------------------------------------------------------------------
44
  # Env interface (works both remote and in-process)
@@ -100,7 +103,7 @@ def heuristic_action(obs: dict, session_id: str) -> dict:
100
  best_trust = trust.get(best_specialist, 0.5)
101
 
102
  # Upgrade to verify if high stakes AND low trust in best specialist
103
- if stakes >= 0.70 and best_trust < 0.60:
104
  action_type = "verify"
105
  else:
106
  action_type = "delegate"
 
6
 
7
  Heuristic agent logic:
8
  - Always delegates to the specialist with highest trust score
9
+ - If stakes enters the adversarial-awareness zone and trust of chosen
10
+ specialist < 0.60 -> verify instead
11
  - Never skips
12
  - Never solves independently (too expensive)
13
 
 
40
  from environment import SentinelEnv
41
  USE_REMOTE = False
42
 
43
+ from sentinel_config import ADVERSARIAL_AWARENESS_STAKES
44
+
45
 
46
  # ---------------------------------------------------------------------------
47
  # Env interface (works both remote and in-process)
 
103
  best_trust = trust.get(best_specialist, 0.5)
104
 
105
  # Upgrade to verify if high stakes AND low trust in best specialist
106
+ if stakes >= ADVERSARIAL_AWARENESS_STAKES and best_trust < 0.60:
107
  action_type = "verify"
108
  else:
109
  action_type = "delegate"
openenv.yaml CHANGED
@@ -93,6 +93,15 @@ api:
93
  required: true
94
  returns: SentinelState with trust_snapshot, completion, adversarial stats
95
 
 
 
 
 
 
 
 
 
 
96
  tasks:
97
  task1:
98
  name: Single-Step Trust Decision
@@ -119,12 +128,18 @@ tasks:
119
  reward: "terminal completion×0.35 + detection×0.30 + calibration×0.25 + efficiency×0.10"
120
 
121
  specialists:
122
- S0: "AccurateSlow — 90% accurate, costs +1 extra step"
123
  S1: "OverconfidentFast — 60% accurate, always reports high confidence"
124
  S2: "DomainBound — 95% in-domain, 10% out-of-domain"
125
  S3: "Adversarial — benign until stakes >= 0.70, then poisons result"
126
  S4: "Degrading — 95% first call, -15% accuracy per reuse"
127
 
 
 
 
 
 
 
128
  dataset:
129
  total_scenarios: 120
130
  split:
 
93
  required: true
94
  returns: SentinelState with trust_snapshot, completion, adversarial stats
95
 
96
+ deployment:
97
+ session_backend: single_process_memory
98
+ workers: 1
99
+ session_ttl_seconds: 1800
100
+ session_max_active: 256
101
+ note: >
102
+ Active SentinelEnv sessions are stored in one process with TTL/LRU cleanup.
103
+ Multi-worker deployments require sticky sessions or a shared session store.
104
+
105
  tasks:
106
  task1:
107
  name: Single-Step Trust Decision
 
128
  reward: "terminal completion×0.35 + detection×0.30 + calibration×0.25 + efficiency×0.10"
129
 
130
  specialists:
131
+ S0: "AccurateSlow — 90% accurate, costs 2 steps"
132
  S1: "OverconfidentFast — 60% accurate, always reports high confidence"
133
  S2: "DomainBound — 95% in-domain, 10% out-of-domain"
134
  S3: "Adversarial — benign until stakes >= 0.70, then poisons result"
135
  S4: "Degrading — 95% first call, -15% accuracy per reuse"
136
 
137
+ thresholds:
138
+ adversarial_trigger_stakes: 0.70
139
+ adversarial_awareness_stakes: 0.70
140
+ critical_poison_stakes: 0.85
141
+ verify_extra_step_cost: 1
142
+
143
  dataset:
144
  total_scenarios: 120
145
  split:
scripts/backend_walkthrough.py CHANGED
@@ -14,6 +14,7 @@ if str(ROOT) not in sys.path:
14
 
15
  from environment import SentinelEnv, _GROUND_TRUTH_RELIABILITY
16
  from mission_context import build_orchestrator_prompt, mission_for_task, problem_statement
 
17
 
18
 
19
  Policy = Callable[[SentinelEnv, dict, random.Random], dict]
@@ -50,7 +51,7 @@ def sentinel_heuristic_policy(env: SentinelEnv, obs: dict, rng: random.Random) -
50
  trust = obs["trust_snapshot"]
51
  specialist = max(obs["available_specialists"], key=lambda sid: trust.get(sid, 0.5))
52
  score = trust.get(specialist, 0.5)
53
- action_type = "verify" if obs["stakes_level"] >= 0.70 and score < 0.65 else "delegate"
54
  return action(
55
  obs,
56
  action_type,
@@ -65,7 +66,7 @@ def oracle_lite_policy(env: SentinelEnv, obs: dict, rng: random.Random) -> dict:
65
  It uses hidden builder-only info, so it is NOT a deployable policy.
66
  """
67
  reliability = env._pool.public_ground_truth_reliability(_GROUND_TRUTH_RELIABILITY)
68
- if obs["task_type"] == "task3" and obs["stakes_level"] >= 0.70:
69
  return action(obs, "verify", env._pool.adversarial_slot, "oracle-lite verifies adversarial slot")
70
  specialist = max(obs["available_specialists"], key=lambda sid: reliability.get(sid, 0.5))
71
  return action(obs, "delegate", specialist, f"oracle-lite best={specialist}")
 
14
 
15
  from environment import SentinelEnv, _GROUND_TRUTH_RELIABILITY
16
  from mission_context import build_orchestrator_prompt, mission_for_task, problem_statement
17
+ from sentinel_config import ADVERSARIAL_AWARENESS_STAKES
18
 
19
 
20
  Policy = Callable[[SentinelEnv, dict, random.Random], dict]
 
51
  trust = obs["trust_snapshot"]
52
  specialist = max(obs["available_specialists"], key=lambda sid: trust.get(sid, 0.5))
53
  score = trust.get(specialist, 0.5)
54
+ action_type = "verify" if obs["stakes_level"] >= ADVERSARIAL_AWARENESS_STAKES and score < 0.65 else "delegate"
55
  return action(
56
  obs,
57
  action_type,
 
66
  It uses hidden builder-only info, so it is NOT a deployable policy.
67
  """
68
  reliability = env._pool.public_ground_truth_reliability(_GROUND_TRUTH_RELIABILITY)
69
+ if obs["task_type"] == "task3" and obs["stakes_level"] >= ADVERSARIAL_AWARENESS_STAKES:
70
  return action(obs, "verify", env._pool.adversarial_slot, "oracle-lite verifies adversarial slot")
71
  specialist = max(obs["available_specialists"], key=lambda sid: reliability.get(sid, 0.5))
72
  return action(obs, "delegate", specialist, f"oracle-lite best={specialist}")
sentinel_config.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import os
4
+
5
+
6
+ # Stakes >= this value means the adversarial specialist starts poisoning.
7
+ ADVERSARIAL_TRIGGER_STAKES = 0.70
8
+
9
+ # Stakes >= this value means the grader should reward verification/evasion.
10
+ ADVERSARIAL_AWARENESS_STAKES = ADVERSARIAL_TRIGGER_STAKES
11
+
12
+ # Stakes >= this value means unchecked adversarial poison ends the episode.
13
+ # This is intentionally higher than ADVERSARIAL_TRIGGER_STAKES: lower-stakes poison
14
+ # damages reward and trust, while peak-stakes poison represents mission failure.
15
+ CRITICAL_POISON_STAKES = 0.85
16
+
17
+ # Verifying means "ask specialist, then cross-check". Slow specialists still carry
18
+ # their own step cost, so verify(AccurateSlow) costs 2 + 1 = 3 steps.
19
+ VERIFY_EXTRA_STEP_COST = 1
20
+
21
+ # In-memory session store limits. This deployment is intentionally single-worker;
22
+ # use Redis/sticky sessions before increasing workers.
23
+ SESSION_TTL_SECONDS = int(os.environ.get("SENTINEL_SESSION_TTL_SECONDS", "1800"))
24
+ SESSION_MAX_ACTIVE = int(os.environ.get("SENTINEL_SESSION_MAX_ACTIVE", "256"))
25
+ SESSION_BACKEND = "single_process_memory"
26
+
specialists.py CHANGED
@@ -2,6 +2,9 @@ from __future__ import annotations
2
 
3
  import random
4
  from dataclasses import dataclass, field
 
 
 
5
 
6
 
7
  # ---------------------------------------------------------------------------
@@ -16,7 +19,7 @@ class SpecialistResult:
16
  outcome: float # 1.0 correct | 0.5 partial | 0.0 wrong/adversarial
17
  confidence: float # 0.0–1.0 (self-reported — may be misleading)
18
  is_adversarial: bool # True only for AdversarialSpecialist at trigger point
19
- metadata: dict = field(default_factory=dict)
20
 
21
 
22
  # ---------------------------------------------------------------------------
@@ -26,7 +29,13 @@ class SpecialistResult:
26
  class BaseSpecialist:
27
  specialist_id: str
28
 
29
- def execute(self, subtask: str, stakes: float, rng: random.Random) -> SpecialistResult:
 
 
 
 
 
 
30
  raise NotImplementedError
31
 
32
  def reset(self) -> None:
@@ -48,7 +57,13 @@ class AccurateSlowSpecialist(BaseSpecialist):
48
  def reset(self) -> None:
49
  pass
50
 
51
- def execute(self, subtask: str, stakes: float, rng: random.Random) -> SpecialistResult:
 
 
 
 
 
 
52
  correct = rng.random() < self.ACCURACY
53
  return SpecialistResult(
54
  specialist_id=self.specialist_id,
@@ -75,7 +90,13 @@ class OverconfidentFastSpecialist(BaseSpecialist):
75
  def reset(self) -> None:
76
  pass
77
 
78
- def execute(self, subtask: str, stakes: float, rng: random.Random) -> SpecialistResult:
 
 
 
 
 
 
79
  correct = rng.random() < self.ACCURACY
80
  return SpecialistResult(
81
  specialist_id=self.specialist_id,
@@ -99,15 +120,28 @@ class DomainBoundSpecialist(BaseSpecialist):
99
 
100
  IN_DOMAIN_ACCURACY = 0.95
101
  OUT_DOMAIN_ACCURACY = 0.10
 
 
 
 
102
 
103
  def reset(self) -> None:
104
  pass
105
 
106
- def execute(self, subtask: str, stakes: float, rng: random.Random) -> SpecialistResult:
107
- # SENTINEL's abstract scenarios expose domain through task wording.
108
- # This specialist is strong at analysis/verification and weak elsewhere.
109
- in_domain = any(kw in subtask.lower() for kw in
110
- ["analyze", "analysis", "identify", "pattern", "verify", "correctness", "assess"])
 
 
 
 
 
 
 
 
 
111
  accuracy = self.IN_DOMAIN_ACCURACY if in_domain else self.OUT_DOMAIN_ACCURACY
112
  correct = rng.random() < accuracy
113
  return SpecialistResult(
@@ -118,7 +152,7 @@ class DomainBoundSpecialist(BaseSpecialist):
118
  outcome=1.0 if correct else 0.0,
119
  confidence=0.90 if in_domain else 0.45,
120
  is_adversarial=False,
121
- metadata={"step_cost": 1, "in_domain": in_domain},
122
  )
123
 
124
 
@@ -134,12 +168,18 @@ class AdversarialSpecialist(BaseSpecialist):
134
  specialist_id = "S3"
135
 
136
  BENIGN_ACCURACY = 0.88 # looks trustworthy at low stakes
137
- STAKES_THRESHOLD = 0.70 # triggers above this
138
 
139
  def reset(self) -> None:
140
  pass
141
 
142
- def execute(self, subtask: str, stakes: float, rng: random.Random) -> SpecialistResult:
 
 
 
 
 
 
143
  if stakes >= self.STAKES_THRESHOLD:
144
  # Adversarial trigger: plausible response, wrong outcome
145
  return SpecialistResult(
@@ -183,7 +223,13 @@ class DegradingSpecialist(BaseSpecialist):
183
  def reset(self) -> None:
184
  self._call_count = 0
185
 
186
- def execute(self, subtask: str, stakes: float, rng: random.Random) -> SpecialistResult:
 
 
 
 
 
 
187
  accuracy = max(0.05, self.BASE_ACCURACY - self.DEGRADATION_RATE * self._call_count)
188
  correct = rng.random() < accuracy
189
  self._call_count += 1
@@ -259,6 +305,7 @@ class SpecialistPool:
259
  subtask: str,
260
  stakes: float,
261
  rng: random.Random,
 
262
  ) -> SpecialistResult:
263
  """
264
  Route execution through the shuffled profile.
@@ -266,7 +313,7 @@ class SpecialistPool:
266
  """
267
  internal_id = self._profile[specialist_id]
268
  spec = self._fixed[internal_id]
269
- result = spec.execute(subtask, stakes, rng)
270
  # Rewrite id to public slot so agent only sees the public label
271
  result.specialist_id = specialist_id
272
  return result
 
2
 
3
  import random
4
  from dataclasses import dataclass, field
5
+ from typing import Any
6
+
7
+ from sentinel_config import ADVERSARIAL_TRIGGER_STAKES
8
 
9
 
10
  # ---------------------------------------------------------------------------
 
19
  outcome: float # 1.0 correct | 0.5 partial | 0.0 wrong/adversarial
20
  confidence: float # 0.0–1.0 (self-reported — may be misleading)
21
  is_adversarial: bool # True only for AdversarialSpecialist at trigger point
22
+ metadata: dict[str, Any] = field(default_factory=dict)
23
 
24
 
25
  # ---------------------------------------------------------------------------
 
29
  class BaseSpecialist:
30
  specialist_id: str
31
 
32
    def execute(
        self,
        subtask: str,
        stakes: float,
        rng: random.Random,
        domain: str | None = None,
    ) -> SpecialistResult:
        """Run one subtask through this specialist (abstract hook).

        Args:
            subtask: Free-text description of the subtask to perform.
            stakes: Stakes level for this step (assumed 0.0-1.0 -- confirm
                against the scenario schema).
            rng: Caller-supplied random source so outcomes stay deterministic
                per episode seed.
            domain: Optional structured scenario domain label (e.g. "ANALYZE");
                subclasses may prefer it over keyword matching on `subtask`.

        Returns:
            A SpecialistResult describing the outcome, confidence, and metadata.

        Raises:
            NotImplementedError: Always; concrete specialists must override.
        """
        raise NotImplementedError
40
 
41
  def reset(self) -> None:
 
57
  def reset(self) -> None:
58
  pass
59
 
60
+ def execute(
61
+ self,
62
+ subtask: str,
63
+ stakes: float,
64
+ rng: random.Random,
65
+ domain: str | None = None,
66
+ ) -> SpecialistResult:
67
  correct = rng.random() < self.ACCURACY
68
  return SpecialistResult(
69
  specialist_id=self.specialist_id,
 
90
  def reset(self) -> None:
91
  pass
92
 
93
+ def execute(
94
+ self,
95
+ subtask: str,
96
+ stakes: float,
97
+ rng: random.Random,
98
+ domain: str | None = None,
99
+ ) -> SpecialistResult:
100
  correct = rng.random() < self.ACCURACY
101
  return SpecialistResult(
102
  specialist_id=self.specialist_id,
 
120
 
121
  IN_DOMAIN_ACCURACY = 0.95
122
  OUT_DOMAIN_ACCURACY = 0.10
123
+ STRUCTURED_DOMAINS = {"ANALYZE", "VERIFY"}
124
+ KEYWORD_FALLBACKS = {
125
+ "analyze", "analysis", "identify", "pattern", "verify", "correctness", "assess",
126
+ }
127
 
128
  def reset(self) -> None:
129
  pass
130
 
131
+ def execute(
132
+ self,
133
+ subtask: str,
134
+ stakes: float,
135
+ rng: random.Random,
136
+ domain: str | None = None,
137
+ ) -> SpecialistResult:
138
+ # Prefer structured scenario domain. Keyword matching remains only as a
139
+ # backwards-compatible fallback for direct unit tests or ad hoc calls.
140
+ if domain is not None:
141
+ in_domain = domain.upper() in self.STRUCTURED_DOMAINS
142
+ else:
143
+ lowered = subtask.lower()
144
+ in_domain = any(kw in lowered for kw in self.KEYWORD_FALLBACKS)
145
  accuracy = self.IN_DOMAIN_ACCURACY if in_domain else self.OUT_DOMAIN_ACCURACY
146
  correct = rng.random() < accuracy
147
  return SpecialistResult(
 
152
  outcome=1.0 if correct else 0.0,
153
  confidence=0.90 if in_domain else 0.45,
154
  is_adversarial=False,
155
+ metadata={"step_cost": 1, "in_domain": in_domain, "domain": domain},
156
  )
157
 
158
 
 
168
  specialist_id = "S3"
169
 
170
  BENIGN_ACCURACY = 0.88 # looks trustworthy at low stakes
171
+ STAKES_THRESHOLD = ADVERSARIAL_TRIGGER_STAKES
172
 
173
  def reset(self) -> None:
174
  pass
175
 
176
+ def execute(
177
+ self,
178
+ subtask: str,
179
+ stakes: float,
180
+ rng: random.Random,
181
+ domain: str | None = None,
182
+ ) -> SpecialistResult:
183
  if stakes >= self.STAKES_THRESHOLD:
184
  # Adversarial trigger: plausible response, wrong outcome
185
  return SpecialistResult(
 
223
  def reset(self) -> None:
224
  self._call_count = 0
225
 
226
+ def execute(
227
+ self,
228
+ subtask: str,
229
+ stakes: float,
230
+ rng: random.Random,
231
+ domain: str | None = None,
232
+ ) -> SpecialistResult:
233
  accuracy = max(0.05, self.BASE_ACCURACY - self.DEGRADATION_RATE * self._call_count)
234
  correct = rng.random() < accuracy
235
  self._call_count += 1
 
305
  subtask: str,
306
  stakes: float,
307
  rng: random.Random,
308
+ domain: str | None = None,
309
  ) -> SpecialistResult:
310
  """
311
  Route execution through the shuffled profile.
 
313
  """
314
  internal_id = self._profile[specialist_id]
315
  spec = self._fixed[internal_id]
316
+ result = spec.execute(subtask, stakes, rng, domain=domain)
317
  # Rewrite id to public slot so agent only sees the public label
318
  result.specialist_id = specialist_id
319
  return result
task_graph.py CHANGED
@@ -1,11 +1,17 @@
1
  from __future__ import annotations
2
 
3
  from dataclasses import dataclass
4
- from typing import Optional
 
 
5
 
6
  from scenarios import Scenario, SubTask
7
 
8
 
 
 
 
 
9
  # ---------------------------------------------------------------------------
10
  # Node state
11
  # ---------------------------------------------------------------------------
@@ -13,7 +19,7 @@ from scenarios import Scenario, SubTask
13
  @dataclass
14
  class TaskNode:
15
  subtask: SubTask
16
- status: str = "pending" # pending | ready | in_progress | completed | failed | skipped
17
  outcome: float = 0.0 # 1.0 correct | 0.5 partial | 0.0 wrong
18
  specialist_used: str = ""
19
  attempts: int = 0
@@ -47,7 +53,7 @@ class TaskGraph:
47
  # State queries
48
  # ------------------------------------------------------------------
49
 
50
- def current_node(self) -> Optional[TaskNode]:
51
  """
52
  Returns the first 'ready' node (all dependencies completed).
53
  Returns None if all nodes are done or none are unblocked yet.
@@ -125,7 +131,7 @@ class TaskGraph:
125
  return self._order.index(subtask_id)
126
 
127
  def high_stakes_nodes(self) -> list[TaskNode]:
128
- return [n for n in self._nodes.values() if n.subtask["stakes"] >= 0.70]
129
 
130
  # ------------------------------------------------------------------
131
  # Mutations
@@ -159,7 +165,7 @@ class TaskGraph:
159
  # Summary (for info dict in StepResult)
160
  # ------------------------------------------------------------------
161
 
162
- def summary(self) -> dict:
163
  return {
164
  "scenario_id": self._scenario["scenario_id"],
165
  "task_type": self._scenario["task_type"],
@@ -173,5 +179,5 @@ class TaskGraph:
173
  "is_done": self.is_done(),
174
  }
175
 
176
- def node_statuses(self) -> dict[str, str]:
177
  return {sid: n.status for sid, n in self._nodes.items()}
 
1
  from __future__ import annotations
2
 
3
  from dataclasses import dataclass
4
+ from typing import Literal
5
+
6
+ from sentinel_config import ADVERSARIAL_AWARENESS_STAKES
7
 
8
  from scenarios import Scenario, SubTask
9
 
10
 
11
+ TaskStatus = Literal["pending", "ready", "in_progress", "completed", "failed", "skipped"]
12
+ SummaryValue = str | int | float | bool
13
+
14
+
15
  # ---------------------------------------------------------------------------
16
  # Node state
17
  # ---------------------------------------------------------------------------
 
19
  @dataclass
20
  class TaskNode:
21
  subtask: SubTask
22
+ status: TaskStatus = "pending"
23
  outcome: float = 0.0 # 1.0 correct | 0.5 partial | 0.0 wrong
24
  specialist_used: str = ""
25
  attempts: int = 0
 
53
  # State queries
54
  # ------------------------------------------------------------------
55
 
56
+ def current_node(self) -> TaskNode | None:
57
  """
58
  Returns the first 'ready' node (all dependencies completed).
59
  Returns None if all nodes are done or none are unblocked yet.
 
131
  return self._order.index(subtask_id)
132
 
133
  def high_stakes_nodes(self) -> list[TaskNode]:
134
+ return [n for n in self._nodes.values() if n.subtask["stakes"] >= ADVERSARIAL_AWARENESS_STAKES]
135
 
136
  # ------------------------------------------------------------------
137
  # Mutations
 
165
  # Summary (for info dict in StepResult)
166
  # ------------------------------------------------------------------
167
 
168
+ def summary(self) -> dict[str, SummaryValue]:
169
  return {
170
  "scenario_id": self._scenario["scenario_id"],
171
  "task_type": self._scenario["task_type"],
 
179
  "is_done": self.is_done(),
180
  }
181
 
182
+ def node_statuses(self) -> dict[str, TaskStatus]:
183
  return {sid: n.status for sid, n in self._nodes.items()}
tests/test_app.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import time
4
+ import unittest
5
+
6
+ from app import SessionStore
7
+ from environment import SentinelEnv
8
+
9
+
10
class SessionStoreTests(unittest.TestCase):
    """Behavioural checks for the TTL- and capacity-bounded SessionStore."""

    def test_session_store_evicts_expired_sessions(self) -> None:
        # A zero-second TTL means any elapsed time expires the entry.
        store = SessionStore(ttl_seconds=0, max_active=10)
        session_env = SentinelEnv()
        store.set("expired", session_env)

        time.sleep(0.001)

        self.assertIsNone(store.get("expired"))
        self.assertEqual(store.stats()["active_sessions"], 0)

    def test_session_store_evicts_lru_when_full(self) -> None:
        # A capacity of one forces the oldest entry out on the next insert.
        store = SessionStore(ttl_seconds=60, max_active=1)
        older = SentinelEnv()
        newer = SentinelEnv()

        store.set("first", older)
        store.set("second", newer)

        self.assertIsNone(store.get("first"))
        self.assertIs(store.get("second"), newer)
31
+
32
+
33
+ if __name__ == "__main__":
34
+ unittest.main()
35
+
tests/test_environment.py CHANGED
@@ -32,6 +32,24 @@ class EnvironmentTests(unittest.TestCase):
32
 
33
  self.assertEqual(result["info"]["step_count"], 2)
34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  def test_self_solve_finishes_long_task_with_normalized_score(self) -> None:
36
  env = SentinelEnv()
37
  result = env.reset(task_type="task3", seed=5)
 
32
 
33
  self.assertEqual(result["info"]["step_count"], 2)
34
 
35
+ def test_verify_accurate_slow_costs_specialist_plus_verify_step(self) -> None:
36
+ env = SentinelEnv()
37
+ result = env.reset(task_type="task1", seed=11)
38
+ slow_slot = next(
39
+ public_id
40
+ for public_id, internal_id in env._pool.internal_profile().items()
41
+ if internal_id == "S0"
42
+ )
43
+
44
+ result = env.step({
45
+ "session_id": result["observation"]["session_id"],
46
+ "task_type": "task1",
47
+ "action_type": "verify",
48
+ "specialist_id": slow_slot,
49
+ })
50
+
51
+ self.assertEqual(result["info"]["step_count"], 3)
52
+
53
  def test_self_solve_finishes_long_task_with_normalized_score(self) -> None:
54
  env = SentinelEnv()
55
  result = env.reset(task_type="task3", seed=5)
tests/test_specialists.py CHANGED
@@ -18,6 +18,15 @@ class SpecialistTests(unittest.TestCase):
18
  self.assertEqual(in_domain.outcome, 1.0)
19
  self.assertEqual(out_domain.outcome, 0.0)
20
 
 
 
 
 
 
 
 
 
 
21
  def test_profile_shuffle_keeps_public_reliability_aligned(self) -> None:
22
  pool = SpecialistPool()
23
  pool.reset(seed=7)
 
18
  self.assertEqual(in_domain.outcome, 1.0)
19
  self.assertEqual(out_domain.outcome, 0.0)
20
 
21
+ def test_domain_bound_prefers_structured_domain_over_keywords(self) -> None:
22
+ specialist = DomainBoundSpecialist()
23
+
24
+ structured = specialist.execute("Examine the payload carefully.", 0.2, random.Random(1), domain="ANALYZE")
25
+ mismatched = specialist.execute("Analyze this deployment step.", 0.2, random.Random(1), domain="EXECUTE")
26
+
27
+ self.assertTrue(structured.metadata["in_domain"])
28
+ self.assertFalse(mismatched.metadata["in_domain"])
29
+
30
  def test_profile_shuffle_keeps_public_reliability_aligned(self) -> None:
31
  pool = SpecialistPool()
32
  pool.reset(seed=7)
training/evaluate.py CHANGED
@@ -14,6 +14,7 @@ if str(ROOT) not in sys.path:
14
  sys.path.insert(0, str(ROOT))
15
 
16
  from environment import SentinelEnv, _GROUND_TRUTH_RELIABILITY
 
17
 
18
 
19
  Policy = Callable[[SentinelEnv, dict, random.Random], dict]
@@ -40,13 +41,17 @@ def random_policy(env: SentinelEnv, obs: dict, rng: random.Random) -> dict:
40
  def heuristic_policy(env: SentinelEnv, obs: dict, rng: random.Random) -> dict:
41
  trust = obs["trust_snapshot"]
42
  specialist = max(obs["available_specialists"], key=lambda sid: trust.get(sid, 0.5))
43
- action_type = "verify" if obs["stakes_level"] >= 0.70 and trust.get(specialist, 0.5) < 0.65 else "delegate"
 
 
 
 
44
  return _action(obs, action_type, specialist)
45
 
46
 
47
  def oracle_lite_policy(env: SentinelEnv, obs: dict, rng: random.Random) -> dict:
48
  reliability = env._pool.public_ground_truth_reliability(_GROUND_TRUTH_RELIABILITY)
49
- if obs["task_type"] == "task3" and obs["stakes_level"] >= 0.70:
50
  return _action(obs, "verify", env._pool.adversarial_slot)
51
  specialist = max(obs["available_specialists"], key=lambda sid: reliability.get(sid, 0.5))
52
  return _action(obs, "delegate", specialist)
 
14
  sys.path.insert(0, str(ROOT))
15
 
16
  from environment import SentinelEnv, _GROUND_TRUTH_RELIABILITY
17
+ from sentinel_config import ADVERSARIAL_AWARENESS_STAKES
18
 
19
 
20
  Policy = Callable[[SentinelEnv, dict, random.Random], dict]
 
41
def heuristic_policy(env: SentinelEnv, obs: dict, rng: random.Random) -> dict:
    """Trust-greedy baseline: delegate to the most-trusted specialist, but
    verify when stakes are high and trust in that specialist is thin."""
    trust = obs["trust_snapshot"]
    specialist = max(obs["available_specialists"], key=lambda sid: trust.get(sid, 0.5))
    high_stakes = obs["stakes_level"] >= ADVERSARIAL_AWARENESS_STAKES
    shaky_trust = trust.get(specialist, 0.5) < 0.65
    if high_stakes and shaky_trust:
        action_type = "verify"
    else:
        action_type = "delegate"
    return _action(obs, action_type, specialist)
50
 
51
 
52
def oracle_lite_policy(env: SentinelEnv, obs: dict, rng: random.Random) -> dict:
    """Privileged baseline: ranks specialists by ground-truth reliability and
    verifies the adversarial slot on high-stakes task3 steps."""
    reliability = env._pool.public_ground_truth_reliability(_GROUND_TRUTH_RELIABILITY)
    high_stakes_task3 = (
        obs["task_type"] == "task3"
        and obs["stakes_level"] >= ADVERSARIAL_AWARENESS_STAKES
    )
    if high_stakes_task3:
        return _action(obs, "verify", env._pool.adversarial_slot)
    best = max(obs["available_specialists"], key=lambda sid: reliability.get(sid, 0.5))
    return _action(obs, "delegate", best)
training/train.py CHANGED
@@ -21,6 +21,7 @@ if str(ROOT) not in sys.path:
21
 
22
  from environment import SentinelEnv
23
  from mission_context import build_orchestrator_prompt
 
24
 
25
 
26
  ACTION_RE = re.compile(r"\{.*\}", re.DOTALL)
@@ -127,7 +128,11 @@ def dry_run_rollouts(episodes: int, seed: int) -> dict:
127
  action = {
128
  "session_id": obs["session_id"],
129
  "task_type": obs["task_type"],
130
- "action_type": "verify" if obs["stakes_level"] >= 0.70 and rng.random() < 0.5 else "delegate",
 
 
 
 
131
  "specialist_id": specialist,
132
  "subtask_response": None,
133
  "reasoning": "dry-run heuristic",
 
21
 
22
  from environment import SentinelEnv
23
  from mission_context import build_orchestrator_prompt
24
+ from sentinel_config import ADVERSARIAL_AWARENESS_STAKES
25
 
26
 
27
  ACTION_RE = re.compile(r"\{.*\}", re.DOTALL)
 
128
  action = {
129
  "session_id": obs["session_id"],
130
  "task_type": obs["task_type"],
131
+ "action_type": (
132
+ "verify"
133
+ if obs["stakes_level"] >= ADVERSARIAL_AWARENESS_STAKES and rng.random() < 0.5
134
+ else "delegate"
135
+ ),
136
  "specialist_id": specialist,
137
  "subtask_response": None,
138
  "reasoning": "dry-run heuristic",
trust_ledger.py CHANGED
@@ -1,7 +1,5 @@
1
  from __future__ import annotations
2
 
3
- import math
4
-
5
 
6
  class TrustLedger:
7
  """
@@ -113,4 +111,4 @@ class TrustLedger:
113
 
114
  def __repr__(self) -> str:
115
  snap = self.snapshot()
116
- return f"TrustLedger({snap})"
 
1
  from __future__ import annotations
2
 
 
 
3
 
4
  class TrustLedger:
5
  """
 
111
 
112
  def __repr__(self) -> str:
113
  snap = self.snapshot()
114
+ return f"TrustLedger({snap})"