Spaces:

XcodeAddy
/

sentinel-env

Running

App Files Files Community

XcodeAddy commited on 23 days ago

Commit

325aa05

0 Parent(s):

Initial Set-up

Browse files

Files changed (23) hide show

.DS_Store +0 -0
.gitattributes +1 -0
Dockerfile +32 -0
README.md +0 -0
app.py +206 -0
comms_bus.py +0 -0
environment.py +382 -0
graders.py +237 -0
inference.py +239 -0
models.py +146 -0
openenv.yaml +148 -0
pyproject.toml +0 -0
requirements.txt +5 -0
scenarios.py +266 -0
specialists.py +275 -0
task_graph.py +154 -0
tests/test_environment.py +0 -0
tests/test_graders.py +0 -0
tests/test_specialists.py +0 -0
training/colab_notebook.ipynb +0 -0
training/evaluate.py +0 -0
training/train.py +0 -0
trust_ledger.py +116 -0

.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

.gitattributes ADDED Viewed

	@@ -0,0 +1 @@


1	+ outputs/charts/*.png filter=lfs diff=lfs merge=lfs -text

Dockerfile ADDED Viewed

	@@ -0,0 +1,32 @@

+FROM python:3.11-slim
+WORKDIR /app
+# Install dependencies first (cached layer)
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+# Copy all source files
+COPY app.py .
+COPY environment.py .
+COPY models.py .
+COPY graders.py .
+COPY specialists.py .
+COPY trust_ledger.py .
+COPY task_graph.py .
+COPY scenarios.py .
+COPY openenv.yaml .
+COPY inference.py .
+# Create outputs directory for baseline scores
+RUN mkdir -p outputs
+# Expose port
+EXPOSE 7860
+# Health check
+HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
+    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:7860/health')" || exit 1
+# Start server
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]

README.md ADDED Viewed

File without changes

app.py ADDED Viewed

	@@ -0,0 +1,206 @@

+from __future__ import annotations
+import os
+from typing import Any
+from fastapi import FastAPI, HTTPException, Query
+from fastapi.responses import JSONResponse
+from pydantic import BaseModel
+from environment import SentinelEnv
+from scenarios import scenario_summary
+# ---------------------------------------------------------------------------
+# App + session store
+# ---------------------------------------------------------------------------
+app = FastAPI(
+    title="SENTINEL — Multi-Agent Trust Calibration Environment",
+    description=(
+        "OpenEnv-compatible RL environment where an orchestrator agent learns "
+        "dynamic trust calibration across adversarial long-horizon tasks."
+    ),
+    version="1.0.0",
+)
+# One env instance per session_id
+_sessions: dict[str, SentinelEnv] = {}
+def _get_env(session_id: str) -> SentinelEnv:
+    if session_id not in _sessions:
+        raise HTTPException(status_code=404, detail=f"Session '{session_id}' not found. Call /reset first.")
+    return _sessions[session_id]
+# ---------------------------------------------------------------------------
+# Request / Response models
+# ---------------------------------------------------------------------------
+class ResetRequest(BaseModel):
+    task_type:   str | None = None
+    scenario_id: str | None = None
+    seed:        int | None = None
+class StepRequest(BaseModel):
+    session_id:       str
+    task_type:        str
+    action_type:      str                  # delegate | verify | solve_independently | skip
+    specialist_id:    str | None = None
+    subtask_response: str | None = None
+    reasoning:        str | None = None
+# ---------------------------------------------------------------------------
+# Endpoints
+# ---------------------------------------------------------------------------
+@app.get("/health")
+def health():
+    return {"status": "ok", "environment": "sentinel-env", "version": "1.0.0"}
+@app.get("/metadata")
+def metadata():
+    summary = scenario_summary()
+    return {
+        "name":        "sentinel-env",
+        "version":     "1.0.0",
+        "description": "Multi-agent trust calibration RL environment.",
+        "tasks": {
+            "task1": {"name": "Single-Step Trust Decision", "difficulty": "easy",  "subtasks": 10, "max_steps": 15},
+            "task2": {"name": "Multi-Step Delegation Chain","difficulty": "medium","subtasks": 15, "max_steps": 30},
+            "task3": {"name": "Full Adversarial Episode",   "difficulty": "hard",  "subtasks": 20, "max_steps": 45},
+        },
+        "specialists": ["S0 (AccurateSlow)", "S1 (OverconfidentFast)",
+                        "S2 (DomainBound)", "S3 (Adversarial)", "S4 (Degrading)"],
+        "action_types": ["delegate", "verify", "solve_independently", "skip"],
+        "scenarios": summary,
+        "reward_range": "(0.01, 0.99) boundary-exclusive",
+    }
+@app.get("/tasks")
+def tasks():
+    return {
+        "task1": {
+            "difficulty": "easy",
+            "description": "10-subtask linear chain. Choose which specialist to delegate each subtask to.",
+            "adversary_active": False,
+            "reward": "0.99 correct delegation + stakes awareness | 0.02 skip",
+        },
+        "task2": {
+            "difficulty": "medium",
+            "description": "15-subtask branching DAG. Build trust profile across honest specialists.",
+            "adversary_active": False,
+            "reward": "0.99 completion × calibration bonus",
+        },
+        "task3": {
+            "difficulty": "hard",
+            "description": "20-subtask full DAG. Adversarial specialist active. Detect and route around poison.",
+            "adversary_active": True,
+            "reward": "0.35×completion + 0.30×detection + 0.25×calibration + 0.10×efficiency",
+        },
+    }
+@app.get("/schema")
+def schema():
+    return {
+        "reset_request":  ResetRequest.model_json_schema(),
+        "step_request":   StepRequest.model_json_schema(),
+    }
+@app.get("/grader")
+def grader():
+    return {
+        "task1": {
+            "step":     "task_accuracy×0.50 + stakes_awareness×0.35 + efficiency×0.15",
+            "terminal": "same as last step",
+        },
+        "task2": {
+            "step":     "task_accuracy×0.65 + efficiency×0.35",
+            "terminal": "completion_rate×0.65 + trust_calibration×0.35",
+        },
+        "task3": {
+            "step":     "task_accuracy×0.40 + stakes_awareness×0.45 + efficiency×0.15",
+            "terminal": "completion×0.35 + detection×0.30 + calibration×0.25 + efficiency×0.10",
+        },
+    }
+@app.post("/reset")
+def reset(req: ResetRequest = ResetRequest()):
+    env = SentinelEnv()
+    result = env.reset(
+        task_type=req.task_type,
+        scenario_id=req.scenario_id,
+        seed=req.seed,
+    )
+    session_id = result["info"]["session_id"]
+    _sessions[session_id] = env
+    return result
+@app.post("/step")
+def step(req: StepRequest, session_id: str = Query(...)):
+    env = _get_env(session_id)
+    try:
+        result = env.step(req.model_dump())
+    except (RuntimeError, ValueError) as e:
+        raise HTTPException(status_code=400, detail=str(e))
+    # Clean up completed sessions to avoid memory leak
+    if result["done"]:
+        _sessions.pop(session_id, None)
+    return result
+@app.get("/state")
+def state(session_id: str = Query(...)):
+    env = _get_env(session_id)
+    return env.state(session_id=session_id)
+@app.post("/mcp")
+def mcp(body: dict[str, Any]):
+    """MCP-compatible endpoint for tool-calling agents."""
+    method = body.get("method", "")
+    params = body.get("params", {})
+    if method == "reset":
+        env = SentinelEnv()
+        result = env.reset(**params)
+        session_id = result["info"]["session_id"]
+        _sessions[session_id] = env
+        return {"result": result}
+    elif method == "step":
+        session_id = params.get("session_id") or body.get("session_id")
+        if not session_id:
+            raise HTTPException(status_code=400, detail="session_id required for step.")
+        env = _get_env(session_id)
+        result = env.step(params)
+        if result["done"]:
+            _sessions.pop(session_id, None)
+        return {"result": result}
+    elif method == "state":
+        session_id = params.get("session_id")
+        if not session_id:
+            raise HTTPException(status_code=400, detail="session_id required for state.")
+        return {"result": _get_env(session_id).state(session_id)}
+    else:
+        raise HTTPException(status_code=400, detail=f"Unknown method: {method}")
+# ---------------------------------------------------------------------------
+# Entry point
+# ---------------------------------------------------------------------------
+if __name__ == "__main__":
+    import uvicorn
+    port = int(os.environ.get("PORT", 7860))
+    uvicorn.run("app:app", host="0.0.0.0", port=port, reload=False)

comms_bus.py ADDED Viewed

File without changes

environment.py ADDED Viewed

	@@ -0,0 +1,382 @@

+from __future__ import annotations
+import random
+import uuid
+from typing import Any
+from graders import (
+    grade_task1_step,
+    grade_task2_step, grade_task2_terminal,
+    grade_task3_step, grade_task3_terminal,
+)
+from scenarios import sample_scenario, get_scenario, Scenario
+from specialists import SpecialistPool
+from task_graph import TaskGraph
+from trust_ledger import TrustLedger
+# ---------------------------------------------------------------------------
+# Lazy Pydantic imports — falls back to plain dicts if not installed
+# ---------------------------------------------------------------------------
+try:
+    from models import (
+        SentinelObservation, SentinelAction, SentinelReward,
+        SentinelState, StepResult, EpisodeStatus,
+        TaskType, ActionType, SpecialistId, ResetRequest,
+    )
+    _PYDANTIC = True
+except ImportError:
+    _PYDANTIC = False
+# Hidden ground-truth reliability for each specialist FSM type.
+# These are NEVER sent to the agent — only used by the reward engine
+# to compute calibration score.
+_GROUND_TRUTH_RELIABILITY: dict[str, float] = {
+    "S0": 0.90,   # AccurateSlow
+    "S1": 0.60,   # OverconfidentFast
+    "S2": 0.70,   # DomainBound (avg of in/out domain)
+    "S3": 0.15,   # Adversarial (unreliable overall)
+    "S4": 0.65,   # Degrading (avg over calls)
+}
+MAX_STEPS = {
+    "task1": 15,
+    "task2": 30,
+    "task3": 45,
+}
+class SentinelEnv:
+    """
+    Core environment class — mirrors Round 1 IncidentEnv pattern exactly.
+    reset() / step() / state() API.
+    No BaseEnv subclassing needed — plain Python class, FastAPI wraps it.
+    """
+    def __init__(self) -> None:
+        self.current_scenario: Scenario | None = None
+        self.episode_id: str = ""
+        self.session_id: str = ""
+        self.step_count: int = 0
+        self.max_steps: int = 0
+        self.total_reward: float = 0.0
+        self.last_reward: float = 0.0
+        self.done: bool = False
+        self.episode_status: str = "active"
+        self.last_action_summary: str | None = None
+        self._graph: TaskGraph | None = None
+        self._ledger: TrustLedger = TrustLedger()
+        self._pool: SpecialistPool = SpecialistPool()
+        self._rng: random.Random = random.Random()
+    # ------------------------------------------------------------------
+    # reset()
+    # ------------------------------------------------------------------
+    def reset(
+        self,
+        task_type: str | None = None,
+        scenario_id: str | None = None,
+        seed: int | None = None,
+    ) -> dict:
+        self._rng = random.Random(seed)
+        # Select scenario
+        if scenario_id:
+            scenario = get_scenario(scenario_id)
+        else:
+            task = task_type or "task3"
+            scenario = sample_scenario(task, seed=seed)
+        self.current_scenario = scenario
+        self.episode_id       = str(uuid.uuid4())
+        self.session_id       = str(uuid.uuid4())
+        self.step_count       = 0
+        self.max_steps        = MAX_STEPS[scenario["task_type"]]
+        self.total_reward     = 0.0
+        self.last_reward      = 0.0
+        self.done             = False
+        self.episode_status   = "active"
+        self.last_action_summary = None
+        # Reset subcomponents
+        self._graph = TaskGraph(scenario)
+        self._ledger.reset()
+        self._pool.reset(seed=seed)
+        return self._build_step_result(
+            reward_value=0.0,
+            reason="Episode initialized.",
+            breakdown={},
+            done=False,
+            extra_info={"episode_id": self.episode_id, "session_id": self.session_id},
+        )
+    # ------------------------------------------------------------------
+    # step()
+    # ------------------------------------------------------------------
+    def step(self, action: dict) -> dict:
+        if self.current_scenario is None:
+            raise RuntimeError("Call reset() before step().")
+        if self.done:
+            raise RuntimeError("Episode already completed. Call reset().")
+        # --- Validate session ---
+        if action.get("session_id") and action["session_id"] != self.session_id:
+            raise ValueError(
+                f"session_id mismatch: expected '{self.session_id}', got '{action['session_id']}'"
+            )
+        action_type     = action.get("action_type", "delegate")
+        specialist_id   = action.get("specialist_id")
+        task_type       = self.current_scenario["task_type"]
+        # --- Validate action fields ---
+        if action_type in ("delegate", "verify") and not specialist_id:
+            raise ValueError(f"action_type='{action_type}' requires specialist_id.")
+        if action_type == "solve_independently" and not action.get("subtask_response"):
+            raise ValueError("action_type='solve_independently' requires subtask_response.")
+        # --- Get current subtask ---
+        node = self._graph.current_node()
+        if node is None:
+            # All nodes done — emit terminal reward
+            return self._terminal_reward()
+        subtask  = node.subtask
+        stakes   = subtask["stakes"]
+        self.step_count += 1
+        # --- Execute specialist or self-solve ---
+        if action_type == "skip":
+            self._graph.skip_node(subtask["id"])
+            outcome        = 0.0
+            was_adversarial = False
+            self.last_action_summary = f"Skipped {subtask['id']}"
+        elif action_type == "solve_independently":
+            # Agent solves itself — always correct (no specialist involved)
+            # But costs 2 steps (enforced via max_steps budget pressure)
+            self.step_count += 1   # extra step cost
+            outcome         = 1.0
+            was_adversarial = False
+            self._graph.record_outcome(subtask["id"], outcome, "SELF")
+            self.last_action_summary = f"Self-solved {subtask['id']}"
+        elif action_type == "verify":
+            # First get result, then verify (costs +1 step)
+            result = self._pool.execute(specialist_id, subtask["description"], stakes, self._rng)
+            self.step_count += 1   # verification step cost
+            outcome         = result.outcome if not result.is_adversarial else 0.0
+            was_adversarial  = result.is_adversarial
+            # Verification means agent caught adversarial — treat as detection
+            if result.is_adversarial:
+                outcome = 1.0   # successfully avoided
+            self._graph.record_outcome(subtask["id"], outcome, specialist_id, was_adversarial)
+            self._ledger.update(specialist_id, result.outcome, stakes)
+            self.last_action_summary = f"Verified {specialist_id} on {subtask['id']}"
+        else:  # delegate
+            result          = self._pool.execute(specialist_id, subtask["description"], stakes, self._rng)
+            was_adversarial  = result.is_adversarial
+            outcome         = 0.0 if was_adversarial else result.outcome
+            self._graph.record_outcome(subtask["id"], outcome, specialist_id, was_adversarial)
+            self._ledger.update(specialist_id, result.outcome, stakes)
+            self.last_action_summary = f"Delegated to {specialist_id} on {subtask['id']}"
+        # --- Grade this step ---
+        reward_value, reason, breakdown = self._grade_step(
+            task_type, action_type, specialist_id, outcome,
+            stakes, was_adversarial,
+        )
+        self.last_reward   = reward_value
+        self.total_reward += reward_value
+        # --- Check episode end ---
+        all_done    = self._graph.is_done()
+        budget_gone = self.step_count >= self.max_steps
+        poisoned    = was_adversarial and action_type == "delegate" and stakes >= 0.85
+        if all_done or budget_gone or poisoned:
+            # Emit terminal reward on top of step reward
+            return self._terminal_reward(
+                step_reward=reward_value,
+                step_reason=reason,
+                step_breakdown=breakdown,
+                forced_end=poisoned,
+            )
+        return self._build_step_result(reward_value, reason, breakdown, done=False)
+    # ------------------------------------------------------------------
+    # state()
+    # ------------------------------------------------------------------
+    def state(self, session_id: str | None = None) -> dict:
+        if self.current_scenario is None:
+            raise RuntimeError("No active episode. Call reset() first.")
+        return {
+            "episode_id":             self.episode_id,
+            "session_id":             session_id or self.session_id,
+            "step_count":             self.step_count,
+            "max_steps":              self.max_steps,
+            "total_reward":           round(self.total_reward, 4),
+            "done":                   self.done,
+            "scenario_id":            self.current_scenario["scenario_id"],
+            "task_type":              self.current_scenario["task_type"],
+            "difficulty":             self._difficulty(),
+            "status":                 self.episode_status,
+            "last_reward":            round(self.last_reward, 4),
+            "subtasks_completed":     self._graph.subtasks_completed(),
+            "subtasks_total":         self._graph.subtasks_total(),
+            "trust_snapshot":         self._ledger.snapshot(),
+            "adversarial_detections": self._graph.adversarial_detections(),
+            "adversarial_poisonings": self._graph.adversarial_poisonings(),
+        }
+    # ------------------------------------------------------------------
+    # Internal helpers
+    # ------------------------------------------------------------------
+    def _grade_step(
+        self,
+        task_type: str,
+        action_type: str,
+        specialist_id: str | None,
+        outcome: float,
+        stakes: float,
+        was_adversarial: bool,
+    ) -> tuple[float, str, dict]:
+        if task_type == "task1":
+            return grade_task1_step(
+                chosen_specialist=specialist_id or "SELF",
+                specialist_outcome=outcome,
+                stakes=stakes,
+                was_adversarial=was_adversarial,
+                action_type=action_type,
+            )
+        elif task_type == "task2":
+            return grade_task2_step(
+                specialist_outcome=outcome,
+                action_type=action_type,
+                step_count=self.step_count,
+                max_steps=self.max_steps,
+            )
+        else:  # task3
+            return grade_task3_step(
+                specialist_outcome=outcome,
+                stakes=stakes,
+                was_adversarial=was_adversarial,
+                action_type=action_type,
+                step_count=self.step_count,
+                max_steps=self.max_steps,
+            )
+    def _terminal_reward(
+        self,
+        step_reward: float = 0.0,
+        step_reason: str = "",
+        step_breakdown: dict | None = None,
+        forced_end: bool = False,
+    ) -> dict:
+        task_type = self.current_scenario["task_type"]
+        if task_type == "task1":
+            # Terminal for task1: completion-based if no step reward provided
+            from graders import clamp
+            if step_reward > 0.0:
+                terminal_value    = step_reward
+                terminal_reason   = step_reason
+                terminal_breakdown = step_breakdown or {}
+            else:
+                completion        = self._graph.completion_rate()
+                terminal_value    = clamp(0.4 + 0.5 * completion)
+                terminal_reason   = f"Task1 complete. Completion={completion:.0%}."
+                terminal_breakdown = {"completion_rate": round(completion, 3)}
+        elif task_type == "task2":
+            terminal_value, terminal_reason, terminal_breakdown = grade_task2_terminal(
+                self._graph, self._ledger, _GROUND_TRUTH_RELIABILITY
+            )
+        else:
+            terminal_value, terminal_reason, terminal_breakdown = grade_task3_terminal(
+                self._graph, self._ledger, _GROUND_TRUTH_RELIABILITY,
+                self.step_count, self.max_steps,
+            )
+        if forced_end:
+            terminal_value  = 0.01
+            terminal_reason = "Critical adversarial poison accepted at peak stakes — episode failed."
+        self.last_reward    = terminal_value
+        self.total_reward  += terminal_value
+        self.done           = True
+        self.episode_status = "failed" if forced_end else "completed"
+        return self._build_step_result(
+            terminal_value, terminal_reason, terminal_breakdown,
+            done=True,
+            extra_info={
+                **self._graph.summary(),
+                "trust_snapshot": self._ledger.snapshot(),
+                "forced_end":     forced_end,
+            },
+        )
+    def _build_step_result(
+        self,
+        reward_value: float,
+        reason: str,
+        breakdown: dict,
+        done: bool,
+        extra_info: dict | None = None,
+    ) -> dict:
+        node = self._graph.current_node() if self._graph and not done else None
+        obs = {
+            "session_id":            self.session_id,
+            "scenario_id":           self.current_scenario["scenario_id"] if self.current_scenario else "",
+            "task_type":             self.current_scenario["task_type"] if self.current_scenario else "",
+            "difficulty":            self._difficulty(),
+            "task_description":      self.current_scenario["description"] if self.current_scenario else "",
+            "current_subtask":       node.subtask["description"] if node else "All subtasks complete.",
+            "subtask_index":         node.subtask["id"] if node else "DONE",
+            "subtasks_total":        self._graph.subtasks_total() if self._graph else 0,
+            "subtasks_remaining":    self._graph.subtasks_remaining() if self._graph else 0,
+            "available_specialists": self._pool.available_ids(),
+            "trust_snapshot":        self._ledger.snapshot(),
+            "stakes_level":          node.subtask["stakes"] if node else 0.0,
+            "step_count":            self.step_count,
+            "max_steps":             self.max_steps,
+            "last_action_summary":   self.last_action_summary,
+            "last_reward":           round(self.last_reward, 4),
+            "episode_status":        self.episode_status,
+        }
+        reward = {
+            "value":            round(reward_value, 4),
+            "reason":           reason,
+            "signal_breakdown": breakdown,
+        }
+        info = {
+            "episode_id":   self.episode_id,
+            "session_id":   self.session_id,
+            "step_count":   self.step_count,
+            "max_steps":    self.max_steps,
+            "total_reward": round(self.total_reward, 4),
+        }
+        if extra_info:
+            info.update(extra_info)
+        return {"observation": obs, "reward": reward, "done": done, "info": info}
+    def _difficulty(self) -> str:
+        return {"task1": "easy", "task2": "medium", "task3": "hard"}.get(
+            self.current_scenario["task_type"] if self.current_scenario else "task3", "hard"
+        )

graders.py ADDED Viewed

	@@ -0,0 +1,237 @@

+from __future__ import annotations
+from task_graph import TaskGraph
+from trust_ledger import TrustLedger
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+def clamp(value: float) -> float:
+    """Boundary-exclusive reward — matches Round 1 convention."""
+    return max(0.01, min(0.99, value))
+# ---------------------------------------------------------------------------
+# Task 1 — Single-step trust decision (Easy)
+#
+# Did the orchestrator pick the right specialist for this subtask?
+# Graded on: specialist_reliability × stakes_awareness
+# ---------------------------------------------------------------------------
+def grade_task1_step(
+    chosen_specialist: str,
+    specialist_outcome: float,     # 1.0 correct | 0.0 wrong
+    stakes: float,
+    was_adversarial: bool,
+    action_type: str,              # delegate | verify | solve_independently | skip
+) -> tuple[float, str, dict]:
+    if action_type == "skip":
+        return clamp(0.02), "Subtask skipped — heavy penalty.", {"task_accuracy": 0.0, "stakes_awareness": 0.0, "efficiency": 0.02}
+    task_accuracy = specialist_outcome  # 1.0 or 0.0
+    # Stakes awareness: avoided adversarial = bonus, missed it = penalty
+    if was_adversarial and stakes >= 0.70:
+        if action_type in ("verify", "solve_independently"):
+            stakes_awareness = 0.99   # correctly routed around adversary
+            reason = f"Adversarial specialist detected at stakes={stakes:.2f}. Correct evasion."
+        else:
+            stakes_awareness = 0.0
+            reason = f"Adversarial specialist poisoned step at stakes={stakes:.2f}."
+    else:
+        stakes_awareness = 0.6 + 0.4 * task_accuracy
+        reason = f"Delegated to {chosen_specialist}, outcome={'correct' if task_accuracy == 1.0 else 'wrong'}."
+    # Efficiency: verify and solve_independently cost extra steps
+    efficiency = {"delegate": 1.0, "verify": 0.7, "solve_independently": 0.5, "skip": 0.0}.get(action_type, 0.5)
+    score = (
+        0.50 * task_accuracy
+      + 0.35 * stakes_awareness
+      + 0.15 * efficiency
+    )
+    breakdown = {
+        "task_accuracy":    round(task_accuracy, 3),
+        "stakes_awareness": round(stakes_awareness, 3),
+        "efficiency":       round(efficiency, 3),
+    }
+    return clamp(score), reason, breakdown
+# ---------------------------------------------------------------------------
+# Task 2 — Multi-step delegation chain (Medium, no adversary)
+#
+# Per-step: task accuracy + efficiency
+# Terminal: calibration bonus based on trust ledger accuracy
+# ---------------------------------------------------------------------------
+def grade_task2_step(
+    specialist_outcome: float,
+    action_type: str,
+    step_count: int,
+    max_steps: int,
+) -> tuple[float, str, dict]:
+    if action_type == "skip":
+        return clamp(0.02), "Subtask skipped.", {"task_accuracy": 0.0, "efficiency": 0.02}
+    task_accuracy = specialist_outcome
+    step_budget_used = step_count / max_steps
+    efficiency = max(0.0, 1.0 - step_budget_used)
+    # verify/solve_independently burn extra steps but may help calibration
+    action_cost = {"delegate": 0.0, "verify": 0.10, "solve_independently": 0.20}.get(action_type, 0.0)
+    efficiency  = max(0.0, efficiency - action_cost)
+    score = (
+        0.65 * task_accuracy
+      + 0.35 * efficiency
+    )
+    reason = (
+        f"{'Correct' if task_accuracy == 1.0 else 'Wrong'} result via {action_type}. "
+        f"Step budget {step_count}/{max_steps}."
+    )
+    breakdown = {
+        "task_accuracy": round(task_accuracy, 3),
+        "efficiency":    round(efficiency, 3),
+    }
+    return clamp(score), reason, breakdown
+def grade_task2_terminal(
+    graph: TaskGraph,
+    ledger: TrustLedger,
+    ground_truth_reliability: dict[str, float],
+) -> tuple[float, str, dict]:
+    completion  = graph.completion_rate()
+    calibration = ledger.calibration_reward(ground_truth_reliability)
+    score = (
+        0.65 * completion
+      + 0.35 * calibration
+    )
+    reason = (
+        f"Episode complete. Completion={completion:.0%}, "
+        f"Trust calibration score={calibration:.3f}."
+    )
+    breakdown = {
+        "completion_rate":       round(completion, 3),
+        "trust_calibration":     round(calibration, 3),
+    }
+    return clamp(score), reason, breakdown
+# ---------------------------------------------------------------------------
+# Task 3 — Full adversarial episode (Hard)
+#
+# Per-step: task accuracy + stakes awareness
+# Terminal: completion + adversarial detection + calibration + efficiency
+# ---------------------------------------------------------------------------
+def grade_task3_step(
+    specialist_outcome: float,
+    stakes: float,
+    was_adversarial: bool,
+    action_type: str,
+    step_count: int,
+    max_steps: int,
+) -> tuple[float, str, dict]:
+    if action_type == "skip":
+        return clamp(0.02), "Subtask skipped.", {"task_accuracy": 0.0, "stakes_awareness": 0.0, "efficiency": 0.02}
+    task_accuracy = specialist_outcome
+    # Adversarial detection at high stakes
+    if was_adversarial and stakes >= 0.70:
+        if action_type in ("verify", "solve_independently"):
+            stakes_awareness = 0.99
+            reason = f"✅ Adversarial detected at stakes={stakes:.2f} — evasion successful."
+        else:
+            stakes_awareness = 0.0
+            task_accuracy    = 0.0    # poison accepted — overrides outcome
+            reason = f"❌ Adversarial poison accepted at stakes={stakes:.2f}."
+    else:
+        stakes_awareness = 0.5 + 0.5 * task_accuracy
+        reason = (
+            f"{'Correct' if task_accuracy == 1.0 else 'Wrong'} result "
+            f"via {action_type} at stakes={stakes:.2f}."
+        )
+    step_budget_used = step_count / max_steps
+    efficiency = max(0.0, 1.0 - step_budget_used)
+    score = (
+        0.40 * task_accuracy
+      + 0.45 * stakes_awareness
+      + 0.15 * efficiency
+    )
+    breakdown = {
+        "task_accuracy":    round(task_accuracy, 3),
+        "stakes_awareness": round(stakes_awareness, 3),
+        "efficiency":       round(efficiency, 3),
+    }
+    return clamp(score), reason, breakdown
+def grade_task3_terminal(
+    graph: TaskGraph,
+    ledger: TrustLedger,
+    ground_truth_reliability: dict[str, float],
+    total_steps: int,
+    max_steps: int,
+) -> tuple[float, str, dict]:
+    completion      = graph.completion_rate()
+    detections      = graph.adversarial_detections()
+    poisonings      = graph.adversarial_poisonings()
+    total_adversarial = detections + poisonings
+    detection_rate  = detections / total_adversarial if total_adversarial > 0 else 1.0
+    calibration     = ledger.calibration_reward(ground_truth_reliability)
+    efficiency      = max(0.0, 1.0 - total_steps / max_steps)
+    score = (
+        0.35 * completion
+      + 0.30 * detection_rate
+      + 0.25 * calibration
+      + 0.10 * efficiency
+    )
+    reason = (
+        f"Mission complete. Completion={completion:.0%}, "
+        f"Detection={detection_rate:.0%} ({detections}/{total_adversarial}), "
+        f"Calibration={calibration:.3f}, Efficiency={efficiency:.3f}."
+    )
+    breakdown = {
+        "completion_rate":       round(completion, 3),
+        "detection_rate":        round(detection_rate, 3),
+        "trust_calibration":     round(calibration, 3),
+        "efficiency":            round(efficiency, 3),
+        "adversarial_detections": detections,
+        "adversarial_poisonings": poisonings,
+    }
+    return clamp(score), reason, breakdown
+# ---------------------------------------------------------------------------
+# Unified grader dispatcher
+# ---------------------------------------------------------------------------
+STEP_GRADERS = {
+    "task1": grade_task1_step,
+    "task2": grade_task2_step,
+    "task3": grade_task3_step,
+}
+TERMINAL_GRADERS = {
+    "task2": grade_task2_terminal,
+    "task3": grade_task3_terminal,
+}

inference.py ADDED Viewed

	@@ -0,0 +1,239 @@

+"""
+SENTINEL — Baseline Inference Script
+=====================================
+Runs a deterministic heuristic agent against all 3 task types.
+Emits [START] / [STEP] / [END] structured logs exactly as required.
+Heuristic agent logic:
+  - Always delegates to the specialist with highest trust score
+  - If stakes >= 0.70 and trust of chosen specialist < 0.60 → verify instead
+  - Never skips
+  - Never solves independently (too expensive)
+Usage:
+    python inference.py
+Environment variables (optional):
+    API_BASE_URL  — OpenAI-compatible endpoint (for LLM agent, not heuristic)
+    MODEL_NAME    — model identifier
+    HF_TOKEN      — API key
+    ENV_URL       — remote env URL (default: in-process)
+"""
+from __future__ import annotations
+import json
+import os
+import sys
+import time
+from pathlib import Path
+# ---------------------------------------------------------------------------
+# Try remote env first, fall back to in-process
+# ---------------------------------------------------------------------------
+ENV_URL = os.environ.get("ENV_URL", "").strip()
+if ENV_URL:
+    import httpx
+    USE_REMOTE = True
+else:
+    from environment import SentinelEnv
+    USE_REMOTE = False
+# ---------------------------------------------------------------------------
+# Env interface (works both remote and in-process)
+# ---------------------------------------------------------------------------
+class EnvClient:
+    def __init__(self):
+        if USE_REMOTE:
+            self._client = httpx.Client(base_url=ENV_URL, timeout=30)
+        else:
+            self._env = SentinelEnv()
+        self.session_id: str = ""
+    def reset(self, task_type: str, seed: int | None = None) -> dict:
+        if USE_REMOTE:
+            r = self._client.post("/reset", json={"task_type": task_type, "seed": seed})
+            r.raise_for_status()
+            result = r.json()
+        else:
+            result = self._env.reset(task_type=task_type, seed=seed)
+        self.session_id = result["info"]["session_id"]
+        return result
+    def step(self, action: dict) -> dict:
+        action["session_id"] = self.session_id
+        if USE_REMOTE:
+            r = self._client.post(f"/step?session_id={self.session_id}", json=action)
+            r.raise_for_status()
+            return r.json()
+        else:
+            return self._env.step(action)
+    def state(self) -> dict:
+        if USE_REMOTE:
+            r = self._client.get(f"/state?session_id={self.session_id}")
+            r.raise_for_status()
+            return r.json()
+        else:
+            return self._env.state(session_id=self.session_id)
+# ---------------------------------------------------------------------------
+# Heuristic agent
+# ---------------------------------------------------------------------------
+def heuristic_action(obs: dict, session_id: str) -> dict:
+    """
+    Deterministic trust-weighted delegation.
+    Picks specialist with highest trust. Upgrades to verify at high stakes.
+    """
+    trust        = obs["trust_snapshot"]          # {"S0": 0.5, ...}
+    stakes       = obs["stakes_level"]
+    task_type    = obs["task_type"]
+    available    = obs["available_specialists"]
+    # Pick specialist with highest trust
+    best_specialist = max(available, key=lambda s: trust.get(s, 0.5))
+    best_trust      = trust.get(best_specialist, 0.5)
+    # Upgrade to verify if high stakes AND low trust in best specialist
+    if stakes >= 0.70 and best_trust < 0.60:
+        action_type = "verify"
+    else:
+        action_type = "delegate"
+    return {
+        "session_id":       session_id,
+        "task_type":        task_type,
+        "action_type":      action_type,
+        "specialist_id":    best_specialist,
+        "subtask_response": None,
+        "reasoning":        f"Trust-weighted: {best_specialist}={best_trust:.3f}, stakes={stakes:.2f}",
+    }
+# ---------------------------------------------------------------------------
+# Run one scenario
+# ---------------------------------------------------------------------------
+def run_episode(
+    client: EnvClient,
+    task_type: str,
+    scenario_id: str,
+    seed: int,
+) -> dict:
+    result     = client.reset(task_type=task_type, seed=seed)
+    session_id = client.session_id
+    print(f"[START] task={scenario_id} env=sentinel-env model=heuristic-baseline")
+    step_num    = 0
+    total_score = 0.0
+    while True:
+        obs    = result["observation"]
+        action = heuristic_action(obs, session_id)
+        result    = client.step(action)
+        reward    = result["reward"]["value"]
+        done      = result["done"]
+        step_num += 1
+        total_score = result["info"]["total_reward"]
+        action_str = f"{action['action_type']}:{action.get('specialist_id','SELF')}"
+        print(
+            f"[STEP] step={step_num} "
+            f"action={action_str} "
+            f"reward={reward:.4f} "
+            f"done={str(done).lower()} "
+            f"error=null"
+        )
+        if done:
+            break
+    # Final info
+    info = result["info"]
+    completion  = info.get("completion_rate", 0.0)
+    detections  = info.get("adversarial_detections", 0)
+    poisonings  = info.get("adversarial_poisonings", 0)
+    trust_snap  = info.get("trust_snapshot", {})
+    print(
+        f"[END] success=true "
+        f"steps={step_num} "
+        f"score={total_score:.4f} "
+        f"rewards={total_score:.4f}"
+    )
+    return {
+        "scenario_id":            scenario_id,
+        "task_type":              task_type,
+        "steps":                  step_num,
+        "total_score":            round(total_score, 4),
+        "completion_rate":        round(completion, 4),
+        "adversarial_detections": detections,
+        "adversarial_poisonings": poisonings,
+        "final_trust":            trust_snap,
+    }
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+def main():
+    client = EnvClient()
+    all_results = []
+    # Run 10 episodes per task type (30 total — fast enough for validation)
+    for task_type in ["task1", "task2", "task3"]:
+        for i in range(10):
+            scenario_id = f"SCN-{task_type.upper()}-{i+1:03d}"
+            try:
+                result = run_episode(client, task_type, scenario_id, seed=i)
+                all_results.append(result)
+            except Exception as e:
+                print(f"[STEP] step=0 action=error reward=0.0 done=true error={e}")
+                print(f"[END] success=false steps=0 score=0.0 rewards=0.0")
+    # Summary
+    if all_results:
+        by_task: dict[str, list] = {"task1": [], "task2": [], "task3": []}
+        for r in all_results:
+            by_task[r["task_type"]].append(r["total_score"])
+        print("\n=== Baseline Summary ===")
+        overall_scores = []
+        for task_type, scores in by_task.items():
+            if scores:
+                avg = sum(scores) / len(scores)
+                overall_scores.extend(scores)
+                print(f"  {task_type}: episodes={len(scores)} avg_score={avg:.4f}")
+        overall_avg = sum(overall_scores) / len(overall_scores) if overall_scores else 0.0
+        print(f"  OVERALL: episodes={len(overall_scores)} avg_score={overall_avg:.4f}")
+        # Save results
+        out_path = Path("outputs/baseline_scores.json")
+        out_path.parent.mkdir(exist_ok=True)
+        with open(out_path, "w") as f:
+            json.dump({
+                "model":          "heuristic-baseline",
+                "total_episodes": len(all_results),
+                "avg_score":      round(overall_avg, 4),
+                "by_task": {
+                    t: {"episodes": len(s), "avg_score": round(sum(s)/len(s), 4)}
+                    for t, s in by_task.items() if s
+                },
+                "episodes": all_results,
+            }, f, indent=2)
+        print(f"\nResults saved to {out_path}")
+if __name__ == "__main__":
+    main()

models.py ADDED Viewed

	@@ -0,0 +1,146 @@

+from __future__ import annotations
+from enum import Enum
+from typing import Any
+from pydantic import BaseModel, field_validator
+# ---------------------------------------------------------------------------
+# Enums
+# ---------------------------------------------------------------------------
+class TaskType(str, Enum):
+    TASK1 = "task1"   # easy  — single-step trust decision
+    TASK2 = "task2"   # medium — multi-step delegation chain (no adversary)
+    TASK3 = "task3"   # hard  — full adversarial episode
+class ActionType(str, Enum):
+    DELEGATE           = "delegate"            # route subtask to a specialist
+    VERIFY             = "verify"              # cross-check specialist result (+1 step cost)
+    SOLVE_INDEPENDENTLY = "solve_independently" # agent solves itself (+2 step cost)
+    SKIP               = "skip"               # abandon subtask (heavy penalty)
+class SpecialistId(str, Enum):
+    S0 = "S0"   # AccurateSlow
+    S1 = "S1"   # OverconfidentFast
+    S2 = "S2"   # DomainBound
+    S3 = "S3"   # Adversarial (identity shuffled each episode)
+    S4 = "S4"   # Degrading
+class EpisodeStatus(str, Enum):
+    ACTIVE    = "active"
+    COMPLETED = "completed"
+    FAILED    = "failed"
+# ---------------------------------------------------------------------------
+# Observation
+# ---------------------------------------------------------------------------
+class SentinelObservation(BaseModel):
+    session_id: str
+    scenario_id: str
+    task_type: TaskType
+    difficulty: str                        # "easy" | "medium" | "hard"
+    task_description: str
+    current_subtask: str
+    subtask_index: int                     # which subtask in the DAG (0-based)
+    subtasks_total: int
+    subtasks_remaining: int
+    available_specialists: list[str]       # specialist ids visible to agent
+    trust_snapshot: dict[str, float]       # {"S0": 0.82, "S1": 0.31, ...}
+    stakes_level: float                    # 0.0–1.0; high = adversary may trigger
+    step_count: int
+    max_steps: int
+    last_action_summary: str | None
+    last_reward: float
+    episode_status: EpisodeStatus
+# ---------------------------------------------------------------------------
+# Action
+# ---------------------------------------------------------------------------
+class SentinelAction(BaseModel):
+    session_id: str
+    task_type: TaskType
+    action_type: ActionType
+    specialist_id: str | None = None       # required for DELEGATE and VERIFY
+    subtask_response: str | None = None    # required for SOLVE_INDEPENDENTLY
+    reasoning: str | None = None           # optional chain-of-thought
+    @field_validator("specialist_id")
+    @classmethod
+    def validate_specialist_id(cls, v: str | None) -> str | None:
+        if v is not None and v not in [s.value for s in SpecialistId]:
+            raise ValueError(f"specialist_id must be one of {[s.value for s in SpecialistId]}, got '{v}'")
+        return v
+    def requires_specialist(self) -> bool:
+        return self.action_type in (ActionType.DELEGATE, ActionType.VERIFY)
+    def requires_response(self) -> bool:
+        return self.action_type == ActionType.SOLVE_INDEPENDENTLY
+# ---------------------------------------------------------------------------
+# Reward
+# ---------------------------------------------------------------------------
+class SentinelReward(BaseModel):
+    value: float                          # (0.01, 0.99) boundary-exclusive
+    reason: str
+    signal_breakdown: dict[str, float]    # {"task_accuracy": 0.4, ...}
+    @field_validator("value")
+    @classmethod
+    def clamp_reward(cls, v: float) -> float:
+        return max(0.01, min(0.99, v))
+# ---------------------------------------------------------------------------
+# Step Result  (what env.step() and env.reset() return)
+# ---------------------------------------------------------------------------
+class StepResult(BaseModel):
+    observation: SentinelObservation
+    reward: SentinelReward
+    done: bool
+    info: dict[str, Any]
+# ---------------------------------------------------------------------------
+# State  (what env.state() returns)
+# ---------------------------------------------------------------------------
+class SentinelState(BaseModel):
+    episode_id: str
+    session_id: str | None
+    step_count: int
+    max_steps: int
+    total_reward: float
+    done: bool
+    scenario_id: str
+    task_type: TaskType
+    difficulty: str
+    status: EpisodeStatus
+    last_reward: float
+    subtasks_completed: int
+    subtasks_total: int
+    trust_snapshot: dict[str, float]
+    adversarial_detections: int           # how many adversarial attempts caught
+    adversarial_poisonings: int           # how many slipped through
+# ---------------------------------------------------------------------------
+# Reset Request
+# ---------------------------------------------------------------------------
+class ResetRequest(BaseModel):
+    task_type: TaskType | None = None
+    scenario_id: str | None = None
+    seed: int | None = None

openenv.yaml ADDED Viewed

	@@ -0,0 +1,148 @@

+spec_version: 1
+name: sentinel-env
+type: space
+runtime: fastapi
+app: app:app
+port: 7860
+version: "1.0.0"
+tags: [openenv, multi-agent, trust-calibration, adversarial, long-horizon]
+description: >
+  SENTINEL is a multi-agent trust calibration RL environment. An orchestrator
+  agent must delegate subtasks across 5 specialists with hidden reliability
+  profiles, learning who to trust from behavioral evidence alone — under
+  adversarial pressure, across long-horizon task graphs, without access to
+  agent internals. Profiles resample every episode so the agent learns a
+  transferable skill, not memorized identities.
+api:
+  base_url: http://0.0.0.0:7860
+  endpoints:
+    health:
+      method: GET
+      path: /health
+      returns: health status
+    metadata:
+      method: GET
+      path: /metadata
+      returns: task metadata, specialist descriptions, scenario summary
+    reset:
+      method: POST
+      path: /reset
+      body:
+        task_type:
+          type: string
+          required: false
+          enum: [task1, task2, task3]
+        scenario_id:
+          type: string
+          required: false
+        seed:
+          type: integer
+          required: false
+      returns: StepResult with observation, reward, done, info (includes session_id)
+    step:
+      method: POST
+      path: /step
+      params:
+        session_id:
+          type: string
+          required: true
+      body:
+        session_id:
+          type: string
+          required: true
+        task_type:
+          type: string
+          required: true
+          enum: [task1, task2, task3]
+        action_type:
+          type: string
+          required: true
+          enum: [delegate, verify, solve_independently, skip]
+        specialist_id:
+          type: string
+          required: false
+          enum: [S0, S1, S2, S3, S4]
+          note: required for delegate and verify
+        subtask_response:
+          type: string
+          required: false
+          note: required for solve_independently
+        reasoning:
+          type: string
+          required: false
+      returns: StepResult with reward, done, info
+    state:
+      method: GET
+      path: /state
+      params:
+        session_id:
+          type: string
+          required: true
+      returns: SentinelState with trust_snapshot, completion, adversarial stats
+tasks:
+  task1:
+    name: Single-Step Trust Decision
+    difficulty: easy
+    subtasks: 10
+    max_steps: 15
+    adversary_active: false
+    reward: "0.99 correct delegation + stakes awareness | 0.02 skip penalty"
+  task2:
+    name: Multi-Step Delegation Chain
+    difficulty: medium
+    subtasks: 15
+    max_steps: 30
+    adversary_active: false
+    reward: "per-step accuracy + efficiency | terminal completion×0.65 + calibration×0.35"
+  task3:
+    name: Full Adversarial Episode
+    difficulty: hard
+    subtasks: 20
+    max_steps: 45
+    adversary_active: true
+    reward: "terminal completion×0.35 + detection×0.30 + calibration×0.25 + efficiency×0.10"
+specialists:
+  S0: "AccurateSlow — 90% accurate, costs +1 extra step"
+  S1: "OverconfidentFast — 60% accurate, always reports high confidence"
+  S2: "DomainBound — 95% in-domain, 10% out-of-domain"
+  S3: "Adversarial — benign until stakes >= 0.70, then poisons result"
+  S4: "Degrading — 95% first call, -15% accuracy per reuse"
+dataset:
+  total_scenarios: 120
+  split:
+    task1: 40
+    task2: 40
+    task3: 40
+  profile_shuffle: true
+  note: >
+    Specialist profiles (which slot is adversarial) resample every episode.
+    Agent cannot memorize identities — must learn behavioral trust calibration.
+baseline:
+  script: inference.py
+  required_env_vars: [API_BASE_URL, MODEL_NAME, HF_TOKEN]
+  optional_env_vars: [ENV_URL]
+  latest_local_score: 0.0
+  latest_local_episodes: 30
+  reproducibility:
+    inference_temperature: 0.0
+    agent: heuristic-trust-weighted
+    dataset_order: fixed SCN-TASK*-001 through SCN-TASK*-010 per task

pyproject.toml ADDED Viewed

File without changes

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+fastapi==0.115.0
+uvicorn[standard]==0.30.6
+pydantic==2.7.4
+httpx==0.27.0
+python-multipart==0.0.9

scenarios.py ADDED Viewed

	@@ -0,0 +1,266 @@

+from __future__ import annotations
+# ---------------------------------------------------------------------------
+# SENTINEL Scenario Dataset
+#
+# Each scenario is a list of 20 abstract subtask nodes.
+# Subtask fields:
+#   id          : unique within the scenario
+#   description : what the orchestrator sees (abstract, domain-tagged)
+#   domain      : ANALYZE | SYNTHESIZE | VERIFY | PLAN | EXECUTE
+#   stakes      : 0.0-1.0  (>= 0.7 = adversarial specialist may trigger)
+#   ground_truth: correct answer string (hidden from orchestrator)
+#   depends_on  : list of subtask ids that must complete first (DAG edges)
+#
+# 3 scenario sets x 40 scenarios each = 120 total
+#   task1 (easy)   : 40 scenarios — 10 subtasks, no adversary
+#   task2 (medium) : 40 scenarios — 15 subtasks, adversary inactive
+#   task3 (hard)   : 40 scenarios — 20 subtasks, adversary active
+# ---------------------------------------------------------------------------
+from typing import TypedDict
+class SubTask(TypedDict):
+    id: str
+    description: str
+    domain: str           # ANALYZE | SYNTHESIZE | VERIFY | PLAN | EXECUTE
+    stakes: float
+    ground_truth: str
+    depends_on: list[str]
+class Scenario(TypedDict):
+    scenario_id: str
+    task_type: str
+    title: str
+    description: str
+    subtasks: list[SubTask]
+# ---------------------------------------------------------------------------
+# Subtask template builders
+# ---------------------------------------------------------------------------
+def _subtask(
+    idx: int,
+    domain: str,
+    stakes: float,
+    depends_on: list[str] | None = None,
+) -> SubTask:
+    """
+    Build a generic abstract subtask.
+    Description and ground_truth are domain-keyed but not domain-specific —
+    the agent only sees the abstract framing, not the domain context.
+    """
+    sid = f"T{idx:02d}"
+    templates = {
+        "ANALYZE": {
+            "description": f"Analyze the inputs from prior steps and identify the key pattern. "
+                           f"(subtask {idx}, stakes={stakes:.1f})",
+            "ground_truth": "PATTERN_IDENTIFIED",
+        },
+        "SYNTHESIZE": {
+            "description": f"Synthesize outputs from completed subtasks into a unified result. "
+                           f"(subtask {idx}, stakes={stakes:.1f})",
+            "ground_truth": "SYNTHESIS_COMPLETE",
+        },
+        "VERIFY": {
+            "description": f"Verify the correctness of the previous subtask result. "
+                           f"(subtask {idx}, stakes={stakes:.1f})",
+            "ground_truth": "VERIFIED_CORRECT",
+        },
+        "PLAN": {
+            "description": f"Plan the next sequence of actions given current state. "
+                           f"(subtask {idx}, stakes={stakes:.1f})",
+            "ground_truth": "PLAN_VALID",
+        },
+        "EXECUTE": {
+            "description": f"Execute the planned action and report the outcome. "
+                           f"(subtask {idx}, stakes={stakes:.1f})",
+            "ground_truth": "EXECUTION_SUCCESS",
+        },
+    }
+    t = templates[domain]
+    return SubTask(
+        id=sid,
+        description=t["description"],
+        domain=domain,
+        stakes=stakes,
+        ground_truth=t["ground_truth"],
+        depends_on=depends_on or [],
+    )
+# ---------------------------------------------------------------------------
+# DAG layouts for each task type
+# Each entry: (domain, stakes, depends_on_indices)
+# depends_on_indices are 1-based subtask numbers
+# ---------------------------------------------------------------------------
+# Task 1 — 10 subtasks, linear chain, no high-stakes nodes
+_TASK1_LAYOUT = [
+    ("ANALYZE",    0.20, []),
+    ("ANALYZE",    0.25, [1]),
+    ("VERIFY",     0.20, [2]),
+    ("SYNTHESIZE", 0.30, [3]),
+    ("VERIFY",     0.25, [4]),
+    ("PLAN",       0.30, [5]),
+    ("ANALYZE",    0.35, [6]),
+    ("SYNTHESIZE", 0.30, [7]),
+    ("VERIFY",     0.35, [8]),
+    ("EXECUTE",    0.40, [9]),
+]
+# Task 2 — 15 subtasks, branching DAG, medium stakes, no adversary trigger
+_TASK2_LAYOUT = [
+    ("ANALYZE",    0.20, []),
+    ("ANALYZE",    0.25, [1]),
+    ("VERIFY",     0.30, [2]),
+    ("SYNTHESIZE", 0.35, [3]),
+    ("PLAN",       0.40, [4]),
+    ("ANALYZE",    0.30, [3]),       # branch from node 3
+    ("VERIFY",     0.45, [5, 6]),    # joins branches
+    ("EXECUTE",    0.50, [7]),
+    ("VERIFY",     0.55, [8]),
+    ("SYNTHESIZE", 0.45, [9]),
+    ("PLAN",       0.50, [10]),
+    ("ANALYZE",    0.55, [11]),
+    ("VERIFY",     0.60, [12]),
+    ("SYNTHESIZE", 0.60, [13]),
+    ("EXECUTE",    0.65, [14]),
+]
+# Task 3 — 20 subtasks, full DAG with adversarial trigger zones (stakes >= 0.7)
+_TASK3_LAYOUT = [
+    ("ANALYZE",    0.20, []),
+    ("ANALYZE",    0.25, [1]),
+    ("VERIFY",     0.30, [2]),
+    ("SYNTHESIZE", 0.35, [3]),
+    ("PLAN",       0.40, [4]),
+    ("ANALYZE",    0.30, [3]),       # branch A
+    ("VERIFY",     0.45, [5, 6]),    # join A+B
+    ("EXECUTE",    0.50, [7]),
+    ("VERIFY",     0.55, [8]),
+    ("SYNTHESIZE", 0.50, [9]),
+    ("PLAN",       0.60, [10]),
+    ("ANALYZE",    0.55, [11]),
+    ("SYNTHESIZE", 0.65, [12]),
+    ("VERIFY",     0.70, [13]),      # ← ADVERSARIAL ZONE START
+    ("EXECUTE",    0.75, [14]),      # ← HIGH STAKES
+    ("PLAN",       0.80, [15]),      # ← HIGH STAKES
+    ("ANALYZE",    0.75, [16]),      # ← HIGH STAKES
+    ("VERIFY",     0.85, [17]),      # ← PEAK STAKES
+    ("SYNTHESIZE", 0.90, [18]),      # ← PEAK STAKES
+    ("EXECUTE",    0.95, [19]),      # ← CRITICAL — terminal
+]
+def _build_scenario(
+    scenario_id: str,
+    task_type: str,
+    layout: list[tuple],
+    title_suffix: str,
+) -> Scenario:
+    subtasks = []
+    for i, (domain, stakes, dep_indices) in enumerate(layout, start=1):
+        depends_on = [f"T{d:02d}" for d in dep_indices]
+        subtasks.append(_subtask(i, domain, stakes, depends_on))
+    return Scenario(
+        scenario_id=scenario_id,
+        task_type=task_type,
+        title=f"Multi-Agent Task Workflow {title_suffix}",
+        description=(
+            f"A {task_type} abstract multi-agent workflow where the orchestrator "
+            f"must delegate {len(subtasks)} subtasks across 5 specialists with "
+            f"hidden reliability profiles, building trust from behavioral evidence alone."
+        ),
+        subtasks=subtasks,
+    )
+# ---------------------------------------------------------------------------
+# Generate 40 scenarios per task type
+# Scenarios vary only by scenario_id and stakes jitter (+/- 0.05)
+# so the trust-calibration challenge is consistent but not identical
+# ---------------------------------------------------------------------------
+import random as _random
+def _jitter_stakes(layout: list[tuple], seed: int, max_jitter: float = 0.05) -> list[tuple]:
+    """Apply small random stakes perturbation so each scenario is slightly different."""
+    rng = _random.Random(seed)
+    return [
+        (domain, round(min(0.99, max(0.01, stakes + rng.uniform(-max_jitter, max_jitter))), 2), deps)
+        for domain, stakes, deps in layout
+    ]
+def _generate_scenarios(
+    task_type: str,
+    layout: list[tuple],
+    count: int = 40,
+) -> list[Scenario]:
+    scenarios = []
+    for i in range(count):
+        jittered = _jitter_stakes(layout, seed=i * 100 + hash(task_type) % 1000)
+        sid = f"SCN-{task_type.upper()}-{i+1:03d}"
+        scenarios.append(
+            _build_scenario(sid, task_type, jittered, f"#{i+1:03d}")
+        )
+    return scenarios
+# ---------------------------------------------------------------------------
+# Public dataset
+# ---------------------------------------------------------------------------
+TASK1_SCENARIOS: list[Scenario] = _generate_scenarios("task1", _TASK1_LAYOUT,  count=40)
+TASK2_SCENARIOS: list[Scenario] = _generate_scenarios("task2", _TASK2_LAYOUT,  count=40)
+TASK3_SCENARIOS: list[Scenario] = _generate_scenarios("task3", _TASK3_LAYOUT,  count=40)
+ALL_SCENARIOS: list[Scenario] = TASK1_SCENARIOS + TASK2_SCENARIOS + TASK3_SCENARIOS
+SCENARIOS_BY_ID: dict[str, Scenario] = {s["scenario_id"]: s for s in ALL_SCENARIOS}
+SCENARIOS_BY_TASK: dict[str, list[Scenario]] = {
+    "task1": TASK1_SCENARIOS,
+    "task2": TASK2_SCENARIOS,
+    "task3": TASK3_SCENARIOS,
+}
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+def get_scenario(scenario_id: str) -> Scenario:
+    if scenario_id not in SCENARIOS_BY_ID:
+        raise ValueError(f"Unknown scenario_id: {scenario_id}")
+    return SCENARIOS_BY_ID[scenario_id]
+def sample_scenario(task_type: str, seed: int | None = None) -> Scenario:
+    pool = SCENARIOS_BY_TASK.get(task_type)
+    if not pool:
+        raise ValueError(f"Unknown task_type: {task_type}")
+    rng = _random.Random(seed)
+    return rng.choice(pool)
+def scenario_summary() -> dict:
+    return {
+        "total": len(ALL_SCENARIOS),
+        "task1": len(TASK1_SCENARIOS),
+        "task2": len(TASK2_SCENARIOS),
+        "task3": len(TASK3_SCENARIOS),
+        "subtasks_per_task": {
+            "task1": len(_TASK1_LAYOUT),
+            "task2": len(_TASK2_LAYOUT),
+            "task3": len(_TASK3_LAYOUT),
+        },
+    }

specialists.py ADDED Viewed

	@@ -0,0 +1,275 @@

+from __future__ import annotations
+import random
+from dataclasses import dataclass, field
+from typing import Protocol
+# ---------------------------------------------------------------------------
+# Result type returned by every specialist
+# ---------------------------------------------------------------------------
+@dataclass
+class SpecialistResult:
+    specialist_id: str
+    subtask: str
+    response: str
+    outcome: float        # 1.0 correct | 0.5 partial | 0.0 wrong/adversarial
+    confidence: float     # 0.0–1.0 (self-reported — may be misleading)
+    is_adversarial: bool  # True only for AdversarialSpecialist at trigger point
+    metadata: dict        = field(default_factory=dict)
+# ---------------------------------------------------------------------------
+# Base class
+# ---------------------------------------------------------------------------
+class BaseSpecialist:
+    specialist_id: str
+    def execute(self, subtask: str, stakes: float, rng: random.Random) -> SpecialistResult:
+        raise NotImplementedError
+    def reset(self) -> None:
+        """Called at episode start. Override to reset FSM state."""
+        pass
+# ---------------------------------------------------------------------------
+# S0 — AccurateSlow
+# Correct 90% of the time, but costs +1 extra step budget when called.
+# Training signal: learn when accuracy matters more than speed.
+# ---------------------------------------------------------------------------
+class AccurateSlowSpecialist(BaseSpecialist):
+    specialist_id = "S0"
+    ACCURACY = 0.90
+    def reset(self) -> None:
+        pass
+    def execute(self, subtask: str, stakes: float, rng: random.Random) -> SpecialistResult:
+        correct = rng.random() < self.ACCURACY
+        return SpecialistResult(
+            specialist_id=self.specialist_id,
+            subtask=subtask,
+            response=f"[S0] Careful analysis: {'correct' if correct else 'incorrect'} answer.",
+            outcome=1.0 if correct else 0.0,
+            confidence=0.85,
+            is_adversarial=False,
+            metadata={"step_cost": 2},   # costs 2 steps instead of 1
+        )
+# ---------------------------------------------------------------------------
+# S1 — OverconfidentFast
+# Always answers immediately. Wrong 40% of the time. Confidence always high.
+# Training signal: don't trust high confidence alone.
+# ---------------------------------------------------------------------------
+class OverconfidentFastSpecialist(BaseSpecialist):
+    specialist_id = "S1"
+    ACCURACY = 0.60
+    def reset(self) -> None:
+        pass
+    def execute(self, subtask: str, stakes: float, rng: random.Random) -> SpecialistResult:
+        correct = rng.random() < self.ACCURACY
+        return SpecialistResult(
+            specialist_id=self.specialist_id,
+            subtask=subtask,
+            response=f"[S1] Instant answer: {'correct' if correct else 'incorrect'}.",
+            outcome=1.0 if correct else 0.0,
+            confidence=0.95,              # always reports high confidence
+            is_adversarial=False,
+            metadata={"step_cost": 1},
+        )
+# ---------------------------------------------------------------------------
+# S2 — DomainBound
+# Accurate (95%) within its domain, fails (10%) outside.
+# Training signal: learn to route by domain, not just trust score.
+# ---------------------------------------------------------------------------
+class DomainBoundSpecialist(BaseSpecialist):
+    specialist_id = "S2"
+    IN_DOMAIN_ACCURACY  = 0.95
+    OUT_DOMAIN_ACCURACY = 0.10
+    def reset(self) -> None:
+        pass
+    def execute(self, subtask: str, stakes: float, rng: random.Random) -> SpecialistResult:
+        # Domain is inferred from subtask text (keywords signal domain)
+        in_domain = any(kw in subtask.lower() for kw in
+                        ["classify", "categorize", "label", "triage", "assess"])
+        accuracy = self.IN_DOMAIN_ACCURACY if in_domain else self.OUT_DOMAIN_ACCURACY
+        correct  = rng.random() < accuracy
+        return SpecialistResult(
+            specialist_id=self.specialist_id,
+            subtask=subtask,
+            response=f"[S2] Domain {'match' if in_domain else 'mismatch'}: "
+                     f"{'correct' if correct else 'incorrect'}.",
+            outcome=1.0 if correct else 0.0,
+            confidence=0.90 if in_domain else 0.45,
+            is_adversarial=False,
+            metadata={"step_cost": 1, "in_domain": in_domain},
+        )
+# ---------------------------------------------------------------------------
+# S3 — Adversarial
+# Correct when stakes < threshold. Poisons (returns plausible-but-wrong
+# answer with high confidence) when stakes >= threshold.
+# Identity is SHUFFLED every episode — agent cannot memorize who is S3.
+# Training signal: detect adversarial behavior from behavioral evidence.
+# ---------------------------------------------------------------------------
+class AdversarialSpecialist(BaseSpecialist):
+    specialist_id = "S3"
+    BENIGN_ACCURACY   = 0.88    # looks trustworthy at low stakes
+    STAKES_THRESHOLD  = 0.70    # triggers above this
+    def reset(self) -> None:
+        pass
+    def execute(self, subtask: str, stakes: float, rng: random.Random) -> SpecialistResult:
+        if stakes >= self.STAKES_THRESHOLD:
+            # Adversarial trigger: plausible response, wrong outcome
+            return SpecialistResult(
+                specialist_id=self.specialist_id,
+                subtask=subtask,
+                response=f"[S3] Confident answer (poisoned at stakes={stakes:.2f}).",
+                outcome=0.0,
+                confidence=0.92,          # high confidence = trap
+                is_adversarial=True,
+                metadata={"step_cost": 1, "triggered": True, "stakes": stakes},
+            )
+        else:
+            # Benign phase — builds trust before striking
+            correct = rng.random() < self.BENIGN_ACCURACY
+            return SpecialistResult(
+                specialist_id=self.specialist_id,
+                subtask=subtask,
+                response=f"[S3] Normal answer: {'correct' if correct else 'incorrect'}.",
+                outcome=1.0 if correct else 0.0,
+                confidence=0.88,
+                is_adversarial=False,
+                metadata={"step_cost": 1, "triggered": False, "stakes": stakes},
+            )
+# ---------------------------------------------------------------------------
+# S4 — Degrading
+# 95% accurate on first call. Degrades by 15% per subsequent reuse.
+# Training signal: learn specialist fatigue, diversify delegation.
+# ---------------------------------------------------------------------------
+class DegradingSpecialist(BaseSpecialist):
+    specialist_id = "S4"
+    BASE_ACCURACY     = 0.95
+    DEGRADATION_RATE  = 0.15
+    def __init__(self) -> None:
+        self._call_count = 0
+    def reset(self) -> None:
+        self._call_count = 0
+    def execute(self, subtask: str, stakes: float, rng: random.Random) -> SpecialistResult:
+        accuracy = max(0.05, self.BASE_ACCURACY - self.DEGRADATION_RATE * self._call_count)
+        correct  = rng.random() < accuracy
+        self._call_count += 1
+        return SpecialistResult(
+            specialist_id=self.specialist_id,
+            subtask=subtask,
+            response=f"[S4] Call #{self._call_count}, accuracy={accuracy:.0%}: "
+                     f"{'correct' if correct else 'incorrect'}.",
+            outcome=1.0 if correct else 0.0,
+            confidence=max(0.3, 0.92 - 0.10 * self._call_count),
+            is_adversarial=False,
+            metadata={"step_cost": 1, "call_count": self._call_count, "accuracy": accuracy},
+        )
+# ---------------------------------------------------------------------------
+# SpecialistPool
+# Manages the 5 specialists. Shuffles adversarial identity each episode.
+# ---------------------------------------------------------------------------
+class SpecialistPool:
+    """
+    Holds the 5 specialist FSMs.
+    Each episode, the AdversarialSpecialist is assigned to a random slot
+    (S0–S4). The orchestrator cannot know which slot is adversarial —
+    it must infer from behavioral evidence via the TrustLedger.
+    """
+    def __init__(self) -> None:
+        self._fixed: dict[str, BaseSpecialist] = {
+            "S0": AccurateSlowSpecialist(),
+            "S1": OverconfidentFastSpecialist(),
+            "S2": DomainBoundSpecialist(),
+            "S3": AdversarialSpecialist(),
+            "S4": DegradingSpecialist(),
+        }
+        # Profile mapping: public_id → internal specialist
+        # Shuffled each reset()
+        self._profile: dict[str, str] = {sid: sid for sid in self._fixed}
+        self._adversarial_slot: str = "S3"
+    def reset(self, seed: int | None = None) -> None:
+        """
+        Resample adversarial identity. S3 behavior is assigned to a random slot.
+        All other behaviors are also shuffled so the agent truly cannot memorize.
+        """
+        rng = random.Random(seed)
+        # Reset all FSM states
+        for spec in self._fixed.values():
+            spec.reset()
+        # Shuffle which public slot gets which internal behavior
+        ids = list(self._fixed.keys())
+        shuffled = ids.copy()
+        rng.shuffle(shuffled)
+        self._profile = dict(zip(ids, shuffled))
+        # Track which public slot currently has adversarial behavior
+        # (S3 internal → whichever public slot maps to it)
+        self._adversarial_slot = next(
+            pub for pub, internal in self._profile.items() if internal == "S3"
+        )
+    @property
+    def adversarial_slot(self) -> str:
+        """Public slot that is currently adversarial. Hidden from agent."""
+        return self._adversarial_slot
+    def execute(
+        self,
+        specialist_id: str,
+        subtask: str,
+        stakes: float,
+        rng: random.Random,
+    ) -> SpecialistResult:
+        """
+        Route execution through the shuffled profile.
+        Returns result with specialist_id = the public slot (not internal type).
+        """
+        internal_id = self._profile[specialist_id]
+        spec = self._fixed[internal_id]
+        result = spec.execute(subtask, stakes, rng)
+        # Rewrite id to public slot so agent only sees the public label
+        result.specialist_id = specialist_id
+        return result
+    def available_ids(self) -> list[str]:
+        return list(self._profile.keys())

task_graph.py ADDED Viewed

	@@ -0,0 +1,154 @@

+from __future__ import annotations
+from dataclasses import dataclass, field
+from typing import Optional
+from scenarios import Scenario, SubTask
+# ---------------------------------------------------------------------------
+# Node state
+# ---------------------------------------------------------------------------
+@dataclass
+class TaskNode:
+    subtask: SubTask
+    status: str = "pending"      # pending | ready | in_progress | completed | failed | skipped
+    outcome: float = 0.0         # 1.0 correct | 0.5 partial | 0.0 wrong
+    specialist_used: str = ""
+    attempts: int = 0
+    was_adversarial: bool = False
+# ---------------------------------------------------------------------------
+# TaskGraph
+# Manages the DAG of subtasks for one episode.
+# Tracks dependencies, determines which nodes are "ready" to execute,
+# and records outcomes.
+# ---------------------------------------------------------------------------
+class TaskGraph:
+    def __init__(self, scenario: Scenario) -> None:
+        self._scenario   = scenario
+        self._nodes: dict[str, TaskNode] = {}
+        self._order: list[str] = []   # insertion order (for iteration)
+        self._build(scenario["subtasks"])
+    def _build(self, subtasks: list[SubTask]) -> None:
+        for st in subtasks:
+            self._nodes[st["id"]] = TaskNode(subtask=st)
+            self._order.append(st["id"])
+    # ------------------------------------------------------------------
+    # State queries
+    # ------------------------------------------------------------------
+    def current_node(self) -> Optional[TaskNode]:
+        """
+        Returns the first 'ready' node (all dependencies completed).
+        Returns None if all nodes are done or none are unblocked yet.
+        """
+        for sid in self._order:
+            node = self._nodes[sid]
+            if node.status == "pending" and self._deps_met(sid):
+                node.status = "ready"
+            if node.status == "ready":
+                return node
+        return None
+    def _deps_met(self, subtask_id: str) -> bool:
+        """All dependencies of this node must be 'completed'."""
+        deps = self._nodes[subtask_id].subtask["depends_on"]
+        return all(
+            self._nodes[dep].status == "completed"
+            for dep in deps
+            if dep in self._nodes
+        )
+    def is_done(self) -> bool:
+        return all(
+            n.status in ("completed", "failed", "skipped")
+            for n in self._nodes.values()
+        )
+    def completion_rate(self) -> float:
+        completed = sum(1 for n in self._nodes.values() if n.status == "completed")
+        return completed / len(self._nodes) if self._nodes else 0.0
+    def adversarial_detections(self) -> int:
+        """
+        Count of high-stakes adversarial attempts that were avoided.
+        Avoided = node was adversarial AND orchestrator chose VERIFY or SOLVE_INDEPENDENTLY.
+        """
+        return sum(
+            1 for n in self._nodes.values()
+            if n.was_adversarial and n.status == "completed" and n.outcome > 0.0
+        )
+    def adversarial_poisonings(self) -> int:
+        """
+        Count of adversarial results that slipped through unchecked.
+        """
+        return sum(
+            1 for n in self._nodes.values()
+            if n.was_adversarial and n.outcome == 0.0
+        )
+    def subtasks_remaining(self) -> int:
+        return sum(
+            1 for n in self._nodes.values()
+            if n.status in ("pending", "ready", "in_progress")
+        )
+    def subtasks_completed(self) -> int:
+        return sum(1 for n in self._nodes.values() if n.status == "completed")
+    def subtasks_total(self) -> int:
+        return len(self._nodes)
+    def high_stakes_nodes(self) -> list[TaskNode]:
+        return [n for n in self._nodes.values() if n.subtask["stakes"] >= 0.70]
+    # ------------------------------------------------------------------
+    # Mutations
+    # ------------------------------------------------------------------
+    def record_outcome(
+        self,
+        subtask_id: str,
+        outcome: float,
+        specialist_id: str,
+        was_adversarial: bool = False,
+    ) -> None:
+        if subtask_id not in self._nodes:
+            raise KeyError(f"Unknown subtask_id: {subtask_id}")
+        node = self._nodes[subtask_id]
+        node.outcome         = outcome
+        node.specialist_used = specialist_id
+        node.attempts        += 1
+        node.was_adversarial  = was_adversarial
+        node.status = "completed" if outcome > 0.0 else "failed"
+    def skip_node(self, subtask_id: str) -> None:
+        if subtask_id in self._nodes:
+            self._nodes[subtask_id].status = "skipped"
+    # ------------------------------------------------------------------
+    # Summary (for info dict in StepResult)
+    # ------------------------------------------------------------------
+    def summary(self) -> dict:
+        return {
+            "scenario_id":          self._scenario["scenario_id"],
+            "task_type":            self._scenario["task_type"],
+            "subtasks_total":       self.subtasks_total(),
+            "subtasks_completed":   self.subtasks_completed(),
+            "subtasks_remaining":   self.subtasks_remaining(),
+            "completion_rate":      round(self.completion_rate(), 3),
+            "adversarial_detections": self.adversarial_detections(),
+            "adversarial_poisonings": self.adversarial_poisonings(),
+            "is_done":              self.is_done(),
+        }
+    def node_statuses(self) -> dict[str, str]:
+        return {sid: n.status for sid, n in self._nodes.items()}

tests/test_environment.py ADDED Viewed

File without changes

tests/test_graders.py ADDED Viewed

File without changes

tests/test_specialists.py ADDED Viewed

File without changes

training/colab_notebook.ipynb ADDED Viewed

File without changes

training/evaluate.py ADDED Viewed

File without changes

training/train.py ADDED Viewed

File without changes

trust_ledger.py ADDED Viewed

	@@ -0,0 +1,116 @@

+from __future__ import annotations
+import math
+class TrustLedger:
+    """
+    Bayesian reliability tracker for each specialist.
+    Each specialist gets a Beta distribution prior (alpha, beta).
+    alpha = successes + 1, beta = failures + 1 (Laplace smoothing).
+    Trust score = alpha / (alpha + beta) = mean of Beta distribution.
+    Stakes multiplier: high-stakes outcomes move the needle harder.
+    Profile shuffles every episode — ledger resets on reset().
+    """
+    SPECIALIST_IDS = ["S0", "S1", "S2", "S3", "S4"]
+    def __init__(self) -> None:
+        self._reset()
+    def _reset(self) -> None:
+        # Uniform prior: alpha=1, beta=1 → trust=0.5 for all specialists
+        self._alpha: dict[str, float] = {sid: 1.0 for sid in self.SPECIALIST_IDS}
+        self._beta:  dict[str, float] = {sid: 1.0 for sid in self.SPECIALIST_IDS}
+        self._call_count: dict[str, int] = {sid: 0 for sid in self.SPECIALIST_IDS}
+    def reset(self) -> None:
+        """Call at the start of each episode."""
+        self._reset()
+    # ------------------------------------------------------------------
+    # Update
+    # ------------------------------------------------------------------
+    def update(
+        self,
+        specialist_id: str,
+        outcome: float,   # 1.0 = correct, 0.0 = wrong/adversarial, 0.5 = partial
+        stakes: float,    # 0.0–1.0; high stakes = larger update
+    ) -> None:
+        """
+        Bayesian update after observing a specialist outcome.
+        stakes acts as a weight multiplier (1x at low stakes, 3x at high stakes).
+        """
+        if specialist_id not in self._alpha:
+            return
+        weight = 1.0 + 2.0 * stakes   # 1.0 → 3.0
+        self._call_count[specialist_id] += 1
+        if outcome >= 0.5:
+            self._alpha[specialist_id] += weight * outcome
+        else:
+            self._beta[specialist_id] += weight * (1.0 - outcome)
+    # ------------------------------------------------------------------
+    # Read
+    # ------------------------------------------------------------------
+    def trust(self, specialist_id: str) -> float:
+        """Point estimate: mean of Beta distribution."""
+        a = self._alpha.get(specialist_id, 1.0)
+        b = self._beta.get(specialist_id, 1.0)
+        return a / (a + b)
+    def snapshot(self) -> dict[str, float]:
+        """Rounded trust scores for all specialists."""
+        return {sid: round(self.trust(sid), 3) for sid in self.SPECIALIST_IDS}
+    def call_count(self, specialist_id: str) -> int:
+        return self._call_count.get(specialist_id, 0)
+    def most_trusted(self) -> str:
+        """Returns the specialist_id with the highest current trust score."""
+        return max(self.SPECIALIST_IDS, key=self.trust)
+    def least_trusted(self) -> str:
+        return min(self.SPECIALIST_IDS, key=self.trust)
+    # ------------------------------------------------------------------
+    # Calibration score (used in reward engine)
+    # ------------------------------------------------------------------
+    def brier_score(self, ground_truth_reliability: dict[str, float]) -> float:
+        """
+        Measures how well the trust scores predict actual specialist reliability.
+        Lower = better calibrated. Range 0.0–1.0.
+        ground_truth_reliability: {"S0": 0.9, "S1": 0.6, ...}
+        (hidden from agent, used only by reward engine)
+        """
+        total = 0.0
+        n = 0
+        for sid in self.SPECIALIST_IDS:
+            if sid in ground_truth_reliability:
+                predicted = self.trust(sid)
+                actual    = ground_truth_reliability[sid]
+                total += (predicted - actual) ** 2
+                n += 1
+        return total / n if n > 0 else 0.0
+    def calibration_reward(self, ground_truth_reliability: dict[str, float]) -> float:
+        """
+        Convert Brier score to a reward signal (0.0–1.0).
+        Perfect calibration → 1.0. Random → ~0.5.
+        """
+        brier = self.brier_score(ground_truth_reliability)
+        # Invert and scale: brier=0 → reward=1.0, brier=0.25 → reward=0.5
+        return max(0.0, 1.0 - 4.0 * brier)
+    def __repr__(self) -> str:
+        snap = self.snapshot()
+        return f"TrustLedger({snap})"