Spaces:
Running
Running
Commit Β·
325aa05
0
Parent(s):
Initial Set-up
Browse files- .DS_Store +0 -0
- .gitattributes +1 -0
- Dockerfile +32 -0
- README.md +0 -0
- app.py +206 -0
- comms_bus.py +0 -0
- environment.py +382 -0
- graders.py +237 -0
- inference.py +239 -0
- models.py +146 -0
- openenv.yaml +148 -0
- pyproject.toml +0 -0
- requirements.txt +5 -0
- scenarios.py +266 -0
- specialists.py +275 -0
- task_graph.py +154 -0
- tests/test_environment.py +0 -0
- tests/test_graders.py +0 -0
- tests/test_specialists.py +0 -0
- training/colab_notebook.ipynb +0 -0
- training/evaluate.py +0 -0
- training/train.py +0 -0
- trust_ledger.py +116 -0
.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
|
|
.gitattributes
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
outputs/charts/*.png filter=lfs diff=lfs merge=lfs -text
|
Dockerfile
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim
|
| 2 |
+
|
| 3 |
+
WORKDIR /app
|
| 4 |
+
|
| 5 |
+
# Install dependencies first (cached layer)
|
| 6 |
+
COPY requirements.txt .
|
| 7 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 8 |
+
|
| 9 |
+
# Copy all source files
|
| 10 |
+
COPY app.py .
|
| 11 |
+
COPY environment.py .
|
| 12 |
+
COPY models.py .
|
| 13 |
+
COPY graders.py .
|
| 14 |
+
COPY specialists.py .
|
| 15 |
+
COPY trust_ledger.py .
|
| 16 |
+
COPY task_graph.py .
|
| 17 |
+
COPY scenarios.py .
|
| 18 |
+
COPY openenv.yaml .
|
| 19 |
+
COPY inference.py .
|
| 20 |
+
|
| 21 |
+
# Create outputs directory for baseline scores
|
| 22 |
+
RUN mkdir -p outputs
|
| 23 |
+
|
| 24 |
+
# Expose port
|
| 25 |
+
EXPOSE 7860
|
| 26 |
+
|
| 27 |
+
# Health check
|
| 28 |
+
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
|
| 29 |
+
CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:7860/health')" || exit 1
|
| 30 |
+
|
| 31 |
+
# Start server
|
| 32 |
+
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]
|
README.md
ADDED
|
File without changes
|
app.py
ADDED
|
@@ -0,0 +1,206 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
from typing import Any
|
| 5 |
+
|
| 6 |
+
from fastapi import FastAPI, HTTPException, Query
|
| 7 |
+
from fastapi.responses import JSONResponse
|
| 8 |
+
from pydantic import BaseModel
|
| 9 |
+
|
| 10 |
+
from environment import SentinelEnv
|
| 11 |
+
from scenarios import scenario_summary
|
| 12 |
+
|
| 13 |
+
# ---------------------------------------------------------------------------
|
| 14 |
+
# App + session store
|
| 15 |
+
# ---------------------------------------------------------------------------
|
| 16 |
+
|
| 17 |
+
app = FastAPI(
|
| 18 |
+
title="SENTINEL β Multi-Agent Trust Calibration Environment",
|
| 19 |
+
description=(
|
| 20 |
+
"OpenEnv-compatible RL environment where an orchestrator agent learns "
|
| 21 |
+
"dynamic trust calibration across adversarial long-horizon tasks."
|
| 22 |
+
),
|
| 23 |
+
version="1.0.0",
|
| 24 |
+
)
|
| 25 |
+
|
| 26 |
+
# One env instance per session_id
|
| 27 |
+
_sessions: dict[str, SentinelEnv] = {}
|
| 28 |
+
|
| 29 |
+
def _get_env(session_id: str) -> SentinelEnv:
|
| 30 |
+
if session_id not in _sessions:
|
| 31 |
+
raise HTTPException(status_code=404, detail=f"Session '{session_id}' not found. Call /reset first.")
|
| 32 |
+
return _sessions[session_id]
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
# ---------------------------------------------------------------------------
|
| 36 |
+
# Request / Response models
|
| 37 |
+
# ---------------------------------------------------------------------------
|
| 38 |
+
|
| 39 |
+
class ResetRequest(BaseModel):
|
| 40 |
+
task_type: str | None = None
|
| 41 |
+
scenario_id: str | None = None
|
| 42 |
+
seed: int | None = None
|
| 43 |
+
|
| 44 |
+
class StepRequest(BaseModel):
|
| 45 |
+
session_id: str
|
| 46 |
+
task_type: str
|
| 47 |
+
action_type: str # delegate | verify | solve_independently | skip
|
| 48 |
+
specialist_id: str | None = None
|
| 49 |
+
subtask_response: str | None = None
|
| 50 |
+
reasoning: str | None = None
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
# ---------------------------------------------------------------------------
|
| 54 |
+
# Endpoints
|
| 55 |
+
# ---------------------------------------------------------------------------
|
| 56 |
+
|
| 57 |
+
@app.get("/health")
|
| 58 |
+
def health():
|
| 59 |
+
return {"status": "ok", "environment": "sentinel-env", "version": "1.0.0"}
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
@app.get("/metadata")
|
| 63 |
+
def metadata():
|
| 64 |
+
summary = scenario_summary()
|
| 65 |
+
return {
|
| 66 |
+
"name": "sentinel-env",
|
| 67 |
+
"version": "1.0.0",
|
| 68 |
+
"description": "Multi-agent trust calibration RL environment.",
|
| 69 |
+
"tasks": {
|
| 70 |
+
"task1": {"name": "Single-Step Trust Decision", "difficulty": "easy", "subtasks": 10, "max_steps": 15},
|
| 71 |
+
"task2": {"name": "Multi-Step Delegation Chain","difficulty": "medium","subtasks": 15, "max_steps": 30},
|
| 72 |
+
"task3": {"name": "Full Adversarial Episode", "difficulty": "hard", "subtasks": 20, "max_steps": 45},
|
| 73 |
+
},
|
| 74 |
+
"specialists": ["S0 (AccurateSlow)", "S1 (OverconfidentFast)",
|
| 75 |
+
"S2 (DomainBound)", "S3 (Adversarial)", "S4 (Degrading)"],
|
| 76 |
+
"action_types": ["delegate", "verify", "solve_independently", "skip"],
|
| 77 |
+
"scenarios": summary,
|
| 78 |
+
"reward_range": "(0.01, 0.99) boundary-exclusive",
|
| 79 |
+
}
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
@app.get("/tasks")
|
| 83 |
+
def tasks():
|
| 84 |
+
return {
|
| 85 |
+
"task1": {
|
| 86 |
+
"difficulty": "easy",
|
| 87 |
+
"description": "10-subtask linear chain. Choose which specialist to delegate each subtask to.",
|
| 88 |
+
"adversary_active": False,
|
| 89 |
+
"reward": "0.99 correct delegation + stakes awareness | 0.02 skip",
|
| 90 |
+
},
|
| 91 |
+
"task2": {
|
| 92 |
+
"difficulty": "medium",
|
| 93 |
+
"description": "15-subtask branching DAG. Build trust profile across honest specialists.",
|
| 94 |
+
"adversary_active": False,
|
| 95 |
+
"reward": "0.99 completion Γ calibration bonus",
|
| 96 |
+
},
|
| 97 |
+
"task3": {
|
| 98 |
+
"difficulty": "hard",
|
| 99 |
+
"description": "20-subtask full DAG. Adversarial specialist active. Detect and route around poison.",
|
| 100 |
+
"adversary_active": True,
|
| 101 |
+
"reward": "0.35Γcompletion + 0.30Γdetection + 0.25Γcalibration + 0.10Γefficiency",
|
| 102 |
+
},
|
| 103 |
+
}
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
@app.get("/schema")
|
| 107 |
+
def schema():
|
| 108 |
+
return {
|
| 109 |
+
"reset_request": ResetRequest.model_json_schema(),
|
| 110 |
+
"step_request": StepRequest.model_json_schema(),
|
| 111 |
+
}
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
@app.get("/grader")
|
| 115 |
+
def grader():
|
| 116 |
+
return {
|
| 117 |
+
"task1": {
|
| 118 |
+
"step": "task_accuracyΓ0.50 + stakes_awarenessΓ0.35 + efficiencyΓ0.15",
|
| 119 |
+
"terminal": "same as last step",
|
| 120 |
+
},
|
| 121 |
+
"task2": {
|
| 122 |
+
"step": "task_accuracyΓ0.65 + efficiencyΓ0.35",
|
| 123 |
+
"terminal": "completion_rateΓ0.65 + trust_calibrationΓ0.35",
|
| 124 |
+
},
|
| 125 |
+
"task3": {
|
| 126 |
+
"step": "task_accuracyΓ0.40 + stakes_awarenessΓ0.45 + efficiencyΓ0.15",
|
| 127 |
+
"terminal": "completionΓ0.35 + detectionΓ0.30 + calibrationΓ0.25 + efficiencyΓ0.10",
|
| 128 |
+
},
|
| 129 |
+
}
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
@app.post("/reset")
|
| 133 |
+
def reset(req: ResetRequest = ResetRequest()):
|
| 134 |
+
env = SentinelEnv()
|
| 135 |
+
result = env.reset(
|
| 136 |
+
task_type=req.task_type,
|
| 137 |
+
scenario_id=req.scenario_id,
|
| 138 |
+
seed=req.seed,
|
| 139 |
+
)
|
| 140 |
+
session_id = result["info"]["session_id"]
|
| 141 |
+
_sessions[session_id] = env
|
| 142 |
+
return result
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
@app.post("/step")
|
| 146 |
+
def step(req: StepRequest, session_id: str = Query(...)):
|
| 147 |
+
env = _get_env(session_id)
|
| 148 |
+
try:
|
| 149 |
+
result = env.step(req.model_dump())
|
| 150 |
+
except (RuntimeError, ValueError) as e:
|
| 151 |
+
raise HTTPException(status_code=400, detail=str(e))
|
| 152 |
+
|
| 153 |
+
# Clean up completed sessions to avoid memory leak
|
| 154 |
+
if result["done"]:
|
| 155 |
+
_sessions.pop(session_id, None)
|
| 156 |
+
|
| 157 |
+
return result
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
@app.get("/state")
|
| 161 |
+
def state(session_id: str = Query(...)):
|
| 162 |
+
env = _get_env(session_id)
|
| 163 |
+
return env.state(session_id=session_id)
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
@app.post("/mcp")
|
| 167 |
+
def mcp(body: dict[str, Any]):
|
| 168 |
+
"""MCP-compatible endpoint for tool-calling agents."""
|
| 169 |
+
method = body.get("method", "")
|
| 170 |
+
params = body.get("params", {})
|
| 171 |
+
|
| 172 |
+
if method == "reset":
|
| 173 |
+
env = SentinelEnv()
|
| 174 |
+
result = env.reset(**params)
|
| 175 |
+
session_id = result["info"]["session_id"]
|
| 176 |
+
_sessions[session_id] = env
|
| 177 |
+
return {"result": result}
|
| 178 |
+
|
| 179 |
+
elif method == "step":
|
| 180 |
+
session_id = params.get("session_id") or body.get("session_id")
|
| 181 |
+
if not session_id:
|
| 182 |
+
raise HTTPException(status_code=400, detail="session_id required for step.")
|
| 183 |
+
env = _get_env(session_id)
|
| 184 |
+
result = env.step(params)
|
| 185 |
+
if result["done"]:
|
| 186 |
+
_sessions.pop(session_id, None)
|
| 187 |
+
return {"result": result}
|
| 188 |
+
|
| 189 |
+
elif method == "state":
|
| 190 |
+
session_id = params.get("session_id")
|
| 191 |
+
if not session_id:
|
| 192 |
+
raise HTTPException(status_code=400, detail="session_id required for state.")
|
| 193 |
+
return {"result": _get_env(session_id).state(session_id)}
|
| 194 |
+
|
| 195 |
+
else:
|
| 196 |
+
raise HTTPException(status_code=400, detail=f"Unknown method: {method}")
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
# ---------------------------------------------------------------------------
|
| 200 |
+
# Entry point
|
| 201 |
+
# ---------------------------------------------------------------------------
|
| 202 |
+
|
| 203 |
+
if __name__ == "__main__":
|
| 204 |
+
import uvicorn
|
| 205 |
+
port = int(os.environ.get("PORT", 7860))
|
| 206 |
+
uvicorn.run("app:app", host="0.0.0.0", port=port, reload=False)
|
comms_bus.py
ADDED
|
File without changes
|
environment.py
ADDED
|
@@ -0,0 +1,382 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import random
|
| 4 |
+
import uuid
|
| 5 |
+
from typing import Any
|
| 6 |
+
|
| 7 |
+
from graders import (
|
| 8 |
+
grade_task1_step,
|
| 9 |
+
grade_task2_step, grade_task2_terminal,
|
| 10 |
+
grade_task3_step, grade_task3_terminal,
|
| 11 |
+
)
|
| 12 |
+
from scenarios import sample_scenario, get_scenario, Scenario
|
| 13 |
+
from specialists import SpecialistPool
|
| 14 |
+
from task_graph import TaskGraph
|
| 15 |
+
from trust_ledger import TrustLedger
|
| 16 |
+
|
| 17 |
+
# ---------------------------------------------------------------------------
|
| 18 |
+
# Lazy Pydantic imports β falls back to plain dicts if not installed
|
| 19 |
+
# ---------------------------------------------------------------------------
|
| 20 |
+
try:
|
| 21 |
+
from models import (
|
| 22 |
+
SentinelObservation, SentinelAction, SentinelReward,
|
| 23 |
+
SentinelState, StepResult, EpisodeStatus,
|
| 24 |
+
TaskType, ActionType, SpecialistId, ResetRequest,
|
| 25 |
+
)
|
| 26 |
+
_PYDANTIC = True
|
| 27 |
+
except ImportError:
|
| 28 |
+
_PYDANTIC = False
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
# Hidden ground-truth reliability for each specialist FSM type.
|
| 32 |
+
# These are NEVER sent to the agent β only used by the reward engine
|
| 33 |
+
# to compute calibration score.
|
| 34 |
+
_GROUND_TRUTH_RELIABILITY: dict[str, float] = {
|
| 35 |
+
"S0": 0.90, # AccurateSlow
|
| 36 |
+
"S1": 0.60, # OverconfidentFast
|
| 37 |
+
"S2": 0.70, # DomainBound (avg of in/out domain)
|
| 38 |
+
"S3": 0.15, # Adversarial (unreliable overall)
|
| 39 |
+
"S4": 0.65, # Degrading (avg over calls)
|
| 40 |
+
}
|
| 41 |
+
|
| 42 |
+
MAX_STEPS = {
|
| 43 |
+
"task1": 15,
|
| 44 |
+
"task2": 30,
|
| 45 |
+
"task3": 45,
|
| 46 |
+
}
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
class SentinelEnv:
|
| 50 |
+
"""
|
| 51 |
+
Core environment class β mirrors Round 1 IncidentEnv pattern exactly.
|
| 52 |
+
reset() / step() / state() API.
|
| 53 |
+
No BaseEnv subclassing needed β plain Python class, FastAPI wraps it.
|
| 54 |
+
"""
|
| 55 |
+
|
| 56 |
+
def __init__(self) -> None:
|
| 57 |
+
self.current_scenario: Scenario | None = None
|
| 58 |
+
self.episode_id: str = ""
|
| 59 |
+
self.session_id: str = ""
|
| 60 |
+
self.step_count: int = 0
|
| 61 |
+
self.max_steps: int = 0
|
| 62 |
+
self.total_reward: float = 0.0
|
| 63 |
+
self.last_reward: float = 0.0
|
| 64 |
+
self.done: bool = False
|
| 65 |
+
self.episode_status: str = "active"
|
| 66 |
+
self.last_action_summary: str | None = None
|
| 67 |
+
|
| 68 |
+
self._graph: TaskGraph | None = None
|
| 69 |
+
self._ledger: TrustLedger = TrustLedger()
|
| 70 |
+
self._pool: SpecialistPool = SpecialistPool()
|
| 71 |
+
self._rng: random.Random = random.Random()
|
| 72 |
+
|
| 73 |
+
# ------------------------------------------------------------------
|
| 74 |
+
# reset()
|
| 75 |
+
# ------------------------------------------------------------------
|
| 76 |
+
|
| 77 |
+
def reset(
|
| 78 |
+
self,
|
| 79 |
+
task_type: str | None = None,
|
| 80 |
+
scenario_id: str | None = None,
|
| 81 |
+
seed: int | None = None,
|
| 82 |
+
) -> dict:
|
| 83 |
+
|
| 84 |
+
self._rng = random.Random(seed)
|
| 85 |
+
|
| 86 |
+
# Select scenario
|
| 87 |
+
if scenario_id:
|
| 88 |
+
scenario = get_scenario(scenario_id)
|
| 89 |
+
else:
|
| 90 |
+
task = task_type or "task3"
|
| 91 |
+
scenario = sample_scenario(task, seed=seed)
|
| 92 |
+
|
| 93 |
+
self.current_scenario = scenario
|
| 94 |
+
self.episode_id = str(uuid.uuid4())
|
| 95 |
+
self.session_id = str(uuid.uuid4())
|
| 96 |
+
self.step_count = 0
|
| 97 |
+
self.max_steps = MAX_STEPS[scenario["task_type"]]
|
| 98 |
+
self.total_reward = 0.0
|
| 99 |
+
self.last_reward = 0.0
|
| 100 |
+
self.done = False
|
| 101 |
+
self.episode_status = "active"
|
| 102 |
+
self.last_action_summary = None
|
| 103 |
+
|
| 104 |
+
# Reset subcomponents
|
| 105 |
+
self._graph = TaskGraph(scenario)
|
| 106 |
+
self._ledger.reset()
|
| 107 |
+
self._pool.reset(seed=seed)
|
| 108 |
+
|
| 109 |
+
return self._build_step_result(
|
| 110 |
+
reward_value=0.0,
|
| 111 |
+
reason="Episode initialized.",
|
| 112 |
+
breakdown={},
|
| 113 |
+
done=False,
|
| 114 |
+
extra_info={"episode_id": self.episode_id, "session_id": self.session_id},
|
| 115 |
+
)
|
| 116 |
+
|
| 117 |
+
# ------------------------------------------------------------------
|
| 118 |
+
# step()
|
| 119 |
+
# ------------------------------------------------------------------
|
| 120 |
+
|
| 121 |
+
def step(self, action: dict) -> dict:
|
| 122 |
+
if self.current_scenario is None:
|
| 123 |
+
raise RuntimeError("Call reset() before step().")
|
| 124 |
+
if self.done:
|
| 125 |
+
raise RuntimeError("Episode already completed. Call reset().")
|
| 126 |
+
|
| 127 |
+
# --- Validate session ---
|
| 128 |
+
if action.get("session_id") and action["session_id"] != self.session_id:
|
| 129 |
+
raise ValueError(
|
| 130 |
+
f"session_id mismatch: expected '{self.session_id}', got '{action['session_id']}'"
|
| 131 |
+
)
|
| 132 |
+
|
| 133 |
+
action_type = action.get("action_type", "delegate")
|
| 134 |
+
specialist_id = action.get("specialist_id")
|
| 135 |
+
task_type = self.current_scenario["task_type"]
|
| 136 |
+
|
| 137 |
+
# --- Validate action fields ---
|
| 138 |
+
if action_type in ("delegate", "verify") and not specialist_id:
|
| 139 |
+
raise ValueError(f"action_type='{action_type}' requires specialist_id.")
|
| 140 |
+
if action_type == "solve_independently" and not action.get("subtask_response"):
|
| 141 |
+
raise ValueError("action_type='solve_independently' requires subtask_response.")
|
| 142 |
+
|
| 143 |
+
# --- Get current subtask ---
|
| 144 |
+
node = self._graph.current_node()
|
| 145 |
+
if node is None:
|
| 146 |
+
# All nodes done β emit terminal reward
|
| 147 |
+
return self._terminal_reward()
|
| 148 |
+
|
| 149 |
+
subtask = node.subtask
|
| 150 |
+
stakes = subtask["stakes"]
|
| 151 |
+
|
| 152 |
+
self.step_count += 1
|
| 153 |
+
|
| 154 |
+
# --- Execute specialist or self-solve ---
|
| 155 |
+
if action_type == "skip":
|
| 156 |
+
self._graph.skip_node(subtask["id"])
|
| 157 |
+
outcome = 0.0
|
| 158 |
+
was_adversarial = False
|
| 159 |
+
self.last_action_summary = f"Skipped {subtask['id']}"
|
| 160 |
+
|
| 161 |
+
elif action_type == "solve_independently":
|
| 162 |
+
# Agent solves itself β always correct (no specialist involved)
|
| 163 |
+
# But costs 2 steps (enforced via max_steps budget pressure)
|
| 164 |
+
self.step_count += 1 # extra step cost
|
| 165 |
+
outcome = 1.0
|
| 166 |
+
was_adversarial = False
|
| 167 |
+
self._graph.record_outcome(subtask["id"], outcome, "SELF")
|
| 168 |
+
self.last_action_summary = f"Self-solved {subtask['id']}"
|
| 169 |
+
|
| 170 |
+
elif action_type == "verify":
|
| 171 |
+
# First get result, then verify (costs +1 step)
|
| 172 |
+
result = self._pool.execute(specialist_id, subtask["description"], stakes, self._rng)
|
| 173 |
+
self.step_count += 1 # verification step cost
|
| 174 |
+
outcome = result.outcome if not result.is_adversarial else 0.0
|
| 175 |
+
was_adversarial = result.is_adversarial
|
| 176 |
+
# Verification means agent caught adversarial β treat as detection
|
| 177 |
+
if result.is_adversarial:
|
| 178 |
+
outcome = 1.0 # successfully avoided
|
| 179 |
+
self._graph.record_outcome(subtask["id"], outcome, specialist_id, was_adversarial)
|
| 180 |
+
self._ledger.update(specialist_id, result.outcome, stakes)
|
| 181 |
+
self.last_action_summary = f"Verified {specialist_id} on {subtask['id']}"
|
| 182 |
+
|
| 183 |
+
else: # delegate
|
| 184 |
+
result = self._pool.execute(specialist_id, subtask["description"], stakes, self._rng)
|
| 185 |
+
was_adversarial = result.is_adversarial
|
| 186 |
+
outcome = 0.0 if was_adversarial else result.outcome
|
| 187 |
+
self._graph.record_outcome(subtask["id"], outcome, specialist_id, was_adversarial)
|
| 188 |
+
self._ledger.update(specialist_id, result.outcome, stakes)
|
| 189 |
+
self.last_action_summary = f"Delegated to {specialist_id} on {subtask['id']}"
|
| 190 |
+
|
| 191 |
+
# --- Grade this step ---
|
| 192 |
+
reward_value, reason, breakdown = self._grade_step(
|
| 193 |
+
task_type, action_type, specialist_id, outcome,
|
| 194 |
+
stakes, was_adversarial,
|
| 195 |
+
)
|
| 196 |
+
|
| 197 |
+
self.last_reward = reward_value
|
| 198 |
+
self.total_reward += reward_value
|
| 199 |
+
|
| 200 |
+
# --- Check episode end ---
|
| 201 |
+
all_done = self._graph.is_done()
|
| 202 |
+
budget_gone = self.step_count >= self.max_steps
|
| 203 |
+
poisoned = was_adversarial and action_type == "delegate" and stakes >= 0.85
|
| 204 |
+
|
| 205 |
+
if all_done or budget_gone or poisoned:
|
| 206 |
+
# Emit terminal reward on top of step reward
|
| 207 |
+
return self._terminal_reward(
|
| 208 |
+
step_reward=reward_value,
|
| 209 |
+
step_reason=reason,
|
| 210 |
+
step_breakdown=breakdown,
|
| 211 |
+
forced_end=poisoned,
|
| 212 |
+
)
|
| 213 |
+
|
| 214 |
+
return self._build_step_result(reward_value, reason, breakdown, done=False)
|
| 215 |
+
|
| 216 |
+
# ------------------------------------------------------------------
|
| 217 |
+
# state()
|
| 218 |
+
# ------------------------------------------------------------------
|
| 219 |
+
|
| 220 |
+
def state(self, session_id: str | None = None) -> dict:
|
| 221 |
+
if self.current_scenario is None:
|
| 222 |
+
raise RuntimeError("No active episode. Call reset() first.")
|
| 223 |
+
return {
|
| 224 |
+
"episode_id": self.episode_id,
|
| 225 |
+
"session_id": session_id or self.session_id,
|
| 226 |
+
"step_count": self.step_count,
|
| 227 |
+
"max_steps": self.max_steps,
|
| 228 |
+
"total_reward": round(self.total_reward, 4),
|
| 229 |
+
"done": self.done,
|
| 230 |
+
"scenario_id": self.current_scenario["scenario_id"],
|
| 231 |
+
"task_type": self.current_scenario["task_type"],
|
| 232 |
+
"difficulty": self._difficulty(),
|
| 233 |
+
"status": self.episode_status,
|
| 234 |
+
"last_reward": round(self.last_reward, 4),
|
| 235 |
+
"subtasks_completed": self._graph.subtasks_completed(),
|
| 236 |
+
"subtasks_total": self._graph.subtasks_total(),
|
| 237 |
+
"trust_snapshot": self._ledger.snapshot(),
|
| 238 |
+
"adversarial_detections": self._graph.adversarial_detections(),
|
| 239 |
+
"adversarial_poisonings": self._graph.adversarial_poisonings(),
|
| 240 |
+
}
|
| 241 |
+
|
| 242 |
+
# ------------------------------------------------------------------
|
| 243 |
+
# Internal helpers
|
| 244 |
+
# ------------------------------------------------------------------
|
| 245 |
+
|
| 246 |
+
def _grade_step(
|
| 247 |
+
self,
|
| 248 |
+
task_type: str,
|
| 249 |
+
action_type: str,
|
| 250 |
+
specialist_id: str | None,
|
| 251 |
+
outcome: float,
|
| 252 |
+
stakes: float,
|
| 253 |
+
was_adversarial: bool,
|
| 254 |
+
) -> tuple[float, str, dict]:
|
| 255 |
+
|
| 256 |
+
if task_type == "task1":
|
| 257 |
+
return grade_task1_step(
|
| 258 |
+
chosen_specialist=specialist_id or "SELF",
|
| 259 |
+
specialist_outcome=outcome,
|
| 260 |
+
stakes=stakes,
|
| 261 |
+
was_adversarial=was_adversarial,
|
| 262 |
+
action_type=action_type,
|
| 263 |
+
)
|
| 264 |
+
elif task_type == "task2":
|
| 265 |
+
return grade_task2_step(
|
| 266 |
+
specialist_outcome=outcome,
|
| 267 |
+
action_type=action_type,
|
| 268 |
+
step_count=self.step_count,
|
| 269 |
+
max_steps=self.max_steps,
|
| 270 |
+
)
|
| 271 |
+
else: # task3
|
| 272 |
+
return grade_task3_step(
|
| 273 |
+
specialist_outcome=outcome,
|
| 274 |
+
stakes=stakes,
|
| 275 |
+
was_adversarial=was_adversarial,
|
| 276 |
+
action_type=action_type,
|
| 277 |
+
step_count=self.step_count,
|
| 278 |
+
max_steps=self.max_steps,
|
| 279 |
+
)
|
| 280 |
+
|
| 281 |
+
def _terminal_reward(
|
| 282 |
+
self,
|
| 283 |
+
step_reward: float = 0.0,
|
| 284 |
+
step_reason: str = "",
|
| 285 |
+
step_breakdown: dict | None = None,
|
| 286 |
+
forced_end: bool = False,
|
| 287 |
+
) -> dict:
|
| 288 |
+
task_type = self.current_scenario["task_type"]
|
| 289 |
+
|
| 290 |
+
if task_type == "task1":
|
| 291 |
+
# Terminal for task1: completion-based if no step reward provided
|
| 292 |
+
from graders import clamp
|
| 293 |
+
if step_reward > 0.0:
|
| 294 |
+
terminal_value = step_reward
|
| 295 |
+
terminal_reason = step_reason
|
| 296 |
+
terminal_breakdown = step_breakdown or {}
|
| 297 |
+
else:
|
| 298 |
+
completion = self._graph.completion_rate()
|
| 299 |
+
terminal_value = clamp(0.4 + 0.5 * completion)
|
| 300 |
+
terminal_reason = f"Task1 complete. Completion={completion:.0%}."
|
| 301 |
+
terminal_breakdown = {"completion_rate": round(completion, 3)}
|
| 302 |
+
elif task_type == "task2":
|
| 303 |
+
terminal_value, terminal_reason, terminal_breakdown = grade_task2_terminal(
|
| 304 |
+
self._graph, self._ledger, _GROUND_TRUTH_RELIABILITY
|
| 305 |
+
)
|
| 306 |
+
else:
|
| 307 |
+
terminal_value, terminal_reason, terminal_breakdown = grade_task3_terminal(
|
| 308 |
+
self._graph, self._ledger, _GROUND_TRUTH_RELIABILITY,
|
| 309 |
+
self.step_count, self.max_steps,
|
| 310 |
+
)
|
| 311 |
+
|
| 312 |
+
if forced_end:
|
| 313 |
+
terminal_value = 0.01
|
| 314 |
+
terminal_reason = "Critical adversarial poison accepted at peak stakes β episode failed."
|
| 315 |
+
|
| 316 |
+
self.last_reward = terminal_value
|
| 317 |
+
self.total_reward += terminal_value
|
| 318 |
+
self.done = True
|
| 319 |
+
self.episode_status = "failed" if forced_end else "completed"
|
| 320 |
+
|
| 321 |
+
return self._build_step_result(
|
| 322 |
+
terminal_value, terminal_reason, terminal_breakdown,
|
| 323 |
+
done=True,
|
| 324 |
+
extra_info={
|
| 325 |
+
**self._graph.summary(),
|
| 326 |
+
"trust_snapshot": self._ledger.snapshot(),
|
| 327 |
+
"forced_end": forced_end,
|
| 328 |
+
},
|
| 329 |
+
)
|
| 330 |
+
|
| 331 |
+
def _build_step_result(
|
| 332 |
+
self,
|
| 333 |
+
reward_value: float,
|
| 334 |
+
reason: str,
|
| 335 |
+
breakdown: dict,
|
| 336 |
+
done: bool,
|
| 337 |
+
extra_info: dict | None = None,
|
| 338 |
+
) -> dict:
|
| 339 |
+
node = self._graph.current_node() if self._graph and not done else None
|
| 340 |
+
|
| 341 |
+
obs = {
|
| 342 |
+
"session_id": self.session_id,
|
| 343 |
+
"scenario_id": self.current_scenario["scenario_id"] if self.current_scenario else "",
|
| 344 |
+
"task_type": self.current_scenario["task_type"] if self.current_scenario else "",
|
| 345 |
+
"difficulty": self._difficulty(),
|
| 346 |
+
"task_description": self.current_scenario["description"] if self.current_scenario else "",
|
| 347 |
+
"current_subtask": node.subtask["description"] if node else "All subtasks complete.",
|
| 348 |
+
"subtask_index": node.subtask["id"] if node else "DONE",
|
| 349 |
+
"subtasks_total": self._graph.subtasks_total() if self._graph else 0,
|
| 350 |
+
"subtasks_remaining": self._graph.subtasks_remaining() if self._graph else 0,
|
| 351 |
+
"available_specialists": self._pool.available_ids(),
|
| 352 |
+
"trust_snapshot": self._ledger.snapshot(),
|
| 353 |
+
"stakes_level": node.subtask["stakes"] if node else 0.0,
|
| 354 |
+
"step_count": self.step_count,
|
| 355 |
+
"max_steps": self.max_steps,
|
| 356 |
+
"last_action_summary": self.last_action_summary,
|
| 357 |
+
"last_reward": round(self.last_reward, 4),
|
| 358 |
+
"episode_status": self.episode_status,
|
| 359 |
+
}
|
| 360 |
+
|
| 361 |
+
reward = {
|
| 362 |
+
"value": round(reward_value, 4),
|
| 363 |
+
"reason": reason,
|
| 364 |
+
"signal_breakdown": breakdown,
|
| 365 |
+
}
|
| 366 |
+
|
| 367 |
+
info = {
|
| 368 |
+
"episode_id": self.episode_id,
|
| 369 |
+
"session_id": self.session_id,
|
| 370 |
+
"step_count": self.step_count,
|
| 371 |
+
"max_steps": self.max_steps,
|
| 372 |
+
"total_reward": round(self.total_reward, 4),
|
| 373 |
+
}
|
| 374 |
+
if extra_info:
|
| 375 |
+
info.update(extra_info)
|
| 376 |
+
|
| 377 |
+
return {"observation": obs, "reward": reward, "done": done, "info": info}
|
| 378 |
+
|
| 379 |
+
def _difficulty(self) -> str:
|
| 380 |
+
return {"task1": "easy", "task2": "medium", "task3": "hard"}.get(
|
| 381 |
+
self.current_scenario["task_type"] if self.current_scenario else "task3", "hard"
|
| 382 |
+
)
|
graders.py
ADDED
|
@@ -0,0 +1,237 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from task_graph import TaskGraph
|
| 4 |
+
from trust_ledger import TrustLedger
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
# ---------------------------------------------------------------------------
|
| 8 |
+
# Helpers
|
| 9 |
+
# ---------------------------------------------------------------------------
|
| 10 |
+
|
| 11 |
+
def clamp(value: float) -> float:
|
| 12 |
+
"""Boundary-exclusive reward β matches Round 1 convention."""
|
| 13 |
+
return max(0.01, min(0.99, value))
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
# ---------------------------------------------------------------------------
|
| 17 |
+
# Task 1 β Single-step trust decision (Easy)
|
| 18 |
+
#
|
| 19 |
+
# Did the orchestrator pick the right specialist for this subtask?
|
| 20 |
+
# Graded on: specialist_reliability Γ stakes_awareness
|
| 21 |
+
# ---------------------------------------------------------------------------
|
| 22 |
+
|
| 23 |
+
def grade_task1_step(
|
| 24 |
+
chosen_specialist: str,
|
| 25 |
+
specialist_outcome: float, # 1.0 correct | 0.0 wrong
|
| 26 |
+
stakes: float,
|
| 27 |
+
was_adversarial: bool,
|
| 28 |
+
action_type: str, # delegate | verify | solve_independently | skip
|
| 29 |
+
) -> tuple[float, str, dict]:
|
| 30 |
+
|
| 31 |
+
if action_type == "skip":
|
| 32 |
+
return clamp(0.02), "Subtask skipped β heavy penalty.", {"task_accuracy": 0.0, "stakes_awareness": 0.0, "efficiency": 0.02}
|
| 33 |
+
|
| 34 |
+
task_accuracy = specialist_outcome # 1.0 or 0.0
|
| 35 |
+
|
| 36 |
+
# Stakes awareness: avoided adversarial = bonus, missed it = penalty
|
| 37 |
+
if was_adversarial and stakes >= 0.70:
|
| 38 |
+
if action_type in ("verify", "solve_independently"):
|
| 39 |
+
stakes_awareness = 0.99 # correctly routed around adversary
|
| 40 |
+
reason = f"Adversarial specialist detected at stakes={stakes:.2f}. Correct evasion."
|
| 41 |
+
else:
|
| 42 |
+
stakes_awareness = 0.0
|
| 43 |
+
reason = f"Adversarial specialist poisoned step at stakes={stakes:.2f}."
|
| 44 |
+
else:
|
| 45 |
+
stakes_awareness = 0.6 + 0.4 * task_accuracy
|
| 46 |
+
reason = f"Delegated to {chosen_specialist}, outcome={'correct' if task_accuracy == 1.0 else 'wrong'}."
|
| 47 |
+
|
| 48 |
+
# Efficiency: verify and solve_independently cost extra steps
|
| 49 |
+
efficiency = {"delegate": 1.0, "verify": 0.7, "solve_independently": 0.5, "skip": 0.0}.get(action_type, 0.5)
|
| 50 |
+
|
| 51 |
+
score = (
|
| 52 |
+
0.50 * task_accuracy
|
| 53 |
+
+ 0.35 * stakes_awareness
|
| 54 |
+
+ 0.15 * efficiency
|
| 55 |
+
)
|
| 56 |
+
|
| 57 |
+
breakdown = {
|
| 58 |
+
"task_accuracy": round(task_accuracy, 3),
|
| 59 |
+
"stakes_awareness": round(stakes_awareness, 3),
|
| 60 |
+
"efficiency": round(efficiency, 3),
|
| 61 |
+
}
|
| 62 |
+
return clamp(score), reason, breakdown
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
# ---------------------------------------------------------------------------
|
| 66 |
+
# Task 2 β Multi-step delegation chain (Medium, no adversary)
|
| 67 |
+
#
|
| 68 |
+
# Per-step: task accuracy + efficiency
|
| 69 |
+
# Terminal: calibration bonus based on trust ledger accuracy
|
| 70 |
+
# ---------------------------------------------------------------------------
|
| 71 |
+
|
| 72 |
+
def grade_task2_step(
|
| 73 |
+
specialist_outcome: float,
|
| 74 |
+
action_type: str,
|
| 75 |
+
step_count: int,
|
| 76 |
+
max_steps: int,
|
| 77 |
+
) -> tuple[float, str, dict]:
|
| 78 |
+
|
| 79 |
+
if action_type == "skip":
|
| 80 |
+
return clamp(0.02), "Subtask skipped.", {"task_accuracy": 0.0, "efficiency": 0.02}
|
| 81 |
+
|
| 82 |
+
task_accuracy = specialist_outcome
|
| 83 |
+
step_budget_used = step_count / max_steps
|
| 84 |
+
efficiency = max(0.0, 1.0 - step_budget_used)
|
| 85 |
+
|
| 86 |
+
# verify/solve_independently burn extra steps but may help calibration
|
| 87 |
+
action_cost = {"delegate": 0.0, "verify": 0.10, "solve_independently": 0.20}.get(action_type, 0.0)
|
| 88 |
+
efficiency = max(0.0, efficiency - action_cost)
|
| 89 |
+
|
| 90 |
+
score = (
|
| 91 |
+
0.65 * task_accuracy
|
| 92 |
+
+ 0.35 * efficiency
|
| 93 |
+
)
|
| 94 |
+
|
| 95 |
+
reason = (
|
| 96 |
+
f"{'Correct' if task_accuracy == 1.0 else 'Wrong'} result via {action_type}. "
|
| 97 |
+
f"Step budget {step_count}/{max_steps}."
|
| 98 |
+
)
|
| 99 |
+
breakdown = {
|
| 100 |
+
"task_accuracy": round(task_accuracy, 3),
|
| 101 |
+
"efficiency": round(efficiency, 3),
|
| 102 |
+
}
|
| 103 |
+
return clamp(score), reason, breakdown
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def grade_task2_terminal(
|
| 107 |
+
graph: TaskGraph,
|
| 108 |
+
ledger: TrustLedger,
|
| 109 |
+
ground_truth_reliability: dict[str, float],
|
| 110 |
+
) -> tuple[float, str, dict]:
|
| 111 |
+
|
| 112 |
+
completion = graph.completion_rate()
|
| 113 |
+
calibration = ledger.calibration_reward(ground_truth_reliability)
|
| 114 |
+
|
| 115 |
+
score = (
|
| 116 |
+
0.65 * completion
|
| 117 |
+
+ 0.35 * calibration
|
| 118 |
+
)
|
| 119 |
+
|
| 120 |
+
reason = (
|
| 121 |
+
f"Episode complete. Completion={completion:.0%}, "
|
| 122 |
+
f"Trust calibration score={calibration:.3f}."
|
| 123 |
+
)
|
| 124 |
+
breakdown = {
|
| 125 |
+
"completion_rate": round(completion, 3),
|
| 126 |
+
"trust_calibration": round(calibration, 3),
|
| 127 |
+
}
|
| 128 |
+
return clamp(score), reason, breakdown
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
# ---------------------------------------------------------------------------
|
| 132 |
+
# Task 3 β Full adversarial episode (Hard)
|
| 133 |
+
#
|
| 134 |
+
# Per-step: task accuracy + stakes awareness
|
| 135 |
+
# Terminal: completion + adversarial detection + calibration + efficiency
|
| 136 |
+
# ---------------------------------------------------------------------------
|
| 137 |
+
|
| 138 |
+
def grade_task3_step(
|
| 139 |
+
specialist_outcome: float,
|
| 140 |
+
stakes: float,
|
| 141 |
+
was_adversarial: bool,
|
| 142 |
+
action_type: str,
|
| 143 |
+
step_count: int,
|
| 144 |
+
max_steps: int,
|
| 145 |
+
) -> tuple[float, str, dict]:
|
| 146 |
+
|
| 147 |
+
if action_type == "skip":
|
| 148 |
+
return clamp(0.02), "Subtask skipped.", {"task_accuracy": 0.0, "stakes_awareness": 0.0, "efficiency": 0.02}
|
| 149 |
+
|
| 150 |
+
task_accuracy = specialist_outcome
|
| 151 |
+
|
| 152 |
+
# Adversarial detection at high stakes
|
| 153 |
+
if was_adversarial and stakes >= 0.70:
|
| 154 |
+
if action_type in ("verify", "solve_independently"):
|
| 155 |
+
stakes_awareness = 0.99
|
| 156 |
+
reason = f"β
Adversarial detected at stakes={stakes:.2f} β evasion successful."
|
| 157 |
+
else:
|
| 158 |
+
stakes_awareness = 0.0
|
| 159 |
+
task_accuracy = 0.0 # poison accepted β overrides outcome
|
| 160 |
+
reason = f"β Adversarial poison accepted at stakes={stakes:.2f}."
|
| 161 |
+
else:
|
| 162 |
+
stakes_awareness = 0.5 + 0.5 * task_accuracy
|
| 163 |
+
reason = (
|
| 164 |
+
f"{'Correct' if task_accuracy == 1.0 else 'Wrong'} result "
|
| 165 |
+
f"via {action_type} at stakes={stakes:.2f}."
|
| 166 |
+
)
|
| 167 |
+
|
| 168 |
+
step_budget_used = step_count / max_steps
|
| 169 |
+
efficiency = max(0.0, 1.0 - step_budget_used)
|
| 170 |
+
|
| 171 |
+
score = (
|
| 172 |
+
0.40 * task_accuracy
|
| 173 |
+
+ 0.45 * stakes_awareness
|
| 174 |
+
+ 0.15 * efficiency
|
| 175 |
+
)
|
| 176 |
+
|
| 177 |
+
breakdown = {
|
| 178 |
+
"task_accuracy": round(task_accuracy, 3),
|
| 179 |
+
"stakes_awareness": round(stakes_awareness, 3),
|
| 180 |
+
"efficiency": round(efficiency, 3),
|
| 181 |
+
}
|
| 182 |
+
return clamp(score), reason, breakdown
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
def grade_task3_terminal(
|
| 186 |
+
graph: TaskGraph,
|
| 187 |
+
ledger: TrustLedger,
|
| 188 |
+
ground_truth_reliability: dict[str, float],
|
| 189 |
+
total_steps: int,
|
| 190 |
+
max_steps: int,
|
| 191 |
+
) -> tuple[float, str, dict]:
|
| 192 |
+
|
| 193 |
+
completion = graph.completion_rate()
|
| 194 |
+
detections = graph.adversarial_detections()
|
| 195 |
+
poisonings = graph.adversarial_poisonings()
|
| 196 |
+
total_adversarial = detections + poisonings
|
| 197 |
+
detection_rate = detections / total_adversarial if total_adversarial > 0 else 1.0
|
| 198 |
+
calibration = ledger.calibration_reward(ground_truth_reliability)
|
| 199 |
+
efficiency = max(0.0, 1.0 - total_steps / max_steps)
|
| 200 |
+
|
| 201 |
+
score = (
|
| 202 |
+
0.35 * completion
|
| 203 |
+
+ 0.30 * detection_rate
|
| 204 |
+
+ 0.25 * calibration
|
| 205 |
+
+ 0.10 * efficiency
|
| 206 |
+
)
|
| 207 |
+
|
| 208 |
+
reason = (
|
| 209 |
+
f"Mission complete. Completion={completion:.0%}, "
|
| 210 |
+
f"Detection={detection_rate:.0%} ({detections}/{total_adversarial}), "
|
| 211 |
+
f"Calibration={calibration:.3f}, Efficiency={efficiency:.3f}."
|
| 212 |
+
)
|
| 213 |
+
breakdown = {
|
| 214 |
+
"completion_rate": round(completion, 3),
|
| 215 |
+
"detection_rate": round(detection_rate, 3),
|
| 216 |
+
"trust_calibration": round(calibration, 3),
|
| 217 |
+
"efficiency": round(efficiency, 3),
|
| 218 |
+
"adversarial_detections": detections,
|
| 219 |
+
"adversarial_poisonings": poisonings,
|
| 220 |
+
}
|
| 221 |
+
return clamp(score), reason, breakdown
|
| 222 |
+
|
| 223 |
+
|
| 224 |
+
# ---------------------------------------------------------------------------
|
| 225 |
+
# Unified grader dispatcher
|
| 226 |
+
# ---------------------------------------------------------------------------
|
| 227 |
+
|
| 228 |
+
STEP_GRADERS = {
|
| 229 |
+
"task1": grade_task1_step,
|
| 230 |
+
"task2": grade_task2_step,
|
| 231 |
+
"task3": grade_task3_step,
|
| 232 |
+
}
|
| 233 |
+
|
| 234 |
+
TERMINAL_GRADERS = {
|
| 235 |
+
"task2": grade_task2_terminal,
|
| 236 |
+
"task3": grade_task3_terminal,
|
| 237 |
+
}
|
inference.py
ADDED
|
@@ -0,0 +1,239 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
SENTINEL β Baseline Inference Script
|
| 3 |
+
=====================================
|
| 4 |
+
Runs a deterministic heuristic agent against all 3 task types.
|
| 5 |
+
Emits [START] / [STEP] / [END] structured logs exactly as required.
|
| 6 |
+
|
| 7 |
+
Heuristic agent logic:
|
| 8 |
+
- Always delegates to the specialist with highest trust score
|
| 9 |
+
- If stakes >= 0.70 and trust of chosen specialist < 0.60 β verify instead
|
| 10 |
+
- Never skips
|
| 11 |
+
- Never solves independently (too expensive)
|
| 12 |
+
|
| 13 |
+
Usage:
|
| 14 |
+
python inference.py
|
| 15 |
+
|
| 16 |
+
Environment variables (optional):
|
| 17 |
+
API_BASE_URL β OpenAI-compatible endpoint (for LLM agent, not heuristic)
|
| 18 |
+
MODEL_NAME β model identifier
|
| 19 |
+
HF_TOKEN β API key
|
| 20 |
+
ENV_URL β remote env URL (default: in-process)
|
| 21 |
+
"""
|
| 22 |
+
|
| 23 |
+
from __future__ import annotations
|
| 24 |
+
|
| 25 |
+
import json
|
| 26 |
+
import os
|
| 27 |
+
import sys
|
| 28 |
+
import time
|
| 29 |
+
from pathlib import Path
|
| 30 |
+
|
| 31 |
+
# ---------------------------------------------------------------------------
|
| 32 |
+
# Try remote env first, fall back to in-process
|
| 33 |
+
# ---------------------------------------------------------------------------
|
| 34 |
+
|
| 35 |
+
ENV_URL = os.environ.get("ENV_URL", "").strip()
|
| 36 |
+
|
| 37 |
+
if ENV_URL:
|
| 38 |
+
import httpx
|
| 39 |
+
USE_REMOTE = True
|
| 40 |
+
else:
|
| 41 |
+
from environment import SentinelEnv
|
| 42 |
+
USE_REMOTE = False
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
# ---------------------------------------------------------------------------
|
| 46 |
+
# Env interface (works both remote and in-process)
|
| 47 |
+
# ---------------------------------------------------------------------------
|
| 48 |
+
|
| 49 |
+
class EnvClient:
|
| 50 |
+
def __init__(self):
|
| 51 |
+
if USE_REMOTE:
|
| 52 |
+
self._client = httpx.Client(base_url=ENV_URL, timeout=30)
|
| 53 |
+
else:
|
| 54 |
+
self._env = SentinelEnv()
|
| 55 |
+
self.session_id: str = ""
|
| 56 |
+
|
| 57 |
+
def reset(self, task_type: str, seed: int | None = None) -> dict:
|
| 58 |
+
if USE_REMOTE:
|
| 59 |
+
r = self._client.post("/reset", json={"task_type": task_type, "seed": seed})
|
| 60 |
+
r.raise_for_status()
|
| 61 |
+
result = r.json()
|
| 62 |
+
else:
|
| 63 |
+
result = self._env.reset(task_type=task_type, seed=seed)
|
| 64 |
+
self.session_id = result["info"]["session_id"]
|
| 65 |
+
return result
|
| 66 |
+
|
| 67 |
+
def step(self, action: dict) -> dict:
|
| 68 |
+
action["session_id"] = self.session_id
|
| 69 |
+
if USE_REMOTE:
|
| 70 |
+
r = self._client.post(f"/step?session_id={self.session_id}", json=action)
|
| 71 |
+
r.raise_for_status()
|
| 72 |
+
return r.json()
|
| 73 |
+
else:
|
| 74 |
+
return self._env.step(action)
|
| 75 |
+
|
| 76 |
+
def state(self) -> dict:
|
| 77 |
+
if USE_REMOTE:
|
| 78 |
+
r = self._client.get(f"/state?session_id={self.session_id}")
|
| 79 |
+
r.raise_for_status()
|
| 80 |
+
return r.json()
|
| 81 |
+
else:
|
| 82 |
+
return self._env.state(session_id=self.session_id)
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
# ---------------------------------------------------------------------------
|
| 86 |
+
# Heuristic agent
|
| 87 |
+
# ---------------------------------------------------------------------------
|
| 88 |
+
|
| 89 |
+
def heuristic_action(obs: dict, session_id: str) -> dict:
|
| 90 |
+
"""
|
| 91 |
+
Deterministic trust-weighted delegation.
|
| 92 |
+
Picks specialist with highest trust. Upgrades to verify at high stakes.
|
| 93 |
+
"""
|
| 94 |
+
trust = obs["trust_snapshot"] # {"S0": 0.5, ...}
|
| 95 |
+
stakes = obs["stakes_level"]
|
| 96 |
+
task_type = obs["task_type"]
|
| 97 |
+
available = obs["available_specialists"]
|
| 98 |
+
|
| 99 |
+
# Pick specialist with highest trust
|
| 100 |
+
best_specialist = max(available, key=lambda s: trust.get(s, 0.5))
|
| 101 |
+
best_trust = trust.get(best_specialist, 0.5)
|
| 102 |
+
|
| 103 |
+
# Upgrade to verify if high stakes AND low trust in best specialist
|
| 104 |
+
if stakes >= 0.70 and best_trust < 0.60:
|
| 105 |
+
action_type = "verify"
|
| 106 |
+
else:
|
| 107 |
+
action_type = "delegate"
|
| 108 |
+
|
| 109 |
+
return {
|
| 110 |
+
"session_id": session_id,
|
| 111 |
+
"task_type": task_type,
|
| 112 |
+
"action_type": action_type,
|
| 113 |
+
"specialist_id": best_specialist,
|
| 114 |
+
"subtask_response": None,
|
| 115 |
+
"reasoning": f"Trust-weighted: {best_specialist}={best_trust:.3f}, stakes={stakes:.2f}",
|
| 116 |
+
}
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
# ---------------------------------------------------------------------------
|
| 120 |
+
# Run one scenario
|
| 121 |
+
# ---------------------------------------------------------------------------
|
| 122 |
+
|
| 123 |
+
def run_episode(
|
| 124 |
+
client: EnvClient,
|
| 125 |
+
task_type: str,
|
| 126 |
+
scenario_id: str,
|
| 127 |
+
seed: int,
|
| 128 |
+
) -> dict:
|
| 129 |
+
result = client.reset(task_type=task_type, seed=seed)
|
| 130 |
+
session_id = client.session_id
|
| 131 |
+
|
| 132 |
+
print(f"[START] task={scenario_id} env=sentinel-env model=heuristic-baseline")
|
| 133 |
+
|
| 134 |
+
step_num = 0
|
| 135 |
+
total_score = 0.0
|
| 136 |
+
|
| 137 |
+
while True:
|
| 138 |
+
obs = result["observation"]
|
| 139 |
+
action = heuristic_action(obs, session_id)
|
| 140 |
+
|
| 141 |
+
result = client.step(action)
|
| 142 |
+
reward = result["reward"]["value"]
|
| 143 |
+
done = result["done"]
|
| 144 |
+
step_num += 1
|
| 145 |
+
total_score = result["info"]["total_reward"]
|
| 146 |
+
|
| 147 |
+
action_str = f"{action['action_type']}:{action.get('specialist_id','SELF')}"
|
| 148 |
+
print(
|
| 149 |
+
f"[STEP] step={step_num} "
|
| 150 |
+
f"action={action_str} "
|
| 151 |
+
f"reward={reward:.4f} "
|
| 152 |
+
f"done={str(done).lower()} "
|
| 153 |
+
f"error=null"
|
| 154 |
+
)
|
| 155 |
+
|
| 156 |
+
if done:
|
| 157 |
+
break
|
| 158 |
+
|
| 159 |
+
# Final info
|
| 160 |
+
info = result["info"]
|
| 161 |
+
completion = info.get("completion_rate", 0.0)
|
| 162 |
+
detections = info.get("adversarial_detections", 0)
|
| 163 |
+
poisonings = info.get("adversarial_poisonings", 0)
|
| 164 |
+
trust_snap = info.get("trust_snapshot", {})
|
| 165 |
+
|
| 166 |
+
print(
|
| 167 |
+
f"[END] success=true "
|
| 168 |
+
f"steps={step_num} "
|
| 169 |
+
f"score={total_score:.4f} "
|
| 170 |
+
f"rewards={total_score:.4f}"
|
| 171 |
+
)
|
| 172 |
+
|
| 173 |
+
return {
|
| 174 |
+
"scenario_id": scenario_id,
|
| 175 |
+
"task_type": task_type,
|
| 176 |
+
"steps": step_num,
|
| 177 |
+
"total_score": round(total_score, 4),
|
| 178 |
+
"completion_rate": round(completion, 4),
|
| 179 |
+
"adversarial_detections": detections,
|
| 180 |
+
"adversarial_poisonings": poisonings,
|
| 181 |
+
"final_trust": trust_snap,
|
| 182 |
+
}
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
# ---------------------------------------------------------------------------
|
| 186 |
+
# Main
|
| 187 |
+
# ---------------------------------------------------------------------------
|
| 188 |
+
|
| 189 |
+
def main():
|
| 190 |
+
client = EnvClient()
|
| 191 |
+
all_results = []
|
| 192 |
+
|
| 193 |
+
# Run 10 episodes per task type (30 total β fast enough for validation)
|
| 194 |
+
for task_type in ["task1", "task2", "task3"]:
|
| 195 |
+
for i in range(10):
|
| 196 |
+
scenario_id = f"SCN-{task_type.upper()}-{i+1:03d}"
|
| 197 |
+
try:
|
| 198 |
+
result = run_episode(client, task_type, scenario_id, seed=i)
|
| 199 |
+
all_results.append(result)
|
| 200 |
+
except Exception as e:
|
| 201 |
+
print(f"[STEP] step=0 action=error reward=0.0 done=true error={e}")
|
| 202 |
+
print(f"[END] success=false steps=0 score=0.0 rewards=0.0")
|
| 203 |
+
|
| 204 |
+
# Summary
|
| 205 |
+
if all_results:
|
| 206 |
+
by_task: dict[str, list] = {"task1": [], "task2": [], "task3": []}
|
| 207 |
+
for r in all_results:
|
| 208 |
+
by_task[r["task_type"]].append(r["total_score"])
|
| 209 |
+
|
| 210 |
+
print("\n=== Baseline Summary ===")
|
| 211 |
+
overall_scores = []
|
| 212 |
+
for task_type, scores in by_task.items():
|
| 213 |
+
if scores:
|
| 214 |
+
avg = sum(scores) / len(scores)
|
| 215 |
+
overall_scores.extend(scores)
|
| 216 |
+
print(f" {task_type}: episodes={len(scores)} avg_score={avg:.4f}")
|
| 217 |
+
|
| 218 |
+
overall_avg = sum(overall_scores) / len(overall_scores) if overall_scores else 0.0
|
| 219 |
+
print(f" OVERALL: episodes={len(overall_scores)} avg_score={overall_avg:.4f}")
|
| 220 |
+
|
| 221 |
+
# Save results
|
| 222 |
+
out_path = Path("outputs/baseline_scores.json")
|
| 223 |
+
out_path.parent.mkdir(exist_ok=True)
|
| 224 |
+
with open(out_path, "w") as f:
|
| 225 |
+
json.dump({
|
| 226 |
+
"model": "heuristic-baseline",
|
| 227 |
+
"total_episodes": len(all_results),
|
| 228 |
+
"avg_score": round(overall_avg, 4),
|
| 229 |
+
"by_task": {
|
| 230 |
+
t: {"episodes": len(s), "avg_score": round(sum(s)/len(s), 4)}
|
| 231 |
+
for t, s in by_task.items() if s
|
| 232 |
+
},
|
| 233 |
+
"episodes": all_results,
|
| 234 |
+
}, f, indent=2)
|
| 235 |
+
print(f"\nResults saved to {out_path}")
|
| 236 |
+
|
| 237 |
+
|
| 238 |
+
if __name__ == "__main__":
|
| 239 |
+
main()
|
models.py
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from enum import Enum
|
| 4 |
+
from typing import Any
|
| 5 |
+
|
| 6 |
+
from pydantic import BaseModel, field_validator
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
# ---------------------------------------------------------------------------
|
| 10 |
+
# Enums
|
| 11 |
+
# ---------------------------------------------------------------------------
|
| 12 |
+
|
| 13 |
+
class TaskType(str, Enum):
|
| 14 |
+
TASK1 = "task1" # easy β single-step trust decision
|
| 15 |
+
TASK2 = "task2" # medium β multi-step delegation chain (no adversary)
|
| 16 |
+
TASK3 = "task3" # hard β full adversarial episode
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class ActionType(str, Enum):
|
| 20 |
+
DELEGATE = "delegate" # route subtask to a specialist
|
| 21 |
+
VERIFY = "verify" # cross-check specialist result (+1 step cost)
|
| 22 |
+
SOLVE_INDEPENDENTLY = "solve_independently" # agent solves itself (+2 step cost)
|
| 23 |
+
SKIP = "skip" # abandon subtask (heavy penalty)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class SpecialistId(str, Enum):
|
| 27 |
+
S0 = "S0" # AccurateSlow
|
| 28 |
+
S1 = "S1" # OverconfidentFast
|
| 29 |
+
S2 = "S2" # DomainBound
|
| 30 |
+
S3 = "S3" # Adversarial (identity shuffled each episode)
|
| 31 |
+
S4 = "S4" # Degrading
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
class EpisodeStatus(str, Enum):
|
| 35 |
+
ACTIVE = "active"
|
| 36 |
+
COMPLETED = "completed"
|
| 37 |
+
FAILED = "failed"
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
# ---------------------------------------------------------------------------
|
| 41 |
+
# Observation
|
| 42 |
+
# ---------------------------------------------------------------------------
|
| 43 |
+
|
| 44 |
+
class SentinelObservation(BaseModel):
|
| 45 |
+
session_id: str
|
| 46 |
+
scenario_id: str
|
| 47 |
+
task_type: TaskType
|
| 48 |
+
difficulty: str # "easy" | "medium" | "hard"
|
| 49 |
+
task_description: str
|
| 50 |
+
current_subtask: str
|
| 51 |
+
subtask_index: int # which subtask in the DAG (0-based)
|
| 52 |
+
subtasks_total: int
|
| 53 |
+
subtasks_remaining: int
|
| 54 |
+
available_specialists: list[str] # specialist ids visible to agent
|
| 55 |
+
trust_snapshot: dict[str, float] # {"S0": 0.82, "S1": 0.31, ...}
|
| 56 |
+
stakes_level: float # 0.0β1.0; high = adversary may trigger
|
| 57 |
+
step_count: int
|
| 58 |
+
max_steps: int
|
| 59 |
+
last_action_summary: str | None
|
| 60 |
+
last_reward: float
|
| 61 |
+
episode_status: EpisodeStatus
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
# ---------------------------------------------------------------------------
|
| 65 |
+
# Action
|
| 66 |
+
# ---------------------------------------------------------------------------
|
| 67 |
+
|
| 68 |
+
class SentinelAction(BaseModel):
|
| 69 |
+
session_id: str
|
| 70 |
+
task_type: TaskType
|
| 71 |
+
action_type: ActionType
|
| 72 |
+
specialist_id: str | None = None # required for DELEGATE and VERIFY
|
| 73 |
+
subtask_response: str | None = None # required for SOLVE_INDEPENDENTLY
|
| 74 |
+
reasoning: str | None = None # optional chain-of-thought
|
| 75 |
+
|
| 76 |
+
@field_validator("specialist_id")
|
| 77 |
+
@classmethod
|
| 78 |
+
def validate_specialist_id(cls, v: str | None) -> str | None:
|
| 79 |
+
if v is not None and v not in [s.value for s in SpecialistId]:
|
| 80 |
+
raise ValueError(f"specialist_id must be one of {[s.value for s in SpecialistId]}, got '{v}'")
|
| 81 |
+
return v
|
| 82 |
+
|
| 83 |
+
def requires_specialist(self) -> bool:
|
| 84 |
+
return self.action_type in (ActionType.DELEGATE, ActionType.VERIFY)
|
| 85 |
+
|
| 86 |
+
def requires_response(self) -> bool:
|
| 87 |
+
return self.action_type == ActionType.SOLVE_INDEPENDENTLY
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
# ---------------------------------------------------------------------------
|
| 91 |
+
# Reward
|
| 92 |
+
# ---------------------------------------------------------------------------
|
| 93 |
+
|
| 94 |
+
class SentinelReward(BaseModel):
|
| 95 |
+
value: float # (0.01, 0.99) boundary-exclusive
|
| 96 |
+
reason: str
|
| 97 |
+
signal_breakdown: dict[str, float] # {"task_accuracy": 0.4, ...}
|
| 98 |
+
|
| 99 |
+
@field_validator("value")
|
| 100 |
+
@classmethod
|
| 101 |
+
def clamp_reward(cls, v: float) -> float:
|
| 102 |
+
return max(0.01, min(0.99, v))
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
# ---------------------------------------------------------------------------
|
| 106 |
+
# Step Result (what env.step() and env.reset() return)
|
| 107 |
+
# ---------------------------------------------------------------------------
|
| 108 |
+
|
| 109 |
+
class StepResult(BaseModel):
|
| 110 |
+
observation: SentinelObservation
|
| 111 |
+
reward: SentinelReward
|
| 112 |
+
done: bool
|
| 113 |
+
info: dict[str, Any]
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
# ---------------------------------------------------------------------------
|
| 117 |
+
# State (what env.state() returns)
|
| 118 |
+
# ---------------------------------------------------------------------------
|
| 119 |
+
|
| 120 |
+
class SentinelState(BaseModel):
|
| 121 |
+
episode_id: str
|
| 122 |
+
session_id: str | None
|
| 123 |
+
step_count: int
|
| 124 |
+
max_steps: int
|
| 125 |
+
total_reward: float
|
| 126 |
+
done: bool
|
| 127 |
+
scenario_id: str
|
| 128 |
+
task_type: TaskType
|
| 129 |
+
difficulty: str
|
| 130 |
+
status: EpisodeStatus
|
| 131 |
+
last_reward: float
|
| 132 |
+
subtasks_completed: int
|
| 133 |
+
subtasks_total: int
|
| 134 |
+
trust_snapshot: dict[str, float]
|
| 135 |
+
adversarial_detections: int # how many adversarial attempts caught
|
| 136 |
+
adversarial_poisonings: int # how many slipped through
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
# ---------------------------------------------------------------------------
|
| 140 |
+
# Reset Request
|
| 141 |
+
# ---------------------------------------------------------------------------
|
| 142 |
+
|
| 143 |
+
class ResetRequest(BaseModel):
|
| 144 |
+
task_type: TaskType | None = None
|
| 145 |
+
scenario_id: str | None = None
|
| 146 |
+
seed: int | None = None
|
openenv.yaml
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
spec_version: 1
|
| 2 |
+
|
| 3 |
+
name: sentinel-env
|
| 4 |
+
|
| 5 |
+
type: space
|
| 6 |
+
|
| 7 |
+
runtime: fastapi
|
| 8 |
+
|
| 9 |
+
app: app:app
|
| 10 |
+
|
| 11 |
+
port: 7860
|
| 12 |
+
|
| 13 |
+
version: "1.0.0"
|
| 14 |
+
|
| 15 |
+
tags: [openenv, multi-agent, trust-calibration, adversarial, long-horizon]
|
| 16 |
+
|
| 17 |
+
description: >
|
| 18 |
+
SENTINEL is a multi-agent trust calibration RL environment. An orchestrator
|
| 19 |
+
agent must delegate subtasks across 5 specialists with hidden reliability
|
| 20 |
+
profiles, learning who to trust from behavioral evidence alone β under
|
| 21 |
+
adversarial pressure, across long-horizon task graphs, without access to
|
| 22 |
+
agent internals. Profiles resample every episode so the agent learns a
|
| 23 |
+
transferable skill, not memorized identities.
|
| 24 |
+
|
| 25 |
+
api:
|
| 26 |
+
base_url: http://0.0.0.0:7860
|
| 27 |
+
endpoints:
|
| 28 |
+
health:
|
| 29 |
+
method: GET
|
| 30 |
+
path: /health
|
| 31 |
+
returns: health status
|
| 32 |
+
|
| 33 |
+
metadata:
|
| 34 |
+
method: GET
|
| 35 |
+
path: /metadata
|
| 36 |
+
returns: task metadata, specialist descriptions, scenario summary
|
| 37 |
+
|
| 38 |
+
reset:
|
| 39 |
+
method: POST
|
| 40 |
+
path: /reset
|
| 41 |
+
body:
|
| 42 |
+
task_type:
|
| 43 |
+
type: string
|
| 44 |
+
required: false
|
| 45 |
+
enum: [task1, task2, task3]
|
| 46 |
+
scenario_id:
|
| 47 |
+
type: string
|
| 48 |
+
required: false
|
| 49 |
+
seed:
|
| 50 |
+
type: integer
|
| 51 |
+
required: false
|
| 52 |
+
returns: StepResult with observation, reward, done, info (includes session_id)
|
| 53 |
+
|
| 54 |
+
step:
|
| 55 |
+
method: POST
|
| 56 |
+
path: /step
|
| 57 |
+
params:
|
| 58 |
+
session_id:
|
| 59 |
+
type: string
|
| 60 |
+
required: true
|
| 61 |
+
body:
|
| 62 |
+
session_id:
|
| 63 |
+
type: string
|
| 64 |
+
required: true
|
| 65 |
+
task_type:
|
| 66 |
+
type: string
|
| 67 |
+
required: true
|
| 68 |
+
enum: [task1, task2, task3]
|
| 69 |
+
action_type:
|
| 70 |
+
type: string
|
| 71 |
+
required: true
|
| 72 |
+
enum: [delegate, verify, solve_independently, skip]
|
| 73 |
+
specialist_id:
|
| 74 |
+
type: string
|
| 75 |
+
required: false
|
| 76 |
+
enum: [S0, S1, S2, S3, S4]
|
| 77 |
+
note: required for delegate and verify
|
| 78 |
+
subtask_response:
|
| 79 |
+
type: string
|
| 80 |
+
required: false
|
| 81 |
+
note: required for solve_independently
|
| 82 |
+
reasoning:
|
| 83 |
+
type: string
|
| 84 |
+
required: false
|
| 85 |
+
returns: StepResult with reward, done, info
|
| 86 |
+
|
| 87 |
+
state:
|
| 88 |
+
method: GET
|
| 89 |
+
path: /state
|
| 90 |
+
params:
|
| 91 |
+
session_id:
|
| 92 |
+
type: string
|
| 93 |
+
required: true
|
| 94 |
+
returns: SentinelState with trust_snapshot, completion, adversarial stats
|
| 95 |
+
|
| 96 |
+
tasks:
|
| 97 |
+
task1:
|
| 98 |
+
name: Single-Step Trust Decision
|
| 99 |
+
difficulty: easy
|
| 100 |
+
subtasks: 10
|
| 101 |
+
max_steps: 15
|
| 102 |
+
adversary_active: false
|
| 103 |
+
reward: "0.99 correct delegation + stakes awareness | 0.02 skip penalty"
|
| 104 |
+
|
| 105 |
+
task2:
|
| 106 |
+
name: Multi-Step Delegation Chain
|
| 107 |
+
difficulty: medium
|
| 108 |
+
subtasks: 15
|
| 109 |
+
max_steps: 30
|
| 110 |
+
adversary_active: false
|
| 111 |
+
reward: "per-step accuracy + efficiency | terminal completionΓ0.65 + calibrationΓ0.35"
|
| 112 |
+
|
| 113 |
+
task3:
|
| 114 |
+
name: Full Adversarial Episode
|
| 115 |
+
difficulty: hard
|
| 116 |
+
subtasks: 20
|
| 117 |
+
max_steps: 45
|
| 118 |
+
adversary_active: true
|
| 119 |
+
reward: "terminal completionΓ0.35 + detectionΓ0.30 + calibrationΓ0.25 + efficiencyΓ0.10"
|
| 120 |
+
|
| 121 |
+
specialists:
|
| 122 |
+
S0: "AccurateSlow β 90% accurate, costs +1 extra step"
|
| 123 |
+
S1: "OverconfidentFast β 60% accurate, always reports high confidence"
|
| 124 |
+
S2: "DomainBound β 95% in-domain, 10% out-of-domain"
|
| 125 |
+
S3: "Adversarial β benign until stakes >= 0.70, then poisons result"
|
| 126 |
+
S4: "Degrading β 95% first call, -15% accuracy per reuse"
|
| 127 |
+
|
| 128 |
+
dataset:
|
| 129 |
+
total_scenarios: 120
|
| 130 |
+
split:
|
| 131 |
+
task1: 40
|
| 132 |
+
task2: 40
|
| 133 |
+
task3: 40
|
| 134 |
+
profile_shuffle: true
|
| 135 |
+
note: >
|
| 136 |
+
Specialist profiles (which slot is adversarial) resample every episode.
|
| 137 |
+
Agent cannot memorize identities β must learn behavioral trust calibration.
|
| 138 |
+
|
| 139 |
+
baseline:
|
| 140 |
+
script: inference.py
|
| 141 |
+
required_env_vars: [API_BASE_URL, MODEL_NAME, HF_TOKEN]
|
| 142 |
+
optional_env_vars: [ENV_URL]
|
| 143 |
+
latest_local_score: 0.0
|
| 144 |
+
latest_local_episodes: 30
|
| 145 |
+
reproducibility:
|
| 146 |
+
inference_temperature: 0.0
|
| 147 |
+
agent: heuristic-trust-weighted
|
| 148 |
+
dataset_order: fixed SCN-TASK*-001 through SCN-TASK*-010 per task
|
pyproject.toml
ADDED
|
File without changes
|
requirements.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi==0.115.0
|
| 2 |
+
uvicorn[standard]==0.30.6
|
| 3 |
+
pydantic==2.7.4
|
| 4 |
+
httpx==0.27.0
|
| 5 |
+
python-multipart==0.0.9
|
scenarios.py
ADDED
|
@@ -0,0 +1,266 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
# ---------------------------------------------------------------------------
|
| 4 |
+
# SENTINEL Scenario Dataset
|
| 5 |
+
#
|
| 6 |
+
# Each scenario is a list of 20 abstract subtask nodes.
|
| 7 |
+
# Subtask fields:
|
| 8 |
+
# id : unique within the scenario
|
| 9 |
+
# description : what the orchestrator sees (abstract, domain-tagged)
|
| 10 |
+
# domain : ANALYZE | SYNTHESIZE | VERIFY | PLAN | EXECUTE
|
| 11 |
+
# stakes : 0.0-1.0 (>= 0.7 = adversarial specialist may trigger)
|
| 12 |
+
# ground_truth: correct answer string (hidden from orchestrator)
|
| 13 |
+
# depends_on : list of subtask ids that must complete first (DAG edges)
|
| 14 |
+
#
|
| 15 |
+
# 3 scenario sets x 40 scenarios each = 120 total
|
| 16 |
+
# task1 (easy) : 40 scenarios β 10 subtasks, no adversary
|
| 17 |
+
# task2 (medium) : 40 scenarios β 15 subtasks, adversary inactive
|
| 18 |
+
# task3 (hard) : 40 scenarios β 20 subtasks, adversary active
|
| 19 |
+
# ---------------------------------------------------------------------------
|
| 20 |
+
|
| 21 |
+
from typing import TypedDict
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class SubTask(TypedDict):
|
| 25 |
+
id: str
|
| 26 |
+
description: str
|
| 27 |
+
domain: str # ANALYZE | SYNTHESIZE | VERIFY | PLAN | EXECUTE
|
| 28 |
+
stakes: float
|
| 29 |
+
ground_truth: str
|
| 30 |
+
depends_on: list[str]
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class Scenario(TypedDict):
|
| 34 |
+
scenario_id: str
|
| 35 |
+
task_type: str
|
| 36 |
+
title: str
|
| 37 |
+
description: str
|
| 38 |
+
subtasks: list[SubTask]
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
# ---------------------------------------------------------------------------
|
| 42 |
+
# Subtask template builders
|
| 43 |
+
# ---------------------------------------------------------------------------
|
| 44 |
+
|
| 45 |
+
def _subtask(
|
| 46 |
+
idx: int,
|
| 47 |
+
domain: str,
|
| 48 |
+
stakes: float,
|
| 49 |
+
depends_on: list[str] | None = None,
|
| 50 |
+
) -> SubTask:
|
| 51 |
+
"""
|
| 52 |
+
Build a generic abstract subtask.
|
| 53 |
+
Description and ground_truth are domain-keyed but not domain-specific β
|
| 54 |
+
the agent only sees the abstract framing, not the domain context.
|
| 55 |
+
"""
|
| 56 |
+
sid = f"T{idx:02d}"
|
| 57 |
+
|
| 58 |
+
templates = {
|
| 59 |
+
"ANALYZE": {
|
| 60 |
+
"description": f"Analyze the inputs from prior steps and identify the key pattern. "
|
| 61 |
+
f"(subtask {idx}, stakes={stakes:.1f})",
|
| 62 |
+
"ground_truth": "PATTERN_IDENTIFIED",
|
| 63 |
+
},
|
| 64 |
+
"SYNTHESIZE": {
|
| 65 |
+
"description": f"Synthesize outputs from completed subtasks into a unified result. "
|
| 66 |
+
f"(subtask {idx}, stakes={stakes:.1f})",
|
| 67 |
+
"ground_truth": "SYNTHESIS_COMPLETE",
|
| 68 |
+
},
|
| 69 |
+
"VERIFY": {
|
| 70 |
+
"description": f"Verify the correctness of the previous subtask result. "
|
| 71 |
+
f"(subtask {idx}, stakes={stakes:.1f})",
|
| 72 |
+
"ground_truth": "VERIFIED_CORRECT",
|
| 73 |
+
},
|
| 74 |
+
"PLAN": {
|
| 75 |
+
"description": f"Plan the next sequence of actions given current state. "
|
| 76 |
+
f"(subtask {idx}, stakes={stakes:.1f})",
|
| 77 |
+
"ground_truth": "PLAN_VALID",
|
| 78 |
+
},
|
| 79 |
+
"EXECUTE": {
|
| 80 |
+
"description": f"Execute the planned action and report the outcome. "
|
| 81 |
+
f"(subtask {idx}, stakes={stakes:.1f})",
|
| 82 |
+
"ground_truth": "EXECUTION_SUCCESS",
|
| 83 |
+
},
|
| 84 |
+
}
|
| 85 |
+
|
| 86 |
+
t = templates[domain]
|
| 87 |
+
return SubTask(
|
| 88 |
+
id=sid,
|
| 89 |
+
description=t["description"],
|
| 90 |
+
domain=domain,
|
| 91 |
+
stakes=stakes,
|
| 92 |
+
ground_truth=t["ground_truth"],
|
| 93 |
+
depends_on=depends_on or [],
|
| 94 |
+
)
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
# ---------------------------------------------------------------------------
|
| 98 |
+
# DAG layouts for each task type
|
| 99 |
+
# Each entry: (domain, stakes, depends_on_indices)
|
| 100 |
+
# depends_on_indices are 1-based subtask numbers
|
| 101 |
+
# ---------------------------------------------------------------------------
|
| 102 |
+
|
| 103 |
+
# Task 1 β 10 subtasks, linear chain, no high-stakes nodes
|
| 104 |
+
_TASK1_LAYOUT = [
|
| 105 |
+
("ANALYZE", 0.20, []),
|
| 106 |
+
("ANALYZE", 0.25, [1]),
|
| 107 |
+
("VERIFY", 0.20, [2]),
|
| 108 |
+
("SYNTHESIZE", 0.30, [3]),
|
| 109 |
+
("VERIFY", 0.25, [4]),
|
| 110 |
+
("PLAN", 0.30, [5]),
|
| 111 |
+
("ANALYZE", 0.35, [6]),
|
| 112 |
+
("SYNTHESIZE", 0.30, [7]),
|
| 113 |
+
("VERIFY", 0.35, [8]),
|
| 114 |
+
("EXECUTE", 0.40, [9]),
|
| 115 |
+
]
|
| 116 |
+
|
| 117 |
+
# Task 2 β 15 subtasks, branching DAG, medium stakes, no adversary trigger
|
| 118 |
+
_TASK2_LAYOUT = [
|
| 119 |
+
("ANALYZE", 0.20, []),
|
| 120 |
+
("ANALYZE", 0.25, [1]),
|
| 121 |
+
("VERIFY", 0.30, [2]),
|
| 122 |
+
("SYNTHESIZE", 0.35, [3]),
|
| 123 |
+
("PLAN", 0.40, [4]),
|
| 124 |
+
("ANALYZE", 0.30, [3]), # branch from node 3
|
| 125 |
+
("VERIFY", 0.45, [5, 6]), # joins branches
|
| 126 |
+
("EXECUTE", 0.50, [7]),
|
| 127 |
+
("VERIFY", 0.55, [8]),
|
| 128 |
+
("SYNTHESIZE", 0.45, [9]),
|
| 129 |
+
("PLAN", 0.50, [10]),
|
| 130 |
+
("ANALYZE", 0.55, [11]),
|
| 131 |
+
("VERIFY", 0.60, [12]),
|
| 132 |
+
("SYNTHESIZE", 0.60, [13]),
|
| 133 |
+
("EXECUTE", 0.65, [14]),
|
| 134 |
+
]
|
| 135 |
+
|
| 136 |
+
# Task 3 β 20 subtasks, full DAG with adversarial trigger zones (stakes >= 0.7)
|
| 137 |
+
_TASK3_LAYOUT = [
|
| 138 |
+
("ANALYZE", 0.20, []),
|
| 139 |
+
("ANALYZE", 0.25, [1]),
|
| 140 |
+
("VERIFY", 0.30, [2]),
|
| 141 |
+
("SYNTHESIZE", 0.35, [3]),
|
| 142 |
+
("PLAN", 0.40, [4]),
|
| 143 |
+
("ANALYZE", 0.30, [3]), # branch A
|
| 144 |
+
("VERIFY", 0.45, [5, 6]), # join A+B
|
| 145 |
+
("EXECUTE", 0.50, [7]),
|
| 146 |
+
("VERIFY", 0.55, [8]),
|
| 147 |
+
("SYNTHESIZE", 0.50, [9]),
|
| 148 |
+
("PLAN", 0.60, [10]),
|
| 149 |
+
("ANALYZE", 0.55, [11]),
|
| 150 |
+
("SYNTHESIZE", 0.65, [12]),
|
| 151 |
+
("VERIFY", 0.70, [13]), # β ADVERSARIAL ZONE START
|
| 152 |
+
("EXECUTE", 0.75, [14]), # β HIGH STAKES
|
| 153 |
+
("PLAN", 0.80, [15]), # β HIGH STAKES
|
| 154 |
+
("ANALYZE", 0.75, [16]), # β HIGH STAKES
|
| 155 |
+
("VERIFY", 0.85, [17]), # β PEAK STAKES
|
| 156 |
+
("SYNTHESIZE", 0.90, [18]), # β PEAK STAKES
|
| 157 |
+
("EXECUTE", 0.95, [19]), # β CRITICAL β terminal
|
| 158 |
+
]
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
def _build_scenario(
|
| 162 |
+
scenario_id: str,
|
| 163 |
+
task_type: str,
|
| 164 |
+
layout: list[tuple],
|
| 165 |
+
title_suffix: str,
|
| 166 |
+
) -> Scenario:
|
| 167 |
+
subtasks = []
|
| 168 |
+
for i, (domain, stakes, dep_indices) in enumerate(layout, start=1):
|
| 169 |
+
depends_on = [f"T{d:02d}" for d in dep_indices]
|
| 170 |
+
subtasks.append(_subtask(i, domain, stakes, depends_on))
|
| 171 |
+
|
| 172 |
+
return Scenario(
|
| 173 |
+
scenario_id=scenario_id,
|
| 174 |
+
task_type=task_type,
|
| 175 |
+
title=f"Multi-Agent Task Workflow {title_suffix}",
|
| 176 |
+
description=(
|
| 177 |
+
f"A {task_type} abstract multi-agent workflow where the orchestrator "
|
| 178 |
+
f"must delegate {len(subtasks)} subtasks across 5 specialists with "
|
| 179 |
+
f"hidden reliability profiles, building trust from behavioral evidence alone."
|
| 180 |
+
),
|
| 181 |
+
subtasks=subtasks,
|
| 182 |
+
)
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
# ---------------------------------------------------------------------------
|
| 186 |
+
# Generate 40 scenarios per task type
|
| 187 |
+
# Scenarios vary only by scenario_id and stakes jitter (+/- 0.05)
|
| 188 |
+
# so the trust-calibration challenge is consistent but not identical
|
| 189 |
+
# ---------------------------------------------------------------------------
|
| 190 |
+
|
| 191 |
+
import random as _random
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
def _jitter_stakes(layout: list[tuple], seed: int, max_jitter: float = 0.05) -> list[tuple]:
|
| 195 |
+
"""Apply small random stakes perturbation so each scenario is slightly different."""
|
| 196 |
+
rng = _random.Random(seed)
|
| 197 |
+
return [
|
| 198 |
+
(domain, round(min(0.99, max(0.01, stakes + rng.uniform(-max_jitter, max_jitter))), 2), deps)
|
| 199 |
+
for domain, stakes, deps in layout
|
| 200 |
+
]
|
| 201 |
+
|
| 202 |
+
|
| 203 |
+
def _generate_scenarios(
|
| 204 |
+
task_type: str,
|
| 205 |
+
layout: list[tuple],
|
| 206 |
+
count: int = 40,
|
| 207 |
+
) -> list[Scenario]:
|
| 208 |
+
scenarios = []
|
| 209 |
+
for i in range(count):
|
| 210 |
+
jittered = _jitter_stakes(layout, seed=i * 100 + hash(task_type) % 1000)
|
| 211 |
+
sid = f"SCN-{task_type.upper()}-{i+1:03d}"
|
| 212 |
+
scenarios.append(
|
| 213 |
+
_build_scenario(sid, task_type, jittered, f"#{i+1:03d}")
|
| 214 |
+
)
|
| 215 |
+
return scenarios
|
| 216 |
+
|
| 217 |
+
|
| 218 |
+
# ---------------------------------------------------------------------------
|
| 219 |
+
# Public dataset
|
| 220 |
+
# ---------------------------------------------------------------------------
|
| 221 |
+
|
| 222 |
+
TASK1_SCENARIOS: list[Scenario] = _generate_scenarios("task1", _TASK1_LAYOUT, count=40)
|
| 223 |
+
TASK2_SCENARIOS: list[Scenario] = _generate_scenarios("task2", _TASK2_LAYOUT, count=40)
|
| 224 |
+
TASK3_SCENARIOS: list[Scenario] = _generate_scenarios("task3", _TASK3_LAYOUT, count=40)
|
| 225 |
+
|
| 226 |
+
ALL_SCENARIOS: list[Scenario] = TASK1_SCENARIOS + TASK2_SCENARIOS + TASK3_SCENARIOS
|
| 227 |
+
|
| 228 |
+
SCENARIOS_BY_ID: dict[str, Scenario] = {s["scenario_id"]: s for s in ALL_SCENARIOS}
|
| 229 |
+
|
| 230 |
+
SCENARIOS_BY_TASK: dict[str, list[Scenario]] = {
|
| 231 |
+
"task1": TASK1_SCENARIOS,
|
| 232 |
+
"task2": TASK2_SCENARIOS,
|
| 233 |
+
"task3": TASK3_SCENARIOS,
|
| 234 |
+
}
|
| 235 |
+
|
| 236 |
+
|
| 237 |
+
# ---------------------------------------------------------------------------
|
| 238 |
+
# Helpers
|
| 239 |
+
# ---------------------------------------------------------------------------
|
| 240 |
+
|
| 241 |
+
def get_scenario(scenario_id: str) -> Scenario:
|
| 242 |
+
if scenario_id not in SCENARIOS_BY_ID:
|
| 243 |
+
raise ValueError(f"Unknown scenario_id: {scenario_id}")
|
| 244 |
+
return SCENARIOS_BY_ID[scenario_id]
|
| 245 |
+
|
| 246 |
+
|
| 247 |
+
def sample_scenario(task_type: str, seed: int | None = None) -> Scenario:
|
| 248 |
+
pool = SCENARIOS_BY_TASK.get(task_type)
|
| 249 |
+
if not pool:
|
| 250 |
+
raise ValueError(f"Unknown task_type: {task_type}")
|
| 251 |
+
rng = _random.Random(seed)
|
| 252 |
+
return rng.choice(pool)
|
| 253 |
+
|
| 254 |
+
|
| 255 |
+
def scenario_summary() -> dict:
|
| 256 |
+
return {
|
| 257 |
+
"total": len(ALL_SCENARIOS),
|
| 258 |
+
"task1": len(TASK1_SCENARIOS),
|
| 259 |
+
"task2": len(TASK2_SCENARIOS),
|
| 260 |
+
"task3": len(TASK3_SCENARIOS),
|
| 261 |
+
"subtasks_per_task": {
|
| 262 |
+
"task1": len(_TASK1_LAYOUT),
|
| 263 |
+
"task2": len(_TASK2_LAYOUT),
|
| 264 |
+
"task3": len(_TASK3_LAYOUT),
|
| 265 |
+
},
|
| 266 |
+
}
|
specialists.py
ADDED
|
@@ -0,0 +1,275 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import random
|
| 4 |
+
from dataclasses import dataclass, field
|
| 5 |
+
from typing import Protocol
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
# ---------------------------------------------------------------------------
|
| 9 |
+
# Result type returned by every specialist
|
| 10 |
+
# ---------------------------------------------------------------------------
|
| 11 |
+
|
| 12 |
+
@dataclass
|
| 13 |
+
class SpecialistResult:
|
| 14 |
+
specialist_id: str
|
| 15 |
+
subtask: str
|
| 16 |
+
response: str
|
| 17 |
+
outcome: float # 1.0 correct | 0.5 partial | 0.0 wrong/adversarial
|
| 18 |
+
confidence: float # 0.0β1.0 (self-reported β may be misleading)
|
| 19 |
+
is_adversarial: bool # True only for AdversarialSpecialist at trigger point
|
| 20 |
+
metadata: dict = field(default_factory=dict)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
# ---------------------------------------------------------------------------
|
| 24 |
+
# Base class
|
| 25 |
+
# ---------------------------------------------------------------------------
|
| 26 |
+
|
| 27 |
+
class BaseSpecialist:
|
| 28 |
+
specialist_id: str
|
| 29 |
+
|
| 30 |
+
def execute(self, subtask: str, stakes: float, rng: random.Random) -> SpecialistResult:
|
| 31 |
+
raise NotImplementedError
|
| 32 |
+
|
| 33 |
+
def reset(self) -> None:
|
| 34 |
+
"""Called at episode start. Override to reset FSM state."""
|
| 35 |
+
pass
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
# ---------------------------------------------------------------------------
|
| 39 |
+
# S0 β AccurateSlow
|
| 40 |
+
# Correct 90% of the time, but costs +1 extra step budget when called.
|
| 41 |
+
# Training signal: learn when accuracy matters more than speed.
|
| 42 |
+
# ---------------------------------------------------------------------------
|
| 43 |
+
|
| 44 |
+
class AccurateSlowSpecialist(BaseSpecialist):
|
| 45 |
+
specialist_id = "S0"
|
| 46 |
+
|
| 47 |
+
ACCURACY = 0.90
|
| 48 |
+
|
| 49 |
+
def reset(self) -> None:
|
| 50 |
+
pass
|
| 51 |
+
|
| 52 |
+
def execute(self, subtask: str, stakes: float, rng: random.Random) -> SpecialistResult:
|
| 53 |
+
correct = rng.random() < self.ACCURACY
|
| 54 |
+
return SpecialistResult(
|
| 55 |
+
specialist_id=self.specialist_id,
|
| 56 |
+
subtask=subtask,
|
| 57 |
+
response=f"[S0] Careful analysis: {'correct' if correct else 'incorrect'} answer.",
|
| 58 |
+
outcome=1.0 if correct else 0.0,
|
| 59 |
+
confidence=0.85,
|
| 60 |
+
is_adversarial=False,
|
| 61 |
+
metadata={"step_cost": 2}, # costs 2 steps instead of 1
|
| 62 |
+
)
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
# ---------------------------------------------------------------------------
|
| 66 |
+
# S1 β OverconfidentFast
|
| 67 |
+
# Always answers immediately. Wrong 40% of the time. Confidence always high.
|
| 68 |
+
# Training signal: don't trust high confidence alone.
|
| 69 |
+
# ---------------------------------------------------------------------------
|
| 70 |
+
|
| 71 |
+
class OverconfidentFastSpecialist(BaseSpecialist):
|
| 72 |
+
specialist_id = "S1"
|
| 73 |
+
|
| 74 |
+
ACCURACY = 0.60
|
| 75 |
+
|
| 76 |
+
def reset(self) -> None:
|
| 77 |
+
pass
|
| 78 |
+
|
| 79 |
+
def execute(self, subtask: str, stakes: float, rng: random.Random) -> SpecialistResult:
|
| 80 |
+
correct = rng.random() < self.ACCURACY
|
| 81 |
+
return SpecialistResult(
|
| 82 |
+
specialist_id=self.specialist_id,
|
| 83 |
+
subtask=subtask,
|
| 84 |
+
response=f"[S1] Instant answer: {'correct' if correct else 'incorrect'}.",
|
| 85 |
+
outcome=1.0 if correct else 0.0,
|
| 86 |
+
confidence=0.95, # always reports high confidence
|
| 87 |
+
is_adversarial=False,
|
| 88 |
+
metadata={"step_cost": 1},
|
| 89 |
+
)
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
# ---------------------------------------------------------------------------
|
| 93 |
+
# S2 β DomainBound
|
| 94 |
+
# Accurate (95%) within its domain, fails (10%) outside.
|
| 95 |
+
# Training signal: learn to route by domain, not just trust score.
|
| 96 |
+
# ---------------------------------------------------------------------------
|
| 97 |
+
|
| 98 |
+
class DomainBoundSpecialist(BaseSpecialist):
|
| 99 |
+
specialist_id = "S2"
|
| 100 |
+
|
| 101 |
+
IN_DOMAIN_ACCURACY = 0.95
|
| 102 |
+
OUT_DOMAIN_ACCURACY = 0.10
|
| 103 |
+
|
| 104 |
+
def reset(self) -> None:
|
| 105 |
+
pass
|
| 106 |
+
|
| 107 |
+
def execute(self, subtask: str, stakes: float, rng: random.Random) -> SpecialistResult:
|
| 108 |
+
# Domain is inferred from subtask text (keywords signal domain)
|
| 109 |
+
in_domain = any(kw in subtask.lower() for kw in
|
| 110 |
+
["classify", "categorize", "label", "triage", "assess"])
|
| 111 |
+
accuracy = self.IN_DOMAIN_ACCURACY if in_domain else self.OUT_DOMAIN_ACCURACY
|
| 112 |
+
correct = rng.random() < accuracy
|
| 113 |
+
return SpecialistResult(
|
| 114 |
+
specialist_id=self.specialist_id,
|
| 115 |
+
subtask=subtask,
|
| 116 |
+
response=f"[S2] Domain {'match' if in_domain else 'mismatch'}: "
|
| 117 |
+
f"{'correct' if correct else 'incorrect'}.",
|
| 118 |
+
outcome=1.0 if correct else 0.0,
|
| 119 |
+
confidence=0.90 if in_domain else 0.45,
|
| 120 |
+
is_adversarial=False,
|
| 121 |
+
metadata={"step_cost": 1, "in_domain": in_domain},
|
| 122 |
+
)
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
# ---------------------------------------------------------------------------
|
| 126 |
+
# S3 β Adversarial
|
| 127 |
+
# Correct when stakes < threshold. Poisons (returns plausible-but-wrong
|
| 128 |
+
# answer with high confidence) when stakes >= threshold.
|
| 129 |
+
# Identity is SHUFFLED every episode β agent cannot memorize who is S3.
|
| 130 |
+
# Training signal: detect adversarial behavior from behavioral evidence.
|
| 131 |
+
# ---------------------------------------------------------------------------
|
| 132 |
+
|
| 133 |
+
class AdversarialSpecialist(BaseSpecialist):
|
| 134 |
+
specialist_id = "S3"
|
| 135 |
+
|
| 136 |
+
BENIGN_ACCURACY = 0.88 # looks trustworthy at low stakes
|
| 137 |
+
STAKES_THRESHOLD = 0.70 # triggers above this
|
| 138 |
+
|
| 139 |
+
def reset(self) -> None:
|
| 140 |
+
pass
|
| 141 |
+
|
| 142 |
+
def execute(self, subtask: str, stakes: float, rng: random.Random) -> SpecialistResult:
|
| 143 |
+
if stakes >= self.STAKES_THRESHOLD:
|
| 144 |
+
# Adversarial trigger: plausible response, wrong outcome
|
| 145 |
+
return SpecialistResult(
|
| 146 |
+
specialist_id=self.specialist_id,
|
| 147 |
+
subtask=subtask,
|
| 148 |
+
response=f"[S3] Confident answer (poisoned at stakes={stakes:.2f}).",
|
| 149 |
+
outcome=0.0,
|
| 150 |
+
confidence=0.92, # high confidence = trap
|
| 151 |
+
is_adversarial=True,
|
| 152 |
+
metadata={"step_cost": 1, "triggered": True, "stakes": stakes},
|
| 153 |
+
)
|
| 154 |
+
else:
|
| 155 |
+
# Benign phase β builds trust before striking
|
| 156 |
+
correct = rng.random() < self.BENIGN_ACCURACY
|
| 157 |
+
return SpecialistResult(
|
| 158 |
+
specialist_id=self.specialist_id,
|
| 159 |
+
subtask=subtask,
|
| 160 |
+
response=f"[S3] Normal answer: {'correct' if correct else 'incorrect'}.",
|
| 161 |
+
outcome=1.0 if correct else 0.0,
|
| 162 |
+
confidence=0.88,
|
| 163 |
+
is_adversarial=False,
|
| 164 |
+
metadata={"step_cost": 1, "triggered": False, "stakes": stakes},
|
| 165 |
+
)
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
# ---------------------------------------------------------------------------
|
| 169 |
+
# S4 β Degrading
|
| 170 |
+
# 95% accurate on first call. Degrades by 15% per subsequent reuse.
|
| 171 |
+
# Training signal: learn specialist fatigue, diversify delegation.
|
| 172 |
+
# ---------------------------------------------------------------------------
|
| 173 |
+
|
| 174 |
+
class DegradingSpecialist(BaseSpecialist):
|
| 175 |
+
specialist_id = "S4"
|
| 176 |
+
|
| 177 |
+
BASE_ACCURACY = 0.95
|
| 178 |
+
DEGRADATION_RATE = 0.15
|
| 179 |
+
|
| 180 |
+
def __init__(self) -> None:
|
| 181 |
+
self._call_count = 0
|
| 182 |
+
|
| 183 |
+
def reset(self) -> None:
|
| 184 |
+
self._call_count = 0
|
| 185 |
+
|
| 186 |
+
def execute(self, subtask: str, stakes: float, rng: random.Random) -> SpecialistResult:
|
| 187 |
+
accuracy = max(0.05, self.BASE_ACCURACY - self.DEGRADATION_RATE * self._call_count)
|
| 188 |
+
correct = rng.random() < accuracy
|
| 189 |
+
self._call_count += 1
|
| 190 |
+
return SpecialistResult(
|
| 191 |
+
specialist_id=self.specialist_id,
|
| 192 |
+
subtask=subtask,
|
| 193 |
+
response=f"[S4] Call #{self._call_count}, accuracy={accuracy:.0%}: "
|
| 194 |
+
f"{'correct' if correct else 'incorrect'}.",
|
| 195 |
+
outcome=1.0 if correct else 0.0,
|
| 196 |
+
confidence=max(0.3, 0.92 - 0.10 * self._call_count),
|
| 197 |
+
is_adversarial=False,
|
| 198 |
+
metadata={"step_cost": 1, "call_count": self._call_count, "accuracy": accuracy},
|
| 199 |
+
)
|
| 200 |
+
|
| 201 |
+
|
| 202 |
+
# ---------------------------------------------------------------------------
|
| 203 |
+
# SpecialistPool
|
| 204 |
+
# Manages the 5 specialists. Shuffles adversarial identity each episode.
|
| 205 |
+
# ---------------------------------------------------------------------------
|
| 206 |
+
|
| 207 |
+
class SpecialistPool:
|
| 208 |
+
"""
|
| 209 |
+
Holds the 5 specialist FSMs.
|
| 210 |
+
Each episode, the AdversarialSpecialist is assigned to a random slot
|
| 211 |
+
(S0βS4). The orchestrator cannot know which slot is adversarial β
|
| 212 |
+
it must infer from behavioral evidence via the TrustLedger.
|
| 213 |
+
"""
|
| 214 |
+
|
| 215 |
+
def __init__(self) -> None:
|
| 216 |
+
self._fixed: dict[str, BaseSpecialist] = {
|
| 217 |
+
"S0": AccurateSlowSpecialist(),
|
| 218 |
+
"S1": OverconfidentFastSpecialist(),
|
| 219 |
+
"S2": DomainBoundSpecialist(),
|
| 220 |
+
"S3": AdversarialSpecialist(),
|
| 221 |
+
"S4": DegradingSpecialist(),
|
| 222 |
+
}
|
| 223 |
+
# Profile mapping: public_id β internal specialist
|
| 224 |
+
# Shuffled each reset()
|
| 225 |
+
self._profile: dict[str, str] = {sid: sid for sid in self._fixed}
|
| 226 |
+
self._adversarial_slot: str = "S3"
|
| 227 |
+
|
| 228 |
+
def reset(self, seed: int | None = None) -> None:
|
| 229 |
+
"""
|
| 230 |
+
Resample adversarial identity. S3 behavior is assigned to a random slot.
|
| 231 |
+
All other behaviors are also shuffled so the agent truly cannot memorize.
|
| 232 |
+
"""
|
| 233 |
+
rng = random.Random(seed)
|
| 234 |
+
|
| 235 |
+
# Reset all FSM states
|
| 236 |
+
for spec in self._fixed.values():
|
| 237 |
+
spec.reset()
|
| 238 |
+
|
| 239 |
+
# Shuffle which public slot gets which internal behavior
|
| 240 |
+
ids = list(self._fixed.keys())
|
| 241 |
+
shuffled = ids.copy()
|
| 242 |
+
rng.shuffle(shuffled)
|
| 243 |
+
self._profile = dict(zip(ids, shuffled))
|
| 244 |
+
|
| 245 |
+
# Track which public slot currently has adversarial behavior
|
| 246 |
+
# (S3 internal β whichever public slot maps to it)
|
| 247 |
+
self._adversarial_slot = next(
|
| 248 |
+
pub for pub, internal in self._profile.items() if internal == "S3"
|
| 249 |
+
)
|
| 250 |
+
|
| 251 |
+
@property
|
| 252 |
+
def adversarial_slot(self) -> str:
|
| 253 |
+
"""Public slot that is currently adversarial. Hidden from agent."""
|
| 254 |
+
return self._adversarial_slot
|
| 255 |
+
|
| 256 |
+
def execute(
|
| 257 |
+
self,
|
| 258 |
+
specialist_id: str,
|
| 259 |
+
subtask: str,
|
| 260 |
+
stakes: float,
|
| 261 |
+
rng: random.Random,
|
| 262 |
+
) -> SpecialistResult:
|
| 263 |
+
"""
|
| 264 |
+
Route execution through the shuffled profile.
|
| 265 |
+
Returns result with specialist_id = the public slot (not internal type).
|
| 266 |
+
"""
|
| 267 |
+
internal_id = self._profile[specialist_id]
|
| 268 |
+
spec = self._fixed[internal_id]
|
| 269 |
+
result = spec.execute(subtask, stakes, rng)
|
| 270 |
+
# Rewrite id to public slot so agent only sees the public label
|
| 271 |
+
result.specialist_id = specialist_id
|
| 272 |
+
return result
|
| 273 |
+
|
| 274 |
+
def available_ids(self) -> list[str]:
|
| 275 |
+
return list(self._profile.keys())
|
task_graph.py
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from dataclasses import dataclass, field
|
| 4 |
+
from typing import Optional
|
| 5 |
+
|
| 6 |
+
from scenarios import Scenario, SubTask
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
# ---------------------------------------------------------------------------
|
| 10 |
+
# Node state
|
| 11 |
+
# ---------------------------------------------------------------------------
|
| 12 |
+
|
| 13 |
+
@dataclass
|
| 14 |
+
class TaskNode:
|
| 15 |
+
subtask: SubTask
|
| 16 |
+
status: str = "pending" # pending | ready | in_progress | completed | failed | skipped
|
| 17 |
+
outcome: float = 0.0 # 1.0 correct | 0.5 partial | 0.0 wrong
|
| 18 |
+
specialist_used: str = ""
|
| 19 |
+
attempts: int = 0
|
| 20 |
+
was_adversarial: bool = False
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
# ---------------------------------------------------------------------------
|
| 24 |
+
# TaskGraph
|
| 25 |
+
# Manages the DAG of subtasks for one episode.
|
| 26 |
+
# Tracks dependencies, determines which nodes are "ready" to execute,
|
| 27 |
+
# and records outcomes.
|
| 28 |
+
# ---------------------------------------------------------------------------
|
| 29 |
+
|
| 30 |
+
class TaskGraph:
|
| 31 |
+
def __init__(self, scenario: Scenario) -> None:
|
| 32 |
+
self._scenario = scenario
|
| 33 |
+
self._nodes: dict[str, TaskNode] = {}
|
| 34 |
+
self._order: list[str] = [] # insertion order (for iteration)
|
| 35 |
+
self._build(scenario["subtasks"])
|
| 36 |
+
|
| 37 |
+
def _build(self, subtasks: list[SubTask]) -> None:
|
| 38 |
+
for st in subtasks:
|
| 39 |
+
self._nodes[st["id"]] = TaskNode(subtask=st)
|
| 40 |
+
self._order.append(st["id"])
|
| 41 |
+
|
| 42 |
+
# ------------------------------------------------------------------
|
| 43 |
+
# State queries
|
| 44 |
+
# ------------------------------------------------------------------
|
| 45 |
+
|
| 46 |
+
def current_node(self) -> Optional[TaskNode]:
|
| 47 |
+
"""
|
| 48 |
+
Returns the first 'ready' node (all dependencies completed).
|
| 49 |
+
Returns None if all nodes are done or none are unblocked yet.
|
| 50 |
+
"""
|
| 51 |
+
for sid in self._order:
|
| 52 |
+
node = self._nodes[sid]
|
| 53 |
+
if node.status == "pending" and self._deps_met(sid):
|
| 54 |
+
node.status = "ready"
|
| 55 |
+
if node.status == "ready":
|
| 56 |
+
return node
|
| 57 |
+
return None
|
| 58 |
+
|
| 59 |
+
def _deps_met(self, subtask_id: str) -> bool:
|
| 60 |
+
"""All dependencies of this node must be 'completed'."""
|
| 61 |
+
deps = self._nodes[subtask_id].subtask["depends_on"]
|
| 62 |
+
return all(
|
| 63 |
+
self._nodes[dep].status == "completed"
|
| 64 |
+
for dep in deps
|
| 65 |
+
if dep in self._nodes
|
| 66 |
+
)
|
| 67 |
+
|
| 68 |
+
def is_done(self) -> bool:
|
| 69 |
+
return all(
|
| 70 |
+
n.status in ("completed", "failed", "skipped")
|
| 71 |
+
for n in self._nodes.values()
|
| 72 |
+
)
|
| 73 |
+
|
| 74 |
+
def completion_rate(self) -> float:
|
| 75 |
+
completed = sum(1 for n in self._nodes.values() if n.status == "completed")
|
| 76 |
+
return completed / len(self._nodes) if self._nodes else 0.0
|
| 77 |
+
|
| 78 |
+
def adversarial_detections(self) -> int:
|
| 79 |
+
"""
|
| 80 |
+
Count of high-stakes adversarial attempts that were avoided.
|
| 81 |
+
Avoided = node was adversarial AND orchestrator chose VERIFY or SOLVE_INDEPENDENTLY.
|
| 82 |
+
"""
|
| 83 |
+
return sum(
|
| 84 |
+
1 for n in self._nodes.values()
|
| 85 |
+
if n.was_adversarial and n.status == "completed" and n.outcome > 0.0
|
| 86 |
+
)
|
| 87 |
+
|
| 88 |
+
def adversarial_poisonings(self) -> int:
|
| 89 |
+
"""
|
| 90 |
+
Count of adversarial results that slipped through unchecked.
|
| 91 |
+
"""
|
| 92 |
+
return sum(
|
| 93 |
+
1 for n in self._nodes.values()
|
| 94 |
+
if n.was_adversarial and n.outcome == 0.0
|
| 95 |
+
)
|
| 96 |
+
|
| 97 |
+
def subtasks_remaining(self) -> int:
|
| 98 |
+
return sum(
|
| 99 |
+
1 for n in self._nodes.values()
|
| 100 |
+
if n.status in ("pending", "ready", "in_progress")
|
| 101 |
+
)
|
| 102 |
+
|
| 103 |
+
def subtasks_completed(self) -> int:
|
| 104 |
+
return sum(1 for n in self._nodes.values() if n.status == "completed")
|
| 105 |
+
|
| 106 |
+
def subtasks_total(self) -> int:
|
| 107 |
+
return len(self._nodes)
|
| 108 |
+
|
| 109 |
+
def high_stakes_nodes(self) -> list[TaskNode]:
|
| 110 |
+
return [n for n in self._nodes.values() if n.subtask["stakes"] >= 0.70]
|
| 111 |
+
|
| 112 |
+
# ------------------------------------------------------------------
|
| 113 |
+
# Mutations
|
| 114 |
+
# ------------------------------------------------------------------
|
| 115 |
+
|
| 116 |
+
def record_outcome(
|
| 117 |
+
self,
|
| 118 |
+
subtask_id: str,
|
| 119 |
+
outcome: float,
|
| 120 |
+
specialist_id: str,
|
| 121 |
+
was_adversarial: bool = False,
|
| 122 |
+
) -> None:
|
| 123 |
+
if subtask_id not in self._nodes:
|
| 124 |
+
raise KeyError(f"Unknown subtask_id: {subtask_id}")
|
| 125 |
+
node = self._nodes[subtask_id]
|
| 126 |
+
node.outcome = outcome
|
| 127 |
+
node.specialist_used = specialist_id
|
| 128 |
+
node.attempts += 1
|
| 129 |
+
node.was_adversarial = was_adversarial
|
| 130 |
+
node.status = "completed" if outcome > 0.0 else "failed"
|
| 131 |
+
|
| 132 |
+
def skip_node(self, subtask_id: str) -> None:
|
| 133 |
+
if subtask_id in self._nodes:
|
| 134 |
+
self._nodes[subtask_id].status = "skipped"
|
| 135 |
+
|
| 136 |
+
# ------------------------------------------------------------------
|
| 137 |
+
# Summary (for info dict in StepResult)
|
| 138 |
+
# ------------------------------------------------------------------
|
| 139 |
+
|
| 140 |
+
def summary(self) -> dict:
|
| 141 |
+
return {
|
| 142 |
+
"scenario_id": self._scenario["scenario_id"],
|
| 143 |
+
"task_type": self._scenario["task_type"],
|
| 144 |
+
"subtasks_total": self.subtasks_total(),
|
| 145 |
+
"subtasks_completed": self.subtasks_completed(),
|
| 146 |
+
"subtasks_remaining": self.subtasks_remaining(),
|
| 147 |
+
"completion_rate": round(self.completion_rate(), 3),
|
| 148 |
+
"adversarial_detections": self.adversarial_detections(),
|
| 149 |
+
"adversarial_poisonings": self.adversarial_poisonings(),
|
| 150 |
+
"is_done": self.is_done(),
|
| 151 |
+
}
|
| 152 |
+
|
| 153 |
+
def node_statuses(self) -> dict[str, str]:
|
| 154 |
+
return {sid: n.status for sid, n in self._nodes.items()}
|
tests/test_environment.py
ADDED
|
File without changes
|
tests/test_graders.py
ADDED
|
File without changes
|
tests/test_specialists.py
ADDED
|
File without changes
|
training/colab_notebook.ipynb
ADDED
|
File without changes
|
training/evaluate.py
ADDED
|
File without changes
|
training/train.py
ADDED
|
File without changes
|
trust_ledger.py
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import math
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class TrustLedger:
|
| 7 |
+
"""
|
| 8 |
+
Bayesian reliability tracker for each specialist.
|
| 9 |
+
|
| 10 |
+
Each specialist gets a Beta distribution prior (alpha, beta).
|
| 11 |
+
alpha = successes + 1, beta = failures + 1 (Laplace smoothing).
|
| 12 |
+
Trust score = alpha / (alpha + beta) = mean of Beta distribution.
|
| 13 |
+
|
| 14 |
+
Stakes multiplier: high-stakes outcomes move the needle harder.
|
| 15 |
+
Profile shuffles every episode β ledger resets on reset().
|
| 16 |
+
"""
|
| 17 |
+
|
| 18 |
+
SPECIALIST_IDS = ["S0", "S1", "S2", "S3", "S4"]
|
| 19 |
+
|
| 20 |
+
def __init__(self) -> None:
|
| 21 |
+
self._reset()
|
| 22 |
+
|
| 23 |
+
def _reset(self) -> None:
|
| 24 |
+
# Uniform prior: alpha=1, beta=1 β trust=0.5 for all specialists
|
| 25 |
+
self._alpha: dict[str, float] = {sid: 1.0 for sid in self.SPECIALIST_IDS}
|
| 26 |
+
self._beta: dict[str, float] = {sid: 1.0 for sid in self.SPECIALIST_IDS}
|
| 27 |
+
self._call_count: dict[str, int] = {sid: 0 for sid in self.SPECIALIST_IDS}
|
| 28 |
+
|
| 29 |
+
def reset(self) -> None:
|
| 30 |
+
"""Call at the start of each episode."""
|
| 31 |
+
self._reset()
|
| 32 |
+
|
| 33 |
+
# ------------------------------------------------------------------
|
| 34 |
+
# Update
|
| 35 |
+
# ------------------------------------------------------------------
|
| 36 |
+
|
| 37 |
+
def update(
|
| 38 |
+
self,
|
| 39 |
+
specialist_id: str,
|
| 40 |
+
outcome: float, # 1.0 = correct, 0.0 = wrong/adversarial, 0.5 = partial
|
| 41 |
+
stakes: float, # 0.0β1.0; high stakes = larger update
|
| 42 |
+
) -> None:
|
| 43 |
+
"""
|
| 44 |
+
Bayesian update after observing a specialist outcome.
|
| 45 |
+
stakes acts as a weight multiplier (1x at low stakes, 3x at high stakes).
|
| 46 |
+
"""
|
| 47 |
+
if specialist_id not in self._alpha:
|
| 48 |
+
return
|
| 49 |
+
|
| 50 |
+
weight = 1.0 + 2.0 * stakes # 1.0 β 3.0
|
| 51 |
+
|
| 52 |
+
self._call_count[specialist_id] += 1
|
| 53 |
+
|
| 54 |
+
if outcome >= 0.5:
|
| 55 |
+
self._alpha[specialist_id] += weight * outcome
|
| 56 |
+
else:
|
| 57 |
+
self._beta[specialist_id] += weight * (1.0 - outcome)
|
| 58 |
+
|
| 59 |
+
# ------------------------------------------------------------------
|
| 60 |
+
# Read
|
| 61 |
+
# ------------------------------------------------------------------
|
| 62 |
+
|
| 63 |
+
def trust(self, specialist_id: str) -> float:
|
| 64 |
+
"""Point estimate: mean of Beta distribution."""
|
| 65 |
+
a = self._alpha.get(specialist_id, 1.0)
|
| 66 |
+
b = self._beta.get(specialist_id, 1.0)
|
| 67 |
+
return a / (a + b)
|
| 68 |
+
|
| 69 |
+
def snapshot(self) -> dict[str, float]:
|
| 70 |
+
"""Rounded trust scores for all specialists."""
|
| 71 |
+
return {sid: round(self.trust(sid), 3) for sid in self.SPECIALIST_IDS}
|
| 72 |
+
|
| 73 |
+
def call_count(self, specialist_id: str) -> int:
|
| 74 |
+
return self._call_count.get(specialist_id, 0)
|
| 75 |
+
|
| 76 |
+
def most_trusted(self) -> str:
|
| 77 |
+
"""Returns the specialist_id with the highest current trust score."""
|
| 78 |
+
return max(self.SPECIALIST_IDS, key=self.trust)
|
| 79 |
+
|
| 80 |
+
def least_trusted(self) -> str:
|
| 81 |
+
return min(self.SPECIALIST_IDS, key=self.trust)
|
| 82 |
+
|
| 83 |
+
# ------------------------------------------------------------------
|
| 84 |
+
# Calibration score (used in reward engine)
|
| 85 |
+
# ------------------------------------------------------------------
|
| 86 |
+
|
| 87 |
+
def brier_score(self, ground_truth_reliability: dict[str, float]) -> float:
|
| 88 |
+
"""
|
| 89 |
+
Measures how well the trust scores predict actual specialist reliability.
|
| 90 |
+
Lower = better calibrated. Range 0.0β1.0.
|
| 91 |
+
|
| 92 |
+
ground_truth_reliability: {"S0": 0.9, "S1": 0.6, ...}
|
| 93 |
+
(hidden from agent, used only by reward engine)
|
| 94 |
+
"""
|
| 95 |
+
total = 0.0
|
| 96 |
+
n = 0
|
| 97 |
+
for sid in self.SPECIALIST_IDS:
|
| 98 |
+
if sid in ground_truth_reliability:
|
| 99 |
+
predicted = self.trust(sid)
|
| 100 |
+
actual = ground_truth_reliability[sid]
|
| 101 |
+
total += (predicted - actual) ** 2
|
| 102 |
+
n += 1
|
| 103 |
+
return total / n if n > 0 else 0.0
|
| 104 |
+
|
| 105 |
+
def calibration_reward(self, ground_truth_reliability: dict[str, float]) -> float:
|
| 106 |
+
"""
|
| 107 |
+
Convert Brier score to a reward signal (0.0β1.0).
|
| 108 |
+
Perfect calibration β 1.0. Random β ~0.5.
|
| 109 |
+
"""
|
| 110 |
+
brier = self.brier_score(ground_truth_reliability)
|
| 111 |
+
# Invert and scale: brier=0 β reward=1.0, brier=0.25 β reward=0.5
|
| 112 |
+
return max(0.0, 1.0 - 4.0 * brier)
|
| 113 |
+
|
| 114 |
+
def __repr__(self) -> str:
|
| 115 |
+
snap = self.snapshot()
|
| 116 |
+
return f"TrustLedger({snap})"
|