# sentinel-env / trust_ledger.py
# XcodeAddy: "Add adaptive trust curriculum wow features" (commit 74b74f1)
from __future__ import annotations
from sentinel_config import ADVERSARIAL_AWARENESS_STAKES
class TrustLedger:
    """
    Bayesian reliability tracker for each specialist.

    Each specialist gets a Beta distribution prior (alpha, beta).
    alpha = successes + 1, beta = failures + 1 (Laplace smoothing).
    Trust score = alpha / (alpha + beta) = mean of Beta distribution.

    Stakes multiplier: high-stakes outcomes move the needle harder.
    Profile shuffles every episode - ledger resets on reset().
    """

    SPECIALIST_IDS = ["S0", "S1", "S2", "S3", "S4"]

    def __init__(self) -> None:
        self._reset()

    def _reset(self) -> None:
        """(Re)create every per-specialist accumulator in its initial state."""
        ids = self.SPECIALIST_IDS
        # Uniform prior: alpha=1, beta=1 -> trust=0.5 for all specialists
        self._alpha: dict[str, float] = {sid: 1.0 for sid in ids}
        self._beta: dict[str, float] = {sid: 1.0 for sid in ids}
        # Number of update() calls accepted per specialist.
        self._call_count: dict[str, int] = {sid: 0 for sid in ids}
        # Running sum / count of over-confidence, max(0, confidence - outcome).
        self._confidence_gap_sum: dict[str, float] = {sid: 0.0 for sid in ids}
        self._confidence_count: dict[str, int] = {sid: 0 for sid in ids}
        # Per-domain outcome totals and observation counts (upper-cased keys).
        self._domain_success: dict[str, dict[str, float]] = {sid: {} for sid in ids}
        self._domain_count: dict[str, dict[str, int]] = {sid: {} for sid in ids}
        # Outcome totals and counts split into "low" / "high" stakes buckets.
        self._stakes_success: dict[str, dict[str, float]] = {
            sid: {"low": 0.0, "high": 0.0} for sid in ids
        }
        self._stakes_count: dict[str, dict[str, int]] = {
            sid: {"low": 0, "high": 0} for sid in ids
        }

    def reset(self) -> None:
        """Call at the start of each episode."""
        self._reset()

    # ------------------------------------------------------------------
    # Update
    # ------------------------------------------------------------------
    def update(
        self,
        specialist_id: str,
        outcome: float,  # 1.0 = correct, 0.0 = wrong/adversarial, 0.5 = partial
        stakes: float,  # 0.0-1.0; high stakes = larger update
        confidence: float | None = None,
        domain: str | None = None,
    ) -> None:
        """
        Bayesian update after observing a specialist outcome.

        stakes acts as a weight multiplier (1x at low stakes, 3x at high stakes).
        The outcome is treated as soft evidence: ``weight * outcome`` is added
        to alpha and ``weight * (1 - outcome)`` to beta, so a partial outcome
        moves trust toward that outcome instead of counting only as success.

        Args:
            specialist_id: Specialist to update; ids not in SPECIALIST_IDS are
                silently ignored.
            outcome: Observed result, clamped to [0.0, 1.0].
            stakes: Task stakes; clamped to [0.0, 1.0] for the weight.
            confidence: Optional self-reported confidence of the specialist;
                when given, its over-confidence versus the outcome is recorded.
            domain: Optional task domain; tracked case-insensitively.
        """
        if specialist_id not in self._alpha:
            return

        # Clamp so out-of-range inputs cannot produce a negative weight or
        # negative pseudo-counts, which would corrupt the Beta parameters.
        outcome = min(1.0, max(0.0, outcome))
        weight = 1.0 + 2.0 * min(1.0, max(0.0, stakes))  # 1.0 -> 3.0

        self._call_count[specialist_id] += 1
        self._alpha[specialist_id] += weight * outcome
        self._beta[specialist_id] += weight * (1.0 - outcome)

        if confidence is not None:
            # Only over-confidence counts; under-confidence contributes 0.
            self._confidence_gap_sum[specialist_id] += max(0.0, confidence - outcome)
            self._confidence_count[specialist_id] += 1

        if domain:
            domain_key = domain.upper()
            successes = self._domain_success[specialist_id]
            counts = self._domain_count[specialist_id]
            successes[domain_key] = successes.get(domain_key, 0.0) + outcome
            counts[domain_key] = counts.get(domain_key, 0) + 1

        stakes_bucket = "high" if stakes >= ADVERSARIAL_AWARENESS_STAKES else "low"
        self._stakes_success[specialist_id][stakes_bucket] += outcome
        self._stakes_count[specialist_id][stakes_bucket] += 1

    # ------------------------------------------------------------------
    # Read
    # ------------------------------------------------------------------
    def trust(self, specialist_id: str) -> float:
        """Point estimate: mean of Beta distribution (0.5 for unknown ids)."""
        a = self._alpha.get(specialist_id, 1.0)
        b = self._beta.get(specialist_id, 1.0)
        return a / (a + b)

    def snapshot(self) -> dict[str, float]:
        """Rounded trust scores for all specialists."""
        return {sid: round(self.trust(sid), 3) for sid in self.SPECIALIST_IDS}

    def behavioral_fingerprints(self) -> dict[str, dict]:
        """
        Public behavioral features an orchestrator can learn from.

        These are still evidence-only: no hidden specialist identity leaks.

        Returns one entry per specialist with:
            calls: number of recorded updates.
            confidence_accuracy_gap: mean of max(0, confidence - outcome) over
                updates that supplied a confidence (0.0 if none did).
            domain_hit_rate: mean outcome per upper-cased domain.
            stakes_volatility: |high-stakes accuracy - low-stakes accuracy|,
                or 0.0 when either bucket has no observations.
            low_stakes_accuracy / high_stakes_accuracy: mean outcome in the
                bucket, or None when the bucket is empty.
        """
        fingerprints: dict[str, dict] = {}
        for sid in self.SPECIALIST_IDS:
            confidence_count = self._confidence_count[sid]
            gap = (
                self._confidence_gap_sum[sid] / confidence_count
                if confidence_count
                else 0.0
            )
            domain_hit_rate = {
                domain: round(success / max(1, self._domain_count[sid][domain]), 3)
                for domain, success in sorted(self._domain_success[sid].items())
            }
            low_rate = self._bucket_rate(sid, "low")
            high_rate = self._bucket_rate(sid, "high")
            if low_rate is not None and high_rate is not None:
                volatility = abs(high_rate - low_rate)
            else:
                volatility = 0.0
            fingerprints[sid] = {
                "calls": self._call_count[sid],
                "confidence_accuracy_gap": round(gap, 3),
                "domain_hit_rate": domain_hit_rate,
                "stakes_volatility": round(volatility, 3),
                "low_stakes_accuracy": round(low_rate, 3) if low_rate is not None else None,
                "high_stakes_accuracy": round(high_rate, 3) if high_rate is not None else None,
            }
        return fingerprints

    def _bucket_rate(self, specialist_id: str, bucket: str) -> float | None:
        """Mean outcome in a stakes bucket, or None if it has no observations."""
        count = self._stakes_count[specialist_id][bucket]
        if count == 0:
            return None
        return self._stakes_success[specialist_id][bucket] / count

    def call_count(self, specialist_id: str) -> int:
        """Number of recorded updates for the specialist (0 for unknown ids)."""
        return self._call_count.get(specialist_id, 0)

    def most_trusted(self) -> str:
        """Returns the specialist_id with the highest current trust score."""
        return max(self.SPECIALIST_IDS, key=self.trust)

    def least_trusted(self) -> str:
        """Returns the specialist_id with the lowest current trust score."""
        return min(self.SPECIALIST_IDS, key=self.trust)

    # ------------------------------------------------------------------
    # Calibration score (used in reward engine)
    # ------------------------------------------------------------------
    def brier_score(self, ground_truth_reliability: dict[str, float]) -> float:
        """
        Measures how well the trust scores predict actual specialist reliability.

        Mean squared difference between trust and ground truth over the
        specialists present in the mapping. Lower = better calibrated.
        Returns 0.0 when no known specialist appears in the mapping.

        ground_truth_reliability: {"S0": 0.9, "S1": 0.6, ...}
        (hidden from agent, used only by reward engine)
        """
        total = 0.0
        n = 0
        for sid in self.SPECIALIST_IDS:
            if sid in ground_truth_reliability:
                predicted = self.trust(sid)
                actual = ground_truth_reliability[sid]
                total += (predicted - actual) ** 2
                n += 1
        return total / n if n > 0 else 0.0

    def calibration_reward(self, ground_truth_reliability: dict[str, float]) -> float:
        """
        Convert Brier score to a reward signal (0.0-1.0).

        Perfect calibration -> 1.0. The reward falls linearly and is floored
        at 0.0 once the Brier score reaches 0.25 (e.g. an untouched 0.5 prior
        scored against a ground truth of 0.0 or 1.0).
        """
        brier = self.brier_score(ground_truth_reliability)
        # Invert and scale: brier=0 -> reward=1.0, brier>=0.25 -> reward=0.0
        return max(0.0, 1.0 - 4.0 * brier)

    def __repr__(self) -> str:
        snap = self.snapshot()
        return f"TrustLedger({snap})"