Spaces:

XcodeAddy
/

sentinel-env

Running

App Files Files Community

sentinel-env / trust_ledger.py

XcodeAddy

Add adaptive trust curriculum wow features

74b74f1 18 days ago

raw

history blame contribute delete

7.5 kB

	from __future__ import annotations

	from sentinel_config import ADVERSARIAL_AWARENESS_STAKES


	class TrustLedger:
	"""
	Bayesian reliability tracker for each specialist.

	Each specialist gets a Beta distribution prior (alpha, beta).
	alpha = successes + 1, beta = failures + 1 (Laplace smoothing).
	Trust score = alpha / (alpha + beta) = mean of Beta distribution.

	Stakes multiplier: high-stakes outcomes move the needle harder.
	Profile shuffles every episode — ledger resets on reset().
	"""

	SPECIALIST_IDS = ["S0", "S1", "S2", "S3", "S4"]

	def __init__(self) -> None:
	self._reset()

	def _reset(self) -> None:
	# Uniform prior: alpha=1, beta=1 → trust=0.5 for all specialists
	self._alpha: dict[str, float] = {sid: 1.0 for sid in self.SPECIALIST_IDS}
	self._beta: dict[str, float] = {sid: 1.0 for sid in self.SPECIALIST_IDS}
	self._call_count: dict[str, int] = {sid: 0 for sid in self.SPECIALIST_IDS}
	self._confidence_gap_sum: dict[str, float] = {sid: 0.0 for sid in self.SPECIALIST_IDS}
	self._confidence_count: dict[str, int] = {sid: 0 for sid in self.SPECIALIST_IDS}
	self._domain_success: dict[str, dict[str, float]] = {sid: {} for sid in self.SPECIALIST_IDS}
	self._domain_count: dict[str, dict[str, int]] = {sid: {} for sid in self.SPECIALIST_IDS}
	self._stakes_success: dict[str, dict[str, float]] = {
	sid: {"low": 0.0, "high": 0.0} for sid in self.SPECIALIST_IDS
	}
	self._stakes_count: dict[str, dict[str, int]] = {
	sid: {"low": 0, "high": 0} for sid in self.SPECIALIST_IDS
	}

	def reset(self) -> None:
	"""Call at the start of each episode."""
	self._reset()

	# ------------------------------------------------------------------
	# Update
	# ------------------------------------------------------------------

	def update(
	self,
	specialist_id: str,
	outcome: float, # 1.0 = correct, 0.0 = wrong/adversarial, 0.5 = partial
	stakes: float, # 0.0–1.0; high stakes = larger update
	confidence: float \| None = None,
	domain: str \| None = None,
	) -> None:
	"""
	Bayesian update after observing a specialist outcome.
	stakes acts as a weight multiplier (1x at low stakes, 3x at high stakes).
	"""
	if specialist_id not in self._alpha:
	return

	weight = 1.0 + 2.0 * stakes # 1.0 → 3.0

	self._call_count[specialist_id] += 1

	if outcome >= 0.5:
	self._alpha[specialist_id] += weight * outcome
	else:
	self._beta[specialist_id] += weight * (1.0 - outcome)

	if confidence is not None:
	self._confidence_gap_sum[specialist_id] += max(0.0, confidence - outcome)
	self._confidence_count[specialist_id] += 1

	if domain:
	domain_key = domain.upper()
	self._domain_success[specialist_id][domain_key] = (
	self._domain_success[specialist_id].get(domain_key, 0.0) + outcome
	)
	self._domain_count[specialist_id][domain_key] = (
	self._domain_count[specialist_id].get(domain_key, 0) + 1
	)

	stakes_bucket = "high" if stakes >= ADVERSARIAL_AWARENESS_STAKES else "low"
	self._stakes_success[specialist_id][stakes_bucket] += outcome
	self._stakes_count[specialist_id][stakes_bucket] += 1

	# ------------------------------------------------------------------
	# Read
	# ------------------------------------------------------------------

	def trust(self, specialist_id: str) -> float:
	"""Point estimate: mean of Beta distribution."""
	a = self._alpha.get(specialist_id, 1.0)
	b = self._beta.get(specialist_id, 1.0)
	return a / (a + b)

	def snapshot(self) -> dict[str, float]:
	"""Rounded trust scores for all specialists."""
	return {sid: round(self.trust(sid), 3) for sid in self.SPECIALIST_IDS}

	def behavioral_fingerprints(self) -> dict[str, dict]:
	"""
	Public behavioral features an orchestrator can learn from.

	These are still evidence-only: no hidden specialist identity leaks.
	"""
	fingerprints: dict[str, dict] = {}
	for sid in self.SPECIALIST_IDS:
	confidence_count = self._confidence_count[sid]
	gap = (
	self._confidence_gap_sum[sid] / confidence_count
	if confidence_count
	else 0.0
	)
	domain_hit_rate = {
	domain: round(success / max(1, self._domain_count[sid][domain]), 3)
	for domain, success in sorted(self._domain_success[sid].items())
	}
	low_rate = self._bucket_rate(sid, "low")
	high_rate = self._bucket_rate(sid, "high")
	volatility = abs(high_rate - low_rate) if low_rate is not None and high_rate is not None else 0.0
	fingerprints[sid] = {
	"calls": self._call_count[sid],
	"confidence_accuracy_gap": round(gap, 3),
	"domain_hit_rate": domain_hit_rate,
	"stakes_volatility": round(volatility, 3),
	"low_stakes_accuracy": round(low_rate, 3) if low_rate is not None else None,
	"high_stakes_accuracy": round(high_rate, 3) if high_rate is not None else None,
	}
	return fingerprints

	def _bucket_rate(self, specialist_id: str, bucket: str) -> float \| None:
	count = self._stakes_count[specialist_id][bucket]
	if count == 0:
	return None
	return self._stakes_success[specialist_id][bucket] / count

	def call_count(self, specialist_id: str) -> int:
	return self._call_count.get(specialist_id, 0)

	def most_trusted(self) -> str:
	"""Returns the specialist_id with the highest current trust score."""
	return max(self.SPECIALIST_IDS, key=self.trust)

	def least_trusted(self) -> str:
	return min(self.SPECIALIST_IDS, key=self.trust)

	# ------------------------------------------------------------------
	# Calibration score (used in reward engine)
	# ------------------------------------------------------------------

	def brier_score(self, ground_truth_reliability: dict[str, float]) -> float:
	"""
	Measures how well the trust scores predict actual specialist reliability.
	Lower = better calibrated. Range 0.0–1.0.

	ground_truth_reliability: {"S0": 0.9, "S1": 0.6, ...}
	(hidden from agent, used only by reward engine)
	"""
	total = 0.0
	n = 0
	for sid in self.SPECIALIST_IDS:
	if sid in ground_truth_reliability:
	predicted = self.trust(sid)
	actual = ground_truth_reliability[sid]
	total += (predicted - actual) ** 2
	n += 1
	return total / n if n > 0 else 0.0

	def calibration_reward(self, ground_truth_reliability: dict[str, float]) -> float:
	"""
	Convert Brier score to a reward signal (0.0–1.0).
	Perfect calibration → 1.0. Random → ~0.5.
	"""
	brier = self.brier_score(ground_truth_reliability)
	# Invert and scale: brier=0 → reward=1.0, brier=0.25 → reward=0.5
	return max(0.0, 1.0 - 4.0 * brier)

	def __repr__(self) -> str:
	snap = self.snapshot()
	return f"TrustLedger({snap})"