# Spaces:
# Running
# Running
from __future__ import annotations

from sentinel_config import ADVERSARIAL_AWARENESS_STAKES
class TrustLedger:
    """
    Bayesian reliability tracker for each specialist.

    Each specialist gets a Beta distribution prior (alpha, beta).
    alpha = successes + 1, beta = failures + 1 (Laplace smoothing).
    Trust score = alpha / (alpha + beta) = mean of Beta distribution.
    Stakes multiplier: high-stakes outcomes move the needle harder.
    Profile shuffles every episode — ledger resets on reset().

    Besides the Beta parameters, the ledger keeps per-specialist call
    counts, confidence-vs-outcome gaps, per-domain hit rates and
    low/high-stakes accuracy, exposed via ``behavioral_fingerprints()``.
    """

    SPECIALIST_IDS = ["S0", "S1", "S2", "S3", "S4"]

    def __init__(self) -> None:
        self._reset()

    def _reset(self) -> None:
        """(Re)initialise every per-specialist accumulator to its prior."""
        ids = self.SPECIALIST_IDS
        # Uniform prior: alpha=1, beta=1 → trust=0.5 for all specialists
        self._alpha: dict[str, float] = {sid: 1.0 for sid in ids}
        self._beta: dict[str, float] = {sid: 1.0 for sid in ids}
        self._call_count: dict[str, int] = {sid: 0 for sid in ids}
        # Running sum / count of max(0, confidence - outcome) per specialist.
        self._confidence_gap_sum: dict[str, float] = {sid: 0.0 for sid in ids}
        self._confidence_count: dict[str, int] = {sid: 0 for sid in ids}
        # Per-domain accumulated outcome and observation count (keys upper-cased).
        self._domain_success: dict[str, dict[str, float]] = {sid: {} for sid in ids}
        self._domain_count: dict[str, dict[str, int]] = {sid: {} for sid in ids}
        # Accumulated outcome / count split into "low" and "high" stakes buckets.
        self._stakes_success: dict[str, dict[str, float]] = {
            sid: {"low": 0.0, "high": 0.0} for sid in ids
        }
        self._stakes_count: dict[str, dict[str, int]] = {
            sid: {"low": 0, "high": 0} for sid in ids
        }

    def reset(self) -> None:
        """Call at the start of each episode."""
        self._reset()

    # ------------------------------------------------------------------
    # Update
    # ------------------------------------------------------------------
    @staticmethod
    def _clamp_unit(value: float) -> float:
        """Clamp ``value`` into the closed interval [0.0, 1.0]."""
        return max(0.0, min(1.0, value))

    def update(
        self,
        specialist_id: str,
        outcome: float,  # 1.0 = correct, 0.0 = wrong/adversarial, 0.5 = partial
        stakes: float,  # 0.0–1.0; high stakes = larger update
        confidence: float | None = None,
        domain: str | None = None,
    ) -> None:
        """
        Bayesian update after observing a specialist outcome.

        stakes acts as a weight multiplier (1x at low stakes, 3x at high stakes).

        Args:
            specialist_id: Specialist to update; ids not in the ledger are
                ignored silently.
            outcome: Observed result, clamped to [0.0, 1.0].
            stakes: Stakes of the task, clamped to [0.0, 1.0].
            confidence: Optional confidence reported by the specialist; when
                given, the overconfidence gap ``max(0, confidence - outcome)``
                is accumulated.
            domain: Optional domain label; tracked case-insensitively
                (upper-cased) for per-domain hit rates.
        """
        if specialist_id not in self._alpha:
            return

        # Out-of-range inputs would otherwise yield negative weights or
        # negative increments and push alpha/beta out of the valid Beta
        # range, making trust() fall outside [0, 1].
        outcome = self._clamp_unit(outcome)
        stakes = self._clamp_unit(stakes)

        weight = 1.0 + 2.0 * stakes  # 1.0 → 3.0
        self._call_count[specialist_id] += 1

        # NOTE(review): the update is one-sided — an outcome >= 0.5 only adds
        # to alpha and an outcome < 0.5 only adds to beta, so a "partial" 0.5
        # raises trust. Kept as-is; confirm this is intended.
        if outcome >= 0.5:
            self._alpha[specialist_id] += weight * outcome
        else:
            self._beta[specialist_id] += weight * (1.0 - outcome)

        if confidence is not None:
            # Only overconfidence (confidence above outcome) contributes.
            self._confidence_gap_sum[specialist_id] += max(0.0, confidence - outcome)
            self._confidence_count[specialist_id] += 1

        if domain:
            domain_key = domain.upper()
            successes = self._domain_success[specialist_id]
            counts = self._domain_count[specialist_id]
            successes[domain_key] = successes.get(domain_key, 0.0) + outcome
            counts[domain_key] = counts.get(domain_key, 0) + 1

        stakes_bucket = "high" if stakes >= ADVERSARIAL_AWARENESS_STAKES else "low"
        self._stakes_success[specialist_id][stakes_bucket] += outcome
        self._stakes_count[specialist_id][stakes_bucket] += 1

    # ------------------------------------------------------------------
    # Read
    # ------------------------------------------------------------------
    def trust(self, specialist_id: str) -> float:
        """Point estimate: mean of Beta distribution.

        Unknown ids fall back to the uniform prior (alpha=beta=1 → 0.5).
        """
        a = self._alpha.get(specialist_id, 1.0)
        b = self._beta.get(specialist_id, 1.0)
        return a / (a + b)

    def snapshot(self) -> dict[str, float]:
        """Rounded (3 decimals) trust scores for all specialists."""
        return {sid: round(self.trust(sid), 3) for sid in self.SPECIALIST_IDS}

    def behavioral_fingerprints(self) -> dict[str, dict]:
        """
        Public behavioral features an orchestrator can learn from.

        These are still evidence-only: no hidden specialist identity leaks.

        Returns:
            Mapping of specialist id to a dict with keys ``calls``,
            ``confidence_accuracy_gap``, ``domain_hit_rate``,
            ``stakes_volatility``, ``low_stakes_accuracy`` and
            ``high_stakes_accuracy``. The two accuracy fields are ``None``
            when the corresponding bucket has no observations.
        """
        fingerprints: dict[str, dict] = {}
        for sid in self.SPECIALIST_IDS:
            confidence_count = self._confidence_count[sid]
            # Mean overconfidence gap; 0.0 when no confidence was ever reported.
            gap = (
                self._confidence_gap_sum[sid] / confidence_count
                if confidence_count
                else 0.0
            )
            domain_hit_rate = {
                domain: round(success / max(1, self._domain_count[sid][domain]), 3)
                for domain, success in sorted(self._domain_success[sid].items())
            }
            low_rate = self._bucket_rate(sid, "low")
            high_rate = self._bucket_rate(sid, "high")
            # Volatility is only defined once both buckets have data.
            volatility = (
                abs(high_rate - low_rate)
                if low_rate is not None and high_rate is not None
                else 0.0
            )
            fingerprints[sid] = {
                "calls": self._call_count[sid],
                "confidence_accuracy_gap": round(gap, 3),
                "domain_hit_rate": domain_hit_rate,
                "stakes_volatility": round(volatility, 3),
                "low_stakes_accuracy": round(low_rate, 3) if low_rate is not None else None,
                "high_stakes_accuracy": round(high_rate, 3) if high_rate is not None else None,
            }
        return fingerprints

    def _bucket_rate(self, specialist_id: str, bucket: str) -> float | None:
        """Mean outcome in a stakes bucket, or None if the bucket is empty."""
        count = self._stakes_count[specialist_id][bucket]
        if count == 0:
            return None
        return self._stakes_success[specialist_id][bucket] / count

    def call_count(self, specialist_id: str) -> int:
        """Number of recorded updates for the specialist (0 for unknown ids)."""
        return self._call_count.get(specialist_id, 0)

    def most_trusted(self) -> str:
        """Returns the specialist_id with the highest current trust score."""
        return max(self.SPECIALIST_IDS, key=self.trust)

    def least_trusted(self) -> str:
        """Returns the specialist_id with the lowest current trust score."""
        return min(self.SPECIALIST_IDS, key=self.trust)

    # ------------------------------------------------------------------
    # Calibration score (used in reward engine)
    # ------------------------------------------------------------------
    def brier_score(self, ground_truth_reliability: dict[str, float]) -> float:
        """
        Measures how well the trust scores predict actual specialist reliability.

        Lower = better calibrated. Range 0.0–1.0.

        ground_truth_reliability: {"S0": 0.9, "S1": 0.6, ...}
        (hidden from agent, used only by reward engine)

        Only specialists present in ``ground_truth_reliability`` are scored;
        returns 0.0 when none are present.
        """
        squared_errors = [
            (self.trust(sid) - ground_truth_reliability[sid]) ** 2
            for sid in self.SPECIALIST_IDS
            if sid in ground_truth_reliability
        ]
        if not squared_errors:
            return 0.0
        return sum(squared_errors) / len(squared_errors)

    def calibration_reward(self, ground_truth_reliability: dict[str, float]) -> float:
        """
        Convert Brier score to a reward signal (0.0–1.0).

        Perfect calibration → 1.0; the reward falls linearly and is floored
        at 0.0 once the Brier score reaches 0.25.
        """
        brier = self.brier_score(ground_truth_reliability)
        # Invert and scale: brier=0 → reward=1.0, brier=0.125 → reward=0.5,
        # brier>=0.25 → reward=0.0.
        # NOTE(review): the earlier comment claimed brier=0.25 → 0.5, which
        # would need a factor of 2.0 rather than 4.0 — confirm intended scale.
        return max(0.0, 1.0 - 4.0 * brier)

    def __repr__(self) -> str:
        snap = self.snapshot()
        return f"TrustLedger({snap})"