| """ |
| Compare single-shot ScoringBridge vs iterative cognitive scorer on a slice |
| of the benchmark. No LLM in either path — this isolates the cognitive |
| contribution. |
| """ |
from __future__ import annotations

import argparse
import logging
import time
import warnings

import numpy as np

from tensegrity.bench.tasks import load_task_samples
from tensegrity.engine.scoring import ScoringBridge
from tensegrity.pipeline.iterative import IterativeCognitiveScorer

logging.basicConfig(level=logging.WARNING)
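# Default slice: (task_name, per-task sample cap). Deliberately small; use
# --tasks / --n to run a different selection.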
TASKS_TO_RUN = [
    ("truthfulqa", 30),
    ("mmlu_philosophy", 30),
    ("winogrande", 30),
    ("arc_challenge", 30),
    ("copa", 20),
    ("logical_deduction", 30),
]


def run_task(task_name: str, n: int):
    samples = load_task_samples(task_name, max_samples=n)
    if not samples:
        print(f" [{task_name}] no samples")
        return None
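    # Both scorers are built on the same shared stack (dims, settle steps,
    # learning rates), so any accuracy gap below is attributable to the
    # iterative loop rather than to extra capacity.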
    shared_params = {
        "obs_dim": 256,
        "hidden_dims": [128, 32],
        "fhrr_dim": 2048,
        "ngc_settle_steps": 30,
        "ngc_learning_rate": 0.01,
        "hopfield_beta": 0.05,
        "context_settle_steps": 40,
        "choice_settle_steps": 25,
        "context_learning_epochs": 3,
    }
    single = ScoringBridge(
        **shared_params,
        confidence_threshold=0.15,
    )
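    # Iteration-specific knobs (semantics inferred from the names): up to
    # max_iterations re-scoring passes; w_sbert / w_fhrr / w_ngc weight the
    # evidence streams; convergence_top_p is the probability-mass threshold
    # for declaring early convergence.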
    iterative = IterativeCognitiveScorer(
        **shared_params,
        max_iterations=6,
        convergence_top_p=0.75,
        w_sbert=1.0,
        w_fhrr=0.3,
        w_ngc=0.6,
        belief_step=0.6,
        shaping_lr_scale=0.5,
        use_hopfield=True,
        hopfield_steps=2,
    )

    n_total = len(samples)
    n_single_correct = 0
    n_iter_correct = 0
    n_iter_used_total = 0
    n_iter_converged = 0
    n_disagree = 0
    n_iter_better = 0
    n_single_better = 0

    t_single = 0.0
    t_iter = 0.0

    for s in samples:
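        # Single-shot pass: one scoring call, argmax over choice scores.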
        single.reset()
        t0 = time.perf_counter()
        scores_s, _ = single.score_choices(s.prompt, s.choices)
        t_single += time.perf_counter() - t0

        sa = np.array(scores_s)
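        # An all-zero score vector carries no signal (plausibly every choice
        # fell below confidence_threshold; inferred from the parameter name),
        # so fall back to raw sentence similarities as a tie-break.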
        if np.allclose(sa, 0.0):
            if hasattr(single, "sentence_similarities"):
                sims = single.sentence_similarities(s.prompt, s.choices)
            elif hasattr(single, "_sentence_similarities"):
                warnings.warn(
                    "ScoringBridge has no public sentence_similarities(); using "
                    "_sentence_similarities (private). Prefer adding a stable public API.",
                    UserWarning,
                    stacklevel=2,
                )
                sims = single._sentence_similarities(s.prompt, s.choices)
            else:
                raise AttributeError(
                    "ScoringBridge exposes no sentence_similarities() or "
                    "_sentence_similarities(); add a public API on ScoringBridge for tie-breaks."
                )
            pred_s = int(np.argmax(sims))
        else:
            pred_s = int(np.argmax(sa))

        iterative.reset()
        t0 = time.perf_counter()
        result = iterative.score(s.prompt, s.choices)
        t_iter += time.perf_counter() - t0
        pred_i = result.committed_idx
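        # Bookkeeping: per-path correctness, plus which path wins whenever
        # the two predictions disagree.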
        ok_s = (pred_s == s.gold)
        ok_i = (pred_i == s.gold)
        n_single_correct += int(ok_s)
        n_iter_correct += int(ok_i)
        n_iter_used_total += result.iterations_used
        n_iter_converged += int(result.converged)
        if pred_s != pred_i:
            n_disagree += 1
            if ok_i and not ok_s:
                n_iter_better += 1
            elif ok_s and not ok_i:
                n_single_better += 1

    acc_s = n_single_correct / n_total
    acc_i = n_iter_correct / n_total
    print(
        f" [{task_name:<22}] N={n_total:3d} "
        f"single={acc_s:5.1%} iter={acc_i:5.1%} "
        f"Δ={(acc_i - acc_s):+5.1%} "
        f"disagree={n_disagree:2d} "
        f"iter→✓={n_iter_better} iter→✗={n_single_better} "
        f"avg_iters={n_iter_used_total / n_total:.1f} "
        f"conv={n_iter_converged}/{n_total} "
        f"t_s={t_single:.1f}s t_i={t_iter:.1f}s"
    )
    return {
        "task": task_name, "n": n_total,
        "single": acc_s, "iter": acc_i, "delta": acc_i - acc_s,
        "disagree": n_disagree, "iter_better": n_iter_better,
        "single_better": n_single_better,
        "avg_iters": n_iter_used_total / n_total,
        "converged": n_iter_converged,
        "t_single_s": t_single, "t_iter_s": t_iter,
    }


def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("--tasks", nargs="*", default=None,
                    help="task names; default = the small fixed slice in TASKS_TO_RUN")
    ap.add_argument("--n", type=int, default=None,
                    help="override per-task sample count")
    args = ap.parse_args()
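    # Build the (task, n) plan: an explicit --tasks list wins; --n (when
    # given) overrides every per-task count.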
    if args.tasks:
        plan = [(t, args.n if args.n is not None else 30) for t in args.tasks]
    else:
        plan = TASKS_TO_RUN
        if args.n is not None:
            plan = [(t, args.n) for t, _ in plan]

    print("=" * 110)
    print("Single-shot ScoringBridge vs Iterative cognitive scorer (LLM-free)")
    print("=" * 110)
    rows = []
    for t, n in plan:
        try:
            r = run_task(t, n)
        except Exception as e:
            print(f" [{t}] FAILED: {type(e).__name__}: {e}")
            continue
        if r is not None:
            rows.append(r)

    if not rows:
        return

    print("-" * 110)
    total_n = sum(r["n"] for r in rows)
    overall_s = sum(r["single"] * r["n"] for r in rows) / total_n
    overall_i = sum(r["iter"] * r["n"] for r in rows) / total_n
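    # Overall accuracy is pooled (weighted by each task's N), not an
    # unweighted mean of per-task accuracies.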
    print(
        f" {'OVERALL':<24} N={total_n:3d} "
        f"single={overall_s:5.1%} iter={overall_i:5.1%} "
        f"Δ={(overall_i - overall_s):+5.1%} "
        f"disagree={sum(r['disagree'] for r in rows):3d} "
        f"iter→✓={sum(r['iter_better'] for r in rows):3d} "
        f"iter→✗={sum(r['single_better'] for r in rows):3d}"
    )


if __name__ == "__main__":
    main()