Spaces:
Running
Running
| """Smoke + dispatch tests for scripts/run_calibration.py. | |
| Two failure modes this guards against: | |
| 1. Silent broken imports inside cmd_generate_outputs. The runner has no | |
| module-level test coverage; a missing symbol like build_default_registry | |
| will pass CI and fail at first invocation. test_module_imports asserts | |
| the runner is importable. | |
| 2. Mixed-corpus calibration items routed to the wrong store. The spec | |
| includes both k8s and fastapi questions. test_dispatch_routes_per_corpus | |
| verifies each item goes to the orchestrator built for its corpus, and | |
| test_unknown_corpus_raises verifies a clear error if the spec drifts | |
| from the corpora the runner builds. | |
| """ | |
| from __future__ import annotations | |
| import importlib | |
| import json | |
| import sys | |
| from pathlib import Path | |
| import pytest | |
# Put the repository root (two directories above this file's folder) at the
# front of sys.path so that ``scripts.run_calibration`` can be imported by the
# tests below. The entry is only added when it is not already present.
REPO_ROOT = Path(__file__).resolve().parents[2]
if sys.path.count(str(REPO_ROOT)) == 0:
    sys.path.insert(0, str(REPO_ROOT))
class _StubProvider:
    """No-op stand-in patched over ``agent_bench.core.provider.AnthropicProvider``.

    Accepts and discards any constructor arguments so the code under test can
    instantiate it exactly as it would the real provider.
    """

    def __init__(self, *_args, **_kwargs):
        """Ignore every positional and keyword argument."""
class _StubEmbedder:
    """No-op stand-in patched over ``agent_bench.rag.embedder.Embedder``.

    Accepts and discards any constructor arguments so the code under test can
    instantiate it exactly as it would the real embedder.
    """

    def __init__(self, *_args, **_kwargs):
        """Ignore every positional and keyword argument."""
class _StubOrchestrator:
    """Fake orchestrator that logs every ``run()`` call and fabricates a reply.

    The object returned by ``run()`` exposes ``answer``, ``sources``,
    ``ranked_sources`` and ``source_chunks`` -- an AgentResponse-shaped
    object, so that cmd_generate_outputs can compute its hash and write its
    record.
    """

    def __init__(self, corpus_name: str, calls: list) -> None:
        # ``calls`` is the caller's list; appending to it lets the test inspect
        # which stub handled which question after the run.
        self.calls = calls
        self.corpus_name = corpus_name

    async def run(self, *, question: str, system_prompt: str):
        """Record the call and return a synthetic response tagged by corpus.

        ``system_prompt`` is accepted to match the call signature but is not
        used.
        """
        corpus = self.corpus_name
        doc_path = f"{corpus}/doc.md"
        self.calls.append({"corpus": corpus, "question": question})

        class _Source:
            def __init__(self, s: str) -> None:
                self.source = s

        class _Resp:
            # Values are baked in as class attributes at definition time; a
            # fresh class is created on every call, so nothing is shared.
            answer = f"[{corpus}] answer to: {question}"
            sources = [_Source(doc_path)]
            ranked_sources = [doc_path]
            source_chunks = ["chunk text"]

        return _Resp()
def test_module_imports():
    """The runner module must import cleanly and expose its entry points.

    A broken symbol reference inside the module surfaces here instead of at
    the first real calibration run.
    """
    runner = importlib.import_module("scripts.run_calibration")
    for symbol in ("cmd_generate_outputs", "_build_corpus_orchestrator"):
        assert hasattr(runner, symbol)
async def test_dispatch_routes_per_corpus(monkeypatch, tmp_path):
    """Every calibration item must be answered by its own corpus' orchestrator.

    The provider, embedder and orchestrator builder are replaced with stubs,
    and the output path is redirected into ``tmp_path``. After running
    ``cmd_generate_outputs`` the test checks that:

    * exactly the ``fastapi`` and ``k8s`` orchestrators were built;
    * one record was written per spec item, each tagged with the corpus the
      spec assigns to that item and carrying a 64-character string hash;
    * the number of ``run()`` calls per corpus matches the spec.
    """
    # Local import: the module header does not import ``collections``.
    from collections import Counter

    runner = importlib.import_module("scripts.run_calibration")
    monkeypatch.setattr(
        "agent_bench.core.provider.AnthropicProvider", _StubProvider
    )
    monkeypatch.setattr("agent_bench.rag.embedder.Embedder", _StubEmbedder)

    calls: list = []
    built_corpora: list = []

    def fake_builder(cfg, corpus_name, embedder, provider):
        # Remember which corpora were requested and hand back a stub that
        # stamps its corpus name onto every answer it produces.
        built_corpora.append(corpus_name)
        return _StubOrchestrator(corpus_name, calls)

    monkeypatch.setattr(runner, "_build_corpus_orchestrator", fake_builder)
    out_path = tmp_path / "system_outputs.json"
    monkeypatch.setattr(runner, "SYSTEM_OUTPUTS", out_path)

    await runner.cmd_generate_outputs(concurrency=2)

    assert sorted(built_corpora) == ["fastapi", "k8s"]

    spec = json.loads(runner.CALIBRATION_SPEC.read_text())
    expected_corpus_by_id = {i["id"]: i["corpus"] for i in spec["items"]}
    records = json.loads(out_path.read_text())
    assert len(records) == len(expected_corpus_by_id)

    seen_ids = set()
    for rec in records:
        item_id = rec["item_id"]
        seen_ids.add(item_id)
        expected = expected_corpus_by_id[item_id]
        assert rec["corpus"] == expected
        # The stub prefixes its answer with its own corpus, so this proves
        # which orchestrator actually handled the item.
        assert rec["answer"].startswith(f"[{expected}]")
        assert rec["sources"] == [f"{expected}/doc.md"]
        assert isinstance(rec["system_output_hash"], str)
        assert len(rec["system_output_hash"]) == 64
    assert seen_ids == set(expected_corpus_by_id.keys())

    # Per-corpus call counts must match the per-corpus item counts in the spec.
    by_corpus = Counter(c["corpus"] for c in calls)
    expected_counts = Counter(expected_corpus_by_id.values())
    assert by_corpus == expected_counts
async def test_unknown_corpus_raises(monkeypatch, tmp_path):
    """A spec item naming an unbuilt corpus must fail with a descriptive KeyError.

    The first item of the real spec is copied with its corpus replaced by
    ``phantom_corpus`` and written to a temporary spec file. The resulting
    error message must mention the bogus corpus, the phrase
    ``not in cfg.corpora`` and the offending item's id.
    """
    runner = importlib.import_module("scripts.run_calibration")

    # Swap the real provider/embedder/orchestrator for stubs.
    monkeypatch.setattr(
        "agent_bench.core.provider.AnthropicProvider", _StubProvider
    )
    monkeypatch.setattr("agent_bench.rag.embedder.Embedder", _StubEmbedder)
    calls: list = []

    def fake_builder(cfg, corpus_name, embedder, provider):
        return _StubOrchestrator(corpus_name, calls)

    monkeypatch.setattr(runner, "_build_corpus_orchestrator", fake_builder)
    monkeypatch.setattr(
        runner, "SYSTEM_OUTPUTS", tmp_path / "system_outputs.json"
    )

    # Build a one-item spec whose corpus the runner does not know about.
    spec = json.loads(runner.CALIBRATION_SPEC.read_text())
    first_item = spec["items"][0]
    tampered_item = dict(first_item, corpus="phantom_corpus")
    bogus_spec_path = tmp_path / "calibration_v1.json"
    bogus_spec_path.write_text(json.dumps({"items": [tampered_item]}))
    monkeypatch.setattr(runner, "CALIBRATION_SPEC", bogus_spec_path)

    with pytest.raises(KeyError) as excinfo:
        await runner.cmd_generate_outputs(concurrency=1)

    msg = str(excinfo.value)
    for fragment in ("phantom_corpus", "not in cfg.corpora", first_item["id"]):
        assert fragment in msg
def _write_jsonl(path: Path, rows: list[dict]) -> None:
    """Write *rows* to *path* as JSON Lines, one serialized object per line.

    Each row is encoded with ``json.dumps`` and terminated by a newline. An
    empty *rows* yields an empty file rather than a single blank line, which
    a reader calling ``json.loads`` on every line would reject.
    """
    path.write_text("".join(json.dumps(row) + "\n" for row in rows))
def test_compute_kappa_weights_returns_real_kappa(monkeypatch, tmp_path):
    """Two judges over 4 items with known agreement patterns.

    ``good_completeness`` reproduces the gold scores exactly, so its weight
    must be 1. ``bad_completeness`` always answers 1 against gold
    ``[2, 2, 1, 1]``: observed agreement is 0.5 and, for Cohen's kappa,
    chance agreement is also 0.5, so its weight must be 0.
    """
    runner = importlib.import_module("scripts.run_calibration")
    # Single source of truth for the gold scores used by both fixtures.
    gold_scores = [2, 2, 1, 1]

    labels_path = tmp_path / "labels.jsonl"
    _write_jsonl(
        labels_path,
        [
            {
                "item_id": f"i{n}",
                "dimension": "completeness",
                "score": gold,
                "abstained": False,
                "system_output_hash": f"h{n}",
            }
            for n, gold in enumerate(gold_scores)
        ],
    )
    monkeypatch.setattr(runner, "LABELS_PATH", labels_path)

    sidecar = tmp_path / "predictions.jsonl"
    rows = []
    for n, gold in enumerate(gold_scores):
        # Perfect judge: echoes the gold score.
        rows.append(
            {
                "judge_id": "good_completeness",
                "system_output_hash": f"h{n}",
                "score": gold,
            }
        )
        # Constant judge: always predicts 1 regardless of the gold score.
        rows.append(
            {
                "judge_id": "bad_completeness",
                "system_output_hash": f"h{n}",
                "score": 1,
            }
        )
    _write_jsonl(sidecar, rows)

    weights = runner._compute_kappa_weights(
        sidecar,
        "completeness",
        expected_judge_ids={"good_completeness", "bad_completeness"},
    )
    assert weights["good_completeness"] == pytest.approx(1.0)
    # Chance-level agreement gives kappa = 0 here (not a clipped negative);
    # compare with a tolerance so float round-off cannot fail the test.
    assert weights["bad_completeness"] == pytest.approx(0.0, abs=1e-9)
def test_compute_kappa_weights_hard_errors_on_missing_member(monkeypatch, tmp_path):
    """Missing predictions for an expected judge must raise, not be skipped.

    Only ``haiku_completeness`` has a row in the predictions file while
    ``gpt4o_completeness`` is also expected; returning partial weights in that
    situation was the v1 bug that masked gpt-4o-mini's exclusion.
    """
    runner = importlib.import_module("scripts.run_calibration")

    # One gold label for a single item.
    gold_label = {
        "item_id": "i0",
        "dimension": "completeness",
        "score": 2,
        "abstained": False,
        "system_output_hash": "h0",
    }
    labels_path = tmp_path / "labels.jsonl"
    _write_jsonl(labels_path, [gold_label])
    monkeypatch.setattr(runner, "LABELS_PATH", labels_path)

    # Predictions cover only one of the two expected judges.
    lone_prediction = {
        "judge_id": "haiku_completeness",
        "system_output_hash": "h0",
        "score": 2,
    }
    sidecar = tmp_path / "predictions.jsonl"
    _write_jsonl(sidecar, [lone_prediction])

    expected_judges = {"haiku_completeness", "gpt4o_completeness"}
    with pytest.raises(ValueError, match="symmetric coverage"):
        runner._compute_kappa_weights(
            sidecar,
            "completeness",
            expected_judge_ids=expected_judges,
        )