Spaces:

Nomearod
/

agentbench

Running

Nomearod Claude Opus 4.7 (1M context) committed 4 days ago

Commit

8ef480a

1 Parent(s): 3a2ed35

feat(calibration): 30-item stratified calibration_v1 sample

Stratified across FastAPI (categorized) + K8s (CRAG question_types)
per the design doc's sampling table. 26 items from explicit strata +
4 spare slots from K8s simple_w_condition / multi_hop / comparison /
false_premise (highest-variance R@5 strata in pre-judge runs). Sample
seed 20260504 (date-derived) so the sampling is reproducible.

The spare pool was widened from {simple_w_condition, multi_hop} to
also include {comparison, false_premise} because the K8s golden set
holds only 4 simple_w_condition and 6 multi_hop items, of which the
explicit targets already consumed 7, leaving 3 in the original pool —
one short of the 4 spares the design's 30-item total requires.

system_config_git_sha pins the commit producing the sample; the soon-
to-be-generated system_outputs file references this SHA so any drift
in the pipeline between sample generation and output generation is
detectable in the calibration report.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

Files changed (2) hide show

agent_bench/evaluation/datasets/calibration_v1.json +158 -0
scripts/_dev/sample_calibration_v1.py +115 -0

agent_bench/evaluation/datasets/calibration_v1.json ADDED Viewed

	@@ -0,0 +1,158 @@

+{
+  "version": "v1",
+  "system_config_git_sha": "3a2ed359eb16437cf95987b1fca47281a37fb74c",
+  "sample_seed": 20260504,
+  "notes": "30-item stratified calibration set per the design doc. Spare slots filled from K8s simple_w_condition, multi_hop, comparison, false_premise (typically highest-variance R@5 strata).",
+  "items": [
+    {
+      "id": "q021",
+      "corpus": "fastapi",
+      "stratum": "calculation"
+    },
+    {
+      "id": "q010",
+      "corpus": "fastapi",
+      "stratum": "out_of_scope"
+    },
+    {
+      "id": "q027",
+      "corpus": "fastapi",
+      "stratum": "out_of_scope"
+    },
+    {
+      "id": "q006",
+      "corpus": "fastapi",
+      "stratum": "retrieval"
+    },
+    {
+      "id": "q011",
+      "corpus": "fastapi",
+      "stratum": "retrieval"
+    },
+    {
+      "id": "q012",
+      "corpus": "fastapi",
+      "stratum": "retrieval"
+    },
+    {
+      "id": "q023",
+      "corpus": "fastapi",
+      "stratum": "retrieval"
+    },
+    {
+      "id": "q025",
+      "corpus": "fastapi",
+      "stratum": "retrieval"
+    },
+    {
+      "id": "k8s_002",
+      "corpus": "k8s",
+      "stratum": "comparison"
+    },
+    {
+      "id": "k8s_014",
+      "corpus": "k8s",
+      "stratum": "comparison"
+    },
+    {
+      "id": "k8s_016",
+      "corpus": "k8s",
+      "stratum": "comparison"
+    },
+    {
+      "id": "k8s_004",
+      "corpus": "k8s",
+      "stratum": "false_premise"
+    },
+    {
+      "id": "k8s_022",
+      "corpus": "k8s",
+      "stratum": "false_premise"
+    },
+    {
+      "id": "k8s_024",
+      "corpus": "k8s",
+      "stratum": "false_premise"
+    },
+    {
+      "id": "k8s_003",
+      "corpus": "k8s",
+      "stratum": "multi_hop"
+    },
+    {
+      "id": "k8s_017",
+      "corpus": "k8s",
+      "stratum": "multi_hop"
+    },
+    {
+      "id": "k8s_018",
+      "corpus": "k8s",
+      "stratum": "multi_hop"
+    },
+    {
+      "id": "k8s_019",
+      "corpus": "k8s",
+      "stratum": "multi_hop"
+    },
+    {
+      "id": "k8s_025",
+      "corpus": "k8s",
+      "stratum": "set"
+    },
+    {
+      "id": "k8s_001",
+      "corpus": "k8s",
+      "stratum": "simple"
+    },
+    {
+      "id": "k8s_006",
+      "corpus": "k8s",
+      "stratum": "simple"
+    },
+    {
+      "id": "k8s_007",
+      "corpus": "k8s",
+      "stratum": "simple"
+    },
+    {
+      "id": "k8s_009",
+      "corpus": "k8s",
+      "stratum": "simple"
+    },
+    {
+      "id": "k8s_005",
+      "corpus": "k8s",
+      "stratum": "simple_w_condition"
+    },
+    {
+      "id": "k8s_012",
+      "corpus": "k8s",
+      "stratum": "simple_w_condition"
+    },
+    {
+      "id": "k8s_013",
+      "corpus": "k8s",
+      "stratum": "simple_w_condition"
+    },
+    {
+      "id": "k8s_015",
+      "corpus": "k8s",
+      "stratum": "spare_comparison"
+    },
+    {
+      "id": "k8s_023",
+      "corpus": "k8s",
+      "stratum": "spare_false_premise"
+    },
+    {
+      "id": "k8s_020",
+      "corpus": "k8s",
+      "stratum": "spare_multi_hop"
+    },
+    {
+      "id": "k8s_011",
+      "corpus": "k8s",
+      "stratum": "spare_simple_w_condition"
+    }
+  ]
+}

scripts/_dev/sample_calibration_v1.py ADDED Viewed

	@@ -0,0 +1,115 @@

+"""One-shot stratified sampler for calibration_v1.json. Run once; output
+is committed to agent_bench/evaluation/datasets/calibration_v1.json.
+The stratification target is in docs/plans/2026-05-04-judge-layer-v1-design.md
+under Calibration Methodology > Stratified sampling.
+"""
+from __future__ import annotations
+import json
+import random
+import subprocess
+from pathlib import Path
+REPO = Path(__file__).resolve().parents[2]  # repo root when this file lives at scripts/_dev/
+FASTAPI_PATH = REPO / "agent_bench/evaluation/datasets/tech_docs_golden.json"  # main() iterates the top-level JSON value as question records
+K8S_PATH = REPO / "agent_bench/evaluation/datasets/k8s_golden.json"  # main() reads the question records from its "questions" key
+OUTPUT = REPO / "agent_bench/evaluation/datasets/calibration_v1.json"  # overwritten by main()
+SEED = 20260504  # date-derived; deterministic across runs
+FASTAPI_TARGETS = {"retrieval": 5, "calculation": 1, "out_of_scope": 2}  # items to draw per FastAPI "category" (8 total)
+K8S_TARGETS = {  # items to draw per K8s "question_type" (18 total)
+    "simple": 4,
+    "simple_w_condition": 3,
+    "comparison": 3,
+    "multi_hop": 4,
+    "false_premise": 3,
+    "set": 1,
+}
+SPARE_TOTAL = 4  # extra K8s items drawn after the explicit strata; 8 + 18 + 4 = 30
+def main() -> None:
+    rng = random.Random(SEED)
+    fastapi = json.loads(FASTAPI_PATH.read_text())
+    k8s = json.loads(K8S_PATH.read_text())["questions"]
+    selected: list[dict] = []
+    by_cat: dict[str, list[dict]] = {}
+    for q in fastapi:
+        by_cat.setdefault(q["category"], []).append(q)
+    for cat, n in FASTAPI_TARGETS.items():
+        pool = by_cat.get(cat, [])
+        if len(pool) < n:
+            raise SystemExit(f"FastAPI {cat}: have {len(pool)}, need {n}")
+        sample = rng.sample(pool, n)
+        for q in sample:
+            selected.append({"id": q["id"], "corpus": "fastapi", "stratum": cat})
+    by_qt: dict[str, list[dict]] = {}
+    for q in k8s:
+        by_qt.setdefault(q.get("question_type", "?"), []).append(q)
+    for qt, n in K8S_TARGETS.items():
+        pool = by_qt.get(qt, [])
+        if len(pool) < n:
+            raise SystemExit(f"K8s {qt}: have {len(pool)}, need {n}")
+        sample = rng.sample(pool, n)
+        for q in sample:
+            selected.append({"id": q["id"], "corpus": "k8s", "stratum": qt})
+    # Spare slots — fill from highest-variance K8s strata. Original target
+    # was simple_w_condition + multi_hop; expanded to include comparison and
+    # false_premise because the K8s golden set has only 4 simple_w_condition
+    # and 6 multi_hop items, of which Targets already consumed 7, leaving
+    # only 3 in the original pool. Adding comparison/false_premise gives
+    # enough headroom for 4 spares.
+    selected_ids = {s["id"] for s in selected}
+    spare_pool: list[dict] = [
+        q
+        for q in k8s
+        if q.get("question_type")
+        in ("simple_w_condition", "multi_hop", "comparison", "false_premise")
+        and q["id"] not in selected_ids
+    ]
+    if len(spare_pool) < SPARE_TOTAL:
+        raise SystemExit(
+            f"Spare pool exhausted: have {len(spare_pool)}, need {SPARE_TOTAL}"
+        )
+    spare = rng.sample(spare_pool, SPARE_TOTAL)
+    for q in spare:
+        selected.append(
+            {
+                "id": q["id"],
+                "corpus": "k8s",
+                "stratum": f"spare_{q['question_type']}",
+            }
+        )
+    if len(selected) != 30:
+        raise SystemExit(f"Expected 30 items; got {len(selected)}")
+    sha = subprocess.check_output(
+        ["git", "rev-parse", "HEAD"], cwd=REPO, text=True
+    ).strip()
+    out = {
+        "version": "v1",
+        "system_config_git_sha": sha,
+        "sample_seed": SEED,
+        "notes": (
+            "30-item stratified calibration set per the design doc. "
+            "Spare slots filled from K8s simple_w_condition and multi_hop "
+            "(typically highest-variance R@5 strata)."
+        ),
+        "items": sorted(selected, key=lambda s: (s["corpus"], s["stratum"], s["id"])),
+    }
+    OUTPUT.write_text(json.dumps(out, indent=2) + "\n")
+    print(f"Wrote {OUTPUT} with {len(selected)} items; git_sha={sha[:12]}")
+if __name__ == "__main__":
+    main()