Spaces:
Running
feat(calibration): 30-item stratified calibration_v1 sample
Browse files
Stratified across FastAPI (categorized) + K8s (CRAG question_types)
per the design doc's sampling table. 26 items from explicit strata +
4 spare slots from K8s simple_w_condition / multi_hop / comparison /
false_premise (highest-variance R@5 strata in pre-judge runs). Sample
seed 20260504 (date-derived) so the sampling is reproducible.
The spare pool was widened from {simple_w_condition, multi_hop} to
also include {comparison, false_premise} because the K8s golden set
holds only 4 simple_w_condition and 6 multi_hop items, of which the
explicit targets already consumed 7, leaving 3 in the original pool —
one short of the 4 spares the design's 30-item total requires.
system_config_git_sha pins the commit producing the sample; the soon-
to-be-generated system_outputs file references this SHA so any drift
in the pipeline between sample generation and output generation is
detectable in the calibration report.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
|
@@ -0,0 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"version": "v1",
|
| 3 |
+
"system_config_git_sha": "3a2ed359eb16437cf95987b1fca47281a37fb74c",
|
| 4 |
+
"sample_seed": 20260504,
|
| 5 |
+
"notes": "30-item stratified calibration set per the design doc. Spare slots filled from K8s simple_w_condition, multi_hop, comparison and false_premise (typically highest-variance R@5 strata).",
|
| 6 |
+
"items": [
|
| 7 |
+
{
|
| 8 |
+
"id": "q021",
|
| 9 |
+
"corpus": "fastapi",
|
| 10 |
+
"stratum": "calculation"
|
| 11 |
+
},
|
| 12 |
+
{
|
| 13 |
+
"id": "q010",
|
| 14 |
+
"corpus": "fastapi",
|
| 15 |
+
"stratum": "out_of_scope"
|
| 16 |
+
},
|
| 17 |
+
{
|
| 18 |
+
"id": "q027",
|
| 19 |
+
"corpus": "fastapi",
|
| 20 |
+
"stratum": "out_of_scope"
|
| 21 |
+
},
|
| 22 |
+
{
|
| 23 |
+
"id": "q006",
|
| 24 |
+
"corpus": "fastapi",
|
| 25 |
+
"stratum": "retrieval"
|
| 26 |
+
},
|
| 27 |
+
{
|
| 28 |
+
"id": "q011",
|
| 29 |
+
"corpus": "fastapi",
|
| 30 |
+
"stratum": "retrieval"
|
| 31 |
+
},
|
| 32 |
+
{
|
| 33 |
+
"id": "q012",
|
| 34 |
+
"corpus": "fastapi",
|
| 35 |
+
"stratum": "retrieval"
|
| 36 |
+
},
|
| 37 |
+
{
|
| 38 |
+
"id": "q023",
|
| 39 |
+
"corpus": "fastapi",
|
| 40 |
+
"stratum": "retrieval"
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
"id": "q025",
|
| 44 |
+
"corpus": "fastapi",
|
| 45 |
+
"stratum": "retrieval"
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"id": "k8s_002",
|
| 49 |
+
"corpus": "k8s",
|
| 50 |
+
"stratum": "comparison"
|
| 51 |
+
},
|
| 52 |
+
{
|
| 53 |
+
"id": "k8s_014",
|
| 54 |
+
"corpus": "k8s",
|
| 55 |
+
"stratum": "comparison"
|
| 56 |
+
},
|
| 57 |
+
{
|
| 58 |
+
"id": "k8s_016",
|
| 59 |
+
"corpus": "k8s",
|
| 60 |
+
"stratum": "comparison"
|
| 61 |
+
},
|
| 62 |
+
{
|
| 63 |
+
"id": "k8s_004",
|
| 64 |
+
"corpus": "k8s",
|
| 65 |
+
"stratum": "false_premise"
|
| 66 |
+
},
|
| 67 |
+
{
|
| 68 |
+
"id": "k8s_022",
|
| 69 |
+
"corpus": "k8s",
|
| 70 |
+
"stratum": "false_premise"
|
| 71 |
+
},
|
| 72 |
+
{
|
| 73 |
+
"id": "k8s_024",
|
| 74 |
+
"corpus": "k8s",
|
| 75 |
+
"stratum": "false_premise"
|
| 76 |
+
},
|
| 77 |
+
{
|
| 78 |
+
"id": "k8s_003",
|
| 79 |
+
"corpus": "k8s",
|
| 80 |
+
"stratum": "multi_hop"
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"id": "k8s_017",
|
| 84 |
+
"corpus": "k8s",
|
| 85 |
+
"stratum": "multi_hop"
|
| 86 |
+
},
|
| 87 |
+
{
|
| 88 |
+
"id": "k8s_018",
|
| 89 |
+
"corpus": "k8s",
|
| 90 |
+
"stratum": "multi_hop"
|
| 91 |
+
},
|
| 92 |
+
{
|
| 93 |
+
"id": "k8s_019",
|
| 94 |
+
"corpus": "k8s",
|
| 95 |
+
"stratum": "multi_hop"
|
| 96 |
+
},
|
| 97 |
+
{
|
| 98 |
+
"id": "k8s_025",
|
| 99 |
+
"corpus": "k8s",
|
| 100 |
+
"stratum": "set"
|
| 101 |
+
},
|
| 102 |
+
{
|
| 103 |
+
"id": "k8s_001",
|
| 104 |
+
"corpus": "k8s",
|
| 105 |
+
"stratum": "simple"
|
| 106 |
+
},
|
| 107 |
+
{
|
| 108 |
+
"id": "k8s_006",
|
| 109 |
+
"corpus": "k8s",
|
| 110 |
+
"stratum": "simple"
|
| 111 |
+
},
|
| 112 |
+
{
|
| 113 |
+
"id": "k8s_007",
|
| 114 |
+
"corpus": "k8s",
|
| 115 |
+
"stratum": "simple"
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
"id": "k8s_009",
|
| 119 |
+
"corpus": "k8s",
|
| 120 |
+
"stratum": "simple"
|
| 121 |
+
},
|
| 122 |
+
{
|
| 123 |
+
"id": "k8s_005",
|
| 124 |
+
"corpus": "k8s",
|
| 125 |
+
"stratum": "simple_w_condition"
|
| 126 |
+
},
|
| 127 |
+
{
|
| 128 |
+
"id": "k8s_012",
|
| 129 |
+
"corpus": "k8s",
|
| 130 |
+
"stratum": "simple_w_condition"
|
| 131 |
+
},
|
| 132 |
+
{
|
| 133 |
+
"id": "k8s_013",
|
| 134 |
+
"corpus": "k8s",
|
| 135 |
+
"stratum": "simple_w_condition"
|
| 136 |
+
},
|
| 137 |
+
{
|
| 138 |
+
"id": "k8s_015",
|
| 139 |
+
"corpus": "k8s",
|
| 140 |
+
"stratum": "spare_comparison"
|
| 141 |
+
},
|
| 142 |
+
{
|
| 143 |
+
"id": "k8s_023",
|
| 144 |
+
"corpus": "k8s",
|
| 145 |
+
"stratum": "spare_false_premise"
|
| 146 |
+
},
|
| 147 |
+
{
|
| 148 |
+
"id": "k8s_020",
|
| 149 |
+
"corpus": "k8s",
|
| 150 |
+
"stratum": "spare_multi_hop"
|
| 151 |
+
},
|
| 152 |
+
{
|
| 153 |
+
"id": "k8s_011",
|
| 154 |
+
"corpus": "k8s",
|
| 155 |
+
"stratum": "spare_simple_w_condition"
|
| 156 |
+
}
|
| 157 |
+
]
|
| 158 |
+
}
|
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""One-shot stratified sampler for calibration_v1.json. Run once; output
|
| 2 |
+
is committed to agent_bench/evaluation/datasets/calibration_v1.json.
|
| 3 |
+
|
| 4 |
+
The stratification target is in docs/plans/2026-05-04-judge-layer-v1-design.md
|
| 5 |
+
under Calibration Methodology > Stratified sampling.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
import json
|
| 11 |
+
import random
|
| 12 |
+
import subprocess
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
|
| 15 |
+
# Root used to anchor every path below: two directories above the one that
# contains this script (after resolving symlinks).
REPO = Path(__file__).resolve().parents[2]

# Inputs: the two golden question sets that main() samples from.
FASTAPI_PATH = REPO / "agent_bench" / "evaluation" / "datasets" / "tech_docs_golden.json"
K8S_PATH = REPO / "agent_bench" / "evaluation" / "datasets" / "k8s_golden.json"
# Output: the calibration sample that main() writes.
OUTPUT = REPO / "agent_bench" / "evaluation" / "datasets" / "calibration_v1.json"

# Fixed RNG seed (derived from the date 2026-05-04) so reruns draw the same sample.
SEED = 20260504

# Items to draw per FastAPI stratum; keys are matched against each
# question's "category" field.
FASTAPI_TARGETS = {
    "retrieval": 5,
    "calculation": 1,
    "out_of_scope": 2,
}

# Items to draw per K8s stratum; keys are matched against each question's
# "question_type" field. Insertion order matters: main() samples strata in
# this order from a single seeded RNG.
K8S_TARGETS = {
    "simple": 4,
    "simple_w_condition": 3,
    "comparison": 3,
    "multi_hop": 4,
    "false_premise": 3,
    "set": 1,
}

# Extra K8s items drawn after the explicit strata above have been filled.
SPARE_TOTAL = 4
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
# K8s question types eligible for the spare slots. The pool was widened from
# simple_w_condition + multi_hop to also include comparison and false_premise:
# per the original note, the K8s golden set has only 4 simple_w_condition and
# 6 multi_hop items, of which the explicit targets already consume 7, leaving
# 3 in the original pool -- one short of the 4 spares required.
_SPARE_STRATA = ("simple_w_condition", "multi_hop", "comparison", "false_premise")


def _draw_strata(
    rng: random.Random,
    pools: dict[str, list[dict]],
    targets: dict[str, int],
    *,
    corpus: str,
    label: str,
) -> list[dict]:
    """Sample ``targets[stratum]`` questions from each stratum's pool.

    Strata are visited in ``targets`` insertion order so that the sequence of
    RNG draws (and therefore the seeded result) is stable.

    Args:
        rng: Seeded generator shared by all draws.
        pools: Questions grouped by stratum name.
        targets: Number of questions wanted per stratum.
        corpus: Value written to each selected item's ``"corpus"`` field.
        label: Human-readable corpus name used in the error message.

    Returns:
        One ``{"id", "corpus", "stratum"}`` dict per sampled question.

    Raises:
        SystemExit: If a stratum holds fewer questions than requested.
    """
    picked: list[dict] = []
    for stratum, want in targets.items():
        pool = pools.get(stratum, [])
        if len(pool) < want:
            raise SystemExit(f"{label} {stratum}: have {len(pool)}, need {want}")
        picked.extend(
            {"id": q["id"], "corpus": corpus, "stratum": stratum}
            for q in rng.sample(pool, want)
        )
    return picked


def main() -> None:
    """Draw the stratified calibration sample and write it to ``OUTPUT``.

    Reads the FastAPI and K8s golden sets, samples the per-stratum targets
    with a generator seeded by ``SEED``, tops up with ``SPARE_TOTAL`` spare
    K8s items, and writes the result (plus the current git HEAD SHA) as JSON.

    Raises:
        SystemExit: If any stratum or the spare pool is too small, or if the
            final selection does not contain exactly 30 items.
    """
    rng = random.Random(SEED)

    # Explicit UTF-8: the locale default is not guaranteed to be UTF-8.
    fastapi = json.loads(FASTAPI_PATH.read_text(encoding="utf-8"))
    k8s = json.loads(K8S_PATH.read_text(encoding="utf-8"))["questions"]

    by_cat: dict[str, list[dict]] = {}
    for q in fastapi:
        by_cat.setdefault(q["category"], []).append(q)

    # K8s questions without a question_type are bucketed under "?", which no
    # target names, so they are never sampled.
    by_qt: dict[str, list[dict]] = {}
    for q in k8s:
        by_qt.setdefault(q.get("question_type", "?"), []).append(q)

    # Order of draws is FastAPI strata, then K8s strata, then spares; changing
    # it would change the sample produced by the fixed seed.
    selected: list[dict] = []
    selected += _draw_strata(rng, by_cat, FASTAPI_TARGETS, corpus="fastapi", label="FastAPI")
    selected += _draw_strata(rng, by_qt, K8S_TARGETS, corpus="k8s", label="K8s")

    # Spare slots: remaining (not yet selected) K8s questions from the
    # eligible strata, kept in golden-set order before sampling.
    taken_ids = {item["id"] for item in selected}
    spare_pool: list[dict] = [
        q
        for q in k8s
        if q.get("question_type") in _SPARE_STRATA and q["id"] not in taken_ids
    ]
    if len(spare_pool) < SPARE_TOTAL:
        raise SystemExit(
            f"Spare pool exhausted: have {len(spare_pool)}, need {SPARE_TOTAL}"
        )
    selected.extend(
        {
            "id": q["id"],
            "corpus": "k8s",
            "stratum": f"spare_{q['question_type']}",
        }
        for q in rng.sample(spare_pool, SPARE_TOTAL)
    )

    if len(selected) != 30:
        raise SystemExit(f"Expected 30 items; got {len(selected)}")

    # Records the commit at HEAD; uncommitted working-tree changes are not
    # reflected in this SHA.
    sha = subprocess.check_output(
        ["git", "rev-parse", "HEAD"], cwd=REPO, text=True
    ).strip()

    out = {
        "version": "v1",
        "system_config_git_sha": sha,
        "sample_seed": SEED,
        # Names every stratum in _SPARE_STRATA so the metadata matches the
        # spare_* items actually emitted.
        "notes": (
            "30-item stratified calibration set per the design doc. "
            "Spare slots filled from K8s simple_w_condition, multi_hop, "
            "comparison and false_premise "
            "(typically highest-variance R@5 strata)."
        ),
        "items": sorted(selected, key=lambda s: (s["corpus"], s["stratum"], s["id"])),
    }
    OUTPUT.write_text(json.dumps(out, indent=2) + "\n", encoding="utf-8")
    print(f"Wrote {OUTPUT} with {len(selected)} items; git_sha={sha[:12]}")


if __name__ == "__main__":
    main()
|