Nomearod Claude Opus 4.7 (1M context) committed on
Commit
8ef480a
·
1 Parent(s): 3a2ed35

feat(calibration): 30-item stratified calibration_v1 sample

Browse files

Stratified across FastAPI (categorized) + K8s (CRAG question_types)
per the design doc's sampling table. 26 items from explicit strata +
4 spare slots from K8s simple_w_condition / multi_hop / comparison /
false_premise (highest-variance R@5 strata in pre-judge runs). Sample
seed 20260504 (date-derived) so the sampling is reproducible.

The spare pool was widened from {simple_w_condition, multi_hop} to
also include {comparison, false_premise} because the K8s golden set
holds only 4 simple_w_condition and 6 multi_hop items, of which the
explicit targets already consumed 7, leaving 3 in the original pool —
one short of the 4 spares the design's 30-item total requires.

system_config_git_sha pins the commit producing the sample; the soon-
to-be-generated system_outputs file references this SHA so any drift
in the pipeline between sample generation and output generation is
detectable in the calibration report.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

agent_bench/evaluation/datasets/calibration_v1.json ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "v1",
3
+ "system_config_git_sha": "3a2ed359eb16437cf95987b1fca47281a37fb74c",
4
+ "sample_seed": 20260504,
5
+ "notes": "30-item stratified calibration set per the design doc. Spare slots filled from K8s simple_w_condition, multi_hop, comparison, and false_premise (typically highest-variance R@5 strata).",
6
+ "items": [
7
+ {
8
+ "id": "q021",
9
+ "corpus": "fastapi",
10
+ "stratum": "calculation"
11
+ },
12
+ {
13
+ "id": "q010",
14
+ "corpus": "fastapi",
15
+ "stratum": "out_of_scope"
16
+ },
17
+ {
18
+ "id": "q027",
19
+ "corpus": "fastapi",
20
+ "stratum": "out_of_scope"
21
+ },
22
+ {
23
+ "id": "q006",
24
+ "corpus": "fastapi",
25
+ "stratum": "retrieval"
26
+ },
27
+ {
28
+ "id": "q011",
29
+ "corpus": "fastapi",
30
+ "stratum": "retrieval"
31
+ },
32
+ {
33
+ "id": "q012",
34
+ "corpus": "fastapi",
35
+ "stratum": "retrieval"
36
+ },
37
+ {
38
+ "id": "q023",
39
+ "corpus": "fastapi",
40
+ "stratum": "retrieval"
41
+ },
42
+ {
43
+ "id": "q025",
44
+ "corpus": "fastapi",
45
+ "stratum": "retrieval"
46
+ },
47
+ {
48
+ "id": "k8s_002",
49
+ "corpus": "k8s",
50
+ "stratum": "comparison"
51
+ },
52
+ {
53
+ "id": "k8s_014",
54
+ "corpus": "k8s",
55
+ "stratum": "comparison"
56
+ },
57
+ {
58
+ "id": "k8s_016",
59
+ "corpus": "k8s",
60
+ "stratum": "comparison"
61
+ },
62
+ {
63
+ "id": "k8s_004",
64
+ "corpus": "k8s",
65
+ "stratum": "false_premise"
66
+ },
67
+ {
68
+ "id": "k8s_022",
69
+ "corpus": "k8s",
70
+ "stratum": "false_premise"
71
+ },
72
+ {
73
+ "id": "k8s_024",
74
+ "corpus": "k8s",
75
+ "stratum": "false_premise"
76
+ },
77
+ {
78
+ "id": "k8s_003",
79
+ "corpus": "k8s",
80
+ "stratum": "multi_hop"
81
+ },
82
+ {
83
+ "id": "k8s_017",
84
+ "corpus": "k8s",
85
+ "stratum": "multi_hop"
86
+ },
87
+ {
88
+ "id": "k8s_018",
89
+ "corpus": "k8s",
90
+ "stratum": "multi_hop"
91
+ },
92
+ {
93
+ "id": "k8s_019",
94
+ "corpus": "k8s",
95
+ "stratum": "multi_hop"
96
+ },
97
+ {
98
+ "id": "k8s_025",
99
+ "corpus": "k8s",
100
+ "stratum": "set"
101
+ },
102
+ {
103
+ "id": "k8s_001",
104
+ "corpus": "k8s",
105
+ "stratum": "simple"
106
+ },
107
+ {
108
+ "id": "k8s_006",
109
+ "corpus": "k8s",
110
+ "stratum": "simple"
111
+ },
112
+ {
113
+ "id": "k8s_007",
114
+ "corpus": "k8s",
115
+ "stratum": "simple"
116
+ },
117
+ {
118
+ "id": "k8s_009",
119
+ "corpus": "k8s",
120
+ "stratum": "simple"
121
+ },
122
+ {
123
+ "id": "k8s_005",
124
+ "corpus": "k8s",
125
+ "stratum": "simple_w_condition"
126
+ },
127
+ {
128
+ "id": "k8s_012",
129
+ "corpus": "k8s",
130
+ "stratum": "simple_w_condition"
131
+ },
132
+ {
133
+ "id": "k8s_013",
134
+ "corpus": "k8s",
135
+ "stratum": "simple_w_condition"
136
+ },
137
+ {
138
+ "id": "k8s_015",
139
+ "corpus": "k8s",
140
+ "stratum": "spare_comparison"
141
+ },
142
+ {
143
+ "id": "k8s_023",
144
+ "corpus": "k8s",
145
+ "stratum": "spare_false_premise"
146
+ },
147
+ {
148
+ "id": "k8s_020",
149
+ "corpus": "k8s",
150
+ "stratum": "spare_multi_hop"
151
+ },
152
+ {
153
+ "id": "k8s_011",
154
+ "corpus": "k8s",
155
+ "stratum": "spare_simple_w_condition"
156
+ }
157
+ ]
158
+ }
scripts/_dev/sample_calibration_v1.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """One-shot stratified sampler for calibration_v1.json. Run once; output
2
+ is committed to agent_bench/evaluation/datasets/calibration_v1.json.
3
+
4
+ The stratification target is in docs/plans/2026-05-04-judge-layer-v1-design.md
5
+ under Calibration Methodology > Stratified sampling.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import json
11
+ import random
12
+ import subprocess
13
+ from pathlib import Path
14
+
15
# Repository root: this script sits two directories below it (scripts/_dev/).
REPO = Path(__file__).resolve().parents[2]

# Golden question sets that are sampled from, and the file the sample is
# written to.
FASTAPI_PATH = REPO.joinpath("agent_bench/evaluation/datasets/tech_docs_golden.json")
K8S_PATH = REPO.joinpath("agent_bench/evaluation/datasets/k8s_golden.json")
OUTPUT = REPO.joinpath("agent_bench/evaluation/datasets/calibration_v1.json")

# Fixed RNG seed (derived from the date 2026-05-04) so reruns pick the same
# items.
SEED = 20260504

# Number of items to draw per stratum. FastAPI strata are keyed by the
# question's "category"; K8s strata by its "question_type".
FASTAPI_TARGETS = {
    "retrieval": 5,
    "calculation": 1,
    "out_of_scope": 2,
}
K8S_TARGETS = {
    "simple": 4,
    "simple_w_condition": 3,
    "comparison": 3,
    "multi_hop": 4,
    "false_premise": 3,
    "set": 1,
}

# Extra K8s items drawn on top of the explicit per-stratum targets.
SPARE_TOTAL = 4
32
+
33
+
34
def _draw_strata(
    rng: random.Random,
    pools: dict[str, list[dict]],
    targets: dict[str, int],
    corpus: str,
    label: str,
) -> list[dict]:
    """Draw ``targets[stratum]`` questions from each stratum's pool.

    Strata are visited in ``targets`` insertion order, so for a fixed seed
    the sequence of RNG calls -- and therefore the sample -- is reproducible.

    Args:
        rng: Seeded random generator shared by all draws.
        pools: Questions grouped by stratum name.
        targets: Number of questions to draw per stratum.
        corpus: Value stored in each output record's ``"corpus"`` field.
        label: Human-readable corpus name used in the error message.

    Returns:
        One ``{"id", "corpus", "stratum"}`` record per drawn question.

    Raises:
        SystemExit: If a stratum's pool holds fewer questions than its target.
    """
    picked: list[dict] = []
    for stratum, n in targets.items():
        pool = pools.get(stratum, [])
        if len(pool) < n:
            raise SystemExit(f"{label} {stratum}: have {len(pool)}, need {n}")
        picked.extend(
            {"id": q["id"], "corpus": corpus, "stratum": stratum}
            for q in rng.sample(pool, n)
        )
    return picked


def main() -> None:
    """Build the stratified calibration sample and write it to ``OUTPUT``.

    Draws the per-stratum targets from the FastAPI and K8s golden sets, adds
    ``SPARE_TOTAL`` spare K8s items, records the current git HEAD SHA and the
    seed, and writes the result as indented JSON.

    Raises:
        SystemExit: If any pool is too small for its target, or the final
            sample does not contain exactly 30 items.
        subprocess.CalledProcessError: If ``git rev-parse HEAD`` fails.
    """
    rng = random.Random(SEED)

    # Read with an explicit encoding so the result does not depend on the
    # platform's default locale encoding.
    fastapi = json.loads(FASTAPI_PATH.read_text(encoding="utf-8"))
    k8s = json.loads(K8S_PATH.read_text(encoding="utf-8"))["questions"]

    # FastAPI questions are stratified by their "category" field.
    by_cat: dict[str, list[dict]] = {}
    for q in fastapi:
        by_cat.setdefault(q["category"], []).append(q)
    selected = _draw_strata(rng, by_cat, FASTAPI_TARGETS, "fastapi", "FastAPI")

    # K8s questions are stratified by "question_type"; questions without one
    # are grouped under "?" and never match a target.
    by_qt: dict[str, list[dict]] = {}
    for q in k8s:
        by_qt.setdefault(q.get("question_type", "?"), []).append(q)
    selected += _draw_strata(rng, by_qt, K8S_TARGETS, "k8s", "K8s")

    # Spare slots -- fill from highest-variance K8s strata. Original target
    # was simple_w_condition + multi_hop; expanded to include comparison and
    # false_premise because the K8s golden set has only 4 simple_w_condition
    # and 6 multi_hop items, of which the targets already consumed 7, leaving
    # only 3 in the original pool. Adding comparison/false_premise gives
    # enough headroom for 4 spares.
    spare_strata = (
        "simple_w_condition",
        "multi_hop",
        "comparison",
        "false_premise",
    )
    selected_ids = {s["id"] for s in selected}
    spare_pool: list[dict] = [
        q
        for q in k8s
        if q.get("question_type") in spare_strata and q["id"] not in selected_ids
    ]
    if len(spare_pool) < SPARE_TOTAL:
        raise SystemExit(
            f"Spare pool exhausted: have {len(spare_pool)}, need {SPARE_TOTAL}"
        )
    selected.extend(
        {
            "id": q["id"],
            "corpus": "k8s",
            "stratum": f"spare_{q['question_type']}",
        }
        for q in rng.sample(spare_pool, SPARE_TOTAL)
    )

    # The 30-item total is a design requirement, so it is checked against a
    # literal rather than derived from the targets: editing the targets so
    # they no longer sum to 30 must fail here.
    if len(selected) != 30:
        raise SystemExit(f"Expected 30 items; got {len(selected)}")

    # Pin the commit the sample was generated from.
    sha = subprocess.check_output(
        ["git", "rev-parse", "HEAD"], cwd=REPO, text=True
    ).strip()

    out = {
        "version": "v1",
        "system_config_git_sha": sha,
        "sample_seed": SEED,
        # Names every stratum the spare pool actually draws from.
        "notes": (
            "30-item stratified calibration set per the design doc. "
            "Spare slots filled from K8s simple_w_condition, multi_hop, "
            "comparison, and false_premise "
            "(typically highest-variance R@5 strata)."
        ),
        "items": sorted(selected, key=lambda s: (s["corpus"], s["stratum"], s["id"])),
    }
    OUTPUT.write_text(json.dumps(out, indent=2) + "\n", encoding="utf-8")
    print(f"Wrote {OUTPUT} with {len(selected)} items; git_sha={sha[:12]}")
112
+
113
+
114
# Run the sampler only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()