Nomearod and Claude Opus 4.6 (1M context) committed
Commit f0bfb5e · 1 parent: a152b95

feat: add reproducible retrieval gate check with committed artifact


- scripts/verify_retrieval.py: runs golden questions against the store,
outputs markdown table with per-question Recall@5 and PASS/FAIL
- docs/retrieval_gate.md: committed gate artifact showing 1.00 Recall@5
across all 7 positive questions (Day 4 gate proven)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
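
For CI, the script is designed to gate on its exit code (0 on PASS, 1 on FAIL). A minimal pytest wrapper is sketched below; the test file name and the assumption that the store and golden dataset already exist in the CI environment are hypothetical, not part of this commit.

# tests/test_retrieval_gate.py -- hypothetical CI wrapper, not part of this commit.
# Assumes .cache/store and the golden dataset were built by an earlier CI step.
import subprocess
import sys

def test_retrieval_gate_passes() -> None:
    # verify_retrieval.py exits 0 on PASS, 1 on FAIL (see sys.exit in main()).
    proc = subprocess.run(
        [sys.executable, "scripts/verify_retrieval.py"],
        capture_output=True,
        text=True,
    )
    assert proc.returncode == 0, f"Retrieval gate failed:\n{proc.stdout}"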

Files changed (2)
  1. docs/retrieval_gate.md +19 -0
  2. scripts/verify_retrieval.py +112 -0
docs/retrieval_gate.md ADDED
@@ -0,0 +1,19 @@
+ # Retrieval Gate Check
+
+ **Store:** 207 chunks, 16 sources
+
+ | ID | Category | Expected Source | Top-5 Sources | Recall@5 | Result |
+ |-----|----------|----------------|---------------|----------|--------|
+ | q001 | retrieval | fastapi_path_params.md | fastapi_path_params.md, fastapi_query_params.md, fastapi_request_body.md | 1.00 | PASS |
+ | q002 | retrieval | fastapi_pagination.md | fastapi_pagination.md, fastapi_path_params.md | 1.00 | PASS |
+ | q003 | retrieval | fastapi_middleware.md | fastapi_middleware.md | 1.00 | PASS |
+ | q004 | retrieval | fastapi_security.md | fastapi_security.md | 1.00 | PASS |
+ | q005 | retrieval | fastapi_deployment.md | fastapi_deployment.md | 1.00 | PASS |
+ | q006 | retrieval | fastapi_dependencies.md | fastapi_dependencies.md | 1.00 | PASS |
+ | q007 | calculation | fastapi_pagination.md | fastapi_pagination.md | 1.00 | PASS |
+ | q008 | out_of_scope | (none) | fastapi_deployment.md, fastapi_intro.md, fastapi_openapi.md | n/a | N/A |
+ | q009 | out_of_scope | (none) | fastapi_websockets.md, fastapi_background_tasks.md | n/a | N/A |
+ | q010 | out_of_scope | (none) | fastapi_openapi.md, fastapi_response_model.md | n/a | N/A |
+
+ **Avg Recall@5 (positive only):** 1.00
+ **Gate:** PASS (threshold >= 0.5)
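
For reference, Recall@5 as reported above is the fraction of a question's expected sources that appear among the top-5 retrieved sources. A standalone sketch of the computation, using q001's row from the table as input:

# Recall@5 for q001, recomputed standalone from the table row above.
expected = {"fastapi_path_params.md"}
retrieved = [
    "fastapi_path_params.md",
    "fastapi_query_params.md",
    "fastapi_request_body.md",
]
recall = len(expected & set(retrieved)) / len(expected)  # 1 / 1
print(f"{recall:.2f}")  # 1.00 -> PASS at the 0.5 per-question threshold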
scripts/verify_retrieval.py ADDED
@@ -0,0 +1,112 @@
+ """Verify retrieval quality against golden dataset.
+
+ Runs the Day 4 gate check: for each positive golden question,
+ does hybrid retrieval return the expected source in top-5?
+
+ Usage:
+     python scripts/verify_retrieval.py
+     python scripts/verify_retrieval.py --store-path .cache/store --output docs/retrieval_gate.md
+ """
+
+ from __future__ import annotations
+
+ import argparse
+ import json
+ import sys
+ from pathlib import Path
+
+ sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
+
+ from agent_bench.rag.embedder import Embedder
+ from agent_bench.rag.store import HybridStore
+
+
+ def verify(
+     store_path: str = ".cache/store",
+     golden_path: str = "agent_bench/evaluation/datasets/tech_docs_golden.json",
+     model_name: str = "all-MiniLM-L6-v2",
+     cache_dir: str = ".cache/embeddings",
+     output_path: str | None = None,
+ ) -> bool:
+     store = HybridStore.load(store_path)
+     embedder = Embedder(model_name=model_name, cache_dir=cache_dir)
+
+     with open(golden_path) as f:
+         questions = json.load(f)
+
+     lines: list[str] = []
+     lines.append("# Retrieval Gate Check")
+     lines.append("")
+     lines.append(
+         f"**Store:** {store.stats().total_chunks} chunks, "
+         f"{store.stats().unique_sources} sources"
+     )
+     lines.append("")
+     lines.append("| ID | Category | Expected Source | Top-5 Sources | Recall@5 | Result |")
+     lines.append("|-----|----------|----------------|---------------|----------|--------|")
+
+     total_recall = 0.0
+     scorable = 0
+
+     for q in questions:
+         qid = q["id"]
+         question = q["question"]
+         expected = set(q["expected_sources"])
+         category = q["category"]
+
+         vec = embedder.embed(question)
+         results = store.search(vec, question, top_k=5, strategy="hybrid")
+         retrieved = [r.chunk.source for r in results]
+         retrieved_set = set(retrieved)
+
+         if expected:
+             recall = len(expected & retrieved_set) / len(expected)
+             total_recall += recall
+             scorable += 1
+             result = "PASS" if recall >= 0.5 else "FAIL"
+         else:
+             recall = float("nan")
+             result = "N/A"
+
+         expected_str = ", ".join(sorted(expected)) if expected else "(none)"
+         retrieved_str = ", ".join(dict.fromkeys(retrieved[:3]))  # dedup, first 3
+         recall_str = f"{recall:.2f}" if expected else "n/a"
+         lines.append(
+             f"| {qid} | {category} | {expected_str} | {retrieved_str} | {recall_str} | {result} |"
+         )
+
+     avg_recall = total_recall / max(scorable, 1)
+     gate_pass = avg_recall >= 0.5
+
+     lines.append("")
+     lines.append(f"**Avg Recall@5 (positive only):** {avg_recall:.2f}")
+     lines.append(f"**Gate:** {'PASS' if gate_pass else 'FAIL'} (threshold >= 0.5)")
+
+     report = "\n".join(lines)
+     print(report)
+
+     if output_path:
+         Path(output_path).parent.mkdir(parents=True, exist_ok=True)
+         Path(output_path).write_text(report + "\n")
+         print(f"\nSaved to {output_path}")
+
+     return gate_pass
+
+
+ def main() -> None:
+     parser = argparse.ArgumentParser(description="Verify retrieval against golden dataset")
+     parser.add_argument("--store-path", default=".cache/store")
+     parser.add_argument("--golden-path", default="agent_bench/evaluation/datasets/tech_docs_golden.json")
+     parser.add_argument("--output", default="docs/retrieval_gate.md")
+     args = parser.parse_args()
+
+     passed = verify(
+         store_path=args.store_path,
+         golden_path=args.golden_path,
+         output_path=args.output,
+     )
+     sys.exit(0 if passed else 1)
+
+
+ if __name__ == "__main__":
+     main()
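
The gate can also be invoked programmatically rather than via the CLI, since verify() returns a boolean instead of exiting. A hypothetical sketch (the import path assumes the call is made from the repository root):

# Hypothetical programmatic use from the repository root; verify() returns
# True/False, so callers can aggregate or log the result themselves.
from scripts.verify_retrieval import verify

passed = verify(output_path=None)  # print the report without writing the artifact
print("gate:", "PASS" if passed else "FAIL")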