Spaces:
Running
Running
feat: add reproducible retrieval gate check with committed artifact
Browse files
- scripts/verify_retrieval.py: runs golden questions against the store,
outputs markdown table with per-question Recall@5 and PASS/FAIL
- docs/retrieval_gate.md: committed gate artifact showing 1.00 Recall@5
across all 7 positive questions (Day 4 gate proven)
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
- docs/retrieval_gate.md +19 -0
- scripts/verify_retrieval.py +112 -0
docs/retrieval_gate.md
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Retrieval Gate Check
|
| 2 |
+
|
| 3 |
+
**Store:** 207 chunks, 16 sources
|
| 4 |
+
|
| 5 |
+
| ID | Category | Expected Source | Top-5 Sources | Recall@5 | Result |
|
| 6 |
+
|-----|----------|----------------|---------------|----------|--------|
|
| 7 |
+
| q001 | retrieval | fastapi_path_params.md | fastapi_path_params.md, fastapi_query_params.md, fastapi_request_body.md | 1.00 | PASS |
|
| 8 |
+
| q002 | retrieval | fastapi_pagination.md | fastapi_pagination.md, fastapi_path_params.md | 1.00 | PASS |
|
| 9 |
+
| q003 | retrieval | fastapi_middleware.md | fastapi_middleware.md | 1.00 | PASS |
|
| 10 |
+
| q004 | retrieval | fastapi_security.md | fastapi_security.md | 1.00 | PASS |
|
| 11 |
+
| q005 | retrieval | fastapi_deployment.md | fastapi_deployment.md | 1.00 | PASS |
|
| 12 |
+
| q006 | retrieval | fastapi_dependencies.md | fastapi_dependencies.md | 1.00 | PASS |
|
| 13 |
+
| q007 | calculation | fastapi_pagination.md | fastapi_pagination.md | 1.00 | PASS |
|
| 14 |
+
| q008 | out_of_scope | (none) | fastapi_deployment.md, fastapi_intro.md, fastapi_openapi.md | n/a | N/A |
|
| 15 |
+
| q009 | out_of_scope | (none) | fastapi_websockets.md, fastapi_background_tasks.md | n/a | N/A |
|
| 16 |
+
| q010 | out_of_scope | (none) | fastapi_openapi.md, fastapi_response_model.md | n/a | N/A |
|
| 17 |
+
|
| 18 |
+
**Avg Recall@5 (positive only):** 1.00
|
| 19 |
+
**Gate:** PASS (threshold >= 0.5)
|
scripts/verify_retrieval.py
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Verify retrieval quality against golden dataset.
|
| 2 |
+
|
| 3 |
+
Runs the Day 4 gate check: for each positive golden question,
|
| 4 |
+
does hybrid retrieval return the expected source in top-5?
|
| 5 |
+
|
| 6 |
+
Usage:
|
| 7 |
+
python scripts/verify_retrieval.py
|
| 8 |
+
python scripts/verify_retrieval.py --store-path .cache/store --output docs/retrieval_gate.md
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
from __future__ import annotations
|
| 12 |
+
|
| 13 |
+
import argparse
|
| 14 |
+
import json
|
| 15 |
+
import sys
|
| 16 |
+
from pathlib import Path
|
| 17 |
+
|
| 18 |
+
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
| 19 |
+
|
| 20 |
+
from agent_bench.rag.embedder import Embedder
|
| 21 |
+
from agent_bench.rag.store import HybridStore
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def verify(
    store_path: str = ".cache/store",
    golden_path: str = "agent_bench/evaluation/datasets/tech_docs_golden.json",
    model_name: str = "all-MiniLM-L6-v2",
    cache_dir: str = ".cache/embeddings",
    output_path: str | None = None,
) -> bool:
    """Run the retrieval gate check and render the result as a markdown report.

    Loads the store from ``store_path``, embeds every golden question with
    ``Embedder`` and runs a hybrid search for the top 5 chunks. For questions
    that list expected sources, recall is the fraction of expected sources
    found among the retrieved chunks' sources. Questions with no expected
    sources are reported as ``N/A`` and excluded from the average.

    Args:
        store_path: Location passed to ``HybridStore.load``.
        golden_path: JSON file holding a list of question objects; each one
            is read for the keys ``id``, ``question``, ``expected_sources``
            and ``category``.
        model_name: Embedding model name forwarded to ``Embedder``.
        cache_dir: Embedding cache directory forwarded to ``Embedder``.
        output_path: When truthy, the report is also written to this path
            (parent directories are created as needed).

    Returns:
        True when the average recall over scorable questions is at least
        0.5, False otherwise (including when no question is scorable).
    """
    top_k = 5
    threshold = 0.5

    store = HybridStore.load(store_path)
    embedder = Embedder(model_name=model_name, cache_dir=cache_dir)

    # JSON is UTF-8 by specification; do not rely on the platform default.
    with open(golden_path, encoding="utf-8") as f:
        questions = json.load(f)

    stats = store.stats()
    lines: list[str] = [
        "# Retrieval Gate Check",
        "",
        f"**Store:** {stats.total_chunks} chunks, {stats.unique_sources} sources",
        "",
        f"| ID | Category | Expected Source | Top-{top_k} Sources | Recall@{top_k} | Result |",
        "|-----|----------|----------------|---------------|----------|--------|",
    ]

    total_recall = 0.0
    scorable = 0

    for q in questions:
        qid = q["id"]
        question = q["question"]
        expected = set(q["expected_sources"])
        category = q["category"]

        vec = embedder.embed(question)
        results = store.search(vec, question, top_k=top_k, strategy="hybrid")
        retrieved = [r.chunk.source for r in results]

        if expected:
            recall = len(expected & set(retrieved)) / len(expected)
            total_recall += recall
            scorable += 1
            recall_str = f"{recall:.2f}"
            result = "PASS" if recall >= threshold else "FAIL"
        else:
            # Out-of-scope question: nothing to recall, so it is not scored.
            recall_str = "n/a"
            result = "N/A"

        expected_str = ", ".join(sorted(expected)) if expected else "(none)"
        # Show every distinct source among the top-k results, in rank order,
        # so the column always contains the sources the recall was computed
        # from (several chunks may share one source file).
        retrieved_str = ", ".join(dict.fromkeys(retrieved))
        lines.append(
            f"| {qid} | {category} | {expected_str} | {retrieved_str} | {recall_str} | {result} |"
        )

    # With no scorable question the average is 0.0, which fails the gate.
    avg_recall = total_recall / max(scorable, 1)
    gate_pass = avg_recall >= threshold

    lines.append("")
    lines.append(f"**Avg Recall@{top_k} (positive only):** {avg_recall:.2f}")
    lines.append(f"**Gate:** {'PASS' if gate_pass else 'FAIL'} (threshold >= {threshold})")

    report = "\n".join(lines)
    print(report)

    if output_path:
        target = Path(output_path)
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_text(report + "\n", encoding="utf-8")
        print(f"\nSaved to {output_path}")

    return gate_pass
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def main() -> None:
    """Command-line entry point for the retrieval gate check.

    Parses ``--store-path``, ``--golden-path`` and ``--output``, runs
    ``verify`` with them, and exits with status 0 when the gate passes
    or 1 when it fails.
    """
    cli = argparse.ArgumentParser(description="Verify retrieval against golden dataset")
    option_defaults = (
        ("--store-path", ".cache/store"),
        ("--golden-path", "agent_bench/evaluation/datasets/tech_docs_golden.json"),
        ("--output", "docs/retrieval_gate.md"),
    )
    for flag, default in option_defaults:
        cli.add_argument(flag, default=default)
    opts = cli.parse_args()

    gate_ok = verify(
        store_path=opts.store_path,
        golden_path=opts.golden_path,
        output_path=opts.output,
    )
    if gate_ok:
        sys.exit(0)
    sys.exit(1)
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
if __name__ == "__main__":
|
| 112 |
+
main()
|