Spaces:

Nomearod
/

agentbench

Running

Nomearod Claude Opus 4.6 (1M context) commited on Mar 24

Commit

f0bfb5e

1 Parent(s): a152b95

feat: add reproducible retrieval gate check with committed artifact

- scripts/verify_retrieval.py: runs golden questions against the store,
outputs markdown table with per-question Recall@5 and PASS/FAIL
- docs/retrieval_gate.md: committed gate artifact showing 1.00 Recall@5
across all 7 positive questions (Day 4 gate proven)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Files changed (2) hide show

docs/retrieval_gate.md +19 -0
scripts/verify_retrieval.py +112 -0

docs/retrieval_gate.md ADDED Viewed

	@@ -0,0 +1,19 @@

+# Retrieval Gate Check
+**Store:** 207 chunks, 16 sources
+| ID | Category | Expected Source | Top-5 Sources | Recall@5 | Result |
+|-----|----------|----------------|---------------|----------|--------|
+| q001 | retrieval | fastapi_path_params.md | fastapi_path_params.md, fastapi_query_params.md, fastapi_request_body.md | 1.00 | PASS |
+| q002 | retrieval | fastapi_pagination.md | fastapi_pagination.md, fastapi_path_params.md | 1.00 | PASS |
+| q003 | retrieval | fastapi_middleware.md | fastapi_middleware.md | 1.00 | PASS |
+| q004 | retrieval | fastapi_security.md | fastapi_security.md | 1.00 | PASS |
+| q005 | retrieval | fastapi_deployment.md | fastapi_deployment.md | 1.00 | PASS |
+| q006 | retrieval | fastapi_dependencies.md | fastapi_dependencies.md | 1.00 | PASS |
+| q007 | calculation | fastapi_pagination.md | fastapi_pagination.md | 1.00 | PASS |
+| q008 | out_of_scope | (none) | fastapi_deployment.md, fastapi_intro.md, fastapi_openapi.md | n/a | N/A |
+| q009 | out_of_scope | (none) | fastapi_websockets.md, fastapi_background_tasks.md | n/a | N/A |
+| q010 | out_of_scope | (none) | fastapi_openapi.md, fastapi_response_model.md | n/a | N/A |
+**Avg Recall@5 (positive only):** 1.00
+**Gate:** PASS (threshold >= 0.5)

scripts/verify_retrieval.py ADDED Viewed

	@@ -0,0 +1,112 @@

+"""Verify retrieval quality against golden dataset.
+Runs the Day 4 gate check: for each positive golden question,
+does hybrid retrieval return the expected source in top-5?
+Usage:
+    python scripts/verify_retrieval.py
+    python scripts/verify_retrieval.py --store-path .cache/store --output docs/retrieval_gate.md
+"""
+from __future__ import annotations
+import argparse
+import json
+import sys
+from pathlib import Path
+sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
+from agent_bench.rag.embedder import Embedder
+from agent_bench.rag.store import HybridStore
+def verify(
+    store_path: str = ".cache/store",
+    golden_path: str = "agent_bench/evaluation/datasets/tech_docs_golden.json",
+    model_name: str = "all-MiniLM-L6-v2",
+    cache_dir: str = ".cache/embeddings",
+    output_path: str | None = None,
+) -> bool:
+    store = HybridStore.load(store_path)
+    embedder = Embedder(model_name=model_name, cache_dir=cache_dir)
+    with open(golden_path) as f:
+        questions = json.load(f)
+    lines: list[str] = []
+    lines.append("# Retrieval Gate Check")
+    lines.append("")
+    lines.append(
+        f"**Store:** {store.stats().total_chunks} chunks, "
+        f"{store.stats().unique_sources} sources"
+    )
+    lines.append("")
+    lines.append("| ID | Category | Expected Source | Top-5 Sources | Recall@5 | Result |")
+    lines.append("|-----|----------|----------------|---------------|----------|--------|")
+    total_recall = 0.0
+    scorable = 0
+    for q in questions:
+        qid = q["id"]
+        question = q["question"]
+        expected = set(q["expected_sources"])
+        category = q["category"]
+        vec = embedder.embed(question)
+        results = store.search(vec, question, top_k=5, strategy="hybrid")
+        retrieved = [r.chunk.source for r in results]
+        retrieved_set = set(retrieved)
+        if expected:
+            recall = len(expected & retrieved_set) / len(expected)
+            total_recall += recall
+            scorable += 1
+            result = "PASS" if recall >= 0.5 else "FAIL"
+        else:
+            recall = float("nan")
+            result = "N/A"
+        expected_str = ", ".join(sorted(expected)) if expected else "(none)"
+        retrieved_str = ", ".join(dict.fromkeys(retrieved[:3]))  # dedup, first 3
+        recall_str = f"{recall:.2f}" if expected else "n/a"
+        lines.append(
+            f"| {qid} | {category} | {expected_str} | {retrieved_str} | {recall_str} | {result} |"
+        )
+    avg_recall = total_recall / max(scorable, 1)
+    gate_pass = avg_recall >= 0.5
+    lines.append("")
+    lines.append(f"**Avg Recall@5 (positive only):** {avg_recall:.2f}")
+    lines.append(f"**Gate:** {'PASS' if gate_pass else 'FAIL'} (threshold >= 0.5)")
+    report = "\n".join(lines)
+    print(report)
+    if output_path:
+        Path(output_path).parent.mkdir(parents=True, exist_ok=True)
+        Path(output_path).write_text(report + "\n")
+        print(f"\nSaved to {output_path}")
+    return gate_pass
+def main() -> None:
+    parser = argparse.ArgumentParser(description="Verify retrieval against golden dataset")
+    parser.add_argument("--store-path", default=".cache/store")
+    parser.add_argument("--golden-path", default="agent_bench/evaluation/datasets/tech_docs_golden.json")
+    parser.add_argument("--output", default="docs/retrieval_gate.md")
+    args = parser.parse_args()
+    passed = verify(
+        store_path=args.store_path,
+        golden_path=args.golden_path,
+        output_path=args.output,
+    )
+    sys.exit(0 if passed else 1)
+if __name__ == "__main__":
+    main()