matrix-ai / scripts /build_kb.py
ruslanmv's picture
VectorDB
215df55
raw
history blame
1.99 kB
#!/usr/bin/env python3
"""
Build or refresh the local RAG knowledge base (data/kb.jsonl) from GitHub + local docs.

Usage:
    python scripts/build_kb.py --config configs/rag_sources.yaml --out data/kb.jsonl
    python scripts/build_kb.py --config ... --out ... --force   # delete the output file first, then rebuild
"""
from __future__ import annotations
import argparse
import logging
import os
import sys
from pathlib import Path
# --- Put THIS repository at the front of sys.path so its 'app' package is
# --- found before any other package that happens to be called 'app'.
_SCRIPTS_DIR = Path(__file__).resolve().parent
ROOT = _SCRIPTS_DIR.parent
sys.path.insert(0, str(ROOT))

# Configure the root handler, then grab this script's named logger.
logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
logger = logging.getLogger("build_kb")

# The KB builder comes from this project. If the import fails for any reason,
# report it and stop with exit status 2 instead of showing a raw traceback.
try:
    from app.core.rag.build import build_kb_from_config, ensure_kb  # type: ignore
except Exception as import_error:  # pragma: no cover
    logger.error("Failed importing KB builder from app.core.rag.build: %s", import_error)
    logger.error("Make sure you're running from the project root and PYTHONPATH includes '.'.")
    sys.exit(2)
def main(argv: list[str] | None = None) -> int:
    """Parse the command line and (re)build the knowledge-base JSONL file.

    Args:
        argv: Command-line arguments, excluding the program name. When
            ``None`` (the default) argparse reads ``sys.argv[1:]``, so
            calling ``main()`` with no arguments behaves as before.

    Returns:
        ``0`` once the build call has returned.

    Raises:
        SystemExit: Raised by argparse when a required option is missing or
            an argument is invalid (exit status 2), or after ``--help``.

    Any exception raised by ``build_kb_from_config`` is not caught here and
    propagates to the caller.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", required=True, help="Path to configs/rag_sources.yaml")
    parser.add_argument("--out", required=True, help="Output JSONL file, e.g., data/kb.jsonl")
    parser.add_argument("--force", action="store_true", help="Delete output file first, then rebuild")
    args = parser.parse_args(argv)

    out_path = Path(args.out)
    if args.force and out_path.exists():
        logger.info("Removing existing %s", out_path)
        try:
            out_path.unlink()
        except FileNotFoundError:
            # The file vanished between the exists() check and the unlink
            # (e.g. a concurrent run removed it); the goal is already met.
            pass

    # If you want a one-liner that skips if exists, use ensure_kb:
    #   created = ensure_kb(out_jsonl=args.out, config_path=args.config, skip_if_exists=True)
    #   logger.info("KB %s at %s", "ready" if created else "unchanged", args.out)
    # Otherwise, always (re)build:
    n = build_kb_from_config(config_path=args.config, out_jsonl=args.out)
    # The builder's return value is reported as the number of records written.
    logger.info("Wrote %d records to %s", n, args.out)
    return 0
# Script entry point: run main() and use its return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main())