| | |
| | """ |
| | Local similarity search (cosine) over embeddings.jsonl. |
| | |
| | Purpose: |
| | Performs local similarity search using cosine similarity over pre-generated embeddings. |
| | Useful for testing and debugging search functionality without connecting to a vector database. |
| | |
| | Inputs: |
| | embeddings_path (str): Path to embeddings.jsonl file |
| | query (str): Search query text |
| | k (int, optional): Number of results to return (default: 3) |
| | dim (int, optional): Embedding dimension (default: 64) |
| | |
| | Outputs: |
| | Prints top-k results with id, filename, chunk_id, and similarity score |
| | |
| | Usage: |
| | python scripts/search_documents.py /path/to/embeddings.jsonl "query text" [k] [dim] |
| | |
| | Example: |
| | python scripts/search_documents.py ./data/embeddings.jsonl "what is GDPR" 5 384 |
| | """ |
| |
|
| | import sys |
| | import json |
| | import math |
| | from pathlib import Path |
| | from typing import List |
| | from src.ingestion.embeddings import get_embedding |
| |
|
def load_embeddings(path: str) -> list:
    """Load every record from a JSON Lines embeddings file.

    Args:
        path: Location of the JSONL file. Each non-blank line must hold
            one JSON value.

    Returns:
        The decoded records, in file order.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    source = Path(path)
    if not source.exists():
        raise FileNotFoundError(source)
    with source.open("r", encoding="utf-8") as fh:
        # Blank or whitespace-only lines (spacing between records, extra
        # newlines at the end of the file) are not records; json.loads
        # would reject them, so they are skipped instead of aborting the load.
        return [json.loads(line) for line in fh if line.strip()]
| |
|
def dot(a: List[float], b: List[float]) -> float:
    """Return the dot product of ``a`` and ``b``.

    Elements are paired with ``zip``, so when the lengths differ the extra
    trailing elements of the longer vector are ignored.
    """
    # Kept as a single sum() call so the summation behaves exactly like the
    # built-in (a hand-written accumulator is not guaranteed to round the same).
    products = (left * right for left, right in zip(a, b))
    return sum(products)
| |
|
def norm(a: List[float]) -> float:
    """Return the Euclidean (L2) norm of ``a``; an empty vector gives 0.0."""
    squares = [component * component for component in a]
    return math.sqrt(sum(squares))
| |
|
def cosine_sim(a: List[float], b: List[float]) -> float:
    """Return the cosine similarity of ``a`` and ``b``.

    The dot product is divided by the product of the two Euclidean norms.
    When either vector has zero norm the similarity is reported as 0.0
    instead of dividing by zero.

    NOTE(review): nothing here checks that ``a`` and ``b`` have the same
    length; callers are expected to pass vectors of equal dimension.
    """
    norm_a, norm_b = norm(a), norm(b)
    if norm_a != 0 and norm_b != 0:
        return dot(a, b) / (norm_a * norm_b)
    return 0.0
| |
|
def search(embeddings_path: str, query: str, k: int = 3, dim: int = 64):
    """Rank stored embeddings by cosine similarity to a text query.

    Args:
        embeddings_path: Path to the JSONL file passed to ``load_embeddings``.
        query: Text that is embedded with
            ``get_embedding(query, provider="local", dim=dim)``.
        k: Maximum number of results to return. Negative values are treated
            as 0 (empty result).
        dim: Dimension requested for the query embedding. It must match the
            dimension of the stored vectors.

    Returns:
        A list of ``(score, item)`` tuples sorted by descending score,
        truncated to ``k`` entries. Records without a non-empty
        ``"embedding"`` value are skipped.

    Raises:
        ValueError: If a stored embedding's length differs from the query
            vector's length.
        FileNotFoundError: Propagated from ``load_embeddings`` when the file
            does not exist.
    """
    items = load_embeddings(embeddings_path)
    qvec = get_embedding(query, provider="local", dim=dim)
    # assumes get_embedding returns a sized sequence of floats — TODO confirm
    query_len = len(qvec)

    scored = []
    for it in items:
        emb = it.get("embedding")
        if not emb:
            continue
        if len(emb) != query_len:
            # The dot product pairs elements with zip(), which would silently
            # drop the extra components while the norms still use the full
            # vectors, yielding a meaningless score. Fail loudly instead.
            raise ValueError(
                f"embedding dimension mismatch for item {it.get('id')!r}: "
                f"stored vector has {len(emb)} components but the query "
                f"vector has {query_len} (dim={dim})"
            )
        scored.append((cosine_sim(qvec, emb), it))

    scored.sort(key=lambda pair: pair[0], reverse=True)

    # A negative k would slice as scored[:-n] and return "all but the last n";
    # clamp it so it means "no results". k=None still returns everything.
    if k is not None and k < 0:
        k = 0
    return scored[:k]
| |
|
def print_results(results):
    """Print search hits to stdout as a fixed-width table.

    Args:
        results: Iterable of ``(score, item)`` pairs. Each ``item`` is
            indexed with ``"id"`` and ``"filename"`` (both sliced, then
            padded to 60 and 40 columns) and ``"chunk_id"`` (formatted as a
            decimal integer).
    """
    header = f"{'SCORE':>8} {'ID':60} {'FILENAME':40} {'CHUNK_ID':>7}"
    rule = "-"*130
    print(header)
    print(rule)
    # Rows are formatted and printed one at a time so that rows preceding a
    # malformed record are still emitted.
    for score, it in results:
        row = f"{score:8.4f} {it['id'][:60]:60} {it['filename'][:40]:40} {it['chunk_id']:7d}"
        print(row)
| |
|
if __name__ == "__main__":
    # Command-line entry point: <embeddings.jsonl> <query> [k] [dim].
    usage = "Usage: python3 scripts/search_documents.py /path/to/embeddings.jsonl \"query text\" [k] [dim]"
    if len(sys.argv) < 3:
        print(usage)
        raise SystemExit(1)
    emb_path = sys.argv[1]
    query = sys.argv[2]
    try:
        k = int(sys.argv[3]) if len(sys.argv) > 3 else 3
        dim = int(sys.argv[4]) if len(sys.argv) > 4 else 64
    except ValueError:
        # A non-numeric k/dim is a usage error, not a crash: show the usage
        # line and exit with the same status as a missing argument.
        print(usage)
        print("k and dim must be integers")
        raise SystemExit(1) from None

    results = search(emb_path, query, k=k, dim=dim)
    print_results(results)