devjas1
(FEAT)[Implement document search functionality]: enhance the search_documents function to load FAISS index and metadata, enabling semantic document retrieval.
593e022
| """ | |
| Retriever module for semantic document search using FAISS. | |
| Provides functions to perform similarity-based lookups over embedded document vectors. | |
| Integrates with FAISS for efficient vector search and returns relevant document matches. | |
| """ | |
| import os | |
| import pickle | |
| import faiss | |
| from sentence_transformers import SentenceTransformer | |
def search_documents(query: str, config: dict):
    """
    Search for semantically similar documents using the FAISS index.

    Loads the index from ``vector_cache/faiss_index.bin`` and the chunk
    metadata from ``vector_cache/metadata.pkl``, embeds ``query`` with the
    SentenceTransformer model named in the configuration, L2-normalises the
    embedding and queries the index for the nearest chunks.

    Args:
        query (str): Search query
        config (dict): Configuration dictionary. ``config["embedding"]["model_path"]``
            is required; ``config["retrieval"]["top_k"]`` (default 5) and
            ``config["retrieval"]["similarity_threshold"]`` (default 0.75)
            are optional.

    Returns:
        list: Formatted strings ``"[filename] (score: x.xxx): <first 200 chars>..."``
            for every hit at or above the threshold. An empty list when the
            index file does not exist; a single message string when nothing
            passes the threshold or when the search fails.
    """
    index_path = "vector_cache/faiss_index.bin"
    metadata_path = "vector_cache/metadata.pkl"

    # Check if FAISS index exists
    if not os.path.exists(index_path):
        print("No FAISS index found. Please run 'init' command first.")
        return []

    try:
        # Load FAISS index and metadata
        index = faiss.read_index(index_path)
        # NOTE(security): pickle.load executes arbitrary code from the file.
        # Only safe while metadata.pkl is produced locally by this project.
        with open(metadata_path, "rb") as f:
            metadata = pickle.load(f)
        texts = metadata["texts"]
        filenames = metadata["filenames"]
        # Labels at or beyond this bound have no metadata entry.
        known_chunks = min(len(texts), len(filenames))

        # Embed the query
        model = SentenceTransformer(config["embedding"]["model_path"])
        query_embedding = model.encode([query]).astype("float32")
        faiss.normalize_L2(query_embedding)

        # Search similar documents
        retrieval_cfg = config.get("retrieval", {})
        top_k = retrieval_cfg.get("top_k", 5)
        similarity_threshold = retrieval_cfg.get("similarity_threshold", 0.75)
        scores, indices = index.search(query_embedding, top_k)

        results = []
        for score, raw_idx in zip(scores[0], indices[0]):
            idx = int(raw_idx)
            # FAISS pads missing neighbours with label -1 when the index
            # holds fewer than top_k vectors; as a Python index that would
            # silently address the last chunk. Also skip labels that have no
            # metadata entry (index and pickle out of sync).
            if idx < 0 or idx >= known_chunks:
                continue
            if score < similarity_threshold:
                # Stop at the first hit below the threshold; this relies on
                # the index returning hits best-first.
                # NOTE(review): holds for inner-product indexes - confirm
                # the index type built by 'init'.
                break
            results.append(
                f"[{filenames[idx]}] (score: {score:.3f}): {texts[idx][:200]}..."
            )

        if not results:
            results.append(f"No matches found above threshold {similarity_threshold}")
        return results
    except (
        FileNotFoundError,
        pickle.UnpicklingError,
        KeyError,
        ValueError,
    ) as e:
        print(f"Error during search: {e}")
        return [f"Search failed: {e}"]