Spaces:
Sleeping
Sleeping
"""
src/vector_store.py
────────────────────────────────────────────────────────────────────────────
Responsible for:
1. Converting document chunks into vector embeddings
2. Storing them in a FAISS index (fast similarity search)
3. Persisting the index to disk (so you don't re-embed every time)
4. Loading an existing index from disk

What is an embedding?
An embedding is a numeric vector (list of floats) that represents the
semantic meaning of a text. Similar texts → close vectors in space.
This lets us find the most relevant document chunks for a user's question.

What is FAISS?
Facebook AI Similarity Search — an ultra-fast library to find the nearest
vectors to a query vector. Perfect for document retrieval.
"""
| import os | |
| from typing import List | |
| from langchain_core.documents import Document | |
| from langchain_community.vectorstores import FAISS | |
| from langchain_huggingface import HuggingFaceEmbeddings | |
def build_embedding_model(model_id: str) -> HuggingFaceEmbeddings:
    """Instantiate a local sentence-transformer embedding model.

    The model runs entirely on this machine — no API calls for embeddings —
    which makes it free, private, rate-limit-free, and fast for batches.

    Args:
        model_id: Hugging Face model ID, e.g.
            "sentence-transformers/all-MiniLM-L6-v2".

    Returns:
        A HuggingFaceEmbeddings object usable by LangChain.
    """
    print(f"[VectorStore] Loading embedding model: {model_id}")
    return HuggingFaceEmbeddings(
        model_name=model_id,
        # Switch to "cuda" here when a GPU is available.
        model_kwargs={"device": "cpu"},
        # Unit-length vectors make inner product equivalent to cosine similarity.
        encode_kwargs={"normalize_embeddings": True},
    )
def create_vectorstore(
    chunks: List[Document],
    embeddings: HuggingFaceEmbeddings,
    persist_path: str,
) -> FAISS:
    """Embed every document chunk into a new FAISS index and persist it.

    Saving to disk lets later sessions reuse the index instead of
    re-embedding all documents.

    Args:
        chunks: Document chunks from document_loader.split_documents().
        embeddings: The embedding model to use.
        persist_path: Folder where the FAISS index will be saved.

    Returns:
        A FAISS vectorstore ready for similarity search.
    """
    print(f"[VectorStore] Embedding {len(chunks)} chunks... (this may take a moment)")
    store = FAISS.from_documents(chunks, embeddings)

    # Write the index out so the next startup can skip embedding entirely.
    os.makedirs(persist_path, exist_ok=True)
    store.save_local(persist_path)
    print(f"[VectorStore] Index saved to: {persist_path}")

    return store
def load_vectorstore(
    persist_path: str,
    embeddings: HuggingFaceEmbeddings,
) -> FAISS:
    """Load a previously saved FAISS index from disk.

    Args:
        persist_path: Folder where the index was saved.
        embeddings: Must be the SAME embedding model used during creation,
            otherwise query vectors won't live in the same space as the index.

    Returns:
        A FAISS vectorstore ready for similarity search.
    """
    print(f"[VectorStore] Loading existing index from: {persist_path}")
    # LangChain requires this opt-in flag to unpickle locally stored indexes.
    return FAISS.load_local(
        persist_path,
        embeddings,
        allow_dangerous_deserialization=True,
    )
def get_or_create_vectorstore(
    chunks: List[Document],
    embeddings: HuggingFaceEmbeddings,
    persist_path: str,
) -> FAISS:
    """Load an existing index if available, otherwise build and save one.

    This avoids re-embedding documents on every restart.

    Args:
        chunks: Document chunks (only used if the index doesn't exist yet).
        embeddings: Embedding model (must match the one used at creation).
        persist_path: Where to save/load the FAISS index.

    Returns:
        A ready-to-use FAISS vectorstore.
    """
    # FAISS.save_local() writes TWO files: "index.faiss" (vectors) and
    # "index.pkl" (docstore/metadata). Checking only index.faiss would make
    # load_local() crash on a partial save (e.g. an interrupted write), so
    # require the complete pair before taking the load path.
    required = ("index.faiss", "index.pkl")
    if all(os.path.exists(os.path.join(persist_path, name)) for name in required):
        return load_vectorstore(persist_path, embeddings)
    return create_vectorstore(chunks, embeddings, persist_path)