# ece-intelligence-lab / src/vector_store.py
# (Hugging Face Hub page residue — RayanMLK, "Initial commit - ECE Intelligence
#  Lab chatbot", revision 659d6ec — kept here as a provenance note.)
"""
src/vector_store.py
───────────────────────────────────────────────────────────────────────────────
Responsible for:
1. Converting document chunks into vector embeddings
2. Storing them in a FAISS index (fast similarity search)
3. Persisting the index to disk (so you don't re-embed every time)
4. Loading an existing index from disk
What is an embedding?
An embedding is a numeric vector (list of floats) that represents the
semantic meaning of a text. Similar texts β†’ close vectors in space.
This lets us find the most relevant document chunks for a user's question.
What is FAISS?
Facebook AI Similarity Search β€” an ultra-fast library to find the nearest
vectors to a query vector. Perfect for document retrieval.
"""
import os
from typing import List
from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
def build_embedding_model(model_id: str) -> HuggingFaceEmbeddings:
    """
    Instantiate a local sentence-transformer embedding model.

    The model runs on this machine rather than behind an API, which gives:
      - zero cost and full privacy
      - fast batch embedding
      - no rate limits

    Args:
        model_id: Hugging Face model ID, e.g.
            "sentence-transformers/all-MiniLM-L6-v2"

    Returns:
        A LangChain-compatible HuggingFaceEmbeddings instance.
    """
    print(f"[VectorStore] Loading embedding model: {model_id}")
    return HuggingFaceEmbeddings(
        model_name=model_id,
        # CPU by default; switch to "cuda" when a GPU is available.
        model_kwargs={"device": "cpu"},
        # Unit-length vectors make inner product equivalent to cosine similarity.
        encode_kwargs={"normalize_embeddings": True},
    )
def create_vectorstore(
    chunks: List[Document],
    embeddings: HuggingFaceEmbeddings,
    persist_path: str,
) -> FAISS:
    """
    Embed document chunks into a new FAISS index and persist it to disk.

    Args:
        chunks: Document chunks produced by document_loader.split_documents().
        embeddings: Embedding model used to vectorize each chunk.
        persist_path: Directory where the FAISS index files are written.

    Returns:
        The freshly built FAISS vectorstore, ready for similarity search.
    """
    print(f"[VectorStore] Embedding {len(chunks)} chunks... (this may take a moment)")
    store = FAISS.from_documents(chunks, embeddings)

    # Save to disk so future startups can skip the (slow) embedding step.
    os.makedirs(persist_path, exist_ok=True)
    store.save_local(persist_path)
    print(f"[VectorStore] Index saved to: {persist_path}")

    return store
def load_vectorstore(
    persist_path: str,
    embeddings: HuggingFaceEmbeddings,
) -> FAISS:
    """
    Load a previously persisted FAISS index from disk.

    Args:
        persist_path: Directory the index was saved into.
        embeddings: Must be the SAME embedding model used when the index
            was created, or similarity scores will be meaningless.

    Returns:
        The loaded FAISS vectorstore, ready for similarity search.
    """
    print(f"[VectorStore] Loading existing index from: {persist_path}")
    # LangChain requires explicitly opting in to pickle deserialization
    # for locally stored indexes.
    return FAISS.load_local(
        persist_path,
        embeddings,
        allow_dangerous_deserialization=True,
    )
def get_or_create_vectorstore(
    chunks: List[Document],
    embeddings: HuggingFaceEmbeddings,
    persist_path: str,
) -> FAISS:
    """
    Load the FAISS index from disk if present, otherwise build and save it.

    This avoids re-embedding documents on every restart.

    Args:
        chunks: Document chunks (only used if the index doesn't exist yet).
        embeddings: Embedding model (must match the one used at creation).
        persist_path: Where to save/load the FAISS index.

    Returns:
        A ready-to-use FAISS vectorstore.
    """
    # FAISS.save_local() writes TWO files: index.faiss (the vectors) and
    # index.pkl (the docstore). Require both so a partial/interrupted save
    # falls through to a clean rebuild instead of crashing in load_local().
    required = ("index.faiss", "index.pkl")
    if all(os.path.exists(os.path.join(persist_path, name)) for name in required):
        return load_vectorstore(persist_path, embeddings)
    return create_vectorstore(chunks, embeddings, persist_path)