Spaces:
Sleeping
Sleeping
"""
src/vector_store.py
────────────────────────────────────────────────────────────────────────────
Responsible for:
1. Converting document chunks into vector embeddings
2. Storing them in a FAISS index (fast similarity search)
3. Persisting the index to disk (so you don't re-embed every time)
4. Loading an existing index from disk

What is an embedding?
An embedding is a numeric vector (list of floats) that represents the
semantic meaning of a text. Similar texts → close vectors in space.
This lets us find the most relevant document chunks for a user's question.

What is FAISS?
Facebook AI Similarity Search — an ultra-fast library to find the nearest
vectors to a query vector. Perfect for document retrieval.
"""
| import os | |
| from typing import List | |
| from langchain_core.documents import Document | |
| from langchain_community.vectorstores import FAISS | |
| from langchain_huggingface import HuggingFaceEmbeddings | |
def build_embedding_model(model_id: str) -> HuggingFaceEmbeddings:
    """Instantiate a local sentence-transformer embedding model.

    The model runs entirely on this machine — no API calls for embeddings —
    which makes it free, private, rate-limit-free, and fast for batches.

    Args:
        model_id: Hugging Face model ID, e.g.
            "sentence-transformers/all-MiniLM-L6-v2".

    Returns:
        A HuggingFaceEmbeddings object usable by LangChain.
    """
    print(f"[VectorStore] Loading embedding model: {model_id}")
    return HuggingFaceEmbeddings(
        model_name=model_id,
        # Switch to "cuda" here when a GPU is available.
        model_kwargs={"device": "cpu"},
        # Unit-length vectors make inner product equivalent to cosine similarity.
        encode_kwargs={"normalize_embeddings": True},
    )
def create_vectorstore(
    chunks: List[Document],
    embeddings: HuggingFaceEmbeddings,
    persist_path: str,
) -> FAISS:
    """Embed every document chunk into a new FAISS index and persist it.

    Saving to disk lets later sessions reuse the index instead of
    re-embedding all documents.

    Args:
        chunks: Document chunks from document_loader.split_documents().
        embeddings: The embedding model to use.
        persist_path: Folder where the FAISS index will be saved.

    Returns:
        A FAISS vectorstore ready for similarity search.
    """
    print(f"[VectorStore] Embedding {len(chunks)} chunks... (this may take a moment)")
    store = FAISS.from_documents(chunks, embeddings)

    # Write the index out so the next startup can skip embedding entirely.
    os.makedirs(persist_path, exist_ok=True)
    store.save_local(persist_path)
    print(f"[VectorStore] Index saved to: {persist_path}")

    return store
def load_vectorstore(
    persist_path: str,
    embeddings: HuggingFaceEmbeddings,
) -> FAISS:
    """Load a previously saved FAISS index from disk.

    Args:
        persist_path: Folder where the index was saved.
        embeddings: Must be the SAME embedding model used during creation,
            otherwise query vectors won't live in the same space as the index.

    Returns:
        A FAISS vectorstore ready for similarity search.
    """
    print(f"[VectorStore] Loading existing index from: {persist_path}")
    # LangChain requires this opt-in flag to unpickle locally stored indexes.
    return FAISS.load_local(
        persist_path,
        embeddings,
        allow_dangerous_deserialization=True,
    )
def get_or_create_vectorstore(
    chunks: List[Document],
    embeddings: HuggingFaceEmbeddings,
    persist_path: str,
) -> FAISS:
    """Load an existing index if available, otherwise build and save one.

    This avoids re-embedding documents on every restart.

    Args:
        chunks: Document chunks (only used if the index doesn't exist yet).
        embeddings: Embedding model (must match the one used at creation).
        persist_path: Where to save/load the FAISS index.

    Returns:
        A ready-to-use FAISS vectorstore.
    """
    # FAISS.save_local() writes TWO files: "index.faiss" (vectors) and
    # "index.pkl" (docstore/metadata). Checking only index.faiss would make
    # load_local() crash on a partial save (e.g. an interrupted write), so
    # require the complete pair before taking the load path.
    required = ("index.faiss", "index.pkl")
    if all(os.path.exists(os.path.join(persist_path, name)) for name in required):
        return load_vectorstore(persist_path, embeddings)
    return create_vectorstore(chunks, embeddings, persist_path)