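"""ChromaDB-backed vector store for PDF document chunks used by the Q&A chatbot."""
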
import logging
import traceback
from typing import Dict, List, Optional

import chromadb
from chromadb.config import Settings as ChromaSettings

from app.core.config import settings

logger = logging.getLogger(__name__)


class VectorStore:
    """Singleton wrapper around a persistent ChromaDB collection of PDF chunks."""

    _instance = None

    def __new__(cls):
        # Lazily create a single shared instance so every caller reuses the
        # same ChromaDB client and collection.
        if cls._instance is None:
            cls._instance = super().__new__(cls)
            cls._instance._initialized = False
        return cls._instance

    def __init__(self):
        if not self._initialized:
            self.client = chromadb.PersistentClient(
                path=settings.CHROMA_PERSIST_DIRECTORY,
                settings=ChromaSettings(
                    anonymized_telemetry=False
                )
            )
            self.collection_name = "pdf_documents"
            self.collection = self._get_or_create_collection()
            self._initialized = True

    def _get_or_create_collection(self):
        """Get the existing collection or create a new one."""
        try:
            collection = self.client.get_collection(name=self.collection_name)
            logger.info(f"Using existing collection: {self.collection_name}")
        except Exception:
            collection = self.client.create_collection(
                name=self.collection_name,
                metadata={"description": "PDF document embeddings for Q&A chatbot"}
            )
            logger.info(f"Created new collection: {self.collection_name}")
        return collection

    def add_document(self, document_id: str, content: str, metadata: Optional[Dict] = None) -> bool:
        """Add document content to the vector store as overlapping chunks."""
        try:
            logger.info(f"Starting to add document {document_id} to vector store")
            logger.info(f"Content length: {len(content)} characters")

            # Split content into chunks for better retrieval
            chunks = self._split_text(content, chunk_size=1000, overlap=200)
            logger.info(f"Split content into {len(chunks)} chunks")

            # Prepare data for ChromaDB
            ids = [f"{document_id}_chunk_{i}" for i in range(len(chunks))]
            documents = chunks
            metadatas = [{
                "document_id": document_id,
                "chunk_index": i,
                **(metadata or {})
            } for i in range(len(chunks))]
            logger.info(f"Prepared {len(ids)} chunks with IDs: {ids[:3]}...")  # Log first 3 IDs

            # Add to collection
            logger.info(f"Adding chunks to ChromaDB collection: {self.collection_name}")
            self.collection.add(
                ids=ids,
                documents=documents,
                metadatas=metadatas
            )

            logger.info(f"Successfully added document {document_id} with {len(chunks)} chunks to vector store")
            return True
        except Exception as e:
            logger.error(f"Error adding document {document_id} to vector store: {e}")
            logger.error(f"Exception type: {type(e).__name__}")
            logger.error(f"Full traceback: {traceback.format_exc()}")
            return False

    def search_similar(self, query: str, n_results: int = 5, document_id: Optional[str] = None) -> List[Dict]:
        """Search for chunks similar to the query, optionally filtered by document_id."""
        try:
            results = self.collection.query(
                query_texts=[query],
                n_results=n_results,
                include=["documents", "metadatas", "distances"]
            )

            # Format results
            formatted_results = []
            if results['documents'] and results['documents'][0]:
                for i, (doc, metadata, distance) in enumerate(zip(
                    results['documents'][0],
                    results['metadatas'][0],
                    results['distances'][0]
                )):
                    # Filter by document_id after retrieval, so fewer than
                    # n_results matches may be returned for a single document.
                    if document_id is not None and str(metadata.get('document_id')) != str(document_id):
                        continue
                    formatted_results.append({
                        'content': doc,
                        'metadata': metadata,
                        'similarity_score': 1 - distance,  # Convert distance to similarity
                        'rank': i + 1
                    })
            return formatted_results
        except Exception as e:
            logger.error(f"Error searching vector store: {e}")
            return []

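    # NOTE: search_similar filters by document_id in Python after the ANN query.
    # If per-document queries become common, passing a metadata filter such as
    # where={"document_id": document_id} to collection.query would restrict the
    # search itself (assuming the installed chromadb version supports metadata
    # filters on query, as current releases do).
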
    def delete_document(self, document_id: str) -> bool:
        """Delete all chunks for a specific document."""
        try:
            # Get all chunk IDs for this document
            results = self.collection.get(
                where={"document_id": document_id}
            )
            if results['ids']:
                self.collection.delete(ids=results['ids'])
                logger.info(f"Deleted {len(results['ids'])} chunks for document {document_id}")
            return True
        except Exception as e:
            logger.error(f"Error deleting document {document_id} from vector store: {e}")
            return False

    def get_collection_stats(self) -> Dict:
        """Get statistics about the vector store collection."""
        try:
            logger.info(f"Getting stats for collection: {self.collection_name}")
            count = self.collection.count()
            logger.info(f"Collection count: {count}")
            return {
                "total_documents": count,  # count() returns the number of stored chunks
                "collection_name": self.collection_name
            }
        except Exception as e:
            logger.error(f"Error getting collection stats: {e}")
            logger.error(f"Exception type: {type(e).__name__}")
            logger.error(f"Full traceback: {traceback.format_exc()}")
            return {"total_documents": 0, "collection_name": self.collection_name}

    def _split_text(self, text: str, chunk_size: int = 1000, overlap: int = 200) -> List[str]:
        """Split text into overlapping chunks, preferring sentence boundaries."""
        if len(text) <= chunk_size:
            return [text]

        chunks = []
        start = 0
        while start < len(text):
            end = start + chunk_size
            # If this isn't the last chunk, try to break at a sentence boundary
            # within the final 100 characters of the window.
            if end < len(text):
                for i in range(end, max(start + chunk_size - 100, start), -1):
                    if text[i] in '.!?':
                        end = i + 1
                        break
            chunk = text[start:end].strip()
            if chunk:
                chunks.append(chunk)
            # Stop once the window has reached the end of the text; otherwise a
            # short duplicate tail chunk would be emitted.
            if end >= len(text):
                break
            # Move the window forward, keeping `overlap` characters of context.
            start = end - overlap
        return chunks

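    # Example (illustrative, assuming no sentence boundary is found): for a
    # 2,500-character text with chunk_size=1000 and overlap=200, _split_text
    # produces windows [0, 1000), [800, 1800), and [1600, 2500).
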
    def clear_all(self) -> bool:
        """Clear all documents from the vector store."""
        try:
            self.client.delete_collection(name=self.collection_name)
            self.collection = self._get_or_create_collection()
            logger.info("Cleared all documents from vector store")
            return True
        except Exception as e:
            logger.error(f"Error clearing vector store: {e}")
            return False

    @classmethod
    def reset_instance(cls):
        """Reset the singleton instance - useful after clearing collections."""
        cls._instance = None
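

# Minimal usage sketch (an illustration, not part of the application): it
# assumes settings.CHROMA_PERSIST_DIRECTORY points to a writable directory and
# that app.core.config is importable. The document id, text, and metadata below
# are hypothetical.
if __name__ == "__main__":
    store = VectorStore()  # Singleton: repeated calls return the same instance
    store.add_document(
        document_id="demo-doc",
        content="ChromaDB stores embeddings. This sketch indexes a short text.",
        metadata={"filename": "demo.pdf"},
    )
    print(store.get_collection_stats())
    for hit in store.search_similar("What does ChromaDB store?", n_results=2):
        print(hit["similarity_score"], hit["content"][:60])
    store.delete_document("demo-doc")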