import chromadb
from chromadb.config import Settings as ChromaSettings
from typing import List, Dict, Optional
import logging
from app.core.config import settings

logger = logging.getLogger(__name__)


class VectorStore:
    """Singleton wrapper around a persistent ChromaDB collection of PDF document chunks."""

    _instance = None
    
    def __new__(cls):
        if cls._instance is None:
            cls._instance = super(VectorStore, cls).__new__(cls)
            cls._instance._initialized = False
        return cls._instance
    
    def __init__(self):
        if not self._initialized:
            self.client = chromadb.PersistentClient(
                path=settings.CHROMA_PERSIST_DIRECTORY,
                settings=ChromaSettings(
                    anonymized_telemetry=False
                )
            )
            self.collection_name = "pdf_documents"
            self.collection = self._get_or_create_collection()
            self._initialized = True
    
    def _get_or_create_collection(self):
        """Get existing collection or create new one"""
        try:
            collection = self.client.get_collection(name=self.collection_name)
            logger.info(f"Using existing collection: {self.collection_name}")
        except Exception:
            collection = self.client.create_collection(
                name=self.collection_name,
                metadata={"description": "PDF document embeddings for Q&A chatbot"}
            )
            logger.info(f"Created new collection: {self.collection_name}")
        
        return collection
    
    def add_document(self, document_id: str, content: str, metadata: Optional[Dict] = None) -> bool:
        """Add document content to vector store"""
        try:
            logger.info(f"Starting to add document {document_id} to vector store")
            logger.info(f"Content length: {len(content)} characters")
            
            # Split content into chunks for better retrieval
            chunks = self._split_text(content, chunk_size=1000, overlap=200)
            logger.info(f"Split content into {len(chunks)} chunks")
            
            # Prepare data for ChromaDB
            ids = [f"{document_id}_chunk_{i}" for i in range(len(chunks))]
            documents = chunks
            metadatas = [{
                "document_id": document_id,
                "chunk_index": i,
                **(metadata or {})
            } for i in range(len(chunks))]
            
            logger.info(f"Prepared {len(ids)} chunks with IDs: {ids[:3]}...")  # Log first 3 IDs
            
            # Add to collection
            logger.info(f"Adding chunks to ChromaDB collection: {self.collection_name}")
            self.collection.add(
                ids=ids,
                documents=documents,
                metadatas=metadatas
            )
            
            logger.info(f"Successfully added document {document_id} with {len(chunks)} chunks to vector store")
            return True
            
        except Exception as e:
            logger.error(f"Error adding document {document_id} to vector store: {e}")
            logger.error(f"Exception type: {type(e).__name__}")
            import traceback
            logger.error(f"Full traceback: {traceback.format_exc()}")
            return False
    
    def search_similar(self, query: str, n_results: int = 5, document_id: Optional[str] = None) -> List[Dict]:
        """Search for similar documents based on query, optionally filtering by document_id"""
        try:
            results = self.collection.query(
                query_texts=[query],
                n_results=n_results,
                include=["documents", "metadatas", "distances"]
            )
            
            # Format results
            formatted_results = []
            if results['documents'] and results['documents'][0]:
                for i, (doc, metadata, distance) in enumerate(zip(
                    results['documents'][0],
                    results['metadatas'][0],
                    results['distances'][0]
                )):
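                    # Note: the document_id filter is applied after retrieval, so fewer
                    # than n_results items may be returned when it is provided.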
                    if document_id is not None and str(metadata.get('document_id')) != str(document_id):
                        continue
                    formatted_results.append({
                        'content': doc,
                        'metadata': metadata,
                        'similarity_score': 1 - distance,  # Convert distance to similarity (assumes distances normalized to [0, 1])
                        'rank': i + 1
                    })
            return formatted_results
        except Exception as e:
            logger.error(f"Error searching vector store: {e}")
            return []
    
    def delete_document(self, document_id: str) -> bool:
        """Delete all chunks for a specific document"""
        try:
            # Get all chunks for this document
            results = self.collection.get(
                where={"document_id": document_id}
            )
            
            if results['ids']:
                self.collection.delete(ids=results['ids'])
                logger.info(f"Deleted {len(results['ids'])} chunks for document {document_id}")
            
            return True
            
        except Exception as e:
            logger.error(f"Error deleting document {document_id} from vector store: {e}")
            return False
    
    def get_collection_stats(self) -> Dict:
        """Get statistics about the vector store collection"""
        try:
            logger.info(f"Getting stats for collection: {self.collection_name}")
            count = self.collection.count()
            logger.info(f"Collection count: {count}")
            return {
                "total_documents": count,
                "collection_name": self.collection_name
            }
        except Exception as e:
            logger.error(f"Error getting collection stats: {e}")
            logger.error(f"Exception type: {type(e).__name__}")
            import traceback
            logger.error(f"Full traceback: {traceback.format_exc()}")
            return {"total_documents": 0, "collection_name": self.collection_name}
    
    def _split_text(self, text: str, chunk_size: int = 1000, overlap: int = 200) -> List[str]:
        """Split text into overlapping chunks"""
        if len(text) <= chunk_size:
            return [text]
        
        chunks = []
        start = 0
        
        while start < len(text):
            end = start + chunk_size
            
            # If this isn't the last chunk, try to break at a sentence boundary
            if end < len(text):
                # Look for sentence endings
                for i in range(end, max(start + chunk_size - 100, start), -1):
                    if text[i] in '.!?':
                        end = i + 1
                        break
            
            chunk = text[start:end].strip()
            if chunk:
                chunks.append(chunk)
            
            # Move start position with overlap
            start = end - overlap
            if start >= len(text):
                break
        
        return chunks
    
    def clear_all(self) -> bool:
        """Clear all documents from the vector store"""
        try:
            self.client.delete_collection(name=self.collection_name)
            self.collection = self._get_or_create_collection()
            logger.info("Cleared all documents from vector store")
            return True
        except Exception as e:
            logger.error(f"Error clearing vector store: {e}")
            return False
    
    @classmethod
    def reset_instance(cls):
        """Reset the singleton instance - useful after clearing collections"""
        cls._instance = None
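

# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal example of how this singleton might be exercised; the document id,
# text, and metadata below are hypothetical placeholders.
if __name__ == "__main__":
    store = VectorStore()

    # Index a document; chunking happens inside add_document.
    store.add_document(
        document_id="doc-1",
        content="Example text extracted from a PDF. " * 60,
        metadata={"filename": "example.pdf"},
    )

    # Query the collection, optionally scoped to a single document.
    for hit in store.search_similar("example query", n_results=3, document_id="doc-1"):
        print(f"{hit['similarity_score']:.3f}  {hit['content'][:80]}")

    print(store.get_collection_stats())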