Amin23 committed
Commit ec2497b · Parent: 19c50c3

initial commit

Dockerfile CHANGED
@@ -5,7 +5,7 @@
  COPY frontend/package*.json ./
  RUN npm ci --only=production
  COPY frontend/ ./
- RUN npm run build
+ RUN npm run build && npm run export


  # ---------- Back‑end stage ----------
@@ -24,11 +24,12 @@
  COPY backend/ ./

  # Static export from previous stage
- COPY --from=frontend-build /app/frontend/out /app/backend/frontend/out
+ COPY --from=frontend-build /app/frontend/out /app/frontend_out

  # Create folders & DB file and make them writable
- # Directories /tmp/uploads and /tmp/pdf_chatbot.db are used and should be writable by default
- # ChromaDB is configured for in-memory, so no persistent directory needed for it
+ RUN mkdir -p /app/backend/uploads /app/backend/chroma_db \
+     && touch /app/backend/pdf_chatbot.db \
+     && chmod -R 777 /app/backend/uploads /app/backend/chroma_db /app/backend/pdf_chatbot.db

  # Expose API port
  EXPOSE 8000
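The new RUN step pre-creates the upload and Chroma directories and the SQLite file, then opens them with chmod 777 so the (possibly non-root) runtime user can write to them. A minimal permission smoke test along those lines; the paths are the ones from the RUN step above, and the script itself is not part of this commit:

```python
import os

# Paths created by the Dockerfile RUN step above.
WRITABLE_PATHS = [
    "/app/backend/uploads",
    "/app/backend/chroma_db",
    "/app/backend/pdf_chatbot.db",
]

def is_writable(path: str) -> bool:
    """Return True if a file can be created inside (or appended to) `path`."""
    try:
        if os.path.isdir(path):
            probe = os.path.join(path, ".write_test")
            with open(probe, "w"):
                pass
            os.remove(probe)
        else:
            # The SQLite file is a plain file; appending proves write access.
            with open(path, "a"):
                pass
        return True
    except OSError:
        return False

if __name__ == "__main__":
    for p in WRITABLE_PATHS:
        print(f"{p}: {'writable' if is_writable(p) else 'NOT writable'}")
```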
backend/app/api/endpoints/chat.py CHANGED
@@ -4,7 +4,6 @@ from typing import List
  import json
  import uuid
  from datetime import datetime
- from sqlalchemy import func as sa_func

  from app.core.database import get_db
  from app.models.document import ChatMessage
@@ -13,20 +12,14 @@ from app.services.vector_store import VectorStore
  from app.services.ai_service import AIService

  router = APIRouter()
-
- def get_vector_store() -> VectorStore:
-     return VectorStore()
-
- def get_ai_service() -> AIService:
-     return AIService()
+ vector_store = VectorStore()
+ ai_service = AIService()


  @router.post("/", response_model=ChatResponse)
  def chat_with_documents(
      request: ChatRequest,
-     db: Session = Depends(get_db),
-     vector_store: VectorStore = Depends(get_vector_store),
-     ai_service: AIService = Depends(get_ai_service)
+     db: Session = Depends(get_db)
  ):
      """Send a question and get an answer based on uploaded documents"""
      try:
@@ -70,20 +63,19 @@ def chat_with_documents(
          )
          db.add(assistant_message)
          db.commit()
-         db.refresh(assistant_message)  # Refresh to get the actual ID

          return ChatResponse(
              success=True,
              answer=answer,
              session_id=request.session_id,
-             message_id=ChatMessageResponse.from_orm(assistant_message).id  # Access ID from Pydantic model
+             message_id=assistant_message.id
          )

          # Generate answer using AI
          ai_response = ai_service.generate_answer(
              request.question,
              context_documents,
-             model=str(request.model) if request.model else "auto"  # Cast to str to satisfy linter
+             model=request.model
          )

          # Save user message
@@ -105,7 +97,6 @@ def chat_with_documents(
          )
          db.add(assistant_message)
          db.commit()
-         db.refresh(assistant_message)  # Refresh to get the actual ID

          return ChatResponse(
              success=ai_response["success"],
@@ -113,7 +104,7 @@ def chat_with_documents(
              model=ai_response.get("model"),
              sources=ai_response.get("sources", []),
              session_id=request.session_id,
-             message_id=ChatMessageResponse.from_orm(assistant_message).id  # Access ID from Pydantic model
+             message_id=assistant_message.id
          )

      except HTTPException:
@@ -164,10 +155,10 @@ def list_sessions(db: Session = Depends(get_db)):
      # Get unique session IDs with message counts
      sessions = db.query(
          ChatMessage.session_id,
-         sa_func.count(ChatMessage.id).label('message_count'),
-         sa_func.max(ChatMessage.created_at).label('last_message_at')
+         db.func.count(ChatMessage.id).label('message_count'),
+         db.func.max(ChatMessage.created_at).label('last_message_at')
      ).group_by(ChatMessage.session_id).order_by(
-         sa_func.max(ChatMessage.created_at).desc()
+         db.func.max(ChatMessage.created_at).desc()
      ).all()

      return [
@@ -201,7 +192,7 @@ def delete_session(session_id: str, db: Session = Depends(get_db)):


  @router.get("/models/available")
- def get_available_models(ai_service: AIService = Depends(get_ai_service)):
+ def get_available_models():
      """Get list of available AI models"""
      try:
          models = ai_service.get_available_models()
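This change swaps the Depends(get_vector_store) / Depends(get_ai_service) providers for module-level singletons, so both services are constructed once at import time. If avoiding per-request construction was the only concern, a cached provider would keep the dependency-injection seam (and app.dependency_overrides for tests); a sketch under that assumption, with stand-in classes instead of the real services:

```python
from functools import lru_cache

from fastapi import Depends, FastAPI

# Stand-ins for app.services.vector_store.VectorStore and app.services.ai_service.AIService.
class VectorStore: ...

class AIService:
    def get_available_models(self) -> list:
        return []

@lru_cache(maxsize=1)
def get_vector_store() -> VectorStore:
    # Built once per process, reused by every request that depends on it.
    return VectorStore()

@lru_cache(maxsize=1)
def get_ai_service() -> AIService:
    return AIService()

app = FastAPI()

@app.get("/models/available")
def get_available_models(ai_service: AIService = Depends(get_ai_service)):
    # Tests can still replace the service via app.dependency_overrides[get_ai_service].
    return {"models": ai_service.get_available_models()}
```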
backend/app/api/endpoints/documents.py CHANGED
@@ -18,23 +18,16 @@ import asyncio
  from concurrent.futures import ThreadPoolExecutor

  router = APIRouter()
-
- def get_pdf_processor() -> PDFProcessor:
-     return PDFProcessor()
-
- def get_vector_store() -> VectorStore:
-     return VectorStore()
+ pdf_processor = PDFProcessor()
+ vector_store = VectorStore()


  @router.post("/upload", response_model=UploadResponse)
  async def upload_document(
      file: UploadFile = File(...),
-     db: Session = Depends(get_db),
-     pdf_processor: PDFProcessor = Depends(get_pdf_processor),
-     vector_store: VectorStore = Depends(get_vector_store)
+     db: Session = Depends(get_db)
  ):
      """Upload and process a PDF document"""
-     file_path = None  # Initialize file_path to None
      try:
          # Restrict to 3 documents max
          doc_count = db.query(Document).count()
@@ -44,12 +37,6 @@ async def upload_document(
          if not file.filename or not file.filename.lower().endswith('.pdf'):
              raise HTTPException(status_code=400, detail="Only PDF files are allowed")

-         # No need to explicitly create UPLOAD_DIR as it's /tmp/uploads and created by OS or on first write
-         # try:
-         #     os.makedirs(settings.UPLOAD_DIR, exist_ok=True)
-         # except PermissionError as e:
-         #     raise HTTPException(status_code=500, detail=f"Failed to create upload directory: {e}")
-
          # Generate unique filename
          file_extension = os.path.splitext(str(file.filename))[1]
          unique_filename = f"{uuid.uuid4()}{file_extension}"
@@ -65,7 +52,7 @@ async def upload_document(

          if not success:
              # Clean up file if processing failed
-             if file_path and os.path.exists(file_path):
+             if os.path.exists(file_path):
                  os.remove(file_path)
              raise HTTPException(status_code=400, detail=text_content)

@@ -114,11 +101,8 @@ async def upload_document(
          raise
      except Exception as e:
          # Clean up file if something went wrong
-         if file_path and os.path.exists(file_path):
-             try:
-                 os.remove(file_path)
-             except Exception as cleanup_e:
-                 print(f"Error cleaning up file {file_path}: {cleanup_e}")
+         if 'file_path' in locals() and os.path.exists(file_path):
+             os.remove(file_path)
          raise HTTPException(status_code=500, detail=f"Error uploading document: {str(e)}")


@@ -157,7 +141,7 @@ def get_document(document_id: int, db: Session = Depends(get_db)):


  @router.delete("/{document_id}", response_model=DocumentDeleteResponse)
- def delete_document(document_id: int, db: Session = Depends(get_db), vector_store: VectorStore = Depends(get_vector_store)):
+ def delete_document(document_id: int, db: Session = Depends(get_db)):
      """Delete a document and its vector embeddings"""
      try:
          document = db.query(Document).filter(Document.id == document_id).first()
@@ -169,10 +153,7 @@ def delete_document(document_id: int, db: Session = Depends(get_db), vector_store: VectorStore = Depends(get_vector_store)):

          # Delete file from filesystem
          if isinstance(document.file_path, str) and os.path.exists(document.file_path):
-             try:
-                 os.remove(document.file_path)
-             except Exception as e:
-                 print(f"Error deleting file {document.file_path}: {e}")
+             os.remove(document.file_path)

          # Delete from database
          db.delete(document)
@@ -189,39 +170,36 @@ def delete_document(document_id: int, db: Session = Depends(get_db), vector_store: VectorStore = Depends(get_vector_store)):


  @router.post("/clear_all")
- async def clear_all_data(db: Session = Depends(get_db), vector_store: VectorStore = Depends(get_vector_store)):
+ async def clear_all_data(db: Session = Depends(get_db)):
      """Admin endpoint to clear all documents, chat messages, uploaded files, and vector store."""
      try:
          # Delete all documents and chat messages from DB
          db.query(Document).delete()
          db.query(ChatMessage).delete()
          db.commit()
-
-         # Delete all files in uploads directory (only if UPLOAD_DIR is specified and exists)
+         # Delete all files in uploads directory
          upload_dir = settings.UPLOAD_DIR
-         if upload_dir and os.path.exists(upload_dir):
-             loop = asyncio.get_event_loop()
-             def remove_uploads():
-                 for filename in os.listdir(upload_dir):
-                     file_path = os.path.join(upload_dir, filename)
-                     try:
-                         if os.path.isfile(file_path) or os.path.islink(file_path):
-                             os.unlink(file_path)
-                         elif os.path.isdir(file_path):
-                             shutil.rmtree(file_path)
-                     except Exception as e:
-                         print(f"Failed to delete {file_path}: {e}")
-             await loop.run_in_executor(None, remove_uploads)
-
+         loop = asyncio.get_event_loop()
+         def remove_uploads():
+             for filename in os.listdir(upload_dir):
+                 file_path = os.path.join(upload_dir, filename)
+                 try:
+                     if os.path.isfile(file_path) or os.path.islink(file_path):
+                         os.unlink(file_path)
+                     elif os.path.isdir(file_path):
+                         shutil.rmtree(file_path)
+                 except Exception as e:
+                     print(f"Failed to delete {file_path}: {e}")
+         await loop.run_in_executor(None, remove_uploads)
          # Clear ChromaDB vector store using the singleton
-         await asyncio.get_event_loop().run_in_executor(None, vector_store.clear_all)
+         await loop.run_in_executor(None, vector_store.clear_all)
          return {"success": True, "message": "All documents, chat messages, uploads, and vectors cleared."}
      except Exception as e:
          return {"success": False, "message": f"Error clearing data: {str(e)}"}


  @router.get("/stats/summary")
- def get_document_stats(db: Session = Depends(get_db), vector_store: VectorStore = Depends(get_vector_store)):
+ def get_document_stats(db: Session = Depends(get_db)):
      """Get document statistics"""
      try:
          total_documents = db.query(Document).count()
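In the upload handler the exception path now probes `'file_path' in locals()`, because file_path is only bound partway through the try block. Binding the name before the try (as the removed `file_path = None` line did) expresses the same cleanup more conventionally; a small sketch with hypothetical save/process callables standing in for the endpoint's real steps:

```python
import os
from typing import Callable, Optional

def save_then_process(save_upload: Callable[[], str], process: Callable[[str], None]) -> None:
    """Remove a partially written upload if processing fails."""
    file_path: Optional[str] = None  # bound before the try, so cleanup never needs locals()
    try:
        file_path = save_upload()   # writes the upload to disk, returns its path
        process(file_path)          # may raise; the file is cleaned up below
    except Exception:
        if file_path and os.path.exists(file_path):
            os.remove(file_path)
        raise
```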
backend/app/core/config.py CHANGED
@@ -14,17 +14,17 @@ class Settings(BaseSettings):
      ACCESS_TOKEN_EXPIRE_MINUTES: int = 60 * 24 * 8  # 8 days

      # Database
-     DATABASE_URL: str = os.environ.get("DATABASE_URL", f"sqlite:///{tempfile.gettempdir()}/pdf_chatbot.db")
+     DATABASE_URL: str = "sqlite:///./pdf_chatbot.db"

      # Vector Database
-     CHROMA_PERSIST_DIRECTORY: Optional[str] = os.environ.get("CHROMA_PERSIST_DIRECTORY", None)
+     CHROMA_PERSIST_DIRECTORY: str = "chroma_db"

      # AI Providers
      OPENROUTER_API_KEY: Optional[str] = None
      ANTHROPIC_API_KEY: Optional[str] = None

      # File Storage
-     UPLOAD_DIR: str = os.environ.get("UPLOAD_DIR", "/tmp/uploads")  # Use os.environ.get with /tmp default
+     UPLOAD_DIR: str = "uploads"
      MAX_FILE_SIZE: int = 10 * 1024 * 1024  # 10MB
      ALLOWED_EXTENSIONS: list = [".pdf"]

@@ -42,4 +42,44 @@ class Settings(BaseSettings):


  settings = Settings()
+
+ # Ensure upload and persistence directories exist, falling back to system temporary
+ for attr, fallback_subdir in [("UPLOAD_DIR", "uploads"), ("CHROMA_PERSIST_DIRECTORY", "chroma_db")]:
+     dir_path = getattr(settings, attr)
+     try:
+         os.makedirs(dir_path, exist_ok=True)
+     except PermissionError:
+         # Fall back to a path inside the system temporary directory where writes are usually allowed
+         temp_dir = os.path.join(tempfile.gettempdir(), fallback_subdir)
+         os.makedirs(temp_dir, exist_ok=True)
+         setattr(settings, attr, temp_dir)
+
+ # Additional safety: ensure CHROMA_PERSIST_DIRECTORY is writable (fallback to temp dir if not)
+ try:
+     _test_chroma = os.path.join(settings.CHROMA_PERSIST_DIRECTORY, '.write_test')
+     with open(_test_chroma, 'w') as _f:
+         _f.write('')
+     os.remove(_test_chroma)
+ except (PermissionError, OSError):
+     temp_chroma = os.path.join(tempfile.gettempdir(), 'chroma_db')
+     os.makedirs(temp_chroma, exist_ok=True)
+     settings.CHROMA_PERSIST_DIRECTORY = temp_chroma
+
+ # Ensure SQLite database path is writable (fallback to temp dir if not)
+ if settings.DATABASE_URL.startswith("sqlite"):
+     db_uri_prefix = "sqlite:///"
+     db_path = settings.DATABASE_URL[len(db_uri_prefix):]
+     abs_db_path = os.path.abspath(db_path)
+     db_dir = os.path.dirname(abs_db_path)
+     try:
+         os.makedirs(db_dir, exist_ok=True)
+         # Try creating a temporary file to check write access
+         test_path = os.path.join(db_dir, ".write_test")
+         with open(test_path, "w") as _f:
+             _f.write("")
+         os.remove(test_path)
+     except (PermissionError, OSError):
+         # Fallback
+         temp_db_path = os.path.join(tempfile.gettempdir(), "pdf_chatbot.db")
+         settings.DATABASE_URL = f"sqlite:///{temp_db_path}"

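The block added after settings = Settings() applies the same recipe three times: try the configured location, and if it is not writable fall back to a path under tempfile.gettempdir(). A sketch of a helper that factors that recipe out; the helper name and the usage lines are illustrative, not part of this commit:

```python
import os
import tempfile

def writable_dir(preferred: str, fallback_subdir: str) -> str:
    """Return `preferred` if it can be created and written to, else a temp-dir fallback."""
    for candidate in (preferred, os.path.join(tempfile.gettempdir(), fallback_subdir)):
        try:
            os.makedirs(candidate, exist_ok=True)
            probe = os.path.join(candidate, ".write_test")
            with open(probe, "w"):
                pass
            os.remove(probe)
            return candidate
        except OSError:
            continue
    raise RuntimeError(f"No writable location found for {preferred!r}")

# Usage mirroring the settings adjustments above (illustrative):
# settings.UPLOAD_DIR = writable_dir(settings.UPLOAD_DIR, "uploads")
# settings.CHROMA_PERSIST_DIRECTORY = writable_dir(settings.CHROMA_PERSIST_DIRECTORY, "chroma_db")
```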
backend/app/services/vector_store.py CHANGED
@@ -1,17 +1,12 @@
  import chromadb
  from chromadb.config import Settings as ChromaSettings
- from typing import List, Dict, Optional, Tuple, Any, Union
+ from typing import List, Dict, Optional, Tuple
  import json
  import logging
  from app.core.config import settings
- from chromadb.types import Metadata
- from chromadb.api import ClientAPI
- from chromadb import PersistentClient  # Import PersistentClient directly

  logger = logging.getLogger(__name__)

- # Define a type for ChromaDB metadata values
- MetadataValue = Union[str, int, float, bool]

  class VectorStore:
      _instance = None
@@ -24,141 +19,122 @@ class VectorStore:

      def __init__(self):
          if not self._initialized:
-             self.client: Optional[ClientAPI] = None
-             self.collection = None
+             self.client = chromadb.PersistentClient(
+                 path=settings.CHROMA_PERSIST_DIRECTORY,
+                 settings=ChromaSettings(
+                     anonymized_telemetry=False
+                 )
+             )
              self.collection_name = "pdf_documents"
+             self.collection = self._get_or_create_collection()
              self._initialized = True
-
-     def _get_client_and_collection(self):
-         """Lazily initializes and returns the ChromaDB client and collection."""
-         if self.client is None or self.collection is None:
-             # Always use in-memory client for robust deployment on restricted filesystems
-             self.client = chromadb.Client()
-             logger.info("ChromaDB in-memory client initialized.")
-
-             try:
-                 self.collection = self.client.get_or_create_collection(name=self.collection_name)
-                 logger.info(f"Using/Created collection: {self.collection_name}")
-             except Exception as e:
-                 logger.error(f"Failed to get or create collection {self.collection_name}: {e}. This is a critical error.")
-                 raise  # Re-raise if collection cannot be obtained/created
-
-         return self.client, self.collection
-
-     def add_document(self, document_id: str, content: str, metadata: Optional[Dict[str, Any]] = None) -> bool:
+
+     def _get_or_create_collection(self):
+         """Get existing collection or create new one"""
+         try:
+             collection = self.client.get_collection(name=self.collection_name)
+             logger.info(f"Using existing collection: {self.collection_name}")
+         except Exception:
+             collection = self.client.create_collection(
+                 name=self.collection_name,
+                 metadata={"description": "PDF document embeddings for Q&A chatbot"}
+             )
+             logger.info(f"Created new collection: {self.collection_name}")
+
+         return collection
+
+     def add_document(self, document_id: str, content: str, metadata: Dict = None) -> bool:
          """Add document content to vector store"""
-         client, collection = self._get_client_and_collection()
         try:
             logger.info(f"Starting to add document {document_id} to vector store")
             logger.info(f"Content length: {len(content)} characters")
-
+
             # Split content into chunks for better retrieval
             chunks = self._split_text(content, chunk_size=1000, overlap=200)
             logger.info(f"Split content into {len(chunks)} chunks")
-
+
             # Prepare data for ChromaDB
             ids = [f"{document_id}_chunk_{i}" for i in range(len(chunks))]
             documents = chunks
-
-             metadatas: List[Metadata] = []
-             for i in range(len(chunks)):
-                 chunk_metadata: Dict[str, MetadataValue] = {
-                     "document_id": document_id,
-                     "chunk_index": i,
-                     **(metadata or {})
-                 }
-                 # Ensure all metadata values are of supported types by ChromaDB
-                 for k, v in chunk_metadata.items():
-                     if not isinstance(v, (str, int, float, bool)):
-                         chunk_metadata[k] = str(v)  # Convert unsupported types to string
-                 metadatas.append(chunk_metadata)
-
+             metadatas = [{
+                 "document_id": document_id,
+                 "chunk_index": i,
+                 **(metadata or {})
+             } for i in range(len(chunks))]
+
             logger.info(f"Prepared {len(ids)} chunks with IDs: {ids[:3]}...")  # Log first 3 IDs
-
+
             # Add to collection
             logger.info(f"Adding chunks to ChromaDB collection: {self.collection_name}")
-             collection.add(
+             self.collection.add(
                 ids=ids,
                 documents=documents,
-                 metadatas=metadatas  # This should now match Metadata type
+                 metadatas=metadatas
             )
-
+
             logger.info(f"Successfully added document {document_id} with {len(chunks)} chunks to vector store")
             return True
-
+
         except Exception as e:
             logger.error(f"Error adding document {document_id} to vector store: {e}")
             logger.error(f"Exception type: {type(e).__name__}")
             import traceback
             logger.error(f"Full traceback: {traceback.format_exc()}")
             return False
-
-     def search_similar(self, query: str, n_results: int = 5, document_id: Optional[str] = None) -> List[Dict]:
+
+     def search_similar(self, query: str, n_results: int = 5, document_id: str = None) -> List[Dict]:
         """Search for similar documents based on query, optionally filtering by document_id"""
-         client, collection = self._get_client_and_collection()
         try:
-             results = collection.query(
+             results = self.collection.query(
                 query_texts=[query],
                 n_results=n_results,
                 include=["documents", "metadatas", "distances"]
             )
-
+
             # Format results
             formatted_results = []
-             if results and results.get('documents') and results.get('metadatas') and results.get('distances'):
-                 documents_list = results['documents'][0] if results['documents'] else []
-                 metadatas_list = results['metadatas'][0] if results['metadatas'] else []
-                 distances_list = results['distances'][0] if results['distances'] else []
-
-                 # Ensure iterables are not empty before zipping
-                 if documents_list and metadatas_list and distances_list:
-                     for i, (doc, metadata, distance) in enumerate(zip(
-                         documents_list,
-                         metadatas_list,
-                         distances_list
-                     )):
-                         # Ensure metadata is a dict before trying .get()
-                         if not isinstance(metadata, dict):
-                             metadata = {}
-
-                         if document_id is not None and str(metadata.get('document_id')) != str(document_id):
-                             continue
-                         formatted_results.append({
-                             'content': doc,
-                             'metadata': metadata,
-                             'similarity_score': 1 - distance,  # Convert distance to similarity
-                             'rank': i + 1  # Re-introduce rank based on enumerate
-                         })
+             if results['documents'] and results['documents'][0]:
+                 for i, (doc, metadata, distance) in enumerate(zip(
+                     results['documents'][0],
+                     results['metadatas'][0],
+                     results['distances'][0]
+                 )):
+                     if document_id is not None and str(metadata.get('document_id')) != str(document_id):
+                         continue
+                     formatted_results.append({
+                         'content': doc,
+                         'metadata': metadata,
+                         'similarity_score': 1 - distance,  # Convert distance to similarity
+                         'rank': i + 1
+                     })
             return formatted_results
         except Exception as e:
             logger.error(f"Error searching vector store: {e}")
             return []
-
+
     def delete_document(self, document_id: str) -> bool:
         """Delete all chunks for a specific document"""
-         client, collection = self._get_client_and_collection()
         try:
             # Get all chunks for this document
-             results = collection.get(
+             results = self.collection.get(
                 where={"document_id": document_id}
             )
-
+
             if results['ids']:
-                 collection.delete(ids=results['ids'])
+                 self.collection.delete(ids=results['ids'])
                 logger.info(f"Deleted {len(results['ids'])} chunks for document {document_id}")
-
+
             return True
-
+
         except Exception as e:
             logger.error(f"Error deleting document {document_id} from vector store: {e}")
             return False
-
+
     def get_collection_stats(self) -> Dict:
         """Get statistics about the vector store collection"""
-         client, collection = self._get_client_and_collection()
         try:
             logger.info(f"Getting stats for collection: {self.collection_name}")
-             count = collection.count()
+             count = self.collection.count()
             logger.info(f"Collection count: {count}")
             return {
                 "total_documents": count,
@@ -170,18 +146,18 @@
             import traceback
             logger.error(f"Full traceback: {traceback.format_exc()}")
             return {"total_documents": 0, "collection_name": self.collection_name}
-
+
     def _split_text(self, text: str, chunk_size: int = 1000, overlap: int = 200) -> List[str]:
         """Split text into overlapping chunks"""
         if len(text) <= chunk_size:
             return [text]
-
+
         chunks = []
         start = 0
-
+
         while start < len(text):
             end = start + chunk_size
-
+
             # If this isn't the last chunk, try to break at a sentence boundary
             if end < len(text):
                 # Look for sentence endings
@@ -189,36 +165,29 @@
                 if text[i] in '.!?':
                     end = i + 1
                     break
-
+
             chunk = text[start:end].strip()
             if chunk:
                 chunks.append(chunk)
-
+
             # Move start position with overlap
             start = end - overlap
             if start >= len(text):
                 break
-
+
         return chunks
-
+
     def clear_all(self) -> bool:
         """Clear all documents from the vector store"""
-         client, collection = self._get_client_and_collection()
         try:
-             # Use client.reset() to clear all data and collections
-             client.reset()
-             logger.info("ChromaDB client reset. All data cleared.")
-             # Re-create the collection after reset
-             self.collection = client.create_collection(
-                 name=self.collection_name,
-                 metadata={"description": "PDF document embeddings for Q&A chatbot"}
-             )
-             logger.info(f"Re-created collection {self.collection_name} after reset.")
+             self.client.delete_collection(name=self.collection_name)
+             self.collection = self._get_or_create_collection()
+             logger.info("Cleared all documents from vector store")
             return True
         except Exception as e:
-             logger.error(f"Error clearing/resetting vector store: {e}")
+             logger.error(f"Error clearing vector store: {e}")
             return False
-
+
     @classmethod
     def reset_instance(cls):
         """Reset the singleton instance - useful after clearing collections"""
backend/main.py CHANGED
@@ -49,15 +49,9 @@ app.include_router(
  )

  # Serve static frontend (exported Next.js) at root
- frontend_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "frontend", "out"))
-
- # Log frontend path and existence
- logging.info(f"Calculated frontend_path: {frontend_path}")
+ frontend_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "frontend_out"))
  if os.path.isdir(frontend_path):
-     logging.info(f"Frontend directory exists: {frontend_path}")
      app.mount("/", StaticFiles(directory=frontend_path, html=True), name="frontend")
- else:
-     logging.warning(f"Frontend directory does NOT exist: {frontend_path}. Frontend will not be served.")

  # Health check endpoint
  @app.get("/health")
@@ -84,40 +78,39 @@ def root():
  @app.on_event("startup")
  async def startup_event():
      """Initialize application on startup"""
-
-     # --- ERASE ALL DOCUMENTS, CHAT MESSAGES, AND VECTORS ON STARTUP ---
-     try:
-         # Log UPLOAD_DIR for debugging
-         logging.info(f"Configured UPLOAD_DIR: {settings.UPLOAD_DIR}")
-
-         # Ensure UPLOAD_DIR exists (even if it's /tmp, to catch permission issues)
-         try:
-             os.makedirs(settings.UPLOAD_DIR, exist_ok=True)
-             logging.info(f"Ensured UPLOAD_DIR exists: {settings.UPLOAD_DIR}")
-         except PermissionError as e:
-             logging.error(f"CRITICAL PERMISSION ERROR: Cannot create UPLOAD_DIR at {settings.UPLOAD_DIR}: {e}. Uploads will fail.")
-
-         # 1. Delete all rows from documents and chat_messages tables
-         db = SessionLocal()
-         try:
-             db.query(Document).delete()
-             db.query(ChatMessage).delete()
-             db.commit()
-         finally:
-             db.close()
-
-         # 2. Clear ChromaDB vector store using the singleton (now purely in-memory)
-         VectorStore.reset_instance()
-         vector_store_instance = VectorStore()
-         vector_store_instance.clear_all()
-         logging.info("All documents, chat messages, and vector store erased on startup.")
-
-     except Exception as e:
-         logging.error(f"Error during startup data cleanup: {e}")
-
-     # Create database tables (should be done after any potential database file cleanup)
-     create_tables()
-
+     # Create database tables
+     create_tables()
+
+     # Ensure directories exist
+     os.makedirs(settings.UPLOAD_DIR, exist_ok=True)
+     os.makedirs(settings.CHROMA_PERSIST_DIRECTORY, exist_ok=True)
+
+     # --- ERASE ALL DOCUMENTS, CHAT MESSAGES, AND VECTORS ON STARTUP ---
+     # 1. Delete all rows from documents and chat_messages tables
+     db = SessionLocal()
+     try:
+         db.query(Document).delete()
+         db.query(ChatMessage).delete()
+         db.commit()
+     finally:
+         db.close()
+     # 2. Remove all files in chroma_db directory (but keep the directory)
+     chroma_dir = settings.CHROMA_PERSIST_DIRECTORY
+     for filename in os.listdir(chroma_dir):
+         file_path = os.path.join(chroma_dir, filename)
+         try:
+             if os.path.isfile(file_path) or os.path.islink(file_path):
+                 os.unlink(file_path)
+             elif os.path.isdir(file_path):
+                 shutil.rmtree(file_path)
+         except Exception as e:
+             logging.warning(f"Failed to delete {file_path}: {e}")
+     # 3. Explicitly clear ChromaDB vector store
+     vector_store = VectorStore()
+     vector_store.clear_all()
+     logging.info("All documents, chat messages, and vector store erased on startup.")
+     # --- END ERASE ---
+
      logging.info("Application started successfully")

  # Shutdown event
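The startup wipe is registered with @app.on_event("startup"), which FastAPI still accepts but has deprecated in favor of lifespan handlers. A sketch of the equivalent wiring; the cleanup work is only summarized in comments and would reuse the helpers imported in this module:

```python
from contextlib import asynccontextmanager

from fastapi import FastAPI

@asynccontextmanager
async def lifespan(app: FastAPI):
    # Startup: the same work the on_event handler above performs:
    # create_tables(), ensure UPLOAD_DIR / CHROMA_PERSIST_DIRECTORY exist,
    # then wipe documents, chat messages, and the vector store.
    yield
    # Shutdown: nothing extra to release in this sketch.

app = FastAPI(lifespan=lifespan)
```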
frontend/package.json CHANGED
@@ -6,7 +6,8 @@
      "dev": "next dev",
      "build": "next build",
      "start": "next start",
-     "lint": "next lint"
+     "lint": "next lint",
+     "export": "next build"
    },
    "dependencies": {
      "next": "14.0.4",