Amin23 committed
Commit ec2497b · Parent: 19c50c3

initial commit

Dockerfile CHANGED
@@ -5,7 +5,7 @@
  COPY frontend/package*.json ./
  RUN npm ci --only=production
  COPY frontend/ ./
- RUN npm run build
+ RUN npm run build && npm run export


  # ---------- Back‑end stage ----------
@@ -24,11 +24,12 @@
  COPY backend/ ./

  # Static export from previous stage
- COPY --from=frontend-build /app/frontend/out /app/backend/frontend/out
+ COPY --from=frontend-build /app/frontend/out /app/frontend_out

  # Create folders & DB file and make them writable
- # Directories /tmp/uploads and /tmp/pdf_chatbot.db are used and should be writable by default
- # ChromaDB is configured for in-memory, so no persistent directory needed for it
+ RUN mkdir -p /app/backend/uploads /app/backend/chroma_db \
+     && touch /app/backend/pdf_chatbot.db \
+     && chmod -R 777 /app/backend/uploads /app/backend/chroma_db /app/backend/pdf_chatbot.db

  # Expose API port
  EXPOSE 8000
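The new RUN step pre-creates the upload and Chroma directories and the SQLite file, then opens them with chmod 777 so the (possibly non-root) runtime user can write to them. A minimal permission smoke test along those lines; the paths are the ones from the RUN step above, and the script itself is not part of this commit:

```python
import os

# Paths created by the Dockerfile RUN step above.
WRITABLE_PATHS = [
    "/app/backend/uploads",
    "/app/backend/chroma_db",
    "/app/backend/pdf_chatbot.db",
]

def is_writable(path: str) -> bool:
    """Return True if a file can be created inside (or appended to) `path`."""
    try:
        if os.path.isdir(path):
            probe = os.path.join(path, ".write_test")
            with open(probe, "w"):
                pass
            os.remove(probe)
        else:
            # The SQLite file is a plain file; appending proves write access.
            with open(path, "a"):
                pass
        return True
    except OSError:
        return False

if __name__ == "__main__":
    for p in WRITABLE_PATHS:
        print(f"{p}: {'writable' if is_writable(p) else 'NOT writable'}")
```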
backend/app/api/endpoints/chat.py CHANGED
@@ -4,7 +4,6 @@ from typing import List
  import json
  import uuid
  from datetime import datetime
- from sqlalchemy import func as sa_func

  from app.core.database import get_db
  from app.models.document import ChatMessage
@@ -13,20 +12,14 @@ from app.services.vector_store import VectorStore
  from app.services.ai_service import AIService

  router = APIRouter()
-
- def get_vector_store() -> VectorStore:
-     return VectorStore()
-
- def get_ai_service() -> AIService:
-     return AIService()
+ vector_store = VectorStore()
+ ai_service = AIService()


  @router.post("/", response_model=ChatResponse)
  def chat_with_documents(
      request: ChatRequest,
-     db: Session = Depends(get_db),
-     vector_store: VectorStore = Depends(get_vector_store),
-     ai_service: AIService = Depends(get_ai_service)
+     db: Session = Depends(get_db)
  ):
      """Send a question and get an answer based on uploaded documents"""
      try:
@@ -70,20 +63,19 @@ def chat_with_documents(
          )
          db.add(assistant_message)
          db.commit()
-         db.refresh(assistant_message)  # Refresh to get the actual ID

          return ChatResponse(
              success=True,
              answer=answer,
              session_id=request.session_id,
-             message_id=ChatMessageResponse.from_orm(assistant_message).id  # Access ID from Pydantic model
+             message_id=assistant_message.id
          )

          # Generate answer using AI
          ai_response = ai_service.generate_answer(
              request.question,
              context_documents,
-             model=str(request.model) if request.model else "auto"  # Cast to str to satisfy linter
+             model=request.model
          )

          # Save user message
@@ -105,7 +97,6 @@ def chat_with_documents(
          )
          db.add(assistant_message)
          db.commit()
-         db.refresh(assistant_message)  # Refresh to get the actual ID

          return ChatResponse(
              success=ai_response["success"],
@@ -113,7 +104,7 @@ def chat_with_documents(
              model=ai_response.get("model"),
              sources=ai_response.get("sources", []),
              session_id=request.session_id,
-             message_id=ChatMessageResponse.from_orm(assistant_message).id  # Access ID from Pydantic model
+             message_id=assistant_message.id
          )

      except HTTPException:
@@ -164,10 +155,10 @@ def list_sessions(db: Session = Depends(get_db)):
      # Get unique session IDs with message counts
      sessions = db.query(
          ChatMessage.session_id,
-         sa_func.count(ChatMessage.id).label('message_count'),
-         sa_func.max(ChatMessage.created_at).label('last_message_at')
+         db.func.count(ChatMessage.id).label('message_count'),
+         db.func.max(ChatMessage.created_at).label('last_message_at')
      ).group_by(ChatMessage.session_id).order_by(
-         sa_func.max(ChatMessage.created_at).desc()
+         db.func.max(ChatMessage.created_at).desc()
      ).all()

      return [
@@ -201,7 +192,7 @@ def delete_session(session_id: str, db: Session = Depends(get_db)):


  @router.get("/models/available")
- def get_available_models(ai_service: AIService = Depends(get_ai_service)):
+ def get_available_models():
      """Get list of available AI models"""
      try:
          models = ai_service.get_available_models()
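This change swaps the Depends(get_vector_store) / Depends(get_ai_service) providers for module-level singletons, so both services are constructed once at import time. If avoiding per-request construction was the only concern, a cached provider would keep the dependency-injection seam (and app.dependency_overrides for tests); a sketch under that assumption, with stand-in classes instead of the real services:

```python
from functools import lru_cache

from fastapi import Depends, FastAPI

# Stand-ins for app.services.vector_store.VectorStore and app.services.ai_service.AIService.
class VectorStore: ...

class AIService:
    def get_available_models(self) -> list:
        return []

@lru_cache(maxsize=1)
def get_vector_store() -> VectorStore:
    # Built once per process, reused by every request that depends on it.
    return VectorStore()

@lru_cache(maxsize=1)
def get_ai_service() -> AIService:
    return AIService()

app = FastAPI()

@app.get("/models/available")
def get_available_models(ai_service: AIService = Depends(get_ai_service)):
    # Tests can still replace the service via app.dependency_overrides[get_ai_service].
    return {"models": ai_service.get_available_models()}
```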
backend/app/api/endpoints/documents.py CHANGED
@@ -18,23 +18,16 @@ import asyncio
  from concurrent.futures import ThreadPoolExecutor

  router = APIRouter()
-
- def get_pdf_processor() -> PDFProcessor:
-     return PDFProcessor()
-
- def get_vector_store() -> VectorStore:
-     return VectorStore()
+ pdf_processor = PDFProcessor()
+ vector_store = VectorStore()


  @router.post("/upload", response_model=UploadResponse)
  async def upload_document(
      file: UploadFile = File(...),
-     db: Session = Depends(get_db),
-     pdf_processor: PDFProcessor = Depends(get_pdf_processor),
-     vector_store: VectorStore = Depends(get_vector_store)
+     db: Session = Depends(get_db)
  ):
      """Upload and process a PDF document"""
-     file_path = None  # Initialize file_path to None
      try:
          # Restrict to 3 documents max
          doc_count = db.query(Document).count()
@@ -44,12 +37,6 @@ async def upload_document(
          if not file.filename or not file.filename.lower().endswith('.pdf'):
              raise HTTPException(status_code=400, detail="Only PDF files are allowed")

-         # No need to explicitly create UPLOAD_DIR as it's /tmp/uploads and created by OS or on first write
-         # try:
-         #     os.makedirs(settings.UPLOAD_DIR, exist_ok=True)
-         # except PermissionError as e:
-         #     raise HTTPException(status_code=500, detail=f"Failed to create upload directory: {e}")
-
          # Generate unique filename
          file_extension = os.path.splitext(str(file.filename))[1]
          unique_filename = f"{uuid.uuid4()}{file_extension}"
@@ -65,7 +52,7 @@ async def upload_document(

          if not success:
              # Clean up file if processing failed
-             if file_path and os.path.exists(file_path):
+             if os.path.exists(file_path):
                  os.remove(file_path)
              raise HTTPException(status_code=400, detail=text_content)

@@ -114,11 +101,8 @@ async def upload_document(
          raise
      except Exception as e:
          # Clean up file if something went wrong
-         if file_path and os.path.exists(file_path):
-             try:
-                 os.remove(file_path)
-             except Exception as cleanup_e:
-                 print(f"Error cleaning up file {file_path}: {cleanup_e}")
+         if 'file_path' in locals() and os.path.exists(file_path):
+             os.remove(file_path)
          raise HTTPException(status_code=500, detail=f"Error uploading document: {str(e)}")


@@ -157,7 +141,7 @@ def get_document(document_id: int, db: Session = Depends(get_db)):


  @router.delete("/{document_id}", response_model=DocumentDeleteResponse)
- def delete_document(document_id: int, db: Session = Depends(get_db), vector_store: VectorStore = Depends(get_vector_store)):
+ def delete_document(document_id: int, db: Session = Depends(get_db)):
      """Delete a document and its vector embeddings"""
      try:
          document = db.query(Document).filter(Document.id == document_id).first()
@@ -169,10 +153,7 @@ def delete_document(document_id: int, db: Session = Depends(get_db), vector_store: VectorStore = Depends(get_vector_store)):

          # Delete file from filesystem
          if isinstance(document.file_path, str) and os.path.exists(document.file_path):
-             try:
-                 os.remove(document.file_path)
-             except Exception as e:
-                 print(f"Error deleting file {document.file_path}: {e}")
+             os.remove(document.file_path)

          # Delete from database
          db.delete(document)
@@ -189,39 +170,36 @@ def delete_document(document_id: int, db: Session = Depends(get_db), vector_store: VectorStore = Depends(get_vector_store)):


  @router.post("/clear_all")
- async def clear_all_data(db: Session = Depends(get_db), vector_store: VectorStore = Depends(get_vector_store)):
+ async def clear_all_data(db: Session = Depends(get_db)):
      """Admin endpoint to clear all documents, chat messages, uploaded files, and vector store."""
      try:
          # Delete all documents and chat messages from DB
          db.query(Document).delete()
          db.query(ChatMessage).delete()
          db.commit()
-
-         # Delete all files in uploads directory (only if UPLOAD_DIR is specified and exists)
+         # Delete all files in uploads directory
          upload_dir = settings.UPLOAD_DIR
-         if upload_dir and os.path.exists(upload_dir):
-             loop = asyncio.get_event_loop()
-             def remove_uploads():
-                 for filename in os.listdir(upload_dir):
-                     file_path = os.path.join(upload_dir, filename)
-                     try:
-                         if os.path.isfile(file_path) or os.path.islink(file_path):
-                             os.unlink(file_path)
-                         elif os.path.isdir(file_path):
-                             shutil.rmtree(file_path)
-                     except Exception as e:
-                         print(f"Failed to delete {file_path}: {e}")
-             await loop.run_in_executor(None, remove_uploads)
-
+         loop = asyncio.get_event_loop()
+         def remove_uploads():
+             for filename in os.listdir(upload_dir):
+                 file_path = os.path.join(upload_dir, filename)
+                 try:
+                     if os.path.isfile(file_path) or os.path.islink(file_path):
+                         os.unlink(file_path)
+                     elif os.path.isdir(file_path):
+                         shutil.rmtree(file_path)
+                 except Exception as e:
+                     print(f"Failed to delete {file_path}: {e}")
+         await loop.run_in_executor(None, remove_uploads)
          # Clear ChromaDB vector store using the singleton
-         await asyncio.get_event_loop().run_in_executor(None, vector_store.clear_all)
+         await loop.run_in_executor(None, vector_store.clear_all)
          return {"success": True, "message": "All documents, chat messages, uploads, and vectors cleared."}
      except Exception as e:
          return {"success": False, "message": f"Error clearing data: {str(e)}"}


  @router.get("/stats/summary")
- def get_document_stats(db: Session = Depends(get_db), vector_store: VectorStore = Depends(get_vector_store)):
+ def get_document_stats(db: Session = Depends(get_db)):
      """Get document statistics"""
      try:
          total_documents = db.query(Document).count()
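In the upload handler the exception path now probes `'file_path' in locals()`, because file_path is only bound partway through the try block. Binding the name before the try (as the removed `file_path = None` line did) expresses the same cleanup more conventionally; a small sketch with hypothetical save/process callables standing in for the endpoint's real steps:

```python
import os
from typing import Callable, Optional

def save_then_process(save_upload: Callable[[], str], process: Callable[[str], None]) -> None:
    """Remove a partially written upload if processing fails."""
    file_path: Optional[str] = None  # bound before the try, so cleanup never needs locals()
    try:
        file_path = save_upload()   # writes the upload to disk, returns its path
        process(file_path)          # may raise; the file is cleaned up below
    except Exception:
        if file_path and os.path.exists(file_path):
            os.remove(file_path)
        raise
```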
backend/app/core/config.py CHANGED
@@ -14,17 +14,17 @@ class Settings(BaseSettings):
      ACCESS_TOKEN_EXPIRE_MINUTES: int = 60 * 24 * 8  # 8 days

      # Database
-     DATABASE_URL: str = os.environ.get("DATABASE_URL", f"sqlite:///{tempfile.gettempdir()}/pdf_chatbot.db")
+     DATABASE_URL: str = "sqlite:///./pdf_chatbot.db"

      # Vector Database
-     CHROMA_PERSIST_DIRECTORY: Optional[str] = os.environ.get("CHROMA_PERSIST_DIRECTORY", None)
+     CHROMA_PERSIST_DIRECTORY: str = "chroma_db"

      # AI Providers
      OPENROUTER_API_KEY: Optional[str] = None
      ANTHROPIC_API_KEY: Optional[str] = None

      # File Storage
-     UPLOAD_DIR: str = os.environ.get("UPLOAD_DIR", "/tmp/uploads")  # Use os.environ.get with /tmp default
+     UPLOAD_DIR: str = "uploads"
      MAX_FILE_SIZE: int = 10 * 1024 * 1024  # 10MB
      ALLOWED_EXTENSIONS: list = [".pdf"]

@@ -42,4 +42,44 @@ class Settings(BaseSettings):


  settings = Settings()
+
+ # Ensure upload and persistence directories exist, falling back to system temporary
+ for attr, fallback_subdir in [("UPLOAD_DIR", "uploads"), ("CHROMA_PERSIST_DIRECTORY", "chroma_db")]:
+     dir_path = getattr(settings, attr)
+     try:
+         os.makedirs(dir_path, exist_ok=True)
+     except PermissionError:
+         # Fall back to a path inside the system temporary directory where writes are usually allowed
+         temp_dir = os.path.join(tempfile.gettempdir(), fallback_subdir)
+         os.makedirs(temp_dir, exist_ok=True)
+         setattr(settings, attr, temp_dir)
+
+ # Additional safety: ensure CHROMA_PERSIST_DIRECTORY is writable (fallback to temp dir if not)
+ try:
+     _test_chroma = os.path.join(settings.CHROMA_PERSIST_DIRECTORY, '.write_test')
+     with open(_test_chroma, 'w') as _f:
+         _f.write('')
+     os.remove(_test_chroma)
+ except (PermissionError, OSError):
+     temp_chroma = os.path.join(tempfile.gettempdir(), 'chroma_db')
+     os.makedirs(temp_chroma, exist_ok=True)
+     settings.CHROMA_PERSIST_DIRECTORY = temp_chroma
+
+ # Ensure SQLite database path is writable (fallback to temp dir if not)
+ if settings.DATABASE_URL.startswith("sqlite"):
+     db_uri_prefix = "sqlite:///"
+     db_path = settings.DATABASE_URL[len(db_uri_prefix):]
+     abs_db_path = os.path.abspath(db_path)
+     db_dir = os.path.dirname(abs_db_path)
+     try:
+         os.makedirs(db_dir, exist_ok=True)
+         # Try creating a temporary file to check write access
+         test_path = os.path.join(db_dir, ".write_test")
+         with open(test_path, "w") as _f:
+             _f.write("")
+         os.remove(test_path)
+     except (PermissionError, OSError):
+         # Fallback
+         temp_db_path = os.path.join(tempfile.gettempdir(), "pdf_chatbot.db")
+         settings.DATABASE_URL = f"sqlite:///{temp_db_path}"

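The block added after settings = Settings() applies the same recipe three times: try the configured location, and if it is not writable fall back to a path under tempfile.gettempdir(). A sketch of a helper that factors that recipe out; the helper name and the usage lines are illustrative, not part of this commit:

```python
import os
import tempfile

def writable_dir(preferred: str, fallback_subdir: str) -> str:
    """Return `preferred` if it can be created and written to, else a temp-dir fallback."""
    for candidate in (preferred, os.path.join(tempfile.gettempdir(), fallback_subdir)):
        try:
            os.makedirs(candidate, exist_ok=True)
            probe = os.path.join(candidate, ".write_test")
            with open(probe, "w"):
                pass
            os.remove(probe)
            return candidate
        except OSError:
            continue
    raise RuntimeError(f"No writable location found for {preferred!r}")

# Usage mirroring the settings adjustments above (illustrative):
# settings.UPLOAD_DIR = writable_dir(settings.UPLOAD_DIR, "uploads")
# settings.CHROMA_PERSIST_DIRECTORY = writable_dir(settings.CHROMA_PERSIST_DIRECTORY, "chroma_db")
```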
backend/app/services/vector_store.py CHANGED
@@ -1,17 +1,12 @@
  import chromadb
  from chromadb.config import Settings as ChromaSettings
- from typing import List, Dict, Optional, Tuple, Any, Union
+ from typing import List, Dict, Optional, Tuple
  import json
  import logging
  from app.core.config import settings
- from chromadb.types import Metadata
- from chromadb.api import ClientAPI
- from chromadb import PersistentClient  # Import PersistentClient directly

  logger = logging.getLogger(__name__)

- # Define a type for ChromaDB metadata values
- MetadataValue = Union[str, int, float, bool]

  class VectorStore:
      _instance = None
@@ -24,141 +19,122 @@ class VectorStore:

      def __init__(self):
          if not self._initialized:
-             self.client: Optional[ClientAPI] = None
-             self.collection = None
+             self.client = chromadb.PersistentClient(
+                 path=settings.CHROMA_PERSIST_DIRECTORY,
+                 settings=ChromaSettings(
+                     anonymized_telemetry=False
+                 )
+             )
              self.collection_name = "pdf_documents"
+             self.collection = self._get_or_create_collection()
              self._initialized = True
-
-     def _get_client_and_collection(self):
-         """Lazily initializes and returns the ChromaDB client and collection."""
-         if self.client is None or self.collection is None:
-             # Always use in-memory client for robust deployment on restricted filesystems
-             self.client = chromadb.Client()
-             logger.info("ChromaDB in-memory client initialized.")
-
-             try:
-                 self.collection = self.client.get_or_create_collection(name=self.collection_name)
-                 logger.info(f"Using/Created collection: {self.collection_name}")
-             except Exception as e:
-                 logger.error(f"Failed to get or create collection {self.collection_name}: {e}. This is a critical error.")
-                 raise  # Re-raise if collection cannot be obtained/created
-
-         return self.client, self.collection
-
-     def add_document(self, document_id: str, content: str, metadata: Optional[Dict[str, Any]] = None) -> bool:
+
+     def _get_or_create_collection(self):
+         """Get existing collection or create new one"""
+         try:
+             collection = self.client.get_collection(name=self.collection_name)
+             logger.info(f"Using existing collection: {self.collection_name}")
+         except Exception:
+             collection = self.client.create_collection(
+                 name=self.collection_name,
+                 metadata={"description": "PDF document embeddings for Q&A chatbot"}
+             )
+             logger.info(f"Created new collection: {self.collection_name}")
+
+         return collection
+
+     def add_document(self, document_id: str, content: str, metadata: Dict = None) -> bool:
          """Add document content to vector store"""
-         client, collection = self._get_client_and_collection()
         try:
             logger.info(f"Starting to add document {document_id} to vector store")
             logger.info(f"Content length: {len(content)} characters")
-
+
             # Split content into chunks for better retrieval
             chunks = self._split_text(content, chunk_size=1000, overlap=200)
             logger.info(f"Split content into {len(chunks)} chunks")
-
+
             # Prepare data for ChromaDB
             ids = [f"{document_id}_chunk_{i}" for i in range(len(chunks))]
             documents = chunks
-
-             metadatas: List[Metadata] = []
-             for i in range(len(chunks)):
-                 chunk_metadata: Dict[str, MetadataValue] = {
-                     "document_id": document_id,
-                     "chunk_index": i,
-                     **(metadata or {})
-                 }
-                 # Ensure all metadata values are of supported types by ChromaDB
-                 for k, v in chunk_metadata.items():
-                     if not isinstance(v, (str, int, float, bool)):
-                         chunk_metadata[k] = str(v)  # Convert unsupported types to string
-                 metadatas.append(chunk_metadata)
-
+             metadatas = [{
+                 "document_id": document_id,
+                 "chunk_index": i,
+                 **(metadata or {})
+             } for i in range(len(chunks))]
+
             logger.info(f"Prepared {len(ids)} chunks with IDs: {ids[:3]}...")  # Log first 3 IDs
-
+
             # Add to collection
             logger.info(f"Adding chunks to ChromaDB collection: {self.collection_name}")
-             collection.add(
+             self.collection.add(
                 ids=ids,
                 documents=documents,
-                 metadatas=metadatas  # This should now match Metadata type
+                 metadatas=metadatas
             )
-
+
             logger.info(f"Successfully added document {document_id} with {len(chunks)} chunks to vector store")
             return True
-
+
         except Exception as e:
             logger.error(f"Error adding document {document_id} to vector store: {e}")
             logger.error(f"Exception type: {type(e).__name__}")
             import traceback
             logger.error(f"Full traceback: {traceback.format_exc()}")
             return False
-
-     def search_similar(self, query: str, n_results: int = 5, document_id: Optional[str] = None) -> List[Dict]:
+
+     def search_similar(self, query: str, n_results: int = 5, document_id: str = None) -> List[Dict]:
         """Search for similar documents based on query, optionally filtering by document_id"""
-         client, collection = self._get_client_and_collection()
         try:
-             results = collection.query(
+             results = self.collection.query(
                 query_texts=[query],
                 n_results=n_results,
                 include=["documents", "metadatas", "distances"]
             )
-
+
             # Format results
             formatted_results = []
-             if results and results.get('documents') and results.get('metadatas') and results.get('distances'):
-                 documents_list = results['documents'][0] if results['documents'] else []
-                 metadatas_list = results['metadatas'][0] if results['metadatas'] else []
-                 distances_list = results['distances'][0] if results['distances'] else []
-
-                 # Ensure iterables are not empty before zipping
-                 if documents_list and metadatas_list and distances_list:
-                     for i, (doc, metadata, distance) in enumerate(zip(
-                         documents_list,
-                         metadatas_list,
-                         distances_list
-                     )):
-                         # Ensure metadata is a dict before trying .get()
-                         if not isinstance(metadata, dict):
-                             metadata = {}
-
-                         if document_id is not None and str(metadata.get('document_id')) != str(document_id):
-                             continue
-                         formatted_results.append({
-                             'content': doc,
-                             'metadata': metadata,
-                             'similarity_score': 1 - distance,  # Convert distance to similarity
-                             'rank': i + 1  # Re-introduce rank based on enumerate
-                         })
+             if results['documents'] and results['documents'][0]:
+                 for i, (doc, metadata, distance) in enumerate(zip(
+                     results['documents'][0],
+                     results['metadatas'][0],
+                     results['distances'][0]
+                 )):
+                     if document_id is not None and str(metadata.get('document_id')) != str(document_id):
+                         continue
+                     formatted_results.append({
+                         'content': doc,
+                         'metadata': metadata,
+                         'similarity_score': 1 - distance,  # Convert distance to similarity
+                         'rank': i + 1
+                     })
             return formatted_results
         except Exception as e:
             logger.error(f"Error searching vector store: {e}")
             return []
-
+
     def delete_document(self, document_id: str) -> bool:
         """Delete all chunks for a specific document"""
-         client, collection = self._get_client_and_collection()
         try:
             # Get all chunks for this document
-             results = collection.get(
+             results = self.collection.get(
                 where={"document_id": document_id}
             )
-
+
             if results['ids']:
-                 collection.delete(ids=results['ids'])
+                 self.collection.delete(ids=results['ids'])
                 logger.info(f"Deleted {len(results['ids'])} chunks for document {document_id}")
-
+
             return True
-
+
         except Exception as e:
             logger.error(f"Error deleting document {document_id} from vector store: {e}")
             return False
-
+
     def get_collection_stats(self) -> Dict:
         """Get statistics about the vector store collection"""
-         client, collection = self._get_client_and_collection()
         try:
             logger.info(f"Getting stats for collection: {self.collection_name}")
-             count = collection.count()
+             count = self.collection.count()
             logger.info(f"Collection count: {count}")
             return {
                 "total_documents": count,
@@ -170,18 +146,18 @@
             import traceback
             logger.error(f"Full traceback: {traceback.format_exc()}")
             return {"total_documents": 0, "collection_name": self.collection_name}
-
+
     def _split_text(self, text: str, chunk_size: int = 1000, overlap: int = 200) -> List[str]:
         """Split text into overlapping chunks"""
         if len(text) <= chunk_size:
             return [text]
-
+
         chunks = []
         start = 0
-
+
         while start < len(text):
             end = start + chunk_size
-
+
             # If this isn't the last chunk, try to break at a sentence boundary
             if end < len(text):
                 # Look for sentence endings
@@ -189,36 +165,29 @@
                 if text[i] in '.!?':
                     end = i + 1
                     break
-
+
             chunk = text[start:end].strip()
             if chunk:
                 chunks.append(chunk)
-
+
             # Move start position with overlap
             start = end - overlap
             if start >= len(text):
                 break
-
+
         return chunks
-
+
     def clear_all(self) -> bool:
         """Clear all documents from the vector store"""
-         client, collection = self._get_client_and_collection()
         try:
-             # Use client.reset() to clear all data and collections
-             client.reset()
-             logger.info("ChromaDB client reset. All data cleared.")
-             # Re-create the collection after reset
-             self.collection = client.create_collection(
-                 name=self.collection_name,
-                 metadata={"description": "PDF document embeddings for Q&A chatbot"}
-             )
-             logger.info(f"Re-created collection {self.collection_name} after reset.")
+             self.client.delete_collection(name=self.collection_name)
+             self.collection = self._get_or_create_collection()
+             logger.info("Cleared all documents from vector store")
             return True
         except Exception as e:
-             logger.error(f"Error clearing/resetting vector store: {e}")
+             logger.error(f"Error clearing vector store: {e}")
             return False
-
+
     @classmethod
     def reset_instance(cls):
         """Reset the singleton instance - useful after clearing collections"""
backend/main.py CHANGED
@@ -49,15 +49,9 @@ app.include_router(
  )

  # Serve static frontend (exported Next.js) at root
- frontend_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "frontend", "out"))
-
- # Log frontend path and existence
- logging.info(f"Calculated frontend_path: {frontend_path}")
+ frontend_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "frontend_out"))
  if os.path.isdir(frontend_path):
-     logging.info(f"Frontend directory exists: {frontend_path}")
      app.mount("/", StaticFiles(directory=frontend_path, html=True), name="frontend")
- else:
-     logging.warning(f"Frontend directory does NOT exist: {frontend_path}. Frontend will not be served.")

  # Health check endpoint
  @app.get("/health")
@@ -84,40 +78,39 @@ def root():
  @app.on_event("startup")
  async def startup_event():
      """Initialize application on startup"""
-
-     # --- ERASE ALL DOCUMENTS, CHAT MESSAGES, AND VECTORS ON STARTUP ---
-     try:
-         # Log UPLOAD_DIR for debugging
-         logging.info(f"Configured UPLOAD_DIR: {settings.UPLOAD_DIR}")
-
-         # Ensure UPLOAD_DIR exists (even if it's /tmp, to catch permission issues)
-         try:
-             os.makedirs(settings.UPLOAD_DIR, exist_ok=True)
-             logging.info(f"Ensured UPLOAD_DIR exists: {settings.UPLOAD_DIR}")
-         except PermissionError as e:
-             logging.error(f"CRITICAL PERMISSION ERROR: Cannot create UPLOAD_DIR at {settings.UPLOAD_DIR}: {e}. Uploads will fail.")
-
-         # 1. Delete all rows from documents and chat_messages tables
-         db = SessionLocal()
-         try:
-             db.query(Document).delete()
-             db.query(ChatMessage).delete()
-             db.commit()
-         finally:
-             db.close()
-
-         # 2. Clear ChromaDB vector store using the singleton (now purely in-memory)
-         VectorStore.reset_instance()
-         vector_store_instance = VectorStore()
-         vector_store_instance.clear_all()
-         logging.info("All documents, chat messages, and vector store erased on startup.")
-
-     except Exception as e:
-         logging.error(f"Error during startup data cleanup: {e}")
-
-     # Create database tables (should be done after any potential database file cleanup)
-     create_tables()
-
+     # Create database tables
+     create_tables()
+
+     # Ensure directories exist
+     os.makedirs(settings.UPLOAD_DIR, exist_ok=True)
+     os.makedirs(settings.CHROMA_PERSIST_DIRECTORY, exist_ok=True)
+
+     # --- ERASE ALL DOCUMENTS, CHAT MESSAGES, AND VECTORS ON STARTUP ---
+     # 1. Delete all rows from documents and chat_messages tables
+     db = SessionLocal()
+     try:
+         db.query(Document).delete()
+         db.query(ChatMessage).delete()
+         db.commit()
+     finally:
+         db.close()
+     # 2. Remove all files in chroma_db directory (but keep the directory)
+     chroma_dir = settings.CHROMA_PERSIST_DIRECTORY
+     for filename in os.listdir(chroma_dir):
+         file_path = os.path.join(chroma_dir, filename)
+         try:
+             if os.path.isfile(file_path) or os.path.islink(file_path):
+                 os.unlink(file_path)
+             elif os.path.isdir(file_path):
+                 shutil.rmtree(file_path)
+         except Exception as e:
+             logging.warning(f"Failed to delete {file_path}: {e}")
+     # 3. Explicitly clear ChromaDB vector store
+     vector_store = VectorStore()
+     vector_store.clear_all()
+     logging.info("All documents, chat messages, and vector store erased on startup.")
+     # --- END ERASE ---
+
      logging.info("Application started successfully")

  # Shutdown event
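The startup wipe is registered with @app.on_event("startup"), which FastAPI still accepts but has deprecated in favor of lifespan handlers. A sketch of the equivalent wiring; the cleanup work is only summarized in comments and would reuse the helpers imported in this module:

```python
from contextlib import asynccontextmanager

from fastapi import FastAPI

@asynccontextmanager
async def lifespan(app: FastAPI):
    # Startup: the same work the on_event handler above performs:
    # create_tables(), ensure UPLOAD_DIR / CHROMA_PERSIST_DIRECTORY exist,
    # then wipe documents, chat messages, and the vector store.
    yield
    # Shutdown: nothing extra to release in this sketch.

app = FastAPI(lifespan=lifespan)
```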
frontend/package.json CHANGED
@@ -6,7 +6,8 @@
      "dev": "next dev",
      "build": "next build",
      "start": "next start",
-     "lint": "next lint"
+     "lint": "next lint",
+     "export": "next build"
    },
    "dependencies": {
      "next": "14.0.4",