initial commit
Files changed:
- Dockerfile +5 -4
- backend/app/api/endpoints/chat.py +10 -19
- backend/app/api/endpoints/documents.py +24 -46
- backend/app/core/config.py +43 -3
- backend/app/services/vector_store.py +76 -107
- backend/main.py +29 -36
- frontend/package.json +2 -1
Dockerfile
CHANGED
@@ -5,7 +5,7 @@
 COPY frontend/package*.json ./
 RUN npm ci --only=production
 COPY frontend/ ./
-RUN npm run build
+RUN npm run build && npm run export


 # ---------- Back‑end stage ----------
@@ -24,11 +24,12 @@
 COPY backend/ ./

 # Static export from previous stage
-COPY --from=frontend-build /app/frontend/out /app/
+COPY --from=frontend-build /app/frontend/out /app/frontend_out

 # Create folders & DB file and make them writable
+RUN mkdir -p /app/backend/uploads /app/backend/chroma_db \
+    && touch /app/backend/pdf_chatbot.db \
+    && chmod -R 777 /app/backend/uploads /app/backend/chroma_db /app/backend/pdf_chatbot.db

 # Expose API port
 EXPOSE 8000
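Note on the Dockerfile change: the front-end stage now runs the export step as well, and its output is copied to /app/frontend_out rather than over /app/ itself. The mkdir/touch/chmod -R 777 step presumably reflects that Hugging Face Spaces runs the container as a non-root user, so the uploads folder, the Chroma persistence directory, and the SQLite database file must all be writable at runtime.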
backend/app/api/endpoints/chat.py
CHANGED
@@ -4,7 +4,6 @@ from typing import List
 import json
 import uuid
 from datetime import datetime
-from sqlalchemy import func as sa_func

 from app.core.database import get_db
 from app.models.document import ChatMessage
@@ -13,20 +12,14 @@ from app.services.vector_store import VectorStore
 from app.services.ai_service import AIService

 router = APIRouter()
-    return VectorStore()
-def get_ai_service() -> AIService:
-    return AIService()
+vector_store = VectorStore()
+ai_service = AIService()


 @router.post("/", response_model=ChatResponse)
 def chat_with_documents(
     request: ChatRequest,
-    db: Session = Depends(get_db)
-    vector_store: VectorStore = Depends(get_vector_store),
-    ai_service: AIService = Depends(get_ai_service)
+    db: Session = Depends(get_db)
 ):
     """Send a question and get an answer based on uploaded documents"""
     try:
@@ -70,20 +63,19 @@ def chat_with_documents(
         )
         db.add(assistant_message)
         db.commit()
-        db.refresh(assistant_message)  # Refresh to get the actual ID

         return ChatResponse(
             success=True,
             answer=answer,
             session_id=request.session_id,
-            message_id=
+            message_id=assistant_message.id
         )

         # Generate answer using AI
         ai_response = ai_service.generate_answer(
             request.question,
             context_documents,
-            model=
+            model=request.model
         )

         # Save user message
@@ -105,7 +97,6 @@ def chat_with_documents(
         )
         db.add(assistant_message)
         db.commit()
-        db.refresh(assistant_message)  # Refresh to get the actual ID

         return ChatResponse(
             success=ai_response["success"],
@@ -113,7 +104,7 @@ def chat_with_documents(
             model=ai_response.get("model"),
             sources=ai_response.get("sources", []),
             session_id=request.session_id,
-            message_id=
+            message_id=assistant_message.id
         )

     except HTTPException:
@@ -164,10 +155,10 @@ def list_sessions(db: Session = Depends(get_db)):
         # Get unique session IDs with message counts
         sessions = db.query(
             ChatMessage.session_id,
+            db.func.count(ChatMessage.id).label('message_count'),
+            db.func.max(ChatMessage.created_at).label('last_message_at')
         ).group_by(ChatMessage.session_id).order_by(
+            db.func.max(ChatMessage.created_at).desc()
         ).all()

         return [
@@ -201,7 +192,7 @@ def delete_session(session_id: str, db: Session = Depends(get_db)):


 @router.get("/models/available")
-def get_available_models(
+def get_available_models():
     """Get list of available AI models"""
     try:
         models = ai_service.get_available_models()
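One caveat in the new list_sessions query: a SQLAlchemy Session object has no func attribute, so db.func.count(...) would raise AttributeError when the endpoint runs; the removed "from sqlalchemy import func as sa_func" import is what previously supplied the aggregates. A minimal sketch of the same grouping written against sqlalchemy.func, assuming only the ChatMessage fields already used above:

from sqlalchemy import func
from sqlalchemy.orm import Session

from app.models.document import ChatMessage


def list_session_summaries(db: Session):
    """Group messages by session_id with a count and the latest timestamp."""
    return (
        db.query(
            ChatMessage.session_id,
            func.count(ChatMessage.id).label("message_count"),
            func.max(ChatMessage.created_at).label("last_message_at"),
        )
        .group_by(ChatMessage.session_id)
        .order_by(func.max(ChatMessage.created_at).desc())
        .all()
    )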
backend/app/api/endpoints/documents.py
CHANGED
@@ -18,23 +18,16 @@ import asyncio
 from concurrent.futures import ThreadPoolExecutor

 router = APIRouter()
-    return PDFProcessor()
-def get_vector_store() -> VectorStore:
-    return VectorStore()
+pdf_processor = PDFProcessor()
+vector_store = VectorStore()


 @router.post("/upload", response_model=UploadResponse)
 async def upload_document(
     file: UploadFile = File(...),
-    db: Session = Depends(get_db)
-    pdf_processor: PDFProcessor = Depends(get_pdf_processor),
-    vector_store: VectorStore = Depends(get_vector_store)
+    db: Session = Depends(get_db)
 ):
     """Upload and process a PDF document"""
-    file_path = None  # Initialize file_path to None
     try:
         # Restrict to 3 documents max
         doc_count = db.query(Document).count()
@@ -44,12 +37,6 @@ async def upload_document(
         if not file.filename or not file.filename.lower().endswith('.pdf'):
             raise HTTPException(status_code=400, detail="Only PDF files are allowed")

-        # No need to explicitly create UPLOAD_DIR as it's /tmp/uploads and created by OS or on first write
-        # try:
-        #     os.makedirs(settings.UPLOAD_DIR, exist_ok=True)
-        # except PermissionError as e:
-        #     raise HTTPException(status_code=500, detail=f"Failed to create upload directory: {e}")
-
         # Generate unique filename
         file_extension = os.path.splitext(str(file.filename))[1]
         unique_filename = f"{uuid.uuid4()}{file_extension}"
@@ -65,7 +52,7 @@ async def upload_document(

         if not success:
             # Clean up file if processing failed
-            if
+            if os.path.exists(file_path):
                 os.remove(file_path)
             raise HTTPException(status_code=400, detail=text_content)

@@ -114,11 +101,8 @@ async def upload_document(
         raise
     except Exception as e:
         # Clean up file if something went wrong
-        if file_path and os.path.exists(file_path):
-            os.remove(file_path)
-            except Exception as cleanup_e:
-                print(f"Error cleaning up file {file_path}: {cleanup_e}")
+        if 'file_path' in locals() and os.path.exists(file_path):
+            os.remove(file_path)
         raise HTTPException(status_code=500, detail=f"Error uploading document: {str(e)}")


@@ -157,7 +141,7 @@ def get_document(document_id: int, db: Session = Depends(get_db)):


 @router.delete("/{document_id}", response_model=DocumentDeleteResponse)
-def delete_document(document_id: int, db: Session = Depends(get_db)
+def delete_document(document_id: int, db: Session = Depends(get_db)):
     """Delete a document and its vector embeddings"""
     try:
         document = db.query(Document).filter(Document.id == document_id).first()
@@ -169,10 +153,7 @@ def delete_document(document_id: int, db: Session = Depends(get_db), vector_stor

         # Delete file from filesystem
         if isinstance(document.file_path, str) and os.path.exists(document.file_path):
-            os.remove(document.file_path)
-            except Exception as e:
-                print(f"Error deleting file {document.file_path}: {e}")
+            os.remove(document.file_path)

         # Delete from database
         db.delete(document)
@@ -189,39 +170,36 @@ def delete_document(document_id: int, db: Session = Depends(get_db), vector_stor


 @router.post("/clear_all")
-async def clear_all_data(db: Session = Depends(get_db)
+async def clear_all_data(db: Session = Depends(get_db)):
     """Admin endpoint to clear all documents, chat messages, uploaded files, and vector store."""
     try:
         # Delete all documents and chat messages from DB
         db.query(Document).delete()
         db.query(ChatMessage).delete()
         db.commit()
-        # Delete all files in uploads directory (only if UPLOAD_DIR is specified and exists)
+        # Delete all files in uploads directory
         upload_dir = settings.UPLOAD_DIR
-        await loop.run_in_executor(None, remove_uploads)
+        loop = asyncio.get_event_loop()
+        def remove_uploads():
+            for filename in os.listdir(upload_dir):
+                file_path = os.path.join(upload_dir, filename)
+                try:
+                    if os.path.isfile(file_path) or os.path.islink(file_path):
+                        os.unlink(file_path)
+                    elif os.path.isdir(file_path):
+                        shutil.rmtree(file_path)
+                except Exception as e:
+                    print(f"Failed to delete {file_path}: {e}")
+        await loop.run_in_executor(None, remove_uploads)
         # Clear ChromaDB vector store using the singleton
-        await
+        await loop.run_in_executor(None, vector_store.clear_all)
         return {"success": True, "message": "All documents, chat messages, uploads, and vectors cleared."}
     except Exception as e:
         return {"success": False, "message": f"Error clearing data: {str(e)}"}


 @router.get("/stats/summary")
-def get_document_stats(db: Session = Depends(get_db)
+def get_document_stats(db: Session = Depends(get_db)):
     """Get document statistics"""
     try:
         total_documents = db.query(Document).count()
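A small observation on the upload cleanup: the new except-branch checks "'file_path' in locals()", while the removed version initialized file_path = None ahead of the try block, which makes the same guard explicit and survives refactoring. A minimal sketch of that pattern; the destination path and the processing step are placeholders, not taken from the diff:

import os

from fastapi import HTTPException, UploadFile


async def save_upload(file: UploadFile) -> str:
    file_path = None  # set up front so the cleanup branch can always test it
    try:
        file_path = os.path.join("uploads", str(file.filename))  # placeholder destination
        with open(file_path, "wb") as out:
            out.write(await file.read())
        # ... hand the saved PDF to the processor here ...
        return file_path
    except Exception as e:
        if file_path and os.path.exists(file_path):
            os.remove(file_path)  # best-effort removal of the partial upload
        raise HTTPException(status_code=500, detail=f"Error uploading document: {str(e)}")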
backend/app/core/config.py
CHANGED
@@ -14,17 +14,17 @@ class Settings(BaseSettings):
     ACCESS_TOKEN_EXPIRE_MINUTES: int = 60 * 24 * 8  # 8 days

     # Database
-    DATABASE_URL: str =
+    DATABASE_URL: str = "sqlite:///./pdf_chatbot.db"

     # Vector Database
-    CHROMA_PERSIST_DIRECTORY:
+    CHROMA_PERSIST_DIRECTORY: str = "chroma_db"

     # AI Providers
     OPENROUTER_API_KEY: Optional[str] = None
     ANTHROPIC_API_KEY: Optional[str] = None

     # File Storage
-    UPLOAD_DIR: str =
+    UPLOAD_DIR: str = "uploads"
     MAX_FILE_SIZE: int = 10 * 1024 * 1024  # 10MB
     ALLOWED_EXTENSIONS: list = [".pdf"]

@@ -42,4 +42,44 @@ class Settings(BaseSettings):


 settings = Settings()
+
+# Ensure upload and persistence directories exist, falling back to system temporary
+for attr, fallback_subdir in [("UPLOAD_DIR", "uploads"), ("CHROMA_PERSIST_DIRECTORY", "chroma_db")]:
+    dir_path = getattr(settings, attr)
+    try:
+        os.makedirs(dir_path, exist_ok=True)
+    except PermissionError:
+        # Fall back to a path inside the system temporary directory where writes are usually allowed
+        temp_dir = os.path.join(tempfile.gettempdir(), fallback_subdir)
+        os.makedirs(temp_dir, exist_ok=True)
+        setattr(settings, attr, temp_dir)
+
+# Additional safety: ensure CHROMA_PERSIST_DIRECTORY is writable (fallback to temp dir if not)
+try:
+    _test_chroma = os.path.join(settings.CHROMA_PERSIST_DIRECTORY, '.write_test')
+    with open(_test_chroma, 'w') as _f:
+        _f.write('')
+    os.remove(_test_chroma)
+except (PermissionError, OSError):
+    temp_chroma = os.path.join(tempfile.gettempdir(), 'chroma_db')
+    os.makedirs(temp_chroma, exist_ok=True)
+    settings.CHROMA_PERSIST_DIRECTORY = temp_chroma
+
+# Ensure SQLite database path is writable (fallback to temp dir if not)
+if settings.DATABASE_URL.startswith("sqlite"):
+    db_uri_prefix = "sqlite:///"
+    db_path = settings.DATABASE_URL[len(db_uri_prefix):]
+    abs_db_path = os.path.abspath(db_path)
+    db_dir = os.path.dirname(abs_db_path)
+    try:
+        os.makedirs(db_dir, exist_ok=True)
+        # Try creating a temporary file to check write access
+        test_path = os.path.join(db_dir, ".write_test")
+        with open(test_path, "w") as _f:
+            _f.write("")
+        os.remove(test_path)
+    except (PermissionError, OSError):
+        # Fallback
+        temp_db_path = os.path.join(tempfile.gettempdir(), "pdf_chatbot.db")
+        settings.DATABASE_URL = f"sqlite:///{temp_db_path}"
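The block added after settings = Settings() repeats the same probe three times: create the directory, write a throwaway marker file, and fall back to the system temp directory on PermissionError/OSError. It also relies on os and tempfile being imported at the top of config.py, which this hunk does not show. A hedged sketch of that probe factored into one helper; ensure_writable_dir is a hypothetical name, not part of the commit:

import os
import tempfile


def ensure_writable_dir(preferred: str, fallback_subdir: str) -> str:
    """Return `preferred` if it can be created and written to, otherwise a temp-dir fallback."""
    try:
        os.makedirs(preferred, exist_ok=True)
        probe = os.path.join(preferred, ".write_test")
        with open(probe, "w"):
            pass
        os.remove(probe)
        return preferred
    except (PermissionError, OSError):
        fallback = os.path.join(tempfile.gettempdir(), fallback_subdir)
        os.makedirs(fallback, exist_ok=True)
        return fallback

Usage would then reduce to, for example, settings.UPLOAD_DIR = ensure_writable_dir(settings.UPLOAD_DIR, "uploads").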
backend/app/services/vector_store.py
CHANGED
@@ -1,17 +1,12 @@
 import chromadb
 from chromadb.config import Settings as ChromaSettings
-from typing import List, Dict, Optional, Tuple
+from typing import List, Dict, Optional, Tuple
 import json
 import logging
 from app.core.config import settings
-from chromadb.types import Metadata
-from chromadb.api import ClientAPI
-from chromadb import PersistentClient  # Import PersistentClient directly

 logger = logging.getLogger(__name__)

-# Define a type for ChromaDB metadata values
-MetadataValue = Union[str, int, float, bool]

 class VectorStore:
     _instance = None
@@ -24,141 +19,122 @@ class VectorStore:

     def __init__(self):
         if not self._initialized:
-            self.client
+            self.client = chromadb.PersistentClient(
+                path=settings.CHROMA_PERSIST_DIRECTORY,
+                settings=ChromaSettings(
+                    anonymized_telemetry=False
+                )
+            )
             self.collection_name = "pdf_documents"
+            self.collection = self._get_or_create_collection()
             self._initialized = True

-    def
-        """
+    def _get_or_create_collection(self):
+        """Get existing collection or create new one"""
+        try:
+            collection = self.client.get_collection(name=self.collection_name)
+            logger.info(f"Using existing collection: {self.collection_name}")
+        except Exception:
+            collection = self.client.create_collection(
+                name=self.collection_name,
+                metadata={"description": "PDF document embeddings for Q&A chatbot"}
+            )
+            logger.info(f"Created new collection: {self.collection_name}")
+
+        return collection

-    def add_document(self, document_id: str, content: str, metadata: Optional[Dict[str, Any]] = None) -> bool:
+    def add_document(self, document_id: str, content: str, metadata: Dict = None) -> bool:
         """Add document content to vector store"""
-        client, collection = self._get_client_and_collection()
         try:
             logger.info(f"Starting to add document {document_id} to vector store")
             logger.info(f"Content length: {len(content)} characters")

             # Split content into chunks for better retrieval
             chunks = self._split_text(content, chunk_size=1000, overlap=200)
             logger.info(f"Split content into {len(chunks)} chunks")

             # Prepare data for ChromaDB
             ids = [f"{document_id}_chunk_{i}" for i in range(len(chunks))]
             documents = chunks
-                **(metadata or {})
-            }
-            # Ensure all metadata values are of supported types by ChromaDB
-            for k, v in chunk_metadata.items():
-                if not isinstance(v, (str, int, float, bool)):
-                    chunk_metadata[k] = str(v)  # Convert unsupported types to string
-            metadatas.append(chunk_metadata)
+            metadatas = [{
+                "document_id": document_id,
+                "chunk_index": i,
+                **(metadata or {})
+            } for i in range(len(chunks))]

             logger.info(f"Prepared {len(ids)} chunks with IDs: {ids[:3]}...")  # Log first 3 IDs

             # Add to collection
             logger.info(f"Adding chunks to ChromaDB collection: {self.collection_name}")
-            collection.add(
+            self.collection.add(
                 ids=ids,
                 documents=documents,
                 metadatas=metadatas
             )

             logger.info(f"Successfully added document {document_id} with {len(chunks)} chunks to vector store")
             return True

         except Exception as e:
             logger.error(f"Error adding document {document_id} to vector store: {e}")
             logger.error(f"Exception type: {type(e).__name__}")
             import traceback
             logger.error(f"Full traceback: {traceback.format_exc()}")
             return False

-    def search_similar(self, query: str, n_results: int = 5, document_id:
+    def search_similar(self, query: str, n_results: int = 5, document_id: str = None) -> List[Dict]:
         """Search for similar documents based on query, optionally filtering by document_id"""
-        client, collection = self._get_client_and_collection()
         try:
-            results = collection.query(
+            results = self.collection.query(
                 query_texts=[query],
                 n_results=n_results,
                 include=["documents", "metadatas", "distances"]
             )

             # Format results
             formatted_results = []
-            if results
-                metadata = {}
-            if document_id is not None and str(metadata.get('document_id')) != str(document_id):
-                continue
-            formatted_results.append({
-                'content': doc,
-                'metadata': metadata,
-                'similarity_score': 1 - distance,  # Convert distance to similarity
-                'rank': i + 1  # Re-introduce rank based on enumerate
-            })
+            if results['documents'] and results['documents'][0]:
+                for i, (doc, metadata, distance) in enumerate(zip(
+                    results['documents'][0],
+                    results['metadatas'][0],
+                    results['distances'][0]
+                )):
+                    if document_id is not None and str(metadata.get('document_id')) != str(document_id):
+                        continue
+                    formatted_results.append({
+                        'content': doc,
+                        'metadata': metadata,
+                        'similarity_score': 1 - distance,  # Convert distance to similarity
+                        'rank': i + 1
+                    })
             return formatted_results
         except Exception as e:
             logger.error(f"Error searching vector store: {e}")
             return []

     def delete_document(self, document_id: str) -> bool:
         """Delete all chunks for a specific document"""
-        client, collection = self._get_client_and_collection()
         try:
             # Get all chunks for this document
-            results = collection.get(
+            results = self.collection.get(
                 where={"document_id": document_id}
             )

             if results['ids']:
-                collection.delete(ids=results['ids'])
+                self.collection.delete(ids=results['ids'])
                 logger.info(f"Deleted {len(results['ids'])} chunks for document {document_id}")

             return True

         except Exception as e:
             logger.error(f"Error deleting document {document_id} from vector store: {e}")
             return False

     def get_collection_stats(self) -> Dict:
         """Get statistics about the vector store collection"""
-        client, collection = self._get_client_and_collection()
         try:
             logger.info(f"Getting stats for collection: {self.collection_name}")
-            count = collection.count()
+            count = self.collection.count()
             logger.info(f"Collection count: {count}")
             return {
                 "total_documents": count,
@@ -170,18 +146,18 @@ class VectorStore:
             import traceback
             logger.error(f"Full traceback: {traceback.format_exc()}")
             return {"total_documents": 0, "collection_name": self.collection_name}

     def _split_text(self, text: str, chunk_size: int = 1000, overlap: int = 200) -> List[str]:
         """Split text into overlapping chunks"""
         if len(text) <= chunk_size:
             return [text]

         chunks = []
         start = 0

         while start < len(text):
             end = start + chunk_size

             # If this isn't the last chunk, try to break at a sentence boundary
             if end < len(text):
                 # Look for sentence endings
@@ -189,36 +165,29 @@ class VectorStore:
                 if text[i] in '.!?':
                     end = i + 1
                     break

             chunk = text[start:end].strip()
             if chunk:
                 chunks.append(chunk)

             # Move start position with overlap
             start = end - overlap
             if start >= len(text):
                 break

         return chunks

     def clear_all(self) -> bool:
         """Clear all documents from the vector store"""
-        client, collection = self._get_client_and_collection()
         try:
-            logger.info("
-            # Re-create the collection after reset
-            self.collection = client.create_collection(
-                name=self.collection_name,
-                metadata={"description": "PDF document embeddings for Q&A chatbot"}
-            )
-            logger.info(f"Re-created collection {self.collection_name} after reset.")
+            self.client.delete_collection(name=self.collection_name)
+            self.collection = self._get_or_create_collection()
+            logger.info("Cleared all documents from vector store")
             return True
         except Exception as e:
-            logger.error(f"Error clearing
+            logger.error(f"Error clearing vector store: {e}")
             return False

     @classmethod
     def reset_instance(cls):
         """Reset the singleton instance - useful after clearing collections"""
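The hunks refer to _instance, _initialized, and a reset_instance() classmethod, but the __new__ override that actually makes VectorStore a singleton sits outside the changed lines. A minimal sketch of the pattern those names imply, written as an assumption since that code is not shown in this diff:

class VectorStoreSingletonSketch:
    _instance = None
    _initialized = False

    def __new__(cls):
        # Hand back the one shared instance; __init__ then guards on _initialized
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self):
        if not self._initialized:
            # ... build the Chroma client and collection here, as in __init__ above ...
            type(self)._initialized = True

    @classmethod
    def reset_instance(cls):
        """Drop the cached instance so the next constructor call rebuilds the client."""
        cls._instance = None
        cls._initialized = False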
backend/main.py
CHANGED
@@ -49,15 +49,9 @@ app.include_router(
 )

 # Serve static frontend (exported Next.js) at root
-frontend_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "
-# Log frontend path and existence
-logging.info(f"Calculated frontend_path: {frontend_path}")
+frontend_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "frontend_out"))
 if os.path.isdir(frontend_path):
-    logging.info(f"Frontend directory exists: {frontend_path}")
     app.mount("/", StaticFiles(directory=frontend_path, html=True), name="frontend")
-else:
-    logging.warning(f"Frontend directory does NOT exist: {frontend_path}. Frontend will not be served.")

 # Health check endpoint
 @app.get("/health")
@@ -84,40 +78,39 @@ def root():
 @app.on_event("startup")
 async def startup_event():
     """Initialize application on startup"""
+    # Create database tables
+    create_tables()
+
+    # Ensure directories exist
+    os.makedirs(settings.UPLOAD_DIR, exist_ok=True)
+    os.makedirs(settings.CHROMA_PERSIST_DIRECTORY, exist_ok=True)
+
     # --- ERASE ALL DOCUMENTS, CHAT MESSAGES, AND VECTORS ON STARTUP ---
-    try:
-        try:
-            os.
-
-    # 1. Delete all rows from documents and chat_messages tables
-    db = SessionLocal()
-    try:
-        db.query(Document).delete()
-        db.query(ChatMessage).delete()
-        db.commit()
-    finally:
-        db.close()
-
-    # 2. Clear ChromaDB vector store using the singleton (now purely in-memory)
-    VectorStore.reset_instance()
-    vector_store_instance = VectorStore()
-    vector_store_instance.clear_all()
-    logging.info("All documents, chat messages, and vector store erased on startup.")
-
-    except Exception as e:
-        logging.error(f"Error during startup data cleanup: {e}")
-
-    # Create database tables (should be done after any potential database file cleanup)
-    create_tables()
-
+    # 1. Delete all rows from documents and chat_messages tables
+    db = SessionLocal()
+    try:
+        db.query(Document).delete()
+        db.query(ChatMessage).delete()
+        db.commit()
+    finally:
+        db.close()
+    # 2. Remove all files in chroma_db directory (but keep the directory)
+    chroma_dir = settings.CHROMA_PERSIST_DIRECTORY
+    for filename in os.listdir(chroma_dir):
+        file_path = os.path.join(chroma_dir, filename)
+        try:
+            if os.path.isfile(file_path) or os.path.islink(file_path):
+                os.unlink(file_path)
+            elif os.path.isdir(file_path):
+                shutil.rmtree(file_path)
+        except Exception as e:
+            logging.warning(f"Failed to delete {file_path}: {e}")
+    # 3. Explicitly clear ChromaDB vector store
+    vector_store = VectorStore()
+    vector_store.clear_all()
+    logging.info("All documents, chat messages, and vector store erased on startup.")
+    # --- END ERASE ---

     logging.info("Application started successfully")

 # Shutdown event
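Two things worth noting about the reworked startup handler: it appears to drop the outer except-branch that the old version had ("Error during startup data cleanup"), so a failure while wiping the chroma_db directory or the database rows would now abort startup rather than being logged; and it relies on shutil, settings, SessionLocal, Document, ChatMessage, and VectorStore being imported at the top of main.py, which these hunks do not show.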
frontend/package.json
CHANGED
@@ -6,7 +6,8 @@
     "dev": "next dev",
     "build": "next build",
     "start": "next start",
-    "lint": "next lint"
+    "lint": "next lint",
+    "export": "next build"
   },
   "dependencies": {
     "next": "14.0.4",
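The new "export" script maps to "next build" rather than "next export", presumably because Next.js 14 removed the standalone next export command; with output: 'export' set in next.config.js (not shown in this commit), next build itself writes the static site to frontend/out, which is the directory the Dockerfile copies into the back-end image.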