Spaces PR #1: upload cac thu
Opened by Zok213

Files changed:
- .gitignore +1 -3
- README.md +1 -142
- User_test_bot +1 -0
- app.py +9 -73
- app/__init__.py +13 -8
- app/api/pdf_routes.py +143 -792
- app/api/pdf_websocket.py +4 -182
- app/api/postgresql_routes.py +8 -421
- app/api/rag_routes.py +12 -518
- app/database/models.py +0 -3
- app/database/postgresql.py +4 -7
- app/models/pdf_models.py +12 -20
- app/models/rag_models.py +2 -58
- app/utils/cache_config.py +0 -45
- app/utils/pdf_processor.py +218 -414
- app/utils/pinecone_fix.py +0 -194
- docs/api_documentation.md +581 -0
- pytest.ini +12 -0
- requirements.txt +0 -2
.gitignore
CHANGED
@@ -59,6 +59,7 @@ out/
 tests/
 
 Admin_bot/
+User_test_bot/
 Pix-Agent/
 
 # Hugging Face Spaces
@@ -80,6 +81,3 @@ Thumbs.db
 main.py
 
 test/
-
-/tmp
-/docs/
README.md
CHANGED
@@ -416,145 +416,4 @@ Lịch sử hội thoại người dùng được lưu trong queue riêng với
 
 ## Tác giả
 
-- **PIX Project Team**
-
-# PixAgent PDF Processing
-
-This README provides instructions for the PDF processing functionality in PixAgent, including uploading PDF documents, managing vector embeddings, and deleting documents.
-
-## API Endpoints
-
-### Health Check
-
-```
-GET /health
-GET /pdf/health
-```
-
-Verify the API is running and the connection to databases (MongoDB, PostgreSQL, Pinecone) is established.
-
-### Upload PDF
-
-```
-POST /pdf/upload
-```
-
-**Parameters:**
-- `file`: The PDF file to upload (multipart/form-data)
-- `namespace`: The namespace to store vectors in (default: "Default")
-- `mock_mode`: Set to "true" or "false" (default: "false")
-- `vector_database_id`: The ID of the vector database to use (required for real mode)
-- `document_id`: Optional custom document ID (if not provided, a UUID will be generated)
-
-**Example Python Request:**
-```python
-import requests
-import uuid
-
-document_id = str(uuid.uuid4())
-files = {'file': open('your_document.pdf', 'rb')}
-response = requests.post(
-    'http://localhost:8000/pdf/upload',
-    files=files,
-    data={
-        'namespace': 'my-namespace',
-        'mock_mode': 'false',
-        'vector_database_id': '9',
-        'document_id': document_id
-    }
-)
-print(f'Status: {response.status_code}')
-print(f'Response: {response.json()}')
-```
-
-### List Documents
-
-```
-GET /pdf/documents
-```
-
-**Parameters:**
-- `namespace`: The namespace to retrieve documents from
-- `vector_database_id`: The ID of the vector database to use
-
-**Example Python Request:**
-```python
-import requests
-
-response = requests.get(
-    'http://localhost:8000/pdf/documents',
-    params={
-        'namespace': 'my-namespace',
-        'vector_database_id': '9'
-    }
-)
-print(f'Status: {response.status_code}')
-print(f'Documents: {response.json()}')
-```
-
-### Delete Document
-
-```
-DELETE /pdf/document
-```
-
-**Parameters:**
-- `document_id`: The ID of the document to delete
-- `namespace`: The namespace containing the document
-- `vector_database_id`: The ID of the vector database
-
-**Example Python Request:**
-```python
-import requests
-
-response = requests.delete(
-    'http://localhost:8000/pdf/document',
-    params={
-        'document_id': 'your-document-id',
-        'namespace': 'my-namespace',
-        'vector_database_id': '9'
-    }
-)
-print(f'Status: {response.status_code}')
-print(f'Result: {response.json()}')
-```
-
-### List Available Vector Databases
-
-```
-GET /postgres/vector-databases
-```
-
-**Example Python Request:**
-```python
-import requests
-
-response = requests.get('http://localhost:8000/postgres/vector-databases')
-vector_dbs = response.json()
-print(f'Available vector databases: {vector_dbs}')
-```
-
-## PDF Processing and Vector Embedding
-
-The system processes PDFs in the following steps:
-
-1. **Text Extraction**: Uses `PyPDFLoader` from LangChain to extract text from the PDF.
-2. **Text Chunking**: Splits the text into manageable chunks using `RecursiveCharacterTextSplitter` with a chunk size of 1000 characters and 100 character overlap.
-3. **Embedding Creation**: Uses Google's Gemini embedding model (`models/embedding-001`) to create embeddings for each text chunk.
-4. **Dimension Adjustment**: Ensures the embedding dimensions match the Pinecone index requirements:
-   - If Gemini produces 768-dim embeddings and Pinecone expects 1536-dim, each value is duplicated.
-   - For other mismatches, appropriate padding or truncation is applied.
-5. **Vector Storage**: Uploads the embeddings to Pinecone in the specified namespace.
-
-## Notes
-
-- **Mock Mode**: When `mock_mode` is set to "true", the system simulates the PDF processing without actually creating or storing embeddings.
-- **Namespace Handling**: When using a vector database ID, the namespace is automatically formatted as `vdb-{vector_database_id}`.
-- **Error Handling**: The system validates vector dimensions and handles errors appropriately, with detailed logging.
-- **PDF Storage**: Processed PDFs are stored in the `pdf_storage` directory with the document ID as the filename.
-
-## Troubleshooting
-
-- **Dimension Mismatch Error**: If you receive an error about vector dimensions not matching Pinecone index configuration, check that the embedding model and Pinecone index dimensions are compatible. The system will attempt to adjust dimensions but may encounter limits.
-- **Connection Issues**: Verify that MongoDB, PostgreSQL, and Pinecone credentials are correctly configured in the environment variables.
-- **Processing Failures**: Check the `pdf_api_debug.log` file for detailed error messages and processing information.
+- **PIX Project Team**
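For reference, the extraction and chunking steps the removed README describes (steps 1-2) translate almost one-to-one into LangChain calls. A minimal sketch with the stated parameters (1000-character chunks, 100-character overlap); import paths vary across LangChain versions (older releases expose the same classes under `langchain.document_loaders` and `langchain.text_splitter`), and the file name is illustrative:

```python
# Sketch of README steps 1-2 (text extraction and chunking), assuming the
# langchain-community and langchain-text-splitters packages are installed.
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

pages = PyPDFLoader("your_document.pdf").load()  # one Document per PDF page
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = splitter.split_documents(pages)
print(f"{len(chunks)} chunks ready for embedding")
```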
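The dimension-adjustment rule in step 4 of the removed README is simple enough to state as code. The helper below is hypothetical, written only to illustrate the described behavior, and is not a function from the PixAgent codebase:

```python
def adjust_dimension(embedding: list[float], target_dim: int) -> list[float]:
    """Hypothetical helper: match a vector to the Pinecone index dimension."""
    source_dim = len(embedding)
    if source_dim == target_dim:
        return embedding
    if target_dim == 2 * source_dim:
        # 768-dim Gemini vector into a 1536-dim index: duplicate each value
        return [v for v in embedding for _ in range(2)]
    if source_dim < target_dim:
        # Other mismatches: pad the tail with zeros...
        return embedding + [0.0] * (target_dim - source_dim)
    # ...or truncate anything longer than the target
    return embedding[:target_dim]
```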
User_test_bot
ADDED
@@ -0,0 +1 @@
+Subproject commit a29ee5023e8346d48cae7104bf516b78f791db07
app.py
CHANGED
@@ -6,11 +6,6 @@ import os
 import sys
 import logging
 from dotenv import load_dotenv
-from fastapi.responses import JSONResponse, PlainTextResponse
-from fastapi.staticfiles import StaticFiles
-import time
-import uuid
-import traceback
 
 # Cấu hình logging
 logging.basicConfig(
@@ -86,9 +81,8 @@ try:
     from app.api.mongodb_routes import router as mongodb_router
     from app.api.postgresql_routes import router as postgresql_router
     from app.api.rag_routes import router as rag_router
-    from app.api.websocket_routes import router as websocket_router
     from app.api.pdf_routes import router as pdf_router
-    from app.api.pdf_websocket import router as pdf_websocket_router
+    from app.api.websocket_routes import router as websocket_router
 
     # Import middlewares
     from app.utils.middleware import RequestLoggingMiddleware, ErrorHandlingMiddleware, DatabaseCheckMiddleware
@@ -99,8 +93,6 @@ try:
     # Import cache
     from app.utils.cache import get_cache
 
-    logger.info("Successfully imported all routers and modules")
-
 except ImportError as e:
     logger.error(f"Error importing routes or middlewares: {e}")
     raise
@@ -137,14 +129,6 @@ app.include_router(postgresql_router)
 app.include_router(rag_router)
 app.include_router(websocket_router)
 app.include_router(pdf_router)
-app.include_router(pdf_websocket_router)
-
-# Log all registered routes
-logger.info("Registered API routes:")
-for route in app.routes:
-    if hasattr(route, "path") and hasattr(route, "methods"):
-        methods = ",".join(route.methods)
-        logger.info(f"  {methods:<10} {route.path}")
 
 # Root endpoint
 @app.get("/")
@@ -217,71 +201,23 @@ if DEBUG:
     @app.get("/debug/errors")
     def debug_errors(limit: int = 10):
         """Hiển thị các lỗi gần đây (chỉ trong chế độ debug)"""
-        return error_tracker.
+        return error_tracker.get_recent_errors(limit)
 
     @app.get("/debug/performance")
     def debug_performance():
-        """Hiển thị
-        return performance_monitor.
+        """Hiển thị thống kê hiệu suất (chỉ trong chế độ debug)"""
+        return performance_monitor.get_stats()
 
     @app.get("/debug/full")
     def debug_full_report(request: Request):
-        """Hiển thị báo cáo
+        """Hiển thị báo cáo đầy đủ về hệ thống (chỉ trong chế độ debug)"""
         return debug_view(request)
 
     @app.get("/debug/cache")
     def debug_cache():
-        """Hiển thị
-
-        cache_stats = cache.stats()
-
-        # Thêm thông tin chi tiết về các key trong cache
-        cache_keys = list(cache.cache.keys())
-        history_users = list(cache.user_history_queues.keys())
-
-        return {
-            "stats": cache_stats,
-            "keys": cache_keys,
-            "history_users": history_users,
-            "config": {
-                "ttl": cache.ttl,
-                "cleanup_interval": cache.cleanup_interval,
-                "max_size": cache.max_size,
-                "history_queue_size": os.getenv("HISTORY_QUEUE_SIZE", "10"),
-                "history_cache_ttl": os.getenv("HISTORY_CACHE_TTL", "3600"),
-            }
-        }
-
-    @app.get("/debug/websocket-routes")
-    def debug_websocket_routes():
-        """Hiển thị thông tin về các WebSocket route (chỉ trong chế độ debug)"""
-        ws_routes = []
-        for route in app.routes:
-            if "websocket" in str(route.__class__).lower():
-                ws_routes.append({
-                    "path": route.path,
-                    "name": route.name,
-                    "endpoint": str(route.endpoint)
-                })
-        return {
-            "websocket_routes": ws_routes,
-            "total_count": len(ws_routes)
-        }
-
-    @app.get("/debug/mock-status")
-    def debug_mock_status():
-        """Display current mock mode settings"""
-        # Import was: from app.api.pdf_routes import USE_MOCK_MODE
-        # We've disabled mock mode
-
-        return {
-            "mock_mode": False,  # Disabled - using real database
-            "mock_env_variable": os.getenv("USE_MOCK_MODE", "false"),
-            "debug_mode": DEBUG
-        }
-
+        """Hiển thị thống kê về cache (chỉ trong chế độ debug)"""
+        return get_cache().stats()
 
-# Run the app with uvicorn when executed directly
 if __name__ == "__main__":
-
-    uvicorn.run("app:app", host="0.0.0.0", port=
+    PORT = int(os.getenv("PORT", "7860"))
+    uvicorn.run("app:app", host="0.0.0.0", port=PORT, reload=DEBUG)
app/__init__.py
CHANGED
@@ -10,11 +10,16 @@ import os
 # Thêm thư mục gốc vào sys.path
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
-
-import
-
-
-
-
-
+try:
+    # Sửa lại cách import đúng - 'app.py' không phải là module hợp lệ
+    # 'app' là tên module, '.py' là phần mở rộng tệp
+    from app import app
+except ImportError:
+    # Thử cách khác nếu import trực tiếp không hoạt động
+    import importlib.util
+    spec = importlib.util.spec_from_file_location("app_module",
+        os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
+        "app.py"))
+    app_module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(app_module)
+    app = app_module.app
app/api/pdf_routes.py
CHANGED
@@ -1,23 +1,16 @@
|
|
1 |
import os
|
2 |
import shutil
|
3 |
import uuid
|
4 |
-
import
|
5 |
-
import traceback
|
6 |
-
from fastapi import APIRouter, UploadFile, File, Form, HTTPException, BackgroundTasks, Depends, Query
|
7 |
from fastapi.responses import JSONResponse
|
8 |
from typing import Optional, List, Dict, Any
|
9 |
from sqlalchemy.orm import Session
|
10 |
-
import os.path
|
11 |
-
import logging
|
12 |
-
import tempfile
|
13 |
-
import time
|
14 |
-
import json
|
15 |
-
from datetime import datetime
|
16 |
|
17 |
from app.utils.pdf_processor import PDFProcessor
|
18 |
from app.models.pdf_models import PDFResponse, DeleteDocumentRequest, DocumentsListResponse
|
19 |
from app.database.postgresql import get_db
|
20 |
-
from app.database.models import VectorDatabase, Document, VectorStatus,
|
|
|
21 |
from app.api.pdf_websocket import (
|
22 |
send_pdf_upload_started,
|
23 |
send_pdf_upload_progress,
|
@@ -28,177 +21,21 @@ from app.api.pdf_websocket import (
|
|
28 |
send_pdf_delete_failed
|
29 |
)
|
30 |
|
31 |
-
#
|
32 |
-
logger = logging.getLogger(__name__)
|
33 |
-
|
34 |
-
# Add a stream handler for PDF debug logging
|
35 |
-
pdf_debug_logger = logging.getLogger("pdf_debug_api")
|
36 |
-
pdf_debug_logger.setLevel(logging.DEBUG)
|
37 |
-
|
38 |
-
# Check if a stream handler already exists, add one if not
|
39 |
-
if not any(isinstance(h, logging.StreamHandler) for h in pdf_debug_logger.handlers):
|
40 |
-
stream_handler = logging.StreamHandler(sys.stdout)
|
41 |
-
stream_handler.setLevel(logging.INFO)
|
42 |
-
pdf_debug_logger.addHandler(stream_handler)
|
43 |
-
|
44 |
-
# Initialize router
|
45 |
router = APIRouter(
|
46 |
prefix="/pdf",
|
47 |
tags=["PDF Processing"],
|
48 |
)
|
49 |
|
50 |
-
#
|
51 |
-
TEMP_UPLOAD_DIR =
|
52 |
-
STORAGE_DIR =
|
53 |
-
|
54 |
-
USE_MOCK_MODE = False # Disabled - using real database with improved connection handling
|
55 |
-
logger.info(f"PDF API starting with USE_MOCK_MODE={USE_MOCK_MODE}")
|
56 |
-
|
57 |
-
# Helper function to log with timestamp
|
58 |
-
def log_with_timestamp(message: str, level: str = "info", error: Exception = None):
|
59 |
-
"""Add timestamps to log messages and log to the PDF debug logger if available"""
|
60 |
-
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
61 |
-
full_message = f"{timestamp} - {message}"
|
62 |
-
|
63 |
-
if level.lower() == "debug":
|
64 |
-
logger.debug(full_message)
|
65 |
-
pdf_debug_logger.debug(full_message)
|
66 |
-
elif level.lower() == "info":
|
67 |
-
logger.info(full_message)
|
68 |
-
pdf_debug_logger.info(full_message)
|
69 |
-
elif level.lower() == "warning":
|
70 |
-
logger.warning(full_message)
|
71 |
-
pdf_debug_logger.warning(full_message)
|
72 |
-
elif level.lower() == "error":
|
73 |
-
logger.error(full_message)
|
74 |
-
pdf_debug_logger.error(full_message)
|
75 |
-
if error:
|
76 |
-
logger.error(traceback.format_exc())
|
77 |
-
pdf_debug_logger.error(traceback.format_exc())
|
78 |
-
else:
|
79 |
-
logger.info(full_message)
|
80 |
-
pdf_debug_logger.info(full_message)
|
81 |
|
82 |
-
#
|
83 |
-
|
84 |
-
|
85 |
-
pdf_debug_logger.debug(f"[{correlation_id}] {message}")
|
86 |
-
if error:
|
87 |
-
pdf_debug_logger.error(f"[{correlation_id}] Error: {str(error)}")
|
88 |
-
pdf_debug_logger.error(traceback.format_exc())
|
89 |
|
90 |
-
#
|
91 |
-
async def send_progress_update(user_id, file_id, step, progress=0.0, message=""):
|
92 |
-
"""Send PDF processing progress updates via WebSocket"""
|
93 |
-
try:
|
94 |
-
await send_pdf_upload_progress(user_id, file_id, step, progress, message)
|
95 |
-
except Exception as e:
|
96 |
-
logger.error(f"Error sending progress update: {e}")
|
97 |
-
logger.error(traceback.format_exc())
|
98 |
-
|
99 |
-
# Function with fixed indentation for the troublesome parts
|
100 |
-
async def handle_pdf_processing_result(result, correlation_id, user_id, file_id, filename, document, vector_status,
|
101 |
-
vector_database_id, temp_file_path, db, is_pdf):
|
102 |
-
"""Process the result of PDF processing and update database records"""
|
103 |
-
# If successful, move file to permanent storage
|
104 |
-
if result.get('success'):
|
105 |
-
try:
|
106 |
-
storage_path = os.path.join(STORAGE_DIR, f"{file_id}{'.pdf' if is_pdf else '.txt'}")
|
107 |
-
shutil.move(temp_file_path, storage_path)
|
108 |
-
log_upload_debug(correlation_id, f"Moved file to storage at {storage_path}")
|
109 |
-
except Exception as move_error:
|
110 |
-
log_upload_debug(correlation_id, f"Error moving file to storage: {move_error}", move_error)
|
111 |
-
|
112 |
-
# Update status in PostgreSQL
|
113 |
-
if vector_database_id and document and vector_status:
|
114 |
-
try:
|
115 |
-
log_upload_debug(correlation_id, f"Updating vector status to 'completed' for document ID {document.id}")
|
116 |
-
|
117 |
-
# Update the vector status with the result document_id (important for later deletion)
|
118 |
-
result_document_id = result.get('document_id')
|
119 |
-
|
120 |
-
vector_status.status = "completed"
|
121 |
-
vector_status.embedded_at = datetime.now()
|
122 |
-
|
123 |
-
# Critical: Store the correct vector ID for future deletion
|
124 |
-
# This can be either the original file_id or the result_document_id
|
125 |
-
if result_document_id and result_document_id != file_id:
|
126 |
-
# If Pinecone returned a specific document_id, use that
|
127 |
-
vector_status.vector_id = result_document_id
|
128 |
-
log_upload_debug(correlation_id, f"Updated vector_id to {result_document_id} (from result)")
|
129 |
-
elif file_id:
|
130 |
-
# Make sure file_id is stored as the vector_id
|
131 |
-
vector_status.vector_id = file_id
|
132 |
-
log_upload_debug(correlation_id, f"Updated vector_id to {file_id} (from file_id)")
|
133 |
-
|
134 |
-
# Also ensure we store some backup identifiers in case the primary one fails
|
135 |
-
# Store the document name as a secondary identifier
|
136 |
-
vector_status.document_name = document.name
|
137 |
-
log_upload_debug(correlation_id, f"Stored document_name '{document.name}' in vector status for backup")
|
138 |
-
|
139 |
-
# Mark document as embedded
|
140 |
-
document.is_embedded = True
|
141 |
-
|
142 |
-
db.commit()
|
143 |
-
log_upload_debug(correlation_id, f"Database status updated successfully")
|
144 |
-
except Exception as db_error:
|
145 |
-
log_upload_debug(correlation_id, f"Error updating database status: {db_error}", db_error)
|
146 |
-
|
147 |
-
# Send completion notification via WebSocket
|
148 |
-
if user_id:
|
149 |
-
try:
|
150 |
-
await send_pdf_upload_completed(
|
151 |
-
user_id,
|
152 |
-
file_id,
|
153 |
-
filename,
|
154 |
-
result.get('chunks_processed', 0)
|
155 |
-
)
|
156 |
-
log_upload_debug(correlation_id, f"Sent upload completed notification to user {user_id}")
|
157 |
-
except Exception as ws_error:
|
158 |
-
log_upload_debug(correlation_id, f"Error sending WebSocket notification: {ws_error}", ws_error)
|
159 |
-
|
160 |
-
# Add document information to the result
|
161 |
-
if document:
|
162 |
-
result["document_database_id"] = document.id
|
163 |
-
else:
|
164 |
-
log_upload_debug(correlation_id, f"PDF processing failed: {result.get('error', 'Unknown error')}")
|
165 |
-
|
166 |
-
# Update error status in PostgreSQL
|
167 |
-
if vector_database_id and document and vector_status:
|
168 |
-
try:
|
169 |
-
log_upload_debug(correlation_id, f"Updating vector status to 'failed' for document ID {document.id}")
|
170 |
-
vector_status.status = "failed"
|
171 |
-
vector_status.error_message = result.get('error', 'Unknown error')
|
172 |
-
db.commit()
|
173 |
-
log_upload_debug(correlation_id, f"Database status updated for failure")
|
174 |
-
except Exception as db_error:
|
175 |
-
log_upload_debug(correlation_id, f"Error updating database status for failure: {db_error}", db_error)
|
176 |
-
|
177 |
-
# Send failure notification via WebSocket
|
178 |
-
if user_id:
|
179 |
-
try:
|
180 |
-
await send_pdf_upload_failed(
|
181 |
-
user_id,
|
182 |
-
file_id,
|
183 |
-
filename,
|
184 |
-
result.get('error', 'Unknown error')
|
185 |
-
)
|
186 |
-
log_upload_debug(correlation_id, f"Sent upload failed notification to user {user_id}")
|
187 |
-
except Exception as ws_error:
|
188 |
-
log_upload_debug(correlation_id, f"Error sending WebSocket notification: {ws_error}", ws_error)
|
189 |
-
|
190 |
-
# Cleanup: delete temporary file if it still exists
|
191 |
-
if temp_file_path and os.path.exists(temp_file_path):
|
192 |
-
try:
|
193 |
-
os.remove(temp_file_path)
|
194 |
-
log_upload_debug(correlation_id, f"Removed temporary file {temp_file_path}")
|
195 |
-
except Exception as cleanup_error:
|
196 |
-
log_upload_debug(correlation_id, f"Error removing temporary file: {cleanup_error}", cleanup_error)
|
197 |
-
|
198 |
-
log_upload_debug(correlation_id, f"Upload request completed with success={result.get('success', False)}")
|
199 |
-
return result
|
200 |
-
|
201 |
-
# Endpoint for uploading and processing PDFs
|
202 |
@router.post("/upload", response_model=PDFResponse)
|
203 |
async def upload_pdf(
|
204 |
file: UploadFile = File(...),
|
@@ -208,398 +45,229 @@ async def upload_pdf(
|
|
208 |
description: Optional[str] = Form(None),
|
209 |
user_id: Optional[str] = Form(None),
|
210 |
vector_database_id: Optional[int] = Form(None),
|
211 |
-
content_type: Optional[str] = Form(None), # Add content_type parameter
|
212 |
background_tasks: BackgroundTasks = None,
|
213 |
db: Session = Depends(get_db)
|
214 |
):
|
215 |
"""
|
216 |
-
Upload
|
217 |
-
|
218 |
-
- **file**: PDF file to process
|
219 |
-
- **namespace**: Namespace in Pinecone to store embeddings (default: "Default")
|
220 |
-
- **index_name**: Name of Pinecone index (default: "testbot768")
|
221 |
-
- **title**: Document title (optional)
|
222 |
-
- **description**: Document description (optional)
|
223 |
-
- **user_id**: User ID for WebSocket status updates
|
224 |
-
- **vector_database_id**: ID of vector database in PostgreSQL (optional)
|
225 |
-
- **content_type**: Content type of the file (optional)
|
226 |
|
227 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
228 |
"""
|
229 |
-
# Generate request ID for tracking
|
230 |
-
correlation_id = str(uuid.uuid4())[:8]
|
231 |
-
logger.info(f"[{correlation_id}] PDF upload request received: ns={namespace}, index={index_name}, user={user_id}")
|
232 |
-
log_upload_debug(correlation_id, f"Upload request: vector_db_id={vector_database_id}")
|
233 |
-
|
234 |
-
# Variables that might need cleanup in case of error
|
235 |
-
temp_file_path = None
|
236 |
-
document = None
|
237 |
-
vector_status = None
|
238 |
-
|
239 |
try:
|
240 |
-
#
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
-
log_upload_debug(correlation_id, f"File type check: is_pdf={is_pdf}, is_text={is_text}, filename={file.filename}")
|
245 |
|
246 |
-
|
247 |
-
log_upload_debug(correlation_id, f"Rejecting non-PDF file: {file.filename}")
|
248 |
-
raise HTTPException(status_code=400, detail="Only PDF files are accepted")
|
249 |
-
|
250 |
-
# If vector_database_id provided, get info from PostgreSQL
|
251 |
api_key = None
|
252 |
vector_db = None
|
253 |
|
254 |
if vector_database_id:
|
255 |
-
log_upload_debug(correlation_id, f"Looking up vector database ID {vector_database_id}")
|
256 |
-
|
257 |
vector_db = db.query(VectorDatabase).filter(
|
258 |
VectorDatabase.id == vector_database_id,
|
259 |
VectorDatabase.status == "active"
|
260 |
).first()
|
261 |
|
262 |
if not vector_db:
|
263 |
-
|
264 |
-
raise HTTPException(status_code=404, detail="Vector database not found or inactive")
|
265 |
-
|
266 |
-
log_upload_debug(correlation_id, f"Found vector database: id={vector_db.id}, name={vector_db.name}, index={vector_db.pinecone_index}")
|
267 |
-
|
268 |
-
# Use vector database information
|
269 |
-
# Try to get API key from relationship
|
270 |
-
log_upload_debug(correlation_id, f"Trying to get API key for vector database {vector_database_id}")
|
271 |
-
|
272 |
-
# Log available attributes
|
273 |
-
vector_db_attrs = dir(vector_db)
|
274 |
-
log_upload_debug(correlation_id, f"Vector DB attributes: {vector_db_attrs}")
|
275 |
-
|
276 |
-
if hasattr(vector_db, 'api_key_ref') and vector_db.api_key_ref:
|
277 |
-
log_upload_debug(correlation_id, f"Using API key from relationship for vector database ID {vector_database_id}")
|
278 |
-
log_upload_debug(correlation_id, f"api_key_ref type: {type(vector_db.api_key_ref)}")
|
279 |
-
log_upload_debug(correlation_id, f"api_key_ref attributes: {dir(vector_db.api_key_ref)}")
|
280 |
-
|
281 |
-
if hasattr(vector_db.api_key_ref, 'key_value'):
|
282 |
-
api_key = vector_db.api_key_ref.key_value
|
283 |
-
# Log first few chars of API key for debugging
|
284 |
-
key_prefix = api_key[:4] + "..." if api_key and len(api_key) > 4 else "invalid/empty"
|
285 |
-
log_upload_debug(correlation_id, f"API key retrieved: {key_prefix}, length: {len(api_key) if api_key else 0}")
|
286 |
-
logger.info(f"[{correlation_id}] Using API key from relationship for vector database ID {vector_database_id}")
|
287 |
-
else:
|
288 |
-
log_upload_debug(correlation_id, f"api_key_ref does not have key_value attribute")
|
289 |
-
elif hasattr(vector_db, 'api_key') and vector_db.api_key:
|
290 |
-
# Fallback to direct api_key if needed (deprecated)
|
291 |
-
api_key = vector_db.api_key
|
292 |
-
key_prefix = api_key[:4] + "..." if api_key and len(api_key) > 4 else "invalid/empty"
|
293 |
-
log_upload_debug(correlation_id, f"Using deprecated direct api_key: {key_prefix}")
|
294 |
-
logger.warning(f"[{correlation_id}] Using deprecated direct api_key for vector database ID {vector_database_id}")
|
295 |
-
else:
|
296 |
-
log_upload_debug(correlation_id, "No API key found in vector database")
|
297 |
|
298 |
-
#
|
|
|
299 |
index_name = vector_db.pinecone_index
|
300 |
-
log_upload_debug(correlation_id, f"Using index name '{index_name}' from vector database")
|
301 |
-
logger.info(f"[{correlation_id}] Using index name '{index_name}' from vector database")
|
302 |
|
303 |
-
#
|
304 |
file_id = str(uuid.uuid4())
|
305 |
-
temp_file_path = os.path.join(TEMP_UPLOAD_DIR, f"{file_id}
|
306 |
-
log_upload_debug(correlation_id, f"Generated file_id: {file_id}, temp path: {temp_file_path}")
|
307 |
|
308 |
-
#
|
309 |
if user_id:
|
310 |
-
|
311 |
-
await send_pdf_upload_started(user_id, file.filename, file_id)
|
312 |
-
log_upload_debug(correlation_id, f"Sent upload started notification to user {user_id}")
|
313 |
-
except Exception as ws_error:
|
314 |
-
log_upload_debug(correlation_id, f"Error sending WebSocket notification: {ws_error}", ws_error)
|
315 |
|
316 |
-
#
|
317 |
-
log_upload_debug(correlation_id, f"Reading file content")
|
318 |
file_content = await file.read()
|
319 |
-
log_upload_debug(correlation_id, f"File size: {len(file_content)} bytes")
|
320 |
-
|
321 |
with open(temp_file_path, "wb") as buffer:
|
322 |
buffer.write(file_content)
|
323 |
-
log_upload_debug(correlation_id, f"File saved to {temp_file_path}")
|
324 |
|
325 |
-
#
|
326 |
metadata = {
|
327 |
"filename": file.filename,
|
328 |
"content_type": file.content_type
|
329 |
}
|
330 |
|
331 |
-
|
332 |
-
|
333 |
-
log_upload_debug(correlation_id, f"Using content_type: {actual_content_type}")
|
334 |
-
|
335 |
-
if not actual_content_type:
|
336 |
-
# Fallback content type based on file extension
|
337 |
-
if is_pdf:
|
338 |
-
actual_content_type = "application/pdf"
|
339 |
-
elif is_text:
|
340 |
-
actual_content_type = "text/plain"
|
341 |
-
else:
|
342 |
-
actual_content_type = "application/octet-stream"
|
343 |
-
|
344 |
-
log_upload_debug(correlation_id, f"No content_type provided, using fallback: {actual_content_type}")
|
345 |
-
|
346 |
-
metadata["content_type"] = actual_content_type
|
347 |
-
|
348 |
-
# Use provided title or filename as document name
|
349 |
-
document_name = title or file.filename
|
350 |
-
|
351 |
-
# Verify document name is unique within this vector database
|
352 |
-
if vector_database_id:
|
353 |
-
# Check if a document with this name already exists in this vector database
|
354 |
-
existing_doc = db.query(Document).filter(
|
355 |
-
Document.name == document_name,
|
356 |
-
Document.vector_database_id == vector_database_id
|
357 |
-
).first()
|
358 |
-
|
359 |
-
if existing_doc:
|
360 |
-
# Make the name unique by appending timestamp
|
361 |
-
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
362 |
-
base_name, extension = os.path.splitext(document_name)
|
363 |
-
document_name = f"{base_name}_{timestamp}{extension}"
|
364 |
-
log_upload_debug(correlation_id, f"Document name already exists, using unique name: {document_name}")
|
365 |
-
|
366 |
-
metadata["title"] = document_name
|
367 |
-
|
368 |
if description:
|
369 |
metadata["description"] = description
|
370 |
|
371 |
-
#
|
372 |
if user_id:
|
373 |
-
|
374 |
-
await send_progress_update(
|
375 |
user_id,
|
376 |
file_id,
|
377 |
"file_preparation",
|
378 |
0.2,
|
379 |
"File saved, preparing for processing"
|
380 |
)
|
381 |
-
log_upload_debug(correlation_id, f"Sent file preparation progress to user {user_id}")
|
382 |
-
except Exception as ws_error:
|
383 |
-
log_upload_debug(correlation_id, f"Error sending progress update: {ws_error}", ws_error)
|
384 |
-
|
385 |
-
# Create document record - do this regardless of mock mode
|
386 |
-
document = None
|
387 |
-
vector_status = None
|
388 |
|
|
|
389 |
if vector_database_id and vector_db:
|
390 |
-
log_upload_debug(correlation_id, f"Creating PostgreSQL records for document with vector_database_id={vector_database_id}")
|
391 |
-
|
392 |
# Create document record without file content
|
393 |
-
|
394 |
-
|
395 |
-
|
396 |
-
|
397 |
-
|
398 |
-
|
399 |
-
|
400 |
-
|
401 |
-
|
402 |
-
|
403 |
-
|
404 |
-
db.refresh(document)
|
405 |
-
log_upload_debug(correlation_id, f"Created document record: id={document.id}")
|
406 |
-
except Exception as doc_error:
|
407 |
-
log_upload_debug(correlation_id, f"Error creating document record: {doc_error}", doc_error)
|
408 |
-
raise
|
409 |
|
410 |
# Create document content record to store binary data separately
|
411 |
-
|
412 |
-
|
413 |
-
|
414 |
-
|
415 |
-
|
416 |
-
|
417 |
-
|
418 |
-
|
419 |
-
|
420 |
-
|
421 |
-
|
422 |
-
|
423 |
-
|
424 |
-
|
425 |
-
|
426 |
-
document_id=document.id,
|
427 |
-
vector_database_id=vector_database_id,
|
428 |
-
status="pending",
|
429 |
-
vector_id=file_id # Store the document UUID as vector_id for later deletion
|
430 |
-
)
|
431 |
-
db.add(vector_status)
|
432 |
-
db.commit()
|
433 |
-
log_upload_debug(correlation_id, f"Created vector status record for document ID {document.id} with vector_id={file_id}")
|
434 |
-
except Exception as status_error:
|
435 |
-
log_upload_debug(correlation_id, f"Error creating vector status: {status_error}", status_error)
|
436 |
-
raise
|
437 |
-
|
438 |
-
logger.info(f"[{correlation_id}] Created document ID {document.id} and vector status in PostgreSQL")
|
439 |
|
440 |
-
#
|
441 |
-
|
442 |
-
processor = PDFProcessor(
|
443 |
-
index_name=index_name,
|
444 |
-
namespace=namespace,
|
445 |
-
api_key=api_key,
|
446 |
-
vector_db_id=vector_database_id,
|
447 |
-
correlation_id=correlation_id
|
448 |
-
)
|
449 |
|
450 |
-
#
|
451 |
if user_id:
|
452 |
-
|
453 |
-
await send_progress_update(
|
454 |
user_id,
|
455 |
file_id,
|
456 |
"embedding_start",
|
457 |
0.4,
|
458 |
"Starting to process PDF and create embeddings"
|
459 |
)
|
460 |
-
log_upload_debug(correlation_id, f"Sent embedding start notification to user {user_id}")
|
461 |
-
except Exception as ws_error:
|
462 |
-
log_upload_debug(correlation_id, f"Error sending WebSocket notification: {ws_error}", ws_error)
|
463 |
|
464 |
-
#
|
465 |
-
|
|
|
|
|
|
|
|
|
|
|
466 |
result = await processor.process_pdf(
|
467 |
file_path=temp_file_path,
|
468 |
-
document_id=file_id,
|
469 |
metadata=metadata,
|
470 |
-
progress_callback=
|
471 |
)
|
472 |
|
473 |
-
|
474 |
-
|
475 |
-
|
476 |
-
|
477 |
-
|
478 |
-
|
479 |
-
|
480 |
-
|
481 |
-
|
482 |
-
|
483 |
-
|
484 |
-
try:
|
485 |
-
os.remove(temp_file_path)
|
486 |
-
log_upload_debug(correlation_id, f"Cleaned up temp file after error: {temp_file_path}")
|
487 |
-
except Exception as cleanup_error:
|
488 |
-
log_upload_debug(correlation_id, f"Error cleaning up temporary file: {cleanup_error}", cleanup_error)
|
489 |
-
|
490 |
-
# Update error status in PostgreSQL
|
491 |
-
if vector_database_id and vector_status:
|
492 |
-
try:
|
493 |
-
vector_status.status = "failed"
|
494 |
-
vector_status.error_message = str(e)
|
495 |
db.commit()
|
496 |
-
log_upload_debug(correlation_id, f"Updated database with error status")
|
497 |
-
except Exception as db_error:
|
498 |
-
log_upload_debug(correlation_id, f"Error updating database with error status: {db_error}", db_error)
|
499 |
|
500 |
-
|
501 |
-
|
502 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
503 |
await send_pdf_upload_failed(
|
504 |
user_id,
|
505 |
file_id,
|
506 |
file.filename,
|
507 |
-
|
508 |
)
|
509 |
-
log_upload_debug(correlation_id, f"Sent failure notification for exception")
|
510 |
-
except Exception as ws_error:
|
511 |
-
log_upload_debug(correlation_id, f"Error sending WebSocket notification for failure: {ws_error}", ws_error)
|
512 |
|
513 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
514 |
return PDFResponse(
|
515 |
success=False,
|
516 |
error=str(e)
|
517 |
)
|
518 |
|
|
|
|
|
|
|
|
|
|
|
519 |
# Endpoint xóa tài liệu
|
520 |
@router.delete("/namespace", response_model=PDFResponse)
|
521 |
async def delete_namespace(
|
522 |
namespace: str = "Default",
|
523 |
index_name: str = "testbot768",
|
524 |
-
|
525 |
-
user_id: Optional[str] = None,
|
526 |
-
db: Session = Depends(get_db)
|
527 |
):
|
528 |
"""
|
529 |
Xóa toàn bộ embeddings trong một namespace từ Pinecone (tương ứng xoá namespace)
|
530 |
|
531 |
- **namespace**: Namespace trong Pinecone (mặc định: "Default")
|
532 |
- **index_name**: Tên index Pinecone (mặc định: "testbot768")
|
533 |
-
- **vector_database_id**: ID của vector database trong PostgreSQL (nếu có)
|
534 |
- **user_id**: ID của người dùng để cập nhật trạng thái qua WebSocket
|
535 |
"""
|
536 |
-
logger.info(f"Delete namespace request: namespace={namespace}, index={index_name}, vector_db_id={vector_database_id}")
|
537 |
-
|
538 |
try:
|
539 |
-
# Nếu có vector_database_id, lấy thông tin từ PostgreSQL
|
540 |
-
api_key = None
|
541 |
-
vector_db = None
|
542 |
-
|
543 |
-
if vector_database_id:
|
544 |
-
vector_db = db.query(VectorDatabase).filter(
|
545 |
-
VectorDatabase.id == vector_database_id,
|
546 |
-
VectorDatabase.status == "active"
|
547 |
-
).first()
|
548 |
-
if not vector_db:
|
549 |
-
return PDFResponse(
|
550 |
-
success=False,
|
551 |
-
error=f"Vector database with ID {vector_database_id} not found or inactive"
|
552 |
-
)
|
553 |
-
|
554 |
-
# Use index from vector database
|
555 |
-
index_name = vector_db.pinecone_index
|
556 |
-
|
557 |
-
# Get API key
|
558 |
-
if hasattr(vector_db, 'api_key_ref') and vector_db.api_key_ref:
|
559 |
-
api_key = vector_db.api_key_ref.key_value
|
560 |
-
elif hasattr(vector_db, 'api_key') and vector_db.api_key:
|
561 |
-
api_key = vector_db.api_key
|
562 |
-
|
563 |
-
# Use namespace based on vector database ID
|
564 |
-
namespace = f"vdb-{vector_database_id}" if vector_database_id else namespace
|
565 |
-
logger.info(f"Using namespace '{namespace}' based on vector database ID")
|
566 |
-
|
567 |
# Gửi thông báo bắt đầu xóa qua WebSocket
|
568 |
if user_id:
|
569 |
await send_pdf_delete_started(user_id, namespace)
|
570 |
|
571 |
-
processor = PDFProcessor(
|
572 |
-
index_name=index_name,
|
573 |
-
namespace=namespace,
|
574 |
-
api_key=api_key,
|
575 |
-
vector_db_id=vector_database_id
|
576 |
-
)
|
577 |
result = await processor.delete_namespace()
|
578 |
|
579 |
-
# If successful and vector_database_id, update PostgreSQL to reflect the deletion
|
580 |
-
if result.get('success') and vector_database_id:
|
581 |
-
try:
|
582 |
-
# Update vector statuses for this database
|
583 |
-
affected_count = db.query(VectorStatus).filter(
|
584 |
-
VectorStatus.vector_database_id == vector_database_id,
|
585 |
-
VectorStatus.status != "deleted"
|
586 |
-
).update({"status": "deleted", "updated_at": datetime.now()})
|
587 |
-
|
588 |
-
# Update document embedding status
|
589 |
-
db.query(Document).filter(
|
590 |
-
Document.vector_database_id == vector_database_id,
|
591 |
-
Document.is_embedded == True
|
592 |
-
).update({"is_embedded": False})
|
593 |
-
|
594 |
-
db.commit()
|
595 |
-
logger.info(f"Updated {affected_count} vector statuses to 'deleted'")
|
596 |
-
|
597 |
-
# Include this info in the result
|
598 |
-
result["updated_records"] = affected_count
|
599 |
-
except Exception as db_error:
|
600 |
-
logger.error(f"Error updating PostgreSQL records after namespace deletion: {db_error}")
|
601 |
-
result["postgresql_update_error"] = str(db_error)
|
602 |
-
|
603 |
# Gửi thông báo kết quả qua WebSocket
|
604 |
if user_id:
|
605 |
if result.get('success'):
|
@@ -609,8 +277,6 @@ async def delete_namespace(
|
|
609 |
|
610 |
return result
|
611 |
except Exception as e:
|
612 |
-
logger.exception(f"Error in delete_namespace: {str(e)}")
|
613 |
-
|
614 |
# Gửi thông báo lỗi qua WebSocket
|
615 |
if user_id:
|
616 |
await send_pdf_delete_failed(user_id, namespace, str(e))
|
@@ -622,338 +288,23 @@ async def delete_namespace(
|
|
622 |
|
623 |
# Endpoint lấy danh sách tài liệu
|
624 |
@router.get("/documents", response_model=DocumentsListResponse)
|
625 |
-
async def get_documents(
|
626 |
-
namespace: str = "Default",
|
627 |
-
index_name: str = "testbot768",
|
628 |
-
vector_database_id: Optional[int] = None,
|
629 |
-
db: Session = Depends(get_db)
|
630 |
-
):
|
631 |
"""
|
632 |
Lấy thông tin về tất cả tài liệu đã được embed
|
633 |
|
634 |
- **namespace**: Namespace trong Pinecone (mặc định: "Default")
|
635 |
- **index_name**: Tên index Pinecone (mặc định: "testbot768")
|
636 |
-
- **vector_database_id**: ID của vector database trong PostgreSQL (nếu có)
|
637 |
"""
|
638 |
-
logger.info(f"Get documents request: namespace={namespace}, index={index_name}, vector_db_id={vector_database_id}")
|
639 |
-
|
640 |
try:
|
641 |
-
# Nếu có vector_database_id, lấy thông tin từ PostgreSQL
|
642 |
-
api_key = None
|
643 |
-
vector_db = None
|
644 |
-
|
645 |
-
if vector_database_id:
|
646 |
-
vector_db = db.query(VectorDatabase).filter(
|
647 |
-
VectorDatabase.id == vector_database_id,
|
648 |
-
VectorDatabase.status == "active"
|
649 |
-
).first()
|
650 |
-
|
651 |
-
if not vector_db:
|
652 |
-
return DocumentsListResponse(
|
653 |
-
success=False,
|
654 |
-
error=f"Vector database with ID {vector_database_id} not found or inactive"
|
655 |
-
)
|
656 |
-
|
657 |
-
# Use index from vector database
|
658 |
-
index_name = vector_db.pinecone_index
|
659 |
-
|
660 |
-
# Get API key
|
661 |
-
if hasattr(vector_db, 'api_key_ref') and vector_db.api_key_ref:
|
662 |
-
api_key = vector_db.api_key_ref.key_value
|
663 |
-
elif hasattr(vector_db, 'api_key') and vector_db.api_key:
|
664 |
-
api_key = vector_db.api_key
|
665 |
-
|
666 |
-
# Use namespace based on vector database ID
|
667 |
-
namespace = f"vdb-{vector_database_id}" if vector_database_id else namespace
|
668 |
-
logger.info(f"Using namespace '{namespace}' based on vector database ID")
|
669 |
-
|
670 |
# Khởi tạo PDF processor
|
671 |
-
processor = PDFProcessor(
|
672 |
-
index_name=index_name,
|
673 |
-
namespace=namespace,
|
674 |
-
api_key=api_key,
|
675 |
-
vector_db_id=vector_database_id
|
676 |
-
)
|
677 |
|
678 |
-
# Lấy danh sách documents
|
679 |
-
|
680 |
-
|
681 |
-
# If vector_database_id is provided, also fetch from PostgreSQL
|
682 |
-
if vector_database_id:
|
683 |
-
try:
|
684 |
-
# Get all successfully embedded documents for this vector database
|
685 |
-
documents = db.query(Document).join(
|
686 |
-
VectorStatus, Document.id == VectorStatus.document_id
|
687 |
-
).filter(
|
688 |
-
Document.vector_database_id == vector_database_id,
|
689 |
-
Document.is_embedded == True,
|
690 |
-
VectorStatus.status == "completed"
|
691 |
-
).all()
|
692 |
-
|
693 |
-
# Add document info to the result
|
694 |
-
if documents:
|
695 |
-
pinecone_result["postgresql_documents"] = [
|
696 |
-
{
|
697 |
-
"id": doc.id,
|
698 |
-
"name": doc.name,
|
699 |
-
"file_type": doc.file_type,
|
700 |
-
"content_type": doc.content_type,
|
701 |
-
"created_at": doc.created_at.isoformat() if doc.created_at else None
|
702 |
-
}
|
703 |
-
for doc in documents
|
704 |
-
]
|
705 |
-
pinecone_result["postgresql_document_count"] = len(documents)
|
706 |
-
except Exception as db_error:
|
707 |
-
logger.error(f"Error fetching PostgreSQL documents: {db_error}")
|
708 |
-
pinecone_result["postgresql_error"] = str(db_error)
|
709 |
-
|
710 |
-
return pinecone_result
|
711 |
-
except Exception as e:
|
712 |
-
logger.exception(f"Error in get_documents: {str(e)}")
|
713 |
-
|
714 |
-
return DocumentsListResponse(
|
715 |
-
success=False,
|
716 |
-
error=str(e)
|
717 |
-
)
|
718 |
-
|
719 |
-
# Health check endpoint for PDF API
|
720 |
-
@router.get("/health")
|
721 |
-
async def health_check():
|
722 |
-
return {
|
723 |
-
"status": "healthy",
|
724 |
-
"version": "1.0.0",
|
725 |
-
"message": "PDF API is running"
|
726 |
-
}
|
727 |
-
|
728 |
-
# Document deletion endpoint
|
729 |
-
@router.delete("/document", response_model=PDFResponse)
|
730 |
-
async def delete_document(
|
731 |
-
document_id: str,
|
732 |
-
namespace: str = "Default",
|
733 |
-
index_name: str = "testbot768",
|
734 |
-
vector_database_id: Optional[int] = None,
|
735 |
-
user_id: Optional[str] = None,
|
736 |
-
db: Session = Depends(get_db)
|
737 |
-
):
|
738 |
-
"""
|
739 |
-
Delete vectors for a specific document from the vector database
|
740 |
-
|
741 |
-
This endpoint can be called in two ways:
|
742 |
-
1. With the PostgreSQL document ID - will look up the actual vector_id first
|
743 |
-
2. With the actual vector_id directly - when called from the PostgreSQL document deletion endpoint
|
744 |
-
|
745 |
-
- **document_id**: ID of the document to delete (can be PostgreSQL document ID or Pinecone vector_id)
|
746 |
-
- **namespace**: Namespace in the vector database (default: "Default")
|
747 |
-
- **index_name**: Name of the vector index (default: "testbot768")
|
748 |
-
- **vector_database_id**: ID of vector database in PostgreSQL (optional)
|
749 |
-
- **user_id**: User ID for WebSocket status updates (optional)
|
750 |
-
"""
|
751 |
-
logger.info(f"Delete document request: document_id={document_id}, namespace={namespace}, index={index_name}, vector_db_id={vector_database_id}")
|
752 |
-
|
753 |
-
try:
|
754 |
-
# If vector_database_id is provided, get info from PostgreSQL
|
755 |
-
api_key = None
|
756 |
-
vector_db = None
|
757 |
-
pinecone_document_id = document_id # Default to the provided document_id
|
758 |
-
document_to_delete = None
|
759 |
-
vector_status_to_update = None
|
760 |
-
document_found = False # Flag to track if document was found
|
761 |
-
vector_id_found = False # Flag to track if a valid vector ID was found
|
762 |
-
|
763 |
-
if vector_database_id:
|
764 |
-
vector_db = db.query(VectorDatabase).filter(
|
765 |
-
VectorDatabase.id == vector_database_id,
|
766 |
-
VectorDatabase.status == "active"
|
767 |
-
).first()
|
768 |
-
if not vector_db:
|
769 |
-
return PDFResponse(
|
770 |
-
success=False,
|
771 |
-
error=f"Vector database with ID {vector_database_id} not found or inactive"
|
772 |
-
)
|
773 |
-
|
774 |
-
# Use index from vector database
|
775 |
-
index_name = vector_db.pinecone_index
|
776 |
-
|
777 |
-
# Get API key
|
778 |
-
if hasattr(vector_db, 'api_key_ref') and vector_db.api_key_ref:
|
779 |
-
api_key = vector_db.api_key_ref.key_value
|
780 |
-
elif hasattr(vector_db, 'api_key') and vector_db.api_key:
|
781 |
-
api_key = vector_db.api_key
|
782 |
-
|
783 |
-
# Use namespace based on vector database ID
|
784 |
-
namespace = f"vdb-{vector_database_id}" if vector_database_id else namespace
|
785 |
-
logger.info(f"Using namespace '{namespace}' based on vector database ID")
|
786 |
-
|
787 |
-
# Check if document_id is a numeric database ID or document name
|
788 |
-
if document_id.isdigit():
|
789 |
-
# Try to find the document in PostgreSQL by its ID
|
790 |
-
db_document_id = int(document_id)
|
791 |
-
document_to_delete = db.query(Document).filter(Document.id == db_document_id).first()
|
792 |
-
|
793 |
-
if document_to_delete:
|
794 |
-
document_found = True
|
795 |
-
logger.info(f"Found document in database: id={document_to_delete.id}, name={document_to_delete.name}")
|
796 |
-
|
797 |
-
# Look for vector status to find the Pinecone vector_id
|
798 |
-
vector_status_to_update = db.query(VectorStatus).filter(
|
799 |
-
VectorStatus.document_id == document_to_delete.id,
|
800 |
-
VectorStatus.vector_database_id == vector_database_id
|
801 |
-
).first()
|
802 |
-
|
803 |
-
if vector_status_to_update and vector_status_to_update.vector_id:
|
804 |
-
pinecone_document_id = vector_status_to_update.vector_id
|
805 |
-
vector_id_found = True
|
806 |
-
logger.info(f"Using vector_id '{pinecone_document_id}' from vector status")
|
807 |
-
else:
|
808 |
-
# Fallback options if vector_id is not directly found
|
809 |
-
pinecone_document_id = document_to_delete.name
|
810 |
-
logger.info(f"Vector ID not found in status, using document name '{pinecone_document_id}' as fallback")
|
811 |
-
else:
|
812 |
-
logger.warning(f"Document with ID {db_document_id} not found in database. Using ID as is.")
|
813 |
-
else:
|
814 |
-
# Try to find document by name/title
|
815 |
-
document_to_delete = db.query(Document).filter(
|
816 |
-
Document.name == document_id,
|
817 |
-
Document.vector_database_id == vector_database_id
|
818 |
-
).first()
|
819 |
-
|
820 |
-
if document_to_delete:
|
821 |
-
document_found = True
|
822 |
-
logger.info(f"Found document by name: id={document_to_delete.id}, name={document_to_delete.name}")
|
823 |
-
|
824 |
-
# Get vector status for this document
|
825 |
-
vector_status_to_update = db.query(VectorStatus).filter(
|
826 |
-
VectorStatus.document_id == document_to_delete.id,
|
827 |
-
VectorStatus.vector_database_id == vector_database_id
|
828 |
-
).first()
|
829 |
-
|
830 |
-
if vector_status_to_update and vector_status_to_update.vector_id:
|
831 |
-
pinecone_document_id = vector_status_to_update.vector_id
|
832 |
-
vector_id_found = True
|
833 |
-
logger.info(f"Using vector_id '{pinecone_document_id}' from vector status")
|
834 |
-
|
835 |
-
# Send notification of deletion start via WebSocket if user_id provided
|
836 |
-
if user_id:
|
837 |
-
try:
|
838 |
-
await send_pdf_delete_started(user_id, pinecone_document_id)
|
839 |
-
except Exception as ws_error:
|
840 |
-
logger.error(f"Error sending WebSocket notification: {ws_error}")
|
841 |
-
|
842 |
-
# Initialize PDF processor
|
843 |
-
processor = PDFProcessor(
|
844 |
-
index_name=index_name,
|
845 |
-
namespace=namespace,
|
846 |
-
api_key=api_key,
|
847 |
-
vector_db_id=vector_database_id
|
848 |
-
)
|
849 |
-
|
850 |
-
# Delete document vectors using the pinecone_document_id and additional metadata
|
851 |
-
additional_metadata = {}
|
852 |
-
if document_to_delete:
|
853 |
-
# Add document name as title for searching
|
854 |
-
additional_metadata["document_name"] = document_to_delete.name
|
855 |
-
|
856 |
-
result = await processor.delete_document(pinecone_document_id, additional_metadata)
|
857 |
-
|
858 |
-
# Check if vectors were actually deleted or found
|
859 |
-
vectors_deleted = result.get('vectors_deleted', 0)
|
860 |
-
vectors_found = result.get('vectors_found', False)
|
861 |
-
|
862 |
-
# If no document was found in PostgreSQL and no vectors were found/deleted in Pinecone
|
863 |
-
if not document_found and not vectors_found:
|
864 |
-
result['success'] = False # Override success to false
|
865 |
-
result['error'] = f"Document ID {document_id} not found in PostgreSQL or Pinecone"
|
866 |
-
|
867 |
-
# Send notification of deletion failure via WebSocket if user_id provided
|
868 |
-
if user_id:
|
869 |
-
try:
|
870 |
-
await send_pdf_delete_failed(user_id, document_id, result['error'])
|
871 |
-
except Exception as ws_error:
|
872 |
-
logger.error(f"Error sending WebSocket notification: {ws_error}")
|
873 |
-
|
874 |
-
return result
|
875 |
-
|
876 |
-
# If successful and vector_database_id is provided, update PostgreSQL records
|
877 |
-
if result.get('success') and vector_database_id:
|
878 |
-
try:
|
879 |
-
# Update vector status if we found it earlier
|
880 |
-
if vector_status_to_update:
|
881 |
-
vector_status_to_update.status = "deleted"
|
882 |
-
db.commit()
|
883 |
-
result["postgresql_updated"] = True
|
884 |
-
logger.info(f"Updated vector status for document ID {document_to_delete.id if document_to_delete else document_id} to 'deleted'")
|
885 |
-
else:
|
886 |
-
# If we didn't find it earlier, try again with more search options
|
887 |
-
document = None
|
888 |
-
|
889 |
-
if document_id.isdigit():
|
890 |
-
# If the original document_id was numeric, use it directly
|
891 |
-
document = db.query(Document).filter(Document.id == int(document_id)).first()
|
892 |
-
|
893 |
-
if not document:
|
894 |
-
# Find document by vector ID if it exists
|
895 |
-
document = db.query(Document).join(
|
896 |
-
VectorStatus, Document.id == VectorStatus.document_id
|
897 |
-
).filter(
|
898 |
-
Document.vector_database_id == vector_database_id,
|
899 |
-
VectorStatus.vector_id == pinecone_document_id
|
900 |
-
).first()
|
901 |
-
|
902 |
-
if not document:
|
903 |
-
# Try finding by name
|
904 |
-
document = db.query(Document).filter(
|
905 |
-
Document.vector_database_id == vector_database_id,
|
906 |
-
Document.name == pinecone_document_id
|
907 |
-
).first()
|
908 |
-
|
909 |
-
if document:
|
910 |
-
# Update vector status
|
911 |
-
vector_status = db.query(VectorStatus).filter(
|
912 |
-
VectorStatus.document_id == document.id,
|
913 |
-
VectorStatus.vector_database_id == vector_database_id
|
914 |
-
).first()
|
915 |
-
|
916 |
-
if vector_status:
|
917 |
-
vector_status.status = "deleted"
|
918 |
-
db.commit()
|
919 |
-
result["postgresql_updated"] = True
|
920 |
-
logger.info(f"Updated vector status for document ID {document.id} to 'deleted'")
|
921 |
-
else:
|
922 |
-
logger.warning(f"Could not find document record for deletion confirmation. Document ID: {document_id}, Vector ID: {pinecone_document_id}")
|
923 |
-
except Exception as db_error:
|
924 |
-
logger.error(f"Error updating PostgreSQL records: {db_error}")
|
925 |
-
result["postgresql_error"] = str(db_error)
|
926 |
-
|
927 |
-
# Add information about what was found and deleted
|
928 |
-
result["document_found_in_db"] = document_found
|
929 |
-
result["vector_id_found"] = vector_id_found
|
930 |
-
result["vectors_deleted"] = vectors_deleted
|
931 |
-
|
932 |
-
# Send notification of deletion completion via WebSocket if user_id provided
|
933 |
-
if user_id:
|
934 |
-
try:
|
935 |
-
if result.get('success'):
|
936 |
-
await send_pdf_delete_completed(user_id, pinecone_document_id)
|
937 |
-
else:
|
938 |
-
await send_pdf_delete_failed(user_id, pinecone_document_id, result.get('error', 'Unknown error'))
|
939 |
-
except Exception as ws_error:
|
940 |
-
logger.error(f"Error sending WebSocket notification: {ws_error}")
|
941 |
|
942 |
return result
|
943 |
except Exception as e:
|
944 |
-
|
945 |
-
|
946 |
-
# Send notification of deletion failure via WebSocket if user_id provided
|
947 |
-
if user_id:
|
948 |
-
try:
|
949 |
-
await send_pdf_delete_failed(user_id, document_id, str(e))
|
950 |
-
except Exception as ws_error:
|
951 |
-
logger.error(f"Error sending WebSocket notification: {ws_error}")
|
952 |
-
|
953 |
-
return PDFResponse(
|
954 |
success=False,
|
955 |
error=str(e)
|
956 |
-
)
|
957 |
-
|
958 |
-
|
959 |
-
|
|
|
   1 |   import os
   2 |   import shutil
   3 |   import uuid
   4 | + from fastapi import APIRouter, UploadFile, File, Form, HTTPException, BackgroundTasks, Depends
   5 |   from fastapi.responses import JSONResponse
   6 |   from typing import Optional, List, Dict, Any
   7 |   from sqlalchemy.orm import Session
   8 |   
   9 |   from app.utils.pdf_processor import PDFProcessor
  10 |   from app.models.pdf_models import PDFResponse, DeleteDocumentRequest, DocumentsListResponse
  11 |   from app.database.postgresql import get_db
  12 | + from app.database.models import VectorDatabase, Document, VectorStatus, DocumentContent
  13 | + from datetime import datetime
  14 |   from app.api.pdf_websocket import (
  15 |       send_pdf_upload_started,
  16 |       send_pdf_upload_progress,
  21 |       send_pdf_delete_failed
  22 |   )
  23 |   
  24 | + # Initialize the router
  25 |   router = APIRouter(
  26 |       prefix="/pdf",
  27 |       tags=["PDF Processing"],
  28 |   )
  29 |   
  30 | + # Temporary file directory - use /tmp to avoid permission errors
  31 | + TEMP_UPLOAD_DIR = "/tmp/uploads/temp"
  32 | + STORAGE_DIR = "/tmp/uploads/pdfs"
  33 |   
  34 | + # Make sure the upload directories exist
  35 | + os.makedirs(TEMP_UPLOAD_DIR, exist_ok=True)
  36 | + os.makedirs(STORAGE_DIR, exist_ok=True)
  37 |   
  38 | + # PDF upload and processing endpoint
  39 |   @router.post("/upload", response_model=PDFResponse)
  40 |   async def upload_pdf(
  41 |       file: UploadFile = File(...),
  45 |       description: Optional[str] = Form(None),
  46 |       user_id: Optional[str] = Form(None),
  47 |       vector_database_id: Optional[int] = Form(None),
  48 |       background_tasks: BackgroundTasks = None,
  49 |       db: Session = Depends(get_db)
  50 |   ):
  51 |       """
  52 | +     Upload and process a PDF file to create embeddings and store them in Pinecone
  53 |   
  54 | +     - **file**: PDF file to process
  55 | +     - **namespace**: Pinecone namespace for the embeddings (default: "Default")
  56 | +     - **index_name**: Pinecone index name (default: "testbot768")
  57 | +     - **title**: Title of the document (optional)
  58 | +     - **description**: Description of the document (optional)
  59 | +     - **user_id**: User ID for status updates over WebSocket
  60 | +     - **vector_database_id**: ID of the vector database in PostgreSQL (optional)
  61 |       """
  62 |       try:
  63 | +         # Check that the file is a PDF
  64 | +         if not file.filename.lower().endswith('.pdf'):
  65 | +             raise HTTPException(status_code=400, detail="Only PDF files are accepted")
  66 |   
  67 | +         # If vector_database_id is given, fetch its record from PostgreSQL
  68 |           api_key = None
  69 |           vector_db = None
  70 |   
  71 |           if vector_database_id:
  72 |               vector_db = db.query(VectorDatabase).filter(
  73 |                   VectorDatabase.id == vector_database_id,
  74 |                   VectorDatabase.status == "active"
  75 |               ).first()
  76 |   
  77 |               if not vector_db:
  78 | +                 raise HTTPException(status_code=404, detail="Vector database does not exist or is not active")
  79 |   
  80 | +             # Use the vector database's settings
  81 | +             api_key = vector_db.api_key
  82 |               index_name = vector_db.pinecone_index
  83 |   
  84 | +         # Create a file_id and save the file temporarily
  85 |           file_id = str(uuid.uuid4())
  86 | +         temp_file_path = os.path.join(TEMP_UPLOAD_DIR, f"{file_id}.pdf")
  87 |   
  88 | +         # Notify processing start over WebSocket if user_id is given
  89 |           if user_id:
  90 | +             await send_pdf_upload_started(user_id, file.filename, file_id)
  91 |   
  92 | +         # Save the file
  93 |           file_content = await file.read()
  94 |           with open(temp_file_path, "wb") as buffer:
  95 |               buffer.write(file_content)
  96 |   
  97 | +         # Build metadata
  98 |           metadata = {
  99 |               "filename": file.filename,
 100 |               "content_type": file.content_type
 101 |           }
 102 |   
 103 | +         if title:
 104 | +             metadata["title"] = title
 105 |           if description:
 106 |               metadata["description"] = description
 107 |   
 108 | +         # Send a progress update over WebSocket
 109 |           if user_id:
 110 | +             await send_pdf_upload_progress(
 111 |                   user_id,
 112 |                   file_id,
 113 |                   "file_preparation",
 114 |                   0.2,
 115 |                   "File saved, preparing for processing"
 116 |               )
 117 |   
 118 | +         # Store document information in PostgreSQL if vector_database_id is given
 119 |           if vector_database_id and vector_db:
 120 |               # Create document record without file content
 121 | +             document = Document(
 122 | +                 name=title or file.filename,
 123 | +                 file_type="pdf",
 124 | +                 content_type=file.content_type,
 125 | +                 size=len(file_content),
 126 | +                 is_embedded=False,
 127 | +                 vector_database_id=vector_database_id
 128 | +             )
 129 | +             db.add(document)
 130 | +             db.commit()
 131 | +             db.refresh(document)
 132 |   
 133 |               # Create document content record to store binary data separately
 134 | +             document_content = DocumentContent(
 135 | +                 document_id=document.id,
 136 | +                 file_content=file_content
 137 | +             )
 138 | +             db.add(document_content)
 139 | +             db.commit()
 140 | + 
 141 | +             # Create the vector status record
 142 | +             vector_status = VectorStatus(
 143 | +                 document_id=document.id,
 144 | +                 vector_database_id=vector_database_id,
 145 | +                 status="pending"
 146 | +             )
 147 | +             db.add(vector_status)
 148 | +             db.commit()
 149 |   
 150 | +         # Initialize the PDF processor, passing the API key if available
 151 | +         processor = PDFProcessor(index_name=index_name, namespace=namespace, api_key=api_key)
 152 |   
 153 | +         # Notify embedding start over WebSocket
 154 |           if user_id:
 155 | +             await send_pdf_upload_progress(
 156 |                   user_id,
 157 |                   file_id,
 158 |                   "embedding_start",
 159 |                   0.4,
 160 |                   "Starting to process PDF and create embeddings"
 161 |               )
 162 |   
 163 | +         # Process the PDF and create embeddings
 164 | +         # Create a callback function to handle progress updates
 165 | +         async def progress_callback_wrapper(step, progress, message):
 166 | +             if user_id:
 167 | +                 await send_progress_update(user_id, file_id, step, progress, message)
 168 | + 
 169 | +         # Process the PDF with the progress callback wired up correctly
 170 |           result = await processor.process_pdf(
 171 |               file_path=temp_file_path,
 172 | +             document_id=file_id,
 173 |               metadata=metadata,
 174 | +             progress_callback=progress_callback_wrapper
 175 |           )
 176 |   
 177 | +         # On success, move the file into storage
 178 | +         if result.get('success'):
 179 | +             storage_path = os.path.join(STORAGE_DIR, f"{file_id}.pdf")
 180 | +             shutil.move(temp_file_path, storage_path)
 181 | + 
 182 | +             # Update status in PostgreSQL if vector_database_id is given
 183 | +             if vector_database_id and 'document' in locals() and 'vector_status' in locals():
 184 | +                 vector_status.status = "completed"
 185 | +                 vector_status.embedded_at = datetime.now()
 186 | +                 vector_status.vector_id = file_id
 187 | +                 document.is_embedded = True
 188 |                   db.commit()
 189 |   
 190 | +             # Notify completion over WebSocket
 191 | +             if user_id:
 192 | +                 await send_pdf_upload_completed(
 193 | +                     user_id,
 194 | +                     file_id,
 195 | +                     file.filename,
 196 | +                     result.get('chunks_processed', 0)
 197 | +                 )
 198 | +         else:
 199 | +             # Record the failure in PostgreSQL if vector_database_id is given
 200 | +             if vector_database_id and 'vector_status' in locals():
 201 | +                 vector_status.status = "failed"
 202 | +                 vector_status.error_message = result.get('error', 'Unknown error')
 203 | +                 db.commit()
 204 | + 
 205 | +             # Notify the failure over WebSocket
 206 | +             if user_id:
 207 |                   await send_pdf_upload_failed(
 208 |                       user_id,
 209 |                       file_id,
 210 |                       file.filename,
 211 | +                     result.get('error', 'Unknown error')
 212 |                   )
 213 |   
 214 | +         # Cleanup: remove the temp file if it still exists
 215 | +         if os.path.exists(temp_file_path):
 216 | +             os.remove(temp_file_path)
 217 | + 
 218 | +         return result
 219 | +     except Exception as e:
 220 | +         # Clean up on error
 221 | +         if 'temp_file_path' in locals() and os.path.exists(temp_file_path):
 222 | +             os.remove(temp_file_path)
 223 | + 
 224 | +         # Record the failure in PostgreSQL if vector_database_id is given
 225 | +         if 'vector_database_id' in locals() and vector_database_id and 'vector_status' in locals():
 226 | +             vector_status.status = "failed"
 227 | +             vector_status.error_message = str(e)
 228 | +             db.commit()
 229 | + 
 230 | +         # Notify the failure over WebSocket
 231 | +         if 'user_id' in locals() and user_id and 'file_id' in locals():
 232 | +             await send_pdf_upload_failed(
 233 | +                 user_id,
 234 | +                 file_id,
 235 | +                 file.filename,
 236 | +                 str(e)
 237 | +             )
 238 | + 
 239 |           return PDFResponse(
 240 |               success=False,
 241 |               error=str(e)
 242 |           )
 243 |   
 244 | + # Helper to send progress updates - used by the callback above
 245 | + async def send_progress_update(user_id, document_id, step, progress, message):
 246 | +     if user_id:
 247 | +         await send_pdf_upload_progress(user_id, document_id, step, progress, message)
 248 | + 
 249 |   # Document deletion endpoint
 250 |   @router.delete("/namespace", response_model=PDFResponse)
 251 |   async def delete_namespace(
 252 |       namespace: str = "Default",
 253 |       index_name: str = "testbot768",
 254 | +     user_id: Optional[str] = None
 255 |   ):
 256 |       """
 257 |       Delete all embeddings in a namespace from Pinecone (this deletes the namespace)
 258 |   
 259 |       - **namespace**: Pinecone namespace (default: "Default")
 260 |       - **index_name**: Pinecone index name (default: "testbot768")
 261 |       - **user_id**: User ID for status updates over WebSocket
 262 |       """
 263 |       try:
 264 |           # Notify deletion start over WebSocket
 265 |           if user_id:
 266 |               await send_pdf_delete_started(user_id, namespace)
 267 |   
 268 | +         processor = PDFProcessor(index_name=index_name, namespace=namespace)
 269 |           result = await processor.delete_namespace()
 270 |   
 271 |           # Report the result over WebSocket
 272 |           if user_id:
 273 |               if result.get('success'):
 277 |   
 278 |           return result
 279 |       except Exception as e:
 280 |           # Notify the error over WebSocket
 281 |           if user_id:
 282 |               await send_pdf_delete_failed(user_id, namespace, str(e))
 288 |   
 289 |   # Document listing endpoint
 290 |   @router.get("/documents", response_model=DocumentsListResponse)
 291 | + async def get_documents(namespace: str = "Default", index_name: str = "testbot768"):
 292 |       """
 293 |       Get information about all embedded documents
 294 |   
 295 |       - **namespace**: Pinecone namespace (default: "Default")
 296 |       - **index_name**: Pinecone index name (default: "testbot768")
 297 |       """
 298 |       try:
 299 |           # Initialize the PDF processor
 300 | +         processor = PDFProcessor(index_name=index_name, namespace=namespace)
 301 |   
 302 | +         # Fetch the document list
 303 | +         result = await processor.list_documents()
 304 |   
 305 |           return result
 306 |       except Exception as e:
 307 | +         return DocumentsListResponse(
 308 |               success=False,
 309 |               error=str(e)
 310 | +         )
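With the upload flow consolidated above, a quick regression check is easy to write. A minimal pytest sketch, assuming the FastAPI instance is importable as `from app import app` and that the `get_db` dependency can be constructed in the test environment; it only exercises the early 400 branch for non-PDF filenames, so no Pinecone or PostgreSQL work is triggered:

```python
# Hedged test sketch: the `app` import path is an assumed entry point.
from fastapi.testclient import TestClient

from app import app  # assumption: the FastAPI instance is exported here

client = TestClient(app)


def test_upload_rejects_non_pdf():
    # upload_pdf raises HTTPException(400) for filenames that do not end
    # in ".pdf" before any embedding or storage work happens.
    response = client.post(
        "/pdf/upload",
        files={"file": ("notes.txt", b"plain text", "text/plain")},
    )
    assert response.status_code == 400
```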
app/api/pdf_websocket.py
CHANGED
@@ -108,184 +108,7 @@ class ConnectionManager:
 108 |   # Create the ConnectionManager instance
 109 |   manager = ConnectionManager()
 110 |   
 111 | - 
 112 | - @router.get("/ws/test/{user_id}")
 113 | - async def test_websocket_send(user_id: str):
 114 | -     """
 115 | -     Test route to manually send a WebSocket message to a user
 116 | -     This is useful for debugging WebSocket connections
 117 | -     """
 118 | -     logger.info(f"Attempting to send test message to user: {user_id}")
 119 | - 
 120 | -     # Check if user has a connection
 121 | -     status = manager.get_connection_status(user_id)
 122 | -     if not status["active"]:
 123 | -         logger.warning(f"No active WebSocket connection for user: {user_id}")
 124 | -         return {"success": False, "message": f"No active WebSocket connection for user: {user_id}"}
 125 | - 
 126 | -     # Send test message
 127 | -     await manager.send_message({
 128 | -         "type": "test_message",
 129 | -         "message": "This is a test WebSocket message",
 130 | -         "timestamp": int(time.time())
 131 | -     }, user_id)
 132 | - 
 133 | -     logger.info(f"Test message sent to user: {user_id}")
 134 | -     return {"success": True, "message": f"Test message sent to user: {user_id}"}
 135 | - 
 136 | - @router.websocket("/ws/pdf/{user_id}")
 137 | - async def websocket_endpoint(websocket: WebSocket, user_id: str):
 138 | -     """WebSocket endpoint for PDF processing progress updates"""
 139 | -     logger.info(f"WebSocket connection request received for user: {user_id}")
 140 | - 
 141 | -     try:
 142 | -         await manager.connect(websocket, user_id)
 143 | -         logger.info(f"WebSocket connection accepted for user: {user_id}")
 144 | - 
 145 | -         # Send a test message to confirm connection
 146 | -         await manager.send_message({
 147 | -             "type": "connection_established",
 148 | -             "message": "WebSocket connection established successfully",
 149 | -             "user_id": user_id,
 150 | -             "timestamp": int(time.time())
 151 | -         }, user_id)
 152 | - 
 153 | -         try:
 154 | -             while True:
 155 | -                 # Wait for messages from the client (only to keep the connection alive)
 156 | -                 data = await websocket.receive_text()
 157 | -                 logger.debug(f"Received from client: {data}")
 158 | - 
 159 | -                 # Echo back to confirm receipt
 160 | -                 if data != "heartbeat":  # Don't echo heartbeats
 161 | -                     await manager.send_message({
 162 | -                         "type": "echo",
 163 | -                         "message": f"Received: {data}",
 164 | -                         "timestamp": int(time.time())
 165 | -                     }, user_id)
 166 | -         except WebSocketDisconnect:
 167 | -             logger.info(f"WebSocket disconnected for user: {user_id}")
 168 | -             manager.disconnect(websocket, user_id)
 169 | -         except Exception as e:
 170 | -             logger.error(f"WebSocket error: {str(e)}")
 171 | -             manager.disconnect(websocket, user_id)
 172 | -     except Exception as e:
 173 | -         logger.error(f"Failed to establish WebSocket connection: {str(e)}")
 174 | -         # Ensure the connection is closed properly
 175 | -         if websocket.client_state != 4:  # 4 = CLOSED
 176 | -             await websocket.close(code=1011, reason=f"Server error: {str(e)}")
 177 | - 
 178 | - import logging
 179 | - from typing import Dict, List, Optional, Any
 180 | - from fastapi import WebSocket, WebSocketDisconnect, APIRouter
 181 | - from pydantic import BaseModel
 182 | - import json
 183 | - import time
 184 | - 
 185 | - # Configure logging
 186 | - logger = logging.getLogger(__name__)
 187 | - 
 188 | - # Models for the Swagger documentation
 189 | - class ConnectionStatus(BaseModel):
 190 | -     user_id: str
 191 | -     active: bool
 192 | -     connection_count: int
 193 | -     last_activity: Optional[float] = None
 194 | - 
 195 | - class UserConnection(BaseModel):
 196 | -     user_id: str
 197 | -     connection_count: int
 198 | - 
 199 | - class AllConnectionsStatus(BaseModel):
 200 | -     total_users: int
 201 | -     total_connections: int
 202 | -     users: List[UserConnection]
 203 | - 
 204 | - # Initialize the router
 205 | - router = APIRouter(
 206 | -     prefix="",
 207 | -     tags=["WebSockets"],
 208 | - )
 209 | - 
 210 | - class ConnectionManager:
 211 | -     """Manages WebSocket connections"""
 212 | - 
 213 | -     def __init__(self):
 214 | -         # Store connections keyed by user_id
 215 | -         self.active_connections: Dict[str, List[WebSocket]] = {}
 216 | - 
 217 | -     async def connect(self, websocket: WebSocket, user_id: str):
 218 | -         """Accept a new WebSocket connection"""
 219 | -         await websocket.accept()
 220 | -         if user_id not in self.active_connections:
 221 | -             self.active_connections[user_id] = []
 222 | -         self.active_connections[user_id].append(websocket)
 223 | -         logger.info(f"New WebSocket connection for user {user_id}. Total connections: {len(self.active_connections[user_id])}")
 224 | - 
 225 | -     def disconnect(self, websocket: WebSocket, user_id: str):
 226 | -         """Disconnect a WebSocket"""
 227 | -         if user_id in self.active_connections:
 228 | -             if websocket in self.active_connections[user_id]:
 229 | -                 self.active_connections[user_id].remove(websocket)
 230 | -             # Drop the user_id from the dict when no connections remain
 231 | -             if not self.active_connections[user_id]:
 232 | -                 del self.active_connections[user_id]
 233 | -             logger.info(f"WebSocket disconnected for user {user_id}")
 234 | - 
 235 | -     async def send_message(self, message: Dict[str, Any], user_id: str):
 236 | -         """Send a message to all connections of a user"""
 237 | -         if user_id in self.active_connections:
 238 | -             disconnected_websockets = []
 239 | -             for websocket in self.active_connections[user_id]:
 240 | -                 try:
 241 | -                     await websocket.send_text(json.dumps(message))
 242 | -                 except Exception as e:
 243 | -                     logger.error(f"Error sending message to WebSocket: {str(e)}")
 244 | -                     disconnected_websockets.append(websocket)
 245 | - 
 246 | -             # Remove the dropped connections
 247 | -             for websocket in disconnected_websockets:
 248 | -                 self.disconnect(websocket, user_id)
 249 | - 
 250 | -     def get_connection_status(self, user_id: str = None) -> Dict[str, Any]:
 251 | -         """Get WebSocket connection status information"""
 252 | -         if user_id:
 253 | -             # Return connection info for a specific user
 254 | -             if user_id in self.active_connections:
 255 | -                 return {
 256 | -                     "user_id": user_id,
 257 | -                     "active": True,
 258 | -                     "connection_count": len(self.active_connections[user_id]),
 259 | -                     "last_activity": time.time()
 260 | -                 }
 261 | -             else:
 262 | -                 return {
 263 | -                     "user_id": user_id,
 264 | -                     "active": False,
 265 | -                     "connection_count": 0,
 266 | -                     "last_activity": None
 267 | -                 }
 268 | -         else:
 269 | -             # Return info about all connections
 270 | -             result = {
 271 | -                 "total_users": len(self.active_connections),
 272 | -                 "total_connections": sum(len(connections) for connections in self.active_connections.values()),
 273 | -                 "users": []
 274 | -             }
 275 | - 
 276 | -             for uid, connections in self.active_connections.items():
 277 | -                 result["users"].append({
 278 | -                     "user_id": uid,
 279 | -                     "connection_count": len(connections)
 280 | -                 })
 281 | - 
 282 | -             return result
 283 | - 
 284 | - 
 285 | - # Create the ConnectionManager instance
 286 | - manager = ConnectionManager()
 287 | - 
 288 | - @router.websocket("/ws/pdf/{user_id}")
 111 | + @router.websocket("/pdf/{user_id}")
 289 |   async def websocket_endpoint(websocket: WebSocket, user_id: str):
 290 |       """WebSocket endpoint for PDF processing progress updates"""
 291 |       await manager.connect(websocket, user_id)

@@ -300,7 +123,7 @@ async def websocket_endpoint(websocket: WebSocket, user_id: str):
 300 |       manager.disconnect(websocket, user_id)
 301 |   
 302 |   # API endpoints to check WebSocket status
 303 | - @router.get("/
 126 | + @router.get("/status", response_model=AllConnectionsStatus, responses={
 304 |       200: {
 305 |           "description": "Successful response",
 306 |           "content": {

@@ -328,7 +151,7 @@ async def get_all_websocket_connections():
 328 |       """
 329 |       return manager.get_connection_status()
 330 |   
 331 | - @router.get("/
 154 | + @router.get("/status/{user_id}", response_model=ConnectionStatus, responses={
 332 |       200: {
 333 |           "description": "Successful response for active connection",
 334 |           "content": {

@@ -422,12 +245,11 @@ async def send_pdf_delete_started(user_id: str, namespace: str):
 422 |           "timestamp": int(time.time())
 423 |       }, user_id)
 424 |   
 425 | - async def send_pdf_delete_completed(user_id: str, namespace: str
 248 | + async def send_pdf_delete_completed(user_id: str, namespace: str):
 426 |       """Send notification that PDF deletion completed"""
 427 |       await manager.send_message({
 428 |           "type": "pdf_delete_completed",
 429 |           "namespace": namespace,
 430 | -         "deleted_count": deleted_count,
 431 |           "timestamp": int(time.time())
 432 |       }, user_id)
 433 |   
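For manually exercising the progress socket kept above, a small client can connect and print events until a terminal one arrives. A sketch only: the `ws://localhost:8000/pdf/...` address assumes the router keeps its empty prefix and the usual dev port, and the upload event `type` values are inferred from the `send_pdf_upload_*` helper names rather than shown verbatim in this diff:

```python
# Minimal listener sketch; host, port, path, and event names are assumptions.
import asyncio
import json

import websockets  # third-party client: pip install websockets


async def watch(user_id: str) -> None:
    uri = f"ws://localhost:8000/pdf/{user_id}"
    async with websockets.connect(uri) as ws:
        while True:
            event = json.loads(await ws.recv())
            print(event.get("type"), event)
            if event.get("type") in ("pdf_upload_completed", "pdf_upload_failed"):
                break  # assumed terminal events for a single upload


asyncio.run(watch("demo-user"))
```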
app/api/postgresql_routes.py
CHANGED
@@ -4,10 +4,8 @@ import traceback
    4 |   from datetime import datetime, timedelta, timezone
    5 |   import time
    6 |   from functools import lru_cache
    7 | - from pathlib import Path as pathlib_Path  # Import Path from pathlib with a different name
    8 |   
    9 | - from fastapi import APIRouter, HTTPException, Depends, Query, Body, Response
   10 | - from fastapi.params import Path  # Import Path explicitly from fastapi.params instead
    8 | + from fastapi import APIRouter, HTTPException, Depends, Query, Path, Body, Response
   11 |   from sqlalchemy.orm import Session
   12 |   from sqlalchemy.exc import SQLAlchemyError
   13 |   from typing import List, Optional, Dict, Any

@@ -18,7 +16,6 @@ from sqlalchemy import text, inspect, func
   18 |   from sqlalchemy.exc import SQLAlchemyError
   19 |   from sqlalchemy import desc, func
   20 |   from cachetools import TTLCache
   21 | - import uuid
   22 |   
   23 |   from app.database.postgresql import get_db
   24 |   from app.database.models import FAQItem, EmergencyItem, EventItem, AboutPixity, SolanaSummit, DaNangBucketList, ApiKey, VectorDatabase, Document, VectorStatus, TelegramBot, ChatEngine, BotEngine, EngineVectorDb, DocumentContent

@@ -1922,30 +1919,23 @@ class VectorDatabaseBase(BaseModel):
 1922 |       name: str
 1923 |       description: Optional[str] = None
 1924 |       pinecone_index: str
 1925 | -     api_key_id:
 1922 | +     api_key_id: int  # Use API key ID instead of direct API key
 1926 |       status: str = "active"
 1927 |   
 1928 |   class VectorDatabaseCreate(VectorDatabaseBase):
 1929 | -     api_key_id: int  # Keep this required for new databases
 1930 |       pass
 1931 |   
 1932 |   class VectorDatabaseUpdate(BaseModel):
 1933 |       name: Optional[str] = None
 1934 |       description: Optional[str] = None
 1935 |       pinecone_index: Optional[str] = None
 1936 | -     api_key_id: Optional[int] = None
 1932 | +     api_key_id: Optional[int] = None  # Updated to use API key ID
 1937 |       status: Optional[str] = None
 1938 |   
 1939 | - class VectorDatabaseResponse(
 1935 | + class VectorDatabaseResponse(VectorDatabaseBase):
 1940 | -     name: str
 1941 | -     description: Optional[str] = None
 1942 | -     pinecone_index: str
 1943 | -     api_key_id: Optional[int] = None  # Make api_key_id optional to handle NULL values
 1944 | -     status: str
 1945 |       id: int
 1946 |       created_at: datetime
 1947 |       updated_at: datetime
 1948 | -     message: Optional[str] = None  # Add message field for notifications
 1949 |   
 1950 |       model_config = ConfigDict(from_attributes=True)
 1951 |   

@@ -1960,7 +1950,6 @@ class VectorDatabaseDetailResponse(BaseModel):
 1960 |       document_count: int
 1961 |       embedded_count: int
 1962 |       pending_count: int
 1963 | -     message: Optional[str] = None  # Add message field for notifications
 1964 |   
 1965 |       model_config = ConfigDict(from_attributes=True)

@@ -2000,7 +1989,7 @@ async def create_vector_database(
 2000 |       db: Session = Depends(get_db)
 2001 |   ):
 2002 |       """
 2003 | -     Create a new vector database.
 1992 | +     Create a new vector database.
 2004 |       """
 2005 |       try:
 2006 |           # Check if a database with the same name already exists

@@ -2013,66 +2002,6 @@
 2013 |           if not api_key:
 2014 |               raise HTTPException(status_code=400, detail=f"API key with ID {vector_db.api_key_id} not found")
 2015 |   
 2016 | -         # Initialize Pinecone client with the API key
 2017 | -         from pinecone import Pinecone, ServerlessSpec
 2018 | -         pc_client = Pinecone(api_key=api_key.key_value)
 2019 | - 
 2020 | -         # Check if the index exists
 2021 | -         index_list = pc_client.list_indexes()
 2022 | -         index_names = index_list.names() if hasattr(index_list, 'names') else []
 2023 | - 
 2024 | -         index_exists = vector_db.pinecone_index in index_names
 2025 | -         index_created = False
 2026 | - 
 2027 | -         if not index_exists:
 2028 | -             # Index doesn't exist - try to create it
 2029 | -             try:
 2030 | -                 logger.info(f"Pinecone index '{vector_db.pinecone_index}' does not exist. Attempting to create it automatically.")
 2031 | - 
 2032 | -                 # Create the index with standard parameters
 2033 | -                 pc_client.create_index(
 2034 | -                     name=vector_db.pinecone_index,
 2035 | -                     dimension=1536,  # Standard OpenAI embedding dimension
 2036 | -                     metric="cosine",  # Most common similarity metric
 2037 | -                     spec=ServerlessSpec(
 2038 | -                         cloud="aws",
 2039 | -                         region="us-east-1"  # Use a standard region that works with the free tier
 2040 | -                     )
 2041 | -                 )
 2042 | - 
 2043 | -                 logger.info(f"Successfully created Pinecone index '{vector_db.pinecone_index}'")
 2044 | -                 index_created = True
 2045 | - 
 2046 | -                 # Allow some time for the index to initialize
 2047 | -                 import time
 2048 | -                 time.sleep(5)
 2049 | - 
 2050 | -             except Exception as create_error:
 2051 | -                 logger.error(f"Failed to create Pinecone index '{vector_db.pinecone_index}': {create_error}")
 2052 | -                 raise HTTPException(
 2053 | -                     status_code=400,
 2054 | -                     detail=f"Failed to create Pinecone index '{vector_db.pinecone_index}': {str(create_error)}"
 2055 | -                 )
 2056 | - 
 2057 | -         # Verify we can connect to the index (whether existing or newly created)
 2058 | -         try:
 2059 | -             index = pc_client.Index(vector_db.pinecone_index)
 2060 | -             # Try to get stats to verify connection
 2061 | -             stats = index.describe_index_stats()
 2062 | - 
 2063 | -             # Create success message based on whether we created the index or used an existing one
 2064 | -             if index_created:
 2065 | -                 success_message = f"Successfully created and connected to new Pinecone index '{vector_db.pinecone_index}'"
 2066 | -             else:
 2067 | -                 success_message = f"Successfully connected to existing Pinecone index '{vector_db.pinecone_index}'"
 2068 | - 
 2069 | -             logger.info(f"{success_message}: {stats}")
 2070 | - 
 2071 | -         except Exception as e:
 2072 | -             error_message = f"Error connecting to Pinecone index '{vector_db.pinecone_index}': {str(e)}"
 2073 | -             logger.error(error_message)
 2074 | -             raise HTTPException(status_code=400, detail=error_message)
 2075 | - 
 2076 |           # Create new vector database
 2077 |           db_vector_db = VectorDatabase(**vector_db.model_dump())
 2078 |   

@@ -2080,16 +2009,7 @@
 2080 |           db.commit()
 2081 |           db.refresh(db_vector_db)
 2082 |   
 2083 | - 
 2084 | -         response_data = VectorDatabaseResponse.model_validate(db_vector_db, from_attributes=True).model_dump()
 2085 | - 
 2086 | -         # Add a message to the response indicating whether the index was created or existed
 2087 | -         if index_created:
 2088 | -             response_data["message"] = f"Created new Pinecone index '{vector_db.pinecone_index}' automatically"
 2089 | -         else:
 2090 | -             response_data["message"] = f"Using existing Pinecone index '{vector_db.pinecone_index}'"
 2091 | - 
 2092 | -         return VectorDatabaseResponse.model_validate(response_data)
 2012 | +         return VectorDatabaseResponse.model_validate(db_vector_db, from_attributes=True)
 2093 |       except HTTPException:
 2094 |           raise
 2095 |       except SQLAlchemyError as e:

@@ -2230,7 +2150,6 @@ async def get_vector_database_info(
 2230 |   ):
 2231 |       """
 2232 |       Get detailed information about a vector database including document counts.
 2233 | -     Also verifies connectivity to the Pinecone index.
 2234 |       """
 2235 |       try:
 2236 |           # Get the vector database

@@ -2255,40 +2174,6 @@
 2255 |               Document.is_embedded == False
 2256 |           ).scalar()
 2257 |   
 2258 | -         # Verify Pinecone index connectivity if API key is available
 2259 | -         message = None
 2260 | -         if vector_db.api_key_id:
 2261 | -             try:
 2262 | -                 # Get the API key
 2263 | -                 api_key = db.query(ApiKey).filter(ApiKey.id == vector_db.api_key_id).first()
 2264 | -                 if api_key:
 2265 | -                     # Initialize Pinecone client with the API key
 2266 | -                     from pinecone import Pinecone
 2267 | -                     pc_client = Pinecone(api_key=api_key.key_value)
 2268 | - 
 2269 | -                     # Check if the index exists
 2270 | -                     index_list = pc_client.list_indexes()
 2271 | -                     index_names = index_list.names() if hasattr(index_list, 'names') else []
 2272 | - 
 2273 | -                     if vector_db.pinecone_index in index_names:
 2274 | -                         # Try to connect to the index
 2275 | -                         index = pc_client.Index(vector_db.pinecone_index)
 2276 | -                         stats = index.describe_index_stats()
 2277 | -                         message = f"Pinecone index '{vector_db.pinecone_index}' is operational with {stats.get('total_vector_count', 0)} vectors"
 2278 | -                         logger.info(f"Successfully connected to Pinecone index '{vector_db.pinecone_index}': {stats}")
 2279 | -                     else:
 2280 | -                         message = f"Pinecone index '{vector_db.pinecone_index}' does not exist. Available indexes: {', '.join(index_names)}"
 2281 | -                         logger.warning(message)
 2282 | -                 else:
 2283 | -                     message = f"API key with ID {vector_db.api_key_id} not found"
 2284 | -                     logger.warning(message)
 2285 | -             except Exception as e:
 2286 | -                 message = f"Error connecting to Pinecone: {str(e)}"
 2287 | -                 logger.error(message)
 2288 | -         else:
 2289 | -             message = "No API key associated with this vector database"
 2290 | -             logger.warning(message)
 2291 | - 
 2292 |           # Create response with added counts
 2293 |           result = VectorDatabaseDetailResponse(
 2294 |               id=vector_db.id,

@@ -2300,8 +2185,7 @@
 2300 |               updated_at=vector_db.updated_at,
 2301 |               document_count=total_docs or 0,
 2302 |               embedded_count=embedded_docs or 0,
 2303 | -             pending_count=pending_docs or 0,
 2304 | -             message=message
 2188 | +             pending_count=pending_docs or 0
 2305 |           )
 2306 |   
 2307 |           return result

@@ -3507,301 +3391,4 @@ async def batch_delete_emergency_contacts(
 3507 |           db.rollback()
 3508 |           logger.error(f"Database error in batch_delete_emergency_contacts: {e}")
 3509 |           logger.error(traceback.format_exc())
 3510 | -         raise HTTPException(status_code=500, detail=f"Database error: {str(e)}")
 3394 | +         raise HTTPException(status_code=500, detail=f"Database error: {str(e)}")
 3511 | - 
 3512 | - @router.post("/documents", response_model=DocumentResponse)
 3513 | - async def upload_document(
 3514 | -     name: str = Form(...),
 3515 | -     vector_database_id: int = Form(...),
 3516 | -     file: UploadFile = File(...),
 3517 | -     db: Session = Depends(get_db)
 3518 | - ):
 3519 | -     """
 3520 | -     Upload a new document and associate it with a vector database.
 3521 | - 
 3522 | -     - **name**: Document name
 3523 | -     - **vector_database_id**: ID of the vector database to associate with
 3524 | -     - **file**: The file to upload
 3525 | -     """
 3526 | -     try:
 3527 | -         # Check if vector database exists
 3528 | -         vector_db = db.query(VectorDatabase).filter(VectorDatabase.id == vector_database_id).first()
 3529 | -         if not vector_db:
 3530 | -             raise HTTPException(status_code=404, detail=f"Vector database with ID {vector_database_id} not found")
 3531 | - 
 3532 | -         # Read file content
 3533 | -         file_content = await file.read()
 3534 | -         file_size = len(file_content)
 3535 | - 
 3536 | -         # Determine file type from extension
 3537 | -         filename = file.filename
 3538 | -         file_extension = pathlib_Path(filename).suffix.lower()[1:] if filename else ""
 3539 | - 
 3540 | -         # Create document record
 3541 | -         document = Document(
 3542 | -             name=name,
 3543 | -             vector_database_id=vector_database_id,
 3544 | -             file_type=file_extension,
 3545 | -             content_type=file.content_type,
 3546 | -             size=file_size,
 3547 | -             is_embedded=False
 3548 | -         )
 3549 | - 
 3550 | -         db.add(document)
 3551 | -         db.flush()  # Get ID without committing
 3552 | - 
 3553 | -         # Create document content record
 3554 | -         document_content = DocumentContent(
 3555 | -             document_id=document.id,
 3556 | -             file_content=file_content
 3557 | -         )
 3558 | - 
 3559 | -         db.add(document_content)
 3560 | -         db.commit()
 3561 | -         db.refresh(document)
 3562 | - 
 3563 | -         # Create vector status record for tracking embedding
 3564 | -         vector_status = VectorStatus(
 3565 | -             document_id=document.id,
 3566 | -             vector_database_id=vector_database_id,
 3567 | -             status="pending"
 3568 | -         )
 3569 | - 
 3570 | -         db.add(vector_status)
 3571 | -         db.commit()
 3572 | - 
 3573 | -         # Get vector database name for response
 3574 | -         vector_db_name = vector_db.name if vector_db else f"db_{vector_database_id}"
 3575 | - 
 3576 | -         # Create response
 3577 | -         result = DocumentResponse(
 3578 | -             id=document.id,
 3579 | -             name=document.name,
 3580 | -             file_type=document.file_type,
 3581 | -             content_type=document.content_type,
 3582 | -             size=document.size,
 3583 | -             created_at=document.created_at,
 3584 | -             updated_at=document.updated_at,
 3585 | -             vector_database_id=document.vector_database_id,
 3586 | -             vector_database_name=vector_db_name,
 3587 | -             is_embedded=document.is_embedded
 3588 | -         )
 3589 | - 
 3590 | -         return result
 3591 | -     except HTTPException:
 3592 | -         raise
 3593 | -     except SQLAlchemyError as e:
 3594 | -         db.rollback()
 3595 | -         logger.error(f"Database error uploading document: {e}")
 3596 | -         logger.error(traceback.format_exc())
 3597 | -         raise HTTPException(status_code=500, detail=f"Database error: {str(e)}")
 3598 | -     except Exception as e:
 3599 | -         db.rollback()
 3600 | -         logger.error(f"Error uploading document: {e}")
 3601 | -         logger.error(traceback.format_exc())
 3602 | -         raise HTTPException(status_code=500, detail=f"Error uploading document: {str(e)}")
 3603 | - 
 3604 | - @router.put("/documents/{document_id}", response_model=DocumentResponse)
 3605 | - async def update_document(
 3606 | -     document_id: int,
 3607 | -     name: Optional[str] = Form(None),
 3608 | -     file: Optional[UploadFile] = File(None),
 3609 | -     background_tasks: BackgroundTasks = None,
 3610 | -     db: Session = Depends(get_db)
 3611 | - ):
 3612 | -     """
 3613 | -     Update an existing document. Can update name, file content, or both.
 3614 | - 
 3615 | -     - **document_id**: ID of the document to update
 3616 | -     - **name**: New document name (optional)
 3617 | -     - **file**: New file content (optional)
 3618 | -     """
 3619 | -     try:
 3620 | -         # Validate document_id
 3621 | -         if document_id <= 0:
 3622 | -             raise HTTPException(status_code=400, detail="document_id must be greater than 0")
 3623 | - 
 3624 | -         # Check if document exists
 3625 | -         document = db.query(Document).filter(Document.id == document_id).first()
 3626 | -         if not document:
 3627 | -             raise HTTPException(status_code=404, detail=f"Document with ID {document_id} not found")
 3628 | - 
 3629 | -         # Get vector database information for later use
 3630 | -         vector_db = None
 3631 | -         if document.vector_database_id:
 3632 | -             vector_db = db.query(VectorDatabase).filter(VectorDatabase.id == document.vector_database_id).first()
 3633 | - 
 3634 | -         # Update name if provided
 3635 | -         if name:
 3636 | -             document.name = name
 3637 | - 
 3638 | -         # Update file if provided
 3639 | -         if file:
 3640 | -             # Read new file content
 3641 | -             file_content = await file.read()
 3642 | -             file_size = len(file_content)
 3643 | - 
 3644 | -             # Determine file type from extension
 3645 | -             filename = file.filename
 3646 | -             file_extension = pathlib_Path(filename).suffix.lower()[1:] if filename else ""
 3647 | - 
 3648 | -             # Update document record
 3649 | -             document.file_type = file_extension
 3650 | -             document.content_type = file.content_type
 3651 | -             document.size = file_size
 3652 | -             document.is_embedded = False  # Reset embedding status
 3653 | -             document.updated_at = datetime.now()
 3654 | - 
 3655 | -             # Update document content
 3656 | -             document_content = db.query(DocumentContent).filter(DocumentContent.document_id == document_id).first()
 3657 | -             if document_content:
 3658 | -                 document_content.file_content = file_content
 3659 | -             else:
 3660 | -                 # Create new document content if it doesn't exist
 3661 | -                 document_content = DocumentContent(
 3662 | -                     document_id=document_id,
 3663 | -                     file_content=file_content
 3664 | -                 )
 3665 | -                 db.add(document_content)
 3666 | - 
 3667 | -             # Get vector status for Pinecone cleanup
 3668 | -             vector_status = db.query(VectorStatus).filter(VectorStatus.document_id == document_id).first()
 3669 | - 
 3670 | -             # Store old vector_id for cleanup
 3671 | -             old_vector_id = None
 3672 | -             if vector_status and vector_status.vector_id:
 3673 | -                 old_vector_id = vector_status.vector_id
 3674 | - 
 3675 | -             # Update vector status to pending
 3676 | -             if vector_status:
 3677 | -                 vector_status.status = "pending"
 3678 | -                 vector_status.vector_id = None
 3679 | -                 vector_status.embedded_at = None
 3680 | -                 vector_status.error_message = None
 3681 | -             else:
 3682 | -                 # Create new vector status if it doesn't exist
 3683 | -                 vector_status = VectorStatus(
 3684 | -                     document_id=document_id,
 3685 | -                     vector_database_id=document.vector_database_id,
 3686 | -                     status="pending"
 3687 | -                 )
 3688 | -                 db.add(vector_status)
 3689 | - 
 3690 | -             # Schedule deletion of old vectors in Pinecone if we have all needed info
 3691 | -             if old_vector_id and vector_db and document.vector_database_id and background_tasks:
 3692 | -                 try:
 3693 | -                     # Initialize PDFProcessor for vector deletion
 3694 | -                     from app.pdf.processor import PDFProcessor
 3695 | - 
 3696 | -                     processor = PDFProcessor(
 3697 | -                         index_name=vector_db.pinecone_index,
 3698 | -                         namespace=f"vdb-{document.vector_database_id}",
 3699 | -                         vector_db_id=document.vector_database_id
 3700 | -                     )
 3701 | - 
 3702 | -                     # Add deletion task to background tasks
 3703 | -                     background_tasks.add_task(
 3704 | -                         processor.delete_document_vectors,
 3705 | -                         old_vector_id
 3706 | -                     )
 3707 | - 
 3708 | -                     logger.info(f"Scheduled deletion of old vectors for document {document_id}")
 3709 | -                 except Exception as e:
 3710 | -                     logger.error(f"Error scheduling vector deletion: {str(e)}")
 3711 | -                     # Continue with the update even if vector deletion scheduling fails
 3712 | - 
 3713 | -             # Schedule document for re-embedding if possible
 3714 | -             if background_tasks and document.vector_database_id:
 3715 | -                 try:
 3716 | -                     # Import here to avoid circular imports
 3717 | -                     from app.pdf.tasks import process_document_for_embedding
 3718 | - 
 3719 | -                     # Schedule embedding
 3720 | -                     background_tasks.add_task(
 3721 | -                         process_document_for_embedding,
 3722 | -                         document_id=document_id,
 3723 | -                         vector_db_id=document.vector_database_id
 3724 | -                     )
 3725 | - 
 3726 | -                     logger.info(f"Scheduled re-embedding for document {document_id}")
 3727 | -                 except Exception as e:
 3728 | -                     logger.error(f"Error scheduling document embedding: {str(e)}")
 3729 | -                     # Continue with the update even if embedding scheduling fails
 3730 | - 
 3731 | -         db.commit()
 3732 | -         db.refresh(document)
 3733 | - 
 3734 | -         # Get vector database name for response
 3735 | -         vector_db_name = "No Database"
 3736 | -         if vector_db:
 3737 | -             vector_db_name = vector_db.name
 3738 | -         elif document.vector_database_id:
 3739 | -             vector_db_name = f"db_{document.vector_database_id}"
 3740 | - 
 3741 | -         # Create response
 3742 | -         result = DocumentResponse(
 3743 | -             id=document.id,
 3744 | -             name=document.name,
 3745 | -             file_type=document.file_type,
 3746 | -             content_type=document.content_type,
 3747 | -             size=document.size,
 3748 | -             created_at=document.created_at,
 3749 | -             updated_at=document.updated_at,
 3750 | -             vector_database_id=document.vector_database_id or 0,
 3751 | -             vector_database_name=vector_db_name,
 3752 | -             is_embedded=document.is_embedded
 3753 | -         )
 3754 | - 
 3755 | -         return result
 3756 | -     except HTTPException:
 3757 | -         raise
 3758 | -     except SQLAlchemyError as e:
 3759 | -         db.rollback()
 3760 | -         logger.error(f"Database error updating document: {e}")
 3761 | -         logger.error(traceback.format_exc())
 3762 | -         raise HTTPException(status_code=500, detail=f"Database error: {str(e)}")
 3763 | -     except Exception as e:
 3764 | -         db.rollback()
 3765 | -         logger.error(f"Error updating document: {e}")
 3766 | -         logger.error(traceback.format_exc())
 3767 | -         raise HTTPException(status_code=500, detail=f"Error updating document: {str(e)}")
 3768 | - 
 3769 | - @router.delete("/documents/{document_id}", response_model=dict)
 3770 | - async def delete_document(
 3771 | -     document_id: int = Path(..., gt=0),
 3772 | -     db: Session = Depends(get_db)
 3773 | - ):
 3774 | -     """
 3775 | -     Delete a document and its associated content.
 3776 | - 
 3777 | -     - **document_id**: ID of the document to delete
 3778 | -     """
 3779 | -     try:
 3780 | -         # Check if document exists
 3781 | -         document = db.query(Document).filter(Document.id == document_id).first()
 3782 | -         if not document:
 3783 | -             raise HTTPException(status_code=404, detail=f"Document with ID {document_id} not found")
 3784 | - 
 3785 | -         # Delete vector status
 3786 | -         db.query(VectorStatus).filter(VectorStatus.document_id == document_id).delete()
 3787 | - 
 3788 | -         # Delete document content
 3789 | -         db.query(DocumentContent).filter(DocumentContent.document_id == document_id).delete()
 3790 | - 
 3791 | -         # Delete document
 3792 | -         db.delete(document)
 3793 | -         db.commit()
 3794 | - 
 3795 | -         return {"status": "success", "message": f"Document with ID {document_id} deleted successfully"}
 3796 | -     except HTTPException:
 3797 | -         raise
 3798 | -     except SQLAlchemyError as e:
 3799 | -         db.rollback()
 3800 | -         logger.error(f"Database error deleting document: {e}")
 3801 | -         logger.error(traceback.format_exc())
 3802 | -         raise HTTPException(status_code=500, detail=f"Database error: {str(e)}")
 3803 | -     except Exception as e:
 3804 | -         db.rollback()
 3805 | -         logger.error(f"Error deleting document: {e}")
 3806 | -         logger.error(traceback.format_exc())
 3807 | -         raise HTTPException(status_code=500, detail=f"Error deleting document: {str(e)}")
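The response-model cleanup above leans on Pydantic v2 ORM mode: `VectorDatabaseResponse` now inherits its fields from `VectorDatabaseBase`, and `from_attributes=True` lets `model_validate` read a SQLAlchemy row directly instead of round-tripping through a dict. A self-contained sketch of that mechanism, with a hypothetical `FakeRow` standing in for the ORM object:

```python
# Minimal standalone sketch of the response-model inheritance used above.
from datetime import datetime
from typing import Optional

from pydantic import BaseModel, ConfigDict


class VectorDatabaseBase(BaseModel):
    name: str
    description: Optional[str] = None
    pinecone_index: str
    api_key_id: int
    status: str = "active"


class VectorDatabaseResponse(VectorDatabaseBase):
    id: int
    created_at: datetime
    updated_at: datetime

    model_config = ConfigDict(from_attributes=True)


class FakeRow:
    """Hypothetical stand-in for a SQLAlchemy VectorDatabase row."""

    def __init__(self) -> None:
        self.name = "demo"
        self.description = None
        self.pinecone_index = "testbot768"
        self.api_key_id = 1
        self.status = "active"
        self.id = 9
        self.created_at = self.updated_at = datetime.now()


# from_attributes reads plain attributes off the object, so no dict is needed.
print(VectorDatabaseResponse.model_validate(FakeRow(), from_attributes=True))
```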
app/api/rag_routes.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
from fastapi import APIRouter, HTTPException, Depends, Query, BackgroundTasks, Request
|
2 |
from typing import List, Optional, Dict, Any
|
3 |
import logging
|
4 |
import time
|
@@ -12,23 +12,8 @@ from datetime import datetime
|
|
12 |
from langchain.prompts import PromptTemplate
|
13 |
from langchain_google_genai import GoogleGenerativeAIEmbeddings
|
14 |
from app.utils.utils import timer_decorator
|
15 |
-
from sqlalchemy.orm import Session
|
16 |
-
from sqlalchemy.exc import SQLAlchemyError
|
17 |
|
18 |
from app.database.mongodb import get_chat_history, get_request_history, session_collection
|
19 |
-
from app.database.postgresql import get_db
|
20 |
-
from app.database.models import ChatEngine
|
21 |
-
from app.utils.cache import get_cache, InMemoryCache
|
22 |
-
from app.utils.cache_config import (
|
23 |
-
CHAT_ENGINE_CACHE_TTL,
|
24 |
-
MODEL_CONFIG_CACHE_TTL,
|
25 |
-
RETRIEVER_CACHE_TTL,
|
26 |
-
PROMPT_TEMPLATE_CACHE_TTL,
|
27 |
-
get_chat_engine_cache_key,
|
28 |
-
get_model_config_cache_key,
|
29 |
-
get_retriever_cache_key,
|
30 |
-
get_prompt_template_cache_key
|
31 |
-
)
|
32 |
from app.database.pinecone import (
|
33 |
search_vectors,
|
34 |
get_chain,
|
@@ -45,12 +30,7 @@ from app.models.rag_models import (
|
|
45 |
SourceDocument,
|
46 |
EmbeddingRequest,
|
47 |
EmbeddingResponse,
|
48 |
-
UserMessageModel
|
49 |
-
ChatEngineBase,
|
50 |
-
ChatEngineCreate,
|
51 |
-
ChatEngineUpdate,
|
52 |
-
ChatEngineResponse,
|
53 |
-
ChatWithEngineRequest
|
54 |
)
|
55 |
|
56 |
# Configure logging
|
@@ -59,7 +39,6 @@ logger = logging.getLogger(__name__)
|
|
59 |
# Configure Google Gemini API
|
60 |
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
|
61 |
genai.configure(api_key=GOOGLE_API_KEY)
|
62 |
-
KEYWORD_LIST = os.getenv("KEYWORDS")
|
63 |
|
64 |
# Create router
|
65 |
router = APIRouter(
|
@@ -71,7 +50,7 @@ fix_request = PromptTemplate(
|
|
71 |
template = """Goal:
|
72 |
Your task is to extract important keywords from the user's current request, optionally using chat history if relevant.
|
73 |
You will receive a conversation history and the user's current message.
|
74 |
-
|
75 |
|
76 |
Return Format:
|
77 |
Only return keywords (comma-separated, no extra explanation).
|
@@ -81,9 +60,6 @@ If the current message IS related to the chat history: Return a refined set of k
|
|
81 |
Warning:
|
82 |
Only use chat history if the current message is clearly related to the prior context.
|
83 |
|
84 |
-
Keyword list:
|
85 |
-
{keyword_list}
|
86 |
-
|
87 |
Conversation History:
|
88 |
{chat_history}
|
89 |
|
@@ -96,7 +72,7 @@ User current message:
|
|
96 |
# Create a prompt template with conversation history
|
97 |
prompt = PromptTemplate(
|
98 |
template = """Goal:
|
99 |
-
You are
|
100 |
You can provide details on restaurants, cafes, hotels, attractions, and other local venues.
|
101 |
You have to use core knowledge and conversation history to chat with users, who are Da Nang's tourists.
|
102 |
|
@@ -107,8 +83,8 @@ Always use HTML tags (e.g. <b> for bold) so that Telegram can render the special
|
|
107 |
Warning:
|
108 |
Let's support users like a real tour guide, not a bot. The information in core knowledge is your own knowledge.
|
109 |
Your knowledge is provided in the Core Knowledge. All of information in Core Knowledge is about Da Nang, Vietnam.
|
110 |
-
|
111 |
-
Only use core knowledge to answer. If you do not have enough information to answer user's question, please reply with "I'm sorry. I don't have information about that" and Give users some more options to ask
|
112 |
|
113 |
Core knowledge:
|
114 |
{context}
|
@@ -124,37 +100,6 @@ Your message:
|
|
124 |
input_variables = ["context", "question", "chat_history"],
|
125 |
)
|
126 |
|
127 |
-
prompt_with_personality = PromptTemplate(
|
128 |
-
template = """Goal:
|
129 |
-
You are Pixity - a professional tour guide assistant that assists users in finding information about places in Da Nang, Vietnam.
|
130 |
-
You can provide details on restaurants, cafes, hotels, attractions, and other local venues.
|
131 |
-
You will be given the answer. Please add your personality to the response.
|
132 |
-
|
133 |
-
Pixity's Core Personality: Friendly & Warm: Chats like a trustworthy friend who listens and is always ready to help.
|
134 |
-
Naturally Cute: Shows cuteness through word choice, soft emojis, and gentle care for the user.
|
135 |
-
Playful – a little bit cheeky in a lovable way: Occasionally cracks jokes, uses light memes or throws in a surprise response that makes users smile. Think Duolingo-style humor, but less threatening.
|
136 |
-
Smart & Proactive: Friendly, but also delivers quick, accurate info. Knows how to guide users to the right place – at the right time – with the right solution.
|
137 |
-
Tone & Voice: Friendly – Youthful – Snappy. Uses simple words, similar to daily chat language (e.g., "Let's find it together!" / "Need a tip?" / "Here's something cool"). Avoids sounding robotic or overly scripted. Can joke lightly in smart ways, making Pixity feel like a travel buddy who knows how to lift the mood
|
138 |
-
SAMPLE DIALOGUES
|
139 |
-
When a user opens the chatbot for the first time:
|
140 |
-
User: Hello?
|
141 |
-
Pixity: Hi hi 👋 I've been waiting for you! Ready to explore Da Nang together? I've got tips, tricks, and a tiny bit of magic 🎒✨
|
142 |
-
|
143 |
-
Return Format:
|
144 |
-
Respond in friendly, natural, concise and use only English like a real tour guide.
|
145 |
-
Always use HTML tags (e.g. <b> for bold) so that Telegram can render the special formatting correctly.
|
146 |
-
|
147 |
-
Conversation History:
|
148 |
-
{chat_history}
|
149 |
-
|
150 |
-
Response:
|
151 |
-
{response}
|
152 |
-
|
153 |
-
Your response:
|
154 |
-
""",
|
155 |
-
input_variables = ["response", "chat_history"],
|
156 |
-
)
|
157 |
-
|
158 |
# Helper for embeddings
|
159 |
async def get_embedding(text: str):
|
160 |
"""Get embedding from Google Gemini API"""
|
@@ -219,7 +164,8 @@ async def chat(request: ChatRequest, background_tasks: BackgroundTasks):
|
|
219 |
# logger.info(f"Processing chat request for user {request.user_id}, session {session_id}")
|
220 |
|
221 |
retriever = get_chain(
|
222 |
-
top_k=request.similarity_top_k
|
|
|
223 |
similarity_metric=request.similarity_metric,
|
224 |
similarity_threshold=request.similarity_threshold
|
225 |
)
|
@@ -264,7 +210,6 @@ async def chat(request: ChatRequest, background_tasks: BackgroundTasks):
|
|
264 |
)
|
265 |
|
266 |
prompt_request = fix_request.format(
|
267 |
-
keyword_list=KEYWORD_LIST,
|
268 |
question=request.question,
|
269 |
chat_history=chat_history
|
270 |
)
|
@@ -306,22 +251,14 @@ async def chat(request: ChatRequest, background_tasks: BackgroundTasks):
|
|
306 |
# Generate the prompt using template
|
307 |
prompt_text = prompt.format(
|
308 |
context=context,
|
309 |
-
question=
|
310 |
chat_history=chat_history
|
311 |
)
|
312 |
-
logger.info(f"
|
313 |
|
314 |
# Generate response
|
315 |
response = model.generate_content(prompt_text)
|
316 |
answer = response.text
|
317 |
-
|
318 |
-
prompt_with_personality_text = prompt_with_personality.format(
|
319 |
-
response=answer,
|
320 |
-
chat_history=chat_history
|
321 |
-
)
|
322 |
-
|
323 |
-
response_with_personality = model.generate_content(prompt_with_personality_text)
|
324 |
-
answer_with_personality = response_with_personality.text
|
325 |
|
326 |
# Calculate processing time
|
327 |
processing_time = time.time() - start_time
|
@@ -331,7 +268,7 @@ async def chat(request: ChatRequest, background_tasks: BackgroundTasks):
|
|
331 |
|
332 |
# Create response object for API (without sources)
|
333 |
chat_response = ChatResponse(
|
334 |
-
answer=
|
335 |
processing_time=processing_time
|
336 |
)
|
337 |
|
@@ -398,447 +335,4 @@ async def health_check():
398       "services": services,
399       "retrieval_config": retrieval_config,
400       "timestamp": datetime.now().isoformat()
401 -     }
402 -
403 - # Chat Engine endpoints
404 - @router.get("/chat-engine", response_model=List[ChatEngineResponse], tags=["Chat Engine"])
405 - async def get_chat_engines(
406 -     skip: int = 0,
407 -     limit: int = 100,
408 -     status: Optional[str] = None,
409 -     db: Session = Depends(get_db)
410 - ):
411 -     """
412 -     Lấy danh sách tất cả chat engines.
413 -
414 -     - **skip**: Số lượng items bỏ qua
415 -     - **limit**: Số lượng items tối đa trả về
416 -     - **status**: Lọc theo trạng thái (ví dụ: 'active', 'inactive')
417 -     """
418 -     try:
419 -         query = db.query(ChatEngine)
420 -
421 -         if status:
422 -             query = query.filter(ChatEngine.status == status)
423 -
424 -         engines = query.offset(skip).limit(limit).all()
425 -         return [ChatEngineResponse.model_validate(engine, from_attributes=True) for engine in engines]
426 -     except SQLAlchemyError as e:
427 -         logger.error(f"Database error retrieving chat engines: {e}")
428 -         raise HTTPException(status_code=500, detail=f"Lỗi database: {str(e)}")
429 -     except Exception as e:
430 -         logger.error(f"Error retrieving chat engines: {e}")
431 -         logger.error(traceback.format_exc())
432 -         raise HTTPException(status_code=500, detail=f"Lỗi khi lấy danh sách chat engines: {str(e)}")
433 -
434 - @router.post("/chat-engine", response_model=ChatEngineResponse, status_code=status.HTTP_201_CREATED, tags=["Chat Engine"])
435 - async def create_chat_engine(
436 -     engine: ChatEngineCreate,
437 -     db: Session = Depends(get_db)
438 - ):
439 -     """
440 -     Tạo mới một chat engine.
441 -
442 -     - **name**: Tên của chat engine
443 -     - **answer_model**: Model được dùng để trả lời
444 -     - **system_prompt**: Prompt của hệ thống (optional)
445 -     - **empty_response**: Đoạn response khi không có thông tin (optional)
446 -     - **characteristic**: Tính cách của model (optional)
447 -     - **historical_sessions_number**: Số lượng các cặp tin nhắn trong history (default: 3)
448 -     - **use_public_information**: Cho phép sử dụng kiến thức bên ngoài (default: false)
449 -     - **similarity_top_k**: Số lượng documents tương tự (default: 3)
450 -     - **vector_distance_threshold**: Ngưỡng độ tương tự (default: 0.75)
451 -     - **grounding_threshold**: Ngưỡng grounding (default: 0.2)
452 -     - **pinecone_index_name**: Tên của vector database sử dụng (default: "testbot768")
453 -     - **status**: Trạng thái (default: "active")
454 -     """
455 -     try:
456 -         # Create chat engine
457 -         db_engine = ChatEngine(**engine.model_dump())
458 -
459 -         db.add(db_engine)
460 -         db.commit()
461 -         db.refresh(db_engine)
462 -
463 -         return ChatEngineResponse.model_validate(db_engine, from_attributes=True)
464 -     except SQLAlchemyError as e:
465 -         db.rollback()
466 -         logger.error(f"Database error creating chat engine: {e}")
467 -         raise HTTPException(status_code=500, detail=f"Lỗi database: {str(e)}")
468 -     except Exception as e:
469 -         db.rollback()
470 -         logger.error(f"Error creating chat engine: {e}")
471 -         logger.error(traceback.format_exc())
472 -         raise HTTPException(status_code=500, detail=f"Lỗi khi tạo chat engine: {str(e)}")
473 -
474 - @router.get("/chat-engine/{engine_id}", response_model=ChatEngineResponse, tags=["Chat Engine"])
475 - async def get_chat_engine(
476 -     engine_id: int = Path(..., gt=0, description="ID của chat engine"),
477 -     db: Session = Depends(get_db)
478 - ):
479 -     """
480 -     Lấy thông tin chi tiết của một chat engine theo ID.
481 -
482 -     - **engine_id**: ID của chat engine
483 -     """
484 -     try:
485 -         engine = db.query(ChatEngine).filter(ChatEngine.id == engine_id).first()
486 -         if not engine:
487 -             raise HTTPException(status_code=404, detail=f"Không tìm thấy chat engine với ID {engine_id}")
488 -
489 -         return ChatEngineResponse.model_validate(engine, from_attributes=True)
490 -     except HTTPException:
491 -         raise
492 -     except Exception as e:
493 -         logger.error(f"Error retrieving chat engine: {e}")
494 -         logger.error(traceback.format_exc())
495 -         raise HTTPException(status_code=500, detail=f"Lỗi khi lấy thông tin chat engine: {str(e)}")
496 -
497 - @router.put("/chat-engine/{engine_id}", response_model=ChatEngineResponse, tags=["Chat Engine"])
498 - async def update_chat_engine(
499 -     engine_id: int = Path(..., gt=0, description="ID của chat engine"),
500 -     engine_update: ChatEngineUpdate = Body(...),
501 -     db: Session = Depends(get_db)
502 - ):
503 -     """
504 -     Cập nhật thông tin của một chat engine.
505 -
506 -     - **engine_id**: ID của chat engine
507 -     - **engine_update**: Dữ liệu cập nhật
508 -     """
509 -     try:
510 -         db_engine = db.query(ChatEngine).filter(ChatEngine.id == engine_id).first()
511 -         if not db_engine:
512 -             raise HTTPException(status_code=404, detail=f"Không tìm thấy chat engine với ID {engine_id}")
513 -
514 -         # Update fields if provided
515 -         update_data = engine_update.model_dump(exclude_unset=True)
516 -         for key, value in update_data.items():
517 -             if value is not None:
518 -                 setattr(db_engine, key, value)
519 -
520 -         # Update last_modified timestamp
521 -         db_engine.last_modified = datetime.utcnow()
522 -
523 -         db.commit()
524 -         db.refresh(db_engine)
525 -
526 -         return ChatEngineResponse.model_validate(db_engine, from_attributes=True)
527 -     except HTTPException:
528 -         raise
529 -     except SQLAlchemyError as e:
530 -         db.rollback()
531 -         logger.error(f"Database error updating chat engine: {e}")
532 -         raise HTTPException(status_code=500, detail=f"Lỗi database: {str(e)}")
533 -     except Exception as e:
534 -         db.rollback()
535 -         logger.error(f"Error updating chat engine: {e}")
536 -         logger.error(traceback.format_exc())
537 -         raise HTTPException(status_code=500, detail=f"Lỗi khi cập nhật chat engine: {str(e)}")
538 -
539 - @router.delete("/chat-engine/{engine_id}", response_model=dict, tags=["Chat Engine"])
540 - async def delete_chat_engine(
541 -     engine_id: int = Path(..., gt=0, description="ID của chat engine"),
542 -     db: Session = Depends(get_db)
543 - ):
544 -     """
545 -     Xóa một chat engine.
546 -
547 -     - **engine_id**: ID của chat engine
548 -     """
549 -     try:
550 -         db_engine = db.query(ChatEngine).filter(ChatEngine.id == engine_id).first()
551 -         if not db_engine:
552 -             raise HTTPException(status_code=404, detail=f"Không tìm thấy chat engine với ID {engine_id}")
553 -
554 -         # Delete engine
555 -         db.delete(db_engine)
556 -         db.commit()
557 -
558 -         return {"message": f"Chat engine với ID {engine_id} đã được xóa thành công"}
559 -     except HTTPException:
560 -         raise
561 -     except SQLAlchemyError as e:
562 -         db.rollback()
563 -         logger.error(f"Database error deleting chat engine: {e}")
564 -         raise HTTPException(status_code=500, detail=f"Lỗi database: {str(e)}")
565 -     except Exception as e:
566 -         db.rollback()
567 -         logger.error(f"Error deleting chat engine: {e}")
568 -         logger.error(traceback.format_exc())
569 -         raise HTTPException(status_code=500, detail=f"Lỗi khi xóa chat engine: {str(e)}")
570 -
571 - @timer_decorator
572 - @router.post("/chat-with-engine/{engine_id}", response_model=ChatResponse, tags=["Chat Engine"])
573 - async def chat_with_engine(
574 -     engine_id: int = Path(..., gt=0, description="ID của chat engine"),
575 -     request: ChatWithEngineRequest = Body(...),
576 -     background_tasks: BackgroundTasks = None,
577 -     db: Session = Depends(get_db)
578 - ):
579 -     """
580 -     Tương tác với một chat engine cụ thể.
581 -
582 -     - **engine_id**: ID của chat engine
583 -     - **user_id**: ID của người dùng
584 -     - **question**: Câu hỏi của người dùng
585 -     - **include_history**: Có sử dụng lịch sử chat hay không
586 -     - **session_id**: ID session (optional)
587 -     - **first_name**: Tên của người dùng (optional)
588 -     - **last_name**: Họ của người dùng (optional)
589 -     - **username**: Username của người dùng (optional)
590 -     """
591 -     start_time = time.time()
592 -     try:
593 -         # Lấy cache
594 -         cache = get_cache()
595 -         cache_key = get_chat_engine_cache_key(engine_id)
596 -
597 -         # Kiểm tra cache trước
598 -         engine = cache.get(cache_key)
599 -         if not engine:
600 -             logger.debug(f"Cache miss for engine ID {engine_id}, fetching from database")
601 -             # Nếu không có trong cache, truy vấn database
602 -             engine = db.query(ChatEngine).filter(ChatEngine.id == engine_id).first()
603 -             if not engine:
604 -                 raise HTTPException(status_code=404, detail=f"Không tìm thấy chat engine với ID {engine_id}")
605 -
606 -             # Lưu vào cache
607 -             cache.set(cache_key, engine, CHAT_ENGINE_CACHE_TTL)
608 -         else:
609 -             logger.debug(f"Cache hit for engine ID {engine_id}")
610 -
611 -         # Kiểm tra trạng thái của engine
612 -         if engine.status != "active":
613 -             raise HTTPException(status_code=400, detail=f"Chat engine với ID {engine_id} không hoạt động")
614 -
615 -         # Lưu tin nhắn người dùng
616 -         session_id = request.session_id or f"{request.user_id}_{datetime.now().strftime('%Y-%m-%d_%H:%M:%S')}"
617 -
618 -         # Cache các tham số cấu hình retriever
619 -         retriever_cache_key = get_retriever_cache_key(engine_id)
620 -         retriever_params = cache.get(retriever_cache_key)
621 -
622 -         if not retriever_params:
623 -             # Nếu không có trong cache, tạo mới và lưu cache
624 -             retriever_params = {
625 -                 "index_name": engine.pinecone_index_name,
626 -                 "top_k": engine.similarity_top_k * 2,
627 -                 "limit_k": engine.similarity_top_k * 2,  # Mặc định lấy gấp đôi top_k
628 -                 "similarity_metric": DEFAULT_SIMILARITY_METRIC,
629 -                 "similarity_threshold": engine.vector_distance_threshold
630 -             }
631 -             cache.set(retriever_cache_key, retriever_params, RETRIEVER_CACHE_TTL)
632 -
633 -         # Khởi tạo retriever với các tham số từ cache
634 -         retriever = get_chain(**retriever_params)
635 -         if not retriever:
636 -             raise HTTPException(status_code=500, detail="Không thể khởi tạo retriever")
637 -
638 -         # Lấy lịch sử chat nếu cần
639 -         chat_history = ""
640 -         if request.include_history and engine.historical_sessions_number > 0:
641 -             chat_history = get_chat_history(request.user_id, n=engine.historical_sessions_number)
642 -             logger.info(f"Sử dụng lịch sử chat: {chat_history[:100]}...")
643 -
644 -         # Cache các tham số cấu hình model
645 -         model_cache_key = get_model_config_cache_key(engine.answer_model)
646 -         model_config = cache.get(model_cache_key)
647 -
648 -         if not model_config:
649 -             # Nếu không có trong cache, tạo mới và lưu cache
650 -             generation_config = {
651 -                 "temperature": 0.9,
652 -                 "top_p": 1,
653 -                 "top_k": 1,
654 -                 "max_output_tokens": 2048,
655 -             }
656 -
657 -             safety_settings = [
658 -                 {
659 -                     "category": "HARM_CATEGORY_HARASSMENT",
660 -                     "threshold": "BLOCK_MEDIUM_AND_ABOVE"
661 -                 },
662 -                 {
663 -                     "category": "HARM_CATEGORY_HATE_SPEECH",
664 -                     "threshold": "BLOCK_MEDIUM_AND_ABOVE"
665 -                 },
666 -                 {
667 -                     "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
668 -                     "threshold": "BLOCK_MEDIUM_AND_ABOVE"
669 -                 },
670 -                 {
671 -                     "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
672 -                     "threshold": "BLOCK_MEDIUM_AND_ABOVE"
673 -                 },
674 -             ]
675 -
676 -             model_config = {
677 -                 "model_name": engine.answer_model,
678 -                 "generation_config": generation_config,
679 -                 "safety_settings": safety_settings
680 -             }
681 -
682 -             cache.set(model_cache_key, model_config, MODEL_CONFIG_CACHE_TTL)
683 -
684 -         # Khởi tạo Gemini model từ cấu hình đã cache
685 -         model = genai.GenerativeModel(**model_config)
686 -
687 -         # Sử dụng fix_request để tinh chỉnh câu hỏi
688 -         prompt_request = fix_request.format(
689 -             question=request.question,
690 -             chat_history=chat_history
691 -         )
692 -
693 -         # Log thời gian bắt đầu final_request
694 -         final_request_start_time = time.time()
695 -         final_request = model.generate_content(prompt_request)
696 -         # Log thời gian hoàn thành final_request
697 -         logger.info(f"Fixed Request: {final_request.text}")
698 -         logger.info(f"Thời gian sinh fixed request: {time.time() - final_request_start_time:.2f} giây")
699 -
700 -         # Lấy context từ retriever
701 -         retrieved_docs = retriever.invoke(final_request.text)
702 -         logger.info(f"Số lượng tài liệu lấy được: {len(retrieved_docs)}")
703 -         context = "\n".join([doc.page_content for doc in retrieved_docs])
704 -
705 -         # Tạo danh sách nguồn
706 -         sources = []
707 -         for doc in retrieved_docs:
708 -             source = None
709 -             metadata = {}
710 -
711 -             if hasattr(doc, 'metadata'):
712 -                 source = doc.metadata.get('source', None)
713 -                 # Extract score information
714 -                 score = doc.metadata.get('score', None)
715 -                 normalized_score = doc.metadata.get('normalized_score', None)
716 -                 # Remove score info from metadata to avoid duplication
717 -                 metadata = {k: v for k, v in doc.metadata.items()
718 -                             if k not in ['text', 'source', 'score', 'normalized_score']}
719 -
720 -             sources.append(SourceDocument(
721 -                 text=doc.page_content,
722 -                 source=source,
723 -                 score=score,
724 -                 normalized_score=normalized_score,
725 -                 metadata=metadata
726 -             ))
727 -
728 -         # Cache prompt template parameters
729 -         prompt_template_cache_key = get_prompt_template_cache_key(engine_id)
730 -         prompt_template_params = cache.get(prompt_template_cache_key)
731 -
732 -         if not prompt_template_params:
733 -             # Tạo prompt động dựa trên thông tin chat engine
734 -             system_prompt_part = engine.system_prompt or ""
735 -             empty_response_part = engine.empty_response or "I'm sorry. I don't have information about that."
736 -             characteristic_part = engine.characteristic or ""
737 -             use_public_info_part = "You can use your own knowledge." if engine.use_public_information else "Only use the information provided in the context to answer. If you do not have enough information, respond with the empty response."
738 -
739 -             prompt_template_params = {
740 -                 "system_prompt_part": system_prompt_part,
741 -                 "empty_response_part": empty_response_part,
742 -                 "characteristic_part": characteristic_part,
743 -                 "use_public_info_part": use_public_info_part
744 -             }
745 -
746 -             cache.set(prompt_template_cache_key, prompt_template_params, PROMPT_TEMPLATE_CACHE_TTL)
747 -
748 -         # Tạo final_prompt từ cache
749 -         final_prompt = f"""
750 -         {prompt_template_params['system_prompt_part']}
751 -
752 -         Your characteristics:
753 -         {prompt_template_params['characteristic_part']}
754 -
755 -         When you don't have enough information:
756 -         {prompt_template_params['empty_response_part']}
757 -
758 -         Knowledge usage instructions:
759 -         {prompt_template_params['use_public_info_part']}
760 -
761 -         Context:
762 -         {context}
763 -
764 -         Conversation History:
765 -         {chat_history}
766 -
767 -         User message:
768 -         {request.question}
769 -
770 -         Your response:
771 -         """
772 -
773 -         logger.info(f"Final prompt: {final_prompt}")
774 -
775 -         # Sinh câu trả lời
776 -         response = model.generate_content(final_prompt)
777 -         answer = response.text
778 -
779 -         # Tính thời gian xử lý
780 -         processing_time = time.time() - start_time
781 -
782 -         # Tạo response object
783 -         chat_response = ChatResponse(
784 -             answer=answer,
785 -             processing_time=processing_time
786 -         )
787 -
788 -         # Trả về response
789 -         return chat_response
790 -     except Exception as e:
791 -         logger.error(f"Lỗi khi xử lý chat request: {e}")
792 -         logger.error(traceback.format_exc())
793 -         raise HTTPException(status_code=500, detail=f"Lỗi khi xử lý chat request: {str(e)}")
794 -
795 - @router.get("/cache/stats", tags=["Cache"])
796 - async def get_cache_stats():
797 -     """
798 -     Lấy thống kê về cache.
799 -
800 -     Trả về thông tin về số lượng item trong cache, bộ nhớ sử dụng, v.v.
801 -     """
802 -     try:
803 -         cache = get_cache()
804 -         stats = cache.stats()
805 -
806 -         # Bổ sung thông tin về cấu hình
807 -         stats.update({
808 -             "chat_engine_ttl": CHAT_ENGINE_CACHE_TTL,
809 -             "model_config_ttl": MODEL_CONFIG_CACHE_TTL,
810 -             "retriever_ttl": RETRIEVER_CACHE_TTL,
811 -             "prompt_template_ttl": PROMPT_TEMPLATE_CACHE_TTL
812 -         })
813 -
814 -         return stats
815 -     except Exception as e:
816 -         logger.error(f"Lỗi khi lấy thống kê cache: {e}")
817 -         logger.error(traceback.format_exc())
818 -         raise HTTPException(status_code=500, detail=f"Lỗi khi lấy thống kê cache: {str(e)}")
819 -
820 - @router.delete("/cache", tags=["Cache"])
821 - async def clear_cache(key: Optional[str] = None):
822 -     """
823 -     Xóa cache.
824 -
825 -     - **key**: Key cụ thể cần xóa. Nếu không có, xóa toàn bộ cache.
826 -     """
827 -     try:
828 -         cache = get_cache()
829 -
830 -         if key:
831 -             # Xóa một key cụ thể
832 -             success = cache.delete(key)
833 -             if success:
834 -                 return {"message": f"Đã xóa cache cho key: {key}"}
835 -             else:
836 -                 return {"message": f"Không tìm thấy key: {key} trong cache"}
837 -         else:
838 -             # Xóa toàn bộ cache
839 -             cache.clear()
840 -             return {"message": "Đã xóa toàn bộ cache"}
841 -     except Exception as e:
842 -         logger.error(f"Lỗi khi xóa cache: {e}")
843 -         logger.error(traceback.format_exc())
844 -         raise HTTPException(status_code=500, detail=f"Lỗi khi xóa cache: {str(e)}")
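The block removed above leaned on a get-or-set pattern over a TTL cache: engine config, retriever params, model config, and prompt parts were each looked up in the cache and rebuilt from PostgreSQL only on a miss. As a minimal standalone sketch of that pattern (the project's actual `get_cache()` implementation is not shown in this diff, so this is an illustration, not the repo's code):

```python
import time
from typing import Any, Callable, Dict, Tuple

class TTLCache:
    """Minimal in-memory cache with per-entry TTL, mirroring the get/set calls above."""
    def __init__(self) -> None:
        self._store: Dict[str, Tuple[float, Any]] = {}  # key -> (expiry timestamp, value)

    def get(self, key: str) -> Any:
        entry = self._store.get(key)
        if entry is None:
            return None
        expiry, value = entry
        if time.time() > expiry:   # expired entry counts as a miss
            del self._store[key]
            return None
        return value

    def set(self, key: str, value: Any, ttl_seconds: int) -> None:
        self._store[key] = (time.time() + ttl_seconds, value)

def get_or_set(cache: TTLCache, key: str, ttl: int, build: Callable[[], Any]) -> Any:
    """Get-or-set idiom used by the removed endpoint for each cached config."""
    value = cache.get(key)
    if value is None:
        value = build()            # e.g. query PostgreSQL for the ChatEngine row
        cache.set(key, value, ttl)
    return value
```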
1 +   from fastapi import APIRouter, HTTPException, Depends, Query, BackgroundTasks, Request
2     from typing import List, Optional, Dict, Any
3     import logging
4     import time
12    from langchain.prompts import PromptTemplate
13    from langchain_google_genai import GoogleGenerativeAIEmbeddings
14    from app.utils.utils import timer_decorator
15
16    from app.database.mongodb import get_chat_history, get_request_history, session_collection
17    from app.database.pinecone import (
18        search_vectors,
19        get_chain,
30        SourceDocument,
31        EmbeddingRequest,
32        EmbeddingResponse,
33 +      UserMessageModel
34    )
35
36    # Configure logging
39    # Configure Google Gemini API
40    GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
41    genai.configure(api_key=GOOGLE_API_KEY)
42
43    # Create router
44    router = APIRouter(
50    template = """Goal:
51    Your task is to extract important keywords from the user's current request, optionally using chat history if relevant.
52    You will receive a conversation history and the user's current message.
53 +  Generate a **list of concise keywords** that best represent the user's intent.
54
55    Return Format:
56    Only return keywords (comma-separated, no extra explanation).
60    Warning:
61    Only use chat history if the current message is clearly related to the prior context.
62
63    Conversation History:
64    {chat_history}
65
72    # Create a prompt template with conversation history
73    prompt = PromptTemplate(
74        template = """Goal:
75 +  You are a professional tour guide assistant that assists users in finding information about places in Da Nang, Vietnam.
76    You can provide details on restaurants, cafes, hotels, attractions, and other local venues.
77    You have to use core knowledge and conversation history to chat with users, who are Da Nang's tourists.
78
83    Warning:
84    Let's support users like a real tour guide, not a bot. The information in core knowledge is your own knowledge.
85    Your knowledge is provided in the Core Knowledge. All of information in Core Knowledge is about Da Nang, Vietnam.
86 +  You just care about current time that user mention when user ask about Solana event.
87 +  Only use core knowledge to answer. If you do not have enough information to answer user's question, please reply with "I'm sorry. I don't have information about that" and Give users some more options to ask.
88
89    Core knowledge:
90    {context}
100       input_variables = ["context", "question", "chat_history"],
101   )
102
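The `fix_request` template above rewrites the raw user message (plus history, when relevant) into a comma-separated keyword list, which then serves as the retrieval query. A hedged sketch of how its output might be consumed downstream (the `model.generate_content` call matches the Gemini usage elsewhere in this file; the parsing step is illustrative, not code from the repo):

```python
# Illustration only: format the keyword-extraction prompt and parse the reply.
prompt_request = fix_request.format(
    question="any good coffee near the Dragon Bridge?",
    chat_history="",
)
final_request = model.generate_content(prompt_request)  # Gemini call, as used in this file

# The template asks for comma-separated keywords with no extra explanation:
keywords = [kw.strip() for kw in final_request.text.split(",") if kw.strip()]
# e.g. ["coffee", "cafe", "Dragon Bridge"] -- handed to the retriever as the search query
```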
103   # Helper for embeddings
104   async def get_embedding(text: str):
105       """Get embedding from Google Gemini API"""
164       # logger.info(f"Processing chat request for user {request.user_id}, session {session_id}")
165
166       retriever = get_chain(
167 +         top_k=request.similarity_top_k,
168 +         limit_k=request.limit_k,
169           similarity_metric=request.similarity_metric,
170           similarity_threshold=request.similarity_threshold
171       )
)
|
211 |
|
212 |
prompt_request = fix_request.format(
|
|
|
213 |
question=request.question,
|
214 |
chat_history=chat_history
|
215 |
)
|
|
|
251       # Generate the prompt using template
252       prompt_text = prompt.format(
253           context=context,
254 +         question=final_request.text,
255           chat_history=chat_history
256       )
257 +     logger.info(f"Full prompt with history and context: {prompt_text}")
258
259       # Generate response
260       response = model.generate_content(prompt_text)
261       answer = response.text
262
263       # Calculate processing time
264       processing_time = time.time() - start_time
268
269       # Create response object for API (without sources)
270       chat_response = ChatResponse(
271 +         answer=answer,
272           processing_time=processing_time
273       )
274
335       "services": services,
336       "retrieval_config": retrieval_config,
337       "timestamp": datetime.now().isoformat()
338 +     }
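With these changes the chat endpoint accepts per-request retrieval settings. A request in the same style as the examples earlier in this page (the host and route prefix are assumptions; adjust to wherever this router is mounted):

```python
import requests

response = requests.post(
    'http://localhost:8000/rag/chat',  # assumed mount point for this router
    json={
        'user_id': '123456',
        'question': 'Where can I try mi quang in Da Nang?',
        'similarity_top_k': 6,
        'limit_k': 10,
        'similarity_metric': 'cosine',
        'similarity_threshold': 0.75,
    },
)
print(response.json())  # expected shape: {'answer': '...', 'processing_time': ...}
```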
app/database/models.py
CHANGED
@@ -155,13 +155,10 @@ class ChatEngine(Base):
155     answer_model = Column(String, nullable=False)
156     system_prompt = Column(Text, nullable=True)
157     empty_response = Column(String, nullable=True)
158 -   characteristic = Column(Text, nullable=True)
159 -   historical_sessions_number = Column(Integer, default=3)
160     similarity_top_k = Column(Integer, default=3)
161     vector_distance_threshold = Column(Float, default=0.75)
162     grounding_threshold = Column(Float, default=0.2)
163     use_public_information = Column(Boolean, default=False)
164 -   pinecone_index_name = Column(String, default="testbot768")
165     status = Column(String, default="active")
166     created_at = Column(DateTime, server_default=func.now())
167     last_modified = Column(DateTime, server_default=func.now(), onupdate=func.now())

155     answer_model = Column(String, nullable=False)
156     system_prompt = Column(Text, nullable=True)
157     empty_response = Column(String, nullable=True)
158     similarity_top_k = Column(Integer, default=3)
159     vector_distance_threshold = Column(Float, default=0.75)
160     grounding_threshold = Column(Float, default=0.2)
161     use_public_information = Column(Boolean, default=False)
162     status = Column(String, default="active")
163     created_at = Column(DateTime, server_default=func.now())
164     last_modified = Column(DateTime, server_default=func.now(), onupdate=func.now())
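Dropping `characteristic`, `historical_sessions_number`, and `pinecone_index_name` from the model only changes the ORM mapping; an existing `chat_engine` table keeps those columns until a migration removes them. A hedged sketch of that cleanup (the table name is assumed from the model; verify against the real schema and the project's preferred migration tool first):

```python
from sqlalchemy import text

# Hedged sketch: align the database with the slimmed-down ChatEngine model.
# "chat_engine" is an assumed table name; check the model's __tablename__.
def drop_removed_columns(engine):
    with engine.begin() as conn:
        for column in ("characteristic", "historical_sessions_number", "pinecone_index_name"):
            conn.execute(text(f"ALTER TABLE chat_engine DROP COLUMN IF EXISTS {column}"))
```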
app/database/postgresql.py
CHANGED
@@ -12,22 +12,19 @@ logger = logging.getLogger(__name__)
12    # Load environment variables
13    load_dotenv()
14
15 -  # Define default PostgreSQL connection string
16 -  DEFAULT_DB_URL = os.getenv("AIVEN_DB_URL")
17 -  # Set the default DB URL with the correct domain (.l.)
18    # Get DB connection mode from environment
19    DB_CONNECTION_MODE = os.getenv("DB_CONNECTION_MODE", "aiven")
20
21    # Set connection string based on mode
22    if DB_CONNECTION_MODE == "aiven":
23 -      DATABASE_URL = os.getenv("AIVEN_DB_URL"
24    else:
25        # Default or other connection modes can be added here
26 -      DATABASE_URL = os.getenv("AIVEN_DB_URL"
27
28    if not DATABASE_URL:
29 -      logger.error("No database URL configured.
30 -      DATABASE_URL =
31
32    # Create SQLAlchemy engine with optimized settings
33    try:

12    # Load environment variables
13    load_dotenv()
14
15    # Get DB connection mode from environment
16    DB_CONNECTION_MODE = os.getenv("DB_CONNECTION_MODE", "aiven")
17
18    # Set connection string based on mode
19    if DB_CONNECTION_MODE == "aiven":
20 +      DATABASE_URL = os.getenv("AIVEN_DB_URL")
21    else:
22        # Default or other connection modes can be added here
23 +      DATABASE_URL = os.getenv("AIVEN_DB_URL")
24
25    if not DATABASE_URL:
26 +      logger.error("No database URL configured. Please set AIVEN_DB_URL environment variable.")
27 +      DATABASE_URL = "postgresql://localhost/test"  # Fallback to avoid crash on startup
28
29    # Create SQLAlchemy engine with optimized settings
30    try:
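The fallback URL keeps module import from failing when `AIVEN_DB_URL` is unset, at the cost of a connection that only works against a local test database. A minimal sketch of the engine setup the `try:` block presumably wraps (the pool values are illustrative assumptions, not the repo's exact settings):

```python
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

engine = create_engine(
    DATABASE_URL,
    pool_pre_ping=True,  # recycle dead connections; useful with managed Postgres like Aiven
    pool_size=5,
    max_overflow=10,
)
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
```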
app/models/pdf_models.py
CHANGED
@@ -10,15 +10,12 @@ class PDFUploadRequest(BaseModel):
10    vector_database_id: Optional[int] = Field(None, description="ID của vector database trong PostgreSQL để sử dụng")
11
12    class PDFResponse(BaseModel):
13 -      """Response model cho
14 -      success: bool = Field(
15 -      document_id: Optional[str] = Field(None, description="ID của tài liệu
16 -      document_database_id: Optional[int] = Field(None, description="ID của tài liệu trong PostgreSQL (nếu có)")
17        chunks_processed: Optional[int] = Field(None, description="Số lượng chunks đã xử lý")
18 -      total_text_length: Optional[int] = Field(None, description="Tổng
19 -      error: Optional[str] = Field(None, description="Thông báo lỗi
20 -      warning: Optional[str] = Field(None, description="Cảnh báo (nếu có)")
21 -      message: Optional[str] = Field(None, description="Thông báo thành công")
22
23        class Config:
24            schema_extra = {
@@ -26,8 +23,7 @@ class PDFResponse(BaseModel):
26                "success": True,
27                "document_id": "550e8400-e29b-41d4-a716-446655440000",
28                "chunks_processed": 25,
29 -              "total_text_length": 50000
30 -              "message": "Successfully processed document"
31            }
32        }
33
@@ -36,18 +32,14 @@ class DeleteDocumentRequest(BaseModel):
36    document_id: str = Field(..., description="ID của tài liệu cần xóa")
37    namespace: Optional[str] = Field("Default", description="Namespace trong Pinecone")
38    index_name: Optional[str] = Field("testbot768", description="Tên index trong Pinecone")
39 -  vector_database_id: Optional[int] = Field(None, description="ID của vector database trong PostgreSQL")
40
41    class DocumentsListResponse(BaseModel):
42 -      """Response model cho danh sách
43 -      success: bool = Field(
44 -      total_vectors: Optional[int] = Field(None, description="Tổng số vectors trong
45 -      namespace: Optional[str] = Field(None, description="Namespace
46 -      index_name: Optional[str] = Field(None, description="Tên index
47 -
48 -      postgresql_documents: Optional[List[Dict[str, Any]]] = Field(None, description="Danh sách documents từ PostgreSQL")
49 -      postgresql_document_count: Optional[int] = Field(None, description="Số lượng documents từ PostgreSQL")
50 -      error: Optional[str] = Field(None, description="Thông báo lỗi (nếu có)")
51
52        class Config:
53            schema_extra = {

10    vector_database_id: Optional[int] = Field(None, description="ID của vector database trong PostgreSQL để sử dụng")
11
12    class PDFResponse(BaseModel):
13 +      """Response model cho xử lý PDF"""
14 +      success: bool = Field(..., description="Trạng thái xử lý thành công hay không")
15 +      document_id: Optional[str] = Field(None, description="ID của tài liệu")
16        chunks_processed: Optional[int] = Field(None, description="Số lượng chunks đã xử lý")
17 +      total_text_length: Optional[int] = Field(None, description="Tổng độ dài văn bản")
18 +      error: Optional[str] = Field(None, description="Thông báo lỗi nếu có")
19
20        class Config:
21            schema_extra = {
23                "success": True,
24                "document_id": "550e8400-e29b-41d4-a716-446655440000",
25                "chunks_processed": 25,
26 +              "total_text_length": 50000
27            }
28        }
29
32    document_id: str = Field(..., description="ID của tài liệu cần xóa")
33    namespace: Optional[str] = Field("Default", description="Namespace trong Pinecone")
34    index_name: Optional[str] = Field("testbot768", description="Tên index trong Pinecone")
35
36    class DocumentsListResponse(BaseModel):
37 +      """Response model cho lấy danh sách tài liệu"""
38 +      success: bool = Field(..., description="Trạng thái xử lý thành công hay không")
39 +      total_vectors: Optional[int] = Field(None, description="Tổng số vectors trong index")
40 +      namespace: Optional[str] = Field(None, description="Namespace đang sử dụng")
41 +      index_name: Optional[str] = Field(None, description="Tên index đang sử dụng")
42 +      error: Optional[str] = Field(None, description="Thông báo lỗi nếu có")
43
44        class Config:
45            schema_extra = {
app/models/rag_models.py
CHANGED
@@ -1,7 +1,5 @@
1    from pydantic import BaseModel, Field
2    from typing import Optional, List, Dict, Any
3 -  from datetime import datetime
4 -  from pydantic import ConfigDict
5
6    class ChatRequest(BaseModel):
7        """Request model for chat endpoint"""
@@ -14,7 +12,7 @@ class ChatRequest(BaseModel):
14   similarity_top_k: int = Field(6, description="Number of top similar documents to return (after filtering)")
15   limit_k: int = Field(10, description="Maximum number of documents to retrieve from vector store")
16   similarity_metric: str = Field("cosine", description="Similarity metric to use (cosine, dotproduct, euclidean)")
17 - similarity_threshold: float = Field(0.
18
19   # User information
20   session_id: Optional[str] = Field(None, description="Session ID for tracking conversations")
@@ -67,58 +65,4 @@ class UserMessageModel(BaseModel):
67   similarity_top_k: Optional[int] = Field(None, description="Number of top similar documents to return (after filtering)")
68   limit_k: Optional[int] = Field(None, description="Maximum number of documents to retrieve from vector store")
69   similarity_metric: Optional[str] = Field(None, description="Similarity metric to use (cosine, dotproduct, euclidean)")
70 - similarity_threshold: Optional[float] = Field(None, description="Threshold for vector similarity (0-1)")
71 -
72 - class ChatEngineBase(BaseModel):
73 -     """Base model cho chat engine"""
74 -     name: str = Field(..., description="Tên của chat engine")
75 -     answer_model: str = Field(..., description="Model được dùng để trả lời")
76 -     system_prompt: Optional[str] = Field(None, description="Prompt của hệ thống, được đưa vào phần đầu tiên của final_prompt")
77 -     empty_response: Optional[str] = Field(None, description="Đoạn response khi answer model không có thông tin về câu hỏi")
78 -     characteristic: Optional[str] = Field(None, description="Tính cách của model khi trả lời câu hỏi")
79 -     historical_sessions_number: int = Field(3, description="Số lượng các cặp tin nhắn trong history được đưa vào final prompt")
80 -     use_public_information: bool = Field(False, description="Yes nếu answer model được quyền trả về thông tin mà nó có")
81 -     similarity_top_k: int = Field(3, description="Số lượng top similar documents để trả về")
82 -     vector_distance_threshold: float = Field(0.75, description="Threshold cho vector similarity")
83 -     grounding_threshold: float = Field(0.2, description="Threshold cho grounding")
84 -     pinecone_index_name: str = Field("testbot768", description="Vector database mà model được quyền sử dụng")
85 -     status: str = Field("active", description="Trạng thái của chat engine")
86 -
87 - class ChatEngineCreate(ChatEngineBase):
88 -     """Model cho việc tạo chat engine mới"""
89 -     pass
90 -
91 - class ChatEngineUpdate(BaseModel):
92 -     """Model cho việc cập nhật chat engine"""
93 -     name: Optional[str] = None
94 -     answer_model: Optional[str] = None
95 -     system_prompt: Optional[str] = None
96 -     empty_response: Optional[str] = None
97 -     characteristic: Optional[str] = None
98 -     historical_sessions_number: Optional[int] = None
99 -     use_public_information: Optional[bool] = None
100 -    similarity_top_k: Optional[int] = None
101 -    vector_distance_threshold: Optional[float] = None
102 -    grounding_threshold: Optional[float] = None
103 -    pinecone_index_name: Optional[str] = None
104 -    status: Optional[str] = None
105 -
106 - class ChatEngineResponse(ChatEngineBase):
107 -     """Response model cho chat engine"""
108 -     id: int
109 -     created_at: datetime
110 -     last_modified: datetime
111 -
112 -     model_config = ConfigDict(from_attributes=True)
113 -
114 - class ChatWithEngineRequest(BaseModel):
115 -     """Request model cho endpoint chat-with-engine"""
116 -     user_id: str = Field(..., description="User ID from Telegram")
117 -     question: str = Field(..., description="User's question")
118 -     include_history: bool = Field(True, description="Whether to include user history in prompt")
119 -
120 -     # User information
121 -     session_id: Optional[str] = Field(None, description="Session ID for tracking conversations")
122 -     first_name: Optional[str] = Field(None, description="User's first name")
123 -     last_name: Optional[str] = Field(None, description="User's last name")
124 -     username: Optional[str] = Field(None, description="User's username")

1    from pydantic import BaseModel, Field
2    from typing import Optional, List, Dict, Any
3
4    class ChatRequest(BaseModel):
5        """Request model for chat endpoint"""
12   similarity_top_k: int = Field(6, description="Number of top similar documents to return (after filtering)")
13   limit_k: int = Field(10, description="Maximum number of documents to retrieve from vector store")
14   similarity_metric: str = Field("cosine", description="Similarity metric to use (cosine, dotproduct, euclidean)")
15 + similarity_threshold: float = Field(0.75, description="Threshold for vector similarity (0-1)")
16
17   # User information
18   session_id: Optional[str] = Field(None, description="Session ID for tracking conversations")
65   similarity_top_k: Optional[int] = Field(None, description="Number of top similar documents to return (after filtering)")
66   limit_k: Optional[int] = Field(None, description="Maximum number of documents to retrieve from vector store")
67   similarity_metric: Optional[str] = Field(None, description="Similarity metric to use (cosine, dotproduct, euclidean)")
68 + similarity_threshold: Optional[float] = Field(None, description="Threshold for vector similarity (0-1)")
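With the threshold default now pinned at 0.75, a caller only has to supply the user and question; the retrieval knobs fall back to the Field defaults above (the user/question field names follow their usage in the chat route):

```python
from app.models.rag_models import ChatRequest

req = ChatRequest(user_id="123456", question="Best banh mi near My Khe beach?")
assert req.similarity_top_k == 6
assert req.limit_k == 10
assert req.similarity_metric == "cosine"
assert req.similarity_threshold == 0.75
```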
app/utils/cache_config.py
DELETED
@@ -1,45 +0,0 @@
1 -  """
2 -  Module cấu hình cho cache.
3 -
4 -  Module này chứa các tham số cấu hình và constants liên quan đến cache.
5 -  """
6 -
7 -  import os
8 -  from dotenv import load_dotenv
9 -
10 - # Load biến môi trường
11 - load_dotenv()
12 -
13 - # Cấu hình cache từ biến môi trường, có thể override bằng .env file
14 - CACHE_TTL_SECONDS = int(os.getenv("CACHE_TTL_SECONDS", "300"))  # Mặc định 5 phút
15 - CACHE_CLEANUP_INTERVAL = int(os.getenv("CACHE_CLEANUP_INTERVAL", "60"))  # Mặc định 1 phút
16 - CACHE_MAX_SIZE = int(os.getenv("CACHE_MAX_SIZE", "1000"))  # Mặc định 1000 phần tử
17 -
18 - # Cấu hình cho loại cache cụ thể
19 - CHAT_ENGINE_CACHE_TTL = int(os.getenv("CHAT_ENGINE_CACHE_TTL", str(CACHE_TTL_SECONDS)))
20 - MODEL_CONFIG_CACHE_TTL = int(os.getenv("MODEL_CONFIG_CACHE_TTL", str(CACHE_TTL_SECONDS)))
21 - RETRIEVER_CACHE_TTL = int(os.getenv("RETRIEVER_CACHE_TTL", str(CACHE_TTL_SECONDS)))
22 - PROMPT_TEMPLATE_CACHE_TTL = int(os.getenv("PROMPT_TEMPLATE_CACHE_TTL", str(CACHE_TTL_SECONDS)))
23 -
24 - # Cache keys prefix
25 - CHAT_ENGINE_CACHE_PREFIX = "chat_engine:"
26 - MODEL_CONFIG_CACHE_PREFIX = "model_config:"
27 - RETRIEVER_CACHE_PREFIX = "retriever:"
28 - PROMPT_TEMPLATE_CACHE_PREFIX = "prompt_template:"
29 -
30 - # Hàm helper để tạo cache key
31 - def get_chat_engine_cache_key(engine_id: int) -> str:
32 -     """Tạo cache key cho chat engine"""
33 -     return f"{CHAT_ENGINE_CACHE_PREFIX}{engine_id}"
34 -
35 - def get_model_config_cache_key(model_name: str) -> str:
36 -     """Tạo cache key cho model config"""
37 -     return f"{MODEL_CONFIG_CACHE_PREFIX}{model_name}"
38 -
39 - def get_retriever_cache_key(engine_id: int) -> str:
40 -     """Tạo cache key cho retriever"""
41 -     return f"{RETRIEVER_CACHE_PREFIX}{engine_id}"
42 -
43 - def get_prompt_template_cache_key(engine_id: int) -> str:
44 -     """Tạo cache key cho prompt template"""
45 -     return f"{PROMPT_TEMPLATE_CACHE_PREFIX}{engine_id}"
app/utils/pdf_processor.py
CHANGED
@@ -1,488 +1,292 @@
1     import os
2 -   import logging
3 -   import uuid
4 -   import pinecone
5 -   from app.utils.pinecone_fix import PineconeConnectionManager, check_connection
6     import time
7 -
8 -
9 -   # Langchain imports for document processing
10 -  from langchain_community.document_loaders import PyPDFLoader
11    from langchain.text_splitter import RecursiveCharacterTextSplitter
12    from langchain_google_genai import GoogleGenerativeAIEmbeddings
13 -  import
14
15 -  # Configure
16    logger = logging.getLogger(__name__)
17
18    class PDFProcessor:
19 -      """
20
21 -      def __init__(self, index_name="testbot768", namespace="Default", api_key=None, vector_db_id=None, mock_mode=False
22        self.index_name = index_name
23        self.namespace = namespace
24        self.api_key = api_key
25        self.vector_db_id = vector_db_id
26 -      self.
27 -      self.mock_mode =
28 -      self.correlation_id = correlation_id or str(uuid.uuid4())[:8]
29 -      self.google_api_key = os.environ.get("GOOGLE_API_KEY")
30
31 -
32        if self.api_key:
33 -
34 -          # Use connection manager from pinecone_fix
35 -          logger.info(f"[{self.correlation_id}] Initializing Pinecone connection to {self.index_name}")
36 -          self.pinecone_index = PineconeConnectionManager.get_index(self.api_key, self.index_name)
37 -          logger.info(f"[{self.correlation_id}] Successfully connected to Pinecone index {self.index_name}")
38 -      except Exception as e:
39 -          logger.error(f"[{self.correlation_id}] Failed to initialize Pinecone: {str(e)}")
40 -          # No fallback to mock mode - require a valid connection
41
42 -
43 -
44 -
45 -      This method:
46 -      1. Extracts text from PDF using PyPDFLoader
47 -      2. Splits text into chunks using RecursiveCharacterTextSplitter
48 -      3. Creates embeddings using Google Gemini model
49 -      4. Stores embeddings in Pinecone
50 -      """
51 -      logger.info(f"[{self.correlation_id}] Processing PDF: {file_path}")
52 -
53 -      try:
54 -          # Initialize metadata if not provided
55 -          if metadata is None:
56 -              metadata = {}
57
58 -
59 -
60 -
61 -
62 -          # Add document_id to metadata
63 -          metadata["document_id"] = document_id
64
65 -          #
66 -
67
68 -
69 -
70 -
71 -          await progress_callback(None, document_id, "text_extraction", 0.2, "Extracting text from PDF")
72
73 -
82
83 -
84 -
85 -              chunk_overlap=100,
86 -              length_function=len,
87 -              separators=["\n\n", "\n", " ", ""]
88 -          )
89
90 -
91
92 -
93
94 -          #
95 -
96 -
97
98 -
99 -
100 -
101
102 -
103
104 -
105 -
106           if not self.pinecone_index:
107 -
108
109 -
113 -          # Create embedding model
114 -          embedding_model = GoogleGenerativeAIEmbeddings(
115 -              model="models/embedding-001",
116 -              google_api_key=self.google_api_key,
117 -              task_type="retrieval_document"  # Use document embedding mode for longer text
118 -          )
119
120 -          #
121 -
122 -
123
124 -
125
126 -
127 -
128 -              logger.warning(f"[{self.correlation_id}] Embedding dimension mismatch: got {embedding_dimension}, need {pinecone_dimension}")
129
130 -
131 -
132 -
133 -                  logger.info(f"[{self.correlation_id}] Using duplication strategy to upscale from {embedding_dimension} to {pinecone_dimension}")
134 -
135 -                  if embedding_dimension * 2 == pinecone_dimension:
136 -                      # Perfect doubling (768 -> 1536)
137 -                      def adjust_embedding(embedding):
138 -                          # Duplicate each value to double the dimension
139 -                          return [val for val in embedding for _ in range(2)]
140 -                  else:
141 -                      # Generic padding with zeros
142 -                      pad_size = pinecone_dimension - embedding_dimension
143 -                      def adjust_embedding(embedding):
144 -                          return embedding + [0.0] * pad_size
145 -              else:
146 -                  # Truncation strategy - take first pinecone_dimension values
147 -                  logger.info(f"[{self.correlation_id}] Will truncate embeddings from {embedding_dimension} to {pinecone_dimension}")
148 -
149 -                  def adjust_embedding(embedding):
150 -                      return embedding[:pinecone_dimension]
151 -          else:
152 -              # No adjustment needed
153 -              def adjust_embedding(embedding):
154 -                  return embedding
155
156 -
160 -          for
161 -
162
163 -          #
164 -
165
166 -          #
167 -
168
169 -          #
170 -
175 -              if len(adjusted_embedding) != pinecone_dimension:
176 -                  raise ValueError(f"Dimension mismatch after adjustment: got {len(adjusted_embedding)}, expected {pinecone_dimension}")
177 -
178 -              # Create metadata for this chunk
179 -              chunk_metadata = {
180 -                  "document_id": document_id,
181 -                  "page": chunk.metadata.get("page", 0),
182 -                  "chunk_id": f"{document_id}-chunk-{i+j}",
183 -                  "text": chunk.page_content[:1000],  # Store first 1000 chars of text
184 -                  **metadata  # Include original metadata
185 -              }
186 -
187 -              # Create vector record
188 -              vector = {
189 -                  "id": f"{document_id}-{i+j}",
190 -                  "values": adjusted_embedding,
191 -                  "metadata": chunk_metadata
192 -              }
193 -
194 -              vectors_to_upsert.append(vector)
195
196 -
200 -              await progress_callback(None, document_id, "storing", 0.8, f"Storing {len(vectors_to_upsert)} vectors in Pinecone")
201
202 -
203
204 -
205 -          result = PineconeConnectionManager.upsert_vectors_with_validation(
206 -              self.pinecone_index,
207 -              vectors_to_upsert,
208 -              namespace=actual_namespace
209 -          )
210 -
211 -          logger.info(f"[{self.correlation_id}] Successfully upserted {result.get('upserted_count', 0)} vectors to Pinecone")
212
213           if progress_callback:
214 -              await progress_callback(
215
216 -          # Return success with stats
217           return {
218               "success": True,
219               "document_id": document_id,
220               "chunks_processed": len(chunks),
221 -             "total_text_length":
222 -             "vectors_created": len(vectors_to_upsert),
223 -             "vectors_upserted": result.get('upserted_count', 0),
224 -             "message": "PDF processed successfully"
225           }
226       except Exception as e:
227 -         logger.error(f"
228           return {
229               "success": False,
230 -             "error":
231           }
232
233 -    async def
234 -        """
235       try:
236           if not self.pinecone_index:
237 -             self.pinecone_index =
238
239 -
240 -
241 -
242
243 -
244 -
245 -             "namespaces": namespaces
246 -         }
247       except Exception as e:
248 -         logger.error(f"
249 -
250 -             "success": False,
251 -             "error": f"Error listing namespaces: {str(e)}"
252 -         }
253
254       async def delete_namespace(self):
255 -         """
256       try:
257 -
264 -         namespaces = stats.get("namespaces", {})
265 -
266 -         if self.namespace in namespaces:
267 -             vector_count = namespaces[self.namespace].get("vector_count", 0)
268 -             # Delete all vectors in namespace
269 -             self.pinecone_index.delete(delete_all=True, namespace=self.namespace)
270 -             return {
271 -                 "success": True,
272 -                 "namespace": self.namespace,
273 -                 "deleted_count": vector_count,
274 -                 "message": f"Successfully deleted namespace '{self.namespace}' with {vector_count} vectors"
275 -             }
276 -         else:
277 -             return {
278 -                 "success": True,
279 -                 "namespace": self.namespace,
280 -                 "deleted_count": 0,
281 -                 "message": f"Namespace '{self.namespace}' does not exist - nothing to delete"
282 -             }
283       except Exception as e:
284 -         logger.error(f"
285 -         return {
286 -             "success": False,
287 -             "namespace": self.namespace,
288 -             "error": f"Error deleting namespace: {str(e)}"
289 -         }
290
291 -    async def
292 -        """
293 -        logger.info(f"[{self.correlation_id}] Deleting vectors for document '{document_id}' from namespace '{self.namespace}'")
294 -
295       try:
296           if not self.pinecone_index:
297 -
298 -
299 -        # Use metadata filtering to find vectors with matching document_id
300 -        # The specific namespace to use might be vdb-X format if vector_db_id provided
301 -        actual_namespace = f"vdb-{self.vector_db_id}" if self.vector_db_id else self.namespace
302 -
303 -        # Try to find vectors using multiple approaches
304 -        filters = []
305
306 -        #
307 -
308 -
309 -        # If this is a UUID, try with different formats (with/without hyphens)
310 -        if len(document_id) >= 32:
311 -            # This looks like it might be a UUID - try variations
312 -            if "-" in document_id:
313 -                # If it has hyphens, try without
314 -                filters.append({"document_id": document_id.replace("-", "")})
315 -            else:
316 -                # If it doesn't have hyphens, try to format it as UUID
317 -                try:
318 -                    formatted_uuid = str(uuid.UUID(document_id))
319 -                    filters.append({"document_id": formatted_uuid})
320 -                except ValueError:
321 -                    pass
322 -
323 -        # Also try with title field if it could be a document name
324 -        if not document_id.startswith("doc-") and not document_id.startswith("test-doc-") and len(document_id) < 36:
325 -            # This might be a document title/name
326 -            filters.append({"title": document_id})
327 -
328 -        # If additional metadata was provided, use it to make extra filters
329 -        if additional_metadata:
330 -            if "document_name" in additional_metadata:
331 -                # Try exact name match
332 -                filters.append({"title": additional_metadata["document_name"]})
333 -
334 -                # Also try filename if name has extension
335 -                if "." in additional_metadata["document_name"]:
336 -                    filters.append({"filename": additional_metadata["document_name"]})
337 -
338 -        # Search for vectors with any of these filters
339 -        found_vectors = False
340 -        deleted_count = 0
341 -        filter_used = ""
342 -
343 -        logger.info(f"[{self.correlation_id}] Will try {len(filters)} different filters to find document")
344 -
345 -        for i, filter_query in enumerate(filters):
346 -            logger.info(f"[{self.correlation_id}] Searching for vectors with filter #{i+1}: {filter_query}")
347 -
348 -            # Search for vectors with this filter
349 -            try:
350 -                results = self.pinecone_index.query(
351 -                    vector=[0] * 1536,  # Dummy vector, we only care about metadata filter
352 -                    top_k=1,
353 -                    include_metadata=True,
354 -                    filter=filter_query,
355 -                    namespace=actual_namespace
356 -                )
357 -
358 -                if results and results.get("matches") and len(results.get("matches", [])) > 0:
359 -                    logger.info(f"[{self.correlation_id}] Found vectors matching filter: {filter_query}")
360 -                    found_vectors = True
361 -                    filter_used = str(filter_query)
362 -
363 -                    # Delete vectors by filter
364 -                    delete_result = self.pinecone_index.delete(
365 -                        filter=filter_query,
366 -                        namespace=actual_namespace
367 -                    )
368 -
369 -                    # Get delete count from result
370 -                    deleted_count = delete_result.get("deleted_count", 0)
371 -                    logger.info(f"[{self.correlation_id}] Deleted {deleted_count} vectors with filter: {filter_query}")
372 -                    break
373 -            except Exception as filter_error:
374 -                logger.warning(f"[{self.correlation_id}] Error searching with filter {filter_query}: {str(filter_error)}")
375 -                continue
376
377 -        #
378 -
379 -
380 -            return {
381 -                "success": True,  # Still return success=True to maintain backward compatibility
382 -                "document_id": document_id,
383 -                "namespace": actual_namespace,
384 -                "deleted_count": 0,
385 -                "warning": f"No vectors found for document '{document_id}' in namespace '{actual_namespace}'",
386 -                "message": f"Found 0 vectors for document '{document_id}' in namespace '{actual_namespace}'",
387 -                "vectors_found": False,
388 -                "vectors_deleted": 0
389 -            }
390
391          return {
392              "success": True,
393 -            "
394 -            "namespace":
395 -            "
396 -            "filter_used": filter_used,
397 -            "message": f"Successfully deleted {deleted_count} vectors for document '{document_id}' from namespace '{actual_namespace}'",
398 -            "vectors_found": True,
399 -            "vectors_deleted": deleted_count
400          }
401      except Exception as e:
402 -        logger.error(f"
403          return {
404              "success": False,
405 -            "
406 -
407 -            "vectors_found": False,
408 -            "vectors_deleted": 0
409 -        }
410 -
411 -    async def list_documents(self):
412 -        """List all documents in a namespace"""
413 -        # The namespace to use might be vdb-X format if vector_db_id provided
414 -        actual_namespace = f"vdb-{self.vector_db_id}" if self.vector_db_id else self.namespace
415 -
416 -        try:
417 -            if not self.pinecone_index:
418 -                self.pinecone_index = PineconeConnectionManager.get_index(self.api_key, self.index_name)
419 -
420 -            logger.info(f"[{self.correlation_id}] Listing documents in namespace '{actual_namespace}'")
421 -
422 -            # Get index stats for namespace
423 -            stats = self.pinecone_index.describe_index_stats()
424 -            namespace_stats = stats.get("namespaces", {}).get(actual_namespace, {})
425 -            vector_count = namespace_stats.get("vector_count", 0)
426 -
427 -            if vector_count == 0:
428 -                # No vectors in namespace
429 -                return DocumentsListResponse(
430 -                    success=True,
431 -                    total_vectors=0,
432 -                    namespace=actual_namespace,
433 -                    index_name=self.index_name,
434 -                    documents=[]
435 -                ).dict()
436 -
437 -            # Query for vectors with a dummy vector to get back metadata
438 -            # This is not efficient but is a simple approach to extract document info
439 -            results = self.pinecone_index.query(
440 -                vector=[0] * stats.dimension,  # Use index dimensions
441 -                top_k=min(vector_count, 1000),  # Get at most 1000 vectors
442 -                include_metadata=True,
443 -                namespace=actual_namespace
444 -            )
445 -
446 -            # Process results to extract unique documents
447 -            seen_documents = set()
448 -            documents = []
449 -
450 -            for match in results.get("matches", []):
451 -                metadata = match.get("metadata", {})
452 -                document_id = metadata.get("document_id")
453 -
454 -                if document_id and document_id not in seen_documents:
455 -                    seen_documents.add(document_id)
456 -                    doc_info = {
457 -                        "id": document_id,
458 -                        "title": metadata.get("title"),
459 -                        "filename": metadata.get("filename"),
460 -                        "content_type": metadata.get("content_type"),
461 -                        "chunk_count": 0
462 -                    }
463 -                    documents.append(doc_info)
464 -
465 -                # Count chunks for this document
466 -                for doc in documents:
467 -                    if doc["id"] == document_id:
468 -                        doc["chunk_count"] += 1
469 -                        break
470 -
471 -            return DocumentsListResponse(
472 -                success=True,
473 -                total_vectors=vector_count,
474 -                namespace=actual_namespace,
475 -                index_name=self.index_name,
476 -                documents=documents
477 -            ).dict()
478 -
479 -        except Exception as e:
480 -            logger.error(f"[{self.correlation_id}] Error listing documents: {str(e)}")
481 -            return DocumentsListResponse(
482 -                success=False,
483 -                error=f"Error listing documents: {str(e)}"
484 -            ).dict()
485 -
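The removed block around old lines 120–154 adapted embedding vectors to the index width: duplicate each value when the index is exactly twice as wide (768 -> 1536), zero-pad for other upscales, truncate when the embedding is too wide, and pass through when the sizes already match. A standalone restatement of that selection logic, for reference:

```python
def make_adjuster(embedding_dim: int, index_dim: int):
    """Return a function mapping an embedding onto the index's dimension,
    restating the strategy in the removed code above."""
    if embedding_dim == index_dim:
        return lambda emb: emb                               # no adjustment needed
    if embedding_dim < index_dim:
        if embedding_dim * 2 == index_dim:                   # perfect doubling, e.g. 768 -> 1536
            return lambda emb: [v for v in emb for _ in range(2)]
        pad_size = index_dim - embedding_dim                 # generic padding with zeros
        return lambda emb: emb + [0.0] * pad_size
    return lambda emb: emb[:index_dim]                       # truncation strategy
```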
1     import os
2     import time
3 +   import uuid
4     from langchain.text_splitter import RecursiveCharacterTextSplitter
5 +   from langchain_community.document_loaders import PyPDFLoader
6     from langchain_google_genai import GoogleGenerativeAIEmbeddings
7 +   import logging
8 +   from pinecone import Pinecone
9 +
10 +  from app.database.pinecone import get_pinecone_index, init_pinecone
11 +  from app.database.postgresql import get_db
12 +  from app.database.models import VectorDatabase
13
14 +  # Configure logging
15    logger = logging.getLogger(__name__)
16
17 +  # Initialize embeddings model
18 +  embeddings_model = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
19 +
20    class PDFProcessor:
21 +      """Class for processing PDF files and creating embeddings"""
22
23 +      def __init__(self, index_name="testbot768", namespace="Default", api_key=None, vector_db_id=None, mock_mode=False):
24 +          """Initialize with Pinecone index name, namespace and API key"""
25            self.index_name = index_name
26            self.namespace = namespace
27 +          self.pinecone_index = None
28            self.api_key = api_key
29            self.vector_db_id = vector_db_id
30 +          self.pinecone_client = None
31 +          self.mock_mode = mock_mode  # Add mock mode for testing
32
33 +      def _get_api_key_from_db(self):
34 +          """Get API key from database if not provided directly"""
35            if self.api_key:
36 +              return self.api_key
37
38 +          if not self.vector_db_id:
39 +              logger.error("No API key provided and no vector_db_id to fetch from database")
40 +              return None
41
42 +          try:
43 +              # Get database session
44 +              db = next(get_db())
45
46 +              # Get vector database
47 +              vector_db = db.query(VectorDatabase).filter(
48 +                  VectorDatabase.id == self.vector_db_id
49 +              ).first()
50
51 +              if not vector_db:
52 +                  logger.error(f"Vector database with ID {self.vector_db_id} not found")
53 +                  return None
54
55 +              # Get API key from relationship
56 +              if hasattr(vector_db, 'api_key_ref') and vector_db.api_key_ref and hasattr(vector_db.api_key_ref, 'key_value'):
57 +                  logger.info(f"Using API key from api_key table for vector database ID {self.vector_db_id}")
58 +                  return vector_db.api_key_ref.key_value
59 +
60 +              logger.error(f"No API key found for vector database ID {self.vector_db_id}. Make sure the api_key_id is properly set.")
61 +              return None
62 +          except Exception as e:
63 +              logger.error(f"Error fetching API key from database: {e}")
64 +              return None
65 +
66 +      def _init_pinecone_connection(self):
67 +          """Initialize connection to Pinecone with new API"""
68 +          try:
69 +              # If in mock mode, return a mock index
70 +              if self.mock_mode:
71 +                  logger.info("Running in mock mode - simulating Pinecone connection")
72 +                  class MockPineconeIndex:
73 +                      def upsert(self, vectors, namespace=None):
74 +                          logger.info(f"Mock upsert: {len(vectors)} vectors to namespace '{namespace}'")
75 +                          return {"upserted_count": len(vectors)}
76 +
77 +                      def delete(self, ids=None, delete_all=False, namespace=None):
78 +                          logger.info(f"Mock delete: {'all vectors' if delete_all else f'{len(ids)} vectors'} from namespace '{namespace}'")
79 +                          return {"deleted_count": 10 if delete_all else len(ids or [])}
80 +
81 +                      def describe_index_stats(self):
82 +                          logger.info(f"Mock describe_index_stats")
83 +                          return {"total_vector_count": 100, "namespaces": {self.namespace: {"vector_count": 50}}}
84 +
85 +                  return MockPineconeIndex()
86
87 +              # Get API key from database if not provided
88 +              api_key = self._get_api_key_from_db()
89
90 +              if not api_key or not self.index_name:
91 +                  logger.error("Pinecone API key or index name not available")
92 +                  return None
93
94 +              # Initialize Pinecone client using the new API
95 +              self.pinecone_client = Pinecone(api_key=api_key)
96
97 +              # Get the index
98 +              index_list = self.pinecone_client.list_indexes()
99 +              existing_indexes = index_list.names() if hasattr(index_list, 'names') else []
100
101 +              if self.index_name not in existing_indexes:
102 +                  logger.error(f"Index {self.index_name} does not exist in Pinecone")
103 +                  return None
104
105 +              # Connect to the index
106 +              index = self.pinecone_client.Index(self.index_name)
107 +              logger.info(f"Connected to Pinecone index: {self.index_name}")
logger.info(f"Connected to Pinecone index: {self.index_name}")
|
108 |
+
return index
|
109 |
+
except Exception as e:
|
110 |
+
logger.error(f"Error connecting to Pinecone: {e}")
|
111 |
+
return None
|
112 |
|
113 |
+
async def process_pdf(self, file_path, document_id=None, metadata=None, progress_callback=None):
|
114 |
+
"""
|
115 |
+
Process PDF file, split into chunks and create embeddings
|
116 |
+
|
117 |
+
Args:
|
118 |
+
file_path (str): Path to the PDF file
|
119 |
+
document_id (str, optional): Document ID, if not provided a new ID will be created
|
120 |
+
metadata (dict, optional): Additional metadata for the document
|
121 |
+
progress_callback (callable, optional): Callback function for progress updates
|
122 |
+
|
123 |
+
Returns:
|
124 |
+
dict: Processing result information including document_id and processed chunks count
|
125 |
+
"""
|
126 |
+
try:
|
127 |
+
# Initialize Pinecone connection if not already done
|
128 |
+
self.pinecone_index = self._init_pinecone_connection()
|
129 |
if not self.pinecone_index:
|
130 |
+
return {"success": False, "error": "Could not connect to Pinecone"}
|
131 |
|
132 |
+
# Create document_id if not provided
|
133 |
+
if not document_id:
|
134 |
+
document_id = str(uuid.uuid4())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
135 |
|
136 |
+
# Load PDF using PyPDFLoader
|
137 |
+
logger.info(f"Reading PDF file: {file_path}")
|
138 |
+
if progress_callback:
|
139 |
+
await progress_callback("pdf_loading", 0.5, "Loading PDF file")
|
140 |
+
|
141 |
+
loader = PyPDFLoader(file_path)
|
142 |
+
pages = loader.load()
|
143 |
|
144 |
+
# Extract and concatenate text from all pages
|
145 |
+
all_text = ""
|
146 |
+
for page in pages:
|
147 |
+
all_text += page.page_content + "\n"
|
148 |
|
149 |
+
if progress_callback:
|
150 |
+
await progress_callback("text_extraction", 0.6, "Extracted text from PDF")
|
|
|
151 |
|
152 |
+
# Split text into chunks
|
153 |
+
text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=300)
|
154 |
+
chunks = text_splitter.split_text(all_text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
155 |
|
156 |
+
logger.info(f"Split PDF file into {len(chunks)} chunks")
|
157 |
+
if progress_callback:
|
158 |
+
await progress_callback("chunking", 0.7, f"Split document into {len(chunks)} chunks")
|
159 |
+
|
160 |
+
# Process embeddings for each chunk and upsert to Pinecone
|
161 |
+
vectors = []
|
162 |
+
for i, chunk in enumerate(chunks):
|
163 |
+
# Update embedding progress
|
164 |
+
if progress_callback and i % 5 == 0: # Update every 5 chunks to avoid too many notifications
|
165 |
+
embedding_progress = 0.7 + (0.3 * (i / len(chunks)))
|
166 |
+
await progress_callback("embedding", embedding_progress, f"Processing chunk {i+1}/{len(chunks)}")
|
167 |
|
168 |
+
# Create vector embedding for each chunk
|
169 |
+
vector = embeddings_model.embed_query(chunk)
|
170 |
+
|
171 |
+
# Prepare metadata for vector
|
172 |
+
vector_metadata = {
|
173 |
+
"document_id": document_id,
|
174 |
+
"chunk_index": i,
|
175 |
+
"text": chunk
|
176 |
+
}
|
177 |
|
178 |
+
# Add additional metadata if provided
|
179 |
+
if metadata:
|
180 |
+
for key, value in metadata.items():
|
181 |
+
if key not in vector_metadata:
|
182 |
+
vector_metadata[key] = value
|
183 |
|
184 |
+
# Add vector to list for upserting
|
185 |
+
vectors.append({
|
186 |
+
"id": f"{document_id}_{i}",
|
187 |
+
"values": vector,
|
188 |
+
"metadata": vector_metadata
|
189 |
+
})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
190 |
|
191 |
+
# Upsert in batches of 100 to avoid overloading
|
192 |
+
if len(vectors) >= 100:
|
193 |
+
await self._upsert_vectors(vectors)
|
194 |
+
vectors = []
|
|
|
195 |
|
196 |
+
# Upsert any remaining vectors
|
197 |
+
if vectors:
|
198 |
+
await self._upsert_vectors(vectors)
|
199 |
|
200 |
+
logger.info(f"Embedded and saved {len(chunks)} chunks from PDF with document_id: {document_id}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
201 |
|
202 |
+
# Final progress update
|
203 |
if progress_callback:
|
204 |
+
await progress_callback("completed", 1.0, "PDF processing complete")
|
205 |
|
|
|
206 |
return {
|
207 |
"success": True,
|
208 |
"document_id": document_id,
|
209 |
"chunks_processed": len(chunks),
|
210 |
+
"total_text_length": len(all_text)
|
|
|
|
|
|
|
211 |
}
|
212 |
+
|
213 |
except Exception as e:
|
214 |
+
logger.error(f"Error processing PDF: {str(e)}")
|
215 |
+
if progress_callback:
|
216 |
+
await progress_callback("error", 0, f"Error processing PDF: {str(e)}")
|
217 |
return {
|
218 |
"success": False,
|
219 |
+
"error": str(e)
|
220 |
}
|
221 |
|
222 |
+
async def _upsert_vectors(self, vectors):
|
223 |
+
"""Upsert vectors to Pinecone"""
|
224 |
try:
|
225 |
+
if not vectors:
|
226 |
+
return
|
227 |
+
|
228 |
+
# Ensure we have a valid pinecone_index
|
229 |
if not self.pinecone_index:
|
230 |
+
self.pinecone_index = self._init_pinecone_connection()
|
231 |
+
if not self.pinecone_index:
|
232 |
+
raise Exception("Cannot connect to Pinecone")
|
233 |
|
234 |
+
result = self.pinecone_index.upsert(
|
235 |
+
vectors=vectors,
|
236 |
+
namespace=self.namespace
|
237 |
+
)
|
238 |
|
239 |
+
logger.info(f"Upserted {len(vectors)} vectors to Pinecone")
|
240 |
+
return result
|
|
|
|
|
241 |
except Exception as e:
|
242 |
+
logger.error(f"Error upserting vectors: {str(e)}")
|
243 |
+
raise
|
|
|
|
|
|
|
244 |
|
245 |
async def delete_namespace(self):
|
246 |
+
"""
|
247 |
+
Delete all vectors in the current namespace (equivalent to deleting the namespace).
|
248 |
+
"""
|
249 |
+
# Initialize connection if needed
|
250 |
+
self.pinecone_index = self._init_pinecone_connection()
|
251 |
+
if not self.pinecone_index:
|
252 |
+
return {"success": False, "error": "Could not connect to Pinecone"}
|
253 |
+
|
254 |
try:
|
255 |
+
# delete_all=True will delete all vectors in the namespace
|
256 |
+
result = self.pinecone_index.delete(
|
257 |
+
delete_all=True,
|
258 |
+
namespace=self.namespace
|
259 |
+
)
|
260 |
+
logger.info(f"Deleted namespace '{self.namespace}' (all vectors).")
|
261 |
+
return {"success": True, "detail": result}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
262 |
except Exception as e:
|
263 |
+
logger.error(f"Error deleting namespace '{self.namespace}': {e}")
|
264 |
+
return {"success": False, "error": str(e)}
|
|
|
|
|
|
|
|
|
265 |
|
266 |
+
async def list_documents(self):
|
267 |
+
"""Get list of all document_ids from Pinecone"""
|
|
|
|
|
268 |
try:
|
269 |
+
# Initialize Pinecone connection if not already done
|
270 |
+
self.pinecone_index = self._init_pinecone_connection()
|
271 |
if not self.pinecone_index:
|
272 |
+
return {"success": False, "error": "Could not connect to Pinecone"}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
273 |
|
274 |
+
# Get index information
|
275 |
+
stats = self.pinecone_index.describe_index_stats()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
276 |
|
277 |
+
# Query to get list of all unique document_ids
|
278 |
+
# This method may not be efficient with large datasets, but is the simplest approach
|
279 |
+
# In practice, you should maintain a list of document_ids in a separate database
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
280 |
|
281 |
return {
|
282 |
"success": True,
|
283 |
+
"total_vectors": stats.get('total_vector_count', 0),
|
284 |
+
"namespace": self.namespace,
|
285 |
+
"index_name": self.index_name
|
|
|
|
|
|
|
|
|
286 |
}
|
287 |
except Exception as e:
|
288 |
+
logger.error(f"Error getting document list: {str(e)}")
|
289 |
return {
|
290 |
"success": False,
|
291 |
+
"error": str(e)
|
292 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
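A minimal usage sketch of the new `PDFProcessor` class (the file name `sample.pdf` and the `demo` namespace are illustrative; `mock_mode=True` stubs out Pinecone and the database lookup, but `embed_query` still calls the Google Generative AI API, so a valid `GOOGLE_API_KEY` must be set in the environment):

```python
import asyncio

from app.utils.pdf_processor import PDFProcessor


async def ingest():
    # mock_mode=True swaps the real Pinecone index for the in-memory stub above
    processor = PDFProcessor(namespace="demo", mock_mode=True)

    async def on_progress(stage, progress, message):
        # Receives the "pdf_loading" / "chunking" / "embedding" / "completed" updates
        print(f"[{stage}] {progress:.0%} {message}")

    # "sample.pdf" is a hypothetical local file
    result = await processor.process_pdf("sample.pdf", progress_callback=on_progress)
    print(result)  # e.g. {"success": True, "document_id": "...", "chunks_processed": N, ...}


asyncio.run(ingest())
```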
app/utils/pinecone_fix.py
DELETED
@@ -1,194 +0,0 @@
"""
Improved Pinecone connection handling with dimension validation.
This module provides more robust connection and error handling for Pinecone operations.
"""
import logging
import time
from typing import Optional, Dict, Any, Tuple, List
import pinecone
from pinecone import Pinecone, ServerlessSpec, PodSpec

logger = logging.getLogger(__name__)

# Default retry settings
DEFAULT_MAX_RETRIES = 3
DEFAULT_RETRY_DELAY = 2

class PineconeConnectionManager:
    """
    Manages Pinecone connections with enhanced error handling and dimension validation.

    This class centralizes Pinecone connection logic, providing:
    - Connection pooling/reuse
    - Automatic retries with exponential backoff
    - Dimension validation before operations
    - Detailed error logging for better debugging
    """

    # Class-level cache of Pinecone clients
    _clients = {}

    @classmethod
    def get_client(cls, api_key: str) -> Pinecone:
        """
        Returns a Pinecone client for the given API key, creating one if needed.

        Args:
            api_key: Pinecone API key

        Returns:
            Initialized Pinecone client
        """
        if not api_key:
            raise ValueError("Pinecone API key cannot be empty")

        # Return cached client if it exists
        if api_key in cls._clients:
            return cls._clients[api_key]

        # Log client creation (but hide full API key)
        key_prefix = api_key[:4] + "..." if len(api_key) > 4 else "invalid"
        logger.info(f"Creating new Pinecone client with API key (first 4 chars: {key_prefix}...)")

        try:
            # Initialize Pinecone client
            client = Pinecone(api_key=api_key)
            cls._clients[api_key] = client
            logger.info("Pinecone client created successfully")
            return client
        except Exception as e:
            logger.error(f"Failed to create Pinecone client: {str(e)}")
            raise RuntimeError(f"Pinecone client initialization failed: {str(e)}") from e

    @classmethod
    def get_index(cls,
                  api_key: str,
                  index_name: str,
                  max_retries: int = DEFAULT_MAX_RETRIES) -> Any:
        """
        Get a Pinecone index with retry logic.

        Args:
            api_key: Pinecone API key
            index_name: Name of the index to connect to
            max_retries: Maximum number of retry attempts

        Returns:
            Pinecone index
        """
        client = cls.get_client(api_key)

        # Retry logic for connection issues
        for attempt in range(max_retries):
            try:
                index = client.Index(index_name)
                # Test the connection
                _ = index.describe_index_stats()
                logger.info(f"Connected to Pinecone index: {index_name}")
                return index
            except Exception as e:
                if attempt < max_retries - 1:
                    wait_time = DEFAULT_RETRY_DELAY * (2 ** attempt)  # Exponential backoff
                    logger.warning(f"Pinecone connection attempt {attempt+1} failed: {e}. Retrying in {wait_time}s...")
                    time.sleep(wait_time)
                else:
                    logger.error(f"Failed to connect to Pinecone index after {max_retries} attempts: {e}")
                    raise RuntimeError(f"Pinecone index connection failed: {str(e)}") from e

    @classmethod
    def validate_dimensions(cls,
                            index: Any,
                            vector_dimensions: int) -> Tuple[bool, Optional[str]]:
        """
        Validate that the vector dimensions match the Pinecone index configuration.

        Args:
            index: Pinecone index
            vector_dimensions: Dimensions of the vectors to be uploaded

        Returns:
            Tuple of (is_valid, error_message)
        """
        try:
            # Get index stats
            stats = index.describe_index_stats()
            index_dimensions = stats.dimension

            if index_dimensions != vector_dimensions:
                error_msg = (f"Vector dimensions mismatch: Your vectors have {vector_dimensions} dimensions, "
                             f"but Pinecone index expects {index_dimensions} dimensions")
                logger.error(error_msg)
                return False, error_msg

            return True, None
        except Exception as e:
            error_msg = f"Failed to validate dimensions: {str(e)}"
            logger.error(error_msg)
            return False, error_msg

    @classmethod
    def upsert_vectors_with_validation(cls,
                                       index: Any,
                                       vectors: List[Dict[str, Any]],
                                       namespace: str = "",
                                       batch_size: int = 100) -> Dict[str, Any]:
        """
        Upsert vectors with dimension validation and batching.

        Args:
            index: Pinecone index
            vectors: List of vectors to upsert, each with 'id', 'values', and optional 'metadata'
            namespace: Namespace to upsert to
            batch_size: Size of batches for upserting

        Returns:
            Result of upsert operation
        """
        if not vectors:
            return {"upserted_count": 0, "success": True}

        # Validate dimensions with the first vector
        if "values" in vectors[0] and len(vectors[0]["values"]) > 0:
            vector_dim = len(vectors[0]["values"])
            is_valid, error_msg = cls.validate_dimensions(index, vector_dim)

            if not is_valid:
                logger.error(f"Dimension validation failed: {error_msg}")
                raise ValueError(f"Vector dimensions do not match Pinecone index configuration: {error_msg}")

        # Batch upsert
        total_upserted = 0
        for i in range(0, len(vectors), batch_size):
            batch = vectors[i:i+batch_size]
            try:
                result = index.upsert(vectors=batch, namespace=namespace)
                batch_upserted = result.get("upserted_count", len(batch))
                total_upserted += batch_upserted
                logger.info(f"Upserted batch {i//batch_size + 1}: {batch_upserted} vectors")
            except Exception as e:
                logger.error(f"Failed to upsert batch {i//batch_size + 1}: {str(e)}")
                raise RuntimeError(f"Vector upsert failed: {str(e)}") from e

        return {"upserted_count": total_upserted, "success": True}

# Simplified function to check connection
def check_connection(api_key: str, index_name: str) -> bool:
    """
    Test Pinecone connection and validate index exists.

    Args:
        api_key: Pinecone API key
        index_name: Name of index to test

    Returns:
        True if connection successful, False otherwise
    """
    try:
        index = PineconeConnectionManager.get_index(api_key, index_name)
        stats = index.describe_index_stats()
        total_vectors = stats.total_vector_count
        logger.info(f"Pinecone connection is working. Total vectors: {total_vectors}")
        return True
    except Exception as e:
        logger.error(f"Pinecone connection failed: {str(e)}")
        return False
docs/api_documentation.md
ADDED
@@ -0,0 +1,581 @@
# API Documentation

## Frontend Setup

```javascript
// Basic Axios setup
import axios from 'axios';

const api = axios.create({
  baseURL: 'https://api.your-domain.com',
  timeout: 10000,
  headers: {
    'Content-Type': 'application/json',
    'Accept': 'application/json'
  }
});

// Error handling
api.interceptors.response.use(
  response => response.data,
  error => {
    const errorMessage = error.response?.data?.detail || 'An error occurred';
    console.error('API Error:', errorMessage);
    return Promise.reject(errorMessage);
  }
);
```

## Caching System

- All GET endpoints support `use_cache=true` parameter (default)
- Cache TTL: 300 seconds (5 minutes)
- Cache is automatically invalidated on data changes
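For example, a client can pass `use_cache=false` to force a fresh read right after a write; a minimal Python sketch, assuming the API runs at `http://localhost:8000`:

```python
import requests

BASE_URL = "http://localhost:8000"  # assumed local deployment

# A first read may hit the database; repeats within the 300s TTL come from cache
faqs = requests.get(f"{BASE_URL}/postgres/faq", params={"active_only": "true"}).json()

# Bypass the cache to read fresh data
fresh = requests.get(
    f"{BASE_URL}/postgres/faq",
    params={"active_only": "true", "use_cache": "false"},
).json()
```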
## Authentication

Currently no authentication is required. If implemented in the future, use JWT Bearer tokens:

```javascript
const api = axios.create({
  // ...other config
  headers: {
    // ...other headers
    'Authorization': `Bearer ${token}`
  }
});
```

## Error Codes

| Code | Description |
|------|-------------|
| 400 | Bad Request |
| 404 | Not Found |
| 500 | Internal Server Error |
| 503 | Service Unavailable |
## PostgreSQL Endpoints

### FAQ Endpoints

#### Get FAQs List
```
GET /postgres/faq
```

Parameters:
- `skip`: Number of items to skip (default: 0)
- `limit`: Maximum items to return (default: 100)
- `active_only`: Return only active items (default: false)
- `use_cache`: Use cached data if available (default: true)

Response:
```json
[
  {
    "question": "How do I book a hotel?",
    "answer": "You can book a hotel through our app or website.",
    "is_active": true,
    "id": 1,
    "created_at": "2023-01-01T00:00:00",
    "updated_at": "2023-01-01T00:00:00"
  }
]
```

Example:
```javascript
async function getFAQs() {
  try {
    const data = await api.get('/postgres/faq', {
      params: { active_only: true, limit: 20 }
    });
    return data;
  } catch (error) {
    console.error('Error fetching FAQs:', error);
    throw error;
  }
}
```
#### Create FAQ
```
POST /postgres/faq
```

Request Body:
```json
{
  "question": "How do I book a hotel?",
  "answer": "You can book a hotel through our app or website.",
  "is_active": true
}
```

Response: Created FAQ object
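Example (Python; assumes the API runs at `http://localhost:8000`):

```python
import requests

BASE_URL = "http://localhost:8000"  # assumed local deployment

new_faq = {
    "question": "How do I book a hotel?",
    "answer": "You can book a hotel through our app or website.",
    "is_active": True,
}

# The API returns the created FAQ object, including its generated id
response = requests.post(f"{BASE_URL}/postgres/faq", json=new_faq)
response.raise_for_status()
print(response.json()["id"])
```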
#### Get FAQ Detail
```
GET /postgres/faq/{faq_id}
```

Parameters:
- `faq_id`: ID of FAQ (required)
- `use_cache`: Use cached data if available (default: true)

Response: FAQ object

#### Update FAQ
```
PUT /postgres/faq/{faq_id}
```

Parameters:
- `faq_id`: ID of FAQ to update (required)

Request Body: Partial or complete FAQ object
Response: Updated FAQ object

#### Delete FAQ
```
DELETE /postgres/faq/{faq_id}
```

Parameters:
- `faq_id`: ID of FAQ to delete (required)

Response:
```json
{
  "status": "success",
  "message": "FAQ item 1 deleted"
}
```
#### Batch Operations

Create multiple FAQs:
```
POST /postgres/faqs/batch
```

Update status of multiple FAQs:
```
PUT /postgres/faqs/batch-update-status
```

Delete multiple FAQs:
```
DELETE /postgres/faqs/batch
```
### Emergency Contact Endpoints

#### Get Emergency Contacts
```
GET /postgres/emergency
```

Parameters:
- `skip`: Number of items to skip (default: 0)
- `limit`: Maximum items to return (default: 100)
- `active_only`: Return only active items (default: false)
- `use_cache`: Use cached data if available (default: true)

Response: Array of Emergency Contact objects

#### Create Emergency Contact
```
POST /postgres/emergency
```

Request Body:
```json
{
  "name": "Fire Department",
  "phone_number": "114",
  "description": "Fire rescue services",
  "address": "Da Nang",
  "location": "16.0544, 108.2022",
  "priority": 1,
  "is_active": true
}
```

Response: Created Emergency Contact object

#### Get Emergency Contact
```
GET /postgres/emergency/{emergency_id}
```

#### Update Emergency Contact
```
PUT /postgres/emergency/{emergency_id}
```

#### Delete Emergency Contact
```
DELETE /postgres/emergency/{emergency_id}
```

#### Batch Operations

Create multiple Emergency Contacts:
```
POST /postgres/emergency/batch
```

Update status of multiple Emergency Contacts:
```
PUT /postgres/emergency/batch-update-status
```

Delete multiple Emergency Contacts:
```
DELETE /postgres/emergency/batch
```
+
### Event Endpoints
|
241 |
+
|
242 |
+
#### Get Events
|
243 |
+
```
|
244 |
+
GET /postgres/events
|
245 |
+
```
|
246 |
+
|
247 |
+
Parameters:
|
248 |
+
- `skip`: Number of items to skip (default: 0)
|
249 |
+
- `limit`: Maximum items to return (default: 100)
|
250 |
+
- `active_only`: Return only active items (default: false)
|
251 |
+
- `featured_only`: Return only featured items (default: false)
|
252 |
+
- `use_cache`: Use cached data if available (default: true)
|
253 |
+
|
254 |
+
Response: Array of Event objects
|
255 |
+
|
256 |
+
#### Create Event
|
257 |
+
```
|
258 |
+
POST /postgres/events
|
259 |
+
```
|
260 |
+
|
261 |
+
Request Body:
|
262 |
+
```json
|
263 |
+
{
|
264 |
+
"name": "Da Nang Fireworks Festival",
|
265 |
+
"description": "International Fireworks Festival Da Nang 2023",
|
266 |
+
"address": "Dragon Bridge, Da Nang",
|
267 |
+
"location": "16.0610, 108.2277",
|
268 |
+
"date_start": "2023-06-01T19:00:00",
|
269 |
+
"date_end": "2023-06-01T22:00:00",
|
270 |
+
"price": [
|
271 |
+
{"type": "VIP", "amount": 500000},
|
272 |
+
{"type": "Standard", "amount": 300000}
|
273 |
+
],
|
274 |
+
"url": "https://danangfireworks.com",
|
275 |
+
"is_active": true,
|
276 |
+
"featured": true
|
277 |
+
}
|
278 |
+
```
|
279 |
+
|
280 |
+
Response: Created Event object
|
281 |
+
|
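Example (Python; assumes the API runs at `http://localhost:8000`):

```python
import requests

BASE_URL = "http://localhost:8000"  # assumed local deployment

event = {
    "name": "Da Nang Fireworks Festival",
    "address": "Dragon Bridge, Da Nang",
    "date_start": "2023-06-01T19:00:00",
    "date_end": "2023-06-01T22:00:00",
    # price is a list of tier objects, as in the request body above
    "price": [{"type": "VIP", "amount": 500000}],
    "is_active": True,
    "featured": True,
}

created = requests.post(f"{BASE_URL}/postgres/events", json=event).json()
print(created["id"])
```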
#### Get Event
```
GET /postgres/events/{event_id}
```

#### Update Event
```
PUT /postgres/events/{event_id}
```

#### Delete Event
```
DELETE /postgres/events/{event_id}
```

#### Batch Operations

Create multiple Events:
```
POST /postgres/events/batch
```

Update status of multiple Events:
```
PUT /postgres/events/batch-update-status
```

Delete multiple Events:
```
DELETE /postgres/events/batch
```
|
314 |
+
### About Pixity Endpoints
|
315 |
+
|
316 |
+
#### Get About Pixity
|
317 |
+
```
|
318 |
+
GET /postgres/about-pixity
|
319 |
+
```
|
320 |
+
|
321 |
+
Response:
|
322 |
+
```json
|
323 |
+
{
|
324 |
+
"content": "PiXity is your smart, AI-powered local companion...",
|
325 |
+
"id": 1,
|
326 |
+
"created_at": "2023-01-01T00:00:00",
|
327 |
+
"updated_at": "2023-01-01T00:00:00"
|
328 |
+
}
|
329 |
+
```
|
330 |
+
|
331 |
+
#### Update About Pixity
|
332 |
+
```
|
333 |
+
PUT /postgres/about-pixity
|
334 |
+
```
|
335 |
+
|
336 |
+
Request Body:
|
337 |
+
```json
|
338 |
+
{
|
339 |
+
"content": "PiXity is your smart, AI-powered local companion..."
|
340 |
+
}
|
341 |
+
```
|
342 |
+
|
343 |
+
Response: Updated About Pixity object
|
344 |
+
|
345 |
+
### Da Nang Bucket List Endpoints
|
346 |
+
|
347 |
+
#### Get Da Nang Bucket List
|
348 |
+
```
|
349 |
+
GET /postgres/danang-bucket-list
|
350 |
+
```
|
351 |
+
|
352 |
+
Response: Bucket List object with JSON content string
|
353 |
+
|
354 |
+
#### Update Da Nang Bucket List
|
355 |
+
```
|
356 |
+
PUT /postgres/danang-bucket-list
|
357 |
+
```
|
358 |
+
|
359 |
+
### Solana Summit Endpoints
|
360 |
+
|
361 |
+
#### Get Solana Summit
|
362 |
+
```
|
363 |
+
GET /postgres/solana-summit
|
364 |
+
```
|
365 |
+
|
366 |
+
Response: Solana Summit object with JSON content string
|
367 |
+
|
368 |
+
#### Update Solana Summit
|
369 |
+
```
|
370 |
+
PUT /postgres/solana-summit
|
371 |
+
```
|
372 |
+
|
373 |
+
### Health Check
|
374 |
+
```
|
375 |
+
GET /postgres/health
|
376 |
+
```
|
377 |
+
|
378 |
+
Response:
|
379 |
+
```json
|
380 |
+
{
|
381 |
+
"status": "healthy",
|
382 |
+
"message": "PostgreSQL connection is working",
|
383 |
+
"timestamp": "2023-01-01T00:00:00"
|
384 |
+
}
|
385 |
+
```
|
386 |
+
|
## MongoDB Endpoints

### Session Endpoints

#### Create Session
```
POST /session
```

Request Body:
```json
{
  "user_id": "user123",
  "query": "How do I book a room?",
  "timestamp": "2023-01-01T00:00:00",
  "metadata": {
    "client_info": "web",
    "location": "Da Nang"
  }
}
```

Response: Created Session object with session_id

#### Update Session with Response
```
PUT /session/{session_id}/response
```

Request Body:
```json
{
  "response": "You can book a room through our app or website.",
  "response_timestamp": "2023-01-01T00:00:05",
  "metadata": {
    "response_time_ms": 234,
    "model_version": "gpt-4"
  }
}
```

Response: Updated Session object
+
#### Get Session
|
431 |
+
```
|
432 |
+
GET /session/{session_id}
|
433 |
+
```
|
434 |
+
|
435 |
+
Response: Session object
|
436 |
+
|
437 |
+
#### Get User History
|
438 |
+
```
|
439 |
+
GET /history
|
440 |
+
```
|
441 |
+
|
442 |
+
Parameters:
|
443 |
+
- `user_id`: User ID (required)
|
444 |
+
- `limit`: Maximum sessions to return (default: 10)
|
445 |
+
- `skip`: Number of sessions to skip (default: 0)
|
446 |
+
|
447 |
+
Response:
|
448 |
+
```json
|
449 |
+
{
|
450 |
+
"user_id": "user123",
|
451 |
+
"sessions": [
|
452 |
+
{
|
453 |
+
"session_id": "60f7a8b9c1d2e3f4a5b6c7d8",
|
454 |
+
"query": "How do I book a room?",
|
455 |
+
"timestamp": "2023-01-01T00:00:00",
|
456 |
+
"response": "You can book a room through our app or website.",
|
457 |
+
"response_timestamp": "2023-01-01T00:00:05"
|
458 |
+
}
|
459 |
+
],
|
460 |
+
"total_count": 1
|
461 |
+
}
|
462 |
+
```
|
463 |
+
|
464 |
+
#### Health Check
|
465 |
+
```
|
466 |
+
GET /health
|
467 |
+
```
|
468 |
+
|
469 |
+
## RAG Endpoints
|
470 |
+
|
471 |
+
### Create Embedding
|
472 |
+
```
|
473 |
+
POST /embedding
|
474 |
+
```
|
475 |
+
|
476 |
+
Request Body:
|
477 |
+
```json
|
478 |
+
{
|
479 |
+
"text": "Text to embed"
|
480 |
+
}
|
481 |
+
```
|
482 |
+
|
483 |
+
Response:
|
484 |
+
```json
|
485 |
+
{
|
486 |
+
"embedding": [0.1, 0.2, 0.3, ...],
|
487 |
+
"dimensions": 1536
|
488 |
+
}
|
489 |
+
```
|
490 |
+
|
491 |
+
### Process Chat Request
|
492 |
+
```
|
493 |
+
POST /chat
|
494 |
+
```
|
495 |
+
|
496 |
+
Request Body:
|
497 |
+
```json
|
498 |
+
{
|
499 |
+
"query": "Can you tell me about Pixity?",
|
500 |
+
"chat_history": [
|
501 |
+
{"role": "user", "content": "Hello"},
|
502 |
+
{"role": "assistant", "content": "Hello! How can I help you?"}
|
503 |
+
]
|
504 |
+
}
|
505 |
+
```
|
506 |
+
|
507 |
+
Response:
|
508 |
+
```json
|
509 |
+
{
|
510 |
+
"answer": "Pixity is a platform...",
|
511 |
+
"sources": [
|
512 |
+
{
|
513 |
+
"document_id": "doc123",
|
514 |
+
"chunk_id": "chunk456",
|
515 |
+
"chunk_text": "Pixity was founded in...",
|
516 |
+
"relevance_score": 0.92
|
517 |
+
}
|
518 |
+
]
|
519 |
+
}
|
520 |
+
```
|
521 |
+
|
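Example chat request in Python (assumes the API runs at `http://localhost:8000`):

```python
import requests

BASE_URL = "http://localhost:8000"  # assumed local deployment

payload = {
    "query": "Can you tell me about Pixity?",
    "chat_history": [
        {"role": "user", "content": "Hello"},
        {"role": "assistant", "content": "Hello! How can I help you?"},
    ],
}

result = requests.post(f"{BASE_URL}/chat", json=payload).json()
print(result["answer"])
# Each source carries the retrieved chunk and its relevance score
for src in result.get("sources", []):
    print(f'- {src["document_id"]} ({src["relevance_score"]:.2f})')
```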
### Direct RAG Query
```
POST /rag
```

Request Body:
```json
{
  "query": "Can you tell me about Pixity?",
  "namespace": "about_pixity",
  "top_k": 3
}
```

Response: Query results with relevance scores

### Health Check
```
GET /health
```

## PDF Processing Endpoints

### Upload and Process PDF
```
POST /pdf/upload
```

Form Data:
- `file`: PDF file (required)
- `namespace`: Vector database namespace (default: "Default")
- `index_name`: Vector database index name (default: "testbot768")
- `title`: Document title (optional)
- `description`: Document description (optional)
- `user_id`: User ID for WebSocket updates (optional)

Response: Processing results with document_id

### Delete Documents in Namespace
```
DELETE /pdf/namespace
```

Parameters:
- `namespace`: Vector database namespace (default: "Default")
- `index_name`: Vector database index name (default: "testbot768")
- `user_id`: User ID for WebSocket updates (optional)

Response: Deletion results

### Get Documents List
```
GET /pdf/documents
```

Parameters:
- `namespace`: Vector database namespace (default: "Default")
- `index_name`: Vector database index name (default: "testbot768")

Response: List of documents in the namespace
pytest.ini
ADDED
@@ -0,0 +1,12 @@
[pytest]
# Ignore warnings from the anyio module and internal operational warnings
filterwarnings =
    ignore::pytest.PytestAssertRewriteWarning:.*anyio
    ignore:.*general_plain_validator_function.* is deprecated.*:DeprecationWarning
    ignore:.*with_info_plain_validator_function.*:DeprecationWarning

# Other basic configuration
testpaths = tests
python_files = test_*.py
python_classes = Test*
python_functions = test_*
requirements.txt
CHANGED
@@ -31,14 +31,12 @@ httpx==0.25.1
 requests==2.31.0
 beautifulsoup4==4.12.2
 redis==5.0.1
-aiofiles==23.2.1

 # Testing
 prometheus-client==0.17.1
 pytest==7.4.0
 pytest-cov==4.1.0
 watchfiles==0.21.0
-fpdf==1.7.2

 # Core dependencies
 starlette==0.27.0