Initial deployment of AI Digital Library Assistant
- .dockerignore +67 -0
- Dockerfile +31 -0
- README.md +30 -8
- app.py +1374 -0
- config.py +56 -0
- core/__init__.py +1 -0
- core/__pycache__/__init__.cpython-313.pyc +0 -0
- core/__pycache__/chunker.cpython-313.pyc +0 -0
- core/__pycache__/document_parser.cpython-313.pyc +0 -0
- core/__pycache__/models.cpython-313.pyc +0 -0
- core/__pycache__/text_preprocessor.cpython-313.pyc +0 -0
- core/chunker.py +303 -0
- core/document_parser.py +199 -0
- core/models.py +102 -0
- core/text_preprocessor.py +186 -0
- mcp_server.py +290 -0
- mcp_tools/__init__.py +1 -0
- mcp_tools/__pycache__/__init__.cpython-313.pyc +0 -0
- mcp_tools/__pycache__/generative_tool.cpython-313.pyc +0 -0
- mcp_tools/__pycache__/ingestion_tool.cpython-313.pyc +0 -0
- mcp_tools/__pycache__/podcast_tool.cpython-313.pyc +0 -0
- mcp_tools/__pycache__/search_tool.cpython-313.pyc +0 -0
- mcp_tools/__pycache__/voice_tool.cpython-313.pyc +0 -0
- mcp_tools/generative_tool.py +407 -0
- mcp_tools/ingestion_tool.py +368 -0
- mcp_tools/podcast_tool.py +138 -0
- mcp_tools/search_tool.py +437 -0
- mcp_tools/utils.py +373 -0
- mcp_tools/voice_tool.py +63 -0
- requirements.txt +31 -0
- services/__init__.py +1 -0
- services/__pycache__/__init__.cpython-313.pyc +0 -0
- services/__pycache__/document_store_service.cpython-313.pyc +0 -0
- services/__pycache__/elevenlabs_service.cpython-313.pyc +0 -0
- services/__pycache__/embedding_service.cpython-313.pyc +0 -0
- services/__pycache__/llamaindex_service.cpython-313.pyc +0 -0
- services/__pycache__/llm_service.cpython-313.pyc +0 -0
- services/__pycache__/ocr_service.cpython-313.pyc +0 -0
- services/__pycache__/podcast_generator_service.cpython-313.pyc +0 -0
- services/__pycache__/vector_store_service.cpython-313.pyc +0 -0
- services/document_store_service.py +349 -0
- services/elevenlabs_service.py +341 -0
- services/embedding_service.py +243 -0
- services/llamaindex_service.py +199 -0
- services/llm_service.py +420 -0
- services/ocr_service.py +288 -0
- services/podcast_generator_service.py +663 -0
- services/vector_store_service.py +294 -0
.dockerignore
ADDED
@@ -0,0 +1,67 @@
+# Python cache
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+
+# Virtual environments
+venv/
+env/
+ENV/
+.venv
+
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+
+# Git
+.git/
+.gitignore
+.gitattributes
+
+# CI/CD
+.github/
+.gitlab-ci.yml
+
+# Documentation
+README.md
+docs/
+*.md
+!requirements.txt
+
+# Test files
+test_*.py
+*_test.py
+tests/
+.pytest_cache/
+
+# Large data files (these should be in volumes)
+data/
+vector_store/
+documents/
+podcasts/
+*.db
+*.sqlite
+
+# Logs
+*.log
+logs/
+
+# OS files
+.DS_Store
+Thumbs.db
+
+# Deployment files (not needed in container)
+deploy_from_env.py
+modal_deploy.py
+blaxel.yaml
+bl.cmd
+test_persistence.py
+
+# Environment files
+.env
+.env.*
Dockerfile
ADDED
@@ -0,0 +1,31 @@
+FROM python:3.11-slim
+
+WORKDIR /app
+
+# Install system dependencies
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    ca-certificates \
+    tesseract-ocr \
+    && rm -rf /var/lib/apt/lists/*
+
+# Copy requirements and install Python dependencies
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy application code
+COPY . .
+
+# Create data directories
+RUN mkdir -p /data/vector_store /data/documents /data/podcasts
+
+# Expose port (HuggingFace Spaces uses 7860)
+EXPOSE 7860
+
+# Set environment variables
+ENV GRADIO_SERVER_NAME="0.0.0.0"
+ENV GRADIO_SERVER_PORT=7860
+
+# Run the MCP server
+CMD ["python", "mcp_server.py"]
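
The image maps directly onto a local smoke test: build it, publish port 7860, and mount a host directory over `/data` so the vector store, documents, and podcasts survive container restarts. A minimal sketch, assuming Docker is installed and on PATH; the `library-assistant` tag and the `.env` file are illustrative, not part of this commit:

```python
# Hypothetical local build-and-run helper for the Dockerfile above.
# Assumes: Docker CLI available, and a .env file holding the keys the README lists.
import os
import subprocess

IMAGE = "library-assistant"  # arbitrary tag, not defined by this repo

subprocess.run(["docker", "build", "-t", IMAGE, "."], check=True)
subprocess.run([
    "docker", "run", "--rm",
    "-p", "7860:7860",                  # the Gradio port EXPOSEd above
    "-v", f"{os.getcwd()}/data:/data",  # persist the /data directories
    "--env-file", ".env",               # API keys from the README setup list
    IMAGE,
], check=True)
```

Mounting `/data` is the important part: the Dockerfile creates those directories inside the image, so without a volume every rebuild starts from an empty library.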
README.md
CHANGED
@@ -1,12 +1,34 @@
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
-sdk:
-
-app_file: app.py
+title: AI Digital Library Assistant
+emoji: 📚
+colorFrom: blue
+colorTo: indigo
+sdk: docker
+app_port: 7860
 pinned: false
+license: mit
 ---
 
-
+# AI Digital Library Assistant
+
+An intelligent document organization and retrieval system powered by AI.
+
+## Features
+
+- **Document Ingestion**: Upload PDF, DOCX, TXT, and Images
+- **Semantic Search**: Find documents using natural language queries
+- **AI Q&A**: Ask questions about your document library
+- **Voice Assistant**: Talk to your library using ElevenLabs voice AI
+- **Podcast Generation**: Turn documents into engaging audio podcasts
+
+## Setup
+
+This Space is configured to run using Docker. It requires several API keys to function fully:
+
+- `OPENAI_API_KEY`: For embeddings and LLM
+- `ANTHROPIC_API_KEY`: For Claude 3.5 Sonnet
+- `MISTRAL_API_KEY`: For Mistral models and OCR
+- `ELEVENLABS_API_KEY`: For voice features
+- `ELEVENLABS_AGENT_ID`: For conversational AI agent
+
+Please set these in the Space Settings -> Variables and Secrets.
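
A missing key typically only surfaces as an error deep inside a service call, so it can help to validate the environment once at startup. A minimal sketch of such a check; this is hypothetical, since the commit's actual `config.py` is not shown in this excerpt and may handle it differently:

```python
# Hypothetical startup validation for the secrets listed above.
import os
import sys

REQUIRED_KEYS = [
    "OPENAI_API_KEY",       # embeddings and LLM
    "ANTHROPIC_API_KEY",    # Claude 3.5 Sonnet
    "MISTRAL_API_KEY",      # Mistral models and OCR
    "ELEVENLABS_API_KEY",   # voice features
    "ELEVENLABS_AGENT_ID",  # conversational AI agent
]

missing = [key for key in REQUIRED_KEYS if not os.environ.get(key)]
if missing:
    # Fail fast with one clear message instead of a late runtime error.
    sys.exit(f"Missing required environment variables: {', '.join(missing)}")
```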
app.py
ADDED
@@ -0,0 +1,1374 @@
+import gradio as gr
+import os
+import asyncio
+import json
+import logging
+import tempfile
+import uuid
+from datetime import datetime
+from pathlib import Path
+from typing import List, Dict, Any, Optional
+import nest_asyncio
+
+# Apply nest_asyncio to handle nested event loops in Gradio
+nest_asyncio.apply()
+
+# Import our custom modules
+from mcp_tools.ingestion_tool import IngestionTool
+from mcp_tools.search_tool import SearchTool
+from mcp_tools.generative_tool import GenerativeTool
+from mcp_tools.voice_tool import VoiceTool
+from mcp_tools.podcast_tool import PodcastTool
+from services.vector_store_service import VectorStoreService
+from services.document_store_service import DocumentStoreService
+from services.embedding_service import EmbeddingService
+from services.llm_service import LLMService
+from services.ocr_service import OCRService
+from services.llamaindex_service import LlamaIndexService
+from services.elevenlabs_service import ElevenLabsService
+from services.podcast_generator_service import PodcastGeneratorService
+from core.models import SearchResult, Document
+import config
+
+# Setup logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+class ContentOrganizerMCPServer:
+    def __init__(self):
+        # Initialize services
+        logger.info("Initializing Content Organizer MCP Server...")
+        self.vector_store = VectorStoreService()
+        self.document_store = DocumentStoreService()
+        self.embedding_service = EmbeddingService()
+        self.llm_service = LLMService()
+        self.ocr_service = OCRService()
+        self.llamaindex_service = LlamaIndexService(self.document_store)
+
+        # Initialize ElevenLabs voice service
+        self.elevenlabs_service = ElevenLabsService(self.llamaindex_service)
+
+        # Initialize Podcast Generator
+        self.podcast_generator = PodcastGeneratorService(
+            llamaindex_service=self.llamaindex_service,
+            llm_service=self.llm_service
+        )
+
+        # Initialize tools
+        self.ingestion_tool = IngestionTool(
+            vector_store=self.vector_store,
+            document_store=self.document_store,
+            embedding_service=self.embedding_service,
+            ocr_service=self.ocr_service
+        )
+        self.search_tool = SearchTool(
+            vector_store=self.vector_store,
+            embedding_service=self.embedding_service,
+            document_store=self.document_store
+        )
+        self.generative_tool = GenerativeTool(
+            llm_service=self.llm_service,
+            search_tool=self.search_tool
+        )
+        self.voice_tool = VoiceTool(self.elevenlabs_service)
+        self.podcast_tool = PodcastTool(self.podcast_generator)
+
+        # Track processing status
+        self.processing_status = {}
+
+        # Document cache for quick access
+        self.document_cache = {}
+        logger.info("Content Organizer MCP Server initialized successfully!")
+
+    def run_async(self, coro):
+        """Helper to run async functions in Gradio"""
+        try:
+            loop = asyncio.get_event_loop()
+        except RuntimeError:
+            loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(loop)
+        if loop.is_running():
+            # If loop is already running, run the coroutine on a worker thread
+            import concurrent.futures
+            with concurrent.futures.ThreadPoolExecutor() as executor:
+                future = executor.submit(asyncio.run, coro)
+                return future.result()
+        else:
+            return loop.run_until_complete(coro)
+
+    async def ingest_document_async(self, file_path: str, file_type: str) -> Dict[str, Any]:
+        """MCP Tool: Ingest and process a document"""
+        try:
+            task_id = str(uuid.uuid4())
+            self.processing_status[task_id] = {"status": "processing", "progress": 0}
+            result = await self.ingestion_tool.process_document(file_path, file_type, task_id)
+            if result.get("success"):
+                self.processing_status[task_id] = {"status": "completed", "progress": 100}
+                doc_id = result.get("document_id")
+                if doc_id:
+                    doc = await self.document_store.get_document(doc_id)
+                    if doc:
+                        self.document_cache[doc_id] = doc
+                return result
+            else:
+                self.processing_status[task_id] = {"status": "failed", "error": result.get("error")}
+                return result
+        except Exception as e:
+            logger.error(f"Document ingestion failed: {str(e)}")
+            return {"success": False, "error": str(e), "message": "Failed to process document"}
+
+    async def get_document_content_async(self, document_id: str) -> Optional[str]:
+        """Get document content by ID"""
+        try:
+            # Check cache first
+            if document_id in self.document_cache:
+                return self.document_cache[document_id].content
+
+            # Get from store
+            doc = await self.document_store.get_document(document_id)
+            if doc:
+                self.document_cache[document_id] = doc
+                return doc.content
+            return None
+        except Exception as e:
+            logger.error(f"Error getting document content: {str(e)}")
+            return None
+
+    async def semantic_search_async(self, query: str, top_k: int = 5, filters: Optional[Dict] = None) -> Dict[str, Any]:
+        """MCP Tool: Perform semantic search"""
+        try:
+            results = await self.search_tool.search(query, top_k, filters)
+            return {"success": True, "query": query, "results": [result.to_dict() for result in results], "total_results": len(results)}
+        except Exception as e:
+            logger.error(f"Semantic search failed: {str(e)}")
+            return {"success": False, "error": str(e), "query": query, "results": []}
+
+    async def summarize_content_async(self, content: str = None, document_id: str = None, style: str = "concise") -> Dict[str, Any]:
+        """MCP Tool: Summarize raw content or a stored document"""
+        try:
+            if document_id and document_id != "none":
+                content = await self.get_document_content_async(document_id)
+                if not content:
+                    return {"success": False, "error": f"Document {document_id} not found"}
+            if not content or not content.strip():
+                return {"success": False, "error": "No content provided for summarization"}
+            max_content_length = 4000
+            if len(content) > max_content_length:
+                content = content[:max_content_length] + "..."
+            summary = await self.generative_tool.summarize(content, style)
+            return {"success": True, "summary": summary, "original_length": len(content), "summary_length": len(summary), "style": style, "document_id": document_id}
+        except Exception as e:
+            logger.error(f"Summarization failed: {str(e)}")
+            return {"success": False, "error": str(e)}
+
+    async def generate_tags_async(self, content: str = None, document_id: str = None, max_tags: int = 5) -> Dict[str, Any]:
+        """MCP Tool: Generate tags for content"""
+        try:
+            if document_id and document_id != "none":
+                content = await self.get_document_content_async(document_id)
+                if not content:
+                    return {"success": False, "error": f"Document {document_id} not found"}
+            if not content or not content.strip():
+                return {"success": False, "error": "No content provided for tag generation"}
+            tags = await self.generative_tool.generate_tags(content, max_tags)
+            if document_id and document_id != "none" and tags:
+                await self.document_store.update_document_metadata(document_id, {"tags": tags})
+            return {"success": True, "tags": tags, "content_length": len(content), "document_id": document_id}
+        except Exception as e:
+            logger.error(f"Tag generation failed: {str(e)}")
+            return {"success": False, "error": str(e)}
+
+    async def generate_podcast_async(
+        self,
+        document_ids: List[str],
+        style: str = "conversational",
+        duration_minutes: int = 10,
+        host1_voice: str = "Rachel",
+        host2_voice: str = "Adam"
+    ) -> Dict[str, Any]:
+        """Generate podcast from documents"""
+        try:
+            result = await self.podcast_tool.generate_podcast(
+                document_ids=document_ids,
+                style=style,
+                duration_minutes=duration_minutes,
+                host1_voice=host1_voice,
+                host2_voice=host2_voice
+            )
+            return result
+        except Exception as e:
+            logger.error(f"Podcast generation failed: {str(e)}")
+            return {"success": False, "error": str(e)}
+
+    async def answer_question_async(self, question: str, context_filter: Optional[Dict] = None) -> Dict[str, Any]:
+        try:
+            search_results = await self.search_tool.search(question, top_k=5, filters=context_filter)
+            if not search_results:
+                return {"success": False, "error": "No relevant context found in your documents. Please make sure you have uploaded relevant documents.", "question": question}
+            answer = await self.generative_tool.answer_question(question, search_results)
+            return {"success": True, "question": question, "answer": answer, "sources": [result.to_dict() for result in search_results], "confidence": "high" if len(search_results) >= 3 else "medium"}
+        except Exception as e:
+            logger.error(f"Question answering failed: {str(e)}")
+            return {"success": False, "error": str(e), "question": question}
+
+    async def generate_outline_async(self, topic: str, num_sections: int = 5, detail_level: str = "medium") -> Dict[str, Any]:
+        try:
+            outline = await self.generative_tool.generate_outline(topic, num_sections, detail_level)
+            return {"success": True, "result": outline}
+        except Exception as e:
+            return {"success": False, "error": str(e)}
+
+    async def explain_concept_async(self, concept: str, audience: str = "general", length: str = "medium") -> Dict[str, Any]:
+        try:
+            explanation = await self.generative_tool.explain_concept(concept, audience, length)
+            return {"success": True, "result": explanation}
+        except Exception as e:
+            return {"success": False, "error": str(e)}
+
+    async def paraphrase_text_async(self, text: str, style: str = "formal") -> Dict[str, Any]:
+        try:
+            paraphrase = await self.generative_tool.paraphrase_text(text, style)
+            return {"success": True, "result": paraphrase}
+        except Exception as e:
+            return {"success": False, "error": str(e)}
+
+    async def categorize_content_async(self, content: str, categories: List[str]) -> Dict[str, Any]:
+        try:
+            category = await self.generative_tool.categorize(content, categories)
+            return {"success": True, "result": category}
+        except Exception as e:
+            return {"success": False, "error": str(e)}
+
+    async def extract_key_insights_async(self, content: str, num_insights: int = 5) -> Dict[str, Any]:
+        try:
+            insights = await self.generative_tool.extract_key_insights(content, num_insights)
+            return {"success": True, "result": "\n".join([f"- {insight}" for insight in insights])}
+        except Exception as e:
+            return {"success": False, "error": str(e)}
+
+    async def generate_questions_async(self, content: str, question_type: str = "comprehension", num_questions: int = 5) -> Dict[str, Any]:
+        try:
+            questions = await self.generative_tool.generate_questions(content, question_type, num_questions)
+            return {"success": True, "result": "\n".join([f"{i+1}. {q}" for i, q in enumerate(questions)])}
+        except Exception as e:
+            return {"success": False, "error": str(e)}
+
+    async def extract_key_information_async(self, content: str) -> Dict[str, Any]:
+        try:
+            info = await self.llm_service.extract_key_information(content)
+            return {"success": True, "result": json.dumps(info, indent=2)}
+        except Exception as e:
+            return {"success": False, "error": str(e)}
+
+    def list_documents_sync(self, limit: int = 100, offset: int = 0) -> Dict[str, Any]:
+        try:
+            documents = self.run_async(self.document_store.list_documents(limit, offset))
+            return {"success": True, "documents": [doc.to_dict() for doc in documents], "total": len(documents)}
+        except Exception as e:
+            return {"success": False, "error": str(e)}
+
+mcp_server = ContentOrganizerMCPServer()
+
+def get_document_list():
+    try:
+        result = mcp_server.list_documents_sync(limit=100)
+        if result["success"]:
+            if result["documents"]:
+                doc_list_str = "📚 Documents in Library:\n\n"
+                for i, doc_item in enumerate(result["documents"], 1):
+                    doc_list_str += f"{i}. {doc_item['filename']} (ID: {doc_item['id'][:8]}...)\n"
+                    doc_list_str += f"   Type: {doc_item['doc_type']}, Size: {doc_item['file_size']} bytes\n"
+                    if doc_item.get('tags'):
+                        doc_list_str += f"   Tags: {', '.join(doc_item['tags'])}\n"
+                    doc_list_str += f"   Created: {doc_item['created_at'][:10]}\n\n"
+                return doc_list_str
+            else:
+                return "No documents in library yet. Upload some documents to get started!"
+        else:
+            return f"Error loading documents: {result['error']}"
+    except Exception as e:
+        return f"Error: {str(e)}"
+
+def get_document_choices():
+    try:
+        result = mcp_server.list_documents_sync(limit=100)
+        if result["success"] and result["documents"]:
+            choices = [(f"{doc['filename']} ({doc['id'][:8]}...)", doc['id']) for doc in result["documents"]]
+            logger.info(f"Generated {len(choices)} document choices")
+            return choices
+        return []
+    except Exception as e:
+        logger.error(f"Error getting document choices: {str(e)}")
+        return []
+
+def refresh_library():
+    doc_list_refreshed = get_document_list()
+    doc_choices_refreshed = get_document_choices()
+    logger.info(f"Refreshing library. Found {len(doc_choices_refreshed)} choices.")
+    return (
+        doc_list_refreshed,
+        gr.update(choices=doc_choices_refreshed),
+        gr.update(choices=doc_choices_refreshed),
+        gr.update(choices=doc_choices_refreshed)
+    )
+
+def upload_and_process_file(file):
+    if file is None:
+        doc_list_initial = get_document_list()
+        doc_choices_initial = get_document_choices()
+        return (
+            "No file uploaded", "", doc_list_initial,
+            gr.update(choices=doc_choices_initial),
+            gr.update(choices=doc_choices_initial),
+            gr.update(choices=doc_choices_initial)
+        )
+    try:
+        file_path = file.name if hasattr(file, 'name') else str(file)
+        file_type = Path(file_path).suffix.lower().strip('.')  # Ensure suffix is clean
+        logger.info(f"Processing file: {file_path}, type: {file_type}")
+        result = mcp_server.run_async(mcp_server.ingest_document_async(file_path, file_type))
+
+        doc_list_updated = get_document_list()
+        doc_choices_updated = get_document_choices()
+
+        if result["success"]:
+            return (
+                f"✅ Success: {result['message']}\nDocument ID: {result['document_id']}\nChunks created: {result['chunks_created']}",
+                result["document_id"],
+                doc_list_updated,
+                gr.update(choices=doc_choices_updated),
+                gr.update(choices=doc_choices_updated),
+                gr.update(choices=doc_choices_updated)
+            )
+        else:
+            return (
+                f"❌ Error: {result.get('error', 'Unknown error')}", "",
+                doc_list_updated,
+                gr.update(choices=doc_choices_updated),
+                gr.update(choices=doc_choices_updated),
+                gr.update(choices=doc_choices_updated)
+            )
+    except Exception as e:
+        logger.error(f"Error processing file: {str(e)}")
+        doc_list_error = get_document_list()
+        doc_choices_error = get_document_choices()
+        return (
+            f"❌ Error: {str(e)}", "",
+            doc_list_error,
+            gr.update(choices=doc_choices_error),
+            gr.update(choices=doc_choices_error),
+            gr.update(choices=doc_choices_error)
+        )
+
+def perform_search(query, top_k):
+    if not query.strip():
+        return "Please enter a search query"
+    try:
+        result = mcp_server.run_async(mcp_server.semantic_search_async(query, int(top_k)))
+        if result["success"]:
+            if result["results"]:
+                output_str = f"🔍 Found {result['total_results']} results for: '{query}'\n\n"
+                for i, res_item in enumerate(result["results"], 1):
+                    output_str += f"Result {i}:\n"
+                    output_str += f"📊 Relevance Score: {res_item['score']:.3f}\n"
+                    output_str += f"📄 Content: {res_item['content'][:300]}...\n"
+                    if 'document_filename' in res_item.get('metadata', {}):
+                        output_str += f"📁 Source: {res_item['metadata']['document_filename']}\n"
+                    output_str += f"🔗 Document ID: {res_item.get('document_id', 'Unknown')}\n"
+                    output_str += "-" * 80 + "\n\n"
+                return output_str
+            else:
+                return f"No results found for: '{query}'\n\nMake sure you have uploaded relevant documents first."
+        else:
+            return f"❌ Search failed: {result['error']}"
+    except Exception as e:
+        logger.error(f"Search error: {str(e)}")
+        return f"❌ Error: {str(e)}"
+
+def update_options_visibility(task):
+    """Update visibility of options based on selected task"""
+    return (
+        gr.update(visible=task == "Summarize"),  # summary_style
+        gr.update(visible=task == "Generate Outline"),  # outline_sections
+        gr.update(visible=task == "Generate Outline"),  # outline_detail
+        gr.update(visible=task == "Explain Concept"),  # explain_audience
+        gr.update(visible=task == "Explain Concept"),  # explain_length
+        gr.update(visible=task == "Paraphrase"),  # paraphrase_style
+        gr.update(visible=task == "Categorize"),  # categories_input
+        gr.update(visible=task in ["Key Insights", "Generate Questions"]),  # num_items
+        gr.update(visible=task == "Generate Questions")  # question_type
+    )
+
+def execute_content_task(task, doc_choice, custom_text,
+                         summary_style, outline_sections, outline_detail,
+                         explain_audience, explain_length,
+                         paraphrase_style, categories_input,
+                         num_items, question_type):
+    try:
+        # Get content
+        content = ""
+        if custom_text and custom_text.strip():
+            content = custom_text
+        elif doc_choice and doc_choice != "none":
+            content = mcp_server.run_async(mcp_server.get_document_content_async(doc_choice))
+            if not content:
+                return "❌ Error: Document not found or empty"
+        else:
+            if task == "Generate Outline":
+                content = custom_text  # Topic is passed as text
+            else:
+                return "⚠️ Please select a document or enter text"
+
+        # Execute task
+        result = {"success": False, "error": "Unknown task"}
+
+        if task == "Summarize":
+            result = mcp_server.run_async(mcp_server.summarize_content_async(content=content, style=summary_style))
+            if result["success"]:
+                return f"📝 Summary ({summary_style}):\n\n{result['summary']}"
+
+        elif task == "Generate Outline":
+            # For outline, content is the topic
+            result = mcp_server.run_async(mcp_server.generate_outline_async(content, int(outline_sections), outline_detail))
+            if result["success"]:
+                return f"📝 Outline for '{content}':\n\n{result['result']}"
+
+        elif task == "Explain Concept":
+            # For explain, content is the concept
+            result = mcp_server.run_async(mcp_server.explain_concept_async(content, explain_audience, explain_length))
+            if result["success"]:
+                return f"💡 Explanation ({explain_audience}):\n\n{result['result']}"
+
+        elif task == "Paraphrase":
+            result = mcp_server.run_async(mcp_server.paraphrase_text_async(content, paraphrase_style))
+            if result["success"]:
+                return f"🔄 Paraphrased Text ({paraphrase_style}):\n\n{result['result']}"
+
+        elif task == "Categorize":
+            categories = [c.strip() for c in categories_input.split(',')] if categories_input else []
+            result = mcp_server.run_async(mcp_server.categorize_content_async(content, categories))
+            if result["success"]:
+                return f"🏷️ Category:\n\n{result['result']}"
+
+        elif task == "Key Insights":
+            result = mcp_server.run_async(mcp_server.extract_key_insights_async(content, int(num_items)))
+            if result["success"]:
+                return f"🔍 Key Insights:\n\n{result['result']}"
+
+        elif task == "Generate Questions":
+            result = mcp_server.run_async(mcp_server.generate_questions_async(content, question_type, int(num_items)))
+            if result["success"]:
+                return f"❓ Generated Questions ({question_type}):\n\n{result['result']}"
+
+        elif task == "Extract Key Info":
+            result = mcp_server.run_async(mcp_server.extract_key_information_async(content))
+            if result["success"]:
+                return f"📊 Key Information:\n\n{result['result']}"
+
+        if not result["success"]:
+            return f"❌ Error: {result.get('error', 'Unknown error')}"
+
+        return "✅ Task completed"
+
+    except Exception as e:
+        logger.error(f"Task execution error: {str(e)}")
+        return f"❌ Error: {str(e)}"
+
+def generate_tags_for_document(doc_choice, custom_text, max_tags):
+    try:
+        logger.info(f"Generate tags called with doc_choice: {doc_choice}, type: {type(doc_choice)}")
+        document_id = doc_choice if doc_choice and doc_choice != "none" and doc_choice != "" else None
+
+        if custom_text and custom_text.strip():
+            logger.info("Using custom text for tag generation")
+            result = mcp_server.run_async(mcp_server.generate_tags_async(content=custom_text, max_tags=int(max_tags)))
+        elif document_id:
+            logger.info(f"Generating tags for document: {document_id}")
+            result = mcp_server.run_async(mcp_server.generate_tags_async(document_id=document_id, max_tags=int(max_tags)))
+        else:
+            return "Please select a document from the dropdown or enter text to generate tags"
+
+        if result["success"]:
+            tags_str = ", ".join(result["tags"])
+            output_str = f"🏷️ Generated Tags:\n\n{tags_str}\n\n"
+            output_str += f"📊 Statistics:\n"
+            output_str += f"- Content length: {result['content_length']} characters\n"
+            output_str += f"- Number of tags: {len(result['tags'])}\n"
+            if result.get('document_id'):
+                output_str += f"- Document ID: {result['document_id']}\n"
+                output_str += f"\n✅ Tags have been saved to the document."
+            return output_str
+        else:
+            return f"❌ Tag generation failed: {result['error']}"
+    except Exception as e:
+        logger.error(f"Tag generation error: {str(e)}")
+        return f"❌ Error: {str(e)}"
+
+def ask_question(question):
+    if not question.strip():
+        return "Please enter a question"
+    try:
+        result = mcp_server.run_async(mcp_server.answer_question_async(question))
+        if result["success"]:
+            output_str = f"❓ Question: {result['question']}\n\n"
+            output_str += f"💡 Answer:\n{result['answer']}\n\n"
+            output_str += f"🎯 Confidence: {result['confidence']}\n\n"
+            output_str += f"📚 Sources Used ({len(result['sources'])}):\n"
+            for i, source_item in enumerate(result['sources'], 1):
+                filename = source_item.get('metadata', {}).get('document_filename', 'Unknown')
+                output_str += f"\n{i}. 📄 {filename}\n"
+                output_str += f"   📝 Excerpt: {source_item['content'][:150]}...\n"
+                output_str += f"   📊 Relevance: {source_item['score']:.3f}\n"
+            return output_str
+        else:
+            return f"❌ {result.get('error', 'Failed to answer question')}"
+    except Exception as e:
+        return f"❌ Error: {str(e)}"
+
+def delete_document_from_library(document_id):
+    if not document_id:
+        doc_list_current = get_document_list()
+        doc_choices_current = get_document_choices()
+        return (
+            "No document selected to delete.",
+            doc_list_current,
+            gr.update(choices=doc_choices_current),
+            gr.update(choices=doc_choices_current),
+            gr.update(choices=doc_choices_current)
+        )
+    try:
+        delete_doc_store_result = mcp_server.run_async(mcp_server.document_store.delete_document(document_id))
+        delete_vec_store_result = mcp_server.run_async(mcp_server.vector_store.delete_document(document_id))
+
+        msg = ""
+        if delete_doc_store_result:
+            msg += f"🗑️ Document {document_id[:8]}... deleted from document store. "
+        else:
+            msg += f"❌ Failed to delete document {document_id[:8]}... from document store. "
+
+        if delete_vec_store_result:
+            msg += "Embeddings deleted from vector store."
+        else:
+            msg += "Failed to delete embeddings from vector store (or no embeddings existed)."
+
+        doc_list_updated = get_document_list()
+        doc_choices_updated = get_document_choices()
+        return (
+            msg,
+            doc_list_updated,
+            gr.update(choices=doc_choices_updated),
+            gr.update(choices=doc_choices_updated),
+            gr.update(choices=doc_choices_updated)
+        )
+    except Exception as e:
+        logger.error(f"Error deleting document: {str(e)}")
+        doc_list_error = get_document_list()
+        doc_choices_error = get_document_choices()
+        return (
+            f"❌ Error deleting document: {str(e)}",
+            doc_list_error,
+            gr.update(choices=doc_choices_error),
+            gr.update(choices=doc_choices_error),
+            gr.update(choices=doc_choices_error)
+        )
+
+# Voice conversation state - global scope
+voice_conversation_state = {
+    "session_id": None,
+    "active": False,
+    "transcript": []
+}
+
+def start_voice_conversation():
+    """Start a new voice conversation session"""
+    try:
+        if not mcp_server.elevenlabs_service.is_available():
+            return (
+                "⚠️ Voice assistant not configured. Please set ELEVENLABS_API_KEY and ELEVENLABS_AGENT_ID in .env",
+                gr.update(interactive=False),
+                gr.update(interactive=True),
+                ""
+            )
+
+        session_id = str(uuid.uuid4())
+        result = mcp_server.run_async(mcp_server.elevenlabs_service.start_conversation(session_id))
+
+        if result.get("success"):
+            voice_conversation_state["session_id"] = session_id
+            voice_conversation_state["active"] = True
+            voice_conversation_state["transcript"] = []
+
+            return (
+                "🎙️ Voice assistant is ready. Type your question below.",
+                gr.update(interactive=False),
+                gr.update(interactive=True),
+                []
+            )
+        else:
+            return (
+                f"❌ Failed to start conversation: {result.get('error')}",
+                gr.update(interactive=True),
+                gr.update(interactive=False),
+                []
+            )
+    except Exception as e:
+        logger.error(f"Error starting voice conversation: {str(e)}")
+        return (
+            f"❌ Error: {str(e)}",
+            gr.update(interactive=True),
+            gr.update(interactive=False),
+            []
+        )
+
+
+def stop_voice_conversation():
+    """Stop active voice conversation"""
+    try:
+        if not voice_conversation_state["active"]:
+            return (
+                "No active conversation",
+                gr.update(interactive=True),
+                gr.update(interactive=False),
+                voice_conversation_state["transcript"]
+            )
+
+        session_id = voice_conversation_state["session_id"]
+        if session_id:
+            mcp_server.run_async(mcp_server.elevenlabs_service.end_conversation(session_id))
+
+        voice_conversation_state["active"] = False
+        voice_conversation_state["session_id"] = None
+
+        return (
+            "✅ Conversation ended",
+            gr.update(interactive=True),
+            gr.update(interactive=False),
+            voice_conversation_state["transcript"]
+        )
+    except Exception as e:
+        logger.error(f"Error stopping conversation: {str(e)}")
+        return (
+            f"❌ Error: {str(e)}",
+            gr.update(interactive=True),
+            gr.update(interactive=False),
+            voice_conversation_state["transcript"]
+        )
+
+
+def send_voice_message(message):
+    """Send a text message in voice conversation"""
+    try:
+        if not voice_conversation_state["active"]:
+            return ("Please start a conversation first", "", format_transcript(voice_conversation_state["transcript"]))
+
+        if not message or not message.strip():
+            return ("Please enter a message", message, format_transcript(voice_conversation_state["transcript"]))
+
+        session_id = voice_conversation_state["session_id"]
+        voice_conversation_state["transcript"].append({"role": "user", "content": message})
+
+        result = mcp_server.run_async(mcp_server.voice_tool.voice_qa(message, session_id))
+
+        if result.get("success"):
+            answer = result.get("answer", "No response")
+            voice_conversation_state["transcript"].append({"role": "assistant", "content": answer})
+            return ("✅ Response received", "", format_transcript(voice_conversation_state["transcript"]))
+        else:
+            return (f"❌ Error: {result.get('error')}", message, format_transcript(voice_conversation_state["transcript"]))
+    except Exception as e:
+        logger.error(f"Error sending message: {str(e)}")
+        return (f"❌ Error: {str(e)}", message, format_transcript(voice_conversation_state["transcript"]))
+
+def format_transcript(transcript):
+    """Format conversation transcript for display"""
+    if not transcript:
+        return "No conversation yet. Start talking to the AI librarian!"
+
+    formatted = ""
+    for msg in transcript:
+        role = msg["role"]
+        content = msg["content"]
+        if role == "user":
+            formatted += f"👤 **You:** {content}\n\n"
+        else:
+            formatted += f"🤖 **AI Librarian:** {content}\n\n"
+            formatted += "---\n\n"
+    return formatted
+
+def clear_voice_transcript():
+    """Clear conversation transcript"""
+    voice_conversation_state["transcript"] = []
+    return ""
+
+def send_voice_message_v6(message, chat_history):
+    """Send message in voice conversation - Gradio 6 format"""
+    try:
+        if not voice_conversation_state["active"]:
+            return chat_history, ""
+
+        if not message or not message.strip():
+            return chat_history, message
+
+        session_id = voice_conversation_state["session_id"]
+
+        # Add user message in Gradio 6 format
+        chat_history.append({"role": "user", "content": message})
+
+        # Get AI response
+        result = mcp_server.run_async(mcp_server.voice_tool.voice_qa(message, session_id))
+
+        if result.get("success"):
+            answer = result.get("answer", "No response")
+            chat_history.append({"role": "assistant", "content": answer})
+        else:
+            chat_history.append({
+                "role": "assistant",
+                "content": f"❌ Error: {result.get('error')}"
+            })
+
+        return chat_history, ""
+    except Exception as e:
+        logger.error(f"Error in voice message: {str(e)}")
+        chat_history.append({
+            "role": "assistant",
+            "content": f"❌ Error: {str(e)}"
+        })
+        return chat_history, ""
+
+def generate_podcast_ui(doc_ids, style, duration, voice1, voice2):
+    """UI wrapper for podcast generation"""
+    try:
+        if not doc_ids or len(doc_ids) == 0:
+            return ("⚠️ Please select at least one document", None, "No documents selected", "")
+
+        logger.info(f"Generating podcast: {len(doc_ids)} docs, {style}, {duration}min")
+
+        result = mcp_server.run_async(
+            mcp_server.generate_podcast_async(
+                document_ids=doc_ids,
+                style=style,
+                duration_minutes=int(duration),
+                host1_voice=voice1,
+                host2_voice=voice2
+            )
+        )
+
+        if result.get("success"):
+            audio_file = result.get("audio_file")
+            transcript = result.get("transcript", "Transcript not available")
+            message = result.get("message", "Podcast generated!")
+            formatted_transcript = f"## Podcast Transcript\n\n{transcript}"
+
+            return (
+                f"✅ {message}",
+                audio_file,
+                formatted_transcript,
+                result.get("podcast_id", "")
+            )
+        else:
+            error = result.get("error", "Unknown error")
+            return (f"❌ Error: {error}", None, "Generation failed", "")
+    except Exception as e:
+        logger.error(f"Podcast UI error: {str(e)}")
+        return (f"❌ Error: {str(e)}", None, "An error occurred", "")
+
+def load_dashboard_stats():
+    """Load dashboard statistics for the UI"""
+    try:
+        # Get document list
+        docs_result = mcp_server.list_documents_sync(limit=1000)
+        doc_count = 0
+        total_chunks = 0
+        total_size = 0
+        recent_data = []
+
+        if docs_result.get("success"):
+            documents = docs_result.get("documents", [])
+            doc_count = len(documents)
+            total_chunks = sum(doc.get("metadata", {}).get("chunk_count", 0) for doc in documents)
+            total_size = sum(doc.get("file_size", 0) for doc in documents)
+            storage_mb = round(total_size / (1024 * 1024), 2) if total_size > 0 else 0.0
+
+            # Get recent 5 documents
+            recent = documents[:5]
+            recent_data = [
+                [
+                    doc.get("filename", "Unknown"),
+                    doc.get("doc_type", "unknown"),
+                    doc.get("created_at", "")[:10] if doc.get("created_at") else "N/A",
+                    f"{doc.get('file_size', 0)} bytes"
+                ]
+                for doc in recent
+            ]
+        else:
+            storage_mb = 0.0
+
+        # Service status indicators
+        vector_stat = "✅ Online" if getattr(mcp_server, "vector_store", None) else "❌ Offline"
+        llm_stat = "✅ Ready" if getattr(mcp_server, "llm_service", None) else "❌ Offline"
+        voice_stat = "✅ Ready" if (getattr(mcp_server, "elevenlabs_service", None) and mcp_server.elevenlabs_service.is_available()) else "⚠️ Configure API Key"
+
+        return (
+            doc_count,
+            total_chunks,
+            storage_mb,
+            recent_data,
+            vector_stat,
+            llm_stat,
+            voice_stat,
+        )
+    except Exception as e:
+        logger.error(f"Error loading dashboard stats: {str(e)}")
+        return (0, 0, 0.0, [], "❌ Error", "❌ Error", "❌ Error")
+
+def create_gradio_interface():
+    # Create custom theme with modern aesthetics
+    custom_theme = gr.themes.Soft(
+        primary_hue=gr.themes.colors.indigo,
+        secondary_hue=gr.themes.colors.blue,
+        neutral_hue=gr.themes.colors.slate,
+        font=[gr.themes.GoogleFont("Inter"), "system-ui", "sans-serif"],
+        font_mono=[gr.themes.GoogleFont("Fira Code"), "monospace"],
+    ).set(
+        button_primary_background_fill="*primary_500",
+        button_primary_background_fill_hover="*primary_600",
+        block_title_text_weight="600",
+        block_label_text_size="sm",
+        block_label_text_weight="500",
+    )
+
+    with gr.Blocks(title="🧠 AI Digital Library Assistant", theme=custom_theme) as interface:
+        with gr.Tabs():
+            # Dashboard Tab - New Landing Page
+            with gr.Tab("🏠 Dashboard"):
+                gr.Markdown("# Welcome to Your AI Library Assistant")
+                gr.Markdown("*Your intelligent document management and analysis platform powered by AI*")
+
+                # Quick Stats Section
+                gr.Markdown("## 📊 Quick Stats")
+                with gr.Row():
+                    total_docs = gr.Number(
+                        label="📚 Total Documents",
+                        value=0,
+                        interactive=False,
+                        container=True
+                    )
+                    total_chunks = gr.Number(
+                        label="🧩 Vector Chunks",
+                        value=0,
+                        interactive=False,
+                        container=True
+                    )
+                    storage_size = gr.Number(
+                        label="💾 Storage (MB)",
+                        value=0,
+                        interactive=False,
+                        container=True
+                    )
+
+                # Recent Activity Section
+                gr.Markdown("## 📊 Recent Activity")
+                with gr.Group():
+                    recent_docs = gr.Dataframe(
+                        headers=["Document", "Type", "Date", "Size"],
+                        datatype=["str", "str", "str", "str"],
+                        row_count=(5, "fixed"),
+                        col_count=(4, "fixed"),
+                        interactive=False,
+                        label="Recently Added Documents"
+                    )
+
+                # System Status Section
+                gr.Markdown("## System Status")
+                with gr.Row():
+                    vector_status = gr.Textbox(
+                        label="Vector Store",
+                        value="✅ Online",
+                        interactive=False,
+                        container=True
+                    )
+                    llm_status = gr.Textbox(
+                        label="LLM Service",
+                        value="✅ Ready",
+                        interactive=False,
+                        container=True
+                    )
+                    voice_status = gr.Textbox(
+                        label="Voice Service",
+                        value="⚠️ Configure API Key",
+                        interactive=False,
+                        container=True
+                    )
+
+            with gr.Tab("📚 Document Library"):
+                with gr.Row():
+                    with gr.Column():
+                        gr.Markdown("### Your Document Collection")
+                        document_list_display = gr.Textbox(label="Documents in Library", value=get_document_list(), lines=20, interactive=False)
+                        refresh_btn_library = gr.Button("🔄 Refresh Library", variant="secondary")
+                        delete_doc_dropdown_visible = gr.Dropdown(label="Select Document to Delete", choices=get_document_choices(), value=None, interactive=True, allow_custom_value=False)
+                        delete_btn = gr.Button("🗑️ Delete Selected Document", variant="stop")
+                        delete_output_display = gr.Textbox(label="Delete Status", visible=True)
+
+            with gr.Tab("📄 Upload Documents"):
+                gr.Markdown("""
+                ### 📥 Add Documents to Library
+                Upload PDFs, Word documents, text files, or images. OCR will extract text from images automatically.
+                """)
+
+                with gr.Row():
+                    with gr.Column():
+                        with gr.Group():
+                            gr.Markdown("**Supported formats:** PDF, DOCX, TXT, Images (JPG, PNG)")
+                            file_input_upload = gr.File(
+                                label="Select File",
+                                file_types=[".pdf", ".txt", ".docx", ".png", ".jpg", ".jpeg"],
+                                type="filepath",
+                                file_count="single"
+                            )
+
+                            upload_btn_process = gr.Button("🚀 Upload & Process", variant="primary", size="lg")
+
+                        with gr.Group():
+                            upload_output_display = gr.Textbox(
+                                label="Status",
+                                lines=6,
+                                interactive=False,
+                                show_copy_button=False
+                            )
+
+                            doc_id_output_display = gr.Textbox(
+                                label="Document ID",
+                                interactive=False,
+                                visible=False
+                            )
+
+            with gr.Tab("🔍 Search Documents"):
+                gr.Markdown("""
+                ### 🔎 Semantic Search
+                Find relevant content across your entire document library using AI-powered semantic search.
+                """)
+
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        with gr.Group():
+                            search_query_input = gr.Textbox(
+                                label="Search Query",
+                                placeholder="What are you looking for?",
+                                lines=2,
+                                info="Use natural language to describe what you need"
+                            )
+
+                            with gr.Accordion("🎛️ Search Options", open=False):
+                                search_top_k_slider = gr.Slider(
+                                    label="Number of Results",
+                                    minimum=1, maximum=20, value=5, step=1,
+                                    info="More results = broader search"
+                                )
+
+                            search_btn_action = gr.Button("🔍 Search", variant="primary", size="lg")
+
+                    with gr.Column(scale=2):
+                        with gr.Group():
+                            search_output_display = gr.Textbox(
+                                label="Results",
+                                lines=20,
+                                placeholder="Search results will appear here...",
+                                show_copy_button=True
+                            )
| 996 |
+
|
| 997 |
+
|
| 998 |
+
with gr.Tab("📝 Content Studio"):
|
| 999 |
+
gr.Markdown("""
|
| 1000 |
+
### 🎨 Create & Analyze Content
|
| 1001 |
+
Transform documents with AI-powered tools: summarize, outline, explain, and more.
|
| 1002 |
+
""")
|
| 1003 |
+
|
| 1004 |
+
with gr.Row():
|
| 1005 |
+
with gr.Column(scale=2):
|
| 1006 |
+
# Source Selection with Group
|
| 1007 |
+
with gr.Group():
|
| 1008 |
+
gr.Markdown("#### 📄 Content Source")
|
| 1009 |
+
doc_dropdown_content = gr.Dropdown(
|
| 1010 |
+
label="Select Document",
|
| 1011 |
+
choices=get_document_choices(),
|
| 1012 |
+
value=None,
|
| 1013 |
+
interactive=True,
|
| 1014 |
+
info="Choose a document from your library"
|
| 1015 |
+
)
|
| 1016 |
+
|
| 1017 |
+
gr.Markdown("**OR**")
|
| 1018 |
+
|
| 1019 |
+
content_text_input = gr.Textbox(
|
| 1020 |
+
label="Enter Text or Topic",
|
| 1021 |
+
placeholder="Paste content or enter a topic...",
|
| 1022 |
+
lines=4,
|
| 1023 |
+
info="For outlines, enter a topic. For other tasks, paste text to analyze."
|
| 1024 |
+
)
|
| 1025 |
+
|
| 1026 |
+
# Task Configuration with Group
|
| 1027 |
+
with gr.Group():
|
| 1028 |
+
gr.Markdown("#### 🛠️ Task Configuration")
|
| 1029 |
+
task_dropdown = gr.Dropdown(
|
| 1030 |
+
label="Select Task",
|
| 1031 |
+
choices=[
|
| 1032 |
+
"Summarize", "Generate Outline", "Explain Concept",
|
| 1033 |
+
"Paraphrase", "Categorize", "Key Insights",
|
| 1034 |
+
"Generate Questions", "Extract Key Info"
|
| 1035 |
+
],
|
| 1036 |
+
value="Summarize",
|
| 1037 |
+
interactive=True,
|
| 1038 |
+
info="Choose the type of analysis to perform"
|
| 1039 |
+
)
|
| 1040 |
+
|
| 1041 |
+
# Dynamic Options with Accordion
|
| 1042 |
+
with gr.Accordion("⚙️ Advanced Options", open=False):
|
| 1043 |
+
summary_style_opt = gr.Dropdown(
|
| 1044 |
+
label="Summary Style",
|
| 1045 |
+
choices=["concise", "detailed", "bullet_points", "executive"],
|
| 1046 |
+
value="concise",
|
| 1047 |
+
visible=True,
|
| 1048 |
+
info="How detailed should the summary be?"
|
| 1049 |
+
)
|
| 1050 |
+
|
| 1051 |
+
outline_sections_opt = gr.Slider(
|
| 1052 |
+
label="Number of Sections",
|
| 1053 |
+
minimum=3, maximum=10, value=5, step=1,
|
| 1054 |
+
visible=False,
|
| 1055 |
+
info="How many main sections?"
|
| 1056 |
+
)
|
| 1057 |
+
outline_detail_opt = gr.Dropdown(
|
| 1058 |
+
label="Detail Level",
|
| 1059 |
+
choices=["brief", "medium", "detailed"],
|
| 1060 |
+
value="medium",
|
| 1061 |
+
visible=False
|
| 1062 |
+
)
|
| 1063 |
+
|
| 1064 |
+
explain_audience_opt = gr.Dropdown(
|
| 1065 |
+
label="Target Audience",
|
| 1066 |
+
choices=["general", "technical", "beginner", "expert"],
|
| 1067 |
+
value="general",
|
| 1068 |
+
visible=False,
|
| 1069 |
+
info="Who is this explanation for?"
|
| 1070 |
+
)
|
| 1071 |
+
explain_length_opt = gr.Dropdown(
|
| 1072 |
+
label="Length",
|
| 1073 |
+
choices=["brief", "medium", "detailed"],
|
| 1074 |
+
value="medium",
|
| 1075 |
+
visible=False
|
| 1076 |
+
)
|
| 1077 |
+
|
| 1078 |
+
paraphrase_style_opt = gr.Dropdown(
|
| 1079 |
+
label="Style",
|
| 1080 |
+
choices=["formal", "casual", "academic", "simple", "technical"],
|
| 1081 |
+
value="formal",
|
| 1082 |
+
visible=False,
|
| 1083 |
+
info="Writing style for paraphrasing"
|
| 1084 |
+
)
|
| 1085 |
+
|
| 1086 |
+
categories_input_opt = gr.Textbox(
|
| 1087 |
+
label="Categories (comma separated)",
|
| 1088 |
+
placeholder="Technology, Business, Science...",
|
| 1089 |
+
visible=False
|
| 1090 |
+
)
|
| 1091 |
+
|
| 1092 |
+
num_items_opt = gr.Slider(
|
| 1093 |
+
label="Number of Items",
|
| 1094 |
+
minimum=1, maximum=10, value=5, step=1,
|
| 1095 |
+
visible=False
|
| 1096 |
+
)
|
| 1097 |
+
question_type_opt = gr.Dropdown(
|
| 1098 |
+
label="Question Type",
|
| 1099 |
+
choices=["comprehension", "analysis", "application", "creative", "factual"],
|
| 1100 |
+
value="comprehension",
|
| 1101 |
+
visible=False
|
| 1102 |
+
)
|
| 1103 |
+
|
| 1104 |
+
run_task_btn = gr.Button("🚀 Run Task", variant="primary", size="lg")
|
| 1105 |
+
|
| 1106 |
+
with gr.Column(scale=3):
|
| 1107 |
+
# Results with copy button and Group
|
| 1108 |
+
with gr.Group():
|
| 1109 |
+
gr.Markdown("#### 📊 Result")
|
| 1110 |
+
content_output_display = gr.Textbox(
|
| 1111 |
+
label="",
|
| 1112 |
+
lines=25,
|
| 1113 |
+
placeholder="Results will appear here...",
|
| 1114 |
+
show_copy_button=True,
|
| 1115 |
+
container=False
|
| 1116 |
+
)
|
| 1117 |
+
|
| 1118 |
+
# Event Handlers
|
| 1119 |
+
task_dropdown.change(
|
| 1120 |
+
fn=update_options_visibility,
|
| 1121 |
+
inputs=[task_dropdown],
|
| 1122 |
+
outputs=[
|
| 1123 |
+
summary_style_opt, outline_sections_opt, outline_detail_opt,
|
| 1124 |
+
explain_audience_opt, explain_length_opt, paraphrase_style_opt,
|
| 1125 |
+
categories_input_opt, num_items_opt, question_type_opt
|
| 1126 |
+
]
|
| 1127 |
+
)
|
| 1128 |
+
|
| 1129 |
+
run_task_btn.click(
|
| 1130 |
+
fn=execute_content_task,
|
| 1131 |
+
inputs=[
|
| 1132 |
+
task_dropdown, doc_dropdown_content, content_text_input,
|
| 1133 |
+
summary_style_opt, outline_sections_opt, outline_detail_opt,
|
| 1134 |
+
explain_audience_opt, explain_length_opt, paraphrase_style_opt,
|
| 1135 |
+
categories_input_opt, num_items_opt, question_type_opt
|
| 1136 |
+
],
|
| 1137 |
+
outputs=[content_output_display]
|
| 1138 |
+
)
|
| 1139 |
+
|
| 1140 |
+
with gr.Tab("🏷️ Generate Tags"):
|
| 1141 |
+
with gr.Row():
|
| 1142 |
+
with gr.Column():
|
| 1143 |
+
gr.Markdown("### Generate Document Tags")
|
| 1144 |
+
doc_dropdown_tag_visible = gr.Dropdown(label="Select Document to Tag", choices=get_document_choices(), value=None, interactive=True, allow_custom_value=False)
|
| 1145 |
+
tag_text_input = gr.Textbox(label="Or Paste Text to Generate Tags", placeholder="Paste any text here to generate tags...", lines=8)
|
| 1146 |
+
max_tags_slider = gr.Slider(label="Number of Tags", minimum=3, maximum=15, value=5, step=1)
|
| 1147 |
+
tag_btn_action = gr.Button("🏷️ Generate Tags", variant="primary", size="lg")
|
| 1148 |
+
with gr.Column():
|
| 1149 |
+
tag_output_display = gr.Textbox(label="Generated Tags", lines=10, placeholder="Tags will appear here...")
|
| 1150 |
+
|
| 1151 |
+
with gr.Tab("🎙️ Voice Assistant"):
|
| 1152 |
+
gr.Markdown("""
|
| 1153 |
+
### 🗣️ Talk to Your AI Librarian
|
| 1154 |
+
|
| 1155 |
+
Have a natural conversation about your documents. Ask questions, request summaries,
|
| 1156 |
+
or explore your content library through voice-powered interaction.
|
| 1157 |
+
|
| 1158 |
+
**Note:** Requires ElevenLabs API configuration.
|
| 1159 |
+
""")
|
| 1160 |
+
|
| 1161 |
+
with gr.Row():
|
| 1162 |
+
with gr.Column(scale=2):
|
| 1163 |
+
# Status and Controls
|
| 1164 |
+
with gr.Group():
|
| 1165 |
+
voice_status_display = gr.Textbox(
|
| 1166 |
+
label="Status",
|
| 1167 |
+
value="Ready to start",
|
| 1168 |
+
interactive=False,
|
| 1169 |
+
lines=2
|
| 1170 |
+
)
|
| 1171 |
+
|
| 1172 |
+
with gr.Row():
|
| 1173 |
+
start_voice_btn = gr.Button("🎤 Start Conversation", variant="primary", size="lg")
|
| 1174 |
+
stop_voice_btn = gr.Button("⏹️ Stop", variant="stop", size="lg", interactive=False)
|
| 1175 |
+
|
| 1176 |
+
# Message Input
|
| 1177 |
+
with gr.Group():
|
| 1178 |
+
gr.Markdown("#### 💬 Send Message")
|
| 1179 |
+
voice_input_text = gr.Textbox(
|
| 1180 |
+
label="",
|
| 1181 |
+
placeholder="Type your question...",
|
| 1182 |
+
lines=3,
|
| 1183 |
+
container=False,
|
| 1184 |
+
info="Press Enter or click Send"
|
| 1185 |
+
)
|
| 1186 |
+
send_voice_btn = gr.Button("📤 Send", variant="secondary")
|
| 1187 |
+
|
| 1188 |
+
with gr.Column(scale=3):
|
| 1189 |
+
# Chat Interface with Gradio 6 Chatbot
|
| 1190 |
+
with gr.Group():
|
| 1191 |
+
voice_chatbot = gr.Chatbot(
|
| 1192 |
+
label="Conversation",
|
| 1193 |
+
type="messages",
|
| 1194 |
+
height=500,
|
| 1195 |
+
show_copy_button=True
|
| 1196 |
+
)
|
| 1197 |
+
|
| 1198 |
+
clear_chat_btn = gr.Button("🗑️ Clear Chat", variant="secondary")
|
| 1199 |
+
|
| 1200 |
+
# Voice Assistant event handlers
|
| 1201 |
+
start_voice_btn.click(
|
| 1202 |
+
fn=start_voice_conversation,
|
| 1203 |
+
outputs=[voice_status_display, start_voice_btn, stop_voice_btn, voice_chatbot]
|
| 1204 |
+
)
|
| 1205 |
+
|
| 1206 |
+
stop_voice_btn.click(
|
| 1207 |
+
fn=stop_voice_conversation,
|
| 1208 |
+
outputs=[voice_status_display, start_voice_btn, stop_voice_btn, voice_chatbot]
|
| 1209 |
+
)
|
| 1210 |
+
|
| 1211 |
+
send_voice_btn.click(
|
| 1212 |
+
fn=send_voice_message_v6,
|
| 1213 |
+
inputs=[voice_input_text, voice_chatbot],
|
| 1214 |
+
outputs=[voice_chatbot, voice_input_text]
|
| 1215 |
+
)
|
| 1216 |
+
|
| 1217 |
+
voice_input_text.submit(
|
| 1218 |
+
fn=send_voice_message_v6,
|
| 1219 |
+
inputs=[voice_input_text, voice_chatbot],
|
| 1220 |
+
outputs=[voice_chatbot, voice_input_text]
|
| 1221 |
+
)
|
| 1222 |
+
|
| 1223 |
+
clear_chat_btn.click(
|
| 1224 |
+
fn=lambda: [],
|
| 1225 |
+
outputs=[voice_chatbot]
|
| 1226 |
+
)
|
| 1227 |
+
|
| 1228 |
+
with gr.Tab("🎧 Podcast Studio"):
|
| 1229 |
+
gr.Markdown("""
|
| 1230 |
+
### 🎙️ AI-Powered Podcast Generation
|
| 1231 |
+
|
| 1232 |
+
Transform your documents into engaging audio conversations. Select documents,
|
| 1233 |
+
customize the style and voices, and let AI create a professional podcast.
|
| 1234 |
+
|
| 1235 |
+
**Powered by:** ElevenLabs AI Voice Technology
|
| 1236 |
+
""")
|
| 1237 |
+
|
| 1238 |
+
with gr.Row():
|
| 1239 |
+
with gr.Column(scale=2):
|
| 1240 |
+
# Configuration Panel
|
| 1241 |
+
with gr.Group():
|
| 1242 |
+
gr.Markdown("#### 📚 Select Content")
|
| 1243 |
+
|
| 1244 |
+
podcast_doc_selector = gr.CheckboxGroup(
|
| 1245 |
+
choices=get_document_choices(),
|
| 1246 |
+
label="Documents to Include",
|
| 1247 |
+
info="Choose 1-5 documents for best results",
|
| 1248 |
+
interactive=True
|
| 1249 |
+
)
|
| 1250 |
+
|
| 1251 |
+
with gr.Accordion("🎨 Podcast Settings", open=True):
|
| 1252 |
+
with gr.Row():
|
| 1253 |
+
podcast_style = gr.Dropdown(
|
| 1254 |
+
label="Style",
|
| 1255 |
+
choices=["conversational", "educational", "technical", "casual"],
|
| 1256 |
+
value="conversational",
|
| 1257 |
+
info="Sets the tone and format"
|
| 1258 |
+
)
|
| 1259 |
+
|
| 1260 |
+
podcast_duration = gr.Slider(
|
| 1261 |
+
label="Duration (minutes)",
|
| 1262 |
+
minimum=5,
|
| 1263 |
+
maximum=30,
|
| 1264 |
+
value=10,
|
| 1265 |
+
step=5,
|
| 1266 |
+
info="Approximate length"
|
| 1267 |
+
)
|
| 1268 |
+
|
| 1269 |
+
gr.Markdown("#### 🗣️ Voice Selection")
|
| 1270 |
+
with gr.Row():
|
| 1271 |
+
host1_voice_selector = gr.Dropdown(
|
| 1272 |
+
label="Host 1",
|
| 1273 |
+
choices=["Rachel", "Adam", "Domi", "Bella", "Antoni", "Elli", "Josh"],
|
| 1274 |
+
value="Rachel"
|
| 1275 |
+
)
|
| 1276 |
+
host2_voice_selector = gr.Dropdown(
|
| 1277 |
+
label="Host 2",
|
| 1278 |
+
choices=["Adam", "Rachel", "Josh", "Sam", "Emily", "Antoni", "Arnold"],
|
| 1279 |
+
value="Adam"
|
| 1280 |
+
)
|
| 1281 |
+
|
| 1282 |
+
generate_podcast_btn = gr.Button(
|
| 1283 |
+
"🎙️ Generate Podcast",
|
| 1284 |
+
variant="primary",
|
| 1285 |
+
size="lg"
|
| 1286 |
+
)
|
| 1287 |
+
|
| 1288 |
+
podcast_status = gr.Textbox(
|
| 1289 |
+
label="Status",
|
| 1290 |
+
interactive=False,
|
| 1291 |
+
lines=2
|
| 1292 |
+
)
|
| 1293 |
+
|
| 1294 |
+
podcast_id_display = gr.Textbox(
|
| 1295 |
+
label="Podcast ID",
|
| 1296 |
+
interactive=False,
|
| 1297 |
+
visible=False
|
| 1298 |
+
)
|
| 1299 |
+
|
| 1300 |
+
with gr.Column(scale=3):
|
| 1301 |
+
# Output Panel
|
| 1302 |
+
with gr.Group():
|
| 1303 |
+
gr.Markdown("#### 🎵 Generated Podcast")
|
| 1304 |
+
|
| 1305 |
+
podcast_audio_player = gr.Audio(
|
| 1306 |
+
label="",
|
| 1307 |
+
type="filepath",
|
| 1308 |
+
interactive=False,
|
| 1309 |
+
autoplay=True,
|
| 1310 |
+
container=False
|
| 1311 |
+
)
|
| 1312 |
+
|
| 1313 |
+
with gr.Accordion("📝 Transcript", open=False):
|
| 1314 |
+
podcast_transcript_display = gr.Markdown(
|
| 1315 |
+
value="*Transcript will appear after generation...*"
|
| 1316 |
+
)
|
| 1317 |
+
|
| 1318 |
+
# Event handlers
|
| 1319 |
+
generate_podcast_btn.click(
|
| 1320 |
+
fn=generate_podcast_ui,
|
| 1321 |
+
inputs=[
|
| 1322 |
+
podcast_doc_selector,
|
| 1323 |
+
podcast_style,
|
| 1324 |
+
podcast_duration,
|
| 1325 |
+
host1_voice_selector,
|
| 1326 |
+
host2_voice_selector
|
| 1327 |
+
],
|
| 1328 |
+
outputs=[
|
| 1329 |
+
podcast_status,
|
| 1330 |
+
podcast_audio_player,
|
| 1331 |
+
podcast_transcript_display,
|
| 1332 |
+
podcast_id_display
|
| 1333 |
+
]
|
| 1334 |
+
)
|
| 1335 |
+
|
| 1336 |
+
with gr.Tab("❓ Ask Questions"):
|
| 1337 |
+
with gr.Row():
|
| 1338 |
+
with gr.Column():
|
| 1339 |
+
gr.Markdown("""### Ask Questions About Your Documents
|
| 1340 |
+
The AI will search through all your uploaded documents to find relevant information
|
| 1341 |
+
and provide comprehensive answers with sources.""")
|
| 1342 |
+
qa_question_input = gr.Textbox(label="Your Question", placeholder="Ask anything about your documents...", lines=3)
|
| 1343 |
+
qa_btn_action = gr.Button("❓ Get Answer", variant="primary", size="lg")
|
| 1344 |
+
with gr.Column():
|
| 1345 |
+
qa_output_display = gr.Textbox(label="AI Answer", lines=20, placeholder="Answer will appear here with sources...")
|
| 1346 |
+
|
| 1347 |
+
all_dropdowns_to_update = [delete_doc_dropdown_visible, doc_dropdown_content, doc_dropdown_tag_visible]
|
| 1348 |
+
|
| 1349 |
+
refresh_outputs = [document_list_display] + [dd for dd in all_dropdowns_to_update]
|
| 1350 |
+
refresh_btn_library.click(fn=refresh_library, outputs=refresh_outputs)
|
| 1351 |
+
|
| 1352 |
+
upload_outputs = [upload_output_display, doc_id_output_display, document_list_display] + [dd for dd in all_dropdowns_to_update]
|
| 1353 |
+
upload_btn_process.click(upload_and_process_file, inputs=[file_input_upload], outputs=upload_outputs)
|
| 1354 |
+
|
| 1355 |
+
delete_outputs = [delete_output_display, document_list_display] + [dd for dd in all_dropdowns_to_update]
|
| 1356 |
+
delete_btn.click(delete_document_from_library, inputs=[delete_doc_dropdown_visible], outputs=delete_outputs)
|
| 1357 |
+
|
| 1358 |
+
search_btn_action.click(perform_search, inputs=[search_query_input, search_top_k_slider], outputs=[search_output_display])
|
| 1359 |
+
tag_btn_action.click(generate_tags_for_document, inputs=[doc_dropdown_tag_visible, tag_text_input, max_tags_slider], outputs=[tag_output_display])
|
| 1360 |
+
qa_btn_action.click(ask_question, inputs=[qa_question_input], outputs=[qa_output_display])
|
| 1361 |
+
|
| 1362 |
+
|
| 1363 |
+
# Load dashboard stats on interface load
|
| 1364 |
+
interface.load(
|
| 1365 |
+
fn=load_dashboard_stats,
|
| 1366 |
+
outputs=[total_docs, total_chunks, storage_size, recent_docs, vector_status, llm_status, voice_status]
|
| 1367 |
+
)
|
| 1368 |
+
|
| 1369 |
+
interface.load(fn=refresh_library, outputs=refresh_outputs)
|
| 1370 |
+
return interface
|
| 1371 |
+
|
| 1372 |
+
if __name__ == "__main__":
|
| 1373 |
+
gradio_interface = create_gradio_interface()
|
| 1374 |
+
gradio_interface.launch(mcp_server=True)
|
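A note on the entry point above: `launch(mcp_server=True)` asks Gradio to expose the app's functions over the Model Context Protocol alongside the web UI. A minimal sketch of the same pattern (not part of this commit, and assuming a Gradio version with MCP support, e.g. installed via `pip install "gradio[mcp]"`); the `echo` function is purely illustrative:

import gradio as gr

def echo(text: str) -> str:
    """Type hints and this docstring feed the generated MCP tool schema."""
    return text

demo = gr.Interface(fn=echo, inputs="text", outputs="text")
demo.launch(mcp_server=True)  # serves the web UI and an MCP endpoint together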
config.py
ADDED
@@ -0,0 +1,56 @@
import os
from typing import Optional
from dotenv import load_dotenv

load_dotenv()


class Config:
    # API Keys
    NEBIUS_API_KEY: Optional[str] = os.getenv("NEBIUS_API_KEY")
    MISTRAL_API_KEY: Optional[str] = os.getenv("MISTRAL_API_KEY")
    HUGGINGFACE_API_KEY: Optional[str] = os.getenv("HUGGINGFACE_API_KEY", os.getenv("HF_TOKEN"))
    OPENAI_API_KEY: Optional[str] = os.getenv("OPENAI_API_KEY")
    ANTHROPIC_API_KEY: Optional[str] = os.getenv("ANTHROPIC_API_KEY")

    # NEBIUS Configuration (OpenAI OSS models)
    NEBIUS_BASE_URL: str = os.getenv("NEBIUS_BASE_URL", "https://api.studio.nebius.com/v1/")
    NEBIUS_MODEL: str = os.getenv("NEBIUS_MODEL", "meta-llama/Llama-3.3-70B-Instruct")

    # Model Configuration
    # Using OpenAI managed embeddings for performance/quality
    EMBEDDING_MODEL: str = os.getenv("EMBEDDING_MODEL", "text-embedding-3-small")

    MISTRAL_MODEL: str = os.getenv("MISTRAL_MODEL", "mistral-large-2407")
    OPENAI_MODEL: str = os.getenv("OPENAI_MODEL", "gpt-5.1-chat-latest")
    FAST_MODEL: str = os.getenv("FAST_MODEL", "gpt-5-mini")

    # Vector Store Configuration
    DATA_DIR: str = os.getenv("DATA_DIR", "./data")
    VECTOR_STORE_PATH: str = os.getenv("VECTOR_STORE_PATH", "./data/vector_store")
    DOCUMENT_STORE_PATH: str = os.getenv("DOCUMENT_STORE_PATH", "./data/documents")
    INDEX_NAME: str = os.getenv("INDEX_NAME", "content_index")

    # Processing Configuration
    CHUNK_SIZE: int = int(os.getenv("CHUNK_SIZE", "500"))
    CHUNK_OVERLAP: int = int(os.getenv("CHUNK_OVERLAP", "50"))
    MAX_CONCURRENT_REQUESTS: int = int(os.getenv("MAX_CONCURRENT_REQUESTS", "5"))
    # Search Configuration
    DEFAULT_TOP_K: int = int(os.getenv("DEFAULT_TOP_K", "5"))
    SIMILARITY_THRESHOLD: float = float(os.getenv("SIMILARITY_THRESHOLD", "0.3"))

    # OCR Configuration
    TESSERACT_PATH: Optional[str] = os.getenv("TESSERACT_PATH")
    OCR_LANGUAGE: str = os.getenv("OCR_LANGUAGE", "eng")

    # ElevenLabs Configuration
    ELEVENLABS_API_KEY: Optional[str] = os.getenv("ELEVENLABS_API_KEY")
    ELEVENLABS_AGENT_ID: Optional[str] = os.getenv("ELEVENLABS_AGENT_ID")
    ELEVENLABS_VOICE_MODEL: str = os.getenv("ELEVENLABS_VOICE_MODEL", "Rachel")

    # App Configuration
    HOST: str = os.getenv("HOST", "0.0.0.0")
    PORT: int = int(os.getenv("PORT", "7860"))
    DEBUG: bool = os.getenv("DEBUG", "False").lower() == "true"

config = Config()
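A quick usage sketch (not part of the commit): because `load_dotenv()` runs at import time, values from a local `.env` file or the process environment override the defaults above, and the module-level `config` instance is what the rest of the codebase imports.

from config import config

print(config.NEBIUS_MODEL)  # env var NEBIUS_MODEL if set, otherwise the default
print(config.CHUNK_SIZE)    # numeric fields are cast from strings, so "500" -> 500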
core/__init__.py
ADDED
@@ -0,0 +1 @@
# Core module initialization
core/__pycache__/__init__.cpython-313.pyc
ADDED
Binary file (145 Bytes)

core/__pycache__/chunker.cpython-313.pyc
ADDED
Binary file (11.1 kB)

core/__pycache__/document_parser.cpython-313.pyc
ADDED
Binary file (10.5 kB)

core/__pycache__/models.cpython-313.pyc
ADDED
Binary file (7.06 kB)

core/__pycache__/text_preprocessor.cpython-313.pyc
ADDED
Binary file (9.25 kB)
core/chunker.py
ADDED
@@ -0,0 +1,303 @@
# chunker.py
import logging
from typing import List, Dict, Any, Optional
import re
from .models import Chunk
from .text_preprocessor import TextPreprocessor
import config

logger = logging.getLogger(__name__)

class TextChunker:
    def __init__(self):
        self.config = config.config
        self.preprocessor = TextPreprocessor()

        self.chunk_size = self.config.CHUNK_SIZE
        self.chunk_overlap = self.config.CHUNK_OVERLAP

    def chunk_document(self, document_id: str, content: str, method: str = "recursive") -> List[Chunk]:
        """Chunk a document using the specified method"""
        if not content:
            return []

        try:
            if method == "recursive":
                return self._recursive_chunk(document_id, content)
            elif method == "sentence":
                return self._sentence_chunk(document_id, content)
            elif method == "paragraph":
                return self._paragraph_chunk(document_id, content)
            elif method == "fixed":
                return self._fixed_chunk(document_id, content)
            else:
                logger.warning(f"Unknown chunking method: {method}, using recursive")
                return self._recursive_chunk(document_id, content)
        except Exception as e:
            logger.error(f"Error chunking document: {str(e)}")
            # Fallback to simple fixed chunking
            return self._fixed_chunk(document_id, content)

    def _recursive_chunk(self, document_id: str, content: str) -> List[Chunk]:
        """Recursively split text by different separators"""
        chunks = []

        # Define separators in order of preference
        separators = [
            "\n\n",  # Paragraphs
            "\n",    # Lines
            ". ",    # Sentences
            ", ",    # Clauses
            " "      # Words
        ]

        def split_text(text: str, separators: List[str], chunk_size: int) -> List[str]:
            if len(text) <= chunk_size:
                return [text] if text.strip() else []

            if not separators:
                # If no separators left, split by character
                return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]

            separator = separators[0]
            remaining_separators = separators[1:]

            splits = text.split(separator)
            result = []
            current_chunk = ""

            for split in splits:
                if len(current_chunk) + len(split) + len(separator) <= chunk_size:
                    if current_chunk:
                        current_chunk += separator + split
                    else:
                        current_chunk = split
                else:
                    if current_chunk:
                        result.append(current_chunk)

                    if len(split) > chunk_size:
                        # Split is too big, need to split further
                        result.extend(split_text(split, remaining_separators, chunk_size))
                        current_chunk = ""
                    else:
                        current_chunk = split

            if current_chunk:
                result.append(current_chunk)

            return result

        text_chunks = split_text(content, separators, self.chunk_size)

        # Create chunk objects with overlap
        for i, chunk_text in enumerate(text_chunks):
            if not chunk_text.strip():
                continue

            # Calculate positions
            start_pos = content.find(chunk_text)
            if start_pos == -1:
                start_pos = i * self.chunk_size
            end_pos = start_pos + len(chunk_text)

            # Add overlap from previous chunk if not the first chunk
            if i > 0 and self.chunk_overlap > 0:
                prev_chunk = text_chunks[i-1]
                overlap_text = prev_chunk[-self.chunk_overlap:] if len(prev_chunk) > self.chunk_overlap else prev_chunk
                chunk_text = overlap_text + " " + chunk_text

            chunk = Chunk(
                id=self._generate_chunk_id(document_id, i),
                document_id=document_id,
                content=chunk_text.strip(),
                chunk_index=i,
                start_pos=start_pos,
                end_pos=end_pos,
                metadata={
                    "chunk_method": "recursive",
                    "original_length": len(chunk_text),
                    "word_count": len(chunk_text.split())
                }
            )
            chunks.append(chunk)

        return chunks

    def _sentence_chunk(self, document_id: str, content: str) -> List[Chunk]:
        """Chunk text by sentences"""
        chunks = []
        sentences = self.preprocessor.extract_sentences(content)

        current_chunk = ""
        chunk_index = 0
        start_pos = 0

        for sentence in sentences:
            if len(current_chunk) + len(sentence) <= self.chunk_size:
                if current_chunk:
                    current_chunk += " " + sentence
                else:
                    current_chunk = sentence
                    start_pos = content.find(sentence)
            else:
                if current_chunk:
                    chunk = Chunk(
                        id=self._generate_chunk_id(document_id, chunk_index),
                        document_id=document_id,
                        content=current_chunk.strip(),
                        chunk_index=chunk_index,
                        start_pos=start_pos,
                        end_pos=start_pos + len(current_chunk),
                        metadata={
                            "chunk_method": "sentence",
                            "sentence_count": len(self.preprocessor.extract_sentences(current_chunk))
                        }
                    )
                    chunks.append(chunk)
                    chunk_index += 1

                current_chunk = sentence
                start_pos = content.find(sentence)

        # Add final chunk
        if current_chunk:
            chunk = Chunk(
                id=self._generate_chunk_id(document_id, chunk_index),
                document_id=document_id,
                content=current_chunk.strip(),
                chunk_index=chunk_index,
                start_pos=start_pos,
                end_pos=start_pos + len(current_chunk),
                metadata={
                    "chunk_method": "sentence",
                    "sentence_count": len(self.preprocessor.extract_sentences(current_chunk))
                }
            )
            chunks.append(chunk)

        return chunks

    def _paragraph_chunk(self, document_id: str, content: str) -> List[Chunk]:
        """Chunk text by paragraphs"""
        chunks = []
        paragraphs = [p.strip() for p in content.split('\n\n') if p.strip()]

        current_chunk = ""
        chunk_index = 0
        start_pos = 0

        for paragraph in paragraphs:
            if len(current_chunk) + len(paragraph) <= self.chunk_size:
                if current_chunk:
                    current_chunk += "\n\n" + paragraph
                else:
                    current_chunk = paragraph
                    start_pos = content.find(paragraph)
            else:
                if current_chunk:
                    chunk = Chunk(
                        id=self._generate_chunk_id(document_id, chunk_index),
                        document_id=document_id,
                        content=current_chunk.strip(),
                        chunk_index=chunk_index,
                        start_pos=start_pos,
                        end_pos=start_pos + len(current_chunk),
                        metadata={
                            "chunk_method": "paragraph",
                            "paragraph_count": len([p for p in current_chunk.split('\n\n') if p.strip()])
                        }
                    )
                    chunks.append(chunk)
                    chunk_index += 1

                # If paragraph is too long, split it further
                if len(paragraph) > self.chunk_size:
                    para_chunks = self._fixed_chunk(document_id, paragraph)
                    for pc in para_chunks:
                        pc.chunk_index = chunk_index
                        pc.id = self._generate_chunk_id(document_id, chunk_index)
                        chunks.append(pc)
                        chunk_index += 1
                    current_chunk = ""  # reset so the already-handled text is not appended again as a final chunk
                else:
                    current_chunk = paragraph
                    start_pos = content.find(paragraph)

        # Add final chunk
        if current_chunk:
            chunk = Chunk(
                id=self._generate_chunk_id(document_id, chunk_index),
                document_id=document_id,
                content=current_chunk.strip(),
                chunk_index=chunk_index,
                start_pos=start_pos,
                end_pos=start_pos + len(current_chunk),
                metadata={
                    "chunk_method": "paragraph",
                    "paragraph_count": len([p for p in current_chunk.split('\n\n') if p.strip()])
                }
            )
            chunks.append(chunk)

        return chunks

    def _fixed_chunk(self, document_id: str, content: str) -> List[Chunk]:
        """Simple fixed-size chunking with overlap"""
        chunks = []

        for i in range(0, len(content), self.chunk_size - self.chunk_overlap):
            chunk_text = content[i:i + self.chunk_size]

            if not chunk_text.strip():
                continue

            chunk = Chunk(
                id=self._generate_chunk_id(document_id, len(chunks)),
                document_id=document_id,
                content=chunk_text.strip(),
                chunk_index=len(chunks),
                start_pos=i,
                end_pos=min(i + self.chunk_size, len(content)),
                metadata={
                    "chunk_method": "fixed",
                    "original_length": len(chunk_text)
                }
            )
            chunks.append(chunk)

        return chunks

    def _generate_chunk_id(self, document_id: str, chunk_index: int) -> str:
        """Generate a unique chunk ID"""
        return f"{document_id}_chunk_{chunk_index}"

    def optimize_chunks_for_embedding(self, chunks: List[Chunk]) -> List[Chunk]:
        """Optimize chunks for better embedding generation"""
        optimized_chunks = []

        for chunk in chunks:
            # Clean the content for embedding
            clean_content = self.preprocessor.prepare_for_embedding(chunk.content)

            # Skip very short chunks
            if len(clean_content.split()) < 5:
                continue

            # Update chunk with optimized content
            optimized_chunk = Chunk(
                id=chunk.id,
                document_id=chunk.document_id,
                content=clean_content,
                chunk_index=chunk.chunk_index,
                start_pos=chunk.start_pos,
                end_pos=chunk.end_pos,
                metadata={
                    **chunk.metadata,
                    "optimized_for_embedding": True,
                    "original_content_length": len(chunk.content),
                    "optimized_content_length": len(clean_content)
                }
            )
            optimized_chunks.append(optimized_chunk)

        return optimized_chunks
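A driving sketch (not part of the commit) showing how TextChunker is typically used; the sample text is hypothetical:

from core.chunker import TextChunker

chunker = TextChunker()
text = "First paragraph of a document.\n\nSecond paragraph with a few more words in it."
chunks = chunker.chunk_document("doc-123", text, method="recursive")
chunks = chunker.optimize_chunks_for_embedding(chunks)  # drops chunks under 5 words
for c in chunks:
    print(c.chunk_index, c.metadata["chunk_method"], len(c.content))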
core/document_parser.py
ADDED
@@ -0,0 +1,199 @@
import logging
import tempfile
import os
from pathlib import Path
from typing import Optional, Dict, Any
import asyncio

# Document processing libraries
import PyPDF2
from docx import Document as DocxDocument
from PIL import Image
import pytesseract

from .models import Document, DocumentType
import config

logger = logging.getLogger(__name__)

class DocumentParser:
    def __init__(self):
        self.config = config.config

    async def parse_document(self, file_path: str, filename: str) -> Document:
        """Parse a document and extract its content"""
        try:
            file_ext = Path(filename).suffix.lower()
            file_size = os.path.getsize(file_path)

            # Determine document type and parse accordingly
            if file_ext == '.pdf':
                content = await self._parse_pdf(file_path)
                doc_type = DocumentType.PDF
            elif file_ext == '.txt':
                content = await self._parse_text(file_path)
                doc_type = DocumentType.TEXT
            elif file_ext == '.docx':
                content = await self._parse_docx(file_path)
                doc_type = DocumentType.DOCX
            elif file_ext in ['.png', '.jpg', '.jpeg', '.bmp', '.tiff']:
                content = await self._parse_image(file_path)
                doc_type = DocumentType.IMAGE
            else:
                raise ValueError(f"Unsupported file type: {file_ext}")

            # Create document object
            document = Document(
                id=self._generate_document_id(),
                filename=filename,
                content=content,
                doc_type=doc_type,
                file_size=file_size,
                metadata={
                    "file_extension": file_ext,
                    "content_length": len(content),
                    "word_count": len(content.split()) if content else 0
                }
            )

            logger.info(f"Successfully parsed document: {filename}")
            return document

        except Exception as e:
            logger.error(f"Error parsing document {filename}: {str(e)}")
            raise

    async def _parse_pdf(self, file_path: str) -> str:
        """Extract text from PDF file"""
        try:
            content = ""
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                for page_num, page in enumerate(pdf_reader.pages):
                    try:
                        page_text = page.extract_text()
                        if page_text.strip():
                            content += f"\n--- Page {page_num + 1} ---\n"
                            content += page_text + "\n"
                    except Exception as e:
                        logger.warning(f"Error extracting text from page {page_num + 1}: {str(e)}")
                        continue

            return content.strip()
        except Exception as e:
            logger.error(f"Error parsing PDF: {str(e)}")
            raise

    async def _parse_text(self, file_path: str) -> str:
        """Read plain text file"""
        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
                content = file.read()
            return content.strip()
        except Exception as e:
            logger.error(f"Error parsing text file: {str(e)}")
            raise

    async def _parse_docx(self, file_path: str) -> str:
        """Extract text from DOCX file"""
        try:
            doc = DocxDocument(file_path)
            content = ""

            for paragraph in doc.paragraphs:
                if paragraph.text.strip():
                    content += paragraph.text + "\n"

            # Extract text from tables
            for table in doc.tables:
                for row in table.rows:
                    row_text = []
                    for cell in row.cells:
                        if cell.text.strip():
                            row_text.append(cell.text.strip())
                    if row_text:
                        content += " | ".join(row_text) + "\n"

            return content.strip()
        except Exception as e:
            logger.error(f"Error parsing DOCX file: {str(e)}")
            raise

    async def _parse_image(self, file_path: str) -> str:
        """Extract text from image using OCR"""
        try:
            # First try with OCR service if available
            if hasattr(self, 'ocr_service') and self.ocr_service:
                logger.info(f"Using OCR service for image: {file_path}")
                text = await self.ocr_service.extract_text_from_image(file_path)
                if text:
                    return text

            # Fallback to direct pytesseract
            logger.info(f"Using direct pytesseract for image: {file_path}")
            image = Image.open(file_path)

            # Perform OCR
            content = pytesseract.image_to_string(
                image,
                lang=self.config.OCR_LANGUAGE,
                config='--psm 6'  # Assume a single uniform block of text
            )

            return content.strip()
        except Exception as e:
            logger.error(f"Error performing OCR on image: {str(e)}")
            # Return empty string if OCR fails
            return ""

    def _generate_document_id(self) -> str:
        """Generate a unique document ID"""
        import uuid
        return str(uuid.uuid4())

    async def extract_metadata(self, file_path: str, content: str) -> Dict[str, Any]:
        """Extract additional metadata from the document"""
        try:
            metadata = {}

            # Basic statistics
            metadata["content_length"] = len(content)
            metadata["word_count"] = len(content.split()) if content else 0
            metadata["line_count"] = len(content.splitlines()) if content else 0

            # File information
            file_stat = os.stat(file_path)
            metadata["file_size"] = file_stat.st_size
            metadata["created_time"] = file_stat.st_ctime
            metadata["modified_time"] = file_stat.st_mtime

            # Content analysis
            if content:
                # Language detection (simple heuristic)
                metadata["estimated_language"] = self._detect_language(content)

                # Reading time estimation (average 200 words per minute)
                metadata["estimated_reading_time_minutes"] = max(1, metadata["word_count"] // 200)

            return metadata
        except Exception as e:
            logger.error(f"Error extracting metadata: {str(e)}")
            return {}

    def _detect_language(self, content: str) -> str:
        """Simple language detection based on character patterns"""
        # This is a very basic implementation
        # In production, you might want to use a proper language detection library
        if not content:
            return "unknown"

        # Count common English words
        english_words = ["the", "and", "or", "but", "in", "on", "at", "to", "for", "of", "with", "by", "from", "as", "is", "was", "are", "were", "be", "been", "have", "has", "had", "do", "does", "did", "will", "would", "could", "should", "may", "might", "can", "this", "that", "these", "those"]

        words = content.lower().split()
        english_count = sum(1 for word in words if word in english_words)

        if len(words) > 0 and english_count / len(words) > 0.1:
            return "en"
        else:
            return "unknown"
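A usage sketch (not part of the commit): the parser API is async, so callers drive it with asyncio; the file path below is hypothetical.

import asyncio
from core.document_parser import DocumentParser

async def main():
    parser = DocumentParser()
    doc = await parser.parse_document("/tmp/report.pdf", "report.pdf")
    print(doc.id, doc.doc_type, doc.metadata["word_count"])

asyncio.run(main())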
core/models.py
ADDED
@@ -0,0 +1,102 @@
from pydantic import BaseModel, Field
from typing import List, Optional, Dict, Any
from datetime import datetime
from enum import Enum

class DocumentType(str, Enum):
    PDF = "pdf"
    TEXT = "txt"
    DOCX = "docx"
    IMAGE = "image"
    HTML = "html"

class ProcessingStatus(str, Enum):
    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"

class Document(BaseModel):
    id: str = Field(..., description="Unique document identifier")
    filename: str = Field(..., description="Original filename")
    content: str = Field(..., description="Extracted text content")
    doc_type: DocumentType = Field(..., description="Document type")
    file_size: int = Field(..., description="File size in bytes")
    created_at: datetime = Field(default_factory=datetime.utcnow)
    metadata: Dict[str, Any] = Field(default_factory=dict)
    tags: List[str] = Field(default_factory=list)
    summary: Optional[str] = None
    category: Optional[str] = None
    language: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        return {
            "id": self.id,
            "filename": self.filename,
            "content": self.content[:500] + "..." if len(self.content) > 500 else self.content,
            "doc_type": self.doc_type,
            "file_size": self.file_size,
            "created_at": self.created_at.isoformat(),
            "metadata": self.metadata,
            "tags": self.tags,
            "summary": self.summary,
            "category": self.category,
            "language": self.language
        }

class Chunk(BaseModel):
    id: str = Field(..., description="Unique chunk identifier")
    document_id: str = Field(..., description="Parent document ID")
    content: str = Field(..., description="Chunk text content")
    chunk_index: int = Field(..., description="Position in document")
    start_pos: int = Field(..., description="Start position in original document")
    end_pos: int = Field(..., description="End position in original document")
    embedding: Optional[List[float]] = None
    metadata: Dict[str, Any] = Field(default_factory=dict)

class SearchResult(BaseModel):
    chunk_id: str = Field(..., description="Matching chunk ID")
    document_id: str = Field(..., description="Source document ID")
    content: str = Field(..., description="Matching content")
    score: float = Field(..., description="Similarity score")
    metadata: Dict[str, Any] = Field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        return {
            "chunk_id": self.chunk_id,
            "document_id": self.document_id,
            "content": self.content,
            "score": self.score,
            "metadata": self.metadata
        }

class ProcessingTask(BaseModel):
    task_id: str = Field(..., description="Unique task identifier")
    document_id: Optional[str] = None
    status: ProcessingStatus = ProcessingStatus.PENDING
    progress: float = Field(default=0.0, ge=0.0, le=100.0)
    message: Optional[str] = None
    error: Optional[str] = None
    created_at: datetime = Field(default_factory=datetime.utcnow)
    updated_at: datetime = Field(default_factory=datetime.utcnow)

class SummaryRequest(BaseModel):
    content: Optional[str] = None
    document_id: Optional[str] = None
    style: str = Field(default="concise", description="Summary style")
    max_length: Optional[int] = None

class TagGenerationRequest(BaseModel):
    content: Optional[str] = None
    document_id: Optional[str] = None
    max_tags: int = Field(default=5, ge=1, le=20)

class QuestionAnswerRequest(BaseModel):
    question: str = Field(..., description="Question to answer")
    context_filter: Optional[Dict[str, Any]] = None
    max_context_length: int = Field(default=2000)

class CategorizationRequest(BaseModel):
    content: Optional[str] = None
    document_id: Optional[str] = None
    categories: Optional[List[str]] = None
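A construction sketch (not part of the commit): the Pydantic models validate on creation, and Document.to_dict() truncates content to 500 characters for display; all values below are hypothetical.

from core.models import Document, DocumentType

doc = Document(
    id="doc-123",
    filename="report.pdf",
    content="Extracted text of the report...",
    doc_type=DocumentType.PDF,
    file_size=2048,
)
print(doc.to_dict()["created_at"])  # ISO-8601 string from the datetime field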
core/text_preprocessor.py
ADDED
|
@@ -0,0 +1,186 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import logging
|
| 3 |
+
from typing import List, Optional
|
| 4 |
+
import unicodedata
|
| 5 |
+
|
| 6 |
+
logger = logging.getLogger(__name__)
|
| 7 |
+
|
| 8 |
+
class TextPreprocessor:
|
| 9 |
+
def __init__(self):
|
| 10 |
+
# Common stop words for basic filtering
|
| 11 |
+
self.stop_words = {
|
| 12 |
+
'en': set([
|
| 13 |
+
'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
|
| 14 |
+
'of', 'with', 'by', 'from', 'up', 'about', 'into', 'through', 'during',
|
| 15 |
+
'before', 'after', 'above', 'below', 'between', 'among', 'throughout',
|
| 16 |
+
'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
|
| 17 |
+
'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might',
|
| 18 |
+
'must', 'shall', 'can', 'this', 'that', 'these', 'those', 'i', 'me',
|
| 19 |
+
'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours'
|
| 20 |
+
])
|
| 21 |
+
}
|
| 22 |
+
|
| 23 |
+
def clean_text(self, text: str, aggressive: bool = False) -> str:
|
| 24 |
+
"""Clean and normalize text"""
|
| 25 |
+
if not text:
|
| 26 |
+
return ""
|
| 27 |
+
|
| 28 |
+
try:
|
| 29 |
+
# Normalize unicode characters
|
| 30 |
+
text = unicodedata.normalize('NFKD', text)
|
| 31 |
+
|
| 32 |
+
# Remove excessive whitespace
|
| 33 |
+
text = re.sub(r'\s+', ' ', text)
|
| 34 |
+
|
| 35 |
+
# Remove or replace special characters
|
| 36 |
+
if aggressive:
|
| 37 |
+
# More aggressive cleaning for embedding
|
| 38 |
+
text = re.sub(r'[^\w\s\-.,!?;:]', ' ', text)
|
| 39 |
+
text = re.sub(r'[.,!?;:]+', '.', text)
|
| 40 |
+
else:
|
| 41 |
+
# Basic cleaning for readability
|
| 42 |
+
text = re.sub(r'[^\w\s\-.,!?;:()\[\]{}"\']', ' ', text)
|
| 43 |
+
|
| 44 |
+
# Remove excessive punctuation
|
| 45 |
+
text = re.sub(r'\.{2,}', '.', text)
|
| 46 |
+
text = re.sub(r'[!?]{2,}', '!', text)
|
| 47 |
+
|
| 48 |
+
# Clean up whitespace again
|
| 49 |
+
text = re.sub(r'\s+', ' ', text)
|
| 50 |
+
|
| 51 |
+
# Remove leading/trailing whitespace
|
| 52 |
+
text = text.strip()
|
| 53 |
+
|
| 54 |
+
return text
|
| 55 |
+
except Exception as e:
|
| 56 |
+
logger.error(f"Error cleaning text: {str(e)}")
|
| 57 |
+
return text
|
| 58 |
+
|
| 59 |
+
def extract_sentences(self, text: str) -> List[str]:
|
| 60 |
+
"""Extract sentences from text"""
|
| 61 |
+
if not text:
|
| 62 |
+
return []
|
| 63 |
+
|
| 64 |
+
try:
|
| 65 |
+
# Simple sentence splitting
|
| 66 |
+
            sentences = re.split(r'[.!?]+', text)

            # Clean and filter sentences
            clean_sentences = []
            for sentence in sentences:
                sentence = sentence.strip()
                if len(sentence) > 10:  # Minimum sentence length
                    clean_sentences.append(sentence)

            return clean_sentences
        except Exception as e:
            logger.error(f"Error extracting sentences: {str(e)}")
            return [text]

    def extract_keywords(self, text: str, language: str = 'en', max_keywords: int = 20) -> List[str]:
        """Extract potential keywords from text"""
        if not text:
            return []

        try:
            # Convert to lowercase and split into words
            words = re.findall(r'\b[a-zA-Z]{3,}\b', text.lower())

            # Remove stop words
            stop_words = self.stop_words.get(language, set())
            keywords = [word for word in words if word not in stop_words]

            # Count word frequency
            word_freq = {}
            for word in keywords:
                word_freq[word] = word_freq.get(word, 0) + 1

            # Sort by frequency and return top keywords
            sorted_keywords = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)

            return [word for word, freq in sorted_keywords[:max_keywords]]
        except Exception as e:
            logger.error(f"Error extracting keywords: {str(e)}")
            return []

    def prepare_for_embedding(self, text: str) -> str:
        """Prepare text specifically for embedding generation"""
        if not text:
            return ""

        try:
            # Clean text aggressively for better embeddings
            clean_text = self.clean_text(text, aggressive=True)

            # Remove very short words
            words = clean_text.split()
            filtered_words = [word for word in words if len(word) >= 2]

            # Rejoin and ensure reasonable length
            result = ' '.join(filtered_words)

            # Truncate if too long (most embedding models have token limits)
            if len(result) > 5000:  # Rough character limit
                result = result[:5000] + "..."

            return result
        except Exception as e:
            logger.error(f"Error preparing text for embedding: {str(e)}")
            return text

    def extract_metadata_from_text(self, text: str) -> dict:
        """Extract metadata from text content"""
        if not text:
            return {}

        try:
            metadata = {}

            # Basic statistics
            metadata['character_count'] = len(text)
            metadata['word_count'] = len(text.split())
            metadata['sentence_count'] = len(self.extract_sentences(text))
            metadata['paragraph_count'] = len([p for p in text.split('\n\n') if p.strip()])

            # Content characteristics
            metadata['avg_word_length'] = sum(len(word) for word in text.split()) / max(1, len(text.split()))
            metadata['avg_sentence_length'] = metadata['word_count'] / max(1, metadata['sentence_count'])

            # Special content detection
            metadata['has_urls'] = bool(re.search(r'https?://\S+', text))
            metadata['has_emails'] = bool(re.search(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', text))
            metadata['has_phone_numbers'] = bool(re.search(r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b', text))
            metadata['has_dates'] = bool(re.search(r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b', text))
            metadata['has_numbers'] = bool(re.search(r'\b\d+\b', text))

            # Language indicators
            metadata['punctuation_density'] = len(re.findall(r'[.,!?;:]', text)) / max(1, len(text))
            metadata['caps_ratio'] = len(re.findall(r'[A-Z]', text)) / max(1, len(text))

            return metadata
        except Exception as e:
            logger.error(f"Error extracting text metadata: {str(e)}")
            return {}

    def normalize_for_search(self, text: str) -> str:
        """Normalize text for search queries"""
        if not text:
            return ""

        try:
            # Convert to lowercase
            text = text.lower()

            # Remove special characters but keep spaces
            text = re.sub(r'[^\w\s]', ' ', text)

            # Normalize whitespace
            text = re.sub(r'\s+', ' ', text)

            # Strip leading/trailing whitespace
            text = text.strip()

            return text
        except Exception as e:
            logger.error(f"Error normalizing text for search: {str(e)}")
            return text
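For orientation, a small usage sketch of the helpers above (illustrative only, not part of this commit; it assumes TextPreprocessor constructs with no arguments, as the ingestion tool later in this commit does):

# Illustrative sketch: exercising the preprocessor helpers above.
from core.text_preprocessor import TextPreprocessor

pre = TextPreprocessor()
text = "Contact us at info@example.com. Vector embeddings power semantic search!"

print(pre.extract_keywords(text, max_keywords=5))           # most frequent non-stop-words
print(pre.extract_metadata_from_text(text)["has_emails"])   # True
print(pre.normalize_for_search("Semantic-Search, NOW!"))    # "semantic search now"
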
mcp_server.py
ADDED
@@ -0,0 +1,290 @@
import asyncio
import logging
from typing import Dict, Any, List, Optional
from pathlib import Path

from mcp.server.fastmcp import FastMCP

from services.vector_store_service import VectorStoreService
from services.document_store_service import DocumentStoreService
from services.embedding_service import EmbeddingService
from services.llm_service import LLMService
from services.ocr_service import OCRService

from mcp_tools.ingestion_tool import IngestionTool
from mcp_tools.search_tool import SearchTool
from mcp_tools.generative_tool import GenerativeTool

# Phase 2 & 3: Voice and Podcast
from services.llamaindex_service import LlamaIndexService
from services.elevenlabs_service import ElevenLabsService
from services.podcast_generator_service import PodcastGeneratorService
from mcp_tools.voice_tool import VoiceTool
from mcp_tools.podcast_tool import PodcastTool

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

logger.info("Initializing services for FastMCP...")
vector_store_service = VectorStoreService()
document_store_service = DocumentStoreService()
embedding_service_instance = EmbeddingService()
llm_service_instance = LLMService()
ocr_service_instance = OCRService()

ingestion_tool_instance = IngestionTool(
    vector_store=vector_store_service,
    document_store=document_store_service,
    embedding_service=embedding_service_instance,
    ocr_service=ocr_service_instance
)
search_tool_instance = SearchTool(
    vector_store=vector_store_service,
    embedding_service=embedding_service_instance,
    document_store=document_store_service
)
generative_tool_instance = GenerativeTool(
    llm_service=llm_service_instance,
    search_tool=search_tool_instance
)

# Phase 2 & 3 Services
logger.info("Initializing Phase 2 & 3 services...")
llamaindex_service_instance = LlamaIndexService(document_store_service)
elevenlabs_service_instance = ElevenLabsService(llamaindex_service_instance)
podcast_generator_instance = PodcastGeneratorService(
    llamaindex_service=llamaindex_service_instance,
    llm_service=llm_service_instance
)

voice_tool_instance = VoiceTool(elevenlabs_service_instance)
podcast_tool_instance = PodcastTool(podcast_generator_instance)

mcp = FastMCP("")
logger.info("FastMCP server initialized.")

@mcp.tool()
async def ingest_document(file_path: str, file_type: Optional[str] = None) -> Dict[str, Any]:
    """
    Process and index a document from a local file path for searching.
    Automatically determines file_type if not provided.
    """
    logger.info(f"Tool 'ingest_document' called with file_path: {file_path}, file_type: {file_type}")
    try:
        actual_file_type = file_type
        if not actual_file_type:
            actual_file_type = Path(file_path).suffix.lower().strip('.')
            logger.info(f"Inferred file_type: {actual_file_type}")
        result = await ingestion_tool_instance.process_document(file_path, actual_file_type)
        logger.info(f"Ingestion result: {result}")
        return result
    except Exception as e:
        logger.error(f"Error in 'ingest_document' tool: {str(e)}", exc_info=True)
        return {"success": False, "error": str(e)}

@mcp.tool()
async def semantic_search(query: str, top_k: int = 5, filters: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """
    Search through indexed content using natural language.
    'filters' can be used to narrow down the search.
    """
    logger.info(f"Tool 'semantic_search' called with query: {query}, top_k: {top_k}, filters: {filters}")
    try:
        results = await search_tool_instance.search(query, top_k, filters)
        return {
            "success": True,
            "query": query,
            "results": [result.to_dict() for result in results],
            "total_results": len(results)
        }
    except Exception as e:
        logger.error(f"Error in 'semantic_search' tool: {str(e)}", exc_info=True)
        return {"success": False, "error": str(e), "results": []}

@mcp.tool()
async def summarize_content(
    content: Optional[str] = None,
    document_id: Optional[str] = None,
    style: str = "concise"
) -> Dict[str, Any]:
    """
    Generate a summary of provided content or a document_id.
    Available styles: concise, detailed, bullet_points, executive.
    """
    logger.info(f"Tool 'summarize_content' called. doc_id: {document_id}, style: {style}, has_content: {content is not None}")
    try:
        text_to_summarize = content
        if document_id and not text_to_summarize:
            doc = await document_store_service.get_document(document_id)
            if not doc:
                return {"success": False, "error": f"Document {document_id} not found"}
            text_to_summarize = doc.content
        if not text_to_summarize:
            return {"success": False, "error": "No content provided for summarization"}
        max_length = 10000
        if len(text_to_summarize) > max_length:
            logger.warning(f"Content for summarization is long ({len(text_to_summarize)} chars), truncating to {max_length}")
            text_to_summarize = text_to_summarize[:max_length] + "..."
        summary = await generative_tool_instance.summarize(text_to_summarize, style)
        return {
            "success": True,
            "summary": summary,
            "original_length": len(text_to_summarize),
            "summary_length": len(summary),
            "style": style
        }
    except Exception as e:
        logger.error(f"Error in 'summarize_content' tool: {str(e)}", exc_info=True)
        return {"success": False, "error": str(e)}

@mcp.tool()
async def generate_tags(
    content: Optional[str] = None,
    document_id: Optional[str] = None,
    max_tags: int = 5
) -> Dict[str, Any]:
    """
    Generate relevant tags for content or a document_id.
    Saves tags to document metadata if document_id is provided.
    """
    logger.info(f"Tool 'generate_tags' called. doc_id: {document_id}, max_tags: {max_tags}, has_content: {content is not None}")
    try:
        text_for_tags = content
        if document_id and not text_for_tags:
            doc = await document_store_service.get_document(document_id)
            if not doc:
                return {"success": False, "error": f"Document {document_id} not found"}
            text_for_tags = doc.content
        if not text_for_tags:
            return {"success": False, "error": "No content provided for tag generation"}
        tags = await generative_tool_instance.generate_tags(text_for_tags, max_tags)
        if document_id and tags:
            await document_store_service.update_document_metadata(document_id, {"tags": tags})
            logger.info(f"Tags {tags} saved for document {document_id}")
        return {
            "success": True,
            "tags": tags,
            "content_length": len(text_for_tags),
            "document_id": document_id
        }
    except Exception as e:
        logger.error(f"Error in 'generate_tags' tool: {str(e)}", exc_info=True)
        return {"success": False, "error": str(e)}

@mcp.tool()
async def answer_question(question: str, context_filter: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """
    Answer questions using RAG (Retrieval Augmented Generation) over indexed content.
    'context_filter' can be used to narrow down the context search.
    """
    logger.info(f"Tool 'answer_question' called with question: {question}, context_filter: {context_filter}")
    try:
        search_results = await search_tool_instance.search(question, top_k=5, filters=context_filter)
        if not search_results:
            return {
                "success": False,
                "error": "No relevant context found. Please upload relevant documents.",
                "question": question,
                "answer": "I could not find enough information in the documents to answer your question."
            }
        answer = await generative_tool_instance.answer_question(question, search_results)
        return {
            "success": True,
            "question": question,
            "answer": answer,
            "sources": [result.to_dict() for result in search_results],
            "confidence": "high" if len(search_results) >= 3 else "medium"
        }
    except Exception as e:
        logger.error(f"Error in 'answer_question' tool: {str(e)}", exc_info=True)
        return {"success": False, "error": str(e)}

@mcp.tool()
async def voice_qa(question: str, session_id: Optional[str] = None) -> Dict[str, Any]:
    """
    Ask a question using the AI voice assistant with RAG capabilities.
    Provides text-based Q&A powered by LlamaIndex agentic search.
    """
    logger.info(f"Tool 'voice_qa' called with question: {question}")
    try:
        result = await voice_tool_instance.voice_qa(question, session_id)
        return result
    except Exception as e:
        logger.error(f"Error in 'voice_qa' tool: {str(e)}", exc_info=True)
        return {"success": False, "error": str(e)}

@mcp.tool()
async def generate_podcast(
    document_ids: List[str],
    style: str = "conversational",
    duration_minutes: int = 10,
    host1_voice: str = "Rachel",
    host2_voice: str = "Adam"
) -> Dict[str, Any]:
    """
    Generate a podcast from selected documents.
    Styles: conversational, educational, technical, casual.
    Duration: 5-30 minutes recommended.
    Voices: Rachel, Adam, Domi, Bella, Antoni, Josh, Sam, Emily, etc.
    """
    logger.info(f"Tool 'generate_podcast' called with {len(document_ids)} docs, style: {style}")
    try:
        result = await podcast_tool_instance.generate_podcast(
            document_ids=document_ids,
            style=style,
            duration_minutes=duration_minutes,
            host1_voice=host1_voice,
            host2_voice=host2_voice
        )
        return result
    except Exception as e:
        logger.error(f"Error in 'generate_podcast' tool: {str(e)}", exc_info=True)
        return {"success": False, "error": str(e)}

@mcp.tool()
async def list_documents_for_ui(limit: int = 100, offset: int = 0) -> Dict[str, Any]:
    """
    (UI Helper) List documents from the document store.
    Not a standard processing tool, but useful for UI population.
    """
    logger.info(f"Tool 'list_documents_for_ui' called with limit: {limit}, offset: {offset}")
    try:
        documents = await document_store_service.list_documents(limit, offset)
        return {
            "success": True,
            "documents": [doc.to_dict() for doc in documents],
            "total": len(documents)
        }
    except Exception as e:
        logger.error(f"Error in 'list_documents_for_ui' tool: {str(e)}", exc_info=True)
        return {"success": False, "error": str(e), "documents": []}

# Blaxel Deployment Support
from fastapi import FastAPI
from mcp.server.streamable_http import StreamableHTTPServerTransport
import os

# Get Blaxel environment variables
host = os.getenv("BL_SERVER_HOST", "0.0.0.0")
port = int(os.getenv("BL_SERVER_PORT", "8000"))

# Create FastAPI app
app = FastAPI()

# Initialize HTTP transport instead of stdio
transport = StreamableHTTPServerTransport()

# Connect MCP server to HTTP transport
mcp.connect(transport)

# Mount transport to FastAPI
transport.mount(app)

@app.get("/health")
async def health_check():
    """Health check endpoint for the deployment platform"""
    return {"status": "healthy", "service": "mcp-server"}

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host=host, port=port)
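For reference, a hypothetical client-side sketch (not part of this commit) showing how one of these tools could be invoked over streamable HTTP with the mcp Python SDK. The "/mcp" endpoint path is an assumption; the actual route depends on how transport.mount(app) registers it.

# Hypothetical client sketch: calling the 'semantic_search' tool over streamable HTTP.
import asyncio
from mcp import ClientSession
from mcp.client.streamable_http import streamablehttp_client

async def main():
    # Assumed endpoint path; adjust to wherever the transport is actually mounted.
    async with streamablehttp_client("http://localhost:8000/mcp") as (read, write, _):
        async with ClientSession(read, write) as session:
            await session.initialize()
            result = await session.call_tool("semantic_search", {"query": "vector stores", "top_k": 3})
            print(result)

asyncio.run(main())
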
mcp_tools/__init__.py
ADDED
@@ -0,0 +1 @@
# MCP tools module initialization

mcp_tools/__pycache__/__init__.cpython-313.pyc
ADDED
Binary file (150 Bytes).

mcp_tools/__pycache__/generative_tool.cpython-313.pyc
ADDED
Binary file (21.4 kB).

mcp_tools/__pycache__/ingestion_tool.cpython-313.pyc
ADDED
Binary file (16.8 kB).

mcp_tools/__pycache__/podcast_tool.cpython-313.pyc
ADDED
Binary file (5.19 kB).

mcp_tools/__pycache__/search_tool.cpython-313.pyc
ADDED
Binary file (22 kB).

mcp_tools/__pycache__/voice_tool.cpython-313.pyc
ADDED
Binary file (2.42 kB).

mcp_tools/generative_tool.py
ADDED
@@ -0,0 +1,407 @@
import logging
from typing import List, Dict, Any, Optional
import asyncio

from services.llm_service import LLMService
from mcp_tools.search_tool import SearchTool
from core.models import SearchResult

logger = logging.getLogger(__name__)

class GenerativeTool:
    def __init__(self, llm_service: LLMService, search_tool: Optional[SearchTool] = None):
        self.llm_service = llm_service
        self.search_tool = search_tool

    async def summarize(self, content: str, style: str = "concise", max_length: Optional[int] = None) -> str:
        """Generate a summary of the given content"""
        try:
            if not content.strip():
                return "No content provided for summarization."

            logger.info(f"Generating {style} summary for content of length {len(content)}")

            summary = await self.llm_service.summarize(content, style, max_length)

            logger.info(f"Generated summary of length {len(summary)}")
            return summary

        except Exception as e:
            logger.error(f"Error generating summary: {str(e)}")
            return f"Error generating summary: {str(e)}"

    async def generate_tags(self, content: str, max_tags: int = 5) -> List[str]:
        """Generate relevant tags for the given content"""
        try:
            if not content.strip():
                return []

            logger.info(f"Generating up to {max_tags} tags for content")

            tags = await self.llm_service.generate_tags(content, max_tags)

            logger.info(f"Generated {len(tags)} tags")
            return tags

        except Exception as e:
            logger.error(f"Error generating tags: {str(e)}")
            return []

    async def categorize(self, content: str, categories: List[str]) -> str:
        """Categorize content into one of the provided categories"""
        try:
            if not content.strip():
                return "Uncategorized"

            if not categories:
                categories = ["Technology", "Business", "Science", "Education", "Entertainment", "News", "Research", "Other"]

            logger.info(f"Categorizing content into one of {len(categories)} categories")

            category = await self.llm_service.categorize(content, categories)

            logger.info(f"Categorized as: {category}")
            return category

        except Exception as e:
            logger.error(f"Error categorizing content: {str(e)}")
            return "Uncategorized"

    async def answer_question(self, question: str, context_results: Optional[List[SearchResult]] = None) -> str:
        """Answer a question using the provided context or RAG"""
        try:
            if not question.strip():
                return "No question provided."

            logger.info(f"Answering question: {question[:100]}...")

            # If no context provided and search tool is available, search for relevant context
            if not context_results and self.search_tool:
                logger.info("No context provided, searching for relevant information")
                context_results = await self.search_tool.search(question, top_k=5)

            # Prepare context from search results
            if context_results:
                context_texts = []
                for result in context_results:
                    context_texts.append(f"Source: {result.document_id}\nContent: {result.content}\n")

                context = "\n---\n".join(context_texts)
                logger.info(f"Using context from {len(context_results)} sources")
            else:
                context = ""
                logger.info("No context available for answering question")

            # Generate answer
            answer = await self.llm_service.answer_question(question, context)

            logger.info(f"Generated answer of length {len(answer)}")
            return answer

        except Exception as e:
            logger.error(f"Error answering question: {str(e)}")
            return f"I encountered an error while trying to answer your question: {str(e)}"

    async def generate_outline(self, topic: str, num_sections: int = 5, detail_level: str = "medium") -> str:
        """Generate an outline for the given topic"""
        try:
            if not topic.strip():
                return "No topic provided."

            detail_descriptions = {
                "brief": "brief bullet points",
                "medium": "detailed bullet points with descriptions",
                "detailed": "comprehensive outline with sub-sections and explanations"
            }

            detail_desc = detail_descriptions.get(detail_level, "detailed bullet points")

            prompt = f"""Create a {detail_desc} outline for the topic: "{topic}"

The outline should have {num_sections} main sections and be well-structured and informative.

Format the outline clearly with proper numbering and indentation.

Topic: {topic}

Outline:"""

            outline = await self.llm_service.generate_text(prompt, max_tokens=800, temperature=0.7)

            logger.info(f"Generated outline for topic: {topic}")
            return outline

        except Exception as e:
            logger.error(f"Error generating outline: {str(e)}")
            return f"Error generating outline: {str(e)}"

    async def explain_concept(self, concept: str, audience: str = "general", length: str = "medium") -> str:
        """Explain a concept for a specific audience"""
        try:
            if not concept.strip():
                return "No concept provided."

            audience_styles = {
                "general": "a general audience using simple, clear language",
                "technical": "a technical audience with appropriate jargon and detail",
                "beginner": "beginners with no prior knowledge, using analogies and examples",
                "expert": "experts in the field with advanced terminology and depth"
            }

            length_guidance = {
                "brief": "Keep the explanation concise and to the point (2-3 paragraphs).",
                "medium": "Provide a comprehensive explanation (4-6 paragraphs).",
                "detailed": "Give a thorough, in-depth explanation with examples."
            }

            audience_desc = audience_styles.get(audience, "a general audience")
            length_desc = length_guidance.get(length, "Provide a comprehensive explanation.")

            prompt = f"""Explain the concept of "{concept}" for {audience_desc}.

{length_desc}

Make sure to:
- Use appropriate language for the audience
- Include relevant examples or analogies
- Structure the explanation logically
- Ensure clarity and accuracy

Concept to explain: {concept}

Explanation:"""

            explanation = await self.llm_service.generate_text(prompt, max_tokens=600, temperature=0.5)

            logger.info(f"Generated explanation for concept: {concept}")
            return explanation

        except Exception as e:
            logger.error(f"Error explaining concept: {str(e)}")
            return f"Error explaining concept: {str(e)}"

    async def compare_concepts(self, concept1: str, concept2: str, aspects: Optional[List[str]] = None) -> str:
        """Compare two concepts across specified aspects"""
        try:
            if not concept1.strip() or not concept2.strip():
                return "Both concepts must be provided for comparison."

            if not aspects:
                aspects = ["definition", "key features", "advantages", "disadvantages", "use cases"]

            aspects_str = ", ".join(aspects)

            prompt = f"""Compare and contrast "{concept1}" and "{concept2}" across the following aspects: {aspects_str}.

Structure your comparison clearly, addressing each aspect for both concepts.

Format:
## Comparison: {concept1} vs {concept2}

For each aspect, provide:
- **{concept1}**: [description]
- **{concept2}**: [description]
- **Key Difference**: [summary]

Concepts to compare:
1. {concept1}
2. {concept2}

Comparison:"""

            comparison = await self.llm_service.generate_text(prompt, max_tokens=800, temperature=0.6)

            logger.info(f"Generated comparison between {concept1} and {concept2}")
            return comparison

        except Exception as e:
            logger.error(f"Error comparing concepts: {str(e)}")
            return f"Error comparing concepts: {str(e)}"

    async def generate_questions(self, content: str, question_type: str = "comprehension", num_questions: int = 5) -> List[str]:
        """Generate questions based on the provided content"""
        try:
            if not content.strip():
                return []

            question_types = {
                "comprehension": "comprehension questions that test understanding of key concepts",
                "analysis": "analytical questions that require deeper thinking and evaluation",
                "application": "application questions that ask how to use the concepts in practice",
                "creative": "creative questions that encourage original thinking and exploration",
                "factual": "factual questions about specific details and information"
            }

            question_desc = question_types.get(question_type, "comprehension questions")

            # Limit content length to keep the prompt within model limits
            prompt = f"""Based on the following content, generate {num_questions} {question_desc}.

The questions should be:
- Clear and well-formulated
- Relevant to the content
- Appropriate for the specified type
- Engaging and thought-provoking

Content:
{content[:2000]}

Questions:"""

            response = await self.llm_service.generate_text(prompt, max_tokens=400, temperature=0.7)

            # Parse questions from response
            questions = []
            lines = response.split('\n')

            for line in lines:
                line = line.strip()
                if line and ('?' in line or line.startswith(('1.', '2.', '3.', '4.', '5.', '-', '*'))):
                    # Clean up the question
                    question = line.lstrip('0123456789.-* ').strip()
                    if question and '?' in question:
                        questions.append(question)

            logger.info(f"Generated {len(questions)} {question_type} questions")
            return questions[:num_questions]

        except Exception as e:
            logger.error(f"Error generating questions: {str(e)}")
            return []

    def _chunk_text(self, text: str, chunk_size: int = 2000) -> List[str]:
        """Split text into chunks respecting paragraph boundaries"""
        if len(text) <= chunk_size:
            return [text]

        chunks = []
        current_chunk = ""

        # Split by paragraphs first
        paragraphs = text.split('\n\n')

        for para in paragraphs:
            if len(current_chunk) + len(para) + 2 <= chunk_size:
                current_chunk += para + "\n\n"
            else:
                if current_chunk:
                    chunks.append(current_chunk.strip())
                current_chunk = para + "\n\n"

                # If a single paragraph is too long, split it by sentences
                if len(current_chunk) > chunk_size:
                    # Reset current_chunk and split the long paragraph
                    long_para = current_chunk.strip()
                    current_chunk = ""

                    sentences = long_para.replace('. ', '.\n').split('\n')
                    sub_chunk = ""
                    for sentence in sentences:
                        if len(sub_chunk) + len(sentence) + 1 <= chunk_size:
                            sub_chunk += sentence + " "
                        else:
                            if sub_chunk:
                                chunks.append(sub_chunk.strip())
                            sub_chunk = sentence + " "
                    if sub_chunk:
                        current_chunk = sub_chunk  # Carry over remaining part

        if current_chunk:
            chunks.append(current_chunk.strip())

        return chunks

    async def paraphrase_text(self, text: str, style: str = "formal", preserve_meaning: bool = True) -> str:
        """Paraphrase text in a different style while preserving meaning"""
        try:
            if not text.strip():
                return "No text provided for paraphrasing."

            # Check length and chunk if necessary
            MAX_CHUNK_SIZE = 2500
            if len(text) > MAX_CHUNK_SIZE:
                logger.info(f"Text length {len(text)} exceeds limit, chunking...")
                chunks = self._chunk_text(text, MAX_CHUNK_SIZE)
                logger.info(f"Split into {len(chunks)} chunks")

                paraphrased_chunks = []
                for i, chunk in enumerate(chunks):
                    logger.info(f"Processing chunk {i+1}/{len(chunks)}")
                    # Process chunk (recursion terminates: each chunk is under the size limit)
                    chunk_result = await self.paraphrase_text(chunk, style, preserve_meaning)
                    paraphrased_chunks.append(chunk_result)
                    # Small delay to be nice to rate limits
                    await asyncio.sleep(0.5)

                return "\n\n".join(paraphrased_chunks)

            style_instructions = {
                "formal": "formal, professional language",
                "casual": "casual, conversational language",
                "academic": "academic, scholarly language",
                "simple": "simple, easy-to-understand language",
                "technical": "technical, precise language"
            }

            style_desc = style_instructions.get(style, "clear, appropriate language")
            meaning_instruction = "while preserving the exact meaning and key information" if preserve_meaning else "while maintaining the general intent"

            prompt = f"""Paraphrase the following text using {style_desc} {meaning_instruction}.

Original text:
{text}

Paraphrased text:"""

            paraphrase = await self.llm_service.generate_text(prompt, max_tokens=len(text.split()) * 2, temperature=0.6)

            logger.info(f"Paraphrased text in {style} style")
            return paraphrase.strip()

        except Exception as e:
            logger.error(f"Error paraphrasing text: {str(e)}")
            return f"Error paraphrasing text: {str(e)}"

    async def extract_key_insights(self, content: str, num_insights: int = 5) -> List[str]:
        """Extract key insights from the provided content"""
        try:
            if not content.strip():
                return []

            # Limit content length to keep the prompt within model limits
            prompt = f'''Analyze the following content and extract {num_insights} key insights or takeaways.

Each insight should be:
- A clear, concise statement
- Significant and meaningful
- Based on the content provided
- Actionable or thought-provoking when possible

Content:
{content[:3000]}

Key Insights:'''

            response = await self.llm_service.generate_text(prompt, max_tokens=400, temperature=0.6)

            # Parse insights from response
            insights = []
            lines = response.split('\n')

            for line in lines:
                line = line.strip()
                if line and (line.startswith(('1.', '2.', '3.', '4.', '5.', '-', '*')) or len(insights) == 0):
                    # Clean up the insight
                    insight = line.lstrip('0123456789.-* ').strip()
                    if insight and len(insight) > 10:  # Minimum insight length
                        insights.append(insight)

            logger.info(f"Extracted {len(insights)} key insights")
            return insights[:num_insights]

        except Exception as e:
            logger.error(f"Error extracting insights: {str(e)}")
            return []
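A quick sketch of the paragraph-first chunking behind paraphrase_text (illustrative only; passing None for llm_service is acceptable here because _chunk_text never touches it):

# Illustrative sketch: _chunk_text splits on paragraphs first, then sentences.
from mcp_tools.generative_tool import GenerativeTool

tool = GenerativeTool(llm_service=None)  # _chunk_text does not use the LLM service
text = ("First paragraph. " * 20) + "\n\n" + ("Second paragraph. " * 200)
chunks = tool._chunk_text(text, chunk_size=500)
print(len(chunks), max(len(c) for c in chunks))  # several chunks, each at most ~500 chars
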
mcp_tools/ingestion_tool.py
ADDED
@@ -0,0 +1,368 @@
import logging
import asyncio
from typing import Dict, Any, Optional
import tempfile
import os
from pathlib import Path
import uuid

from core.document_parser import DocumentParser
from core.chunker import TextChunker
from core.text_preprocessor import TextPreprocessor
from services.vector_store_service import VectorStoreService
from services.document_store_service import DocumentStoreService
from services.embedding_service import EmbeddingService
from services.ocr_service import OCRService

logger = logging.getLogger(__name__)

class IngestionTool:
    def __init__(self, vector_store: VectorStoreService, document_store: DocumentStoreService,
                 embedding_service: EmbeddingService, ocr_service: OCRService):
        self.vector_store = vector_store
        self.document_store = document_store
        self.embedding_service = embedding_service
        self.ocr_service = ocr_service

        self.document_parser = DocumentParser()
        # Pass OCR service to document parser
        self.document_parser.ocr_service = ocr_service

        self.text_chunker = TextChunker()
        self.text_preprocessor = TextPreprocessor()

    async def process_document(self, file_path: str, file_type: str, task_id: Optional[str] = None) -> Dict[str, Any]:
        """Process a document through the full ingestion pipeline"""
        if task_id is None:
            task_id = str(uuid.uuid4())

        try:
            logger.info(f"Starting document processing for {file_path}")

            # Step 1: Parse the document
            filename = Path(file_path).name
            document = await self.document_parser.parse_document(file_path, filename)

            if not document.content:
                logger.warning(f"No content extracted from document {filename}")
                return {
                    "success": False,
                    "error": "No content could be extracted from the document",
                    "task_id": task_id
                }

            # Step 2: Store the document
            await self.document_store.store_document(document)

            # Step 3: Process content for embeddings
            chunks = await self._create_and_embed_chunks(document)

            if not chunks:
                logger.warning(f"No chunks created for document {document.id}")
                return {
                    "success": False,
                    "error": "Failed to create text chunks",
                    "task_id": task_id,
                    "document_id": document.id,
                    "filename": document.filename,
                    "chunks_created": 0,
                    "content_length": len(document.content),
                    "doc_type": document.doc_type.value,
                    "message": f"Failed to create text chunks for {filename}"
                }

            # Step 4: Store embeddings
            success = await self.vector_store.add_chunks(chunks)

            if not success:
                logger.error(f"Failed to store embeddings for document {document.id}")
                return {
                    "success": False,
                    "error": "Failed to store embeddings",
                    "task_id": task_id,
                    "document_id": document.id
                }

            # Step 5: Update document metadata with chunk count
            try:
                current_metadata = document.metadata or {}
                current_metadata["chunk_count"] = len(chunks)
                await self.document_store.update_document_metadata(
                    document.id,
                    {"metadata": current_metadata}
                )
            except Exception as e:
                logger.warning(f"Failed to update chunk count for document {document.id}: {e}")

            logger.info(f"Successfully processed document {document.id} with {len(chunks)} chunks")

            return {
                "success": True,
                "task_id": task_id,
                "document_id": document.id,
                "filename": document.filename,
                "chunks_created": len(chunks),
                "content_length": len(document.content),
                "doc_type": document.doc_type.value,
                "message": f"Successfully processed {filename}"
            }

        except Exception as e:
            logger.error(f"Error processing document {file_path}: {str(e)}")
            return {
                "success": False,
                "error": str(e),
                "task_id": task_id,
                "message": f"Failed to process document: {str(e)}"
            }

    async def _create_and_embed_chunks(self, document) -> list:
        """Create chunks and generate embeddings"""
        try:
            # Step 1: Create chunks
            chunks = self.text_chunker.chunk_document(
                document.id,
                document.content,
                method="recursive"
            )

            if not chunks:
                return []

            # Step 2: Optimize chunks for embedding
            optimized_chunks = self.text_chunker.optimize_chunks_for_embedding(chunks)

            # Step 3: Generate embeddings
            texts = [chunk.content for chunk in optimized_chunks]
            embeddings = await self.embedding_service.generate_embeddings(texts)

            # Step 4: Add embeddings to chunks
            embedded_chunks = []
            for i, chunk in enumerate(optimized_chunks):
                if i < len(embeddings):
                    chunk.embedding = embeddings[i]
                    embedded_chunks.append(chunk)

            return embedded_chunks

        except Exception as e:
            logger.error(f"Error creating and embedding chunks: {str(e)}")
            return []

    async def process_url(self, url: str, task_id: Optional[str] = None) -> Dict[str, Any]:
        """Process a document from a URL"""
        try:
            import requests
            from urllib.parse import urlparse

            # Download the file
            response = requests.get(url, timeout=30)
            response.raise_for_status()

            # Determine file type from URL or content-type
            parsed_url = urlparse(url)
            filename = Path(parsed_url.path).name or "downloaded_file"

            # Create temporary file
            with tempfile.NamedTemporaryFile(delete=False, suffix=f"_{filename}") as tmp_file:
                tmp_file.write(response.content)
                tmp_file_path = tmp_file.name

            try:
                # Process the downloaded file
                result = await self.process_document(tmp_file_path, "", task_id)
                result["source_url"] = url
                return result
            finally:
                # Clean up temporary file
                if os.path.exists(tmp_file_path):
                    os.unlink(tmp_file_path)

        except Exception as e:
            logger.error(f"Error processing URL {url}: {str(e)}")
            return {
                "success": False,
                "error": str(e),
                "task_id": task_id or str(uuid.uuid4()),
                "source_url": url
            }

    async def process_text_content(self, content: str, filename: str = "text_content.txt",
                                   task_id: Optional[str] = None) -> Dict[str, Any]:
        """Process raw text content directly"""
        try:
            from core.models import Document, DocumentType
            from datetime import datetime

            # Create document object
            document = Document(
                id=str(uuid.uuid4()),
                filename=filename,
                content=content,
                doc_type=DocumentType.TEXT,
                file_size=len(content.encode('utf-8')),
                created_at=datetime.utcnow(),
                metadata={
                    "source": "direct_text_input",
                    "content_length": len(content),
                    "word_count": len(content.split())
                }
            )

            # Store the document
            await self.document_store.store_document(document)

            # Process content for embeddings
            chunks = await self._create_and_embed_chunks(document)

            if chunks:
                await self.vector_store.add_chunks(chunks)

                # Update document metadata with chunk count
                try:
                    current_metadata = document.metadata or {}
                    current_metadata["chunk_count"] = len(chunks)
                    await self.document_store.update_document_metadata(
                        document.id,
                        {"metadata": current_metadata}
                    )
                except Exception as e:
                    logger.warning(f"Failed to update chunk count for document {document.id}: {e}")

            return {
                "success": True,
                "task_id": task_id or str(uuid.uuid4()),
                "document_id": document.id,
                "filename": filename,
                "chunks_created": len(chunks),
                "content_length": len(content),
                "message": "Successfully processed text content"
            }

        except Exception as e:
            logger.error(f"Error processing text content: {str(e)}")
            return {
                "success": False,
                "error": str(e),
                "task_id": task_id or str(uuid.uuid4())
            }

    async def reprocess_document(self, document_id: str, task_id: Optional[str] = None) -> Dict[str, Any]:
        """Reprocess an existing document (useful for updating embeddings)"""
        try:
            # Get the document
            document = await self.document_store.get_document(document_id)

            if not document:
                return {
                    "success": False,
                    "error": f"Document {document_id} not found",
                    "task_id": task_id or str(uuid.uuid4())
                }

            # Remove existing chunks from vector store
            await self.vector_store.delete_document(document_id)

            # Recreate and embed chunks
            chunks = await self._create_and_embed_chunks(document)

            if chunks:
                await self.vector_store.add_chunks(chunks)

                # Update document metadata with chunk count
                try:
                    current_metadata = document.metadata or {}
                    current_metadata["chunk_count"] = len(chunks)
                    await self.document_store.update_document_metadata(
                        document.id,
                        {"metadata": current_metadata}
                    )
                except Exception as e:
                    logger.warning(f"Failed to update chunk count for document {document.id}: {e}")

            return {
                "success": True,
                "task_id": task_id or str(uuid.uuid4()),
                "document_id": document_id,
                "filename": document.filename,
                "chunks_created": len(chunks),
                "message": f"Successfully reprocessed {document.filename}"
            }

        except Exception as e:
            logger.error(f"Error reprocessing document {document_id}: {str(e)}")
            return {
                "success": False,
                "error": str(e),
                "task_id": task_id or str(uuid.uuid4()),
                "document_id": document_id
            }

    async def batch_process_directory(self, directory_path: str, task_id: Optional[str] = None) -> Dict[str, Any]:
        """Process multiple documents from a directory"""
        try:
            directory = Path(directory_path)
            if not directory.exists() or not directory.is_dir():
                return {
                    "success": False,
                    "error": f"Directory {directory_path} does not exist",
                    "task_id": task_id or str(uuid.uuid4())
                }

            # Supported file extensions
            supported_extensions = {'.txt', '.pdf', '.docx', '.png', '.jpg', '.jpeg', '.bmp', '.tiff'}

            # Find all supported files
            files_to_process = []
            for ext in supported_extensions:
                files_to_process.extend(directory.glob(f"*{ext}"))
                files_to_process.extend(directory.glob(f"*{ext.upper()}"))

            if not files_to_process:
                return {
                    "success": False,
                    "error": "No supported files found in directory",
                    "task_id": task_id or str(uuid.uuid4())
                }

            # Process files
            results = []
            successful = 0
            failed = 0

            for file_path in files_to_process:
                try:
                    result = await self.process_document(str(file_path), file_path.suffix)
                    results.append(result)

                    if result.get("success"):
                        successful += 1
                    else:
                        failed += 1

                except Exception as e:
                    failed += 1
                    results.append({
                        "success": False,
                        "error": str(e),
                        "filename": file_path.name
                    })

            return {
                "success": True,
                "task_id": task_id or str(uuid.uuid4()),
                "directory": str(directory),
                "total_files": len(files_to_process),
                "successful": successful,
                "failed": failed,
                "results": results,
                "message": f"Processed {successful}/{len(files_to_process)} files successfully"
            }

        except Exception as e:
            logger.error(f"Error batch processing directory {directory_path}: {str(e)}")
            return {
                "success": False,
                "error": str(e),
                "task_id": task_id or str(uuid.uuid4())
            }
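A hedged end-to-end sketch of driving this pipeline directly (illustrative; it assumes the services construct with no arguments, as they do in mcp_server.py, and "notes.pdf" is a placeholder path):

# Illustrative sketch: running the ingestion pipeline outside the MCP server.
import asyncio
from mcp_tools.ingestion_tool import IngestionTool
from services.vector_store_service import VectorStoreService
from services.document_store_service import DocumentStoreService
from services.embedding_service import EmbeddingService
from services.ocr_service import OCRService

async def main():
    tool = IngestionTool(VectorStoreService(), DocumentStoreService(),
                         EmbeddingService(), OCRService())
    result = await tool.process_document("notes.pdf", "pdf")  # placeholder file
    print(result["message"], result.get("chunks_created"))

asyncio.run(main())
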
mcp_tools/podcast_tool.py
ADDED
@@ -0,0 +1,138 @@
import logging
from typing import Dict, Any, List
from dataclasses import asdict

logger = logging.getLogger(__name__)

class PodcastTool:
    """
    MCP Tool for podcast generation from documents
    """

    def __init__(self, podcast_generator):
        """
        Initialize Podcast Tool

        Args:
            podcast_generator: PodcastGeneratorService instance
        """
        self.podcast_generator = podcast_generator

    async def generate_podcast(
        self,
        document_ids: List[str],
        style: str = "conversational",
        duration_minutes: int = 10,
        host1_voice: str = "Rachel",
        host2_voice: str = "Adam"
    ) -> Dict[str, Any]:
        """
        MCP Tool: Generate podcast from documents

        Args:
            document_ids: List of document IDs to generate podcast from
            style: Podcast style (conversational, educational, technical, casual)
            duration_minutes: Target duration in minutes
            host1_voice: Voice name for first host
            host2_voice: Voice name for second host

        Returns:
            Dictionary with podcast ID, audio URL, transcript, and metadata
        """
        try:
            if not document_ids:
                return {
                    "success": False,
                    "error": "No documents provided. Please select at least one document."
                }

            logger.info(f"Generating podcast from {len(document_ids)} documents")

            # Generate podcast using service
            result = await self.podcast_generator.generate_podcast(
                document_ids=document_ids,
                style=style,
                duration_minutes=duration_minutes,
                host1_voice=host1_voice,
                host2_voice=host2_voice
            )

            if result.success:
                # Guard the duration message: metadata may be None even on success
                duration_msg = (
                    f" Duration: {result.metadata.duration_seconds / 60:.1f} minutes"
                    if result.metadata else ""
                )
                return {
                    "success": True,
                    "podcast_id": result.podcast_id,
                    "audio_file": result.audio_file_path,
                    "audio_url": f"/data/podcasts/{result.podcast_id}.mp3",
                    "transcript": result.transcript,
                    "metadata": asdict(result.metadata) if result.metadata else {},
                    "generation_time": result.generation_time,
                    "message": f"Podcast generated successfully!{duration_msg}"
                }
            else:
                return {
                    "success": False,
                    "error": result.error or "Unknown error during podcast generation"
                }

        except Exception as e:
            logger.error(f"Podcast generation failed: {str(e)}")
            return {
                "success": False,
                "error": str(e)
            }

    def list_podcasts(self, limit: int = 10) -> Dict[str, Any]:
        """
        List previously generated podcasts

        Args:
            limit: Maximum number of podcasts to return

        Returns:
            Dictionary with list of podcast metadata
        """
        try:
            podcasts = self.podcast_generator.list_podcasts(limit=limit)

            return {
                "success": True,
                "podcasts": [asdict(p) for p in podcasts],
                "total": len(podcasts)
            }
        except Exception as e:
            logger.error(f"Failed to list podcasts: {str(e)}")
            return {
                "success": False,
                "error": str(e),
                "podcasts": []
            }

    def get_podcast(self, podcast_id: str) -> Dict[str, Any]:
        """
        Get specific podcast by ID

        Args:
            podcast_id: Podcast identifier

        Returns:
            Dictionary with podcast metadata
        """
        try:
            podcast = self.podcast_generator.get_podcast(podcast_id)

            if podcast:
                return {
                    "success": True,
                    "podcast": asdict(podcast)
                }
            else:
                return {
                    "success": False,
                    "error": "Podcast not found"
                }
        except Exception as e:
            logger.error(f"Failed to get podcast: {str(e)}")
            return {
                "success": False,
                "error": str(e)
            }
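A minimal usage sketch for PodcastTool follows. The document IDs are hypothetical, and the generator instance is assumed to be a PodcastGeneratorService constructed elsewhere in the app.

# Hypothetical usage sketch: document IDs are placeholders, not values
# from this repository.
import asyncio

async def demo(podcast_generator):
    tool = PodcastTool(podcast_generator)
    result = await tool.generate_podcast(
        document_ids=["doc-123", "doc-456"],  # hypothetical IDs
        style="educational",
        duration_minutes=5,
    )
    if result["success"]:
        print(result["audio_url"], result["message"])
    else:
        print("Generation failed:", result["error"])

# asyncio.run(demo(my_generator))  # my_generator: a PodcastGeneratorService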
mcp_tools/search_tool.py
ADDED
@@ -0,0 +1,437 @@
import logging
from typing import List, Dict, Any, Optional
import asyncio

from core.models import SearchResult
from services.vector_store_service import VectorStoreService
from services.embedding_service import EmbeddingService
from services.document_store_service import DocumentStoreService
import config

logger = logging.getLogger(__name__)

class SearchTool:
    def __init__(self, vector_store: VectorStoreService, embedding_service: EmbeddingService,
                 document_store: Optional[DocumentStoreService] = None, llamaindex_service: Any = None):
        self.vector_store = vector_store
        self.embedding_service = embedding_service
        self.document_store = document_store
        self.llamaindex_service = llamaindex_service
        self.config = config.config

    async def search(self, query: str, top_k: int = 5, filters: Optional[Dict[str, Any]] = None,
                     similarity_threshold: Optional[float] = None) -> List[SearchResult]:
        """Perform semantic search"""
        try:
            if not query.strip():
                logger.warning("Empty search query provided")
                return []

            # Use default threshold if not provided
            if similarity_threshold is None:
                similarity_threshold = self.config.SIMILARITY_THRESHOLD

            logger.info(f"Performing semantic search for: '{query}' (top_k={top_k})")

            # Generate query embedding
            query_embedding = await self.embedding_service.generate_single_embedding(query)

            if not query_embedding:
                logger.error("Failed to generate query embedding")
                return []

            # Perform vector search
            results = await self.vector_store.search(
                query_embedding=query_embedding,
                top_k=top_k,
                filters=filters
            )

            # Filter by similarity threshold
            filtered_results = [
                result for result in results
                if result.score >= similarity_threshold
            ]

            logger.info(f"Found {len(filtered_results)} results above threshold {similarity_threshold}")

            # Enhance results with additional metadata if document store is available
            if self.document_store:
                enhanced_results = await self._enhance_results_with_metadata(filtered_results)
                return enhanced_results

            return filtered_results

        except Exception as e:
            logger.error(f"Error performing semantic search: {str(e)}")
            return []

    async def agentic_search(self, query: str) -> str:
        """Perform agentic search using LlamaIndex"""
        if not self.llamaindex_service:
            logger.warning("LlamaIndex service not available for agentic search")
            return "Agentic search not available."

        try:
            logger.info(f"Performing agentic search for: '{query}'")
            return await self.llamaindex_service.query(query)
        except Exception as e:
            logger.error(f"Error performing agentic search: {str(e)}")
            return f"Error performing agentic search: {str(e)}"

    async def _enhance_results_with_metadata(self, results: List[SearchResult]) -> List[SearchResult]:
        """Enhance search results with document metadata"""
        try:
            enhanced_results = []

            for result in results:
                try:
                    # Get document metadata
                    document = await self.document_store.get_document(result.document_id)

                    if document:
                        # Add document metadata to result
                        enhanced_metadata = {
                            **result.metadata,
                            "document_filename": document.filename,
                            "document_type": document.doc_type.value,
                            "document_tags": document.tags,
                            "document_category": document.category,
                            "document_created_at": document.created_at.isoformat(),
                            "document_summary": document.summary
                        }

                        enhanced_result = SearchResult(
                            chunk_id=result.chunk_id,
                            document_id=result.document_id,
                            content=result.content,
                            score=result.score,
                            metadata=enhanced_metadata
                        )

                        enhanced_results.append(enhanced_result)
                    else:
                        # Document not found, use original result
                        enhanced_results.append(result)

                except Exception as e:
                    logger.warning(f"Error enhancing result {result.chunk_id}: {str(e)}")
                    enhanced_results.append(result)

            return enhanced_results

        except Exception as e:
            logger.error(f"Error enhancing results: {str(e)}")
            return results

    async def multi_query_search(self, queries: List[str], top_k: int = 5,
                                 aggregate_method: str = "merge") -> List[SearchResult]:
        """Perform search with multiple queries and aggregate results"""
        try:
            all_results = []

            # Perform search for each query
            for query in queries:
                if query.strip():
                    query_results = await self.search(query, top_k)
                    all_results.extend(query_results)

            if not all_results:
                return []

            # Aggregate results
            if aggregate_method == "merge":
                return await self._merge_results(all_results, top_k)
            elif aggregate_method == "intersect":
                return await self._intersect_results(all_results, top_k)
            elif aggregate_method == "average":
                return await self._average_results(all_results, top_k)
            else:
                # Default to merge
                return await self._merge_results(all_results, top_k)

        except Exception as e:
            logger.error(f"Error in multi-query search: {str(e)}")
            return []

    async def _merge_results(self, results: List[SearchResult], top_k: int) -> List[SearchResult]:
        """Merge results and remove duplicates, keeping highest scores"""
        try:
            # Group by chunk_id and keep highest score
            chunk_scores = {}
            chunk_results = {}

            for result in results:
                chunk_id = result.chunk_id
                if chunk_id not in chunk_scores or result.score > chunk_scores[chunk_id]:
                    chunk_scores[chunk_id] = result.score
                    chunk_results[chunk_id] = result

            # Sort by score and return top_k
            merged_results = list(chunk_results.values())
            merged_results.sort(key=lambda x: x.score, reverse=True)

            return merged_results[:top_k]

        except Exception as e:
            logger.error(f"Error merging results: {str(e)}")
            return results[:top_k]

    async def _intersect_results(self, results: List[SearchResult], top_k: int) -> List[SearchResult]:
        """Find chunks that appear in multiple queries"""
        try:
            # Count occurrences of each chunk
            chunk_counts = {}
            chunk_results = {}

            for result in results:
                chunk_id = result.chunk_id
                chunk_counts[chunk_id] = chunk_counts.get(chunk_id, 0) + 1

                if chunk_id not in chunk_results or result.score > chunk_results[chunk_id].score:
                    chunk_results[chunk_id] = result

            # Filter chunks that appear more than once
            intersect_results = [
                result for chunk_id, result in chunk_results.items()
                if chunk_counts[chunk_id] > 1
            ]

            # Sort by score
            intersect_results.sort(key=lambda x: x.score, reverse=True)

            return intersect_results[:top_k]

        except Exception as e:
            logger.error(f"Error intersecting results: {str(e)}")
            return []

    async def _average_results(self, results: List[SearchResult], top_k: int) -> List[SearchResult]:
        """Average scores for chunks that appear multiple times"""
        try:
            # Group by chunk_id and calculate average scores
            chunk_groups = {}

            for result in results:
                chunk_id = result.chunk_id
                if chunk_id not in chunk_groups:
                    chunk_groups[chunk_id] = []
                chunk_groups[chunk_id].append(result)

            # Calculate average scores
            averaged_results = []
            for chunk_id, group in chunk_groups.items():
                avg_score = sum(r.score for r in group) / len(group)

                # Use the result with the highest individual score but update the score to average
                best_result = max(group, key=lambda x: x.score)
                averaged_result = SearchResult(
                    chunk_id=best_result.chunk_id,
                    document_id=best_result.document_id,
                    content=best_result.content,
                    score=avg_score,
                    metadata={
                        **best_result.metadata,
                        "query_count": len(group),
                        "score_range": f"{min(r.score for r in group):.3f}-{max(r.score for r in group):.3f}"
                    }
                )
                averaged_results.append(averaged_result)

            # Sort by average score
            averaged_results.sort(key=lambda x: x.score, reverse=True)

            return averaged_results[:top_k]

        except Exception as e:
            logger.error(f"Error averaging results: {str(e)}")
            return results[:top_k]

    async def search_by_document(self, document_id: str, query: str, top_k: int = 5) -> List[SearchResult]:
        """Search within a specific document"""
        try:
            filters = {"document_id": document_id}
            return await self.search(query, top_k, filters)

        except Exception as e:
            logger.error(f"Error searching within document {document_id}: {str(e)}")
            return []

    async def search_by_category(self, category: str, query: str, top_k: int = 5) -> List[SearchResult]:
        """Search within documents of a specific category"""
        try:
            if not self.document_store:
                logger.warning("Document store not available for category search")
                return await self.search(query, top_k)

            # Get documents in the category
            documents = await self.document_store.list_documents(
                limit=1000,  # Adjust as needed
                filters={"category": category}
            )

            if not documents:
                logger.info(f"No documents found in category '{category}'")
                return []

            # Extract document IDs
            document_ids = [doc.id for doc in documents]

            # Search with document ID filter
            filters = {"document_ids": document_ids}
            return await self.search(query, top_k, filters)

        except Exception as e:
            logger.error(f"Error searching by category {category}: {str(e)}")
            return []

    async def search_with_date_range(self, query: str, start_date, end_date, top_k: int = 5) -> List[SearchResult]:
        """Search documents within a date range"""
        try:
            if not self.document_store:
                logger.warning("Document store not available for date range search")
                return await self.search(query, top_k)

            # Get documents in the date range
            documents = await self.document_store.list_documents(
                limit=1000,  # Adjust as needed
                filters={
                    "created_after": start_date,
                    "created_before": end_date
                }
            )

            if not documents:
                logger.info("No documents found in date range")
                return []

            # Extract document IDs
            document_ids = [doc.id for doc in documents]

            # Search with document ID filter
            filters = {"document_ids": document_ids}
            return await self.search(query, top_k, filters)

        except Exception as e:
            logger.error(f"Error searching with date range: {str(e)}")
            return []

    async def get_search_suggestions(self, partial_query: str, limit: int = 5) -> List[str]:
        """Get search suggestions based on partial query"""
        try:
            # This is a simple implementation
            # In a production system, you might want to use a more sophisticated approach

            if len(partial_query) < 2:
                return []

            # Search for the partial query
            results = await self.search(partial_query, top_k=20)

            # Extract potential query expansions from content
            suggestions = set()

            for result in results:
                content_words = result.content.lower().split()
                for i, word in enumerate(content_words):
                    if partial_query.lower() in word:
                        # Add the word itself
                        suggestions.add(word.strip('.,!?;:'))

                        # Add phrases that include this word
                        if i > 0:
                            phrase = f"{content_words[i-1]} {word}".strip('.,!?;:')
                            suggestions.add(phrase)
                        if i < len(content_words) - 1:
                            phrase = f"{word} {content_words[i+1]}".strip('.,!?;:')
                            suggestions.add(phrase)

            # Filter and sort suggestions
            filtered_suggestions = [
                s for s in suggestions
                if len(s) > len(partial_query) and s.startswith(partial_query.lower())
            ]

            return sorted(filtered_suggestions)[:limit]

        except Exception as e:
            logger.error(f"Error getting search suggestions: {str(e)}")
            return []

    async def explain_search(self, query: str, top_k: int = 3) -> Dict[str, Any]:
        """Provide detailed explanation of search process and results"""
        try:
            explanation = {
                "query": query,
                "steps": [],
                "results_analysis": {},
                "performance_metrics": {}
            }

            # Step 1: Query processing
            explanation["steps"].append({
                "step": "query_processing",
                "description": "Processing and normalizing the search query",
                "details": {
                    "original_query": query,
                    "cleaned_query": query.strip(),
                    "query_length": len(query)
                }
            })

            # Step 2: Embedding generation
            import time
            start_time = time.time()

            query_embedding = await self.embedding_service.generate_single_embedding(query)

            embedding_time = time.time() - start_time

            explanation["steps"].append({
                "step": "embedding_generation",
                "description": "Converting query to vector embedding",
                "details": {
                    "embedding_dimension": len(query_embedding) if query_embedding else 0,
                    "generation_time_ms": round(embedding_time * 1000, 2)
                }
            })

            # Step 3: Vector search
            start_time = time.time()

            results = await self.vector_store.search(query_embedding, top_k)

            search_time = time.time() - start_time

            explanation["steps"].append({
                "step": "vector_search",
                "description": "Searching vector database for similar content",
                "details": {
                    "search_time_ms": round(search_time * 1000, 2),
                    "results_found": len(results),
                    "top_score": results[0].score if results else 0,
                    "score_range": f"{min(r.score for r in results):.3f}-{max(r.score for r in results):.3f}" if results else "N/A"
                }
            })

            # Results analysis
            if results:
                explanation["results_analysis"] = {
                    "total_results": len(results),
                    "average_score": sum(r.score for r in results) / len(results),
                    "unique_documents": len(set(r.document_id for r in results)),
                    "content_lengths": [len(r.content) for r in results]
                }

            # Performance metrics
            explanation["performance_metrics"] = {
                "total_time_ms": round((embedding_time + search_time) * 1000, 2),
                "embedding_time_ms": round(embedding_time * 1000, 2),
                "search_time_ms": round(search_time * 1000, 2)
            }

            return explanation

        except Exception as e:
            logger.error(f"Error explaining search: {str(e)}")
            return {"error": str(e)}
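A minimal usage sketch for the multi-query aggregation above, assuming the three services have already been constructed and populated elsewhere in the app. With "average", chunks that match more than one phrasing of the question carry a query_count greater than 1 in their metadata.

# Hypothetical usage sketch: the queries are illustrative.
import asyncio

async def demo(vector_store, embedding_service, document_store):
    tool = SearchTool(vector_store, embedding_service, document_store)
    results = await tool.multi_query_search(
        queries=["vector database", "similarity search", "embedding index"],
        top_k=5,
        aggregate_method="average",
    )
    for r in results:
        print(f"{r.score:.3f}  {r.metadata.get('query_count', 1)}x  {r.content[:80]}")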
mcp_tools/utils.py
ADDED
@@ -0,0 +1,373 @@
import logging
import asyncio
import functools
from typing import Any, Callable, Dict, List, Optional
import time
import json
from pathlib import Path

logger = logging.getLogger(__name__)

def async_timer(func: Callable) -> Callable:
    """Decorator to time async function execution"""
    @functools.wraps(func)
    async def wrapper(*args, **kwargs):
        start_time = time.time()
        try:
            result = await func(*args, **kwargs)
            end_time = time.time()
            logger.debug(f"{func.__name__} completed in {end_time - start_time:.3f}s")
            return result
        except Exception as e:
            end_time = time.time()
            logger.error(f"{func.__name__} failed after {end_time - start_time:.3f}s: {str(e)}")
            raise
    return wrapper

def retry_async(max_attempts: int = 3, delay: float = 1.0, backoff: float = 2.0):
    """Decorator to retry async functions with exponential backoff"""
    def decorator(func: Callable) -> Callable:
        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            attempt = 1
            current_delay = delay

            while attempt <= max_attempts:
                try:
                    return await func(*args, **kwargs)
                except Exception as e:
                    if attempt == max_attempts:
                        logger.error(f"{func.__name__} failed after {max_attempts} attempts: {str(e)}")
                        raise

                    logger.warning(f"{func.__name__} attempt {attempt} failed: {str(e)}")
                    logger.info(f"Retrying in {current_delay}s...")

                    await asyncio.sleep(current_delay)
                    attempt += 1
                    current_delay *= backoff

        return wrapper
    return decorator

class MCPToolResponse:
    """Standardized response format for MCP tools"""

    def __init__(self, success: bool, data: Any = None, error: str = None,
                 metadata: Dict[str, Any] = None):
        self.success = success
        self.data = data
        self.error = error
        self.metadata = metadata or {}
        self.timestamp = time.time()

    def to_dict(self) -> Dict[str, Any]:
        """Convert response to dictionary"""
        result = {
            "success": self.success,
            "timestamp": self.timestamp
        }

        if self.success:
            result["data"] = self.data
        else:
            result["error"] = self.error

        if self.metadata:
            result["metadata"] = self.metadata

        return result

    @classmethod
    def success_response(cls, data: Any, metadata: Dict[str, Any] = None):
        """Create a success response"""
        return cls(success=True, data=data, metadata=metadata)

    @classmethod
    def error_response(cls, error: str, metadata: Dict[str, Any] = None):
        """Create an error response"""
        return cls(success=False, error=error, metadata=metadata)

def validate_required_params(params: Dict[str, Any], required: List[str]) -> Optional[str]:
    """Validate that required parameters are present"""
    missing = []
    for param in required:
        if param not in params or params[param] is None:
            missing.append(param)

    if missing:
        return f"Missing required parameters: {', '.join(missing)}"

    return None

def sanitize_filename(filename: str) -> str:
    """Sanitize filename for safe storage"""
    import re

    # Remove or replace invalid characters
    filename = re.sub(r'[<>:"/\\|?*]', '_', filename)

    # Remove leading/trailing dots and spaces
    filename = filename.strip('. ')

    # Limit length
    if len(filename) > 255:
        name, ext = Path(filename).stem, Path(filename).suffix
        max_name_len = 255 - len(ext)
        filename = name[:max_name_len] + ext

    # Ensure not empty
    if not filename:
        filename = "unnamed_file"

    return filename

def truncate_text(text: str, max_length: int, add_ellipsis: bool = True) -> str:
    """Truncate text to specified length"""
    if len(text) <= max_length:
        return text

    if add_ellipsis and max_length > 3:
        return text[:max_length - 3] + "..."
    else:
        return text[:max_length]

def extract_file_info(file_path: str) -> Dict[str, Any]:
    """Extract information about a file"""
    try:
        path = Path(file_path)
        stat = path.stat()

        return {
            "filename": path.name,
            "extension": path.suffix.lower(),
            "size_bytes": stat.st_size,
            "size_mb": round(stat.st_size / (1024 * 1024), 2),
            "created_time": stat.st_ctime,
            "modified_time": stat.st_mtime,
            "exists": path.exists(),
            "is_file": path.is_file(),
            "is_dir": path.is_dir()
        }
    except Exception as e:
        return {"error": str(e)}

async def batch_process(items: List[Any], processor: Callable, batch_size: int = 10,
                        max_concurrent: int = 5) -> List[Any]:
    """Process items in batches with concurrency control"""
    results = []
    semaphore = asyncio.Semaphore(max_concurrent)

    async def process_item(item):
        async with semaphore:
            return await processor(item)

    # Process in batches
    for i in range(0, len(items), batch_size):
        batch = items[i:i + batch_size]
        batch_tasks = [process_item(item) for item in batch]
        batch_results = await asyncio.gather(*batch_tasks, return_exceptions=True)
        results.extend(batch_results)

    return results

def format_file_size(size_bytes: int) -> str:
    """Format file size in human-readable format"""
    for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
        if size_bytes < 1024.0:
            return f"{size_bytes:.1f} {unit}"
        size_bytes /= 1024.0
    return f"{size_bytes:.1f} PB"

def calculate_reading_time(text: str, words_per_minute: int = 200) -> int:
    """Calculate estimated reading time in minutes"""
    word_count = len(text.split())
    return max(1, round(word_count / words_per_minute))

class ProgressTracker:
    """Track progress of long-running operations"""

    def __init__(self, total_items: int, description: str = "Processing"):
        self.total_items = total_items
        self.completed_items = 0
        self.description = description
        self.start_time = time.time()
        self.errors = []

    def update(self, completed: int = 1, error: str = None):
        """Update progress"""
        self.completed_items += completed
        if error:
            self.errors.append(error)

    def get_progress(self) -> Dict[str, Any]:
        """Get current progress information"""
        elapsed_time = time.time() - self.start_time
        progress_percent = (self.completed_items / self.total_items) * 100 if self.total_items > 0 else 0

        # Estimate remaining time
        if self.completed_items > 0:
            avg_time_per_item = elapsed_time / self.completed_items
            remaining_items = self.total_items - self.completed_items
            estimated_remaining_time = avg_time_per_item * remaining_items
        else:
            estimated_remaining_time = 0

        return {
            "description": self.description,
            "total_items": self.total_items,
            "completed_items": self.completed_items,
            "progress_percent": round(progress_percent, 1),
            "elapsed_time_seconds": round(elapsed_time, 1),
            "estimated_remaining_seconds": round(estimated_remaining_time, 1),
            "errors_count": len(self.errors),
            "errors": self.errors[-5:] if self.errors else []  # Last 5 errors
        }

    def is_complete(self) -> bool:
        """Check if processing is complete"""
        return self.completed_items >= self.total_items

def load_json_config(config_path: str, default_config: Dict[str, Any] = None) -> Dict[str, Any]:
    """Load configuration from JSON file with fallback to defaults"""
    try:
        with open(config_path, 'r') as f:
            config = json.load(f)
        logger.info(f"Loaded configuration from {config_path}")
        return config
    except FileNotFoundError:
        logger.warning(f"Configuration file {config_path} not found, using defaults")
        return default_config or {}
    except json.JSONDecodeError as e:
        logger.error(f"Invalid JSON in configuration file {config_path}: {str(e)}")
        return default_config or {}

def save_json_config(config: Dict[str, Any], config_path: str) -> bool:
    """Save configuration to JSON file"""
    try:
        # Create directory if it doesn't exist
        Path(config_path).parent.mkdir(parents=True, exist_ok=True)

        with open(config_path, 'w') as f:
            json.dump(config, f, indent=2)

        logger.info(f"Saved configuration to {config_path}")
        return True
    except Exception as e:
        logger.error(f"Failed to save configuration to {config_path}: {str(e)}")
        return False

class RateLimiter:
    """Simple rate limiter for API calls"""

    def __init__(self, max_calls: int, time_window: float):
        self.max_calls = max_calls
        self.time_window = time_window
        self.calls = []

    async def acquire(self):
        """Acquire permission to make a call"""
        now = time.time()

        # Remove old calls outside the time window
        self.calls = [call_time for call_time in self.calls if now - call_time < self.time_window]

        # Check if we can make a new call
        if len(self.calls) >= self.max_calls:
            # Wait until we can make a call
            oldest_call = min(self.calls)
            wait_time = self.time_window - (now - oldest_call)
            if wait_time > 0:
                await asyncio.sleep(wait_time)
            return await self.acquire()  # Recursive call after waiting

        # Record this call
        self.calls.append(now)

def escape_markdown(text: str) -> str:
    """Escape markdown special characters"""
    import re

    # Characters that need escaping in markdown
    markdown_chars = r'([*_`\[\]()#+\-!\\])'
    return re.sub(markdown_chars, r'\\\1', text)

def create_error_summary(errors: List[Exception]) -> str:
    """Create a summary of multiple errors"""
    if not errors:
        return "No errors"

    error_counts = {}
    for error in errors:
        error_type = type(error).__name__
        error_counts[error_type] = error_counts.get(error_type, 0) + 1

    summary_parts = []
    for error_type, count in error_counts.items():
        if count == 1:
            summary_parts.append(f"1 {error_type}")
        else:
            summary_parts.append(f"{count} {error_type}s")

    return f"Encountered {len(errors)} total errors: " + ", ".join(summary_parts)

async def safe_execute(func: Callable, *args, default_return=None, **kwargs):
    """Safely execute a function and return default on error"""
    try:
        if asyncio.iscoroutinefunction(func):
            return await func(*args, **kwargs)
        else:
            return func(*args, **kwargs)
    except Exception as e:
        logger.error(f"Error executing {func.__name__}: {str(e)}")
        return default_return

def get_content_preview(content: str, max_length: int = 200) -> str:
    """Get a preview of content for display"""
    if not content:
        return "No content"

    # Clean up whitespace
    content = ' '.join(content.split())

    if len(content) <= max_length:
        return content

    # Try to break at sentence boundary
    preview = content[:max_length]
    last_sentence_end = max(preview.rfind('.'), preview.rfind('!'), preview.rfind('?'))

    if last_sentence_end > max_length * 0.7:  # If we found a good breaking point
        return preview[:last_sentence_end + 1]
    else:
        # Break at word boundary
        last_space = preview.rfind(' ')
        if last_space > max_length * 0.7:
            return preview[:last_space] + "..."
        else:
            return preview + "..."

class MemoryUsageTracker:
    """Track memory usage of operations"""

    def __init__(self):
        self.start_memory = self._get_memory_usage()

    def _get_memory_usage(self) -> float:
        """Get current memory usage in MB"""
        try:
            import psutil
            process = psutil.Process()
            return process.memory_info().rss / 1024 / 1024  # Convert to MB
        except ImportError:
            return 0.0

    def get_usage_delta(self) -> float:
        """Get memory usage change since initialization"""
        current_memory = self._get_memory_usage()
        return current_memory - self.start_memory

    def log_usage(self, operation_name: str):
        """Log current memory usage for an operation"""
        delta = self.get_usage_delta()
        logger.info(f"{operation_name} memory delta: {delta:.1f} MB")
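The decorators and RateLimiter above compose naturally. A minimal sketch follows; fetch_page is a placeholder coroutine and the limits (5 calls per 1 s window, 3 attempts) are illustrative, not values from this repository.

# Hypothetical usage sketch combining retry_async, async_timer, and RateLimiter.
import asyncio

limiter = RateLimiter(max_calls=5, time_window=1.0)

@retry_async(max_attempts=3, delay=0.5, backoff=2.0)
@async_timer
async def fetch_page(url: str) -> str:
    await limiter.acquire()      # throttle before each call
    await asyncio.sleep(0.01)    # stand-in for real I/O
    return f"contents of {url}"

# asyncio.run(fetch_page("https://example.com"))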
mcp_tools/voice_tool.py
ADDED
@@ -0,0 +1,63 @@
import logging
from typing import Dict, Any, Optional
import asyncio

logger = logging.getLogger(__name__)

class VoiceTool:
    """
    MCP Tool for voice-based Q&A using ElevenLabs conversational AI
    """

    def __init__(self, elevenlabs_service):
        """
        Initialize Voice Tool

        Args:
            elevenlabs_service: ElevenLabs service instance
        """
        self.elevenlabs_service = elevenlabs_service

    async def voice_qa(
        self,
        question: str,
        session_id: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        MCP Tool: Ask a question using voice assistant

        Args:
            question: User's question (text or transcribed from voice)
            session_id: Optional session ID for conversation context

        Returns:
            Dictionary with answer, audio URL (if applicable), and sources
        """
        try:
            if not self.elevenlabs_service or not self.elevenlabs_service.is_available():
                return {
                    "success": False,
                    "error": "Voice assistant not configured. Please set ELEVENLABS_API_KEY and ELEVENLABS_AGENT_ID"
                }

            logger.info(f"Voice QA: {question}")

            # For text-based queries, we can use the RAG tool directly
            # This provides the backend for voice queries
            result = await self.elevenlabs_service.llamaindex_service.query(question)

            return {
                "success": True,
                "question": question,
                "answer": result,
                "session_id": session_id,
                "mode": "text"  # Could be "voice" if audio processing is involved
            }

        except Exception as e:
            logger.error(f"Voice QA failed: {str(e)}")
            return {
                "success": False,
                "error": str(e),
                "question": question
            }
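A minimal caller sketch for VoiceTool, assuming an ElevenLabs service already wired to a LlamaIndex query engine as the constructor expects; the question text is illustrative.

# Hypothetical usage sketch: elevenlabs_service is assumed configured.
import asyncio

async def demo(elevenlabs_service):
    tool = VoiceTool(elevenlabs_service)
    reply = await tool.voice_qa("What does chapter 3 say about indexing?")
    if reply["success"]:
        print(reply["answer"])
    else:
        print("Voice QA unavailable:", reply["error"])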
requirements.txt
ADDED
@@ -0,0 +1,31 @@
gradio[mcp]
anthropic>=0.7.0
mistralai
sentence-transformers>=2.2.2
transformers>=4.30.0
torch>=2.0.0
faiss-cpu>=1.7.4
numpy>=1.24.0
pandas>=2.0.0
PyPDF2>=3.0.0
python-docx>=0.8.11
Pillow>=10.0.0
pytesseract>=0.3.10
aiofiles>=23.0.0
pydantic>=2.0.0
httpx>=0.24.0
uvicorn[standard]
python-multipart>=0.0.6
asyncio-mqtt>=0.11.1
nest-asyncio>=1.5.6
fastapi
fastmcp
mcp
openai
python-dotenv
llama-index
llama-index-llms-openai
llama-index-llms-anthropic
llama-index-embeddings-huggingface
elevenlabs>=1.0.0
websockets>=12.0
services/__init__.py
ADDED
@@ -0,0 +1 @@
# Services module initialization
services/__pycache__/__init__.cpython-313.pyc
ADDED
Binary file (149 Bytes)
services/__pycache__/document_store_service.cpython-313.pyc
ADDED
Binary file (17.2 kB)
services/__pycache__/elevenlabs_service.cpython-313.pyc
ADDED
Binary file (12.8 kB)
services/__pycache__/embedding_service.cpython-313.pyc
ADDED
Binary file (13.4 kB)
services/__pycache__/llamaindex_service.cpython-313.pyc
ADDED
Binary file (11 kB)
services/__pycache__/llm_service.cpython-313.pyc
ADDED
Binary file (28.2 kB)
services/__pycache__/ocr_service.cpython-313.pyc
ADDED
Binary file (19.9 kB)
services/__pycache__/podcast_generator_service.cpython-313.pyc
ADDED
Binary file (28.3 kB)
services/__pycache__/vector_store_service.cpython-313.pyc
ADDED
Binary file (15.3 kB)
services/document_store_service.py
ADDED
@@ -0,0 +1,349 @@
| 1 |
+
import logging
|
| 2 |
+
import json
|
| 3 |
+
import os
|
| 4 |
+
from typing import List, Dict, Any, Optional
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
import pickle
|
| 7 |
+
from datetime import datetime
|
| 8 |
+
import asyncio
|
| 9 |
+
|
| 10 |
+
from core.models import Document, DocumentType
|
| 11 |
+
import config
|
| 12 |
+
|
| 13 |
+
logger = logging.getLogger(__name__)
|
| 14 |
+
|
| 15 |
+
class DocumentStoreService:
|
| 16 |
+
def __init__(self):
|
| 17 |
+
self.config = config.config
|
| 18 |
+
self.store_path = Path(self.config.DOCUMENT_STORE_PATH)
|
| 19 |
+
self.store_path.mkdir(parents=True, exist_ok=True)
|
| 20 |
+
|
| 21 |
+
# Separate paths for metadata and content
|
| 22 |
+
self.metadata_path = self.store_path / "metadata"
|
| 23 |
+
self.content_path = self.store_path / "content"
|
| 24 |
+
|
| 25 |
+
self.metadata_path.mkdir(exist_ok=True)
|
| 26 |
+
self.content_path.mkdir(exist_ok=True)
|
| 27 |
+
|
| 28 |
+
# In-memory cache for frequently accessed documents
|
| 29 |
+
self._cache = {}
|
| 30 |
+
self._cache_size_limit = 100
|
| 31 |
+
|
| 32 |
+
async def store_document(self, document: Document) -> bool:
|
| 33 |
+
"""Store a document and its metadata"""
|
| 34 |
+
try:
|
| 35 |
+
# Store metadata
|
| 36 |
+
metadata_file = self.metadata_path / f"{document.id}.json"
|
| 37 |
+
metadata = {
|
| 38 |
+
"id": document.id,
|
| 39 |
+
"filename": document.filename,
|
| 40 |
+
"doc_type": document.doc_type.value,
|
| 41 |
+
"file_size": document.file_size,
|
| 42 |
+
"created_at": document.created_at.isoformat(),
|
| 43 |
+
"metadata": document.metadata,
|
| 44 |
+
"tags": document.tags,
|
| 45 |
+
"summary": document.summary,
|
| 46 |
+
"category": document.category,
|
| 47 |
+
"language": document.language,
|
| 48 |
+
"content_length": len(document.content)
|
| 49 |
+
}
|
| 50 |
+
|
| 51 |
+
with open(metadata_file, 'w', encoding='utf-8') as f:
|
| 52 |
+
json.dump(metadata, f, indent=2, ensure_ascii=False)
|
| 53 |
+
|
| 54 |
+
# Store content separately (can be large)
|
| 55 |
+
content_file = self.content_path / f"{document.id}.txt"
|
| 56 |
+
with open(content_file, 'w', encoding='utf-8') as f:
|
| 57 |
+
f.write(document.content)
|
| 58 |
+
|
| 59 |
+
# Cache the document
|
| 60 |
+
self._add_to_cache(document.id, document)
|
| 61 |
+
|
| 62 |
+
logger.info(f"Stored document {document.id} ({document.filename})")
|
| 63 |
+
return True
|
| 64 |
+
|
| 65 |
+
except Exception as e:
|
| 66 |
+
logger.error(f"Error storing document {document.id}: {str(e)}")
|
| 67 |
+
return False
|
| 68 |
+
|
| 69 |
+
async def get_document(self, document_id: str) -> Optional[Document]:
|
| 70 |
+
"""Retrieve a document by ID"""
|
| 71 |
+
try:
|
| 72 |
+
# Check cache first
|
| 73 |
+
if document_id in self._cache:
|
| 74 |
+
return self._cache[document_id]
|
| 75 |
+
|
| 76 |
+
# Load from disk
|
| 77 |
+
metadata_file = self.metadata_path / f"{document_id}.json"
|
| 78 |
+
content_file = self.content_path / f"{document_id}.txt"
|
| 79 |
+
|
| 80 |
+
if not metadata_file.exists() or not content_file.exists():
|
| 81 |
+
return None
|
| 82 |
+
|
| 83 |
+
# Load metadata
|
| 84 |
+
with open(metadata_file, 'r', encoding='utf-8') as f:
|
| 85 |
+
metadata = json.load(f)
|
| 86 |
+
|
| 87 |
+
# Load content
|
| 88 |
+
with open(content_file, 'r', encoding='utf-8') as f:
|
| 89 |
+
content = f.read()
|
| 90 |
+
|
| 91 |
+
# Create document object
|
| 92 |
+
document = Document(
|
| 93 |
+
id=metadata["id"],
|
| 94 |
+
filename=metadata["filename"],
|
| 95 |
+
content=content,
|
| 96 |
+
doc_type=DocumentType(metadata["doc_type"]),
|
| 97 |
+
file_size=metadata["file_size"],
|
| 98 |
+
created_at=datetime.fromisoformat(metadata["created_at"]),
|
| 99 |
+
metadata=metadata.get("metadata", {}),
|
| 100 |
+
tags=metadata.get("tags", []),
|
| 101 |
+
summary=metadata.get("summary"),
|
| 102 |
+
category=metadata.get("category"),
|
| 103 |
+
language=metadata.get("language")
|
| 104 |
+
)
|
| 105 |
+
|
| 106 |
+
# Add to cache
|
| 107 |
+
self._add_to_cache(document_id, document)
|
| 108 |
+
|
| 109 |
+
return document
|
| 110 |
+
|
| 111 |
+
except Exception as e:
|
| 112 |
+
logger.error(f"Error retrieving document {document_id}: {str(e)}")
|
| 113 |
+
return None
|
| 114 |
+
|
| 115 |
+
async def list_documents(self, limit: int = 50, offset: int = 0,
|
| 116 |
+
filters: Optional[Dict[str, Any]] = None) -> List[Document]:
|
| 117 |
+
"""List documents with pagination and filtering"""
|
| 118 |
+
try:
|
| 119 |
+
documents = []
|
| 120 |
+
metadata_files = list(self.metadata_path.glob("*.json"))
|
| 121 |
+
|
| 122 |
+
# Sort by creation time (newest first)
|
| 123 |
+
metadata_files.sort(key=lambda x: x.stat().st_mtime, reverse=True)
|
| 124 |
+
|
| 125 |
+
# Apply pagination
|
| 126 |
+
start_idx = offset
|
| 127 |
+
end_idx = offset + limit
|
| 128 |
+
|
| 129 |
+
for metadata_file in metadata_files[start_idx:end_idx]:
|
| 130 |
+
try:
|
| 131 |
+
with open(metadata_file, 'r', encoding='utf-8') as f:
|
| 132 |
+
metadata = json.load(f)
|
| 133 |
+
|
| 134 |
+
# Apply filters
|
| 135 |
+
if filters and not self._apply_filters(metadata, filters):
|
| 136 |
+
continue
|
| 137 |
+
|
| 138 |
+
# Load content if needed (for small documents)
|
| 139 |
+
content_file = self.content_path / f"{metadata['id']}.txt"
|
| 140 |
+
if content_file.exists():
|
| 141 |
+
with open(content_file, 'r', encoding='utf-8') as f:
|
| 142 |
+
content = f.read()
|
| 143 |
+
else:
|
| 144 |
+
content = ""
|
| 145 |
+
|
| 146 |
+
document = Document(
|
| 147 |
+
id=metadata["id"],
|
| 148 |
+
filename=metadata["filename"],
|
| 149 |
+
content=content,
|
| 150 |
+
doc_type=DocumentType(metadata["doc_type"]),
|
| 151 |
+
file_size=metadata["file_size"],
|
| 152 |
+
created_at=datetime.fromisoformat(metadata["created_at"]),
|
| 153 |
+
metadata=metadata.get("metadata", {}),
|
| 154 |
+
tags=metadata.get("tags", []),
|
| 155 |
+
summary=metadata.get("summary"),
|
| 156 |
+
category=metadata.get("category"),
|
| 157 |
+
language=metadata.get("language")
|
| 158 |
+
)
|
| 159 |
+
|
| 160 |
+
documents.append(document)
|
| 161 |
+
|
| 162 |
+
except Exception as e:
|
| 163 |
+
logger.warning(f"Error loading document metadata from {metadata_file}: {str(e)}")
|
| 164 |
+
continue
|
| 165 |
+
|
| 166 |
+
return documents
|
| 167 |
+
|
| 168 |
+
except Exception as e:
|
| 169 |
+
logger.error(f"Error listing documents: {str(e)}")
|
| 170 |
+
return []
|
| 171 |
+
|
| 172 |
+
def _apply_filters(self, metadata: Dict[str, Any], filters: Dict[str, Any]) -> bool:
|
| 173 |
+
"""Apply filters to document metadata"""
|
| 174 |
+
try:
|
| 175 |
+
for key, value in filters.items():
|
| 176 |
+
if key == "doc_type":
|
| 177 |
+
if metadata.get("doc_type") != value:
|
| 178 |
+
return False
|
| 179 |
+
elif key == "filename_contains":
|
| 180 |
+
if value.lower() not in metadata.get("filename", "").lower():
|
| 181 |
+
return False
|
| 182 |
+
elif key == "created_after":
|
| 183 |
+
doc_date = datetime.fromisoformat(metadata.get("created_at", ""))
|
| 184 |
+
if doc_date < value:
|
| 185 |
+
return False
|
| 186 |
+
elif key == "created_before":
|
| 187 |
+
doc_date = datetime.fromisoformat(metadata.get("created_at", ""))
|
| 188 |
+
if doc_date > value:
|
| 189 |
+
return False
|
| 190 |
+
elif key == "tags":
|
| 191 |
+
doc_tags = set(metadata.get("tags", []))
|
| 192 |
+
required_tags = set(value) if isinstance(value, list) else {value}
|
| 193 |
+
if not required_tags.intersection(doc_tags):
|
| 194 |
+
return False
|
| 195 |
+
elif key == "category":
|
| 196 |
+
if metadata.get("category") != value:
|
| 197 |
+
return False
|
| 198 |
+
elif key == "language":
|
| 199 |
+
if metadata.get("language") != value:
|
| 200 |
+
return False
|
| 201 |
+
|
| 202 |
+
return True
|
| 203 |
+
except Exception as e:
|
| 204 |
+
logger.error(f"Error applying filters: {str(e)}")
|
| 205 |
+
return True
|
| 206 |
+
|
| 207 |
+
    async def update_document_metadata(self, document_id: str, updates: Dict[str, Any]) -> bool:
        """Update document metadata"""
        try:
            metadata_file = self.metadata_path / f"{document_id}.json"

            if not metadata_file.exists():
                logger.warning(f"Document {document_id} not found")
                return False

            # Load existing metadata
            with open(metadata_file, 'r', encoding='utf-8') as f:
                metadata = json.load(f)

            # Apply updates
            for key, value in updates.items():
                if key in ["tags", "summary", "category", "language", "metadata"]:
                    metadata[key] = value

            # Save updated metadata
            with open(metadata_file, 'w', encoding='utf-8') as f:
                json.dump(metadata, f, indent=2, ensure_ascii=False)

            # Update cache if document is cached
            if document_id in self._cache:
                document = self._cache[document_id]
                for key, value in updates.items():
                    if hasattr(document, key):
                        setattr(document, key, value)

            logger.info(f"Updated metadata for document {document_id}")
            return True

        except Exception as e:
            logger.error(f"Error updating document metadata: {str(e)}")
            return False

    async def delete_document(self, document_id: str) -> bool:
        """Delete a document and its metadata"""
        try:
            metadata_file = self.metadata_path / f"{document_id}.json"
            content_file = self.content_path / f"{document_id}.txt"

            # Remove files
            if metadata_file.exists():
                metadata_file.unlink()
            if content_file.exists():
                content_file.unlink()

            # Remove from cache
            if document_id in self._cache:
                del self._cache[document_id]

            logger.info(f"Deleted document {document_id}")
            return True

        except Exception as e:
            logger.error(f"Error deleting document {document_id}: {str(e)}")
            return False

    async def search_documents(self, query: str, fields: List[str] = None) -> List[Document]:
        """Simple text search across documents"""
        if not fields:
            fields = ["filename", "content", "tags", "summary"]

        try:
            matching_documents = []
            query_lower = query.lower()

            # Get all documents
            all_documents = await self.list_documents(limit=1000)  # Adjust limit as needed

            for document in all_documents:
                match_found = False

                for field in fields:
                    field_value = getattr(document, field, "")
                    if isinstance(field_value, list):
                        field_value = " ".join(field_value)
                    elif field_value is None:
                        field_value = ""

                    if query_lower in str(field_value).lower():
                        match_found = True
                        break

                if match_found:
                    matching_documents.append(document)

            logger.info(f"Found {len(matching_documents)} documents matching '{query}'")
            return matching_documents

        except Exception as e:
            logger.error(f"Error searching documents: {str(e)}")
            return []

    def _add_to_cache(self, document_id: str, document: Document):
        """Add document to cache with size limit"""
        try:
            # Remove oldest items if cache is full
            if len(self._cache) >= self._cache_size_limit:
                # Remove first item (FIFO)
                oldest_key = next(iter(self._cache))
                del self._cache[oldest_key]

            self._cache[document_id] = document
        except Exception as e:
            logger.error(f"Error adding to cache: {str(e)}")

    async def get_stats(self) -> Dict[str, Any]:
        """Get statistics about the document store"""
        try:
            metadata_files = list(self.metadata_path.glob("*.json"))
            content_files = list(self.content_path.glob("*.txt"))

            # Calculate total storage size
            total_size = 0
            for file_path in metadata_files + content_files:
                total_size += file_path.stat().st_size

            # Count by document type
            type_counts = {}
            for metadata_file in metadata_files:
                try:
                    with open(metadata_file, 'r') as f:
                        metadata = json.load(f)
                    doc_type = metadata.get("doc_type", "unknown")
                    type_counts[doc_type] = type_counts.get(doc_type, 0) + 1
                except Exception:
                    continue

            return {
                "total_documents": len(metadata_files),
                "total_size_bytes": total_size,
                "total_size_mb": round(total_size / (1024 * 1024), 2),
                "cache_size": len(self._cache),
                "document_types": type_counts,
                "storage_path": str(self.store_path),
                "metadata_files": len(metadata_files),
                "content_files": len(content_files)
            }
        except Exception as e:
            logger.error(f"Error getting document store stats: {str(e)}")
            return {"error": str(e)}
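A minimal usage sketch for the store methods above (illustrative only; assumes an already-constructed DocumentStoreService instance named `store`, and `"doc-123"` is a hypothetical document id):

import asyncio

async def demo_store(store):
    # Mutate whitelisted metadata fields, run a substring search, then report stats
    await store.update_document_metadata("doc-123", {"tags": ["ai"], "category": "research"})
    hits = await store.search_documents("transformer", fields=["filename", "content"])
    print(f"{len(hits)} matching documents")
    print(await store.get_stats())

# asyncio.run(demo_store(store))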
services/elevenlabs_service.py
ADDED
|
@@ -0,0 +1,341 @@
import logging
import asyncio
from typing import Optional, Dict, Any, List
import json

try:
    from elevenlabs.client import ElevenLabs
    from elevenlabs.conversational_ai.conversation import Conversation, ClientTools
    from elevenlabs.conversational_ai.default_audio_interface import DefaultAudioInterface
    ELEVENLABS_AVAILABLE = True
except ImportError:
    ELEVENLABS_AVAILABLE = False
    logger = logging.getLogger(__name__)
    logger.warning("ElevenLabs SDK not available. Voice features will be disabled.")

import config
from services.llamaindex_service import LlamaIndexService

logger = logging.getLogger(__name__)

class ElevenLabsService:
    """
    Service for integrating ElevenLabs Conversational AI with RAG capabilities.
    Provides voice-based interaction with the document library.
    """

    def __init__(self, llamaindex_service: LlamaIndexService):
        """
        Initialize ElevenLabs service with RAG integration

        Args:
            llamaindex_service: LlamaIndex service for document queries
        """
        self.config = config.config
        self.llamaindex_service = llamaindex_service
        self.client = None
        self.client_tools = None
        self.active_conversations: Dict[str, Conversation] = {}

        if not ELEVENLABS_AVAILABLE:
            logger.error("ElevenLabs SDK not installed. Run: pip install elevenlabs")
            return

        if not self.config.ELEVENLABS_API_KEY:
            logger.warning("ELEVENLABS_API_KEY not configured. Voice features will be limited.")
            return

        try:
            # Initialize ElevenLabs client
            self.client = ElevenLabs(api_key=self.config.ELEVENLABS_API_KEY)
            logger.info("ElevenLabs client initialized successfully")

            # Initialize client tools for custom tool registration
            self.client_tools = ClientTools()

            # Register RAG tool
            self._register_rag_tool()

            logger.info("ElevenLabs service initialized with RAG tool")

        except Exception as e:
            logger.error(f"Error initializing ElevenLabs service: {str(e)}")

    def _register_rag_tool(self):
        """Register RAG query tool with ElevenLabs agent"""
        if not self.client_tools:
            return

        try:
            # Register the query_documents tool
            # Modern ElevenLabs SDK: register(tool_name, handler=callable)
            self.client_tools.register("query_documents", handler=self._rag_query_tool)

            logger.info("RAG tool 'query_documents' registered successfully")

        except Exception as e:
            logger.error(f"Error registering RAG tool: {str(e)}")

    async def _rag_query_tool(self, params: Dict[str, Any]) -> Dict[str, Any]:
        """
        Custom tool for querying documents using LlamaIndex agentic RAG

        Args:
            params: Dictionary containing the query
                - query (str): The user's question or search query

        Returns:
            Dictionary with answer and metadata
        """
        try:
            query = params.get("query", "")

            if not query:
                return {
                    "error": "No query provided",
                    "answer": "I didn't receive a question to search for."
                }

            logger.info(f"RAG tool called with query: '{query}'")

            # Query the LlamaIndex agentic RAG system
            try:
                result = await asyncio.wait_for(
                    self.llamaindex_service.query(query),
                    timeout=self.config.CONVERSATION_TIMEOUT
                )

                logger.info("RAG query successful")

                return {
                    "answer": result,
                    "source": "document_library",
                    "confidence": "high"
                }

            except asyncio.TimeoutError:
                logger.error("RAG query timeout")
                return {
                    "error": "timeout",
                    "answer": "The search took too long. Please try a simpler question."
                }

        except Exception as e:
            logger.error(f"Error in RAG query tool: {str(e)}")
            return {
                "error": str(e),
                "answer": f"I encountered an error searching the documents: {str(e)}"
            }

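    # When the ElevenLabs agent invokes the "query_documents" client tool during a
    # live session, the SDK routes the call to the handler registered above, and
    # the agent speaks the returned "answer" field back to the user.
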
    def create_conversation(
        self,
        agent_id: Optional[str] = None,
        session_id: Optional[str] = None
    ) -> Optional[Conversation]:
        """
        Create a new conversation session

        Args:
            agent_id: ElevenLabs agent ID (uses config default if not provided)
            session_id: Optional session ID for tracking

        Returns:
            Conversation object or None if initialization fails
        """
        if not self.client:
            logger.error("ElevenLabs client not initialized")
            return None

        try:
            agent_id = agent_id or self.config.ELEVENLABS_AGENT_ID

            if not agent_id:
                logger.error("No agent ID provided or configured")
                return None

            # Create audio interface for real-time audio
            audio_interface = DefaultAudioInterface()

            # Create conversation with RAG tool
            conversation = Conversation(
                client=self.client,
                agent_id=agent_id,
                requires_auth=True,
                audio_interface=audio_interface,
                client_tools=self.client_tools
            )

            # Store conversation if session ID provided
            if session_id:
                self.active_conversations[session_id] = conversation

            logger.info(f"Created conversation for agent: {agent_id}")
            return conversation

        except Exception as e:
            logger.error(f"Error creating conversation: {str(e)}")
            return None

    async def start_conversation(self, session_id: Optional[str] = None) -> Dict[str, Any]:
        """
        Start a new conversation session (async wrapper for UI)

        Args:
            session_id: Optional session ID for tracking

        Returns:
            Dictionary with success status and conversation info
        """
        try:
            conversation = self.create_conversation(session_id=session_id)

            if conversation:
                return {
                    "success": True,
                    "session_id": session_id,
                    "message": "Conversation started successfully"
                }
            else:
                return {
                    "success": False,
                    "error": "Failed to create conversation"
                }
        except Exception as e:
            logger.error(f"Error starting conversation: {str(e)}")
            return {
                "success": False,
                "error": str(e)
            }

    async def process_voice_query(
        self,
        audio_file_path: str,
        agent_id: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Process a voice query file and return response

        Args:
            audio_file_path: Path to audio file
            agent_id: Optional agent ID

        Returns:
            Dictionary with transcription, answer, and metadata
        """
        try:
            # For now, this is a placeholder for file-based processing.
            # ElevenLabs Conversational AI is primarily WebSocket-based;
            # this would be used for async/batch processing.

            logger.info(f"Processing voice query from: {audio_file_path}")

            # This would require additional implementation for file upload
            # and processing through the ElevenLabs API.

            return {
                "status": "pending",
                "message": "Voice query processing requires WebSocket connection",
                "file": audio_file_path
            }

        except Exception as e:
            logger.error(f"Error processing voice query: {str(e)}")
            return {
                "status": "error",
                "error": str(e)
            }

    async def end_conversation(self, session_id: str) -> bool:
        """
        End an active conversation session

        Args:
            session_id: Session identifier

        Returns:
            True if conversation ended successfully
        """
        try:
            if session_id in self.active_conversations:
                conversation = self.active_conversations[session_id]

                # Try to end the session gracefully
                try:
                    conversation.end_session()
                except AttributeError as ae:
                    # Handle cases where DefaultAudioInterface doesn't have expected methods
                    logger.warning(f"Could not cleanly end session: {str(ae)}")
                except Exception as e:
                    logger.warning(f"Error during session cleanup: {str(e)}")

                # Always remove from active conversations
                del self.active_conversations[session_id]
                logger.info(f"Ended conversation: {session_id}")
                return True
            return False

        except Exception as e:
            logger.error(f"Error ending conversation: {str(e)}")
            return False

    def get_available_voices(self) -> List[Dict[str, str]]:
        """
        Get list of available voice models

        Returns:
            List of voice model information
        """
        try:
            if not self.client:
                return []

            # Get voices from ElevenLabs API
            voices = self.client.voices.get_all()

            return [
                {
                    "voice_id": voice.voice_id,
                    "name": voice.name,
                    "category": voice.category if hasattr(voice, 'category') else "general"
                }
                for voice in voices.voices
            ]

        except Exception as e:
            logger.error(f"Error getting voices: {str(e)}")
            return []

    def is_available(self) -> bool:
        """Check if ElevenLabs service is available and configured"""
        return ELEVENLABS_AVAILABLE and self.client is not None

    async def test_connection(self) -> Dict[str, Any]:
        """
        Test ElevenLabs API connection

        Returns:
            Dictionary with test results
        """
        try:
            if not self.client:
                return {
                    "status": "error",
                    "message": "Client not initialized"
                }

            # Try to fetch user info or voices as a connection test
            voices = self.get_available_voices()

            return {
                "status": "success",
                "message": "ElevenLabs API connected",
                "voices_available": len(voices),
                "rag_tool_registered": self.client_tools is not None
            }

        except Exception as e:
            logger.error(f"Connection test failed: {str(e)}")
            return {
                "status": "error",
                "message": str(e)
            }
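A minimal wiring sketch for the voice service above (illustrative names; assumes a ready LlamaIndexService and the ELEVENLABS_* config values referenced in this diff):

import asyncio

async def demo_voice(llamaindex_service):
    voice = ElevenLabsService(llamaindex_service)
    if not voice.is_available():
        return
    print(await voice.test_connection())                      # sanity-check API key and voices
    print(await voice.start_conversation(session_id="demo"))  # opens a Conversation with the RAG tool attached
    # ... live audio flows over the SDK's WebSocket session ...
    await voice.end_conversation("demo")

# asyncio.run(demo_voice(llamaindex_service))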
services/embedding_service.py
ADDED
|
@@ -0,0 +1,243 @@
import logging
import asyncio
from typing import List, Optional, Dict, Any
import numpy as np
from sentence_transformers import SentenceTransformer
import torch
import openai
import config

logger = logging.getLogger(__name__)

class EmbeddingService:
    def __init__(self):
        self.config = config.config
        self.model_name = self.config.EMBEDDING_MODEL
        self.model = None
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.openai_client = None
        self.is_openai_model = False

        # Initialize OpenAI client if needed
        if self.config.OPENAI_API_KEY:
            self.openai_client = openai.OpenAI(api_key=self.config.OPENAI_API_KEY)

        # Load model lazily
        self._load_model()

    def _load_model(self):
        """Load the embedding model"""
        try:
            logger.info(f"Loading embedding model: {self.model_name}")

            if self.model_name.startswith("text-embedding-"):
                if not self.openai_client:
                    logger.warning(f"OpenAI model {self.model_name} requested but OPENAI_API_KEY not found. Falling back to local model.")
                    self.model_name = "sentence-transformers/all-MiniLM-L6-v2"
                    self.is_openai_model = False
                    self.model = SentenceTransformer(self.model_name, device=self.device)
                else:
                    self.is_openai_model = True
                    logger.info(f"Using OpenAI embedding model: {self.model_name}")
            else:
                self.is_openai_model = False
                self.model = SentenceTransformer(self.model_name, device=self.device)
                logger.info(f"Local embedding model loaded successfully on {self.device}")

        except Exception as e:
            logger.error(f"Failed to load embedding model: {str(e)}")
            # Fallback to a smaller model
            try:
                self.model_name = "all-MiniLM-L6-v2"
                self.is_openai_model = False
                self.model = SentenceTransformer(self.model_name, device=self.device)
                logger.info(f"Loaded fallback embedding model: {self.model_name}")
            except Exception as fallback_error:
                logger.error(f"Failed to load fallback model: {str(fallback_error)}")
                raise

    async def generate_embeddings(self, texts: List[str], batch_size: int = 32) -> List[List[float]]:
        """Generate embeddings for a list of texts"""
        if not texts:
            return []

        if not self.is_openai_model and self.model is None:
            raise RuntimeError("Embedding model not loaded")

        try:
            # Filter out empty texts
            non_empty_texts = [text for text in texts if text and text.strip()]
            if not non_empty_texts:
                logger.warning("No non-empty texts provided for embedding")
                return []

            logger.info(f"Generating embeddings for {len(non_empty_texts)} texts using {self.model_name}")

            # Process in batches to manage memory/API limits
            all_embeddings = []
            for i in range(0, len(non_empty_texts), batch_size):
                batch = non_empty_texts[i:i + batch_size]

                # Run embedding generation in thread pool to avoid blocking
                loop = asyncio.get_event_loop()
                batch_embeddings = await loop.run_in_executor(
                    None,
                    self._generate_batch_embeddings,
                    batch
                )
                all_embeddings.extend(batch_embeddings)

            logger.info(f"Generated {len(all_embeddings)} embeddings")
            return all_embeddings

        except Exception as e:
            logger.error(f"Error generating embeddings: {str(e)}")
            raise

    def _generate_batch_embeddings(self, texts: List[str]) -> List[List[float]]:
        """Generate embeddings for a batch of texts (synchronous)"""
        try:
            if self.is_openai_model:
                # OpenAI Embeddings
                response = self.openai_client.embeddings.create(
                    input=texts,
                    model=self.model_name
                )
                return [data.embedding for data in response.data]
            else:
                # Local SentenceTransformer
                embeddings = self.model.encode(
                    texts,
                    convert_to_numpy=True,
                    normalize_embeddings=True,
                    batch_size=len(texts)
                )
                return embeddings.tolist()
        except Exception as e:
            logger.error(f"Error in batch embedding generation: {str(e)}")
            raise

    async def generate_single_embedding(self, text: str) -> Optional[List[float]]:
        """Generate embedding for a single text"""
        if not text or not text.strip():
            return None

        try:
            embeddings = await self.generate_embeddings([text])
            return embeddings[0] if embeddings else None
        except Exception as e:
            logger.error(f"Error generating single embedding: {str(e)}")
            return None

    def get_embedding_dimension(self) -> int:
        """Get the dimension of embeddings produced by the model"""
        if self.is_openai_model:
            if "small" in self.model_name:
                return 1536
            elif "large" in self.model_name:
                return 3072
            elif "ada" in self.model_name:
                return 1536
            else:
                # Default fallback or make a call to check?
                # For now assume 1536 as it's standard for recent OpenAI models
                return 1536

        if self.model is None:
            raise RuntimeError("Embedding model not loaded")

        return self.model.get_sentence_embedding_dimension()

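    # Local embeddings above are generated with normalize_embeddings=True, so they
    # are unit-length and cosine similarity reduces to a dot product; the explicit
    # norm division below keeps the result correct for un-normalized vectors too.
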
    def compute_similarity(self, embedding1: List[float], embedding2: List[float]) -> float:
        """Compute cosine similarity between two embeddings"""
        try:
            # Convert to numpy arrays
            emb1 = np.array(embedding1)
            emb2 = np.array(embedding2)

            # Compute cosine similarity
            similarity = np.dot(emb1, emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2))

            return float(similarity)
        except Exception as e:
            logger.error(f"Error computing similarity: {str(e)}")
            return 0.0

    def compute_similarities(self, query_embedding: List[float], embeddings: List[List[float]]) -> List[float]:
        """Compute similarities between a query embedding and multiple embeddings"""
        try:
            query_emb = np.array(query_embedding)
            emb_matrix = np.array(embeddings)

            # Compute cosine similarities
            similarities = np.dot(emb_matrix, query_emb) / (
                np.linalg.norm(emb_matrix, axis=1) * np.linalg.norm(query_emb)
            )

            return similarities.tolist()
        except Exception as e:
            logger.error(f"Error computing similarities: {str(e)}")
            return [0.0] * len(embeddings)

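    # Quick sanity check for the cosine math above: orthogonal vectors score 0.0
    # and identical vectors score 1.0, e.g.
    #   compute_similarity([1.0, 0.0], [0.0, 1.0]) == 0.0
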
    async def embed_chunks(self, chunks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Embed a list of chunks and add embeddings to them"""
        if not chunks:
            return []

        try:
            # Extract texts
            texts = [chunk.get('content', '') for chunk in chunks]

            # Generate embeddings
            embeddings = await self.generate_embeddings(texts)

            # Add embeddings to chunks
            embedded_chunks = []
            for i, chunk in enumerate(chunks):
                if i < len(embeddings):
                    chunk_copy = chunk.copy()
                    chunk_copy['embedding'] = embeddings[i]
                    embedded_chunks.append(chunk_copy)
                else:
                    logger.warning(f"No embedding generated for chunk {i}")
                    embedded_chunks.append(chunk)

            return embedded_chunks
        except Exception as e:
            logger.error(f"Error embedding chunks: {str(e)}")
            raise

    def validate_embedding(self, embedding: List[float]) -> bool:
        """Validate that an embedding is properly formatted"""
        try:
            if not embedding:
                return False

            if not isinstance(embedding, list):
                return False

            if len(embedding) != self.get_embedding_dimension():
                return False

            # Check for NaN or infinite values
            emb_array = np.array(embedding)
            if np.isnan(emb_array).any() or np.isinf(emb_array).any():
                return False

            return True
        except Exception:
            return False

    async def get_model_info(self) -> Dict[str, Any]:
        """Get information about the loaded model"""
        try:
            return {
                "model_name": self.model_name,
                "device": "openai-api" if self.is_openai_model else self.device,
                "embedding_dimension": self.get_embedding_dimension(),
                "max_sequence_length": "8191" if self.is_openai_model else getattr(self.model, 'max_seq_length', 'unknown'),
                "model_loaded": self.is_openai_model or (self.model is not None)
            }
        except Exception as e:
            logger.error(f"Error getting model info: {str(e)}")
            return {"error": str(e)}
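A short end-to-end sketch of the embedding flow above (illustrative texts; the model choice follows the config exactly as in this diff):

import asyncio

async def demo_embeddings():
    service = EmbeddingService()
    vecs = await service.generate_embeddings(["vector databases", "dense retrieval"])
    # Similarity of the two phrases, and the dimensionality the vector store must match
    print(service.compute_similarity(vecs[0], vecs[1]))
    print(service.get_embedding_dimension())

asyncio.run(demo_embeddings())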
services/llamaindex_service.py
ADDED
|
@@ -0,0 +1,199 @@
import logging
import os
from typing import List, Optional, Any
from pathlib import Path
import shutil
import asyncio

from llama_index.core import (
    VectorStoreIndex,
    Document,
    StorageContext,
    load_index_from_storage,
    Settings,
    SummaryIndex
)
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core.agent import ReActAgent
from llama_index.core.selectors import LLMSingleSelector
from llama_index.core.query_engine import RouterQueryEngine
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.embeddings.openai import OpenAIEmbedding

import config
from services.document_store_service import DocumentStoreService

logger = logging.getLogger(__name__)

class LlamaIndexService:
    def __init__(self, document_store: DocumentStoreService):
        self.document_store = document_store
        self.config = config.config
        self.storage_dir = Path(self.config.DATA_DIR) / "llamaindex_storage"
        self.index = None
        self.agent = None
        self.is_initialized = False

        self._initialize_settings()
        # We don't fully initialize the index here because we need async access to the doc store,
        # but we try to load existing storage if available.
        self._try_load_from_storage()

    def _initialize_settings(self):
        """Initialize LlamaIndex settings (LLM, Embeddings)"""
        try:
            # LLM Setup
            if self.config.OPENAI_API_KEY:
                # Use configured OpenAI model (gpt-5.1-chat-latest or similar)
                Settings.llm = OpenAI(model=self.config.OPENAI_MODEL, api_key=self.config.OPENAI_API_KEY)
                logger.info(f"LlamaIndex using OpenAI model: {self.config.OPENAI_MODEL}")
            elif self.config.NEBIUS_API_KEY:
                # Use Nebius as an OpenAI-compatible provider
                Settings.llm = OpenAI(
                    model=self.config.NEBIUS_MODEL,
                    api_key=self.config.NEBIUS_API_KEY,
                    api_base=self.config.NEBIUS_BASE_URL
                )
                logger.info(f"LlamaIndex using Nebius model: {self.config.NEBIUS_MODEL}")
            else:
                logger.warning("No API key found for LlamaIndex LLM (OpenAI or Nebius). Agentic features may fail.")

            # Embedding Setup
            if self.config.EMBEDDING_MODEL.startswith("text-embedding-"):
                if self.config.OPENAI_API_KEY:
                    Settings.embed_model = OpenAIEmbedding(
                        model=self.config.EMBEDDING_MODEL,
                        api_key=self.config.OPENAI_API_KEY
                    )
                    logger.info(f"LlamaIndex using OpenAI embeddings: {self.config.EMBEDDING_MODEL}")
                else:
                    logger.warning("OpenAI embedding model requested but no API key found. Falling back to HuggingFace.")
                    Settings.embed_model = HuggingFaceEmbedding(
                        model_name="sentence-transformers/all-MiniLM-L6-v2"
                    )
            else:
                Settings.embed_model = HuggingFaceEmbedding(
                    model_name=self.config.EMBEDDING_MODEL
                )
                logger.info(f"LlamaIndex using HuggingFace embeddings: {self.config.EMBEDDING_MODEL}")

        except Exception as e:
            logger.error(f"Error initializing LlamaIndex settings: {str(e)}")

    def _try_load_from_storage(self):
        """Try to load index from storage synchronously"""
        try:
            if self.storage_dir.exists():
                logger.info("Loading LlamaIndex from storage...")
                storage_context = StorageContext.from_defaults(persist_dir=str(self.storage_dir))
                self.index = load_index_from_storage(storage_context)
                self._initialize_agent()
                self.is_initialized = True
            else:
                logger.info("No existing LlamaIndex storage found. Waiting for async initialization.")
        except Exception as e:
            logger.error(f"Error loading LlamaIndex from storage: {str(e)}")

    async def initialize(self):
        """Async initialization to sync documents and build index"""
        try:
            logger.info("Starting LlamaIndex async initialization...")

            # If we already have an index, we might still want to sync if it's empty or stale.
            # For now, if no index exists, we definitely need to build it.
            if self.index is None:
                await self.sync_from_document_store()

            self.is_initialized = True
            logger.info("LlamaIndex async initialization complete.")

        except Exception as e:
            logger.error(f"Error during LlamaIndex async initialization: {str(e)}")

    async def sync_from_document_store(self):
        """Sync documents from DocumentStore to LlamaIndex"""
        try:
            logger.info("Syncing documents from DocumentStore to LlamaIndex...")

            # Fetch documents from the async document store.
            # Limit to 1000 for now to avoid memory issues.
            docs = await self.document_store.list_documents(limit=1000)

            if not docs:
                logger.warning("No documents found in DocumentStore to sync.")
                # Create empty index if no docs
                self.index = VectorStoreIndex.from_documents([])
            else:
                # Convert to LlamaIndex documents
                llama_docs = []
                for doc in docs:
                    llama_doc = Document(
                        text=doc.content,
                        metadata={
                            "filename": doc.filename,
                            "document_id": doc.id,
                            **doc.metadata
                        }
                    )
                    llama_docs.append(llama_doc)

                logger.info(f"Building LlamaIndex with {len(llama_docs)} documents...")
                self.index = VectorStoreIndex.from_documents(llama_docs)

            # Persist storage
            if not self.storage_dir.exists():
                self.storage_dir.mkdir(parents=True, exist_ok=True)
            self.index.storage_context.persist(persist_dir=str(self.storage_dir))

            # Re-initialize agent with new index
            self._initialize_agent()
            logger.info("LlamaIndex sync complete.")

        except Exception as e:
            logger.error(f"Error syncing LlamaIndex: {str(e)}")

    async def sync_on_demand(self):
        """Manual trigger for syncing documents"""
        await self.sync_from_document_store()
        return True

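    # Design note: the sync above rebuilds the whole index from the document store
    # and re-persists it rather than inserting incrementally; this is simple and
    # always consistent, at the cost of re-embedding every document on each sync.
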
    def _initialize_agent(self):
        """Initialize the ReAct agent with query engine tools"""
        try:
            if not self.index:
                return

            query_engine = self.index.as_query_engine()

            query_engine_tool = QueryEngineTool(
                query_engine=query_engine,
                metadata=ToolMetadata(
                    name="document_search",
                    description="Search and retrieve information from the document library. Use this for specific questions about content."
                )
            )

            self.agent = ReActAgent.from_tools(
                [query_engine_tool],
                llm=Settings.llm,
                verbose=True
            )
            logger.info("LlamaIndex ReAct agent initialized")

        except Exception as e:
            logger.error(f"Error initializing LlamaIndex agent: {str(e)}")

    async def query(self, query_text: str) -> str:
        """Process a query using the agent"""
        if not self.agent:
            if not self.is_initialized:
                return "Agent is initializing, please try again in a moment."
            return "Agent failed to initialize. Please check logs."

        try:
            response = await self.agent.achat(query_text)
            return str(response)
        except Exception as e:
            logger.error(f"Error querying LlamaIndex agent: {str(e)}")
            return f"Error processing query: {str(e)}"
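A minimal lifecycle sketch for the agentic RAG service above (illustrative; assumes a populated DocumentStoreService instance):

import asyncio

async def demo_rag(document_store):
    rag = LlamaIndexService(document_store)
    await rag.initialize()    # loads persisted storage or builds the index from the store
    print(await rag.query("What do the uploaded papers say about chunking strategies?"))

# asyncio.run(demo_rag(document_store))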
services/llm_service.py
ADDED
|
@@ -0,0 +1,420 @@
from mistralai import Mistral
import logging
import asyncio
from typing import List, Dict, Any, Optional

import openai
import config

logger = logging.getLogger(__name__)

class LLMService:
    def __init__(self):
        self.config = config.config

        self.nebius_client = None
        self.mistral_client = None
        self.openai_client = None

        self._initialize_clients()

    def _initialize_clients(self):
        """Initialize LLM clients"""
        try:
            if self.config.OPENAI_API_KEY:
                self.openai_client = openai.OpenAI(
                    api_key=self.config.OPENAI_API_KEY
                )
                logger.info("OpenAI client initialized")

            if self.config.NEBIUS_API_KEY:
                self.nebius_client = openai.OpenAI(
                    api_key=self.config.NEBIUS_API_KEY,
                    base_url=self.config.NEBIUS_BASE_URL
                )
                logger.info("NEBIUS client initialized")

            if self.config.MISTRAL_API_KEY:
                self.mistral_client = Mistral(  # Standard sync client
                    api_key=self.config.MISTRAL_API_KEY
                )
                logger.info("Mistral client initialized")

            # Check if at least one client is initialized
            if not any([self.openai_client, self.nebius_client, self.mistral_client]):
                logger.warning("No LLM clients could be initialized based on current config. Check API keys.")
            else:
                logger.info("LLM clients initialized successfully (at least one).")

        except Exception as e:
            logger.error(f"Error initializing LLM clients: {str(e)}")
            raise

    async def generate_text(self, prompt: str, model: str = "auto", max_tokens: int = 1000, temperature: float = 0.7) -> str:
        """Generate text using the specified model, with new priority for 'auto'."""
        try:
            selected_model_name_for_call: str = ""

            if model == "auto":
                # Priority: 1. NEBIUS (Llama 3.3 - cost-effective), 2. OpenAI (GPT-5.1), 3. Mistral
                if self.nebius_client and self.config.NEBIUS_MODEL:
                    selected_model_name_for_call = self.config.NEBIUS_MODEL
                    logger.debug(f"Auto-selected NEBIUS model: {selected_model_name_for_call}")
                    return await self._generate_with_nebius(prompt, selected_model_name_for_call, max_tokens, temperature)
                elif self.openai_client and self.config.OPENAI_MODEL:
                    selected_model_name_for_call = self.config.OPENAI_MODEL
                    logger.debug(f"Auto-selected OpenAI model: {selected_model_name_for_call}")
                    return await self._generate_with_openai(prompt, selected_model_name_for_call, max_tokens, temperature)
                elif self.mistral_client and self.config.MISTRAL_MODEL:
                    selected_model_name_for_call = self.config.MISTRAL_MODEL
                    logger.debug(f"Auto-selected Mistral model: {selected_model_name_for_call}")
                    return await self._generate_with_mistral(prompt, selected_model_name_for_call, max_tokens, temperature)
                else:
                    logger.error("No LLM clients available for 'auto' mode or default models not configured.")
                    raise ValueError("No LLM clients available for 'auto' mode or default models not configured.")

            elif model == "fast":
                # Priority for speed: 1. OpenAI (GPT-5-mini), 2. Mistral Small, 3. Nebius
                if self.openai_client and self.config.FAST_MODEL:
                    return await self._generate_with_openai(prompt, self.config.FAST_MODEL, max_tokens, temperature)
                # Fall back to auto if fast model not available
                return await self.generate_text(prompt, "auto", max_tokens, temperature)

            elif model.startswith("gpt-") or model.startswith("openai/") or "o1-" in model or "o3-" in model:
                if self.openai_client:
                    actual_model = model.split('/')[-1] if '/' in model else model
                    return await self._generate_with_openai(prompt, actual_model, max_tokens, temperature)
                elif self.nebius_client and "gpt-oss" in model:  # Handle Nebius "openai/" prefix if any
                    actual_model = model.split('/')[-1] if '/' in model else model
                    return await self._generate_with_nebius(prompt, actual_model, max_tokens, temperature)
                else:
                    raise ValueError("OpenAI client not available. Check API key.")

            elif model.lower().startswith("nebius/") or model.lower().startswith("meta-llama/"):
                if not self.nebius_client:
                    raise ValueError("NEBIUS client not available. Check API key.")
                return await self._generate_with_nebius(prompt, model, max_tokens, temperature)

            elif model.startswith("mistral"):
                if not self.mistral_client:
                    raise ValueError("Mistral client not available. Check API key or model prefix.")
                return await self._generate_with_mistral(prompt, model, max_tokens, temperature)

            else:
                raise ValueError(f"Unsupported model: {model}. Must start with 'gpt-', 'openai/', 'nebius/', 'mistral', or be 'auto'.")

        except Exception as e:
            logger.error(f"Error generating text with model '{model}': {str(e)}")
            raise

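    # Routing summary for generate_text above: "auto" prefers Nebius, then OpenAI,
    # then Mistral, based on which keys are configured; "fast" prefers FAST_MODEL;
    # explicit names such as "gpt-4o", "mistral-small-latest", or "meta-llama/..."
    # (illustrative names) are dispatched by prefix, e.g.:
    #     text = await llm_service.generate_text("Summarize this.", model="auto")
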
    async def _generate_with_openai(self, prompt: str, model_name: str, max_tokens: int, temperature: float) -> str:
        """Generate text using OpenAI"""
        if not self.openai_client:
            raise RuntimeError("OpenAI client not initialized.")
        try:
            logger.debug(f"Generating with OpenAI model: {model_name}, max_tokens: {max_tokens}, temp: {temperature}")
            loop = asyncio.get_event_loop()

            # Determine correct token parameter based on model family:
            # GPT-5, o1, and o3 series use max_completion_tokens
            use_completion_tokens = any(x in model_name for x in ["gpt-5", "o1-", "o3-"])

            kwargs = {
                "model": model_name,
                "messages": [{"role": "user", "content": prompt}],
            }

            if use_completion_tokens:
                kwargs["max_completion_tokens"] = max_tokens
                # Reasoning models enforce temperature=1
                kwargs["temperature"] = 1
                if temperature != 1:
                    logger.warning(f"Temperature {temperature} ignored for model {model_name} (requires 1).")
            else:
                kwargs["max_tokens"] = max_tokens
                kwargs["temperature"] = temperature

            response = await loop.run_in_executor(
                None,
                lambda: self.openai_client.chat.completions.create(**kwargs)
            )
            if response.choices and response.choices[0].message:
                content = response.choices[0].message.content
                if content is not None:
                    return content.strip()
            return ""
        except Exception as e:
            logger.error(f"Error with OpenAI generation (model: {model_name}): {str(e)}")
            raise

    async def _generate_with_nebius(self, prompt: str, model_name: str, max_tokens: int, temperature: float) -> str:
        """Generate text using NEBIUS (OpenAI OSS models via sync client)"""
        if not self.nebius_client:
            raise RuntimeError("NEBIUS client not initialized.")
        try:
            logger.debug(f"Generating with NEBIUS model: {model_name}, max_tokens: {max_tokens}, temp: {temperature}, prompt: '{prompt[:50]}...'")
            loop = asyncio.get_event_loop()

            response = await loop.run_in_executor(
                None,
                lambda: self.nebius_client.chat.completions.create(
                    model=model_name,
                    messages=[{"role": "user", "content": prompt}],
                    max_tokens=max_tokens,
                    temperature=temperature
                )
            )
            if response.choices and response.choices[0].message:
                content = response.choices[0].message.content
                if content is not None:
                    return content.strip()
                else:
                    logger.warning(f"NEBIUS response message content is None for model {model_name}.")
                    return ""
            else:
                logger.warning(f"NEBIUS response did not contain expected choices or message for model {model_name}.")
                return ""
        except Exception as e:
            logger.error(f"Error with NEBIUS generation (model: {model_name}): {str(e)}")
            raise

    async def _generate_with_mistral(self, prompt: str, model_name: str, max_tokens: int, temperature: float) -> str:
        """Generate text using Mistral (Sync via run_in_executor)"""
        if not self.mistral_client:
            raise RuntimeError("Mistral client not initialized.")
        try:
            logger.debug(f"Generating with Mistral model: {model_name}, max_tokens: {max_tokens}, temp: {temperature}, prompt: '{prompt[:50]}...'")
            loop = asyncio.get_event_loop()

            # The v1 `mistralai` SDK imported above (`from mistralai import Mistral`)
            # exposes chat completions as client.chat.complete(...); calling
            # client.chat(...) directly is the legacy MistralClient interface.
            response = await loop.run_in_executor(
                None,
                lambda: self.mistral_client.chat.complete(
                    model=model_name,
                    messages=[{"role": "user", "content": prompt}],
                    max_tokens=max_tokens,
                    temperature=temperature
                )
            )
            if response.choices and response.choices[0].message:
                content = response.choices[0].message.content
                if content is not None:
                    return content.strip()
                else:
                    logger.warning(f"Mistral response message content is None for model {model_name}.")
                    return ""
            else:
                logger.warning(f"Mistral response did not contain expected choices or message for model {model_name}.")
                return ""
        except Exception as e:
            logger.error(f"Error with Mistral generation (model: {model_name}): {str(e)}")
            raise

async def summarize(self, text: str, style: str = "concise", max_length: Optional[int] = None) -> str:
|
| 214 |
+
if not text.strip():
|
| 215 |
+
return ""
|
| 216 |
+
|
| 217 |
+
style_prompts = {
|
| 218 |
+
"concise": "Provide a concise summary of the following text, focusing on the main points:",
|
| 219 |
+
"detailed": "Provide a detailed summary of the following text, including key details and supporting information:",
|
| 220 |
+
"bullet_points": "Summarize the following text as a list of bullet points highlighting the main ideas:",
|
| 221 |
+
"executive": "Provide an executive summary of the following text, focusing on key findings and actionable insights:"
|
| 222 |
+
}
|
| 223 |
+
prompt_template = style_prompts.get(style, style_prompts["concise"])
|
| 224 |
+
if max_length:
|
| 225 |
+
prompt_template += f" Keep the summary under approximately {max_length} words."
|
| 226 |
+
|
| 227 |
+
prompt = f"{prompt_template}\n\nText to summarize:\n{text}\n\nSummary:"
|
| 228 |
+
|
| 229 |
+
try:
|
| 230 |
+
summary_max_tokens = (max_length * 2) if max_length else 500
|
| 231 |
+
summary = await self.generate_text(prompt, model="auto", max_tokens=summary_max_tokens, temperature=0.3)
|
| 232 |
+
return summary.strip()
|
| 233 |
+
except Exception as e:
|
| 234 |
+
logger.error(f"Error generating summary: {str(e)}")
|
| 235 |
+
return "Error generating summary"
|
| 236 |
+
|
| 237 |
+
async def generate_tags(self, text: str, max_tags: int = 5) -> List[str]:
|
| 238 |
+
if not text.strip():
|
| 239 |
+
return []
|
| 240 |
+
|
| 241 |
+
prompt = f"""Generate up to {max_tags} relevant tags for the following text.
|
| 242 |
+
Tags should be concise, descriptive keywords or phrases (1-3 words typically) that capture the main topics or themes.
|
| 243 |
+
Return only the tags, separated by commas. Do not include any preamble or explanation.

Text:
{text}

Tags:"""

        try:
            # Use FAST_MODEL for tags
            response = await self.generate_text(prompt, model="fast", max_tokens=100, temperature=0.5)
            tags = [tag.strip().lower() for tag in response.split(',') if tag.strip()]
            tags = [tag for tag in tags if tag and 1 < len(tag) < 50]
            return list(dict.fromkeys(tags))[:max_tags]
        except Exception as e:
            logger.error(f"Error generating tags: {str(e)}")
            return []

    async def categorize(self, text: str, categories: List[str]) -> str:
        if not text.strip() or not categories:
            return "Uncategorized"

        categories_str = ", ".join([f"'{cat}'" for cat in categories])
        prompt = f"""Classify the following text into ONE of these categories: {categories_str}.
Choose the single most appropriate category based on the content and main theme of the text.
Return only the category name as a string, exactly as it appears in the list provided. Do not add any other text or explanation.

Text to classify:
{text}

Category:"""

        try:
            # Use FAST_MODEL for categorization
            response = await self.generate_text(prompt, model="fast", max_tokens=50, temperature=0.1)
            category_candidate = response.strip().strip("'\"")

            for cat in categories:
                if cat.lower() == category_candidate.lower():
                    return cat

            logger.warning(f"LLM returned category '{category_candidate}' which is not in the provided list: {categories}. Falling back.")
            return categories[0] if categories else "Uncategorized"
        except Exception as e:
            logger.error(f"Error categorizing text: {str(e)}")
            return "Uncategorized"

    async def answer_question(self, question: str, context: str, max_context_length: int = 3000) -> str:
        if not question.strip():
            return "No question provided."
        if not context.strip():
            return "I don't have enough context to answer this question. Please provide relevant information."

        if len(context) > max_context_length:
            context = context[:max_context_length] + "..."
            logger.warning(f"Context truncated to {max_context_length} characters for question answering.")

        prompt = f"""You are an expert Q&A assistant. Your task is to synthesize an answer to the user's question based *only* on the provided source documents.
Analyze all the source documents provided in the context below.
If the information is present, provide a comprehensive answer.

Here are the source documents:
--- START OF CONTEXT ---
{context}
--- END OF CONTEXT ---

Based on the context above, please provide a clear and concise answer to the following question.

Question: {question}

Answer:"""

        try:
            answer = await self.generate_text(prompt, model="auto", max_tokens=800, temperature=0.5)
            return answer.strip()
        except Exception as e:
            logger.error(f"Error answering question: {str(e)}")
            return "I encountered an error while trying to answer your question."

    async def extract_key_information(self, text: str) -> Dict[str, Any]:
        if not text.strip():
            return {}

        prompt = f"""Analyze the following text and extract key information.
Provide the response as a JSON object with the following keys:
- "main_topic": (string) The main topic or subject of the text.
- "key_points": (array of strings) A list of 3-5 key points or takeaways.
- "entities": (array of strings) Important people, places, organizations, or products mentioned.
- "sentiment": (string) Overall sentiment of the text (e.g., "positive", "neutral", "negative", "mixed").
- "content_type": (string) The perceived type of content (e.g., "article", "email", "report", "conversation", "advertisement", "other").

If a piece of information is not found or not applicable, use null or an empty array/string as appropriate for the JSON structure.

Text to analyze:
---
{text}
---

JSON Analysis:"""

        try:
            response_str = await self.generate_text(prompt, model="auto", max_tokens=500, temperature=0.4)

            import json
            try:
                # Remove a Markdown code fence if the model wrapped its JSON in one.
                # Note: slice off the literal prefix rather than using
                # str.lstrip("```json"), which strips *characters*, not a prefix.
                if response_str.startswith("```json"):
                    response_str = response_str[len("```json"):].rstrip("`").strip()

                info = json.loads(response_str)
                expected_keys = {"main_topic", "key_points", "entities", "sentiment", "content_type"}
                if not expected_keys.issubset(info.keys()):
                    logger.warning(f"Extracted information missing some expected keys. Got: {info.keys()}")
                return info
            except json.JSONDecodeError as je:
                logger.error(f"Failed to parse JSON from LLM response for key_information: {je}")
                logger.debug(f"LLM Response string was: {response_str}")
                info_fallback = {}
                for line in response_str.split('\n'):
                    if ':' in line:
                        key, value = line.split(':', 1)
                        key_clean = key.strip().lower().replace(' ', '_')
                        value_clean = value.strip()
                        if value_clean:
                            if key_clean in ["key_points", "entities"] and '[' in value_clean and ']' in value_clean:
                                try:
                                    info_fallback[key_clean] = [item.strip().strip("'\"") for item in value_clean.strip('[]').split(',') if item.strip()]
                                except Exception:
                                    info_fallback[key_clean] = value_clean
                            else:
                                info_fallback[key_clean] = value_clean
                if info_fallback:
                    logger.info("Successfully parsed key information using fallback line-based method.")
                    return info_fallback
                return {"error": "Failed to parse LLM output", "raw_response": response_str}
        except Exception as e:
            logger.error(f"Error extracting key information: {str(e)}")
            return {"error": f"General error extracting key information: {str(e)}"}
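
    # The fence handling above slices off a literal "```json" prefix; a regex copes
    # with more fence variants (leading whitespace, a missing language tag). A minimal
    # standalone sketch -- parse_llm_json is a hypothetical helper, not part of this service:
    #
    #   import json, re
    #
    #   def parse_llm_json(raw: str) -> dict:
    #       # Strip an optional ```json ... ``` fence before parsing
    #       m = re.search(r"```(?:json)?\s*(.*?)\s*```", raw, re.DOTALL)
    #       return json.loads(m.group(1) if m else raw)
    #
    #   parse_llm_json('```json\n{"main_topic": "OCR"}\n```')  # -> {'main_topic': 'OCR'}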

    async def check_availability(self) -> Dict[str, bool]:
        """Check which LLM services are available by making a tiny test call."""
        availability = {
            "openai": False,
            "nebius": False,
            "mistral": False
        }
        test_prompt = "Hello"
        test_max_tokens = 5
        test_temp = 0.1

        logger.info("Checking LLM availability...")

        if self.openai_client and self.config.OPENAI_MODEL:
            try:
                logger.debug(f"Testing OpenAI availability with model {self.config.OPENAI_MODEL}...")
                test_response = await self._generate_with_openai(test_prompt, self.config.OPENAI_MODEL, test_max_tokens, test_temp)
                availability["openai"] = bool(test_response.strip())
            except Exception as e:
                logger.warning(f"OpenAI availability check failed for model {self.config.OPENAI_MODEL}: {e}")
        logger.info(f"OpenAI available: {availability['openai']}")

        if self.nebius_client and self.config.NEBIUS_MODEL:
            try:
                logger.debug(f"Testing NEBIUS availability with model {self.config.NEBIUS_MODEL}...")
                test_response = await self._generate_with_nebius(test_prompt, self.config.NEBIUS_MODEL, test_max_tokens, test_temp)
                availability["nebius"] = bool(test_response.strip())
            except Exception as e:
                logger.warning(f"NEBIUS availability check failed for model {self.config.NEBIUS_MODEL}: {e}")
        logger.info(f"NEBIUS available: {availability['nebius']}")

        if self.mistral_client and self.config.MISTRAL_MODEL:
            try:
                logger.debug(f"Testing Mistral availability with model {self.config.MISTRAL_MODEL}...")
                test_response = await self._generate_with_mistral(test_prompt, self.config.MISTRAL_MODEL, test_max_tokens, test_temp)
                availability["mistral"] = bool(test_response.strip())
            except Exception as e:
                logger.warning(f"Mistral availability check failed for model {self.config.MISTRAL_MODEL}: {e}")
        logger.info(f"Mistral available: {availability['mistral']}")

        logger.info(f"Final LLM Availability: {availability}")
        return availability
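Taken together, the methods above form the service's question-answering surface. A minimal sketch of exercising them from an async context, assuming `llm` is an already-initialized LLMService:

import asyncio

async def demo(llm):
    availability = await llm.check_availability()
    print(availability)  # e.g. {'openai': True, 'nebius': False, 'mistral': False}

    category = await llm.categorize("FAISS stores embeddings for search.", ["AI", "Cooking"])
    answer = await llm.answer_question("What does FAISS store?", context="FAISS stores embeddings.")
    print(category, answer)

# asyncio.run(demo(llm))  # with a concrete LLMService instance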
services/ocr_service.py
ADDED
@@ -0,0 +1,288 @@
import logging
import asyncio
from pathlib import Path
import os
import base64  # For encoding files
from typing import Optional, List, Dict, Any
import json

from mistralai import Mistral
from mistralai.models import SDKError
# PIL (Pillow) for dummy image creation in main_example
from PIL import Image, ImageDraw, ImageFont

logger = logging.getLogger(__name__)

class OCRService:
    def __init__(self):
        self.api_key = os.environ.get("MISTRAL_API_KEY")
        if not self.api_key:
            logger.error("MISTRAL_API_KEY environment variable not set.")
            raise ValueError("MISTRAL_API_KEY not found in environment variables.")

        self.client = Mistral(api_key=self.api_key)
        self.ocr_model_name = "mistral-ocr-latest"
        self.language = 'eng'
        logger.info(f"OCRService (using Mistral AI model {self.ocr_model_name}) initialized.")

    def _encode_file_to_base64(self, file_path: str) -> Optional[str]:
        try:
            with open(file_path, "rb") as file_to_encode:
                return base64.b64encode(file_to_encode.read()).decode('utf-8')
        except FileNotFoundError:
            logger.error(f"Error: The file {file_path} was not found for Base64 encoding.")
            return None
        except Exception as e:
            logger.error(f"Error during Base64 encoding for {file_path}: {e}")
            return None

    async def _process_file_with_mistral(self, file_path: str, mime_type: str) -> str:
        file_name = Path(file_path).name
        logger.info(f"Preparing to process file: {file_name} (MIME: {mime_type}) with Mistral OCR.")

        base64_encoded_file = self._encode_file_to_base64(file_path)
        if not base64_encoded_file:
            logger.warning(f"Base64 encoding failed for {file_name}, cannot process.")
            return ""

        document_type = "image_url" if mime_type.startswith("image/") else "document_url"
        uri_key = "image_url" if document_type == "image_url" else "document_url"
        data_uri = f"data:{mime_type};base64,{base64_encoded_file}"

        document_payload = {
            "type": document_type,
            uri_key: data_uri
        }
        try:
            logger.info(f"Calling Mistral client.ocr.process for {file_name} with model {self.ocr_model_name}.")
            # get_running_loop() is the idiomatic call inside a coroutine
            # (get_event_loop() is deprecated in this context).
            loop = asyncio.get_running_loop()

            ocr_response = await loop.run_in_executor(
                None,
                lambda: self.client.ocr.process(
                    model=self.ocr_model_name,
                    document=document_payload,
                    include_image_base64=False
                )
            )

            logger.info(f"Received OCR response for {file_name}. Type: {type(ocr_response)}")

            extracted_markdown = ""
            if hasattr(ocr_response, 'pages') and ocr_response.pages and isinstance(ocr_response.pages, list):
                all_pages_markdown = []
                for i, page in enumerate(ocr_response.pages):
                    page_content = None
                    if hasattr(page, 'markdown') and page.markdown:  # Check for 'markdown' attribute
                        page_content = page.markdown
                        logger.debug(f"Extracted content from page {i} using 'page.markdown'.")
                    elif hasattr(page, 'markdown_content') and page.markdown_content:
                        page_content = page.markdown_content
                        logger.debug(f"Extracted content from page {i} using 'page.markdown_content'.")
                    elif hasattr(page, 'text') and page.text:
                        page_content = page.text
                        logger.debug(f"Extracted content from page {i} using 'page.text'.")

                    if page_content:
                        all_pages_markdown.append(page_content)
                    else:
                        page_details_for_log = str(page)[:200]  # Default to string snippet
                        if hasattr(page, '__dict__'):
                            page_details_for_log = str(vars(page))[:200]  # Log part of vars if it's an object
                        logger.warning(f"Page {i} in OCR response for {file_name} has no 'markdown', 'markdown_content', or 'text'. Page details: {page_details_for_log}")

                if all_pages_markdown:
                    extracted_markdown = "\n\n---\nPage Break (simulated)\n---\n\n".join(all_pages_markdown)  # Simulate page breaks
                else:
                    logger.warning(f"'pages' attribute found but no content extracted from any pages for {file_name}.")

            # Fallbacks if ocr_response doesn't have 'pages' but might have direct text/markdown
            elif hasattr(ocr_response, 'text') and ocr_response.text:
                extracted_markdown = ocr_response.text
                logger.info(f"Extracted content from 'ocr_response.text' (no pages structure) for {file_name}.")
            elif hasattr(ocr_response, 'markdown') and ocr_response.markdown:
                extracted_markdown = ocr_response.markdown
                logger.info(f"Extracted content from 'ocr_response.markdown' (no pages structure) for {file_name}.")
            elif isinstance(ocr_response, str) and ocr_response:
                extracted_markdown = ocr_response
                logger.info(f"OCR response is a direct non-empty string for {file_name}.")
            else:
                logger.warning(f"Could not extract markdown from OCR response for {file_name} using known attributes (pages, text, markdown).")

            if not extracted_markdown.strip():
                logger.warning(f"Extracted markdown is empty for {file_name} after all parsing attempts.")

            return extracted_markdown.strip()

        except SDKError as e:
            logger.error(f"Mistral API Exception during client.ocr.process for {file_name}: {e.message}")
            logger.exception("SDKError details:")
            return ""
        except Exception as e:
            logger.error(f"Generic Exception during Mistral client.ocr.process call for {file_name}: {e}")
            logger.exception("Exception details:")
            return ""
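    # The request shape, stripped of the service plumbing -- the same
    # client.ocr.process call as above ("page.png" is a hypothetical input file,
    # MISTRAL_API_KEY assumed set):
    #
    #   import base64, os
    #   from mistralai import Mistral
    #
    #   client = Mistral(api_key=os.environ["MISTRAL_API_KEY"])
    #   with open("page.png", "rb") as f:
    #       b64 = base64.b64encode(f.read()).decode("utf-8")
    #   resp = client.ocr.process(
    #       model="mistral-ocr-latest",
    #       document={"type": "image_url", "image_url": f"data:image/png;base64,{b64}"},
    #       include_image_base64=False,
    #   )
    #   print(resp.pages[0].markdown if resp.pages else "")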
    async def extract_text_from_image(self, image_path: str, language: Optional[str] = None) -> str:
        if language:
            logger.info(f"Language parameter '{language}' provided, but Mistral OCR is broadly multilingual.")

        ext = Path(image_path).suffix.lower()
        mime_map = {'.jpeg': 'image/jpeg', '.jpg': 'image/jpeg', '.png': 'image/png',
                    '.gif': 'image/gif', '.bmp': 'image/bmp', '.tiff': 'image/tiff', '.webp': 'image/webp',
                    '.avif': 'image/avif'}
        mime_type = mime_map.get(ext)
        if not mime_type:
            logger.warning(f"Unsupported image extension '{ext}' for path '{image_path}'. Attempting with 'application/octet-stream'.")
            mime_type = 'application/octet-stream'

        return await self._process_file_with_mistral(image_path, mime_type)

    async def extract_text_from_pdf(self, pdf_path: str) -> str:
        return await self._process_file_with_mistral(pdf_path, "application/pdf")

    async def extract_text_from_pdf_images(self, pdf_path: str) -> List[str]:
        logger.info("Mistral processes PDFs directly. This method returns the full Markdown content as a single list item.")
        full_markdown = await self._process_file_with_mistral(pdf_path, "application/pdf")
        if full_markdown:
            return [full_markdown]
        return [""]

    async def extract_text_with_confidence(self, image_path: str, min_confidence: float = 0.5) -> Dict[str, Any]:
        logger.warning("Mistral Document AI API (ocr.process) typically returns structured text (Markdown). Word-level confidence scores are not standard; the 'confidence' field is a placeholder.")

        ext = Path(image_path).suffix.lower()
        mime_map = {'.jpeg': 'image/jpeg', '.jpg': 'image/jpeg', '.png': 'image/png', '.avif': 'image/avif'}
        mime_type = mime_map.get(ext)
        if not mime_type:
            logger.warning(f"Unsupported image extension '{ext}' in extract_text_with_confidence. Defaulting mime type.")
            mime_type = 'application/octet-stream'

        text_markdown = await self._process_file_with_mistral(image_path, mime_type)

        return {
            "text": text_markdown,
            "confidence": 0.0,
            "word_count": len(text_markdown.split()) if text_markdown else 0,
            "raw_data": "Mistral ocr.process response contains structured data. See logs from _process_file_with_mistral for details."
        }

    async def detect_language(self, image_path: str) -> str:
        logger.warning("Mistral OCR is multilingual; explicit language detection is not part of client.ocr.process.")
        return 'eng'

    async def extract_tables_from_image(self, image_path: str) -> List[List[str]]:
        logger.info("Extracting text (Markdown) from image using Mistral. Mistral OCR preserves table structures in Markdown.")

        ext = Path(image_path).suffix.lower()
        mime_map = {'.jpeg': 'image/jpeg', '.jpg': 'image/jpeg', '.png': 'image/png', '.avif': 'image/avif'}
        mime_type = mime_map.get(ext)
        if not mime_type:
            logger.warning(f"Unsupported image extension '{ext}' in extract_tables_from_image. Defaulting mime type.")
            mime_type = 'application/octet-stream'

        markdown_content = await self._process_file_with_mistral(image_path, mime_type)

        if markdown_content:
            logger.info("Attempting basic parsing of Markdown tables. For complex tables, a dedicated parser is recommended.")
            table_data = []
            # Simplified parsing logic for example purposes - can be improved significantly.
            lines = markdown_content.split('\n')
            for line in lines:
                stripped_line = line.strip()
                if stripped_line.startswith('|') and stripped_line.endswith('|') and "---" not in stripped_line:
                    cells = [cell.strip() for cell in stripped_line.strip('|').split('|')]
                    if any(cells):
                        table_data.append(cells)

            if table_data:
                logger.info(f"Extracted {len(table_data)} lines potentially forming tables using basic parsing.")
            else:
                logger.info("No distinct table structures found with basic parsing from extracted markdown.")
            return table_data
        return []

    async def get_supported_languages(self) -> List[str]:
        logger.info("Mistral OCR is multilingual. Refer to official Mistral AI documentation for details.")
        return ['eng', 'multilingual (refer to Mistral documentation)']

    async def validate_ocr_setup(self) -> Dict[str, Any]:
        try:
            models_response = await asyncio.to_thread(self.client.models.list)
            model_ids = [model.id for model in models_response.data]
            return {
                "status": "operational",
                "message": "Mistral client initialized. API key present. Model listing successful.",
                "mistral_available_models_sample": model_ids[:5],
                "configured_ocr_model": self.ocr_model_name,
            }
        except SDKError as e:
            logger.error(f"Mistral API Exception during setup validation: {e.message}")
            return {"status": "error", "error": f"Mistral API Error: {e.message}"}
        except Exception as e:
            logger.error(f"Generic error during Mistral OCR setup validation: {str(e)}")
            return {"status": "error", "error": str(e)}

    def extract_text(self, file_path: str) -> str:
        logger.warning("`extract_text` is a synchronous method. Running async Mistral OCR in a blocking way.")
        try:
            ext = Path(file_path).suffix.lower()
            if ext in ['.jpeg', '.jpg', '.png', '.gif', '.bmp', '.tiff', '.webp', '.avif']:
                result = asyncio.run(self.extract_text_from_image(file_path))
            elif ext == '.pdf':
                result = asyncio.run(self.extract_text_from_pdf(file_path))
            else:
                logger.error(f"Unsupported file type for sync extract_text: {file_path}")
                return "Unsupported file type."
            return result
        except Exception as e:
            logger.error(f"Error in synchronous extract_text for {file_path}: {str(e)}")
            return "Error during sync extraction."

# Example of how to use the OCRService (main execution part)
async def main_example():
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s - %(levelname)s - %(name)s - %(funcName)s - %(message)s')

    if not os.environ.get("MISTRAL_API_KEY"):
        logger.error("MISTRAL_API_KEY environment variable is not set. Please set it: export MISTRAL_API_KEY='yourkey'")
        return

    ocr_service = OCRService()

    logger.info("--- Validating OCR Service Setup ---")
    validation_status = await ocr_service.validate_ocr_setup()
    logger.info(f"OCR Service Validation: {validation_status}")
    if validation_status.get("status") == "error":
        logger.error("Halting due to validation error.")
        return

    # --- Test with a specific PDF file ---
    pdf_path_to_test = r"C:\path\to\your\certificate.pdf"

    if os.path.exists(pdf_path_to_test):
        logger.info(f"\n--- Extracting text from specific PDF: {pdf_path_to_test} ---")
        # Using the method that aligns with the original `extract_text_from_pdf_images` signature
        pdf_markdown_list = await ocr_service.extract_text_from_pdf_images(pdf_path_to_test)
        if pdf_markdown_list and pdf_markdown_list[0]:
            logger.info(f"Extracted Markdown from PDF ({pdf_path_to_test}):\n" + pdf_markdown_list[0])
        else:
            logger.warning(f"No text extracted from PDF {pdf_path_to_test} or an error occurred.")
    else:
        logger.warning(f"PDF file for specific test '{pdf_path_to_test}' not found. Skipping this test.")
        logger.warning("Please update `pdf_path_to_test` in `main_example` to a valid PDF path.")

    image_path = "dummy_test_image_ocr.png"
    if os.path.exists(image_path):
        logger.info(f"\n--- Extracting text from image: {image_path} ---")
        # ... image processing logic ...
        pass
    else:
        logger.info(f"Dummy image {image_path} not created or found, skipping optional image test.")


if __name__ == '__main__':
    asyncio.run(main_example())
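`extract_tables_from_image` returns each table row as a list of cell strings; when the first row is a header, the rows convert naturally into dicts. A minimal sketch, with a hypothetical input file:

import asyncio

async def table_demo():
    ocr = OCRService()  # requires MISTRAL_API_KEY in the environment
    rows = await ocr.extract_tables_from_image("invoice.png")  # hypothetical image
    if rows:
        header, *body = rows
        print([dict(zip(header, row)) for row in body])

# asyncio.run(table_demo())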
services/podcast_generator_service.py
ADDED
@@ -0,0 +1,663 @@
import logging
import asyncio
import json
import uuid
from typing import List, Dict, Any, Optional
from dataclasses import dataclass, asdict
from datetime import datetime
from pathlib import Path
import re

try:
    from elevenlabs import VoiceSettings
    from elevenlabs.client import ElevenLabs
    ELEVENLABS_AVAILABLE = True
except ImportError:
    ELEVENLABS_AVAILABLE = False

import config
from services.llamaindex_service import LlamaIndexService
from services.llm_service import LLMService

logger = logging.getLogger(__name__)


@dataclass
class DocumentAnalysis:
    """Analysis results from document(s)"""
    key_insights: List[str]  # 5-7 main points
    topics: List[str]
    complexity_level: str  # beginner, intermediate, advanced
    estimated_words: int
    source_documents: List[str]
    summary: str


@dataclass
class DialogueLine:
    """Single line of podcast dialogue"""
    speaker: str  # "HOST1" or "HOST2"
    text: str
    pause_after: float = 0.5  # seconds


@dataclass
class PodcastScript:
    """Complete podcast script"""
    dialogue: List[DialogueLine]
    total_duration_estimate: float
    word_count: int
    style: str

    def to_text(self) -> str:
        """Convert to readable transcript"""
        lines = []
        for line in self.dialogue:
            lines.append(f"{line.speaker}: {line.text}")
        return "\n\n".join(lines)


@dataclass
class PodcastMetadata:
    """Metadata for generated podcast"""
    podcast_id: str
    title: str
    description: str
    source_documents: List[str]
    style: str
    duration_seconds: float
    file_size_mb: float
    voices: Dict[str, str]
    generated_at: str
    generation_cost: Dict[str, float]
    key_topics: List[str]


@dataclass
class PodcastResult:
    """Complete podcast generation result"""
    podcast_id: str
    audio_file_path: str
    transcript: str
    metadata: Optional[PodcastMetadata]  # None when generation fails
    generation_time: float
    success: bool
    error: Optional[str] = None


class PodcastGeneratorService:
    """
    Service for generating conversational podcasts from documents.
    Combines LlamaIndex for analysis and ElevenLabs for voice synthesis.
    """

    # Word count per minute for podcast pacing
    WORDS_PER_MINUTE = 150

    # Script generation prompts for different styles
    SCRIPT_PROMPTS = {
        "conversational": """You are an expert podcast script writer. Create an engaging 2-host podcast discussing insights from documents.

CONTEXT:
{analysis}

REQUIREMENTS:
- Duration: {duration_minutes} minutes (approximately {word_count} words)
- Style: Conversational, friendly, and accessible
- Format: Alternating dialogue between HOST1 and HOST2
- Include natural transitions, questions, and "aha!" moments
- Make complex topics easy to understand
- Add enthusiasm and genuine curiosity
- Balance speaking time between both hosts

DIALOGUE FORMAT (strictly follow):
HOST1: [What they say]
HOST2: [What they say]

STRUCTURE:
1. Opening Hook (30 seconds): Grab attention with an intriguing question or fact
2. Introduction (1 minute): Set context and preview what's coming
3. Main Discussion (70% of time): Deep dive into key insights
4. Wrap-up (1 minute): Summarize key takeaways and final thoughts

TONE: Friendly, enthusiastic, educational but not condescending

Generate the complete podcast script now:""",

        "educational": """You are creating an educational podcast script. Two hosts discuss document insights in a clear, instructive manner.

CONTEXT:
{analysis}

REQUIREMENTS:
- Duration: {duration_minutes} minutes (approximately {word_count} words)
- Style: Clear, methodical, educational
- HOST1 acts as the teacher/expert, HOST2 as the curious learner
- Include explanations of complex concepts
- Use examples and analogies
- Build knowledge progressively

DIALOGUE FORMAT:
HOST1: [Expert explanation]
HOST2: [Clarifying question or observation]

Generate the complete educational podcast script now:""",

        "technical": """You are writing a technical podcast for an informed audience. Discuss document insights with precision and depth.

CONTEXT:
{analysis}

REQUIREMENTS:
- Duration: {duration_minutes} minutes (approximately {word_count} words)
- Style: Professional, detailed, technically accurate
- HOST1 is the subject matter expert, HOST2 is an informed interviewer
- Use proper technical terminology
- Dive into implementation details
- Discuss implications and applications

DIALOGUE FORMAT:
HOST1: [Technical insight]
HOST2: [Probing question]

Generate the complete technical podcast script now:""",

        "casual": """You are creating a fun, casual podcast. Two friends discuss interesting ideas from documents.

CONTEXT:
{analysis}

REQUIREMENTS:
- Duration: {duration_minutes} minutes (approximately {word_count} words)
- Style: Relaxed, humorous, energetic
- Both hosts are enthusiastic and engaged
- Use casual language and occasional humor
- Make it entertaining while staying informative
- Quick pacing with energy

DIALOGUE FORMAT:
HOST1: [Casual commentary]
HOST2: [Enthusiastic response]

Generate the complete casual podcast script now:"""
    }

    def __init__(
        self,
        llamaindex_service: LlamaIndexService,
        llm_service: LLMService,
        elevenlabs_api_key: Optional[str] = None
    ):
        """
        Initialize podcast generator service

        Args:
            llamaindex_service: Service for document analysis
            llm_service: Service for script generation
            elevenlabs_api_key: ElevenLabs API key (uses config if not provided)
        """
        self.config = config.config
        self.llamaindex_service = llamaindex_service
        self.llm_service = llm_service

        # Initialize ElevenLabs client
        self.elevenlabs_client = None
        if ELEVENLABS_AVAILABLE:
            api_key = elevenlabs_api_key or self.config.ELEVENLABS_API_KEY
            if api_key:
                try:
                    self.elevenlabs_client = ElevenLabs(api_key=api_key)
                    logger.info("ElevenLabs client initialized for podcast generation")
                except Exception as e:
                    logger.error(f"Failed to initialize ElevenLabs client: {e}")

        # Create podcast storage directory
        self.podcast_dir = Path("./data/podcasts")
        self.podcast_dir.mkdir(parents=True, exist_ok=True)

        # Metadata database file
        self.metadata_file = self.podcast_dir / "metadata_db.json"
        self._ensure_metadata_db()

    def _ensure_metadata_db(self):
        """Ensure metadata database exists"""
        if not self.metadata_file.exists():
            self.metadata_file.write_text(json.dumps([], indent=2))

    async def generate_podcast(
        self,
        document_ids: List[str],
        style: str = "conversational",
        duration_minutes: int = 10,
        host1_voice: str = "Rachel",
        host2_voice: str = "Adam"
    ) -> PodcastResult:
        """
        Generate a complete podcast from documents

        Args:
            document_ids: List of document IDs to analyze
            style: Podcast style (conversational, educational, technical, casual)
            duration_minutes: Target duration in minutes
            host1_voice: Voice name for first host
            host2_voice: Voice name for second host

        Returns:
            PodcastResult with audio file path and metadata
        """
        start_time = datetime.now()
        podcast_id = str(uuid.uuid4())

        try:
            logger.info(f"Starting podcast generation {podcast_id}")
            logger.info(f"Documents: {document_ids}, Style: {style}, Duration: {duration_minutes}min")

            # Step 1: Analyze documents
            logger.info("Step 1: Analyzing documents...")
            analysis = await self.analyze_documents(document_ids)

            # Step 2: Generate script
            logger.info("Step 2: Generating podcast script...")
            script = await self.generate_script(analysis, style, duration_minutes)

            # Step 3: Synthesize audio
            logger.info("Step 3: Synthesizing audio with voices...")
            audio_file_path = await self.synthesize_audio(
                podcast_id,
                script,
                host1_voice,
                host2_voice
            )

            # Calculate generation time
            generation_time = (datetime.now() - start_time).total_seconds()

            # Step 4: Create metadata
            logger.info("Step 4: Creating metadata...")
            metadata = self._create_metadata(
                podcast_id,
                analysis,
                script,
                audio_file_path,
                [host1_voice, host2_voice],  # a list (not a set) so host order is preserved
                document_ids,
                style
            )

            # Save metadata
            self._save_metadata(metadata)

            # Save transcript
            transcript_path = self.podcast_dir / f"{podcast_id}_transcript.txt"
            transcript_path.write_text(script.to_text(), encoding="utf-8")

            logger.info(f"Podcast generated successfully: {podcast_id}")

            return PodcastResult(
                podcast_id=podcast_id,
                audio_file_path=str(audio_file_path),
                transcript=script.to_text(),
                metadata=metadata,
                generation_time=generation_time,
                success=True
            )

        except Exception as e:
            logger.error(f"Podcast generation failed: {str(e)}", exc_info=True)
            return PodcastResult(
                podcast_id=podcast_id,
                audio_file_path="",
                transcript="",
                metadata=None,
                generation_time=(datetime.now() - start_time).total_seconds(),
                success=False,
                error=str(e)
            )

    async def analyze_documents(self, document_ids: List[str]) -> DocumentAnalysis:
        """
        Analyze documents to extract key insights for podcast

        Args:
            document_ids: List of document IDs

        Returns:
            DocumentAnalysis with key insights and topics
        """
        # Create analysis query for the agentic RAG
        analysis_query = f"""Analyze the following documents and provide:
1. The 5-7 most important insights or key points
2. Main themes and topics covered
3. The overall complexity level (beginner/intermediate/advanced)
4. A brief summary suitable for podcast discussion

Document IDs: {', '.join(document_ids)}

Provide a structured analysis optimized for creating an engaging podcast discussion."""

        # Use LlamaIndex agentic RAG for analysis
        result = await self.llamaindex_service.query(analysis_query)

        # Parse the result to extract structured information.
        # This is a simplified parser - in production, you might want more robust parsing.
        insights = self._extract_insights(result)
        topics = self._extract_topics(result)
        complexity = self._determine_complexity(result)

        return DocumentAnalysis(
            key_insights=insights[:7],  # Limit to 7
            topics=topics,
            complexity_level=complexity,
            estimated_words=len(result.split()),
            source_documents=document_ids,
            summary=result
        )

    def _extract_insights(self, text: str) -> List[str]:
        """Extract key insights from analysis text"""
        insights = []
        # Simple extraction based on numbered lists or bullet points.
        # The alternation is grouped and anchored so re.sub strips only a leading
        # list marker (an unanchored '\-' branch would delete hyphens anywhere in the line).
        lines = text.split('\n')
        for line in lines:
            line = line.strip()
            # Match patterns like "1.", "2.", "-", "*", "•"
            if re.match(r'^(\d+\.|-|\*|•)', line):
                insight = re.sub(r'^(\d+\.|-|\*|•)\s*', '', line).strip()
                if len(insight) > 20:  # Ensure it's substantial
                    insights.append(insight)

        # If no insights found, create from first few sentences
        if not insights:
            sentences = text.split('.')
            insights = [s.strip() + '.' for s in sentences[:7] if len(s.strip()) > 20]

        return insights

    def _extract_topics(self, text: str) -> List[str]:
        """Extract main topics from analysis"""
        # Simple keyword extraction - could be enhanced with NLP
        common_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'}
        words = text.lower().split()
        word_freq = {}

        for word in words:
            word = re.sub(r'[^\w\s]', '', word)
            if len(word) > 4 and word not in common_words:
                word_freq[word] = word_freq.get(word, 0) + 1

        # Get top topics
        topics = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:5]
        return [topic[0].title() for topic in topics]

    def _determine_complexity(self, text: str) -> str:
        """Determine content complexity level"""
        text_lower = text.lower()

        # Simple heuristic based on keywords
        if any(word in text_lower for word in ['basic', 'introduction', 'beginner', 'simple']):
            return "beginner"
        elif any(word in text_lower for word in ['advanced', 'complex', 'sophisticated', 'expert']):
            return "advanced"
        else:
            return "intermediate"

    async def generate_script(
        self,
        analysis: DocumentAnalysis,
        style: str,
        duration_minutes: int
    ) -> PodcastScript:
        """
        Generate podcast script from analysis

        Args:
            analysis: Document analysis results
            style: Podcast style
            duration_minutes: Target duration

        Returns:
            Complete podcast script
        """
        # Calculate target word count
        target_words = duration_minutes * self.WORDS_PER_MINUTE

        # Prepare analysis context
        analysis_context = f"""
KEY INSIGHTS:
{chr(10).join(f"{i+1}. {insight}" for i, insight in enumerate(analysis.key_insights))}

TOPICS: {', '.join(analysis.topics)}
COMPLEXITY: {analysis.complexity_level}

SUMMARY:
{analysis.summary[:500]}...
"""

        # Get prompt template for style
        prompt_template = self.SCRIPT_PROMPTS.get(style, self.SCRIPT_PROMPTS["conversational"])

        # Fill in the template
        prompt = prompt_template.format(
            analysis=analysis_context,
            duration_minutes=duration_minutes,
            word_count=target_words
        )

        # Generate script using LLM
        script_text = await self.llm_service.generate_text(
            prompt,
            max_tokens=target_words * 2,  # Give room for generation
            temperature=0.8  # More creative
        )

        # Parse script into dialogue lines
        dialogue = self._parse_script(script_text)

        # Calculate actual word count and duration
        word_count = sum(len(line.text.split()) for line in dialogue)
        duration_estimate = word_count / self.WORDS_PER_MINUTE

        return PodcastScript(
            dialogue=dialogue,
            total_duration_estimate=duration_estimate * 60,  # Convert to seconds
            word_count=word_count,
            style=style
        )

    def _parse_script(self, script_text: str) -> List[DialogueLine]:
        """Parse generated script into dialogue lines"""
        dialogue = []
        lines = script_text.split('\n')

        for line in lines:
            line = line.strip()
            if not line:
                continue

            # Match "HOST1:" or "HOST2:" format
            if line.startswith('HOST1:'):
                text = line[6:].strip()
                if text:
                    dialogue.append(DialogueLine(speaker="HOST1", text=text))
            elif line.startswith('HOST2:'):
                text = line[6:].strip()
                if text:
                    dialogue.append(DialogueLine(speaker="HOST2", text=text))

        return dialogue
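
    # The parser keys strictly off the "HOST1:"/"HOST2:" prefixes, so any line the
    # LLM emits in another format (stage directions, unlabeled asides) is silently
    # dropped. A quick sketch of the accepted format -- _parse_script does not touch
    # self, so it can be exercised without a fully wired service:
    #
    #   sample = "HOST1: Welcome back!\nHOST2: Today: vector search.\n(stage direction - dropped)"
    #   lines = PodcastGeneratorService._parse_script(None, sample)
    #   [(l.speaker, l.text) for l in lines]
    #   # [('HOST1', 'Welcome back!'), ('HOST2', 'Today: vector search.')]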

    def _get_voice_id(self, voice_name: str) -> str:
        """
        Get voice ID from voice name.
        Falls back to the first available voice if not found.

        Args:
            voice_name: Voice name (e.g., "Rachel", "Adam")

        Returns:
            Voice ID string
        """
        try:
            # Try to get voices and find by name
            voices = self.elevenlabs_client.voices.get_all()

            if not voices or not voices.voices:
                logger.error("No voices available from ElevenLabs")
                raise RuntimeError("No voices available")

            # First, try exact name match
            for voice in voices.voices:
                if voice.name.lower() == voice_name.lower():
                    logger.info(f"Found exact voice match for '{voice_name}': {voice.voice_id}")
                    return voice.voice_id

            # Try partial match
            for voice in voices.voices:
                if voice_name.lower() in voice.name.lower():
                    logger.info(f"Found partial voice match for '{voice_name}': {voice.name} ({voice.voice_id})")
                    return voice.voice_id

            # Use first available voice as fallback
            first_voice = voices.voices[0]
            logger.warning(f"Voice '{voice_name}' not found, using first available voice: {first_voice.name} ({first_voice.voice_id})")
            return first_voice.voice_id

        except Exception as e:
            logger.error(f"Could not fetch voices: {e}", exc_info=True)
            raise RuntimeError(f"Failed to get voice ID: {str(e)}")

    async def synthesize_audio(
        self,
        podcast_id: str,
        script: PodcastScript,
        host1_voice: str,
        host2_voice: str
    ) -> Path:
        """
        Synthesize audio from script using ElevenLabs

        Args:
            podcast_id: Unique podcast ID
            script: Podcast script
            host1_voice: Voice for HOST1
            host2_voice: Voice for HOST2

        Returns:
            Path to generated MP3 file
        """
        if not self.elevenlabs_client:
            raise RuntimeError("ElevenLabs client not initialized")

        audio_file = self.podcast_dir / f"{podcast_id}.mp3"

        # For now, create a simple text-to-speech pass over the full script.
        # In production, you'd synthesize each line separately and combine
        # the segments with pauses.
        full_text = script.to_text()

        # Get actual voice ID
        voice_id = self._get_voice_id(host1_voice)

        try:
            # Use the modern ElevenLabs TTS API.
            # Note: this is a simplified version using a single voice; a full
            # implementation would process each line with its own host's voice.
            logger.info(f"Generating audio with voice: {host1_voice}")

            audio_generator = self.elevenlabs_client.text_to_speech.convert(
                voice_id=voice_id,  # Using resolved voice ID
                text=full_text,
                model_id="eleven_multilingual_v2"
            )

            # Write audio chunks to file
            with open(audio_file, 'wb') as f:
                for chunk in audio_generator:
                    if chunk:
                        f.write(chunk)

            # Verify the file was created with content
            if audio_file.exists() and audio_file.stat().st_size > 1000:
                logger.info(f"Audio synthesized successfully: {audio_file} ({audio_file.stat().st_size} bytes)")
                return audio_file
            else:
                raise RuntimeError(f"Generated audio file is too small or empty: {audio_file.stat().st_size} bytes")

        except Exception as e:
            logger.error(f"Audio synthesis failed: {e}", exc_info=True)
            raise RuntimeError(f"Failed to generate podcast audio: {str(e)}")

    def _create_metadata(
        self,
        podcast_id: str,
        analysis: DocumentAnalysis,
        script: PodcastScript,
        audio_path: Path,
        voices: List[str],
        document_ids: List[str],
        style: str
    ) -> PodcastMetadata:
        """Create podcast metadata"""
        # Auto-generate title
        title = f"Podcast: {analysis.topics[0] if analysis.topics else 'Document Discussion'}"

        # Create description
        description = f"A {style} podcast discussing insights from {len(document_ids)} document(s)."

        # Calculate file size
        file_size_mb = audio_path.stat().st_size / (1024 * 1024) if audio_path.exists() else 0

        # Estimate costs
        llm_cost = (script.word_count / 1000) * 0.01  # Rough estimate
        tts_cost = (script.word_count * 5 / 1000) * 0.30  # Rough estimate

        return PodcastMetadata(
            podcast_id=podcast_id,
            title=title,
            description=description,
            source_documents=document_ids,
            style=style,
            duration_seconds=script.total_duration_estimate,
            file_size_mb=file_size_mb,
            voices={"host1": voices[0] if len(voices) > 0 else "Rachel",
                    "host2": voices[1] if len(voices) > 1 else "Adam"},
            generated_at=datetime.now().isoformat(),
            generation_cost={"llm_cost": llm_cost, "tts_cost": tts_cost, "total": llm_cost + tts_cost},
            key_topics=analysis.topics
        )

    def _save_metadata(self, metadata: PodcastMetadata):
        """Save metadata to database"""
        try:
            # Load existing metadata
            existing = json.loads(self.metadata_file.read_text())

            # Add new metadata
            existing.append(asdict(metadata))

            # Save back
            self.metadata_file.write_text(json.dumps(existing, indent=2))

            logger.info(f"Metadata saved for podcast: {metadata.podcast_id}")

        except Exception as e:
            logger.error(f"Failed to save metadata: {e}")

    def list_podcasts(self, limit: int = 10) -> List[PodcastMetadata]:
        """List generated podcasts"""
        try:
            data = json.loads(self.metadata_file.read_text())
            podcasts = [PodcastMetadata(**item) for item in data[-limit:]]
            return list(reversed(podcasts))  # Most recent first
        except Exception as e:
            logger.error(f"Failed to list podcasts: {e}")
            return []

    def get_podcast(self, podcast_id: str) -> Optional[PodcastMetadata]:
        """Get specific podcast metadata"""
        try:
            data = json.loads(self.metadata_file.read_text())
            for item in data:
                if item.get('podcast_id') == podcast_id:
                    return PodcastMetadata(**item)
            return None
        except Exception as e:
            logger.error(f"Failed to get podcast: {e}")
            return None
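End to end, the service is driven through `generate_podcast`. A minimal sketch, assuming initialized LlamaIndexService and LLMService instances and at least one ingested document (the document ID shown is hypothetical):

import asyncio

async def podcast_demo(llamaindex_service, llm_service):
    generator = PodcastGeneratorService(llamaindex_service, llm_service)
    result = await generator.generate_podcast(
        document_ids=["doc-123"],  # hypothetical ingested document ID
        style="educational",
        duration_minutes=5,
        host1_voice="Rachel",
        host2_voice="Adam",
    )
    if result.success:
        print(result.audio_file_path, result.metadata.title)
    else:
        print("Generation failed:", result.error)

# asyncio.run(podcast_demo(llamaindex_service, llm_service))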
services/vector_store_service.py
ADDED
@@ -0,0 +1,294 @@
| 1 |
+
import logging
|
| 2 |
+
import os
|
| 3 |
+
import pickle
|
| 4 |
+
import numpy as np
|
| 5 |
+
from typing import List, Dict, Any, Optional, Tuple
|
| 6 |
+
import faiss
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
import asyncio
|
| 9 |
+
import json
|
| 10 |
+
|
| 11 |
+
from core.models import SearchResult, Chunk
|
| 12 |
+
import config
|
| 13 |
+
|
| 14 |
+
logger = logging.getLogger(__name__)
|
| 15 |
+
|
| 16 |
+
class VectorStoreService:
|
| 17 |
+
def __init__(self):
|
| 18 |
+
self.config = config.config
|
| 19 |
+
self.index = None
|
| 20 |
+
self.chunks_metadata = {} # Maps index position to chunk metadata
|
| 21 |
+
self.dimension = None
|
| 22 |
+
|
| 23 |
+
# Paths
|
| 24 |
+
self.store_path = Path(self.config.VECTOR_STORE_PATH)
|
| 25 |
+
self.store_path.mkdir(parents=True, exist_ok=True)
|
| 26 |
+
|
| 27 |
+
self.index_path = self.store_path / f"{self.config.INDEX_NAME}.index"
|
| 28 |
+
self.metadata_path = self.store_path / f"{self.config.INDEX_NAME}_metadata.json"
|
| 29 |
+
|
| 30 |
+
# Load existing index if available
|
| 31 |
+
self._load_index()
|
| 32 |
+
|
| 33 |
+
def _load_index(self):
|
| 34 |
+
"""Load existing FAISS index and metadata"""
|
| 35 |
+
try:
|
| 36 |
+
if self.index_path.exists() and self.metadata_path.exists():
|
| 37 |
+
logger.info("Loading existing FAISS index...")
|
| 38 |
+
|
| 39 |
+
# Load FAISS index
|
| 40 |
+
self.index = faiss.read_index(str(self.index_path))
|
| 41 |
+
self.dimension = self.index.d
|
| 42 |
+
|
| 43 |
+
# Load metadata
|
| 44 |
+
with open(self.metadata_path, 'r') as f:
|
| 45 |
+
self.chunks_metadata = json.load(f)
|
| 46 |
+
|
| 47 |
+
logger.info(f"Loaded index with {self.index.ntotal} vectors, dimension {self.dimension}")
|
| 48 |
+
else:
|
| 49 |
+
logger.info("No existing index found, will create new one")
|
| 50 |
+
except Exception as e:
|
| 51 |
+
logger.error(f"Error loading index: {str(e)}")
|
| 52 |
+
|
    def _initialize_index(self, dimension: int):
        """Initialize a new, empty FAISS index."""
        try:
            # Use IndexFlatIP (inner product) for cosine similarity, which holds
            # because the stored embeddings are L2-normalized
            self.index = faiss.IndexFlatIP(dimension)
            self.dimension = dimension
            self.chunks_metadata = {}
            logger.info(f"Initialized new FAISS index with dimension {dimension}")
        except Exception as e:
            logger.error(f"Error initializing index: {str(e)}")
            raise

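    # Why IndexFlatIP amounts to cosine similarity here: for unit-length vectors
    # the inner product equals the cosine of the angle between them. A minimal
    # standalone sketch (assuming only FAISS and NumPy):
    #
    #     import faiss
    #     import numpy as np
    #
    #     vecs = np.random.rand(3, 8).astype(np.float32)
    #     faiss.normalize_L2(vecs)              # in-place L2 normalization
    #     index = faiss.IndexFlatIP(8)
    #     index.add(vecs)
    #     scores, ids = index.search(vecs[:1], 2)
    #     # scores[0][0] is ~1.0: the query's inner product with itself
    #
    # If embeddings ever arrived unnormalized, inner-product scores would be
    # scaled by vector length and no longer comparable as cosine similarities.
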
    async def add_chunks(self, chunks: List[Chunk]) -> bool:
        """Add chunks to the vector store"""
        if not chunks:
            return True

        try:
            # Keep only chunks that actually carry an embedding
            embedded_chunks = [c for c in chunks if c.embedding and len(c.embedding) > 0]
            if not embedded_chunks:
                logger.warning("No valid embeddings found in chunks")
                return False

            embeddings = [c.embedding for c in embedded_chunks]

            # Check for dimension mismatch with the existing index
            if self.index is not None and self.dimension is not None:
                if len(embeddings[0]) != self.dimension:
                    logger.warning(
                        f"Dimension mismatch! New embeddings have {len(embeddings[0])}, "
                        f"but index has {self.dimension}. Rebuilding index."
                    )
                    # Reset index
                    self.index = None
                    self.chunks_metadata = {}
                    self.dimension = None

            # Initialize index if needed
            if self.index is None:
                self._initialize_index(len(embeddings[0]))

            # Assign metadata keys only after any reset above, so each key matches
            # the position its vector will occupy in the FAISS index
            base = self.index.ntotal
            new_metadata = {}
            for offset, chunk in enumerate(embedded_chunks):
                new_metadata[str(base + offset)] = {
                    "chunk_id": chunk.id,
                    "document_id": chunk.document_id,
                    "content": chunk.content,
                    "chunk_index": chunk.chunk_index,
                    "start_pos": chunk.start_pos,
                    "end_pos": chunk.end_pos,
                    "metadata": chunk.metadata
                }

            # Add to FAISS index
            embeddings_array = np.array(embeddings, dtype=np.float32)
            self.index.add(embeddings_array)

            # Update metadata and persist both to disk
            self.chunks_metadata.update(new_metadata)
            await self._save_index()

            logger.info(f"Added {len(embeddings)} chunks to vector store")
            return True

        except Exception as e:
            logger.error(f"Error adding chunks to vector store: {str(e)}")
            return False

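    # A hypothetical ingestion call, assuming Chunk (core.models) accepts the
    # fields referenced above as keyword arguments and that embeddings come from
    # the embedding service already normalized:
    #
    #     chunk = Chunk(id="c1", document_id="doc1", content="hello world",
    #                   chunk_index=0, start_pos=0, end_pos=11,
    #                   metadata={}, embedding=[0.6, 0.8])
    #     ok = await vector_store.add_chunks([chunk])   # True on success
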
    async def search(self, query_embedding: List[float], top_k: int = 5,
                     filters: Optional[Dict[str, Any]] = None) -> List[SearchResult]:
        """Search for the chunks most similar to a query embedding"""
        if self.index is None or self.index.ntotal == 0:
            logger.warning("No index available or index is empty")
            return []

        try:
            # Convert query embedding to a (1, dimension) numpy array
            query_array = np.array([query_embedding], dtype=np.float32)

            # Perform search
            scores, indices = self.index.search(query_array, min(top_k, self.index.ntotal))

            # Convert raw FAISS results to SearchResult objects
            results = []
            for score, idx in zip(scores[0], indices[0]):
                if idx == -1:  # FAISS returns -1 for empty slots
                    continue

                chunk_metadata = self.chunks_metadata.get(str(idx))
                if chunk_metadata:
                    # Apply filters if specified
                    if filters and not self._apply_filters(chunk_metadata, filters):
                        continue

                    result = SearchResult(
                        chunk_id=chunk_metadata["chunk_id"],
                        document_id=chunk_metadata["document_id"],
                        content=chunk_metadata["content"],
                        score=float(score),
                        metadata=chunk_metadata.get("metadata", {})
                    )
                    results.append(result)

            # FAISS already returns hits in score order, but sort again (descending)
            # to be explicit, since filtering may have dropped entries
            results.sort(key=lambda x: x.score, reverse=True)

            logger.info(f"Found {len(results)} search results")
            return results

        except Exception as e:
            logger.error(f"Error searching vector store: {str(e)}")
            return []

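    # A hypothetical query, assuming `query_vec` was produced by the same
    # embedding model as the stored chunks (dimensions must match):
    #
    #     hits = await vector_store.search(
    #         query_vec, top_k=3,
    #         filters={"document_ids": ["doc1", "doc2"], "content_length_min": 50},
    #     )
    #     for hit in hits:
    #         print(f"{hit.score:.3f}  {hit.content[:80]}")
    #
    # Note that filters are applied after FAISS retrieves the top_k candidates,
    # so heavily filtered queries can return fewer than top_k results.
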
    def _apply_filters(self, chunk_metadata: Dict[str, Any], filters: Dict[str, Any]) -> bool:
        """Return True if the chunk metadata passes all of the given filters"""
        try:
            for key, value in filters.items():
                if key == "document_id":
                    if chunk_metadata.get("document_id") != value:
                        return False
                elif key == "document_ids":
                    if chunk_metadata.get("document_id") not in value:
                        return False
                elif key == "content_length_min":
                    if len(chunk_metadata.get("content", "")) < value:
                        return False
                elif key == "content_length_max":
                    if len(chunk_metadata.get("content", "")) > value:
                        return False
                # Add more filter types as needed

            return True
        except Exception as e:
            # Fail open: a malformed filter should not hide otherwise valid results
            logger.error(f"Error applying filters: {str(e)}")
            return True

    async def _save_index(self):
        """Save the FAISS index and metadata to disk"""
        try:
            if self.index is not None:
                # Save FAISS index
                faiss.write_index(self.index, str(self.index_path))

                # Save metadata (JSON only allows string keys, hence str(position))
                with open(self.metadata_path, 'w') as f:
                    json.dump(self.chunks_metadata, f, indent=2)

                logger.debug("Saved index and metadata to disk")
        except Exception as e:
            logger.error(f"Error saving index: {str(e)}")

    async def get_stats(self) -> Dict[str, Any]:
        """Get statistics about the vector store"""
        try:
            return {
                "total_vectors": self.index.ntotal if self.index else 0,
                "dimension": self.dimension,
                "index_type": type(self.index).__name__ if self.index else None,
                "metadata_entries": len(self.chunks_metadata),
                "index_file_exists": self.index_path.exists(),
                "metadata_file_exists": self.metadata_path.exists()
            }
        except Exception as e:
            logger.error(f"Error getting stats: {str(e)}")
            return {"error": str(e)}

    async def delete_document(self, document_id: str) -> bool:
        """Delete all chunks belonging to a specific document"""
        try:
            # Find the index positions to remove (a set for O(1) membership tests)
            indices_to_remove = {
                int(idx) for idx, metadata in self.chunks_metadata.items()
                if metadata.get("document_id") == document_id
            }

            if not indices_to_remove:
                logger.warning(f"No chunks found for document {document_id}")
                return False

            # FAISS flat indexes don't support removing individual vectors
            # efficiently, so rebuild the index without the removed vectors
            if self.index and self.index.ntotal > 0:
                all_embeddings = []
                new_metadata = {}
                new_index = 0

                for old_idx in range(self.index.ntotal):
                    if old_idx not in indices_to_remove:
                        # Recover the stored embedding from FAISS
                        all_embeddings.append(self.index.reconstruct(old_idx))

                        # Re-key the metadata to the vector's new position
                        old_metadata = self.chunks_metadata.get(str(old_idx))
                        if old_metadata:
                            new_metadata[str(new_index)] = old_metadata
                        new_index += 1

                # Rebuild the index (an empty one if nothing is left)
                self._initialize_index(self.dimension)
                if all_embeddings:
                    self.index.add(np.array(all_embeddings, dtype=np.float32))
                self.chunks_metadata = new_metadata

            # Save updated index
            await self._save_index()

            logger.info(f"Deleted {len(indices_to_remove)} chunks for document {document_id}")
            return True

        except Exception as e:
            logger.error(f"Error deleting document chunks: {str(e)}")
            return False

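    # Rebuilding via reconstruct() is O(n) per deletion and relies on an index
    # type that supports reconstruction (IndexFlatIP does). A sketch of an
    # alternative design, not used here, that keeps caller-assigned IDs and
    # supports direct removal:
    #
    #     import faiss
    #     import numpy as np
    #
    #     index = faiss.IndexIDMap(faiss.IndexFlatIP(8))
    #     vecs = np.random.rand(3, 8).astype(np.float32)
    #     faiss.normalize_L2(vecs)
    #     index.add_with_ids(vecs, np.array([10, 11, 12], dtype=np.int64))
    #     index.remove_ids(np.array([11], dtype=np.int64))  # drops one vector
    #
    # The trade-off: remove_ids on a flat index is still a linear scan, but it
    # avoids reconstructing and re-adding every remaining vector in Python.
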
    async def clear_all(self) -> bool:
        """Clear all data from the vector store"""
        try:
            self.index = None
            self.chunks_metadata = {}
            self.dimension = None

            # Remove persisted files
            if self.index_path.exists():
                self.index_path.unlink()
            if self.metadata_path.exists():
                self.metadata_path.unlink()

            logger.info("Cleared all data from vector store")
            return True
        except Exception as e:
            logger.error(f"Error clearing vector store: {str(e)}")
            return False
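
Taken together, a minimal end-to-end round trip through this service might look like the sketch below. It is illustrative only: it assumes Chunk accepts the fields used in add_chunks as keyword arguments, and it uses a toy 2-dimensional embedding that is already unit-length.

import asyncio
from core.models import Chunk
from services.vector_store_service import VectorStoreService

async def main():
    store = VectorStoreService()

    # Ingest one chunk with a pre-normalized toy embedding
    chunk = Chunk(id="c1", document_id="doc1", content="hello world",
                  chunk_index=0, start_pos=0, end_pos=11,
                  metadata={}, embedding=[0.6, 0.8])
    await store.add_chunks([chunk])

    # Query with the same vector; the top hit scores ~1.0
    hits = await store.search([0.6, 0.8], top_k=1)
    print(hits[0].score, hits[0].content)
    print(await store.get_stats())

asyncio.run(main())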