Nihal2000 committed on
Commit
86aa5e4
·
1 Parent(s): 2bc9ae2

Initial deployment of AI Digital Library Assistant

Browse files
Files changed (48) hide show
  1. .dockerignore +67 -0
  2. Dockerfile +31 -0
  3. README.md +30 -8
  4. app.py +1374 -0
  5. config.py +56 -0
  6. core/__init__.py +1 -0
  7. core/__pycache__/__init__.cpython-313.pyc +0 -0
  8. core/__pycache__/chunker.cpython-313.pyc +0 -0
  9. core/__pycache__/document_parser.cpython-313.pyc +0 -0
  10. core/__pycache__/models.cpython-313.pyc +0 -0
  11. core/__pycache__/text_preprocessor.cpython-313.pyc +0 -0
  12. core/chunker.py +303 -0
  13. core/document_parser.py +199 -0
  14. core/models.py +102 -0
  15. core/text_preprocessor.py +186 -0
  16. mcp_server.py +290 -0
  17. mcp_tools/__init__.py +1 -0
  18. mcp_tools/__pycache__/__init__.cpython-313.pyc +0 -0
  19. mcp_tools/__pycache__/generative_tool.cpython-313.pyc +0 -0
  20. mcp_tools/__pycache__/ingestion_tool.cpython-313.pyc +0 -0
  21. mcp_tools/__pycache__/podcast_tool.cpython-313.pyc +0 -0
  22. mcp_tools/__pycache__/search_tool.cpython-313.pyc +0 -0
  23. mcp_tools/__pycache__/voice_tool.cpython-313.pyc +0 -0
  24. mcp_tools/generative_tool.py +407 -0
  25. mcp_tools/ingestion_tool.py +368 -0
  26. mcp_tools/podcast_tool.py +138 -0
  27. mcp_tools/search_tool.py +437 -0
  28. mcp_tools/utils.py +373 -0
  29. mcp_tools/voice_tool.py +63 -0
  30. requirements.txt +31 -0
  31. services/__init__.py +1 -0
  32. services/__pycache__/__init__.cpython-313.pyc +0 -0
  33. services/__pycache__/document_store_service.cpython-313.pyc +0 -0
  34. services/__pycache__/elevenlabs_service.cpython-313.pyc +0 -0
  35. services/__pycache__/embedding_service.cpython-313.pyc +0 -0
  36. services/__pycache__/llamaindex_service.cpython-313.pyc +0 -0
  37. services/__pycache__/llm_service.cpython-313.pyc +0 -0
  38. services/__pycache__/ocr_service.cpython-313.pyc +0 -0
  39. services/__pycache__/podcast_generator_service.cpython-313.pyc +0 -0
  40. services/__pycache__/vector_store_service.cpython-313.pyc +0 -0
  41. services/document_store_service.py +349 -0
  42. services/elevenlabs_service.py +341 -0
  43. services/embedding_service.py +243 -0
  44. services/llamaindex_service.py +199 -0
  45. services/llm_service.py +420 -0
  46. services/ocr_service.py +288 -0
  47. services/podcast_generator_service.py +663 -0
  48. services/vector_store_service.py +294 -0
.dockerignore ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python cache
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+
8
+ # Virtual environments
9
+ venv/
10
+ env/
11
+ ENV/
12
+ .venv
13
+
14
+ # IDE
15
+ .vscode/
16
+ .idea/
17
+ *.swp
18
+ *.swo
19
+ *~
20
+
21
+ # Git
22
+ .git/
23
+ .gitignore
24
+ .gitattributes
25
+
26
+ # CI/CD
27
+ .github/
28
+ .gitlab-ci.yml
29
+
30
+ # Documentation
31
+ README.md
32
+ docs/
33
+ *.md
34
+ !requirements.txt
35
+
36
+ # Test files
37
+ test_*.py
38
+ *_test.py
39
+ tests/
40
+ .pytest_cache/
41
+
42
+ # Large data files (these should be in volumes)
43
+ data/
44
+ vector_store/
45
+ documents/
46
+ podcasts/
47
+ *.db
48
+ *.sqlite
49
+
50
+ # Logs
51
+ *.log
52
+ logs/
53
+
54
+ # OS files
55
+ .DS_Store
56
+ Thumbs.db
57
+
58
+ # Deployment files (not needed in container)
59
+ deploy_from_env.py
60
+ modal_deploy.py
61
+ blaxel.yaml
62
+ bl.cmd
63
+ test_persistence.py
64
+
65
+ # Environment files
66
+ .env
67
+ .env.*
Dockerfile ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.11-slim
2
+
3
+ WORKDIR /app
4
+
5
+ # Install system dependencies
6
+ RUN apt-get update && apt-get install -y \
7
+ build-essential \
8
+ curl \
9
+ ca-certificates \
10
+ tesseract-ocr \
11
+ && rm -rf /var/lib/apt/lists/*
12
+
13
+ # Copy requirements and install Python dependencies
14
+ COPY requirements.txt .
15
+ RUN pip install --no-cache-dir -r requirements.txt
16
+
17
+ # Copy application code
18
+ COPY . .
19
+
20
+ # Create data directories
21
+ RUN mkdir -p /data/vector_store /data/documents /data/podcasts
22
+
23
+ # Expose port (HuggingFace Spaces uses 7860)
24
+ EXPOSE 7860
25
+
26
+ # Set environment variables
27
+ ENV GRADIO_SERVER_NAME="0.0.0.0"
28
+ ENV GRADIO_SERVER_PORT=7860
29
+
30
+ # Run the MCP server
31
+ CMD ["python", "mcp_server.py"]
README.md CHANGED
@@ -1,12 +1,34 @@
1
  ---
2
- title: AiDigitalLibraryAssistant
3
- emoji: 🏢
4
- colorFrom: green
5
- colorTo: blue
6
- sdk: gradio
7
- sdk_version: 6.0.1
8
- app_file: app.py
9
  pinned: false
 
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: AI Digital Library Assistant
3
+ emoji: 📚
4
+ colorFrom: blue
5
+ colorTo: indigo
6
+ sdk: docker
7
+ app_port: 7860
 
8
  pinned: false
9
+ license: mit
10
  ---
11
 
12
+ # AI Digital Library Assistant
13
+
14
+ An intelligent document organization and retrieval system powered by AI.
15
+
16
+ ## Features
17
+
18
+ - **Document Ingestion**: Upload PDF, DOCX, TXT, and Images
19
+ - **Semantic Search**: Find documents using natural language queries
20
+ - **AI Q&A**: Ask questions about your document library
21
+ - **Voice Assistant**: Talk to your library using ElevenLabs voice AI
22
+ - **Podcast Generation**: Turn documents into engaging audio podcasts
23
+
24
+ ## Setup
25
+
26
+ This Space is configured to run using Docker. It requires several API keys to function fully:
27
+
28
+ - `OPENAI_API_KEY`: For embeddings and LLM
29
+ - `ANTHROPIC_API_KEY`: For Claude 3.5 Sonnet
30
+ - `MISTRAL_API_KEY`: For Mistral models and OCR
31
+ - `ELEVENLABS_API_KEY`: For voice features
32
+ - `ELEVENLABS_AGENT_ID`: For conversational AI agent
33
+
34
+ Please set these in the Space Settings -> Variables and Secrets.
app.py ADDED
@@ -0,0 +1,1374 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ import asyncio
4
+ import json
5
+ import logging
6
+ import tempfile
7
+ import uuid
8
+ from datetime import datetime
9
+ from pathlib import Path
10
+ from typing import List, Dict, Any, Optional
11
+ import nest_asyncio
12
+
13
+ # Apply nest_asyncio to handle nested event loops in Gradio
14
+ nest_asyncio.apply()
15
+
16
+ # Import our custom modules
17
+ from mcp_tools.ingestion_tool import IngestionTool
18
+ from mcp_tools.search_tool import SearchTool
19
+ from mcp_tools.generative_tool import GenerativeTool
20
+ from services.vector_store_service import VectorStoreService
21
+ from services.document_store_service import DocumentStoreService
22
+ from services.embedding_service import EmbeddingService
23
+ from services.llm_service import LLMService
24
+ from services.ocr_service import OCRService
25
+ from core.models import SearchResult, Document
26
+ import config
27
+
28
+ # Setup logging
29
+ logging.basicConfig(level=logging.INFO)
30
+ logger = logging.getLogger(__name__)
31
+ # Import our custom modules
32
+ from mcp_tools.ingestion_tool import IngestionTool
33
+ from mcp_tools.search_tool import SearchTool
34
+ from mcp_tools.generative_tool import GenerativeTool
35
+ from services.vector_store_service import VectorStoreService
36
+ from services.document_store_service import DocumentStoreService
37
+ from services.embedding_service import EmbeddingService
38
+ from services.llm_service import LLMService
39
+ from services.ocr_service import OCRService
40
+ from core.models import SearchResult, Document
41
+ import config
42
+ from services.llamaindex_service import LlamaIndexService
43
+ from services.elevenlabs_service import ElevenLabsService
44
+ from services.podcast_generator_service import PodcastGeneratorService
45
+ from mcp_tools.voice_tool import VoiceTool
46
+ from mcp_tools.podcast_tool import PodcastTool
47
+
48
+ # Setup logging
49
+ logging.basicConfig(level=logging.INFO)
50
+ logger = logging.getLogger(__name__)
51
+
52
class ContentOrganizerMCPServer:
    """Facade that wires the storage/AI services to the MCP tools.

    The constructor builds every service and tool once. The ``*_async``
    coroutines wrap individual tool calls; apart from
    ``get_document_content_async`` (which returns the text or ``None``) they
    return a plain dict carrying a ``"success"`` flag instead of raising, so
    UI callbacks can render failures directly.
    """

    def __init__(self):
        # Initialize services
        logger.info("Initializing Content Organizer MCP Server...")
        self.vector_store = VectorStoreService()
        self.document_store = DocumentStoreService()
        self.embedding_service = EmbeddingService()
        self.llm_service = LLMService()
        self.ocr_service = OCRService()
        self.llamaindex_service = LlamaIndexService(self.document_store)

        # Initialize ElevenLabs voice service
        self.elevenlabs_service = ElevenLabsService(self.llamaindex_service)

        # Initialize Podcast Generator
        self.podcast_generator = PodcastGeneratorService(
            llamaindex_service=self.llamaindex_service,
            llm_service=self.llm_service
        )

        # Initialize tools
        self.ingestion_tool = IngestionTool(
            vector_store=self.vector_store,
            document_store=self.document_store,
            embedding_service=self.embedding_service,
            ocr_service=self.ocr_service
        )
        self.search_tool = SearchTool(
            vector_store=self.vector_store,
            embedding_service=self.embedding_service,
            document_store=self.document_store
        )
        self.generative_tool = GenerativeTool(
            llm_service=self.llm_service,
            search_tool=self.search_tool
        )
        self.voice_tool = VoiceTool(self.elevenlabs_service)
        self.podcast_tool = PodcastTool(self.podcast_generator)

        # Track processing status, keyed by the task id created per ingestion
        self.processing_status = {}

        # Document cache for quick access, keyed by document id
        self.document_cache = {}
        logger.info("Content Organizer MCP Server initialized successfully!")

    def run_async(self, coro):
        """Run *coro* to completion from synchronous code and return its result.

        If the current thread's event loop is idle it is (re)used directly.
        If that loop is already running, the coroutine is executed with
        ``asyncio.run`` on a short-lived worker thread and this call blocks
        until the worker finishes.
        """
        try:
            loop = asyncio.get_event_loop()
        except RuntimeError:
            # No loop is associated with this thread yet: create one and
            # register it so later calls from the same thread reuse it.
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
        if loop.is_running():
            # The loop is busy, so hand the coroutine to a fresh loop on
            # another thread and wait for its result.
            import concurrent.futures
            with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
                return executor.submit(asyncio.run, coro).result()
        return loop.run_until_complete(coro)

    async def ingest_document_async(self, file_path: str, file_type: str) -> Dict[str, Any]:
        """MCP Tool: Ingest and process a document.

        Records the outcome under a fresh task id in ``processing_status``
        and, on success, caches the stored document for later lookups.
        """
        task_id = str(uuid.uuid4())
        self.processing_status[task_id] = {"status": "processing", "progress": 0}
        try:
            result = await self.ingestion_tool.process_document(file_path, file_type, task_id)
            if not result.get("success"):
                self.processing_status[task_id] = {"status": "failed", "error": result.get("error")}
                return result
            self.processing_status[task_id] = {"status": "completed", "progress": 100}
            doc_id = result.get("document_id")
            if doc_id:
                doc = await self.document_store.get_document(doc_id)
                if doc:
                    self.document_cache[doc_id] = doc
            return result
        except Exception as e:
            # Do not leave the task reported as "processing" forever.
            self.processing_status[task_id] = {"status": "failed", "error": str(e)}
            logger.error(f"Document ingestion failed: {str(e)}")
            return {"success": False, "error": str(e), "message": "Failed to process document"}

    async def get_document_content_async(self, document_id: str) -> Optional[str]:
        """Get document content by ID, or ``None`` if it is missing or the lookup fails."""
        try:
            # Check cache first
            if document_id in self.document_cache:
                return self.document_cache[document_id].content

            # Get from store and remember it for next time
            doc = await self.document_store.get_document(document_id)
            if doc:
                self.document_cache[document_id] = doc
                return doc.content
            return None
        except Exception as e:
            logger.error(f"Error getting document content: {str(e)}")
            return None

    async def semantic_search_async(self, query: str, top_k: int = 5, filters: Optional[Dict] = None) -> Dict[str, Any]:
        """MCP Tool: Perform semantic search and return serialised results."""
        try:
            results = await self.search_tool.search(query, top_k, filters)
            return {
                "success": True,
                "query": query,
                "results": [result.to_dict() for result in results],
                "total_results": len(results),
            }
        except Exception as e:
            logger.error(f"Semantic search failed: {str(e)}")
            return {"success": False, "error": str(e), "query": query, "results": []}

    async def summarize_content_async(self, content: Optional[str] = None, document_id: Optional[str] = None, style: str = "concise") -> Dict[str, Any]:
        """MCP Tool: Summarize raw text or a stored document.

        When ``document_id`` is given (and is not ``"none"``) the stored
        document replaces ``content``. Input longer than 4000 characters is
        truncated before being sent to the generative tool;
        ``original_length`` reports the length before that truncation.
        """
        try:
            if document_id and document_id != "none":
                content = await self.get_document_content_async(document_id)
                if not content:
                    return {"success": False, "error": f"Document {document_id} not found"}
            if not content or not content.strip():
                return {"success": False, "error": "No content provided for summarization"}
            # Measure before truncating so the reported length is the real one.
            original_length = len(content)
            max_content_length = 4000
            if original_length > max_content_length:
                content = content[:max_content_length] + "..."
            summary = await self.generative_tool.summarize(content, style)
            return {
                "success": True,
                "summary": summary,
                "original_length": original_length,
                "summary_length": len(summary),
                "style": style,
                "document_id": document_id,
            }
        except Exception as e:
            logger.error(f"Summarization failed: {str(e)}")
            return {"success": False, "error": str(e)}

    async def generate_tags_async(self, content: Optional[str] = None, document_id: Optional[str] = None, max_tags: int = 5) -> Dict[str, Any]:
        """MCP Tool: Generate tags for content.

        When a document id is supplied and tags were produced, the tags are
        also written to that document's metadata.
        """
        try:
            has_document = bool(document_id) and document_id != "none"
            if has_document:
                content = await self.get_document_content_async(document_id)
                if not content:
                    return {"success": False, "error": f"Document {document_id} not found"}
            if not content or not content.strip():
                return {"success": False, "error": "No content provided for tag generation"}
            tags = await self.generative_tool.generate_tags(content, max_tags)
            if has_document and tags:
                await self.document_store.update_document_metadata(document_id, {"tags": tags})
            return {"success": True, "tags": tags, "content_length": len(content), "document_id": document_id}
        except Exception as e:
            logger.error(f"Tag generation failed: {str(e)}")
            return {"success": False, "error": str(e)}

    async def generate_podcast_async(
        self,
        document_ids: List[str],
        style: str = "conversational",
        duration_minutes: int = 10,
        host1_voice: str = "Rachel",
        host2_voice: str = "Adam"
    ) -> Dict[str, Any]:
        """Generate podcast from documents by delegating to the podcast tool."""
        try:
            return await self.podcast_tool.generate_podcast(
                document_ids=document_ids,
                style=style,
                duration_minutes=duration_minutes,
                host1_voice=host1_voice,
                host2_voice=host2_voice
            )
        except Exception as e:
            logger.error(f"Podcast generation failed: {str(e)}")
            return {"success": False, "error": str(e)}

    async def answer_question_async(self, question: str, context_filter: Optional[Dict] = None) -> Dict[str, Any]:
        """Answer *question* from the top five search hits.

        ``confidence`` is ``"high"`` when at least three sources were found,
        otherwise ``"medium"``.
        """
        try:
            search_results = await self.search_tool.search(question, top_k=5, filters=context_filter)
            if not search_results:
                return {"success": False, "error": "No relevant context found in your documents. Please make sure you have uploaded relevant documents.", "question": question}
            answer = await self.generative_tool.answer_question(question, search_results)
            return {
                "success": True,
                "question": question,
                "answer": answer,
                "sources": [result.to_dict() for result in search_results],
                "confidence": "high" if len(search_results) >= 3 else "medium",
            }
        except Exception as e:
            logger.error(f"Question answering failed: {str(e)}")
            return {"success": False, "error": str(e), "question": question}

    async def generate_outline_async(self, topic: str, num_sections: int = 5, detail_level: str = "medium") -> Dict[str, Any]:
        """Generate an outline for *topic* via the generative tool."""
        try:
            outline = await self.generative_tool.generate_outline(topic, num_sections, detail_level)
            return {"success": True, "result": outline}
        except Exception as e:
            logger.error(f"Outline generation failed: {str(e)}")
            return {"success": False, "error": str(e)}

    async def explain_concept_async(self, concept: str, audience: str = "general", length: str = "medium") -> Dict[str, Any]:
        """Explain *concept* for the given audience via the generative tool."""
        try:
            explanation = await self.generative_tool.explain_concept(concept, audience, length)
            return {"success": True, "result": explanation}
        except Exception as e:
            logger.error(f"Concept explanation failed: {str(e)}")
            return {"success": False, "error": str(e)}

    async def paraphrase_text_async(self, text: str, style: str = "formal") -> Dict[str, Any]:
        """Paraphrase *text* in the requested style via the generative tool."""
        try:
            paraphrase = await self.generative_tool.paraphrase_text(text, style)
            return {"success": True, "result": paraphrase}
        except Exception as e:
            logger.error(f"Paraphrasing failed: {str(e)}")
            return {"success": False, "error": str(e)}

    async def categorize_content_async(self, content: str, categories: List[str]) -> Dict[str, Any]:
        """Categorize *content* against *categories* via the generative tool."""
        try:
            category = await self.generative_tool.categorize(content, categories)
            return {"success": True, "result": category}
        except Exception as e:
            logger.error(f"Categorization failed: {str(e)}")
            return {"success": False, "error": str(e)}

    async def extract_key_insights_async(self, content: str, num_insights: int = 5) -> Dict[str, Any]:
        """Extract key insights and return them as a dash-bulleted string."""
        try:
            insights = await self.generative_tool.extract_key_insights(content, num_insights)
            return {"success": True, "result": "\n".join(f"- {insight}" for insight in insights)}
        except Exception as e:
            logger.error(f"Key insight extraction failed: {str(e)}")
            return {"success": False, "error": str(e)}

    async def generate_questions_async(self, content: str, question_type: str = "comprehension", num_questions: int = 5) -> Dict[str, Any]:
        """Generate questions about *content* and return them as a numbered string."""
        try:
            questions = await self.generative_tool.generate_questions(content, question_type, num_questions)
            return {"success": True, "result": "\n".join(f"{i}. {q}" for i, q in enumerate(questions, 1))}
        except Exception as e:
            logger.error(f"Question generation failed: {str(e)}")
            return {"success": False, "error": str(e)}

    async def extract_key_information_async(self, content: str) -> Dict[str, Any]:
        """Extract key information via the LLM service and return it as indented JSON text."""
        try:
            info = await self.llm_service.extract_key_information(content)
            return {"success": True, "result": json.dumps(info, indent=2)}
        except Exception as e:
            logger.error(f"Key information extraction failed: {str(e)}")
            return {"success": False, "error": str(e)}

    def list_documents_sync(self, limit: int = 100, offset: int = 0) -> Dict[str, Any]:
        """Synchronously list stored documents.

        ``total`` is the number of documents in the returned page.
        """
        try:
            documents = self.run_async(self.document_store.list_documents(limit, offset))
            return {"success": True, "documents": [doc.to_dict() for doc in documents], "total": len(documents)}
        except Exception as e:
            logger.error(f"Listing documents failed: {str(e)}")
            return {"success": False, "error": str(e)}
283
+
284
+ mcp_server = ContentOrganizerMCPServer()
285
+
286
def get_document_list():
    """Return a human-readable listing of up to 100 library documents.

    Falls back to a hint when the library is empty and to an error string
    when the listing fails.
    """
    try:
        listing = mcp_server.list_documents_sync(limit=100)
        if not listing["success"]:
            return f"Error loading documents: {listing['error']}"
        if not listing["documents"]:
            return "No documents in library yet. Upload some documents to get started!"

        pieces = ["📚 Documents in Library:\n\n"]
        for position, entry in enumerate(listing["documents"], 1):
            pieces.append(f"{position}. {entry['filename']} (ID: {entry['id'][:8]}...)\n")
            pieces.append(f" Type: {entry['doc_type']}, Size: {entry['file_size']} bytes\n")
            # Tags are optional; only print the line when there are some.
            if entry.get('tags'):
                pieces.append(f" Tags: {', '.join(entry['tags'])}\n")
            pieces.append(f" Created: {entry['created_at'][:10]}\n\n")
        return "".join(pieces)
    except Exception as e:
        return f"Error: {str(e)}"
305
+
306
def get_document_choices():
    """Build ``(label, document_id)`` pairs for the document dropdowns.

    Returns an empty list when the library is empty or the listing fails.
    """
    try:
        listing = mcp_server.list_documents_sync(limit=100)
        if not (listing["success"] and listing["documents"]):
            return []

        options = []
        for entry in listing["documents"]:
            # Label shows the filename plus a short id prefix; the value is the full id.
            label = f"{entry['filename']} ({entry['id'][:8]}...)"
            options.append((label, entry['id']))
        logger.info(f"Generated {len(options)} document choices")
        return options
    except Exception as e:
        logger.error(f"Error getting document choices: {str(e)}")
        return []
317
+
318
def refresh_library():
    """Re-read the library and return the listing text plus three dropdown updates."""
    listing_text = get_document_list()
    options = get_document_choices()
    logger.info(f"Refreshing library. Found {len(options)} choices.")
    # One independent update object per dropdown that shows the document list.
    dropdown_updates = tuple(gr.update(choices=options) for _ in range(3))
    return (listing_text, *dropdown_updates)
328
+
329
def upload_and_process_file(file):
    """Ingest an uploaded file and return the refreshed UI state.

    The result is a six-tuple: status message, new document id (empty
    string on failure), library listing text and three dropdown updates.
    """

    def _pack(status_text, new_doc_id, listing_text, options):
        # Assemble the six outputs in the order the UI expects them.
        return (
            status_text,
            new_doc_id,
            listing_text,
            gr.update(choices=options),
            gr.update(choices=options),
            gr.update(choices=options),
        )

    if file is None:
        listing_text = get_document_list()
        options = get_document_choices()
        return _pack("No file uploaded", "", listing_text, options)

    try:
        # Use the object's .name when it has one, otherwise treat it as a path.
        file_path = file.name if hasattr(file, 'name') else str(file)
        file_type = Path(file_path).suffix.lower().strip('.')  # extension without the dot
        logger.info(f"Processing file: {file_path}, type: {file_type}")
        outcome = mcp_server.run_async(mcp_server.ingest_document_async(file_path, file_type))

        listing_text = get_document_list()
        options = get_document_choices()

        if not outcome["success"]:
            return _pack(
                f"❌ Error: {outcome.get('error', 'Unknown error')}", "",
                listing_text, options,
            )
        return _pack(
            f"✅ Success: {outcome['message']}\nDocument ID: {outcome['document_id']}\nChunks created: {outcome['chunks_created']}",
            outcome["document_id"],
            listing_text, options,
        )
    except Exception as e:
        logger.error(f"Error processing file: {str(e)}")
        listing_text = get_document_list()
        options = get_document_choices()
        return _pack(f"❌ Error: {str(e)}", "", listing_text, options)
376
+
377
def perform_search(query, top_k):
    """Run a semantic search and format the hits as display text.

    Args:
        query: Free-text search query; ``None`` or blank input yields a prompt
            to enter a query instead of an error.
        top_k: Maximum number of results (converted with ``int``).

    Returns:
        A formatted result listing, a "no results" hint, or an error message.
    """
    # Guard against None as well as whitespace-only input.
    if not query or not query.strip():
        return "Please enter a search query"
    try:
        result = mcp_server.run_async(mcp_server.semantic_search_async(query, int(top_k)))
        if not result["success"]:
            return f"❌ Search failed: {result['error']}"
        if not result["results"]:
            return f"No results found for: '{query}'\n\nMake sure you have uploaded relevant documents first."

        parts = [f"🔍 Found {result['total_results']} results for: '{query}'\n\n"]
        for i, res_item in enumerate(result["results"], 1):
            parts.append(f"Result {i}:\n")
            parts.append(f"📊 Relevance Score: {res_item['score']:.3f}\n")
            # Only the first 300 characters of each hit are shown.
            parts.append(f"📄 Content: {res_item['content'][:300]}...\n")
            if 'document_filename' in res_item.get('metadata', {}):
                parts.append(f"📁 Source: {res_item['metadata']['document_filename']}\n")
            parts.append(f"🔗 Document ID: {res_item.get('document_id', 'Unknown')}\n")
            parts.append("-" * 80 + "\n\n")
        return "".join(parts)
    except Exception as e:
        logger.error(f"Search error: {str(e)}")
        return f"❌ Error: {str(e)}"
401
+
402
def update_options_visibility(task):
    """Return one visibility update per option widget for the selected task.

    The order matches the option widgets: summary_style, outline_sections,
    outline_detail, explain_audience, explain_length, paraphrase_style,
    categories_input, num_items, question_type.
    """
    is_outline = task == "Generate Outline"
    is_explain = task == "Explain Concept"
    visibility_flags = (
        task == "Summarize",                             # summary_style
        is_outline,                                      # outline_sections
        is_outline,                                      # outline_detail
        is_explain,                                      # explain_audience
        is_explain,                                      # explain_length
        task == "Paraphrase",                            # paraphrase_style
        task == "Categorize",                            # categories_input
        task in ["Key Insights", "Generate Questions"],  # num_items
        task == "Generate Questions",                    # question_type
    )
    return tuple(gr.update(visible=flag) for flag in visibility_flags)
415
+
416
def execute_content_task(task, doc_choice, custom_text,
                         summary_style, outline_sections, outline_detail,
                         explain_audience, explain_length,
                         paraphrase_style, categories_input,
                         num_items, question_type):
    """Run the selected content-studio task and return display text.

    Input text is taken from ``custom_text`` when it is non-blank, otherwise
    from the selected document. For "Generate Outline" and "Explain Concept"
    that text is used as the topic/concept. When neither source is
    available a warning string is returned and no task is executed.

    Returns:
        The formatted task output, a "⚠️" warning, or a "❌ Error: ..." message.
    """
    try:
        # Resolve the input text
        if custom_text and custom_text.strip():
            content = custom_text
        elif doc_choice and doc_choice != "none":
            content = mcp_server.run_async(mcp_server.get_document_content_async(doc_choice))
            if not content:
                return "❌ Error: Document not found or empty"
        elif task == "Generate Outline":
            # The topic comes from the text box; without it there is nothing
            # to outline, so ask for it instead of sending an empty topic.
            return "⚠️ Please enter a topic for the outline"
        else:
            return "⚠️ Please select a document or enter text"

        # Execute the task; each branch sets the result plus how to present it
        payload_key = "result"
        if task == "Summarize":
            result = mcp_server.run_async(mcp_server.summarize_content_async(content=content, style=summary_style))
            heading = f"📝 Summary ({summary_style}):"
            payload_key = "summary"
        elif task == "Generate Outline":
            # For outline, content is the topic
            result = mcp_server.run_async(mcp_server.generate_outline_async(content, int(outline_sections), outline_detail))
            heading = f"📝 Outline for '{content}':"
        elif task == "Explain Concept":
            # For explain, content is the concept
            result = mcp_server.run_async(mcp_server.explain_concept_async(content, explain_audience, explain_length))
            heading = f"💡 Explanation ({explain_audience}):"
        elif task == "Paraphrase":
            result = mcp_server.run_async(mcp_server.paraphrase_text_async(content, paraphrase_style))
            heading = f"🔄 Paraphrased Text ({paraphrase_style}):"
        elif task == "Categorize":
            # Drop blank entries such as the one produced by a trailing comma.
            categories = [c.strip() for c in categories_input.split(',') if c.strip()] if categories_input else []
            result = mcp_server.run_async(mcp_server.categorize_content_async(content, categories))
            heading = "🏷️ Category:"
        elif task == "Key Insights":
            result = mcp_server.run_async(mcp_server.extract_key_insights_async(content, int(num_items)))
            heading = "🔍 Key Insights:"
        elif task == "Generate Questions":
            result = mcp_server.run_async(mcp_server.generate_questions_async(content, question_type, int(num_items)))
            heading = f"❓ Generated Questions ({question_type}):"
        elif task == "Extract Key Info":
            result = mcp_server.run_async(mcp_server.extract_key_information_async(content))
            heading = "📊 Key Information:"
        else:
            return "❌ Error: Unknown task"

        if not result["success"]:
            return f"❌ Error: {result.get('error', 'Unknown error')}"
        return f"{heading}\n\n{result[payload_key]}"

    except Exception as e:
        logger.error(f"Task execution error: {str(e)}")
        return f"❌ Error: {str(e)}"
490
+
491
def generate_tags_for_document(doc_choice, custom_text, max_tags):
    """Generate tags for custom text or a selected document and format them.

    Non-blank ``custom_text`` takes precedence over ``doc_choice``. The
    "saved" note is only shown when tags were generated for a document,
    because the server writes tags to a document only when the tag list is
    non-empty.
    """
    try:
        logger.info(f"Generate tags called with doc_choice: {doc_choice}, type: {type(doc_choice)}")
        # Falsy values (None, "") and the "none" placeholder mean "no document".
        document_id = doc_choice if doc_choice and doc_choice != "none" else None

        if custom_text and custom_text.strip():
            logger.info("Using custom text for tag generation")
            result = mcp_server.run_async(mcp_server.generate_tags_async(content=custom_text, max_tags=int(max_tags)))
        elif document_id:
            logger.info(f"Generating tags for document: {document_id}")
            result = mcp_server.run_async(mcp_server.generate_tags_async(document_id=document_id, max_tags=int(max_tags)))
        else:
            return "Please select a document from the dropdown or enter text to generate tags"

        if not result["success"]:
            return f"❌ Tag generation failed: {result['error']}"

        tags = result["tags"]
        parts = [
            f"🏷️ Generated Tags:\n\n{', '.join(tags)}\n\n",
            "📊 Statistics:\n",
            f"- Content length: {result['content_length']} characters\n",
            f"- Number of tags: {len(tags)}\n",
        ]
        if result.get('document_id'):
            parts.append(f"- Document ID: {result['document_id']}\n")
            # Nothing is persisted for an empty tag list, so do not claim it was.
            if tags:
                parts.append("\n✅ Tags have been saved to the document.")
        return "".join(parts)
    except Exception as e:
        logger.error(f"Tag generation error: {str(e)}")
        return f"❌ Error: {str(e)}"
520
+
521
def ask_question(question):
    """Answer a question from the document library and format the reply.

    Args:
        question: The user's question; ``None`` or blank input yields a prompt
            to enter a question instead of an error.

    Returns:
        The answer with confidence and source excerpts, or an error message.
    """
    # Guard against None as well as whitespace-only input.
    if not question or not question.strip():
        return "Please enter a question"
    try:
        result = mcp_server.run_async(mcp_server.answer_question_async(question))
        if not result["success"]:
            return f"❌ {result.get('error', 'Failed to answer question')}"

        parts = [
            f"❓ Question: {result['question']}\n\n",
            f"💡 Answer:\n{result['answer']}\n\n",
            f"🎯 Confidence: {result['confidence']}\n\n",
            f"📚 Sources Used ({len(result['sources'])}):\n",
        ]
        for i, source_item in enumerate(result['sources'], 1):
            filename = source_item.get('metadata', {}).get('document_filename', 'Unknown')
            parts.append(f"\n{i}. 📄 {filename}\n")
            # Only the first 150 characters of each source are shown.
            parts.append(f" 📝 Excerpt: {source_item['content'][:150]}...\n")
            parts.append(f" 📊 Relevance: {source_item['score']:.3f}\n")
        return "".join(parts)
    except Exception as e:
        logger.error(f"Question answering error: {str(e)}")
        return f"❌ Error: {str(e)}"
541
+
542
def delete_document_from_library(document_id):
    """Delete a document from the document store and the vector store.

    Args:
        document_id: ID of the document to delete; a falsy value means
            nothing was selected.

    Returns:
        A 5-tuple: status message, refreshed document list text, and three
        gr.update objects carrying the refreshed dropdown choices.
    """
    def _response(status):
        # Re-read the library so the list and every dropdown show current state.
        listing = get_document_list()
        choices = get_document_choices()
        return (
            status,
            listing,
            gr.update(choices=choices),
            gr.update(choices=choices),
            gr.update(choices=choices)
        )

    if not document_id:
        return _response("No document selected to delete.")

    try:
        doc_removed = mcp_server.run_async(mcp_server.document_store.delete_document(document_id))
        vectors_removed = mcp_server.run_async(mcp_server.vector_store.delete_document(document_id))

        short_id = document_id[:8]
        doc_part = (
            f"🗑️ Document {short_id}... deleted from document store. "
            if doc_removed
            else f"❌ Failed to delete document {short_id}... from document store. "
        )
        vector_part = (
            "Embeddings deleted from vector store."
            if vectors_removed
            else "Failed to delete embeddings from vector store (or no embeddings existed)."
        )
        return _response(doc_part + vector_part)
    except Exception as e:
        logger.error(f"Error deleting document: {str(e)}")
        return _response(f"❌ Error deleting document: {str(e)}")
589
+
590
# Voice conversation state - global scope
# A single module-level dict shared by all the voice handlers below:
#   "session_id": str(uuid4) set by start_voice_conversation, None when idle
#   "active":     True between a successful start and a stop
#   "transcript": list of {"role": ..., "content": ...} dicts appended by
#                 send_voice_message
# NOTE(review): being module-level, this looks shared across every browser
# session of the app - confirm that is intended.
voice_conversation_state = {
    "session_id": None,
    "active": False,
    "transcript": []
}
596
+
597
def start_voice_conversation():
    """Start a new voice conversation session.

    Returns:
        A 4-tuple matching the outputs wired to the Start button: status
        text, an update for the Start button, an update for the Stop button,
        and the chatbot history (always a list).
    """
    def _not_started(status):
        # No session is running: Start stays clickable, Stop stays disabled,
        # and the chatbot receives an empty message list rather than a string.
        return (
            status,
            gr.update(interactive=True),
            gr.update(interactive=False),
            []
        )

    try:
        if not mcp_server.elevenlabs_service.is_available():
            return _not_started(
                "⚠️ Voice assistant not configured. Please set ELEVENLABS_API_KEY and ELEVENLABS_AGENT_ID in .env"
            )

        session_id = str(uuid.uuid4())
        result = mcp_server.run_async(mcp_server.elevenlabs_service.start_conversation(session_id))

        if not result.get("success"):
            return _not_started(f"❌ Failed to start conversation: {result.get('error')}")

        # Record the live session only after the service confirmed it.
        voice_conversation_state["session_id"] = session_id
        voice_conversation_state["active"] = True
        voice_conversation_state["transcript"] = []

        return (
            "🎙️ Voice assistant is ready. Type your question below.",
            gr.update(interactive=False),
            gr.update(interactive=True),
            []
        )
    except Exception as e:
        logger.error(f"Error starting voice conversation: {str(e)}")
        return _not_started(f"❌ Error: {str(e)}")
637
+
638
+
639
def stop_voice_conversation():
    """End the active voice conversation, if there is one.

    Returns:
        A 4-tuple: status text, an update re-enabling the Start button, an
        update disabling the Stop button, and the stored transcript.
    """
    def _stopped(status):
        # Every outcome leaves the UI in the same "not running" button state.
        return (
            status,
            gr.update(interactive=True),
            gr.update(interactive=False),
            voice_conversation_state["transcript"]
        )

    try:
        if not voice_conversation_state["active"]:
            return _stopped("No active conversation")

        current_session = voice_conversation_state["session_id"]
        if current_session:
            mcp_server.run_async(mcp_server.elevenlabs_service.end_conversation(current_session))

        voice_conversation_state["active"] = False
        voice_conversation_state["session_id"] = None

        return _stopped("✅ Conversation ended")
    except Exception as e:
        logger.error(f"Error stopping conversation: {str(e)}")
        return _stopped(f"❌ Error: {str(e)}")
671
+
672
+
673
def send_voice_message(message):
    """Send a text message in the voice conversation (markdown transcript variant).

    Args:
        message: The text typed by the user.

    Returns:
        A 3-tuple: status text, the value for the input box (cleared on
        success, kept on failure), and the formatted transcript.
    """
    def _rendered():
        # Always render from the shared state so the view reflects the latest list.
        return format_transcript(voice_conversation_state["transcript"])

    try:
        if not voice_conversation_state["active"]:
            return ("Please start a conversation first", "", _rendered())

        if not (message and message.strip()):
            return ("Please enter a message", message, _rendered())

        session_id = voice_conversation_state["session_id"]
        # The user turn is recorded before the service call.
        voice_conversation_state["transcript"].append({"role": "user", "content": message})

        result = mcp_server.run_async(mcp_server.voice_tool.voice_qa(message, session_id))

        if not result.get("success"):
            return (f"❌ Error: {result.get('error')}", message, _rendered())

        answer = result.get("answer", "No response")
        voice_conversation_state["transcript"].append({"role": "assistant", "content": answer})
        return ("✅ Response received", "", _rendered())
    except Exception as e:
        logger.error(f"Error sending message: {str(e)}")
        return (f"❌ Error: {str(e)}", message, _rendered())
696
+
697
def format_transcript(transcript):
    """Render a conversation transcript as markdown for display.

    Args:
        transcript: Sequence of dicts with "role" and "content" keys.

    Returns:
        A placeholder sentence when the transcript is empty; otherwise one
        markdown paragraph per message, each followed by a horizontal rule.
    """
    if not transcript:
        return "No conversation yet. Start talking to the AI librarian!"

    rendered = []
    for entry in transcript:
        # Anything that is not a user turn is shown as the AI librarian.
        speaker = "👤 **You:**" if entry["role"] == "user" else "🤖 **AI Librarian:**"
        rendered.append(f"{speaker} {entry['content']}\n\n---\n\n")
    return "".join(rendered)
712
+
713
def clear_voice_transcript():
    """Reset the stored transcript to a fresh empty list.

    Returns:
        An empty string, used to blank the transcript display.
    """
    voice_conversation_state.update(transcript=[])
    return ""
717
+
718
def send_voice_message_v6(message, chat_history):
    """Send a message in the voice conversation using role/content chat messages.

    Args:
        message: The text typed by the user.
        chat_history: Current chatbot value, a list of
            {"role": ..., "content": ...} dicts. None is treated as an
            empty history.

    Returns:
        A 2-tuple: the updated history and the value for the input box
        (cleared unless the message was blank).
    """
    # Work on a copy. A None history would otherwise make .append() raise
    # inside the try block and then again in the except handler, and the
    # caller's list is no longer mutated in place.
    history = list(chat_history or [])
    try:
        if not voice_conversation_state["active"]:
            return history, ""

        if not message or not message.strip():
            return history, message

        session_id = voice_conversation_state["session_id"]

        # Add user message in Gradio 6 format
        history.append({"role": "user", "content": message})

        # Get AI response
        result = mcp_server.run_async(mcp_server.voice_tool.voice_qa(message, session_id))

        if result.get("success"):
            reply = result.get("answer", "No response")
        else:
            reply = f"❌ Error: {result.get('error')}"
        history.append({"role": "assistant", "content": reply})

        return history, ""
    except Exception as e:
        logger.error(f"Error in voice message: {str(e)}")
        # Surface the failure in the chat itself so the user sees it.
        history.append({
            "role": "assistant",
            "content": f"❌ Error: {str(e)}"
        })
        return history, ""
752
+
753
def generate_podcast_ui(doc_ids, style, duration, voice1, voice2):
    """UI wrapper around podcast generation.

    Args:
        doc_ids: Selected document IDs; at least one is required.
        style: Podcast style name.
        duration: Target length in minutes; coerced with int().
        voice1: Voice name for host 1.
        voice2: Voice name for host 2.

    Returns:
        A 4-tuple: status text, audio file path (or None), transcript
        markdown or a short failure note, and the podcast ID (or "").
    """
    try:
        if not doc_ids:
            return ("⚠️ Please select at least one document", None, "No documents selected", "")

        logger.info(f"Generating podcast: {len(doc_ids)} docs, {style}, {duration}min")

        request = mcp_server.generate_podcast_async(
            document_ids=doc_ids,
            style=style,
            duration_minutes=int(duration),
            host1_voice=voice1,
            host2_voice=voice2
        )
        result = mcp_server.run_async(request)

        if not result.get("success"):
            failure = result.get("error", "Unknown error")
            return (f"❌ Error: {failure}", None, "Generation failed", "")

        audio_path = result.get("audio_file")
        transcript_text = result.get("transcript", "Transcript not available")
        status = result.get("message", "Podcast generated!")
        return (
            f"✅ {status}",
            audio_path,
            f"## Podcast Transcript\n\n{transcript_text}",
            result.get("podcast_id", "")
        )
    except Exception as e:
        logger.error(f"Podcast UI error: {str(e)}")
        return (f"❌ Error: {str(e)}", None, "An error occurred", "")
789
+
790
def load_dashboard_stats():
    """Collect the numbers and status labels shown on the dashboard tab.

    Returns:
        A 7-tuple: document count, total chunk count, storage in MB, rows
        for the recent-documents table, and the vector store, LLM and voice
        service status labels. On any error a tuple of zero/empty values and
        error labels is returned instead.
    """
    try:
        # Get document list
        listing = mcp_server.list_documents_sync(limit=1000)
        doc_count = 0
        total_chunks = 0
        storage_mb = 0.0
        recent_data = []

        if listing.get("success"):
            documents = listing.get("documents", [])
            doc_count = len(documents)
            total_chunks = sum(doc.get("metadata", {}).get("chunk_count", 0) for doc in documents)
            total_size = sum(doc.get("file_size", 0) for doc in documents)
            if total_size > 0:
                storage_mb = round(total_size / (1024 * 1024), 2)

            # First five entries of the listing, one table row each.
            for doc in documents[:5]:
                created = doc.get("created_at")
                recent_data.append([
                    doc.get("filename", "Unknown"),
                    doc.get("doc_type", "unknown"),
                    # Keep only the first 10 characters of the timestamp.
                    created[:10] if created else "N/A",
                    f"{doc.get('file_size', 0)} bytes",
                ])

        # Service status indicators
        if getattr(mcp_server, "vector_store", None):
            vector_stat = "✅ Online"
        else:
            vector_stat = "❌ Offline"

        if getattr(mcp_server, "llm_service", None):
            llm_stat = "✅ Ready"
        else:
            llm_stat = "❌ Offline"

        voice_service = getattr(mcp_server, "elevenlabs_service", None)
        if voice_service and mcp_server.elevenlabs_service.is_available():
            voice_stat = "✅ Ready"
        else:
            voice_stat = "⚠️ Configure API Key"

        return (
            doc_count,
            total_chunks,
            storage_mb,
            recent_data,
            vector_stat,
            llm_stat,
            voice_stat,
        )
    except Exception as e:
        logger.error(f"Error loading dashboard stats: {str(e)}")
        return (0, 0, 0.0, [], "❌ Error", "❌ Error", "❌ Error")
838
+
839
def create_gradio_interface():
    """Build the Gradio Blocks UI and wire every event handler.

    The interface has one tab per feature (dashboard, library, upload,
    search, content studio, tags, voice assistant, podcast studio, Q&A).
    Handlers that refresh the document list and dropdowns are wired after
    all tabs exist, because they reference components from several tabs.

    Returns:
        The configured gr.Blocks instance; it is not launched here.
    """
    # Create custom theme with modern aesthetics
    custom_theme = gr.themes.Soft(
        primary_hue=gr.themes.colors.indigo,
        secondary_hue=gr.themes.colors.blue,
        neutral_hue=gr.themes.colors.slate,
        font=[gr.themes.GoogleFont("Inter"), "system-ui", "sans-serif"],
        font_mono=[gr.themes.GoogleFont("Fira Code"), "monospace"],
    ).set(
        button_primary_background_fill="*primary_500",
        button_primary_background_fill_hover="*primary_600",
        block_title_text_weight="600",
        block_label_text_size="sm",
        block_label_text_weight="500",
    )

    with gr.Blocks(title="🧠 AI Digital Library Assistant", theme=custom_theme) as interface:
        with gr.Tabs():
            # Dashboard Tab - New Landing Page
            with gr.Tab("🏠 Dashboard"):
                gr.Markdown("# Welcome to Your AI Library Assistant")
                gr.Markdown("*Your intelligent document management and analysis platform powered by AI*")

                # Quick Stats Section
                gr.Markdown("## 📊 Quick Stats")
                with gr.Row():
                    total_docs = gr.Number(
                        label="📚 Total Documents",
                        value=0,
                        interactive=False,
                        container=True
                    )
                    total_chunks = gr.Number(
                        label="🧩 Vector Chunks",
                        value=0,
                        interactive=False,
                        container=True
                    )
                    storage_size = gr.Number(
                        label="💾 Storage (MB)",
                        value=0,
                        interactive=False,
                        container=True
                    )

                # Recent Activity Section
                gr.Markdown("## 📊 Recent Activity")
                with gr.Group():
                    recent_docs = gr.Dataframe(
                        headers=["Document", "Type", "Date", "Size"],
                        datatype=["str", "str", "str", "str"],
                        row_count=(5, "fixed"),
                        col_count=(4, "fixed"),
                        interactive=False,
                        label="Recently Added Documents"
                    )

                # System Status Section
                # The heading previously contained a mis-encoded character
                # that rendered as the Unicode replacement glyph.
                gr.Markdown("## ⚙️ System Status")
                with gr.Row():
                    vector_status = gr.Textbox(
                        label="Vector Store",
                        value="✅ Online",
                        interactive=False,
                        container=True
                    )
                    llm_status = gr.Textbox(
                        label="LLM Service",
                        value="✅ Ready",
                        interactive=False,
                        container=True
                    )
                    voice_status = gr.Textbox(
                        label="Voice Service",
                        value="⚠️ Configure API Key",
                        interactive=False,
                        container=True
                    )

            with gr.Tab("📚 Document Library"):
                with gr.Row():
                    with gr.Column():
                        gr.Markdown("### Your Document Collection")
                        document_list_display = gr.Textbox(label="Documents in Library", value=get_document_list(), lines=20, interactive=False)
                        refresh_btn_library = gr.Button("🔄 Refresh Library", variant="secondary")
                        delete_doc_dropdown_visible = gr.Dropdown(label="Select Document to Delete", choices=get_document_choices(), value=None, interactive=True, allow_custom_value=False)
                        delete_btn = gr.Button("🗑️ Delete Selected Document", variant="stop")
                        delete_output_display = gr.Textbox(label="Delete Status", visible=True)

            with gr.Tab("📄 Upload Documents"):
                gr.Markdown("""
                ### 📥 Add Documents to Library
                Upload PDFs, Word documents, text files, or images. OCR will extract text from images automatically.
                """)

                with gr.Row():
                    with gr.Column():
                        with gr.Group():
                            gr.Markdown("**Supported formats:** PDF, DOCX, TXT, Images (JPG, PNG)")
                            file_input_upload = gr.File(
                                label="Select File",
                                file_types=[".pdf", ".txt", ".docx", ".png", ".jpg", ".jpeg"],
                                type="filepath",
                                file_count="single"
                            )

                        upload_btn_process = gr.Button("🚀 Upload & Process", variant="primary", size="lg")

                with gr.Group():
                    upload_output_display = gr.Textbox(
                        label="Status",
                        lines=6,
                        interactive=False,
                        show_copy_button=False
                    )

                    # Hidden field that receives the new document's ID.
                    doc_id_output_display = gr.Textbox(
                        label="Document ID",
                        interactive=False,
                        visible=False
                    )

            with gr.Tab("🔍 Search Documents"):
                gr.Markdown("""
                ### 🔎 Semantic Search
                Find relevant content across your entire document library using AI-powered semantic search.
                """)

                with gr.Row():
                    with gr.Column(scale=1):
                        with gr.Group():
                            search_query_input = gr.Textbox(
                                label="Search Query",
                                placeholder="What are you looking for?",
                                lines=2,
                                info="Use natural language to describe what you need"
                            )

                            with gr.Accordion("🎛️ Search Options", open=False):
                                search_top_k_slider = gr.Slider(
                                    label="Number of Results",
                                    minimum=1, maximum=20, value=5, step=1,
                                    info="More results = broader search"
                                )

                        search_btn_action = gr.Button("🔍 Search", variant="primary", size="lg")

                    with gr.Column(scale=2):
                        with gr.Group():
                            search_output_display = gr.Textbox(
                                label="Results",
                                lines=20,
                                placeholder="Search results will appear here...",
                                show_copy_button=True
                            )

            with gr.Tab("📝 Content Studio"):
                gr.Markdown("""
                ### 🎨 Create & Analyze Content
                Transform documents with AI-powered tools: summarize, outline, explain, and more.
                """)

                with gr.Row():
                    with gr.Column(scale=2):
                        # Source Selection with Group
                        with gr.Group():
                            gr.Markdown("#### 📄 Content Source")
                            doc_dropdown_content = gr.Dropdown(
                                label="Select Document",
                                choices=get_document_choices(),
                                value=None,
                                interactive=True,
                                info="Choose a document from your library"
                            )

                            gr.Markdown("**OR**")

                            content_text_input = gr.Textbox(
                                label="Enter Text or Topic",
                                placeholder="Paste content or enter a topic...",
                                lines=4,
                                info="For outlines, enter a topic. For other tasks, paste text to analyze."
                            )

                        # Task Configuration with Group
                        with gr.Group():
                            gr.Markdown("#### 🛠️ Task Configuration")
                            task_dropdown = gr.Dropdown(
                                label="Select Task",
                                choices=[
                                    "Summarize", "Generate Outline", "Explain Concept",
                                    "Paraphrase", "Categorize", "Key Insights",
                                    "Generate Questions", "Extract Key Info"
                                ],
                                value="Summarize",
                                interactive=True,
                                info="Choose the type of analysis to perform"
                            )

                            # Dynamic Options with Accordion
                            # Only the options relevant to the selected task
                            # are visible; see the task_dropdown.change wiring.
                            with gr.Accordion("⚙️ Advanced Options", open=False):
                                summary_style_opt = gr.Dropdown(
                                    label="Summary Style",
                                    choices=["concise", "detailed", "bullet_points", "executive"],
                                    value="concise",
                                    visible=True,
                                    info="How detailed should the summary be?"
                                )

                                outline_sections_opt = gr.Slider(
                                    label="Number of Sections",
                                    minimum=3, maximum=10, value=5, step=1,
                                    visible=False,
                                    info="How many main sections?"
                                )
                                outline_detail_opt = gr.Dropdown(
                                    label="Detail Level",
                                    choices=["brief", "medium", "detailed"],
                                    value="medium",
                                    visible=False
                                )

                                explain_audience_opt = gr.Dropdown(
                                    label="Target Audience",
                                    choices=["general", "technical", "beginner", "expert"],
                                    value="general",
                                    visible=False,
                                    info="Who is this explanation for?"
                                )
                                explain_length_opt = gr.Dropdown(
                                    label="Length",
                                    choices=["brief", "medium", "detailed"],
                                    value="medium",
                                    visible=False
                                )

                                paraphrase_style_opt = gr.Dropdown(
                                    label="Style",
                                    choices=["formal", "casual", "academic", "simple", "technical"],
                                    value="formal",
                                    visible=False,
                                    info="Writing style for paraphrasing"
                                )

                                categories_input_opt = gr.Textbox(
                                    label="Categories (comma separated)",
                                    placeholder="Technology, Business, Science...",
                                    visible=False
                                )

                                num_items_opt = gr.Slider(
                                    label="Number of Items",
                                    minimum=1, maximum=10, value=5, step=1,
                                    visible=False
                                )
                                question_type_opt = gr.Dropdown(
                                    label="Question Type",
                                    choices=["comprehension", "analysis", "application", "creative", "factual"],
                                    value="comprehension",
                                    visible=False
                                )

                        run_task_btn = gr.Button("🚀 Run Task", variant="primary", size="lg")

                    with gr.Column(scale=3):
                        # Results with copy button and Group
                        with gr.Group():
                            gr.Markdown("#### 📊 Result")
                            content_output_display = gr.Textbox(
                                label="",
                                lines=25,
                                placeholder="Results will appear here...",
                                show_copy_button=True,
                                container=False
                            )

                # Event Handlers
                task_dropdown.change(
                    fn=update_options_visibility,
                    inputs=[task_dropdown],
                    outputs=[
                        summary_style_opt, outline_sections_opt, outline_detail_opt,
                        explain_audience_opt, explain_length_opt, paraphrase_style_opt,
                        categories_input_opt, num_items_opt, question_type_opt
                    ]
                )

                run_task_btn.click(
                    fn=execute_content_task,
                    inputs=[
                        task_dropdown, doc_dropdown_content, content_text_input,
                        summary_style_opt, outline_sections_opt, outline_detail_opt,
                        explain_audience_opt, explain_length_opt, paraphrase_style_opt,
                        categories_input_opt, num_items_opt, question_type_opt
                    ],
                    outputs=[content_output_display]
                )

            with gr.Tab("🏷️ Generate Tags"):
                with gr.Row():
                    with gr.Column():
                        gr.Markdown("### Generate Document Tags")
                        doc_dropdown_tag_visible = gr.Dropdown(label="Select Document to Tag", choices=get_document_choices(), value=None, interactive=True, allow_custom_value=False)
                        tag_text_input = gr.Textbox(label="Or Paste Text to Generate Tags", placeholder="Paste any text here to generate tags...", lines=8)
                        max_tags_slider = gr.Slider(label="Number of Tags", minimum=3, maximum=15, value=5, step=1)
                        tag_btn_action = gr.Button("🏷️ Generate Tags", variant="primary", size="lg")
                    with gr.Column():
                        tag_output_display = gr.Textbox(label="Generated Tags", lines=10, placeholder="Tags will appear here...")

            with gr.Tab("🎙️ Voice Assistant"):
                gr.Markdown("""
                ### 🗣️ Talk to Your AI Librarian

                Have a natural conversation about your documents. Ask questions, request summaries,
                or explore your content library through voice-powered interaction.

                **Note:** Requires ElevenLabs API configuration.
                """)

                with gr.Row():
                    with gr.Column(scale=2):
                        # Status and Controls
                        with gr.Group():
                            voice_status_display = gr.Textbox(
                                label="Status",
                                value="Ready to start",
                                interactive=False,
                                lines=2
                            )

                            with gr.Row():
                                start_voice_btn = gr.Button("🎤 Start Conversation", variant="primary", size="lg")
                                stop_voice_btn = gr.Button("⏹️ Stop", variant="stop", size="lg", interactive=False)

                        # Message Input
                        with gr.Group():
                            gr.Markdown("#### 💬 Send Message")
                            voice_input_text = gr.Textbox(
                                label="",
                                placeholder="Type your question...",
                                lines=3,
                                container=False,
                                info="Press Enter or click Send"
                            )
                            send_voice_btn = gr.Button("📤 Send", variant="secondary")

                    with gr.Column(scale=3):
                        # Chat Interface with Gradio 6 Chatbot
                        with gr.Group():
                            voice_chatbot = gr.Chatbot(
                                label="Conversation",
                                type="messages",
                                height=500,
                                show_copy_button=True
                            )

                            clear_chat_btn = gr.Button("🗑️ Clear Chat", variant="secondary")

                # Voice Assistant event handlers
                start_voice_btn.click(
                    fn=start_voice_conversation,
                    outputs=[voice_status_display, start_voice_btn, stop_voice_btn, voice_chatbot]
                )

                stop_voice_btn.click(
                    fn=stop_voice_conversation,
                    outputs=[voice_status_display, start_voice_btn, stop_voice_btn, voice_chatbot]
                )

                # The Send button and pressing Enter share the same handler.
                send_voice_btn.click(
                    fn=send_voice_message_v6,
                    inputs=[voice_input_text, voice_chatbot],
                    outputs=[voice_chatbot, voice_input_text]
                )

                voice_input_text.submit(
                    fn=send_voice_message_v6,
                    inputs=[voice_input_text, voice_chatbot],
                    outputs=[voice_chatbot, voice_input_text]
                )

                clear_chat_btn.click(
                    fn=lambda: [],
                    outputs=[voice_chatbot]
                )

            with gr.Tab("🎧 Podcast Studio"):
                gr.Markdown("""
                ### 🎙️ AI-Powered Podcast Generation

                Transform your documents into engaging audio conversations. Select documents,
                customize the style and voices, and let AI create a professional podcast.

                **Powered by:** ElevenLabs AI Voice Technology
                """)

                with gr.Row():
                    with gr.Column(scale=2):
                        # Configuration Panel
                        with gr.Group():
                            gr.Markdown("#### 📚 Select Content")

                            podcast_doc_selector = gr.CheckboxGroup(
                                choices=get_document_choices(),
                                label="Documents to Include",
                                info="Choose 1-5 documents for best results",
                                interactive=True
                            )

                            with gr.Accordion("🎨 Podcast Settings", open=True):
                                with gr.Row():
                                    podcast_style = gr.Dropdown(
                                        label="Style",
                                        choices=["conversational", "educational", "technical", "casual"],
                                        value="conversational",
                                        info="Sets the tone and format"
                                    )

                                    podcast_duration = gr.Slider(
                                        label="Duration (minutes)",
                                        minimum=5,
                                        maximum=30,
                                        value=10,
                                        step=5,
                                        info="Approximate length"
                                    )

                                gr.Markdown("#### 🗣️ Voice Selection")
                                with gr.Row():
                                    host1_voice_selector = gr.Dropdown(
                                        label="Host 1",
                                        choices=["Rachel", "Adam", "Domi", "Bella", "Antoni", "Elli", "Josh"],
                                        value="Rachel"
                                    )
                                    host2_voice_selector = gr.Dropdown(
                                        label="Host 2",
                                        choices=["Adam", "Rachel", "Josh", "Sam", "Emily", "Antoni", "Arnold"],
                                        value="Adam"
                                    )

                            generate_podcast_btn = gr.Button(
                                "🎙️ Generate Podcast",
                                variant="primary",
                                size="lg"
                            )

                            podcast_status = gr.Textbox(
                                label="Status",
                                interactive=False,
                                lines=2
                            )

                            # Hidden field that receives the generated podcast's ID.
                            podcast_id_display = gr.Textbox(
                                label="Podcast ID",
                                interactive=False,
                                visible=False
                            )

                    with gr.Column(scale=3):
                        # Output Panel
                        with gr.Group():
                            gr.Markdown("#### 🎵 Generated Podcast")

                            podcast_audio_player = gr.Audio(
                                label="",
                                type="filepath",
                                interactive=False,
                                autoplay=True,
                                container=False
                            )

                            with gr.Accordion("📝 Transcript", open=False):
                                podcast_transcript_display = gr.Markdown(
                                    value="*Transcript will appear after generation...*"
                                )

                # Event handlers
                generate_podcast_btn.click(
                    fn=generate_podcast_ui,
                    inputs=[
                        podcast_doc_selector,
                        podcast_style,
                        podcast_duration,
                        host1_voice_selector,
                        host2_voice_selector
                    ],
                    outputs=[
                        podcast_status,
                        podcast_audio_player,
                        podcast_transcript_display,
                        podcast_id_display
                    ]
                )

            with gr.Tab("❓ Ask Questions"):
                with gr.Row():
                    with gr.Column():
                        gr.Markdown("""### Ask Questions About Your Documents
                        The AI will search through all your uploaded documents to find relevant information
                        and provide comprehensive answers with sources.""")
                        qa_question_input = gr.Textbox(label="Your Question", placeholder="Ask anything about your documents...", lines=3)
                        qa_btn_action = gr.Button("❓ Get Answer", variant="primary", size="lg")
                    with gr.Column():
                        qa_output_display = gr.Textbox(label="AI Answer", lines=20, placeholder="Answer will appear here with sources...")

        # Dropdowns that list library documents; refreshed together whenever
        # the library changes (refresh, upload, delete).
        all_dropdowns_to_update = [delete_doc_dropdown_visible, doc_dropdown_content, doc_dropdown_tag_visible]

        refresh_outputs = [document_list_display, *all_dropdowns_to_update]
        refresh_btn_library.click(fn=refresh_library, outputs=refresh_outputs)

        upload_outputs = [upload_output_display, doc_id_output_display, document_list_display, *all_dropdowns_to_update]
        upload_btn_process.click(upload_and_process_file, inputs=[file_input_upload], outputs=upload_outputs)

        delete_outputs = [delete_output_display, document_list_display, *all_dropdowns_to_update]
        delete_btn.click(delete_document_from_library, inputs=[delete_doc_dropdown_visible], outputs=delete_outputs)

        search_btn_action.click(perform_search, inputs=[search_query_input, search_top_k_slider], outputs=[search_output_display])
        tag_btn_action.click(generate_tags_for_document, inputs=[doc_dropdown_tag_visible, tag_text_input, max_tags_slider], outputs=[tag_output_display])
        qa_btn_action.click(ask_question, inputs=[qa_question_input], outputs=[qa_output_display])

        # Load dashboard stats on interface load
        interface.load(
            fn=load_dashboard_stats,
            outputs=[total_docs, total_chunks, storage_size, recent_docs, vector_status, llm_status, voice_status]
        )

        # Also refresh the library list and dropdowns on page load.
        interface.load(fn=refresh_library, outputs=refresh_outputs)
    return interface
1371
+
1372
# Script entry point: build the UI and start the Gradio server.
if __name__ == "__main__":
    gradio_interface = create_gradio_interface()
    # mcp_server=True is forwarded to Gradio's launch().
    # NOTE(review): presumably this also exposes the app over MCP - confirm
    # against the Gradio version pinned in requirements.txt.
    gradio_interface.launch(mcp_server=True)
config.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import Optional
3
+ from dotenv import load_dotenv
4
+
5
+ load_dotenv()
6
+
7
+
8
class Config:
    """Application settings, read from environment variables.

    Every attribute is evaluated once, when the class body runs at import
    time. Optional keys are None when their variable is unset; the other
    settings fall back to the defaults given here.
    """

    # --- API keys ---
    NEBIUS_API_KEY: Optional[str] = os.environ.get("NEBIUS_API_KEY")
    MISTRAL_API_KEY: Optional[str] = os.environ.get("MISTRAL_API_KEY")
    # HF_TOKEN is accepted as a fallback name for the Hugging Face key.
    HUGGINGFACE_API_KEY: Optional[str] = os.environ.get("HUGGINGFACE_API_KEY", os.environ.get("HF_TOKEN"))
    OPENAI_API_KEY: Optional[str] = os.environ.get("OPENAI_API_KEY")
    ANTHROPIC_API_KEY: Optional[str] = os.environ.get("ANTHROPIC_API_KEY")

    # --- NEBIUS configuration (OpenAI OSS models) ---
    NEBIUS_BASE_URL: str = os.environ.get("NEBIUS_BASE_URL", "https://api.studio.nebius.com/v1/")
    NEBIUS_MODEL: str = os.environ.get("NEBIUS_MODEL", "meta-llama/Llama-3.3-70B-Instruct")

    # --- Model configuration ---
    # Using OpenAI managed embeddings for performance/quality
    EMBEDDING_MODEL: str = os.environ.get("EMBEDDING_MODEL", "text-embedding-3-small")

    MISTRAL_MODEL: str = os.environ.get("MISTRAL_MODEL", "mistral-large-2407")
    OPENAI_MODEL: str = os.environ.get("OPENAI_MODEL", "gpt-5.1-chat-latest")
    FAST_MODEL: str = os.environ.get("FAST_MODEL", "gpt-5-mini")

    # --- Vector store configuration ---
    DATA_DIR: str = os.environ.get("DATA_DIR", "./data")
    VECTOR_STORE_PATH: str = os.environ.get("VECTOR_STORE_PATH", "./data/vector_store")
    DOCUMENT_STORE_PATH: str = os.environ.get("DOCUMENT_STORE_PATH", "./data/documents")
    INDEX_NAME: str = os.environ.get("INDEX_NAME", "content_index")

    # --- Processing configuration ---
    # Numeric settings are parsed eagerly; a non-numeric value raises
    # ValueError while this class body is executing.
    CHUNK_SIZE: int = int(os.environ.get("CHUNK_SIZE", "500"))
    CHUNK_OVERLAP: int = int(os.environ.get("CHUNK_OVERLAP", "50"))
    MAX_CONCURRENT_REQUESTS: int = int(os.environ.get("MAX_CONCURRENT_REQUESTS", "5"))

    # --- Search configuration ---
    DEFAULT_TOP_K: int = int(os.environ.get("DEFAULT_TOP_K", "5"))
    SIMILARITY_THRESHOLD: float = float(os.environ.get("SIMILARITY_THRESHOLD", "0.3"))

    # --- OCR configuration ---
    TESSERACT_PATH: Optional[str] = os.environ.get("TESSERACT_PATH")
    OCR_LANGUAGE: str = os.environ.get("OCR_LANGUAGE", "eng")

    # --- ElevenLabs configuration ---
    ELEVENLABS_API_KEY: Optional[str] = os.environ.get("ELEVENLABS_API_KEY")
    ELEVENLABS_AGENT_ID: Optional[str] = os.environ.get("ELEVENLABS_AGENT_ID")
    ELEVENLABS_VOICE_MODEL: str = os.environ.get("ELEVENLABS_VOICE_MODEL", "Rachel")

    # --- App configuration ---
    HOST: str = os.environ.get("HOST", "0.0.0.0")
    PORT: int = int(os.environ.get("PORT", "7860"))
    # Only the string "true" (any letter case) enables debug mode.
    DEBUG: bool = os.environ.get("DEBUG", "False").lower() == "true"


# Shared settings instance imported by the rest of the application.
config = Config()
core/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Core module initialization
core/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (145 Bytes). View file
 
core/__pycache__/chunker.cpython-313.pyc ADDED
Binary file (11.1 kB). View file
 
core/__pycache__/document_parser.cpython-313.pyc ADDED
Binary file (10.5 kB). View file
 
core/__pycache__/models.cpython-313.pyc ADDED
Binary file (7.06 kB). View file
 
core/__pycache__/text_preprocessor.cpython-313.pyc ADDED
Binary file (9.25 kB). View file
 
core/chunker.py ADDED
@@ -0,0 +1,303 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # chunker.py
2
+ import logging
3
+ from typing import List, Dict, Any, Optional
4
+ import re
5
+ from .models import Chunk
6
+ from .text_preprocessor import TextPreprocessor
7
+ import config
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
class TextChunker:
    """Split document text into ``Chunk`` objects.

    Four strategies are available through ``chunk_document``: ``recursive``
    (split on progressively finer separators), ``sentence``, ``paragraph``
    and ``fixed`` (sliding character window). Chunk size and overlap are
    read from the application config and are measured in characters.
    """

    # Separators tried by the recursive strategy, coarsest first:
    # paragraphs, lines, sentences, clauses, words.
    _RECURSIVE_SEPARATORS = ["\n\n", "\n", ". ", ", ", " "]

    def __init__(self):
        self.config = config.config
        self.preprocessor = TextPreprocessor()

        self.chunk_size = self.config.CHUNK_SIZE
        self.chunk_overlap = self.config.CHUNK_OVERLAP

    def chunk_document(self, document_id: str, content: str, method: str = "recursive") -> List[Chunk]:
        """Chunk a document using the specified method.

        Args:
            document_id: Identifier stored on every produced chunk.
            content: Full document text; empty content yields no chunks.
            method: ``"recursive"``, ``"sentence"``, ``"paragraph"`` or
                ``"fixed"``. Unknown values fall back to ``"recursive"``.

        Returns:
            The list of chunks. If the selected strategy raises, the error
            is logged and fixed-size chunking is used instead.
        """
        if not content:
            return []

        try:
            if method == "recursive":
                return self._recursive_chunk(document_id, content)
            elif method == "sentence":
                return self._sentence_chunk(document_id, content)
            elif method == "paragraph":
                return self._paragraph_chunk(document_id, content)
            elif method == "fixed":
                return self._fixed_chunk(document_id, content)
            else:
                logger.warning(f"Unknown chunking method: {method}, using recursive")
                return self._recursive_chunk(document_id, content)
        except Exception as e:
            logger.error(f"Error chunking document: {str(e)}")
            # Fallback to simple fixed chunking
            return self._fixed_chunk(document_id, content)

    @staticmethod
    def _split_text(text: str, separators: List[str], chunk_size: int) -> List[str]:
        """Split ``text`` into pieces of at most ``chunk_size`` characters.

        The first separator is tried first; any piece that is still too
        large is split again with the remaining separators, and finally by
        raw character count when no separators are left. Every returned
        piece is a contiguous substring of ``text``.
        """
        if len(text) <= chunk_size:
            return [text] if text.strip() else []

        if not separators:
            # If no separators left, split by character
            return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

        separator = separators[0]
        remaining_separators = separators[1:]

        result: List[str] = []
        current_chunk = ""

        for split in text.split(separator):
            if len(current_chunk) + len(split) + len(separator) <= chunk_size:
                if current_chunk:
                    current_chunk += separator + split
                else:
                    current_chunk = split
            else:
                if current_chunk:
                    result.append(current_chunk)

                if len(split) > chunk_size:
                    # Split is too big, need to split further
                    result.extend(TextChunker._split_text(split, remaining_separators, chunk_size))
                    current_chunk = ""
                else:
                    current_chunk = split

        if current_chunk:
            result.append(current_chunk)

        return result

    @staticmethod
    def _locate(content: str, piece: str, cursor: int) -> int:
        """Return the offset of ``piece`` in ``content`` at or after ``cursor``.

        Searching from a moving cursor keeps repeated text from resolving
        to its first occurrence. Falls back to a search from the start,
        and returns -1 when the piece is not present at all.
        """
        position = content.find(piece, cursor)
        if position == -1:
            position = content.find(piece)
        return position

    def _build_chunk(
        self,
        document_id: str,
        chunk_index: int,
        text: str,
        start_pos: int,
        metadata: Dict[str, Any],
    ) -> Chunk:
        """Create a ``Chunk`` for ``text`` beginning at ``start_pos``."""
        return Chunk(
            id=self._generate_chunk_id(document_id, chunk_index),
            document_id=document_id,
            content=text.strip(),
            chunk_index=chunk_index,
            start_pos=start_pos,
            end_pos=start_pos + len(text),
            metadata=metadata,
        )

    def _recursive_chunk(self, document_id: str, content: str) -> List[Chunk]:
        """Recursively split text by different separators.

        Each chunk after the first is prefixed with the tail of the
        previous piece (``chunk_overlap`` characters) so neighbouring
        chunks share context. ``start_pos``/``end_pos`` describe the piece
        without that overlap prefix.
        """
        chunks: List[Chunk] = []
        text_chunks = self._split_text(content, self._RECURSIVE_SEPARATORS, self.chunk_size)

        # Pieces are consecutive, non-overlapping substrings of ``content``,
        # so each one is searched for after the end of the previous match.
        cursor = 0

        for i, chunk_text in enumerate(text_chunks):
            if not chunk_text.strip():
                continue

            # Calculate positions
            start_pos = content.find(chunk_text, cursor)
            if start_pos == -1:
                start_pos = i * self.chunk_size
            else:
                cursor = start_pos + len(chunk_text)
            end_pos = start_pos + len(chunk_text)

            # Add overlap from previous chunk if not the first chunk
            if i > 0 and self.chunk_overlap > 0:
                prev_chunk = text_chunks[i - 1]
                overlap_text = prev_chunk[-self.chunk_overlap:] if len(prev_chunk) > self.chunk_overlap else prev_chunk
                chunk_text = overlap_text + " " + chunk_text

            chunks.append(
                Chunk(
                    id=self._generate_chunk_id(document_id, i),
                    document_id=document_id,
                    content=chunk_text.strip(),
                    chunk_index=i,
                    start_pos=start_pos,
                    end_pos=end_pos,
                    metadata={
                        "chunk_method": "recursive",
                        "original_length": len(chunk_text),
                        "word_count": len(chunk_text.split())
                    }
                )
            )

        return chunks

    def _sentence_chunk(self, document_id: str, content: str) -> List[Chunk]:
        """Chunk text by sentences.

        Sentences come from ``TextPreprocessor.extract_sentences`` and are
        packed, space-joined, into chunks of roughly ``chunk_size``
        characters.
        """
        chunks: List[Chunk] = []
        current_chunk = ""
        start_pos = 0
        cursor = 0

        def flush() -> None:
            chunks.append(
                self._build_chunk(
                    document_id,
                    len(chunks),
                    current_chunk,
                    start_pos,
                    {
                        "chunk_method": "sentence",
                        "sentence_count": len(self.preprocessor.extract_sentences(current_chunk))
                    },
                )
            )

        for sentence in self.preprocessor.extract_sentences(content):
            sentence_pos = self._locate(content, sentence, cursor)
            if sentence_pos != -1:
                cursor = sentence_pos + len(sentence)

            if current_chunk and len(current_chunk) + len(sentence) <= self.chunk_size:
                current_chunk += " " + sentence
                continue

            if current_chunk:
                flush()

            # Start a new chunk with this sentence (even if it is oversized).
            current_chunk = sentence
            start_pos = sentence_pos

        # Add final chunk
        if current_chunk:
            flush()

        return chunks

    def _paragraph_chunk(self, document_id: str, content: str) -> List[Chunk]:
        """Chunk text by paragraphs (blank-line separated).

        Paragraphs are packed into chunks of roughly ``chunk_size``
        characters. A paragraph longer than ``chunk_size`` is split with
        the fixed-size strategy and its sub-chunks are re-indexed and
        shifted to document offsets.
        """
        chunks: List[Chunk] = []
        paragraphs = [p.strip() for p in content.split('\n\n') if p.strip()]

        current_chunk = ""
        start_pos = 0
        cursor = 0

        def flush() -> None:
            chunks.append(
                self._build_chunk(
                    document_id,
                    len(chunks),
                    current_chunk,
                    start_pos,
                    {
                        "chunk_method": "paragraph",
                        "paragraph_count": len([p for p in current_chunk.split('\n\n') if p.strip()])
                    },
                )
            )

        for paragraph in paragraphs:
            paragraph_pos = self._locate(content, paragraph, cursor)
            if paragraph_pos != -1:
                cursor = paragraph_pos + len(paragraph)

            if len(current_chunk) + len(paragraph) <= self.chunk_size:
                if current_chunk:
                    current_chunk += "\n\n" + paragraph
                else:
                    current_chunk = paragraph
                    start_pos = paragraph_pos
                continue

            if current_chunk:
                flush()
                # Reset so the flushed text is not emitted a second time
                # when the oversized branch below leaves it untouched.
                current_chunk = ""

            # If paragraph is too long, split it further
            if len(paragraph) > self.chunk_size:
                for pc in self._fixed_chunk(document_id, paragraph):
                    pc.chunk_index = len(chunks)
                    pc.id = self._generate_chunk_id(document_id, pc.chunk_index)
                    if paragraph_pos != -1:
                        # _fixed_chunk reports paragraph-relative offsets.
                        pc.start_pos += paragraph_pos
                        pc.end_pos += paragraph_pos
                    chunks.append(pc)
            else:
                current_chunk = paragraph
                start_pos = paragraph_pos

        # Add final chunk
        if current_chunk:
            flush()

        return chunks

    def _fixed_chunk(self, document_id: str, content: str) -> List[Chunk]:
        """Simple fixed-size chunking with overlap.

        Raises:
            ValueError: If ``chunk_size`` is not positive.
        """
        if self.chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")

        step = self.chunk_size - self.chunk_overlap
        if step <= 0:
            # An overlap >= chunk size would make the window never advance
            # (range() rejects step 0 and yields nothing for a negative step).
            logger.warning(
                f"chunk_overlap ({self.chunk_overlap}) >= chunk_size ({self.chunk_size}); "
                "chunking without overlap"
            )
            step = self.chunk_size

        chunks: List[Chunk] = []

        for i in range(0, len(content), step):
            chunk_text = content[i:i + self.chunk_size]

            if not chunk_text.strip():
                continue

            chunks.append(
                Chunk(
                    id=self._generate_chunk_id(document_id, len(chunks)),
                    document_id=document_id,
                    content=chunk_text.strip(),
                    chunk_index=len(chunks),
                    start_pos=i,
                    end_pos=min(i + self.chunk_size, len(content)),
                    metadata={
                        "chunk_method": "fixed",
                        "original_length": len(chunk_text)
                    }
                )
            )

        return chunks

    def _generate_chunk_id(self, document_id: str, chunk_index: int) -> str:
        """Generate a chunk ID that is unique within its document."""
        return f"{document_id}_chunk_{chunk_index}"

    def optimize_chunks_for_embedding(self, chunks: List[Chunk]) -> List[Chunk]:
        """Return copies of ``chunks`` with content cleaned for embedding.

        Chunks whose cleaned content has fewer than five words are dropped.
        The input chunks are not modified.
        """
        optimized_chunks: List[Chunk] = []

        for chunk in chunks:
            # Clean the content for embedding
            clean_content = self.preprocessor.prepare_for_embedding(chunk.content)

            # Skip very short chunks
            if len(clean_content.split()) < 5:
                continue

            optimized_chunks.append(
                Chunk(
                    id=chunk.id,
                    document_id=chunk.document_id,
                    content=clean_content,
                    chunk_index=chunk.chunk_index,
                    start_pos=chunk.start_pos,
                    end_pos=chunk.end_pos,
                    metadata={
                        **chunk.metadata,
                        "optimized_for_embedding": True,
                        "original_content_length": len(chunk.content),
                        "optimized_content_length": len(clean_content)
                    }
                )
            )

        return optimized_chunks
core/document_parser.py ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import tempfile
3
+ import os
4
+ from pathlib import Path
5
+ from typing import Optional, Dict, Any
6
+ import asyncio
7
+
8
+ # Document processing libraries
9
+ import PyPDF2
10
+ from docx import Document as DocxDocument
11
+ from PIL import Image
12
+ import pytesseract
13
+
14
+ from .models import Document, DocumentType
15
+ import config
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
class DocumentParser:
    """Extract plain text and basic metadata from PDF, text, DOCX and image files."""

    # File extensions routed to OCR.
    _IMAGE_EXTENSIONS = frozenset(['.png', '.jpg', '.jpeg', '.bmp', '.tiff'])

    # Common English function words used by the language heuristic.
    # A frozenset gives constant-time membership tests per word.
    _ENGLISH_WORDS = frozenset([
        "the", "and", "or", "but", "in", "on", "at", "to", "for", "of",
        "with", "by", "from", "as", "is", "was", "are", "were", "be",
        "been", "have", "has", "had", "do", "does", "did", "will", "would",
        "could", "should", "may", "might", "can", "this", "that", "these",
        "those",
    ])

    def __init__(self):
        self.config = config.config

    async def parse_document(self, file_path: str, filename: str) -> Document:
        """Parse a document and extract its content.

        Args:
            file_path: Location of the file on disk.
            filename: Original filename; its extension selects the parser.

        Returns:
            A ``Document`` with the extracted text and basic metadata.

        Raises:
            ValueError: If the file extension is not supported.
            Exception: Any parser error is logged and re-raised.
        """
        try:
            file_ext = Path(filename).suffix.lower()
            file_size = os.path.getsize(file_path)

            # Determine document type and parse accordingly
            if file_ext == '.pdf':
                content = await self._parse_pdf(file_path)
                doc_type = DocumentType.PDF
            elif file_ext == '.txt':
                content = await self._parse_text(file_path)
                doc_type = DocumentType.TEXT
            elif file_ext == '.docx':
                content = await self._parse_docx(file_path)
                doc_type = DocumentType.DOCX
            elif file_ext in self._IMAGE_EXTENSIONS:
                content = await self._parse_image(file_path)
                doc_type = DocumentType.IMAGE
            else:
                raise ValueError(f"Unsupported file type: {file_ext}")

            # Create document object
            document = Document(
                id=self._generate_document_id(),
                filename=filename,
                content=content,
                doc_type=doc_type,
                file_size=file_size,
                metadata={
                    "file_extension": file_ext,
                    "content_length": len(content),
                    "word_count": len(content.split()) if content else 0
                }
            )

            logger.info(f"Successfully parsed document: {filename}")
            return document

        except Exception as e:
            logger.error(f"Error parsing document {filename}: {str(e)}")
            raise

    async def _parse_pdf(self, file_path: str) -> str:
        """Extract text from a PDF file, one ``--- Page N ---`` section per page.

        Pages that fail to extract, or that contain no text, are skipped.
        """
        try:
            parts = []
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                for page_num, page in enumerate(pdf_reader.pages):
                    try:
                        # Guard against extract_text() returning None.
                        page_text = page.extract_text() or ""
                        if page_text.strip():
                            parts.append(f"\n--- Page {page_num + 1} ---\n")
                            parts.append(page_text + "\n")
                    except Exception as e:
                        logger.warning(f"Error extracting text from page {page_num + 1}: {str(e)}")
                        continue

            return "".join(parts).strip()
        except Exception as e:
            logger.error(f"Error parsing PDF: {str(e)}")
            raise

    async def _parse_text(self, file_path: str) -> str:
        """Read a plain text file as UTF-8, ignoring undecodable bytes."""
        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
                content = file.read()
            return content.strip()
        except Exception as e:
            logger.error(f"Error parsing text file: {str(e)}")
            raise

    async def _parse_docx(self, file_path: str) -> str:
        """Extract text from a DOCX file: paragraphs first, then table rows.

        Table rows are rendered as their non-empty cells joined by ``" | "``.
        """
        try:
            doc = DocxDocument(file_path)
            lines = []

            for paragraph in doc.paragraphs:
                if paragraph.text.strip():
                    lines.append(paragraph.text + "\n")

            # Extract text from tables
            for table in doc.tables:
                for row in table.rows:
                    row_text = [cell.text.strip() for cell in row.cells if cell.text.strip()]
                    if row_text:
                        lines.append(" | ".join(row_text) + "\n")

            return "".join(lines).strip()
        except Exception as e:
            logger.error(f"Error parsing DOCX file: {str(e)}")
            raise

    async def _parse_image(self, file_path: str) -> str:
        """Extract text from an image using OCR.

        Uses ``self.ocr_service`` when one has been attached to the parser
        and it returns text; otherwise falls back to pytesseract directly.
        Returns an empty string if OCR fails (best effort).
        """
        try:
            # First try with OCR service if available
            ocr_service = getattr(self, 'ocr_service', None)
            if ocr_service:
                logger.info(f"Using OCR service for image: {file_path}")
                text = await ocr_service.extract_text_from_image(file_path)
                if text:
                    return text

            # Fallback to direct pytesseract
            logger.info(f"Using direct pytesseract for image: {file_path}")

            # The context manager closes the underlying file handle even
            # when OCR raises.
            with Image.open(file_path) as image:
                content = pytesseract.image_to_string(
                    image,
                    lang=self.config.OCR_LANGUAGE,
                    config='--psm 6'  # Assume a single uniform block of text
                )

            return content.strip()
        except Exception as e:
            logger.error(f"Error performing OCR on image: {str(e)}")
            # Return empty string if OCR fails
            return ""

    def _generate_document_id(self) -> str:
        """Generate a unique document ID (random UUID4 string)."""
        import uuid
        return str(uuid.uuid4())

    async def extract_metadata(self, file_path: str, content: str) -> Dict[str, Any]:
        """Extract additional metadata from the document.

        Returns:
            A dict with content statistics, file size and timestamps from
            ``os.stat``, and, for non-empty content, an estimated language
            and reading time. An empty dict is returned on any error.
        """
        try:
            metadata = {}

            # Basic statistics
            metadata["content_length"] = len(content)
            metadata["word_count"] = len(content.split()) if content else 0
            metadata["line_count"] = len(content.splitlines()) if content else 0

            # File information
            file_stat = os.stat(file_path)
            metadata["file_size"] = file_stat.st_size
            metadata["created_time"] = file_stat.st_ctime
            metadata["modified_time"] = file_stat.st_mtime

            # Content analysis
            if content:
                # Language detection (simple heuristic)
                metadata["estimated_language"] = self._detect_language(content)

                # Reading time estimation (average 200 words per minute)
                metadata["estimated_reading_time_minutes"] = max(1, metadata["word_count"] // 200)

            return metadata
        except Exception as e:
            logger.error(f"Error extracting metadata: {str(e)}")
            return {}

    def _detect_language(self, content: str) -> str:
        """Guess the language from the share of common English words.

        This is a very basic heuristic: returns ``"en"`` when more than 10%
        of whitespace-separated tokens are common English words, otherwise
        ``"unknown"``. Use a proper language detection library if accuracy
        matters.
        """
        if not content:
            return "unknown"

        words = content.lower().split()
        english_count = sum(1 for word in words if word in self._ENGLISH_WORDS)

        if len(words) > 0 and english_count / len(words) > 0.1:
            return "en"
        else:
            return "unknown"
core/models.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel, Field
2
+ from typing import List, Optional, Dict, Any
3
+ from datetime import datetime
4
+ from enum import Enum
5
+
6
class DocumentType(str, Enum):
    # Kinds of source documents; members are plain strings, so they
    # compare equal to (and serialize as) their values.
    PDF = "pdf"
    TEXT = "txt"
    DOCX = "docx"
    IMAGE = "image"
    HTML = "html"
12
+
13
class ProcessingStatus(str, Enum):
    # Lifecycle states of a processing task; members are plain strings.
    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"
18
+
19
class Document(BaseModel):
    # A parsed document: extracted text plus descriptive fields.
    id: str = Field(..., description="Unique document identifier")
    filename: str = Field(..., description="Original filename")
    content: str = Field(..., description="Extracted text content")
    doc_type: DocumentType = Field(..., description="Document type")
    file_size: int = Field(..., description="File size in bytes")
    created_at: datetime = Field(default_factory=datetime.utcnow)
    metadata: Dict[str, Any] = Field(default_factory=dict)
    tags: List[str] = Field(default_factory=list)
    summary: Optional[str] = None
    category: Optional[str] = None
    language: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        # Serialize for display: content is cut to a 500-character preview
        # followed by "..." when it is longer than that.
        preview = self.content
        if len(preview) > 500:
            preview = preview[:500] + "..."

        return {
            "id": self.id,
            "filename": self.filename,
            "content": preview,
            "doc_type": self.doc_type,
            "file_size": self.file_size,
            "created_at": self.created_at.isoformat(),
            "metadata": self.metadata,
            "tags": self.tags,
            "summary": self.summary,
            "category": self.category,
            "language": self.language
        }
46
+
47
class Chunk(BaseModel):
    # One slice of a document's text, with its offsets and optional embedding.
    id: str = Field(
        ...,
        description="Unique chunk identifier",
    )
    document_id: str = Field(
        ...,
        description="Parent document ID",
    )
    content: str = Field(
        ...,
        description="Chunk text content",
    )
    chunk_index: int = Field(
        ...,
        description="Position in document",
    )
    start_pos: int = Field(
        ...,
        description="Start position in original document",
    )
    end_pos: int = Field(
        ...,
        description="End position in original document",
    )
    # Unset until an embedding has been attached.
    embedding: Optional[List[float]] = None
    metadata: Dict[str, Any] = Field(default_factory=dict)
56
+
57
class SearchResult(BaseModel):
    # A single search hit: the matching chunk and its similarity score.
    chunk_id: str = Field(..., description="Matching chunk ID")
    document_id: str = Field(..., description="Source document ID")
    content: str = Field(..., description="Matching content")
    score: float = Field(..., description="Similarity score")
    metadata: Dict[str, Any] = Field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        # Plain-dict view of the hit, fields in declaration order.
        field_names = ("chunk_id", "document_id", "content", "score", "metadata")
        return {name: getattr(self, name) for name in field_names}
72
+
73
class ProcessingTask(BaseModel):
    # Progress record for a processing job.
    task_id: str = Field(
        ...,
        description="Unique task identifier",
    )
    document_id: Optional[str] = None
    status: ProcessingStatus = ProcessingStatus.PENDING
    # Validated to stay within 0.0 .. 100.0.
    progress: float = Field(
        default=0.0,
        ge=0.0,
        le=100.0,
    )
    message: Optional[str] = None
    error: Optional[str] = None
    created_at: datetime = Field(default_factory=datetime.utcnow)
    updated_at: datetime = Field(default_factory=datetime.utcnow)
82
+
83
class SummaryRequest(BaseModel):
    # Request payload for summarization; both content and document_id are optional.
    content: Optional[str] = None
    document_id: Optional[str] = None
    style: str = Field(
        default="concise",
        description="Summary style",
    )
    max_length: Optional[int] = None
88
+
89
class TagGenerationRequest(BaseModel):
    # Request payload for tag generation.
    content: Optional[str] = None
    document_id: Optional[str] = None
    # Validated to stay within 1 .. 20.
    max_tags: int = Field(
        default=5,
        ge=1,
        le=20,
    )
93
+
94
class QuestionAnswerRequest(BaseModel):
    # Request payload for question answering.
    question: str = Field(
        ...,
        description="Question to answer",
    )
    context_filter: Optional[Dict[str, Any]] = None
    max_context_length: int = Field(default=2000)
98
+
99
class CategorizationRequest(BaseModel):
    # Request payload for categorization; every field is optional.
    content: Optional[str] = None
    document_id: Optional[str] = None
    categories: Optional[List[str]] = None
core/text_preprocessor.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import logging
3
+ from typing import List, Optional
4
+ import unicodedata
5
+
6
+ logger = logging.getLogger(__name__)
7
+
8
class TextPreprocessor:
    """Text cleaning and lightweight analysis helpers.

    Every public method accepts a string and returns an empty result for
    empty input. Unexpected errors are logged and a safe fallback value is
    returned instead of raising.
    """

    def __init__(self):
        # Common stop words for basic filtering, keyed by language code
        self.stop_words = {
            'en': {
                'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
                'of', 'with', 'by', 'from', 'up', 'about', 'into', 'through', 'during',
                'before', 'after', 'above', 'below', 'between', 'among', 'throughout',
                'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
                'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might',
                'must', 'shall', 'can', 'this', 'that', 'these', 'those', 'i', 'me',
                'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours'
            }
        }

    def clean_text(self, text: str, aggressive: bool = False) -> str:
        """Clean and normalize text.

        Args:
            text: Raw text.
            aggressive: When true, strip brackets and quotes as well and
                collapse every punctuation run to a single ``.`` (intended
                for embedding input).

        Returns:
            The cleaned text with whitespace collapsed, or the text as far
            as it was processed if an error occurs.
        """
        if not text:
            return ""

        try:
            # Normalize to the composed compatibility form. The decomposed
            # form (NFKD) would split accented letters into base letter +
            # combining mark, and the filters below would then replace the
            # mark with a space, breaking the word in two.
            text = unicodedata.normalize('NFKC', text)

            # Remove excessive whitespace
            text = re.sub(r'\s+', ' ', text)

            # Remove or replace special characters
            if aggressive:
                # More aggressive cleaning for embedding
                text = re.sub(r'[^\w\s\-.,!?;:]', ' ', text)
                text = re.sub(r'[.,!?;:]+', '.', text)
            else:
                # Basic cleaning for readability
                text = re.sub(r'[^\w\s\-.,!?;:()\[\]{}"\']', ' ', text)

            # Remove excessive punctuation
            text = re.sub(r'\.{2,}', '.', text)
            text = re.sub(r'[!?]{2,}', '!', text)

            # Clean up whitespace again
            text = re.sub(r'\s+', ' ', text)

            # Remove leading/trailing whitespace
            return text.strip()
        except Exception as e:
            logger.error(f"Error cleaning text: {str(e)}")
            return text

    def extract_sentences(self, text: str) -> List[str]:
        """Extract sentences from text.

        Splits on runs of ``.``, ``!`` and ``?`` and keeps the stripped
        fragments longer than 10 characters. On error the whole text is
        returned as a single sentence.
        """
        if not text:
            return []

        try:
            # Simple sentence splitting
            fragments = (fragment.strip() for fragment in re.split(r'[.!?]+', text))

            # Minimum sentence length is 10 characters
            return [fragment for fragment in fragments if len(fragment) > 10]
        except Exception as e:
            logger.error(f"Error extracting sentences: {str(e)}")
            return [text]

    def extract_keywords(self, text: str, language: str = 'en', max_keywords: int = 20) -> List[str]:
        """Extract potential keywords from text.

        Keywords are lowercase ASCII words of three or more letters that
        are not stop words for ``language``, ordered by descending
        frequency (ties keep first-appearance order).
        """
        if not text:
            return []

        try:
            # Convert to lowercase and split into words
            words = re.findall(r'\b[a-zA-Z]{3,}\b', text.lower())

            # Remove stop words and count word frequency in one pass
            stop_words = self.stop_words.get(language, set())
            word_freq = {}
            for word in words:
                if word not in stop_words:
                    word_freq[word] = word_freq.get(word, 0) + 1

            # Sort by frequency and return top keywords
            sorted_keywords = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)

            return [word for word, _ in sorted_keywords[:max_keywords]]
        except Exception as e:
            logger.error(f"Error extracting keywords: {str(e)}")
            return []

    def prepare_for_embedding(self, text: str) -> str:
        """Prepare text specifically for embedding generation.

        Applies aggressive cleaning, drops single-character tokens and
        truncates the result to 5000 characters (plus ``"..."``).
        """
        if not text:
            return ""

        try:
            # Clean text aggressively for better embeddings
            clean_text = self.clean_text(text, aggressive=True)

            # Remove very short words and rejoin
            result = ' '.join(word for word in clean_text.split() if len(word) >= 2)

            # Truncate if too long (most embedding models have token limits)
            if len(result) > 5000:  # Rough character limit
                result = result[:5000] + "..."

            return result
        except Exception as e:
            logger.error(f"Error preparing text for embedding: {str(e)}")
            return text

    def extract_metadata_from_text(self, text: str) -> dict:
        """Extract metadata from text content.

        Returns:
            A dict of counts, averages, ``has_*`` content flags and
            punctuation/capital-letter ratios; empty on error or empty
            input.
        """
        if not text:
            return {}

        try:
            metadata = {}
            words = text.split()

            # Basic statistics
            metadata['character_count'] = len(text)
            metadata['word_count'] = len(words)
            metadata['sentence_count'] = len(self.extract_sentences(text))
            metadata['paragraph_count'] = len([p for p in text.split('\n\n') if p.strip()])

            # Content characteristics
            metadata['avg_word_length'] = sum(len(word) for word in words) / max(1, len(words))
            metadata['avg_sentence_length'] = metadata['word_count'] / max(1, metadata['sentence_count'])

            # Special content detection
            metadata['has_urls'] = bool(re.search(r'https?://\S+', text))
            # The top-level-domain class is letters only: a "|" inside a
            # character class is a literal pipe, not alternation.
            metadata['has_emails'] = bool(re.search(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', text))
            metadata['has_phone_numbers'] = bool(re.search(r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b', text))
            metadata['has_dates'] = bool(re.search(r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b', text))
            metadata['has_numbers'] = bool(re.search(r'\b\d+\b', text))

            # Language indicators
            metadata['punctuation_density'] = len(re.findall(r'[.,!?;:]', text)) / max(1, len(text))
            metadata['caps_ratio'] = len(re.findall(r'[A-Z]', text)) / max(1, len(text))

            return metadata
        except Exception as e:
            logger.error(f"Error extracting text metadata: {str(e)}")
            return {}

    def normalize_for_search(self, text: str) -> str:
        """Normalize text for search queries.

        Lowercases, replaces every non-word, non-space character with a
        space, collapses whitespace and strips the ends.
        """
        if not text:
            return ""

        try:
            # Convert to lowercase
            text = text.lower()

            # Remove special characters but keep spaces
            text = re.sub(r'[^\w\s]', ' ', text)

            # Normalize whitespace
            text = re.sub(r'\s+', ' ', text)

            # Strip leading/trailing whitespace
            return text.strip()
        except Exception as e:
            logger.error(f"Error normalizing text for search: {str(e)}")
            return text
mcp_server.py ADDED
@@ -0,0 +1,290 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import logging
3
+ from typing import Dict, Any, List, Optional
4
+ from pathlib import Path
5
+
6
+ from mcp.server.fastmcp import FastMCP
7
+
8
+ from services.vector_store_service import VectorStoreService
9
+ from services.document_store_service import DocumentStoreService
10
+ from services.embedding_service import EmbeddingService
11
+ from services.llm_service import LLMService
12
+ from services.ocr_service import OCRService
13
+
14
+ from mcp_tools.ingestion_tool import IngestionTool
15
+ from mcp_tools.search_tool import SearchTool
16
+ from mcp_tools.generative_tool import GenerativeTool
17
+
18
+ # Phase 2 & 3: Voice and Podcast
19
+ from services.llamaindex_service import LlamaIndexService
20
+ from services.elevenlabs_service import ElevenLabsService
21
+ from services.podcast_generator_service import PodcastGeneratorService
22
+ from mcp_tools.voice_tool import VoiceTool
23
+ from mcp_tools.podcast_tool import PodcastTool
24
+
25
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# --- Core services ---------------------------------------------------------
# Instantiated once at import time and shared by every tool below.
logger.info("Initializing services for FastMCP...")
vector_store_service = VectorStoreService()
document_store_service = DocumentStoreService()
embedding_service_instance = EmbeddingService()
llm_service_instance = LLMService()
ocr_service_instance = OCRService()

# --- Core tools ------------------------------------------------------------
ingestion_tool_instance = IngestionTool(
    vector_store=vector_store_service,
    document_store=document_store_service,
    embedding_service=embedding_service_instance,
    ocr_service=ocr_service_instance,
)
search_tool_instance = SearchTool(
    vector_store=vector_store_service,
    embedding_service=embedding_service_instance,
    document_store=document_store_service,
)
generative_tool_instance = GenerativeTool(
    llm_service=llm_service_instance,
    search_tool=search_tool_instance,
)

# --- Phase 2 & 3 services (voice and podcast) --------------------------------
logger.info("Initializing Phase 2 & 3 services...")
llamaindex_service_instance = LlamaIndexService(document_store_service)
elevenlabs_service_instance = ElevenLabsService(llamaindex_service_instance)
podcast_generator_instance = PodcastGeneratorService(
    llamaindex_service=llamaindex_service_instance,
    llm_service=llm_service_instance,
)

voice_tool_instance = VoiceTool(elevenlabs_service_instance)
podcast_tool_instance = PodcastTool(podcast_generator_instance)

# The server is created with an empty name string.
mcp = FastMCP("")
logger.info("FastMCP server initialized.")
65
+
66
@mcp.tool()
async def ingest_document(file_path: str, file_type: Optional[str] = None) -> Dict[str, Any]:
    """
    Process and index a document from a local file path for searching.
    Automatically determines file_type if not provided.
    """
    logger.info(f"Tool 'ingest_document' called with file_path: {file_path}, file_type: {file_type}")
    try:
        resolved_type = file_type
        if not resolved_type:
            # Derive the type from the extension, e.g. "report.PDF" -> "pdf".
            resolved_type = Path(file_path).suffix.lower().strip('.')
            logger.info(f"Inferred file_type: {resolved_type}")

        outcome = await ingestion_tool_instance.process_document(file_path, resolved_type)
        logger.info(f"Ingestion result: {outcome}")
        return outcome
    except Exception as exc:
        # Tool errors are reported to the caller as a result, not raised.
        logger.error(f"Error in 'ingest_document' tool: {str(exc)}", exc_info=True)
        return {"success": False, "error": str(exc)}
84
+
85
@mcp.tool()
async def semantic_search(query: str, top_k: int = 5, filters: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """
    Search through indexed content using natural language.
    'filters' can be used to narrow down the search.
    """
    logger.info(f"Tool 'semantic_search' called with query: {query}, top_k: {top_k}, filters: {filters}")
    try:
        hits = await search_tool_instance.search(query, top_k, filters)
        # Serialisation stays inside the try block so a failing to_dict()
        # is reported through the same error payload as a failing search.
        payload: Dict[str, Any] = {"success": True, "query": query}
        payload["results"] = [hit.to_dict() for hit in hits]
        payload["total_results"] = len(hits)
        return payload
    except Exception as exc:
        logger.error(f"Error in 'semantic_search' tool: {str(exc)}", exc_info=True)
        return {"success": False, "error": str(exc), "results": []}
103
+
104
@mcp.tool()
async def summarize_content(
    content: Optional[str] = None,
    document_id: Optional[str] = None,
    style: str = "concise"
) -> Dict[str, Any]:
    """
    Generate a summary of provided content or a document_id.
    Available styles: concise, detailed, bullet_points, executive.
    """
    logger.info(f"Tool 'summarize_content' called. doc_id: {document_id}, style: {style}, has_content: {content is not None}")
    try:
        # Explicit content wins; the document store is only consulted when
        # no (non-empty) content was passed.
        text_to_summarize = content
        if document_id and not text_to_summarize:
            doc = await document_store_service.get_document(document_id)
            if not doc:
                return {"success": False, "error": f"Document {document_id} not found"}
            text_to_summarize = doc.content
        if not text_to_summarize:
            return {"success": False, "error": "No content provided for summarization"}

        # Capture the length BEFORE truncating so "original_length" reports
        # the real input size rather than the truncated size plus ellipsis.
        original_length = len(text_to_summarize)
        max_length = 10000
        if original_length > max_length:
            logger.warning(f"Content for summarization is long ({original_length} chars), truncating to {max_length}")
            text_to_summarize = text_to_summarize[:max_length] + "..."

        summary = await generative_tool_instance.summarize(text_to_summarize, style)
        return {
            "success": True,
            "summary": summary,
            "original_length": original_length,
            "summary_length": len(summary),
            "style": style
        }
    except Exception as e:
        # Tool boundary: report the failure as data instead of raising.
        logger.error(f"Error in 'summarize_content' tool: {str(e)}", exc_info=True)
        return {"success": False, "error": str(e)}
139
+
140
@mcp.tool()
async def generate_tags(
    content: Optional[str] = None,
    document_id: Optional[str] = None,
    max_tags: int = 5
) -> Dict[str, Any]:
    """
    Generate relevant tags for content or a document_id.
    Saves tags to document metadata if document_id is provided.
    """
    logger.info(f"Tool 'generate_tags' called. doc_id: {document_id}, max_tags: {max_tags}, has_content: {content is not None}")
    try:
        source_text = content
        if not source_text and document_id:
            # Fall back to the stored document body when no text was passed.
            stored_doc = await document_store_service.get_document(document_id)
            if not stored_doc:
                return {"success": False, "error": f"Document {document_id} not found"}
            source_text = stored_doc.content

        if not source_text:
            return {"success": False, "error": "No content provided for tag generation"}

        tags = await generative_tool_instance.generate_tags(source_text, max_tags)

        # Persist only when there is both a target document and at least one tag.
        if tags and document_id:
            await document_store_service.update_document_metadata(document_id, {"tags": tags})
            logger.info(f"Tags {tags} saved for document {document_id}")

        response: Dict[str, Any] = {"success": True, "tags": tags}
        response["content_length"] = len(source_text)
        response["document_id"] = document_id
        return response
    except Exception as exc:
        logger.error(f"Error in 'generate_tags' tool: {str(exc)}", exc_info=True)
        return {"success": False, "error": str(exc)}
173
+
174
@mcp.tool()
async def answer_question(question: str, context_filter: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """
    Answer questions using RAG (Retrieval Augmented Generation) over indexed content.
    'context_filter' can be used to narrow down the context search.
    """
    logger.info(f"Tool 'answer_question' called with question: {question}, context_filter: {context_filter}")
    try:
        retrieved = await search_tool_instance.search(question, top_k=5, filters=context_filter)

        if not retrieved:
            # Nothing retrieved: answer with a fixed message rather than
            # calling the LLM without context.
            no_context: Dict[str, Any] = {"success": False}
            no_context["error"] = "No relevant context found. Please upload relevant documents."
            no_context["question"] = question
            no_context["answer"] = "I could not find enough information in the documents to answer your question."
            return no_context

        answer = await generative_tool_instance.answer_question(question, retrieved)

        # Confidence is a coarse label derived only from the number of hits.
        if len(retrieved) >= 3:
            confidence = "high"
        else:
            confidence = "medium"

        return {
            "success": True,
            "question": question,
            "answer": answer,
            "sources": [hit.to_dict() for hit in retrieved],
            "confidence": confidence
        }
    except Exception as exc:
        logger.error(f"Error in 'answer_question' tool: {str(exc)}", exc_info=True)
        return {"success": False, "error": str(exc)}
201
+
202
@mcp.tool()
async def voice_qa(question: str, session_id: Optional[str] = None) -> Dict[str, Any]:
    """
    Ask a question using the AI voice assistant with RAG capabilities.
    Provides text-based Q&A powered by LlamaIndex agentic search.
    """
    logger.info(f"Tool 'voice_qa' called with question: {question}")
    try:
        # Awaited inside the try block so failures from the voice tool are
        # converted into the error payload below.
        return await voice_tool_instance.voice_qa(question, session_id)
    except Exception as exc:
        logger.error(f"Error in 'voice_qa' tool: {str(exc)}", exc_info=True)
        return {"success": False, "error": str(exc)}
215
+
216
@mcp.tool()
async def generate_podcast(
    document_ids: List[str],
    style: str = "conversational",
    duration_minutes: int = 10,
    host1_voice: str = "Rachel",
    host2_voice: str = "Adam"
) -> Dict[str, Any]:
    """
    Generate a podcast from selected documents.
    Styles: conversational, educational, technical, casual.
    Duration: 5-30 minutes recommended.
    Voices: Rachel, Adam, Domi, Bella, Antoni, Josh, Sam, Emily, etc.
    """
    logger.info(f"Tool 'generate_podcast' called with {len(document_ids)} docs, style: {style}")
    # Arguments are forwarded unchanged, by keyword, to the podcast tool.
    podcast_options = {
        "document_ids": document_ids,
        "style": style,
        "duration_minutes": duration_minutes,
        "host1_voice": host1_voice,
        "host2_voice": host2_voice,
    }
    try:
        return await podcast_tool_instance.generate_podcast(**podcast_options)
    except Exception as exc:
        logger.error(f"Error in 'generate_podcast' tool: {str(exc)}", exc_info=True)
        return {"success": False, "error": str(exc)}
243
+
244
@mcp.tool()
async def list_documents_for_ui(limit: int = 100, offset: int = 0) -> Dict[str, Any]:
    """
    (UI Helper) List documents from the document store.
    Not a standard processing tool, but useful for UI population.
    """
    logger.info(f"Tool 'list_documents_for_ui' called with limit: {limit}, offset: {offset}")
    try:
        page = await document_store_service.list_documents(limit, offset)
        listing: Dict[str, Any] = {"success": True}
        listing["documents"] = [entry.to_dict() for entry in page]
        # "total" is the size of the returned page, not of the whole store.
        listing["total"] = len(page)
        return listing
    except Exception as exc:
        logger.error(f"Error in 'list_documents_for_ui' tool: {str(exc)}", exc_info=True)
        return {"success": False, "error": str(exc), "documents": []}
261
+
262
+ # Blaxel Deployment Support
263
+ from fastapi import FastAPI
264
+ from mcp.server.streamable_http import StreamableHTTPServerTransport
265
+ import os
266
+
267
# Get Blaxel environment variables
# Bind address and port are read from BL_SERVER_HOST / BL_SERVER_PORT and
# default to 0.0.0.0:8000 when the variables are not set.
host = os.getenv("BL_SERVER_HOST", "0.0.0.0")
port = int(os.getenv("BL_SERVER_PORT", "8000"))

# Create FastAPI app
app = FastAPI()

# Initialize HTTP transport instead of stdio
# NOTE(review): in the MCP Python SDK, StreamableHTTPServerTransport appears to
# require an mcp_session_id argument, and FastMCP appears to expose
# streamable_http_app() rather than connect()/mount(). Verify the three calls
# below against the installed SDK version before deploying.
transport = StreamableHTTPServerTransport()

# Connect MCP server to HTTP transport
mcp.connect(transport)

# Mount transport to FastAPI
transport.mount(app)
282
+
283
@app.get("/health")
async def health_check():
    """Health check endpoint for Modal"""
    # Static payload: this handler does not inspect any service before
    # reporting "healthy".
    status_payload = {"status": "healthy"}
    status_payload["service"] = "mcp-server"
    return status_payload
287
+
288
# Script entry point: serve the FastAPI app with uvicorn on the host/port
# resolved from the environment above. uvicorn is imported lazily so that
# merely importing this module does not require it.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host=host, port=port)
mcp_tools/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # MCP tools module initialization
mcp_tools/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (150 Bytes). View file
 
mcp_tools/__pycache__/generative_tool.cpython-313.pyc ADDED
Binary file (21.4 kB). View file
 
mcp_tools/__pycache__/ingestion_tool.cpython-313.pyc ADDED
Binary file (16.8 kB). View file
 
mcp_tools/__pycache__/podcast_tool.cpython-313.pyc ADDED
Binary file (5.19 kB). View file
 
mcp_tools/__pycache__/search_tool.cpython-313.pyc ADDED
Binary file (22 kB). View file
 
mcp_tools/__pycache__/voice_tool.cpython-313.pyc ADDED
Binary file (2.42 kB). View file
 
mcp_tools/generative_tool.py ADDED
@@ -0,0 +1,407 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from typing import List, Dict, Any, Optional
3
+ import asyncio
4
+
5
+ from services.llm_service import LLMService
6
+ from mcp_tools.search_tool import SearchTool
7
+ from core.models import SearchResult
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
class GenerativeTool:
    """LLM-backed generation helpers: summaries, tags, categories, Q&A and more.

    Every public coroutine traps its own exceptions, logs them, and returns a
    fallback value (an error string, an empty list or "Uncategorized")
    instead of raising.
    """

    def __init__(self, llm_service: "LLMService", search_tool: Optional["SearchTool"] = None):
        """Store the LLM service and the optional search tool used for retrieval."""
        self.llm_service = llm_service
        self.search_tool = search_tool

    async def summarize(self, content: str, style: str = "concise", max_length: Optional[int] = None) -> str:
        """Return a summary of *content*, or an explanatory string on failure.

        Args:
            content: Text to summarize; blank text yields a fixed message.
            style: Summary style forwarded to the LLM service.
            max_length: Optional length limit forwarded to the LLM service.
        """
        try:
            if not content.strip():
                return "No content provided for summarization."

            logger.info(f"Generating {style} summary for content of length {len(content)}")

            summary = await self.llm_service.summarize(content, style, max_length)

            logger.info(f"Generated summary of length {len(summary)}")
            return summary

        except Exception as e:
            logger.error(f"Error generating summary: {str(e)}")
            return f"Error generating summary: {str(e)}"

    async def generate_tags(self, content: str, max_tags: int = 5) -> List[str]:
        """Return up to *max_tags* tags for *content*; an empty list on blank input or failure."""
        try:
            if not content.strip():
                return []

            logger.info(f"Generating up to {max_tags} tags for content")

            tags = await self.llm_service.generate_tags(content, max_tags)

            logger.info(f"Generated {len(tags)} tags")
            return tags

        except Exception as e:
            logger.error(f"Error generating tags: {str(e)}")
            return []

    async def categorize(self, content: str, categories: List[str]) -> str:
        """Pick one category for *content*.

        An empty *categories* list is replaced by a built-in default set.
        Returns "Uncategorized" on blank input or failure.
        """
        try:
            if not content.strip():
                return "Uncategorized"

            if not categories:
                categories = ["Technology", "Business", "Science", "Education", "Entertainment", "News", "Research", "Other"]

            logger.info(f"Categorizing content into one of {len(categories)} categories")

            category = await self.llm_service.categorize(content, categories)

            logger.info(f"Categorized as: {category}")
            return category

        except Exception as e:
            logger.error(f"Error categorizing content: {str(e)}")
            return "Uncategorized"

    async def answer_question(self, question: str, context_results: Optional[List["SearchResult"]] = None) -> str:
        """Answer *question* from the supplied search results, searching if none are given.

        Args:
            question: The question text; blank input yields a fixed message.
            context_results: Pre-fetched search results. When empty/None and a
                search tool is configured, the top 5 results are fetched.

        Returns:
            The generated answer, or an apology string containing the error.
        """
        try:
            if not question.strip():
                return "No question provided."

            logger.info(f"Answering question: {question[:100]}...")

            # If no context provided and search tool is available, search for relevant context
            if not context_results and self.search_tool:
                logger.info("No context provided, searching for relevant information")
                context_results = await self.search_tool.search(question, top_k=5)

            # Prepare context from search results
            if context_results:
                context_texts = [
                    f"Source: {result.document_id}\nContent: {result.content}\n"
                    for result in context_results
                ]
                context = "\n---\n".join(context_texts)
                logger.info(f"Using context from {len(context_results)} sources")
            else:
                context = ""
                logger.info("No context available for answering question")

            # Generate answer
            answer = await self.llm_service.answer_question(question, context)

            logger.info(f"Generated answer of length {len(answer)}")
            return answer

        except Exception as e:
            logger.error(f"Error answering question: {str(e)}")
            return f"I encountered an error while trying to answer your question: {str(e)}"

    async def generate_outline(self, topic: str, num_sections: int = 5, detail_level: str = "medium") -> str:
        """Generate an outline with *num_sections* main sections for *topic*.

        *detail_level* is one of "brief", "medium", "detailed"; unknown values
        fall back to a generic description.
        """
        try:
            if not topic.strip():
                return "No topic provided."

            detail_descriptions = {
                "brief": "brief bullet points",
                "medium": "detailed bullet points with descriptions",
                "detailed": "comprehensive outline with sub-sections and explanations"
            }

            detail_desc = detail_descriptions.get(detail_level, "detailed bullet points")

            prompt = "\n".join([
                f'Create a {detail_desc} outline for the topic: "{topic}"',
                "",
                f"The outline should have {num_sections} main sections and be well-structured and informative.",
                "",
                "Format the outline clearly with proper numbering and indentation.",
                "",
                f"Topic: {topic}",
                "",
                "Outline:",
            ])

            outline = await self.llm_service.generate_text(prompt, max_tokens=800, temperature=0.7)

            logger.info(f"Generated outline for topic: {topic}")
            return outline

        except Exception as e:
            logger.error(f"Error generating outline: {str(e)}")
            return f"Error generating outline: {str(e)}"

    async def explain_concept(self, concept: str, audience: str = "general", length: str = "medium") -> str:
        """Explain *concept* for the given *audience* at the requested *length*.

        Unknown audience or length values fall back to generic wording.
        """
        try:
            if not concept.strip():
                return "No concept provided."

            audience_styles = {
                "general": "a general audience using simple, clear language",
                "technical": "a technical audience with appropriate jargon and detail",
                "beginner": "beginners with no prior knowledge, using analogies and examples",
                "expert": "experts in the field with advanced terminology and depth"
            }

            length_guidance = {
                "brief": "Keep the explanation concise and to the point (2-3 paragraphs).",
                "medium": "Provide a comprehensive explanation (4-6 paragraphs).",
                "detailed": "Give a thorough, in-depth explanation with examples."
            }

            audience_desc = audience_styles.get(audience, "a general audience")
            length_desc = length_guidance.get(length, "Provide a comprehensive explanation.")

            prompt = "\n".join([
                f'Explain the concept of "{concept}" for {audience_desc}.',
                "",
                f"{length_desc}",
                "",
                "Make sure to:",
                "- Use appropriate language for the audience",
                "- Include relevant examples or analogies",
                "- Structure the explanation logically",
                "- Ensure clarity and accuracy",
                "",
                f"Concept to explain: {concept}",
                "",
                "Explanation:",
            ])

            explanation = await self.llm_service.generate_text(prompt, max_tokens=600, temperature=0.5)

            logger.info(f"Generated explanation for concept: {concept}")
            return explanation

        except Exception as e:
            logger.error(f"Error explaining concept: {str(e)}")
            return f"Error explaining concept: {str(e)}"

    async def compare_concepts(self, concept1: str, concept2: str, aspects: Optional[List[str]] = None) -> str:
        """Compare two concepts across *aspects* (a default set is used when empty)."""
        try:
            if not concept1.strip() or not concept2.strip():
                return "Both concepts must be provided for comparison."

            if not aspects:
                aspects = ["definition", "key features", "advantages", "disadvantages", "use cases"]

            aspects_str = ", ".join(aspects)

            # The per-aspect format section is emitted once (the earlier
            # prompt repeated it verbatim).
            prompt = "\n".join([
                f'Compare and contrast "{concept1}" and "{concept2}" across the following aspects: {aspects_str}.',
                "",
                "Structure your comparison clearly, addressing each aspect for both concepts.",
                "",
                "Format:",
                f"## Comparison: {concept1} vs {concept2}",
                "",
                "For each aspect, provide:",
                f"- **{concept1}**: [description]",
                f"- **{concept2}**: [description]",
                "- **Key Difference**: [summary]",
                "",
                "Concepts to compare:",
                f"1. {concept1}",
                f"2. {concept2}",
                "",
                "Comparison:",
            ])

            comparison = await self.llm_service.generate_text(prompt, max_tokens=800, temperature=0.6)

            logger.info(f"Generated comparison between {concept1} and {concept2}")
            return comparison

        except Exception as e:
            logger.error(f"Error comparing concepts: {str(e)}")
            return f"Error comparing concepts: {str(e)}"

    @staticmethod
    def _is_list_item(line: str) -> bool:
        """Return True if *line* starts with '-', '*' or a number followed by '.'."""
        if line.startswith(('-', '*')):
            return True
        rest = line.lstrip('0123456789')
        return len(rest) < len(line) and rest.startswith('.')

    @staticmethod
    def _strip_list_marker(line: str) -> str:
        """Remove one leading numeric marker ('12.') and any leading '-', '*' or spaces.

        Digits that belong to the item text itself (e.g. "1. 2023 was ...")
        are kept; only the marker is removed.
        """
        text = line.strip()
        after_digits = text.lstrip('0123456789')
        if len(after_digits) < len(text) and after_digits.startswith('.'):
            text = after_digits[1:]
        return text.lstrip('-* ').strip()

    @staticmethod
    def _parse_questions(response: str) -> List[str]:
        """Extract every line of *response* that contains a '?' after marker removal."""
        questions = []
        for line in response.split('\n'):
            candidate = GenerativeTool._strip_list_marker(line)
            if '?' in candidate:
                questions.append(candidate)
        return questions

    @staticmethod
    def _parse_insights(response: str) -> List[str]:
        """Extract list items longer than 10 characters from *response*.

        A line is considered when it is a bullet/numbered item, or when no
        insight has been collected yet (so an unformatted reply still yields
        its first line).
        """
        insights: List[str] = []
        for line in response.split('\n'):
            line = line.strip()
            if not line:
                continue
            if GenerativeTool._is_list_item(line) or not insights:
                insight = GenerativeTool._strip_list_marker(line)
                if len(insight) > 10:  # Minimum insight length
                    insights.append(insight)
        return insights

    async def generate_questions(self, content: str, question_type: str = "comprehension", num_questions: int = 5) -> List[str]:
        """Generate up to *num_questions* questions about *content*.

        Only the first 2000 characters of *content* are sent to the LLM.
        Returns an empty list on blank input or failure.
        """
        try:
            if not content.strip():
                return []

            question_types = {
                "comprehension": "comprehension questions that test understanding of key concepts",
                "analysis": "analytical questions that require deeper thinking and evaluation",
                "application": "application questions that ask how to use the concepts in practice",
                "creative": "creative questions that encourage original thinking and exploration",
                "factual": "factual questions about specific details and information"
            }

            question_desc = question_types.get(question_type, "comprehension questions")

            # Limit content length here, outside the prompt text, so the
            # explanatory comment is not sent to the model.
            excerpt = content[:2000]

            prompt = "\n".join([
                f"Based on the following content, generate {num_questions} {question_desc}.",
                "",
                "The questions should be:",
                "- Clear and well-formulated",
                "- Relevant to the content",
                "- Appropriate for the specified type",
                "- Engaging and thought-provoking",
                "",
                "Content:",
                f"{excerpt}",
                "",
                "Questions:",
            ])

            response = await self.llm_service.generate_text(prompt, max_tokens=400, temperature=0.7)

            questions = self._parse_questions(response)

            logger.info(f"Generated {len(questions)} {question_type} questions")
            return questions[:num_questions]

        except Exception as e:
            logger.error(f"Error generating questions: {str(e)}")
            return []

    @staticmethod
    def _split_oversized(paragraph: str, chunk_size: int) -> List[str]:
        """Split one paragraph into pieces of at most *chunk_size* characters.

        Sentences (split on '. ' and on newlines) are packed greedily; a single
        sentence longer than *chunk_size* is cut at fixed offsets.
        """
        pieces: List[str] = []
        buffer = ""
        for sentence in paragraph.replace('. ', '.\n').split('\n'):
            sentence = sentence.strip()
            # Hard-split a sentence that cannot fit into any chunk on its own.
            while len(sentence) > chunk_size:
                if buffer:
                    pieces.append(buffer)
                    buffer = ""
                pieces.append(sentence[:chunk_size].strip())
                sentence = sentence[chunk_size:].strip()
            if not sentence:
                continue
            candidate = f"{buffer} {sentence}" if buffer else sentence
            if len(candidate) <= chunk_size:
                buffer = candidate
            else:
                # candidate only overflows when buffer is non-empty, because
                # the sentence alone is known to fit.
                pieces.append(buffer)
                buffer = sentence
        if buffer:
            pieces.append(buffer)
        return pieces

    def _chunk_text(self, text: str, chunk_size: int = 2000) -> List[str]:
        """Split *text* into chunks of at most *chunk_size* characters.

        Paragraphs (separated by blank lines) are kept together where possible;
        an oversized paragraph is split by sentence, and an oversized sentence
        at fixed offsets. Text that already fits is returned unchanged as a
        single chunk.

        Raises:
            ValueError: if *chunk_size* is smaller than 1.
        """
        if chunk_size < 1:
            raise ValueError("chunk_size must be a positive integer")
        if len(text) <= chunk_size:
            return [text]

        chunks: List[str] = []
        current_chunk = ""

        # Split by paragraphs first
        for para in text.split('\n\n'):
            if len(current_chunk) + len(para) + 2 <= chunk_size:
                current_chunk += para + "\n\n"
                continue

            # The paragraph does not fit: flush what has been collected.
            if current_chunk.strip():
                chunks.append(current_chunk.strip())

            if len(para) <= chunk_size:
                current_chunk = para + "\n\n"
                continue

            # A single paragraph is too long: split it further and carry the
            # last piece over so following paragraphs can still join it.
            pieces = self._split_oversized(para, chunk_size)
            chunks.extend(pieces[:-1])
            current_chunk = pieces[-1] + "\n\n" if pieces else ""

        if current_chunk.strip():
            chunks.append(current_chunk.strip())

        return chunks

    async def paraphrase_text(self, text: str, style: str = "formal", preserve_meaning: bool = True) -> str:
        """Paraphrase *text* in the requested *style*.

        Text longer than 2500 characters is split with `_chunk_text`, each
        chunk is paraphrased separately and the results are joined with blank
        lines. Returns an error string on failure.
        """
        try:
            if not text.strip():
                return "No text provided for paraphrasing."

            # Check length and chunk if necessary
            MAX_CHUNK_SIZE = 2500
            if len(text) > MAX_CHUNK_SIZE:
                logger.info(f"Text length {len(text)} exceeds limit, chunking...")
                chunks = self._chunk_text(text, MAX_CHUNK_SIZE)
                logger.info(f"Split into {len(chunks)} chunks")

                paraphrased_chunks = []
                for i, chunk in enumerate(chunks):
                    logger.info(f"Processing chunk {i+1}/{len(chunks)}")
                    if i:
                        # Small delay between requests to be nice to rate limits
                        await asyncio.sleep(0.5)
                    # Every chunk is <= MAX_CHUNK_SIZE, so this nested call
                    # always takes the single-request path below.
                    paraphrased_chunks.append(
                        await self.paraphrase_text(chunk, style, preserve_meaning)
                    )

                return "\n\n".join(paraphrased_chunks)

            style_instructions = {
                "formal": "formal, professional language",
                "casual": "casual, conversational language",
                "academic": "academic, scholarly language",
                "simple": "simple, easy-to-understand language",
                "technical": "technical, precise language"
            }

            style_desc = style_instructions.get(style, "clear, appropriate language")
            meaning_instruction = "while preserving the exact meaning and key information" if preserve_meaning else "while maintaining the general intent"

            prompt = "\n".join([
                f"Paraphrase the following text using {style_desc} {meaning_instruction}.",
                "",
                "Original text:",
                f"{text}",
                "",
                "Paraphrased text:",
            ])

            # Budget roughly two tokens per input word, with a floor so very
            # short inputs are not cut off after a couple of tokens.
            token_budget = max(len(text.split()) * 2, 50)
            paraphrase = await self.llm_service.generate_text(prompt, max_tokens=token_budget, temperature=0.6)

            logger.info(f"Paraphrased text in {style} style")
            return paraphrase.strip()

        except Exception as e:
            logger.error(f"Error paraphrasing text: {str(e)}")
            return f"Error paraphrasing text: {str(e)}"

    async def extract_key_insights(self, content: str, num_insights: int = 5) -> List[str]:
        """Extract up to *num_insights* key insights from *content*.

        Only the first 3000 characters of *content* are sent to the LLM.
        Returns an empty list on blank input or failure.
        """
        try:
            if not content.strip():
                return []

            # Limit content length here, outside the prompt text, so the
            # explanatory comment is not sent to the model.
            excerpt = content[:3000]

            prompt = "\n".join([
                f"Analyze the following content and extract {num_insights} key insights or takeaways.",
                "",
                "Each insight should be:",
                "- A clear, concise statement",
                "- Significant and meaningful",
                "- Based on the content provided",
                "- Actionable or thought-provoking when possible",
                "",
                "Content:",
                f"{excerpt}",
                "",
                "Key Insights:",
            ])

            response = await self.llm_service.generate_text(prompt, max_tokens=400, temperature=0.6)

            insights = self._parse_insights(response)

            logger.info(f"Extracted {len(insights)} key insights")
            return insights[:num_insights]

        except Exception as e:
            logger.error(f"Error extracting insights: {str(e)}")
            return []
mcp_tools/ingestion_tool.py ADDED
@@ -0,0 +1,368 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import asyncio
3
+ from typing import Dict, Any, Optional
4
+ import tempfile
5
+ import os
6
+ from pathlib import Path
7
+ import uuid
8
+
9
+ from core.document_parser import DocumentParser
10
+ from core.chunker import TextChunker
11
+ from core.text_preprocessor import TextPreprocessor
12
+ from services.vector_store_service import VectorStoreService
13
+ from services.document_store_service import DocumentStoreService
14
+ from services.embedding_service import EmbeddingService
15
+ from services.ocr_service import OCRService
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+ class IngestionTool:
20
+ def __init__(self, vector_store: VectorStoreService, document_store: DocumentStoreService,
21
+ embedding_service: EmbeddingService, ocr_service: OCRService):
22
+ self.vector_store = vector_store
23
+ self.document_store = document_store
24
+ self.embedding_service = embedding_service
25
+ self.ocr_service = ocr_service
26
+
27
+ self.document_parser = DocumentParser()
28
+ # Pass OCR service to document parser
29
+ self.document_parser.ocr_service = ocr_service
30
+
31
+ self.text_chunker = TextChunker()
32
+ self.text_preprocessor = TextPreprocessor()
33
+
34
+ async def process_document(self, file_path: str, file_type: str, task_id: Optional[str] = None) -> Dict[str, Any]:
35
+ """Process a document through the full ingestion pipeline"""
36
+ if task_id is None:
37
+ task_id = str(uuid.uuid4())
38
+
39
+ try:
40
+ logger.info(f"Starting document processing for {file_path}")
41
+
42
+ # Step 1: Parse the document
43
+ filename = Path(file_path).name
44
+ document = await self.document_parser.parse_document(file_path, filename)
45
+
46
+ if not document.content:
47
+ logger.warning(f"No content extracted from document {filename}")
48
+ return {
49
+ "success": False,
50
+ "error": "No content could be extracted from the document",
51
+ "task_id": task_id
52
+ }
53
+
54
+ # Step 2: Store the document
55
+ await self.document_store.store_document(document)
56
+
57
+ # Step 3: Process content for embeddings
58
+ chunks = await self._create_and_embed_chunks(document)
59
+
60
+ if not chunks:
61
+ logger.warning(f"No chunks created for document {document.id}")
62
+ return {
63
+ "success": False,
64
+ "error": "Failed to create text chunks",
65
+ "task_id": task_id,
66
+ "document_id": document.id,
67
+ "filename": document.filename,
68
+ "chunks_created": len(chunks),
69
+ "content_length": len(document.content),
70
+ "doc_type": document.doc_type.value,
71
+ "message": f"Successfully processed {filename}"
72
+ }
73
+
74
+ # Step 4: Store embeddings
75
+ success = await self.vector_store.add_chunks(chunks)
76
+
77
+ if not success:
78
+ logger.error(f"Failed to store embeddings for document {document.id}")
79
+ return {
80
+ "success": False,
81
+ "error": "Failed to store embeddings",
82
+ "task_id": task_id,
83
+ "document_id": document.id
84
+ }
85
+
86
+ # Step 5: Update document metadata with chunk count
87
+ try:
88
+ current_metadata = document.metadata or {}
89
+ current_metadata["chunk_count"] = len(chunks)
90
+ await self.document_store.update_document_metadata(
91
+ document.id,
92
+ {"metadata": current_metadata}
93
+ )
94
+ except Exception as e:
95
+ logger.warning(f"Failed to update chunk count for document {document.id}: {e}")
96
+
97
+ logger.info(f"Successfully processed document {document.id} with {len(chunks)} chunks")
98
+
99
+ return {
100
+ "success": True,
101
+ "task_id": task_id,
102
+ "document_id": document.id,
103
+ "filename": document.filename,
104
+ "chunks_created": len(chunks),
105
+ "content_length": len(document.content),
106
+ "doc_type": document.doc_type.value,
107
+ "message": f"Successfully processed {filename}"
108
+ }
109
+
110
+ except Exception as e:
111
+ logger.error(f"Error processing document {file_path}: {str(e)}")
112
+ return {
113
+ "success": False,
114
+ "error": str(e),
115
+ "task_id": task_id,
116
+ "message": f"Failed to process document: {str(e)}"
117
+ }
118
+
119
+ async def _create_and_embed_chunks(self, document) -> list:
120
+ """Create chunks and generate embeddings"""
121
+ try:
122
+ # Step 1: Create chunks
123
+ chunks = self.text_chunker.chunk_document(
124
+ document.id,
125
+ document.content,
126
+ method="recursive"
127
+ )
128
+
129
+ if not chunks:
130
+ return []
131
+
132
+ # Step 2: Optimize chunks for embedding
133
+ optimized_chunks = self.text_chunker.optimize_chunks_for_embedding(chunks)
134
+
135
+ # Step 3: Generate embeddings
136
+ texts = [chunk.content for chunk in optimized_chunks]
137
+ embeddings = await self.embedding_service.generate_embeddings(texts)
138
+
139
+ # Step 4: Add embeddings to chunks
140
+ embedded_chunks = []
141
+ for i, chunk in enumerate(optimized_chunks):
142
+ if i < len(embeddings):
143
+ chunk.embedding = embeddings[i]
144
+ embedded_chunks.append(chunk)
145
+
146
+ return embedded_chunks
147
+
148
+ except Exception as e:
149
+ logger.error(f"Error creating and embedding chunks: {str(e)}")
150
+ return []
151
+
152
+ async def process_url(self, url: str, task_id: Optional[str] = None) -> Dict[str, Any]:
153
+ """Process a document from a URL"""
154
+ try:
155
+ import requests
156
+ from urllib.parse import urlparse
157
+
158
+ # Download the file
159
+ response = requests.get(url, timeout=30)
160
+ response.raise_for_status()
161
+
162
+ # Determine file type from URL or content-type
163
+ parsed_url = urlparse(url)
164
+ filename = Path(parsed_url.path).name or "downloaded_file"
165
+
166
+ # Create temporary file
167
+ with tempfile.NamedTemporaryFile(delete=False, suffix=f"_{filename}") as tmp_file:
168
+ tmp_file.write(response.content)
169
+ tmp_file_path = tmp_file.name
170
+
171
+ try:
172
+ # Process the downloaded file
173
+ result = await self.process_document(tmp_file_path, "", task_id)
174
+ result["source_url"] = url
175
+ return result
176
+ finally:
177
+ # Clean up temporary file
178
+ if os.path.exists(tmp_file_path):
179
+ os.unlink(tmp_file_path)
180
+
181
+ except Exception as e:
182
+ logger.error(f"Error processing URL {url}: {str(e)}")
183
+ return {
184
+ "success": False,
185
+ "error": str(e),
186
+ "task_id": task_id or str(uuid.uuid4()),
187
+ "source_url": url
188
+ }
189
+
190
async def process_text_content(self, content: str, filename: str = "text_content.txt",
                               task_id: Optional[str] = None) -> Dict[str, Any]:
    """Store raw text as a document and index its chunks.

    Args:
        content: The text to ingest.
        filename: Name recorded on the created document.
        task_id: Optional identifier echoed back in the result; a fresh
            UUID is generated when omitted.

    Returns:
        A result dictionary. On success it carries ``document_id``,
        ``chunks_created`` and ``content_length``; on failure it carries
        ``error``.
    """
    # Reject empty / whitespace-only input up front instead of storing a
    # document that has nothing to embed.
    if not content or not content.strip():
        return {
            "success": False,
            "error": "No text content provided",
            "task_id": task_id or str(uuid.uuid4())
        }

    try:
        from core.models import Document, DocumentType
        from datetime import datetime

        # Create document object
        document = Document(
            id=str(uuid.uuid4()),
            filename=filename,
            content=content,
            doc_type=DocumentType.TEXT,
            file_size=len(content.encode('utf-8')),
            created_at=datetime.utcnow(),
            metadata={
                "source": "direct_text_input",
                "content_length": len(content),
                "word_count": len(content.split())
            }
        )

        # Store the document
        await self.document_store.store_document(document)

        # Process content for embeddings
        chunks = await self._create_and_embed_chunks(document)

        if chunks:
            await self.vector_store.add_chunks(chunks)

        # Recording the chunk count is best-effort: a failure here is
        # logged but does not fail the ingestion.
        try:
            current_metadata = document.metadata or {}
            current_metadata["chunk_count"] = len(chunks)
            await self.document_store.update_document_metadata(
                document.id,
                {"metadata": current_metadata}
            )
        except Exception as e:
            logger.warning(f"Failed to update chunk count for document {document.id}: {e}")

        return {
            "success": True,
            "task_id": task_id or str(uuid.uuid4()),
            "document_id": document.id,
            "filename": filename,
            "chunks_created": len(chunks),
            "content_length": len(content),
            "message": "Successfully processed text content"
        }

    except Exception as e:
        logger.error(f"Error processing text content: {str(e)}")
        return {
            "success": False,
            "error": str(e),
            "task_id": task_id or str(uuid.uuid4())
        }
249
+
250
async def reprocess_document(self, document_id: str, task_id: Optional[str] = None) -> Dict[str, Any]:
    """Rebuild the embeddings of an already stored document.

    The replacement chunks are created *before* the old vectors are
    deleted. ``_create_and_embed_chunks`` returns an empty list when
    chunking or embedding fails, and deleting first would then leave the
    document without any searchable content.

    Args:
        document_id: Identifier of the document to reprocess.
        task_id: Optional identifier echoed back in the result; a fresh
            UUID is generated when omitted.

    Returns:
        A result dictionary with ``success``; on success also
        ``document_id``, ``filename`` and ``chunks_created``, otherwise
        ``error``.
    """
    try:
        # Get the document
        document = await self.document_store.get_document(document_id)

        if not document:
            return {
                "success": False,
                "error": f"Document {document_id} not found",
                "task_id": task_id or str(uuid.uuid4())
            }

        # Build the new chunks first; keep the old embeddings if this fails.
        chunks = await self._create_and_embed_chunks(document)

        if not chunks:
            return {
                "success": False,
                "error": (
                    f"No chunks could be created for document {document_id}; "
                    "existing embeddings were kept"
                ),
                "task_id": task_id or str(uuid.uuid4()),
                "document_id": document_id
            }

        # Swap the old vectors for the new ones.
        await self.vector_store.delete_document(document_id)
        await self.vector_store.add_chunks(chunks)

        # Recording the chunk count is best-effort: a failure here is
        # logged but does not fail the reprocessing.
        try:
            current_metadata = document.metadata or {}
            current_metadata["chunk_count"] = len(chunks)
            await self.document_store.update_document_metadata(
                document.id,
                {"metadata": current_metadata}
            )
        except Exception as e:
            logger.warning(f"Failed to update chunk count for document {document.id}: {e}")

        return {
            "success": True,
            "task_id": task_id or str(uuid.uuid4()),
            "document_id": document_id,
            "filename": document.filename,
            "chunks_created": len(chunks),
            "message": f"Successfully reprocessed {document.filename}"
        }

    except Exception as e:
        logger.error(f"Error reprocessing document {document_id}: {str(e)}")
        return {
            "success": False,
            "error": str(e),
            "task_id": task_id or str(uuid.uuid4()),
            "document_id": document_id
        }
300
+
301
async def batch_process_directory(self, directory_path: str, task_id: Optional[str] = None) -> Dict[str, Any]:
    """Process every supported file found directly inside a directory.

    Files are selected by comparing their lower-cased suffix against the
    supported set, so ``.PDF``, ``.Pdf`` and ``.pdf`` are all picked up
    and each file is processed exactly once. Sub-directories are not
    descended into.

    Args:
        directory_path: Directory to scan.
        task_id: Optional identifier echoed back in the result; a fresh
            UUID is generated when omitted.

    Returns:
        A summary dictionary with ``total_files``, ``successful``,
        ``failed`` and the per-file ``results``, or a failure dictionary
        when the directory is missing or holds no supported files.
    """
    try:
        directory = Path(directory_path)
        if not directory.exists() or not directory.is_dir():
            return {
                "success": False,
                "error": f"Directory {directory_path} does not exist",
                "task_id": task_id or str(uuid.uuid4())
            }

        # Supported file extensions (compared case-insensitively)
        supported_extensions = {'.txt', '.pdf', '.docx', '.png', '.jpg', '.jpeg', '.bmp', '.tiff'}

        # Sorted so the processing order is deterministic across platforms.
        files_to_process = sorted(
            entry for entry in directory.iterdir()
            if entry.is_file() and entry.suffix.lower() in supported_extensions
        )

        if not files_to_process:
            return {
                "success": False,
                "error": "No supported files found in directory",
                "task_id": task_id or str(uuid.uuid4())
            }

        # Process files one by one; a failure is recorded and does not stop
        # the rest of the batch.
        results = []
        successful = 0
        failed = 0

        for file_path in files_to_process:
            try:
                result = await self.process_document(str(file_path), file_path.suffix)
                results.append(result)

                if result.get("success"):
                    successful += 1
                else:
                    failed += 1

            except Exception as e:
                failed += 1
                results.append({
                    "success": False,
                    "error": str(e),
                    "filename": file_path.name
                })

        return {
            "success": True,
            "task_id": task_id or str(uuid.uuid4()),
            "directory": str(directory),
            "total_files": len(files_to_process),
            "successful": successful,
            "failed": failed,
            "results": results,
            "message": f"Processed {successful}/{len(files_to_process)} files successfully"
        }

    except Exception as e:
        logger.error(f"Error batch processing directory {directory_path}: {str(e)}")
        return {
            "success": False,
            "error": str(e),
            "task_id": task_id or str(uuid.uuid4())
        }
mcp_tools/podcast_tool.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from typing import Dict, Any, List
3
+ from dataclasses import asdict
4
+
5
+ logger = logging.getLogger(__name__)
6
+
7
class PodcastTool:
    """
    MCP Tool for podcast generation from documents
    """

    def __init__(self, podcast_generator):
        """
        Initialize Podcast Tool

        Args:
            podcast_generator: PodcastGeneratorService instance
        """
        self.podcast_generator = podcast_generator

    async def generate_podcast(
        self,
        document_ids: List[str],
        style: str = "conversational",
        duration_minutes: int = 10,
        host1_voice: str = "Rachel",
        host2_voice: str = "Adam"
    ) -> Dict[str, Any]:
        """
        MCP Tool: Generate podcast from documents

        Args:
            document_ids: List of document IDs to generate podcast from
            style: Podcast style (conversational, educational, technical, casual)
            duration_minutes: Target duration in minutes
            host1_voice: Voice name for first host
            host2_voice: Voice name for second host

        Returns:
            Dictionary with podcast ID, audio URL, transcript, and metadata
        """
        try:
            if not document_ids:
                return {
                    "success": False,
                    "error": "No documents provided. Please select at least one document."
                }

            logger.info(f"Generating podcast from {len(document_ids)} documents")

            # Generate podcast using service
            result = await self.podcast_generator.generate_podcast(
                document_ids=document_ids,
                style=style,
                duration_minutes=duration_minutes,
                host1_voice=host1_voice,
                host2_voice=host2_voice
            )

            if not result.success:
                return {
                    "success": False,
                    "error": result.error or "Unknown error during podcast generation"
                }

            # Metadata is optional: only mention the duration when it is
            # available, so a successful generation without metadata is not
            # turned into an error by an attribute access on None.
            metadata = result.metadata
            if metadata:
                message = (
                    "Podcast generated successfully! "
                    f"Duration: {metadata.duration_seconds/60:.1f} minutes"
                )
            else:
                message = "Podcast generated successfully!"

            return {
                "success": True,
                "podcast_id": result.podcast_id,
                "audio_file": result.audio_file_path,
                "audio_url": f"/data/podcasts/{result.podcast_id}.mp3",
                "transcript": result.transcript,
                "metadata": asdict(metadata) if metadata else {},
                "generation_time": result.generation_time,
                "message": message
            }

        except Exception as e:
            logger.error(f"Podcast generation failed: {str(e)}")
            return {
                "success": False,
                "error": str(e)
            }

    def list_podcasts(self, limit: int = 10) -> Dict[str, Any]:
        """
        List previously generated podcasts

        Args:
            limit: Maximum number of podcasts to return

        Returns:
            Dictionary with list of podcast metadata
        """
        try:
            podcasts = self.podcast_generator.list_podcasts(limit=limit)

            return {
                "success": True,
                "podcasts": [asdict(p) for p in podcasts],
                "total": len(podcasts)
            }
        except Exception as e:
            logger.error(f"Failed to list podcasts: {str(e)}")
            return {
                "success": False,
                "error": str(e),
                "podcasts": []
            }

    def get_podcast(self, podcast_id: str) -> Dict[str, Any]:
        """
        Get specific podcast by ID

        Args:
            podcast_id: Podcast identifier

        Returns:
            Dictionary with podcast metadata
        """
        try:
            podcast = self.podcast_generator.get_podcast(podcast_id)

            if not podcast:
                return {
                    "success": False,
                    "error": "Podcast not found"
                }

            return {
                "success": True,
                "podcast": asdict(podcast)
            }
        except Exception as e:
            logger.error(f"Failed to get podcast: {str(e)}")
            return {
                "success": False,
                "error": str(e)
            }
mcp_tools/search_tool.py ADDED
@@ -0,0 +1,437 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from typing import List, Dict, Any, Optional
3
+ import asyncio
4
+
5
+ from core.models import SearchResult
6
+ from services.vector_store_service import VectorStoreService
7
+ from services.embedding_service import EmbeddingService
8
+ from services.document_store_service import DocumentStoreService
9
+ import config
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
class SearchTool:
    """Semantic search operations built on an embedding service and a vector store.

    A document store (optional) is used to enrich results with document
    metadata and to resolve category / date filters; a LlamaIndex service
    (optional) backs ``agentic_search``.
    """

    def __init__(self, vector_store: VectorStoreService, embedding_service: EmbeddingService,
                 document_store: Optional[DocumentStoreService] = None, llamaindex_service: Any = None):
        self.vector_store = vector_store
        self.embedding_service = embedding_service
        self.document_store = document_store
        self.llamaindex_service = llamaindex_service
        self.config = config.config

    async def search(self, query: str, top_k: int = 5, filters: Optional[Dict[str, Any]] = None,
                     similarity_threshold: Optional[float] = None) -> List[SearchResult]:
        """Perform semantic search.

        Args:
            query: Free-text query; blank queries return no results.
            top_k: Number of candidates requested from the vector store.
            filters: Optional filter dictionary forwarded to the vector store.
            similarity_threshold: Minimum score a result must reach; falls
                back to ``config.SIMILARITY_THRESHOLD`` when omitted.

        Returns:
            Results at or above the threshold (enriched with document
            metadata when a document store is configured); an empty list
            on any error.
        """
        try:
            if not query.strip():
                logger.warning("Empty search query provided")
                return []

            # Use default threshold if not provided
            if similarity_threshold is None:
                similarity_threshold = self.config.SIMILARITY_THRESHOLD

            logger.info(f"Performing semantic search for: '{query}' (top_k={top_k})")

            # Generate query embedding
            query_embedding = await self.embedding_service.generate_single_embedding(query)

            if not query_embedding:
                logger.error("Failed to generate query embedding")
                return []

            # Perform vector search
            results = await self.vector_store.search(
                query_embedding=query_embedding,
                top_k=top_k,
                filters=filters
            )

            # Filter by similarity threshold
            filtered_results = [
                result for result in results
                if result.score >= similarity_threshold
            ]

            logger.info(f"Found {len(filtered_results)} results above threshold {similarity_threshold}")

            # Enhance results with additional metadata if document store is available
            if self.document_store:
                return await self._enhance_results_with_metadata(filtered_results)

            return filtered_results

        except Exception as e:
            logger.error(f"Error performing semantic search: {str(e)}")
            return []

    async def agentic_search(self, query: str) -> str:
        """Perform agentic search using LlamaIndex.

        Returns the service's answer, or a human-readable message when the
        service is missing or raises.
        """
        if not self.llamaindex_service:
            logger.warning("LlamaIndex service not available for agentic search")
            return "Agentic search not available."

        try:
            logger.info(f"Performing agentic search for: '{query}'")
            return await self.llamaindex_service.query(query)
        except Exception as e:
            logger.error(f"Error performing agentic search: {str(e)}")
            return f"Error performing agentic search: {str(e)}"

    async def _enhance_results_with_metadata(self, results: List[SearchResult]) -> List[SearchResult]:
        """Enhance search results with document metadata.

        Each distinct document is fetched from the document store at most
        once per call, even when several chunks belong to it. A result whose
        document is missing, or whose enrichment fails, is returned
        unchanged.
        """
        try:
            enhanced_results = []
            # document_id -> fetched document (or None when not found)
            documents: Dict[Any, Any] = {}

            for result in results:
                try:
                    if result.document_id not in documents:
                        documents[result.document_id] = await self.document_store.get_document(
                            result.document_id
                        )
                    document = documents[result.document_id]

                    if not document:
                        # Document not found, use original result
                        enhanced_results.append(result)
                        continue

                    # Add document metadata to result
                    enhanced_metadata = {
                        **result.metadata,
                        "document_filename": document.filename,
                        "document_type": document.doc_type.value,
                        "document_tags": document.tags,
                        "document_category": document.category,
                        "document_created_at": document.created_at.isoformat(),
                        "document_summary": document.summary
                    }

                    enhanced_results.append(SearchResult(
                        chunk_id=result.chunk_id,
                        document_id=result.document_id,
                        content=result.content,
                        score=result.score,
                        metadata=enhanced_metadata
                    ))

                except Exception as e:
                    logger.warning(f"Error enhancing result {result.chunk_id}: {str(e)}")
                    enhanced_results.append(result)

            return enhanced_results

        except Exception as e:
            logger.error(f"Error enhancing results: {str(e)}")
            return results

    async def multi_query_search(self, queries: List[str], top_k: int = 5,
                                 aggregate_method: str = "merge") -> List[SearchResult]:
        """Perform search with multiple queries and aggregate results.

        Args:
            queries: Queries to run; blank entries are skipped.
            top_k: Per-query and final result limit.
            aggregate_method: ``"merge"``, ``"intersect"`` or ``"average"``;
                any other value falls back to ``"merge"``.
        """
        try:
            all_results = []

            # Perform search for each query
            for query in queries:
                if query.strip():
                    all_results.extend(await self.search(query, top_k))

            if not all_results:
                return []

            # Aggregate results (unknown methods default to merge)
            aggregators = {
                "merge": self._merge_results,
                "intersect": self._intersect_results,
                "average": self._average_results,
            }
            aggregate = aggregators.get(aggregate_method, self._merge_results)
            return await aggregate(all_results, top_k)

        except Exception as e:
            logger.error(f"Error in multi-query search: {str(e)}")
            return []

    async def _merge_results(self, results: List[SearchResult], top_k: int) -> List[SearchResult]:
        """Merge results and remove duplicates, keeping highest scores."""
        try:
            # chunk_id -> best-scoring result seen so far
            best_by_chunk: Dict[Any, SearchResult] = {}

            for result in results:
                current = best_by_chunk.get(result.chunk_id)
                if current is None or result.score > current.score:
                    best_by_chunk[result.chunk_id] = result

            # Sort by score and return top_k
            merged_results = sorted(best_by_chunk.values(), key=lambda x: x.score, reverse=True)
            return merged_results[:top_k]

        except Exception as e:
            logger.error(f"Error merging results: {str(e)}")
            return results[:top_k]

    async def _intersect_results(self, results: List[SearchResult], top_k: int) -> List[SearchResult]:
        """Find chunks that appear in multiple queries."""
        try:
            # Count occurrences of each chunk and remember its best result
            chunk_counts: Dict[Any, int] = {}
            chunk_results: Dict[Any, SearchResult] = {}

            for result in results:
                chunk_id = result.chunk_id
                chunk_counts[chunk_id] = chunk_counts.get(chunk_id, 0) + 1

                if chunk_id not in chunk_results or result.score > chunk_results[chunk_id].score:
                    chunk_results[chunk_id] = result

            # Filter chunks that appear more than once
            intersect_results = [
                result for chunk_id, result in chunk_results.items()
                if chunk_counts[chunk_id] > 1
            ]

            # Sort by score
            intersect_results.sort(key=lambda x: x.score, reverse=True)

            return intersect_results[:top_k]

        except Exception as e:
            logger.error(f"Error intersecting results: {str(e)}")
            return []

    async def _average_results(self, results: List[SearchResult], top_k: int) -> List[SearchResult]:
        """Average scores for chunks that appear multiple times."""
        try:
            # Group by chunk_id
            chunk_groups: Dict[Any, List[SearchResult]] = {}
            for result in results:
                chunk_groups.setdefault(result.chunk_id, []).append(result)

            # Calculate average scores
            averaged_results = []
            for group in chunk_groups.values():
                scores = [r.score for r in group]
                avg_score = sum(scores) / len(scores)

                # Use the result with the highest individual score but update the score to average
                best_result = max(group, key=lambda x: x.score)
                averaged_results.append(SearchResult(
                    chunk_id=best_result.chunk_id,
                    document_id=best_result.document_id,
                    content=best_result.content,
                    score=avg_score,
                    metadata={
                        **best_result.metadata,
                        "query_count": len(group),
                        "score_range": f"{min(scores):.3f}-{max(scores):.3f}"
                    }
                ))

            # Sort by average score
            averaged_results.sort(key=lambda x: x.score, reverse=True)

            return averaged_results[:top_k]

        except Exception as e:
            logger.error(f"Error averaging results: {str(e)}")
            return results[:top_k]

    async def search_by_document(self, document_id: str, query: str, top_k: int = 5) -> List[SearchResult]:
        """Search within a specific document."""
        try:
            filters = {"document_id": document_id}
            return await self.search(query, top_k, filters)

        except Exception as e:
            logger.error(f"Error searching within document {document_id}: {str(e)}")
            return []

    async def search_by_category(self, category: str, query: str, top_k: int = 5) -> List[SearchResult]:
        """Search within documents of a specific category.

        Falls back to an unfiltered search when no document store is
        configured.
        """
        try:
            if not self.document_store:
                logger.warning("Document store not available for category search")
                return await self.search(query, top_k)

            # Get documents in the category
            documents = await self.document_store.list_documents(
                limit=1000,  # Adjust as needed
                filters={"category": category}
            )

            if not documents:
                logger.info(f"No documents found in category '{category}'")
                return []

            # Search with document ID filter
            filters = {"document_ids": [doc.id for doc in documents]}
            return await self.search(query, top_k, filters)

        except Exception as e:
            logger.error(f"Error searching by category {category}: {str(e)}")
            return []

    async def search_with_date_range(self, query: str, start_date, end_date, top_k: int = 5) -> List[SearchResult]:
        """Search documents within a date range.

        Falls back to an unfiltered search when no document store is
        configured.
        """
        try:
            if not self.document_store:
                logger.warning("Document store not available for date range search")
                return await self.search(query, top_k)

            # Get documents in the date range
            documents = await self.document_store.list_documents(
                limit=1000,  # Adjust as needed
                filters={
                    "created_after": start_date,
                    "created_before": end_date
                }
            )

            if not documents:
                logger.info("No documents found in date range")
                return []

            # Search with document ID filter
            filters = {"document_ids": [doc.id for doc in documents]}
            return await self.search(query, top_k, filters)

        except Exception as e:
            logger.error(f"Error searching with date range: {str(e)}")
            return []

    async def get_search_suggestions(self, partial_query: str, limit: int = 5) -> List[str]:
        """Get search suggestions based on partial query.

        Suggestions are words, and two-word phrases around them, taken from
        the content of matching results that start with the lower-cased
        partial query and are longer than it.
        """
        try:
            # This is a simple implementation
            # In a production system, you might want to use a more sophisticated approach

            if len(partial_query) < 2:
                return []

            # Search for the partial query
            results = await self.search(partial_query, top_k=20)

            needle = partial_query.lower()
            punctuation = '.,!?;:'

            # Extract potential query expansions from content
            suggestions = set()

            for result in results:
                content_words = result.content.lower().split()
                last_index = len(content_words) - 1
                for i, word in enumerate(content_words):
                    if needle not in word:
                        continue

                    # Add the word itself
                    suggestions.add(word.strip(punctuation))

                    # Add phrases that include this word
                    if i > 0:
                        suggestions.add(f"{content_words[i-1]} {word}".strip(punctuation))
                    if i < last_index:
                        suggestions.add(f"{word} {content_words[i+1]}".strip(punctuation))

            # Filter and sort suggestions
            filtered_suggestions = [
                s for s in suggestions
                if len(s) > len(partial_query) and s.startswith(needle)
            ]

            return sorted(filtered_suggestions)[:limit]

        except Exception as e:
            logger.error(f"Error getting search suggestions: {str(e)}")
            return []

    async def explain_search(self, query: str, top_k: int = 3) -> Dict[str, Any]:
        """Provide detailed explanation of search process and results.

        Returns:
            A dictionary with ``query``, the executed ``steps``,
            ``results_analysis`` and ``performance_metrics``; or
            ``{"error": ...}`` when embedding generation or the vector
            search fails.
        """
        try:
            import time

            explanation = {
                "query": query,
                "steps": [],
                "results_analysis": {},
                "performance_metrics": {}
            }

            # Step 1: Query processing
            explanation["steps"].append({
                "step": "query_processing",
                "description": "Processing and normalizing the search query",
                "details": {
                    "original_query": query,
                    "cleaned_query": query.strip(),
                    "query_length": len(query)
                }
            })

            # Step 2: Embedding generation (perf_counter: monotonic timer)
            start_time = time.perf_counter()
            query_embedding = await self.embedding_service.generate_single_embedding(query)
            embedding_time = time.perf_counter() - start_time

            # Same guard as search(): never send a failed embedding to the
            # vector store.
            if not query_embedding:
                logger.error("Failed to generate query embedding")
                return {"error": "Failed to generate query embedding"}

            explanation["steps"].append({
                "step": "embedding_generation",
                "description": "Converting query to vector embedding",
                "details": {
                    "embedding_dimension": len(query_embedding),
                    "generation_time_ms": round(embedding_time * 1000, 2)
                }
            })

            # Step 3: Vector search
            start_time = time.perf_counter()
            results = await self.vector_store.search(
                query_embedding=query_embedding,
                top_k=top_k
            )
            search_time = time.perf_counter() - start_time

            scores = [r.score for r in results]

            explanation["steps"].append({
                "step": "vector_search",
                "description": "Searching vector database for similar content",
                "details": {
                    "search_time_ms": round(search_time * 1000, 2),
                    "results_found": len(results),
                    "top_score": results[0].score if results else 0,
                    "score_range": f"{min(scores):.3f}-{max(scores):.3f}" if results else "N/A"
                }
            })

            # Results analysis
            if results:
                explanation["results_analysis"] = {
                    "total_results": len(results),
                    "average_score": sum(scores) / len(scores),
                    "unique_documents": len(set(r.document_id for r in results)),
                    "content_lengths": [len(r.content) for r in results]
                }

            # Performance metrics
            explanation["performance_metrics"] = {
                "total_time_ms": round((embedding_time + search_time) * 1000, 2),
                "embedding_time_ms": round(embedding_time * 1000, 2),
                "search_time_ms": round(search_time * 1000, 2)
            }

            return explanation

        except Exception as e:
            logger.error(f"Error explaining search: {str(e)}")
            return {"error": str(e)}
mcp_tools/utils.py ADDED
@@ -0,0 +1,373 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import asyncio
3
+ import functools
4
+ from typing import Any, Callable, Dict, List, Optional
5
+ import time
6
+ import json
7
+ from pathlib import Path
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
def async_timer(func: Callable) -> Callable:
    """Decorator to time async function execution.

    The elapsed time is logged at DEBUG level on success and at ERROR level
    (before re-raising) on failure. ``time.perf_counter`` is used because it
    is monotonic, so a system clock adjustment during the call cannot
    produce a wrong or negative duration.
    """
    @functools.wraps(func)
    async def wrapper(*args, **kwargs):
        start_time = time.perf_counter()
        try:
            result = await func(*args, **kwargs)
        except Exception as e:
            elapsed = time.perf_counter() - start_time
            logger.error(f"{func.__name__} failed after {elapsed:.3f}s: {str(e)}")
            raise
        elapsed = time.perf_counter() - start_time
        logger.debug(f"{func.__name__} completed in {elapsed:.3f}s")
        return result
    return wrapper
26
+
27
def retry_async(max_attempts: int = 3, delay: float = 1.0, backoff: float = 2.0):
    """Decorator to retry async functions with exponential backoff.

    Args:
        max_attempts: Total number of calls to make before giving up.
            Values below 1 are treated as 1 so the wrapped function is
            always called at least once (instead of silently returning
            ``None`` without calling it).
        delay: Seconds to wait before the first retry.
        backoff: Factor the delay is multiplied by after each failed attempt.

    The exception raised by the final attempt is re-raised unchanged.
    """
    def decorator(func: Callable) -> Callable:
        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            attempts = max(1, max_attempts)
            current_delay = delay

            for attempt in range(1, attempts + 1):
                try:
                    return await func(*args, **kwargs)
                except Exception as e:
                    if attempt == attempts:
                        logger.error(f"{func.__name__} failed after {attempts} attempts: {str(e)}")
                        raise

                    logger.warning(f"{func.__name__} attempt {attempt} failed: {str(e)}")
                    logger.info(f"Retrying in {current_delay}s...")

                    await asyncio.sleep(current_delay)
                    current_delay *= backoff

        return wrapper
    return decorator
52
+
53
class MCPToolResponse:
    """Uniform success/error envelope returned by MCP tools."""

    def __init__(self, success: bool, data: Any = None, error: str = None,
                 metadata: Dict[str, Any] = None):
        # The creation time is captured once, at construction.
        self.timestamp = time.time()
        self.success = success
        self.data = data
        self.error = error
        self.metadata = metadata if metadata else {}

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the response.

        A successful response carries ``data``, a failed one carries
        ``error``; ``metadata`` is included only when it is non-empty.
        """
        payload_key = "data" if self.success else "error"
        payload_value = self.data if self.success else self.error

        serialized: Dict[str, Any] = {
            "success": self.success,
            "timestamp": self.timestamp,
            payload_key: payload_value,
        }
        if self.metadata:
            serialized["metadata"] = self.metadata
        return serialized

    @classmethod
    def success_response(cls, data: Any, metadata: Dict[str, Any] = None):
        """Build a response flagged as successful around ``data``."""
        return cls(True, data=data, metadata=metadata)

    @classmethod
    def error_response(cls, error: str, metadata: Dict[str, Any] = None):
        """Build a response flagged as failed with the given ``error`` text."""
        return cls(False, error=error, metadata=metadata)
90
+
91
def validate_required_params(params: Dict[str, Any], required: List[str]) -> Optional[str]:
    """Check ``params`` for the keys listed in ``required``.

    A key counts as missing when it is absent or mapped to ``None``.

    Returns:
        An error message naming the missing keys in the order given by
        ``required``, or ``None`` when nothing is missing.
    """
    absent = [name for name in required if params.get(name) is None]
    if not absent:
        return None
    return f"Missing required parameters: {', '.join(absent)}"
102
+
103
def sanitize_filename(filename: str) -> str:
    """Sanitize filename for safe storage.

    Reserved characters (``<>:"/\\|?*``) and ASCII control characters are
    replaced with ``_``, leading/trailing dots and spaces are removed, and
    the result is capped at 255 characters while keeping the extension
    where possible.

    Returns:
        The cleaned name, or ``"unnamed_file"`` when nothing is left.
    """
    import re

    max_length = 255

    # Remove or replace invalid characters (including NUL and other control
    # characters).
    filename = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '_', filename)

    # Remove leading/trailing dots and spaces
    filename = filename.strip('. ')

    # Limit length
    if len(filename) > max_length:
        name, ext = Path(filename).stem, Path(filename).suffix
        if len(ext) < max_length:
            filename = name[:max_length - len(ext)] + ext
        else:
            # The "extension" alone exceeds the limit; slicing the stem with
            # a negative bound would not shorten anything, so cut the whole
            # name instead.
            filename = filename[:max_length]
        # Truncation may expose a trailing dot or space again.
        filename = filename.rstrip('. ')

    # Ensure not empty
    return filename or "unnamed_file"
124
+
125
def truncate_text(text: str, max_length: int, add_ellipsis: bool = True) -> str:
    """Truncate text to specified length.

    Args:
        text: Text to shorten.
        max_length: Maximum length of the returned string. Negative values
            are treated as 0.
        add_ellipsis: When true and ``max_length`` is greater than 3, the
            last three characters of the result are ``"..."``.

    Returns:
        ``text`` unchanged when it already fits, otherwise a string of at
        most ``max_length`` characters.
    """
    if len(text) <= max_length:
        return text

    # A negative bound would slice from the end ("hello"[:-1] == "hell")
    # and return more than the caller asked for.
    limit = max(max_length, 0)

    if add_ellipsis and limit > 3:
        return text[:limit - 3] + "..."
    return text[:limit]
134
+
135
def extract_file_info(file_path: str) -> Dict[str, Any]:
    """Collect name, size and timestamp facts about ``file_path``.

    Returns:
        A dictionary describing the path, or ``{"error": <message>}`` when
        anything raises (for example when the path cannot be stat'ed).
    """
    try:
        target = Path(file_path)
        file_stat = target.stat()
        size = file_stat.st_size

        info: Dict[str, Any] = {}
        info["filename"] = target.name
        info["extension"] = target.suffix.lower()
        info["size_bytes"] = size
        info["size_mb"] = round(size / (1024 * 1024), 2)
        # st_ctime is reported as-is under the "created_time" key.
        info["created_time"] = file_stat.st_ctime
        info["modified_time"] = file_stat.st_mtime
        info["exists"] = target.exists()
        info["is_file"] = target.is_file()
        info["is_dir"] = target.is_dir()
        return info
    except Exception as exc:
        return {"error": str(exc)}
154
+
155
async def batch_process(items: List[Any], processor: Callable, batch_size: int = 10,
                        max_concurrent: int = 5) -> List[Any]:
    """Run ``processor`` over ``items`` batch by batch.

    Each batch is awaited as a whole before the next one starts, and a
    semaphore caps how many ``processor`` calls run at the same time.

    Returns:
        One entry per item, in input order. An item whose processing raised
        is represented by the exception object itself.
    """
    gate = asyncio.Semaphore(max_concurrent)

    async def guarded(entry):
        async with gate:
            return await processor(entry)

    collected: List[Any] = []
    for start in range(0, len(items), batch_size):
        window = items[start:start + batch_size]
        outcomes = await asyncio.gather(
            *(guarded(entry) for entry in window),
            return_exceptions=True,
        )
        collected += outcomes

    return collected
173
+
174
def format_file_size(size_bytes: int) -> str:
    """Render a byte count with one decimal and a unit from B up to PB.

    The value is divided by 1024 until it drops below 1024; anything still
    too large after TB is reported in PB.
    """
    units = ('B', 'KB', 'MB', 'GB', 'TB')
    value = size_bytes
    index = 0
    while index < len(units):
        if value < 1024.0:
            return f"{value:.1f} {units[index]}"
        value = value / 1024.0
        index += 1
    return f"{value:.1f} PB"
181
+
182
def calculate_reading_time(text: str, words_per_minute: int = 200) -> int:
    """Estimate the minutes needed to read ``text``.

    Words are whitespace-separated tokens; the estimate is rounded with the
    built-in ``round`` and is never lower than one minute.
    """
    words = text.split()
    minutes = round(len(words) / words_per_minute)
    return minutes if minutes > 1 else 1
186
+
187
class ProgressTracker:
    """Track progress of long-running operations.

    Counts completed items against a fixed total, records error messages and
    derives percentage, elapsed time and a linear estimate of remaining time.
    """

    def __init__(self, total_items: int, description: str = "Processing"):
        """Start tracking ``total_items`` units of work; the clock starts now."""
        self.total_items = total_items
        self.completed_items = 0
        self.description = description
        self.start_time = time.time()
        self.errors = []

    def update(self, completed: int = 1, error: str = None):
        """Add ``completed`` to the counter and record ``error`` if given."""
        self.completed_items += completed
        if error:
            self.errors.append(error)

    def get_progress(self) -> Dict[str, Any]:
        """Return a snapshot dict describing the current progress.

        The remaining-time estimate is the average time per completed item
        multiplied by the number of items still outstanding; it is 0 when
        nothing has completed yet.
        """
        elapsed_time = time.time() - self.start_time
        progress_percent = (self.completed_items / self.total_items) * 100 if self.total_items > 0 else 0

        # Estimate remaining time
        if self.completed_items > 0:
            avg_time_per_item = elapsed_time / self.completed_items
            # Callers may report more items than announced; never let that
            # turn the estimate into a negative duration.
            remaining_items = max(self.total_items - self.completed_items, 0)
            estimated_remaining_time = avg_time_per_item * remaining_items
        else:
            estimated_remaining_time = 0

        return {
            "description": self.description,
            "total_items": self.total_items,
            "completed_items": self.completed_items,
            "progress_percent": round(progress_percent, 1),
            "elapsed_time_seconds": round(elapsed_time, 1),
            "estimated_remaining_seconds": round(estimated_remaining_time, 1),
            "errors_count": len(self.errors),
            "errors": self.errors[-5:] if self.errors else []  # Last 5 errors
        }

    def is_complete(self) -> bool:
        """Return True once the completed count has reached the total."""
        return self.completed_items >= self.total_items
230
+
231
def load_json_config(config_path: str, default_config: Dict[str, Any] = None) -> Dict[str, Any]:
    """Load configuration from a JSON file with fallback to defaults.

    Args:
        config_path: Path of the JSON file to read.
        default_config: Returned when the file is missing or is not valid
            JSON. An empty dict is used if this is None or empty.

    Returns:
        The parsed JSON value, or the fallback described above.
    """
    try:
        # Read as UTF-8 explicitly so the result does not depend on the
        # platform's locale encoding.
        with open(config_path, 'r', encoding='utf-8') as f:
            config = json.load(f)
        logger.info(f"Loaded configuration from {config_path}")
        return config
    except FileNotFoundError:
        logger.warning(f"Configuration file {config_path} not found, using defaults")
        return default_config or {}
    except json.JSONDecodeError as e:
        logger.error(f"Invalid JSON in configuration file {config_path}: {str(e)}")
        return default_config or {}
244
+
245
def save_json_config(config: Dict[str, Any], config_path: str) -> bool:
    """Write ``config`` to ``config_path`` as indented JSON.

    Missing parent directories are created first. Returns True on success;
    any exception is logged and reported as False.
    """
    try:
        destination = Path(config_path)
        # Make sure the target directory exists before opening the file.
        destination.parent.mkdir(parents=True, exist_ok=True)

        with open(destination, 'w') as handle:
            json.dump(config, handle, indent=2)
    except Exception as exc:
        logger.error(f"Failed to save configuration to {config_path}: {str(exc)}")
        return False

    logger.info(f"Saved configuration to {config_path}")
    return True
259
+
260
class RateLimiter:
    """Sliding-window limiter: at most ``max_calls`` per ``time_window`` seconds."""

    def __init__(self, max_calls: int, time_window: float):
        self.max_calls = max_calls
        self.time_window = time_window
        # Timestamps (time.time()) of the calls still inside the window.
        self.calls = []

    async def acquire(self):
        """Wait until a call is permitted, then record it.

        Each pass drops timestamps that have left the window. If the window is
        still full, the coroutine sleeps until the oldest recorded call expires
        and checks again.
        """
        while True:
            moment = time.time()

            # Forget calls that have aged out of the window.
            self.calls = [stamp for stamp in self.calls if moment - stamp < self.time_window]

            if len(self.calls) < self.max_calls:
                break

            # Window is full: wait for the oldest entry to expire.
            delay = self.time_window - (moment - min(self.calls))
            if delay <= 0:
                break
            await asyncio.sleep(delay)

        # Record this call
        self.calls.append(moment)
286
+
287
def escape_markdown(text: str) -> str:
    """Backslash-escape a fixed set of markdown characters in ``text``.

    The characters covered are: * _ ` [ ] ( ) # + - ! and the backslash.
    """
    import re

    # One character class capturing every character that gets a backslash.
    special = re.compile(r'([*_`\[\]()#+\-!\\])')
    return special.sub(r'\\\1', text)
294
+
295
def create_error_summary(errors: List[Exception]) -> str:
    """Summarise a list of exceptions as counts per exception class name.

    Returns "No errors" for an empty list. Otherwise the classes are listed in
    order of first appearance, and a trailing "s" is added to the class name
    when its count is greater than one.
    """
    if not errors:
        return "No errors"

    tally = {}
    for exc in errors:
        label = type(exc).__name__
        tally.setdefault(label, 0)
        tally[label] += 1

    fragments = [
        f"1 {label}" if seen == 1 else f"{seen} {label}s"
        for label, seen in tally.items()
    ]

    return f"Encountered {len(errors)} total errors: " + ", ".join(fragments)
313
+
314
async def safe_execute(func: Callable, *args, default_return=None, **kwargs):
    """Call ``func`` and return ``default_return`` instead of raising.

    Coroutine functions are awaited; any other callable is invoked directly.

    Args:
        func: Callable or coroutine function to run.
        *args: Positional arguments forwarded to ``func``.
        default_return: Value returned when ``func`` raises an ``Exception``.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        The result of ``func``, or ``default_return`` after logging the error.
    """
    try:
        if asyncio.iscoroutinefunction(func):
            return await func(*args, **kwargs)
        else:
            return func(*args, **kwargs)
    except Exception as e:
        # Not every callable has __name__ (functools.partial objects, callable
        # instances); looking it up blindly would raise from inside this
        # handler and defeat the purpose of the wrapper.
        func_name = getattr(func, "__name__", repr(func))
        logger.error(f"Error executing {func_name}: {str(e)}")
        return default_return
324
+
325
def get_content_preview(content: str, max_length: int = 200) -> str:
    """Build a short single-line preview of ``content``.

    Whitespace runs are collapsed to single spaces. Text that fits within
    ``max_length`` is returned whole. Longer text is cut at the last sentence
    terminator (. ! ?) if that lies beyond 70% of ``max_length``; failing
    that, at the last space beyond the same threshold with "..." appended;
    failing that, at ``max_length`` with "..." appended. Empty input yields
    "No content".
    """
    if not content:
        return "No content"

    collapsed = ' '.join(content.split())
    if len(collapsed) <= max_length:
        return collapsed

    head = collapsed[:max_length]
    threshold = max_length * 0.7

    # Prefer ending on a complete sentence.
    sentence_cut = max(head.rfind(mark) for mark in ('.', '!', '?'))
    if sentence_cut > threshold:
        return head[:sentence_cut + 1]

    # Otherwise avoid splitting a word if a late-enough space exists.
    word_cut = head.rfind(' ')
    if word_cut > threshold:
        return head[:word_cut] + "..."

    return head + "..."
349
+
350
class MemoryUsageTracker:
    """Measure how much the process's resident memory changes over time.

    The baseline is captured at construction. Readings come from ``psutil``
    when it can be imported; otherwise every reading is 0.0.
    """

    def __init__(self):
        self.start_memory = self._get_memory_usage()

    def _get_memory_usage(self) -> float:
        """Return the current process RSS in MB, or 0.0 without psutil."""
        try:
            import psutil
            rss_bytes = psutil.Process().memory_info().rss
            return rss_bytes / 1024 / 1024  # bytes -> MB
        except ImportError:
            return 0.0

    def get_usage_delta(self) -> float:
        """Return the current reading minus the baseline taken at init."""
        return self._get_memory_usage() - self.start_memory

    def log_usage(self, operation_name: str):
        """Log the memory delta for ``operation_name`` at INFO level."""
        change = self.get_usage_delta()
        logger.info(f"{operation_name} memory delta: {change:.1f} MB")
mcp_tools/voice_tool.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from typing import Dict, Any, Optional
3
+ import asyncio
4
+
5
+ logger = logging.getLogger(__name__)
6
+
7
class VoiceTool:
    """
    MCP tool that answers questions through the ElevenLabs-backed voice service.
    """

    def __init__(self, elevenlabs_service):
        """
        Keep a reference to the service used to answer questions.

        Args:
            elevenlabs_service: ElevenLabs service instance
        """
        self.elevenlabs_service = elevenlabs_service

    async def voice_qa(
        self,
        question: str,
        session_id: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        MCP Tool: answer a question via the voice assistant backend.

        The question is forwarded to the ``llamaindex_service.query`` coroutine
        exposed by the ElevenLabs service.

        Args:
            question: User's question (text or transcribed from voice)
            session_id: Optional session ID, echoed back in the response

        Returns:
            On success a dict with "success", "question", "answer",
            "session_id" and "mode"; otherwise a dict with "success" False
            and an "error" message.
        """
        try:
            service = self.elevenlabs_service
            configured = bool(service) and service.is_available()
            if not configured:
                return {
                    "success": False,
                    "error": "Voice assistant not configured. Please set ELEVENLABS_API_KEY and ELEVENLABS_AGENT_ID"
                }

            logger.info(f"Voice QA: {question}")

            # Text queries go straight to the RAG backend that also serves
            # voice queries.
            answer = await self.elevenlabs_service.llamaindex_service.query(question)

            response = {
                "success": True,
                "question": question,
                "answer": answer,
                "session_id": session_id,
                "mode": "text"  # Could be "voice" if audio processing is involved
            }
            return response

        except Exception as exc:
            logger.error(f"Voice QA failed: {str(exc)}")
            return {
                "success": False,
                "error": str(exc),
                "question": question
            }
requirements.txt ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio[mcp]
2
+ anthropic>=0.7.0
3
+ mistralai
4
+ sentence-transformers>=2.2.2
5
+ transformers>=4.30.0
6
+ torch>=2.0.0
7
+ faiss-cpu>=1.7.4
8
+ numpy>=1.24.0
9
+ pandas>=2.0.0
10
+ PyPDF2>=3.0.0
11
+ python-docx>=0.8.11
12
+ Pillow>=10.0.0
13
+ pytesseract>=0.3.10
14
+ aiofiles>=23.0.0
15
+ pydantic>=2.0.0
16
+ httpx>=0.24.0
17
+ uvicorn[standard]
18
+ python-multipart>=0.0.6
19
+ asyncio-mqtt>=0.11.1
20
+ nest-asyncio>=1.5.6
21
+ fastapi
22
+ fastmcp
23
+ mcp
24
+ openai
25
+ python-dotenv
26
+ llama-index
27
+ llama-index-llms-openai
28
+ llama-index-llms-anthropic
29
+ llama-index-embeddings-huggingface
30
+ elevenlabs>=1.0.0
31
+ websockets>=12.0
services/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Services module initialization
services/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (149 Bytes). View file
 
services/__pycache__/document_store_service.cpython-313.pyc ADDED
Binary file (17.2 kB). View file
 
services/__pycache__/elevenlabs_service.cpython-313.pyc ADDED
Binary file (12.8 kB). View file
 
services/__pycache__/embedding_service.cpython-313.pyc ADDED
Binary file (13.4 kB). View file
 
services/__pycache__/llamaindex_service.cpython-313.pyc ADDED
Binary file (11 kB). View file
 
services/__pycache__/llm_service.cpython-313.pyc ADDED
Binary file (28.2 kB). View file
 
services/__pycache__/ocr_service.cpython-313.pyc ADDED
Binary file (19.9 kB). View file
 
services/__pycache__/podcast_generator_service.cpython-313.pyc ADDED
Binary file (28.3 kB). View file
 
services/__pycache__/vector_store_service.cpython-313.pyc ADDED
Binary file (15.3 kB). View file
 
services/document_store_service.py ADDED
@@ -0,0 +1,349 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import json
3
+ import os
4
+ from typing import List, Dict, Any, Optional
5
+ from pathlib import Path
6
+ import pickle
7
+ from datetime import datetime
8
+ import asyncio
9
+
10
+ from core.models import Document, DocumentType
11
+ import config
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
class DocumentStoreService:
    """File-backed document store with a small in-memory cache.

    Every document is persisted as two files below
    ``config.DOCUMENT_STORE_PATH``: ``metadata/<id>.json`` for the descriptive
    fields and ``content/<id>.txt`` for the text. Recently stored or loaded
    documents are kept in a FIFO cache of at most ``_cache_size_limit``
    entries.
    """

    # Metadata keys that update_document_metadata may change.
    _UPDATABLE_FIELDS = ("tags", "summary", "category", "language", "metadata")

    def __init__(self):
        """Create the storage directories (if missing) and an empty cache."""
        self.config = config.config
        self.store_path = Path(self.config.DOCUMENT_STORE_PATH)
        self.store_path.mkdir(parents=True, exist_ok=True)

        # Separate paths for metadata and content
        self.metadata_path = self.store_path / "metadata"
        self.content_path = self.store_path / "content"

        self.metadata_path.mkdir(exist_ok=True)
        self.content_path.mkdir(exist_ok=True)

        # In-memory cache for frequently accessed documents
        self._cache = {}
        self._cache_size_limit = 100

    @staticmethod
    def _build_document(metadata: Dict[str, Any], content: str) -> Document:
        """Rebuild a Document from a stored metadata dict and its content."""
        return Document(
            id=metadata["id"],
            filename=metadata["filename"],
            content=content,
            doc_type=DocumentType(metadata["doc_type"]),
            file_size=metadata["file_size"],
            created_at=datetime.fromisoformat(metadata["created_at"]),
            metadata=metadata.get("metadata", {}),
            tags=metadata.get("tags", []),
            summary=metadata.get("summary"),
            category=metadata.get("category"),
            language=metadata.get("language")
        )

    async def store_document(self, document: Document) -> bool:
        """Persist a document's metadata and content, then cache it.

        Returns:
            True on success, False if anything raised (the error is logged).
        """
        try:
            # Store metadata
            metadata_file = self.metadata_path / f"{document.id}.json"
            metadata = {
                "id": document.id,
                "filename": document.filename,
                "doc_type": document.doc_type.value,
                "file_size": document.file_size,
                "created_at": document.created_at.isoformat(),
                "metadata": document.metadata,
                "tags": document.tags,
                "summary": document.summary,
                "category": document.category,
                "language": document.language,
                "content_length": len(document.content)
            }

            with open(metadata_file, 'w', encoding='utf-8') as f:
                json.dump(metadata, f, indent=2, ensure_ascii=False)

            # Store content separately (can be large)
            content_file = self.content_path / f"{document.id}.txt"
            with open(content_file, 'w', encoding='utf-8') as f:
                f.write(document.content)

            # Cache the document
            self._add_to_cache(document.id, document)

            logger.info(f"Stored document {document.id} ({document.filename})")
            return True

        except Exception as e:
            logger.error(f"Error storing document {document.id}: {str(e)}")
            return False

    async def get_document(self, document_id: str) -> Optional[Document]:
        """Return the document with ``document_id``, or None.

        The cache is consulted first. None is returned when either file is
        missing or when loading fails (the error is logged).
        """
        try:
            # Check cache first
            if document_id in self._cache:
                return self._cache[document_id]

            # Load from disk
            metadata_file = self.metadata_path / f"{document_id}.json"
            content_file = self.content_path / f"{document_id}.txt"

            if not metadata_file.exists() or not content_file.exists():
                return None

            with open(metadata_file, 'r', encoding='utf-8') as f:
                metadata = json.load(f)

            with open(content_file, 'r', encoding='utf-8') as f:
                content = f.read()

            document = self._build_document(metadata, content)

            # Add to cache
            self._add_to_cache(document_id, document)

            return document

        except Exception as e:
            logger.error(f"Error retrieving document {document_id}: {str(e)}")
            return None

    async def list_documents(self, limit: int = 50, offset: int = 0,
                             filters: Optional[Dict[str, Any]] = None) -> List[Document]:
        """List documents, newest metadata file first, with paging and filters.

        When ``filters`` is given, ``offset`` and ``limit`` are applied to the
        documents that pass the filters, so a page is only short when fewer
        matches remain. Without filters the sorted file list is sliced
        directly.

        Args:
            limit: Maximum number of documents to return.
            offset: Number of (matching) documents to skip.
            filters: Optional criteria understood by ``_apply_filters``.

        Returns:
            The matching documents; an empty list if listing fails.
        """
        try:
            documents = []
            metadata_files = list(self.metadata_path.glob("*.json"))

            # Newest first, judged by the metadata file's modification time
            metadata_files.sort(key=lambda x: x.stat().st_mtime, reverse=True)

            if filters:
                # Filtering must happen BEFORE pagination; slicing first would
                # only ever filter within one unfiltered page.
                candidates = metadata_files
                to_skip = max(offset, 0)
            else:
                candidates = metadata_files[offset:offset + limit]
                to_skip = 0

            for metadata_file in candidates:
                if filters and len(documents) >= limit:
                    break

                try:
                    with open(metadata_file, 'r', encoding='utf-8') as f:
                        metadata = json.load(f)

                    # Apply filters
                    if filters and not self._apply_filters(metadata, filters):
                        continue

                    # Consume the offset only with documents that matched.
                    if to_skip > 0:
                        to_skip -= 1
                        continue

                    # Missing content files yield an empty content string.
                    content_file = self.content_path / f"{metadata['id']}.txt"
                    if content_file.exists():
                        with open(content_file, 'r', encoding='utf-8') as f:
                            content = f.read()
                    else:
                        content = ""

                    documents.append(self._build_document(metadata, content))

                except Exception as e:
                    logger.warning(f"Error loading document metadata from {metadata_file}: {str(e)}")
                    continue

            return documents

        except Exception as e:
            logger.error(f"Error listing documents: {str(e)}")
            return []

    def _apply_filters(self, metadata: Dict[str, Any], filters: Dict[str, Any]) -> bool:
        """Return True when ``metadata`` satisfies every recognised filter.

        Recognised keys: ``doc_type``, ``category``, ``language`` (equality),
        ``filename_contains`` (case-insensitive substring), ``created_after``
        / ``created_before`` (compared with the parsed ``created_at``) and
        ``tags`` (at least one tag in common). Unknown keys are ignored.

        NOTE(review): if evaluating a filter raises, the error is logged and
        the document is treated as matching (fail-open) — confirm this is the
        intended policy.
        """
        try:
            for key, value in filters.items():
                if key == "doc_type":
                    if metadata.get("doc_type") != value:
                        return False
                elif key == "filename_contains":
                    if value.lower() not in metadata.get("filename", "").lower():
                        return False
                elif key == "created_after":
                    doc_date = datetime.fromisoformat(metadata.get("created_at", ""))
                    if doc_date < value:
                        return False
                elif key == "created_before":
                    doc_date = datetime.fromisoformat(metadata.get("created_at", ""))
                    if doc_date > value:
                        return False
                elif key == "tags":
                    doc_tags = set(metadata.get("tags", []))
                    required_tags = set(value) if isinstance(value, list) else {value}
                    if not required_tags.intersection(doc_tags):
                        return False
                elif key == "category":
                    if metadata.get("category") != value:
                        return False
                elif key == "language":
                    if metadata.get("language") != value:
                        return False

            return True
        except Exception as e:
            logger.error(f"Error applying filters: {str(e)}")
            return True

    async def update_document_metadata(self, document_id: str, updates: Dict[str, Any]) -> bool:
        """Update the editable metadata fields of a stored document.

        Only the keys in ``_UPDATABLE_FIELDS`` are applied; other keys in
        ``updates`` are ignored, both on disk and for the cached copy.

        Returns:
            True on success, False if the document is unknown or saving fails.
        """
        try:
            metadata_file = self.metadata_path / f"{document_id}.json"

            if not metadata_file.exists():
                logger.warning(f"Document {document_id} not found")
                return False

            # Load existing metadata
            with open(metadata_file, 'r', encoding='utf-8') as f:
                metadata = json.load(f)

            accepted = {
                key: value for key, value in updates.items()
                if key in self._UPDATABLE_FIELDS
            }
            metadata.update(accepted)

            # Save updated metadata
            with open(metadata_file, 'w', encoding='utf-8') as f:
                json.dump(metadata, f, indent=2, ensure_ascii=False)

            # Mirror exactly the persisted changes onto the cached object so
            # the cache cannot drift away from what is on disk.
            if document_id in self._cache:
                document = self._cache[document_id]
                for key, value in accepted.items():
                    if hasattr(document, key):
                        setattr(document, key, value)

            logger.info(f"Updated metadata for document {document_id}")
            return True

        except Exception as e:
            logger.error(f"Error updating document metadata: {str(e)}")
            return False

    async def delete_document(self, document_id: str) -> bool:
        """Remove a document's files and cache entry.

        Returns:
            True unless an exception occurred; deleting an unknown id is not
            treated as an error.
        """
        try:
            metadata_file = self.metadata_path / f"{document_id}.json"
            content_file = self.content_path / f"{document_id}.txt"

            # Remove files
            if metadata_file.exists():
                metadata_file.unlink()
            if content_file.exists():
                content_file.unlink()

            # Remove from cache
            if document_id in self._cache:
                del self._cache[document_id]

            logger.info(f"Deleted document {document_id}")
            return True

        except Exception as e:
            logger.error(f"Error deleting document {document_id}: {str(e)}")
            return False

    async def search_documents(self, query: str, fields: List[str] = None) -> List[Document]:
        """Case-insensitive substring search over document attributes.

        Args:
            query: Text to look for.
            fields: Document attribute names to inspect; defaults to
                filename, content, tags and summary.

        Returns:
            Documents where any inspected field contains ``query``. Only the
            first 1000 listed documents are examined.
        """
        if not fields:
            fields = ["filename", "content", "tags", "summary"]

        try:
            matching_documents = []
            query_lower = query.lower()

            # Get all documents
            all_documents = await self.list_documents(limit=1000)  # Adjust limit as needed

            for document in all_documents:
                match_found = False

                for field in fields:
                    field_value = getattr(document, field, "")
                    if isinstance(field_value, list):
                        field_value = " ".join(field_value)
                    elif field_value is None:
                        field_value = ""

                    if query_lower in str(field_value).lower():
                        match_found = True
                        break

                if match_found:
                    matching_documents.append(document)

            logger.info(f"Found {len(matching_documents)} documents matching '{query}'")
            return matching_documents

        except Exception as e:
            logger.error(f"Error searching documents: {str(e)}")
            return []

    def _add_to_cache(self, document_id: str, document: Document):
        """Insert or replace a cache entry, evicting the oldest when full."""
        try:
            # Replacing an existing key does not grow the cache, so only evict
            # when a genuinely new entry would exceed the limit.
            if document_id not in self._cache and len(self._cache) >= self._cache_size_limit:
                # Remove first item (FIFO)
                oldest_key = next(iter(self._cache))
                del self._cache[oldest_key]

            self._cache[document_id] = document
        except Exception as e:
            logger.error(f"Error adding to cache: {str(e)}")

    async def get_stats(self) -> Dict[str, Any]:
        """Return counts and sizes describing the store.

        Returns:
            A dict with document/file counts, total size in bytes and MB,
            cache size, per-type counts and the storage path; or
            ``{"error": ...}`` if gathering the statistics fails.
        """
        try:
            metadata_files = list(self.metadata_path.glob("*.json"))
            content_files = list(self.content_path.glob("*.txt"))

            # Calculate total storage size
            total_size = 0
            for file_path in metadata_files + content_files:
                total_size += file_path.stat().st_size

            # Count by document type
            type_counts = {}
            for metadata_file in metadata_files:
                try:
                    # Metadata is written as UTF-8 (see store_document).
                    with open(metadata_file, 'r', encoding='utf-8') as f:
                        metadata = json.load(f)
                    doc_type = metadata.get("doc_type", "unknown")
                    type_counts[doc_type] = type_counts.get(doc_type, 0) + 1
                except Exception as e:
                    # Best effort: an unreadable file is skipped, but visibly.
                    logger.warning(f"Skipping unreadable metadata file {metadata_file}: {str(e)}")
                    continue

            return {
                "total_documents": len(metadata_files),
                "total_size_bytes": total_size,
                "total_size_mb": round(total_size / (1024 * 1024), 2),
                "cache_size": len(self._cache),
                "document_types": type_counts,
                "storage_path": str(self.store_path),
                "metadata_files": len(metadata_files),
                "content_files": len(content_files)
            }
        except Exception as e:
            logger.error(f"Error getting document store stats: {str(e)}")
            return {"error": str(e)}
services/elevenlabs_service.py ADDED
@@ -0,0 +1,341 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import asyncio
3
+ from typing import Optional, Dict, Any, List
4
+ import json
5
+
6
+ try:
7
+ from elevenlabs.client import ElevenLabs
8
+ from elevenlabs.conversational_ai.conversation import Conversation, ClientTools
9
+ from elevenlabs.conversational_ai.default_audio_interface import DefaultAudioInterface
10
+ ELEVENLABS_AVAILABLE = True
11
+ except ImportError:
12
+ ELEVENLABS_AVAILABLE = False
13
+ logger = logging.getLogger(__name__)
14
+ logger.warning("ElevenLabs SDK not available. Voice features will be disabled.")
15
+
16
+ import config
17
+ from services.llamaindex_service import LlamaIndexService
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
class ElevenLabsService:
    """
    Service for integrating ElevenLabs Conversational AI with RAG capabilities.
    Provides voice-based interaction with the document library.

    SDK types appear only as quoted annotations so that this class can still
    be defined when the optional ElevenLabs import at the top of the module
    failed.
    """

    def __init__(self, llamaindex_service: "LlamaIndexService"):
        """
        Initialize ElevenLabs service with RAG integration

        The client stays ``None`` (voice features disabled) when the SDK is
        not installed, when ELEVENLABS_API_KEY is not configured, or when
        client creation fails.

        Args:
            llamaindex_service: LlamaIndex service for document queries
        """
        self.config = config.config
        self.llamaindex_service = llamaindex_service
        self.client = None
        self.client_tools = None
        self.active_conversations: Dict[str, "Conversation"] = {}

        if not ELEVENLABS_AVAILABLE:
            logger.error("ElevenLabs SDK not installed. Run: pip install elevenlabs")
            return

        if not self.config.ELEVENLABS_API_KEY:
            logger.warning("ELEVENLABS_API_KEY not configured. Voice features will be limited.")
            return

        try:
            # Initialize ElevenLabs client
            self.client = ElevenLabs(api_key=self.config.ELEVENLABS_API_KEY)
            logger.info("ElevenLabs client initialized successfully")

            # Initialize client tools for custom tool registration
            self.client_tools = ClientTools()

            # Register RAG tool
            self._register_rag_tool()

            logger.info("ElevenLabs service initialized with RAG tool")

        except Exception as e:
            logger.error(f"Error initializing ElevenLabs service: {str(e)}")

    def _register_rag_tool(self):
        """Register RAG query tool with ElevenLabs agent"""
        if not self.client_tools:
            return

        try:
            # _rag_query_tool is a coroutine function, so it has to be
            # registered as async; otherwise the SDK invokes it like a plain
            # function and gets back an un-awaited coroutine instead of the
            # answer dict.
            self.client_tools.register(
                "query_documents",
                handler=self._rag_query_tool,
                is_async=True
            )

            logger.info("RAG tool 'query_documents' registered successfully")

        except Exception as e:
            logger.error(f"Error registering RAG tool: {str(e)}")

    async def _rag_query_tool(self, params: Dict[str, Any]) -> Dict[str, Any]:
        """
        Custom tool for querying documents using LlamaIndex agentic RAG

        The query is bounded by ``config.CONVERSATION_TIMEOUT`` seconds.

        Args:
            params: Dictionary containing the query
                - query (str): The user's question or search query

        Returns:
            Dictionary with an "answer" plus either source/confidence fields
            on success or an "error" field on failure
        """
        try:
            query = params.get("query", "")

            if not query:
                return {
                    "error": "No query provided",
                    "answer": "I didn't receive a question to search for."
                }

            logger.info(f"RAG tool called with query: '{query}'")

            # Query the LlamaIndex agentic RAG system
            try:
                result = await asyncio.wait_for(
                    self.llamaindex_service.query(query),
                    timeout=self.config.CONVERSATION_TIMEOUT
                )

                logger.info("RAG query successful")

                return {
                    "answer": result,
                    "source": "document_library",
                    "confidence": "high"
                }

            except asyncio.TimeoutError:
                logger.error("RAG query timeout")
                return {
                    "error": "timeout",
                    "answer": "The search took too long. Please try a simpler question."
                }

        except Exception as e:
            logger.error(f"Error in RAG query tool: {str(e)}")
            return {
                "error": str(e),
                "answer": f"I encountered an error searching the documents: {str(e)}"
            }

    def create_conversation(
        self,
        agent_id: Optional[str] = None,
        session_id: Optional[str] = None
    ) -> Optional["Conversation"]:
        """
        Create a new conversation session

        Args:
            agent_id: ElevenLabs agent ID (uses config default if not provided)
            session_id: Optional session ID; when given, the conversation is
                stored in ``active_conversations`` under this key

        Returns:
            Conversation object or None if initialization fails
        """
        if not self.client:
            logger.error("ElevenLabs client not initialized")
            return None

        try:
            agent_id = agent_id or self.config.ELEVENLABS_AGENT_ID

            if not agent_id:
                logger.error("No agent ID provided or configured")
                return None

            # Create audio interface for real-time audio
            audio_interface = DefaultAudioInterface()

            # Create conversation with RAG tool
            conversation = Conversation(
                client=self.client,
                agent_id=agent_id,
                requires_auth=True,
                audio_interface=audio_interface,
                client_tools=self.client_tools
            )

            # Store conversation if session ID provided
            if session_id:
                self.active_conversations[session_id] = conversation

            logger.info(f"Created conversation for agent: {agent_id}")
            return conversation

        except Exception as e:
            logger.error(f"Error creating conversation: {str(e)}")
            return None

    async def start_conversation(self, session_id: Optional[str] = None) -> Dict[str, Any]:
        """
        Start a new conversation session (async wrapper for UI)

        Args:
            session_id: Optional session ID for tracking

        Returns:
            Dictionary with success status and conversation info
        """
        try:
            conversation = self.create_conversation(session_id=session_id)

            if conversation:
                return {
                    "success": True,
                    "session_id": session_id,
                    "message": "Conversation started successfully"
                }
            else:
                return {
                    "success": False,
                    "error": "Failed to create conversation"
                }
        except Exception as e:
            logger.error(f"Error starting conversation: {str(e)}")
            return {
                "success": False,
                "error": str(e)
            }

    async def process_voice_query(
        self,
        audio_file_path: str,
        agent_id: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Placeholder for file-based voice queries

        No audio is processed yet: the call logs the path and returns a
        "pending" status.

        Args:
            audio_file_path: Path to audio file
            agent_id: Optional agent ID (currently unused)

        Returns:
            Dictionary with "status", "message" and "file", or "status" and
            "error" if something raised
        """
        try:
            # For now, this is a placeholder for file-based processing
            # ElevenLabs Conversational AI is primarily WebSocket-based
            # This would be used for async/batch processing

            logger.info(f"Processing voice query from: {audio_file_path}")

            # This would require additional implementation for file upload
            # and processing through ElevenLabs API

            return {
                "status": "pending",
                "message": "Voice query processing requires WebSocket connection",
                "file": audio_file_path
            }

        except Exception as e:
            logger.error(f"Error processing voice query: {str(e)}")
            return {
                "status": "error",
                "error": str(e)
            }

    async def end_conversation(self, session_id: str) -> bool:
        """
        End an active conversation session

        Args:
            session_id: Session identifier

        Returns:
            True if the session was known and has been removed (even when the
            SDK-level shutdown reported a problem), False otherwise
        """
        try:
            if session_id in self.active_conversations:
                conversation = self.active_conversations[session_id]

                # Try to end the session gracefully
                try:
                    conversation.end_session()
                except AttributeError as ae:
                    # Handle cases where DefaultAudioInterface doesn't have expected methods
                    logger.warning(f"Could not cleanly end session: {str(ae)}")
                except Exception as e:
                    logger.warning(f"Error during session cleanup: {str(e)}")

                # Always remove from active conversations
                del self.active_conversations[session_id]
                logger.info(f"Ended conversation: {session_id}")
                return True
            return False

        except Exception as e:
            logger.error(f"Error ending conversation: {str(e)}")
            return False

    def get_available_voices(self) -> List[Dict[str, str]]:
        """
        Get list of available voice models

        Returns:
            List of dicts with "voice_id", "name" and "category"; an empty
            list when the client is missing or the API call fails
        """
        try:
            if not self.client:
                return []

            # Get voices from ElevenLabs API
            voices = self.client.voices.get_all()

            return [
                {
                    "voice_id": voice.voice_id,
                    "name": voice.name,
                    "category": voice.category if hasattr(voice, 'category') else "general"
                }
                for voice in voices.voices
            ]

        except Exception as e:
            logger.error(f"Error getting voices: {str(e)}")
            return []

    def is_available(self) -> bool:
        """Check if ElevenLabs service is available and configured"""
        return ELEVENLABS_AVAILABLE and self.client is not None

    async def test_connection(self) -> Dict[str, Any]:
        """
        Test ElevenLabs API connection

        Returns:
            Dictionary with "status" and "message"; on success also the
            number of voices and whether the RAG tool container exists
        """
        try:
            if not self.client:
                return {
                    "status": "error",
                    "message": "Client not initialized"
                }

            # Call the API directly: get_available_voices() swallows errors
            # and returns [], which would make a failed connection look like
            # a successful one with zero voices.
            voices = self.client.voices.get_all()

            return {
                "status": "success",
                "message": "ElevenLabs API connected",
                "voices_available": len(voices.voices),
                "rag_tool_registered": self.client_tools is not None
            }

        except Exception as e:
            logger.error(f"Connection test failed: {str(e)}")
            return {
                "status": "error",
                "message": str(e)
            }
services/embedding_service.py ADDED
@@ -0,0 +1,243 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import asyncio
3
+ from typing import List, Optional, Dict, Any
4
+ import numpy as np
5
+ from sentence_transformers import SentenceTransformer
6
+ import torch
7
+ import openai
8
+ import config
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
class EmbeddingService:
    """Generate, compare and validate text embeddings.

    Embeddings come from the OpenAI embeddings endpoint when the configured
    model name starts with "text-embedding-" and an OpenAI API key is set;
    otherwise a local SentenceTransformer model is used.
    """

    def __init__(self):
        """Read the embedding configuration and load the model right away."""
        self.config = config.config
        self.model_name = self.config.EMBEDDING_MODEL
        self.model = None
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.openai_client = None
        self.is_openai_model = False

        # Initialize OpenAI client if needed
        if self.config.OPENAI_API_KEY:
            self.openai_client = openai.OpenAI(api_key=self.config.OPENAI_API_KEY)

        # The model is loaded eagerly, so load failures surface at construction.
        self._load_model()

    def _load_model(self):
        """Load the embedding model, falling back to all-MiniLM-L6-v2 on failure.

        Raises:
            Exception: whatever the fallback model load raised, if it fails too.
        """
        try:
            logger.info(f"Loading embedding model: {self.model_name}")

            if self.model_name.startswith("text-embedding-"):
                if not self.openai_client:
                    logger.warning(f"OpenAI model {self.model_name} requested but OPENAI_API_KEY not found. Falling back to local model.")
                    self.model_name = "sentence-transformers/all-MiniLM-L6-v2"
                    self.is_openai_model = False
                    self.model = SentenceTransformer(self.model_name, device=self.device)
                else:
                    # Nothing to load locally; requests go to the OpenAI API.
                    self.is_openai_model = True
                    logger.info(f"Using OpenAI embedding model: {self.model_name}")
            else:
                self.is_openai_model = False
                self.model = SentenceTransformer(self.model_name, device=self.device)
                logger.info(f"Local embedding model loaded successfully on {self.device}")

        except Exception as e:
            logger.error(f"Failed to load embedding model: {str(e)}")
            # Fallback to a smaller model
            try:
                self.model_name = "all-MiniLM-L6-v2"
                self.is_openai_model = False
                self.model = SentenceTransformer(self.model_name, device=self.device)
                logger.info(f"Loaded fallback embedding model: {self.model_name}")
            except Exception as fallback_error:
                logger.error(f"Failed to load fallback model: {str(fallback_error)}")
                raise

    async def generate_embeddings(self, texts: List[str], batch_size: int = 32) -> List[List[float]]:
        """Generate embeddings for a list of texts.

        Empty or whitespace-only texts are dropped before embedding, so the
        result can be shorter than ``texts`` and is aligned with the
        non-blank inputs only.

        Args:
            texts: Texts to embed.
            batch_size: Number of texts sent to the model per batch.

        Returns:
            One embedding per non-blank text, in input order.

        Raises:
            RuntimeError: if a local model is required but not loaded.
            Exception: any error raised while embedding a batch.
        """
        if not texts:
            return []

        if not self.is_openai_model and self.model is None:
            raise RuntimeError("Embedding model not loaded")

        try:
            # Filter out empty texts
            non_empty_texts = [text for text in texts if text and text.strip()]
            if not non_empty_texts:
                logger.warning("No non-empty texts provided for embedding")
                return []

            logger.info(f"Generating embeddings for {len(non_empty_texts)} texts using {self.model_name}")

            # Look the loop up once; it cannot change while this coroutine runs.
            loop = asyncio.get_running_loop()

            # Process in batches to manage memory/API limits
            all_embeddings = []
            for start in range(0, len(non_empty_texts), batch_size):
                batch = non_empty_texts[start:start + batch_size]

                # Run embedding generation in thread pool to avoid blocking
                batch_embeddings = await loop.run_in_executor(
                    None,
                    self._generate_batch_embeddings,
                    batch
                )
                all_embeddings.extend(batch_embeddings)

            logger.info(f"Generated {len(all_embeddings)} embeddings")
            return all_embeddings

        except Exception as e:
            logger.error(f"Error generating embeddings: {str(e)}")
            raise

    def _generate_batch_embeddings(self, texts: List[str]) -> List[List[float]]:
        """Generate embeddings for a batch of texts (synchronous)."""
        try:
            if self.is_openai_model:
                # OpenAI Embeddings
                response = self.openai_client.embeddings.create(
                    input=texts,
                    model=self.model_name
                )
                return [data.embedding for data in response.data]
            else:
                # Local SentenceTransformer
                embeddings = self.model.encode(
                    texts,
                    convert_to_numpy=True,
                    normalize_embeddings=True,
                    batch_size=len(texts)
                )
                return embeddings.tolist()
        except Exception as e:
            logger.error(f"Error in batch embedding generation: {str(e)}")
            raise

    async def generate_single_embedding(self, text: str) -> Optional[List[float]]:
        """Generate the embedding for one text, or None if blank or on error."""
        if not text or not text.strip():
            return None

        try:
            embeddings = await self.generate_embeddings([text])
            return embeddings[0] if embeddings else None
        except Exception as e:
            logger.error(f"Error generating single embedding: {str(e)}")
            return None

    def get_embedding_dimension(self) -> int:
        """Get the dimension of embeddings produced by the model.

        For OpenAI models the size is derived from the model name (3072 for
        names containing "large", 1536 otherwise); for local models it is
        read from the loaded SentenceTransformer.

        Raises:
            RuntimeError: if a local model is expected but not loaded.
        """
        if self.is_openai_model:
            if "small" in self.model_name:
                return 1536
            elif "large" in self.model_name:
                return 3072
            elif "ada" in self.model_name:
                return 1536
            else:
                # Unknown OpenAI model name: assume 1536
                return 1536

        if self.model is None:
            raise RuntimeError("Embedding model not loaded")

        return self.model.get_sentence_embedding_dimension()

    def compute_similarity(self, embedding1: List[float], embedding2: List[float]) -> float:
        """Compute cosine similarity between two embeddings.

        Returns 0.0 when either vector has zero norm (the cosine is undefined
        there and the division would otherwise yield NaN) and on any error.
        """
        try:
            # Convert to numpy arrays
            emb1 = np.array(embedding1)
            emb2 = np.array(embedding2)

            norm_product = np.linalg.norm(emb1) * np.linalg.norm(emb2)
            if norm_product == 0:
                return 0.0

            return float(np.dot(emb1, emb2) / norm_product)
        except Exception as e:
            logger.error(f"Error computing similarity: {str(e)}")
            return 0.0

    def compute_similarities(self, query_embedding: List[float], embeddings: List[List[float]]) -> List[float]:
        """Compute cosine similarities between a query and several embeddings.

        Returns one score per entry of ``embeddings``. Entries whose norm is
        zero (or all entries, if the query norm is zero) score 0.0 instead of
        NaN. On any error a list of zeros of the same length is returned.
        """
        if len(embeddings) == 0:
            return []

        try:
            query_emb = np.array(query_embedding)
            emb_matrix = np.array(embeddings)

            dots = np.dot(emb_matrix, query_emb)
            norm_products = np.linalg.norm(emb_matrix, axis=1) * np.linalg.norm(query_emb)

            # Divide only where the denominator is non-zero; the rest stay 0.0.
            similarities = np.divide(
                dots,
                norm_products,
                out=np.zeros_like(dots, dtype=float),
                where=norm_products != 0
            )

            return similarities.tolist()
        except Exception as e:
            logger.error(f"Error computing similarities: {str(e)}")
            return [0.0] * len(embeddings)

    async def embed_chunks(self, chunks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Embed a list of chunks and add embeddings to them.

        Each chunk with non-blank 'content' is returned as a shallow copy
        carrying an 'embedding' key. Chunks without usable content are
        returned unchanged (and a warning is logged).

        Raises:
            Exception: any error raised while generating embeddings.
        """
        if not chunks:
            return []

        try:
            # generate_embeddings() drops blank texts, so remember which chunk
            # each submitted text came from instead of pairing by raw index.
            embeddable = []
            for position, chunk in enumerate(chunks):
                content = chunk.get('content', '')
                if content and content.strip():
                    embeddable.append((position, content))

            embeddings = []
            if embeddable:
                embeddings = await self.generate_embeddings([content for _, content in embeddable])

            embedding_by_position = {
                position: embedding
                for (position, _), embedding in zip(embeddable, embeddings)
            }

            # Add embeddings to chunks
            embedded_chunks = []
            for position, chunk in enumerate(chunks):
                if position in embedding_by_position:
                    chunk_copy = chunk.copy()
                    chunk_copy['embedding'] = embedding_by_position[position]
                    embedded_chunks.append(chunk_copy)
                else:
                    logger.warning(f"No embedding generated for chunk {position}")
                    embedded_chunks.append(chunk)

            return embedded_chunks
        except Exception as e:
            logger.error(f"Error embedding chunks: {str(e)}")
            raise

    def validate_embedding(self, embedding: List[float]) -> bool:
        """Check that an embedding is a non-empty list of the model's dimension
        containing no NaN or infinite values. Any error counts as invalid."""
        try:
            if not embedding:
                return False

            if not isinstance(embedding, list):
                return False

            if len(embedding) != self.get_embedding_dimension():
                return False

            # Check for NaN or infinite values
            emb_array = np.array(embedding)
            if np.isnan(emb_array).any() or np.isinf(emb_array).any():
                return False

            return True
        except Exception:
            return False

    async def get_model_info(self) -> Dict[str, Any]:
        """Get information about the loaded model, or {"error": ...} on failure."""
        try:
            return {
                "model_name": self.model_name,
                "device": "openai-api" if self.is_openai_model else self.device,
                "embedding_dimension": self.get_embedding_dimension(),
                "max_sequence_length": "8191" if self.is_openai_model else getattr(self.model, 'max_seq_length', 'unknown'),
                "model_loaded": self.is_openai_model or (self.model is not None)
            }
        except Exception as e:
            logger.error(f"Error getting model info: {str(e)}")
            return {"error": str(e)}
services/llamaindex_service.py ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ from typing import List, Optional, Any
4
+ from pathlib import Path
5
+ import shutil
6
+ import asyncio
7
+
8
+ from llama_index.core import (
9
+ VectorStoreIndex,
10
+ Document,
11
+ StorageContext,
12
+ load_index_from_storage,
13
+ Settings,
14
+ SummaryIndex
15
+ )
16
+ from llama_index.core.tools import QueryEngineTool, ToolMetadata
17
+ from llama_index.core.agent import ReActAgent
18
+ from llama_index.core.selectors import LLMSingleSelector
19
+ from llama_index.core.query_engine import RouterQueryEngine
20
+ from llama_index.llms.openai import OpenAI
21
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
22
+ from llama_index.embeddings.openai import OpenAIEmbedding
23
+
24
+ import config
25
+ from services.document_store_service import DocumentStoreService
26
+
27
+ logger = logging.getLogger(__name__)
28
+
29
class LlamaIndexService:
    """Build a LlamaIndex vector index over the document store and answer
    queries through a ReAct agent backed by that index."""

    def __init__(self, document_store: "DocumentStoreService"):
        """Configure LlamaIndex and try to reuse a previously persisted index.

        Args:
            document_store: Store whose documents are indexed on sync.
        """
        self.document_store = document_store
        self.config = config.config
        self.storage_dir = Path(self.config.DATA_DIR) / "llamaindex_storage"
        self.index = None
        self.agent = None
        self.is_initialized = False

        self._initialize_settings()
        # We don't fully initialize index here because we need async access to doc store
        # But we try to load existing storage if available
        self._try_load_from_storage()

    def _initialize_settings(self):
        """Initialize LlamaIndex settings (LLM, Embeddings).

        Errors are logged and swallowed so construction never fails here.
        """
        try:
            # LLM Setup
            if self.config.OPENAI_API_KEY:
                # Use configured OpenAI model
                Settings.llm = OpenAI(model=self.config.OPENAI_MODEL, api_key=self.config.OPENAI_API_KEY)
                logger.info(f"LlamaIndex using OpenAI model: {self.config.OPENAI_MODEL}")
            elif self.config.NEBIUS_API_KEY:
                # Use Nebius as OpenAI-compatible provider
                Settings.llm = OpenAI(
                    model=self.config.NEBIUS_MODEL,
                    api_key=self.config.NEBIUS_API_KEY,
                    api_base=self.config.NEBIUS_BASE_URL
                )
                logger.info(f"LlamaIndex using Nebius model: {self.config.NEBIUS_MODEL}")
            else:
                logger.warning("No API key found for LlamaIndex LLM (OpenAI or Nebius). Agentic features may fail.")

            # Embedding Setup
            if self.config.EMBEDDING_MODEL.startswith("text-embedding-"):
                if self.config.OPENAI_API_KEY:
                    Settings.embed_model = OpenAIEmbedding(
                        model=self.config.EMBEDDING_MODEL,
                        api_key=self.config.OPENAI_API_KEY
                    )
                    logger.info(f"LlamaIndex using OpenAI embeddings: {self.config.EMBEDDING_MODEL}")
                else:
                    logger.warning("OpenAI embedding model requested but no API key found. Falling back to HuggingFace.")
                    Settings.embed_model = HuggingFaceEmbedding(
                        model_name="sentence-transformers/all-MiniLM-L6-v2"
                    )
            else:
                Settings.embed_model = HuggingFaceEmbedding(
                    model_name=self.config.EMBEDDING_MODEL
                )
                logger.info(f"LlamaIndex using HuggingFace embeddings: {self.config.EMBEDDING_MODEL}")

        except Exception as e:
            logger.error(f"Error initializing LlamaIndex settings: {str(e)}")

    def _try_load_from_storage(self):
        """Try to load index from storage synchronously.

        When the storage directory exists, the index is loaded, the agent is
        created and the service is marked initialized. Errors are logged only.
        """
        try:
            if self.storage_dir.exists():
                logger.info("Loading LlamaIndex from storage...")
                storage_context = StorageContext.from_defaults(persist_dir=str(self.storage_dir))
                self.index = load_index_from_storage(storage_context)
                self._initialize_agent()
                self.is_initialized = True
            else:
                logger.info("No existing LlamaIndex storage found. Waiting for async initialization.")
        except Exception as e:
            logger.error(f"Error loading LlamaIndex from storage: {str(e)}")

    async def initialize(self):
        """Async initialization to sync documents and build index.

        The service is marked initialized even when the sync fails, so that
        query() reports a failed agent instead of "still initializing".
        """
        try:
            logger.info("Starting LlamaIndex async initialization...")

            # If we already have an index, we might still want to sync if it's empty or stale
            # For now, if no index exists, we definitely need to build it
            if self.index is None:
                await self.sync_from_document_store()

            self.is_initialized = True
            logger.info("LlamaIndex async initialization complete.")

        except Exception as e:
            logger.error(f"Error during LlamaIndex async initialization: {str(e)}")

    async def sync_from_document_store(self) -> bool:
        """Sync documents from DocumentStore to LlamaIndex.

        Fetches up to 1000 documents, rebuilds the vector index from them,
        persists it to the storage directory and recreates the agent.

        Returns:
            True when the index was rebuilt and persisted, False when any
            step raised (the error is logged, not propagated).
        """
        try:
            logger.info("Syncing documents from DocumentStore to LlamaIndex...")

            # Fetch documents from async document store
            # Limit to 1000 for now to avoid memory issues
            docs = await self.document_store.list_documents(limit=1000)

            if not docs:
                logger.warning("No documents found in DocumentStore to sync.")
                # Create empty index if no docs
                self.index = VectorStoreIndex.from_documents([])
            else:
                # Convert to LlamaIndex documents
                llama_docs = [
                    Document(
                        text=doc.content,
                        metadata={
                            "filename": doc.filename,
                            "document_id": doc.id,
                            # A document without metadata must not abort the sync.
                            **(doc.metadata or {})
                        }
                    )
                    for doc in docs
                ]

                logger.info(f"Building LlamaIndex with {len(llama_docs)} documents...")
                self.index = VectorStoreIndex.from_documents(llama_docs)

            # Persist storage
            self.storage_dir.mkdir(parents=True, exist_ok=True)
            self.index.storage_context.persist(persist_dir=str(self.storage_dir))

            # Re-initialize agent with new index
            self._initialize_agent()
            logger.info("LlamaIndex sync complete.")
            return True

        except Exception as e:
            logger.error(f"Error syncing LlamaIndex: {str(e)}")
            return False

    async def sync_on_demand(self) -> bool:
        """Manual trigger for syncing documents.

        Returns:
            True when the sync succeeded, False when it failed.
        """
        return await self.sync_from_document_store()

    def _initialize_agent(self):
        """Initialize the ReAct agent with query engine tools.

        Does nothing when no index is available. Errors are logged only.
        """
        try:
            if self.index is None:
                return

            query_engine = self.index.as_query_engine()

            query_engine_tool = QueryEngineTool(
                query_engine=query_engine,
                metadata=ToolMetadata(
                    name="document_search",
                    description="Search and retrieve information from the document library. Use this for specific questions about content."
                )
            )

            self.agent = ReActAgent.from_tools(
                [query_engine_tool],
                llm=Settings.llm,
                verbose=True
            )
            logger.info("LlamaIndex ReAct agent initialized")

        except Exception as e:
            logger.error(f"Error initializing LlamaIndex agent: {str(e)}")

    async def query(self, query_text: str) -> str:
        """Process a query using the agent.

        Returns:
            The agent's answer as text, or a human-readable status/error
            message when the agent is missing or the call raises.
        """
        if not self.agent:
            if not self.is_initialized:
                return "Agent is initializing, please try again in a moment."
            return "Agent failed to initialize. Please check logs."

        try:
            response = await self.agent.achat(query_text)
            return str(response)
        except Exception as e:
            logger.error(f"Error querying LlamaIndex agent: {str(e)}")
            return f"Error processing query: {str(e)}"
services/llm_service.py ADDED
@@ -0,0 +1,420 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mistralai import Mistral
2
+ import logging
3
+ import asyncio
4
+ from typing import List, Dict, Any, Optional
5
+
6
+ import openai
7
+ import config
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+ class LLMService:
12
def __init__(self):
    """Start with no provider clients, then create those that are configured."""
    # All three providers begin unset; _initialize_clients() fills in the
    # ones whose API key is present in the configuration.
    self.openai_client = None
    self.mistral_client = None
    self.nebius_client = None

    self.config = config.config

    self._initialize_clients()
20
+
21
def _initialize_clients(self):
    """Create a client for every provider whose API key is configured.

    OpenAI and NEBIUS both use the OpenAI client class (NEBIUS with a custom
    base URL); Mistral uses its own client. A warning is logged when no
    client could be created.

    Raises:
        Exception: any error raised while constructing a client (logged
            first, then re-raised).
    """
    try:
        settings = self.config

        if settings.OPENAI_API_KEY:
            self.openai_client = openai.OpenAI(api_key=settings.OPENAI_API_KEY)
            logger.info("OpenAI client initialized")

        if settings.NEBIUS_API_KEY:
            self.nebius_client = openai.OpenAI(
                api_key=settings.NEBIUS_API_KEY,
                base_url=settings.NEBIUS_BASE_URL,
            )
            logger.info("NEBIUS client initialized")

        if settings.MISTRAL_API_KEY:
            # Standard sync client
            self.mistral_client = Mistral(api_key=settings.MISTRAL_API_KEY)
            logger.info("Mistral client initialized")

        # Report whether at least one provider ended up usable
        configured = (self.openai_client, self.nebius_client, self.mistral_client)
        if any(configured):
            logger.info("LLM clients initialized successfully (at least one).")
        else:
            logger.warning("No LLM clients could be initialized based on current config. Check API keys.")

    except Exception as e:
        logger.error(f"Error initializing LLM clients: {str(e)}")
        raise
52
+
53
async def generate_text(self, prompt: str, model: str = "auto", max_tokens: int = 1000, temperature: float = 0.7) -> str:
    """Generate text with the requested model, routing to the right provider.

    Routing rules:
      * "auto": first available of NEBIUS, OpenAI, Mistral (each needs both
        a client and a configured default model).
      * "fast": OpenAI with the configured FAST_MODEL, otherwise the same
        as "auto".
      * names starting with "gpt-" / "openai/" or containing "o1-" / "o3-":
        OpenAI, or NEBIUS when OpenAI is unavailable and the name contains
        "gpt-oss"; any "provider/" prefix is stripped from the name.
      * names starting with "nebius/" or "meta-llama/" (case-insensitive):
        NEBIUS.
      * names starting with "mistral": Mistral.

    Raises:
        ValueError: if the model is unsupported or its client is missing.
        Exception: any provider error (logged first, then re-raised).
    """
    try:
        if model == "auto":
            # Priority: 1. NEBIUS, 2. OpenAI, 3. Mistral
            if self.nebius_client and self.config.NEBIUS_MODEL:
                chosen = self.config.NEBIUS_MODEL
                logger.debug(f"Auto-selected NEBIUS model: {chosen}")
                return await self._generate_with_nebius(prompt, chosen, max_tokens, temperature)
            if self.openai_client and self.config.OPENAI_MODEL:
                chosen = self.config.OPENAI_MODEL
                logger.debug(f"Auto-selected OpenAI model: {chosen}")
                return await self._generate_with_openai(prompt, chosen, max_tokens, temperature)
            if self.mistral_client and self.config.MISTRAL_MODEL:
                chosen = self.config.MISTRAL_MODEL
                logger.debug(f"Auto-selected Mistral model: {chosen}")
                return await self._generate_with_mistral(prompt, chosen, max_tokens, temperature)

            logger.error("No LLM clients available for 'auto' mode or default models not configured.")
            raise ValueError("No LLM clients available for 'auto' mode or default models not configured.")

        if model == "fast":
            if self.openai_client and self.config.FAST_MODEL:
                return await self._generate_with_openai(prompt, self.config.FAST_MODEL, max_tokens, temperature)
            # Fallback to auto if fast model not available
            return await self.generate_text(prompt, "auto", max_tokens, temperature)

        looks_like_openai = (
            model.startswith(("gpt-", "openai/"))
            or "o1-" in model
            or "o3-" in model
        )
        if looks_like_openai:
            # Drop any "provider/" prefix before calling the API
            bare_name = model.split('/')[-1]
            if self.openai_client:
                return await self._generate_with_openai(prompt, bare_name, max_tokens, temperature)
            if self.nebius_client and "gpt-oss" in model:
                return await self._generate_with_nebius(prompt, bare_name, max_tokens, temperature)
            raise ValueError("OpenAI client not available. Check API key.")

        if model.lower().startswith(("nebius/", "meta-llama/")):
            if not self.nebius_client:
                raise ValueError("NEBIUS client not available. Check API key.")
            return await self._generate_with_nebius(prompt, model, max_tokens, temperature)

        if model.startswith("mistral"):
            if not self.mistral_client:
                raise ValueError("Mistral client not available. Check API key or model prefix.")
            return await self._generate_with_mistral(prompt, model, max_tokens, temperature)

        raise ValueError(f"Unsupported model: {model}. Must start with 'gpt-', 'openai/', 'nebius/', 'mistral', or be 'auto'.")

    except Exception as e:
        logger.error(f"Error generating text with model '{model}': {str(e)}")
        raise
109
+
110
async def _generate_with_openai(self, prompt: str, model_name: str, max_tokens: int, temperature: float) -> str:
    """Generate text using OpenAI.

    The blocking SDK call runs in the default executor. GPT-5 and o-series
    reasoning models are sent ``max_completion_tokens`` and temperature 1;
    other models get ``max_tokens`` and the requested temperature.

    Args:
        prompt: User message sent as the only chat message.
        model_name: OpenAI model id.
        max_tokens: Upper bound on generated tokens.
        temperature: Sampling temperature (ignored for reasoning models).

    Returns:
        The stripped completion text, or "" when the response has no content.

    Raises:
        RuntimeError: if the OpenAI client is not initialized.
        Exception: any error raised by the API call (logged, then re-raised).
    """
    if not self.openai_client:
        raise RuntimeError("OpenAI client not initialized.")
    try:
        logger.debug(f"Generating with OpenAI model: {model_name}, max_tokens: {max_tokens}, temp: {temperature}")
        loop = asyncio.get_running_loop()

        # Determine correct token parameter based on model family.
        # Besides the substring markers, treat every id of the form
        # "o<digit>..." (e.g. "o1", "o3", "o4-mini") as a reasoning model;
        # matching only "o1-"/"o3-" missed the bare and newer ids.
        is_o_series = (
            len(model_name) > 1
            and model_name[0] == "o"
            and model_name[1].isdigit()
        )
        use_completion_tokens = is_o_series or any(
            marker in model_name for marker in ("gpt-5", "o1-", "o3-")
        )

        kwargs = {
            "model": model_name,
            "messages": [{"role": "user", "content": prompt}],
        }

        if use_completion_tokens:
            kwargs["max_completion_tokens"] = max_tokens
            # Reasoning models enforce temperature=1
            kwargs["temperature"] = 1
            if temperature != 1:
                logger.warning(f"Temperature {temperature} ignored for model {model_name} (requires 1).")
        else:
            kwargs["max_tokens"] = max_tokens
            kwargs["temperature"] = temperature

        response = await loop.run_in_executor(
            None,
            lambda: self.openai_client.chat.completions.create(**kwargs)
        )
        if response.choices and response.choices[0].message:
            content = response.choices[0].message.content
            if content is not None:
                return content.strip()
        return ""
    except Exception as e:
        logger.error(f"Error with OpenAI generation (model: {model_name}): {str(e)}")
        raise
149
+
150
async def _generate_with_nebius(self, prompt: str, model_name: str, max_tokens: int, temperature: float) -> str:
    """Generate text through the NEBIUS client.

    The synchronous chat-completions call is pushed to the default executor
    so the event loop is not blocked.

    Returns:
        The stripped completion text, or "" (with a warning) when the
        response carries no choices, no message or no content.

    Raises:
        RuntimeError: if the NEBIUS client is not initialized.
        Exception: any error raised by the API call (logged, then re-raised).
    """
    if not self.nebius_client:
        raise RuntimeError("NEBIUS client not initialized.")
    try:
        logger.debug(f"Generating with NEBIUS model: {model_name}, max_tokens: {max_tokens}, temp: {temperature}, prompt: '{prompt[:50]}...'")

        def _call_api():
            return self.nebius_client.chat.completions.create(
                model=model_name,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=max_tokens,
                temperature=temperature,
            )

        response = await asyncio.get_event_loop().run_in_executor(None, _call_api)

        if not (response.choices and response.choices[0].message):
            logger.warning(f"NEBIUS response did not contain expected choices or message for model {model_name}.")
            return ""

        content = response.choices[0].message.content
        if content is None:
            logger.warning(f"NEBIUS response message content is None for model {model_name}.")
            return ""
        return content.strip()
    except Exception as e:
        logger.error(f"Error with NEBIUS generation (model: {model_name}): {str(e)}")
        raise
180
+
181
async def _generate_with_mistral(self, prompt: str, model_name: str, max_tokens: int, temperature: float) -> str:
    """Generate text using Mistral (sync client via run_in_executor).

    The file imports the ``Mistral`` client class, whose ``chat`` attribute
    is a namespace rather than a callable; completions are requested with
    ``chat.complete(...)``.

    Args:
        prompt: User message sent as the only chat message.
        model_name: Mistral model id.
        max_tokens: Upper bound on generated tokens.
        temperature: Sampling temperature.

    Returns:
        The stripped completion text, or "" (with a warning) when the
        response carries no choices, no message or no content.

    Raises:
        RuntimeError: if the Mistral client is not initialized.
        Exception: any error raised by the API call (logged, then re-raised).
    """
    if not self.mistral_client:
        raise RuntimeError("Mistral client not initialized.")
    try:
        logger.debug(f"Generating with Mistral model: {model_name}, max_tokens: {max_tokens}, temp: {temperature}, prompt: '{prompt[:50]}...'")
        loop = asyncio.get_event_loop()

        response = await loop.run_in_executor(
            None,
            lambda: self.mistral_client.chat.complete(
                model=model_name,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=max_tokens,
                temperature=temperature
            )
        )
        if response.choices and response.choices[0].message:
            content = response.choices[0].message.content
            if content is not None:
                return content.strip()
            else:
                logger.warning(f"Mistral response message content is None for model {model_name}.")
                return ""
        else:
            logger.warning(f"Mistral response did not contain expected choices or message for model {model_name}.")
            return ""
    except Exception as e:
        logger.error(f"Error with Mistral generation (model: {model_name}): {str(e)}")
        raise
211
+
212
+
213
async def summarize(self, text: str, style: str = "concise", max_length: Optional[int] = None) -> str:
    """Summarize ``text`` in the requested style using the "auto" model.

    Args:
        text: Text to summarize; blank text yields "".
        style: One of "concise", "detailed", "bullet_points", "executive";
            unknown styles fall back to "concise".
        max_length: Optional approximate word limit. When given, the token
            budget is twice this value; otherwise 500 tokens are allowed.

    Returns:
        The stripped summary, or "Error generating summary" on failure.
    """
    if not text.strip():
        return ""

    instructions_by_style = {
        "concise": "Provide a concise summary of the following text, focusing on the main points:",
        "detailed": "Provide a detailed summary of the following text, including key details and supporting information:",
        "bullet_points": "Summarize the following text as a list of bullet points highlighting the main ideas:",
        "executive": "Provide an executive summary of the following text, focusing on key findings and actionable insights:"
    }
    instruction = instructions_by_style.get(style, instructions_by_style["concise"])
    if max_length:
        instruction = instruction + f" Keep the summary under approximately {max_length} words."

    prompt = f"{instruction}\n\nText to summarize:\n{text}\n\nSummary:"

    try:
        if max_length:
            token_budget = max_length * 2
        else:
            token_budget = 500
        result = await self.generate_text(prompt, model="auto", max_tokens=token_budget, temperature=0.3)
        return result.strip()
    except Exception as e:
        logger.error(f"Error generating summary: {str(e)}")
        return "Error generating summary"
236
+
237
async def generate_tags(self, text: str, max_tags: int = 5) -> List[str]:
    """Ask the "fast" model for up to ``max_tags`` tags describing ``text``.

    The comma-separated reply is lower-cased, stripped, limited to tags of
    2-49 characters, de-duplicated in first-seen order and truncated to
    ``max_tags``.

    Returns:
        The cleaned tag list; [] for blank text or when generation fails.
    """
    if not text.strip():
        return []

    prompt = (
        f"Generate up to {max_tags} relevant tags for the following text.\n"
        "Tags should be concise, descriptive keywords or phrases (1-3 words typically) that capture the main topics or themes.\n"
        "Return only the tags, separated by commas. Do not include any preamble or explanation.\n"
        "\n"
        "Text:\n"
        f"{text}\n"
        "\n"
        "Tags:"
    )

    try:
        # Use FAST_MODEL for tags
        reply = await self.generate_text(prompt, model="fast", max_tokens=100, temperature=0.5)

        unique_tags = []
        already_seen = set()
        for piece in reply.split(','):
            candidate = piece.strip().lower()
            if not 1 < len(candidate) < 50:
                continue
            if candidate in already_seen:
                continue
            already_seen.add(candidate)
            unique_tags.append(candidate)
        return unique_tags[:max_tags]
    except Exception as e:
        logger.error(f"Error generating tags: {str(e)}")
        return []
259
+
260
async def categorize(self, text: str, categories: List[str]) -> str:
    """Classify ``text`` into one of ``categories`` using the "fast" model.

    The model's reply is stripped of whitespace and surrounding quotes and
    matched case-insensitively against the given categories.

    Returns:
        The matching category as spelled in ``categories``; the first
        category when the reply matches none of them; "Uncategorized" for
        blank text, an empty category list, or when generation fails.
    """
    if not text.strip() or not categories:
        return "Uncategorized"

    categories_str = ", ".join(f"'{name}'" for name in categories)
    prompt = (
        f"Classify the following text into ONE of these categories: {categories_str}.\n"
        "Choose the single most appropriate category based on the content and main theme of the text.\n"
        "Return only the category name as a string, exactly as it appears in the list provided. Do not add any other text or explanation.\n"
        "\n"
        "Text to classify:\n"
        f"{text}\n"
        "\n"
        "Category:"
    )

    try:
        # Use FAST_MODEL for categorization
        reply = await self.generate_text(prompt, model="fast", max_tokens=50, temperature=0.1)
        category_candidate = reply.strip().strip("'\"")

        wanted = category_candidate.lower()
        matches = [name for name in categories if name.lower() == wanted]
        if matches:
            return matches[0]

        logger.warning(f"LLM returned category '{category_candidate}' which is not in the provided list: {categories}. Falling back.")
        if categories:
            return categories[0]
        return "Uncategorized"
    except Exception as e:
        logger.error(f"Error categorizing text: {str(e)}")
        return "Uncategorized"
288
+
289
async def answer_question(self, question: str, context: str, max_context_length: int = 3000) -> str:
    """Answer ``question`` from ``context`` using the "auto" model.

    Context longer than ``max_context_length`` characters is cut to that
    length and suffixed with "..." before being placed in the prompt.

    Returns:
        The stripped answer, or a fixed explanatory message when the
        question is blank, the context is blank, or generation fails.
    """
    if not question.strip():
        return "No question provided."
    if not context.strip():
        return "I don't have enough context to answer this question. Please provide relevant information."

    if len(context) > max_context_length:
        context = f"{context[:max_context_length]}..."
        logger.warning(f"Context truncated to {max_context_length} characters for question answering.")

    prompt = "\n".join([
        "You are an expert Q&A assistant. Your task is to synthesize an answer to the user's question based *only* on the provided source documents.",
        "Analyze all the source documents provided in the context below.",
        "If the information is present, provide a comprehensive answer.",
        "",
        "Here are the source documents:",
        "--- START OF CONTEXT ---",
        f"{context}",
        "--- END OF CONTEXT ---",
        "",
        "Based on the context above, please provide a clear and concise answer to the following question.",
        "",
        f"Question: {question}",
        "",
        "Answer:",
    ])

    try:
        reply = await self.generate_text(prompt, model="auto", max_tokens=800, temperature=0.5)
        return reply.strip()
    except Exception as e:
        logger.error(f"Error answering question: {str(e)}")
        return "I encountered an error while trying to answer your question."
320
+
321
async def extract_key_information(self, text: str) -> Dict[str, Any]:
    """Ask the LLM for a structured summary of ``text`` and return it as a dict.

    The model is asked for a JSON object with the keys ``main_topic``,
    ``key_points``, ``entities``, ``sentiment`` and ``content_type``.

    Args:
        text: The text to analyse. Blank input returns ``{}`` without calling the model.

    Returns:
        The parsed JSON object when the reply is valid JSON. If the reply is not
        valid JSON, a best-effort dict built from ``key: value`` lines. If nothing
        can be recovered, ``{"error": ..., "raw_response": ...}``. If the call
        itself fails, ``{"error": ...}``.
    """
    import json  # function-scope import, as in the original block

    if not text.strip():
        return {}

    prompt = f"""Analyze the following text and extract key information.
Provide the response as a JSON object with the following keys:
- "main_topic": (string) The main topic or subject of the text.
- "key_points": (array of strings) A list of 3-5 key points or takeaways.
- "entities": (array of strings) Important people, places, organizations, or products mentioned.
- "sentiment": (string) Overall sentiment of the text (e.g., "positive", "neutral", "negative", "mixed").
- "content_type": (string) The perceived type of content (e.g., "article", "email", "report", "conversation", "advertisement", "other").

If a piece of information is not found or not applicable, use null or an empty array/string as appropriate for the JSON structure.

Text to analyze:
---
{text}
---

JSON Analysis:"""

    try:
        response_str = await self.generate_text(prompt, model="auto", max_tokens=500, temperature=0.4)

        # Remove a surrounding Markdown code fence, with or without a "json" tag.
        # str.lstrip/rstrip take a *set of characters*, not a prefix/suffix, so the
        # fence is removed by explicit slicing instead.
        cleaned = response_str.strip()
        if cleaned.startswith("```"):
            cleaned = cleaned[3:]
            if cleaned[:4].lower() == "json":
                cleaned = cleaned[4:]
            cleaned = cleaned.strip()
            if cleaned.endswith("```"):
                cleaned = cleaned[:-3].strip()

        try:
            info = json.loads(cleaned)
        except json.JSONDecodeError as je:
            logger.error(f"Failed to parse JSON from LLM response for key_information: {je}")
            logger.debug(f"LLM Response string was: {cleaned}")
        else:
            if not isinstance(info, dict):
                # Valid JSON, but not the object that was asked for (e.g. a bare list).
                logger.error(f"LLM response for key_information is not a JSON object: {type(info).__name__}")
                return {"error": "Failed to parse LLM output", "raw_response": cleaned}
            expected_keys = {"main_topic", "key_points", "entities", "sentiment", "content_type"}
            if not expected_keys.issubset(info.keys()):
                logger.warning(f"Extracted information missing some expected keys. Got: {info.keys()}")
            return info

        # Fallback: salvage "key: value" lines from a reply that is not valid JSON
        # (for example JSON truncated by the token limit).
        info_fallback: Dict[str, Any] = {}
        for line in cleaned.split('\n'):
            key, separator, value = line.partition(':')
            if not separator:
                continue
            # JSON-looking lines carry quotes around keys and trailing commas.
            key_clean = key.strip().strip("'\"").lower().replace(' ', '_')
            value_clean = value.strip().rstrip(',').strip()
            if not key_clean or not value_clean:
                continue
            if key_clean in ("key_points", "entities") and '[' in value_clean and ']' in value_clean:
                info_fallback[key_clean] = [
                    item.strip().strip("'\"")
                    for item in value_clean.strip('[]').split(',')
                    if item.strip()
                ]
            else:
                info_fallback[key_clean] = value_clean.strip("'\"")

        if info_fallback:
            logger.info("Successfully parsed key information using fallback line-based method.")
            return info_fallback
        return {"error": "Failed to parse LLM output", "raw_response": cleaned}
    except Exception as e:
        logger.error(f"Error extracting key information: {str(e)}")
        return {"error": f"General error extracting key information: {str(e)}"}
378
+
379
async def check_availability(self) -> Dict[str, bool]:
    """Check which LLM services are available by making a tiny test call.

    Each provider whose client and configured model are both set is sent a
    minimal prompt; it counts as available when the reply is non-blank. A
    failing call is logged and leaves the provider marked unavailable.

    Returns:
        Mapping of ``"openai"``, ``"nebius"`` and ``"mistral"`` to a bool.
    """
    probe_prompt, probe_max_tokens, probe_temp = "Hello", 5, 0.1

    # (result key, label used in log messages, config attribute with the model name)
    providers = (
        ("openai", "OpenAI", "OPENAI_MODEL"),
        ("nebius", "NEBIUS", "NEBIUS_MODEL"),
        ("mistral", "Mistral", "MISTRAL_MODEL"),
    )
    availability = {key: False for key, _, _ in providers}

    logger.info("Checking LLM availability...")

    for key, label, model_attr in providers:
        # The client is looked up first so the config is only read for configured providers.
        if not getattr(self, f"{key}_client"):
            continue
        model_name = getattr(self.config, model_attr)
        if not model_name:
            continue
        try:
            logger.debug(f"Testing {label} availability with model {model_name}...")
            generate = getattr(self, f"_generate_with_{key}")
            reply = await generate(probe_prompt, model_name, probe_max_tokens, probe_temp)
            availability[key] = bool(reply.strip())
        except Exception as e:
            logger.warning(f"{label} availability check failed for model {model_name}: {e}")
        logger.info(f"{label} available: {availability[key]}")

    logger.info(f"Final LLM Availability: {availability}")
    return availability
services/ocr_service.py ADDED
@@ -0,0 +1,288 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import logging
3
+ import asyncio
4
+ from pathlib import Path
5
+ import os
6
+ import base64 # For encoding files
7
+ from typing import Optional, List, Dict, Any
8
+ import json
9
+
10
+ from mistralai import Mistral
11
+ from mistralai.models import SDKError
12
+ # PIL (Pillow) for dummy image creation in main_example
13
+ from PIL import Image, ImageDraw, ImageFont
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
class OCRService:
    """OCR service backed by the Mistral AI ``ocr.process`` endpoint.

    Files are read from disk, sent as base64 ``data:`` URIs and returned as
    Markdown text. The blocking SDK calls are run in a worker thread so the
    async methods do not block the event loop.
    """

    # Extension -> MIME type for every image format the service accepts.
    # Shared by all image entry points so they accept the same set of formats.
    _IMAGE_MIME_TYPES = {
        '.jpeg': 'image/jpeg',
        '.jpg': 'image/jpeg',
        '.png': 'image/png',
        '.gif': 'image/gif',
        '.bmp': 'image/bmp',
        '.tiff': 'image/tiff',
        '.webp': 'image/webp',
        '.avif': 'image/avif',
    }
    # MIME type used when the extension is not a known image type.
    _FALLBACK_MIME_TYPE = 'application/octet-stream'

    def __init__(self):
        """Create the Mistral client from the ``MISTRAL_API_KEY`` environment variable.

        Raises:
            ValueError: If ``MISTRAL_API_KEY`` is not set.
        """
        self.api_key = os.environ.get("MISTRAL_API_KEY")
        if not self.api_key:
            logger.error("MISTRAL_API_KEY environment variable not set.")
            raise ValueError("MISTRAL_API_KEY not found in environment variables.")

        self.client = Mistral(api_key=self.api_key)
        self.ocr_model_name = "mistral-ocr-latest"
        self.language = 'eng'
        logger.info(f"OCRService (using Mistral AI model {self.ocr_model_name}) initialized.")

    def _encode_file_to_base64(self, file_path: str) -> Optional[str]:
        """Return the file's content as a base64 string, or ``None`` if it cannot be read."""
        try:
            with open(file_path, "rb") as file_to_encode:
                return base64.b64encode(file_to_encode.read()).decode('utf-8')
        except FileNotFoundError:
            logger.error(f"Error: The file {file_path} was not found for Base64 encoding.")
            return None
        except Exception as e:
            logger.error(f"Error during Base64 encoding for {file_path}: {e}")
            return None

    @classmethod
    def _image_mime_type(cls, image_path: str) -> Optional[str]:
        """Return the MIME type for ``image_path``'s extension, or ``None`` if unknown."""
        return cls._IMAGE_MIME_TYPES.get(Path(image_path).suffix.lower())

    @staticmethod
    def _is_table_separator(cells: List[str]) -> bool:
        """Return True when every cell is a Markdown header separator such as ``---`` or ``:-:``."""
        return bool(cells) and all(
            cell and '-' in cell and set(cell) <= {'-', ':'} for cell in cells
        )

    @staticmethod
    def _parse_markdown_table_rows(markdown: str) -> List[List[str]]:
        """Collect the cells of every ``| a | b |`` style row in ``markdown``.

        Header separator rows and rows whose cells are all empty are skipped.
        Rows from different tables are returned in one flat list.
        """
        rows = []
        for line in markdown.split('\n'):
            stripped_line = line.strip()
            if not (stripped_line.startswith('|') and stripped_line.endswith('|')):
                continue
            cells = [cell.strip() for cell in stripped_line.strip('|').split('|')]
            # Judge separator rows cell by cell; a plain '"---" in line' test would
            # also drop data rows that merely contain dashes.
            if OCRService._is_table_separator(cells):
                continue
            if any(cells):
                rows.append(cells)
        return rows

    def _markdown_from_response(self, ocr_response: Any, file_name: str) -> str:
        """Pull Markdown/text out of an OCR response, trying the known response shapes in order."""
        pages = getattr(ocr_response, 'pages', None)
        if pages and isinstance(pages, list):
            all_pages_markdown = []
            for i, page in enumerate(pages):
                page_content = None
                for attr in ('markdown', 'markdown_content', 'text'):
                    value = getattr(page, attr, None)
                    if value:
                        page_content = value
                        logger.debug(f"Extracted content from page {i} using 'page.{attr}'.")
                        break

                if page_content:
                    all_pages_markdown.append(page_content)
                else:
                    # Log a short snippet of the page to help diagnose unexpected shapes.
                    if hasattr(page, '__dict__'):
                        page_details_for_log = str(vars(page))[:200]
                    else:
                        page_details_for_log = str(page)[:200]
                    logger.warning(f"Page {i} in OCR response for {file_name} has no 'markdown', 'markdown_content', or 'text'. Page details: {page_details_for_log}")

            if all_pages_markdown:
                # Simulate page breaks between pages.
                return "\n\n---\nPage Break (simulated)\n---\n\n".join(all_pages_markdown)
            logger.warning(f"'pages' attribute found but no content extracted from any pages for {file_name}.")
            return ""

        # Fallbacks if the response has no usable 'pages' but carries direct text/markdown.
        for attr in ('text', 'markdown'):
            value = getattr(ocr_response, attr, None)
            if value:
                logger.info(f"Extracted content from 'ocr_response.{attr}' (no pages structure) for {file_name}.")
                return value
        if isinstance(ocr_response, str) and ocr_response:
            logger.info(f"OCR response is a direct non-empty string for {file_name}.")
            return ocr_response

        logger.warning(f"Could not extract markdown from OCR response for {file_name} using known attributes (pages, text, markdown).")
        return ""

    async def _process_file_with_mistral(self, file_path: str, mime_type: str) -> str:
        """Send one file to Mistral OCR and return the extracted Markdown.

        Args:
            file_path: Path of the file to process.
            mime_type: MIME type used in the ``data:`` URI. ``image/*`` types are
                sent as ``image_url`` payloads, everything else as ``document_url``.

        Returns:
            The stripped Markdown, or ``""`` if the file cannot be read, the API
            call fails, or no content is found.
        """
        file_name = Path(file_path).name
        logger.info(f"Preparing to process file: {file_name} (MIME: {mime_type}) with Mistral OCR.")

        base64_encoded_file = self._encode_file_to_base64(file_path)
        if not base64_encoded_file:
            logger.warning(f"Base64 encoding failed for {file_name}, cannot process.")
            return ""

        # The payload key that carries the data URI has the same name as the payload type.
        document_type = "image_url" if mime_type.startswith("image/") else "document_url"
        document_payload = {
            "type": document_type,
            document_type: f"data:{mime_type};base64,{base64_encoded_file}",
        }
        try:
            logger.info(f"Calling Mistral client.ocr.process for {file_name} with model {self.ocr_model_name}.")
            # The SDK call is blocking; run it in a worker thread.
            ocr_response = await asyncio.to_thread(
                self.client.ocr.process,
                model=self.ocr_model_name,
                document=document_payload,
                include_image_base64=False,
            )

            logger.info(f"Received OCR response for {file_name}. Type: {type(ocr_response)}")

            extracted_markdown = self._markdown_from_response(ocr_response, file_name)
            if not extracted_markdown.strip():
                logger.warning(f"Extracted markdown is empty for {file_name} after all parsing attempts.")

            return extracted_markdown.strip()

        except SDKError as e:
            logger.error(f"Mistral API Exception during client.ocr.process for {file_name}: {e.message}")
            logger.exception("SDKError details:")
            return ""
        except Exception as e:
            logger.error(f"Generic Exception during Mistral client.ocr.process call for {file_name}: {e}")
            logger.exception("Exception details:")
            return ""

    async def extract_text_from_image(self, image_path: str, language: Optional[str] = None) -> str:
        """Extract Markdown text from an image file.

        ``language`` is accepted for interface compatibility only; it is logged
        and otherwise ignored.
        """
        if language:
            logger.info(f"Language parameter '{language}' provided, but Mistral OCR is broadly multilingual.")

        mime_type = self._image_mime_type(image_path)
        if not mime_type:
            ext = Path(image_path).suffix.lower()
            logger.warning(f"Unsupported image extension '{ext}' for path '{image_path}'. Attempting with 'application/octet-stream'.")
            mime_type = self._FALLBACK_MIME_TYPE

        return await self._process_file_with_mistral(image_path, mime_type)

    async def extract_text_from_pdf(self, pdf_path: str) -> str:
        """Extract Markdown text from a PDF file."""
        return await self._process_file_with_mistral(pdf_path, "application/pdf")

    async def extract_text_from_pdf_images(self, pdf_path: str) -> List[str]:
        """Return the PDF's full Markdown as a one-item list (``[""]`` when nothing was extracted)."""
        logger.info("Mistral processes PDFs directly. This method will return the full Markdown content as a single list item.")
        full_markdown = await self._process_file_with_mistral(pdf_path, "application/pdf")
        if full_markdown:
            return [full_markdown]
        return [""]

    async def extract_text_with_confidence(self, image_path: str, min_confidence: float = 0.5) -> Dict[str, Any]:
        """Extract text from an image and wrap it in a result dict.

        ``min_confidence`` is unused and the returned ``confidence`` is always
        ``0.0``; both exist only as placeholders.
        """
        logger.warning("Mistral Document AI API (ocr.process) typically returns structured text (Markdown). Word-level confidence scores are not standard. 'confidence' field is a placeholder.")

        mime_type = self._image_mime_type(image_path)
        if not mime_type:
            ext = Path(image_path).suffix.lower()
            logger.warning(f"Unsupported image extension '{ext}' in extract_text_with_confidence. Defaulting mime type.")
            mime_type = self._FALLBACK_MIME_TYPE

        text_markdown = await self._process_file_with_mistral(image_path, mime_type)

        return {
            "text": text_markdown,
            "confidence": 0.0,
            "word_count": len(text_markdown.split()) if text_markdown else 0,
            "raw_data": "Mistral ocr.process response contains structured data. See logs from _process_file_with_mistral for details."
        }

    async def detect_language(self, image_path: str) -> str:
        """Return ``'eng'``; no language detection is performed."""
        logger.warning("Mistral OCR is multilingual; explicit language detection is not part of client.ocr.process.")
        return 'eng'

    async def extract_tables_from_image(self, image_path: str) -> List[List[str]]:
        """OCR an image and return the cells of every Markdown table row found in the result.

        Returns:
            One list of cell strings per table row (header separator rows are
            skipped), or ``[]`` when no text was extracted.
        """
        logger.info("Extracting text (Markdown) from image using Mistral. Mistral OCR preserves table structures in Markdown.")

        mime_type = self._image_mime_type(image_path)
        if not mime_type:
            ext = Path(image_path).suffix.lower()
            logger.warning(f"Unsupported image extension '{ext}' in extract_tables_from_image. Defaulting mime type.")
            mime_type = self._FALLBACK_MIME_TYPE

        markdown_content = await self._process_file_with_mistral(image_path, mime_type)
        if not markdown_content:
            return []

        logger.info("Attempting basic parsing of Markdown tables. For complex tables, a dedicated parser is recommended.")
        table_data = self._parse_markdown_table_rows(markdown_content)

        if table_data:
            logger.info(f"Extracted {len(table_data)} lines potentially forming tables using basic parsing.")
        else:
            logger.info("No distinct table structures found with basic parsing from extracted markdown.")
        return table_data

    async def get_supported_languages(self) -> List[str]:
        """Return a static, informational list of language labels."""
        logger.info("Mistral OCR is multilingual. Refer to official Mistral AI documentation for details.")
        return ['eng', 'multilingual (refer to Mistral documentation)']

    async def validate_ocr_setup(self) -> Dict[str, Any]:
        """Check that the API is reachable by listing models; returns a status dict."""
        try:
            models_response = await asyncio.to_thread(self.client.models.list)
            model_ids = [model.id for model in models_response.data]
            return {
                "status": "operational",
                "message": "Mistral client initialized. API key present. Model listing successful.",
                "mistral_available_models_sample": model_ids[:5],
                "configured_ocr_model": self.ocr_model_name,
            }
        except SDKError as e:
            logger.error(f"Mistral API Exception during setup validation: {e.message}")
            return {"status": "error", "error": f"Mistral API Error: {e.message}"}
        except Exception as e:
            logger.error(f"Generic error during Mistral OCR setup validation: {str(e)}")
            return {"status": "error", "error": str(e)}

    def extract_text(self, file_path: str) -> str:
        """Synchronous wrapper that dispatches on file extension.

        Uses ``asyncio.run``, so it fails when called from a running event loop;
        that failure is reported through the generic error string like any other.

        Returns:
            The extracted text, ``"Unsupported file type."`` for unknown
            extensions, or ``"Error during sync extraction."`` on failure.
        """
        logger.warning("`extract_text` is a synchronous method. Running async Mistral OCR in a blocking way.")
        try:
            ext = Path(file_path).suffix.lower()
            if ext in self._IMAGE_MIME_TYPES:
                result = asyncio.run(self.extract_text_from_image(file_path))
            elif ext == '.pdf':
                result = asyncio.run(self.extract_text_from_pdf(file_path))
            else:
                logger.error(f"Unsupported file type for sync extract_text: {file_path}")
                return "Unsupported file type."
            return result
        except Exception as e:
            logger.error(f"Error in synchronous extract_text for {file_path}: {str(e)}")
            return "Error during sync extraction."
244
+
245
+ # Example of how to use the OCRService (main execution part)
246
async def main_example():
    """Manual smoke test: validate the OCR setup, then OCR a sample PDF if it exists."""
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(funcName)s - %(message)s',
        level=logging.DEBUG,
    )

    api_key_present = bool(os.environ.get("MISTRAL_API_KEY"))
    if not api_key_present:
        logger.error("MISTRAL_API_KEY environment variable is not set. Please set it: export MISTRAL_API_KEY='yourkey'")
        return

    ocr_service = OCRService()

    logger.info("--- Validating OCR Service Setup ---")
    validation_status = await ocr_service.validate_ocr_setup()
    logger.info(f"OCR Service Validation: {validation_status}")
    if validation_status.get("status") == "error":
        logger.error("Halting due to validation error.")
        return

    # --- Test with a specific PDF file ---
    pdf_path_to_test = r"C:\path\to\your\certificate.pdf"

    if not os.path.exists(pdf_path_to_test):
        logger.warning(f"PDF file for specific test '{pdf_path_to_test}' not found. Skipping this test.")
        logger.warning("Please update `pdf_path_to_test` in `main_example` to a valid PDF path.")
    else:
        logger.info(f"\n--- Extracting text from specific PDF: {pdf_path_to_test} ---")
        # Using the method that aligns with original `extract_text_from_pdf_images` signature
        pdf_markdown_list = await ocr_service.extract_text_from_pdf_images(pdf_path_to_test)
        extracted = pdf_markdown_list and pdf_markdown_list[0]
        if extracted:
            logger.info(f"Extracted Markdown from PDF ({pdf_path_to_test}):\n" + pdf_markdown_list[0])
        else:
            logger.warning(f"No text extracted from PDF {pdf_path_to_test} or an error occurred.")

    image_path = "dummy_test_image_ocr.png"
    if not os.path.exists(image_path):
        logger.info(f"Dummy image {image_path} not created or found, skipping optional image test.")
    else:
        logger.info(f"\n---Extracting text from image: {image_path} ---")
        # Placeholder: the image processing step is not implemented in this example.


if __name__ == '__main__':
    asyncio.run(main_example())
services/podcast_generator_service.py ADDED
@@ -0,0 +1,663 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import asyncio
3
+ import json
4
+ import uuid
5
+ from typing import List, Dict, Any, Optional
6
+ from dataclasses import dataclass, asdict
7
+ from datetime import datetime
8
+ from pathlib import Path
9
+ import re
10
+
11
+ try:
12
+ from elevenlabs import VoiceSettings
13
+ from elevenlabs.client import ElevenLabs
14
+ ELEVENLABS_AVAILABLE = True
15
+ except ImportError:
16
+ ELEVENLABS_AVAILABLE = False
17
+
18
+ import config
19
+ from services.llamaindex_service import LlamaIndexService
20
+ from services.llm_service import LLMService
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
@dataclass
class DocumentAnalysis:
    """Analysis results from document(s).

    Fields:
        key_insights: The 5-7 main points.
        topics: Topics covered.
        complexity_level: One of beginner, intermediate, advanced.
        estimated_words: Word-count estimate.
        source_documents: Identifiers of the analysed documents.
        summary: Summary text.
    """
    key_insights: List[str]
    topics: List[str]
    complexity_level: str
    estimated_words: int
    source_documents: List[str]
    summary: str
33
+
34
@dataclass
class DialogueLine:
    """Single line of podcast dialogue.

    Fields:
        speaker: Either "HOST1" or "HOST2".
        text: What the speaker says.
        pause_after: Pause following the line, in seconds.
    """
    speaker: str
    text: str
    pause_after: float = 0.5
40
+
41
@dataclass
class PodcastScript:
    """Complete podcast script: the ordered dialogue plus summary figures."""
    dialogue: List[DialogueLine]
    total_duration_estimate: float
    word_count: int
    style: str

    def to_text(self) -> str:
        """Render the dialogue as a transcript: one "SPEAKER: text" paragraph per line."""
        return "\n\n".join(f"{entry.speaker}: {entry.text}" for entry in self.dialogue)
55
+
56
@dataclass
class PodcastMetadata:
    """Descriptive record stored for each generated podcast."""
    podcast_id: str
    title: str
    description: str
    source_documents: List[str]
    style: str
    duration_seconds: float
    file_size_mb: float
    voices: Dict[str, str]
    generated_at: str
    generation_cost: Dict[str, float]
    key_topics: List[str]
70
+
71
@dataclass
class PodcastResult:
    """Complete podcast generation result.

    On failure ``success`` is False, ``error`` holds the message and
    ``metadata`` is ``None``, so ``metadata`` is declared optional.
    """
    podcast_id: str
    audio_file_path: str
    transcript: str
    metadata: Optional["PodcastMetadata"]  # None when generation failed
    generation_time: float
    success: bool
    error: Optional[str] = None
81
+
82
+
83
+ class PodcastGeneratorService:
84
+ """
85
+ Service for generating conversational podcasts from documents.
86
+ Combines LlamaIndex for analysis and ElevenLabs for voice synthesis.
87
+ """
88
+
89
+ # Word count per minute for podcast pacing
90
+ WORDS_PER_MINUTE = 150
91
+
92
+ # Script generation prompts for different styles
93
+ SCRIPT_PROMPTS = {
94
+ "conversational": """You are an expert podcast script writer. Create an engaging 2-host podcast discussing insights from documents.
95
+
96
+ CONTEXT:
97
+ {analysis}
98
+
99
+ REQUIREMENTS:
100
+ - Duration: {duration_minutes} minutes (approximately {word_count} words)
101
+ - Style: Conversational, friendly, and accessible
102
+ - Format: Alternating dialogue between HOST1 and HOST2
103
+ - Include natural transitions, questions, and "aha!" moments
104
+ - Make complex topics easy to understand
105
+ - Add enthusiasm and genuine curiosity
106
+ - Balance speaking time between both hosts
107
+
108
+ DIALOGUE FORMAT (strictly follow):
109
+ HOST1: [What they say]
110
+ HOST2: [What they say]
111
+
112
+ STRUCTURE:
113
+ 1. Opening Hook (30 seconds): Grab attention with an intriguing question or fact
114
+ 2. Introduction (1 minute): Set context and preview what's coming
115
+ 3. Main Discussion (70% of time): Deep dive into key insights
116
+ 4. Wrap-up (1 minute): Summarize key takeaways and final thoughts
117
+
118
+ TONE: Friendly, enthusiastic, educational but not condescending
119
+
120
+ Generate the complete podcast script now:""",
121
+
122
+ "educational": """You are creating an educational podcast script. Two hosts discuss document insights in a clear, instructive manner.
123
+
124
+ CONTEXT:
125
+ {analysis}
126
+
127
+ REQUIREMENTS:
128
+ - Duration: {duration_minutes} minutes (approximately {word_count} words)
129
+ - Style: Clear, methodical, educational
130
+ - HOST1 acts as the teacher/expert, HOST2 as the curious learner
131
+ - Include explanations of complex concepts
132
+ - Use examples and analogies
133
+ - Build knowledge progressively
134
+
135
+ DIALOGUE FORMAT:
136
+ HOST1: [Expert explanation]
137
+ HOST2: [Clarifying question or observation]
138
+
139
+ Generate the complete educational podcast script now:""",
140
+
141
+ "technical": """You are writing a technical podcast for an informed audience. Discuss document insights with precision and depth.
142
+
143
+ CONTEXT:
144
+ {analysis}
145
+
146
+ REQUIREMENTS:
147
+ - Duration: {duration_minutes} minutes (approximately {word_count} words)
148
+ - Style: Professional, detailed, technically accurate
149
+ - HOST1 is the subject matter expert, HOST2 is an informed interviewer
150
+ - Use proper technical terminology
151
+ - Dive into implementation details
152
+ - Discuss implications and applications
153
+
154
+ DIALOGUE FORMAT:
155
+ HOST1: [Technical insight]
156
+ HOST2: [Probing question]
157
+
158
+ Generate the complete technical podcast script now:""",
159
+
160
+ "casual": """You are creating a fun, casual podcast. Two friends discuss interesting ideas from documents.
161
+
162
+ CONTEXT:
163
+ {analysis}
164
+
165
+ REQUIREMENTS:
166
+ - Duration: {duration_minutes} minutes (approximately {word_count} words)
167
+ - Style: Relaxed, humorous, energetic
168
+ - Both hosts are enthusiastic and engaged
169
+ - Use casual language and occasional humor
170
+ - Make it entertaining while staying informative
171
+ - Quick pacing with energy
172
+
173
+ DIALOGUE FORMAT:
174
+ HOST1: [Casual commentary]
175
+ HOST2: [Enthusiastic response]
176
+
177
+ Generate the complete casual podcast script now:"""
178
+ }
179
+
180
def __init__(
    self,
    llamaindex_service: LlamaIndexService,
    llm_service: LLMService,
    elevenlabs_api_key: Optional[str] = None
):
    """
    Set up the podcast generator and its on-disk storage.

    Args:
        llamaindex_service: Service for document analysis
        llm_service: Service for script generation
        elevenlabs_api_key: ElevenLabs API key (uses config if not provided)
    """
    self.config = config.config
    self.llamaindex_service = llamaindex_service
    self.llm_service = llm_service

    # The ElevenLabs client stays None when the package is missing, no key is
    # available, or construction fails.
    self.elevenlabs_client = None
    resolved_key = (elevenlabs_api_key or self.config.ELEVENLABS_API_KEY) if ELEVENLABS_AVAILABLE else None
    if resolved_key:
        try:
            self.elevenlabs_client = ElevenLabs(api_key=resolved_key)
            logger.info("ElevenLabs client initialized for podcast generation")
        except Exception as e:
            logger.error(f"Failed to initialize ElevenLabs client: {e}")

    # Podcast storage directory
    self.podcast_dir = Path("./data/podcasts")
    self.podcast_dir.mkdir(parents=True, exist_ok=True)

    # Metadata database file, created on first use
    self.metadata_file = self.podcast_dir / "metadata_db.json"
    self._ensure_metadata_db()
216
+
217
+ def _ensure_metadata_db(self):
218
+ """Ensure metadata database exists"""
219
+ if not self.metadata_file.exists():
220
+ self.metadata_file.write_text(json.dumps([], indent=2))
221
+
222
async def generate_podcast(
    self,
    document_ids: List[str],
    style: str = "conversational",
    duration_minutes: int = 10,
    host1_voice: str = "Rachel",
    host2_voice: str = "Adam"
) -> PodcastResult:
    """
    Generate a complete podcast from documents

    Runs the pipeline: analyse documents, generate the script, synthesize
    audio, then store metadata and the transcript. Any exception is caught and
    reported through the returned result instead of being raised.

    Args:
        document_ids: List of document IDs to analyze
        style: Podcast style (conversational, educational, technical, casual)
        duration_minutes: Target duration in minutes
        host1_voice: Voice name for first host
        host2_voice: Voice name for second host

    Returns:
        PodcastResult with audio file path and metadata. On failure ``success``
        is False, ``error`` holds the message and ``metadata`` is None.
    """
    start_time = datetime.now()
    podcast_id = str(uuid.uuid4())

    try:
        logger.info(f"Starting podcast generation {podcast_id}")
        logger.info(f"Documents: {document_ids}, Style: {style}, Duration: {duration_minutes}min")

        # Step 1: Analyze documents
        logger.info("Step 1: Analyzing documents...")
        analysis = await self.analyze_documents(document_ids)

        # Step 2: Generate script
        logger.info("Step 2: Generating podcast script...")
        script = await self.generate_script(analysis, style, duration_minutes)

        # Step 3: Synthesize audio
        logger.info("Step 3: Synthesizing audio with voices...")
        audio_file_path = await self.synthesize_audio(
            podcast_id,
            script,
            host1_voice,
            host2_voice
        )

        # Calculate generation time
        generation_time = (datetime.now() - start_time).total_seconds()

        # Step 4: Create metadata
        logger.info("Step 4: Creating metadata...")
        # PodcastMetadata.voices is a Dict[str, str]. A set literal here would lose
        # which voice belongs to which host, collapse identical voices into one
        # entry, and could not be serialised to JSON.
        # NOTE(review): the key names are an assumption - confirm against
        # _create_metadata and any reader of the stored metadata.
        voices = {"host1": host1_voice, "host2": host2_voice}
        metadata = self._create_metadata(
            podcast_id,
            analysis,
            script,
            audio_file_path,
            voices,
            document_ids,
            style
        )

        # Save metadata
        self._save_metadata(metadata)

        # Save transcript (rendered once and reused for the result)
        transcript = script.to_text()
        transcript_path = self.podcast_dir / f"{podcast_id}_transcript.txt"
        transcript_path.write_text(transcript, encoding="utf-8")

        logger.info(f"Podcast generated successfully: {podcast_id}")

        return PodcastResult(
            podcast_id=podcast_id,
            audio_file_path=str(audio_file_path),
            transcript=transcript,
            metadata=metadata,
            generation_time=generation_time,
            success=True
        )

    except Exception as e:
        logger.error(f"Podcast generation failed: {str(e)}", exc_info=True)
        return PodcastResult(
            podcast_id=podcast_id,
            audio_file_path="",
            transcript="",
            metadata=None,
            generation_time=(datetime.now() - start_time).total_seconds(),
            success=False,
            error=str(e)
        )
311
+
312
async def analyze_documents(self, document_ids: List[str]) -> DocumentAnalysis:
    """
    Query the LlamaIndex service about the documents and structure its answer

    Args:
        document_ids: List of document IDs

    Returns:
        DocumentAnalysis whose summary is the raw query answer and whose
        insights, topics and complexity are derived from that answer
    """
    joined_ids = ', '.join(document_ids)
    query_lines = (
        "Analyze the following documents and provide:",
        "1. The 5-7 most important insights or key points",
        "2. Main themes and topics covered",
        "3. The overall complexity level (beginner/intermediate/advanced)",
        "4. A brief summary suitable for podcast discussion",
        "",
        f"Document IDs: {joined_ids}",
        "",
        "Provide a structured analysis optimized for creating an engaging podcast discussion.",
    )
    analysis_query = "\n".join(query_lines)

    # The answer text is parsed heuristically below.
    answer = await self.llamaindex_service.query(analysis_query)

    extracted_insights = self._extract_insights(answer)
    extracted_topics = self._extract_topics(answer)
    level = self._determine_complexity(answer)

    return DocumentAnalysis(
        key_insights=extracted_insights[:7],  # Limit to 7
        topics=extracted_topics,
        complexity_level=level,
        estimated_words=len(answer.split()),
        source_documents=document_ids,
        summary=answer
    )
350
+
351
+ def _extract_insights(self, text: str) -> List[str]:
352
+ """Extract key insights from analysis text"""
353
+ insights = []
354
+ #Simple extraction based on numbered lists or bullet points
355
+ lines = text.split('\n')
356
+ for line in lines:
357
+ line = line.strip()
358
+ # Match patterns like "1.", "2.", "-", "*", "•"
359
+ if re.match(r'^\d+\.|\-|\*|•', line):
360
+ insight = re.sub(r'^\d+\.|\-|\*|•', '', line).strip()
361
+ if len(insight) > 20: # Ensure it's substantial
362
+ insights.append(insight)
363
+
364
+ # If no insights found, create from first few sentences
365
+ if not insights:
366
+ sentences = text.split('.')
367
+ insights = [s.strip() + '.' for s in sentences[:7] if len(s.strip()) > 20]
368
+
369
+ return insights
370
+
371
+ def _extract_topics(self, text: str) -> List[str]:
372
+ """Extract main topics from analysis"""
373
+ # Simple keyword extraction - could be enhanced with NLP
374
+ common_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'}
375
+ words = text.lower().split()
376
+ word_freq = {}
377
+
378
+ for word in words:
379
+ word = re.sub(r'[^\w\s]', '', word)
380
+ if len(word) > 4 and word not in common_words:
381
+ word_freq[word] = word_freq.get(word, 0) + 1
382
+
383
+ # Get top topics
384
+ topics = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:5]
385
+ return [topic[0].title() for topic in topics]
386
+
387
+ def _determine_complexity(self, text: str) -> str:
388
+ """Determine content complexity level"""
389
+ text_lower = text.lower()
390
+
391
+ # Simple heuristic based on keywords
392
+ if any(word in text_lower for word in ['basic', 'introduction', 'beginner', 'simple']):
393
+ return "beginner"
394
+ elif any(word in text_lower for word in ['advanced', 'complex', 'sophisticated', 'expert']):
395
+ return "advanced"
396
+ else:
397
+ return "intermediate"
398
+
399
async def generate_script(
    self,
    analysis: DocumentAnalysis,
    style: str,
    duration_minutes: int
) -> PodcastScript:
    """
    Turn a document analysis into a two-host podcast script via the LLM.

    Args:
        analysis: Document analysis results
        style: Podcast style; unknown styles fall back to the
            "conversational" prompt
        duration_minutes: Target duration

    Returns:
        PodcastScript whose word count and duration estimate are computed
        from the dialogue lines that were actually parsed
    """
    # Word budget for the requested running time
    target_words = duration_minutes * self.WORDS_PER_MINUTE

    numbered_insights = "\n".join(
        f"{position}. {insight}"
        for position, insight in enumerate(analysis.key_insights, start=1)
    )
    topic_list = ', '.join(analysis.topics)

    # Context block inserted into the prompt; the summary is cut at 500 characters
    analysis_context = f"""
KEY INSIGHTS:
{numbered_insights}

TOPICS: {topic_list}
COMPLEXITY: {analysis.complexity_level}

SUMMARY:
{analysis.summary[:500]}...
"""

    prompt_template = self.SCRIPT_PROMPTS.get(style, self.SCRIPT_PROMPTS["conversational"])
    prompt = prompt_template.format(
        analysis=analysis_context,
        duration_minutes=duration_minutes,
        word_count=target_words
    )

    script_text = await self.llm_service.generate_text(
        prompt,
        max_tokens=target_words * 2,  # Give room for generation
        temperature=0.8  # More creative
    )

    dialogue = self._parse_script(script_text)

    # Measure what was actually produced rather than trusting the target
    spoken_words = sum(len(entry.text.split()) for entry in dialogue)
    estimated_minutes = spoken_words / self.WORDS_PER_MINUTE

    return PodcastScript(
        dialogue=dialogue,
        total_duration_estimate=estimated_minutes * 60,  # Convert to seconds
        word_count=spoken_words,
        style=style
    )
461
+
462
def _parse_script(self, script_text: str) -> List[DialogueLine]:
    """
    Split generated script text into dialogue lines.

    Only lines that start with "HOST1:" or "HOST2:" (after trimming
    whitespace) are kept; every other line, and speaker lines with no
    text after the tag, are dropped.

    Args:
        script_text: Raw script text returned by the LLM

    Returns:
        DialogueLine objects in script order
    """
    dialogue = []

    for raw_line in script_text.split('\n'):
        line = raw_line.strip()
        for speaker in ("HOST1", "HOST2"):
            tag = speaker + ':'
            if not line.startswith(tag):
                continue
            spoken = line[len(tag):].strip()
            if spoken:
                dialogue.append(DialogueLine(speaker=speaker, text=spoken))
            break

    return dialogue
483
+
484
def _get_voice_id(self, voice_name: str) -> str:
    """
    Resolve an ElevenLabs voice name to its voice ID.

    Lookup order: case-insensitive exact name, then case-insensitive
    substring, then the first voice returned by the client.

    Args:
        voice_name: Voice name (e.g., "Rachel", "Adam")

    Returns:
        Voice ID string

    Raises:
        RuntimeError: If the voice list is empty or any step of the lookup fails
    """
    try:
        voices = self.elevenlabs_client.voices.get_all()

        if not voices or not voices.voices:
            logger.error("No voices available from ElevenLabs")
            raise RuntimeError("No voices available")

        available = voices.voices
        wanted = voice_name.lower()

        # Exact name match takes priority
        exact = next((v for v in available if v.name.lower() == wanted), None)
        if exact is not None:
            logger.info(f"Found exact voice match for '{voice_name}': {exact.voice_id}")
            return exact.voice_id

        # Then any voice whose name contains the requested name
        partial = next((v for v in available if wanted in v.name.lower()), None)
        if partial is not None:
            logger.info(f"Found partial voice match for '{voice_name}': {partial.name} ({partial.voice_id})")
            return partial.voice_id

        # Last resort: whatever voice comes first
        first_voice = available[0]
        logger.warning(f"Voice '{voice_name}' not found, using first available voice: {first_voice.name} ({first_voice.voice_id})")
        return first_voice.voice_id

    except Exception as e:
        # Also wraps the "No voices available" error raised above
        logger.error(f"Could not fetch voices: {e}", exc_info=True)
        raise RuntimeError(f"Failed to get voice ID: {str(e)}")
523
+
524
async def synthesize_audio(
    self,
    podcast_id: str,
    script: PodcastScript,
    host1_voice: str,
    host2_voice: str
) -> Path:
    """
    Render the script to one MP3 file through the ElevenLabs client.

    This simplified version speaks the whole script with the voice
    resolved from host1_voice; host2_voice is accepted but not used.

    Args:
        podcast_id: Unique podcast ID, used as the file name
        script: Podcast script
        host1_voice: Voice for HOST1
        host2_voice: Voice for HOST2

    Returns:
        Path to generated MP3 file

    Raises:
        RuntimeError: If the client is not initialized, the voice cannot be
            resolved, synthesis fails, or the written file is 1000 bytes
            or smaller
    """
    if not self.elevenlabs_client:
        raise RuntimeError("ElevenLabs client not initialized")

    audio_file = self.podcast_dir / f"{podcast_id}.mp3"
    full_text = script.to_text()

    # Resolved outside the try block so lookup errors keep their own message
    voice_id = self._get_voice_id(host1_voice)

    try:
        logger.info(f"Generating audio with voice: {host1_voice}")

        audio_generator = self.elevenlabs_client.text_to_speech.convert(
            voice_id=voice_id,  # Using resolved voice ID
            text=full_text,
            model_id="eleven_multilingual_v2"
        )

        # Stream the non-empty chunks straight to disk
        with open(audio_file, 'wb') as out:
            out.writelines(chunk for chunk in audio_generator if chunk)

        # Reject missing or near-empty output
        if not (audio_file.exists() and audio_file.stat().st_size > 1000):
            raise RuntimeError(f"Generated audio file is too small or empty: {audio_file.stat().st_size} bytes")

        logger.info(f"Audio synthesized successfully: {audio_file} ({audio_file.stat().st_size} bytes)")
        return audio_file

    except Exception as e:
        logger.error(f"Audio synthesis failed: {e}", exc_info=True)
        raise RuntimeError(f"Failed to generate podcast audio: {str(e)}")
585
+
586
def _create_metadata(
    self,
    podcast_id: str,
    analysis: DocumentAnalysis,
    script: PodcastScript,
    audio_path: Path,
    voices: set,
    document_ids: List[str],
    style: str
) -> PodcastMetadata:
    """
    Assemble the metadata record for a generated podcast.

    Args:
        podcast_id: Unique podcast ID
        analysis: Document analysis; its first topic becomes the title
        script: Generated script (word count and duration estimate)
        audio_path: Location of the audio file; size is 0 if it is missing
        voices: Voice names; the first two in iteration order are recorded
            as host1/host2, with "Rachel"/"Adam" as defaults
        document_ids: Source document IDs
        style: Podcast style

    Returns:
        PodcastMetadata for the podcast
    """
    # NOTE(review): a set has no defined order, so which voice ends up as
    # host1 vs host2 is arbitrary - confirm whether callers rely on it.
    voice_names = list(voices)

    lead_topic = analysis.topics[0] if analysis.topics else 'Document Discussion'
    title = f"Podcast: {lead_topic}"
    description = f"A {style} podcast discussing insights from {len(document_ids)} document(s)."

    if audio_path.exists():
        file_size_mb = audio_path.stat().st_size / (1024 * 1024)
    else:
        file_size_mb = 0

    # Rough cost estimates derived from the script word count
    llm_cost = (script.word_count / 1000) * 0.01
    tts_cost = (script.word_count * 5 / 1000) * 0.30

    host_voices = {
        "host1": voice_names[0] if len(voice_names) > 0 else "Rachel",
        "host2": voice_names[1] if len(voice_names) > 1 else "Adam",
    }

    return PodcastMetadata(
        podcast_id=podcast_id,
        title=title,
        description=description,
        source_documents=document_ids,
        style=style,
        duration_seconds=script.total_duration_estimate,
        file_size_mb=file_size_mb,
        voices=host_voices,
        generated_at=datetime.now().isoformat(),
        generation_cost={"llm_cost": llm_cost, "tts_cost": tts_cost, "total": llm_cost + tts_cost},
        key_topics=analysis.topics
    )
624
+
625
def _save_metadata(self, metadata: PodcastMetadata):
    """
    Append a podcast's metadata to the JSON metadata file.

    The file is read, extended by one record and rewritten in full.
    Failures are logged and not re-raised.

    Args:
        metadata: Metadata record to persist
    """
    try:
        records = json.loads(self.metadata_file.read_text())
        records.append(asdict(metadata))

        serialized = json.dumps(records, indent=2)
        self.metadata_file.write_text(serialized)

        logger.info(f"Metadata saved for podcast: {metadata.podcast_id}")

    except Exception as e:
        # Best effort: a failed save must not abort the caller
        logger.error(f"Failed to save metadata: {e}")
641
+
642
def list_podcasts(self, limit: int = 10) -> List[PodcastMetadata]:
    """
    List generated podcasts, most recent first.

    Args:
        limit: Maximum number of podcasts to return; zero or a negative
            value yields an empty list

    Returns:
        Up to `limit` PodcastMetadata records read from the metadata file,
        or an empty list if the file cannot be read or parsed
    """
    # data[-0:] is the whole list and data[-(-n):] skips the oldest n
    # entries, so non-positive limits have to be handled before slicing.
    if limit <= 0:
        return []

    try:
        data = json.loads(self.metadata_file.read_text())
        # Records are appended chronologically; reverse the tail for newest-first
        return [PodcastMetadata(**item) for item in reversed(data[-limit:])]
    except Exception as e:
        logger.error(f"Failed to list podcasts: {e}")
        return []
651
+
652
def get_podcast(self, podcast_id: str) -> Optional[PodcastMetadata]:
    """
    Look up the metadata of a single podcast.

    Args:
        podcast_id: ID of the podcast to find

    Returns:
        The first record whose 'podcast_id' matches, or None when there is
        no match or the metadata file cannot be read or parsed
    """
    try:
        records = json.loads(self.metadata_file.read_text())
        found = next(
            (record for record in records if record.get('podcast_id') == podcast_id),
            None,
        )
        if found is None:
            return None
        return PodcastMetadata(**found)
    except Exception as e:
        logger.error(f"Failed to get podcast: {e}")
        return None
663
+
services/vector_store_service.py ADDED
@@ -0,0 +1,294 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ import pickle
4
+ import numpy as np
5
+ from typing import List, Dict, Any, Optional, Tuple
6
+ import faiss
7
+ from pathlib import Path
8
+ import asyncio
9
+ import json
10
+
11
+ from core.models import SearchResult, Chunk
12
+ import config
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
class VectorStoreService:
    """
    FAISS-backed vector store for document chunks.

    Vectors are kept in a flat inner-product index. The payload of the
    vector at FAISS position ``i`` is stored in ``chunks_metadata[str(i)]``
    (string keys, because the mapping is persisted as JSON). Index and
    metadata are written to the directory given by
    ``config.VECTOR_STORE_PATH`` after every mutation.
    """

    def __init__(self):
        """Set up paths from the configuration and load any persisted index."""
        self.config = config.config
        self.index = None
        self.chunks_metadata = {}  # Maps index position (as str) to chunk metadata
        self.dimension = None

        # Paths
        self.store_path = Path(self.config.VECTOR_STORE_PATH)
        self.store_path.mkdir(parents=True, exist_ok=True)

        self.index_path = self.store_path / f"{self.config.INDEX_NAME}.index"
        self.metadata_path = self.store_path / f"{self.config.INDEX_NAME}_metadata.json"

        # Load existing index if available
        self._load_index()

    def _load_index(self):
        """
        Load existing FAISS index and metadata.

        Errors are logged and leave the store empty; the in-memory state is
        only replaced once both files were read successfully.
        """
        try:
            if not (self.index_path.exists() and self.metadata_path.exists()):
                logger.info("No existing index found, will create new one")
                return

            logger.info("Loading existing FAISS index...")

            loaded_index = faiss.read_index(str(self.index_path))
            with open(self.metadata_path, 'r') as f:
                loaded_metadata = json.load(f)

            # Commit after both reads: a failed metadata read must not leave
            # an index whose vectors have no payload.
            self.index = loaded_index
            self.dimension = loaded_index.d
            self.chunks_metadata = loaded_metadata

            logger.info(f"Loaded index with {self.index.ntotal} vectors, dimension {self.dimension}")
        except Exception as e:
            logger.error(f"Error loading index: {str(e)}")

    def _initialize_index(self, dimension: int):
        """
        Initialize a new, empty FAISS index and clear the metadata.

        Args:
            dimension: Dimensionality of the vectors the index will hold
        """
        try:
            # Inner product equals cosine similarity only for normalized
            # embeddings; this class does not normalize them itself.
            self.index = faiss.IndexFlatIP(dimension)
            self.dimension = dimension
            self.chunks_metadata = {}
            logger.info(f"Initialized new FAISS index with dimension {dimension}")
        except Exception as e:
            logger.error(f"Error initializing index: {str(e)}")
            raise

    async def add_chunks(self, chunks: List[Chunk]) -> bool:
        """
        Add chunks to the vector store.

        Chunks without an embedding are skipped. If the embeddings have a
        different dimension than the current index, the index is discarded
        and rebuilt from this batch only.

        Args:
            chunks: Chunks to add

        Returns:
            True on success or when `chunks` is empty; False when no chunk
            has an embedding or an error occurred
        """
        if not chunks:
            return True

        try:
            # Explicit None/length test: truthiness of a numpy array raises
            embedded_chunks = [
                chunk for chunk in chunks
                if chunk.embedding is not None and len(chunk.embedding) > 0
            ]

            if not embedded_chunks:
                logger.warning("No valid embeddings found in chunks")
                return False

            embeddings = [chunk.embedding for chunk in embedded_chunks]

            # Check for dimension mismatch
            if self.index is not None and self.dimension is not None:
                if len(embeddings[0]) != self.dimension:
                    logger.warning(f"Dimension mismatch! New embeddings have {len(embeddings[0])}, but index has {self.dimension}. Rebuilding index.")
                    # Reset index
                    self.index = None
                    self.chunks_metadata = {}
                    self.dimension = None

            # Initialize index if needed
            if self.index is None:
                self._initialize_index(len(embeddings[0]))

            # Convert before touching the index so a malformed batch changes nothing
            embeddings_array = np.array(embeddings, dtype=np.float32)

            # Keys are assigned only now, after a possible reset, and start at
            # the number of vectors already stored, so that key i always
            # refers to FAISS position i.
            first_position = self.index.ntotal
            new_metadata = {
                str(first_position + offset): {
                    "chunk_id": chunk.id,
                    "document_id": chunk.document_id,
                    "content": chunk.content,
                    "chunk_index": chunk.chunk_index,
                    "start_pos": chunk.start_pos,
                    "end_pos": chunk.end_pos,
                    "metadata": chunk.metadata
                }
                for offset, chunk in enumerate(embedded_chunks)
            }

            # Add to FAISS index
            self.index.add(embeddings_array)

            # Update metadata
            self.chunks_metadata.update(new_metadata)

            # Save index and metadata
            await self._save_index()

            logger.info(f"Added {len(embeddings)} chunks to vector store")
            return True

        except Exception as e:
            logger.error(f"Error adding chunks to vector store: {str(e)}")
            return False

    async def search(self, query_embedding: List[float], top_k: int = 5,
                     filters: Optional[Dict[str, Any]] = None) -> List[SearchResult]:
        """
        Search for similar chunks.

        Args:
            query_embedding: Query vector
            top_k: Maximum number of results to return
            filters: Optional metadata filters (see `_apply_filters`)

        Returns:
            Up to `top_k` SearchResult objects ordered by descending score;
            an empty list if the store is empty, `top_k` is not positive,
            or an error occurred
        """
        if self.index is None or self.index.ntotal == 0:
            logger.warning("No index available or index is empty")
            return []

        if top_k <= 0:
            return []

        try:
            # Convert query embedding to numpy array
            query_array = np.array([query_embedding], dtype=np.float32)

            # Filters are applied after the FAISS lookup. With filters, fetch
            # every vector so that matches ranked below top_k are not lost;
            # the list is trimmed to top_k further down.
            if filters:
                fetch_k = self.index.ntotal
            else:
                fetch_k = min(top_k, self.index.ntotal)

            # Perform search
            scores, indices = self.index.search(query_array, fetch_k)

            # Convert results to SearchResult objects
            results = []
            for score, idx in zip(scores[0], indices[0]):
                if idx == -1:  # FAISS returns -1 for empty slots
                    continue

                chunk_metadata = self.chunks_metadata.get(str(idx))
                if not chunk_metadata:
                    continue

                # Apply filters if specified
                if filters and not self._apply_filters(chunk_metadata, filters):
                    continue

                results.append(SearchResult(
                    chunk_id=chunk_metadata["chunk_id"],
                    document_id=chunk_metadata["document_id"],
                    content=chunk_metadata["content"],
                    score=float(score),
                    metadata=chunk_metadata.get("metadata", {})
                ))

            # Sort by score (descending) and keep the requested amount
            results.sort(key=lambda x: x.score, reverse=True)
            results = results[:top_k]

            logger.info(f"Found {len(results)} search results")
            return results

        except Exception as e:
            logger.error(f"Error searching vector store: {str(e)}")
            return []

    def _apply_filters(self, chunk_metadata: Dict[str, Any], filters: Dict[str, Any]) -> bool:
        """
        Check a chunk against the given filters.

        Supported keys: "document_id" (equality), "document_ids"
        (membership), "content_length_min" and "content_length_max"
        (bounds on the content length). Unknown keys are ignored.

        Args:
            chunk_metadata: Stored metadata of one chunk
            filters: Filter name to filter value

        Returns:
            True if the chunk passes every filter. Also True when evaluating
            a filter raises: a malformed filter does not exclude chunks.
        """
        try:
            for key, value in filters.items():
                if key == "document_id":
                    if chunk_metadata.get("document_id") != value:
                        return False
                elif key == "document_ids":
                    if chunk_metadata.get("document_id") not in value:
                        return False
                elif key == "content_length_min":
                    if len(chunk_metadata.get("content", "")) < value:
                        return False
                elif key == "content_length_max":
                    if len(chunk_metadata.get("content", "")) > value:
                        return False
                # Add more filter types as needed

            return True
        except Exception as e:
            logger.error(f"Error applying filters: {str(e)}")
            return True

    async def _save_index(self):
        """
        Save the FAISS index and metadata to disk.

        Does nothing when there is no index. Errors are logged and not
        re-raised.
        """
        try:
            if self.index is not None:
                # Save FAISS index
                faiss.write_index(self.index, str(self.index_path))

                # Save metadata
                with open(self.metadata_path, 'w') as f:
                    json.dump(self.chunks_metadata, f, indent=2)

                logger.debug("Saved index and metadata to disk")
        except Exception as e:
            logger.error(f"Error saving index: {str(e)}")

    async def get_stats(self) -> Dict[str, Any]:
        """
        Get statistics about the vector store.

        Returns:
            Dict with vector count, dimension, index type, number of metadata
            entries and whether the two files exist; {"error": ...} on failure
        """
        try:
            has_index = self.index is not None
            return {
                "total_vectors": self.index.ntotal if has_index else 0,
                "dimension": self.dimension,
                "index_type": type(self.index).__name__ if has_index else None,
                "metadata_entries": len(self.chunks_metadata),
                "index_file_exists": self.index_path.exists(),
                "metadata_file_exists": self.metadata_path.exists()
            }
        except Exception as e:
            logger.error(f"Error getting stats: {str(e)}")
            return {"error": str(e)}

    async def delete_document(self, document_id: str) -> bool:
        """
        Delete all chunks for a specific document.

        The index is rebuilt from the remaining vectors and their metadata
        is renumbered to the new positions.

        Args:
            document_id: ID of the document whose chunks are removed

        Returns:
            True if chunks were found and removed; False if the document has
            no chunks or an error occurred
        """
        try:
            # Set for O(1) membership tests in the rebuild loop below
            indices_to_remove = {
                int(idx)
                for idx, metadata in self.chunks_metadata.items()
                if metadata.get("document_id") == document_id
            }

            if not indices_to_remove:
                logger.warning(f"No chunks found for document {document_id}")
                return False

            # FAISS doesn't support removing individual vectors efficiently
            # We need to rebuild the index without the removed vectors
            if self.index is not None and self.index.ntotal > 0:
                kept_embeddings = []
                new_metadata = {}

                for old_idx in range(self.index.ntotal):
                    if old_idx in indices_to_remove:
                        continue

                    # The new position is the number of vectors kept so far,
                    # which keeps metadata aligned even if an entry is missing
                    old_metadata = self.chunks_metadata.get(str(old_idx))
                    if old_metadata:
                        new_metadata[str(len(kept_embeddings))] = old_metadata

                    kept_embeddings.append(self.index.reconstruct(old_idx))

                # Creates an empty index and clears chunks_metadata
                self._initialize_index(self.dimension)

                if kept_embeddings:
                    embeddings_array = np.array(kept_embeddings, dtype=np.float32)
                    self.index.add(embeddings_array)
                    self.chunks_metadata = new_metadata

            # Save updated index
            await self._save_index()

            logger.info(f"Deleted {len(indices_to_remove)} chunks for document {document_id}")
            return True

        except Exception as e:
            logger.error(f"Error deleting document chunks: {str(e)}")
            return False

    async def clear_all(self) -> bool:
        """
        Clear all data from the vector store, in memory and on disk.

        Returns:
            True on success, False if an error occurred
        """
        try:
            self.index = None
            self.chunks_metadata = {}
            self.dimension = None

            # Remove files
            if self.index_path.exists():
                self.index_path.unlink()
            if self.metadata_path.exists():
                self.metadata_path.unlink()

            logger.info("Cleared all data from vector store")
            return True
        except Exception as e:
            logger.error(f"Error clearing vector store: {str(e)}")
            return False