Spaces:

MCP-1st-Birthday
/

AI-Digital-Library-Assistant

Running

App Files Files Community

Nihal2000 committed on 11 days ago

Commit

86aa5e4

1 Parent(s): 2bc9ae2

Initial deployment of AI Digital Library Assistant

Browse files

Files changed (48) hide show

.dockerignore +67 -0
Dockerfile +31 -0
README.md +30 -8
app.py +1374 -0
config.py +56 -0
core/__init__.py +1 -0
core/__pycache__/__init__.cpython-313.pyc +0 -0
core/__pycache__/chunker.cpython-313.pyc +0 -0
core/__pycache__/document_parser.cpython-313.pyc +0 -0
core/__pycache__/models.cpython-313.pyc +0 -0
core/__pycache__/text_preprocessor.cpython-313.pyc +0 -0
core/chunker.py +303 -0
core/document_parser.py +199 -0
core/models.py +102 -0
core/text_preprocessor.py +186 -0
mcp_server.py +290 -0
mcp_tools/__init__.py +1 -0
mcp_tools/__pycache__/__init__.cpython-313.pyc +0 -0
mcp_tools/__pycache__/generative_tool.cpython-313.pyc +0 -0
mcp_tools/__pycache__/ingestion_tool.cpython-313.pyc +0 -0
mcp_tools/__pycache__/podcast_tool.cpython-313.pyc +0 -0
mcp_tools/__pycache__/search_tool.cpython-313.pyc +0 -0
mcp_tools/__pycache__/voice_tool.cpython-313.pyc +0 -0
mcp_tools/generative_tool.py +407 -0
mcp_tools/ingestion_tool.py +368 -0
mcp_tools/podcast_tool.py +138 -0
mcp_tools/search_tool.py +437 -0
mcp_tools/utils.py +373 -0
mcp_tools/voice_tool.py +63 -0
requirements.txt +31 -0
services/__init__.py +1 -0
services/__pycache__/__init__.cpython-313.pyc +0 -0
services/__pycache__/document_store_service.cpython-313.pyc +0 -0
services/__pycache__/elevenlabs_service.cpython-313.pyc +0 -0
services/__pycache__/embedding_service.cpython-313.pyc +0 -0
services/__pycache__/llamaindex_service.cpython-313.pyc +0 -0
services/__pycache__/llm_service.cpython-313.pyc +0 -0
services/__pycache__/ocr_service.cpython-313.pyc +0 -0
services/__pycache__/podcast_generator_service.cpython-313.pyc +0 -0
services/__pycache__/vector_store_service.cpython-313.pyc +0 -0
services/document_store_service.py +349 -0
services/elevenlabs_service.py +341 -0
services/embedding_service.py +243 -0
services/llamaindex_service.py +199 -0
services/llm_service.py +420 -0
services/ocr_service.py +288 -0
services/podcast_generator_service.py +663 -0
services/vector_store_service.py +294 -0

.dockerignore ADDED Viewed

	@@ -0,0 +1,67 @@

+# Python cache
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+# Virtual environments
+venv/
+env/
+ENV/
+.venv
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+# Git
+.git/
+.gitignore
+.gitattributes
+# CI/CD
+.github/
+.gitlab-ci.yml
+# Documentation
+README.md
+docs/
+*.md
+!requirements.txt
+# Test files
+test_*.py
+*_test.py
+tests/
+.pytest_cache/
+# Large data files (these should be in volumes)
+data/
+vector_store/
+documents/
+podcasts/
+*.db
+*.sqlite
+# Logs
+*.log
+logs/
+# OS files
+.DS_Store
+Thumbs.db
+# Deployment files (not needed in container)
+deploy_from_env.py
+modal_deploy.py
+blaxel.yaml
+bl.cmd
+test_persistence.py
+# Environment files
+.env
+.env.*

Dockerfile ADDED Viewed

	@@ -0,0 +1,31 @@

+FROM python:3.11-slim
+WORKDIR /app
+# Install system dependencies
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    ca-certificates \
+    tesseract-ocr \
+    && rm -rf /var/lib/apt/lists/*
+# Copy requirements and install Python dependencies
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+# Copy application code
+COPY . .
+# Create data directories
+RUN mkdir -p /data/vector_store /data/documents /data/podcasts
+# Expose port (HuggingFace Spaces uses 7860)
+EXPOSE 7860
+# Set environment variables
+ENV GRADIO_SERVER_NAME="0.0.0.0"
+ENV GRADIO_SERVER_PORT=7860
+# Run the MCP server
+CMD ["python", "mcp_server.py"]

README.md CHANGED Viewed

@@ -1,12 +1,34 @@
 ---
-title: AiDigitalLibraryAssistant
-emoji: 🏢
-colorFrom: green
-colorTo: blue
-sdk: gradio
-sdk_version: 6.0.1
-app_file: app.py
 pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: AI Digital Library Assistant
+emoji: 📚
+colorFrom: blue
+colorTo: indigo
+sdk: docker
+app_port: 7860
 pinned: false
+license: mit
 ---
+# AI Digital Library Assistant
+An intelligent document organization and retrieval system powered by AI.
+## Features
+- **Document Ingestion**: Upload PDF, DOCX, TXT, and image files
+- **Semantic Search**: Find documents using natural language queries
+- **AI Q&A**: Ask questions about your document library
+- **Voice Assistant**: Talk to your library using ElevenLabs voice AI
+- **Podcast Generation**: Turn documents into engaging audio podcasts
+## Setup
+This Space is configured to run using Docker. It requires several API keys to function fully:
+- `OPENAI_API_KEY`: For embeddings and LLM
+- `ANTHROPIC_API_KEY`: For Claude 3.5 Sonnet
+- `MISTRAL_API_KEY`: For Mistral models and OCR
+- `ELEVENLABS_API_KEY`: For voice features
+- `ELEVENLABS_AGENT_ID`: For conversational AI agent
+Please set these in the Space Settings -> Variables and Secrets.

app.py ADDED Viewed

	@@ -0,0 +1,1374 @@

+import gradio as gr
+import os
+import asyncio
+import json
+import logging
+import tempfile
+import uuid
+from datetime import datetime
+from pathlib import Path
+from typing import List, Dict, Any, Optional
+import nest_asyncio
+# Apply nest_asyncio to handle nested event loops in Gradio
+nest_asyncio.apply()
+# Import our custom modules
+from mcp_tools.ingestion_tool import IngestionTool
+from mcp_tools.search_tool import SearchTool
+from mcp_tools.generative_tool import GenerativeTool
+from services.vector_store_service import VectorStoreService
+from services.document_store_service import DocumentStoreService
+from services.embedding_service import EmbeddingService
+from services.llm_service import LLMService
+from services.ocr_service import OCRService
+from core.models import SearchResult, Document
+import config
+# Setup logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+# Import our custom modules
+from mcp_tools.ingestion_tool import IngestionTool
+from mcp_tools.search_tool import SearchTool
+from mcp_tools.generative_tool import GenerativeTool
+from services.vector_store_service import VectorStoreService
+from services.document_store_service import DocumentStoreService
+from services.embedding_service import EmbeddingService
+from services.llm_service import LLMService
+from services.ocr_service import OCRService
+from core.models import SearchResult, Document
+import config
+from services.llamaindex_service import LlamaIndexService
+from services.elevenlabs_service import ElevenLabsService
+from services.podcast_generator_service import PodcastGeneratorService
+from mcp_tools.voice_tool import VoiceTool
+from mcp_tools.podcast_tool import PodcastTool
+# Setup logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+class ContentOrganizerMCPServer:
+    def __init__(self):
+        # Initialize services
+        logger.info("Initializing Content Organizer MCP Server...")
+        self.vector_store = VectorStoreService()
+        self.document_store = DocumentStoreService()
+        self.embedding_service = EmbeddingService()
+        self.llm_service = LLMService()
+        self.ocr_service = OCRService()
+        self.llamaindex_service = LlamaIndexService(self.document_store)
+        # Initialize ElevenLabs voice service
+        self.elevenlabs_service = ElevenLabsService(self.llamaindex_service)
+        # Initialize Podcast Generator
+        self.podcast_generator = PodcastGeneratorService(
+            llamaindex_service=self.llamaindex_service,
+            llm_service=self.llm_service
+        )
+        # Initialize tools
+        self.ingestion_tool = IngestionTool(
+            vector_store=self.vector_store,
+            document_store=self.document_store,
+            embedding_service=self.embedding_service,
+            ocr_service=self.ocr_service
+        )
+        self.search_tool = SearchTool(
+            vector_store=self.vector_store,
+            embedding_service=self.embedding_service,
+            document_store=self.document_store
+        )
+        self.generative_tool = GenerativeTool(
+            llm_service=self.llm_service,
+            search_tool=self.search_tool
+        )
+        self.voice_tool = VoiceTool(self.elevenlabs_service)
+        self.podcast_tool = PodcastTool(self.podcast_generator)
+        # Track processing status
+        self.processing_status = {}
+        # Document cache for quick access
+        self.document_cache = {}
+        logger.info("Content Organizer MCP Server initialized successfully!")
+    def run_async(self, coro):
+        """Helper to run async functions in Gradio"""
+        try:
+            loop = asyncio.get_event_loop()
+        except RuntimeError:
+            loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(loop)
+        if loop.is_running():
+            # If loop is already running, create a task
+            import concurrent.futures
+            with concurrent.futures.ThreadPoolExecutor() as executor:
+                future = executor.submit(asyncio.run, coro)
+                return future.result()
+        else:
+            return loop.run_until_complete(coro)
+    async def ingest_document_async(self, file_path: str, file_type: str) -> Dict[str, Any]:
+        """MCP Tool: Ingest and process a document"""
+        try:
+            task_id = str(uuid.uuid4())
+            self.processing_status[task_id] = {"status": "processing", "progress": 0}
+            result = await self.ingestion_tool.process_document(file_path, file_type, task_id)
+            if result.get("success"):
+                self.processing_status[task_id] = {"status": "completed", "progress": 100}
+                doc_id = result.get("document_id")
+                if doc_id:
+                    doc = await self.document_store.get_document(doc_id)
+                    if doc:
+                        self.document_cache[doc_id] = doc
+                return result
+            else:
+                self.processing_status[task_id] = {"status": "failed", "error": result.get("error")}
+                return result
+        except Exception as e:
+            logger.error(f"Document ingestion failed: {str(e)}")
+            return {"success": False, "error": str(e), "message": "Failed to process document"}
+    async def get_document_content_async(self, document_id: str) -> Optional[str]:
+        """Get document content by ID"""
+        try:
+            # Check cache first
+            if document_id in self.document_cache:
+                return self.document_cache[document_id].content
+            # Get from store
+            doc = await self.document_store.get_document(document_id)
+            if doc:
+                self.document_cache[document_id] = doc
+                return doc.content
+            return None
+        except Exception as e:
+            logger.error(f"Error getting document content: {str(e)}")
+            return None
+    async def semantic_search_async(self, query: str, top_k: int = 5, filters: Optional[Dict] = None) -> Dict[str, Any]:
+        """MCP Tool: Perform semantic search"""
+        try:
+            results = await self.search_tool.search(query, top_k, filters)
+            return {"success": True, "query": query, "results": [result.to_dict() for result in results], "total_results": len(results)}
+        except Exception as e:
+            logger.error(f"Semantic search failed: {str(e)}")
+            return {"success": False, "error": str(e), "query": query, "results": []}
+    async def summarize_content_async(self, content: str = None, document_id: str = None, style: str = "concise") -> Dict[str, Any]:
+        try:
+            if document_id and document_id != "none":
+                content = await self.get_document_content_async(document_id)
+                if not content:
+                    return {"success": False, "error": f"Document {document_id} not found"}
+            if not content or not content.strip():
+                return {"success": False, "error": "No content provided for summarization"}
+            max_content_length = 4000
+            if len(content) > max_content_length:
+                content = content[:max_content_length] + "..."
+            summary = await self.generative_tool.summarize(content, style)
+            return {"success": True, "summary": summary, "original_length": len(content), "summary_length": len(summary), "style": style, "document_id": document_id}
+        except Exception as e:
+            logger.error(f"Summarization failed: {str(e)}")
+            return {"success": False, "error": str(e)}
+    async def generate_tags_async(self, content: str = None, document_id: str = None, max_tags: int = 5) -> Dict[str, Any]:
+        """MCP Tool: Generate tags for content"""
+        try:
+            if document_id and document_id != "none":
+                content = await self.get_document_content_async(document_id)
+                if not content:
+                    return {"success": False, "error": f"Document {document_id} not found"}
+            if not content or not content.strip():
+                return {"success": False, "error": "No content provided for tag generation"}
+            tags = await self.generative_tool.generate_tags(content, max_tags)
+            if document_id and document_id != "none" and tags:
+                await self.document_store.update_document_metadata(document_id, {"tags": tags})
+            return {"success": True, "tags": tags, "content_length": len(content), "document_id": document_id}
+        except Exception as e:
+            logger.error(f"Tag generation failed: {str(e)}")
+            return {"success": False, "error": str(e)}
+    async def generate_podcast_async(
+        self,
+        document_ids: List[str],
+        style: str = "conversational",
+        duration_minutes: int = 10,
+        host1_voice: str = "Rachel",
+        host2_voice: str = "Adam"
+    ) -> Dict[str, Any]:
+        """Generate podcast from documents"""
+        try:
+            result = await self.podcast_tool.generate_podcast(
+                document_ids=document_ids,
+                style=style,
+                duration_minutes=duration_minutes,
+                host1_voice=host1_voice,
+                host2_voice=host2_voice
+            )
+            return result
+        except Exception as e:
+            logger.error(f"Podcast generation failed: {str(e)}")
+            return {"success": False, "error": str(e)}
+    async def answer_question_async(self, question: str, context_filter: Optional[Dict] = None) -> Dict[str, Any]:
+        try:
+            search_results = await self.search_tool.search(question, top_k=5, filters=context_filter)
+            if not search_results:
+                return {"success": False, "error": "No relevant context found in your documents. Please make sure you have uploaded relevant documents.", "question": question}
+            answer = await self.generative_tool.answer_question(question, search_results)
+            return {"success": True, "question": question, "answer": answer, "sources": [result.to_dict() for result in search_results], "confidence": "high" if len(search_results) >= 3 else "medium"}
+        except Exception as e:
+            logger.error(f"Question answering failed: {str(e)}")
+            return {"success": False, "error": str(e), "question": question}
+    async def generate_outline_async(self, topic: str, num_sections: int = 5, detail_level: str = "medium") -> Dict[str, Any]:
+        try:
+            outline = await self.generative_tool.generate_outline(topic, num_sections, detail_level)
+            return {"success": True, "result": outline}
+        except Exception as e:
+            return {"success": False, "error": str(e)}
+    async def explain_concept_async(self, concept: str, audience: str = "general", length: str = "medium") -> Dict[str, Any]:
+        try:
+            explanation = await self.generative_tool.explain_concept(concept, audience, length)
+            return {"success": True, "result": explanation}
+        except Exception as e:
+            return {"success": False, "error": str(e)}
+    async def paraphrase_text_async(self, text: str, style: str = "formal") -> Dict[str, Any]:
+        try:
+            paraphrase = await self.generative_tool.paraphrase_text(text, style)
+            return {"success": True, "result": paraphrase}
+        except Exception as e:
+            return {"success": False, "error": str(e)}
+    async def categorize_content_async(self, content: str, categories: List[str]) -> Dict[str, Any]:
+        try:
+            category = await self.generative_tool.categorize(content, categories)
+            return {"success": True, "result": category}
+        except Exception as e:
+            return {"success": False, "error": str(e)}
+    async def extract_key_insights_async(self, content: str, num_insights: int = 5) -> Dict[str, Any]:
+        try:
+            insights = await self.generative_tool.extract_key_insights(content, num_insights)
+            return {"success": True, "result": "\n".join([f"- {insight}" for insight in insights])}
+        except Exception as e:
+            return {"success": False, "error": str(e)}
+    async def generate_questions_async(self, content: str, question_type: str = "comprehension", num_questions: int = 5) -> Dict[str, Any]:
+        try:
+            questions = await self.generative_tool.generate_questions(content, question_type, num_questions)
+            return {"success": True, "result": "\n".join([f"{i+1}. {q}" for i, q in enumerate(questions)])}
+        except Exception as e:
+            return {"success": False, "error": str(e)}
+    async def extract_key_information_async(self, content: str) -> Dict[str, Any]:
+        try:
+            info = await self.llm_service.extract_key_information(content)
+            return {"success": True, "result": json.dumps(info, indent=2)}
+        except Exception as e:
+            return {"success": False, "error": str(e)}
+    def list_documents_sync(self, limit: int = 100, offset: int = 0) -> Dict[str, Any]:
+        try:
+            documents = self.run_async(self.document_store.list_documents(limit, offset))
+            return {"success": True, "documents": [doc.to_dict() for doc in documents], "total": len(documents)}
+        except Exception as e:
+            return {"success": False, "error": str(e)}
+mcp_server = ContentOrganizerMCPServer()  # module-level instance used by the UI handlers below; constructed when this module is imported
+def get_document_list():
+    try:
+        result = mcp_server.list_documents_sync(limit=100)
+        if result["success"]:
+            if result["documents"]:
+                doc_list_str = "📚 Documents in Library:\n\n"
+                for i, doc_item in enumerate(result["documents"], 1):
+                    doc_list_str += f"{i}. {doc_item['filename']} (ID: {doc_item['id'][:8]}...)\n"
+                    doc_list_str += f"   Type: {doc_item['doc_type']}, Size: {doc_item['file_size']} bytes\n"
+                    if doc_item.get('tags'):
+                        doc_list_str += f"   Tags: {', '.join(doc_item['tags'])}\n"
+                    doc_list_str += f"   Created: {doc_item['created_at'][:10]}\n\n"
+                return doc_list_str
+            else:
+                return "No documents in library yet. Upload some documents to get started!"
+        else:
+            return f"Error loading documents: {result['error']}"
+    except Exception as e:
+        return f"Error: {str(e)}"
+def get_document_choices():
+    try:
+        result = mcp_server.list_documents_sync(limit=100)
+        if result["success"] and result["documents"]:
+            choices = [(f"{doc['filename']} ({doc['id'][:8]}...)", doc['id']) for doc in result["documents"]]
+            logger.info(f"Generated {len(choices)} document choices")
+            return choices
+        return []
+    except Exception as e:
+        logger.error(f"Error getting document choices: {str(e)}")
+        return []
+def refresh_library():
+    doc_list_refreshed = get_document_list()
+    doc_choices_refreshed = get_document_choices()
+    logger.info(f"Refreshing library. Found {len(doc_choices_refreshed)} choices.")
+    return (
+        doc_list_refreshed,
+        gr.update(choices=doc_choices_refreshed),
+        gr.update(choices=doc_choices_refreshed),
+        gr.update(choices=doc_choices_refreshed)
+    )
+def upload_and_process_file(file):
+    if file is None:
+        doc_list_initial = get_document_list()
+        doc_choices_initial = get_document_choices()
+        return (
+            "No file uploaded", "", doc_list_initial,
+            gr.update(choices=doc_choices_initial),
+            gr.update(choices=doc_choices_initial),
+            gr.update(choices=doc_choices_initial)
+        )
+    try:
+        file_path = file.name if hasattr(file, 'name') else str(file)
+        file_type = Path(file_path).suffix.lower().strip('.') # Ensure suffix is clean
+        logger.info(f"Processing file: {file_path}, type: {file_type}")
+        result = mcp_server.run_async(mcp_server.ingest_document_async(file_path, file_type))
+        doc_list_updated = get_document_list()
+        doc_choices_updated = get_document_choices()
+        if result["success"]:
+            return (
+                f"✅ Success: {result['message']}\nDocument ID: {result['document_id']}\nChunks created: {result['chunks_created']}",
+                result["document_id"],
+                doc_list_updated,
+                gr.update(choices=doc_choices_updated),
+                gr.update(choices=doc_choices_updated),
+                gr.update(choices=doc_choices_updated)
+            )
+        else:
+            return (
+                f"❌ Error: {result.get('error', 'Unknown error')}", "",
+                doc_list_updated,
+                gr.update(choices=doc_choices_updated),
+                gr.update(choices=doc_choices_updated),
+                gr.update(choices=doc_choices_updated)
+            )
+    except Exception as e:
+        logger.error(f"Error processing file: {str(e)}")
+        doc_list_error = get_document_list()
+        doc_choices_error = get_document_choices()
+        return (
+            f"❌ Error: {str(e)}", "",
+            doc_list_error,
+            gr.update(choices=doc_choices_error),
+            gr.update(choices=doc_choices_error),
+            gr.update(choices=doc_choices_error)
+        )
+def perform_search(query, top_k):
+    if not query.strip():
+        return "Please enter a search query"
+    try:
+        result = mcp_server.run_async(mcp_server.semantic_search_async(query, int(top_k)))
+        if result["success"]:
+            if result["results"]:
+                output_str = f"🔍 Found {result['total_results']} results for: '{query}'\n\n"
+                for i, res_item in enumerate(result["results"], 1):
+                    output_str += f"Result {i}:\n"
+                    output_str += f"📊 Relevance Score: {res_item['score']:.3f}\n"
+                    output_str += f"📄 Content: {res_item['content'][:300]}...\n"
+                    if 'document_filename' in res_item.get('metadata', {}):
+                        output_str += f"📁 Source: {res_item['metadata']['document_filename']}\n"
+                    output_str += f"🔗 Document ID: {res_item.get('document_id', 'Unknown')}\n"
+                    output_str += "-" * 80 + "\n\n"
+                return output_str
+            else:
+                return f"No results found for: '{query}'\n\nMake sure you have uploaded relevant documents first."
+        else:
+            return f"❌ Search failed: {result['error']}"
+    except Exception as e:
+        logger.error(f"Search error: {str(e)}")
+        return f"❌ Error: {str(e)}"
+def update_options_visibility(task):
+    """Update visibility of options based on selected task"""
+    return (
+        gr.update(visible=task == "Summarize"),          # summary_style
+        gr.update(visible=task == "Generate Outline"),   # outline_sections
+        gr.update(visible=task == "Generate Outline"),   # outline_detail
+        gr.update(visible=task == "Explain Concept"),    # explain_audience
+        gr.update(visible=task == "Explain Concept"),    # explain_length
+        gr.update(visible=task == "Paraphrase"),         # paraphrase_style
+        gr.update(visible=task == "Categorize"),         # categories_input
+        gr.update(visible=task in ["Key Insights", "Generate Questions"]), # num_items
+        gr.update(visible=task == "Generate Questions")  # question_type
+    )
+def execute_content_task(task, doc_choice, custom_text,
+                        summary_style, outline_sections, outline_detail,
+                        explain_audience, explain_length,
+                        paraphrase_style, categories_input,
+                        num_items, question_type):
+    try:
+        # Get content
+        content = ""
+        if custom_text and custom_text.strip():
+            content = custom_text
+        elif doc_choice and doc_choice != "none":
+            content = mcp_server.run_async(mcp_server.get_document_content_async(doc_choice))
+            if not content:
+                return "❌ Error: Document not found or empty"
+        else:
+            if task == "Generate Outline":
+                content = custom_text # Topic is passed as text
+            else:
+                return "⚠️ Please select a document or enter text"
+        # Execute task
+        result = {"success": False, "error": "Unknown task"}
+        if task == "Summarize":
+            result = mcp_server.run_async(mcp_server.summarize_content_async(content=content, style=summary_style))
+            if result["success"]:
+                return f"📝 Summary ({summary_style}):\n\n{result['summary']}"
+        elif task == "Generate Outline":
+            # For outline, content is the topic
+            result = mcp_server.run_async(mcp_server.generate_outline_async(content, int(outline_sections), outline_detail))
+            if result["success"]:
+                return f"📝 Outline for '{content}':\n\n{result['result']}"
+        elif task == "Explain Concept":
+            # For explain, content is the concept
+            result = mcp_server.run_async(mcp_server.explain_concept_async(content, explain_audience, explain_length))
+            if result["success"]:
+                return f"💡 Explanation ({explain_audience}):\n\n{result['result']}"
+        elif task == "Paraphrase":
+            result = mcp_server.run_async(mcp_server.paraphrase_text_async(content, paraphrase_style))
+            if result["success"]:
+                return f"🔄 Paraphrased Text ({paraphrase_style}):\n\n{result['result']}"
+        elif task == "Categorize":
+            categories = [c.strip() for c in categories_input.split(',')] if categories_input else []
+            result = mcp_server.run_async(mcp_server.categorize_content_async(content, categories))
+            if result["success"]:
+                return f"🏷️ Category:\n\n{result['result']}"
+        elif task == "Key Insights":
+            result = mcp_server.run_async(mcp_server.extract_key_insights_async(content, int(num_items)))
+            if result["success"]:
+                return f"🔍 Key Insights:\n\n{result['result']}"
+        elif task == "Generate Questions":
+            result = mcp_server.run_async(mcp_server.generate_questions_async(content, question_type, int(num_items)))
+            if result["success"]:
+                return f"❓ Generated Questions ({question_type}):\n\n{result['result']}"
+        elif task == "Extract Key Info":
+            result = mcp_server.run_async(mcp_server.extract_key_information_async(content))
+            if result["success"]:
+                return f"📊 Key Information:\n\n{result['result']}"
+        if not result["success"]:
+            return f"❌ Error: {result.get('error', 'Unknown error')}"
+        return "✅ Task completed"
+    except Exception as e:
+        logger.error(f"Task execution error: {str(e)}")
+        return f"❌ Error: {str(e)}"
+def generate_tags_for_document(doc_choice, custom_text, max_tags):
+    try:
+        logger.info(f"Generate tags called with doc_choice: {doc_choice}, type: {type(doc_choice)}")
+        document_id = doc_choice if doc_choice and doc_choice != "none" and doc_choice != "" else None
+        if custom_text and custom_text.strip():
+            logger.info("Using custom text for tag generation")
+            result = mcp_server.run_async(mcp_server.generate_tags_async(content=custom_text, max_tags=int(max_tags)))
+        elif document_id:
+            logger.info(f"Generating tags for document: {document_id}")
+            result = mcp_server.run_async(mcp_server.generate_tags_async(document_id=document_id, max_tags=int(max_tags)))
+        else:
+            return "Please select a document from the dropdown or enter text to generate tags"
+        if result["success"]:
+            tags_str = ", ".join(result["tags"])
+            output_str = f"🏷️ Generated Tags:\n\n{tags_str}\n\n"
+            output_str += f"📊 Statistics:\n"
+            output_str += f"- Content length: {result['content_length']} characters\n"
+            output_str += f"- Number of tags: {len(result['tags'])}\n"
+            if result.get('document_id'):
+                output_str += f"- Document ID: {result['document_id']}\n"
+                output_str += f"\n✅ Tags have been saved to the document."
+            return output_str
+        else:
+            return f"❌ Tag generation failed: {result['error']}"
+    except Exception as e:
+        logger.error(f"Tag generation error: {str(e)}")
+        return f"❌ Error: {str(e)}"
+def ask_question(question):
+    if not question.strip():
+        return "Please enter a question"
+    try:
+        result = mcp_server.run_async(mcp_server.answer_question_async(question))
+        if result["success"]:
+            output_str = f"❓ Question: {result['question']}\n\n"
+            output_str += f"💡 Answer:\n{result['answer']}\n\n"
+            output_str += f"🎯 Confidence: {result['confidence']}\n\n"
+            output_str += f"📚 Sources Used ({len(result['sources'])}):\n"
+            for i, source_item in enumerate(result['sources'], 1):
+                filename = source_item.get('metadata', {}).get('document_filename', 'Unknown')
+                output_str += f"\n{i}. 📄 {filename}\n"
+                output_str += f"   📝 Excerpt: {source_item['content'][:150]}...\n"
+                output_str += f"   📊 Relevance: {source_item['score']:.3f}\n"
+            return output_str
+        else:
+            return f"❌ {result.get('error', 'Failed to answer question')}"
+    except Exception as e:
+        return f"❌ Error: {str(e)}"
+def delete_document_from_library(document_id):
+    """Delete a document from the document store and the vector store.
+
+    Args:
+        document_id: Identifier of the document to remove. A falsy value
+            skips deletion and only refreshes the listing.
+
+    Returns:
+        A 5-tuple: status message, refreshed document listing, and three
+        ``gr.update(choices=...)`` objects carrying the refreshed document
+        choices (presumably for the document dropdowns wired by the caller).
+    """
+    def _with_refreshed_library(status):
+        # Re-read the library after every outcome so the UI never shows stale entries.
+        listing = get_document_list()
+        choices = get_document_choices()
+        return (
+            status,
+            listing,
+            gr.update(choices=choices),
+            gr.update(choices=choices),
+            gr.update(choices=choices)
+        )
+    if not document_id:
+        return _with_refreshed_library("No document selected to delete.")
+    try:
+        # Both stores are asked to delete; each outcome is reported separately.
+        doc_store_ok = mcp_server.run_async(mcp_server.document_store.delete_document(document_id))
+        vec_store_ok = mcp_server.run_async(mcp_server.vector_store.delete_document(document_id))
+        parts = []
+        if doc_store_ok:
+            parts.append(f"🗑️ Document {document_id[:8]}... deleted from document store. ")
+        else:
+            parts.append(f"❌ Failed to delete document {document_id[:8]}... from document store. ")
+        if vec_store_ok:
+            parts.append("Embeddings deleted from vector store.")
+        else:
+            parts.append("Failed to delete embeddings from vector store (or no embeddings existed).")
+        return _with_refreshed_library("".join(parts))
+    except Exception as e:
+        logger.error(f"Error deleting document: {str(e)}")
+        return _with_refreshed_library(f"❌ Error deleting document: {str(e)}")
+# Module-level state of the voice conversation, shared by the voice handlers below:
+#   session_id - id of the running session, or None when nothing is running
+#   active     - True between a successful start and a stop
+#   transcript - list of {"role", "content"} message dicts
+voice_conversation_state = dict(
+    session_id=None,
+    active=False,
+    transcript=[],
+)
+def start_voice_conversation():
+    """Start a new voice conversation session.
+
+    Returns:
+        A 4-tuple matching the outputs wired to the Start button:
+        (status text, Start-button update, Stop-button update, chat history).
+        On success Start is disabled, Stop is enabled and the chat history is
+        reset to an empty list. On every path that does not start a session
+        (service not configured, start rejected, exception) Start stays
+        enabled, Stop stays disabled and the chat history is an empty list.
+    """
+    def _not_started(status):
+        # No session is running, so Stop must not be clickable and Start must
+        # remain available for a retry. The chat value is always a list
+        # because the chatbot is declared with type="messages".
+        return (
+            status,
+            gr.update(interactive=True),
+            gr.update(interactive=False),
+            []
+        )
+    try:
+        if not mcp_server.elevenlabs_service.is_available():
+            return _not_started(
+                "⚠️ Voice assistant not configured. Please set ELEVENLABS_API_KEY and ELEVENLABS_AGENT_ID in .env"
+            )
+        session_id = str(uuid.uuid4())
+        result = mcp_server.run_async(mcp_server.elevenlabs_service.start_conversation(session_id))
+        if not result.get("success"):
+            return _not_started(f"❌ Failed to start conversation: {result.get('error')}")
+        # Record the session only after the service confirmed it.
+        voice_conversation_state["session_id"] = session_id
+        voice_conversation_state["active"] = True
+        voice_conversation_state["transcript"] = []
+        return (
+            "🎙️ Voice assistant is ready. Type your question below.",
+            gr.update(interactive=False),
+            gr.update(interactive=True),
+            []
+        )
+    except Exception as e:
+        logger.error(f"Error starting voice conversation: {str(e)}")
+        return _not_started(f"❌ Error: {str(e)}")
+def stop_voice_conversation():
+    """Stop the active voice conversation, if any.
+
+    Returns:
+        A 4-tuple matching the outputs wired to the Stop button:
+        (status text, Start-button update, Stop-button update, chatbot update).
+        Start is always re-enabled and Stop disabled. The chatbot receives a
+        no-op update so the messages already on screen stay visible; the
+        stored transcript is not used for this because the chat handler wired
+        to the chatbot keeps its history in the component, not in
+        ``voice_conversation_state["transcript"]``.
+    """
+    def _stopped(status):
+        return (
+            status,
+            gr.update(interactive=True),
+            gr.update(interactive=False),
+            gr.update()
+        )
+    try:
+        if not voice_conversation_state["active"]:
+            return _stopped("No active conversation")
+        session_id = voice_conversation_state["session_id"]
+        try:
+            if session_id:
+                mcp_server.run_async(mcp_server.elevenlabs_service.end_conversation(session_id))
+        finally:
+            # Reset local state even when the remote call fails, so it never
+            # disagrees with the buttons (which always show "stopped").
+            voice_conversation_state["active"] = False
+            voice_conversation_state["session_id"] = None
+        return _stopped("✅ Conversation ended")
+    except Exception as e:
+        logger.error(f"Error stopping conversation: {str(e)}")
+        return _stopped(f"❌ Error: {str(e)}")
+def send_voice_message(message):
+    """Send a typed message to the voice assistant and record the exchange.
+
+    Args:
+        message: Text entered by the user.
+
+    Returns:
+        A 3-tuple (status text, new input-box text, formatted transcript).
+        The input text is cleared on success and when no conversation is
+        active; otherwise the original message is handed back.
+    """
+    def _rendered():
+        # Always render from the shared state so the latest transcript is shown.
+        return format_transcript(voice_conversation_state["transcript"])
+    try:
+        if not voice_conversation_state["active"]:
+            return ("Please start a conversation first", "", _rendered())
+        if not (message and message.strip()):
+            return ("Please enter a message", message, _rendered())
+        session_id = voice_conversation_state["session_id"]
+        # The user's turn is stored before the service call, so it stays in
+        # the transcript even if the call fails.
+        voice_conversation_state["transcript"].append({"role": "user", "content": message})
+        result = mcp_server.run_async(mcp_server.voice_tool.voice_qa(message, session_id))
+        if not result.get("success"):
+            return (f"❌ Error: {result.get('error')}", message, _rendered())
+        answer = result.get("answer", "No response")
+        voice_conversation_state["transcript"].append({"role": "assistant", "content": answer})
+        return ("✅ Response received", "", _rendered())
+    except Exception as e:
+        logger.error(f"Error sending message: {str(e)}")
+        return (f"❌ Error: {str(e)}", message, _rendered())
+def format_transcript(transcript):
+    """Render a conversation transcript as Markdown text.
+
+    Args:
+        transcript: Sequence of dicts with "role" and "content" keys.
+
+    Returns:
+        A placeholder sentence when the transcript is empty or None;
+        otherwise one Markdown paragraph per message, each followed by a
+        horizontal rule. Any role other than "user" is shown as the AI
+        librarian.
+    """
+    if not transcript:
+        return "No conversation yet. Start talking to the AI librarian!"
+    def _render(entry):
+        role = entry["role"]
+        content = entry["content"]
+        if role == "user":
+            line = f"👤 **You:** {content}\n\n"
+        else:
+            line = f"🤖 **AI Librarian:** {content}\n\n"
+        return line + "---\n\n"
+    return "".join(_render(entry) for entry in transcript)
+def clear_voice_transcript():
+    """Replace the stored transcript with a fresh empty list.
+
+    Returns:
+        An empty string (the value handed back to the UI).
+    """
+    voice_conversation_state.update(transcript=[])
+    return ""
+def send_voice_message_v6(message, chat_history):
+    """Send a typed message to the voice assistant and extend the chat history.
+
+    Args:
+        message: Text entered by the user.
+        chat_history: Current chatbot value as a list of
+            {"role", "content"} dicts. None or an empty value is treated as
+            an empty history.
+
+    Returns:
+        A 2-tuple (updated history, new input-box text). The input box is
+        cleared once a message has been sent; the text is handed back
+        unchanged when it was blank or when no conversation is active.
+    """
+    # Build on a copy so a missing history (None) cannot break the handler,
+    # including the error path below, and the caller's list is not mutated.
+    history = list(chat_history) if chat_history else []
+    try:
+        if not voice_conversation_state["active"]:
+            # Tell the user why nothing happened instead of silently dropping
+            # the message, and keep their draft in the input box.
+            history.append({"role": "assistant", "content": "Please start a conversation first"})
+            return history, message
+        if not message or not message.strip():
+            return history, message
+        session_id = voice_conversation_state["session_id"]
+        # Add the user's turn in the messages format used by the chatbot.
+        history.append({"role": "user", "content": message})
+        # Get the AI response.
+        result = mcp_server.run_async(mcp_server.voice_tool.voice_qa(message, session_id))
+        if result.get("success"):
+            reply = result.get("answer", "No response")
+        else:
+            reply = f"❌ Error: {result.get('error')}"
+        history.append({"role": "assistant", "content": reply})
+        return history, ""
+    except Exception as e:
+        logger.error(f"Error in voice message: {str(e)}")
+        history.append({
+            "role": "assistant",
+            "content": f"❌ Error: {str(e)}"
+        })
+        return history, ""
+def generate_podcast_ui(doc_ids, style, duration, voice1, voice2):
+    """Run podcast generation for the UI and shape the result for display.
+
+    Args:
+        doc_ids: Selected document ids; at least one is required.
+        style: Podcast style passed through to the generator.
+        duration: Requested length in minutes; converted with ``int()``.
+        voice1: Voice for the first host.
+        voice2: Voice for the second host.
+
+    Returns:
+        A 4-tuple (status text, audio file or None, transcript text,
+        podcast id or "").
+    """
+    try:
+        has_selection = bool(doc_ids) and len(doc_ids) != 0
+        if not has_selection:
+            return ("⚠️ Please select at least one document", None, "No documents selected", "")
+        logger.info(f"Generating podcast: {len(doc_ids)} docs, {style}, {duration}min")
+        generation = mcp_server.generate_podcast_async(
+            document_ids=doc_ids,
+            style=style,
+            duration_minutes=int(duration),
+            host1_voice=voice1,
+            host2_voice=voice2
+        )
+        result = mcp_server.run_async(generation)
+        if not result.get("success"):
+            error = result.get("error", "Unknown error")
+            return (f"❌ Error: {error}", None, "Generation failed", "")
+        audio_file = result.get("audio_file")
+        transcript = result.get("transcript", "Transcript not available")
+        message = result.get("message", "Podcast generated!")
+        # The transcript is shown in a Markdown component, hence the heading.
+        formatted_transcript = f"## Podcast Transcript\n\n{transcript}"
+        podcast_id = result.get("podcast_id", "")
+        return (f"✅ {message}", audio_file, formatted_transcript, podcast_id)
+    except Exception as e:
+        logger.error(f"Podcast UI error: {str(e)}")
+        return (f"❌ Error: {str(e)}", None, "An error occurred", "")
+def load_dashboard_stats():
+    """Collect the figures shown on the dashboard.
+
+    Returns:
+        A 7-tuple: document count, total chunk count, storage in MB,
+        rows for the recent-documents table (filename, type, date, size),
+        and status strings for the vector store, LLM service and voice
+        service. On any exception, zeros, an empty table and three
+        "❌ Error" strings are returned instead.
+    """
+    try:
+        listing = mcp_server.list_documents_sync(limit=1000)
+        # Defaults used when the listing call reports failure.
+        doc_count = 0
+        total_chunks = 0
+        storage_mb = 0.0
+        recent_data = []
+        if listing.get("success"):
+            documents = listing.get("documents", [])
+            doc_count = len(documents)
+            total_size = 0
+            for doc in documents:
+                total_chunks += doc.get("metadata", {}).get("chunk_count", 0)
+                total_size += doc.get("file_size", 0)
+            if total_size > 0:
+                storage_mb = round(total_size / (1024 * 1024), 2)
+            # The first five entries of the listing are shown as "recent".
+            for doc in documents[:5]:
+                created = doc.get("created_at")
+                recent_data.append([
+                    doc.get("filename", "Unknown"),
+                    doc.get("doc_type", "unknown"),
+                    created[:10] if created else "N/A",
+                    f"{doc.get('file_size', 0)} bytes"
+                ])
+        # Service status indicators: a service counts as up when the
+        # attribute exists on mcp_server and is truthy.
+        if getattr(mcp_server, "vector_store", None):
+            vector_stat = "✅ Online"
+        else:
+            vector_stat = "❌ Offline"
+        if getattr(mcp_server, "llm_service", None):
+            llm_stat = "✅ Ready"
+        else:
+            llm_stat = "❌ Offline"
+        voice_service = getattr(mcp_server, "elevenlabs_service", None)
+        if voice_service and voice_service.is_available():
+            voice_stat = "✅ Ready"
+        else:
+            voice_stat = "⚠️ Configure API Key"
+        return (
+            doc_count,
+            total_chunks,
+            storage_mb,
+            recent_data,
+            vector_stat,
+            llm_stat,
+            voice_stat,
+        )
+    except Exception as e:
+        logger.error(f"Error loading dashboard stats: {str(e)}")
+        return (0, 0, 0.0, [], "❌ Error", "❌ Error", "❌ Error")
+def create_gradio_interface():
+    # Create custom theme with modern aesthetics
+    custom_theme = gr.themes.Soft(
+        primary_hue=gr.themes.colors.indigo,
+        secondary_hue=gr.themes.colors.blue,
+        neutral_hue=gr.themes.colors.slate,
+        font=[gr.themes.GoogleFont("Inter"), "system-ui", "sans-serif"],
+        font_mono=[gr.themes.GoogleFont("Fira Code"), "monospace"],
+    ).set(
+        button_primary_background_fill="*primary_500",
+        button_primary_background_fill_hover="*primary_600",
+        block_title_text_weight="600",
+        block_label_text_size="sm",
+        block_label_text_weight="500",
+    )
+    with gr.Blocks(title="🧠 AI Digital Library Assistant", theme=custom_theme) as interface:
+        with gr.Tabs():
+            # Dashboard Tab - New Landing Page
+            with gr.Tab("🏠 Dashboard"):
+                gr.Markdown("# Welcome to Your AI Library Assistant")
+                gr.Markdown("*Your intelligent document management and analysis platform powered by AI*")
+                # Quick Stats Section
+                gr.Markdown("## 📊 Quick Stats")
+                with gr.Row():
+                    total_docs = gr.Number(
+                        label="📚 Total Documents",
+                        value=0,
+                        interactive=False,
+                        container=True
+                    )
+                    total_chunks = gr.Number(
+                        label="🧩 Vector Chunks",
+                        value=0,
+                        interactive=False,
+                        container=True
+                    )
+                    storage_size = gr.Number(
+                        label="💾 Storage (MB)",
+                        value=0,
+                        interactive=False,
+                        container=True
+                    )
+                # Recent Activity Section
+                gr.Markdown("## 📊 Recent Activity")
+                with gr.Group():
+                    recent_docs = gr.Dataframe(
+                        headers=["Document", "Type", "Date", "Size"],
+                        datatype=["str", "str", "str", "str"],
+                        row_count=(5, "fixed"),
+                        col_count=(4, "fixed"),
+                        interactive=False,
+                        label="Recently Added Documents"
+                    )
+                # System Status Section
+                gr.Markdown("## � System Status")
+                with gr.Row():
+                    vector_status = gr.Textbox(
+                        label="Vector Store",
+                        value="✅ Online",
+                        interactive=False,
+                        container=True
+                    )
+                    llm_status = gr.Textbox(
+                        label="LLM Service",
+                        value="✅ Ready",
+                        interactive=False,
+                        container=True
+                    )
+                    voice_status = gr.Textbox(
+                        label="Voice Service",
+                        value="⚠️ Configure API Key",
+                        interactive=False,
+                        container=True
+                    )
+            with gr.Tab("📚 Document Library"):
+                with gr.Row():
+                    with gr.Column():
+                        gr.Markdown("### Your Document Collection")
+                        document_list_display = gr.Textbox(label="Documents in Library", value=get_document_list(), lines=20, interactive=False)
+                        refresh_btn_library = gr.Button("🔄 Refresh Library", variant="secondary")
+                        delete_doc_dropdown_visible = gr.Dropdown(label="Select Document to Delete", choices=get_document_choices(), value=None, interactive=True, allow_custom_value=False)
+                        delete_btn = gr.Button("🗑️ Delete Selected Document", variant="stop")
+                        delete_output_display = gr.Textbox(label="Delete Status", visible=True)
+            with gr.Tab("📄 Upload Documents"):
+                gr.Markdown("""
+                ### 📥 Add Documents to Library
+                Upload PDFs, Word documents, text files, or images. OCR will extract text from images automatically.
+                """)
+                with gr.Row():
+                    with gr.Column():
+                        with gr.Group():
+                            gr.Markdown("**Supported formats:** PDF, DOCX, TXT, Images (JPG, PNG)")
+                            file_input_upload = gr.File(
+                                label="Select File",
+                                file_types=[".pdf", ".txt", ".docx", ".png", ".jpg", ".jpeg"],
+                                type="filepath",
+                                file_count="single"
+                            )
+                            upload_btn_process = gr.Button("🚀 Upload & Process", variant="primary", size="lg")
+                        with gr.Group():
+                            upload_output_display = gr.Textbox(
+                                label="Status",
+                                lines=6,
+                                interactive=False,
+                                show_copy_button=False
+                            )
+                            doc_id_output_display = gr.Textbox(
+                                label="Document ID",
+                                interactive=False,
+                                visible=False
+                            )
+            with gr.Tab("🔍 Search Documents"):
+                gr.Markdown("""
+                ### 🔎 Semantic Search
+                Find relevant content across your entire document library using AI-powered semantic search.
+                """)
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        with gr.Group():
+                            search_query_input = gr.Textbox(
+                                label="Search Query",
+                                placeholder="What are you looking for?",
+                                lines=2,
+                                info="Use natural language to describe what you need"
+                            )
+                            with gr.Accordion("🎛️ Search Options", open=False):
+                                search_top_k_slider = gr.Slider(
+                                    label="Number of Results",
+                                    minimum=1, maximum=20, value=5, step=1,
+                                    info="More results = broader search"
+                                )
+                            search_btn_action = gr.Button("🔍 Search", variant="primary", size="lg")
+                    with gr.Column(scale=2):
+                        with gr.Group():
+                            search_output_display = gr.Textbox(
+                                label="Results",
+                                lines=20,
+                                placeholder="Search results will appear here...",
+                                show_copy_button=True
+                            )
+            with gr.Tab("📝 Content Studio"):
+                gr.Markdown("""
+                ### 🎨 Create & Analyze Content
+                Transform documents with AI-powered tools: summarize, outline, explain, and more.
+                """)
+                with gr.Row():
+                    with gr.Column(scale=2):
+                        # Source Selection with Group
+                        with gr.Group():
+                            gr.Markdown("#### 📄 Content Source")
+                            doc_dropdown_content = gr.Dropdown(
+                                label="Select Document",
+                                choices=get_document_choices(),
+                                value=None,
+                                interactive=True,
+                                info="Choose a document from your library"
+                            )
+                            gr.Markdown("**OR**")
+                            content_text_input = gr.Textbox(
+                                label="Enter Text or Topic",
+                                placeholder="Paste content or enter a topic...",
+                                lines=4,
+                                info="For outlines, enter a topic. For other tasks, paste text to analyze."
+                            )
+                        # Task Configuration with Group
+                        with gr.Group():
+                            gr.Markdown("#### 🛠️ Task Configuration")
+                            task_dropdown = gr.Dropdown(
+                                label="Select Task",
+                                choices=[
+                                    "Summarize", "Generate Outline", "Explain Concept",
+                                    "Paraphrase", "Categorize", "Key Insights",
+                                    "Generate Questions", "Extract Key Info"
+                                ],
+                                value="Summarize",
+                                interactive=True,
+                                info="Choose the type of analysis to perform"
+                            )
+                        # Dynamic Options with Accordion
+                        with gr.Accordion("⚙️ Advanced Options", open=False):
+                            summary_style_opt = gr.Dropdown(
+                                label="Summary Style",
+                                choices=["concise", "detailed", "bullet_points", "executive"],
+                                value="concise",
+                                visible=True,
+                                info="How detailed should the summary be?"
+                            )
+                            outline_sections_opt = gr.Slider(
+                                label="Number of Sections",
+                                minimum=3, maximum=10, value=5, step=1,
+                                visible=False,
+                                info="How many main sections?"
+                            )
+                            outline_detail_opt = gr.Dropdown(
+                                label="Detail Level",
+                                choices=["brief", "medium", "detailed"],
+                                value="medium",
+                                visible=False
+                            )
+                            explain_audience_opt = gr.Dropdown(
+                                label="Target Audience",
+                                choices=["general", "technical", "beginner", "expert"],
+                                value="general",
+                                visible=False,
+                                info="Who is this explanation for?"
+                            )
+                            explain_length_opt = gr.Dropdown(
+                                label="Length",
+                                choices=["brief", "medium", "detailed"],
+                                value="medium",
+                                visible=False
+                            )
+                            paraphrase_style_opt = gr.Dropdown(
+                                label="Style",
+                                choices=["formal", "casual", "academic", "simple", "technical"],
+                                value="formal",
+                                visible=False,
+                                info="Writing style for paraphrasing"
+                            )
+                            categories_input_opt = gr.Textbox(
+                                label="Categories (comma separated)",
+                                placeholder="Technology, Business, Science...",
+                                visible=False
+                            )
+                            num_items_opt = gr.Slider(
+                                label="Number of Items",
+                                minimum=1, maximum=10, value=5, step=1,
+                                visible=False
+                            )
+                            question_type_opt = gr.Dropdown(
+                                label="Question Type",
+                                choices=["comprehension", "analysis", "application", "creative", "factual"],
+                                value="comprehension",
+                                visible=False
+                            )
+                        run_task_btn = gr.Button("🚀 Run Task", variant="primary", size="lg")
+                    with gr.Column(scale=3):
+                        # Results with copy button and Group
+                        with gr.Group():
+                            gr.Markdown("#### 📊 Result")
+                            content_output_display = gr.Textbox(
+                                label="",
+                                lines=25,
+                                placeholder="Results will appear here...",
+                                show_copy_button=True,
+                                container=False
+                            )
+                # Event Handlers
+                task_dropdown.change(
+                    fn=update_options_visibility,
+                    inputs=[task_dropdown],
+                    outputs=[
+                        summary_style_opt, outline_sections_opt, outline_detail_opt,
+                        explain_audience_opt, explain_length_opt, paraphrase_style_opt,
+                        categories_input_opt, num_items_opt, question_type_opt
+                    ]
+                )
+                run_task_btn.click(
+                    fn=execute_content_task,
+                    inputs=[
+                        task_dropdown, doc_dropdown_content, content_text_input,
+                        summary_style_opt, outline_sections_opt, outline_detail_opt,
+                        explain_audience_opt, explain_length_opt, paraphrase_style_opt,
+                        categories_input_opt, num_items_opt, question_type_opt
+                    ],
+                    outputs=[content_output_display]
+                )
+            with gr.Tab("🏷️ Generate Tags"):
+                with gr.Row():
+                    with gr.Column():
+                        gr.Markdown("### Generate Document Tags")
+                        doc_dropdown_tag_visible = gr.Dropdown(label="Select Document to Tag", choices=get_document_choices(), value=None, interactive=True, allow_custom_value=False)
+                        tag_text_input = gr.Textbox(label="Or Paste Text to Generate Tags", placeholder="Paste any text here to generate tags...", lines=8)
+                        max_tags_slider = gr.Slider(label="Number of Tags", minimum=3, maximum=15, value=5, step=1)
+                        tag_btn_action = gr.Button("🏷️ Generate Tags", variant="primary", size="lg")
+                    with gr.Column():
+                        tag_output_display = gr.Textbox(label="Generated Tags", lines=10, placeholder="Tags will appear here...")
+            with gr.Tab("🎙️ Voice Assistant"):
+                gr.Markdown("""
+                ### 🗣️ Talk to Your AI Librarian
+                Have a natural conversation about your documents. Ask questions, request summaries,
+                or explore your content library through voice-powered interaction.
+                **Note:** Requires ElevenLabs API configuration.
+                """)
+                with gr.Row():
+                    with gr.Column(scale=2):
+                        # Status and Controls
+                        with gr.Group():
+                            voice_status_display = gr.Textbox(
+                                label="Status",
+                                value="Ready to start",
+                                interactive=False,
+                                lines=2
+                            )
+                            with gr.Row():
+                                start_voice_btn = gr.Button("🎤 Start Conversation", variant="primary", size="lg")
+                                stop_voice_btn = gr.Button("⏹️ Stop", variant="stop", size="lg", interactive=False)
+                        # Message Input
+                        with gr.Group():
+                            gr.Markdown("#### 💬 Send Message")
+                            voice_input_text = gr.Textbox(
+                                label="",
+                                placeholder="Type your question...",
+                                lines=3,
+                                container=False,
+                                info="Press Enter or click Send"
+                            )
+                            send_voice_btn = gr.Button("📤 Send", variant="secondary")
+                    with gr.Column(scale=3):
+                        # Chat Interface with Gradio 6 Chatbot
+                        with gr.Group():
+                            voice_chatbot = gr.Chatbot(
+                                label="Conversation",
+                                type="messages",
+                                height=500,
+                                show_copy_button=True
+                            )
+                            clear_chat_btn = gr.Button("🗑️ Clear Chat", variant="secondary")
+                # Voice Assistant event handlers
+                start_voice_btn.click(
+                    fn=start_voice_conversation,
+                    outputs=[voice_status_display, start_voice_btn, stop_voice_btn, voice_chatbot]
+                )
+                stop_voice_btn.click(
+                    fn=stop_voice_conversation,
+                    outputs=[voice_status_display, start_voice_btn, stop_voice_btn, voice_chatbot]
+                )
+                send_voice_btn.click(
+                    fn=send_voice_message_v6,
+                    inputs=[voice_input_text, voice_chatbot],
+                    outputs=[voice_chatbot, voice_input_text]
+                )
+                voice_input_text.submit(
+                    fn=send_voice_message_v6,
+                    inputs=[voice_input_text, voice_chatbot],
+                    outputs=[voice_chatbot, voice_input_text]
+                )
+                clear_chat_btn.click(
+                    fn=lambda: [],
+                    outputs=[voice_chatbot]
+                )
+            with gr.Tab("🎧 Podcast Studio"):
+                gr.Markdown("""
+                ### 🎙️ AI-Powered Podcast Generation
+                Transform your documents into engaging audio conversations. Select documents,
+                customize the style and voices, and let AI create a professional podcast.
+                **Powered by:** ElevenLabs AI Voice Technology
+                """)
+                with gr.Row():
+                    with gr.Column(scale=2):
+                        # Configuration Panel
+                        with gr.Group():
+                            gr.Markdown("#### 📚 Select Content")
+                            podcast_doc_selector = gr.CheckboxGroup(
+                                choices=get_document_choices(),
+                                label="Documents to Include",
+                                info="Choose 1-5 documents for best results",
+                                interactive=True
+                            )
+                        with gr.Accordion("🎨 Podcast Settings", open=True):
+                            with gr.Row():
+                                podcast_style = gr.Dropdown(
+                                    label="Style",
+                                    choices=["conversational", "educational", "technical", "casual"],
+                                    value="conversational",
+                                    info="Sets the tone and format"
+                                )
+                                podcast_duration = gr.Slider(
+                                    label="Duration (minutes)",
+                                    minimum=5,
+                                    maximum=30,
+                                    value=10,
+                                    step=5,
+                                    info="Approximate length"
+                                )
+                            gr.Markdown("#### 🗣️ Voice Selection")
+                            with gr.Row():
+                                host1_voice_selector = gr.Dropdown(
+                                    label="Host 1",
+                                    choices=["Rachel", "Adam", "Domi", "Bella", "Antoni", "Elli", "Josh"],
+                                    value="Rachel"
+                                )
+                                host2_voice_selector = gr.Dropdown(
+                                    label="Host 2",
+                                    choices=["Adam", "Rachel", "Josh", "Sam", "Emily", "Antoni", "Arnold"],
+                                    value="Adam"
+                                )
+                        generate_podcast_btn = gr.Button(
+                            "🎙️ Generate Podcast",
+                            variant="primary",
+                            size="lg"
+                        )
+                        podcast_status = gr.Textbox(
+                            label="Status",
+                            interactive=False,
+                            lines=2
+                        )
+                        podcast_id_display = gr.Textbox(
+                            label="Podcast ID",
+                            interactive=False,
+                            visible=False
+                        )
+                    with gr.Column(scale=3):
+                        # Output Panel
+                        with gr.Group():
+                            gr.Markdown("#### 🎵 Generated Podcast")
+                            podcast_audio_player = gr.Audio(
+                                label="",
+                                type="filepath",
+                                interactive=False,
+                                autoplay=True,
+                                container=False
+                            )
+                        with gr.Accordion("📝 Transcript", open=False):
+                            podcast_transcript_display = gr.Markdown(
+                                value="*Transcript will appear after generation...*"
+                            )
+                # Event handlers
+                generate_podcast_btn.click(
+                    fn=generate_podcast_ui,
+                    inputs=[
+                        podcast_doc_selector,
+                        podcast_style,
+                        podcast_duration,
+                        host1_voice_selector,
+                        host2_voice_selector
+                    ],
+                    outputs=[
+                        podcast_status,
+                        podcast_audio_player,
+                        podcast_transcript_display,
+                        podcast_id_display
+                    ]
+                )
+            with gr.Tab("❓ Ask Questions"):
+                with gr.Row():
+                    with gr.Column():
+                        gr.Markdown("""### Ask Questions About Your Documents
+                        The AI will search through all your uploaded documents to find relevant information
+                        and provide comprehensive answers with sources.""")
+                        qa_question_input = gr.Textbox(label="Your Question", placeholder="Ask anything about your documents...", lines=3)
+                        qa_btn_action = gr.Button("❓ Get Answer", variant="primary", size="lg")
+                    with gr.Column():
+                        qa_output_display = gr.Textbox(label="AI Answer", lines=20, placeholder="Answer will appear here with sources...")
+        all_dropdowns_to_update = [delete_doc_dropdown_visible, doc_dropdown_content, doc_dropdown_tag_visible]
+        refresh_outputs = [document_list_display] + [dd for dd in all_dropdowns_to_update]
+        refresh_btn_library.click(fn=refresh_library, outputs=refresh_outputs)
+        upload_outputs = [upload_output_display, doc_id_output_display, document_list_display] + [dd for dd in all_dropdowns_to_update]
+        upload_btn_process.click(upload_and_process_file, inputs=[file_input_upload], outputs=upload_outputs)
+        delete_outputs = [delete_output_display, document_list_display] + [dd for dd in all_dropdowns_to_update]
+        delete_btn.click(delete_document_from_library, inputs=[delete_doc_dropdown_visible], outputs=delete_outputs)
+        search_btn_action.click(perform_search, inputs=[search_query_input, search_top_k_slider], outputs=[search_output_display])
+        tag_btn_action.click(generate_tags_for_document, inputs=[doc_dropdown_tag_visible, tag_text_input, max_tags_slider], outputs=[tag_output_display])
+        qa_btn_action.click(ask_question, inputs=[qa_question_input], outputs=[qa_output_display])
+        # Load dashboard stats on interface load
+        interface.load(
+            fn=load_dashboard_stats,
+            outputs=[total_docs, total_chunks, storage_size, recent_docs, vector_status, llm_status, voice_status]
+        )
+        interface.load(fn=refresh_library, outputs=refresh_outputs)
+        return interface
+if __name__ == "__main__":
+    # Script entry point: build the Gradio Blocks UI, then start serving it.
+    gradio_interface = create_gradio_interface()
+    # mcp_server=True is forwarded unchanged to Gradio's launch().
+    gradio_interface.launch(mcp_server=True)

config.py ADDED Viewed

	@@ -0,0 +1,56 @@

+import os
+from typing import Optional
+from dotenv import load_dotenv
+load_dotenv()
+class Config:
+    # API Keys
+    NEBIUS_API_KEY: Optional[str] = os.getenv("NEBIUS_API_KEY")
+    MISTRAL_API_KEY: Optional[str] = os.getenv("MISTRAL_API_KEY")
+    HUGGINGFACE_API_KEY: Optional[str] = os.getenv("HUGGINGFACE_API_KEY", os.getenv("HF_TOKEN"))
+    OPENAI_API_KEY: Optional[str] = os.getenv("OPENAI_API_KEY")
+    ANTHROPIC_API_KEY: Optional[str] = os.getenv("ANTHROPIC_API_KEY")
+    # NEBIUS Configuration (OpenAI OSS models)
+    NEBIUS_BASE_URL: str = os.getenv("NEBIUS_BASE_URL", "https://api.studio.nebius.com/v1/")
+    NEBIUS_MODEL: str = os.getenv("NEBIUS_MODEL", "meta-llama/Llama-3.3-70B-Instruct")
+    # Model Configuration
+    # Using OpenAI managed embeddings for performance/quality
+    EMBEDDING_MODEL: str = os.getenv("EMBEDDING_MODEL", "text-embedding-3-small")
+    MISTRAL_MODEL: str = os.getenv("MISTRAL_MODEL", "mistral-large-2407")
+    OPENAI_MODEL: str = os.getenv("OPENAI_MODEL", "gpt-5.1-chat-latest")
+    FAST_MODEL: str = os.getenv("FAST_MODEL", "gpt-5-mini")
+    # Vector Store Configuration
+    DATA_DIR: str = os.getenv("DATA_DIR", "./data")
+    VECTOR_STORE_PATH: str = os.getenv("VECTOR_STORE_PATH", "./data/vector_store")
+    DOCUMENT_STORE_PATH: str = os.getenv("DOCUMENT_STORE_PATH", "./data/documents")
+    INDEX_NAME: str = os.getenv("INDEX_NAME", "content_index")
+    # Processing Configuration
+    CHUNK_SIZE: int = int(os.getenv("CHUNK_SIZE", "500"))
+    CHUNK_OVERLAP: int = int(os.getenv("CHUNK_OVERLAP", "50"))
+    MAX_CONCURRENT_REQUESTS: int = int(os.getenv("MAX_CONCURRENT_REQUESTS", "5"))
+    # Search Configuration
+    DEFAULT_TOP_K: int = int(os.getenv("DEFAULT_TOP_K", "5"))
+    SIMILARITY_THRESHOLD: float = float(os.getenv("SIMILARITY_THRESHOLD", "0.3"))
+    # OCR Configuration
+    TESSERACT_PATH: Optional[str] = os.getenv("TESSERACT_PATH")
+    OCR_LANGUAGE: str = os.getenv("OCR_LANGUAGE", "eng")
+    # ElevenLabs Configuration
+    ELEVENLABS_API_KEY: Optional[str] = os.getenv("ELEVENLABS_API_KEY")
+    ELEVENLABS_AGENT_ID: Optional[str] = os.getenv("ELEVENLABS_AGENT_ID")
+    ELEVENLABS_VOICE_MODEL: str = os.getenv("ELEVENLABS_VOICE_MODEL", "Rachel")
+    # App Configuration
+    HOST: str = os.getenv("HOST", "0.0.0.0")
+    PORT: int = int(os.getenv("PORT", "7860"))
+    DEBUG: bool = os.getenv("DEBUG", "False").lower() == "true"
+config = Config()

core/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # Core module initialization

core/__pycache__/__init__.cpython-313.pyc ADDED Viewed

Binary file (145 Bytes). View file

core/__pycache__/chunker.cpython-313.pyc ADDED Viewed

Binary file (11.1 kB). View file

core/__pycache__/document_parser.cpython-313.pyc ADDED Viewed

Binary file (10.5 kB). View file

core/__pycache__/models.cpython-313.pyc ADDED Viewed

Binary file (7.06 kB). View file

core/__pycache__/text_preprocessor.cpython-313.pyc ADDED Viewed

Binary file (9.25 kB). View file

core/chunker.py ADDED Viewed

	@@ -0,0 +1,303 @@

+# chunker.py
+import logging
+from typing import List, Dict, Any, Optional
+import re
+from .models import Chunk
+from .text_preprocessor import TextPreprocessor
+import config
+logger = logging.getLogger(__name__)
+class TextChunker:
+    def __init__(self):
+        self.config = config.config
+        self.preprocessor = TextPreprocessor()
+        self.chunk_size = self.config.CHUNK_SIZE
+        self.chunk_overlap = self.config.CHUNK_OVERLAP
+    def chunk_document(self, document_id: str, content: str, method: str = "recursive") -> List[Chunk]:
+        """Chunk a document using the specified method"""
+        if not content:
+            return []
+        try:
+            if method == "recursive":
+                return self._recursive_chunk(document_id, content)
+            elif method == "sentence":
+                return self._sentence_chunk(document_id, content)
+            elif method == "paragraph":
+                return self._paragraph_chunk(document_id, content)
+            elif method == "fixed":
+                return self._fixed_chunk(document_id, content)
+            else:
+                logger.warning(f"Unknown chunking method: {method}, using recursive")
+                return self._recursive_chunk(document_id, content)
+        except Exception as e:
+            logger.error(f"Error chunking document: {str(e)}")
+            # Fallback to simple fixed chunking
+            return self._fixed_chunk(document_id, content)
+    def _recursive_chunk(self, document_id: str, content: str) -> List[Chunk]:
+        """Recursively split text by different separators"""
+        chunks = []
+        # Define separators in order of preference
+        separators = [
+            "\n\n",  # Paragraphs
+            "\n",    # Lines
+            ". ",    # Sentences
+            ", ",    # Clauses
+            " "      # Words
+        ]
+        def split_text(text: str, separators: List[str], chunk_size: int) -> List[str]:
+            if len(text) <= chunk_size:
+                return [text] if text.strip() else []
+            if not separators:
+                # If no separators left, split by character
+                return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
+            separator = separators[0]
+            remaining_separators = separators[1:]
+            splits = text.split(separator)
+            result = []
+            current_chunk = ""
+            for split in splits:
+                if len(current_chunk) + len(split) + len(separator) <= chunk_size:
+                    if current_chunk:
+                        current_chunk += separator + split
+                    else:
+                        current_chunk = split
+                else:
+                    if current_chunk:
+                        result.append(current_chunk)
+                    if len(split) > chunk_size:
+                        # Split is too big, need to split further
+                        result.extend(split_text(split, remaining_separators, chunk_size))
+                        current_chunk = ""
+                    else:
+                        current_chunk = split
+            if current_chunk:
+                result.append(current_chunk)
+            return result
+        text_chunks = split_text(content, separators, self.chunk_size)
+        # Create chunk objects with overlap
+        for i, chunk_text in enumerate(text_chunks):
+            if not chunk_text.strip():
+                continue
+            # Calculate positions
+            start_pos = content.find(chunk_text)
+            if start_pos == -1:
+                start_pos = i * self.chunk_size
+            end_pos = start_pos + len(chunk_text)
+            # Add overlap from previous chunk if not the first chunk
+            if i > 0 and self.chunk_overlap > 0:
+                prev_chunk = text_chunks[i-1]
+                overlap_text = prev_chunk[-self.chunk_overlap:] if len(prev_chunk) > self.chunk_overlap else prev_chunk
+                chunk_text = overlap_text + " " + chunk_text
+            chunk = Chunk(
+                id=self._generate_chunk_id(document_id, i),
+                document_id=document_id,
+                content=chunk_text.strip(),
+                chunk_index=i,
+                start_pos=start_pos,
+                end_pos=end_pos,
+                metadata={
+                    "chunk_method": "recursive",
+                    "original_length": len(chunk_text),
+                    "word_count": len(chunk_text.split())
+                }
+            )
+            chunks.append(chunk)
+        return chunks
+    def _sentence_chunk(self, document_id: str, content: str) -> List[Chunk]:
+        """Chunk text by sentences"""
+        chunks = []
+        sentences = self.preprocessor.extract_sentences(content)
+        current_chunk = ""
+        chunk_index = 0
+        start_pos = 0
+        for sentence in sentences:
+            if len(current_chunk) + len(sentence) <= self.chunk_size:
+                if current_chunk:
+                    current_chunk += " " + sentence
+                else:
+                    current_chunk = sentence
+                    start_pos = content.find(sentence)
+            else:
+                if current_chunk:
+                    chunk = Chunk(
+                        id=self._generate_chunk_id(document_id, chunk_index),
+                        document_id=document_id,
+                        content=current_chunk.strip(),
+                        chunk_index=chunk_index,
+                        start_pos=start_pos,
+                        end_pos=start_pos + len(current_chunk),
+                        metadata={
+                            "chunk_method": "sentence",
+                            "sentence_count": len(self.preprocessor.extract_sentences(current_chunk))
+                        }
+                    )
+                    chunks.append(chunk)
+                    chunk_index += 1
+                current_chunk = sentence
+                start_pos = content.find(sentence)
+        # Add final chunk
+        if current_chunk:
+            chunk = Chunk(
+                id=self._generate_chunk_id(document_id, chunk_index),
+                document_id=document_id,
+                content=current_chunk.strip(),
+                chunk_index=chunk_index,
+                start_pos=start_pos,
+                end_pos=start_pos + len(current_chunk),
+                metadata={
+                    "chunk_method": "sentence",
+                    "sentence_count": len(self.preprocessor.extract_sentences(current_chunk))
+                }
+            )
+            chunks.append(chunk)
+        return chunks
+    def _paragraph_chunk(self, document_id: str, content: str) -> List[Chunk]:
+        """Chunk text by paragraphs"""
+        chunks = []
+        paragraphs = [p.strip() for p in content.split('\n\n') if p.strip()]
+        current_chunk = ""
+        chunk_index = 0
+        start_pos = 0
+        for paragraph in paragraphs:
+            if len(current_chunk) + len(paragraph) <= self.chunk_size:
+                if current_chunk:
+                    current_chunk += "\n\n" + paragraph
+                else:
+                    current_chunk = paragraph
+                    start_pos = content.find(paragraph)
+            else:
+                if current_chunk:
+                    chunk = Chunk(
+                        id=self._generate_chunk_id(document_id, chunk_index),
+                        document_id=document_id,
+                        content=current_chunk.strip(),
+                        chunk_index=chunk_index,
+                        start_pos=start_pos,
+                        end_pos=start_pos + len(current_chunk),
+                        metadata={
+                            "chunk_method": "paragraph",
+                            "paragraph_count": len([p for p in current_chunk.split('\n\n') if p.strip()])
+                        }
+                    )
+                    chunks.append(chunk)
+                    chunk_index += 1
+                # If paragraph is too long, split it further
+                if len(paragraph) > self.chunk_size:
+                    para_chunks = self._fixed_chunk(document_id, paragraph)
+                    for pc in para_chunks:
+                        pc.chunk_index = chunk_index
+                        pc.id = self._generate_chunk_id(document_id, chunk_index)
+                        chunks.append(pc)
+                        chunk_index += 1
+                else:
+                    current_chunk = paragraph
+                    start_pos = content.find(paragraph)
+        # Add final chunk
+        if current_chunk:
+            chunk = Chunk(
+                id=self._generate_chunk_id(document_id, chunk_index),
+                document_id=document_id,
+                content=current_chunk.strip(),
+                chunk_index=chunk_index,
+                start_pos=start_pos,
+                end_pos=start_pos + len(current_chunk),
+                metadata={
+                    "chunk_method": "paragraph",
+                    "paragraph_count": len([p for p in current_chunk.split('\n\n') if p.strip()])
+                }
+            )
+            chunks.append(chunk)
+        return chunks
+    def _fixed_chunk(self, document_id: str, content: str) -> List[Chunk]:
+        """Simple fixed-size chunking with overlap"""
+        chunks = []
+        for i in range(0, len(content), self.chunk_size - self.chunk_overlap):
+            chunk_text = content[i:i + self.chunk_size]
+            if not chunk_text.strip():
+                continue
+            chunk = Chunk(
+                id=self._generate_chunk_id(document_id, len(chunks)),
+                document_id=document_id,
+                content=chunk_text.strip(),
+                chunk_index=len(chunks),
+                start_pos=i,
+                end_pos=min(i + self.chunk_size, len(content)),
+                metadata={
+                    "chunk_method": "fixed",
+                    "original_length": len(chunk_text)
+                }
+            )
+            chunks.append(chunk)
+        return chunks
+    def _generate_chunk_id(self, document_id: str, chunk_index: int) -> str:
+        """Generate a unique chunk ID"""
+        return f"{document_id}_chunk_{chunk_index}"
+    def optimize_chunks_for_embedding(self, chunks: List[Chunk]) -> List[Chunk]:
+        """Optimize chunks for better embedding generation"""
+        optimized_chunks = []
+        for chunk in chunks:
+            # Clean the content for embedding
+            clean_content = self.preprocessor.prepare_for_embedding(chunk.content)
+            # Skip very short chunks
+            if len(clean_content.split()) < 5:
+                continue
+            # Update chunk with optimized content
+            optimized_chunk = Chunk(
+                id=chunk.id,
+                document_id=chunk.document_id,
+                content=clean_content,
+                chunk_index=chunk.chunk_index,
+                start_pos=chunk.start_pos,
+                end_pos=chunk.end_pos,
+                metadata={
+                    **chunk.metadata,
+                    "optimized_for_embedding": True,
+                    "original_content_length": len(chunk.content),
+                    "optimized_content_length": len(clean_content)
+                }
+            )
+            optimized_chunks.append(optimized_chunk)
+        return optimized_chunks

core/document_parser.py ADDED Viewed

	@@ -0,0 +1,199 @@

+import logging
+import tempfile
+import os
+from pathlib import Path
+from typing import Optional, Dict, Any
+import asyncio
+# Document processing libraries
+import PyPDF2
+from docx import Document as DocxDocument
+from PIL import Image
+import pytesseract
+from .models import Document, DocumentType
+import config
+logger = logging.getLogger(__name__)
+class DocumentParser:
+    def __init__(self):
+        """Keep a reference to the shared settings object from the config module."""
+        self.config = config.config
+    async def parse_document(self, file_path: str, filename: str) -> Document:
+        """Parse a document and extract its content"""
+        try:
+            file_ext = Path(filename).suffix.lower()
+            file_size = os.path.getsize(file_path)
+            # Determine document type and parse accordingly
+            if file_ext == '.pdf':
+                content = await self._parse_pdf(file_path)
+                doc_type = DocumentType.PDF
+            elif file_ext == '.txt':
+                content = await self._parse_text(file_path)
+                doc_type = DocumentType.TEXT
+            elif file_ext == '.docx':
+                content = await self._parse_docx(file_path)
+                doc_type = DocumentType.DOCX
+            elif file_ext in ['.png', '.jpg', '.jpeg', '.bmp', '.tiff']:
+                content = await self._parse_image(file_path)
+                doc_type = DocumentType.IMAGE
+            else:
+                raise ValueError(f"Unsupported file type: {file_ext}")
+            # Create document object
+            document = Document(
+                id=self._generate_document_id(),
+                filename=filename,
+                content=content,
+                doc_type=doc_type,
+                file_size=file_size,
+                metadata={
+                    "file_extension": file_ext,
+                    "content_length": len(content),
+                    "word_count": len(content.split()) if content else 0
+                }
+            )
+            logger.info(f"Successfully parsed document: {filename}")
+            return document
+        except Exception as e:
+            logger.error(f"Error parsing document {filename}: {str(e)}")
+            raise
+    async def _parse_pdf(self, file_path: str) -> str:
+        """Extract text from PDF file"""
+        try:
+            content = ""
+            with open(file_path, 'rb') as file:
+                pdf_reader = PyPDF2.PdfReader(file)
+                for page_num, page in enumerate(pdf_reader.pages):
+                    try:
+                        page_text = page.extract_text()
+                        if page_text.strip():
+                            content += f"\n--- Page {page_num + 1} ---\n"
+                            content += page_text + "\n"
+                    except Exception as e:
+                        logger.warning(f"Error extracting text from page {page_num + 1}: {str(e)}")
+                        continue
+            return content.strip()
+        except Exception as e:
+            logger.error(f"Error parsing PDF: {str(e)}")
+            raise
+    async def _parse_text(self, file_path: str) -> str:
+        """Read plain text file"""
+        try:
+            with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
+                content = file.read()
+            return content.strip()
+        except Exception as e:
+            logger.error(f"Error parsing text file: {str(e)}")
+            raise
+    async def _parse_docx(self, file_path: str) -> str:
+        """Extract text from DOCX file"""
+        try:
+            doc = DocxDocument(file_path)
+            content = ""
+            for paragraph in doc.paragraphs:
+                if paragraph.text.strip():
+                    content += paragraph.text + "\n"
+            # Extract text from tables
+            for table in doc.tables:
+                for row in table.rows:
+                    row_text = []
+                    for cell in row.cells:
+                        if cell.text.strip():
+                            row_text.append(cell.text.strip())
+                    if row_text:
+                        content += " | ".join(row_text) + "\n"
+            return content.strip()
+        except Exception as e:
+            logger.error(f"Error parsing DOCX file: {str(e)}")
+            raise
+    async def _parse_image(self, file_path: str) -> str:
+        """Extract text from image using OCR"""
+        try:
+            # First try with OCR service if available
+            if hasattr(self, 'ocr_service') and self.ocr_service:
+                logger.info(f"Using OCR service for image: {file_path}")
+                text = await self.ocr_service.extract_text_from_image(file_path)
+                if text:
+                    return text
+            # Fallback to direct pytesseract
+            logger.info(f"Using direct pytesseract for image: {file_path}")
+            image = Image.open(file_path)
+            # Perform OCR
+            content = pytesseract.image_to_string(
+                image,
+                lang=self.config.OCR_LANGUAGE,
+                config='--psm 6'  # Assume a single uniform block of text
+            )
+            return content.strip()
+        except Exception as e:
+            logger.error(f"Error performing OCR on image: {str(e)}")
+            # Return empty string if OCR fails
+            return ""
+    def _generate_document_id(self) -> str:
+        """Generate a unique document ID"""
+        import uuid
+        return str(uuid.uuid4())
+    async def extract_metadata(self, file_path: str, content: str) -> Dict[str, Any]:
+        """Extract additional metadata from the document"""
+        try:
+            metadata = {}
+            # Basic statistics
+            metadata["content_length"] = len(content)
+            metadata["word_count"] = len(content.split()) if content else 0
+            metadata["line_count"] = len(content.splitlines()) if content else 0
+            # File information
+            file_stat = os.stat(file_path)
+            metadata["file_size"] = file_stat.st_size
+            metadata["created_time"] = file_stat.st_ctime
+            metadata["modified_time"] = file_stat.st_mtime
+            # Content analysis
+            if content:
+                # Language detection (simple heuristic)
+                metadata["estimated_language"] = self._detect_language(content)
+                # Reading time estimation (average 200 words per minute)
+                metadata["estimated_reading_time_minutes"] = max(1, metadata["word_count"] // 200)
+            return metadata
+        except Exception as e:
+            logger.error(f"Error extracting metadata: {str(e)}")
+            return {}
+    def _detect_language(self, content: str) -> str:
+        """Simple language detection based on character patterns"""
+        # This is a very basic implementation
+        # In production, you might want to use a proper language detection library
+        if not content:
+            return "unknown"
+        # Count common English words
+        english_words = ["the", "and", "or", "but", "in", "on", "at", "to", "for", "of", "with", "by", "from", "as", "is", "was", "are", "were", "be", "been", "have", "has", "had", "do", "does", "did", "will", "would", "could", "should", "may", "might", "can", "this", "that", "these", "those"]
+        words = content.lower().split()
+        english_count = sum(1 for word in words if word in english_words)
+        if len(words) > 0 and english_count / len(words) > 0.1:
+            return "en"
+        else:
+            return "unknown"

core/models.py ADDED Viewed

	@@ -0,0 +1,102 @@

+from pydantic import BaseModel, Field
+from typing import List, Optional, Dict, Any
+from datetime import datetime
+from enum import Enum
+class DocumentType(str, Enum):
+    # Kinds of source document. Members are also plain strings because the
+    # enum mixes in ``str``.
+    PDF = "pdf"
+    TEXT = "txt"
+    DOCX = "docx"
+    IMAGE = "image"
+    HTML = "html"
+class ProcessingStatus(str, Enum):
+    # States of a processing task. Members are also plain strings because
+    # the enum mixes in ``str``.
+    PENDING = "pending"
+    PROCESSING = "processing"
+    COMPLETED = "completed"
+    FAILED = "failed"
+class Document(BaseModel):
+    id: str = Field(..., description="Unique document identifier")
+    filename: str = Field(..., description="Original filename")
+    content: str = Field(..., description="Extracted text content")
+    doc_type: DocumentType = Field(..., description="Document type")
+    file_size: int = Field(..., description="File size in bytes")
+    created_at: datetime = Field(default_factory=datetime.utcnow)
+    metadata: Dict[str, Any] = Field(default_factory=dict)
+    tags: List[str] = Field(default_factory=list)
+    summary: Optional[str] = None
+    category: Optional[str] = None
+    language: Optional[str] = None
+    def to_dict(self) -> Dict[str, Any]:
+        return {
+            "id": self.id,
+            "filename": self.filename,
+            "content": self.content[:500] + "..." if len(self.content) > 500 else self.content,
+            "doc_type": self.doc_type,
+            "file_size": self.file_size,
+            "created_at": self.created_at.isoformat(),
+            "metadata": self.metadata,
+            "tags": self.tags,
+            "summary": self.summary,
+            "category": self.category,
+            "language": self.language
+        }
+class Chunk(BaseModel):
+    # One piece of a document's text, with its position in the parent
+    # document and an optional embedding vector.
+    id: str = Field(..., description="Unique chunk identifier")
+    document_id: str = Field(..., description="Parent document ID")
+    content: str = Field(..., description="Chunk text content")
+    chunk_index: int = Field(..., description="Position in document")
+    start_pos: int = Field(..., description="Start position in original document")
+    end_pos: int = Field(..., description="End position in original document")
+    # Not set by default; None until a value is assigned.
+    embedding: Optional[List[float]] = None
+    metadata: Dict[str, Any] = Field(default_factory=dict)
+class SearchResult(BaseModel):
+    chunk_id: str = Field(..., description="Matching chunk ID")
+    document_id: str = Field(..., description="Source document ID")
+    content: str = Field(..., description="Matching content")
+    score: float = Field(..., description="Similarity score")
+    metadata: Dict[str, Any] = Field(default_factory=dict)
+    def to_dict(self) -> Dict[str, Any]:
+        return {
+            "chunk_id": self.chunk_id,
+            "document_id": self.document_id,
+            "content": self.content,
+            "score": self.score,
+            "metadata": self.metadata
+        }
+class ProcessingTask(BaseModel):
+    # Progress record for a processing job.
+    task_id: str = Field(..., description="Unique task identifier")
+    document_id: Optional[str] = None
+    status: ProcessingStatus = ProcessingStatus.PENDING
+    # Validated to lie between 0.0 and 100.0 inclusive.
+    progress: float = Field(default=0.0, ge=0.0, le=100.0)
+    message: Optional[str] = None
+    error: Optional[str] = None
+    # Both timestamps default to the naive UTC time at model creation;
+    # nothing in this class refreshes updated_at afterwards.
+    created_at: datetime = Field(default_factory=datetime.utcnow)
+    updated_at: datetime = Field(default_factory=datetime.utcnow)
+class SummaryRequest(BaseModel):
+    # Parameters for a summary request. Both content and document_id are
+    # optional here; this model does not check that either one is given.
+    content: Optional[str] = None
+    document_id: Optional[str] = None
+    style: str = Field(default="concise", description="Summary style")
+    max_length: Optional[int] = None
+class TagGenerationRequest(BaseModel):
+    # Parameters for a tag generation request. Both content and document_id
+    # are optional here; this model does not check that either one is given.
+    content: Optional[str] = None
+    document_id: Optional[str] = None
+    # Validated to lie between 1 and 20 inclusive.
+    max_tags: int = Field(default=5, ge=1, le=20)
+class QuestionAnswerRequest(BaseModel):
+    # Parameters for a question-answering request.
+    question: str = Field(..., description="Question to answer")
+    context_filter: Optional[Dict[str, Any]] = None
+    # NOTE(review): unit (characters vs. tokens) is not visible here - confirm with the consumer.
+    max_context_length: int = Field(default=2000)
+class CategorizationRequest(BaseModel):
+    # Parameters for a categorization request. Both content and document_id
+    # are optional here; this model does not check that either one is given.
+    content: Optional[str] = None
+    document_id: Optional[str] = None
+    # Optional list of category names; None by default.
+    categories: Optional[List[str]] = None

core/text_preprocessor.py ADDED Viewed

	@@ -0,0 +1,186 @@

+import re
+import logging
+from typing import List, Optional
+import unicodedata
+logger = logging.getLogger(__name__)
+class TextPreprocessor:
+    def __init__(self):
+        # Common stop words for basic filtering
+        self.stop_words = {
+            'en': set([
+                'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
+                'of', 'with', 'by', 'from', 'up', 'about', 'into', 'through', 'during',
+                'before', 'after', 'above', 'below', 'between', 'among', 'throughout',
+                'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
+                'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might',
+                'must', 'shall', 'can', 'this', 'that', 'these', 'those', 'i', 'me',
+                'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours'
+            ])
+        }
+    def clean_text(self, text: str, aggressive: bool = False) -> str:
+        """Clean and normalize text"""
+        if not text:
+            return ""
+        try:
+            # Normalize unicode characters
+            text = unicodedata.normalize('NFKD', text)
+            # Remove excessive whitespace
+            text = re.sub(r'\s+', ' ', text)
+            # Remove or replace special characters
+            if aggressive:
+                # More aggressive cleaning for embedding
+                text = re.sub(r'[^\w\s\-.,!?;:]', ' ', text)
+                text = re.sub(r'[.,!?;:]+', '.', text)
+            else:
+                # Basic cleaning for readability
+                text = re.sub(r'[^\w\s\-.,!?;:()\[\]{}"\']', ' ', text)
+            # Remove excessive punctuation
+            text = re.sub(r'\.{2,}', '.', text)
+            text = re.sub(r'[!?]{2,}', '!', text)
+            # Clean up whitespace again
+            text = re.sub(r'\s+', ' ', text)
+            # Remove leading/trailing whitespace
+            text = text.strip()
+            return text
+        except Exception as e:
+            logger.error(f"Error cleaning text: {str(e)}")
+            return text
+    def extract_sentences(self, text: str) -> List[str]:
+        """Extract sentences from text"""
+        if not text:
+            return []
+        try:
+            # Simple sentence splitting
+            sentences = re.split(r'[.!?]+', text)
+            # Clean and filter sentences
+            clean_sentences = []
+            for sentence in sentences:
+                sentence = sentence.strip()
+                if len(sentence) > 10:  # Minimum sentence length
+                    clean_sentences.append(sentence)
+            return clean_sentences
+        except Exception as e:
+            logger.error(f"Error extracting sentences: {str(e)}")
+            return [text]
+    def extract_keywords(self, text: str, language: str = 'en', max_keywords: int = 20) -> List[str]:
+        """Extract potential keywords from text"""
+        if not text:
+            return []
+        try:
+            # Convert to lowercase and split into words
+            words = re.findall(r'\b[a-zA-Z]{3,}\b', text.lower())
+            # Remove stop words
+            stop_words = self.stop_words.get(language, set())
+            keywords = [word for word in words if word not in stop_words]
+            # Count word frequency
+            word_freq = {}
+            for word in keywords:
+                word_freq[word] = word_freq.get(word, 0) + 1
+            # Sort by frequency and return top keywords
+            sorted_keywords = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
+            return [word for word, freq in sorted_keywords[:max_keywords]]
+        except Exception as e:
+            logger.error(f"Error extracting keywords: {str(e)}")
+            return []
+    def prepare_for_embedding(self, text: str) -> str:
+        """Prepare text specifically for embedding generation"""
+        if not text:
+            return ""
+        try:
+            # Clean text aggressively for better embeddings
+            clean_text = self.clean_text(text, aggressive=True)
+            # Remove very short words
+            words = clean_text.split()
+            filtered_words = [word for word in words if len(word) >= 2]
+            # Rejoin and ensure reasonable length
+            result = ' '.join(filtered_words)
+            # Truncate if too long (most embedding models have token limits)
+            if len(result) > 5000:  # Rough character limit
+                result = result[:5000] + "..."
+            return result
+        except Exception as e:
+            logger.error(f"Error preparing text for embedding: {str(e)}")
+            return text
+    def extract_metadata_from_text(self, text: str) -> dict:
+        """Extract metadata from text content"""
+        if not text:
+            return {}
+        try:
+            metadata = {}
+            # Basic statistics
+            metadata['character_count'] = len(text)
+            metadata['word_count'] = len(text.split())
+            metadata['sentence_count'] = len(self.extract_sentences(text))
+            metadata['paragraph_count'] = len([p for p in text.split('\n\n') if p.strip()])
+            # Content characteristics
+            metadata['avg_word_length'] = sum(len(word) for word in text.split()) / max(1, len(text.split()))
+            metadata['avg_sentence_length'] = metadata['word_count'] / max(1, metadata['sentence_count'])
+            # Special content detection
+            metadata['has_urls'] = bool(re.search(r'https?://\S+', text))
+            metadata['has_emails'] = bool(re.search(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', text))
+            metadata['has_phone_numbers'] = bool(re.search(r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b', text))
+            metadata['has_dates'] = bool(re.search(r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b', text))
+            metadata['has_numbers'] = bool(re.search(r'\b\d+\b', text))
+            # Language indicators
+            metadata['punctuation_density'] = len(re.findall(r'[.,!?;:]', text)) / max(1, len(text))
+            metadata['caps_ratio'] = len(re.findall(r'[A-Z]', text)) / max(1, len(text))
+            return metadata
+        except Exception as e:
+            logger.error(f"Error extracting text metadata: {str(e)}")
+            return {}
+    def normalize_for_search(self, text: str) -> str:
+        """Normalize text for search queries"""
+        if not text:
+            return ""
+        try:
+            # Convert to lowercase
+            text = text.lower()
+            # Remove special characters but keep spaces
+            text = re.sub(r'[^\w\s]', ' ', text)
+            # Normalize whitespace
+            text = re.sub(r'\s+', ' ', text)
+            # Strip leading/trailing whitespace
+            text = text.strip()
+            return text
+        except Exception as e:
+            logger.error(f"Error normalizing text for search: {str(e)}")
+            return text

mcp_server.py ADDED Viewed

	@@ -0,0 +1,290 @@

+import asyncio
+import logging
+from typing import Dict, Any, List, Optional
+from pathlib import Path
+from mcp.server.fastmcp import FastMCP
+from services.vector_store_service import VectorStoreService
+from services.document_store_service import DocumentStoreService
+from services.embedding_service import EmbeddingService
+from services.llm_service import LLMService
+from services.ocr_service import OCRService
+from mcp_tools.ingestion_tool import IngestionTool
+from mcp_tools.search_tool import SearchTool
+from mcp_tools.generative_tool import GenerativeTool
+# Phase 2 & 3: Voice and Podcast
+from services.llamaindex_service import LlamaIndexService
+from services.elevenlabs_service import ElevenLabsService
+from services.podcast_generator_service import PodcastGeneratorService
+from mcp_tools.voice_tool import VoiceTool
+from mcp_tools.podcast_tool import PodcastTool
+# Configure logging at INFO level and create this module's logger.
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+logger.info("Initializing services for FastMCP...")
+# Core services: each is constructed once at import time and shared by the tools below.
+vector_store_service = VectorStoreService()
+document_store_service = DocumentStoreService()
+embedding_service_instance = EmbeddingService()
+llm_service_instance = LLMService()
+ocr_service_instance = OCRService()
+# Ingestion tool: wired to the vector store, document store, embeddings and OCR.
+ingestion_tool_instance = IngestionTool(
+    vector_store=vector_store_service,
+    document_store=document_store_service,
+    embedding_service=embedding_service_instance,
+    ocr_service=ocr_service_instance
+)
+# Search tool: shares the same vector store, embedding service and document store.
+search_tool_instance = SearchTool(
+    vector_store=vector_store_service,
+    embedding_service=embedding_service_instance,
+    document_store=document_store_service
+)
+# Generative tool: uses the LLM service and the search tool created above.
+generative_tool_instance = GenerativeTool(
+    llm_service=llm_service_instance,
+    search_tool=search_tool_instance
+)
+# Phase 2 & 3 Services (voice Q&A and podcast generation)
+logger.info("Initializing Phase 2 & 3 services...")
+# The LlamaIndex service is built on the shared document store and is then passed
+# to both the ElevenLabs service and the podcast generator.
+llamaindex_service_instance = LlamaIndexService(document_store_service)
+elevenlabs_service_instance = ElevenLabsService(llamaindex_service_instance)
+podcast_generator_instance = PodcastGeneratorService(
+    llamaindex_service=llamaindex_service_instance,
+    llm_service=llm_service_instance
+)
+voice_tool_instance = VoiceTool(elevenlabs_service_instance)
+podcast_tool_instance = PodcastTool(podcast_generator_instance)
+# FastMCP server instance that the @mcp.tool() functions below register with.
+# NOTE(review): the server name is an empty string -- confirm whether a descriptive
+# name was intended.
+mcp = FastMCP("")
+logger.info("FastMCP server initialized.")
+@mcp.tool()
+async def ingest_document(file_path: str, file_type: Optional[str] = None) -> Dict[str, Any]:
+    """
+    Process and index a document from a local file path for searching.
+    Automatically determines file_type if not provided.
+    """
+    logger.info(f"Tool 'ingest_document' called with file_path: {file_path}, file_type: {file_type}")
+    try:
+        actual_file_type = file_type
+        if not actual_file_type:
+            actual_file_type = Path(file_path).suffix.lower().strip('.')
+            logger.info(f"Inferred file_type: {actual_file_type}")
+        result = await ingestion_tool_instance.process_document(file_path, actual_file_type)
+        logger.info(f"Ingestion result: {result}")
+        return result
+    except Exception as e:
+        logger.error(f"Error in 'ingest_document' tool: {str(e)}", exc_info=True)
+        return {"success": False, "error": str(e)}
+@mcp.tool()
+async def semantic_search(query: str, top_k: int = 5, filters: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
+    """
+    Search through indexed content using natural language.
+    'filters' can be used to narrow down the search.
+    """
+    logger.info(f"Tool 'semantic_search' called with query: {query}, top_k: {top_k}, filters: {filters}")
+    try:
+        results = await search_tool_instance.search(query, top_k, filters)
+        return {
+            "success": True,
+            "query": query,
+            "results": [result.to_dict() for result in results],
+            "total_results": len(results)
+        }
+    except Exception as e:
+        logger.error(f"Error in 'semantic_search' tool: {str(e)}", exc_info=True)
+        return {"success": False, "error": str(e), "results": []}
+@mcp.tool()
+async def summarize_content(
+    content: Optional[str] = None,
+    document_id: Optional[str] = None,
+    style: str = "concise"
+) -> Dict[str, Any]:
+    """
+    Generate a summary of provided content or a document_id.
+    Available styles: concise, detailed, bullet_points, executive.
+    """
+    logger.info(f"Tool 'summarize_content' called. doc_id: {document_id}, style: {style}, has_content: {content is not None}")
+    try:
+        text_to_summarize = content
+        if document_id and not text_to_summarize:
+            doc = await document_store_service.get_document(document_id)
+            if not doc:
+                return {"success": False, "error": f"Document {document_id} not found"}
+            text_to_summarize = doc.content
+        if not text_to_summarize:
+            return {"success": False, "error": "No content provided for summarization"}
+        max_length = 10000
+        if len(text_to_summarize) > max_length:
+            logger.warning(f"Content for summarization is long ({len(text_to_summarize)} chars), truncating to {max_length}")
+            text_to_summarize = text_to_summarize[:max_length] + "..."
+        summary = await generative_tool_instance.summarize(text_to_summarize, style)
+        return {
+            "success": True,
+            "summary": summary,
+            "original_length": len(text_to_summarize),
+            "summary_length": len(summary),
+            "style": style
+        }
+    except Exception as e:
+        logger.error(f"Error in 'summarize_content' tool: {str(e)}", exc_info=True)
+        return {"success": False, "error": str(e)}
+@mcp.tool()
+async def generate_tags(
+    content: Optional[str] = None,
+    document_id: Optional[str] = None,
+    max_tags: int = 5
+) -> Dict[str, Any]:
+    """
+    Generate relevant tags for content or a document_id.
+    Saves tags to document metadata if document_id is provided.
+    """
+    logger.info(f"Tool 'generate_tags' called. doc_id: {document_id}, max_tags: {max_tags}, has_content: {content is not None}")
+    try:
+        text_for_tags = content
+        if document_id and not text_for_tags:
+            doc = await document_store_service.get_document(document_id)
+            if not doc:
+                return {"success": False, "error": f"Document {document_id} not found"}
+            text_for_tags = doc.content
+        if not text_for_tags:
+            return {"success": False, "error": "No content provided for tag generation"}
+        tags = await generative_tool_instance.generate_tags(text_for_tags, max_tags)
+        if document_id and tags:
+            await document_store_service.update_document_metadata(document_id, {"tags": tags})
+            logger.info(f"Tags {tags} saved for document {document_id}")
+        return {
+            "success": True,
+            "tags": tags,
+            "content_length": len(text_for_tags),
+            "document_id": document_id
+        }
+    except Exception as e:
+        logger.error(f"Error in 'generate_tags' tool: {str(e)}", exc_info=True)
+        return {"success": False, "error": str(e)}
+@mcp.tool()
+async def answer_question(question: str, context_filter: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
+    """
+    Answer questions using RAG (Retrieval Augmented Generation) over indexed content.
+    'context_filter' can be used to narrow down the context search.
+    """
+    logger.info(f"Tool 'answer_question' called with question: {question}, context_filter: {context_filter}")
+    try:
+        search_results = await search_tool_instance.search(question, top_k=5, filters=context_filter)
+        if not search_results:
+            return {
+                "success": False,
+                "error": "No relevant context found. Please upload relevant documents.",
+                "question": question,
+                "answer": "I could not find enough information in the documents to answer your question."
+            }
+        answer = await generative_tool_instance.answer_question(question, search_results)
+        return {
+            "success": True,
+            "question": question,
+            "answer": answer,
+            "sources": [result.to_dict() for result in search_results],
+            "confidence": "high" if len(search_results) >= 3 else "medium"
+        }
+    except Exception as e:
+        logger.error(f"Error in 'answer_question' tool: {str(e)}", exc_info=True)
+        return {"success": False, "error": str(e)}
+@mcp.tool()
+async def voice_qa(question: str, session_id: Optional[str] = None) -> Dict[str, Any]:
+    """
+    Ask a question using the AI voice assistant with RAG capabilities.
+    Provides text-based Q&A powered by LlamaIndex agentic search.
+    """
+    logger.info(f"Tool 'voice_qa' called with question: {question}")
+    try:
+        result = await voice_tool_instance.voice_qa(question, session_id)
+        return result
+    except Exception as e:
+        logger.error(f"Error in 'voice_qa' tool: {str(e)}", exc_info=True)
+        return {"success": False, "error": str(e)}
+@mcp.tool()
+async def generate_podcast(
+    document_ids: List[str],
+    style: str = "conversational",
+    duration_minutes: int = 10,
+    host1_voice: str = "Rachel",
+    host2_voice: str = "Adam"
+) -> Dict[str, Any]:
+    """
+    Generate a podcast from selected documents.
+    Styles: conversational, educational, technical, casual.
+    Duration: 5-30 minutes recommended.
+    Voices: Rachel, Adam, Domi, Bella, Antoni, Josh, Sam, Emily, etc.
+    """
+    logger.info(f"Tool 'generate_podcast' called with {len(document_ids)} docs, style: {style}")
+    try:
+        result = await podcast_tool_instance.generate_podcast(
+            document_ids=document_ids,
+            style=style,
+            duration_minutes=duration_minutes,
+            host1_voice=host1_voice,
+            host2_voice=host2_voice
+        )
+        return result
+    except Exception as e:
+        logger.error(f"Error in 'generate_podcast' tool: {str(e)}", exc_info=True)
+        return {"success": False, "error": str(e)}
+@mcp.tool()
+async def list_documents_for_ui(limit: int = 100, offset: int = 0) -> Dict[str, Any]:
+    """
+    (UI Helper) List documents from the document store.
+    Not a standard processing tool, but useful for UI population.
+    """
+    logger.info(f"Tool 'list_documents_for_ui' called with limit: {limit}, offset: {offset}")
+    try:
+        documents = await document_store_service.list_documents(limit, offset)
+        return {
+            "success": True,
+            "documents": [doc.to_dict() for doc in documents],
+            "total": len(documents)
+        }
+    except Exception as e:
+        logger.error(f"Error in 'list_documents_for_ui' tool: {str(e)}", exc_info=True)
+        return {"success": False, "error": str(e), "documents": []}
+# Blaxel Deployment Support
+from fastapi import FastAPI
+from mcp.server.streamable_http import StreamableHTTPServerTransport
+import os
+# Get Blaxel environment variables
+host = os.getenv("BL_SERVER_HOST", "0.0.0.0")
+port = int(os.getenv("BL_SERVER_PORT", "8000"))
+# Create FastAPI app
+app = FastAPI()
+# Initialize HTTP transport instead of stdio
+transport = StreamableHTTPServerTransport()
+# Connect MCP server to HTTP transport
+mcp.connect(transport)
+# Mount transport to FastAPI
+transport.mount(app)
+@app.get("/health")
+async def health_check():
+    """Health check endpoint for Modal"""
+    return {"status": "healthy", "service": "mcp-server"}
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host=host, port=port)

mcp_tools/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # MCP tools module initialization

mcp_tools/__pycache__/__init__.cpython-313.pyc ADDED Viewed

Binary file (150 Bytes). View file

mcp_tools/__pycache__/generative_tool.cpython-313.pyc ADDED Viewed

Binary file (21.4 kB). View file

mcp_tools/__pycache__/ingestion_tool.cpython-313.pyc ADDED Viewed

Binary file (16.8 kB). View file

mcp_tools/__pycache__/podcast_tool.cpython-313.pyc ADDED Viewed

Binary file (5.19 kB). View file

mcp_tools/__pycache__/search_tool.cpython-313.pyc ADDED Viewed

Binary file (22 kB). View file

mcp_tools/__pycache__/voice_tool.cpython-313.pyc ADDED Viewed

Binary file (2.42 kB). View file

mcp_tools/generative_tool.py ADDED Viewed

	@@ -0,0 +1,407 @@

+import logging
+from typing import List, Dict, Any, Optional
+import asyncio
+from services.llm_service import LLMService
+from mcp_tools.search_tool import SearchTool
+from core.models import SearchResult
+logger = logging.getLogger(__name__)
+class GenerativeTool:
+    def __init__(self, llm_service: LLMService, search_tool: Optional[SearchTool] = None):
+        self.llm_service = llm_service
+        self.search_tool = search_tool
+    async def summarize(self, content: str, style: str = "concise", max_length: Optional[int] = None) -> str:
+        """Generate a summary of the given content"""
+        try:
+            if not content.strip():
+                return "No content provided for summarization."
+            logger.info(f"Generating {style} summary for content of length {len(content)}")
+            summary = await self.llm_service.summarize(content, style, max_length)
+            logger.info(f"Generated summary of length {len(summary)}")
+            return summary
+        except Exception as e:
+            logger.error(f"Error generating summary: {str(e)}")
+            return f"Error generating summary: {str(e)}"
+    async def generate_tags(self, content: str, max_tags: int = 5) -> List[str]:
+        """Generate relevant tags for the given content"""
+        try:
+            if not content.strip():
+                return []
+            logger.info(f"Generating up to {max_tags} tags for content")
+            tags = await self.llm_service.generate_tags(content, max_tags)
+            logger.info(f"Generated {len(tags)} tags")
+            return tags
+        except Exception as e:
+            logger.error(f"Error generating tags: {str(e)}")
+            return []
+    async def categorize(self, content: str, categories: List[str]) -> str:
+        """Categorize content into one of the provided categories"""
+        try:
+            if not content.strip():
+                return "Uncategorized"
+            if not categories:
+                categories = ["Technology", "Business", "Science", "Education", "Entertainment", "News", "Research", "Other"]
+            logger.info(f"Categorizing content into one of {len(categories)} categories")
+            category = await self.llm_service.categorize(content, categories)
+            logger.info(f"Categorized as: {category}")
+            return category
+        except Exception as e:
+            logger.error(f"Error categorizing content: {str(e)}")
+            return "Uncategorized"
+    async def answer_question(self, question: str, context_results: List[SearchResult] = None) -> str:
+        """Answer a question using the provided context or RAG"""
+        try:
+            if not question.strip():
+                return "No question provided."
+            logger.info(f"Answering question: {question[:100]}...")
+            # If no context provided and search tool is available, search for relevant context
+            if not context_results and self.search_tool:
+                logger.info("No context provided, searching for relevant information")
+                context_results = await self.search_tool.search(question, top_k=5)
+            # Prepare context from search results
+            if context_results:
+                context_texts = []
+                for result in context_results:
+                    context_texts.append(f"Source: {result.document_id}\nContent: {result.content}\n")
+                context = "\n---\n".join(context_texts)
+                logger.info(f"Using context from {len(context_results)} sources")
+            else:
+                context = ""
+                logger.info("No context available for answering question")
+            # Generate answer
+            answer = await self.llm_service.answer_question(question, context)
+            logger.info(f"Generated answer of length {len(answer)}")
+            return answer
+        except Exception as e:
+            logger.error(f"Error answering question: {str(e)}")
+            return f"I encountered an error while trying to answer your question: {str(e)}"
+    async def generate_outline(self, topic: str, num_sections: int = 5, detail_level: str = "medium") -> str:
+        """Generate an outline for the given topic"""
+        try:
+            if not topic.strip():
+                return "No topic provided."
+            detail_descriptions = {
+                "brief": "brief bullet points",
+                "medium": "detailed bullet points with descriptions",
+                "detailed": "comprehensive outline with sub-sections and explanations"
+            }
+            detail_desc = detail_descriptions.get(detail_level, "detailed bullet points")
+            prompt = f"""Create a {detail_desc} outline for the topic: "{topic}"
+            The outline should have {num_sections} main sections and be well-structured and informative.
+            Format the outline clearly with proper numbering and indentation.
+            Topic: {topic}
+            Outline:"""
+            outline = await self.llm_service.generate_text(prompt, max_tokens=800, temperature=0.7)
+            logger.info(f"Generated outline for topic: {topic}")
+            return outline
+        except Exception as e:
+            logger.error(f"Error generating outline: {str(e)}")
+            return f"Error generating outline: {str(e)}"
+    async def explain_concept(self, concept: str, audience: str = "general", length: str = "medium") -> str:
+        """Explain a concept for a specific audience"""
+        try:
+            if not concept.strip():
+                return "No concept provided."
+            audience_styles = {
+                "general": "a general audience using simple, clear language",
+                "technical": "a technical audience with appropriate jargon and detail",
+                "beginner": "beginners with no prior knowledge, using analogies and examples",
+                "expert": "experts in the field with advanced terminology and depth"
+            }
+            length_guidance = {
+                "brief": "Keep the explanation concise and to the point (2-3 paragraphs).",
+                "medium": "Provide a comprehensive explanation (4-6 paragraphs).",
+                "detailed": "Give a thorough, in-depth explanation with examples."
+            }
+            audience_desc = audience_styles.get(audience, "a general audience")
+            length_desc = length_guidance.get(length, "Provide a comprehensive explanation.")
+            prompt = f"""Explain the concept of "{concept}" for {audience_desc}.
+            {length_desc}
+            Make sure to:
+            - Use appropriate language for the audience
+            - Include relevant examples or analogies
+            - Structure the explanation logically
+            - Ensure clarity and accuracy
+            Concept to explain: {concept}
+            Explanation:"""
+            explanation = await self.llm_service.generate_text(prompt, max_tokens=600, temperature=0.5)
+            logger.info(f"Generated explanation for concept: {concept}")
+            return explanation
+        except Exception as e:
+            logger.error(f"Error explaining concept: {str(e)}")
+            return f"Error explaining concept: {str(e)}"
+    async def compare_concepts(self, concept1: str, concept2: str, aspects: List[str] = None) -> str:
+        """Compare two concepts across specified aspects"""
+        try:
+            if not concept1.strip() or not concept2.strip():
+                return "Both concepts must be provided for comparison."
+            if not aspects:
+                aspects = ["definition", "key features", "advantages", "disadvantages", "use cases"]
+            aspects_str = ", ".join(aspects)
+            prompt = f"""Compare and contrast "{concept1}" and "{concept2}" across the following aspects: {aspects_str}.
+            Structure your comparison clearly, addressing each aspect for both concepts.
+            Format:
+            ## Comparison: {concept1} vs {concept2}
+            For each aspect, provide:
+            - **{concept1}**: [description]
+            - **{concept2}**: [description]
+            - **Key Difference**: [summary]
+            For each aspect, provide:
+            - **{concept1}**: [description]
+            - **{concept2}**: [description]
+            - **Key Difference**: [summary]
+            Concepts to compare:
+            1. {concept1}
+            2. {concept2}
+            Comparison:"""
+            comparison = await self.llm_service.generate_text(prompt, max_tokens=800, temperature=0.6)
+            logger.info(f"Generated comparison between {concept1} and {concept2}")
+            return comparison
+        except Exception as e:
+            logger.error(f"Error comparing concepts: {str(e)}")
+            return f"Error comparing concepts: {str(e)}"
+    async def generate_questions(self, content: str, question_type: str = "comprehension", num_questions: int = 5) -> List[str]:
+        """Generate questions based on the provided content"""
+        try:
+            if not content.strip():
+                return []
+            question_types = {
+                "comprehension": "comprehension questions that test understanding of key concepts",
+                "analysis": "analytical questions that require deeper thinking and evaluation",
+                "application": "application questions that ask how to use the concepts in practice",
+                "creative": "creative questions that encourage original thinking and exploration",
+                "factual": "factual questions about specific details and information"
+            }
+            question_desc = question_types.get(question_type, "comprehension questions")
+            prompt = f"""Based on the following content, generate {num_questions} {question_desc}.
+            The questions should be:
+            - Clear and well-formulated
+            - Relevant to the content
+            - Appropriate for the specified type
+            - Engaging and thought-provoking
+            Content:
+            {content[:2000]}  # Limit content length
+            Questions:"""
+            response = await self.llm_service.generate_text(prompt, max_tokens=400, temperature=0.7)
+            # Parse questions from response
+            questions = []
+            lines = response.split('\n')
+            for line in lines:
+                line = line.strip()
+                if line and ('?' in line or line.startswith(('1.', '2.', '3.', '4.', '5.', '-', '*'))):
+                    # Clean up the question
+                    question = line.lstrip('0123456789.-* ').strip()
+                    if question and '?' in question:
+                        questions.append(question)
+            logger.info(f"Generated {len(questions)} {question_type} questions")
+            return questions[:num_questions]
+        except Exception as e:
+            logger.error(f"Error generating questions: {str(e)}")
+            return []
+    def _chunk_text(self, text: str, chunk_size: int = 2000) -> List[str]:
+        """Split text into chunks respecting paragraph boundaries"""
+        if len(text) <= chunk_size:
+            return [text]
+        chunks = []
+        current_chunk = ""
+        # Split by paragraphs first
+        paragraphs = text.split('\n\n')
+        for para in paragraphs:
+            if len(current_chunk) + len(para) + 2 <= chunk_size:
+                current_chunk += para + "\n\n"
+            else:
+                if current_chunk:
+                    chunks.append(current_chunk.strip())
+                current_chunk = para + "\n\n"
+                # If a single paragraph is too long, split it by sentences
+                if len(current_chunk) > chunk_size:
+                    # Reset current_chunk and split the long paragraph
+                    long_para = current_chunk.strip()
+                    current_chunk = ""
+                    sentences = long_para.replace('. ', '.\n').split('\n')
+                    sub_chunk = ""
+                    for sentence in sentences:
+                        if len(sub_chunk) + len(sentence) + 1 <= chunk_size:
+                            sub_chunk += sentence + " "
+                        else:
+                            if sub_chunk:
+                                chunks.append(sub_chunk.strip())
+                            sub_chunk = sentence + " "
+                    if sub_chunk:
+                        current_chunk = sub_chunk # Carry over remaining part
+        if current_chunk:
+            chunks.append(current_chunk.strip())
+        return chunks
+    async def paraphrase_text(self, text: str, style: str = "formal", preserve_meaning: bool = True) -> str:
+        """Paraphrase text in a different style while preserving meaning"""
+        try:
+            if not text.strip():
+                return "No text provided for paraphrasing."
+            # Check length and chunk if necessary
+            MAX_CHUNK_SIZE = 2500
+            if len(text) > MAX_CHUNK_SIZE:
+                logger.info(f"Text length {len(text)} exceeds limit, chunking...")
+                chunks = self._chunk_text(text, MAX_CHUNK_SIZE)
+                logger.info(f"Split into {len(chunks)} chunks")
+                paraphrased_chunks = []
+                for i, chunk in enumerate(chunks):
+                    logger.info(f"Processing chunk {i+1}/{len(chunks)}")
+                    # Process chunk
+                    chunk_result = await self.paraphrase_text(chunk, style, preserve_meaning)
+                    paraphrased_chunks.append(chunk_result)
+                    # Small delay to be nice to rate limits
+                    await asyncio.sleep(0.5)
+                return "\n\n".join(paraphrased_chunks)
+            style_instructions = {
+                "formal": "formal, professional language",
+                "casual": "casual, conversational language",
+                "academic": "academic, scholarly language",
+                "simple": "simple, easy-to-understand language",
+                "technical": "technical, precise language"
+            }
+            style_desc = style_instructions.get(style, "clear, appropriate language")
+            meaning_instruction = "while preserving the exact meaning and key information" if preserve_meaning else "while maintaining the general intent"
+            prompt = f"""Paraphrase the following text using {style_desc} {meaning_instruction}.
+            Original text:
+            {text}
+            Paraphrased text:"""
+            paraphrase = await self.llm_service.generate_text(prompt, max_tokens=len(text.split()) * 2, temperature=0.6)
+            logger.info(f"Paraphrased text in {style} style")
+            return paraphrase.strip()
+        except Exception as e:
+            logger.error(f"Error paraphrasing text: {str(e)}")
+            return f"Error paraphrasing text: {str(e)}"
+    async def extract_key_insights(self, content: str, num_insights: int = 5) -> List[str]:
+        """Extract key insights from the provided content"""
+        try:
+            if not content.strip():
+                return []
+            prompt = f'''Analyze the following content and extract {num_insights} key insights or takeaways.
+            Each insight should be:
+            - A clear, concise statement
+            - Significant and meaningful
+            - Based on the content provided
+            - Actionable or thought-provoking when possible
+            Content:
+            {content[:3000]}  # Limit content length
+            Key Insights:'''
+            response = await self.llm_service.generate_text(prompt, max_tokens=400, temperature=0.6)
+            # Parse insights from response
+            insights = []
+            lines = response.split('\n')
+            for line in lines:
+                line = line.strip()
+                if line and (line.startswith(('1.', '2.', '3.', '4.', '5.', '-', '*')) or len(insights) == 0):
+                    # Clean up the insight
+                    insight = line.lstrip('0123456789.-* ').strip()
+                    if insight and len(insight) > 10:  # Minimum insight length
+                        insights.append(insight)
+            logger.info(f"Extracted {len(insights)} key insights")
+            return insights[:num_insights]
+        except Exception as e:
+            logger.error(f"Error extracting insights: {str(e)}")
+            return []

mcp_tools/ingestion_tool.py ADDED Viewed

	@@ -0,0 +1,368 @@

+import logging
+import asyncio
+from typing import Dict, Any, Optional
+import tempfile
+import os
+from pathlib import Path
+import uuid
+from core.document_parser import DocumentParser
+from core.chunker import TextChunker
+from core.text_preprocessor import TextPreprocessor
+from services.vector_store_service import VectorStoreService
+from services.document_store_service import DocumentStoreService
+from services.embedding_service import EmbeddingService
+from services.ocr_service import OCRService
+logger = logging.getLogger(__name__)
+class IngestionTool:
+    def __init__(self, vector_store: VectorStoreService, document_store: DocumentStoreService,
+             embedding_service: EmbeddingService, ocr_service: OCRService):
+        self.vector_store = vector_store
+        self.document_store = document_store
+        self.embedding_service = embedding_service
+        self.ocr_service = ocr_service
+        self.document_parser = DocumentParser()
+        # Pass OCR service to document parser
+        self.document_parser.ocr_service = ocr_service
+        self.text_chunker = TextChunker()
+        self.text_preprocessor = TextPreprocessor()
+    async def process_document(self, file_path: str, file_type: str, task_id: Optional[str] = None) -> Dict[str, Any]:
+        """Process a document through the full ingestion pipeline"""
+        if task_id is None:
+            task_id = str(uuid.uuid4())
+        try:
+            logger.info(f"Starting document processing for {file_path}")
+            # Step 1: Parse the document
+            filename = Path(file_path).name
+            document = await self.document_parser.parse_document(file_path, filename)
+            if not document.content:
+                logger.warning(f"No content extracted from document {filename}")
+                return {
+                    "success": False,
+                    "error": "No content could be extracted from the document",
+                    "task_id": task_id
+                }
+            # Step 2: Store the document
+            await self.document_store.store_document(document)
+            # Step 3: Process content for embeddings
+            chunks = await self._create_and_embed_chunks(document)
+            if not chunks:
+                logger.warning(f"No chunks created for document {document.id}")
+                return {
+                    "success": False,
+                    "error": "Failed to create text chunks",
+                    "task_id": task_id,
+                    "document_id": document.id,
+                    "filename": document.filename,
+                    "chunks_created": len(chunks),
+                    "content_length": len(document.content),
+                    "doc_type": document.doc_type.value,
+                    "message": f"Successfully processed {filename}"
+                }
+            # Step 4: Store embeddings
+            success = await self.vector_store.add_chunks(chunks)
+            if not success:
+                logger.error(f"Failed to store embeddings for document {document.id}")
+                return {
+                    "success": False,
+                    "error": "Failed to store embeddings",
+                    "task_id": task_id,
+                    "document_id": document.id
+                }
+            # Step 5: Update document metadata with chunk count
+            try:
+                current_metadata = document.metadata or {}
+                current_metadata["chunk_count"] = len(chunks)
+                await self.document_store.update_document_metadata(
+                    document.id,
+                    {"metadata": current_metadata}
+                )
+            except Exception as e:
+                logger.warning(f"Failed to update chunk count for document {document.id}: {e}")
+            logger.info(f"Successfully processed document {document.id} with {len(chunks)} chunks")
+            return {
+                "success": True,
+                "task_id": task_id,
+                "document_id": document.id,
+                "filename": document.filename,
+                "chunks_created": len(chunks),
+                "content_length": len(document.content),
+                "doc_type": document.doc_type.value,
+                "message": f"Successfully processed {filename}"
+            }
+        except Exception as e:
+            logger.error(f"Error processing document {file_path}: {str(e)}")
+            return {
+                "success": False,
+                "error": str(e),
+                "task_id": task_id,
+                "message": f"Failed to process document: {str(e)}"
+            }
+    async def _create_and_embed_chunks(self, document) -> list:
+        """Create chunks and generate embeddings"""
+        try:
+            # Step 1: Create chunks
+            chunks = self.text_chunker.chunk_document(
+                document.id,
+                document.content,
+                method="recursive"
+            )
+            if not chunks:
+                return []
+            # Step 2: Optimize chunks for embedding
+            optimized_chunks = self.text_chunker.optimize_chunks_for_embedding(chunks)
+            # Step 3: Generate embeddings
+            texts = [chunk.content for chunk in optimized_chunks]
+            embeddings = await self.embedding_service.generate_embeddings(texts)
+            # Step 4: Add embeddings to chunks
+            embedded_chunks = []
+            for i, chunk in enumerate(optimized_chunks):
+                if i < len(embeddings):
+                    chunk.embedding = embeddings[i]
+                    embedded_chunks.append(chunk)
+            return embedded_chunks
+        except Exception as e:
+            logger.error(f"Error creating and embedding chunks: {str(e)}")
+            return []
+    async def process_url(self, url: str, task_id: Optional[str] = None) -> Dict[str, Any]:
+        """Process a document from a URL"""
+        try:
+            import requests
+            from urllib.parse import urlparse
+            # Download the file
+            response = requests.get(url, timeout=30)
+            response.raise_for_status()
+            # Determine file type from URL or content-type
+            parsed_url = urlparse(url)
+            filename = Path(parsed_url.path).name or "downloaded_file"
+            # Create temporary file
+            with tempfile.NamedTemporaryFile(delete=False, suffix=f"_{filename}") as tmp_file:
+                tmp_file.write(response.content)
+                tmp_file_path = tmp_file.name
+            try:
+                # Process the downloaded file
+                result = await self.process_document(tmp_file_path, "", task_id)
+                result["source_url"] = url
+                return result
+            finally:
+                # Clean up temporary file
+                if os.path.exists(tmp_file_path):
+                    os.unlink(tmp_file_path)
+        except Exception as e:
+            logger.error(f"Error processing URL {url}: {str(e)}")
+            return {
+                "success": False,
+                "error": str(e),
+                "task_id": task_id or str(uuid.uuid4()),
+                "source_url": url
+            }
+    async def process_text_content(self, content: str, filename: str = "text_content.txt",
+                                 task_id: Optional[str] = None) -> Dict[str, Any]:
+        """Process raw text content directly"""
+        try:
+            from core.models import Document, DocumentType
+            from datetime import datetime
+            # Create document object
+            document = Document(
+                id=str(uuid.uuid4()),
+                filename=filename,
+                content=content,
+                doc_type=DocumentType.TEXT,
+                file_size=len(content.encode('utf-8')),
+                created_at=datetime.utcnow(),
+                metadata={
+                    "source": "direct_text_input",
+                    "content_length": len(content),
+                    "word_count": len(content.split())
+                }
+            )
+            # Store the document
+            await self.document_store.store_document(document)
+            # Process content for embeddings
+            chunks = await self._create_and_embed_chunks(document)
+            if chunks:
+                await self.vector_store.add_chunks(chunks)
+                # Update document metadata with chunk count
+                try:
+                    current_metadata = document.metadata or {}
+                    current_metadata["chunk_count"] = len(chunks)
+                    await self.document_store.update_document_metadata(
+                        document.id,
+                        {"metadata": current_metadata}
+                    )
+                except Exception as e:
+                    logger.warning(f"Failed to update chunk count for document {document.id}: {e}")
+            return {
+                "success": True,
+                "task_id": task_id or str(uuid.uuid4()),
+                "document_id": document.id,
+                "filename": filename,
+                "chunks_created": len(chunks),
+                "content_length": len(content),
+                "message": f"Successfully processed text content"
+            }
+        except Exception as e:
+            logger.error(f"Error processing text content: {str(e)}")
+            return {
+                "success": False,
+                "error": str(e),
+                "task_id": task_id or str(uuid.uuid4())
+            }
+    async def reprocess_document(self, document_id: str, task_id: Optional[str] = None) -> Dict[str, Any]:
+        """Reprocess an existing document (useful for updating embeddings)"""
+        try:
+            # Get the document
+            document = await self.document_store.get_document(document_id)
+            if not document:
+                return {
+                    "success": False,
+                    "error": f"Document {document_id} not found",
+                    "task_id": task_id or str(uuid.uuid4())
+                }
+            # Remove existing chunks from vector store
+            await self.vector_store.delete_document(document_id)
+            # Recreate and embed chunks
+            chunks = await self._create_and_embed_chunks(document)
+            if chunks:
+                await self.vector_store.add_chunks(chunks)
+                # Update document metadata with chunk count
+                try:
+                    current_metadata = document.metadata or {}
+                    current_metadata["chunk_count"] = len(chunks)
+                    await self.document_store.update_document_metadata(
+                        document.id,
+                        {"metadata": current_metadata}
+                    )
+                except Exception as e:
+                    logger.warning(f"Failed to update chunk count for document {document.id}: {e}")
+            return {
+                "success": True,
+                "task_id": task_id or str(uuid.uuid4()),
+                "document_id": document_id,
+                "filename": document.filename,
+                "chunks_created": len(chunks),
+                "message": f"Successfully reprocessed {document.filename}"
+            }
+        except Exception as e:
+            logger.error(f"Error reprocessing document {document_id}: {str(e)}")
+            return {
+                "success": False,
+                "error": str(e),
+                "task_id": task_id or str(uuid.uuid4()),
+                "document_id": document_id
+            }
+    async def batch_process_directory(self, directory_path: str, task_id: Optional[str] = None) -> Dict[str, Any]:
+        """Process multiple documents from a directory"""
+        try:
+            directory = Path(directory_path)
+            if not directory.exists() or not directory.is_dir():
+                return {
+                    "success": False,
+                    "error": f"Directory {directory_path} does not exist",
+                    "task_id": task_id or str(uuid.uuid4())
+                }
+            # Supported file extensions
+            supported_extensions = {'.txt', '.pdf', '.docx', '.png', '.jpg', '.jpeg', '.bmp', '.tiff'}
+            # Find all supported files
+            files_to_process = []
+            for ext in supported_extensions:
+                files_to_process.extend(directory.glob(f"*{ext}"))
+                files_to_process.extend(directory.glob(f"*{ext.upper()}"))
+            if not files_to_process:
+                return {
+                    "success": False,
+                    "error": "No supported files found in directory",
+                    "task_id": task_id or str(uuid.uuid4())
+                }
+            # Process files
+            results = []
+            successful = 0
+            failed = 0
+            for file_path in files_to_process:
+                try:
+                    result = await self.process_document(str(file_path), file_path.suffix)
+                    results.append(result)
+                    if result.get("success"):
+                        successful += 1
+                    else:
+                        failed += 1
+                except Exception as e:
+                    failed += 1
+                    results.append({
+                        "success": False,
+                        "error": str(e),
+                        "filename": file_path.name
+                    })
+            return {
+                "success": True,
+                "task_id": task_id or str(uuid.uuid4()),
+                "directory": str(directory),
+                "total_files": len(files_to_process),
+                "successful": successful,
+                "failed": failed,
+                "results": results,
+                "message": f"Processed {successful}/{len(files_to_process)} files successfully"
+            }
+        except Exception as e:
+            logger.error(f"Error batch processing directory {directory_path}: {str(e)}")
+            return {
+                "success": False,
+                "error": str(e),
+                "task_id": task_id or str(uuid.uuid4())
+            }

mcp_tools/podcast_tool.py ADDED Viewed

	@@ -0,0 +1,138 @@

+import logging
+from typing import Dict, Any, List
+from dataclasses import asdict
+logger = logging.getLogger(__name__)
+class PodcastTool:
+    """
+    MCP Tool for podcast generation from documents
+    """
+    def __init__(self, podcast_generator):
+        """
+        Initialize Podcast Tool
+        Args:
+            podcast_generator: PodcastGeneratorService instance
+        """
+        self.podcast_generator = podcast_generator
+    async def generate_podcast(
+        self,
+        document_ids: List[str],
+        style: str = "conversational",
+        duration_minutes: int = 10,
+        host1_voice: str = "Rachel",
+        host2_voice: str = "Adam"
+    ) -> Dict[str, Any]:
+        """
+        MCP Tool: Generate podcast from documents
+        Args:
+            document_ids: List of document IDs to generate podcast from
+            style: Podcast style (conversational, educational, technical, casual)
+            duration_minutes: Target duration in minutes
+            host1_voice: Voice name for first host
+            host2_voice: Voice name for second host
+        Returns:
+            Dictionary with podcast ID, audio URL, transcript, and metadata
+        """
+        try:
+            if not document_ids or len(document_ids) == 0:
+                return {
+                    "success": False,
+                    "error": "No documents provided. Please select at least one document."
+                }
+            logger.info(f"Generating podcast from {len(document_ids)} documents")
+            # Generate podcast using service
+            result = await self.podcast_generator.generate_podcast(
+                document_ids=document_ids,
+                style=style,
+                duration_minutes=duration_minutes,
+                host1_voice=host1_voice,
+                host2_voice=host2_voice
+            )
+            if result.success:
+                return {
+                    "success": True,
+                    "podcast_id": result.podcast_id,
+                    "audio_file": result.audio_file_path,
+                    "audio_url": f"/data/podcasts/{result.podcast_id}.mp3",
+                    "transcript": result.transcript,
+                    "metadata": asdict(result.metadata) if result.metadata else {},
+                    "generation_time": result.generation_time,
+                    "message": f"Podcast generated successfully! Duration: {result.metadata.duration_seconds/60:.1f} minutes"
+                }
+            else:
+                return {
+                    "success": False,
+                    "error": result.error or "Unknown error during podcast generation"
+                }
+        except Exception as e:
+            logger.error(f"Podcast generation failed: {str(e)}")
+            return {
+                "success": False,
+                "error": str(e)
+            }
+    def list_podcasts(self, limit: int = 10) -> Dict[str, Any]:
+        """
+        List previously generated podcasts
+        Args:
+            limit: Maximum number of podcasts to return
+        Returns:
+            Dictionary with list of podcast metadata
+        """
+        try:
+            podcasts = self.podcast_generator.list_podcasts(limit=limit)
+            return {
+                "success": True,
+                "podcasts": [asdict(p) for p in podcasts],
+                "total": len(podcasts)
+            }
+        except Exception as e:
+            logger.error(f"Failed to list podcasts: {str(e)}")
+            return {
+                "success": False,
+                "error": str(e),
+                "podcasts": []
+            }
+    def get_podcast(self, podcast_id: str) -> Dict[str, Any]:
+        """
+        Get specific podcast by ID
+        Args:
+            podcast_id: Podcast identifier
+        Returns:
+            Dictionary with podcast metadata
+        """
+        try:
+            podcast = self.podcast_generator.get_podcast(podcast_id)
+            if podcast:
+                return {
+                    "success": True,
+                    "podcast": asdict(podcast)
+                }
+            else:
+                return {
+                    "success": False,
+                    "error": "Podcast not found"
+                }
+        except Exception as e:
+            logger.error(f"Failed to get podcast: {str(e)}")
+            return {
+                "success": False,
+                "error": str(e)
+            }

mcp_tools/search_tool.py ADDED Viewed

	@@ -0,0 +1,437 @@

+import logging
+from typing import List, Dict, Any, Optional
+import asyncio
+from core.models import SearchResult
+from services.vector_store_service import VectorStoreService
+from services.embedding_service import EmbeddingService
+from services.document_store_service import DocumentStoreService
+import config
+logger = logging.getLogger(__name__)
+class SearchTool:
+    def __init__(self, vector_store: VectorStoreService, embedding_service: EmbeddingService,
+                 document_store: Optional[DocumentStoreService] = None, llamaindex_service: Any = None):
+        self.vector_store = vector_store
+        self.embedding_service = embedding_service
+        self.document_store = document_store
+        self.llamaindex_service = llamaindex_service
+        self.config = config.config
+    async def search(self, query: str, top_k: int = 5, filters: Optional[Dict[str, Any]] = None,
+                    similarity_threshold: Optional[float] = None) -> List[SearchResult]:
+        """Perform semantic search"""
+        try:
+            if not query.strip():
+                logger.warning("Empty search query provided")
+                return []
+            # Use default threshold if not provided
+            if similarity_threshold is None:
+                similarity_threshold = self.config.SIMILARITY_THRESHOLD
+            logger.info(f"Performing semantic search for: '{query}' (top_k={top_k})")
+            # Generate query embedding
+            query_embedding = await self.embedding_service.generate_single_embedding(query)
+            if not query_embedding:
+                logger.error("Failed to generate query embedding")
+                return []
+            # Perform vector search
+            results = await self.vector_store.search(
+                query_embedding=query_embedding,
+                top_k=top_k,
+                filters=filters
+            )
+            # Filter by similarity threshold
+            filtered_results = [
+                result for result in results
+                if result.score >= similarity_threshold
+            ]
+            logger.info(f"Found {len(filtered_results)} results above threshold {similarity_threshold}")
+            # Enhance results with additional metadata if document store is available
+            if self.document_store:
+                enhanced_results = await self._enhance_results_with_metadata(filtered_results)
+                return enhanced_results
+            return filtered_results
+        except Exception as e:
+            logger.error(f"Error performing semantic search: {str(e)}")
+            return []
+    async def agentic_search(self, query: str) -> str:
+        """Perform agentic search using LlamaIndex"""
+        if not self.llamaindex_service:
+            logger.warning("LlamaIndex service not available for agentic search")
+            return "Agentic search not available."
+        try:
+            logger.info(f"Performing agentic search for: '{query}'")
+            return await self.llamaindex_service.query(query)
+        except Exception as e:
+            logger.error(f"Error performing agentic search: {str(e)}")
+            return f"Error performing agentic search: {str(e)}"
+    async def _enhance_results_with_metadata(self, results: List[SearchResult]) -> List[SearchResult]:
+        """Enhance search results with document metadata"""
+        try:
+            enhanced_results = []
+            for result in results:
+                try:
+                    # Get document metadata
+                    document = await self.document_store.get_document(result.document_id)
+                    if document:
+                        # Add document metadata to result
+                        enhanced_metadata = {
+                            **result.metadata,
+                            "document_filename": document.filename,
+                            "document_type": document.doc_type.value,
+                            "document_tags": document.tags,
+                            "document_category": document.category,
+                            "document_created_at": document.created_at.isoformat(),
+                            "document_summary": document.summary
+                        }
+                        enhanced_result = SearchResult(
+                            chunk_id=result.chunk_id,
+                            document_id=result.document_id,
+                            content=result.content,
+                            score=result.score,
+                            metadata=enhanced_metadata
+                        )
+                        enhanced_results.append(enhanced_result)
+                    else:
+                        # Document not found, use original result
+                        enhanced_results.append(result)
+                except Exception as e:
+                    logger.warning(f"Error enhancing result {result.chunk_id}: {str(e)}")
+                    enhanced_results.append(result)
+            return enhanced_results
+        except Exception as e:
+            logger.error(f"Error enhancing results: {str(e)}")
+            return results
+    async def multi_query_search(self, queries: List[str], top_k: int = 5,
+                               aggregate_method: str = "merge") -> List[SearchResult]:
+        """Perform search with multiple queries and aggregate results"""
+        try:
+            all_results = []
+            # Perform search for each query
+            for query in queries:
+                if query.strip():
+                    query_results = await self.search(query, top_k)
+                    all_results.extend(query_results)
+            if not all_results:
+                return []
+            # Aggregate results
+            if aggregate_method == "merge":
+                return await self._merge_results(all_results, top_k)
+            elif aggregate_method == "intersect":
+                return await self._intersect_results(all_results, top_k)
+            elif aggregate_method == "average":
+                return await self._average_results(all_results, top_k)
+            else:
+                # Default to merge
+                return await self._merge_results(all_results, top_k)
+        except Exception as e:
+            logger.error(f"Error in multi-query search: {str(e)}")
+            return []
+    async def _merge_results(self, results: List[SearchResult], top_k: int) -> List[SearchResult]:
+        """Merge results and remove duplicates, keeping highest scores"""
+        try:
+            # Group by chunk_id and keep highest score
+            chunk_scores = {}
+            chunk_results = {}
+            for result in results:
+                chunk_id = result.chunk_id
+                if chunk_id not in chunk_scores or result.score > chunk_scores[chunk_id]:
+                    chunk_scores[chunk_id] = result.score
+                    chunk_results[chunk_id] = result
+            # Sort by score and return top_k
+            merged_results = list(chunk_results.values())
+            merged_results.sort(key=lambda x: x.score, reverse=True)
+            return merged_results[:top_k]
+        except Exception as e:
+            logger.error(f"Error merging results: {str(e)}")
+            return results[:top_k]
+    async def _intersect_results(self, results: List[SearchResult], top_k: int) -> List[SearchResult]:
+        """Find chunks that appear in multiple queries"""
+        try:
+            # Count occurrences of each chunk
+            chunk_counts = {}
+            chunk_results = {}
+            for result in results:
+                chunk_id = result.chunk_id
+                chunk_counts[chunk_id] = chunk_counts.get(chunk_id, 0) + 1
+                if chunk_id not in chunk_results or result.score > chunk_results[chunk_id].score:
+                    chunk_results[chunk_id] = result
+            # Filter chunks that appear more than once
+            intersect_results = [
+                result for chunk_id, result in chunk_results.items()
+                if chunk_counts[chunk_id] > 1
+            ]
+            # Sort by score
+            intersect_results.sort(key=lambda x: x.score, reverse=True)
+            return intersect_results[:top_k]
+        except Exception as e:
+            logger.error(f"Error intersecting results: {str(e)}")
+            return []
+    async def _average_results(self, results: List[SearchResult], top_k: int) -> List[SearchResult]:
+        """Average scores for chunks that appear multiple times"""
+        try:
+            # Group by chunk_id and calculate average scores
+            chunk_groups = {}
+            for result in results:
+                chunk_id = result.chunk_id
+                if chunk_id not in chunk_groups:
+                    chunk_groups[chunk_id] = []
+                chunk_groups[chunk_id].append(result)
+            # Calculate average scores
+            averaged_results = []
+            for chunk_id, group in chunk_groups.items():
+                avg_score = sum(r.score for r in group) / len(group)
+                # Use the result with the highest individual score but update the score to average
+                best_result = max(group, key=lambda x: x.score)
+                averaged_result = SearchResult(
+                    chunk_id=best_result.chunk_id,
+                    document_id=best_result.document_id,
+                    content=best_result.content,
+                    score=avg_score,
+                    metadata={
+                        **best_result.metadata,
+                        "query_count": len(group),
+                        "score_range": f"{min(r.score for r in group):.3f}-{max(r.score for r in group):.3f}"
+                    }
+                )
+                averaged_results.append(averaged_result)
+            # Sort by average score
+            averaged_results.sort(key=lambda x: x.score, reverse=True)
+            return averaged_results[:top_k]
+        except Exception as e:
+            logger.error(f"Error averaging results: {str(e)}")
+            return results[:top_k]
+    async def search_by_document(self, document_id: str, query: str, top_k: int = 5) -> List[SearchResult]:
+        """Search within a specific document"""
+        try:
+            filters = {"document_id": document_id}
+            return await self.search(query, top_k, filters)
+        except Exception as e:
+            logger.error(f"Error searching within document {document_id}: {str(e)}")
+            return []
+    async def search_by_category(self, category: str, query: str, top_k: int = 5) -> List[SearchResult]:
+        """Search within documents of a specific category"""
+        try:
+            if not self.document_store:
+                logger.warning("Document store not available for category search")
+                return await self.search(query, top_k)
+            # Get documents in the category
+            documents = await self.document_store.list_documents(
+                limit=1000,  # Adjust as needed
+                filters={"category": category}
+            )
+            if not documents:
+                logger.info(f"No documents found in category '{category}'")
+                return []
+            # Extract document IDs
+            document_ids = [doc.id for doc in documents]
+            # Search with document ID filter
+            filters = {"document_ids": document_ids}
+            return await self.search(query, top_k, filters)
+        except Exception as e:
+            logger.error(f"Error searching by category {category}: {str(e)}")
+            return []
+    async def search_with_date_range(self, query: str, start_date, end_date, top_k: int = 5) -> List[SearchResult]:
+        """Search documents within a date range"""
+        try:
+            if not self.document_store:
+                logger.warning("Document store not available for date range search")
+                return await self.search(query, top_k)
+            # Get documents in the date range
+            documents = await self.document_store.list_documents(
+                limit=1000,  # Adjust as needed
+                filters={
+                    "created_after": start_date,
+                    "created_before": end_date
+                }
+            )
+            if not documents:
+                logger.info(f"No documents found in date range")
+                return []
+            # Extract document IDs
+            document_ids = [doc.id for doc in documents]
+            # Search with document ID filter
+            filters = {"document_ids": document_ids}
+            return await self.search(query, top_k, filters)
+        except Exception as e:
+            logger.error(f"Error searching with date range: {str(e)}")
+            return []
+    async def get_search_suggestions(self, partial_query: str, limit: int = 5) -> List[str]:
+        """Get search suggestions based on partial query"""
+        try:
+            # This is a simple implementation
+            # In a production system, you might want to use a more sophisticated approach
+            if len(partial_query) < 2:
+                return []
+            # Search for the partial query
+            results = await self.search(partial_query, top_k=20)
+            # Extract potential query expansions from content
+            suggestions = set()
+            for result in results:
+                content_words = result.content.lower().split()
+                for i, word in enumerate(content_words):
+                    if partial_query.lower() in word:
+                        # Add the word itself
+                        suggestions.add(word.strip('.,!?;:'))
+                        # Add phrases that include this word
+                        if i > 0:
+                            phrase = f"{content_words[i-1]} {word}".strip('.,!?;:')
+                            suggestions.add(phrase)
+                        if i < len(content_words) - 1:
+                            phrase = f"{word} {content_words[i+1]}".strip('.,!?;:')
+                            suggestions.add(phrase)
+            # Filter and sort suggestions
+            filtered_suggestions = [
+                s for s in suggestions
+                if len(s) > len(partial_query) and s.startswith(partial_query.lower())
+            ]
+            return sorted(filtered_suggestions)[:limit]
+        except Exception as e:
+            logger.error(f"Error getting search suggestions: {str(e)}")
+            return []
+    async def explain_search(self, query: str, top_k: int = 3) -> Dict[str, Any]:
+        """Provide detailed explanation of search process and results"""
+        try:
+            explanation = {
+                "query": query,
+                "steps": [],
+                "results_analysis": {},
+                "performance_metrics": {}
+            }
+            # Step 1: Query processing
+            explanation["steps"].append({
+                "step": "query_processing",
+                "description": "Processing and normalizing the search query",
+                "details": {
+                    "original_query": query,
+                    "cleaned_query": query.strip(),
+                    "query_length": len(query)
+                }
+            })
+            # Step 2: Embedding generation
+            import time
+            start_time = time.time()
+            query_embedding = await self.embedding_service.generate_single_embedding(query)
+            embedding_time = time.time() - start_time
+            explanation["steps"].append({
+                "step": "embedding_generation",
+                "description": "Converting query to vector embedding",
+                "details": {
+                    "embedding_dimension": len(query_embedding) if query_embedding else 0,
+                    "generation_time_ms": round(embedding_time * 1000, 2)
+                }
+            })
+            # Step 3: Vector search
+            start_time = time.time()
+            results = await self.vector_store.search(query_embedding, top_k)
+            search_time = time.time() - start_time
+            explanation["steps"].append({
+                "step": "vector_search",
+                "description": "Searching vector database for similar content",
+                "details": {
+                    "search_time_ms": round(search_time * 1000, 2),
+                    "results_found": len(results),
+                    "top_score": results[0].score if results else 0,
+                    "score_range": f"{min(r.score for r in results):.3f}-{max(r.score for r in results):.3f}" if results else "N/A"
+                }
+            })
+            # Results analysis
+            if results:
+                explanation["results_analysis"] = {
+                    "total_results": len(results),
+                    "average_score": sum(r.score for r in results) / len(results),
+                    "unique_documents": len(set(r.document_id for r in results)),
+                    "content_lengths": [len(r.content) for r in results]
+                }
+            # Performance metrics
+            explanation["performance_metrics"] = {
+                "total_time_ms": round((embedding_time + search_time) * 1000, 2),
+                "embedding_time_ms": round(embedding_time * 1000, 2),
+                "search_time_ms": round(search_time * 1000, 2)
+            }
+            return explanation
+        except Exception as e:
+            logger.error(f"Error explaining search: {str(e)}")
+            return {"error": str(e)}

mcp_tools/utils.py ADDED Viewed

	@@ -0,0 +1,373 @@

+import logging
+import asyncio
+import functools
+from typing import Any, Callable, Dict, List, Optional
+import time
+import json
+from pathlib import Path
+logger = logging.getLogger(__name__)
+def async_timer(func: Callable) -> Callable:
+    """Decorator to time async function execution"""
+    @functools.wraps(func)
+    async def wrapper(*args, **kwargs):
+        start_time = time.time()
+        try:
+            result = await func(*args, **kwargs)
+            end_time = time.time()
+            logger.debug(f"{func.__name__} completed in {end_time - start_time:.3f}s")
+            return result
+        except Exception as e:
+            end_time = time.time()
+            logger.error(f"{func.__name__} failed after {end_time - start_time:.3f}s: {str(e)}")
+            raise
+    return wrapper
+def retry_async(max_attempts: int = 3, delay: float = 1.0, backoff: float = 2.0):
+    """Decorator to retry async functions with exponential backoff"""
+    def decorator(func: Callable) -> Callable:
+        @functools.wraps(func)
+        async def wrapper(*args, **kwargs):
+            attempt = 1
+            current_delay = delay
+            while attempt <= max_attempts:
+                try:
+                    return await func(*args, **kwargs)
+                except Exception as e:
+                    if attempt == max_attempts:
+                        logger.error(f"{func.__name__} failed after {max_attempts} attempts: {str(e)}")
+                        raise
+                    logger.warning(f"{func.__name__} attempt {attempt} failed: {str(e)}")
+                    logger.info(f"Retrying in {current_delay}s...")
+                    await asyncio.sleep(current_delay)
+                    attempt += 1
+                    current_delay *= backoff
+        return wrapper
+    return decorator
+class MCPToolResponse:
+    """Standardized response format for MCP tools"""
+    def __init__(self, success: bool, data: Any = None, error: str = None,
+                 metadata: Dict[str, Any] = None):
+        self.success = success
+        self.data = data
+        self.error = error
+        self.metadata = metadata or {}
+        self.timestamp = time.time()
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert response to dictionary"""
+        result = {
+            "success": self.success,
+            "timestamp": self.timestamp
+        }
+        if self.success:
+            result["data"] = self.data
+        else:
+            result["error"] = self.error
+        if self.metadata:
+            result["metadata"] = self.metadata
+        return result
+    @classmethod
+    def success_response(cls, data: Any, metadata: Dict[str, Any] = None):
+        """Create a success response"""
+        return cls(success=True, data=data, metadata=metadata)
+    @classmethod
+    def error_response(cls, error: str, metadata: Dict[str, Any] = None):
+        """Create an error response"""
+        return cls(success=False, error=error, metadata=metadata)
+def validate_required_params(params: Dict[str, Any], required: List[str]) -> Optional[str]:
+    """Validate that required parameters are present"""
+    missing = []
+    for param in required:
+        if param not in params or params[param] is None:
+            missing.append(param)
+    if missing:
+        return f"Missing required parameters: {', '.join(missing)}"
+    return None
+def sanitize_filename(filename: str) -> str:
+    """Sanitize filename for safe storage"""
+    import re
+    # Remove or replace invalid characters
+    filename = re.sub(r'[<>:"/\\|?*]', '_', filename)
+    # Remove leading/trailing dots and spaces
+    filename = filename.strip('. ')
+    # Limit length
+    if len(filename) > 255:
+        name, ext = Path(filename).stem, Path(filename).suffix
+        max_name_len = 255 - len(ext)
+        filename = name[:max_name_len] + ext
+    # Ensure not empty
+    if not filename:
+        filename = "unnamed_file"
+    return filename
+def truncate_text(text: str, max_length: int, add_ellipsis: bool = True) -> str:
+    """Truncate text to specified length"""
+    if len(text) <= max_length:
+        return text
+    if add_ellipsis and max_length > 3:
+        return text[:max_length - 3] + "..."
+    else:
+        return text[:max_length]
+def extract_file_info(file_path: str) -> Dict[str, Any]:
+    """Extract information about a file"""
+    try:
+        path = Path(file_path)
+        stat = path.stat()
+        return {
+            "filename": path.name,
+            "extension": path.suffix.lower(),
+            "size_bytes": stat.st_size,
+            "size_mb": round(stat.st_size / (1024 * 1024), 2),
+            "created_time": stat.st_ctime,
+            "modified_time": stat.st_mtime,
+            "exists": path.exists(),
+            "is_file": path.is_file(),
+            "is_dir": path.is_dir()
+        }
+    except Exception as e:
+        return {"error": str(e)}
+async def batch_process(items: List[Any], processor: Callable, batch_size: int = 10,
+                       max_concurrent: int = 5) -> List[Any]:
+    """Process items in batches with concurrency control"""
+    results = []
+    semaphore = asyncio.Semaphore(max_concurrent)
+    async def process_item(item):
+        async with semaphore:
+            return await processor(item)
+    # Process in batches
+    for i in range(0, len(items), batch_size):
+        batch = items[i:i + batch_size]
+        batch_tasks = [process_item(item) for item in batch]
+        batch_results = await asyncio.gather(*batch_tasks, return_exceptions=True)
+        results.extend(batch_results)
+    return results
+def format_file_size(size_bytes: int) -> str:
+    """Format file size in human-readable format"""
+    for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
+        if size_bytes < 1024.0:
+            return f"{size_bytes:.1f} {unit}"
+        size_bytes /= 1024.0
+    return f"{size_bytes:.1f} PB"
+def calculate_reading_time(text: str, words_per_minute: int = 200) -> int:
+    """Calculate estimated reading time in minutes"""
+    word_count = len(text.split())
+    return max(1, round(word_count / words_per_minute))
+class ProgressTracker:
+    """Track progress of long-running operations"""
+    def __init__(self, total_items: int, description: str = "Processing"):
+        self.total_items = total_items
+        self.completed_items = 0
+        self.description = description
+        self.start_time = time.time()
+        self.errors = []
+    def update(self, completed: int = 1, error: str = None):
+        """Update progress"""
+        self.completed_items += completed
+        if error:
+            self.errors.append(error)
+    def get_progress(self) -> Dict[str, Any]:
+        """Get current progress information"""
+        elapsed_time = time.time() - self.start_time
+        progress_percent = (self.completed_items / self.total_items) * 100 if self.total_items > 0 else 0
+        # Estimate remaining time
+        if self.completed_items > 0:
+            avg_time_per_item = elapsed_time / self.completed_items
+            remaining_items = self.total_items - self.completed_items
+            estimated_remaining_time = avg_time_per_item * remaining_items
+        else:
+            estimated_remaining_time = 0
+        return {
+            "description": self.description,
+            "total_items": self.total_items,
+            "completed_items": self.completed_items,
+            "progress_percent": round(progress_percent, 1),
+            "elapsed_time_seconds": round(elapsed_time, 1),
+            "estimated_remaining_seconds": round(estimated_remaining_time, 1),
+            "errors_count": len(self.errors),
+            "errors": self.errors[-5:] if self.errors else []  # Last 5 errors
+        }
+    def is_complete(self) -> bool:
+        """Check if processing is complete"""
+        return self.completed_items >= self.total_items
+def load_json_config(config_path: str, default_config: Dict[str, Any] = None) -> Dict[str, Any]:
+    """Load configuration from JSON file with fallback to defaults"""
+    try:
+        with open(config_path, 'r') as f:
+            config = json.load(f)
+        logger.info(f"Loaded configuration from {config_path}")
+        return config
+    except FileNotFoundError:
+        logger.warning(f"Configuration file {config_path} not found, using defaults")
+        return default_config or {}
+    except json.JSONDecodeError as e:
+        logger.error(f"Invalid JSON in configuration file {config_path}: {str(e)}")
+        return default_config or {}
+def save_json_config(config: Dict[str, Any], config_path: str) -> bool:
+    """Save configuration to JSON file"""
+    try:
+        # Create directory if it doesn't exist
+        Path(config_path).parent.mkdir(parents=True, exist_ok=True)
+        with open(config_path, 'w') as f:
+            json.dump(config, f, indent=2)
+        logger.info(f"Saved configuration to {config_path}")
+        return True
+    except Exception as e:
+        logger.error(f"Failed to save configuration to {config_path}: {str(e)}")
+        return False
+class RateLimiter:
+    """Simple rate limiter for API calls"""
+    def __init__(self, max_calls: int, time_window: float):
+        self.max_calls = max_calls
+        self.time_window = time_window
+        self.calls = []
+    async def acquire(self):
+        """Acquire permission to make a call"""
+        now = time.time()
+        # Remove old calls outside the time window
+        self.calls = [call_time for call_time in self.calls if now - call_time < self.time_window]
+        # Check if we can make a new call
+        if len(self.calls) >= self.max_calls:
+            # Wait until we can make a call
+            oldest_call = min(self.calls)
+            wait_time = self.time_window - (now - oldest_call)
+            if wait_time > 0:
+                await asyncio.sleep(wait_time)
+                return await self.acquire()  # Recursive call after waiting
+        # Record this call
+        self.calls.append(now)
+def escape_markdown(text: str) -> str:
+    """Escape markdown special characters"""
+    import re
+    # Characters that need escaping in markdown
+    markdown_chars = r'([*_`\[\]()#+\-!\\])'
+    return re.sub(markdown_chars, r'\\\1', text)
+def create_error_summary(errors: List[Exception]) -> str:
+    """Create a summary of multiple errors"""
+    if not errors:
+        return "No errors"
+    error_counts = {}
+    for error in errors:
+        error_type = type(error).__name__
+        error_counts[error_type] = error_counts.get(error_type, 0) + 1
+    summary_parts = []
+    for error_type, count in error_counts.items():
+        if count == 1:
+            summary_parts.append(f"1 {error_type}")
+        else:
+            summary_parts.append(f"{count} {error_type}s")
+    return f"Encountered {len(errors)} total errors: " + ", ".join(summary_parts)
+async def safe_execute(func: Callable, *args, default_return=None, **kwargs):
+    """Safely execute a function and return default on error"""
+    try:
+        if asyncio.iscoroutinefunction(func):
+            return await func(*args, **kwargs)
+        else:
+            return func(*args, **kwargs)
+    except Exception as e:
+        logger.error(f"Error executing {func.__name__}: {str(e)}")
+        return default_return
+def get_content_preview(content: str, max_length: int = 200) -> str:
+    """Get a preview of content for display"""
+    if not content:
+        return "No content"
+    # Clean up whitespace
+    content = ' '.join(content.split())
+    if len(content) <= max_length:
+        return content
+    # Try to break at sentence boundary
+    preview = content[:max_length]
+    last_sentence_end = max(preview.rfind('.'), preview.rfind('!'), preview.rfind('?'))
+    if last_sentence_end > max_length * 0.7:  # If we found a good breaking point
+        return preview[:last_sentence_end + 1]
+    else:
+        # Break at word boundary
+        last_space = preview.rfind(' ')
+        if last_space > max_length * 0.7:
+            return preview[:last_space] + "..."
+        else:
+            return preview + "..."
+class MemoryUsageTracker:
+    """Track memory usage of operations"""
+    def __init__(self):
+        self.start_memory = self._get_memory_usage()
+    def _get_memory_usage(self) -> float:
+        """Get current memory usage in MB"""
+        try:
+            import psutil
+            process = psutil.Process()
+            return process.memory_info().rss / 1024 / 1024  # Convert to MB
+        except ImportError:
+            return 0.0
+    def get_usage_delta(self) -> float:
+        """Get memory usage change since initialization"""
+        current_memory = self._get_memory_usage()
+        return current_memory - self.start_memory
+    def log_usage(self, operation_name: str):
+        """Log current memory usage for an operation"""
+        delta = self.get_usage_delta()
+        logger.info(f"{operation_name} memory delta: {delta:.1f} MB")

mcp_tools/voice_tool.py ADDED Viewed

	@@ -0,0 +1,63 @@

+import logging
+from typing import Dict, Any, Optional
+import asyncio
+logger = logging.getLogger(__name__)
+class VoiceTool:
+    """
+    MCP Tool for voice-based Q&A using ElevenLabs conversational AI
+    """
+    def __init__(self, elevenlabs_service):
+        """
+        Initialize Voice Tool
+        Args:
+            elevenlabs_service: ElevenLabs service instance
+        """
+        self.elevenlabs_service = elevenlabs_service
+    async def voice_qa(
+        self,
+        question: str,
+        session_id: Optional[str] = None
+    ) -> Dict[str, Any]:
+        """
+        MCP Tool: Ask a question using voice assistant
+        Args:
+            question: User's question (text or transcribed from voice)
+            session_id: Optional session ID for conversation context
+        Returns:
+            Dictionary with answer, audio URL (if applicable), and sources
+        """
+        try:
+            if not self.elevenlabs_service or not self.elevenlabs_service.is_available():
+                return {
+                    "success": False,
+                    "error": "Voice assistant not configured. Please set ELEVENLABS_API_KEY and ELEVENLABS_AGENT_ID"
+                }
+            logger.info(f"Voice QA: {question}")
+            # For text-based queries, we can use the RAG tool directly
+            # This provides the backend for voice queries
+            result = await self.elevenlabs_service.llamaindex_service.query(question)
+            return {
+                "success": True,
+                "question": question,
+                "answer": result,
+                "session_id": session_id,
+                "mode": "text"  # Could be "voice" if audio processing is involved
+            }
+        except Exception as e:
+            logger.error(f"Voice QA failed: {str(e)}")
+            return {
+                "success": False,
+                "error": str(e),
+                "question": question
+            }

requirements.txt ADDED Viewed

	@@ -0,0 +1,31 @@

+gradio[mcp]
+anthropic>=0.7.0
+mistralai
+sentence-transformers>=2.2.2
+transformers>=4.30.0
+torch>=2.0.0
+faiss-cpu>=1.7.4
+numpy>=1.24.0
+pandas>=2.0.0
+PyPDF2>=3.0.0
+python-docx>=0.8.11
+Pillow>=10.0.0
+pytesseract>=0.3.10
+aiofiles>=23.0.0
+pydantic>=2.0.0
+httpx>=0.24.0
+uvicorn[standard]
+python-multipart>=0.0.6
+asyncio-mqtt>=0.11.1
+nest-asyncio>=1.5.6
+fastapi
+fastmcp
+mcp
+openai
+python-dotenv
+llama-index
+llama-index-llms-openai
+llama-index-llms-anthropic
+llama-index-embeddings-huggingface
+elevenlabs>=1.0.0
+websockets>=12.0

services/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # Services module initialization

services/__pycache__/__init__.cpython-313.pyc ADDED Viewed

Binary file (149 Bytes). View file

services/__pycache__/document_store_service.cpython-313.pyc ADDED Viewed

Binary file (17.2 kB). View file

services/__pycache__/elevenlabs_service.cpython-313.pyc ADDED Viewed

Binary file (12.8 kB). View file

services/__pycache__/embedding_service.cpython-313.pyc ADDED Viewed

Binary file (13.4 kB). View file

services/__pycache__/llamaindex_service.cpython-313.pyc ADDED Viewed

Binary file (11 kB). View file

services/__pycache__/llm_service.cpython-313.pyc ADDED Viewed

Binary file (28.2 kB). View file

services/__pycache__/ocr_service.cpython-313.pyc ADDED Viewed

Binary file (19.9 kB). View file

services/__pycache__/podcast_generator_service.cpython-313.pyc ADDED Viewed

Binary file (28.3 kB). View file

services/__pycache__/vector_store_service.cpython-313.pyc ADDED Viewed

Binary file (15.3 kB). View file

services/document_store_service.py ADDED Viewed

	@@ -0,0 +1,349 @@

+import logging
+import json
+import os
+from typing import List, Dict, Any, Optional
+from pathlib import Path
+import pickle
+from datetime import datetime
+import asyncio
+from core.models import Document, DocumentType
+import config
+logger = logging.getLogger(__name__)
+class DocumentStoreService:
+    def __init__(self):
+        self.config = config.config
+        self.store_path = Path(self.config.DOCUMENT_STORE_PATH)
+        self.store_path.mkdir(parents=True, exist_ok=True)
+        # Separate paths for metadata and content
+        self.metadata_path = self.store_path / "metadata"
+        self.content_path = self.store_path / "content"
+        self.metadata_path.mkdir(exist_ok=True)
+        self.content_path.mkdir(exist_ok=True)
+        # In-memory cache for frequently accessed documents
+        self._cache = {}
+        self._cache_size_limit = 100
+    async def store_document(self, document: Document) -> bool:
+        """Store a document and its metadata"""
+        try:
+            # Store metadata
+            metadata_file = self.metadata_path / f"{document.id}.json"
+            metadata = {
+                "id": document.id,
+                "filename": document.filename,
+                "doc_type": document.doc_type.value,
+                "file_size": document.file_size,
+                "created_at": document.created_at.isoformat(),
+                "metadata": document.metadata,
+                "tags": document.tags,
+                "summary": document.summary,
+                "category": document.category,
+                "language": document.language,
+                "content_length": len(document.content)
+            }
+            with open(metadata_file, 'w', encoding='utf-8') as f:
+                json.dump(metadata, f, indent=2, ensure_ascii=False)
+            # Store content separately (can be large)
+            content_file = self.content_path / f"{document.id}.txt"
+            with open(content_file, 'w', encoding='utf-8') as f:
+                f.write(document.content)
+            # Cache the document
+            self._add_to_cache(document.id, document)
+            logger.info(f"Stored document {document.id} ({document.filename})")
+            return True
+        except Exception as e:
+            logger.error(f"Error storing document {document.id}: {str(e)}")
+            return False
+    async def get_document(self, document_id: str) -> Optional[Document]:
+        """Retrieve a document by ID"""
+        try:
+            # Check cache first
+            if document_id in self._cache:
+                return self._cache[document_id]
+            # Load from disk
+            metadata_file = self.metadata_path / f"{document_id}.json"
+            content_file = self.content_path / f"{document_id}.txt"
+            if not metadata_file.exists() or not content_file.exists():
+                return None
+            # Load metadata
+            with open(metadata_file, 'r', encoding='utf-8') as f:
+                metadata = json.load(f)
+            # Load content
+            with open(content_file, 'r', encoding='utf-8') as f:
+                content = f.read()
+            # Create document object
+            document = Document(
+                id=metadata["id"],
+                filename=metadata["filename"],
+                content=content,
+                doc_type=DocumentType(metadata["doc_type"]),
+                file_size=metadata["file_size"],
+                created_at=datetime.fromisoformat(metadata["created_at"]),
+                metadata=metadata.get("metadata", {}),
+                tags=metadata.get("tags", []),
+                summary=metadata.get("summary"),
+                category=metadata.get("category"),
+                language=metadata.get("language")
+            )
+            # Add to cache
+            self._add_to_cache(document_id, document)
+            return document
+        except Exception as e:
+            logger.error(f"Error retrieving document {document_id}: {str(e)}")
+            return None
+    async def list_documents(self, limit: int = 50, offset: int = 0,
+                           filters: Optional[Dict[str, Any]] = None) -> List[Document]:
+        """List documents with pagination and filtering"""
+        try:
+            documents = []
+            metadata_files = list(self.metadata_path.glob("*.json"))
+            # Sort by creation time (newest first)
+            metadata_files.sort(key=lambda x: x.stat().st_mtime, reverse=True)
+            # Apply pagination
+            start_idx = offset
+            end_idx = offset + limit
+            for metadata_file in metadata_files[start_idx:end_idx]:
+                try:
+                    with open(metadata_file, 'r', encoding='utf-8') as f:
+                        metadata = json.load(f)
+                    # Apply filters
+                    if filters and not self._apply_filters(metadata, filters):
+                        continue
+                    # Load content if needed (for small documents)
+                    content_file = self.content_path / f"{metadata['id']}.txt"
+                    if content_file.exists():
+                        with open(content_file, 'r', encoding='utf-8') as f:
+                            content = f.read()
+                    else:
+                        content = ""
+                    document = Document(
+                        id=metadata["id"],
+                        filename=metadata["filename"],
+                        content=content,
+                        doc_type=DocumentType(metadata["doc_type"]),
+                        file_size=metadata["file_size"],
+                        created_at=datetime.fromisoformat(metadata["created_at"]),
+                        metadata=metadata.get("metadata", {}),
+                        tags=metadata.get("tags", []),
+                        summary=metadata.get("summary"),
+                        category=metadata.get("category"),
+                        language=metadata.get("language")
+                    )
+                    documents.append(document)
+                except Exception as e:
+                    logger.warning(f"Error loading document metadata from {metadata_file}: {str(e)}")
+                    continue
+            return documents
+        except Exception as e:
+            logger.error(f"Error listing documents: {str(e)}")
+            return []
+    def _apply_filters(self, metadata: Dict[str, Any], filters: Dict[str, Any]) -> bool:
+        """Apply filters to document metadata"""
+        try:
+            for key, value in filters.items():
+                if key == "doc_type":
+                    if metadata.get("doc_type") != value:
+                        return False
+                elif key == "filename_contains":
+                    if value.lower() not in metadata.get("filename", "").lower():
+                        return False
+                elif key == "created_after":
+                    doc_date = datetime.fromisoformat(metadata.get("created_at", ""))
+                    if doc_date < value:
+                        return False
+                elif key == "created_before":
+                    doc_date = datetime.fromisoformat(metadata.get("created_at", ""))
+                    if doc_date > value:
+                        return False
+                elif key == "tags":
+                    doc_tags = set(metadata.get("tags", []))
+                    required_tags = set(value) if isinstance(value, list) else {value}
+                    if not required_tags.intersection(doc_tags):
+                        return False
+                elif key == "category":
+                    if metadata.get("category") != value:
+                        return False
+                elif key == "language":
+                    if metadata.get("language") != value:
+                        return False
+            return True
+        except Exception as e:
+            logger.error(f"Error applying filters: {str(e)}")
+            return True
+    async def update_document_metadata(self, document_id: str, updates: Dict[str, Any]) -> bool:
+        """Update document metadata"""
+        try:
+            metadata_file = self.metadata_path / f"{document_id}.json"
+            if not metadata_file.exists():
+                logger.warning(f"Document {document_id} not found")
+                return False
+            # Load existing metadata
+            with open(metadata_file, 'r', encoding='utf-8') as f:
+                metadata = json.load(f)
+            # Apply updates
+            for key, value in updates.items():
+                if key in ["tags", "summary", "category", "language", "metadata"]:
+                    metadata[key] = value
+            # Save updated metadata
+            with open(metadata_file, 'w', encoding='utf-8') as f:
+                json.dump(metadata, f, indent=2, ensure_ascii=False)
+            # Update cache if document is cached
+            if document_id in self._cache:
+                document = self._cache[document_id]
+                for key, value in updates.items():
+                    if hasattr(document, key):
+                        setattr(document, key, value)
+            logger.info(f"Updated metadata for document {document_id}")
+            return True
+        except Exception as e:
+            logger.error(f"Error updating document metadata: {str(e)}")
+            return False
+    async def delete_document(self, document_id: str) -> bool:
+        """Delete a document and its metadata"""
+        try:
+            metadata_file = self.metadata_path / f"{document_id}.json"
+            content_file = self.content_path / f"{document_id}.txt"
+            # Remove files
+            if metadata_file.exists():
+                metadata_file.unlink()
+            if content_file.exists():
+                content_file.unlink()
+            # Remove from cache
+            if document_id in self._cache:
+                del self._cache[document_id]
+            logger.info(f"Deleted document {document_id}")
+            return True
+        except Exception as e:
+            logger.error(f"Error deleting document {document_id}: {str(e)}")
+            return False
+    async def search_documents(self, query: str, fields: List[str] = None) -> List[Document]:
+        """Simple text search across documents"""
+        if not fields:
+            fields = ["filename", "content", "tags", "summary"]
+        try:
+            matching_documents = []
+            query_lower = query.lower()
+            # Get all documents
+            all_documents = await self.list_documents(limit=1000)  # Adjust limit as needed
+            for document in all_documents:
+                match_found = False
+                for field in fields:
+                    field_value = getattr(document, field, "")
+                    if isinstance(field_value, list):
+                        field_value = " ".join(field_value)
+                    elif field_value is None:
+                        field_value = ""
+                    if query_lower in str(field_value).lower():
+                        match_found = True
+                        break
+                if match_found:
+                    matching_documents.append(document)
+            logger.info(f"Found {len(matching_documents)} documents matching '{query}'")
+            return matching_documents
+        except Exception as e:
+            logger.error(f"Error searching documents: {str(e)}")
+            return []
+    def _add_to_cache(self, document_id: str, document: Document):
+        """Add document to cache with size limit"""
+        try:
+            # Remove oldest items if cache is full
+            if len(self._cache) >= self._cache_size_limit:
+                # Remove first item (FIFO)
+                oldest_key = next(iter(self._cache))
+                del self._cache[oldest_key]
+            self._cache[document_id] = document
+        except Exception as e:
+            logger.error(f"Error adding to cache: {str(e)}")
+    async def get_stats(self) -> Dict[str, Any]:
+        """Get statistics about the document store"""
+        try:
+            metadata_files = list(self.metadata_path.glob("*.json"))
+            content_files = list(self.content_path.glob("*.txt"))
+            # Calculate total storage size
+            total_size = 0
+            for file_path in metadata_files + content_files:
+                total_size += file_path.stat().st_size
+            # Count by document type
+            type_counts = {}
+            for metadata_file in metadata_files:
+                try:
+                    with open(metadata_file, 'r') as f:
+                        metadata = json.load(f)
+                    doc_type = metadata.get("doc_type", "unknown")
+                    type_counts[doc_type] = type_counts.get(doc_type, 0) + 1
+                except:
+                    continue
+            return {
+                "total_documents": len(metadata_files),
+                "total_size_bytes": total_size,
+                "total_size_mb": round(total_size / (1024 * 1024), 2),
+                "cache_size": len(self._cache),
+                "document_types": type_counts,
+                "storage_path": str(self.store_path),
+                "metadata_files": len(metadata_files),
+                "content_files": len(content_files)
+            }
+        except Exception as e:
+            logger.error(f"Error getting document store stats: {str(e)}")
+            return {"error": str(e)}

services/elevenlabs_service.py ADDED Viewed

	@@ -0,0 +1,341 @@

+import logging
+import asyncio
+from typing import Optional, Dict, Any, List
+import json
+try:
+    from elevenlabs.client import ElevenLabs
+    from elevenlabs.conversational_ai.conversation import Conversation, ClientTools
+    from elevenlabs.conversational_ai.default_audio_interface import DefaultAudioInterface
+    ELEVENLABS_AVAILABLE = True
+except ImportError:
+    ELEVENLABS_AVAILABLE = False
+    logger = logging.getLogger(__name__)
+    logger.warning("ElevenLabs SDK not available. Voice features will be disabled.")
+import config
+from services.llamaindex_service import LlamaIndexService
+logger = logging.getLogger(__name__)
+class ElevenLabsService:
+    """
+    Service for integrating ElevenLabs Conversational AI with RAG capabilities.
+    Provides voice-based interaction with the document library.
+    """
+    def __init__(self, llamaindex_service: LlamaIndexService):
+        """
+        Initialize ElevenLabs service with RAG integration
+        Args:
+            llamaindex_service: LlamaIndex service for document queries
+        """
+        self.config = config.config
+        self.llamaindex_service = llamaindex_service
+        self.client = None
+        self.client_tools = None
+        self.active_conversations: Dict[str, Conversation] = {}
+        if not ELEVENLABS_AVAILABLE:
+            logger.error("ElevenLabs SDK not installed. Run: pip install elevenlabs")
+            return
+        if not self.config.ELEVENLABS_API_KEY:
+            logger.warning("ELEVENLABS_API_KEY not configured. Voice features will be limited.")
+            return
+        try:
+            # Initialize ElevenLabs client
+            self.client = ElevenLabs(api_key=self.config.ELEVENLABS_API_KEY)
+            logger.info("ElevenLabs client initialized successfully")
+            # Initialize client tools for custom tool registration
+            self.client_tools = ClientTools()
+            # Register RAG tool
+            self._register_rag_tool()
+            logger.info("ElevenLabs service initialized with RAG tool")
+        except Exception as e:
+            logger.error(f"Error initializing ElevenLabs service: {str(e)}")
+    def _register_rag_tool(self):
+        """Register RAG query tool with ElevenLabs agent"""
+        if not self.client_tools:
+            return
+        try:
+            # Register the query_documents tool
+            # Modern ElevenLabs SDK: register(tool_name, handler=callable)
+            self.client_tools.register("query_documents", handler=self._rag_query_tool)
+            logger.info("RAG tool 'query_documents' registered successfully")
+        except Exception as e:
+            logger.error(f"Error registering RAG tool: {str(e)}")
+    async def _rag_query_tool(self, params: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Custom tool for querying documents using LlamaIndex agentic RAG
+        Args:
+            params: Dictionary containing the query
+                - query (str): The user's question or search query
+        Returns:
+            Dictionary with answer and metadata
+        """
+        try:
+            query = params.get("query", "")
+            if not query:
+                return {
+                    "error": "No query provided",
+                    "answer": "I didn't receive a question to search for."
+                }
+            logger.info(f"RAG tool called with query: '{query}'")
+            # Query the LlamaIndex agentic RAG system
+            try:
+                result = await asyncio.wait_for(
+                    self.llamaindex_service.query(query),
+                    timeout=self.config.CONVERSATION_TIMEOUT
+                )
+                logger.info(f"RAG query successful")
+                return {
+                    "answer": result,
+                    "source": "document_library",
+                    "confidence": "high"
+                }
+            except asyncio.TimeoutError:
+                logger.error("RAG query timeout")
+                return {
+                    "error": "timeout",
+                    "answer": "The search took too long. Please try a simpler question."
+                }
+        except Exception as e:
+            logger.error(f"Error in RAG query tool: {str(e)}")
+            return {
+                "error": str(e),
+                "answer": f"I encountered an error searching the documents: {str(e)}"
+            }
+    def create_conversation(
+        self,
+        agent_id: Optional[str] = None,
+        session_id: Optional[str] = None
+    ) -> Optional[Conversation]:
+        """
+        Create a new conversation session
+        Args:
+            agent_id: ElevenLabs agent ID (uses config default if not provided)
+            session_id: Optional session ID for tracking
+        Returns:
+            Conversation object or None if initialization fails
+        """
+        if not self.client:
+            logger.error("ElevenLabs client not initialized")
+            return None
+        try:
+            agent_id = agent_id or self.config.ELEVENLABS_AGENT_ID
+            if not agent_id:
+                logger.error("No agent ID provided or configured")
+                return None
+            # Create audio interface for real-time audio
+            audio_interface = DefaultAudioInterface()
+            # Create conversation with RAG tool
+            conversation = Conversation(
+                client=self.client,
+                agent_id=agent_id,
+                requires_auth=True,
+                audio_interface=audio_interface,
+                client_tools=self.client_tools
+            )
+            # Store conversation if session ID provided
+            if session_id:
+                self.active_conversations[session_id] = conversation
+            logger.info(f"Created conversation for agent: {agent_id}")
+            return conversation
+        except Exception as e:
+            logger.error(f"Error creating conversation: {str(e)}")
+            return None
+    async def start_conversation(self, session_id: Optional[str] = None) -> Dict[str, Any]:
+        """
+        Start a new conversation session (async wrapper for UI)
+        Args:
+            session_id: Optional session ID for tracking
+        Returns:
+            Dictionary with success status and conversation info
+        """
+        try:
+            conversation = self.create_conversation(session_id=session_id)
+            if conversation:
+                return {
+                    "success": True,
+                    "session_id": session_id,
+                    "message": "Conversation started successfully"
+                }
+            else:
+                return {
+                    "success": False,
+                    "error": "Failed to create conversation"
+                }
+        except Exception as e:
+            logger.error(f"Error starting conversation: {str(e)}")
+            return {
+                "success": False,
+                "error": str(e)
+            }
+    async def process_voice_query(
+        self,
+        audio_file_path: str,
+        agent_id: Optional[str] = None
+    ) -> Dict[str, Any]:
+        """
+        Process a voice query file and return response
+        Args:
+            audio_file_path: Path to audio file
+            agent_id: Optional agent ID
+        Returns:
+            Dictionary with transcription, answer, and metadata
+        """
+        try:
+            # For now, this is a placeholder for file-based processing
+            # ElevenLabs Conversational AI is primarily WebSocket-based
+            # This would be used for async/batch processing
+            logger.info(f"Processing voice query from: {audio_file_path}")
+            # This would require additional implementation for file upload
+            # and processing through ElevenLabs API
+            return {
+                "status": "pending",
+                "message": "Voice query processing requires WebSocket connection",
+                "file": audio_file_path
+            }
+        except Exception as e:
+            logger.error(f"Error processing voice query: {str(e)}")
+            return {
+                "status": "error",
+                "error": str(e)
+            }
+    async def end_conversation(self, session_id: str) -> bool:
+        """
+        End an active conversation session
+        Args:
+            session_id: Session identifier
+        Returns:
+            True if conversation ended successfully
+        """
+        try:
+            if session_id in self.active_conversations:
+                conversation = self.active_conversations[session_id]
+                # Try to end the session gracefully
+                try:
+                    conversation.end_session()
+                except AttributeError as ae:
+                    # Handle cases where DefaultAudioInterface doesn't have expected methods
+                    logger.warning(f"Could not cleanly end session: {str(ae)}")
+                except Exception as e:
+                    logger.warning(f"Error during session cleanup: {str(e)}")
+                # Always remove from active conversations
+                del self.active_conversations[session_id]
+                logger.info(f"Ended conversation: {session_id}")
+                return True
+            return False
+        except Exception as e:
+            logger.error(f"Error ending conversation: {str(e)}")
+            return False
+    def get_available_voices(self) -> List[Dict[str, str]]:
+        """
+        Get list of available voice models
+        Returns:
+            List of voice model information
+        """
+        try:
+            if not self.client:
+                return []
+            # Get voices from ElevenLabs API
+            voices = self.client.voices.get_all()
+            return [
+                {
+                    "voice_id": voice.voice_id,
+                    "name": voice.name,
+                    "category": voice.category if hasattr(voice, 'category') else "general"
+                }
+                for voice in voices.voices
+            ]
+        except Exception as e:
+            logger.error(f"Error getting voices: {str(e)}")
+            return []
+    def is_available(self) -> bool:
+        """Check if ElevenLabs service is available and configured"""
+        return ELEVENLABS_AVAILABLE and self.client is not None
+    async def test_connection(self) -> Dict[str, Any]:
+        """
+        Test ElevenLabs API connection
+        Returns:
+            Dictionary with test results
+        """
+        try:
+            if not self.client:
+                return {
+                    "status": "error",
+                    "message": "Client not initialized"
+                }
+            # Try to fetch user info or voices as a connection test
+            voices = self.get_available_voices()
+            return {
+                "status": "success",
+                "message": "ElevenLabs API connected",
+                "voices_available": len(voices),
+                "rag_tool_registered": self.client_tools is not None
+            }
+        except Exception as e:
+            logger.error(f"Connection test failed: {str(e)}")
+            return {
+                "status": "error",
+                "message": str(e)
+            }

services/embedding_service.py ADDED Viewed

	@@ -0,0 +1,243 @@

+import logging
+import asyncio
+from typing import List, Optional, Dict, Any
+import numpy as np
+from sentence_transformers import SentenceTransformer
+import torch
+import openai
+import config
+logger = logging.getLogger(__name__)
+class EmbeddingService:
+    def __init__(self):
+        self.config = config.config
+        self.model_name = self.config.EMBEDDING_MODEL
+        self.model = None
+        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
+        self.openai_client = None
+        self.is_openai_model = False
+        # Initialize OpenAI client if needed
+        if self.config.OPENAI_API_KEY:
+            self.openai_client = openai.OpenAI(api_key=self.config.OPENAI_API_KEY)
+        # Load model lazily
+        self._load_model()
+    def _load_model(self):
+        """Load the embedding model"""
+        try:
+            logger.info(f"Loading embedding model: {self.model_name}")
+            if self.model_name.startswith("text-embedding-"):
+                if not self.openai_client:
+                    logger.warning(f"OpenAI model {self.model_name} requested but OPENAI_API_KEY not found. Falling back to local model.")
+                    self.model_name = "sentence-transformers/all-MiniLM-L6-v2"
+                    self.is_openai_model = False
+                    self.model = SentenceTransformer(self.model_name, device=self.device)
+                else:
+                    self.is_openai_model = True
+                    logger.info(f"Using OpenAI embedding model: {self.model_name}")
+            else:
+                self.is_openai_model = False
+                self.model = SentenceTransformer(self.model_name, device=self.device)
+                logger.info(f"Local embedding model loaded successfully on {self.device}")
+        except Exception as e:
+            logger.error(f"Failed to load embedding model: {str(e)}")
+            # Fallback to a smaller model
+            try:
+                self.model_name = "all-MiniLM-L6-v2"
+                self.is_openai_model = False
+                self.model = SentenceTransformer(self.model_name, device=self.device)
+                logger.info(f"Loaded fallback embedding model: {self.model_name}")
+            except Exception as fallback_error:
+                logger.error(f"Failed to load fallback model: {str(fallback_error)}")
+                raise
+    async def generate_embeddings(self, texts: List[str], batch_size: int = 32) -> List[List[float]]:
+        """Generate embeddings for a list of texts"""
+        if not texts:
+            return []
+        if not self.is_openai_model and self.model is None:
+            raise RuntimeError("Embedding model not loaded")
+        try:
+            # Filter out empty texts
+            non_empty_texts = [text for text in texts if text and text.strip()]
+            if not non_empty_texts:
+                logger.warning("No non-empty texts provided for embedding")
+                return []
+            logger.info(f"Generating embeddings for {len(non_empty_texts)} texts using {self.model_name}")
+            # Process in batches to manage memory/API limits
+            all_embeddings = []
+            for i in range(0, len(non_empty_texts), batch_size):
+                batch = non_empty_texts[i:i + batch_size]
+                # Run embedding generation in thread pool to avoid blocking
+                loop = asyncio.get_event_loop()
+                batch_embeddings = await loop.run_in_executor(
+                    None,
+                    self._generate_batch_embeddings,
+                    batch
+                )
+                all_embeddings.extend(batch_embeddings)
+            logger.info(f"Generated {len(all_embeddings)} embeddings")
+            return all_embeddings
+        except Exception as e:
+            logger.error(f"Error generating embeddings: {str(e)}")
+            raise
+    def _generate_batch_embeddings(self, texts: List[str]) -> List[List[float]]:
+        """Generate embeddings for a batch of texts (synchronous)"""
+        try:
+            if self.is_openai_model:
+                # OpenAI Embeddings
+                response = self.openai_client.embeddings.create(
+                    input=texts,
+                    model=self.model_name
+                )
+                return [data.embedding for data in response.data]
+            else:
+                # Local SentenceTransformer
+                embeddings = self.model.encode(
+                    texts,
+                    convert_to_numpy=True,
+                    normalize_embeddings=True,
+                    batch_size=len(texts)
+                )
+                return embeddings.tolist()
+        except Exception as e:
+            logger.error(f"Error in batch embedding generation: {str(e)}")
+            raise
+    async def generate_single_embedding(self, text: str) -> Optional[List[float]]:
+        """Generate embedding for a single text"""
+        if not text or not text.strip():
+            return None
+        try:
+            embeddings = await self.generate_embeddings([text])
+            return embeddings[0] if embeddings else None
+        except Exception as e:
+            logger.error(f"Error generating single embedding: {str(e)}")
+            return None
+    def get_embedding_dimension(self) -> int:
+        """Get the dimension of embeddings produced by the model"""
+        if self.is_openai_model:
+            if "small" in self.model_name:
+                return 1536
+            elif "large" in self.model_name:
+                return 3072
+            elif "ada" in self.model_name:
+                return 1536
+            else:
+                # Default fallback or make a call to check?
+                # For now assume 1536 as it's standard for recent OpenAI models
+                return 1536
+        if self.model is None:
+            raise RuntimeError("Embedding model not loaded")
+        return self.model.get_sentence_embedding_dimension()
+    def compute_similarity(self, embedding1: List[float], embedding2: List[float]) -> float:
+        """Compute cosine similarity between two embeddings"""
+        try:
+            # Convert to numpy arrays
+            emb1 = np.array(embedding1)
+            emb2 = np.array(embedding2)
+            # Compute cosine similarity
+            similarity = np.dot(emb1, emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2))
+            return float(similarity)
+        except Exception as e:
+            logger.error(f"Error computing similarity: {str(e)}")
+            return 0.0
+    def compute_similarities(self, query_embedding: List[float], embeddings: List[List[float]]) -> List[float]:
+        """Compute similarities between a query embedding and multiple embeddings"""
+        try:
+            query_emb = np.array(query_embedding)
+            emb_matrix = np.array(embeddings)
+            # Compute cosine similarities
+            similarities = np.dot(emb_matrix, query_emb) / (
+                np.linalg.norm(emb_matrix, axis=1) * np.linalg.norm(query_emb)
+            )
+            return similarities.tolist()
+        except Exception as e:
+            logger.error(f"Error computing similarities: {str(e)}")
+            return [0.0] * len(embeddings)
+    async def embed_chunks(self, chunks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """Embed a list of chunks and add embeddings to them"""
+        if not chunks:
+            return []
+        try:
+            # Extract texts
+            texts = [chunk.get('content', '') for chunk in chunks]
+            # Generate embeddings
+            embeddings = await self.generate_embeddings(texts)
+            # Add embeddings to chunks
+            embedded_chunks = []
+            for i, chunk in enumerate(chunks):
+                if i < len(embeddings):
+                    chunk_copy = chunk.copy()
+                    chunk_copy['embedding'] = embeddings[i]
+                    embedded_chunks.append(chunk_copy)
+                else:
+                    logger.warning(f"No embedding generated for chunk {i}")
+                    embedded_chunks.append(chunk)
+            return embedded_chunks
+        except Exception as e:
+            logger.error(f"Error embedding chunks: {str(e)}")
+            raise
+    def validate_embedding(self, embedding: List[float]) -> bool:
+        """Validate that an embedding is properly formatted"""
+        try:
+            if not embedding:
+                return False
+            if not isinstance(embedding, list):
+                return False
+            if len(embedding) != self.get_embedding_dimension():
+                return False
+            # Check for NaN or infinite values
+            emb_array = np.array(embedding)
+            if np.isnan(emb_array).any() or np.isinf(emb_array).any():
+                return False
+            return True
+        except Exception:
+            return False
+    async def get_model_info(self) -> Dict[str, Any]:
+        """Get information about the loaded model"""
+        try:
+            return {
+                "model_name": self.model_name,
+                "device": "openai-api" if self.is_openai_model else self.device,
+                "embedding_dimension": self.get_embedding_dimension(),
+                "max_sequence_length": "8191" if self.is_openai_model else getattr(self.model, 'max_seq_length', 'unknown'),
+                "model_loaded": self.is_openai_model or (self.model is not None)
+            }
+        except Exception as e:
+            logger.error(f"Error getting model info: {str(e)}")
+            return {"error": str(e)}

services/llamaindex_service.py ADDED Viewed

	@@ -0,0 +1,199 @@

+import logging
+import os
+from typing import List, Optional, Any
+from pathlib import Path
+import shutil
+import asyncio
+from llama_index.core import (
+    VectorStoreIndex,
+    Document,
+    StorageContext,
+    load_index_from_storage,
+    Settings,
+    SummaryIndex
+)
+from llama_index.core.tools import QueryEngineTool, ToolMetadata
+from llama_index.core.agent import ReActAgent
+from llama_index.core.selectors import LLMSingleSelector
+from llama_index.core.query_engine import RouterQueryEngine
+from llama_index.llms.openai import OpenAI
+from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+from llama_index.embeddings.openai import OpenAIEmbedding
+import config
+from services.document_store_service import DocumentStoreService
+logger = logging.getLogger(__name__)
+class LlamaIndexService:
+    def __init__(self, document_store: DocumentStoreService):
+        self.document_store = document_store
+        self.config = config.config
+        self.storage_dir = Path(self.config.DATA_DIR) / "llamaindex_storage"
+        self.index = None
+        self.agent = None
+        self.is_initialized = False
+        self._initialize_settings()
+        # We don't fully initialize index here because we need async access to doc store
+        # But we try to load existing storage if available
+        self._try_load_from_storage()
+    def _initialize_settings(self):
+        """Initialize LlamaIndex settings (LLM, Embeddings)"""
+        try:
+            # LLM Setup
+            if self.config.OPENAI_API_KEY:
+                # Use configured OpenAI model (gpt-5.1-chat-latest or similar)
+                Settings.llm = OpenAI(model=self.config.OPENAI_MODEL, api_key=self.config.OPENAI_API_KEY)
+                logger.info(f"LlamaIndex using OpenAI model: {self.config.OPENAI_MODEL}")
+            elif self.config.NEBIUS_API_KEY:
+                # Use Nebius as OpenAI-compatible provider
+                Settings.llm = OpenAI(
+                    model=self.config.NEBIUS_MODEL,
+                    api_key=self.config.NEBIUS_API_KEY,
+                    api_base=self.config.NEBIUS_BASE_URL
+                )
+                logger.info(f"LlamaIndex using Nebius model: {self.config.NEBIUS_MODEL}")
+            else:
+                logger.warning("No API key found for LlamaIndex LLM (OpenAI or Nebius). Agentic features may fail.")
+            # Embedding Setup
+            if self.config.EMBEDDING_MODEL.startswith("text-embedding-"):
+                if self.config.OPENAI_API_KEY:
+                    Settings.embed_model = OpenAIEmbedding(
+                        model=self.config.EMBEDDING_MODEL,
+                        api_key=self.config.OPENAI_API_KEY
+                    )
+                    logger.info(f"LlamaIndex using OpenAI embeddings: {self.config.EMBEDDING_MODEL}")
+                else:
+                    logger.warning("OpenAI embedding model requested but no API key found. Falling back to HuggingFace.")
+                    Settings.embed_model = HuggingFaceEmbedding(
+                        model_name="sentence-transformers/all-MiniLM-L6-v2"
+                    )
+            else:
+                Settings.embed_model = HuggingFaceEmbedding(
+                    model_name=self.config.EMBEDDING_MODEL
+                )
+                logger.info(f"LlamaIndex using HuggingFace embeddings: {self.config.EMBEDDING_MODEL}")
+        except Exception as e:
+            logger.error(f"Error initializing LlamaIndex settings: {str(e)}")
+    def _try_load_from_storage(self):
+        """Try to load index from storage synchronously"""
+        try:
+            if self.storage_dir.exists():
+                logger.info("Loading LlamaIndex from storage...")
+                storage_context = StorageContext.from_defaults(persist_dir=str(self.storage_dir))
+                self.index = load_index_from_storage(storage_context)
+                self._initialize_agent()
+                self.is_initialized = True
+            else:
+                logger.info("No existing LlamaIndex storage found. Waiting for async initialization.")
+        except Exception as e:
+            logger.error(f"Error loading LlamaIndex from storage: {str(e)}")
+    async def initialize(self):
+        """Async initialization to sync documents and build index"""
+        try:
+            logger.info("Starting LlamaIndex async initialization...")
+            # If we already have an index, we might still want to sync if it's empty or stale
+            # For now, if no index exists, we definitely need to build it
+            if self.index is None:
+                await self.sync_from_document_store()
+            self.is_initialized = True
+            logger.info("LlamaIndex async initialization complete.")
+        except Exception as e:
+            logger.error(f"Error during LlamaIndex async initialization: {str(e)}")
+    async def sync_from_document_store(self):
+        """Sync documents from DocumentStore to LlamaIndex"""
+        try:
+            logger.info("Syncing documents from DocumentStore to LlamaIndex...")
+            # Fetch documents from async document store
+            # Limit to 1000 for now to avoid memory issues
+            docs = await self.document_store.list_documents(limit=1000)
+            if not docs:
+                logger.warning("No documents found in DocumentStore to sync.")
+                # Create empty index if no docs
+                self.index = VectorStoreIndex.from_documents([])
+            else:
+                # Convert to LlamaIndex documents
+                llama_docs = []
+                for doc in docs:
+                    llama_doc = Document(
+                        text=doc.content,
+                        metadata={
+                            "filename": doc.filename,
+                            "document_id": doc.id,
+                            **doc.metadata
+                        }
+                    )
+                    llama_docs.append(llama_doc)
+                logger.info(f"Building LlamaIndex with {len(llama_docs)} documents...")
+                self.index = VectorStoreIndex.from_documents(llama_docs)
+            # Persist storage
+            if not self.storage_dir.exists():
+                self.storage_dir.mkdir(parents=True, exist_ok=True)
+            self.index.storage_context.persist(persist_dir=str(self.storage_dir))
+            # Re-initialize agent with new index
+            self._initialize_agent()
+            logger.info("LlamaIndex sync complete.")
+        except Exception as e:
+            logger.error(f"Error syncing LlamaIndex: {str(e)}")
+    async def sync_on_demand(self):
+        """Manual trigger for syncing documents"""
+        await self.sync_from_document_store()
+        return True
+    def _initialize_agent(self):
+        """Initialize the ReAct agent with query engine tools"""
+        try:
+            if not self.index:
+                return
+            query_engine = self.index.as_query_engine()
+            query_engine_tool = QueryEngineTool(
+                query_engine=query_engine,
+                metadata=ToolMetadata(
+                    name="document_search",
+                    description="Search and retrieve information from the document library. Use this for specific questions about content."
+                )
+            )
+            self.agent = ReActAgent.from_tools(
+                [query_engine_tool],
+                llm=Settings.llm,
+                verbose=True
+            )
+            logger.info("LlamaIndex ReAct agent initialized")
+        except Exception as e:
+            logger.error(f"Error initializing LlamaIndex agent: {str(e)}")
+    async def query(self, query_text: str) -> str:
+        """Process a query using the agent"""
+        if not self.agent:
+            if not self.is_initialized:
+                return "Agent is initializing, please try again in a moment."
+            return "Agent failed to initialize. Please check logs."
+        try:
+            response = await self.agent.achat(query_text)
+            return str(response)
+        except Exception as e:
+            logger.error(f"Error querying LlamaIndex agent: {str(e)}")
+            return f"Error processing query: {str(e)}"

services/llm_service.py ADDED Viewed

	@@ -0,0 +1,420 @@

+from mistralai import Mistral
+import logging
+import asyncio
+from typing import List, Dict, Any, Optional
+import openai
+import config
+logger = logging.getLogger(__name__)
+class LLMService:
+    def __init__(self):
+        self.config = config.config
+        self.nebius_client = None
+        self.mistral_client = None
+        self.openai_client = None
+        self._initialize_clients()
+    def _initialize_clients(self):
+        """Initialize LLM clients"""
+        try:
+            if self.config.OPENAI_API_KEY:
+                self.openai_client = openai.OpenAI(
+                    api_key=self.config.OPENAI_API_KEY
+                )
+                logger.info("OpenAI client initialized")
+            if self.config.NEBIUS_API_KEY:
+                self.nebius_client = openai.OpenAI(
+                    api_key=self.config.NEBIUS_API_KEY,
+                    base_url=self.config.NEBIUS_BASE_URL
+                )
+                logger.info("NEBIUS client initialized")
+            if self.config.MISTRAL_API_KEY:
+                self.mistral_client = Mistral( # Standard sync client
+                    api_key=self.config.MISTRAL_API_KEY
+                )
+                logger.info("Mistral client initialized")
+            # Check if at least one client is initialized
+            if not any([self.openai_client, self.nebius_client, self.mistral_client]):
+                logger.warning("No LLM clients could be initialized based on current config. Check API keys.")
+            else:
+                logger.info("LLM clients initialized successfully (at least one).")
+        except Exception as e:
+            logger.error(f"Error initializing LLM clients: {str(e)}")
+            raise
+    async def generate_text(self, prompt: str, model: str = "auto", max_tokens: int = 1000, temperature: float = 0.7) -> str:
+        """Generate text using the specified model, with new priority for 'auto'."""
+        try:
+            selected_model_name_for_call: str = ""
+            if model == "auto":
+                # Priority: 1. NEBIUS (Llama 3.3 - Cost Effective), 2. OpenAI (GPT-5.1), 3. Mistral
+                if self.nebius_client and self.config.NEBIUS_MODEL:
+                    selected_model_name_for_call = self.config.NEBIUS_MODEL
+                    logger.debug(f"Auto-selected NEBIUS model: {selected_model_name_for_call}")
+                    return await self._generate_with_nebius(prompt, selected_model_name_for_call, max_tokens, temperature)
+                elif self.openai_client and self.config.OPENAI_MODEL:
+                    selected_model_name_for_call = self.config.OPENAI_MODEL
+                    logger.debug(f"Auto-selected OpenAI model: {selected_model_name_for_call}")
+                    return await self._generate_with_openai(prompt, selected_model_name_for_call, max_tokens, temperature)
+                elif self.mistral_client and self.config.MISTRAL_MODEL:
+                    selected_model_name_for_call = self.config.MISTRAL_MODEL
+                    logger.debug(f"Auto-selected Mistral model: {selected_model_name_for_call}")
+                    return await self._generate_with_mistral(prompt, selected_model_name_for_call, max_tokens, temperature)
+                else:
+                    logger.error("No LLM clients available for 'auto' mode or default models not configured.")
+                    raise ValueError("No LLM clients available for 'auto' mode or default models not configured.")
+            elif model == "fast":
+                 # Priority for speed: 1. OpenAI (GPT-5-mini), 2. Mistral Small, 3. Nebius
+                if self.openai_client and self.config.FAST_MODEL:
+                    return await self._generate_with_openai(prompt, self.config.FAST_MODEL, max_tokens, temperature)
+                # Fallback to auto if fast model not available
+                return await self.generate_text(prompt, "auto", max_tokens, temperature)
+            elif model.startswith("gpt-") or model.startswith("openai/") or "o1-" in model or "o3-" in model:
+                if self.openai_client:
+                     actual_model = model.split('/')[-1] if '/' in model else model
+                     return await self._generate_with_openai(prompt, actual_model, max_tokens, temperature)
+                elif self.nebius_client and "gpt-oss" in model: # Handle Nebius "openai/" prefix if any
+                     actual_model = model.split('/')[-1] if '/' in model else model
+                     return await self._generate_with_nebius(prompt, actual_model, max_tokens, temperature)
+                else:
+                     raise ValueError("OpenAI client not available. Check API key.")
+            elif model.lower().startswith("nebius/") or model.lower().startswith("meta-llama/"):
+                if not self.nebius_client:
+                    raise ValueError("NEBIUS client not available. Check API key.")
+                return await self._generate_with_nebius(prompt, model, max_tokens, temperature)
+            elif model.startswith("mistral"):
+                if not self.mistral_client:
+                    raise ValueError("Mistral client not available. Check API key or model prefix.")
+                return await self._generate_with_mistral(prompt, model, max_tokens, temperature)
+            else:
+                raise ValueError(f"Unsupported model: {model}. Must start with 'gpt-', 'openai/', 'nebius/', 'mistral', or be 'auto'.")
+        except Exception as e:
+            logger.error(f"Error generating text with model '{model}': {str(e)}")
+            raise
+    async def _generate_with_openai(self, prompt: str, model_name: str, max_tokens: int, temperature: float) -> str:
+        """Generate text using OpenAI"""
+        if not self.openai_client:
+            raise RuntimeError("OpenAI client not initialized.")
+        try:
+            logger.debug(f"Generating with OpenAI model: {model_name}, max_tokens: {max_tokens}, temp: {temperature}")
+            loop = asyncio.get_event_loop()
+            # Determine correct token parameter based on model family
+            # GPT-5, o1, o3 series use max_completion_tokens
+            use_completion_tokens = any(x in model_name for x in ["gpt-5", "o1-", "o3-"])
+            kwargs = {
+                "model": model_name,
+                "messages": [{"role": "user", "content": prompt}],
+            }
+            if use_completion_tokens:
+                kwargs["max_completion_tokens"] = max_tokens
+                # Reasoning models enforce temperature=1
+                kwargs["temperature"] = 1
+                if temperature != 1:
+                    logger.warning(f"Temperature {temperature} ignored for model {model_name} (requires 1).")
+            else:
+                kwargs["max_tokens"] = max_tokens
+                kwargs["temperature"] = temperature
+            response = await loop.run_in_executor(
+                None,
+                lambda: self.openai_client.chat.completions.create(**kwargs)
+            )
+            if response.choices and response.choices[0].message:
+                 content = response.choices[0].message.content
+                 if content is not None:
+                     return content.strip()
+            return ""
+        except Exception as e:
+            logger.error(f"Error with OpenAI generation (model: {model_name}): {str(e)}")
+            raise
+    async def _generate_with_nebius(self, prompt: str, model_name: str, max_tokens: int, temperature: float) -> str:
+        """Generate text using NEBIUS (OpenAI OSS models via sync client)"""
+        if not self.nebius_client:
+            raise RuntimeError("NEBIUS client not initialized.")
+        try:
+            logger.debug(f"Generating with NEBIUS model: {model_name}, max_tokens: {max_tokens}, temp: {temperature}, prompt: '{prompt[:50]}...'")
+            loop = asyncio.get_event_loop()
+            response = await loop.run_in_executor(
+                None,
+                lambda: self.nebius_client.chat.completions.create(
+                    model=model_name,
+                    messages=[{"role": "user", "content": prompt}],
+                    max_tokens=max_tokens,
+                    temperature=temperature
+                )
+            )
+            if response.choices and response.choices[0].message:
+                 content = response.choices[0].message.content
+                 if content is not None:
+                     return content.strip()
+                 else:
+                     logger.warning(f"NEBIUS response message content is None for model {model_name}.")
+                     return ""
+            else:
+                logger.warning(f"NEBIUS response did not contain expected choices or message for model {model_name}.")
+                return ""
+        except Exception as e:
+            logger.error(f"Error with NEBIUS generation (model: {model_name}): {str(e)}")
+            raise
+    async def _generate_with_mistral(self, prompt: str, model_name: str, max_tokens: int, temperature: float) -> str:
+        """Generate text using Mistral (Sync via run_in_executor)"""
+        if not self.mistral_client:
+            raise RuntimeError("Mistral client not initialized.")
+        try:
+            logger.debug(f"Generating with Mistral model: {model_name}, temp: {temperature}, prompt: '{prompt[:50]}...' (max_tokens: {max_tokens} - note: not directly used by MistralClient.chat)")
+            loop = asyncio.get_event_loop()
+            response = await loop.run_in_executor(
+                None,
+                lambda: self.mistral_client.chat(
+                    model=model_name,
+                    messages=[{"role": "user", "content": prompt}],
+                    max_tokens=max_tokens,
+                    temperature=temperature
+                )
+            )
+            if response.choices and response.choices[0].message:
+                content = response.choices[0].message.content
+                if content is not None:
+                    return content.strip()
+                else:
+                    logger.warning(f"Mistral response message content is None for model {model_name}.")
+                    return ""
+            else:
+                logger.warning(f"Mistral response did not contain expected choices or message for model {model_name}.")
+                return ""
+        except Exception as e:
+            logger.error(f"Error with Mistral generation (model: {model_name}): {str(e)}")
+            raise
+    async def summarize(self, text: str, style: str = "concise", max_length: Optional[int] = None) -> str:
+        if not text.strip():
+            return ""
+        style_prompts = {
+            "concise": "Provide a concise summary of the following text, focusing on the main points:",
+            "detailed": "Provide a detailed summary of the following text, including key details and supporting information:",
+            "bullet_points": "Summarize the following text as a list of bullet points highlighting the main ideas:",
+            "executive": "Provide an executive summary of the following text, focusing on key findings and actionable insights:"
+        }
+        prompt_template = style_prompts.get(style, style_prompts["concise"])
+        if max_length:
+            prompt_template += f" Keep the summary under approximately {max_length} words."
+        prompt = f"{prompt_template}\n\nText to summarize:\n{text}\n\nSummary:"
+        try:
+            summary_max_tokens = (max_length * 2) if max_length else 500
+            summary = await self.generate_text(prompt, model="auto", max_tokens=summary_max_tokens, temperature=0.3)
+            return summary.strip()
+        except Exception as e:
+            logger.error(f"Error generating summary: {str(e)}")
+            return "Error generating summary"
+    async def generate_tags(self, text: str, max_tags: int = 5) -> List[str]:
+        if not text.strip():
+            return []
+        prompt = f"""Generate up to {max_tags} relevant tags for the following text.
+        Tags should be concise, descriptive keywords or phrases (1-3 words typically) that capture the main topics or themes.
+        Return only the tags, separated by commas. Do not include any preamble or explanation.
+        Text:
+        {text}
+        Tags:"""
+        try:
+            # Use FAST_MODEL for tags
+            response = await self.generate_text(prompt, model="fast", max_tokens=100, temperature=0.5)
+            tags = [tag.strip().lower() for tag in response.split(',') if tag.strip()]
+            tags = [tag for tag in tags if tag and len(tag) > 1 and len(tag) < 50]
+            return list(dict.fromkeys(tags))[:max_tags]
+        except Exception as e:
+            logger.error(f"Error generating tags: {str(e)}")
+            return []
+    async def categorize(self, text: str, categories: List[str]) -> str:
+        if not text.strip() or not categories:
+            return "Uncategorized"
+        categories_str = ", ".join([f"'{cat}'" for cat in categories])
+        prompt = f"""Classify the following text into ONE of these categories: {categories_str}.
+        Choose the single most appropriate category based on the content and main theme of the text.
+        Return only the category name as a string, exactly as it appears in the list provided. Do not add any other text or explanation.
+        Text to classify:
+        {text}
+        Category:"""
+        try:
+            # Use FAST_MODEL for categorization
+            response = await self.generate_text(prompt, model="fast", max_tokens=50, temperature=0.1)
+            category_candidate = response.strip().strip("'\"")
+            for cat in categories:
+                if cat.lower() == category_candidate.lower():
+                    return cat
+            logger.warning(f"LLM returned category '{category_candidate}' which is not in the provided list: {categories}. Falling back.")
+            return categories[0] if categories else "Uncategorized"
+        except Exception as e:
+            logger.error(f"Error categorizing text: {str(e)}")
+            return "Uncategorized"
+    async def answer_question(self, question: str, context: str, max_context_length: int = 3000) -> str:
+        if not question.strip():
+            return "No question provided."
+        if not context.strip():
+            return "I don't have enough context to answer this question. Please provide relevant information."
+        if len(context) > max_context_length:
+            context = context[:max_context_length] + "..."
+            logger.warning(f"Context truncated to {max_context_length} characters for question answering.")
+        prompt = f"""You are an expert Q&A assistant. Your task is to synthesize an answer to the user's question based *only* on the provided source documents.
+Analyze all the source documents provided in the context below.
+If the information is present, provide a comprehensive answer.
+Here are the source documents:
+--- START OF CONTEXT ---
+{context}
+--- END OF CONTEXT ---
+Based on the context above, please provide a clear and concise answer to the following question.
+Question: {question}
+Answer:"""
+        try:
+            answer = await self.generate_text(prompt, model="auto", max_tokens=800, temperature=0.5)
+            return answer.strip()
+        except Exception as e:
+            logger.error(f"Error answering question: {str(e)}")
+            return "I encountered an error while trying to answer your question."
+    async def extract_key_information(self, text: str) -> Dict[str, Any]:
+        if not text.strip():
+            return {}
+        prompt = f"""Analyze the following text and extract key information.
+        Provide the response as a JSON object with the following keys:
+        - "main_topic": (string) The main topic or subject of the text.
+        - "key_points": (array of strings) A list of 3-5 key points or takeaways.
+        - "entities": (array of strings) Important people, places, organizations, or products mentioned.
+        - "sentiment": (string) Overall sentiment of the text (e.g., "positive", "neutral", "negative", "mixed").
+        - "content_type": (string) The perceived type of content (e.g., "article", "email", "report", "conversation", "advertisement", "other").
+        If a piece of information is not found or not applicable, use null or an empty array/string as appropriate for the JSON structure.
+        Text to analyze:
+        ---
+        {text}
+        ---
+        JSON Analysis:"""
+        try:
+            response_str = await self.generate_text(prompt, model="auto", max_tokens=500, temperature=0.4)
+            import json
+            try:
+                if response_str.startswith("```json"):
+                    response_str = response_str.lstrip("```json").rstrip("```").strip()
+                info = json.loads(response_str)
+                expected_keys = {"main_topic", "key_points", "entities", "sentiment", "content_type"}
+                if not expected_keys.issubset(info.keys()):
+                    logger.warning(f"Extracted information missing some expected keys. Got: {info.keys()}")
+                return info
+            except json.JSONDecodeError as je:
+                logger.error(f"Failed to parse JSON from LLM response for key_information: {je}")
+                logger.debug(f"LLM Response string was: {response_str}")
+                info_fallback = {}
+                lines = response_str.split('\n')
+                for line in lines:
+                    if ':' in line:
+                        key, value = line.split(':', 1)
+                        key_clean = key.strip().lower().replace(' ', '_')
+                        value_clean = value.strip()
+                        if value_clean:
+                            if key_clean in ["key_points", "entities"] and '[' in value_clean and ']' in value_clean:
+                                try:
+                                    info_fallback[key_clean] = [item.strip().strip("'\"") for item in value_clean.strip('[]').split(',') if item.strip()]
+                                except: info_fallback[key_clean] = value_clean
+                            else: info_fallback[key_clean] = value_clean
+                if info_fallback:
+                    logger.info("Successfully parsed key information using fallback line-based method.")
+                    return info_fallback
+                return {"error": "Failed to parse LLM output", "raw_response": response_str}
+        except Exception as e:
+            logger.error(f"Error extracting key information: {str(e)}")
+            return {"error": f"General error extracting key information: {str(e)}"}
+    async def check_availability(self) -> Dict[str, bool]:
+        """Check which LLM services are available by making a tiny test call."""
+        availability = {
+            "openai": False,
+            "nebius": False,
+            "mistral": False
+        }
+        test_prompt = "Hello"
+        test_max_tokens = 5
+        test_temp = 0.1
+        logger.info("Checking LLM availability...")
+        if self.openai_client and self.config.OPENAI_MODEL:
+            try:
+                logger.debug(f"Testing OpenAI availability with model {self.config.OPENAI_MODEL}...")
+                test_response = await self._generate_with_openai(test_prompt, self.config.OPENAI_MODEL, test_max_tokens, test_temp)
+                availability["openai"] = bool(test_response.strip())
+            except Exception as e:
+                logger.warning(f"OpenAI availability check failed for model {self.config.OPENAI_MODEL}: {e}")
+        logger.info(f"OpenAI available: {availability['openai']}")
+        if self.nebius_client and self.config.NEBIUS_MODEL:
+            try:
+                logger.debug(f"Testing NEBIUS availability with model {self.config.NEBIUS_MODEL}...")
+                test_response = await self._generate_with_nebius(test_prompt, self.config.NEBIUS_MODEL, test_max_tokens, test_temp)
+                availability["nebius"] = bool(test_response.strip())
+            except Exception as e:
+                logger.warning(f"NEBIUS availability check failed for model {self.config.NEBIUS_MODEL}: {e}")
+        logger.info(f"NEBIUS available: {availability['nebius']}")
+        if self.mistral_client and self.config.MISTRAL_MODEL:
+            try:
+                logger.debug(f"Testing Mistral availability with model {self.config.MISTRAL_MODEL}...")
+                test_response = await self._generate_with_mistral(test_prompt, self.config.MISTRAL_MODEL, test_max_tokens, test_temp)
+                availability["mistral"] = bool(test_response.strip())
+            except Exception as e:
+                logger.warning(f"Mistral availability check failed for model {self.config.MISTRAL_MODEL}: {e}")
+        logger.info(f"Mistral available: {availability['mistral']}")
+        logger.info(f"Final LLM Availability: {availability}")
+        return availability

services/ocr_service.py ADDED Viewed

	@@ -0,0 +1,288 @@

+import logging
+import asyncio
+from pathlib import Path
+import os
+import base64 # For encoding files
+from typing import Optional, List, Dict, Any
+import json
+from mistralai import Mistral
+from mistralai.models import SDKError
+# PIL (Pillow) for dummy image creation in main_example
+from PIL import Image, ImageDraw, ImageFont
+logger = logging.getLogger(__name__)
+class OCRService:
+    def __init__(self):
+        self.api_key = os.environ.get("MISTRAL_API_KEY")
+        if not self.api_key:
+            logger.error("MISTRAL_API_KEY environment variable not set.")
+            raise ValueError("MISTRAL_API_KEY not found in environment variables.")
+        self.client = Mistral(api_key=self.api_key)
+        self.ocr_model_name = "mistral-ocr-latest"
+        self.language = 'eng'
+        logger.info(f"OCRService (using Mistral AI model {self.ocr_model_name}) initialized.")
+    def _encode_file_to_base64(self, file_path: str) -> Optional[str]:
+        try:
+            with open(file_path, "rb") as file_to_encode:
+                return base64.b64encode(file_to_encode.read()).decode('utf-8')
+        except FileNotFoundError:
+            logger.error(f"Error: The file {file_path} was not found for Base64 encoding.")
+            return None
+        except Exception as e:
+            logger.error(f"Error during Base64 encoding for {file_path}: {e}")
+            return None
+    # In OCRService class:
+    async def _process_file_with_mistral(self, file_path: str, mime_type: str) -> str:
+        file_name = Path(file_path).name
+        logger.info(f"Preparing to process file: {file_name} (MIME: {mime_type}) with Mistral OCR.")
+        base64_encoded_file = self._encode_file_to_base64(file_path)
+        if not base64_encoded_file:
+            logger.warning(f"Base64 encoding failed for {file_name}, cannot process.")
+            return ""
+        document_type = "image_url" if mime_type.startswith("image/") else "document_url"
+        uri_key = "image_url" if document_type == "image_url" else "document_url"
+        data_uri = f"data:{mime_type};base64,{base64_encoded_file}"
+        document_payload = {
+            "type": document_type,
+            uri_key: data_uri
+        }
+        try:
+            logger.info(f"Calling Mistral client.ocr.process for {file_name} with model {self.ocr_model_name}.")
+            loop = asyncio.get_event_loop()
+            ocr_response = await loop.run_in_executor(
+                None,
+                lambda: self.client.ocr.process(
+                    model=self.ocr_model_name,
+                    document=document_payload,
+                    include_image_base64=False
+                )
+            )
+            logger.info(f"Received OCR response for {file_name}. Type: {type(ocr_response)}")
+            extracted_markdown = ""
+            if hasattr(ocr_response, 'pages') and ocr_response.pages and isinstance(ocr_response.pages, list):
+                all_pages_markdown = []
+                for i, page in enumerate(ocr_response.pages):
+                    page_content = None
+                    if hasattr(page, 'markdown') and page.markdown: # Check for 'markdown' attribute
+                        page_content = page.markdown
+                        logger.debug(f"Extracted content from page {i} using 'page.markdown'.")
+                    elif hasattr(page, 'markdown_content') and page.markdown_content:
+                        page_content = page.markdown_content
+                        logger.debug(f"Extracted content from page {i} using 'page.markdown_content'.")
+                    elif hasattr(page, 'text') and page.text:
+                        page_content = page.text
+                        logger.debug(f"Extracted content from page {i} using 'page.text'.")
+                    if page_content:
+                        all_pages_markdown.append(page_content)
+                    else:
+                        page_details_for_log = str(page)[:200] # Default to string snippet
+                        if hasattr(page, '__dict__'):
+                             page_details_for_log = str(vars(page))[:200] # Log part of vars if it's an object
+                        logger.warning(f"Page {i} in OCR response for {file_name} has no 'markdown', 'markdown_content', or 'text'. Page details: {page_details_for_log}")
+                if all_pages_markdown:
+                    extracted_markdown = "\n\n---\nPage Break (simulated)\n---\n\n".join(all_pages_markdown) # Simulate page breaks
+                else:
+                    logger.warning(f"'pages' attribute found but no content extracted from any pages for {file_name}.")
+            # Fallbacks if ocr_response doesn't have 'pages' but might have direct text/markdown
+            elif hasattr(ocr_response, 'text') and ocr_response.text:
+                 extracted_markdown = ocr_response.text
+                 logger.info(f"Extracted content from 'ocr_response.text' (no pages structure) for {file_name}.")
+            elif hasattr(ocr_response, 'markdown') and ocr_response.markdown:
+                 extracted_markdown = ocr_response.markdown
+                 logger.info(f"Extracted content from 'ocr_response.markdown' (no pages structure) for {file_name}.")
+            elif isinstance(ocr_response, str) and ocr_response:
+                 extracted_markdown = ocr_response
+                 logger.info(f"OCR response is a direct non-empty string for {file_name}.")
+            else:
+                logger.warning(f"Could not extract markdown from OCR response for {file_name} using known attributes (pages, text, markdown).")
+            if not extracted_markdown.strip():
+                logger.warning(f"Extracted markdown is empty for {file_name} after all parsing attempts.")
+            return extracted_markdown.strip()
+        except SDKError as e:
+            logger.error(f"Mistral API Exception during client.ocr.process for {file_name}: {e.message}")
+            logger.exception("SDKError details:")
+            return ""
+        except Exception as e:
+            logger.error(f"Generic Exception during Mistral client.ocr.process call for {file_name}: {e}")
+            logger.exception("Exception details:")
+            return ""
+    async def extract_text_from_image(self, image_path: str, language: Optional[str] = None) -> str:
+        if language:
+            logger.info(f"Language parameter '{language}' provided, but Mistral OCR is broadly multilingual.")
+        ext = Path(image_path).suffix.lower()
+        mime_map = {'.jpeg': 'image/jpeg', '.jpg': 'image/jpeg', '.png': 'image/png',
+                    '.gif': 'image/gif', '.bmp': 'image/bmp', '.tiff': 'image/tiff', '.webp': 'image/webp',
+                    '.avif': 'image/avif'}
+        mime_type = mime_map.get(ext)
+        if not mime_type:
+            logger.warning(f"Unsupported image extension '{ext}' for path '{image_path}'. Attempting with 'application/octet-stream'.")
+            mime_type = 'application/octet-stream'
+        return await self._process_file_with_mistral(image_path, mime_type)
+    async def extract_text_from_pdf(self, pdf_path: str) -> str:
+        return await self._process_file_with_mistral(pdf_path, "application/pdf")
+    async def extract_text_from_pdf_images(self, pdf_path: str) -> List[str]:
+        logger.info("Mistral processes PDFs directly. This method will return the full Markdown content as a single list item.")
+        full_markdown = await self._process_file_with_mistral(pdf_path, "application/pdf")
+        if full_markdown:
+            return [full_markdown]
+        return [""]
+    async def extract_text_with_confidence(self, image_path: str, min_confidence: float = 0.5) -> Dict[str, Any]:
+        logger.warning("Mistral Document AI API (ocr.process) typically returns structured text (Markdown). Word-level confidence scores are not standard. 'confidence' field is a placeholder.")
+        ext = Path(image_path).suffix.lower()
+        mime_map = {'.jpeg': 'image/jpeg', '.jpg': 'image/jpeg', '.png': 'image/png', '.avif': 'image/avif'}
+        mime_type = mime_map.get(ext)
+        if not mime_type:
+            logger.warning(f"Unsupported image extension '{ext}' in extract_text_with_confidence. Defaulting mime type.")
+            mime_type = 'application/octet-stream'
+        text_markdown = await self._process_file_with_mistral(image_path, mime_type)
+        return {
+            "text": text_markdown,
+            "confidence": 0.0,
+            "word_count": len(text_markdown.split()) if text_markdown else 0,
+            "raw_data": "Mistral ocr.process response contains structured data. See logs from _process_file_with_mistral for details."
+        }
+    async def detect_language(self, image_path: str) -> str:
+        logger.warning("Mistral OCR is multilingual; explicit language detection is not part of client.ocr.process.")
+        return 'eng'
+    async def extract_tables_from_image(self, image_path: str) -> List[List[str]]:
+        logger.info("Extracting text (Markdown) from image using Mistral. Mistral OCR preserves table structures in Markdown.")
+        ext = Path(image_path).suffix.lower()
+        mime_map = {'.jpeg': 'image/jpeg', '.jpg': 'image/jpeg', '.png': 'image/png', '.avif': 'image/avif'}
+        mime_type = mime_map.get(ext)
+        if not mime_type:
+             logger.warning(f"Unsupported image extension '{ext}' in extract_tables_from_image. Defaulting mime type.")
+             mime_type = 'application/octet-stream'
+        markdown_content = await self._process_file_with_mistral(image_path, mime_type)
+        if markdown_content:
+            logger.info("Attempting basic parsing of Markdown tables. For complex tables, a dedicated parser is recommended.")
+            table_data = []
+            # Simplified parsing logic for example purposes - can be improved significantly.
+            lines = markdown_content.split('\n')
+            for line in lines:
+                stripped_line = line.strip()
+                if stripped_line.startswith('|') and stripped_line.endswith('|') and "---" not in stripped_line:
+                    cells = [cell.strip() for cell in stripped_line.strip('|').split('|')]
+                    if any(cells):
+                        table_data.append(cells)
+            if table_data:
+                 logger.info(f"Extracted {len(table_data)} lines potentially forming tables using basic parsing.")
+            else:
+                 logger.info("No distinct table structures found with basic parsing from extracted markdown.")
+            return table_data
+        return []
+    async def get_supported_languages(self) -> List[str]:
+        logger.info("Mistral OCR is multilingual. Refer to official Mistral AI documentation for details.")
+        return ['eng', 'multilingual (refer to Mistral documentation)']
+    async def validate_ocr_setup(self) -> Dict[str, Any]:
+        try:
+            models_response = await asyncio.to_thread(self.client.models.list)
+            model_ids = [model.id for model in models_response.data]
+            return {
+                "status": "operational",
+                "message": "Mistral client initialized. API key present. Model listing successful.",
+                "mistral_available_models_sample": model_ids[:5],
+                "configured_ocr_model": self.ocr_model_name,
+            }
+        except SDKError as e:
+            logger.error(f"Mistral API Exception during setup validation: {e.message}")
+            return { "status": "error", "error": f"Mistral API Error: {e.message}"}
+        except Exception as e:
+            logger.error(f"Generic error during Mistral OCR setup validation: {str(e)}")
+            return { "status": "error", "error": str(e) }
+    def extract_text(self, file_path: str) -> str:
+        logger.warning("`extract_text` is a synchronous method. Running async Mistral OCR in a blocking way.")
+        try:
+            ext = Path(file_path).suffix.lower()
+            if ext in ['.jpeg', '.jpg', '.png', '.gif', '.bmp', '.tiff', '.webp', '.avif']:
+                result = asyncio.run(self.extract_text_from_image(file_path))
+            elif ext == '.pdf':
+                result = asyncio.run(self.extract_text_from_pdf(file_path))
+            else:
+                logger.error(f"Unsupported file type for sync extract_text: {file_path}")
+                return "Unsupported file type."
+            return result
+        except Exception as e:
+            logger.error(f"Error in synchronous extract_text for {file_path}: {str(e)}")
+            return "Error during sync extraction."
+# Example of how to use the OCRService (main execution part)
+async def main_example():
+    logging.basicConfig(level=logging.DEBUG,
+                        format='%(asctime)s - %(levelname)s - %(name)s - %(funcName)s - %(message)s')
+    if not os.environ.get("MISTRAL_API_KEY"):
+       logger.error("MISTRAL_API_KEY environment variable is not set. Please set it: export MISTRAL_API_KEY='yourkey'")
+       return
+    ocr_service = OCRService()
+    logger.info("--- Validating OCR Service Setup ---")
+    validation_status = await ocr_service.validate_ocr_setup()
+    logger.info(f"OCR Service Validation: {validation_status}")
+    if validation_status.get("status") == "error":
+        logger.error("Halting due to validation error.")
+        return
+    # --- Test with a specific PDF file ---
+    pdf_path_to_test = r"C:\path\to\your\certificate.pdf"
+    if os.path.exists(pdf_path_to_test):
+        logger.info(f"\n--- Extracting text from specific PDF: {pdf_path_to_test} ---")
+        # Using the method that aligns with original `extract_text_from_pdf_images` signature
+        pdf_markdown_list = await ocr_service.extract_text_from_pdf_images(pdf_path_to_test)
+        if pdf_markdown_list and pdf_markdown_list[0]:
+            logger.info(f"Extracted Markdown from PDF ({pdf_path_to_test}):\n" + pdf_markdown_list[0])
+        else:
+            logger.warning(f"No text extracted from PDF {pdf_path_to_test} or an error occurred.")
+    else:
+        logger.warning(f"PDF file for specific test '{pdf_path_to_test}' not found. Skipping this test.")
+        logger.warning("Please update `pdf_path_to_test` in `main_example` to a valid PDF path.")
+    image_path = "dummy_test_image_ocr.png"
+    if os.path.exists(image_path):
+        logger.info(f"\n---Extracting text from image: {image_path} ---")
+        # ... image processing logic ...
+        pass
+    else:
+        logger.info(f"Dummy image {image_path} not created or found, skipping optional image test.")
+if __name__ == '__main__':
+    asyncio.run(main_example())

services/podcast_generator_service.py ADDED Viewed

	@@ -0,0 +1,663 @@

+import logging
+import asyncio
+import json
+import uuid
+from typing import List, Dict, Any, Optional
+from dataclasses import dataclass, asdict
+from datetime import datetime
+from pathlib import Path
+import re
+try:
+    from elevenlabs import VoiceSettings
+    from elevenlabs.client import ElevenLabs
+    ELEVENLABS_AVAILABLE = True
+except ImportError:
+    ELEVENLABS_AVAILABLE = False
+import config
+from services.llamaindex_service import LlamaIndexService
+from services.llm_service import LLMService
+logger = logging.getLogger(__name__)
+@dataclass
+class DocumentAnalysis:
+    """Analysis results from document(s)"""
+    key_insights: List[str]  # 5-7 main points
+    topics: List[str]
+    complexity_level: str  # beginner, intermediate, advanced
+    estimated_words: int
+    source_documents: List[str]
+    summary: str
+@dataclass
+class DialogueLine:
+    """Single line of podcast dialogue"""
+    speaker: str  # "HOST1" or "HOST2"
+    text: str
+    pause_after: float = 0.5  # seconds
+@dataclass
+class PodcastScript:
+    """Complete podcast script"""
+    dialogue: List[DialogueLine]
+    total_duration_estimate: float
+    word_count: int
+    style: str
+    def to_text(self) -> str:
+        """Convert to readable transcript"""
+        lines = []
+        for line in self.dialogue:
+            lines.append(f"{line.speaker}: {line.text}")
+        return "\n\n".join(lines)
+@dataclass
+class PodcastMetadata:
+    """Metadata for generated podcast"""
+    podcast_id: str
+    title: str
+    description: str
+    source_documents: List[str]
+    style: str
+    duration_seconds: float
+    file_size_mb: float
+    voices: Dict[str, str]
+    generated_at: str
+    generation_cost: Dict[str, float]
+    key_topics: List[str]
+@dataclass
+class PodcastResult:
+    """Complete podcast generation result"""
+    podcast_id: str
+    audio_file_path: str
+    transcript: str
+    metadata: PodcastMetadata
+    generation_time: float
+    success: bool
+    error: Optional[str] = None
+class PodcastGeneratorService:
+    """
+    Service for generating conversational podcasts from documents.
+    Combines LlamaIndex for analysis and ElevenLabs for voice synthesis.
+    The pipeline is: analyze documents, generate a HOST1/HOST2 script with the
+    LLM, synthesize audio, then record metadata and a transcript on disk.
+    """
+    # Speaking rate used to convert between target minutes and word counts
+    WORDS_PER_MINUTE = 150
+    # Prompt templates keyed by style name. Each template is filled with
+    # str.format using the placeholders {analysis}, {duration_minutes} and
+    # {word_count}; unknown styles fall back to "conversational" at lookup time.
+    SCRIPT_PROMPTS = {
+        "conversational": """You are an expert podcast script writer. Create an engaging 2-host podcast discussing insights from documents.
+CONTEXT:
+{analysis}
+REQUIREMENTS:
+- Duration: {duration_minutes} minutes (approximately {word_count} words)
+- Style: Conversational, friendly, and accessible
+- Format: Alternating dialogue between HOST1 and HOST2
+- Include natural transitions, questions, and "aha!" moments
+- Make complex topics easy to understand
+- Add enthusiasm and genuine curiosity
+- Balance speaking time between both hosts
+DIALOGUE FORMAT (strictly follow):
+HOST1: [What they say]
+HOST2: [What they say]
+STRUCTURE:
+1. Opening Hook (30 seconds): Grab attention with an intriguing question or fact
+2. Introduction (1 minute): Set context and preview what's coming
+3. Main Discussion (70% of time): Deep dive into key insights
+4. Wrap-up (1 minute): Summarize key takeaways and final thoughts
+TONE: Friendly, enthusiastic, educational but not condescending
+Generate the complete podcast script now:""",
+        "educational": """You are creating an educational podcast script. Two hosts discuss document insights in a clear, instructive manner.
+CONTEXT:
+{analysis}
+REQUIREMENTS:
+- Duration: {duration_minutes} minutes (approximately {word_count} words)
+- Style: Clear, methodical, educational
+- HOST1 acts as the teacher/expert, HOST2 as the curious learner
+- Include explanations of complex concepts
+- Use examples and analogies
+- Build knowledge progressively
+DIALOGUE FORMAT:
+HOST1: [Expert explanation]
+HOST2: [Clarifying question or observation]
+Generate the complete educational podcast script now:""",
+        "technical": """You are writing a technical podcast for an informed audience. Discuss document insights with precision and depth.
+CONTEXT:
+{analysis}
+REQUIREMENTS:
+- Duration: {duration_minutes} minutes (approximately {word_count} words)
+- Style: Professional, detailed, technically accurate
+- HOST1 is the subject matter expert, HOST2 is an informed interviewer
+- Use proper technical terminology
+- Dive into implementation details
+- Discuss implications and applications
+DIALOGUE FORMAT:
+HOST1: [Technical insight]
+HOST2: [Probing question]
+Generate the complete technical podcast script now:""",
+        "casual": """You are creating a fun, casual podcast. Two friends discuss interesting ideas from documents.
+CONTEXT:
+{analysis}
+REQUIREMENTS:
+- Duration: {duration_minutes} minutes (approximately {word_count} words)
+- Style: Relaxed, humorous, energetic
+- Both hosts are enthusiastic and engaged
+- Use casual language and occasional humor
+- Make it entertaining while staying informative
+- Quick pacing with energy
+DIALOGUE FORMAT:
+HOST1: [Casual commentary]
+HOST2: [Enthusiastic response]
+Generate the complete casual podcast script now:"""
+    }
+    def __init__(
+        self,
+        llamaindex_service: LlamaIndexService,
+        llm_service: LLMService,
+        elevenlabs_api_key: Optional[str] = None
+    ):
+        """
+        Initialize podcast generator service
+        Args:
+            llamaindex_service: Service for document analysis
+            llm_service: Service for script generation
+            elevenlabs_api_key: ElevenLabs API key (uses config if not provided)
+        """
+        self.config = config.config
+        self.llamaindex_service = llamaindex_service
+        self.llm_service = llm_service
+        # Initialize ElevenLabs client
+        self.elevenlabs_client = None
+        if ELEVENLABS_AVAILABLE:
+            api_key = elevenlabs_api_key or self.config.ELEVENLABS_API_KEY
+            if api_key:
+                try:
+                    self.elevenlabs_client = ElevenLabs(api_key=api_key)
+                    logger.info("ElevenLabs client initialized for podcast generation")
+                except Exception as e:
+                    logger.error(f"Failed to initialize ElevenLabs client: {e}")
+        # Create podcast storage directory
+        self.podcast_dir = Path("./data/podcasts")
+        self.podcast_dir.mkdir(parents=True, exist_ok=True)
+        # Metadata database file
+        self.metadata_file = self.podcast_dir / "metadata_db.json"
+        self._ensure_metadata_db()
+    def _ensure_metadata_db(self):
+        """Ensure metadata database exists"""
+        if not self.metadata_file.exists():
+            self.metadata_file.write_text(json.dumps([], indent=2))
+    async def generate_podcast(
+        self,
+        document_ids: List[str],
+        style: str = "conversational",
+        duration_minutes: int = 10,
+        host1_voice: str = "Rachel",
+        host2_voice: str = "Adam"
+    ) -> PodcastResult:
+        """
+        Generate a complete podcast from documents
+        Args:
+            document_ids: List of document IDs to analyze
+            style: Podcast style (conversational, educational, technical, casual)
+            duration_minutes: Target duration in minutes
+            host1_voice: Voice name for first host
+            host2_voice: Voice name for second host
+        Returns:
+            PodcastResult with audio file path and metadata
+        """
+        start_time = datetime.now()
+        podcast_id = str(uuid.uuid4())
+        try:
+            logger.info(f"Starting podcast generation {podcast_id}")
+            logger.info(f"Documents: {document_ids}, Style: {style}, Duration: {duration_minutes}min")
+            # Step 1: Analyze documents
+            logger.info("Step 1: Analyzing documents...")
+            analysis = await self.analyze_documents(document_ids)
+            # Step 2: Generate script
+            logger.info("Step 2: Generating podcast script...")
+            script = await self.generate_script(analysis, style, duration_minutes)
+            # Step 3: Synthesize audio
+            logger.info("Step 3: Synthesizing audio with voices...")
+            audio_file_path = await self.synthesize_audio(
+                podcast_id,
+                script,
+                host1_voice,
+                host2_voice
+            )
+            # Calculate generation time
+            generation_time = (datetime.now() - start_time).total_seconds()
+            # Step 4: Create metadata
+            logger.info("Step 4: Creating metadata...")
+            metadata = self._create_metadata(
+                podcast_id,
+                analysis,
+                script,
+                audio_file_path,
+                {host1_voice, host2_voice},
+                document_ids,
+                style
+            )
+            # Save metadata
+            self._save_metadata(metadata)
+            # Save transcript
+            transcript_path = self.podcast_dir / f"{podcast_id}_transcript.txt"
+            transcript_path.write_text(script.to_text(), encoding="utf-8")
+            logger.info(f"Podcast generated successfully: {podcast_id}")
+            return PodcastResult(
+                podcast_id=podcast_id,
+                audio_file_path=str(audio_file_path),
+                transcript=script.to_text(),
+                metadata=metadata,
+                generation_time=generation_time,
+                success=True
+            )
+        except Exception as e:
+            logger.error(f"Podcast generation failed: {str(e)}", exc_info=True)
+            return PodcastResult(
+                podcast_id=podcast_id,
+                audio_file_path="",
+                transcript="",
+                metadata=None,
+                generation_time=(datetime.now() - start_time).total_seconds(),
+                success=False,
+                error=str(e)
+            )
+    async def analyze_documents(self, document_ids: List[str]) -> DocumentAnalysis:
+        """
+        Analyze documents to extract key insights for podcast
+        Args:
+            document_ids: List of document IDs
+        Returns:
+            DocumentAnalysis with key insights and topics
+        """
+        # Create analysis query for the agentic RAG
+        analysis_query = f"""Analyze the following documents and provide:
+1. The 5-7 most important insights or key points
+2. Main themes and topics covered
+3. The overall complexity level (beginner/intermediate/advanced)
+4. A brief summary suitable for podcast discussion
+Document IDs: {', '.join(document_ids)}
+Provide a structured analysis optimized for creating an engaging podcast discussion."""
+        # Use LlamaIndex agentic RAG for analysis
+        result = await self.llamaindex_service.query(analysis_query)
+        # Parse the result to extract structured information
+        # This is a simplified parser - in production, you might want more robust parsing
+        insights = self._extract_insights(result)
+        topics = self._extract_topics(result)
+        complexity = self._determine_complexity(result)
+        return DocumentAnalysis(
+            key_insights=insights[:7],  # Limit to 7
+            topics=topics,
+            complexity_level=complexity,
+            estimated_words=len(result.split()),
+            source_documents=document_ids,
+            summary=result
+        )
+    def _extract_insights(self, text: str) -> List[str]:
+        """Extract key insights from analysis text"""
+        insights = []
+        #Simple extraction based on numbered lists or bullet points
+        lines = text.split('\n')
+        for line in lines:
+            line = line.strip()
+            # Match patterns like "1.", "2.", "-", "*", "•"
+            if re.match(r'^\d+\.|\-|\*|•', line):
+                insight = re.sub(r'^\d+\.|\-|\*|•', '', line).strip()
+                if len(insight) > 20:  # Ensure it's substantial
+                    insights.append(insight)
+        # If no insights found, create from first few sentences
+        if not insights:
+            sentences = text.split('.')
+            insights = [s.strip() + '.' for s in sentences[:7] if len(s.strip()) > 20]
+        return insights
+    def _extract_topics(self, text: str) -> List[str]:
+        """Extract main topics from analysis"""
+        # Simple keyword extraction - could be enhanced with NLP
+        common_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'}
+        words = text.lower().split()
+        word_freq = {}
+        for word in words:
+            word = re.sub(r'[^\w\s]', '', word)
+            if len(word) > 4 and word not in common_words:
+                word_freq[word] = word_freq.get(word, 0) + 1
+        # Get top topics
+        topics = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:5]
+        return [topic[0].title() for topic in topics]
+    def _determine_complexity(self, text: str) -> str:
+        """Determine content complexity level"""
+        text_lower = text.lower()
+        # Simple heuristic based on keywords
+        if any(word in text_lower for word in ['basic', 'introduction', 'beginner', 'simple']):
+            return "beginner"
+        elif any(word in text_lower for word in ['advanced', 'complex', 'sophisticated', 'expert']):
+            return "advanced"
+        else:
+            return "intermediate"
+    async def generate_script(
+        self,
+        analysis: DocumentAnalysis,
+        style: str,
+        duration_minutes: int
+    ) -> PodcastScript:
+        """
+        Generate podcast script from analysis
+        Args:
+            analysis: Document analysis results
+            style: Podcast style
+            duration_minutes: Target duration
+        Returns:
+            Complete podcast script
+        """
+        # Calculate target word count
+        target_words = duration_minutes * self.WORDS_PER_MINUTE
+        # Prepare analysis context
+        analysis_context = f"""
+KEY INSIGHTS:
+{chr(10).join(f"{i+1}. {insight}" for i, insight in enumerate(analysis.key_insights))}
+TOPICS: {', '.join(analysis.topics)}
+COMPLEXITY: {analysis.complexity_level}
+SUMMARY:
+{analysis.summary[:500]}...
+"""
+        # Get prompt template for style
+        prompt_template = self.SCRIPT_PROMPTS.get(style, self.SCRIPT_PROMPTS["conversational"])
+        # Fill in the template
+        prompt = prompt_template.format(
+            analysis=analysis_context,
+            duration_minutes=duration_minutes,
+            word_count=target_words
+        )
+        # Generate script using LLM
+        script_text = await self.llm_service.generate_text(
+            prompt,
+            max_tokens=target_words * 2,  # Give room for generation
+            temperature=0.8  # More creative
+        )
+        # Parse script into dialogue lines
+        dialogue = self._parse_script(script_text)
+        # Calculate actual word count and duration
+        word_count = sum(len(line.text.split()) for line in dialogue)
+        duration_estimate = word_count / self.WORDS_PER_MINUTE
+        return PodcastScript(
+            dialogue=dialogue,
+            total_duration_estimate=duration_estimate * 60,  # Convert to seconds
+            word_count=word_count,
+            style=style
+        )
+    def _parse_script(self, script_text: str) -> List[DialogueLine]:
+        """Parse generated script into dialogue lines"""
+        dialogue = []
+        lines = script_text.split('\n')
+        for line in lines:
+            line = line.strip()
+            if not line:
+                continue
+            # Match "HOST1:" or "HOST2:" format
+            if line.startswith('HOST1:'):
+                text = line[6:].strip()
+                if text:
+                    dialogue.append(DialogueLine(speaker="HOST1", text=text))
+            elif line.startswith('HOST2:'):
+                text = line[6:].strip()
+                if text:
+                    dialogue.append(DialogueLine(speaker="HOST2", text=text))
+        return dialogue
+    def _get_voice_id(self, voice_name: str) -> str:
+        """
+        Get voice ID from voice name.
+        Falls back to first available voice if not found.
+        Args:
+            voice_name: Voice name (e.g., "Rachel", "Adam")
+        Returns:
+            Voice ID string
+        """
+        try:
+            # Try to get voices and find by name
+            voices = self.elevenlabs_client.voices.get_all()
+            if not voices or not voices.voices:
+                logger.error("No voices available from ElevenLabs")
+                raise RuntimeError("No voices available")
+            # First, try exact name match
+            for voice in voices.voices:
+                if voice.name.lower() == voice_name.lower():
+                    logger.info(f"Found exact voice match for '{voice_name}': {voice.voice_id}")
+                    return voice.voice_id
+            # Try partial match
+            for voice in voices.voices:
+                if voice_name.lower() in voice.name.lower():
+                    logger.info(f"Found partial voice match for '{voice_name}': {voice.name} ({voice.voice_id})")
+                    return voice.voice_id
+            # Use first available voice as fallback
+            first_voice = voices.voices[0]
+            logger.warning(f"Voice '{voice_name}' not found, using first available voice: {first_voice.name} ({first_voice.voice_id})")
+            return first_voice.voice_id
+        except Exception as e:
+            logger.error(f"Could not fetch voices: {e}", exc_info=True)
+            raise RuntimeError(f"Failed to get voice ID: {str(e)}")
+    async def synthesize_audio(
+        self,
+        podcast_id: str,
+        script: PodcastScript,
+        host1_voice: str,
+        host2_voice: str
+    ) -> Path:
+        """
+        Synthesize audio from script using ElevenLabs
+        Args:
+            podcast_id: Unique podcast ID
+            script: Podcast script
+            host1_voice: Voice for HOST1
+            host2_voice: Voice for HOST2
+        Returns:
+            Path to generated MP3 file
+        """
+        if not self.elevenlabs_client:
+            raise RuntimeError("ElevenLabs client not initialized")
+        audio_file = self.podcast_dir / f"{podcast_id}.mp3"
+        # For now, create a simple text-to-speech for the full script
+        # In production, you'd combine segments with pauses
+        full_text = script.to_text()
+        # Get actual voice ID
+        voice_id = self._get_voice_id(host1_voice)
+        try:
+            # Use modern ElevenLabs TTS API
+            # Note: This is a simplified version using single voice
+            # Full implementation would process each line separately with different voices
+            logger.info(f"Generating audio with voice: {host1_voice}")
+            # Use the modern text_to_speech API
+            audio_generator = self.elevenlabs_client.text_to_speech.convert(
+                voice_id=voice_id,  # Using resolved voice ID
+                text=full_text,
+                model_id="eleven_multilingual_v2"
+            )
+            # Write audio chunks to file
+            with open(audio_file, 'wb') as f:
+                for chunk in audio_generator:
+                    if chunk:
+                        f.write(chunk)
+            # Verify file was created with content
+            if audio_file.exists() and audio_file.stat().st_size > 1000:
+                logger.info(f"Audio synthesized successfully: {audio_file} ({audio_file.stat().st_size} bytes)")
+                return audio_file
+            else:
+                raise RuntimeError(f"Generated audio file is too small or empty: {audio_file.stat().st_size} bytes")
+        except Exception as e:
+            logger.error(f"Audio synthesis failed: {e}", exc_info=True)
+            raise RuntimeError(f"Failed to generate podcast audio: {str(e)}")
+    def _create_metadata(
+        self,
+        podcast_id: str,
+        analysis: DocumentAnalysis,
+        script: PodcastScript,
+        audio_path: Path,
+        voices: set,
+        document_ids: List[str],
+        style: str
+    ) -> PodcastMetadata:
+        """Create podcast metadata"""
+        # Auto-generate title
+        title = f"Podcast: {analysis.topics[0] if analysis.topics else 'Document Discussion'}"
+        # Create description
+        description = f"A {style} podcast discussing insights from {len(document_ids)} document(s)."
+        # Calculate file size
+        file_size_mb = audio_path.stat().st_size / (1024 * 1024) if audio_path.exists() else 0
+        # Estimate costs
+        llm_cost = (script.word_count / 1000) * 0.01  # Rough estimate
+        tts_cost = (script.word_count * 5 / 1000) * 0.30  # Rough estimate
+        return PodcastMetadata(
+            podcast_id=podcast_id,
+            title=title,
+            description=description,
+            source_documents=document_ids,
+            style=style,
+            duration_seconds=script.total_duration_estimate,
+            file_size_mb=file_size_mb,
+            voices={"host1": list(voices)[0] if len(voices) > 0 else "Rachel",
+                   "host2": list(voices)[1] if len(voices) > 1 else "Adam"},
+            generated_at=datetime.now().isoformat(),
+            generation_cost={"llm_cost": llm_cost, "tts_cost": tts_cost, "total": llm_cost + tts_cost},
+            key_topics=analysis.topics
+        )
+    def _save_metadata(self, metadata: PodcastMetadata):
+        """Save metadata to database"""
+        try:
+            # Load existing metadata
+            existing = json.loads(self.metadata_file.read_text())
+            # Add new metadata
+            existing.append(asdict(metadata))
+            # Save back
+            self.metadata_file.write_text(json.dumps(existing, indent=2))
+            logger.info(f"Metadata saved for podcast: {metadata.podcast_id}")
+        except Exception as e:
+            logger.error(f"Failed to save metadata: {e}")
+    def list_podcasts(self, limit: int = 10) -> List[PodcastMetadata]:
+        """List generated podcasts"""
+        try:
+            data = json.loads(self.metadata_file.read_text())
+            podcasts = [PodcastMetadata(**item) for item in data[-limit:]]
+            return list(reversed(podcasts))  # Most recent first
+        except Exception as e:
+            logger.error(f"Failed to list podcasts: {e}")
+            return []
+    def get_podcast(self, podcast_id: str) -> Optional[PodcastMetadata]:
+        """Get specific podcast metadata"""
+        try:
+            data = json.loads(self.metadata_file.read_text())
+            for item in data:
+                if item.get('podcast_id') == podcast_id:
+                    return PodcastMetadata(**item)
+            return None
+        except Exception as e:
+            logger.error(f"Failed to get podcast: {e}")
+            return None

services/vector_store_service.py ADDED Viewed

	@@ -0,0 +1,294 @@

+import logging
+import os
+import pickle
+import numpy as np
+from typing import List, Dict, Any, Optional, Tuple
+import faiss
+from pathlib import Path
+import asyncio
+import json
+from core.models import SearchResult, Chunk
+import config
+logger = logging.getLogger(__name__)
+class VectorStoreService:
+    def __init__(self):
+        self.config = config.config
+        self.index = None
+        self.chunks_metadata = {}  # Maps index position to chunk metadata
+        self.dimension = None
+        # Paths
+        self.store_path = Path(self.config.VECTOR_STORE_PATH)
+        self.store_path.mkdir(parents=True, exist_ok=True)
+        self.index_path = self.store_path / f"{self.config.INDEX_NAME}.index"
+        self.metadata_path = self.store_path / f"{self.config.INDEX_NAME}_metadata.json"
+        # Load existing index if available
+        self._load_index()
+    def _load_index(self):
+        """Load existing FAISS index and metadata"""
+        try:
+            if self.index_path.exists() and self.metadata_path.exists():
+                logger.info("Loading existing FAISS index...")
+                # Load FAISS index
+                self.index = faiss.read_index(str(self.index_path))
+                self.dimension = self.index.d
+                # Load metadata
+                with open(self.metadata_path, 'r') as f:
+                    self.chunks_metadata = json.load(f)
+                logger.info(f"Loaded index with {self.index.ntotal} vectors, dimension {self.dimension}")
+            else:
+                logger.info("No existing index found, will create new one")
+        except Exception as e:
+            logger.error(f"Error loading index: {str(e)}")
+    def _initialize_index(self, dimension: int):
+        """Initialize a new FAISS index"""
+        try:
+            # Use IndexFlatIP for cosine similarity (since embeddings are normalized)
+            self.index = faiss.IndexFlatIP(dimension)
+            self.dimension = dimension
+            self.chunks_metadata = {}
+            logger.info(f"Initialized new FAISS index with dimension {dimension}")
+        except Exception as e:
+            logger.error(f"Error initializing index: {str(e)}")
+            raise
+    async def add_chunks(self, chunks: List[Chunk]) -> bool:
+        """Add chunks to the vector store"""
+        if not chunks:
+            return True
+        try:
+            # Extract embeddings and metadata
+            embeddings = []
+            new_metadata = {}
+            for chunk in chunks:
+                if chunk.embedding and len(chunk.embedding) > 0:
+                    embeddings.append(chunk.embedding)
+                    # Store metadata using the current index position
+                    current_index = len(self.chunks_metadata) + len(embeddings) - 1
+                    new_metadata[str(current_index)] = {
+                        "chunk_id": chunk.id,
+                        "document_id": chunk.document_id,
+                        "content": chunk.content,
+                        "chunk_index": chunk.chunk_index,
+                        "start_pos": chunk.start_pos,
+                        "end_pos": chunk.end_pos,
+                        "metadata": chunk.metadata
+                    }
+            if not embeddings:
+                logger.warning("No valid embeddings found in chunks")
+                return False
+            # Check for dimension mismatch
+            if self.index is not None and self.dimension is not None:
+                if len(embeddings[0]) != self.dimension:
+                    logger.warning(f"Dimension mismatch! New embeddings have {len(embeddings[0])}, but index has {self.dimension}. Rebuilding index.")
+                    # Reset index
+                    self.index = None
+                    self.chunks_metadata = {}
+                    self.dimension = None
+            # Initialize index if needed
+            if self.index is None:
+                self._initialize_index(len(embeddings[0]))
+            # Convert to numpy array
+            embeddings_array = np.array(embeddings, dtype=np.float32)
+            # Add to FAISS index
+            self.index.add(embeddings_array)
+            # Update metadata
+            self.chunks_metadata.update(new_metadata)
+            # Save index and metadata
+            await self._save_index()
+            logger.info(f"Added {len(embeddings)} chunks to vector store")
+            return True
+        except Exception as e:
+            logger.error(f"Error adding chunks to vector store: {str(e)}")
+            return False
+    async def search(self, query_embedding: List[float], top_k: int = 5,
+                    filters: Optional[Dict[str, Any]] = None) -> List[SearchResult]:
+        """Search for similar chunks"""
+        if self.index is None or self.index.ntotal == 0:
+            logger.warning("No index available or index is empty")
+            return []
+        try:
+            # Convert query embedding to numpy array
+            query_array = np.array([query_embedding], dtype=np.float32)
+            # Perform search
+            scores, indices = self.index.search(query_array, min(top_k, self.index.ntotal))
+            # Convert results to SearchResult objects
+            results = []
+            for score, idx in zip(scores[0], indices[0]):
+                if idx == -1:  # FAISS returns -1 for empty slots
+                    continue
+                chunk_metadata = self.chunks_metadata.get(str(idx))
+                if chunk_metadata:
+                    # Apply filters if specified
+                    if filters and not self._apply_filters(chunk_metadata, filters):
+                        continue
+                    result = SearchResult(
+                        chunk_id=chunk_metadata["chunk_id"],
+                        document_id=chunk_metadata["document_id"],
+                        content=chunk_metadata["content"],
+                        score=float(score),
+                        metadata=chunk_metadata.get("metadata", {})
+                    )
+                    results.append(result)
+            # Sort by score (descending)
+            results.sort(key=lambda x: x.score, reverse=True)
+            logger.info(f"Found {len(results)} search results")
+            return results
+        except Exception as e:
+            logger.error(f"Error searching vector store: {str(e)}")
+            return []
+    def _apply_filters(self, chunk_metadata: Dict[str, Any], filters: Dict[str, Any]) -> bool:
+        """Apply filters to chunk metadata"""
+        try:
+            for key, value in filters.items():
+                if key == "document_id":
+                    if chunk_metadata.get("document_id") != value:
+                        return False
+                elif key == "document_ids":
+                    if chunk_metadata.get("document_id") not in value:
+                        return False
+                elif key == "content_length_min":
+                    if len(chunk_metadata.get("content", "")) < value:
+                        return False
+                elif key == "content_length_max":
+                    if len(chunk_metadata.get("content", "")) > value:
+                        return False
+                # Add more filter types as needed
+            return True
+        except Exception as e:
+            logger.error(f"Error applying filters: {str(e)}")
+            return True
+    async def _save_index(self):
+        """Save the FAISS index and metadata to disk"""
+        try:
+            if self.index is not None:
+                # Save FAISS index
+                faiss.write_index(self.index, str(self.index_path))
+                # Save metadata
+                with open(self.metadata_path, 'w') as f:
+                    json.dump(self.chunks_metadata, f, indent=2)
+                logger.debug("Saved index and metadata to disk")
+        except Exception as e:
+            logger.error(f"Error saving index: {str(e)}")
+    async def get_stats(self) -> Dict[str, Any]:
+        """Get statistics about the vector store"""
+        try:
+            return {
+                "total_vectors": self.index.ntotal if self.index else 0,
+                "dimension": self.dimension,
+                "index_type": type(self.index).__name__ if self.index else None,
+                "metadata_entries": len(self.chunks_metadata),
+                "index_file_exists": self.index_path.exists(),
+                "metadata_file_exists": self.metadata_path.exists()
+            }
+        except Exception as e:
+            logger.error(f"Error getting stats: {str(e)}")
+            return {"error": str(e)}
+    async def delete_document(self, document_id: str) -> bool:
+        """Delete all chunks for a specific document"""
+        try:
+            # Find indices to remove
+            indices_to_remove = []
+            for idx, metadata in self.chunks_metadata.items():
+                if metadata.get("document_id") == document_id:
+                    indices_to_remove.append(int(idx))
+            if not indices_to_remove:
+                logger.warning(f"No chunks found for document {document_id}")
+                return False
+            # FAISS doesn't support removing individual vectors efficiently
+            # We need to rebuild the index without the removed vectors
+            if self.index and self.index.ntotal > 0:
+                # Get all embeddings except the ones to remove
+                all_embeddings = []
+                new_metadata = {}
+                new_index = 0
+                for old_idx in range(self.index.ntotal):
+                    if old_idx not in indices_to_remove:
+                        # Get the embedding from FAISS
+                        embedding = self.index.reconstruct(old_idx)
+                        all_embeddings.append(embedding)
+                        # Update metadata with new index
+                        old_metadata = self.chunks_metadata.get(str(old_idx))
+                        if old_metadata:
+                            new_metadata[str(new_index)] = old_metadata
+                            new_index += 1
+                # Rebuild index
+                if all_embeddings:
+                    self._initialize_index(self.dimension)
+                    embeddings_array = np.array(all_embeddings, dtype=np.float32)
+                    self.index.add(embeddings_array)
+                    self.chunks_metadata = new_metadata
+                else:
+                    # No embeddings left, create empty index
+                    self._initialize_index(self.dimension)
+                # Save updated index
+                await self._save_index()
+            logger.info(f"Deleted {len(indices_to_remove)} chunks for document {document_id}")
+            return True
+        except Exception as e:
+            logger.error(f"Error deleting document chunks: {str(e)}")
+            return False
+    async def clear_all(self) -> bool:
+        """Clear all data from the vector store"""
+        try:
+            self.index = None
+            self.chunks_metadata = {}
+            self.dimension = None
+            # Remove files
+            if self.index_path.exists():
+                self.index_path.unlink()
+            if self.metadata_path.exists():
+                self.metadata_path.unlink()
+            logger.info("Cleared all data from vector store")
+            return True
+        except Exception as e:
+            logger.error(f"Error clearing vector store: {str(e)}")
+            return False