Spaces:

Ansemin101
/

Markit_v2

Running on Zero

File size: 74,066 Bytes

6ea41ec

import gradio as gr
import markdown
import threading
import time
import logging
from pathlib import Path
from src.core.converter import convert_file, set_cancellation_flag, is_conversion_in_progress
from src.parsers.parser_registry import ParserRegistry
from src.core.config import config
from src.core.exceptions import (
    DocumentProcessingError,
    UnsupportedFileTypeError,
    FileSizeLimitError,
    ConfigurationError
)
from src.core.logging_config import get_logger
from src.rag import rag_chat_service, document_ingestion_service
from src.rag.vector_store import vector_store_manager
from src.services.data_clearing_service import data_clearing_service

# Module-level logger obtained from the project's centralized logging config
logger = get_logger(__name__)

# Probe for the optional MarkItDown package at import time; the outcome is
# recorded in HAS_MARKITDOWN and logged either way
try:
    from markitdown import MarkItDown
    HAS_MARKITDOWN = True
    logger.info("MarkItDown is available for use")
except ImportError:
    HAS_MARKITDOWN = False
    logger.warning("MarkItDown is not available")

# Event shared by the conversion handlers in this module: it is checked while
# a conversion runs and cleared when a new single-file conversion starts
conversion_cancelled = threading.Event()

# Hand the same Event object to the converter module through its setter
set_cancellation_flag(conversion_cancelled)

# Add a background thread to monitor cancellation
def monitor_cancellation():
    """Watch for a cancellation request while a conversion is in progress.

    Meant to be run on a background thread. It polls every 100 ms for as long
    as ``is_conversion_in_progress()`` reports an active conversion and logs
    the first time ``conversion_cancelled`` is observed set. The function only
    logs; it neither updates the UI nor stops the conversion itself.
    """
    logger.info("Starting cancellation monitor thread")
    cancellation_logged = False
    while is_conversion_in_progress():
        # Report the cancellation once; the flag stays set until the next run,
        # so logging on every poll would flood the log with identical lines
        if not cancellation_logged and conversion_cancelled.is_set():
            logger.info("Cancellation detected by monitor thread")
            cancellation_logged = True
        time.sleep(0.1)  # Check every 100ms
    logger.info("Cancellation monitor thread ending")

def update_ui_for_file_count(files):
    """Update UI components based on the number of files uploaded.

    Args:
        files: The uploaded files. Each entry is either an object exposing
            ``name`` (and optionally ``size``) or a value that is turned into
            a path with ``str()``. May be ``None`` or empty.

    Returns:
        A 2-tuple ``(selector_update, status_html)``: a ``gr.update`` that
        shows the processing-type selector only when several files are
        uploaded, and an HTML snippet describing the current upload.
    """
    import html

    if not files:
        return (
            gr.update(visible=False),  # processing_type_selector
            "<div style='color: #666; font-style: italic;'>Upload documents to begin</div>"  # file_status_text
        )

    if len(files) == 1:
        file_name = files[0].name if hasattr(files[0], 'name') else str(files[0])
        # The name comes from the user's upload, so escape it before embedding it in HTML
        safe_name = html.escape(str(file_name))
        return (
            gr.update(visible=False),  # processing_type_selector (hidden for single file)
            f"<div style='color: #2563eb; font-weight: 500;'>📄 Single document: {safe_name}</div>"
        )

    # Calculate total size for validation display
    total_size = 0
    try:
        for file in files:
            if hasattr(file, 'size'):
                total_size += file.size
            elif hasattr(file, 'name'):
                # For file paths, get size from filesystem
                total_size += Path(file.name).stat().st_size
    except Exception as e:
        # Size calculation is optional for display, so stay best-effort, but
        # do not swallow KeyboardInterrupt/SystemExit as a bare except would
        logger.debug(f"Could not determine total upload size: {e}")

    size_display = f" ({total_size / (1024*1024):.1f}MB)" if total_size > 0 else ""

    # Check if within limits
    if len(files) > 5:
        status_color = "#dc2626"  # red
        status_text = f"⚠️ Too many files: {len(files)}/5 (max 5 files allowed)"
    elif total_size > 20 * 1024 * 1024:  # 20MB
        status_color = "#dc2626"  # red
        status_text = f"⚠️ Files too large{size_display} (max 20MB combined)"
    else:
        status_color = "#059669"  # green
        status_text = f"📂 Batch mode: {len(files)} files{size_display}"

    return (
        gr.update(visible=True),  # processing_type_selector (visible for multiple files)
        f"<div style='color: {status_color}; font-weight: 500;'>{status_text}</div>"
    )

def validate_file_for_parser(file_path, parser_name):
    """Check a file against the size, extension and parser-specific rules.

    Args:
        file_path: Path of the file to check; a falsy value means that no
            file has been selected yet.
        parser_name: Display name of the selected parser.

    Returns:
        ``(is_valid, error_message)``. The message is empty when the file is
        accepted. Any exception raised while checking is logged and reported
        as a validation failure.
    """
    # Nothing selected yet: nothing to complain about
    if not file_path:
        return True, ""

    bytes_per_mb = 1024 * 1024
    try:
        candidate = Path(file_path)
        extension = candidate.suffix.lower()

        # The size limit can only be enforced for files that exist on disk
        if candidate.exists():
            actual_size = candidate.stat().st_size
            if actual_size > config.app.max_file_size:
                size_mb = actual_size / bytes_per_mb
                max_mb = config.app.max_file_size / bytes_per_mb
                return False, f"File size ({size_mb:.1f}MB) exceeds maximum allowed size ({max_mb:.1f}MB)"

        # Globally allowed extensions
        if extension not in config.app.allowed_extensions:
            return False, f"File type '{extension}' is not supported. Allowed types: {', '.join(config.app.allowed_extensions)}"

        # GOT-OCR is restricted to image inputs
        if "GOT-OCR" in parser_name and extension not in ('.jpg', '.jpeg', '.png'):
            return False, "GOT-OCR only supports JPG and PNG formats."

        return True, ""

    except Exception as e:
        logger.error(f"Error validating file: {e}")
        return False, f"Error validating file: {e}"

def format_markdown_content(content):
    """Render Markdown text to HTML with table support.

    Falsy input (``None``, empty string, ...) is handed back unchanged;
    anything else is stringified and passed to ``markdown.markdown`` with the
    ``tables`` extension enabled.
    """
    if content:
        return markdown.markdown(str(content), extensions=['tables'])
    return content

def render_latex_to_html(latex_content):
    """Convert LaTeX content to HTML using Mathpix Markdown like GOT-OCR demo.

    The text is cleaned up the way the GOT-OCR demo does it: a trailing
    ``<|im_end|>`` marker is dropped, left/right delimiter commands are
    flattened when their counts do not match, double quotes become two
    backticks and ``$`` signs are removed. The result is embedded as a
    JavaScript string in a self-contained HTML page that loads the
    mathpix-markdown-it bundle from a CDN and renders the text in the browser.

    Args:
        latex_content: LaTeX/Markdown text to render (must be a ``str``).

    Returns:
        The complete HTML document as a string.
    """
    import json

    # Clean up the content similar to GOT-OCR demo
    content = latex_content.strip()
    if content.endswith("<|im_end|>"):
        content = content[:-len("<|im_end|>")]

    # Fix unbalanced delimiters exactly like GOT-OCR demo
    if content.count("\\right") != content.count("\\left"):
        content = (
            content.replace("\\left(", "(")
            .replace("\\right)", ")")
            .replace("\\left[", "[")
            .replace("\\right]", "]")
            .replace("\\left{", "{")
            .replace("\\right}", "}")
            .replace("\\left|", "|")
            .replace("\\right|", "|")
            .replace("\\left.", ".")
            .replace("\\right.", ".")
        )

    # Process content like GOT-OCR demo: remove $ signs and replace quotes
    content = content.replace('"', "``").replace("$", "")

    # Every source line ends with a newline in the rendered text, which is the
    # same as appending one newline to the whole content. json.dumps produces a
    # valid JS string literal (backslashes, carriage returns and other control
    # characters are escaped); "</" is additionally written as "<\/" so that
    # text such as "</script>" cannot terminate the surrounding script element.
    js_text = json.dumps(content + "\n").replace("</", "<\\/")

    # Create HTML using Mathpix Markdown like GOT-OCR demo
    html_content = f"""<!DOCTYPE html>
<html lang="en" data-lt-installed="true">
<head>
    <meta charset="UTF-8">
    <title>LaTeX Content</title>
    <script>
        const text = {js_text};
    </script>
    <style>
        #content {{
            max-width: 800px;
            margin: auto;
            padding: 20px;
        }}
        body {{
            font-family: 'Times New Roman', serif;
            line-height: 1.6;
            background-color: #ffffff;
            color: #333;
        }}
        table {{
            border-collapse: collapse;
            width: 100%;
            margin: 20px 0;
        }}
        td, th {{
            border: 1px solid #333;
            padding: 8px 12px;
            text-align: center;
            vertical-align: middle;
        }}
    </style>
    <script>
        let script = document.createElement('script');
        script.src = "https://cdn.jsdelivr.net/npm/mathpix-markdown-it@1.3.6/es5/bundle.js";
        document.head.append(script);
        script.onload = function() {{
            const isLoaded = window.loadMathJax();
            if (isLoaded) {{
                console.log('Styles loaded!')
            }}
            const el = window.document.getElementById('content-text');
            if (el) {{
                const options = {{
                    htmlTags: true
                }};
                const html = window.render(text, options);
                el.outerHTML = html;
            }}
        }};
    </script>
</head>
<body>
    <div id="content">
        <div id="content-text"></div>
    </div>
</body>
</html>"""

    return html_content

def format_latex_content(content):
    """Format LaTeX content for display in UI using MathJax rendering like GOT-OCR demo.

    The rendered page from ``render_latex_to_html`` is embedded in an iframe
    through a base64 ``data:`` URL, followed by a collapsible view of the raw
    source. If building that view raises, a plain escaped ``<pre>`` listing
    with the error message is returned instead.

    Args:
        content: LaTeX text to display. Falsy values are returned unchanged.

    Returns:
        An HTML snippet as a string (or the falsy input itself).
    """
    import base64
    import html

    if not content:
        return content

    # The raw source is shown inside <pre>, so it must be HTML-escaped on both
    # paths; otherwise "<", ">" and "&" in the LaTeX would be parsed as markup
    escaped_content = html.escape(str(content))

    try:
        # Generate rendered HTML
        rendered_html = render_latex_to_html(content)

        # Encode for iframe display (similar to GOT-OCR demo)
        encoded_html = base64.b64encode(rendered_html.encode("utf-8")).decode("utf-8")
        iframe_src = f"data:text/html;base64,{encoded_html}"

        # Create the display with both rendered and raw views
        formatted_content = f"""
        <div style="background-color: #f8f9fa; border-radius: 8px; border: 1px solid #e9ecef; margin: 10px 0;">
            <div style="background-color: #e9ecef; padding: 10px; border-radius: 8px 8px 0 0; font-weight: bold; color: #495057;">
                📄 LaTeX Content (Rendered with MathJax)
            </div>
            <div style="padding: 0;">
                <iframe src="{iframe_src}" width="100%" height="500px" style="border: none; border-radius: 0 0 8px 8px;"></iframe>
            </div>
            <div style="background-color: #e9ecef; padding: 8px 15px; border-radius: 0; font-size: 12px; color: #6c757d; border-top: 1px solid #dee2e6;">
                💡 LaTeX content rendered with MathJax. Tables and formulas are displayed as they would appear in a LaTeX document.
            </div>
            <details style="margin: 0; border-top: 1px solid #dee2e6;">
                <summary style="padding: 8px 15px; background-color: #e9ecef; cursor: pointer; font-size: 12px; color: #6c757d;">
                    📝 View Raw LaTeX Source
                </summary>
                <div style="padding: 15px; background-color: #f8f9fa;">
                    <pre style="background-color: transparent; margin: 0; padding: 0;
                                font-family: 'Courier New', monospace; font-size: 12px; line-height: 1.4; 
                                white-space: pre-wrap; word-wrap: break-word; color: #2c3e50; max-height: 200px; overflow-y: auto;">
{escaped_content}
                    </pre>
                </div>
            </details>
        </div>
        """

    except Exception as e:
        # Fallback to simple formatting if rendering fails
        escaped_error = html.escape(str(e))
        formatted_content = f"""
        <div style="background-color: #f8f9fa; border-radius: 8px; border: 1px solid #e9ecef; margin: 10px 0;">
            <div style="background-color: #e9ecef; padding: 10px; border-radius: 8px 8px 0 0; font-weight: bold; color: #495057;">
                📄 LaTeX Content (Fallback View)
            </div>
            <div style="padding: 15px;">
                <pre style="background-color: transparent; margin: 0; padding: 0;
                            font-family: 'Courier New', monospace; font-size: 14px; line-height: 1.4; 
                            white-space: pre-wrap; word-wrap: break-word; color: #2c3e50;">
{escaped_content}
                </pre>
            </div>
            <div style="background-color: #e9ecef; padding: 8px 15px; border-radius: 0 0 8px 8px; font-size: 12px; color: #6c757d;">
                ⚠️ Rendering failed, showing raw LaTeX. Error: {escaped_error}
            </div>
        </div>
        """

    return formatted_content

# Function to run conversion in a separate thread
def run_conversion_thread(file_path, parser_name, ocr_method_name, output_format):
    """Start a single-file conversion on a daemon thread.

    The shared cancellation flag is cleared before the worker starts. The
    worker stores its outcome in the returned dict: ``content`` and
    ``download_file`` on success, ``error`` (the exception text) on failure.

    Returns:
        ``(thread, results)`` - the started thread and the dict it fills in.
    """
    # A new run must not inherit a cancellation request from an earlier one
    conversion_cancelled.clear()

    # Shared with the worker, which fills it in when it finishes
    results = dict(content=None, download_file=None, error=None)

    def conversion_worker():
        try:
            converted_text, output_path = convert_file(
                file_path, parser_name, ocr_method_name, output_format
            )
            results["content"] = converted_text
            results["download_file"] = output_path
        except Exception as exc:
            logger.error(f"Error during conversion: {str(exc)}")
            results["error"] = str(exc)

    # Daemon thread so a stuck conversion cannot keep the process alive
    worker_thread = threading.Thread(target=conversion_worker, daemon=True)
    worker_thread.start()

    return worker_thread, results

def run_conversion_thread_multi(file_paths, parser_name, ocr_method_name, output_format, processing_type):
    """Run the conversion in a separate thread for multiple files.

    Args:
        file_paths: Paths of the files to convert.
        parser_name: Name of the parser to use.
        ocr_method_name: Name of the OCR method to use.
        output_format: Requested output format.
        processing_type: Batch processing mode, forwarded to the document service.

    Returns:
        ``(thread, results)`` - the started daemon thread and the dict it
        fills in: ``content`` and ``download_file`` on success, ``error``
        (the exception text) on failure.
    """
    # Imported lazily, as in the original code, rather than at module import time
    from src.services.document_service import DocumentService

    # Reset the cancellation flag, mirroring run_conversion_thread; without
    # this a cancel request left over from a previous run would make the
    # caller treat the new conversion as cancelled right away
    conversion_cancelled.clear()

    # Results will be shared between threads
    results = {"content": None, "download_file": None, "error": None}

    def conversion_worker():
        try:
            logger.info(f"Starting multi-file conversion thread for {len(file_paths)} files")

            # Use the new document service unified method
            document_service = DocumentService()
            document_service.set_cancellation_flag(conversion_cancelled)

            # Call the unified convert_documents method
            content, output_file = document_service.convert_documents(
                file_paths=file_paths,
                parser_name=parser_name,
                ocr_method_name=ocr_method_name,
                output_format=output_format,
                processing_type=processing_type
            )

            logger.info(f"Multi-file conversion completed successfully for {len(file_paths)} files")
            results["content"] = content
            results["download_file"] = output_file

        except Exception as e:
            logger.error(f"Error during multi-file conversion: {str(e)}")
            results["error"] = str(e)

    # Create and start the thread
    thread = threading.Thread(target=conversion_worker)
    thread.daemon = True
    thread.start()

    return thread, results

def _conversion_ui_response(output, download_file=None):
    """Bundle a handler result with the three visibility updates that end a run."""
    return output, download_file, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)


def _build_ingestion_payload(file_paths, content, parser_name):
    """Assemble the dict handed to the ingestion service for the converted file(s).

    For a single file this is the file's own name, size and raw content. For a
    batch the names are joined, the sizes summed and the raw contents
    concatenated, so the metadata and the deduplication content describe the
    whole batch rather than one arbitrary member of it.
    """
    sources = [Path(p) for p in file_paths if p]
    existing = [source for source in sources if source.exists()]

    # Read original file content for proper deduplication hashing
    raw_parts = []
    for source in existing:
        try:
            raw_parts.append(source.read_bytes().decode('utf-8', errors='ignore'))
        except Exception as e:
            logger.warning(f"Could not read original file content: {e}")

    return {
        "markdown_content": content,
        "original_filename": ", ".join(source.name for source in sources) if sources else "unknown",
        "conversion_method": parser_name,
        "file_size": sum(source.stat().st_size for source in existing),
        "conversion_time": 0,  # Could be tracked if needed
        "original_file_content": "".join(raw_parts) if raw_parts else None
    }


def handle_convert(files, parser_name, ocr_method_name, output_format, processing_type, is_cancelled):
    """Handle file conversion for single or multiple files.

    Validates the uploads, runs the conversion on a worker thread while
    polling the shared cancellation flag, formats the result for display and
    finally tries to ingest the converted text for RAG (best effort: ingestion
    problems are logged and never fail the conversion).

    Args:
        files: Uploaded file objects (with a ``name`` attribute) or path-like values.
        parser_name: Name of the selected parser.
        ocr_method_name: Name of the selected OCR method.
        output_format: Requested output format.
        processing_type: Batch processing mode, forwarded to the converter.
        is_cancelled: Truthy when cancellation was requested before the run started.

    Returns:
        A 5-tuple: the HTML or status text to display, the download file (or
        ``None``), and three ``gr.update`` visibility updates (hidden,
        visible, hidden) for the UI controls wired to this handler.
    """
    # Check if we should cancel before starting
    if is_cancelled:
        logger.info("Conversion cancelled before starting")
        return _conversion_ui_response("Conversion cancelled.")

    # Validate files input
    if not files:
        error_msg = "No files uploaded. Please upload at least one document."
        logger.error(error_msg)
        return _conversion_ui_response(f"Error: {error_msg}")

    # Convert Gradio file objects to file paths
    file_paths = [file.name if hasattr(file, 'name') else str(file) for file in files]

    # Validate file types for the selected parser
    for candidate_path in file_paths:
        is_valid, error_msg = validate_file_for_parser(candidate_path, parser_name)
        if not is_valid:
            logger.error(f"File validation error: {error_msg}")
            return _conversion_ui_response(f"Error: {error_msg}")

    logger.info(f"Starting conversion of {len(file_paths)} file(s) with cancellation flag cleared")

    # Start the conversion in a separate thread
    thread, results = run_conversion_thread_multi(file_paths, parser_name, ocr_method_name, output_format, processing_type)

    # Start the monitoring thread
    monitor_thread = threading.Thread(target=monitor_cancellation)
    monitor_thread.daemon = True
    monitor_thread.start()

    # Wait for the thread to complete or be cancelled
    while thread.is_alive():
        # Check if cancellation was requested
        if conversion_cancelled.is_set():
            logger.info("Cancellation detected, waiting for thread to finish")
            # Give the thread a chance to clean up
            thread.join(timeout=0.5)
            if thread.is_alive():
                logger.warning("Thread did not finish within timeout")
            return _conversion_ui_response("Conversion cancelled.")

        # Sleep briefly to avoid busy waiting
        time.sleep(0.1)

    # Thread has completed, check results
    if results["error"]:
        return _conversion_ui_response(f"Error: {results['error']}")

    content = results["content"]
    download_file = results["download_file"]

    # If conversion returned a cancellation message
    if content == "Conversion cancelled.":
        logger.info("Converter returned cancellation message")
        return _conversion_ui_response(content)

    # GOT-OCR output is displayed as LaTeX, everything else as Markdown
    if "GOT-OCR" in parser_name:
        formatted_content = format_latex_content(str(content))
    else:
        formatted_content = format_markdown_content(str(content))
    html_output = f"<div class='output-container'>{formatted_content}</div>"

    logger.info("Conversion completed successfully")

    # Auto-ingest the converted document for RAG
    try:
        # Built from the full list of inputs; the previous code relied on the
        # loop variable left over from validation, i.e. only the last file
        conversion_result = _build_ingestion_payload(file_paths, content, parser_name)

        success, ingestion_msg, stats = document_ingestion_service.ingest_from_conversion_result(conversion_result)
        if success:
            logger.info(f"Document auto-ingested for RAG: {ingestion_msg}")
        else:
            logger.warning(f"Document ingestion failed: {ingestion_msg}")
    except Exception as e:
        logger.error(f"Error during auto-ingestion: {e}")

    return _conversion_ui_response(html_output, download_file)

def handle_chat_message(message, history):
    """Handle a new chat message with streaming response.

    This is a generator: every yielded value is a 3-tuple
    ``(input_box_value, history, status)``. The user message and an assistant
    placeholder are appended to ``history`` and the placeholder is filled in
    as chunks arrive from ``rag_chat_service.chat_stream``.

    Args:
        message: The text entered by the user.
        history: Existing chat history as a list of ``{"role", "content"}``
            dicts, or ``None``.

    Yields:
        ``("", history, status)`` after every streamed chunk, once more when
        the response is complete, and once with an error text in the last
        message if generating the response failed.
    """
    if not message or not message.strip():
        # Because this function is a generator, `return <value>` would end it
        # without sending anything to the UI; yield the no-op update instead
        yield "", history, gr.update()
        return

    try:
        # Add user message to history
        history = history or []
        history.append({"role": "user", "content": message})

        # Add assistant message placeholder
        history.append({"role": "assistant", "content": ""})

        # Get response from RAG service
        response_text = ""
        for chunk in rag_chat_service.chat_stream(message):
            response_text += chunk
            # Update the last message in history with the current response
            history[-1]["content"] = response_text
            # Update status in real-time during streaming
            updated_status = get_chat_status()
            yield "", history, updated_status

        logger.info(f"Chat response completed for message: {message[:50]}...")

        # Final status update after message completion
        final_status = get_chat_status()
        yield "", history, final_status

    except Exception as e:
        error_msg = f"Error generating response: {str(e)}"
        logger.error(error_msg)
        if history:
            # Overwrite the latest entry (normally the assistant placeholder)
            history[-1]["content"] = f"❌ {error_msg}"
        else:
            history = [
                {"role": "user", "content": message},
                {"role": "assistant", "content": f"❌ {error_msg}"}
            ]
        # Update status even on error
        error_status = get_chat_status()
        yield "", history, error_status

def start_new_chat_session():
    """Open a fresh chat session on the RAG chat service.

    Returns:
        ``(history, status_message)``: always an empty history list, plus a
        message that names the new session id or describes the failure.
    """
    empty_history = []
    try:
        new_session = rag_chat_service.start_new_session()
        logger.info(f"Started new chat session: {new_session}")
        status = f"✅ New chat session started: {new_session}"
    except Exception as exc:
        failure = f"Error starting new session: {str(exc)}"
        logger.error(failure)
        status = f"❌ {failure}"
    return empty_history, status

def handle_clear_all_data():
    """Clear all RAG data (vector store + chat history) and report the outcome.

    Returns:
        ``(history, message_html, status)``. On success the history is an
        empty list and a new chat session has been started; when clearing
        fails or raises, the history slot is ``None`` and the message carries
        the error text. ``status`` is always the current ``get_chat_status()``.
    """
    try:
        cleared_ok, message, stats = data_clearing_service.clear_all_data()

        if not cleared_ok:
            failure_text = f"❌ {message}"
            logger.error(f"Data clearing failed: {message}")

            # The status panel is refreshed even though clearing failed
            status_after_failure = get_chat_status()
            return None, f'<div class="session-info">{failure_text}</div>', status_after_failure

        # Old sessions are gone with the data, so begin a new one
        session_id = rag_chat_service.start_new_session()
        refreshed_status = get_chat_status()

        # A check mark when something was actually removed, an info sign otherwise
        removed_anything = (
            stats.get("total_cleared_documents", 0) > 0
            or stats.get("total_cleared_files", 0) > 0
        )
        icon = "✅" if removed_anything else "ℹ️"
        combined_msg = f'{icon} {message}<br/><div class="session-info">🆕 Started new session: {session_id}</div>'

        logger.info(f"Data cleared successfully: {message}")
        return [], combined_msg, refreshed_status

    except Exception as exc:
        failure_text = f"Error clearing data: {str(exc)}"
        logger.error(failure_text)

        status_now = get_chat_status()
        return None, f'<div class="session-info">❌ {failure_text}</div>', status_now

def handle_query_search(query, method, k_value):
    """Run a retrieval query and return the ranked chunks as HTML.

    Args:
        query: Search text; blank input yields a placeholder panel.
        method: One of ``"similarity"``, ``"mmr"``, ``"bm25"`` or ``"hybrid"``;
            any other value produces an empty result list.
        k_value: Number of chunks to retrieve.

    Returns:
        An HTML string: the placeholder, the formatted results, or an error
        panel if anything raised during the search.
    """
    if not (query and query.strip()):
        return """
        <div class="ranker-container">
            <div class="ranker-placeholder">
                <h3>🔍 Query Ranker</h3>
                <p>Enter a search query to find relevant document chunks with similarity scores.</p>
            </div>
        </div>
        """

    try:
        logger.info(f"Query search: '{query[:50]}...' using method: {method}")

        ranked = []

        def add_without_scores(found_docs):
            # Methods that expose no score: only the rank is known
            for position, found in enumerate(found_docs, start=1):
                ranked.append(_format_ranker_result(found, None, position))

        def add_with_estimated_scores(found_docs):
            # Placeholder scores that fall by 0.05 per rank, starting at 0.85
            for offset, found in enumerate(found_docs):
                estimate = 0.85 - (offset * 0.05)
                ranked.append(_format_ranker_result(found, estimate, offset + 1))

        if method == "similarity":
            similarity_retriever = vector_store_manager.get_retriever("similarity", {"k": k_value})
            docs = similarity_retriever.invoke(query)
            # Prefer real scores from the vector store; fall back to estimates
            try:
                store = vector_store_manager.get_vector_store()
                if hasattr(store, 'similarity_search_with_score'):
                    scored_docs = store.similarity_search_with_score(query, k=k_value)
                    for position, (scored_doc, raw_score) in enumerate(scored_docs, start=1):
                        if raw_score is not None:
                            similarity_score = max(0, 1 - raw_score)
                        else:
                            similarity_score = 0.8
                        ranked.append(_format_ranker_result(scored_doc, similarity_score, position))
                else:
                    add_with_estimated_scores(docs)
            except Exception as e:
                logger.warning(f"Could not get similarity scores: {e}")
                add_with_estimated_scores(docs)

        elif method == "mmr":
            mmr_retriever = vector_store_manager.get_retriever(
                "mmr", {"k": k_value, "fetch_k": k_value * 2, "lambda_mult": 0.5}
            )
            add_without_scores(mmr_retriever.invoke(query))

        elif method == "bm25":
            bm25_retriever = vector_store_manager.get_bm25_retriever(k=k_value)
            add_without_scores(bm25_retriever.invoke(query))

        elif method == "hybrid":
            hybrid_retriever = vector_store_manager.get_hybrid_retriever(
                k=k_value, semantic_weight=0.7, keyword_weight=0.3
            )
            # Explicitly limit results to k_value since EnsembleRetriever may return more
            add_without_scores(hybrid_retriever.invoke(query)[:k_value])

        return _format_ranker_results_html(ranked, query, method)

    except Exception as e:
        error_msg = f"Error during search: {str(e)}"
        logger.error(error_msg)
        return f"""
        <div class="ranker-container">
            <div class="ranker-error">
                <h3>❌ Search Error</h3>
                <p>{error_msg}</p>
                <p class="error-hint">Please check if documents are uploaded and the system is ready.</p>
            </div>
        </div>
        """

def _format_ranker_result(doc, score, rank):
    """Format a single document result for the ranker."""
    metadata = doc.metadata or {}
    
    # Extract metadata
    source = metadata.get("source", "Unknown Document")
    page = metadata.get("page", "N/A")
    chunk_id = metadata.get("chunk_id", f"chunk_{rank}")
    
    # Content length indicator
    content_length = len(doc.page_content)
    if content_length < 200:
        length_indicator = "📄 Short"
    elif content_length < 500:
        length_indicator = "📄 Medium"
    else:
        length_indicator = "📄 Long"
    
    # Rank-based confidence levels (applies to all methods)
    if rank <= 3:
        confidence = "High"
        confidence_color = "#22c55e"
        confidence_icon = "🟢"
    elif rank <= 6:
        confidence = "Medium"
        confidence_color = "#f59e0b"
        confidence_icon = "🟡"
    else:
        confidence = "Low"
        confidence_color = "#ef4444"
        confidence_icon = "🔴"
    
    result = {
        "rank": rank,
        "content": doc.page_content,
        "source": source,
        "page": page,
        "chunk_id": chunk_id,
        "length_indicator": length_indicator,
        "has_score": score is not None,
        "confidence": confidence,
        "confidence_color": confidence_color,
        "confidence_icon": confidence_icon
    }
    
    # Only add score if we have a real score (similarity search only)
    if score is not None:
        result["score"] = round(score, 3)
    
    return result

def _format_ranker_results_html(results, query, method):
    """Format search results as HTML."""
    if not results:
        return """
        <div class="ranker-container">
            <div class="ranker-no-results">
                <h3>🔍 No Results Found</h3>
                <p>No relevant documents found for your query.</p>
                <p class="no-results-hint">Try different keywords or check if documents are uploaded.</p>
            </div>
        </div>
        """
    
    # Method display names
    method_labels = {
        "similarity": "🎯 Similarity Search",
        "mmr": "🔀 MMR (Diverse)",
        "bm25": "🔍 BM25 (Keywords)",
        "hybrid": "🔗 Hybrid (Recommended)"
    }
    method_display = method_labels.get(method, method)
    
    # Start building HTML
    html_parts = [f"""
    <div class="ranker-container">
        <div class="ranker-header">
            <div class="ranker-title">
                <h3>🔍 Search Results</h3>
                <div class="query-display">"{query}"</div>
            </div>
            <div class="ranker-meta">
                <span class="method-badge">{method_display}</span>
                <span class="result-count">{len(results)} results</span>
            </div>
        </div>
    """]
    
    # Add results
    for result in results:
        rank_emoji = ["🥇", "🥈", "🥉"][result["rank"] - 1] if result["rank"] <= 3 else f"#{result['rank']}"
        
        # Escape content for safe HTML inclusion and JavaScript
        escaped_content = result['content'].replace('"', '&quot;').replace("'", "&#39;").replace('\n', '\\n')
        
        # Build score info - always show confidence, only show score for similarity search
        score_info_parts = [f"""
                    <span class="confidence-badge" style="color: {result['confidence_color']}">
                        {result['confidence_icon']} {result['confidence']}
                    </span>"""]
        
        # Only add score value if we have real scores (similarity search)
        if result.get('has_score', False):
            score_info_parts.append(f'<span class="score-value">🎯 {result["score"]}</span>')
        
        score_info_html = f"""
                <div class="score-info">
                    {''.join(score_info_parts)}
                </div>"""
        
        html_parts.append(f"""
        <div class="result-card">
            <div class="result-header">
                <div class="rank-info">
                    <span class="rank-badge">{rank_emoji} Rank {result['rank']}</span>
                    <span class="source-info">📄 {result['source']}</span>
                    {f"<span class='page-info'>Page {result['page']}</span>" if result['page'] != 'N/A' else ""}
                    <span class="length-info">{result['length_indicator']}</span>
                </div>
                {score_info_html}
            </div>
            <div class="result-content">
                <div class="content-text">{result['content']}</div>
            </div>
        </div>
        """)
    
    html_parts.append("</div>")
    
    return "".join(html_parts)

def get_ranker_status():
    """Build the HTML status card for the Query Ranker tab.

    Reads the document count from ``vector_store_manager.get_collection_info()``
    (key ``"document_count"``, default 0) and the readiness flag from
    ``document_ingestion_service.get_ingestion_status()`` (key
    ``'system_ready'``, default False).

    Returns:
        str: An HTML fragment. On success, a status card showing the readiness
        indicator, document count, number of retrieval methods and the method
        tags. If anything raises, the error is logged and an error card
        containing the (HTML-escaped) message is returned instead.
    """
    # Function-scope import keeps this block self-contained.
    import html

    try:
        # Get collection info
        collection_info = vector_store_manager.get_collection_info()
        document_count = collection_info.get("document_count", 0)
        
        # Get available methods
        available_methods = ["similarity", "mmr", "bm25", "hybrid"]
        
        # Check if system is ready
        ingestion_status = document_ingestion_service.get_ingestion_status()
        system_ready = ingestion_status.get('system_ready', False)
        
        # Resolve the readiness-dependent fragments once instead of repeating
        # the same conditional inside the template.
        indicator_class = 'status-ready' if system_ready else 'status-not-ready'
        indicator_text = '🟢 READY' if system_ready else '🔴 NOT READY'
        vector_store_text = 'Ready' if system_ready else 'Not Ready'
        
        status_html = f"""
        <div class="status-card">
            <div class="status-header">
                <h3>🔍 Query Ranker Status</h3>
                <div class="status-indicator {indicator_class}">
                    {indicator_text}
                </div>
            </div>
            
            <div class="status-grid">
                <div class="status-item">
                    <div class="status-label">Available Documents</div>
                    <div class="status-value">{document_count}</div>
                </div>
                <div class="status-item">
                    <div class="status-label">Retrieval Methods</div>
                    <div class="status-value">{len(available_methods)}</div>
                </div>
                <div class="status-item">
                    <div class="status-label">Vector Store</div>
                    <div class="status-value">{vector_store_text}</div>
                </div>
            </div>
            
            <div class="ranker-methods">
                <div class="methods-label">Available Methods:</div>
                <div class="methods-list">
                    <span class="method-tag">🎯 Similarity</span>
                    <span class="method-tag">🔀 MMR</span>
                    <span class="method-tag">🔍 BM25</span>
                    <span class="method-tag">🔗 Hybrid</span>
                </div>
            </div>
        </div>
        """
        
        return status_html
        
    except Exception as e:
        error_msg = f"Error getting ranker status: {str(e)}"
        logger.error(error_msg)
        # Exception text can contain '<', '>' or '&' (e.g. object reprs), so
        # escape it before embedding it in markup; the log keeps the raw text.
        return f"""
        <div class="status-card status-error">
            <div class="status-header">
                <h3>❌ System Error</h3>
            </div>
            <p class="error-message">{html.escape(error_msg)}</p>
        </div>
        """

def get_chat_status():
    """Build the HTML status card for the Chat tab.

    Combines three sources, all read with ``.get`` and defaults:
    ``document_ingestion_service.get_ingestion_status()`` (readiness and
    per-service availability flags), ``rag_chat_service.get_usage_stats()``
    (session message count and limit) and
    ``data_clearing_service.get_data_status()`` (vector-store document count,
    chat-history file count and environment).

    Returns:
        str: An HTML fragment. On success, the status card. If anything
        raises, the error is logged and an error card containing the
        (HTML-escaped) message is returned instead.
    """
    # Function-scope import keeps this block self-contained.
    import html

    try:
        # Check ingestion status
        ingestion_status = document_ingestion_service.get_ingestion_status()
        
        # Check usage stats
        usage_stats = rag_chat_service.get_usage_stats()
        
        # Get data status for additional context
        data_status = data_clearing_service.get_data_status()
        
        # Look each flag up once; the template uses every one of them twice.
        system_ready = ingestion_status.get('system_ready', False)
        embedding_ok = ingestion_status.get('embedding_model_available', False)
        vector_store_ok = ingestion_status.get('vector_store_available', False)
        
        indicator_class = 'status-ready' if system_ready else 'status-not-ready'
        indicator_text = '🟢 READY' if system_ready else '🔴 NOT READY'
        embedding_class = 'service-ready' if embedding_ok else 'service-error'
        embedding_mark = '✅' if embedding_ok else '❌'
        vector_store_class = 'service-ready' if vector_store_ok else 'service-error'
        vector_store_mark = '✅' if vector_store_ok else '❌'
        
        vector_doc_count = data_status.get('vector_store', {}).get('document_count', 0)
        history_file_count = data_status.get('chat_history', {}).get('file_count', 0)
        session_usage = f"{usage_stats.get('session_messages', 0)}/{usage_stats.get('session_limit', 50)}"
        environment_label = 'HF Space' if data_status.get('environment') == 'hf_space' else 'Local'
        
        # Modern status card design with better styling
        status_html = f"""
        <div class="status-card">
            <div class="status-header">
                <h3>💬 Chat System Status</h3>
                <div class="status-indicator {indicator_class}">
                    {indicator_text}
                </div>
            </div>
            
            <div class="status-grid">
                <div class="status-item">
                    <div class="status-label">Vector Store Docs</div>
                    <div class="status-value">{vector_doc_count}</div>
                </div>
                <div class="status-item">
                    <div class="status-label">Chat History Files</div>
                    <div class="status-value">{history_file_count}</div>
                </div>
                <div class="status-item">
                    <div class="status-label">Session Usage</div>
                    <div class="status-value">{session_usage}</div>
                </div>
                <div class="status-item">
                    <div class="status-label">Environment</div>
                    <div class="status-value">{environment_label}</div>
                </div>
            </div>
            
            <div class="status-services">
                <div class="service-status {embedding_class}">
                    <span class="service-icon">🧠</span>
                    <span>Embedding Model</span>
                    <span class="service-indicator">{embedding_mark}</span>
                </div>
                <div class="service-status {vector_store_class}">
                    <span class="service-icon">🗄️</span>
                    <span>Vector Store</span>
                    <span class="service-indicator">{vector_store_mark}</span>
                </div>
            </div>
        </div>
        """
        
        return status_html
        
    except Exception as e:
        error_msg = f"Error getting chat status: {str(e)}"
        logger.error(error_msg)
        # Exception text can contain '<', '>' or '&' (e.g. object reprs), so
        # escape it before embedding it in markup; the log keeps the raw text.
        return f"""
        <div class="status-card status-error">
            <div class="status-header">
                <h3>❌ System Error</h3>
            </div>
            <p class="error-message">{html.escape(error_msg)}</p>
        </div>
        """

def create_ui():
    with gr.Blocks(css="""
        /* Global styles */
        .gradio-container {
            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
        }
        
        /* Document converter styles */
        .output-container {
            max-height: 420px;
            overflow-y: auto;
            border: 1px solid #ddd;
            padding: 10px;
        }
        
        .gradio-container .prose {
            overflow: visible;
        }
        
        .processing-controls { 
            display: flex; 
            justify-content: center; 
            gap: 10px; 
            margin-top: 10px; 
        }
        
        .provider-options-row {
            margin-top: 15px;
            margin-bottom: 15px;
        }
        
        /* Chat Tab Styles - Complete redesign */
        .chat-tab-container {
            max-width: 1200px;
            margin: 0 auto;
            padding: 20px;
        }
        
        .chat-header {
            text-align: center;
            margin-bottom: 30px;
            padding: 20px;
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            border-radius: 15px;
            color: white;
            box-shadow: 0 4px 15px rgba(0,0,0,0.1);
        }
        
        .chat-header h2 {
            margin: 0;
            font-size: 1.8em;
            font-weight: 600;
        }
        
        .chat-header p {
            margin: 10px 0 0 0;
            opacity: 0.9;
            font-size: 1.1em;
        }
        
        /* Status Card Styling */
        .status-card {
            background: #ffffff;
            border: 1px solid #e1e5e9;
            border-radius: 12px;
            padding: 20px;
            margin-bottom: 25px;
            box-shadow: 0 2px 10px rgba(0,0,0,0.05);
            transition: all 0.3s ease;
        }
        
        .status-card:hover {
            box-shadow: 0 4px 20px rgba(0,0,0,0.1);
        }
        
        .status-header {
            display: flex;
            justify-content: space-between;
            align-items: center;
            margin-bottom: 20px;
            padding-bottom: 15px;
            border-bottom: 2px solid #f0f2f5;
        }
        
        .status-header h3 {
            margin: 0;
            color: #2c3e50;
            font-size: 1.3em;
            font-weight: 600;
        }
        
        .status-indicator {
            padding: 8px 16px;
            border-radius: 25px;
            font-weight: 600;
            font-size: 0.9em;
            letter-spacing: 0.5px;
        }
        
        .status-ready {
            background: #d4edda;
            color: #155724;
            border: 1px solid #c3e6cb;
        }
        
        .status-not-ready {
            background: #f8d7da;
            color: #721c24;
            border: 1px solid #f5c6cb;
        }
        
        .status-grid {
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
            gap: 15px;
            margin-bottom: 20px;
        }
        
        .status-item {
            background: #f8f9fa;
            padding: 15px;
            border-radius: 8px;
            text-align: center;
            border: 1px solid #e9ecef;
        }
        
        .status-label {
            font-size: 0.85em;
            color: #6c757d;
            margin-bottom: 5px;
            font-weight: 500;
        }
        
        .status-value {
            font-size: 1.4em;
            font-weight: 700;
            color: #495057;
        }
        
        .status-services {
            display: flex;
            gap: 15px;
            flex-wrap: wrap;
        }
        
        .service-status {
            display: flex;
            align-items: center;
            gap: 8px;
            padding: 10px 15px;
            border-radius: 8px;
            font-weight: 500;
            flex: 1;
            min-width: 200px;
            color: #2c3e50 !important;
        }
        
        .service-status span {
            color: #2c3e50 !important;
        }
        
        .service-ready {
            background: #d4edda;
            color: #2c3e50 !important;
            border: 1px solid #c3e6cb;
        }
        
        .service-ready span {
            color: #2c3e50 !important;
        }
        
        .service-error {
            background: #f8d7da;
            color: #2c3e50 !important;
            border: 1px solid #f5c6cb;
        }
        
        .service-error span {
            color: #2c3e50 !important;
        }
        
        .service-icon {
            font-size: 1.2em;
        }
        
        .service-indicator {
            margin-left: auto;
        }
        
        .status-error {
            border-color: #dc3545;
            background: #f8d7da;
        }
        
        .error-message {
            color: #721c24;
            margin: 0;
            font-weight: 500;
        }
        
        /* Control buttons styling */
        .control-buttons {
            display: flex;
            gap: 12px;
            justify-content: flex-end;
            margin-bottom: 25px;
        }
        
        .control-btn {
            padding: 10px 20px;
            border-radius: 8px;
            font-weight: 500;
            transition: all 0.3s ease;
            border: none;
            cursor: pointer;
        }
        
        .btn-refresh {
            background: #17a2b8;
            color: white;
        }
        
        .btn-refresh:hover {
            background: #138496;
            transform: translateY(-1px);
        }
        
        .btn-new-session {
            background: #28a745;
            color: white;
        }
        
        .btn-new-session:hover {
            background: #218838;
            transform: translateY(-1px);
        }
        
        .btn-clear-data {
            background: #dc3545;
            color: white;
        }
        
        .btn-clear-data:hover {
            background: #c82333;
            transform: translateY(-1px);
        }
        
        .btn-primary {
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
        }
        
        .btn-primary:hover {
            transform: translateY(-1px);
            box-shadow: 0 4px 15px rgba(102, 126, 234, 0.3);
        }
        
        /* Chat interface styling */
        .chat-main-container {
            background: #ffffff;
            border-radius: 15px;
            box-shadow: 0 4px 20px rgba(0,0,0,0.08);
            overflow: hidden;
            margin-bottom: 25px;
        }
        
        .chat-container {
            background: #ffffff;
            border-radius: 12px;
            border: 1px solid #e1e5e9;
            overflow: hidden;
        }
        
        /* Custom chatbot styling */
        .gradio-chatbot {
            border: none !important;
            background: #ffffff;
        }
        
        .gradio-chatbot .message {
            padding: 15px 20px;
            margin: 10px;
            border-radius: 12px;
        }
        
        .gradio-chatbot .message.user {
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
            margin-left: 50px;
        }
        
        .gradio-chatbot .message.assistant {
            background: #f8f9fa;
            border: 1px solid #e9ecef;
            margin-right: 50px;
        }
        
        /* Input area styling */
        .chat-input-container {
            background: #ffffff;
            padding: 20px;
            border-top: 1px solid #e1e5e9;
            border-radius: 0 0 15px 15px;
        }
        
        .input-row {
            display: flex;
            gap: 12px;
            align-items: center;
        }
        
        .message-input {
            flex: 1;
            border: 2px solid #e1e5e9;
            border-radius: 25px;
            padding: 12px 20px;
            font-size: 1em;
            transition: all 0.3s ease;
            resize: none;
            max-height: 120px;
            min-height: 48px;
        }
        
        .message-input:focus {
            border-color: #667eea;
            box-shadow: 0 0 0 3px rgba(102, 126, 234, 0.1);
            outline: none;
        }
        
        .send-button {
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
            border: none;
            border-radius: 12px;
            padding: 12px 24px;
            min-width: 80px;
            height: 48px;
            margin-right: 10px;
            cursor: pointer;
            transition: all 0.3s ease;
            display: flex;
            align-items: center;
            justify-content: center;
            font-size: 1em;
            font-weight: 600;
            letter-spacing: 0.5px;
        }
        
        .send-button:hover {
            transform: scale(1.05);
            box-shadow: 0 4px 15px rgba(102, 126, 234, 0.3);
        }
        
        /* Session info styling */
        .session-info {
            background: #e7f3ff;
            border: 1px solid #b3d9ff;
            border-radius: 8px;
            padding: 15px;
            color: #0056b3;
            font-weight: 500;
            text-align: center;
        }
        
        /* Responsive design */
        @media (max-width: 768px) {
            .chat-tab-container {
                padding: 10px;
            }
            
            .status-grid {
                grid-template-columns: repeat(2, 1fr);
            }
            
            .service-status {
                min-width: 100%;
            }
            
            .control-buttons {
                flex-direction: column;
                gap: 8px;
            }
            
            .gradio-chatbot .message.user {
                margin-left: 20px;
            }
            
            .gradio-chatbot .message.assistant {
                margin-right: 20px;
            }
        }
        
        /* Query Ranker Styles */
        .ranker-container {
            max-width: 1200px;
            margin: 0 auto;
            padding: 20px;
        }
        
        .ranker-placeholder {
            text-align: center;
            padding: 40px;
            background: #f8f9fa;
            border-radius: 12px;
            border: 1px solid #e9ecef;
            color: #6c757d;
        }
        
        .ranker-placeholder h3 {
            color: #495057;
            margin-bottom: 10px;
        }
        
        .ranker-error {
            text-align: center;
            padding: 30px;
            background: #f8d7da;
            border: 1px solid #f5c6cb;
            border-radius: 12px;
            color: #721c24;
        }
        
        .ranker-error h3 {
            margin-bottom: 15px;
        }
        
        .error-hint {
            font-style: italic;
            margin-top: 10px;
            opacity: 0.8;
        }
        
        .ranker-no-results {
            text-align: center;
            padding: 40px;
            background: #ffffff;
            border: 1px solid #e1e5e9;
            border-radius: 12px;
            color: #6c757d;
        }
        
        .ranker-no-results h3 {
            color: #495057;
            margin-bottom: 15px;
        }
        
        .no-results-hint {
            font-style: italic;
            margin-top: 10px;
            opacity: 0.8;
        }
        
        .ranker-header {
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
            padding: 20px;
            border-radius: 15px;
            margin-bottom: 25px;
            box-shadow: 0 4px 15px rgba(0,0,0,0.1);
        }
        
        .ranker-title h3 {
            margin: 0 0 10px 0;
            font-size: 1.4em;
            font-weight: 600;
        }
        
        .query-display {
            font-size: 1.1em;
            opacity: 0.9;
            font-style: italic;
            margin-bottom: 15px;
        }
        
        .ranker-meta {
            display: flex;
            gap: 15px;
            align-items: center;
            flex-wrap: wrap;
        }
        
        .method-badge {
            background: rgba(255, 255, 255, 0.2);
            padding: 6px 12px;
            border-radius: 20px;
            font-weight: 500;
            font-size: 0.9em;
        }
        
        .result-count {
            background: rgba(255, 255, 255, 0.15);
            padding: 6px 12px;
            border-radius: 20px;
            font-weight: 500;
            font-size: 0.9em;
        }
        
        .result-card {
            background: #ffffff;
            border: 1px solid #e1e5e9;
            border-radius: 12px;
            margin-bottom: 20px;
            box-shadow: 0 2px 10px rgba(0,0,0,0.05);
            transition: all 0.3s ease;
            overflow: hidden;
        }
        
        .result-card:hover {
            box-shadow: 0 4px 20px rgba(0,0,0,0.1);
            transform: translateY(-2px);
        }
        
        .result-header {
            display: flex;
            justify-content: space-between;
            align-items: center;
            padding: 15px 20px;
            background: #f8f9fa;
            border-bottom: 1px solid #e9ecef;
        }
        
        .rank-info {
            display: flex;
            gap: 10px;
            align-items: center;
            flex-wrap: wrap;
        }
        
        .rank-badge {
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
            padding: 4px 10px;
            border-radius: 15px;
            font-weight: 600;
            font-size: 0.85em;
        }
        
        .source-info {
            background: #e9ecef;
            color: #495057;
            padding: 4px 8px;
            border-radius: 10px;
            font-size: 0.85em;
            font-weight: 500;
        }
        
        .page-info {
            background: #d1ecf1;
            color: #0c5460;
            padding: 4px 8px;
            border-radius: 10px;
            font-size: 0.85em;
        }
        
        .length-info {
            background: #f8f9fa;
            color: #6c757d;
            padding: 4px 8px;
            border-radius: 10px;
            font-size: 0.85em;
        }
        
        .score-info {
            display: flex;
            gap: 10px;
            align-items: center;
        }
        
        .confidence-badge {
            padding: 4px 8px;
            border-radius: 10px;
            font-weight: 600;
            font-size: 0.85em;
        }
        
        .score-value {
            background: #2c3e50;
            color: white;
            padding: 6px 12px;
            border-radius: 15px;
            font-weight: 600;
            font-size: 0.9em;
        }
        
        .result-content {
            padding: 20px;
        }
        
        .content-text {
            line-height: 1.6;
            color: #2c3e50;
            border-left: 3px solid #667eea;
            padding-left: 15px;
            background: #f8f9fa;
            padding: 15px;
            border-radius: 0 8px 8px 0;
            max-height: 300px;
            overflow-y: auto;
        }
        
        .result-actions {
            display: flex;
            gap: 10px;
            padding: 15px 20px;
            background: #f8f9fa;
            border-top: 1px solid #e9ecef;
        }
        
        .action-btn {
            padding: 8px 16px;
            border: none;
            border-radius: 8px;
            font-weight: 500;
            cursor: pointer;
            transition: all 0.3s ease;
            font-size: 0.9em;
            display: flex;
            align-items: center;
            gap: 5px;
        }
        
        .copy-btn {
            background: #17a2b8;
            color: white;
        }
        
        .copy-btn:hover {
            background: #138496;
            transform: translateY(-1px);
        }
        
        .info-btn {
            background: #6c757d;
            color: white;
        }
        
        .info-btn:hover {
            background: #5a6268;
            transform: translateY(-1px);
        }
        
        .ranker-methods {
            margin-top: 20px;
            padding-top: 15px;
            border-top: 1px solid #e9ecef;
        }
        
        .methods-label {
            font-weight: 600;
            color: #495057;
            margin-bottom: 10px;
            font-size: 0.9em;
        }
        
        .methods-list {
            display: flex;
            gap: 8px;
            flex-wrap: wrap;
        }
        
        .method-tag {
            background: #e9ecef;
            color: #495057;
            padding: 4px 10px;
            border-radius: 12px;
            font-size: 0.8em;
            font-weight: 500;
        }
        
        /* Ranker controls styling */
        .ranker-controls {
            background: #ffffff;
            border: 1px solid #e1e5e9;
            border-radius: 12px;
            padding: 20px;
            margin-bottom: 25px;
            box-shadow: 0 2px 10px rgba(0,0,0,0.05);
        }
        
        .ranker-input-row {
            display: flex;
            gap: 15px;
            align-items: end;
            margin-bottom: 15px;
        }
        
        .ranker-query-input {
            flex: 1;
            border: 2px solid #e1e5e9;
            border-radius: 25px;
            padding: 12px 20px;
            font-size: 1em;
            transition: all 0.3s ease;
        }
        
        .ranker-query-input:focus {
            border-color: #667eea;
            box-shadow: 0 0 0 3px rgba(102, 126, 234, 0.1);
            outline: none;
        }
        
        .ranker-search-btn {
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
            border: none;
            border-radius: 12px;
            padding: 12px 24px;
            min-width: 100px;
            cursor: pointer;
            transition: all 0.3s ease;
            font-weight: 600;
            font-size: 1em;
        }
        
        .ranker-search-btn:hover {
            transform: scale(1.05);
            box-shadow: 0 4px 15px rgba(102, 126, 234, 0.3);
        }
        
        .ranker-options-row {
            display: flex;
            gap: 15px;
            align-items: center;
        }
        
        /* Responsive design for ranker */
        @media (max-width: 768px) {
            .ranker-container {
                padding: 10px;
            }
            
            .ranker-input-row {
                flex-direction: column;
                gap: 10px;
            }
            
            .ranker-options-row {
                flex-direction: column;
                gap: 10px;
                align-items: stretch;
            }
            
            .ranker-meta {
                justify-content: center;
            }
            
            .rank-info {
                flex-direction: column;
                gap: 5px;
                align-items: flex-start;
            }
            
            .result-header {
                flex-direction: column;
                gap: 10px;
                align-items: flex-start;
            }
            
            .score-info {
                align-self: flex-end;
            }
            
            .result-actions {
                flex-direction: column;
                gap: 8px;
            }
        }
    """) as demo:
        # Modern title with better styling
        gr.Markdown("""
        # 🚀 Markit
        ## Document to Markdown Converter with RAG Chat
        """)
        
        with gr.Tabs():
            # Document Converter Tab
            with gr.TabItem("📄 Document Converter"):
                with gr.Column(elem_classes=["chat-tab-container"]):
                    # Modern header matching other tabs
                    gr.HTML("""
                    <div class="chat-header">
                        <h2>📄 Document Converter</h2>
                        <p>Convert documents to Markdown format with advanced OCR and AI processing</p>
                    </div>
                    """)
                    
                    # State to track if cancellation is requested
                    cancel_requested = gr.State(False)
                    # State to store the conversion thread
                    conversion_thread = gr.State(None)
                    # State to store the output format (fixed to Markdown)
                    output_format_state = gr.State("Markdown")

                    # Multi-file input (supports single and multiple files)
                    files_input = gr.Files(
                        label="Upload Document(s) - Single file or up to 5 files (20MB max combined)",
                        file_count="multiple",
                        file_types=[".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".webp", ".docx", ".doc", ".pptx", ".ppt", ".xlsx", ".xls", ".txt", ".md", ".html", ".htm"]
                    )
                    
                    # Processing type selector (visible only for multiple files)
                    processing_type_selector = gr.Radio(
                        choices=["combined", "individual", "summary", "comparison"],
                        value="combined",
                        label="Multi-Document Processing Type",
                        info="How to process multiple documents together",
                        visible=False
                    )
                    
                    # Status text to show file count and processing mode
                    file_status_text = gr.HTML(
                        value="<div style='color: #666; font-style: italic;'>Upload documents to begin</div>",
                        label=""
                    )
                    
                    # Provider and OCR options below the file input
                    with gr.Row(elem_classes=["provider-options-row"]):
                        with gr.Column(scale=1):
                            parser_names = ParserRegistry.get_parser_names()
                            
                            # Make MarkItDown the default parser if available
                            default_parser = next((p for p in parser_names if p == "MarkItDown"), parser_names[0] if parser_names else "PyPdfium")
                            
                            provider_dropdown = gr.Dropdown(
                                label="Provider",
                                choices=parser_names,
                                value=default_parser,
                                interactive=True
                            )
                        with gr.Column(scale=1):
                            default_ocr_options = ParserRegistry.get_ocr_options(default_parser)
                            default_ocr = default_ocr_options[0] if default_ocr_options else "No OCR"
                            
                            ocr_dropdown = gr.Dropdown(
                                label="OCR Options",
                                choices=default_ocr_options,
                                value=default_ocr,
                                interactive=True
                            )
                    
                    # Processing controls row with consistent styling
                    with gr.Row(elem_classes=["control-buttons"]):
                        convert_button = gr.Button("🚀 Convert", elem_classes=["control-btn", "btn-primary"])
                        cancel_button = gr.Button("⏹️ Cancel", elem_classes=["control-btn", "btn-clear-data"], visible=False)
                    
                    # Simple output container with just one scrollbar
                    file_display = gr.HTML(
                        value="<div class='output-container'></div>",
                        label="Converted Content"
                    )
                    
                    file_download = gr.File(label="Download File")

                # Event handlers for document converter
                
                # Update UI when files are uploaded/changed
                files_input.change(
                    fn=update_ui_for_file_count,
                    inputs=[files_input],
                    outputs=[processing_type_selector, file_status_text]
                )
                
                provider_dropdown.change(
                    lambda p: gr.Dropdown(
                        choices=["Plain Text", "Formatted Text"] if "GOT-OCR" in p else ParserRegistry.get_ocr_options(p),
                        value="Plain Text" if "GOT-OCR" in p else (ParserRegistry.get_ocr_options(p)[0] if ParserRegistry.get_ocr_options(p) else None)
                    ),
                    inputs=[provider_dropdown],
                    outputs=[ocr_dropdown]
                )

                # Reset cancel flag when starting conversion
                def start_conversion():
                    global conversion_cancelled
                    conversion_cancelled.clear()
                    logger.info("Starting conversion with cancellation flag cleared")
                    return gr.update(visible=False), gr.update(visible=True), False

                # Set cancel flag and terminate thread when cancel button is clicked
                def request_cancellation(thread):
                    """Set the cancellation flag and give the worker a short time to stop.

                    Args:
                        thread: The stored conversion thread, or ``None`` when no
                            thread has been recorded.

                    Returns:
                        A 4-tuple: an update showing the convert button, an update
                        hiding the cancel button, ``True`` for the cancel-requested
                        state, and ``None`` to reset the stored thread.
                    """
                    global conversion_cancelled
                    conversion_cancelled.set()
                    logger.info("Cancel button clicked, cancellation flag set")

                    worker = thread
                    if worker is not None:
                        # Bounded wait only: the handler must not block on a slow worker.
                        logger.info(f"Attempting to join conversion thread: {worker}")
                        worker.join(timeout=0.5)
                        still_running = worker.is_alive()
                        if still_running:
                            logger.warning("Thread did not finish within timeout")

                    # Restore the buttons right away so the user gets immediate feedback.
                    show_convert = gr.update(visible=True)
                    hide_cancel = gr.update(visible=False)
                    return show_convert, hide_cancel, True, None

                # Start conversion sequence
                convert_button.click(
                    fn=start_conversion,
                    inputs=[],
                    outputs=[convert_button, cancel_button, cancel_requested],
                    queue=False  # Execute immediately
                ).then(
                    fn=handle_convert,
                    inputs=[files_input, provider_dropdown, ocr_dropdown, output_format_state, processing_type_selector, cancel_requested],
                    outputs=[file_display, file_download, convert_button, cancel_button, conversion_thread]
                )
                
                # Handle cancel button click
                cancel_button.click(
                    fn=request_cancellation,
                    inputs=[conversion_thread],
                    outputs=[convert_button, cancel_button, cancel_requested, conversion_thread],
                    queue=False  # Execute immediately
                )

            # Chat Tab - Completely redesigned
            with gr.TabItem("💬 Chat with Documents"):
                with gr.Column(elem_classes=["chat-tab-container"]):
                    # Modern header
                    gr.HTML("""
                    <div class="chat-header">
                        <h2>💬 Chat with your converted documents</h2>
                        <p>Ask questions about your documents using advanced RAG technology</p>
                    </div>
                    """)
                    
                    # Status section with modern design
                    status_display = gr.HTML(value=get_chat_status())
                    
                    # Control buttons
                    with gr.Row(elem_classes=["control-buttons"]):
                        refresh_status_btn = gr.Button("🔄 Refresh Status", elem_classes=["control-btn", "btn-refresh"])
                        new_session_btn = gr.Button("🆕 New Session", elem_classes=["control-btn", "btn-new-session"])
                        clear_data_btn = gr.Button("🗑️ Clear All Data", elem_classes=["control-btn", "btn-clear-data"], variant="stop")
                    
                    # Main chat interface
                    with gr.Column(elem_classes=["chat-main-container"]):
                        chatbot = gr.Chatbot(
                            elem_classes=["chat-container"],
                            height=500,
                            show_label=False,
                            show_share_button=False,
                            bubble_full_width=False,
                            type="messages",
                            placeholder="Start a conversation by asking questions about your documents..."
                        )
                        
                        # Input area
                        with gr.Row(elem_classes=["input-row"]):
                            msg_input = gr.Textbox(
                                placeholder="Ask questions about your documents...",
                                show_label=False,
                                scale=5,
                                lines=1,
                                max_lines=3,
                                elem_classes=["message-input"]
                            )
                            send_btn = gr.Button("Submit", elem_classes=["send-button"], scale=0)
                    
                    # Session info with better styling
                    session_info = gr.HTML(
                        value='<div class="session-info">No active session - Click "New Session" to start</div>'
                    )
                
                # Event handlers for chat
                def clear_input():
                    """Return an empty string, suitable for resetting a text input."""
                    emptied_text = ""
                    return emptied_text
                
                # Send message when button clicked or Enter pressed
                msg_input.submit(
                    fn=handle_chat_message,
                    inputs=[msg_input, chatbot],
                    outputs=[msg_input, chatbot, status_display]
                )
                
                send_btn.click(
                    fn=handle_chat_message,
                    inputs=[msg_input, chatbot],
                    outputs=[msg_input, chatbot, status_display]
                )
                
                # New session handler with improved feedback
                def enhanced_new_session():
                    """Start a new chat session and build the values for the chat widgets.

                    Returns:
                        A 3-tuple: the history returned by
                        ``start_new_chat_session``, its info message wrapped in a
                        ``session-info`` div, and the current chat status from
                        ``get_chat_status``.
                    """
                    fresh_history, session_message = start_new_chat_session()
                    # Tuple elements evaluate left to right, so the status is
                    # fetched after the new session has been created.
                    return (
                        fresh_history,
                        f'<div class="session-info">{session_message}</div>',
                        get_chat_status(),
                    )
                
                new_session_btn.click(
                    fn=enhanced_new_session,
                    inputs=[],
                    outputs=[chatbot, session_info, status_display]
                )
                
                # Refresh status handler
                refresh_status_btn.click(
                    fn=get_chat_status,
                    inputs=[],
                    outputs=[status_display]
                )
                
                # Clear all data handler
                clear_data_btn.click(
                    fn=handle_clear_all_data,
                    inputs=[],
                    outputs=[chatbot, session_info, status_display]
                )

            # Query Ranker Tab
            with gr.TabItem("🔍 Query Ranker"):
                with gr.Column(elem_classes=["ranker-container"]):
                    # Modern header
                    gr.HTML("""
                    <div class="chat-header">
                        <h2>🔍 Query Ranker</h2>
                        <p>Search and rank document chunks with similarity scores</p>
                    </div>
                    """)
                    
                    # Status section
                    ranker_status_display = gr.HTML(value=get_ranker_status())
                    
                    # Control buttons
                    with gr.Row(elem_classes=["control-buttons"]):
                        refresh_ranker_status_btn = gr.Button("🔄 Refresh Status", elem_classes=["control-btn", "btn-refresh"])
                        clear_results_btn = gr.Button("🗑️ Clear Results", elem_classes=["control-btn", "btn-clear-data"])
                    
                    # Search controls
                    with gr.Column(elem_classes=["ranker-controls"]):
                        with gr.Row(elem_classes=["ranker-input-row"]):
                            query_input = gr.Textbox(
                                placeholder="Enter your search query...",
                                show_label=False,
                                elem_classes=["ranker-query-input"],
                                scale=4
                            )
                            search_btn = gr.Button("🔍 Search", elem_classes=["ranker-search-btn"], scale=0)
                        
                        with gr.Row(elem_classes=["ranker-options-row"]):
                            method_dropdown = gr.Dropdown(
                                choices=[
                                    ("🎯 Similarity Search", "similarity"),
                                    ("🔀 MMR (Diverse)", "mmr"),
                                    ("🔍 BM25 (Keywords)", "bm25"),
                                    ("🔗 Hybrid (Recommended)", "hybrid")
                                ],
                                value="hybrid",
                                label="Retrieval Method",
                                scale=2
                            )
                            k_slider = gr.Slider(
                                minimum=1,
                                maximum=10,
                                value=5,
                                step=1,
                                label="Number of Results",
                                scale=1
                            )
                    
                    # Results display
                    results_display = gr.HTML(
                        value=handle_query_search("", "hybrid", 5),  # Initial placeholder
                        elem_classes=["ranker-results-container"]
                    )
                
                # Event handlers for Query Ranker
                def clear_ranker_results():
                    """Reset the ranker results and empty the query box.

                    Returns:
                        A 2-tuple: the output of ``handle_query_search`` for an
                        empty query (method "hybrid", 5 results) and an empty
                        string for the query input.
                    """
                    placeholder_results = handle_query_search("", "hybrid", 5)
                    emptied_query = ""
                    return placeholder_results, emptied_query
                
                def refresh_ranker_status():
                    """Return the current ranker status from ``get_ranker_status``."""
                    current_status = get_ranker_status()
                    return current_status
                
                # Search functionality
                query_input.submit(
                    fn=handle_query_search,
                    inputs=[query_input, method_dropdown, k_slider],
                    outputs=[results_display]
                )
                
                search_btn.click(
                    fn=handle_query_search,
                    inputs=[query_input, method_dropdown, k_slider],
                    outputs=[results_display]
                )
                
                # Control button handlers
                refresh_ranker_status_btn.click(
                    fn=refresh_ranker_status,
                    inputs=[],
                    outputs=[ranker_status_display]
                )
                
                clear_results_btn.click(
                    fn=clear_ranker_results,
                    inputs=[],
                    outputs=[results_display, query_input]
                )
                
                # Update results when method or k changes
                method_dropdown.change(
                    fn=handle_query_search,
                    inputs=[query_input, method_dropdown, k_slider],
                    outputs=[results_display]
                )
                
                k_slider.change(
                    fn=handle_query_search,
                    inputs=[query_input, method_dropdown, k_slider],
                    outputs=[results_display]
                )

    return demo


def launch_ui(server_name="0.0.0.0", server_port=7860, share=False):
    """Build the UI with ``create_ui`` and launch it.

    Args:
        server_name: Forwarded to ``launch`` as ``server_name``.
        server_port: Forwarded to ``launch`` as ``server_port``.
        share: Forwarded to ``launch`` as ``share``.
    """
    # root_path and show_error are fixed; the rest comes from the caller.
    launch_options = {
        "server_name": server_name,
        "server_port": server_port,
        "root_path": "",
        "show_error": True,
        "share": share,
    }
    create_ui().launch(**launch_options)