# Hugging Face Space: DaVinciCode/doctra-document-parser (status: Sleeping) - file size: 39,204 bytes

"""
Doctra - Document Parser for Hugging Face Spaces

This is a Hugging Face Spaces deployment of the Doctra document parsing library.
It provides a comprehensive web interface for PDF parsing, table/chart extraction,
image restoration, and enhanced document processing.
"""

import os
import shutil
import tempfile
import re
import html as _html
import base64
import json
from pathlib import Path
from typing import Optional, Tuple, List, Dict, Any

import gradio as gr
import pandas as pd

# Mock google.genai to avoid import errors.
# The stand-ins must be registered in sys.modules before the Doctra imports
# below run, which is why this sits ahead of them.
import sys
from unittest.mock import MagicMock

# Register MagicMock objects under the 'google.genai' and
# 'google.genai.types' module names.
mock_google_genai = MagicMock()
sys.modules['google.genai'] = mock_google_genai
sys.modules['google.genai.types'] = MagicMock()

# Now import Doctra components
try:
    from doctra.parsers.structured_pdf_parser import StructuredPDFParser
    from doctra.parsers.table_chart_extractor import ChartTablePDFParser
    from doctra.parsers.enhanced_pdf_parser import EnhancedPDFParser
    from doctra.ui.docres_wrapper import DocResUIWrapper
    from doctra.utils.pdf_io import render_pdf_to_images
except ImportError as e:
    print(f"Warning: Some Doctra components may not be available: {e}")
    # Fall back to None placeholders (not mocks). If any single import above
    # fails, ALL five names are reset to None, including ones that imported
    # successfully. The run_* handlers compare the parser/wrapper names with
    # None and report a "not properly installed" error instead of crashing.
    StructuredPDFParser = None
    ChartTablePDFParser = None
    EnhancedPDFParser = None
    DocResUIWrapper = None
    render_pdf_to_images = None


# UI Theme and Styling Constants
# Gradio "Soft" theme with an indigo primary hue and slate neutral hue; it is
# handed to gr.Blocks(theme=...) where the interface is built.
THEME = gr.themes.Soft(primary_hue="indigo", neutral_hue="slate")

# Extra stylesheet handed to gr.Blocks(css=...). The "doc-table" class is the
# one emitted by the HTML table builders in this file (markdown tables and the
# pandas DataFrame.to_html previews); the other selectors target wrapper
# classes such as "page-content", "card", "header" and "subtitle".
CUSTOM_CSS = """
/* Full-width layout */
.gradio-container {max-width: 100% !important; padding-left: 24px; padding-right: 24px}
.container {max-width: 100% !important}
.app {max-width: 100% !important}

/* Header and helpers */
.header {margin-bottom: 8px}
.subtitle {color: var(--body-text-color-subdued)}
.card {border:1px solid var(--border-color); border-radius:12px; padding:8px}
.status-ok {color: var(--color-success)}

/* Scrollable gallery styling */
.scrollable-gallery {
    max-height: 600px !important;
    overflow-y: auto !important;
    border: 1px solid var(--border-color) !important;
    border-radius: 8px !important;
    padding: 8px !important;
}

/* Page content styling */
.page-content img {
    max-width: 100% !important;
    height: auto !important;
    display: block !important;
    margin: 10px auto !important;
    border: 1px solid #ddd !important;
    border-radius: 8px !important;
    box-shadow: 0 2px 4px rgba(0,0,0,0.1) !important;
}

.page-content {
    max-height: none !important;
    overflow: visible !important;
}

/* Table styling */
.page-content table.doc-table { 
    width: 100% !important; 
    border-collapse: collapse !important; 
    margin: 12px 0 !important; 
}
.page-content table.doc-table th,
.page-content table.doc-table td { 
    border: 1px solid #e5e7eb !important; 
    padding: 8px 10px !important; 
    text-align: left !important; 
}
.page-content table.doc-table thead th { 
    background: #f9fafb !important; 
    font-weight: 600 !important; 
}
.page-content table.doc-table tbody tr:nth-child(even) td { 
    background: #fafafa !important; 
}

/* Clickable image buttons */
.image-button {
    background: #0066cc !important;
    color: white !important;
    border: none !important;
    padding: 5px 10px !important;
    border-radius: 4px !important;
    cursor: pointer !important;
    margin: 2px !important;
    font-size: 14px !important;
}

.image-button:hover {
    background: #0052a3 !important;
}
"""


def _list_files(directory: Path, pattern: str, recursive: bool = False) -> List[str]:
    """Return the sorted string paths matching ``pattern`` in ``directory`` ([] if it is missing)."""
    if not directory.is_dir():
        return []
    matches = directory.rglob(pattern) if recursive else directory.glob(pattern)
    return [str(p) for p in sorted(matches)]


def gather_outputs(
    out_dir: Path,
    allowed_kinds: Optional[List[str]] = None,
    zip_filename: Optional[str] = None,
    is_structured_parsing: bool = False,
) -> Tuple[List[tuple[str, str]], List[str], str]:
    """
    Gather output files and create a ZIP archive for download.

    Args:
        out_dir: Directory the parser wrote its results to. It may be missing
            (or not a directory); the result is then two empty lists plus an
            empty archive instead of an exception.
        allowed_kinds: Sub-folder names (e.g. "tables", "charts") that image
            and Excel collection is restricted to. ``None``/empty means
            "no restriction".
        zip_filename: Base name (no extension) of the archive. Characters that
            are invalid in file names are replaced by "_". Defaults to
            "doctra_outputs".
        is_structured_parsing: When true, every file below ``out_dir`` is
            listed instead of the curated selection of main files and images.

    Returns:
        ``(gallery_items, file_paths, zip_path)``: ``gallery_items`` is a list
        of ``(image_path, caption)`` pairs, ``file_paths`` the collected files
        and ``zip_path`` the archive, created inside a fresh temp directory.
    """
    out_dir = Path(out_dir)
    has_outputs = out_dir.is_dir()
    file_paths: List[str] = []

    if has_outputs:
        if is_structured_parsing:
            # For structured parsing, include all files
            file_paths.extend(str(p) for p in sorted(out_dir.rglob("*")) if p.is_file())
        else:
            # For full parsing, include specific main files
            for main_file in ("result.html", "result.md", "tables.html", "tables.xlsx"):
                candidate = out_dir / main_file
                if candidate.exists():
                    file_paths.append(str(candidate))

            if allowed_kinds:
                # Images of the requested kinds only
                for kind in allowed_kinds:
                    file_paths.extend(_list_files(out_dir / kind, "*.png"))
                    file_paths.extend(_list_files(out_dir / "images" / kind, "*.jpg"))

                # Excel workbook matching the requested kinds
                if "charts" in allowed_kinds and "tables" in allowed_kinds:
                    excel_name = "parsed_tables_charts.xlsx"
                elif "charts" in allowed_kinds:
                    excel_name = "parsed_charts.xlsx"
                elif "tables" in allowed_kinds:
                    excel_name = "parsed_tables.xlsx"
                else:
                    excel_name = None
                if excel_name and (out_dir / excel_name).exists():
                    file_paths.append(str(out_dir / excel_name))
            else:
                # Include all images if no specific kinds specified
                # (sorted so the listing is deterministic)
                file_paths.extend(_list_files(out_dir / "charts", "*.png"))
                file_paths.extend(_list_files(out_dir / "tables", "*.png"))
                file_paths.extend(_list_files(out_dir / "images", "*.jpg", recursive=True))

    # Build gallery items for image display
    gallery_items: List[tuple[str, str]] = []
    for sub in (allowed_kinds if allowed_kinds else ["tables", "charts", "figures"]):
        for folder, pattern in ((out_dir / sub, "*.png"), (out_dir / "images" / sub, "*.jpg")):
            for img in _list_files(folder, pattern):
                gallery_items.append((img, f"{sub}: {Path(img).name}"))

    # Create ZIP archive in its own temp directory
    tmp_zip_dir = Path(tempfile.mkdtemp(prefix="doctra_zip_"))
    archive_name = re.sub(r'[<>:"/\\|?*]', '_', zip_filename) if zip_filename else "doctra_outputs"

    # Stage a filtered copy so editor lock files / temp files stay out of the ZIP.
    staging_dir = tmp_zip_dir / "filtered_outputs"
    if has_outputs:
        shutil.copytree(out_dir, staging_dir, ignore=shutil.ignore_patterns('~$*', '*.tmp', '*.temp'))
    else:
        # copytree would raise on a missing source; archive an empty folder instead.
        staging_dir.mkdir()

    try:
        zip_path = shutil.make_archive(str(tmp_zip_dir / archive_name), 'zip', root_dir=str(staging_dir))
    finally:
        # The staged copy is only needed while the archive is being written.
        shutil.rmtree(staging_dir, ignore_errors=True)

    return gallery_items, file_paths, zip_path


def validate_vlm_config(use_vlm: bool, vlm_api_key: str, vlm_provider: str = "gemini") -> Optional[str]:
    """
    Check the VLM settings and report the first problem found.

    Returns an error message string when VLM is enabled for a provider that
    needs an API key and the key is missing or implausibly short; returns
    ``None`` when VLM is disabled, the provider is Ollama, or the key passes
    the basic length checks.
    """
    # Nothing to validate when VLM is off or the provider needs no key.
    if not use_vlm or vlm_provider in ["ollama"]:
        return None

    if not vlm_api_key:
        return "❌ Error: VLM API key is required when using VLM (except for Ollama)"

    # Basic plausibility checks on the trimmed key
    trimmed_key = vlm_api_key.strip()
    if len(trimmed_key) < 10:
        return "❌ Error: VLM API key appears to be too short or invalid"

    looks_like_openai = trimmed_key.startswith('sk-')
    if looks_like_openai and len(trimmed_key) < 20:
        return "❌ Error: OpenAI API key appears to be invalid (too short)"

    return None


def create_page_html_content(page_content: List[str], base_dir: Optional[Path] = None) -> str:
    """
    Convert page content lines to HTML with inline images and proper formatting.

    Supported constructs: ``# `` / ``## `` headings (rendered as ``<h2>`` /
    ``<h3>``), pipe-delimited markdown tables, ``![caption](images/...)``
    references (embedded as base64 data URIs) and plain text, which is grouped
    into ``<p>`` paragraphs. All text is HTML-escaped.

    Args:
        page_content: Markdown lines of one page; trailing newline characters
            on a line are ignored.
        base_dir: Directory that image paths are relative to. When ``None``,
            image references are emitted as escaped text instead of images.
            Image paths that resolve outside this directory are never read.

    Returns:
        The HTML fragments joined with newlines.
    """
    import mimetypes  # local import: only needed for the data-URI MIME type

    processed_content: List[str] = []
    paragraph_buffer: List[str] = []
    base_root = Path(base_dir).resolve() if base_dir is not None else None

    def flush_paragraph() -> None:
        """Flush accumulated paragraph content to HTML"""
        if paragraph_buffer:
            joined = '<br/>'.join(_html.escape(l) for l in paragraph_buffer)
            processed_content.append(f'<p>{joined}</p>')
            paragraph_buffer.clear()

    def is_markdown_table_header(s: str) -> bool:
        return '|' in s and ('---' in s or '—' in s)

    def render_markdown_table(lines: List[str]) -> str:
        rows = [l.strip().strip('|').split('|') for l in lines]
        rows = [[_html.escape(c.strip()) for c in r] for r in rows]
        if len(rows) < 2:
            return ""

        # rows[1] is the |---|---| separator and is not rendered
        header = rows[0]
        body = rows[2:]
        thead = '<thead><tr>' + ''.join(f'<th>{c}</th>' for c in header) + '</tr></thead>'
        tbody = '<tbody>' + ''.join('<tr>' + ''.join(f'<td>{c}</td>' for c in r) + '</tr>' for r in body) + '</tbody>'
        return f'<table class="doc-table">{thead}{tbody}</table>'

    def render_image(markdown: str) -> str:
        """Render one ``![caption](path)`` reference as an embedded <figure>."""
        # `*` (not `+`) so images with an empty caption are still recognised
        match = re.match(r'!\[([^\]]*)\]\(([^)]+)\)', markdown)
        if not match or base_root is None:
            return f'<div>{_html.escape(markdown)}</div>'

        caption = _html.escape(match.group(1))
        rel_path = match.group(2).replace('\\\\', '/').replace('\\', '/').lstrip('/')
        try:
            abs_path = (base_root / rel_path).resolve()
            # The markdown may derive from untrusted PDF text: refuse paths that
            # escape base_dir (e.g. "images/../../secret"). relative_to raises
            # ValueError for anything outside base_root.
            abs_path.relative_to(base_root)
            payload = abs_path.read_bytes()
        except (OSError, ValueError) as e:
            print(f"❌ Failed to embed image {rel_path}: {e}")
            return f'<div>{caption} (image not found)</div>'

        mime = mimetypes.guess_type(abs_path.name)[0]
        if not mime or not mime.startswith('image/'):
            mime = 'image/jpeg'  # historical default of this function
        b64 = base64.b64encode(payload).decode('ascii')
        return (
            f'<figure><img src="data:{mime};base64,{b64}" alt="{caption}"/>'
            f'<figcaption>{caption}</figcaption></figure>'
        )

    i = 0
    n = len(page_content)

    while i < n:
        line = page_content[i].rstrip('\r\n')
        stripped = line.strip()

        # Handle image references
        if stripped.startswith('![') and ('](images/' in stripped or '](images\\' in stripped):
            flush_paragraph()
            processed_content.append(render_image(stripped))
            i += 1
            continue

        # Handle markdown tables: a row followed by a |---| separator line
        if (stripped.startswith('|') or stripped.count('|') >= 2) and i + 1 < n and is_markdown_table_header(page_content[i + 1]):
            flush_paragraph()
            table_block = [stripped, page_content[i + 1].strip()]
            i += 2
            while i < n:
                nxt = page_content[i].rstrip('\r\n')
                if nxt.strip() == '' or (not nxt.strip().startswith('|') and nxt.count('|') < 2):
                    break
                table_block.append(nxt.strip())
                i += 1
            html_table = render_markdown_table(table_block)
            if html_table:
                processed_content.append(html_table)
            else:
                paragraph_buffer.extend(table_block)
            continue

        # Handle headers and content
        if stripped.startswith('## '):
            flush_paragraph()
            processed_content.append(f'<h3>{_html.escape(stripped[3:])}</h3>')
        elif stripped.startswith('# '):
            flush_paragraph()
            processed_content.append(f'<h2>{_html.escape(stripped[2:])}</h2>')
        elif stripped == '':
            flush_paragraph()
            processed_content.append('<br/>')
        else:
            # Use the newline-trimmed line so stray "\r"/"\n" never reach the HTML
            paragraph_buffer.append(line)
        i += 1

    flush_paragraph()
    return "\n".join(processed_content)


def run_full_parse(
    pdf_file: str,
    use_vlm: bool,
    vlm_provider: str,
    vlm_api_key: str,
    layout_model_name: str,
    dpi: int,
    min_score: float,
    ocr_lang: str,
    ocr_psm: int,
    ocr_oem: int,
    ocr_extra_config: str,
    box_separator: str,
) -> Tuple[str, Optional[str], List[tuple[str, str]], List[str], str]:
    """Run full PDF parsing with structured output.

    The PDF is copied into a private temp directory, parsed with
    ``StructuredPDFParser`` and the results are then looked up under
    ``outputs/<pdf stem>/full_parse``.

    Returns:
        ``(status, markdown_preview, gallery_items, file_paths, zip_path)``.
        On any failure the status carries the error text and the remaining
        fields are ``None`` / empty; no exception is raised for parser errors.
    """
    if not pdf_file:
        return ("No file provided.", None, [], [], "")

    # Check if Doctra components are available
    if StructuredPDFParser is None:
        return ("❌ Error: Doctra library not properly installed. Please check the requirements.", None, [], [], "")

    # Validate VLM configuration
    vlm_error = validate_vlm_config(use_vlm, vlm_api_key, vlm_provider)
    if vlm_error:
        return (vlm_error, None, [], [], "")

    original_filename = Path(pdf_file).stem

    # Work on a private copy; the temp directory is removed again in `finally`
    # because nothing after parsing reads from it.
    tmp_dir = Path(tempfile.mkdtemp(prefix="doctra_"))
    try:
        input_pdf = tmp_dir / f"{original_filename}.pdf"
        shutil.copy2(pdf_file, input_pdf)

        # Constructing the parser is inside the try as well, so configuration
        # errors are reported like parse errors instead of escaping.
        parser = StructuredPDFParser(
            use_vlm=use_vlm,
            vlm_provider=vlm_provider,
            vlm_api_key=vlm_api_key or None,
            layout_model_name=layout_model_name,
            dpi=int(dpi),
            min_score=float(min_score),
            ocr_lang=ocr_lang,
            ocr_psm=int(ocr_psm),
            ocr_oem=int(ocr_oem),
            ocr_extra_config=ocr_extra_config or "",
            box_separator=box_separator or "\n",
        )
        parser.parse(str(input_pdf))
    except Exception as e:
        import traceback
        traceback.print_exc()
        try:
            # Round-trip through UTF-8 so unencodable characters cannot break the UI
            error_msg = str(e).encode('utf-8', errors='replace').decode('utf-8')
        except Exception:
            error_msg = "<Unicode encoding error>"
        # The failure is not necessarily VLM related, so the message says "Parsing".
        return (f"❌ Parsing failed: {error_msg}", None, [], [], "")
    finally:
        shutil.rmtree(tmp_dir, ignore_errors=True)

    # Find output directory; fall back to the most recently modified entry
    outputs_root = Path("outputs")
    out_dir = outputs_root / original_filename / "full_parse"
    if not out_dir.exists():
        candidates = sorted(outputs_root.glob("*/"), key=lambda p: p.stat().st_mtime, reverse=True)
        out_dir = candidates[0] / "full_parse" if candidates else outputs_root

    if not out_dir.is_dir():
        return (f"❌ Parsing finished but no output directory was found (looked for: {out_dir})", None, [], [], "")

    # Read markdown file if it exists
    md_preview = None
    md_file = next(out_dir.glob("*.md"), None)
    if md_file is not None and md_file.exists():
        try:
            md_preview = md_file.read_text(encoding="utf-8", errors="ignore")
        except Exception:
            md_preview = None

    # Gather output files and create ZIP
    gallery_items, file_paths, zip_path = gather_outputs(
        out_dir,
        zip_filename=original_filename,
        is_structured_parsing=False,
    )

    return (
        f"✅ Parsing completed successfully!\n📁 Output directory: {out_dir}",
        md_preview,
        gallery_items,
        file_paths,
        zip_path,
    )


def run_extract(
    pdf_file: str,
    target: str,
    use_vlm: bool,
    vlm_provider: str,
    vlm_api_key: str,
    layout_model_name: str,
    dpi: int,
    min_score: float,
) -> Tuple[str, str, List[tuple[str, str]], List[str], str]:
    """Run table/chart extraction from PDF.

    Args:
        target: "tables", "charts" or "both"; selects what is extracted and
            which Excel workbook is used for the HTML preview.

    Returns:
        ``(status, tables_html, gallery_items, file_paths, zip_path)``.
        ``tables_html`` is only filled when VLM is enabled and the matching
        workbook exists. Parser failures are reported through ``status``
        rather than raised.
    """
    if not pdf_file:
        return ("No file provided.", "", [], [], "")

    # Check if Doctra components are available
    if ChartTablePDFParser is None:
        return ("❌ Error: Doctra library not properly installed. Please check the requirements.", "", [], [], "")

    # Validate VLM configuration
    vlm_error = validate_vlm_config(use_vlm, vlm_api_key, vlm_provider)
    if vlm_error:
        return (vlm_error, "", [], [], "")

    original_filename = Path(pdf_file).stem
    outputs_root = Path("outputs")

    # Work on a private copy; the temp directory is removed again in `finally`
    # because nothing after parsing reads from it.
    tmp_dir = Path(tempfile.mkdtemp(prefix="doctra_"))
    try:
        input_pdf = tmp_dir / f"{original_filename}.pdf"
        shutil.copy2(pdf_file, input_pdf)

        parser = ChartTablePDFParser(
            extract_charts=(target in ("charts", "both")),
            extract_tables=(target in ("tables", "both")),
            use_vlm=use_vlm,
            vlm_provider=vlm_provider,
            vlm_api_key=vlm_api_key or None,
            layout_model_name=layout_model_name,
            dpi=int(dpi),
            min_score=float(min_score),
        )
        parser.parse(str(input_pdf), str(outputs_root))
    except Exception as e:
        # Report failures through the status field, like the other handlers do.
        import traceback
        traceback.print_exc()
        try:
            error_msg = str(e).encode('utf-8', errors='replace').decode('utf-8')
        except Exception:
            error_msg = "<Unicode encoding error>"
        return (f"❌ Extraction failed: {error_msg}", "", [], [], "")
    finally:
        shutil.rmtree(tmp_dir, ignore_errors=True)

    # Find output directory; fall back to the most recently modified entry
    out_dir = outputs_root / original_filename / "structured_parsing"
    if not out_dir.exists():
        if outputs_root.exists():
            candidates = sorted(outputs_root.glob("*/"), key=lambda p: p.stat().st_mtime, reverse=True)
            out_dir = candidates[0] / "structured_parsing" if candidates else outputs_root
        else:
            outputs_root.mkdir(parents=True, exist_ok=True)
            out_dir = outputs_root

    if not out_dir.is_dir():
        return (f"❌ Extraction finished but no output directory was found (looked for: {out_dir})", "", [], [], "")

    # Determine which kinds to include in outputs based on target selection
    allowed_kinds: Optional[List[str]] = None
    if target in ("tables", "charts"):
        allowed_kinds = [target]
    elif target == "both":
        allowed_kinds = ["tables", "charts"]

    # Gather output files and create ZIP
    gallery_items, file_paths, zip_path = gather_outputs(
        out_dir,
        allowed_kinds,
        zip_filename=original_filename,
        is_structured_parsing=True,
    )

    # Build tables HTML preview from Excel data (when VLM enabled)
    excel_by_target = {
        "tables": "parsed_tables.xlsx",
        "charts": "parsed_charts.xlsx",
        "both": "parsed_tables_charts.xlsx",
    }
    tables_html = ""
    excel_filename = excel_by_target.get(target) if use_vlm else None
    if excel_filename and (out_dir / excel_filename).exists():
        try:
            html_blocks = []
            # Open the workbook once and close it deterministically.
            with pd.ExcelFile(out_dir / excel_filename) as workbook:
                for sheet_name in workbook.sheet_names:
                    df = workbook.parse(sheet_name)
                    if df.empty:
                        continue
                    title = f"<h3>{_html.escape(str(sheet_name))}</h3>"
                    table_html = df.to_html(
                        classes="doc-table",
                        table_id=None,
                        escape=True,
                        index=False,
                        na_rep="",
                    )
                    html_blocks.append(title + table_html)
            tables_html = "\n".join(html_blocks)
        except Exception as e:
            # The preview is best effort: log and continue without it.
            try:
                error_msg = str(e).encode('utf-8', errors='replace').decode('utf-8')
            except Exception:
                error_msg = "<Unicode encoding error>"
            print(f"Error building tables HTML: {error_msg}")
            tables_html = ""

    return (
        f"✅ Parsing completed successfully!\n📁 Output directory: {out_dir}",
        tables_html,
        gallery_items,
        file_paths,
        zip_path,
    )


def run_docres_restoration(
    pdf_file: str,
    task: str,
    device: str,
    dpi: int,
    save_enhanced: bool,
    save_images: bool
) -> Tuple[str, Optional[str], Optional[str], Optional[dict], List[str]]:
    """Run DocRes image restoration on PDF.

    Returns ``(status, original_pdf, enhanced_pdf, metadata, file_paths)``.
    When no file is given, Doctra is unavailable, or restoration raises, the
    status holds the message and the other fields are ``None`` / ``[]``.
    """
    if not pdf_file:
        return ("No file provided.", None, None, None, [])

    # Doctra import failed at module load time
    if DocResUIWrapper is None:
        return ("❌ Error: Doctra library not properly installed. Please check the requirements.", None, None, None, [])

    try:
        # "auto" is passed to the wrapper as None
        docres = DocResUIWrapper(device=(device if device != "auto" else None))

        # Outputs go to outputs/<stem>_docres/
        stem = Path(pdf_file).stem
        output_dir = Path("outputs") / f"{stem}_docres"
        output_dir.mkdir(parents=True, exist_ok=True)

        enhanced_pdf_path = output_dir / f"{stem}_enhanced.pdf"
        docres.restore_pdf(
            pdf_path=pdf_file,
            output_path=str(enhanced_pdf_path),
            task=task,
            dpi=dpi
        )

        # Collect the downloadable files the caller asked for
        file_paths = []
        if save_enhanced and enhanced_pdf_path.exists():
            file_paths.append(str(enhanced_pdf_path))

        if save_images:
            images_dir = output_dir / "enhanced_images"
            if images_dir.exists():
                file_paths.extend(str(img_path) for img_path in sorted(images_dir.glob("*.jpg")))

        enhanced_pdf_file = str(enhanced_pdf_path) if enhanced_pdf_path.exists() else None

        # Summary of the run
        metadata = {
            "task": task,
            "device": str(docres.device),
            "dpi": dpi,
            "original_file": pdf_file,
            "enhanced_file": enhanced_pdf_file,
            "output_directory": str(output_dir)
        }

        return (
            f"✅ DocRes restoration completed successfully!\n📁 Output directory: {output_dir}",
            pdf_file,
            enhanced_pdf_file,
            metadata,
            file_paths,
        )

    except Exception as e:
        return (f"❌ DocRes restoration failed: {str(e)}", None, None, None, [])


def run_enhanced_parse(
    pdf_file: str,
    use_image_restoration: bool,
    restoration_task: str,
    restoration_device: str,
    restoration_dpi: int,
    use_vlm: bool,
    vlm_provider: str,
    vlm_api_key: str,
    layout_model_name: str,
    dpi: int,
    min_score: float,
    ocr_lang: str,
    ocr_psm: int,
    ocr_oem: int,
    ocr_extra_config: str,
    box_separator: str,
) -> Tuple[str, Optional[str], List[str], str, Optional[str], Optional[str], str]:
    """Run enhanced PDF parsing with DocRes image restoration.

    Returns:
        ``(status, html_preview, file_paths, zip_path, original_pdf,
        enhanced_pdf, output_dir)``. ``html_preview`` is the first page (or,
        failing that, the first markdown file found) rendered to HTML. On
        failure the status carries the error text and the other fields are
        ``None`` / empty.
    """
    failure_tail = (None, [], "", None, None, "")

    if not pdf_file:
        return ("No file provided.",) + failure_tail

    # Check if Doctra components are available
    if EnhancedPDFParser is None:
        return ("❌ Error: Doctra library not properly installed. Please check the requirements.",) + failure_tail

    # Validate VLM configuration (a no-op when VLM is disabled)
    vlm_error = validate_vlm_config(use_vlm, vlm_api_key, vlm_provider)
    if vlm_error:
        return (vlm_error,) + failure_tail

    original_filename = Path(pdf_file).stem

    # Work on a private copy; the temp directory is removed again in `finally`
    # because nothing after parsing reads from it.
    tmp_dir = Path(tempfile.mkdtemp(prefix="doctra_enhanced_"))
    try:
        input_pdf = tmp_dir / f"{original_filename}.pdf"
        shutil.copy2(pdf_file, input_pdf)

        parser = EnhancedPDFParser(
            use_image_restoration=use_image_restoration,
            restoration_task=restoration_task,
            restoration_device=restoration_device if restoration_device != "auto" else None,
            restoration_dpi=int(restoration_dpi),
            use_vlm=use_vlm,
            vlm_provider=vlm_provider,
            vlm_api_key=vlm_api_key or None,
            layout_model_name=layout_model_name,
            dpi=int(dpi),
            min_score=float(min_score),
            ocr_lang=ocr_lang,
            ocr_psm=int(ocr_psm),
            ocr_oem=int(ocr_oem),
            ocr_extra_config=ocr_extra_config or "",
            box_separator=box_separator or "\n",
        )

        # Parse the PDF with enhancement
        parser.parse(str(input_pdf))
    except Exception as e:
        import traceback
        traceback.print_exc()
        try:
            # Round-trip through UTF-8 so unencodable characters cannot break the UI
            error_msg = str(e).encode('utf-8', errors='replace').decode('utf-8')
        except Exception:
            error_msg = "<Unicode encoding error>"
        return (f"❌ Enhanced parsing failed: {error_msg}",) + failure_tail
    finally:
        shutil.rmtree(tmp_dir, ignore_errors=True)

    # Find output directory; fall back to the most recently modified entry
    outputs_root = Path("outputs")
    out_dir = outputs_root / original_filename / "enhanced_parse"
    if not out_dir.exists():
        candidates = sorted(outputs_root.glob("*/"), key=lambda p: p.stat().st_mtime, reverse=True)
        out_dir = candidates[0] / "enhanced_parse" if candidates else outputs_root

    # If still no enhanced_parse directory, try to find any directory with enhanced files
    if not out_dir.exists():
        for candidate_dir in outputs_root.rglob("*"):
            if candidate_dir.is_dir() and next(candidate_dir.glob("*enhanced*.pdf"), None) is not None:
                out_dir = candidate_dir
                break

    if not out_dir.is_dir():
        return (f"❌ Enhanced parsing finished but no output directory was found (looked for: {out_dir})",) + failure_tail

    # Load first page content initially; fall back to any markdown file
    md_preview = None
    try:
        md_path = out_dir / "pages" / "page_001.md"
        if not md_path.exists():
            md_path = next(out_dir.glob("*.md"), None)
        if md_path is not None and md_path.exists():
            md_lines = md_path.read_text(encoding="utf-8", errors="ignore").split('\n')
            md_preview = create_page_html_content(md_lines, out_dir)
    except Exception as e:
        print(f"❌ Error loading initial content: {e}")
        md_preview = None

    # Gather output files and create ZIP
    _, file_paths, zip_path = gather_outputs(
        out_dir,
        zip_filename=f"{original_filename}_enhanced",
        is_structured_parsing=False,
    )

    # Look for enhanced PDF file, first in the output directory, then its parent
    enhanced_pdf_path = None
    if use_image_restoration:
        found = next(out_dir.glob("*enhanced*.pdf"), None)
        if found is None:
            found = next(out_dir.parent.glob("*enhanced*.pdf"), None)
        if found is not None:
            enhanced_pdf_path = str(found)

    return (
        f"✅ Enhanced parsing completed successfully!\n📁 Output directory: {out_dir}",
        md_preview,
        file_paths,
        zip_path,
        pdf_file,  # Original PDF path
        enhanced_pdf_path,  # Enhanced PDF path
        str(out_dir),  # Output directory for page-specific content
    )


def create_tips_markdown() -> str:
    """Build the HTML "Tips" card shown in the UI.

    Returns:
        An HTML fragment: a ``div`` with class ``card`` wrapping a bold
        "Tips" title and a bulleted list, one ``<li>`` per tip.
    """
    tips = (
        "On Spaces, set a secret <code>VLM_API_KEY</code> to enable VLM features.",
        "Use <strong>Enhanced Parser</strong> for documents that need image restoration before parsing (scanned docs, low-quality PDFs).",
        "Use <strong>DocRes Image Restoration</strong> for standalone image enhancement without parsing.",
        "DocRes tasks: <code>appearance</code> (default), <code>dewarping</code>, <code>deshadowing</code>, <code>deblurring</code>, <code>binarization</code>, <code>end2end</code>.",
        "Outputs are saved under <code>outputs/&lt;pdf_stem&gt;/</code>.",
        "<strong>Note:</strong> Google Gemini VLM may not be available due to dependency conflicts. Use OpenAI, Anthropic, or other VLM providers.",
    )
    bullet_lines = [f"    <li>{tip}</li>" for tip in tips]

    # The leading empty entry and the trailing four-space entry reproduce the
    # newline after the opening quotes and the indentation before the closing
    # quotes of the equivalent triple-quoted literal.
    card_lines = [
        "",
        '<div class="card">',
        "  <b>Tips</b>",
        "  <ul>",
        *bullet_lines,
        "  </ul>",
        "</div>",
        "    ",
    ]
    return "\n".join(card_lines)


# Create the main Gradio interface.
# Everything declared inside this context belongs to `demo`, which is launched
# from the `__main__` guard at the bottom of the file. THEME and CUSTOM_CSS are
# the module-level styling constants defined near the top of the file.
with gr.Blocks(title="Doctra - Document Parser", theme=THEME, css=CUSTOM_CSS) as demo:
    # Header section: raw HTML title and subtitle, styled through the
    # `.header` and `.subtitle` rules declared in CUSTOM_CSS.
    gr.Markdown(
        """
<div class="header">
  <h2 style="margin:0">Doctra — Document Parser</h2>
  <div class="subtitle">Parse PDFs, extract tables/charts, preview markdown, and download outputs.</div>
</div>
        """
    )
    
    # ----------------------------------------------------------------------
    # "Full Parse" tab: collects a PDF plus VLM / layout / OCR settings and
    # hands them to run_full_parse when the button is clicked.
    # ----------------------------------------------------------------------
    with gr.Tab("Full Parse"):
        # Primary inputs: the document and the optional VLM configuration.
        with gr.Row():
            pdf = gr.File(label="PDF", file_types=[".pdf"])
            use_vlm = gr.Checkbox(value=False, label="Use VLM (optional)")
            vlm_provider = gr.Dropdown(
                choices=["openai", "anthropic", "openrouter", "ollama"],
                value="openai",
                label="VLM Provider",
            )
            vlm_api_key = gr.Textbox(
                label="VLM API Key",
                type="password",
                placeholder="Optional if VLM disabled",
            )

        # Layout-detection and OCR settings, collapsed by default.
        with gr.Accordion("Advanced", open=False):
            with gr.Row():
                layout_model = gr.Textbox(label="Layout model", value="PP-DocLayout_plus-L")
                dpi = gr.Slider(minimum=100, maximum=400, value=200, step=10, label="DPI")
                min_score = gr.Slider(
                    minimum=0,
                    maximum=1,
                    value=0.0,
                    step=0.05,
                    label="Min layout score",
                )
            with gr.Row():
                ocr_lang = gr.Textbox(label="OCR Language", value="eng")
                ocr_psm = gr.Slider(minimum=0, maximum=13, value=4, step=1, label="Tesseract PSM")
                ocr_oem = gr.Slider(minimum=0, maximum=3, value=3, step=1, label="Tesseract OEM")
            with gr.Row():
                ocr_config = gr.Textbox(label="Extra OCR config", value="")
                box_sep = gr.Textbox(label="Box separator", value="\n")

        run_btn = gr.Button("▶ Run Full Parse", variant="primary")
        status = gr.Textbox(label="Status", elem_classes=["status-ok"])

        # Result area: rendered content on the left, page image on the right.
        # NOTE: page_image is not listed in the click handler's outputs below.
        with gr.Row():
            with gr.Column():
                md_preview = gr.HTML(
                    label="Extracted Content",
                    visible=True,
                    elem_classes=["page-content"],
                )
            with gr.Column():
                page_image = gr.Image(label="Page image", interactive=False)
        files_out = gr.Files(label="Download individual output files")
        zip_out = gr.File(label="Download all outputs (ZIP)")

        # Inputs are passed positionally to run_full_parse in this order.
        _full_parse_inputs = [
            pdf,
            use_vlm,
            vlm_provider,
            vlm_api_key,
            layout_model,
            dpi,
            min_score,
            ocr_lang,
            ocr_psm,
            ocr_oem,
            ocr_config,
            box_sep,
        ]
        _full_parse_outputs = [status, md_preview, files_out, zip_out]
        run_btn.click(
            fn=run_full_parse,
            inputs=_full_parse_inputs,
            outputs=_full_parse_outputs,
        )

    # ----------------------------------------------------------------------
    # "Extract Tables/Charts" tab: targeted extraction wired to run_extract.
    # ----------------------------------------------------------------------
    with gr.Tab("Extract Tables/Charts"):
        # Primary inputs: document, extraction target and optional VLM setup.
        with gr.Row():
            pdf_e = gr.File(label="PDF", file_types=[".pdf"])
            target = gr.Dropdown(
                choices=["tables", "charts", "both"],
                value="both",
                label="Target",
            )
            use_vlm_e = gr.Checkbox(value=False, label="Use VLM (optional)")
            vlm_provider_e = gr.Dropdown(
                choices=["openai", "anthropic", "openrouter", "ollama"],
                value="openai",
                label="VLM Provider",
            )
            vlm_api_key_e = gr.Textbox(
                label="VLM API Key",
                type="password",
                placeholder="Optional if VLM disabled",
            )

        # Layout-detection settings, collapsed by default.
        with gr.Accordion("Advanced", open=False):
            with gr.Row():
                layout_model_e = gr.Textbox(label="Layout model", value="PP-DocLayout_plus-L")
                dpi_e = gr.Slider(minimum=100, maximum=400, value=200, step=10, label="DPI")
                min_score_e = gr.Slider(
                    minimum=0,
                    maximum=1,
                    value=0.0,
                    step=0.05,
                    label="Min layout score",
                )

        run_btn_e = gr.Button("▶ Run Extraction", variant="primary")
        status_e = gr.Textbox(label="Status")

        # Result area: extracted data on the left, an image slot on the right.
        # NOTE: image_e is not listed in the click handler's outputs below.
        with gr.Row():
            with gr.Column():
                tables_preview_e = gr.HTML(label="Extracted Data", elem_classes=["page-content"])
            with gr.Column():
                image_e = gr.Image(label="Selected Image", interactive=False)

        files_out_e = gr.Files(label="Download individual output files")
        zip_out_e = gr.File(label="Download all outputs (ZIP)")

        _extract_inputs = [
            pdf_e,
            target,
            use_vlm_e,
            vlm_provider_e,
            vlm_api_key_e,
            layout_model_e,
            dpi_e,
            min_score_e,
        ]
        _extract_outputs = [status_e, tables_preview_e, files_out_e, zip_out_e]
        run_btn_e.click(
            # The uploaded file value is reduced to its ``.name`` attribute
            # (or "" when nothing was uploaded); the remaining arguments are
            # forwarded to run_extract unchanged and in order.
            fn=lambda f, t, a, b, c, d, e, g: run_extract(f.name if f else "", t, a, b, c, d, e, g),
            inputs=_extract_inputs,
            outputs=_extract_outputs,
        )

    # ----------------------------------------------------------------------
    # "DocRes Image Restoration" tab: standalone restoration wired to
    # run_docres_restoration.
    # ----------------------------------------------------------------------
    with gr.Tab("DocRes Image Restoration"):
        # Primary inputs: document, restoration task and compute device.
        with gr.Row():
            pdf_docres = gr.File(label="PDF", file_types=[".pdf"])
            docres_task_standalone = gr.Dropdown(
                choices=[
                    "appearance",
                    "dewarping",
                    "deshadowing",
                    "deblurring",
                    "binarization",
                    "end2end",
                ],
                value="appearance",
                label="Restoration Task",
            )
            docres_device_standalone = gr.Dropdown(
                choices=["auto", "cuda", "cpu"],
                value="auto",
                label="Device",
            )

        # Rendering resolution and which artifacts to keep.
        with gr.Row():
            docres_dpi = gr.Slider(minimum=100, maximum=400, value=200, step=10, label="DPI")
            docres_save_enhanced = gr.Checkbox(value=True, label="Save Enhanced PDF")
            docres_save_images = gr.Checkbox(value=True, label="Save Enhanced Images")

        run_docres_btn = gr.Button("▶ Run DocRes Restoration", variant="primary")
        docres_status = gr.Textbox(label="Status", elem_classes=["status-ok"])

        # Side-by-side comparison area. The hidden File components receive the
        # handler's results; the Image components are not listed in the click
        # handler's outputs below.
        with gr.Row():
            with gr.Column():
                gr.Markdown("### 📄 Original PDF")
                docres_original_pdf = gr.File(
                    label="Original PDF File",
                    interactive=False,
                    visible=False,
                )
                docres_original_page_image = gr.Image(
                    label="Original PDF Page",
                    interactive=False,
                    height=800,
                )
            with gr.Column():
                gr.Markdown("### ✨ Enhanced PDF")
                docres_enhanced_pdf = gr.File(
                    label="Enhanced PDF File",
                    interactive=False,
                    visible=False,
                )
                docres_enhanced_page_image = gr.Image(
                    label="Enhanced PDF Page",
                    interactive=False,
                    height=800,
                )

        docres_files_out = gr.Files(label="Download enhanced files")

        _docres_inputs = [
            pdf_docres,
            docres_task_standalone,
            docres_device_standalone,
            docres_dpi,
            docres_save_enhanced,
            docres_save_images,
        ]
        _docres_outputs = [
            docres_status,
            docres_original_pdf,
            docres_enhanced_pdf,
            docres_files_out,
        ]
        run_docres_btn.click(
            fn=run_docres_restoration,
            inputs=_docres_inputs,
            outputs=_docres_outputs,
        )

    # ----------------------------------------------------------------------
    # "Enhanced Parser" tab: optional image restoration followed by parsing,
    # wired to run_enhanced_parse.
    # ----------------------------------------------------------------------
    with gr.Tab("Enhanced Parser"):
        # Primary inputs: document and restoration configuration.
        with gr.Row():
            pdf_enhanced = gr.File(label="PDF", file_types=[".pdf"])
            use_image_restoration = gr.Checkbox(value=True, label="Use Image Restoration")
            restoration_task = gr.Dropdown(
                choices=[
                    "appearance",
                    "dewarping",
                    "deshadowing",
                    "deblurring",
                    "binarization",
                    "end2end",
                ],
                value="appearance",
                label="Restoration Task",
            )
            restoration_device = gr.Dropdown(
                choices=["auto", "cuda", "cpu"],
                value="auto",
                label="Restoration Device",
            )

        # Optional VLM configuration.
        with gr.Row():
            use_vlm_enhanced = gr.Checkbox(value=False, label="Use VLM (optional)")
            vlm_provider_enhanced = gr.Dropdown(
                choices=["openai", "anthropic", "openrouter", "ollama"],
                value="openai",
                label="VLM Provider",
            )
            vlm_api_key_enhanced = gr.Textbox(
                label="VLM API Key",
                type="password",
                placeholder="Optional if VLM disabled",
            )

        # Restoration, layout-detection and OCR settings, collapsed by default.
        with gr.Accordion("Advanced Settings", open=False):
            with gr.Row():
                restoration_dpi = gr.Slider(
                    minimum=100,
                    maximum=400,
                    value=200,
                    step=10,
                    label="Restoration DPI",
                )
                layout_model_enhanced = gr.Textbox(label="Layout model", value="PP-DocLayout_plus-L")
                dpi_enhanced = gr.Slider(
                    minimum=100,
                    maximum=400,
                    value=200,
                    step=10,
                    label="Processing DPI",
                )
                min_score_enhanced = gr.Slider(
                    minimum=0,
                    maximum=1,
                    value=0.0,
                    step=0.05,
                    label="Min layout score",
                )

            with gr.Row():
                ocr_lang_enhanced = gr.Textbox(label="OCR Language", value="eng")
                ocr_psm_enhanced = gr.Slider(minimum=0, maximum=13, value=4, step=1, label="Tesseract PSM")
                ocr_oem_enhanced = gr.Slider(minimum=0, maximum=3, value=3, step=1, label="Tesseract OEM")

            with gr.Row():
                ocr_config_enhanced = gr.Textbox(label="Extra OCR config", value="")
                box_sep_enhanced = gr.Textbox(label="Box separator", value="\n")

        run_enhanced_btn = gr.Button("▶ Run Enhanced Parse", variant="primary")
        enhanced_status = gr.Textbox(label="Status", elem_classes=["status-ok"])

        # Side-by-side comparison area. The hidden File components receive the
        # handler's results; the Image components are not listed in the click
        # handler's outputs below.
        with gr.Row():
            with gr.Column():
                gr.Markdown("### 📄 Original PDF")
                enhanced_original_pdf = gr.File(
                    label="Original PDF File",
                    interactive=False,
                    visible=False,
                )
                enhanced_original_page_image = gr.Image(
                    label="Original PDF Page",
                    interactive=False,
                    height=600,
                )
            with gr.Column():
                gr.Markdown("### ✨ Enhanced PDF")
                enhanced_enhanced_pdf = gr.File(
                    label="Enhanced PDF File",
                    interactive=False,
                    visible=False,
                )
                enhanced_enhanced_page_image = gr.Image(
                    label="Enhanced PDF Page",
                    interactive=False,
                    height=600,
                )

        with gr.Row():
            enhanced_md_preview = gr.HTML(
                label="Extracted Content",
                visible=True,
                elem_classes=["page-content"],
            )

        enhanced_files_out = gr.Files(label="Download individual output files")
        enhanced_zip_out = gr.File(label="Download all outputs (ZIP)")

        # Inputs are passed positionally to run_enhanced_parse in this order.
        _enhanced_inputs = [
            pdf_enhanced,
            use_image_restoration,
            restoration_task,
            restoration_device,
            restoration_dpi,
            use_vlm_enhanced,
            vlm_provider_enhanced,
            vlm_api_key_enhanced,
            layout_model_enhanced,
            dpi_enhanced,
            min_score_enhanced,
            ocr_lang_enhanced,
            ocr_psm_enhanced,
            ocr_oem_enhanced,
            ocr_config_enhanced,
            box_sep_enhanced,
        ]
        # NOTE(review): the return statement that precedes create_tips_markdown
        # in this file yields seven values (the last being the output
        # directory), while only six components are listed here. If that
        # return belongs to run_enhanced_parse, the seventh value has no
        # destination -- confirm against all of the handler's return paths
        # before adding or removing outputs.
        _enhanced_outputs = [
            enhanced_status,
            enhanced_md_preview,
            enhanced_files_out,
            enhanced_zip_out,
            enhanced_original_pdf,
            enhanced_enhanced_pdf,
        ]
        run_enhanced_btn.click(
            fn=run_enhanced_parse,
            inputs=_enhanced_inputs,
            outputs=_enhanced_outputs,
        )

    # Tips card, rendered after the tabs.
    gr.Markdown(create_tips_markdown())


if __name__ == "__main__":
    # Serve on all interfaces; the port comes from the PORT environment
    # variable when set, otherwise 7860 is used.
    listen_port = int(os.environ.get("PORT", "7860"))
    demo.launch(server_name="0.0.0.0", server_port=listen_port, share=False)