| | """ |
| | Docling document parsing with figure extraction and markdown export |
| | """ |
| |
|
| | from typing import Dict, List, Any |
| | import tempfile |
| | import os |
| |
|
| |
|
def _resolve_caption(doc: Any, figure: Any) -> str:
    """Return the text of the first resolvable caption of *figure*, or ""."""
    for cap_ref in figure.captions or ():
        try:
            cref = getattr(cap_ref, "cref", None)
            # Only references of the form "#/texts/<index>" are resolved.
            if not isinstance(cref, str) or not cref.startswith("#/texts/"):
                continue
            idx = int(cref.rsplit("/", 1)[-1])
            # Reject negative indices too: they would silently select a
            # text from the end of the list instead of the referenced one.
            if 0 <= idx < len(doc.texts):
                return doc.texts[idx].text
        except (AttributeError, TypeError, ValueError):
            # Malformed reference: captions are best-effort, try the next.
            continue
    return ""


def _extract_figures(doc: Any) -> List[Dict[str, Any]]:
    """Collect bbox/page/caption/image records for body-layer pictures."""
    figures: List[Dict[str, Any]] = []
    for figure in getattr(doc, "pictures", None) or ():
        # Skip pictures that are not part of the main body layer.
        if figure.content_layer.value != "body":
            continue

        # Defaults used when the picture carries no provenance entry.
        page_num = 0
        bbox_list = None
        if figure.prov:
            first_prov = figure.prov[0]
            page_num = first_prov.page_no
            bbox = first_prov.bbox
            bbox_list = [bbox.l, bbox.t, bbox.width, bbox.height]

        caption = _resolve_caption(doc, figure)

        # Pictures without an image reference are not reported at all.
        if not figure.image:
            continue
        try:
            pil_image = figure.image.pil_image
        except Exception as exc:
            # Best-effort: one unreadable image must not lose the others.
            print(f"⚠️ Docling figure image error: {exc}")
            continue

        figures.append({
            "bbox": bbox_list,
            "page": page_num,
            "caption": caption,
            "image": pil_image,
        })
    return figures


def parse_document(pdf_bytes: bytes) -> Dict[str, Any]:
    """
    Parse PDF with Docling model and extract markdown + figure regions.

    The bytes are written to a temporary ``.pdf`` file, converted with
    Docling's ``DocumentConverter`` (picture image generation enabled), and
    the temporary file is removed afterwards.

    Args:
        pdf_bytes: Raw content of the PDF file.

    Returns:
        {
            "html": "markdown export wrapped in a <pre> element",
            "text": "full extracted text",
            "figures": [
                {
                    "bbox": [l, t, width, height] of the first provenance
                            entry, or None when there is none,
                    "page": page_no of the first provenance entry
                            (0 when there is none),
                    "caption": "first resolvable caption text, or ''",
                    "image": image object taken from figure.image.pil_image,
                },
                ...
            ]
        }

        This function does not raise on failure. If Docling cannot be
        imported a fixed placeholder result is returned; on any other error
        the "html"/"text" fields carry the error message. In both cases
        "figures" is an empty list.
    """
    # Imported by name: `html` is used as a local variable below.
    from html import escape

    try:
        from docling.document_converter import DocumentConverter, PdfFormatOption
        from docling.datamodel.base_models import InputFormat

        tmp_path = None
        try:
            # Docling is given a path, so persist the bytes first. The file
            # is created inside the try so a failed write is cleaned up too.
            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
                tmp_path = tmp.name
                tmp.write(pdf_bytes)

            # Ask the PDF pipeline to render picture images so that
            # figure.image is populated.
            pdf_format_option = PdfFormatOption()
            pdf_format_option.pipeline_options.generate_picture_images = True

            converter = DocumentConverter(format_options={
                InputFormat.PDF: pdf_format_option
            })

            result = converter.convert(tmp_path)
            doc = result.document

            markdown_text = doc.export_to_markdown()

            # NOTE(review): the markdown is embedded verbatim. Confirm whether
            # export_to_markdown already escapes HTML before escaping here,
            # otherwise entities would be double-escaped.
            html = f"<pre style='white-space: pre-wrap; word-wrap: break-word;'>{markdown_text}</pre>"

            text = doc.export_to_text()

            try:
                figures = _extract_figures(doc)
            except Exception as exc:
                # Figures are optional: keep html/text and report none.
                print(f"⚠️ Docling figure extraction error: {exc}")
                figures = []

            return {
                "html": html,
                "text": text,
                "figures": figures,
            }
        finally:
            if tmp_path is not None and os.path.exists(tmp_path):
                os.unlink(tmp_path)

    except ImportError as e:
        print(f"⚠️ Docling import error: {e}, using placeholder")
        return {
            "html": "<h1>Sample Document</h1><p>Docling not available - using placeholder.</p>",
            "text": "Sample text from PDF.\n\nDocling not available - using placeholder.",
            "figures": [],
        }
    except Exception as e:
        print(f"⚠️ Docling parse error: {e}")
        import traceback
        traceback.print_exc()
        return {
            # Escape the message: it is arbitrary text placed inside HTML.
            "html": f"<h1>Error</h1><pre>{escape(str(e))}</pre>",
            "text": f"Error: {str(e)}",
            "figures": [],
        }
| |
|