Spaces:

pengyuan
/

DocAI

Running

DocAI / src /docling_parse.py

Pengyuan Li

Add ZeroGPU support for DocAI demo on HuggingFace Spaces

c37e95b 3 days ago

5.5 kB

	"""
	Docling document parsing with figure extraction and markdown export
	"""

	from typing import Dict, List, Any
	import tempfile
	import os


	def parse_document(pdf_bytes: bytes) -> Dict[str, Any]:
	    """
	    Parse a PDF with Docling and return its markdown, plain text and figures.

	    The bytes are written to a temporary ``.pdf`` file because the converter is
	    called with a file path; the file is removed again before returning.

	    Args:
	        pdf_bytes: Raw content of the PDF file.

	    Returns:
	        {
	            "html": "markdown export, HTML-escaped and wrapped in a <pre> element",
	            "text": "full extracted text",
	            "figures": [
	                {
	                    "bbox": [l, t, width, height] of the first provenance entry, or None,
	                    "page": page_no of the first provenance entry (0 if there is none),
	                    "caption": caption text, or "" when none could be resolved,
	                    "image": PIL image of the figure,
	                },
	                ...
	            ]
	        }

	        The function does not raise: when Docling cannot be imported a
	        placeholder dict is returned, and on any other failure "html" and
	        "text" carry the error message. In both cases "figures" is empty.
	    """
	    # Imported locally under a distinct name so it cannot clash with the
	    # HTML string built below.
	    from html import escape

	    try:
	        from docling.document_converter import DocumentConverter, PdfFormatOption
	        from docling.datamodel.base_models import InputFormat

	        tmp_path = None
	        try:
	            # delete=False: the file must outlive the `with` block so the
	            # converter can open it by path; the `finally` below removes it.
	            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
	                tmp_path = tmp.name
	                tmp.write(pdf_bytes)

	            # Picture images must be generated, otherwise the figures carry
	            # no image to return.
	            pdf_format_option = PdfFormatOption()
	            pdf_format_option.pipeline_options.generate_picture_images = True
	            converter = DocumentConverter(
	                format_options={InputFormat.PDF: pdf_format_option}
	            )

	            doc = converter.convert(tmp_path).document

	            # The PDF is untrusted input: escape its text before embedding it
	            # in HTML so document content cannot inject markup.
	            markdown_text = doc.export_to_markdown()
	            html_view = (
	                "<pre style='white-space: pre-wrap; word-wrap: break-word;'>"
	                f"{escape(markdown_text)}</pre>"
	            )

	            return {
	                "html": html_view,
	                "text": doc.export_to_text(),
	                "figures": _extract_figures(doc),
	            }
	        finally:
	            # Clean up the temp file, including when writing or parsing failed.
	            if tmp_path is not None and os.path.exists(tmp_path):
	                os.unlink(tmp_path)

	    except ImportError as e:
	        print(f"⚠️ Docling import error: {e}, using placeholder")
	        return {
	            "html": "<h1>Sample Document</h1><p>Docling not available - using placeholder.</p>",
	            "text": "Sample text from PDF.\n\nDocling not available - using placeholder.",
	            "figures": [],
	        }
	    except Exception as e:
	        print(f"⚠️ Docling parse error: {e}")
	        import traceback
	        traceback.print_exc()
	        return {
	            # Exception text may echo document content, so escape it as well.
	            "html": f"<h1>Error</h1><pre>{escape(str(e))}</pre>",
	            "text": f"Error: {str(e)}",
	            "figures": [],
	        }


	def _extract_figures(doc: Any) -> List[Dict[str, Any]]:
	    """Collect body-layer pictures of ``doc`` that carry an image.

	    Extraction is best-effort: a figure that fails is reported and skipped,
	    and the figures collected before and after it are kept.
	    """
	    figures: List[Dict[str, Any]] = []
	    for figure in getattr(doc, "pictures", None) or []:
	        try:
	            # Skip figures outside the main body (logos, headers, etc.).
	            if figure.content_layer.value != "body":
	                continue
	            # Skip figures for which no image was generated.
	            if not figure.image:
	                continue
	            figures.append(_figure_entry(doc, figure))
	        except Exception as exc:
	            print(f"⚠️ Docling figure skipped: {exc}")
	    return figures


	def _figure_entry(doc: Any, figure: Any) -> Dict[str, Any]:
	    """Build the result dict (bbox, page, caption, image) for one picture."""
	    page_num = 0
	    bbox_list = None
	    if figure.prov:
	        # Only the first provenance entry is used for location.
	        first_prov = figure.prov[0]
	        page_num = first_prov.page_no
	        bbox = first_prov.bbox
	        bbox_list = [bbox.l, bbox.t, bbox.width, bbox.height]

	    return {
	        "bbox": bbox_list,
	        "page": page_num,
	        "caption": _figure_caption(doc, figure),
	        "image": figure.image.pil_image,  # PIL image stored directly
	    }


	def _figure_caption(doc: Any, figure: Any) -> str:
	    """Return the text of the first resolvable caption reference, or ""."""
	    for cap_ref in figure.captions or []:
	        try:
	            # Captions reference text items through a cref like "#/texts/11".
	            cref = cap_ref.cref
	            if not cref.startswith("#/texts/"):
	                continue
	            idx = int(cref.split("/")[-1])
	            # Reject negative indices so a malformed cref cannot silently
	            # pick a text item from the end of the list.
	            if 0 <= idx < len(doc.texts):
	                return doc.texts[idx].text
	        except (AttributeError, IndexError, TypeError, ValueError):
	            # Unusable reference: try the next caption.
	            continue
	    return ""