Spaces:

jobian
/

smoldocling-api

Running

App Files Files Community

smoldocling-api / smoldocling /overlays.py

jobian

Added Smoldocling Package and implemented its first test /parse

ceaf2e8 23 days ago

raw

history blame contribute delete

9.87 kB

	import argparse
	import json
	from PIL import Image
	import os
	import base64

	# Page skeleton for the Azure word overlay, filled in with str.format().
	# Placeholders: {img_width}, {img_height}, {img_src} and {boxes}.
	# Literal CSS braces are doubled ({{ and }}) so str.format() emits them as
	# single braces instead of treating them as replacement fields.
	HTML_TEMPLATE = '''<!DOCTYPE html>
	<html lang="en">
	<head>
	<meta charset="UTF-8">
	<title>Document Overlay</title>
	<style>
	.overlay-container {{
	position: relative;
	width: {img_width}px;
	height: {img_height}px;
	background: url('{img_src}') no-repeat;
	background-size: 100% 100%;
	border: 1px solid #ccc;
	}}
	.word-box {{
	position: absolute;
	border: 1px solid #e74c3c;
	background: rgba(255,255,0,0.3);
	font-size: 12px;
	color: #222;
	padding: 0;
	margin: 0;
	line-height: 1;
	pointer-events: none;
	white-space: pre;
	overflow: hidden;
	}}
	</style>
	</head>
	<body>
	<div class="overlay-container">
	{boxes}
	</div>
	</body>
	</html>
	'''

	def load_image_size(image_path):
	    """Open the image at *image_path* and return its ``(width, height)``.

	    The image is opened through a context manager, so the underlying file is
	    closed before the dimensions are returned.
	    """
	    with Image.open(image_path) as picture:
	        dimensions = (picture.width, picture.height)
	    return dimensions

	def extract_words(json_data):
	    """Collect every word that carries a four-point polygon.

	    Walks ``json_data['pages'][*]['words']`` (Azure Document Intelligence v4
	    layout) and returns a list of ``{'text': ..., 'polygon': ...}`` dicts.
	    Words whose ``polygon`` does not hold exactly eight numbers
	    (x0, y0, ..., x3, y3) are skipped; a missing ``content`` becomes ``''``.
	    """
	    collected = []
	    for page_entry in json_data.get('pages', []):
	        for entry in page_entry.get('words', []):
	            outline = entry.get('polygon', [])
	            if len(outline) != 8:
	                # Not a 4-point polygon: nothing usable to draw.
	                continue
	            collected.append({
	                'text': entry.get('content', ''),
	                'polygon': outline,
	            })
	    return collected

	def polygon_to_bbox(polygon):
	    """Return the axis-aligned bounding box of a flat ``[x0, y0, x1, y1, ...]`` list.

	    The result is the tuple ``(x_min, y_min, x_max, y_max)``: even indices are
	    treated as x coordinates and odd indices as y coordinates.
	    """
	    horizontal, vertical = polygon[::2], polygon[1::2]
	    return min(horizontal), min(vertical), max(horizontal), max(vertical)

	def scale_polygon(polygon, scale_x, scale_y):
	    """Scale a flat ``[x0, y0, x1, y1, ...]`` coordinate list.

	    Values at even indices (x) are multiplied by *scale_x* and values at odd
	    indices (y) by *scale_y*. The whole list is processed, whatever its
	    length, so polygons with fewer or more than four points no longer raise
	    ``IndexError`` or get truncated.

	    Returns:
	        A new list with the same length as *polygon*.
	    """
	    return [
	        value * (scale_x if index % 2 == 0 else scale_y)
	        for index, value in enumerate(polygon)
	    ]

	def generate_azure_overlay_html(image_path, json_path, output_path):
	    """Write an HTML page that overlays Azure-recognised words on the scanned image.

	    Word polygons are read from the JSON via ``extract_words``, scaled from the
	    first page's ``width``/``height`` to the image's pixel size, and emitted as
	    absolutely positioned ``<span class="word-box">`` elements on top of the
	    image, which is used as the container's CSS background.

	    Args:
	        image_path: Path to the scanned image. The HTML references it by a
	            path relative to the directory of ``output_path``.
	        json_path: Path to the analysis JSON file (read as UTF-8).
	        output_path: Path of the HTML file to write (written as UTF-8, matching
	            the ``<meta charset>`` in the template).

	    Returns:
	        None. The page is written to ``output_path`` and a confirmation line is
	        printed.
	    """
	    # Imported locally, and as bare names, so the block does not depend on
	    # extra module-level imports.
	    from html import escape
	    from urllib.parse import quote

	    img_width, img_height = load_image_size(image_path)

	    with open(json_path, 'r', encoding='utf-8') as f:
	        data = json.load(f)

	    # Page dimensions come from the first page. If there are no pages, or the
	    # dimensions are missing/null/zero, fall back to the image size so the
	    # scale factors become 1 instead of raising.
	    # NOTE(review): extract_words() returns words from every page while only
	    # the first page's dimensions are used here - confirm inputs are single-page.
	    pages = data.get('pages') or []
	    first_page = pages[0] if pages else {}
	    doc_width = first_page.get('width') or img_width
	    doc_height = first_page.get('height') or img_height

	    scale_x = img_width / doc_width
	    scale_y = img_height / doc_height

	    boxes = []
	    for word in extract_words(data):
	        scaled_poly = scale_polygon(word['polygon'], scale_x, scale_y)
	        # Use the true bounding box of all four corners: taking corners 0 and 2
	        # with abs() misplaces the box for rotated or re-ordered polygons.
	        x_min, y_min, x_max, y_max = polygon_to_bbox(scaled_poly)
	        style = (
	            f"left:{x_min:.2f}px;top:{y_min:.2f}px;"
	            f"width:{x_max - x_min:.2f}px;height:{y_max - y_min:.2f}px;"
	        )
	        # Recognised text is untrusted content: escape it so characters such
	        # as '<' or '&' cannot break (or inject into) the markup.
	        label = escape(str(word['text']))
	        boxes.append(f'<span class="word-box" style="{style}">{label}</span>')

	    # Relative path for the image, with forward slashes and percent-encoding so
	    # it is a valid URL inside the CSS url('...') (backslashes and quotes would
	    # otherwise be interpreted by the CSS parser).
	    relative_image = os.path.relpath(image_path, os.path.dirname(output_path))
	    img_src = quote(relative_image.replace(os.sep, '/'))

	    page_html = HTML_TEMPLATE.format(
	        img_width=img_width,
	        img_height=img_height,
	        img_src=img_src,
	        boxes='\n'.join(boxes),
	    )

	    with open(output_path, 'w', encoding='utf-8') as f:
	        f.write(page_html)
	    print(f"Overlay HTML written to {output_path}")

	def _prov_bboxes(item):
	    """Return the non-empty ``bbox`` dicts found in an item's ``prov`` entries."""
	    return [prov["bbox"] for prov in item.get("prov", []) if prov.get("bbox")]


	def _overlay_box(bbox, text, box_type):
	    """Build one overlay-box record from a bbox dict, a tooltip text and a type tag."""
	    return {
	        "l": bbox["l"],
	        "t": bbox["t"],
	        "r": bbox["r"],
	        "b": bbox["b"],
	        "text": text,
	        "type": box_type,
	    }


	def _group_member_bboxes(refs, texts_by_ref, groups_by_ref, _visited=None):
	    """Collect the text bboxes reachable from *refs*, descending into nested groups.

	    ``_visited`` remembers group refs already expanded so a cyclic reference
	    cannot recurse forever; skipping a repeat does not change the enclosing box.
	    """
	    visited = set() if _visited is None else _visited
	    found = []
	    for ref in refs:
	        target = ref.get("$ref", "")
	        if target.startswith("#/texts/"):
	            text_item = texts_by_ref.get(target)
	            if text_item:
	                found.extend(_prov_bboxes(text_item))
	        elif target.startswith("#/groups/") and target not in visited:
	            visited.add(target)
	            group = groups_by_ref.get(target)
	            if group:
	                found.extend(
	                    _group_member_bboxes(
	                        group.get("children", []), texts_by_ref, groups_by_ref, visited
	                    )
	                )
	    return found


	def generate_docling_overlay(image_path, json_path, output_path):
	    """
	    Generate an HTML file overlaying bounding boxes from the JSON on the image,
	    with tooltips showing the extracted text on hover.

	    Boxes are drawn for every ``texts`` entry (red), every ``pictures`` entry
	    (green) and every ``groups`` entry (blue, enclosing the text boxes of all
	    its children, including nested groups). The ``l``/``t``/``r``/``b`` values
	    are used as-is as CSS pixel offsets from the image's top-left corner.

	    Args:
	        image_path: Path to the source image; it is embedded in the page as a
	            base64 ``data:`` URL.
	        json_path: Path to the document JSON file (read as UTF-8).
	        output_path: Path of the HTML file to write (UTF-8).

	    Returns:
	        The HTML content as a string (also written to ``output_path``).
	    """
	    # Imported locally, and as bare names, so the block does not depend on
	    # extra module-level imports.
	    from html import escape
	    from mimetypes import guess_type

	    # Load image and encode as base64
	    with open(image_path, "rb") as img_f:
	        img_b64 = base64.b64encode(img_f.read()).decode("utf-8")
	    # The shared helper closes the image after reading its size.
	    img_width, img_height = load_image_size(image_path)
	    # Label the data URL with the real image type; fall back to PNG when the
	    # extension is unknown or does not map to an image type.
	    mime_type = guess_type(image_path)[0]
	    if not mime_type or not mime_type.startswith("image/"):
	        mime_type = "image/png"

	    # Load JSON
	    with open(json_path, "r", encoding="utf-8") as f:
	        doc = json.load(f)

	    texts = doc.get("texts", [])
	    pictures = doc.get("pictures", [])
	    groups = doc.get("groups", [])

	    # Collect bounding boxes and texts
	    boxes = []
	    # Texts: red
	    for text_item in texts:
	        for bbox in _prov_bboxes(text_item):
	            boxes.append(_overlay_box(bbox, text_item.get("text", ""), "text"))
	    # Pictures: green
	    for pic in pictures:
	        for bbox in _prov_bboxes(pic):
	            boxes.append(_overlay_box(bbox, pic.get("label", "picture"), "picture"))
	    # Groups: blue (enclosing all children)
	    groups_by_ref = {g["self_ref"]: g for g in groups if "self_ref" in g}
	    texts_by_ref = {t["self_ref"]: t for t in texts if "self_ref" in t}
	    for group in groups:
	        member_bboxes = _group_member_bboxes(
	            group.get("children", []), texts_by_ref, groups_by_ref
	        )
	        if member_bboxes:
	            enclosing = {
	                "l": min(bb["l"] for bb in member_bboxes),
	                "t": min(bb["t"] for bb in member_bboxes),
	                "r": max(bb["r"] for bb in member_bboxes),
	                "b": max(bb["b"] for bb in member_bboxes),
	            }
	            boxes.append(_overlay_box(enclosing, group.get("label", "group"), "group"))

	    # The file name ends up in markup, so escape it like any other text.
	    title = escape(os.path.basename(image_path))

	    # Build HTML as a list of lines
	    html_lines = [
	        '<!DOCTYPE html>',
	        '<html lang="en">',
	        '<head>',
	        '<meta charset="UTF-8">',
	        f'<title>Overlay for {title}</title>',
	        '<style>',
	        f'''.container {{
	position: relative;
	width: {img_width}px;
	height: {img_height}px;
	background: #222;
	}}
	.overlay-img {{
	display: block;
	width: {img_width}px;
	height: {img_height}px;
	}}
	.bbox {{
	position: absolute;
	box-sizing: border-box;
	cursor: pointer;
	}}
	.bbox-text {{
	border: 2px solid red;
	}}
	.bbox-picture {{
	border: 2px solid green;
	}}
	.bbox-group {{
	border: 2px solid blue;
	}}
	.tooltip {{
	display: none;
	position: absolute;
	background: #fff;
	color: #222;
	border: 1px solid #888;
	padding: 6px 10px;
	border-radius: 4px;
	z-index: 10;
	pointer-events: none;
	max-width: 400px;
	font-size: 15px;
	box-shadow: 0 2px 8px rgba(0,0,0,0.2);
	white-space: pre-line;
	}}''',
	        '</style>',
	        '</head>',
	        '<body>',
	        f'<h2>Overlay for {title}</h2>',
	        '<div class="container" id="img-container">',
	        f' <img src="data:{mime_type};base64,{img_b64}" class="overlay-img" alt="source image">',
	    ]
	    # Add bounding boxes
	    for i, box in enumerate(boxes):
	        left = box["l"]
	        top = box["t"]
	        width = box["r"] - box["l"]
	        height = box["b"] - box["t"]
	        # Escape quotes, '<', '>' and '&' so arbitrary document text is safe
	        # inside the double-quoted data-tooltip attribute; the browser decodes
	        # the entities again before the script reads the attribute.
	        tooltip_text = escape(str(box["text"]), quote=True)
	        box_class = f"bbox bbox-{box['type']}"
	        html_lines.append(
	            f'<div class="{box_class}" '
	            f'style="left:{left}px;top:{top}px;width:{width}px;height:{height}px;" '
	            f'data-tooltip="{tooltip_text}" '
	            f'onmousemove="showTooltip(event, {i})" onmouseleave="hideTooltip()"></div>'
	        )
	    html_lines.append('<div class="tooltip" id="tooltip"></div>')
	    html_lines.append('</div>')
	    html_lines.append('''<script>
	const tooltip = document.getElementById('tooltip');
	function showTooltip(e, idx) {
	const bbox = e.target;
	const text = bbox.getAttribute('data-tooltip');
	tooltip.innerText = text;
	tooltip.style.display = 'block';
	// Position tooltip near mouse, but inside container
	const container = document.getElementById('img-container');
	let x = e.clientX - container.getBoundingClientRect().left + 10;
	let y = e.clientY - container.getBoundingClientRect().top + 10;
	// Clamp to container
	x = Math.min(x, container.offsetWidth - tooltip.offsetWidth - 10);
	y = Math.min(y, container.offsetHeight - tooltip.offsetHeight - 10);
	tooltip.style.left = x + 'px';
	tooltip.style.top = y + 'px';
	}
	function hideTooltip() {
	tooltip.style.display = 'none';
	}
	</script>''')
	    html_lines.append('</body></html>')
	    page_html = '\n'.join(html_lines)
	    with open(output_path, "w", encoding="utf-8") as f:
	        f.write(page_html)
	    print(f"Overlay HTML written to {output_path}")
	    return page_html

	def main():
	    """Command-line entry point: parse the three required paths and build the Azure overlay.

	    ``--json``, ``--image`` and ``--output`` are all mandatory; their values are
	    forwarded to ``generate_azure_overlay_html``.
	    """
	    cli = argparse.ArgumentParser(description="Generate HTML overlay for Azure Document Intelligence output.")
	    required_options = (
	        ('--json', 'Path to Azure Document Intelligence JSON file'),
	        ('--image', 'Path to scanned image file'),
	        ('--output', 'Path to output HTML file'),
	    )
	    for flag, help_text in required_options:
	        cli.add_argument(flag, required=True, help=help_text)
	    options = cli.parse_args()
	    generate_azure_overlay_html(options.image, options.json, options.output)


	# Run the command-line entry point only when executed as a script.
	if __name__ == '__main__':
	    main()