# Spaces: Running / Running
# (hosting-page status text captured together with the file; not part of the program)
import argparse
import base64
import html
import json
import mimetypes
import os
from urllib.parse import quote

from PIL import Image
# Page skeleton for the Azure overlay. It is filled in with ``str.format`` using
# the fields ``img_width``, ``img_height``, ``img_src`` and ``boxes``; literal CSS
# braces are therefore doubled (``{{`` / ``}}``).
HTML_TEMPLATE = '''<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Document Overlay</title>
<style>
.overlay-container {{
    position: relative;
    width: {img_width}px;
    height: {img_height}px;
    background: url('{img_src}') no-repeat;
    background-size: 100% 100%;
    border: 1px solid #ccc;
}}
.word-box {{
    position: absolute;
    border: 1px solid #e74c3c;
    background: rgba(255,255,0,0.3);
    font-size: 12px;
    color: #222;
    padding: 0;
    margin: 0;
    line-height: 1;
    pointer-events: none;
    white-space: pre;
    overflow: hidden;
}}
</style>
</head>
<body>
<div class="overlay-container">
{boxes}
</div>
</body>
</html>
'''
def load_image_size(image_path):
    """Return the ``(width, height)`` of the image stored at *image_path*.

    The image is opened through Pillow inside a ``with`` block, so the file
    handle is released as soon as the two dimensions have been read.
    """
    with Image.open(image_path) as img:
        return img.width, img.height
def extract_words(json_data):
    """Collect the words of every page that carry a four-point polygon.

    Args:
        json_data: Parsed layout result; words are read from
            ``json_data['pages'][i]['words']`` (Azure Document Intelligence
            v4 layout). Each word supplies ``content`` and ``polygon``.

    Returns:
        A list of ``{'text': ..., 'polygon': [...]}`` dicts, in document
        order. Words whose polygon does not hold exactly eight numbers
        (x0, y0, ..., x3, y3) are skipped.
    """
    words = []
    # ``or []`` also covers keys that are present but set to null in the JSON.
    for page in json_data.get('pages') or []:
        for word in page.get('words') or []:
            polygon = word.get('polygon') or []
            if len(polygon) == 8:  # 4 points (x0, y0, ..., x3, y3)
                words.append({'text': word.get('content') or '', 'polygon': polygon})
    return words
def polygon_to_bbox(polygon):
    """Return the axis-aligned bounding box of a flat polygon.

    Args:
        polygon: Flat sequence ``[x0, y0, x1, y1, ...]``.

    Returns:
        ``(x_min, y_min, x_max, y_max)``.

    Raises:
        ValueError: If the polygon does not contain at least one x and one y.
    """
    xs = polygon[0::2]
    ys = polygon[1::2]
    if not xs or not ys:
        # min()/max() would raise ValueError here anyway; say why.
        raise ValueError("polygon must contain at least one (x, y) point")
    return min(xs), min(ys), max(xs), max(ys)
def scale_polygon(polygon, scale_x, scale_y):
    """Scale a flat ``[x0, y0, x1, y1, ...]`` polygon.

    Values at even positions (x) are multiplied by *scale_x*, values at odd
    positions (y) by *scale_y*. The polygon may hold any number of values;
    a new list of the same length is returned.
    """
    return [
        value * (scale_x if index % 2 == 0 else scale_y)
        for index, value in enumerate(polygon)
    ]
def generate_azure_overlay_html(image_path, json_path, output_path):
    """Write an HTML page that overlays recognised words on the scanned image.

    Args:
        image_path: Path to the scanned image; used for its pixel size and
            referenced from the HTML by a path relative to the output file.
        json_path: Path to the layout JSON (words under ``pages[i]['words']``).
        output_path: Path of the HTML file to write.

    The word polygons are scaled from the first page's ``width``/``height``
    to the image's pixel size, so the unit of the polygon coordinates does
    not matter as long as it matches the page dimensions.
    """
    img_width, img_height = load_image_size(image_path)

    # The page declares UTF-8, so read and write with that encoding explicitly
    # instead of relying on the platform default.
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Page dimensions come from the first page; a missing or zero dimension
    # falls back to the image size (scale factor 1).
    pages = data.get('pages') or [{}]
    first_page = pages[0]
    doc_width = first_page.get('width') or img_width
    doc_height = first_page.get('height') or img_height
    scale_x = img_width / doc_width
    scale_y = img_height / doc_height

    # NOTE: extract_words() returns the words of every page, and all of them
    # are drawn with the first page's scale factors.
    boxes = []
    for word in extract_words(data):
        scaled_poly = scale_polygon(word['polygon'], scale_x, scale_y)
        # Use the polygon's true extent rather than points 0 and 2, which are
        # not the top-left/bottom-right corners for rotated text.
        left, top, right, bottom = polygon_to_bbox(scaled_poly)
        style = (
            f"left:{left:.2f}px;top:{top:.2f}px;"
            f"width:{right - left:.2f}px;height:{bottom - top:.2f}px;"
        )
        # OCR text may contain '<' or '&'; escape it before embedding.
        label = html.escape(str(word['text'] or ''))
        boxes.append(f'<span class="word-box" style="{style}">{label}</span>')

    # Relative path for the image, with forward slashes and percent-encoding so
    # it is valid inside the CSS url('...') on every platform.
    rel_path = os.path.relpath(image_path, os.path.dirname(output_path))
    img_src = quote(rel_path.replace(os.sep, '/'))

    page_html = HTML_TEMPLATE.format(
        img_width=img_width,
        img_height=img_height,
        img_src=img_src,
        boxes='\n'.join(boxes),
    )
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(page_html)
    print(f"Overlay HTML written to {output_path}")
def generate_docling_overlay(image_path, json_path, output_path):
    """Write an HTML page overlaying the JSON's bounding boxes on the image.

    Text items are outlined in red, pictures in green and groups (the box
    enclosing all text reachable through the group's children) in blue.
    Hovering a box shows its text or label in a tooltip. The image is
    embedded in the page as a base64 data URI.

    Args:
        image_path: Path to the page image.
        json_path: Path to the document JSON (``texts``, ``pictures``,
            ``groups``; each item lists ``prov`` entries with a ``bbox``).
        output_path: Path of the HTML file to write.

    Returns:
        The generated HTML as a string.
    """
    # Embed the image itself so the HTML file is self-contained.
    with open(image_path, "rb") as img_f:
        img_b64 = base64.b64encode(img_f.read()).decode("ascii")
    with Image.open(image_path) as img:
        img_width, img_height = img.size
    # Declare the real image type when the extension is known.
    mime_type = mimetypes.guess_type(image_path)[0] or "image/png"

    with open(json_path, "r", encoding="utf-8") as f:
        doc = json.load(f)

    # Assumption (confirm against the Docling schema): ``pages`` maps the page
    # number as a string to ``{"size": {"width": ..., "height": ...}}``.
    page_table = doc.get("pages")
    if not isinstance(page_table, dict):
        page_table = {}

    def to_top_left(prov):
        """Return prov's bbox as (l, t, r, b) with a top-left origin, or None."""
        bbox = prov.get("bbox")
        if not bbox:
            return None
        l, t, r, b = bbox["l"], bbox["t"], bbox["r"], bbox["b"]
        if bbox.get("coord_origin") == "BOTTOMLEFT":
            # y grows upwards from the page bottom; flip it for CSS.
            page = page_table.get(str(prov.get("page_no"))) or {}
            page_height = (page.get("size") or {}).get("height") or img_height
            t, b = page_height - t, page_height - b
        # Normalise so width and height can never come out negative.
        return min(l, r), min(t, b), max(l, r), max(t, b)

    boxes = []

    def add_boxes(items, box_type, label_of):
        """Append one box per located ``prov`` entry of every item."""
        for item in items:
            for prov in item.get("prov", []):
                rect = to_top_left(prov)
                if rect:
                    boxes.append({
                        "l": rect[0],
                        "t": rect[1],
                        "r": rect[2],
                        "b": rect[3],
                        "text": label_of(item),
                        "type": box_type,
                    })

    # Texts: red
    add_boxes(doc.get("texts", []), "text", lambda item: item.get("text", ""))
    # Pictures: green
    add_boxes(doc.get("pictures", []), "picture", lambda item: item.get("label", "picture"))

    # Groups: blue (enclosing all children)
    groups_by_ref = {g["self_ref"]: g for g in doc.get("groups", []) if "self_ref" in g}
    texts_by_ref = {t["self_ref"]: t for t in doc.get("texts", []) if "self_ref" in t}

    def rects_for_refs(refs, seen):
        """Gather the boxes of all texts reachable through *refs*.

        Nested groups are followed recursively; *seen* holds the group refs
        already visited so that a reference cycle cannot recurse forever.
        """
        rects = []
        for ref in refs:
            target = ref.get("$ref", "")
            if target.startswith("#/texts/"):
                text_item = texts_by_ref.get(target)
                if text_item:
                    for prov in text_item.get("prov", []):
                        rect = to_top_left(prov)
                        if rect:
                            rects.append(rect)
            elif target.startswith("#/groups/") and target not in seen:
                seen.add(target)
                child_group = groups_by_ref.get(target)
                if child_group:
                    rects.extend(rects_for_refs(child_group.get("children", []), seen))
        return rects

    for group in doc.get("groups", []):
        rects = rects_for_refs(group.get("children", []), {group.get("self_ref")})
        if rects:
            boxes.append({
                "l": min(rect[0] for rect in rects),
                "t": min(rect[1] for rect in rects),
                "r": max(rect[2] for rect in rects),
                "b": max(rect[3] for rect in rects),
                "text": group.get("label", "group"),
                "type": "group",
            })

    # The file name ends up in markup, so escape it.
    title = html.escape(os.path.basename(image_path))

    # Build HTML as a list of lines
    html_lines = [
        '<!DOCTYPE html>',
        '<html lang="en">',
        '<head>',
        '<meta charset="UTF-8">',
        f'<title>Overlay for {title}</title>',
        '<style>',
        f'''.container {{
    position: relative;
    width: {img_width}px;
    height: {img_height}px;
    background: #222;
}}
.overlay-img {{
    display: block;
    width: {img_width}px;
    height: {img_height}px;
}}
.bbox {{
    position: absolute;
    box-sizing: border-box;
    cursor: pointer;
}}
.bbox-text {{
    border: 2px solid red;
}}
.bbox-picture {{
    border: 2px solid green;
}}
.bbox-group {{
    border: 2px solid blue;
}}
.tooltip {{
    display: none;
    position: absolute;
    background: #fff;
    color: #222;
    border: 1px solid #888;
    padding: 6px 10px;
    border-radius: 4px;
    z-index: 10;
    pointer-events: none;
    max-width: 400px;
    font-size: 15px;
    box-shadow: 0 2px 8px rgba(0,0,0,0.2);
    white-space: pre-line;
}}''',
        '</style>',
        '</head>',
        '<body>',
        f'<h2>Overlay for {title}</h2>',
        '<div class="container" id="img-container">',
        f'  <img src="data:{mime_type};base64,{img_b64}" class="overlay-img" alt="source image">',
    ]

    # Add bounding boxes
    for i, box in enumerate(boxes):
        left = box["l"]
        top = box["t"]
        width = box["r"] - box["l"]
        height = box["b"] - box["t"]
        raw_text = box["text"] if box["text"] is not None else ""
        # Escape &, <, > and both quote characters so the text cannot end the
        # data-tooltip attribute early.
        tooltip_text = html.escape(str(raw_text), quote=True)
        box_class = f"bbox bbox-{box['type']}"
        html_lines.append(
            f'<div class="{box_class}" '
            f'style="left:{left}px;top:{top}px;width:{width}px;height:{height}px;" '
            f'data-tooltip="{tooltip_text}" '
            f'onmousemove="showTooltip(event, {i})" onmouseleave="hideTooltip()"></div>'
        )

    html_lines.append('<div class="tooltip" id="tooltip"></div>')
    html_lines.append('</div>')
    html_lines.append('''<script>
const tooltip = document.getElementById('tooltip');
function showTooltip(e, idx) {
    const bbox = e.target;
    const text = bbox.getAttribute('data-tooltip');
    tooltip.innerText = text;
    tooltip.style.display = 'block';
    // Position tooltip near mouse, but inside container
    const container = document.getElementById('img-container');
    let x = e.clientX - container.getBoundingClientRect().left + 10;
    let y = e.clientY - container.getBoundingClientRect().top + 10;
    // Clamp to container
    x = Math.min(x, container.offsetWidth - tooltip.offsetWidth - 10);
    y = Math.min(y, container.offsetHeight - tooltip.offsetHeight - 10);
    tooltip.style.left = x + 'px';
    tooltip.style.top = y + 'px';
}
function hideTooltip() {
    tooltip.style.display = 'none';
}
</script>''')
    html_lines.append('</body></html>')

    page_html = '\n'.join(html_lines)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(page_html)
    print(f"Overlay HTML written to {output_path}")
    return page_html
def main():
    """Parse the command line and write the requested overlay.

    ``--json``, ``--image`` and ``--output`` are required. ``--format``
    selects the JSON flavour and defaults to ``azure``, so existing
    invocations keep calling ``generate_azure_overlay_html``.
    """
    parser = argparse.ArgumentParser(
        description="Generate HTML overlay for Azure Document Intelligence or Docling output."
    )
    parser.add_argument('--json', required=True, help='Path to the layout JSON file')
    parser.add_argument('--image', required=True, help='Path to scanned image file')
    parser.add_argument('--output', required=True, help='Path to output HTML file')
    parser.add_argument(
        '--format',
        choices=('azure', 'docling'),
        default='azure',
        help='Layout JSON flavour (default: azure)',
    )
    args = parser.parse_args()

    if args.format == 'docling':
        generate_docling_overlay(args.image, args.json, args.output)
    else:
        generate_azure_overlay_html(args.image, args.json, args.output)
# Run the command-line interface only when executed as a script.
if __name__ == '__main__':
    main()