jobian's picture
Added Smoldocling Package and implemented its first test /parse
ceaf2e8
import argparse
import json
from PIL import Image
import os
import base64
# Page skeleton for the Azure word overlay; it is filled in with ``str.format``.
# Placeholders: {img_width} and {img_height} size the container in pixels,
# {img_src} is the URL of the background image, and {boxes} receives the
# already-rendered word-box markup.
# Literal CSS braces are doubled ({{ and }}) so ``str.format`` leaves them intact.
HTML_TEMPLATE = '''<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Document Overlay</title>
<style>
.overlay-container {{
position: relative;
width: {img_width}px;
height: {img_height}px;
background: url('{img_src}') no-repeat;
background-size: 100% 100%;
border: 1px solid #ccc;
}}
.word-box {{
position: absolute;
border: 1px solid #e74c3c;
background: rgba(255,255,0,0.3);
font-size: 12px;
color: #222;
padding: 0;
margin: 0;
line-height: 1;
pointer-events: none;
white-space: pre;
overflow: hidden;
}}
</style>
</head>
<body>
<div class="overlay-container">
{boxes}
</div>
</body>
</html>
'''
def load_image_size(image_path):
    """Return the ``(width, height)`` of the image stored at *image_path*.

    The image is opened only long enough to read its dimensions; the
    context manager closes it again before the function returns.
    """
    with Image.open(image_path) as picture:
        width, height = picture.size
    return width, height
def extract_words(json_data):
    """Collect the words of every page that come with a four-point polygon.

    Azure Document Intelligence v4 layout: words live in ``pages[x]['words']``.
    Each returned entry is ``{'text': <content>, 'polygon': <8 numbers>}``;
    words whose polygon does not have exactly 8 values
    (x0, y0, ..., x3, y3) are skipped.  A missing ``content`` becomes ``''``.
    """
    collected = []
    for page in json_data.get('pages', []):
        for entry in page.get('words', []):
            outline = entry.get('polygon', [])
            if len(outline) != 8:
                # Not a 4-point polygon: nothing sensible to draw.
                continue
            collected.append({
                'text': entry.get('content', ''),
                'polygon': outline,
            })
    return collected
def polygon_to_bbox(polygon):
    """Return the axis-aligned bounding box of a flat coordinate sequence.

    *polygon* is read as interleaved values: even indices are x, odd
    indices are y.  The result is ``(x_min, y_min, x_max, y_max)``.
    """
    horizontal, vertical = polygon[::2], polygon[1::2]
    return min(horizontal), min(vertical), max(horizontal), max(vertical)
def scale_polygon(polygon, scale_x, scale_y):
    """Scale a flat ``[x0, y0, x1, y1, ...]`` coordinate list.

    Values at even indices are multiplied by *scale_x*, values at odd
    indices by *scale_y*.  The polygon may have any number of points: the
    input is enumerated rather than indexed over a fixed ``range(8)``, so
    shorter polygons no longer raise ``IndexError`` and longer ones are no
    longer silently truncated.

    Args:
        polygon: Flat sequence of interleaved x/y coordinates.
        scale_x: Factor applied to every x coordinate.
        scale_y: Factor applied to every y coordinate.

    Returns:
        A new list with the scaled coordinates, same length as *polygon*.
    """
    return [
        value * (scale_x if index % 2 == 0 else scale_y)
        for index, value in enumerate(polygon)
    ]
def generate_azure_overlay_html(image_path, json_path, output_path):
    """Write an HTML page that overlays Azure word boxes on the source image.

    Word polygons are read from the JSON file, scaled from the page's
    coordinate space to image pixels (ratio of image size to the first
    page's ``width``/``height``; the page's ``unit`` field is not consulted),
    and rendered as absolutely positioned ``<span>`` elements on top of the
    image, which is referenced by a path relative to the output file.

    Args:
        image_path: Path to the page image.
        json_path: Path to the JSON file holding a ``pages`` list.
        output_path: Path the HTML file is written to (UTF-8).

    Raises:
        KeyError, IndexError: If the JSON has no ``pages`` entry or it is empty.
        ZeroDivisionError: If the first page reports a width or height of 0.
    """
    # Function-scope import: ``html`` is not among the module-level imports.
    from html import escape

    img_width, img_height = load_image_size(image_path)

    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Page dimensions come from the first page only; fall back to the image
    # size (i.e. a scale of 1) when the JSON does not provide them.
    page = data['pages'][0]
    doc_width = page.get('width', img_width)
    doc_height = page.get('height', img_height)

    scale_x = img_width / doc_width
    scale_y = img_height / doc_height

    boxes = []
    for word in extract_words(data):
        scaled_poly = scale_polygon(word['polygon'], scale_x, scale_y)
        # Use the true bounding box of all four points. Taking only points
        # 0 and 2 (and abs() of the difference) misplaces the box whenever
        # the polygon is rotated or its points start at another corner.
        left, top, right, bottom = polygon_to_bbox(scaled_poly)
        style = (
            f"left:{left:.2f}px;top:{top:.2f}px;"
            f"width:{right - left:.2f}px;height:{bottom - top:.2f}px;"
        )
        # Recognized text is arbitrary input; escape it so characters such
        # as '<' or '&' cannot break (or inject into) the markup.
        label = escape(str(word['text']))
        boxes.append(f'<span class="word-box" style="{style}">{label}</span>')

    # Use relative path for image in HTML. Backslashes act as escape
    # characters inside CSS url(), so OS separators are normalized to '/'.
    img_src = os.path.relpath(image_path, os.path.dirname(output_path))
    img_src = img_src.replace(os.sep, '/')

    document = HTML_TEMPLATE.format(
        img_width=img_width,
        img_height=img_height,
        img_src=img_src,
        boxes='\n'.join(boxes),
    )
    # The template declares charset UTF-8, so write the file as UTF-8
    # instead of relying on the platform default encoding.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(document)
    print(f"Overlay HTML written to {output_path}")
def generate_docling_overlay(image_path, json_path, output_path):
    """
    Generate an HTML file overlaying bounding boxes from the JSON on the image, with tooltips
    showing the extracted text on hover.

    Boxes are drawn for every ``prov[*].bbox`` of the entries under ``texts`` (red) and
    ``pictures`` (green), plus one enclosing box per entry under ``groups`` (blue) that spans
    the text boxes reachable through its ``children`` references.  The image is embedded in
    the page as a base64 data URI.

    Args:
        image_path: Path to the page image.
        json_path: Path to the JSON document with ``texts`` / ``pictures`` / ``groups``.
        output_path: Path the HTML file is written to (UTF-8).

    Returns:
        The HTML content as a string.
    """
    # Function-scope imports: ``html`` and ``mimetypes`` are not among the
    # module-level imports of this file.
    import mimetypes
    from html import escape
    from PIL import Image as PILImage

    # Load image and encode as base64
    with open(image_path, "rb") as img_f:
        img_b64 = base64.b64encode(img_f.read()).decode("utf-8")
    # The context manager releases the file handle Pillow keeps open.
    with PILImage.open(image_path) as img:
        img_width, img_height = img.size
    # Label the data URI with the real image type; fall back to PNG when the
    # extension is unknown or does not map to an image type.
    guessed_type = mimetypes.guess_type(str(image_path))[0]
    if guessed_type and guessed_type.startswith("image/"):
        mime_type = guessed_type
    else:
        mime_type = "image/png"

    # Load JSON
    with open(json_path, "r", encoding="utf-8") as f:
        doc = json.load(f)

    # NOTE(review): l/t/r/b are used unchanged as top-left-origin pixel
    # offsets into the image. If the input bboxes use a different origin or
    # page units, they need converting first - confirm against the producer.
    boxes = []

    def add_item_boxes(items, box_type, label_key, default_label):
        # One box per provenance entry that actually carries a bbox.
        for item in items:
            for prov in item.get("prov", []):
                bbox = prov.get("bbox")
                if bbox:
                    boxes.append({
                        "l": bbox["l"],
                        "t": bbox["t"],
                        "r": bbox["r"],
                        "b": bbox["b"],
                        "text": item.get(label_key, default_label),
                        "type": box_type,
                    })

    # Texts: red
    add_item_boxes(doc.get("texts", []), "text", "text", "")
    # Pictures: green
    add_item_boxes(doc.get("pictures", []), "picture", "label", "picture")

    # Groups: blue (enclosing all children)
    groups_by_ref = {g["self_ref"]: g for g in doc.get("groups", [])}
    texts_by_ref = {t["self_ref"]: t for t in doc.get("texts", [])}

    def get_bbox_for_refs(refs, visited):
        # Get all bboxes for the referenced texts (recursively for groups).
        # ``visited`` holds the group refs already expanded so that a cyclic
        # reference cannot recurse forever; skipping a repeat is harmless
        # because the caller only takes the min/max over the result.
        found = []
        for ref in refs:
            target = ref["$ref"]
            if target.startswith("#/texts/"):
                text = texts_by_ref.get(target)
                if text:
                    for prov in text.get("prov", []):
                        bbox = prov.get("bbox")
                        if bbox:
                            found.append(bbox)
            elif target.startswith("#/groups/") and target not in visited:
                visited.add(target)
                group = groups_by_ref.get(target)
                if group:
                    found.extend(get_bbox_for_refs(group.get("children", []), visited))
        return found

    for group in doc.get("groups", []):
        bboxes = get_bbox_for_refs(group.get("children", []), {group["self_ref"]})
        if bboxes:
            boxes.append({
                "l": min(b["l"] for b in bboxes),
                "t": min(b["t"] for b in bboxes),
                "r": max(b["r"] for b in bboxes),
                "b": max(b["b"] for b in bboxes),
                "text": group.get("label", "group"),
                "type": "group",
            })

    # The file name ends up in markup, so escape it like any other text.
    page_title = escape(os.path.basename(image_path))

    # Build HTML as a list of lines
    html_lines = [
        '<!DOCTYPE html>',
        '<html lang="en">',
        '<head>',
        '<meta charset="UTF-8">',
        f'<title>Overlay for {page_title}</title>',
        '<style>',
        f'''.container {{
position: relative;
width: {img_width}px;
height: {img_height}px;
background: #222;
}}
.overlay-img {{
display: block;
width: {img_width}px;
height: {img_height}px;
}}
.bbox {{
position: absolute;
box-sizing: border-box;
cursor: pointer;
}}
.bbox-text {{
border: 2px solid red;
}}
.bbox-picture {{
border: 2px solid green;
}}
.bbox-group {{
border: 2px solid blue;
}}
.tooltip {{
display: none;
position: absolute;
background: #fff;
color: #222;
border: 1px solid #888;
padding: 6px 10px;
border-radius: 4px;
z-index: 10;
pointer-events: none;
max-width: 400px;
font-size: 15px;
box-shadow: 0 2px 8px rgba(0,0,0,0.2);
white-space: pre-line;
}}''',
        '</style>',
        '</head>',
        '<body>',
        f'<h2>Overlay for {page_title}</h2>',
        '<div class="container" id="img-container">',
        f' <img src="data:{mime_type};base64,{img_b64}" class="overlay-img" alt="source image">',
    ]

    # Add bounding boxes
    for i, box in enumerate(boxes):
        left = box["l"]
        top = box["t"]
        width = box["r"] - box["l"]
        height = box["b"] - box["t"]
        # Full attribute escaping: '&' and '<' matter as much as the quotes,
        # otherwise text such as "&lt;" shows up altered in the tooltip.
        text = escape(str(box["text"]), quote=True)
        box_class = f"bbox bbox-{box['type']}"
        html_lines.append(
            f'<div class="{box_class}" style="left:{left}px;top:{top}px;width:{width}px;height:{height}px;" '
            f'data-tooltip="{text}" onmousemove="showTooltip(event, {i})" onmouseleave="hideTooltip()"></div>'
        )
    html_lines.append('<div class="tooltip" id="tooltip"></div>')
    html_lines.append('</div>')
    html_lines.append('''<script>
const tooltip = document.getElementById('tooltip');
function showTooltip(e, idx) {
const bbox = e.target;
const text = bbox.getAttribute('data-tooltip');
tooltip.innerText = text;
tooltip.style.display = 'block';
// Position tooltip near mouse, but inside container
const container = document.getElementById('img-container');
let x = e.clientX - container.getBoundingClientRect().left + 10;
let y = e.clientY - container.getBoundingClientRect().top + 10;
// Clamp to container
x = Math.min(x, container.offsetWidth - tooltip.offsetWidth - 10);
y = Math.min(y, container.offsetHeight - tooltip.offsetHeight - 10);
tooltip.style.left = x + 'px';
tooltip.style.top = y + 'px';
}
function hideTooltip() {
tooltip.style.display = 'none';
}
</script>''')
    html_lines.append('</body></html>')

    html_text = '\n'.join(html_lines)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(html_text)
    print(f"Overlay HTML written to {output_path}")
    return html_text
def main():
    """Command-line entry point: build the Azure overlay page.

    Requires ``--json``, ``--image`` and ``--output`` and passes them on to
    ``generate_azure_overlay_html``.
    """
    cli = argparse.ArgumentParser(
        description="Generate HTML overlay for Azure Document Intelligence output."
    )
    required_options = (
        ('--json', 'Path to Azure Document Intelligence JSON file'),
        ('--image', 'Path to scanned image file'),
        ('--output', 'Path to output HTML file'),
    )
    for flag, help_text in required_options:
        cli.add_argument(flag, required=True, help=help_text)
    options = cli.parse_args()
    generate_azure_overlay_html(options.image, options.json, options.output)


# Run the CLI only when executed as a script, not when imported.
if __name__ == '__main__':
    main()