import math
import os
import random
import xml.etree.ElementTree as ET

try:
    from PIL import Image, ImageDraw, ImageFont
except ImportError:  # Pillow is only needed by XmlParser.visualize_xml
    Image = ImageDraw = ImageFont = None


class XmlParser:
    """Read a PAGE XML document and export its text lines as an image overlay or text.

    The document is parsed once in the constructor; both public methods walk
    the ``TextRegion`` / ``TextLine`` elements of the parsed tree.
    """

    def __init__(self, page_xml="./page_xml.xml"):
        """Parse *page_xml* (decoded as UTF-8) and keep its tree and root element."""
        self.tree = ET.parse(page_xml, parser=ET.XMLParser(encoding="utf-8"))
        self.root = self.tree.getroot()
        # Namespace prefix (in ElementTree ``{uri}`` form) used for every lookup.
        self.namespace = "{http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15}"

    def _polygon_points(self, element):
        """Return the ``Coords/@points`` of *element* as a list of int tuples.

        An element without a ``Coords`` child or ``points`` attribute yields
        an empty list instead of raising.
        """
        coords = element.find(f"{self.namespace}Coords")
        if coords is None:
            return []
        raw_points = coords.attrib.get("points", "").split()
        return [tuple(map(int, point.split(","))) for point in raw_points]

    def _line_text(self, textline):
        """Return the ``TextEquiv/Unicode`` text of *textline*, or ``""`` if absent."""
        text_equiv = textline.find(f"{self.namespace}TextEquiv")
        if text_equiv is None:
            return ""
        unicode_element = text_equiv.find(f"{self.namespace}Unicode")
        if unicode_element is None or unicode_element.text is None:
            # ElementTree reports an empty element (<Unicode/>) as text=None.
            return ""
        return unicode_element.text

    @staticmethod
    def _text_width(draw, text, font):
        """Return the rendered pixel width of *text* for *font*."""
        if hasattr(draw, "textbbox"):
            # ImageDraw.textsize was removed in Pillow 10; textbbox replaces it.
            left, _, right, _ = draw.textbbox((0, 0), text, font=font)
            return int(right - left)
        # Older Pillow releases that predate textbbox.
        return draw.textsize(text, font=font)[0]

    def visualize_xml(
        self,
        background_image,
        font_size=9,
        text_offset=10,
        font_path_tff="./src/htr_pipeline/utils/templates/arial.ttf",
    ):
        """Draw every text line polygon and its transcription over an image.

        Args:
            background_image: Array accepted by ``PIL.Image.fromarray``.
            font_size: Base font size; it is scaled by the widest text region
                (a region 400 px wide keeps the base size).
            text_offset: Number of pixels the text is placed above the top of
                its line polygon.
            font_path_tff: Path to the TrueType font used for the text.

        Returns:
            A new RGBA ``PIL.Image.Image`` with the overlays composited on top.

        Raises:
            ImportError: If Pillow is not installed.
        """
        if Image is None:
            raise ImportError("Pillow is required for XmlParser.visualize_xml")

        image = Image.fromarray(background_image).convert("RGBA")
        regions = self.root.findall(f".//{self.namespace}TextRegion")

        # The widest region drives the font scaling.
        max_bbox_width = 0
        for textregion in regions:
            region_points = self._polygon_points(textregion)
            if not region_points:
                continue
            region_xs = [point[0] for point in region_points]
            max_bbox_width = max(max_bbox_width, max(region_xs) - min(region_xs))

        scaling_factor = max_bbox_width / 400.0
        # Clamp to 1: truetype rejects a zero size (no regions / narrow regions).
        font_size_scaled = max(1, int(font_size * scaling_factor))
        font = ImageFont.truetype(font_path_tff, font_size_scaled)

        for textregion in regions:
            # One random, semi-transparent colour per region.
            fill_color = (
                random.randint(0, 255),
                random.randint(0, 255),
                random.randint(0, 255),
                100,
            )
            for textline in textregion.findall(f".//{self.namespace}TextLine"):
                points = self._polygon_points(textline)
                if len(points) < 2:
                    # Nothing drawable: polygon() needs at least two vertices.
                    continue

                # Each line gets its own layer so overlapping polygons blend.
                poly_image = Image.new("RGBA", image.size)
                poly_draw = ImageDraw.Draw(poly_image)
                poly_draw.polygon(points, fill=fill_color)

                text = self._line_text(textline)
                if text:
                    x_coords = [point[0] for point in points]
                    min_x, max_x = min(x_coords), max(x_coords)
                    min_y = min(point[1] for point in points)
                    text_width = self._text_width(poly_draw, text, font)
                    # Centre horizontally on the polygon, shifted above its top.
                    text_position = (
                        (min_x + max_x) // 2 - text_width // 2,
                        min_y - text_offset,
                    )
                    poly_draw.text(text_position, text, fill=(0, 0, 0), font=font)

                image = Image.alpha_composite(image, poly_image)

        return image

    def xml_to_txt(self, output_file="page_txt.txt"):
        """Write the text of every line to *output_file* as UTF-8.

        One text line is written per output line, and each text region is
        followed by a blank line. Lines without a transcription are written
        as empty lines.
        """
        with open(output_file, "w", encoding="utf-8") as f:
            for textregion in self.root.findall(f".//{self.namespace}TextRegion"):
                for textline in textregion.findall(f".//{self.namespace}TextLine"):
                    f.write(self._line_text(textline) + "\n")
                f.write("\n")