# pta-text-v0.1 / utils.py
# gitlost-murali's picture
# initial checkpoint inference push
# da59cbe
# raw
# history blame
# 5.59 kB
import io
import os
import textwrap
from typing import Dict, Optional, Tuple
from huggingface_hub import hf_hub_download
from PIL import Image, ImageDraw, ImageFont
# Identifier passed as the first argument to `hf_hub_download` when fetching the
# fallback font file "Arial.TTF". Despite the name, it is used as a Hub repo id,
# not as a local filesystem path.
DEFAULT_FONT_PATH = "ybelkada/fonts"
def download_default_font():
    """
    Fetch the default font file and return whatever `hf_hub_download` returns for it.

    The file requested is "Arial.TTF" from the repository named by
    `DEFAULT_FONT_PATH`.

    Returns:
        The result of `hf_hub_download` (presumably the local path of the
        downloaded/cached font file — confirm against the huggingface_hub docs).
    """
    return hf_hub_download(DEFAULT_FONT_PATH, "Arial.TTF")
def render_text(
    text: str,
    text_size: int = 36,
    text_color: str = "black",
    background_color: str = "white",
    left_padding: int = 5,
    right_padding: int = 5,
    top_padding: int = 5,
    bottom_padding: int = 5,
    font_bytes: Optional[bytes] = None,
    font_path: Optional[str] = None,
) -> Image.Image:
    """
    Render text onto a freshly created RGB image. This script is entirely adapted from the original script that can
    be found here:
    https://github.com/google-research/pix2struct/blob/main/pix2struct/preprocessing/preprocessing_utils.py

    The text is first wrapped with `textwrap.TextWrapper(width=80)`, then measured on a temporary canvas so that the
    final image is exactly as large as the rendered text plus the requested padding.

    Font selection precedence: `font_path` (if given) wins over `font_bytes`; if neither is given, the default font
    ("Arial.TTF" from `DEFAULT_FONT_PATH`) is downloaded with `hf_hub_download` and a warning is emitted.

    Args:
        text (`str`):
            Text to render.
        text_size (`int`, *optional*, defaults to 36):
            Size of the text.
        text_color (`str`, *optional*, defaults to `"black"`):
            Color of the text.
        background_color (`str`, *optional*, defaults to `"white"`):
            Color of the background.
        left_padding (`int`, *optional*, defaults to 5):
            Padding on the left.
        right_padding (`int`, *optional*, defaults to 5):
            Padding on the right.
        top_padding (`int`, *optional*, defaults to 5):
            Padding on the top.
        bottom_padding (`int`, *optional*, defaults to 5):
            Padding on the bottom.
        font_bytes (`bytes`, *optional*):
            Bytes of the font to use. If `None`, the default font will be used.
        font_path (`str`, *optional*):
            Path to the font to use. If `None`, the default font will be used.

    Returns:
        `Image.Image`: An RGB image containing the wrapped text drawn at `(left_padding, top_padding)`.
    """
    # Local import: `warnings` is not among the file-level imports.
    import warnings

    # Add new lines so that each line is no more than 80 characters.
    wrapper = textwrap.TextWrapper(width=80)
    wrapped_text = "\n".join(wrapper.wrap(text=text))

    if font_path is not None:
        font_source = font_path
    elif font_bytes is not None:
        font_source = io.BytesIO(font_bytes)
    else:
        # Fall back to the default font, as documented. The previous code downloaded
        # the font and then raised, which made this fallback unreachable.
        font_source = hf_hub_download(DEFAULT_FONT_PATH, "Arial.TTF")
        warnings.warn(
            "Neither font_bytes nor font_path was provided. "
            f"Using default font {font_source}."
        )
    font = ImageFont.truetype(font_source, encoding="UTF-8", size=text_size)

    # Use a temporary canvas to determine the width and height in pixels when
    # rendering the text.
    temp_draw = ImageDraw.Draw(Image.new("RGB", (1, 1), background_color))
    # The bbox is anchored at (0, 0), so its right/bottom edges are used as the
    # text's width/height.
    _, _, text_width, text_height = temp_draw.textbbox((0, 0), wrapped_text, font)

    # Create the actual image with a bit of padding around the text.
    image_width = text_width + left_padding + right_padding
    image_height = text_height + top_padding + bottom_padding
    image = Image.new("RGB", (image_width, image_height), background_color)
    draw = ImageDraw.Draw(image)
    draw.text(
        xy=(left_padding, top_padding), text=wrapped_text, fill=text_color, font=font
    )
    return image
# Adapted from https://github.com/google-research/pix2struct/blob/0e1779af0f4db4b652c1d92b3bbd2550a7399123/pix2struct/preprocessing/preprocessing_utils.py#L87
def render_header(
    image: Image.Image, header: str, bbox: Dict[str, float], font_path: str, **kwargs
) -> Tuple[Image.Image, Dict[str, float], Dict[str, int]]:
    """
    Renders the input text as a header on the input image and updates the bounding box.

    The header is rendered with `render_text`, both the header and the image are resized to a common width
    (the larger of the two widths, each keeping its own aspect ratio), and the header is stacked on top of the image
    on a white RGB canvas.

    Args:
        image (Image.Image):
            The image to render the header on.
        header (str):
            The header text.
        bbox (Dict[str, float]):
            The bounding box in relative position (0-1), with keys "xmin", "ymin", "xmax" and "ymax".
        font_path (str):
            Path to the font file used to render the header. Must exist on disk.
        **kwargs:
            Extra keyword arguments forwarded to `render_text` (e.g. `text_size`, paddings, colors).

    Returns:
        Tuple[Image.Image, Dict[str, float], Dict[str, int]]:
            - The image with the header rendered on top.
            - The updated bounding box (same keys as `bbox`); the x values are unchanged, the y values are
              re-expressed relative to the new total height.
            - The pixel sizes of the result: "width", "height" (resized image without the header),
              "header_height" and "total_height".

    Raises:
        FileNotFoundError: If `font_path` does not exist. (This was previously an `assert`, which is skipped
            when Python runs with `-O`.)
    """
    if not os.path.exists(font_path):
        raise FileNotFoundError(f"Font path {font_path} does not exist.")

    header_image = render_text(text=header, font_path=font_path, **kwargs)

    # Scale both parts to the wider of the two widths, preserving each aspect ratio.
    new_width = max(header_image.width, image.width)
    new_height = int(image.height * (new_width / image.width))
    new_header_height = int(header_image.height * (new_width / header_image.width))

    new_image = Image.new("RGB", (new_width, new_height + new_header_height), "white")
    new_image.paste(header_image.resize((new_width, new_header_height)), (0, 0))
    new_image.paste(image.resize((new_width, new_height)), (0, new_header_height))

    new_total_height = new_image.height
    new_bbox = {
        "xmin": bbox["xmin"],
        # shift ymin down by the header height, relative to the new total height
        "ymin": ((bbox["ymin"] * new_height) + new_header_height) / new_total_height,
        "xmax": bbox["xmax"],
        # shift ymax down by the header height, relative to the new total height
        "ymax": ((bbox["ymax"] * new_height) + new_header_height) / new_total_height,
    }

    return (
        new_image,
        new_bbox,
        {
            "width": new_width,
            "height": new_height,
            "header_height": new_header_height,
            "total_height": new_total_height,
        },
    )