# DonutMathHWP / utils.py
import os
import re
import torch
import requests
import numpy as np
import PIL.Image
import PIL.ImageOps
from PIL import Image
from typing import Union
from transformers import DonutProcessor, VisionEncoderDecoderModel, VisionEncoderDecoderConfig


def load_image(image: Union[str, PIL.Image.Image]) -> PIL.Image.Image:
"""
Loads `image` to a PIL Image.
Args:
image (`str` or `PIL.Image.Image`):
The image to convert to the PIL Image format.
Returns:
`PIL.Image.Image`:
A PIL Image.
"""
if isinstance(image, str):
if image.startswith("http://") or image.startswith("https://"):
image = PIL.Image.open(requests.get(image, stream=True).raw)
elif os.path.isfile(image):
image = PIL.Image.open(image)
else:
raise ValueError(
f"Incorrect path or url, URLs must start with `http://` or `https://`, and {image} is not a valid path"
)
elif isinstance(image, PIL.Image.Image):
image = image
else:
raise ValueError(
"Incorrect format used for image. Should be an url linking to an image, a local path, or a PIL image."
)
image = PIL.ImageOps.exif_transpose(image)
image = image.convert("RGB")
return image


def aspect_ratio_preserving_resize_and_crop(image, target_width, target_height):
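    """
    Resize `image` so it fits within (target_width, target_height) while preserving its
    aspect ratio, then paste it centered onto a white canvas of exactly that size.
    Images already smaller than the target are only padded, never upscaled; despite the
    name, nothing is cropped.
    """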
width, height = image.size
width_ratio = width / target_width
height_ratio = height / target_height
if width > target_width and height > target_height:
if width_ratio > height_ratio:
new_width = target_width
new_height = int(new_width / (width / height))
else:
new_height = target_height
new_width = int(new_height * (width / height))
elif width > target_width:
new_width = target_width
new_height = int(new_width / (width / height))
elif height > target_height:
new_height = target_height
new_width = int(new_height * (width / height))
else:
new_width, new_height = width, height
resized_image = image.resize((new_width, new_height), Image.LANCZOS)
padded_image = Image.new("RGB", (target_width, target_height), (255, 255, 255))
offset_x = (target_width - new_width) // 2
offset_y = (target_height - new_height) // 2
padded_image.paste(resized_image, (offset_x, offset_y))
return padded_image


class Image2Text:
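    """
    Image-to-text wrapper around a Donut-style VisionEncoderDecoderModel and its DonutProcessor.

    Args:
        model_path (`str`): Hugging Face Hub repo id or local path of the model.
        hf_token (`str`): Hugging Face access token used to load the model and processor.
        device (`str` or `torch.device`): Device the model and inputs are placed on.
        max_length (`int`, defaults to 1024): Maximum number of tokens to generate.
    """
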
def __init__(self, model_path, hf_token, device, max_length=1024):
self.device = device
self.hf_token = hf_token
self.model_path = model_path
self.max_length = max_length
self.model, self.processor = self.load_model(self.model_path)
self.decoder_input_ids = torch.tensor([[self.model.config.decoder_start_token_id]]).to(self.device)

    def load_model(self, model_path):
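        """
        Load the VisionEncoderDecoderConfig, DonutProcessor, and VisionEncoderDecoderModel
        from `model_path`, move the model to the target device, and put it in eval mode.
        """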
config = VisionEncoderDecoderConfig.from_pretrained(model_path, token=self.hf_token)
processor = DonutProcessor.from_pretrained(model_path, token=self.hf_token)
model = VisionEncoderDecoderModel.from_pretrained(model_path, config=config, token=self.hf_token).to(self.device)
model.eval()
return model, processor

    def load_img(self, inputs, width=480, height=480):
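        """
        Load a batch of images (local paths, URLs, or PIL images), resize and pad each to
        `width` x `height`, and return the preprocessed pixel values on the target device.
        """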
images = [load_image(input_) for input_ in inputs]
images = [aspect_ratio_preserving_resize_and_crop(image, target_width=width, target_height=height) for image in images]
imgs = self.processor([image.convert("RGB") for image in images], return_tensors="pt", size=(width, height)).pixel_values
pixel_values = imgs.to(self.device)
return pixel_values

    def generate(self, pixel_values, num_beams):
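        """
        Generate token sequences for every image in the batch, decoding with `num_beams`
        beams from the decoder start token, and return the raw generation output.
        """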
outputs = self.model.generate(
pixel_values,
decoder_input_ids=self.decoder_input_ids.repeat(pixel_values.shape[0], 1),
max_length=self.max_length,
early_stopping=True,
pad_token_id=self.processor.tokenizer.pad_token_id,
eos_token_id=self.processor.tokenizer.eos_token_id,
use_cache=True,
num_beams=num_beams,
bad_words_ids=[[self.processor.tokenizer.unk_token_id]],
return_dict_in_generate=True,
)
return outputs

    def postprocessing(self, outputs):
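        """
        Decode the generated sequences, strip special tokens and the leading task token,
        parse them with `token2json`, and replace `[newline]` markers with real newlines.
        """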
seqs = self.processor.batch_decode(outputs.sequences)
seqs = [seq.replace(self.processor.tokenizer.eos_token, "").replace(self.processor.tokenizer.pad_token, "") for seq in seqs]
seqs = [re.sub(r"<.*?>", "", seq, count=1).strip() for seq in seqs]
seqs = [self.processor.token2json(seq) for seq in seqs]
contents = []
for seq in seqs:
try:
content = seq['content']
            except KeyError:
content = seq['text_sequence']
contents.append('\n'.join(content.split('[newline]')))
return contents

    def get_text(self, img_path, num_beams=4):
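        """
        End-to-end inference: preprocess the images in `img_path` (a list of paths, URLs,
        or PIL images), generate sequences, and return the post-processed text contents.
        """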
pixel_values = self.load_img(img_path)
outputs = self.generate(pixel_values, num_beams)
contents = self.postprocessing(outputs)
return contents
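

# A minimal usage sketch (not part of the original module). The repo id, token source,
# and image path below are placeholders; this assumes a Donut-style checkpoint that is
# compatible with DonutProcessor and VisionEncoderDecoderModel.
if __name__ == "__main__":
    image2text = Image2Text(
        model_path="your-username/your-donut-checkpoint",        # placeholder repo id
        hf_token=os.environ.get("HF_TOKEN"),                     # placeholder token source
        device="cuda" if torch.cuda.is_available() else "cpu",
    )
    # get_text expects a list of image paths, URLs, or PIL images.
    print(image2text.get_text(["sample_page.png"], num_beams=4))  # placeholder image path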