uform-gen2-qwen-500m / processing_uform_gen.py

Upload processing_uform_gen.py with huggingface_hub

ab80423 verified 9 months ago

5.9 kB

	from functools import partial

	import torch
	import torch.nn.functional as F
	from transformers.processing_utils import ProcessorMixin
	from transformers.image_processing_utils import BaseImageProcessor
	from transformers import AutoTokenizer, AutoConfig
	from transformers import BatchFeature

	from PIL import Image
	from torchvision.transforms import (
	Compose,
	Normalize,
	Resize,
	ToTensor
	)


	IMAGENET_MEAN = (0.48145466, 0.4578275, 0.40821073)
	IMAGENET_STD = (0.26862954, 0.26130258, 0.27577711)


	def convert_to_rgb(x):
	return x.convert("RGB")


	def expand2square(image, background_color):
	width, height = image.size
	if width == height:
	return image
	elif width > height:
	result = Image.new(image.mode, (width, width), background_color)
	result.paste(image, (0, (width - height) // 2))
	return result
	else:
	result = Image.new(image.mode, (height, height), background_color)
	result.paste(image, ((height - width) // 2, 0))
	return result


	class ImageProcessor(BaseImageProcessor):
	def __init__(
	self,
	image_size: int,
	**kwargs
	):
	super().__init__(**kwargs)
	self.transform = Compose(
	[
	convert_to_rgb,
	partial(
	expand2square,
	background_color=tuple(int(255 * v) for v in IMAGENET_MEAN)
	),
	Resize(image_size),
	ToTensor(),
	Normalize(
	mean=IMAGENET_MEAN,
	std=IMAGENET_STD,
	),
	]
	)

	def preprocess(
	self,
	image: Image
	):
	return self.transform(image)

	def __repr__(self):
	return repr(self.transform)


	class VLMProcessor(ProcessorMixin):
	def __init__(self, config):
	self.config = config
	self.image_size = config.image_size

	self.feature_extractor = ImageProcessor(self.image_size)
	self.tokenizer = AutoTokenizer.from_pretrained(
	config.text_decoder_name_or_path, additional_special_tokens=["<image>"]
	)
	self.tokenizer.chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<\|im_start\|>' + message['role'] + '\n' + message['content'] + '<\|im_end\|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<\|im_start\|>assistant\n' }}{% endif %}"
	self.num_image_latents = config.num_image_latents
	# super().__init__(self.image_processor, self.tokenizer)

	def __call__(
	self, text=None, images=None, **kwargs
	):
	if text is not None:
	if isinstance(text, str):
	text = [text]

	tokenized_texts = []
	for t in text:
	messages = [
	{"role": "system", "content": "You are a helpful assistant."},
	{"role": "user", "content": f" <image> {t}"},
	]
	tokenized_prompt = self.tokenizer.apply_chat_template(
	messages, add_generation_prompt=True, return_tensors="pt"
	)

	tokenized_texts.append(tokenized_prompt)

	max_len = max(len(t[0]) for t in tokenized_texts)
	input_ids = torch.full(
	(len(tokenized_texts), max_len),
	fill_value=self.tokenizer.pad_token_id,
	dtype=torch.int64,
	)
	attention_mask = torch.full(
	(len(tokenized_texts), max_len), fill_value=0, dtype=torch.int64
	)

	for i, tokens in enumerate(tokenized_texts):
	input_ids[i, -len(tokens[0]) :] = tokens[0]
	attention_mask[i, -len(tokens[0]) :] = 1

	attention_mask = F.pad(
	attention_mask, pad=(0, self.num_image_latents - 1), value=1
	)

	encoding = BatchFeature(
	data={"input_ids": input_ids, "attention_mask": attention_mask}
	)

	if images is not None:
	if isinstance(images, (list, tuple)):
	image_features = torch.empty(
	(len(images), 3, self.image_size , self.image_size),
	dtype=torch.float32,
	)

	for i, image in enumerate(images):
	image_features[i] = self.feature_extractor(image)

	else:
	image_features = self.image_processor(images).unsqueeze(0)

	if text is not None and images is not None:
	encoding["images"] = image_features
	return encoding

	elif text is not None:
	return encoding

	else:
	return BatchFeature(
	data={
	"images": image_features,
	},
	tensor_type=return_tensors,
	)

	def batch_decode(self, args, *kwargs):
	"""
	This method forwards all its arguments to CLIPTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
	refer to the docstring of this method for more information.
	"""
	return self.tokenizer.batch_decode(args, *kwargs)

	def decode(self, args, *kwargs):
	"""
	This method forwards all its arguments to CLIPTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
	the docstring of this method for more information.
	"""
	return self.tokenizer.decode(args, *kwargs)

	@classmethod
	def from_pretrained(
	cls,
	pretrained_model_name_or_path,
	trust_remote_code=False,
	**kwargs
	):
	config = AutoConfig.from_pretrained(
	pretrained_model_name_or_path,
	trust_remote_code=trust_remote_code
	)
	return cls(config)