vectorllm_v1 / processing_vectorllm.py

Upload folder using huggingface_hub

bcc6605 verified 2 days ago

4.74 kB

	import numpy as np
	from transformers import AutoImageProcessor, AutoProcessor
	from transformers.feature_extraction_utils import BatchFeature
	from transformers.image_utils import ImageInput
	from transformers.processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
	from transformers.tokenization_utils_base import PreTokenizedInput, TextInput

	from .image_processing_vectorllm import VectorLLMImageProcessor


	class VectorLLMImagesKwargs(ImagesKwargs):
	    """Image keyword arguments that VectorLLM adds on top of ``ImagesKwargs``.

	    ``VectorLLMProcessor`` derives the number of image tokens per image from
	    these two values as ``(resized_size // patch_size) ** 2``.
	    """

	    # Numerator of the per-image token grid computation.
	    # NOTE(review): presumably the side length images are resized to --
	    # confirm against VectorLLMImageProcessor.
	    resized_size: int
	    # Divisor of the per-image token grid computation.
	    # NOTE(review): presumably the side length of one square patch -- confirm.
	    patch_size: int


	class VectorLLMProcessorKwargs(ProcessingKwargs, total=False):
	    """Keyword-argument schema handed to ``_merge_kwargs`` by ``VectorLLMProcessor``.

	    ``_defaults`` holds the values applied when the caller does not override
	    them: text is not padded and ``mm_token_type_ids`` are not returned.
	    """

	    images_kwargs: VectorLLMImagesKwargs

	    _defaults = {
	        "text_kwargs": {
	            "padding": False,
	            "return_mm_token_type_ids": False,
	        },
	    }


	class VectorLLMProcessor(ProcessorMixin):
	    """Processor that pairs a VectorLLM image processor with a Qwen2 tokenizer.

	    When images are supplied, every ``<pixel>`` token found in the text is
	    expanded into ``(resized_size // patch_size) ** 2`` copies of itself
	    before tokenization, so that each image occupies one token per patch.
	    """

	    attributes = ["image_processor", "tokenizer"]
	    image_processor_class = "VectorLLMImageProcessor"
	    tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")

	    def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
	        """Store the image token and its id, then initialise the mixin.

	        Args:
	            image_processor: Image processor instance forwarded to ``ProcessorMixin``.
	            tokenizer: Tokenizer instance. Although it defaults to ``None`` it is
	                required in practice: the image token id is looked up on it here.
	            chat_template: Optional chat template forwarded to ``ProcessorMixin``.
	            **kwargs: Extra arguments forwarded to ``ProcessorMixin.__init__``.
	        """
	        self.image_token = "<pixel>"
	        self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token)
	        super().__init__(image_processor, tokenizer, chat_template=chat_template, **kwargs)

	    def _num_image_tokens(self, resized_size=None, patch_size=None):
	        """Return the token count of one image, ``(resized_size // patch_size) ** 2``.

	        Falsy arguments fall back to the attributes of ``self.image_processor``.
	        """
	        resized_size = resized_size or self.image_processor.resized_size
	        patch_size = patch_size or self.image_processor.patch_size
	        return (resized_size // patch_size) ** 2

	    def __call__(
	        self,
	        images: ImageInput = None,
	        text: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput] = None,
	        **kwargs: Unpack[VectorLLMProcessorKwargs],
	    ) -> BatchFeature:
	        """Preprocess images and text into a single ``BatchFeature``.

	        Args:
	            images: Image input passed to the image processor; skipped when ``None``.
	            text: One prompt or a list of prompts. A non-list value is wrapped
	                into a one-element list.
	            **kwargs: Overrides merged through ``_merge_kwargs`` using the
	                ``VectorLLMProcessorKwargs`` schema.

	        Returns:
	            A ``BatchFeature`` holding the tokenizer outputs merged with the
	            image-processor outputs (plus ``mm_token_type_ids`` on request),
	            converted to ``return_tensors`` when that was given.
	        """
	        output_kwargs = self._merge_kwargs(
	            VectorLLMProcessorKwargs,
	            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
	            **kwargs,
	        )
	        images_kwargs = output_kwargs["images_kwargs"]

	        image_inputs = {}
	        if images is not None:
	            image_inputs = self.image_processor(images=images, **images_kwargs)

	        if not isinstance(text, list):
	            text = [text]

	        # Work on a copy so the caller's list is never modified.
	        text = text.copy()
	        if images is not None:
	            # Honour per-call overrides so the number of expanded tokens agrees
	            # with the kwargs the image processor was just called with.
	            num_image_tokens = self._num_image_tokens(
	                images_kwargs.get("resized_size"),
	                images_kwargs.get("patch_size"),
	            )
	            placeholder = "<|placeholder|>"
	            expanded = placeholder * num_image_tokens
	            for index, prompt in enumerate(text):
	                # Expand through a temporary placeholder: replacing the image
	                # token by copies of itself directly would never terminate.
	                while self.image_token in prompt:
	                    prompt = prompt.replace(self.image_token, expanded, 1)
	                text[index] = prompt.replace(placeholder, self.image_token)

	        text_kwargs = output_kwargs["text_kwargs"]
	        # Tensor conversion is deferred to BatchFeature so it covers all keys.
	        return_tensors = text_kwargs.pop("return_tensors", None)
	        return_mm_token_type_ids = text_kwargs.pop("return_mm_token_type_ids", None)
	        text_inputs = self.tokenizer(text, **text_kwargs)

	        if return_mm_token_type_ids:
	            # Built row by row so that unpadded batches with rows of different
	            # lengths are supported (padding defaults to False).
	            text_inputs["mm_token_type_ids"] = [
	                [int(token_id == self.image_token_id) for token_id in row]
	                for row in text_inputs["input_ids"]
	            ]

	        return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors)

	    def _get_num_multimodal_tokens(self, image_sizes=None, **kwargs):
	        """Report how many tokens/patches each image contributes.

	        Args:
	            image_sizes: One entry per image. Only the number of entries is
	                used; every image yields the same count.
	            **kwargs: Optional ``resized_size`` / ``patch_size`` overrides.

	        Returns:
	            ``MultiModalData`` with ``num_image_tokens`` and ``num_image_patches``
	            set when ``image_sizes`` is given, otherwise an empty one.
	        """
	        vision_data = {}
	        if image_sizes is not None:
	            # Merge into a new dict so the class-level defaults are never mutated.
	            images_kwargs = {
	                **VectorLLMProcessorKwargs._defaults.get("images_kwargs", {}),
	                **kwargs,
	            }
	            tokens_per_image = self._num_image_tokens(
	                images_kwargs.get("resized_size"),
	                images_kwargs.get("patch_size"),
	            )
	            num_image_patches = [tokens_per_image for _ in image_sizes]
	            vision_data.update(
	                {"num_image_tokens": num_image_patches, "num_image_patches": num_image_patches}
	            )
	        return MultiModalData(**vision_data)

	    def post_process_image_text_to_text(
	        self,
	        generated_outputs,
	        skip_special_tokens=True,
	        clean_up_tokenization_spaces=False,
	        **kwargs,
	    ):
	        """Decode generated token ids via the tokenizer's ``batch_decode``.

	        Args:
	            generated_outputs: Token ids passed to ``batch_decode``.
	            skip_special_tokens: Forwarded to ``batch_decode``.
	            clean_up_tokenization_spaces: Forwarded to ``batch_decode``.
	            **kwargs: Extra arguments forwarded to ``batch_decode``.

	        Returns:
	            Whatever ``self.tokenizer.batch_decode`` returns.
	        """
	        return self.tokenizer.batch_decode(
	            generated_outputs,
	            skip_special_tokens=skip_special_tokens,
	            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
	            **kwargs,
	        )

	    @property
	    def model_input_names(self):
	        """Tokenizer input names followed by image-processor ones, de-duplicated in order."""
	        tokenizer_input_names = self.tokenizer.model_input_names
	        image_processor_input_names = self.image_processor.model_input_names
	        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))


	# Register the processor and image-processor classes with the transformers
	# Auto* factories at import time, each under a string key.
	# NOTE(review): the transformers docs describe the first argument of
	# `register` as a config class; a plain string is passed here -- confirm
	# that lookup through AutoProcessor / AutoImageProcessor works as intended.
	AutoProcessor.register("VectorLLMProcessor", VectorLLMProcessor)
	AutoImageProcessor.register("VectorLLMImageProcessor", VectorLLMImageProcessor)