cogvlm2-llama3-chat-19B-tgi / image_processing_cogvlm2.py

Ubuntu

first

1f6d03b 26 days ago

No virus

5.3 kB

	# coding=utf-8
	# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	"""Image processor class for CogVLM2."""

	from typing import Callable, Dict, List, Optional, Union

	from PIL import Image


	from transformers.image_processing_utils import BaseImageProcessor, BatchFeature


	from transformers.image_utils import (
	ImageInput
	)
	from transformers.utils import TensorType, is_torch_available

	from torchvision import transforms




	class Cogvlm2ImageProcessor(BaseImageProcessor):
	    r"""
	    Constructs a CogVLM2 image processor.

	    Every image is resized to a square of `image_size` x `image_size` pixels with bicubic
	    interpolation, converted to a tensor and normalized channel-wise.

	    Args:
	        image_size (`int`, optional, defaults to 1344):
	            Side length, in pixels, of the square the image is resized to.
	        image_mean (`float` or `List[float]`, optional, defaults to `None`):
	            Mean used to normalize the image. Either a single float or one float per channel. When `None`,
	            `(0.48145466, 0.4578275, 0.40821073)` (the OpenAI CLIP mean) is used.
	        image_std (`float` or `List[float]`, optional, defaults to `None`):
	            Standard deviation used to normalize the image. Either a single float or one float per channel.
	            When `None`, `(0.26862954, 0.26130258, 0.27577711)` (the OpenAI CLIP std) is used.
	        image_num_channels (`int`, optional, defaults to 3):
	            Number of image channels. When it is 3, PIL images in any other mode are converted to RGB
	            before being processed.
	    """

	    model_input_names = ["pixel_values"]

	    # Normalization statistics applied when `image_mean` / `image_std` are left as `None`.
	    # Kept as private class attributes so they are not written into the serialized config.
	    _DEFAULT_IMAGE_MEAN = (0.48145466, 0.4578275, 0.40821073)
	    _DEFAULT_IMAGE_STD = (0.26862954, 0.26130258, 0.27577711)

	    def __init__(
	        self,
	        image_size: int = 1344,
	        image_mean: Optional[Union[float, List[float]]] = None,
	        image_std: Optional[Union[float, List[float]]] = None,
	        image_num_channels: Optional[int] = 3,
	        **kwargs,
	    ) -> None:
	        """Store the preprocessing configuration; extra keyword arguments go to the base class."""
	        super().__init__(**kwargs)

	        self.image_size = image_size
	        self.image_num_channels = image_num_channels
	        self.image_mean = image_mean
	        self.image_std = image_std

	    def preprocess(
	        self,
	        images: ImageInput,
	        return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
	    ) -> BatchFeature:
	        """
	        Preprocess a single image or a batch of images.

	        Args:
	            images (`ImageInput`):
	                A single PIL image, or a list/tuple of PIL images.
	            return_tensors (`str` or `TensorType`, optional, defaults to `TensorType.PYTORCH`):
	                Tensor type forwarded to `BatchFeature` for the returned `pixel_values`.

	        Returns:
	            `BatchFeature`: holds a single `pixel_values` entry of shape
	            `(num_images, channels, image_size, image_size)`. A single input image yields a batch of one.

	        Raises:
	            ValueError: If `images` is an empty list or tuple.
	        """
	        # torchvision's `ToTensor` already produces torch tensors, so torch is guaranteed to be
	        # installed here; it is imported locally because the module does not import it at top level.
	        import torch

	        batch = list(images) if isinstance(images, (list, tuple)) else [images]
	        if not batch:
	            raise ValueError("`images` must contain at least one image.")

	        # Honour the statistics configured on the instance, falling back to the defaults.
	        mean = self._DEFAULT_IMAGE_MEAN if self.image_mean is None else self.image_mean
	        std = self._DEFAULT_IMAGE_STD if self.image_std is None else self.image_std

	        transform = transforms.Compose(
	            [
	                transforms.Resize(
	                    (self.image_size, self.image_size),
	                    interpolation=transforms.InterpolationMode.BICUBIC,
	                ),
	                transforms.ToTensor(),
	                transforms.Normalize(mean, std),
	            ]
	        )

	        pixel_values = torch.stack([transform(self._ensure_rgb(image)) for image in batch])

	        return BatchFeature(data={"pixel_values": pixel_values}, tensor_type=return_tensors)

	    def _ensure_rgb(self, image):
	        """Return `image` converted to RGB when three channels are expected and it is a non-RGB PIL image."""
	        # RGBA / grayscale / palette images would otherwise fail in `Normalize`, whose
	        # three-channel statistics cannot be applied to a different channel count.
	        if self.image_num_channels == 3 and isinstance(image, Image.Image) and image.mode != "RGB":
	            return image.convert("RGB")
	        return image