# coding=utf-8 # Copyright 2022 The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Image processor class for Idefics.""" from typing import Callable, Dict, List, Optional, Union from PIL import Image from transformers.image_processing_utils import BaseImageProcessor, BatchFeature from transformers.image_utils import ( ImageInput ) from transformers.utils import TensorType, is_torch_available from torchvision import transforms class Cogvlm2ImageProcessor(BaseImageProcessor): r""" Constructs a Idefics image processor. Args: image_size (`int`, *optional*, defaults to 224): Resize to image size image_mean (`float` or `List[float]`, *optional*, defaults to `IDEFICS_STANDARD_MEAN`): Mean to use if normalizing the image. This is a float or list of floats the length of the number of channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. Can be overridden by the `image_mean` parameter in the `preprocess` method. image_std (`float` or `List[float]`, *optional*, defaults to `IDEFICS_STANDARD_STD`): Standard deviation to use if normalizing the image. This is a float or list of floats the length of the number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. Can be overridden by the `image_std` parameter in the `preprocess` method. image_num_channels (`int`, *optional*, defaults to 3): Number of image channels. """ model_input_names = ["pixel_values"] def __init__( self, image_size: int = 1344, image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, image_num_channels: Optional[int] = 3, **kwargs, ) -> None: super().__init__(**kwargs) self.image_size = image_size self.image_num_channels = image_num_channels self.image_mean = image_mean self.image_std = image_std def preprocess( self, images: ImageInput, return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH, ) -> TensorType: """ Preprocess a batch of images. Args: images (`ImageInput`): A list of images to preprocess. image_size (`int`, *optional*, defaults to `self.image_size`): Resize to image size image_num_channels (`int`, *optional*, defaults to `self.image_num_channels`): Number of image channels. image_mean (`float` or `List[float]`, *optional*, defaults to `IDEFICS_STANDARD_MEAN`): Mean to use if normalizing the image. This is a float or list of floats the length of the number of channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. Can be overridden by the `image_mean` parameter in the `preprocess` method. image_std (`float` or `List[float]`, *optional*, defaults to `IDEFICS_STANDARD_STD`): Standard deviation to use if normalizing the image. This is a float or list of floats the length of the number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. Can be overridden by the `image_std` parameter in the `preprocess` method. transform (`Callable`, *optional*, defaults to `None`): A custom transform function that accepts a single image can be passed for training. For example, `torchvision.Compose` can be used to compose multiple transforms. If `None` - an inference mode is assumed - and then a preset of inference-specific transforms will be applied to the images Returns: a PyTorch tensor of the processed images """ # For training a user needs to pass their own set of transforms as a Callable. # For reference this is what was used in the original IDEFICS training: transform = transforms.Compose( [ transforms.Resize( (self.image_size, self.image_size), interpolation=transforms.InterpolationMode.BICUBIC ), transforms.ToTensor(), transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)), ] ) images = transform(images).unsqueeze(0) images = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors) return images