|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Image processor class for CogVLM2."""
|
|
|
from typing import Callable, Dict, List, Optional, Union |
|
|
|
from PIL import Image |
|
|
|
|
|
from transformers.image_processing_utils import BaseImageProcessor, BatchFeature |
|
|
|
|
|
from transformers.image_utils import ( |
|
ImageInput |
|
) |
|
from transformers.utils import TensorType, is_torch_available |
|
|
|
from torchvision import transforms |
|
|
|
|
|
|
|
|
|
class Cogvlm2ImageProcessor(BaseImageProcessor):
    r"""
    Constructs a CogVLM2 image processor.

    Args:
        image_size (`int`, *optional*, defaults to 1344):
            Target size images are resized to; both height and width are set to
            this value, so the output is always square.
        image_mean (`float` or `List[float]`, *optional*):
            Mean to use when normalizing the image, a float or a list of floats
            the length of the number of channels. Defaults to the CLIP/OpenAI
            mean `(0.48145466, 0.4578275, 0.40821073)` when not provided.
        image_std (`float` or `List[float]`, *optional*):
            Standard deviation to use when normalizing the image, a float or a
            list of floats the length of the number of channels. Defaults to the
            CLIP/OpenAI std `(0.26862954, 0.26130258, 0.27577711)` when not
            provided.
        image_num_channels (`int`, *optional*, defaults to 3):
            Number of image channels.
    """

    model_input_names = ["pixel_values"]

    def __init__(
        self,
        image_size: int = 1344,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        image_num_channels: Optional[int] = 3,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)

        self.image_size = image_size
        self.image_num_channels = image_num_channels
        # Fall back to the CLIP/OpenAI normalization constants when no values
        # are supplied; these match the constants that were previously
        # hard-coded inside `preprocess`, so default behavior is unchanged.
        self.image_mean = image_mean if image_mean is not None else (0.48145466, 0.4578275, 0.40821073)
        self.image_std = image_std if image_std is not None else (0.26862954, 0.26130258, 0.27577711)

    def preprocess(
        self,
        images: ImageInput,
        return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
    ) -> BatchFeature:
        """
        Preprocess an image: resize to `self.image_size`, convert to a tensor,
        normalize with `self.image_mean` / `self.image_std`, and add a leading
        batch dimension.

        Args:
            images (`ImageInput`):
                The image to preprocess. NOTE(review): despite the plural name,
                the current implementation applies the transform to `images`
                directly and adds a single batch dimension, so it expects one
                image (e.g. a `PIL.Image.Image`) rather than a list — confirm
                against callers.
            return_tensors (`str` or `TensorType`, *optional*, defaults to
                `TensorType.PYTORCH`):
                The type of tensors to return in the `BatchFeature`.

        Returns:
            `BatchFeature`: a batch feature with a single key, `"pixel_values"`,
            holding the processed image of shape `(1, C, image_size, image_size)`.
        """
        transform = transforms.Compose(
            [
                transforms.Resize(
                    (self.image_size, self.image_size), interpolation=transforms.InterpolationMode.BICUBIC
                ),
                transforms.ToTensor(),
                # Use the configured normalization statistics. Previously the
                # CLIP constants were hard-coded here, silently ignoring the
                # `image_mean` / `image_std` passed to `__init__`.
                transforms.Normalize(self.image_mean, self.image_std),
            ]
        )

        # `unsqueeze(0)` adds the batch dimension expected by the model.
        pixel_values = transform(images).unsqueeze(0)

        return BatchFeature(data={"pixel_values": pixel_values}, tensor_type=return_tensors)
|
|