# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import warnings
from typing import Iterable, List, Optional, Tuple, Union

import numpy as np

from .image_utils import (
    ChannelDimension,
    ImageInput,
    get_channel_dimension_axis,
    get_image_size,
    infer_channel_dimension_format,
)
from .utils import ExplicitEnum, TensorType, is_jax_tensor, is_tf_tensor, is_torch_tensor
from .utils.import_utils import (
    is_flax_available,
    is_tf_available,
    is_torch_available,
    is_vision_available,
    requires_backends,
)


if is_vision_available():
    import PIL

    from .image_utils import PILImageResampling

if is_torch_available():
    import torch

if is_tf_available():
    import tensorflow as tf

if is_flax_available():
    import jax.numpy as jnp


def to_channel_dimension_format(
    image: np.ndarray,
    channel_dim: Union[ChannelDimension, str],
    input_channel_dim: Optional[Union[ChannelDimension, str]] = None,
) -> np.ndarray:
    """
    Converts `image` to the channel dimension format specified by `channel_dim`.

    Args:
        image (`numpy.ndarray`):
            The image to have its channel dimension set.
        channel_dim (`ChannelDimension`):
            The channel dimension format to use.
        input_channel_dim (`ChannelDimension`, *optional*):
            The channel dimension format of the input image. If not provided, it will be inferred from the input image.

    Returns:
        `np.ndarray`: The image with the channel dimension set to `channel_dim`.
    """
    if not isinstance(image, np.ndarray):
        raise ValueError(f"Input image must be of type np.ndarray, got {type(image)}")

    if input_channel_dim is None:
        input_channel_dim = infer_channel_dimension_format(image)

    target_channel_dim = ChannelDimension(channel_dim)
    if input_channel_dim == target_channel_dim:
        return image

    if target_channel_dim == ChannelDimension.FIRST:
        image = image.transpose((2, 0, 1))
    elif target_channel_dim == ChannelDimension.LAST:
        image = image.transpose((1, 2, 0))
    else:
        raise ValueError(f"Unsupported channel dimension format: {channel_dim}")

    return image


def rescale(
    image: np.ndarray,
    scale: float,
    data_format: Optional[ChannelDimension] = None,
    dtype: np.dtype = np.float32,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> np.ndarray:
    """
    Rescales `image` by `scale`.

    Args:
        image (`np.ndarray`):
            The image to rescale.
        scale (`float`):
            The scale to use for rescaling the image.
        data_format (`ChannelDimension`, *optional*):
            The channel dimension format of the image. If not provided, it will be the same as the input image.
        dtype (`np.dtype`, *optional*, defaults to `np.float32`):
            The dtype of the output image. Defaults to `np.float32`. Used for backwards compatibility with feature
            extractors.
        input_data_format (`ChannelDimension`, *optional*):
            The channel dimension format of the input image. If not provided, it will be inferred from the input image.

    Returns:
        `np.ndarray`: The rescaled image.
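
    Example (an illustrative sketch; the small `uint8` input is an assumption for demonstration):

    ```python
    >>> import numpy as np

    >>> image = np.array([[0, 128, 255]], dtype=np.uint8)
    >>> scaled = rescale(image, scale=1 / 255)  # maps pixel values into [0, 1]
    >>> scaled.dtype
    dtype('float32')
    >>> float(scaled.max())
    1.0
    ```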
""" if not isinstance(image, np.ndarray): raise ValueError(f"Input image must be of type np.ndarray, got {type(image)}") rescaled_image = image * scale if data_format is not None: rescaled_image = to_channel_dimension_format(rescaled_image, data_format, input_data_format) rescaled_image = rescaled_image.astype(dtype) return rescaled_image def _rescale_for_pil_conversion(image): """ Detects whether or not the image needs to be rescaled before being converted to a PIL image. The assumption is that if the image is of type `np.float` and all values are between 0 and 1, it needs to be rescaled. """ if image.dtype == np.uint8: do_rescale = False elif np.allclose(image, image.astype(int)): if np.all(0 <= image) and np.all(image <= 255): do_rescale = False else: raise ValueError( "The image to be converted to a PIL image contains values outside the range [0, 255], " f"got [{image.min()}, {image.max()}] which cannot be converted to uint8." ) elif np.all(0 <= image) and np.all(image <= 1): do_rescale = True else: raise ValueError( "The image to be converted to a PIL image contains values outside the range [0, 1], " f"got [{image.min()}, {image.max()}] which cannot be converted to uint8." ) return do_rescale def to_pil_image( image: Union[np.ndarray, "PIL.Image.Image", "torch.Tensor", "tf.Tensor", "jnp.ndarray"], do_rescale: Optional[bool] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None, ) -> "PIL.Image.Image": """ Converts `image` to a PIL Image. Optionally rescales it and puts the channel dimension back as the last axis if needed. Args: image (`PIL.Image.Image` or `numpy.ndarray` or `torch.Tensor` or `tf.Tensor`): The image to convert to the `PIL.Image` format. do_rescale (`bool`, *optional*): Whether or not to apply the scaling factor (to make pixel values integers between 0 and 255). Will default to `True` if the image type is a floating type and casting to `int` would result in a loss of precision, and `False` otherwise. input_data_format (`ChannelDimension`, *optional*): The channel dimension format of the input image. If unset, will use the inferred format from the input. Returns: `PIL.Image.Image`: The converted image. """ requires_backends(to_pil_image, ["vision"]) if isinstance(image, PIL.Image.Image): return image # Convert all tensors to numpy arrays before converting to PIL image if is_torch_tensor(image) or is_tf_tensor(image): image = image.numpy() elif is_jax_tensor(image): image = np.array(image) elif not isinstance(image, np.ndarray): raise ValueError("Input image type not supported: {}".format(type(image))) # If the channel as been moved to first dim, we put it back at the end. image = to_channel_dimension_format(image, ChannelDimension.LAST, input_data_format) # If there is a single channel, we squeeze it, as otherwise PIL can't handle it. image = np.squeeze(image, axis=-1) if image.shape[-1] == 1 else image # PIL.Image can only store uint8 values so we rescale the image to be between 0 and 255 if needed. 
    do_rescale = _rescale_for_pil_conversion(image) if do_rescale is None else do_rescale

    if do_rescale:
        image = rescale(image, 255)

    image = image.astype(np.uint8)
    return PIL.Image.fromarray(image)


# Logic adapted from torchvision resizing logic: https://github.com/pytorch/vision/blob/511924c1ced4ce0461197e5caa64ce5b9e558aab/torchvision/transforms/functional.py#L366
def get_resize_output_image_size(
    input_image: np.ndarray,
    size: Union[int, Tuple[int, int], List[int], Tuple[int]],
    default_to_square: bool = True,
    max_size: Optional[int] = None,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> tuple:
    """
    Find the target (height, width) dimension of the output image after resizing given the input image and the desired
    size.

    Args:
        input_image (`np.ndarray`):
            The image to resize.
        size (`int` or `Tuple[int, int]` or `List[int]` or `Tuple[int]`):
            The size to use for resizing the image. If `size` is a sequence like (h, w), output size will be matched to
            this.

            If `size` is an int and `default_to_square` is `True`, then image will be resized to (size, size). If
            `size` is an int and `default_to_square` is `False`, then smaller edge of the image will be matched to this
            number. i.e, if height > width, then image will be rescaled to (size * height / width, size).
        default_to_square (`bool`, *optional*, defaults to `True`):
            How to convert `size` when it is a single int. If set to `True`, the `size` will be converted to a square
            (`size`, `size`). If set to `False`, will replicate
            [`torchvision.transforms.Resize`](https://pytorch.org/vision/stable/transforms.html#torchvision.transforms.Resize)
            with support for resizing only the smallest edge and providing an optional `max_size`.
        max_size (`int`, *optional*):
            The maximum allowed for the longer edge of the resized image: if the longer edge of the image is greater
            than `max_size` after being resized according to `size`, then the image is resized again so that the longer
            edge is equal to `max_size`. As a result, `size` might be overruled, i.e the smaller edge may be shorter
            than `size`. Only used if `default_to_square` is `False`.
        input_data_format (`ChannelDimension`, *optional*):
            The channel dimension format of the input image. If unset, will use the inferred format from the input.

    Returns:
        `tuple`: The target (height, width) dimension of the output image after resizing.
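
    Example (illustrative; the input shape is an assumption for demonstration):

    ```python
    >>> import numpy as np

    >>> image = np.zeros((480, 640, 3))  # (height, width, num_channels)
    >>> get_resize_output_image_size(image, size=256, default_to_square=False)  # short edge -> 256
    (256, 341)
    >>> get_resize_output_image_size(image, size=224)  # default_to_square=True
    (224, 224)
    ```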
""" if isinstance(size, (tuple, list)): if len(size) == 2: return tuple(size) elif len(size) == 1: # Perform same logic as if size was an int size = size[0] else: raise ValueError("size must have 1 or 2 elements if it is a list or tuple") if default_to_square: return (size, size) height, width = get_image_size(input_image, input_data_format) short, long = (width, height) if width <= height else (height, width) requested_new_short = size new_short, new_long = requested_new_short, int(requested_new_short * long / short) if max_size is not None: if max_size <= requested_new_short: raise ValueError( f"max_size = {max_size} must be strictly greater than the requested " f"size for the smaller edge size = {size}" ) if new_long > max_size: new_short, new_long = int(max_size * new_short / new_long), max_size return (new_long, new_short) if width <= height else (new_short, new_long) def resize( image, size: Tuple[int, int], resample: "PILImageResampling" = None, reducing_gap: Optional[int] = None, data_format: Optional[ChannelDimension] = None, return_numpy: bool = True, input_data_format: Optional[Union[str, ChannelDimension]] = None, ) -> np.ndarray: """ Resizes `image` to `(height, width)` specified by `size` using the PIL library. Args: image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`): The image to resize. size (`Tuple[int, int]`): The size to use for resizing the image. resample (`int`, *optional*, defaults to `PILImageResampling.BILINEAR`): The filter to user for resampling. reducing_gap (`int`, *optional*): Apply optimization by resizing the image in two steps. The bigger `reducing_gap`, the closer the result to the fair resampling. See corresponding Pillow documentation for more details. data_format (`ChannelDimension`, *optional*): The channel dimension format of the output image. If unset, will use the inferred format from the input. return_numpy (`bool`, *optional*, defaults to `True`): Whether or not to return the resized image as a numpy array. If False a `PIL.Image.Image` object is returned. input_data_format (`ChannelDimension`, *optional*): The channel dimension format of the input image. If unset, will use the inferred format from the input. Returns: `np.ndarray`: The resized image. """ requires_backends(resize, ["vision"]) resample = resample if resample is not None else PILImageResampling.BILINEAR if not len(size) == 2: raise ValueError("size must have 2 elements") # For all transformations, we want to keep the same data format as the input image unless otherwise specified. # The resized image from PIL will always have channels last, so find the input format first. if input_data_format is None: input_data_format = infer_channel_dimension_format(image) data_format = input_data_format if data_format is None else data_format # To maintain backwards compatibility with the resizing done in previous image feature extractors, we use # the pillow library to resize the image and then convert back to numpy do_rescale = False if not isinstance(image, PIL.Image.Image): do_rescale = _rescale_for_pil_conversion(image) image = to_pil_image(image, do_rescale=do_rescale, input_data_format=input_data_format) height, width = size # PIL images are in the format (width, height) resized_image = image.resize((width, height), resample=resample, reducing_gap=reducing_gap) if return_numpy: resized_image = np.array(resized_image) # If the input image channel dimension was of size 1, then it is dropped when converting to a PIL image # so we need to add it back if necessary. 
        resized_image = np.expand_dims(resized_image, axis=-1) if resized_image.ndim == 2 else resized_image
        # The image is always in channels last format after converting from a PIL image
        resized_image = to_channel_dimension_format(
            resized_image, data_format, input_channel_dim=ChannelDimension.LAST
        )
        # If an image was rescaled to be in the range [0, 255] before converting to a PIL image, then we need to
        # rescale it back to the original range.
        resized_image = rescale(resized_image, 1 / 255) if do_rescale else resized_image

    return resized_image


def normalize(
    image: np.ndarray,
    mean: Union[float, Iterable[float]],
    std: Union[float, Iterable[float]],
    data_format: Optional[ChannelDimension] = None,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> np.ndarray:
    """
    Normalizes `image` using the mean and standard deviation specified by `mean` and `std`.

    image = (image - mean) / std

    Args:
        image (`np.ndarray`):
            The image to normalize.
        mean (`float` or `Iterable[float]`):
            The mean to use for normalization.
        std (`float` or `Iterable[float]`):
            The standard deviation to use for normalization.
        data_format (`ChannelDimension`, *optional*):
            The channel dimension format of the output image. If unset, will use the inferred format from the input.
        input_data_format (`ChannelDimension`, *optional*):
            The channel dimension format of the input image. If unset, will use the inferred format from the input.
    """
    if not isinstance(image, np.ndarray):
        raise ValueError("image must be a numpy array")

    if input_data_format is None:
        input_data_format = infer_channel_dimension_format(image)
    channel_axis = get_channel_dimension_axis(image, input_data_format=input_data_format)
    num_channels = image.shape[channel_axis]

    if isinstance(mean, Iterable):
        if len(mean) != num_channels:
            raise ValueError(f"mean must have {num_channels} elements if it is an iterable, got {len(mean)}")
    else:
        mean = [mean] * num_channels
    mean = np.array(mean, dtype=image.dtype)

    if isinstance(std, Iterable):
        if len(std) != num_channels:
            raise ValueError(f"std must have {num_channels} elements if it is an iterable, got {len(std)}")
    else:
        std = [std] * num_channels
    std = np.array(std, dtype=image.dtype)

    if input_data_format == ChannelDimension.LAST:
        image = (image - mean) / std
    else:
        image = ((image.T - mean) / std).T

    image = to_channel_dimension_format(image, data_format, input_data_format) if data_format is not None else image
    return image


def center_crop(
    image: np.ndarray,
    size: Tuple[int, int],
    data_format: Optional[Union[str, ChannelDimension]] = None,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
    return_numpy: Optional[bool] = None,
) -> np.ndarray:
    """
    Crops the `image` to the specified `size` using a center crop. Note that if the image is too small to be cropped to
    the size given, it will be padded (so the returned result will always be of size `size`).

    Args:
        image (`np.ndarray`):
            The image to crop.
        size (`Tuple[int, int]`):
            The target size for the cropped image.
        data_format (`str` or `ChannelDimension`, *optional*):
            The channel dimension format for the output image. Can be one of:
            - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
            - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
            If unset, will use the inferred format of the input image.
        input_data_format (`str` or `ChannelDimension`, *optional*):
            The channel dimension format for the input image.
            Can be one of:
            - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
            - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
            If unset, will use the inferred format of the input image.
        return_numpy (`bool`, *optional*):
            Whether or not to return the cropped image as a numpy array. Used for backwards compatibility with the
            previous ImageFeatureExtractionMixin method.
            - Unset: will return the same type as the input image.
            - `True`: will return a numpy array.
            - `False`: will return a `PIL.Image.Image` object.

    Returns:
        `np.ndarray`: The cropped image.
    """
    requires_backends(center_crop, ["vision"])

    if return_numpy is not None:
        warnings.warn("return_numpy is deprecated and will be removed in v.4.33", FutureWarning)

    return_numpy = True if return_numpy is None else return_numpy

    if not isinstance(image, np.ndarray):
        raise ValueError(f"Input image must be of type np.ndarray, got {type(image)}")

    if not isinstance(size, Iterable) or len(size) != 2:
        raise ValueError("size must have 2 elements representing the height and width of the output image")

    if input_data_format is None:
        input_data_format = infer_channel_dimension_format(image)
    output_data_format = data_format if data_format is not None else input_data_format

    # We perform the crop in (C, H, W) format and then convert to the output format
    image = to_channel_dimension_format(image, ChannelDimension.FIRST, input_data_format)

    orig_height, orig_width = get_image_size(image, ChannelDimension.FIRST)
    crop_height, crop_width = size
    crop_height, crop_width = int(crop_height), int(crop_width)

    # In case size is odd, (image_shape[0] + size[0]) // 2 won't give the proper result.
    top = (orig_height - crop_height) // 2
    bottom = top + crop_height
    # In case size is odd, (image_shape[1] + size[1]) // 2 won't give the proper result.
    left = (orig_width - crop_width) // 2
    right = left + crop_width

    # Check if cropped area is within image boundaries
    if top >= 0 and bottom <= orig_height and left >= 0 and right <= orig_width:
        image = image[..., top:bottom, left:right]
        image = to_channel_dimension_format(image, output_data_format, ChannelDimension.FIRST)
        return image

    # Otherwise, we may need to pad if the image is too small. Oh joy...
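    # Strategy: build a zero-filled canvas at least as large as both the image and the crop, paste the image at
    # its center, shift the crop window by the same offset, then slice the crop from the padded canvas.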
    new_height = max(crop_height, orig_height)
    new_width = max(crop_width, orig_width)
    new_shape = image.shape[:-2] + (new_height, new_width)
    new_image = np.zeros_like(image, shape=new_shape)

    # If the image is too small, pad it with zeros
    top_pad = (new_height - orig_height) // 2
    bottom_pad = top_pad + orig_height
    left_pad = (new_width - orig_width) // 2
    right_pad = left_pad + orig_width
    new_image[..., top_pad:bottom_pad, left_pad:right_pad] = image

    top += top_pad
    bottom += top_pad
    left += left_pad
    right += left_pad

    new_image = new_image[..., max(0, top) : min(new_height, bottom), max(0, left) : min(new_width, right)]
    new_image = to_channel_dimension_format(new_image, output_data_format, ChannelDimension.FIRST)

    if not return_numpy:
        new_image = to_pil_image(new_image)

    return new_image


def _center_to_corners_format_torch(bboxes_center: "torch.Tensor") -> "torch.Tensor":
    center_x, center_y, width, height = bboxes_center.unbind(-1)
    bbox_corners = torch.stack(
        # top left x, top left y, bottom right x, bottom right y
        [(center_x - 0.5 * width), (center_y - 0.5 * height), (center_x + 0.5 * width), (center_y + 0.5 * height)],
        dim=-1,
    )
    return bbox_corners


def _center_to_corners_format_numpy(bboxes_center: np.ndarray) -> np.ndarray:
    center_x, center_y, width, height = bboxes_center.T
    bboxes_corners = np.stack(
        # top left x, top left y, bottom right x, bottom right y
        [center_x - 0.5 * width, center_y - 0.5 * height, center_x + 0.5 * width, center_y + 0.5 * height],
        axis=-1,
    )
    return bboxes_corners


def _center_to_corners_format_tf(bboxes_center: "tf.Tensor") -> "tf.Tensor":
    center_x, center_y, width, height = tf.unstack(bboxes_center, axis=-1)
    bboxes_corners = tf.stack(
        # top left x, top left y, bottom right x, bottom right y
        [center_x - 0.5 * width, center_y - 0.5 * height, center_x + 0.5 * width, center_y + 0.5 * height],
        axis=-1,
    )
    return bboxes_corners


# 2 functions below inspired by https://github.com/facebookresearch/detr/blob/master/util/box_ops.py
def center_to_corners_format(bboxes_center: TensorType) -> TensorType:
    """
    Converts bounding boxes from center format to corners format.

    center format: contains the coordinate for the center of the box and its width, height dimensions
    (center_x, center_y, width, height)
    corners format: contains the coordinates for the top-left and bottom-right corners of the box
    (top_left_x, top_left_y, bottom_right_x, bottom_right_y)
    """
    # Function is used during model forward pass, so we use the input framework if possible, without
    # converting to numpy
    if is_torch_tensor(bboxes_center):
        return _center_to_corners_format_torch(bboxes_center)
    elif isinstance(bboxes_center, np.ndarray):
        return _center_to_corners_format_numpy(bboxes_center)
    elif is_tf_tensor(bboxes_center):
        return _center_to_corners_format_tf(bboxes_center)

    raise ValueError(f"Unsupported input type {type(bboxes_center)}")


def _corners_to_center_format_torch(bboxes_corners: "torch.Tensor") -> "torch.Tensor":
    top_left_x, top_left_y, bottom_right_x, bottom_right_y = bboxes_corners.unbind(-1)
    b = [
        (top_left_x + bottom_right_x) / 2,  # center x
        (top_left_y + bottom_right_y) / 2,  # center y
        (bottom_right_x - top_left_x),  # width
        (bottom_right_y - top_left_y),  # height
    ]
    return torch.stack(b, dim=-1)


def _corners_to_center_format_numpy(bboxes_corners: np.ndarray) -> np.ndarray:
    top_left_x, top_left_y, bottom_right_x, bottom_right_y = bboxes_corners.T
    bboxes_center = np.stack(
        [
            (top_left_x + bottom_right_x) / 2,  # center x
            (top_left_y + bottom_right_y) / 2,  # center y
            (bottom_right_x - top_left_x),  # width
            (bottom_right_y - top_left_y),  # height
        ],
        axis=-1,
    )
    return bboxes_center


def _corners_to_center_format_tf(bboxes_corners: "tf.Tensor") -> "tf.Tensor":
    top_left_x, top_left_y, bottom_right_x, bottom_right_y = tf.unstack(bboxes_corners, axis=-1)
    bboxes_center = tf.stack(
        [
            (top_left_x + bottom_right_x) / 2,  # center x
            (top_left_y + bottom_right_y) / 2,  # center y
            (bottom_right_x - top_left_x),  # width
            (bottom_right_y - top_left_y),  # height
        ],
        axis=-1,
    )
    return bboxes_center


def corners_to_center_format(bboxes_corners: TensorType) -> TensorType:
    """
    Converts bounding boxes from corners format to center format.

    corners format: contains the coordinates for the top-left and bottom-right corners of the box
    (top_left_x, top_left_y, bottom_right_x, bottom_right_y)
    center format: contains the coordinate for the center of the box and its width, height dimensions
    (center_x, center_y, width, height)
    """
    # Inverse function accepts different input types so implemented here too
    if is_torch_tensor(bboxes_corners):
        return _corners_to_center_format_torch(bboxes_corners)
    elif isinstance(bboxes_corners, np.ndarray):
        return _corners_to_center_format_numpy(bboxes_corners)
    elif is_tf_tensor(bboxes_corners):
        return _corners_to_center_format_tf(bboxes_corners)

    raise ValueError(f"Unsupported input type {type(bboxes_corners)}")


# 2 functions below copied from https://github.com/cocodataset/panopticapi/blob/master/panopticapi/utils.py
# Copyright (c) 2018, Alexander Kirillov
# All rights reserved.
def rgb_to_id(color):
    """
    Converts RGB color to unique ID.
    """
    if isinstance(color, np.ndarray) and len(color.shape) == 3:
        if color.dtype == np.uint8:
            color = color.astype(np.int32)
        return color[:, :, 0] + 256 * color[:, :, 1] + 256 * 256 * color[:, :, 2]
    return int(color[0] + 256 * color[1] + 256 * 256 * color[2])


def id_to_rgb(id_map):
    """
    Converts unique ID to RGB color.
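
    Example (an illustrative round trip with `rgb_to_id`; the color value is an assumption for demonstration):

    ```python
    >>> rgb_to_id([47, 2, 0])  # 47 + 256 * 2
    559
    >>> id_to_rgb(559)
    [47, 2, 0]
    ```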
""" if isinstance(id_map, np.ndarray): id_map_copy = id_map.copy() rgb_shape = tuple(list(id_map.shape) + [3]) rgb_map = np.zeros(rgb_shape, dtype=np.uint8) for i in range(3): rgb_map[..., i] = id_map_copy % 256 id_map_copy //= 256 return rgb_map color = [] for _ in range(3): color.append(id_map % 256) id_map //= 256 return color class PaddingMode(ExplicitEnum): """ Enum class for the different padding modes to use when padding images. """ CONSTANT = "constant" REFLECT = "reflect" REPLICATE = "replicate" SYMMETRIC = "symmetric" def pad( image: np.ndarray, padding: Union[int, Tuple[int, int], Iterable[Tuple[int, int]]], mode: PaddingMode = PaddingMode.CONSTANT, constant_values: Union[float, Iterable[float]] = 0.0, data_format: Optional[Union[str, ChannelDimension]] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None, ) -> np.ndarray: """ Pads the `image` with the specified (height, width) `padding` and `mode`. Args: image (`np.ndarray`): The image to pad. padding (`int` or `Tuple[int, int]` or `Iterable[Tuple[int, int]]`): Padding to apply to the edges of the height, width axes. Can be one of three formats: - `((before_height, after_height), (before_width, after_width))` unique pad widths for each axis. - `((before, after),)` yields same before and after pad for height and width. - `(pad,)` or int is a shortcut for before = after = pad width for all axes. mode (`PaddingMode`): The padding mode to use. Can be one of: - `"constant"`: pads with a constant value. - `"reflect"`: pads with the reflection of the vector mirrored on the first and last values of the vector along each axis. - `"replicate"`: pads with the replication of the last value on the edge of the array along each axis. - `"symmetric"`: pads with the reflection of the vector mirrored along the edge of the array. constant_values (`float` or `Iterable[float]`, *optional*): The value to use for the padding if `mode` is `"constant"`. data_format (`str` or `ChannelDimension`, *optional*): The channel dimension format for the output image. Can be one of: - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. If unset, will use same as the input image. input_data_format (`str` or `ChannelDimension`, *optional*): The channel dimension format for the input image. Can be one of: - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. If unset, will use the inferred format of the input image. Returns: `np.ndarray`: The padded image. """ if input_data_format is None: input_data_format = infer_channel_dimension_format(image) def _expand_for_data_format(values): """ Convert values to be in the format expected by np.pad based on the data format. 
""" if isinstance(values, (int, float)): values = ((values, values), (values, values)) elif isinstance(values, tuple) and len(values) == 1: values = ((values[0], values[0]), (values[0], values[0])) elif isinstance(values, tuple) and len(values) == 2 and isinstance(values[0], int): values = (values, values) elif isinstance(values, tuple) and len(values) == 2 and isinstance(values[0], tuple): values = values else: raise ValueError(f"Unsupported format: {values}") # add 0 for channel dimension values = ((0, 0), *values) if input_data_format == ChannelDimension.FIRST else (*values, (0, 0)) # Add additional padding if there's a batch dimension values = (0, *values) if image.ndim == 4 else values return values padding = _expand_for_data_format(padding) if mode == PaddingMode.CONSTANT: constant_values = _expand_for_data_format(constant_values) image = np.pad(image, padding, mode="constant", constant_values=constant_values) elif mode == PaddingMode.REFLECT: image = np.pad(image, padding, mode="reflect") elif mode == PaddingMode.REPLICATE: image = np.pad(image, padding, mode="edge") elif mode == PaddingMode.SYMMETRIC: image = np.pad(image, padding, mode="symmetric") else: raise ValueError(f"Invalid padding mode: {mode}") image = to_channel_dimension_format(image, data_format, input_data_format) if data_format is not None else image return image # TODO (Amy): Accept 1/3/4 channel numpy array as input and return np.array as default def convert_to_rgb(image: ImageInput) -> ImageInput: """ Converts an image to RGB format. Only converts if the image is of type PIL.Image.Image, otherwise returns the image as is. Args: image (Image): The image to convert. """ requires_backends(convert_to_rgb, ["vision"]) if not isinstance(image, PIL.Image.Image): return image image = image.convert("RGB") return image def flip_channel_order( image: np.ndarray, data_format: Optional[ChannelDimension] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None, ) -> np.ndarray: """ Flips the channel order of the image. If the image is in RGB format, it will be converted to BGR and vice versa. Args: image (`np.ndarray`): The image to flip. data_format (`ChannelDimension`, *optional*): The channel dimension format for the output image. Can be one of: - `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - `ChannelDimension.LAST`: image in (height, width, num_channels) format. If unset, will use same as the input image. input_data_format (`ChannelDimension`, *optional*): The channel dimension format for the input image. Can be one of: - `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - `ChannelDimension.LAST`: image in (height, width, num_channels) format. If unset, will use the inferred format of the input image. """ input_data_format = infer_channel_dimension_format(image) if input_data_format is None else input_data_format if input_data_format == ChannelDimension.LAST: image = image[..., ::-1] elif input_data_format == ChannelDimension.FIRST: image = image[::-1, ...] else: raise ValueError(f"Unsupported channel dimension: {input_data_format}") if data_format is not None: image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) return image