# xgen-mm-phi3-mini-base-r-v1.5 / image_processing_blip_3.py
import ast
import math
import numbers
import random
from typing import List, Optional, Sequence, Tuple, Union

import torch
import torchvision.transforms.functional as F
from PIL import Image
from torchvision.transforms import (
    CenterCrop,
    Compose,
    InterpolationMode,
    Normalize,
    Resize,
    ToTensor,
)
from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
from transformers.image_utils import ImageInput
from transformers.utils import TensorType


def expand2square(pil_img, background_color):
    """Pad a PIL image with `background_color` to make it square, keeping the original centered."""
width, height = pil_img.size
if width == height:
return pil_img
elif width > height:
result = Image.new(pil_img.mode, (width, width), background_color)
result.paste(pil_img, (0, (width - height) // 2))
return result
else:
result = Image.new(pil_img.mode, (height, height), background_color)
result.paste(pil_img, ((height - width) // 2, 0))
return result
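
# Usage sketch (illustrative, not part of the original API): pad a landscape
# image to a square using the default image mean scaled to 0-255.
#
#   >>> img = Image.new("RGB", (640, 480))
#   >>> expand2square(img, (127, 127, 127)).size
#   (640, 640)
#
# The original image is pasted at y offset (640 - 480) // 2 = 80.
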
class Blip3ImageProcessor(BaseImageProcessor):
    """Image processor for BLIP-3 / xGen-MM: handles resizing, normalization, and batching."""
def __init__(
self,
do_resize: bool = True,
resize_mode: str = "squash",
interpolation_mode: str = "bicubic",
        size: Optional[Union[Tuple[int, int], List[int]]] = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
**kwargs,
) -> None:
super().__init__(**kwargs)
self.do_resize = do_resize
self.resize_mode = resize_mode
self.interpolation_mode = interpolation_mode
self.size = size if size is not None else (384, 384)
self.grids = None
self.image_mean = image_mean if image_mean is not None else [0.5, 0.5, 0.5]
self.image_std = image_std if image_std is not None else [0.5, 0.5, 0.5]
    @classmethod
    def resize(cls, image_size, resize_mode, interpolation="bicubic", fill_color=0):
        """Build the list of resize transforms for the given `resize_mode`
        ("longest", "squash", or "shortest")."""
interpolation_mode = (
InterpolationMode.BILINEAR
if interpolation == "bilinear"
else InterpolationMode.BICUBIC
)
if resize_mode == "longest":
transforms = [
ResizeKeepRatio(
image_size, interpolation=interpolation_mode, longest=1
),
CenterCropOrPad(image_size, fill=fill_color),
]
elif resize_mode == "squash":
if isinstance(image_size, int):
image_size = (image_size, image_size)
transforms = [
Resize(image_size, interpolation=interpolation_mode),
]
else:
assert resize_mode == "shortest"
if not isinstance(image_size, (tuple, list)):
image_size = (image_size, image_size)
if image_size[0] == image_size[1]:
# simple case, use torchvision built-in Resize w/ shortest edge mode (scalar size arg)
transforms = [Resize(image_size[0], interpolation=interpolation_mode)]
else:
# resize shortest edge to matching target dim for non-square target
transforms = [ResizeKeepRatio(image_size)]
transforms += [CenterCrop(image_size)]
return transforms
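
    # Usage sketch for the three modes with the default 384x384 target:
    # "squash" stretches to exactly (384, 384) (aspect ratio may change),
    # "longest" fits the longest edge to 384 and pads with `fill_color`, and
    # "shortest" fits the shortest edge to 384 and center-crops. For example:
    #
    #   >>> transforms = Blip3ImageProcessor.resize((384, 384), "squash")
    #   # -> a single torchvision Resize to exactly (384, 384)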
@classmethod
def convert_rgb(cls, image):
return image.convert("RGB")
def _preprocess(self, images: ImageInput) -> torch.Tensor:
transforms = self.resize(self.size, self.resize_mode, self.interpolation_mode)
transforms.extend(
[
self.convert_rgb,
ToTensor(),
Normalize(mean=self.image_mean, std=self.image_std),
]
)
composed_transforms = Compose(transforms)
images_tensor = composed_transforms(images)
return images_tensor
def preprocess(
self,
images: ImageInput,
return_tensors: Optional[Union[str, TensorType]] = None,
**kwargs,
) -> BatchFeature:
if "image_aspect_ratio" in kwargs:
image_aspect_ratio = kwargs["image_aspect_ratio"]
else:
image_aspect_ratio = "none"
new_images = []
if image_aspect_ratio == "pad":
for image in images:
image = expand2square(
image, tuple(int(x * 255) for x in self.image_mean)
)
image = self._preprocess(image)
new_images.append(image)
elif image_aspect_ratio == "anyres":
for image in images:
image = process_anyres_image(
image, self._preprocess, self.size, self.grids
)
new_images.append(image)
else:
for image in images:
image = self._preprocess(image)
new_images.append(image)
        # Stack into a single tensor only when all entries share a shape
        # (anyres grids can yield different patch counts per image).
        if all(x.shape == new_images[0].shape for x in new_images):
new_images = torch.stack(new_images, dim=0)
if image_aspect_ratio == "anyres":
new_images = BatchFeature(
data={"pixel_values": new_images}, tensor_type=return_tensors
)
else:
new_images = BatchFeature(
data={"pixel_values": new_images.unsqueeze(1).unsqueeze(0)},
tensor_type=return_tensors,
)
return new_images
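
# End-to-end usage sketch (shapes assume the default "none" aspect-ratio
# branch and the default 384x384 size; `pil_image` stands for any RGB
# PIL.Image):
#
#   >>> processor = Blip3ImageProcessor()
#   >>> batch = processor.preprocess([pil_image], return_tensors="pt")
#   >>> batch["pixel_values"].shape
#   torch.Size([1, 1, 1, 3, 384, 384])
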
class ResizeKeepRatio:
    """Resize an image while keeping its aspect ratio.

    Copied from `timm`.
    """
def __init__(
self,
size,
longest=0.0,
interpolation=InterpolationMode.BICUBIC,
random_scale_prob=0.0,
random_scale_range=(0.85, 1.05),
random_aspect_prob=0.0,
random_aspect_range=(0.9, 1.11),
):
if isinstance(size, (list, tuple)):
self.size = tuple(size)
else:
self.size = (size, size)
self.interpolation = interpolation
self.longest = float(longest) # [0, 1] where 0 == shortest edge, 1 == longest
self.random_scale_prob = random_scale_prob
self.random_scale_range = random_scale_range
self.random_aspect_prob = random_aspect_prob
self.random_aspect_range = random_aspect_range
@staticmethod
def get_params(
img,
target_size,
longest,
random_scale_prob=0.0,
random_scale_range=(0.85, 1.05),
random_aspect_prob=0.0,
random_aspect_range=(0.9, 1.11),
):
"""Get parameters"""
source_size = img.size[::-1] # h, w
h, w = source_size
target_h, target_w = target_size
ratio_h = h / target_h
ratio_w = w / target_w
ratio = max(ratio_h, ratio_w) * longest + min(ratio_h, ratio_w) * (
1.0 - longest
)
if random_scale_prob > 0 and random.random() < random_scale_prob:
ratio_factor = random.uniform(random_scale_range[0], random_scale_range[1])
ratio_factor = (ratio_factor, ratio_factor)
else:
ratio_factor = (1.0, 1.0)
if random_aspect_prob > 0 and random.random() < random_aspect_prob:
aspect_factor = random.uniform(
random_aspect_range[0], random_aspect_range[1]
)
ratio_factor = (
ratio_factor[0] / aspect_factor,
ratio_factor[1] * aspect_factor,
)
size = [round(x * f / ratio) for x, f in zip(source_size, ratio_factor)]
return size
def __call__(self, img):
"""
Args:
img (PIL Image): Image to be cropped and resized.
Returns:
PIL Image: Resized, padded to at least target size, possibly cropped to exactly target size
"""
size = self.get_params(
img,
self.size,
self.longest,
self.random_scale_prob,
self.random_scale_range,
self.random_aspect_prob,
self.random_aspect_range,
)
img = F.resize(img, size, self.interpolation)
return img
    def __repr__(self):
        format_string = self.__class__.__name__ + f"(size={self.size}"
        format_string += f", interpolation={self.interpolation}"
        format_string += f", longest={self.longest:.3f})"
        return format_string
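
# Worked example for ResizeKeepRatio (no random jitter): a 640x480 image
# (w x h) with target (384, 384) and longest=0 scales by the smaller edge
# ratio, min(480/384, 640/384) = 1.25, giving an output of (h, w) =
# (384, 512); with longest=1 the larger ratio 640/384 applies instead,
# giving (288, 384).
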
def _setup_size(size, error_msg):
if isinstance(size, numbers.Number):
return int(size), int(size)
if isinstance(size, Sequence) and len(size) == 1:
return size[0], size[0]
if len(size) != 2:
raise ValueError(error_msg)
return size
def center_crop_or_pad(
img: torch.Tensor, output_size: List[int], fill=0
) -> torch.Tensor:
"""Center crops and/or pads the given image.
If the image is torch Tensor, it is expected
to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions.
    If the image size is smaller than the output size along any edge, the image is padded with `fill` and then center cropped.
Args:
img (PIL Image or Tensor): Image to be cropped.
output_size (sequence or int): (height, width) of the crop box. If int or sequence with single int,
it is used for both directions.
fill (int, Tuple[int]): Padding color
Returns:
PIL Image or Tensor: Cropped image.
"""
if isinstance(output_size, numbers.Number):
output_size = (int(output_size), int(output_size))
elif isinstance(output_size, (tuple, list)) and len(output_size) == 1:
output_size = (output_size[0], output_size[0])
_, image_height, image_width = F.get_dimensions(img)
crop_height, crop_width = output_size
if crop_width > image_width or crop_height > image_height:
padding_ltrb = [
(crop_width - image_width) // 2 if crop_width > image_width else 0,
(crop_height - image_height) // 2 if crop_height > image_height else 0,
(crop_width - image_width + 1) // 2 if crop_width > image_width else 0,
(crop_height - image_height + 1) // 2 if crop_height > image_height else 0,
]
img = F.pad(img, padding_ltrb, fill=fill)
_, image_height, image_width = F.get_dimensions(img)
if crop_width == image_width and crop_height == image_height:
return img
crop_top = int(round((image_height - crop_height) / 2.0))
crop_left = int(round((image_width - crop_width) / 2.0))
return F.crop(img, crop_top, crop_left, crop_height, crop_width)
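
# Worked example: a (3, 300, 500) tensor cropped/padded to (384, 384) is
# first padded on height (42 top, 42 bottom, since 384 - 300 = 84), then
# center-cropped on width from 500 to 384 (crop_left = 58).
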
class CenterCropOrPad(torch.nn.Module):
"""Crops the given image at the center.
If the image is torch Tensor, it is expected
to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions.
    If the image size is smaller than the output size along any edge, the image is padded with `fill` and then center cropped.
Args:
size (sequence or int): Desired output size of the crop. If size is an
int instead of sequence like (h, w), a square crop (size, size) is
made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]).
"""
def __init__(self, size, fill=0):
super().__init__()
self.size = _setup_size(
size, error_msg="Please provide only two dimensions (h, w) for size."
)
self.fill = fill
def forward(self, img):
"""
Args:
img (PIL Image or Tensor): Image to be cropped.
Returns:
PIL Image or Tensor: Cropped image.
"""
return center_crop_or_pad(img, self.size, fill=self.fill)
def __repr__(self) -> str:
return f"{self.__class__.__name__}(size={self.size})"
def process_anyres_image(image, processor, processor_size, grid_pinpoints):
"""
Process an image with variable resolutions.
Args:
image (PIL.Image.Image): The input image to be processed.
processor: The image processor object.
processor_size (tuple, list): The size of the image processor.
        grid_pinpoints (list or str): Possible resolutions, given either as a list of (width, height) pairs or as its string representation.
Returns:
torch.Tensor: A tensor containing the processed image patches.
"""
# FIXME: determine grid_pinpoints from image sizes.
    if isinstance(grid_pinpoints, list):
possible_resolutions = grid_pinpoints
else:
possible_resolutions = ast.literal_eval(grid_pinpoints)
best_resolution = select_best_resolution(image.size, possible_resolutions)
image_padded = resize_and_pad_image(image, best_resolution)
# processor_size = processor.transforms[0].size
patches = divide_to_patches(image_padded, processor_size[0])
image_original_resize = image.resize((processor_size[0], processor_size[0]))
image_patches = [image_original_resize] + patches
image_patches = [processor(image_patch) for image_patch in image_patches]
return torch.stack(image_patches, dim=0)
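
# Sketch of the anyres flow for a 1000x600 image with processor_size
# (384, 384) and hypothetical grid_pinpoints "[(768, 384)]": the image is
# resized and padded to 768x384, split into two 384x384 patches, and stacked
# together with a 384x384 resize of the full image -> a (3, 3, 384, 384)
# tensor.
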
def select_best_resolution(original_size, possible_resolutions):
"""
Selects the best resolution from a list of possible resolutions based on the original size.
Args:
original_size (tuple): The original size of the image in the format (width, height).
possible_resolutions (list): A list of possible resolutions in the format [(width1, height1), (width2, height2), ...].
Returns:
tuple: The best fit resolution in the format (width, height).
"""
original_width, original_height = original_size
best_fit = None
max_effective_resolution = 0
min_wasted_resolution = float("inf")
for width, height in possible_resolutions:
scale = min(width / original_width, height / original_height)
downscaled_width, downscaled_height = int(original_width * scale), int(
original_height * scale
)
effective_resolution = min(
downscaled_width * downscaled_height, original_width * original_height
)
wasted_resolution = (width * height) - effective_resolution
if effective_resolution > max_effective_resolution or (
effective_resolution == max_effective_resolution
and wasted_resolution < min_wasted_resolution
):
max_effective_resolution = effective_resolution
min_wasted_resolution = wasted_resolution
best_fit = (width, height)
return best_fit
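
# Worked example: original_size (1000, 600) against [(384, 384), (768, 384),
# (768, 768)]. Effective resolutions after downscaling are 88320, 245760, and
# 353280 pixels respectively, so (768, 768) is selected as it preserves the
# most of the original image.
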
def resize_and_pad_image(image, target_resolution):
"""
Resize and pad an image to a target resolution while maintaining aspect ratio.
Args:
image (PIL.Image.Image): The input image.
target_resolution (tuple): The target resolution (width, height) of the image.
Returns:
PIL.Image.Image: The resized and padded image.
"""
original_width, original_height = image.size
target_width, target_height = target_resolution
scale_w = target_width / original_width
scale_h = target_height / original_height
if scale_w < scale_h:
new_width = target_width
new_height = min(math.ceil(original_height * scale_w), target_height)
else:
new_height = target_height
new_width = min(math.ceil(original_width * scale_h), target_width)
# Resize the image
resized_image = image.resize((new_width, new_height))
new_image = Image.new("RGB", (target_width, target_height), (0, 0, 0))
paste_x = (target_width - new_width) // 2
paste_y = (target_height - new_height) // 2
new_image.paste(resized_image, (paste_x, paste_y))
return new_image
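
# Worked example: a 1000x600 image targeted at (768, 384). scale_h = 0.64 is
# the binding constraint (0.64 < 0.768), so the image is resized to 640x384
# and pasted onto a black 768x384 canvas at x offset (768 - 640) // 2 = 64.
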
def divide_to_patches(image, patch_size):
"""
Divides an image into patches of a specified size.
Args:
image (PIL.Image.Image): The input image.
patch_size (int): The size of each patch.
Returns:
list: A list of PIL.Image.Image objects representing the patches.
"""
patches = []
width, height = image.size
for i in range(0, height, patch_size):
for j in range(0, width, patch_size):
box = (j, i, j + patch_size, i + patch_size)
patch = image.crop(box)
patches.append(patch)
return patches
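
# Worked example: a 768x384 image with patch_size=384 yields two 384x384
# patches, ordered left-to-right within each row, rows top-to-bottom.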