| |
| |
| |
| |
| |
| |
| import bisect |
| from copy import deepcopy |
|
|
| import cv2 |
| import numpy as np |
| import torch |
|
|
| from core.structures.bbox import Bbox |
|
|
|
|
def src_center_crop_according_to_mask(
    img, mask, aspect_standard, enlarge_ratio, head_bbox=None
):
    """Crop an image and its mask to the bounding box of the mask's positive pixels.

    The box spans the min/max coordinates where ``mask > 0``. If either entry of
    ``enlarge_ratio`` differs from 1 by more than 0.01, one factor is sampled with
    ``np.random.rand()`` between ``enlarge_ratio[0]`` and ``enlarge_ratio[1]`` and
    handed to ``Bbox.scale`` together with the image width and height before the
    crop window is read back with ``Bbox.get_box()``.

    Args:
        img: Image array whose shape unpacks as (height, width, channels).
        mask: Array sliced with the same row/column window as ``img``.
        aspect_standard: Not read anywhere in this function; it is kept in the
            signature for callers that pass it.
        enlarge_ratio: Two-element (min, max) range for the random enlargement.
        head_bbox: Optional object exposing ``offset(dx, dy)``. The call's return
            value is discarded, so it presumably shifts the box in place --
            TODO confirm against ``Bbox.offset``.

    Returns:
        Tuple ``(crop_img, crop_mask, left, top, head_bbox)`` where ``left`` and
        ``top`` are the first two values returned by ``Bbox.get_box()``.

    Raises:
        Exception: With message "empty mask" when ``mask`` has no positive pixel.
    """
    rows, cols = np.where(mask > 0)
    if len(cols) == 0 or len(rows) == 0:
        raise Exception("empty mask")

    box_coords = [np.min(cols), np.min(rows), np.max(cols), np.max(rows)]
    img_h, img_w, _ = img.shape
    fore_bbox = Bbox(box_coords, mode="whwh")

    # Ratios within 0.01 of 1 on both ends mean "no enlargement": skip the RNG draw.
    wants_enlarge = any(abs(enlarge_ratio[i] - 1) > 0.01 for i in (0, 1))
    if wants_enlarge:
        ratio_lo, ratio_hi = enlarge_ratio
        ratio = np.random.rand() * (ratio_hi - ratio_lo) + ratio_lo
        fore_bbox = fore_bbox.scale(ratio, img_w, img_h)

    left, top, right, bottom = fore_bbox.get_box()
    window = (slice(top, bottom), slice(left, right))
    crop_img, crop_mask = img[window], mask[window]

    if head_bbox is not None:
        # Re-express the head box relative to the crop's top-left corner.
        head_bbox.offset(-left, -top)

    return crop_img, crop_mask, left, top, head_bbox
|
|
|
|
def collected_imgs_with_target_ratio(img_list, target_ratio, tgt_max_size=1024):
    """Center-pad (downscaling when needed) images onto one canvas and batch them.

    The canvas height is taken from the internal size list ``[840]``; the canvas
    width is ``int(height / target_ratio)`` rounded down to a multiple of 14.
    Images larger than the canvas are shrunk with ``cv2.INTER_AREA`` keeping their
    aspect ratio; smaller images are left at their original size. Each image is
    then pasted at the center of a canvas pre-filled with the value 1.

    Args:
        img_list (List[np.ndarray]): Images in HWC layout with exactly 3 channels.
        target_ratio (float): Canvas height divided by canvas width.
        tgt_max_size (int, optional): Accepted for backward compatibility only.
            The value is ignored: the canvas height always comes from the
            internal size list. Defaults to 1024.

    Returns:
        torch.Tensor: Float tensor of shape (N, 3, H, W). Pixel values are cast
        to float but not rescaled.

    Raises:
        AssertionError: If ``img_list`` is empty or every image has a zero-sized
            height or width.
        NotImplementedError: If an image does not have 3 channels.
    """
    tgt_max_size_list = [840]

    max_width = 0
    max_height = 0
    for img in img_list:
        img_h, img_w = img.shape[:2]
        max_width = max(max_width, img_w)
        max_height = max(max_height, img_h)
    # Explicit raise: the original `assert cond, print(...)` lost its message
    # (print returns None) and vanished entirely under `python -O`.
    if max_height <= 0 or max_width <= 0:
        raise AssertionError("do not find images")

    # Largest allowed size strictly below the tallest image, else the first entry.
    idx = max(bisect.bisect_left(tgt_max_size_list, max_height) - 1, 0)
    canvas_h = tgt_max_size_list[idx]
    canvas_w = (int(canvas_h / target_ratio) // 14) * 14

    collect_imgs = []
    for img in img_list:
        # Background value is 1 in the image's own dtype.
        canvas = np.ones((canvas_h, canvas_w, 3), dtype=img.dtype)

        input_h, input_w, _ = img.shape
        scale = min(canvas_h / input_h, canvas_w / input_w)

        if scale < 1.0:
            new_size = (int(input_w * scale), int(input_h * scale))
            resized_img = cv2.resize(img, new_size, interpolation=cv2.INTER_AREA)
        else:
            # Images that already fit are never upscaled.
            resized_img = img

        if resized_img.shape[2] != 3:
            raise NotImplementedError("color channel not match")

        y_offset = (canvas_h - resized_img.shape[0]) // 2
        x_offset = (canvas_w - resized_img.shape[1]) // 2
        canvas[
            y_offset : y_offset + resized_img.shape[0],
            x_offset : x_offset + resized_img.shape[1],
        ] = resized_img

        # HWC -> 1xCxHxW so the per-image tensors can be concatenated on dim 0.
        collect_imgs.append(
            torch.from_numpy(canvas).float().permute(2, 0, 1).unsqueeze(0)
        )

    return torch.cat(collect_imgs, dim=0)
|
|
|
|
class PadRatio:
    """Center-pad (downscaling when needed) images onto a shared canvas.

    The canvas height is picked from ``tgt_max_size_list`` and the canvas width
    is derived from ``target_ratio`` (height / width), rounded down to a
    multiple of 14. The canvas is pre-filled with the value 1.
    """

    def __init__(self, target_ratio, tgt_max_size_list=None, val=False, **kwargs):
        """Store the padding configuration.

        Args:
            target_ratio (float or str): Canvas height divided by canvas width.
                A string is evaluated as a Python expression (e.g. ``"5/3"``).
            tgt_max_size_list (list of int, optional): Allowed canvas heights.
                It is searched with ``bisect`` and therefore must be sorted in
                ascending order. Defaults to ``[840]`` (a fresh list per
                instance).
            val (bool, optional): Stored on the instance. This class does not
                read it; ``PadRatioWithScale`` uses it to disable random scale
                jitter. Default: False.
            **kwargs: Extra arguments (ignored).

        Usage:
            pad = PadRatio(target_ratio=1.2, tgt_max_size_list=[672, 840, 1008])
            imgs = pad([img1, img2, img3])
        """
        # NOTE(security): eval() executes arbitrary code. String ratios must only
        # come from trusted configuration, never from user-controlled input.
        self.target_ratio = (
            eval(target_ratio) if isinstance(target_ratio, str) else target_ratio
        )
        # A None sentinel avoids sharing one mutable default list between instances.
        self.tgt_max_size_list = (
            [840] if tgt_max_size_list is None else tgt_max_size_list
        )
        self.val = val

    def _canvas_size(self, img_list):
        """Return the (height, width) of the canvas shared by all images in the batch."""
        max_width = 0
        max_height = 0
        for img in img_list:
            img_h, img_w = img.shape[:2]
            max_width = max(max_width, img_w)
            max_height = max(max_height, img_h)
        # Explicit raise: `assert cond, print(...)` produced an AssertionError with
        # no message and was stripped entirely under `python -O`.
        if max_height <= 0 or max_width <= 0:
            raise AssertionError("do not find images")

        # Largest allowed size strictly below the tallest image, else the first entry.
        idx = max(bisect.bisect_left(self.tgt_max_size_list, max_height) - 1, 0)
        canvas_h = self.tgt_max_size_list[idx]
        canvas_w = (int(canvas_h / self.target_ratio) // 14) * 14
        return canvas_h, canvas_w

    def __call__(self, img_list):
        """Pad every image to the common canvas size.

        Images larger than the canvas are shrunk with ``cv2.INTER_AREA`` keeping
        their aspect ratio; images that already fit are not upscaled. Each image
        is pasted at the center of a canvas filled with 1 in the image's dtype.

        Args:
            img_list (list of numpy.ndarray): Images of shape (H, W, 3).

        Returns:
            list of numpy.ndarray: One (canvas_h, canvas_w, 3) array per input,
            each with the dtype of its input image.

        Raises:
            AssertionError: If ``img_list`` is empty or every image has a
                zero-sized height or width.
            NotImplementedError: If an image does not have 3 channels.
        """
        canvas_h, canvas_w = self._canvas_size(img_list)

        collect_imgs = []
        for img in img_list:
            canvas = np.ones((canvas_h, canvas_w, 3), dtype=img.dtype)

            input_h, input_w, _ = img.shape
            scale = min(canvas_h / input_h, canvas_w / input_w)

            if scale < 1.0:
                new_size = (int(input_w * scale), int(input_h * scale))
                resized_img = cv2.resize(img, new_size, interpolation=cv2.INTER_AREA)
            else:
                resized_img = img

            if resized_img.shape[2] != 3:
                raise NotImplementedError("color channel not match")

            y_offset = (canvas_h - resized_img.shape[0]) // 2
            x_offset = (canvas_w - resized_img.shape[1]) // 2
            canvas[
                y_offset : y_offset + resized_img.shape[0],
                x_offset : x_offset + resized_img.shape[1],
            ] = resized_img

            collect_imgs.append(canvas)
        return collect_imgs

    def __repr__(self):
        # The leading space is kept so existing log/repr output stays byte-identical.
        return (
            f" {self.__class__.__name__}({self.target_ratio}, {self.tgt_max_size_list})"
        )
|
|
|
|
class PadRatioWithScale(PadRatio):
    """Pad images to the target ratio after rescaling them to (nearly) fill the canvas.

    Unlike ``PadRatio``, images smaller than the canvas are upscaled. When
    ``self.val`` is falsy, each image's scale is additionally jittered down by
    up to 5%.
    """

    def _jitter_scale(self, scale):
        """Return ``scale`` when ``self.val`` is set, else a random value between 0.95 * scale and scale."""
        if self.val:
            return scale
        scale_min = 0.95 * scale
        return scale_min + (scale - scale_min) * np.random.rand()

    def scale_img_keep_aspect(self, img, t_h, t_w):
        """Upscale an image, preserving aspect ratio, so it fits inside (t_h, t_w).

        Args:
            img (np.ndarray): Input image in HWC format.
            t_h (int): Target height.
            t_w (int): Target width.

        Returns:
            np.ndarray: The resized image. Its size is the input size times the
            fitting scale (jittered down by up to 5% when ``self.val`` is
            falsy), truncated to integers.

        Raises:
            AssertionError: If the image does not need upscaling (scale <= 1).
        """
        h, w = img.shape[:2]
        scale = min(t_h / h, t_w / w)
        assert scale > 1, f"{scale} must be larger than 1"

        random_scale = self._jitter_scale(scale)
        new_w, new_h = (int(w * random_scale), int(h * random_scale))
        # NOTE(review): INTER_AREA is the interpolation OpenCV recommends for
        # shrinking; it is kept here for upscaling so outputs stay unchanged.
        resized_img = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA)

        return resized_img

    def __call__(self, img_list):
        """Rescale every image to fit the shared canvas, then center-pad it.

        The canvas height is the largest entry of ``self.tgt_max_size_list``
        strictly below the tallest input (or the first entry when there is
        none); the canvas width is ``int(height / target_ratio)`` rounded down
        to a multiple of 14. The canvas is pre-filled with the value 1.

        Args:
            img_list (List[np.ndarray]): Images of shape (H, W, 3).

        Returns:
            List[np.ndarray]: One (canvas_h, canvas_w, 3) array per input image.

        Raises:
            AssertionError: If ``img_list`` is empty or every image has a
                zero-sized height or width.
            NotImplementedError: If an image does not have 3 channels.
        """
        target_ratio = self.target_ratio
        tgt_max_size_list = self.tgt_max_size_list

        max_width = 0
        max_height = 0
        for img in img_list:
            img_h, img_w = img.shape[:2]
            max_width = max(max_width, img_w)
            max_height = max(max_height, img_h)
        # Explicit raise: `assert cond, print(...)` produced an AssertionError with
        # no message and was stripped entirely under `python -O`.
        if max_height <= 0 or max_width <= 0:
            raise AssertionError("do not find images")

        idx = max(bisect.bisect_left(tgt_max_size_list, max_height) - 1, 0)
        height = tgt_max_size_list[idx]
        width = (int(height / target_ratio) // 14) * 14

        collect_imgs = []
        for img in img_list:
            draw = np.ones((height, width, 3), dtype=img.dtype)

            input_h, input_w, _ = img.shape
            scale = min(height / input_h, width / input_w)

            if scale <= 1.0:
                # Image is at least as large as the canvas: shrink it to fit.
                random_scale = self._jitter_scale(scale)
                new_size = (int(input_w * random_scale), int(input_h * random_scale))
                resized_img = cv2.resize(img, new_size, interpolation=cv2.INTER_AREA)
            else:
                # Image is smaller than the canvas: enlarge it to fit.
                resized_img = self.scale_img_keep_aspect(img, height, width)

            if resized_img.shape[2] != 3:
                raise NotImplementedError("color channel not match")

            y_offset = (height - resized_img.shape[0]) // 2
            x_offset = (width - resized_img.shape[1]) // 2
            draw[
                y_offset : y_offset + resized_img.shape[0],
                x_offset : x_offset + resized_img.shape[1],
            ] = resized_img

            collect_imgs.append(draw)

        return collect_imgs
|
|
|
|
class UpRandomCrop:
    """Randomly keep only the top rows of each image and shift them to the bottom."""

    def __init__(self, low, high, val=False, **kwargs):
        """Store the sampling range.

        Args:
            low: Lower bound passed to ``np.random.uniform`` for the kept fraction.
            high: Upper bound passed to ``np.random.uniform`` for the kept fraction.
            val (bool): When True, ``__call__`` returns its input untouched.
            **kwargs: Extra arguments (ignored).
        """
        self.low = low
        self.high = high
        self.val = val

    def random_body_crop(self, img):
        """Keep a random fraction of the top rows and pad above them with 1.

        A fraction is drawn from ``np.random.uniform(self.low, self.high)``.
        If it is not below 1 the input is returned as is. Otherwise the first
        ``int(H * fraction)`` rows are kept and rows filled with the constant 1
        are added on top, so the output has the same height as the input.

        Args:
            img (np.ndarray): Image of shape (H, W, C).

        Returns:
            np.ndarray: The input itself, or a new array with the input's shape.
        """
        img_h, _, _ = img.shape
        keep_fraction = np.random.uniform(self.low, self.high)

        if not keep_fraction < 1.0:
            return img

        kept_rows = int(img_h * keep_fraction)
        fill_rows = img_h - kept_rows
        # Pad only before axis 0 (the top); width and channels are untouched.
        pad_spec = ((fill_rows, 0), (0, 0), (0, 0))
        return np.pad(
            img[:kept_rows, :, :],
            pad_spec,
            mode="constant",
            constant_values=1,
        )

    def __call__(self, img_list):
        """Apply ``random_body_crop`` to each image unless in validation mode."""
        if self.val:
            return img_list
        return list(map(self.random_body_crop, img_list))

    def __repr__(self):
        return f"{self.__class__.__name__}({self.low}, {self.high})"
|
|
|
|
class ToTensor:
    """Convert a list of HWC numpy images into one float NCHW tensor."""

    def __init__(self, **kwargs):
        """Accept and ignore any keyword arguments."""
        pass

    def __call__(self, img_list):
        """Stack the images into a single batch tensor.

        Args:
            img_list (list of np.ndarray): Images of identical (H, W, C) shape.

        Returns:
            torch.Tensor: Float tensor of shape (N, C, H, W). Values are cast to
            float but not rescaled.
        """
        # np.flip (used by Flip) returns negative-stride views, which
        # torch.from_numpy rejects; ascontiguousarray copies only when needed.
        tensors = [
            torch.from_numpy(np.ascontiguousarray(img))
            .float()
            .permute(2, 0, 1)
            .unsqueeze(0)
            for img in img_list
        ]
        return torch.cat(tensors, dim=0)
|
|
|
|
class Flip:
    """
    Flip
    ----
    Augmentation step that mirrors each image in a list with probability 0.5.

    Args:
        horizontal (bool): If truthy, mirror along axis 1 (left-right); otherwise
            mirror along axis 0 (up-down).
        val (bool): If True, ``__call__`` returns the input list untouched.
        **kwargs: Additional keyword arguments (ignored).

    Methods:
        random_flip(img): Draws one random number and flips when it exceeds 0.5.
        __call__(img_list): Applies ``random_flip`` to every image unless ``val`` is set.
        __repr__(): Returns a string representation of the transformation.
    """

    def __init__(self, horizontal=True, val=False, **kwargs):
        self.horizontal = horizontal
        self.val = val

    def random_flip(self, img):
        """Return ``np.flip(img)`` along the configured axis half of the time, else ``img`` itself."""
        flip_axis = 1 if self.horizontal else 0
        # Exactly one random draw per image, regardless of orientation.
        if np.random.rand() > 0.5:
            return np.flip(img, axis=flip_axis)
        return img

    def __call__(self, img_list):
        """Flip each image independently; a no-op in validation mode."""
        if self.val:
            return img_list
        return list(map(self.random_flip, img_list))

    def __repr__(self):
        return f"{self.__class__.__name__}(horizontal={self.horizontal})"
|
|
|
|
class SrcImagePipeline:
    """
    SrcImagePipeline
    ----------------
    Chains several image-list transformations and runs them in order.

    Each step is described by a dictionary whose 'name' key selects the
    transformation class and whose remaining keys are passed to that class's
    constructor as keyword arguments.

    Args:
        *args: One dictionary per step:
            - 'name' (str): One of 'UpRandomCrop', 'PadRatio', 'PadRatioWithScale',
              'ToTensor', or 'Flip'.
            - Other keys: constructor arguments of the selected transformation.

    Raises:
        ValueError: If a step's 'name' is not one of the supported names.

    Example:
        pipeline = SrcImagePipeline(
            {'name': 'UpRandomCrop', 'low': 0.8, 'high': 1.0},
            {'name': 'Flip', 'horizontal': True},
            {'name': 'PadRatio', 'target_ratio': 1.0},
            {'name': 'ToTensor'}
        )
        processed_imgs = pipeline(img_list)

    """

    def __init__(self, *args):
        self.valid_pipeline = {
            "UpRandomCrop": UpRandomCrop,
            "PadRatio": PadRatio,
            "PadRatioWithScale": PadRatioWithScale,
            "ToTensor": ToTensor,
            "Flip": Flip,
        }
        self.pipeline = []

        for step_cfg in args:
            # Work on a copy so popping 'name' never mutates the caller's config.
            step_kwargs = deepcopy(step_cfg)
            pipeline_name = step_kwargs.pop("name")
            step_cls = self.valid_pipeline.get(pipeline_name)
            if step_cls is None:
                raise ValueError(f"{pipeline_name} is not a valid pipeline")
            self.pipeline.append(step_cls(**step_kwargs))

    def __call__(self, img_list):
        """Feed ``img_list`` through every step, each consuming the previous output."""
        result = img_list
        for step in self.pipeline:
            result = step(result)
        return result

    def __repr__(self):
        return "Pipeline: " + " -> ".join(step.__repr__() for step in self.pipeline)
|
|