# Copy from: https://github.com/rwightman/efficientdet-pytorch/blob/d43c9e34cd62d22b4205831bb735f6dd83b8e881/effdet/data/transforms.py
""" COCO transforms (quick and dirty)

Hacked together by Ross Wightman

Boxes throughout this module are in yxyx order (cols 0/2 are y, cols 1/3 are x),
stored as an (N, 4+) numpy array under ``anno["bbox"]`` with matching ``anno["cls"]``.
"""
import random
import math
from copy import deepcopy

from PIL import Image
import numpy as np
import torch

IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)
IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)
IMAGENET_INCEPTION_MEAN = (0.5, 0.5, 0.5)
IMAGENET_INCEPTION_STD = (0.5, 0.5, 0.5)


class ImageToNumpy:
    """Convert a PIL image to a CHW uint8 numpy array; annotations pass through unchanged."""

    def __call__(self, pil_img, annotations: dict):
        np_img = np.array(pil_img, dtype=np.uint8)
        if np_img.ndim < 3:
            # Grayscale image: add a trailing channel axis so moveaxis below is valid.
            np_img = np.expand_dims(np_img, axis=-1)
        np_img = np.moveaxis(np_img, 2, 0)  # HWC to CHW
        return np_img, annotations


class ImageToTensor:
    """Convert a PIL image to a CHW torch tensor of ``dtype``; annotations pass through unchanged."""

    def __init__(self, dtype=torch.float32):
        self.dtype = dtype

    def __call__(self, pil_img, annotations: dict):
        np_img = np.array(pil_img, dtype=np.uint8)
        if np_img.ndim < 3:
            # Grayscale image: add a trailing channel axis so moveaxis below is valid.
            np_img = np.expand_dims(np_img, axis=-1)
        np_img = np.moveaxis(np_img, 2, 0)  # HWC to CHW
        return torch.from_numpy(np_img).to(dtype=self.dtype), annotations


def _pil_interp(method):
    """Map an interpolation name ("bicubic"/"lanczos"/"hamming") to the PIL resampling constant.

    Any other value falls back to bilinear.
    """
    if method == "bicubic":
        return Image.BICUBIC
    elif method == "lanczos":
        return Image.LANCZOS
    elif method == "hamming":
        return Image.HAMMING
    else:
        # default bilinear, do we want to allow nearest?
        return Image.BILINEAR


_RANDOM_INTERPOLATION = (Image.BILINEAR, Image.BICUBIC)


def clip_boxes_(boxes, img_size):
    """Clip yxyx boxes in-place to [0, height] / [0, width] bounds.

    Args:
        boxes: (N, 4) numpy array of yxyx boxes, modified in-place.
        img_size: (height, width) tuple giving the clip bounds.
    """
    height, width = img_size
    clip_upper = np.array([height, width] * 2, dtype=boxes.dtype)
    np.clip(boxes, 0, clip_upper, out=boxes)


def clip_boxes(boxes, img_size):
    """Return a clipped copy of yxyx boxes; the input array is left untouched."""
    clipped_boxes = boxes.copy()
    clip_boxes_(clipped_boxes, img_size)
    return clipped_boxes


def _size_tuple(size):
    """Normalize an int or length-2 sequence into an (h, w) size tuple."""
    if isinstance(size, int):
        return size, size
    else:
        assert len(size) == 2
        return size


class ResizePad:
    """Letterbox resize: scale to fit ``target_size`` preserving aspect ratio, pad with ``fill_color``.

    The scaled image is pasted at the top-left corner; boxes are rescaled, clipped,
    and degenerate (zero-area) boxes are dropped. ``anno["img_scale"]`` records the
    factor that maps the network-space boxes back to original-image coordinates.
    """

    def __init__(self, target_size: int, interpolation: str = "bilinear", fill_color: tuple = (0, 0, 0)):
        self.target_size = _size_tuple(target_size)
        self.interpolation = interpolation
        self.fill_color = fill_color

    def __call__(self, img, anno: dict):
        width, height = img.size

        # Uniform scale chosen so the whole image fits inside the target.
        img_scale_y = self.target_size[0] / height
        img_scale_x = self.target_size[1] / width
        img_scale = min(img_scale_y, img_scale_x)
        scaled_h = int(height * img_scale)
        scaled_w = int(width * img_scale)

        new_img = Image.new("RGB", (self.target_size[1], self.target_size[0]), color=self.fill_color)
        interp_method = _pil_interp(self.interpolation)
        img = img.resize((scaled_w, scaled_h), interp_method)
        new_img.paste(img)  # pastes at 0,0 (upper-left corner)

        if "bbox" in anno:
            bbox = anno["bbox"]
            bbox[:, :4] *= img_scale
            bbox_bound = (min(scaled_h, self.target_size[0]), min(scaled_w, self.target_size[1]))
            clip_boxes_(bbox, bbox_bound)  # crop to bounds of target image or letter-box, whichever is smaller
            valid_indices = (bbox[:, :2] < bbox[:, 2:4]).all(axis=1)
            anno["bbox"] = bbox[valid_indices, :]
            anno["cls"] = anno["cls"][valid_indices]

        anno["img_scale"] = 1.0 / img_scale  # back to original

        return new_img, anno


class RandomResizePad:
    """Random-scale letterbox: resize by a random factor in ``scale``, random-crop, random-pad.

    Differs from the upstream effdet version: instead of always pasting at the
    upper-left corner, the (possibly smaller) image is pasted at a random offset
    inside the target canvas, and boxes are shifted by the same offset.
    """

    def __init__(
        self, target_size: int, scale: tuple = (0.1, 2.0), interpolation: str = "random",
        fill_color: tuple = (0, 0, 0)
    ):
        self.target_size = _size_tuple(target_size)
        self.scale = scale
        if interpolation == "random":
            self.interpolation = _RANDOM_INTERPOLATION
        else:
            self.interpolation = _pil_interp(interpolation)
        self.fill_color = fill_color

    def _get_params(self, img):
        # Select a random scale factor.
        scale_factor = random.uniform(*self.scale)
        scaled_target_height = scale_factor * self.target_size[0]
        scaled_target_width = scale_factor * self.target_size[1]

        # Recompute the accurate scale_factor using rounded scaled image size.
        width, height = img.size
        img_scale_y = scaled_target_height / height
        img_scale_x = scaled_target_width / width
        img_scale = min(img_scale_y, img_scale_x)

        # Select non-zero random offset (x, y) if scaled image is larger than target size
        scaled_h = int(height * img_scale)
        scaled_w = int(width * img_scale)
        offset_y = scaled_h - self.target_size[0]
        offset_x = scaled_w - self.target_size[1]
        offset_y = int(max(0.0, float(offset_y)) * random.uniform(0, 1))
        offset_x = int(max(0.0, float(offset_x)) * random.uniform(0, 1))
        return scaled_h, scaled_w, offset_y, offset_x, img_scale

    def __call__(self, img, anno: dict):
        scaled_h, scaled_w, offset_y, offset_x, img_scale = self._get_params(img)

        if isinstance(self.interpolation, (tuple, list)):
            interpolation = random.choice(self.interpolation)
        else:
            interpolation = self.interpolation
        img = img.resize((scaled_w, scaled_h), interpolation)
        # Crop down to the target size if the scaled image overflows it.
        right, lower = min(scaled_w, offset_x + self.target_size[1]), min(scaled_h, offset_y + self.target_size[0])
        img = img.crop((offset_x, offset_y, right, lower))
        new_img = Image.new("RGB", (self.target_size[1], self.target_size[0]), color=self.fill_color)

        # NOTE: modified from upstream — paste the image at a random offset inside the
        # target canvas instead of always at the upper-left corner, so the padding is
        # distributed randomly around the image.
        target_height = self.target_size[0]
        target_width = self.target_size[1]
        img_width, img_height = img.size
        offset_x_paste = int(max(0, target_width - img_width) * random.uniform(0, 1))
        offset_y_paste = int(max(0, target_height - img_height) * random.uniform(0, 1))
        right_paste = min(target_width, offset_x_paste + img_width)
        lower_paste = min(target_height, offset_y_paste + img_height)
        new_img.paste(
            img, (offset_x_paste, offset_y_paste, right_paste, lower_paste)
        )  # pastes at the random (offset_x_paste, offset_y_paste) position

        if "bbox" in anno:
            bbox = anno["bbox"]  # for convenience, modifies in-place
            bbox[:, :4] *= img_scale
            box_offset = np.stack([offset_y, offset_x] * 2)
            bbox -= box_offset
            bbox_bound = (min(scaled_h, self.target_size[0]), min(scaled_w, self.target_size[1]))
            clip_boxes_(bbox, bbox_bound)  # crop to bounds of target image or letter-box, whichever is smaller
            # NOTE: modified from upstream — shift boxes by the same random paste offset
            # applied to the image above so they stay aligned with the pixels.
            box_offset_paste = np.stack([offset_y_paste, offset_x_paste] * 2)
            bbox += box_offset_paste
            valid_indices = (bbox[:, :2] < bbox[:, 2:4]).all(axis=1)
            anno["bbox"] = bbox[valid_indices, :]
            anno["cls"] = anno["cls"][valid_indices]

        anno["img_scale"] = 1.0 / img_scale  # back to original

        return new_img, anno


class RandomFlip:
    """Randomly flip the image (and yxyx boxes) horizontally and/or vertically with ``prob``."""

    def __init__(self, horizontal=True, vertical=False, prob=0.5):
        self.horizontal = horizontal
        self.vertical = vertical
        self.prob = prob

    def _get_params(self):
        # Each enabled axis flips independently with probability ``prob``.
        do_horizontal = random.random() < self.prob if self.horizontal else False
        do_vertical = random.random() < self.prob if self.vertical else False
        return do_horizontal, do_vertical

    def __call__(self, img, annotations: dict):
        do_horizontal, do_vertical = self._get_params()
        width, height = img.size

        def _fliph(bbox):
            # Mirror x coordinates (cols 1 and 3); min/max swap after mirroring.
            x_max = width - bbox[:, 1]
            x_min = width - bbox[:, 3]
            bbox[:, 1] = x_min
            bbox[:, 3] = x_max

        def _flipv(bbox):
            # Mirror y coordinates (cols 0 and 2); min/max swap after mirroring.
            y_max = height - bbox[:, 0]
            y_min = height - bbox[:, 2]
            bbox[:, 0] = y_min
            bbox[:, 2] = y_max

        if do_horizontal and do_vertical:
            # Both flips together are a 180-degree rotation (single PIL op).
            img = img.transpose(Image.ROTATE_180)
            if "bbox" in annotations:
                _fliph(annotations["bbox"])
                _flipv(annotations["bbox"])
        elif do_horizontal:
            img = img.transpose(Image.FLIP_LEFT_RIGHT)
            if "bbox" in annotations:
                _fliph(annotations["bbox"])
        elif do_vertical:
            img = img.transpose(Image.FLIP_TOP_BOTTOM)
            if "bbox" in annotations:
                _flipv(annotations["bbox"])

        return img, annotations


def resolve_fill_color(fill_color, img_mean=IMAGENET_DEFAULT_MEAN):
    """Resolve a fill-color spec into an RGB tuple.

    Accepts a 3-tuple (used as-is), anything int()-convertible (replicated to
    all three channels), or the string "mean" (per-channel 0-255 values from
    ``img_mean``).
    """
    if isinstance(fill_color, tuple):
        assert len(fill_color) == 3
        fill_color = fill_color
    else:
        try:
            int_color = int(fill_color)
            fill_color = (int_color,) * 3
        except ValueError:
            assert fill_color == "mean"
            fill_color = tuple([int(round(255 * x)) for x in img_mean])
    return fill_color


class Compose:
    """Chain (img, annotations) transforms, threading both values through each."""

    def __init__(self, transforms: list):
        self.transforms = transforms

    def __call__(self, img, annotations: dict):
        for t in self.transforms:
            img, annotations = t(img, annotations)
        return img, annotations


def transforms_coco_eval(
    img_size=224,
    interpolation="bilinear",
    use_prefetcher=False,
    fill_color="mean",
    mean=IMAGENET_DEFAULT_MEAN,
    std=IMAGENET_DEFAULT_STD,
):
    """Build the deterministic eval transform pipeline (letterbox resize + to-numpy)."""
    fill_color = resolve_fill_color(fill_color, mean)

    image_tfl = [
        ResizePad(target_size=img_size, interpolation=interpolation, fill_color=fill_color),
        ImageToNumpy(),
    ]

    # Normalization (mean/std) is expected to happen in the prefetcher.
    assert use_prefetcher, "Only supporting prefetcher usage right now"

    image_tf = Compose(image_tfl)
    return image_tf


def transforms_coco_train(
    img_size=224,
    interpolation="random",
    use_prefetcher=False,
    fill_color="mean",
    mean=IMAGENET_DEFAULT_MEAN,
    std=IMAGENET_DEFAULT_STD,
):
    """Build the augmenting train transform pipeline (flip + random resize/pad + to-numpy)."""
    fill_color = resolve_fill_color(fill_color, mean)

    image_tfl = [
        RandomFlip(horizontal=True, prob=0.5),
        RandomResizePad(target_size=img_size, interpolation=interpolation, fill_color=fill_color),
        ImageToNumpy(),
    ]

    # Normalization (mean/std) is expected to happen in the prefetcher.
    assert use_prefetcher, "Only supporting prefetcher usage right now"

    image_tf = Compose(image_tfl)
    return image_tf