"""
Transforms and data augmentation for sequence level images, bboxes and masks.
"""
import random

import PIL
import torch
import torchvision.transforms as T
import torchvision.transforms.functional as F

from util.box_ops import box_xyxy_to_cxcywh, box_iou
from util.misc import interpolate
import numpy as np
from numpy import random as rand
from PIL import Image
import cv2


class Check(object):
    """Flag which instances still have a non-empty box (or mask) after the
    preceding transforms and store the result in ``target['valid']``."""

    def __call__(self, img, target):
        """Return ``(img, target)`` with ``target['valid']`` set.

        When ``target`` has ``'boxes'`` (read as ``(x0, y0, x1, y1)`` rows), an
        instance is valid if ``x1 > x0`` and ``y1 > y0``; otherwise, when it
        has ``'masks'``, an instance is valid if its mask has any non-zero
        element.  Invalid boxes are overwritten in place with zeros.  If the
        target has neither key, it is returned untouched.
        """
        if "boxes" in target or "masks" in target:
            if "boxes" in target:
                corners = target['boxes'].reshape(-1, 2, 2)
                keep = torch.all(corners[:, 1, :] > corners[:, 0, :], dim=1)
                if not bool(keep.all()):
                    # Degenerate boxes become [0, 0, 0, 0].  Writing zeros
                    # directly is exact for any coordinate magnitude (the
                    # former ``// 1000.0`` trick was not for coords >= 1000).
                    target['boxes'][~keep] = 0
            else:
                keep = target['masks'].flatten(1).any(1)
            target['valid'] = keep.to(torch.int32)

        return img, target


def bbox_overlaps(bboxes1, bboxes2, mode='iou', eps=1e-6):
    """Pairwise overlap between two sets of ``(x1, y1, x2, y2)`` numpy boxes.

    Args:
        bboxes1: array of shape (rows, 4).
        bboxes2: array of shape (cols, 4).
        mode: ``'iou'`` (intersection over union) or ``'iof'`` (intersection
            over the area of the ``bboxes1`` box).
        eps: lower bound applied to the denominator to avoid division by zero.

    Returns:
        float32 array of shape (rows, cols).
    """
    assert mode in ['iou', 'iof']

    bboxes1 = bboxes1.astype(np.float32)
    bboxes2 = bboxes2.astype(np.float32)
    rows = bboxes1.shape[0]
    cols = bboxes2.shape[0]
    ious = np.zeros((rows, cols), dtype=np.float32)
    if rows * cols == 0:
        return ious

    # Loop over the smaller set; transpose the result back at the end.
    exchange = False
    if bboxes1.shape[0] > bboxes2.shape[0]:
        bboxes1, bboxes2 = bboxes2, bboxes1
        ious = np.zeros((cols, rows), dtype=np.float32)
        exchange = True

    area1 = (bboxes1[:, 2] - bboxes1[:, 0]) * (bboxes1[:, 3] - bboxes1[:, 1])
    area2 = (bboxes2[:, 2] - bboxes2[:, 0]) * (bboxes2[:, 3] - bboxes2[:, 1])
    for i in range(bboxes1.shape[0]):
        x_start = np.maximum(bboxes1[i, 0], bboxes2[:, 0])
        y_start = np.maximum(bboxes1[i, 1], bboxes2[:, 1])
        x_end = np.minimum(bboxes1[i, 2], bboxes2[:, 2])
        y_end = np.minimum(bboxes1[i, 3], bboxes2[:, 3])
        overlap = np.maximum(x_end - x_start, 0) * np.maximum(y_end - y_start, 0)
        if mode == 'iou':
            union = area1[i] + area2 - overlap
        else:
            # 'iof' always divides by the area of the caller's first argument,
            # which is ``area2`` once the two sets have been exchanged.
            union = area1[i] if not exchange else area2
        union = np.maximum(union, eps)
        ious[i, :] = overlap / union

    if exchange:
        ious = ious.T
    return ious


def crop(clip, target, region):
    """Crop every frame of ``clip`` to ``region`` and update ``target``.

    Args:
        clip: iterable of images accepted by ``torchvision`` ``F.crop``.
        target: dict; a shallow copy is modified and returned.
        region: ``(top, left, height, width)``.

    Returns:
        ``(cropped_frames, target)`` where ``target['size']`` is ``[h, w]``,
        ``'boxes'`` (if present) are shifted and clipped to the crop window
        with ``'area'`` recomputed, and ``'masks'`` (if present) are sliced on
        their last two dimensions.
    """
    cropped_image = [F.crop(image, *region) for image in clip]

    target = target.copy()
    i, j, h, w = region

    # should we do something wrt the original size?
    target["size"] = torch.tensor([h, w])

    if "boxes" in target:
        boxes = target["boxes"]
        max_size = torch.as_tensor([w, h], dtype=torch.float32)
        cropped_boxes = boxes - torch.as_tensor([j, i, j, i])
        cropped_boxes = torch.min(cropped_boxes.reshape(-1, 2, 2), max_size)
        cropped_boxes = cropped_boxes.clamp(min=0)
        area = (cropped_boxes[:, 1, :] - cropped_boxes[:, 0, :]).prod(dim=1)
        target["boxes"] = cropped_boxes.reshape(-1, 4)
        target["area"] = area

    if "masks" in target:
        # FIXME should we update the area here if there are no boxes?
        target['masks'] = target['masks'][:, i:i + h, j:j + w]

    return cropped_image, target


def hflip(clip, target):
    """Horizontally flip every frame of ``clip`` plus the boxes/masks in a
    shallow copy of ``target``.  Frame width is read from ``clip[0].size``."""
    flipped_image = [F.hflip(image) for image in clip]

    w, h = clip[0].size

    target = target.copy()
    if "boxes" in target:
        boxes = target["boxes"]
        # (x0, y0, x1, y1) -> (w - x1, y0, w - x0, y1)
        boxes = boxes[:, [2, 1, 0, 3]] * torch.as_tensor([-1, 1, -1, 1]) + torch.as_tensor([w, 0, w, 0])
        target["boxes"] = boxes

    if "masks" in target:
        target['masks'] = target['masks'].flip(-1)

    return flipped_image, target


def vflip(clip, target):
    """Vertically flip every frame of ``clip`` plus the boxes/masks in a
    shallow copy of ``target``.  Frame height is read from ``clip[0].size``.

    The first parameter was previously named ``image`` while the body used
    ``clip``, which raised ``NameError`` on every call.
    """
    flipped_image = [F.vflip(image) for image in clip]

    w, h = clip[0].size

    target = target.copy()
    if "boxes" in target:
        boxes = target["boxes"]
        # (x0, y0, x1, y1) -> (x0, h - y1, x1, h - y0)
        boxes = boxes[:, [0, 3, 2, 1]] * torch.as_tensor([1, -1, 1, -1]) + torch.as_tensor([0, h, 0, h])
        target["boxes"] = boxes

    if "masks" in target:
        # Flip the row axis (second-to-last dimension).
        target['masks'] = target['masks'].flip(-2)

    return flipped_image, target


def resize(clip, target, size, max_size=None):
    """Resize every frame of ``clip`` and rescale ``target`` accordingly.

    Args:
        clip: list of images exposing ``.size`` as ``(w, h)``.
        target: dict or ``None``; a shallow copy is modified and returned.
        size: a scalar (length of the shorter side, aspect ratio preserved) or
            a ``(w, h)`` list/tuple.
        max_size: optional cap on the longer side when ``size`` is a scalar.

    Returns:
        ``(resized_frames, target)``; ``target`` is ``None`` if it was ``None``.
    """

    def get_size_with_aspect_ratio(image_size, size, max_size=None):
        w, h = image_size
        if max_size is not None:
            min_original_size = float(min((w, h)))
            max_original_size = float(max((w, h)))
            # Shrink the requested short side if the long side would overshoot.
            if max_original_size / min_original_size * size > max_size:
                size = int(round(max_size * min_original_size / max_original_size))

        if (w <= h and w == size) or (h <= w and h == size):
            return (h, w)

        if w < h:
            ow = size
            oh = int(size * h / w)
        else:
            oh = size
            ow = int(size * w / h)

        return (oh, ow)

    def get_size(image_size, size, max_size=None):
        if isinstance(size, (list, tuple)):
            return size[::-1]
        return get_size_with_aspect_ratio(image_size, size, max_size)

    size = get_size(clip[0].size, size, max_size)
    rescaled_image = [F.resize(image, size) for image in clip]

    if target is None:
        return rescaled_image, None

    ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(rescaled_image[0].size, clip[0].size))
    ratio_width, ratio_height = ratios

    target = target.copy()
    if "boxes" in target:
        boxes = target["boxes"]
        scaled_boxes = boxes * torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height])
        target["boxes"] = scaled_boxes

    if "area" in target:
        area = target["area"]
        scaled_area = area * (ratio_width * ratio_height)
        target["area"] = scaled_area

    h, w = size
    target["size"] = torch.tensor([h, w])

    if "masks" in target:
        if target['masks'].shape[0] > 0:
            target['masks'] = interpolate(
                target['masks'][:, None].float(), size, mode="nearest")[:, 0] > 0.5
        else:
            # Keep the dtype consistent with the non-empty branch (bool).
            target['masks'] = torch.zeros((target['masks'].shape[0], h, w), dtype=torch.bool)
    return rescaled_image, target


def pad(clip, target, padding):
    """Pad every frame of ``clip`` on the right by ``padding[0]`` and on the
    bottom by ``padding[1]``; ``target['size']`` and ``'masks'`` are updated
    in a shallow copy of ``target`` (``None`` is passed through)."""
    # assumes that we only pad on the bottom right corners
    padded_image = [F.pad(image, (0, 0, padding[0], padding[1])) for image in clip]
    if target is None:
        return padded_image, None
    target = target.copy()
    # should we do something wrt the original size?
    target["size"] = torch.tensor(padded_image[0].size[::-1])
    if "masks" in target:
        target['masks'] = torch.nn.functional.pad(target['masks'], (0, padding[0], 0, padding[1]))
    return padded_image, target


class RandomCrop(object):
    """Crop the whole clip to one randomly placed window of fixed ``size``."""

    def __init__(self, size):
        self.size = size

    def __call__(self, img, target):
        # ``img`` is a clip (``crop`` iterates over it), so the crop window is
        # sampled from its first frame, as RandomSizeCrop does.
        region = T.RandomCrop.get_params(img[0], self.size)
        return crop(img, target, region)


class RandomSizeCrop(object):
    """Crop the whole clip to one random window whose sides are drawn from
    ``[min_size, min(frame side, max_size)]``."""

    def __init__(self, min_size: int, max_size: int):
        self.min_size = min_size
        self.max_size = max_size

    def __call__(self, img, target: dict):
        w = random.randint(self.min_size, min(img[0].width, self.max_size))
        h = random.randint(self.min_size, min(img[0].height, self.max_size))
        region = T.RandomCrop.get_params(img[0], [h, w])
        return crop(img, target, region)


class CenterCrop(object):
    """Crop the whole clip to a centred window of ``size = (height, width)``."""

    def __init__(self, size):
        self.size = size

    def __call__(self, img, target):
        # ``img`` is a clip; frame dimensions come from its first frame.
        image_width, image_height = img[0].size
        crop_height, crop_width = self.size
        crop_top = int(round((image_height - crop_height) / 2.))
        crop_left = int(round((image_width - crop_width) / 2.))
        return crop(img, target, (crop_top, crop_left, crop_height, crop_width))


class MinIoURandomCrop(object):
    """Random crop constrained by a minimum IoU between the patch and the boxes.

    NOTE(review): unlike the clip-level transforms in this file, ``__call__``
    reads ``img.size`` and ``np.asarray(img)``, i.e. it handles one image, not
    a list of frames — confirm the intended usage.
    """

    def __init__(self, min_ious=(0.1, 0.3, 0.5, 0.7, 0.9), min_crop_size=0.3):
        self.min_ious = min_ious
        # 1 -> return the input unchanged, 0 -> no IoU constraint.
        self.sample_mode = (1, *min_ious, 0)
        self.min_crop_size = min_crop_size

    @staticmethod
    def _centers_in_patch(boxes, patch):
        """Boolean mask of the boxes whose centre lies strictly inside ``patch``."""
        center = (boxes[:, :2] + boxes[:, 2:]) / 2
        return ((center[:, 0] > patch[0]) * (center[:, 1] > patch[1]) *
                (center[:, 0] < patch[2]) * (center[:, 1] < patch[3]))

    def __call__(self, img, target):
        w, h = img.size
        while True:
            mode = random.choice(self.sample_mode)
            self.mode = mode
            if mode == 1:
                return img, target

            min_iou = mode
            boxes = target['boxes'].numpy()

            # Up to 50 attempts per sampled mode before drawing a new mode.
            for _ in range(50):
                new_w = rand.uniform(self.min_crop_size * w, w)
                new_h = rand.uniform(self.min_crop_size * h, h)

                # Aspect ratio of the patch must stay within [0.5, 2].
                if new_h / new_w < 0.5 or new_h / new_w > 2:
                    continue

                # ``uniform`` needs both bounds: a single positional argument
                # only sets ``low`` and leaves ``high`` at its default 1.0.
                left = rand.uniform(0, w - new_w)
                top = rand.uniform(0, h - new_h)

                patch = np.array((int(left), int(top), int(left + new_w), int(top + new_h)))
                if patch[2] == patch[0] or patch[3] == patch[1]:
                    continue
                overlaps = bbox_overlaps(patch.reshape(-1, 4), boxes.reshape(-1, 4)).reshape(-1)
                if len(overlaps) > 0 and overlaps.min() < min_iou:
                    continue

                if len(overlaps) > 0:
                    mask = self._centers_in_patch(boxes, patch)
                    # Reject the patch unless every box centre falls inside it.
                    if False in mask:
                        continue
                    # TODO: use no center boxes
                    # if not mask.any():
                    #     continue

                    # Clip the boxes to the patch and move them to its frame.
                    boxes[:, 2:] = boxes[:, 2:].clip(max=patch[2:])
                    boxes[:, :2] = boxes[:, :2].clip(min=patch[:2])
                    boxes -= np.tile(patch[:2], 2)
                    target['boxes'] = torch.tensor(boxes)

                img = np.asarray(img)[patch[1]:patch[3], patch[0]:patch[2]]
                img = Image.fromarray(img)
                width, height = img.size
                target['orig_size'] = torch.tensor([height, width])
                target['size'] = torch.tensor([height, width])
                return img, target


class RandomContrast(object):
    """With probability 0.5, multiply the image in place by a factor drawn
    from ``[lower, upper]``."""

    def __init__(self, lower=0.5, upper=1.5):
        self.lower = lower
        self.upper = upper
        assert self.upper >= self.lower, "contrast upper must be >= lower."
        assert self.lower >= 0, "contrast lower must be non-negative."

    def __call__(self, image, target):
        if rand.randint(2):
            alpha = rand.uniform(self.lower, self.upper)
            image *= alpha
        return image, target


class RandomBrightness(object):
    """With probability 0.5, add in place an offset drawn from
    ``[-delta, delta]``."""

    def __init__(self, delta=32):
        assert delta >= 0.0
        assert delta <= 255.0
        self.delta = delta

    def __call__(self, image, target):
        if rand.randint(2):
            delta = rand.uniform(-self.delta, self.delta)
            image += delta
        return image, target


class RandomSaturation(object):
    """With probability 0.5, scale channel 1 of the image in place by a factor
    drawn from ``[lower, upper]`` (PhotometricDistort applies this to HSV)."""

    def __init__(self, lower=0.5, upper=1.5):
        self.lower = lower
        self.upper = upper
        assert self.upper >= self.lower, "saturation upper must be >= lower."
        assert self.lower >= 0, "saturation lower must be non-negative."

    def __call__(self, image, target):
        if rand.randint(2):
            image[:, :, 1] *= rand.uniform(self.lower, self.upper)
        return image, target


class RandomHue(object):
    """With probability 0.5, shift channel 0 of the image in place by an offset
    drawn from ``[-delta, delta]``, wrapping the result around 360."""

    def __init__(self, delta=18.0):
        assert delta >= 0.0 and delta <= 360.0
        self.delta = delta

    def __call__(self, image, target):
        if rand.randint(2):
            image[:, :, 0] += rand.uniform(-self.delta, self.delta)
            image[:, :, 0][image[:, :, 0] > 360.0] -= 360.0
            image[:, :, 0][image[:, :, 0] < 0.0] += 360.0
        return image, target


class RandomLightingNoise(object):
    """With probability 0.5, apply a randomly chosen permutation to the
    colour channels."""

    def __init__(self):
        self.perms = ((0, 1, 2), (0, 2, 1),
                      (1, 0, 2), (1, 2, 0),
                      (2, 0, 1), (2, 1, 0))

    def __call__(self, image, target):
        if rand.randint(2):
            swap = self.perms[rand.randint(len(self.perms))]
            shuffle = SwapChannels(swap)  # shuffle channels
            image = shuffle(image)
        return image, target


class ConvertColor(object):
    """Convert between BGR and HSV with OpenCV; any other pair of colour
    spaces raises ``NotImplementedError``."""

    def __init__(self, current='BGR', transform='HSV'):
        self.transform = transform
        self.current = current

    def __call__(self, image, target):
        if self.current == 'BGR' and self.transform == 'HSV':
            image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
        elif self.current == 'HSV' and self.transform == 'BGR':
            image = cv2.cvtColor(image, cv2.COLOR_HSV2BGR)
        else:
            raise NotImplementedError
        return image, target


class SwapChannels(object):
    """Reorder the last axis of an ``(H, W, C)`` array according to ``swaps``."""

    def __init__(self, swaps):
        self.swaps = swaps

    def __call__(self, image):
        image = image[:, :, self.swaps]
        return image


class PhotometricDistort(object):
    """Random brightness / contrast / saturation / hue / channel-order
    distortion, sampled independently for every frame of the clip."""

    def __init__(self):
        self.pd = [
            RandomContrast(),
            ConvertColor(transform='HSV'),
            RandomSaturation(),
            RandomHue(),
            ConvertColor(current='HSV', transform='BGR'),
            RandomContrast()
        ]
        self.rand_brightness = RandomBrightness()
        self.rand_light_noise = RandomLightingNoise()

    def __call__(self, clip, target):
        imgs = []
        for img in clip:
            img = np.asarray(img).astype('float32')
            img, target = self.rand_brightness(img, target)
            # Contrast is applied either before or after the HSV round trip.
            if rand.randint(2):
                distort = Compose(self.pd[:-1])
            else:
                distort = Compose(self.pd[1:])
            img, target = distort(img, target)
            img, target = self.rand_light_noise(img, target)
            # Clip before the cast: out-of-range floats would otherwise wrap
            # around when converted to uint8.
            img = np.clip(img, 0, 255)
            imgs.append(Image.fromarray(img.astype('uint8')))
        return imgs, target


# NOTICE: if used for mask, need to change
class Expand(object):
    """With probability 0.5, paste every frame at one shared random offset onto
    a canvas filled with ``mean`` that is 1x-4x the frame size.

    Masks are read as ``target['masks'][i]`` for frame ``i`` — presumably one
    mask per frame; verify against the caller.
    """

    def __init__(self, mean):
        self.mean = mean

    def __call__(self, clip, target):
        if rand.randint(2):
            return clip, target

        first = np.asarray(clip[0]).astype('float32')
        height, width, depth = first.shape
        ratio = rand.uniform(1, 4)
        left = rand.uniform(0, width * ratio - width)
        top = rand.uniform(0, height * ratio - height)

        canvas_h, canvas_w = int(height * ratio), int(width * ratio)
        rows = slice(int(top), int(top + height))
        cols = slice(int(left), int(left + width))
        has_masks = "masks" in target

        imgs = []
        masks = []
        for i, frame in enumerate(clip):
            image = np.asarray(frame).astype('float32')
            expand_image = np.zeros((canvas_h, canvas_w, depth), dtype=image.dtype)
            expand_image[:, :, :] = self.mean
            expand_image[rows, cols] = image
            imgs.append(Image.fromarray(expand_image.astype('uint8')))
            if has_masks:
                expand_mask = torch.zeros((canvas_h, canvas_w), dtype=torch.uint8)
                expand_mask[rows, cols] = target['masks'][i]
                masks.append(expand_mask)

        if "boxes" in target:
            boxes = target['boxes']
            offset = torch.as_tensor([int(left), int(top), int(left), int(top)], dtype=boxes.dtype)
            # Build a new tensor rather than shifting the caller's in place.
            target['boxes'] = boxes + offset
        if has_masks:
            target['masks'] = torch.stack(masks)
        return imgs, target


class RandomHorizontalFlip(object):
    """Horizontally flip the clip with probability ``p``; when the target has a
    ``'caption'``, the words 'left' and 'right' in it are swapped."""

    def __init__(self, p=0.5):
        self.p = p

    def __call__(self, img, target):
        if random.random() < self.p:
            # NOTE: caption for 'left' and 'right' should also change
            if 'caption' in target:
                caption = target['caption']
                # '@' is a temporary placeholder so the two words can be swapped.
                target['caption'] = caption.replace('left', '@').replace('right', 'left').replace('@', 'right')
            return hflip(img, target)
        return img, target


class RandomVerticalFlip(object):
    """Vertically flip the clip with probability ``p``."""

    def __init__(self, p=0.5):
        self.p = p

    def __call__(self, img, target):
        if random.random() < self.p:
            return vflip(img, target)
        return img, target


class RandomResize(object):
    """Resize the clip to a size picked at random from ``sizes``."""

    def __init__(self, sizes, max_size=None):
        assert isinstance(sizes, (list, tuple))
        self.sizes = sizes
        self.max_size = max_size

    def __call__(self, img, target=None):
        size = random.choice(self.sizes)
        return resize(img, target, size, self.max_size)


class RandomPad(object):
    """Pad the clip on the right/bottom by random amounts in ``[0, max_pad]``."""

    def __init__(self, max_pad):
        self.max_pad = max_pad

    def __call__(self, img, target):
        pad_x = random.randint(0, self.max_pad)
        pad_y = random.randint(0, self.max_pad)
        return pad(img, target, (pad_x, pad_y))


class RandomSelect(object):
    """
    Randomly selects between transforms1 and transforms2,
    with probability p for transforms1 and (1 - p) for transforms2
    """

    def __init__(self, transforms1, transforms2, p=0.5):
        self.transforms1 = transforms1
        self.transforms2 = transforms2
        self.p = p

    def __call__(self, img, target):
        if random.random() < self.p:
            return self.transforms1(img, target)
        return self.transforms2(img, target)


class ToTensor(object):
    """Convert every frame of the clip with ``F.to_tensor``; the target is
    returned unchanged."""

    def __call__(self, clip, target):
        img = [F.to_tensor(im) for im in clip]
        return img, target


class RandomErasing(object):
    """Thin wrapper forwarding ``img`` to ``torchvision`` ``T.RandomErasing``.

    NOTE(review): ``img`` is passed through as-is; whether a list of frames is
    accepted depends on ``T.RandomErasing`` — confirm before using on clips.
    """

    def __init__(self, *args, **kwargs):
        self.eraser = T.RandomErasing(*args, **kwargs)

    def __call__(self, img, target):
        return self.eraser(img), target


class Normalize(object):
    """Normalise every frame with ``mean``/``std`` and convert ``'boxes'`` from
    absolute xyxy to cxcywh divided by the frame width/height."""

    def __init__(self, mean, std):
        self.mean = mean
        self.std = std

    def __call__(self, clip, target=None):
        image = [F.normalize(im, mean=self.mean, std=self.std) for im in clip]
        if target is None:
            return image, None
        target = target.copy()
        h, w = image[0].shape[-2:]
        if "boxes" in target:
            boxes = target["boxes"]
            boxes = box_xyxy_to_cxcywh(boxes)
            boxes = boxes / torch.tensor([w, h, w, h], dtype=torch.float32)
            target["boxes"] = boxes
        return image, target


class Compose(object):
    """Apply a sequence of ``(image, target) -> (image, target)`` transforms
    in order."""

    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, image, target):
        for t in self.transforms:
            image, target = t(image, target)
        return image, target

    def __repr__(self):
        lines = [self.__class__.__name__ + "("]
        lines.extend(" {0}".format(t) for t in self.transforms)
        lines.append(")")
        return "\n".join(lines)