diff --git a/mmyolo/__init__.py b/mmyolo/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a7a2f33338e484f9ecb4e9123e88b84f902cd6cf --- /dev/null +++ b/mmyolo/__init__.py @@ -0,0 +1,39 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import mmcv +import mmdet +import mmengine +from mmengine.utils import digit_version + +from .version import __version__, version_info + +mmcv_minimum_version = '2.0.0rc4' +mmcv_maximum_version = '2.1.0' +mmcv_version = digit_version(mmcv.__version__) + +mmengine_minimum_version = '0.6.0' +mmengine_maximum_version = '1.0.0' +mmengine_version = digit_version(mmengine.__version__) + +mmdet_minimum_version = '3.0.0rc6' +mmdet_maximum_version = '3.1.0' +mmdet_version = digit_version(mmdet.__version__) + + +assert (mmcv_version >= digit_version(mmcv_minimum_version) + and mmcv_version < digit_version(mmcv_maximum_version)), \ + f'MMCV=={mmcv.__version__} is used but incompatible. ' \ + f'Please install mmcv>={mmcv_minimum_version}, <{mmcv_maximum_version}.' + +assert (mmengine_version >= digit_version(mmengine_minimum_version) + and mmengine_version < digit_version(mmengine_maximum_version)), \ + f'MMEngine=={mmengine.__version__} is used but incompatible. ' \ + f'Please install mmengine>={mmengine_minimum_version}, ' \ + f'<{mmengine_maximum_version}.' + +assert (mmdet_version >= digit_version(mmdet_minimum_version) + and mmdet_version < digit_version(mmdet_maximum_version)), \ + f'MMDetection=={mmdet.__version__} is used but incompatible. ' \ + f'Please install mmdet>={mmdet_minimum_version}, ' \ + f'<{mmdet_maximum_version}.' + +__all__ = ['__version__', 'version_info', 'digit_version'] diff --git a/mmyolo/datasets/__init__.py b/mmyolo/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b3b6b971937e0179306965fbb5695121fd5d3b64 --- /dev/null +++ b/mmyolo/datasets/__init__.py @@ -0,0 +1,12 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .transforms import * # noqa: F401,F403 +from .utils import BatchShapePolicy, yolov5_collate +from .yolov5_coco import YOLOv5CocoDataset +from .yolov5_crowdhuman import YOLOv5CrowdHumanDataset +from .yolov5_dota import YOLOv5DOTADataset +from .yolov5_voc import YOLOv5VOCDataset + +__all__ = [ + 'YOLOv5CocoDataset', 'YOLOv5VOCDataset', 'BatchShapePolicy', + 'yolov5_collate', 'YOLOv5CrowdHumanDataset', 'YOLOv5DOTADataset' +] diff --git a/mmyolo/datasets/transforms/__init__.py b/mmyolo/datasets/transforms/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..58f4e6fdb5d7272743240f1d0da55c5a7d489fbf --- /dev/null +++ b/mmyolo/datasets/transforms/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
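+# The transforms exported below are registered in MMYOLO's TRANSFORMS registry
+# and are normally referenced by class name from a pipeline config. A minimal
+# illustrative sketch only (LoadImageFromFile comes from MMCV, LoadAnnotations
+# is the variant defined in this package, and the values shown are the
+# documented defaults):
+#
+#   pre_transform = [
+#       dict(type='LoadImageFromFile'),
+#       dict(type='LoadAnnotations', with_bbox=True)
+#   ]
+#   train_pipeline = [
+#       *pre_transform,
+#       dict(type='Mosaic', img_scale=(640, 640), pad_val=114.0,
+#            pre_transform=pre_transform),
+#       dict(type='YOLOv5HSVRandomAug')
+#   ]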
+from .mix_img_transforms import Mosaic, Mosaic9, YOLOv5MixUp, YOLOXMixUp +from .transforms import (LetterResize, LoadAnnotations, PPYOLOERandomCrop, + PPYOLOERandomDistort, RegularizeRotatedBox, + RemoveDataElement, YOLOv5CopyPaste, + YOLOv5HSVRandomAug, YOLOv5KeepRatioResize, + YOLOv5RandomAffine) + +__all__ = [ + 'YOLOv5KeepRatioResize', 'LetterResize', 'Mosaic', 'YOLOXMixUp', + 'YOLOv5MixUp', 'YOLOv5HSVRandomAug', 'LoadAnnotations', + 'YOLOv5RandomAffine', 'PPYOLOERandomDistort', 'PPYOLOERandomCrop', + 'Mosaic9', 'YOLOv5CopyPaste', 'RemoveDataElement', 'RegularizeRotatedBox' +] diff --git a/mmyolo/datasets/transforms/mix_img_transforms.py b/mmyolo/datasets/transforms/mix_img_transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..4a25f6f7ef327878cc19b51f32037037d0412aaa --- /dev/null +++ b/mmyolo/datasets/transforms/mix_img_transforms.py @@ -0,0 +1,1150 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import collections +import copy +from abc import ABCMeta, abstractmethod +from typing import Optional, Sequence, Tuple, Union + +import mmcv +import numpy as np +from mmcv.transforms import BaseTransform +from mmdet.structures.bbox import autocast_box_type +from mmengine.dataset import BaseDataset +from mmengine.dataset.base_dataset import Compose +from numpy import random + +from mmyolo.registry import TRANSFORMS + + +class BaseMixImageTransform(BaseTransform, metaclass=ABCMeta): + """A Base Transform of multiple images mixed. + + Suitable for training on multiple images mixed data augmentation like + mosaic and mixup. + + Cached mosaic transform will random select images from the cache + and combine them into one output image if use_cached is True. + + Args: + pre_transform(Sequence[str]): Sequence of transform object or + config dict to be composed. Defaults to None. + prob(float): The transformation probability. Defaults to 1.0. + use_cached (bool): Whether to use cache. Defaults to False. + max_cached_images (int): The maximum length of the cache. The larger + the cache, the stronger the randomness of this transform. As a + rule of thumb, providing 10 caches for each image suffices for + randomness. Defaults to 40. + random_pop (bool): Whether to randomly pop a result from the cache + when the cache is full. If set to False, use FIFO popping method. + Defaults to True. + max_refetch (int): The maximum number of retry iterations for getting + valid results from the pipeline. If the number of iterations is + greater than `max_refetch`, but results is still None, then the + iteration is terminated and raise the error. Defaults to 15. + """ + + def __init__(self, + pre_transform: Optional[Sequence[str]] = None, + prob: float = 1.0, + use_cached: bool = False, + max_cached_images: int = 40, + random_pop: bool = True, + max_refetch: int = 15): + + self.max_refetch = max_refetch + self.prob = prob + + self.use_cached = use_cached + self.max_cached_images = max_cached_images + self.random_pop = random_pop + self.results_cache = [] + + if pre_transform is None: + self.pre_transform = None + else: + self.pre_transform = Compose(pre_transform) + + @abstractmethod + def get_indexes(self, dataset: Union[BaseDataset, + list]) -> Union[list, int]: + """Call function to collect indexes. + + Args: + dataset (:obj:`Dataset` or list): The dataset or cached list. + + Returns: + list or int: indexes. + """ + pass + + @abstractmethod + def mix_img_transform(self, results: dict) -> dict: + """Mixed image data transformation. + + Args: + results (dict): Result dict. 
+ + Returns: + results (dict): Updated result dict. + """ + pass + + @autocast_box_type() + def transform(self, results: dict) -> dict: + """Data augmentation function. + + The transform steps are as follows: + 1. Randomly generate index list of other images. + 2. Before Mosaic or MixUp need to go through the necessary + pre_transform, such as MixUp' pre_transform pipeline + include: 'LoadImageFromFile','LoadAnnotations', + 'Mosaic' and 'RandomAffine'. + 3. Use mix_img_transform function to implement specific + mix operations. + + Args: + results (dict): Result dict. + + Returns: + results (dict): Updated result dict. + """ + + if random.uniform(0, 1) > self.prob: + return results + + if self.use_cached: + # Be careful: deep copying can be very time-consuming + # if results includes dataset. + dataset = results.pop('dataset', None) + self.results_cache.append(copy.deepcopy(results)) + if len(self.results_cache) > self.max_cached_images: + if self.random_pop: + index = random.randint(0, len(self.results_cache) - 1) + else: + index = 0 + self.results_cache.pop(index) + + if len(self.results_cache) <= 4: + return results + else: + assert 'dataset' in results + # Be careful: deep copying can be very time-consuming + # if results includes dataset. + dataset = results.pop('dataset', None) + + for _ in range(self.max_refetch): + # get index of one or three other images + if self.use_cached: + indexes = self.get_indexes(self.results_cache) + else: + indexes = self.get_indexes(dataset) + + if not isinstance(indexes, collections.abc.Sequence): + indexes = [indexes] + + if self.use_cached: + mix_results = [ + copy.deepcopy(self.results_cache[i]) for i in indexes + ] + else: + # get images information will be used for Mosaic or MixUp + mix_results = [ + copy.deepcopy(dataset.get_data_info(index)) + for index in indexes + ] + + if self.pre_transform is not None: + for i, data in enumerate(mix_results): + # pre_transform may also require dataset + data.update({'dataset': dataset}) + # before Mosaic or MixUp need to go through + # the necessary pre_transform + _results = self.pre_transform(data) + _results.pop('dataset') + mix_results[i] = _results + + if None not in mix_results: + results['mix_results'] = mix_results + break + print('Repeated calculation') + else: + raise RuntimeError( + 'The loading pipeline of the original dataset' + ' always return None. Please check the correctness ' + 'of the dataset and its pipeline.') + + # Mosaic or MixUp + results = self.mix_img_transform(results) + + if 'mix_results' in results: + results.pop('mix_results') + results['dataset'] = dataset + + return results + + +@TRANSFORMS.register_module() +class Mosaic(BaseMixImageTransform): + """Mosaic augmentation. + + Given 4 images, mosaic transform combines them into + one output image. The output image is composed of the parts from each sub- + image. + + .. code:: text + + mosaic transform + center_x + +------------------------------+ + | pad | | + | +-----------+ pad | + | | | | + | | image1 +-----------+ + | | | | + | | | image2 | + center_y |----+-+-----------+-----------+ + | | cropped | | + |pad | image3 | image4 | + | | | | + +----|-------------+-----------+ + | | + +-------------+ + + The mosaic transform steps are as follows: + + 1. Choose the mosaic center as the intersections of 4 images + 2. Get the left top image according to the index, and randomly + sample another 3 images from the custom dataset. + 3. 
Sub image will be cropped if image is larger than mosaic patch + + Required Keys: + + - img + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_bboxes_labels (np.int64) (optional) + - gt_ignore_flags (bool) (optional) + - mix_results (List[dict]) + + Modified Keys: + + - img + - img_shape + - gt_bboxes (optional) + - gt_bboxes_labels (optional) + - gt_ignore_flags (optional) + + Args: + img_scale (Sequence[int]): Image size after mosaic pipeline of single + image. The shape order should be (width, height). + Defaults to (640, 640). + center_ratio_range (Sequence[float]): Center ratio range of mosaic + output. Defaults to (0.5, 1.5). + bbox_clip_border (bool, optional): Whether to clip the objects outside + the border of the image. In some dataset like MOT17, the gt bboxes + are allowed to cross the border of images. Therefore, we don't + need to clip the gt bboxes in these cases. Defaults to True. + pad_val (int): Pad value. Defaults to 114. + pre_transform(Sequence[dict]): Sequence of transform object or + config dict to be composed. + prob (float): Probability of applying this transformation. + Defaults to 1.0. + use_cached (bool): Whether to use cache. Defaults to False. + max_cached_images (int): The maximum length of the cache. The larger + the cache, the stronger the randomness of this transform. As a + rule of thumb, providing 10 caches for each image suffices for + randomness. Defaults to 40. + random_pop (bool): Whether to randomly pop a result from the cache + when the cache is full. If set to False, use FIFO popping method. + Defaults to True. + max_refetch (int): The maximum number of retry iterations for getting + valid results from the pipeline. If the number of iterations is + greater than `max_refetch`, but results is still None, then the + iteration is terminated and raise the error. Defaults to 15. + """ + + def __init__(self, + img_scale: Tuple[int, int] = (640, 640), + center_ratio_range: Tuple[float, float] = (0.5, 1.5), + bbox_clip_border: bool = True, + pad_val: float = 114.0, + pre_transform: Sequence[dict] = None, + prob: float = 1.0, + use_cached: bool = False, + max_cached_images: int = 40, + random_pop: bool = True, + max_refetch: int = 15): + assert isinstance(img_scale, tuple) + assert 0 <= prob <= 1.0, 'The probability should be in range [0,1]. ' \ + f'got {prob}.' + if use_cached: + assert max_cached_images >= 4, 'The length of cache must >= 4, ' \ + f'but got {max_cached_images}.' + + super().__init__( + pre_transform=pre_transform, + prob=prob, + use_cached=use_cached, + max_cached_images=max_cached_images, + random_pop=random_pop, + max_refetch=max_refetch) + + self.img_scale = img_scale + self.center_ratio_range = center_ratio_range + self.bbox_clip_border = bbox_clip_border + self.pad_val = pad_val + + def get_indexes(self, dataset: Union[BaseDataset, list]) -> list: + """Call function to collect indexes. + + Args: + dataset (:obj:`Dataset` or list): The dataset or cached list. + + Returns: + list: indexes. + """ + indexes = [random.randint(0, len(dataset)) for _ in range(3)] + return indexes + + def mix_img_transform(self, results: dict) -> dict: + """Mixed image data transformation. + + Args: + results (dict): Result dict. + + Returns: + results (dict): Updated result dict. 
+ """ + assert 'mix_results' in results + mosaic_bboxes = [] + mosaic_bboxes_labels = [] + mosaic_ignore_flags = [] + mosaic_masks = [] + with_mask = True if 'gt_masks' in results else False + # self.img_scale is wh format + img_scale_w, img_scale_h = self.img_scale + + if len(results['img'].shape) == 3: + mosaic_img = np.full( + (int(img_scale_h * 2), int(img_scale_w * 2), 3), + self.pad_val, + dtype=results['img'].dtype) + else: + mosaic_img = np.full((int(img_scale_h * 2), int(img_scale_w * 2)), + self.pad_val, + dtype=results['img'].dtype) + + # mosaic center x, y + center_x = int(random.uniform(*self.center_ratio_range) * img_scale_w) + center_y = int(random.uniform(*self.center_ratio_range) * img_scale_h) + center_position = (center_x, center_y) + + loc_strs = ('top_left', 'top_right', 'bottom_left', 'bottom_right') + for i, loc in enumerate(loc_strs): + if loc == 'top_left': + results_patch = results + else: + results_patch = results['mix_results'][i - 1] + + img_i = results_patch['img'] + h_i, w_i = img_i.shape[:2] + # keep_ratio resize + scale_ratio_i = min(img_scale_h / h_i, img_scale_w / w_i) + img_i = mmcv.imresize( + img_i, (int(w_i * scale_ratio_i), int(h_i * scale_ratio_i))) + + # compute the combine parameters + paste_coord, crop_coord = self._mosaic_combine( + loc, center_position, img_i.shape[:2][::-1]) + x1_p, y1_p, x2_p, y2_p = paste_coord + x1_c, y1_c, x2_c, y2_c = crop_coord + + # crop and paste image + mosaic_img[y1_p:y2_p, x1_p:x2_p] = img_i[y1_c:y2_c, x1_c:x2_c] + + # adjust coordinate + gt_bboxes_i = results_patch['gt_bboxes'] + gt_bboxes_labels_i = results_patch['gt_bboxes_labels'] + gt_ignore_flags_i = results_patch['gt_ignore_flags'] + + padw = x1_p - x1_c + padh = y1_p - y1_c + gt_bboxes_i.rescale_([scale_ratio_i, scale_ratio_i]) + gt_bboxes_i.translate_([padw, padh]) + mosaic_bboxes.append(gt_bboxes_i) + mosaic_bboxes_labels.append(gt_bboxes_labels_i) + mosaic_ignore_flags.append(gt_ignore_flags_i) + if with_mask and results_patch.get('gt_masks', None) is not None: + gt_masks_i = results_patch['gt_masks'] + gt_masks_i = gt_masks_i.rescale(float(scale_ratio_i)) + gt_masks_i = gt_masks_i.translate( + out_shape=(int(self.img_scale[0] * 2), + int(self.img_scale[1] * 2)), + offset=padw, + direction='horizontal') + gt_masks_i = gt_masks_i.translate( + out_shape=(int(self.img_scale[0] * 2), + int(self.img_scale[1] * 2)), + offset=padh, + direction='vertical') + mosaic_masks.append(gt_masks_i) + + mosaic_bboxes = mosaic_bboxes[0].cat(mosaic_bboxes, 0) + mosaic_bboxes_labels = np.concatenate(mosaic_bboxes_labels, 0) + mosaic_ignore_flags = np.concatenate(mosaic_ignore_flags, 0) + + if self.bbox_clip_border: + mosaic_bboxes.clip_([2 * img_scale_h, 2 * img_scale_w]) + if with_mask: + mosaic_masks = mosaic_masks[0].cat(mosaic_masks) + results['gt_masks'] = mosaic_masks + else: + # remove outside bboxes + inside_inds = mosaic_bboxes.is_inside( + [2 * img_scale_h, 2 * img_scale_w]).numpy() + mosaic_bboxes = mosaic_bboxes[inside_inds] + mosaic_bboxes_labels = mosaic_bboxes_labels[inside_inds] + mosaic_ignore_flags = mosaic_ignore_flags[inside_inds] + if with_mask: + mosaic_masks = mosaic_masks[0].cat(mosaic_masks)[inside_inds] + results['gt_masks'] = mosaic_masks + + results['img'] = mosaic_img + results['img_shape'] = mosaic_img.shape + results['gt_bboxes'] = mosaic_bboxes + results['gt_bboxes_labels'] = mosaic_bboxes_labels + results['gt_ignore_flags'] = mosaic_ignore_flags + + return results + + def _mosaic_combine( + self, loc: str, center_position_xy: Sequence[float], 
+ img_shape_wh: Sequence[int]) -> Tuple[Tuple[int], Tuple[int]]: + """Calculate global coordinate of mosaic image and local coordinate of + cropped sub-image. + + Args: + loc (str): Index for the sub-image, loc in ('top_left', + 'top_right', 'bottom_left', 'bottom_right'). + center_position_xy (Sequence[float]): Mixing center for 4 images, + (x, y). + img_shape_wh (Sequence[int]): Width and height of sub-image + + Returns: + tuple[tuple[float]]: Corresponding coordinate of pasting and + cropping + - paste_coord (tuple): paste corner coordinate in mosaic image. + - crop_coord (tuple): crop corner coordinate in mosaic image. + """ + assert loc in ('top_left', 'top_right', 'bottom_left', 'bottom_right') + if loc == 'top_left': + # index0 to top left part of image + x1, y1, x2, y2 = max(center_position_xy[0] - img_shape_wh[0], 0), \ + max(center_position_xy[1] - img_shape_wh[1], 0), \ + center_position_xy[0], \ + center_position_xy[1] + crop_coord = img_shape_wh[0] - (x2 - x1), img_shape_wh[1] - ( + y2 - y1), img_shape_wh[0], img_shape_wh[1] + + elif loc == 'top_right': + # index1 to top right part of image + x1, y1, x2, y2 = center_position_xy[0], \ + max(center_position_xy[1] - img_shape_wh[1], 0), \ + min(center_position_xy[0] + img_shape_wh[0], + self.img_scale[0] * 2), \ + center_position_xy[1] + crop_coord = 0, img_shape_wh[1] - (y2 - y1), min( + img_shape_wh[0], x2 - x1), img_shape_wh[1] + + elif loc == 'bottom_left': + # index2 to bottom left part of image + x1, y1, x2, y2 = max(center_position_xy[0] - img_shape_wh[0], 0), \ + center_position_xy[1], \ + center_position_xy[0], \ + min(self.img_scale[1] * 2, center_position_xy[1] + + img_shape_wh[1]) + crop_coord = img_shape_wh[0] - (x2 - x1), 0, img_shape_wh[0], min( + y2 - y1, img_shape_wh[1]) + + else: + # index3 to bottom right part of image + x1, y1, x2, y2 = center_position_xy[0], \ + center_position_xy[1], \ + min(center_position_xy[0] + img_shape_wh[0], + self.img_scale[0] * 2), \ + min(self.img_scale[1] * 2, center_position_xy[1] + + img_shape_wh[1]) + crop_coord = 0, 0, min(img_shape_wh[0], + x2 - x1), min(y2 - y1, img_shape_wh[1]) + + paste_coord = x1, y1, x2, y2 + return paste_coord, crop_coord + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(img_scale={self.img_scale}, ' + repr_str += f'center_ratio_range={self.center_ratio_range}, ' + repr_str += f'pad_val={self.pad_val}, ' + repr_str += f'prob={self.prob})' + return repr_str + + +@TRANSFORMS.register_module() +class Mosaic9(BaseMixImageTransform): + """Mosaic9 augmentation. + + Given 9 images, mosaic transform combines them into + one output image. The output image is composed of the parts from each sub- + image. + + .. code:: text + + +-------------------------------+------------+ + | pad | pad | | + | +----------+ | | + | | +---------------+ top_right | + | | | top | image2 | + | | top_left | image1 | | + | | image8 o--------+------+--------+---+ + | | | | | | + +----+----------+ | right |pad| + | | center | image3 | | + | left | image0 +---------------+---| + | image7 | | | | + +---+-----------+---+--------+ | | + | | cropped | | bottom_right |pad| + | |bottom_left| | image4 | | + | | image6 | bottom | | | + +---|-----------+ image5 +---------------+---| + | pad | | pad | + +-----------+------------+-------------------+ + + The mosaic transform steps are as follows: + + 1. Get the center image according to the index, and randomly + sample another 8 images from the custom dataset. + 2. 
Randomly offset the image after Mosaic + + Required Keys: + + - img + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_bboxes_labels (np.int64) (optional) + - gt_ignore_flags (bool) (optional) + - mix_results (List[dict]) + + Modified Keys: + + - img + - img_shape + - gt_bboxes (optional) + - gt_bboxes_labels (optional) + - gt_ignore_flags (optional) + + Args: + img_scale (Sequence[int]): Image size after mosaic pipeline of single + image. The shape order should be (width, height). + Defaults to (640, 640). + bbox_clip_border (bool, optional): Whether to clip the objects outside + the border of the image. In some dataset like MOT17, the gt bboxes + are allowed to cross the border of images. Therefore, we don't + need to clip the gt bboxes in these cases. Defaults to True. + pad_val (int): Pad value. Defaults to 114. + pre_transform(Sequence[dict]): Sequence of transform object or + config dict to be composed. + prob (float): Probability of applying this transformation. + Defaults to 1.0. + use_cached (bool): Whether to use cache. Defaults to False. + max_cached_images (int): The maximum length of the cache. The larger + the cache, the stronger the randomness of this transform. As a + rule of thumb, providing 5 caches for each image suffices for + randomness. Defaults to 50. + random_pop (bool): Whether to randomly pop a result from the cache + when the cache is full. If set to False, use FIFO popping method. + Defaults to True. + max_refetch (int): The maximum number of retry iterations for getting + valid results from the pipeline. If the number of iterations is + greater than `max_refetch`, but results is still None, then the + iteration is terminated and raise the error. Defaults to 15. + """ + + def __init__(self, + img_scale: Tuple[int, int] = (640, 640), + bbox_clip_border: bool = True, + pad_val: Union[float, int] = 114.0, + pre_transform: Sequence[dict] = None, + prob: float = 1.0, + use_cached: bool = False, + max_cached_images: int = 50, + random_pop: bool = True, + max_refetch: int = 15): + assert isinstance(img_scale, tuple) + assert 0 <= prob <= 1.0, 'The probability should be in range [0,1]. ' \ + f'got {prob}.' + if use_cached: + assert max_cached_images >= 9, 'The length of cache must >= 9, ' \ + f'but got {max_cached_images}.' + + super().__init__( + pre_transform=pre_transform, + prob=prob, + use_cached=use_cached, + max_cached_images=max_cached_images, + random_pop=random_pop, + max_refetch=max_refetch) + + self.img_scale = img_scale + self.bbox_clip_border = bbox_clip_border + self.pad_val = pad_val + + # intermediate variables + self._current_img_shape = [0, 0] + self._center_img_shape = [0, 0] + self._previous_img_shape = [0, 0] + + def get_indexes(self, dataset: Union[BaseDataset, list]) -> list: + """Call function to collect indexes. + + Args: + dataset (:obj:`Dataset` or list): The dataset or cached list. + + Returns: + list: indexes. + """ + indexes = [random.randint(0, len(dataset)) for _ in range(8)] + return indexes + + def mix_img_transform(self, results: dict) -> dict: + """Mixed image data transformation. + + Args: + results (dict): Result dict. + + Returns: + results (dict): Updated result dict. 
+ """ + assert 'mix_results' in results + + mosaic_bboxes = [] + mosaic_bboxes_labels = [] + mosaic_ignore_flags = [] + + img_scale_w, img_scale_h = self.img_scale + + if len(results['img'].shape) == 3: + mosaic_img = np.full( + (int(img_scale_h * 3), int(img_scale_w * 3), 3), + self.pad_val, + dtype=results['img'].dtype) + else: + mosaic_img = np.full((int(img_scale_h * 3), int(img_scale_w * 3)), + self.pad_val, + dtype=results['img'].dtype) + + # index = 0 is mean original image + # len(results['mix_results']) = 8 + loc_strs = ('center', 'top', 'top_right', 'right', 'bottom_right', + 'bottom', 'bottom_left', 'left', 'top_left') + + results_all = [results, *results['mix_results']] + for index, results_patch in enumerate(results_all): + img_i = results_patch['img'] + # keep_ratio resize + img_i_h, img_i_w = img_i.shape[:2] + scale_ratio_i = min(img_scale_h / img_i_h, img_scale_w / img_i_w) + img_i = mmcv.imresize( + img_i, + (int(img_i_w * scale_ratio_i), int(img_i_h * scale_ratio_i))) + + paste_coord = self._mosaic_combine(loc_strs[index], + img_i.shape[:2]) + + padw, padh = paste_coord[:2] + x1, y1, x2, y2 = (max(x, 0) for x in paste_coord) + mosaic_img[y1:y2, x1:x2] = img_i[y1 - padh:, x1 - padw:] + + gt_bboxes_i = results_patch['gt_bboxes'] + gt_bboxes_labels_i = results_patch['gt_bboxes_labels'] + gt_ignore_flags_i = results_patch['gt_ignore_flags'] + gt_bboxes_i.rescale_([scale_ratio_i, scale_ratio_i]) + gt_bboxes_i.translate_([padw, padh]) + + mosaic_bboxes.append(gt_bboxes_i) + mosaic_bboxes_labels.append(gt_bboxes_labels_i) + mosaic_ignore_flags.append(gt_ignore_flags_i) + + # Offset + offset_x = int(random.uniform(0, img_scale_w)) + offset_y = int(random.uniform(0, img_scale_h)) + mosaic_img = mosaic_img[offset_y:offset_y + 2 * img_scale_h, + offset_x:offset_x + 2 * img_scale_w] + + mosaic_bboxes = mosaic_bboxes[0].cat(mosaic_bboxes, 0) + mosaic_bboxes.translate_([-offset_x, -offset_y]) + mosaic_bboxes_labels = np.concatenate(mosaic_bboxes_labels, 0) + mosaic_ignore_flags = np.concatenate(mosaic_ignore_flags, 0) + + if self.bbox_clip_border: + mosaic_bboxes.clip_([2 * img_scale_h, 2 * img_scale_w]) + else: + # remove outside bboxes + inside_inds = mosaic_bboxes.is_inside( + [2 * img_scale_h, 2 * img_scale_w]).numpy() + mosaic_bboxes = mosaic_bboxes[inside_inds] + mosaic_bboxes_labels = mosaic_bboxes_labels[inside_inds] + mosaic_ignore_flags = mosaic_ignore_flags[inside_inds] + + results['img'] = mosaic_img + results['img_shape'] = mosaic_img.shape + results['gt_bboxes'] = mosaic_bboxes + results['gt_bboxes_labels'] = mosaic_bboxes_labels + results['gt_ignore_flags'] = mosaic_ignore_flags + return results + + def _mosaic_combine(self, loc: str, + img_shape_hw: Tuple[int, int]) -> Tuple[int, ...]: + """Calculate global coordinate of mosaic image. + + Args: + loc (str): Index for the sub-image. + img_shape_hw (Sequence[int]): Height and width of sub-image + + Returns: + paste_coord (tuple): paste corner coordinate in mosaic image. 
+ """ + assert loc in ('center', 'top', 'top_right', 'right', 'bottom_right', + 'bottom', 'bottom_left', 'left', 'top_left') + + img_scale_w, img_scale_h = self.img_scale + + self._current_img_shape = img_shape_hw + current_img_h, current_img_w = self._current_img_shape + previous_img_h, previous_img_w = self._previous_img_shape + center_img_h, center_img_w = self._center_img_shape + + if loc == 'center': + self._center_img_shape = self._current_img_shape + # xmin, ymin, xmax, ymax + paste_coord = img_scale_w, \ + img_scale_h, \ + img_scale_w + current_img_w, \ + img_scale_h + current_img_h + elif loc == 'top': + paste_coord = img_scale_w, \ + img_scale_h - current_img_h, \ + img_scale_w + current_img_w, \ + img_scale_h + elif loc == 'top_right': + paste_coord = img_scale_w + previous_img_w, \ + img_scale_h - current_img_h, \ + img_scale_w + previous_img_w + current_img_w, \ + img_scale_h + elif loc == 'right': + paste_coord = img_scale_w + center_img_w, \ + img_scale_h, \ + img_scale_w + center_img_w + current_img_w, \ + img_scale_h + current_img_h + elif loc == 'bottom_right': + paste_coord = img_scale_w + center_img_w, \ + img_scale_h + previous_img_h, \ + img_scale_w + center_img_w + current_img_w, \ + img_scale_h + previous_img_h + current_img_h + elif loc == 'bottom': + paste_coord = img_scale_w + center_img_w - current_img_w, \ + img_scale_h + center_img_h, \ + img_scale_w + center_img_w, \ + img_scale_h + center_img_h + current_img_h + elif loc == 'bottom_left': + paste_coord = img_scale_w + center_img_w - \ + previous_img_w - current_img_w, \ + img_scale_h + center_img_h, \ + img_scale_w + center_img_w - previous_img_w, \ + img_scale_h + center_img_h + current_img_h + elif loc == 'left': + paste_coord = img_scale_w - current_img_w, \ + img_scale_h + center_img_h - current_img_h, \ + img_scale_w, \ + img_scale_h + center_img_h + elif loc == 'top_left': + paste_coord = img_scale_w - current_img_w, \ + img_scale_h + center_img_h - \ + previous_img_h - current_img_h, \ + img_scale_w, \ + img_scale_h + center_img_h - previous_img_h + + self._previous_img_shape = self._current_img_shape + # xmin, ymin, xmax, ymax + return paste_coord + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(img_scale={self.img_scale}, ' + repr_str += f'pad_val={self.pad_val}, ' + repr_str += f'prob={self.prob})' + return repr_str + + +@TRANSFORMS.register_module() +class YOLOv5MixUp(BaseMixImageTransform): + """MixUp data augmentation for YOLOv5. + + .. code:: text + + The mixup transform steps are as follows: + + 1. Another random image is picked by dataset. + 2. Randomly obtain the fusion ratio from the beta distribution, + then fuse the target + of the original image and mixup image through this ratio. + + Required Keys: + + - img + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_bboxes_labels (np.int64) (optional) + - gt_ignore_flags (bool) (optional) + - mix_results (List[dict]) + + + Modified Keys: + + - img + - img_shape + - gt_bboxes (optional) + - gt_bboxes_labels (optional) + - gt_ignore_flags (optional) + + + Args: + alpha (float): parameter of beta distribution to get mixup ratio. + Defaults to 32. + beta (float): parameter of beta distribution to get mixup ratio. + Defaults to 32. + pre_transform (Sequence[dict]): Sequence of transform object or + config dict to be composed. + prob (float): Probability of applying this transformation. + Defaults to 1.0. + use_cached (bool): Whether to use cache. Defaults to False. 
+ max_cached_images (int): The maximum length of the cache. The larger + the cache, the stronger the randomness of this transform. As a + rule of thumb, providing 10 caches for each image suffices for + randomness. Defaults to 20. + random_pop (bool): Whether to randomly pop a result from the cache + when the cache is full. If set to False, use FIFO popping method. + Defaults to True. + max_refetch (int): The maximum number of iterations. If the number of + iterations is greater than `max_refetch`, but gt_bbox is still + empty, then the iteration is terminated. Defaults to 15. + """ + + def __init__(self, + alpha: float = 32.0, + beta: float = 32.0, + pre_transform: Sequence[dict] = None, + prob: float = 1.0, + use_cached: bool = False, + max_cached_images: int = 20, + random_pop: bool = True, + max_refetch: int = 15): + if use_cached: + assert max_cached_images >= 2, 'The length of cache must >= 2, ' \ + f'but got {max_cached_images}.' + super().__init__( + pre_transform=pre_transform, + prob=prob, + use_cached=use_cached, + max_cached_images=max_cached_images, + random_pop=random_pop, + max_refetch=max_refetch) + self.alpha = alpha + self.beta = beta + + def get_indexes(self, dataset: Union[BaseDataset, list]) -> int: + """Call function to collect indexes. + + Args: + dataset (:obj:`Dataset` or list): The dataset or cached list. + + Returns: + int: indexes. + """ + return random.randint(0, len(dataset)) + + def mix_img_transform(self, results: dict) -> dict: + """YOLOv5 MixUp transform function. + + Args: + results (dict): Result dict + + Returns: + results (dict): Updated result dict. + """ + assert 'mix_results' in results + + retrieve_results = results['mix_results'][0] + retrieve_img = retrieve_results['img'] + ori_img = results['img'] + assert ori_img.shape == retrieve_img.shape + + # Randomly obtain the fusion ratio from the beta distribution, + # which is around 0.5 + ratio = np.random.beta(self.alpha, self.beta) + mixup_img = (ori_img * ratio + retrieve_img * (1 - ratio)) + + retrieve_gt_bboxes = retrieve_results['gt_bboxes'] + retrieve_gt_bboxes_labels = retrieve_results['gt_bboxes_labels'] + retrieve_gt_ignore_flags = retrieve_results['gt_ignore_flags'] + + mixup_gt_bboxes = retrieve_gt_bboxes.cat( + (results['gt_bboxes'], retrieve_gt_bboxes), dim=0) + mixup_gt_bboxes_labels = np.concatenate( + (results['gt_bboxes_labels'], retrieve_gt_bboxes_labels), axis=0) + mixup_gt_ignore_flags = np.concatenate( + (results['gt_ignore_flags'], retrieve_gt_ignore_flags), axis=0) + if 'gt_masks' in results: + assert 'gt_masks' in retrieve_results + mixup_gt_masks = results['gt_masks'].cat( + [results['gt_masks'], retrieve_results['gt_masks']]) + results['gt_masks'] = mixup_gt_masks + + results['img'] = mixup_img.astype(np.uint8) + results['img_shape'] = mixup_img.shape + results['gt_bboxes'] = mixup_gt_bboxes + results['gt_bboxes_labels'] = mixup_gt_bboxes_labels + results['gt_ignore_flags'] = mixup_gt_ignore_flags + + return results + + +@TRANSFORMS.register_module() +class YOLOXMixUp(BaseMixImageTransform): + """MixUp data augmentation for YOLOX. + + .. code:: text + + mixup transform + +---------------+--------------+ + | mixup image | | + | +--------|--------+ | + | | | | | + +---------------+ | | + | | | | + | | image | | + | | | | + | | | | + | +-----------------+ | + | pad | + +------------------------------+ + + The mixup transform steps are as follows: + + 1. Another random image is picked by dataset and embedded in + the top left patch(after padding and resizing) + 2. 
The target of mixup transform is the weighted average of mixup + image and origin image. + + Required Keys: + + - img + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_bboxes_labels (np.int64) (optional) + - gt_ignore_flags (bool) (optional) + - mix_results (List[dict]) + + + Modified Keys: + + - img + - img_shape + - gt_bboxes (optional) + - gt_bboxes_labels (optional) + - gt_ignore_flags (optional) + + + Args: + img_scale (Sequence[int]): Image output size after mixup pipeline. + The shape order should be (width, height). Defaults to (640, 640). + ratio_range (Sequence[float]): Scale ratio of mixup image. + Defaults to (0.5, 1.5). + flip_ratio (float): Horizontal flip ratio of mixup image. + Defaults to 0.5. + pad_val (int): Pad value. Defaults to 114. + bbox_clip_border (bool, optional): Whether to clip the objects outside + the border of the image. In some dataset like MOT17, the gt bboxes + are allowed to cross the border of images. Therefore, we don't + need to clip the gt bboxes in these cases. Defaults to True. + pre_transform(Sequence[dict]): Sequence of transform object or + config dict to be composed. + prob (float): Probability of applying this transformation. + Defaults to 1.0. + use_cached (bool): Whether to use cache. Defaults to False. + max_cached_images (int): The maximum length of the cache. The larger + the cache, the stronger the randomness of this transform. As a + rule of thumb, providing 10 caches for each image suffices for + randomness. Defaults to 20. + random_pop (bool): Whether to randomly pop a result from the cache + when the cache is full. If set to False, use FIFO popping method. + Defaults to True. + max_refetch (int): The maximum number of iterations. If the number of + iterations is greater than `max_refetch`, but gt_bbox is still + empty, then the iteration is terminated. Defaults to 15. + """ + + def __init__(self, + img_scale: Tuple[int, int] = (640, 640), + ratio_range: Tuple[float, float] = (0.5, 1.5), + flip_ratio: float = 0.5, + pad_val: float = 114.0, + bbox_clip_border: bool = True, + pre_transform: Sequence[dict] = None, + prob: float = 1.0, + use_cached: bool = False, + max_cached_images: int = 20, + random_pop: bool = True, + max_refetch: int = 15): + assert isinstance(img_scale, tuple) + if use_cached: + assert max_cached_images >= 2, 'The length of cache must >= 2, ' \ + f'but got {max_cached_images}.' + super().__init__( + pre_transform=pre_transform, + prob=prob, + use_cached=use_cached, + max_cached_images=max_cached_images, + random_pop=random_pop, + max_refetch=max_refetch) + self.img_scale = img_scale + self.ratio_range = ratio_range + self.flip_ratio = flip_ratio + self.pad_val = pad_val + self.bbox_clip_border = bbox_clip_border + + def get_indexes(self, dataset: Union[BaseDataset, list]) -> int: + """Call function to collect indexes. + + Args: + dataset (:obj:`Dataset` or list): The dataset or cached list. + + Returns: + int: indexes. + """ + return random.randint(0, len(dataset)) + + def mix_img_transform(self, results: dict) -> dict: + """YOLOX MixUp transform function. + + Args: + results (dict): Result dict. + + Returns: + results (dict): Updated result dict. + """ + assert 'mix_results' in results + assert len( + results['mix_results']) == 1, 'MixUp only support 2 images now !' 
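+        # A YOLOX-style mixup always blends exactly two images (the 1:1
+        # weighting is applied in step 8 below); if the retrieved image has
+        # no GT boxes there is nothing to mix, so the original results are
+        # returned unchanged.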
+ + if results['mix_results'][0]['gt_bboxes'].shape[0] == 0: + # empty bbox + return results + + retrieve_results = results['mix_results'][0] + retrieve_img = retrieve_results['img'] + + jit_factor = random.uniform(*self.ratio_range) + is_filp = random.uniform(0, 1) > self.flip_ratio + + if len(retrieve_img.shape) == 3: + out_img = np.ones((self.img_scale[1], self.img_scale[0], 3), + dtype=retrieve_img.dtype) * self.pad_val + else: + out_img = np.ones( + self.img_scale[::-1], dtype=retrieve_img.dtype) * self.pad_val + + # 1. keep_ratio resize + scale_ratio = min(self.img_scale[1] / retrieve_img.shape[0], + self.img_scale[0] / retrieve_img.shape[1]) + retrieve_img = mmcv.imresize( + retrieve_img, (int(retrieve_img.shape[1] * scale_ratio), + int(retrieve_img.shape[0] * scale_ratio))) + + # 2. paste + out_img[:retrieve_img.shape[0], :retrieve_img.shape[1]] = retrieve_img + + # 3. scale jit + scale_ratio *= jit_factor + out_img = mmcv.imresize(out_img, (int(out_img.shape[1] * jit_factor), + int(out_img.shape[0] * jit_factor))) + + # 4. flip + if is_filp: + out_img = out_img[:, ::-1, :] + + # 5. random crop + ori_img = results['img'] + origin_h, origin_w = out_img.shape[:2] + target_h, target_w = ori_img.shape[:2] + padded_img = np.ones((max(origin_h, target_h), max( + origin_w, target_w), 3)) * self.pad_val + padded_img = padded_img.astype(np.uint8) + padded_img[:origin_h, :origin_w] = out_img + + x_offset, y_offset = 0, 0 + if padded_img.shape[0] > target_h: + y_offset = random.randint(0, padded_img.shape[0] - target_h) + if padded_img.shape[1] > target_w: + x_offset = random.randint(0, padded_img.shape[1] - target_w) + padded_cropped_img = padded_img[y_offset:y_offset + target_h, + x_offset:x_offset + target_w] + + # 6. adjust bbox + retrieve_gt_bboxes = retrieve_results['gt_bboxes'] + retrieve_gt_bboxes.rescale_([scale_ratio, scale_ratio]) + if self.bbox_clip_border: + retrieve_gt_bboxes.clip_([origin_h, origin_w]) + + if is_filp: + retrieve_gt_bboxes.flip_([origin_h, origin_w], + direction='horizontal') + + # 7. filter + cp_retrieve_gt_bboxes = retrieve_gt_bboxes.clone() + cp_retrieve_gt_bboxes.translate_([-x_offset, -y_offset]) + if self.bbox_clip_border: + cp_retrieve_gt_bboxes.clip_([target_h, target_w]) + + # 8. 
mix up + mixup_img = 0.5 * ori_img + 0.5 * padded_cropped_img + + retrieve_gt_bboxes_labels = retrieve_results['gt_bboxes_labels'] + retrieve_gt_ignore_flags = retrieve_results['gt_ignore_flags'] + + mixup_gt_bboxes = cp_retrieve_gt_bboxes.cat( + (results['gt_bboxes'], cp_retrieve_gt_bboxes), dim=0) + mixup_gt_bboxes_labels = np.concatenate( + (results['gt_bboxes_labels'], retrieve_gt_bboxes_labels), axis=0) + mixup_gt_ignore_flags = np.concatenate( + (results['gt_ignore_flags'], retrieve_gt_ignore_flags), axis=0) + + if not self.bbox_clip_border: + # remove outside bbox + inside_inds = mixup_gt_bboxes.is_inside([target_h, + target_w]).numpy() + mixup_gt_bboxes = mixup_gt_bboxes[inside_inds] + mixup_gt_bboxes_labels = mixup_gt_bboxes_labels[inside_inds] + mixup_gt_ignore_flags = mixup_gt_ignore_flags[inside_inds] + + results['img'] = mixup_img.astype(np.uint8) + results['img_shape'] = mixup_img.shape + results['gt_bboxes'] = mixup_gt_bboxes + results['gt_bboxes_labels'] = mixup_gt_bboxes_labels + results['gt_ignore_flags'] = mixup_gt_ignore_flags + + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(img_scale={self.img_scale}, ' + repr_str += f'ratio_range={self.ratio_range}, ' + repr_str += f'flip_ratio={self.flip_ratio}, ' + repr_str += f'pad_val={self.pad_val}, ' + repr_str += f'max_refetch={self.max_refetch}, ' + repr_str += f'bbox_clip_border={self.bbox_clip_border})' + return repr_str diff --git a/mmyolo/datasets/transforms/transforms.py b/mmyolo/datasets/transforms/transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..d5179fba3372c53573716afbe8daf3efa674d587 --- /dev/null +++ b/mmyolo/datasets/transforms/transforms.py @@ -0,0 +1,1557 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from copy import deepcopy +from typing import List, Sequence, Tuple, Union + +import cv2 +import mmcv +import numpy as np +import torch +from mmcv.transforms import BaseTransform, Compose +from mmcv.transforms.utils import cache_randomness +from mmdet.datasets.transforms import LoadAnnotations as MMDET_LoadAnnotations +from mmdet.datasets.transforms import Resize as MMDET_Resize +from mmdet.structures.bbox import (HorizontalBoxes, autocast_box_type, + get_box_type) +from mmdet.structures.mask import PolygonMasks +from numpy import random + +from mmyolo.registry import TRANSFORMS + +# TODO: Waiting for MMCV support +TRANSFORMS.register_module(module=Compose, force=True) + + +@TRANSFORMS.register_module() +class YOLOv5KeepRatioResize(MMDET_Resize): + """Resize images & bbox(if existed). + + This transform resizes the input image according to ``scale``. + Bboxes (if existed) are then resized with the same scale factor. + + Required Keys: + + - img (np.uint8) + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + + Modified Keys: + + - img (np.uint8) + - img_shape (tuple) + - gt_bboxes (optional) + - scale (float) + + Added Keys: + + - scale_factor (np.float32) + + Args: + scale (Union[int, Tuple[int, int]]): Images scales for resizing. + """ + + def __init__(self, + scale: Union[int, Tuple[int, int]], + keep_ratio: bool = True, + **kwargs): + assert keep_ratio is True + super().__init__(scale=scale, keep_ratio=True, **kwargs) + + @staticmethod + def _get_rescale_ratio(old_size: Tuple[int, int], + scale: Union[float, Tuple[int]]) -> float: + """Calculate the ratio for rescaling. + + Args: + old_size (tuple[int]): The old size (w, h) of image. + scale (float | tuple[int]): The scaling factor or maximum size. 
+ If it is a float number, then the image will be rescaled by + this factor, else if it is a tuple of 2 integers, then + the image will be rescaled as large as possible within + the scale. + + Returns: + float: The resize ratio. + """ + w, h = old_size + if isinstance(scale, (float, int)): + if scale <= 0: + raise ValueError(f'Invalid scale {scale}, must be positive.') + scale_factor = scale + elif isinstance(scale, tuple): + max_long_edge = max(scale) + max_short_edge = min(scale) + scale_factor = min(max_long_edge / max(h, w), + max_short_edge / min(h, w)) + else: + raise TypeError('Scale must be a number or tuple of int, ' + f'but got {type(scale)}') + + return scale_factor + + def _resize_img(self, results: dict): + """Resize images with ``results['scale']``.""" + assert self.keep_ratio is True + + if results.get('img', None) is not None: + image = results['img'] + original_h, original_w = image.shape[:2] + ratio = self._get_rescale_ratio((original_h, original_w), + self.scale) + + if ratio != 1: + # resize image according to the ratio + image = mmcv.imrescale( + img=image, + scale=ratio, + interpolation='area' if ratio < 1 else 'bilinear', + backend=self.backend) + + resized_h, resized_w = image.shape[:2] + scale_ratio = resized_h / original_h + + scale_factor = (scale_ratio, scale_ratio) + + results['img'] = image + results['img_shape'] = image.shape[:2] + results['scale_factor'] = scale_factor + + +@TRANSFORMS.register_module() +class LetterResize(MMDET_Resize): + """Resize and pad image while meeting stride-multiple constraints. + + Required Keys: + + - img (np.uint8) + - batch_shape (np.int64) (optional) + + Modified Keys: + + - img (np.uint8) + - img_shape (tuple) + - gt_bboxes (optional) + + Added Keys: + - pad_param (np.float32) + + Args: + scale (Union[int, Tuple[int, int]]): Images scales for resizing. + pad_val (dict): Padding value. Defaults to dict(img=0, seg=255). + use_mini_pad (bool): Whether using minimum rectangle padding. + Defaults to True + stretch_only (bool): Whether stretch to the specified size directly. + Defaults to False + allow_scale_up (bool): Allow scale up when ratio > 1. 
Defaults to True + """ + + def __init__(self, + scale: Union[int, Tuple[int, int]], + pad_val: dict = dict(img=0, mask=0, seg=255), + use_mini_pad: bool = False, + stretch_only: bool = False, + allow_scale_up: bool = True, + **kwargs): + super().__init__(scale=scale, keep_ratio=True, **kwargs) + + self.pad_val = pad_val + if isinstance(pad_val, (int, float)): + pad_val = dict(img=pad_val, seg=255) + assert isinstance( + pad_val, dict), f'pad_val must be dict, but got {type(pad_val)}' + + self.use_mini_pad = use_mini_pad + self.stretch_only = stretch_only + self.allow_scale_up = allow_scale_up + + def _resize_img(self, results: dict): + """Resize images with ``results['scale']``.""" + image = results.get('img', None) + if image is None: + return + + # Use batch_shape if a batch_shape policy is configured + if 'batch_shape' in results: + scale = tuple(results['batch_shape']) # hw + else: + scale = self.scale[::-1] # wh -> hw + + image_shape = image.shape[:2] # height, width + + # Scale ratio (new / old) + ratio = min(scale[0] / image_shape[0], scale[1] / image_shape[1]) + + # only scale down, do not scale up (for better test mAP) + if not self.allow_scale_up: + ratio = min(ratio, 1.0) + + ratio = [ratio, ratio] # float -> (float, float) for (height, width) + + # compute the best size of the image + no_pad_shape = (int(round(image_shape[0] * ratio[0])), + int(round(image_shape[1] * ratio[1]))) + + # padding height & width + padding_h, padding_w = [ + scale[0] - no_pad_shape[0], scale[1] - no_pad_shape[1] + ] + if self.use_mini_pad: + # minimum rectangle padding + padding_w, padding_h = np.mod(padding_w, 32), np.mod(padding_h, 32) + + elif self.stretch_only: + # stretch to the specified size directly + padding_h, padding_w = 0.0, 0.0 + no_pad_shape = (scale[0], scale[1]) + ratio = [scale[0] / image_shape[0], + scale[1] / image_shape[1]] # height, width ratios + + if image_shape != no_pad_shape: + # compare with no resize and padding size + image = mmcv.imresize( + image, (no_pad_shape[1], no_pad_shape[0]), + interpolation=self.interpolation, + backend=self.backend) + + scale_factor = (ratio[1], ratio[0]) # mmcv scale factor is (w, h) + + if 'scale_factor' in results: + results['scale_factor_origin'] = results['scale_factor'] + results['scale_factor'] = scale_factor + + # padding + top_padding, left_padding = int(round(padding_h // 2 - 0.1)), int( + round(padding_w // 2 - 0.1)) + bottom_padding = padding_h - top_padding + right_padding = padding_w - left_padding + + padding_list = [ + top_padding, bottom_padding, left_padding, right_padding + ] + if top_padding != 0 or bottom_padding != 0 or \ + left_padding != 0 or right_padding != 0: + + pad_val = self.pad_val.get('img', 0) + if isinstance(pad_val, int) and image.ndim == 3: + pad_val = tuple(pad_val for _ in range(image.shape[2])) + + image = mmcv.impad( + img=image, + padding=(padding_list[2], padding_list[0], padding_list[3], + padding_list[1]), + pad_val=pad_val, + padding_mode='constant') + + results['img'] = image + results['img_shape'] = image.shape + if 'pad_param' in results: + results['pad_param_origin'] = results['pad_param'] * \ + np.repeat(ratio, 2) + results['pad_param'] = np.array(padding_list, dtype=np.float32) + + def _resize_masks(self, results: dict): + """Resize masks with ``results['scale']``""" + if results.get('gt_masks', None) is None: + return + + gt_masks = results['gt_masks'] + assert isinstance( + gt_masks, PolygonMasks + ), f'Only supports PolygonMasks, but got {type(gt_masks)}' + + # resize the gt_masks + 
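+        # ``scale_factor`` is stored as (w_ratio, h_ratio), so the height uses
+        # index 1 and the width uses index 0; after resizing, the polygons are
+        # shifted by the same left/top padding that was applied to the image
+        # (``pad_param`` is [top, bottom, left, right]).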
gt_mask_h = results['gt_masks'].height * results['scale_factor'][1] + gt_mask_w = results['gt_masks'].width * results['scale_factor'][0] + gt_masks = results['gt_masks'].resize( + (int(round(gt_mask_h)), int(round(gt_mask_w)))) + + top_padding, _, left_padding, _ = results['pad_param'] + if int(left_padding) != 0: + gt_masks = gt_masks.translate( + out_shape=results['img_shape'][:2], + offset=int(left_padding), + direction='horizontal') + if int(top_padding) != 0: + gt_masks = gt_masks.translate( + out_shape=results['img_shape'][:2], + offset=int(top_padding), + direction='vertical') + results['gt_masks'] = gt_masks + + def _resize_bboxes(self, results: dict): + """Resize bounding boxes with ``results['scale_factor']``.""" + if results.get('gt_bboxes', None) is None: + return + results['gt_bboxes'].rescale_(results['scale_factor']) + + if len(results['pad_param']) != 4: + return + results['gt_bboxes'].translate_( + (results['pad_param'][2], results['pad_param'][0])) + + if self.clip_object_border: + results['gt_bboxes'].clip_(results['img_shape']) + + def transform(self, results: dict) -> dict: + results = super().transform(results) + if 'scale_factor_origin' in results: + scale_factor_origin = results.pop('scale_factor_origin') + results['scale_factor'] = (results['scale_factor'][0] * + scale_factor_origin[0], + results['scale_factor'][1] * + scale_factor_origin[1]) + if 'pad_param_origin' in results: + pad_param_origin = results.pop('pad_param_origin') + results['pad_param'] += pad_param_origin + return results + + +# TODO: Check if it can be merged with mmdet.YOLOXHSVRandomAug +@TRANSFORMS.register_module() +class YOLOv5HSVRandomAug(BaseTransform): + """Apply HSV augmentation to image sequentially. + + Required Keys: + + - img + + Modified Keys: + + - img + + Args: + hue_delta ([int, float]): delta of hue. Defaults to 0.015. + saturation_delta ([int, float]): delta of saturation. Defaults to 0.7. + value_delta ([int, float]): delta of value. Defaults to 0.4. + """ + + def __init__(self, + hue_delta: Union[int, float] = 0.015, + saturation_delta: Union[int, float] = 0.7, + value_delta: Union[int, float] = 0.4): + self.hue_delta = hue_delta + self.saturation_delta = saturation_delta + self.value_delta = value_delta + + def transform(self, results: dict) -> dict: + """The HSV augmentation transform function. + + Args: + results (dict): The result dict. + + Returns: + dict: The result dict. 
+ """ + hsv_gains = \ + random.uniform(-1, 1, 3) * \ + [self.hue_delta, self.saturation_delta, self.value_delta] + 1 + hue, sat, val = cv2.split( + cv2.cvtColor(results['img'], cv2.COLOR_BGR2HSV)) + + table_list = np.arange(0, 256, dtype=hsv_gains.dtype) + lut_hue = ((table_list * hsv_gains[0]) % 180).astype(np.uint8) + lut_sat = np.clip(table_list * hsv_gains[1], 0, 255).astype(np.uint8) + lut_val = np.clip(table_list * hsv_gains[2], 0, 255).astype(np.uint8) + + im_hsv = cv2.merge( + (cv2.LUT(hue, lut_hue), cv2.LUT(sat, + lut_sat), cv2.LUT(val, lut_val))) + results['img'] = cv2.cvtColor(im_hsv, cv2.COLOR_HSV2BGR) + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(hue_delta={self.hue_delta}, ' + repr_str += f'saturation_delta={self.saturation_delta}, ' + repr_str += f'value_delta={self.value_delta})' + return repr_str + + +@TRANSFORMS.register_module() +class LoadAnnotations(MMDET_LoadAnnotations): + """Because the yolo series does not need to consider ignore bboxes for the + time being, in order to speed up the pipeline, it can be excluded in + advance.""" + + def __init__(self, + mask2bbox: bool = False, + poly2mask: bool = False, + **kwargs) -> None: + self.mask2bbox = mask2bbox + assert not poly2mask, 'Does not support BitmapMasks considering ' \ + 'that bitmap consumes more memory.' + super().__init__(poly2mask=poly2mask, **kwargs) + if self.mask2bbox: + assert self.with_mask, 'Using mask2bbox requires ' \ + 'with_mask is True.' + self._mask_ignore_flag = None + + def transform(self, results: dict) -> dict: + """Function to load multiple types annotations. + + Args: + results (dict): Result dict from :obj:``mmengine.BaseDataset``. + + Returns: + dict: The dict contains loaded bounding box, label and + semantic segmentation. + """ + if self.mask2bbox: + self._load_masks(results) + if self.with_label: + self._load_labels(results) + self._update_mask_ignore_data(results) + gt_bboxes = results['gt_masks'].get_bboxes(dst_type='hbox') + results['gt_bboxes'] = gt_bboxes + else: + results = super().transform(results) + self._update_mask_ignore_data(results) + return results + + def _update_mask_ignore_data(self, results: dict) -> None: + if 'gt_masks' not in results: + return + + if 'gt_bboxes_labels' in results and len( + results['gt_bboxes_labels']) != len(results['gt_masks']): + assert len(results['gt_bboxes_labels']) == len( + self._mask_ignore_flag) + results['gt_bboxes_labels'] = results['gt_bboxes_labels'][ + self._mask_ignore_flag] + + if 'gt_bboxes' in results and len(results['gt_bboxes']) != len( + results['gt_masks']): + assert len(results['gt_bboxes']) == len(self._mask_ignore_flag) + results['gt_bboxes'] = results['gt_bboxes'][self._mask_ignore_flag] + + def _load_bboxes(self, results: dict): + """Private function to load bounding box annotations. + Note: BBoxes with ignore_flag of 1 is not considered. + Args: + results (dict): Result dict from :obj:``mmengine.BaseDataset``. + + Returns: + dict: The dict contains loaded bounding box annotations. 
+ """ + gt_bboxes = [] + gt_ignore_flags = [] + for instance in results.get('instances', []): + if instance['ignore_flag'] == 0: + gt_bboxes.append(instance['bbox']) + gt_ignore_flags.append(instance['ignore_flag']) + results['gt_ignore_flags'] = np.array(gt_ignore_flags, dtype=bool) + + if self.box_type is None: + results['gt_bboxes'] = np.array( + gt_bboxes, dtype=np.float32).reshape((-1, 4)) + else: + _, box_type_cls = get_box_type(self.box_type) + results['gt_bboxes'] = box_type_cls(gt_bboxes, dtype=torch.float32) + + def _load_labels(self, results: dict): + """Private function to load label annotations. + + Note: BBoxes with ignore_flag of 1 is not considered. + Args: + results (dict): Result dict from :obj:``mmengine.BaseDataset``. + Returns: + dict: The dict contains loaded label annotations. + """ + gt_bboxes_labels = [] + for instance in results.get('instances', []): + if instance['ignore_flag'] == 0: + gt_bboxes_labels.append(instance['bbox_label']) + results['gt_bboxes_labels'] = np.array( + gt_bboxes_labels, dtype=np.int64) + + def _load_masks(self, results: dict) -> None: + """Private function to load mask annotations. + + Args: + results (dict): Result dict from :obj:``mmengine.BaseDataset``. + """ + gt_masks = [] + gt_ignore_flags = [] + self._mask_ignore_flag = [] + for instance in results.get('instances', []): + if instance['ignore_flag'] == 0: + if 'mask' in instance: + gt_mask = instance['mask'] + if isinstance(gt_mask, list): + gt_mask = [ + np.array(polygon) for polygon in gt_mask + if len(polygon) % 2 == 0 and len(polygon) >= 6 + ] + if len(gt_mask) == 0: + # ignore + self._mask_ignore_flag.append(0) + else: + gt_masks.append(gt_mask) + gt_ignore_flags.append(instance['ignore_flag']) + self._mask_ignore_flag.append(1) + else: + raise NotImplementedError( + 'Only supports mask annotations in polygon ' + 'format currently') + else: + # TODO: Actually, gt with bbox and without mask needs + # to be retained + self._mask_ignore_flag.append(0) + self._mask_ignore_flag = np.array(self._mask_ignore_flag, dtype=bool) + results['gt_ignore_flags'] = np.array(gt_ignore_flags, dtype=bool) + + h, w = results['ori_shape'] + gt_masks = PolygonMasks([mask for mask in gt_masks], h, w) + results['gt_masks'] = gt_masks + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(with_bbox={self.with_bbox}, ' + repr_str += f'with_label={self.with_label}, ' + repr_str += f'with_mask={self.with_mask}, ' + repr_str += f'with_seg={self.with_seg}, ' + repr_str += f'mask2bbox={self.mask2bbox}, ' + repr_str += f'poly2mask={self.poly2mask}, ' + repr_str += f"imdecode_backend='{self.imdecode_backend}', " + repr_str += f'file_client_args={self.file_client_args})' + return repr_str + + +@TRANSFORMS.register_module() +class YOLOv5RandomAffine(BaseTransform): + """Random affine transform data augmentation in YOLOv5 and YOLOv8. It is + different from the implementation in YOLOX. + + This operation randomly generates affine transform matrix which including + rotation, translation, shear and scaling transforms. + If you set use_mask_refine == True, the code will use the masks + annotation to refine the bbox. + Our implementation is slightly different from the official. In COCO + dataset, a gt may have multiple mask tags. The official YOLOv5 + annotation file already combines the masks that an object has, + but our code takes into account the fact that an object has multiple masks. 
+ + Required Keys: + + - img + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_bboxes_labels (np.int64) (optional) + - gt_ignore_flags (bool) (optional) + - gt_masks (PolygonMasks) (optional) + + Modified Keys: + + - img + - img_shape + - gt_bboxes (optional) + - gt_bboxes_labels (optional) + - gt_ignore_flags (optional) + - gt_masks (PolygonMasks) (optional) + + Args: + max_rotate_degree (float): Maximum degrees of rotation transform. + Defaults to 10. + max_translate_ratio (float): Maximum ratio of translation. + Defaults to 0.1. + scaling_ratio_range (tuple[float]): Min and max ratio of + scaling transform. Defaults to (0.5, 1.5). + max_shear_degree (float): Maximum degrees of shear + transform. Defaults to 2. + border (tuple[int]): Distance from width and height sides of input + image to adjust output shape. Only used in mosaic dataset. + Defaults to (0, 0). + border_val (tuple[int]): Border padding values of 3 channels. + Defaults to (114, 114, 114). + bbox_clip_border (bool, optional): Whether to clip the objects outside + the border of the image. In some dataset like MOT17, the gt bboxes + are allowed to cross the border of images. Therefore, we don't + need to clip the gt bboxes in these cases. Defaults to True. + min_bbox_size (float): Width and height threshold to filter bboxes. + If the height or width of a box is smaller than this value, it + will be removed. Defaults to 2. + min_area_ratio (float): Threshold of area ratio between + original bboxes and wrapped bboxes. If smaller than this value, + the box will be removed. Defaults to 0.1. + use_mask_refine (bool): Whether to refine bbox by mask. + max_aspect_ratio (float): Aspect ratio of width and height + threshold to filter bboxes. If max(h/w, w/h) larger than this + value, the box will be removed. Defaults to 20. + resample_num (int): Number of poly to resample to. + """ + + def __init__(self, + max_rotate_degree: float = 10.0, + max_translate_ratio: float = 0.1, + scaling_ratio_range: Tuple[float, float] = (0.5, 1.5), + max_shear_degree: float = 2.0, + border: Tuple[int, int] = (0, 0), + border_val: Tuple[int, int, int] = (114, 114, 114), + bbox_clip_border: bool = True, + min_bbox_size: int = 2, + min_area_ratio: float = 0.1, + use_mask_refine: bool = False, + max_aspect_ratio: float = 20., + resample_num: int = 1000): + assert 0 <= max_translate_ratio <= 1 + assert scaling_ratio_range[0] <= scaling_ratio_range[1] + assert scaling_ratio_range[0] > 0 + self.max_rotate_degree = max_rotate_degree + self.max_translate_ratio = max_translate_ratio + self.scaling_ratio_range = scaling_ratio_range + self.max_shear_degree = max_shear_degree + self.border = border + self.border_val = border_val + self.bbox_clip_border = bbox_clip_border + self.min_bbox_size = min_bbox_size + self.min_area_ratio = min_area_ratio + self.use_mask_refine = use_mask_refine + self.max_aspect_ratio = max_aspect_ratio + self.resample_num = resample_num + + @autocast_box_type() + def transform(self, results: dict) -> dict: + """The YOLOv5 random affine transform function. + + Args: + results (dict): The result dict. + + Returns: + dict: The result dict. 
+ """ + img = results['img'] + # self.border is wh format + height = img.shape[0] + self.border[1] * 2 + width = img.shape[1] + self.border[0] * 2 + + # Note: Different from YOLOX + center_matrix = np.eye(3, dtype=np.float32) + center_matrix[0, 2] = -img.shape[1] / 2 + center_matrix[1, 2] = -img.shape[0] / 2 + + warp_matrix, scaling_ratio = self._get_random_homography_matrix( + height, width) + warp_matrix = warp_matrix @ center_matrix + + img = cv2.warpPerspective( + img, + warp_matrix, + dsize=(width, height), + borderValue=self.border_val) + results['img'] = img + results['img_shape'] = img.shape + img_h, img_w = img.shape[:2] + + bboxes = results['gt_bboxes'] + num_bboxes = len(bboxes) + if num_bboxes: + orig_bboxes = bboxes.clone() + if self.use_mask_refine and 'gt_masks' in results: + # If the dataset has annotations of mask, + # the mask will be used to refine bbox. + gt_masks = results['gt_masks'] + + gt_masks_resample = self.resample_masks(gt_masks) + gt_masks = self.warp_mask(gt_masks_resample, warp_matrix, + img_h, img_w) + + # refine bboxes by masks + bboxes = gt_masks.get_bboxes(dst_type='hbox') + # filter bboxes outside image + valid_index = self.filter_gt_bboxes(orig_bboxes, + bboxes).numpy() + results['gt_masks'] = gt_masks[valid_index] + else: + bboxes.project_(warp_matrix) + if self.bbox_clip_border: + bboxes.clip_([height, width]) + + # filter bboxes + orig_bboxes.rescale_([scaling_ratio, scaling_ratio]) + + # Be careful: valid_index must convert to numpy, + # otherwise it will raise out of bounds when len(valid_index)=1 + valid_index = self.filter_gt_bboxes(orig_bboxes, + bboxes).numpy() + if 'gt_masks' in results: + results['gt_masks'] = PolygonMasks( + results['gt_masks'].masks, img_h, img_w) + + results['gt_bboxes'] = bboxes[valid_index] + results['gt_bboxes_labels'] = results['gt_bboxes_labels'][ + valid_index] + results['gt_ignore_flags'] = results['gt_ignore_flags'][ + valid_index] + + return results + + @staticmethod + def warp_poly(poly: np.ndarray, warp_matrix: np.ndarray, img_w: int, + img_h: int) -> np.ndarray: + """Function to warp one mask and filter points outside image. + + Args: + poly (np.ndarray): Segmentation annotation with shape (n, ) and + with format (x1, y1, x2, y2, ...). + warp_matrix (np.ndarray): Affine transformation matrix. + Shape: (3, 3). + img_w (int): Width of output image. + img_h (int): Height of output image. + """ + # TODO: Current logic may cause retained masks unusable for + # semantic segmentation training, which is same as official + # implementation. + poly = poly.reshape((-1, 2)) + poly = np.concatenate((poly, np.ones( + (len(poly), 1), dtype=poly.dtype)), + axis=-1) + # transform poly + poly = poly @ warp_matrix.T + poly = poly[:, :2] / poly[:, 2:3] + + # filter point outside image + x, y = poly.T + valid_ind_point = (x >= 0) & (y >= 0) & (x <= img_w) & (y <= img_h) + return poly[valid_ind_point].reshape(-1) + + def warp_mask(self, gt_masks: PolygonMasks, warp_matrix: np.ndarray, + img_w: int, img_h: int) -> PolygonMasks: + """Warp masks by warp_matrix and retain masks inside image after + warping. + + Args: + gt_masks (PolygonMasks): Annotations of semantic segmentation. + warp_matrix (np.ndarray): Affine transformation matrix. + Shape: (3, 3). + img_w (int): Width of output image. + img_h (int): Height of output image. + + Returns: + PolygonMasks: Masks after warping. + """ + masks = gt_masks.masks + + new_masks = [] + for poly_per_obj in masks: + warpped_poly_per_obj = [] + # One gt may have multiple masks. 
+ for poly in poly_per_obj: + valid_poly = self.warp_poly(poly, warp_matrix, img_w, img_h) + if len(valid_poly): + warpped_poly_per_obj.append(valid_poly.reshape(-1)) + # If all the masks are invalid, + # add [0, 0, 0, 0, 0, 0,] here. + if not warpped_poly_per_obj: + # This will be filtered in function `filter_gt_bboxes`. + warpped_poly_per_obj = [ + np.zeros(6, dtype=poly_per_obj[0].dtype) + ] + new_masks.append(warpped_poly_per_obj) + + gt_masks = PolygonMasks(new_masks, img_h, img_w) + return gt_masks + + def resample_masks(self, gt_masks: PolygonMasks) -> PolygonMasks: + """Function to resample each mask annotation with shape (2 * n, ) to + shape (resample_num * 2, ). + + Args: + gt_masks (PolygonMasks): Annotations of semantic segmentation. + """ + masks = gt_masks.masks + new_masks = [] + for poly_per_obj in masks: + resample_poly_per_obj = [] + for poly in poly_per_obj: + poly = poly.reshape((-1, 2)) # xy + poly = np.concatenate((poly, poly[0:1, :]), axis=0) + x = np.linspace(0, len(poly) - 1, self.resample_num) + xp = np.arange(len(poly)) + poly = np.concatenate([ + np.interp(x, xp, poly[:, i]) for i in range(2) + ]).reshape(2, -1).T.reshape(-1) + resample_poly_per_obj.append(poly) + new_masks.append(resample_poly_per_obj) + return PolygonMasks(new_masks, gt_masks.height, gt_masks.width) + + def filter_gt_bboxes(self, origin_bboxes: HorizontalBoxes, + wrapped_bboxes: HorizontalBoxes) -> torch.Tensor: + """Filter gt bboxes. + + Args: + origin_bboxes (HorizontalBoxes): Origin bboxes. + wrapped_bboxes (HorizontalBoxes): Wrapped bboxes + + Returns: + dict: The result dict. + """ + origin_w = origin_bboxes.widths + origin_h = origin_bboxes.heights + wrapped_w = wrapped_bboxes.widths + wrapped_h = wrapped_bboxes.heights + aspect_ratio = np.maximum(wrapped_w / (wrapped_h + 1e-16), + wrapped_h / (wrapped_w + 1e-16)) + + wh_valid_idx = (wrapped_w > self.min_bbox_size) & \ + (wrapped_h > self.min_bbox_size) + area_valid_idx = wrapped_w * wrapped_h / (origin_w * origin_h + + 1e-16) > self.min_area_ratio + aspect_ratio_valid_idx = aspect_ratio < self.max_aspect_ratio + return wh_valid_idx & area_valid_idx & aspect_ratio_valid_idx + + @cache_randomness + def _get_random_homography_matrix(self, height: int, + width: int) -> Tuple[np.ndarray, float]: + """Get random homography matrix. + + Args: + height (int): Image height. + width (int): Image width. + + Returns: + Tuple[np.ndarray, float]: The result of warp_matrix and + scaling_ratio. 
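+
+        Note:
+            A rough sketch of the composition built below:
+            ``warp = T(translate) @ Shear @ R(rotate) @ Scale``. The caller
+            (``transform``) additionally right-multiplies a centering matrix,
+            so rotation, scale and shear are taken about the image center.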
+ """ + # Rotation + rotation_degree = random.uniform(-self.max_rotate_degree, + self.max_rotate_degree) + rotation_matrix = self._get_rotation_matrix(rotation_degree) + + # Scaling + scaling_ratio = random.uniform(self.scaling_ratio_range[0], + self.scaling_ratio_range[1]) + scaling_matrix = self._get_scaling_matrix(scaling_ratio) + + # Shear + x_degree = random.uniform(-self.max_shear_degree, + self.max_shear_degree) + y_degree = random.uniform(-self.max_shear_degree, + self.max_shear_degree) + shear_matrix = self._get_shear_matrix(x_degree, y_degree) + + # Translation + trans_x = random.uniform(0.5 - self.max_translate_ratio, + 0.5 + self.max_translate_ratio) * width + trans_y = random.uniform(0.5 - self.max_translate_ratio, + 0.5 + self.max_translate_ratio) * height + translate_matrix = self._get_translation_matrix(trans_x, trans_y) + warp_matrix = ( + translate_matrix @ shear_matrix @ rotation_matrix @ scaling_matrix) + return warp_matrix, scaling_ratio + + @staticmethod + def _get_rotation_matrix(rotate_degrees: float) -> np.ndarray: + """Get rotation matrix. + + Args: + rotate_degrees (float): Rotate degrees. + + Returns: + np.ndarray: The rotation matrix. + """ + radian = math.radians(rotate_degrees) + rotation_matrix = np.array( + [[np.cos(radian), -np.sin(radian), 0.], + [np.sin(radian), np.cos(radian), 0.], [0., 0., 1.]], + dtype=np.float32) + return rotation_matrix + + @staticmethod + def _get_scaling_matrix(scale_ratio: float) -> np.ndarray: + """Get scaling matrix. + + Args: + scale_ratio (float): Scale ratio. + + Returns: + np.ndarray: The scaling matrix. + """ + scaling_matrix = np.array( + [[scale_ratio, 0., 0.], [0., scale_ratio, 0.], [0., 0., 1.]], + dtype=np.float32) + return scaling_matrix + + @staticmethod + def _get_shear_matrix(x_shear_degrees: float, + y_shear_degrees: float) -> np.ndarray: + """Get shear matrix. + + Args: + x_shear_degrees (float): X shear degrees. + y_shear_degrees (float): Y shear degrees. + + Returns: + np.ndarray: The shear matrix. + """ + x_radian = math.radians(x_shear_degrees) + y_radian = math.radians(y_shear_degrees) + shear_matrix = np.array([[1, np.tan(x_radian), 0.], + [np.tan(y_radian), 1, 0.], [0., 0., 1.]], + dtype=np.float32) + return shear_matrix + + @staticmethod + def _get_translation_matrix(x: float, y: float) -> np.ndarray: + """Get translation matrix. + + Args: + x (float): X translation. + y (float): Y translation. + + Returns: + np.ndarray: The translation matrix. + """ + translation_matrix = np.array([[1, 0., x], [0., 1, y], [0., 0., 1.]], + dtype=np.float32) + return translation_matrix + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(max_rotate_degree={self.max_rotate_degree}, ' + repr_str += f'max_translate_ratio={self.max_translate_ratio}, ' + repr_str += f'scaling_ratio_range={self.scaling_ratio_range}, ' + repr_str += f'max_shear_degree={self.max_shear_degree}, ' + repr_str += f'border={self.border}, ' + repr_str += f'border_val={self.border_val}, ' + repr_str += f'bbox_clip_border={self.bbox_clip_border})' + return repr_str + + +@TRANSFORMS.register_module() +class PPYOLOERandomDistort(BaseTransform): + """Random hue, saturation, contrast and brightness distortion. + + Required Keys: + + - img + + Modified Keys: + + - img (np.float32) + + Args: + hue_cfg (dict): Hue settings. Defaults to dict(min=-18, + max=18, prob=0.5). + saturation_cfg (dict): Saturation settings. Defaults to dict( + min=0.5, max=1.5, prob=0.5). + contrast_cfg (dict): Contrast settings. 
Defaults to dict( + min=0.5, max=1.5, prob=0.5). + brightness_cfg (dict): Brightness settings. Defaults to dict( + min=0.5, max=1.5, prob=0.5). + num_distort_func (int): The number of distort function. Defaults + to 4. + """ + + def __init__(self, + hue_cfg: dict = dict(min=-18, max=18, prob=0.5), + saturation_cfg: dict = dict(min=0.5, max=1.5, prob=0.5), + contrast_cfg: dict = dict(min=0.5, max=1.5, prob=0.5), + brightness_cfg: dict = dict(min=0.5, max=1.5, prob=0.5), + num_distort_func: int = 4): + self.hue_cfg = hue_cfg + self.saturation_cfg = saturation_cfg + self.contrast_cfg = contrast_cfg + self.brightness_cfg = brightness_cfg + self.num_distort_func = num_distort_func + assert 0 < self.num_distort_func <= 4, \ + 'num_distort_func must > 0 and <= 4' + for cfg in [ + self.hue_cfg, self.saturation_cfg, self.contrast_cfg, + self.brightness_cfg + ]: + assert 0. <= cfg['prob'] <= 1., 'prob must >=0 and <=1' + + def transform_hue(self, results): + """Transform hue randomly.""" + if random.uniform(0., 1.) >= self.hue_cfg['prob']: + return results + img = results['img'] + delta = random.uniform(self.hue_cfg['min'], self.hue_cfg['max']) + u = np.cos(delta * np.pi) + w = np.sin(delta * np.pi) + delta_iq = np.array([[1.0, 0.0, 0.0], [0.0, u, -w], [0.0, w, u]]) + rgb2yiq_matrix = np.array([[0.114, 0.587, 0.299], + [-0.321, -0.274, 0.596], + [0.311, -0.523, 0.211]]) + yiq2rgb_matric = np.array([[1.0, -1.107, 1.705], [1.0, -0.272, -0.647], + [1.0, 0.956, 0.621]]) + t = np.dot(np.dot(yiq2rgb_matric, delta_iq), rgb2yiq_matrix).T + img = np.dot(img, t) + results['img'] = img + return results + + def transform_saturation(self, results): + """Transform saturation randomly.""" + if random.uniform(0., 1.) >= self.saturation_cfg['prob']: + return results + img = results['img'] + delta = random.uniform(self.saturation_cfg['min'], + self.saturation_cfg['max']) + + # convert bgr img to gray img + gray = img * np.array([[[0.114, 0.587, 0.299]]], dtype=np.float32) + gray = gray.sum(axis=2, keepdims=True) + gray *= (1.0 - delta) + img *= delta + img += gray + results['img'] = img + return results + + def transform_contrast(self, results): + """Transform contrast randomly.""" + if random.uniform(0., 1.) >= self.contrast_cfg['prob']: + return results + img = results['img'] + delta = random.uniform(self.contrast_cfg['min'], + self.contrast_cfg['max']) + img *= delta + results['img'] = img + return results + + def transform_brightness(self, results): + """Transform brightness randomly.""" + if random.uniform(0., 1.) >= self.brightness_cfg['prob']: + return results + img = results['img'] + delta = random.uniform(self.brightness_cfg['min'], + self.brightness_cfg['max']) + img += delta + results['img'] = img + return results + + def transform(self, results: dict) -> dict: + """The hue, saturation, contrast and brightness distortion function. + + Args: + results (dict): The result dict. + + Returns: + dict: The result dict. 
+ """ + results['img'] = results['img'].astype(np.float32) + + functions = [ + self.transform_brightness, self.transform_contrast, + self.transform_saturation, self.transform_hue + ] + distortions = random.permutation(functions)[:self.num_distort_func] + for func in distortions: + results = func(results) + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(hue_cfg={self.hue_cfg}, ' + repr_str += f'saturation_cfg={self.saturation_cfg}, ' + repr_str += f'contrast_cfg={self.contrast_cfg}, ' + repr_str += f'brightness_cfg={self.brightness_cfg}, ' + repr_str += f'num_distort_func={self.num_distort_func})' + return repr_str + + +@TRANSFORMS.register_module() +class PPYOLOERandomCrop(BaseTransform): + """Random crop the img and bboxes. Different thresholds are used in PPYOLOE + to judge whether the clipped image meets the requirements. This + implementation is different from the implementation of RandomCrop in mmdet. + + Required Keys: + + - img + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_bboxes_labels (np.int64) (optional) + - gt_ignore_flags (bool) (optional) + + Modified Keys: + + - img + - img_shape + - gt_bboxes (optional) + - gt_bboxes_labels (optional) + - gt_ignore_flags (optional) + + Added Keys: + - pad_param (np.float32) + + Args: + aspect_ratio (List[float]): Aspect ratio of cropped region. Default to + [.5, 2]. + thresholds (List[float]): Iou thresholds for deciding a valid bbox crop + in [min, max] format. Defaults to [.0, .1, .3, .5, .7, .9]. + scaling (List[float]): Ratio between a cropped region and the original + image in [min, max] format. Default to [.3, 1.]. + num_attempts (int): Number of tries for each threshold before + giving up. Default to 50. + allow_no_crop (bool): Allow return without actually cropping them. + Default to True. + cover_all_box (bool): Ensure all bboxes are covered in the final crop. + Default to False. + """ + + def __init__(self, + aspect_ratio: List[float] = [.5, 2.], + thresholds: List[float] = [.0, .1, .3, .5, .7, .9], + scaling: List[float] = [.3, 1.], + num_attempts: int = 50, + allow_no_crop: bool = True, + cover_all_box: bool = False): + self.aspect_ratio = aspect_ratio + self.thresholds = thresholds + self.scaling = scaling + self.num_attempts = num_attempts + self.allow_no_crop = allow_no_crop + self.cover_all_box = cover_all_box + + def _crop_data(self, results: dict, crop_box: Tuple[int, int, int, int], + valid_inds: np.ndarray) -> Union[dict, None]: + """Function to randomly crop images, bounding boxes, masks, semantic + segmentation maps. + + Args: + results (dict): Result dict from loading pipeline. + crop_box (Tuple[int, int, int, int]): Expected absolute coordinates + for cropping, (x1, y1, x2, y2). + valid_inds (np.ndarray): The indexes of gt that needs to be + retained. + + Returns: + results (Union[dict, None]): Randomly cropped results, 'img_shape' + key in result dict is updated according to crop size. None will + be returned when there is no valid bbox after cropping. + """ + # crop the image + img = results['img'] + crop_x1, crop_y1, crop_x2, crop_y2 = crop_box + img = img[crop_y1:crop_y2, crop_x1:crop_x2, ...] 
+ results['img'] = img + img_shape = img.shape + results['img_shape'] = img.shape + + # crop bboxes accordingly and clip to the image boundary + if results.get('gt_bboxes', None) is not None: + bboxes = results['gt_bboxes'] + bboxes.translate_([-crop_x1, -crop_y1]) + bboxes.clip_(img_shape[:2]) + + results['gt_bboxes'] = bboxes[valid_inds] + + if results.get('gt_ignore_flags', None) is not None: + results['gt_ignore_flags'] = \ + results['gt_ignore_flags'][valid_inds] + + if results.get('gt_bboxes_labels', None) is not None: + results['gt_bboxes_labels'] = \ + results['gt_bboxes_labels'][valid_inds] + + if results.get('gt_masks', None) is not None: + results['gt_masks'] = results['gt_masks'][ + valid_inds.nonzero()[0]].crop( + np.asarray([crop_x1, crop_y1, crop_x2, crop_y2])) + + # crop semantic seg + if results.get('gt_seg_map', None) is not None: + results['gt_seg_map'] = results['gt_seg_map'][crop_y1:crop_y2, + crop_x1:crop_x2] + + return results + + @autocast_box_type() + def transform(self, results: dict) -> Union[dict, None]: + """The random crop transform function. + + Args: + results (dict): The result dict. + + Returns: + dict: The result dict. + """ + if results.get('gt_bboxes', None) is None or len( + results['gt_bboxes']) == 0: + return results + + orig_img_h, orig_img_w = results['img'].shape[:2] + gt_bboxes = results['gt_bboxes'] + + thresholds = list(self.thresholds) + if self.allow_no_crop: + thresholds.append('no_crop') + random.shuffle(thresholds) + + for thresh in thresholds: + # Determine the coordinates for cropping + if thresh == 'no_crop': + return results + + found = False + for i in range(self.num_attempts): + crop_h, crop_w = self._get_crop_size((orig_img_h, orig_img_w)) + if self.aspect_ratio is None: + if crop_h / crop_w < 0.5 or crop_h / crop_w > 2.0: + continue + + # get image crop_box + margin_h = max(orig_img_h - crop_h, 0) + margin_w = max(orig_img_w - crop_w, 0) + offset_h, offset_w = self._rand_offset((margin_h, margin_w)) + crop_y1, crop_y2 = offset_h, offset_h + crop_h + crop_x1, crop_x2 = offset_w, offset_w + crop_w + + crop_box = [crop_x1, crop_y1, crop_x2, crop_y2] + # Calculate the iou between gt_bboxes and crop_boxes + iou = self._iou_matrix(gt_bboxes, + np.array([crop_box], dtype=np.float32)) + # If the maximum value of the iou is less than thresh, + # the current crop_box is considered invalid. + if iou.max() < thresh: + continue + + # If cover_all_box == True and the minimum value of + # the iou is less than thresh, the current crop_box + # is considered invalid. + if self.cover_all_box and iou.min() < thresh: + continue + + # Get which gt_bboxes to keep after cropping. + valid_inds = self._get_valid_inds( + gt_bboxes, np.array(crop_box, dtype=np.float32)) + if valid_inds.size > 0: + found = True + break + + if found: + results = self._crop_data(results, crop_box, valid_inds) + return results + return results + + @cache_randomness + def _rand_offset(self, margin: Tuple[int, int]) -> Tuple[int, int]: + """Randomly generate crop offset. + + Args: + margin (Tuple[int, int]): The upper bound for the offset generated + randomly. + + Returns: + Tuple[int, int]: The random offset for the crop. + """ + margin_h, margin_w = margin + offset_h = np.random.randint(0, margin_h + 1) + offset_w = np.random.randint(0, margin_w + 1) + + return (offset_h, offset_w) + + @cache_randomness + def _get_crop_size(self, image_size: Tuple[int, int]) -> Tuple[int, int]: + """Randomly generates the crop size based on `image_size`. 
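+
+        A scale factor is drawn from ``scaling`` and, when ``aspect_ratio``
+        is set, an aspect ratio from that range (clipped by ``scale``), so
+        the crop is roughly ``crop_h = h * scale / sqrt(ar)`` and
+        ``crop_w = w * scale * sqrt(ar)``.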
+ + Args: + image_size (Tuple[int, int]): (h, w). + + Returns: + crop_size (Tuple[int, int]): (crop_h, crop_w) in absolute pixels. + """ + h, w = image_size + scale = random.uniform(*self.scaling) + if self.aspect_ratio is not None: + min_ar, max_ar = self.aspect_ratio + aspect_ratio = random.uniform( + max(min_ar, scale**2), min(max_ar, scale**-2)) + h_scale = scale / np.sqrt(aspect_ratio) + w_scale = scale * np.sqrt(aspect_ratio) + else: + h_scale = random.uniform(*self.scaling) + w_scale = random.uniform(*self.scaling) + crop_h = h * h_scale + crop_w = w * w_scale + return int(crop_h), int(crop_w) + + def _iou_matrix(self, + gt_bbox: HorizontalBoxes, + crop_bbox: np.ndarray, + eps: float = 1e-10) -> np.ndarray: + """Calculate iou between gt and image crop box. + + Args: + gt_bbox (HorizontalBoxes): Ground truth bounding boxes. + crop_bbox (np.ndarray): Image crop coordinates in + [x1, y1, x2, y2] format. + eps (float): Default to 1e-10. + Return: + (np.ndarray): IoU. + """ + gt_bbox = gt_bbox.tensor.numpy() + lefttop = np.maximum(gt_bbox[:, np.newaxis, :2], crop_bbox[:, :2]) + rightbottom = np.minimum(gt_bbox[:, np.newaxis, 2:], crop_bbox[:, 2:]) + + overlap = np.prod( + rightbottom - lefttop, + axis=2) * (lefttop < rightbottom).all(axis=2) + area_gt_bbox = np.prod(gt_bbox[:, 2:] - crop_bbox[:, :2], axis=1) + area_crop_bbox = np.prod(gt_bbox[:, 2:] - crop_bbox[:, :2], axis=1) + area_o = (area_gt_bbox[:, np.newaxis] + area_crop_bbox - overlap) + return overlap / (area_o + eps) + + def _get_valid_inds(self, gt_bbox: HorizontalBoxes, + img_crop_bbox: np.ndarray) -> np.ndarray: + """Get which Bboxes to keep at the current cropping coordinates. + + Args: + gt_bbox (HorizontalBoxes): Ground truth bounding boxes. + img_crop_bbox (np.ndarray): Image crop coordinates in + [x1, y1, x2, y2] format. + + Returns: + (np.ndarray): Valid indexes. + """ + cropped_box = gt_bbox.tensor.numpy().copy() + gt_bbox = gt_bbox.tensor.numpy().copy() + + cropped_box[:, :2] = np.maximum(gt_bbox[:, :2], img_crop_bbox[:2]) + cropped_box[:, 2:] = np.minimum(gt_bbox[:, 2:], img_crop_bbox[2:]) + cropped_box[:, :2] -= img_crop_bbox[:2] + cropped_box[:, 2:] -= img_crop_bbox[:2] + + centers = (gt_bbox[:, :2] + gt_bbox[:, 2:]) / 2 + valid = np.logical_and(img_crop_bbox[:2] <= centers, + centers < img_crop_bbox[2:]).all(axis=1) + valid = np.logical_and( + valid, (cropped_box[:, :2] < cropped_box[:, 2:]).all(axis=1)) + + return np.where(valid)[0] + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(aspect_ratio={self.aspect_ratio}, ' + repr_str += f'thresholds={self.thresholds}, ' + repr_str += f'scaling={self.scaling}, ' + repr_str += f'num_attempts={self.num_attempts}, ' + repr_str += f'allow_no_crop={self.allow_no_crop}, ' + repr_str += f'cover_all_box={self.cover_all_box})' + return repr_str + + +@TRANSFORMS.register_module() +class YOLOv5CopyPaste(BaseTransform): + """Copy-Paste used in YOLOv5 and YOLOv8. + + This transform randomly copy some objects in the image to the mirror + position of the image.It is different from the `CopyPaste` in mmdet. + + Required Keys: + + - img (np.uint8) + - gt_bboxes (BaseBoxes[torch.float32]) + - gt_bboxes_labels (np.int64) (optional) + - gt_ignore_flags (bool) (optional) + - gt_masks (PolygonMasks) (optional) + + Modified Keys: + + - img + - gt_bboxes + - gt_bboxes_labels (np.int64) (optional) + - gt_ignore_flags (optional) + - gt_masks (optional) + + Args: + ioa_thresh (float): Ioa thresholds for deciding valid bbox. 
+ prob (float): Probability of choosing objects. + Defaults to 0.5. + """ + + def __init__(self, ioa_thresh: float = 0.3, prob: float = 0.5): + self.ioa_thresh = ioa_thresh + self.prob = prob + + @autocast_box_type() + def transform(self, results: dict) -> Union[dict, None]: + """The YOLOv5 and YOLOv8 Copy-Paste transform function. + + Args: + results (dict): The result dict. + + Returns: + dict: The result dict. + """ + if len(results.get('gt_masks', [])) == 0: + return results + gt_masks = results['gt_masks'] + assert isinstance(gt_masks, PolygonMasks),\ + 'only support type of PolygonMasks,' \ + ' but get type: %s' % type(gt_masks) + gt_bboxes = results['gt_bboxes'] + gt_bboxes_labels = results.get('gt_bboxes_labels', None) + img = results['img'] + img_h, img_w = img.shape[:2] + + # calculate ioa + gt_bboxes_flip = deepcopy(gt_bboxes) + gt_bboxes_flip.flip_(img.shape) + + ioa = self.bbox_ioa(gt_bboxes_flip, gt_bboxes) + indexes = torch.nonzero((ioa < self.ioa_thresh).all(1))[:, 0] + n = len(indexes) + valid_inds = random.choice( + indexes, size=round(self.prob * n), replace=False) + if len(valid_inds) == 0: + return results + + if gt_bboxes_labels is not None: + # prepare labels + gt_bboxes_labels = np.concatenate( + (gt_bboxes_labels, gt_bboxes_labels[valid_inds]), axis=0) + + # prepare bboxes + copypaste_bboxes = gt_bboxes_flip[valid_inds] + gt_bboxes = gt_bboxes.cat([gt_bboxes, copypaste_bboxes]) + + # prepare images + copypaste_gt_masks = gt_masks[valid_inds] + copypaste_gt_masks_flip = copypaste_gt_masks.flip() + # convert poly format to bitmap format + # example: poly: [[array(0.0, 0.0, 10.0, 0.0, 10.0, 10.0, 0.0, 10.0]] + # -> bitmap: a mask with shape equal to (1, img_h, img_w) + # # type1 low speed + # copypaste_gt_masks_bitmap = copypaste_gt_masks.to_ndarray() + # copypaste_mask = np.sum(copypaste_gt_masks_bitmap, axis=0) > 0 + + # type2 + copypaste_mask = np.zeros((img_h, img_w), dtype=np.uint8) + for poly in copypaste_gt_masks.masks: + poly = [i.reshape((-1, 1, 2)).astype(np.int32) for i in poly] + cv2.drawContours(copypaste_mask, poly, -1, (1, ), cv2.FILLED) + + copypaste_mask = copypaste_mask.astype(bool) + + # copy objects, and paste to the mirror position of the image + copypaste_mask_flip = mmcv.imflip( + copypaste_mask, direction='horizontal') + copypaste_img = mmcv.imflip(img, direction='horizontal') + img[copypaste_mask_flip] = copypaste_img[copypaste_mask_flip] + + # prepare masks + gt_masks = copypaste_gt_masks.cat([gt_masks, copypaste_gt_masks_flip]) + + if 'gt_ignore_flags' in results: + # prepare gt_ignore_flags + gt_ignore_flags = results['gt_ignore_flags'] + gt_ignore_flags = np.concatenate( + [gt_ignore_flags, gt_ignore_flags[valid_inds]], axis=0) + results['gt_ignore_flags'] = gt_ignore_flags + + results['img'] = img + results['gt_bboxes'] = gt_bboxes + if gt_bboxes_labels is not None: + results['gt_bboxes_labels'] = gt_bboxes_labels + results['gt_masks'] = gt_masks + + return results + + @staticmethod + def bbox_ioa(gt_bboxes_flip: HorizontalBoxes, + gt_bboxes: HorizontalBoxes, + eps: float = 1e-7) -> np.ndarray: + """Calculate ioa between gt_bboxes_flip and gt_bboxes. + + Args: + gt_bboxes_flip (HorizontalBoxes): Flipped ground truth + bounding boxes. + gt_bboxes (HorizontalBoxes): Ground truth bounding boxes. + eps (float): Default to 1e-10. + Return: + (Tensor): Ioa. 
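+
+        Note:
+            The ratio is taken over the area of ``gt_bboxes`` (the second
+            argument), i.e. ``intersection / area(gt_bboxes)``, not a
+            symmetric IoU.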
+ """ + gt_bboxes_flip = gt_bboxes_flip.tensor + gt_bboxes = gt_bboxes.tensor + + # Get the coordinates of bounding boxes + b1_x1, b1_y1, b1_x2, b1_y2 = gt_bboxes_flip.T + b2_x1, b2_y1, b2_x2, b2_y2 = gt_bboxes.T + + # Intersection area + inter_area = (torch.minimum(b1_x2[:, None], + b2_x2) - torch.maximum(b1_x1[:, None], + b2_x1)).clip(0) * \ + (torch.minimum(b1_y2[:, None], + b2_y2) - torch.maximum(b1_y1[:, None], + b2_y1)).clip(0) + + # box2 area + box2_area = (b2_x2 - b2_x1) * (b2_y2 - b2_y1) + eps + + # Intersection over box2 area + return inter_area / box2_area + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(ioa_thresh={self.ioa_thresh},' + repr_str += f'prob={self.prob})' + return repr_str + + +@TRANSFORMS.register_module() +class RemoveDataElement(BaseTransform): + """Remove unnecessary data element in results. + + Args: + keys (Union[str, Sequence[str]]): Keys need to be removed. + """ + + def __init__(self, keys: Union[str, Sequence[str]]): + self.keys = [keys] if isinstance(keys, str) else keys + + def transform(self, results: dict) -> dict: + for key in self.keys: + results.pop(key, None) + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(keys={self.keys})' + return repr_str + + +@TRANSFORMS.register_module() +class RegularizeRotatedBox(BaseTransform): + """Regularize rotated boxes. + + Due to the angle periodicity, one rotated box can be represented in + many different (x, y, w, h, t). To make each rotated box unique, + ``regularize_boxes`` will take the remainder of the angle divided by + 180 degrees. + + For convenience, three angle_version can be used here: + + - 'oc': OpenCV Definition. Has the same box representation as + ``cv2.minAreaRect`` the angle ranges in [-90, 0). + - 'le90': Long Edge Definition (90). the angle ranges in [-90, 90). + The width is always longer than the height. + - 'le135': Long Edge Definition (135). the angle ranges in [-45, 135). + The width is always longer than the height. + + Required Keys: + + - gt_bboxes (RotatedBoxes[torch.float32]) + + Modified Keys: + + - gt_bboxes + + Args: + angle_version (str): Angle version. Can only be 'oc', + 'le90', or 'le135'. Defaults to 'le90. + """ + + def __init__(self, angle_version='le90') -> None: + self.angle_version = angle_version + try: + from mmrotate.structures.bbox import RotatedBoxes + self.box_type = RotatedBoxes + except ImportError: + raise ImportError( + 'Please run "mim install -r requirements/mmrotate.txt" ' + 'to install mmrotate first for rotated detection.') + + def transform(self, results: dict) -> dict: + assert isinstance(results['gt_bboxes'], self.box_type) + results['gt_bboxes'] = self.box_type( + results['gt_bboxes'].regularize_boxes(self.angle_version)) + return results diff --git a/mmyolo/datasets/utils.py b/mmyolo/datasets/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..62fe5484b4befc76328798d6f044d1c283edc397 --- /dev/null +++ b/mmyolo/datasets/utils.py @@ -0,0 +1,114 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Sequence + +import numpy as np +import torch +from mmengine.dataset import COLLATE_FUNCTIONS + +from ..registry import TASK_UTILS + + +@COLLATE_FUNCTIONS.register_module() +def yolov5_collate(data_batch: Sequence, + use_ms_training: bool = False) -> dict: + """Rewrite collate_fn to get faster training speed. + + Args: + data_batch (Sequence): Batch of data. + use_ms_training (bool): Whether to use multi-scale training. 
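+
+    Returns:
+        dict: Collated results with ``inputs`` (a stacked image tensor, or a
+        list of images when ``use_ms_training`` is True) and ``data_samples``
+        holding a ``bboxes_labels`` tensor whose rows are roughly
+        ``(batch_index, class_label, *bbox)``, plus concatenated ``masks``
+        when mask annotations are present.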
+ """ + batch_imgs = [] + batch_bboxes_labels = [] + batch_masks = [] + for i in range(len(data_batch)): + datasamples = data_batch[i]['data_samples'] + inputs = data_batch[i]['inputs'] + batch_imgs.append(inputs) + + gt_bboxes = datasamples.gt_instances.bboxes.tensor + gt_labels = datasamples.gt_instances.labels + if 'masks' in datasamples.gt_instances: + masks = datasamples.gt_instances.masks.to_tensor( + dtype=torch.bool, device=gt_bboxes.device) + batch_masks.append(masks) + batch_idx = gt_labels.new_full((len(gt_labels), 1), i) + bboxes_labels = torch.cat((batch_idx, gt_labels[:, None], gt_bboxes), + dim=1) + batch_bboxes_labels.append(bboxes_labels) + + collated_results = { + 'data_samples': { + 'bboxes_labels': torch.cat(batch_bboxes_labels, 0) + } + } + if len(batch_masks) > 0: + collated_results['data_samples']['masks'] = torch.cat(batch_masks, 0) + + if use_ms_training: + collated_results['inputs'] = batch_imgs + else: + collated_results['inputs'] = torch.stack(batch_imgs, 0) + return collated_results + + +@TASK_UTILS.register_module() +class BatchShapePolicy: + """BatchShapePolicy is only used in the testing phase, which can reduce the + number of pad pixels during batch inference. + + Args: + batch_size (int): Single GPU batch size during batch inference. + Defaults to 32. + img_size (int): Expected output image size. Defaults to 640. + size_divisor (int): The minimum size that is divisible + by size_divisor. Defaults to 32. + extra_pad_ratio (float): Extra pad ratio. Defaults to 0.5. + """ + + def __init__(self, + batch_size: int = 32, + img_size: int = 640, + size_divisor: int = 32, + extra_pad_ratio: float = 0.5): + self.batch_size = batch_size + self.img_size = img_size + self.size_divisor = size_divisor + self.extra_pad_ratio = extra_pad_ratio + + def __call__(self, data_list: List[dict]) -> List[dict]: + image_shapes = [] + for data_info in data_list: + image_shapes.append((data_info['width'], data_info['height'])) + + image_shapes = np.array(image_shapes, dtype=np.float64) + + n = len(image_shapes) # number of images + batch_index = np.floor(np.arange(n) / self.batch_size).astype( + np.int64) # batch index + number_of_batches = batch_index[-1] + 1 # number of batches + + aspect_ratio = image_shapes[:, 1] / image_shapes[:, 0] # aspect ratio + irect = aspect_ratio.argsort() + + data_list = [data_list[i] for i in irect] + + aspect_ratio = aspect_ratio[irect] + # Set training image shapes + shapes = [[1, 1]] * number_of_batches + for i in range(number_of_batches): + aspect_ratio_index = aspect_ratio[batch_index == i] + min_index, max_index = aspect_ratio_index.min( + ), aspect_ratio_index.max() + if max_index < 1: + shapes[i] = [max_index, 1] + elif min_index > 1: + shapes[i] = [1, 1 / min_index] + + batch_shapes = np.ceil( + np.array(shapes) * self.img_size / self.size_divisor + + self.extra_pad_ratio).astype(np.int64) * self.size_divisor + + for i, data_info in enumerate(data_list): + data_info['batch_shape'] = batch_shapes[batch_index[i]] + + return data_list diff --git a/mmyolo/datasets/yolov5_coco.py b/mmyolo/datasets/yolov5_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..55bc899abfcceebfdadf7549e56336725d891dcb --- /dev/null +++ b/mmyolo/datasets/yolov5_coco.py @@ -0,0 +1,65 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
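A minimal numeric sketch of the batch-shape rule implemented by ``BatchShapePolicy`` above; the image sizes and the 640 / 32 / 0.5 settings are illustrative assumptions, not values taken from any config:

    import numpy as np

    # One toy batch of three landscape images, given as (w, h).
    wh = np.array([(1280, 720), (1000, 600), (640, 360)], dtype=np.float64)
    aspect_ratio = wh[:, 1] / wh[:, 0]    # h / w, all < 1 here
    shape = [aspect_ratio.max(), 1.0]     # long (width) side maps to ~img_size
    batch_shape = np.ceil(
        np.array(shape) * 640 / 32 + 0.5).astype(np.int64) * 32
    print(batch_shape)  # [416 672] -> roughly a (h, w) padding target of 416 x 672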
+from typing import Any, Optional + +from mmdet.datasets import BaseDetDataset, CocoDataset + +from ..registry import DATASETS, TASK_UTILS + + +class BatchShapePolicyDataset(BaseDetDataset): + """Dataset with the batch shape policy that makes paddings with least + pixels during batch inference process, which does not require the image + scales of all batches to be the same throughout validation.""" + + def __init__(self, + *args, + batch_shapes_cfg: Optional[dict] = None, + **kwargs): + self.batch_shapes_cfg = batch_shapes_cfg + super().__init__(*args, **kwargs) + + def full_init(self): + """rewrite full_init() to be compatible with serialize_data in + BatchShapePolicy.""" + if self._fully_initialized: + return + # load data information + self.data_list = self.load_data_list() + + # batch_shapes_cfg + if self.batch_shapes_cfg: + batch_shapes_policy = TASK_UTILS.build(self.batch_shapes_cfg) + self.data_list = batch_shapes_policy(self.data_list) + del batch_shapes_policy + + # filter illegal data, such as data that has no annotations. + self.data_list = self.filter_data() + # Get subset data according to indices. + if self._indices is not None: + self.data_list = self._get_unserialized_subset(self._indices) + + # serialize data_list + if self.serialize_data: + self.data_bytes, self.data_address = self._serialize_data() + + self._fully_initialized = True + + def prepare_data(self, idx: int) -> Any: + """Pass the dataset to the pipeline during training to support mixed + data augmentation, such as Mosaic and MixUp.""" + if self.test_mode is False: + data_info = self.get_data_info(idx) + data_info['dataset'] = self + return self.pipeline(data_info) + else: + return super().prepare_data(idx) + + +@DATASETS.register_module() +class YOLOv5CocoDataset(BatchShapePolicyDataset, CocoDataset): + """Dataset for YOLOv5 COCO Dataset. + + We only add `BatchShapePolicy` function compared with CocoDataset. See + `mmyolo/datasets/utils.py#BatchShapePolicy` for details + """ + pass diff --git a/mmyolo/datasets/yolov5_crowdhuman.py b/mmyolo/datasets/yolov5_crowdhuman.py new file mode 100644 index 0000000000000000000000000000000000000000..486a8324fb4c7d8a34bf885f1818d2e6f974f6e7 --- /dev/null +++ b/mmyolo/datasets/yolov5_crowdhuman.py @@ -0,0 +1,15 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.datasets import CrowdHumanDataset + +from ..registry import DATASETS +from .yolov5_coco import BatchShapePolicyDataset + + +@DATASETS.register_module() +class YOLOv5CrowdHumanDataset(BatchShapePolicyDataset, CrowdHumanDataset): + """Dataset for YOLOv5 CrowdHuman Dataset. + + We only add `BatchShapePolicy` function compared with CrowdHumanDataset. + See `mmyolo/datasets/utils.py#BatchShapePolicy` for details + """ + pass diff --git a/mmyolo/datasets/yolov5_dota.py b/mmyolo/datasets/yolov5_dota.py new file mode 100644 index 0000000000000000000000000000000000000000..a9647981333ed725a568a293279873ab9e20db47 --- /dev/null +++ b/mmyolo/datasets/yolov5_dota.py @@ -0,0 +1,29 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +from mmyolo.datasets.yolov5_coco import BatchShapePolicyDataset +from ..registry import DATASETS + +try: + from mmrotate.datasets import DOTADataset + MMROTATE_AVAILABLE = True +except ImportError: + from mmengine.dataset import BaseDataset + DOTADataset = BaseDataset + MMROTATE_AVAILABLE = False + + +@DATASETS.register_module() +class YOLOv5DOTADataset(BatchShapePolicyDataset, DOTADataset): + """Dataset for YOLOv5 DOTA Dataset. 
+ + We only add `BatchShapePolicy` function compared with DOTADataset. See + `mmyolo/datasets/utils.py#BatchShapePolicy` for details + """ + + def __init__(self, *args, **kwargs): + if not MMROTATE_AVAILABLE: + raise ImportError( + 'Please run "mim install -r requirements/mmrotate.txt" ' + 'to install mmrotate first for rotated detection.') + + super().__init__(*args, **kwargs) diff --git a/mmyolo/datasets/yolov5_voc.py b/mmyolo/datasets/yolov5_voc.py new file mode 100644 index 0000000000000000000000000000000000000000..5be764f1db3097645ae1be387e45cafb1b460731 --- /dev/null +++ b/mmyolo/datasets/yolov5_voc.py @@ -0,0 +1,15 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.datasets import VOCDataset + +from mmyolo.datasets.yolov5_coco import BatchShapePolicyDataset +from ..registry import DATASETS + + +@DATASETS.register_module() +class YOLOv5VOCDataset(BatchShapePolicyDataset, VOCDataset): + """Dataset for YOLOv5 VOC Dataset. + + We only add `BatchShapePolicy` function compared with VOCDataset. See + `mmyolo/datasets/utils.py#BatchShapePolicy` for details + """ + pass diff --git a/mmyolo/deploy/__init__.py b/mmyolo/deploy/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4904a9058b41526d9719994ed718ae58336d290e --- /dev/null +++ b/mmyolo/deploy/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdeploy.codebase.base import MMCodebase + +from .models import * # noqa: F401,F403 +from .object_detection import MMYOLO, YOLOObjectDetection + +__all__ = ['MMCodebase', 'MMYOLO', 'YOLOObjectDetection'] diff --git a/mmyolo/deploy/models/__init__.py b/mmyolo/deploy/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4b999a0161543d6a9d2ab56d797af740dc7261e4 --- /dev/null +++ b/mmyolo/deploy/models/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from . import dense_heads # noqa: F401,F403 diff --git a/mmyolo/deploy/models/dense_heads/__init__.py b/mmyolo/deploy/models/dense_heads/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..cc423af3ec374cabe2b9f46d2fe4f4dc9755b8e3 --- /dev/null +++ b/mmyolo/deploy/models/dense_heads/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from . import yolov5_head # noqa: F401,F403 + +__all__ = ['yolov5_head'] diff --git a/mmyolo/deploy/models/dense_heads/yolov5_head.py b/mmyolo/deploy/models/dense_heads/yolov5_head.py new file mode 100644 index 0000000000000000000000000000000000000000..ac996ba41336243ef091e3e952430382be9ff978 --- /dev/null +++ b/mmyolo/deploy/models/dense_heads/yolov5_head.py @@ -0,0 +1,189 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from functools import partial +from typing import List, Optional, Tuple + +import torch +from mmdeploy.codebase.mmdet import get_post_processing_params +from mmdeploy.codebase.mmdet.models.layers import multiclass_nms +from mmdeploy.core import FUNCTION_REWRITER +from mmengine.config import ConfigDict +from mmengine.structures import InstanceData +from torch import Tensor + +from mmyolo.deploy.models.layers import efficient_nms +from mmyolo.models.dense_heads import YOLOv5Head + + +def yolov5_bbox_decoder(priors: Tensor, bbox_preds: Tensor, + stride: int) -> Tensor: + """Decode YOLOv5 bounding boxes. + + Args: + priors (Tensor): Prior boxes in center-offset form. + bbox_preds (Tensor): Predicted bounding boxes. + stride (int): Stride of the feature map. + + Returns: + Tensor: Decoded bounding boxes. 
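+
+    Note:
+        A sketch of the decode implemented below: with
+        ``p = bbox_preds.sigmoid()``, the center is
+        ``cx = (p_x - 0.5) * 2 * stride + prior_cx`` and the size is
+        ``w = (p_w * 2) ** 2 * prior_w`` (likewise for ``cy`` / ``h``),
+        where the prior center and size come from the xyxy ``priors`` and the
+        decoded boxes are returned in (cx, cy, w, h) form.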
+ """ + bbox_preds = bbox_preds.sigmoid() + + x_center = (priors[..., 0] + priors[..., 2]) * 0.5 + y_center = (priors[..., 1] + priors[..., 3]) * 0.5 + w = priors[..., 2] - priors[..., 0] + h = priors[..., 3] - priors[..., 1] + + x_center_pred = (bbox_preds[..., 0] - 0.5) * 2 * stride + x_center + y_center_pred = (bbox_preds[..., 1] - 0.5) * 2 * stride + y_center + w_pred = (bbox_preds[..., 2] * 2)**2 * w + h_pred = (bbox_preds[..., 3] * 2)**2 * h + + decoded_bboxes = torch.stack( + [x_center_pred, y_center_pred, w_pred, h_pred], dim=-1) + + return decoded_bboxes + + +@FUNCTION_REWRITER.register_rewriter( + func_name='mmyolo.models.dense_heads.yolov5_head.' + 'YOLOv5Head.predict_by_feat') +def yolov5_head__predict_by_feat(self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + objectnesses: Optional[List[Tensor]] = None, + batch_img_metas: Optional[List[dict]] = None, + cfg: Optional[ConfigDict] = None, + rescale: bool = False, + with_nms: bool = True) -> Tuple[InstanceData]: + """Transform a batch of output features extracted by the head into + bbox results. + Args: + cls_scores (list[Tensor]): Classification scores for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * 4, H, W). + objectnesses (list[Tensor], Optional): Score factor for + all scale level, each is a 4D-tensor, has shape + (batch_size, 1, H, W). + batch_img_metas (list[dict], Optional): Batch image meta info. + Defaults to None. + cfg (ConfigDict, optional): Test / postprocessing + configuration, if None, test_cfg would be used. + Defaults to None. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + with_nms (bool): If True, do nms before return boxes. + Defaults to True. + Returns: + tuple[Tensor, Tensor]: The first item is an (N, num_box, 5) tensor, + where 5 represent (tl_x, tl_y, br_x, br_y, score), N is batch + size and the score between 0 and 1. The shape of the second + tensor in the tuple is (N, num_box), and each element + represents the class label of the corresponding box. 
+ """ + ctx = FUNCTION_REWRITER.get_context() + detector_type = type(self) + deploy_cfg = ctx.cfg + use_efficientnms = deploy_cfg.get('use_efficientnms', False) + dtype = cls_scores[0].dtype + device = cls_scores[0].device + bbox_decoder = self.bbox_coder.decode + nms_func = multiclass_nms + if use_efficientnms: + if detector_type is YOLOv5Head: + nms_func = partial(efficient_nms, box_coding=0) + bbox_decoder = yolov5_bbox_decoder + else: + nms_func = efficient_nms + + assert len(cls_scores) == len(bbox_preds) + cfg = self.test_cfg if cfg is None else cfg + cfg = copy.deepcopy(cfg) + + num_imgs = cls_scores[0].shape[0] + featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores] + + mlvl_priors = self.prior_generator.grid_priors( + featmap_sizes, dtype=dtype, device=device) + + flatten_priors = torch.cat(mlvl_priors) + + mlvl_strides = [ + flatten_priors.new_full( + (featmap_size[0] * featmap_size[1] * self.num_base_priors, ), + stride) + for featmap_size, stride in zip(featmap_sizes, self.featmap_strides) + ] + flatten_stride = torch.cat(mlvl_strides) + + # flatten cls_scores, bbox_preds and objectness + flatten_cls_scores = [ + cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1, self.num_classes) + for cls_score in cls_scores + ] + cls_scores = torch.cat(flatten_cls_scores, dim=1).sigmoid() + + flatten_bbox_preds = [ + bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) + for bbox_pred in bbox_preds + ] + flatten_bbox_preds = torch.cat(flatten_bbox_preds, dim=1) + + if objectnesses is not None: + flatten_objectness = [ + objectness.permute(0, 2, 3, 1).reshape(num_imgs, -1) + for objectness in objectnesses + ] + flatten_objectness = torch.cat(flatten_objectness, dim=1).sigmoid() + cls_scores = cls_scores * (flatten_objectness.unsqueeze(-1)) + + scores = cls_scores + + bboxes = bbox_decoder(flatten_priors[None], flatten_bbox_preds, + flatten_stride) + + if not with_nms: + return bboxes, scores + + post_params = get_post_processing_params(deploy_cfg) + max_output_boxes_per_class = post_params.max_output_boxes_per_class + iou_threshold = cfg.nms.get('iou_threshold', post_params.iou_threshold) + score_threshold = cfg.get('score_thr', post_params.score_threshold) + pre_top_k = post_params.pre_top_k + keep_top_k = cfg.get('max_per_img', post_params.keep_top_k) + + return nms_func(bboxes, scores, max_output_boxes_per_class, iou_threshold, + score_threshold, pre_top_k, keep_top_k) + + +@FUNCTION_REWRITER.register_rewriter( + func_name='mmyolo.models.dense_heads.yolov5_head.' + 'YOLOv5Head.predict', + backend='rknn') +def yolov5_head__predict__rknn(self, x: Tuple[Tensor], *args, + **kwargs) -> Tuple[Tensor, Tensor, Tensor]: + """Perform forward propagation of the detection head and predict detection + results on the features of the upstream network. + + Args: + x (tuple[Tensor]): Multi-level features from the + upstream network, each is a 4D-tensor. + """ + outs = self(x) + return outs + + +@FUNCTION_REWRITER.register_rewriter( + func_name='mmyolo.models.dense_heads.yolov5_head.' 
+ 'YOLOv5HeadModule.forward', + backend='rknn') +def yolov5_head_module__forward__rknn( + self, x: Tensor, *args, **kwargs) -> Tuple[Tensor, Tensor, Tensor]: + """Forward feature of a single scale level.""" + out = [] + for i, feat in enumerate(x): + out.append(self.convs_pred[i](feat)) + return out diff --git a/mmyolo/deploy/models/layers/__init__.py b/mmyolo/deploy/models/layers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6017cf83425b640eb788a8abf6b253f29d759afb --- /dev/null +++ b/mmyolo/deploy/models/layers/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .bbox_nms import efficient_nms + +__all__ = ['efficient_nms'] diff --git a/mmyolo/deploy/models/layers/bbox_nms.py b/mmyolo/deploy/models/layers/bbox_nms.py new file mode 100644 index 0000000000000000000000000000000000000000..4db81c0227a36e0315855082dcd8125e1f9be70a --- /dev/null +++ b/mmyolo/deploy/models/layers/bbox_nms.py @@ -0,0 +1,113 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from mmdeploy.core import mark +from torch import Tensor + + +def _efficient_nms( + boxes: Tensor, + scores: Tensor, + max_output_boxes_per_class: int = 1000, + iou_threshold: float = 0.5, + score_threshold: float = 0.05, + pre_top_k: int = -1, + keep_top_k: int = 100, + box_coding: int = 0, +): + """Wrapper for `efficient_nms` with TensorRT. + + Args: + boxes (Tensor): The bounding boxes of shape [N, num_boxes, 4]. + scores (Tensor): The detection scores of shape + [N, num_boxes, num_classes]. + max_output_boxes_per_class (int): Maximum number of output + boxes per class of nms. Defaults to 1000. + iou_threshold (float): IOU threshold of nms. Defaults to 0.5. + score_threshold (float): score threshold of nms. + Defaults to 0.05. + pre_top_k (int): Number of top K boxes to keep before nms. + Defaults to -1. + keep_top_k (int): Number of top K boxes to keep after nms. + Defaults to -1. + box_coding (int): Bounding boxes format for nms. + Defaults to 0 means [x, y, w, h]. + Set to 1 means [x1, y1 ,x2, y2]. + + Returns: + tuple[Tensor, Tensor]: (dets, labels), `dets` of shape [N, num_det, 5] + and `labels` of shape [N, num_det]. 
+ """ + boxes = boxes if boxes.dim() == 4 else boxes.unsqueeze(2) + _, det_boxes, det_scores, labels = TRTEfficientNMSop.apply( + boxes, scores, -1, box_coding, iou_threshold, keep_top_k, '1', 0, + score_threshold) + dets = torch.cat([det_boxes, det_scores.unsqueeze(2)], -1) + + # retain shape info + batch_size = boxes.size(0) + + dets_shape = dets.shape + label_shape = labels.shape + dets = dets.reshape([batch_size, *dets_shape[1:]]) + labels = labels.reshape([batch_size, *label_shape[1:]]) + return dets, labels + + +@mark('efficient_nms', inputs=['boxes', 'scores'], outputs=['dets', 'labels']) +def efficient_nms(*args, **kwargs): + """Wrapper function for `_efficient_nms`.""" + return _efficient_nms(*args, **kwargs) + + +class TRTEfficientNMSop(torch.autograd.Function): + """Efficient NMS op for TensorRT.""" + + @staticmethod + def forward( + ctx, + boxes, + scores, + background_class=-1, + box_coding=0, + iou_threshold=0.45, + max_output_boxes=100, + plugin_version='1', + score_activation=0, + score_threshold=0.25, + ): + """Forward function of TRTEfficientNMSop.""" + batch_size, num_boxes, num_classes = scores.shape + num_det = torch.randint( + 0, max_output_boxes, (batch_size, 1), dtype=torch.int32) + det_boxes = torch.randn(batch_size, max_output_boxes, 4) + det_scores = torch.randn(batch_size, max_output_boxes) + det_classes = torch.randint( + 0, num_classes, (batch_size, max_output_boxes), dtype=torch.int32) + return num_det, det_boxes, det_scores, det_classes + + @staticmethod + def symbolic(g, + boxes, + scores, + background_class=-1, + box_coding=0, + iou_threshold=0.45, + max_output_boxes=100, + plugin_version='1', + score_activation=0, + score_threshold=0.25): + """Symbolic function of TRTEfficientNMSop.""" + out = g.op( + 'TRT::EfficientNMS_TRT', + boxes, + scores, + background_class_i=background_class, + box_coding_i=box_coding, + iou_threshold_f=iou_threshold, + max_output_boxes_i=max_output_boxes, + plugin_version_s=plugin_version, + score_activation_i=score_activation, + score_threshold_f=score_threshold, + outputs=4) + nums, boxes, scores, classes = out + return nums, boxes, scores, classes diff --git a/mmyolo/deploy/object_detection.py b/mmyolo/deploy/object_detection.py new file mode 100644 index 0000000000000000000000000000000000000000..7efdfcfb7a46c8bc6b90e76bd06d9065410e55f0 --- /dev/null +++ b/mmyolo/deploy/object_detection.py @@ -0,0 +1,132 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
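A rough shape sketch for the ``efficient_nms`` wrapper above. Outside a real TensorRT export the op's PyTorch ``forward`` only emits placeholder tensors, so this only illustrates the expected shapes, and it assumes ``mark`` acts as a pass-through when no export session is active:

    import torch

    from mmyolo.deploy.models.layers import efficient_nms

    boxes = torch.rand(1, 1000, 4)    # [batch, num_boxes, 4]
    scores = torch.rand(1, 1000, 80)  # [batch, num_boxes, num_classes]
    dets, labels = efficient_nms(boxes, scores, keep_top_k=100)
    # dets: [1, 100, 5] as (x1, y1, x2, y2, score); labels: [1, 100]
    print(dets.shape, labels.shape)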
+from typing import Callable, Dict, Optional + +import torch +from mmdeploy.codebase.base import CODEBASE, MMCodebase +from mmdeploy.codebase.mmdet.deploy import ObjectDetection +from mmdeploy.utils import Codebase, Task +from mmengine import Config +from mmengine.registry import Registry + +MMYOLO_TASK = Registry('mmyolo_tasks') + + +@CODEBASE.register_module(Codebase.MMYOLO.value) +class MMYOLO(MMCodebase): + """MMYOLO codebase class.""" + + task_registry = MMYOLO_TASK + + @classmethod + def register_deploy_modules(cls): + """register all rewriters for mmdet.""" + import mmdeploy.codebase.mmdet.models # noqa: F401 + import mmdeploy.codebase.mmdet.ops # noqa: F401 + import mmdeploy.codebase.mmdet.structures # noqa: F401 + + @classmethod + def register_all_modules(cls): + """register all modules.""" + from mmdet.utils.setup_env import \ + register_all_modules as register_all_modules_mmdet + + from mmyolo.utils.setup_env import \ + register_all_modules as register_all_modules_mmyolo + + cls.register_deploy_modules() + register_all_modules_mmyolo(True) + register_all_modules_mmdet(False) + + +def _get_dataset_metainfo(model_cfg: Config): + """Get metainfo of dataset. + + Args: + model_cfg Config: Input model Config object. + + Returns: + list[str]: A list of string specifying names of different class. + """ + from mmyolo import datasets # noqa + from mmyolo.registry import DATASETS + + module_dict = DATASETS.module_dict + for dataloader_name in [ + 'test_dataloader', 'val_dataloader', 'train_dataloader' + ]: + if dataloader_name not in model_cfg: + continue + dataloader_cfg = model_cfg[dataloader_name] + dataset_cfg = dataloader_cfg.dataset + dataset_cls = module_dict.get(dataset_cfg.type, None) + if dataset_cls is None: + continue + if hasattr(dataset_cls, '_load_metainfo') and isinstance( + dataset_cls._load_metainfo, Callable): + meta = dataset_cls._load_metainfo( + dataset_cfg.get('metainfo', None)) + if meta is not None: + return meta + if hasattr(dataset_cls, 'METAINFO'): + return dataset_cls.METAINFO + + return None + + +@MMYOLO_TASK.register_module(Task.OBJECT_DETECTION.value) +class YOLOObjectDetection(ObjectDetection): + """YOLO Object Detection task.""" + + def get_visualizer(self, name: str, save_dir: str): + """Get visualizer. + + Args: + name (str): Name of visualizer. + save_dir (str): Directory to save visualization results. + + Returns: + Visualizer: A visualizer instance. + """ + from mmdet.visualization import DetLocalVisualizer # noqa: F401,F403 + metainfo = _get_dataset_metainfo(self.model_cfg) + visualizer = super().get_visualizer(name, save_dir) + if metainfo is not None: + visualizer.dataset_meta = metainfo + return visualizer + + def build_pytorch_model(self, + model_checkpoint: Optional[str] = None, + cfg_options: Optional[Dict] = None, + **kwargs) -> torch.nn.Module: + """Initialize torch model. + + Args: + model_checkpoint (str): The checkpoint file of torch model, + defaults to `None`. + cfg_options (dict): Optional config key-pair parameters. + Returns: + nn.Module: An initialized torch model generated by other OpenMMLab + codebases. 
+ """ + from copy import deepcopy + + from mmengine.model import revert_sync_batchnorm + from mmengine.registry import MODELS + + from mmyolo.utils import switch_to_deploy + + model = deepcopy(self.model_cfg.model) + preprocess_cfg = deepcopy(self.model_cfg.get('preprocess_cfg', {})) + preprocess_cfg.update( + deepcopy(self.model_cfg.get('data_preprocessor', {}))) + model.setdefault('data_preprocessor', preprocess_cfg) + model = MODELS.build(model) + if model_checkpoint is not None: + from mmengine.runner.checkpoint import load_checkpoint + load_checkpoint(model, model_checkpoint, map_location=self.device) + + model = revert_sync_batchnorm(model) + switch_to_deploy(model) + model = model.to(self.device) + model.eval() + return model diff --git a/mmyolo/engine/__init__.py b/mmyolo/engine/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b2e0a126c09797b327f7309d6e980245b7e44773 --- /dev/null +++ b/mmyolo/engine/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .hooks import * # noqa: F401,F403 +from .optimizers import * # noqa: F401,F403 diff --git a/mmyolo/engine/hooks/__init__.py b/mmyolo/engine/hooks/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0b8deebc8827da5b9a3f8c92a2fffe70e42d0bfa --- /dev/null +++ b/mmyolo/engine/hooks/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .ppyoloe_param_scheduler_hook import PPYOLOEParamSchedulerHook +from .switch_to_deploy_hook import SwitchToDeployHook +from .yolov5_param_scheduler_hook import YOLOv5ParamSchedulerHook +from .yolox_mode_switch_hook import YOLOXModeSwitchHook + +__all__ = [ + 'YOLOv5ParamSchedulerHook', 'YOLOXModeSwitchHook', 'SwitchToDeployHook', + 'PPYOLOEParamSchedulerHook' +] diff --git a/mmyolo/engine/hooks/ppyoloe_param_scheduler_hook.py b/mmyolo/engine/hooks/ppyoloe_param_scheduler_hook.py new file mode 100644 index 0000000000000000000000000000000000000000..26dfe6ef2d5cf590ea381efb3e42cdc1c5492361 --- /dev/null +++ b/mmyolo/engine/hooks/ppyoloe_param_scheduler_hook.py @@ -0,0 +1,96 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from typing import Optional + +from mmengine.hooks import ParamSchedulerHook +from mmengine.runner import Runner + +from mmyolo.registry import HOOKS + + +@HOOKS.register_module() +class PPYOLOEParamSchedulerHook(ParamSchedulerHook): + """A hook to update learning rate and momentum in optimizer of PPYOLOE. We + use this hook to implement adaptive computation for `warmup_total_iters`, + which is not possible with the built-in ParamScheduler in mmyolo. + + Args: + warmup_min_iter (int): Minimum warmup iters. Defaults to 1000. + start_factor (float): The number we multiply learning rate in the + first epoch. The multiplication factor changes towards end_factor + in the following epochs. Defaults to 0. + warmup_epochs (int): Epochs for warmup. Defaults to 5. + min_lr_ratio (float): Minimum learning rate ratio. + total_epochs (int): In PPYOLOE, `total_epochs` is set to + training_epochs x 1.2. Defaults to 360. 
+ """ + priority = 9 + + def __init__(self, + warmup_min_iter: int = 1000, + start_factor: float = 0., + warmup_epochs: int = 5, + min_lr_ratio: float = 0.0, + total_epochs: int = 360): + + self.warmup_min_iter = warmup_min_iter + self.start_factor = start_factor + self.warmup_epochs = warmup_epochs + self.min_lr_ratio = min_lr_ratio + self.total_epochs = total_epochs + + self._warmup_end = False + self._base_lr = None + + def before_train(self, runner: Runner): + """Operations before train. + + Args: + runner (Runner): The runner of the training process. + """ + optimizer = runner.optim_wrapper.optimizer + for group in optimizer.param_groups: + # If the param is never be scheduled, record the current value + # as the initial value. + group.setdefault('initial_lr', group['lr']) + + self._base_lr = [ + group['initial_lr'] for group in optimizer.param_groups + ] + self._min_lr = [i * self.min_lr_ratio for i in self._base_lr] + + def before_train_iter(self, + runner: Runner, + batch_idx: int, + data_batch: Optional[dict] = None): + """Operations before each training iteration. + + Args: + runner (Runner): The runner of the training process. + batch_idx (int): The index of the current batch in the train loop. + data_batch (dict or tuple or list, optional): Data from dataloader. + """ + cur_iters = runner.iter + optimizer = runner.optim_wrapper.optimizer + dataloader_len = len(runner.train_dataloader) + + # The minimum warmup is self.warmup_min_iter + warmup_total_iters = max( + round(self.warmup_epochs * dataloader_len), self.warmup_min_iter) + + if cur_iters <= warmup_total_iters: + # warm up + alpha = cur_iters / warmup_total_iters + factor = self.start_factor * (1 - alpha) + alpha + + for group_idx, param in enumerate(optimizer.param_groups): + param['lr'] = self._base_lr[group_idx] * factor + else: + for group_idx, param in enumerate(optimizer.param_groups): + total_iters = self.total_epochs * dataloader_len + lr = self._min_lr[group_idx] + ( + self._base_lr[group_idx] - + self._min_lr[group_idx]) * 0.5 * ( + math.cos((cur_iters - warmup_total_iters) * math.pi / + (total_iters - warmup_total_iters)) + 1.0) + param['lr'] = lr diff --git a/mmyolo/engine/hooks/switch_to_deploy_hook.py b/mmyolo/engine/hooks/switch_to_deploy_hook.py new file mode 100644 index 0000000000000000000000000000000000000000..28ac345f40c44c974fb33b7bf9756a61fcabf820 --- /dev/null +++ b/mmyolo/engine/hooks/switch_to_deploy_hook.py @@ -0,0 +1,21 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +from mmengine.hooks import Hook +from mmengine.runner import Runner + +from mmyolo.registry import HOOKS +from mmyolo.utils import switch_to_deploy + + +@HOOKS.register_module() +class SwitchToDeployHook(Hook): + """Switch to deploy mode before testing. + + This hook converts the multi-channel structure of the training network + (high performance) to the one-way structure of the testing network (fast + speed and memory saving). + """ + + def before_test_epoch(self, runner: Runner): + """Switch to deploy mode before testing.""" + switch_to_deploy(runner.model) diff --git a/mmyolo/engine/hooks/yolov5_param_scheduler_hook.py b/mmyolo/engine/hooks/yolov5_param_scheduler_hook.py new file mode 100644 index 0000000000000000000000000000000000000000..777bb49d7abd7fc37385370546d05e70c274b3b7 --- /dev/null +++ b/mmyolo/engine/hooks/yolov5_param_scheduler_hook.py @@ -0,0 +1,130 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import math +from typing import Optional + +import numpy as np +from mmengine.hooks import ParamSchedulerHook +from mmengine.runner import Runner + +from mmyolo.registry import HOOKS + + +def linear_fn(lr_factor: float, max_epochs: int): + """Generate linear function.""" + return lambda x: (1 - x / max_epochs) * (1.0 - lr_factor) + lr_factor + + +def cosine_fn(lr_factor: float, max_epochs: int): + """Generate cosine function.""" + return lambda x: ( + (1 - math.cos(x * math.pi / max_epochs)) / 2) * (lr_factor - 1) + 1 + + +@HOOKS.register_module() +class YOLOv5ParamSchedulerHook(ParamSchedulerHook): + """A hook to update learning rate and momentum in optimizer of YOLOv5.""" + priority = 9 + + scheduler_maps = {'linear': linear_fn, 'cosine': cosine_fn} + + def __init__(self, + scheduler_type: str = 'linear', + lr_factor: float = 0.01, + max_epochs: int = 300, + warmup_epochs: int = 3, + warmup_bias_lr: float = 0.1, + warmup_momentum: float = 0.8, + warmup_mim_iter: int = 1000, + **kwargs): + + assert scheduler_type in self.scheduler_maps + + self.warmup_epochs = warmup_epochs + self.warmup_bias_lr = warmup_bias_lr + self.warmup_momentum = warmup_momentum + self.warmup_mim_iter = warmup_mim_iter + + kwargs.update({'lr_factor': lr_factor, 'max_epochs': max_epochs}) + self.scheduler_fn = self.scheduler_maps[scheduler_type](**kwargs) + + self._warmup_end = False + self._base_lr = None + self._base_momentum = None + + def before_train(self, runner: Runner): + """Operations before train. + + Args: + runner (Runner): The runner of the training process. + """ + optimizer = runner.optim_wrapper.optimizer + for group in optimizer.param_groups: + # If the param is never be scheduled, record the current value + # as the initial value. + group.setdefault('initial_lr', group['lr']) + group.setdefault('initial_momentum', group.get('momentum', -1)) + + self._base_lr = [ + group['initial_lr'] for group in optimizer.param_groups + ] + self._base_momentum = [ + group['initial_momentum'] for group in optimizer.param_groups + ] + + def before_train_iter(self, + runner: Runner, + batch_idx: int, + data_batch: Optional[dict] = None): + """Operations before each training iteration. + + Args: + runner (Runner): The runner of the training process. + batch_idx (int): The index of the current batch in the train loop. + data_batch (dict or tuple or list, optional): Data from dataloader. + """ + cur_iters = runner.iter + cur_epoch = runner.epoch + optimizer = runner.optim_wrapper.optimizer + + # The minimum warmup is self.warmup_mim_iter + warmup_total_iters = max( + round(self.warmup_epochs * len(runner.train_dataloader)), + self.warmup_mim_iter) + + if cur_iters <= warmup_total_iters: + xp = [0, warmup_total_iters] + for group_idx, param in enumerate(optimizer.param_groups): + if group_idx == 2: + # bias learning rate will be handled specially + yp = [ + self.warmup_bias_lr, + self._base_lr[group_idx] * self.scheduler_fn(cur_epoch) + ] + else: + yp = [ + 0.0, + self._base_lr[group_idx] * self.scheduler_fn(cur_epoch) + ] + param['lr'] = np.interp(cur_iters, xp, yp) + + if 'momentum' in param: + param['momentum'] = np.interp( + cur_iters, xp, + [self.warmup_momentum, self._base_momentum[group_idx]]) + else: + self._warmup_end = True + + def after_train_epoch(self, runner: Runner): + """Operations after each training epoch. + + Args: + runner (Runner): The runner of the training process. 
+ """ + if not self._warmup_end: + return + + cur_epoch = runner.epoch + optimizer = runner.optim_wrapper.optimizer + for group_idx, param in enumerate(optimizer.param_groups): + param['lr'] = self._base_lr[group_idx] * self.scheduler_fn( + cur_epoch) diff --git a/mmyolo/engine/hooks/yolox_mode_switch_hook.py b/mmyolo/engine/hooks/yolox_mode_switch_hook.py new file mode 100644 index 0000000000000000000000000000000000000000..27711768c3f89b26410ae1373bc920d0bfded603 --- /dev/null +++ b/mmyolo/engine/hooks/yolox_mode_switch_hook.py @@ -0,0 +1,54 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from typing import Sequence + +from mmengine.hooks import Hook +from mmengine.model import is_model_wrapper +from mmengine.runner import Runner + +from mmyolo.registry import HOOKS + + +@HOOKS.register_module() +class YOLOXModeSwitchHook(Hook): + """Switch the mode of YOLOX during training. + + This hook turns off the mosaic and mixup data augmentation and switches + to use L1 loss in bbox_head. + + Args: + num_last_epochs (int): The number of latter epochs in the end of the + training to close the data augmentation and switch to L1 loss. + Defaults to 15. + """ + + def __init__(self, + num_last_epochs: int = 15, + new_train_pipeline: Sequence[dict] = None): + self.num_last_epochs = num_last_epochs + self.new_train_pipeline_cfg = new_train_pipeline + + def before_train_epoch(self, runner: Runner): + """Close mosaic and mixup augmentation and switches to use L1 loss.""" + epoch = runner.epoch + model = runner.model + if is_model_wrapper(model): + model = model.module + + if (epoch + 1) == runner.max_epochs - self.num_last_epochs: + runner.logger.info(f'New Pipeline: {self.new_train_pipeline_cfg}') + + train_dataloader_cfg = copy.deepcopy(runner.cfg.train_dataloader) + train_dataloader_cfg.dataset.pipeline = self.new_train_pipeline_cfg + # Note: Why rebuild the dataset? + # When build_dataloader will make a deep copy of the dataset, + # it will lead to potential risks, such as the global instance + # object FileClient data is disordered. + # This problem needs to be solved in the future. + new_train_dataloader = Runner.build_dataloader( + train_dataloader_cfg) + runner.train_loop.dataloader = new_train_dataloader + + runner.logger.info('recreate the dataloader!') + runner.logger.info('Add additional bbox reg loss now!') + model.bbox_head.use_bbox_aux = True diff --git a/mmyolo/engine/optimizers/__init__.py b/mmyolo/engine/optimizers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b598020d05db54cdc1d803d39ebd2c91026a6112 --- /dev/null +++ b/mmyolo/engine/optimizers/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .yolov5_optim_constructor import YOLOv5OptimizerConstructor +from .yolov7_optim_wrapper_constructor import YOLOv7OptimWrapperConstructor + +__all__ = ['YOLOv5OptimizerConstructor', 'YOLOv7OptimWrapperConstructor'] diff --git a/mmyolo/engine/optimizers/yolov5_optim_constructor.py b/mmyolo/engine/optimizers/yolov5_optim_constructor.py new file mode 100644 index 0000000000000000000000000000000000000000..5e5f42cb5c2c18962f989288b45011c742845c2f --- /dev/null +++ b/mmyolo/engine/optimizers/yolov5_optim_constructor.py @@ -0,0 +1,132 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Optional + +import torch.nn as nn +from mmengine.dist import get_world_size +from mmengine.logging import print_log +from mmengine.model import is_model_wrapper +from mmengine.optim import OptimWrapper + +from mmyolo.registry import (OPTIM_WRAPPER_CONSTRUCTORS, OPTIM_WRAPPERS, + OPTIMIZERS) + + +@OPTIM_WRAPPER_CONSTRUCTORS.register_module() +class YOLOv5OptimizerConstructor: + """YOLOv5 constructor for optimizers. + + It has the following functions: + + - divides the optimizer parameters into 3 groups: + Conv, Bias and BN + + - support `weight_decay` parameter adaption based on + `batch_size_per_gpu` + + Args: + optim_wrapper_cfg (dict): The config dict of the optimizer wrapper. + Positional fields are + + - ``type``: class name of the OptimizerWrapper + - ``optimizer``: The configuration of optimizer. + + Optional fields are + + - any arguments of the corresponding optimizer wrapper type, + e.g., accumulative_counts, clip_grad, etc. + + The positional fields of ``optimizer`` are + + - `type`: class name of the optimizer. + + Optional fields are + + - any arguments of the corresponding optimizer type, e.g., + lr, weight_decay, momentum, etc. + + paramwise_cfg (dict, optional): Parameter-wise options. Must include + `base_total_batch_size` if not None. If the total input batch + is smaller than `base_total_batch_size`, the `weight_decay` + parameter will be kept unchanged, otherwise linear scaling. + + Example: + >>> model = torch.nn.modules.Conv1d(1, 1, 1) + >>> optim_wrapper_cfg = dict( + >>> dict(type='OptimWrapper', optimizer=dict(type='SGD', lr=0.01, + >>> momentum=0.9, weight_decay=0.0001, batch_size_per_gpu=16)) + >>> paramwise_cfg = dict(base_total_batch_size=64) + >>> optim_wrapper_builder = YOLOv5OptimizerConstructor( + >>> optim_wrapper_cfg, paramwise_cfg) + >>> optim_wrapper = optim_wrapper_builder(model) + """ + + def __init__(self, + optim_wrapper_cfg: dict, + paramwise_cfg: Optional[dict] = None): + if paramwise_cfg is None: + paramwise_cfg = {'base_total_batch_size': 64} + assert 'base_total_batch_size' in paramwise_cfg + + if not isinstance(optim_wrapper_cfg, dict): + raise TypeError('optimizer_cfg should be a dict', + f'but got {type(optim_wrapper_cfg)}') + assert 'optimizer' in optim_wrapper_cfg, ( + '`optim_wrapper_cfg` must contain "optimizer" config') + + self.optim_wrapper_cfg = optim_wrapper_cfg + self.optimizer_cfg = self.optim_wrapper_cfg.pop('optimizer') + self.base_total_batch_size = paramwise_cfg['base_total_batch_size'] + + def __call__(self, model: nn.Module) -> OptimWrapper: + if is_model_wrapper(model): + model = model.module + optimizer_cfg = self.optimizer_cfg.copy() + weight_decay = optimizer_cfg.pop('weight_decay', 0) + + if 'batch_size_per_gpu' in optimizer_cfg: + batch_size_per_gpu = optimizer_cfg.pop('batch_size_per_gpu') + # No scaling if total_batch_size is less than + # base_total_batch_size, otherwise linear scaling. 
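+            # Worked example (illustrative numbers, assuming the default
+            # ``base_total_batch_size`` of 64): with 8 GPUs and
+            # ``batch_size_per_gpu=16``, total_batch_size = 128,
+            # accumulate = max(round(64 / 128), 1) = 1 and
+            # scale_factor = 128 * 1 / 64 = 2, so weight_decay is doubled.
+            # With 1 GPU and ``batch_size_per_gpu=16``, total_batch_size = 16,
+            # accumulate = 4 and scale_factor = 16 * 4 / 64 = 1, so
+            # weight_decay stays unchanged.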
+ total_batch_size = get_world_size() * batch_size_per_gpu + accumulate = max( + round(self.base_total_batch_size / total_batch_size), 1) + scale_factor = total_batch_size * \ + accumulate / self.base_total_batch_size + + if scale_factor != 1: + weight_decay *= scale_factor + print_log(f'Scaled weight_decay to {weight_decay}', 'current') + + params_groups = [], [], [] + + for v in model.modules(): + if hasattr(v, 'bias') and isinstance(v.bias, nn.Parameter): + params_groups[2].append(v.bias) + # Includes SyncBatchNorm + if isinstance(v, nn.modules.batchnorm._NormBase): + params_groups[1].append(v.weight) + elif hasattr(v, 'weight') and isinstance(v.weight, nn.Parameter): + params_groups[0].append(v.weight) + + # Note: Make sure bias is in the last parameter group + optimizer_cfg['params'] = [] + # conv + optimizer_cfg['params'].append({ + 'params': params_groups[0], + 'weight_decay': weight_decay + }) + # bn + optimizer_cfg['params'].append({'params': params_groups[1]}) + # bias + optimizer_cfg['params'].append({'params': params_groups[2]}) + + print_log( + 'Optimizer groups: %g .bias, %g conv.weight, %g other' % + (len(params_groups[2]), len(params_groups[0]), len( + params_groups[1])), 'current') + del params_groups + + optimizer = OPTIMIZERS.build(optimizer_cfg) + optim_wrapper = OPTIM_WRAPPERS.build( + self.optim_wrapper_cfg, default_args=dict(optimizer=optimizer)) + return optim_wrapper diff --git a/mmyolo/engine/optimizers/yolov7_optim_wrapper_constructor.py b/mmyolo/engine/optimizers/yolov7_optim_wrapper_constructor.py new file mode 100644 index 0000000000000000000000000000000000000000..79ea8b69976760c0e45e35f8420d0cc69b13331a --- /dev/null +++ b/mmyolo/engine/optimizers/yolov7_optim_wrapper_constructor.py @@ -0,0 +1,139 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional + +import torch.nn as nn +from mmengine.dist import get_world_size +from mmengine.logging import print_log +from mmengine.model import is_model_wrapper +from mmengine.optim import OptimWrapper + +from mmyolo.models.dense_heads.yolov7_head import ImplicitA, ImplicitM +from mmyolo.registry import (OPTIM_WRAPPER_CONSTRUCTORS, OPTIM_WRAPPERS, + OPTIMIZERS) + + +# TODO: Consider merging into YOLOv5OptimizerConstructor +@OPTIM_WRAPPER_CONSTRUCTORS.register_module() +class YOLOv7OptimWrapperConstructor: + """YOLOv7 constructor for optimizer wrappers. + + It has the following functions: + + - divides the optimizer parameters into 3 groups: + Conv, Bias and BN/ImplicitA/ImplicitM + + - support `weight_decay` parameter adaption based on + `batch_size_per_gpu` + + Args: + optim_wrapper_cfg (dict): The config dict of the optimizer wrapper. + Positional fields are + + - ``type``: class name of the OptimizerWrapper + - ``optimizer``: The configuration of optimizer. + + Optional fields are + + - any arguments of the corresponding optimizer wrapper type, + e.g., accumulative_counts, clip_grad, etc. + + The positional fields of ``optimizer`` are + + - `type`: class name of the optimizer. + + Optional fields are + + - any arguments of the corresponding optimizer type, e.g., + lr, weight_decay, momentum, etc. + + paramwise_cfg (dict, optional): Parameter-wise options. Must include + `base_total_batch_size` if not None. If the total input batch + is smaller than `base_total_batch_size`, the `weight_decay` + parameter will be kept unchanged, otherwise linear scaling. 
+ + Example: + >>> model = torch.nn.modules.Conv1d(1, 1, 1) + >>> optim_wrapper_cfg = dict( + >>> dict(type='OptimWrapper', optimizer=dict(type='SGD', lr=0.01, + >>> momentum=0.9, weight_decay=0.0001, batch_size_per_gpu=16)) + >>> paramwise_cfg = dict(base_total_batch_size=64) + >>> optim_wrapper_builder = YOLOv7OptimWrapperConstructor( + >>> optim_wrapper_cfg, paramwise_cfg) + >>> optim_wrapper = optim_wrapper_builder(model) + """ + + def __init__(self, + optim_wrapper_cfg: dict, + paramwise_cfg: Optional[dict] = None): + if paramwise_cfg is None: + paramwise_cfg = {'base_total_batch_size': 64} + assert 'base_total_batch_size' in paramwise_cfg + + if not isinstance(optim_wrapper_cfg, dict): + raise TypeError('optimizer_cfg should be a dict', + f'but got {type(optim_wrapper_cfg)}') + assert 'optimizer' in optim_wrapper_cfg, ( + '`optim_wrapper_cfg` must contain "optimizer" config') + + self.optim_wrapper_cfg = optim_wrapper_cfg + self.optimizer_cfg = self.optim_wrapper_cfg.pop('optimizer') + self.base_total_batch_size = paramwise_cfg['base_total_batch_size'] + + def __call__(self, model: nn.Module) -> OptimWrapper: + if is_model_wrapper(model): + model = model.module + optimizer_cfg = self.optimizer_cfg.copy() + weight_decay = optimizer_cfg.pop('weight_decay', 0) + + if 'batch_size_per_gpu' in optimizer_cfg: + batch_size_per_gpu = optimizer_cfg.pop('batch_size_per_gpu') + # No scaling if total_batch_size is less than + # base_total_batch_size, otherwise linear scaling. + total_batch_size = get_world_size() * batch_size_per_gpu + accumulate = max( + round(self.base_total_batch_size / total_batch_size), 1) + scale_factor = total_batch_size * \ + accumulate / self.base_total_batch_size + + if scale_factor != 1: + weight_decay *= scale_factor + print_log(f'Scaled weight_decay to {weight_decay}', 'current') + + params_groups = [], [], [] + for v in model.modules(): + # no decay + # Caution: Coupling with model + if isinstance(v, (ImplicitA, ImplicitM)): + params_groups[0].append(v.implicit) + elif isinstance(v, nn.modules.batchnorm._NormBase): + params_groups[0].append(v.weight) + # apply decay + elif hasattr(v, 'weight') and isinstance(v.weight, nn.Parameter): + params_groups[1].append(v.weight) # apply decay + + # biases, no decay + if hasattr(v, 'bias') and isinstance(v.bias, nn.Parameter): + params_groups[2].append(v.bias) + + # Note: Make sure bias is in the last parameter group + optimizer_cfg['params'] = [] + # conv + optimizer_cfg['params'].append({ + 'params': params_groups[1], + 'weight_decay': weight_decay + }) + # bn ... + optimizer_cfg['params'].append({'params': params_groups[0]}) + # bias + optimizer_cfg['params'].append({'params': params_groups[2]}) + + print_log( + 'Optimizer groups: %g .bias, %g conv.weight, %g other' % + (len(params_groups[2]), len(params_groups[1]), len( + params_groups[0])), 'current') + del params_groups + + optimizer = OPTIMIZERS.build(optimizer_cfg) + optim_wrapper = OPTIM_WRAPPERS.build( + self.optim_wrapper_cfg, default_args=dict(optimizer=optimizer)) + return optim_wrapper diff --git a/mmyolo/models/__init__.py b/mmyolo/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..51c37f0436f131dcd26b9a8115e58fe49d59207e --- /dev/null +++ b/mmyolo/models/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .backbones import * # noqa: F401,F403 +from .data_preprocessors import * # noqa: F401,F403 +from .dense_heads import * # noqa: F401,F403 +from .detectors import * # noqa: F401,F403 +from .layers import * # noqa: F401,F403 +from .losses import * # noqa: F401,F403 +from .necks import * # noqa: F401,F403 +from .plugins import * # noqa: F401,F403 +from .task_modules import * # noqa: F401,F403 diff --git a/mmyolo/models/backbones/__init__.py b/mmyolo/models/backbones/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..48c8e28b1e7eb97e3f7cb064c75af0dc79b4cc8d --- /dev/null +++ b/mmyolo/models/backbones/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .base_backbone import BaseBackbone +from .csp_darknet import YOLOv5CSPDarknet, YOLOv8CSPDarknet, YOLOXCSPDarknet +from .csp_resnet import PPYOLOECSPResNet +from .cspnext import CSPNeXt +from .efficient_rep import YOLOv6CSPBep, YOLOv6EfficientRep +from .yolov7_backbone import YOLOv7Backbone + +__all__ = [ + 'YOLOv5CSPDarknet', 'BaseBackbone', 'YOLOv6EfficientRep', 'YOLOv6CSPBep', + 'YOLOXCSPDarknet', 'CSPNeXt', 'YOLOv7Backbone', 'PPYOLOECSPResNet', + 'YOLOv8CSPDarknet' +] diff --git a/mmyolo/models/backbones/base_backbone.py b/mmyolo/models/backbones/base_backbone.py new file mode 100644 index 0000000000000000000000000000000000000000..730c7095eccf66b0d563fad96122454c98dff0ac --- /dev/null +++ b/mmyolo/models/backbones/base_backbone.py @@ -0,0 +1,225 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta, abstractmethod +from typing import List, Sequence, Union + +import torch +import torch.nn as nn +from mmcv.cnn import build_plugin_layer +from mmdet.utils import ConfigType, OptMultiConfig +from mmengine.model import BaseModule +from torch.nn.modules.batchnorm import _BatchNorm + +from mmyolo.registry import MODELS + + +@MODELS.register_module() +class BaseBackbone(BaseModule, metaclass=ABCMeta): + """BaseBackbone backbone used in YOLO series. + + .. code:: text + + Backbone model structure diagram + +-----------+ + | input | + +-----------+ + v + +-----------+ + | stem | + | layer | + +-----------+ + v + +-----------+ + | stage | + | layer 1 | + +-----------+ + v + +-----------+ + | stage | + | layer 2 | + +-----------+ + v + ...... + v + +-----------+ + | stage | + | layer n | + +-----------+ + In P5 model, n=4 + In P6 model, n=5 + + Args: + arch_setting (list): Architecture of BaseBackbone. + plugins (list[dict]): List of plugins for stages, each dict contains: + + - cfg (dict, required): Cfg dict to build plugin. + - stages (tuple[bool], optional): Stages to apply plugin, length + should be same as 'num_stages'. + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + input_channels: Number of input image channels. Defaults to 3. + out_indices (Sequence[int]): Output from which stages. + Defaults to (2, 3, 4). + frozen_stages (int): Stages to be frozen (stop grad and set eval + mode). -1 means not freezing any parameters. Defaults to -1. + norm_cfg (dict): Dictionary to construct and config norm layer. + Defaults to None. + act_cfg (dict): Config dict for activation layer. + Defaults to None. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Defaults to False. 
+ init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + arch_setting: list, + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + input_channels: int = 3, + out_indices: Sequence[int] = (2, 3, 4), + frozen_stages: int = -1, + plugins: Union[dict, List[dict]] = None, + norm_cfg: ConfigType = None, + act_cfg: ConfigType = None, + norm_eval: bool = False, + init_cfg: OptMultiConfig = None): + super().__init__(init_cfg) + self.num_stages = len(arch_setting) + self.arch_setting = arch_setting + + assert set(out_indices).issubset( + i for i in range(len(arch_setting) + 1)) + + if frozen_stages not in range(-1, len(arch_setting) + 1): + raise ValueError('"frozen_stages" must be in range(-1, ' + 'len(arch_setting) + 1). But received ' + f'{frozen_stages}') + + self.input_channels = input_channels + self.out_indices = out_indices + self.frozen_stages = frozen_stages + self.widen_factor = widen_factor + self.deepen_factor = deepen_factor + self.norm_eval = norm_eval + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.plugins = plugins + + self.stem = self.build_stem_layer() + self.layers = ['stem'] + + for idx, setting in enumerate(arch_setting): + stage = [] + stage += self.build_stage_layer(idx, setting) + if plugins is not None: + stage += self.make_stage_plugins(plugins, idx, setting) + self.add_module(f'stage{idx + 1}', nn.Sequential(*stage)) + self.layers.append(f'stage{idx + 1}') + + @abstractmethod + def build_stem_layer(self): + """Build a stem layer.""" + pass + + @abstractmethod + def build_stage_layer(self, stage_idx: int, setting: list): + """Build a stage layer. + + Args: + stage_idx (int): The index of a stage layer. + setting (list): The architecture setting of a stage layer. + """ + pass + + def make_stage_plugins(self, plugins, stage_idx, setting): + """Make plugins for backbone ``stage_idx`` th stage. + + Currently we support to insert ``context_block``, + ``empirical_attention_block``, ``nonlocal_block``, ``dropout_block`` + into the backbone. + + + An example of plugins format could be: + + Examples: + >>> plugins=[ + ... dict(cfg=dict(type='xxx', arg1='xxx'), + ... stages=(False, True, True, True)), + ... dict(cfg=dict(type='yyy'), + ... stages=(True, True, True, True)), + ... ] + >>> model = YOLOv5CSPDarknet() + >>> stage_plugins = model.make_stage_plugins(plugins, 0, setting) + >>> assert len(stage_plugins) == 1 + + Suppose ``stage_idx=0``, the structure of blocks in the stage would be: + + .. code-block:: none + + conv1 -> conv2 -> conv3 -> yyy + + Suppose ``stage_idx=1``, the structure of blocks in the stage would be: + + .. code-block:: none + + conv1 -> conv2 -> conv3 -> xxx -> yyy + + + Args: + plugins (list[dict]): List of plugins cfg to build. The postfix is + required if multiple same type plugins are inserted. + stage_idx (int): Index of stage to build + If stages is missing, the plugin would be applied to all + stages. + setting (list): The architecture setting of a stage layer. 
+ + Returns: + list[nn.Module]: Plugins for current stage + """ + # TODO: It is not general enough to support any channel and needs + # to be refactored + in_channels = int(setting[1] * self.widen_factor) + plugin_layers = [] + for plugin in plugins: + plugin = plugin.copy() + stages = plugin.pop('stages', None) + assert stages is None or len(stages) == self.num_stages + if stages is None or stages[stage_idx]: + name, layer = build_plugin_layer( + plugin['cfg'], in_channels=in_channels) + plugin_layers.append(layer) + return plugin_layers + + def _freeze_stages(self): + """Freeze the parameters of the specified stage so that they are no + longer updated.""" + if self.frozen_stages >= 0: + for i in range(self.frozen_stages + 1): + m = getattr(self, self.layers[i]) + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def train(self, mode: bool = True): + """Convert the model into training mode while keep normalization layer + frozen.""" + super().train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() + + def forward(self, x: torch.Tensor) -> tuple: + """Forward batch_inputs from the data_preprocessor.""" + outs = [] + for i, layer_name in enumerate(self.layers): + layer = getattr(self, layer_name) + x = layer(x) + if i in self.out_indices: + outs.append(x) + + return tuple(outs) diff --git a/mmyolo/models/backbones/csp_darknet.py b/mmyolo/models/backbones/csp_darknet.py new file mode 100644 index 0000000000000000000000000000000000000000..92bd69a5a9378a37ed8fb50c52dfba0de6879083 --- /dev/null +++ b/mmyolo/models/backbones/csp_darknet.py @@ -0,0 +1,427 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple, Union + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule +from mmdet.models.backbones.csp_darknet import CSPLayer, Focus +from mmdet.utils import ConfigType, OptMultiConfig + +from mmyolo.registry import MODELS +from ..layers import CSPLayerWithTwoConv, SPPFBottleneck +from ..utils import make_divisible, make_round +from .base_backbone import BaseBackbone + + +@MODELS.register_module() +class YOLOv5CSPDarknet(BaseBackbone): + """CSP-Darknet backbone used in YOLOv5. + Args: + arch (str): Architecture of CSP-Darknet, from {P5, P6}. + Defaults to P5. + plugins (list[dict]): List of plugins for stages, each dict contains: + - cfg (dict, required): Cfg dict to build plugin. + - stages (tuple[bool], optional): Stages to apply plugin, length + should be same as 'num_stages'. + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + input_channels (int): Number of input image channels. Defaults to: 3. + out_indices (Tuple[int]): Output from which stages. + Defaults to (2, 3, 4). + frozen_stages (int): Stages to be frozen (stop grad and set eval + mode). -1 means not freezing any parameters. Defaults to -1. + norm_cfg (dict): Dictionary to construct and config norm layer. + Defaults to dict(type='BN', requires_grad=True). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Defaults to False. 
+ init_cfg (Union[dict,list[dict]], optional): Initialization config + dict. Defaults to None. + Example: + >>> from mmyolo.models import YOLOv5CSPDarknet + >>> import torch + >>> model = YOLOv5CSPDarknet() + >>> model.eval() + >>> inputs = torch.rand(1, 3, 416, 416) + >>> level_outputs = model(inputs) + >>> for level_out in level_outputs: + ... print(tuple(level_out.shape)) + ... + (1, 256, 52, 52) + (1, 512, 26, 26) + (1, 1024, 13, 13) + """ + # From left to right: + # in_channels, out_channels, num_blocks, add_identity, use_spp + arch_settings = { + 'P5': [[64, 128, 3, True, False], [128, 256, 6, True, False], + [256, 512, 9, True, False], [512, 1024, 3, True, True]], + 'P6': [[64, 128, 3, True, False], [128, 256, 6, True, False], + [256, 512, 9, True, False], [512, 768, 3, True, False], + [768, 1024, 3, True, True]] + } + + def __init__(self, + arch: str = 'P5', + plugins: Union[dict, List[dict]] = None, + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + input_channels: int = 3, + out_indices: Tuple[int] = (2, 3, 4), + frozen_stages: int = -1, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + norm_eval: bool = False, + init_cfg: OptMultiConfig = None): + super().__init__( + self.arch_settings[arch], + deepen_factor, + widen_factor, + input_channels=input_channels, + out_indices=out_indices, + plugins=plugins, + frozen_stages=frozen_stages, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + norm_eval=norm_eval, + init_cfg=init_cfg) + + def build_stem_layer(self) -> nn.Module: + """Build a stem layer.""" + return ConvModule( + self.input_channels, + make_divisible(self.arch_setting[0][0], self.widen_factor), + kernel_size=6, + stride=2, + padding=2, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def build_stage_layer(self, stage_idx: int, setting: list) -> list: + """Build a stage layer. + + Args: + stage_idx (int): The index of a stage layer. + setting (list): The architecture setting of a stage layer. + """ + in_channels, out_channels, num_blocks, add_identity, use_spp = setting + + in_channels = make_divisible(in_channels, self.widen_factor) + out_channels = make_divisible(out_channels, self.widen_factor) + num_blocks = make_round(num_blocks, self.deepen_factor) + stage = [] + conv_layer = ConvModule( + in_channels, + out_channels, + kernel_size=3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + stage.append(conv_layer) + csp_layer = CSPLayer( + out_channels, + out_channels, + num_blocks=num_blocks, + add_identity=add_identity, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + stage.append(csp_layer) + if use_spp: + spp = SPPFBottleneck( + out_channels, + out_channels, + kernel_sizes=5, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + stage.append(spp) + return stage + + def init_weights(self): + """Initialize the parameters.""" + if self.init_cfg is None: + for m in self.modules(): + if isinstance(m, torch.nn.Conv2d): + # In order to be consistent with the source code, + # reset the Conv2d initialization parameters + m.reset_parameters() + else: + super().init_weights() + + +@MODELS.register_module() +class YOLOv8CSPDarknet(BaseBackbone): + """CSP-Darknet backbone used in YOLOv8. + + Args: + arch (str): Architecture of CSP-Darknet, from {P5}. + Defaults to P5. + last_stage_out_channels (int): Final layer output channel. + Defaults to 1024. + plugins (list[dict]): List of plugins for stages, each dict contains: + - cfg (dict, required): Cfg dict to build plugin. 
+ - stages (tuple[bool], optional): Stages to apply plugin, length + should be same as 'num_stages'. + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + input_channels (int): Number of input image channels. Defaults to: 3. + out_indices (Tuple[int]): Output from which stages. + Defaults to (2, 3, 4). + frozen_stages (int): Stages to be frozen (stop grad and set eval + mode). -1 means not freezing any parameters. Defaults to -1. + norm_cfg (dict): Dictionary to construct and config norm layer. + Defaults to dict(type='BN', requires_grad=True). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Defaults to False. + init_cfg (Union[dict,list[dict]], optional): Initialization config + dict. Defaults to None. + + Example: + >>> from mmyolo.models import YOLOv8CSPDarknet + >>> import torch + >>> model = YOLOv8CSPDarknet() + >>> model.eval() + >>> inputs = torch.rand(1, 3, 416, 416) + >>> level_outputs = model(inputs) + >>> for level_out in level_outputs: + ... print(tuple(level_out.shape)) + ... + (1, 256, 52, 52) + (1, 512, 26, 26) + (1, 1024, 13, 13) + """ + # From left to right: + # in_channels, out_channels, num_blocks, add_identity, use_spp + # the final out_channels will be set according to the param. + arch_settings = { + 'P5': [[64, 128, 3, True, False], [128, 256, 6, True, False], + [256, 512, 6, True, False], [512, None, 3, True, True]], + } + + def __init__(self, + arch: str = 'P5', + last_stage_out_channels: int = 1024, + plugins: Union[dict, List[dict]] = None, + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + input_channels: int = 3, + out_indices: Tuple[int] = (2, 3, 4), + frozen_stages: int = -1, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + norm_eval: bool = False, + init_cfg: OptMultiConfig = None): + self.arch_settings[arch][-1][1] = last_stage_out_channels + super().__init__( + self.arch_settings[arch], + deepen_factor, + widen_factor, + input_channels=input_channels, + out_indices=out_indices, + plugins=plugins, + frozen_stages=frozen_stages, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + norm_eval=norm_eval, + init_cfg=init_cfg) + + def build_stem_layer(self) -> nn.Module: + """Build a stem layer.""" + return ConvModule( + self.input_channels, + make_divisible(self.arch_setting[0][0], self.widen_factor), + kernel_size=3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def build_stage_layer(self, stage_idx: int, setting: list) -> list: + """Build a stage layer. + + Args: + stage_idx (int): The index of a stage layer. + setting (list): The architecture setting of a stage layer. 
+ """ + in_channels, out_channels, num_blocks, add_identity, use_spp = setting + + in_channels = make_divisible(in_channels, self.widen_factor) + out_channels = make_divisible(out_channels, self.widen_factor) + num_blocks = make_round(num_blocks, self.deepen_factor) + stage = [] + conv_layer = ConvModule( + in_channels, + out_channels, + kernel_size=3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + stage.append(conv_layer) + csp_layer = CSPLayerWithTwoConv( + out_channels, + out_channels, + num_blocks=num_blocks, + add_identity=add_identity, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + stage.append(csp_layer) + if use_spp: + spp = SPPFBottleneck( + out_channels, + out_channels, + kernel_sizes=5, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + stage.append(spp) + return stage + + def init_weights(self): + """Initialize the parameters.""" + if self.init_cfg is None: + for m in self.modules(): + if isinstance(m, torch.nn.Conv2d): + # In order to be consistent with the source code, + # reset the Conv2d initialization parameters + m.reset_parameters() + else: + super().init_weights() + + +@MODELS.register_module() +class YOLOXCSPDarknet(BaseBackbone): + """CSP-Darknet backbone used in YOLOX. + + Args: + arch (str): Architecture of CSP-Darknet, from {P5, P6}. + Defaults to P5. + plugins (list[dict]): List of plugins for stages, each dict contains: + + - cfg (dict, required): Cfg dict to build plugin. + - stages (tuple[bool], optional): Stages to apply plugin, length + should be same as 'num_stages'. + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + input_channels (int): Number of input image channels. Defaults to 3. + out_indices (Tuple[int]): Output from which stages. + Defaults to (2, 3, 4). + frozen_stages (int): Stages to be frozen (stop grad and set eval + mode). -1 means not freezing any parameters. Defaults to -1. + use_depthwise (bool): Whether to use depthwise separable convolution. + Defaults to False. + spp_kernal_sizes: (tuple[int]): Sequential of kernel sizes of SPP + layers. Defaults to (5, 9, 13). + norm_cfg (dict): Dictionary to construct and config norm layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. + init_cfg (Union[dict,list[dict]], optional): Initialization config + dict. Defaults to None. + Example: + >>> from mmyolo.models import YOLOXCSPDarknet + >>> import torch + >>> model = YOLOXCSPDarknet() + >>> model.eval() + >>> inputs = torch.rand(1, 3, 416, 416) + >>> level_outputs = model(inputs) + >>> for level_out in level_outputs: + ... print(tuple(level_out.shape)) + ... 
+ (1, 256, 52, 52) + (1, 512, 26, 26) + (1, 1024, 13, 13) + """ + # From left to right: + # in_channels, out_channels, num_blocks, add_identity, use_spp + arch_settings = { + 'P5': [[64, 128, 3, True, False], [128, 256, 9, True, False], + [256, 512, 9, True, False], [512, 1024, 3, False, True]], + } + + def __init__(self, + arch: str = 'P5', + plugins: Union[dict, List[dict]] = None, + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + input_channels: int = 3, + out_indices: Tuple[int] = (2, 3, 4), + frozen_stages: int = -1, + use_depthwise: bool = False, + spp_kernal_sizes: Tuple[int] = (5, 9, 13), + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + norm_eval: bool = False, + init_cfg: OptMultiConfig = None): + self.use_depthwise = use_depthwise + self.spp_kernal_sizes = spp_kernal_sizes + super().__init__(self.arch_settings[arch], deepen_factor, widen_factor, + input_channels, out_indices, frozen_stages, plugins, + norm_cfg, act_cfg, norm_eval, init_cfg) + + def build_stem_layer(self) -> nn.Module: + """Build a stem layer.""" + return Focus( + 3, + make_divisible(64, self.widen_factor), + kernel_size=3, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def build_stage_layer(self, stage_idx: int, setting: list) -> list: + """Build a stage layer. + + Args: + stage_idx (int): The index of a stage layer. + setting (list): The architecture setting of a stage layer. + """ + in_channels, out_channels, num_blocks, add_identity, use_spp = setting + + in_channels = make_divisible(in_channels, self.widen_factor) + out_channels = make_divisible(out_channels, self.widen_factor) + num_blocks = make_round(num_blocks, self.deepen_factor) + stage = [] + conv = DepthwiseSeparableConvModule \ + if self.use_depthwise else ConvModule + conv_layer = conv( + in_channels, + out_channels, + kernel_size=3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + stage.append(conv_layer) + if use_spp: + spp = SPPFBottleneck( + out_channels, + out_channels, + kernel_sizes=self.spp_kernal_sizes, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + stage.append(spp) + csp_layer = CSPLayer( + out_channels, + out_channels, + num_blocks=num_blocks, + add_identity=add_identity, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + stage.append(csp_layer) + return stage diff --git a/mmyolo/models/backbones/csp_resnet.py b/mmyolo/models/backbones/csp_resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..a42ed489d8872913f4aacce08497c8e48fdace49 --- /dev/null +++ b/mmyolo/models/backbones/csp_resnet.py @@ -0,0 +1,169 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple, Union + +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmdet.utils import ConfigType, OptMultiConfig + +from mmyolo.models.backbones import BaseBackbone +from mmyolo.models.layers.yolo_bricks import CSPResLayer +from mmyolo.registry import MODELS + + +@MODELS.register_module() +class PPYOLOECSPResNet(BaseBackbone): + """CSP-ResNet backbone used in PPYOLOE. + + Args: + arch (str): Architecture of CSPNeXt, from {P5, P6}. + Defaults to P5. + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + out_indices (Sequence[int]): Output from which stages. + Defaults to (2, 3, 4). 
+ frozen_stages (int): Stages to be frozen (stop grad and set eval + mode). -1 means not freezing any parameters. Defaults to -1. + plugins (list[dict]): List of plugins for stages, each dict contains: + - cfg (dict, required): Cfg dict to build plugin. + - stages (tuple[bool], optional): Stages to apply plugin, length + should be same as 'num_stages'. + arch_ovewrite (list): Overwrite default arch settings. + Defaults to None. + block_cfg (dict): Config dict for block. Defaults to + dict(type='PPYOLOEBasicBlock', shortcut=True, use_alpha=True) + norm_cfg (:obj:`ConfigDict` or dict): Dictionary to construct and + config norm layer. Defaults to dict(type='BN', momentum=0.1, + eps=1e-5). + act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + attention_cfg (dict): Config dict for `EffectiveSELayer`. + Defaults to dict(type='EffectiveSELayer', + act_cfg=dict(type='HSigmoid')). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. + init_cfg (:obj:`ConfigDict` or dict or list[dict] or + list[:obj:`ConfigDict`]): Initialization config dict. + use_large_stem (bool): Whether to use large stem layer. + Defaults to False. + """ + # From left to right: + # in_channels, out_channels, num_blocks + arch_settings = { + 'P5': [[64, 128, 3], [128, 256, 6], [256, 512, 6], [512, 1024, 3]] + } + + def __init__(self, + arch: str = 'P5', + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + input_channels: int = 3, + out_indices: Tuple[int] = (2, 3, 4), + frozen_stages: int = -1, + plugins: Union[dict, List[dict]] = None, + arch_ovewrite: dict = None, + block_cfg: ConfigType = dict( + type='PPYOLOEBasicBlock', shortcut=True, use_alpha=True), + norm_cfg: ConfigType = dict( + type='BN', momentum=0.1, eps=1e-5), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + attention_cfg: ConfigType = dict( + type='EffectiveSELayer', act_cfg=dict(type='HSigmoid')), + norm_eval: bool = False, + init_cfg: OptMultiConfig = None, + use_large_stem: bool = False): + arch_setting = self.arch_settings[arch] + if arch_ovewrite: + arch_setting = arch_ovewrite + arch_setting = [[ + int(in_channels * widen_factor), + int(out_channels * widen_factor), + round(num_blocks * deepen_factor) + ] for in_channels, out_channels, num_blocks in arch_setting] + self.block_cfg = block_cfg + self.use_large_stem = use_large_stem + self.attention_cfg = attention_cfg + + super().__init__( + arch_setting, + deepen_factor, + widen_factor, + input_channels=input_channels, + out_indices=out_indices, + plugins=plugins, + frozen_stages=frozen_stages, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + norm_eval=norm_eval, + init_cfg=init_cfg) + + def build_stem_layer(self) -> nn.Module: + """Build a stem layer.""" + if self.use_large_stem: + stem = nn.Sequential( + ConvModule( + self.input_channels, + self.arch_setting[0][0] // 2, + 3, + stride=2, + padding=1, + act_cfg=self.act_cfg, + norm_cfg=self.norm_cfg), + ConvModule( + self.arch_setting[0][0] // 2, + self.arch_setting[0][0] // 2, + 3, + stride=1, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + ConvModule( + self.arch_setting[0][0] // 2, + self.arch_setting[0][0], + 3, + stride=1, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + else: + stem = nn.Sequential( + ConvModule( + self.input_channels, + self.arch_setting[0][0] // 2, + 3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg, + 
act_cfg=self.act_cfg), + ConvModule( + self.arch_setting[0][0] // 2, + self.arch_setting[0][0], + 3, + stride=1, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + return stem + + def build_stage_layer(self, stage_idx: int, setting: list) -> list: + """Build a stage layer. + + Args: + stage_idx (int): The index of a stage layer. + setting (list): The architecture setting of a stage layer. + """ + in_channels, out_channels, num_blocks = setting + + cspres_layer = CSPResLayer( + in_channels=in_channels, + out_channels=out_channels, + num_block=num_blocks, + block_cfg=self.block_cfg, + stride=2, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + attention_cfg=self.attention_cfg, + use_spp=False) + return [cspres_layer] diff --git a/mmyolo/models/backbones/cspnext.py b/mmyolo/models/backbones/cspnext.py new file mode 100644 index 0000000000000000000000000000000000000000..adca9dd9d11baecefda90a99a4188e78c2ca8188 --- /dev/null +++ b/mmyolo/models/backbones/cspnext.py @@ -0,0 +1,187 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from typing import List, Sequence, Union + +import torch.nn as nn +from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule +from mmdet.models.backbones.csp_darknet import CSPLayer +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig + +from mmyolo.registry import MODELS +from ..layers import SPPFBottleneck +from .base_backbone import BaseBackbone + + +@MODELS.register_module() +class CSPNeXt(BaseBackbone): + """CSPNeXt backbone used in RTMDet. + + Args: + arch (str): Architecture of CSPNeXt, from {P5, P6}. + Defaults to P5. + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + out_indices (Sequence[int]): Output from which stages. + Defaults to (2, 3, 4). + frozen_stages (int): Stages to be frozen (stop grad and set eval + mode). -1 means not freezing any parameters. Defaults to -1. + plugins (list[dict]): List of plugins for stages, each dict contains: + - cfg (dict, required): Cfg dict to build plugin.Defaults to + - stages (tuple[bool], optional): Stages to apply plugin, length + should be same as 'num_stages'. + use_depthwise (bool): Whether to use depthwise separable convolution. + Defaults to False. + expand_ratio (float): Ratio to adjust the number of channels of the + hidden layer. Defaults to 0.5. + arch_ovewrite (list): Overwrite default arch settings. + Defaults to None. + channel_attention (bool): Whether to add channel attention in each + stage. Defaults to True. + conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + convolution layer. Defaults to None. + norm_cfg (:obj:`ConfigDict` or dict): Dictionary to construct and + config norm layer. Defaults to dict(type='BN', requires_grad=True). + act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. + init_cfg (:obj:`ConfigDict` or dict or list[dict] or + list[:obj:`ConfigDict`]): Initialization config dict. 
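+
+    Example:
+        A usage sketch mirroring the other backbones in this repo; the
+        printed shapes assume the default ``P5`` setting with deepen and
+        widen factors of 1.0:
+
+        >>> from mmyolo.models import CSPNeXt
+        >>> import torch
+        >>> model = CSPNeXt()
+        >>> model.eval()
+        >>> inputs = torch.rand(1, 3, 416, 416)
+        >>> level_outputs = model(inputs)
+        >>> for level_out in level_outputs:
+        ...     print(tuple(level_out.shape))
+        ...
+        (1, 256, 52, 52)
+        (1, 512, 26, 26)
+        (1, 1024, 13, 13)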
+ """ + # From left to right: + # in_channels, out_channels, num_blocks, add_identity, use_spp + arch_settings = { + 'P5': [[64, 128, 3, True, False], [128, 256, 6, True, False], + [256, 512, 6, True, False], [512, 1024, 3, False, True]], + 'P6': [[64, 128, 3, True, False], [128, 256, 6, True, False], + [256, 512, 6, True, False], [512, 768, 3, True, False], + [768, 1024, 3, False, True]] + } + + def __init__( + self, + arch: str = 'P5', + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + input_channels: int = 3, + out_indices: Sequence[int] = (2, 3, 4), + frozen_stages: int = -1, + plugins: Union[dict, List[dict]] = None, + use_depthwise: bool = False, + expand_ratio: float = 0.5, + arch_ovewrite: dict = None, + channel_attention: bool = True, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict(type='BN'), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + norm_eval: bool = False, + init_cfg: OptMultiConfig = dict( + type='Kaiming', + layer='Conv2d', + a=math.sqrt(5), + distribution='uniform', + mode='fan_in', + nonlinearity='leaky_relu') + ) -> None: + arch_setting = self.arch_settings[arch] + if arch_ovewrite: + arch_setting = arch_ovewrite + self.channel_attention = channel_attention + self.use_depthwise = use_depthwise + self.conv = DepthwiseSeparableConvModule \ + if use_depthwise else ConvModule + self.expand_ratio = expand_ratio + self.conv_cfg = conv_cfg + + super().__init__( + arch_setting, + deepen_factor, + widen_factor, + input_channels, + out_indices, + frozen_stages=frozen_stages, + plugins=plugins, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + norm_eval=norm_eval, + init_cfg=init_cfg) + + def build_stem_layer(self) -> nn.Module: + """Build a stem layer.""" + stem = nn.Sequential( + ConvModule( + 3, + int(self.arch_setting[0][0] * self.widen_factor // 2), + 3, + padding=1, + stride=2, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + ConvModule( + int(self.arch_setting[0][0] * self.widen_factor // 2), + int(self.arch_setting[0][0] * self.widen_factor // 2), + 3, + padding=1, + stride=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + ConvModule( + int(self.arch_setting[0][0] * self.widen_factor // 2), + int(self.arch_setting[0][0] * self.widen_factor), + 3, + padding=1, + stride=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + return stem + + def build_stage_layer(self, stage_idx: int, setting: list) -> list: + """Build a stage layer. + + Args: + stage_idx (int): The index of a stage layer. + setting (list): The architecture setting of a stage layer. 
+ """ + in_channels, out_channels, num_blocks, add_identity, use_spp = setting + + in_channels = int(in_channels * self.widen_factor) + out_channels = int(out_channels * self.widen_factor) + num_blocks = max(round(num_blocks * self.deepen_factor), 1) + + stage = [] + conv_layer = self.conv( + in_channels, + out_channels, + 3, + stride=2, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + stage.append(conv_layer) + if use_spp: + spp = SPPFBottleneck( + out_channels, + out_channels, + kernel_sizes=5, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + stage.append(spp) + csp_layer = CSPLayer( + out_channels, + out_channels, + num_blocks=num_blocks, + add_identity=add_identity, + use_depthwise=self.use_depthwise, + use_cspnext_block=True, + expand_ratio=self.expand_ratio, + channel_attention=self.channel_attention, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + stage.append(csp_layer) + return stage diff --git a/mmyolo/models/backbones/efficient_rep.py b/mmyolo/models/backbones/efficient_rep.py new file mode 100644 index 0000000000000000000000000000000000000000..691c5b846a6453ff1dfbccb6785337f43e356bdc --- /dev/null +++ b/mmyolo/models/backbones/efficient_rep.py @@ -0,0 +1,287 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +from typing import List, Tuple, Union + +import torch +import torch.nn as nn +from mmdet.utils import ConfigType, OptMultiConfig + +from mmyolo.models.layers.yolo_bricks import SPPFBottleneck +from mmyolo.registry import MODELS +from ..layers import BepC3StageBlock, RepStageBlock +from ..utils import make_round +from .base_backbone import BaseBackbone + + +@MODELS.register_module() +class YOLOv6EfficientRep(BaseBackbone): + """EfficientRep backbone used in YOLOv6. + Args: + arch (str): Architecture of BaseDarknet, from {P5, P6}. + Defaults to P5. + plugins (list[dict]): List of plugins for stages, each dict contains: + - cfg (dict, required): Cfg dict to build plugin. + - stages (tuple[bool], optional): Stages to apply plugin, length + should be same as 'num_stages'. + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + input_channels (int): Number of input image channels. Defaults to 3. + out_indices (Tuple[int]): Output from which stages. + Defaults to (2, 3, 4). + frozen_stages (int): Stages to be frozen (stop grad and set eval + mode). -1 means not freezing any parameters. Defaults to -1. + norm_cfg (dict): Dictionary to construct and config norm layer. + Defaults to dict(type='BN', requires_grad=True). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='LeakyReLU', negative_slope=0.1). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Defaults to False. + block_cfg (dict): Config dict for the block used to build each + layer. Defaults to dict(type='RepVGGBlock'). + init_cfg (Union[dict, list[dict]], optional): Initialization config + dict. Defaults to None. + Example: + >>> from mmyolo.models import YOLOv6EfficientRep + >>> import torch + >>> model = YOLOv6EfficientRep() + >>> model.eval() + >>> inputs = torch.rand(1, 3, 416, 416) + >>> level_outputs = model(inputs) + >>> for level_out in level_outputs: + ... print(tuple(level_out.shape)) + ... 
+ (1, 256, 52, 52) + (1, 512, 26, 26) + (1, 1024, 13, 13) + """ + # From left to right: + # in_channels, out_channels, num_blocks, use_spp + arch_settings = { + 'P5': [[64, 128, 6, False], [128, 256, 12, False], + [256, 512, 18, False], [512, 1024, 6, True]] + } + + def __init__(self, + arch: str = 'P5', + plugins: Union[dict, List[dict]] = None, + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + input_channels: int = 3, + out_indices: Tuple[int] = (2, 3, 4), + frozen_stages: int = -1, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='ReLU', inplace=True), + norm_eval: bool = False, + block_cfg: ConfigType = dict(type='RepVGGBlock'), + init_cfg: OptMultiConfig = None): + self.block_cfg = block_cfg + super().__init__( + self.arch_settings[arch], + deepen_factor, + widen_factor, + input_channels=input_channels, + out_indices=out_indices, + plugins=plugins, + frozen_stages=frozen_stages, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + norm_eval=norm_eval, + init_cfg=init_cfg) + + def build_stem_layer(self) -> nn.Module: + """Build a stem layer.""" + + block_cfg = self.block_cfg.copy() + block_cfg.update( + dict( + in_channels=self.input_channels, + out_channels=int(self.arch_setting[0][0] * self.widen_factor), + kernel_size=3, + stride=2, + )) + return MODELS.build(block_cfg) + + def build_stage_layer(self, stage_idx: int, setting: list) -> list: + """Build a stage layer. + + Args: + stage_idx (int): The index of a stage layer. + setting (list): The architecture setting of a stage layer. + """ + in_channels, out_channels, num_blocks, use_spp = setting + + in_channels = int(in_channels * self.widen_factor) + out_channels = int(out_channels * self.widen_factor) + num_blocks = make_round(num_blocks, self.deepen_factor) + + rep_stage_block = RepStageBlock( + in_channels=out_channels, + out_channels=out_channels, + num_blocks=num_blocks, + block_cfg=self.block_cfg, + ) + + block_cfg = self.block_cfg.copy() + block_cfg.update( + dict( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=3, + stride=2)) + stage = [] + + ef_block = nn.Sequential(MODELS.build(block_cfg), rep_stage_block) + + stage.append(ef_block) + + if use_spp: + spp = SPPFBottleneck( + in_channels=out_channels, + out_channels=out_channels, + kernel_sizes=5, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + stage.append(spp) + return stage + + def init_weights(self): + if self.init_cfg is None: + """Initialize the parameters.""" + for m in self.modules(): + if isinstance(m, torch.nn.Conv2d): + # In order to be consistent with the source code, + # reset the Conv2d initialization parameters + m.reset_parameters() + else: + super().init_weights() + + +@MODELS.register_module() +class YOLOv6CSPBep(YOLOv6EfficientRep): + """CSPBep backbone used in YOLOv6. + Args: + arch (str): Architecture of BaseDarknet, from {P5, P6}. + Defaults to P5. + plugins (list[dict]): List of plugins for stages, each dict contains: + - cfg (dict, required): Cfg dict to build plugin. + - stages (tuple[bool], optional): Stages to apply plugin, length + should be same as 'num_stages'. + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + input_channels (int): Number of input image channels. Defaults to 3. + out_indices (Tuple[int]): Output from which stages. + Defaults to (2, 3, 4). 
+ frozen_stages (int): Stages to be frozen (stop grad and set eval + mode). -1 means not freezing any parameters. Defaults to -1. + norm_cfg (dict): Dictionary to construct and config norm layer. + Defaults to dict(type='BN', requires_grad=True). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='LeakyReLU', negative_slope=0.1). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Defaults to False. + block_cfg (dict): Config dict for the block used to build each + layer. Defaults to dict(type='RepVGGBlock'). + block_act_cfg (dict): Config dict for activation layer used in each + stage. Defaults to dict(type='SiLU', inplace=True). + init_cfg (Union[dict, list[dict]], optional): Initialization config + dict. Defaults to None. + Example: + >>> from mmyolo.models import YOLOv6CSPBep + >>> import torch + >>> model = YOLOv6CSPBep() + >>> model.eval() + >>> inputs = torch.rand(1, 3, 416, 416) + >>> level_outputs = model(inputs) + >>> for level_out in level_outputs: + ... print(tuple(level_out.shape)) + ... + (1, 256, 52, 52) + (1, 512, 26, 26) + (1, 1024, 13, 13) + """ + # From left to right: + # in_channels, out_channels, num_blocks, use_spp + arch_settings = { + 'P5': [[64, 128, 6, False], [128, 256, 12, False], + [256, 512, 18, False], [512, 1024, 6, True]] + } + + def __init__(self, + arch: str = 'P5', + plugins: Union[dict, List[dict]] = None, + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + input_channels: int = 3, + hidden_ratio: float = 0.5, + out_indices: Tuple[int] = (2, 3, 4), + frozen_stages: int = -1, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + norm_eval: bool = False, + block_cfg: ConfigType = dict(type='ConvWrapper'), + init_cfg: OptMultiConfig = None): + self.hidden_ratio = hidden_ratio + super().__init__( + arch=arch, + deepen_factor=deepen_factor, + widen_factor=widen_factor, + input_channels=input_channels, + out_indices=out_indices, + plugins=plugins, + frozen_stages=frozen_stages, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + norm_eval=norm_eval, + block_cfg=block_cfg, + init_cfg=init_cfg) + + def build_stage_layer(self, stage_idx: int, setting: list) -> list: + """Build a stage layer. + + Args: + stage_idx (int): The index of a stage layer. + setting (list): The architecture setting of a stage layer. 
+ """ + in_channels, out_channels, num_blocks, use_spp = setting + in_channels = int(in_channels * self.widen_factor) + out_channels = int(out_channels * self.widen_factor) + num_blocks = make_round(num_blocks, self.deepen_factor) + + rep_stage_block = BepC3StageBlock( + in_channels=out_channels, + out_channels=out_channels, + num_blocks=num_blocks, + hidden_ratio=self.hidden_ratio, + block_cfg=self.block_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + block_cfg = self.block_cfg.copy() + block_cfg.update( + dict( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=3, + stride=2)) + stage = [] + + ef_block = nn.Sequential(MODELS.build(block_cfg), rep_stage_block) + + stage.append(ef_block) + + if use_spp: + spp = SPPFBottleneck( + in_channels=out_channels, + out_channels=out_channels, + kernel_sizes=5, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + stage.append(spp) + return stage diff --git a/mmyolo/models/backbones/yolov7_backbone.py b/mmyolo/models/backbones/yolov7_backbone.py new file mode 100644 index 0000000000000000000000000000000000000000..bb9a5eed85ca1ee6884f7348ef3745a9ceaba032 --- /dev/null +++ b/mmyolo/models/backbones/yolov7_backbone.py @@ -0,0 +1,285 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Tuple, Union + +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmdet.models.backbones.csp_darknet import Focus +from mmdet.utils import ConfigType, OptMultiConfig + +from mmyolo.registry import MODELS +from ..layers import MaxPoolAndStrideConvBlock +from .base_backbone import BaseBackbone + + +@MODELS.register_module() +class YOLOv7Backbone(BaseBackbone): + """Backbone used in YOLOv7. + + Args: + arch (str): Architecture of YOLOv7Defaults to L. + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + out_indices (Sequence[int]): Output from which stages. + Defaults to (2, 3, 4). + frozen_stages (int): Stages to be frozen (stop grad and set eval + mode). -1 means not freezing any parameters. Defaults to -1. + plugins (list[dict]): List of plugins for stages, each dict contains: + + - cfg (dict, required): Cfg dict to build plugin. + - stages (tuple[bool], optional): Stages to apply plugin, length + should be same as 'num_stages'. + norm_cfg (:obj:`ConfigDict` or dict): Dictionary to construct and + config norm layer. Defaults to dict(type='BN', requires_grad=True). + act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. + init_cfg (:obj:`ConfigDict` or dict or list[dict] or + list[:obj:`ConfigDict`]): Initialization config dict. 
+ """ + _tiny_stage1_cfg = dict(type='TinyDownSampleBlock', middle_ratio=0.5) + _tiny_stage2_4_cfg = dict(type='TinyDownSampleBlock', middle_ratio=1.0) + _l_expand_channel_2x = dict( + type='ELANBlock', + middle_ratio=0.5, + block_ratio=0.5, + num_blocks=2, + num_convs_in_block=2) + _l_no_change_channel = dict( + type='ELANBlock', + middle_ratio=0.25, + block_ratio=0.25, + num_blocks=2, + num_convs_in_block=2) + _x_expand_channel_2x = dict( + type='ELANBlock', + middle_ratio=0.4, + block_ratio=0.4, + num_blocks=3, + num_convs_in_block=2) + _x_no_change_channel = dict( + type='ELANBlock', + middle_ratio=0.2, + block_ratio=0.2, + num_blocks=3, + num_convs_in_block=2) + _w_no_change_channel = dict( + type='ELANBlock', + middle_ratio=0.5, + block_ratio=0.5, + num_blocks=2, + num_convs_in_block=2) + _e_no_change_channel = dict( + type='ELANBlock', + middle_ratio=0.4, + block_ratio=0.4, + num_blocks=3, + num_convs_in_block=2) + _d_no_change_channel = dict( + type='ELANBlock', + middle_ratio=1 / 3, + block_ratio=1 / 3, + num_blocks=4, + num_convs_in_block=2) + _e2e_no_change_channel = dict( + type='EELANBlock', + num_elan_block=2, + middle_ratio=0.4, + block_ratio=0.4, + num_blocks=3, + num_convs_in_block=2) + + # From left to right: + # in_channels, out_channels, Block_params + arch_settings = { + 'Tiny': [[64, 64, _tiny_stage1_cfg], [64, 128, _tiny_stage2_4_cfg], + [128, 256, _tiny_stage2_4_cfg], + [256, 512, _tiny_stage2_4_cfg]], + 'L': [[64, 256, _l_expand_channel_2x], + [256, 512, _l_expand_channel_2x], + [512, 1024, _l_expand_channel_2x], + [1024, 1024, _l_no_change_channel]], + 'X': [[80, 320, _x_expand_channel_2x], + [320, 640, _x_expand_channel_2x], + [640, 1280, _x_expand_channel_2x], + [1280, 1280, _x_no_change_channel]], + 'W': + [[64, 128, _w_no_change_channel], [128, 256, _w_no_change_channel], + [256, 512, _w_no_change_channel], [512, 768, _w_no_change_channel], + [768, 1024, _w_no_change_channel]], + 'E': + [[80, 160, _e_no_change_channel], [160, 320, _e_no_change_channel], + [320, 640, _e_no_change_channel], [640, 960, _e_no_change_channel], + [960, 1280, _e_no_change_channel]], + 'D': [[96, 192, + _d_no_change_channel], [192, 384, _d_no_change_channel], + [384, 768, _d_no_change_channel], + [768, 1152, _d_no_change_channel], + [1152, 1536, _d_no_change_channel]], + 'E2E': [[80, 160, _e2e_no_change_channel], + [160, 320, _e2e_no_change_channel], + [320, 640, _e2e_no_change_channel], + [640, 960, _e2e_no_change_channel], + [960, 1280, _e2e_no_change_channel]], + } + + def __init__(self, + arch: str = 'L', + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + input_channels: int = 3, + out_indices: Tuple[int] = (2, 3, 4), + frozen_stages: int = -1, + plugins: Union[dict, List[dict]] = None, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + norm_eval: bool = False, + init_cfg: OptMultiConfig = None): + assert arch in self.arch_settings.keys() + self.arch = arch + super().__init__( + self.arch_settings[arch], + deepen_factor, + widen_factor, + input_channels=input_channels, + out_indices=out_indices, + plugins=plugins, + frozen_stages=frozen_stages, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + norm_eval=norm_eval, + init_cfg=init_cfg) + + def build_stem_layer(self) -> nn.Module: + """Build a stem layer.""" + if self.arch in ['L', 'X']: + stem = nn.Sequential( + ConvModule( + 3, + int(self.arch_setting[0][0] * self.widen_factor // 2), + 3, + padding=1, + stride=1, + norm_cfg=self.norm_cfg, + 
act_cfg=self.act_cfg), + ConvModule( + int(self.arch_setting[0][0] * self.widen_factor // 2), + int(self.arch_setting[0][0] * self.widen_factor), + 3, + padding=1, + stride=2, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + ConvModule( + int(self.arch_setting[0][0] * self.widen_factor), + int(self.arch_setting[0][0] * self.widen_factor), + 3, + padding=1, + stride=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + elif self.arch == 'Tiny': + stem = nn.Sequential( + ConvModule( + 3, + int(self.arch_setting[0][0] * self.widen_factor // 2), + 3, + padding=1, + stride=2, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + ConvModule( + int(self.arch_setting[0][0] * self.widen_factor // 2), + int(self.arch_setting[0][0] * self.widen_factor), + 3, + padding=1, + stride=2, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + elif self.arch in ['W', 'E', 'D', 'E2E']: + stem = Focus( + 3, + int(self.arch_setting[0][0] * self.widen_factor), + kernel_size=3, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + return stem + + def build_stage_layer(self, stage_idx: int, setting: list) -> list: + """Build a stage layer. + + Args: + stage_idx (int): The index of a stage layer. + setting (list): The architecture setting of a stage layer. + """ + in_channels, out_channels, stage_block_cfg = setting + in_channels = int(in_channels * self.widen_factor) + out_channels = int(out_channels * self.widen_factor) + + stage_block_cfg = stage_block_cfg.copy() + stage_block_cfg.setdefault('norm_cfg', self.norm_cfg) + stage_block_cfg.setdefault('act_cfg', self.act_cfg) + + stage_block_cfg['in_channels'] = in_channels + stage_block_cfg['out_channels'] = out_channels + + stage = [] + if self.arch in ['W', 'E', 'D', 'E2E']: + stage_block_cfg['in_channels'] = out_channels + elif self.arch in ['L', 'X']: + if stage_idx == 0: + stage_block_cfg['in_channels'] = out_channels // 2 + + downsample_layer = self._build_downsample_layer( + stage_idx, in_channels, out_channels) + stage.append(MODELS.build(stage_block_cfg)) + if downsample_layer is not None: + stage.insert(0, downsample_layer) + return stage + + def _build_downsample_layer(self, stage_idx: int, in_channels: int, + out_channels: int) -> Optional[nn.Module]: + """Build a downsample layer pre stage.""" + if self.arch in ['E', 'D', 'E2E']: + downsample_layer = MaxPoolAndStrideConvBlock( + in_channels, + out_channels, + use_in_channels_of_middle=True, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + elif self.arch == 'W': + downsample_layer = ConvModule( + in_channels, + out_channels, + 3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + elif self.arch == 'Tiny': + if stage_idx != 0: + downsample_layer = nn.MaxPool2d(2, 2) + else: + downsample_layer = None + elif self.arch in ['L', 'X']: + if stage_idx == 0: + downsample_layer = ConvModule( + in_channels, + out_channels // 2, + 3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + else: + downsample_layer = MaxPoolAndStrideConvBlock( + in_channels, + in_channels, + use_in_channels_of_middle=False, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + return downsample_layer diff --git a/mmyolo/models/data_preprocessors/__init__.py b/mmyolo/models/data_preprocessors/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3ef4f6d7d801cb8150ebca645ddb3cbf5d1b9599 --- /dev/null +++ b/mmyolo/models/data_preprocessors/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
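# --- Editor's sketch (not part of the patch): usage of the YOLOv7Backbone
# defined above, in the same doctest style as the other backbone docstrings.
# The printed shapes are an assumption derived from the 'L' arch_settings with
# the default widen_factor=1.0; verify against the installed mmyolo version.
import torch
from mmyolo.models import YOLOv7Backbone

model = YOLOv7Backbone(arch='L', out_indices=(2, 3, 4))
model.eval()
with torch.no_grad():
    level_outputs = model(torch.rand(1, 3, 640, 640))
for level_out in level_outputs:
    print(tuple(level_out.shape))
# assumed output: (1, 512, 80, 80) (1, 1024, 40, 40) (1, 1024, 20, 20)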
+from .data_preprocessor import (PPYOLOEBatchRandomResize, + PPYOLOEDetDataPreprocessor, + YOLOv5DetDataPreprocessor, + YOLOXBatchSyncRandomResize) + +__all__ = [ + 'YOLOv5DetDataPreprocessor', 'PPYOLOEDetDataPreprocessor', + 'PPYOLOEBatchRandomResize', 'YOLOXBatchSyncRandomResize' +] diff --git a/mmyolo/models/data_preprocessors/data_preprocessor.py b/mmyolo/models/data_preprocessors/data_preprocessor.py new file mode 100644 index 0000000000000000000000000000000000000000..f09fd8e741b9ec7d002391968eab40924ff7ab8d --- /dev/null +++ b/mmyolo/models/data_preprocessors/data_preprocessor.py @@ -0,0 +1,302 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import random +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +from mmdet.models import BatchSyncRandomResize +from mmdet.models.data_preprocessors import DetDataPreprocessor +from mmengine import MessageHub, is_list_of +from mmengine.structures import BaseDataElement +from torch import Tensor + +from mmyolo.registry import MODELS + +CastData = Union[tuple, dict, BaseDataElement, torch.Tensor, list, bytes, str, + None] + + +@MODELS.register_module() +class YOLOXBatchSyncRandomResize(BatchSyncRandomResize): + """YOLOX batch random resize. + + Args: + random_size_range (tuple): The multi-scale random range during + multi-scale training. + interval (int): The iter interval of change + image size. Defaults to 10. + size_divisor (int): Image size divisible factor. + Defaults to 32. + """ + + def forward(self, inputs: Tensor, data_samples: dict) -> Tensor and dict: + """resize a batch of images and bboxes to shape ``self._input_size``""" + h, w = inputs.shape[-2:] + inputs = inputs.float() + assert isinstance(data_samples, dict) + + if self._input_size is None: + self._input_size = (h, w) + scale_y = self._input_size[0] / h + scale_x = self._input_size[1] / w + if scale_x != 1 or scale_y != 1: + inputs = F.interpolate( + inputs, + size=self._input_size, + mode='bilinear', + align_corners=False) + + data_samples['bboxes_labels'][:, 2::2] *= scale_x + data_samples['bboxes_labels'][:, 3::2] *= scale_y + + message_hub = MessageHub.get_current_instance() + if (message_hub.get_info('iter') + 1) % self._interval == 0: + self._input_size = self._get_random_size( + aspect_ratio=float(w / h), device=inputs.device) + + return inputs, data_samples + + +@MODELS.register_module() +class YOLOv5DetDataPreprocessor(DetDataPreprocessor): + """Rewrite collate_fn to get faster training speed. + + Note: It must be used together with `mmyolo.datasets.utils.yolov5_collate` + """ + + def __init__(self, *args, non_blocking: Optional[bool] = True, **kwargs): + super().__init__(*args, non_blocking=non_blocking, **kwargs) + + def forward(self, data: dict, training: bool = False) -> dict: + """Perform normalization, padding and bgr2rgb conversion based on + ``DetDataPreprocessorr``. + + Args: + data (dict): Data sampled from dataloader. + training (bool): Whether to enable training time augmentation. + + Returns: + dict: Data in the same format as the model input. + """ + if not training: + return super().forward(data, training) + + data = self.cast_data(data) + inputs, data_samples = data['inputs'], data['data_samples'] + assert isinstance(data['data_samples'], dict) + + # TODO: Supports multi-scale training + if self._channel_conversion and inputs.shape[1] == 3: + inputs = inputs[:, [2, 1, 0], ...] 
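+ # NOTE: with `yolov5_collate`, `inputs` is already a single stacked
+ # (N, C, H, W) tensor at this point, so the channel flip above and the
+ # normalization below run once per batch rather than once per image.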
+ if self._enable_normalize: + inputs = (inputs - self.mean) / self.std + + if self.batch_augments is not None: + for batch_aug in self.batch_augments: + inputs, data_samples = batch_aug(inputs, data_samples) + + img_metas = [{'batch_input_shape': inputs.shape[2:]}] * len(inputs) + data_samples_output = { + 'bboxes_labels': data_samples['bboxes_labels'], + 'img_metas': img_metas + } + if 'masks' in data_samples: + data_samples_output['masks'] = data_samples['masks'] + + return {'inputs': inputs, 'data_samples': data_samples_output} + + +@MODELS.register_module() +class PPYOLOEDetDataPreprocessor(DetDataPreprocessor): + """Image pre-processor for detection tasks. + + The main difference between PPYOLOEDetDataPreprocessor and + DetDataPreprocessor is the normalization order. The official + PPYOLOE resize image first, and then normalize image. + In DetDataPreprocessor, the order is reversed. + + Note: It must be used together with + `mmyolo.datasets.utils.yolov5_collate` + """ + + def forward(self, data: dict, training: bool = False) -> dict: + """Perform normalization、padding and bgr2rgb conversion based on + ``BaseDataPreprocessor``. This class use batch_augments first, and then + normalize the image, which is different from the `DetDataPreprocessor` + . + + Args: + data (dict): Data sampled from dataloader. + training (bool): Whether to enable training time augmentation. + + Returns: + dict: Data in the same format as the model input. + """ + if not training: + return super().forward(data, training) + + assert isinstance(data['inputs'], list) and is_list_of( + data['inputs'], torch.Tensor), \ + '"inputs" should be a list of Tensor, but got ' \ + f'{type(data["inputs"])}. The possible reason for this ' \ + 'is that you are not using it with ' \ + '"mmyolo.datasets.utils.yolov5_collate". Please refer to ' \ + '"cconfigs/ppyoloe/ppyoloe_plus_s_fast_8xb8-80e_coco.py".' + + data = self.cast_data(data) + inputs, data_samples = data['inputs'], data['data_samples'] + assert isinstance(data['data_samples'], dict) + + # Process data. + batch_inputs = [] + for _input in inputs: + # channel transform + if self._channel_conversion: + _input = _input[[2, 1, 0], ...] + # Convert to float after channel conversion to ensure + # efficiency + _input = _input.float() + batch_inputs.append(_input) + + # Batch random resize image. + if self.batch_augments is not None: + for batch_aug in self.batch_augments: + inputs, data_samples = batch_aug(batch_inputs, data_samples) + + if self._enable_normalize: + inputs = (inputs - self.mean) / self.std + + img_metas = [{'batch_input_shape': inputs.shape[2:]}] * len(inputs) + data_samples = { + 'bboxes_labels': data_samples['bboxes_labels'], + 'img_metas': img_metas + } + + return {'inputs': inputs, 'data_samples': data_samples} + + +# TODO: No generality. Its input data format is different +# mmdet's batch aug, and it must be compatible in the future. +@MODELS.register_module() +class PPYOLOEBatchRandomResize(BatchSyncRandomResize): + """PPYOLOE batch random resize. + + Args: + random_size_range (tuple): The multi-scale random range during + multi-scale training. + interval (int): The iter interval of change + image size. Defaults to 10. + size_divisor (int): Image size divisible factor. + Defaults to 32. + random_interp (bool): Whether to choose interp_mode randomly. + If set to True, the type of `interp_mode` must be list. + If set to False, the type of `interp_mode` must be str. + Defaults to True. 
+ interp_mode (Union[List, str]): The modes available for resizing + are ('nearest', 'bilinear', 'bicubic', 'area'). + keep_ratio (bool): Whether to keep the aspect ratio when resizing + the image. Now we only support keep_ratio=False. + Defaults to False. + """ + + def __init__(self, + random_size_range: Tuple[int, int], + interval: int = 1, + size_divisor: int = 32, + random_interp=True, + interp_mode: Union[List[str], str] = [ + 'nearest', 'bilinear', 'bicubic', 'area' + ], + keep_ratio: bool = False) -> None: + super().__init__(random_size_range, interval, size_divisor) + self.random_interp = random_interp + self.keep_ratio = keep_ratio + # TODO: need to support keep_ratio==True + assert not self.keep_ratio, 'We do not yet support keep_ratio=True' + + if self.random_interp: + assert isinstance(interp_mode, list) and len(interp_mode) > 1,\ + 'While random_interp==True, the type of `interp_mode`' \ + ' must be list and len(interp_mode) must large than 1' + self.interp_mode_list = interp_mode + self.interp_mode = None + else: + assert isinstance(interp_mode, str),\ + 'While random_interp==False, the type of ' \ + '`interp_mode` must be str' + assert interp_mode in ['nearest', 'bilinear', 'bicubic', 'area'] + self.interp_mode_list = None + self.interp_mode = interp_mode + + def forward(self, inputs: list, + data_samples: dict) -> Tuple[Tensor, Tensor]: + """Resize a batch of images and bboxes to shape ``self._input_size``. + + The inputs and data_samples should be list, and + ``PPYOLOEBatchRandomResize`` must be used with + ``PPYOLOEDetDataPreprocessor`` and ``yolov5_collate`` with + ``use_ms_training == True``. + """ + assert isinstance(inputs, list),\ + 'The type of inputs must be list. The possible reason for this ' \ + 'is that you are not using it with `PPYOLOEDetDataPreprocessor` ' \ + 'and `yolov5_collate` with use_ms_training == True.' + + bboxes_labels = data_samples['bboxes_labels'] + + message_hub = MessageHub.get_current_instance() + if (message_hub.get_info('iter') + 1) % self._interval == 0: + # get current input size + self._input_size, interp_mode = self._get_random_size_and_interp() + if self.random_interp: + self.interp_mode = interp_mode + + # TODO: need to support type(inputs)==Tensor + if isinstance(inputs, list): + outputs = [] + for i in range(len(inputs)): + _batch_input = inputs[i] + h, w = _batch_input.shape[-2:] + scale_y = self._input_size[0] / h + scale_x = self._input_size[1] / w + if scale_x != 1. 
or scale_y != 1.: + if self.interp_mode in ('nearest', 'area'): + align_corners = None + else: + align_corners = False + _batch_input = F.interpolate( + _batch_input.unsqueeze(0), + size=self._input_size, + mode=self.interp_mode, + align_corners=align_corners) + + # rescale boxes + indexes = bboxes_labels[:, 0] == i + bboxes_labels[indexes, 2] *= scale_x + bboxes_labels[indexes, 3] *= scale_y + bboxes_labels[indexes, 4] *= scale_x + bboxes_labels[indexes, 5] *= scale_y + + data_samples['bboxes_labels'] = bboxes_labels + else: + _batch_input = _batch_input.unsqueeze(0) + + outputs.append(_batch_input) + + # convert to Tensor + return torch.cat(outputs, dim=0), data_samples + else: + raise NotImplementedError('Not implemented yet!') + + def _get_random_size_and_interp(self) -> Tuple[int, int]: + """Randomly generate a shape in ``_random_size_range`` and a + interp_mode in interp_mode_list.""" + size = random.randint(*self._random_size_range) + input_size = (self._size_divisor * size, self._size_divisor * size) + + if self.random_interp: + interp_ind = random.randint(0, len(self.interp_mode_list) - 1) + interp_mode = self.interp_mode_list[interp_ind] + else: + interp_mode = None + return input_size, interp_mode diff --git a/mmyolo/models/dense_heads/__init__.py b/mmyolo/models/dense_heads/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a95abd611db4115484c62fab610650a091c092cf --- /dev/null +++ b/mmyolo/models/dense_heads/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .ppyoloe_head import PPYOLOEHead, PPYOLOEHeadModule +from .rtmdet_head import RTMDetHead, RTMDetSepBNHeadModule +from .rtmdet_ins_head import RTMDetInsSepBNHead, RTMDetInsSepBNHeadModule +from .rtmdet_rotated_head import (RTMDetRotatedHead, + RTMDetRotatedSepBNHeadModule) +from .yolov5_head import YOLOv5Head, YOLOv5HeadModule +from .yolov6_head import YOLOv6Head, YOLOv6HeadModule +from .yolov7_head import YOLOv7Head, YOLOv7HeadModule, YOLOv7p6HeadModule +from .yolov8_head import YOLOv8Head, YOLOv8HeadModule +from .yolox_head import YOLOXHead, YOLOXHeadModule + +__all__ = [ + 'YOLOv5Head', 'YOLOv6Head', 'YOLOXHead', 'YOLOv5HeadModule', + 'YOLOv6HeadModule', 'YOLOXHeadModule', 'RTMDetHead', + 'RTMDetSepBNHeadModule', 'YOLOv7Head', 'PPYOLOEHead', 'PPYOLOEHeadModule', + 'YOLOv7HeadModule', 'YOLOv7p6HeadModule', 'YOLOv8Head', 'YOLOv8HeadModule', + 'RTMDetRotatedHead', 'RTMDetRotatedSepBNHeadModule', 'RTMDetInsSepBNHead', + 'RTMDetInsSepBNHeadModule' +] diff --git a/mmyolo/models/dense_heads/ppyoloe_head.py b/mmyolo/models/dense_heads/ppyoloe_head.py new file mode 100644 index 0000000000000000000000000000000000000000..72d820041cf2fc3d3f605fee1ae9cc87cf7cee4c --- /dev/null +++ b/mmyolo/models/dense_heads/ppyoloe_head.py @@ -0,0 +1,374 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
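# --- Editor's sketch (not part of the patch): the size arithmetic behind
# PPYOLOEBatchRandomResize._get_random_size_and_interp above.
# `random_size_range` is expressed in multiples of `size_divisor`, so a
# hypothetical range of (10, 20) with the default divisor of 32 yields square
# input sizes from 320 to 640 in steps of 32.
import random

random_size_range = (10, 20)  # hypothetical values, not taken from a config
size_divisor = 32
size = random.randint(*random_size_range)                # inclusive on both ends
input_size = (size_divisor * size, size_divisor * size)
print(input_size)                                        # e.g. (448, 448)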
+from typing import Sequence, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmdet.models.utils import multi_apply +from mmdet.utils import (ConfigType, OptConfigType, OptInstanceList, + OptMultiConfig, reduce_mean) +from mmengine import MessageHub +from mmengine.model import BaseModule, bias_init_with_prob +from mmengine.structures import InstanceData +from torch import Tensor + +from mmyolo.registry import MODELS +from ..layers.yolo_bricks import PPYOLOESELayer +from ..utils import gt_instances_preprocess +from .yolov6_head import YOLOv6Head + + +@MODELS.register_module() +class PPYOLOEHeadModule(BaseModule): + """PPYOLOEHead head module used in `PPYOLOE. + + `_. + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + num_base_priors (int): The number of priors (points) at a point + on the feature grid. + featmap_strides (Sequence[int]): Downsample factor of each feature map. + Defaults to (8, 16, 32). + reg_max (int): Max value of integral set :math: ``{0, ..., reg_max}`` + in QFL setting. Defaults to 16. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + num_classes: int, + in_channels: Union[int, Sequence], + widen_factor: float = 1.0, + num_base_priors: int = 1, + featmap_strides: Sequence[int] = (8, 16, 32), + reg_max: int = 16, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.1, eps=1e-5), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None): + super().__init__(init_cfg=init_cfg) + + self.num_classes = num_classes + self.featmap_strides = featmap_strides + self.num_levels = len(self.featmap_strides) + self.num_base_priors = num_base_priors + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.reg_max = reg_max + + if isinstance(in_channels, int): + self.in_channels = [int(in_channels * widen_factor) + ] * self.num_levels + else: + self.in_channels = [int(i * widen_factor) for i in in_channels] + + self._init_layers() + + def init_weights(self, prior_prob=0.01): + """Initialize the weight and bias of PPYOLOE head.""" + super().init_weights() + for conv in self.cls_preds: + conv.bias.data.fill_(bias_init_with_prob(prior_prob)) + conv.weight.data.fill_(0.) + + for conv in self.reg_preds: + conv.bias.data.fill_(1.0) + conv.weight.data.fill_(0.) 
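# --- Editor's note (not part of the patch): the classification bias above is
# filled with bias_init_with_prob(prior_prob=0.01). With zero-initialised
# weights, every location's initial class logit equals that bias, so
# sigmoid(logit) starts near the prior probability and early training is not
# dominated by easy negatives. Minimal re-derivation, assuming the usual
# -log((1 - p) / p) form of bias_init_with_prob:
import math

prior_prob = 0.01
bias = -math.log((1 - prior_prob) / prior_prob)
print(round(bias, 3))                           # ~ -4.595
print(round(1.0 / (1.0 + math.exp(-bias)), 3))  # sigmoid(bias) ~= 0.01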
+ + def _init_layers(self): + """initialize conv layers in PPYOLOE head.""" + self.cls_preds = nn.ModuleList() + self.reg_preds = nn.ModuleList() + self.cls_stems = nn.ModuleList() + self.reg_stems = nn.ModuleList() + + for in_channel in self.in_channels: + self.cls_stems.append( + PPYOLOESELayer( + in_channel, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg)) + self.reg_stems.append( + PPYOLOESELayer( + in_channel, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg)) + + for in_channel in self.in_channels: + self.cls_preds.append( + nn.Conv2d(in_channel, self.num_classes, 3, padding=1)) + self.reg_preds.append( + nn.Conv2d(in_channel, 4 * (self.reg_max + 1), 3, padding=1)) + + # init proj + proj = torch.linspace(0, self.reg_max, self.reg_max + 1).view( + [1, self.reg_max + 1, 1, 1]) + self.register_buffer('proj', proj, persistent=False) + + def forward(self, x: Tuple[Tensor]) -> Tensor: + """Forward features from the upstream network. + + Args: + x (Tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + Returns: + Tuple[List]: A tuple of multi-level classification scores, bbox + predictions. + """ + assert len(x) == self.num_levels + + return multi_apply(self.forward_single, x, self.cls_stems, + self.cls_preds, self.reg_stems, self.reg_preds) + + def forward_single(self, x: Tensor, cls_stem: nn.ModuleList, + cls_pred: nn.ModuleList, reg_stem: nn.ModuleList, + reg_pred: nn.ModuleList) -> Tensor: + """Forward feature of a single scale level.""" + b, _, h, w = x.shape + hw = h * w + avg_feat = F.adaptive_avg_pool2d(x, (1, 1)) + cls_logit = cls_pred(cls_stem(x, avg_feat) + x) + bbox_dist_preds = reg_pred(reg_stem(x, avg_feat)) + # TODO: Test whether use matmul instead of conv can speed up training. + bbox_dist_preds = bbox_dist_preds.reshape( + [-1, 4, self.reg_max + 1, hw]).permute(0, 2, 3, 1) + + bbox_preds = F.conv2d(F.softmax(bbox_dist_preds, dim=1), self.proj) + + if self.training: + return cls_logit, bbox_preds, bbox_dist_preds + else: + return cls_logit, bbox_preds + + +@MODELS.register_module() +class PPYOLOEHead(YOLOv6Head): + """PPYOLOEHead head used in `PPYOLOE `_. + The YOLOv6 head and the PPYOLOE head are only slightly different. + Distribution focal loss is extra used in PPYOLOE, but not in YOLOv6. + + Args: + head_module(ConfigType): Base module used for YOLOv5Head + prior_generator(dict): Points generator feature maps in + 2D points-based detectors. + bbox_coder (:obj:`ConfigDict` or dict): Config of bbox coder. + loss_cls (:obj:`ConfigDict` or dict): Config of classification loss. + loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss. + loss_dfl (:obj:`ConfigDict` or dict): Config of distribution focal + loss. + train_cfg (:obj:`ConfigDict` or dict, optional): Training config of + anchor head. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of + anchor head. Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. 
+ """ + + def __init__(self, + head_module: ConfigType, + prior_generator: ConfigType = dict( + type='mmdet.MlvlPointGenerator', + offset=0.5, + strides=[8, 16, 32]), + bbox_coder: ConfigType = dict(type='DistancePointBBoxCoder'), + loss_cls: ConfigType = dict( + type='mmdet.VarifocalLoss', + use_sigmoid=True, + alpha=0.75, + gamma=2.0, + iou_weighted=True, + reduction='sum', + loss_weight=1.0), + loss_bbox: ConfigType = dict( + type='IoULoss', + iou_mode='giou', + bbox_format='xyxy', + reduction='mean', + loss_weight=2.5, + return_iou=False), + loss_dfl: ConfigType = dict( + type='mmdet.DistributionFocalLoss', + reduction='mean', + loss_weight=0.5 / 4), + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + init_cfg: OptMultiConfig = None): + super().__init__( + head_module=head_module, + prior_generator=prior_generator, + bbox_coder=bbox_coder, + loss_cls=loss_cls, + loss_bbox=loss_bbox, + train_cfg=train_cfg, + test_cfg=test_cfg, + init_cfg=init_cfg) + self.loss_dfl = MODELS.build(loss_dfl) + # ppyoloe doesn't need loss_obj + self.loss_obj = None + + def loss_by_feat( + self, + cls_scores: Sequence[Tensor], + bbox_preds: Sequence[Tensor], + bbox_dist_preds: Sequence[Tensor], + batch_gt_instances: Sequence[InstanceData], + batch_img_metas: Sequence[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (Sequence[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_priors * num_classes. + bbox_preds (Sequence[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_priors * 4. + bbox_dist_preds (Sequence[Tensor]): Box distribution logits for + each scale level with shape (bs, reg_max + 1, H*W, 4). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + Returns: + dict[str, Tensor]: A dictionary of losses. 
+ """ + + # get epoch information from message hub + message_hub = MessageHub.get_current_instance() + current_epoch = message_hub.get_info('epoch') + + num_imgs = len(batch_img_metas) + + current_featmap_sizes = [ + cls_score.shape[2:] for cls_score in cls_scores + ] + # If the shape does not equal, generate new one + if current_featmap_sizes != self.featmap_sizes_train: + self.featmap_sizes_train = current_featmap_sizes + + mlvl_priors_with_stride = self.prior_generator.grid_priors( + self.featmap_sizes_train, + dtype=cls_scores[0].dtype, + device=cls_scores[0].device, + with_stride=True) + + self.num_level_priors = [len(n) for n in mlvl_priors_with_stride] + self.flatten_priors_train = torch.cat( + mlvl_priors_with_stride, dim=0) + self.stride_tensor = self.flatten_priors_train[..., [2]] + + # gt info + gt_info = gt_instances_preprocess(batch_gt_instances, num_imgs) + gt_labels = gt_info[:, :, :1] + gt_bboxes = gt_info[:, :, 1:] # xyxy + pad_bbox_flag = (gt_bboxes.sum(-1, keepdim=True) > 0).float() + + # pred info + flatten_cls_preds = [ + cls_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.num_classes) + for cls_pred in cls_scores + ] + flatten_pred_bboxes = [ + bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) + for bbox_pred in bbox_preds + ] + # (bs, reg_max+1, n, 4) -> (bs, n, 4, reg_max+1) + flatten_pred_dists = [ + bbox_pred_org.permute(0, 2, 3, 1).reshape( + num_imgs, -1, (self.head_module.reg_max + 1) * 4) + for bbox_pred_org in bbox_dist_preds + ] + + flatten_dist_preds = torch.cat(flatten_pred_dists, dim=1) + flatten_cls_preds = torch.cat(flatten_cls_preds, dim=1) + flatten_pred_bboxes = torch.cat(flatten_pred_bboxes, dim=1) + flatten_pred_bboxes = self.bbox_coder.decode( + self.flatten_priors_train[..., :2], flatten_pred_bboxes, + self.stride_tensor[..., 0]) + pred_scores = torch.sigmoid(flatten_cls_preds) + + if current_epoch < self.initial_epoch: + assigned_result = self.initial_assigner( + flatten_pred_bboxes.detach(), self.flatten_priors_train, + self.num_level_priors, gt_labels, gt_bboxes, pad_bbox_flag) + else: + assigned_result = self.assigner(flatten_pred_bboxes.detach(), + pred_scores.detach(), + self.flatten_priors_train, + gt_labels, gt_bboxes, + pad_bbox_flag) + + assigned_bboxes = assigned_result['assigned_bboxes'] + assigned_scores = assigned_result['assigned_scores'] + fg_mask_pre_prior = assigned_result['fg_mask_pre_prior'] + + # cls loss + with torch.cuda.amp.autocast(enabled=False): + loss_cls = self.loss_cls(flatten_cls_preds, assigned_scores) + + # rescale bbox + assigned_bboxes /= self.stride_tensor + flatten_pred_bboxes /= self.stride_tensor + + assigned_scores_sum = assigned_scores.sum() + # reduce_mean between all gpus + assigned_scores_sum = torch.clamp( + reduce_mean(assigned_scores_sum), min=1) + loss_cls /= assigned_scores_sum + + # select positive samples mask + num_pos = fg_mask_pre_prior.sum() + if num_pos > 0: + # when num_pos > 0, assigned_scores_sum will >0, so the loss_bbox + # will not report an error + # iou loss + prior_bbox_mask = fg_mask_pre_prior.unsqueeze(-1).repeat([1, 1, 4]) + pred_bboxes_pos = torch.masked_select( + flatten_pred_bboxes, prior_bbox_mask).reshape([-1, 4]) + assigned_bboxes_pos = torch.masked_select( + assigned_bboxes, prior_bbox_mask).reshape([-1, 4]) + bbox_weight = torch.masked_select( + assigned_scores.sum(-1), fg_mask_pre_prior).unsqueeze(-1) + loss_bbox = self.loss_bbox( + pred_bboxes_pos, + assigned_bboxes_pos, + weight=bbox_weight, + avg_factor=assigned_scores_sum) + + # dfl loss + dist_mask = 
fg_mask_pre_prior.unsqueeze(-1).repeat( + [1, 1, (self.head_module.reg_max + 1) * 4]) + + pred_dist_pos = torch.masked_select( + flatten_dist_preds, + dist_mask).reshape([-1, 4, self.head_module.reg_max + 1]) + assigned_ltrb = self.bbox_coder.encode( + self.flatten_priors_train[..., :2] / self.stride_tensor, + assigned_bboxes, + max_dis=self.head_module.reg_max, + eps=0.01) + assigned_ltrb_pos = torch.masked_select( + assigned_ltrb, prior_bbox_mask).reshape([-1, 4]) + loss_dfl = self.loss_dfl( + pred_dist_pos.reshape(-1, self.head_module.reg_max + 1), + assigned_ltrb_pos.reshape(-1), + weight=bbox_weight.expand(-1, 4).reshape(-1), + avg_factor=assigned_scores_sum) + else: + loss_bbox = flatten_pred_bboxes.sum() * 0 + loss_dfl = flatten_pred_bboxes.sum() * 0 + + return dict(loss_cls=loss_cls, loss_bbox=loss_bbox, loss_dfl=loss_dfl) diff --git a/mmyolo/models/dense_heads/rtmdet_head.py b/mmyolo/models/dense_heads/rtmdet_head.py new file mode 100644 index 0000000000000000000000000000000000000000..54245a97f404b66eba47e41f03302110c8894134 --- /dev/null +++ b/mmyolo/models/dense_heads/rtmdet_head.py @@ -0,0 +1,368 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Sequence, Tuple + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule, is_norm +from mmdet.models.task_modules.samplers import PseudoSampler +from mmdet.structures.bbox import distance2bbox +from mmdet.utils import (ConfigType, InstanceList, OptConfigType, + OptInstanceList, OptMultiConfig, reduce_mean) +from mmengine.model import (BaseModule, bias_init_with_prob, constant_init, + normal_init) +from torch import Tensor + +from mmyolo.registry import MODELS, TASK_UTILS +from ..utils import gt_instances_preprocess +from .yolov5_head import YOLOv5Head + + +@MODELS.register_module() +class RTMDetSepBNHeadModule(BaseModule): + """Detection Head of RTMDet. + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + num_base_priors (int): The number of priors (points) at a point + on the feature grid. Defaults to 1. + feat_channels (int): Number of hidden channels. Used in child classes. + Defaults to 256 + stacked_convs (int): Number of stacking convs of the head. + Defaults to 2. + featmap_strides (Sequence[int]): Downsample factor of each feature map. + Defaults to (8, 16, 32). + share_conv (bool): Whether to share conv layers between stages. + Defaults to True. + pred_kernel_size (int): Kernel size of ``nn.Conv2d``. Defaults to 1. + conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + convolution layer. Defaults to None. + norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization + layer. Defaults to ``dict(type='BN')``. + act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer. + Default: dict(type='SiLU', inplace=True). + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. 
+ """ + + def __init__( + self, + num_classes: int, + in_channels: int, + widen_factor: float = 1.0, + num_base_priors: int = 1, + feat_channels: int = 256, + stacked_convs: int = 2, + featmap_strides: Sequence[int] = [8, 16, 32], + share_conv: bool = True, + pred_kernel_size: int = 1, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict(type='BN'), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None, + ): + super().__init__(init_cfg=init_cfg) + self.share_conv = share_conv + self.num_classes = num_classes + self.pred_kernel_size = pred_kernel_size + self.feat_channels = int(feat_channels * widen_factor) + self.stacked_convs = stacked_convs + self.num_base_priors = num_base_priors + + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.featmap_strides = featmap_strides + + self.in_channels = int(in_channels * widen_factor) + + self._init_layers() + + def _init_layers(self): + """Initialize layers of the head.""" + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + + self.rtm_cls = nn.ModuleList() + self.rtm_reg = nn.ModuleList() + for n in range(len(self.featmap_strides)): + cls_convs = nn.ModuleList() + reg_convs = nn.ModuleList() + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + cls_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + reg_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + self.cls_convs.append(cls_convs) + self.reg_convs.append(reg_convs) + + self.rtm_cls.append( + nn.Conv2d( + self.feat_channels, + self.num_base_priors * self.num_classes, + self.pred_kernel_size, + padding=self.pred_kernel_size // 2)) + self.rtm_reg.append( + nn.Conv2d( + self.feat_channels, + self.num_base_priors * 4, + self.pred_kernel_size, + padding=self.pred_kernel_size // 2)) + + if self.share_conv: + for n in range(len(self.featmap_strides)): + for i in range(self.stacked_convs): + self.cls_convs[n][i].conv = self.cls_convs[0][i].conv + self.reg_convs[n][i].conv = self.reg_convs[0][i].conv + + def init_weights(self) -> None: + """Initialize weights of the head.""" + # Use prior in model initialization to improve stability + super().init_weights() + for m in self.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, mean=0, std=0.01) + if is_norm(m): + constant_init(m, 1) + bias_cls = bias_init_with_prob(0.01) + for rtm_cls, rtm_reg in zip(self.rtm_cls, self.rtm_reg): + normal_init(rtm_cls, std=0.01, bias=bias_cls) + normal_init(rtm_reg, std=0.01) + + def forward(self, feats: Tuple[Tensor, ...]) -> tuple: + """Forward features from the upstream network. + + Args: + feats (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: Usually a tuple of classification scores and bbox prediction + - cls_scores (list[Tensor]): Classification scores for all scale + levels, each is a 4D-tensor, the channels number is + num_base_priors * num_classes. + - bbox_preds (list[Tensor]): Box energies / deltas for all scale + levels, each is a 4D-tensor, the channels number is + num_base_priors * 4. 
+ """ + + cls_scores = [] + bbox_preds = [] + for idx, x in enumerate(feats): + cls_feat = x + reg_feat = x + + for cls_layer in self.cls_convs[idx]: + cls_feat = cls_layer(cls_feat) + cls_score = self.rtm_cls[idx](cls_feat) + + for reg_layer in self.reg_convs[idx]: + reg_feat = reg_layer(reg_feat) + + reg_dist = self.rtm_reg[idx](reg_feat) + cls_scores.append(cls_score) + bbox_preds.append(reg_dist) + return tuple(cls_scores), tuple(bbox_preds) + + +@MODELS.register_module() +class RTMDetHead(YOLOv5Head): + """RTMDet head. + + Args: + head_module(ConfigType): Base module used for RTMDetHead + prior_generator: Points generator feature maps in + 2D points-based detectors. + bbox_coder (:obj:`ConfigDict` or dict): Config of bbox coder. + loss_cls (:obj:`ConfigDict` or dict): Config of classification loss. + loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss. + train_cfg (:obj:`ConfigDict` or dict, optional): Training config of + anchor head. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of + anchor head. Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + head_module: ConfigType, + prior_generator: ConfigType = dict( + type='mmdet.MlvlPointGenerator', + offset=0, + strides=[8, 16, 32]), + bbox_coder: ConfigType = dict(type='DistancePointBBoxCoder'), + loss_cls: ConfigType = dict( + type='mmdet.QualityFocalLoss', + use_sigmoid=True, + beta=2.0, + loss_weight=1.0), + loss_bbox: ConfigType = dict( + type='mmdet.GIoULoss', loss_weight=2.0), + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + init_cfg: OptMultiConfig = None): + + super().__init__( + head_module=head_module, + prior_generator=prior_generator, + bbox_coder=bbox_coder, + loss_cls=loss_cls, + loss_bbox=loss_bbox, + train_cfg=train_cfg, + test_cfg=test_cfg, + init_cfg=init_cfg) + + self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False) + if self.use_sigmoid_cls: + self.cls_out_channels = self.num_classes + else: + self.cls_out_channels = self.num_classes + 1 + # rtmdet doesn't need loss_obj + self.loss_obj = None + + def special_init(self): + """Since YOLO series algorithms will inherit from YOLOv5Head, but + different algorithms have special initialization process. + + The special_init function is designed to deal with this situation. + """ + if self.train_cfg: + self.assigner = TASK_UTILS.build(self.train_cfg.assigner) + if self.train_cfg.get('sampler', None) is not None: + self.sampler = TASK_UTILS.build( + self.train_cfg.sampler, default_args=dict(context=self)) + else: + self.sampler = PseudoSampler(context=self) + + self.featmap_sizes_train = None + self.flatten_priors_train = None + + def forward(self, x: Tuple[Tensor]) -> Tuple[List]: + """Forward features from the upstream network. + + Args: + x (Tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + Returns: + Tuple[List]: A tuple of multi-level classification scores, bbox + predictions, and objectnesses. + """ + return self.head_module(x) + + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Compute losses of the head. 
+ + Args: + cls_scores (list[Tensor]): Box scores for each scale level + Has shape (N, num_anchors * num_classes, H, W) + bbox_preds (list[Tensor]): Decoded box for each scale + level with shape (N, num_anchors * 4, H, W) in + [tl_x, tl_y, br_x, br_y] format. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + num_imgs = len(batch_img_metas) + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.prior_generator.num_levels + + gt_info = gt_instances_preprocess(batch_gt_instances, num_imgs) + gt_labels = gt_info[:, :, :1] + gt_bboxes = gt_info[:, :, 1:] # xyxy + pad_bbox_flag = (gt_bboxes.sum(-1, keepdim=True) > 0).float() + + device = cls_scores[0].device + + # If the shape does not equal, generate new one + if featmap_sizes != self.featmap_sizes_train: + self.featmap_sizes_train = featmap_sizes + mlvl_priors_with_stride = self.prior_generator.grid_priors( + featmap_sizes, device=device, with_stride=True) + self.flatten_priors_train = torch.cat( + mlvl_priors_with_stride, dim=0) + + flatten_cls_scores = torch.cat([ + cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.cls_out_channels) + for cls_score in cls_scores + ], 1).contiguous() + + flatten_bboxes = torch.cat([ + bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) + for bbox_pred in bbox_preds + ], 1) + flatten_bboxes = flatten_bboxes * self.flatten_priors_train[..., -1, + None] + flatten_bboxes = distance2bbox(self.flatten_priors_train[..., :2], + flatten_bboxes) + + assigned_result = self.assigner(flatten_bboxes.detach(), + flatten_cls_scores.detach(), + self.flatten_priors_train, gt_labels, + gt_bboxes, pad_bbox_flag) + + labels = assigned_result['assigned_labels'].reshape(-1) + label_weights = assigned_result['assigned_labels_weights'].reshape(-1) + bbox_targets = assigned_result['assigned_bboxes'].reshape(-1, 4) + assign_metrics = assigned_result['assign_metrics'].reshape(-1) + cls_preds = flatten_cls_scores.reshape(-1, self.num_classes) + bbox_preds = flatten_bboxes.reshape(-1, 4) + + # FG cat_id: [0, num_classes -1], BG cat_id: num_classes + bg_class_ind = self.num_classes + pos_inds = ((labels >= 0) + & (labels < bg_class_ind)).nonzero().squeeze(1) + avg_factor = reduce_mean(assign_metrics.sum()).clamp_(min=1).item() + + loss_cls = self.loss_cls( + cls_preds, (labels, assign_metrics), + label_weights, + avg_factor=avg_factor) + + if len(pos_inds) > 0: + loss_bbox = self.loss_bbox( + bbox_preds[pos_inds], + bbox_targets[pos_inds], + weight=assign_metrics[pos_inds], + avg_factor=avg_factor) + else: + loss_bbox = bbox_preds.sum() * 0 + + return dict(loss_cls=loss_cls, loss_bbox=loss_bbox) diff --git a/mmyolo/models/dense_heads/rtmdet_ins_head.py b/mmyolo/models/dense_heads/rtmdet_ins_head.py new file mode 100644 index 0000000000000000000000000000000000000000..1d0562aad6fb977516924ef9cd72cdef54ff0016 --- /dev/null +++ b/mmyolo/models/dense_heads/rtmdet_ins_head.py @@ -0,0 +1,725 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
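# --- Editor's sketch (not part of the patch): the decode step used in
# RTMDetHead.loss_by_feat above. The head predicts (l, t, r, b) distances in
# stride units; scaling by the per-prior stride and applying distance2bbox
# yields xyxy boxes in input-image coordinates. The numbers below are toy
# values chosen only to show the arithmetic.
import torch
from mmdet.structures.bbox import distance2bbox

point = torch.tensor([[16.0, 16.0]])              # prior centre on a stride-8 level
pred_dist = torch.tensor([[1.0, 1.0, 2.0, 2.0]])  # predicted l, t, r, b
stride = 8.0
box = distance2bbox(point, pred_dist * stride)
print(box)                                        # tensor([[ 8.,  8., 32., 32.]])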
+import copy +from typing import List, Optional, Tuple + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule, is_norm +from mmcv.ops import batched_nms +from mmdet.models.utils import filter_scores_and_topk +from mmdet.structures.bbox import get_box_tensor, get_box_wh, scale_boxes +from mmdet.utils import (ConfigType, InstanceList, OptConfigType, + OptInstanceList, OptMultiConfig) +from mmengine import ConfigDict +from mmengine.model import (BaseModule, bias_init_with_prob, constant_init, + normal_init) +from mmengine.structures import InstanceData +from torch import Tensor + +from mmyolo.registry import MODELS +from .rtmdet_head import RTMDetHead, RTMDetSepBNHeadModule + + +class MaskFeatModule(BaseModule): + """Mask feature head used in RTMDet-Ins. Copy from mmdet. + + Args: + in_channels (int): Number of channels in the input feature map. + feat_channels (int): Number of hidden channels of the mask feature + map branch. + stacked_convs (int): Number of convs in mask feature branch. + num_levels (int): The starting feature map level from RPN that + will be used to predict the mask feature map. + num_prototypes (int): Number of output channel of the mask feature + map branch. This is the channel count of the mask + feature map that to be dynamically convolved with the predicted + kernel. + act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer. + Default: dict(type='ReLU', inplace=True) + norm_cfg (dict): Config dict for normalization layer. Default: None. + """ + + def __init__( + self, + in_channels: int, + feat_channels: int = 256, + stacked_convs: int = 4, + num_levels: int = 3, + num_prototypes: int = 8, + act_cfg: ConfigType = dict(type='ReLU', inplace=True), + norm_cfg: ConfigType = dict(type='BN') + ) -> None: + super().__init__(init_cfg=None) + self.num_levels = num_levels + self.fusion_conv = nn.Conv2d(num_levels * in_channels, in_channels, 1) + convs = [] + for i in range(stacked_convs): + in_c = in_channels if i == 0 else feat_channels + convs.append( + ConvModule( + in_c, + feat_channels, + 3, + padding=1, + act_cfg=act_cfg, + norm_cfg=norm_cfg)) + self.stacked_convs = nn.Sequential(*convs) + self.projection = nn.Conv2d( + feat_channels, num_prototypes, kernel_size=1) + + def forward(self, features: Tuple[Tensor, ...]) -> Tensor: + # multi-level feature fusion + fusion_feats = [features[0]] + size = features[0].shape[-2:] + for i in range(1, self.num_levels): + f = F.interpolate(features[i], size=size, mode='bilinear') + fusion_feats.append(f) + fusion_feats = torch.cat(fusion_feats, dim=1) + fusion_feats = self.fusion_conv(fusion_feats) + # pred mask feats + mask_features = self.stacked_convs(fusion_feats) + mask_features = self.projection(mask_features) + return mask_features + + +@MODELS.register_module() +class RTMDetInsSepBNHeadModule(RTMDetSepBNHeadModule): + """Detection and Instance Segmentation Head of RTMDet. + + Args: + num_classes (int): Number of categories excluding the background + category. + num_prototypes (int): Number of mask prototype features extracted + from the mask head. Defaults to 8. + dyconv_channels (int): Channel of the dynamic conv layers. + Defaults to 8. + num_dyconvs (int): Number of the dynamic convolution layers. + Defaults to 3. + use_sigmoid_cls (bool): Use sigmoid for class prediction. + Defaults to True. 
+ """ + + def __init__(self, + num_classes: int, + *args, + num_prototypes: int = 8, + dyconv_channels: int = 8, + num_dyconvs: int = 3, + use_sigmoid_cls: bool = True, + **kwargs): + self.num_prototypes = num_prototypes + self.num_dyconvs = num_dyconvs + self.dyconv_channels = dyconv_channels + self.use_sigmoid_cls = use_sigmoid_cls + if self.use_sigmoid_cls: + self.cls_out_channels = num_classes + else: + self.cls_out_channels = num_classes + 1 + super().__init__(num_classes=num_classes, *args, **kwargs) + + def _init_layers(self): + """Initialize layers of the head.""" + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + self.kernel_convs = nn.ModuleList() + + self.rtm_cls = nn.ModuleList() + self.rtm_reg = nn.ModuleList() + self.rtm_kernel = nn.ModuleList() + self.rtm_obj = nn.ModuleList() + + # calculate num dynamic parameters + weight_nums, bias_nums = [], [] + for i in range(self.num_dyconvs): + if i == 0: + weight_nums.append( + (self.num_prototypes + 2) * self.dyconv_channels) + bias_nums.append(self.dyconv_channels) + elif i == self.num_dyconvs - 1: + weight_nums.append(self.dyconv_channels) + bias_nums.append(1) + else: + weight_nums.append(self.dyconv_channels * self.dyconv_channels) + bias_nums.append(self.dyconv_channels) + self.weight_nums = weight_nums + self.bias_nums = bias_nums + self.num_gen_params = sum(weight_nums) + sum(bias_nums) + pred_pad_size = self.pred_kernel_size // 2 + + for n in range(len(self.featmap_strides)): + cls_convs = nn.ModuleList() + reg_convs = nn.ModuleList() + kernel_convs = nn.ModuleList() + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + cls_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + reg_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + kernel_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + self.cls_convs.append(cls_convs) + self.reg_convs.append(cls_convs) + self.kernel_convs.append(kernel_convs) + + self.rtm_cls.append( + nn.Conv2d( + self.feat_channels, + self.num_base_priors * self.cls_out_channels, + self.pred_kernel_size, + padding=pred_pad_size)) + self.rtm_reg.append( + nn.Conv2d( + self.feat_channels, + self.num_base_priors * 4, + self.pred_kernel_size, + padding=pred_pad_size)) + self.rtm_kernel.append( + nn.Conv2d( + self.feat_channels, + self.num_gen_params, + self.pred_kernel_size, + padding=pred_pad_size)) + + if self.share_conv: + for n in range(len(self.featmap_strides)): + for i in range(self.stacked_convs): + self.cls_convs[n][i].conv = self.cls_convs[0][i].conv + self.reg_convs[n][i].conv = self.reg_convs[0][i].conv + + self.mask_head = MaskFeatModule( + in_channels=self.in_channels, + feat_channels=self.feat_channels, + stacked_convs=4, + num_levels=len(self.featmap_strides), + num_prototypes=self.num_prototypes, + act_cfg=self.act_cfg, + norm_cfg=self.norm_cfg) + + def init_weights(self) -> None: + """Initialize weights of the head.""" + for m in self.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, mean=0, std=0.01) + if is_norm(m): + constant_init(m, 1) + bias_cls = bias_init_with_prob(0.01) + for rtm_cls, rtm_reg, rtm_kernel in zip(self.rtm_cls, self.rtm_reg, + self.rtm_kernel): + 
normal_init(rtm_cls, std=0.01, bias=bias_cls) + normal_init(rtm_reg, std=0.01, bias=1) + + def forward(self, feats: Tuple[Tensor, ...]) -> tuple: + """Forward features from the upstream network. + + Args: + feats (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: Usually a tuple of classification scores and bbox prediction + - cls_scores (list[Tensor]): Classification scores for all scale + levels, each is a 4D-tensor, the channels number is + num_base_priors * num_classes. + - bbox_preds (list[Tensor]): Box energies / deltas for all scale + levels, each is a 4D-tensor, the channels number is + num_base_priors * 4. + - kernel_preds (list[Tensor]): Dynamic conv kernels for all scale + levels, each is a 4D-tensor, the channels number is + num_gen_params. + - mask_feat (Tensor): Mask prototype features. + Has shape (batch_size, num_prototypes, H, W). + """ + mask_feat = self.mask_head(feats) + + cls_scores = [] + bbox_preds = [] + kernel_preds = [] + for idx, (x, stride) in enumerate(zip(feats, self.featmap_strides)): + cls_feat = x + reg_feat = x + kernel_feat = x + + for cls_layer in self.cls_convs[idx]: + cls_feat = cls_layer(cls_feat) + cls_score = self.rtm_cls[idx](cls_feat) + + for kernel_layer in self.kernel_convs[idx]: + kernel_feat = kernel_layer(kernel_feat) + kernel_pred = self.rtm_kernel[idx](kernel_feat) + + for reg_layer in self.reg_convs[idx]: + reg_feat = reg_layer(reg_feat) + reg_dist = self.rtm_reg[idx](reg_feat) + + cls_scores.append(cls_score) + bbox_preds.append(reg_dist) + kernel_preds.append(kernel_pred) + return tuple(cls_scores), tuple(bbox_preds), tuple( + kernel_preds), mask_feat + + +@MODELS.register_module() +class RTMDetInsSepBNHead(RTMDetHead): + """RTMDet Instance Segmentation head. + + Args: + head_module(ConfigType): Base module used for RTMDetInsSepBNHead + prior_generator: Points generator feature maps in + 2D points-based detectors. + bbox_coder (:obj:`ConfigDict` or dict): Config of bbox coder. + loss_cls (:obj:`ConfigDict` or dict): Config of classification loss. + loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss. + loss_mask (:obj:`ConfigDict` or dict): Config of mask loss. + train_cfg (:obj:`ConfigDict` or dict, optional): Training config of + anchor head. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of + anchor head. Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. 
+ """ + + def __init__(self, + head_module: ConfigType, + prior_generator: ConfigType = dict( + type='mmdet.MlvlPointGenerator', + offset=0, + strides=[8, 16, 32]), + bbox_coder: ConfigType = dict(type='DistancePointBBoxCoder'), + loss_cls: ConfigType = dict( + type='mmdet.QualityFocalLoss', + use_sigmoid=True, + beta=2.0, + loss_weight=1.0), + loss_bbox: ConfigType = dict( + type='mmdet.GIoULoss', loss_weight=2.0), + loss_mask=dict( + type='mmdet.DiceLoss', + loss_weight=2.0, + eps=5e-6, + reduction='mean'), + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + init_cfg: OptMultiConfig = None): + + super().__init__( + head_module=head_module, + prior_generator=prior_generator, + bbox_coder=bbox_coder, + loss_cls=loss_cls, + loss_bbox=loss_bbox, + train_cfg=train_cfg, + test_cfg=test_cfg, + init_cfg=init_cfg) + + self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False) + if isinstance(self.head_module, RTMDetInsSepBNHeadModule): + assert self.use_sigmoid_cls == self.head_module.use_sigmoid_cls + self.loss_mask = MODELS.build(loss_mask) + + def predict_by_feat(self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + kernel_preds: List[Tensor], + mask_feats: Tensor, + score_factors: Optional[List[Tensor]] = None, + batch_img_metas: Optional[List[dict]] = None, + cfg: Optional[ConfigDict] = None, + rescale: bool = True, + with_nms: bool = True) -> List[InstanceData]: + """Transform a batch of output features extracted from the head into + bbox results. + + Note: When score_factors is not None, the cls_scores are + usually multiplied by it then obtain the real score used in NMS. + + Args: + cls_scores (list[Tensor]): Classification scores for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * 4, H, W). + kernel_preds (list[Tensor]): Kernel predictions of dynamic + convs for all scale levels, each is a 4D-tensor, has shape + (batch_size, num_params, H, W). + mask_feats (Tensor): Mask prototype features extracted from the + mask head, has shape (batch_size, num_prototypes, H, W). + score_factors (list[Tensor], optional): Score factor for + all scale level, each is a 4D-tensor, has shape + (batch_size, num_priors * 1, H, W). Defaults to None. + batch_img_metas (list[dict], Optional): Batch image meta info. + Defaults to None. + cfg (ConfigDict, optional): Test / postprocessing + configuration, if None, test_cfg would be used. + Defaults to None. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + with_nms (bool): If True, do nms before return boxes. + Defaults to True. + + Returns: + list[:obj:`InstanceData`]: Object detection and instance + segmentation results of each image after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + - masks (Tensor): Has a shape (num_instances, h, w). 
+ """ + cfg = self.test_cfg if cfg is None else cfg + cfg = copy.deepcopy(cfg) + + multi_label = cfg.multi_label + multi_label &= self.num_classes > 1 + cfg.multi_label = multi_label + + num_imgs = len(batch_img_metas) + featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores] + + # If the shape does not change, use the previous mlvl_priors + if featmap_sizes != self.featmap_sizes: + self.mlvl_priors = self.prior_generator.grid_priors( + featmap_sizes, + dtype=cls_scores[0].dtype, + device=cls_scores[0].device, + with_stride=True) + self.featmap_sizes = featmap_sizes + flatten_priors = torch.cat(self.mlvl_priors) + + mlvl_strides = [ + flatten_priors.new_full( + (featmap_size.numel() * self.num_base_priors, ), stride) for + featmap_size, stride in zip(featmap_sizes, self.featmap_strides) + ] + flatten_stride = torch.cat(mlvl_strides) + + # flatten cls_scores, bbox_preds + flatten_cls_scores = [ + cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.num_classes) + for cls_score in cls_scores + ] + flatten_bbox_preds = [ + bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) + for bbox_pred in bbox_preds + ] + flatten_kernel_preds = [ + kernel_pred.permute(0, 2, 3, + 1).reshape(num_imgs, -1, + self.head_module.num_gen_params) + for kernel_pred in kernel_preds + ] + + flatten_cls_scores = torch.cat(flatten_cls_scores, dim=1).sigmoid() + flatten_bbox_preds = torch.cat(flatten_bbox_preds, dim=1) + flatten_decoded_bboxes = self.bbox_coder.decode( + flatten_priors[..., :2].unsqueeze(0), flatten_bbox_preds, + flatten_stride) + + flatten_kernel_preds = torch.cat(flatten_kernel_preds, dim=1) + + results_list = [] + for (bboxes, scores, kernel_pred, mask_feat, + img_meta) in zip(flatten_decoded_bboxes, flatten_cls_scores, + flatten_kernel_preds, mask_feats, + batch_img_metas): + ori_shape = img_meta['ori_shape'] + scale_factor = img_meta['scale_factor'] + if 'pad_param' in img_meta: + pad_param = img_meta['pad_param'] + else: + pad_param = None + + score_thr = cfg.get('score_thr', -1) + if scores.shape[0] == 0: + empty_results = InstanceData() + empty_results.bboxes = bboxes + empty_results.scores = scores[:, 0] + empty_results.labels = scores[:, 0].int() + h, w = ori_shape[:2] if rescale else img_meta['img_shape'][:2] + empty_results.masks = torch.zeros( + size=(0, h, w), dtype=torch.bool, device=bboxes.device) + results_list.append(empty_results) + continue + + nms_pre = cfg.get('nms_pre', 100000) + if cfg.multi_label is False: + scores, labels = scores.max(1, keepdim=True) + scores, _, keep_idxs, results = filter_scores_and_topk( + scores, + score_thr, + nms_pre, + results=dict( + labels=labels[:, 0], + kernel_pred=kernel_pred, + priors=flatten_priors)) + labels = results['labels'] + kernel_pred = results['kernel_pred'] + priors = results['priors'] + else: + out = filter_scores_and_topk( + scores, + score_thr, + nms_pre, + results=dict( + kernel_pred=kernel_pred, priors=flatten_priors)) + scores, labels, keep_idxs, filtered_results = out + kernel_pred = filtered_results['kernel_pred'] + priors = filtered_results['priors'] + + results = InstanceData( + scores=scores, + labels=labels, + bboxes=bboxes[keep_idxs], + kernels=kernel_pred, + priors=priors) + + if rescale: + if pad_param is not None: + results.bboxes -= results.bboxes.new_tensor([ + pad_param[2], pad_param[0], pad_param[2], pad_param[0] + ]) + results.bboxes /= results.bboxes.new_tensor( + scale_factor).repeat((1, 2)) + + if cfg.get('yolox_style', False): + # do not need max_per_img + cfg.max_per_img = len(results) + + 
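+            # `_bbox_mask_post_process` below runs NMS on the kept boxes and
+            # then builds per-instance masks from the dynamic-conv kernels;
+            # the surviving boxes are clamped to the original image afterwards.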
results = self._bbox_mask_post_process(
+                results=results,
+                mask_feat=mask_feat,
+                cfg=cfg,
+                rescale_bbox=False,
+                rescale_mask=rescale,
+                with_nms=with_nms,
+                pad_param=pad_param,
+                img_meta=img_meta)
+            results.bboxes[:, 0::2].clamp_(0, ori_shape[1])
+            results.bboxes[:, 1::2].clamp_(0, ori_shape[0])
+
+            results_list.append(results)
+        return results_list
+
+    def _bbox_mask_post_process(
+            self,
+            results: InstanceData,
+            mask_feat: Tensor,
+            cfg: ConfigDict,
+            rescale_bbox: bool = False,
+            rescale_mask: bool = True,
+            with_nms: bool = True,
+            pad_param: Optional[np.ndarray] = None,
+            img_meta: Optional[dict] = None) -> InstanceData:
+        """Bbox and mask post-processing method.
+
+        The boxes are rescaled to the original image scale and NMS is
+        applied. Usually ``with_nms`` is False when used for aug test.
+
+        Args:
+            results (:obj:`InstanceData`): Detection instance results,
+                each item has shape (num_bboxes, ).
+            mask_feat (Tensor): Mask prototype features extracted from the
+                mask head, has shape (batch_size, num_prototypes, H, W).
+            cfg (ConfigDict): Test / postprocessing configuration,
+                if None, test_cfg would be used.
+            rescale_bbox (bool): If True, return boxes in original image space.
+                Defaults to False.
+            rescale_mask (bool): If True, return masks in original image space.
+                Defaults to True.
+            with_nms (bool): If True, do nms before return boxes.
+                Defaults to True.
+            pad_param (np.ndarray, optional): Padding parameters arranged as
+                (top, bottom, left, right). Defaults to None.
+            img_meta (dict, optional): Image meta info. Defaults to None.
+
+        Returns:
+            :obj:`InstanceData`: Detection results of each image
+            after the post process.
+            Each item usually contains following keys.
+
+            - scores (Tensor): Classification scores, has a shape
+              (num_instance, )
+            - labels (Tensor): Labels of bboxes, has a shape
+              (num_instances, ).
+            - bboxes (Tensor): Has a shape (num_instances, 4),
+              the last dimension 4 arranged as (x1, y1, x2, y2).
+            - masks (Tensor): Has a shape (num_instances, h, w).
+        """
+        if rescale_bbox:
+            assert img_meta.get('scale_factor') is not None
+            scale_factor = [1 / s for s in img_meta['scale_factor']]
+            results.bboxes = scale_boxes(results.bboxes, scale_factor)
+
+        if hasattr(results, 'score_factors'):
+            # TODO: Add sqrt operation in order to be consistent with
+            # the paper.
+ score_factors = results.pop('score_factors') + results.scores = results.scores * score_factors + + # filter small size bboxes + if cfg.get('min_bbox_size', -1) >= 0: + w, h = get_box_wh(results.bboxes) + valid_mask = (w > cfg.min_bbox_size) & (h > cfg.min_bbox_size) + if not valid_mask.all(): + results = results[valid_mask] + + # TODO: deal with `with_nms` and `nms_cfg=None` in test_cfg + assert with_nms, 'with_nms must be True for RTMDet-Ins' + if results.bboxes.numel() > 0: + bboxes = get_box_tensor(results.bboxes) + det_bboxes, keep_idxs = batched_nms(bboxes, results.scores, + results.labels, cfg.nms) + results = results[keep_idxs] + # some nms would reweight the score, such as softnms + results.scores = det_bboxes[:, -1] + results = results[:cfg.max_per_img] + + # process masks + mask_logits = self._mask_predict_by_feat(mask_feat, + results.kernels, + results.priors) + + stride = self.prior_generator.strides[0][0] + mask_logits = F.interpolate( + mask_logits.unsqueeze(0), scale_factor=stride, mode='bilinear') + if rescale_mask: + # TODO: When use mmdet.Resize or mmdet.Pad, will meet bug + # Use img_meta to crop and resize + ori_h, ori_w = img_meta['ori_shape'][:2] + if isinstance(pad_param, np.ndarray): + pad_param = pad_param.astype(np.int32) + crop_y1, crop_y2 = pad_param[ + 0], mask_logits.shape[-2] - pad_param[1] + crop_x1, crop_x2 = pad_param[ + 2], mask_logits.shape[-1] - pad_param[3] + mask_logits = mask_logits[..., crop_y1:crop_y2, + crop_x1:crop_x2] + mask_logits = F.interpolate( + mask_logits, + size=[ori_h, ori_w], + mode='bilinear', + align_corners=False) + + masks = mask_logits.sigmoid().squeeze(0) + masks = masks > cfg.mask_thr_binary + results.masks = masks + else: + h, w = img_meta['ori_shape'][:2] if rescale_mask else img_meta[ + 'img_shape'][:2] + results.masks = torch.zeros( + size=(results.bboxes.shape[0], h, w), + dtype=torch.bool, + device=results.bboxes.device) + return results + + def _mask_predict_by_feat(self, mask_feat: Tensor, kernels: Tensor, + priors: Tensor) -> Tensor: + """Generate mask logits from mask features with dynamic convs. + + Args: + mask_feat (Tensor): Mask prototype features. + Has shape (num_prototypes, H, W). + kernels (Tensor): Kernel parameters for each instance. + Has shape (num_instance, num_params) + priors (Tensor): Center priors for each instance. + Has shape (num_instance, 4). + Returns: + Tensor: Instance segmentation masks for each instance. + Has shape (num_instance, H, W). 
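+
+        Shape walk-through with the default module settings
+        (``num_prototypes=8``, ``dyconv_channels=8``, ``num_dyconvs=3``)::
+
+            relative_coord : (num_inst,  2, H, W)
+            prototypes     : (num_inst,  8, H, W)   # mask_feat repeated
+            concat input   : (num_inst, 10, H, W)
+            dyconv 1       : 10 -> 8 channels (80 weights + 8 biases)
+            dyconv 2       :  8 -> 8 channels (64 weights + 8 biases)
+            dyconv 3       :  8 -> 1 channel  ( 8 weights + 1 bias)
+
+        i.e. 169 dynamic parameters per instance, which is exactly
+        ``self.head_module.num_gen_params``.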
+ """ + num_inst = kernels.shape[0] + h, w = mask_feat.size()[-2:] + if num_inst < 1: + return torch.empty( + size=(num_inst, h, w), + dtype=mask_feat.dtype, + device=mask_feat.device) + if len(mask_feat.shape) < 4: + mask_feat.unsqueeze(0) + + coord = self.prior_generator.single_level_grid_priors( + (h, w), level_idx=0, device=mask_feat.device).reshape(1, -1, 2) + num_inst = priors.shape[0] + points = priors[:, :2].reshape(-1, 1, 2) + strides = priors[:, 2:].reshape(-1, 1, 2) + relative_coord = (points - coord).permute(0, 2, 1) / ( + strides[..., 0].reshape(-1, 1, 1) * 8) + relative_coord = relative_coord.reshape(num_inst, 2, h, w) + + mask_feat = torch.cat( + [relative_coord, + mask_feat.repeat(num_inst, 1, 1, 1)], dim=1) + weights, biases = self.parse_dynamic_params(kernels) + + n_layers = len(weights) + x = mask_feat.reshape(1, -1, h, w) + for i, (weight, bias) in enumerate(zip(weights, biases)): + x = F.conv2d( + x, weight, bias=bias, stride=1, padding=0, groups=num_inst) + if i < n_layers - 1: + x = F.relu(x) + x = x.reshape(num_inst, h, w) + return x + + def parse_dynamic_params(self, flatten_kernels: Tensor) -> tuple: + """split kernel head prediction to conv weight and bias.""" + n_inst = flatten_kernels.size(0) + n_layers = len(self.head_module.weight_nums) + params_splits = list( + torch.split_with_sizes( + flatten_kernels, + self.head_module.weight_nums + self.head_module.bias_nums, + dim=1)) + weight_splits = params_splits[:n_layers] + bias_splits = params_splits[n_layers:] + for i in range(n_layers): + if i < n_layers - 1: + weight_splits[i] = weight_splits[i].reshape( + n_inst * self.head_module.dyconv_channels, -1, 1, 1) + bias_splits[i] = bias_splits[i].reshape( + n_inst * self.head_module.dyconv_channels) + else: + weight_splits[i] = weight_splits[i].reshape(n_inst, -1, 1, 1) + bias_splits[i] = bias_splits[i].reshape(n_inst) + + return weight_splits, bias_splits + + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + raise NotImplementedError diff --git a/mmyolo/models/dense_heads/rtmdet_rotated_head.py b/mmyolo/models/dense_heads/rtmdet_rotated_head.py new file mode 100644 index 0000000000000000000000000000000000000000..1428b4fd05065e3dba764313febc46d6125408ac --- /dev/null +++ b/mmyolo/models/dense_heads/rtmdet_rotated_head.py @@ -0,0 +1,641 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import copy +import warnings +from typing import List, Optional, Sequence, Tuple + +import torch +import torch.nn as nn +from mmdet.models.utils import filter_scores_and_topk +from mmdet.structures.bbox import HorizontalBoxes, distance2bbox +from mmdet.structures.bbox.transforms import bbox_cxcywh_to_xyxy, scale_boxes +from mmdet.utils import (ConfigType, InstanceList, OptConfigType, + OptInstanceList, OptMultiConfig, reduce_mean) +from mmengine.config import ConfigDict +from mmengine.model import normal_init +from mmengine.structures import InstanceData +from torch import Tensor + +from mmyolo.registry import MODELS, TASK_UTILS +from ..utils import gt_instances_preprocess +from .rtmdet_head import RTMDetHead, RTMDetSepBNHeadModule + +try: + from mmrotate.structures.bbox import RotatedBoxes, distance2obb + MMROTATE_AVAILABLE = True +except ImportError: + RotatedBoxes = None + distance2obb = None + MMROTATE_AVAILABLE = False + + +@MODELS.register_module() +class RTMDetRotatedSepBNHeadModule(RTMDetSepBNHeadModule): + """Detection Head Module of RTMDet-R. + + Compared with RTMDet Detection Head Module, RTMDet-R adds + a conv for angle prediction. + An `angle_out_dim` arg is added, which is generated by the + angle_coder module and controls the angle pred dim. + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + num_base_priors (int): The number of priors (points) at a point + on the feature grid. Defaults to 1. + feat_channels (int): Number of hidden channels. Used in child classes. + Defaults to 256 + stacked_convs (int): Number of stacking convs of the head. + Defaults to 2. + featmap_strides (Sequence[int]): Downsample factor of each feature map. + Defaults to (8, 16, 32). + share_conv (bool): Whether to share conv layers between stages. + Defaults to True. + pred_kernel_size (int): Kernel size of ``nn.Conv2d``. Defaults to 1. + angle_out_dim (int): Encoded length of angle, will passed by head. + Defaults to 1. + conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + convolution layer. Defaults to None. + norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization + layer. Defaults to ``dict(type='BN')``. + act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer. + Default: dict(type='SiLU', inplace=True). + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. 
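+
+    With the defaults (``num_base_priors=1``, ``angle_out_dim=1``), each
+    ``rtm_ang`` conv produces a ``(B, 1, H_i, W_i)`` angle map per level,
+    alongside the ``(B, num_classes, H_i, W_i)`` scores and the
+    ``(B, 4, H_i, W_i)`` tblr distances. When built through
+    ``RTMDetRotatedHead``, ``angle_out_dim`` is overwritten by
+    ``angle_coder.encode_size`` and therefore does not need to be set
+    manually.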
+ """ + + def __init__( + self, + num_classes: int, + in_channels: int, + widen_factor: float = 1.0, + num_base_priors: int = 1, + feat_channels: int = 256, + stacked_convs: int = 2, + featmap_strides: Sequence[int] = [8, 16, 32], + share_conv: bool = True, + pred_kernel_size: int = 1, + angle_out_dim: int = 1, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict(type='BN'), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None, + ): + self.angle_out_dim = angle_out_dim + super().__init__( + num_classes=num_classes, + in_channels=in_channels, + widen_factor=widen_factor, + num_base_priors=num_base_priors, + feat_channels=feat_channels, + stacked_convs=stacked_convs, + featmap_strides=featmap_strides, + share_conv=share_conv, + pred_kernel_size=pred_kernel_size, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + init_cfg=init_cfg) + + def _init_layers(self): + """Initialize layers of the head.""" + super()._init_layers() + self.rtm_ang = nn.ModuleList() + for _ in range(len(self.featmap_strides)): + self.rtm_ang.append( + nn.Conv2d( + self.feat_channels, + self.num_base_priors * self.angle_out_dim, + self.pred_kernel_size, + padding=self.pred_kernel_size // 2)) + + def init_weights(self) -> None: + """Initialize weights of the head.""" + # Use prior in model initialization to improve stability + super().init_weights() + for rtm_ang in self.rtm_ang: + normal_init(rtm_ang, std=0.01) + + def forward(self, feats: Tuple[Tensor, ...]) -> tuple: + """Forward features from the upstream network. + + Args: + feats (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: Usually a tuple of classification scores and bbox prediction + - cls_scores (list[Tensor]): Classification scores for all scale + levels, each is a 4D-tensor, the channels number is + num_base_priors * num_classes. + - bbox_preds (list[Tensor]): Box energies / deltas for all scale + levels, each is a 4D-tensor, the channels number is + num_base_priors * 4. + - angle_preds (list[Tensor]): Angle prediction for all scale + levels, each is a 4D-tensor, the channels number is + num_base_priors * angle_out_dim. + """ + + cls_scores = [] + bbox_preds = [] + angle_preds = [] + for idx, x in enumerate(feats): + cls_feat = x + reg_feat = x + + for cls_layer in self.cls_convs[idx]: + cls_feat = cls_layer(cls_feat) + cls_score = self.rtm_cls[idx](cls_feat) + + for reg_layer in self.reg_convs[idx]: + reg_feat = reg_layer(reg_feat) + + reg_dist = self.rtm_reg[idx](reg_feat) + angle_pred = self.rtm_ang[idx](reg_feat) + + cls_scores.append(cls_score) + bbox_preds.append(reg_dist) + angle_preds.append(angle_pred) + return tuple(cls_scores), tuple(bbox_preds), tuple(angle_preds) + + +@MODELS.register_module() +class RTMDetRotatedHead(RTMDetHead): + """RTMDet-R head. + + Compared with RTMDetHead, RTMDetRotatedHead add some args to support + rotated object detection. + + - `angle_version` used to limit angle_range during training. + - `angle_coder` used to encode and decode angle, which is similar + to bbox_coder. + - `use_hbbox_loss` and `loss_angle` allow custom regression loss + calculation for rotated box. + + There are three combination options for regression: + + 1. `use_hbbox_loss=False` and loss_angle is None. + + .. code:: text + + bbox_pred────(tblr)───┐ + ▼ + angle_pred decode──►rbox_pred──(xywha)─►loss_bbox + │ ▲ + └────►decode──(a)─┘ + + 2. `use_hbbox_loss=False` and loss_angle is specified. + A angle loss is added on angle_pred. + + .. 
code:: text + + bbox_pred────(tblr)───┐ + ▼ + angle_pred decode──►rbox_pred──(xywha)─►loss_bbox + │ ▲ + ├────►decode──(a)─┘ + │ + └───────────────────────────────────────────►loss_angle + + 3. `use_hbbox_loss=True` and loss_angle is specified. + In this case the loss_angle must be set. + + .. code:: text + + bbox_pred──(tblr)──►decode──►hbox_pred──(xyxy)──►loss_bbox + + angle_pred──────────────────────────────────────►loss_angle + + - There's a `decoded_with_angle` flag in test_cfg, which is similar + to training process. + + When `decoded_with_angle=True`: + + .. code:: text + + bbox_pred────(tblr)───┐ + ▼ + angle_pred decode──(xywha)──►rbox_pred + │ ▲ + └────►decode──(a)─┘ + + When `decoded_with_angle=False`: + + .. code:: text + + bbox_pred──(tblr)─►decode + │ (xyxy) + ▼ + format───(xywh)──►concat──(xywha)──►rbox_pred + ▲ + angle_pred────────►decode────(a)───────┘ + + Args: + head_module(ConfigType): Base module used for RTMDetRotatedHead. + prior_generator: Points generator feature maps in + 2D points-based detectors. + bbox_coder (:obj:`ConfigDict` or dict): Config of bbox coder. + loss_cls (:obj:`ConfigDict` or dict): Config of classification loss. + loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss. + angle_version (str): Angle representations. Defaults to 'le90'. + use_hbbox_loss (bool): If true, use horizontal bbox loss and + loss_angle should not be None. Default to False. + angle_coder (:obj:`ConfigDict` or dict): Config of angle coder. + loss_angle (:obj:`ConfigDict` or dict, optional): Config of angle loss. + train_cfg (:obj:`ConfigDict` or dict, optional): Training config of + anchor head. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of + anchor head. Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. 
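+
+    A hedged sketch of option 3 above (horizontal-box loss plus an explicit
+    angle loss). The loss types, weights and class count are placeholders,
+    not a tuned configuration::
+
+        bbox_head = dict(
+            type='RTMDetRotatedHead',
+            head_module=dict(
+                type='RTMDetRotatedSepBNHeadModule',
+                num_classes=15,
+                in_channels=256),
+            use_hbbox_loss=True,
+            loss_bbox=dict(type='mmdet.GIoULoss', loss_weight=2.0),
+            angle_coder=dict(type='mmrotate.PseudoAngleCoder'),
+            loss_angle=dict(type='mmdet.L1Loss', loss_weight=0.2))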
+ """ + + def __init__( + self, + head_module: ConfigType, + prior_generator: ConfigType = dict( + type='mmdet.MlvlPointGenerator', strides=[8, 16, 32], + offset=0), + bbox_coder: ConfigType = dict(type='DistanceAnglePointCoder'), + loss_cls: ConfigType = dict( + type='mmdet.QualityFocalLoss', + use_sigmoid=True, + beta=2.0, + loss_weight=1.0), + loss_bbox: ConfigType = dict( + type='mmrotate.RotatedIoULoss', mode='linear', + loss_weight=2.0), + angle_version: str = 'le90', + use_hbbox_loss: bool = False, + angle_coder: ConfigType = dict(type='mmrotate.PseudoAngleCoder'), + loss_angle: OptConfigType = None, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + init_cfg: OptMultiConfig = None): + if not MMROTATE_AVAILABLE: + raise ImportError( + 'Please run "mim install -r requirements/mmrotate.txt" ' + 'to install mmrotate first for rotated detection.') + + self.angle_version = angle_version + self.use_hbbox_loss = use_hbbox_loss + if self.use_hbbox_loss: + assert loss_angle is not None, \ + ('When use hbbox loss, loss_angle needs to be specified') + self.angle_coder = TASK_UTILS.build(angle_coder) + self.angle_out_dim = self.angle_coder.encode_size + if head_module.get('angle_out_dim') is not None: + warnings.warn('angle_out_dim will be overridden by angle_coder ' + 'and does not need to be set manually') + + head_module['angle_out_dim'] = self.angle_out_dim + super().__init__( + head_module=head_module, + prior_generator=prior_generator, + bbox_coder=bbox_coder, + loss_cls=loss_cls, + loss_bbox=loss_bbox, + train_cfg=train_cfg, + test_cfg=test_cfg, + init_cfg=init_cfg) + + if loss_angle is not None: + self.loss_angle = MODELS.build(loss_angle) + else: + self.loss_angle = None + + def predict_by_feat(self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + angle_preds: List[Tensor], + objectnesses: Optional[List[Tensor]] = None, + batch_img_metas: Optional[List[dict]] = None, + cfg: Optional[ConfigDict] = None, + rescale: bool = True, + with_nms: bool = True) -> List[InstanceData]: + """Transform a batch of output features extracted by the head into bbox + results. + + Args: + cls_scores (list[Tensor]): Classification scores for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * 4, H, W). + angle_preds (list[Tensor]): Box angle for each scale level + with shape (N, num_points * angle_dim, H, W) + objectnesses (list[Tensor], Optional): Score factor for + all scale level, each is a 4D-tensor, has shape + (batch_size, 1, H, W). + batch_img_metas (list[dict], Optional): Batch image meta info. + Defaults to None. + cfg (ConfigDict, optional): Test / postprocessing + configuration, if None, test_cfg would be used. + Defaults to None. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + with_nms (bool): If True, do nms before return boxes. + Defaults to True. + + Returns: + list[:obj:`InstanceData`]: Object detection results of each image + after the post process. Each item usually contains following keys. + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 5), + the last dimension 4 arrange as (x, y, w, h, angle). 
+ """ + assert len(cls_scores) == len(bbox_preds) + if objectnesses is None: + with_objectnesses = False + else: + with_objectnesses = True + assert len(cls_scores) == len(objectnesses) + + cfg = self.test_cfg if cfg is None else cfg + cfg = copy.deepcopy(cfg) + + multi_label = cfg.multi_label + multi_label &= self.num_classes > 1 + cfg.multi_label = multi_label + + # Whether to decode rbox with angle. + # different setting lead to different final results. + # Defaults to True. + decode_with_angle = cfg.get('decode_with_angle', True) + + num_imgs = len(batch_img_metas) + featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores] + + # If the shape does not change, use the previous mlvl_priors + if featmap_sizes != self.featmap_sizes: + self.mlvl_priors = self.prior_generator.grid_priors( + featmap_sizes, + dtype=cls_scores[0].dtype, + device=cls_scores[0].device) + self.featmap_sizes = featmap_sizes + flatten_priors = torch.cat(self.mlvl_priors) + + mlvl_strides = [ + flatten_priors.new_full( + (featmap_size.numel() * self.num_base_priors, ), stride) for + featmap_size, stride in zip(featmap_sizes, self.featmap_strides) + ] + flatten_stride = torch.cat(mlvl_strides) + + # flatten cls_scores, bbox_preds and objectness + flatten_cls_scores = [ + cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.num_classes) + for cls_score in cls_scores + ] + flatten_bbox_preds = [ + bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) + for bbox_pred in bbox_preds + ] + flatten_angle_preds = [ + angle_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.angle_out_dim) + for angle_pred in angle_preds + ] + + flatten_cls_scores = torch.cat(flatten_cls_scores, dim=1).sigmoid() + flatten_bbox_preds = torch.cat(flatten_bbox_preds, dim=1) + flatten_angle_preds = torch.cat(flatten_angle_preds, dim=1) + flatten_angle_preds = self.angle_coder.decode( + flatten_angle_preds, keepdim=True) + + if decode_with_angle: + flatten_rbbox_preds = torch.cat( + [flatten_bbox_preds, flatten_angle_preds], dim=-1) + flatten_decoded_bboxes = self.bbox_coder.decode( + flatten_priors[None], flatten_rbbox_preds, flatten_stride) + else: + flatten_decoded_hbboxes = self.bbox_coder.decode( + flatten_priors[None], flatten_bbox_preds, flatten_stride) + flatten_decoded_hbboxes = HorizontalBoxes.xyxy_to_cxcywh( + flatten_decoded_hbboxes) + flatten_decoded_bboxes = torch.cat( + [flatten_decoded_hbboxes, flatten_angle_preds], dim=-1) + + if with_objectnesses: + flatten_objectness = [ + objectness.permute(0, 2, 3, 1).reshape(num_imgs, -1) + for objectness in objectnesses + ] + flatten_objectness = torch.cat(flatten_objectness, dim=1).sigmoid() + else: + flatten_objectness = [None for _ in range(num_imgs)] + + results_list = [] + for (bboxes, scores, objectness, + img_meta) in zip(flatten_decoded_bboxes, flatten_cls_scores, + flatten_objectness, batch_img_metas): + scale_factor = img_meta['scale_factor'] + if 'pad_param' in img_meta: + pad_param = img_meta['pad_param'] + else: + pad_param = None + + score_thr = cfg.get('score_thr', -1) + # yolox_style does not require the following operations + if objectness is not None and score_thr > 0 and not cfg.get( + 'yolox_style', False): + conf_inds = objectness > score_thr + bboxes = bboxes[conf_inds, :] + scores = scores[conf_inds, :] + objectness = objectness[conf_inds] + + if objectness is not None: + # conf = obj_conf * cls_conf + scores *= objectness[:, None] + + if scores.shape[0] == 0: + empty_results = InstanceData() + empty_results.bboxes = RotatedBoxes(bboxes) + 
empty_results.scores = scores[:, 0] + empty_results.labels = scores[:, 0].int() + results_list.append(empty_results) + continue + + nms_pre = cfg.get('nms_pre', 100000) + if cfg.multi_label is False: + scores, labels = scores.max(1, keepdim=True) + scores, _, keep_idxs, results = filter_scores_and_topk( + scores, + score_thr, + nms_pre, + results=dict(labels=labels[:, 0])) + labels = results['labels'] + else: + scores, labels, keep_idxs, _ = filter_scores_and_topk( + scores, score_thr, nms_pre) + + results = InstanceData( + scores=scores, + labels=labels, + bboxes=RotatedBoxes(bboxes[keep_idxs])) + + if rescale: + if pad_param is not None: + results.bboxes.translate_([-pad_param[2], -pad_param[0]]) + + scale_factor = [1 / s for s in img_meta['scale_factor']] + results.bboxes = scale_boxes(results.bboxes, scale_factor) + + if cfg.get('yolox_style', False): + # do not need max_per_img + cfg.max_per_img = len(results) + + results = self._bbox_post_process( + results=results, + cfg=cfg, + rescale=False, + with_nms=with_nms, + img_meta=img_meta) + + results_list.append(results) + return results_list + + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + angle_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Compute losses of the head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level + Has shape (N, num_anchors * num_classes, H, W) + bbox_preds (list[Tensor]): Decoded box for each scale + level with shape (N, num_anchors * 4, H, W) in + [tl_x, tl_y, br_x, br_y] format. + angle_preds (list[Tensor]): Angle prediction for each scale + level with shape (N, num_anchors * angle_out_dim, H, W). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. 
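+
+        Note that the raw ``bbox_preds`` are tblr distances expressed in
+        units of the feature-map stride; they are multiplied by the
+        per-prior stride and decoded with ``distance2obb`` (or
+        ``distance2bbox`` when ``use_hbbox_loss=True``) before the assigner
+        and the box loss are applied.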
+ """ + num_imgs = len(batch_img_metas) + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.prior_generator.num_levels + + gt_info = gt_instances_preprocess(batch_gt_instances, num_imgs) + gt_labels = gt_info[:, :, :1] + gt_bboxes = gt_info[:, :, 1:] # xywha + pad_bbox_flag = (gt_bboxes.sum(-1, keepdim=True) > 0).float() + + device = cls_scores[0].device + + # If the shape does not equal, generate new one + if featmap_sizes != self.featmap_sizes_train: + self.featmap_sizes_train = featmap_sizes + mlvl_priors_with_stride = self.prior_generator.grid_priors( + featmap_sizes, device=device, with_stride=True) + self.flatten_priors_train = torch.cat( + mlvl_priors_with_stride, dim=0) + + flatten_cls_scores = torch.cat([ + cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.cls_out_channels) + for cls_score in cls_scores + ], 1).contiguous() + + flatten_tblrs = torch.cat([ + bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) + for bbox_pred in bbox_preds + ], 1) + flatten_tblrs = flatten_tblrs * self.flatten_priors_train[..., -1, + None] + flatten_angles = torch.cat([ + angle_pred.permute(0, 2, 3, 1).reshape( + num_imgs, -1, self.angle_out_dim) for angle_pred in angle_preds + ], 1) + flatten_decoded_angle = self.angle_coder.decode( + flatten_angles, keepdim=True) + flatten_tblra = torch.cat([flatten_tblrs, flatten_decoded_angle], + dim=-1) + flatten_rbboxes = distance2obb( + self.flatten_priors_train[..., :2], + flatten_tblra, + angle_version=self.angle_version) + if self.use_hbbox_loss: + flatten_hbboxes = distance2bbox(self.flatten_priors_train[..., :2], + flatten_tblrs) + + assigned_result = self.assigner(flatten_rbboxes.detach(), + flatten_cls_scores.detach(), + self.flatten_priors_train, gt_labels, + gt_bboxes, pad_bbox_flag) + + labels = assigned_result['assigned_labels'].reshape(-1) + label_weights = assigned_result['assigned_labels_weights'].reshape(-1) + bbox_targets = assigned_result['assigned_bboxes'].reshape(-1, 5) + assign_metrics = assigned_result['assign_metrics'].reshape(-1) + cls_preds = flatten_cls_scores.reshape(-1, self.num_classes) + + # FG cat_id: [0, num_classes -1], BG cat_id: num_classes + bg_class_ind = self.num_classes + pos_inds = ((labels >= 0) + & (labels < bg_class_ind)).nonzero().squeeze(1) + avg_factor = reduce_mean(assign_metrics.sum()).clamp_(min=1).item() + + loss_cls = self.loss_cls( + cls_preds, (labels, assign_metrics), + label_weights, + avg_factor=avg_factor) + + pos_bbox_targets = bbox_targets[pos_inds] + + if self.use_hbbox_loss: + bbox_preds = flatten_hbboxes.reshape(-1, 4) + pos_bbox_targets = bbox_cxcywh_to_xyxy(pos_bbox_targets[:, :4]) + else: + bbox_preds = flatten_rbboxes.reshape(-1, 5) + angle_preds = flatten_angles.reshape(-1, self.angle_out_dim) + + if len(pos_inds) > 0: + loss_bbox = self.loss_bbox( + bbox_preds[pos_inds], + pos_bbox_targets, + weight=assign_metrics[pos_inds], + avg_factor=avg_factor) + loss_angle = angle_preds.sum() * 0 + if self.loss_angle is not None: + pos_angle_targets = bbox_targets[pos_inds][:, 4:5] + pos_angle_targets = self.angle_coder.encode(pos_angle_targets) + loss_angle = self.loss_angle( + angle_preds[pos_inds], + pos_angle_targets, + weight=assign_metrics[pos_inds], + avg_factor=avg_factor) + else: + loss_bbox = bbox_preds.sum() * 0 + loss_angle = angle_preds.sum() * 0 + + losses = dict() + losses['loss_cls'] = loss_cls + losses['loss_bbox'] = loss_bbox + if self.loss_angle is not None: + losses['loss_angle'] = loss_angle + + return losses diff --git 
a/mmyolo/models/dense_heads/yolov5_head.py b/mmyolo/models/dense_heads/yolov5_head.py new file mode 100644 index 0000000000000000000000000000000000000000..c49d0851897c36fd68d9de1c6097ae58b532024f --- /dev/null +++ b/mmyolo/models/dense_heads/yolov5_head.py @@ -0,0 +1,890 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import math +from typing import List, Optional, Sequence, Tuple, Union + +import torch +import torch.nn as nn +from mmdet.models.dense_heads.base_dense_head import BaseDenseHead +from mmdet.models.utils import filter_scores_and_topk, multi_apply +from mmdet.structures.bbox import bbox_overlaps +from mmdet.utils import (ConfigType, OptConfigType, OptInstanceList, + OptMultiConfig) +from mmengine.config import ConfigDict +from mmengine.dist import get_dist_info +from mmengine.logging import print_log +from mmengine.model import BaseModule +from mmengine.structures import InstanceData +from torch import Tensor + +from mmyolo.registry import MODELS, TASK_UTILS +from ..utils import make_divisible + + +def get_prior_xy_info(index: int, num_base_priors: int, + featmap_sizes: int) -> Tuple[int, int, int]: + """Get prior index and xy index in feature map by flatten index.""" + _, featmap_w = featmap_sizes + priors = index % num_base_priors + xy_index = index // num_base_priors + grid_y = xy_index // featmap_w + grid_x = xy_index % featmap_w + return priors, grid_x, grid_y + + +@MODELS.register_module() +class YOLOv5HeadModule(BaseModule): + """YOLOv5Head head module used in `YOLOv5`. + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (Union[int, Sequence]): Number of channels in the input + feature map. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + num_base_priors (int): The number of priors (points) at a point + on the feature grid. + featmap_strides (Sequence[int]): Downsample factor of each feature map. + Defaults to (8, 16, 32). + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. 
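+
+    As a concrete shape example (not taken from any particular config): with
+    ``num_classes=80`` and ``num_base_priors=3`` each prediction conv outputs
+    ``3 * (5 + 80) = 255`` channels, which ``forward_single`` splits into a
+    ``(B, 240, H, W)`` class map, a ``(B, 12, H, W)`` box map and a
+    ``(B, 3, H, W)`` objectness map.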
+ """ + + def __init__(self, + num_classes: int, + in_channels: Union[int, Sequence], + widen_factor: float = 1.0, + num_base_priors: int = 3, + featmap_strides: Sequence[int] = (8, 16, 32), + init_cfg: OptMultiConfig = None): + super().__init__(init_cfg=init_cfg) + self.num_classes = num_classes + self.widen_factor = widen_factor + + self.featmap_strides = featmap_strides + self.num_out_attrib = 5 + self.num_classes + self.num_levels = len(self.featmap_strides) + self.num_base_priors = num_base_priors + + if isinstance(in_channels, int): + self.in_channels = [make_divisible(in_channels, widen_factor) + ] * self.num_levels + else: + self.in_channels = [ + make_divisible(i, widen_factor) for i in in_channels + ] + + self._init_layers() + + def _init_layers(self): + """initialize conv layers in YOLOv5 head.""" + self.convs_pred = nn.ModuleList() + for i in range(self.num_levels): + conv_pred = nn.Conv2d(self.in_channels[i], + self.num_base_priors * self.num_out_attrib, + 1) + + self.convs_pred.append(conv_pred) + + def init_weights(self): + """Initialize the bias of YOLOv5 head.""" + super().init_weights() + for mi, s in zip(self.convs_pred, self.featmap_strides): # from + b = mi.bias.data.view(self.num_base_priors, -1) + # obj (8 objects per 640 image) + b.data[:, 4] += math.log(8 / (640 / s)**2) + b.data[:, 5:] += math.log(0.6 / (self.num_classes - 0.999999)) + + mi.bias.data = b.view(-1) + + def forward(self, x: Tuple[Tensor]) -> Tuple[List]: + """Forward features from the upstream network. + + Args: + x (Tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + Returns: + Tuple[List]: A tuple of multi-level classification scores, bbox + predictions, and objectnesses. + """ + assert len(x) == self.num_levels + return multi_apply(self.forward_single, x, self.convs_pred) + + def forward_single(self, x: Tensor, + convs: nn.Module) -> Tuple[Tensor, Tensor, Tensor]: + """Forward feature of a single scale level.""" + + pred_map = convs(x) + bs, _, ny, nx = pred_map.shape + pred_map = pred_map.view(bs, self.num_base_priors, self.num_out_attrib, + ny, nx) + + cls_score = pred_map[:, :, 5:, ...].reshape(bs, -1, ny, nx) + bbox_pred = pred_map[:, :, :4, ...].reshape(bs, -1, ny, nx) + objectness = pred_map[:, :, 4:5, ...].reshape(bs, -1, ny, nx) + + return cls_score, bbox_pred, objectness + + +@MODELS.register_module() +class YOLOv5Head(BaseDenseHead): + """YOLOv5Head head used in `YOLOv5`. + + Args: + head_module(ConfigType): Base module used for YOLOv5Head + prior_generator(dict): Points generator feature maps in + 2D points-based detectors. + bbox_coder (:obj:`ConfigDict` or dict): Config of bbox coder. + loss_cls (:obj:`ConfigDict` or dict): Config of classification loss. + loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss. + loss_obj (:obj:`ConfigDict` or dict): Config of objectness loss. + prior_match_thr (float): Defaults to 4.0. + ignore_iof_thr (float): Defaults to -1.0. + obj_level_weights (List[float]): Defaults to [4.0, 1.0, 0.4]. + train_cfg (:obj:`ConfigDict` or dict, optional): Training config of + anchor head. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of + anchor head. Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. 
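+
+    As a worked example of the shape-matching rule used in ``loss_by_feat``:
+    a GT box is matched to a prior of the same level when
+    ``max(w_gt / w_prior, w_prior / w_gt, h_gt / h_prior, h_prior / h_gt)``
+    is below ``prior_match_thr`` (4.0 by default), so a 60x60 box can match
+    a 30x90 prior (ratios 2.0 and 1.5) but not a 10x60 prior (ratio 6.0).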
+ """ + + def __init__(self, + head_module: ConfigType, + prior_generator: ConfigType = dict( + type='mmdet.YOLOAnchorGenerator', + base_sizes=[[(10, 13), (16, 30), (33, 23)], + [(30, 61), (62, 45), (59, 119)], + [(116, 90), (156, 198), (373, 326)]], + strides=[8, 16, 32]), + bbox_coder: ConfigType = dict(type='YOLOv5BBoxCoder'), + loss_cls: ConfigType = dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='mean', + loss_weight=0.5), + loss_bbox: ConfigType = dict( + type='IoULoss', + iou_mode='ciou', + bbox_format='xywh', + eps=1e-7, + reduction='mean', + loss_weight=0.05, + return_iou=True), + loss_obj: ConfigType = dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='mean', + loss_weight=1.0), + prior_match_thr: float = 4.0, + near_neighbor_thr: float = 0.5, + ignore_iof_thr: float = -1.0, + obj_level_weights: List[float] = [4.0, 1.0, 0.4], + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + init_cfg: OptMultiConfig = None): + super().__init__(init_cfg=init_cfg) + + self.head_module = MODELS.build(head_module) + self.num_classes = self.head_module.num_classes + self.featmap_strides = self.head_module.featmap_strides + self.num_levels = len(self.featmap_strides) + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + self.loss_cls: nn.Module = MODELS.build(loss_cls) + self.loss_bbox: nn.Module = MODELS.build(loss_bbox) + self.loss_obj: nn.Module = MODELS.build(loss_obj) + + self.prior_generator = TASK_UTILS.build(prior_generator) + self.bbox_coder = TASK_UTILS.build(bbox_coder) + self.num_base_priors = self.prior_generator.num_base_priors[0] + + self.featmap_sizes = [torch.empty(1)] * self.num_levels + + self.prior_match_thr = prior_match_thr + self.near_neighbor_thr = near_neighbor_thr + self.obj_level_weights = obj_level_weights + self.ignore_iof_thr = ignore_iof_thr + + self.special_init() + + def special_init(self): + """Since YOLO series algorithms will inherit from YOLOv5Head, but + different algorithms have special initialization process. + + The special_init function is designed to deal with this situation. + """ + assert len(self.obj_level_weights) == len( + self.featmap_strides) == self.num_levels + if self.prior_match_thr != 4.0: + print_log( + "!!!Now, you've changed the prior_match_thr " + 'parameter to something other than 4.0. Please make sure ' + 'that you have modified both the regression formula in ' + 'bbox_coder and before loss_box computation, ' + 'otherwise the accuracy may be degraded!!!') + + if self.num_classes == 1: + print_log('!!!You are using `YOLOv5Head` with num_classes == 1.' + ' The loss_cls will be 0. This is a normal phenomenon.') + + priors_base_sizes = torch.tensor( + self.prior_generator.base_sizes, dtype=torch.float) + featmap_strides = torch.tensor( + self.featmap_strides, dtype=torch.float)[:, None, None] + self.register_buffer( + 'priors_base_sizes', + priors_base_sizes / featmap_strides, + persistent=False) + + grid_offset = torch.tensor([ + [0, 0], # center + [1, 0], # left + [0, 1], # up + [-1, 0], # right + [0, -1], # bottom + ]).float() + self.register_buffer( + 'grid_offset', grid_offset[:, None], persistent=False) + + prior_inds = torch.arange(self.num_base_priors).float().view( + self.num_base_priors, 1) + self.register_buffer('prior_inds', prior_inds, persistent=False) + + def forward(self, x: Tuple[Tensor]) -> Tuple[List]: + """Forward features from the upstream network. + + Args: + x (Tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. 
+ Returns: + Tuple[List]: A tuple of multi-level classification scores, bbox + predictions, and objectnesses. + """ + return self.head_module(x) + + def predict_by_feat(self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + objectnesses: Optional[List[Tensor]] = None, + batch_img_metas: Optional[List[dict]] = None, + cfg: Optional[ConfigDict] = None, + rescale: bool = True, + with_nms: bool = True) -> List[InstanceData]: + """Transform a batch of output features extracted by the head into + bbox results. + Args: + cls_scores (list[Tensor]): Classification scores for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * 4, H, W). + objectnesses (list[Tensor], Optional): Score factor for + all scale level, each is a 4D-tensor, has shape + (batch_size, 1, H, W). + batch_img_metas (list[dict], Optional): Batch image meta info. + Defaults to None. + cfg (ConfigDict, optional): Test / postprocessing + configuration, if None, test_cfg would be used. + Defaults to None. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + with_nms (bool): If True, do nms before return boxes. + Defaults to True. + + Returns: + list[:obj:`InstanceData`]: Object detection results of each image + after the post process. Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + assert len(cls_scores) == len(bbox_preds) + if objectnesses is None: + with_objectnesses = False + else: + with_objectnesses = True + assert len(cls_scores) == len(objectnesses) + + cfg = self.test_cfg if cfg is None else cfg + cfg = copy.deepcopy(cfg) + + multi_label = cfg.multi_label + multi_label &= self.num_classes > 1 + cfg.multi_label = multi_label + + num_imgs = len(batch_img_metas) + featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores] + + # If the shape does not change, use the previous mlvl_priors + if featmap_sizes != self.featmap_sizes: + self.mlvl_priors = self.prior_generator.grid_priors( + featmap_sizes, + dtype=cls_scores[0].dtype, + device=cls_scores[0].device) + self.featmap_sizes = featmap_sizes + flatten_priors = torch.cat(self.mlvl_priors) + + mlvl_strides = [ + flatten_priors.new_full( + (featmap_size.numel() * self.num_base_priors, ), stride) for + featmap_size, stride in zip(featmap_sizes, self.featmap_strides) + ] + flatten_stride = torch.cat(mlvl_strides) + + # flatten cls_scores, bbox_preds and objectness + flatten_cls_scores = [ + cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.num_classes) + for cls_score in cls_scores + ] + flatten_bbox_preds = [ + bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) + for bbox_pred in bbox_preds + ] + + flatten_cls_scores = torch.cat(flatten_cls_scores, dim=1).sigmoid() + flatten_bbox_preds = torch.cat(flatten_bbox_preds, dim=1) + flatten_decoded_bboxes = self.bbox_coder.decode( + flatten_priors[None], flatten_bbox_preds, flatten_stride) + + if with_objectnesses: + flatten_objectness = [ + objectness.permute(0, 2, 3, 1).reshape(num_imgs, -1) + for objectness in objectnesses + ] + flatten_objectness = torch.cat(flatten_objectness, dim=1).sigmoid() + else: + flatten_objectness = [None for _ in 
range(num_imgs)] + + results_list = [] + for (bboxes, scores, objectness, + img_meta) in zip(flatten_decoded_bboxes, flatten_cls_scores, + flatten_objectness, batch_img_metas): + ori_shape = img_meta['ori_shape'] + scale_factor = img_meta['scale_factor'] + if 'pad_param' in img_meta: + pad_param = img_meta['pad_param'] + else: + pad_param = None + + score_thr = cfg.get('score_thr', -1) + # yolox_style does not require the following operations + if objectness is not None and score_thr > 0 and not cfg.get( + 'yolox_style', False): + conf_inds = objectness > score_thr + bboxes = bboxes[conf_inds, :] + scores = scores[conf_inds, :] + objectness = objectness[conf_inds] + + if objectness is not None: + # conf = obj_conf * cls_conf + scores *= objectness[:, None] + + if scores.shape[0] == 0: + empty_results = InstanceData() + empty_results.bboxes = bboxes + empty_results.scores = scores[:, 0] + empty_results.labels = scores[:, 0].int() + results_list.append(empty_results) + continue + + nms_pre = cfg.get('nms_pre', 100000) + if cfg.multi_label is False: + scores, labels = scores.max(1, keepdim=True) + scores, _, keep_idxs, results = filter_scores_and_topk( + scores, + score_thr, + nms_pre, + results=dict(labels=labels[:, 0])) + labels = results['labels'] + else: + scores, labels, keep_idxs, _ = filter_scores_and_topk( + scores, score_thr, nms_pre) + + results = InstanceData( + scores=scores, labels=labels, bboxes=bboxes[keep_idxs]) + + if rescale: + if pad_param is not None: + results.bboxes -= results.bboxes.new_tensor([ + pad_param[2], pad_param[0], pad_param[2], pad_param[0] + ]) + results.bboxes /= results.bboxes.new_tensor( + scale_factor).repeat((1, 2)) + + if cfg.get('yolox_style', False): + # do not need max_per_img + cfg.max_per_img = len(results) + + results = self._bbox_post_process( + results=results, + cfg=cfg, + rescale=False, + with_nms=with_nms, + img_meta=img_meta) + results.bboxes[:, 0::2].clamp_(0, ori_shape[1]) + results.bboxes[:, 1::2].clamp_(0, ori_shape[0]) + + results_list.append(results) + return results_list + + def loss(self, x: Tuple[Tensor], batch_data_samples: Union[list, + dict]) -> dict: + """Perform forward propagation and loss calculation of the detection + head on the features of the upstream network. + + Args: + x (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + batch_data_samples (List[:obj:`DetDataSample`], dict): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + + Returns: + dict: A dictionary of loss components. + """ + + if isinstance(batch_data_samples, list): + losses = super().loss(x, batch_data_samples) + else: + outs = self(x) + # Fast version + loss_inputs = outs + (batch_data_samples['bboxes_labels'], + batch_data_samples['img_metas']) + losses = self.loss_by_feat(*loss_inputs) + + return losses + + def loss_by_feat( + self, + cls_scores: Sequence[Tensor], + bbox_preds: Sequence[Tensor], + objectnesses: Sequence[Tensor], + batch_gt_instances: Sequence[InstanceData], + batch_img_metas: Sequence[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (Sequence[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_priors * num_classes. + bbox_preds (Sequence[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_priors * 4. 
+ objectnesses (Sequence[Tensor]): Score factor for + all scale level, each is a 4D-tensor, has shape + (batch_size, 1, H, W). + batch_gt_instances (Sequence[InstanceData]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (Sequence[dict]): Meta information of each image, + e.g., image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + Returns: + dict[str, Tensor]: A dictionary of losses. + """ + if self.ignore_iof_thr != -1: + # TODO: Support fast version + # convert ignore gt + batch_target_ignore_list = [] + for i, gt_instances_ignore in enumerate(batch_gt_instances_ignore): + bboxes = gt_instances_ignore.bboxes + labels = gt_instances_ignore.labels + index = bboxes.new_full((len(bboxes), 1), i) + # (batch_idx, label, bboxes) + target = torch.cat((index, labels[:, None].float(), bboxes), + dim=1) + batch_target_ignore_list.append(target) + + # (num_bboxes, 6) + batch_gt_targets_ignore = torch.cat( + batch_target_ignore_list, dim=0) + if batch_gt_targets_ignore.shape[0] != 0: + # Consider regions with ignore in annotations + return self._loss_by_feat_with_ignore( + cls_scores, + bbox_preds, + objectnesses, + batch_gt_instances=batch_gt_instances, + batch_img_metas=batch_img_metas, + batch_gt_instances_ignore=batch_gt_targets_ignore) + + # 1. Convert gt to norm format + batch_targets_normed = self._convert_gt_to_norm_format( + batch_gt_instances, batch_img_metas) + + device = cls_scores[0].device + loss_cls = torch.zeros(1, device=device) + loss_box = torch.zeros(1, device=device) + loss_obj = torch.zeros(1, device=device) + scaled_factor = torch.ones(7, device=device) + + for i in range(self.num_levels): + batch_size, _, h, w = bbox_preds[i].shape + target_obj = torch.zeros_like(objectnesses[i]) + + # empty gt bboxes + if batch_targets_normed.shape[1] == 0: + loss_box += bbox_preds[i].sum() * 0 + loss_cls += cls_scores[i].sum() * 0 + loss_obj += self.loss_obj( + objectnesses[i], target_obj) * self.obj_level_weights[i] + continue + + priors_base_sizes_i = self.priors_base_sizes[i] + # feature map scale whwh + scaled_factor[2:6] = torch.tensor( + bbox_preds[i].shape)[[3, 2, 3, 2]] + # Scale batch_targets from range 0-1 to range 0-features_maps size. + # (num_base_priors, num_bboxes, 7) + batch_targets_scaled = batch_targets_normed * scaled_factor + + # 2. Shape match + wh_ratio = batch_targets_scaled[..., + 4:6] / priors_base_sizes_i[:, None] + match_inds = torch.max( + wh_ratio, 1 / wh_ratio).max(2)[0] < self.prior_match_thr + batch_targets_scaled = batch_targets_scaled[match_inds] + + # no gt bbox matches anchor + if batch_targets_scaled.shape[0] == 0: + loss_box += bbox_preds[i].sum() * 0 + loss_cls += cls_scores[i].sum() * 0 + loss_obj += self.loss_obj( + objectnesses[i], target_obj) * self.obj_level_weights[i] + continue + + # 3. Positive samples with additional neighbors + + # check the left, up, right, bottom sides of the + # targets grid, and determine whether assigned + # them as positive samples as well. 
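+            # Example with the default `near_neighbor_thr` of 0.5: a target
+            # centre at (12.3, 40.8) on this level's grid keeps its own cell
+            # (12, 40), also matches the cell to the left (11, 40) because
+            # 12.3 % 1 = 0.3 < 0.5, and the cell below (12, 41) because the
+            # distance to the bottom edge has a fractional part of 0.2 < 0.5.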
+ batch_targets_cxcy = batch_targets_scaled[:, 2:4] + grid_xy = scaled_factor[[2, 3]] - batch_targets_cxcy + left, up = ((batch_targets_cxcy % 1 < self.near_neighbor_thr) & + (batch_targets_cxcy > 1)).T + right, bottom = ((grid_xy % 1 < self.near_neighbor_thr) & + (grid_xy > 1)).T + offset_inds = torch.stack( + (torch.ones_like(left), left, up, right, bottom)) + + batch_targets_scaled = batch_targets_scaled.repeat( + (5, 1, 1))[offset_inds] + retained_offsets = self.grid_offset.repeat(1, offset_inds.shape[1], + 1)[offset_inds] + + # prepare pred results and positive sample indexes to + # calculate class loss and bbox lo + _chunk_targets = batch_targets_scaled.chunk(4, 1) + img_class_inds, grid_xy, grid_wh, priors_inds = _chunk_targets + priors_inds, (img_inds, class_inds) = priors_inds.long().view( + -1), img_class_inds.long().T + + grid_xy_long = (grid_xy - + retained_offsets * self.near_neighbor_thr).long() + grid_x_inds, grid_y_inds = grid_xy_long.T + bboxes_targets = torch.cat((grid_xy - grid_xy_long, grid_wh), 1) + + # 4. Calculate loss + # bbox loss + retained_bbox_pred = bbox_preds[i].reshape( + batch_size, self.num_base_priors, -1, h, + w)[img_inds, priors_inds, :, grid_y_inds, grid_x_inds] + priors_base_sizes_i = priors_base_sizes_i[priors_inds] + decoded_bbox_pred = self._decode_bbox_to_xywh( + retained_bbox_pred, priors_base_sizes_i) + loss_box_i, iou = self.loss_bbox(decoded_bbox_pred, bboxes_targets) + loss_box += loss_box_i + + # obj loss + iou = iou.detach().clamp(0) + target_obj[img_inds, priors_inds, grid_y_inds, + grid_x_inds] = iou.type(target_obj.dtype) + loss_obj += self.loss_obj(objectnesses[i], + target_obj) * self.obj_level_weights[i] + + # cls loss + if self.num_classes > 1: + pred_cls_scores = cls_scores[i].reshape( + batch_size, self.num_base_priors, -1, h, + w)[img_inds, priors_inds, :, grid_y_inds, grid_x_inds] + + target_class = torch.full_like(pred_cls_scores, 0.) + target_class[range(batch_targets_scaled.shape[0]), + class_inds] = 1. + loss_cls += self.loss_cls(pred_cls_scores, target_class) + else: + loss_cls += cls_scores[i].sum() * 0 + + _, world_size = get_dist_info() + return dict( + loss_cls=loss_cls * batch_size * world_size, + loss_obj=loss_obj * batch_size * world_size, + loss_bbox=loss_box * batch_size * world_size) + + def _convert_gt_to_norm_format(self, + batch_gt_instances: Sequence[InstanceData], + batch_img_metas: Sequence[dict]) -> Tensor: + if isinstance(batch_gt_instances, torch.Tensor): + # fast version + img_shape = batch_img_metas[0]['batch_input_shape'] + gt_bboxes_xyxy = batch_gt_instances[:, 2:] + xy1, xy2 = gt_bboxes_xyxy.split((2, 2), dim=-1) + gt_bboxes_xywh = torch.cat([(xy2 + xy1) / 2, (xy2 - xy1)], dim=-1) + gt_bboxes_xywh[:, 1::2] /= img_shape[0] + gt_bboxes_xywh[:, 0::2] /= img_shape[1] + batch_gt_instances[:, 2:] = gt_bboxes_xywh + + # (num_base_priors, num_bboxes, 6) + batch_targets_normed = batch_gt_instances.repeat( + self.num_base_priors, 1, 1) + else: + batch_target_list = [] + # Convert xyxy bbox to yolo format. 
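+            # Example: with a 640x640 `batch_input_shape`, a gt box
+            # (270, 220, 370, 420) with label 3 in image i becomes
+            # (i, 3.0, 0.5, 0.5, 0.15625, 0.3125), i.e. normalized
+            # (batch_idx, label, cx, cy, w, h).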
+ for i, gt_instances in enumerate(batch_gt_instances): + img_shape = batch_img_metas[i]['batch_input_shape'] + bboxes = gt_instances.bboxes + labels = gt_instances.labels + + xy1, xy2 = bboxes.split((2, 2), dim=-1) + bboxes = torch.cat([(xy2 + xy1) / 2, (xy2 - xy1)], dim=-1) + # normalized to 0-1 + bboxes[:, 1::2] /= img_shape[0] + bboxes[:, 0::2] /= img_shape[1] + + index = bboxes.new_full((len(bboxes), 1), i) + # (batch_idx, label, normed_bbox) + target = torch.cat((index, labels[:, None].float(), bboxes), + dim=1) + batch_target_list.append(target) + + # (num_base_priors, num_bboxes, 6) + batch_targets_normed = torch.cat( + batch_target_list, dim=0).repeat(self.num_base_priors, 1, 1) + + # (num_base_priors, num_bboxes, 1) + batch_targets_prior_inds = self.prior_inds.repeat( + 1, batch_targets_normed.shape[1])[..., None] + # (num_base_priors, num_bboxes, 7) + # (img_ind, labels, bbox_cx, bbox_cy, bbox_w, bbox_h, prior_ind) + batch_targets_normed = torch.cat( + (batch_targets_normed, batch_targets_prior_inds), 2) + return batch_targets_normed + + def _decode_bbox_to_xywh(self, bbox_pred, priors_base_sizes) -> Tensor: + bbox_pred = bbox_pred.sigmoid() + pred_xy = bbox_pred[:, :2] * 2 - 0.5 + pred_wh = (bbox_pred[:, 2:] * 2)**2 * priors_base_sizes + decoded_bbox_pred = torch.cat((pred_xy, pred_wh), dim=-1) + return decoded_bbox_pred + + def _loss_by_feat_with_ignore( + self, cls_scores: Sequence[Tensor], bbox_preds: Sequence[Tensor], + objectnesses: Sequence[Tensor], + batch_gt_instances: Sequence[InstanceData], + batch_img_metas: Sequence[dict], + batch_gt_instances_ignore: Sequence[Tensor]) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (Sequence[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_priors * num_classes. + bbox_preds (Sequence[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_priors * 4. + objectnesses (Sequence[Tensor]): Score factor for + all scale level, each is a 4D-tensor, has shape + (batch_size, 1, H, W). + batch_gt_instances (Sequence[InstanceData]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (Sequence[dict]): Meta information of each image, + e.g., image size, scaling factor, etc. + batch_gt_instances_ignore (Sequence[Tensor]): Ignore boxes with + batch_ids and labels, each is a 2D-tensor, the channel number + is 6, means that (batch_id, label, xmin, ymin, xmax, ymax). + Returns: + dict[str, Tensor]: A dictionary of losses. + """ + # 1. 
Convert gt to norm format + batch_targets_normed = self._convert_gt_to_norm_format( + batch_gt_instances, batch_img_metas) + + featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores] + if featmap_sizes != self.featmap_sizes: + self.mlvl_priors = self.prior_generator.grid_priors( + featmap_sizes, + dtype=cls_scores[0].dtype, + device=cls_scores[0].device) + self.featmap_sizes = featmap_sizes + + device = cls_scores[0].device + loss_cls = torch.zeros(1, device=device) + loss_box = torch.zeros(1, device=device) + loss_obj = torch.zeros(1, device=device) + scaled_factor = torch.ones(7, device=device) + + for i in range(self.num_levels): + batch_size, _, h, w = bbox_preds[i].shape + target_obj = torch.zeros_like(objectnesses[i]) + + not_ignore_flags = bbox_preds[i].new_ones(batch_size, + self.num_base_priors, h, + w) + + ignore_overlaps = bbox_overlaps(self.mlvl_priors[i], + batch_gt_instances_ignore[..., 2:], + 'iof') + ignore_max_overlaps, ignore_max_ignore_index = ignore_overlaps.max( + dim=1) + + batch_inds = batch_gt_instances_ignore[:, + 0][ignore_max_ignore_index] + ignore_inds = (ignore_max_overlaps > self.ignore_iof_thr).nonzero( + as_tuple=True)[0] + batch_inds = batch_inds[ignore_inds].long() + ignore_priors, ignore_grid_xs, ignore_grid_ys = get_prior_xy_info( + ignore_inds, self.num_base_priors, self.featmap_sizes[i]) + not_ignore_flags[batch_inds, ignore_priors, ignore_grid_ys, + ignore_grid_xs] = 0 + + # empty gt bboxes + if batch_targets_normed.shape[1] == 0: + loss_box += bbox_preds[i].sum() * 0 + loss_cls += cls_scores[i].sum() * 0 + loss_obj += self.loss_obj( + objectnesses[i], + target_obj, + weight=not_ignore_flags, + avg_factor=max(not_ignore_flags.sum(), + 1)) * self.obj_level_weights[i] + continue + + priors_base_sizes_i = self.priors_base_sizes[i] + # feature map scale whwh + scaled_factor[2:6] = torch.tensor( + bbox_preds[i].shape)[[3, 2, 3, 2]] + # Scale batch_targets from range 0-1 to range 0-features_maps size. + # (num_base_priors, num_bboxes, 7) + batch_targets_scaled = batch_targets_normed * scaled_factor + + # 2. Shape match + wh_ratio = batch_targets_scaled[..., + 4:6] / priors_base_sizes_i[:, None] + match_inds = torch.max( + wh_ratio, 1 / wh_ratio).max(2)[0] < self.prior_match_thr + batch_targets_scaled = batch_targets_scaled[match_inds] + + # no gt bbox matches anchor + if batch_targets_scaled.shape[0] == 0: + loss_box += bbox_preds[i].sum() * 0 + loss_cls += cls_scores[i].sum() * 0 + loss_obj += self.loss_obj( + objectnesses[i], + target_obj, + weight=not_ignore_flags, + avg_factor=max(not_ignore_flags.sum(), + 1)) * self.obj_level_weights[i] + continue + + # 3. Positive samples with additional neighbors + + # check the left, up, right, bottom sides of the + # targets grid, and determine whether assigned + # them as positive samples as well. 
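+            # The near-neighbor expansion below is identical to the one in
+            # loss_by_feat above; this branch only differs in that every
+            # loss term is additionally weighted by not_ignore_flags so that
+            # predictions falling on ignored regions do not contribute.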
+ batch_targets_cxcy = batch_targets_scaled[:, 2:4] + grid_xy = scaled_factor[[2, 3]] - batch_targets_cxcy + left, up = ((batch_targets_cxcy % 1 < self.near_neighbor_thr) & + (batch_targets_cxcy > 1)).T + right, bottom = ((grid_xy % 1 < self.near_neighbor_thr) & + (grid_xy > 1)).T + offset_inds = torch.stack( + (torch.ones_like(left), left, up, right, bottom)) + + batch_targets_scaled = batch_targets_scaled.repeat( + (5, 1, 1))[offset_inds] + retained_offsets = self.grid_offset.repeat(1, offset_inds.shape[1], + 1)[offset_inds] + + # prepare pred results and positive sample indexes to + # calculate class loss and bbox lo + _chunk_targets = batch_targets_scaled.chunk(4, 1) + img_class_inds, grid_xy, grid_wh, priors_inds = _chunk_targets + priors_inds, (img_inds, class_inds) = priors_inds.long().view( + -1), img_class_inds.long().T + + grid_xy_long = (grid_xy - + retained_offsets * self.near_neighbor_thr).long() + grid_x_inds, grid_y_inds = grid_xy_long.T + bboxes_targets = torch.cat((grid_xy - grid_xy_long, grid_wh), 1) + + # 4. Calculate loss + # bbox loss + retained_bbox_pred = bbox_preds[i].reshape( + batch_size, self.num_base_priors, -1, h, + w)[img_inds, priors_inds, :, grid_y_inds, grid_x_inds] + priors_base_sizes_i = priors_base_sizes_i[priors_inds] + decoded_bbox_pred = self._decode_bbox_to_xywh( + retained_bbox_pred, priors_base_sizes_i) + + not_ignore_weights = not_ignore_flags[img_inds, priors_inds, + grid_y_inds, grid_x_inds] + loss_box_i, iou = self.loss_bbox( + decoded_bbox_pred, + bboxes_targets, + weight=not_ignore_weights, + avg_factor=max(not_ignore_weights.sum(), 1)) + loss_box += loss_box_i + + # obj loss + iou = iou.detach().clamp(0) + target_obj[img_inds, priors_inds, grid_y_inds, + grid_x_inds] = iou.type(target_obj.dtype) + loss_obj += self.loss_obj( + objectnesses[i], + target_obj, + weight=not_ignore_flags, + avg_factor=max(not_ignore_flags.sum(), + 1)) * self.obj_level_weights[i] + + # cls loss + if self.num_classes > 1: + pred_cls_scores = cls_scores[i].reshape( + batch_size, self.num_base_priors, -1, h, + w)[img_inds, priors_inds, :, grid_y_inds, grid_x_inds] + + target_class = torch.full_like(pred_cls_scores, 0.) + target_class[range(batch_targets_scaled.shape[0]), + class_inds] = 1. + loss_cls += self.loss_cls( + pred_cls_scores, + target_class, + weight=not_ignore_weights[:, None].repeat( + 1, self.num_classes), + avg_factor=max(not_ignore_weights.sum(), 1)) + else: + loss_cls += cls_scores[i].sum() * 0 + + _, world_size = get_dist_info() + return dict( + loss_cls=loss_cls * batch_size * world_size, + loss_obj=loss_obj * batch_size * world_size, + loss_bbox=loss_box * batch_size * world_size) diff --git a/mmyolo/models/dense_heads/yolov6_head.py b/mmyolo/models/dense_heads/yolov6_head.py new file mode 100644 index 0000000000000000000000000000000000000000..4b492d121a02acc194ba45637adc9e8b3e26a22c --- /dev/null +++ b/mmyolo/models/dense_heads/yolov6_head.py @@ -0,0 +1,369 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
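+# Illustrative usage sketch (values are placeholders, not taken from any
+# particular config): the head defined in this file is normally built from a
+# registry config dict along the lines of
+#   bbox_head=dict(
+#       type='YOLOv6Head',
+#       head_module=dict(
+#           type='YOLOv6HeadModule',
+#           num_classes=80,
+#           in_channels=[128, 256, 512],
+#           widen_factor=0.5,
+#           featmap_strides=[8, 16, 32]))
+# and trained with an assigner pair (initial_assigner / assigner) supplied
+# via train_cfg, as handled in YOLOv6Head.special_init below.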
+from typing import List, Sequence, Tuple, Union + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmdet.models.utils import multi_apply +from mmdet.utils import (ConfigType, OptConfigType, OptInstanceList, + OptMultiConfig) +from mmengine import MessageHub +from mmengine.dist import get_dist_info +from mmengine.model import BaseModule, bias_init_with_prob +from mmengine.structures import InstanceData +from torch import Tensor + +from mmyolo.registry import MODELS, TASK_UTILS +from ..utils import gt_instances_preprocess +from .yolov5_head import YOLOv5Head + + +@MODELS.register_module() +class YOLOv6HeadModule(BaseModule): + """YOLOv6Head head module used in `YOLOv6. + + `_. + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (Union[int, Sequence]): Number of channels in the input + feature map. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + num_base_priors: (int): The number of priors (points) at a point + on the feature grid. + featmap_strides (Sequence[int]): Downsample factor of each feature map. + Defaults to [8, 16, 32]. + None, otherwise False. Defaults to "auto". + norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization + layer. Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer. + Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + num_classes: int, + in_channels: Union[int, Sequence], + widen_factor: float = 1.0, + num_base_priors: int = 1, + featmap_strides: Sequence[int] = (8, 16, 32), + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None): + super().__init__(init_cfg=init_cfg) + + self.num_classes = num_classes + self.featmap_strides = featmap_strides + self.num_levels = len(self.featmap_strides) + self.num_base_priors = num_base_priors + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + + if isinstance(in_channels, int): + self.in_channels = [int(in_channels * widen_factor) + ] * self.num_levels + else: + self.in_channels = [int(i * widen_factor) for i in in_channels] + + self._init_layers() + + def _init_layers(self): + """initialize conv layers in YOLOv6 head.""" + # Init decouple head + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + self.cls_preds = nn.ModuleList() + self.reg_preds = nn.ModuleList() + self.stems = nn.ModuleList() + for i in range(self.num_levels): + self.stems.append( + ConvModule( + in_channels=self.in_channels[i], + out_channels=self.in_channels[i], + kernel_size=1, + stride=1, + padding=1 // 2, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + self.cls_convs.append( + ConvModule( + in_channels=self.in_channels[i], + out_channels=self.in_channels[i], + kernel_size=3, + stride=1, + padding=3 // 2, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + self.reg_convs.append( + ConvModule( + in_channels=self.in_channels[i], + out_channels=self.in_channels[i], + kernel_size=3, + stride=1, + padding=3 // 2, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + self.cls_preds.append( + nn.Conv2d( + in_channels=self.in_channels[i], + out_channels=self.num_base_priors * self.num_classes, + kernel_size=1)) + self.reg_preds.append( + nn.Conv2d( + 
in_channels=self.in_channels[i], + out_channels=self.num_base_priors * 4, + kernel_size=1)) + + def init_weights(self): + super().init_weights() + bias_init = bias_init_with_prob(0.01) + for conv in self.cls_preds: + conv.bias.data.fill_(bias_init) + conv.weight.data.fill_(0.) + + for conv in self.reg_preds: + conv.bias.data.fill_(1.0) + conv.weight.data.fill_(0.) + + def forward(self, x: Tuple[Tensor]) -> Tuple[List]: + """Forward features from the upstream network. + + Args: + x (Tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + Returns: + Tuple[List]: A tuple of multi-level classification scores, bbox + predictions. + """ + assert len(x) == self.num_levels + return multi_apply(self.forward_single, x, self.stems, self.cls_convs, + self.cls_preds, self.reg_convs, self.reg_preds) + + def forward_single(self, x: Tensor, stem: nn.Module, cls_conv: nn.Module, + cls_pred: nn.Module, reg_conv: nn.Module, + reg_pred: nn.Module) -> Tuple[Tensor, Tensor]: + """Forward feature of a single scale level.""" + y = stem(x) + cls_x = y + reg_x = y + cls_feat = cls_conv(cls_x) + reg_feat = reg_conv(reg_x) + + cls_score = cls_pred(cls_feat) + bbox_pred = reg_pred(reg_feat) + + return cls_score, bbox_pred + + +@MODELS.register_module() +class YOLOv6Head(YOLOv5Head): + """YOLOv6Head head used in `YOLOv6 `_. + + Args: + head_module(ConfigType): Base module used for YOLOv6Head + prior_generator(dict): Points generator feature maps + in 2D points-based detectors. + loss_cls (:obj:`ConfigDict` or dict): Config of classification loss. + loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss. + train_cfg (:obj:`ConfigDict` or dict, optional): Training config of + anchor head. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of + anchor head. Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + head_module: ConfigType, + prior_generator: ConfigType = dict( + type='mmdet.MlvlPointGenerator', + offset=0.5, + strides=[8, 16, 32]), + bbox_coder: ConfigType = dict(type='DistancePointBBoxCoder'), + loss_cls: ConfigType = dict( + type='mmdet.VarifocalLoss', + use_sigmoid=True, + alpha=0.75, + gamma=2.0, + iou_weighted=True, + reduction='sum', + loss_weight=1.0), + loss_bbox: ConfigType = dict( + type='IoULoss', + iou_mode='giou', + bbox_format='xyxy', + reduction='mean', + loss_weight=2.5, + return_iou=False), + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + init_cfg: OptMultiConfig = None): + super().__init__( + head_module=head_module, + prior_generator=prior_generator, + bbox_coder=bbox_coder, + loss_cls=loss_cls, + loss_bbox=loss_bbox, + train_cfg=train_cfg, + test_cfg=test_cfg, + init_cfg=init_cfg) + # yolov6 doesn't need loss_obj + self.loss_obj = None + + def special_init(self): + """Since YOLO series algorithms will inherit from YOLOv5Head, but + different algorithms have special initialization process. + + The special_init function is designed to deal with this situation. 
+ """ + if self.train_cfg: + self.initial_epoch = self.train_cfg['initial_epoch'] + self.initial_assigner = TASK_UTILS.build( + self.train_cfg.initial_assigner) + self.assigner = TASK_UTILS.build(self.train_cfg.assigner) + + # Add common attributes to reduce calculation + self.featmap_sizes_train = None + self.num_level_priors = None + self.flatten_priors_train = None + self.stride_tensor = None + + def loss_by_feat( + self, + cls_scores: Sequence[Tensor], + bbox_preds: Sequence[Tensor], + batch_gt_instances: Sequence[InstanceData], + batch_img_metas: Sequence[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (Sequence[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_priors * num_classes. + bbox_preds (Sequence[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_priors * 4. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + Returns: + dict[str, Tensor]: A dictionary of losses. + """ + + # get epoch information from message hub + message_hub = MessageHub.get_current_instance() + current_epoch = message_hub.get_info('epoch') + + num_imgs = len(batch_img_metas) + if batch_gt_instances_ignore is None: + batch_gt_instances_ignore = [None] * num_imgs + + current_featmap_sizes = [ + cls_score.shape[2:] for cls_score in cls_scores + ] + # If the shape does not equal, generate new one + if current_featmap_sizes != self.featmap_sizes_train: + self.featmap_sizes_train = current_featmap_sizes + + mlvl_priors_with_stride = self.prior_generator.grid_priors( + self.featmap_sizes_train, + dtype=cls_scores[0].dtype, + device=cls_scores[0].device, + with_stride=True) + + self.num_level_priors = [len(n) for n in mlvl_priors_with_stride] + self.flatten_priors_train = torch.cat( + mlvl_priors_with_stride, dim=0) + self.stride_tensor = self.flatten_priors_train[..., [2]] + + # gt info + gt_info = gt_instances_preprocess(batch_gt_instances, num_imgs) + gt_labels = gt_info[:, :, :1] + gt_bboxes = gt_info[:, :, 1:] # xyxy + pad_bbox_flag = (gt_bboxes.sum(-1, keepdim=True) > 0).float() + + # pred info + flatten_cls_preds = [ + cls_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.num_classes) + for cls_pred in cls_scores + ] + + flatten_pred_bboxes = [ + bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) + for bbox_pred in bbox_preds + ] + + flatten_cls_preds = torch.cat(flatten_cls_preds, dim=1) + flatten_pred_bboxes = torch.cat(flatten_pred_bboxes, dim=1) + flatten_pred_bboxes = self.bbox_coder.decode( + self.flatten_priors_train[..., :2], flatten_pred_bboxes, + self.stride_tensor[:, 0]) + pred_scores = torch.sigmoid(flatten_cls_preds) + + if current_epoch < self.initial_epoch: + assigned_result = self.initial_assigner( + flatten_pred_bboxes.detach(), self.flatten_priors_train, + self.num_level_priors, gt_labels, gt_bboxes, pad_bbox_flag) + else: + assigned_result = self.assigner(flatten_pred_bboxes.detach(), + pred_scores.detach(), + self.flatten_priors_train, + gt_labels, gt_bboxes, + pad_bbox_flag) 
+ + assigned_bboxes = assigned_result['assigned_bboxes'] + assigned_scores = assigned_result['assigned_scores'] + fg_mask_pre_prior = assigned_result['fg_mask_pre_prior'] + + # cls loss + with torch.cuda.amp.autocast(enabled=False): + loss_cls = self.loss_cls(flatten_cls_preds, assigned_scores) + + # rescale bbox + assigned_bboxes /= self.stride_tensor + flatten_pred_bboxes /= self.stride_tensor + + # TODO: Add all_reduce makes training more stable + assigned_scores_sum = assigned_scores.sum() + if assigned_scores_sum > 0: + loss_cls /= assigned_scores_sum + + # select positive samples mask + num_pos = fg_mask_pre_prior.sum() + if num_pos > 0: + # when num_pos > 0, assigned_scores_sum will >0, so the loss_bbox + # will not report an error + # iou loss + prior_bbox_mask = fg_mask_pre_prior.unsqueeze(-1).repeat([1, 1, 4]) + pred_bboxes_pos = torch.masked_select( + flatten_pred_bboxes, prior_bbox_mask).reshape([-1, 4]) + assigned_bboxes_pos = torch.masked_select( + assigned_bboxes, prior_bbox_mask).reshape([-1, 4]) + bbox_weight = torch.masked_select( + assigned_scores.sum(-1), fg_mask_pre_prior).unsqueeze(-1) + loss_bbox = self.loss_bbox( + pred_bboxes_pos, + assigned_bboxes_pos, + weight=bbox_weight, + avg_factor=assigned_scores_sum) + else: + loss_bbox = flatten_pred_bboxes.sum() * 0 + + _, world_size = get_dist_info() + return dict( + loss_cls=loss_cls * world_size, loss_bbox=loss_bbox * world_size) diff --git a/mmyolo/models/dense_heads/yolov7_head.py b/mmyolo/models/dense_heads/yolov7_head.py new file mode 100644 index 0000000000000000000000000000000000000000..80e6aadd2880fbe95b7c897630ab9033183c2062 --- /dev/null +++ b/mmyolo/models/dense_heads/yolov7_head.py @@ -0,0 +1,404 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from typing import List, Optional, Sequence, Tuple, Union + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmdet.models.utils import multi_apply +from mmdet.utils import ConfigType, OptInstanceList +from mmengine.dist import get_dist_info +from mmengine.structures import InstanceData +from torch import Tensor + +from mmyolo.registry import MODELS +from ..layers import ImplicitA, ImplicitM +from ..task_modules.assigners.batch_yolov7_assigner import BatchYOLOv7Assigner +from .yolov5_head import YOLOv5Head, YOLOv5HeadModule + + +@MODELS.register_module() +class YOLOv7HeadModule(YOLOv5HeadModule): + """YOLOv7Head head module used in YOLOv7.""" + + def _init_layers(self): + """initialize conv layers in YOLOv7 head.""" + self.convs_pred = nn.ModuleList() + for i in range(self.num_levels): + conv_pred = nn.Sequential( + ImplicitA(self.in_channels[i]), + nn.Conv2d(self.in_channels[i], + self.num_base_priors * self.num_out_attrib, 1), + ImplicitM(self.num_base_priors * self.num_out_attrib), + ) + self.convs_pred.append(conv_pred) + + def init_weights(self): + """Initialize the bias of YOLOv7 head.""" + super(YOLOv5HeadModule, self).init_weights() + for mi, s in zip(self.convs_pred, self.featmap_strides): # from + mi = mi[1] # nn.Conv2d + + b = mi.bias.data.view(3, -1) + # obj (8 objects per 640 image) + b.data[:, 4] += math.log(8 / (640 / s)**2) + b.data[:, 5:] += math.log(0.6 / (self.num_classes - 0.99)) + + mi.bias.data = b.view(-1) + + +@MODELS.register_module() +class YOLOv7p6HeadModule(YOLOv5HeadModule): + """YOLOv7Head head module used in YOLOv7.""" + + def __init__(self, + *args, + main_out_channels: Sequence[int] = [256, 512, 768, 1024], + aux_out_channels: Sequence[int] = [320, 640, 960, 1280], + use_aux: bool = True, + 
norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + **kwargs): + self.main_out_channels = main_out_channels + self.aux_out_channels = aux_out_channels + self.use_aux = use_aux + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + super().__init__(*args, **kwargs) + + def _init_layers(self): + """initialize conv layers in YOLOv7 head.""" + self.main_convs_pred = nn.ModuleList() + for i in range(self.num_levels): + conv_pred = nn.Sequential( + ConvModule( + self.in_channels[i], + self.main_out_channels[i], + 3, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + ImplicitA(self.main_out_channels[i]), + nn.Conv2d(self.main_out_channels[i], + self.num_base_priors * self.num_out_attrib, 1), + ImplicitM(self.num_base_priors * self.num_out_attrib), + ) + self.main_convs_pred.append(conv_pred) + + if self.use_aux: + self.aux_convs_pred = nn.ModuleList() + for i in range(self.num_levels): + aux_pred = nn.Sequential( + ConvModule( + self.in_channels[i], + self.aux_out_channels[i], + 3, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + nn.Conv2d(self.aux_out_channels[i], + self.num_base_priors * self.num_out_attrib, 1)) + self.aux_convs_pred.append(aux_pred) + else: + self.aux_convs_pred = [None] * len(self.main_convs_pred) + + def init_weights(self): + """Initialize the bias of YOLOv5 head.""" + super(YOLOv5HeadModule, self).init_weights() + for mi, aux, s in zip(self.main_convs_pred, self.aux_convs_pred, + self.featmap_strides): # from + mi = mi[2] # nn.Conv2d + b = mi.bias.data.view(3, -1) + # obj (8 objects per 640 image) + b.data[:, 4] += math.log(8 / (640 / s)**2) + b.data[:, 5:] += math.log(0.6 / (self.num_classes - 0.99)) + mi.bias.data = b.view(-1) + + if self.use_aux: + aux = aux[1] # nn.Conv2d + b = aux.bias.data.view(3, -1) + # obj (8 objects per 640 image) + b.data[:, 4] += math.log(8 / (640 / s)**2) + b.data[:, 5:] += math.log(0.6 / (self.num_classes - 0.99)) + mi.bias.data = b.view(-1) + + def forward(self, x: Tuple[Tensor]) -> Tuple[List]: + """Forward features from the upstream network. + + Args: + x (Tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + Returns: + Tuple[List]: A tuple of multi-level classification scores, bbox + predictions, and objectnesses. 
+ """ + assert len(x) == self.num_levels + return multi_apply(self.forward_single, x, self.main_convs_pred, + self.aux_convs_pred) + + def forward_single(self, x: Tensor, convs: nn.Module, + aux_convs: Optional[nn.Module]) \ + -> Tuple[Union[Tensor, List], Union[Tensor, List], + Union[Tensor, List]]: + """Forward feature of a single scale level.""" + + pred_map = convs(x) + bs, _, ny, nx = pred_map.shape + pred_map = pred_map.view(bs, self.num_base_priors, self.num_out_attrib, + ny, nx) + + cls_score = pred_map[:, :, 5:, ...].reshape(bs, -1, ny, nx) + bbox_pred = pred_map[:, :, :4, ...].reshape(bs, -1, ny, nx) + objectness = pred_map[:, :, 4:5, ...].reshape(bs, -1, ny, nx) + + if not self.training or not self.use_aux: + return cls_score, bbox_pred, objectness + else: + aux_pred_map = aux_convs(x) + aux_pred_map = aux_pred_map.view(bs, self.num_base_priors, + self.num_out_attrib, ny, nx) + aux_cls_score = aux_pred_map[:, :, 5:, ...].reshape(bs, -1, ny, nx) + aux_bbox_pred = aux_pred_map[:, :, :4, ...].reshape(bs, -1, ny, nx) + aux_objectness = aux_pred_map[:, :, 4:5, + ...].reshape(bs, -1, ny, nx) + + return [cls_score, + aux_cls_score], [bbox_pred, aux_bbox_pred + ], [objectness, aux_objectness] + + +@MODELS.register_module() +class YOLOv7Head(YOLOv5Head): + """YOLOv7Head head used in `YOLOv7 `_. + + Args: + simota_candidate_topk (int): The candidate top-k which used to + get top-k ious to calculate dynamic-k in BatchYOLOv7Assigner. + Defaults to 10. + simota_iou_weight (float): The scale factor for regression + iou cost in BatchYOLOv7Assigner. Defaults to 3.0. + simota_cls_weight (float): The scale factor for classification + cost in BatchYOLOv7Assigner. Defaults to 1.0. + """ + + def __init__(self, + *args, + simota_candidate_topk: int = 20, + simota_iou_weight: float = 3.0, + simota_cls_weight: float = 1.0, + aux_loss_weights: float = 0.25, + **kwargs): + super().__init__(*args, **kwargs) + self.aux_loss_weights = aux_loss_weights + self.assigner = BatchYOLOv7Assigner( + num_classes=self.num_classes, + num_base_priors=self.num_base_priors, + featmap_strides=self.featmap_strides, + prior_match_thr=self.prior_match_thr, + candidate_topk=simota_candidate_topk, + iou_weight=simota_iou_weight, + cls_weight=simota_cls_weight) + + def loss_by_feat( + self, + cls_scores: Sequence[Union[Tensor, List]], + bbox_preds: Sequence[Union[Tensor, List]], + objectnesses: Sequence[Union[Tensor, List]], + batch_gt_instances: Sequence[InstanceData], + batch_img_metas: Sequence[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (Sequence[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_priors * num_classes. + bbox_preds (Sequence[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_priors * 4. + objectnesses (Sequence[Tensor]): Score factor for + all scale level, each is a 4D-tensor, has shape + (batch_size, 1, H, W). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. 
+ Returns: + dict[str, Tensor]: A dictionary of losses. + """ + + if isinstance(cls_scores[0], Sequence): + with_aux = True + batch_size = cls_scores[0][0].shape[0] + device = cls_scores[0][0].device + + bbox_preds_main, bbox_preds_aux = zip(*bbox_preds) + objectnesses_main, objectnesses_aux = zip(*objectnesses) + cls_scores_main, cls_scores_aux = zip(*cls_scores) + + head_preds = self._merge_predict_results(bbox_preds_main, + objectnesses_main, + cls_scores_main) + head_preds_aux = self._merge_predict_results( + bbox_preds_aux, objectnesses_aux, cls_scores_aux) + else: + with_aux = False + batch_size = cls_scores[0].shape[0] + device = cls_scores[0].device + + head_preds = self._merge_predict_results(bbox_preds, objectnesses, + cls_scores) + + # Convert gt to norm xywh format + # (num_base_priors, num_batch_gt, 7) + # 7 is mean (batch_idx, cls_id, x_norm, y_norm, + # w_norm, h_norm, prior_idx) + batch_targets_normed = self._convert_gt_to_norm_format( + batch_gt_instances, batch_img_metas) + + scaled_factors = [ + torch.tensor(head_pred.shape, device=device)[[3, 2, 3, 2]] + for head_pred in head_preds + ] + + loss_cls, loss_obj, loss_box = self._calc_loss( + head_preds=head_preds, + head_preds_aux=None, + batch_targets_normed=batch_targets_normed, + near_neighbor_thr=self.near_neighbor_thr, + scaled_factors=scaled_factors, + batch_img_metas=batch_img_metas, + device=device) + + if with_aux: + loss_cls_aux, loss_obj_aux, loss_box_aux = self._calc_loss( + head_preds=head_preds, + head_preds_aux=head_preds_aux, + batch_targets_normed=batch_targets_normed, + near_neighbor_thr=self.near_neighbor_thr * 2, + scaled_factors=scaled_factors, + batch_img_metas=batch_img_metas, + device=device) + loss_cls += self.aux_loss_weights * loss_cls_aux + loss_obj += self.aux_loss_weights * loss_obj_aux + loss_box += self.aux_loss_weights * loss_box_aux + + _, world_size = get_dist_info() + return dict( + loss_cls=loss_cls * batch_size * world_size, + loss_obj=loss_obj * batch_size * world_size, + loss_bbox=loss_box * batch_size * world_size) + + def _calc_loss(self, head_preds, head_preds_aux, batch_targets_normed, + near_neighbor_thr, scaled_factors, batch_img_metas, device): + loss_cls = torch.zeros(1, device=device) + loss_box = torch.zeros(1, device=device) + loss_obj = torch.zeros(1, device=device) + + assigner_results = self.assigner( + head_preds, + batch_targets_normed, + batch_img_metas[0]['batch_input_shape'], + self.priors_base_sizes, + self.grid_offset, + near_neighbor_thr=near_neighbor_thr) + # mlvl is mean multi_level + mlvl_positive_infos = assigner_results['mlvl_positive_infos'] + mlvl_priors = assigner_results['mlvl_priors'] + mlvl_targets_normed = assigner_results['mlvl_targets_normed'] + + if head_preds_aux is not None: + # This is mean calc aux branch loss + head_preds = head_preds_aux + + for i, head_pred in enumerate(head_preds): + batch_inds, proir_idx, grid_x, grid_y = mlvl_positive_infos[i].T + num_pred_positive = batch_inds.shape[0] + target_obj = torch.zeros_like(head_pred[..., 0]) + # empty positive sampler + if num_pred_positive == 0: + loss_box += head_pred[..., :4].sum() * 0 + loss_cls += head_pred[..., 5:].sum() * 0 + loss_obj += self.loss_obj( + head_pred[..., 4], target_obj) * self.obj_level_weights[i] + continue + + priors = mlvl_priors[i] + targets_normed = mlvl_targets_normed[i] + + head_pred_positive = head_pred[batch_inds, proir_idx, grid_y, + grid_x] + + # calc bbox loss + grid_xy = torch.stack([grid_x, grid_y], dim=1) + decoded_pred_bbox = self._decode_bbox_to_xywh( + 
head_pred_positive[:, :4], priors, grid_xy) + target_bbox_scaled = targets_normed[:, 2:6] * scaled_factors[i] + + loss_box_i, iou = self.loss_bbox(decoded_pred_bbox, + target_bbox_scaled) + loss_box += loss_box_i + + # calc obj loss + target_obj[batch_inds, proir_idx, grid_y, + grid_x] = iou.detach().clamp(0).type(target_obj.dtype) + loss_obj += self.loss_obj(head_pred[..., 4], + target_obj) * self.obj_level_weights[i] + + # calc cls loss + if self.num_classes > 1: + pred_cls_scores = targets_normed[:, 1].long() + target_class = torch.full_like( + head_pred_positive[:, 5:], 0., device=device) + target_class[range(num_pred_positive), pred_cls_scores] = 1. + loss_cls += self.loss_cls(head_pred_positive[:, 5:], + target_class) + else: + loss_cls += head_pred_positive[:, 5:].sum() * 0 + return loss_cls, loss_obj, loss_box + + def _merge_predict_results(self, bbox_preds: Sequence[Tensor], + objectnesses: Sequence[Tensor], + cls_scores: Sequence[Tensor]) -> List[Tensor]: + """Merge predict output from 3 heads. + + Args: + cls_scores (Sequence[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_priors * num_classes. + bbox_preds (Sequence[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_priors * 4. + objectnesses (Sequence[Tensor]): Score factor for + all scale level, each is a 4D-tensor, has shape + (batch_size, 1, H, W). + + Returns: + List[Tensor]: Merged output. + """ + head_preds = [] + for bbox_pred, objectness, cls_score in zip(bbox_preds, objectnesses, + cls_scores): + b, _, h, w = bbox_pred.shape + bbox_pred = bbox_pred.reshape(b, self.num_base_priors, -1, h, w) + objectness = objectness.reshape(b, self.num_base_priors, -1, h, w) + cls_score = cls_score.reshape(b, self.num_base_priors, -1, h, w) + head_pred = torch.cat([bbox_pred, objectness, cls_score], + dim=2).permute(0, 1, 3, 4, 2).contiguous() + head_preds.append(head_pred) + return head_preds + + def _decode_bbox_to_xywh(self, bbox_pred, priors_base_sizes, + grid_xy) -> Tensor: + bbox_pred = bbox_pred.sigmoid() + pred_xy = bbox_pred[:, :2] * 2 - 0.5 + grid_xy + pred_wh = (bbox_pred[:, 2:] * 2)**2 * priors_base_sizes + decoded_bbox_pred = torch.cat((pred_xy, pred_wh), dim=-1) + return decoded_bbox_pred diff --git a/mmyolo/models/dense_heads/yolov8_head.py b/mmyolo/models/dense_heads/yolov8_head.py new file mode 100644 index 0000000000000000000000000000000000000000..123d0dfb0d21e392dbffdc79a8cdcd4601e9e12a --- /dev/null +++ b/mmyolo/models/dense_heads/yolov8_head.py @@ -0,0 +1,398 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from typing import List, Sequence, Tuple, Union + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmdet.models.utils import multi_apply +from mmdet.utils import (ConfigType, OptConfigType, OptInstanceList, + OptMultiConfig) +from mmengine.dist import get_dist_info +from mmengine.model import BaseModule +from mmengine.structures import InstanceData +from torch import Tensor + +from mmyolo.registry import MODELS, TASK_UTILS +from ..utils import gt_instances_preprocess, make_divisible +from .yolov5_head import YOLOv5Head + + +@MODELS.register_module() +class YOLOv8HeadModule(BaseModule): + """YOLOv8HeadModule head module used in `YOLOv8`. + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (Union[int, Sequence]): Number of channels in the input + feature map. 
+ widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + num_base_priors (int): The number of priors (points) at a point + on the feature grid. + featmap_strides (Sequence[int]): Downsample factor of each feature map. + Defaults to [8, 16, 32]. + reg_max (int): Max value of integral set :math: ``{0, ..., reg_max-1}`` + in QFL setting. Defaults to 16. + norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization + layer. Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer. + Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + num_classes: int, + in_channels: Union[int, Sequence], + widen_factor: float = 1.0, + num_base_priors: int = 1, + featmap_strides: Sequence[int] = (8, 16, 32), + reg_max: int = 16, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None): + super().__init__(init_cfg=init_cfg) + self.num_classes = num_classes + self.featmap_strides = featmap_strides + self.num_levels = len(self.featmap_strides) + self.num_base_priors = num_base_priors + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.in_channels = in_channels + self.reg_max = reg_max + + in_channels = [] + for channel in self.in_channels: + channel = make_divisible(channel, widen_factor) + in_channels.append(channel) + self.in_channels = in_channels + + self._init_layers() + + def init_weights(self, prior_prob=0.01): + """Initialize the weight and bias of PPYOLOE head.""" + super().init_weights() + for reg_pred, cls_pred, stride in zip(self.reg_preds, self.cls_preds, + self.featmap_strides): + reg_pred[-1].bias.data[:] = 1.0 # box + # cls (.01 objects, 80 classes, 640 img) + cls_pred[-1].bias.data[:self.num_classes] = math.log( + 5 / self.num_classes / (640 / stride)**2) + + def _init_layers(self): + """initialize conv layers in YOLOv8 head.""" + # Init decouple head + self.cls_preds = nn.ModuleList() + self.reg_preds = nn.ModuleList() + + reg_out_channels = max( + (16, self.in_channels[0] // 4, self.reg_max * 4)) + cls_out_channels = max(self.in_channels[0], self.num_classes) + + for i in range(self.num_levels): + self.reg_preds.append( + nn.Sequential( + ConvModule( + in_channels=self.in_channels[i], + out_channels=reg_out_channels, + kernel_size=3, + stride=1, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + ConvModule( + in_channels=reg_out_channels, + out_channels=reg_out_channels, + kernel_size=3, + stride=1, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + nn.Conv2d( + in_channels=reg_out_channels, + out_channels=4 * self.reg_max, + kernel_size=1))) + self.cls_preds.append( + nn.Sequential( + ConvModule( + in_channels=self.in_channels[i], + out_channels=cls_out_channels, + kernel_size=3, + stride=1, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + ConvModule( + in_channels=cls_out_channels, + out_channels=cls_out_channels, + kernel_size=3, + stride=1, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + nn.Conv2d( + in_channels=cls_out_channels, + out_channels=self.num_classes, + kernel_size=1))) + + proj = torch.arange(self.reg_max, dtype=torch.float) + self.register_buffer('proj', proj, persistent=False) + + def forward(self, x: Tuple[Tensor]) -> 
Tuple[List]: + """Forward features from the upstream network. + + Args: + x (Tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + Returns: + Tuple[List]: A tuple of multi-level classification scores, bbox + predictions + """ + assert len(x) == self.num_levels + return multi_apply(self.forward_single, x, self.cls_preds, + self.reg_preds) + + def forward_single(self, x: torch.Tensor, cls_pred: nn.ModuleList, + reg_pred: nn.ModuleList) -> Tuple: + """Forward feature of a single scale level.""" + b, _, h, w = x.shape + cls_logit = cls_pred(x) + bbox_dist_preds = reg_pred(x) + if self.reg_max > 1: + bbox_dist_preds = bbox_dist_preds.reshape( + [-1, 4, self.reg_max, h * w]).permute(0, 3, 1, 2) + + # TODO: The get_flops script cannot handle the situation of + # matmul, and needs to be fixed later + # bbox_preds = bbox_dist_preds.softmax(3).matmul(self.proj) + bbox_preds = bbox_dist_preds.softmax(3).matmul( + self.proj.view([-1, 1])).squeeze(-1) + bbox_preds = bbox_preds.transpose(1, 2).reshape(b, -1, h, w) + else: + bbox_preds = bbox_dist_preds + if self.training: + return cls_logit, bbox_preds, bbox_dist_preds + else: + return cls_logit, bbox_preds + + +@MODELS.register_module() +class YOLOv8Head(YOLOv5Head): + """YOLOv8Head head used in `YOLOv8`. + + Args: + head_module(:obj:`ConfigDict` or dict): Base module used for YOLOv8Head + prior_generator(dict): Points generator feature maps + in 2D points-based detectors. + bbox_coder (:obj:`ConfigDict` or dict): Config of bbox coder. + loss_cls (:obj:`ConfigDict` or dict): Config of classification loss. + loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss. + loss_dfl (:obj:`ConfigDict` or dict): Config of Distribution Focal + Loss. + train_cfg (:obj:`ConfigDict` or dict, optional): Training config of + anchor head. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of + anchor head. Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + head_module: ConfigType, + prior_generator: ConfigType = dict( + type='mmdet.MlvlPointGenerator', + offset=0.5, + strides=[8, 16, 32]), + bbox_coder: ConfigType = dict(type='DistancePointBBoxCoder'), + loss_cls: ConfigType = dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='none', + loss_weight=0.5), + loss_bbox: ConfigType = dict( + type='IoULoss', + iou_mode='ciou', + bbox_format='xyxy', + reduction='sum', + loss_weight=7.5, + return_iou=False), + loss_dfl=dict( + type='mmdet.DistributionFocalLoss', + reduction='mean', + loss_weight=1.5 / 4), + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + init_cfg: OptMultiConfig = None + ): + super().__init__( + head_module=head_module, + prior_generator=prior_generator, + bbox_coder=bbox_coder, + loss_cls=loss_cls, + loss_bbox=loss_bbox, + train_cfg=train_cfg, + test_cfg=test_cfg, + init_cfg=init_cfg) + self.loss_dfl = MODELS.build(loss_dfl) + # YOLOv8 doesn't need loss_obj + self.loss_obj = None + + def special_init(self): + """Since YOLO series algorithms will inherit from YOLOv5Head, but + different algorithms have special initialization process. + + The special_init function is designed to deal with this situation. 
+ """ + + if self.train_cfg: + self.assigner = TASK_UTILS.build(self.train_cfg.assigner) + + # Add common attributes to reduce calculation + self.featmap_sizes_train = None + self.num_level_priors = None + self.flatten_priors_train = None + self.stride_tensor = None + + def loss_by_feat( + self, + cls_scores: Sequence[Tensor], + bbox_preds: Sequence[Tensor], + bbox_dist_preds: Sequence[Tensor], + batch_gt_instances: Sequence[InstanceData], + batch_img_metas: Sequence[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (Sequence[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_priors * num_classes. + bbox_preds (Sequence[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_priors * 4. + bbox_dist_preds (Sequence[Tensor]): Box distribution logits for + each scale level with shape (bs, reg_max + 1, H*W, 4). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + Returns: + dict[str, Tensor]: A dictionary of losses. + """ + num_imgs = len(batch_img_metas) + + current_featmap_sizes = [ + cls_score.shape[2:] for cls_score in cls_scores + ] + # If the shape does not equal, generate new one + if current_featmap_sizes != self.featmap_sizes_train: + self.featmap_sizes_train = current_featmap_sizes + + mlvl_priors_with_stride = self.prior_generator.grid_priors( + self.featmap_sizes_train, + dtype=cls_scores[0].dtype, + device=cls_scores[0].device, + with_stride=True) + + self.num_level_priors = [len(n) for n in mlvl_priors_with_stride] + self.flatten_priors_train = torch.cat( + mlvl_priors_with_stride, dim=0) + self.stride_tensor = self.flatten_priors_train[..., [2]] + + # gt info + gt_info = gt_instances_preprocess(batch_gt_instances, num_imgs) + gt_labels = gt_info[:, :, :1] + gt_bboxes = gt_info[:, :, 1:] # xyxy + pad_bbox_flag = (gt_bboxes.sum(-1, keepdim=True) > 0).float() + + # pred info + flatten_cls_preds = [ + cls_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.num_classes) + for cls_pred in cls_scores + ] + flatten_pred_bboxes = [ + bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) + for bbox_pred in bbox_preds + ] + # (bs, n, 4 * reg_max) + flatten_pred_dists = [ + bbox_pred_org.reshape(num_imgs, -1, self.head_module.reg_max * 4) + for bbox_pred_org in bbox_dist_preds + ] + + flatten_dist_preds = torch.cat(flatten_pred_dists, dim=1) + flatten_cls_preds = torch.cat(flatten_cls_preds, dim=1) + flatten_pred_bboxes = torch.cat(flatten_pred_bboxes, dim=1) + flatten_pred_bboxes = self.bbox_coder.decode( + self.flatten_priors_train[..., :2], flatten_pred_bboxes, + self.stride_tensor[..., 0]) + + assigned_result = self.assigner( + (flatten_pred_bboxes.detach()).type(gt_bboxes.dtype), + flatten_cls_preds.detach().sigmoid(), self.flatten_priors_train, + gt_labels, gt_bboxes, pad_bbox_flag) + + assigned_bboxes = assigned_result['assigned_bboxes'] + assigned_scores = assigned_result['assigned_scores'] + fg_mask_pre_prior = assigned_result['fg_mask_pre_prior'] + + assigned_scores_sum = 
assigned_scores.sum().clamp(min=1) + + loss_cls = self.loss_cls(flatten_cls_preds, assigned_scores).sum() + loss_cls /= assigned_scores_sum + + # rescale bbox + assigned_bboxes /= self.stride_tensor + flatten_pred_bboxes /= self.stride_tensor + + # select positive samples mask + num_pos = fg_mask_pre_prior.sum() + if num_pos > 0: + # when num_pos > 0, assigned_scores_sum will >0, so the loss_bbox + # will not report an error + # iou loss + prior_bbox_mask = fg_mask_pre_prior.unsqueeze(-1).repeat([1, 1, 4]) + pred_bboxes_pos = torch.masked_select( + flatten_pred_bboxes, prior_bbox_mask).reshape([-1, 4]) + assigned_bboxes_pos = torch.masked_select( + assigned_bboxes, prior_bbox_mask).reshape([-1, 4]) + bbox_weight = torch.masked_select( + assigned_scores.sum(-1), fg_mask_pre_prior).unsqueeze(-1) + loss_bbox = self.loss_bbox( + pred_bboxes_pos, assigned_bboxes_pos, + weight=bbox_weight) / assigned_scores_sum + + # dfl loss + pred_dist_pos = flatten_dist_preds[fg_mask_pre_prior] + assigned_ltrb = self.bbox_coder.encode( + self.flatten_priors_train[..., :2] / self.stride_tensor, + assigned_bboxes, + max_dis=self.head_module.reg_max - 1, + eps=0.01) + assigned_ltrb_pos = torch.masked_select( + assigned_ltrb, prior_bbox_mask).reshape([-1, 4]) + loss_dfl = self.loss_dfl( + pred_dist_pos.reshape(-1, self.head_module.reg_max), + assigned_ltrb_pos.reshape(-1), + weight=bbox_weight.expand(-1, 4).reshape(-1), + avg_factor=assigned_scores_sum) + else: + loss_bbox = flatten_pred_bboxes.sum() * 0 + loss_dfl = flatten_pred_bboxes.sum() * 0 + _, world_size = get_dist_info() + return dict( + loss_cls=loss_cls * num_imgs * world_size, + loss_bbox=loss_bbox * num_imgs * world_size, + loss_dfl=loss_dfl * num_imgs * world_size) diff --git a/mmyolo/models/dense_heads/yolox_head.py b/mmyolo/models/dense_heads/yolox_head.py new file mode 100644 index 0000000000000000000000000000000000000000..a203298d8536148a7022711eabeee7f04fea8ab4 --- /dev/null +++ b/mmyolo/models/dense_heads/yolox_head.py @@ -0,0 +1,514 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Sequence, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule +from mmdet.models.task_modules.samplers import PseudoSampler +from mmdet.models.utils import multi_apply +from mmdet.structures.bbox import bbox_xyxy_to_cxcywh +from mmdet.utils import (ConfigType, OptConfigType, OptInstanceList, + OptMultiConfig, reduce_mean) +from mmengine.model import BaseModule, bias_init_with_prob +from mmengine.structures import InstanceData +from torch import Tensor + +from mmyolo.registry import MODELS, TASK_UTILS +from .yolov5_head import YOLOv5Head + + +@MODELS.register_module() +class YOLOXHeadModule(BaseModule): + """YOLOXHead head module used in `YOLOX. + + ``_ + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (Union[int, Sequence]): Number of channels in the input + feature map. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + num_base_priors (int): The number of priors (points) at a point + on the feature grid + stacked_convs (int): Number of stacking convs of the head. + Defaults to 2. + featmap_strides (Sequence[int]): Downsample factor of each feature map. + Defaults to [8, 16, 32]. + use_depthwise (bool): Whether to depthwise separable convolution in + blocks. Defaults to False. 
+ dcn_on_last_conv (bool): If true, use dcn in the last layer of + towers. Defaults to False. + conv_bias (bool or str): If specified as `auto`, it will be decided by + the norm_cfg. Bias of conv will be set as True if `norm_cfg` is + None, otherwise False. Defaults to "auto". + conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + convolution layer. Defaults to None. + norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization + layer. Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer. + Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__( + self, + num_classes: int, + in_channels: Union[int, Sequence], + widen_factor: float = 1.0, + num_base_priors: int = 1, + feat_channels: int = 256, + stacked_convs: int = 2, + featmap_strides: Sequence[int] = [8, 16, 32], + use_depthwise: bool = False, + dcn_on_last_conv: bool = False, + conv_bias: Union[bool, str] = 'auto', + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict(type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None, + ): + super().__init__(init_cfg=init_cfg) + self.num_classes = num_classes + self.feat_channels = int(feat_channels * widen_factor) + self.stacked_convs = stacked_convs + self.use_depthwise = use_depthwise + self.dcn_on_last_conv = dcn_on_last_conv + assert conv_bias == 'auto' or isinstance(conv_bias, bool) + self.conv_bias = conv_bias + self.num_base_priors = num_base_priors + + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.featmap_strides = featmap_strides + + if isinstance(in_channels, int): + in_channels = int(in_channels * widen_factor) + self.in_channels = in_channels + + self._init_layers() + + def _init_layers(self): + """Initialize heads for all level feature maps.""" + self.multi_level_cls_convs = nn.ModuleList() + self.multi_level_reg_convs = nn.ModuleList() + self.multi_level_conv_cls = nn.ModuleList() + self.multi_level_conv_reg = nn.ModuleList() + self.multi_level_conv_obj = nn.ModuleList() + for _ in self.featmap_strides: + self.multi_level_cls_convs.append(self._build_stacked_convs()) + self.multi_level_reg_convs.append(self._build_stacked_convs()) + conv_cls, conv_reg, conv_obj = self._build_predictor() + self.multi_level_conv_cls.append(conv_cls) + self.multi_level_conv_reg.append(conv_reg) + self.multi_level_conv_obj.append(conv_obj) + + def _build_stacked_convs(self) -> nn.Sequential: + """Initialize conv layers of a single level head.""" + conv = DepthwiseSeparableConvModule \ + if self.use_depthwise else ConvModule + stacked_convs = [] + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + if self.dcn_on_last_conv and i == self.stacked_convs - 1: + conv_cfg = dict(type='DCNv2') + else: + conv_cfg = self.conv_cfg + stacked_convs.append( + conv( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + bias=self.conv_bias)) + return nn.Sequential(*stacked_convs) + + def _build_predictor(self) -> Tuple[nn.Module, nn.Module, nn.Module]: + """Initialize predictor layers of a single level head.""" + conv_cls = nn.Conv2d(self.feat_channels, self.num_classes, 1) + conv_reg = nn.Conv2d(self.feat_channels, 4, 1) + conv_obj = 
nn.Conv2d(self.feat_channels, 1, 1) + return conv_cls, conv_reg, conv_obj + + def init_weights(self): + """Initialize weights of the head.""" + # Use prior in model initialization to improve stability + super().init_weights() + bias_init = bias_init_with_prob(0.01) + for conv_cls, conv_obj in zip(self.multi_level_conv_cls, + self.multi_level_conv_obj): + conv_cls.bias.data.fill_(bias_init) + conv_obj.bias.data.fill_(bias_init) + + def forward(self, x: Tuple[Tensor]) -> Tuple[List]: + """Forward features from the upstream network. + + Args: + x (Tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + Returns: + Tuple[List]: A tuple of multi-level classification scores, bbox + predictions, and objectnesses. + """ + + return multi_apply(self.forward_single, x, self.multi_level_cls_convs, + self.multi_level_reg_convs, + self.multi_level_conv_cls, + self.multi_level_conv_reg, + self.multi_level_conv_obj) + + def forward_single(self, x: Tensor, cls_convs: nn.Module, + reg_convs: nn.Module, conv_cls: nn.Module, + conv_reg: nn.Module, + conv_obj: nn.Module) -> Tuple[Tensor, Tensor, Tensor]: + """Forward feature of a single scale level.""" + + cls_feat = cls_convs(x) + reg_feat = reg_convs(x) + + cls_score = conv_cls(cls_feat) + bbox_pred = conv_reg(reg_feat) + objectness = conv_obj(reg_feat) + + return cls_score, bbox_pred, objectness + + +@MODELS.register_module() +class YOLOXHead(YOLOv5Head): + """YOLOXHead head used in `YOLOX `_. + + Args: + head_module(ConfigType): Base module used for YOLOXHead + prior_generator: Points generator feature maps in + 2D points-based detectors. + loss_cls (:obj:`ConfigDict` or dict): Config of classification loss. + loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss. + loss_obj (:obj:`ConfigDict` or dict): Config of objectness loss. + loss_bbox_aux (:obj:`ConfigDict` or dict): Config of bbox aux loss. + train_cfg (:obj:`ConfigDict` or dict, optional): Training config of + anchor head. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of + anchor head. Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + head_module: ConfigType, + prior_generator: ConfigType = dict( + type='mmdet.MlvlPointGenerator', + offset=0, + strides=[8, 16, 32]), + bbox_coder: ConfigType = dict(type='YOLOXBBoxCoder'), + loss_cls: ConfigType = dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='sum', + loss_weight=1.0), + loss_bbox: ConfigType = dict( + type='mmdet.IoULoss', + mode='square', + eps=1e-16, + reduction='sum', + loss_weight=5.0), + loss_obj: ConfigType = dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='sum', + loss_weight=1.0), + loss_bbox_aux: ConfigType = dict( + type='mmdet.L1Loss', reduction='sum', loss_weight=1.0), + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + init_cfg: OptMultiConfig = None): + self.use_bbox_aux = False + self.loss_bbox_aux = loss_bbox_aux + + super().__init__( + head_module=head_module, + prior_generator=prior_generator, + bbox_coder=bbox_coder, + loss_cls=loss_cls, + loss_bbox=loss_bbox, + loss_obj=loss_obj, + train_cfg=train_cfg, + test_cfg=test_cfg, + init_cfg=init_cfg) + + def special_init(self): + """Since YOLO series algorithms will inherit from YOLOv5Head, but + different algorithms have special initialization process. 
+ + The special_init function is designed to deal with this situation. + """ + self.loss_bbox_aux: nn.Module = MODELS.build(self.loss_bbox_aux) + if self.train_cfg: + self.assigner = TASK_UTILS.build(self.train_cfg.assigner) + # YOLOX does not support sampling + self.sampler = PseudoSampler() + + def forward(self, x: Tuple[Tensor]) -> Tuple[List]: + return self.head_module(x) + + def loss_by_feat( + self, + cls_scores: Sequence[Tensor], + bbox_preds: Sequence[Tensor], + objectnesses: Sequence[Tensor], + batch_gt_instances: Tensor, + batch_img_metas: Sequence[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (Sequence[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_priors * num_classes. + bbox_preds (Sequence[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_priors * 4. + objectnesses (Sequence[Tensor]): Score factor for + all scale level, each is a 4D-tensor, has shape + (batch_size, 1, H, W). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + Returns: + dict[str, Tensor]: A dictionary of losses. + """ + num_imgs = len(batch_img_metas) + if batch_gt_instances_ignore is None: + batch_gt_instances_ignore = [None] * num_imgs + + batch_gt_instances = self.gt_instances_preprocess( + batch_gt_instances, len(batch_img_metas)) + + featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores] + mlvl_priors = self.prior_generator.grid_priors( + featmap_sizes, + dtype=cls_scores[0].dtype, + device=cls_scores[0].device, + with_stride=True) + + flatten_cls_preds = [ + cls_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.num_classes) + for cls_pred in cls_scores + ] + flatten_bbox_preds = [ + bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) + for bbox_pred in bbox_preds + ] + flatten_objectness = [ + objectness.permute(0, 2, 3, 1).reshape(num_imgs, -1) + for objectness in objectnesses + ] + + flatten_cls_preds = torch.cat(flatten_cls_preds, dim=1) + flatten_bbox_preds = torch.cat(flatten_bbox_preds, dim=1) + flatten_objectness = torch.cat(flatten_objectness, dim=1) + flatten_priors = torch.cat(mlvl_priors) + flatten_bboxes = self.bbox_coder.decode(flatten_priors[..., :2], + flatten_bbox_preds, + flatten_priors[..., 2]) + + (pos_masks, cls_targets, obj_targets, bbox_targets, bbox_aux_target, + num_fg_imgs) = multi_apply( + self._get_targets_single, + flatten_priors.unsqueeze(0).repeat(num_imgs, 1, 1), + flatten_cls_preds.detach(), flatten_bboxes.detach(), + flatten_objectness.detach(), batch_gt_instances, batch_img_metas, + batch_gt_instances_ignore) + + # The experimental results show that 'reduce_mean' can improve + # performance on the COCO dataset. 
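+        # reduce_mean averages the positive-sample count across all GPUs, so
+        # every rank normalizes its loss by the same factor in distributed
+        # training. For example (hypothetical numbers), with two GPUs seeing
+        # 12 and 8 positives respectively, both ranks divide by 10 instead
+        # of by their local counts.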
+ num_pos = torch.tensor( + sum(num_fg_imgs), + dtype=torch.float, + device=flatten_cls_preds.device) + num_total_samples = max(reduce_mean(num_pos), 1.0) + + pos_masks = torch.cat(pos_masks, 0) + cls_targets = torch.cat(cls_targets, 0) + obj_targets = torch.cat(obj_targets, 0) + bbox_targets = torch.cat(bbox_targets, 0) + if self.use_bbox_aux: + bbox_aux_target = torch.cat(bbox_aux_target, 0) + + loss_obj = self.loss_obj(flatten_objectness.view(-1, 1), + obj_targets) / num_total_samples + if num_pos > 0: + loss_cls = self.loss_cls( + flatten_cls_preds.view(-1, self.num_classes)[pos_masks], + cls_targets) / num_total_samples + loss_bbox = self.loss_bbox( + flatten_bboxes.view(-1, 4)[pos_masks], + bbox_targets) / num_total_samples + else: + # Avoid cls and reg branch not participating in the gradient + # propagation when there is no ground-truth in the images. + # For more details, please refer to + # https://github.com/open-mmlab/mmdetection/issues/7298 + loss_cls = flatten_cls_preds.sum() * 0 + loss_bbox = flatten_bboxes.sum() * 0 + + loss_dict = dict( + loss_cls=loss_cls, loss_bbox=loss_bbox, loss_obj=loss_obj) + + if self.use_bbox_aux: + if num_pos > 0: + loss_bbox_aux = self.loss_bbox_aux( + flatten_bbox_preds.view(-1, 4)[pos_masks], + bbox_aux_target) / num_total_samples + else: + # Avoid cls and reg branch not participating in the gradient + # propagation when there is no ground-truth in the images. + # For more details, please refer to + # https://github.com/open-mmlab/mmdetection/issues/7298 + loss_bbox_aux = flatten_bbox_preds.sum() * 0 + loss_dict.update(loss_bbox_aux=loss_bbox_aux) + + return loss_dict + + @torch.no_grad() + def _get_targets_single( + self, + priors: Tensor, + cls_preds: Tensor, + decoded_bboxes: Tensor, + objectness: Tensor, + gt_instances: InstanceData, + img_meta: dict, + gt_instances_ignore: Optional[InstanceData] = None) -> tuple: + """Compute classification, regression, and objectness targets for + priors in a single image. + + Args: + priors (Tensor): All priors of one image, a 2D-Tensor with shape + [num_priors, 4] in [cx, xy, stride_w, stride_y] format. + cls_preds (Tensor): Classification predictions of one image, + a 2D-Tensor with shape [num_priors, num_classes] + decoded_bboxes (Tensor): Decoded bboxes predictions of one image, + a 2D-Tensor with shape [num_priors, 4] in [tl_x, tl_y, + br_x, br_y] format. + objectness (Tensor): Objectness predictions of one image, + a 1D-Tensor with shape [num_priors] + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It should includes ``bboxes`` and ``labels`` + attributes. + img_meta (dict): Meta information for current image. + gt_instances_ignore (:obj:`InstanceData`, optional): Instances + to be ignored during training. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + Returns: + tuple: + foreground_mask (list[Tensor]): Binary mask of foreground + targets. + cls_target (list[Tensor]): Classification targets of an image. + obj_target (list[Tensor]): Objectness targets of an image. + bbox_target (list[Tensor]): BBox targets of an image. + bbox_aux_target (int): BBox aux targets of an image. + num_pos_per_img (int): Number of positive samples in an image. 
+ """ + + num_priors = priors.size(0) + num_gts = len(gt_instances) + # No target + if num_gts == 0: + cls_target = cls_preds.new_zeros((0, self.num_classes)) + bbox_target = cls_preds.new_zeros((0, 4)) + bbox_aux_target = cls_preds.new_zeros((0, 4)) + obj_target = cls_preds.new_zeros((num_priors, 1)) + foreground_mask = cls_preds.new_zeros(num_priors).bool() + return (foreground_mask, cls_target, obj_target, bbox_target, + bbox_aux_target, 0) + + # YOLOX uses center priors with 0.5 offset to assign targets, + # but use center priors without offset to regress bboxes. + offset_priors = torch.cat( + [priors[:, :2] + priors[:, 2:] * 0.5, priors[:, 2:]], dim=-1) + + scores = cls_preds.sigmoid() * objectness.unsqueeze(1).sigmoid() + pred_instances = InstanceData( + bboxes=decoded_bboxes, scores=scores.sqrt_(), priors=offset_priors) + assign_result = self.assigner.assign( + pred_instances=pred_instances, + gt_instances=gt_instances, + gt_instances_ignore=gt_instances_ignore) + + sampling_result = self.sampler.sample(assign_result, pred_instances, + gt_instances) + pos_inds = sampling_result.pos_inds + num_pos_per_img = pos_inds.size(0) + + pos_ious = assign_result.max_overlaps[pos_inds] + # IOU aware classification score + cls_target = F.one_hot(sampling_result.pos_gt_labels, + self.num_classes) * pos_ious.unsqueeze(-1) + obj_target = torch.zeros_like(objectness).unsqueeze(-1) + obj_target[pos_inds] = 1 + bbox_target = sampling_result.pos_gt_bboxes + bbox_aux_target = cls_preds.new_zeros((num_pos_per_img, 4)) + if self.use_bbox_aux: + bbox_aux_target = self._get_bbox_aux_target( + bbox_aux_target, bbox_target, priors[pos_inds]) + foreground_mask = torch.zeros_like(objectness).to(torch.bool) + foreground_mask[pos_inds] = 1 + return (foreground_mask, cls_target, obj_target, bbox_target, + bbox_aux_target, num_pos_per_img) + + def _get_bbox_aux_target(self, + bbox_aux_target: Tensor, + gt_bboxes: Tensor, + priors: Tensor, + eps: float = 1e-8) -> Tensor: + """Convert gt bboxes to center offset and log width height.""" + gt_cxcywh = bbox_xyxy_to_cxcywh(gt_bboxes) + bbox_aux_target[:, :2] = (gt_cxcywh[:, :2] - + priors[:, :2]) / priors[:, 2:] + bbox_aux_target[:, + 2:] = torch.log(gt_cxcywh[:, 2:] / priors[:, 2:] + eps) + return bbox_aux_target + + @staticmethod + def gt_instances_preprocess(batch_gt_instances: Tensor, + batch_size: int) -> List[InstanceData]: + """Split batch_gt_instances with batch size. + + Args: + batch_gt_instances (Tensor): Ground truth + a 2D-Tensor for whole batch, shape [all_gt_bboxes, 6] + batch_size (int): Batch size. + + Returns: + List: batch gt instances data, shape [batch_size, InstanceData] + """ + # faster version + batch_instance_list = [] + for i in range(batch_size): + batch_gt_instance_ = InstanceData() + single_batch_instance = \ + batch_gt_instances[batch_gt_instances[:, 0] == i, :] + batch_gt_instance_.bboxes = single_batch_instance[:, 2:] + batch_gt_instance_.labels = single_batch_instance[:, 1] + batch_instance_list.append(batch_gt_instance_) + + return batch_instance_list diff --git a/mmyolo/models/detectors/__init__.py b/mmyolo/models/detectors/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..74fb1c6c21c5840a5cd3f45a1a9f827c0e670604 --- /dev/null +++ b/mmyolo/models/detectors/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .yolo_detector import YOLODetector + +__all__ = ['YOLODetector'] diff --git a/mmyolo/models/detectors/yolo_detector.py b/mmyolo/models/detectors/yolo_detector.py new file mode 100644 index 0000000000000000000000000000000000000000..e6783fbab41287df54f136ea121e827d0603414f --- /dev/null +++ b/mmyolo/models/detectors/yolo_detector.py @@ -0,0 +1,53 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from mmdet.models.detectors.single_stage import SingleStageDetector +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from mmengine.dist import get_world_size +from mmengine.logging import print_log + +from mmyolo.registry import MODELS + + +@MODELS.register_module() +class YOLODetector(SingleStageDetector): + r"""Implementation of YOLO Series + + Args: + backbone (:obj:`ConfigDict` or dict): The backbone config. + neck (:obj:`ConfigDict` or dict): The neck config. + bbox_head (:obj:`ConfigDict` or dict): The bbox head config. + train_cfg (:obj:`ConfigDict` or dict, optional): The training config + of YOLO. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): The testing config + of YOLO. Defaults to None. + data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of + :class:`DetDataPreprocessor` to process the input data. + Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. + use_syncbn (bool): whether to use SyncBatchNorm. Defaults to True. + """ + + def __init__(self, + backbone: ConfigType, + neck: ConfigType, + bbox_head: ConfigType, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None, + use_syncbn: bool = True): + super().__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) + + # TODO: Waiting for mmengine support + if use_syncbn and get_world_size() > 1: + torch.nn.SyncBatchNorm.convert_sync_batchnorm(self) + print_log('Using SyncBatchNorm()', 'current') diff --git a/mmyolo/models/layers/__init__.py b/mmyolo/models/layers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f709dbb7e4bbd4c1a9d294a8d4cd28de2f2e457f --- /dev/null +++ b/mmyolo/models/layers/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .ema import ExpMomentumEMA +from .yolo_bricks import (BepC3StageBlock, CSPLayerWithTwoConv, + DarknetBottleneck, EELANBlock, EffectiveSELayer, + ELANBlock, ImplicitA, ImplicitM, + MaxPoolAndStrideConvBlock, PPYOLOEBasicBlock, + RepStageBlock, RepVGGBlock, SPPFBottleneck, + SPPFCSPBlock, TinyDownSampleBlock) + +__all__ = [ + 'SPPFBottleneck', 'RepVGGBlock', 'RepStageBlock', 'ExpMomentumEMA', + 'ELANBlock', 'MaxPoolAndStrideConvBlock', 'SPPFCSPBlock', + 'PPYOLOEBasicBlock', 'EffectiveSELayer', 'TinyDownSampleBlock', + 'EELANBlock', 'ImplicitA', 'ImplicitM', 'BepC3StageBlock', + 'CSPLayerWithTwoConv', 'DarknetBottleneck' +] diff --git a/mmyolo/models/layers/ema.py b/mmyolo/models/layers/ema.py new file mode 100644 index 0000000000000000000000000000000000000000..02ed204190ee4a5ab9395eddce5866545caac2c0 --- /dev/null +++ b/mmyolo/models/layers/ema.py @@ -0,0 +1,96 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
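The use_syncbn branch in YOLODetector above relies on PyTorch's built-in conversion helper; a minimal stand-alone sketch on a toy module is shown below (the swap itself does not need a distributed launch, although SyncBatchNorm only synchronizes statistics when training with more than one process, which is why the detector also checks get_world_size()).

import torch.nn as nn

toy = nn.Sequential(nn.Conv2d(3, 8, 3), nn.BatchNorm2d(8), nn.ReLU())
toy = nn.SyncBatchNorm.convert_sync_batchnorm(toy)   # BN layers replaced in place
print(type(toy[1]).__name__)                         # SyncBatchNorm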
+import math +from typing import Optional + +import torch +import torch.nn as nn +from mmdet.models.layers import ExpMomentumEMA as MMDET_ExpMomentumEMA +from torch import Tensor + +from mmyolo.registry import MODELS + + +@MODELS.register_module() +class ExpMomentumEMA(MMDET_ExpMomentumEMA): + """Exponential moving average (EMA) with exponential momentum strategy, + which is used in YOLO. + + Args: + model (nn.Module): The model to be averaged. + momentum (float): The momentum used for updating ema parameter. + Ema's parameters are updated with the formula: + `averaged_param = (1-momentum) * averaged_param + momentum * + source_param`. Defaults to 0.0002. + gamma (int): Use a larger momentum early in training and gradually + annealing to a smaller value to update the ema model smoothly. The + momentum is calculated as + `(1 - momentum) * exp(-(1 + steps) / gamma) + momentum`. + Defaults to 2000. + interval (int): Interval between two updates. Defaults to 1. + device (torch.device, optional): If provided, the averaged model will + be stored on the :attr:`device`. Defaults to None. + update_buffers (bool): if True, it will compute running averages for + both the parameters and the buffers of the model. Defaults to + False. + """ + + def __init__(self, + model: nn.Module, + momentum: float = 0.0002, + gamma: int = 2000, + interval=1, + device: Optional[torch.device] = None, + update_buffers: bool = False): + super().__init__( + model=model, + momentum=momentum, + interval=interval, + device=device, + update_buffers=update_buffers) + assert gamma > 0, f'gamma must be greater than 0, but got {gamma}' + self.gamma = gamma + + # Note: There is no need to re-fetch every update, + # as most models do not change their structure + # during the training process. + self.src_parameters = ( + model.state_dict() + if self.update_buffers else dict(model.named_parameters())) + if not self.update_buffers: + self.src_buffers = model.buffers() + + def avg_func(self, averaged_param: Tensor, source_param: Tensor, + steps: int): + """Compute the moving average of the parameters using the exponential + momentum strategy. + + Args: + averaged_param (Tensor): The averaged parameters. + source_param (Tensor): The source parameters. + steps (int): The number of times the parameters have been + updated. + """ + momentum = (1 - self.momentum) * math.exp( + -float(1 + steps) / self.gamma) + self.momentum + averaged_param.lerp_(source_param, momentum) + + def update_parameters(self, model: nn.Module): + """Update the parameters after each training step. + + Args: + model (nn.Module): The model of the parameter needs to be updated. + """ + if self.steps == 0: + for k, p_avg in self.avg_parameters.items(): + p_avg.data.copy_(self.src_parameters[k].data) + elif self.steps % self.interval == 0: + for k, p_avg in self.avg_parameters.items(): + if p_avg.dtype.is_floating_point: + self.avg_func(p_avg.data, self.src_parameters[k].data, + self.steps) + if not self.update_buffers: + # If not update the buffers, + # keep the buffers in sync with the source model. + for b_avg, b_src in zip(self.module.buffers(), self.src_buffers): + b_avg.data.copy_(b_src.data) + self.steps += 1 diff --git a/mmyolo/models/layers/yolo_bricks.py b/mmyolo/models/layers/yolo_bricks.py new file mode 100644 index 0000000000000000000000000000000000000000..2e69d528bf6268a895f913fac25d89f5f35e3301 --- /dev/null +++ b/mmyolo/models/layers/yolo_bricks.py @@ -0,0 +1,1510 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
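The momentum schedule implemented by ExpMomentumEMA.avg_func above starts near 1.0 (so the EMA tracks the source model closely early in training) and decays toward the small base momentum. A minimal sketch with the default hyper-parameters; the printed values are approximate.

import math

def ema_momentum(step, momentum=0.0002, gamma=2000):
    # momentum applied at a given update step by avg_func above
    return (1 - momentum) * math.exp(-float(1 + step) / gamma) + momentum

for step in (0, 1000, 5000, 20000):
    print(step, round(ema_momentum(step), 6))
# ≈ 0.9995, ≈ 0.606, ≈ 0.082, ≈ 0.00025 — decaying toward the 0.0002 floor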
+from typing import Optional, Sequence, Tuple, Union + +import numpy as np +import torch +import torch.nn as nn +from mmcv.cnn import (ConvModule, DepthwiseSeparableConvModule, MaxPool2d, + build_norm_layer) +from mmdet.models.layers.csp_layer import \ + DarknetBottleneck as MMDET_DarknetBottleneck +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from mmengine.model import BaseModule +from mmengine.utils import digit_version +from torch import Tensor + +from mmyolo.registry import MODELS + +if digit_version(torch.__version__) >= digit_version('1.7.0'): + MODELS.register_module(module=nn.SiLU, name='SiLU') +else: + + class SiLU(nn.Module): + """Sigmoid Weighted Liner Unit.""" + + def __init__(self, inplace=True): + super().__init__() + + def forward(self, inputs) -> Tensor: + return inputs * torch.sigmoid(inputs) + + MODELS.register_module(module=SiLU, name='SiLU') + + +class SPPFBottleneck(BaseModule): + """Spatial pyramid pooling - Fast (SPPF) layer for + YOLOv5, YOLOX and PPYOLOE by Glenn Jocher + + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The output channels of this Module. + kernel_sizes (int, tuple[int]): Sequential or number of kernel + sizes of pooling layers. Defaults to 5. + use_conv_first (bool): Whether to use conv before pooling layer. + In YOLOv5 and YOLOX, the para set to True. + In PPYOLOE, the para set to False. + Defaults to True. + mid_channels_scale (float): Channel multiplier, multiply in_channels + by this amount to get mid_channels. This parameter is valid only + when use_conv_fist=True.Defaults to 0.5. + conv_cfg (dict): Config dict for convolution layer. Defaults to None. + which means using conv2d. Defaults to None. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + in_channels: int, + out_channels: int, + kernel_sizes: Union[int, Sequence[int]] = 5, + use_conv_first: bool = True, + mid_channels_scale: float = 0.5, + conv_cfg: ConfigType = None, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None): + super().__init__(init_cfg) + + if use_conv_first: + mid_channels = int(in_channels * mid_channels_scale) + self.conv1 = ConvModule( + in_channels, + mid_channels, + 1, + stride=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + else: + mid_channels = in_channels + self.conv1 = None + self.kernel_sizes = kernel_sizes + if isinstance(kernel_sizes, int): + self.poolings = nn.MaxPool2d( + kernel_size=kernel_sizes, stride=1, padding=kernel_sizes // 2) + conv2_in_channels = mid_channels * 4 + else: + self.poolings = nn.ModuleList([ + nn.MaxPool2d(kernel_size=ks, stride=1, padding=ks // 2) + for ks in kernel_sizes + ]) + conv2_in_channels = mid_channels * (len(kernel_sizes) + 1) + + self.conv2 = ConvModule( + conv2_in_channels, + out_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + def forward(self, x: Tensor) -> Tensor: + """Forward process + Args: + x (Tensor): The input tensor. 
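The sequential-pooling trick used by SPPFBottleneck (see the forward body that follows) relies on repeated stride-1 5x5 max-poolings covering the same receptive fields as the parallel 5/9/13 poolings of the original SPP layer. This can be checked directly; the snippet below is an independent sanity sketch, not part of the module.

import torch
import torch.nn as nn

x = torch.randn(1, 4, 32, 32)
p5 = nn.MaxPool2d(5, stride=1, padding=2)
p9 = nn.MaxPool2d(9, stride=1, padding=4)
p13 = nn.MaxPool2d(13, stride=1, padding=6)

y1 = p5(x)
y2 = p5(y1)          # same as a single 9x9 pooling
y3 = p5(y2)          # same as a single 13x13 pooling
print(torch.equal(y2, p9(x)), torch.equal(y3, p13(x)))  # True True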
+ """ + if self.conv1: + x = self.conv1(x) + if isinstance(self.kernel_sizes, int): + y1 = self.poolings(x) + y2 = self.poolings(y1) + x = torch.cat([x, y1, y2, self.poolings(y2)], dim=1) + else: + x = torch.cat( + [x] + [pooling(x) for pooling in self.poolings], dim=1) + x = self.conv2(x) + return x + + +@MODELS.register_module() +class RepVGGBlock(nn.Module): + """RepVGGBlock is a basic rep-style block, including training and deploy + status This code is based on + https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py. + + Args: + in_channels (int): Number of channels in the input image + out_channels (int): Number of channels produced by the convolution + kernel_size (int or tuple): Size of the convolving kernel + stride (int or tuple): Stride of the convolution. Default: 1 + padding (int, tuple): Padding added to all four sides of + the input. Default: 1 + dilation (int or tuple): Spacing between kernel elements. Default: 1 + groups (int, optional): Number of blocked connections from input + channels to output channels. Default: 1 + padding_mode (string, optional): Default: 'zeros' + use_se (bool): Whether to use se. Default: False + use_alpha (bool): Whether to use `alpha` parameter at 1x1 conv. + In PPYOLOE+ model backbone, `use_alpha` will be set to True. + Default: False. + use_bn_first (bool): Whether to use bn layer before conv. + In YOLOv6 and YOLOv7, this will be set to True. + In PPYOLOE, this will be set to False. + Default: True. + deploy (bool): Whether in deploy mode. Default: False + """ + + def __init__(self, + in_channels: int, + out_channels: int, + kernel_size: Union[int, Tuple[int]] = 3, + stride: Union[int, Tuple[int]] = 1, + padding: Union[int, Tuple[int]] = 1, + dilation: Union[int, Tuple[int]] = 1, + groups: Optional[int] = 1, + padding_mode: Optional[str] = 'zeros', + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='ReLU', inplace=True), + use_se: bool = False, + use_alpha: bool = False, + use_bn_first=True, + deploy: bool = False): + super().__init__() + self.deploy = deploy + self.groups = groups + self.in_channels = in_channels + self.out_channels = out_channels + + assert kernel_size == 3 + assert padding == 1 + + padding_11 = padding - kernel_size // 2 + + self.nonlinearity = MODELS.build(act_cfg) + + if use_se: + raise NotImplementedError('se block not supported yet') + else: + self.se = nn.Identity() + + if use_alpha: + alpha = torch.ones([ + 1, + ], dtype=torch.float32, requires_grad=True) + self.alpha = nn.Parameter(alpha, requires_grad=True) + else: + self.alpha = None + + if deploy: + self.rbr_reparam = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias=True, + padding_mode=padding_mode) + + else: + if use_bn_first and (out_channels == in_channels) and stride == 1: + self.rbr_identity = build_norm_layer( + norm_cfg, num_features=in_channels)[1] + else: + self.rbr_identity = None + + self.rbr_dense = ConvModule( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + bias=False, + norm_cfg=norm_cfg, + act_cfg=None) + self.rbr_1x1 = ConvModule( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=stride, + padding=padding_11, + groups=groups, + bias=False, + norm_cfg=norm_cfg, + act_cfg=None) + + def forward(self, inputs: Tensor) -> Tensor: + """Forward process. 
+ Args: + inputs (Tensor): The input tensor. + + Returns: + Tensor: The output tensor. + """ + if hasattr(self, 'rbr_reparam'): + return self.nonlinearity(self.se(self.rbr_reparam(inputs))) + + if self.rbr_identity is None: + id_out = 0 + else: + id_out = self.rbr_identity(inputs) + if self.alpha: + return self.nonlinearity( + self.se( + self.rbr_dense(inputs) + + self.alpha * self.rbr_1x1(inputs) + id_out)) + else: + return self.nonlinearity( + self.se( + self.rbr_dense(inputs) + self.rbr_1x1(inputs) + id_out)) + + def get_equivalent_kernel_bias(self): + """Derives the equivalent kernel and bias in a differentiable way. + + Returns: + tuple: Equivalent kernel and bias + """ + kernel3x3, bias3x3 = self._fuse_bn_tensor(self.rbr_dense) + kernel1x1, bias1x1 = self._fuse_bn_tensor(self.rbr_1x1) + kernelid, biasid = self._fuse_bn_tensor(self.rbr_identity) + if self.alpha: + return kernel3x3 + self.alpha * self._pad_1x1_to_3x3_tensor( + kernel1x1) + kernelid, bias3x3 + self.alpha * bias1x1 + biasid + else: + return kernel3x3 + self._pad_1x1_to_3x3_tensor( + kernel1x1) + kernelid, bias3x3 + bias1x1 + biasid + + def _pad_1x1_to_3x3_tensor(self, kernel1x1): + """Pad 1x1 tensor to 3x3. + Args: + kernel1x1 (Tensor): The input 1x1 kernel need to be padded. + + Returns: + Tensor: 3x3 kernel after padded. + """ + if kernel1x1 is None: + return 0 + else: + return torch.nn.functional.pad(kernel1x1, [1, 1, 1, 1]) + + def _fuse_bn_tensor(self, branch: nn.Module) -> Tuple[np.ndarray, Tensor]: + """Derives the equivalent kernel and bias of a specific branch layer. + + Args: + branch (nn.Module): The layer that needs to be equivalently + transformed, which can be nn.Sequential or nn.Batchnorm2d + + Returns: + tuple: Equivalent kernel and bias + """ + if branch is None: + return 0, 0 + if isinstance(branch, ConvModule): + kernel = branch.conv.weight + running_mean = branch.bn.running_mean + running_var = branch.bn.running_var + gamma = branch.bn.weight + beta = branch.bn.bias + eps = branch.bn.eps + else: + assert isinstance(branch, (nn.SyncBatchNorm, nn.BatchNorm2d)) + if not hasattr(self, 'id_tensor'): + input_dim = self.in_channels // self.groups + kernel_value = np.zeros((self.in_channels, input_dim, 3, 3), + dtype=np.float32) + for i in range(self.in_channels): + kernel_value[i, i % input_dim, 1, 1] = 1 + self.id_tensor = torch.from_numpy(kernel_value).to( + branch.weight.device) + kernel = self.id_tensor + running_mean = branch.running_mean + running_var = branch.running_var + gamma = branch.weight + beta = branch.bias + eps = branch.eps + std = (running_var + eps).sqrt() + t = (gamma / std).reshape(-1, 1, 1, 1) + return kernel * t, beta - running_mean * gamma / std + + def switch_to_deploy(self): + """Switch to deploy mode.""" + if hasattr(self, 'rbr_reparam'): + return + kernel, bias = self.get_equivalent_kernel_bias() + self.rbr_reparam = nn.Conv2d( + in_channels=self.rbr_dense.conv.in_channels, + out_channels=self.rbr_dense.conv.out_channels, + kernel_size=self.rbr_dense.conv.kernel_size, + stride=self.rbr_dense.conv.stride, + padding=self.rbr_dense.conv.padding, + dilation=self.rbr_dense.conv.dilation, + groups=self.rbr_dense.conv.groups, + bias=True) + self.rbr_reparam.weight.data = kernel + self.rbr_reparam.bias.data = bias + for para in self.parameters(): + para.detach_() + self.__delattr__('rbr_dense') + self.__delattr__('rbr_1x1') + if hasattr(self, 'rbr_identity'): + self.__delattr__('rbr_identity') + if hasattr(self, 'id_tensor'): + self.__delattr__('id_tensor') + self.deploy = True + + 
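The heart of switch_to_deploy and _fuse_bn_tensor above is folding a BatchNorm into the preceding convolution. Below is a self-contained sketch of that algebra on a toy conv (the BN statistics are randomized so the equivalence check is non-trivial); it only illustrates the math, it is not the block itself.

import torch
import torch.nn as nn

conv = nn.Conv2d(8, 8, 3, padding=1, bias=False)
bn = nn.BatchNorm2d(8).eval()            # deployment uses running statistics
bn.running_mean.uniform_(-1, 1)
bn.running_var.uniform_(0.5, 1.5)
bn.weight.data.uniform_(0.5, 1.5)
bn.bias.data.uniform_(-1, 1)

fused = nn.Conv2d(8, 8, 3, padding=1, bias=True)
with torch.no_grad():
    std = (bn.running_var + bn.eps).sqrt()
    t = (bn.weight / std).reshape(-1, 1, 1, 1)
    fused.weight.copy_(conv.weight * t)                         # kernel * gamma / std
    fused.bias.copy_(bn.bias - bn.running_mean * bn.weight / std)

x = torch.randn(2, 8, 16, 16)
with torch.no_grad():
    print(torch.allclose(bn(conv(x)), fused(x), atol=1e-5))     # True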
+@MODELS.register_module() +class BepC3StageBlock(nn.Module): + """Beer-mug RepC3 Block. + + Args: + in_channels (int): Number of channels in the input image + out_channels (int): Number of channels produced by the convolution + num_blocks (int): Number of blocks. Defaults to 1 + hidden_ratio (float): Hidden channel expansion. + Default: 0.5 + concat_all_layer (bool): Concat all layer when forward calculate. + Default: True + block_cfg (dict): Config dict for the block used to build each + layer. Defaults to dict(type='RepVGGBlock'). + norm_cfg (ConfigType): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (ConfigType): Config dict for activation layer. + Defaults to dict(type='ReLU', inplace=True). + """ + + def __init__(self, + in_channels: int, + out_channels: int, + num_blocks: int = 1, + hidden_ratio: float = 0.5, + concat_all_layer: bool = True, + block_cfg: ConfigType = dict(type='RepVGGBlock'), + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='ReLU', inplace=True)): + super().__init__() + hidden_channels = int(out_channels * hidden_ratio) + + self.conv1 = ConvModule( + in_channels, + hidden_channels, + kernel_size=1, + stride=1, + groups=1, + bias=False, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.conv2 = ConvModule( + in_channels, + hidden_channels, + kernel_size=1, + stride=1, + groups=1, + bias=False, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.conv3 = ConvModule( + 2 * hidden_channels, + out_channels, + kernel_size=1, + stride=1, + groups=1, + bias=False, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.block = RepStageBlock( + in_channels=hidden_channels, + out_channels=hidden_channels, + num_blocks=num_blocks, + block_cfg=block_cfg, + bottle_block=BottleRep) + self.concat_all_layer = concat_all_layer + if not concat_all_layer: + self.conv3 = ConvModule( + hidden_channels, + out_channels, + kernel_size=1, + stride=1, + groups=1, + bias=False, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + def forward(self, x): + if self.concat_all_layer is True: + return self.conv3( + torch.cat((self.block(self.conv1(x)), self.conv2(x)), dim=1)) + else: + return self.conv3(self.block(self.conv1(x))) + + +class BottleRep(nn.Module): + """Bottle Rep Block. + + Args: + in_channels (int): Number of channels in the input image + out_channels (int): Number of channels produced by the convolution + block_cfg (dict): Config dict for the block used to build each + layer. Defaults to dict(type='RepVGGBlock'). + adaptive_weight (bool): Add adaptive_weight when forward calculate. + Defaults False. 
+ """ + + def __init__(self, + in_channels: int, + out_channels: int, + block_cfg: ConfigType = dict(type='RepVGGBlock'), + adaptive_weight: bool = False): + super().__init__() + conv1_cfg = block_cfg.copy() + conv2_cfg = block_cfg.copy() + + conv1_cfg.update( + dict(in_channels=in_channels, out_channels=out_channels)) + conv2_cfg.update( + dict(in_channels=out_channels, out_channels=out_channels)) + + self.conv1 = MODELS.build(conv1_cfg) + self.conv2 = MODELS.build(conv2_cfg) + + if in_channels != out_channels: + self.shortcut = False + else: + self.shortcut = True + if adaptive_weight: + self.alpha = nn.Parameter(torch.ones(1)) + else: + self.alpha = 1.0 + + def forward(self, x: Tensor) -> Tensor: + outputs = self.conv1(x) + outputs = self.conv2(outputs) + return outputs + self.alpha * x if self.shortcut else outputs + + +@MODELS.register_module() +class ConvWrapper(nn.Module): + """Wrapper for normal Conv with SiLU activation. + + Args: + in_channels (int): Number of channels in the input image + out_channels (int): Number of channels produced by the convolution + kernel_size (int or tuple): Size of the convolving kernel + stride (int or tuple): Stride of the convolution. Default: 1 + groups (int, optional): Number of blocked connections from input + channels to output channels. Default: 1 + bias (bool, optional): Conv bias. Default: True. + norm_cfg (ConfigType): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (ConfigType): Config dict for activation layer. + Defaults to dict(type='ReLU', inplace=True). + """ + + def __init__(self, + in_channels: int, + out_channels: int, + kernel_size: int = 3, + stride: int = 1, + groups: int = 1, + bias: bool = True, + norm_cfg: ConfigType = None, + act_cfg: ConfigType = dict(type='SiLU')): + super().__init__() + self.block = ConvModule( + in_channels, + out_channels, + kernel_size, + stride, + padding=kernel_size // 2, + groups=groups, + bias=bias, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + def forward(self, x: Tensor) -> Tensor: + return self.block(x) + + +@MODELS.register_module() +class EffectiveSELayer(nn.Module): + """Effective Squeeze-Excitation. + + From `CenterMask : Real-Time Anchor-Free Instance Segmentation` + arxiv (https://arxiv.org/abs/1911.06667) + This code referenced to + https://github.com/youngwanLEE/CenterMask/blob/72147e8aae673fcaf4103ee90a6a6b73863e7fa1/maskrcnn_benchmark/modeling/backbone/vovnet.py#L108-L121 # noqa + + Args: + channels (int): The input and output channels of this Module. + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='HSigmoid'). + """ + + def __init__(self, + channels: int, + act_cfg: ConfigType = dict(type='HSigmoid')): + super().__init__() + assert isinstance(act_cfg, dict) + self.fc = ConvModule(channels, channels, 1, act_cfg=None) + + act_cfg_ = act_cfg.copy() # type: ignore + self.activate = MODELS.build(act_cfg_) + + def forward(self, x: Tensor) -> Tensor: + """Forward process + Args: + x (Tensor): The input tensor. + """ + x_se = x.mean((2, 3), keepdim=True) + x_se = self.fc(x_se) + return x * self.activate(x_se) + + +class PPYOLOESELayer(nn.Module): + """Squeeze-and-Excitation Attention Module for PPYOLOE. + There are some differences between the current implementation and + SELayer in mmdet: + 1. For fast speed and avoiding double inference in ppyoloe, + use `F.adaptive_avg_pool2d` before PPYOLOESELayer. + 2. Special ways to init weights. + 3. Different convolution order. 
+ + Args: + feat_channels (int): The input (and output) channels of the SE layer. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.1, eps=1e-5). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + """ + + def __init__(self, + feat_channels: int, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.1, eps=1e-5), + act_cfg: ConfigType = dict(type='SiLU', inplace=True)): + super().__init__() + self.fc = nn.Conv2d(feat_channels, feat_channels, 1) + self.sig = nn.Sigmoid() + self.conv = ConvModule( + feat_channels, + feat_channels, + 1, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self._init_weights() + + def _init_weights(self): + """Init weights.""" + nn.init.normal_(self.fc.weight, mean=0, std=0.001) + + def forward(self, feat: Tensor, avg_feat: Tensor) -> Tensor: + """Forward process + Args: + feat (Tensor): The input tensor. + avg_feat (Tensor): Average pooling feature tensor. + """ + weight = self.sig(self.fc(avg_feat)) + return self.conv(feat * weight) + + +@MODELS.register_module() +class ELANBlock(BaseModule): + """Efficient layer aggregation networks for YOLOv7. + + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The out channels of this Module. + middle_ratio (float): The scaling ratio of the middle layer + based on the in_channels. + block_ratio (float): The scaling ratio of the block layer + based on the in_channels. + num_blocks (int): The number of blocks in the main branch. + Defaults to 2. + num_convs_in_block (int): The number of convs pre block. + Defaults to 1. + conv_cfg (dict): Config dict for convolution layer. Defaults to None. + which means using conv2d. Defaults to None. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. 
+ """ + + def __init__(self, + in_channels: int, + out_channels: int, + middle_ratio: float, + block_ratio: float, + num_blocks: int = 2, + num_convs_in_block: int = 1, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None): + super().__init__(init_cfg=init_cfg) + assert num_blocks >= 1 + assert num_convs_in_block >= 1 + + middle_channels = int(in_channels * middle_ratio) + block_channels = int(in_channels * block_ratio) + final_conv_in_channels = int( + num_blocks * block_channels) + 2 * middle_channels + + self.main_conv = ConvModule( + in_channels, + middle_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self.short_conv = ConvModule( + in_channels, + middle_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self.blocks = nn.ModuleList() + for _ in range(num_blocks): + if num_convs_in_block == 1: + internal_block = ConvModule( + middle_channels, + block_channels, + 3, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + else: + internal_block = [] + for _ in range(num_convs_in_block): + internal_block.append( + ConvModule( + middle_channels, + block_channels, + 3, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + middle_channels = block_channels + internal_block = nn.Sequential(*internal_block) + + middle_channels = block_channels + self.blocks.append(internal_block) + + self.final_conv = ConvModule( + final_conv_in_channels, + out_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + def forward(self, x: Tensor) -> Tensor: + """Forward process + Args: + x (Tensor): The input tensor. + """ + x_short = self.short_conv(x) + x_main = self.main_conv(x) + block_outs = [] + x_block = x_main + for block in self.blocks: + x_block = block(x_block) + block_outs.append(x_block) + x_final = torch.cat((*block_outs[::-1], x_main, x_short), dim=1) + return self.final_conv(x_final) + + +@MODELS.register_module() +class EELANBlock(BaseModule): + """Expand efficient layer aggregation networks for YOLOv7. + + Args: + num_elan_block (int): The number of ELANBlock. + """ + + def __init__(self, num_elan_block: int, **kwargs): + super().__init__() + assert num_elan_block >= 1 + self.e_elan_blocks = nn.ModuleList() + for _ in range(num_elan_block): + self.e_elan_blocks.append(ELANBlock(**kwargs)) + + def forward(self, x: Tensor) -> Tensor: + outs = [] + for elan_blocks in self.e_elan_blocks: + outs.append(elan_blocks(x)) + return sum(outs) + + +class MaxPoolAndStrideConvBlock(BaseModule): + """Max pooling and stride conv layer for YOLOv7. + + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The out channels of this Module. + maxpool_kernel_sizes (int): kernel sizes of pooling layers. + Defaults to 2. + use_in_channels_of_middle (bool): Whether to calculate middle channels + based on in_channels. Defaults to False. + conv_cfg (dict): Config dict for convolution layer. Defaults to None. + which means using conv2d. Defaults to None. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. 
+ """ + + def __init__(self, + in_channels: int, + out_channels: int, + maxpool_kernel_sizes: int = 2, + use_in_channels_of_middle: bool = False, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None): + super().__init__(init_cfg=init_cfg) + + middle_channels = in_channels if use_in_channels_of_middle \ + else out_channels // 2 + + self.maxpool_branches = nn.Sequential( + MaxPool2d( + kernel_size=maxpool_kernel_sizes, stride=maxpool_kernel_sizes), + ConvModule( + in_channels, + out_channels // 2, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + self.stride_conv_branches = nn.Sequential( + ConvModule( + in_channels, + middle_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg), + ConvModule( + middle_channels, + out_channels // 2, + 3, + stride=2, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + def forward(self, x: Tensor) -> Tensor: + """Forward process + Args: + x (Tensor): The input tensor. + """ + maxpool_out = self.maxpool_branches(x) + stride_conv_out = self.stride_conv_branches(x) + return torch.cat([stride_conv_out, maxpool_out], dim=1) + + +@MODELS.register_module() +class TinyDownSampleBlock(BaseModule): + """Down sample layer for YOLOv7-tiny. + + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The out channels of this Module. + middle_ratio (float): The scaling ratio of the middle layer + based on the in_channels. Defaults to 1.0. + kernel_sizes (int, tuple[int]): Sequential or number of kernel + sizes of pooling layers. Defaults to 3. + conv_cfg (dict): Config dict for convolution layer. Defaults to None. + which means using conv2d. Defaults to None. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='LeakyReLU', negative_slope=0.1). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. 
+ """ + + def __init__( + self, + in_channels: int, + out_channels: int, + middle_ratio: float = 1.0, + kernel_sizes: Union[int, Sequence[int]] = 3, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict(type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='LeakyReLU', negative_slope=0.1), + init_cfg: OptMultiConfig = None): + super().__init__(init_cfg) + + middle_channels = int(in_channels * middle_ratio) + + self.short_conv = ConvModule( + in_channels, + middle_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self.main_convs = nn.ModuleList() + for i in range(3): + if i == 0: + self.main_convs.append( + ConvModule( + in_channels, + middle_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + else: + self.main_convs.append( + ConvModule( + middle_channels, + middle_channels, + kernel_sizes, + padding=(kernel_sizes - 1) // 2, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + self.final_conv = ConvModule( + middle_channels * 4, + out_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + def forward(self, x) -> Tensor: + short_out = self.short_conv(x) + + main_outs = [] + for main_conv in self.main_convs: + main_out = main_conv(x) + main_outs.append(main_out) + x = main_out + + return self.final_conv(torch.cat([*main_outs[::-1], short_out], dim=1)) + + +@MODELS.register_module() +class SPPFCSPBlock(BaseModule): + """Spatial pyramid pooling - Fast (SPPF) layer with CSP for + YOLOv7 + + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The output channels of this Module. + expand_ratio (float): Expand ratio of SPPCSPBlock. + Defaults to 0.5. + kernel_sizes (int, tuple[int]): Sequential or number of kernel + sizes of pooling layers. Defaults to 5. + is_tiny_version (bool): Is tiny version of SPPFCSPBlock. If True, + it means it is a yolov7 tiny model. Defaults to False. + conv_cfg (dict): Config dict for convolution layer. Defaults to None. + which means using conv2d. Defaults to None. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. 
+ """ + + def __init__(self, + in_channels: int, + out_channels: int, + expand_ratio: float = 0.5, + kernel_sizes: Union[int, Sequence[int]] = 5, + is_tiny_version: bool = False, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None): + super().__init__(init_cfg=init_cfg) + self.is_tiny_version = is_tiny_version + + mid_channels = int(2 * out_channels * expand_ratio) + + if is_tiny_version: + self.main_layers = ConvModule( + in_channels, + mid_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + else: + self.main_layers = nn.Sequential( + ConvModule( + in_channels, + mid_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg), + ConvModule( + mid_channels, + mid_channels, + 3, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg), + ConvModule( + mid_channels, + mid_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg), + ) + + self.kernel_sizes = kernel_sizes + if isinstance(kernel_sizes, int): + self.poolings = nn.MaxPool2d( + kernel_size=kernel_sizes, stride=1, padding=kernel_sizes // 2) + else: + self.poolings = nn.ModuleList([ + nn.MaxPool2d(kernel_size=ks, stride=1, padding=ks // 2) + for ks in kernel_sizes + ]) + + if is_tiny_version: + self.fuse_layers = ConvModule( + 4 * mid_channels, + mid_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + else: + self.fuse_layers = nn.Sequential( + ConvModule( + 4 * mid_channels, + mid_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg), + ConvModule( + mid_channels, + mid_channels, + 3, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + self.short_layer = ConvModule( + in_channels, + mid_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self.final_conv = ConvModule( + 2 * mid_channels, + out_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + def forward(self, x) -> Tensor: + """Forward process + Args: + x (Tensor): The input tensor. + """ + x1 = self.main_layers(x) + if isinstance(self.kernel_sizes, int): + y1 = self.poolings(x1) + y2 = self.poolings(y1) + concat_list = [x1] + [y1, y2, self.poolings(y2)] + if self.is_tiny_version: + x1 = self.fuse_layers(torch.cat(concat_list[::-1], 1)) + else: + x1 = self.fuse_layers(torch.cat(concat_list, 1)) + else: + concat_list = [x1] + [m(x1) for m in self.poolings] + if self.is_tiny_version: + x1 = self.fuse_layers(torch.cat(concat_list[::-1], 1)) + else: + x1 = self.fuse_layers(torch.cat(concat_list, 1)) + + x2 = self.short_layer(x) + return self.final_conv(torch.cat((x1, x2), dim=1)) + + +class ImplicitA(nn.Module): + """Implicit add layer in YOLOv7. + + Args: + in_channels (int): The input channels of this Module. + mean (float): Mean value of implicit module. Defaults to 0. + std (float): Std value of implicit module. Defaults to 0.02 + """ + + def __init__(self, in_channels: int, mean: float = 0., std: float = .02): + super().__init__() + self.implicit = nn.Parameter(torch.zeros(1, in_channels, 1, 1)) + nn.init.normal_(self.implicit, mean=mean, std=std) + + def forward(self, x): + """Forward process + Args: + x (Tensor): The input tensor. + """ + return self.implicit + x + + +class ImplicitM(nn.Module): + """Implicit multiplier layer in YOLOv7. + + Args: + in_channels (int): The input channels of this Module. 
+ mean (float): Mean value of implicit module. Defaults to 1. + std (float): Std value of implicit module. Defaults to 0.02. + """ + + def __init__(self, in_channels: int, mean: float = 1., std: float = .02): + super().__init__() + self.implicit = nn.Parameter(torch.ones(1, in_channels, 1, 1)) + nn.init.normal_(self.implicit, mean=mean, std=std) + + def forward(self, x): + """Forward process + Args: + x (Tensor): The input tensor. + """ + return self.implicit * x + + +@MODELS.register_module() +class PPYOLOEBasicBlock(nn.Module): + """PPYOLOE Backbone BasicBlock. + + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The output channels of this Module. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.1, eps=1e-5). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + shortcut (bool): Whether to add inputs and outputs together + at the end of this layer. Defaults to True. + use_alpha (bool): Whether to use `alpha` parameter at 1x1 conv. + """ + + def __init__(self, + in_channels: int, + out_channels: int, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.1, eps=1e-5), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + shortcut: bool = True, + use_alpha: bool = False): + super().__init__() + assert act_cfg is None or isinstance(act_cfg, dict) + self.conv1 = ConvModule( + in_channels, + out_channels, + 3, + stride=1, + padding=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self.conv2 = RepVGGBlock( + out_channels, + out_channels, + use_alpha=use_alpha, + act_cfg=act_cfg, + norm_cfg=norm_cfg, + use_bn_first=False) + self.shortcut = shortcut + + def forward(self, x: Tensor) -> Tensor: + """Forward process. + Args: + inputs (Tensor): The input tensor. + + Returns: + Tensor: The output tensor. + """ + y = self.conv1(x) + y = self.conv2(y) + if self.shortcut: + return x + y + else: + return y + + +class CSPResLayer(nn.Module): + """PPYOLOE Backbone Stage. + + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The output channels of this Module. + num_block (int): Number of blocks in this stage. + block_cfg (dict): Config dict for block. Default config is + suitable for PPYOLOE+ backbone. And in PPYOLOE neck, + block_cfg is set to dict(type='PPYOLOEBasicBlock', + shortcut=False, use_alpha=False). Defaults to + dict(type='PPYOLOEBasicBlock', shortcut=True, use_alpha=True). + stride (int): Stride of the convolution. In backbone, the stride + must be set to 2. In neck, the stride must be set to 1. + Defaults to 1. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.1, eps=1e-5). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + attention_cfg (dict, optional): Config dict for `EffectiveSELayer`. + Defaults to dict(type='EffectiveSELayer', + act_cfg=dict(type='HSigmoid')). + use_spp (bool): Whether to use `SPPFBottleneck` layer. + Defaults to False. 
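For reference on the ImplicitA/ImplicitM layers above: each is just a broadcastable learnable tensor, one added to and one multiplied with the feature map; in the YOLOv7 head they are usually wrapped around the prediction conv, roughly im(conv(ia(x))). A minimal sketch (the 16-channel sizes are arbitrary):

import torch
import torch.nn as nn

ia = nn.Parameter(torch.zeros(1, 16, 1, 1))  # additive implicit knowledge, initialized near 0
im = nn.Parameter(torch.ones(1, 16, 1, 1))   # multiplicative implicit knowledge, initialized near 1
conv = nn.Conv2d(16, 16, 1)

x = torch.randn(2, 16, 8, 8)
y = im * conv(x + ia)
print(y.shape)  # torch.Size([2, 16, 8, 8])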
+ """ + + def __init__(self, + in_channels: int, + out_channels: int, + num_block: int, + block_cfg: ConfigType = dict( + type='PPYOLOEBasicBlock', shortcut=True, use_alpha=True), + stride: int = 1, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.1, eps=1e-5), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + attention_cfg: OptMultiConfig = dict( + type='EffectiveSELayer', act_cfg=dict(type='HSigmoid')), + use_spp: bool = False): + super().__init__() + + self.num_block = num_block + self.block_cfg = block_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.use_spp = use_spp + assert attention_cfg is None or isinstance(attention_cfg, dict) + + if stride == 2: + conv1_in_channels = conv2_in_channels = conv3_in_channels = ( + in_channels + out_channels) // 2 + blocks_channels = conv1_in_channels // 2 + self.conv_down = ConvModule( + in_channels, + conv1_in_channels, + 3, + stride=2, + padding=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + else: + conv1_in_channels = conv2_in_channels = in_channels + conv3_in_channels = out_channels + blocks_channels = out_channels // 2 + self.conv_down = None + + self.conv1 = ConvModule( + conv1_in_channels, + blocks_channels, + 1, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self.conv2 = ConvModule( + conv2_in_channels, + blocks_channels, + 1, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self.blocks = self.build_blocks_layer(blocks_channels) + + self.conv3 = ConvModule( + conv3_in_channels, + out_channels, + 1, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + if attention_cfg: + attention_cfg = attention_cfg.copy() + attention_cfg['channels'] = blocks_channels * 2 + self.attn = MODELS.build(attention_cfg) + else: + self.attn = None + + def build_blocks_layer(self, blocks_channels: int) -> nn.Module: + """Build blocks layer. + + Args: + blocks_channels: The channels of this Module. + """ + blocks = nn.Sequential() + block_cfg = self.block_cfg.copy() + block_cfg.update( + dict(in_channels=blocks_channels, out_channels=blocks_channels)) + block_cfg.setdefault('norm_cfg', self.norm_cfg) + block_cfg.setdefault('act_cfg', self.act_cfg) + + for i in range(self.num_block): + blocks.add_module(str(i), MODELS.build(block_cfg)) + + if i == (self.num_block - 1) // 2 and self.use_spp: + blocks.add_module( + 'spp', + SPPFBottleneck( + blocks_channels, + blocks_channels, + kernel_sizes=[5, 9, 13], + use_conv_first=False, + conv_cfg=None, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + + return blocks + + def forward(self, x: Tensor) -> Tensor: + """Forward process + Args: + x (Tensor): The input tensor. + """ + if self.conv_down is not None: + x = self.conv_down(x) + y1 = self.conv1(x) + y2 = self.blocks(self.conv2(x)) + y = torch.cat([y1, y2], axis=1) + if self.attn is not None: + y = self.attn(y) + y = self.conv3(y) + return y + + +@MODELS.register_module() +class RepStageBlock(nn.Module): + """RepStageBlock is a stage block with rep-style basic block. + + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The output channels of this Module. + num_blocks (int, tuple[int]): Number of blocks. Defaults to 1. + bottle_block (nn.Module): Basic unit of RepStage. + Defaults to RepVGGBlock. + block_cfg (ConfigType): Config of RepStage. + Defaults to 'RepVGGBlock'. 
+ """ + + def __init__(self, + in_channels: int, + out_channels: int, + num_blocks: int = 1, + bottle_block: nn.Module = RepVGGBlock, + block_cfg: ConfigType = dict(type='RepVGGBlock')): + super().__init__() + block_cfg = block_cfg.copy() + + block_cfg.update( + dict(in_channels=in_channels, out_channels=out_channels)) + + self.conv1 = MODELS.build(block_cfg) + + block_cfg.update( + dict(in_channels=out_channels, out_channels=out_channels)) + + self.block = None + if num_blocks > 1: + self.block = nn.Sequential(*(MODELS.build(block_cfg) + for _ in range(num_blocks - 1))) + + if bottle_block == BottleRep: + self.conv1 = BottleRep( + in_channels, + out_channels, + block_cfg=block_cfg, + adaptive_weight=True) + num_blocks = num_blocks // 2 + self.block = None + if num_blocks > 1: + self.block = nn.Sequential(*(BottleRep( + out_channels, + out_channels, + block_cfg=block_cfg, + adaptive_weight=True) for _ in range(num_blocks - 1))) + + def forward(self, x: Tensor) -> Tensor: + """Forward process. + + Args: + x (Tensor): The input tensor. + + Returns: + Tensor: The output tensor. + """ + x = self.conv1(x) + if self.block is not None: + x = self.block(x) + return x + + +class DarknetBottleneck(MMDET_DarknetBottleneck): + """The basic bottleneck block used in Darknet. + + Each ResBlock consists of two ConvModules and the input is added to the + final output. Each ConvModule is composed of Conv, BN, and LeakyReLU. + The first convLayer has filter size of k1Xk1 and the second one has the + filter size of k2Xk2. + + Note: + This DarknetBottleneck is little different from MMDet's, we can + change the kernel size and padding for each conv. + + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The output channels of this Module. + expansion (float): The kernel size for hidden channel. + Defaults to 0.5. + kernel_size (Sequence[int]): The kernel size of the convolution. + Defaults to (1, 3). + padding (Sequence[int]): The padding size of the convolution. + Defaults to (0, 1). + add_identity (bool): Whether to add identity to the out. + Defaults to True + use_depthwise (bool): Whether to use depthwise separable convolution. + Defaults to False + conv_cfg (dict): Config dict for convolution layer. Default: None, + which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='Swish'). 
+ """ + + def __init__(self, + in_channels: int, + out_channels: int, + expansion: float = 0.5, + kernel_size: Sequence[int] = (1, 3), + padding: Sequence[int] = (0, 1), + add_identity: bool = True, + use_depthwise: bool = False, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None) -> None: + super().__init__(in_channels, out_channels, init_cfg=init_cfg) + hidden_channels = int(out_channels * expansion) + conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule + assert isinstance(kernel_size, Sequence) and len(kernel_size) == 2 + + self.conv1 = ConvModule( + in_channels, + hidden_channels, + kernel_size[0], + padding=padding[0], + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.conv2 = conv( + hidden_channels, + out_channels, + kernel_size[1], + stride=1, + padding=padding[1], + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.add_identity = \ + add_identity and in_channels == out_channels + + +class CSPLayerWithTwoConv(BaseModule): + """Cross Stage Partial Layer with 2 convolutions. + + Args: + in_channels (int): The input channels of the CSP layer. + out_channels (int): The output channels of the CSP layer. + expand_ratio (float): Ratio to adjust the number of channels of the + hidden layer. Defaults to 0.5. + num_blocks (int): Number of blocks. Defaults to 1 + add_identity (bool): Whether to add identity in blocks. + Defaults to True. + conv_cfg (dict, optional): Config dict for convolution layer. + Defaults to None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + init_cfg (:obj:`ConfigDict` or dict or list[dict] or + list[:obj:`ConfigDict`], optional): Initialization config dict. + Defaults to None. 
+ """ + + def __init__( + self, + in_channels: int, + out_channels: int, + expand_ratio: float = 0.5, + num_blocks: int = 1, + add_identity: bool = True, # shortcut + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict(type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None) -> None: + super().__init__(init_cfg=init_cfg) + + self.mid_channels = int(out_channels * expand_ratio) + self.main_conv = ConvModule( + in_channels, + 2 * self.mid_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.final_conv = ConvModule( + (2 + num_blocks) * self.mid_channels, + out_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self.blocks = nn.ModuleList( + DarknetBottleneck( + self.mid_channels, + self.mid_channels, + expansion=1, + kernel_size=(3, 3), + padding=(1, 1), + add_identity=add_identity, + use_depthwise=False, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) for _ in range(num_blocks)) + + def forward(self, x: Tensor) -> Tensor: + """Forward process.""" + x_main = self.main_conv(x) + x_main = list(x_main.split((self.mid_channels, self.mid_channels), 1)) + x_main.extend(blocks(x_main[-1]) for blocks in self.blocks) + return self.final_conv(torch.cat(x_main, 1)) diff --git a/mmyolo/models/losses/__init__.py b/mmyolo/models/losses/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ee192921b0c4722a5334be10c192dfadcbe68f08 --- /dev/null +++ b/mmyolo/models/losses/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .iou_loss import IoULoss, bbox_overlaps + +__all__ = ['IoULoss', 'bbox_overlaps'] diff --git a/mmyolo/models/losses/iou_loss.py b/mmyolo/models/losses/iou_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..e3d3dc40ef3e678989db85ee8cfd0035a26a9f19 --- /dev/null +++ b/mmyolo/models/losses/iou_loss.py @@ -0,0 +1,232 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from typing import Optional, Tuple, Union + +import torch +import torch.nn as nn +from mmdet.models.losses.utils import weight_reduce_loss +from mmdet.structures.bbox import HorizontalBoxes + +from mmyolo.registry import MODELS + + +def bbox_overlaps(pred: torch.Tensor, + target: torch.Tensor, + iou_mode: str = 'ciou', + bbox_format: str = 'xywh', + siou_theta: float = 4.0, + eps: float = 1e-7) -> torch.Tensor: + r"""Calculate overlap between two set of bboxes. + `Implementation of paper `Enhancing Geometric Factors into + Model Learning and Inference for Object Detection and Instance + Segmentation `_. + + In the CIoU implementation of YOLOv5 and MMDetection, there is a slight + difference in the way the alpha parameter is computed. + + mmdet version: + alpha = (ious > 0.5).float() * v / (1 - ious + v) + YOLOv5 version: + alpha = v / (v - ious + (1 + eps) + + Args: + pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2) + or (x, y, w, h),shape (n, 4). + target (Tensor): Corresponding gt bboxes, shape (n, 4). + iou_mode (str): Options are ('iou', 'ciou', 'giou', 'siou'). + Defaults to "ciou". + bbox_format (str): Options are "xywh" and "xyxy". + Defaults to "xywh". + siou_theta (float): siou_theta for SIoU when calculate shape cost. + Defaults to 4.0. + eps (float): Eps to avoid log(0). + + Returns: + Tensor: shape (n, ). 
+ """ + assert iou_mode in ('iou', 'ciou', 'giou', 'siou') + assert bbox_format in ('xyxy', 'xywh') + if bbox_format == 'xywh': + pred = HorizontalBoxes.cxcywh_to_xyxy(pred) + target = HorizontalBoxes.cxcywh_to_xyxy(target) + + bbox1_x1, bbox1_y1 = pred[..., 0], pred[..., 1] + bbox1_x2, bbox1_y2 = pred[..., 2], pred[..., 3] + bbox2_x1, bbox2_y1 = target[..., 0], target[..., 1] + bbox2_x2, bbox2_y2 = target[..., 2], target[..., 3] + + # Overlap + overlap = (torch.min(bbox1_x2, bbox2_x2) - + torch.max(bbox1_x1, bbox2_x1)).clamp(0) * \ + (torch.min(bbox1_y2, bbox2_y2) - + torch.max(bbox1_y1, bbox2_y1)).clamp(0) + + # Union + w1, h1 = bbox1_x2 - bbox1_x1, bbox1_y2 - bbox1_y1 + w2, h2 = bbox2_x2 - bbox2_x1, bbox2_y2 - bbox2_y1 + union = (w1 * h1) + (w2 * h2) - overlap + eps + + h1 = bbox1_y2 - bbox1_y1 + eps + h2 = bbox2_y2 - bbox2_y1 + eps + + # IoU + ious = overlap / union + + # enclose area + enclose_x1y1 = torch.min(pred[..., :2], target[..., :2]) + enclose_x2y2 = torch.max(pred[..., 2:], target[..., 2:]) + enclose_wh = (enclose_x2y2 - enclose_x1y1).clamp(min=0) + + enclose_w = enclose_wh[..., 0] # cw + enclose_h = enclose_wh[..., 1] # ch + + if iou_mode == 'ciou': + # CIoU = IoU - ( (ρ^2(b_pred,b_gt) / c^2) + (alpha x v) ) + + # calculate enclose area (c^2) + enclose_area = enclose_w**2 + enclose_h**2 + eps + + # calculate ρ^2(b_pred,b_gt): + # euclidean distance between b_pred(bbox2) and b_gt(bbox1) + # center point, because bbox format is xyxy -> left-top xy and + # right-bottom xy, so need to / 4 to get center point. + rho2_left_item = ((bbox2_x1 + bbox2_x2) - (bbox1_x1 + bbox1_x2))**2 / 4 + rho2_right_item = ((bbox2_y1 + bbox2_y2) - + (bbox1_y1 + bbox1_y2))**2 / 4 + rho2 = rho2_left_item + rho2_right_item # rho^2 (ρ^2) + + # Width and height ratio (v) + wh_ratio = (4 / (math.pi**2)) * torch.pow( + torch.atan(w2 / h2) - torch.atan(w1 / h1), 2) + + with torch.no_grad(): + alpha = wh_ratio / (wh_ratio - ious + (1 + eps)) + + # CIoU + ious = ious - ((rho2 / enclose_area) + (alpha * wh_ratio)) + + elif iou_mode == 'giou': + # GIoU = IoU - ( (A_c - union) / A_c ) + convex_area = enclose_w * enclose_h + eps # convex area (A_c) + ious = ious - (convex_area - union) / convex_area + + elif iou_mode == 'siou': + # SIoU: https://arxiv.org/pdf/2205.12740.pdf + # SIoU = IoU - ( (Distance Cost + Shape Cost) / 2 ) + + # calculate sigma (σ): + # euclidean distance between bbox2(pred) and bbox1(gt) center point, + # sigma_cw = b_cx_gt - b_cx + sigma_cw = (bbox2_x1 + bbox2_x2) / 2 - (bbox1_x1 + bbox1_x2) / 2 + eps + # sigma_ch = b_cy_gt - b_cy + sigma_ch = (bbox2_y1 + bbox2_y2) / 2 - (bbox1_y1 + bbox1_y2) / 2 + eps + # sigma = √( (sigma_cw ** 2) - (sigma_ch ** 2) ) + sigma = torch.pow(sigma_cw**2 + sigma_ch**2, 0.5) + + # choose minimize alpha, sin(alpha) + sin_alpha = torch.abs(sigma_ch) / sigma + sin_beta = torch.abs(sigma_cw) / sigma + sin_alpha = torch.where(sin_alpha <= math.sin(math.pi / 4), sin_alpha, + sin_beta) + + # Angle cost = 1 - 2 * ( sin^2 ( arcsin(x) - (pi / 4) ) ) + angle_cost = torch.cos(torch.arcsin(sin_alpha) * 2 - math.pi / 2) + + # Distance cost = Σ_(t=x,y) (1 - e ^ (- γ ρ_t)) + rho_x = (sigma_cw / enclose_w)**2 # ρ_x + rho_y = (sigma_ch / enclose_h)**2 # ρ_y + gamma = 2 - angle_cost # γ + distance_cost = (1 - torch.exp(-1 * gamma * rho_x)) + ( + 1 - torch.exp(-1 * gamma * rho_y)) + + # Shape cost = Ω = Σ_(t=w,h) ( ( 1 - ( e ^ (-ω_t) ) ) ^ θ ) + omiga_w = torch.abs(w1 - w2) / torch.max(w1, w2) # ω_w + omiga_h = torch.abs(h1 - h2) / torch.max(h1, h2) # ω_h + shape_cost = torch.pow(1 
- torch.exp(-1 * omiga_w), + siou_theta) + torch.pow( + 1 - torch.exp(-1 * omiga_h), siou_theta) + + ious = ious - ((distance_cost + shape_cost) * 0.5) + + return ious.clamp(min=-1.0, max=1.0) + + +@MODELS.register_module() +class IoULoss(nn.Module): + """IoULoss. + + Computing the IoU loss between a set of predicted bboxes and target bboxes. + Args: + iou_mode (str): Options are "ciou". + Defaults to "ciou". + bbox_format (str): Options are "xywh" and "xyxy". + Defaults to "xywh". + eps (float): Eps to avoid log(0). + reduction (str): Options are "none", "mean" and "sum". + loss_weight (float): Weight of loss. + return_iou (bool): If True, return loss and iou. + """ + + def __init__(self, + iou_mode: str = 'ciou', + bbox_format: str = 'xywh', + eps: float = 1e-7, + reduction: str = 'mean', + loss_weight: float = 1.0, + return_iou: bool = True): + super().__init__() + assert bbox_format in ('xywh', 'xyxy') + assert iou_mode in ('ciou', 'siou', 'giou') + self.iou_mode = iou_mode + self.bbox_format = bbox_format + self.eps = eps + self.reduction = reduction + self.loss_weight = loss_weight + self.return_iou = return_iou + + def forward( + self, + pred: torch.Tensor, + target: torch.Tensor, + weight: Optional[torch.Tensor] = None, + avg_factor: Optional[float] = None, + reduction_override: Optional[Union[str, bool]] = None + ) -> Tuple[Union[torch.Tensor, torch.Tensor], torch.Tensor]: + """Forward function. + + Args: + pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2) + or (x, y, w, h),shape (n, 4). + target (Tensor): Corresponding gt bboxes, shape (n, 4). + weight (Tensor, optional): Element-wise weights. + avg_factor (float, optional): Average factor when computing the + mean of losses. + reduction_override (str, bool, optional): Same as built-in losses + of PyTorch. Defaults to None. + Returns: + loss or tuple(loss, iou): + """ + if weight is not None and not torch.any(weight > 0): + if pred.dim() == weight.dim() + 1: + weight = weight.unsqueeze(1) + return (pred * weight).sum() # 0 + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + + if weight is not None and weight.dim() > 1: + weight = weight.mean(-1) + + iou = bbox_overlaps( + pred, + target, + iou_mode=self.iou_mode, + bbox_format=self.bbox_format, + eps=self.eps) + loss = self.loss_weight * weight_reduce_loss(1.0 - iou, weight, + reduction, avg_factor) + + if self.return_iou: + return loss, iou + else: + return loss diff --git a/mmyolo/models/necks/__init__.py b/mmyolo/models/necks/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6da9641cee490a1708921bf93f4a07f95f2d6b7c --- /dev/null +++ b/mmyolo/models/necks/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
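A hedged usage sketch for the IoULoss module defined in iou_loss.py above (box values assumed; with the default return_iou=True the call returns a (loss, iou) tuple):

import torch
from mmyolo.models.losses import IoULoss

loss_fn = IoULoss(iou_mode='ciou', bbox_format='xyxy', reduction='mean')
pred = torch.tensor([[10., 10., 20., 20.]])
target = torch.tensor([[12., 12., 22., 22.]])
loss, iou = loss_fn(pred, target)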
+from .base_yolo_neck import BaseYOLONeck +from .cspnext_pafpn import CSPNeXtPAFPN +from .ppyoloe_csppan import PPYOLOECSPPAFPN +from .yolov5_pafpn import YOLOv5PAFPN +from .yolov6_pafpn import YOLOv6CSPRepPAFPN, YOLOv6RepPAFPN +from .yolov7_pafpn import YOLOv7PAFPN +from .yolov8_pafpn import YOLOv8PAFPN +from .yolox_pafpn import YOLOXPAFPN + +__all__ = [ + 'YOLOv5PAFPN', 'BaseYOLONeck', 'YOLOv6RepPAFPN', 'YOLOXPAFPN', + 'CSPNeXtPAFPN', 'YOLOv7PAFPN', 'PPYOLOECSPPAFPN', 'YOLOv6CSPRepPAFPN', + 'YOLOv8PAFPN' +] diff --git a/mmyolo/models/necks/base_yolo_neck.py b/mmyolo/models/necks/base_yolo_neck.py new file mode 100644 index 0000000000000000000000000000000000000000..54fddf79ce90bb0c023f9c445aced62551552174 --- /dev/null +++ b/mmyolo/models/necks/base_yolo_neck.py @@ -0,0 +1,261 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta, abstractmethod +from typing import List, Union + +import torch +import torch.nn as nn +from mmdet.utils import ConfigType, OptMultiConfig +from mmengine.model import BaseModule +from torch.nn.modules.batchnorm import _BatchNorm + +from mmyolo.registry import MODELS + + +@MODELS.register_module() +class BaseYOLONeck(BaseModule, metaclass=ABCMeta): + """Base neck used in YOLO series. + + .. code:: text + + P5 neck model structure diagram + +--------+ +-------+ + |top_down|----------+--------->| out |---> output0 + | layer1 | | | layer0| + +--------+ | +-------+ + stride=8 ^ | + idx=0 +------+ +--------+ | + -----> |reduce|--->| cat | | + |layer0| +--------+ | + +------+ ^ v + +--------+ +-----------+ + |upsample| |downsample | + | layer1 | | layer0 | + +--------+ +-----------+ + ^ | + +--------+ v + |top_down| +-----------+ + | layer2 |--->| cat | + +--------+ +-----------+ + stride=16 ^ v + idx=1 +------+ +--------+ +-----------+ +-------+ + -----> |reduce|--->| cat | | bottom_up |--->| out |---> output1 + |layer1| +--------+ | layer0 | | layer1| + +------+ ^ +-----------+ +-------+ + | v + +--------+ +-----------+ + |upsample| |downsample | + | layer2 | | layer1 | + stride=32 +--------+ +-----------+ + idx=2 +------+ ^ v + -----> |reduce| | +-----------+ + |layer2|---------+------->| cat | + +------+ +-----------+ + v + +-----------+ +-------+ + | bottom_up |--->| out |---> output2 + | layer1 | | layer2| + +-----------+ +-------+ + + .. 
code:: text + + P6 neck model structure diagram + +--------+ +-------+ + |top_down|----------+--------->| out |---> output0 + | layer1 | | | layer0| + +--------+ | +-------+ + stride=8 ^ | + idx=0 +------+ +--------+ | + -----> |reduce|--->| cat | | + |layer0| +--------+ | + +------+ ^ v + +--------+ +-----------+ + |upsample| |downsample | + | layer1 | | layer0 | + +--------+ +-----------+ + ^ | + +--------+ v + |top_down| +-----------+ + | layer2 |--->| cat | + +--------+ +-----------+ + stride=16 ^ v + idx=1 +------+ +--------+ +-----------+ +-------+ + -----> |reduce|--->| cat | | bottom_up |--->| out |---> output1 + |layer1| +--------+ | layer0 | | layer1| + +------+ ^ +-----------+ +-------+ + | v + +--------+ +-----------+ + |upsample| |downsample | + | layer2 | | layer1 | + +--------+ +-----------+ + ^ | + +--------+ v + |top_down| +-----------+ + | layer3 |--->| cat | + +--------+ +-----------+ + stride=32 ^ v + idx=2 +------+ +--------+ +-----------+ +-------+ + -----> |reduce|--->| cat | | bottom_up |--->| out |---> output2 + |layer2| +--------+ | layer1 | | layer2| + +------+ ^ +-----------+ +-------+ + | v + +--------+ +-----------+ + |upsample| |downsample | + | layer3 | | layer2 | + +--------+ +-----------+ + stride=64 ^ v + idx=3 +------+ | +-----------+ + -----> |reduce|---------+------->| cat | + |layer3| +-----------+ + +------+ v + +-----------+ +-------+ + | bottom_up |--->| out |---> output3 + | layer2 | | layer3| + +-----------+ +-------+ + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale) + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + upsample_feats_cat_first (bool): Whether the output features are + concat first after upsampling in the topdown module. + Defaults to True. Currently only YOLOv7 is false. + freeze_all(bool): Whether to freeze the model. Defaults to False + norm_cfg (dict): Config dict for normalization layer. + Defaults to None. + act_cfg (dict): Config dict for activation layer. + Defaults to None. + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. 
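The loops in __init__ below mirror these diagrams; as a counting sketch (the three-scale input is assumed), a P5 neck ends up with:

in_channels = [256, 512, 1024]         # assumed three-scale input
num_scales = len(in_channels)          # 3
num_reduce_layers = num_scales         # one per input scale
num_top_down_layers = num_scales - 1   # each paired with an upsample layer
num_bottom_up_layers = num_scales - 1  # each paired with a downsample layer
num_out_layers = num_scales            # one per output scale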
+ """ + + def __init__(self, + in_channels: List[int], + out_channels: Union[int, List[int]], + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + upsample_feats_cat_first: bool = True, + freeze_all: bool = False, + norm_cfg: ConfigType = None, + act_cfg: ConfigType = None, + init_cfg: OptMultiConfig = None, + **kwargs): + super().__init__(init_cfg) + self.in_channels = in_channels + self.out_channels = out_channels + self.deepen_factor = deepen_factor + self.widen_factor = widen_factor + self.upsample_feats_cat_first = upsample_feats_cat_first + self.freeze_all = freeze_all + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + + self.reduce_layers = nn.ModuleList() + for idx in range(len(in_channels)): + self.reduce_layers.append(self.build_reduce_layer(idx)) + + # build top-down blocks + self.upsample_layers = nn.ModuleList() + self.top_down_layers = nn.ModuleList() + for idx in range(len(in_channels) - 1, 0, -1): + self.upsample_layers.append(self.build_upsample_layer(idx=idx, n_layers=len(in_channels))) + self.top_down_layers.append(self.build_top_down_layer(idx)) + + # build bottom-up blocks + self.downsample_layers = nn.ModuleList() + self.bottom_up_layers = nn.ModuleList() + for idx in range(len(in_channels) - 1): + self.downsample_layers.append(self.build_downsample_layer(idx)) + self.bottom_up_layers.append(self.build_bottom_up_layer(idx)) + + self.out_layers = nn.ModuleList() + for idx in range(len(in_channels)): + self.out_layers.append(self.build_out_layer(idx)) + + @abstractmethod + def build_reduce_layer(self, idx: int): + """build reduce layer.""" + pass + + @abstractmethod + def build_upsample_layer(self, idx: int): + """build upsample layer.""" + pass + + @abstractmethod + def build_top_down_layer(self, idx: int): + """build top down layer.""" + pass + + @abstractmethod + def build_downsample_layer(self, idx: int): + """build downsample layer.""" + pass + + @abstractmethod + def build_bottom_up_layer(self, idx: int): + """build bottom up layer.""" + pass + + @abstractmethod + def build_out_layer(self, idx: int): + """build out layer.""" + pass + + def _freeze_all(self): + """Freeze the model.""" + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def train(self, mode=True): + """Convert the model into training mode while keep the normalization + layer freezed.""" + super().train(mode) + if self.freeze_all: + self._freeze_all() + + def forward(self, inputs: List[torch.Tensor]) -> tuple: + """Forward function.""" + assert len(inputs) == len(self.in_channels) + # reduce layers + reduce_outs = [] + for idx in range(len(self.in_channels)): + reduce_outs.append(self.reduce_layers[idx](inputs[idx])) + + # top-down path + inner_outs = [reduce_outs[-1]] + for idx in range(len(self.in_channels) - 1, 0, -1): + feat_high = inner_outs[0] + feat_low = reduce_outs[idx - 1] + upsample_feat = self.upsample_layers[len(self.in_channels) - 1 - + idx]( + feat_high) + if self.upsample_feats_cat_first: + top_down_layer_inputs = torch.cat([upsample_feat, feat_low], 1) + else: + top_down_layer_inputs = torch.cat([feat_low, upsample_feat], 1) + inner_out = self.top_down_layers[len(self.in_channels) - 1 - idx]( + top_down_layer_inputs) + inner_outs.insert(0, inner_out) + + # bottom-up path + outs = [inner_outs[0]] + for idx in range(len(self.in_channels) - 1): + feat_low = outs[-1] + feat_high = inner_outs[idx + 1] + downsample_feat = self.downsample_layers[idx](feat_low) + out = self.bottom_up_layers[idx]( + 
torch.cat([downsample_feat, feat_high], 1)) + outs.append(out) + + # out_layers + results = [] + for idx in range(len(self.in_channels)): + results.append(self.out_layers[idx](outs[idx])) + + return tuple(results) diff --git a/mmyolo/models/necks/cspnext_pafpn.py b/mmyolo/models/necks/cspnext_pafpn.py new file mode 100644 index 0000000000000000000000000000000000000000..310126f63e12f888daac50ca30674484f7b3a6ec --- /dev/null +++ b/mmyolo/models/necks/cspnext_pafpn.py @@ -0,0 +1,201 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from typing import Sequence + +import torch.nn as nn +from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule +from mmdet.models.backbones.csp_darknet import CSPLayer +from mmdet.utils import ConfigType, OptMultiConfig + +from mmyolo.registry import MODELS +from .base_yolo_neck import BaseYOLONeck + + +@MODELS.register_module() +class CSPNeXtPAFPN(BaseYOLONeck): + """Path Aggregation Network with CSPNeXt blocks. + + Args: + in_channels (Sequence[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale) + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + num_csp_blocks (int): Number of bottlenecks in CSPLayer. + Defaults to 3. + use_depthwise (bool): Whether to use depthwise separable convolution in + blocks. Defaults to False. + expand_ratio (float): Ratio to adjust the number of channels of the + hidden layer. Defaults to 0.5. + upsample_cfg (dict): Config dict for interpolate layer. + Default: `dict(scale_factor=2, mode='nearest')` + conv_cfg (dict, optional): Config dict for convolution layer. + Default: None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN') + act_cfg (dict): Config dict for activation layer. + Default: dict(type='SiLU', inplace=True) + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None. + """ + + def __init__( + self, + in_channels: Sequence[int], + out_channels: int, + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + num_csp_blocks: int = 3, + freeze_all: bool = False, + use_depthwise: bool = False, + expand_ratio: float = 0.5, + upsample_cfg: ConfigType = dict(scale_factor=2, mode='nearest'), + conv_cfg: bool = None, + norm_cfg: ConfigType = dict(type='BN'), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = dict( + type='Kaiming', + layer='Conv2d', + a=math.sqrt(5), + distribution='uniform', + mode='fan_in', + nonlinearity='leaky_relu') + ) -> None: + self.num_csp_blocks = round(num_csp_blocks * deepen_factor) + self.conv = DepthwiseSeparableConvModule \ + if use_depthwise else ConvModule + self.upsample_cfg = upsample_cfg + self.expand_ratio = expand_ratio + self.conv_cfg = conv_cfg + + super().__init__( + in_channels=[ + int(channel * widen_factor) for channel in in_channels + ], + out_channels=int(out_channels * widen_factor), + deepen_factor=deepen_factor, + widen_factor=widen_factor, + freeze_all=freeze_all, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + init_cfg=init_cfg) + + def build_reduce_layer(self, idx: int) -> nn.Module: + """build reduce layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The reduce layer. 
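Before the individual builder methods, a hedged end-to-end sketch of this neck (batch size, channel widths and resolutions are assumed and follow a stride 8/16/32 P5 layout):

import torch
from mmyolo.models.necks import CSPNeXtPAFPN

neck = CSPNeXtPAFPN(in_channels=[256, 512, 1024], out_channels=256)
feats = [
    torch.rand(1, 256, 80, 80),   # stride 8
    torch.rand(1, 512, 40, 40),   # stride 16
    torch.rand(1, 1024, 20, 20),  # stride 32
]
outs = neck(feats)
# each output keeps its input resolution with `out_channels` channels:
# (1, 256, 80, 80), (1, 256, 40, 40), (1, 256, 20, 20)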
+ """ + if idx == len(self.in_channels) - 1: + layer = self.conv( + self.in_channels[idx], + self.in_channels[idx - 1], + 1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + else: + layer = nn.Identity() + + return layer + + def build_upsample_layer(self, *args, **kwargs) -> nn.Module: + """build upsample layer.""" + return nn.Upsample(**self.upsample_cfg) + + def build_top_down_layer(self, idx: int) -> nn.Module: + """build top down layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The top down layer. + """ + if idx == 1: + return CSPLayer( + self.in_channels[idx - 1] * 2, + self.in_channels[idx - 1], + num_blocks=self.num_csp_blocks, + add_identity=False, + use_cspnext_block=True, + expand_ratio=self.expand_ratio, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + else: + return nn.Sequential( + CSPLayer( + self.in_channels[idx - 1] * 2, + self.in_channels[idx - 1], + num_blocks=self.num_csp_blocks, + add_identity=False, + use_cspnext_block=True, + expand_ratio=self.expand_ratio, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + self.conv( + self.in_channels[idx - 1], + self.in_channels[idx - 2], + kernel_size=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + + def build_downsample_layer(self, idx: int) -> nn.Module: + """build downsample layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The downsample layer. + """ + return self.conv( + self.in_channels[idx], + self.in_channels[idx], + kernel_size=3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def build_bottom_up_layer(self, idx: int) -> nn.Module: + """build bottom up layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The bottom up layer. + """ + return CSPLayer( + self.in_channels[idx] * 2, + self.in_channels[idx + 1], + num_blocks=self.num_csp_blocks, + add_identity=False, + use_cspnext_block=True, + expand_ratio=self.expand_ratio, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def build_out_layer(self, idx: int) -> nn.Module: + """build out layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The out layer. + """ + return self.conv( + self.in_channels[idx], + self.out_channels, + 3, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) diff --git a/mmyolo/models/necks/ppyoloe_csppan.py b/mmyolo/models/necks/ppyoloe_csppan.py new file mode 100644 index 0000000000000000000000000000000000000000..4e4ef7200bfc6784a7ce8d92bcfbc46314e518e9 --- /dev/null +++ b/mmyolo/models/necks/ppyoloe_csppan.py @@ -0,0 +1,216 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List + +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmdet.utils import ConfigType, OptMultiConfig + +from mmyolo.models.backbones.csp_resnet import CSPResLayer +from mmyolo.models.necks import BaseYOLONeck +from mmyolo.registry import MODELS + + +@MODELS.register_module() +class PPYOLOECSPPAFPN(BaseYOLONeck): + """CSPPAN in PPYOLOE. + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (List[int]): Number of output channels + (used at each scale). + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + freeze_all(bool): Whether to freeze the model. + num_csplayer (int): Number of `CSPResLayer` in per layer. + Defaults to 1. 
+ num_blocks_per_layer (int): Number of blocks per `CSPResLayer`. + Defaults to 3. + block_cfg (dict): Config dict for block. Defaults to + dict(type='PPYOLOEBasicBlock', shortcut=True, use_alpha=False) + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.1, eps=1e-5). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + drop_block_cfg (dict, optional): Drop block config. + Defaults to None. If you want to use Drop block after + `CSPResLayer`, you can set this para as + dict(type='mmdet.DropBlock', drop_prob=0.1, + block_size=3, warm_iters=0). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. + use_spp (bool): Whether to use `SPP` in reduce layer. + Defaults to False. + """ + + def __init__(self, + in_channels: List[int] = [256, 512, 1024], + out_channels: List[int] = [256, 512, 1024], + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + freeze_all: bool = False, + num_csplayer: int = 1, + num_blocks_per_layer: int = 3, + block_cfg: ConfigType = dict( + type='PPYOLOEBasicBlock', shortcut=False, + use_alpha=False), + norm_cfg: ConfigType = dict( + type='BN', momentum=0.1, eps=1e-5), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + drop_block_cfg: ConfigType = None, + init_cfg: OptMultiConfig = None, + use_spp: bool = False): + self.block_cfg = block_cfg + self.num_csplayer = num_csplayer + self.num_blocks_per_layer = round(num_blocks_per_layer * deepen_factor) + # Only use spp in last reduce_layer, if use_spp=True. + self.use_spp = use_spp + self.drop_block_cfg = drop_block_cfg + assert drop_block_cfg is None or isinstance(drop_block_cfg, dict) + + super().__init__( + in_channels=[ + int(channel * widen_factor) for channel in in_channels + ], + out_channels=[ + int(channel * widen_factor) for channel in out_channels + ], + deepen_factor=deepen_factor, + widen_factor=widen_factor, + freeze_all=freeze_all, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + init_cfg=init_cfg) + + def build_reduce_layer(self, idx: int): + """build reduce layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The reduce layer. + """ + if idx == len(self.in_channels) - 1: + # fpn_stage + in_channels = self.in_channels[idx] + out_channels = self.out_channels[idx] + + layer = [ + CSPResLayer( + in_channels=in_channels if i == 0 else out_channels, + out_channels=out_channels, + num_block=self.num_blocks_per_layer, + block_cfg=self.block_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + attention_cfg=None, + use_spp=self.use_spp) for i in range(self.num_csplayer) + ] + + if self.drop_block_cfg: + layer.append(MODELS.build(self.drop_block_cfg)) + layer = nn.Sequential(*layer) + else: + layer = nn.Identity() + + return layer + + def build_upsample_layer(self, idx: int) -> nn.Module: + """build upsample layer.""" + # fpn_route + in_channels = self.out_channels[idx] + return nn.Sequential( + ConvModule( + in_channels=in_channels, + out_channels=in_channels // 2, + kernel_size=1, + stride=1, + padding=0, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + nn.Upsample(scale_factor=2, mode='nearest')) + + def build_top_down_layer(self, idx: int) -> nn.Module: + """build top down layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The top down layer. 
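A sketch of the channel arithmetic used in the method below (the widths are assumed and widen_factor=1.0 is implied):

in_channels = [256, 512, 1024]    # assumed backbone widths
out_channels = [256, 512, 1024]
idx = 2
# the fpn_route halves the channels before concatenation, hence the `// 2`
top_down_in = in_channels[idx - 1] + out_channels[idx] // 2   # 512 + 512 = 1024
top_down_out = out_channels[idx - 1]                          # 512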
+ """ + # fpn_stage + in_channels = self.in_channels[idx - 1] + self.out_channels[idx] // 2 + out_channels = self.out_channels[idx - 1] + + layer = [ + CSPResLayer( + in_channels=in_channels if i == 0 else out_channels, + out_channels=out_channels, + num_block=self.num_blocks_per_layer, + block_cfg=self.block_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + attention_cfg=None, + use_spp=False) for i in range(self.num_csplayer) + ] + + if self.drop_block_cfg: + layer.append(MODELS.build(self.drop_block_cfg)) + + return nn.Sequential(*layer) + + def build_downsample_layer(self, idx: int) -> nn.Module: + """build downsample layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The downsample layer. + """ + # pan_route + return ConvModule( + in_channels=self.out_channels[idx], + out_channels=self.out_channels[idx], + kernel_size=3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def build_bottom_up_layer(self, idx: int) -> nn.Module: + """build bottom up layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The bottom up layer. + """ + # pan_stage + in_channels = self.out_channels[idx + 1] + self.out_channels[idx] + out_channels = self.out_channels[idx + 1] + + layer = [ + CSPResLayer( + in_channels=in_channels if i == 0 else out_channels, + out_channels=out_channels, + num_block=self.num_blocks_per_layer, + block_cfg=self.block_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + attention_cfg=None, + use_spp=False) for i in range(self.num_csplayer) + ] + + if self.drop_block_cfg: + layer.append(MODELS.build(self.drop_block_cfg)) + + return nn.Sequential(*layer) + + def build_out_layer(self, *args, **kwargs) -> nn.Module: + """build out layer.""" + return nn.Identity() diff --git a/mmyolo/models/necks/yolov5_pafpn.py b/mmyolo/models/necks/yolov5_pafpn.py new file mode 100644 index 0000000000000000000000000000000000000000..b95147fc512359442aeb1bbc88aadd07031bdadf --- /dev/null +++ b/mmyolo/models/necks/yolov5_pafpn.py @@ -0,0 +1,171 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Union + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmdet.models.backbones.csp_darknet import CSPLayer +from mmdet.utils import ConfigType, OptMultiConfig + +from mmyolo.registry import MODELS +from ..utils import make_divisible, make_round +from .base_yolo_neck import BaseYOLONeck + + +@MODELS.register_module() +class YOLOv5PAFPN(BaseYOLONeck): + """Path Aggregation Network used in YOLOv5. + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale) + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + num_csp_blocks (int): Number of bottlenecks in CSPLayer. Defaults to 1. + freeze_all(bool): Whether to freeze the model + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. 
+ """ + + def __init__(self, + in_channels: List[int], + out_channels: Union[List[int], int], + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + num_csp_blocks: int = 1, + freeze_all: bool = False, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None): + self.num_csp_blocks = num_csp_blocks + super().__init__( + in_channels=in_channels, + out_channels=out_channels, + deepen_factor=deepen_factor, + widen_factor=widen_factor, + freeze_all=freeze_all, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + init_cfg=init_cfg) + + def init_weights(self): + if self.init_cfg is None: + """Initialize the parameters.""" + for m in self.modules(): + if isinstance(m, torch.nn.Conv2d): + # In order to be consistent with the source code, + # reset the Conv2d initialization parameters + m.reset_parameters() + else: + super().init_weights() + + def build_reduce_layer(self, idx: int) -> nn.Module: + """build reduce layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The reduce layer. + """ + if idx == len(self.in_channels) - 1: + layer = ConvModule( + make_divisible(self.in_channels[idx], self.widen_factor), + make_divisible(self.in_channels[idx - 1], self.widen_factor), + 1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + else: + layer = nn.Identity() + + return layer + + def build_upsample_layer(self, *args, **kwargs) -> nn.Module: + """build upsample layer.""" + return nn.Upsample(scale_factor=2, mode='nearest') + + def build_top_down_layer(self, idx: int): + """build top down layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The top down layer. + """ + + if idx == 1: + return CSPLayer( + make_divisible(self.in_channels[idx - 1] * 2, + self.widen_factor), + make_divisible(self.in_channels[idx - 1], self.widen_factor), + num_blocks=make_round(self.num_csp_blocks, self.deepen_factor), + add_identity=False, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + else: + return nn.Sequential( + CSPLayer( + make_divisible(self.in_channels[idx - 1] * 2, + self.widen_factor), + make_divisible(self.in_channels[idx - 1], + self.widen_factor), + num_blocks=make_round(self.num_csp_blocks, + self.deepen_factor), + add_identity=False, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + ConvModule( + make_divisible(self.in_channels[idx - 1], + self.widen_factor), + make_divisible(self.in_channels[idx - 2], + self.widen_factor), + kernel_size=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + + def build_downsample_layer(self, idx: int) -> nn.Module: + """build downsample layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The downsample layer. + """ + return ConvModule( + make_divisible(self.in_channels[idx], self.widen_factor), + make_divisible(self.in_channels[idx], self.widen_factor), + kernel_size=3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def build_bottom_up_layer(self, idx: int) -> nn.Module: + """build bottom up layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The bottom up layer. 
+ """ + return CSPLayer( + make_divisible(self.in_channels[idx] * 2, self.widen_factor), + make_divisible(self.in_channels[idx + 1], self.widen_factor), + num_blocks=make_round(self.num_csp_blocks, self.deepen_factor), + add_identity=False, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def build_out_layer(self, *args, **kwargs) -> nn.Module: + """build out layer.""" + return nn.Identity() diff --git a/mmyolo/models/necks/yolov6_pafpn.py b/mmyolo/models/necks/yolov6_pafpn.py new file mode 100644 index 0000000000000000000000000000000000000000..74b7ce932ec5352db0fae9ed5f499fe447ac3d27 --- /dev/null +++ b/mmyolo/models/necks/yolov6_pafpn.py @@ -0,0 +1,285 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmdet.utils import ConfigType, OptMultiConfig + +from mmyolo.registry import MODELS +from ..layers import BepC3StageBlock, RepStageBlock +from ..utils import make_round +from .base_yolo_neck import BaseYOLONeck + + +@MODELS.register_module() +class YOLOv6RepPAFPN(BaseYOLONeck): + """Path Aggregation Network used in YOLOv6. + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale) + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + num_csp_blocks (int): Number of bottlenecks in CSPLayer. Defaults to 1. + freeze_all(bool): Whether to freeze the model. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='ReLU', inplace=True). + block_cfg (dict): Config dict for the block used to build each + layer. Defaults to dict(type='RepVGGBlock'). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + in_channels: List[int], + out_channels: int, + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + num_csp_blocks: int = 12, + freeze_all: bool = False, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='ReLU', inplace=True), + block_cfg: ConfigType = dict(type='RepVGGBlock'), + init_cfg: OptMultiConfig = None): + self.num_csp_blocks = num_csp_blocks + self.block_cfg = block_cfg + super().__init__( + in_channels=in_channels, + out_channels=out_channels, + deepen_factor=deepen_factor, + widen_factor=widen_factor, + freeze_all=freeze_all, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + init_cfg=init_cfg) + + def build_reduce_layer(self, idx: int) -> nn.Module: + """build reduce layer. + + Args: + idx (int): layer idx. + Returns: + nn.Module: The reduce layer. + """ + if idx == 2: + layer = ConvModule( + in_channels=int(self.in_channels[idx] * self.widen_factor), + out_channels=int(self.out_channels[idx - 1] * + self.widen_factor), + kernel_size=1, + stride=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + else: + layer = nn.Identity() + + return layer + + def build_upsample_layer(self, idx: int) -> nn.Module: + """build upsample layer. + + Args: + idx (int): layer idx. + Returns: + nn.Module: The upsample layer. 
+ """ + return nn.ConvTranspose2d( + in_channels=int(self.out_channels[idx - 1] * self.widen_factor), + out_channels=int(self.out_channels[idx - 1] * self.widen_factor), + kernel_size=2, + stride=2, + bias=True) + + def build_top_down_layer(self, idx: int) -> nn.Module: + """build top down layer. + + Args: + idx (int): layer idx. + Returns: + nn.Module: The top down layer. + """ + block_cfg = self.block_cfg.copy() + + layer0 = RepStageBlock( + in_channels=int( + (self.out_channels[idx - 1] + self.in_channels[idx - 1]) * + self.widen_factor), + out_channels=int(self.out_channels[idx - 1] * self.widen_factor), + num_blocks=make_round(self.num_csp_blocks, self.deepen_factor), + block_cfg=block_cfg) + + if idx == 1: + return layer0 + elif idx == 2: + layer1 = ConvModule( + in_channels=int(self.out_channels[idx - 1] * + self.widen_factor), + out_channels=int(self.out_channels[idx - 2] * + self.widen_factor), + kernel_size=1, + stride=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + return nn.Sequential(layer0, layer1) + + def build_downsample_layer(self, idx: int) -> nn.Module: + """build downsample layer. + + Args: + idx (int): layer idx. + Returns: + nn.Module: The downsample layer. + """ + return ConvModule( + in_channels=int(self.out_channels[idx] * self.widen_factor), + out_channels=int(self.out_channels[idx] * self.widen_factor), + kernel_size=3, + stride=2, + padding=3 // 2, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def build_bottom_up_layer(self, idx: int) -> nn.Module: + """build bottom up layer. + + Args: + idx (int): layer idx. + Returns: + nn.Module: The bottom up layer. + """ + block_cfg = self.block_cfg.copy() + + return RepStageBlock( + in_channels=int(self.out_channels[idx] * 2 * self.widen_factor), + out_channels=int(self.out_channels[idx + 1] * self.widen_factor), + num_blocks=make_round(self.num_csp_blocks, self.deepen_factor), + block_cfg=block_cfg) + + def build_out_layer(self, *args, **kwargs) -> nn.Module: + """build out layer.""" + return nn.Identity() + + def init_weights(self): + if self.init_cfg is None: + """Initialize the parameters.""" + for m in self.modules(): + if isinstance(m, torch.nn.Conv2d): + # In order to be consistent with the source code, + # reset the Conv2d initialization parameters + m.reset_parameters() + else: + super().init_weights() + + +@MODELS.register_module() +class YOLOv6CSPRepPAFPN(YOLOv6RepPAFPN): + """Path Aggregation Network used in YOLOv6. + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale) + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + num_csp_blocks (int): Number of bottlenecks in CSPLayer. Defaults to 1. + freeze_all(bool): Whether to freeze the model. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='ReLU', inplace=True). + block_cfg (dict): Config dict for the block used to build each + layer. Defaults to dict(type='RepVGGBlock'). + block_act_cfg (dict): Config dict for activation layer used in each + stage. Defaults to dict(type='SiLU', inplace=True). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. 
+ """ + + def __init__(self, + in_channels: List[int], + out_channels: int, + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + hidden_ratio: float = 0.5, + num_csp_blocks: int = 12, + freeze_all: bool = False, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='ReLU', inplace=True), + block_act_cfg: ConfigType = dict(type='SiLU', inplace=True), + block_cfg: ConfigType = dict(type='RepVGGBlock'), + init_cfg: OptMultiConfig = None): + self.hidden_ratio = hidden_ratio + self.block_act_cfg = block_act_cfg + super().__init__( + in_channels=in_channels, + out_channels=out_channels, + deepen_factor=deepen_factor, + widen_factor=widen_factor, + num_csp_blocks=num_csp_blocks, + freeze_all=freeze_all, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + block_cfg=block_cfg, + init_cfg=init_cfg) + + def build_top_down_layer(self, idx: int) -> nn.Module: + """build top down layer. + + Args: + idx (int): layer idx. + Returns: + nn.Module: The top down layer. + """ + block_cfg = self.block_cfg.copy() + + layer0 = BepC3StageBlock( + in_channels=int( + (self.out_channels[idx - 1] + self.in_channels[idx - 1]) * + self.widen_factor), + out_channels=int(self.out_channels[idx - 1] * self.widen_factor), + num_blocks=make_round(self.num_csp_blocks, self.deepen_factor), + block_cfg=block_cfg, + hidden_ratio=self.hidden_ratio, + norm_cfg=self.norm_cfg, + act_cfg=self.block_act_cfg) + + if idx == 1: + return layer0 + elif idx == 2: + layer1 = ConvModule( + in_channels=int(self.out_channels[idx - 1] * + self.widen_factor), + out_channels=int(self.out_channels[idx - 2] * + self.widen_factor), + kernel_size=1, + stride=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + return nn.Sequential(layer0, layer1) + + def build_bottom_up_layer(self, idx: int) -> nn.Module: + """build bottom up layer. + + Args: + idx (int): layer idx. + Returns: + nn.Module: The bottom up layer. + """ + block_cfg = self.block_cfg.copy() + + return BepC3StageBlock( + in_channels=int(self.out_channels[idx] * 2 * self.widen_factor), + out_channels=int(self.out_channels[idx + 1] * self.widen_factor), + num_blocks=make_round(self.num_csp_blocks, self.deepen_factor), + block_cfg=block_cfg, + hidden_ratio=self.hidden_ratio, + norm_cfg=self.norm_cfg, + act_cfg=self.block_act_cfg) diff --git a/mmyolo/models/necks/yolov7_pafpn.py b/mmyolo/models/necks/yolov7_pafpn.py new file mode 100644 index 0000000000000000000000000000000000000000..1d31f4623b50083ff820e6b20229b33ad0f41860 --- /dev/null +++ b/mmyolo/models/necks/yolov7_pafpn.py @@ -0,0 +1,216 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List + +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmdet.utils import ConfigType, OptMultiConfig + +from mmyolo.registry import MODELS +from ..layers import MaxPoolAndStrideConvBlock, RepVGGBlock, SPPFCSPBlock +from .base_yolo_neck import BaseYOLONeck + + +@MODELS.register_module() +class YOLOv7PAFPN(BaseYOLONeck): + """Path Aggregation Network used in YOLOv7. + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale). + block_cfg (dict): Config dict for block. + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + spp_expand_ratio (float): Expand ratio of SPPCSPBlock. + Defaults to 0.5. 
+ is_tiny_version (bool): Is tiny version of neck. If True, + it means it is a yolov7 tiny model. Defaults to False. + use_maxpool_in_downsample (bool): Whether maxpooling is + used in downsample layers. Defaults to True. + use_in_channels_in_downsample (bool): MaxPoolAndStrideConvBlock + module input parameters. Defaults to False. + use_repconv_outs (bool): Whether to use `repconv` in the output + layer. Defaults to True. + upsample_feats_cat_first (bool): Whether the output features are + concat first after upsampling in the topdown module. + Defaults to True. Currently only YOLOv7 is false. + freeze_all(bool): Whether to freeze the model. Defaults to False. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + in_channels: List[int], + out_channels: List[int], + block_cfg: dict = dict( + type='ELANBlock', + middle_ratio=0.5, + block_ratio=0.25, + num_blocks=4, + num_convs_in_block=1), + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + spp_expand_ratio: float = 0.5, + is_tiny_version: bool = False, + use_maxpool_in_downsample: bool = True, + use_in_channels_in_downsample: bool = False, + use_repconv_outs: bool = True, + upsample_feats_cat_first: bool = False, + freeze_all: bool = False, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None): + + self.is_tiny_version = is_tiny_version + self.use_maxpool_in_downsample = use_maxpool_in_downsample + self.use_in_channels_in_downsample = use_in_channels_in_downsample + self.spp_expand_ratio = spp_expand_ratio + self.use_repconv_outs = use_repconv_outs + self.block_cfg = block_cfg + self.block_cfg.setdefault('norm_cfg', norm_cfg) + self.block_cfg.setdefault('act_cfg', act_cfg) + + super().__init__( + in_channels=[ + int(channel * widen_factor) for channel in in_channels + ], + out_channels=[ + int(channel * widen_factor) for channel in out_channels + ], + deepen_factor=deepen_factor, + widen_factor=widen_factor, + upsample_feats_cat_first=upsample_feats_cat_first, + freeze_all=freeze_all, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + init_cfg=init_cfg) + + def build_reduce_layer(self, idx: int) -> nn.Module: + """build reduce layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The reduce layer. + """ + if idx == len(self.in_channels) - 1: + layer = SPPFCSPBlock( + self.in_channels[idx], + self.out_channels[idx], + expand_ratio=self.spp_expand_ratio, + is_tiny_version=self.is_tiny_version, + kernel_sizes=5, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + else: + layer = ConvModule( + self.in_channels[idx], + self.out_channels[idx], + 1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + return layer + + def build_upsample_layer(self, idx: int) -> nn.Module: + """build upsample layer.""" + return nn.Sequential( + ConvModule( + self.out_channels[idx], + self.out_channels[idx - 1], + 1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + nn.Upsample(scale_factor=2, mode='nearest')) + + def build_top_down_layer(self, idx: int) -> nn.Module: + """build top down layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The top down layer. 
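A sketch of how block_cfg is completed before MODELS.build in the method below (the per-scale widths are assumed; the ELANBlock fields match the default block_cfg above):

out_channels = [128, 256, 512]   # assumed widths after widen_factor
idx = 1
block_cfg = dict(
    type='ELANBlock',
    middle_ratio=0.5,
    block_ratio=0.25,
    num_blocks=4,
    num_convs_in_block=1)
block_cfg['in_channels'] = out_channels[idx - 1] * 2   # cat of upsampled + lateral
block_cfg['out_channels'] = out_channels[idx - 1]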
+ """ + block_cfg = self.block_cfg.copy() + block_cfg['in_channels'] = self.out_channels[idx - 1] * 2 + block_cfg['out_channels'] = self.out_channels[idx - 1] + return MODELS.build(block_cfg) + + def build_downsample_layer(self, idx: int) -> nn.Module: + """build downsample layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The downsample layer. + """ + if self.use_maxpool_in_downsample and not self.is_tiny_version: + return MaxPoolAndStrideConvBlock( + self.out_channels[idx], + self.out_channels[idx + 1], + use_in_channels_of_middle=self.use_in_channels_in_downsample, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + else: + return ConvModule( + self.out_channels[idx], + self.out_channels[idx + 1], + 3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def build_bottom_up_layer(self, idx: int) -> nn.Module: + """build bottom up layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The bottom up layer. + """ + block_cfg = self.block_cfg.copy() + block_cfg['in_channels'] = self.out_channels[idx + 1] * 2 + block_cfg['out_channels'] = self.out_channels[idx + 1] + return MODELS.build(block_cfg) + + def build_out_layer(self, idx: int) -> nn.Module: + """build out layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The out layer. + """ + if len(self.in_channels) == 4: + # P6 + return nn.Identity() + + out_channels = self.out_channels[idx] * 2 + + if self.use_repconv_outs: + return RepVGGBlock( + self.out_channels[idx], + out_channels, + 3, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + else: + return ConvModule( + self.out_channels[idx], + out_channels, + 3, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) diff --git a/mmyolo/models/necks/yolov8_pafpn.py b/mmyolo/models/necks/yolov8_pafpn.py new file mode 100644 index 0000000000000000000000000000000000000000..e26698bcc191b0141d89c1e965de811494a96539 --- /dev/null +++ b/mmyolo/models/necks/yolov8_pafpn.py @@ -0,0 +1,102 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Union + +import torch.nn as nn +from mmdet.utils import ConfigType, OptMultiConfig + +from mmyolo.registry import MODELS +from .. import CSPLayerWithTwoConv +from ..utils import make_divisible, make_round +from .yolov5_pafpn import YOLOv5PAFPN + + +@MODELS.register_module() +class YOLOv8PAFPN(YOLOv5PAFPN): + """Path Aggregation Network used in YOLOv8. + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale) + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + num_csp_blocks (int): Number of bottlenecks in CSPLayer. Defaults to 1. + freeze_all(bool): Whether to freeze the model + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. 
+ """ + + def __init__(self, + in_channels: List[int], + out_channels: Union[List[int], int], + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + num_csp_blocks: int = 3, + freeze_all: bool = False, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None): + super().__init__( + in_channels=in_channels, + out_channels=out_channels, + deepen_factor=deepen_factor, + widen_factor=widen_factor, + num_csp_blocks=num_csp_blocks, + freeze_all=freeze_all, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + init_cfg=init_cfg) + + def build_reduce_layer(self, idx: int) -> nn.Module: + """build reduce layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The reduce layer. + """ + return nn.Identity() + + def build_top_down_layer(self, idx: int) -> nn.Module: + """build top down layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The top down layer. + """ + return CSPLayerWithTwoConv( + make_divisible((self.in_channels[idx - 1] + self.in_channels[idx]), + self.widen_factor), + make_divisible(self.out_channels[idx - 1], self.widen_factor), + num_blocks=make_round(self.num_csp_blocks, self.deepen_factor), + add_identity=False, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def build_bottom_up_layer(self, idx: int) -> nn.Module: + """build bottom up layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The bottom up layer. + """ + return CSPLayerWithTwoConv( + make_divisible( + (self.out_channels[idx] + self.out_channels[idx + 1]), + self.widen_factor), + make_divisible(self.out_channels[idx + 1], self.widen_factor), + num_blocks=make_round(self.num_csp_blocks, self.deepen_factor), + add_identity=False, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) diff --git a/mmyolo/models/necks/yolox_pafpn.py b/mmyolo/models/necks/yolox_pafpn.py new file mode 100644 index 0000000000000000000000000000000000000000..bd2595e70fe47e38e68ebd0d878deb6f264bf2d1 --- /dev/null +++ b/mmyolo/models/necks/yolox_pafpn.py @@ -0,0 +1,172 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List + +import torch.nn as nn +from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule +from mmdet.models.backbones.csp_darknet import CSPLayer +from mmdet.utils import ConfigType, OptMultiConfig + +from mmyolo.registry import MODELS +from .base_yolo_neck import BaseYOLONeck + + +@MODELS.register_module() +class YOLOXPAFPN(BaseYOLONeck): + """Path Aggregation Network used in YOLOX. + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale). + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + num_csp_blocks (int): Number of bottlenecks in CSPLayer. Defaults to 1. + use_depthwise (bool): Whether to use depthwise separable convolution. + Defaults to False. + freeze_all(bool): Whether to freeze the model. Defaults to False. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. 
+ """ + + def __init__(self, + in_channels: List[int], + out_channels: int, + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + num_csp_blocks: int = 3, + use_depthwise: bool = False, + freeze_all: bool = False, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None): + self.num_csp_blocks = round(num_csp_blocks * deepen_factor) + self.use_depthwise = use_depthwise + + super().__init__( + in_channels=[ + int(channel * widen_factor) for channel in in_channels + ], + out_channels=int(out_channels * widen_factor), + deepen_factor=deepen_factor, + widen_factor=widen_factor, + freeze_all=freeze_all, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + init_cfg=init_cfg) + + def build_reduce_layer(self, idx: int) -> nn.Module: + """build reduce layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The reduce layer. + """ + if idx == 2: + layer = ConvModule( + self.in_channels[idx], + self.in_channels[idx - 1], + 1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + else: + layer = nn.Identity() + + return layer + + def build_upsample_layer(self, *args, **kwargs) -> nn.Module: + """build upsample layer.""" + return nn.Upsample(scale_factor=2, mode='nearest') + + def build_top_down_layer(self, idx: int) -> nn.Module: + """build top down layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The top down layer. + """ + if idx == 1: + return CSPLayer( + self.in_channels[idx - 1] * 2, + self.in_channels[idx - 1], + num_blocks=self.num_csp_blocks, + add_identity=False, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + elif idx == 2: + return nn.Sequential( + CSPLayer( + self.in_channels[idx - 1] * 2, + self.in_channels[idx - 1], + num_blocks=self.num_csp_blocks, + add_identity=False, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + ConvModule( + self.in_channels[idx - 1], + self.in_channels[idx - 2], + kernel_size=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + + def build_downsample_layer(self, idx: int) -> nn.Module: + """build downsample layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The downsample layer. + """ + conv = DepthwiseSeparableConvModule \ + if self.use_depthwise else ConvModule + return conv( + self.in_channels[idx], + self.in_channels[idx], + kernel_size=3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def build_bottom_up_layer(self, idx: int) -> nn.Module: + """build bottom up layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The bottom up layer. + """ + return CSPLayer( + self.in_channels[idx] * 2, + self.in_channels[idx + 1], + num_blocks=self.num_csp_blocks, + add_identity=False, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def build_out_layer(self, idx: int) -> nn.Module: + """build out layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The out layer. + """ + return ConvModule( + self.in_channels[idx], + self.out_channels, + 1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) diff --git a/mmyolo/models/plugins/__init__.py b/mmyolo/models/plugins/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..497233ac21a4dd1a6a2a3127c09435d8146eb553 --- /dev/null +++ b/mmyolo/models/plugins/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .cbam import CBAM + +__all__ = ['CBAM'] diff --git a/mmyolo/models/plugins/cbam.py b/mmyolo/models/plugins/cbam.py new file mode 100644 index 0000000000000000000000000000000000000000..e9559f2e2db951a5681ec9af5864928ed480361b --- /dev/null +++ b/mmyolo/models/plugins/cbam.py @@ -0,0 +1,119 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmdet.utils import OptMultiConfig +from mmengine.model import BaseModule + +from mmyolo.registry import MODELS + + +class ChannelAttention(BaseModule): + """ChannelAttention. + + Args: + channels (int): The input (and output) channels of the + ChannelAttention. + reduce_ratio (int): Squeeze ratio in ChannelAttention, the intermediate + channel will be ``int(channels/ratio)``. Defaults to 16. + act_cfg (dict): Config dict for activation layer + Defaults to dict(type='ReLU'). + """ + + def __init__(self, + channels: int, + reduce_ratio: int = 16, + act_cfg: dict = dict(type='ReLU')): + super().__init__() + + self.avg_pool = nn.AdaptiveAvgPool2d(1) + self.max_pool = nn.AdaptiveMaxPool2d(1) + + self.fc = nn.Sequential( + ConvModule( + in_channels=channels, + out_channels=int(channels / reduce_ratio), + kernel_size=1, + stride=1, + conv_cfg=None, + act_cfg=act_cfg), + ConvModule( + in_channels=int(channels / reduce_ratio), + out_channels=channels, + kernel_size=1, + stride=1, + conv_cfg=None, + act_cfg=None)) + self.sigmoid = nn.Sigmoid() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Forward function.""" + avgpool_out = self.fc(self.avg_pool(x)) + maxpool_out = self.fc(self.max_pool(x)) + out = self.sigmoid(avgpool_out + maxpool_out) + return out + + +class SpatialAttention(BaseModule): + """SpatialAttention + Args: + kernel_size (int): The size of the convolution kernel in + SpatialAttention. Defaults to 7. + """ + + def __init__(self, kernel_size: int = 7): + super().__init__() + + self.conv = ConvModule( + in_channels=2, + out_channels=1, + kernel_size=kernel_size, + stride=1, + padding=kernel_size // 2, + conv_cfg=None, + act_cfg=dict(type='Sigmoid')) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Forward function.""" + avg_out = torch.mean(x, dim=1, keepdim=True) + max_out, _ = torch.max(x, dim=1, keepdim=True) + out = torch.cat([avg_out, max_out], dim=1) + out = self.conv(out) + return out + + +@MODELS.register_module() +class CBAM(BaseModule): + """Convolutional Block Attention Module. arxiv link: + https://arxiv.org/abs/1807.06521v2. + + Args: + in_channels (int): The input (and output) channels of the CBAM. + reduce_ratio (int): Squeeze ratio in ChannelAttention, the intermediate + channel will be ``int(channels/ratio)``. Defaults to 16. + kernel_size (int): The size of the convolution kernel in + SpatialAttention. Defaults to 7. + act_cfg (dict): Config dict for activation layer in ChannelAttention + Defaults to dict(type='ReLU'). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. 
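A hedged usage sketch; both attention branches are multiplicative gates, so the input shape is preserved (batch size and channel count assumed):

import torch
from mmyolo.models.plugins import CBAM

cbam = CBAM(in_channels=256, reduce_ratio=16, kernel_size=7)
x = torch.rand(2, 256, 40, 40)
print(cbam(x).shape)  # torch.Size([2, 256, 40, 40])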
+ """ + + def __init__(self, + in_channels: int, + reduce_ratio: int = 16, + kernel_size: int = 7, + act_cfg: dict = dict(type='ReLU'), + init_cfg: OptMultiConfig = None): + super().__init__(init_cfg) + self.channel_attention = ChannelAttention( + channels=in_channels, reduce_ratio=reduce_ratio, act_cfg=act_cfg) + + self.spatial_attention = SpatialAttention(kernel_size) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Forward function.""" + out = self.channel_attention(x) * x + out = self.spatial_attention(out) * out + return out diff --git a/mmyolo/models/task_modules/__init__.py b/mmyolo/models/task_modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7dbdc25fa3cf16e85e0e99e7d302a98f2b4f13ce --- /dev/null +++ b/mmyolo/models/task_modules/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .assigners import BatchATSSAssigner, BatchTaskAlignedAssigner +from .coders import YOLOv5BBoxCoder, YOLOXBBoxCoder + +__all__ = [ + 'YOLOv5BBoxCoder', 'YOLOXBBoxCoder', 'BatchATSSAssigner', + 'BatchTaskAlignedAssigner' +] diff --git a/mmyolo/models/task_modules/assigners/__init__.py b/mmyolo/models/task_modules/assigners/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e74ab728b301b98eaa3702cae4befc82d62f0bc5 --- /dev/null +++ b/mmyolo/models/task_modules/assigners/__init__.py @@ -0,0 +1,12 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .batch_atss_assigner import BatchATSSAssigner +from .batch_dsl_assigner import BatchDynamicSoftLabelAssigner +from .batch_task_aligned_assigner import BatchTaskAlignedAssigner +from .utils import (select_candidates_in_gts, select_highest_overlaps, + yolov6_iou_calculator) + +__all__ = [ + 'BatchATSSAssigner', 'BatchTaskAlignedAssigner', + 'select_candidates_in_gts', 'select_highest_overlaps', + 'yolov6_iou_calculator', 'BatchDynamicSoftLabelAssigner' +] diff --git a/mmyolo/models/task_modules/assigners/batch_atss_assigner.py b/mmyolo/models/task_modules/assigners/batch_atss_assigner.py new file mode 100644 index 0000000000000000000000000000000000000000..45b3069afde73e240890273c58e3860da59ad854 --- /dev/null +++ b/mmyolo/models/task_modules/assigners/batch_atss_assigner.py @@ -0,0 +1,339 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmdet.utils import ConfigType +from torch import Tensor + +from mmyolo.registry import TASK_UTILS +from .utils import (select_candidates_in_gts, select_highest_overlaps, + yolov6_iou_calculator) + + +def bbox_center_distance(bboxes: Tensor, + priors: Tensor) -> Tuple[Tensor, Tensor]: + """Compute the center distance between bboxes and priors. + + Args: + bboxes (Tensor): Shape (n, 4) for bbox, "xyxy" format. + priors (Tensor): Shape (num_priors, 4) for priors, "xyxy" format. + + Returns: + distances (Tensor): Center distances between bboxes and priors, + shape (num_priors, n). + priors_points (Tensor): Priors cx cy points, + shape (num_priors, 2). 
+ """ + bbox_cx = (bboxes[:, 0] + bboxes[:, 2]) / 2.0 + bbox_cy = (bboxes[:, 1] + bboxes[:, 3]) / 2.0 + bbox_points = torch.stack((bbox_cx, bbox_cy), dim=1) + + priors_cx = (priors[:, 0] + priors[:, 2]) / 2.0 + priors_cy = (priors[:, 1] + priors[:, 3]) / 2.0 + priors_points = torch.stack((priors_cx, priors_cy), dim=1) + + distances = (bbox_points[:, None, :] - + priors_points[None, :, :]).pow(2).sum(-1).sqrt() + + return distances, priors_points + + +@TASK_UTILS.register_module() +class BatchATSSAssigner(nn.Module): + """Assign a batch of corresponding gt bboxes or background to each prior. + + This code is based on + https://github.com/meituan/YOLOv6/blob/main/yolov6/assigners/atss_assigner.py + + Each proposal will be assigned with `0` or a positive integer + indicating the ground truth index. + + - 0: negative sample, no assigned gt + - positive integer: positive sample, index (1-based) of assigned gt + + Args: + num_classes (int): number of class + iou_calculator (:obj:`ConfigDict` or dict): Config dict for iou + calculator. Defaults to ``dict(type='BboxOverlaps2D')`` + topk (int): number of priors selected in each level + """ + + def __init__( + self, + num_classes: int, + iou_calculator: ConfigType = dict(type='mmdet.BboxOverlaps2D'), + topk: int = 9): + super().__init__() + self.num_classes = num_classes + self.iou_calculator = TASK_UTILS.build(iou_calculator) + self.topk = topk + + @torch.no_grad() + def forward(self, pred_bboxes: Tensor, priors: Tensor, + num_level_priors: List, gt_labels: Tensor, gt_bboxes: Tensor, + pad_bbox_flag: Tensor) -> dict: + """Assign gt to priors. + + The assignment is done in following steps + + 1. compute iou between all prior (prior of all pyramid levels) and gt + 2. compute center distance between all prior and gt + 3. on each pyramid level, for each gt, select k prior whose center + are closest to the gt center, so we total select k*l prior as + candidates for each gt + 4. get corresponding iou for the these candidates, and compute the + mean and std, set mean + std as the iou threshold + 5. select these candidates whose iou are greater than or equal to + the threshold as positive + 6. 
limit the positive sample's center in gt + + Args: + pred_bboxes (Tensor): Predicted bounding boxes, + shape(batch_size, num_priors, 4) + priors (Tensor): Model priors with stride, shape(num_priors, 4) + num_level_priors (List): Number of bboxes in each level, len(3) + gt_labels (Tensor): Ground truth label, + shape(batch_size, num_gt, 1) + gt_bboxes (Tensor): Ground truth bbox, + shape(batch_size, num_gt, 4) + pad_bbox_flag (Tensor): Ground truth bbox mask, + 1 means bbox, 0 means no bbox, + shape(batch_size, num_gt, 1) + Returns: + assigned_result (dict): Assigned result + 'assigned_labels' (Tensor): shape(batch_size, num_gt) + 'assigned_bboxes' (Tensor): shape(batch_size, num_gt, 4) + 'assigned_scores' (Tensor): + shape(batch_size, num_gt, number_classes) + 'fg_mask_pre_prior' (Tensor): shape(bs, num_gt) + """ + # generate priors + cell_half_size = priors[:, 2:] * 2.5 + priors_gen = torch.zeros_like(priors) + priors_gen[:, :2] = priors[:, :2] - cell_half_size + priors_gen[:, 2:] = priors[:, :2] + cell_half_size + priors = priors_gen + + batch_size = gt_bboxes.size(0) + num_gt, num_priors = gt_bboxes.size(1), priors.size(0) + + assigned_result = { + 'assigned_labels': + gt_bboxes.new_full([batch_size, num_priors], self.num_classes), + 'assigned_bboxes': + gt_bboxes.new_full([batch_size, num_priors, 4], 0), + 'assigned_scores': + gt_bboxes.new_full([batch_size, num_priors, self.num_classes], 0), + 'fg_mask_pre_prior': + gt_bboxes.new_full([batch_size, num_priors], 0) + } + + if num_gt == 0: + return assigned_result + + # compute iou between all prior (prior of all pyramid levels) and gt + overlaps = self.iou_calculator(gt_bboxes.reshape([-1, 4]), priors) + overlaps = overlaps.reshape([batch_size, -1, num_priors]) + + # compute center distance between all prior and gt + distances, priors_points = bbox_center_distance( + gt_bboxes.reshape([-1, 4]), priors) + distances = distances.reshape([batch_size, -1, num_priors]) + + # Selecting candidates based on the center distance + is_in_candidate, candidate_idxs = self.select_topk_candidates( + distances, num_level_priors, pad_bbox_flag) + + # get corresponding iou for the these candidates, and compute the + # mean and std, set mean + std as the iou threshold + overlaps_thr_per_gt, iou_candidates = self.threshold_calculator( + is_in_candidate, candidate_idxs, overlaps, num_priors, batch_size, + num_gt) + + # select candidates iou >= threshold as positive + is_pos = torch.where( + iou_candidates > overlaps_thr_per_gt.repeat([1, 1, num_priors]), + is_in_candidate, torch.zeros_like(is_in_candidate)) + + is_in_gts = select_candidates_in_gts(priors_points, gt_bboxes) + pos_mask = is_pos * is_in_gts * pad_bbox_flag + + # if an anchor box is assigned to multiple gts, + # the one with the highest IoU will be selected. 
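# Annotation (added for clarity, not part of the original patch):
# select_highest_overlaps resolves priors that hit several gts. It collapses
# the (batch, num_gt, num_priors) pos_mask along the gt axis so each prior
# keeps at most one gt -- the one with the largest IoU. For example, a prior
# matched to gt#1 (IoU 0.55) and gt#3 (IoU 0.72) is kept for gt#3 only, and
# fg_mask_pre_prior then flags it as a foreground prior.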
+ gt_idx_pre_prior, fg_mask_pre_prior, pos_mask = \ + select_highest_overlaps(pos_mask, overlaps, num_gt) + + # assigned target + assigned_labels, assigned_bboxes, assigned_scores = self.get_targets( + gt_labels, gt_bboxes, gt_idx_pre_prior, fg_mask_pre_prior, + num_priors, batch_size, num_gt) + + # soft label with iou + if pred_bboxes is not None: + ious = yolov6_iou_calculator(gt_bboxes, pred_bboxes) * pos_mask + ious = ious.max(axis=-2)[0].unsqueeze(-1) + assigned_scores *= ious + + assigned_result['assigned_labels'] = assigned_labels.long() + assigned_result['assigned_bboxes'] = assigned_bboxes + assigned_result['assigned_scores'] = assigned_scores + assigned_result['fg_mask_pre_prior'] = fg_mask_pre_prior.bool() + return assigned_result + + def select_topk_candidates(self, distances: Tensor, + num_level_priors: List[int], + pad_bbox_flag: Tensor) -> Tuple[Tensor, Tensor]: + """Selecting candidates based on the center distance. + + Args: + distances (Tensor): Distance between all bbox and gt, + shape(batch_size, num_gt, num_priors) + num_level_priors (List[int]): Number of bboxes in each level, + len(3) + pad_bbox_flag (Tensor): Ground truth bbox mask, + shape(batch_size, num_gt, 1) + + Return: + is_in_candidate_list (Tensor): Flag show that each level have + topk candidates or not, shape(batch_size, num_gt, num_priors) + candidate_idxs (Tensor): Candidates index, + shape(batch_size, num_gt, num_gt) + """ + is_in_candidate_list = [] + candidate_idxs = [] + start_idx = 0 + + distances_dtype = distances.dtype + distances = torch.split(distances, num_level_priors, dim=-1) + pad_bbox_flag = pad_bbox_flag.repeat(1, 1, self.topk).bool() + + for distances_per_level, priors_per_level in zip( + distances, num_level_priors): + # on each pyramid level, for each gt, + # select k bbox whose center are closest to the gt center + end_index = start_idx + priors_per_level + selected_k = min(self.topk, priors_per_level) + + _, topk_idxs_per_level = distances_per_level.topk( + selected_k, dim=-1, largest=False) + candidate_idxs.append(topk_idxs_per_level + start_idx) + + topk_idxs_per_level = torch.where( + pad_bbox_flag, topk_idxs_per_level, + torch.zeros_like(topk_idxs_per_level)) + + is_in_candidate = F.one_hot(topk_idxs_per_level, + priors_per_level).sum(dim=-2) + is_in_candidate = torch.where(is_in_candidate > 1, + torch.zeros_like(is_in_candidate), + is_in_candidate) + is_in_candidate_list.append(is_in_candidate.to(distances_dtype)) + + start_idx = end_index + + is_in_candidate_list = torch.cat(is_in_candidate_list, dim=-1) + candidate_idxs = torch.cat(candidate_idxs, dim=-1) + + return is_in_candidate_list, candidate_idxs + + @staticmethod + def threshold_calculator(is_in_candidate: List, candidate_idxs: Tensor, + overlaps: Tensor, num_priors: int, + batch_size: int, + num_gt: int) -> Tuple[Tensor, Tensor]: + """Get corresponding iou for the these candidates, and compute the mean + and std, set mean + std as the iou threshold. + + Args: + is_in_candidate (Tensor): Flag show that each level have + topk candidates or not, shape(batch_size, num_gt, num_priors). + candidate_idxs (Tensor): Candidates index, + shape(batch_size, num_gt, num_gt) + overlaps (Tensor): Overlaps area, + shape(batch_size, num_gt, num_priors). + num_priors (int): Number of priors. + batch_size (int): Batch size. + num_gt (int): Number of ground truth. + + Return: + overlaps_thr_per_gt (Tensor): Overlap threshold of + per ground truth, shape(batch_size, num_gt, 1). 
+ candidate_overlaps (Tensor): Candidate overlaps, + shape(batch_size, num_gt, num_priors). + """ + + batch_size_num_gt = batch_size * num_gt + candidate_overlaps = torch.where(is_in_candidate > 0, overlaps, + torch.zeros_like(overlaps)) + candidate_idxs = candidate_idxs.reshape([batch_size_num_gt, -1]) + + assist_indexes = num_priors * torch.arange( + batch_size_num_gt, device=candidate_idxs.device) + assist_indexes = assist_indexes[:, None] + flatten_indexes = candidate_idxs + assist_indexes + + candidate_overlaps_reshape = candidate_overlaps.reshape( + -1)[flatten_indexes] + candidate_overlaps_reshape = candidate_overlaps_reshape.reshape( + [batch_size, num_gt, -1]) + + overlaps_mean_per_gt = candidate_overlaps_reshape.mean( + axis=-1, keepdim=True) + overlaps_std_per_gt = candidate_overlaps_reshape.std( + axis=-1, keepdim=True) + overlaps_thr_per_gt = overlaps_mean_per_gt + overlaps_std_per_gt + + return overlaps_thr_per_gt, candidate_overlaps + + def get_targets(self, gt_labels: Tensor, gt_bboxes: Tensor, + assigned_gt_inds: Tensor, fg_mask_pre_prior: Tensor, + num_priors: int, batch_size: int, + num_gt: int) -> Tuple[Tensor, Tensor, Tensor]: + """Get target info. + + Args: + gt_labels (Tensor): Ground true labels, + shape(batch_size, num_gt, 1) + gt_bboxes (Tensor): Ground true bboxes, + shape(batch_size, num_gt, 4) + assigned_gt_inds (Tensor): Assigned ground truth indexes, + shape(batch_size, num_priors) + fg_mask_pre_prior (Tensor): Force ground truth matching mask, + shape(batch_size, num_priors) + num_priors (int): Number of priors. + batch_size (int): Batch size. + num_gt (int): Number of ground truth. + + Return: + assigned_labels (Tensor): Assigned labels, + shape(batch_size, num_priors) + assigned_bboxes (Tensor): Assigned bboxes, + shape(batch_size, num_priors) + assigned_scores (Tensor): Assigned scores, + shape(batch_size, num_priors) + """ + + # assigned target labels + batch_index = torch.arange( + batch_size, dtype=gt_labels.dtype, device=gt_labels.device) + batch_index = batch_index[..., None] + assigned_gt_inds = (assigned_gt_inds + batch_index * num_gt).long() + assigned_labels = gt_labels.flatten()[assigned_gt_inds.flatten()] + assigned_labels = assigned_labels.reshape([batch_size, num_priors]) + assigned_labels = torch.where( + fg_mask_pre_prior > 0, assigned_labels, + torch.full_like(assigned_labels, self.num_classes)) + + # assigned target boxes + assigned_bboxes = gt_bboxes.reshape([-1, + 4])[assigned_gt_inds.flatten()] + assigned_bboxes = assigned_bboxes.reshape([batch_size, num_priors, 4]) + + # assigned target scores + assigned_scores = F.one_hot(assigned_labels.long(), + self.num_classes + 1).float() + assigned_scores = assigned_scores[:, :, :self.num_classes] + + return assigned_labels, assigned_bboxes, assigned_scores diff --git a/mmyolo/models/task_modules/assigners/batch_dsl_assigner.py b/mmyolo/models/task_modules/assigners/batch_dsl_assigner.py new file mode 100644 index 0000000000000000000000000000000000000000..5ae0f80239590f9c906778e6e4c7c6b4bd10c488 --- /dev/null +++ b/mmyolo/models/task_modules/assigners/batch_dsl_assigner.py @@ -0,0 +1,272 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
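Before the dynamic soft-label assigner that follows, a small standalone sketch of the mean-plus-std IoU thresholding performed by threshold_calculator above. The IoU values are invented purely for illustration; the snippet is not part of this patch.

# Toy example of the ATSS-style IoU threshold (mean + std of candidate IoUs).
import torch

candidate_ious = torch.tensor([[0.10, 0.30, 0.50, 0.70]])  # one gt, four candidate priors
iou_thr = candidate_ious.mean(-1, keepdim=True) + candidate_ious.std(-1, keepdim=True)
# iou_thr is roughly 0.40 + 0.26 = 0.66, so only the 0.70 candidate stays positive
is_pos = candidate_ious > iou_thr
print(iou_thr, is_pos)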
+from typing import Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmdet.structures.bbox import BaseBoxes +from mmdet.utils import ConfigType +from torch import Tensor + +from mmyolo.registry import TASK_UTILS + +INF = 100000000 +EPS = 1.0e-7 + + +def find_inside_points(boxes: Tensor, + points: Tensor, + box_dim: int = 4, + eps: float = 0.01) -> Tensor: + """Find inside box points in batches. Boxes dimension must be 3. + + Args: + boxes (Tensor): Boxes tensor. Must be batch input. + Has shape of (batch_size, n_boxes, box_dim). + points (Tensor): Points coordinates. Has shape of (n_points, 2). + box_dim (int): The dimension of box. 4 means horizontal box and + 5 means rotated box. Defaults to 4. + eps (float): Make sure the points are inside not on the boundary. + Only use in rotated boxes. Defaults to 0.01. + + Returns: + Tensor: A BoolTensor indicating whether a point is inside + boxes. The index has shape of (n_points, batch_size, n_boxes). + """ + if box_dim == 4: + # Horizontal Boxes + lt_ = points[:, None, None] - boxes[..., :2] + rb_ = boxes[..., 2:] - points[:, None, None] + + deltas = torch.cat([lt_, rb_], dim=-1) + is_in_gts = deltas.min(dim=-1).values > 0 + + elif box_dim == 5: + # Rotated Boxes + points = points[:, None, None] + ctrs, wh, t = torch.split(boxes, [2, 2, 1], dim=-1) + cos_value, sin_value = torch.cos(t), torch.sin(t) + matrix = torch.cat([cos_value, sin_value, -sin_value, cos_value], + dim=-1).reshape(*boxes.shape[:-1], 2, 2) + + offset = points - ctrs + offset = torch.matmul(matrix, offset[..., None]) + offset = offset.squeeze(-1) + offset_x, offset_y = offset[..., 0], offset[..., 1] + w, h = wh[..., 0], wh[..., 1] + is_in_gts = (offset_x <= w / 2 - eps) & (offset_x >= - w / 2 + eps) & \ + (offset_y <= h / 2 - eps) & (offset_y >= - h / 2 + eps) + else: + raise NotImplementedError(f'Unsupport box_dim:{box_dim}') + + return is_in_gts + + +def get_box_center(boxes: Tensor, box_dim: int = 4) -> Tensor: + """Return a tensor representing the centers of boxes. + + Args: + boxes (Tensor): Boxes tensor. Has shape of (b, n, box_dim) + box_dim (int): The dimension of box. 4 means horizontal box and + 5 means rotated box. Defaults to 4. + + Returns: + Tensor: Centers have shape of (b, n, 2) + """ + if box_dim == 4: + # Horizontal Boxes, (x1, y1, x2, y2) + return (boxes[..., :2] + boxes[..., 2:]) / 2.0 + elif box_dim == 5: + # Rotated Boxes, (x, y, w, h, a) + return boxes[..., :2] + else: + raise NotImplementedError(f'Unsupported box_dim:{box_dim}') + + +@TASK_UTILS.register_module() +class BatchDynamicSoftLabelAssigner(nn.Module): + """Computes matching between predictions and ground truth with dynamic soft + label assignment. + + Args: + num_classes (int): number of class + soft_center_radius (float): Radius of the soft center prior. + Defaults to 3.0. + topk (int): Select top-k predictions to calculate dynamic k + best matches for each gt. Defaults to 13. + iou_weight (float): The scale factor of iou cost. Defaults to 3.0. + iou_calculator (ConfigType): Config of overlaps Calculator. + Defaults to dict(type='BboxOverlaps2D'). + batch_iou (bool): Use batch input when calculate IoU. + If set to False use loop instead. Defaults to True. 
+ """ + + def __init__( + self, + num_classes, + soft_center_radius: float = 3.0, + topk: int = 13, + iou_weight: float = 3.0, + iou_calculator: ConfigType = dict(type='mmdet.BboxOverlaps2D'), + batch_iou: bool = True, + ) -> None: + super().__init__() + self.num_classes = num_classes + self.soft_center_radius = soft_center_radius + self.topk = topk + self.iou_weight = iou_weight + self.iou_calculator = TASK_UTILS.build(iou_calculator) + self.batch_iou = batch_iou + + @torch.no_grad() + def forward(self, pred_bboxes: Tensor, pred_scores: Tensor, priors: Tensor, + gt_labels: Tensor, gt_bboxes: Tensor, + pad_bbox_flag: Tensor) -> dict: + num_gt = gt_bboxes.size(1) + decoded_bboxes = pred_bboxes + batch_size, num_bboxes, box_dim = decoded_bboxes.size() + + if num_gt == 0 or num_bboxes == 0: + return { + 'assigned_labels': + gt_labels.new_full( + pred_scores[..., 0].shape, + self.num_classes, + dtype=torch.long), + 'assigned_labels_weights': + gt_bboxes.new_full(pred_scores[..., 0].shape, 1), + 'assigned_bboxes': + gt_bboxes.new_full(pred_bboxes.shape, 0), + 'assign_metrics': + gt_bboxes.new_full(pred_scores[..., 0].shape, 0) + } + + prior_center = priors[:, :2] + if isinstance(gt_bboxes, BaseBoxes): + raise NotImplementedError( + f'type of {type(gt_bboxes)} are not implemented !') + else: + is_in_gts = find_inside_points(gt_bboxes, prior_center, box_dim) + + # (N_points, B, N_boxes) + is_in_gts = is_in_gts * pad_bbox_flag[..., 0][None] + # (N_points, B, N_boxes) -> (B, N_points, N_boxes) + is_in_gts = is_in_gts.permute(1, 0, 2) + # (B, N_points) + valid_mask = is_in_gts.sum(dim=-1) > 0 + + gt_center = get_box_center(gt_bboxes, box_dim) + + strides = priors[..., 2] + distance = (priors[None].unsqueeze(2)[..., :2] - + gt_center[:, None, :, :] + ).pow(2).sum(-1).sqrt() / strides[None, :, None] + + # prevent overflow + distance = distance * valid_mask.unsqueeze(-1) + soft_center_prior = torch.pow(10, distance - self.soft_center_radius) + + if self.batch_iou: + pairwise_ious = self.iou_calculator(decoded_bboxes, gt_bboxes) + else: + ious = [] + for box, gt in zip(decoded_bboxes, gt_bboxes): + iou = self.iou_calculator(box, gt) + ious.append(iou) + pairwise_ious = torch.stack(ious, dim=0) + + iou_cost = -torch.log(pairwise_ious + EPS) * self.iou_weight + + # select the predicted scores corresponded to the gt_labels + pairwise_pred_scores = pred_scores.permute(0, 2, 1) + idx = torch.zeros([2, batch_size, num_gt], dtype=torch.long) + idx[0] = torch.arange(end=batch_size).view(-1, 1).repeat(1, num_gt) + idx[1] = gt_labels.long().squeeze(-1) + pairwise_pred_scores = pairwise_pred_scores[idx[0], + idx[1]].permute(0, 2, 1) + # classification cost + scale_factor = pairwise_ious - pairwise_pred_scores.sigmoid() + pairwise_cls_cost = F.binary_cross_entropy_with_logits( + pairwise_pred_scores, pairwise_ious, + reduction='none') * scale_factor.abs().pow(2.0) + + cost_matrix = pairwise_cls_cost + iou_cost + soft_center_prior + + max_pad_value = torch.ones_like(cost_matrix) * INF + cost_matrix = torch.where(valid_mask[..., None].repeat(1, 1, num_gt), + cost_matrix, max_pad_value) + + (matched_pred_ious, matched_gt_inds, + fg_mask_inboxes) = self.dynamic_k_matching(cost_matrix, pairwise_ious, + pad_bbox_flag) + + del pairwise_ious, cost_matrix + + batch_index = (fg_mask_inboxes > 0).nonzero(as_tuple=True)[0] + + assigned_labels = gt_labels.new_full(pred_scores[..., 0].shape, + self.num_classes) + assigned_labels[fg_mask_inboxes] = gt_labels[ + batch_index, matched_gt_inds].squeeze(-1) + assigned_labels = 
assigned_labels.long() + + assigned_labels_weights = gt_bboxes.new_full(pred_scores[..., 0].shape, + 1) + + assigned_bboxes = gt_bboxes.new_full(pred_bboxes.shape, 0) + assigned_bboxes[fg_mask_inboxes] = gt_bboxes[batch_index, + matched_gt_inds] + + assign_metrics = gt_bboxes.new_full(pred_scores[..., 0].shape, 0) + assign_metrics[fg_mask_inboxes] = matched_pred_ious + + return dict( + assigned_labels=assigned_labels, + assigned_labels_weights=assigned_labels_weights, + assigned_bboxes=assigned_bboxes, + assign_metrics=assign_metrics) + + def dynamic_k_matching( + self, cost_matrix: Tensor, pairwise_ious: Tensor, + pad_bbox_flag: int) -> Tuple[Tensor, Tensor, Tensor]: + """Use IoU and matching cost to calculate the dynamic top-k positive + targets. + + Args: + cost_matrix (Tensor): Cost matrix. + pairwise_ious (Tensor): Pairwise iou matrix. + num_gt (int): Number of gt. + valid_mask (Tensor): Mask for valid bboxes. + Returns: + tuple: matched ious and gt indexes. + """ + matching_matrix = torch.zeros_like(cost_matrix, dtype=torch.uint8) + # select candidate topk ious for dynamic-k calculation + candidate_topk = min(self.topk, pairwise_ious.size(1)) + topk_ious, _ = torch.topk(pairwise_ious, candidate_topk, dim=1) + # calculate dynamic k for each gt + dynamic_ks = torch.clamp(topk_ious.sum(1).int(), min=1) + + num_gts = pad_bbox_flag.sum((1, 2)).int() + # sorting the batch cost matirx is faster than topk + _, sorted_indices = torch.sort(cost_matrix, dim=1) + for b in range(pad_bbox_flag.shape[0]): + for gt_idx in range(num_gts[b]): + topk_ids = sorted_indices[b, :dynamic_ks[b, gt_idx], gt_idx] + matching_matrix[b, :, gt_idx][topk_ids] = 1 + + del topk_ious, dynamic_ks + + prior_match_gt_mask = matching_matrix.sum(2) > 1 + if prior_match_gt_mask.sum() > 0: + cost_min, cost_argmin = torch.min( + cost_matrix[prior_match_gt_mask, :], dim=1) + matching_matrix[prior_match_gt_mask, :] *= 0 + matching_matrix[prior_match_gt_mask, cost_argmin] = 1 + + # get foreground mask inside box and center prior + fg_mask_inboxes = matching_matrix.sum(2) > 0 + matched_pred_ious = (matching_matrix * + pairwise_ious).sum(2)[fg_mask_inboxes] + matched_gt_inds = matching_matrix[fg_mask_inboxes, :].argmax(1) + return matched_pred_ious, matched_gt_inds, fg_mask_inboxes diff --git a/mmyolo/models/task_modules/assigners/batch_task_aligned_assigner.py b/mmyolo/models/task_modules/assigners/batch_task_aligned_assigner.py new file mode 100644 index 0000000000000000000000000000000000000000..202d678986c3a398de63675c004592b98ea092e0 --- /dev/null +++ b/mmyolo/models/task_modules/assigners/batch_task_aligned_assigner.py @@ -0,0 +1,311 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor + +from mmyolo.models.losses import bbox_overlaps +from mmyolo.registry import TASK_UTILS +from .utils import (select_candidates_in_gts, select_highest_overlaps, + yolov6_iou_calculator) + + +@TASK_UTILS.register_module() +class BatchTaskAlignedAssigner(nn.Module): + """This code referenced to + https://github.com/meituan/YOLOv6/blob/main/yolov6/ + assigners/tal_assigner.py. + Batch Task aligned assigner base on the paper: + `TOOD: Task-aligned One-stage Object Detection. + `_. + Assign a corresponding gt bboxes or background to a batch of + predicted bboxes. Each bbox will be assigned with `0` or a + positive integer indicating the ground truth index. 
+ - 0: negative sample, no assigned gt + - positive integer: positive sample, index (1-based) of assigned gt + Args: + num_classes (int): number of class + topk (int): number of bbox selected in each level + alpha (float): Hyper-parameters related to alignment_metrics. + Defaults to 1.0 + beta (float): Hyper-parameters related to alignment_metrics. + Defaults to 6. + eps (float): Eps to avoid log(0). Default set to 1e-9 + use_ciou (bool): Whether to use ciou while calculating iou. + Defaults to False. + """ + + def __init__(self, + num_classes: int, + topk: int = 13, + alpha: float = 1.0, + beta: float = 6.0, + eps: float = 1e-7, + use_ciou: bool = False): + super().__init__() + self.num_classes = num_classes + self.topk = topk + self.alpha = alpha + self.beta = beta + self.eps = eps + self.use_ciou = use_ciou + + @torch.no_grad() + def forward( + self, + pred_bboxes: Tensor, + pred_scores: Tensor, + priors: Tensor, + gt_labels: Tensor, + gt_bboxes: Tensor, + pad_bbox_flag: Tensor, + ) -> dict: + """Assign gt to bboxes. + + The assignment is done in following steps + 1. compute alignment metric between all bbox (bbox of all pyramid + levels) and gt + 2. select top-k bbox as candidates for each gt + 3. limit the positive sample's center in gt (because the anchor-free + detector only can predict positive distance) + Args: + pred_bboxes (Tensor): Predict bboxes, + shape(batch_size, num_priors, 4) + pred_scores (Tensor): Scores of predict bboxes, + shape(batch_size, num_priors, num_classes) + priors (Tensor): Model priors, shape (num_priors, 4) + gt_labels (Tensor): Ground true labels, + shape(batch_size, num_gt, 1) + gt_bboxes (Tensor): Ground true bboxes, + shape(batch_size, num_gt, 4) + pad_bbox_flag (Tensor): Ground truth bbox mask, + 1 means bbox, 0 means no bbox, + shape(batch_size, num_gt, 1) + Returns: + assigned_result (dict) Assigned result: + assigned_labels (Tensor): Assigned labels, + shape(batch_size, num_priors) + assigned_bboxes (Tensor): Assigned boxes, + shape(batch_size, num_priors, 4) + assigned_scores (Tensor): Assigned scores, + shape(batch_size, num_priors, num_classes) + fg_mask_pre_prior (Tensor): Force ground truth matching mask, + shape(batch_size, num_priors) + """ + # (num_priors, 4) -> (num_priors, 2) + priors = priors[:, :2] + + batch_size = pred_scores.size(0) + num_gt = gt_bboxes.size(1) + + assigned_result = { + 'assigned_labels': + gt_bboxes.new_full(pred_scores[..., 0].shape, self.num_classes), + 'assigned_bboxes': + gt_bboxes.new_full(pred_bboxes.shape, 0), + 'assigned_scores': + gt_bboxes.new_full(pred_scores.shape, 0), + 'fg_mask_pre_prior': + gt_bboxes.new_full(pred_scores[..., 0].shape, 0) + } + + if num_gt == 0: + return assigned_result + + pos_mask, alignment_metrics, overlaps = self.get_pos_mask( + pred_bboxes, pred_scores, priors, gt_labels, gt_bboxes, + pad_bbox_flag, batch_size, num_gt) + + (assigned_gt_idxs, fg_mask_pre_prior, + pos_mask) = select_highest_overlaps(pos_mask, overlaps, num_gt) + + # assigned target + assigned_labels, assigned_bboxes, assigned_scores = self.get_targets( + gt_labels, gt_bboxes, assigned_gt_idxs, fg_mask_pre_prior, + batch_size, num_gt) + + # normalize + alignment_metrics *= pos_mask + pos_align_metrics = alignment_metrics.max(axis=-1, keepdim=True)[0] + pos_overlaps = (overlaps * pos_mask).max(axis=-1, keepdim=True)[0] + norm_align_metric = ( + alignment_metrics * pos_overlaps / + (pos_align_metrics + self.eps)).max(-2)[0].unsqueeze(-1) + assigned_scores = assigned_scores * norm_align_metric + + 
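# Annotation (added for clarity, not part of the original patch): the
# normalisation above rescales each prior's one-hot class target by its
# alignment metric, divided by its gt's best alignment metric and multiplied
# by that gt's best overlap (the max is taken across gts). Well-aligned
# priors therefore keep scores close to their gt's best IoU, while poorly
# aligned priors are softly down-weighted before the targets are packed into
# assigned_result below.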
assigned_result['assigned_labels'] = assigned_labels + assigned_result['assigned_bboxes'] = assigned_bboxes + assigned_result['assigned_scores'] = assigned_scores + assigned_result['fg_mask_pre_prior'] = fg_mask_pre_prior.bool() + return assigned_result + + def get_pos_mask(self, pred_bboxes: Tensor, pred_scores: Tensor, + priors: Tensor, gt_labels: Tensor, gt_bboxes: Tensor, + pad_bbox_flag: Tensor, batch_size: int, + num_gt: int) -> Tuple[Tensor, Tensor, Tensor]: + """Get possible mask. + + Args: + pred_bboxes (Tensor): Predict bboxes, + shape(batch_size, num_priors, 4) + pred_scores (Tensor): Scores of predict bbox, + shape(batch_size, num_priors, num_classes) + priors (Tensor): Model priors, shape (num_priors, 2) + gt_labels (Tensor): Ground true labels, + shape(batch_size, num_gt, 1) + gt_bboxes (Tensor): Ground true bboxes, + shape(batch_size, num_gt, 4) + pad_bbox_flag (Tensor): Ground truth bbox mask, + 1 means bbox, 0 means no bbox, + shape(batch_size, num_gt, 1) + batch_size (int): Batch size. + num_gt (int): Number of ground truth. + Returns: + pos_mask (Tensor): Possible mask, + shape(batch_size, num_gt, num_priors) + alignment_metrics (Tensor): Alignment metrics, + shape(batch_size, num_gt, num_priors) + overlaps (Tensor): Overlaps of gt_bboxes and pred_bboxes, + shape(batch_size, num_gt, num_priors) + """ + + # Compute alignment metric between all bbox and gt + alignment_metrics, overlaps = \ + self.get_box_metrics(pred_bboxes, pred_scores, gt_labels, + gt_bboxes, batch_size, num_gt) + + # get is_in_gts mask + is_in_gts = select_candidates_in_gts(priors, gt_bboxes) + + # get topk_metric mask + topk_metric = self.select_topk_candidates( + alignment_metrics * is_in_gts, + topk_mask=pad_bbox_flag.repeat([1, 1, self.topk]).bool()) + + # merge all mask to a final mask + pos_mask = topk_metric * is_in_gts * pad_bbox_flag + + return pos_mask, alignment_metrics, overlaps + + def get_box_metrics(self, pred_bboxes: Tensor, pred_scores: Tensor, + gt_labels: Tensor, gt_bboxes: Tensor, batch_size: int, + num_gt: int) -> Tuple[Tensor, Tensor]: + """Compute alignment metric between all bbox and gt. + + Args: + pred_bboxes (Tensor): Predict bboxes, + shape(batch_size, num_priors, 4) + pred_scores (Tensor): Scores of predict bbox, + shape(batch_size, num_priors, num_classes) + gt_labels (Tensor): Ground true labels, + shape(batch_size, num_gt, 1) + gt_bboxes (Tensor): Ground true bboxes, + shape(batch_size, num_gt, 4) + batch_size (int): Batch size. + num_gt (int): Number of ground truth. 
+ Returns: + alignment_metrics (Tensor): Align metric, + shape(batch_size, num_gt, num_priors) + overlaps (Tensor): Overlaps, shape(batch_size, num_gt, num_priors) + """ + pred_scores = pred_scores.permute(0, 2, 1) + gt_labels = gt_labels.to(torch.long) + idx = torch.zeros([2, batch_size, num_gt], dtype=torch.long) + idx[0] = torch.arange(end=batch_size).view(-1, 1).repeat(1, num_gt) + idx[1] = gt_labels.squeeze(-1) + bbox_scores = pred_scores[idx[0], idx[1]] + # TODO: need to replace the yolov6_iou_calculator function + if self.use_ciou: + overlaps = bbox_overlaps( + pred_bboxes.unsqueeze(1), + gt_bboxes.unsqueeze(2), + iou_mode='ciou', + bbox_format='xyxy').clamp(0) + else: + overlaps = yolov6_iou_calculator(gt_bboxes, pred_bboxes) + + alignment_metrics = bbox_scores.pow(self.alpha) * overlaps.pow( + self.beta) + + return alignment_metrics, overlaps + + def select_topk_candidates(self, + alignment_gt_metrics: Tensor, + using_largest_topk: bool = True, + topk_mask: Optional[Tensor] = None) -> Tensor: + """Compute alignment metric between all bbox and gt. + + Args: + alignment_gt_metrics (Tensor): Alignment metric of gt candidates, + shape(batch_size, num_gt, num_priors) + using_largest_topk (bool): Controls whether to using largest or + smallest elements. + topk_mask (Tensor): Topk mask, + shape(batch_size, num_gt, self.topk) + Returns: + Tensor: Topk candidates mask, + shape(batch_size, num_gt, num_priors) + """ + num_priors = alignment_gt_metrics.shape[-1] + topk_metrics, topk_idxs = torch.topk( + alignment_gt_metrics, + self.topk, + axis=-1, + largest=using_largest_topk) + if topk_mask is None: + topk_mask = (topk_metrics.max(axis=-1, keepdim=True) > + self.eps).tile([1, 1, self.topk]) + topk_idxs = torch.where(topk_mask, topk_idxs, + torch.zeros_like(topk_idxs)) + is_in_topk = F.one_hot(topk_idxs, num_priors).sum(axis=-2) + is_in_topk = torch.where(is_in_topk > 1, torch.zeros_like(is_in_topk), + is_in_topk) + return is_in_topk.to(alignment_gt_metrics.dtype) + + def get_targets(self, gt_labels: Tensor, gt_bboxes: Tensor, + assigned_gt_idxs: Tensor, fg_mask_pre_prior: Tensor, + batch_size: int, + num_gt: int) -> Tuple[Tensor, Tensor, Tensor]: + """Get assigner info. + + Args: + gt_labels (Tensor): Ground true labels, + shape(batch_size, num_gt, 1) + gt_bboxes (Tensor): Ground true bboxes, + shape(batch_size, num_gt, 4) + assigned_gt_idxs (Tensor): Assigned ground truth indexes, + shape(batch_size, num_priors) + fg_mask_pre_prior (Tensor): Force ground truth matching mask, + shape(batch_size, num_priors) + batch_size (int): Batch size. + num_gt (int): Number of ground truth. 
+ Returns: + assigned_labels (Tensor): Assigned labels, + shape(batch_size, num_priors) + assigned_bboxes (Tensor): Assigned bboxes, + shape(batch_size, num_priors) + assigned_scores (Tensor): Assigned scores, + shape(batch_size, num_priors) + """ + # assigned target labels + batch_ind = torch.arange( + end=batch_size, dtype=torch.int64, device=gt_labels.device)[..., + None] + assigned_gt_idxs = assigned_gt_idxs + batch_ind * num_gt + assigned_labels = gt_labels.long().flatten()[assigned_gt_idxs] + + # assigned target boxes + assigned_bboxes = gt_bboxes.reshape([-1, 4])[assigned_gt_idxs] + + # assigned target scores + assigned_labels[assigned_labels < 0] = 0 + assigned_scores = F.one_hot(assigned_labels, self.num_classes) + force_gt_scores_mask = fg_mask_pre_prior[:, :, None].repeat( + 1, 1, self.num_classes) + assigned_scores = torch.where(force_gt_scores_mask > 0, + assigned_scores, + torch.full_like(assigned_scores, 0)) + + return assigned_labels, assigned_bboxes, assigned_scores diff --git a/mmyolo/models/task_modules/assigners/batch_yolov7_assigner.py b/mmyolo/models/task_modules/assigners/batch_yolov7_assigner.py new file mode 100644 index 0000000000000000000000000000000000000000..6709968eeb1768fc4e6124f1f7a344f581dd43a7 --- /dev/null +++ b/mmyolo/models/task_modules/assigners/batch_yolov7_assigner.py @@ -0,0 +1,344 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Sequence + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmdet.structures.bbox import bbox_cxcywh_to_xyxy, bbox_overlaps + + +def _cat_multi_level_tensor_in_place(*multi_level_tensor, place_hold_var): + """concat multi-level tensor in place.""" + for level_tensor in multi_level_tensor: + for i, var in enumerate(level_tensor): + if len(var) > 0: + level_tensor[i] = torch.cat(var, dim=0) + else: + level_tensor[i] = place_hold_var + + +class BatchYOLOv7Assigner(nn.Module): + """Batch YOLOv7 Assigner. + + It consists of two assigning steps: + + 1. YOLOv5 cross-grid sample assigning + 2. SimOTA assigning + + This code referenced to + https://github.com/WongKinYiu/yolov7/blob/main/utils/loss.py. + + Args: + num_classes (int): Number of classes. + num_base_priors (int): Number of base priors. + featmap_strides (Sequence[int]): Feature map strides. + prior_match_thr (float): Threshold to match priors. + Defaults to 4.0. + candidate_topk (int): Number of topk candidates to + assign. Defaults to 10. + iou_weight (float): IOU weight. Defaults to 3.0. + cls_weight (float): Class weight. Defaults to 1.0. 
+ """ + + def __init__(self, + num_classes: int, + num_base_priors: int, + featmap_strides: Sequence[int], + prior_match_thr: float = 4.0, + candidate_topk: int = 10, + iou_weight: float = 3.0, + cls_weight: float = 1.0): + super().__init__() + self.num_classes = num_classes + self.num_base_priors = num_base_priors + self.featmap_strides = featmap_strides + # yolov5 param + self.prior_match_thr = prior_match_thr + # simota param + self.candidate_topk = candidate_topk + self.iou_weight = iou_weight + self.cls_weight = cls_weight + + @torch.no_grad() + def forward(self, + pred_results, + batch_targets_normed, + batch_input_shape, + priors_base_sizes, + grid_offset, + near_neighbor_thr=0.5) -> dict: + """Forward function.""" + # (num_base_priors, num_batch_gt, 7) + # 7 is mean (batch_idx, cls_id, x_norm, y_norm, + # w_norm, h_norm, prior_idx) + + # mlvl is mean multi_level + if batch_targets_normed.shape[1] == 0: + # empty gt of batch + num_levels = len(pred_results) + return dict( + mlvl_positive_infos=[pred_results[0].new_empty( + (0, 4))] * num_levels, + mlvl_priors=[] * num_levels, + mlvl_targets_normed=[] * num_levels) + + # if near_neighbor_thr = 0.5 are mean the nearest + # 3 neighbors are also considered positive samples. + # if near_neighbor_thr = 1.0 are mean the nearest + # 5 neighbors are also considered positive samples. + mlvl_positive_infos, mlvl_priors = self.yolov5_assigner( + pred_results, + batch_targets_normed, + priors_base_sizes, + grid_offset, + near_neighbor_thr=near_neighbor_thr) + + mlvl_positive_infos, mlvl_priors, \ + mlvl_targets_normed = self.simota_assigner( + pred_results, batch_targets_normed, mlvl_positive_infos, + mlvl_priors, batch_input_shape) + + place_hold_var = batch_targets_normed.new_empty((0, 4)) + _cat_multi_level_tensor_in_place( + mlvl_positive_infos, + mlvl_priors, + mlvl_targets_normed, + place_hold_var=place_hold_var) + + return dict( + mlvl_positive_infos=mlvl_positive_infos, + mlvl_priors=mlvl_priors, + mlvl_targets_normed=mlvl_targets_normed) + + def yolov5_assigner(self, + pred_results, + batch_targets_normed, + priors_base_sizes, + grid_offset, + near_neighbor_thr=0.5): + """YOLOv5 cross-grid sample assigner.""" + num_batch_gts = batch_targets_normed.shape[1] + assert num_batch_gts > 0 + + mlvl_positive_infos, mlvl_priors = [], [] + + scaled_factor = torch.ones(7, device=pred_results[0].device) + for i in range(len(pred_results)): # lever + priors_base_sizes_i = priors_base_sizes[i] + # (1, 1, feat_shape_w, feat_shape_h, feat_shape_w, feat_shape_h) + scaled_factor[2:6] = torch.tensor( + pred_results[i].shape)[[3, 2, 3, 2]] + + # Scale batch_targets from range 0-1 to range 0-features_maps size. + # (num_base_priors, num_batch_gts, 7) + batch_targets_scaled = batch_targets_normed * scaled_factor + + # Shape match + wh_ratio = batch_targets_scaled[..., + 4:6] / priors_base_sizes_i[:, None] + match_inds = torch.max( + wh_ratio, 1. 
/ wh_ratio).max(2)[0] < self.prior_match_thr + batch_targets_scaled = batch_targets_scaled[ + match_inds] # (num_matched_target, 7) + + # no gt bbox matches anchor + if batch_targets_scaled.shape[0] == 0: + mlvl_positive_infos.append( + batch_targets_scaled.new_empty((0, 4))) + mlvl_priors.append([]) + continue + + # Positive samples with additional neighbors + batch_targets_cxcy = batch_targets_scaled[:, 2:4] + grid_xy = scaled_factor[[2, 3]] - batch_targets_cxcy + left, up = ((batch_targets_cxcy % 1 < near_neighbor_thr) & + (batch_targets_cxcy > 1)).T + right, bottom = ((grid_xy % 1 < near_neighbor_thr) & + (grid_xy > 1)).T + offset_inds = torch.stack( + (torch.ones_like(left), left, up, right, bottom)) + batch_targets_scaled = batch_targets_scaled.repeat( + (5, 1, 1))[offset_inds] # () + retained_offsets = grid_offset.repeat(1, offset_inds.shape[1], + 1)[offset_inds] + + # batch_targets_scaled: (num_matched_target, 7) + # 7 is mean (batch_idx, cls_id, x_scaled, + # y_scaled, w_scaled, h_scaled, prior_idx) + + # mlvl_positive_info: (num_matched_target, 4) + # 4 is mean (batch_idx, prior_idx, x_scaled, y_scaled) + mlvl_positive_info = batch_targets_scaled[:, [0, 6, 2, 3]] + retained_offsets = retained_offsets * near_neighbor_thr + mlvl_positive_info[:, + 2:] = mlvl_positive_info[:, + 2:] - retained_offsets + mlvl_positive_info[:, 2].clamp_(0, scaled_factor[2] - 1) + mlvl_positive_info[:, 3].clamp_(0, scaled_factor[3] - 1) + mlvl_positive_info = mlvl_positive_info.long() + priors_inds = mlvl_positive_info[:, 1] + + mlvl_positive_infos.append(mlvl_positive_info) + mlvl_priors.append(priors_base_sizes_i[priors_inds]) + + return mlvl_positive_infos, mlvl_priors + + def simota_assigner(self, pred_results, batch_targets_normed, + mlvl_positive_infos, mlvl_priors, batch_input_shape): + """SimOTA assigner.""" + num_batch_gts = batch_targets_normed.shape[1] + assert num_batch_gts > 0 + num_levels = len(mlvl_positive_infos) + + mlvl_positive_infos_matched = [[] for _ in range(num_levels)] + mlvl_priors_matched = [[] for _ in range(num_levels)] + mlvl_targets_normed_matched = [[] for _ in range(num_levels)] + + for batch_idx in range(pred_results[0].shape[0]): + # (num_batch_gt, 7) + # 7 is mean (batch_idx, cls_id, x_norm, y_norm, + # w_norm, h_norm, prior_idx) + targets_normed = batch_targets_normed[0] + # (num_gt, 7) + targets_normed = targets_normed[targets_normed[:, 0] == batch_idx] + num_gts = targets_normed.shape[0] + + if num_gts == 0: + continue + + _mlvl_decoderd_bboxes = [] + _mlvl_obj_cls = [] + _mlvl_priors = [] + _mlvl_positive_infos = [] + _from_which_layer = [] + + for i, head_pred in enumerate(pred_results): + # (num_matched_target, 4) + # 4 is mean (batch_idx, prior_idx, grid_x, grid_y) + _mlvl_positive_info = mlvl_positive_infos[i] + if _mlvl_positive_info.shape[0] == 0: + continue + + idx = (_mlvl_positive_info[:, 0] == batch_idx) + _mlvl_positive_info = _mlvl_positive_info[idx] + _mlvl_positive_infos.append(_mlvl_positive_info) + + priors = mlvl_priors[i][idx] + _mlvl_priors.append(priors) + + _from_which_layer.append( + _mlvl_positive_info.new_full( + size=(_mlvl_positive_info.shape[0], ), fill_value=i)) + + # (n,85) + level_batch_idx, prior_ind, \ + grid_x, grid_y = _mlvl_positive_info.T + pred_positive = head_pred[level_batch_idx, prior_ind, grid_y, + grid_x] + _mlvl_obj_cls.append(pred_positive[:, 4:]) + + # decoded + grid = torch.stack([grid_x, grid_y], dim=1) + pred_positive_cxcy = (pred_positive[:, :2].sigmoid() * 2. 
- + 0.5 + grid) * self.featmap_strides[i] + pred_positive_wh = (pred_positive[:, 2:4].sigmoid() * 2) ** 2 \ + * priors * self.featmap_strides[i] + pred_positive_xywh = torch.cat( + [pred_positive_cxcy, pred_positive_wh], dim=-1) + _mlvl_decoderd_bboxes.append(pred_positive_xywh) + + if len(_mlvl_decoderd_bboxes) == 0: + continue + + # 1 calc pair_wise_iou_loss + _mlvl_decoderd_bboxes = torch.cat(_mlvl_decoderd_bboxes, dim=0) + num_pred_positive = _mlvl_decoderd_bboxes.shape[0] + + if num_pred_positive == 0: + continue + + # scaled xywh + batch_input_shape_wh = pred_results[0].new_tensor( + batch_input_shape[::-1]).repeat((1, 2)) + targets_scaled_bbox = targets_normed[:, 2:6] * batch_input_shape_wh + + targets_scaled_bbox = bbox_cxcywh_to_xyxy(targets_scaled_bbox) + _mlvl_decoderd_bboxes = bbox_cxcywh_to_xyxy(_mlvl_decoderd_bboxes) + pair_wise_iou = bbox_overlaps(targets_scaled_bbox, + _mlvl_decoderd_bboxes) + pair_wise_iou_loss = -torch.log(pair_wise_iou + 1e-8) + + # 2 calc pair_wise_cls_loss + _mlvl_obj_cls = torch.cat(_mlvl_obj_cls, dim=0).float().sigmoid() + _mlvl_positive_infos = torch.cat(_mlvl_positive_infos, dim=0) + _from_which_layer = torch.cat(_from_which_layer, dim=0) + _mlvl_priors = torch.cat(_mlvl_priors, dim=0) + + gt_cls_per_image = ( + F.one_hot(targets_normed[:, 1].to(torch.int64), + self.num_classes).float().unsqueeze(1).repeat( + 1, num_pred_positive, 1)) + # cls_score * obj + cls_preds_ = _mlvl_obj_cls[:, 1:]\ + .unsqueeze(0)\ + .repeat(num_gts, 1, 1) \ + * _mlvl_obj_cls[:, 0:1]\ + .unsqueeze(0).repeat(num_gts, 1, 1) + y = cls_preds_.sqrt_() + pair_wise_cls_loss = F.binary_cross_entropy_with_logits( + torch.log(y / (1 - y)), gt_cls_per_image, + reduction='none').sum(-1) + del cls_preds_ + + # calc cost + cost = ( + self.cls_weight * pair_wise_cls_loss + + self.iou_weight * pair_wise_iou_loss) + + # num_gt, num_match_pred + matching_matrix = torch.zeros_like(cost) + + top_k, _ = torch.topk( + pair_wise_iou, + min(self.candidate_topk, pair_wise_iou.shape[1]), + dim=1) + dynamic_ks = torch.clamp(top_k.sum(1).int(), min=1) + + # Select only topk matches per gt + for gt_idx in range(num_gts): + _, pos_idx = torch.topk( + cost[gt_idx], k=dynamic_ks[gt_idx].item(), largest=False) + matching_matrix[gt_idx][pos_idx] = 1.0 + del top_k, dynamic_ks + + # Each prediction box can match at most one gt box, + # and if there are more than one, + # only the least costly one can be taken + anchor_matching_gt = matching_matrix.sum(0) + if (anchor_matching_gt > 1).sum() > 0: + _, cost_argmin = torch.min( + cost[:, anchor_matching_gt > 1], dim=0) + matching_matrix[:, anchor_matching_gt > 1] *= 0.0 + matching_matrix[cost_argmin, anchor_matching_gt > 1] = 1.0 + fg_mask_inboxes = matching_matrix.sum(0) > 0.0 + matched_gt_inds = matching_matrix[:, fg_mask_inboxes].argmax(0) + + targets_normed = targets_normed[matched_gt_inds] + _mlvl_positive_infos = _mlvl_positive_infos[fg_mask_inboxes] + _from_which_layer = _from_which_layer[fg_mask_inboxes] + _mlvl_priors = _mlvl_priors[fg_mask_inboxes] + + # Rearranged in the order of the prediction layers + # to facilitate loss + for i in range(num_levels): + layer_idx = _from_which_layer == i + mlvl_positive_infos_matched[i].append( + _mlvl_positive_infos[layer_idx]) + mlvl_priors_matched[i].append(_mlvl_priors[layer_idx]) + mlvl_targets_normed_matched[i].append( + targets_normed[layer_idx]) + + results = mlvl_positive_infos_matched, \ + mlvl_priors_matched, \ + mlvl_targets_normed_matched + return results diff --git 
a/mmyolo/models/task_modules/assigners/utils.py b/mmyolo/models/task_modules/assigners/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..5843200936ef7a269109517e6d2952cceea02059 --- /dev/null +++ b/mmyolo/models/task_modules/assigners/utils.py @@ -0,0 +1,110 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +from typing import Tuple + +import torch +import torch.nn.functional as F +from torch import Tensor + + +def select_candidates_in_gts(priors_points: Tensor, + gt_bboxes: Tensor, + eps: float = 1e-9) -> Tensor: + """Select the positive priors' center in gt. + + Args: + priors_points (Tensor): Model priors points, + shape(num_priors, 2) + gt_bboxes (Tensor): Ground true bboxes, + shape(batch_size, num_gt, 4) + eps (float): Default to 1e-9. + Return: + (Tensor): shape(batch_size, num_gt, num_priors) + """ + batch_size, num_gt, _ = gt_bboxes.size() + gt_bboxes = gt_bboxes.reshape([-1, 4]) + + priors_number = priors_points.size(0) + priors_points = priors_points.unsqueeze(0).repeat(batch_size * num_gt, 1, + 1) + + # calculate the left, top, right, bottom distance between positive + # prior center and gt side + gt_bboxes_lt = gt_bboxes[:, 0:2].unsqueeze(1).repeat(1, priors_number, 1) + gt_bboxes_rb = gt_bboxes[:, 2:4].unsqueeze(1).repeat(1, priors_number, 1) + bbox_deltas = torch.cat( + [priors_points - gt_bboxes_lt, gt_bboxes_rb - priors_points], dim=-1) + bbox_deltas = bbox_deltas.reshape([batch_size, num_gt, priors_number, -1]) + + return (bbox_deltas.min(axis=-1)[0] > eps).to(gt_bboxes.dtype) + + +def select_highest_overlaps(pos_mask: Tensor, overlaps: Tensor, + num_gt: int) -> Tuple[Tensor, Tensor, Tensor]: + """If an anchor box is assigned to multiple gts, the one with the highest + iou will be selected. + + Args: + pos_mask (Tensor): The assigned positive sample mask, + shape(batch_size, num_gt, num_priors) + overlaps (Tensor): IoU between all bbox and ground truth, + shape(batch_size, num_gt, num_priors) + num_gt (int): Number of ground truth. + Return: + gt_idx_pre_prior (Tensor): Target ground truth index, + shape(batch_size, num_priors) + fg_mask_pre_prior (Tensor): Force matching ground truth, + shape(batch_size, num_priors) + pos_mask (Tensor): The assigned positive sample mask, + shape(batch_size, num_gt, num_priors) + """ + fg_mask_pre_prior = pos_mask.sum(axis=-2) + + # Make sure the positive sample matches the only one and is the largest IoU + if fg_mask_pre_prior.max() > 1: + mask_multi_gts = (fg_mask_pre_prior.unsqueeze(1) > 1).repeat( + [1, num_gt, 1]) + index = overlaps.argmax(axis=1) + is_max_overlaps = F.one_hot(index, num_gt) + is_max_overlaps = \ + is_max_overlaps.permute(0, 2, 1).to(overlaps.dtype) + + pos_mask = torch.where(mask_multi_gts, is_max_overlaps, pos_mask) + fg_mask_pre_prior = pos_mask.sum(axis=-2) + + gt_idx_pre_prior = pos_mask.argmax(axis=-2) + return gt_idx_pre_prior, fg_mask_pre_prior, pos_mask + + +# TODO:'mmdet.BboxOverlaps2D' will cause gradient inconsistency, +# which will be found and solved in a later version. +def yolov6_iou_calculator(bbox1: Tensor, + bbox2: Tensor, + eps: float = 1e-9) -> Tensor: + """Calculate iou for batch. + + Args: + bbox1 (Tensor): shape(batch size, num_gt, 4) + bbox2 (Tensor): shape(batch size, num_priors, 4) + eps (float): Default to 1e-9. 
+ Return: + (Tensor): IoU, shape(size, num_gt, num_priors) + """ + bbox1 = bbox1.unsqueeze(2) # [N, M1, 4] -> [N, M1, 1, 4] + bbox2 = bbox2.unsqueeze(1) # [N, M2, 4] -> [N, 1, M2, 4] + + # calculate xy info of predict and gt bbox + bbox1_x1y1, bbox1_x2y2 = bbox1[:, :, :, 0:2], bbox1[:, :, :, 2:4] + bbox2_x1y1, bbox2_x2y2 = bbox2[:, :, :, 0:2], bbox2[:, :, :, 2:4] + + # calculate overlap area + overlap = (torch.minimum(bbox1_x2y2, bbox2_x2y2) - + torch.maximum(bbox1_x1y1, bbox2_x1y1)).clip(0).prod(-1) + + # calculate bbox area + bbox1_area = (bbox1_x2y2 - bbox1_x1y1).clip(0).prod(-1) + bbox2_area = (bbox2_x2y2 - bbox2_x1y1).clip(0).prod(-1) + + union = bbox1_area + bbox2_area - overlap + eps + + return overlap / union diff --git a/mmyolo/models/task_modules/coders/__init__.py b/mmyolo/models/task_modules/coders/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..75b6e7d6b30afd3de21c738dfc8e75df2eae7120 --- /dev/null +++ b/mmyolo/models/task_modules/coders/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .distance_angle_point_coder import DistanceAnglePointCoder +from .distance_point_bbox_coder import DistancePointBBoxCoder +from .yolov5_bbox_coder import YOLOv5BBoxCoder +from .yolox_bbox_coder import YOLOXBBoxCoder + +__all__ = [ + 'YOLOv5BBoxCoder', 'YOLOXBBoxCoder', 'DistancePointBBoxCoder', + 'DistanceAnglePointCoder' +] diff --git a/mmyolo/models/task_modules/coders/distance_angle_point_coder.py b/mmyolo/models/task_modules/coders/distance_angle_point_coder.py new file mode 100644 index 0000000000000000000000000000000000000000..a7e322f94725ee548c9b261be6f5bae2f3d9b4d9 --- /dev/null +++ b/mmyolo/models/task_modules/coders/distance_angle_point_coder.py @@ -0,0 +1,94 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Sequence, Union + +import torch + +from mmyolo.registry import TASK_UTILS + +try: + from mmrotate.models.task_modules.coders import \ + DistanceAnglePointCoder as MMROTATE_DistanceAnglePointCoder + MMROTATE_AVAILABLE = True +except ImportError: + from mmdet.models.task_modules.coders import BaseBBoxCoder + MMROTATE_DistanceAnglePointCoder = BaseBBoxCoder + MMROTATE_AVAILABLE = False + + +@TASK_UTILS.register_module() +class DistanceAnglePointCoder(MMROTATE_DistanceAnglePointCoder): + """Distance Angle Point BBox coder. + + This coder encodes gt bboxes (x, y, w, h, theta) into (top, bottom, left, + right, theta) and decode it back to the original. + """ + + def __init__(self, clip_border=True, angle_version='oc'): + if not MMROTATE_AVAILABLE: + raise ImportError( + 'Please run "mim install -r requirements/mmrotate.txt" ' + 'to install mmrotate first for rotated detection.') + + super().__init__(clip_border=clip_border, angle_version=angle_version) + + def decode( + self, + points: torch.Tensor, + pred_bboxes: torch.Tensor, + stride: torch.Tensor, + max_shape: Optional[Union[Sequence[int], torch.Tensor, + Sequence[Sequence[int]]]] = None, + ) -> torch.Tensor: + """Decode distance prediction to bounding box. + + Args: + points (Tensor): Shape (B, N, 2) or (N, 2). + pred_bboxes (Tensor): Distance from the given point to 4 + boundaries and angle (left, top, right, bottom, angle). + Shape (B, N, 5) or (N, 5) + max_shape (Sequence[int] or torch.Tensor or Sequence[ + Sequence[int]],optional): Maximum bounds for boxes, specifies + (H, W, C) or (H, W). If priors shape is (B, N, 4), then + the max_shape should be a Sequence[Sequence[int]], + and the length of max_shape should also be B. 
+ Default None. + Returns: + Tensor: Boxes with shape (N, 5) or (B, N, 5) + """ + assert points.size(-2) == pred_bboxes.size(-2) + assert points.size(-1) == 2 + assert pred_bboxes.size(-1) == 5 + if self.clip_border is False: + max_shape = None + + if pred_bboxes.dim() == 2: + stride = stride[:, None] + else: + stride = stride[None, :, None] + pred_bboxes[..., :4] = pred_bboxes[..., :4] * stride + + return self.distance2obb(points, pred_bboxes, max_shape, + self.angle_version) + + def encode(self, + points: torch.Tensor, + gt_bboxes: torch.Tensor, + max_dis: float = 16., + eps: float = 0.01) -> torch.Tensor: + """Encode bounding box to distances. + + Args: + points (Tensor): Shape (N, 2), The format is [x, y]. + gt_bboxes (Tensor): Shape (N, 5), The format is "xywha" + max_dis (float): Upper bound of the distance. Default None. + eps (float): a small value to ensure target < max_dis, instead <=. + Default 0.1. + + Returns: + Tensor: Box transformation deltas. The shape is (N, 5). + """ + + assert points.size(-2) == gt_bboxes.size(-2) + assert points.size(-1) == 2 + assert gt_bboxes.size(-1) == 5 + return self.obb2distance(points, gt_bboxes, max_dis, eps) diff --git a/mmyolo/models/task_modules/coders/distance_point_bbox_coder.py b/mmyolo/models/task_modules/coders/distance_point_bbox_coder.py new file mode 100644 index 0000000000000000000000000000000000000000..16417b8ab209c57880cfcfe0ba2a955e78c0a3f0 --- /dev/null +++ b/mmyolo/models/task_modules/coders/distance_point_bbox_coder.py @@ -0,0 +1,79 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Sequence, Union + +import torch +from mmdet.models.task_modules.coders import \ + DistancePointBBoxCoder as MMDET_DistancePointBBoxCoder +from mmdet.structures.bbox import bbox2distance, distance2bbox + +from mmyolo.registry import TASK_UTILS + + +@TASK_UTILS.register_module() +class DistancePointBBoxCoder(MMDET_DistancePointBBoxCoder): + """Distance Point BBox coder. + + This coder encodes gt bboxes (x1, y1, x2, y2) into (top, bottom, left, + right) and decode it back to the original. + """ + + def decode( + self, + points: torch.Tensor, + pred_bboxes: torch.Tensor, + stride: torch.Tensor, + max_shape: Optional[Union[Sequence[int], torch.Tensor, + Sequence[Sequence[int]]]] = None + ) -> torch.Tensor: + """Decode distance prediction to bounding box. + + Args: + points (Tensor): Shape (B, N, 2) or (N, 2). + pred_bboxes (Tensor): Distance from the given point to 4 + boundaries (left, top, right, bottom). Shape (B, N, 4) + or (N, 4) + stride (Tensor): Featmap stride. + max_shape (Sequence[int] or torch.Tensor or Sequence[ + Sequence[int]],optional): Maximum bounds for boxes, specifies + (H, W, C) or (H, W). If priors shape is (B, N, 4), then + the max_shape should be a Sequence[Sequence[int]], + and the length of max_shape should also be B. + Default None. + Returns: + Tensor: Boxes with shape (N, 4) or (B, N, 4) + """ + assert points.size(-2) == pred_bboxes.size(-2) + assert points.size(-1) == 2 + assert pred_bboxes.size(-1) == 4 + if self.clip_border is False: + max_shape = None + + pred_bboxes = pred_bboxes * stride[None, :, None] + + return distance2bbox(points, pred_bboxes, max_shape) + + def encode(self, + points: torch.Tensor, + gt_bboxes: torch.Tensor, + max_dis: float = 16., + eps: float = 0.01) -> torch.Tensor: + """Encode bounding box to distances. The rewrite is to support batch + operations. + + Args: + points (Tensor): Shape (B, N, 2) or (N, 2), The format is [x, y]. 
+ gt_bboxes (Tensor or :obj:`BaseBoxes`): Shape (N, 4), The format + is "xyxy" + max_dis (float): Upper bound of the distance. Default to 16.. + eps (float): a small value to ensure target < max_dis, instead <=. + Default 0.01. + + Returns: + Tensor: Box transformation deltas. The shape is (N, 4) or + (B, N, 4). + """ + + assert points.size(-2) == gt_bboxes.size(-2) + assert points.size(-1) == 2 + assert gt_bboxes.size(-1) == 4 + return bbox2distance(points, gt_bboxes, max_dis, eps) diff --git a/mmyolo/models/task_modules/coders/yolov5_bbox_coder.py b/mmyolo/models/task_modules/coders/yolov5_bbox_coder.py new file mode 100644 index 0000000000000000000000000000000000000000..bab5f0e0fe06c1930497bdece7c7a06636fe9c37 --- /dev/null +++ b/mmyolo/models/task_modules/coders/yolov5_bbox_coder.py @@ -0,0 +1,55 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Union + +import torch +from mmdet.models.task_modules.coders.base_bbox_coder import BaseBBoxCoder + +from mmyolo.registry import TASK_UTILS + + +@TASK_UTILS.register_module() +class YOLOv5BBoxCoder(BaseBBoxCoder): + """YOLOv5 BBox coder. + + This decoder decodes pred bboxes (delta_x, delta_x, w, h) to bboxes (tl_x, + tl_y, br_x, br_y). + """ + + def encode(self, **kwargs): + """Encode deltas between bboxes and ground truth boxes.""" + pass + + def decode(self, priors: torch.Tensor, pred_bboxes: torch.Tensor, + stride: Union[torch.Tensor, int]) -> torch.Tensor: + """Decode regression results (delta_x, delta_x, w, h) to bboxes (tl_x, + tl_y, br_x, br_y). + + Args: + priors (torch.Tensor): Basic boxes or points, e.g. anchors. + pred_bboxes (torch.Tensor): Encoded boxes with shape + stride (torch.Tensor | int): Strides of bboxes. + + Returns: + torch.Tensor: Decoded boxes. + """ + assert pred_bboxes.size(-1) == priors.size(-1) == 4 + + pred_bboxes = pred_bboxes.sigmoid() + + x_center = (priors[..., 0] + priors[..., 2]) * 0.5 + y_center = (priors[..., 1] + priors[..., 3]) * 0.5 + w = priors[..., 2] - priors[..., 0] + h = priors[..., 3] - priors[..., 1] + + # The anchor of mmdet has been offset by 0.5 + x_center_pred = (pred_bboxes[..., 0] - 0.5) * 2 * stride + x_center + y_center_pred = (pred_bboxes[..., 1] - 0.5) * 2 * stride + y_center + w_pred = (pred_bboxes[..., 2] * 2)**2 * w + h_pred = (pred_bboxes[..., 3] * 2)**2 * h + + decoded_bboxes = torch.stack( + (x_center_pred - w_pred / 2, y_center_pred - h_pred / 2, + x_center_pred + w_pred / 2, y_center_pred + h_pred / 2), + dim=-1) + + return decoded_bboxes diff --git a/mmyolo/models/task_modules/coders/yolox_bbox_coder.py b/mmyolo/models/task_modules/coders/yolox_bbox_coder.py new file mode 100644 index 0000000000000000000000000000000000000000..02c898d814e89e5c8ef4db792831a7ba80c7c0cc --- /dev/null +++ b/mmyolo/models/task_modules/coders/yolox_bbox_coder.py @@ -0,0 +1,45 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Union + +import torch +from mmdet.models.task_modules.coders.base_bbox_coder import BaseBBoxCoder + +from mmyolo.registry import TASK_UTILS + + +@TASK_UTILS.register_module() +class YOLOXBBoxCoder(BaseBBoxCoder): + """YOLOX BBox coder. + + This decoder decodes pred bboxes (delta_x, delta_x, w, h) to bboxes (tl_x, + tl_y, br_x, br_y). 
+ """ + + def encode(self, **kwargs): + """Encode deltas between bboxes and ground truth boxes.""" + pass + + def decode(self, priors: torch.Tensor, pred_bboxes: torch.Tensor, + stride: Union[torch.Tensor, int]) -> torch.Tensor: + """Decode regression results (delta_x, delta_x, w, h) to bboxes (tl_x, + tl_y, br_x, br_y). + + Args: + priors (torch.Tensor): Basic boxes or points, e.g. anchors. + pred_bboxes (torch.Tensor): Encoded boxes with shape + stride (torch.Tensor | int): Strides of bboxes. + + Returns: + torch.Tensor: Decoded boxes. + """ + stride = stride[None, :, None] + xys = (pred_bboxes[..., :2] * stride) + priors + whs = pred_bboxes[..., 2:].exp() * stride + + tl_x = (xys[..., 0] - whs[..., 0] / 2) + tl_y = (xys[..., 1] - whs[..., 1] / 2) + br_x = (xys[..., 0] + whs[..., 0] / 2) + br_y = (xys[..., 1] + whs[..., 1] / 2) + + decoded_bboxes = torch.stack([tl_x, tl_y, br_x, br_y], -1) + return decoded_bboxes diff --git a/mmyolo/models/utils/__init__.py b/mmyolo/models/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..cdfeaaf0f206fd62dda27cbf44f519777da56ea8 --- /dev/null +++ b/mmyolo/models/utils/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .misc import gt_instances_preprocess, make_divisible, make_round + +__all__ = ['make_divisible', 'make_round', 'gt_instances_preprocess'] diff --git a/mmyolo/models/utils/misc.py b/mmyolo/models/utils/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..531558b69bc14141fb6299aea17b54b432fd5f59 --- /dev/null +++ b/mmyolo/models/utils/misc.py @@ -0,0 +1,97 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from typing import Sequence, Union + +import torch +from mmdet.structures.bbox.transforms import get_box_tensor +from torch import Tensor + + +def make_divisible(x: float, + widen_factor: float = 1.0, + divisor: int = 8) -> int: + """Make sure that x*widen_factor is divisible by divisor.""" + return math.ceil(x * widen_factor / divisor) * divisor + + +def make_round(x: float, deepen_factor: float = 1.0) -> int: + """Make sure that x*deepen_factor becomes an integer not less than 1.""" + return max(round(x * deepen_factor), 1) if x > 1 else x + + +def gt_instances_preprocess(batch_gt_instances: Union[Tensor, Sequence], + batch_size: int) -> Tensor: + """Split batch_gt_instances with batch size. + + From [all_gt_bboxes, box_dim+2] to [batch_size, number_gt, box_dim+1]. + For horizontal box, box_dim=4, for rotated box, box_dim=5 + + If some shape of single batch smaller than + gt bbox len, then using zeros to fill. + + Args: + batch_gt_instances (Sequence[Tensor]): Ground truth + instances for whole batch, shape [all_gt_bboxes, box_dim+2] + batch_size (int): Batch size. 
+ + Returns: + Tensor: batch gt instances data, shape + [batch_size, number_gt, box_dim+1] + """ + if isinstance(batch_gt_instances, Sequence): + max_gt_bbox_len = max( + [len(gt_instances) for gt_instances in batch_gt_instances]) + # fill zeros with length box_dim+1 if some shape of + # single batch not equal max_gt_bbox_len + batch_instance_list = [] + for index, gt_instance in enumerate(batch_gt_instances): + bboxes = gt_instance.bboxes + labels = gt_instance.labels + box_dim = get_box_tensor(bboxes).size(-1) + batch_instance_list.append( + torch.cat((labels[:, None], bboxes), dim=-1)) + + if bboxes.shape[0] >= max_gt_bbox_len: + continue + + fill_tensor = bboxes.new_full( + [max_gt_bbox_len - bboxes.shape[0], box_dim + 1], 0) + batch_instance_list[index] = torch.cat( + (batch_instance_list[index], fill_tensor), dim=0) + + return torch.stack(batch_instance_list) + else: + # faster version + # format of batch_gt_instances: [img_ind, cls_ind, (box)] + # For example horizontal box should be: + # [img_ind, cls_ind, x1, y1, x2, y2] + # Rotated box should be + # [img_ind, cls_ind, x, y, w, h, a] + + # sqlit batch gt instance [all_gt_bboxes, box_dim+2] -> + # [batch_size, max_gt_bbox_len, box_dim+1] + assert isinstance(batch_gt_instances, Tensor) + box_dim = batch_gt_instances.size(-1) - 2 + if len(batch_gt_instances) > 0: + gt_images_indexes = batch_gt_instances[:, 0] + max_gt_bbox_len = gt_images_indexes.unique( + return_counts=True)[1].max() + # fill zeros with length box_dim+1 if some shape of + # single batch not equal max_gt_bbox_len + batch_instance = torch.zeros( + (batch_size, max_gt_bbox_len, box_dim + 1), + dtype=batch_gt_instances.dtype, + device=batch_gt_instances.device) + + for i in range(batch_size): + match_indexes = gt_images_indexes == i + gt_num = match_indexes.sum() + if gt_num: + batch_instance[i, :gt_num] = batch_gt_instances[ + match_indexes, 1:] + else: + batch_instance = torch.zeros((batch_size, 0, box_dim + 1), + dtype=batch_gt_instances.dtype, + device=batch_gt_instances.device) + + return batch_instance diff --git a/mmyolo/registry.py b/mmyolo/registry.py new file mode 100644 index 0000000000000000000000000000000000000000..71f43e6cf53d92917b7aea6175ae0540613ff720 --- /dev/null +++ b/mmyolo/registry.py @@ -0,0 +1,103 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""MMYOLO provides 17 registry nodes to support using modules across projects. +Each node is a child of the root registry in MMEngine. + +More details can be found at +https://mmengine.readthedocs.io/en/latest/tutorials/registry.html. 
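+
+A minimal, illustrative sketch of the usual pattern (the class name below is
+hypothetical and not part of MMYOLO): a module is registered on a node and
+later built from a config dict::
+
+    from mmyolo.registry import MODELS
+
+    @MODELS.register_module()
+    class MyToyNeck:
+        def __init__(self, scale: float = 1.0):
+            self.scale = scale
+
+    neck = MODELS.build(dict(type='MyToyNeck', scale=0.5))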
+""" + +from mmengine.registry import DATA_SAMPLERS as MMENGINE_DATA_SAMPLERS +from mmengine.registry import DATASETS as MMENGINE_DATASETS +from mmengine.registry import HOOKS as MMENGINE_HOOKS +from mmengine.registry import LOOPS as MMENGINE_LOOPS +from mmengine.registry import METRICS as MMENGINE_METRICS +from mmengine.registry import MODEL_WRAPPERS as MMENGINE_MODEL_WRAPPERS +from mmengine.registry import MODELS as MMENGINE_MODELS +from mmengine.registry import \ + OPTIM_WRAPPER_CONSTRUCTORS as MMENGINE_OPTIM_WRAPPER_CONSTRUCTORS +from mmengine.registry import OPTIM_WRAPPERS as MMENGINE_OPTIM_WRAPPERS +from mmengine.registry import OPTIMIZERS as MMENGINE_OPTIMIZERS +from mmengine.registry import PARAM_SCHEDULERS as MMENGINE_PARAM_SCHEDULERS +from mmengine.registry import \ + RUNNER_CONSTRUCTORS as MMENGINE_RUNNER_CONSTRUCTORS +from mmengine.registry import RUNNERS as MMENGINE_RUNNERS +from mmengine.registry import TASK_UTILS as MMENGINE_TASK_UTILS +from mmengine.registry import TRANSFORMS as MMENGINE_TRANSFORMS +from mmengine.registry import VISBACKENDS as MMENGINE_VISBACKENDS +from mmengine.registry import VISUALIZERS as MMENGINE_VISUALIZERS +from mmengine.registry import \ + WEIGHT_INITIALIZERS as MMENGINE_WEIGHT_INITIALIZERS +from mmengine.registry import Registry + +# manage all kinds of runners like `EpochBasedRunner` and `IterBasedRunner` +RUNNERS = Registry( + 'runner', parent=MMENGINE_RUNNERS, locations=['mmyolo.engine']) +# manage runner constructors that define how to initialize runners +RUNNER_CONSTRUCTORS = Registry( + 'runner constructor', + parent=MMENGINE_RUNNER_CONSTRUCTORS, + locations=['mmyolo.engine']) +# manage all kinds of loops like `EpochBasedTrainLoop` +LOOPS = Registry('loop', parent=MMENGINE_LOOPS, locations=['mmyolo.engine']) +# manage all kinds of hooks like `CheckpointHook` +HOOKS = Registry( + 'hook', parent=MMENGINE_HOOKS, locations=['mmyolo.engine.hooks']) + +# manage data-related modules +DATASETS = Registry( + 'dataset', parent=MMENGINE_DATASETS, locations=['mmyolo.datasets']) +DATA_SAMPLERS = Registry( + 'data sampler', + parent=MMENGINE_DATA_SAMPLERS, + locations=['mmyolo.datasets']) +TRANSFORMS = Registry( + 'transform', + parent=MMENGINE_TRANSFORMS, + locations=['mmyolo.datasets.transforms']) + +# manage all kinds of modules inheriting `nn.Module` +MODELS = Registry('model', parent=MMENGINE_MODELS, locations=['mmyolo.models']) +# manage all kinds of model wrappers like 'MMDistributedDataParallel' +MODEL_WRAPPERS = Registry( + 'model_wrapper', + parent=MMENGINE_MODEL_WRAPPERS, + locations=['mmyolo.models']) +# manage all kinds of weight initialization modules like `Uniform` +WEIGHT_INITIALIZERS = Registry( + 'weight initializer', + parent=MMENGINE_WEIGHT_INITIALIZERS, + locations=['mmyolo.models']) + +# manage all kinds of optimizers like `SGD` and `Adam` +OPTIMIZERS = Registry( + 'optimizer', + parent=MMENGINE_OPTIMIZERS, + locations=['mmyolo.engine.optimizers']) +OPTIM_WRAPPERS = Registry( + 'optim_wrapper', + parent=MMENGINE_OPTIM_WRAPPERS, + locations=['mmyolo.engine.optimizers']) +# manage constructors that customize the optimization hyperparameters. 
+OPTIM_WRAPPER_CONSTRUCTORS = Registry( + 'optimizer constructor', + parent=MMENGINE_OPTIM_WRAPPER_CONSTRUCTORS, + locations=['mmyolo.engine.optimizers']) +# manage all kinds of parameter schedulers like `MultiStepLR` +PARAM_SCHEDULERS = Registry( + 'parameter scheduler', + parent=MMENGINE_PARAM_SCHEDULERS, + locations=['mmyolo.engine.optimizers']) +# manage all kinds of metrics +METRICS = Registry( + 'metric', parent=MMENGINE_METRICS, locations=['mmyolo.engine']) + +# manage task-specific modules like anchor generators and box coders +TASK_UTILS = Registry( + 'task util', parent=MMENGINE_TASK_UTILS, locations=['mmyolo.models']) + +# manage visualizer +VISUALIZERS = Registry( + 'visualizer', parent=MMENGINE_VISUALIZERS, locations=['mmyolo.utils']) +# manage visualizer backend +VISBACKENDS = Registry( + 'vis_backend', parent=MMENGINE_VISBACKENDS, locations=['mmyolo.utils']) diff --git a/mmyolo/testing/__init__.py b/mmyolo/testing/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b6d7a010ee27b2822d44ad099f46f65bf6f0c00a --- /dev/null +++ b/mmyolo/testing/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from ._utils import get_detector_cfg + +__all__ = ['get_detector_cfg'] diff --git a/mmyolo/testing/_utils.py b/mmyolo/testing/_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..9ccf2fe0cfd7baa3aeb7f3793c3db025d8889d5f --- /dev/null +++ b/mmyolo/testing/_utils.py @@ -0,0 +1,53 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from os.path import dirname, exists, join + +import numpy as np +from mmengine.config import Config + + +def _get_config_directory(): + """Find the predefined detector config directory.""" + try: + # Assume we are running in the source mmyolo repo + repo_dpath = dirname(dirname(dirname(__file__))) + except NameError: + # For IPython development when this __file__ is not defined + import mmyolo + repo_dpath = dirname(dirname(mmyolo.__file__)) + config_dpath = join(repo_dpath, 'configs') + if not exists(config_dpath): + raise Exception('Cannot find config path') + return config_dpath + + +def _get_config_module(fname): + """Load a configuration as a python module.""" + config_dpath = _get_config_directory() + config_fpath = join(config_dpath, fname) + config_mod = Config.fromfile(config_fpath) + return config_mod + + +def get_detector_cfg(fname): + """Grab configs necessary to create a detector. + + These are deep copied to allow for safe modification of parameters without + influencing other tests. + """ + config = _get_config_module(fname) + model = copy.deepcopy(config.model) + return model + + +def _rand_bboxes(rng, num_boxes, w, h): + """Randomly generate a specified number of bboxes.""" + cx, cy, bw, bh = rng.rand(num_boxes, 4).T + + tl_x = ((cx * w) - (w * bw / 2)).clip(0, w) + tl_y = ((cy * h) - (h * bh / 2)).clip(0, h) + br_x = ((cx * w) + (w * bw / 2)).clip(0, w) + br_y = ((cy * h) + (h * bh / 2)).clip(0, h) + + bboxes = np.vstack([tl_x, tl_y, br_x, br_y]).T + return bboxes diff --git a/mmyolo/utils/__init__.py b/mmyolo/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f4e968494892ccefb60d0c7b713c131ddc6fb869 --- /dev/null +++ b/mmyolo/utils/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .collect_env import collect_env +from .misc import is_metainfo_lower, switch_to_deploy +from .setup_env import register_all_modules + +__all__ = [ + 'register_all_modules', 'collect_env', 'switch_to_deploy', + 'is_metainfo_lower' +] diff --git a/mmyolo/utils/boxam_utils.py b/mmyolo/utils/boxam_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..4a46f21c1b5b40e7bc106ae7a15281816ae3efcc --- /dev/null +++ b/mmyolo/utils/boxam_utils.py @@ -0,0 +1,512 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import bisect +import copy +import warnings +from pathlib import Path +from typing import Callable, List, Optional, Tuple, Union + +import cv2 +import numpy as np +import torch +import torch.nn as nn +import torchvision +from mmcv.transforms import Compose +from mmdet.evaluation import get_classes +from mmdet.utils import ConfigType +from mmengine.config import Config +from mmengine.registry import init_default_scope +from mmengine.runner import load_checkpoint +from mmengine.structures import InstanceData +from torch import Tensor + +from mmyolo.registry import MODELS + +try: + from pytorch_grad_cam import (AblationCAM, AblationLayer, + ActivationsAndGradients) + from pytorch_grad_cam import GradCAM as Base_GradCAM + from pytorch_grad_cam import GradCAMPlusPlus as Base_GradCAMPlusPlus + from pytorch_grad_cam.base_cam import BaseCAM + from pytorch_grad_cam.utils.image import scale_cam_image, show_cam_on_image + from pytorch_grad_cam.utils.svd_on_activations import get_2d_projection +except ImportError: + pass + + +def init_detector( + config: Union[str, Path, Config], + checkpoint: Optional[str] = None, + palette: str = 'coco', + device: str = 'cuda:0', + cfg_options: Optional[dict] = None, +) -> nn.Module: + """Initialize a detector from config file. + + Args: + config (str, :obj:`Path`, or :obj:`mmengine.Config`): Config file path, + :obj:`Path`, or the config object. + checkpoint (str, optional): Checkpoint path. If left as None, the model + will not load any weights. + palette (str): Color palette used for visualization. If palette + is stored in checkpoint, use checkpoint's palette first, otherwise + use externally passed palette. Currently, supports 'coco', 'voc', + 'citys' and 'random'. Defaults to coco. + device (str): The device where the anchors will be put on. + Defaults to cuda:0. + cfg_options (dict, optional): Options to override some settings in + the used config. + + Returns: + nn.Module: The constructed detector. + """ + if isinstance(config, (str, Path)): + config = Config.fromfile(config) + elif not isinstance(config, Config): + raise TypeError('config must be a filename or Config object, ' + f'but got {type(config)}') + if cfg_options is not None: + config.merge_from_dict(cfg_options) + elif 'init_cfg' in config.model.backbone: + config.model.backbone.init_cfg = None + + # only change this + # grad based method requires train_cfg + # config.model.train_cfg = None + init_default_scope(config.get('default_scope', 'mmyolo')) + + model = MODELS.build(config.model) + if checkpoint is not None: + checkpoint = load_checkpoint(model, checkpoint, map_location='cpu') + # Weights converted from elsewhere may not have meta fields. 
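+        # Fall back to an empty dict so the branches below can pick the
+        # dataset meta from an mmdet 3.x checkpoint, from a legacy
+        # 'CLASSES' field, or default to COCO classes as a last resort.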
+ checkpoint_meta = checkpoint.get('meta', {}) + # save the dataset_meta in the model for convenience + if 'dataset_meta' in checkpoint_meta: + # mmdet 3.x, all keys should be lowercase + model.dataset_meta = { + k.lower(): v + for k, v in checkpoint_meta['dataset_meta'].items() + } + elif 'CLASSES' in checkpoint_meta: + # < mmdet 3.x + classes = checkpoint_meta['CLASSES'] + model.dataset_meta = {'classes': classes, 'palette': palette} + else: + warnings.simplefilter('once') + warnings.warn( + 'dataset_meta or class names are not saved in the ' + 'checkpoint\'s meta data, use COCO classes by default.') + model.dataset_meta = { + 'classes': get_classes('coco'), + 'palette': palette + } + + model.cfg = config # save the config in the model for convenience + model.to(device) + model.eval() + return model + + +def reshape_transform(feats: Union[Tensor, List[Tensor]], + max_shape: Tuple[int, int] = (20, 20), + is_need_grad: bool = False): + """Reshape and aggregate feature maps when the input is a multi-layer + feature map. + + Takes these tensors with different sizes, resizes them to a common shape, + and concatenates them. + """ + if len(max_shape) == 1: + max_shape = max_shape * 2 + + if isinstance(feats, torch.Tensor): + feats = [feats] + else: + if is_need_grad: + raise NotImplementedError('The `grad_base` method does not ' + 'support output multi-activation layers') + + max_h = max([im.shape[-2] for im in feats]) + max_w = max([im.shape[-1] for im in feats]) + if -1 in max_shape: + max_shape = (max_h, max_w) + else: + max_shape = (min(max_h, max_shape[0]), min(max_w, max_shape[1])) + + activations = [] + for feat in feats: + activations.append( + torch.nn.functional.interpolate( + torch.abs(feat), max_shape, mode='bilinear')) + + activations = torch.cat(activations, axis=1) + return activations + + +class BoxAMDetectorWrapper(nn.Module): + """Wrap the mmdet model class to facilitate handling of non-tensor + situations during inference.""" + + def __init__(self, + cfg: ConfigType, + checkpoint: str, + score_thr: float, + device: str = 'cuda:0'): + super().__init__() + self.cfg = cfg + self.device = device + self.score_thr = score_thr + self.checkpoint = checkpoint + self.detector = init_detector(self.cfg, self.checkpoint, device=device) + + pipeline_cfg = copy.deepcopy(self.cfg.test_dataloader.dataset.pipeline) + pipeline_cfg[0].type = 'mmdet.LoadImageFromNDArray' + + new_test_pipeline = [] + for pipeline in pipeline_cfg: + if not pipeline['type'].endswith('LoadAnnotations'): + new_test_pipeline.append(pipeline) + self.test_pipeline = Compose(new_test_pipeline) + + self.is_need_loss = False + self.input_data = None + self.image = None + + def need_loss(self, is_need_loss: bool): + """Grad-based methods require loss.""" + self.is_need_loss = is_need_loss + + def set_input_data(self, + image: np.ndarray, + pred_instances: Optional[InstanceData] = None): + """Set the input data to be used in the next step.""" + self.image = image + + if self.is_need_loss: + assert pred_instances is not None + pred_instances = pred_instances.numpy() + data = dict( + img=self.image, + img_id=0, + gt_bboxes=pred_instances.bboxes, + gt_bboxes_labels=pred_instances.labels) + data = self.test_pipeline(data) + else: + data = dict(img=self.image, img_id=0) + data = self.test_pipeline(data) + data['inputs'] = [data['inputs']] + data['data_samples'] = [data['data_samples']] + self.input_data = data + + def __call__(self, *args, **kwargs): + assert self.input_data is not None + if self.is_need_loss: + # Maybe this is a 
direction that can be optimized + # self.detector.init_weights() + + self.detector.bbox_head.head_module.training = True + if hasattr(self.detector.bbox_head, 'featmap_sizes'): + # Prevent the model algorithm error when calculating loss + self.detector.bbox_head.featmap_sizes = None + + data_ = {} + data_['inputs'] = [self.input_data['inputs']] + data_['data_samples'] = [self.input_data['data_samples']] + data = self.detector.data_preprocessor(data_, training=False) + loss = self.detector._run_forward(data, mode='loss') + + if hasattr(self.detector.bbox_head, 'featmap_sizes'): + self.detector.bbox_head.featmap_sizes = None + + return [loss] + else: + self.detector.bbox_head.head_module.training = False + with torch.no_grad(): + results = self.detector.test_step(self.input_data) + return results + + +class BoxAMDetectorVisualizer: + """Box AM visualization class.""" + + def __init__(self, + method_class, + model: nn.Module, + target_layers: List, + reshape_transform: Optional[Callable] = None, + is_need_grad: bool = False, + extra_params: Optional[dict] = None): + self.target_layers = target_layers + self.reshape_transform = reshape_transform + self.is_need_grad = is_need_grad + + if method_class.__name__ == 'AblationCAM': + batch_size = extra_params.get('batch_size', 1) + ratio_channels_to_ablate = extra_params.get( + 'ratio_channels_to_ablate', 1.) + self.cam = AblationCAM( + model, + target_layers, + use_cuda=True if 'cuda' in model.device else False, + reshape_transform=reshape_transform, + batch_size=batch_size, + ablation_layer=extra_params['ablation_layer'], + ratio_channels_to_ablate=ratio_channels_to_ablate) + else: + self.cam = method_class( + model, + target_layers, + use_cuda=True if 'cuda' in model.device else False, + reshape_transform=reshape_transform, + ) + if self.is_need_grad: + self.cam.activations_and_grads.release() + + self.classes = model.detector.dataset_meta['classes'] + self.COLORS = np.random.uniform(0, 255, size=(len(self.classes), 3)) + + def switch_activations_and_grads(self, model) -> None: + """In the grad-based method, we need to switch + ``ActivationsAndGradients`` layer, otherwise an error will occur.""" + self.cam.model = model + + if self.is_need_grad is True: + self.cam.activations_and_grads = ActivationsAndGradients( + model, self.target_layers, self.reshape_transform) + self.is_need_grad = False + else: + self.cam.activations_and_grads.release() + self.is_need_grad = True + + def __call__(self, img, targets, aug_smooth=False, eigen_smooth=False): + img = torch.from_numpy(img)[None].permute(0, 3, 1, 2) + return self.cam(img, targets, aug_smooth, eigen_smooth)[0, :] + + def show_am(self, + image: np.ndarray, + pred_instance: InstanceData, + grayscale_am: np.ndarray, + with_norm_in_bboxes: bool = False): + """Normalize the AM to be in the range [0, 1] inside every bounding + boxes, and zero outside of the bounding boxes.""" + + boxes = pred_instance.bboxes + labels = pred_instance.labels + + if with_norm_in_bboxes is True: + boxes = boxes.astype(np.int32) + renormalized_am = np.zeros(grayscale_am.shape, dtype=np.float32) + images = [] + for x1, y1, x2, y2 in boxes: + img = renormalized_am * 0 + img[y1:y2, x1:x2] = scale_cam_image( + [grayscale_am[y1:y2, x1:x2].copy()])[0] + images.append(img) + + renormalized_am = np.max(np.float32(images), axis=0) + renormalized_am = scale_cam_image([renormalized_am])[0] + else: + renormalized_am = grayscale_am + + am_image_renormalized = show_cam_on_image( + image / 255, renormalized_am, use_rgb=False) + + 
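+        # Draw the predicted boxes (and scores, if available) on top of the
+        # blended activation-map image before returning it.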
image_with_bounding_boxes = self._draw_boxes( + boxes, labels, am_image_renormalized, pred_instance.get('scores')) + return image_with_bounding_boxes + + def _draw_boxes(self, + boxes: List, + labels: List, + image: np.ndarray, + scores: Optional[List] = None): + """draw boxes on image.""" + for i, box in enumerate(boxes): + label = labels[i] + color = self.COLORS[label] + cv2.rectangle(image, (int(box[0]), int(box[1])), + (int(box[2]), int(box[3])), color, 2) + if scores is not None: + score = scores[i] + text = str(self.classes[label]) + ': ' + str( + round(score * 100, 1)) + else: + text = self.classes[label] + + cv2.putText( + image, + text, (int(box[0]), int(box[1] - 5)), + cv2.FONT_HERSHEY_SIMPLEX, + 0.5, + color, + 1, + lineType=cv2.LINE_AA) + return image + + +class DetAblationLayer(AblationLayer): + """Det AblationLayer.""" + + def __init__(self): + super().__init__() + self.activations = None + + def set_next_batch(self, input_batch_index, activations, + num_channels_to_ablate): + """Extract the next batch member from activations, and repeat it + num_channels_to_ablate times.""" + if isinstance(activations, torch.Tensor): + return super().set_next_batch(input_batch_index, activations, + num_channels_to_ablate) + + self.activations = [] + for activation in activations: + activation = activation[ + input_batch_index, :, :, :].clone().unsqueeze(0) + self.activations.append( + activation.repeat(num_channels_to_ablate, 1, 1, 1)) + + def __call__(self, x): + """Go over the activation indices to be ablated, stored in + self.indices.""" + result = self.activations + + if isinstance(result, torch.Tensor): + return super().__call__(x) + + channel_cumsum = np.cumsum([r.shape[1] for r in result]) + num_channels_to_ablate = result[0].size(0) # batch + for i in range(num_channels_to_ablate): + pyramid_layer = bisect.bisect_right(channel_cumsum, + self.indices[i]) + if pyramid_layer > 0: + index_in_pyramid_layer = self.indices[i] - channel_cumsum[ + pyramid_layer - 1] + else: + index_in_pyramid_layer = self.indices[i] + result[pyramid_layer][i, index_in_pyramid_layer, :, :] = -1000 + return result + + +class DetBoxScoreTarget: + """Det Score calculation class. + + In the case of the grad-free method, the calculation method is that + for every original detected bounding box specified in "bboxes", + assign a score on how the current bounding boxes match it, + + 1. In Bbox IoU + 2. In the classification score. + 3. In Mask IoU if ``segms`` exist. + + If there is not a large enough overlap, or the category changed, + assign a score of 0. The total score is the sum of all the box scores. + + In the case of the grad-based method, the calculation method is + the sum of losses after excluding a specific key. 
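+
+    For instance (illustrative only), passing
+    ``ignore_loss_params=['loss_obj']`` would exclude the objectness loss
+    from the summed score while keeping the remaining loss terms.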
+ """ + + def __init__(self, + pred_instance: InstanceData, + match_iou_thr: float = 0.5, + device: str = 'cuda:0', + ignore_loss_params: Optional[List] = None): + self.focal_bboxes = pred_instance.bboxes + self.focal_labels = pred_instance.labels + self.match_iou_thr = match_iou_thr + self.device = device + self.ignore_loss_params = ignore_loss_params + if ignore_loss_params is not None: + assert isinstance(self.ignore_loss_params, list) + + def __call__(self, results): + output = torch.tensor([0.], device=self.device) + + if 'loss_cls' in results: + # grad-based method + # results is dict + for loss_key, loss_value in results.items(): + if 'loss' not in loss_key or \ + loss_key in self.ignore_loss_params: + continue + if isinstance(loss_value, list): + output += sum(loss_value) + else: + output += loss_value + return output + else: + # grad-free method + # results is DetDataSample + pred_instances = results.pred_instances + if len(pred_instances) == 0: + return output + + pred_bboxes = pred_instances.bboxes + pred_scores = pred_instances.scores + pred_labels = pred_instances.labels + + for focal_box, focal_label in zip(self.focal_bboxes, + self.focal_labels): + ious = torchvision.ops.box_iou(focal_box[None], + pred_bboxes[..., :4]) + index = ious.argmax() + if ious[0, index] > self.match_iou_thr and pred_labels[ + index] == focal_label: + # TODO: Adaptive adjustment of weights based on algorithms + score = ious[0, index] + pred_scores[index] + output = output + score + return output + + +class SpatialBaseCAM(BaseCAM): + """CAM that maintains spatial information. + + Gradients are often averaged over the spatial dimension in CAM + visualization for classification, but this is unreasonable in detection + tasks. There is no need to average the gradients in the detection task. + """ + + def get_cam_image(self, + input_tensor: torch.Tensor, + target_layer: torch.nn.Module, + targets: List[torch.nn.Module], + activations: torch.Tensor, + grads: torch.Tensor, + eigen_smooth: bool = False) -> np.ndarray: + + weights = self.get_cam_weights(input_tensor, target_layer, targets, + activations, grads) + weighted_activations = weights * activations + if eigen_smooth: + cam = get_2d_projection(weighted_activations) + else: + cam = weighted_activations.sum(axis=1) + return cam + + +class GradCAM(SpatialBaseCAM, Base_GradCAM): + """Gradients are no longer averaged over the spatial dimension.""" + + def get_cam_weights(self, input_tensor, target_layer, target_category, + activations, grads): + return grads + + +class GradCAMPlusPlus(SpatialBaseCAM, Base_GradCAMPlusPlus): + """Gradients are no longer averaged over the spatial dimension.""" + + def get_cam_weights(self, input_tensor, target_layers, target_category, + activations, grads): + grads_power_2 = grads**2 + grads_power_3 = grads_power_2 * grads + # Equation 19 in https://arxiv.org/abs/1710.11063 + sum_activations = np.sum(activations, axis=(2, 3)) + eps = 0.000001 + aij = grads_power_2 / ( + 2 * grads_power_2 + + sum_activations[:, :, None, None] * grads_power_3 + eps) + # Now bring back the ReLU from eq.7 in the paper, + # And zero out aijs where the activations are 0 + aij = np.where(grads != 0, aij, 0) + + weights = np.maximum(grads, 0) * aij + return weights diff --git a/mmyolo/utils/collect_env.py b/mmyolo/utils/collect_env.py new file mode 100644 index 0000000000000000000000000000000000000000..89bad658cb7d4f1b602690d8d888a309166283ee --- /dev/null +++ b/mmyolo/utils/collect_env.py @@ -0,0 +1,21 @@ +# Copyright (c) OpenMMLab. 
All rights reserved. +import mmcv +import mmdet +from mmengine.utils import get_git_hash +from mmengine.utils.dl_utils import collect_env as collect_base_env + +import mmyolo + + +def collect_env() -> dict: + """Collect the information of the running environments.""" + env_info = collect_base_env() + env_info['MMCV'] = mmcv.__version__ + env_info['MMDetection'] = mmdet.__version__ + env_info['MMYOLO'] = mmyolo.__version__ + '+' + get_git_hash()[:7] + return env_info + + +if __name__ == '__main__': + for name, val in collect_env().items(): + print(f'{name}: {val}') diff --git a/mmyolo/utils/labelme_utils.py b/mmyolo/utils/labelme_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..0981919771a617ca79b29c3ddf96ea14c82fccc6 --- /dev/null +++ b/mmyolo/utils/labelme_utils.py @@ -0,0 +1,92 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import json +import os.path + +from mmengine.structures import InstanceData + + +class LabelmeFormat: + """Predict results save into labelme file. + + Base on https://github.com/wkentaro/labelme/blob/main/labelme/label_file.py + + Args: + classes (tuple): Model classes name. + """ + + def __init__(self, classes: tuple): + super().__init__() + self.classes = classes + + def __call__(self, pred_instances: InstanceData, metainfo: dict, + output_path: str, selected_classes: list): + """Get image data field for labelme. + + Args: + pred_instances (InstanceData): Candidate prediction info. + metainfo (dict): Meta info of prediction. + output_path (str): Image file path. + selected_classes (list): Selected class name. + + Labelme file eg. + { + "version": "5.1.1", + "flags": {}, + "imagePath": "/data/cat/1.jpg", + "imageData": null, + "imageHeight": 3000, + "imageWidth": 4000, + "shapes": [ + { + "label": "cat", + "points": [ + [ + 1148.076923076923, + 1188.4615384615383 + ], + [ + 2471.1538461538457, + 2176.923076923077 + ] + ], + "group_id": null, + "shape_type": "rectangle", + "flags": {} + }, + {...} + ] + } + """ + + image_path = os.path.abspath(metainfo['img_path']) + + json_info = { + 'version': '5.1.1', + 'flags': {}, + 'imagePath': image_path, + 'imageData': None, + 'imageHeight': metainfo['ori_shape'][0], + 'imageWidth': metainfo['ori_shape'][1], + 'shapes': [] + } + + for pred_instance in pred_instances: + pred_bbox = pred_instance.bboxes.cpu().numpy().tolist()[0] + pred_label = self.classes[pred_instance.labels] + + if selected_classes is not None and \ + pred_label not in selected_classes: + # filter class name + continue + + sub_dict = { + 'label': pred_label, + 'points': [pred_bbox[:2], pred_bbox[2:]], + 'group_id': None, + 'shape_type': 'rectangle', + 'flags': {} + } + json_info['shapes'].append(sub_dict) + + with open(output_path, 'w', encoding='utf-8') as f_json: + json.dump(json_info, f_json, ensure_ascii=False, indent=2) diff --git a/mmyolo/utils/large_image.py b/mmyolo/utils/large_image.py new file mode 100644 index 0000000000000000000000000000000000000000..8670804684f6dcdc6dc1846cf85260d900b3474e --- /dev/null +++ b/mmyolo/utils/large_image.py @@ -0,0 +1,103 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Sequence, Tuple + +import torch +from mmcv.ops import batched_nms +from mmdet.structures import DetDataSample, SampleList +from mmengine.structures import InstanceData + + +def shift_rbboxes(bboxes: torch.Tensor, offset: Sequence[int]): + """Shift rotated bboxes with offset. + + Args: + bboxes (Tensor): The rotated bboxes need to be translated. + With shape (n, 5), which means (x, y, w, h, a). 
+        offset (Sequence[int]): The translation offsets with shape (2, ).
+    Returns:
+        Tensor: Shifted rotated bboxes.
+    """
+    offset_tensor = bboxes.new_tensor(offset)
+    shifted_bboxes = bboxes.clone()
+    shifted_bboxes[:, 0:2] = shifted_bboxes[:, 0:2] + offset_tensor
+    return shifted_bboxes
+
+
+def shift_predictions(det_data_samples: SampleList,
+                      offsets: Sequence[Tuple[int, int]],
+                      src_image_shape: Tuple[int, int]) -> InstanceData:
+    """Shift predictions to the original image.
+
+    Args:
+        det_data_samples (List[:obj:`DetDataSample`]): A list of patch
+            results.
+        offsets (Sequence[Tuple[int, int]]): Positions of the left top points
+            of patches.
+        src_image_shape (Tuple[int, int]): The (height, width) of the large
+            source image.
+    Returns:
+        :obj:`InstanceData`: Shifted predictions of all patches, concatenated
+        into a single instance set.
+    """
+    try:
+        from sahi.slicing import shift_bboxes, shift_masks
+    except ImportError:
+        raise ImportError('Please run "pip install -U sahi" '
+                          'to install sahi first for large image inference.')
+
+    assert len(det_data_samples) == len(offsets), \
+        'The `det_data_samples` should have the same length as `offsets`.'
+    shifted_predictions = []
+    for det_data_sample, offset in zip(det_data_samples, offsets):
+        pred_inst = det_data_sample.pred_instances.clone()
+
+        # Check bbox type
+        if pred_inst.bboxes.size(-1) == 4:
+            # Horizontal bboxes
+            shifted_bboxes = shift_bboxes(pred_inst.bboxes, offset)
+        elif pred_inst.bboxes.size(-1) == 5:
+            # Rotated bboxes
+            shifted_bboxes = shift_rbboxes(pred_inst.bboxes, offset)
+        else:
+            raise NotImplementedError
+
+        # shift bboxes and masks
+        pred_inst.bboxes = shifted_bboxes
+        if 'masks' in det_data_sample:
+            pred_inst.masks = shift_masks(pred_inst.masks, offset,
+                                          src_image_shape)
+
+        shifted_predictions.append(pred_inst.clone())
+
+    shifted_predictions = InstanceData.cat(shifted_predictions)
+
+    return shifted_predictions
+
+
+def merge_results_by_nms(results: SampleList, offsets: Sequence[Tuple[int,
+                                                                      int]],
+                         src_image_shape: Tuple[int, int],
+                         nms_cfg: dict) -> DetDataSample:
+    """Merge patch results by NMS.
+
+    Args:
+        results (List[:obj:`DetDataSample`]): A list of patch results.
+        offsets (Sequence[Tuple[int, int]]): Positions of the left top points
+            of patches.
+        src_image_shape (Tuple[int, int]): The (height, width) of the large
+            source image.
+        nms_cfg (dict): It should specify the NMS type and other parameters
+            like `iou_threshold`.
+    Returns:
+        :obj:`DetDataSample`: Merged results.
+    """
+    shifted_instances = shift_predictions(results, offsets, src_image_shape)
+
+    _, keeps = batched_nms(
+        boxes=shifted_instances.bboxes,
+        scores=shifted_instances.scores,
+        idxs=shifted_instances.labels,
+        nms_cfg=nms_cfg)
+    merged_instances = shifted_instances[keeps]
+
+    merged_result = results[0].clone()
+    merged_result.pred_instances = merged_instances
+    return merged_result
diff --git a/mmyolo/utils/misc.py b/mmyolo/utils/misc.py
new file mode 100644
index 0000000000000000000000000000000000000000..c90f52b94ee9e174c3a289122b6bc7fc58e6e6f1
--- /dev/null
+++ b/mmyolo/utils/misc.py
@@ -0,0 +1,133 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os +import urllib + +import numpy as np +import torch +from mmengine.utils import scandir +from prettytable import PrettyTable + +from mmyolo.models import RepVGGBlock + +IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif', + '.tiff', '.webp') + + +def switch_to_deploy(model): + """Model switch to deploy status.""" + for layer in model.modules(): + if isinstance(layer, RepVGGBlock): + layer.switch_to_deploy() + + print('Switch model to deploy modality.') + + +def auto_arrange_images(image_list: list, image_column: int = 2) -> np.ndarray: + """Auto arrange image to image_column x N row. + + Args: + image_list (list): cv2 image list. + image_column (int): Arrange to N column. Default: 2. + Return: + (np.ndarray): image_column x N row merge image + """ + img_count = len(image_list) + if img_count <= image_column: + # no need to arrange + image_show = np.concatenate(image_list, axis=1) + else: + # arrange image according to image_column + image_row = round(img_count / image_column) + fill_img_list = [np.ones(image_list[0].shape, dtype=np.uint8) * 255 + ] * ( + image_row * image_column - img_count) + image_list.extend(fill_img_list) + merge_imgs_col = [] + for i in range(image_row): + start_col = image_column * i + end_col = image_column * (i + 1) + merge_col = np.hstack(image_list[start_col:end_col]) + merge_imgs_col.append(merge_col) + + # merge to one image + image_show = np.vstack(merge_imgs_col) + + return image_show + + +def get_file_list(source_root: str) -> [list, dict]: + """Get file list. + + Args: + source_root (str): image or video source path + + Return: + source_file_path_list (list): A list for all source file. + source_type (dict): Source type: file or url or dir. + """ + is_dir = os.path.isdir(source_root) + is_url = source_root.startswith(('http:/', 'https:/')) + is_file = os.path.splitext(source_root)[-1].lower() in IMG_EXTENSIONS + + source_file_path_list = [] + if is_dir: + # when input source is dir + for file in scandir(source_root, IMG_EXTENSIONS, recursive=True): + source_file_path_list.append(os.path.join(source_root, file)) + elif is_url: + # when input source is url + filename = os.path.basename( + urllib.parse.unquote(source_root).split('?')[0]) + file_save_path = os.path.join(os.getcwd(), filename) + print(f'Downloading source file to {file_save_path}') + torch.hub.download_url_to_file(source_root, file_save_path) + source_file_path_list = [file_save_path] + elif is_file: + # when input source is single image + source_file_path_list = [source_root] + else: + print('Cannot find image file.') + + source_type = dict(is_dir=is_dir, is_url=is_url, is_file=is_file) + + return source_file_path_list, source_type + + +def show_data_classes(data_classes): + """When printing an error, all class names of the dataset.""" + print('\n\nThe name of the class contained in the dataset:') + data_classes_info = PrettyTable() + data_classes_info.title = 'Information of dataset class' + # List Print Settings + # If the quantity is too large, 25 rows will be displayed in each column + if len(data_classes) < 25: + data_classes_info.add_column('Class name', data_classes) + elif len(data_classes) % 25 != 0 and len(data_classes) > 25: + col_num = int(len(data_classes) / 25) + 1 + data_name_list = list(data_classes) + for i in range(0, (col_num * 25) - len(data_classes)): + data_name_list.append('') + for i in range(0, len(data_name_list), 25): + data_classes_info.add_column('Class name', + data_name_list[i:i + 25]) + + # Align display data to the left + 
data_classes_info.align['Class name'] = 'l' + print(data_classes_info) + + +def is_metainfo_lower(cfg): + """Determine whether the custom metainfo fields are all lowercase.""" + + def judge_keys(dataloader_cfg): + while 'dataset' in dataloader_cfg: + dataloader_cfg = dataloader_cfg['dataset'] + if 'metainfo' in dataloader_cfg: + all_keys = dataloader_cfg['metainfo'].keys() + all_is_lower = all([str(k).islower() for k in all_keys]) + assert all_is_lower, f'The keys in dataset metainfo must be all lowercase, but got {all_keys}. ' \ + f'Please refer to https://github.com/open-mmlab/mmyolo/blob/e62c8c4593/configs/yolov5/yolov5_s-v61_syncbn_fast_1xb4-300e_balloon.py#L8' # noqa + + judge_keys(cfg.get('train_dataloader', {})) + judge_keys(cfg.get('val_dataloader', {})) + judge_keys(cfg.get('test_dataloader', {})) diff --git a/mmyolo/utils/setup_env.py b/mmyolo/utils/setup_env.py new file mode 100644 index 0000000000000000000000000000000000000000..f51ed928cbddb98c2274e09b5acea1d70dfd1abd --- /dev/null +++ b/mmyolo/utils/setup_env.py @@ -0,0 +1,41 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import datetime +import warnings + +from mmengine import DefaultScope + + +def register_all_modules(init_default_scope: bool = True): + """Register all modules in mmdet into the registries. + + Args: + init_default_scope (bool): Whether initialize the mmdet default scope. + When `init_default_scope=True`, the global default scope will be + set to `mmyolo`, and all registries will build modules from mmdet's + registry node. To understand more about the registry, please refer + to https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/registry.md + Defaults to True. + """ # noqa + import mmdet.engine # noqa: F401,F403 + import mmdet.visualization # noqa: F401,F403 + + import mmyolo.datasets # noqa: F401,F403 + import mmyolo.engine # noqa: F401,F403 + import mmyolo.models # noqa: F401,F403 + + if init_default_scope: + never_created = DefaultScope.get_current_instance() is None \ + or not DefaultScope.check_instance_created('mmyolo') + if never_created: + DefaultScope.get_instance('mmyolo', scope_name='mmyolo') + return + current_scope = DefaultScope.get_current_instance() + if current_scope.scope_name != 'mmyolo': + warnings.warn('The current default scope ' + f'"{current_scope.scope_name}" is not "mmyolo", ' + '`register_all_modules` will force the current' + 'default scope to be "mmyolo". If this is not ' + 'expected, please set `init_default_scope=False`.') + # avoid name conflict + new_instance_name = f'mmyolo-{datetime.datetime.now()}' + DefaultScope.get_instance(new_instance_name, scope_name='mmyolo') diff --git a/mmyolo/version.py b/mmyolo/version.py new file mode 100644 index 0000000000000000000000000000000000000000..75c44c7b2a4abf1652db1f9878aef80ec52b5ec9 --- /dev/null +++ b/mmyolo/version.py @@ -0,0 +1,23 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +__version__ = '0.5.0' + +from typing import Tuple + +short_version = __version__ + + +def parse_version_info(version_str: str) -> Tuple: + """Parse version info of MMYOLO.""" + version_info = [] + for x in version_str.split('.'): + if x.isdigit(): + version_info.append(int(x)) + elif x.find('rc') != -1: + patch_version = x.split('rc') + version_info.append(int(patch_version[0])) + version_info.append(f'rc{patch_version[1]}') + return tuple(version_info) + + +version_info = parse_version_info(__version__)
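Taken together, the new modules are typically wired up roughly as in the
following minimal sketch (not part of the patch above; the anchor box,
prediction and stride values are made up for illustration, and a full MMYOLO
installation with its mmdet/mmengine dependencies is assumed):

    import torch

    from mmyolo.registry import TASK_UTILS
    from mmyolo.utils import register_all_modules

    register_all_modules()  # set the 'mmyolo' default scope

    # Build a bbox coder from a config dict via the TASK_UTILS registry.
    coder = TASK_UTILS.build(dict(type='YOLOv5BBoxCoder'))

    prior = torch.tensor([[100., 100., 116., 116.]])  # one 16x16 anchor
    raw_pred = torch.zeros(1, 4)  # raw head outputs, before sigmoid

    # With all-zero raw predictions the decoded box reproduces the prior:
    # tensor([[100., 100., 116., 116.]])
    print(coder.decode(prior, raw_pred, stride=8))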