Upload 89 files
This view is limited to 50 files because it contains too many changes. See raw diff.
- mmyolo/__init__.py +39 -0
- mmyolo/datasets/__init__.py +12 -0
- mmyolo/datasets/transforms/__init__.py +14 -0
- mmyolo/datasets/transforms/mix_img_transforms.py +1150 -0
- mmyolo/datasets/transforms/transforms.py +1557 -0
- mmyolo/datasets/utils.py +114 -0
- mmyolo/datasets/yolov5_coco.py +65 -0
- mmyolo/datasets/yolov5_crowdhuman.py +15 -0
- mmyolo/datasets/yolov5_dota.py +29 -0
- mmyolo/datasets/yolov5_voc.py +15 -0
- mmyolo/deploy/__init__.py +7 -0
- mmyolo/deploy/models/__init__.py +2 -0
- mmyolo/deploy/models/dense_heads/__init__.py +4 -0
- mmyolo/deploy/models/dense_heads/yolov5_head.py +189 -0
- mmyolo/deploy/models/layers/__init__.py +4 -0
- mmyolo/deploy/models/layers/bbox_nms.py +113 -0
- mmyolo/deploy/object_detection.py +132 -0
- mmyolo/engine/__init__.py +3 -0
- mmyolo/engine/hooks/__init__.py +10 -0
- mmyolo/engine/hooks/ppyoloe_param_scheduler_hook.py +96 -0
- mmyolo/engine/hooks/switch_to_deploy_hook.py +21 -0
- mmyolo/engine/hooks/yolov5_param_scheduler_hook.py +130 -0
- mmyolo/engine/hooks/yolox_mode_switch_hook.py +54 -0
- mmyolo/engine/optimizers/__init__.py +5 -0
- mmyolo/engine/optimizers/yolov5_optim_constructor.py +132 -0
- mmyolo/engine/optimizers/yolov7_optim_wrapper_constructor.py +139 -0
- mmyolo/models/__init__.py +10 -0
- mmyolo/models/backbones/__init__.py +13 -0
- mmyolo/models/backbones/base_backbone.py +225 -0
- mmyolo/models/backbones/csp_darknet.py +427 -0
- mmyolo/models/backbones/csp_resnet.py +169 -0
- mmyolo/models/backbones/cspnext.py +187 -0
- mmyolo/models/backbones/efficient_rep.py +287 -0
- mmyolo/models/backbones/yolov7_backbone.py +285 -0
- mmyolo/models/data_preprocessors/__init__.py +10 -0
- mmyolo/models/data_preprocessors/data_preprocessor.py +302 -0
- mmyolo/models/dense_heads/__init__.py +20 -0
- mmyolo/models/dense_heads/ppyoloe_head.py +374 -0
- mmyolo/models/dense_heads/rtmdet_head.py +368 -0
- mmyolo/models/dense_heads/rtmdet_ins_head.py +725 -0
- mmyolo/models/dense_heads/rtmdet_rotated_head.py +641 -0
- mmyolo/models/dense_heads/yolov5_head.py +890 -0
- mmyolo/models/dense_heads/yolov6_head.py +369 -0
- mmyolo/models/dense_heads/yolov7_head.py +404 -0
- mmyolo/models/dense_heads/yolov8_head.py +398 -0
- mmyolo/models/dense_heads/yolox_head.py +514 -0
- mmyolo/models/detectors/__init__.py +4 -0
- mmyolo/models/detectors/yolo_detector.py +53 -0
- mmyolo/models/layers/__init__.py +16 -0
- mmyolo/models/layers/ema.py +96 -0
mmyolo/__init__.py
ADDED
@@ -0,0 +1,39 @@
# Copyright (c) OpenMMLab. All rights reserved.
import mmcv
import mmdet
import mmengine
from mmengine.utils import digit_version

from .version import __version__, version_info

mmcv_minimum_version = '2.0.0rc4'
mmcv_maximum_version = '2.1.0'
mmcv_version = digit_version(mmcv.__version__)

mmengine_minimum_version = '0.6.0'
mmengine_maximum_version = '1.0.0'
mmengine_version = digit_version(mmengine.__version__)

mmdet_minimum_version = '3.0.0rc6'
mmdet_maximum_version = '3.1.0'
mmdet_version = digit_version(mmdet.__version__)


assert (mmcv_version >= digit_version(mmcv_minimum_version)
        and mmcv_version < digit_version(mmcv_maximum_version)), \
    f'MMCV=={mmcv.__version__} is used but incompatible. ' \
    f'Please install mmcv>={mmcv_minimum_version}, <{mmcv_maximum_version}.'

assert (mmengine_version >= digit_version(mmengine_minimum_version)
        and mmengine_version < digit_version(mmengine_maximum_version)), \
    f'MMEngine=={mmengine.__version__} is used but incompatible. ' \
    f'Please install mmengine>={mmengine_minimum_version}, ' \
    f'<{mmengine_maximum_version}.'

assert (mmdet_version >= digit_version(mmdet_minimum_version)
        and mmdet_version < digit_version(mmdet_maximum_version)), \
    f'MMDetection=={mmdet.__version__} is used but incompatible. ' \
    f'Please install mmdet>={mmdet_minimum_version}, ' \
    f'<{mmdet_maximum_version}.'

__all__ = ['__version__', 'version_info', 'digit_version']
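For context, the guards above compare parsed version tuples rather than raw strings, so pre-release tags like 'rc4' order correctly against final releases. A minimal sketch of the same check in isolation, using only digit_version from mmengine; the installed version below is a hypothetical value for illustration:

from mmengine.utils import digit_version

installed = digit_version('2.0.0')  # pretend mmcv.__version__ == '2.0.0'

# Same half-open range as the assertion in __init__.py above.
assert (installed >= digit_version('2.0.0rc4')
        and installed < digit_version('2.1.0')), \
    'MMCV==2.0.0 is used but incompatible.'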
mmyolo/datasets/__init__.py
ADDED
@@ -0,0 +1,12 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .transforms import *  # noqa: F401,F403
from .utils import BatchShapePolicy, yolov5_collate
from .yolov5_coco import YOLOv5CocoDataset
from .yolov5_crowdhuman import YOLOv5CrowdHumanDataset
from .yolov5_dota import YOLOv5DOTADataset
from .yolov5_voc import YOLOv5VOCDataset

__all__ = [
    'YOLOv5CocoDataset', 'YOLOv5VOCDataset', 'BatchShapePolicy',
    'yolov5_collate', 'YOLOv5CrowdHumanDataset', 'YOLOv5DOTADataset'
]
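For context, a minimal sketch of how one of these registered datasets is typically referenced from an MMEngine-style config dict; the paths, batch size, and the collate_fn wiring below are illustrative assumptions, not part of this diff:

train_dataloader = dict(
    batch_size=16,
    num_workers=8,
    collate_fn=dict(type='yolov5_collate'),  # registered in utils.py above
    dataset=dict(
        type='YOLOv5CocoDataset',            # registered dataset class
        data_root='data/coco/',              # illustrative path
        ann_file='annotations/instances_train2017.json',
        data_prefix=dict(img='train2017/'),
        pipeline=[
            dict(type='LoadImageFromFile'),
            dict(type='LoadAnnotations', with_bbox=True)
        ]))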
mmyolo/datasets/transforms/__init__.py
ADDED
@@ -0,0 +1,14 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .mix_img_transforms import Mosaic, Mosaic9, YOLOv5MixUp, YOLOXMixUp
from .transforms import (LetterResize, LoadAnnotations, PPYOLOERandomCrop,
                         PPYOLOERandomDistort, RegularizeRotatedBox,
                         RemoveDataElement, YOLOv5CopyPaste,
                         YOLOv5HSVRandomAug, YOLOv5KeepRatioResize,
                         YOLOv5RandomAffine)

__all__ = [
    'YOLOv5KeepRatioResize', 'LetterResize', 'Mosaic', 'YOLOXMixUp',
    'YOLOv5MixUp', 'YOLOv5HSVRandomAug', 'LoadAnnotations',
    'YOLOv5RandomAffine', 'PPYOLOERandomDistort', 'PPYOLOERandomCrop',
    'Mosaic9', 'YOLOv5CopyPaste', 'RemoveDataElement', 'RegularizeRotatedBox'
]
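For context, pipeline configs refer to these classes by their registered names through the TRANSFORMS registry. A minimal sketch, assuming mmyolo is installed and that YOLOv5HSVRandomAug has all-default constructor arguments (its definition lives in transforms.py, which is not shown in this view):

import mmyolo.datasets  # noqa: F401  # importing the package runs the register_module decorators
from mmyolo.registry import TRANSFORMS

# Build a transform from its registered name; with no extra keys the
# documented defaults are used.
hsv_aug = TRANSFORMS.build(dict(type='YOLOv5HSVRandomAug'))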
mmyolo/datasets/transforms/mix_img_transforms.py
ADDED
@@ -0,0 +1,1150 @@
# Copyright (c) OpenMMLab. All rights reserved.
import collections
import copy
from abc import ABCMeta, abstractmethod
from typing import Optional, Sequence, Tuple, Union

import mmcv
import numpy as np
from mmcv.transforms import BaseTransform
from mmdet.structures.bbox import autocast_box_type
from mmengine.dataset import BaseDataset
from mmengine.dataset.base_dataset import Compose
from numpy import random

from mmyolo.registry import TRANSFORMS


class BaseMixImageTransform(BaseTransform, metaclass=ABCMeta):
    """A Base Transform of multiple images mixed.

    Suitable for training on multiple images mixed data augmentation like
    mosaic and mixup.

    Cached mosaic transform will random select images from the cache
    and combine them into one output image if use_cached is True.

    Args:
        pre_transform(Sequence[str]): Sequence of transform object or
            config dict to be composed. Defaults to None.
        prob(float): The transformation probability. Defaults to 1.0.
        use_cached (bool): Whether to use cache. Defaults to False.
        max_cached_images (int): The maximum length of the cache. The larger
            the cache, the stronger the randomness of this transform. As a
            rule of thumb, providing 10 caches for each image suffices for
            randomness. Defaults to 40.
        random_pop (bool): Whether to randomly pop a result from the cache
            when the cache is full. If set to False, use FIFO popping method.
            Defaults to True.
        max_refetch (int): The maximum number of retry iterations for getting
            valid results from the pipeline. If the number of iterations is
            greater than `max_refetch`, but results is still None, then the
            iteration is terminated and raise the error. Defaults to 15.
    """

    def __init__(self,
                 pre_transform: Optional[Sequence[str]] = None,
                 prob: float = 1.0,
                 use_cached: bool = False,
                 max_cached_images: int = 40,
                 random_pop: bool = True,
                 max_refetch: int = 15):

        self.max_refetch = max_refetch
        self.prob = prob

        self.use_cached = use_cached
        self.max_cached_images = max_cached_images
        self.random_pop = random_pop
        self.results_cache = []

        if pre_transform is None:
            self.pre_transform = None
        else:
            self.pre_transform = Compose(pre_transform)

    @abstractmethod
    def get_indexes(self, dataset: Union[BaseDataset,
                                         list]) -> Union[list, int]:
        """Call function to collect indexes.

        Args:
            dataset (:obj:`Dataset` or list): The dataset or cached list.

        Returns:
            list or int: indexes.
        """
        pass

    @abstractmethod
    def mix_img_transform(self, results: dict) -> dict:
        """Mixed image data transformation.

        Args:
            results (dict): Result dict.

        Returns:
            results (dict): Updated result dict.
        """
        pass

    @autocast_box_type()
    def transform(self, results: dict) -> dict:
        """Data augmentation function.

        The transform steps are as follows:
        1. Randomly generate index list of other images.
        2. Before Mosaic or MixUp need to go through the necessary
            pre_transform, such as MixUp' pre_transform pipeline
            include: 'LoadImageFromFile','LoadAnnotations',
            'Mosaic' and 'RandomAffine'.
        3. Use mix_img_transform function to implement specific
            mix operations.

        Args:
            results (dict): Result dict.

        Returns:
            results (dict): Updated result dict.
        """

        if random.uniform(0, 1) > self.prob:
            return results

        if self.use_cached:
            # Be careful: deep copying can be very time-consuming
            # if results includes dataset.
            dataset = results.pop('dataset', None)
            self.results_cache.append(copy.deepcopy(results))
            if len(self.results_cache) > self.max_cached_images:
                if self.random_pop:
                    index = random.randint(0, len(self.results_cache) - 1)
                else:
                    index = 0
                self.results_cache.pop(index)

            if len(self.results_cache) <= 4:
                return results
        else:
            assert 'dataset' in results
            # Be careful: deep copying can be very time-consuming
            # if results includes dataset.
            dataset = results.pop('dataset', None)

        for _ in range(self.max_refetch):
            # get index of one or three other images
            if self.use_cached:
                indexes = self.get_indexes(self.results_cache)
            else:
                indexes = self.get_indexes(dataset)

            if not isinstance(indexes, collections.abc.Sequence):
                indexes = [indexes]

            if self.use_cached:
                mix_results = [
                    copy.deepcopy(self.results_cache[i]) for i in indexes
                ]
            else:
                # get images information will be used for Mosaic or MixUp
                mix_results = [
                    copy.deepcopy(dataset.get_data_info(index))
                    for index in indexes
                ]

            if self.pre_transform is not None:
                for i, data in enumerate(mix_results):
                    # pre_transform may also require dataset
                    data.update({'dataset': dataset})
                    # before Mosaic or MixUp need to go through
                    # the necessary pre_transform
                    _results = self.pre_transform(data)
                    _results.pop('dataset')
                    mix_results[i] = _results

            if None not in mix_results:
                results['mix_results'] = mix_results
                break
            print('Repeated calculation')
        else:
            raise RuntimeError(
                'The loading pipeline of the original dataset'
                ' always return None. Please check the correctness '
                'of the dataset and its pipeline.')

        # Mosaic or MixUp
        results = self.mix_img_transform(results)

        if 'mix_results' in results:
            results.pop('mix_results')
        results['dataset'] = dataset

        return results


@TRANSFORMS.register_module()
class Mosaic(BaseMixImageTransform):
    """Mosaic augmentation.

    Given 4 images, mosaic transform combines them into
    one output image. The output image is composed of the parts from each sub-
    image.

    .. code:: text

                        mosaic transform
                           center_x
                +------------------------------+
                |       pad        |           |
                |      +-----------+    pad    |
                |      |           |           |
                |      |  image1   +-----------+
                |      |           |           |
                |      |           |   image2  |
     center_y   |----+-+-----------+-----------+
                |    |   cropped   |           |
                |pad |   image3    |   image4  |
                |    |             |           |
                +----|-------------+-----------+
                     |             |
                     +-------------+

    The mosaic transform steps are as follows:

        1. Choose the mosaic center as the intersections of 4 images
        2. Get the left top image according to the index, and randomly
           sample another 3 images from the custom dataset.
        3. Sub image will be cropped if image is larger than mosaic patch

    Required Keys:

    - img
    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
    - gt_bboxes_labels (np.int64) (optional)
    - gt_ignore_flags (bool) (optional)
    - mix_results (List[dict])

    Modified Keys:

    - img
    - img_shape
    - gt_bboxes (optional)
    - gt_bboxes_labels (optional)
    - gt_ignore_flags (optional)

    Args:
        img_scale (Sequence[int]): Image size after mosaic pipeline of single
            image. The shape order should be (width, height).
            Defaults to (640, 640).
        center_ratio_range (Sequence[float]): Center ratio range of mosaic
            output. Defaults to (0.5, 1.5).
        bbox_clip_border (bool, optional): Whether to clip the objects outside
            the border of the image. In some dataset like MOT17, the gt bboxes
            are allowed to cross the border of images. Therefore, we don't
            need to clip the gt bboxes in these cases. Defaults to True.
        pad_val (int): Pad value. Defaults to 114.
        pre_transform(Sequence[dict]): Sequence of transform object or
            config dict to be composed.
        prob (float): Probability of applying this transformation.
            Defaults to 1.0.
        use_cached (bool): Whether to use cache. Defaults to False.
        max_cached_images (int): The maximum length of the cache. The larger
            the cache, the stronger the randomness of this transform. As a
            rule of thumb, providing 10 caches for each image suffices for
            randomness. Defaults to 40.
        random_pop (bool): Whether to randomly pop a result from the cache
            when the cache is full. If set to False, use FIFO popping method.
            Defaults to True.
        max_refetch (int): The maximum number of retry iterations for getting
            valid results from the pipeline. If the number of iterations is
            greater than `max_refetch`, but results is still None, then the
            iteration is terminated and raise the error. Defaults to 15.
    """

    def __init__(self,
                 img_scale: Tuple[int, int] = (640, 640),
                 center_ratio_range: Tuple[float, float] = (0.5, 1.5),
                 bbox_clip_border: bool = True,
                 pad_val: float = 114.0,
                 pre_transform: Sequence[dict] = None,
                 prob: float = 1.0,
                 use_cached: bool = False,
                 max_cached_images: int = 40,
                 random_pop: bool = True,
                 max_refetch: int = 15):
        assert isinstance(img_scale, tuple)
        assert 0 <= prob <= 1.0, 'The probability should be in range [0,1]. ' \
                                 f'got {prob}.'
        if use_cached:
            assert max_cached_images >= 4, 'The length of cache must >= 4, ' \
                                           f'but got {max_cached_images}.'

        super().__init__(
            pre_transform=pre_transform,
            prob=prob,
            use_cached=use_cached,
            max_cached_images=max_cached_images,
            random_pop=random_pop,
            max_refetch=max_refetch)

        self.img_scale = img_scale
        self.center_ratio_range = center_ratio_range
        self.bbox_clip_border = bbox_clip_border
        self.pad_val = pad_val

    def get_indexes(self, dataset: Union[BaseDataset, list]) -> list:
        """Call function to collect indexes.

        Args:
            dataset (:obj:`Dataset` or list): The dataset or cached list.

        Returns:
            list: indexes.
        """
        indexes = [random.randint(0, len(dataset)) for _ in range(3)]
        return indexes

    def mix_img_transform(self, results: dict) -> dict:
        """Mixed image data transformation.

        Args:
            results (dict): Result dict.

        Returns:
            results (dict): Updated result dict.
        """
        assert 'mix_results' in results
        mosaic_bboxes = []
        mosaic_bboxes_labels = []
        mosaic_ignore_flags = []
        mosaic_masks = []
        with_mask = True if 'gt_masks' in results else False
        # self.img_scale is wh format
        img_scale_w, img_scale_h = self.img_scale

        if len(results['img'].shape) == 3:
            mosaic_img = np.full(
                (int(img_scale_h * 2), int(img_scale_w * 2), 3),
                self.pad_val,
                dtype=results['img'].dtype)
        else:
            mosaic_img = np.full((int(img_scale_h * 2), int(img_scale_w * 2)),
                                 self.pad_val,
                                 dtype=results['img'].dtype)

        # mosaic center x, y
        center_x = int(random.uniform(*self.center_ratio_range) * img_scale_w)
        center_y = int(random.uniform(*self.center_ratio_range) * img_scale_h)
        center_position = (center_x, center_y)

        loc_strs = ('top_left', 'top_right', 'bottom_left', 'bottom_right')
        for i, loc in enumerate(loc_strs):
            if loc == 'top_left':
                results_patch = results
            else:
                results_patch = results['mix_results'][i - 1]

            img_i = results_patch['img']
            h_i, w_i = img_i.shape[:2]
            # keep_ratio resize
            scale_ratio_i = min(img_scale_h / h_i, img_scale_w / w_i)
            img_i = mmcv.imresize(
                img_i, (int(w_i * scale_ratio_i), int(h_i * scale_ratio_i)))

            # compute the combine parameters
            paste_coord, crop_coord = self._mosaic_combine(
                loc, center_position, img_i.shape[:2][::-1])
            x1_p, y1_p, x2_p, y2_p = paste_coord
            x1_c, y1_c, x2_c, y2_c = crop_coord

            # crop and paste image
            mosaic_img[y1_p:y2_p, x1_p:x2_p] = img_i[y1_c:y2_c, x1_c:x2_c]

            # adjust coordinate
            gt_bboxes_i = results_patch['gt_bboxes']
            gt_bboxes_labels_i = results_patch['gt_bboxes_labels']
            gt_ignore_flags_i = results_patch['gt_ignore_flags']

            padw = x1_p - x1_c
            padh = y1_p - y1_c
            gt_bboxes_i.rescale_([scale_ratio_i, scale_ratio_i])
            gt_bboxes_i.translate_([padw, padh])
            mosaic_bboxes.append(gt_bboxes_i)
            mosaic_bboxes_labels.append(gt_bboxes_labels_i)
            mosaic_ignore_flags.append(gt_ignore_flags_i)
            if with_mask and results_patch.get('gt_masks', None) is not None:
                gt_masks_i = results_patch['gt_masks']
                gt_masks_i = gt_masks_i.rescale(float(scale_ratio_i))
                gt_masks_i = gt_masks_i.translate(
                    out_shape=(int(self.img_scale[0] * 2),
                               int(self.img_scale[1] * 2)),
                    offset=padw,
                    direction='horizontal')
                gt_masks_i = gt_masks_i.translate(
                    out_shape=(int(self.img_scale[0] * 2),
                               int(self.img_scale[1] * 2)),
                    offset=padh,
                    direction='vertical')
                mosaic_masks.append(gt_masks_i)

        mosaic_bboxes = mosaic_bboxes[0].cat(mosaic_bboxes, 0)
        mosaic_bboxes_labels = np.concatenate(mosaic_bboxes_labels, 0)
        mosaic_ignore_flags = np.concatenate(mosaic_ignore_flags, 0)

        if self.bbox_clip_border:
            mosaic_bboxes.clip_([2 * img_scale_h, 2 * img_scale_w])
            if with_mask:
                mosaic_masks = mosaic_masks[0].cat(mosaic_masks)
                results['gt_masks'] = mosaic_masks
        else:
            # remove outside bboxes
            inside_inds = mosaic_bboxes.is_inside(
                [2 * img_scale_h, 2 * img_scale_w]).numpy()
            mosaic_bboxes = mosaic_bboxes[inside_inds]
            mosaic_bboxes_labels = mosaic_bboxes_labels[inside_inds]
            mosaic_ignore_flags = mosaic_ignore_flags[inside_inds]
            if with_mask:
                mosaic_masks = mosaic_masks[0].cat(mosaic_masks)[inside_inds]
                results['gt_masks'] = mosaic_masks

        results['img'] = mosaic_img
        results['img_shape'] = mosaic_img.shape
        results['gt_bboxes'] = mosaic_bboxes
        results['gt_bboxes_labels'] = mosaic_bboxes_labels
        results['gt_ignore_flags'] = mosaic_ignore_flags

        return results

    def _mosaic_combine(
            self, loc: str, center_position_xy: Sequence[float],
            img_shape_wh: Sequence[int]) -> Tuple[Tuple[int], Tuple[int]]:
        """Calculate global coordinate of mosaic image and local coordinate of
        cropped sub-image.

        Args:
            loc (str): Index for the sub-image, loc in ('top_left',
              'top_right', 'bottom_left', 'bottom_right').
            center_position_xy (Sequence[float]): Mixing center for 4 images,
                (x, y).
            img_shape_wh (Sequence[int]): Width and height of sub-image

        Returns:
            tuple[tuple[float]]: Corresponding coordinate of pasting and
                cropping
                - paste_coord (tuple): paste corner coordinate in mosaic image.
                - crop_coord (tuple): crop corner coordinate in mosaic image.
        """
        assert loc in ('top_left', 'top_right', 'bottom_left', 'bottom_right')
        if loc == 'top_left':
            # index0 to top left part of image
            x1, y1, x2, y2 = max(center_position_xy[0] - img_shape_wh[0], 0), \
                             max(center_position_xy[1] - img_shape_wh[1], 0), \
                             center_position_xy[0], \
                             center_position_xy[1]
            crop_coord = img_shape_wh[0] - (x2 - x1), img_shape_wh[1] - (
                y2 - y1), img_shape_wh[0], img_shape_wh[1]

        elif loc == 'top_right':
            # index1 to top right part of image
            x1, y1, x2, y2 = center_position_xy[0], \
                             max(center_position_xy[1] - img_shape_wh[1], 0), \
                             min(center_position_xy[0] + img_shape_wh[0],
                                 self.img_scale[0] * 2), \
                             center_position_xy[1]
            crop_coord = 0, img_shape_wh[1] - (y2 - y1), min(
                img_shape_wh[0], x2 - x1), img_shape_wh[1]

        elif loc == 'bottom_left':
            # index2 to bottom left part of image
            x1, y1, x2, y2 = max(center_position_xy[0] - img_shape_wh[0], 0), \
                             center_position_xy[1], \
                             center_position_xy[0], \
                             min(self.img_scale[1] * 2, center_position_xy[1] +
                                 img_shape_wh[1])
            crop_coord = img_shape_wh[0] - (x2 - x1), 0, img_shape_wh[0], min(
                y2 - y1, img_shape_wh[1])

        else:
            # index3 to bottom right part of image
            x1, y1, x2, y2 = center_position_xy[0], \
                             center_position_xy[1], \
                             min(center_position_xy[0] + img_shape_wh[0],
                                 self.img_scale[0] * 2), \
                             min(self.img_scale[1] * 2, center_position_xy[1] +
                                 img_shape_wh[1])
            crop_coord = 0, 0, min(img_shape_wh[0],
                                   x2 - x1), min(y2 - y1, img_shape_wh[1])

        paste_coord = x1, y1, x2, y2
        return paste_coord, crop_coord

    def __repr__(self) -> str:
        repr_str = self.__class__.__name__
        repr_str += f'(img_scale={self.img_scale}, '
        repr_str += f'center_ratio_range={self.center_ratio_range}, '
        repr_str += f'pad_val={self.pad_val}, '
        repr_str += f'prob={self.prob})'
        return repr_str

|
491 |
+
class Mosaic9(BaseMixImageTransform):
|
492 |
+
"""Mosaic9 augmentation.
|
493 |
+
|
494 |
+
Given 9 images, mosaic transform combines them into
|
495 |
+
one output image. The output image is composed of the parts from each sub-
|
496 |
+
image.
|
497 |
+
|
498 |
+
.. code:: text
|
499 |
+
|
500 |
+
+-------------------------------+------------+
|
501 |
+
| pad | pad | |
|
502 |
+
| +----------+ | |
|
503 |
+
| | +---------------+ top_right |
|
504 |
+
| | | top | image2 |
|
505 |
+
| | top_left | image1 | |
|
506 |
+
| | image8 o--------+------+--------+---+
|
507 |
+
| | | | | |
|
508 |
+
+----+----------+ | right |pad|
|
509 |
+
| | center | image3 | |
|
510 |
+
| left | image0 +---------------+---|
|
511 |
+
| image7 | | | |
|
512 |
+
+---+-----------+---+--------+ | |
|
513 |
+
| | cropped | | bottom_right |pad|
|
514 |
+
| |bottom_left| | image4 | |
|
515 |
+
| | image6 | bottom | | |
|
516 |
+
+---|-----------+ image5 +---------------+---|
|
517 |
+
| pad | | pad |
|
518 |
+
+-----------+------------+-------------------+
|
519 |
+
|
520 |
+
The mosaic transform steps are as follows:
|
521 |
+
|
522 |
+
1. Get the center image according to the index, and randomly
|
523 |
+
sample another 8 images from the custom dataset.
|
524 |
+
2. Randomly offset the image after Mosaic
|
525 |
+
|
526 |
+
Required Keys:
|
527 |
+
|
528 |
+
- img
|
529 |
+
- gt_bboxes (BaseBoxes[torch.float32]) (optional)
|
530 |
+
- gt_bboxes_labels (np.int64) (optional)
|
531 |
+
- gt_ignore_flags (bool) (optional)
|
532 |
+
- mix_results (List[dict])
|
533 |
+
|
534 |
+
Modified Keys:
|
535 |
+
|
536 |
+
- img
|
537 |
+
- img_shape
|
538 |
+
- gt_bboxes (optional)
|
539 |
+
- gt_bboxes_labels (optional)
|
540 |
+
- gt_ignore_flags (optional)
|
541 |
+
|
542 |
+
Args:
|
543 |
+
img_scale (Sequence[int]): Image size after mosaic pipeline of single
|
544 |
+
image. The shape order should be (width, height).
|
545 |
+
Defaults to (640, 640).
|
546 |
+
bbox_clip_border (bool, optional): Whether to clip the objects outside
|
547 |
+
the border of the image. In some dataset like MOT17, the gt bboxes
|
548 |
+
are allowed to cross the border of images. Therefore, we don't
|
549 |
+
need to clip the gt bboxes in these cases. Defaults to True.
|
550 |
+
pad_val (int): Pad value. Defaults to 114.
|
551 |
+
pre_transform(Sequence[dict]): Sequence of transform object or
|
552 |
+
config dict to be composed.
|
553 |
+
prob (float): Probability of applying this transformation.
|
554 |
+
Defaults to 1.0.
|
555 |
+
use_cached (bool): Whether to use cache. Defaults to False.
|
556 |
+
max_cached_images (int): The maximum length of the cache. The larger
|
557 |
+
the cache, the stronger the randomness of this transform. As a
|
558 |
+
rule of thumb, providing 5 caches for each image suffices for
|
559 |
+
randomness. Defaults to 50.
|
560 |
+
random_pop (bool): Whether to randomly pop a result from the cache
|
561 |
+
when the cache is full. If set to False, use FIFO popping method.
|
562 |
+
Defaults to True.
|
563 |
+
max_refetch (int): The maximum number of retry iterations for getting
|
564 |
+
valid results from the pipeline. If the number of iterations is
|
565 |
+
greater than `max_refetch`, but results is still None, then the
|
566 |
+
iteration is terminated and raise the error. Defaults to 15.
|
567 |
+
"""
|
568 |
+
|
569 |
+
def __init__(self,
|
570 |
+
img_scale: Tuple[int, int] = (640, 640),
|
571 |
+
bbox_clip_border: bool = True,
|
572 |
+
pad_val: Union[float, int] = 114.0,
|
573 |
+
pre_transform: Sequence[dict] = None,
|
574 |
+
prob: float = 1.0,
|
575 |
+
use_cached: bool = False,
|
576 |
+
max_cached_images: int = 50,
|
577 |
+
random_pop: bool = True,
|
578 |
+
max_refetch: int = 15):
|
579 |
+
assert isinstance(img_scale, tuple)
|
580 |
+
assert 0 <= prob <= 1.0, 'The probability should be in range [0,1]. ' \
|
581 |
+
f'got {prob}.'
|
582 |
+
if use_cached:
|
583 |
+
assert max_cached_images >= 9, 'The length of cache must >= 9, ' \
|
584 |
+
f'but got {max_cached_images}.'
|
585 |
+
|
586 |
+
super().__init__(
|
587 |
+
pre_transform=pre_transform,
|
588 |
+
prob=prob,
|
589 |
+
use_cached=use_cached,
|
590 |
+
max_cached_images=max_cached_images,
|
591 |
+
random_pop=random_pop,
|
592 |
+
max_refetch=max_refetch)
|
593 |
+
|
594 |
+
self.img_scale = img_scale
|
595 |
+
self.bbox_clip_border = bbox_clip_border
|
596 |
+
self.pad_val = pad_val
|
597 |
+
|
598 |
+
# intermediate variables
|
599 |
+
self._current_img_shape = [0, 0]
|
600 |
+
self._center_img_shape = [0, 0]
|
601 |
+
self._previous_img_shape = [0, 0]
|
602 |
+
|
603 |
+
def get_indexes(self, dataset: Union[BaseDataset, list]) -> list:
|
604 |
+
"""Call function to collect indexes.
|
605 |
+
|
606 |
+
Args:
|
607 |
+
dataset (:obj:`Dataset` or list): The dataset or cached list.
|
608 |
+
|
609 |
+
Returns:
|
610 |
+
list: indexes.
|
611 |
+
"""
|
612 |
+
indexes = [random.randint(0, len(dataset)) for _ in range(8)]
|
613 |
+
return indexes
|
614 |
+
|
615 |
+
def mix_img_transform(self, results: dict) -> dict:
|
616 |
+
"""Mixed image data transformation.
|
617 |
+
|
618 |
+
Args:
|
619 |
+
results (dict): Result dict.
|
620 |
+
|
621 |
+
Returns:
|
622 |
+
results (dict): Updated result dict.
|
623 |
+
"""
|
624 |
+
assert 'mix_results' in results
|
625 |
+
|
626 |
+
mosaic_bboxes = []
|
627 |
+
mosaic_bboxes_labels = []
|
628 |
+
mosaic_ignore_flags = []
|
629 |
+
|
630 |
+
img_scale_w, img_scale_h = self.img_scale
|
631 |
+
|
632 |
+
if len(results['img'].shape) == 3:
|
633 |
+
mosaic_img = np.full(
|
634 |
+
(int(img_scale_h * 3), int(img_scale_w * 3), 3),
|
635 |
+
self.pad_val,
|
636 |
+
dtype=results['img'].dtype)
|
637 |
+
else:
|
638 |
+
mosaic_img = np.full((int(img_scale_h * 3), int(img_scale_w * 3)),
|
639 |
+
self.pad_val,
|
640 |
+
dtype=results['img'].dtype)
|
641 |
+
|
642 |
+
# index = 0 is mean original image
|
643 |
+
# len(results['mix_results']) = 8
|
644 |
+
loc_strs = ('center', 'top', 'top_right', 'right', 'bottom_right',
|
645 |
+
'bottom', 'bottom_left', 'left', 'top_left')
|
646 |
+
|
647 |
+
results_all = [results, *results['mix_results']]
|
648 |
+
for index, results_patch in enumerate(results_all):
|
649 |
+
img_i = results_patch['img']
|
650 |
+
# keep_ratio resize
|
651 |
+
img_i_h, img_i_w = img_i.shape[:2]
|
652 |
+
scale_ratio_i = min(img_scale_h / img_i_h, img_scale_w / img_i_w)
|
653 |
+
img_i = mmcv.imresize(
|
654 |
+
img_i,
|
655 |
+
(int(img_i_w * scale_ratio_i), int(img_i_h * scale_ratio_i)))
|
656 |
+
|
657 |
+
paste_coord = self._mosaic_combine(loc_strs[index],
|
658 |
+
img_i.shape[:2])
|
659 |
+
|
660 |
+
padw, padh = paste_coord[:2]
|
661 |
+
x1, y1, x2, y2 = (max(x, 0) for x in paste_coord)
|
662 |
+
mosaic_img[y1:y2, x1:x2] = img_i[y1 - padh:, x1 - padw:]
|
663 |
+
|
664 |
+
gt_bboxes_i = results_patch['gt_bboxes']
|
665 |
+
gt_bboxes_labels_i = results_patch['gt_bboxes_labels']
|
666 |
+
gt_ignore_flags_i = results_patch['gt_ignore_flags']
|
667 |
+
gt_bboxes_i.rescale_([scale_ratio_i, scale_ratio_i])
|
668 |
+
gt_bboxes_i.translate_([padw, padh])
|
669 |
+
|
670 |
+
mosaic_bboxes.append(gt_bboxes_i)
|
671 |
+
mosaic_bboxes_labels.append(gt_bboxes_labels_i)
|
672 |
+
mosaic_ignore_flags.append(gt_ignore_flags_i)
|
673 |
+
|
674 |
+
# Offset
|
675 |
+
offset_x = int(random.uniform(0, img_scale_w))
|
676 |
+
offset_y = int(random.uniform(0, img_scale_h))
|
677 |
+
mosaic_img = mosaic_img[offset_y:offset_y + 2 * img_scale_h,
|
678 |
+
offset_x:offset_x + 2 * img_scale_w]
|
679 |
+
|
680 |
+
mosaic_bboxes = mosaic_bboxes[0].cat(mosaic_bboxes, 0)
|
681 |
+
mosaic_bboxes.translate_([-offset_x, -offset_y])
|
682 |
+
mosaic_bboxes_labels = np.concatenate(mosaic_bboxes_labels, 0)
|
683 |
+
mosaic_ignore_flags = np.concatenate(mosaic_ignore_flags, 0)
|
684 |
+
|
685 |
+
if self.bbox_clip_border:
|
686 |
+
mosaic_bboxes.clip_([2 * img_scale_h, 2 * img_scale_w])
|
687 |
+
else:
|
688 |
+
# remove outside bboxes
|
689 |
+
inside_inds = mosaic_bboxes.is_inside(
|
690 |
+
[2 * img_scale_h, 2 * img_scale_w]).numpy()
|
691 |
+
mosaic_bboxes = mosaic_bboxes[inside_inds]
|
692 |
+
mosaic_bboxes_labels = mosaic_bboxes_labels[inside_inds]
|
693 |
+
mosaic_ignore_flags = mosaic_ignore_flags[inside_inds]
|
694 |
+
|
695 |
+
results['img'] = mosaic_img
|
696 |
+
results['img_shape'] = mosaic_img.shape
|
697 |
+
results['gt_bboxes'] = mosaic_bboxes
|
698 |
+
results['gt_bboxes_labels'] = mosaic_bboxes_labels
|
699 |
+
results['gt_ignore_flags'] = mosaic_ignore_flags
|
700 |
+
return results
|
701 |
+
|
702 |
+
def _mosaic_combine(self, loc: str,
|
703 |
+
img_shape_hw: Tuple[int, int]) -> Tuple[int, ...]:
|
704 |
+
"""Calculate global coordinate of mosaic image.
|
705 |
+
|
706 |
+
Args:
|
707 |
+
loc (str): Index for the sub-image.
|
708 |
+
img_shape_hw (Sequence[int]): Height and width of sub-image
|
709 |
+
|
710 |
+
Returns:
|
711 |
+
paste_coord (tuple): paste corner coordinate in mosaic image.
|
712 |
+
"""
|
713 |
+
assert loc in ('center', 'top', 'top_right', 'right', 'bottom_right',
|
714 |
+
'bottom', 'bottom_left', 'left', 'top_left')
|
715 |
+
|
716 |
+
img_scale_w, img_scale_h = self.img_scale
|
717 |
+
|
718 |
+
self._current_img_shape = img_shape_hw
|
719 |
+
current_img_h, current_img_w = self._current_img_shape
|
720 |
+
previous_img_h, previous_img_w = self._previous_img_shape
|
721 |
+
center_img_h, center_img_w = self._center_img_shape
|
722 |
+
|
723 |
+
if loc == 'center':
|
724 |
+
self._center_img_shape = self._current_img_shape
|
725 |
+
# xmin, ymin, xmax, ymax
|
726 |
+
paste_coord = img_scale_w, \
|
727 |
+
img_scale_h, \
|
728 |
+
img_scale_w + current_img_w, \
|
729 |
+
img_scale_h + current_img_h
|
730 |
+
elif loc == 'top':
|
731 |
+
paste_coord = img_scale_w, \
|
732 |
+
img_scale_h - current_img_h, \
|
733 |
+
img_scale_w + current_img_w, \
|
734 |
+
img_scale_h
|
735 |
+
elif loc == 'top_right':
|
736 |
+
paste_coord = img_scale_w + previous_img_w, \
|
737 |
+
img_scale_h - current_img_h, \
|
738 |
+
img_scale_w + previous_img_w + current_img_w, \
|
739 |
+
img_scale_h
|
740 |
+
elif loc == 'right':
|
741 |
+
paste_coord = img_scale_w + center_img_w, \
|
742 |
+
img_scale_h, \
|
743 |
+
img_scale_w + center_img_w + current_img_w, \
|
744 |
+
img_scale_h + current_img_h
|
745 |
+
elif loc == 'bottom_right':
|
746 |
+
paste_coord = img_scale_w + center_img_w, \
|
747 |
+
img_scale_h + previous_img_h, \
|
748 |
+
img_scale_w + center_img_w + current_img_w, \
|
749 |
+
img_scale_h + previous_img_h + current_img_h
|
750 |
+
elif loc == 'bottom':
|
751 |
+
paste_coord = img_scale_w + center_img_w - current_img_w, \
|
752 |
+
img_scale_h + center_img_h, \
|
753 |
+
img_scale_w + center_img_w, \
|
754 |
+
img_scale_h + center_img_h + current_img_h
|
755 |
+
elif loc == 'bottom_left':
|
756 |
+
paste_coord = img_scale_w + center_img_w - \
|
757 |
+
previous_img_w - current_img_w, \
|
758 |
+
img_scale_h + center_img_h, \
|
759 |
+
img_scale_w + center_img_w - previous_img_w, \
|
760 |
+
img_scale_h + center_img_h + current_img_h
|
761 |
+
elif loc == 'left':
|
762 |
+
paste_coord = img_scale_w - current_img_w, \
|
763 |
+
img_scale_h + center_img_h - current_img_h, \
|
764 |
+
img_scale_w, \
|
765 |
+
img_scale_h + center_img_h
|
766 |
+
elif loc == 'top_left':
|
767 |
+
paste_coord = img_scale_w - current_img_w, \
|
768 |
+
img_scale_h + center_img_h - \
|
769 |
+
previous_img_h - current_img_h, \
|
770 |
+
img_scale_w, \
|
771 |
+
img_scale_h + center_img_h - previous_img_h
|
772 |
+
|
773 |
+
self._previous_img_shape = self._current_img_shape
|
774 |
+
# xmin, ymin, xmax, ymax
|
775 |
+
return paste_coord
|
776 |
+
|
777 |
+
def __repr__(self) -> str:
|
778 |
+
repr_str = self.__class__.__name__
|
779 |
+
repr_str += f'(img_scale={self.img_scale}, '
|
780 |
+
repr_str += f'pad_val={self.pad_val}, '
|
781 |
+
repr_str += f'prob={self.prob})'
|
782 |
+
return repr_str
|
783 |
+
|
784 |
+
|
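For context, the use_cached path described in the docstrings above keeps a deep-copied cache of recent results and samples the extra images from it instead of re-fetching them from the dataset. A minimal sketch of turning it on for Mosaic9; the cache size shown is the documented default, everything else is illustrative:

train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True),
    dict(
        type='Mosaic9',
        img_scale=(640, 640),
        use_cached=True,          # sample the 8 extra images from the cache
        max_cached_images=50,     # must be >= 9; larger means more randomness
        random_pop=True),         # evict a random entry once the cache is full
]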
@TRANSFORMS.register_module()
class YOLOv5MixUp(BaseMixImageTransform):
    """MixUp data augmentation for YOLOv5.

    .. code:: text

    The mixup transform steps are as follows:

        1. Another random image is picked by dataset.
        2. Randomly obtain the fusion ratio from the beta distribution,
           then fuse the target
           of the original image and mixup image through this ratio.

    Required Keys:

    - img
    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
    - gt_bboxes_labels (np.int64) (optional)
    - gt_ignore_flags (bool) (optional)
    - mix_results (List[dict])


    Modified Keys:

    - img
    - img_shape
    - gt_bboxes (optional)
    - gt_bboxes_labels (optional)
    - gt_ignore_flags (optional)


    Args:
        alpha (float): parameter of beta distribution to get mixup ratio.
            Defaults to 32.
        beta (float): parameter of beta distribution to get mixup ratio.
            Defaults to 32.
        pre_transform (Sequence[dict]): Sequence of transform object or
            config dict to be composed.
        prob (float): Probability of applying this transformation.
            Defaults to 1.0.
        use_cached (bool): Whether to use cache. Defaults to False.
        max_cached_images (int): The maximum length of the cache. The larger
            the cache, the stronger the randomness of this transform. As a
            rule of thumb, providing 10 caches for each image suffices for
            randomness. Defaults to 20.
        random_pop (bool): Whether to randomly pop a result from the cache
            when the cache is full. If set to False, use FIFO popping method.
            Defaults to True.
        max_refetch (int): The maximum number of iterations. If the number of
            iterations is greater than `max_refetch`, but gt_bbox is still
            empty, then the iteration is terminated. Defaults to 15.
    """

    def __init__(self,
                 alpha: float = 32.0,
                 beta: float = 32.0,
                 pre_transform: Sequence[dict] = None,
                 prob: float = 1.0,
                 use_cached: bool = False,
                 max_cached_images: int = 20,
                 random_pop: bool = True,
                 max_refetch: int = 15):
        if use_cached:
            assert max_cached_images >= 2, 'The length of cache must >= 2, ' \
                                           f'but got {max_cached_images}.'
        super().__init__(
            pre_transform=pre_transform,
            prob=prob,
            use_cached=use_cached,
            max_cached_images=max_cached_images,
            random_pop=random_pop,
            max_refetch=max_refetch)
        self.alpha = alpha
        self.beta = beta

    def get_indexes(self, dataset: Union[BaseDataset, list]) -> int:
        """Call function to collect indexes.

        Args:
            dataset (:obj:`Dataset` or list): The dataset or cached list.

        Returns:
            int: indexes.
        """
        return random.randint(0, len(dataset))

    def mix_img_transform(self, results: dict) -> dict:
        """YOLOv5 MixUp transform function.

        Args:
            results (dict): Result dict

        Returns:
            results (dict): Updated result dict.
        """
        assert 'mix_results' in results

        retrieve_results = results['mix_results'][0]
        retrieve_img = retrieve_results['img']
        ori_img = results['img']
        assert ori_img.shape == retrieve_img.shape

        # Randomly obtain the fusion ratio from the beta distribution,
        # which is around 0.5
        ratio = np.random.beta(self.alpha, self.beta)
        mixup_img = (ori_img * ratio + retrieve_img * (1 - ratio))

        retrieve_gt_bboxes = retrieve_results['gt_bboxes']
        retrieve_gt_bboxes_labels = retrieve_results['gt_bboxes_labels']
        retrieve_gt_ignore_flags = retrieve_results['gt_ignore_flags']

        mixup_gt_bboxes = retrieve_gt_bboxes.cat(
            (results['gt_bboxes'], retrieve_gt_bboxes), dim=0)
        mixup_gt_bboxes_labels = np.concatenate(
            (results['gt_bboxes_labels'], retrieve_gt_bboxes_labels), axis=0)
        mixup_gt_ignore_flags = np.concatenate(
            (results['gt_ignore_flags'], retrieve_gt_ignore_flags), axis=0)
        if 'gt_masks' in results:
            assert 'gt_masks' in retrieve_results
            mixup_gt_masks = results['gt_masks'].cat(
                [results['gt_masks'], retrieve_results['gt_masks']])
            results['gt_masks'] = mixup_gt_masks

        results['img'] = mixup_img.astype(np.uint8)
        results['img_shape'] = mixup_img.shape
        results['gt_bboxes'] = mixup_gt_bboxes
        results['gt_bboxes_labels'] = mixup_gt_bboxes_labels
        results['gt_ignore_flags'] = mixup_gt_ignore_flags

        return results

+
@TRANSFORMS.register_module()
|
918 |
+
class YOLOXMixUp(BaseMixImageTransform):
|
919 |
+
"""MixUp data augmentation for YOLOX.
|
920 |
+
|
921 |
+
.. code:: text
|
922 |
+
|
923 |
+
mixup transform
|
924 |
+
+---------------+--------------+
|
925 |
+
| mixup image | |
|
926 |
+
| +--------|--------+ |
|
927 |
+
| | | | |
|
928 |
+
+---------------+ | |
|
929 |
+
| | | |
|
930 |
+
| | image | |
|
931 |
+
| | | |
|
932 |
+
| | | |
|
933 |
+
| +-----------------+ |
|
934 |
+
| pad |
|
935 |
+
+------------------------------+
|
936 |
+
|
937 |
+
The mixup transform steps are as follows:
|
938 |
+
|
939 |
+
1. Another random image is picked by dataset and embedded in
|
940 |
+
the top left patch(after padding and resizing)
|
941 |
+
2. The target of mixup transform is the weighted average of mixup
|
942 |
+
image and origin image.
|
943 |
+
|
944 |
+
Required Keys:
|
945 |
+
|
946 |
+
- img
|
947 |
+
- gt_bboxes (BaseBoxes[torch.float32]) (optional)
|
948 |
+
- gt_bboxes_labels (np.int64) (optional)
|
949 |
+
- gt_ignore_flags (bool) (optional)
|
950 |
+
- mix_results (List[dict])
|
951 |
+
|
952 |
+
|
953 |
+
Modified Keys:
|
954 |
+
|
955 |
+
- img
|
956 |
+
- img_shape
|
957 |
+
- gt_bboxes (optional)
|
958 |
+
- gt_bboxes_labels (optional)
|
959 |
+
- gt_ignore_flags (optional)
|
960 |
+
|
961 |
+
|
962 |
+
Args:
|
963 |
+
img_scale (Sequence[int]): Image output size after mixup pipeline.
|
964 |
+
The shape order should be (width, height). Defaults to (640, 640).
|
965 |
+
ratio_range (Sequence[float]): Scale ratio of mixup image.
|
966 |
+
Defaults to (0.5, 1.5).
|
967 |
+
flip_ratio (float): Horizontal flip ratio of mixup image.
|
968 |
+
Defaults to 0.5.
|
969 |
+
pad_val (int): Pad value. Defaults to 114.
|
970 |
+
bbox_clip_border (bool, optional): Whether to clip the objects outside
|
971 |
+
the border of the image. In some dataset like MOT17, the gt bboxes
|
972 |
+
are allowed to cross the border of images. Therefore, we don't
|
973 |
+
need to clip the gt bboxes in these cases. Defaults to True.
|
974 |
+
pre_transform(Sequence[dict]): Sequence of transform object or
|
975 |
+
config dict to be composed.
|
976 |
+
prob (float): Probability of applying this transformation.
|
977 |
+
Defaults to 1.0.
|
978 |
+
use_cached (bool): Whether to use cache. Defaults to False.
|
979 |
+
max_cached_images (int): The maximum length of the cache. The larger
|
980 |
+
the cache, the stronger the randomness of this transform. As a
|
981 |
+
rule of thumb, providing 10 caches for each image suffices for
|
982 |
+
randomness. Defaults to 20.
|
983 |
+
random_pop (bool): Whether to randomly pop a result from the cache
|
984 |
+
when the cache is full. If set to False, use FIFO popping method.
|
985 |
+
Defaults to True.
|
986 |
+
max_refetch (int): The maximum number of iterations. If the number of
|
987 |
+
iterations is greater than `max_refetch`, but gt_bbox is still
|
988 |
+
empty, then the iteration is terminated. Defaults to 15.
|
989 |
+
"""
|
990 |
+
|
991 |
+
def __init__(self,
|
992 |
+
img_scale: Tuple[int, int] = (640, 640),
|
993 |
+
ratio_range: Tuple[float, float] = (0.5, 1.5),
|
994 |
+
flip_ratio: float = 0.5,
|
995 |
+
pad_val: float = 114.0,
|
996 |
+
bbox_clip_border: bool = True,
|
997 |
+
pre_transform: Sequence[dict] = None,
|
998 |
+
prob: float = 1.0,
|
999 |
+
use_cached: bool = False,
|
1000 |
+
max_cached_images: int = 20,
|
1001 |
+
random_pop: bool = True,
|
1002 |
+
max_refetch: int = 15):
|
1003 |
+
assert isinstance(img_scale, tuple)
|
1004 |
+
if use_cached:
|
1005 |
+
assert max_cached_images >= 2, 'The length of cache must >= 2, ' \
|
1006 |
+
f'but got {max_cached_images}.'
|
1007 |
+
super().__init__(
|
1008 |
+
pre_transform=pre_transform,
|
1009 |
+
prob=prob,
|
1010 |
+
use_cached=use_cached,
|
1011 |
+
max_cached_images=max_cached_images,
|
1012 |
+
random_pop=random_pop,
|
1013 |
+
max_refetch=max_refetch)
|
1014 |
+
self.img_scale = img_scale
|
1015 |
+
self.ratio_range = ratio_range
|
1016 |
+
self.flip_ratio = flip_ratio
|
1017 |
+
self.pad_val = pad_val
|
1018 |
+
self.bbox_clip_border = bbox_clip_border
|
1019 |
+
|
1020 |
+
def get_indexes(self, dataset: Union[BaseDataset, list]) -> int:
|
1021 |
+
"""Call function to collect indexes.
|
1022 |
+
|
1023 |
+
Args:
|
1024 |
+
dataset (:obj:`Dataset` or list): The dataset or cached list.
|
1025 |
+
|
1026 |
+
Returns:
|
1027 |
+
int: indexes.
|
1028 |
+
"""
|
1029 |
+
return random.randint(0, len(dataset))
|
1030 |
+
|
1031 |
+
def mix_img_transform(self, results: dict) -> dict:
|
1032 |
+
"""YOLOX MixUp transform function.
|
1033 |
+
|
1034 |
+
Args:
|
1035 |
+
results (dict): Result dict.
|
1036 |
+
|
1037 |
+
Returns:
|
1038 |
+
results (dict): Updated result dict.
|
1039 |
+
"""
|
1040 |
+
assert 'mix_results' in results
|
1041 |
+
assert len(
|
1042 |
+
results['mix_results']) == 1, 'MixUp only support 2 images now !'
|
1043 |
+
|
1044 |
+
if results['mix_results'][0]['gt_bboxes'].shape[0] == 0:
|
1045 |
+
# empty bbox
|
1046 |
+
return results
|
1047 |
+
|
1048 |
+
retrieve_results = results['mix_results'][0]
|
1049 |
+
retrieve_img = retrieve_results['img']
|
1050 |
+
|
1051 |
+
jit_factor = random.uniform(*self.ratio_range)
|
1052 |
+
is_filp = random.uniform(0, 1) > self.flip_ratio
|
1053 |
+
|
1054 |
+
if len(retrieve_img.shape) == 3:
|
1055 |
+
out_img = np.ones((self.img_scale[1], self.img_scale[0], 3),
|
1056 |
+
dtype=retrieve_img.dtype) * self.pad_val
|
1057 |
+
else:
|
1058 |
+
out_img = np.ones(
|
1059 |
+
self.img_scale[::-1], dtype=retrieve_img.dtype) * self.pad_val
|
1060 |
+
|
1061 |
+
# 1. keep_ratio resize
|
1062 |
+
scale_ratio = min(self.img_scale[1] / retrieve_img.shape[0],
|
1063 |
+
self.img_scale[0] / retrieve_img.shape[1])
|
1064 |
+
retrieve_img = mmcv.imresize(
|
1065 |
+
retrieve_img, (int(retrieve_img.shape[1] * scale_ratio),
|
1066 |
+
int(retrieve_img.shape[0] * scale_ratio)))
|
1067 |
+
|
1068 |
+
# 2. paste
|
1069 |
+
out_img[:retrieve_img.shape[0], :retrieve_img.shape[1]] = retrieve_img
|
1070 |
+
|
1071 |
+
# 3. scale jit
|
1072 |
+
scale_ratio *= jit_factor
|
1073 |
+
out_img = mmcv.imresize(out_img, (int(out_img.shape[1] * jit_factor),
|
1074 |
+
int(out_img.shape[0] * jit_factor)))
|
1075 |
+
|
1076 |
+
# 4. flip
|
1077 |
+
if is_filp:
|
1078 |
+
out_img = out_img[:, ::-1, :]
|
1079 |
+
|
1080 |
+
# 5. random crop
|
1081 |
+
ori_img = results['img']
|
1082 |
+
origin_h, origin_w = out_img.shape[:2]
|
1083 |
+
target_h, target_w = ori_img.shape[:2]
|
1084 |
+
padded_img = np.ones((max(origin_h, target_h), max(
|
1085 |
+
origin_w, target_w), 3)) * self.pad_val
|
1086 |
+
padded_img = padded_img.astype(np.uint8)
|
1087 |
+
padded_img[:origin_h, :origin_w] = out_img
|
1088 |
+
|
1089 |
+
x_offset, y_offset = 0, 0
|
1090 |
+
if padded_img.shape[0] > target_h:
|
1091 |
+
y_offset = random.randint(0, padded_img.shape[0] - target_h)
|
1092 |
+
if padded_img.shape[1] > target_w:
|
1093 |
+
x_offset = random.randint(0, padded_img.shape[1] - target_w)
|
1094 |
+
padded_cropped_img = padded_img[y_offset:y_offset + target_h,
|
1095 |
+
x_offset:x_offset + target_w]
|
1096 |
+
|
1097 |
+
# 6. adjust bbox
|
1098 |
+
retrieve_gt_bboxes = retrieve_results['gt_bboxes']
|
1099 |
+
retrieve_gt_bboxes.rescale_([scale_ratio, scale_ratio])
|
1100 |
+
if self.bbox_clip_border:
|
1101 |
+
retrieve_gt_bboxes.clip_([origin_h, origin_w])
|
1102 |
+
|
1103 |
+
if is_filp:
|
1104 |
+
retrieve_gt_bboxes.flip_([origin_h, origin_w],
|
1105 |
+
direction='horizontal')
|
1106 |
+
|
1107 |
+
# 7. filter
|
1108 |
+
cp_retrieve_gt_bboxes = retrieve_gt_bboxes.clone()
|
1109 |
+
cp_retrieve_gt_bboxes.translate_([-x_offset, -y_offset])
|
1110 |
+
if self.bbox_clip_border:
|
1111 |
+
cp_retrieve_gt_bboxes.clip_([target_h, target_w])
|
1112 |
+
|
1113 |
+
# 8. mix up
|
1114 |
+
mixup_img = 0.5 * ori_img + 0.5 * padded_cropped_img
|
1115 |
+
|
1116 |
+
retrieve_gt_bboxes_labels = retrieve_results['gt_bboxes_labels']
|
1117 |
+
retrieve_gt_ignore_flags = retrieve_results['gt_ignore_flags']
|
1118 |
+
|
1119 |
+
mixup_gt_bboxes = cp_retrieve_gt_bboxes.cat(
|
1120 |
+
(results['gt_bboxes'], cp_retrieve_gt_bboxes), dim=0)
|
1121 |
+
mixup_gt_bboxes_labels = np.concatenate(
|
1122 |
+
(results['gt_bboxes_labels'], retrieve_gt_bboxes_labels), axis=0)
|
1123 |
+
mixup_gt_ignore_flags = np.concatenate(
|
1124 |
+
(results['gt_ignore_flags'], retrieve_gt_ignore_flags), axis=0)
|
1125 |
+
|
1126 |
+
if not self.bbox_clip_border:
|
1127 |
+
# remove outside bbox
|
1128 |
+
inside_inds = mixup_gt_bboxes.is_inside([target_h,
|
1129 |
+
target_w]).numpy()
|
1130 |
+
mixup_gt_bboxes = mixup_gt_bboxes[inside_inds]
|
1131 |
+
mixup_gt_bboxes_labels = mixup_gt_bboxes_labels[inside_inds]
|
1132 |
+
mixup_gt_ignore_flags = mixup_gt_ignore_flags[inside_inds]
|
1133 |
+
|
1134 |
+
results['img'] = mixup_img.astype(np.uint8)
|
1135 |
+
results['img_shape'] = mixup_img.shape
|
1136 |
+
results['gt_bboxes'] = mixup_gt_bboxes
|
1137 |
+
results['gt_bboxes_labels'] = mixup_gt_bboxes_labels
|
1138 |
+
results['gt_ignore_flags'] = mixup_gt_ignore_flags
|
1139 |
+
|
1140 |
+
return results
|
1141 |
+
|
1142 |
+
def __repr__(self) -> str:
|
1143 |
+
repr_str = self.__class__.__name__
|
1144 |
+
repr_str += f'(img_scale={self.img_scale}, '
|
1145 |
+
repr_str += f'ratio_range={self.ratio_range}, '
|
1146 |
+
repr_str += f'flip_ratio={self.flip_ratio}, '
|
1147 |
+
repr_str += f'pad_val={self.pad_val}, '
|
1148 |
+
repr_str += f'max_refetch={self.max_refetch}, '
|
1149 |
+
repr_str += f'bbox_clip_border={self.bbox_clip_border})'
|
1150 |
+
return repr_str
|
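For context, a minimal sketch of the YOLOX-style combination of the two transforms in this file, mirroring the step list in the YOLOXMixUp docstring. The affine step in between is assumed from the usual YOLOX pipeline and is not defined in this file; ratio_range and img_scale values are illustrative:

train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True),
    dict(
        type='Mosaic',
        img_scale=(640, 640),
        pad_val=114.0,
        pre_transform=[
            dict(type='LoadImageFromFile'),
            dict(type='LoadAnnotations', with_bbox=True)
        ]),
    # assumption: a RandomAffine-style transform normally sits here
    dict(
        type='YOLOXMixUp',
        img_scale=(640, 640),
        ratio_range=(0.8, 1.6),   # illustrative; the default is (0.5, 1.5)
        pad_val=114.0,
        pre_transform=[
            dict(type='LoadImageFromFile'),
            dict(type='LoadAnnotations', with_bbox=True)
        ]),
]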
mmyolo/datasets/transforms/transforms.py
ADDED
@@ -0,0 +1,1557 @@
# Copyright (c) OpenMMLab. All rights reserved.
import math
from copy import deepcopy
from typing import List, Sequence, Tuple, Union

import cv2
import mmcv
import numpy as np
import torch
from mmcv.transforms import BaseTransform, Compose
from mmcv.transforms.utils import cache_randomness
from mmdet.datasets.transforms import LoadAnnotations as MMDET_LoadAnnotations
from mmdet.datasets.transforms import Resize as MMDET_Resize
from mmdet.structures.bbox import (HorizontalBoxes, autocast_box_type,
                                   get_box_type)
from mmdet.structures.mask import PolygonMasks
from numpy import random

from mmyolo.registry import TRANSFORMS

# TODO: Waiting for MMCV support
TRANSFORMS.register_module(module=Compose, force=True)

@TRANSFORMS.register_module()
class YOLOv5KeepRatioResize(MMDET_Resize):
    """Resize images & bboxes (if present).

    This transform resizes the input image according to ``scale``.
    Bboxes (if present) are then resized with the same scale factor.

    Required Keys:

    - img (np.uint8)
    - gt_bboxes (BaseBoxes[torch.float32]) (optional)

    Modified Keys:

    - img (np.uint8)
    - img_shape (tuple)
    - gt_bboxes (optional)
    - scale (float)

    Added Keys:

    - scale_factor (np.float32)

    Args:
        scale (Union[int, Tuple[int, int]]): Images scales for resizing.
    """

    def __init__(self,
                 scale: Union[int, Tuple[int, int]],
                 keep_ratio: bool = True,
                 **kwargs):
        assert keep_ratio is True
        super().__init__(scale=scale, keep_ratio=True, **kwargs)

    @staticmethod
    def _get_rescale_ratio(old_size: Tuple[int, int],
                           scale: Union[float, Tuple[int]]) -> float:
        """Calculate the ratio for rescaling.

        Args:
            old_size (tuple[int]): The old size (w, h) of image.
            scale (float | tuple[int]): The scaling factor or maximum size.
                If it is a float number, then the image will be rescaled by
                this factor, else if it is a tuple of 2 integers, then
                the image will be rescaled as large as possible within
                the scale.

        Returns:
            float: The resize ratio.
        """
        w, h = old_size
        if isinstance(scale, (float, int)):
            if scale <= 0:
                raise ValueError(f'Invalid scale {scale}, must be positive.')
            scale_factor = scale
        elif isinstance(scale, tuple):
            max_long_edge = max(scale)
            max_short_edge = min(scale)
            scale_factor = min(max_long_edge / max(h, w),
                               max_short_edge / min(h, w))
        else:
            raise TypeError('Scale must be a number or tuple of int, '
                            f'but got {type(scale)}')

        return scale_factor

    def _resize_img(self, results: dict):
        """Resize images with ``results['scale']``."""
        assert self.keep_ratio is True

        if results.get('img', None) is not None:
            image = results['img']
            original_h, original_w = image.shape[:2]
            ratio = self._get_rescale_ratio((original_h, original_w),
                                            self.scale)

            if ratio != 1:
                # resize image according to the ratio
                image = mmcv.imrescale(
                    img=image,
                    scale=ratio,
                    interpolation='area' if ratio < 1 else 'bilinear',
                    backend=self.backend)

            resized_h, resized_w = image.shape[:2]
            scale_ratio = resized_h / original_h

            scale_factor = (scale_ratio, scale_ratio)

            results['img'] = image
            results['img_shape'] = image.shape[:2]
            results['scale_factor'] = scale_factor

@TRANSFORMS.register_module()
class LetterResize(MMDET_Resize):
    """Resize and pad image while meeting stride-multiple constraints.

    Required Keys:

    - img (np.uint8)
    - batch_shape (np.int64) (optional)

    Modified Keys:

    - img (np.uint8)
    - img_shape (tuple)
    - gt_bboxes (optional)

    Added Keys:

    - pad_param (np.float32)

    Args:
        scale (Union[int, Tuple[int, int]]): Images scales for resizing.
        pad_val (dict): Padding value. Defaults to dict(img=0, seg=255).
        use_mini_pad (bool): Whether using minimum rectangle padding.
            Defaults to False.
        stretch_only (bool): Whether stretch to the specified size directly.
            Defaults to False.
        allow_scale_up (bool): Allow scale up when ratio > 1. Defaults to True.
    """

    def __init__(self,
                 scale: Union[int, Tuple[int, int]],
                 pad_val: dict = dict(img=0, mask=0, seg=255),
                 use_mini_pad: bool = False,
                 stretch_only: bool = False,
                 allow_scale_up: bool = True,
                 **kwargs):
        super().__init__(scale=scale, keep_ratio=True, **kwargs)

        self.pad_val = pad_val
        if isinstance(pad_val, (int, float)):
            pad_val = dict(img=pad_val, seg=255)
        assert isinstance(
            pad_val, dict), f'pad_val must be dict, but got {type(pad_val)}'

        self.use_mini_pad = use_mini_pad
        self.stretch_only = stretch_only
        self.allow_scale_up = allow_scale_up

    def _resize_img(self, results: dict):
        """Resize images with ``results['scale']``."""
        image = results.get('img', None)
        if image is None:
            return

        # Use batch_shape if a batch_shape policy is configured
        if 'batch_shape' in results:
            scale = tuple(results['batch_shape'])  # hw
        else:
            scale = self.scale[::-1]  # wh -> hw

        image_shape = image.shape[:2]  # height, width

        # Scale ratio (new / old)
        ratio = min(scale[0] / image_shape[0], scale[1] / image_shape[1])

        # only scale down, do not scale up (for better test mAP)
        if not self.allow_scale_up:
            ratio = min(ratio, 1.0)

        ratio = [ratio, ratio]  # float -> (float, float) for (height, width)

        # compute the best size of the image
        no_pad_shape = (int(round(image_shape[0] * ratio[0])),
                        int(round(image_shape[1] * ratio[1])))

        # padding height & width
        padding_h, padding_w = [
            scale[0] - no_pad_shape[0], scale[1] - no_pad_shape[1]
        ]
        if self.use_mini_pad:
            # minimum rectangle padding
            padding_w, padding_h = np.mod(padding_w, 32), np.mod(padding_h, 32)

        elif self.stretch_only:
            # stretch to the specified size directly
            padding_h, padding_w = 0.0, 0.0
            no_pad_shape = (scale[0], scale[1])
            ratio = [scale[0] / image_shape[0],
                     scale[1] / image_shape[1]]  # height, width ratios

        if image_shape != no_pad_shape:
            # compare with no resize and padding size
            image = mmcv.imresize(
                image, (no_pad_shape[1], no_pad_shape[0]),
                interpolation=self.interpolation,
                backend=self.backend)

        scale_factor = (ratio[1], ratio[0])  # mmcv scale factor is (w, h)

        if 'scale_factor' in results:
            results['scale_factor_origin'] = results['scale_factor']
        results['scale_factor'] = scale_factor

        # padding
        top_padding, left_padding = int(round(padding_h // 2 - 0.1)), int(
            round(padding_w // 2 - 0.1))
        bottom_padding = padding_h - top_padding
        right_padding = padding_w - left_padding

        padding_list = [
            top_padding, bottom_padding, left_padding, right_padding
        ]
        if top_padding != 0 or bottom_padding != 0 or \
                left_padding != 0 or right_padding != 0:

            pad_val = self.pad_val.get('img', 0)
            if isinstance(pad_val, int) and image.ndim == 3:
                pad_val = tuple(pad_val for _ in range(image.shape[2]))

            image = mmcv.impad(
                img=image,
                padding=(padding_list[2], padding_list[0], padding_list[3],
                         padding_list[1]),
                pad_val=pad_val,
                padding_mode='constant')

        results['img'] = image
        results['img_shape'] = image.shape
        if 'pad_param' in results:
            results['pad_param_origin'] = results['pad_param'] * \
                np.repeat(ratio, 2)
        results['pad_param'] = np.array(padding_list, dtype=np.float32)

    def _resize_masks(self, results: dict):
        """Resize masks with ``results['scale']``."""
        if results.get('gt_masks', None) is None:
            return

        gt_masks = results['gt_masks']
        assert isinstance(
            gt_masks, PolygonMasks
        ), f'Only supports PolygonMasks, but got {type(gt_masks)}'

        # resize the gt_masks
        gt_mask_h = results['gt_masks'].height * results['scale_factor'][1]
        gt_mask_w = results['gt_masks'].width * results['scale_factor'][0]
        gt_masks = results['gt_masks'].resize(
            (int(round(gt_mask_h)), int(round(gt_mask_w))))

        top_padding, _, left_padding, _ = results['pad_param']
        if int(left_padding) != 0:
            gt_masks = gt_masks.translate(
                out_shape=results['img_shape'][:2],
                offset=int(left_padding),
                direction='horizontal')
        if int(top_padding) != 0:
            gt_masks = gt_masks.translate(
                out_shape=results['img_shape'][:2],
                offset=int(top_padding),
                direction='vertical')
        results['gt_masks'] = gt_masks

    def _resize_bboxes(self, results: dict):
        """Resize bounding boxes with ``results['scale_factor']``."""
        if results.get('gt_bboxes', None) is None:
            return
        results['gt_bboxes'].rescale_(results['scale_factor'])

        if len(results['pad_param']) != 4:
            return
        results['gt_bboxes'].translate_(
            (results['pad_param'][2], results['pad_param'][0]))

        if self.clip_object_border:
            results['gt_bboxes'].clip_(results['img_shape'])

    def transform(self, results: dict) -> dict:
        results = super().transform(results)
        if 'scale_factor_origin' in results:
            scale_factor_origin = results.pop('scale_factor_origin')
            results['scale_factor'] = (results['scale_factor'][0] *
                                       scale_factor_origin[0],
                                       results['scale_factor'][1] *
                                       scale_factor_origin[1])
        if 'pad_param_origin' in results:
            pad_param_origin = results.pop('pad_param_origin')
            results['pad_param'] += pad_param_origin
        return results
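
# Note (not part of the diff above): the two resize transforms are usually
# chained in a test/val pipeline, letterboxing the keep-ratio-resized image to
# the final input size. The sketch below is an illustrative assumption about a
# typical MMYOLO pipeline config; the 640x640 scale and pad value 114 are
# example choices, not values taken from this file.
example_test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='YOLOv5KeepRatioResize', scale=(640, 640)),
    dict(
        type='LetterResize',
        scale=(640, 640),
        allow_scale_up=False,
        pad_val=dict(img=114)),
    dict(type='LoadAnnotations', with_bbox=True),
]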

# TODO: Check if it can be merged with mmdet.YOLOXHSVRandomAug
@TRANSFORMS.register_module()
class YOLOv5HSVRandomAug(BaseTransform):
    """Apply HSV augmentation to image sequentially.

    Required Keys:

    - img

    Modified Keys:

    - img

    Args:
        hue_delta ([int, float]): delta of hue. Defaults to 0.015.
        saturation_delta ([int, float]): delta of saturation. Defaults to 0.7.
        value_delta ([int, float]): delta of value. Defaults to 0.4.
    """

    def __init__(self,
                 hue_delta: Union[int, float] = 0.015,
                 saturation_delta: Union[int, float] = 0.7,
                 value_delta: Union[int, float] = 0.4):
        self.hue_delta = hue_delta
        self.saturation_delta = saturation_delta
        self.value_delta = value_delta

    def transform(self, results: dict) -> dict:
        """The HSV augmentation transform function.

        Args:
            results (dict): The result dict.

        Returns:
            dict: The result dict.
        """
        hsv_gains = \
            random.uniform(-1, 1, 3) * \
            [self.hue_delta, self.saturation_delta, self.value_delta] + 1
        hue, sat, val = cv2.split(
            cv2.cvtColor(results['img'], cv2.COLOR_BGR2HSV))

        table_list = np.arange(0, 256, dtype=hsv_gains.dtype)
        lut_hue = ((table_list * hsv_gains[0]) % 180).astype(np.uint8)
        lut_sat = np.clip(table_list * hsv_gains[1], 0, 255).astype(np.uint8)
        lut_val = np.clip(table_list * hsv_gains[2], 0, 255).astype(np.uint8)

        im_hsv = cv2.merge(
            (cv2.LUT(hue, lut_hue), cv2.LUT(sat,
                                            lut_sat), cv2.LUT(val, lut_val)))
        results['img'] = cv2.cvtColor(im_hsv, cv2.COLOR_HSV2BGR)
        return results

    def __repr__(self) -> str:
        repr_str = self.__class__.__name__
        repr_str += f'(hue_delta={self.hue_delta}, '
        repr_str += f'saturation_delta={self.saturation_delta}, '
        repr_str += f'value_delta={self.value_delta})'
        return repr_str
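
# Note (illustrative, not part of the diff above): the transform draws one
# multiplicative gain per HSV channel from [1 - delta, 1 + delta] and applies
# it through a 256-entry lookup table. A standalone numpy sketch of that gain
# and LUT construction, with made-up random seed:
#
#     import numpy as np
#     rng = np.random.default_rng(0)
#     hue_delta, saturation_delta, value_delta = 0.015, 0.7, 0.4
#     gains = rng.uniform(-1, 1, 3) * [hue_delta, saturation_delta,
#                                      value_delta] + 1
#     table = np.arange(256, dtype=np.float64)
#     lut_hue = ((table * gains[0]) % 180).astype(np.uint8)   # hue wraps at 180
#     lut_sat = np.clip(table * gains[1], 0, 255).astype(np.uint8)
#     lut_val = np.clip(table * gains[2], 0, 255).astype(np.uint8)
#     print(gains, lut_hue[:5], lut_sat[:5], lut_val[:5])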

@TRANSFORMS.register_module()
class LoadAnnotations(MMDET_LoadAnnotations):
    """Because the yolo series does not need to consider ignore bboxes for the
    time being, in order to speed up the pipeline, it can be excluded in
    advance."""

    def __init__(self,
                 mask2bbox: bool = False,
                 poly2mask: bool = False,
                 **kwargs) -> None:
        self.mask2bbox = mask2bbox
        assert not poly2mask, 'Does not support BitmapMasks considering ' \
                              'that bitmap consumes more memory.'
        super().__init__(poly2mask=poly2mask, **kwargs)
        if self.mask2bbox:
            assert self.with_mask, 'Using mask2bbox requires ' \
                                   'with_mask is True.'
        self._mask_ignore_flag = None

    def transform(self, results: dict) -> dict:
        """Function to load multiple types annotations.

        Args:
            results (dict): Result dict from :obj:``mmengine.BaseDataset``.

        Returns:
            dict: The dict contains loaded bounding box, label and
            semantic segmentation.
        """
        if self.mask2bbox:
            self._load_masks(results)
            if self.with_label:
                self._load_labels(results)
                self._update_mask_ignore_data(results)
            gt_bboxes = results['gt_masks'].get_bboxes(dst_type='hbox')
            results['gt_bboxes'] = gt_bboxes
        else:
            results = super().transform(results)
            self._update_mask_ignore_data(results)
        return results

    def _update_mask_ignore_data(self, results: dict) -> None:
        if 'gt_masks' not in results:
            return

        if 'gt_bboxes_labels' in results and len(
                results['gt_bboxes_labels']) != len(results['gt_masks']):
            assert len(results['gt_bboxes_labels']) == len(
                self._mask_ignore_flag)
            results['gt_bboxes_labels'] = results['gt_bboxes_labels'][
                self._mask_ignore_flag]

        if 'gt_bboxes' in results and len(results['gt_bboxes']) != len(
                results['gt_masks']):
            assert len(results['gt_bboxes']) == len(self._mask_ignore_flag)
            results['gt_bboxes'] = results['gt_bboxes'][self._mask_ignore_flag]

    def _load_bboxes(self, results: dict):
        """Private function to load bounding box annotations.
        Note: BBoxes with ignore_flag of 1 is not considered.
        Args:
            results (dict): Result dict from :obj:``mmengine.BaseDataset``.

        Returns:
            dict: The dict contains loaded bounding box annotations.
        """
        gt_bboxes = []
        gt_ignore_flags = []
        for instance in results.get('instances', []):
            if instance['ignore_flag'] == 0:
                gt_bboxes.append(instance['bbox'])
                gt_ignore_flags.append(instance['ignore_flag'])
        results['gt_ignore_flags'] = np.array(gt_ignore_flags, dtype=bool)

        if self.box_type is None:
            results['gt_bboxes'] = np.array(
                gt_bboxes, dtype=np.float32).reshape((-1, 4))
        else:
            _, box_type_cls = get_box_type(self.box_type)
            results['gt_bboxes'] = box_type_cls(gt_bboxes, dtype=torch.float32)

    def _load_labels(self, results: dict):
        """Private function to load label annotations.

        Note: BBoxes with ignore_flag of 1 is not considered.
        Args:
            results (dict): Result dict from :obj:``mmengine.BaseDataset``.
        Returns:
            dict: The dict contains loaded label annotations.
        """
        gt_bboxes_labels = []
        for instance in results.get('instances', []):
            if instance['ignore_flag'] == 0:
                gt_bboxes_labels.append(instance['bbox_label'])
        results['gt_bboxes_labels'] = np.array(
            gt_bboxes_labels, dtype=np.int64)

    def _load_masks(self, results: dict) -> None:
        """Private function to load mask annotations.

        Args:
            results (dict): Result dict from :obj:``mmengine.BaseDataset``.
        """
        gt_masks = []
        gt_ignore_flags = []
        self._mask_ignore_flag = []
        for instance in results.get('instances', []):
            if instance['ignore_flag'] == 0:
                if 'mask' in instance:
                    gt_mask = instance['mask']
                    if isinstance(gt_mask, list):
                        gt_mask = [
                            np.array(polygon) for polygon in gt_mask
                            if len(polygon) % 2 == 0 and len(polygon) >= 6
                        ]
                        if len(gt_mask) == 0:
                            # ignore
                            self._mask_ignore_flag.append(0)
                        else:
                            gt_masks.append(gt_mask)
                            gt_ignore_flags.append(instance['ignore_flag'])
                            self._mask_ignore_flag.append(1)
                    else:
                        raise NotImplementedError(
                            'Only supports mask annotations in polygon '
                            'format currently')
                else:
                    # TODO: Actually, gt with bbox and without mask needs
                    # to be retained
                    self._mask_ignore_flag.append(0)
        self._mask_ignore_flag = np.array(self._mask_ignore_flag, dtype=bool)
        results['gt_ignore_flags'] = np.array(gt_ignore_flags, dtype=bool)

        h, w = results['ori_shape']
        gt_masks = PolygonMasks([mask for mask in gt_masks], h, w)
        results['gt_masks'] = gt_masks

    def __repr__(self) -> str:
        repr_str = self.__class__.__name__
        repr_str += f'(with_bbox={self.with_bbox}, '
        repr_str += f'with_label={self.with_label}, '
        repr_str += f'with_mask={self.with_mask}, '
        repr_str += f'with_seg={self.with_seg}, '
        repr_str += f'mask2bbox={self.mask2bbox}, '
        repr_str += f'poly2mask={self.poly2mask}, '
        repr_str += f"imdecode_backend='{self.imdecode_backend}', "
        repr_str += f'file_client_args={self.file_client_args})'
        return repr_str

@TRANSFORMS.register_module()
class YOLOv5RandomAffine(BaseTransform):
    """Random affine transform data augmentation in YOLOv5 and YOLOv8. It is
    different from the implementation in YOLOX.

    This operation randomly generates affine transform matrix which including
    rotation, translation, shear and scaling transforms.
    If you set use_mask_refine == True, the code will use the masks
    annotation to refine the bbox.
    Our implementation is slightly different from the official. In COCO
    dataset, a gt may have multiple mask tags. The official YOLOv5
    annotation file already combines the masks that an object has,
    but our code takes into account the fact that an object has multiple masks.

    Required Keys:

    - img
    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
    - gt_bboxes_labels (np.int64) (optional)
    - gt_ignore_flags (bool) (optional)
    - gt_masks (PolygonMasks) (optional)

    Modified Keys:

    - img
    - img_shape
    - gt_bboxes (optional)
    - gt_bboxes_labels (optional)
    - gt_ignore_flags (optional)
    - gt_masks (PolygonMasks) (optional)

    Args:
        max_rotate_degree (float): Maximum degrees of rotation transform.
            Defaults to 10.
        max_translate_ratio (float): Maximum ratio of translation.
            Defaults to 0.1.
        scaling_ratio_range (tuple[float]): Min and max ratio of
            scaling transform. Defaults to (0.5, 1.5).
        max_shear_degree (float): Maximum degrees of shear
            transform. Defaults to 2.
        border (tuple[int]): Distance from width and height sides of input
            image to adjust output shape. Only used in mosaic dataset.
            Defaults to (0, 0).
        border_val (tuple[int]): Border padding values of 3 channels.
            Defaults to (114, 114, 114).
        bbox_clip_border (bool, optional): Whether to clip the objects outside
            the border of the image. In some dataset like MOT17, the gt bboxes
            are allowed to cross the border of images. Therefore, we don't
            need to clip the gt bboxes in these cases. Defaults to True.
        min_bbox_size (float): Width and height threshold to filter bboxes.
            If the height or width of a box is smaller than this value, it
            will be removed. Defaults to 2.
        min_area_ratio (float): Threshold of area ratio between
            original bboxes and wrapped bboxes. If smaller than this value,
            the box will be removed. Defaults to 0.1.
        use_mask_refine (bool): Whether to refine bbox by mask.
        max_aspect_ratio (float): Aspect ratio of width and height
            threshold to filter bboxes. If max(h/w, w/h) larger than this
            value, the box will be removed. Defaults to 20.
        resample_num (int): Number of poly to resample to.
    """

    def __init__(self,
                 max_rotate_degree: float = 10.0,
                 max_translate_ratio: float = 0.1,
                 scaling_ratio_range: Tuple[float, float] = (0.5, 1.5),
                 max_shear_degree: float = 2.0,
                 border: Tuple[int, int] = (0, 0),
                 border_val: Tuple[int, int, int] = (114, 114, 114),
                 bbox_clip_border: bool = True,
                 min_bbox_size: int = 2,
                 min_area_ratio: float = 0.1,
                 use_mask_refine: bool = False,
                 max_aspect_ratio: float = 20.,
                 resample_num: int = 1000):
        assert 0 <= max_translate_ratio <= 1
        assert scaling_ratio_range[0] <= scaling_ratio_range[1]
        assert scaling_ratio_range[0] > 0
        self.max_rotate_degree = max_rotate_degree
        self.max_translate_ratio = max_translate_ratio
        self.scaling_ratio_range = scaling_ratio_range
        self.max_shear_degree = max_shear_degree
        self.border = border
        self.border_val = border_val
        self.bbox_clip_border = bbox_clip_border
        self.min_bbox_size = min_bbox_size
        self.min_area_ratio = min_area_ratio
        self.use_mask_refine = use_mask_refine
        self.max_aspect_ratio = max_aspect_ratio
        self.resample_num = resample_num

    @autocast_box_type()
    def transform(self, results: dict) -> dict:
        """The YOLOv5 random affine transform function.

        Args:
            results (dict): The result dict.

        Returns:
            dict: The result dict.
        """
        img = results['img']
        # self.border is wh format
        height = img.shape[0] + self.border[1] * 2
        width = img.shape[1] + self.border[0] * 2

        # Note: Different from YOLOX
        center_matrix = np.eye(3, dtype=np.float32)
        center_matrix[0, 2] = -img.shape[1] / 2
        center_matrix[1, 2] = -img.shape[0] / 2

        warp_matrix, scaling_ratio = self._get_random_homography_matrix(
            height, width)
        warp_matrix = warp_matrix @ center_matrix

        img = cv2.warpPerspective(
            img,
            warp_matrix,
            dsize=(width, height),
            borderValue=self.border_val)
        results['img'] = img
        results['img_shape'] = img.shape
        img_h, img_w = img.shape[:2]

        bboxes = results['gt_bboxes']
        num_bboxes = len(bboxes)
        if num_bboxes:
            orig_bboxes = bboxes.clone()
            if self.use_mask_refine and 'gt_masks' in results:
                # If the dataset has annotations of mask,
                # the mask will be used to refine bbox.
                gt_masks = results['gt_masks']

                gt_masks_resample = self.resample_masks(gt_masks)
                gt_masks = self.warp_mask(gt_masks_resample, warp_matrix,
                                          img_h, img_w)

                # refine bboxes by masks
                bboxes = gt_masks.get_bboxes(dst_type='hbox')
                # filter bboxes outside image
                valid_index = self.filter_gt_bboxes(orig_bboxes,
                                                    bboxes).numpy()
                results['gt_masks'] = gt_masks[valid_index]
            else:
                bboxes.project_(warp_matrix)
                if self.bbox_clip_border:
                    bboxes.clip_([height, width])

                # filter bboxes
                orig_bboxes.rescale_([scaling_ratio, scaling_ratio])

                # Be careful: valid_index must convert to numpy,
                # otherwise it will raise out of bounds when len(valid_index)=1
                valid_index = self.filter_gt_bboxes(orig_bboxes,
                                                    bboxes).numpy()
                if 'gt_masks' in results:
                    results['gt_masks'] = PolygonMasks(
                        results['gt_masks'].masks, img_h, img_w)

            results['gt_bboxes'] = bboxes[valid_index]
            results['gt_bboxes_labels'] = results['gt_bboxes_labels'][
                valid_index]
            results['gt_ignore_flags'] = results['gt_ignore_flags'][
                valid_index]

        return results

    @staticmethod
    def warp_poly(poly: np.ndarray, warp_matrix: np.ndarray, img_w: int,
                  img_h: int) -> np.ndarray:
        """Function to warp one mask and filter points outside image.

        Args:
            poly (np.ndarray): Segmentation annotation with shape (n, ) and
                with format (x1, y1, x2, y2, ...).
            warp_matrix (np.ndarray): Affine transformation matrix.
                Shape: (3, 3).
            img_w (int): Width of output image.
            img_h (int): Height of output image.
        """
        # TODO: Current logic may cause retained masks unusable for
        # semantic segmentation training, which is same as official
        # implementation.
        poly = poly.reshape((-1, 2))
        poly = np.concatenate((poly, np.ones(
            (len(poly), 1), dtype=poly.dtype)),
                              axis=-1)
        # transform poly
        poly = poly @ warp_matrix.T
        poly = poly[:, :2] / poly[:, 2:3]

        # filter point outside image
        x, y = poly.T
        valid_ind_point = (x >= 0) & (y >= 0) & (x <= img_w) & (y <= img_h)
        return poly[valid_ind_point].reshape(-1)

    def warp_mask(self, gt_masks: PolygonMasks, warp_matrix: np.ndarray,
                  img_w: int, img_h: int) -> PolygonMasks:
        """Warp masks by warp_matrix and retain masks inside image after
        warping.

        Args:
            gt_masks (PolygonMasks): Annotations of semantic segmentation.
            warp_matrix (np.ndarray): Affine transformation matrix.
                Shape: (3, 3).
            img_w (int): Width of output image.
            img_h (int): Height of output image.

        Returns:
            PolygonMasks: Masks after warping.
        """
        masks = gt_masks.masks

        new_masks = []
        for poly_per_obj in masks:
            warpped_poly_per_obj = []
            # One gt may have multiple masks.
            for poly in poly_per_obj:
                valid_poly = self.warp_poly(poly, warp_matrix, img_w, img_h)
                if len(valid_poly):
                    warpped_poly_per_obj.append(valid_poly.reshape(-1))
            # If all the masks are invalid,
            # add [0, 0, 0, 0, 0, 0,] here.
            if not warpped_poly_per_obj:
                # This will be filtered in function `filter_gt_bboxes`.
                warpped_poly_per_obj = [
                    np.zeros(6, dtype=poly_per_obj[0].dtype)
                ]
            new_masks.append(warpped_poly_per_obj)

        gt_masks = PolygonMasks(new_masks, img_h, img_w)
        return gt_masks

    def resample_masks(self, gt_masks: PolygonMasks) -> PolygonMasks:
        """Function to resample each mask annotation with shape (2 * n, ) to
        shape (resample_num * 2, ).

        Args:
            gt_masks (PolygonMasks): Annotations of semantic segmentation.
        """
        masks = gt_masks.masks
        new_masks = []
        for poly_per_obj in masks:
            resample_poly_per_obj = []
            for poly in poly_per_obj:
                poly = poly.reshape((-1, 2))  # xy
                poly = np.concatenate((poly, poly[0:1, :]), axis=0)
                x = np.linspace(0, len(poly) - 1, self.resample_num)
                xp = np.arange(len(poly))
                poly = np.concatenate([
                    np.interp(x, xp, poly[:, i]) for i in range(2)
                ]).reshape(2, -1).T.reshape(-1)
                resample_poly_per_obj.append(poly)
            new_masks.append(resample_poly_per_obj)
        return PolygonMasks(new_masks, gt_masks.height, gt_masks.width)

    def filter_gt_bboxes(self, origin_bboxes: HorizontalBoxes,
                         wrapped_bboxes: HorizontalBoxes) -> torch.Tensor:
        """Filter gt bboxes.

        Args:
            origin_bboxes (HorizontalBoxes): Origin bboxes.
            wrapped_bboxes (HorizontalBoxes): Wrapped bboxes.

        Returns:
            dict: The result dict.
        """
        origin_w = origin_bboxes.widths
        origin_h = origin_bboxes.heights
        wrapped_w = wrapped_bboxes.widths
        wrapped_h = wrapped_bboxes.heights
        aspect_ratio = np.maximum(wrapped_w / (wrapped_h + 1e-16),
                                  wrapped_h / (wrapped_w + 1e-16))

        wh_valid_idx = (wrapped_w > self.min_bbox_size) & \
                       (wrapped_h > self.min_bbox_size)
        area_valid_idx = wrapped_w * wrapped_h / (origin_w * origin_h +
                                                  1e-16) > self.min_area_ratio
        aspect_ratio_valid_idx = aspect_ratio < self.max_aspect_ratio
        return wh_valid_idx & area_valid_idx & aspect_ratio_valid_idx

    @cache_randomness
    def _get_random_homography_matrix(self, height: int,
                                      width: int) -> Tuple[np.ndarray, float]:
        """Get random homography matrix.

        Args:
            height (int): Image height.
            width (int): Image width.

        Returns:
            Tuple[np.ndarray, float]: The result of warp_matrix and
            scaling_ratio.
        """
        # Rotation
        rotation_degree = random.uniform(-self.max_rotate_degree,
                                         self.max_rotate_degree)
        rotation_matrix = self._get_rotation_matrix(rotation_degree)

        # Scaling
        scaling_ratio = random.uniform(self.scaling_ratio_range[0],
                                       self.scaling_ratio_range[1])
        scaling_matrix = self._get_scaling_matrix(scaling_ratio)

        # Shear
        x_degree = random.uniform(-self.max_shear_degree,
                                  self.max_shear_degree)
        y_degree = random.uniform(-self.max_shear_degree,
                                  self.max_shear_degree)
        shear_matrix = self._get_shear_matrix(x_degree, y_degree)

        # Translation
        trans_x = random.uniform(0.5 - self.max_translate_ratio,
                                 0.5 + self.max_translate_ratio) * width
        trans_y = random.uniform(0.5 - self.max_translate_ratio,
                                 0.5 + self.max_translate_ratio) * height
        translate_matrix = self._get_translation_matrix(trans_x, trans_y)
        warp_matrix = (
            translate_matrix @ shear_matrix @ rotation_matrix @ scaling_matrix)
        return warp_matrix, scaling_ratio

    @staticmethod
    def _get_rotation_matrix(rotate_degrees: float) -> np.ndarray:
        """Get rotation matrix.

        Args:
            rotate_degrees (float): Rotate degrees.

        Returns:
            np.ndarray: The rotation matrix.
        """
        radian = math.radians(rotate_degrees)
        rotation_matrix = np.array(
            [[np.cos(radian), -np.sin(radian), 0.],
             [np.sin(radian), np.cos(radian), 0.], [0., 0., 1.]],
            dtype=np.float32)
        return rotation_matrix

    @staticmethod
    def _get_scaling_matrix(scale_ratio: float) -> np.ndarray:
        """Get scaling matrix.

        Args:
            scale_ratio (float): Scale ratio.

        Returns:
            np.ndarray: The scaling matrix.
        """
        scaling_matrix = np.array(
            [[scale_ratio, 0., 0.], [0., scale_ratio, 0.], [0., 0., 1.]],
            dtype=np.float32)
        return scaling_matrix

    @staticmethod
    def _get_shear_matrix(x_shear_degrees: float,
                          y_shear_degrees: float) -> np.ndarray:
        """Get shear matrix.

        Args:
            x_shear_degrees (float): X shear degrees.
            y_shear_degrees (float): Y shear degrees.

        Returns:
            np.ndarray: The shear matrix.
        """
        x_radian = math.radians(x_shear_degrees)
        y_radian = math.radians(y_shear_degrees)
        shear_matrix = np.array([[1, np.tan(x_radian), 0.],
                                 [np.tan(y_radian), 1, 0.], [0., 0., 1.]],
                                dtype=np.float32)
        return shear_matrix

    @staticmethod
    def _get_translation_matrix(x: float, y: float) -> np.ndarray:
        """Get translation matrix.

        Args:
            x (float): X translation.
            y (float): Y translation.

        Returns:
            np.ndarray: The translation matrix.
        """
        translation_matrix = np.array([[1, 0., x], [0., 1, y], [0., 0., 1.]],
                                      dtype=np.float32)
        return translation_matrix

    def __repr__(self) -> str:
        repr_str = self.__class__.__name__
        repr_str += f'(max_rotate_degree={self.max_rotate_degree}, '
        repr_str += f'max_translate_ratio={self.max_translate_ratio}, '
        repr_str += f'scaling_ratio_range={self.scaling_ratio_range}, '
        repr_str += f'max_shear_degree={self.max_shear_degree}, '
        repr_str += f'border={self.border}, '
        repr_str += f'border_val={self.border_val}, '
        repr_str += f'bbox_clip_border={self.bbox_clip_border})'
        return repr_str
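
# Note (illustrative, not part of the diff above): the warp matrix is composed
# as translate @ shear @ rotate @ scale in homogeneous coordinates, then
# re-centered and applied to the image and boxes. A toy sketch with arbitrary
# example values, showing only the composition order and how a corner maps:
#
#     import numpy as np
#     def rot(deg):
#         r = np.deg2rad(deg)
#         return np.array([[np.cos(r), -np.sin(r), 0.],
#                          [np.sin(r), np.cos(r), 0.], [0., 0., 1.]])
#     scale = np.diag([1.2, 1.2, 1.0])
#     translate = np.array([[1., 0., 5.], [0., 1., -3.], [0., 0., 1.]])
#     warp = translate @ rot(10) @ scale          # shear omitted for brevity
#     corner = np.array([100.0, 50.0, 1.0])       # homogeneous (x, y, 1)
#     print((warp @ corner)[:2])                  # warped (x, y)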

@TRANSFORMS.register_module()
class PPYOLOERandomDistort(BaseTransform):
    """Random hue, saturation, contrast and brightness distortion.

    Required Keys:

    - img

    Modified Keys:

    - img (np.float32)

    Args:
        hue_cfg (dict): Hue settings. Defaults to dict(min=-18,
            max=18, prob=0.5).
        saturation_cfg (dict): Saturation settings. Defaults to dict(
            min=0.5, max=1.5, prob=0.5).
        contrast_cfg (dict): Contrast settings. Defaults to dict(
            min=0.5, max=1.5, prob=0.5).
        brightness_cfg (dict): Brightness settings. Defaults to dict(
            min=0.5, max=1.5, prob=0.5).
        num_distort_func (int): The number of distort function. Defaults
            to 4.
    """

    def __init__(self,
                 hue_cfg: dict = dict(min=-18, max=18, prob=0.5),
                 saturation_cfg: dict = dict(min=0.5, max=1.5, prob=0.5),
                 contrast_cfg: dict = dict(min=0.5, max=1.5, prob=0.5),
                 brightness_cfg: dict = dict(min=0.5, max=1.5, prob=0.5),
                 num_distort_func: int = 4):
        self.hue_cfg = hue_cfg
        self.saturation_cfg = saturation_cfg
        self.contrast_cfg = contrast_cfg
        self.brightness_cfg = brightness_cfg
        self.num_distort_func = num_distort_func
        assert 0 < self.num_distort_func <= 4, \
            'num_distort_func must > 0 and <= 4'
        for cfg in [
                self.hue_cfg, self.saturation_cfg, self.contrast_cfg,
                self.brightness_cfg
        ]:
            assert 0. <= cfg['prob'] <= 1., 'prob must >=0 and <=1'

    def transform_hue(self, results):
        """Transform hue randomly."""
        if random.uniform(0., 1.) >= self.hue_cfg['prob']:
            return results
        img = results['img']
        delta = random.uniform(self.hue_cfg['min'], self.hue_cfg['max'])
        u = np.cos(delta * np.pi)
        w = np.sin(delta * np.pi)
        delta_iq = np.array([[1.0, 0.0, 0.0], [0.0, u, -w], [0.0, w, u]])
        rgb2yiq_matrix = np.array([[0.114, 0.587, 0.299],
                                   [-0.321, -0.274, 0.596],
                                   [0.311, -0.523, 0.211]])
        yiq2rgb_matric = np.array([[1.0, -1.107, 1.705], [1.0, -0.272, -0.647],
                                   [1.0, 0.956, 0.621]])
        t = np.dot(np.dot(yiq2rgb_matric, delta_iq), rgb2yiq_matrix).T
        img = np.dot(img, t)
        results['img'] = img
        return results

    def transform_saturation(self, results):
        """Transform saturation randomly."""
        if random.uniform(0., 1.) >= self.saturation_cfg['prob']:
            return results
        img = results['img']
        delta = random.uniform(self.saturation_cfg['min'],
                               self.saturation_cfg['max'])

        # convert bgr img to gray img
        gray = img * np.array([[[0.114, 0.587, 0.299]]], dtype=np.float32)
        gray = gray.sum(axis=2, keepdims=True)
        gray *= (1.0 - delta)
        img *= delta
        img += gray
        results['img'] = img
        return results

    def transform_contrast(self, results):
        """Transform contrast randomly."""
        if random.uniform(0., 1.) >= self.contrast_cfg['prob']:
            return results
        img = results['img']
        delta = random.uniform(self.contrast_cfg['min'],
                               self.contrast_cfg['max'])
        img *= delta
        results['img'] = img
        return results

    def transform_brightness(self, results):
        """Transform brightness randomly."""
        if random.uniform(0., 1.) >= self.brightness_cfg['prob']:
            return results
        img = results['img']
        delta = random.uniform(self.brightness_cfg['min'],
                               self.brightness_cfg['max'])
        img += delta
        results['img'] = img
        return results

    def transform(self, results: dict) -> dict:
        """The hue, saturation, contrast and brightness distortion function.

        Args:
            results (dict): The result dict.

        Returns:
            dict: The result dict.
        """
        results['img'] = results['img'].astype(np.float32)

        functions = [
            self.transform_brightness, self.transform_contrast,
            self.transform_saturation, self.transform_hue
        ]
        distortions = random.permutation(functions)[:self.num_distort_func]
        for func in distortions:
            results = func(results)
        return results

    def __repr__(self) -> str:
        repr_str = self.__class__.__name__
        repr_str += f'(hue_cfg={self.hue_cfg}, '
        repr_str += f'saturation_cfg={self.saturation_cfg}, '
        repr_str += f'contrast_cfg={self.contrast_cfg}, '
        repr_str += f'brightness_cfg={self.brightness_cfg}, '
        repr_str += f'num_distort_func={self.num_distort_func})'
        return repr_str

@TRANSFORMS.register_module()
class PPYOLOERandomCrop(BaseTransform):
    """Random crop the img and bboxes. Different thresholds are used in
    PPYOLOE to judge whether the clipped image meets the requirements. This
    implementation is different from the implementation of RandomCrop in mmdet.

    Required Keys:

    - img
    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
    - gt_bboxes_labels (np.int64) (optional)
    - gt_ignore_flags (bool) (optional)

    Modified Keys:

    - img
    - img_shape
    - gt_bboxes (optional)
    - gt_bboxes_labels (optional)
    - gt_ignore_flags (optional)

    Added Keys:

    - pad_param (np.float32)

    Args:
        aspect_ratio (List[float]): Aspect ratio of cropped region. Default to
            [.5, 2].
        thresholds (List[float]): Iou thresholds for deciding a valid bbox crop
            in [min, max] format. Defaults to [.0, .1, .3, .5, .7, .9].
        scaling (List[float]): Ratio between a cropped region and the original
            image in [min, max] format. Default to [.3, 1.].
        num_attempts (int): Number of tries for each threshold before
            giving up. Default to 50.
        allow_no_crop (bool): Allow return without actually cropping them.
            Default to True.
        cover_all_box (bool): Ensure all bboxes are covered in the final crop.
            Default to False.
    """

    def __init__(self,
                 aspect_ratio: List[float] = [.5, 2.],
                 thresholds: List[float] = [.0, .1, .3, .5, .7, .9],
                 scaling: List[float] = [.3, 1.],
                 num_attempts: int = 50,
                 allow_no_crop: bool = True,
                 cover_all_box: bool = False):
        self.aspect_ratio = aspect_ratio
        self.thresholds = thresholds
        self.scaling = scaling
        self.num_attempts = num_attempts
        self.allow_no_crop = allow_no_crop
        self.cover_all_box = cover_all_box

    def _crop_data(self, results: dict, crop_box: Tuple[int, int, int, int],
                   valid_inds: np.ndarray) -> Union[dict, None]:
        """Function to randomly crop images, bounding boxes, masks, semantic
        segmentation maps.

        Args:
            results (dict): Result dict from loading pipeline.
            crop_box (Tuple[int, int, int, int]): Expected absolute coordinates
                for cropping, (x1, y1, x2, y2).
            valid_inds (np.ndarray): The indexes of gt that needs to be
                retained.

        Returns:
            results (Union[dict, None]): Randomly cropped results, 'img_shape'
                key in result dict is updated according to crop size. None will
                be returned when there is no valid bbox after cropping.
        """
        # crop the image
        img = results['img']
        crop_x1, crop_y1, crop_x2, crop_y2 = crop_box
        img = img[crop_y1:crop_y2, crop_x1:crop_x2, ...]
        results['img'] = img
        img_shape = img.shape
        results['img_shape'] = img.shape

        # crop bboxes accordingly and clip to the image boundary
        if results.get('gt_bboxes', None) is not None:
            bboxes = results['gt_bboxes']
            bboxes.translate_([-crop_x1, -crop_y1])
            bboxes.clip_(img_shape[:2])

            results['gt_bboxes'] = bboxes[valid_inds]

            if results.get('gt_ignore_flags', None) is not None:
                results['gt_ignore_flags'] = \
                    results['gt_ignore_flags'][valid_inds]

            if results.get('gt_bboxes_labels', None) is not None:
                results['gt_bboxes_labels'] = \
                    results['gt_bboxes_labels'][valid_inds]

            if results.get('gt_masks', None) is not None:
                results['gt_masks'] = results['gt_masks'][
                    valid_inds.nonzero()[0]].crop(
                        np.asarray([crop_x1, crop_y1, crop_x2, crop_y2]))

        # crop semantic seg
        if results.get('gt_seg_map', None) is not None:
            results['gt_seg_map'] = results['gt_seg_map'][crop_y1:crop_y2,
                                                          crop_x1:crop_x2]

        return results

    @autocast_box_type()
    def transform(self, results: dict) -> Union[dict, None]:
        """The random crop transform function.

        Args:
            results (dict): The result dict.

        Returns:
            dict: The result dict.
        """
        if results.get('gt_bboxes', None) is None or len(
                results['gt_bboxes']) == 0:
            return results

        orig_img_h, orig_img_w = results['img'].shape[:2]
        gt_bboxes = results['gt_bboxes']

        thresholds = list(self.thresholds)
        if self.allow_no_crop:
            thresholds.append('no_crop')
        random.shuffle(thresholds)

        for thresh in thresholds:
            # Determine the coordinates for cropping
            if thresh == 'no_crop':
                return results

            found = False
            for i in range(self.num_attempts):
                crop_h, crop_w = self._get_crop_size((orig_img_h, orig_img_w))
                if self.aspect_ratio is None:
                    if crop_h / crop_w < 0.5 or crop_h / crop_w > 2.0:
                        continue

                # get image crop_box
                margin_h = max(orig_img_h - crop_h, 0)
                margin_w = max(orig_img_w - crop_w, 0)
                offset_h, offset_w = self._rand_offset((margin_h, margin_w))
                crop_y1, crop_y2 = offset_h, offset_h + crop_h
                crop_x1, crop_x2 = offset_w, offset_w + crop_w

                crop_box = [crop_x1, crop_y1, crop_x2, crop_y2]
                # Calculate the iou between gt_bboxes and crop_boxes
                iou = self._iou_matrix(gt_bboxes,
                                       np.array([crop_box], dtype=np.float32))
                # If the maximum value of the iou is less than thresh,
                # the current crop_box is considered invalid.
                if iou.max() < thresh:
                    continue

                # If cover_all_box == True and the minimum value of
                # the iou is less than thresh, the current crop_box
                # is considered invalid.
                if self.cover_all_box and iou.min() < thresh:
                    continue

                # Get which gt_bboxes to keep after cropping.
                valid_inds = self._get_valid_inds(
                    gt_bboxes, np.array(crop_box, dtype=np.float32))
                if valid_inds.size > 0:
                    found = True
                    break

            if found:
                results = self._crop_data(results, crop_box, valid_inds)
                return results
        return results

    @cache_randomness
    def _rand_offset(self, margin: Tuple[int, int]) -> Tuple[int, int]:
        """Randomly generate crop offset.

        Args:
            margin (Tuple[int, int]): The upper bound for the offset generated
                randomly.

        Returns:
            Tuple[int, int]: The random offset for the crop.
        """
        margin_h, margin_w = margin
        offset_h = np.random.randint(0, margin_h + 1)
        offset_w = np.random.randint(0, margin_w + 1)

        return (offset_h, offset_w)

    @cache_randomness
    def _get_crop_size(self, image_size: Tuple[int, int]) -> Tuple[int, int]:
        """Randomly generates the crop size based on `image_size`.

        Args:
            image_size (Tuple[int, int]): (h, w).

        Returns:
            crop_size (Tuple[int, int]): (crop_h, crop_w) in absolute pixels.
        """
        h, w = image_size
        scale = random.uniform(*self.scaling)
        if self.aspect_ratio is not None:
            min_ar, max_ar = self.aspect_ratio
            aspect_ratio = random.uniform(
                max(min_ar, scale**2), min(max_ar, scale**-2))
            h_scale = scale / np.sqrt(aspect_ratio)
            w_scale = scale * np.sqrt(aspect_ratio)
        else:
            h_scale = random.uniform(*self.scaling)
            w_scale = random.uniform(*self.scaling)
        crop_h = h * h_scale
        crop_w = w * w_scale
        return int(crop_h), int(crop_w)

    def _iou_matrix(self,
                    gt_bbox: HorizontalBoxes,
                    crop_bbox: np.ndarray,
                    eps: float = 1e-10) -> np.ndarray:
        """Calculate iou between gt and image crop box.

        Args:
            gt_bbox (HorizontalBoxes): Ground truth bounding boxes.
            crop_bbox (np.ndarray): Image crop coordinates in
                [x1, y1, x2, y2] format.
            eps (float): Default to 1e-10.
        Return:
            (np.ndarray): IoU.
        """
        gt_bbox = gt_bbox.tensor.numpy()
        lefttop = np.maximum(gt_bbox[:, np.newaxis, :2], crop_bbox[:, :2])
        rightbottom = np.minimum(gt_bbox[:, np.newaxis, 2:], crop_bbox[:, 2:])

        overlap = np.prod(
            rightbottom - lefttop,
            axis=2) * (lefttop < rightbottom).all(axis=2)
        area_gt_bbox = np.prod(gt_bbox[:, 2:] - gt_bbox[:, :2], axis=1)
        area_crop_bbox = np.prod(crop_bbox[:, 2:] - crop_bbox[:, :2], axis=1)
        area_o = (area_gt_bbox[:, np.newaxis] + area_crop_bbox - overlap)
        return overlap / (area_o + eps)

    def _get_valid_inds(self, gt_bbox: HorizontalBoxes,
                        img_crop_bbox: np.ndarray) -> np.ndarray:
        """Get which Bboxes to keep at the current cropping coordinates.

        Args:
            gt_bbox (HorizontalBoxes): Ground truth bounding boxes.
            img_crop_bbox (np.ndarray): Image crop coordinates in
                [x1, y1, x2, y2] format.

        Returns:
            (np.ndarray): Valid indexes.
        """
        cropped_box = gt_bbox.tensor.numpy().copy()
        gt_bbox = gt_bbox.tensor.numpy().copy()

        cropped_box[:, :2] = np.maximum(gt_bbox[:, :2], img_crop_bbox[:2])
        cropped_box[:, 2:] = np.minimum(gt_bbox[:, 2:], img_crop_bbox[2:])
        cropped_box[:, :2] -= img_crop_bbox[:2]
        cropped_box[:, 2:] -= img_crop_bbox[:2]

        centers = (gt_bbox[:, :2] + gt_bbox[:, 2:]) / 2
        valid = np.logical_and(img_crop_bbox[:2] <= centers,
                               centers < img_crop_bbox[2:]).all(axis=1)
        valid = np.logical_and(
            valid, (cropped_box[:, :2] < cropped_box[:, 2:]).all(axis=1))

        return np.where(valid)[0]

    def __repr__(self) -> str:
        repr_str = self.__class__.__name__
        repr_str += f'(aspect_ratio={self.aspect_ratio}, '
        repr_str += f'thresholds={self.thresholds}, '
        repr_str += f'scaling={self.scaling}, '
        repr_str += f'num_attempts={self.num_attempts}, '
        repr_str += f'allow_no_crop={self.allow_no_crop}, '
        repr_str += f'cover_all_box={self.cover_all_box})'
        return repr_str
1331 |
+
@TRANSFORMS.register_module()
|
1332 |
+
class YOLOv5CopyPaste(BaseTransform):
|
1333 |
+
"""Copy-Paste used in YOLOv5 and YOLOv8.
|
1334 |
+
|
1335 |
+
This transform randomly copy some objects in the image to the mirror
|
1336 |
+
position of the image.It is different from the `CopyPaste` in mmdet.
|
1337 |
+
|
1338 |
+
Required Keys:
|
1339 |
+
|
1340 |
+
- img (np.uint8)
|
1341 |
+
- gt_bboxes (BaseBoxes[torch.float32])
|
1342 |
+
- gt_bboxes_labels (np.int64) (optional)
|
1343 |
+
- gt_ignore_flags (bool) (optional)
|
1344 |
+
- gt_masks (PolygonMasks) (optional)
|
1345 |
+
|
1346 |
+
Modified Keys:
|
1347 |
+
|
1348 |
+
- img
|
1349 |
+
- gt_bboxes
|
1350 |
+
- gt_bboxes_labels (np.int64) (optional)
|
1351 |
+
- gt_ignore_flags (optional)
|
1352 |
+
- gt_masks (optional)
|
1353 |
+
|
1354 |
+
Args:
|
1355 |
+
ioa_thresh (float): Ioa thresholds for deciding valid bbox.
|
1356 |
+
prob (float): Probability of choosing objects.
|
1357 |
+
Defaults to 0.5.
|
1358 |
+
"""
|
1359 |
+
|
1360 |
+
def __init__(self, ioa_thresh: float = 0.3, prob: float = 0.5):
|
1361 |
+
self.ioa_thresh = ioa_thresh
|
1362 |
+
self.prob = prob
|
1363 |
+
|
1364 |
+
@autocast_box_type()
|
1365 |
+
def transform(self, results: dict) -> Union[dict, None]:
|
1366 |
+
"""The YOLOv5 and YOLOv8 Copy-Paste transform function.
|
1367 |
+
|
1368 |
+
Args:
|
1369 |
+
results (dict): The result dict.
|
1370 |
+
|
1371 |
+
Returns:
|
1372 |
+
dict: The result dict.
|
1373 |
+
"""
|
1374 |
+
if len(results.get('gt_masks', [])) == 0:
|
1375 |
+
return results
|
1376 |
+
gt_masks = results['gt_masks']
|
1377 |
+
assert isinstance(gt_masks, PolygonMasks),\
|
1378 |
+
'only support type of PolygonMasks,' \
|
1379 |
+
' but get type: %s' % type(gt_masks)
|
1380 |
+
gt_bboxes = results['gt_bboxes']
|
1381 |
+
gt_bboxes_labels = results.get('gt_bboxes_labels', None)
|
1382 |
+
img = results['img']
|
1383 |
+
img_h, img_w = img.shape[:2]
|
1384 |
+
|
1385 |
+
# calculate ioa
|
1386 |
+
gt_bboxes_flip = deepcopy(gt_bboxes)
|
1387 |
+
gt_bboxes_flip.flip_(img.shape)
|
1388 |
+
|
1389 |
+
ioa = self.bbox_ioa(gt_bboxes_flip, gt_bboxes)
|
1390 |
+
indexes = torch.nonzero((ioa < self.ioa_thresh).all(1))[:, 0]
|
1391 |
+
n = len(indexes)
|
1392 |
+
valid_inds = random.choice(
|
1393 |
+
indexes, size=round(self.prob * n), replace=False)
|
1394 |
+
if len(valid_inds) == 0:
|
1395 |
+
return results
|
1396 |
+
|
1397 |
+
if gt_bboxes_labels is not None:
|
1398 |
+
# prepare labels
|
1399 |
+
gt_bboxes_labels = np.concatenate(
|
1400 |
+
(gt_bboxes_labels, gt_bboxes_labels[valid_inds]), axis=0)
|
1401 |
+
|
1402 |
+
# prepare bboxes
|
1403 |
+
copypaste_bboxes = gt_bboxes_flip[valid_inds]
|
1404 |
+
gt_bboxes = gt_bboxes.cat([gt_bboxes, copypaste_bboxes])
|
1405 |
+
|
1406 |
+
# prepare images
|
1407 |
+
copypaste_gt_masks = gt_masks[valid_inds]
|
1408 |
+
copypaste_gt_masks_flip = copypaste_gt_masks.flip()
|
1409 |
+
# convert poly format to bitmap format
|
1410 |
+
# example: poly: [[array(0.0, 0.0, 10.0, 0.0, 10.0, 10.0, 0.0, 10.0]]
|
1411 |
+
# -> bitmap: a mask with shape equal to (1, img_h, img_w)
|
1412 |
+
# # type1 low speed
|
1413 |
+
# copypaste_gt_masks_bitmap = copypaste_gt_masks.to_ndarray()
|
1414 |
+
# copypaste_mask = np.sum(copypaste_gt_masks_bitmap, axis=0) > 0
|
1415 |
+
|
1416 |
+
# type2
|
1417 |
+
copypaste_mask = np.zeros((img_h, img_w), dtype=np.uint8)
|
1418 |
+
for poly in copypaste_gt_masks.masks:
|
1419 |
+
poly = [i.reshape((-1, 1, 2)).astype(np.int32) for i in poly]
|
1420 |
+
cv2.drawContours(copypaste_mask, poly, -1, (1, ), cv2.FILLED)
|
1421 |
+
|
1422 |
+
copypaste_mask = copypaste_mask.astype(bool)
|
1423 |
+
|
1424 |
+
# copy objects, and paste to the mirror position of the image
|
1425 |
+
copypaste_mask_flip = mmcv.imflip(
|
1426 |
+
copypaste_mask, direction='horizontal')
|
1427 |
+
copypaste_img = mmcv.imflip(img, direction='horizontal')
|
1428 |
+
img[copypaste_mask_flip] = copypaste_img[copypaste_mask_flip]
|
1429 |
+
|
1430 |
+
# prepare masks
|
1431 |
+
gt_masks = copypaste_gt_masks.cat([gt_masks, copypaste_gt_masks_flip])
|
1432 |
+
|
1433 |
+
if 'gt_ignore_flags' in results:
|
1434 |
+
# prepare gt_ignore_flags
|
1435 |
+
gt_ignore_flags = results['gt_ignore_flags']
|
1436 |
+
gt_ignore_flags = np.concatenate(
|
1437 |
+
[gt_ignore_flags, gt_ignore_flags[valid_inds]], axis=0)
|
1438 |
+
results['gt_ignore_flags'] = gt_ignore_flags
|
1439 |
+
|
1440 |
+
results['img'] = img
|
1441 |
+
results['gt_bboxes'] = gt_bboxes
|
1442 |
+
if gt_bboxes_labels is not None:
|
1443 |
+
results['gt_bboxes_labels'] = gt_bboxes_labels
|
1444 |
+
results['gt_masks'] = gt_masks
|
1445 |
+
|
1446 |
+
return results
|
1447 |
+
|
1448 |
+
@staticmethod
|
1449 |
+
def bbox_ioa(gt_bboxes_flip: HorizontalBoxes,
|
1450 |
+
gt_bboxes: HorizontalBoxes,
|
1451 |
+
eps: float = 1e-7) -> np.ndarray:
|
1452 |
+
"""Calculate ioa between gt_bboxes_flip and gt_bboxes.
|
1453 |
+
|
1454 |
+
Args:
|
1455 |
+
gt_bboxes_flip (HorizontalBoxes): Flipped ground truth
|
1456 |
+
bounding boxes.
|
1457 |
+
gt_bboxes (HorizontalBoxes): Ground truth bounding boxes.
|
1458 |
+
eps (float): Default to 1e-10.
|
1459 |
+
Return:
|
1460 |
+
(Tensor): Ioa.
|
1461 |
+
"""
|
1462 |
+
gt_bboxes_flip = gt_bboxes_flip.tensor
|
1463 |
+
gt_bboxes = gt_bboxes.tensor
|
1464 |
+
|
1465 |
+
# Get the coordinates of bounding boxes
|
1466 |
+
b1_x1, b1_y1, b1_x2, b1_y2 = gt_bboxes_flip.T
|
1467 |
+
b2_x1, b2_y1, b2_x2, b2_y2 = gt_bboxes.T
|
1468 |
+
|
1469 |
+
# Intersection area
|
1470 |
+
inter_area = (torch.minimum(b1_x2[:, None],
|
1471 |
+
b2_x2) - torch.maximum(b1_x1[:, None],
|
1472 |
+
b2_x1)).clip(0) * \
|
1473 |
+
(torch.minimum(b1_y2[:, None],
|
1474 |
+
b2_y2) - torch.maximum(b1_y1[:, None],
|
1475 |
+
b2_y1)).clip(0)
|
1476 |
+
|
1477 |
+
# box2 area
|
1478 |
+
box2_area = (b2_x2 - b2_x1) * (b2_y2 - b2_y1) + eps
|
1479 |
+
|
1480 |
+
# Intersection over box2 area
|
1481 |
+
return inter_area / box2_area
|
1482 |
+
|
1483 |
+
def __repr__(self) -> str:
|
1484 |
+
repr_str = self.__class__.__name__
|
1485 |
+
repr_str += f'(ioa_thresh={self.ioa_thresh},'
|
1486 |
+
repr_str += f'prob={self.prob})'
|
1487 |
+
return repr_str
|
1488 |
+
|
1489 |
+
|
1490 |
+
@TRANSFORMS.register_module()
|
1491 |
+
class RemoveDataElement(BaseTransform):
|
1492 |
+
"""Remove unnecessary data element in results.
|
1493 |
+
|
1494 |
+
Args:
|
1495 |
+
keys (Union[str, Sequence[str]]): Keys need to be removed.
|
1496 |
+
"""
|
1497 |
+
|
1498 |
+
def __init__(self, keys: Union[str, Sequence[str]]):
|
1499 |
+
self.keys = [keys] if isinstance(keys, str) else keys
|
1500 |
+
|
1501 |
+
def transform(self, results: dict) -> dict:
|
1502 |
+
for key in self.keys:
|
1503 |
+
results.pop(key, None)
|
1504 |
+
return results
|
1505 |
+
|
1506 |
+
def __repr__(self) -> str:
|
1507 |
+
repr_str = self.__class__.__name__
|
1508 |
+
repr_str += f'(keys={self.keys})'
|
1509 |
+
return repr_str
|
1510 |
+
|
1511 |
+
|
1512 |
+
@TRANSFORMS.register_module()
|
1513 |
+
class RegularizeRotatedBox(BaseTransform):
|
1514 |
+
"""Regularize rotated boxes.
|
1515 |
+
|
1516 |
+
Due to the angle periodicity, one rotated box can be represented in
|
1517 |
+
many different (x, y, w, h, t). To make each rotated box unique,
|
1518 |
+
``regularize_boxes`` will take the remainder of the angle divided by
|
1519 |
+
180 degrees.
|
1520 |
+
|
1521 |
+
For convenience, three angle_version can be used here:
|
1522 |
+
|
1523 |
+
- 'oc': OpenCV Definition. Has the same box representation as
|
1524 |
+
``cv2.minAreaRect`` the angle ranges in [-90, 0).
|
1525 |
+
- 'le90': Long Edge Definition (90). the angle ranges in [-90, 90).
|
1526 |
+
The width is always longer than the height.
|
1527 |
+
- 'le135': Long Edge Definition (135). the angle ranges in [-45, 135).
|
1528 |
+
The width is always longer than the height.
|
1529 |
+
|
1530 |
+
Required Keys:
|
1531 |
+
|
1532 |
+
- gt_bboxes (RotatedBoxes[torch.float32])
|
1533 |
+
|
1534 |
+
Modified Keys:
|
1535 |
+
|
1536 |
+
- gt_bboxes
|
1537 |
+
|
1538 |
+
Args:
|
1539 |
+
angle_version (str): Angle version. Can only be 'oc',
|
1540 |
+
'le90', or 'le135'. Defaults to 'le90.
|
1541 |
+
"""
|
1542 |
+
|
1543 |
+
def __init__(self, angle_version='le90') -> None:
|
1544 |
+
self.angle_version = angle_version
|
1545 |
+
try:
|
1546 |
+
from mmrotate.structures.bbox import RotatedBoxes
|
1547 |
+
self.box_type = RotatedBoxes
|
1548 |
+
except ImportError:
|
1549 |
+
raise ImportError(
|
1550 |
+
'Please run "mim install -r requirements/mmrotate.txt" '
|
1551 |
+
'to install mmrotate first for rotated detection.')
|
1552 |
+
|
1553 |
+
def transform(self, results: dict) -> dict:
|
1554 |
+
assert isinstance(results['gt_bboxes'], self.box_type)
|
1555 |
+
results['gt_bboxes'] = self.box_type(
|
1556 |
+
results['gt_bboxes'].regularize_boxes(self.angle_version))
|
1557 |
+
return results
|
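Note: the transforms above are registered in TRANSFORMS and are normally referenced from a pipeline config rather than called directly. The following is a minimal, assumed config fragment (not part of this upload) sketching how YOLOv5CopyPaste and RemoveDataElement could be wired into a training pipeline.

# Hypothetical pipeline fragment; the loading transforms are standard
# mmdet/mmyolo transforms and the values simply repeat the defaults above.
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
    # Copy objects to their mirrored position with 50% probability.
    dict(type='YOLOv5CopyPaste', ioa_thresh=0.3, prob=0.5),
    # Drop the polygon masks once they are no longer needed downstream.
    dict(type='RemoveDataElement', keys=['gt_masks']),
]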
mmyolo/datasets/utils.py
ADDED
@@ -0,0 +1,114 @@
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Sequence

import numpy as np
import torch
from mmengine.dataset import COLLATE_FUNCTIONS

from ..registry import TASK_UTILS


@COLLATE_FUNCTIONS.register_module()
def yolov5_collate(data_batch: Sequence,
                   use_ms_training: bool = False) -> dict:
    """Rewrite collate_fn to get faster training speed.

    Args:
        data_batch (Sequence): Batch of data.
        use_ms_training (bool): Whether to use multi-scale training.
    """
    batch_imgs = []
    batch_bboxes_labels = []
    batch_masks = []
    for i in range(len(data_batch)):
        datasamples = data_batch[i]['data_samples']
        inputs = data_batch[i]['inputs']
        batch_imgs.append(inputs)

        gt_bboxes = datasamples.gt_instances.bboxes.tensor
        gt_labels = datasamples.gt_instances.labels
        if 'masks' in datasamples.gt_instances:
            masks = datasamples.gt_instances.masks.to_tensor(
                dtype=torch.bool, device=gt_bboxes.device)
            batch_masks.append(masks)
        batch_idx = gt_labels.new_full((len(gt_labels), 1), i)
        bboxes_labels = torch.cat((batch_idx, gt_labels[:, None], gt_bboxes),
                                  dim=1)
        batch_bboxes_labels.append(bboxes_labels)

    collated_results = {
        'data_samples': {
            'bboxes_labels': torch.cat(batch_bboxes_labels, 0)
        }
    }
    if len(batch_masks) > 0:
        collated_results['data_samples']['masks'] = torch.cat(batch_masks, 0)

    if use_ms_training:
        collated_results['inputs'] = batch_imgs
    else:
        collated_results['inputs'] = torch.stack(batch_imgs, 0)
    return collated_results


@TASK_UTILS.register_module()
class BatchShapePolicy:
    """BatchShapePolicy is only used in the testing phase, which can reduce the
    number of pad pixels during batch inference.

    Args:
        batch_size (int): Single GPU batch size during batch inference.
            Defaults to 32.
        img_size (int): Expected output image size. Defaults to 640.
        size_divisor (int): The minimum size that is divisible
            by size_divisor. Defaults to 32.
        extra_pad_ratio (float): Extra pad ratio. Defaults to 0.5.
    """

    def __init__(self,
                 batch_size: int = 32,
                 img_size: int = 640,
                 size_divisor: int = 32,
                 extra_pad_ratio: float = 0.5):
        self.batch_size = batch_size
        self.img_size = img_size
        self.size_divisor = size_divisor
        self.extra_pad_ratio = extra_pad_ratio

    def __call__(self, data_list: List[dict]) -> List[dict]:
        image_shapes = []
        for data_info in data_list:
            image_shapes.append((data_info['width'], data_info['height']))

        image_shapes = np.array(image_shapes, dtype=np.float64)

        n = len(image_shapes)  # number of images
        batch_index = np.floor(np.arange(n) / self.batch_size).astype(
            np.int64)  # batch index
        number_of_batches = batch_index[-1] + 1  # number of batches

        aspect_ratio = image_shapes[:, 1] / image_shapes[:, 0]  # aspect ratio
        irect = aspect_ratio.argsort()

        data_list = [data_list[i] for i in irect]

        aspect_ratio = aspect_ratio[irect]
        # Set training image shapes
        shapes = [[1, 1]] * number_of_batches
        for i in range(number_of_batches):
            aspect_ratio_index = aspect_ratio[batch_index == i]
            min_index, max_index = aspect_ratio_index.min(
            ), aspect_ratio_index.max()
            if max_index < 1:
                shapes[i] = [max_index, 1]
            elif min_index > 1:
                shapes[i] = [1, 1 / min_index]

        batch_shapes = np.ceil(
            np.array(shapes) * self.img_size / self.size_divisor +
            self.extra_pad_ratio).astype(np.int64) * self.size_divisor

        for i, data_info in enumerate(data_list):
            data_info['batch_shape'] = batch_shapes[batch_index[i]]

        return data_list
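For intuition, BatchShapePolicy only needs 'width' and 'height' in each data_info; it sorts images by aspect ratio and attaches a per-batch 'batch_shape' rounded up to a multiple of size_divisor. A small standalone sketch with made-up sizes:

# Toy usage sketch (hypothetical image sizes, not from the repo's tests).
policy = BatchShapePolicy(batch_size=2, img_size=640, size_divisor=32)
data_list = [
    dict(width=640, height=480),
    dict(width=800, height=600),
    dict(width=480, height=640),
    dict(width=600, height=800),
]
for info in policy(data_list):
    print(info['batch_shape'])
# For this toy data the two wide images share shape [512 672] and the two tall
# ones [672 512], so each batch is padded far less than a fixed square shape.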
mmyolo/datasets/yolov5_coco.py
ADDED
@@ -0,0 +1,65 @@
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Any, Optional

from mmdet.datasets import BaseDetDataset, CocoDataset

from ..registry import DATASETS, TASK_UTILS


class BatchShapePolicyDataset(BaseDetDataset):
    """Dataset with the batch shape policy that makes paddings with least
    pixels during batch inference process, which does not require the image
    scales of all batches to be the same throughout validation."""

    def __init__(self,
                 *args,
                 batch_shapes_cfg: Optional[dict] = None,
                 **kwargs):
        self.batch_shapes_cfg = batch_shapes_cfg
        super().__init__(*args, **kwargs)

    def full_init(self):
        """rewrite full_init() to be compatible with serialize_data in
        BatchShapePolicy."""
        if self._fully_initialized:
            return
        # load data information
        self.data_list = self.load_data_list()

        # batch_shapes_cfg
        if self.batch_shapes_cfg:
            batch_shapes_policy = TASK_UTILS.build(self.batch_shapes_cfg)
            self.data_list = batch_shapes_policy(self.data_list)
            del batch_shapes_policy

        # filter illegal data, such as data that has no annotations.
        self.data_list = self.filter_data()
        # Get subset data according to indices.
        if self._indices is not None:
            self.data_list = self._get_unserialized_subset(self._indices)

        # serialize data_list
        if self.serialize_data:
            self.data_bytes, self.data_address = self._serialize_data()

        self._fully_initialized = True

    def prepare_data(self, idx: int) -> Any:
        """Pass the dataset to the pipeline during training to support mixed
        data augmentation, such as Mosaic and MixUp."""
        if self.test_mode is False:
            data_info = self.get_data_info(idx)
            data_info['dataset'] = self
            return self.pipeline(data_info)
        else:
            return super().prepare_data(idx)


@DATASETS.register_module()
class YOLOv5CocoDataset(BatchShapePolicyDataset, CocoDataset):
    """Dataset for YOLOv5 COCO Dataset.

    We only add `BatchShapePolicy` function compared with CocoDataset. See
    `mmyolo/datasets/utils.py#BatchShapePolicy` for details
    """
    pass
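A config-style sketch (assumed fragment, mirroring typical mmyolo test-time settings, with illustrative data paths) of how `batch_shapes_cfg` plugs into YOLOv5CocoDataset; passing None disables the policy:

# Assumed config fragment; paths and sizes are illustrative only.
batch_shapes_cfg = dict(
    type='BatchShapePolicy',
    batch_size=32,
    img_size=640,
    size_divisor=32,
    extra_pad_ratio=0.5)

val_dataloader = dict(
    batch_size=32,
    dataset=dict(
        type='YOLOv5CocoDataset',
        data_root='data/coco/',
        ann_file='annotations/instances_val2017.json',
        data_prefix=dict(img='val2017/'),
        test_mode=True,
        batch_shapes_cfg=batch_shapes_cfg))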
mmyolo/datasets/yolov5_crowdhuman.py
ADDED
@@ -0,0 +1,15 @@
# Copyright (c) OpenMMLab. All rights reserved.
from mmdet.datasets import CrowdHumanDataset

from ..registry import DATASETS
from .yolov5_coco import BatchShapePolicyDataset


@DATASETS.register_module()
class YOLOv5CrowdHumanDataset(BatchShapePolicyDataset, CrowdHumanDataset):
    """Dataset for YOLOv5 CrowdHuman Dataset.

    We only add `BatchShapePolicy` function compared with CrowdHumanDataset.
    See `mmyolo/datasets/utils.py#BatchShapePolicy` for details
    """
    pass
mmyolo/datasets/yolov5_dota.py
ADDED
@@ -0,0 +1,29 @@
# Copyright (c) OpenMMLab. All rights reserved.

from mmyolo.datasets.yolov5_coco import BatchShapePolicyDataset
from ..registry import DATASETS

try:
    from mmrotate.datasets import DOTADataset
    MMROTATE_AVAILABLE = True
except ImportError:
    from mmengine.dataset import BaseDataset
    DOTADataset = BaseDataset
    MMROTATE_AVAILABLE = False


@DATASETS.register_module()
class YOLOv5DOTADataset(BatchShapePolicyDataset, DOTADataset):
    """Dataset for YOLOv5 DOTA Dataset.

    We only add `BatchShapePolicy` function compared with DOTADataset. See
    `mmyolo/datasets/utils.py#BatchShapePolicy` for details
    """

    def __init__(self, *args, **kwargs):
        if not MMROTATE_AVAILABLE:
            raise ImportError(
                'Please run "mim install -r requirements/mmrotate.txt" '
                'to install mmrotate first for rotated detection.')

        super().__init__(*args, **kwargs)
mmyolo/datasets/yolov5_voc.py
ADDED
@@ -0,0 +1,15 @@
# Copyright (c) OpenMMLab. All rights reserved.
from mmdet.datasets import VOCDataset

from mmyolo.datasets.yolov5_coco import BatchShapePolicyDataset
from ..registry import DATASETS


@DATASETS.register_module()
class YOLOv5VOCDataset(BatchShapePolicyDataset, VOCDataset):
    """Dataset for YOLOv5 VOC Dataset.

    We only add `BatchShapePolicy` function compared with VOCDataset. See
    `mmyolo/datasets/utils.py#BatchShapePolicy` for details
    """
    pass
mmyolo/deploy/__init__.py
ADDED
@@ -0,0 +1,7 @@
# Copyright (c) OpenMMLab. All rights reserved.
from mmdeploy.codebase.base import MMCodebase

from .models import *  # noqa: F401,F403
from .object_detection import MMYOLO, YOLOObjectDetection

__all__ = ['MMCodebase', 'MMYOLO', 'YOLOObjectDetection']
mmyolo/deploy/models/__init__.py
ADDED
@@ -0,0 +1,2 @@
# Copyright (c) OpenMMLab. All rights reserved.
from . import dense_heads  # noqa: F401,F403
mmyolo/deploy/models/dense_heads/__init__.py
ADDED
@@ -0,0 +1,4 @@
# Copyright (c) OpenMMLab. All rights reserved.
from . import yolov5_head  # noqa: F401,F403

__all__ = ['yolov5_head']
mmyolo/deploy/models/dense_heads/yolov5_head.py
ADDED
@@ -0,0 +1,189 @@
# Copyright (c) OpenMMLab. All rights reserved.
import copy
from functools import partial
from typing import List, Optional, Tuple

import torch
from mmdeploy.codebase.mmdet import get_post_processing_params
from mmdeploy.codebase.mmdet.models.layers import multiclass_nms
from mmdeploy.core import FUNCTION_REWRITER
from mmengine.config import ConfigDict
from mmengine.structures import InstanceData
from torch import Tensor

from mmyolo.deploy.models.layers import efficient_nms
from mmyolo.models.dense_heads import YOLOv5Head


def yolov5_bbox_decoder(priors: Tensor, bbox_preds: Tensor,
                        stride: int) -> Tensor:
    """Decode YOLOv5 bounding boxes.

    Args:
        priors (Tensor): Prior boxes in center-offset form.
        bbox_preds (Tensor): Predicted bounding boxes.
        stride (int): Stride of the feature map.

    Returns:
        Tensor: Decoded bounding boxes.
    """
    bbox_preds = bbox_preds.sigmoid()

    x_center = (priors[..., 0] + priors[..., 2]) * 0.5
    y_center = (priors[..., 1] + priors[..., 3]) * 0.5
    w = priors[..., 2] - priors[..., 0]
    h = priors[..., 3] - priors[..., 1]

    x_center_pred = (bbox_preds[..., 0] - 0.5) * 2 * stride + x_center
    y_center_pred = (bbox_preds[..., 1] - 0.5) * 2 * stride + y_center
    w_pred = (bbox_preds[..., 2] * 2)**2 * w
    h_pred = (bbox_preds[..., 3] * 2)**2 * h

    decoded_bboxes = torch.stack(
        [x_center_pred, y_center_pred, w_pred, h_pred], dim=-1)

    return decoded_bboxes


@FUNCTION_REWRITER.register_rewriter(
    func_name='mmyolo.models.dense_heads.yolov5_head.'
    'YOLOv5Head.predict_by_feat')
def yolov5_head__predict_by_feat(self,
                                 cls_scores: List[Tensor],
                                 bbox_preds: List[Tensor],
                                 objectnesses: Optional[List[Tensor]] = None,
                                 batch_img_metas: Optional[List[dict]] = None,
                                 cfg: Optional[ConfigDict] = None,
                                 rescale: bool = False,
                                 with_nms: bool = True) -> Tuple[InstanceData]:
    """Transform a batch of output features extracted by the head into
    bbox results.

    Args:
        cls_scores (list[Tensor]): Classification scores for all
            scale levels, each is a 4D-tensor, has shape
            (batch_size, num_priors * num_classes, H, W).
        bbox_preds (list[Tensor]): Box energies / deltas for all
            scale levels, each is a 4D-tensor, has shape
            (batch_size, num_priors * 4, H, W).
        objectnesses (list[Tensor], Optional): Score factor for
            all scale level, each is a 4D-tensor, has shape
            (batch_size, 1, H, W).
        batch_img_metas (list[dict], Optional): Batch image meta info.
            Defaults to None.
        cfg (ConfigDict, optional): Test / postprocessing
            configuration, if None, test_cfg would be used.
            Defaults to None.
        rescale (bool): If True, return boxes in original image space.
            Defaults to False.
        with_nms (bool): If True, do nms before return boxes.
            Defaults to True.
    Returns:
        tuple[Tensor, Tensor]: The first item is an (N, num_box, 5) tensor,
            where 5 represent (tl_x, tl_y, br_x, br_y, score), N is batch
            size and the score between 0 and 1. The shape of the second
            tensor in the tuple is (N, num_box), and each element
            represents the class label of the corresponding box.
    """
    ctx = FUNCTION_REWRITER.get_context()
    detector_type = type(self)
    deploy_cfg = ctx.cfg
    use_efficientnms = deploy_cfg.get('use_efficientnms', False)
    dtype = cls_scores[0].dtype
    device = cls_scores[0].device
    bbox_decoder = self.bbox_coder.decode
    nms_func = multiclass_nms
    if use_efficientnms:
        if detector_type is YOLOv5Head:
            nms_func = partial(efficient_nms, box_coding=0)
            bbox_decoder = yolov5_bbox_decoder
        else:
            nms_func = efficient_nms

    assert len(cls_scores) == len(bbox_preds)
    cfg = self.test_cfg if cfg is None else cfg
    cfg = copy.deepcopy(cfg)

    num_imgs = cls_scores[0].shape[0]
    featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores]

    mlvl_priors = self.prior_generator.grid_priors(
        featmap_sizes, dtype=dtype, device=device)

    flatten_priors = torch.cat(mlvl_priors)

    mlvl_strides = [
        flatten_priors.new_full(
            (featmap_size[0] * featmap_size[1] * self.num_base_priors, ),
            stride)
        for featmap_size, stride in zip(featmap_sizes, self.featmap_strides)
    ]
    flatten_stride = torch.cat(mlvl_strides)

    # flatten cls_scores, bbox_preds and objectness
    flatten_cls_scores = [
        cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1, self.num_classes)
        for cls_score in cls_scores
    ]
    cls_scores = torch.cat(flatten_cls_scores, dim=1).sigmoid()

    flatten_bbox_preds = [
        bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4)
        for bbox_pred in bbox_preds
    ]
    flatten_bbox_preds = torch.cat(flatten_bbox_preds, dim=1)

    if objectnesses is not None:
        flatten_objectness = [
            objectness.permute(0, 2, 3, 1).reshape(num_imgs, -1)
            for objectness in objectnesses
        ]
        flatten_objectness = torch.cat(flatten_objectness, dim=1).sigmoid()
        cls_scores = cls_scores * (flatten_objectness.unsqueeze(-1))

    scores = cls_scores

    bboxes = bbox_decoder(flatten_priors[None], flatten_bbox_preds,
                          flatten_stride)

    if not with_nms:
        return bboxes, scores

    post_params = get_post_processing_params(deploy_cfg)
    max_output_boxes_per_class = post_params.max_output_boxes_per_class
    iou_threshold = cfg.nms.get('iou_threshold', post_params.iou_threshold)
    score_threshold = cfg.get('score_thr', post_params.score_threshold)
    pre_top_k = post_params.pre_top_k
    keep_top_k = cfg.get('max_per_img', post_params.keep_top_k)

    return nms_func(bboxes, scores, max_output_boxes_per_class, iou_threshold,
                    score_threshold, pre_top_k, keep_top_k)


@FUNCTION_REWRITER.register_rewriter(
    func_name='mmyolo.models.dense_heads.yolov5_head.'
    'YOLOv5Head.predict',
    backend='rknn')
def yolov5_head__predict__rknn(self, x: Tuple[Tensor], *args,
                               **kwargs) -> Tuple[Tensor, Tensor, Tensor]:
    """Perform forward propagation of the detection head and predict detection
    results on the features of the upstream network.

    Args:
        x (tuple[Tensor]): Multi-level features from the
            upstream network, each is a 4D-tensor.
    """
    outs = self(x)
    return outs


@FUNCTION_REWRITER.register_rewriter(
    func_name='mmyolo.models.dense_heads.yolov5_head.'
    'YOLOv5HeadModule.forward',
    backend='rknn')
def yolov5_head_module__forward__rknn(
        self, x: Tensor, *args, **kwargs) -> Tuple[Tensor, Tensor, Tensor]:
    """Forward feature of a single scale level."""
    out = []
    for i, feat in enumerate(x):
        out.append(self.convs_pred[i](feat))
    return out
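As a quick sanity check on `yolov5_bbox_decoder`, a standalone numeric sketch (hypothetical tensors, independent of any deploy backend):

import torch

# One 16x16 prior centred at (8, 8); raw predictions of zero give sigmoid 0.5.
priors = torch.tensor([[0., 0., 16., 16.]])
bbox_preds = torch.zeros(1, 4)
decoded = yolov5_bbox_decoder(priors, bbox_preds, stride=8)
# With sigmoid(0) = 0.5 the centre offset is zero and (0.5 * 2) ** 2 = 1,
# so the decoded cx, cy, w, h stay at tensor([[8., 8., 16., 16.]]).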
mmyolo/deploy/models/layers/__init__.py
ADDED
@@ -0,0 +1,4 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .bbox_nms import efficient_nms

__all__ = ['efficient_nms']
mmyolo/deploy/models/layers/bbox_nms.py
ADDED
@@ -0,0 +1,113 @@
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from mmdeploy.core import mark
from torch import Tensor


def _efficient_nms(
    boxes: Tensor,
    scores: Tensor,
    max_output_boxes_per_class: int = 1000,
    iou_threshold: float = 0.5,
    score_threshold: float = 0.05,
    pre_top_k: int = -1,
    keep_top_k: int = 100,
    box_coding: int = 0,
):
    """Wrapper for `efficient_nms` with TensorRT.

    Args:
        boxes (Tensor): The bounding boxes of shape [N, num_boxes, 4].
        scores (Tensor): The detection scores of shape
            [N, num_boxes, num_classes].
        max_output_boxes_per_class (int): Maximum number of output
            boxes per class of nms. Defaults to 1000.
        iou_threshold (float): IOU threshold of nms. Defaults to 0.5.
        score_threshold (float): score threshold of nms.
            Defaults to 0.05.
        pre_top_k (int): Number of top K boxes to keep before nms.
            Defaults to -1.
        keep_top_k (int): Number of top K boxes to keep after nms.
            Defaults to 100.
        box_coding (int): Bounding boxes format for nms.
            Defaults to 0 means [x, y, w, h].
            Set to 1 means [x1, y1, x2, y2].

    Returns:
        tuple[Tensor, Tensor]: (dets, labels), `dets` of shape [N, num_det, 5]
            and `labels` of shape [N, num_det].
    """
    boxes = boxes if boxes.dim() == 4 else boxes.unsqueeze(2)
    _, det_boxes, det_scores, labels = TRTEfficientNMSop.apply(
        boxes, scores, -1, box_coding, iou_threshold, keep_top_k, '1', 0,
        score_threshold)
    dets = torch.cat([det_boxes, det_scores.unsqueeze(2)], -1)

    # retain shape info
    batch_size = boxes.size(0)

    dets_shape = dets.shape
    label_shape = labels.shape
    dets = dets.reshape([batch_size, *dets_shape[1:]])
    labels = labels.reshape([batch_size, *label_shape[1:]])
    return dets, labels


@mark('efficient_nms', inputs=['boxes', 'scores'], outputs=['dets', 'labels'])
def efficient_nms(*args, **kwargs):
    """Wrapper function for `_efficient_nms`."""
    return _efficient_nms(*args, **kwargs)


class TRTEfficientNMSop(torch.autograd.Function):
    """Efficient NMS op for TensorRT."""

    @staticmethod
    def forward(
        ctx,
        boxes,
        scores,
        background_class=-1,
        box_coding=0,
        iou_threshold=0.45,
        max_output_boxes=100,
        plugin_version='1',
        score_activation=0,
        score_threshold=0.25,
    ):
        """Forward function of TRTEfficientNMSop."""
        batch_size, num_boxes, num_classes = scores.shape
        num_det = torch.randint(
            0, max_output_boxes, (batch_size, 1), dtype=torch.int32)
        det_boxes = torch.randn(batch_size, max_output_boxes, 4)
        det_scores = torch.randn(batch_size, max_output_boxes)
        det_classes = torch.randint(
            0, num_classes, (batch_size, max_output_boxes), dtype=torch.int32)
        return num_det, det_boxes, det_scores, det_classes

    @staticmethod
    def symbolic(g,
                 boxes,
                 scores,
                 background_class=-1,
                 box_coding=0,
                 iou_threshold=0.45,
                 max_output_boxes=100,
                 plugin_version='1',
                 score_activation=0,
                 score_threshold=0.25):
        """Symbolic function of TRTEfficientNMSop."""
        out = g.op(
            'TRT::EfficientNMS_TRT',
            boxes,
            scores,
            background_class_i=background_class,
            box_coding_i=box_coding,
            iou_threshold_f=iou_threshold,
            max_output_boxes_i=max_output_boxes,
            plugin_version_s=plugin_version,
            score_activation_i=score_activation,
            score_threshold_f=score_threshold,
            outputs=4)
        nums, boxes, scores, classes = out
        return nums, boxes, scores, classes
mmyolo/deploy/object_detection.py
ADDED
@@ -0,0 +1,132 @@
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Callable, Dict, Optional

import torch
from mmdeploy.codebase.base import CODEBASE, MMCodebase
from mmdeploy.codebase.mmdet.deploy import ObjectDetection
from mmdeploy.utils import Codebase, Task
from mmengine import Config
from mmengine.registry import Registry

MMYOLO_TASK = Registry('mmyolo_tasks')


@CODEBASE.register_module(Codebase.MMYOLO.value)
class MMYOLO(MMCodebase):
    """MMYOLO codebase class."""

    task_registry = MMYOLO_TASK

    @classmethod
    def register_deploy_modules(cls):
        """register all rewriters for mmdet."""
        import mmdeploy.codebase.mmdet.models  # noqa: F401
        import mmdeploy.codebase.mmdet.ops  # noqa: F401
        import mmdeploy.codebase.mmdet.structures  # noqa: F401

    @classmethod
    def register_all_modules(cls):
        """register all modules."""
        from mmdet.utils.setup_env import \
            register_all_modules as register_all_modules_mmdet

        from mmyolo.utils.setup_env import \
            register_all_modules as register_all_modules_mmyolo

        cls.register_deploy_modules()
        register_all_modules_mmyolo(True)
        register_all_modules_mmdet(False)


def _get_dataset_metainfo(model_cfg: Config):
    """Get metainfo of dataset.

    Args:
        model_cfg (Config): Input model Config object.

    Returns:
        list[str]: A list of strings specifying the names of different
            classes.
    """
    from mmyolo import datasets  # noqa
    from mmyolo.registry import DATASETS

    module_dict = DATASETS.module_dict
    for dataloader_name in [
            'test_dataloader', 'val_dataloader', 'train_dataloader'
    ]:
        if dataloader_name not in model_cfg:
            continue
        dataloader_cfg = model_cfg[dataloader_name]
        dataset_cfg = dataloader_cfg.dataset
        dataset_cls = module_dict.get(dataset_cfg.type, None)
        if dataset_cls is None:
            continue
        if hasattr(dataset_cls, '_load_metainfo') and isinstance(
                dataset_cls._load_metainfo, Callable):
            meta = dataset_cls._load_metainfo(
                dataset_cfg.get('metainfo', None))
            if meta is not None:
                return meta
        if hasattr(dataset_cls, 'METAINFO'):
            return dataset_cls.METAINFO

    return None


@MMYOLO_TASK.register_module(Task.OBJECT_DETECTION.value)
class YOLOObjectDetection(ObjectDetection):
    """YOLO Object Detection task."""

    def get_visualizer(self, name: str, save_dir: str):
        """Get visualizer.

        Args:
            name (str): Name of visualizer.
            save_dir (str): Directory to save visualization results.

        Returns:
            Visualizer: A visualizer instance.
        """
        from mmdet.visualization import DetLocalVisualizer  # noqa: F401,F403
        metainfo = _get_dataset_metainfo(self.model_cfg)
        visualizer = super().get_visualizer(name, save_dir)
        if metainfo is not None:
            visualizer.dataset_meta = metainfo
        return visualizer

    def build_pytorch_model(self,
                            model_checkpoint: Optional[str] = None,
                            cfg_options: Optional[Dict] = None,
                            **kwargs) -> torch.nn.Module:
        """Initialize torch model.

        Args:
            model_checkpoint (str): The checkpoint file of torch model,
                defaults to `None`.
            cfg_options (dict): Optional config key-pair parameters.

        Returns:
            nn.Module: An initialized torch model generated by other OpenMMLab
                codebases.
        """
        from copy import deepcopy

        from mmengine.model import revert_sync_batchnorm
        from mmengine.registry import MODELS

        from mmyolo.utils import switch_to_deploy

        model = deepcopy(self.model_cfg.model)
        preprocess_cfg = deepcopy(self.model_cfg.get('preprocess_cfg', {}))
        preprocess_cfg.update(
            deepcopy(self.model_cfg.get('data_preprocessor', {})))
        model.setdefault('data_preprocessor', preprocess_cfg)
        model = MODELS.build(model)
        if model_checkpoint is not None:
            from mmengine.runner.checkpoint import load_checkpoint
            load_checkpoint(model, model_checkpoint, map_location=self.device)

        model = revert_sync_batchnorm(model)
        switch_to_deploy(model)
        model = model.to(self.device)
        model.eval()
        return model
mmyolo/engine/__init__.py
ADDED
@@ -0,0 +1,3 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .hooks import *  # noqa: F401,F403
from .optimizers import *  # noqa: F401,F403
mmyolo/engine/hooks/__init__.py
ADDED
@@ -0,0 +1,10 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .ppyoloe_param_scheduler_hook import PPYOLOEParamSchedulerHook
from .switch_to_deploy_hook import SwitchToDeployHook
from .yolov5_param_scheduler_hook import YOLOv5ParamSchedulerHook
from .yolox_mode_switch_hook import YOLOXModeSwitchHook

__all__ = [
    'YOLOv5ParamSchedulerHook', 'YOLOXModeSwitchHook', 'SwitchToDeployHook',
    'PPYOLOEParamSchedulerHook'
]
mmyolo/engine/hooks/ppyoloe_param_scheduler_hook.py
ADDED
@@ -0,0 +1,96 @@
# Copyright (c) OpenMMLab. All rights reserved.
import math
from typing import Optional

from mmengine.hooks import ParamSchedulerHook
from mmengine.runner import Runner

from mmyolo.registry import HOOKS


@HOOKS.register_module()
class PPYOLOEParamSchedulerHook(ParamSchedulerHook):
    """A hook to update learning rate and momentum in optimizer of PPYOLOE. We
    use this hook to implement adaptive computation for `warmup_total_iters`,
    which is not possible with the built-in ParamScheduler in mmyolo.

    Args:
        warmup_min_iter (int): Minimum warmup iters. Defaults to 1000.
        start_factor (float): The number we multiply learning rate in the
            first epoch. The multiplication factor changes towards end_factor
            in the following epochs. Defaults to 0.
        warmup_epochs (int): Epochs for warmup. Defaults to 5.
        min_lr_ratio (float): Minimum learning rate ratio. Defaults to 0.0.
        total_epochs (int): In PPYOLOE, `total_epochs` is set to
            training_epochs x 1.2. Defaults to 360.
    """
    priority = 9

    def __init__(self,
                 warmup_min_iter: int = 1000,
                 start_factor: float = 0.,
                 warmup_epochs: int = 5,
                 min_lr_ratio: float = 0.0,
                 total_epochs: int = 360):

        self.warmup_min_iter = warmup_min_iter
        self.start_factor = start_factor
        self.warmup_epochs = warmup_epochs
        self.min_lr_ratio = min_lr_ratio
        self.total_epochs = total_epochs

        self._warmup_end = False
        self._base_lr = None

    def before_train(self, runner: Runner):
        """Operations before train.

        Args:
            runner (Runner): The runner of the training process.
        """
        optimizer = runner.optim_wrapper.optimizer
        for group in optimizer.param_groups:
            # If the param has never been scheduled, record the current value
            # as the initial value.
            group.setdefault('initial_lr', group['lr'])

        self._base_lr = [
            group['initial_lr'] for group in optimizer.param_groups
        ]
        self._min_lr = [i * self.min_lr_ratio for i in self._base_lr]

    def before_train_iter(self,
                          runner: Runner,
                          batch_idx: int,
                          data_batch: Optional[dict] = None):
        """Operations before each training iteration.

        Args:
            runner (Runner): The runner of the training process.
            batch_idx (int): The index of the current batch in the train loop.
            data_batch (dict or tuple or list, optional): Data from dataloader.
        """
        cur_iters = runner.iter
        optimizer = runner.optim_wrapper.optimizer
        dataloader_len = len(runner.train_dataloader)

        # The minimum warmup is self.warmup_min_iter
        warmup_total_iters = max(
            round(self.warmup_epochs * dataloader_len), self.warmup_min_iter)

        if cur_iters <= warmup_total_iters:
            # warm up
            alpha = cur_iters / warmup_total_iters
            factor = self.start_factor * (1 - alpha) + alpha

            for group_idx, param in enumerate(optimizer.param_groups):
                param['lr'] = self._base_lr[group_idx] * factor
        else:
            for group_idx, param in enumerate(optimizer.param_groups):
                total_iters = self.total_epochs * dataloader_len
                lr = self._min_lr[group_idx] + (
                    self._base_lr[group_idx] -
                    self._min_lr[group_idx]) * 0.5 * (
                        math.cos((cur_iters - warmup_total_iters) * math.pi /
                                 (total_iters - warmup_total_iters)) + 1.0)
                param['lr'] = lr
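To summarize the arithmetic above: the hook applies a linear warmup for `warmup_total_iters` iterations and then a cosine decay from the base learning rate down to `min_lr_ratio * base_lr`. A standalone sketch of the same formula (not used by the hook itself):

import math


def ppyoloe_lr(cur_iter: int, base_lr: float, min_lr: float,
               warmup_total_iters: int, total_iters: int,
               start_factor: float = 0.) -> float:
    """Sketch reproducing the schedule above for a single parameter group."""
    if cur_iter <= warmup_total_iters:
        # Linear warmup from start_factor * base_lr up to base_lr.
        alpha = cur_iter / warmup_total_iters
        return base_lr * (start_factor * (1 - alpha) + alpha)
    # Cosine decay from base_lr down to min_lr over the remaining iterations.
    progress = (cur_iter - warmup_total_iters) * math.pi / (
        total_iters - warmup_total_iters)
    return min_lr + (base_lr - min_lr) * 0.5 * (math.cos(progress) + 1.0)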
mmyolo/engine/hooks/switch_to_deploy_hook.py
ADDED
@@ -0,0 +1,21 @@
# Copyright (c) OpenMMLab. All rights reserved.

from mmengine.hooks import Hook
from mmengine.runner import Runner

from mmyolo.registry import HOOKS
from mmyolo.utils import switch_to_deploy


@HOOKS.register_module()
class SwitchToDeployHook(Hook):
    """Switch to deploy mode before testing.

    This hook converts the multi-channel structure of the training network
    (high performance) to the one-way structure of the testing network (fast
    speed and memory saving).
    """

    def before_test_epoch(self, runner: Runner):
        """Switch to deploy mode before testing."""
        switch_to_deploy(runner.model)
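In practice this hook is usually enabled from a config; a minimal assumed fragment:

# Assumed config fragment: fuse re-parameterizable modules before testing.
custom_hooks = [dict(type='SwitchToDeployHook')]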
mmyolo/engine/hooks/yolov5_param_scheduler_hook.py
ADDED
@@ -0,0 +1,130 @@
# Copyright (c) OpenMMLab. All rights reserved.
import math
from typing import Optional

import numpy as np
from mmengine.hooks import ParamSchedulerHook
from mmengine.runner import Runner

from mmyolo.registry import HOOKS


def linear_fn(lr_factor: float, max_epochs: int):
    """Generate linear function."""
    return lambda x: (1 - x / max_epochs) * (1.0 - lr_factor) + lr_factor


def cosine_fn(lr_factor: float, max_epochs: int):
    """Generate cosine function."""
    return lambda x: (
        (1 - math.cos(x * math.pi / max_epochs)) / 2) * (lr_factor - 1) + 1


@HOOKS.register_module()
class YOLOv5ParamSchedulerHook(ParamSchedulerHook):
    """A hook to update learning rate and momentum in optimizer of YOLOv5."""
    priority = 9

    scheduler_maps = {'linear': linear_fn, 'cosine': cosine_fn}

    def __init__(self,
                 scheduler_type: str = 'linear',
                 lr_factor: float = 0.01,
                 max_epochs: int = 300,
                 warmup_epochs: int = 3,
                 warmup_bias_lr: float = 0.1,
                 warmup_momentum: float = 0.8,
                 warmup_mim_iter: int = 1000,
                 **kwargs):

        assert scheduler_type in self.scheduler_maps

        self.warmup_epochs = warmup_epochs
        self.warmup_bias_lr = warmup_bias_lr
        self.warmup_momentum = warmup_momentum
        self.warmup_mim_iter = warmup_mim_iter

        kwargs.update({'lr_factor': lr_factor, 'max_epochs': max_epochs})
        self.scheduler_fn = self.scheduler_maps[scheduler_type](**kwargs)

        self._warmup_end = False
        self._base_lr = None
        self._base_momentum = None

    def before_train(self, runner: Runner):
        """Operations before train.

        Args:
            runner (Runner): The runner of the training process.
        """
        optimizer = runner.optim_wrapper.optimizer
        for group in optimizer.param_groups:
            # If the param has never been scheduled, record the current value
            # as the initial value.
            group.setdefault('initial_lr', group['lr'])
            group.setdefault('initial_momentum', group.get('momentum', -1))

        self._base_lr = [
            group['initial_lr'] for group in optimizer.param_groups
        ]
        self._base_momentum = [
            group['initial_momentum'] for group in optimizer.param_groups
        ]

    def before_train_iter(self,
                          runner: Runner,
                          batch_idx: int,
                          data_batch: Optional[dict] = None):
        """Operations before each training iteration.

        Args:
            runner (Runner): The runner of the training process.
            batch_idx (int): The index of the current batch in the train loop.
            data_batch (dict or tuple or list, optional): Data from dataloader.
        """
        cur_iters = runner.iter
        cur_epoch = runner.epoch
        optimizer = runner.optim_wrapper.optimizer

        # The minimum warmup is self.warmup_mim_iter
        warmup_total_iters = max(
            round(self.warmup_epochs * len(runner.train_dataloader)),
            self.warmup_mim_iter)

        if cur_iters <= warmup_total_iters:
            xp = [0, warmup_total_iters]
            for group_idx, param in enumerate(optimizer.param_groups):
                if group_idx == 2:
                    # bias learning rate will be handled specially
                    yp = [
                        self.warmup_bias_lr,
                        self._base_lr[group_idx] * self.scheduler_fn(cur_epoch)
                    ]
                else:
                    yp = [
                        0.0,
                        self._base_lr[group_idx] * self.scheduler_fn(cur_epoch)
                    ]
                param['lr'] = np.interp(cur_iters, xp, yp)

                if 'momentum' in param:
                    param['momentum'] = np.interp(
                        cur_iters, xp,
                        [self.warmup_momentum, self._base_momentum[group_idx]])
        else:
            self._warmup_end = True

    def after_train_epoch(self, runner: Runner):
        """Operations after each training epoch.

        Args:
            runner (Runner): The runner of the training process.
        """
        if not self._warmup_end:
            return

        cur_epoch = runner.epoch
        optimizer = runner.optim_wrapper.optimizer
        for group_idx, param in enumerate(optimizer.param_groups):
            param['lr'] = self._base_lr[group_idx] * self.scheduler_fn(
                cur_epoch)
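A quick check of the two factor generators defined at the top of this file (standalone sketch, hypothetical values):

lin = linear_fn(lr_factor=0.01, max_epochs=300)
cos = cosine_fn(lr_factor=0.01, max_epochs=300)
print(lin(0), lin(300))  # 1.0 at epoch 0, 0.01 at the final epoch
print(cos(0), cos(300))  # 1.0 at epoch 0, 0.01 at the final epoch
# Both multiply the group's base lr in after_train_epoch once warmup has ended.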
mmyolo/engine/hooks/yolox_mode_switch_hook.py
ADDED
@@ -0,0 +1,54 @@
# Copyright (c) OpenMMLab. All rights reserved.
import copy
from typing import Sequence

from mmengine.hooks import Hook
from mmengine.model import is_model_wrapper
from mmengine.runner import Runner

from mmyolo.registry import HOOKS


@HOOKS.register_module()
class YOLOXModeSwitchHook(Hook):
    """Switch the mode of YOLOX during training.

    This hook turns off the mosaic and mixup data augmentation and switches
    to use L1 loss in bbox_head.

    Args:
        num_last_epochs (int): The number of epochs at the end of training
            during which the data augmentation is closed and the model
            switches to L1 loss. Defaults to 15.
    """

    def __init__(self,
                 num_last_epochs: int = 15,
                 new_train_pipeline: Sequence[dict] = None):
        self.num_last_epochs = num_last_epochs
        self.new_train_pipeline_cfg = new_train_pipeline

    def before_train_epoch(self, runner: Runner):
        """Close mosaic and mixup augmentation and switch to use L1 loss."""
        epoch = runner.epoch
        model = runner.model
        if is_model_wrapper(model):
            model = model.module

        if (epoch + 1) == runner.max_epochs - self.num_last_epochs:
            runner.logger.info(f'New Pipeline: {self.new_train_pipeline_cfg}')

            train_dataloader_cfg = copy.deepcopy(runner.cfg.train_dataloader)
            train_dataloader_cfg.dataset.pipeline = self.new_train_pipeline_cfg
            # Note: Why rebuild the dataset?
            # Because build_dataloader makes a deep copy of the dataset,
            # reusing the old one carries potential risks, such as the global
            # instance object FileClient data being disordered.
            # This problem needs to be solved in the future.
            new_train_dataloader = Runner.build_dataloader(
                train_dataloader_cfg)
            runner.train_loop.dataloader = new_train_dataloader

            runner.logger.info('recreate the dataloader!')
            runner.logger.info('Add additional bbox reg loss now!')
            model.bbox_head.use_bbox_aux = True
mmyolo/engine/optimizers/__init__.py
ADDED
@@ -0,0 +1,5 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .yolov5_optim_constructor import YOLOv5OptimizerConstructor
from .yolov7_optim_wrapper_constructor import YOLOv7OptimWrapperConstructor

__all__ = ['YOLOv5OptimizerConstructor', 'YOLOv7OptimWrapperConstructor']
mmyolo/engine/optimizers/yolov5_optim_constructor.py
ADDED
@@ -0,0 +1,132 @@
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Optional

import torch.nn as nn
from mmengine.dist import get_world_size
from mmengine.logging import print_log
from mmengine.model import is_model_wrapper
from mmengine.optim import OptimWrapper

from mmyolo.registry import (OPTIM_WRAPPER_CONSTRUCTORS, OPTIM_WRAPPERS,
                             OPTIMIZERS)


@OPTIM_WRAPPER_CONSTRUCTORS.register_module()
class YOLOv5OptimizerConstructor:
    """YOLOv5 constructor for optimizers.

    It has the following functions:

    - divides the optimizer parameters into 3 groups:
      Conv, Bias and BN

    - support `weight_decay` parameter adaption based on
      `batch_size_per_gpu`

    Args:
        optim_wrapper_cfg (dict): The config dict of the optimizer wrapper.
            Positional fields are

            - ``type``: class name of the OptimizerWrapper
            - ``optimizer``: The configuration of optimizer.

            Optional fields are

            - any arguments of the corresponding optimizer wrapper type,
              e.g., accumulative_counts, clip_grad, etc.

            The positional fields of ``optimizer`` are

            - `type`: class name of the optimizer.

            Optional fields are

            - any arguments of the corresponding optimizer type, e.g.,
              lr, weight_decay, momentum, etc.

        paramwise_cfg (dict, optional): Parameter-wise options. Must include
            `base_total_batch_size` if not None. If the total input batch
            is smaller than `base_total_batch_size`, the `weight_decay`
            parameter will be kept unchanged, otherwise linear scaling.

    Example:
        >>> model = torch.nn.modules.Conv1d(1, 1, 1)
        >>> optim_wrapper_cfg = dict(
        >>>     dict(type='OptimWrapper', optimizer=dict(type='SGD', lr=0.01,
        >>>         momentum=0.9, weight_decay=0.0001, batch_size_per_gpu=16))
        >>> paramwise_cfg = dict(base_total_batch_size=64)
        >>> optim_wrapper_builder = YOLOv5OptimizerConstructor(
        >>>     optim_wrapper_cfg, paramwise_cfg)
        >>> optim_wrapper = optim_wrapper_builder(model)
    """

    def __init__(self,
                 optim_wrapper_cfg: dict,
                 paramwise_cfg: Optional[dict] = None):
        if paramwise_cfg is None:
            paramwise_cfg = {'base_total_batch_size': 64}
        assert 'base_total_batch_size' in paramwise_cfg

        if not isinstance(optim_wrapper_cfg, dict):
            raise TypeError('optimizer_cfg should be a dict',
                            f'but got {type(optim_wrapper_cfg)}')
        assert 'optimizer' in optim_wrapper_cfg, (
            '`optim_wrapper_cfg` must contain "optimizer" config')

        self.optim_wrapper_cfg = optim_wrapper_cfg
        self.optimizer_cfg = self.optim_wrapper_cfg.pop('optimizer')
        self.base_total_batch_size = paramwise_cfg['base_total_batch_size']

    def __call__(self, model: nn.Module) -> OptimWrapper:
        if is_model_wrapper(model):
            model = model.module
        optimizer_cfg = self.optimizer_cfg.copy()
        weight_decay = optimizer_cfg.pop('weight_decay', 0)

        if 'batch_size_per_gpu' in optimizer_cfg:
            batch_size_per_gpu = optimizer_cfg.pop('batch_size_per_gpu')
            # No scaling if total_batch_size is less than
            # base_total_batch_size, otherwise linear scaling.
            total_batch_size = get_world_size() * batch_size_per_gpu
            accumulate = max(
                round(self.base_total_batch_size / total_batch_size), 1)
            scale_factor = total_batch_size * \
                accumulate / self.base_total_batch_size

            if scale_factor != 1:
                weight_decay *= scale_factor
                print_log(f'Scaled weight_decay to {weight_decay}', 'current')

        params_groups = [], [], []

        for v in model.modules():
            if hasattr(v, 'bias') and isinstance(v.bias, nn.Parameter):
                params_groups[2].append(v.bias)
            # Includes SyncBatchNorm
            if isinstance(v, nn.modules.batchnorm._NormBase):
                params_groups[1].append(v.weight)
            elif hasattr(v, 'weight') and isinstance(v.weight, nn.Parameter):
                params_groups[0].append(v.weight)

        # Note: Make sure bias is in the last parameter group
        optimizer_cfg['params'] = []
        # conv
        optimizer_cfg['params'].append({
            'params': params_groups[0],
            'weight_decay': weight_decay
        })
        # bn
        optimizer_cfg['params'].append({'params': params_groups[1]})
        # bias
        optimizer_cfg['params'].append({'params': params_groups[2]})

        print_log(
            'Optimizer groups: %g .bias, %g conv.weight, %g other' %
            (len(params_groups[2]), len(params_groups[0]), len(
                params_groups[1])), 'current')
        del params_groups

        optimizer = OPTIMIZERS.build(optimizer_cfg)
        optim_wrapper = OPTIM_WRAPPERS.build(
            self.optim_wrapper_cfg, default_args=dict(optimizer=optimizer))
        return optim_wrapper
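For intuition on the scaling rule implemented in `__call__` above, here is a stand-alone worked example of the same arithmetic; the 8-GPU / batch-16 numbers are illustrative, not taken from this upload.

# Minimal sketch of the weight_decay scaling rule above, assuming 8 GPUs,
# batch_size_per_gpu=16 and the default base_total_batch_size=64.
base_total_batch_size = 64
world_size, batch_size_per_gpu, weight_decay = 8, 16, 0.0005

total_batch_size = world_size * batch_size_per_gpu                     # 128
accumulate = max(round(base_total_batch_size / total_batch_size), 1)   # 1
scale_factor = total_batch_size * accumulate / base_total_batch_size   # 2.0
if scale_factor != 1:
    weight_decay *= scale_factor                                       # 0.001
print(weight_decay)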
mmyolo/engine/optimizers/yolov7_optim_wrapper_constructor.py
ADDED
@@ -0,0 +1,139 @@
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Optional

import torch.nn as nn
from mmengine.dist import get_world_size
from mmengine.logging import print_log
from mmengine.model import is_model_wrapper
from mmengine.optim import OptimWrapper

from mmyolo.models.dense_heads.yolov7_head import ImplicitA, ImplicitM
from mmyolo.registry import (OPTIM_WRAPPER_CONSTRUCTORS, OPTIM_WRAPPERS,
                             OPTIMIZERS)


# TODO: Consider merging into YOLOv5OptimizerConstructor
@OPTIM_WRAPPER_CONSTRUCTORS.register_module()
class YOLOv7OptimWrapperConstructor:
    """YOLOv7 constructor for optimizer wrappers.

    It has the following functions:

    - divides the optimizer parameters into 3 groups:
      Conv, Bias and BN/ImplicitA/ImplicitM

    - support `weight_decay` parameter adaption based on
      `batch_size_per_gpu`

    Args:
        optim_wrapper_cfg (dict): The config dict of the optimizer wrapper.
            Positional fields are

            - ``type``: class name of the OptimizerWrapper
            - ``optimizer``: The configuration of optimizer.

            Optional fields are

            - any arguments of the corresponding optimizer wrapper type,
              e.g., accumulative_counts, clip_grad, etc.

            The positional fields of ``optimizer`` are

            - `type`: class name of the optimizer.

            Optional fields are

            - any arguments of the corresponding optimizer type, e.g.,
              lr, weight_decay, momentum, etc.

        paramwise_cfg (dict, optional): Parameter-wise options. Must include
            `base_total_batch_size` if not None. If the total input batch
            is smaller than `base_total_batch_size`, the `weight_decay`
            parameter will be kept unchanged, otherwise linear scaling.

    Example:
        >>> model = torch.nn.modules.Conv1d(1, 1, 1)
        >>> optim_wrapper_cfg = dict(
        >>>     dict(type='OptimWrapper', optimizer=dict(type='SGD', lr=0.01,
        >>>         momentum=0.9, weight_decay=0.0001, batch_size_per_gpu=16))
        >>> paramwise_cfg = dict(base_total_batch_size=64)
        >>> optim_wrapper_builder = YOLOv7OptimWrapperConstructor(
        >>>     optim_wrapper_cfg, paramwise_cfg)
        >>> optim_wrapper = optim_wrapper_builder(model)
    """

    def __init__(self,
                 optim_wrapper_cfg: dict,
                 paramwise_cfg: Optional[dict] = None):
        if paramwise_cfg is None:
            paramwise_cfg = {'base_total_batch_size': 64}
        assert 'base_total_batch_size' in paramwise_cfg

        if not isinstance(optim_wrapper_cfg, dict):
            raise TypeError('optimizer_cfg should be a dict',
                            f'but got {type(optim_wrapper_cfg)}')
        assert 'optimizer' in optim_wrapper_cfg, (
            '`optim_wrapper_cfg` must contain "optimizer" config')

        self.optim_wrapper_cfg = optim_wrapper_cfg
        self.optimizer_cfg = self.optim_wrapper_cfg.pop('optimizer')
        self.base_total_batch_size = paramwise_cfg['base_total_batch_size']

    def __call__(self, model: nn.Module) -> OptimWrapper:
        if is_model_wrapper(model):
            model = model.module
        optimizer_cfg = self.optimizer_cfg.copy()
        weight_decay = optimizer_cfg.pop('weight_decay', 0)

        if 'batch_size_per_gpu' in optimizer_cfg:
            batch_size_per_gpu = optimizer_cfg.pop('batch_size_per_gpu')
            # No scaling if total_batch_size is less than
            # base_total_batch_size, otherwise linear scaling.
            total_batch_size = get_world_size() * batch_size_per_gpu
            accumulate = max(
                round(self.base_total_batch_size / total_batch_size), 1)
            scale_factor = total_batch_size * \
                accumulate / self.base_total_batch_size

            if scale_factor != 1:
                weight_decay *= scale_factor
                print_log(f'Scaled weight_decay to {weight_decay}', 'current')

        params_groups = [], [], []
        for v in model.modules():
            # no decay
            # Caution: Coupling with model
            if isinstance(v, (ImplicitA, ImplicitM)):
                params_groups[0].append(v.implicit)
            elif isinstance(v, nn.modules.batchnorm._NormBase):
                params_groups[0].append(v.weight)
            # apply decay
            elif hasattr(v, 'weight') and isinstance(v.weight, nn.Parameter):
                params_groups[1].append(v.weight)  # apply decay

            # biases, no decay
            if hasattr(v, 'bias') and isinstance(v.bias, nn.Parameter):
                params_groups[2].append(v.bias)

        # Note: Make sure bias is in the last parameter group
        optimizer_cfg['params'] = []
        # conv
        optimizer_cfg['params'].append({
            'params': params_groups[1],
            'weight_decay': weight_decay
        })
        # bn ...
        optimizer_cfg['params'].append({'params': params_groups[0]})
        # bias
        optimizer_cfg['params'].append({'params': params_groups[2]})

        print_log(
            'Optimizer groups: %g .bias, %g conv.weight, %g other' %
            (len(params_groups[2]), len(params_groups[1]), len(
                params_groups[0])), 'current')
        del params_groups

        optimizer = OPTIMIZERS.build(optimizer_cfg)
        optim_wrapper = OPTIM_WRAPPERS.build(
            self.optim_wrapper_cfg, default_args=dict(optimizer=optimizer))
        return optim_wrapper
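The grouping above differs from the YOLOv5 constructor only in that ImplicitA/ImplicitM parameters join the no-decay group. A minimal plain-PyTorch sketch of the resulting three-group optimizer layout (tensor shapes and hyperparameters are illustrative assumptions, not values from this upload):

# Sketch of the three param-group layout produced above: decayed conv weights
# first, then BN/ImplicitA/ImplicitM weights, then biases (both without decay,
# since SGD's default weight_decay is 0).
import torch

conv_w = [torch.nn.Parameter(torch.randn(3, 3))]
norm_and_implicit_w = [torch.nn.Parameter(torch.randn(3))]
biases = [torch.nn.Parameter(torch.randn(3))]

optimizer = torch.optim.SGD(
    [
        {'params': conv_w, 'weight_decay': 5e-4},  # decay applied
        {'params': norm_and_implicit_w},           # no decay
        {'params': biases},                        # no decay
    ],
    lr=0.01, momentum=0.937)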
mmyolo/models/__init__.py
ADDED
@@ -0,0 +1,10 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .backbones import *  # noqa: F401,F403
from .data_preprocessors import *  # noqa: F401,F403
from .dense_heads import *  # noqa: F401,F403
from .detectors import *  # noqa: F401,F403
from .layers import *  # noqa: F401,F403
from .losses import *  # noqa: F401,F403
from .necks import *  # noqa: F401,F403
from .plugins import *  # noqa: F401,F403
from .task_modules import *  # noqa: F401,F403
mmyolo/models/backbones/__init__.py
ADDED
@@ -0,0 +1,13 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .base_backbone import BaseBackbone
from .csp_darknet import YOLOv5CSPDarknet, YOLOv8CSPDarknet, YOLOXCSPDarknet
from .csp_resnet import PPYOLOECSPResNet
from .cspnext import CSPNeXt
from .efficient_rep import YOLOv6CSPBep, YOLOv6EfficientRep
from .yolov7_backbone import YOLOv7Backbone

__all__ = [
    'YOLOv5CSPDarknet', 'BaseBackbone', 'YOLOv6EfficientRep', 'YOLOv6CSPBep',
    'YOLOXCSPDarknet', 'CSPNeXt', 'YOLOv7Backbone', 'PPYOLOECSPResNet',
    'YOLOv8CSPDarknet'
]
mmyolo/models/backbones/base_backbone.py
ADDED
@@ -0,0 +1,225 @@
# Copyright (c) OpenMMLab. All rights reserved.
from abc import ABCMeta, abstractmethod
from typing import List, Sequence, Union

import torch
import torch.nn as nn
from mmcv.cnn import build_plugin_layer
from mmdet.utils import ConfigType, OptMultiConfig
from mmengine.model import BaseModule
from torch.nn.modules.batchnorm import _BatchNorm

from mmyolo.registry import MODELS


@MODELS.register_module()
class BaseBackbone(BaseModule, metaclass=ABCMeta):
    """BaseBackbone backbone used in YOLO series.

    .. code:: text

     Backbone model structure diagram
     +-----------+
     |   input   |
     +-----------+
           v
     +-----------+
     |   stem    |
     |   layer   |
     +-----------+
           v
     +-----------+
     |   stage   |
     |  layer 1  |
     +-----------+
           v
     +-----------+
     |   stage   |
     |  layer 2  |
     +-----------+
           v
         ......
           v
     +-----------+
     |   stage   |
     |  layer n  |
     +-----------+
     In P5 model, n=4
     In P6 model, n=5

    Args:
        arch_setting (list): Architecture of BaseBackbone.
        plugins (list[dict]): List of plugins for stages, each dict contains:

            - cfg (dict, required): Cfg dict to build plugin.
            - stages (tuple[bool], optional): Stages to apply plugin, length
              should be same as 'num_stages'.
        deepen_factor (float): Depth multiplier, multiply number of
            blocks in CSP layer by this amount. Defaults to 1.0.
        widen_factor (float): Width multiplier, multiply number of
            channels in each layer by this amount. Defaults to 1.0.
        input_channels: Number of input image channels. Defaults to 3.
        out_indices (Sequence[int]): Output from which stages.
            Defaults to (2, 3, 4).
        frozen_stages (int): Stages to be frozen (stop grad and set eval
            mode). -1 means not freezing any parameters. Defaults to -1.
        norm_cfg (dict): Dictionary to construct and config norm layer.
            Defaults to None.
        act_cfg (dict): Config dict for activation layer.
            Defaults to None.
        norm_eval (bool): Whether to set norm layers to eval mode, namely,
            freeze running stats (mean and var). Note: Effect on Batch Norm
            and its variants only. Defaults to False.
        init_cfg (dict or list[dict], optional): Initialization config dict.
            Defaults to None.
    """

    def __init__(self,
                 arch_setting: list,
                 deepen_factor: float = 1.0,
                 widen_factor: float = 1.0,
                 input_channels: int = 3,
                 out_indices: Sequence[int] = (2, 3, 4),
                 frozen_stages: int = -1,
                 plugins: Union[dict, List[dict]] = None,
                 norm_cfg: ConfigType = None,
                 act_cfg: ConfigType = None,
                 norm_eval: bool = False,
                 init_cfg: OptMultiConfig = None):
        super().__init__(init_cfg)
        self.num_stages = len(arch_setting)
        self.arch_setting = arch_setting

        assert set(out_indices).issubset(
            i for i in range(len(arch_setting) + 1))

        if frozen_stages not in range(-1, len(arch_setting) + 1):
            raise ValueError('"frozen_stages" must be in range(-1, '
                             'len(arch_setting) + 1). But received '
                             f'{frozen_stages}')

        self.input_channels = input_channels
        self.out_indices = out_indices
        self.frozen_stages = frozen_stages
        self.widen_factor = widen_factor
        self.deepen_factor = deepen_factor
        self.norm_eval = norm_eval
        self.norm_cfg = norm_cfg
        self.act_cfg = act_cfg
        self.plugins = plugins

        self.stem = self.build_stem_layer()
        self.layers = ['stem']

        for idx, setting in enumerate(arch_setting):
            stage = []
            stage += self.build_stage_layer(idx, setting)
            if plugins is not None:
                stage += self.make_stage_plugins(plugins, idx, setting)
            self.add_module(f'stage{idx + 1}', nn.Sequential(*stage))
            self.layers.append(f'stage{idx + 1}')

    @abstractmethod
    def build_stem_layer(self):
        """Build a stem layer."""
        pass

    @abstractmethod
    def build_stage_layer(self, stage_idx: int, setting: list):
        """Build a stage layer.

        Args:
            stage_idx (int): The index of a stage layer.
            setting (list): The architecture setting of a stage layer.
        """
        pass

    def make_stage_plugins(self, plugins, stage_idx, setting):
        """Make plugins for backbone ``stage_idx`` th stage.

        Currently we support to insert ``context_block``,
        ``empirical_attention_block``, ``nonlocal_block``, ``dropout_block``
        into the backbone.

        An example of plugins format could be:

        Examples:
            >>> plugins=[
            ...     dict(cfg=dict(type='xxx', arg1='xxx'),
            ...          stages=(False, True, True, True)),
            ...     dict(cfg=dict(type='yyy'),
            ...          stages=(True, True, True, True)),
            ... ]
            >>> model = YOLOv5CSPDarknet()
            >>> stage_plugins = model.make_stage_plugins(plugins, 0, setting)
            >>> assert len(stage_plugins) == 1

        Suppose ``stage_idx=0``, the structure of blocks in the stage would be:

        .. code-block:: none

            conv1 -> conv2 -> conv3 -> yyy

        Suppose ``stage_idx=1``, the structure of blocks in the stage would be:

        .. code-block:: none

            conv1 -> conv2 -> conv3 -> xxx -> yyy

        Args:
            plugins (list[dict]): List of plugins cfg to build. The postfix is
                required if multiple same type plugins are inserted.
            stage_idx (int): Index of stage to build
                If stages is missing, the plugin would be applied to all
                stages.
            setting (list): The architecture setting of a stage layer.

        Returns:
            list[nn.Module]: Plugins for current stage
        """
        # TODO: It is not general enough to support any channel and needs
        # to be refactored
        in_channels = int(setting[1] * self.widen_factor)
        plugin_layers = []
        for plugin in plugins:
            plugin = plugin.copy()
            stages = plugin.pop('stages', None)
            assert stages is None or len(stages) == self.num_stages
            if stages is None or stages[stage_idx]:
                name, layer = build_plugin_layer(
                    plugin['cfg'], in_channels=in_channels)
                plugin_layers.append(layer)
        return plugin_layers

    def _freeze_stages(self):
        """Freeze the parameters of the specified stage so that they are no
        longer updated."""
        if self.frozen_stages >= 0:
            for i in range(self.frozen_stages + 1):
                m = getattr(self, self.layers[i])
                m.eval()
                for param in m.parameters():
                    param.requires_grad = False

    def train(self, mode: bool = True):
        """Convert the model into training mode while keep normalization layer
        frozen."""
        super().train(mode)
        self._freeze_stages()
        if mode and self.norm_eval:
            for m in self.modules():
                if isinstance(m, _BatchNorm):
                    m.eval()

    def forward(self, x: torch.Tensor) -> tuple:
        """Forward batch_inputs from the data_preprocessor."""
        outs = []
        for i, layer_name in enumerate(self.layers):
            layer = getattr(self, layer_name)
            x = layer(x)
            if i in self.out_indices:
                outs.append(x)

        return tuple(outs)
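To make the contract of BaseBackbone concrete, a hypothetical toy subclass (not part of this upload) only needs to implement `build_stem_layer` and `build_stage_layer`; the base class then assembles `stem` plus `stage1..stageN` and handles `forward`, `out_indices` and stage freezing.

# Hypothetical toy subclass, for illustration only.
import torch.nn as nn


class TinyBackbone(BaseBackbone):  # not part of mmyolo

    def build_stem_layer(self) -> nn.Module:
        return nn.Conv2d(self.input_channels,
                         int(self.arch_setting[0][0] * self.widen_factor),
                         kernel_size=3, stride=2, padding=1)

    def build_stage_layer(self, stage_idx: int, setting: list) -> list:
        in_c = int(setting[0] * self.widen_factor)
        out_c = int(setting[1] * self.widen_factor)
        return [nn.Conv2d(in_c, out_c, kernel_size=3, stride=2, padding=1)]


# arch_setting rows follow the same "in_channels, out_channels, ..." layout
# used by the concrete backbones below; out_indices must stay within the
# number of stages plus the stem.
toy = TinyBackbone(arch_setting=[[16, 32], [32, 64], [64, 128]],
                   out_indices=(1, 2, 3))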
mmyolo/models/backbones/csp_darknet.py
ADDED
@@ -0,0 +1,427 @@
1 |
+
# Copyright (c) OpenMMLab. All rights reserved.
|
2 |
+
from typing import List, Tuple, Union
|
3 |
+
|
4 |
+
import torch
|
5 |
+
import torch.nn as nn
|
6 |
+
from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule
|
7 |
+
from mmdet.models.backbones.csp_darknet import CSPLayer, Focus
|
8 |
+
from mmdet.utils import ConfigType, OptMultiConfig
|
9 |
+
|
10 |
+
from mmyolo.registry import MODELS
|
11 |
+
from ..layers import CSPLayerWithTwoConv, SPPFBottleneck
|
12 |
+
from ..utils import make_divisible, make_round
|
13 |
+
from .base_backbone import BaseBackbone
|
14 |
+
|
15 |
+
|
16 |
+
@MODELS.register_module()
|
17 |
+
class YOLOv5CSPDarknet(BaseBackbone):
|
18 |
+
"""CSP-Darknet backbone used in YOLOv5.
|
19 |
+
Args:
|
20 |
+
arch (str): Architecture of CSP-Darknet, from {P5, P6}.
|
21 |
+
Defaults to P5.
|
22 |
+
plugins (list[dict]): List of plugins for stages, each dict contains:
|
23 |
+
- cfg (dict, required): Cfg dict to build plugin.
|
24 |
+
- stages (tuple[bool], optional): Stages to apply plugin, length
|
25 |
+
should be same as 'num_stages'.
|
26 |
+
deepen_factor (float): Depth multiplier, multiply number of
|
27 |
+
blocks in CSP layer by this amount. Defaults to 1.0.
|
28 |
+
widen_factor (float): Width multiplier, multiply number of
|
29 |
+
channels in each layer by this amount. Defaults to 1.0.
|
30 |
+
input_channels (int): Number of input image channels. Defaults to: 3.
|
31 |
+
out_indices (Tuple[int]): Output from which stages.
|
32 |
+
Defaults to (2, 3, 4).
|
33 |
+
frozen_stages (int): Stages to be frozen (stop grad and set eval
|
34 |
+
mode). -1 means not freezing any parameters. Defaults to -1.
|
35 |
+
norm_cfg (dict): Dictionary to construct and config norm layer.
|
36 |
+
Defaults to dict(type='BN', requires_grad=True).
|
37 |
+
act_cfg (dict): Config dict for activation layer.
|
38 |
+
Defaults to dict(type='SiLU', inplace=True).
|
39 |
+
norm_eval (bool): Whether to set norm layers to eval mode, namely,
|
40 |
+
freeze running stats (mean and var). Note: Effect on Batch Norm
|
41 |
+
and its variants only. Defaults to False.
|
42 |
+
init_cfg (Union[dict,list[dict]], optional): Initialization config
|
43 |
+
dict. Defaults to None.
|
44 |
+
Example:
|
45 |
+
>>> from mmyolo.models import YOLOv5CSPDarknet
|
46 |
+
>>> import torch
|
47 |
+
>>> model = YOLOv5CSPDarknet()
|
48 |
+
>>> model.eval()
|
49 |
+
>>> inputs = torch.rand(1, 3, 416, 416)
|
50 |
+
>>> level_outputs = model(inputs)
|
51 |
+
>>> for level_out in level_outputs:
|
52 |
+
... print(tuple(level_out.shape))
|
53 |
+
...
|
54 |
+
(1, 256, 52, 52)
|
55 |
+
(1, 512, 26, 26)
|
56 |
+
(1, 1024, 13, 13)
|
57 |
+
"""
|
58 |
+
# From left to right:
|
59 |
+
# in_channels, out_channels, num_blocks, add_identity, use_spp
|
60 |
+
arch_settings = {
|
61 |
+
'P5': [[64, 128, 3, True, False], [128, 256, 6, True, False],
|
62 |
+
[256, 512, 9, True, False], [512, 1024, 3, True, True]],
|
63 |
+
'P6': [[64, 128, 3, True, False], [128, 256, 6, True, False],
|
64 |
+
[256, 512, 9, True, False], [512, 768, 3, True, False],
|
65 |
+
[768, 1024, 3, True, True]]
|
66 |
+
}
|
67 |
+
|
68 |
+
def __init__(self,
|
69 |
+
arch: str = 'P5',
|
70 |
+
plugins: Union[dict, List[dict]] = None,
|
71 |
+
deepen_factor: float = 1.0,
|
72 |
+
widen_factor: float = 1.0,
|
73 |
+
input_channels: int = 3,
|
74 |
+
out_indices: Tuple[int] = (2, 3, 4),
|
75 |
+
frozen_stages: int = -1,
|
76 |
+
norm_cfg: ConfigType = dict(
|
77 |
+
type='BN', momentum=0.03, eps=0.001),
|
78 |
+
act_cfg: ConfigType = dict(type='SiLU', inplace=True),
|
79 |
+
norm_eval: bool = False,
|
80 |
+
init_cfg: OptMultiConfig = None):
|
81 |
+
super().__init__(
|
82 |
+
self.arch_settings[arch],
|
83 |
+
deepen_factor,
|
84 |
+
widen_factor,
|
85 |
+
input_channels=input_channels,
|
86 |
+
out_indices=out_indices,
|
87 |
+
plugins=plugins,
|
88 |
+
frozen_stages=frozen_stages,
|
89 |
+
norm_cfg=norm_cfg,
|
90 |
+
act_cfg=act_cfg,
|
91 |
+
norm_eval=norm_eval,
|
92 |
+
init_cfg=init_cfg)
|
93 |
+
|
94 |
+
def build_stem_layer(self) -> nn.Module:
|
95 |
+
"""Build a stem layer."""
|
96 |
+
return ConvModule(
|
97 |
+
self.input_channels,
|
98 |
+
make_divisible(self.arch_setting[0][0], self.widen_factor),
|
99 |
+
kernel_size=6,
|
100 |
+
stride=2,
|
101 |
+
padding=2,
|
102 |
+
norm_cfg=self.norm_cfg,
|
103 |
+
act_cfg=self.act_cfg)
|
104 |
+
|
105 |
+
def build_stage_layer(self, stage_idx: int, setting: list) -> list:
|
106 |
+
"""Build a stage layer.
|
107 |
+
|
108 |
+
Args:
|
109 |
+
stage_idx (int): The index of a stage layer.
|
110 |
+
setting (list): The architecture setting of a stage layer.
|
111 |
+
"""
|
112 |
+
in_channels, out_channels, num_blocks, add_identity, use_spp = setting
|
113 |
+
|
114 |
+
in_channels = make_divisible(in_channels, self.widen_factor)
|
115 |
+
out_channels = make_divisible(out_channels, self.widen_factor)
|
116 |
+
num_blocks = make_round(num_blocks, self.deepen_factor)
|
117 |
+
stage = []
|
118 |
+
conv_layer = ConvModule(
|
119 |
+
in_channels,
|
120 |
+
out_channels,
|
121 |
+
kernel_size=3,
|
122 |
+
stride=2,
|
123 |
+
padding=1,
|
124 |
+
norm_cfg=self.norm_cfg,
|
125 |
+
act_cfg=self.act_cfg)
|
126 |
+
stage.append(conv_layer)
|
127 |
+
csp_layer = CSPLayer(
|
128 |
+
out_channels,
|
129 |
+
out_channels,
|
130 |
+
num_blocks=num_blocks,
|
131 |
+
add_identity=add_identity,
|
132 |
+
norm_cfg=self.norm_cfg,
|
133 |
+
act_cfg=self.act_cfg)
|
134 |
+
stage.append(csp_layer)
|
135 |
+
if use_spp:
|
136 |
+
spp = SPPFBottleneck(
|
137 |
+
out_channels,
|
138 |
+
out_channels,
|
139 |
+
kernel_sizes=5,
|
140 |
+
norm_cfg=self.norm_cfg,
|
141 |
+
act_cfg=self.act_cfg)
|
142 |
+
stage.append(spp)
|
143 |
+
return stage
|
144 |
+
|
145 |
+
def init_weights(self):
|
146 |
+
"""Initialize the parameters."""
|
147 |
+
if self.init_cfg is None:
|
148 |
+
for m in self.modules():
|
149 |
+
if isinstance(m, torch.nn.Conv2d):
|
150 |
+
# In order to be consistent with the source code,
|
151 |
+
# reset the Conv2d initialization parameters
|
152 |
+
m.reset_parameters()
|
153 |
+
else:
|
154 |
+
super().init_weights()
|
155 |
+
|
156 |
+
|
157 |
+
@MODELS.register_module()
|
158 |
+
class YOLOv8CSPDarknet(BaseBackbone):
|
159 |
+
"""CSP-Darknet backbone used in YOLOv8.
|
160 |
+
|
161 |
+
Args:
|
162 |
+
arch (str): Architecture of CSP-Darknet, from {P5}.
|
163 |
+
Defaults to P5.
|
164 |
+
last_stage_out_channels (int): Final layer output channel.
|
165 |
+
Defaults to 1024.
|
166 |
+
plugins (list[dict]): List of plugins for stages, each dict contains:
|
167 |
+
- cfg (dict, required): Cfg dict to build plugin.
|
168 |
+
- stages (tuple[bool], optional): Stages to apply plugin, length
|
169 |
+
should be same as 'num_stages'.
|
170 |
+
deepen_factor (float): Depth multiplier, multiply number of
|
171 |
+
blocks in CSP layer by this amount. Defaults to 1.0.
|
172 |
+
widen_factor (float): Width multiplier, multiply number of
|
173 |
+
channels in each layer by this amount. Defaults to 1.0.
|
174 |
+
input_channels (int): Number of input image channels. Defaults to: 3.
|
175 |
+
out_indices (Tuple[int]): Output from which stages.
|
176 |
+
Defaults to (2, 3, 4).
|
177 |
+
frozen_stages (int): Stages to be frozen (stop grad and set eval
|
178 |
+
mode). -1 means not freezing any parameters. Defaults to -1.
|
179 |
+
norm_cfg (dict): Dictionary to construct and config norm layer.
|
180 |
+
Defaults to dict(type='BN', requires_grad=True).
|
181 |
+
act_cfg (dict): Config dict for activation layer.
|
182 |
+
Defaults to dict(type='SiLU', inplace=True).
|
183 |
+
norm_eval (bool): Whether to set norm layers to eval mode, namely,
|
184 |
+
freeze running stats (mean and var). Note: Effect on Batch Norm
|
185 |
+
and its variants only. Defaults to False.
|
186 |
+
init_cfg (Union[dict,list[dict]], optional): Initialization config
|
187 |
+
dict. Defaults to None.
|
188 |
+
|
189 |
+
Example:
|
190 |
+
>>> from mmyolo.models import YOLOv8CSPDarknet
|
191 |
+
>>> import torch
|
192 |
+
>>> model = YOLOv8CSPDarknet()
|
193 |
+
>>> model.eval()
|
194 |
+
>>> inputs = torch.rand(1, 3, 416, 416)
|
195 |
+
>>> level_outputs = model(inputs)
|
196 |
+
>>> for level_out in level_outputs:
|
197 |
+
... print(tuple(level_out.shape))
|
198 |
+
...
|
199 |
+
(1, 256, 52, 52)
|
200 |
+
(1, 512, 26, 26)
|
201 |
+
(1, 1024, 13, 13)
|
202 |
+
"""
|
203 |
+
# From left to right:
|
204 |
+
# in_channels, out_channels, num_blocks, add_identity, use_spp
|
205 |
+
# the final out_channels will be set according to the param.
|
206 |
+
arch_settings = {
|
207 |
+
'P5': [[64, 128, 3, True, False], [128, 256, 6, True, False],
|
208 |
+
[256, 512, 6, True, False], [512, None, 3, True, True]],
|
209 |
+
}
|
210 |
+
|
211 |
+
def __init__(self,
|
212 |
+
arch: str = 'P5',
|
213 |
+
last_stage_out_channels: int = 1024,
|
214 |
+
plugins: Union[dict, List[dict]] = None,
|
215 |
+
deepen_factor: float = 1.0,
|
216 |
+
widen_factor: float = 1.0,
|
217 |
+
input_channels: int = 3,
|
218 |
+
out_indices: Tuple[int] = (2, 3, 4),
|
219 |
+
frozen_stages: int = -1,
|
220 |
+
norm_cfg: ConfigType = dict(
|
221 |
+
type='BN', momentum=0.03, eps=0.001),
|
222 |
+
act_cfg: ConfigType = dict(type='SiLU', inplace=True),
|
223 |
+
norm_eval: bool = False,
|
224 |
+
init_cfg: OptMultiConfig = None):
|
225 |
+
self.arch_settings[arch][-1][1] = last_stage_out_channels
|
226 |
+
super().__init__(
|
227 |
+
self.arch_settings[arch],
|
228 |
+
deepen_factor,
|
229 |
+
widen_factor,
|
230 |
+
input_channels=input_channels,
|
231 |
+
out_indices=out_indices,
|
232 |
+
plugins=plugins,
|
233 |
+
frozen_stages=frozen_stages,
|
234 |
+
norm_cfg=norm_cfg,
|
235 |
+
act_cfg=act_cfg,
|
236 |
+
norm_eval=norm_eval,
|
237 |
+
init_cfg=init_cfg)
|
238 |
+
|
239 |
+
def build_stem_layer(self) -> nn.Module:
|
240 |
+
"""Build a stem layer."""
|
241 |
+
return ConvModule(
|
242 |
+
self.input_channels,
|
243 |
+
make_divisible(self.arch_setting[0][0], self.widen_factor),
|
244 |
+
kernel_size=3,
|
245 |
+
stride=2,
|
246 |
+
padding=1,
|
247 |
+
norm_cfg=self.norm_cfg,
|
248 |
+
act_cfg=self.act_cfg)
|
249 |
+
|
250 |
+
def build_stage_layer(self, stage_idx: int, setting: list) -> list:
|
251 |
+
"""Build a stage layer.
|
252 |
+
|
253 |
+
Args:
|
254 |
+
stage_idx (int): The index of a stage layer.
|
255 |
+
setting (list): The architecture setting of a stage layer.
|
256 |
+
"""
|
257 |
+
in_channels, out_channels, num_blocks, add_identity, use_spp = setting
|
258 |
+
|
259 |
+
in_channels = make_divisible(in_channels, self.widen_factor)
|
260 |
+
out_channels = make_divisible(out_channels, self.widen_factor)
|
261 |
+
num_blocks = make_round(num_blocks, self.deepen_factor)
|
262 |
+
stage = []
|
263 |
+
conv_layer = ConvModule(
|
264 |
+
in_channels,
|
265 |
+
out_channels,
|
266 |
+
kernel_size=3,
|
267 |
+
stride=2,
|
268 |
+
padding=1,
|
269 |
+
norm_cfg=self.norm_cfg,
|
270 |
+
act_cfg=self.act_cfg)
|
271 |
+
stage.append(conv_layer)
|
272 |
+
csp_layer = CSPLayerWithTwoConv(
|
273 |
+
out_channels,
|
274 |
+
out_channels,
|
275 |
+
num_blocks=num_blocks,
|
276 |
+
add_identity=add_identity,
|
277 |
+
norm_cfg=self.norm_cfg,
|
278 |
+
act_cfg=self.act_cfg)
|
279 |
+
stage.append(csp_layer)
|
280 |
+
if use_spp:
|
281 |
+
spp = SPPFBottleneck(
|
282 |
+
out_channels,
|
283 |
+
out_channels,
|
284 |
+
kernel_sizes=5,
|
285 |
+
norm_cfg=self.norm_cfg,
|
286 |
+
act_cfg=self.act_cfg)
|
287 |
+
stage.append(spp)
|
288 |
+
return stage
|
289 |
+
|
290 |
+
def init_weights(self):
|
291 |
+
"""Initialize the parameters."""
|
292 |
+
if self.init_cfg is None:
|
293 |
+
for m in self.modules():
|
294 |
+
if isinstance(m, torch.nn.Conv2d):
|
295 |
+
# In order to be consistent with the source code,
|
296 |
+
# reset the Conv2d initialization parameters
|
297 |
+
m.reset_parameters()
|
298 |
+
else:
|
299 |
+
super().init_weights()
|
300 |
+
|
301 |
+
|
302 |
+
@MODELS.register_module()
|
303 |
+
class YOLOXCSPDarknet(BaseBackbone):
|
304 |
+
"""CSP-Darknet backbone used in YOLOX.
|
305 |
+
|
306 |
+
Args:
|
307 |
+
arch (str): Architecture of CSP-Darknet, from {P5, P6}.
|
308 |
+
Defaults to P5.
|
309 |
+
plugins (list[dict]): List of plugins for stages, each dict contains:
|
310 |
+
|
311 |
+
- cfg (dict, required): Cfg dict to build plugin.
|
312 |
+
- stages (tuple[bool], optional): Stages to apply plugin, length
|
313 |
+
should be same as 'num_stages'.
|
314 |
+
deepen_factor (float): Depth multiplier, multiply number of
|
315 |
+
blocks in CSP layer by this amount. Defaults to 1.0.
|
316 |
+
widen_factor (float): Width multiplier, multiply number of
|
317 |
+
channels in each layer by this amount. Defaults to 1.0.
|
318 |
+
input_channels (int): Number of input image channels. Defaults to 3.
|
319 |
+
out_indices (Tuple[int]): Output from which stages.
|
320 |
+
Defaults to (2, 3, 4).
|
321 |
+
frozen_stages (int): Stages to be frozen (stop grad and set eval
|
322 |
+
mode). -1 means not freezing any parameters. Defaults to -1.
|
323 |
+
use_depthwise (bool): Whether to use depthwise separable convolution.
|
324 |
+
Defaults to False.
|
325 |
+
spp_kernal_sizes: (tuple[int]): Sequential of kernel sizes of SPP
|
326 |
+
layers. Defaults to (5, 9, 13).
|
327 |
+
norm_cfg (dict): Dictionary to construct and config norm layer.
|
328 |
+
Defaults to dict(type='BN', momentum=0.03, eps=0.001).
|
329 |
+
act_cfg (dict): Config dict for activation layer.
|
330 |
+
Defaults to dict(type='SiLU', inplace=True).
|
331 |
+
norm_eval (bool): Whether to set norm layers to eval mode, namely,
|
332 |
+
freeze running stats (mean and var). Note: Effect on Batch Norm
|
333 |
+
and its variants only.
|
334 |
+
init_cfg (Union[dict,list[dict]], optional): Initialization config
|
335 |
+
dict. Defaults to None.
|
336 |
+
Example:
|
337 |
+
>>> from mmyolo.models import YOLOXCSPDarknet
|
338 |
+
>>> import torch
|
339 |
+
>>> model = YOLOXCSPDarknet()
|
340 |
+
>>> model.eval()
|
341 |
+
>>> inputs = torch.rand(1, 3, 416, 416)
|
342 |
+
>>> level_outputs = model(inputs)
|
343 |
+
>>> for level_out in level_outputs:
|
344 |
+
... print(tuple(level_out.shape))
|
345 |
+
...
|
346 |
+
(1, 256, 52, 52)
|
347 |
+
(1, 512, 26, 26)
|
348 |
+
(1, 1024, 13, 13)
|
349 |
+
"""
|
350 |
+
# From left to right:
|
351 |
+
# in_channels, out_channels, num_blocks, add_identity, use_spp
|
352 |
+
arch_settings = {
|
353 |
+
'P5': [[64, 128, 3, True, False], [128, 256, 9, True, False],
|
354 |
+
[256, 512, 9, True, False], [512, 1024, 3, False, True]],
|
355 |
+
}
|
356 |
+
|
357 |
+
def __init__(self,
|
358 |
+
arch: str = 'P5',
|
359 |
+
plugins: Union[dict, List[dict]] = None,
|
360 |
+
deepen_factor: float = 1.0,
|
361 |
+
widen_factor: float = 1.0,
|
362 |
+
input_channels: int = 3,
|
363 |
+
out_indices: Tuple[int] = (2, 3, 4),
|
364 |
+
frozen_stages: int = -1,
|
365 |
+
use_depthwise: bool = False,
|
366 |
+
spp_kernal_sizes: Tuple[int] = (5, 9, 13),
|
367 |
+
norm_cfg: ConfigType = dict(
|
368 |
+
type='BN', momentum=0.03, eps=0.001),
|
369 |
+
act_cfg: ConfigType = dict(type='SiLU', inplace=True),
|
370 |
+
norm_eval: bool = False,
|
371 |
+
init_cfg: OptMultiConfig = None):
|
372 |
+
self.use_depthwise = use_depthwise
|
373 |
+
self.spp_kernal_sizes = spp_kernal_sizes
|
374 |
+
super().__init__(self.arch_settings[arch], deepen_factor, widen_factor,
|
375 |
+
input_channels, out_indices, frozen_stages, plugins,
|
376 |
+
norm_cfg, act_cfg, norm_eval, init_cfg)
|
377 |
+
|
378 |
+
def build_stem_layer(self) -> nn.Module:
|
379 |
+
"""Build a stem layer."""
|
380 |
+
return Focus(
|
381 |
+
3,
|
382 |
+
make_divisible(64, self.widen_factor),
|
383 |
+
kernel_size=3,
|
384 |
+
norm_cfg=self.norm_cfg,
|
385 |
+
act_cfg=self.act_cfg)
|
386 |
+
|
387 |
+
def build_stage_layer(self, stage_idx: int, setting: list) -> list:
|
388 |
+
"""Build a stage layer.
|
389 |
+
|
390 |
+
Args:
|
391 |
+
stage_idx (int): The index of a stage layer.
|
392 |
+
setting (list): The architecture setting of a stage layer.
|
393 |
+
"""
|
394 |
+
in_channels, out_channels, num_blocks, add_identity, use_spp = setting
|
395 |
+
|
396 |
+
in_channels = make_divisible(in_channels, self.widen_factor)
|
397 |
+
out_channels = make_divisible(out_channels, self.widen_factor)
|
398 |
+
num_blocks = make_round(num_blocks, self.deepen_factor)
|
399 |
+
stage = []
|
400 |
+
conv = DepthwiseSeparableConvModule \
|
401 |
+
if self.use_depthwise else ConvModule
|
402 |
+
conv_layer = conv(
|
403 |
+
in_channels,
|
404 |
+
out_channels,
|
405 |
+
kernel_size=3,
|
406 |
+
stride=2,
|
407 |
+
padding=1,
|
408 |
+
norm_cfg=self.norm_cfg,
|
409 |
+
act_cfg=self.act_cfg)
|
410 |
+
stage.append(conv_layer)
|
411 |
+
if use_spp:
|
412 |
+
spp = SPPFBottleneck(
|
413 |
+
out_channels,
|
414 |
+
out_channels,
|
415 |
+
kernel_sizes=self.spp_kernal_sizes,
|
416 |
+
norm_cfg=self.norm_cfg,
|
417 |
+
act_cfg=self.act_cfg)
|
418 |
+
stage.append(spp)
|
419 |
+
csp_layer = CSPLayer(
|
420 |
+
out_channels,
|
421 |
+
out_channels,
|
422 |
+
num_blocks=num_blocks,
|
423 |
+
add_identity=add_identity,
|
424 |
+
norm_cfg=self.norm_cfg,
|
425 |
+
act_cfg=self.act_cfg)
|
426 |
+
stage.append(csp_layer)
|
427 |
+
return stage
|
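The stage-plugin mechanism documented above is driven from the backbone config. A hedged sketch follows; the `CBAM` plugin name and its `reduce_ratio` argument are assumptions about mmyolo's plugins package, not confirmed by this upload.

# Illustrative backbone config applying a plugin to the last two stages only;
# the plugin type and its arguments are assumed, the in_channels argument is
# injected by make_stage_plugins() above.
backbone = dict(
    type='YOLOv5CSPDarknet',
    plugins=[
        dict(
            cfg=dict(type='CBAM', reduce_ratio=16),  # assumed plugin cfg
            stages=(False, False, True, True))
    ])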
mmyolo/models/backbones/csp_resnet.py
ADDED
@@ -0,0 +1,169 @@
1 |
+
# Copyright (c) OpenMMLab. All rights reserved.
|
2 |
+
from typing import List, Tuple, Union
|
3 |
+
|
4 |
+
import torch.nn as nn
|
5 |
+
from mmcv.cnn import ConvModule
|
6 |
+
from mmdet.utils import ConfigType, OptMultiConfig
|
7 |
+
|
8 |
+
from mmyolo.models.backbones import BaseBackbone
|
9 |
+
from mmyolo.models.layers.yolo_bricks import CSPResLayer
|
10 |
+
from mmyolo.registry import MODELS
|
11 |
+
|
12 |
+
|
13 |
+
@MODELS.register_module()
|
14 |
+
class PPYOLOECSPResNet(BaseBackbone):
|
15 |
+
"""CSP-ResNet backbone used in PPYOLOE.
|
16 |
+
|
17 |
+
Args:
|
18 |
+
arch (str): Architecture of CSPNeXt, from {P5, P6}.
|
19 |
+
Defaults to P5.
|
20 |
+
deepen_factor (float): Depth multiplier, multiply number of
|
21 |
+
blocks in CSP layer by this amount. Defaults to 1.0.
|
22 |
+
widen_factor (float): Width multiplier, multiply number of
|
23 |
+
channels in each layer by this amount. Defaults to 1.0.
|
24 |
+
out_indices (Sequence[int]): Output from which stages.
|
25 |
+
Defaults to (2, 3, 4).
|
26 |
+
frozen_stages (int): Stages to be frozen (stop grad and set eval
|
27 |
+
mode). -1 means not freezing any parameters. Defaults to -1.
|
28 |
+
plugins (list[dict]): List of plugins for stages, each dict contains:
|
29 |
+
- cfg (dict, required): Cfg dict to build plugin.
|
30 |
+
- stages (tuple[bool], optional): Stages to apply plugin, length
|
31 |
+
should be same as 'num_stages'.
|
32 |
+
arch_ovewrite (list): Overwrite default arch settings.
|
33 |
+
Defaults to None.
|
34 |
+
block_cfg (dict): Config dict for block. Defaults to
|
35 |
+
dict(type='PPYOLOEBasicBlock', shortcut=True, use_alpha=True)
|
36 |
+
norm_cfg (:obj:`ConfigDict` or dict): Dictionary to construct and
|
37 |
+
config norm layer. Defaults to dict(type='BN', momentum=0.1,
|
38 |
+
eps=1e-5).
|
39 |
+
act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer.
|
40 |
+
Defaults to dict(type='SiLU', inplace=True).
|
41 |
+
attention_cfg (dict): Config dict for `EffectiveSELayer`.
|
42 |
+
Defaults to dict(type='EffectiveSELayer',
|
43 |
+
act_cfg=dict(type='HSigmoid')).
|
44 |
+
norm_eval (bool): Whether to set norm layers to eval mode, namely,
|
45 |
+
freeze running stats (mean and var). Note: Effect on Batch Norm
|
46 |
+
and its variants only.
|
47 |
+
init_cfg (:obj:`ConfigDict` or dict or list[dict] or
|
48 |
+
list[:obj:`ConfigDict`]): Initialization config dict.
|
49 |
+
use_large_stem (bool): Whether to use large stem layer.
|
50 |
+
Defaults to False.
|
51 |
+
"""
|
52 |
+
# From left to right:
|
53 |
+
# in_channels, out_channels, num_blocks
|
54 |
+
arch_settings = {
|
55 |
+
'P5': [[64, 128, 3], [128, 256, 6], [256, 512, 6], [512, 1024, 3]]
|
56 |
+
}
|
57 |
+
|
58 |
+
def __init__(self,
|
59 |
+
arch: str = 'P5',
|
60 |
+
deepen_factor: float = 1.0,
|
61 |
+
widen_factor: float = 1.0,
|
62 |
+
input_channels: int = 3,
|
63 |
+
out_indices: Tuple[int] = (2, 3, 4),
|
64 |
+
frozen_stages: int = -1,
|
65 |
+
plugins: Union[dict, List[dict]] = None,
|
66 |
+
arch_ovewrite: dict = None,
|
67 |
+
block_cfg: ConfigType = dict(
|
68 |
+
type='PPYOLOEBasicBlock', shortcut=True, use_alpha=True),
|
69 |
+
norm_cfg: ConfigType = dict(
|
70 |
+
type='BN', momentum=0.1, eps=1e-5),
|
71 |
+
act_cfg: ConfigType = dict(type='SiLU', inplace=True),
|
72 |
+
attention_cfg: ConfigType = dict(
|
73 |
+
type='EffectiveSELayer', act_cfg=dict(type='HSigmoid')),
|
74 |
+
norm_eval: bool = False,
|
75 |
+
init_cfg: OptMultiConfig = None,
|
76 |
+
use_large_stem: bool = False):
|
77 |
+
arch_setting = self.arch_settings[arch]
|
78 |
+
if arch_ovewrite:
|
79 |
+
arch_setting = arch_ovewrite
|
80 |
+
arch_setting = [[
|
81 |
+
int(in_channels * widen_factor),
|
82 |
+
int(out_channels * widen_factor),
|
83 |
+
round(num_blocks * deepen_factor)
|
84 |
+
] for in_channels, out_channels, num_blocks in arch_setting]
|
85 |
+
self.block_cfg = block_cfg
|
86 |
+
self.use_large_stem = use_large_stem
|
87 |
+
self.attention_cfg = attention_cfg
|
88 |
+
|
89 |
+
super().__init__(
|
90 |
+
arch_setting,
|
91 |
+
deepen_factor,
|
92 |
+
widen_factor,
|
93 |
+
input_channels=input_channels,
|
94 |
+
out_indices=out_indices,
|
95 |
+
plugins=plugins,
|
96 |
+
frozen_stages=frozen_stages,
|
97 |
+
norm_cfg=norm_cfg,
|
98 |
+
act_cfg=act_cfg,
|
99 |
+
norm_eval=norm_eval,
|
100 |
+
init_cfg=init_cfg)
|
101 |
+
|
102 |
+
def build_stem_layer(self) -> nn.Module:
|
103 |
+
"""Build a stem layer."""
|
104 |
+
if self.use_large_stem:
|
105 |
+
stem = nn.Sequential(
|
106 |
+
ConvModule(
|
107 |
+
self.input_channels,
|
108 |
+
self.arch_setting[0][0] // 2,
|
109 |
+
3,
|
110 |
+
stride=2,
|
111 |
+
padding=1,
|
112 |
+
act_cfg=self.act_cfg,
|
113 |
+
norm_cfg=self.norm_cfg),
|
114 |
+
ConvModule(
|
115 |
+
self.arch_setting[0][0] // 2,
|
116 |
+
self.arch_setting[0][0] // 2,
|
117 |
+
3,
|
118 |
+
stride=1,
|
119 |
+
padding=1,
|
120 |
+
norm_cfg=self.norm_cfg,
|
121 |
+
act_cfg=self.act_cfg),
|
122 |
+
ConvModule(
|
123 |
+
self.arch_setting[0][0] // 2,
|
124 |
+
self.arch_setting[0][0],
|
125 |
+
3,
|
126 |
+
stride=1,
|
127 |
+
padding=1,
|
128 |
+
norm_cfg=self.norm_cfg,
|
129 |
+
act_cfg=self.act_cfg))
|
130 |
+
else:
|
131 |
+
stem = nn.Sequential(
|
132 |
+
ConvModule(
|
133 |
+
self.input_channels,
|
134 |
+
self.arch_setting[0][0] // 2,
|
135 |
+
3,
|
136 |
+
stride=2,
|
137 |
+
padding=1,
|
138 |
+
norm_cfg=self.norm_cfg,
|
139 |
+
act_cfg=self.act_cfg),
|
140 |
+
ConvModule(
|
141 |
+
self.arch_setting[0][0] // 2,
|
142 |
+
self.arch_setting[0][0],
|
143 |
+
3,
|
144 |
+
stride=1,
|
145 |
+
padding=1,
|
146 |
+
norm_cfg=self.norm_cfg,
|
147 |
+
act_cfg=self.act_cfg))
|
148 |
+
return stem
|
149 |
+
|
150 |
+
def build_stage_layer(self, stage_idx: int, setting: list) -> list:
|
151 |
+
"""Build a stage layer.
|
152 |
+
|
153 |
+
Args:
|
154 |
+
stage_idx (int): The index of a stage layer.
|
155 |
+
setting (list): The architecture setting of a stage layer.
|
156 |
+
"""
|
157 |
+
in_channels, out_channels, num_blocks = setting
|
158 |
+
|
159 |
+
cspres_layer = CSPResLayer(
|
160 |
+
in_channels=in_channels,
|
161 |
+
out_channels=out_channels,
|
162 |
+
num_block=num_blocks,
|
163 |
+
block_cfg=self.block_cfg,
|
164 |
+
stride=2,
|
165 |
+
norm_cfg=self.norm_cfg,
|
166 |
+
act_cfg=self.act_cfg,
|
167 |
+
attention_cfg=self.attention_cfg,
|
168 |
+
use_spp=False)
|
169 |
+
return [cspres_layer]
|
mmyolo/models/backbones/cspnext.py
ADDED
@@ -0,0 +1,187 @@
1 |
+
# Copyright (c) OpenMMLab. All rights reserved.
|
2 |
+
import math
|
3 |
+
from typing import List, Sequence, Union
|
4 |
+
|
5 |
+
import torch.nn as nn
|
6 |
+
from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule
|
7 |
+
from mmdet.models.backbones.csp_darknet import CSPLayer
|
8 |
+
from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
|
9 |
+
|
10 |
+
from mmyolo.registry import MODELS
|
11 |
+
from ..layers import SPPFBottleneck
|
12 |
+
from .base_backbone import BaseBackbone
|
13 |
+
|
14 |
+
|
15 |
+
@MODELS.register_module()
|
16 |
+
class CSPNeXt(BaseBackbone):
|
17 |
+
"""CSPNeXt backbone used in RTMDet.
|
18 |
+
|
19 |
+
Args:
|
20 |
+
arch (str): Architecture of CSPNeXt, from {P5, P6}.
|
21 |
+
Defaults to P5.
|
22 |
+
deepen_factor (float): Depth multiplier, multiply number of
|
23 |
+
blocks in CSP layer by this amount. Defaults to 1.0.
|
24 |
+
widen_factor (float): Width multiplier, multiply number of
|
25 |
+
channels in each layer by this amount. Defaults to 1.0.
|
26 |
+
out_indices (Sequence[int]): Output from which stages.
|
27 |
+
Defaults to (2, 3, 4).
|
28 |
+
frozen_stages (int): Stages to be frozen (stop grad and set eval
|
29 |
+
mode). -1 means not freezing any parameters. Defaults to -1.
|
30 |
+
plugins (list[dict]): List of plugins for stages, each dict contains:
|
31 |
+
- cfg (dict, required): Cfg dict to build plugin.Defaults to
|
32 |
+
- stages (tuple[bool], optional): Stages to apply plugin, length
|
33 |
+
should be same as 'num_stages'.
|
34 |
+
use_depthwise (bool): Whether to use depthwise separable convolution.
|
35 |
+
Defaults to False.
|
36 |
+
expand_ratio (float): Ratio to adjust the number of channels of the
|
37 |
+
hidden layer. Defaults to 0.5.
|
38 |
+
arch_ovewrite (list): Overwrite default arch settings.
|
39 |
+
Defaults to None.
|
40 |
+
channel_attention (bool): Whether to add channel attention in each
|
41 |
+
stage. Defaults to True.
|
42 |
+
conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for
|
43 |
+
convolution layer. Defaults to None.
|
44 |
+
norm_cfg (:obj:`ConfigDict` or dict): Dictionary to construct and
|
45 |
+
config norm layer. Defaults to dict(type='BN', requires_grad=True).
|
46 |
+
act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer.
|
47 |
+
Defaults to dict(type='SiLU', inplace=True).
|
48 |
+
norm_eval (bool): Whether to set norm layers to eval mode, namely,
|
49 |
+
freeze running stats (mean and var). Note: Effect on Batch Norm
|
50 |
+
and its variants only.
|
51 |
+
init_cfg (:obj:`ConfigDict` or dict or list[dict] or
|
52 |
+
list[:obj:`ConfigDict`]): Initialization config dict.
|
53 |
+
"""
|
54 |
+
# From left to right:
|
55 |
+
# in_channels, out_channels, num_blocks, add_identity, use_spp
|
56 |
+
arch_settings = {
|
57 |
+
'P5': [[64, 128, 3, True, False], [128, 256, 6, True, False],
|
58 |
+
[256, 512, 6, True, False], [512, 1024, 3, False, True]],
|
59 |
+
'P6': [[64, 128, 3, True, False], [128, 256, 6, True, False],
|
60 |
+
[256, 512, 6, True, False], [512, 768, 3, True, False],
|
61 |
+
[768, 1024, 3, False, True]]
|
62 |
+
}
|
63 |
+
|
64 |
+
def __init__(
|
65 |
+
self,
|
66 |
+
arch: str = 'P5',
|
67 |
+
deepen_factor: float = 1.0,
|
68 |
+
widen_factor: float = 1.0,
|
69 |
+
input_channels: int = 3,
|
70 |
+
out_indices: Sequence[int] = (2, 3, 4),
|
71 |
+
frozen_stages: int = -1,
|
72 |
+
plugins: Union[dict, List[dict]] = None,
|
73 |
+
use_depthwise: bool = False,
|
74 |
+
expand_ratio: float = 0.5,
|
75 |
+
arch_ovewrite: dict = None,
|
76 |
+
channel_attention: bool = True,
|
77 |
+
conv_cfg: OptConfigType = None,
|
78 |
+
norm_cfg: ConfigType = dict(type='BN'),
|
79 |
+
act_cfg: ConfigType = dict(type='SiLU', inplace=True),
|
80 |
+
norm_eval: bool = False,
|
81 |
+
init_cfg: OptMultiConfig = dict(
|
82 |
+
type='Kaiming',
|
83 |
+
layer='Conv2d',
|
84 |
+
a=math.sqrt(5),
|
85 |
+
distribution='uniform',
|
86 |
+
mode='fan_in',
|
87 |
+
nonlinearity='leaky_relu')
|
88 |
+
) -> None:
|
89 |
+
arch_setting = self.arch_settings[arch]
|
90 |
+
if arch_ovewrite:
|
91 |
+
arch_setting = arch_ovewrite
|
92 |
+
self.channel_attention = channel_attention
|
93 |
+
self.use_depthwise = use_depthwise
|
94 |
+
self.conv = DepthwiseSeparableConvModule \
|
95 |
+
if use_depthwise else ConvModule
|
96 |
+
self.expand_ratio = expand_ratio
|
97 |
+
self.conv_cfg = conv_cfg
|
98 |
+
|
99 |
+
super().__init__(
|
100 |
+
arch_setting,
|
101 |
+
deepen_factor,
|
102 |
+
widen_factor,
|
103 |
+
input_channels,
|
104 |
+
out_indices,
|
105 |
+
frozen_stages=frozen_stages,
|
106 |
+
plugins=plugins,
|
107 |
+
norm_cfg=norm_cfg,
|
108 |
+
act_cfg=act_cfg,
|
109 |
+
norm_eval=norm_eval,
|
110 |
+
init_cfg=init_cfg)
|
111 |
+
|
112 |
+
def build_stem_layer(self) -> nn.Module:
|
113 |
+
"""Build a stem layer."""
|
114 |
+
stem = nn.Sequential(
|
115 |
+
ConvModule(
|
116 |
+
3,
|
117 |
+
int(self.arch_setting[0][0] * self.widen_factor // 2),
|
118 |
+
3,
|
119 |
+
padding=1,
|
120 |
+
stride=2,
|
121 |
+
norm_cfg=self.norm_cfg,
|
122 |
+
act_cfg=self.act_cfg),
|
123 |
+
ConvModule(
|
124 |
+
int(self.arch_setting[0][0] * self.widen_factor // 2),
|
125 |
+
int(self.arch_setting[0][0] * self.widen_factor // 2),
|
126 |
+
3,
|
127 |
+
padding=1,
|
128 |
+
stride=1,
|
129 |
+
norm_cfg=self.norm_cfg,
|
130 |
+
act_cfg=self.act_cfg),
|
131 |
+
ConvModule(
|
132 |
+
int(self.arch_setting[0][0] * self.widen_factor // 2),
|
133 |
+
int(self.arch_setting[0][0] * self.widen_factor),
|
134 |
+
3,
|
135 |
+
padding=1,
|
136 |
+
stride=1,
|
137 |
+
norm_cfg=self.norm_cfg,
|
138 |
+
act_cfg=self.act_cfg))
|
139 |
+
return stem
|
140 |
+
|
141 |
+
def build_stage_layer(self, stage_idx: int, setting: list) -> list:
|
142 |
+
"""Build a stage layer.
|
143 |
+
|
144 |
+
Args:
|
145 |
+
stage_idx (int): The index of a stage layer.
|
146 |
+
setting (list): The architecture setting of a stage layer.
|
147 |
+
"""
|
148 |
+
in_channels, out_channels, num_blocks, add_identity, use_spp = setting
|
149 |
+
|
150 |
+
in_channels = int(in_channels * self.widen_factor)
|
151 |
+
out_channels = int(out_channels * self.widen_factor)
|
152 |
+
num_blocks = max(round(num_blocks * self.deepen_factor), 1)
|
153 |
+
|
154 |
+
stage = []
|
155 |
+
conv_layer = self.conv(
|
156 |
+
in_channels,
|
157 |
+
out_channels,
|
158 |
+
3,
|
159 |
+
stride=2,
|
160 |
+
padding=1,
|
161 |
+
conv_cfg=self.conv_cfg,
|
162 |
+
norm_cfg=self.norm_cfg,
|
163 |
+
act_cfg=self.act_cfg)
|
164 |
+
stage.append(conv_layer)
|
165 |
+
if use_spp:
|
166 |
+
spp = SPPFBottleneck(
|
167 |
+
out_channels,
|
168 |
+
out_channels,
|
169 |
+
kernel_sizes=5,
|
170 |
+
conv_cfg=self.conv_cfg,
|
171 |
+
norm_cfg=self.norm_cfg,
|
172 |
+
act_cfg=self.act_cfg)
|
173 |
+
stage.append(spp)
|
174 |
+
csp_layer = CSPLayer(
|
175 |
+
out_channels,
|
176 |
+
out_channels,
|
177 |
+
num_blocks=num_blocks,
|
178 |
+
add_identity=add_identity,
|
179 |
+
use_depthwise=self.use_depthwise,
|
180 |
+
use_cspnext_block=True,
|
181 |
+
expand_ratio=self.expand_ratio,
|
182 |
+
channel_attention=self.channel_attention,
|
183 |
+
conv_cfg=self.conv_cfg,
|
184 |
+
norm_cfg=self.norm_cfg,
|
185 |
+
act_cfg=self.act_cfg)
|
186 |
+
stage.append(csp_layer)
|
187 |
+
return stage
|
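As with the other backbones, the default stage table of CSPNeXt can be swapped out wholesale via `arch_ovewrite`. An illustrative sketch only; the three-stage table and scaling factors below are made-up values, not taken from this upload.

# Each row keeps the documented layout:
# [in_channels, out_channels, num_blocks, add_identity, use_spp].
tiny_arch = [
    [64, 128, 1, True, False],
    [128, 256, 1, True, False],
    [256, 512, 1, False, True],
]
backbone = CSPNeXt(
    arch_ovewrite=tiny_arch,
    deepen_factor=0.33,
    widen_factor=0.5,
    out_indices=(1, 2, 3))  # only 3 stages, so indices stay below 4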
mmyolo/models/backbones/efficient_rep.py
ADDED
@@ -0,0 +1,287 @@
1 |
+
# Copyright (c) OpenMMLab. All rights reserved.
|
2 |
+
|
3 |
+
from typing import List, Tuple, Union
|
4 |
+
|
5 |
+
import torch
|
6 |
+
import torch.nn as nn
|
7 |
+
from mmdet.utils import ConfigType, OptMultiConfig
|
8 |
+
|
9 |
+
from mmyolo.models.layers.yolo_bricks import SPPFBottleneck
|
10 |
+
from mmyolo.registry import MODELS
|
11 |
+
from ..layers import BepC3StageBlock, RepStageBlock
|
12 |
+
from ..utils import make_round
|
13 |
+
from .base_backbone import BaseBackbone
|
14 |
+
|
15 |
+
|
16 |
+
@MODELS.register_module()
|
17 |
+
class YOLOv6EfficientRep(BaseBackbone):
|
18 |
+
"""EfficientRep backbone used in YOLOv6.
|
19 |
+
Args:
|
20 |
+
arch (str): Architecture of BaseDarknet, from {P5, P6}.
|
21 |
+
Defaults to P5.
|
22 |
+
plugins (list[dict]): List of plugins for stages, each dict contains:
|
23 |
+
- cfg (dict, required): Cfg dict to build plugin.
|
24 |
+
- stages (tuple[bool], optional): Stages to apply plugin, length
|
25 |
+
should be same as 'num_stages'.
|
26 |
+
deepen_factor (float): Depth multiplier, multiply number of
|
27 |
+
blocks in CSP layer by this amount. Defaults to 1.0.
|
28 |
+
widen_factor (float): Width multiplier, multiply number of
|
29 |
+
channels in each layer by this amount. Defaults to 1.0.
|
30 |
+
input_channels (int): Number of input image channels. Defaults to 3.
|
31 |
+
out_indices (Tuple[int]): Output from which stages.
|
32 |
+
Defaults to (2, 3, 4).
|
33 |
+
frozen_stages (int): Stages to be frozen (stop grad and set eval
|
34 |
+
mode). -1 means not freezing any parameters. Defaults to -1.
|
35 |
+
norm_cfg (dict): Dictionary to construct and config norm layer.
|
36 |
+
Defaults to dict(type='BN', requires_grad=True).
|
37 |
+
act_cfg (dict): Config dict for activation layer.
|
38 |
+
Defaults to dict(type='LeakyReLU', negative_slope=0.1).
|
39 |
+
norm_eval (bool): Whether to set norm layers to eval mode, namely,
|
40 |
+
freeze running stats (mean and var). Note: Effect on Batch Norm
|
41 |
+
and its variants only. Defaults to False.
|
42 |
+
block_cfg (dict): Config dict for the block used to build each
|
43 |
+
layer. Defaults to dict(type='RepVGGBlock').
|
44 |
+
init_cfg (Union[dict, list[dict]], optional): Initialization config
|
45 |
+
dict. Defaults to None.
|
46 |
+
Example:
|
47 |
+
>>> from mmyolo.models import YOLOv6EfficientRep
|
48 |
+
>>> import torch
|
49 |
+
>>> model = YOLOv6EfficientRep()
|
50 |
+
>>> model.eval()
|
51 |
+
>>> inputs = torch.rand(1, 3, 416, 416)
|
52 |
+
>>> level_outputs = model(inputs)
|
53 |
+
>>> for level_out in level_outputs:
|
54 |
+
... print(tuple(level_out.shape))
|
55 |
+
...
|
56 |
+
(1, 256, 52, 52)
|
57 |
+
(1, 512, 26, 26)
|
58 |
+
(1, 1024, 13, 13)
|
59 |
+
"""
|
60 |
+
# From left to right:
|
61 |
+
# in_channels, out_channels, num_blocks, use_spp
|
62 |
+
arch_settings = {
|
63 |
+
'P5': [[64, 128, 6, False], [128, 256, 12, False],
|
64 |
+
[256, 512, 18, False], [512, 1024, 6, True]]
|
65 |
+
}
|
66 |
+
|
67 |
+
def __init__(self,
|
68 |
+
arch: str = 'P5',
|
69 |
+
plugins: Union[dict, List[dict]] = None,
|
70 |
+
deepen_factor: float = 1.0,
|
71 |
+
widen_factor: float = 1.0,
|
72 |
+
input_channels: int = 3,
|
73 |
+
out_indices: Tuple[int] = (2, 3, 4),
|
74 |
+
frozen_stages: int = -1,
|
75 |
+
norm_cfg: ConfigType = dict(
|
76 |
+
type='BN', momentum=0.03, eps=0.001),
|
77 |
+
act_cfg: ConfigType = dict(type='ReLU', inplace=True),
|
78 |
+
norm_eval: bool = False,
|
79 |
+
block_cfg: ConfigType = dict(type='RepVGGBlock'),
|
80 |
+
init_cfg: OptMultiConfig = None):
|
81 |
+
self.block_cfg = block_cfg
|
82 |
+
super().__init__(
|
83 |
+
self.arch_settings[arch],
|
84 |
+
deepen_factor,
|
85 |
+
widen_factor,
|
86 |
+
input_channels=input_channels,
|
87 |
+
out_indices=out_indices,
|
88 |
+
plugins=plugins,
|
89 |
+
frozen_stages=frozen_stages,
|
90 |
+
norm_cfg=norm_cfg,
|
91 |
+
act_cfg=act_cfg,
|
92 |
+
norm_eval=norm_eval,
|
93 |
+
init_cfg=init_cfg)
|
94 |
+
|
95 |
+
def build_stem_layer(self) -> nn.Module:
|
96 |
+
"""Build a stem layer."""
|
97 |
+
|
98 |
+
block_cfg = self.block_cfg.copy()
|
99 |
+
block_cfg.update(
|
100 |
+
dict(
|
101 |
+
in_channels=self.input_channels,
|
102 |
+
out_channels=int(self.arch_setting[0][0] * self.widen_factor),
|
103 |
+
kernel_size=3,
|
104 |
+
stride=2,
|
105 |
+
))
|
106 |
+
return MODELS.build(block_cfg)
|
107 |
+
|
108 |
+
def build_stage_layer(self, stage_idx: int, setting: list) -> list:
|
109 |
+
"""Build a stage layer.
|
110 |
+
|
111 |
+
Args:
|
112 |
+
stage_idx (int): The index of a stage layer.
|
113 |
+
setting (list): The architecture setting of a stage layer.
|
114 |
+
"""
|
115 |
+
in_channels, out_channels, num_blocks, use_spp = setting
|
116 |
+
|
117 |
+
in_channels = int(in_channels * self.widen_factor)
|
118 |
+
out_channels = int(out_channels * self.widen_factor)
|
119 |
+
num_blocks = make_round(num_blocks, self.deepen_factor)
|
120 |
+
|
121 |
+
rep_stage_block = RepStageBlock(
|
122 |
+
in_channels=out_channels,
|
123 |
+
out_channels=out_channels,
|
124 |
+
num_blocks=num_blocks,
|
125 |
+
block_cfg=self.block_cfg,
|
126 |
+
)
|
127 |
+
|
128 |
+
block_cfg = self.block_cfg.copy()
|
129 |
+
block_cfg.update(
|
130 |
+
dict(
|
131 |
+
in_channels=in_channels,
|
132 |
+
out_channels=out_channels,
|
133 |
+
kernel_size=3,
|
134 |
+
stride=2))
|
135 |
+
stage = []
|
136 |
+
|
137 |
+
ef_block = nn.Sequential(MODELS.build(block_cfg), rep_stage_block)
|
138 |
+
|
139 |
+
stage.append(ef_block)
|
140 |
+
|
141 |
+
if use_spp:
|
142 |
+
spp = SPPFBottleneck(
|
143 |
+
in_channels=out_channels,
|
144 |
+
out_channels=out_channels,
|
145 |
+
kernel_sizes=5,
|
146 |
+
norm_cfg=self.norm_cfg,
|
147 |
+
act_cfg=self.act_cfg)
|
148 |
+
stage.append(spp)
|
149 |
+
return stage
|
150 |
+
|
151 |
+
def init_weights(self):
|
152 |
+
if self.init_cfg is None:
|
153 |
+
"""Initialize the parameters."""
|
154 |
+
for m in self.modules():
|
155 |
+
if isinstance(m, torch.nn.Conv2d):
|
156 |
+
# In order to be consistent with the source code,
|
157 |
+
# reset the Conv2d initialization parameters
|
158 |
+
m.reset_parameters()
|
159 |
+
else:
|
160 |
+
super().init_weights()
|
161 |
+
|
162 |
+
|
163 |
+
@MODELS.register_module()
|
164 |
+
class YOLOv6CSPBep(YOLOv6EfficientRep):
|
165 |
+
"""CSPBep backbone used in YOLOv6.
|
166 |
+
Args:
|
167 |
+
arch (str): Architecture of BaseDarknet, from {P5, P6}.
|
168 |
+
Defaults to P5.
|
169 |
+
plugins (list[dict]): List of plugins for stages, each dict contains:
|
170 |
+
- cfg (dict, required): Cfg dict to build plugin.
|
171 |
+
- stages (tuple[bool], optional): Stages to apply plugin, length
|
172 |
+
should be same as 'num_stages'.
|
173 |
+
deepen_factor (float): Depth multiplier, multiply number of
|
174 |
+
blocks in CSP layer by this amount. Defaults to 1.0.
|
175 |
+
widen_factor (float): Width multiplier, multiply number of
|
176 |
+
channels in each layer by this amount. Defaults to 1.0.
|
177 |
+
input_channels (int): Number of input image channels. Defaults to 3.
|
178 |
+
out_indices (Tuple[int]): Output from which stages.
|
179 |
+
Defaults to (2, 3, 4).
|
180 |
+
frozen_stages (int): Stages to be frozen (stop grad and set eval
|
181 |
+
mode). -1 means not freezing any parameters. Defaults to -1.
|
182 |
+
norm_cfg (dict): Dictionary to construct and config norm layer.
|
183 |
+
Defaults to dict(type='BN', requires_grad=True).
|
184 |
+
act_cfg (dict): Config dict for activation layer.
|
185 |
+
Defaults to dict(type='LeakyReLU', negative_slope=0.1).
|
186 |
+
norm_eval (bool): Whether to set norm layers to eval mode, namely,
|
187 |
+
freeze running stats (mean and var). Note: Effect on Batch Norm
|
188 |
+
and its variants only. Defaults to False.
|
189 |
+
block_cfg (dict): Config dict for the block used to build each
|
190 |
+
layer. Defaults to dict(type='RepVGGBlock').
|
191 |
+
block_act_cfg (dict): Config dict for activation layer used in each
|
192 |
+
stage. Defaults to dict(type='SiLU', inplace=True).
|
193 |
+
init_cfg (Union[dict, list[dict]], optional): Initialization config
|
194 |
+
dict. Defaults to None.
|
195 |
+
Example:
|
196 |
+
>>> from mmyolo.models import YOLOv6CSPBep
|
197 |
+
>>> import torch
|
198 |
+
>>> model = YOLOv6CSPBep()
|
199 |
+
>>> model.eval()
|
200 |
+
>>> inputs = torch.rand(1, 3, 416, 416)
|
201 |
+
>>> level_outputs = model(inputs)
|
202 |
+
>>> for level_out in level_outputs:
|
203 |
+
... print(tuple(level_out.shape))
|
204 |
+
...
|
205 |
+
(1, 256, 52, 52)
|
206 |
+
(1, 512, 26, 26)
|
207 |
+
(1, 1024, 13, 13)
|
208 |
+
"""
|
209 |
+
# From left to right:
|
210 |
+
# in_channels, out_channels, num_blocks, use_spp
|
211 |
+
arch_settings = {
|
212 |
+
'P5': [[64, 128, 6, False], [128, 256, 12, False],
|
213 |
+
[256, 512, 18, False], [512, 1024, 6, True]]
|
214 |
+
}
|
215 |
+
|
216 |
+
def __init__(self,
|
217 |
+
arch: str = 'P5',
|
218 |
+
plugins: Union[dict, List[dict]] = None,
|
219 |
+
deepen_factor: float = 1.0,
|
220 |
+
widen_factor: float = 1.0,
|
221 |
+
input_channels: int = 3,
|
222 |
+
hidden_ratio: float = 0.5,
|
223 |
+
out_indices: Tuple[int] = (2, 3, 4),
|
224 |
+
frozen_stages: int = -1,
|
225 |
+
norm_cfg: ConfigType = dict(
|
226 |
+
type='BN', momentum=0.03, eps=0.001),
|
227 |
+
act_cfg: ConfigType = dict(type='SiLU', inplace=True),
|
228 |
+
norm_eval: bool = False,
|
229 |
+
block_cfg: ConfigType = dict(type='ConvWrapper'),
|
230 |
+
init_cfg: OptMultiConfig = None):
|
231 |
+
self.hidden_ratio = hidden_ratio
|
232 |
+
super().__init__(
|
233 |
+
arch=arch,
|
234 |
+
deepen_factor=deepen_factor,
|
235 |
+
widen_factor=widen_factor,
|
236 |
+
input_channels=input_channels,
|
237 |
+
out_indices=out_indices,
|
238 |
+
plugins=plugins,
|
239 |
+
frozen_stages=frozen_stages,
|
240 |
+
norm_cfg=norm_cfg,
|
241 |
+
act_cfg=act_cfg,
|
242 |
+
norm_eval=norm_eval,
|
243 |
+
block_cfg=block_cfg,
|
244 |
+
init_cfg=init_cfg)
|
245 |
+
|
246 |
+
def build_stage_layer(self, stage_idx: int, setting: list) -> list:
|
247 |
+
"""Build a stage layer.
|
248 |
+
|
249 |
+
Args:
|
250 |
+
stage_idx (int): The index of a stage layer.
|
251 |
+
setting (list): The architecture setting of a stage layer.
|
252 |
+
"""
|
253 |
+
in_channels, out_channels, num_blocks, use_spp = setting
|
254 |
+
in_channels = int(in_channels * self.widen_factor)
|
255 |
+
out_channels = int(out_channels * self.widen_factor)
|
256 |
+
num_blocks = make_round(num_blocks, self.deepen_factor)
|
257 |
+
|
258 |
+
rep_stage_block = BepC3StageBlock(
|
259 |
+
in_channels=out_channels,
|
260 |
+
out_channels=out_channels,
|
261 |
+
num_blocks=num_blocks,
|
262 |
+
hidden_ratio=self.hidden_ratio,
|
263 |
+
block_cfg=self.block_cfg,
|
264 |
+
norm_cfg=self.norm_cfg,
|
265 |
+
act_cfg=self.act_cfg)
|
266 |
+
block_cfg = self.block_cfg.copy()
|
267 |
+
block_cfg.update(
|
268 |
+
dict(
|
269 |
+
in_channels=in_channels,
|
270 |
+
out_channels=out_channels,
|
271 |
+
kernel_size=3,
|
272 |
+
stride=2))
|
273 |
+
stage = []
|
274 |
+
|
275 |
+
ef_block = nn.Sequential(MODELS.build(block_cfg), rep_stage_block)
|
276 |
+
|
277 |
+
stage.append(ef_block)
|
278 |
+
|
279 |
+
if use_spp:
|
280 |
+
spp = SPPFBottleneck(
|
281 |
+
in_channels=out_channels,
|
282 |
+
out_channels=out_channels,
|
283 |
+
kernel_sizes=5,
|
284 |
+
norm_cfg=self.norm_cfg,
|
285 |
+
act_cfg=self.act_cfg)
|
286 |
+
stage.append(spp)
|
287 |
+
return stage
|
mmyolo/models/backbones/yolov7_backbone.py
ADDED
@@ -0,0 +1,285 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) OpenMMLab. All rights reserved.
|
2 |
+
from typing import List, Optional, Tuple, Union
|
3 |
+
|
4 |
+
import torch.nn as nn
|
5 |
+
from mmcv.cnn import ConvModule
|
6 |
+
from mmdet.models.backbones.csp_darknet import Focus
|
7 |
+
from mmdet.utils import ConfigType, OptMultiConfig
|
8 |
+
|
9 |
+
from mmyolo.registry import MODELS
|
10 |
+
from ..layers import MaxPoolAndStrideConvBlock
|
11 |
+
from .base_backbone import BaseBackbone
|
12 |
+
|
13 |
+
|
14 |
+
@MODELS.register_module()
|
15 |
+
class YOLOv7Backbone(BaseBackbone):
|
16 |
+
"""Backbone used in YOLOv7.
|
17 |
+
|
18 |
+
Args:
|
19 |
+
arch (str): Architecture of YOLOv7Defaults to L.
|
20 |
+
deepen_factor (float): Depth multiplier, multiply number of
|
21 |
+
blocks in CSP layer by this amount. Defaults to 1.0.
|
22 |
+
widen_factor (float): Width multiplier, multiply number of
|
23 |
+
channels in each layer by this amount. Defaults to 1.0.
|
24 |
+
out_indices (Sequence[int]): Output from which stages.
|
25 |
+
Defaults to (2, 3, 4).
|
26 |
+
frozen_stages (int): Stages to be frozen (stop grad and set eval
|
27 |
+
mode). -1 means not freezing any parameters. Defaults to -1.
|
28 |
+
plugins (list[dict]): List of plugins for stages, each dict contains:
|
29 |
+
|
30 |
+
- cfg (dict, required): Cfg dict to build plugin.
|
31 |
+
- stages (tuple[bool], optional): Stages to apply plugin, length
|
32 |
+
should be same as 'num_stages'.
|
33 |
+
norm_cfg (:obj:`ConfigDict` or dict): Dictionary to construct and
|
34 |
+
config norm layer. Defaults to dict(type='BN', requires_grad=True).
|
35 |
+
act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer.
|
36 |
+
Defaults to dict(type='SiLU', inplace=True).
|
37 |
+
norm_eval (bool): Whether to set norm layers to eval mode, namely,
|
38 |
+
freeze running stats (mean and var). Note: Effect on Batch Norm
|
39 |
+
and its variants only.
|
40 |
+
init_cfg (:obj:`ConfigDict` or dict or list[dict] or
|
41 |
+
list[:obj:`ConfigDict`]): Initialization config dict.
|
42 |
+
"""
|
43 |
+
_tiny_stage1_cfg = dict(type='TinyDownSampleBlock', middle_ratio=0.5)
|
44 |
+
_tiny_stage2_4_cfg = dict(type='TinyDownSampleBlock', middle_ratio=1.0)
|
45 |
+
_l_expand_channel_2x = dict(
|
46 |
+
type='ELANBlock',
|
47 |
+
middle_ratio=0.5,
|
48 |
+
block_ratio=0.5,
|
49 |
+
num_blocks=2,
|
50 |
+
num_convs_in_block=2)
|
51 |
+
_l_no_change_channel = dict(
|
52 |
+
type='ELANBlock',
|
53 |
+
middle_ratio=0.25,
|
54 |
+
block_ratio=0.25,
|
55 |
+
num_blocks=2,
|
56 |
+
num_convs_in_block=2)
|
57 |
+
_x_expand_channel_2x = dict(
|
58 |
+
type='ELANBlock',
|
59 |
+
middle_ratio=0.4,
|
60 |
+
block_ratio=0.4,
|
61 |
+
num_blocks=3,
|
62 |
+
num_convs_in_block=2)
|
63 |
+
_x_no_change_channel = dict(
|
64 |
+
type='ELANBlock',
|
65 |
+
middle_ratio=0.2,
|
66 |
+
block_ratio=0.2,
|
67 |
+
num_blocks=3,
|
68 |
+
num_convs_in_block=2)
|
69 |
+
_w_no_change_channel = dict(
|
70 |
+
type='ELANBlock',
|
71 |
+
middle_ratio=0.5,
|
72 |
+
block_ratio=0.5,
|
73 |
+
num_blocks=2,
|
74 |
+
num_convs_in_block=2)
|
75 |
+
_e_no_change_channel = dict(
|
76 |
+
type='ELANBlock',
|
77 |
+
middle_ratio=0.4,
|
78 |
+
block_ratio=0.4,
|
79 |
+
num_blocks=3,
|
80 |
+
num_convs_in_block=2)
|
81 |
+
_d_no_change_channel = dict(
|
82 |
+
type='ELANBlock',
|
83 |
+
middle_ratio=1 / 3,
|
84 |
+
block_ratio=1 / 3,
|
85 |
+
num_blocks=4,
|
86 |
+
num_convs_in_block=2)
|
87 |
+
_e2e_no_change_channel = dict(
|
88 |
+
type='EELANBlock',
|
89 |
+
num_elan_block=2,
|
90 |
+
middle_ratio=0.4,
|
91 |
+
block_ratio=0.4,
|
92 |
+
num_blocks=3,
|
93 |
+
num_convs_in_block=2)
|
94 |
+
|
95 |
+
# From left to right:
|
96 |
+
# in_channels, out_channels, Block_params
|
97 |
+
arch_settings = {
|
98 |
+
'Tiny': [[64, 64, _tiny_stage1_cfg], [64, 128, _tiny_stage2_4_cfg],
|
99 |
+
[128, 256, _tiny_stage2_4_cfg],
|
100 |
+
[256, 512, _tiny_stage2_4_cfg]],
|
101 |
+
'L': [[64, 256, _l_expand_channel_2x],
|
102 |
+
[256, 512, _l_expand_channel_2x],
|
103 |
+
[512, 1024, _l_expand_channel_2x],
|
104 |
+
[1024, 1024, _l_no_change_channel]],
|
105 |
+
'X': [[80, 320, _x_expand_channel_2x],
|
106 |
+
[320, 640, _x_expand_channel_2x],
|
107 |
+
[640, 1280, _x_expand_channel_2x],
|
108 |
+
[1280, 1280, _x_no_change_channel]],
|
109 |
+
'W':
|
110 |
+
[[64, 128, _w_no_change_channel], [128, 256, _w_no_change_channel],
|
111 |
+
[256, 512, _w_no_change_channel], [512, 768, _w_no_change_channel],
|
112 |
+
[768, 1024, _w_no_change_channel]],
|
113 |
+
'E':
|
114 |
+
[[80, 160, _e_no_change_channel], [160, 320, _e_no_change_channel],
|
115 |
+
[320, 640, _e_no_change_channel], [640, 960, _e_no_change_channel],
|
116 |
+
[960, 1280, _e_no_change_channel]],
|
117 |
+
'D': [[96, 192,
|
118 |
+
_d_no_change_channel], [192, 384, _d_no_change_channel],
|
119 |
+
[384, 768, _d_no_change_channel],
|
120 |
+
[768, 1152, _d_no_change_channel],
|
121 |
+
[1152, 1536, _d_no_change_channel]],
|
122 |
+
'E2E': [[80, 160, _e2e_no_change_channel],
|
123 |
+
[160, 320, _e2e_no_change_channel],
|
124 |
+
[320, 640, _e2e_no_change_channel],
|
125 |
+
[640, 960, _e2e_no_change_channel],
|
126 |
+
[960, 1280, _e2e_no_change_channel]],
|
127 |
+
}
|
128 |
+
|
129 |
+
def __init__(self,
|
130 |
+
arch: str = 'L',
|
131 |
+
deepen_factor: float = 1.0,
|
132 |
+
widen_factor: float = 1.0,
|
133 |
+
input_channels: int = 3,
|
134 |
+
out_indices: Tuple[int] = (2, 3, 4),
|
135 |
+
frozen_stages: int = -1,
|
136 |
+
plugins: Union[dict, List[dict]] = None,
|
137 |
+
norm_cfg: ConfigType = dict(
|
138 |
+
type='BN', momentum=0.03, eps=0.001),
|
139 |
+
act_cfg: ConfigType = dict(type='SiLU', inplace=True),
|
140 |
+
norm_eval: bool = False,
|
141 |
+
init_cfg: OptMultiConfig = None):
|
142 |
+
assert arch in self.arch_settings.keys()
|
143 |
+
self.arch = arch
|
144 |
+
super().__init__(
|
145 |
+
self.arch_settings[arch],
|
146 |
+
deepen_factor,
|
147 |
+
widen_factor,
|
148 |
+
input_channels=input_channels,
|
149 |
+
out_indices=out_indices,
|
150 |
+
plugins=plugins,
|
151 |
+
frozen_stages=frozen_stages,
|
152 |
+
norm_cfg=norm_cfg,
|
153 |
+
act_cfg=act_cfg,
|
154 |
+
norm_eval=norm_eval,
|
155 |
+
init_cfg=init_cfg)
|
156 |
+
|
157 |
+
def build_stem_layer(self) -> nn.Module:
|
158 |
+
"""Build a stem layer."""
|
159 |
+
if self.arch in ['L', 'X']:
|
160 |
+
stem = nn.Sequential(
|
161 |
+
ConvModule(
|
162 |
+
3,
|
163 |
+
int(self.arch_setting[0][0] * self.widen_factor // 2),
|
164 |
+
3,
|
165 |
+
padding=1,
|
166 |
+
stride=1,
|
167 |
+
norm_cfg=self.norm_cfg,
|
168 |
+
act_cfg=self.act_cfg),
|
169 |
+
ConvModule(
|
170 |
+
int(self.arch_setting[0][0] * self.widen_factor // 2),
|
171 |
+
int(self.arch_setting[0][0] * self.widen_factor),
|
172 |
+
3,
|
173 |
+
padding=1,
|
174 |
+
stride=2,
|
175 |
+
norm_cfg=self.norm_cfg,
|
176 |
+
act_cfg=self.act_cfg),
|
177 |
+
ConvModule(
|
178 |
+
int(self.arch_setting[0][0] * self.widen_factor),
|
179 |
+
int(self.arch_setting[0][0] * self.widen_factor),
|
180 |
+
3,
|
181 |
+
padding=1,
|
182 |
+
stride=1,
|
183 |
+
norm_cfg=self.norm_cfg,
|
184 |
+
act_cfg=self.act_cfg))
|
185 |
+
elif self.arch == 'Tiny':
|
186 |
+
stem = nn.Sequential(
|
187 |
+
ConvModule(
|
188 |
+
3,
|
189 |
+
int(self.arch_setting[0][0] * self.widen_factor // 2),
|
190 |
+
3,
|
191 |
+
padding=1,
|
192 |
+
stride=2,
|
193 |
+
norm_cfg=self.norm_cfg,
|
194 |
+
act_cfg=self.act_cfg),
|
195 |
+
ConvModule(
|
196 |
+
int(self.arch_setting[0][0] * self.widen_factor // 2),
|
197 |
+
int(self.arch_setting[0][0] * self.widen_factor),
|
198 |
+
3,
|
199 |
+
padding=1,
|
200 |
+
stride=2,
|
201 |
+
norm_cfg=self.norm_cfg,
|
202 |
+
act_cfg=self.act_cfg))
|
203 |
+
elif self.arch in ['W', 'E', 'D', 'E2E']:
|
204 |
+
stem = Focus(
|
205 |
+
3,
|
206 |
+
int(self.arch_setting[0][0] * self.widen_factor),
|
207 |
+
kernel_size=3,
|
208 |
+
norm_cfg=self.norm_cfg,
|
209 |
+
act_cfg=self.act_cfg)
|
210 |
+
return stem
|
211 |
+
|
212 |
+
def build_stage_layer(self, stage_idx: int, setting: list) -> list:
|
213 |
+
"""Build a stage layer.
|
214 |
+
|
215 |
+
Args:
|
216 |
+
stage_idx (int): The index of a stage layer.
|
217 |
+
setting (list): The architecture setting of a stage layer.
|
218 |
+
"""
|
219 |
+
in_channels, out_channels, stage_block_cfg = setting
|
220 |
+
in_channels = int(in_channels * self.widen_factor)
|
221 |
+
out_channels = int(out_channels * self.widen_factor)
|
222 |
+
|
223 |
+
stage_block_cfg = stage_block_cfg.copy()
|
224 |
+
stage_block_cfg.setdefault('norm_cfg', self.norm_cfg)
|
225 |
+
stage_block_cfg.setdefault('act_cfg', self.act_cfg)
|
226 |
+
|
227 |
+
stage_block_cfg['in_channels'] = in_channels
|
228 |
+
stage_block_cfg['out_channels'] = out_channels
|
229 |
+
|
230 |
+
stage = []
|
231 |
+
if self.arch in ['W', 'E', 'D', 'E2E']:
|
232 |
+
stage_block_cfg['in_channels'] = out_channels
|
233 |
+
elif self.arch in ['L', 'X']:
|
234 |
+
if stage_idx == 0:
|
235 |
+
stage_block_cfg['in_channels'] = out_channels // 2
|
236 |
+
|
237 |
+
downsample_layer = self._build_downsample_layer(
|
238 |
+
stage_idx, in_channels, out_channels)
|
239 |
+
stage.append(MODELS.build(stage_block_cfg))
|
240 |
+
if downsample_layer is not None:
|
241 |
+
stage.insert(0, downsample_layer)
|
242 |
+
return stage
|
243 |
+
|
244 |
+
def _build_downsample_layer(self, stage_idx: int, in_channels: int,
|
245 |
+
out_channels: int) -> Optional[nn.Module]:
|
246 |
+
"""Build a downsample layer pre stage."""
|
247 |
+
if self.arch in ['E', 'D', 'E2E']:
|
248 |
+
downsample_layer = MaxPoolAndStrideConvBlock(
|
249 |
+
in_channels,
|
250 |
+
out_channels,
|
251 |
+
use_in_channels_of_middle=True,
|
252 |
+
norm_cfg=self.norm_cfg,
|
253 |
+
act_cfg=self.act_cfg)
|
254 |
+
elif self.arch == 'W':
|
255 |
+
downsample_layer = ConvModule(
|
256 |
+
in_channels,
|
257 |
+
out_channels,
|
258 |
+
3,
|
259 |
+
stride=2,
|
260 |
+
padding=1,
|
261 |
+
norm_cfg=self.norm_cfg,
|
262 |
+
act_cfg=self.act_cfg)
|
263 |
+
elif self.arch == 'Tiny':
|
264 |
+
if stage_idx != 0:
|
265 |
+
downsample_layer = nn.MaxPool2d(2, 2)
|
266 |
+
else:
|
267 |
+
downsample_layer = None
|
268 |
+
elif self.arch in ['L', 'X']:
|
269 |
+
if stage_idx == 0:
|
270 |
+
downsample_layer = ConvModule(
|
271 |
+
in_channels,
|
272 |
+
out_channels // 2,
|
273 |
+
3,
|
274 |
+
stride=2,
|
275 |
+
padding=1,
|
276 |
+
norm_cfg=self.norm_cfg,
|
277 |
+
act_cfg=self.act_cfg)
|
278 |
+
else:
|
279 |
+
downsample_layer = MaxPoolAndStrideConvBlock(
|
280 |
+
in_channels,
|
281 |
+
in_channels,
|
282 |
+
use_in_channels_of_middle=False,
|
283 |
+
norm_cfg=self.norm_cfg,
|
284 |
+
act_cfg=self.act_cfg)
|
285 |
+
return downsample_layer
|
mmyolo/models/data_preprocessors/__init__.py
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) OpenMMLab. All rights reserved.
|
2 |
+
from .data_preprocessor import (PPYOLOEBatchRandomResize,
|
3 |
+
PPYOLOEDetDataPreprocessor,
|
4 |
+
YOLOv5DetDataPreprocessor,
|
5 |
+
YOLOXBatchSyncRandomResize)
|
6 |
+
|
7 |
+
__all__ = [
|
8 |
+
'YOLOv5DetDataPreprocessor', 'PPYOLOEDetDataPreprocessor',
|
9 |
+
'PPYOLOEBatchRandomResize', 'YOLOXBatchSyncRandomResize'
|
10 |
+
]
|
mmyolo/models/data_preprocessors/data_preprocessor.py
ADDED
@@ -0,0 +1,302 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) OpenMMLab. All rights reserved.
|
2 |
+
import random
|
3 |
+
from typing import List, Optional, Tuple, Union
|
4 |
+
|
5 |
+
import torch
|
6 |
+
import torch.nn.functional as F
|
7 |
+
from mmdet.models import BatchSyncRandomResize
|
8 |
+
from mmdet.models.data_preprocessors import DetDataPreprocessor
|
9 |
+
from mmengine import MessageHub, is_list_of
|
10 |
+
from mmengine.structures import BaseDataElement
|
11 |
+
from torch import Tensor
|
12 |
+
|
13 |
+
from mmyolo.registry import MODELS
|
14 |
+
|
15 |
+
CastData = Union[tuple, dict, BaseDataElement, torch.Tensor, list, bytes, str,
|
16 |
+
None]
|
17 |
+
|
18 |
+
|
19 |
+
@MODELS.register_module()
|
20 |
+
class YOLOXBatchSyncRandomResize(BatchSyncRandomResize):
|
21 |
+
"""YOLOX batch random resize.
|
22 |
+
|
23 |
+
Args:
|
24 |
+
random_size_range (tuple): The multi-scale random range during
|
25 |
+
multi-scale training.
|
26 |
+
interval (int): The iter interval of change
|
27 |
+
image size. Defaults to 10.
|
28 |
+
size_divisor (int): Image size divisible factor.
|
29 |
+
Defaults to 32.
|
30 |
+
"""
|
31 |
+
|
32 |
+
def forward(self, inputs: Tensor, data_samples: dict) -> Tensor and dict:
|
33 |
+
"""resize a batch of images and bboxes to shape ``self._input_size``"""
|
34 |
+
h, w = inputs.shape[-2:]
|
35 |
+
inputs = inputs.float()
|
36 |
+
assert isinstance(data_samples, dict)
|
37 |
+
|
38 |
+
if self._input_size is None:
|
39 |
+
self._input_size = (h, w)
|
40 |
+
scale_y = self._input_size[0] / h
|
41 |
+
scale_x = self._input_size[1] / w
|
42 |
+
if scale_x != 1 or scale_y != 1:
|
43 |
+
inputs = F.interpolate(
|
44 |
+
inputs,
|
45 |
+
size=self._input_size,
|
46 |
+
mode='bilinear',
|
47 |
+
align_corners=False)
|
48 |
+
|
49 |
+
data_samples['bboxes_labels'][:, 2::2] *= scale_x
|
50 |
+
data_samples['bboxes_labels'][:, 3::2] *= scale_y
|
51 |
+
|
52 |
+
message_hub = MessageHub.get_current_instance()
|
53 |
+
if (message_hub.get_info('iter') + 1) % self._interval == 0:
|
54 |
+
self._input_size = self._get_random_size(
|
55 |
+
aspect_ratio=float(w / h), device=inputs.device)
|
56 |
+
|
57 |
+
return inputs, data_samples
|
58 |
+
|
59 |
+
|
60 |
+
@MODELS.register_module()
|
61 |
+
class YOLOv5DetDataPreprocessor(DetDataPreprocessor):
|
62 |
+
"""Rewrite collate_fn to get faster training speed.
|
63 |
+
|
64 |
+
Note: It must be used together with `mmyolo.datasets.utils.yolov5_collate`
|
65 |
+
"""
|
66 |
+
|
67 |
+
def __init__(self, *args, non_blocking: Optional[bool] = True, **kwargs):
|
68 |
+
super().__init__(*args, non_blocking=non_blocking, **kwargs)
|
69 |
+
|
70 |
+
def forward(self, data: dict, training: bool = False) -> dict:
|
71 |
+
"""Perform normalization, padding and bgr2rgb conversion based on
|
72 |
+
``DetDataPreprocessorr``.
|
73 |
+
|
74 |
+
Args:
|
75 |
+
data (dict): Data sampled from dataloader.
|
76 |
+
training (bool): Whether to enable training time augmentation.
|
77 |
+
|
78 |
+
Returns:
|
79 |
+
dict: Data in the same format as the model input.
|
80 |
+
"""
|
81 |
+
if not training:
|
82 |
+
return super().forward(data, training)
|
83 |
+
|
84 |
+
data = self.cast_data(data)
|
85 |
+
inputs, data_samples = data['inputs'], data['data_samples']
|
86 |
+
assert isinstance(data['data_samples'], dict)
|
87 |
+
|
88 |
+
# TODO: Supports multi-scale training
|
89 |
+
if self._channel_conversion and inputs.shape[1] == 3:
|
90 |
+
inputs = inputs[:, [2, 1, 0], ...]
|
91 |
+
if self._enable_normalize:
|
92 |
+
inputs = (inputs - self.mean) / self.std
|
93 |
+
|
94 |
+
if self.batch_augments is not None:
|
95 |
+
for batch_aug in self.batch_augments:
|
96 |
+
inputs, data_samples = batch_aug(inputs, data_samples)
|
97 |
+
|
98 |
+
img_metas = [{'batch_input_shape': inputs.shape[2:]}] * len(inputs)
|
99 |
+
data_samples_output = {
|
100 |
+
'bboxes_labels': data_samples['bboxes_labels'],
|
101 |
+
'img_metas': img_metas
|
102 |
+
}
|
103 |
+
if 'masks' in data_samples:
|
104 |
+
data_samples_output['masks'] = data_samples['masks']
|
105 |
+
|
106 |
+
return {'inputs': inputs, 'data_samples': data_samples_output}
|
107 |
+
|
108 |
+
|
109 |
+
@MODELS.register_module()
|
110 |
+
class PPYOLOEDetDataPreprocessor(DetDataPreprocessor):
|
111 |
+
"""Image pre-processor for detection tasks.
|
112 |
+
|
113 |
+
The main difference between PPYOLOEDetDataPreprocessor and
|
114 |
+
DetDataPreprocessor is the normalization order. The official
|
115 |
+
PPYOLOE resize image first, and then normalize image.
|
116 |
+
In DetDataPreprocessor, the order is reversed.
|
117 |
+
|
118 |
+
Note: It must be used together with
|
119 |
+
`mmyolo.datasets.utils.yolov5_collate`
|
120 |
+
"""
|
121 |
+
|
122 |
+
def forward(self, data: dict, training: bool = False) -> dict:
|
123 |
+
"""Perform normalization、padding and bgr2rgb conversion based on
|
124 |
+
``BaseDataPreprocessor``. This class use batch_augments first, and then
|
125 |
+
normalize the image, which is different from the `DetDataPreprocessor`
|
126 |
+
.
|
127 |
+
|
128 |
+
Args:
|
129 |
+
data (dict): Data sampled from dataloader.
|
130 |
+
training (bool): Whether to enable training time augmentation.
|
131 |
+
|
132 |
+
Returns:
|
133 |
+
dict: Data in the same format as the model input.
|
134 |
+
"""
|
135 |
+
if not training:
|
136 |
+
return super().forward(data, training)
|
137 |
+
|
138 |
+
assert isinstance(data['inputs'], list) and is_list_of(
|
139 |
+
data['inputs'], torch.Tensor), \
|
140 |
+
'"inputs" should be a list of Tensor, but got ' \
|
141 |
+
f'{type(data["inputs"])}. The possible reason for this ' \
|
142 |
+
'is that you are not using it with ' \
|
143 |
+
'"mmyolo.datasets.utils.yolov5_collate". Please refer to ' \
|
144 |
+
'"cconfigs/ppyoloe/ppyoloe_plus_s_fast_8xb8-80e_coco.py".'
|
145 |
+
|
146 |
+
data = self.cast_data(data)
|
147 |
+
inputs, data_samples = data['inputs'], data['data_samples']
|
148 |
+
assert isinstance(data['data_samples'], dict)
|
149 |
+
|
150 |
+
# Process data.
|
151 |
+
batch_inputs = []
|
152 |
+
for _input in inputs:
|
153 |
+
# channel transform
|
154 |
+
if self._channel_conversion:
|
155 |
+
_input = _input[[2, 1, 0], ...]
|
156 |
+
# Convert to float after channel conversion to ensure
|
157 |
+
# efficiency
|
158 |
+
_input = _input.float()
|
159 |
+
batch_inputs.append(_input)
|
160 |
+
|
161 |
+
# Batch random resize image.
|
162 |
+
if self.batch_augments is not None:
|
163 |
+
for batch_aug in self.batch_augments:
|
164 |
+
inputs, data_samples = batch_aug(batch_inputs, data_samples)
|
165 |
+
|
166 |
+
if self._enable_normalize:
|
167 |
+
inputs = (inputs - self.mean) / self.std
|
168 |
+
|
169 |
+
img_metas = [{'batch_input_shape': inputs.shape[2:]}] * len(inputs)
|
170 |
+
data_samples = {
|
171 |
+
'bboxes_labels': data_samples['bboxes_labels'],
|
172 |
+
'img_metas': img_metas
|
173 |
+
}
|
174 |
+
|
175 |
+
return {'inputs': inputs, 'data_samples': data_samples}
|
176 |
+
|
177 |
+
|
178 |
+
# TODO: No generality. Its input data format is different
|
179 |
+
# mmdet's batch aug, and it must be compatible in the future.
|
180 |
+
@MODELS.register_module()
|
181 |
+
class PPYOLOEBatchRandomResize(BatchSyncRandomResize):
|
182 |
+
"""PPYOLOE batch random resize.
|
183 |
+
|
184 |
+
Args:
|
185 |
+
random_size_range (tuple): The multi-scale random range during
|
186 |
+
multi-scale training.
|
187 |
+
interval (int): The iter interval of change
|
188 |
+
image size. Defaults to 10.
|
189 |
+
size_divisor (int): Image size divisible factor.
|
190 |
+
Defaults to 32.
|
191 |
+
random_interp (bool): Whether to choose interp_mode randomly.
|
192 |
+
If set to True, the type of `interp_mode` must be list.
|
193 |
+
If set to False, the type of `interp_mode` must be str.
|
194 |
+
Defaults to True.
|
195 |
+
interp_mode (Union[List, str]): The modes available for resizing
|
196 |
+
are ('nearest', 'bilinear', 'bicubic', 'area').
|
197 |
+
keep_ratio (bool): Whether to keep the aspect ratio when resizing
|
198 |
+
the image. Now we only support keep_ratio=False.
|
199 |
+
Defaults to False.
|
200 |
+
"""
|
201 |
+
|
202 |
+
def __init__(self,
|
203 |
+
random_size_range: Tuple[int, int],
|
204 |
+
interval: int = 1,
|
205 |
+
size_divisor: int = 32,
|
206 |
+
random_interp=True,
|
207 |
+
interp_mode: Union[List[str], str] = [
|
208 |
+
'nearest', 'bilinear', 'bicubic', 'area'
|
209 |
+
],
|
210 |
+
keep_ratio: bool = False) -> None:
|
211 |
+
super().__init__(random_size_range, interval, size_divisor)
|
212 |
+
self.random_interp = random_interp
|
213 |
+
self.keep_ratio = keep_ratio
|
214 |
+
# TODO: need to support keep_ratio==True
|
215 |
+
assert not self.keep_ratio, 'We do not yet support keep_ratio=True'
|
216 |
+
|
217 |
+
if self.random_interp:
|
218 |
+
assert isinstance(interp_mode, list) and len(interp_mode) > 1,\
|
219 |
+
'While random_interp==True, the type of `interp_mode`' \
|
220 |
+
' must be list and len(interp_mode) must large than 1'
|
221 |
+
self.interp_mode_list = interp_mode
|
222 |
+
self.interp_mode = None
|
223 |
+
else:
|
224 |
+
assert isinstance(interp_mode, str),\
|
225 |
+
'While random_interp==False, the type of ' \
|
226 |
+
'`interp_mode` must be str'
|
227 |
+
assert interp_mode in ['nearest', 'bilinear', 'bicubic', 'area']
|
228 |
+
self.interp_mode_list = None
|
229 |
+
self.interp_mode = interp_mode
|
230 |
+
|
231 |
+
def forward(self, inputs: list,
|
232 |
+
data_samples: dict) -> Tuple[Tensor, Tensor]:
|
233 |
+
"""Resize a batch of images and bboxes to shape ``self._input_size``.
|
234 |
+
|
235 |
+
The inputs and data_samples should be list, and
|
236 |
+
``PPYOLOEBatchRandomResize`` must be used with
|
237 |
+
``PPYOLOEDetDataPreprocessor`` and ``yolov5_collate`` with
|
238 |
+
``use_ms_training == True``.
|
239 |
+
"""
|
240 |
+
assert isinstance(inputs, list),\
|
241 |
+
'The type of inputs must be list. The possible reason for this ' \
|
242 |
+
'is that you are not using it with `PPYOLOEDetDataPreprocessor` ' \
|
243 |
+
'and `yolov5_collate` with use_ms_training == True.'
|
244 |
+
|
245 |
+
bboxes_labels = data_samples['bboxes_labels']
|
246 |
+
|
247 |
+
message_hub = MessageHub.get_current_instance()
|
248 |
+
if (message_hub.get_info('iter') + 1) % self._interval == 0:
|
249 |
+
# get current input size
|
250 |
+
self._input_size, interp_mode = self._get_random_size_and_interp()
|
251 |
+
if self.random_interp:
|
252 |
+
self.interp_mode = interp_mode
|
253 |
+
|
254 |
+
# TODO: need to support type(inputs)==Tensor
|
255 |
+
if isinstance(inputs, list):
|
256 |
+
outputs = []
|
257 |
+
for i in range(len(inputs)):
|
258 |
+
_batch_input = inputs[i]
|
259 |
+
h, w = _batch_input.shape[-2:]
|
260 |
+
scale_y = self._input_size[0] / h
|
261 |
+
scale_x = self._input_size[1] / w
|
262 |
+
if scale_x != 1. or scale_y != 1.:
|
263 |
+
if self.interp_mode in ('nearest', 'area'):
|
264 |
+
align_corners = None
|
265 |
+
else:
|
266 |
+
align_corners = False
|
267 |
+
_batch_input = F.interpolate(
|
268 |
+
_batch_input.unsqueeze(0),
|
269 |
+
size=self._input_size,
|
270 |
+
mode=self.interp_mode,
|
271 |
+
align_corners=align_corners)
|
272 |
+
|
273 |
+
# rescale boxes
|
274 |
+
indexes = bboxes_labels[:, 0] == i
|
275 |
+
bboxes_labels[indexes, 2] *= scale_x
|
276 |
+
bboxes_labels[indexes, 3] *= scale_y
|
277 |
+
bboxes_labels[indexes, 4] *= scale_x
|
278 |
+
bboxes_labels[indexes, 5] *= scale_y
|
279 |
+
|
280 |
+
data_samples['bboxes_labels'] = bboxes_labels
|
281 |
+
else:
|
282 |
+
_batch_input = _batch_input.unsqueeze(0)
|
283 |
+
|
284 |
+
outputs.append(_batch_input)
|
285 |
+
|
286 |
+
# convert to Tensor
|
287 |
+
return torch.cat(outputs, dim=0), data_samples
|
288 |
+
else:
|
289 |
+
raise NotImplementedError('Not implemented yet!')
|
290 |
+
|
291 |
+
def _get_random_size_and_interp(self) -> Tuple[int, int]:
|
292 |
+
"""Randomly generate a shape in ``_random_size_range`` and a
|
293 |
+
interp_mode in interp_mode_list."""
|
294 |
+
size = random.randint(*self._random_size_range)
|
295 |
+
input_size = (self._size_divisor * size, self._size_divisor * size)
|
296 |
+
|
297 |
+
if self.random_interp:
|
298 |
+
interp_ind = random.randint(0, len(self.interp_mode_list) - 1)
|
299 |
+
interp_mode = self.interp_mode_list[interp_ind]
|
300 |
+
else:
|
301 |
+
interp_mode = None
|
302 |
+
return input_size, interp_mode
|
mmyolo/models/dense_heads/__init__.py
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) OpenMMLab. All rights reserved.
|
2 |
+
from .ppyoloe_head import PPYOLOEHead, PPYOLOEHeadModule
|
3 |
+
from .rtmdet_head import RTMDetHead, RTMDetSepBNHeadModule
|
4 |
+
from .rtmdet_ins_head import RTMDetInsSepBNHead, RTMDetInsSepBNHeadModule
|
5 |
+
from .rtmdet_rotated_head import (RTMDetRotatedHead,
|
6 |
+
RTMDetRotatedSepBNHeadModule)
|
7 |
+
from .yolov5_head import YOLOv5Head, YOLOv5HeadModule
|
8 |
+
from .yolov6_head import YOLOv6Head, YOLOv6HeadModule
|
9 |
+
from .yolov7_head import YOLOv7Head, YOLOv7HeadModule, YOLOv7p6HeadModule
|
10 |
+
from .yolov8_head import YOLOv8Head, YOLOv8HeadModule
|
11 |
+
from .yolox_head import YOLOXHead, YOLOXHeadModule
|
12 |
+
|
13 |
+
__all__ = [
|
14 |
+
'YOLOv5Head', 'YOLOv6Head', 'YOLOXHead', 'YOLOv5HeadModule',
|
15 |
+
'YOLOv6HeadModule', 'YOLOXHeadModule', 'RTMDetHead',
|
16 |
+
'RTMDetSepBNHeadModule', 'YOLOv7Head', 'PPYOLOEHead', 'PPYOLOEHeadModule',
|
17 |
+
'YOLOv7HeadModule', 'YOLOv7p6HeadModule', 'YOLOv8Head', 'YOLOv8HeadModule',
|
18 |
+
'RTMDetRotatedHead', 'RTMDetRotatedSepBNHeadModule', 'RTMDetInsSepBNHead',
|
19 |
+
'RTMDetInsSepBNHeadModule'
|
20 |
+
]
|
mmyolo/models/dense_heads/ppyoloe_head.py
ADDED
@@ -0,0 +1,374 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) OpenMMLab. All rights reserved.
|
2 |
+
from typing import Sequence, Tuple, Union
|
3 |
+
|
4 |
+
import torch
|
5 |
+
import torch.nn as nn
|
6 |
+
import torch.nn.functional as F
|
7 |
+
from mmdet.models.utils import multi_apply
|
8 |
+
from mmdet.utils import (ConfigType, OptConfigType, OptInstanceList,
|
9 |
+
OptMultiConfig, reduce_mean)
|
10 |
+
from mmengine import MessageHub
|
11 |
+
from mmengine.model import BaseModule, bias_init_with_prob
|
12 |
+
from mmengine.structures import InstanceData
|
13 |
+
from torch import Tensor
|
14 |
+
|
15 |
+
from mmyolo.registry import MODELS
|
16 |
+
from ..layers.yolo_bricks import PPYOLOESELayer
|
17 |
+
from ..utils import gt_instances_preprocess
|
18 |
+
from .yolov6_head import YOLOv6Head
|
19 |
+
|
20 |
+
|
21 |
+
@MODELS.register_module()
|
22 |
+
class PPYOLOEHeadModule(BaseModule):
|
23 |
+
"""PPYOLOEHead head module used in `PPYOLOE.
|
24 |
+
|
25 |
+
<https://arxiv.org/abs/2203.16250>`_.
|
26 |
+
|
27 |
+
Args:
|
28 |
+
num_classes (int): Number of categories excluding the background
|
29 |
+
category.
|
30 |
+
in_channels (int): Number of channels in the input feature map.
|
31 |
+
widen_factor (float): Width multiplier, multiply number of
|
32 |
+
channels in each layer by this amount. Defaults to 1.0.
|
33 |
+
num_base_priors (int): The number of priors (points) at a point
|
34 |
+
on the feature grid.
|
35 |
+
featmap_strides (Sequence[int]): Downsample factor of each feature map.
|
36 |
+
Defaults to (8, 16, 32).
|
37 |
+
reg_max (int): Max value of integral set :math: ``{0, ..., reg_max}``
|
38 |
+
in QFL setting. Defaults to 16.
|
39 |
+
norm_cfg (dict): Config dict for normalization layer.
|
40 |
+
Defaults to dict(type='BN', momentum=0.03, eps=0.001).
|
41 |
+
act_cfg (dict): Config dict for activation layer.
|
42 |
+
Defaults to dict(type='SiLU', inplace=True).
|
43 |
+
init_cfg (dict or list[dict], optional): Initialization config dict.
|
44 |
+
Defaults to None.
|
45 |
+
"""
|
46 |
+
|
47 |
+
def __init__(self,
|
48 |
+
num_classes: int,
|
49 |
+
in_channels: Union[int, Sequence],
|
50 |
+
widen_factor: float = 1.0,
|
51 |
+
num_base_priors: int = 1,
|
52 |
+
featmap_strides: Sequence[int] = (8, 16, 32),
|
53 |
+
reg_max: int = 16,
|
54 |
+
norm_cfg: ConfigType = dict(
|
55 |
+
type='BN', momentum=0.1, eps=1e-5),
|
56 |
+
act_cfg: ConfigType = dict(type='SiLU', inplace=True),
|
57 |
+
init_cfg: OptMultiConfig = None):
|
58 |
+
super().__init__(init_cfg=init_cfg)
|
59 |
+
|
60 |
+
self.num_classes = num_classes
|
61 |
+
self.featmap_strides = featmap_strides
|
62 |
+
self.num_levels = len(self.featmap_strides)
|
63 |
+
self.num_base_priors = num_base_priors
|
64 |
+
self.norm_cfg = norm_cfg
|
65 |
+
self.act_cfg = act_cfg
|
66 |
+
self.reg_max = reg_max
|
67 |
+
|
68 |
+
if isinstance(in_channels, int):
|
69 |
+
self.in_channels = [int(in_channels * widen_factor)
|
70 |
+
] * self.num_levels
|
71 |
+
else:
|
72 |
+
self.in_channels = [int(i * widen_factor) for i in in_channels]
|
73 |
+
|
74 |
+
self._init_layers()
|
75 |
+
|
76 |
+
def init_weights(self, prior_prob=0.01):
|
77 |
+
"""Initialize the weight and bias of PPYOLOE head."""
|
78 |
+
super().init_weights()
|
79 |
+
for conv in self.cls_preds:
|
80 |
+
conv.bias.data.fill_(bias_init_with_prob(prior_prob))
|
81 |
+
conv.weight.data.fill_(0.)
|
82 |
+
|
83 |
+
for conv in self.reg_preds:
|
84 |
+
conv.bias.data.fill_(1.0)
|
85 |
+
conv.weight.data.fill_(0.)
|
86 |
+
|
87 |
+
def _init_layers(self):
|
88 |
+
"""initialize conv layers in PPYOLOE head."""
|
89 |
+
self.cls_preds = nn.ModuleList()
|
90 |
+
self.reg_preds = nn.ModuleList()
|
91 |
+
self.cls_stems = nn.ModuleList()
|
92 |
+
self.reg_stems = nn.ModuleList()
|
93 |
+
|
94 |
+
for in_channel in self.in_channels:
|
95 |
+
self.cls_stems.append(
|
96 |
+
PPYOLOESELayer(
|
97 |
+
in_channel, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg))
|
98 |
+
self.reg_stems.append(
|
99 |
+
PPYOLOESELayer(
|
100 |
+
in_channel, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg))
|
101 |
+
|
102 |
+
for in_channel in self.in_channels:
|
103 |
+
self.cls_preds.append(
|
104 |
+
nn.Conv2d(in_channel, self.num_classes, 3, padding=1))
|
105 |
+
self.reg_preds.append(
|
106 |
+
nn.Conv2d(in_channel, 4 * (self.reg_max + 1), 3, padding=1))
|
107 |
+
|
108 |
+
# init proj
|
109 |
+
proj = torch.linspace(0, self.reg_max, self.reg_max + 1).view(
|
110 |
+
[1, self.reg_max + 1, 1, 1])
|
111 |
+
self.register_buffer('proj', proj, persistent=False)
|
112 |
+
|
113 |
+
def forward(self, x: Tuple[Tensor]) -> Tensor:
|
114 |
+
"""Forward features from the upstream network.
|
115 |
+
|
116 |
+
Args:
|
117 |
+
x (Tuple[Tensor]): Features from the upstream network, each is
|
118 |
+
a 4D-tensor.
|
119 |
+
Returns:
|
120 |
+
Tuple[List]: A tuple of multi-level classification scores, bbox
|
121 |
+
predictions.
|
122 |
+
"""
|
123 |
+
assert len(x) == self.num_levels
|
124 |
+
|
125 |
+
return multi_apply(self.forward_single, x, self.cls_stems,
|
126 |
+
self.cls_preds, self.reg_stems, self.reg_preds)
|
127 |
+
|
128 |
+
def forward_single(self, x: Tensor, cls_stem: nn.ModuleList,
|
129 |
+
cls_pred: nn.ModuleList, reg_stem: nn.ModuleList,
|
130 |
+
reg_pred: nn.ModuleList) -> Tensor:
|
131 |
+
"""Forward feature of a single scale level."""
|
132 |
+
b, _, h, w = x.shape
|
133 |
+
hw = h * w
|
134 |
+
avg_feat = F.adaptive_avg_pool2d(x, (1, 1))
|
135 |
+
cls_logit = cls_pred(cls_stem(x, avg_feat) + x)
|
136 |
+
bbox_dist_preds = reg_pred(reg_stem(x, avg_feat))
|
137 |
+
# TODO: Test whether use matmul instead of conv can speed up training.
|
138 |
+
bbox_dist_preds = bbox_dist_preds.reshape(
|
139 |
+
[-1, 4, self.reg_max + 1, hw]).permute(0, 2, 3, 1)
|
140 |
+
|
141 |
+
bbox_preds = F.conv2d(F.softmax(bbox_dist_preds, dim=1), self.proj)
|
142 |
+
|
143 |
+
if self.training:
|
144 |
+
return cls_logit, bbox_preds, bbox_dist_preds
|
145 |
+
else:
|
146 |
+
return cls_logit, bbox_preds
|
147 |
+
|
148 |
+
|
149 |
+
@MODELS.register_module()
|
150 |
+
class PPYOLOEHead(YOLOv6Head):
|
151 |
+
"""PPYOLOEHead head used in `PPYOLOE <https://arxiv.org/abs/2203.16250>`_.
|
152 |
+
The YOLOv6 head and the PPYOLOE head are only slightly different.
|
153 |
+
Distribution focal loss is extra used in PPYOLOE, but not in YOLOv6.
|
154 |
+
|
155 |
+
Args:
|
156 |
+
head_module(ConfigType): Base module used for YOLOv5Head
|
157 |
+
prior_generator(dict): Points generator feature maps in
|
158 |
+
2D points-based detectors.
|
159 |
+
bbox_coder (:obj:`ConfigDict` or dict): Config of bbox coder.
|
160 |
+
loss_cls (:obj:`ConfigDict` or dict): Config of classification loss.
|
161 |
+
loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss.
|
162 |
+
loss_dfl (:obj:`ConfigDict` or dict): Config of distribution focal
|
163 |
+
loss.
|
164 |
+
train_cfg (:obj:`ConfigDict` or dict, optional): Training config of
|
165 |
+
anchor head. Defaults to None.
|
166 |
+
test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of
|
167 |
+
anchor head. Defaults to None.
|
168 |
+
init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or
|
169 |
+
list[dict], optional): Initialization config dict.
|
170 |
+
Defaults to None.
|
171 |
+
"""
|
172 |
+
|
173 |
+
def __init__(self,
|
174 |
+
head_module: ConfigType,
|
175 |
+
prior_generator: ConfigType = dict(
|
176 |
+
type='mmdet.MlvlPointGenerator',
|
177 |
+
offset=0.5,
|
178 |
+
strides=[8, 16, 32]),
|
179 |
+
bbox_coder: ConfigType = dict(type='DistancePointBBoxCoder'),
|
180 |
+
loss_cls: ConfigType = dict(
|
181 |
+
type='mmdet.VarifocalLoss',
|
182 |
+
use_sigmoid=True,
|
183 |
+
alpha=0.75,
|
184 |
+
gamma=2.0,
|
185 |
+
iou_weighted=True,
|
186 |
+
reduction='sum',
|
187 |
+
loss_weight=1.0),
|
188 |
+
loss_bbox: ConfigType = dict(
|
189 |
+
type='IoULoss',
|
190 |
+
iou_mode='giou',
|
191 |
+
bbox_format='xyxy',
|
192 |
+
reduction='mean',
|
193 |
+
loss_weight=2.5,
|
194 |
+
return_iou=False),
|
195 |
+
loss_dfl: ConfigType = dict(
|
196 |
+
type='mmdet.DistributionFocalLoss',
|
197 |
+
reduction='mean',
|
198 |
+
loss_weight=0.5 / 4),
|
199 |
+
train_cfg: OptConfigType = None,
|
200 |
+
test_cfg: OptConfigType = None,
|
201 |
+
init_cfg: OptMultiConfig = None):
|
202 |
+
super().__init__(
|
203 |
+
head_module=head_module,
|
204 |
+
prior_generator=prior_generator,
|
205 |
+
bbox_coder=bbox_coder,
|
206 |
+
loss_cls=loss_cls,
|
207 |
+
loss_bbox=loss_bbox,
|
208 |
+
train_cfg=train_cfg,
|
209 |
+
test_cfg=test_cfg,
|
210 |
+
init_cfg=init_cfg)
|
211 |
+
self.loss_dfl = MODELS.build(loss_dfl)
|
212 |
+
# ppyoloe doesn't need loss_obj
|
213 |
+
self.loss_obj = None
|
214 |
+
|
215 |
+
def loss_by_feat(
|
216 |
+
self,
|
217 |
+
cls_scores: Sequence[Tensor],
|
218 |
+
bbox_preds: Sequence[Tensor],
|
219 |
+
bbox_dist_preds: Sequence[Tensor],
|
220 |
+
batch_gt_instances: Sequence[InstanceData],
|
221 |
+
batch_img_metas: Sequence[dict],
|
222 |
+
batch_gt_instances_ignore: OptInstanceList = None) -> dict:
|
223 |
+
"""Calculate the loss based on the features extracted by the detection
|
224 |
+
head.
|
225 |
+
|
226 |
+
Args:
|
227 |
+
cls_scores (Sequence[Tensor]): Box scores for each scale level,
|
228 |
+
each is a 4D-tensor, the channel number is
|
229 |
+
num_priors * num_classes.
|
230 |
+
bbox_preds (Sequence[Tensor]): Box energies / deltas for each scale
|
231 |
+
level, each is a 4D-tensor, the channel number is
|
232 |
+
num_priors * 4.
|
233 |
+
bbox_dist_preds (Sequence[Tensor]): Box distribution logits for
|
234 |
+
each scale level with shape (bs, reg_max + 1, H*W, 4).
|
235 |
+
batch_gt_instances (list[:obj:`InstanceData`]): Batch of
|
236 |
+
gt_instance. It usually includes ``bboxes`` and ``labels``
|
237 |
+
attributes.
|
238 |
+
batch_img_metas (list[dict]): Meta information of each image, e.g.,
|
239 |
+
image size, scaling factor, etc.
|
240 |
+
batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
|
241 |
+
Batch of gt_instances_ignore. It includes ``bboxes`` attribute
|
242 |
+
data that is ignored during training and testing.
|
243 |
+
Defaults to None.
|
244 |
+
Returns:
|
245 |
+
dict[str, Tensor]: A dictionary of losses.
|
246 |
+
"""
|
247 |
+
|
248 |
+
# get epoch information from message hub
|
249 |
+
message_hub = MessageHub.get_current_instance()
|
250 |
+
current_epoch = message_hub.get_info('epoch')
|
251 |
+
|
252 |
+
num_imgs = len(batch_img_metas)
|
253 |
+
|
254 |
+
current_featmap_sizes = [
|
255 |
+
cls_score.shape[2:] for cls_score in cls_scores
|
256 |
+
]
|
257 |
+
# If the shape does not equal, generate new one
|
258 |
+
if current_featmap_sizes != self.featmap_sizes_train:
|
259 |
+
self.featmap_sizes_train = current_featmap_sizes
|
260 |
+
|
261 |
+
mlvl_priors_with_stride = self.prior_generator.grid_priors(
|
262 |
+
self.featmap_sizes_train,
|
263 |
+
dtype=cls_scores[0].dtype,
|
264 |
+
device=cls_scores[0].device,
|
265 |
+
with_stride=True)
|
266 |
+
|
267 |
+
self.num_level_priors = [len(n) for n in mlvl_priors_with_stride]
|
268 |
+
self.flatten_priors_train = torch.cat(
|
269 |
+
mlvl_priors_with_stride, dim=0)
|
270 |
+
self.stride_tensor = self.flatten_priors_train[..., [2]]
|
271 |
+
|
272 |
+
# gt info
|
273 |
+
gt_info = gt_instances_preprocess(batch_gt_instances, num_imgs)
|
274 |
+
gt_labels = gt_info[:, :, :1]
|
275 |
+
gt_bboxes = gt_info[:, :, 1:] # xyxy
|
276 |
+
pad_bbox_flag = (gt_bboxes.sum(-1, keepdim=True) > 0).float()
|
277 |
+
|
278 |
+
# pred info
|
279 |
+
flatten_cls_preds = [
|
280 |
+
cls_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1,
|
281 |
+
self.num_classes)
|
282 |
+
for cls_pred in cls_scores
|
283 |
+
]
|
284 |
+
flatten_pred_bboxes = [
|
285 |
+
bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4)
|
286 |
+
for bbox_pred in bbox_preds
|
287 |
+
]
|
288 |
+
# (bs, reg_max+1, n, 4) -> (bs, n, 4, reg_max+1)
|
289 |
+
flatten_pred_dists = [
|
290 |
+
bbox_pred_org.permute(0, 2, 3, 1).reshape(
|
291 |
+
num_imgs, -1, (self.head_module.reg_max + 1) * 4)
|
292 |
+
for bbox_pred_org in bbox_dist_preds
|
293 |
+
]
|
294 |
+
|
295 |
+
flatten_dist_preds = torch.cat(flatten_pred_dists, dim=1)
|
296 |
+
flatten_cls_preds = torch.cat(flatten_cls_preds, dim=1)
|
297 |
+
flatten_pred_bboxes = torch.cat(flatten_pred_bboxes, dim=1)
|
298 |
+
flatten_pred_bboxes = self.bbox_coder.decode(
|
299 |
+
self.flatten_priors_train[..., :2], flatten_pred_bboxes,
|
300 |
+
self.stride_tensor[..., 0])
|
301 |
+
pred_scores = torch.sigmoid(flatten_cls_preds)
|
302 |
+
|
303 |
+
if current_epoch < self.initial_epoch:
|
304 |
+
assigned_result = self.initial_assigner(
|
305 |
+
flatten_pred_bboxes.detach(), self.flatten_priors_train,
|
306 |
+
self.num_level_priors, gt_labels, gt_bboxes, pad_bbox_flag)
|
307 |
+
else:
|
308 |
+
assigned_result = self.assigner(flatten_pred_bboxes.detach(),
|
309 |
+
pred_scores.detach(),
|
310 |
+
self.flatten_priors_train,
|
311 |
+
gt_labels, gt_bboxes,
|
312 |
+
pad_bbox_flag)
|
313 |
+
|
314 |
+
assigned_bboxes = assigned_result['assigned_bboxes']
|
315 |
+
assigned_scores = assigned_result['assigned_scores']
|
316 |
+
fg_mask_pre_prior = assigned_result['fg_mask_pre_prior']
|
317 |
+
|
318 |
+
# cls loss
|
319 |
+
with torch.cuda.amp.autocast(enabled=False):
|
320 |
+
loss_cls = self.loss_cls(flatten_cls_preds, assigned_scores)
|
321 |
+
|
322 |
+
# rescale bbox
|
323 |
+
assigned_bboxes /= self.stride_tensor
|
324 |
+
flatten_pred_bboxes /= self.stride_tensor
|
325 |
+
|
326 |
+
assigned_scores_sum = assigned_scores.sum()
|
327 |
+
# reduce_mean between all gpus
|
328 |
+
assigned_scores_sum = torch.clamp(
|
329 |
+
reduce_mean(assigned_scores_sum), min=1)
|
330 |
+
loss_cls /= assigned_scores_sum
|
331 |
+
|
332 |
+
# select positive samples mask
|
333 |
+
num_pos = fg_mask_pre_prior.sum()
|
334 |
+
if num_pos > 0:
|
335 |
+
# when num_pos > 0, assigned_scores_sum will >0, so the loss_bbox
|
336 |
+
# will not report an error
|
337 |
+
# iou loss
|
338 |
+
prior_bbox_mask = fg_mask_pre_prior.unsqueeze(-1).repeat([1, 1, 4])
|
339 |
+
pred_bboxes_pos = torch.masked_select(
|
340 |
+
flatten_pred_bboxes, prior_bbox_mask).reshape([-1, 4])
|
341 |
+
assigned_bboxes_pos = torch.masked_select(
|
342 |
+
assigned_bboxes, prior_bbox_mask).reshape([-1, 4])
|
343 |
+
bbox_weight = torch.masked_select(
|
344 |
+
assigned_scores.sum(-1), fg_mask_pre_prior).unsqueeze(-1)
|
345 |
+
loss_bbox = self.loss_bbox(
|
346 |
+
pred_bboxes_pos,
|
347 |
+
assigned_bboxes_pos,
|
348 |
+
weight=bbox_weight,
|
349 |
+
avg_factor=assigned_scores_sum)
|
350 |
+
|
351 |
+
# dfl loss
|
352 |
+
dist_mask = fg_mask_pre_prior.unsqueeze(-1).repeat(
|
353 |
+
[1, 1, (self.head_module.reg_max + 1) * 4])
|
354 |
+
|
355 |
+
pred_dist_pos = torch.masked_select(
|
356 |
+
flatten_dist_preds,
|
357 |
+
dist_mask).reshape([-1, 4, self.head_module.reg_max + 1])
|
358 |
+
assigned_ltrb = self.bbox_coder.encode(
|
359 |
+
self.flatten_priors_train[..., :2] / self.stride_tensor,
|
360 |
+
assigned_bboxes,
|
361 |
+
max_dis=self.head_module.reg_max,
|
362 |
+
eps=0.01)
|
363 |
+
assigned_ltrb_pos = torch.masked_select(
|
364 |
+
assigned_ltrb, prior_bbox_mask).reshape([-1, 4])
|
365 |
+
loss_dfl = self.loss_dfl(
|
366 |
+
pred_dist_pos.reshape(-1, self.head_module.reg_max + 1),
|
367 |
+
assigned_ltrb_pos.reshape(-1),
|
368 |
+
weight=bbox_weight.expand(-1, 4).reshape(-1),
|
369 |
+
avg_factor=assigned_scores_sum)
|
370 |
+
else:
|
371 |
+
loss_bbox = flatten_pred_bboxes.sum() * 0
|
372 |
+
loss_dfl = flatten_pred_bboxes.sum() * 0
|
373 |
+
|
374 |
+
return dict(loss_cls=loss_cls, loss_bbox=loss_bbox, loss_dfl=loss_dfl)
|
mmyolo/models/dense_heads/rtmdet_head.py
ADDED
@@ -0,0 +1,368 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) OpenMMLab. All rights reserved.
|
2 |
+
from typing import List, Sequence, Tuple
|
3 |
+
|
4 |
+
import torch
|
5 |
+
import torch.nn as nn
|
6 |
+
from mmcv.cnn import ConvModule, is_norm
|
7 |
+
from mmdet.models.task_modules.samplers import PseudoSampler
|
8 |
+
from mmdet.structures.bbox import distance2bbox
|
9 |
+
from mmdet.utils import (ConfigType, InstanceList, OptConfigType,
|
10 |
+
OptInstanceList, OptMultiConfig, reduce_mean)
|
11 |
+
from mmengine.model import (BaseModule, bias_init_with_prob, constant_init,
|
12 |
+
normal_init)
|
13 |
+
from torch import Tensor
|
14 |
+
|
15 |
+
from mmyolo.registry import MODELS, TASK_UTILS
|
16 |
+
from ..utils import gt_instances_preprocess
|
17 |
+
from .yolov5_head import YOLOv5Head
|
18 |
+
|
19 |
+
|
20 |
+
@MODELS.register_module()
|
21 |
+
class RTMDetSepBNHeadModule(BaseModule):
|
22 |
+
"""Detection Head of RTMDet.
|
23 |
+
|
24 |
+
Args:
|
25 |
+
num_classes (int): Number of categories excluding the background
|
26 |
+
category.
|
27 |
+
in_channels (int): Number of channels in the input feature map.
|
28 |
+
widen_factor (float): Width multiplier, multiply number of
|
29 |
+
channels in each layer by this amount. Defaults to 1.0.
|
30 |
+
num_base_priors (int): The number of priors (points) at a point
|
31 |
+
on the feature grid. Defaults to 1.
|
32 |
+
feat_channels (int): Number of hidden channels. Used in child classes.
|
33 |
+
Defaults to 256
|
34 |
+
stacked_convs (int): Number of stacking convs of the head.
|
35 |
+
Defaults to 2.
|
36 |
+
featmap_strides (Sequence[int]): Downsample factor of each feature map.
|
37 |
+
Defaults to (8, 16, 32).
|
38 |
+
share_conv (bool): Whether to share conv layers between stages.
|
39 |
+
Defaults to True.
|
40 |
+
pred_kernel_size (int): Kernel size of ``nn.Conv2d``. Defaults to 1.
|
41 |
+
conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for
|
42 |
+
convolution layer. Defaults to None.
|
43 |
+
norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization
|
44 |
+
layer. Defaults to ``dict(type='BN')``.
|
45 |
+
act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer.
|
46 |
+
Default: dict(type='SiLU', inplace=True).
|
47 |
+
init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or
|
48 |
+
list[dict], optional): Initialization config dict.
|
49 |
+
Defaults to None.
|
50 |
+
"""
|
51 |
+
|
52 |
+
def __init__(
|
53 |
+
self,
|
54 |
+
num_classes: int,
|
55 |
+
in_channels: int,
|
56 |
+
widen_factor: float = 1.0,
|
57 |
+
num_base_priors: int = 1,
|
58 |
+
feat_channels: int = 256,
|
59 |
+
stacked_convs: int = 2,
|
60 |
+
featmap_strides: Sequence[int] = [8, 16, 32],
|
61 |
+
share_conv: bool = True,
|
62 |
+
pred_kernel_size: int = 1,
|
63 |
+
conv_cfg: OptConfigType = None,
|
64 |
+
norm_cfg: ConfigType = dict(type='BN'),
|
65 |
+
act_cfg: ConfigType = dict(type='SiLU', inplace=True),
|
66 |
+
init_cfg: OptMultiConfig = None,
|
67 |
+
):
|
68 |
+
super().__init__(init_cfg=init_cfg)
|
69 |
+
self.share_conv = share_conv
|
70 |
+
self.num_classes = num_classes
|
71 |
+
self.pred_kernel_size = pred_kernel_size
|
72 |
+
self.feat_channels = int(feat_channels * widen_factor)
|
73 |
+
self.stacked_convs = stacked_convs
|
74 |
+
self.num_base_priors = num_base_priors
|
75 |
+
|
76 |
+
self.conv_cfg = conv_cfg
|
77 |
+
self.norm_cfg = norm_cfg
|
78 |
+
self.act_cfg = act_cfg
|
79 |
+
self.featmap_strides = featmap_strides
|
80 |
+
|
81 |
+
self.in_channels = int(in_channels * widen_factor)
|
82 |
+
|
83 |
+
self._init_layers()
|
84 |
+
|
85 |
+
def _init_layers(self):
|
86 |
+
"""Initialize layers of the head."""
|
87 |
+
self.cls_convs = nn.ModuleList()
|
88 |
+
self.reg_convs = nn.ModuleList()
|
89 |
+
|
90 |
+
self.rtm_cls = nn.ModuleList()
|
91 |
+
self.rtm_reg = nn.ModuleList()
|
92 |
+
for n in range(len(self.featmap_strides)):
|
93 |
+
cls_convs = nn.ModuleList()
|
94 |
+
reg_convs = nn.ModuleList()
|
95 |
+
for i in range(self.stacked_convs):
|
96 |
+
chn = self.in_channels if i == 0 else self.feat_channels
|
97 |
+
cls_convs.append(
|
98 |
+
ConvModule(
|
99 |
+
chn,
|
100 |
+
self.feat_channels,
|
101 |
+
3,
|
102 |
+
stride=1,
|
103 |
+
padding=1,
|
104 |
+
conv_cfg=self.conv_cfg,
|
105 |
+
norm_cfg=self.norm_cfg,
|
106 |
+
act_cfg=self.act_cfg))
|
107 |
+
reg_convs.append(
|
108 |
+
ConvModule(
|
109 |
+
chn,
|
110 |
+
self.feat_channels,
|
111 |
+
3,
|
112 |
+
stride=1,
|
113 |
+
padding=1,
|
114 |
+
conv_cfg=self.conv_cfg,
|
115 |
+
norm_cfg=self.norm_cfg,
|
116 |
+
act_cfg=self.act_cfg))
|
117 |
+
self.cls_convs.append(cls_convs)
|
118 |
+
self.reg_convs.append(reg_convs)
|
119 |
+
|
120 |
+
self.rtm_cls.append(
|
121 |
+
nn.Conv2d(
|
122 |
+
self.feat_channels,
|
123 |
+
self.num_base_priors * self.num_classes,
|
124 |
+
self.pred_kernel_size,
|
125 |
+
padding=self.pred_kernel_size // 2))
|
126 |
+
self.rtm_reg.append(
|
127 |
+
nn.Conv2d(
|
128 |
+
self.feat_channels,
|
129 |
+
self.num_base_priors * 4,
|
130 |
+
self.pred_kernel_size,
|
131 |
+
padding=self.pred_kernel_size // 2))
|
132 |
+
|
133 |
+
if self.share_conv:
|
134 |
+
for n in range(len(self.featmap_strides)):
|
135 |
+
for i in range(self.stacked_convs):
|
136 |
+
self.cls_convs[n][i].conv = self.cls_convs[0][i].conv
|
137 |
+
self.reg_convs[n][i].conv = self.reg_convs[0][i].conv
|
138 |
+
|
139 |
+
def init_weights(self) -> None:
|
140 |
+
"""Initialize weights of the head."""
|
141 |
+
# Use prior in model initialization to improve stability
|
142 |
+
super().init_weights()
|
143 |
+
for m in self.modules():
|
144 |
+
if isinstance(m, nn.Conv2d):
|
145 |
+
normal_init(m, mean=0, std=0.01)
|
146 |
+
if is_norm(m):
|
147 |
+
constant_init(m, 1)
|
148 |
+
bias_cls = bias_init_with_prob(0.01)
|
149 |
+
for rtm_cls, rtm_reg in zip(self.rtm_cls, self.rtm_reg):
|
150 |
+
normal_init(rtm_cls, std=0.01, bias=bias_cls)
|
151 |
+
normal_init(rtm_reg, std=0.01)
|
152 |
+
|
153 |
+
def forward(self, feats: Tuple[Tensor, ...]) -> tuple:
|
154 |
+
"""Forward features from the upstream network.
|
155 |
+
|
156 |
+
Args:
|
157 |
+
feats (tuple[Tensor]): Features from the upstream network, each is
|
158 |
+
a 4D-tensor.
|
159 |
+
|
160 |
+
Returns:
|
161 |
+
tuple: Usually a tuple of classification scores and bbox prediction
|
162 |
+
- cls_scores (list[Tensor]): Classification scores for all scale
|
163 |
+
levels, each is a 4D-tensor, the channels number is
|
164 |
+
num_base_priors * num_classes.
|
165 |
+
- bbox_preds (list[Tensor]): Box energies / deltas for all scale
|
166 |
+
levels, each is a 4D-tensor, the channels number is
|
167 |
+
num_base_priors * 4.
|
168 |
+
"""
|
169 |
+
|
170 |
+
cls_scores = []
|
171 |
+
bbox_preds = []
|
172 |
+
for idx, x in enumerate(feats):
|
173 |
+
cls_feat = x
|
174 |
+
reg_feat = x
|
175 |
+
|
176 |
+
for cls_layer in self.cls_convs[idx]:
|
177 |
+
cls_feat = cls_layer(cls_feat)
|
178 |
+
cls_score = self.rtm_cls[idx](cls_feat)
|
179 |
+
|
180 |
+
for reg_layer in self.reg_convs[idx]:
|
181 |
+
reg_feat = reg_layer(reg_feat)
|
182 |
+
|
183 |
+
reg_dist = self.rtm_reg[idx](reg_feat)
|
184 |
+
cls_scores.append(cls_score)
|
185 |
+
bbox_preds.append(reg_dist)
|
186 |
+
return tuple(cls_scores), tuple(bbox_preds)
|
187 |
+
|
188 |
+
|
189 |
+
@MODELS.register_module()
|
190 |
+
class RTMDetHead(YOLOv5Head):
|
191 |
+
"""RTMDet head.
|
192 |
+
|
193 |
+
Args:
|
194 |
+
head_module(ConfigType): Base module used for RTMDetHead
|
195 |
+
prior_generator: Points generator feature maps in
|
196 |
+
2D points-based detectors.
|
197 |
+
bbox_coder (:obj:`ConfigDict` or dict): Config of bbox coder.
|
198 |
+
loss_cls (:obj:`ConfigDict` or dict): Config of classification loss.
|
199 |
+
loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss.
|
200 |
+
train_cfg (:obj:`ConfigDict` or dict, optional): Training config of
|
201 |
+
anchor head. Defaults to None.
|
202 |
+
test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of
|
203 |
+
anchor head. Defaults to None.
|
204 |
+
init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or
|
205 |
+
list[dict], optional): Initialization config dict.
|
206 |
+
Defaults to None.
|
207 |
+
"""
|
208 |
+
|
209 |
+
def __init__(self,
|
210 |
+
head_module: ConfigType,
|
211 |
+
prior_generator: ConfigType = dict(
|
212 |
+
type='mmdet.MlvlPointGenerator',
|
213 |
+
offset=0,
|
214 |
+
strides=[8, 16, 32]),
|
215 |
+
bbox_coder: ConfigType = dict(type='DistancePointBBoxCoder'),
|
216 |
+
loss_cls: ConfigType = dict(
|
217 |
+
type='mmdet.QualityFocalLoss',
|
218 |
+
use_sigmoid=True,
|
219 |
+
beta=2.0,
|
220 |
+
loss_weight=1.0),
|
221 |
+
loss_bbox: ConfigType = dict(
|
222 |
+
type='mmdet.GIoULoss', loss_weight=2.0),
|
223 |
+
train_cfg: OptConfigType = None,
|
224 |
+
test_cfg: OptConfigType = None,
|
225 |
+
init_cfg: OptMultiConfig = None):
|
226 |
+
|
227 |
+
super().__init__(
|
228 |
+
head_module=head_module,
|
229 |
+
prior_generator=prior_generator,
|
230 |
+
bbox_coder=bbox_coder,
|
231 |
+
loss_cls=loss_cls,
|
232 |
+
loss_bbox=loss_bbox,
|
233 |
+
train_cfg=train_cfg,
|
234 |
+
test_cfg=test_cfg,
|
235 |
+
init_cfg=init_cfg)
|
236 |
+
|
237 |
+
self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False)
|
238 |
+
if self.use_sigmoid_cls:
|
239 |
+
self.cls_out_channels = self.num_classes
|
240 |
+
else:
|
241 |
+
self.cls_out_channels = self.num_classes + 1
|
242 |
+
# rtmdet doesn't need loss_obj
|
243 |
+
self.loss_obj = None
|
244 |
+
|
245 |
+
def special_init(self):
|
246 |
+
"""Since YOLO series algorithms will inherit from YOLOv5Head, but
|
247 |
+
different algorithms have special initialization process.
|
248 |
+
|
249 |
+
The special_init function is designed to deal with this situation.
|
250 |
+
"""
|
251 |
+
if self.train_cfg:
|
252 |
+
self.assigner = TASK_UTILS.build(self.train_cfg.assigner)
|
253 |
+
if self.train_cfg.get('sampler', None) is not None:
|
254 |
+
self.sampler = TASK_UTILS.build(
|
255 |
+
self.train_cfg.sampler, default_args=dict(context=self))
|
256 |
+
else:
|
257 |
+
self.sampler = PseudoSampler(context=self)
|
258 |
+
|
259 |
+
self.featmap_sizes_train = None
|
260 |
+
self.flatten_priors_train = None
|
261 |
+
|
262 |
+
def forward(self, x: Tuple[Tensor]) -> Tuple[List]:
|
263 |
+
"""Forward features from the upstream network.
|
264 |
+
|
265 |
+
Args:
|
266 |
+
x (Tuple[Tensor]): Features from the upstream network, each is
|
267 |
+
a 4D-tensor.
|
268 |
+
Returns:
|
269 |
+
Tuple[List]: A tuple of multi-level classification scores, bbox
|
270 |
+
predictions, and objectnesses.
|
271 |
+
"""
|
272 |
+
return self.head_module(x)
|
273 |
+
|
274 |
+
def loss_by_feat(
|
275 |
+
self,
|
276 |
+
cls_scores: List[Tensor],
|
277 |
+
bbox_preds: List[Tensor],
|
278 |
+
batch_gt_instances: InstanceList,
|
279 |
+
batch_img_metas: List[dict],
|
280 |
+
batch_gt_instances_ignore: OptInstanceList = None) -> dict:
|
281 |
+
"""Compute losses of the head.
|
282 |
+
|
283 |
+
Args:
|
284 |
+
cls_scores (list[Tensor]): Box scores for each scale level
|
285 |
+
Has shape (N, num_anchors * num_classes, H, W)
|
286 |
+
bbox_preds (list[Tensor]): Decoded box for each scale
|
287 |
+
level with shape (N, num_anchors * 4, H, W) in
|
288 |
+
[tl_x, tl_y, br_x, br_y] format.
|
289 |
+
batch_gt_instances (list[:obj:`InstanceData`]): Batch of
|
290 |
+
gt_instance. It usually includes ``bboxes`` and ``labels``
|
291 |
+
attributes.
|
292 |
+
batch_img_metas (list[dict]): Meta information of each image, e.g.,
|
293 |
+
image size, scaling factor, etc.
|
294 |
+
batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional):
|
295 |
+
Batch of gt_instances_ignore. It includes ``bboxes`` attribute
|
296 |
+
data that is ignored during training and testing.
|
297 |
+
Defaults to None.
|
298 |
+
|
299 |
+
Returns:
|
300 |
+
dict[str, Tensor]: A dictionary of loss components.
|
301 |
+
"""
|
302 |
+
num_imgs = len(batch_img_metas)
|
303 |
+
featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
|
304 |
+
assert len(featmap_sizes) == self.prior_generator.num_levels
|
305 |
+
|
306 |
+
gt_info = gt_instances_preprocess(batch_gt_instances, num_imgs)
|
307 |
+
gt_labels = gt_info[:, :, :1]
|
308 |
+
gt_bboxes = gt_info[:, :, 1:] # xyxy
|
309 |
+
pad_bbox_flag = (gt_bboxes.sum(-1, keepdim=True) > 0).float()
|
310 |
+
|
311 |
+
device = cls_scores[0].device
|
312 |
+
|
313 |
+
# If the shape does not equal, generate new one
|
314 |
+
if featmap_sizes != self.featmap_sizes_train:
|
315 |
+
self.featmap_sizes_train = featmap_sizes
|
316 |
+
mlvl_priors_with_stride = self.prior_generator.grid_priors(
|
317 |
+
featmap_sizes, device=device, with_stride=True)
|
318 |
+
self.flatten_priors_train = torch.cat(
|
319 |
+
mlvl_priors_with_stride, dim=0)
|
320 |
+
|
321 |
+
flatten_cls_scores = torch.cat([
|
322 |
+
cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1,
|
323 |
+
self.cls_out_channels)
|
324 |
+
for cls_score in cls_scores
|
325 |
+
], 1).contiguous()
|
326 |
+
|
327 |
+
flatten_bboxes = torch.cat([
|
328 |
+
bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4)
|
329 |
+
for bbox_pred in bbox_preds
|
330 |
+
], 1)
|
331 |
+
flatten_bboxes = flatten_bboxes * self.flatten_priors_train[..., -1,
|
332 |
+
None]
|
333 |
+
flatten_bboxes = distance2bbox(self.flatten_priors_train[..., :2],
|
334 |
+
flatten_bboxes)
|
335 |
+
|
336 |
+
assigned_result = self.assigner(flatten_bboxes.detach(),
|
337 |
+
flatten_cls_scores.detach(),
|
338 |
+
self.flatten_priors_train, gt_labels,
|
339 |
+
gt_bboxes, pad_bbox_flag)
|
340 |
+
|
341 |
+
labels = assigned_result['assigned_labels'].reshape(-1)
|
342 |
+
label_weights = assigned_result['assigned_labels_weights'].reshape(-1)
|
343 |
+
bbox_targets = assigned_result['assigned_bboxes'].reshape(-1, 4)
|
344 |
+
assign_metrics = assigned_result['assign_metrics'].reshape(-1)
|
345 |
+
cls_preds = flatten_cls_scores.reshape(-1, self.num_classes)
|
346 |
+
bbox_preds = flatten_bboxes.reshape(-1, 4)
|
347 |
+
|
348 |
+
# FG cat_id: [0, num_classes -1], BG cat_id: num_classes
|
349 |
+
bg_class_ind = self.num_classes
|
350 |
+
pos_inds = ((labels >= 0)
|
351 |
+
& (labels < bg_class_ind)).nonzero().squeeze(1)
|
352 |
+
avg_factor = reduce_mean(assign_metrics.sum()).clamp_(min=1).item()
|
353 |
+
|
354 |
+
loss_cls = self.loss_cls(
|
355 |
+
cls_preds, (labels, assign_metrics),
|
356 |
+
label_weights,
|
357 |
+
avg_factor=avg_factor)
|
358 |
+
|
359 |
+
if len(pos_inds) > 0:
|
360 |
+
loss_bbox = self.loss_bbox(
|
361 |
+
bbox_preds[pos_inds],
|
362 |
+
bbox_targets[pos_inds],
|
363 |
+
weight=assign_metrics[pos_inds],
|
364 |
+
avg_factor=avg_factor)
|
365 |
+
else:
|
366 |
+
loss_bbox = bbox_preds.sum() * 0
|
367 |
+
|
368 |
+
return dict(loss_cls=loss_cls, loss_bbox=loss_bbox)
|
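As a quick orientation for the module above, the following is a minimal usage sketch of `RTMDetSepBNHeadModule` on dummy neck outputs. The concrete numbers (80 classes, 256-channel inputs, a 640x640 image) are assumptions chosen for illustration; with `share_conv=True` the stacked 3x3 conv weights are shared across the three scale levels while every level keeps its own BN layers.

import torch
from mmyolo.models.dense_heads.rtmdet_head import RTMDetSepBNHeadModule

# Assumed settings: COCO-style 80 classes, 256-channel neck outputs,
# strides 8/16/32 as in the defaults above.
head_module = RTMDetSepBNHeadModule(
    num_classes=80,
    in_channels=256,
    feat_channels=256,
    stacked_convs=2,
    featmap_strides=[8, 16, 32],
    share_conv=True)
head_module.init_weights()

# Dummy multi-level features for a hypothetical 640x640 input.
feats = tuple(torch.rand(1, 256, 640 // s, 640 // s) for s in (8, 16, 32))
cls_scores, bbox_preds = head_module(feats)
# cls_scores[i]: (1, 80, H_i, W_i); bbox_preds[i]: (1, 4, H_i, W_i)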
mmyolo/models/dense_heads/rtmdet_ins_head.py
ADDED
@@ -0,0 +1,725 @@
1 |
+
# Copyright (c) OpenMMLab. All rights reserved.
|
2 |
+
import copy
|
3 |
+
from typing import List, Optional, Tuple
|
4 |
+
|
5 |
+
import numpy as np
|
6 |
+
import torch
|
7 |
+
import torch.nn as nn
|
8 |
+
import torch.nn.functional as F
|
9 |
+
from mmcv.cnn import ConvModule, is_norm
|
10 |
+
from mmcv.ops import batched_nms
|
11 |
+
from mmdet.models.utils import filter_scores_and_topk
|
12 |
+
from mmdet.structures.bbox import get_box_tensor, get_box_wh, scale_boxes
|
13 |
+
from mmdet.utils import (ConfigType, InstanceList, OptConfigType,
|
14 |
+
OptInstanceList, OptMultiConfig)
|
15 |
+
from mmengine import ConfigDict
|
16 |
+
from mmengine.model import (BaseModule, bias_init_with_prob, constant_init,
|
17 |
+
normal_init)
|
18 |
+
from mmengine.structures import InstanceData
|
19 |
+
from torch import Tensor
|
20 |
+
|
21 |
+
from mmyolo.registry import MODELS
|
22 |
+
from .rtmdet_head import RTMDetHead, RTMDetSepBNHeadModule
|
23 |
+
|
24 |
+
|
25 |
+
class MaskFeatModule(BaseModule):
|
26 |
+
"""Mask feature head used in RTMDet-Ins. Copy from mmdet.
|
27 |
+
|
28 |
+
Args:
|
29 |
+
in_channels (int): Number of channels in the input feature map.
|
30 |
+
feat_channels (int): Number of hidden channels of the mask feature
|
31 |
+
map branch.
|
32 |
+
stacked_convs (int): Number of convs in mask feature branch.
|
33 |
+
num_levels (int): The starting feature map level from RPN that
|
34 |
+
will be used to predict the mask feature map.
|
35 |
+
num_prototypes (int): Number of output channel of the mask feature
|
36 |
+
map branch. This is the channel count of the mask
|
37 |
+
feature map that to be dynamically convolved with the predicted
|
38 |
+
kernel.
|
39 |
+
act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer.
|
40 |
+
Default: dict(type='ReLU', inplace=True)
|
41 |
+
norm_cfg (dict): Config dict for normalization layer. Default: None.
|
42 |
+
"""
|
43 |
+
|
44 |
+
def __init__(
|
45 |
+
self,
|
46 |
+
in_channels: int,
|
47 |
+
feat_channels: int = 256,
|
48 |
+
stacked_convs: int = 4,
|
49 |
+
num_levels: int = 3,
|
50 |
+
num_prototypes: int = 8,
|
51 |
+
act_cfg: ConfigType = dict(type='ReLU', inplace=True),
|
52 |
+
norm_cfg: ConfigType = dict(type='BN')
|
53 |
+
) -> None:
|
54 |
+
super().__init__(init_cfg=None)
|
55 |
+
self.num_levels = num_levels
|
56 |
+
self.fusion_conv = nn.Conv2d(num_levels * in_channels, in_channels, 1)
|
57 |
+
convs = []
|
58 |
+
for i in range(stacked_convs):
|
59 |
+
in_c = in_channels if i == 0 else feat_channels
|
60 |
+
convs.append(
|
61 |
+
ConvModule(
|
62 |
+
in_c,
|
63 |
+
feat_channels,
|
64 |
+
3,
|
65 |
+
padding=1,
|
66 |
+
act_cfg=act_cfg,
|
67 |
+
norm_cfg=norm_cfg))
|
68 |
+
self.stacked_convs = nn.Sequential(*convs)
|
69 |
+
self.projection = nn.Conv2d(
|
70 |
+
feat_channels, num_prototypes, kernel_size=1)
|
71 |
+
|
72 |
+
def forward(self, features: Tuple[Tensor, ...]) -> Tensor:
|
73 |
+
# multi-level feature fusion
|
74 |
+
fusion_feats = [features[0]]
|
75 |
+
size = features[0].shape[-2:]
|
76 |
+
for i in range(1, self.num_levels):
|
77 |
+
f = F.interpolate(features[i], size=size, mode='bilinear')
|
78 |
+
fusion_feats.append(f)
|
79 |
+
fusion_feats = torch.cat(fusion_feats, dim=1)
|
80 |
+
fusion_feats = self.fusion_conv(fusion_feats)
|
81 |
+
# pred mask feats
|
82 |
+
mask_features = self.stacked_convs(fusion_feats)
|
83 |
+
mask_features = self.projection(mask_features)
|
84 |
+
return mask_features
|
85 |
+
|
86 |
+
|
87 |
+
@MODELS.register_module()
|
88 |
+
class RTMDetInsSepBNHeadModule(RTMDetSepBNHeadModule):
|
89 |
+
"""Detection and Instance Segmentation Head of RTMDet.
|
90 |
+
|
91 |
+
Args:
|
92 |
+
num_classes (int): Number of categories excluding the background
|
93 |
+
category.
|
94 |
+
num_prototypes (int): Number of mask prototype features extracted
|
95 |
+
from the mask head. Defaults to 8.
|
96 |
+
dyconv_channels (int): Channel of the dynamic conv layers.
|
97 |
+
Defaults to 8.
|
98 |
+
num_dyconvs (int): Number of the dynamic convolution layers.
|
99 |
+
Defaults to 3.
|
100 |
+
use_sigmoid_cls (bool): Use sigmoid for class prediction.
|
101 |
+
Defaults to True.
|
102 |
+
"""
|
103 |
+
|
104 |
+
def __init__(self,
|
105 |
+
num_classes: int,
|
106 |
+
*args,
|
107 |
+
num_prototypes: int = 8,
|
108 |
+
dyconv_channels: int = 8,
|
109 |
+
num_dyconvs: int = 3,
|
110 |
+
use_sigmoid_cls: bool = True,
|
111 |
+
**kwargs):
|
112 |
+
self.num_prototypes = num_prototypes
|
113 |
+
self.num_dyconvs = num_dyconvs
|
114 |
+
self.dyconv_channels = dyconv_channels
|
115 |
+
self.use_sigmoid_cls = use_sigmoid_cls
|
116 |
+
if self.use_sigmoid_cls:
|
117 |
+
self.cls_out_channels = num_classes
|
118 |
+
else:
|
119 |
+
self.cls_out_channels = num_classes + 1
|
120 |
+
super().__init__(num_classes=num_classes, *args, **kwargs)
|
121 |
+
|
122 |
+
def _init_layers(self):
|
123 |
+
"""Initialize layers of the head."""
|
124 |
+
self.cls_convs = nn.ModuleList()
|
125 |
+
self.reg_convs = nn.ModuleList()
|
126 |
+
self.kernel_convs = nn.ModuleList()
|
127 |
+
|
128 |
+
self.rtm_cls = nn.ModuleList()
|
129 |
+
self.rtm_reg = nn.ModuleList()
|
130 |
+
self.rtm_kernel = nn.ModuleList()
|
131 |
+
self.rtm_obj = nn.ModuleList()
|
132 |
+
|
133 |
+
# calculate num dynamic parameters
|
134 |
+
weight_nums, bias_nums = [], []
|
135 |
+
for i in range(self.num_dyconvs):
|
136 |
+
if i == 0:
|
137 |
+
weight_nums.append(
|
138 |
+
(self.num_prototypes + 2) * self.dyconv_channels)
|
139 |
+
bias_nums.append(self.dyconv_channels)
|
140 |
+
elif i == self.num_dyconvs - 1:
|
141 |
+
weight_nums.append(self.dyconv_channels)
|
142 |
+
bias_nums.append(1)
|
143 |
+
else:
|
144 |
+
weight_nums.append(self.dyconv_channels * self.dyconv_channels)
|
145 |
+
bias_nums.append(self.dyconv_channels)
|
146 |
+
self.weight_nums = weight_nums
|
147 |
+
self.bias_nums = bias_nums
|
148 |
+
self.num_gen_params = sum(weight_nums) + sum(bias_nums)
|
149 |
+
pred_pad_size = self.pred_kernel_size // 2
|
150 |
+
|
151 |
+
for n in range(len(self.featmap_strides)):
|
152 |
+
cls_convs = nn.ModuleList()
|
153 |
+
reg_convs = nn.ModuleList()
|
154 |
+
kernel_convs = nn.ModuleList()
|
155 |
+
for i in range(self.stacked_convs):
|
156 |
+
chn = self.in_channels if i == 0 else self.feat_channels
|
157 |
+
cls_convs.append(
|
158 |
+
ConvModule(
|
159 |
+
chn,
|
160 |
+
self.feat_channels,
|
161 |
+
3,
|
162 |
+
stride=1,
|
163 |
+
padding=1,
|
164 |
+
conv_cfg=self.conv_cfg,
|
165 |
+
norm_cfg=self.norm_cfg,
|
166 |
+
act_cfg=self.act_cfg))
|
167 |
+
reg_convs.append(
|
168 |
+
ConvModule(
|
169 |
+
chn,
|
170 |
+
self.feat_channels,
|
171 |
+
3,
|
172 |
+
stride=1,
|
173 |
+
padding=1,
|
174 |
+
conv_cfg=self.conv_cfg,
|
175 |
+
norm_cfg=self.norm_cfg,
|
176 |
+
act_cfg=self.act_cfg))
|
177 |
+
kernel_convs.append(
|
178 |
+
ConvModule(
|
179 |
+
chn,
|
180 |
+
self.feat_channels,
|
181 |
+
3,
|
182 |
+
stride=1,
|
183 |
+
padding=1,
|
184 |
+
conv_cfg=self.conv_cfg,
|
185 |
+
norm_cfg=self.norm_cfg,
|
186 |
+
act_cfg=self.act_cfg))
|
187 |
+
self.cls_convs.append(cls_convs)
|
188 |
+
self.reg_convs.append(cls_convs)
|
189 |
+
self.kernel_convs.append(kernel_convs)
|
190 |
+
|
191 |
+
self.rtm_cls.append(
|
192 |
+
nn.Conv2d(
|
193 |
+
self.feat_channels,
|
194 |
+
self.num_base_priors * self.cls_out_channels,
|
195 |
+
self.pred_kernel_size,
|
196 |
+
padding=pred_pad_size))
|
197 |
+
self.rtm_reg.append(
|
198 |
+
nn.Conv2d(
|
199 |
+
self.feat_channels,
|
200 |
+
self.num_base_priors * 4,
|
201 |
+
self.pred_kernel_size,
|
202 |
+
padding=pred_pad_size))
|
203 |
+
self.rtm_kernel.append(
|
204 |
+
nn.Conv2d(
|
205 |
+
self.feat_channels,
|
206 |
+
self.num_gen_params,
|
207 |
+
self.pred_kernel_size,
|
208 |
+
padding=pred_pad_size))
|
209 |
+
|
210 |
+
if self.share_conv:
|
211 |
+
for n in range(len(self.featmap_strides)):
|
212 |
+
for i in range(self.stacked_convs):
|
213 |
+
self.cls_convs[n][i].conv = self.cls_convs[0][i].conv
|
214 |
+
self.reg_convs[n][i].conv = self.reg_convs[0][i].conv
|
215 |
+
|
216 |
+
self.mask_head = MaskFeatModule(
|
217 |
+
in_channels=self.in_channels,
|
218 |
+
feat_channels=self.feat_channels,
|
219 |
+
stacked_convs=4,
|
220 |
+
num_levels=len(self.featmap_strides),
|
221 |
+
num_prototypes=self.num_prototypes,
|
222 |
+
act_cfg=self.act_cfg,
|
223 |
+
norm_cfg=self.norm_cfg)
|
224 |
+
|
225 |
+
def init_weights(self) -> None:
|
226 |
+
"""Initialize weights of the head."""
|
227 |
+
for m in self.modules():
|
228 |
+
if isinstance(m, nn.Conv2d):
|
229 |
+
normal_init(m, mean=0, std=0.01)
|
230 |
+
if is_norm(m):
|
231 |
+
constant_init(m, 1)
|
232 |
+
bias_cls = bias_init_with_prob(0.01)
|
233 |
+
for rtm_cls, rtm_reg, rtm_kernel in zip(self.rtm_cls, self.rtm_reg,
|
234 |
+
self.rtm_kernel):
|
235 |
+
normal_init(rtm_cls, std=0.01, bias=bias_cls)
|
236 |
+
normal_init(rtm_reg, std=0.01, bias=1)
|
237 |
+
|
238 |
+
def forward(self, feats: Tuple[Tensor, ...]) -> tuple:
|
239 |
+
"""Forward features from the upstream network.
|
240 |
+
|
241 |
+
Args:
|
242 |
+
feats (tuple[Tensor]): Features from the upstream network, each is
|
243 |
+
a 4D-tensor.
|
244 |
+
|
245 |
+
Returns:
|
246 |
+
tuple: Usually a tuple of classification scores and bbox prediction
|
247 |
+
- cls_scores (list[Tensor]): Classification scores for all scale
|
248 |
+
levels, each is a 4D-tensor, the channels number is
|
249 |
+
num_base_priors * num_classes.
|
250 |
+
- bbox_preds (list[Tensor]): Box energies / deltas for all scale
|
251 |
+
levels, each is a 4D-tensor, the channels number is
|
252 |
+
num_base_priors * 4.
|
253 |
+
- kernel_preds (list[Tensor]): Dynamic conv kernels for all scale
|
254 |
+
levels, each is a 4D-tensor, the channels number is
|
255 |
+
num_gen_params.
|
256 |
+
- mask_feat (Tensor): Mask prototype features.
|
257 |
+
Has shape (batch_size, num_prototypes, H, W).
|
258 |
+
"""
|
259 |
+
mask_feat = self.mask_head(feats)
|
260 |
+
|
261 |
+
cls_scores = []
|
262 |
+
bbox_preds = []
|
263 |
+
kernel_preds = []
|
264 |
+
for idx, (x, stride) in enumerate(zip(feats, self.featmap_strides)):
|
265 |
+
cls_feat = x
|
266 |
+
reg_feat = x
|
267 |
+
kernel_feat = x
|
268 |
+
|
269 |
+
for cls_layer in self.cls_convs[idx]:
|
270 |
+
cls_feat = cls_layer(cls_feat)
|
271 |
+
cls_score = self.rtm_cls[idx](cls_feat)
|
272 |
+
|
273 |
+
for kernel_layer in self.kernel_convs[idx]:
|
274 |
+
kernel_feat = kernel_layer(kernel_feat)
|
275 |
+
kernel_pred = self.rtm_kernel[idx](kernel_feat)
|
276 |
+
|
277 |
+
for reg_layer in self.reg_convs[idx]:
|
278 |
+
reg_feat = reg_layer(reg_feat)
|
279 |
+
reg_dist = self.rtm_reg[idx](reg_feat)
|
280 |
+
|
281 |
+
cls_scores.append(cls_score)
|
282 |
+
bbox_preds.append(reg_dist)
|
283 |
+
kernel_preds.append(kernel_pred)
|
284 |
+
return tuple(cls_scores), tuple(bbox_preds), tuple(
|
285 |
+
kernel_preds), mask_feat
|
286 |
+
|
287 |
+
|
288 |
+
@MODELS.register_module()
|
289 |
+
class RTMDetInsSepBNHead(RTMDetHead):
|
290 |
+
"""RTMDet Instance Segmentation head.
|
291 |
+
|
292 |
+
Args:
|
293 |
+
head_module(ConfigType): Base module used for RTMDetInsSepBNHead
|
294 |
+
prior_generator: Points generator feature maps in
|
295 |
+
2D points-based detectors.
|
296 |
+
bbox_coder (:obj:`ConfigDict` or dict): Config of bbox coder.
|
297 |
+
loss_cls (:obj:`ConfigDict` or dict): Config of classification loss.
|
298 |
+
loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss.
|
299 |
+
loss_mask (:obj:`ConfigDict` or dict): Config of mask loss.
|
300 |
+
train_cfg (:obj:`ConfigDict` or dict, optional): Training config of
|
301 |
+
anchor head. Defaults to None.
|
302 |
+
test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of
|
303 |
+
anchor head. Defaults to None.
|
304 |
+
init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or
|
305 |
+
list[dict], optional): Initialization config dict.
|
306 |
+
Defaults to None.
|
307 |
+
"""
|
308 |
+
|
309 |
+
def __init__(self,
|
310 |
+
head_module: ConfigType,
|
311 |
+
prior_generator: ConfigType = dict(
|
312 |
+
type='mmdet.MlvlPointGenerator',
|
313 |
+
offset=0,
|
314 |
+
strides=[8, 16, 32]),
|
315 |
+
bbox_coder: ConfigType = dict(type='DistancePointBBoxCoder'),
|
316 |
+
loss_cls: ConfigType = dict(
|
317 |
+
type='mmdet.QualityFocalLoss',
|
318 |
+
use_sigmoid=True,
|
319 |
+
beta=2.0,
|
320 |
+
loss_weight=1.0),
|
321 |
+
loss_bbox: ConfigType = dict(
|
322 |
+
type='mmdet.GIoULoss', loss_weight=2.0),
|
323 |
+
loss_mask=dict(
|
324 |
+
type='mmdet.DiceLoss',
|
325 |
+
loss_weight=2.0,
|
326 |
+
eps=5e-6,
|
327 |
+
reduction='mean'),
|
328 |
+
train_cfg: OptConfigType = None,
|
329 |
+
test_cfg: OptConfigType = None,
|
330 |
+
init_cfg: OptMultiConfig = None):
|
331 |
+
|
332 |
+
super().__init__(
|
333 |
+
head_module=head_module,
|
334 |
+
prior_generator=prior_generator,
|
335 |
+
bbox_coder=bbox_coder,
|
336 |
+
loss_cls=loss_cls,
|
337 |
+
loss_bbox=loss_bbox,
|
338 |
+
train_cfg=train_cfg,
|
339 |
+
test_cfg=test_cfg,
|
340 |
+
init_cfg=init_cfg)
|
341 |
+
|
342 |
+
self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False)
|
343 |
+
if isinstance(self.head_module, RTMDetInsSepBNHeadModule):
|
344 |
+
assert self.use_sigmoid_cls == self.head_module.use_sigmoid_cls
|
345 |
+
self.loss_mask = MODELS.build(loss_mask)
|
346 |
+
|
347 |
+
def predict_by_feat(self,
|
348 |
+
cls_scores: List[Tensor],
|
349 |
+
bbox_preds: List[Tensor],
|
350 |
+
kernel_preds: List[Tensor],
|
351 |
+
mask_feats: Tensor,
|
352 |
+
score_factors: Optional[List[Tensor]] = None,
|
353 |
+
batch_img_metas: Optional[List[dict]] = None,
|
354 |
+
cfg: Optional[ConfigDict] = None,
|
355 |
+
rescale: bool = True,
|
356 |
+
with_nms: bool = True) -> List[InstanceData]:
|
357 |
+
"""Transform a batch of output features extracted from the head into
|
358 |
+
bbox results.
|
359 |
+
|
360 |
+
Note: When score_factors is not None, the cls_scores are
|
361 |
+
usually multiplied by it then obtain the real score used in NMS.
|
362 |
+
|
363 |
+
Args:
|
364 |
+
cls_scores (list[Tensor]): Classification scores for all
|
365 |
+
scale levels, each is a 4D-tensor, has shape
|
366 |
+
(batch_size, num_priors * num_classes, H, W).
|
367 |
+
bbox_preds (list[Tensor]): Box energies / deltas for all
|
368 |
+
scale levels, each is a 4D-tensor, has shape
|
369 |
+
(batch_size, num_priors * 4, H, W).
|
370 |
+
kernel_preds (list[Tensor]): Kernel predictions of dynamic
|
371 |
+
convs for all scale levels, each is a 4D-tensor, has shape
|
372 |
+
(batch_size, num_params, H, W).
|
373 |
+
mask_feats (Tensor): Mask prototype features extracted from the
|
374 |
+
mask head, has shape (batch_size, num_prototypes, H, W).
|
375 |
+
score_factors (list[Tensor], optional): Score factor for
|
376 |
+
all scale level, each is a 4D-tensor, has shape
|
377 |
+
(batch_size, num_priors * 1, H, W). Defaults to None.
|
378 |
+
batch_img_metas (list[dict], Optional): Batch image meta info.
|
379 |
+
Defaults to None.
|
380 |
+
cfg (ConfigDict, optional): Test / postprocessing
|
381 |
+
configuration, if None, test_cfg would be used.
|
382 |
+
Defaults to None.
|
383 |
+
rescale (bool): If True, return boxes in original image space.
|
384 |
+
Defaults to False.
|
385 |
+
with_nms (bool): If True, do nms before return boxes.
|
386 |
+
Defaults to True.
|
387 |
+
|
388 |
+
Returns:
|
389 |
+
list[:obj:`InstanceData`]: Object detection and instance
|
390 |
+
segmentation results of each image after the post process.
|
391 |
+
Each item usually contains following keys.
|
392 |
+
|
393 |
+
- scores (Tensor): Classification scores, has a shape
|
394 |
+
(num_instance, )
|
395 |
+
- labels (Tensor): Labels of bboxes, has a shape
|
396 |
+
(num_instances, ).
|
397 |
+
- bboxes (Tensor): Has a shape (num_instances, 4),
|
398 |
+
the last dimension 4 arrange as (x1, y1, x2, y2).
|
399 |
+
- masks (Tensor): Has a shape (num_instances, h, w).
|
400 |
+
"""
|
401 |
+
cfg = self.test_cfg if cfg is None else cfg
|
402 |
+
cfg = copy.deepcopy(cfg)
|
403 |
+
|
404 |
+
multi_label = cfg.multi_label
|
405 |
+
multi_label &= self.num_classes > 1
|
406 |
+
cfg.multi_label = multi_label
|
407 |
+
|
408 |
+
num_imgs = len(batch_img_metas)
|
409 |
+
featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores]
|
410 |
+
|
411 |
+
# If the shape does not change, use the previous mlvl_priors
|
412 |
+
if featmap_sizes != self.featmap_sizes:
|
413 |
+
self.mlvl_priors = self.prior_generator.grid_priors(
|
414 |
+
featmap_sizes,
|
415 |
+
dtype=cls_scores[0].dtype,
|
416 |
+
device=cls_scores[0].device,
|
417 |
+
with_stride=True)
|
418 |
+
self.featmap_sizes = featmap_sizes
|
419 |
+
flatten_priors = torch.cat(self.mlvl_priors)
|
420 |
+
|
421 |
+
mlvl_strides = [
|
422 |
+
flatten_priors.new_full(
|
423 |
+
(featmap_size.numel() * self.num_base_priors, ), stride) for
|
424 |
+
featmap_size, stride in zip(featmap_sizes, self.featmap_strides)
|
425 |
+
]
|
426 |
+
flatten_stride = torch.cat(mlvl_strides)
|
427 |
+
|
428 |
+
# flatten cls_scores, bbox_preds
|
429 |
+
flatten_cls_scores = [
|
430 |
+
cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1,
|
431 |
+
self.num_classes)
|
432 |
+
for cls_score in cls_scores
|
433 |
+
]
|
434 |
+
flatten_bbox_preds = [
|
435 |
+
bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4)
|
436 |
+
for bbox_pred in bbox_preds
|
437 |
+
]
|
438 |
+
flatten_kernel_preds = [
|
439 |
+
kernel_pred.permute(0, 2, 3,
|
440 |
+
1).reshape(num_imgs, -1,
|
441 |
+
self.head_module.num_gen_params)
|
442 |
+
for kernel_pred in kernel_preds
|
443 |
+
]
|
444 |
+
|
445 |
+
flatten_cls_scores = torch.cat(flatten_cls_scores, dim=1).sigmoid()
|
446 |
+
flatten_bbox_preds = torch.cat(flatten_bbox_preds, dim=1)
|
447 |
+
flatten_decoded_bboxes = self.bbox_coder.decode(
|
448 |
+
flatten_priors[..., :2].unsqueeze(0), flatten_bbox_preds,
|
449 |
+
flatten_stride)
|
450 |
+
|
451 |
+
flatten_kernel_preds = torch.cat(flatten_kernel_preds, dim=1)
|
452 |
+
|
453 |
+
results_list = []
|
454 |
+
for (bboxes, scores, kernel_pred, mask_feat,
|
455 |
+
img_meta) in zip(flatten_decoded_bboxes, flatten_cls_scores,
|
456 |
+
flatten_kernel_preds, mask_feats,
|
457 |
+
batch_img_metas):
|
458 |
+
ori_shape = img_meta['ori_shape']
|
459 |
+
scale_factor = img_meta['scale_factor']
|
460 |
+
if 'pad_param' in img_meta:
|
461 |
+
pad_param = img_meta['pad_param']
|
462 |
+
else:
|
463 |
+
pad_param = None
|
464 |
+
|
465 |
+
score_thr = cfg.get('score_thr', -1)
|
466 |
+
if scores.shape[0] == 0:
|
467 |
+
empty_results = InstanceData()
|
468 |
+
empty_results.bboxes = bboxes
|
469 |
+
empty_results.scores = scores[:, 0]
|
470 |
+
empty_results.labels = scores[:, 0].int()
|
471 |
+
h, w = ori_shape[:2] if rescale else img_meta['img_shape'][:2]
|
472 |
+
empty_results.masks = torch.zeros(
|
473 |
+
size=(0, h, w), dtype=torch.bool, device=bboxes.device)
|
474 |
+
results_list.append(empty_results)
|
475 |
+
continue
|
476 |
+
|
477 |
+
nms_pre = cfg.get('nms_pre', 100000)
|
478 |
+
if cfg.multi_label is False:
|
479 |
+
scores, labels = scores.max(1, keepdim=True)
|
480 |
+
scores, _, keep_idxs, results = filter_scores_and_topk(
|
481 |
+
scores,
|
482 |
+
score_thr,
|
483 |
+
nms_pre,
|
484 |
+
results=dict(
|
485 |
+
labels=labels[:, 0],
|
486 |
+
kernel_pred=kernel_pred,
|
487 |
+
priors=flatten_priors))
|
488 |
+
labels = results['labels']
|
489 |
+
kernel_pred = results['kernel_pred']
|
490 |
+
priors = results['priors']
|
491 |
+
else:
|
492 |
+
out = filter_scores_and_topk(
|
493 |
+
scores,
|
494 |
+
score_thr,
|
495 |
+
nms_pre,
|
496 |
+
results=dict(
|
497 |
+
kernel_pred=kernel_pred, priors=flatten_priors))
|
498 |
+
scores, labels, keep_idxs, filtered_results = out
|
499 |
+
kernel_pred = filtered_results['kernel_pred']
|
500 |
+
priors = filtered_results['priors']
|
501 |
+
|
502 |
+
results = InstanceData(
|
503 |
+
scores=scores,
|
504 |
+
labels=labels,
|
505 |
+
bboxes=bboxes[keep_idxs],
|
506 |
+
kernels=kernel_pred,
|
507 |
+
priors=priors)
|
508 |
+
|
509 |
+
if rescale:
|
510 |
+
if pad_param is not None:
|
511 |
+
results.bboxes -= results.bboxes.new_tensor([
|
512 |
+
pad_param[2], pad_param[0], pad_param[2], pad_param[0]
|
513 |
+
])
|
514 |
+
results.bboxes /= results.bboxes.new_tensor(
|
515 |
+
scale_factor).repeat((1, 2))
|
516 |
+
|
517 |
+
if cfg.get('yolox_style', False):
|
518 |
+
# do not need max_per_img
|
519 |
+
cfg.max_per_img = len(results)
|
520 |
+
|
521 |
+
results = self._bbox_mask_post_process(
|
522 |
+
results=results,
|
523 |
+
mask_feat=mask_feat,
|
524 |
+
cfg=cfg,
|
525 |
+
rescale_bbox=False,
|
526 |
+
rescale_mask=rescale,
|
527 |
+
with_nms=with_nms,
|
528 |
+
pad_param=pad_param,
|
529 |
+
img_meta=img_meta)
|
530 |
+
results.bboxes[:, 0::2].clamp_(0, ori_shape[1])
|
531 |
+
results.bboxes[:, 1::2].clamp_(0, ori_shape[0])
|
532 |
+
|
533 |
+
results_list.append(results)
|
534 |
+
return results_list
|
535 |
+
|
536 |
+
def _bbox_mask_post_process(
|
537 |
+
self,
|
538 |
+
results: InstanceData,
|
539 |
+
mask_feat: Tensor,
|
540 |
+
cfg: ConfigDict,
|
541 |
+
rescale_bbox: bool = False,
|
542 |
+
rescale_mask: bool = True,
|
543 |
+
with_nms: bool = True,
|
544 |
+
pad_param: Optional[np.ndarray] = None,
|
545 |
+
img_meta: Optional[dict] = None) -> InstanceData:
|
546 |
+
"""bbox and mask post-processing method.
|
547 |
+
|
548 |
+
The boxes would be rescaled to the original image scale and do
|
549 |
+
the nms operation. Usually `with_nms` is False is used for aug test.
|
550 |
+
|
551 |
+
Args:
|
552 |
+
results (:obj:`InstaceData`): Detection instance results,
|
553 |
+
each item has shape (num_bboxes, ).
|
554 |
+
mask_feat (Tensor): Mask prototype features extracted from the
|
555 |
+
mask head, has shape (batch_size, num_prototypes, H, W).
|
556 |
+
cfg (ConfigDict): Test / postprocessing configuration,
|
557 |
+
if None, test_cfg would be used.
|
558 |
+
rescale_bbox (bool): If True, return boxes in original image space.
|
559 |
+
Default to False.
|
560 |
+
rescale_mask (bool): If True, return masks in original image space.
|
561 |
+
Default to True.
|
562 |
+
with_nms (bool): If True, do nms before return boxes.
|
563 |
+
Default to True.
|
564 |
+
img_meta (dict, optional): Image meta info. Defaults to None.
|
565 |
+
|
566 |
+
Returns:
|
567 |
+
:obj:`InstanceData`: Detection results of each image
|
568 |
+
after the post process.
|
569 |
+
Each item usually contains following keys.
|
570 |
+
|
571 |
+
- scores (Tensor): Classification scores, has a shape
|
572 |
+
(num_instance, )
|
573 |
+
- labels (Tensor): Labels of bboxes, has a shape
|
574 |
+
(num_instances, ).
|
575 |
+
- bboxes (Tensor): Has a shape (num_instances, 4),
|
576 |
+
the last dimension 4 arrange as (x1, y1, x2, y2).
|
577 |
+
- masks (Tensor): Has a shape (num_instances, h, w).
|
578 |
+
"""
|
579 |
+
if rescale_bbox:
|
580 |
+
assert img_meta.get('scale_factor') is not None
|
581 |
+
scale_factor = [1 / s for s in img_meta['scale_factor']]
|
582 |
+
results.bboxes = scale_boxes(results.bboxes, scale_factor)
|
583 |
+
|
584 |
+
if hasattr(results, 'score_factors'):
|
585 |
+
# TODO: Add sqrt operation in order to be consistent with
|
586 |
+
# the paper.
|
587 |
+
score_factors = results.pop('score_factors')
|
588 |
+
results.scores = results.scores * score_factors
|
589 |
+
|
590 |
+
# filter small size bboxes
|
591 |
+
if cfg.get('min_bbox_size', -1) >= 0:
|
592 |
+
w, h = get_box_wh(results.bboxes)
|
593 |
+
valid_mask = (w > cfg.min_bbox_size) & (h > cfg.min_bbox_size)
|
594 |
+
if not valid_mask.all():
|
595 |
+
results = results[valid_mask]
|
596 |
+
|
597 |
+
# TODO: deal with `with_nms` and `nms_cfg=None` in test_cfg
|
598 |
+
assert with_nms, 'with_nms must be True for RTMDet-Ins'
|
599 |
+
if results.bboxes.numel() > 0:
|
600 |
+
bboxes = get_box_tensor(results.bboxes)
|
601 |
+
det_bboxes, keep_idxs = batched_nms(bboxes, results.scores,
|
602 |
+
results.labels, cfg.nms)
|
603 |
+
results = results[keep_idxs]
|
604 |
+
# some nms would reweight the score, such as softnms
|
605 |
+
results.scores = det_bboxes[:, -1]
|
606 |
+
results = results[:cfg.max_per_img]
|
607 |
+
|
608 |
+
# process masks
|
609 |
+
mask_logits = self._mask_predict_by_feat(mask_feat,
|
610 |
+
results.kernels,
|
611 |
+
results.priors)
|
612 |
+
|
613 |
+
stride = self.prior_generator.strides[0][0]
|
614 |
+
mask_logits = F.interpolate(
|
615 |
+
mask_logits.unsqueeze(0), scale_factor=stride, mode='bilinear')
|
616 |
+
if rescale_mask:
|
617 |
+
# TODO: When use mmdet.Resize or mmdet.Pad, will meet bug
|
618 |
+
# Use img_meta to crop and resize
|
619 |
+
ori_h, ori_w = img_meta['ori_shape'][:2]
|
620 |
+
if isinstance(pad_param, np.ndarray):
|
621 |
+
pad_param = pad_param.astype(np.int32)
|
622 |
+
crop_y1, crop_y2 = pad_param[
|
623 |
+
0], mask_logits.shape[-2] - pad_param[1]
|
624 |
+
crop_x1, crop_x2 = pad_param[
|
625 |
+
2], mask_logits.shape[-1] - pad_param[3]
|
626 |
+
mask_logits = mask_logits[..., crop_y1:crop_y2,
|
627 |
+
crop_x1:crop_x2]
|
628 |
+
mask_logits = F.interpolate(
|
629 |
+
mask_logits,
|
630 |
+
size=[ori_h, ori_w],
|
631 |
+
mode='bilinear',
|
632 |
+
align_corners=False)
|
633 |
+
|
634 |
+
masks = mask_logits.sigmoid().squeeze(0)
|
635 |
+
masks = masks > cfg.mask_thr_binary
|
636 |
+
results.masks = masks
|
637 |
+
else:
|
638 |
+
h, w = img_meta['ori_shape'][:2] if rescale_mask else img_meta[
|
639 |
+
'img_shape'][:2]
|
640 |
+
results.masks = torch.zeros(
|
641 |
+
size=(results.bboxes.shape[0], h, w),
|
642 |
+
dtype=torch.bool,
|
643 |
+
device=results.bboxes.device)
|
644 |
+
return results
|
645 |
+
|
646 |
+
def _mask_predict_by_feat(self, mask_feat: Tensor, kernels: Tensor,
|
647 |
+
priors: Tensor) -> Tensor:
|
648 |
+
"""Generate mask logits from mask features with dynamic convs.
|
649 |
+
|
650 |
+
Args:
|
651 |
+
mask_feat (Tensor): Mask prototype features.
|
652 |
+
Has shape (num_prototypes, H, W).
|
653 |
+
kernels (Tensor): Kernel parameters for each instance.
|
654 |
+
Has shape (num_instance, num_params)
|
655 |
+
priors (Tensor): Center priors for each instance.
|
656 |
+
Has shape (num_instance, 4).
|
657 |
+
Returns:
|
658 |
+
Tensor: Instance segmentation masks for each instance.
|
659 |
+
Has shape (num_instance, H, W).
|
660 |
+
"""
|
661 |
+
num_inst = kernels.shape[0]
|
662 |
+
h, w = mask_feat.size()[-2:]
|
663 |
+
if num_inst < 1:
|
664 |
+
return torch.empty(
|
665 |
+
size=(num_inst, h, w),
|
666 |
+
dtype=mask_feat.dtype,
|
667 |
+
device=mask_feat.device)
|
668 |
+
if len(mask_feat.shape) < 4:
|
669 |
+
mask_feat.unsqueeze(0)
|
670 |
+
|
671 |
+
coord = self.prior_generator.single_level_grid_priors(
|
672 |
+
(h, w), level_idx=0, device=mask_feat.device).reshape(1, -1, 2)
|
673 |
+
num_inst = priors.shape[0]
|
674 |
+
points = priors[:, :2].reshape(-1, 1, 2)
|
675 |
+
strides = priors[:, 2:].reshape(-1, 1, 2)
|
676 |
+
relative_coord = (points - coord).permute(0, 2, 1) / (
|
677 |
+
strides[..., 0].reshape(-1, 1, 1) * 8)
|
678 |
+
relative_coord = relative_coord.reshape(num_inst, 2, h, w)
|
679 |
+
|
680 |
+
mask_feat = torch.cat(
|
681 |
+
[relative_coord,
|
682 |
+
mask_feat.repeat(num_inst, 1, 1, 1)], dim=1)
|
683 |
+
weights, biases = self.parse_dynamic_params(kernels)
|
684 |
+
|
685 |
+
n_layers = len(weights)
|
686 |
+
x = mask_feat.reshape(1, -1, h, w)
|
687 |
+
for i, (weight, bias) in enumerate(zip(weights, biases)):
|
688 |
+
x = F.conv2d(
|
689 |
+
x, weight, bias=bias, stride=1, padding=0, groups=num_inst)
|
690 |
+
if i < n_layers - 1:
|
691 |
+
x = F.relu(x)
|
692 |
+
x = x.reshape(num_inst, h, w)
|
693 |
+
return x
|
694 |
+
|
695 |
+
def parse_dynamic_params(self, flatten_kernels: Tensor) -> tuple:
|
696 |
+
"""split kernel head prediction to conv weight and bias."""
|
697 |
+
n_inst = flatten_kernels.size(0)
|
698 |
+
n_layers = len(self.head_module.weight_nums)
|
699 |
+
params_splits = list(
|
700 |
+
torch.split_with_sizes(
|
701 |
+
flatten_kernels,
|
702 |
+
self.head_module.weight_nums + self.head_module.bias_nums,
|
703 |
+
dim=1))
|
704 |
+
weight_splits = params_splits[:n_layers]
|
705 |
+
bias_splits = params_splits[n_layers:]
|
706 |
+
for i in range(n_layers):
|
707 |
+
if i < n_layers - 1:
|
708 |
+
weight_splits[i] = weight_splits[i].reshape(
|
709 |
+
n_inst * self.head_module.dyconv_channels, -1, 1, 1)
|
710 |
+
bias_splits[i] = bias_splits[i].reshape(
|
711 |
+
n_inst * self.head_module.dyconv_channels)
|
712 |
+
else:
|
713 |
+
weight_splits[i] = weight_splits[i].reshape(n_inst, -1, 1, 1)
|
714 |
+
bias_splits[i] = bias_splits[i].reshape(n_inst)
|
715 |
+
|
716 |
+
return weight_splits, bias_splits
|
717 |
+
|
718 |
+
def loss_by_feat(
|
719 |
+
self,
|
720 |
+
cls_scores: List[Tensor],
|
721 |
+
bbox_preds: List[Tensor],
|
722 |
+
batch_gt_instances: InstanceList,
|
723 |
+
batch_img_metas: List[dict],
|
724 |
+
batch_gt_instances_ignore: OptInstanceList = None) -> dict:
|
725 |
+
raise NotImplementedError
|
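`parse_dynamic_params` and `_mask_predict_by_feat` above implement per-instance dynamic convolution: the flattened kernel prediction of each instance is split into 1x1 conv weights and biases, and all instances are evaluated in a single call by folding them into the channel dimension of a grouped convolution. Below is a minimal standalone sketch of that grouped-conv trick with hypothetical sizes (2 instances, 8 prototype channels plus 2 relative-coordinate channels, 8 dynamic channels, and only two dynamic layers instead of the default three).

import torch
import torch.nn.functional as F

num_inst, in_ch, dy_ch, h, w = 2, 10, 8, 40, 40  # hypothetical sizes
feat = torch.rand(num_inst, in_ch, h, w)         # per-instance mask features

# Per-instance 1x1 kernels, shaped the way parse_dynamic_params reshapes them.
w1 = torch.rand(num_inst * dy_ch, in_ch, 1, 1)   # first dynamic layer
b1 = torch.rand(num_inst * dy_ch)
w2 = torch.rand(num_inst, dy_ch, 1, 1)           # last layer -> 1 mask channel
b2 = torch.rand(num_inst)

# Fold instances into the channel dim; groups=num_inst guarantees each
# instance is convolved only with its own kernels.
x = feat.reshape(1, num_inst * in_ch, h, w)
x = F.relu(F.conv2d(x, w1, bias=b1, stride=1, padding=0, groups=num_inst))
mask_logits = F.conv2d(
    x, w2, bias=b2, stride=1, padding=0,
    groups=num_inst).reshape(num_inst, h, w)

Using `groups=num_inst` is equivalent to looping over instances and convolving each one with its own kernels, but it runs as a single fused operation.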
mmyolo/models/dense_heads/rtmdet_rotated_head.py
ADDED
@@ -0,0 +1,641 @@
1 |
+
# Copyright (c) OpenMMLab. All rights reserved.
|
2 |
+
import copy
|
3 |
+
import warnings
|
4 |
+
from typing import List, Optional, Sequence, Tuple
|
5 |
+
|
6 |
+
import torch
|
7 |
+
import torch.nn as nn
|
8 |
+
from mmdet.models.utils import filter_scores_and_topk
|
9 |
+
from mmdet.structures.bbox import HorizontalBoxes, distance2bbox
|
10 |
+
from mmdet.structures.bbox.transforms import bbox_cxcywh_to_xyxy, scale_boxes
|
11 |
+
from mmdet.utils import (ConfigType, InstanceList, OptConfigType,
|
12 |
+
OptInstanceList, OptMultiConfig, reduce_mean)
|
13 |
+
from mmengine.config import ConfigDict
|
14 |
+
from mmengine.model import normal_init
|
15 |
+
from mmengine.structures import InstanceData
|
16 |
+
from torch import Tensor
|
17 |
+
|
18 |
+
from mmyolo.registry import MODELS, TASK_UTILS
|
19 |
+
from ..utils import gt_instances_preprocess
|
20 |
+
from .rtmdet_head import RTMDetHead, RTMDetSepBNHeadModule
|
21 |
+
|
22 |
+
try:
|
23 |
+
from mmrotate.structures.bbox import RotatedBoxes, distance2obb
|
24 |
+
MMROTATE_AVAILABLE = True
|
25 |
+
except ImportError:
|
26 |
+
RotatedBoxes = None
|
27 |
+
distance2obb = None
|
28 |
+
MMROTATE_AVAILABLE = False
|
29 |
+
|
30 |
+
|
31 |
+
@MODELS.register_module()
|
32 |
+
class RTMDetRotatedSepBNHeadModule(RTMDetSepBNHeadModule):
|
33 |
+
"""Detection Head Module of RTMDet-R.
|
34 |
+
|
35 |
+
Compared with RTMDet Detection Head Module, RTMDet-R adds
|
36 |
+
a conv for angle prediction.
|
37 |
+
An `angle_out_dim` arg is added, which is generated by the
|
38 |
+
angle_coder module and controls the angle pred dim.
|
39 |
+
|
40 |
+
Args:
|
41 |
+
num_classes (int): Number of categories excluding the background
|
42 |
+
category.
|
43 |
+
in_channels (int): Number of channels in the input feature map.
|
44 |
+
widen_factor (float): Width multiplier, multiply number of
|
45 |
+
channels in each layer by this amount. Defaults to 1.0.
|
46 |
+
num_base_priors (int): The number of priors (points) at a point
|
47 |
+
on the feature grid. Defaults to 1.
|
48 |
+
feat_channels (int): Number of hidden channels. Used in child classes.
|
49 |
+
Defaults to 256
|
50 |
+
stacked_convs (int): Number of stacking convs of the head.
|
51 |
+
Defaults to 2.
|
52 |
+
featmap_strides (Sequence[int]): Downsample factor of each feature map.
|
53 |
+
Defaults to (8, 16, 32).
|
54 |
+
share_conv (bool): Whether to share conv layers between stages.
|
55 |
+
Defaults to True.
|
56 |
+
pred_kernel_size (int): Kernel size of ``nn.Conv2d``. Defaults to 1.
|
57 |
+
angle_out_dim (int): Encoded length of angle, will passed by head.
|
58 |
+
Defaults to 1.
|
59 |
+
conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for
|
60 |
+
convolution layer. Defaults to None.
|
61 |
+
norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization
|
62 |
+
layer. Defaults to ``dict(type='BN')``.
|
63 |
+
act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer.
|
64 |
+
Default: dict(type='SiLU', inplace=True).
|
65 |
+
init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or
|
66 |
+
list[dict], optional): Initialization config dict.
|
67 |
+
Defaults to None.
|
68 |
+
"""
|
69 |
+
|
70 |
+
def __init__(
|
71 |
+
self,
|
72 |
+
num_classes: int,
|
73 |
+
in_channels: int,
|
74 |
+
widen_factor: float = 1.0,
|
75 |
+
num_base_priors: int = 1,
|
76 |
+
feat_channels: int = 256,
|
77 |
+
stacked_convs: int = 2,
|
78 |
+
featmap_strides: Sequence[int] = [8, 16, 32],
|
79 |
+
share_conv: bool = True,
|
80 |
+
pred_kernel_size: int = 1,
|
81 |
+
angle_out_dim: int = 1,
|
82 |
+
conv_cfg: OptConfigType = None,
|
83 |
+
norm_cfg: ConfigType = dict(type='BN'),
|
84 |
+
act_cfg: ConfigType = dict(type='SiLU', inplace=True),
|
85 |
+
init_cfg: OptMultiConfig = None,
|
86 |
+
):
|
87 |
+
self.angle_out_dim = angle_out_dim
|
88 |
+
super().__init__(
|
89 |
+
num_classes=num_classes,
|
90 |
+
in_channels=in_channels,
|
91 |
+
widen_factor=widen_factor,
|
92 |
+
num_base_priors=num_base_priors,
|
93 |
+
feat_channels=feat_channels,
|
94 |
+
stacked_convs=stacked_convs,
|
95 |
+
featmap_strides=featmap_strides,
|
96 |
+
share_conv=share_conv,
|
97 |
+
pred_kernel_size=pred_kernel_size,
|
98 |
+
conv_cfg=conv_cfg,
|
99 |
+
norm_cfg=norm_cfg,
|
100 |
+
act_cfg=act_cfg,
|
101 |
+
init_cfg=init_cfg)
|
102 |
+
|
103 |
+
def _init_layers(self):
|
104 |
+
"""Initialize layers of the head."""
|
105 |
+
super()._init_layers()
|
106 |
+
self.rtm_ang = nn.ModuleList()
|
107 |
+
for _ in range(len(self.featmap_strides)):
|
108 |
+
self.rtm_ang.append(
|
109 |
+
nn.Conv2d(
|
110 |
+
self.feat_channels,
|
111 |
+
self.num_base_priors * self.angle_out_dim,
|
112 |
+
self.pred_kernel_size,
|
113 |
+
padding=self.pred_kernel_size // 2))
|
114 |
+
|
115 |
+
def init_weights(self) -> None:
|
116 |
+
"""Initialize weights of the head."""
|
117 |
+
# Use prior in model initialization to improve stability
|
118 |
+
super().init_weights()
|
119 |
+
for rtm_ang in self.rtm_ang:
|
120 |
+
normal_init(rtm_ang, std=0.01)
|
121 |
+
|
122 |
+
def forward(self, feats: Tuple[Tensor, ...]) -> tuple:
|
123 |
+
"""Forward features from the upstream network.
|
124 |
+
|
125 |
+
Args:
|
126 |
+
feats (tuple[Tensor]): Features from the upstream network, each is
|
127 |
+
a 4D-tensor.
|
128 |
+
|
129 |
+
Returns:
|
130 |
+
tuple: Usually a tuple of classification scores and bbox prediction
|
131 |
+
- cls_scores (list[Tensor]): Classification scores for all scale
|
132 |
+
levels, each is a 4D-tensor, the channels number is
|
133 |
+
num_base_priors * num_classes.
|
134 |
+
- bbox_preds (list[Tensor]): Box energies / deltas for all scale
|
135 |
+
levels, each is a 4D-tensor, the channels number is
|
136 |
+
num_base_priors * 4.
|
137 |
+
- angle_preds (list[Tensor]): Angle prediction for all scale
|
138 |
+
levels, each is a 4D-tensor, the channels number is
|
139 |
+
num_base_priors * angle_out_dim.
|
140 |
+
"""
|
141 |
+
|
142 |
+
cls_scores = []
|
143 |
+
bbox_preds = []
|
144 |
+
angle_preds = []
|
145 |
+
for idx, x in enumerate(feats):
|
146 |
+
cls_feat = x
|
147 |
+
reg_feat = x
|
148 |
+
|
149 |
+
for cls_layer in self.cls_convs[idx]:
|
150 |
+
cls_feat = cls_layer(cls_feat)
|
151 |
+
cls_score = self.rtm_cls[idx](cls_feat)
|
152 |
+
|
153 |
+
for reg_layer in self.reg_convs[idx]:
|
154 |
+
reg_feat = reg_layer(reg_feat)
|
155 |
+
|
156 |
+
reg_dist = self.rtm_reg[idx](reg_feat)
|
157 |
+
angle_pred = self.rtm_ang[idx](reg_feat)
|
158 |
+
|
159 |
+
cls_scores.append(cls_score)
|
160 |
+
bbox_preds.append(reg_dist)
|
161 |
+
angle_preds.append(angle_pred)
|
162 |
+
return tuple(cls_scores), tuple(bbox_preds), tuple(angle_preds)
|


@MODELS.register_module()
class RTMDetRotatedHead(RTMDetHead):
    """RTMDet-R head.

    Compared with RTMDetHead, RTMDetRotatedHead adds some args to support
    rotated object detection.

    - `angle_version` used to limit angle_range during training.
    - `angle_coder` used to encode and decode angle, which is similar
      to bbox_coder.
    - `use_hbbox_loss` and `loss_angle` allow custom regression loss
      calculation for rotated box.

    There are three combination options for regression:

    1. `use_hbbox_loss=False` and loss_angle is None.

    .. code:: text

        bbox_pred────(tblr)───┐
                              ▼
        angle_pred         decode──►rbox_pred──(xywha)─►loss_bbox
            │                 ▲
            └────►decode──(a)─┘

    2. `use_hbbox_loss=False` and loss_angle is specified.
       An angle loss is added on angle_pred.

    .. code:: text

        bbox_pred────(tblr)───┐
                              ▼
        angle_pred         decode──►rbox_pred──(xywha)─►loss_bbox
            │                 ▲
            ├────►decode──(a)─┘
            │
            └───────────────────────────────────────────►loss_angle

    3. `use_hbbox_loss=True` and loss_angle is specified.
       In this case the loss_angle must be set.

    .. code:: text

        bbox_pred──(tblr)──►decode──►hbox_pred──(xyxy)──►loss_bbox

        angle_pred──────────────────────────────────────►loss_angle

    - There's a `decoded_with_angle` flag in test_cfg, which is similar
      to the training process.

    When `decoded_with_angle=True`:

    .. code:: text

        bbox_pred────(tblr)───┐
                              ▼
        angle_pred         decode──(xywha)──►rbox_pred
            │                 ▲
            └────►decode──(a)─┘

    When `decoded_with_angle=False`:

    .. code:: text

        bbox_pred──(tblr)─►decode
                              │ (xyxy)
                              ▼
                           format───(xywh)──►concat──(xywha)──►rbox_pred
                                                ▲
        angle_pred────────►decode────(a)────────┘

    Args:
        head_module(ConfigType): Base module used for RTMDetRotatedHead.
        prior_generator: Points generator feature maps in
            2D points-based detectors.
        bbox_coder (:obj:`ConfigDict` or dict): Config of bbox coder.
        loss_cls (:obj:`ConfigDict` or dict): Config of classification loss.
        loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss.
        angle_version (str): Angle representations. Defaults to 'le90'.
        use_hbbox_loss (bool): If true, use horizontal bbox loss and
            loss_angle should not be None. Defaults to False.
        angle_coder (:obj:`ConfigDict` or dict): Config of angle coder.
        loss_angle (:obj:`ConfigDict` or dict, optional): Config of angle loss.
        train_cfg (:obj:`ConfigDict` or dict, optional): Training config of
            anchor head. Defaults to None.
        test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of
            anchor head. Defaults to None.
        init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or
            list[dict], optional): Initialization config dict.
            Defaults to None.
    """

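    # Illustrative (hypothetical) config fragments for the three regression
    # combinations listed in the docstring above; field names follow the
    # arguments below, the loss settings are placeholders, not tuned values:
    #   1) rotated IoU loss only:
    #        use_hbbox_loss=False, loss_angle=None
    #   2) rotated IoU loss plus an extra angle loss:
    #        use_hbbox_loss=False,
    #        loss_angle=dict(type='mmdet.L1Loss', loss_weight=0.2)
    #   3) horizontal IoU loss, angle supervised separately:
    #        use_hbbox_loss=True,
    #        loss_angle=dict(type='mmdet.L1Loss', loss_weight=0.2)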
    def __init__(
            self,
            head_module: ConfigType,
            prior_generator: ConfigType = dict(
                type='mmdet.MlvlPointGenerator', strides=[8, 16, 32],
                offset=0),
            bbox_coder: ConfigType = dict(type='DistanceAnglePointCoder'),
            loss_cls: ConfigType = dict(
                type='mmdet.QualityFocalLoss',
                use_sigmoid=True,
                beta=2.0,
                loss_weight=1.0),
            loss_bbox: ConfigType = dict(
                type='mmrotate.RotatedIoULoss', mode='linear',
                loss_weight=2.0),
            angle_version: str = 'le90',
            use_hbbox_loss: bool = False,
            angle_coder: ConfigType = dict(type='mmrotate.PseudoAngleCoder'),
            loss_angle: OptConfigType = None,
            train_cfg: OptConfigType = None,
            test_cfg: OptConfigType = None,
            init_cfg: OptMultiConfig = None):
        if not MMROTATE_AVAILABLE:
            raise ImportError(
                'Please run "mim install -r requirements/mmrotate.txt" '
                'to install mmrotate first for rotated detection.')

        self.angle_version = angle_version
        self.use_hbbox_loss = use_hbbox_loss
        if self.use_hbbox_loss:
            assert loss_angle is not None, \
                'loss_angle needs to be specified when use_hbbox_loss is True'
        self.angle_coder = TASK_UTILS.build(angle_coder)
        self.angle_out_dim = self.angle_coder.encode_size
        if head_module.get('angle_out_dim') is not None:
            warnings.warn('angle_out_dim will be overridden by angle_coder '
                          'and does not need to be set manually')

        head_module['angle_out_dim'] = self.angle_out_dim
        super().__init__(
            head_module=head_module,
            prior_generator=prior_generator,
            bbox_coder=bbox_coder,
            loss_cls=loss_cls,
            loss_bbox=loss_bbox,
            train_cfg=train_cfg,
            test_cfg=test_cfg,
            init_cfg=init_cfg)

        if loss_angle is not None:
            self.loss_angle = MODELS.build(loss_angle)
        else:
            self.loss_angle = None

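    # Note (illustration, not a shipped config): the head module's
    # angle_out_dim always follows the angle coder, e.g. a PseudoAngleCoder
    # regresses the angle directly (encode_size == 1), while a coder that
    # bins the angle into N classes reports encode_size == N, so configs only
    # pick the coder and the prediction conv width follows automatically.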
311 |
+
def predict_by_feat(self,
|
312 |
+
cls_scores: List[Tensor],
|
313 |
+
bbox_preds: List[Tensor],
|
314 |
+
angle_preds: List[Tensor],
|
315 |
+
objectnesses: Optional[List[Tensor]] = None,
|
316 |
+
batch_img_metas: Optional[List[dict]] = None,
|
317 |
+
cfg: Optional[ConfigDict] = None,
|
318 |
+
rescale: bool = True,
|
319 |
+
with_nms: bool = True) -> List[InstanceData]:
|
320 |
+
"""Transform a batch of output features extracted by the head into bbox
|
321 |
+
results.
|
322 |
+
|
323 |
+
Args:
|
324 |
+
cls_scores (list[Tensor]): Classification scores for all
|
325 |
+
scale levels, each is a 4D-tensor, has shape
|
326 |
+
(batch_size, num_priors * num_classes, H, W).
|
327 |
+
bbox_preds (list[Tensor]): Box energies / deltas for all
|
328 |
+
scale levels, each is a 4D-tensor, has shape
|
329 |
+
(batch_size, num_priors * 4, H, W).
|
330 |
+
angle_preds (list[Tensor]): Box angle for each scale level
|
331 |
+
with shape (N, num_points * angle_dim, H, W)
|
332 |
+
objectnesses (list[Tensor], Optional): Score factor for
|
333 |
+
all scale level, each is a 4D-tensor, has shape
|
334 |
+
(batch_size, 1, H, W).
|
335 |
+
batch_img_metas (list[dict], Optional): Batch image meta info.
|
336 |
+
Defaults to None.
|
337 |
+
cfg (ConfigDict, optional): Test / postprocessing
|
338 |
+
configuration, if None, test_cfg would be used.
|
339 |
+
Defaults to None.
|
340 |
+
rescale (bool): If True, return boxes in original image space.
|
341 |
+
Defaults to True.
|
342 |
+
with_nms (bool): If True, do nms before return boxes.
|
343 |
+
Defaults to True.
|
344 |
+
|
345 |
+
Returns:
|
346 |
+
list[:obj:`InstanceData`]: Object detection results of each image
|
347 |
+
after the post process. Each item usually contains following keys.
|
348 |
+
- scores (Tensor): Classification scores, has a shape
|
349 |
+
(num_instance, )
|
350 |
+
- labels (Tensor): Labels of bboxes, has a shape
|
351 |
+
(num_instances, ).
|
352 |
+
- bboxes (Tensor): Has a shape (num_instances, 5),
|
353 |
+
the last dimension 5 arranged as (x, y, w, h, angle).
|
354 |
+
"""
|
355 |
+
assert len(cls_scores) == len(bbox_preds)
|
356 |
+
if objectnesses is None:
|
357 |
+
with_objectnesses = False
|
358 |
+
else:
|
359 |
+
with_objectnesses = True
|
360 |
+
assert len(cls_scores) == len(objectnesses)
|
361 |
+
|
362 |
+
cfg = self.test_cfg if cfg is None else cfg
|
363 |
+
cfg = copy.deepcopy(cfg)
|
364 |
+
|
365 |
+
multi_label = cfg.multi_label
|
366 |
+
multi_label &= self.num_classes > 1
|
367 |
+
cfg.multi_label = multi_label
|
368 |
+
|
369 |
+
# Whether to decode rbox with angle.
|
370 |
+
# different setting lead to different final results.
|
371 |
+
# Defaults to True.
|
372 |
+
decode_with_angle = cfg.get('decode_with_angle', True)
|
373 |
+
|
374 |
+
num_imgs = len(batch_img_metas)
|
375 |
+
featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores]
|
376 |
+
|
377 |
+
# If the shape does not change, use the previous mlvl_priors
|
378 |
+
if featmap_sizes != self.featmap_sizes:
|
379 |
+
self.mlvl_priors = self.prior_generator.grid_priors(
|
380 |
+
featmap_sizes,
|
381 |
+
dtype=cls_scores[0].dtype,
|
382 |
+
device=cls_scores[0].device)
|
383 |
+
self.featmap_sizes = featmap_sizes
|
384 |
+
flatten_priors = torch.cat(self.mlvl_priors)
|
385 |
+
|
386 |
+
mlvl_strides = [
|
387 |
+
flatten_priors.new_full(
|
388 |
+
(featmap_size.numel() * self.num_base_priors, ), stride) for
|
389 |
+
featmap_size, stride in zip(featmap_sizes, self.featmap_strides)
|
390 |
+
]
|
391 |
+
flatten_stride = torch.cat(mlvl_strides)
|
392 |
+
|
393 |
+
# flatten cls_scores, bbox_preds and objectness
|
394 |
+
flatten_cls_scores = [
|
395 |
+
cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1,
|
396 |
+
self.num_classes)
|
397 |
+
for cls_score in cls_scores
|
398 |
+
]
|
399 |
+
flatten_bbox_preds = [
|
400 |
+
bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4)
|
401 |
+
for bbox_pred in bbox_preds
|
402 |
+
]
|
403 |
+
flatten_angle_preds = [
|
404 |
+
angle_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1,
|
405 |
+
self.angle_out_dim)
|
406 |
+
for angle_pred in angle_preds
|
407 |
+
]
|
408 |
+
|
409 |
+
flatten_cls_scores = torch.cat(flatten_cls_scores, dim=1).sigmoid()
|
410 |
+
flatten_bbox_preds = torch.cat(flatten_bbox_preds, dim=1)
|
411 |
+
flatten_angle_preds = torch.cat(flatten_angle_preds, dim=1)
|
412 |
+
flatten_angle_preds = self.angle_coder.decode(
|
413 |
+
flatten_angle_preds, keepdim=True)
|
414 |
+
|
415 |
+
if decode_with_angle:
|
416 |
+
flatten_rbbox_preds = torch.cat(
|
417 |
+
[flatten_bbox_preds, flatten_angle_preds], dim=-1)
|
418 |
+
flatten_decoded_bboxes = self.bbox_coder.decode(
|
419 |
+
flatten_priors[None], flatten_rbbox_preds, flatten_stride)
|
420 |
+
else:
|
421 |
+
flatten_decoded_hbboxes = self.bbox_coder.decode(
|
422 |
+
flatten_priors[None], flatten_bbox_preds, flatten_stride)
|
423 |
+
flatten_decoded_hbboxes = HorizontalBoxes.xyxy_to_cxcywh(
|
424 |
+
flatten_decoded_hbboxes)
|
425 |
+
flatten_decoded_bboxes = torch.cat(
|
426 |
+
[flatten_decoded_hbboxes, flatten_angle_preds], dim=-1)
|
427 |
+
|
428 |
+
if with_objectnesses:
|
429 |
+
flatten_objectness = [
|
430 |
+
objectness.permute(0, 2, 3, 1).reshape(num_imgs, -1)
|
431 |
+
for objectness in objectnesses
|
432 |
+
]
|
433 |
+
flatten_objectness = torch.cat(flatten_objectness, dim=1).sigmoid()
|
434 |
+
else:
|
435 |
+
flatten_objectness = [None for _ in range(num_imgs)]
|
436 |
+
|
437 |
+
results_list = []
|
438 |
+
for (bboxes, scores, objectness,
|
439 |
+
img_meta) in zip(flatten_decoded_bboxes, flatten_cls_scores,
|
440 |
+
flatten_objectness, batch_img_metas):
|
441 |
+
scale_factor = img_meta['scale_factor']
|
442 |
+
if 'pad_param' in img_meta:
|
443 |
+
pad_param = img_meta['pad_param']
|
444 |
+
else:
|
445 |
+
pad_param = None
|
446 |
+
|
447 |
+
score_thr = cfg.get('score_thr', -1)
|
448 |
+
# yolox_style does not require the following operations
|
449 |
+
if objectness is not None and score_thr > 0 and not cfg.get(
|
450 |
+
'yolox_style', False):
|
451 |
+
conf_inds = objectness > score_thr
|
452 |
+
bboxes = bboxes[conf_inds, :]
|
453 |
+
scores = scores[conf_inds, :]
|
454 |
+
objectness = objectness[conf_inds]
|
455 |
+
|
456 |
+
if objectness is not None:
|
457 |
+
# conf = obj_conf * cls_conf
|
458 |
+
scores *= objectness[:, None]
|
459 |
+
|
460 |
+
if scores.shape[0] == 0:
|
461 |
+
empty_results = InstanceData()
|
462 |
+
empty_results.bboxes = RotatedBoxes(bboxes)
|
463 |
+
empty_results.scores = scores[:, 0]
|
464 |
+
empty_results.labels = scores[:, 0].int()
|
465 |
+
results_list.append(empty_results)
|
466 |
+
continue
|
467 |
+
|
468 |
+
nms_pre = cfg.get('nms_pre', 100000)
|
469 |
+
if cfg.multi_label is False:
|
470 |
+
scores, labels = scores.max(1, keepdim=True)
|
471 |
+
scores, _, keep_idxs, results = filter_scores_and_topk(
|
472 |
+
scores,
|
473 |
+
score_thr,
|
474 |
+
nms_pre,
|
475 |
+
results=dict(labels=labels[:, 0]))
|
476 |
+
labels = results['labels']
|
477 |
+
else:
|
478 |
+
scores, labels, keep_idxs, _ = filter_scores_and_topk(
|
479 |
+
scores, score_thr, nms_pre)
|
480 |
+
|
481 |
+
results = InstanceData(
|
482 |
+
scores=scores,
|
483 |
+
labels=labels,
|
484 |
+
bboxes=RotatedBoxes(bboxes[keep_idxs]))
|
485 |
+
|
486 |
+
if rescale:
|
487 |
+
if pad_param is not None:
|
488 |
+
results.bboxes.translate_([-pad_param[2], -pad_param[0]])
|
489 |
+
|
490 |
+
scale_factor = [1 / s for s in img_meta['scale_factor']]
|
491 |
+
results.bboxes = scale_boxes(results.bboxes, scale_factor)
|
492 |
+
|
493 |
+
if cfg.get('yolox_style', False):
|
494 |
+
# do not need max_per_img
|
495 |
+
cfg.max_per_img = len(results)
|
496 |
+
|
497 |
+
results = self._bbox_post_process(
|
498 |
+
results=results,
|
499 |
+
cfg=cfg,
|
500 |
+
rescale=False,
|
501 |
+
with_nms=with_nms,
|
502 |
+
img_meta=img_meta)
|
503 |
+
|
504 |
+
results_list.append(results)
|
505 |
+
return results_list
|
506 |
+
|
507 |
+
def loss_by_feat(
|
508 |
+
self,
|
509 |
+
cls_scores: List[Tensor],
|
510 |
+
bbox_preds: List[Tensor],
|
511 |
+
angle_preds: List[Tensor],
|
512 |
+
batch_gt_instances: InstanceList,
|
513 |
+
batch_img_metas: List[dict],
|
514 |
+
batch_gt_instances_ignore: OptInstanceList = None) -> dict:
|
515 |
+
"""Compute losses of the head.
|
516 |
+
|
517 |
+
Args:
|
518 |
+
cls_scores (list[Tensor]): Box scores for each scale level
|
519 |
+
Has shape (N, num_anchors * num_classes, H, W)
|
520 |
+
bbox_preds (list[Tensor]): Decoded box for each scale
|
521 |
+
level with shape (N, num_anchors * 4, H, W) in
|
522 |
+
[tl_x, tl_y, br_x, br_y] format.
|
523 |
+
angle_preds (list[Tensor]): Angle prediction for each scale
|
524 |
+
level with shape (N, num_anchors * angle_out_dim, H, W).
|
525 |
+
batch_gt_instances (list[:obj:`InstanceData`]): Batch of
|
526 |
+
gt_instance. It usually includes ``bboxes`` and ``labels``
|
527 |
+
attributes.
|
528 |
+
batch_img_metas (list[dict]): Meta information of each image, e.g.,
|
529 |
+
image size, scaling factor, etc.
|
530 |
+
batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional):
|
531 |
+
Batch of gt_instances_ignore. It includes ``bboxes`` attribute
|
532 |
+
data that is ignored during training and testing.
|
533 |
+
Defaults to None.
|
534 |
+
|
535 |
+
Returns:
|
536 |
+
dict[str, Tensor]: A dictionary of loss components.
|
537 |
+
"""
|
538 |
+
num_imgs = len(batch_img_metas)
|
539 |
+
featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
|
540 |
+
assert len(featmap_sizes) == self.prior_generator.num_levels
|
541 |
+
|
542 |
+
gt_info = gt_instances_preprocess(batch_gt_instances, num_imgs)
|
543 |
+
gt_labels = gt_info[:, :, :1]
|
544 |
+
gt_bboxes = gt_info[:, :, 1:] # xywha
|
545 |
+
pad_bbox_flag = (gt_bboxes.sum(-1, keepdim=True) > 0).float()
|
546 |
+
|
547 |
+
device = cls_scores[0].device
|
548 |
+
|
549 |
+
# If the shape does not equal, generate new one
|
550 |
+
if featmap_sizes != self.featmap_sizes_train:
|
551 |
+
self.featmap_sizes_train = featmap_sizes
|
552 |
+
mlvl_priors_with_stride = self.prior_generator.grid_priors(
|
553 |
+
featmap_sizes, device=device, with_stride=True)
|
554 |
+
self.flatten_priors_train = torch.cat(
|
555 |
+
mlvl_priors_with_stride, dim=0)
|
556 |
+
|
557 |
+
flatten_cls_scores = torch.cat([
|
558 |
+
cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1,
|
559 |
+
self.cls_out_channels)
|
560 |
+
for cls_score in cls_scores
|
561 |
+
], 1).contiguous()
|
562 |
+
|
563 |
+
flatten_tblrs = torch.cat([
|
564 |
+
bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4)
|
565 |
+
for bbox_pred in bbox_preds
|
566 |
+
], 1)
|
567 |
+
flatten_tblrs = flatten_tblrs * self.flatten_priors_train[..., -1,
|
568 |
+
None]
|
569 |
+
flatten_angles = torch.cat([
|
570 |
+
angle_pred.permute(0, 2, 3, 1).reshape(
|
571 |
+
num_imgs, -1, self.angle_out_dim) for angle_pred in angle_preds
|
572 |
+
], 1)
|
573 |
+
flatten_decoded_angle = self.angle_coder.decode(
|
574 |
+
flatten_angles, keepdim=True)
|
575 |
+
flatten_tblra = torch.cat([flatten_tblrs, flatten_decoded_angle],
|
576 |
+
dim=-1)
|
577 |
+
flatten_rbboxes = distance2obb(
|
578 |
+
self.flatten_priors_train[..., :2],
|
579 |
+
flatten_tblra,
|
580 |
+
angle_version=self.angle_version)
|
581 |
+
if self.use_hbbox_loss:
|
582 |
+
flatten_hbboxes = distance2bbox(self.flatten_priors_train[..., :2],
|
583 |
+
flatten_tblrs)
|
584 |
+
|
585 |
+
assigned_result = self.assigner(flatten_rbboxes.detach(),
|
586 |
+
flatten_cls_scores.detach(),
|
587 |
+
self.flatten_priors_train, gt_labels,
|
588 |
+
gt_bboxes, pad_bbox_flag)
|
589 |
+
|
590 |
+
labels = assigned_result['assigned_labels'].reshape(-1)
|
591 |
+
label_weights = assigned_result['assigned_labels_weights'].reshape(-1)
|
592 |
+
bbox_targets = assigned_result['assigned_bboxes'].reshape(-1, 5)
|
593 |
+
assign_metrics = assigned_result['assign_metrics'].reshape(-1)
|
594 |
+
cls_preds = flatten_cls_scores.reshape(-1, self.num_classes)
|
595 |
+
|
596 |
+
# FG cat_id: [0, num_classes -1], BG cat_id: num_classes
|
597 |
+
bg_class_ind = self.num_classes
|
598 |
+
pos_inds = ((labels >= 0)
|
599 |
+
& (labels < bg_class_ind)).nonzero().squeeze(1)
|
600 |
+
avg_factor = reduce_mean(assign_metrics.sum()).clamp_(min=1).item()
|
601 |
+
|
602 |
+
loss_cls = self.loss_cls(
|
603 |
+
cls_preds, (labels, assign_metrics),
|
604 |
+
label_weights,
|
605 |
+
avg_factor=avg_factor)
|
606 |
+
|
607 |
+
pos_bbox_targets = bbox_targets[pos_inds]
|
608 |
+
|
609 |
+
if self.use_hbbox_loss:
|
610 |
+
bbox_preds = flatten_hbboxes.reshape(-1, 4)
|
611 |
+
pos_bbox_targets = bbox_cxcywh_to_xyxy(pos_bbox_targets[:, :4])
|
612 |
+
else:
|
613 |
+
bbox_preds = flatten_rbboxes.reshape(-1, 5)
|
614 |
+
angle_preds = flatten_angles.reshape(-1, self.angle_out_dim)
|
615 |
+
|
616 |
+
if len(pos_inds) > 0:
|
617 |
+
loss_bbox = self.loss_bbox(
|
618 |
+
bbox_preds[pos_inds],
|
619 |
+
pos_bbox_targets,
|
620 |
+
weight=assign_metrics[pos_inds],
|
621 |
+
avg_factor=avg_factor)
|
622 |
+
loss_angle = angle_preds.sum() * 0
|
623 |
+
if self.loss_angle is not None:
|
624 |
+
pos_angle_targets = bbox_targets[pos_inds][:, 4:5]
|
625 |
+
pos_angle_targets = self.angle_coder.encode(pos_angle_targets)
|
626 |
+
loss_angle = self.loss_angle(
|
627 |
+
angle_preds[pos_inds],
|
628 |
+
pos_angle_targets,
|
629 |
+
weight=assign_metrics[pos_inds],
|
630 |
+
avg_factor=avg_factor)
|
631 |
+
else:
|
632 |
+
loss_bbox = bbox_preds.sum() * 0
|
633 |
+
loss_angle = angle_preds.sum() * 0
|
634 |
+
|
635 |
+
losses = dict()
|
636 |
+
losses['loss_cls'] = loss_cls
|
637 |
+
losses['loss_bbox'] = loss_bbox
|
638 |
+
if self.loss_angle is not None:
|
639 |
+
losses['loss_angle'] = loss_angle
|
640 |
+
|
641 |
+
return losses
|
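The two test-time decode paths sketched in the RTMDetRotatedHead docstring can be illustrated with a few lines of plain PyTorch. This is a rough, self-contained sketch on dummy tensors, not the mmrotate DistanceAnglePointCoder implementation; the rotation of the side offsets only approximates what distance2obb does:

import torch

# Dummy inputs: 3 prior points (cx, cy), per-point side distances (l, t, r, b)
# already multiplied by the stride, and an angle decoded by the angle coder.
points = torch.tensor([[8., 8.], [16., 8.], [24., 8.]])
tblr = torch.rand(3, 4) * 16
angle = (torch.rand(3, 1) - 0.5) * 3.14

# decode_with_angle=True path: rotate the side offsets by the predicted angle
# before composing the centre, giving (cx, cy, w, h, a) in one step.
cos, sin = torch.cos(angle), torch.sin(angle)
left, top, right, bottom = tblr.split(1, dim=-1)
dx, dy = (right - left) / 2, (bottom - top) / 2
ctr = points + torch.cat([dx * cos - dy * sin, dx * sin + dy * cos], dim=-1)
wh = torch.cat([left + right, top + bottom], dim=-1)
rbox_a = torch.cat([ctr, wh, angle], dim=-1)

# decode_with_angle=False path: decode an axis-aligned box first, convert it
# to (cx, cy, w, h) and only then append the angle, as in the second diagram.
x1y1 = points - tblr[:, :2]
x2y2 = points + tblr[:, 2:]
cxcywh = torch.cat([(x1y1 + x2y2) / 2, x2y2 - x1y1], dim=-1)
rbox_b = torch.cat([cxcywh, angle], dim=-1)

print(rbox_a.shape, rbox_b.shape)  # both torch.Size([3, 5]): x, y, w, h, a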
mmyolo/models/dense_heads/yolov5_head.py
ADDED
@@ -0,0 +1,890 @@
# Copyright (c) OpenMMLab. All rights reserved.
import copy
import math
from typing import List, Optional, Sequence, Tuple, Union

import torch
import torch.nn as nn
from mmdet.models.dense_heads.base_dense_head import BaseDenseHead
from mmdet.models.utils import filter_scores_and_topk, multi_apply
from mmdet.structures.bbox import bbox_overlaps
from mmdet.utils import (ConfigType, OptConfigType, OptInstanceList,
                         OptMultiConfig)
from mmengine.config import ConfigDict
from mmengine.dist import get_dist_info
from mmengine.logging import print_log
from mmengine.model import BaseModule
from mmengine.structures import InstanceData
from torch import Tensor

from mmyolo.registry import MODELS, TASK_UTILS
from ..utils import make_divisible


def get_prior_xy_info(index: int, num_base_priors: int,
                      featmap_sizes: int) -> Tuple[int, int, int]:
    """Get prior index and xy index in feature map by flatten index."""
    _, featmap_w = featmap_sizes
    priors = index % num_base_priors
    xy_index = index // num_base_priors
    grid_y = xy_index // featmap_w
    grid_x = xy_index % featmap_w
    return priors, grid_x, grid_y


@MODELS.register_module()
class YOLOv5HeadModule(BaseModule):
    """YOLOv5Head head module used in `YOLOv5`.

    Args:
        num_classes (int): Number of categories excluding the background
            category.
        in_channels (Union[int, Sequence]): Number of channels in the input
            feature map.
        widen_factor (float): Width multiplier, multiply number of
            channels in each layer by this amount. Defaults to 1.0.
        num_base_priors (int): The number of priors (points) at a point
            on the feature grid.
        featmap_strides (Sequence[int]): Downsample factor of each feature map.
            Defaults to (8, 16, 32).
        init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or
            list[dict], optional): Initialization config dict.
            Defaults to None.
    """

    def __init__(self,
                 num_classes: int,
                 in_channels: Union[int, Sequence],
                 widen_factor: float = 1.0,
                 num_base_priors: int = 3,
                 featmap_strides: Sequence[int] = (8, 16, 32),
                 init_cfg: OptMultiConfig = None):
        super().__init__(init_cfg=init_cfg)
        self.num_classes = num_classes
        self.widen_factor = widen_factor

        self.featmap_strides = featmap_strides
        self.num_out_attrib = 5 + self.num_classes
        self.num_levels = len(self.featmap_strides)
        self.num_base_priors = num_base_priors

        if isinstance(in_channels, int):
            self.in_channels = [make_divisible(in_channels, widen_factor)
                                ] * self.num_levels
        else:
            self.in_channels = [
                make_divisible(i, widen_factor) for i in in_channels
            ]

        self._init_layers()

    def _init_layers(self):
        """Initialize conv layers in YOLOv5 head."""
        self.convs_pred = nn.ModuleList()
        for i in range(self.num_levels):
            conv_pred = nn.Conv2d(self.in_channels[i],
                                  self.num_base_priors * self.num_out_attrib,
                                  1)

            self.convs_pred.append(conv_pred)

    def init_weights(self):
        """Initialize the bias of YOLOv5 head."""
        super().init_weights()
        for mi, s in zip(self.convs_pred, self.featmap_strides):  # from
            b = mi.bias.data.view(self.num_base_priors, -1)
            # obj (8 objects per 640 image)
            b.data[:, 4] += math.log(8 / (640 / s)**2)
            b.data[:, 5:] += math.log(0.6 / (self.num_classes - 0.999999))

            mi.bias.data = b.view(-1)

    def forward(self, x: Tuple[Tensor]) -> Tuple[List]:
        """Forward features from the upstream network.

        Args:
            x (Tuple[Tensor]): Features from the upstream network, each is
                a 4D-tensor.
        Returns:
            Tuple[List]: A tuple of multi-level classification scores, bbox
            predictions, and objectnesses.
        """
        assert len(x) == self.num_levels
        return multi_apply(self.forward_single, x, self.convs_pred)

    def forward_single(self, x: Tensor,
                       convs: nn.Module) -> Tuple[Tensor, Tensor, Tensor]:
        """Forward feature of a single scale level."""

        pred_map = convs(x)
        bs, _, ny, nx = pred_map.shape
        pred_map = pred_map.view(bs, self.num_base_priors, self.num_out_attrib,
                                 ny, nx)

        cls_score = pred_map[:, :, 5:, ...].reshape(bs, -1, ny, nx)
        bbox_pred = pred_map[:, :, :4, ...].reshape(bs, -1, ny, nx)
        objectness = pred_map[:, :, 4:5, ...].reshape(bs, -1, ny, nx)

        return cls_score, bbox_pred, objectness


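# Worked numbers for the prior-probability bias initialisation in
# init_weights above (illustrative, assuming the 640x640 training resolution
# the comment implies): at stride 8 the grid has (640 / 8) ** 2 = 6400 cells,
# so "8 objects per 640 image" is an objectness prior of 8 / 6400 = 0.00125
# and a bias of math.log(0.00125) ~= -6.68, i.e. sigmoid(bias) ~= 0.00125.
# For the class logits with e.g. 80 classes, math.log(0.6 / 79.0) ~= -4.88,
# i.e. a per-class prior of roughly 0.0076 before any training.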
131 |
+
@MODELS.register_module()
|
132 |
+
class YOLOv5Head(BaseDenseHead):
|
133 |
+
"""YOLOv5Head head used in `YOLOv5`.
|
134 |
+
|
135 |
+
Args:
|
136 |
+
head_module(ConfigType): Base module used for YOLOv5Head
|
137 |
+
prior_generator(dict): Points generator feature maps in
|
138 |
+
2D points-based detectors.
|
139 |
+
bbox_coder (:obj:`ConfigDict` or dict): Config of bbox coder.
|
140 |
+
loss_cls (:obj:`ConfigDict` or dict): Config of classification loss.
|
141 |
+
loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss.
|
142 |
+
loss_obj (:obj:`ConfigDict` or dict): Config of objectness loss.
|
143 |
+
prior_match_thr (float): Defaults to 4.0.
|
144 |
+
ignore_iof_thr (float): Defaults to -1.0.
|
145 |
+
obj_level_weights (List[float]): Defaults to [4.0, 1.0, 0.4].
|
146 |
+
train_cfg (:obj:`ConfigDict` or dict, optional): Training config of
|
147 |
+
anchor head. Defaults to None.
|
148 |
+
test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of
|
149 |
+
anchor head. Defaults to None.
|
150 |
+
init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or
|
151 |
+
list[dict], optional): Initialization config dict.
|
152 |
+
Defaults to None.
|
153 |
+
"""
|
154 |
+
|
155 |
+
def __init__(self,
|
156 |
+
head_module: ConfigType,
|
157 |
+
prior_generator: ConfigType = dict(
|
158 |
+
type='mmdet.YOLOAnchorGenerator',
|
159 |
+
base_sizes=[[(10, 13), (16, 30), (33, 23)],
|
160 |
+
[(30, 61), (62, 45), (59, 119)],
|
161 |
+
[(116, 90), (156, 198), (373, 326)]],
|
162 |
+
strides=[8, 16, 32]),
|
163 |
+
bbox_coder: ConfigType = dict(type='YOLOv5BBoxCoder'),
|
164 |
+
loss_cls: ConfigType = dict(
|
165 |
+
type='mmdet.CrossEntropyLoss',
|
166 |
+
use_sigmoid=True,
|
167 |
+
reduction='mean',
|
168 |
+
loss_weight=0.5),
|
169 |
+
loss_bbox: ConfigType = dict(
|
170 |
+
type='IoULoss',
|
171 |
+
iou_mode='ciou',
|
172 |
+
bbox_format='xywh',
|
173 |
+
eps=1e-7,
|
174 |
+
reduction='mean',
|
175 |
+
loss_weight=0.05,
|
176 |
+
return_iou=True),
|
177 |
+
loss_obj: ConfigType = dict(
|
178 |
+
type='mmdet.CrossEntropyLoss',
|
179 |
+
use_sigmoid=True,
|
180 |
+
reduction='mean',
|
181 |
+
loss_weight=1.0),
|
182 |
+
prior_match_thr: float = 4.0,
|
183 |
+
near_neighbor_thr: float = 0.5,
|
184 |
+
ignore_iof_thr: float = -1.0,
|
185 |
+
obj_level_weights: List[float] = [4.0, 1.0, 0.4],
|
186 |
+
train_cfg: OptConfigType = None,
|
187 |
+
test_cfg: OptConfigType = None,
|
188 |
+
init_cfg: OptMultiConfig = None):
|
189 |
+
super().__init__(init_cfg=init_cfg)
|
190 |
+
|
191 |
+
self.head_module = MODELS.build(head_module)
|
192 |
+
self.num_classes = self.head_module.num_classes
|
193 |
+
self.featmap_strides = self.head_module.featmap_strides
|
194 |
+
self.num_levels = len(self.featmap_strides)
|
195 |
+
|
196 |
+
self.train_cfg = train_cfg
|
197 |
+
self.test_cfg = test_cfg
|
198 |
+
|
199 |
+
self.loss_cls: nn.Module = MODELS.build(loss_cls)
|
200 |
+
self.loss_bbox: nn.Module = MODELS.build(loss_bbox)
|
201 |
+
self.loss_obj: nn.Module = MODELS.build(loss_obj)
|
202 |
+
|
203 |
+
self.prior_generator = TASK_UTILS.build(prior_generator)
|
204 |
+
self.bbox_coder = TASK_UTILS.build(bbox_coder)
|
205 |
+
self.num_base_priors = self.prior_generator.num_base_priors[0]
|
206 |
+
|
207 |
+
self.featmap_sizes = [torch.empty(1)] * self.num_levels
|
208 |
+
|
209 |
+
self.prior_match_thr = prior_match_thr
|
210 |
+
self.near_neighbor_thr = near_neighbor_thr
|
211 |
+
self.obj_level_weights = obj_level_weights
|
212 |
+
self.ignore_iof_thr = ignore_iof_thr
|
213 |
+
|
214 |
+
self.special_init()
|
215 |
+
|
216 |
+
def special_init(self):
|
217 |
+
"""Since YOLO series algorithms will inherit from YOLOv5Head, but
|
218 |
+
different algorithms have special initialization process.
|
219 |
+
|
220 |
+
The special_init function is designed to deal with this situation.
|
221 |
+
"""
|
222 |
+
assert len(self.obj_level_weights) == len(
|
223 |
+
self.featmap_strides) == self.num_levels
|
224 |
+
if self.prior_match_thr != 4.0:
|
225 |
+
print_log(
|
226 |
+
"!!!Now, you've changed the prior_match_thr "
|
227 |
+
'parameter to something other than 4.0. Please make sure '
|
228 |
+
'that you have modified both the regression formula in '
|
229 |
+
'bbox_coder and before loss_box computation, '
|
230 |
+
'otherwise the accuracy may be degraded!!!')
|
231 |
+
|
232 |
+
if self.num_classes == 1:
|
233 |
+
print_log('!!!You are using `YOLOv5Head` with num_classes == 1.'
|
234 |
+
' The loss_cls will be 0. This is a normal phenomenon.')
|
235 |
+
|
236 |
+
priors_base_sizes = torch.tensor(
|
237 |
+
self.prior_generator.base_sizes, dtype=torch.float)
|
238 |
+
featmap_strides = torch.tensor(
|
239 |
+
self.featmap_strides, dtype=torch.float)[:, None, None]
|
240 |
+
self.register_buffer(
|
241 |
+
'priors_base_sizes',
|
242 |
+
priors_base_sizes / featmap_strides,
|
243 |
+
persistent=False)
|
244 |
+
|
245 |
+
grid_offset = torch.tensor([
|
246 |
+
[0, 0], # center
|
247 |
+
[1, 0], # left
|
248 |
+
[0, 1], # up
|
249 |
+
[-1, 0], # right
|
250 |
+
[0, -1], # bottom
|
251 |
+
]).float()
|
252 |
+
self.register_buffer(
|
253 |
+
'grid_offset', grid_offset[:, None], persistent=False)
|
254 |
+
|
255 |
+
prior_inds = torch.arange(self.num_base_priors).float().view(
|
256 |
+
self.num_base_priors, 1)
|
257 |
+
self.register_buffer('prior_inds', prior_inds, persistent=False)
|
258 |
+
|
259 |
+
def forward(self, x: Tuple[Tensor]) -> Tuple[List]:
|
260 |
+
"""Forward features from the upstream network.
|
261 |
+
|
262 |
+
Args:
|
263 |
+
x (Tuple[Tensor]): Features from the upstream network, each is
|
264 |
+
a 4D-tensor.
|
265 |
+
Returns:
|
266 |
+
Tuple[List]: A tuple of multi-level classification scores, bbox
|
267 |
+
predictions, and objectnesses.
|
268 |
+
"""
|
269 |
+
return self.head_module(x)
|
270 |
+
|
271 |
+
def predict_by_feat(self,
|
272 |
+
cls_scores: List[Tensor],
|
273 |
+
bbox_preds: List[Tensor],
|
274 |
+
objectnesses: Optional[List[Tensor]] = None,
|
275 |
+
batch_img_metas: Optional[List[dict]] = None,
|
276 |
+
cfg: Optional[ConfigDict] = None,
|
277 |
+
rescale: bool = True,
|
278 |
+
with_nms: bool = True) -> List[InstanceData]:
|
279 |
+
"""Transform a batch of output features extracted by the head into
|
280 |
+
bbox results.
|
281 |
+
Args:
|
282 |
+
cls_scores (list[Tensor]): Classification scores for all
|
283 |
+
scale levels, each is a 4D-tensor, has shape
|
284 |
+
(batch_size, num_priors * num_classes, H, W).
|
285 |
+
bbox_preds (list[Tensor]): Box energies / deltas for all
|
286 |
+
scale levels, each is a 4D-tensor, has shape
|
287 |
+
(batch_size, num_priors * 4, H, W).
|
288 |
+
objectnesses (list[Tensor], Optional): Score factor for
|
289 |
+
all scale level, each is a 4D-tensor, has shape
|
290 |
+
(batch_size, 1, H, W).
|
291 |
+
batch_img_metas (list[dict], Optional): Batch image meta info.
|
292 |
+
Defaults to None.
|
293 |
+
cfg (ConfigDict, optional): Test / postprocessing
|
294 |
+
configuration, if None, test_cfg would be used.
|
295 |
+
Defaults to None.
|
296 |
+
rescale (bool): If True, return boxes in original image space.
|
297 |
+
Defaults to False.
|
298 |
+
with_nms (bool): If True, do nms before return boxes.
|
299 |
+
Defaults to True.
|
300 |
+
|
301 |
+
Returns:
|
302 |
+
list[:obj:`InstanceData`]: Object detection results of each image
|
303 |
+
after the post process. Each item usually contains following keys.
|
304 |
+
|
305 |
+
- scores (Tensor): Classification scores, has a shape
|
306 |
+
(num_instance, )
|
307 |
+
- labels (Tensor): Labels of bboxes, has a shape
|
308 |
+
(num_instances, ).
|
309 |
+
- bboxes (Tensor): Has a shape (num_instances, 4),
|
310 |
+
the last dimension 4 arrange as (x1, y1, x2, y2).
|
311 |
+
"""
|
312 |
+
assert len(cls_scores) == len(bbox_preds)
|
313 |
+
if objectnesses is None:
|
314 |
+
with_objectnesses = False
|
315 |
+
else:
|
316 |
+
with_objectnesses = True
|
317 |
+
assert len(cls_scores) == len(objectnesses)
|
318 |
+
|
319 |
+
cfg = self.test_cfg if cfg is None else cfg
|
320 |
+
cfg = copy.deepcopy(cfg)
|
321 |
+
|
322 |
+
multi_label = cfg.multi_label
|
323 |
+
multi_label &= self.num_classes > 1
|
324 |
+
cfg.multi_label = multi_label
|
325 |
+
|
326 |
+
num_imgs = len(batch_img_metas)
|
327 |
+
featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores]
|
328 |
+
|
329 |
+
# If the shape does not change, use the previous mlvl_priors
|
330 |
+
if featmap_sizes != self.featmap_sizes:
|
331 |
+
self.mlvl_priors = self.prior_generator.grid_priors(
|
332 |
+
featmap_sizes,
|
333 |
+
dtype=cls_scores[0].dtype,
|
334 |
+
device=cls_scores[0].device)
|
335 |
+
self.featmap_sizes = featmap_sizes
|
336 |
+
flatten_priors = torch.cat(self.mlvl_priors)
|
337 |
+
|
338 |
+
mlvl_strides = [
|
339 |
+
flatten_priors.new_full(
|
340 |
+
(featmap_size.numel() * self.num_base_priors, ), stride) for
|
341 |
+
featmap_size, stride in zip(featmap_sizes, self.featmap_strides)
|
342 |
+
]
|
343 |
+
flatten_stride = torch.cat(mlvl_strides)
|
344 |
+
|
345 |
+
# flatten cls_scores, bbox_preds and objectness
|
346 |
+
flatten_cls_scores = [
|
347 |
+
cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1,
|
348 |
+
self.num_classes)
|
349 |
+
for cls_score in cls_scores
|
350 |
+
]
|
351 |
+
flatten_bbox_preds = [
|
352 |
+
bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4)
|
353 |
+
for bbox_pred in bbox_preds
|
354 |
+
]
|
355 |
+
|
356 |
+
flatten_cls_scores = torch.cat(flatten_cls_scores, dim=1).sigmoid()
|
357 |
+
flatten_bbox_preds = torch.cat(flatten_bbox_preds, dim=1)
|
358 |
+
flatten_decoded_bboxes = self.bbox_coder.decode(
|
359 |
+
flatten_priors[None], flatten_bbox_preds, flatten_stride)
|
360 |
+
|
361 |
+
if with_objectnesses:
|
362 |
+
flatten_objectness = [
|
363 |
+
objectness.permute(0, 2, 3, 1).reshape(num_imgs, -1)
|
364 |
+
for objectness in objectnesses
|
365 |
+
]
|
366 |
+
flatten_objectness = torch.cat(flatten_objectness, dim=1).sigmoid()
|
367 |
+
else:
|
368 |
+
flatten_objectness = [None for _ in range(num_imgs)]
|
369 |
+
|
370 |
+
results_list = []
|
371 |
+
for (bboxes, scores, objectness,
|
372 |
+
img_meta) in zip(flatten_decoded_bboxes, flatten_cls_scores,
|
373 |
+
flatten_objectness, batch_img_metas):
|
374 |
+
ori_shape = img_meta['ori_shape']
|
375 |
+
scale_factor = img_meta['scale_factor']
|
376 |
+
if 'pad_param' in img_meta:
|
377 |
+
pad_param = img_meta['pad_param']
|
378 |
+
else:
|
379 |
+
pad_param = None
|
380 |
+
|
381 |
+
score_thr = cfg.get('score_thr', -1)
|
382 |
+
# yolox_style does not require the following operations
|
383 |
+
if objectness is not None and score_thr > 0 and not cfg.get(
|
384 |
+
'yolox_style', False):
|
385 |
+
conf_inds = objectness > score_thr
|
386 |
+
bboxes = bboxes[conf_inds, :]
|
387 |
+
scores = scores[conf_inds, :]
|
388 |
+
objectness = objectness[conf_inds]
|
389 |
+
|
390 |
+
if objectness is not None:
|
391 |
+
# conf = obj_conf * cls_conf
|
392 |
+
scores *= objectness[:, None]
|
393 |
+
|
394 |
+
if scores.shape[0] == 0:
|
395 |
+
empty_results = InstanceData()
|
396 |
+
empty_results.bboxes = bboxes
|
397 |
+
empty_results.scores = scores[:, 0]
|
398 |
+
empty_results.labels = scores[:, 0].int()
|
399 |
+
results_list.append(empty_results)
|
400 |
+
continue
|
401 |
+
|
402 |
+
nms_pre = cfg.get('nms_pre', 100000)
|
403 |
+
if cfg.multi_label is False:
|
404 |
+
scores, labels = scores.max(1, keepdim=True)
|
405 |
+
scores, _, keep_idxs, results = filter_scores_and_topk(
|
406 |
+
scores,
|
407 |
+
score_thr,
|
408 |
+
nms_pre,
|
409 |
+
results=dict(labels=labels[:, 0]))
|
410 |
+
labels = results['labels']
|
411 |
+
else:
|
412 |
+
scores, labels, keep_idxs, _ = filter_scores_and_topk(
|
413 |
+
scores, score_thr, nms_pre)
|
414 |
+
|
415 |
+
results = InstanceData(
|
416 |
+
scores=scores, labels=labels, bboxes=bboxes[keep_idxs])
|
417 |
+
|
418 |
+
if rescale:
|
419 |
+
if pad_param is not None:
|
420 |
+
results.bboxes -= results.bboxes.new_tensor([
|
421 |
+
pad_param[2], pad_param[0], pad_param[2], pad_param[0]
|
422 |
+
])
|
423 |
+
results.bboxes /= results.bboxes.new_tensor(
|
424 |
+
scale_factor).repeat((1, 2))
|
425 |
+
|
426 |
+
if cfg.get('yolox_style', False):
|
427 |
+
# do not need max_per_img
|
428 |
+
cfg.max_per_img = len(results)
|
429 |
+
|
430 |
+
results = self._bbox_post_process(
|
431 |
+
results=results,
|
432 |
+
cfg=cfg,
|
433 |
+
rescale=False,
|
434 |
+
with_nms=with_nms,
|
435 |
+
img_meta=img_meta)
|
436 |
+
results.bboxes[:, 0::2].clamp_(0, ori_shape[1])
|
437 |
+
results.bboxes[:, 1::2].clamp_(0, ori_shape[0])
|
438 |
+
|
439 |
+
results_list.append(results)
|
440 |
+
return results_list
|
441 |
+
|
442 |
+
def loss(self, x: Tuple[Tensor], batch_data_samples: Union[list,
|
443 |
+
dict]) -> dict:
|
444 |
+
"""Perform forward propagation and loss calculation of the detection
|
445 |
+
head on the features of the upstream network.
|
446 |
+
|
447 |
+
Args:
|
448 |
+
x (tuple[Tensor]): Features from the upstream network, each is
|
449 |
+
a 4D-tensor.
|
450 |
+
batch_data_samples (List[:obj:`DetDataSample`], dict): The Data
|
451 |
+
Samples. It usually includes information such as
|
452 |
+
`gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
|
453 |
+
|
454 |
+
Returns:
|
455 |
+
dict: A dictionary of loss components.
|
456 |
+
"""
|
457 |
+
|
458 |
+
if isinstance(batch_data_samples, list):
|
459 |
+
losses = super().loss(x, batch_data_samples)
|
460 |
+
else:
|
461 |
+
outs = self(x)
|
462 |
+
# Fast version
|
463 |
+
loss_inputs = outs + (batch_data_samples['bboxes_labels'],
|
464 |
+
batch_data_samples['img_metas'])
|
465 |
+
losses = self.loss_by_feat(*loss_inputs)
|
466 |
+
|
467 |
+
return losses
|
468 |
+
|
469 |
+
def loss_by_feat(
|
470 |
+
self,
|
471 |
+
cls_scores: Sequence[Tensor],
|
472 |
+
bbox_preds: Sequence[Tensor],
|
473 |
+
objectnesses: Sequence[Tensor],
|
474 |
+
batch_gt_instances: Sequence[InstanceData],
|
475 |
+
batch_img_metas: Sequence[dict],
|
476 |
+
batch_gt_instances_ignore: OptInstanceList = None) -> dict:
|
477 |
+
"""Calculate the loss based on the features extracted by the detection
|
478 |
+
head.
|
479 |
+
|
480 |
+
Args:
|
481 |
+
cls_scores (Sequence[Tensor]): Box scores for each scale level,
|
482 |
+
each is a 4D-tensor, the channel number is
|
483 |
+
num_priors * num_classes.
|
484 |
+
bbox_preds (Sequence[Tensor]): Box energies / deltas for each scale
|
485 |
+
level, each is a 4D-tensor, the channel number is
|
486 |
+
num_priors * 4.
|
487 |
+
objectnesses (Sequence[Tensor]): Score factor for
|
488 |
+
all scale level, each is a 4D-tensor, has shape
|
489 |
+
(batch_size, 1, H, W).
|
490 |
+
batch_gt_instances (Sequence[InstanceData]): Batch of
|
491 |
+
gt_instance. It usually includes ``bboxes`` and ``labels``
|
492 |
+
attributes.
|
493 |
+
batch_img_metas (Sequence[dict]): Meta information of each image,
|
494 |
+
e.g., image size, scaling factor, etc.
|
495 |
+
batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
|
496 |
+
Batch of gt_instances_ignore. It includes ``bboxes`` attribute
|
497 |
+
data that is ignored during training and testing.
|
498 |
+
Defaults to None.
|
499 |
+
Returns:
|
500 |
+
dict[str, Tensor]: A dictionary of losses.
|
501 |
+
"""
|
502 |
+
if self.ignore_iof_thr != -1:
|
503 |
+
# TODO: Support fast version
|
504 |
+
# convert ignore gt
|
505 |
+
batch_target_ignore_list = []
|
506 |
+
for i, gt_instances_ignore in enumerate(batch_gt_instances_ignore):
|
507 |
+
bboxes = gt_instances_ignore.bboxes
|
508 |
+
labels = gt_instances_ignore.labels
|
509 |
+
index = bboxes.new_full((len(bboxes), 1), i)
|
510 |
+
# (batch_idx, label, bboxes)
|
511 |
+
target = torch.cat((index, labels[:, None].float(), bboxes),
|
512 |
+
dim=1)
|
513 |
+
batch_target_ignore_list.append(target)
|
514 |
+
|
515 |
+
# (num_bboxes, 6)
|
516 |
+
batch_gt_targets_ignore = torch.cat(
|
517 |
+
batch_target_ignore_list, dim=0)
|
518 |
+
if batch_gt_targets_ignore.shape[0] != 0:
|
519 |
+
# Consider regions with ignore in annotations
|
520 |
+
return self._loss_by_feat_with_ignore(
|
521 |
+
cls_scores,
|
522 |
+
bbox_preds,
|
523 |
+
objectnesses,
|
524 |
+
batch_gt_instances=batch_gt_instances,
|
525 |
+
batch_img_metas=batch_img_metas,
|
526 |
+
batch_gt_instances_ignore=batch_gt_targets_ignore)
|
527 |
+
|
528 |
+
# 1. Convert gt to norm format
|
529 |
+
batch_targets_normed = self._convert_gt_to_norm_format(
|
530 |
+
batch_gt_instances, batch_img_metas)
|
531 |
+
|
532 |
+
device = cls_scores[0].device
|
533 |
+
loss_cls = torch.zeros(1, device=device)
|
534 |
+
loss_box = torch.zeros(1, device=device)
|
535 |
+
loss_obj = torch.zeros(1, device=device)
|
536 |
+
scaled_factor = torch.ones(7, device=device)
|
537 |
+
|
538 |
+
for i in range(self.num_levels):
|
539 |
+
batch_size, _, h, w = bbox_preds[i].shape
|
540 |
+
target_obj = torch.zeros_like(objectnesses[i])
|
541 |
+
|
542 |
+
# empty gt bboxes
|
543 |
+
if batch_targets_normed.shape[1] == 0:
|
544 |
+
loss_box += bbox_preds[i].sum() * 0
|
545 |
+
loss_cls += cls_scores[i].sum() * 0
|
546 |
+
loss_obj += self.loss_obj(
|
547 |
+
objectnesses[i], target_obj) * self.obj_level_weights[i]
|
548 |
+
continue
|
549 |
+
|
550 |
+
priors_base_sizes_i = self.priors_base_sizes[i]
|
551 |
+
# feature map scale whwh
|
552 |
+
scaled_factor[2:6] = torch.tensor(
|
553 |
+
bbox_preds[i].shape)[[3, 2, 3, 2]]
|
554 |
+
# Scale batch_targets from range 0-1 to range 0-features_maps size.
|
555 |
+
# (num_base_priors, num_bboxes, 7)
|
556 |
+
batch_targets_scaled = batch_targets_normed * scaled_factor
|
557 |
+
|
558 |
+
# 2. Shape match
|
559 |
+
wh_ratio = batch_targets_scaled[...,
|
560 |
+
4:6] / priors_base_sizes_i[:, None]
|
561 |
+
match_inds = torch.max(
|
562 |
+
wh_ratio, 1 / wh_ratio).max(2)[0] < self.prior_match_thr
|
563 |
+
batch_targets_scaled = batch_targets_scaled[match_inds]
|
564 |
+
|
565 |
+
# no gt bbox matches anchor
|
566 |
+
if batch_targets_scaled.shape[0] == 0:
|
567 |
+
loss_box += bbox_preds[i].sum() * 0
|
568 |
+
loss_cls += cls_scores[i].sum() * 0
|
569 |
+
loss_obj += self.loss_obj(
|
570 |
+
objectnesses[i], target_obj) * self.obj_level_weights[i]
|
571 |
+
continue
|
572 |
+
|
573 |
+
# 3. Positive samples with additional neighbors
|
574 |
+
|
575 |
+
# check the left, up, right, bottom sides of the
|
576 |
+
# targets grid, and determine whether assigned
|
577 |
+
# them as positive samples as well.
|
578 |
+
batch_targets_cxcy = batch_targets_scaled[:, 2:4]
|
579 |
+
grid_xy = scaled_factor[[2, 3]] - batch_targets_cxcy
|
580 |
+
left, up = ((batch_targets_cxcy % 1 < self.near_neighbor_thr) &
|
581 |
+
(batch_targets_cxcy > 1)).T
|
582 |
+
right, bottom = ((grid_xy % 1 < self.near_neighbor_thr) &
|
583 |
+
(grid_xy > 1)).T
|
584 |
+
offset_inds = torch.stack(
|
585 |
+
(torch.ones_like(left), left, up, right, bottom))
|
586 |
+
|
587 |
+
batch_targets_scaled = batch_targets_scaled.repeat(
|
588 |
+
(5, 1, 1))[offset_inds]
|
589 |
+
retained_offsets = self.grid_offset.repeat(1, offset_inds.shape[1],
|
590 |
+
1)[offset_inds]
|
591 |
+
|
592 |
+
# prepare pred results and positive sample indexes to
|
593 |
+
# calculate class loss and bbox loss
|
594 |
+
_chunk_targets = batch_targets_scaled.chunk(4, 1)
|
595 |
+
img_class_inds, grid_xy, grid_wh, priors_inds = _chunk_targets
|
596 |
+
priors_inds, (img_inds, class_inds) = priors_inds.long().view(
|
597 |
+
-1), img_class_inds.long().T
|
598 |
+
|
599 |
+
grid_xy_long = (grid_xy -
|
600 |
+
retained_offsets * self.near_neighbor_thr).long()
|
601 |
+
grid_x_inds, grid_y_inds = grid_xy_long.T
|
602 |
+
bboxes_targets = torch.cat((grid_xy - grid_xy_long, grid_wh), 1)
|
603 |
+
|
604 |
+
# 4. Calculate loss
|
605 |
+
# bbox loss
|
606 |
+
retained_bbox_pred = bbox_preds[i].reshape(
|
607 |
+
batch_size, self.num_base_priors, -1, h,
|
608 |
+
w)[img_inds, priors_inds, :, grid_y_inds, grid_x_inds]
|
609 |
+
priors_base_sizes_i = priors_base_sizes_i[priors_inds]
|
610 |
+
decoded_bbox_pred = self._decode_bbox_to_xywh(
|
611 |
+
retained_bbox_pred, priors_base_sizes_i)
|
612 |
+
loss_box_i, iou = self.loss_bbox(decoded_bbox_pred, bboxes_targets)
|
613 |
+
loss_box += loss_box_i
|
614 |
+
|
615 |
+
# obj loss
|
616 |
+
iou = iou.detach().clamp(0)
|
617 |
+
target_obj[img_inds, priors_inds, grid_y_inds,
|
618 |
+
grid_x_inds] = iou.type(target_obj.dtype)
|
619 |
+
loss_obj += self.loss_obj(objectnesses[i],
|
620 |
+
target_obj) * self.obj_level_weights[i]
|
621 |
+
|
622 |
+
# cls loss
|
623 |
+
if self.num_classes > 1:
|
624 |
+
pred_cls_scores = cls_scores[i].reshape(
|
625 |
+
batch_size, self.num_base_priors, -1, h,
|
626 |
+
w)[img_inds, priors_inds, :, grid_y_inds, grid_x_inds]
|
627 |
+
|
628 |
+
target_class = torch.full_like(pred_cls_scores, 0.)
|
629 |
+
target_class[range(batch_targets_scaled.shape[0]),
|
630 |
+
class_inds] = 1.
|
631 |
+
loss_cls += self.loss_cls(pred_cls_scores, target_class)
|
632 |
+
else:
|
633 |
+
loss_cls += cls_scores[i].sum() * 0
|
634 |
+
|
635 |
+
_, world_size = get_dist_info()
|
636 |
+
return dict(
|
637 |
+
loss_cls=loss_cls * batch_size * world_size,
|
638 |
+
loss_obj=loss_obj * batch_size * world_size,
|
639 |
+
loss_bbox=loss_box * batch_size * world_size)
|
640 |
+
|
641 |
+
def _convert_gt_to_norm_format(self,
|
642 |
+
batch_gt_instances: Sequence[InstanceData],
|
643 |
+
batch_img_metas: Sequence[dict]) -> Tensor:
|
644 |
+
if isinstance(batch_gt_instances, torch.Tensor):
|
645 |
+
# fast version
|
646 |
+
img_shape = batch_img_metas[0]['batch_input_shape']
|
647 |
+
gt_bboxes_xyxy = batch_gt_instances[:, 2:]
|
648 |
+
xy1, xy2 = gt_bboxes_xyxy.split((2, 2), dim=-1)
|
649 |
+
gt_bboxes_xywh = torch.cat([(xy2 + xy1) / 2, (xy2 - xy1)], dim=-1)
|
650 |
+
gt_bboxes_xywh[:, 1::2] /= img_shape[0]
|
651 |
+
gt_bboxes_xywh[:, 0::2] /= img_shape[1]
|
652 |
+
batch_gt_instances[:, 2:] = gt_bboxes_xywh
|
653 |
+
|
654 |
+
# (num_base_priors, num_bboxes, 6)
|
655 |
+
batch_targets_normed = batch_gt_instances.repeat(
|
656 |
+
self.num_base_priors, 1, 1)
|
657 |
+
else:
|
658 |
+
batch_target_list = []
|
659 |
+
# Convert xyxy bbox to yolo format.
|
660 |
+
for i, gt_instances in enumerate(batch_gt_instances):
|
661 |
+
img_shape = batch_img_metas[i]['batch_input_shape']
|
662 |
+
bboxes = gt_instances.bboxes
|
663 |
+
labels = gt_instances.labels
|
664 |
+
|
665 |
+
xy1, xy2 = bboxes.split((2, 2), dim=-1)
|
666 |
+
bboxes = torch.cat([(xy2 + xy1) / 2, (xy2 - xy1)], dim=-1)
|
667 |
+
# normalized to 0-1
|
668 |
+
bboxes[:, 1::2] /= img_shape[0]
|
669 |
+
bboxes[:, 0::2] /= img_shape[1]
|
670 |
+
|
671 |
+
index = bboxes.new_full((len(bboxes), 1), i)
|
672 |
+
# (batch_idx, label, normed_bbox)
|
673 |
+
target = torch.cat((index, labels[:, None].float(), bboxes),
|
674 |
+
dim=1)
|
675 |
+
batch_target_list.append(target)
|
676 |
+
|
677 |
+
# (num_base_priors, num_bboxes, 6)
|
678 |
+
batch_targets_normed = torch.cat(
|
679 |
+
batch_target_list, dim=0).repeat(self.num_base_priors, 1, 1)
|
680 |
+
|
681 |
+
# (num_base_priors, num_bboxes, 1)
|
682 |
+
batch_targets_prior_inds = self.prior_inds.repeat(
|
683 |
+
1, batch_targets_normed.shape[1])[..., None]
|
684 |
+
# (num_base_priors, num_bboxes, 7)
|
685 |
+
# (img_ind, labels, bbox_cx, bbox_cy, bbox_w, bbox_h, prior_ind)
|
686 |
+
batch_targets_normed = torch.cat(
|
687 |
+
(batch_targets_normed, batch_targets_prior_inds), 2)
|
688 |
+
return batch_targets_normed
|
689 |
+
|
690 |
+
def _decode_bbox_to_xywh(self, bbox_pred, priors_base_sizes) -> Tensor:
|
691 |
+
bbox_pred = bbox_pred.sigmoid()
|
692 |
+
pred_xy = bbox_pred[:, :2] * 2 - 0.5
|
693 |
+
pred_wh = (bbox_pred[:, 2:] * 2)**2 * priors_base_sizes
|
694 |
+
decoded_bbox_pred = torch.cat((pred_xy, pred_wh), dim=-1)
|
695 |
+
return decoded_bbox_pred
|
696 |
+
|
697 |
+
def _loss_by_feat_with_ignore(
|
698 |
+
self, cls_scores: Sequence[Tensor], bbox_preds: Sequence[Tensor],
|
699 |
+
objectnesses: Sequence[Tensor],
|
700 |
+
batch_gt_instances: Sequence[InstanceData],
|
701 |
+
batch_img_metas: Sequence[dict],
|
702 |
+
batch_gt_instances_ignore: Sequence[Tensor]) -> dict:
|
703 |
+
"""Calculate the loss based on the features extracted by the detection
|
704 |
+
head.
|
705 |
+
|
706 |
+
Args:
|
707 |
+
cls_scores (Sequence[Tensor]): Box scores for each scale level,
|
708 |
+
each is a 4D-tensor, the channel number is
|
709 |
+
num_priors * num_classes.
|
710 |
+
bbox_preds (Sequence[Tensor]): Box energies / deltas for each scale
|
711 |
+
level, each is a 4D-tensor, the channel number is
|
712 |
+
num_priors * 4.
|
713 |
+
objectnesses (Sequence[Tensor]): Score factor for
|
714 |
+
all scale level, each is a 4D-tensor, has shape
|
715 |
+
(batch_size, 1, H, W).
|
716 |
+
batch_gt_instances (Sequence[InstanceData]): Batch of
|
717 |
+
gt_instance. It usually includes ``bboxes`` and ``labels``
|
718 |
+
attributes.
|
719 |
+
batch_img_metas (Sequence[dict]): Meta information of each image,
|
720 |
+
e.g., image size, scaling factor, etc.
|
721 |
+
batch_gt_instances_ignore (Sequence[Tensor]): Ignore boxes with
|
722 |
+
batch_ids and labels, each is a 2D-tensor, the channel number
|
723 |
+
is 6, means that (batch_id, label, xmin, ymin, xmax, ymax).
|
724 |
+
Returns:
|
725 |
+
dict[str, Tensor]: A dictionary of losses.
|
726 |
+
"""
|
727 |
+
# 1. Convert gt to norm format
|
728 |
+
batch_targets_normed = self._convert_gt_to_norm_format(
|
729 |
+
batch_gt_instances, batch_img_metas)
|
730 |
+
|
731 |
+
featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores]
|
732 |
+
if featmap_sizes != self.featmap_sizes:
|
733 |
+
self.mlvl_priors = self.prior_generator.grid_priors(
|
734 |
+
featmap_sizes,
|
735 |
+
dtype=cls_scores[0].dtype,
|
736 |
+
device=cls_scores[0].device)
|
737 |
+
self.featmap_sizes = featmap_sizes
|
738 |
+
|
739 |
+
device = cls_scores[0].device
|
740 |
+
loss_cls = torch.zeros(1, device=device)
|
741 |
+
loss_box = torch.zeros(1, device=device)
|
742 |
+
loss_obj = torch.zeros(1, device=device)
|
743 |
+
scaled_factor = torch.ones(7, device=device)
|
744 |
+
|
745 |
+
for i in range(self.num_levels):
|
746 |
+
batch_size, _, h, w = bbox_preds[i].shape
|
747 |
+
target_obj = torch.zeros_like(objectnesses[i])
|
748 |
+
|
749 |
+
not_ignore_flags = bbox_preds[i].new_ones(batch_size,
|
750 |
+
self.num_base_priors, h,
|
751 |
+
w)
|
752 |
+
|
753 |
+
ignore_overlaps = bbox_overlaps(self.mlvl_priors[i],
|
754 |
+
batch_gt_instances_ignore[..., 2:],
|
755 |
+
'iof')
|
756 |
+
ignore_max_overlaps, ignore_max_ignore_index = ignore_overlaps.max(
|
757 |
+
dim=1)
|
758 |
+
|
759 |
+
batch_inds = batch_gt_instances_ignore[:,
|
760 |
+
0][ignore_max_ignore_index]
|
761 |
+
ignore_inds = (ignore_max_overlaps > self.ignore_iof_thr).nonzero(
|
762 |
+
as_tuple=True)[0]
|
763 |
+
batch_inds = batch_inds[ignore_inds].long()
|
764 |
+
ignore_priors, ignore_grid_xs, ignore_grid_ys = get_prior_xy_info(
|
765 |
+
ignore_inds, self.num_base_priors, self.featmap_sizes[i])
|
766 |
+
not_ignore_flags[batch_inds, ignore_priors, ignore_grid_ys,
|
767 |
+
ignore_grid_xs] = 0
|
768 |
+
|
769 |
+
# empty gt bboxes
|
770 |
+
if batch_targets_normed.shape[1] == 0:
|
771 |
+
loss_box += bbox_preds[i].sum() * 0
|
772 |
+
loss_cls += cls_scores[i].sum() * 0
|
773 |
+
loss_obj += self.loss_obj(
|
774 |
+
objectnesses[i],
|
775 |
+
target_obj,
|
776 |
+
weight=not_ignore_flags,
|
777 |
+
avg_factor=max(not_ignore_flags.sum(),
|
778 |
+
1)) * self.obj_level_weights[i]
|
779 |
+
continue
|
780 |
+
|
781 |
+
priors_base_sizes_i = self.priors_base_sizes[i]
|
782 |
+
# feature map scale whwh
|
783 |
+
scaled_factor[2:6] = torch.tensor(
|
784 |
+
bbox_preds[i].shape)[[3, 2, 3, 2]]
|
785 |
+
# Scale batch_targets from range 0-1 to range 0-features_maps size.
|
786 |
+
# (num_base_priors, num_bboxes, 7)
|
787 |
+
batch_targets_scaled = batch_targets_normed * scaled_factor
|
788 |
+
|
789 |
+
# 2. Shape match
|
790 |
+
wh_ratio = batch_targets_scaled[...,
|
791 |
+
4:6] / priors_base_sizes_i[:, None]
|
792 |
+
match_inds = torch.max(
|
793 |
+
wh_ratio, 1 / wh_ratio).max(2)[0] < self.prior_match_thr
|
794 |
+
batch_targets_scaled = batch_targets_scaled[match_inds]
|
795 |
+
|
796 |
+
# no gt bbox matches anchor
|
797 |
+
if batch_targets_scaled.shape[0] == 0:
|
798 |
+
loss_box += bbox_preds[i].sum() * 0
|
799 |
+
loss_cls += cls_scores[i].sum() * 0
|
800 |
+
loss_obj += self.loss_obj(
|
801 |
+
objectnesses[i],
|
802 |
+
target_obj,
|
803 |
+
weight=not_ignore_flags,
|
804 |
+
avg_factor=max(not_ignore_flags.sum(),
|
805 |
+
1)) * self.obj_level_weights[i]
|
806 |
+
continue
|
807 |
+
|
808 |
+
# 3. Positive samples with additional neighbors
|
809 |
+
|
810 |
+
# check the left, up, right, bottom sides of the
|
811 |
+
# targets grid, and determine whether assigned
|
812 |
+
# them as positive samples as well.
|
813 |
+
batch_targets_cxcy = batch_targets_scaled[:, 2:4]
|
814 |
+
grid_xy = scaled_factor[[2, 3]] - batch_targets_cxcy
|
815 |
+
left, up = ((batch_targets_cxcy % 1 < self.near_neighbor_thr) &
|
816 |
+
(batch_targets_cxcy > 1)).T
|
817 |
+
right, bottom = ((grid_xy % 1 < self.near_neighbor_thr) &
|
818 |
+
(grid_xy > 1)).T
|
819 |
+
offset_inds = torch.stack(
|
820 |
+
(torch.ones_like(left), left, up, right, bottom))
|
821 |
+
|
822 |
+
batch_targets_scaled = batch_targets_scaled.repeat(
|
823 |
+
(5, 1, 1))[offset_inds]
|
824 |
+
retained_offsets = self.grid_offset.repeat(1, offset_inds.shape[1],
|
825 |
+
1)[offset_inds]
|
826 |
+
|
827 |
+
# prepare pred results and positive sample indexes to
|
828 |
+
# calculate class loss and bbox loss
|
829 |
+
_chunk_targets = batch_targets_scaled.chunk(4, 1)
|
830 |
+
img_class_inds, grid_xy, grid_wh, priors_inds = _chunk_targets
|
831 |
+
priors_inds, (img_inds, class_inds) = priors_inds.long().view(
|
832 |
+
-1), img_class_inds.long().T
|
833 |
+
|
834 |
+
grid_xy_long = (grid_xy -
|
835 |
+
retained_offsets * self.near_neighbor_thr).long()
|
836 |
+
grid_x_inds, grid_y_inds = grid_xy_long.T
|
837 |
+
bboxes_targets = torch.cat((grid_xy - grid_xy_long, grid_wh), 1)
|
838 |
+
|
839 |
+
# 4. Calculate loss
|
840 |
+
# bbox loss
|
841 |
+
retained_bbox_pred = bbox_preds[i].reshape(
|
842 |
+
batch_size, self.num_base_priors, -1, h,
|
843 |
+
w)[img_inds, priors_inds, :, grid_y_inds, grid_x_inds]
|
844 |
+
priors_base_sizes_i = priors_base_sizes_i[priors_inds]
|
845 |
+
decoded_bbox_pred = self._decode_bbox_to_xywh(
|
846 |
+
retained_bbox_pred, priors_base_sizes_i)
|
847 |
+
|
848 |
+
not_ignore_weights = not_ignore_flags[img_inds, priors_inds,
|
849 |
+
grid_y_inds, grid_x_inds]
|
850 |
+
loss_box_i, iou = self.loss_bbox(
|
851 |
+
decoded_bbox_pred,
|
852 |
+
bboxes_targets,
|
853 |
+
weight=not_ignore_weights,
|
854 |
+
avg_factor=max(not_ignore_weights.sum(), 1))
|
855 |
+
loss_box += loss_box_i
|
856 |
+
|
857 |
+
# obj loss
|
858 |
+
iou = iou.detach().clamp(0)
|
859 |
+
target_obj[img_inds, priors_inds, grid_y_inds,
|
860 |
+
grid_x_inds] = iou.type(target_obj.dtype)
|
861 |
+
loss_obj += self.loss_obj(
|
862 |
+
objectnesses[i],
|
863 |
+
target_obj,
|
864 |
+
weight=not_ignore_flags,
|
865 |
+
avg_factor=max(not_ignore_flags.sum(),
|
866 |
+
1)) * self.obj_level_weights[i]
|
867 |
+
|
868 |
+
# cls loss
|
869 |
+
if self.num_classes > 1:
|
870 |
+
pred_cls_scores = cls_scores[i].reshape(
|
871 |
+
batch_size, self.num_base_priors, -1, h,
|
872 |
+
w)[img_inds, priors_inds, :, grid_y_inds, grid_x_inds]
|
873 |
+
|
874 |
+
target_class = torch.full_like(pred_cls_scores, 0.)
|
875 |
+
target_class[range(batch_targets_scaled.shape[0]),
|
876 |
+
class_inds] = 1.
|
877 |
+
loss_cls += self.loss_cls(
|
878 |
+
pred_cls_scores,
|
879 |
+
target_class,
|
880 |
+
weight=not_ignore_weights[:, None].repeat(
|
881 |
+
1, self.num_classes),
|
882 |
+
avg_factor=max(not_ignore_weights.sum(), 1))
|
883 |
+
else:
|
884 |
+
loss_cls += cls_scores[i].sum() * 0
|
885 |
+
|
886 |
+
_, world_size = get_dist_info()
|
887 |
+
return dict(
|
888 |
+
loss_cls=loss_cls * batch_size * world_size,
|
889 |
+
loss_obj=loss_obj * batch_size * world_size,
|
890 |
+
loss_bbox=loss_box * batch_size * world_size)
|
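The "shape match" step above is easiest to see with toy numbers. The following is a minimal standalone sketch (not part of mmyolo): a gt box only keeps an anchor whose width and height each differ from the gt size by less than `prior_match_thr` in either direction.

import torch

def shape_match(gt_wh: torch.Tensor, prior_wh: torch.Tensor,
                prior_match_thr: float = 4.0) -> torch.Tensor:
    """Return a (num_priors, num_gt) bool mask of anchor/gt pairs whose
    width and height ratios both stay below ``prior_match_thr``."""
    # (num_priors, num_gt, 2): per-pair w/h ratios of gt size over anchor size
    wh_ratio = gt_wh[None, :, :] / prior_wh[:, None, :]
    worst_ratio = torch.max(wh_ratio, 1 / wh_ratio).max(dim=2)[0]
    return worst_ratio < prior_match_thr

gt_wh = torch.tensor([[6.0, 4.0], [40.0, 3.0]])      # two toy gt boxes (feat scale)
prior_wh = torch.tensor([[8.0, 8.0], [32.0, 16.0]])  # two toy anchor base sizes
print(shape_match(gt_wh, prior_wh))  # tensor([[ True, False], [False, False]])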
mmyolo/models/dense_heads/yolov6_head.py
ADDED
@@ -0,0 +1,369 @@
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Sequence, Tuple, Union

import torch
import torch.nn as nn
from mmcv.cnn import ConvModule
from mmdet.models.utils import multi_apply
from mmdet.utils import (ConfigType, OptConfigType, OptInstanceList,
                         OptMultiConfig)
from mmengine import MessageHub
from mmengine.dist import get_dist_info
from mmengine.model import BaseModule, bias_init_with_prob
from mmengine.structures import InstanceData
from torch import Tensor

from mmyolo.registry import MODELS, TASK_UTILS
from ..utils import gt_instances_preprocess
from .yolov5_head import YOLOv5Head


@MODELS.register_module()
class YOLOv6HeadModule(BaseModule):
    """YOLOv6Head head module used in `YOLOv6.

    <https://arxiv.org/pdf/2209.02976>`_.

    Args:
        num_classes (int): Number of categories excluding the background
            category.
        in_channels (Union[int, Sequence]): Number of channels in the input
            feature map.
        widen_factor (float): Width multiplier, multiply number of
            channels in each layer by this amount. Defaults to 1.0.
        num_base_priors (int): The number of priors (points) at a point
            on the feature grid.
        featmap_strides (Sequence[int]): Downsample factor of each feature map.
            Defaults to [8, 16, 32].
        norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization
            layer. Defaults to dict(type='BN', momentum=0.03, eps=0.001).
        act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer.
            Defaults to None.
        init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or
            list[dict], optional): Initialization config dict.
            Defaults to None.
    """

    def __init__(self,
                 num_classes: int,
                 in_channels: Union[int, Sequence],
                 widen_factor: float = 1.0,
                 num_base_priors: int = 1,
                 featmap_strides: Sequence[int] = (8, 16, 32),
                 norm_cfg: ConfigType = dict(
                     type='BN', momentum=0.03, eps=0.001),
                 act_cfg: ConfigType = dict(type='SiLU', inplace=True),
                 init_cfg: OptMultiConfig = None):
        super().__init__(init_cfg=init_cfg)

        self.num_classes = num_classes
        self.featmap_strides = featmap_strides
        self.num_levels = len(self.featmap_strides)
        self.num_base_priors = num_base_priors
        self.norm_cfg = norm_cfg
        self.act_cfg = act_cfg

        if isinstance(in_channels, int):
            self.in_channels = [int(in_channels * widen_factor)
                                ] * self.num_levels
        else:
            self.in_channels = [int(i * widen_factor) for i in in_channels]

        self._init_layers()

    def _init_layers(self):
        """initialize conv layers in YOLOv6 head."""
        # Init decouple head
        self.cls_convs = nn.ModuleList()
        self.reg_convs = nn.ModuleList()
        self.cls_preds = nn.ModuleList()
        self.reg_preds = nn.ModuleList()
        self.stems = nn.ModuleList()
        for i in range(self.num_levels):
            self.stems.append(
                ConvModule(
                    in_channels=self.in_channels[i],
                    out_channels=self.in_channels[i],
                    kernel_size=1,
                    stride=1,
                    padding=1 // 2,
                    norm_cfg=self.norm_cfg,
                    act_cfg=self.act_cfg))
            self.cls_convs.append(
                ConvModule(
                    in_channels=self.in_channels[i],
                    out_channels=self.in_channels[i],
                    kernel_size=3,
                    stride=1,
                    padding=3 // 2,
                    norm_cfg=self.norm_cfg,
                    act_cfg=self.act_cfg))
            self.reg_convs.append(
                ConvModule(
                    in_channels=self.in_channels[i],
                    out_channels=self.in_channels[i],
                    kernel_size=3,
                    stride=1,
                    padding=3 // 2,
                    norm_cfg=self.norm_cfg,
                    act_cfg=self.act_cfg))
            self.cls_preds.append(
                nn.Conv2d(
                    in_channels=self.in_channels[i],
                    out_channels=self.num_base_priors * self.num_classes,
                    kernel_size=1))
            self.reg_preds.append(
                nn.Conv2d(
                    in_channels=self.in_channels[i],
                    out_channels=self.num_base_priors * 4,
                    kernel_size=1))

    def init_weights(self):
        super().init_weights()
        bias_init = bias_init_with_prob(0.01)
        for conv in self.cls_preds:
            conv.bias.data.fill_(bias_init)
            conv.weight.data.fill_(0.)

        for conv in self.reg_preds:
            conv.bias.data.fill_(1.0)
            conv.weight.data.fill_(0.)

    def forward(self, x: Tuple[Tensor]) -> Tuple[List]:
        """Forward features from the upstream network.

        Args:
            x (Tuple[Tensor]): Features from the upstream network, each is
                a 4D-tensor.
        Returns:
            Tuple[List]: A tuple of multi-level classification scores, bbox
            predictions.
        """
        assert len(x) == self.num_levels
        return multi_apply(self.forward_single, x, self.stems, self.cls_convs,
                           self.cls_preds, self.reg_convs, self.reg_preds)

    def forward_single(self, x: Tensor, stem: nn.Module, cls_conv: nn.Module,
                       cls_pred: nn.Module, reg_conv: nn.Module,
                       reg_pred: nn.Module) -> Tuple[Tensor, Tensor]:
        """Forward feature of a single scale level."""
        y = stem(x)
        cls_x = y
        reg_x = y
        cls_feat = cls_conv(cls_x)
        reg_feat = reg_conv(reg_x)

        cls_score = cls_pred(cls_feat)
        bbox_pred = reg_pred(reg_feat)

        return cls_score, bbox_pred


@MODELS.register_module()
class YOLOv6Head(YOLOv5Head):
    """YOLOv6Head head used in `YOLOv6 <https://arxiv.org/pdf/2209.02976>`_.

    Args:
        head_module(ConfigType): Base module used for YOLOv6Head
        prior_generator(dict): Points generator feature maps
            in 2D points-based detectors.
        loss_cls (:obj:`ConfigDict` or dict): Config of classification loss.
        loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss.
        train_cfg (:obj:`ConfigDict` or dict, optional): Training config of
            anchor head. Defaults to None.
        test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of
            anchor head. Defaults to None.
        init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or
            list[dict], optional): Initialization config dict.
            Defaults to None.
    """

    def __init__(self,
                 head_module: ConfigType,
                 prior_generator: ConfigType = dict(
                     type='mmdet.MlvlPointGenerator',
                     offset=0.5,
                     strides=[8, 16, 32]),
                 bbox_coder: ConfigType = dict(type='DistancePointBBoxCoder'),
                 loss_cls: ConfigType = dict(
                     type='mmdet.VarifocalLoss',
                     use_sigmoid=True,
                     alpha=0.75,
                     gamma=2.0,
                     iou_weighted=True,
                     reduction='sum',
                     loss_weight=1.0),
                 loss_bbox: ConfigType = dict(
                     type='IoULoss',
                     iou_mode='giou',
                     bbox_format='xyxy',
                     reduction='mean',
                     loss_weight=2.5,
                     return_iou=False),
                 train_cfg: OptConfigType = None,
                 test_cfg: OptConfigType = None,
                 init_cfg: OptMultiConfig = None):
        super().__init__(
            head_module=head_module,
            prior_generator=prior_generator,
            bbox_coder=bbox_coder,
            loss_cls=loss_cls,
            loss_bbox=loss_bbox,
            train_cfg=train_cfg,
            test_cfg=test_cfg,
            init_cfg=init_cfg)
        # yolov6 doesn't need loss_obj
        self.loss_obj = None

    def special_init(self):
        """Since YOLO series algorithms will inherit from YOLOv5Head, but
        different algorithms have special initialization process.

        The special_init function is designed to deal with this situation.
        """
        if self.train_cfg:
            self.initial_epoch = self.train_cfg['initial_epoch']
            self.initial_assigner = TASK_UTILS.build(
                self.train_cfg.initial_assigner)
            self.assigner = TASK_UTILS.build(self.train_cfg.assigner)

        # Add common attributes to reduce calculation
        self.featmap_sizes_train = None
        self.num_level_priors = None
        self.flatten_priors_train = None
        self.stride_tensor = None

    def loss_by_feat(
            self,
            cls_scores: Sequence[Tensor],
            bbox_preds: Sequence[Tensor],
            batch_gt_instances: Sequence[InstanceData],
            batch_img_metas: Sequence[dict],
            batch_gt_instances_ignore: OptInstanceList = None) -> dict:
        """Calculate the loss based on the features extracted by the detection
        head.

        Args:
            cls_scores (Sequence[Tensor]): Box scores for each scale level,
                each is a 4D-tensor, the channel number is
                num_priors * num_classes.
            bbox_preds (Sequence[Tensor]): Box energies / deltas for each scale
                level, each is a 4D-tensor, the channel number is
                num_priors * 4.
            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
                gt_instance. It usually includes ``bboxes`` and ``labels``
                attributes.
            batch_img_metas (list[dict]): Meta information of each image, e.g.,
                image size, scaling factor, etc.
            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
                data that is ignored during training and testing.
                Defaults to None.
        Returns:
            dict[str, Tensor]: A dictionary of losses.
        """

        # get epoch information from message hub
        message_hub = MessageHub.get_current_instance()
        current_epoch = message_hub.get_info('epoch')

        num_imgs = len(batch_img_metas)
        if batch_gt_instances_ignore is None:
            batch_gt_instances_ignore = [None] * num_imgs

        current_featmap_sizes = [
            cls_score.shape[2:] for cls_score in cls_scores
        ]
        # If the shape does not equal, generate new one
        if current_featmap_sizes != self.featmap_sizes_train:
            self.featmap_sizes_train = current_featmap_sizes

            mlvl_priors_with_stride = self.prior_generator.grid_priors(
                self.featmap_sizes_train,
                dtype=cls_scores[0].dtype,
                device=cls_scores[0].device,
                with_stride=True)

            self.num_level_priors = [len(n) for n in mlvl_priors_with_stride]
            self.flatten_priors_train = torch.cat(
                mlvl_priors_with_stride, dim=0)
            self.stride_tensor = self.flatten_priors_train[..., [2]]

        # gt info
        gt_info = gt_instances_preprocess(batch_gt_instances, num_imgs)
        gt_labels = gt_info[:, :, :1]
        gt_bboxes = gt_info[:, :, 1:]  # xyxy
        pad_bbox_flag = (gt_bboxes.sum(-1, keepdim=True) > 0).float()

        # pred info
        flatten_cls_preds = [
            cls_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1,
                                                 self.num_classes)
            for cls_pred in cls_scores
        ]

        flatten_pred_bboxes = [
            bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4)
            for bbox_pred in bbox_preds
        ]

        flatten_cls_preds = torch.cat(flatten_cls_preds, dim=1)
        flatten_pred_bboxes = torch.cat(flatten_pred_bboxes, dim=1)
        flatten_pred_bboxes = self.bbox_coder.decode(
            self.flatten_priors_train[..., :2], flatten_pred_bboxes,
            self.stride_tensor[:, 0])
        pred_scores = torch.sigmoid(flatten_cls_preds)

        if current_epoch < self.initial_epoch:
            assigned_result = self.initial_assigner(
                flatten_pred_bboxes.detach(), self.flatten_priors_train,
                self.num_level_priors, gt_labels, gt_bboxes, pad_bbox_flag)
        else:
            assigned_result = self.assigner(flatten_pred_bboxes.detach(),
                                            pred_scores.detach(),
                                            self.flatten_priors_train,
                                            gt_labels, gt_bboxes,
                                            pad_bbox_flag)

        assigned_bboxes = assigned_result['assigned_bboxes']
        assigned_scores = assigned_result['assigned_scores']
        fg_mask_pre_prior = assigned_result['fg_mask_pre_prior']

        # cls loss
        with torch.cuda.amp.autocast(enabled=False):
            loss_cls = self.loss_cls(flatten_cls_preds, assigned_scores)

        # rescale bbox
        assigned_bboxes /= self.stride_tensor
        flatten_pred_bboxes /= self.stride_tensor

        # TODO: Add all_reduce makes training more stable
        assigned_scores_sum = assigned_scores.sum()
        if assigned_scores_sum > 0:
            loss_cls /= assigned_scores_sum

        # select positive samples mask
        num_pos = fg_mask_pre_prior.sum()
        if num_pos > 0:
            # when num_pos > 0, assigned_scores_sum will >0, so the loss_bbox
            # will not report an error
            # iou loss
            prior_bbox_mask = fg_mask_pre_prior.unsqueeze(-1).repeat([1, 1, 4])
            pred_bboxes_pos = torch.masked_select(
                flatten_pred_bboxes, prior_bbox_mask).reshape([-1, 4])
            assigned_bboxes_pos = torch.masked_select(
                assigned_bboxes, prior_bbox_mask).reshape([-1, 4])
            bbox_weight = torch.masked_select(
                assigned_scores.sum(-1), fg_mask_pre_prior).unsqueeze(-1)
            loss_bbox = self.loss_bbox(
                pred_bboxes_pos,
                assigned_bboxes_pos,
                weight=bbox_weight,
                avg_factor=assigned_scores_sum)
        else:
            loss_bbox = flatten_pred_bboxes.sum() * 0

        _, world_size = get_dist_info()
        return dict(
            loss_cls=loss_cls * world_size, loss_bbox=loss_bbox * world_size)
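A rough usage sketch for the head module defined above (not part of the repository; it assumes mmyolo, mmdet and mmcv are importable and uses made-up channel numbers): instantiate YOLOv6HeadModule and run it on dummy FPN features to inspect the per-level output shapes.

import torch
from mmyolo.models.dense_heads.yolov6_head import YOLOv6HeadModule

head = YOLOv6HeadModule(
    num_classes=80,
    in_channels=[128, 256, 512],   # hypothetical neck channels
    featmap_strides=(8, 16, 32))
feats = [torch.rand(1, c, 640 // s, 640 // s)
         for c, s in zip([128, 256, 512], (8, 16, 32))]
cls_scores, bbox_preds = head(feats)
# one num_classes-channel score map and one 4-channel box map per level,
# e.g. (1, 80, 80, 80), (1, 80, 40, 40), (1, 80, 20, 20) for a 640 input
print([t.shape for t in cls_scores])
print([t.shape for t in bbox_preds])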
mmyolo/models/dense_heads/yolov7_head.py
ADDED
@@ -0,0 +1,404 @@
# Copyright (c) OpenMMLab. All rights reserved.
import math
from typing import List, Optional, Sequence, Tuple, Union

import torch
import torch.nn as nn
from mmcv.cnn import ConvModule
from mmdet.models.utils import multi_apply
from mmdet.utils import ConfigType, OptInstanceList
from mmengine.dist import get_dist_info
from mmengine.structures import InstanceData
from torch import Tensor

from mmyolo.registry import MODELS
from ..layers import ImplicitA, ImplicitM
from ..task_modules.assigners.batch_yolov7_assigner import BatchYOLOv7Assigner
from .yolov5_head import YOLOv5Head, YOLOv5HeadModule


@MODELS.register_module()
class YOLOv7HeadModule(YOLOv5HeadModule):
    """YOLOv7Head head module used in YOLOv7."""

    def _init_layers(self):
        """initialize conv layers in YOLOv7 head."""
        self.convs_pred = nn.ModuleList()
        for i in range(self.num_levels):
            conv_pred = nn.Sequential(
                ImplicitA(self.in_channels[i]),
                nn.Conv2d(self.in_channels[i],
                          self.num_base_priors * self.num_out_attrib, 1),
                ImplicitM(self.num_base_priors * self.num_out_attrib),
            )
            self.convs_pred.append(conv_pred)

    def init_weights(self):
        """Initialize the bias of YOLOv7 head."""
        super(YOLOv5HeadModule, self).init_weights()
        for mi, s in zip(self.convs_pred, self.featmap_strides):  # from
            mi = mi[1]  # nn.Conv2d

            b = mi.bias.data.view(3, -1)
            # obj (8 objects per 640 image)
            b.data[:, 4] += math.log(8 / (640 / s)**2)
            b.data[:, 5:] += math.log(0.6 / (self.num_classes - 0.99))

            mi.bias.data = b.view(-1)


@MODELS.register_module()
class YOLOv7p6HeadModule(YOLOv5HeadModule):
    """YOLOv7Head head module used in YOLOv7."""

    def __init__(self,
                 *args,
                 main_out_channels: Sequence[int] = [256, 512, 768, 1024],
                 aux_out_channels: Sequence[int] = [320, 640, 960, 1280],
                 use_aux: bool = True,
                 norm_cfg: ConfigType = dict(
                     type='BN', momentum=0.03, eps=0.001),
                 act_cfg: ConfigType = dict(type='SiLU', inplace=True),
                 **kwargs):
        self.main_out_channels = main_out_channels
        self.aux_out_channels = aux_out_channels
        self.use_aux = use_aux
        self.norm_cfg = norm_cfg
        self.act_cfg = act_cfg
        super().__init__(*args, **kwargs)

    def _init_layers(self):
        """initialize conv layers in YOLOv7 head."""
        self.main_convs_pred = nn.ModuleList()
        for i in range(self.num_levels):
            conv_pred = nn.Sequential(
                ConvModule(
                    self.in_channels[i],
                    self.main_out_channels[i],
                    3,
                    padding=1,
                    norm_cfg=self.norm_cfg,
                    act_cfg=self.act_cfg),
                ImplicitA(self.main_out_channels[i]),
                nn.Conv2d(self.main_out_channels[i],
                          self.num_base_priors * self.num_out_attrib, 1),
                ImplicitM(self.num_base_priors * self.num_out_attrib),
            )
            self.main_convs_pred.append(conv_pred)

        if self.use_aux:
            self.aux_convs_pred = nn.ModuleList()
            for i in range(self.num_levels):
                aux_pred = nn.Sequential(
                    ConvModule(
                        self.in_channels[i],
                        self.aux_out_channels[i],
                        3,
                        padding=1,
                        norm_cfg=self.norm_cfg,
                        act_cfg=self.act_cfg),
                    nn.Conv2d(self.aux_out_channels[i],
                              self.num_base_priors * self.num_out_attrib, 1))
                self.aux_convs_pred.append(aux_pred)
        else:
            self.aux_convs_pred = [None] * len(self.main_convs_pred)

    def init_weights(self):
        """Initialize the bias of YOLOv5 head."""
        super(YOLOv5HeadModule, self).init_weights()
        for mi, aux, s in zip(self.main_convs_pred, self.aux_convs_pred,
                              self.featmap_strides):  # from
            mi = mi[2]  # nn.Conv2d
            b = mi.bias.data.view(3, -1)
            # obj (8 objects per 640 image)
            b.data[:, 4] += math.log(8 / (640 / s)**2)
            b.data[:, 5:] += math.log(0.6 / (self.num_classes - 0.99))
            mi.bias.data = b.view(-1)

            if self.use_aux:
                aux = aux[1]  # nn.Conv2d
                b = aux.bias.data.view(3, -1)
                # obj (8 objects per 640 image)
                b.data[:, 4] += math.log(8 / (640 / s)**2)
                b.data[:, 5:] += math.log(0.6 / (self.num_classes - 0.99))
                mi.bias.data = b.view(-1)

    def forward(self, x: Tuple[Tensor]) -> Tuple[List]:
        """Forward features from the upstream network.

        Args:
            x (Tuple[Tensor]): Features from the upstream network, each is
                a 4D-tensor.
        Returns:
            Tuple[List]: A tuple of multi-level classification scores, bbox
            predictions, and objectnesses.
        """
        assert len(x) == self.num_levels
        return multi_apply(self.forward_single, x, self.main_convs_pred,
                           self.aux_convs_pred)

    def forward_single(self, x: Tensor, convs: nn.Module,
                       aux_convs: Optional[nn.Module]) \
            -> Tuple[Union[Tensor, List], Union[Tensor, List],
                     Union[Tensor, List]]:
        """Forward feature of a single scale level."""

        pred_map = convs(x)
        bs, _, ny, nx = pred_map.shape
        pred_map = pred_map.view(bs, self.num_base_priors,
                                 self.num_out_attrib, ny, nx)

        cls_score = pred_map[:, :, 5:, ...].reshape(bs, -1, ny, nx)
        bbox_pred = pred_map[:, :, :4, ...].reshape(bs, -1, ny, nx)
        objectness = pred_map[:, :, 4:5, ...].reshape(bs, -1, ny, nx)

        if not self.training or not self.use_aux:
            return cls_score, bbox_pred, objectness
        else:
            aux_pred_map = aux_convs(x)
            aux_pred_map = aux_pred_map.view(bs, self.num_base_priors,
                                             self.num_out_attrib, ny, nx)
            aux_cls_score = aux_pred_map[:, :, 5:, ...].reshape(bs, -1, ny, nx)
            aux_bbox_pred = aux_pred_map[:, :, :4, ...].reshape(bs, -1, ny, nx)
            aux_objectness = aux_pred_map[:, :, 4:5,
                                          ...].reshape(bs, -1, ny, nx)

            return [cls_score,
                    aux_cls_score], [bbox_pred, aux_bbox_pred
                                     ], [objectness, aux_objectness]


@MODELS.register_module()
class YOLOv7Head(YOLOv5Head):
    """YOLOv7Head head used in `YOLOv7 <https://arxiv.org/abs/2207.02696>`_.

    Args:
        simota_candidate_topk (int): The candidate top-k which used to
            get top-k ious to calculate dynamic-k in BatchYOLOv7Assigner.
            Defaults to 10.
        simota_iou_weight (float): The scale factor for regression
            iou cost in BatchYOLOv7Assigner. Defaults to 3.0.
        simota_cls_weight (float): The scale factor for classification
            cost in BatchYOLOv7Assigner. Defaults to 1.0.
    """

    def __init__(self,
                 *args,
                 simota_candidate_topk: int = 20,
                 simota_iou_weight: float = 3.0,
                 simota_cls_weight: float = 1.0,
                 aux_loss_weights: float = 0.25,
                 **kwargs):
        super().__init__(*args, **kwargs)
        self.aux_loss_weights = aux_loss_weights
        self.assigner = BatchYOLOv7Assigner(
            num_classes=self.num_classes,
            num_base_priors=self.num_base_priors,
            featmap_strides=self.featmap_strides,
            prior_match_thr=self.prior_match_thr,
            candidate_topk=simota_candidate_topk,
            iou_weight=simota_iou_weight,
            cls_weight=simota_cls_weight)

    def loss_by_feat(
            self,
            cls_scores: Sequence[Union[Tensor, List]],
            bbox_preds: Sequence[Union[Tensor, List]],
            objectnesses: Sequence[Union[Tensor, List]],
            batch_gt_instances: Sequence[InstanceData],
            batch_img_metas: Sequence[dict],
            batch_gt_instances_ignore: OptInstanceList = None) -> dict:
        """Calculate the loss based on the features extracted by the detection
        head.

        Args:
            cls_scores (Sequence[Tensor]): Box scores for each scale level,
                each is a 4D-tensor, the channel number is
                num_priors * num_classes.
            bbox_preds (Sequence[Tensor]): Box energies / deltas for each scale
                level, each is a 4D-tensor, the channel number is
                num_priors * 4.
            objectnesses (Sequence[Tensor]): Score factor for
                all scale level, each is a 4D-tensor, has shape
                (batch_size, 1, H, W).
            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
                gt_instance. It usually includes ``bboxes`` and ``labels``
                attributes.
            batch_img_metas (list[dict]): Meta information of each image, e.g.,
                image size, scaling factor, etc.
            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
                data that is ignored during training and testing.
                Defaults to None.
        Returns:
            dict[str, Tensor]: A dictionary of losses.
        """

        if isinstance(cls_scores[0], Sequence):
            with_aux = True
            batch_size = cls_scores[0][0].shape[0]
            device = cls_scores[0][0].device

            bbox_preds_main, bbox_preds_aux = zip(*bbox_preds)
            objectnesses_main, objectnesses_aux = zip(*objectnesses)
            cls_scores_main, cls_scores_aux = zip(*cls_scores)

            head_preds = self._merge_predict_results(bbox_preds_main,
                                                     objectnesses_main,
                                                     cls_scores_main)
            head_preds_aux = self._merge_predict_results(
                bbox_preds_aux, objectnesses_aux, cls_scores_aux)
        else:
            with_aux = False
            batch_size = cls_scores[0].shape[0]
            device = cls_scores[0].device

            head_preds = self._merge_predict_results(bbox_preds, objectnesses,
                                                     cls_scores)

        # Convert gt to norm xywh format
        # (num_base_priors, num_batch_gt, 7)
        # 7 is mean (batch_idx, cls_id, x_norm, y_norm,
        # w_norm, h_norm, prior_idx)
        batch_targets_normed = self._convert_gt_to_norm_format(
            batch_gt_instances, batch_img_metas)

        scaled_factors = [
            torch.tensor(head_pred.shape, device=device)[[3, 2, 3, 2]]
            for head_pred in head_preds
        ]

        loss_cls, loss_obj, loss_box = self._calc_loss(
            head_preds=head_preds,
            head_preds_aux=None,
            batch_targets_normed=batch_targets_normed,
            near_neighbor_thr=self.near_neighbor_thr,
            scaled_factors=scaled_factors,
            batch_img_metas=batch_img_metas,
            device=device)

        if with_aux:
            loss_cls_aux, loss_obj_aux, loss_box_aux = self._calc_loss(
                head_preds=head_preds,
                head_preds_aux=head_preds_aux,
                batch_targets_normed=batch_targets_normed,
                near_neighbor_thr=self.near_neighbor_thr * 2,
                scaled_factors=scaled_factors,
                batch_img_metas=batch_img_metas,
                device=device)
            loss_cls += self.aux_loss_weights * loss_cls_aux
            loss_obj += self.aux_loss_weights * loss_obj_aux
            loss_box += self.aux_loss_weights * loss_box_aux

        _, world_size = get_dist_info()
        return dict(
            loss_cls=loss_cls * batch_size * world_size,
            loss_obj=loss_obj * batch_size * world_size,
            loss_bbox=loss_box * batch_size * world_size)

    def _calc_loss(self, head_preds, head_preds_aux, batch_targets_normed,
                   near_neighbor_thr, scaled_factors, batch_img_metas, device):
        loss_cls = torch.zeros(1, device=device)
        loss_box = torch.zeros(1, device=device)
        loss_obj = torch.zeros(1, device=device)

        assigner_results = self.assigner(
            head_preds,
            batch_targets_normed,
            batch_img_metas[0]['batch_input_shape'],
            self.priors_base_sizes,
            self.grid_offset,
            near_neighbor_thr=near_neighbor_thr)
        # mlvl is mean multi_level
        mlvl_positive_infos = assigner_results['mlvl_positive_infos']
        mlvl_priors = assigner_results['mlvl_priors']
        mlvl_targets_normed = assigner_results['mlvl_targets_normed']

        if head_preds_aux is not None:
            # This is mean calc aux branch loss
            head_preds = head_preds_aux

        for i, head_pred in enumerate(head_preds):
            batch_inds, proir_idx, grid_x, grid_y = mlvl_positive_infos[i].T
            num_pred_positive = batch_inds.shape[0]
            target_obj = torch.zeros_like(head_pred[..., 0])
            # empty positive sampler
            if num_pred_positive == 0:
                loss_box += head_pred[..., :4].sum() * 0
                loss_cls += head_pred[..., 5:].sum() * 0
                loss_obj += self.loss_obj(
                    head_pred[..., 4], target_obj) * self.obj_level_weights[i]
                continue

            priors = mlvl_priors[i]
            targets_normed = mlvl_targets_normed[i]

            head_pred_positive = head_pred[batch_inds, proir_idx, grid_y,
                                           grid_x]

            # calc bbox loss
            grid_xy = torch.stack([grid_x, grid_y], dim=1)
            decoded_pred_bbox = self._decode_bbox_to_xywh(
                head_pred_positive[:, :4], priors, grid_xy)
            target_bbox_scaled = targets_normed[:, 2:6] * scaled_factors[i]

            loss_box_i, iou = self.loss_bbox(decoded_pred_bbox,
                                             target_bbox_scaled)
            loss_box += loss_box_i

            # calc obj loss
            target_obj[batch_inds, proir_idx, grid_y,
                       grid_x] = iou.detach().clamp(0).type(target_obj.dtype)
            loss_obj += self.loss_obj(head_pred[..., 4],
                                      target_obj) * self.obj_level_weights[i]

            # calc cls loss
            if self.num_classes > 1:
                pred_cls_scores = targets_normed[:, 1].long()
                target_class = torch.full_like(
                    head_pred_positive[:, 5:], 0., device=device)
                target_class[range(num_pred_positive), pred_cls_scores] = 1.
                loss_cls += self.loss_cls(head_pred_positive[:, 5:],
                                          target_class)
            else:
                loss_cls += head_pred_positive[:, 5:].sum() * 0
        return loss_cls, loss_obj, loss_box

    def _merge_predict_results(self, bbox_preds: Sequence[Tensor],
                               objectnesses: Sequence[Tensor],
                               cls_scores: Sequence[Tensor]) -> List[Tensor]:
        """Merge predict output from 3 heads.

        Args:
            cls_scores (Sequence[Tensor]): Box scores for each scale level,
                each is a 4D-tensor, the channel number is
                num_priors * num_classes.
            bbox_preds (Sequence[Tensor]): Box energies / deltas for each scale
                level, each is a 4D-tensor, the channel number is
                num_priors * 4.
            objectnesses (Sequence[Tensor]): Score factor for
                all scale level, each is a 4D-tensor, has shape
                (batch_size, 1, H, W).

        Returns:
            List[Tensor]: Merged output.
        """
        head_preds = []
        for bbox_pred, objectness, cls_score in zip(bbox_preds, objectnesses,
                                                    cls_scores):
            b, _, h, w = bbox_pred.shape
            bbox_pred = bbox_pred.reshape(b, self.num_base_priors, -1, h, w)
            objectness = objectness.reshape(b, self.num_base_priors, -1, h, w)
            cls_score = cls_score.reshape(b, self.num_base_priors, -1, h, w)
            head_pred = torch.cat([bbox_pred, objectness, cls_score],
                                  dim=2).permute(0, 1, 3, 4, 2).contiguous()
            head_preds.append(head_pred)
        return head_preds

    def _decode_bbox_to_xywh(self, bbox_pred, priors_base_sizes,
                             grid_xy) -> Tensor:
        bbox_pred = bbox_pred.sigmoid()
        pred_xy = bbox_pred[:, :2] * 2 - 0.5 + grid_xy
        pred_wh = (bbox_pred[:, 2:] * 2)**2 * priors_base_sizes
        decoded_bbox_pred = torch.cat((pred_xy, pred_wh), dim=-1)
        return decoded_bbox_pred
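A minimal numeric sketch (not mmyolo code) of the decode rule used in _decode_bbox_to_xywh above: raw predictions pass through a sigmoid, the centre lands in (-0.5, 1.5) around its grid cell, and the width/height in (0, 4) times the matched prior size, all on the feature-map scale.

import torch

raw = torch.tensor([[0.0, 0.0, 0.0, 0.0]])    # one positive raw prediction
grid_xy = torch.tensor([[10.0, 7.0]])          # grid cell of that sample
prior_wh = torch.tensor([[16.0, 32.0]])        # matched anchor base size

pred = raw.sigmoid()                           # sigmoid(0) = 0.5 everywhere
pred_xy = pred[:, :2] * 2 - 0.5 + grid_xy      # [[10.5, 7.5]]
pred_wh = (pred[:, 2:] * 2) ** 2 * prior_wh    # [[16.0, 32.0]]
print(torch.cat((pred_xy, pred_wh), dim=-1))   # cx, cy, w, h on the feat scale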
mmyolo/models/dense_heads/yolov8_head.py
ADDED
@@ -0,0 +1,398 @@
# Copyright (c) OpenMMLab. All rights reserved.
import math
from typing import List, Sequence, Tuple, Union

import torch
import torch.nn as nn
from mmcv.cnn import ConvModule
from mmdet.models.utils import multi_apply
from mmdet.utils import (ConfigType, OptConfigType, OptInstanceList,
                         OptMultiConfig)
from mmengine.dist import get_dist_info
from mmengine.model import BaseModule
from mmengine.structures import InstanceData
from torch import Tensor

from mmyolo.registry import MODELS, TASK_UTILS
from ..utils import gt_instances_preprocess, make_divisible
from .yolov5_head import YOLOv5Head


@MODELS.register_module()
class YOLOv8HeadModule(BaseModule):
    """YOLOv8HeadModule head module used in `YOLOv8`.

    Args:
        num_classes (int): Number of categories excluding the background
            category.
        in_channels (Union[int, Sequence]): Number of channels in the input
            feature map.
        widen_factor (float): Width multiplier, multiply number of
            channels in each layer by this amount. Defaults to 1.0.
        num_base_priors (int): The number of priors (points) at a point
            on the feature grid.
        featmap_strides (Sequence[int]): Downsample factor of each feature map.
            Defaults to [8, 16, 32].
        reg_max (int): Max value of integral set :math: ``{0, ..., reg_max-1}``
            in QFL setting. Defaults to 16.
        norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization
            layer. Defaults to dict(type='BN', momentum=0.03, eps=0.001).
        act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer.
            Defaults to None.
        init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or
            list[dict], optional): Initialization config dict.
            Defaults to None.
    """

    def __init__(self,
                 num_classes: int,
                 in_channels: Union[int, Sequence],
                 widen_factor: float = 1.0,
                 num_base_priors: int = 1,
                 featmap_strides: Sequence[int] = (8, 16, 32),
                 reg_max: int = 16,
                 norm_cfg: ConfigType = dict(
                     type='BN', momentum=0.03, eps=0.001),
                 act_cfg: ConfigType = dict(type='SiLU', inplace=True),
                 init_cfg: OptMultiConfig = None):
        super().__init__(init_cfg=init_cfg)
        self.num_classes = num_classes
        self.featmap_strides = featmap_strides
        self.num_levels = len(self.featmap_strides)
        self.num_base_priors = num_base_priors
        self.norm_cfg = norm_cfg
        self.act_cfg = act_cfg
        self.in_channels = in_channels
        self.reg_max = reg_max

        in_channels = []
        for channel in self.in_channels:
            channel = make_divisible(channel, widen_factor)
            in_channels.append(channel)
        self.in_channels = in_channels

        self._init_layers()

    def init_weights(self, prior_prob=0.01):
        """Initialize the weight and bias of the YOLOv8 head."""
        super().init_weights()
        for reg_pred, cls_pred, stride in zip(self.reg_preds, self.cls_preds,
                                              self.featmap_strides):
            reg_pred[-1].bias.data[:] = 1.0  # box
            # cls (.01 objects, 80 classes, 640 img)
            cls_pred[-1].bias.data[:self.num_classes] = math.log(
                5 / self.num_classes / (640 / stride)**2)

    def _init_layers(self):
        """initialize conv layers in YOLOv8 head."""
        # Init decouple head
        self.cls_preds = nn.ModuleList()
        self.reg_preds = nn.ModuleList()

        reg_out_channels = max(
            (16, self.in_channels[0] // 4, self.reg_max * 4))
        cls_out_channels = max(self.in_channels[0], self.num_classes)

        for i in range(self.num_levels):
            self.reg_preds.append(
                nn.Sequential(
                    ConvModule(
                        in_channels=self.in_channels[i],
                        out_channels=reg_out_channels,
                        kernel_size=3,
                        stride=1,
                        padding=1,
                        norm_cfg=self.norm_cfg,
                        act_cfg=self.act_cfg),
                    ConvModule(
                        in_channels=reg_out_channels,
                        out_channels=reg_out_channels,
                        kernel_size=3,
                        stride=1,
                        padding=1,
                        norm_cfg=self.norm_cfg,
                        act_cfg=self.act_cfg),
                    nn.Conv2d(
                        in_channels=reg_out_channels,
                        out_channels=4 * self.reg_max,
                        kernel_size=1)))
            self.cls_preds.append(
                nn.Sequential(
                    ConvModule(
                        in_channels=self.in_channels[i],
                        out_channels=cls_out_channels,
                        kernel_size=3,
                        stride=1,
                        padding=1,
                        norm_cfg=self.norm_cfg,
                        act_cfg=self.act_cfg),
                    ConvModule(
                        in_channels=cls_out_channels,
                        out_channels=cls_out_channels,
                        kernel_size=3,
                        stride=1,
                        padding=1,
                        norm_cfg=self.norm_cfg,
                        act_cfg=self.act_cfg),
                    nn.Conv2d(
                        in_channels=cls_out_channels,
                        out_channels=self.num_classes,
                        kernel_size=1)))

        proj = torch.arange(self.reg_max, dtype=torch.float)
        self.register_buffer('proj', proj, persistent=False)

    def forward(self, x: Tuple[Tensor]) -> Tuple[List]:
        """Forward features from the upstream network.

        Args:
            x (Tuple[Tensor]): Features from the upstream network, each is
                a 4D-tensor.
        Returns:
            Tuple[List]: A tuple of multi-level classification scores, bbox
            predictions
        """
        assert len(x) == self.num_levels
        return multi_apply(self.forward_single, x, self.cls_preds,
                           self.reg_preds)

    def forward_single(self, x: torch.Tensor, cls_pred: nn.ModuleList,
                       reg_pred: nn.ModuleList) -> Tuple:
        """Forward feature of a single scale level."""
        b, _, h, w = x.shape
        cls_logit = cls_pred(x)
        bbox_dist_preds = reg_pred(x)
        if self.reg_max > 1:
            bbox_dist_preds = bbox_dist_preds.reshape(
                [-1, 4, self.reg_max, h * w]).permute(0, 3, 1, 2)

            # TODO: The get_flops script cannot handle the situation of
            # matmul, and needs to be fixed later
            # bbox_preds = bbox_dist_preds.softmax(3).matmul(self.proj)
            bbox_preds = bbox_dist_preds.softmax(3).matmul(
                self.proj.view([-1, 1])).squeeze(-1)
            bbox_preds = bbox_preds.transpose(1, 2).reshape(b, -1, h, w)
        else:
            bbox_preds = bbox_dist_preds
        if self.training:
            return cls_logit, bbox_preds, bbox_dist_preds
        else:
            return cls_logit, bbox_preds


@MODELS.register_module()
class YOLOv8Head(YOLOv5Head):
    """YOLOv8Head head used in `YOLOv8`.

    Args:
        head_module(:obj:`ConfigDict` or dict): Base module used for YOLOv8Head
        prior_generator(dict): Points generator feature maps
            in 2D points-based detectors.
        bbox_coder (:obj:`ConfigDict` or dict): Config of bbox coder.
        loss_cls (:obj:`ConfigDict` or dict): Config of classification loss.
        loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss.
        loss_dfl (:obj:`ConfigDict` or dict): Config of Distribution Focal
            Loss.
        train_cfg (:obj:`ConfigDict` or dict, optional): Training config of
            anchor head. Defaults to None.
        test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of
            anchor head. Defaults to None.
        init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or
            list[dict], optional): Initialization config dict.
            Defaults to None.
    """

    def __init__(self,
                 head_module: ConfigType,
                 prior_generator: ConfigType = dict(
                     type='mmdet.MlvlPointGenerator',
                     offset=0.5,
                     strides=[8, 16, 32]),
                 bbox_coder: ConfigType = dict(type='DistancePointBBoxCoder'),
                 loss_cls: ConfigType = dict(
                     type='mmdet.CrossEntropyLoss',
                     use_sigmoid=True,
                     reduction='none',
                     loss_weight=0.5),
                 loss_bbox: ConfigType = dict(
                     type='IoULoss',
                     iou_mode='ciou',
                     bbox_format='xyxy',
                     reduction='sum',
                     loss_weight=7.5,
                     return_iou=False),
                 loss_dfl=dict(
                     type='mmdet.DistributionFocalLoss',
                     reduction='mean',
                     loss_weight=1.5 / 4),
                 train_cfg: OptConfigType = None,
                 test_cfg: OptConfigType = None,
                 init_cfg: OptMultiConfig = None):
        super().__init__(
            head_module=head_module,
            prior_generator=prior_generator,
            bbox_coder=bbox_coder,
            loss_cls=loss_cls,
            loss_bbox=loss_bbox,
            train_cfg=train_cfg,
            test_cfg=test_cfg,
            init_cfg=init_cfg)
        self.loss_dfl = MODELS.build(loss_dfl)
        # YOLOv8 doesn't need loss_obj
        self.loss_obj = None

    def special_init(self):
        """Since YOLO series algorithms will inherit from YOLOv5Head, but
        different algorithms have special initialization process.

        The special_init function is designed to deal with this situation.
        """

        if self.train_cfg:
            self.assigner = TASK_UTILS.build(self.train_cfg.assigner)

        # Add common attributes to reduce calculation
        self.featmap_sizes_train = None
        self.num_level_priors = None
        self.flatten_priors_train = None
        self.stride_tensor = None

    def loss_by_feat(
            self,
            cls_scores: Sequence[Tensor],
            bbox_preds: Sequence[Tensor],
            bbox_dist_preds: Sequence[Tensor],
            batch_gt_instances: Sequence[InstanceData],
            batch_img_metas: Sequence[dict],
            batch_gt_instances_ignore: OptInstanceList = None) -> dict:
        """Calculate the loss based on the features extracted by the detection
        head.

        Args:
            cls_scores (Sequence[Tensor]): Box scores for each scale level,
                each is a 4D-tensor, the channel number is
                num_priors * num_classes.
            bbox_preds (Sequence[Tensor]): Box energies / deltas for each scale
                level, each is a 4D-tensor, the channel number is
                num_priors * 4.
            bbox_dist_preds (Sequence[Tensor]): Box distribution logits for
                each scale level with shape (bs, reg_max + 1, H*W, 4).
            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
                gt_instance. It usually includes ``bboxes`` and ``labels``
                attributes.
            batch_img_metas (list[dict]): Meta information of each image, e.g.,
                image size, scaling factor, etc.
            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
                data that is ignored during training and testing.
                Defaults to None.
        Returns:
            dict[str, Tensor]: A dictionary of losses.
        """
        num_imgs = len(batch_img_metas)

        current_featmap_sizes = [
            cls_score.shape[2:] for cls_score in cls_scores
        ]
        # If the shape does not equal, generate new one
        if current_featmap_sizes != self.featmap_sizes_train:
            self.featmap_sizes_train = current_featmap_sizes

            mlvl_priors_with_stride = self.prior_generator.grid_priors(
                self.featmap_sizes_train,
                dtype=cls_scores[0].dtype,
                device=cls_scores[0].device,
                with_stride=True)

            self.num_level_priors = [len(n) for n in mlvl_priors_with_stride]
            self.flatten_priors_train = torch.cat(
                mlvl_priors_with_stride, dim=0)
            self.stride_tensor = self.flatten_priors_train[..., [2]]

        # gt info
        gt_info = gt_instances_preprocess(batch_gt_instances, num_imgs)
        gt_labels = gt_info[:, :, :1]
        gt_bboxes = gt_info[:, :, 1:]  # xyxy
        pad_bbox_flag = (gt_bboxes.sum(-1, keepdim=True) > 0).float()

        # pred info
        flatten_cls_preds = [
            cls_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1,
                                                 self.num_classes)
            for cls_pred in cls_scores
        ]
        flatten_pred_bboxes = [
            bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4)
            for bbox_pred in bbox_preds
        ]
        # (bs, n, 4 * reg_max)
        flatten_pred_dists = [
            bbox_pred_org.reshape(num_imgs, -1, self.head_module.reg_max * 4)
            for bbox_pred_org in bbox_dist_preds
        ]

        flatten_dist_preds = torch.cat(flatten_pred_dists, dim=1)
        flatten_cls_preds = torch.cat(flatten_cls_preds, dim=1)
        flatten_pred_bboxes = torch.cat(flatten_pred_bboxes, dim=1)
        flatten_pred_bboxes = self.bbox_coder.decode(
            self.flatten_priors_train[..., :2], flatten_pred_bboxes,
            self.stride_tensor[..., 0])

        assigned_result = self.assigner(
            (flatten_pred_bboxes.detach()).type(gt_bboxes.dtype),
            flatten_cls_preds.detach().sigmoid(), self.flatten_priors_train,
            gt_labels, gt_bboxes, pad_bbox_flag)

        assigned_bboxes = assigned_result['assigned_bboxes']
        assigned_scores = assigned_result['assigned_scores']
        fg_mask_pre_prior = assigned_result['fg_mask_pre_prior']

        assigned_scores_sum = assigned_scores.sum().clamp(min=1)

        loss_cls = self.loss_cls(flatten_cls_preds, assigned_scores).sum()
        loss_cls /= assigned_scores_sum

        # rescale bbox
        assigned_bboxes /= self.stride_tensor
        flatten_pred_bboxes /= self.stride_tensor

        # select positive samples mask
        num_pos = fg_mask_pre_prior.sum()
        if num_pos > 0:
            # when num_pos > 0, assigned_scores_sum will >0, so the loss_bbox
            # will not report an error
            # iou loss
            prior_bbox_mask = fg_mask_pre_prior.unsqueeze(-1).repeat([1, 1, 4])
            pred_bboxes_pos = torch.masked_select(
                flatten_pred_bboxes, prior_bbox_mask).reshape([-1, 4])
            assigned_bboxes_pos = torch.masked_select(
                assigned_bboxes, prior_bbox_mask).reshape([-1, 4])
            bbox_weight = torch.masked_select(
                assigned_scores.sum(-1), fg_mask_pre_prior).unsqueeze(-1)
            loss_bbox = self.loss_bbox(
                pred_bboxes_pos, assigned_bboxes_pos,
                weight=bbox_weight) / assigned_scores_sum

            # dfl loss
            pred_dist_pos = flatten_dist_preds[fg_mask_pre_prior]
            assigned_ltrb = self.bbox_coder.encode(
                self.flatten_priors_train[..., :2] / self.stride_tensor,
                assigned_bboxes,
                max_dis=self.head_module.reg_max - 1,
                eps=0.01)
            assigned_ltrb_pos = torch.masked_select(
                assigned_ltrb, prior_bbox_mask).reshape([-1, 4])
            loss_dfl = self.loss_dfl(
                pred_dist_pos.reshape(-1, self.head_module.reg_max),
                assigned_ltrb_pos.reshape(-1),
                weight=bbox_weight.expand(-1, 4).reshape(-1),
                avg_factor=assigned_scores_sum)
        else:
            loss_bbox = flatten_pred_bboxes.sum() * 0
            loss_dfl = flatten_pred_bboxes.sum() * 0
        _, world_size = get_dist_info()
        return dict(
            loss_cls=loss_cls * num_imgs * world_size,
            loss_bbox=loss_bbox * num_imgs * world_size,
            loss_dfl=loss_dfl * num_imgs * world_size)
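A minimal sketch (not mmyolo code) of the integral step performed in YOLOv8HeadModule.forward_single above: the per-side distribution logits over reg_max bins are turned into a single expected distance via softmax followed by a dot product with the `proj` buffer.

import torch

reg_max = 16
proj = torch.arange(reg_max, dtype=torch.float)   # bin centres 0..15
logits = torch.zeros(reg_max)                     # one side of one toy box
logits[3], logits[4] = 6.0, 6.0                   # mass peaked around bins 3 and 4
expected = logits.softmax(0) @ proj               # probability-weighted mean of bins
print(expected)  # ~3.58: between the two peaked bins, nudged up by the flat tail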
mmyolo/models/dense_heads/yolox_head.py
ADDED
@@ -0,0 +1,514 @@
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Optional, Sequence, Tuple, Union

import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule
from mmdet.models.task_modules.samplers import PseudoSampler
from mmdet.models.utils import multi_apply
from mmdet.structures.bbox import bbox_xyxy_to_cxcywh
from mmdet.utils import (ConfigType, OptConfigType, OptInstanceList,
                         OptMultiConfig, reduce_mean)
from mmengine.model import BaseModule, bias_init_with_prob
from mmengine.structures import InstanceData
from torch import Tensor

from mmyolo.registry import MODELS, TASK_UTILS
from .yolov5_head import YOLOv5Head


@MODELS.register_module()
class YOLOXHeadModule(BaseModule):
    """YOLOXHead head module used in `YOLOX.

    `<https://arxiv.org/abs/2107.08430>`_

    Args:
        num_classes (int): Number of categories excluding the background
            category.
        in_channels (Union[int, Sequence]): Number of channels in the input
            feature map.
        widen_factor (float): Width multiplier, multiply number of
            channels in each layer by this amount. Defaults to 1.0.
        num_base_priors (int): The number of priors (points) at a point
            on the feature grid.
        stacked_convs (int): Number of stacking convs of the head.
            Defaults to 2.
        featmap_strides (Sequence[int]): Downsample factor of each feature
            map. Defaults to [8, 16, 32].
        use_depthwise (bool): Whether to use depthwise separable convolution
            in blocks. Defaults to False.
        dcn_on_last_conv (bool): If true, use dcn in the last layer of
            towers. Defaults to False.
        conv_bias (bool or str): If specified as `auto`, it will be decided by
            the norm_cfg. Bias of conv will be set as True if `norm_cfg` is
            None, otherwise False. Defaults to "auto".
        conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for
            convolution layer. Defaults to None.
        norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization
            layer. Defaults to dict(type='BN', momentum=0.03, eps=0.001).
        act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer.
            Defaults to None.
        init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or
            list[dict], optional): Initialization config dict.
            Defaults to None.
    """

    def __init__(
        self,
        num_classes: int,
        in_channels: Union[int, Sequence],
        widen_factor: float = 1.0,
        num_base_priors: int = 1,
        feat_channels: int = 256,
        stacked_convs: int = 2,
        featmap_strides: Sequence[int] = [8, 16, 32],
        use_depthwise: bool = False,
        dcn_on_last_conv: bool = False,
        conv_bias: Union[bool, str] = 'auto',
        conv_cfg: OptConfigType = None,
        norm_cfg: ConfigType = dict(type='BN', momentum=0.03, eps=0.001),
        act_cfg: ConfigType = dict(type='SiLU', inplace=True),
        init_cfg: OptMultiConfig = None,
    ):
        super().__init__(init_cfg=init_cfg)
        self.num_classes = num_classes
        self.feat_channels = int(feat_channels * widen_factor)
        self.stacked_convs = stacked_convs
        self.use_depthwise = use_depthwise
        self.dcn_on_last_conv = dcn_on_last_conv
        assert conv_bias == 'auto' or isinstance(conv_bias, bool)
        self.conv_bias = conv_bias
        self.num_base_priors = num_base_priors

        self.conv_cfg = conv_cfg
        self.norm_cfg = norm_cfg
        self.act_cfg = act_cfg
        self.featmap_strides = featmap_strides

        if isinstance(in_channels, int):
            in_channels = int(in_channels * widen_factor)
        self.in_channels = in_channels

        self._init_layers()

    def _init_layers(self):
        """Initialize heads for all level feature maps."""
        self.multi_level_cls_convs = nn.ModuleList()
        self.multi_level_reg_convs = nn.ModuleList()
        self.multi_level_conv_cls = nn.ModuleList()
        self.multi_level_conv_reg = nn.ModuleList()
        self.multi_level_conv_obj = nn.ModuleList()
        for _ in self.featmap_strides:
            self.multi_level_cls_convs.append(self._build_stacked_convs())
            self.multi_level_reg_convs.append(self._build_stacked_convs())
            conv_cls, conv_reg, conv_obj = self._build_predictor()
            self.multi_level_conv_cls.append(conv_cls)
            self.multi_level_conv_reg.append(conv_reg)
            self.multi_level_conv_obj.append(conv_obj)

    def _build_stacked_convs(self) -> nn.Sequential:
        """Initialize conv layers of a single level head."""
        conv = DepthwiseSeparableConvModule \
            if self.use_depthwise else ConvModule
        stacked_convs = []
        for i in range(self.stacked_convs):
            chn = self.in_channels if i == 0 else self.feat_channels
            if self.dcn_on_last_conv and i == self.stacked_convs - 1:
                conv_cfg = dict(type='DCNv2')
            else:
                conv_cfg = self.conv_cfg
            stacked_convs.append(
                conv(
                    chn,
                    self.feat_channels,
                    3,
                    stride=1,
                    padding=1,
                    conv_cfg=conv_cfg,
                    norm_cfg=self.norm_cfg,
                    act_cfg=self.act_cfg,
                    bias=self.conv_bias))
        return nn.Sequential(*stacked_convs)

    def _build_predictor(self) -> Tuple[nn.Module, nn.Module, nn.Module]:
        """Initialize predictor layers of a single level head."""
        conv_cls = nn.Conv2d(self.feat_channels, self.num_classes, 1)
        conv_reg = nn.Conv2d(self.feat_channels, 4, 1)
        conv_obj = nn.Conv2d(self.feat_channels, 1, 1)
        return conv_cls, conv_reg, conv_obj

    def init_weights(self):
        """Initialize weights of the head."""
        # Use prior in model initialization to improve stability
        super().init_weights()
        bias_init = bias_init_with_prob(0.01)
        for conv_cls, conv_obj in zip(self.multi_level_conv_cls,
                                      self.multi_level_conv_obj):
            conv_cls.bias.data.fill_(bias_init)
            conv_obj.bias.data.fill_(bias_init)

    def forward(self, x: Tuple[Tensor]) -> Tuple[List]:
        """Forward features from the upstream network.

        Args:
            x (Tuple[Tensor]): Features from the upstream network, each is
                a 4D-tensor.
        Returns:
            Tuple[List]: A tuple of multi-level classification scores, bbox
            predictions, and objectnesses.
        """

        return multi_apply(self.forward_single, x, self.multi_level_cls_convs,
                           self.multi_level_reg_convs,
                           self.multi_level_conv_cls,
                           self.multi_level_conv_reg,
                           self.multi_level_conv_obj)

    def forward_single(self, x: Tensor, cls_convs: nn.Module,
                       reg_convs: nn.Module, conv_cls: nn.Module,
                       conv_reg: nn.Module,
                       conv_obj: nn.Module) -> Tuple[Tensor, Tensor, Tensor]:
        """Forward feature of a single scale level."""

        cls_feat = cls_convs(x)
        reg_feat = reg_convs(x)

        cls_score = conv_cls(cls_feat)
        bbox_pred = conv_reg(reg_feat)
        objectness = conv_obj(reg_feat)

        return cls_score, bbox_pred, objectness

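Editor's note: a quick, hedged usage sketch of the module above, assuming mmyolo and its mmcv/mmdet dependencies are installed; the image size and channel counts are made up for illustration.

import torch
from mmyolo.models.dense_heads.yolox_head import YOLOXHeadModule

head = YOLOXHeadModule(num_classes=80, in_channels=256, feat_channels=256)
# three dummy FPN levels with strides 8, 16 and 32 for a 640x640 input
feats = [torch.rand(1, 256, 640 // s, 640 // s) for s in (8, 16, 32)]
cls_scores, bbox_preds, objectnesses = head(feats)
# per-level class logits: (1, 80, 80, 80), (1, 80, 40, 40), (1, 80, 20, 20)
print([tuple(t.shape) for t in cls_scores])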
@MODELS.register_module()
class YOLOXHead(YOLOv5Head):
    """YOLOXHead head used in `YOLOX <https://arxiv.org/abs/2107.08430>`_.

    Args:
        head_module (ConfigType): Base module used for YOLOXHead.
        prior_generator: Points generator for feature maps in
            2D points-based detectors.
        loss_cls (:obj:`ConfigDict` or dict): Config of classification loss.
        loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss.
        loss_obj (:obj:`ConfigDict` or dict): Config of objectness loss.
        loss_bbox_aux (:obj:`ConfigDict` or dict): Config of bbox aux loss.
        train_cfg (:obj:`ConfigDict` or dict, optional): Training config of
            anchor head. Defaults to None.
        test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of
            anchor head. Defaults to None.
        init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or
            list[dict], optional): Initialization config dict.
            Defaults to None.
    """

    def __init__(self,
                 head_module: ConfigType,
                 prior_generator: ConfigType = dict(
                     type='mmdet.MlvlPointGenerator',
                     offset=0,
                     strides=[8, 16, 32]),
                 bbox_coder: ConfigType = dict(type='YOLOXBBoxCoder'),
                 loss_cls: ConfigType = dict(
                     type='mmdet.CrossEntropyLoss',
                     use_sigmoid=True,
                     reduction='sum',
                     loss_weight=1.0),
                 loss_bbox: ConfigType = dict(
                     type='mmdet.IoULoss',
                     mode='square',
                     eps=1e-16,
                     reduction='sum',
                     loss_weight=5.0),
                 loss_obj: ConfigType = dict(
                     type='mmdet.CrossEntropyLoss',
                     use_sigmoid=True,
                     reduction='sum',
                     loss_weight=1.0),
                 loss_bbox_aux: ConfigType = dict(
                     type='mmdet.L1Loss', reduction='sum', loss_weight=1.0),
                 train_cfg: OptConfigType = None,
                 test_cfg: OptConfigType = None,
                 init_cfg: OptMultiConfig = None):
        self.use_bbox_aux = False
        self.loss_bbox_aux = loss_bbox_aux

        super().__init__(
            head_module=head_module,
            prior_generator=prior_generator,
            bbox_coder=bbox_coder,
            loss_cls=loss_cls,
            loss_bbox=loss_bbox,
            loss_obj=loss_obj,
            train_cfg=train_cfg,
            test_cfg=test_cfg,
            init_cfg=init_cfg)

    def special_init(self):
        """YOLO series algorithms inherit from YOLOv5Head, but each algorithm
        has its own special initialization process.

        The special_init function is designed to deal with this situation.
        """
        self.loss_bbox_aux: nn.Module = MODELS.build(self.loss_bbox_aux)
        if self.train_cfg:
            self.assigner = TASK_UTILS.build(self.train_cfg.assigner)
            # YOLOX does not support sampling
            self.sampler = PseudoSampler()

    def forward(self, x: Tuple[Tensor]) -> Tuple[List]:
        return self.head_module(x)

    def loss_by_feat(
            self,
            cls_scores: Sequence[Tensor],
            bbox_preds: Sequence[Tensor],
            objectnesses: Sequence[Tensor],
            batch_gt_instances: Tensor,
            batch_img_metas: Sequence[dict],
            batch_gt_instances_ignore: OptInstanceList = None) -> dict:
        """Calculate the loss based on the features extracted by the detection
        head.

        Args:
            cls_scores (Sequence[Tensor]): Box scores for each scale level,
                each is a 4D-tensor, the channel number is
                num_priors * num_classes.
            bbox_preds (Sequence[Tensor]): Box energies / deltas for each
                scale level, each is a 4D-tensor, the channel number is
                num_priors * 4.
            objectnesses (Sequence[Tensor]): Score factor for
                all scale level, each is a 4D-tensor, has shape
                (batch_size, 1, H, W).
            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
                gt_instance. It usually includes ``bboxes`` and ``labels``
                attributes.
            batch_img_metas (list[dict]): Meta information of each image,
                e.g., image size, scaling factor, etc.
            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
                data that is ignored during training and testing.
                Defaults to None.
        Returns:
            dict[str, Tensor]: A dictionary of losses.
        """
        num_imgs = len(batch_img_metas)
        if batch_gt_instances_ignore is None:
            batch_gt_instances_ignore = [None] * num_imgs

        batch_gt_instances = self.gt_instances_preprocess(
            batch_gt_instances, len(batch_img_metas))

        featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores]
        mlvl_priors = self.prior_generator.grid_priors(
            featmap_sizes,
            dtype=cls_scores[0].dtype,
            device=cls_scores[0].device,
            with_stride=True)

        flatten_cls_preds = [
            cls_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1,
                                                 self.num_classes)
            for cls_pred in cls_scores
        ]
        flatten_bbox_preds = [
            bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4)
            for bbox_pred in bbox_preds
        ]
        flatten_objectness = [
            objectness.permute(0, 2, 3, 1).reshape(num_imgs, -1)
            for objectness in objectnesses
        ]

        flatten_cls_preds = torch.cat(flatten_cls_preds, dim=1)
        flatten_bbox_preds = torch.cat(flatten_bbox_preds, dim=1)
        flatten_objectness = torch.cat(flatten_objectness, dim=1)
        flatten_priors = torch.cat(mlvl_priors)
        flatten_bboxes = self.bbox_coder.decode(flatten_priors[..., :2],
                                                flatten_bbox_preds,
                                                flatten_priors[..., 2])

        (pos_masks, cls_targets, obj_targets, bbox_targets, bbox_aux_target,
         num_fg_imgs) = multi_apply(
             self._get_targets_single,
             flatten_priors.unsqueeze(0).repeat(num_imgs, 1, 1),
             flatten_cls_preds.detach(), flatten_bboxes.detach(),
             flatten_objectness.detach(), batch_gt_instances, batch_img_metas,
             batch_gt_instances_ignore)

        # The experimental results show that 'reduce_mean' can improve
        # performance on the COCO dataset.
        num_pos = torch.tensor(
            sum(num_fg_imgs),
            dtype=torch.float,
            device=flatten_cls_preds.device)
        num_total_samples = max(reduce_mean(num_pos), 1.0)

        pos_masks = torch.cat(pos_masks, 0)
        cls_targets = torch.cat(cls_targets, 0)
        obj_targets = torch.cat(obj_targets, 0)
        bbox_targets = torch.cat(bbox_targets, 0)
        if self.use_bbox_aux:
            bbox_aux_target = torch.cat(bbox_aux_target, 0)

        loss_obj = self.loss_obj(flatten_objectness.view(-1, 1),
                                 obj_targets) / num_total_samples
        if num_pos > 0:
            loss_cls = self.loss_cls(
                flatten_cls_preds.view(-1, self.num_classes)[pos_masks],
                cls_targets) / num_total_samples
            loss_bbox = self.loss_bbox(
                flatten_bboxes.view(-1, 4)[pos_masks],
                bbox_targets) / num_total_samples
        else:
            # Avoid cls and reg branch not participating in the gradient
            # propagation when there is no ground-truth in the images.
            # For more details, please refer to
            # https://github.com/open-mmlab/mmdetection/issues/7298
            loss_cls = flatten_cls_preds.sum() * 0
            loss_bbox = flatten_bboxes.sum() * 0

        loss_dict = dict(
            loss_cls=loss_cls, loss_bbox=loss_bbox, loss_obj=loss_obj)

        if self.use_bbox_aux:
            if num_pos > 0:
                loss_bbox_aux = self.loss_bbox_aux(
                    flatten_bbox_preds.view(-1, 4)[pos_masks],
                    bbox_aux_target) / num_total_samples
            else:
                # Avoid cls and reg branch not participating in the gradient
                # propagation when there is no ground-truth in the images.
                # For more details, please refer to
                # https://github.com/open-mmlab/mmdetection/issues/7298
                loss_bbox_aux = flatten_bbox_preds.sum() * 0
            loss_dict.update(loss_bbox_aux=loss_bbox_aux)

        return loss_dict

    @torch.no_grad()
    def _get_targets_single(
            self,
            priors: Tensor,
            cls_preds: Tensor,
            decoded_bboxes: Tensor,
            objectness: Tensor,
            gt_instances: InstanceData,
            img_meta: dict,
            gt_instances_ignore: Optional[InstanceData] = None) -> tuple:
        """Compute classification, regression, and objectness targets for
        priors in a single image.

        Args:
            priors (Tensor): All priors of one image, a 2D-Tensor with shape
                [num_priors, 4] in [cx, cy, stride_w, stride_h] format.
            cls_preds (Tensor): Classification predictions of one image,
                a 2D-Tensor with shape [num_priors, num_classes].
            decoded_bboxes (Tensor): Decoded bboxes predictions of one image,
                a 2D-Tensor with shape [num_priors, 4] in [tl_x, tl_y,
                br_x, br_y] format.
            objectness (Tensor): Objectness predictions of one image,
                a 1D-Tensor with shape [num_priors].
            gt_instances (:obj:`InstanceData`): Ground truth of instance
                annotations. It should include ``bboxes`` and ``labels``
                attributes.
            img_meta (dict): Meta information for current image.
            gt_instances_ignore (:obj:`InstanceData`, optional): Instances
                to be ignored during training. It includes ``bboxes``
                attribute data that is ignored during training and testing.
                Defaults to None.
        Returns:
            tuple:
                foreground_mask (list[Tensor]): Binary mask of foreground
                targets.
                cls_target (list[Tensor]): Classification targets of an image.
                obj_target (list[Tensor]): Objectness targets of an image.
                bbox_target (list[Tensor]): BBox targets of an image.
                bbox_aux_target (int): BBox aux targets of an image.
                num_pos_per_img (int): Number of positive samples in an image.
        """

        num_priors = priors.size(0)
        num_gts = len(gt_instances)
        # No target
        if num_gts == 0:
            cls_target = cls_preds.new_zeros((0, self.num_classes))
            bbox_target = cls_preds.new_zeros((0, 4))
            bbox_aux_target = cls_preds.new_zeros((0, 4))
            obj_target = cls_preds.new_zeros((num_priors, 1))
            foreground_mask = cls_preds.new_zeros(num_priors).bool()
            return (foreground_mask, cls_target, obj_target, bbox_target,
                    bbox_aux_target, 0)

        # YOLOX uses center priors with 0.5 offset to assign targets,
        # but use center priors without offset to regress bboxes.
        offset_priors = torch.cat(
            [priors[:, :2] + priors[:, 2:] * 0.5, priors[:, 2:]], dim=-1)

        scores = cls_preds.sigmoid() * objectness.unsqueeze(1).sigmoid()
        pred_instances = InstanceData(
            bboxes=decoded_bboxes, scores=scores.sqrt_(), priors=offset_priors)
        assign_result = self.assigner.assign(
            pred_instances=pred_instances,
            gt_instances=gt_instances,
            gt_instances_ignore=gt_instances_ignore)

        sampling_result = self.sampler.sample(assign_result, pred_instances,
                                              gt_instances)
        pos_inds = sampling_result.pos_inds
        num_pos_per_img = pos_inds.size(0)

        pos_ious = assign_result.max_overlaps[pos_inds]
        # IOU aware classification score
        cls_target = F.one_hot(sampling_result.pos_gt_labels,
                               self.num_classes) * pos_ious.unsqueeze(-1)
        obj_target = torch.zeros_like(objectness).unsqueeze(-1)
        obj_target[pos_inds] = 1
        bbox_target = sampling_result.pos_gt_bboxes
        bbox_aux_target = cls_preds.new_zeros((num_pos_per_img, 4))
        if self.use_bbox_aux:
            bbox_aux_target = self._get_bbox_aux_target(
                bbox_aux_target, bbox_target, priors[pos_inds])
        foreground_mask = torch.zeros_like(objectness).to(torch.bool)
        foreground_mask[pos_inds] = 1
        return (foreground_mask, cls_target, obj_target, bbox_target,
                bbox_aux_target, num_pos_per_img)

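Editor's note: the cls_target built above is an IoU-aware one-hot vector, where the positive class is scored with the assigned IoU instead of 1. A tiny standalone illustration with made-up values:

import torch
import torch.nn.functional as F

pos_gt_labels = torch.tensor([2, 0])   # assigned gt classes of two positive priors
pos_ious = torch.tensor([0.81, 0.55])  # IoU between each prior's box and its gt
cls_target = F.one_hot(pos_gt_labels, 4) * pos_ious.unsqueeze(-1)
print(cls_target)
# tensor([[0.0000, 0.0000, 0.8100, 0.0000],
#         [0.5500, 0.0000, 0.0000, 0.0000]])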
    def _get_bbox_aux_target(self,
                             bbox_aux_target: Tensor,
                             gt_bboxes: Tensor,
                             priors: Tensor,
                             eps: float = 1e-8) -> Tensor:
        """Convert gt bboxes to center offset and log width height."""
        gt_cxcywh = bbox_xyxy_to_cxcywh(gt_bboxes)
        bbox_aux_target[:, :2] = (gt_cxcywh[:, :2] -
                                  priors[:, :2]) / priors[:, 2:]
        bbox_aux_target[:,
                        2:] = torch.log(gt_cxcywh[:, 2:] / priors[:, 2:] + eps)
        return bbox_aux_target

    @staticmethod
    def gt_instances_preprocess(batch_gt_instances: Tensor,
                                batch_size: int) -> List[InstanceData]:
        """Split batch_gt_instances with batch size.

        Args:
            batch_gt_instances (Tensor): Ground truth
                a 2D-Tensor for whole batch, shape [all_gt_bboxes, 6]
            batch_size (int): Batch size.

        Returns:
            List: batch gt instances data, shape [batch_size, InstanceData]
        """
        # faster version
        batch_instance_list = []
        for i in range(batch_size):
            batch_gt_instance_ = InstanceData()
            single_batch_instance = \
                batch_gt_instances[batch_gt_instances[:, 0] == i, :]
            batch_gt_instance_.bboxes = single_batch_instance[:, 2:]
            batch_gt_instance_.labels = single_batch_instance[:, 1]
            batch_instance_list.append(batch_gt_instance_)

        return batch_instance_list
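Editor's note: in configs the head above is wired together with a label assigner in train_cfg, which special_init builds through TASK_UTILS. A hedged config sketch follows; the SimOTA assigner settings shown here are assumptions drawn from typical YOLOX setups, so check the shipped YOLOX configs for the exact values.

# sketch of a bbox_head / train_cfg pair for YOLOXHead (values illustrative)
bbox_head = dict(
    type='YOLOXHead',
    head_module=dict(
        type='YOLOXHeadModule',
        num_classes=80,
        in_channels=256,
        feat_channels=256,
        widen_factor=1.0,
        stacked_convs=2,
        featmap_strides=(8, 16, 32),
        use_depthwise=False,
        norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
        act_cfg=dict(type='SiLU', inplace=True)))

train_cfg = dict(
    assigner=dict(
        type='mmdet.SimOTAAssigner',          # assumed dynamic assigner
        center_radius=2.5,
        iou_calculator=dict(type='mmdet.BboxOverlaps2D')))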
mmyolo/models/detectors/__init__.py
ADDED
@@ -0,0 +1,4 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .yolo_detector import YOLODetector

__all__ = ['YOLODetector']
mmyolo/models/detectors/yolo_detector.py
ADDED
@@ -0,0 +1,53 @@
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from mmdet.models.detectors.single_stage import SingleStageDetector
from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
from mmengine.dist import get_world_size
from mmengine.logging import print_log

from mmyolo.registry import MODELS


@MODELS.register_module()
class YOLODetector(SingleStageDetector):
    r"""Implementation of YOLO Series

    Args:
        backbone (:obj:`ConfigDict` or dict): The backbone config.
        neck (:obj:`ConfigDict` or dict): The neck config.
        bbox_head (:obj:`ConfigDict` or dict): The bbox head config.
        train_cfg (:obj:`ConfigDict` or dict, optional): The training config
            of YOLO. Defaults to None.
        test_cfg (:obj:`ConfigDict` or dict, optional): The testing config
            of YOLO. Defaults to None.
        data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of
            :class:`DetDataPreprocessor` to process the input data.
            Defaults to None.
        init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or
            list[dict], optional): Initialization config dict.
            Defaults to None.
        use_syncbn (bool): Whether to use SyncBatchNorm. Defaults to True.
    """

    def __init__(self,
                 backbone: ConfigType,
                 neck: ConfigType,
                 bbox_head: ConfigType,
                 train_cfg: OptConfigType = None,
                 test_cfg: OptConfigType = None,
                 data_preprocessor: OptConfigType = None,
                 init_cfg: OptMultiConfig = None,
                 use_syncbn: bool = True):
        super().__init__(
            backbone=backbone,
            neck=neck,
            bbox_head=bbox_head,
            train_cfg=train_cfg,
            test_cfg=test_cfg,
            data_preprocessor=data_preprocessor,
            init_cfg=init_cfg)

        # TODO: Waiting for mmengine support
        if use_syncbn and get_world_size() > 1:
            torch.nn.SyncBatchNorm.convert_sync_batchnorm(self)
            print_log('Using SyncBatchNorm()', 'current')
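Editor's note: the only YOLO-specific logic in this detector is the optional SyncBatchNorm conversion, which is skipped on single-GPU runs because get_world_size() returns 1. A minimal sketch of the same pattern outside the detector; the toy module below is an illustration, not mmyolo code.

import torch.nn as nn
from mmengine.dist import get_world_size

model = nn.Sequential(nn.Conv2d(3, 8, 3), nn.BatchNorm2d(8), nn.ReLU())
if get_world_size() > 1:
    # returns the model with every BatchNorm*d replaced by SyncBatchNorm,
    # keeping the existing weights and running statistics
    model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
print(type(model[1]).__name__)  # BatchNorm2d when running on a single process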
mmyolo/models/layers/__init__.py
ADDED
@@ -0,0 +1,16 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .ema import ExpMomentumEMA
from .yolo_bricks import (BepC3StageBlock, CSPLayerWithTwoConv,
                          DarknetBottleneck, EELANBlock, EffectiveSELayer,
                          ELANBlock, ImplicitA, ImplicitM,
                          MaxPoolAndStrideConvBlock, PPYOLOEBasicBlock,
                          RepStageBlock, RepVGGBlock, SPPFBottleneck,
                          SPPFCSPBlock, TinyDownSampleBlock)

__all__ = [
    'SPPFBottleneck', 'RepVGGBlock', 'RepStageBlock', 'ExpMomentumEMA',
    'ELANBlock', 'MaxPoolAndStrideConvBlock', 'SPPFCSPBlock',
    'PPYOLOEBasicBlock', 'EffectiveSELayer', 'TinyDownSampleBlock',
    'EELANBlock', 'ImplicitA', 'ImplicitM', 'BepC3StageBlock',
    'CSPLayerWithTwoConv', 'DarknetBottleneck'
]
mmyolo/models/layers/ema.py
ADDED
@@ -0,0 +1,96 @@
# Copyright (c) OpenMMLab. All rights reserved.
import math
from typing import Optional

import torch
import torch.nn as nn
from mmdet.models.layers import ExpMomentumEMA as MMDET_ExpMomentumEMA
from torch import Tensor

from mmyolo.registry import MODELS


@MODELS.register_module()
class ExpMomentumEMA(MMDET_ExpMomentumEMA):
    """Exponential moving average (EMA) with exponential momentum strategy,
    which is used in YOLO.

    Args:
        model (nn.Module): The model to be averaged.
        momentum (float): The momentum used for updating ema parameter.
            Ema's parameters are updated with the formula:
            `averaged_param = (1-momentum) * averaged_param + momentum *
            source_param`. Defaults to 0.0002.
        gamma (int): Use a larger momentum early in training and gradually
            anneal to a smaller value to update the ema model smoothly. The
            momentum is calculated as
            `(1 - momentum) * exp(-(1 + steps) / gamma) + momentum`.
            Defaults to 2000.
        interval (int): Interval between two updates. Defaults to 1.
        device (torch.device, optional): If provided, the averaged model will
            be stored on the :attr:`device`. Defaults to None.
        update_buffers (bool): if True, it will compute running averages for
            both the parameters and the buffers of the model. Defaults to
            False.
    """

    def __init__(self,
                 model: nn.Module,
                 momentum: float = 0.0002,
                 gamma: int = 2000,
                 interval=1,
                 device: Optional[torch.device] = None,
                 update_buffers: bool = False):
        super().__init__(
            model=model,
            momentum=momentum,
            interval=interval,
            device=device,
            update_buffers=update_buffers)
        assert gamma > 0, f'gamma must be greater than 0, but got {gamma}'
        self.gamma = gamma

        # Note: There is no need to re-fetch every update,
        # as most models do not change their structure
        # during the training process.
        self.src_parameters = (
            model.state_dict()
            if self.update_buffers else dict(model.named_parameters()))
        if not self.update_buffers:
            self.src_buffers = model.buffers()

    def avg_func(self, averaged_param: Tensor, source_param: Tensor,
                 steps: int):
        """Compute the moving average of the parameters using the exponential
        momentum strategy.

        Args:
            averaged_param (Tensor): The averaged parameters.
            source_param (Tensor): The source parameters.
            steps (int): The number of times the parameters have been
                updated.
        """
        momentum = (1 - self.momentum) * math.exp(
            -float(1 + steps) / self.gamma) + self.momentum
        averaged_param.lerp_(source_param, momentum)

    def update_parameters(self, model: nn.Module):
        """Update the parameters after each training step.

        Args:
            model (nn.Module): The model whose parameters need to be updated.
        """
        if self.steps == 0:
            for k, p_avg in self.avg_parameters.items():
                p_avg.data.copy_(self.src_parameters[k].data)
        elif self.steps % self.interval == 0:
            for k, p_avg in self.avg_parameters.items():
                if p_avg.dtype.is_floating_point:
                    self.avg_func(p_avg.data, self.src_parameters[k].data,
                                  self.steps)
            if not self.update_buffers:
                # If not update the buffers,
                # keep the buffers in sync with the source model.
                for b_avg, b_src in zip(self.module.buffers(), self.src_buffers):
                    b_avg.data.copy_(b_src.data)
        self.steps += 1
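Editor's note: the interpolation weight used by avg_func above starts close to 1 (the EMA copy tracks the source model almost exactly early in training) and decays towards the configured momentum, so the average changes very slowly late in training. A small numeric check with the default momentum=0.0002 and gamma=2000:

import math

momentum, gamma = 0.0002, 2000

def ema_momentum(steps: int) -> float:
    # interpolation weight passed to lerp_ at a given step
    return (1 - momentum) * math.exp(-float(1 + steps) / gamma) + momentum

for steps in (0, 1000, 5000, 20000):
    print(steps, round(ema_momentum(steps), 6))
# 0 -> ~0.9995, 1000 -> ~0.606, 5000 -> ~0.082, 20000 -> ~0.00025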