diff --git a/mmyolo/__init__.py b/mmyolo/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a7a2f33338e484f9ecb4e9123e88b84f902cd6cf --- /dev/null +++ b/mmyolo/__init__.py @@ -0,0 +1,39 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import mmcv +import mmdet +import mmengine +from mmengine.utils import digit_version + +from .version import __version__, version_info + +mmcv_minimum_version = '2.0.0rc4' +mmcv_maximum_version = '2.1.0' +mmcv_version = digit_version(mmcv.__version__) + +mmengine_minimum_version = '0.6.0' +mmengine_maximum_version = '1.0.0' +mmengine_version = digit_version(mmengine.__version__) + +mmdet_minimum_version = '3.0.0rc6' +mmdet_maximum_version = '3.1.0' +mmdet_version = digit_version(mmdet.__version__) + + +assert (mmcv_version >= digit_version(mmcv_minimum_version) + and mmcv_version < digit_version(mmcv_maximum_version)), \ + f'MMCV=={mmcv.__version__} is used but incompatible. ' \ + f'Please install mmcv>={mmcv_minimum_version}, <{mmcv_maximum_version}.' + +assert (mmengine_version >= digit_version(mmengine_minimum_version) + and mmengine_version < digit_version(mmengine_maximum_version)), \ + f'MMEngine=={mmengine.__version__} is used but incompatible. ' \ + f'Please install mmengine>={mmengine_minimum_version}, ' \ + f'<{mmengine_maximum_version}.' + +assert (mmdet_version >= digit_version(mmdet_minimum_version) + and mmdet_version < digit_version(mmdet_maximum_version)), \ + f'MMDetection=={mmdet.__version__} is used but incompatible. ' \ + f'Please install mmdet>={mmdet_minimum_version}, ' \ + f'<{mmdet_maximum_version}.' + +__all__ = ['__version__', 'version_info', 'digit_version'] diff --git a/mmyolo/datasets/__init__.py b/mmyolo/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b3b6b971937e0179306965fbb5695121fd5d3b64 --- /dev/null +++ b/mmyolo/datasets/__init__.py @@ -0,0 +1,12 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .transforms import * # noqa: F401,F403 +from .utils import BatchShapePolicy, yolov5_collate +from .yolov5_coco import YOLOv5CocoDataset +from .yolov5_crowdhuman import YOLOv5CrowdHumanDataset +from .yolov5_dota import YOLOv5DOTADataset +from .yolov5_voc import YOLOv5VOCDataset + +__all__ = [ + 'YOLOv5CocoDataset', 'YOLOv5VOCDataset', 'BatchShapePolicy', + 'yolov5_collate', 'YOLOv5CrowdHumanDataset', 'YOLOv5DOTADataset' +] diff --git a/mmyolo/datasets/transforms/__init__.py b/mmyolo/datasets/transforms/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..58f4e6fdb5d7272743240f1d0da55c5a7d489fbf --- /dev/null +++ b/mmyolo/datasets/transforms/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
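+# The transforms exported below are registered in MMYOLO's TRANSFORMS registry
+# and are normally referenced by class name from a pipeline config. A minimal
+# illustrative sketch only (LoadImageFromFile comes from MMCV, LoadAnnotations
+# is the variant defined in this package, and the values shown are the
+# documented defaults):
+#
+#   pre_transform = [
+#       dict(type='LoadImageFromFile'),
+#       dict(type='LoadAnnotations', with_bbox=True)
+#   ]
+#   train_pipeline = [
+#       *pre_transform,
+#       dict(type='Mosaic', img_scale=(640, 640), pad_val=114.0,
+#            pre_transform=pre_transform),
+#       dict(type='YOLOv5HSVRandomAug')
+#   ]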
+from .mix_img_transforms import Mosaic, Mosaic9, YOLOv5MixUp, YOLOXMixUp +from .transforms import (LetterResize, LoadAnnotations, PPYOLOERandomCrop, + PPYOLOERandomDistort, RegularizeRotatedBox, + RemoveDataElement, YOLOv5CopyPaste, + YOLOv5HSVRandomAug, YOLOv5KeepRatioResize, + YOLOv5RandomAffine) + +__all__ = [ + 'YOLOv5KeepRatioResize', 'LetterResize', 'Mosaic', 'YOLOXMixUp', + 'YOLOv5MixUp', 'YOLOv5HSVRandomAug', 'LoadAnnotations', + 'YOLOv5RandomAffine', 'PPYOLOERandomDistort', 'PPYOLOERandomCrop', + 'Mosaic9', 'YOLOv5CopyPaste', 'RemoveDataElement', 'RegularizeRotatedBox' +] diff --git a/mmyolo/datasets/transforms/mix_img_transforms.py b/mmyolo/datasets/transforms/mix_img_transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..4a25f6f7ef327878cc19b51f32037037d0412aaa --- /dev/null +++ b/mmyolo/datasets/transforms/mix_img_transforms.py @@ -0,0 +1,1150 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import collections +import copy +from abc import ABCMeta, abstractmethod +from typing import Optional, Sequence, Tuple, Union + +import mmcv +import numpy as np +from mmcv.transforms import BaseTransform +from mmdet.structures.bbox import autocast_box_type +from mmengine.dataset import BaseDataset +from mmengine.dataset.base_dataset import Compose +from numpy import random + +from mmyolo.registry import TRANSFORMS + + +class BaseMixImageTransform(BaseTransform, metaclass=ABCMeta): + """A Base Transform of multiple images mixed. + + Suitable for training on multiple images mixed data augmentation like + mosaic and mixup. + + Cached mosaic transform will random select images from the cache + and combine them into one output image if use_cached is True. + + Args: + pre_transform(Sequence[str]): Sequence of transform object or + config dict to be composed. Defaults to None. + prob(float): The transformation probability. Defaults to 1.0. + use_cached (bool): Whether to use cache. Defaults to False. + max_cached_images (int): The maximum length of the cache. The larger + the cache, the stronger the randomness of this transform. As a + rule of thumb, providing 10 caches for each image suffices for + randomness. Defaults to 40. + random_pop (bool): Whether to randomly pop a result from the cache + when the cache is full. If set to False, use FIFO popping method. + Defaults to True. + max_refetch (int): The maximum number of retry iterations for getting + valid results from the pipeline. If the number of iterations is + greater than `max_refetch`, but results is still None, then the + iteration is terminated and raise the error. Defaults to 15. + """ + + def __init__(self, + pre_transform: Optional[Sequence[str]] = None, + prob: float = 1.0, + use_cached: bool = False, + max_cached_images: int = 40, + random_pop: bool = True, + max_refetch: int = 15): + + self.max_refetch = max_refetch + self.prob = prob + + self.use_cached = use_cached + self.max_cached_images = max_cached_images + self.random_pop = random_pop + self.results_cache = [] + + if pre_transform is None: + self.pre_transform = None + else: + self.pre_transform = Compose(pre_transform) + + @abstractmethod + def get_indexes(self, dataset: Union[BaseDataset, + list]) -> Union[list, int]: + """Call function to collect indexes. + + Args: + dataset (:obj:`Dataset` or list): The dataset or cached list. + + Returns: + list or int: indexes. + """ + pass + + @abstractmethod + def mix_img_transform(self, results: dict) -> dict: + """Mixed image data transformation. + + Args: + results (dict): Result dict. 
+ + Returns: + results (dict): Updated result dict. + """ + pass + + @autocast_box_type() + def transform(self, results: dict) -> dict: + """Data augmentation function. + + The transform steps are as follows: + 1. Randomly generate index list of other images. + 2. Before Mosaic or MixUp need to go through the necessary + pre_transform, such as MixUp' pre_transform pipeline + include: 'LoadImageFromFile','LoadAnnotations', + 'Mosaic' and 'RandomAffine'. + 3. Use mix_img_transform function to implement specific + mix operations. + + Args: + results (dict): Result dict. + + Returns: + results (dict): Updated result dict. + """ + + if random.uniform(0, 1) > self.prob: + return results + + if self.use_cached: + # Be careful: deep copying can be very time-consuming + # if results includes dataset. + dataset = results.pop('dataset', None) + self.results_cache.append(copy.deepcopy(results)) + if len(self.results_cache) > self.max_cached_images: + if self.random_pop: + index = random.randint(0, len(self.results_cache) - 1) + else: + index = 0 + self.results_cache.pop(index) + + if len(self.results_cache) <= 4: + return results + else: + assert 'dataset' in results + # Be careful: deep copying can be very time-consuming + # if results includes dataset. + dataset = results.pop('dataset', None) + + for _ in range(self.max_refetch): + # get index of one or three other images + if self.use_cached: + indexes = self.get_indexes(self.results_cache) + else: + indexes = self.get_indexes(dataset) + + if not isinstance(indexes, collections.abc.Sequence): + indexes = [indexes] + + if self.use_cached: + mix_results = [ + copy.deepcopy(self.results_cache[i]) for i in indexes + ] + else: + # get images information will be used for Mosaic or MixUp + mix_results = [ + copy.deepcopy(dataset.get_data_info(index)) + for index in indexes + ] + + if self.pre_transform is not None: + for i, data in enumerate(mix_results): + # pre_transform may also require dataset + data.update({'dataset': dataset}) + # before Mosaic or MixUp need to go through + # the necessary pre_transform + _results = self.pre_transform(data) + _results.pop('dataset') + mix_results[i] = _results + + if None not in mix_results: + results['mix_results'] = mix_results + break + print('Repeated calculation') + else: + raise RuntimeError( + 'The loading pipeline of the original dataset' + ' always return None. Please check the correctness ' + 'of the dataset and its pipeline.') + + # Mosaic or MixUp + results = self.mix_img_transform(results) + + if 'mix_results' in results: + results.pop('mix_results') + results['dataset'] = dataset + + return results + + +@TRANSFORMS.register_module() +class Mosaic(BaseMixImageTransform): + """Mosaic augmentation. + + Given 4 images, mosaic transform combines them into + one output image. The output image is composed of the parts from each sub- + image. + + .. code:: text + + mosaic transform + center_x + +------------------------------+ + | pad | | + | +-----------+ pad | + | | | | + | | image1 +-----------+ + | | | | + | | | image2 | + center_y |----+-+-----------+-----------+ + | | cropped | | + |pad | image3 | image4 | + | | | | + +----|-------------+-----------+ + | | + +-------------+ + + The mosaic transform steps are as follows: + + 1. Choose the mosaic center as the intersections of 4 images + 2. Get the left top image according to the index, and randomly + sample another 3 images from the custom dataset. + 3. 
Sub image will be cropped if image is larger than mosaic patch + + Required Keys: + + - img + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_bboxes_labels (np.int64) (optional) + - gt_ignore_flags (bool) (optional) + - mix_results (List[dict]) + + Modified Keys: + + - img + - img_shape + - gt_bboxes (optional) + - gt_bboxes_labels (optional) + - gt_ignore_flags (optional) + + Args: + img_scale (Sequence[int]): Image size after mosaic pipeline of single + image. The shape order should be (width, height). + Defaults to (640, 640). + center_ratio_range (Sequence[float]): Center ratio range of mosaic + output. Defaults to (0.5, 1.5). + bbox_clip_border (bool, optional): Whether to clip the objects outside + the border of the image. In some dataset like MOT17, the gt bboxes + are allowed to cross the border of images. Therefore, we don't + need to clip the gt bboxes in these cases. Defaults to True. + pad_val (int): Pad value. Defaults to 114. + pre_transform(Sequence[dict]): Sequence of transform object or + config dict to be composed. + prob (float): Probability of applying this transformation. + Defaults to 1.0. + use_cached (bool): Whether to use cache. Defaults to False. + max_cached_images (int): The maximum length of the cache. The larger + the cache, the stronger the randomness of this transform. As a + rule of thumb, providing 10 caches for each image suffices for + randomness. Defaults to 40. + random_pop (bool): Whether to randomly pop a result from the cache + when the cache is full. If set to False, use FIFO popping method. + Defaults to True. + max_refetch (int): The maximum number of retry iterations for getting + valid results from the pipeline. If the number of iterations is + greater than `max_refetch`, but results is still None, then the + iteration is terminated and raise the error. Defaults to 15. + """ + + def __init__(self, + img_scale: Tuple[int, int] = (640, 640), + center_ratio_range: Tuple[float, float] = (0.5, 1.5), + bbox_clip_border: bool = True, + pad_val: float = 114.0, + pre_transform: Sequence[dict] = None, + prob: float = 1.0, + use_cached: bool = False, + max_cached_images: int = 40, + random_pop: bool = True, + max_refetch: int = 15): + assert isinstance(img_scale, tuple) + assert 0 <= prob <= 1.0, 'The probability should be in range [0,1]. ' \ + f'got {prob}.' + if use_cached: + assert max_cached_images >= 4, 'The length of cache must >= 4, ' \ + f'but got {max_cached_images}.' + + super().__init__( + pre_transform=pre_transform, + prob=prob, + use_cached=use_cached, + max_cached_images=max_cached_images, + random_pop=random_pop, + max_refetch=max_refetch) + + self.img_scale = img_scale + self.center_ratio_range = center_ratio_range + self.bbox_clip_border = bbox_clip_border + self.pad_val = pad_val + + def get_indexes(self, dataset: Union[BaseDataset, list]) -> list: + """Call function to collect indexes. + + Args: + dataset (:obj:`Dataset` or list): The dataset or cached list. + + Returns: + list: indexes. + """ + indexes = [random.randint(0, len(dataset)) for _ in range(3)] + return indexes + + def mix_img_transform(self, results: dict) -> dict: + """Mixed image data transformation. + + Args: + results (dict): Result dict. + + Returns: + results (dict): Updated result dict. 
+ """ + assert 'mix_results' in results + mosaic_bboxes = [] + mosaic_bboxes_labels = [] + mosaic_ignore_flags = [] + mosaic_masks = [] + with_mask = True if 'gt_masks' in results else False + # self.img_scale is wh format + img_scale_w, img_scale_h = self.img_scale + + if len(results['img'].shape) == 3: + mosaic_img = np.full( + (int(img_scale_h * 2), int(img_scale_w * 2), 3), + self.pad_val, + dtype=results['img'].dtype) + else: + mosaic_img = np.full((int(img_scale_h * 2), int(img_scale_w * 2)), + self.pad_val, + dtype=results['img'].dtype) + + # mosaic center x, y + center_x = int(random.uniform(*self.center_ratio_range) * img_scale_w) + center_y = int(random.uniform(*self.center_ratio_range) * img_scale_h) + center_position = (center_x, center_y) + + loc_strs = ('top_left', 'top_right', 'bottom_left', 'bottom_right') + for i, loc in enumerate(loc_strs): + if loc == 'top_left': + results_patch = results + else: + results_patch = results['mix_results'][i - 1] + + img_i = results_patch['img'] + h_i, w_i = img_i.shape[:2] + # keep_ratio resize + scale_ratio_i = min(img_scale_h / h_i, img_scale_w / w_i) + img_i = mmcv.imresize( + img_i, (int(w_i * scale_ratio_i), int(h_i * scale_ratio_i))) + + # compute the combine parameters + paste_coord, crop_coord = self._mosaic_combine( + loc, center_position, img_i.shape[:2][::-1]) + x1_p, y1_p, x2_p, y2_p = paste_coord + x1_c, y1_c, x2_c, y2_c = crop_coord + + # crop and paste image + mosaic_img[y1_p:y2_p, x1_p:x2_p] = img_i[y1_c:y2_c, x1_c:x2_c] + + # adjust coordinate + gt_bboxes_i = results_patch['gt_bboxes'] + gt_bboxes_labels_i = results_patch['gt_bboxes_labels'] + gt_ignore_flags_i = results_patch['gt_ignore_flags'] + + padw = x1_p - x1_c + padh = y1_p - y1_c + gt_bboxes_i.rescale_([scale_ratio_i, scale_ratio_i]) + gt_bboxes_i.translate_([padw, padh]) + mosaic_bboxes.append(gt_bboxes_i) + mosaic_bboxes_labels.append(gt_bboxes_labels_i) + mosaic_ignore_flags.append(gt_ignore_flags_i) + if with_mask and results_patch.get('gt_masks', None) is not None: + gt_masks_i = results_patch['gt_masks'] + gt_masks_i = gt_masks_i.rescale(float(scale_ratio_i)) + gt_masks_i = gt_masks_i.translate( + out_shape=(int(self.img_scale[0] * 2), + int(self.img_scale[1] * 2)), + offset=padw, + direction='horizontal') + gt_masks_i = gt_masks_i.translate( + out_shape=(int(self.img_scale[0] * 2), + int(self.img_scale[1] * 2)), + offset=padh, + direction='vertical') + mosaic_masks.append(gt_masks_i) + + mosaic_bboxes = mosaic_bboxes[0].cat(mosaic_bboxes, 0) + mosaic_bboxes_labels = np.concatenate(mosaic_bboxes_labels, 0) + mosaic_ignore_flags = np.concatenate(mosaic_ignore_flags, 0) + + if self.bbox_clip_border: + mosaic_bboxes.clip_([2 * img_scale_h, 2 * img_scale_w]) + if with_mask: + mosaic_masks = mosaic_masks[0].cat(mosaic_masks) + results['gt_masks'] = mosaic_masks + else: + # remove outside bboxes + inside_inds = mosaic_bboxes.is_inside( + [2 * img_scale_h, 2 * img_scale_w]).numpy() + mosaic_bboxes = mosaic_bboxes[inside_inds] + mosaic_bboxes_labels = mosaic_bboxes_labels[inside_inds] + mosaic_ignore_flags = mosaic_ignore_flags[inside_inds] + if with_mask: + mosaic_masks = mosaic_masks[0].cat(mosaic_masks)[inside_inds] + results['gt_masks'] = mosaic_masks + + results['img'] = mosaic_img + results['img_shape'] = mosaic_img.shape + results['gt_bboxes'] = mosaic_bboxes + results['gt_bboxes_labels'] = mosaic_bboxes_labels + results['gt_ignore_flags'] = mosaic_ignore_flags + + return results + + def _mosaic_combine( + self, loc: str, center_position_xy: Sequence[float], 
+ img_shape_wh: Sequence[int]) -> Tuple[Tuple[int], Tuple[int]]: + """Calculate global coordinate of mosaic image and local coordinate of + cropped sub-image. + + Args: + loc (str): Index for the sub-image, loc in ('top_left', + 'top_right', 'bottom_left', 'bottom_right'). + center_position_xy (Sequence[float]): Mixing center for 4 images, + (x, y). + img_shape_wh (Sequence[int]): Width and height of sub-image + + Returns: + tuple[tuple[float]]: Corresponding coordinate of pasting and + cropping + - paste_coord (tuple): paste corner coordinate in mosaic image. + - crop_coord (tuple): crop corner coordinate in mosaic image. + """ + assert loc in ('top_left', 'top_right', 'bottom_left', 'bottom_right') + if loc == 'top_left': + # index0 to top left part of image + x1, y1, x2, y2 = max(center_position_xy[0] - img_shape_wh[0], 0), \ + max(center_position_xy[1] - img_shape_wh[1], 0), \ + center_position_xy[0], \ + center_position_xy[1] + crop_coord = img_shape_wh[0] - (x2 - x1), img_shape_wh[1] - ( + y2 - y1), img_shape_wh[0], img_shape_wh[1] + + elif loc == 'top_right': + # index1 to top right part of image + x1, y1, x2, y2 = center_position_xy[0], \ + max(center_position_xy[1] - img_shape_wh[1], 0), \ + min(center_position_xy[0] + img_shape_wh[0], + self.img_scale[0] * 2), \ + center_position_xy[1] + crop_coord = 0, img_shape_wh[1] - (y2 - y1), min( + img_shape_wh[0], x2 - x1), img_shape_wh[1] + + elif loc == 'bottom_left': + # index2 to bottom left part of image + x1, y1, x2, y2 = max(center_position_xy[0] - img_shape_wh[0], 0), \ + center_position_xy[1], \ + center_position_xy[0], \ + min(self.img_scale[1] * 2, center_position_xy[1] + + img_shape_wh[1]) + crop_coord = img_shape_wh[0] - (x2 - x1), 0, img_shape_wh[0], min( + y2 - y1, img_shape_wh[1]) + + else: + # index3 to bottom right part of image + x1, y1, x2, y2 = center_position_xy[0], \ + center_position_xy[1], \ + min(center_position_xy[0] + img_shape_wh[0], + self.img_scale[0] * 2), \ + min(self.img_scale[1] * 2, center_position_xy[1] + + img_shape_wh[1]) + crop_coord = 0, 0, min(img_shape_wh[0], + x2 - x1), min(y2 - y1, img_shape_wh[1]) + + paste_coord = x1, y1, x2, y2 + return paste_coord, crop_coord + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(img_scale={self.img_scale}, ' + repr_str += f'center_ratio_range={self.center_ratio_range}, ' + repr_str += f'pad_val={self.pad_val}, ' + repr_str += f'prob={self.prob})' + return repr_str + + +@TRANSFORMS.register_module() +class Mosaic9(BaseMixImageTransform): + """Mosaic9 augmentation. + + Given 9 images, mosaic transform combines them into + one output image. The output image is composed of the parts from each sub- + image. + + .. code:: text + + +-------------------------------+------------+ + | pad | pad | | + | +----------+ | | + | | +---------------+ top_right | + | | | top | image2 | + | | top_left | image1 | | + | | image8 o--------+------+--------+---+ + | | | | | | + +----+----------+ | right |pad| + | | center | image3 | | + | left | image0 +---------------+---| + | image7 | | | | + +---+-----------+---+--------+ | | + | | cropped | | bottom_right |pad| + | |bottom_left| | image4 | | + | | image6 | bottom | | | + +---|-----------+ image5 +---------------+---| + | pad | | pad | + +-----------+------------+-------------------+ + + The mosaic transform steps are as follows: + + 1. Get the center image according to the index, and randomly + sample another 8 images from the custom dataset. + 2. 
Randomly offset the image after Mosaic + + Required Keys: + + - img + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_bboxes_labels (np.int64) (optional) + - gt_ignore_flags (bool) (optional) + - mix_results (List[dict]) + + Modified Keys: + + - img + - img_shape + - gt_bboxes (optional) + - gt_bboxes_labels (optional) + - gt_ignore_flags (optional) + + Args: + img_scale (Sequence[int]): Image size after mosaic pipeline of single + image. The shape order should be (width, height). + Defaults to (640, 640). + bbox_clip_border (bool, optional): Whether to clip the objects outside + the border of the image. In some dataset like MOT17, the gt bboxes + are allowed to cross the border of images. Therefore, we don't + need to clip the gt bboxes in these cases. Defaults to True. + pad_val (int): Pad value. Defaults to 114. + pre_transform(Sequence[dict]): Sequence of transform object or + config dict to be composed. + prob (float): Probability of applying this transformation. + Defaults to 1.0. + use_cached (bool): Whether to use cache. Defaults to False. + max_cached_images (int): The maximum length of the cache. The larger + the cache, the stronger the randomness of this transform. As a + rule of thumb, providing 5 caches for each image suffices for + randomness. Defaults to 50. + random_pop (bool): Whether to randomly pop a result from the cache + when the cache is full. If set to False, use FIFO popping method. + Defaults to True. + max_refetch (int): The maximum number of retry iterations for getting + valid results from the pipeline. If the number of iterations is + greater than `max_refetch`, but results is still None, then the + iteration is terminated and raise the error. Defaults to 15. + """ + + def __init__(self, + img_scale: Tuple[int, int] = (640, 640), + bbox_clip_border: bool = True, + pad_val: Union[float, int] = 114.0, + pre_transform: Sequence[dict] = None, + prob: float = 1.0, + use_cached: bool = False, + max_cached_images: int = 50, + random_pop: bool = True, + max_refetch: int = 15): + assert isinstance(img_scale, tuple) + assert 0 <= prob <= 1.0, 'The probability should be in range [0,1]. ' \ + f'got {prob}.' + if use_cached: + assert max_cached_images >= 9, 'The length of cache must >= 9, ' \ + f'but got {max_cached_images}.' + + super().__init__( + pre_transform=pre_transform, + prob=prob, + use_cached=use_cached, + max_cached_images=max_cached_images, + random_pop=random_pop, + max_refetch=max_refetch) + + self.img_scale = img_scale + self.bbox_clip_border = bbox_clip_border + self.pad_val = pad_val + + # intermediate variables + self._current_img_shape = [0, 0] + self._center_img_shape = [0, 0] + self._previous_img_shape = [0, 0] + + def get_indexes(self, dataset: Union[BaseDataset, list]) -> list: + """Call function to collect indexes. + + Args: + dataset (:obj:`Dataset` or list): The dataset or cached list. + + Returns: + list: indexes. + """ + indexes = [random.randint(0, len(dataset)) for _ in range(8)] + return indexes + + def mix_img_transform(self, results: dict) -> dict: + """Mixed image data transformation. + + Args: + results (dict): Result dict. + + Returns: + results (dict): Updated result dict. 
+ """ + assert 'mix_results' in results + + mosaic_bboxes = [] + mosaic_bboxes_labels = [] + mosaic_ignore_flags = [] + + img_scale_w, img_scale_h = self.img_scale + + if len(results['img'].shape) == 3: + mosaic_img = np.full( + (int(img_scale_h * 3), int(img_scale_w * 3), 3), + self.pad_val, + dtype=results['img'].dtype) + else: + mosaic_img = np.full((int(img_scale_h * 3), int(img_scale_w * 3)), + self.pad_val, + dtype=results['img'].dtype) + + # index = 0 is mean original image + # len(results['mix_results']) = 8 + loc_strs = ('center', 'top', 'top_right', 'right', 'bottom_right', + 'bottom', 'bottom_left', 'left', 'top_left') + + results_all = [results, *results['mix_results']] + for index, results_patch in enumerate(results_all): + img_i = results_patch['img'] + # keep_ratio resize + img_i_h, img_i_w = img_i.shape[:2] + scale_ratio_i = min(img_scale_h / img_i_h, img_scale_w / img_i_w) + img_i = mmcv.imresize( + img_i, + (int(img_i_w * scale_ratio_i), int(img_i_h * scale_ratio_i))) + + paste_coord = self._mosaic_combine(loc_strs[index], + img_i.shape[:2]) + + padw, padh = paste_coord[:2] + x1, y1, x2, y2 = (max(x, 0) for x in paste_coord) + mosaic_img[y1:y2, x1:x2] = img_i[y1 - padh:, x1 - padw:] + + gt_bboxes_i = results_patch['gt_bboxes'] + gt_bboxes_labels_i = results_patch['gt_bboxes_labels'] + gt_ignore_flags_i = results_patch['gt_ignore_flags'] + gt_bboxes_i.rescale_([scale_ratio_i, scale_ratio_i]) + gt_bboxes_i.translate_([padw, padh]) + + mosaic_bboxes.append(gt_bboxes_i) + mosaic_bboxes_labels.append(gt_bboxes_labels_i) + mosaic_ignore_flags.append(gt_ignore_flags_i) + + # Offset + offset_x = int(random.uniform(0, img_scale_w)) + offset_y = int(random.uniform(0, img_scale_h)) + mosaic_img = mosaic_img[offset_y:offset_y + 2 * img_scale_h, + offset_x:offset_x + 2 * img_scale_w] + + mosaic_bboxes = mosaic_bboxes[0].cat(mosaic_bboxes, 0) + mosaic_bboxes.translate_([-offset_x, -offset_y]) + mosaic_bboxes_labels = np.concatenate(mosaic_bboxes_labels, 0) + mosaic_ignore_flags = np.concatenate(mosaic_ignore_flags, 0) + + if self.bbox_clip_border: + mosaic_bboxes.clip_([2 * img_scale_h, 2 * img_scale_w]) + else: + # remove outside bboxes + inside_inds = mosaic_bboxes.is_inside( + [2 * img_scale_h, 2 * img_scale_w]).numpy() + mosaic_bboxes = mosaic_bboxes[inside_inds] + mosaic_bboxes_labels = mosaic_bboxes_labels[inside_inds] + mosaic_ignore_flags = mosaic_ignore_flags[inside_inds] + + results['img'] = mosaic_img + results['img_shape'] = mosaic_img.shape + results['gt_bboxes'] = mosaic_bboxes + results['gt_bboxes_labels'] = mosaic_bboxes_labels + results['gt_ignore_flags'] = mosaic_ignore_flags + return results + + def _mosaic_combine(self, loc: str, + img_shape_hw: Tuple[int, int]) -> Tuple[int, ...]: + """Calculate global coordinate of mosaic image. + + Args: + loc (str): Index for the sub-image. + img_shape_hw (Sequence[int]): Height and width of sub-image + + Returns: + paste_coord (tuple): paste corner coordinate in mosaic image. 
+ """ + assert loc in ('center', 'top', 'top_right', 'right', 'bottom_right', + 'bottom', 'bottom_left', 'left', 'top_left') + + img_scale_w, img_scale_h = self.img_scale + + self._current_img_shape = img_shape_hw + current_img_h, current_img_w = self._current_img_shape + previous_img_h, previous_img_w = self._previous_img_shape + center_img_h, center_img_w = self._center_img_shape + + if loc == 'center': + self._center_img_shape = self._current_img_shape + # xmin, ymin, xmax, ymax + paste_coord = img_scale_w, \ + img_scale_h, \ + img_scale_w + current_img_w, \ + img_scale_h + current_img_h + elif loc == 'top': + paste_coord = img_scale_w, \ + img_scale_h - current_img_h, \ + img_scale_w + current_img_w, \ + img_scale_h + elif loc == 'top_right': + paste_coord = img_scale_w + previous_img_w, \ + img_scale_h - current_img_h, \ + img_scale_w + previous_img_w + current_img_w, \ + img_scale_h + elif loc == 'right': + paste_coord = img_scale_w + center_img_w, \ + img_scale_h, \ + img_scale_w + center_img_w + current_img_w, \ + img_scale_h + current_img_h + elif loc == 'bottom_right': + paste_coord = img_scale_w + center_img_w, \ + img_scale_h + previous_img_h, \ + img_scale_w + center_img_w + current_img_w, \ + img_scale_h + previous_img_h + current_img_h + elif loc == 'bottom': + paste_coord = img_scale_w + center_img_w - current_img_w, \ + img_scale_h + center_img_h, \ + img_scale_w + center_img_w, \ + img_scale_h + center_img_h + current_img_h + elif loc == 'bottom_left': + paste_coord = img_scale_w + center_img_w - \ + previous_img_w - current_img_w, \ + img_scale_h + center_img_h, \ + img_scale_w + center_img_w - previous_img_w, \ + img_scale_h + center_img_h + current_img_h + elif loc == 'left': + paste_coord = img_scale_w - current_img_w, \ + img_scale_h + center_img_h - current_img_h, \ + img_scale_w, \ + img_scale_h + center_img_h + elif loc == 'top_left': + paste_coord = img_scale_w - current_img_w, \ + img_scale_h + center_img_h - \ + previous_img_h - current_img_h, \ + img_scale_w, \ + img_scale_h + center_img_h - previous_img_h + + self._previous_img_shape = self._current_img_shape + # xmin, ymin, xmax, ymax + return paste_coord + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(img_scale={self.img_scale}, ' + repr_str += f'pad_val={self.pad_val}, ' + repr_str += f'prob={self.prob})' + return repr_str + + +@TRANSFORMS.register_module() +class YOLOv5MixUp(BaseMixImageTransform): + """MixUp data augmentation for YOLOv5. + + .. code:: text + + The mixup transform steps are as follows: + + 1. Another random image is picked by dataset. + 2. Randomly obtain the fusion ratio from the beta distribution, + then fuse the target + of the original image and mixup image through this ratio. + + Required Keys: + + - img + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_bboxes_labels (np.int64) (optional) + - gt_ignore_flags (bool) (optional) + - mix_results (List[dict]) + + + Modified Keys: + + - img + - img_shape + - gt_bboxes (optional) + - gt_bboxes_labels (optional) + - gt_ignore_flags (optional) + + + Args: + alpha (float): parameter of beta distribution to get mixup ratio. + Defaults to 32. + beta (float): parameter of beta distribution to get mixup ratio. + Defaults to 32. + pre_transform (Sequence[dict]): Sequence of transform object or + config dict to be composed. + prob (float): Probability of applying this transformation. + Defaults to 1.0. + use_cached (bool): Whether to use cache. Defaults to False. 
+ max_cached_images (int): The maximum length of the cache. The larger + the cache, the stronger the randomness of this transform. As a + rule of thumb, providing 10 caches for each image suffices for + randomness. Defaults to 20. + random_pop (bool): Whether to randomly pop a result from the cache + when the cache is full. If set to False, use FIFO popping method. + Defaults to True. + max_refetch (int): The maximum number of iterations. If the number of + iterations is greater than `max_refetch`, but gt_bbox is still + empty, then the iteration is terminated. Defaults to 15. + """ + + def __init__(self, + alpha: float = 32.0, + beta: float = 32.0, + pre_transform: Sequence[dict] = None, + prob: float = 1.0, + use_cached: bool = False, + max_cached_images: int = 20, + random_pop: bool = True, + max_refetch: int = 15): + if use_cached: + assert max_cached_images >= 2, 'The length of cache must >= 2, ' \ + f'but got {max_cached_images}.' + super().__init__( + pre_transform=pre_transform, + prob=prob, + use_cached=use_cached, + max_cached_images=max_cached_images, + random_pop=random_pop, + max_refetch=max_refetch) + self.alpha = alpha + self.beta = beta + + def get_indexes(self, dataset: Union[BaseDataset, list]) -> int: + """Call function to collect indexes. + + Args: + dataset (:obj:`Dataset` or list): The dataset or cached list. + + Returns: + int: indexes. + """ + return random.randint(0, len(dataset)) + + def mix_img_transform(self, results: dict) -> dict: + """YOLOv5 MixUp transform function. + + Args: + results (dict): Result dict + + Returns: + results (dict): Updated result dict. + """ + assert 'mix_results' in results + + retrieve_results = results['mix_results'][0] + retrieve_img = retrieve_results['img'] + ori_img = results['img'] + assert ori_img.shape == retrieve_img.shape + + # Randomly obtain the fusion ratio from the beta distribution, + # which is around 0.5 + ratio = np.random.beta(self.alpha, self.beta) + mixup_img = (ori_img * ratio + retrieve_img * (1 - ratio)) + + retrieve_gt_bboxes = retrieve_results['gt_bboxes'] + retrieve_gt_bboxes_labels = retrieve_results['gt_bboxes_labels'] + retrieve_gt_ignore_flags = retrieve_results['gt_ignore_flags'] + + mixup_gt_bboxes = retrieve_gt_bboxes.cat( + (results['gt_bboxes'], retrieve_gt_bboxes), dim=0) + mixup_gt_bboxes_labels = np.concatenate( + (results['gt_bboxes_labels'], retrieve_gt_bboxes_labels), axis=0) + mixup_gt_ignore_flags = np.concatenate( + (results['gt_ignore_flags'], retrieve_gt_ignore_flags), axis=0) + if 'gt_masks' in results: + assert 'gt_masks' in retrieve_results + mixup_gt_masks = results['gt_masks'].cat( + [results['gt_masks'], retrieve_results['gt_masks']]) + results['gt_masks'] = mixup_gt_masks + + results['img'] = mixup_img.astype(np.uint8) + results['img_shape'] = mixup_img.shape + results['gt_bboxes'] = mixup_gt_bboxes + results['gt_bboxes_labels'] = mixup_gt_bboxes_labels + results['gt_ignore_flags'] = mixup_gt_ignore_flags + + return results + + +@TRANSFORMS.register_module() +class YOLOXMixUp(BaseMixImageTransform): + """MixUp data augmentation for YOLOX. + + .. code:: text + + mixup transform + +---------------+--------------+ + | mixup image | | + | +--------|--------+ | + | | | | | + +---------------+ | | + | | | | + | | image | | + | | | | + | | | | + | +-----------------+ | + | pad | + +------------------------------+ + + The mixup transform steps are as follows: + + 1. Another random image is picked by dataset and embedded in + the top left patch(after padding and resizing) + 2. 
The target of mixup transform is the weighted average of mixup + image and origin image. + + Required Keys: + + - img + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_bboxes_labels (np.int64) (optional) + - gt_ignore_flags (bool) (optional) + - mix_results (List[dict]) + + + Modified Keys: + + - img + - img_shape + - gt_bboxes (optional) + - gt_bboxes_labels (optional) + - gt_ignore_flags (optional) + + + Args: + img_scale (Sequence[int]): Image output size after mixup pipeline. + The shape order should be (width, height). Defaults to (640, 640). + ratio_range (Sequence[float]): Scale ratio of mixup image. + Defaults to (0.5, 1.5). + flip_ratio (float): Horizontal flip ratio of mixup image. + Defaults to 0.5. + pad_val (int): Pad value. Defaults to 114. + bbox_clip_border (bool, optional): Whether to clip the objects outside + the border of the image. In some dataset like MOT17, the gt bboxes + are allowed to cross the border of images. Therefore, we don't + need to clip the gt bboxes in these cases. Defaults to True. + pre_transform(Sequence[dict]): Sequence of transform object or + config dict to be composed. + prob (float): Probability of applying this transformation. + Defaults to 1.0. + use_cached (bool): Whether to use cache. Defaults to False. + max_cached_images (int): The maximum length of the cache. The larger + the cache, the stronger the randomness of this transform. As a + rule of thumb, providing 10 caches for each image suffices for + randomness. Defaults to 20. + random_pop (bool): Whether to randomly pop a result from the cache + when the cache is full. If set to False, use FIFO popping method. + Defaults to True. + max_refetch (int): The maximum number of iterations. If the number of + iterations is greater than `max_refetch`, but gt_bbox is still + empty, then the iteration is terminated. Defaults to 15. + """ + + def __init__(self, + img_scale: Tuple[int, int] = (640, 640), + ratio_range: Tuple[float, float] = (0.5, 1.5), + flip_ratio: float = 0.5, + pad_val: float = 114.0, + bbox_clip_border: bool = True, + pre_transform: Sequence[dict] = None, + prob: float = 1.0, + use_cached: bool = False, + max_cached_images: int = 20, + random_pop: bool = True, + max_refetch: int = 15): + assert isinstance(img_scale, tuple) + if use_cached: + assert max_cached_images >= 2, 'The length of cache must >= 2, ' \ + f'but got {max_cached_images}.' + super().__init__( + pre_transform=pre_transform, + prob=prob, + use_cached=use_cached, + max_cached_images=max_cached_images, + random_pop=random_pop, + max_refetch=max_refetch) + self.img_scale = img_scale + self.ratio_range = ratio_range + self.flip_ratio = flip_ratio + self.pad_val = pad_val + self.bbox_clip_border = bbox_clip_border + + def get_indexes(self, dataset: Union[BaseDataset, list]) -> int: + """Call function to collect indexes. + + Args: + dataset (:obj:`Dataset` or list): The dataset or cached list. + + Returns: + int: indexes. + """ + return random.randint(0, len(dataset)) + + def mix_img_transform(self, results: dict) -> dict: + """YOLOX MixUp transform function. + + Args: + results (dict): Result dict. + + Returns: + results (dict): Updated result dict. + """ + assert 'mix_results' in results + assert len( + results['mix_results']) == 1, 'MixUp only support 2 images now !' 
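+        # A YOLOX-style mixup always blends exactly two images (the 1:1
+        # weighting is applied in step 8 below); if the retrieved image has
+        # no GT boxes there is nothing to mix, so the original results are
+        # returned unchanged.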
+ + if results['mix_results'][0]['gt_bboxes'].shape[0] == 0: + # empty bbox + return results + + retrieve_results = results['mix_results'][0] + retrieve_img = retrieve_results['img'] + + jit_factor = random.uniform(*self.ratio_range) + is_filp = random.uniform(0, 1) > self.flip_ratio + + if len(retrieve_img.shape) == 3: + out_img = np.ones((self.img_scale[1], self.img_scale[0], 3), + dtype=retrieve_img.dtype) * self.pad_val + else: + out_img = np.ones( + self.img_scale[::-1], dtype=retrieve_img.dtype) * self.pad_val + + # 1. keep_ratio resize + scale_ratio = min(self.img_scale[1] / retrieve_img.shape[0], + self.img_scale[0] / retrieve_img.shape[1]) + retrieve_img = mmcv.imresize( + retrieve_img, (int(retrieve_img.shape[1] * scale_ratio), + int(retrieve_img.shape[0] * scale_ratio))) + + # 2. paste + out_img[:retrieve_img.shape[0], :retrieve_img.shape[1]] = retrieve_img + + # 3. scale jit + scale_ratio *= jit_factor + out_img = mmcv.imresize(out_img, (int(out_img.shape[1] * jit_factor), + int(out_img.shape[0] * jit_factor))) + + # 4. flip + if is_filp: + out_img = out_img[:, ::-1, :] + + # 5. random crop + ori_img = results['img'] + origin_h, origin_w = out_img.shape[:2] + target_h, target_w = ori_img.shape[:2] + padded_img = np.ones((max(origin_h, target_h), max( + origin_w, target_w), 3)) * self.pad_val + padded_img = padded_img.astype(np.uint8) + padded_img[:origin_h, :origin_w] = out_img + + x_offset, y_offset = 0, 0 + if padded_img.shape[0] > target_h: + y_offset = random.randint(0, padded_img.shape[0] - target_h) + if padded_img.shape[1] > target_w: + x_offset = random.randint(0, padded_img.shape[1] - target_w) + padded_cropped_img = padded_img[y_offset:y_offset + target_h, + x_offset:x_offset + target_w] + + # 6. adjust bbox + retrieve_gt_bboxes = retrieve_results['gt_bboxes'] + retrieve_gt_bboxes.rescale_([scale_ratio, scale_ratio]) + if self.bbox_clip_border: + retrieve_gt_bboxes.clip_([origin_h, origin_w]) + + if is_filp: + retrieve_gt_bboxes.flip_([origin_h, origin_w], + direction='horizontal') + + # 7. filter + cp_retrieve_gt_bboxes = retrieve_gt_bboxes.clone() + cp_retrieve_gt_bboxes.translate_([-x_offset, -y_offset]) + if self.bbox_clip_border: + cp_retrieve_gt_bboxes.clip_([target_h, target_w]) + + # 8. 
mix up + mixup_img = 0.5 * ori_img + 0.5 * padded_cropped_img + + retrieve_gt_bboxes_labels = retrieve_results['gt_bboxes_labels'] + retrieve_gt_ignore_flags = retrieve_results['gt_ignore_flags'] + + mixup_gt_bboxes = cp_retrieve_gt_bboxes.cat( + (results['gt_bboxes'], cp_retrieve_gt_bboxes), dim=0) + mixup_gt_bboxes_labels = np.concatenate( + (results['gt_bboxes_labels'], retrieve_gt_bboxes_labels), axis=0) + mixup_gt_ignore_flags = np.concatenate( + (results['gt_ignore_flags'], retrieve_gt_ignore_flags), axis=0) + + if not self.bbox_clip_border: + # remove outside bbox + inside_inds = mixup_gt_bboxes.is_inside([target_h, + target_w]).numpy() + mixup_gt_bboxes = mixup_gt_bboxes[inside_inds] + mixup_gt_bboxes_labels = mixup_gt_bboxes_labels[inside_inds] + mixup_gt_ignore_flags = mixup_gt_ignore_flags[inside_inds] + + results['img'] = mixup_img.astype(np.uint8) + results['img_shape'] = mixup_img.shape + results['gt_bboxes'] = mixup_gt_bboxes + results['gt_bboxes_labels'] = mixup_gt_bboxes_labels + results['gt_ignore_flags'] = mixup_gt_ignore_flags + + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(img_scale={self.img_scale}, ' + repr_str += f'ratio_range={self.ratio_range}, ' + repr_str += f'flip_ratio={self.flip_ratio}, ' + repr_str += f'pad_val={self.pad_val}, ' + repr_str += f'max_refetch={self.max_refetch}, ' + repr_str += f'bbox_clip_border={self.bbox_clip_border})' + return repr_str diff --git a/mmyolo/datasets/transforms/transforms.py b/mmyolo/datasets/transforms/transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..d5179fba3372c53573716afbe8daf3efa674d587 --- /dev/null +++ b/mmyolo/datasets/transforms/transforms.py @@ -0,0 +1,1557 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from copy import deepcopy +from typing import List, Sequence, Tuple, Union + +import cv2 +import mmcv +import numpy as np +import torch +from mmcv.transforms import BaseTransform, Compose +from mmcv.transforms.utils import cache_randomness +from mmdet.datasets.transforms import LoadAnnotations as MMDET_LoadAnnotations +from mmdet.datasets.transforms import Resize as MMDET_Resize +from mmdet.structures.bbox import (HorizontalBoxes, autocast_box_type, + get_box_type) +from mmdet.structures.mask import PolygonMasks +from numpy import random + +from mmyolo.registry import TRANSFORMS + +# TODO: Waiting for MMCV support +TRANSFORMS.register_module(module=Compose, force=True) + + +@TRANSFORMS.register_module() +class YOLOv5KeepRatioResize(MMDET_Resize): + """Resize images & bbox(if existed). + + This transform resizes the input image according to ``scale``. + Bboxes (if existed) are then resized with the same scale factor. + + Required Keys: + + - img (np.uint8) + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + + Modified Keys: + + - img (np.uint8) + - img_shape (tuple) + - gt_bboxes (optional) + - scale (float) + + Added Keys: + + - scale_factor (np.float32) + + Args: + scale (Union[int, Tuple[int, int]]): Images scales for resizing. + """ + + def __init__(self, + scale: Union[int, Tuple[int, int]], + keep_ratio: bool = True, + **kwargs): + assert keep_ratio is True + super().__init__(scale=scale, keep_ratio=True, **kwargs) + + @staticmethod + def _get_rescale_ratio(old_size: Tuple[int, int], + scale: Union[float, Tuple[int]]) -> float: + """Calculate the ratio for rescaling. + + Args: + old_size (tuple[int]): The old size (w, h) of image. + scale (float | tuple[int]): The scaling factor or maximum size. 
+ If it is a float number, then the image will be rescaled by + this factor, else if it is a tuple of 2 integers, then + the image will be rescaled as large as possible within + the scale. + + Returns: + float: The resize ratio. + """ + w, h = old_size + if isinstance(scale, (float, int)): + if scale <= 0: + raise ValueError(f'Invalid scale {scale}, must be positive.') + scale_factor = scale + elif isinstance(scale, tuple): + max_long_edge = max(scale) + max_short_edge = min(scale) + scale_factor = min(max_long_edge / max(h, w), + max_short_edge / min(h, w)) + else: + raise TypeError('Scale must be a number or tuple of int, ' + f'but got {type(scale)}') + + return scale_factor + + def _resize_img(self, results: dict): + """Resize images with ``results['scale']``.""" + assert self.keep_ratio is True + + if results.get('img', None) is not None: + image = results['img'] + original_h, original_w = image.shape[:2] + ratio = self._get_rescale_ratio((original_h, original_w), + self.scale) + + if ratio != 1: + # resize image according to the ratio + image = mmcv.imrescale( + img=image, + scale=ratio, + interpolation='area' if ratio < 1 else 'bilinear', + backend=self.backend) + + resized_h, resized_w = image.shape[:2] + scale_ratio = resized_h / original_h + + scale_factor = (scale_ratio, scale_ratio) + + results['img'] = image + results['img_shape'] = image.shape[:2] + results['scale_factor'] = scale_factor + + +@TRANSFORMS.register_module() +class LetterResize(MMDET_Resize): + """Resize and pad image while meeting stride-multiple constraints. + + Required Keys: + + - img (np.uint8) + - batch_shape (np.int64) (optional) + + Modified Keys: + + - img (np.uint8) + - img_shape (tuple) + - gt_bboxes (optional) + + Added Keys: + - pad_param (np.float32) + + Args: + scale (Union[int, Tuple[int, int]]): Images scales for resizing. + pad_val (dict): Padding value. Defaults to dict(img=0, seg=255). + use_mini_pad (bool): Whether using minimum rectangle padding. + Defaults to True + stretch_only (bool): Whether stretch to the specified size directly. + Defaults to False + allow_scale_up (bool): Allow scale up when ratio > 1. 
Defaults to True + """ + + def __init__(self, + scale: Union[int, Tuple[int, int]], + pad_val: dict = dict(img=0, mask=0, seg=255), + use_mini_pad: bool = False, + stretch_only: bool = False, + allow_scale_up: bool = True, + **kwargs): + super().__init__(scale=scale, keep_ratio=True, **kwargs) + + self.pad_val = pad_val + if isinstance(pad_val, (int, float)): + pad_val = dict(img=pad_val, seg=255) + assert isinstance( + pad_val, dict), f'pad_val must be dict, but got {type(pad_val)}' + + self.use_mini_pad = use_mini_pad + self.stretch_only = stretch_only + self.allow_scale_up = allow_scale_up + + def _resize_img(self, results: dict): + """Resize images with ``results['scale']``.""" + image = results.get('img', None) + if image is None: + return + + # Use batch_shape if a batch_shape policy is configured + if 'batch_shape' in results: + scale = tuple(results['batch_shape']) # hw + else: + scale = self.scale[::-1] # wh -> hw + + image_shape = image.shape[:2] # height, width + + # Scale ratio (new / old) + ratio = min(scale[0] / image_shape[0], scale[1] / image_shape[1]) + + # only scale down, do not scale up (for better test mAP) + if not self.allow_scale_up: + ratio = min(ratio, 1.0) + + ratio = [ratio, ratio] # float -> (float, float) for (height, width) + + # compute the best size of the image + no_pad_shape = (int(round(image_shape[0] * ratio[0])), + int(round(image_shape[1] * ratio[1]))) + + # padding height & width + padding_h, padding_w = [ + scale[0] - no_pad_shape[0], scale[1] - no_pad_shape[1] + ] + if self.use_mini_pad: + # minimum rectangle padding + padding_w, padding_h = np.mod(padding_w, 32), np.mod(padding_h, 32) + + elif self.stretch_only: + # stretch to the specified size directly + padding_h, padding_w = 0.0, 0.0 + no_pad_shape = (scale[0], scale[1]) + ratio = [scale[0] / image_shape[0], + scale[1] / image_shape[1]] # height, width ratios + + if image_shape != no_pad_shape: + # compare with no resize and padding size + image = mmcv.imresize( + image, (no_pad_shape[1], no_pad_shape[0]), + interpolation=self.interpolation, + backend=self.backend) + + scale_factor = (ratio[1], ratio[0]) # mmcv scale factor is (w, h) + + if 'scale_factor' in results: + results['scale_factor_origin'] = results['scale_factor'] + results['scale_factor'] = scale_factor + + # padding + top_padding, left_padding = int(round(padding_h // 2 - 0.1)), int( + round(padding_w // 2 - 0.1)) + bottom_padding = padding_h - top_padding + right_padding = padding_w - left_padding + + padding_list = [ + top_padding, bottom_padding, left_padding, right_padding + ] + if top_padding != 0 or bottom_padding != 0 or \ + left_padding != 0 or right_padding != 0: + + pad_val = self.pad_val.get('img', 0) + if isinstance(pad_val, int) and image.ndim == 3: + pad_val = tuple(pad_val for _ in range(image.shape[2])) + + image = mmcv.impad( + img=image, + padding=(padding_list[2], padding_list[0], padding_list[3], + padding_list[1]), + pad_val=pad_val, + padding_mode='constant') + + results['img'] = image + results['img_shape'] = image.shape + if 'pad_param' in results: + results['pad_param_origin'] = results['pad_param'] * \ + np.repeat(ratio, 2) + results['pad_param'] = np.array(padding_list, dtype=np.float32) + + def _resize_masks(self, results: dict): + """Resize masks with ``results['scale']``""" + if results.get('gt_masks', None) is None: + return + + gt_masks = results['gt_masks'] + assert isinstance( + gt_masks, PolygonMasks + ), f'Only supports PolygonMasks, but got {type(gt_masks)}' + + # resize the gt_masks + 
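+        # ``scale_factor`` is stored as (w_ratio, h_ratio), so the height uses
+        # index 1 and the width uses index 0; after resizing, the polygons are
+        # shifted by the same left/top padding that was applied to the image
+        # (``pad_param`` is [top, bottom, left, right]).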
gt_mask_h = results['gt_masks'].height * results['scale_factor'][1] + gt_mask_w = results['gt_masks'].width * results['scale_factor'][0] + gt_masks = results['gt_masks'].resize( + (int(round(gt_mask_h)), int(round(gt_mask_w)))) + + top_padding, _, left_padding, _ = results['pad_param'] + if int(left_padding) != 0: + gt_masks = gt_masks.translate( + out_shape=results['img_shape'][:2], + offset=int(left_padding), + direction='horizontal') + if int(top_padding) != 0: + gt_masks = gt_masks.translate( + out_shape=results['img_shape'][:2], + offset=int(top_padding), + direction='vertical') + results['gt_masks'] = gt_masks + + def _resize_bboxes(self, results: dict): + """Resize bounding boxes with ``results['scale_factor']``.""" + if results.get('gt_bboxes', None) is None: + return + results['gt_bboxes'].rescale_(results['scale_factor']) + + if len(results['pad_param']) != 4: + return + results['gt_bboxes'].translate_( + (results['pad_param'][2], results['pad_param'][0])) + + if self.clip_object_border: + results['gt_bboxes'].clip_(results['img_shape']) + + def transform(self, results: dict) -> dict: + results = super().transform(results) + if 'scale_factor_origin' in results: + scale_factor_origin = results.pop('scale_factor_origin') + results['scale_factor'] = (results['scale_factor'][0] * + scale_factor_origin[0], + results['scale_factor'][1] * + scale_factor_origin[1]) + if 'pad_param_origin' in results: + pad_param_origin = results.pop('pad_param_origin') + results['pad_param'] += pad_param_origin + return results + + +# TODO: Check if it can be merged with mmdet.YOLOXHSVRandomAug +@TRANSFORMS.register_module() +class YOLOv5HSVRandomAug(BaseTransform): + """Apply HSV augmentation to image sequentially. + + Required Keys: + + - img + + Modified Keys: + + - img + + Args: + hue_delta ([int, float]): delta of hue. Defaults to 0.015. + saturation_delta ([int, float]): delta of saturation. Defaults to 0.7. + value_delta ([int, float]): delta of value. Defaults to 0.4. + """ + + def __init__(self, + hue_delta: Union[int, float] = 0.015, + saturation_delta: Union[int, float] = 0.7, + value_delta: Union[int, float] = 0.4): + self.hue_delta = hue_delta + self.saturation_delta = saturation_delta + self.value_delta = value_delta + + def transform(self, results: dict) -> dict: + """The HSV augmentation transform function. + + Args: + results (dict): The result dict. + + Returns: + dict: The result dict. 
+ """ + hsv_gains = \ + random.uniform(-1, 1, 3) * \ + [self.hue_delta, self.saturation_delta, self.value_delta] + 1 + hue, sat, val = cv2.split( + cv2.cvtColor(results['img'], cv2.COLOR_BGR2HSV)) + + table_list = np.arange(0, 256, dtype=hsv_gains.dtype) + lut_hue = ((table_list * hsv_gains[0]) % 180).astype(np.uint8) + lut_sat = np.clip(table_list * hsv_gains[1], 0, 255).astype(np.uint8) + lut_val = np.clip(table_list * hsv_gains[2], 0, 255).astype(np.uint8) + + im_hsv = cv2.merge( + (cv2.LUT(hue, lut_hue), cv2.LUT(sat, + lut_sat), cv2.LUT(val, lut_val))) + results['img'] = cv2.cvtColor(im_hsv, cv2.COLOR_HSV2BGR) + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(hue_delta={self.hue_delta}, ' + repr_str += f'saturation_delta={self.saturation_delta}, ' + repr_str += f'value_delta={self.value_delta})' + return repr_str + + +@TRANSFORMS.register_module() +class LoadAnnotations(MMDET_LoadAnnotations): + """Because the yolo series does not need to consider ignore bboxes for the + time being, in order to speed up the pipeline, it can be excluded in + advance.""" + + def __init__(self, + mask2bbox: bool = False, + poly2mask: bool = False, + **kwargs) -> None: + self.mask2bbox = mask2bbox + assert not poly2mask, 'Does not support BitmapMasks considering ' \ + 'that bitmap consumes more memory.' + super().__init__(poly2mask=poly2mask, **kwargs) + if self.mask2bbox: + assert self.with_mask, 'Using mask2bbox requires ' \ + 'with_mask is True.' + self._mask_ignore_flag = None + + def transform(self, results: dict) -> dict: + """Function to load multiple types annotations. + + Args: + results (dict): Result dict from :obj:``mmengine.BaseDataset``. + + Returns: + dict: The dict contains loaded bounding box, label and + semantic segmentation. + """ + if self.mask2bbox: + self._load_masks(results) + if self.with_label: + self._load_labels(results) + self._update_mask_ignore_data(results) + gt_bboxes = results['gt_masks'].get_bboxes(dst_type='hbox') + results['gt_bboxes'] = gt_bboxes + else: + results = super().transform(results) + self._update_mask_ignore_data(results) + return results + + def _update_mask_ignore_data(self, results: dict) -> None: + if 'gt_masks' not in results: + return + + if 'gt_bboxes_labels' in results and len( + results['gt_bboxes_labels']) != len(results['gt_masks']): + assert len(results['gt_bboxes_labels']) == len( + self._mask_ignore_flag) + results['gt_bboxes_labels'] = results['gt_bboxes_labels'][ + self._mask_ignore_flag] + + if 'gt_bboxes' in results and len(results['gt_bboxes']) != len( + results['gt_masks']): + assert len(results['gt_bboxes']) == len(self._mask_ignore_flag) + results['gt_bboxes'] = results['gt_bboxes'][self._mask_ignore_flag] + + def _load_bboxes(self, results: dict): + """Private function to load bounding box annotations. + Note: BBoxes with ignore_flag of 1 is not considered. + Args: + results (dict): Result dict from :obj:``mmengine.BaseDataset``. + + Returns: + dict: The dict contains loaded bounding box annotations. 
+ """ + gt_bboxes = [] + gt_ignore_flags = [] + for instance in results.get('instances', []): + if instance['ignore_flag'] == 0: + gt_bboxes.append(instance['bbox']) + gt_ignore_flags.append(instance['ignore_flag']) + results['gt_ignore_flags'] = np.array(gt_ignore_flags, dtype=bool) + + if self.box_type is None: + results['gt_bboxes'] = np.array( + gt_bboxes, dtype=np.float32).reshape((-1, 4)) + else: + _, box_type_cls = get_box_type(self.box_type) + results['gt_bboxes'] = box_type_cls(gt_bboxes, dtype=torch.float32) + + def _load_labels(self, results: dict): + """Private function to load label annotations. + + Note: BBoxes with ignore_flag of 1 is not considered. + Args: + results (dict): Result dict from :obj:``mmengine.BaseDataset``. + Returns: + dict: The dict contains loaded label annotations. + """ + gt_bboxes_labels = [] + for instance in results.get('instances', []): + if instance['ignore_flag'] == 0: + gt_bboxes_labels.append(instance['bbox_label']) + results['gt_bboxes_labels'] = np.array( + gt_bboxes_labels, dtype=np.int64) + + def _load_masks(self, results: dict) -> None: + """Private function to load mask annotations. + + Args: + results (dict): Result dict from :obj:``mmengine.BaseDataset``. + """ + gt_masks = [] + gt_ignore_flags = [] + self._mask_ignore_flag = [] + for instance in results.get('instances', []): + if instance['ignore_flag'] == 0: + if 'mask' in instance: + gt_mask = instance['mask'] + if isinstance(gt_mask, list): + gt_mask = [ + np.array(polygon) for polygon in gt_mask + if len(polygon) % 2 == 0 and len(polygon) >= 6 + ] + if len(gt_mask) == 0: + # ignore + self._mask_ignore_flag.append(0) + else: + gt_masks.append(gt_mask) + gt_ignore_flags.append(instance['ignore_flag']) + self._mask_ignore_flag.append(1) + else: + raise NotImplementedError( + 'Only supports mask annotations in polygon ' + 'format currently') + else: + # TODO: Actually, gt with bbox and without mask needs + # to be retained + self._mask_ignore_flag.append(0) + self._mask_ignore_flag = np.array(self._mask_ignore_flag, dtype=bool) + results['gt_ignore_flags'] = np.array(gt_ignore_flags, dtype=bool) + + h, w = results['ori_shape'] + gt_masks = PolygonMasks([mask for mask in gt_masks], h, w) + results['gt_masks'] = gt_masks + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(with_bbox={self.with_bbox}, ' + repr_str += f'with_label={self.with_label}, ' + repr_str += f'with_mask={self.with_mask}, ' + repr_str += f'with_seg={self.with_seg}, ' + repr_str += f'mask2bbox={self.mask2bbox}, ' + repr_str += f'poly2mask={self.poly2mask}, ' + repr_str += f"imdecode_backend='{self.imdecode_backend}', " + repr_str += f'file_client_args={self.file_client_args})' + return repr_str + + +@TRANSFORMS.register_module() +class YOLOv5RandomAffine(BaseTransform): + """Random affine transform data augmentation in YOLOv5 and YOLOv8. It is + different from the implementation in YOLOX. + + This operation randomly generates affine transform matrix which including + rotation, translation, shear and scaling transforms. + If you set use_mask_refine == True, the code will use the masks + annotation to refine the bbox. + Our implementation is slightly different from the official. In COCO + dataset, a gt may have multiple mask tags. The official YOLOv5 + annotation file already combines the masks that an object has, + but our code takes into account the fact that an object has multiple masks. 
+ + Required Keys: + + - img + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_bboxes_labels (np.int64) (optional) + - gt_ignore_flags (bool) (optional) + - gt_masks (PolygonMasks) (optional) + + Modified Keys: + + - img + - img_shape + - gt_bboxes (optional) + - gt_bboxes_labels (optional) + - gt_ignore_flags (optional) + - gt_masks (PolygonMasks) (optional) + + Args: + max_rotate_degree (float): Maximum degrees of rotation transform. + Defaults to 10. + max_translate_ratio (float): Maximum ratio of translation. + Defaults to 0.1. + scaling_ratio_range (tuple[float]): Min and max ratio of + scaling transform. Defaults to (0.5, 1.5). + max_shear_degree (float): Maximum degrees of shear + transform. Defaults to 2. + border (tuple[int]): Distance from width and height sides of input + image to adjust output shape. Only used in mosaic dataset. + Defaults to (0, 0). + border_val (tuple[int]): Border padding values of 3 channels. + Defaults to (114, 114, 114). + bbox_clip_border (bool, optional): Whether to clip the objects outside + the border of the image. In some dataset like MOT17, the gt bboxes + are allowed to cross the border of images. Therefore, we don't + need to clip the gt bboxes in these cases. Defaults to True. + min_bbox_size (float): Width and height threshold to filter bboxes. + If the height or width of a box is smaller than this value, it + will be removed. Defaults to 2. + min_area_ratio (float): Threshold of area ratio between + original bboxes and wrapped bboxes. If smaller than this value, + the box will be removed. Defaults to 0.1. + use_mask_refine (bool): Whether to refine bbox by mask. + max_aspect_ratio (float): Aspect ratio of width and height + threshold to filter bboxes. If max(h/w, w/h) larger than this + value, the box will be removed. Defaults to 20. + resample_num (int): Number of poly to resample to. + """ + + def __init__(self, + max_rotate_degree: float = 10.0, + max_translate_ratio: float = 0.1, + scaling_ratio_range: Tuple[float, float] = (0.5, 1.5), + max_shear_degree: float = 2.0, + border: Tuple[int, int] = (0, 0), + border_val: Tuple[int, int, int] = (114, 114, 114), + bbox_clip_border: bool = True, + min_bbox_size: int = 2, + min_area_ratio: float = 0.1, + use_mask_refine: bool = False, + max_aspect_ratio: float = 20., + resample_num: int = 1000): + assert 0 <= max_translate_ratio <= 1 + assert scaling_ratio_range[0] <= scaling_ratio_range[1] + assert scaling_ratio_range[0] > 0 + self.max_rotate_degree = max_rotate_degree + self.max_translate_ratio = max_translate_ratio + self.scaling_ratio_range = scaling_ratio_range + self.max_shear_degree = max_shear_degree + self.border = border + self.border_val = border_val + self.bbox_clip_border = bbox_clip_border + self.min_bbox_size = min_bbox_size + self.min_area_ratio = min_area_ratio + self.use_mask_refine = use_mask_refine + self.max_aspect_ratio = max_aspect_ratio + self.resample_num = resample_num + + @autocast_box_type() + def transform(self, results: dict) -> dict: + """The YOLOv5 random affine transform function. + + Args: + results (dict): The result dict. + + Returns: + dict: The result dict. 
+ """ + img = results['img'] + # self.border is wh format + height = img.shape[0] + self.border[1] * 2 + width = img.shape[1] + self.border[0] * 2 + + # Note: Different from YOLOX + center_matrix = np.eye(3, dtype=np.float32) + center_matrix[0, 2] = -img.shape[1] / 2 + center_matrix[1, 2] = -img.shape[0] / 2 + + warp_matrix, scaling_ratio = self._get_random_homography_matrix( + height, width) + warp_matrix = warp_matrix @ center_matrix + + img = cv2.warpPerspective( + img, + warp_matrix, + dsize=(width, height), + borderValue=self.border_val) + results['img'] = img + results['img_shape'] = img.shape + img_h, img_w = img.shape[:2] + + bboxes = results['gt_bboxes'] + num_bboxes = len(bboxes) + if num_bboxes: + orig_bboxes = bboxes.clone() + if self.use_mask_refine and 'gt_masks' in results: + # If the dataset has annotations of mask, + # the mask will be used to refine bbox. + gt_masks = results['gt_masks'] + + gt_masks_resample = self.resample_masks(gt_masks) + gt_masks = self.warp_mask(gt_masks_resample, warp_matrix, + img_h, img_w) + + # refine bboxes by masks + bboxes = gt_masks.get_bboxes(dst_type='hbox') + # filter bboxes outside image + valid_index = self.filter_gt_bboxes(orig_bboxes, + bboxes).numpy() + results['gt_masks'] = gt_masks[valid_index] + else: + bboxes.project_(warp_matrix) + if self.bbox_clip_border: + bboxes.clip_([height, width]) + + # filter bboxes + orig_bboxes.rescale_([scaling_ratio, scaling_ratio]) + + # Be careful: valid_index must convert to numpy, + # otherwise it will raise out of bounds when len(valid_index)=1 + valid_index = self.filter_gt_bboxes(orig_bboxes, + bboxes).numpy() + if 'gt_masks' in results: + results['gt_masks'] = PolygonMasks( + results['gt_masks'].masks, img_h, img_w) + + results['gt_bboxes'] = bboxes[valid_index] + results['gt_bboxes_labels'] = results['gt_bboxes_labels'][ + valid_index] + results['gt_ignore_flags'] = results['gt_ignore_flags'][ + valid_index] + + return results + + @staticmethod + def warp_poly(poly: np.ndarray, warp_matrix: np.ndarray, img_w: int, + img_h: int) -> np.ndarray: + """Function to warp one mask and filter points outside image. + + Args: + poly (np.ndarray): Segmentation annotation with shape (n, ) and + with format (x1, y1, x2, y2, ...). + warp_matrix (np.ndarray): Affine transformation matrix. + Shape: (3, 3). + img_w (int): Width of output image. + img_h (int): Height of output image. + """ + # TODO: Current logic may cause retained masks unusable for + # semantic segmentation training, which is same as official + # implementation. + poly = poly.reshape((-1, 2)) + poly = np.concatenate((poly, np.ones( + (len(poly), 1), dtype=poly.dtype)), + axis=-1) + # transform poly + poly = poly @ warp_matrix.T + poly = poly[:, :2] / poly[:, 2:3] + + # filter point outside image + x, y = poly.T + valid_ind_point = (x >= 0) & (y >= 0) & (x <= img_w) & (y <= img_h) + return poly[valid_ind_point].reshape(-1) + + def warp_mask(self, gt_masks: PolygonMasks, warp_matrix: np.ndarray, + img_w: int, img_h: int) -> PolygonMasks: + """Warp masks by warp_matrix and retain masks inside image after + warping. + + Args: + gt_masks (PolygonMasks): Annotations of semantic segmentation. + warp_matrix (np.ndarray): Affine transformation matrix. + Shape: (3, 3). + img_w (int): Width of output image. + img_h (int): Height of output image. + + Returns: + PolygonMasks: Masks after warping. + """ + masks = gt_masks.masks + + new_masks = [] + for poly_per_obj in masks: + warpped_poly_per_obj = [] + # One gt may have multiple masks. 
+ for poly in poly_per_obj: + valid_poly = self.warp_poly(poly, warp_matrix, img_w, img_h) + if len(valid_poly): + warpped_poly_per_obj.append(valid_poly.reshape(-1)) + # If all the masks are invalid, + # add [0, 0, 0, 0, 0, 0,] here. + if not warpped_poly_per_obj: + # This will be filtered in function `filter_gt_bboxes`. + warpped_poly_per_obj = [ + np.zeros(6, dtype=poly_per_obj[0].dtype) + ] + new_masks.append(warpped_poly_per_obj) + + gt_masks = PolygonMasks(new_masks, img_h, img_w) + return gt_masks + + def resample_masks(self, gt_masks: PolygonMasks) -> PolygonMasks: + """Function to resample each mask annotation with shape (2 * n, ) to + shape (resample_num * 2, ). + + Args: + gt_masks (PolygonMasks): Annotations of semantic segmentation. + """ + masks = gt_masks.masks + new_masks = [] + for poly_per_obj in masks: + resample_poly_per_obj = [] + for poly in poly_per_obj: + poly = poly.reshape((-1, 2)) # xy + poly = np.concatenate((poly, poly[0:1, :]), axis=0) + x = np.linspace(0, len(poly) - 1, self.resample_num) + xp = np.arange(len(poly)) + poly = np.concatenate([ + np.interp(x, xp, poly[:, i]) for i in range(2) + ]).reshape(2, -1).T.reshape(-1) + resample_poly_per_obj.append(poly) + new_masks.append(resample_poly_per_obj) + return PolygonMasks(new_masks, gt_masks.height, gt_masks.width) + + def filter_gt_bboxes(self, origin_bboxes: HorizontalBoxes, + wrapped_bboxes: HorizontalBoxes) -> torch.Tensor: + """Filter gt bboxes. + + Args: + origin_bboxes (HorizontalBoxes): Origin bboxes. + wrapped_bboxes (HorizontalBoxes): Wrapped bboxes + + Returns: + dict: The result dict. + """ + origin_w = origin_bboxes.widths + origin_h = origin_bboxes.heights + wrapped_w = wrapped_bboxes.widths + wrapped_h = wrapped_bboxes.heights + aspect_ratio = np.maximum(wrapped_w / (wrapped_h + 1e-16), + wrapped_h / (wrapped_w + 1e-16)) + + wh_valid_idx = (wrapped_w > self.min_bbox_size) & \ + (wrapped_h > self.min_bbox_size) + area_valid_idx = wrapped_w * wrapped_h / (origin_w * origin_h + + 1e-16) > self.min_area_ratio + aspect_ratio_valid_idx = aspect_ratio < self.max_aspect_ratio + return wh_valid_idx & area_valid_idx & aspect_ratio_valid_idx + + @cache_randomness + def _get_random_homography_matrix(self, height: int, + width: int) -> Tuple[np.ndarray, float]: + """Get random homography matrix. + + Args: + height (int): Image height. + width (int): Image width. + + Returns: + Tuple[np.ndarray, float]: The result of warp_matrix and + scaling_ratio. 
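+
+        Note:
+            A rough sketch of the composition built below:
+            ``warp = T(translate) @ Shear @ R(rotate) @ Scale``. The caller
+            (``transform``) additionally right-multiplies a centering matrix,
+            so rotation, scale and shear are taken about the image center.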
+ """ + # Rotation + rotation_degree = random.uniform(-self.max_rotate_degree, + self.max_rotate_degree) + rotation_matrix = self._get_rotation_matrix(rotation_degree) + + # Scaling + scaling_ratio = random.uniform(self.scaling_ratio_range[0], + self.scaling_ratio_range[1]) + scaling_matrix = self._get_scaling_matrix(scaling_ratio) + + # Shear + x_degree = random.uniform(-self.max_shear_degree, + self.max_shear_degree) + y_degree = random.uniform(-self.max_shear_degree, + self.max_shear_degree) + shear_matrix = self._get_shear_matrix(x_degree, y_degree) + + # Translation + trans_x = random.uniform(0.5 - self.max_translate_ratio, + 0.5 + self.max_translate_ratio) * width + trans_y = random.uniform(0.5 - self.max_translate_ratio, + 0.5 + self.max_translate_ratio) * height + translate_matrix = self._get_translation_matrix(trans_x, trans_y) + warp_matrix = ( + translate_matrix @ shear_matrix @ rotation_matrix @ scaling_matrix) + return warp_matrix, scaling_ratio + + @staticmethod + def _get_rotation_matrix(rotate_degrees: float) -> np.ndarray: + """Get rotation matrix. + + Args: + rotate_degrees (float): Rotate degrees. + + Returns: + np.ndarray: The rotation matrix. + """ + radian = math.radians(rotate_degrees) + rotation_matrix = np.array( + [[np.cos(radian), -np.sin(radian), 0.], + [np.sin(radian), np.cos(radian), 0.], [0., 0., 1.]], + dtype=np.float32) + return rotation_matrix + + @staticmethod + def _get_scaling_matrix(scale_ratio: float) -> np.ndarray: + """Get scaling matrix. + + Args: + scale_ratio (float): Scale ratio. + + Returns: + np.ndarray: The scaling matrix. + """ + scaling_matrix = np.array( + [[scale_ratio, 0., 0.], [0., scale_ratio, 0.], [0., 0., 1.]], + dtype=np.float32) + return scaling_matrix + + @staticmethod + def _get_shear_matrix(x_shear_degrees: float, + y_shear_degrees: float) -> np.ndarray: + """Get shear matrix. + + Args: + x_shear_degrees (float): X shear degrees. + y_shear_degrees (float): Y shear degrees. + + Returns: + np.ndarray: The shear matrix. + """ + x_radian = math.radians(x_shear_degrees) + y_radian = math.radians(y_shear_degrees) + shear_matrix = np.array([[1, np.tan(x_radian), 0.], + [np.tan(y_radian), 1, 0.], [0., 0., 1.]], + dtype=np.float32) + return shear_matrix + + @staticmethod + def _get_translation_matrix(x: float, y: float) -> np.ndarray: + """Get translation matrix. + + Args: + x (float): X translation. + y (float): Y translation. + + Returns: + np.ndarray: The translation matrix. + """ + translation_matrix = np.array([[1, 0., x], [0., 1, y], [0., 0., 1.]], + dtype=np.float32) + return translation_matrix + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(max_rotate_degree={self.max_rotate_degree}, ' + repr_str += f'max_translate_ratio={self.max_translate_ratio}, ' + repr_str += f'scaling_ratio_range={self.scaling_ratio_range}, ' + repr_str += f'max_shear_degree={self.max_shear_degree}, ' + repr_str += f'border={self.border}, ' + repr_str += f'border_val={self.border_val}, ' + repr_str += f'bbox_clip_border={self.bbox_clip_border})' + return repr_str + + +@TRANSFORMS.register_module() +class PPYOLOERandomDistort(BaseTransform): + """Random hue, saturation, contrast and brightness distortion. + + Required Keys: + + - img + + Modified Keys: + + - img (np.float32) + + Args: + hue_cfg (dict): Hue settings. Defaults to dict(min=-18, + max=18, prob=0.5). + saturation_cfg (dict): Saturation settings. Defaults to dict( + min=0.5, max=1.5, prob=0.5). + contrast_cfg (dict): Contrast settings. 
Defaults to dict( + min=0.5, max=1.5, prob=0.5). + brightness_cfg (dict): Brightness settings. Defaults to dict( + min=0.5, max=1.5, prob=0.5). + num_distort_func (int): The number of distort function. Defaults + to 4. + """ + + def __init__(self, + hue_cfg: dict = dict(min=-18, max=18, prob=0.5), + saturation_cfg: dict = dict(min=0.5, max=1.5, prob=0.5), + contrast_cfg: dict = dict(min=0.5, max=1.5, prob=0.5), + brightness_cfg: dict = dict(min=0.5, max=1.5, prob=0.5), + num_distort_func: int = 4): + self.hue_cfg = hue_cfg + self.saturation_cfg = saturation_cfg + self.contrast_cfg = contrast_cfg + self.brightness_cfg = brightness_cfg + self.num_distort_func = num_distort_func + assert 0 < self.num_distort_func <= 4, \ + 'num_distort_func must > 0 and <= 4' + for cfg in [ + self.hue_cfg, self.saturation_cfg, self.contrast_cfg, + self.brightness_cfg + ]: + assert 0. <= cfg['prob'] <= 1., 'prob must >=0 and <=1' + + def transform_hue(self, results): + """Transform hue randomly.""" + if random.uniform(0., 1.) >= self.hue_cfg['prob']: + return results + img = results['img'] + delta = random.uniform(self.hue_cfg['min'], self.hue_cfg['max']) + u = np.cos(delta * np.pi) + w = np.sin(delta * np.pi) + delta_iq = np.array([[1.0, 0.0, 0.0], [0.0, u, -w], [0.0, w, u]]) + rgb2yiq_matrix = np.array([[0.114, 0.587, 0.299], + [-0.321, -0.274, 0.596], + [0.311, -0.523, 0.211]]) + yiq2rgb_matric = np.array([[1.0, -1.107, 1.705], [1.0, -0.272, -0.647], + [1.0, 0.956, 0.621]]) + t = np.dot(np.dot(yiq2rgb_matric, delta_iq), rgb2yiq_matrix).T + img = np.dot(img, t) + results['img'] = img + return results + + def transform_saturation(self, results): + """Transform saturation randomly.""" + if random.uniform(0., 1.) >= self.saturation_cfg['prob']: + return results + img = results['img'] + delta = random.uniform(self.saturation_cfg['min'], + self.saturation_cfg['max']) + + # convert bgr img to gray img + gray = img * np.array([[[0.114, 0.587, 0.299]]], dtype=np.float32) + gray = gray.sum(axis=2, keepdims=True) + gray *= (1.0 - delta) + img *= delta + img += gray + results['img'] = img + return results + + def transform_contrast(self, results): + """Transform contrast randomly.""" + if random.uniform(0., 1.) >= self.contrast_cfg['prob']: + return results + img = results['img'] + delta = random.uniform(self.contrast_cfg['min'], + self.contrast_cfg['max']) + img *= delta + results['img'] = img + return results + + def transform_brightness(self, results): + """Transform brightness randomly.""" + if random.uniform(0., 1.) >= self.brightness_cfg['prob']: + return results + img = results['img'] + delta = random.uniform(self.brightness_cfg['min'], + self.brightness_cfg['max']) + img += delta + results['img'] = img + return results + + def transform(self, results: dict) -> dict: + """The hue, saturation, contrast and brightness distortion function. + + Args: + results (dict): The result dict. + + Returns: + dict: The result dict. 
+ """ + results['img'] = results['img'].astype(np.float32) + + functions = [ + self.transform_brightness, self.transform_contrast, + self.transform_saturation, self.transform_hue + ] + distortions = random.permutation(functions)[:self.num_distort_func] + for func in distortions: + results = func(results) + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(hue_cfg={self.hue_cfg}, ' + repr_str += f'saturation_cfg={self.saturation_cfg}, ' + repr_str += f'contrast_cfg={self.contrast_cfg}, ' + repr_str += f'brightness_cfg={self.brightness_cfg}, ' + repr_str += f'num_distort_func={self.num_distort_func})' + return repr_str + + +@TRANSFORMS.register_module() +class PPYOLOERandomCrop(BaseTransform): + """Random crop the img and bboxes. Different thresholds are used in PPYOLOE + to judge whether the clipped image meets the requirements. This + implementation is different from the implementation of RandomCrop in mmdet. + + Required Keys: + + - img + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_bboxes_labels (np.int64) (optional) + - gt_ignore_flags (bool) (optional) + + Modified Keys: + + - img + - img_shape + - gt_bboxes (optional) + - gt_bboxes_labels (optional) + - gt_ignore_flags (optional) + + Added Keys: + - pad_param (np.float32) + + Args: + aspect_ratio (List[float]): Aspect ratio of cropped region. Default to + [.5, 2]. + thresholds (List[float]): Iou thresholds for deciding a valid bbox crop + in [min, max] format. Defaults to [.0, .1, .3, .5, .7, .9]. + scaling (List[float]): Ratio between a cropped region and the original + image in [min, max] format. Default to [.3, 1.]. + num_attempts (int): Number of tries for each threshold before + giving up. Default to 50. + allow_no_crop (bool): Allow return without actually cropping them. + Default to True. + cover_all_box (bool): Ensure all bboxes are covered in the final crop. + Default to False. + """ + + def __init__(self, + aspect_ratio: List[float] = [.5, 2.], + thresholds: List[float] = [.0, .1, .3, .5, .7, .9], + scaling: List[float] = [.3, 1.], + num_attempts: int = 50, + allow_no_crop: bool = True, + cover_all_box: bool = False): + self.aspect_ratio = aspect_ratio + self.thresholds = thresholds + self.scaling = scaling + self.num_attempts = num_attempts + self.allow_no_crop = allow_no_crop + self.cover_all_box = cover_all_box + + def _crop_data(self, results: dict, crop_box: Tuple[int, int, int, int], + valid_inds: np.ndarray) -> Union[dict, None]: + """Function to randomly crop images, bounding boxes, masks, semantic + segmentation maps. + + Args: + results (dict): Result dict from loading pipeline. + crop_box (Tuple[int, int, int, int]): Expected absolute coordinates + for cropping, (x1, y1, x2, y2). + valid_inds (np.ndarray): The indexes of gt that needs to be + retained. + + Returns: + results (Union[dict, None]): Randomly cropped results, 'img_shape' + key in result dict is updated according to crop size. None will + be returned when there is no valid bbox after cropping. + """ + # crop the image + img = results['img'] + crop_x1, crop_y1, crop_x2, crop_y2 = crop_box + img = img[crop_y1:crop_y2, crop_x1:crop_x2, ...] 
+ results['img'] = img + img_shape = img.shape + results['img_shape'] = img.shape + + # crop bboxes accordingly and clip to the image boundary + if results.get('gt_bboxes', None) is not None: + bboxes = results['gt_bboxes'] + bboxes.translate_([-crop_x1, -crop_y1]) + bboxes.clip_(img_shape[:2]) + + results['gt_bboxes'] = bboxes[valid_inds] + + if results.get('gt_ignore_flags', None) is not None: + results['gt_ignore_flags'] = \ + results['gt_ignore_flags'][valid_inds] + + if results.get('gt_bboxes_labels', None) is not None: + results['gt_bboxes_labels'] = \ + results['gt_bboxes_labels'][valid_inds] + + if results.get('gt_masks', None) is not None: + results['gt_masks'] = results['gt_masks'][ + valid_inds.nonzero()[0]].crop( + np.asarray([crop_x1, crop_y1, crop_x2, crop_y2])) + + # crop semantic seg + if results.get('gt_seg_map', None) is not None: + results['gt_seg_map'] = results['gt_seg_map'][crop_y1:crop_y2, + crop_x1:crop_x2] + + return results + + @autocast_box_type() + def transform(self, results: dict) -> Union[dict, None]: + """The random crop transform function. + + Args: + results (dict): The result dict. + + Returns: + dict: The result dict. + """ + if results.get('gt_bboxes', None) is None or len( + results['gt_bboxes']) == 0: + return results + + orig_img_h, orig_img_w = results['img'].shape[:2] + gt_bboxes = results['gt_bboxes'] + + thresholds = list(self.thresholds) + if self.allow_no_crop: + thresholds.append('no_crop') + random.shuffle(thresholds) + + for thresh in thresholds: + # Determine the coordinates for cropping + if thresh == 'no_crop': + return results + + found = False + for i in range(self.num_attempts): + crop_h, crop_w = self._get_crop_size((orig_img_h, orig_img_w)) + if self.aspect_ratio is None: + if crop_h / crop_w < 0.5 or crop_h / crop_w > 2.0: + continue + + # get image crop_box + margin_h = max(orig_img_h - crop_h, 0) + margin_w = max(orig_img_w - crop_w, 0) + offset_h, offset_w = self._rand_offset((margin_h, margin_w)) + crop_y1, crop_y2 = offset_h, offset_h + crop_h + crop_x1, crop_x2 = offset_w, offset_w + crop_w + + crop_box = [crop_x1, crop_y1, crop_x2, crop_y2] + # Calculate the iou between gt_bboxes and crop_boxes + iou = self._iou_matrix(gt_bboxes, + np.array([crop_box], dtype=np.float32)) + # If the maximum value of the iou is less than thresh, + # the current crop_box is considered invalid. + if iou.max() < thresh: + continue + + # If cover_all_box == True and the minimum value of + # the iou is less than thresh, the current crop_box + # is considered invalid. + if self.cover_all_box and iou.min() < thresh: + continue + + # Get which gt_bboxes to keep after cropping. + valid_inds = self._get_valid_inds( + gt_bboxes, np.array(crop_box, dtype=np.float32)) + if valid_inds.size > 0: + found = True + break + + if found: + results = self._crop_data(results, crop_box, valid_inds) + return results + return results + + @cache_randomness + def _rand_offset(self, margin: Tuple[int, int]) -> Tuple[int, int]: + """Randomly generate crop offset. + + Args: + margin (Tuple[int, int]): The upper bound for the offset generated + randomly. + + Returns: + Tuple[int, int]: The random offset for the crop. + """ + margin_h, margin_w = margin + offset_h = np.random.randint(0, margin_h + 1) + offset_w = np.random.randint(0, margin_w + 1) + + return (offset_h, offset_w) + + @cache_randomness + def _get_crop_size(self, image_size: Tuple[int, int]) -> Tuple[int, int]: + """Randomly generates the crop size based on `image_size`. 
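+
+        A scale factor is drawn from ``scaling`` and, when ``aspect_ratio``
+        is set, an aspect ratio from that range (clipped by ``scale``), so
+        the crop is roughly ``crop_h = h * scale / sqrt(ar)`` and
+        ``crop_w = w * scale * sqrt(ar)``.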
+ + Args: + image_size (Tuple[int, int]): (h, w). + + Returns: + crop_size (Tuple[int, int]): (crop_h, crop_w) in absolute pixels. + """ + h, w = image_size + scale = random.uniform(*self.scaling) + if self.aspect_ratio is not None: + min_ar, max_ar = self.aspect_ratio + aspect_ratio = random.uniform( + max(min_ar, scale**2), min(max_ar, scale**-2)) + h_scale = scale / np.sqrt(aspect_ratio) + w_scale = scale * np.sqrt(aspect_ratio) + else: + h_scale = random.uniform(*self.scaling) + w_scale = random.uniform(*self.scaling) + crop_h = h * h_scale + crop_w = w * w_scale + return int(crop_h), int(crop_w) + + def _iou_matrix(self, + gt_bbox: HorizontalBoxes, + crop_bbox: np.ndarray, + eps: float = 1e-10) -> np.ndarray: + """Calculate iou between gt and image crop box. + + Args: + gt_bbox (HorizontalBoxes): Ground truth bounding boxes. + crop_bbox (np.ndarray): Image crop coordinates in + [x1, y1, x2, y2] format. + eps (float): Default to 1e-10. + Return: + (np.ndarray): IoU. + """ + gt_bbox = gt_bbox.tensor.numpy() + lefttop = np.maximum(gt_bbox[:, np.newaxis, :2], crop_bbox[:, :2]) + rightbottom = np.minimum(gt_bbox[:, np.newaxis, 2:], crop_bbox[:, 2:]) + + overlap = np.prod( + rightbottom - lefttop, + axis=2) * (lefttop < rightbottom).all(axis=2) + area_gt_bbox = np.prod(gt_bbox[:, 2:] - crop_bbox[:, :2], axis=1) + area_crop_bbox = np.prod(gt_bbox[:, 2:] - crop_bbox[:, :2], axis=1) + area_o = (area_gt_bbox[:, np.newaxis] + area_crop_bbox - overlap) + return overlap / (area_o + eps) + + def _get_valid_inds(self, gt_bbox: HorizontalBoxes, + img_crop_bbox: np.ndarray) -> np.ndarray: + """Get which Bboxes to keep at the current cropping coordinates. + + Args: + gt_bbox (HorizontalBoxes): Ground truth bounding boxes. + img_crop_bbox (np.ndarray): Image crop coordinates in + [x1, y1, x2, y2] format. + + Returns: + (np.ndarray): Valid indexes. + """ + cropped_box = gt_bbox.tensor.numpy().copy() + gt_bbox = gt_bbox.tensor.numpy().copy() + + cropped_box[:, :2] = np.maximum(gt_bbox[:, :2], img_crop_bbox[:2]) + cropped_box[:, 2:] = np.minimum(gt_bbox[:, 2:], img_crop_bbox[2:]) + cropped_box[:, :2] -= img_crop_bbox[:2] + cropped_box[:, 2:] -= img_crop_bbox[:2] + + centers = (gt_bbox[:, :2] + gt_bbox[:, 2:]) / 2 + valid = np.logical_and(img_crop_bbox[:2] <= centers, + centers < img_crop_bbox[2:]).all(axis=1) + valid = np.logical_and( + valid, (cropped_box[:, :2] < cropped_box[:, 2:]).all(axis=1)) + + return np.where(valid)[0] + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(aspect_ratio={self.aspect_ratio}, ' + repr_str += f'thresholds={self.thresholds}, ' + repr_str += f'scaling={self.scaling}, ' + repr_str += f'num_attempts={self.num_attempts}, ' + repr_str += f'allow_no_crop={self.allow_no_crop}, ' + repr_str += f'cover_all_box={self.cover_all_box})' + return repr_str + + +@TRANSFORMS.register_module() +class YOLOv5CopyPaste(BaseTransform): + """Copy-Paste used in YOLOv5 and YOLOv8. + + This transform randomly copy some objects in the image to the mirror + position of the image.It is different from the `CopyPaste` in mmdet. + + Required Keys: + + - img (np.uint8) + - gt_bboxes (BaseBoxes[torch.float32]) + - gt_bboxes_labels (np.int64) (optional) + - gt_ignore_flags (bool) (optional) + - gt_masks (PolygonMasks) (optional) + + Modified Keys: + + - img + - gt_bboxes + - gt_bboxes_labels (np.int64) (optional) + - gt_ignore_flags (optional) + - gt_masks (optional) + + Args: + ioa_thresh (float): Ioa thresholds for deciding valid bbox. 
+ prob (float): Probability of choosing objects. + Defaults to 0.5. + """ + + def __init__(self, ioa_thresh: float = 0.3, prob: float = 0.5): + self.ioa_thresh = ioa_thresh + self.prob = prob + + @autocast_box_type() + def transform(self, results: dict) -> Union[dict, None]: + """The YOLOv5 and YOLOv8 Copy-Paste transform function. + + Args: + results (dict): The result dict. + + Returns: + dict: The result dict. + """ + if len(results.get('gt_masks', [])) == 0: + return results + gt_masks = results['gt_masks'] + assert isinstance(gt_masks, PolygonMasks),\ + 'only support type of PolygonMasks,' \ + ' but get type: %s' % type(gt_masks) + gt_bboxes = results['gt_bboxes'] + gt_bboxes_labels = results.get('gt_bboxes_labels', None) + img = results['img'] + img_h, img_w = img.shape[:2] + + # calculate ioa + gt_bboxes_flip = deepcopy(gt_bboxes) + gt_bboxes_flip.flip_(img.shape) + + ioa = self.bbox_ioa(gt_bboxes_flip, gt_bboxes) + indexes = torch.nonzero((ioa < self.ioa_thresh).all(1))[:, 0] + n = len(indexes) + valid_inds = random.choice( + indexes, size=round(self.prob * n), replace=False) + if len(valid_inds) == 0: + return results + + if gt_bboxes_labels is not None: + # prepare labels + gt_bboxes_labels = np.concatenate( + (gt_bboxes_labels, gt_bboxes_labels[valid_inds]), axis=0) + + # prepare bboxes + copypaste_bboxes = gt_bboxes_flip[valid_inds] + gt_bboxes = gt_bboxes.cat([gt_bboxes, copypaste_bboxes]) + + # prepare images + copypaste_gt_masks = gt_masks[valid_inds] + copypaste_gt_masks_flip = copypaste_gt_masks.flip() + # convert poly format to bitmap format + # example: poly: [[array(0.0, 0.0, 10.0, 0.0, 10.0, 10.0, 0.0, 10.0]] + # -> bitmap: a mask with shape equal to (1, img_h, img_w) + # # type1 low speed + # copypaste_gt_masks_bitmap = copypaste_gt_masks.to_ndarray() + # copypaste_mask = np.sum(copypaste_gt_masks_bitmap, axis=0) > 0 + + # type2 + copypaste_mask = np.zeros((img_h, img_w), dtype=np.uint8) + for poly in copypaste_gt_masks.masks: + poly = [i.reshape((-1, 1, 2)).astype(np.int32) for i in poly] + cv2.drawContours(copypaste_mask, poly, -1, (1, ), cv2.FILLED) + + copypaste_mask = copypaste_mask.astype(bool) + + # copy objects, and paste to the mirror position of the image + copypaste_mask_flip = mmcv.imflip( + copypaste_mask, direction='horizontal') + copypaste_img = mmcv.imflip(img, direction='horizontal') + img[copypaste_mask_flip] = copypaste_img[copypaste_mask_flip] + + # prepare masks + gt_masks = copypaste_gt_masks.cat([gt_masks, copypaste_gt_masks_flip]) + + if 'gt_ignore_flags' in results: + # prepare gt_ignore_flags + gt_ignore_flags = results['gt_ignore_flags'] + gt_ignore_flags = np.concatenate( + [gt_ignore_flags, gt_ignore_flags[valid_inds]], axis=0) + results['gt_ignore_flags'] = gt_ignore_flags + + results['img'] = img + results['gt_bboxes'] = gt_bboxes + if gt_bboxes_labels is not None: + results['gt_bboxes_labels'] = gt_bboxes_labels + results['gt_masks'] = gt_masks + + return results + + @staticmethod + def bbox_ioa(gt_bboxes_flip: HorizontalBoxes, + gt_bboxes: HorizontalBoxes, + eps: float = 1e-7) -> np.ndarray: + """Calculate ioa between gt_bboxes_flip and gt_bboxes. + + Args: + gt_bboxes_flip (HorizontalBoxes): Flipped ground truth + bounding boxes. + gt_bboxes (HorizontalBoxes): Ground truth bounding boxes. + eps (float): Default to 1e-10. + Return: + (Tensor): Ioa. 
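+
+        Note:
+            The ratio is taken over the area of ``gt_bboxes`` (the second
+            argument), i.e. ``intersection / area(gt_bboxes)``, not a
+            symmetric IoU.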
+ """ + gt_bboxes_flip = gt_bboxes_flip.tensor + gt_bboxes = gt_bboxes.tensor + + # Get the coordinates of bounding boxes + b1_x1, b1_y1, b1_x2, b1_y2 = gt_bboxes_flip.T + b2_x1, b2_y1, b2_x2, b2_y2 = gt_bboxes.T + + # Intersection area + inter_area = (torch.minimum(b1_x2[:, None], + b2_x2) - torch.maximum(b1_x1[:, None], + b2_x1)).clip(0) * \ + (torch.minimum(b1_y2[:, None], + b2_y2) - torch.maximum(b1_y1[:, None], + b2_y1)).clip(0) + + # box2 area + box2_area = (b2_x2 - b2_x1) * (b2_y2 - b2_y1) + eps + + # Intersection over box2 area + return inter_area / box2_area + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(ioa_thresh={self.ioa_thresh},' + repr_str += f'prob={self.prob})' + return repr_str + + +@TRANSFORMS.register_module() +class RemoveDataElement(BaseTransform): + """Remove unnecessary data element in results. + + Args: + keys (Union[str, Sequence[str]]): Keys need to be removed. + """ + + def __init__(self, keys: Union[str, Sequence[str]]): + self.keys = [keys] if isinstance(keys, str) else keys + + def transform(self, results: dict) -> dict: + for key in self.keys: + results.pop(key, None) + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(keys={self.keys})' + return repr_str + + +@TRANSFORMS.register_module() +class RegularizeRotatedBox(BaseTransform): + """Regularize rotated boxes. + + Due to the angle periodicity, one rotated box can be represented in + many different (x, y, w, h, t). To make each rotated box unique, + ``regularize_boxes`` will take the remainder of the angle divided by + 180 degrees. + + For convenience, three angle_version can be used here: + + - 'oc': OpenCV Definition. Has the same box representation as + ``cv2.minAreaRect`` the angle ranges in [-90, 0). + - 'le90': Long Edge Definition (90). the angle ranges in [-90, 90). + The width is always longer than the height. + - 'le135': Long Edge Definition (135). the angle ranges in [-45, 135). + The width is always longer than the height. + + Required Keys: + + - gt_bboxes (RotatedBoxes[torch.float32]) + + Modified Keys: + + - gt_bboxes + + Args: + angle_version (str): Angle version. Can only be 'oc', + 'le90', or 'le135'. Defaults to 'le90. + """ + + def __init__(self, angle_version='le90') -> None: + self.angle_version = angle_version + try: + from mmrotate.structures.bbox import RotatedBoxes + self.box_type = RotatedBoxes + except ImportError: + raise ImportError( + 'Please run "mim install -r requirements/mmrotate.txt" ' + 'to install mmrotate first for rotated detection.') + + def transform(self, results: dict) -> dict: + assert isinstance(results['gt_bboxes'], self.box_type) + results['gt_bboxes'] = self.box_type( + results['gt_bboxes'].regularize_boxes(self.angle_version)) + return results diff --git a/mmyolo/datasets/utils.py b/mmyolo/datasets/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..62fe5484b4befc76328798d6f044d1c283edc397 --- /dev/null +++ b/mmyolo/datasets/utils.py @@ -0,0 +1,114 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Sequence + +import numpy as np +import torch +from mmengine.dataset import COLLATE_FUNCTIONS + +from ..registry import TASK_UTILS + + +@COLLATE_FUNCTIONS.register_module() +def yolov5_collate(data_batch: Sequence, + use_ms_training: bool = False) -> dict: + """Rewrite collate_fn to get faster training speed. + + Args: + data_batch (Sequence): Batch of data. + use_ms_training (bool): Whether to use multi-scale training. 
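+
+    Returns:
+        dict: Collated results with ``inputs`` (a stacked image tensor, or a
+        list of images when ``use_ms_training`` is True) and ``data_samples``
+        holding a ``bboxes_labels`` tensor whose rows are roughly
+        ``(batch_index, class_label, *bbox)``, plus concatenated ``masks``
+        when mask annotations are present.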
+ """ + batch_imgs = [] + batch_bboxes_labels = [] + batch_masks = [] + for i in range(len(data_batch)): + datasamples = data_batch[i]['data_samples'] + inputs = data_batch[i]['inputs'] + batch_imgs.append(inputs) + + gt_bboxes = datasamples.gt_instances.bboxes.tensor + gt_labels = datasamples.gt_instances.labels + if 'masks' in datasamples.gt_instances: + masks = datasamples.gt_instances.masks.to_tensor( + dtype=torch.bool, device=gt_bboxes.device) + batch_masks.append(masks) + batch_idx = gt_labels.new_full((len(gt_labels), 1), i) + bboxes_labels = torch.cat((batch_idx, gt_labels[:, None], gt_bboxes), + dim=1) + batch_bboxes_labels.append(bboxes_labels) + + collated_results = { + 'data_samples': { + 'bboxes_labels': torch.cat(batch_bboxes_labels, 0) + } + } + if len(batch_masks) > 0: + collated_results['data_samples']['masks'] = torch.cat(batch_masks, 0) + + if use_ms_training: + collated_results['inputs'] = batch_imgs + else: + collated_results['inputs'] = torch.stack(batch_imgs, 0) + return collated_results + + +@TASK_UTILS.register_module() +class BatchShapePolicy: + """BatchShapePolicy is only used in the testing phase, which can reduce the + number of pad pixels during batch inference. + + Args: + batch_size (int): Single GPU batch size during batch inference. + Defaults to 32. + img_size (int): Expected output image size. Defaults to 640. + size_divisor (int): The minimum size that is divisible + by size_divisor. Defaults to 32. + extra_pad_ratio (float): Extra pad ratio. Defaults to 0.5. + """ + + def __init__(self, + batch_size: int = 32, + img_size: int = 640, + size_divisor: int = 32, + extra_pad_ratio: float = 0.5): + self.batch_size = batch_size + self.img_size = img_size + self.size_divisor = size_divisor + self.extra_pad_ratio = extra_pad_ratio + + def __call__(self, data_list: List[dict]) -> List[dict]: + image_shapes = [] + for data_info in data_list: + image_shapes.append((data_info['width'], data_info['height'])) + + image_shapes = np.array(image_shapes, dtype=np.float64) + + n = len(image_shapes) # number of images + batch_index = np.floor(np.arange(n) / self.batch_size).astype( + np.int64) # batch index + number_of_batches = batch_index[-1] + 1 # number of batches + + aspect_ratio = image_shapes[:, 1] / image_shapes[:, 0] # aspect ratio + irect = aspect_ratio.argsort() + + data_list = [data_list[i] for i in irect] + + aspect_ratio = aspect_ratio[irect] + # Set training image shapes + shapes = [[1, 1]] * number_of_batches + for i in range(number_of_batches): + aspect_ratio_index = aspect_ratio[batch_index == i] + min_index, max_index = aspect_ratio_index.min( + ), aspect_ratio_index.max() + if max_index < 1: + shapes[i] = [max_index, 1] + elif min_index > 1: + shapes[i] = [1, 1 / min_index] + + batch_shapes = np.ceil( + np.array(shapes) * self.img_size / self.size_divisor + + self.extra_pad_ratio).astype(np.int64) * self.size_divisor + + for i, data_info in enumerate(data_list): + data_info['batch_shape'] = batch_shapes[batch_index[i]] + + return data_list diff --git a/mmyolo/datasets/yolov5_coco.py b/mmyolo/datasets/yolov5_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..55bc899abfcceebfdadf7549e56336725d891dcb --- /dev/null +++ b/mmyolo/datasets/yolov5_coco.py @@ -0,0 +1,65 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
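A minimal numeric sketch of the batch-shape rule implemented by ``BatchShapePolicy`` above; the image sizes and the 640 / 32 / 0.5 settings are illustrative assumptions, not values taken from any config:

    import numpy as np

    # One toy batch of three landscape images, given as (w, h).
    wh = np.array([(1280, 720), (1000, 600), (640, 360)], dtype=np.float64)
    aspect_ratio = wh[:, 1] / wh[:, 0]    # h / w, all < 1 here
    shape = [aspect_ratio.max(), 1.0]     # long (width) side maps to ~img_size
    batch_shape = np.ceil(
        np.array(shape) * 640 / 32 + 0.5).astype(np.int64) * 32
    print(batch_shape)  # [416 672] -> roughly a (h, w) padding target of 416 x 672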
+from typing import Any, Optional + +from mmdet.datasets import BaseDetDataset, CocoDataset + +from ..registry import DATASETS, TASK_UTILS + + +class BatchShapePolicyDataset(BaseDetDataset): + """Dataset with the batch shape policy that makes paddings with least + pixels during batch inference process, which does not require the image + scales of all batches to be the same throughout validation.""" + + def __init__(self, + *args, + batch_shapes_cfg: Optional[dict] = None, + **kwargs): + self.batch_shapes_cfg = batch_shapes_cfg + super().__init__(*args, **kwargs) + + def full_init(self): + """rewrite full_init() to be compatible with serialize_data in + BatchShapePolicy.""" + if self._fully_initialized: + return + # load data information + self.data_list = self.load_data_list() + + # batch_shapes_cfg + if self.batch_shapes_cfg: + batch_shapes_policy = TASK_UTILS.build(self.batch_shapes_cfg) + self.data_list = batch_shapes_policy(self.data_list) + del batch_shapes_policy + + # filter illegal data, such as data that has no annotations. + self.data_list = self.filter_data() + # Get subset data according to indices. + if self._indices is not None: + self.data_list = self._get_unserialized_subset(self._indices) + + # serialize data_list + if self.serialize_data: + self.data_bytes, self.data_address = self._serialize_data() + + self._fully_initialized = True + + def prepare_data(self, idx: int) -> Any: + """Pass the dataset to the pipeline during training to support mixed + data augmentation, such as Mosaic and MixUp.""" + if self.test_mode is False: + data_info = self.get_data_info(idx) + data_info['dataset'] = self + return self.pipeline(data_info) + else: + return super().prepare_data(idx) + + +@DATASETS.register_module() +class YOLOv5CocoDataset(BatchShapePolicyDataset, CocoDataset): + """Dataset for YOLOv5 COCO Dataset. + + We only add `BatchShapePolicy` function compared with CocoDataset. See + `mmyolo/datasets/utils.py#BatchShapePolicy` for details + """ + pass diff --git a/mmyolo/datasets/yolov5_crowdhuman.py b/mmyolo/datasets/yolov5_crowdhuman.py new file mode 100644 index 0000000000000000000000000000000000000000..486a8324fb4c7d8a34bf885f1818d2e6f974f6e7 --- /dev/null +++ b/mmyolo/datasets/yolov5_crowdhuman.py @@ -0,0 +1,15 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.datasets import CrowdHumanDataset + +from ..registry import DATASETS +from .yolov5_coco import BatchShapePolicyDataset + + +@DATASETS.register_module() +class YOLOv5CrowdHumanDataset(BatchShapePolicyDataset, CrowdHumanDataset): + """Dataset for YOLOv5 CrowdHuman Dataset. + + We only add `BatchShapePolicy` function compared with CrowdHumanDataset. + See `mmyolo/datasets/utils.py#BatchShapePolicy` for details + """ + pass diff --git a/mmyolo/datasets/yolov5_dota.py b/mmyolo/datasets/yolov5_dota.py new file mode 100644 index 0000000000000000000000000000000000000000..a9647981333ed725a568a293279873ab9e20db47 --- /dev/null +++ b/mmyolo/datasets/yolov5_dota.py @@ -0,0 +1,29 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +from mmyolo.datasets.yolov5_coco import BatchShapePolicyDataset +from ..registry import DATASETS + +try: + from mmrotate.datasets import DOTADataset + MMROTATE_AVAILABLE = True +except ImportError: + from mmengine.dataset import BaseDataset + DOTADataset = BaseDataset + MMROTATE_AVAILABLE = False + + +@DATASETS.register_module() +class YOLOv5DOTADataset(BatchShapePolicyDataset, DOTADataset): + """Dataset for YOLOv5 DOTA Dataset. 
+ + We only add `BatchShapePolicy` function compared with DOTADataset. See + `mmyolo/datasets/utils.py#BatchShapePolicy` for details + """ + + def __init__(self, *args, **kwargs): + if not MMROTATE_AVAILABLE: + raise ImportError( + 'Please run "mim install -r requirements/mmrotate.txt" ' + 'to install mmrotate first for rotated detection.') + + super().__init__(*args, **kwargs) diff --git a/mmyolo/datasets/yolov5_voc.py b/mmyolo/datasets/yolov5_voc.py new file mode 100644 index 0000000000000000000000000000000000000000..5be764f1db3097645ae1be387e45cafb1b460731 --- /dev/null +++ b/mmyolo/datasets/yolov5_voc.py @@ -0,0 +1,15 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.datasets import VOCDataset + +from mmyolo.datasets.yolov5_coco import BatchShapePolicyDataset +from ..registry import DATASETS + + +@DATASETS.register_module() +class YOLOv5VOCDataset(BatchShapePolicyDataset, VOCDataset): + """Dataset for YOLOv5 VOC Dataset. + + We only add `BatchShapePolicy` function compared with VOCDataset. See + `mmyolo/datasets/utils.py#BatchShapePolicy` for details + """ + pass diff --git a/mmyolo/deploy/__init__.py b/mmyolo/deploy/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4904a9058b41526d9719994ed718ae58336d290e --- /dev/null +++ b/mmyolo/deploy/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdeploy.codebase.base import MMCodebase + +from .models import * # noqa: F401,F403 +from .object_detection import MMYOLO, YOLOObjectDetection + +__all__ = ['MMCodebase', 'MMYOLO', 'YOLOObjectDetection'] diff --git a/mmyolo/deploy/models/__init__.py b/mmyolo/deploy/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4b999a0161543d6a9d2ab56d797af740dc7261e4 --- /dev/null +++ b/mmyolo/deploy/models/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from . import dense_heads # noqa: F401,F403 diff --git a/mmyolo/deploy/models/dense_heads/__init__.py b/mmyolo/deploy/models/dense_heads/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..cc423af3ec374cabe2b9f46d2fe4f4dc9755b8e3 --- /dev/null +++ b/mmyolo/deploy/models/dense_heads/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from . import yolov5_head # noqa: F401,F403 + +__all__ = ['yolov5_head'] diff --git a/mmyolo/deploy/models/dense_heads/yolov5_head.py b/mmyolo/deploy/models/dense_heads/yolov5_head.py new file mode 100644 index 0000000000000000000000000000000000000000..ac996ba41336243ef091e3e952430382be9ff978 --- /dev/null +++ b/mmyolo/deploy/models/dense_heads/yolov5_head.py @@ -0,0 +1,189 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from functools import partial +from typing import List, Optional, Tuple + +import torch +from mmdeploy.codebase.mmdet import get_post_processing_params +from mmdeploy.codebase.mmdet.models.layers import multiclass_nms +from mmdeploy.core import FUNCTION_REWRITER +from mmengine.config import ConfigDict +from mmengine.structures import InstanceData +from torch import Tensor + +from mmyolo.deploy.models.layers import efficient_nms +from mmyolo.models.dense_heads import YOLOv5Head + + +def yolov5_bbox_decoder(priors: Tensor, bbox_preds: Tensor, + stride: int) -> Tensor: + """Decode YOLOv5 bounding boxes. + + Args: + priors (Tensor): Prior boxes in center-offset form. + bbox_preds (Tensor): Predicted bounding boxes. + stride (int): Stride of the feature map. + + Returns: + Tensor: Decoded bounding boxes. 
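+
+    Note:
+        A sketch of the decode implemented below: with
+        ``p = bbox_preds.sigmoid()``, the center is
+        ``cx = (p_x - 0.5) * 2 * stride + prior_cx`` and the size is
+        ``w = (p_w * 2) ** 2 * prior_w`` (likewise for ``cy`` / ``h``),
+        where the prior center and size come from the xyxy ``priors`` and the
+        decoded boxes are returned in (cx, cy, w, h) form.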
+ """ + bbox_preds = bbox_preds.sigmoid() + + x_center = (priors[..., 0] + priors[..., 2]) * 0.5 + y_center = (priors[..., 1] + priors[..., 3]) * 0.5 + w = priors[..., 2] - priors[..., 0] + h = priors[..., 3] - priors[..., 1] + + x_center_pred = (bbox_preds[..., 0] - 0.5) * 2 * stride + x_center + y_center_pred = (bbox_preds[..., 1] - 0.5) * 2 * stride + y_center + w_pred = (bbox_preds[..., 2] * 2)**2 * w + h_pred = (bbox_preds[..., 3] * 2)**2 * h + + decoded_bboxes = torch.stack( + [x_center_pred, y_center_pred, w_pred, h_pred], dim=-1) + + return decoded_bboxes + + +@FUNCTION_REWRITER.register_rewriter( + func_name='mmyolo.models.dense_heads.yolov5_head.' + 'YOLOv5Head.predict_by_feat') +def yolov5_head__predict_by_feat(self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + objectnesses: Optional[List[Tensor]] = None, + batch_img_metas: Optional[List[dict]] = None, + cfg: Optional[ConfigDict] = None, + rescale: bool = False, + with_nms: bool = True) -> Tuple[InstanceData]: + """Transform a batch of output features extracted by the head into + bbox results. + Args: + cls_scores (list[Tensor]): Classification scores for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * 4, H, W). + objectnesses (list[Tensor], Optional): Score factor for + all scale level, each is a 4D-tensor, has shape + (batch_size, 1, H, W). + batch_img_metas (list[dict], Optional): Batch image meta info. + Defaults to None. + cfg (ConfigDict, optional): Test / postprocessing + configuration, if None, test_cfg would be used. + Defaults to None. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + with_nms (bool): If True, do nms before return boxes. + Defaults to True. + Returns: + tuple[Tensor, Tensor]: The first item is an (N, num_box, 5) tensor, + where 5 represent (tl_x, tl_y, br_x, br_y, score), N is batch + size and the score between 0 and 1. The shape of the second + tensor in the tuple is (N, num_box), and each element + represents the class label of the corresponding box. 
+ """ + ctx = FUNCTION_REWRITER.get_context() + detector_type = type(self) + deploy_cfg = ctx.cfg + use_efficientnms = deploy_cfg.get('use_efficientnms', False) + dtype = cls_scores[0].dtype + device = cls_scores[0].device + bbox_decoder = self.bbox_coder.decode + nms_func = multiclass_nms + if use_efficientnms: + if detector_type is YOLOv5Head: + nms_func = partial(efficient_nms, box_coding=0) + bbox_decoder = yolov5_bbox_decoder + else: + nms_func = efficient_nms + + assert len(cls_scores) == len(bbox_preds) + cfg = self.test_cfg if cfg is None else cfg + cfg = copy.deepcopy(cfg) + + num_imgs = cls_scores[0].shape[0] + featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores] + + mlvl_priors = self.prior_generator.grid_priors( + featmap_sizes, dtype=dtype, device=device) + + flatten_priors = torch.cat(mlvl_priors) + + mlvl_strides = [ + flatten_priors.new_full( + (featmap_size[0] * featmap_size[1] * self.num_base_priors, ), + stride) + for featmap_size, stride in zip(featmap_sizes, self.featmap_strides) + ] + flatten_stride = torch.cat(mlvl_strides) + + # flatten cls_scores, bbox_preds and objectness + flatten_cls_scores = [ + cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1, self.num_classes) + for cls_score in cls_scores + ] + cls_scores = torch.cat(flatten_cls_scores, dim=1).sigmoid() + + flatten_bbox_preds = [ + bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) + for bbox_pred in bbox_preds + ] + flatten_bbox_preds = torch.cat(flatten_bbox_preds, dim=1) + + if objectnesses is not None: + flatten_objectness = [ + objectness.permute(0, 2, 3, 1).reshape(num_imgs, -1) + for objectness in objectnesses + ] + flatten_objectness = torch.cat(flatten_objectness, dim=1).sigmoid() + cls_scores = cls_scores * (flatten_objectness.unsqueeze(-1)) + + scores = cls_scores + + bboxes = bbox_decoder(flatten_priors[None], flatten_bbox_preds, + flatten_stride) + + if not with_nms: + return bboxes, scores + + post_params = get_post_processing_params(deploy_cfg) + max_output_boxes_per_class = post_params.max_output_boxes_per_class + iou_threshold = cfg.nms.get('iou_threshold', post_params.iou_threshold) + score_threshold = cfg.get('score_thr', post_params.score_threshold) + pre_top_k = post_params.pre_top_k + keep_top_k = cfg.get('max_per_img', post_params.keep_top_k) + + return nms_func(bboxes, scores, max_output_boxes_per_class, iou_threshold, + score_threshold, pre_top_k, keep_top_k) + + +@FUNCTION_REWRITER.register_rewriter( + func_name='mmyolo.models.dense_heads.yolov5_head.' + 'YOLOv5Head.predict', + backend='rknn') +def yolov5_head__predict__rknn(self, x: Tuple[Tensor], *args, + **kwargs) -> Tuple[Tensor, Tensor, Tensor]: + """Perform forward propagation of the detection head and predict detection + results on the features of the upstream network. + + Args: + x (tuple[Tensor]): Multi-level features from the + upstream network, each is a 4D-tensor. + """ + outs = self(x) + return outs + + +@FUNCTION_REWRITER.register_rewriter( + func_name='mmyolo.models.dense_heads.yolov5_head.' 
+ 'YOLOv5HeadModule.forward', + backend='rknn') +def yolov5_head_module__forward__rknn( + self, x: Tensor, *args, **kwargs) -> Tuple[Tensor, Tensor, Tensor]: + """Forward feature of a single scale level.""" + out = [] + for i, feat in enumerate(x): + out.append(self.convs_pred[i](feat)) + return out diff --git a/mmyolo/deploy/models/layers/__init__.py b/mmyolo/deploy/models/layers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6017cf83425b640eb788a8abf6b253f29d759afb --- /dev/null +++ b/mmyolo/deploy/models/layers/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .bbox_nms import efficient_nms + +__all__ = ['efficient_nms'] diff --git a/mmyolo/deploy/models/layers/bbox_nms.py b/mmyolo/deploy/models/layers/bbox_nms.py new file mode 100644 index 0000000000000000000000000000000000000000..4db81c0227a36e0315855082dcd8125e1f9be70a --- /dev/null +++ b/mmyolo/deploy/models/layers/bbox_nms.py @@ -0,0 +1,113 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from mmdeploy.core import mark +from torch import Tensor + + +def _efficient_nms( + boxes: Tensor, + scores: Tensor, + max_output_boxes_per_class: int = 1000, + iou_threshold: float = 0.5, + score_threshold: float = 0.05, + pre_top_k: int = -1, + keep_top_k: int = 100, + box_coding: int = 0, +): + """Wrapper for `efficient_nms` with TensorRT. + + Args: + boxes (Tensor): The bounding boxes of shape [N, num_boxes, 4]. + scores (Tensor): The detection scores of shape + [N, num_boxes, num_classes]. + max_output_boxes_per_class (int): Maximum number of output + boxes per class of nms. Defaults to 1000. + iou_threshold (float): IOU threshold of nms. Defaults to 0.5. + score_threshold (float): score threshold of nms. + Defaults to 0.05. + pre_top_k (int): Number of top K boxes to keep before nms. + Defaults to -1. + keep_top_k (int): Number of top K boxes to keep after nms. + Defaults to -1. + box_coding (int): Bounding boxes format for nms. + Defaults to 0 means [x, y, w, h]. + Set to 1 means [x1, y1 ,x2, y2]. + + Returns: + tuple[Tensor, Tensor]: (dets, labels), `dets` of shape [N, num_det, 5] + and `labels` of shape [N, num_det]. 
+ """ + boxes = boxes if boxes.dim() == 4 else boxes.unsqueeze(2) + _, det_boxes, det_scores, labels = TRTEfficientNMSop.apply( + boxes, scores, -1, box_coding, iou_threshold, keep_top_k, '1', 0, + score_threshold) + dets = torch.cat([det_boxes, det_scores.unsqueeze(2)], -1) + + # retain shape info + batch_size = boxes.size(0) + + dets_shape = dets.shape + label_shape = labels.shape + dets = dets.reshape([batch_size, *dets_shape[1:]]) + labels = labels.reshape([batch_size, *label_shape[1:]]) + return dets, labels + + +@mark('efficient_nms', inputs=['boxes', 'scores'], outputs=['dets', 'labels']) +def efficient_nms(*args, **kwargs): + """Wrapper function for `_efficient_nms`.""" + return _efficient_nms(*args, **kwargs) + + +class TRTEfficientNMSop(torch.autograd.Function): + """Efficient NMS op for TensorRT.""" + + @staticmethod + def forward( + ctx, + boxes, + scores, + background_class=-1, + box_coding=0, + iou_threshold=0.45, + max_output_boxes=100, + plugin_version='1', + score_activation=0, + score_threshold=0.25, + ): + """Forward function of TRTEfficientNMSop.""" + batch_size, num_boxes, num_classes = scores.shape + num_det = torch.randint( + 0, max_output_boxes, (batch_size, 1), dtype=torch.int32) + det_boxes = torch.randn(batch_size, max_output_boxes, 4) + det_scores = torch.randn(batch_size, max_output_boxes) + det_classes = torch.randint( + 0, num_classes, (batch_size, max_output_boxes), dtype=torch.int32) + return num_det, det_boxes, det_scores, det_classes + + @staticmethod + def symbolic(g, + boxes, + scores, + background_class=-1, + box_coding=0, + iou_threshold=0.45, + max_output_boxes=100, + plugin_version='1', + score_activation=0, + score_threshold=0.25): + """Symbolic function of TRTEfficientNMSop.""" + out = g.op( + 'TRT::EfficientNMS_TRT', + boxes, + scores, + background_class_i=background_class, + box_coding_i=box_coding, + iou_threshold_f=iou_threshold, + max_output_boxes_i=max_output_boxes, + plugin_version_s=plugin_version, + score_activation_i=score_activation, + score_threshold_f=score_threshold, + outputs=4) + nums, boxes, scores, classes = out + return nums, boxes, scores, classes diff --git a/mmyolo/deploy/object_detection.py b/mmyolo/deploy/object_detection.py new file mode 100644 index 0000000000000000000000000000000000000000..7efdfcfb7a46c8bc6b90e76bd06d9065410e55f0 --- /dev/null +++ b/mmyolo/deploy/object_detection.py @@ -0,0 +1,132 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
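A rough shape sketch for the ``efficient_nms`` wrapper above. Outside a real TensorRT export the op's PyTorch ``forward`` only emits placeholder tensors, so this only illustrates the expected shapes, and it assumes ``mark`` acts as a pass-through when no export session is active:

    import torch

    from mmyolo.deploy.models.layers import efficient_nms

    boxes = torch.rand(1, 1000, 4)    # [batch, num_boxes, 4]
    scores = torch.rand(1, 1000, 80)  # [batch, num_boxes, num_classes]
    dets, labels = efficient_nms(boxes, scores, keep_top_k=100)
    # dets: [1, 100, 5] as (x1, y1, x2, y2, score); labels: [1, 100]
    print(dets.shape, labels.shape)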
+from typing import Callable, Dict, Optional + +import torch +from mmdeploy.codebase.base import CODEBASE, MMCodebase +from mmdeploy.codebase.mmdet.deploy import ObjectDetection +from mmdeploy.utils import Codebase, Task +from mmengine import Config +from mmengine.registry import Registry + +MMYOLO_TASK = Registry('mmyolo_tasks') + + +@CODEBASE.register_module(Codebase.MMYOLO.value) +class MMYOLO(MMCodebase): + """MMYOLO codebase class.""" + + task_registry = MMYOLO_TASK + + @classmethod + def register_deploy_modules(cls): + """register all rewriters for mmdet.""" + import mmdeploy.codebase.mmdet.models # noqa: F401 + import mmdeploy.codebase.mmdet.ops # noqa: F401 + import mmdeploy.codebase.mmdet.structures # noqa: F401 + + @classmethod + def register_all_modules(cls): + """register all modules.""" + from mmdet.utils.setup_env import \ + register_all_modules as register_all_modules_mmdet + + from mmyolo.utils.setup_env import \ + register_all_modules as register_all_modules_mmyolo + + cls.register_deploy_modules() + register_all_modules_mmyolo(True) + register_all_modules_mmdet(False) + + +def _get_dataset_metainfo(model_cfg: Config): + """Get metainfo of dataset. + + Args: + model_cfg Config: Input model Config object. + + Returns: + list[str]: A list of string specifying names of different class. + """ + from mmyolo import datasets # noqa + from mmyolo.registry import DATASETS + + module_dict = DATASETS.module_dict + for dataloader_name in [ + 'test_dataloader', 'val_dataloader', 'train_dataloader' + ]: + if dataloader_name not in model_cfg: + continue + dataloader_cfg = model_cfg[dataloader_name] + dataset_cfg = dataloader_cfg.dataset + dataset_cls = module_dict.get(dataset_cfg.type, None) + if dataset_cls is None: + continue + if hasattr(dataset_cls, '_load_metainfo') and isinstance( + dataset_cls._load_metainfo, Callable): + meta = dataset_cls._load_metainfo( + dataset_cfg.get('metainfo', None)) + if meta is not None: + return meta + if hasattr(dataset_cls, 'METAINFO'): + return dataset_cls.METAINFO + + return None + + +@MMYOLO_TASK.register_module(Task.OBJECT_DETECTION.value) +class YOLOObjectDetection(ObjectDetection): + """YOLO Object Detection task.""" + + def get_visualizer(self, name: str, save_dir: str): + """Get visualizer. + + Args: + name (str): Name of visualizer. + save_dir (str): Directory to save visualization results. + + Returns: + Visualizer: A visualizer instance. + """ + from mmdet.visualization import DetLocalVisualizer # noqa: F401,F403 + metainfo = _get_dataset_metainfo(self.model_cfg) + visualizer = super().get_visualizer(name, save_dir) + if metainfo is not None: + visualizer.dataset_meta = metainfo + return visualizer + + def build_pytorch_model(self, + model_checkpoint: Optional[str] = None, + cfg_options: Optional[Dict] = None, + **kwargs) -> torch.nn.Module: + """Initialize torch model. + + Args: + model_checkpoint (str): The checkpoint file of torch model, + defaults to `None`. + cfg_options (dict): Optional config key-pair parameters. + Returns: + nn.Module: An initialized torch model generated by other OpenMMLab + codebases. 
+ """ + from copy import deepcopy + + from mmengine.model import revert_sync_batchnorm + from mmengine.registry import MODELS + + from mmyolo.utils import switch_to_deploy + + model = deepcopy(self.model_cfg.model) + preprocess_cfg = deepcopy(self.model_cfg.get('preprocess_cfg', {})) + preprocess_cfg.update( + deepcopy(self.model_cfg.get('data_preprocessor', {}))) + model.setdefault('data_preprocessor', preprocess_cfg) + model = MODELS.build(model) + if model_checkpoint is not None: + from mmengine.runner.checkpoint import load_checkpoint + load_checkpoint(model, model_checkpoint, map_location=self.device) + + model = revert_sync_batchnorm(model) + switch_to_deploy(model) + model = model.to(self.device) + model.eval() + return model diff --git a/mmyolo/engine/__init__.py b/mmyolo/engine/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b2e0a126c09797b327f7309d6e980245b7e44773 --- /dev/null +++ b/mmyolo/engine/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .hooks import * # noqa: F401,F403 +from .optimizers import * # noqa: F401,F403 diff --git a/mmyolo/engine/hooks/__init__.py b/mmyolo/engine/hooks/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0b8deebc8827da5b9a3f8c92a2fffe70e42d0bfa --- /dev/null +++ b/mmyolo/engine/hooks/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .ppyoloe_param_scheduler_hook import PPYOLOEParamSchedulerHook +from .switch_to_deploy_hook import SwitchToDeployHook +from .yolov5_param_scheduler_hook import YOLOv5ParamSchedulerHook +from .yolox_mode_switch_hook import YOLOXModeSwitchHook + +__all__ = [ + 'YOLOv5ParamSchedulerHook', 'YOLOXModeSwitchHook', 'SwitchToDeployHook', + 'PPYOLOEParamSchedulerHook' +] diff --git a/mmyolo/engine/hooks/ppyoloe_param_scheduler_hook.py b/mmyolo/engine/hooks/ppyoloe_param_scheduler_hook.py new file mode 100644 index 0000000000000000000000000000000000000000..26dfe6ef2d5cf590ea381efb3e42cdc1c5492361 --- /dev/null +++ b/mmyolo/engine/hooks/ppyoloe_param_scheduler_hook.py @@ -0,0 +1,96 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from typing import Optional + +from mmengine.hooks import ParamSchedulerHook +from mmengine.runner import Runner + +from mmyolo.registry import HOOKS + + +@HOOKS.register_module() +class PPYOLOEParamSchedulerHook(ParamSchedulerHook): + """A hook to update learning rate and momentum in optimizer of PPYOLOE. We + use this hook to implement adaptive computation for `warmup_total_iters`, + which is not possible with the built-in ParamScheduler in mmyolo. + + Args: + warmup_min_iter (int): Minimum warmup iters. Defaults to 1000. + start_factor (float): The number we multiply learning rate in the + first epoch. The multiplication factor changes towards end_factor + in the following epochs. Defaults to 0. + warmup_epochs (int): Epochs for warmup. Defaults to 5. + min_lr_ratio (float): Minimum learning rate ratio. + total_epochs (int): In PPYOLOE, `total_epochs` is set to + training_epochs x 1.2. Defaults to 360. 
+ """ + priority = 9 + + def __init__(self, + warmup_min_iter: int = 1000, + start_factor: float = 0., + warmup_epochs: int = 5, + min_lr_ratio: float = 0.0, + total_epochs: int = 360): + + self.warmup_min_iter = warmup_min_iter + self.start_factor = start_factor + self.warmup_epochs = warmup_epochs + self.min_lr_ratio = min_lr_ratio + self.total_epochs = total_epochs + + self._warmup_end = False + self._base_lr = None + + def before_train(self, runner: Runner): + """Operations before train. + + Args: + runner (Runner): The runner of the training process. + """ + optimizer = runner.optim_wrapper.optimizer + for group in optimizer.param_groups: + # If the param is never be scheduled, record the current value + # as the initial value. + group.setdefault('initial_lr', group['lr']) + + self._base_lr = [ + group['initial_lr'] for group in optimizer.param_groups + ] + self._min_lr = [i * self.min_lr_ratio for i in self._base_lr] + + def before_train_iter(self, + runner: Runner, + batch_idx: int, + data_batch: Optional[dict] = None): + """Operations before each training iteration. + + Args: + runner (Runner): The runner of the training process. + batch_idx (int): The index of the current batch in the train loop. + data_batch (dict or tuple or list, optional): Data from dataloader. + """ + cur_iters = runner.iter + optimizer = runner.optim_wrapper.optimizer + dataloader_len = len(runner.train_dataloader) + + # The minimum warmup is self.warmup_min_iter + warmup_total_iters = max( + round(self.warmup_epochs * dataloader_len), self.warmup_min_iter) + + if cur_iters <= warmup_total_iters: + # warm up + alpha = cur_iters / warmup_total_iters + factor = self.start_factor * (1 - alpha) + alpha + + for group_idx, param in enumerate(optimizer.param_groups): + param['lr'] = self._base_lr[group_idx] * factor + else: + for group_idx, param in enumerate(optimizer.param_groups): + total_iters = self.total_epochs * dataloader_len + lr = self._min_lr[group_idx] + ( + self._base_lr[group_idx] - + self._min_lr[group_idx]) * 0.5 * ( + math.cos((cur_iters - warmup_total_iters) * math.pi / + (total_iters - warmup_total_iters)) + 1.0) + param['lr'] = lr diff --git a/mmyolo/engine/hooks/switch_to_deploy_hook.py b/mmyolo/engine/hooks/switch_to_deploy_hook.py new file mode 100644 index 0000000000000000000000000000000000000000..28ac345f40c44c974fb33b7bf9756a61fcabf820 --- /dev/null +++ b/mmyolo/engine/hooks/switch_to_deploy_hook.py @@ -0,0 +1,21 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +from mmengine.hooks import Hook +from mmengine.runner import Runner + +from mmyolo.registry import HOOKS +from mmyolo.utils import switch_to_deploy + + +@HOOKS.register_module() +class SwitchToDeployHook(Hook): + """Switch to deploy mode before testing. + + This hook converts the multi-channel structure of the training network + (high performance) to the one-way structure of the testing network (fast + speed and memory saving). + """ + + def before_test_epoch(self, runner: Runner): + """Switch to deploy mode before testing.""" + switch_to_deploy(runner.model) diff --git a/mmyolo/engine/hooks/yolov5_param_scheduler_hook.py b/mmyolo/engine/hooks/yolov5_param_scheduler_hook.py new file mode 100644 index 0000000000000000000000000000000000000000..777bb49d7abd7fc37385370546d05e70c274b3b7 --- /dev/null +++ b/mmyolo/engine/hooks/yolov5_param_scheduler_hook.py @@ -0,0 +1,130 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import math +from typing import Optional + +import numpy as np +from mmengine.hooks import ParamSchedulerHook +from mmengine.runner import Runner + +from mmyolo.registry import HOOKS + + +def linear_fn(lr_factor: float, max_epochs: int): + """Generate linear function.""" + return lambda x: (1 - x / max_epochs) * (1.0 - lr_factor) + lr_factor + + +def cosine_fn(lr_factor: float, max_epochs: int): + """Generate cosine function.""" + return lambda x: ( + (1 - math.cos(x * math.pi / max_epochs)) / 2) * (lr_factor - 1) + 1 + + +@HOOKS.register_module() +class YOLOv5ParamSchedulerHook(ParamSchedulerHook): + """A hook to update learning rate and momentum in optimizer of YOLOv5.""" + priority = 9 + + scheduler_maps = {'linear': linear_fn, 'cosine': cosine_fn} + + def __init__(self, + scheduler_type: str = 'linear', + lr_factor: float = 0.01, + max_epochs: int = 300, + warmup_epochs: int = 3, + warmup_bias_lr: float = 0.1, + warmup_momentum: float = 0.8, + warmup_mim_iter: int = 1000, + **kwargs): + + assert scheduler_type in self.scheduler_maps + + self.warmup_epochs = warmup_epochs + self.warmup_bias_lr = warmup_bias_lr + self.warmup_momentum = warmup_momentum + self.warmup_mim_iter = warmup_mim_iter + + kwargs.update({'lr_factor': lr_factor, 'max_epochs': max_epochs}) + self.scheduler_fn = self.scheduler_maps[scheduler_type](**kwargs) + + self._warmup_end = False + self._base_lr = None + self._base_momentum = None + + def before_train(self, runner: Runner): + """Operations before train. + + Args: + runner (Runner): The runner of the training process. + """ + optimizer = runner.optim_wrapper.optimizer + for group in optimizer.param_groups: + # If the param is never be scheduled, record the current value + # as the initial value. + group.setdefault('initial_lr', group['lr']) + group.setdefault('initial_momentum', group.get('momentum', -1)) + + self._base_lr = [ + group['initial_lr'] for group in optimizer.param_groups + ] + self._base_momentum = [ + group['initial_momentum'] for group in optimizer.param_groups + ] + + def before_train_iter(self, + runner: Runner, + batch_idx: int, + data_batch: Optional[dict] = None): + """Operations before each training iteration. + + Args: + runner (Runner): The runner of the training process. + batch_idx (int): The index of the current batch in the train loop. + data_batch (dict or tuple or list, optional): Data from dataloader. + """ + cur_iters = runner.iter + cur_epoch = runner.epoch + optimizer = runner.optim_wrapper.optimizer + + # The minimum warmup is self.warmup_mim_iter + warmup_total_iters = max( + round(self.warmup_epochs * len(runner.train_dataloader)), + self.warmup_mim_iter) + + if cur_iters <= warmup_total_iters: + xp = [0, warmup_total_iters] + for group_idx, param in enumerate(optimizer.param_groups): + if group_idx == 2: + # bias learning rate will be handled specially + yp = [ + self.warmup_bias_lr, + self._base_lr[group_idx] * self.scheduler_fn(cur_epoch) + ] + else: + yp = [ + 0.0, + self._base_lr[group_idx] * self.scheduler_fn(cur_epoch) + ] + param['lr'] = np.interp(cur_iters, xp, yp) + + if 'momentum' in param: + param['momentum'] = np.interp( + cur_iters, xp, + [self.warmup_momentum, self._base_momentum[group_idx]]) + else: + self._warmup_end = True + + def after_train_epoch(self, runner: Runner): + """Operations after each training epoch. + + Args: + runner (Runner): The runner of the training process. 
+ """ + if not self._warmup_end: + return + + cur_epoch = runner.epoch + optimizer = runner.optim_wrapper.optimizer + for group_idx, param in enumerate(optimizer.param_groups): + param['lr'] = self._base_lr[group_idx] * self.scheduler_fn( + cur_epoch) diff --git a/mmyolo/engine/hooks/yolox_mode_switch_hook.py b/mmyolo/engine/hooks/yolox_mode_switch_hook.py new file mode 100644 index 0000000000000000000000000000000000000000..27711768c3f89b26410ae1373bc920d0bfded603 --- /dev/null +++ b/mmyolo/engine/hooks/yolox_mode_switch_hook.py @@ -0,0 +1,54 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from typing import Sequence + +from mmengine.hooks import Hook +from mmengine.model import is_model_wrapper +from mmengine.runner import Runner + +from mmyolo.registry import HOOKS + + +@HOOKS.register_module() +class YOLOXModeSwitchHook(Hook): + """Switch the mode of YOLOX during training. + + This hook turns off the mosaic and mixup data augmentation and switches + to use L1 loss in bbox_head. + + Args: + num_last_epochs (int): The number of latter epochs in the end of the + training to close the data augmentation and switch to L1 loss. + Defaults to 15. + """ + + def __init__(self, + num_last_epochs: int = 15, + new_train_pipeline: Sequence[dict] = None): + self.num_last_epochs = num_last_epochs + self.new_train_pipeline_cfg = new_train_pipeline + + def before_train_epoch(self, runner: Runner): + """Close mosaic and mixup augmentation and switches to use L1 loss.""" + epoch = runner.epoch + model = runner.model + if is_model_wrapper(model): + model = model.module + + if (epoch + 1) == runner.max_epochs - self.num_last_epochs: + runner.logger.info(f'New Pipeline: {self.new_train_pipeline_cfg}') + + train_dataloader_cfg = copy.deepcopy(runner.cfg.train_dataloader) + train_dataloader_cfg.dataset.pipeline = self.new_train_pipeline_cfg + # Note: Why rebuild the dataset? + # When build_dataloader will make a deep copy of the dataset, + # it will lead to potential risks, such as the global instance + # object FileClient data is disordered. + # This problem needs to be solved in the future. + new_train_dataloader = Runner.build_dataloader( + train_dataloader_cfg) + runner.train_loop.dataloader = new_train_dataloader + + runner.logger.info('recreate the dataloader!') + runner.logger.info('Add additional bbox reg loss now!') + model.bbox_head.use_bbox_aux = True diff --git a/mmyolo/engine/optimizers/__init__.py b/mmyolo/engine/optimizers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b598020d05db54cdc1d803d39ebd2c91026a6112 --- /dev/null +++ b/mmyolo/engine/optimizers/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .yolov5_optim_constructor import YOLOv5OptimizerConstructor +from .yolov7_optim_wrapper_constructor import YOLOv7OptimWrapperConstructor + +__all__ = ['YOLOv5OptimizerConstructor', 'YOLOv7OptimWrapperConstructor'] diff --git a/mmyolo/engine/optimizers/yolov5_optim_constructor.py b/mmyolo/engine/optimizers/yolov5_optim_constructor.py new file mode 100644 index 0000000000000000000000000000000000000000..5e5f42cb5c2c18962f989288b45011c742845c2f --- /dev/null +++ b/mmyolo/engine/optimizers/yolov5_optim_constructor.py @@ -0,0 +1,132 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Optional + +import torch.nn as nn +from mmengine.dist import get_world_size +from mmengine.logging import print_log +from mmengine.model import is_model_wrapper +from mmengine.optim import OptimWrapper + +from mmyolo.registry import (OPTIM_WRAPPER_CONSTRUCTORS, OPTIM_WRAPPERS, + OPTIMIZERS) + + +@OPTIM_WRAPPER_CONSTRUCTORS.register_module() +class YOLOv5OptimizerConstructor: + """YOLOv5 constructor for optimizers. + + It has the following functions: + + - divides the optimizer parameters into 3 groups: + Conv, Bias and BN + + - support `weight_decay` parameter adaption based on + `batch_size_per_gpu` + + Args: + optim_wrapper_cfg (dict): The config dict of the optimizer wrapper. + Positional fields are + + - ``type``: class name of the OptimizerWrapper + - ``optimizer``: The configuration of optimizer. + + Optional fields are + + - any arguments of the corresponding optimizer wrapper type, + e.g., accumulative_counts, clip_grad, etc. + + The positional fields of ``optimizer`` are + + - `type`: class name of the optimizer. + + Optional fields are + + - any arguments of the corresponding optimizer type, e.g., + lr, weight_decay, momentum, etc. + + paramwise_cfg (dict, optional): Parameter-wise options. Must include + `base_total_batch_size` if not None. If the total input batch + is smaller than `base_total_batch_size`, the `weight_decay` + parameter will be kept unchanged, otherwise linear scaling. + + Example: + >>> model = torch.nn.modules.Conv1d(1, 1, 1) + >>> optim_wrapper_cfg = dict( + >>> dict(type='OptimWrapper', optimizer=dict(type='SGD', lr=0.01, + >>> momentum=0.9, weight_decay=0.0001, batch_size_per_gpu=16)) + >>> paramwise_cfg = dict(base_total_batch_size=64) + >>> optim_wrapper_builder = YOLOv5OptimizerConstructor( + >>> optim_wrapper_cfg, paramwise_cfg) + >>> optim_wrapper = optim_wrapper_builder(model) + """ + + def __init__(self, + optim_wrapper_cfg: dict, + paramwise_cfg: Optional[dict] = None): + if paramwise_cfg is None: + paramwise_cfg = {'base_total_batch_size': 64} + assert 'base_total_batch_size' in paramwise_cfg + + if not isinstance(optim_wrapper_cfg, dict): + raise TypeError('optimizer_cfg should be a dict', + f'but got {type(optim_wrapper_cfg)}') + assert 'optimizer' in optim_wrapper_cfg, ( + '`optim_wrapper_cfg` must contain "optimizer" config') + + self.optim_wrapper_cfg = optim_wrapper_cfg + self.optimizer_cfg = self.optim_wrapper_cfg.pop('optimizer') + self.base_total_batch_size = paramwise_cfg['base_total_batch_size'] + + def __call__(self, model: nn.Module) -> OptimWrapper: + if is_model_wrapper(model): + model = model.module + optimizer_cfg = self.optimizer_cfg.copy() + weight_decay = optimizer_cfg.pop('weight_decay', 0) + + if 'batch_size_per_gpu' in optimizer_cfg: + batch_size_per_gpu = optimizer_cfg.pop('batch_size_per_gpu') + # No scaling if total_batch_size is less than + # base_total_batch_size, otherwise linear scaling. 
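+            # Worked example (illustrative numbers, assuming the default
+            # ``base_total_batch_size`` of 64): with 8 GPUs and
+            # ``batch_size_per_gpu=16``, total_batch_size = 128,
+            # accumulate = max(round(64 / 128), 1) = 1 and
+            # scale_factor = 128 * 1 / 64 = 2, so weight_decay is doubled.
+            # With 1 GPU and ``batch_size_per_gpu=16``, total_batch_size = 16,
+            # accumulate = 4 and scale_factor = 16 * 4 / 64 = 1, so
+            # weight_decay stays unchanged.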
+ total_batch_size = get_world_size() * batch_size_per_gpu + accumulate = max( + round(self.base_total_batch_size / total_batch_size), 1) + scale_factor = total_batch_size * \ + accumulate / self.base_total_batch_size + + if scale_factor != 1: + weight_decay *= scale_factor + print_log(f'Scaled weight_decay to {weight_decay}', 'current') + + params_groups = [], [], [] + + for v in model.modules(): + if hasattr(v, 'bias') and isinstance(v.bias, nn.Parameter): + params_groups[2].append(v.bias) + # Includes SyncBatchNorm + if isinstance(v, nn.modules.batchnorm._NormBase): + params_groups[1].append(v.weight) + elif hasattr(v, 'weight') and isinstance(v.weight, nn.Parameter): + params_groups[0].append(v.weight) + + # Note: Make sure bias is in the last parameter group + optimizer_cfg['params'] = [] + # conv + optimizer_cfg['params'].append({ + 'params': params_groups[0], + 'weight_decay': weight_decay + }) + # bn + optimizer_cfg['params'].append({'params': params_groups[1]}) + # bias + optimizer_cfg['params'].append({'params': params_groups[2]}) + + print_log( + 'Optimizer groups: %g .bias, %g conv.weight, %g other' % + (len(params_groups[2]), len(params_groups[0]), len( + params_groups[1])), 'current') + del params_groups + + optimizer = OPTIMIZERS.build(optimizer_cfg) + optim_wrapper = OPTIM_WRAPPERS.build( + self.optim_wrapper_cfg, default_args=dict(optimizer=optimizer)) + return optim_wrapper diff --git a/mmyolo/engine/optimizers/yolov7_optim_wrapper_constructor.py b/mmyolo/engine/optimizers/yolov7_optim_wrapper_constructor.py new file mode 100644 index 0000000000000000000000000000000000000000..79ea8b69976760c0e45e35f8420d0cc69b13331a --- /dev/null +++ b/mmyolo/engine/optimizers/yolov7_optim_wrapper_constructor.py @@ -0,0 +1,139 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional + +import torch.nn as nn +from mmengine.dist import get_world_size +from mmengine.logging import print_log +from mmengine.model import is_model_wrapper +from mmengine.optim import OptimWrapper + +from mmyolo.models.dense_heads.yolov7_head import ImplicitA, ImplicitM +from mmyolo.registry import (OPTIM_WRAPPER_CONSTRUCTORS, OPTIM_WRAPPERS, + OPTIMIZERS) + + +# TODO: Consider merging into YOLOv5OptimizerConstructor +@OPTIM_WRAPPER_CONSTRUCTORS.register_module() +class YOLOv7OptimWrapperConstructor: + """YOLOv7 constructor for optimizer wrappers. + + It has the following functions: + + - divides the optimizer parameters into 3 groups: + Conv, Bias and BN/ImplicitA/ImplicitM + + - support `weight_decay` parameter adaption based on + `batch_size_per_gpu` + + Args: + optim_wrapper_cfg (dict): The config dict of the optimizer wrapper. + Positional fields are + + - ``type``: class name of the OptimizerWrapper + - ``optimizer``: The configuration of optimizer. + + Optional fields are + + - any arguments of the corresponding optimizer wrapper type, + e.g., accumulative_counts, clip_grad, etc. + + The positional fields of ``optimizer`` are + + - `type`: class name of the optimizer. + + Optional fields are + + - any arguments of the corresponding optimizer type, e.g., + lr, weight_decay, momentum, etc. + + paramwise_cfg (dict, optional): Parameter-wise options. Must include + `base_total_batch_size` if not None. If the total input batch + is smaller than `base_total_batch_size`, the `weight_decay` + parameter will be kept unchanged, otherwise linear scaling. 
+ + Example: + >>> model = torch.nn.modules.Conv1d(1, 1, 1) + >>> optim_wrapper_cfg = dict( + >>> dict(type='OptimWrapper', optimizer=dict(type='SGD', lr=0.01, + >>> momentum=0.9, weight_decay=0.0001, batch_size_per_gpu=16)) + >>> paramwise_cfg = dict(base_total_batch_size=64) + >>> optim_wrapper_builder = YOLOv7OptimWrapperConstructor( + >>> optim_wrapper_cfg, paramwise_cfg) + >>> optim_wrapper = optim_wrapper_builder(model) + """ + + def __init__(self, + optim_wrapper_cfg: dict, + paramwise_cfg: Optional[dict] = None): + if paramwise_cfg is None: + paramwise_cfg = {'base_total_batch_size': 64} + assert 'base_total_batch_size' in paramwise_cfg + + if not isinstance(optim_wrapper_cfg, dict): + raise TypeError('optimizer_cfg should be a dict', + f'but got {type(optim_wrapper_cfg)}') + assert 'optimizer' in optim_wrapper_cfg, ( + '`optim_wrapper_cfg` must contain "optimizer" config') + + self.optim_wrapper_cfg = optim_wrapper_cfg + self.optimizer_cfg = self.optim_wrapper_cfg.pop('optimizer') + self.base_total_batch_size = paramwise_cfg['base_total_batch_size'] + + def __call__(self, model: nn.Module) -> OptimWrapper: + if is_model_wrapper(model): + model = model.module + optimizer_cfg = self.optimizer_cfg.copy() + weight_decay = optimizer_cfg.pop('weight_decay', 0) + + if 'batch_size_per_gpu' in optimizer_cfg: + batch_size_per_gpu = optimizer_cfg.pop('batch_size_per_gpu') + # No scaling if total_batch_size is less than + # base_total_batch_size, otherwise linear scaling. + total_batch_size = get_world_size() * batch_size_per_gpu + accumulate = max( + round(self.base_total_batch_size / total_batch_size), 1) + scale_factor = total_batch_size * \ + accumulate / self.base_total_batch_size + + if scale_factor != 1: + weight_decay *= scale_factor + print_log(f'Scaled weight_decay to {weight_decay}', 'current') + + params_groups = [], [], [] + for v in model.modules(): + # no decay + # Caution: Coupling with model + if isinstance(v, (ImplicitA, ImplicitM)): + params_groups[0].append(v.implicit) + elif isinstance(v, nn.modules.batchnorm._NormBase): + params_groups[0].append(v.weight) + # apply decay + elif hasattr(v, 'weight') and isinstance(v.weight, nn.Parameter): + params_groups[1].append(v.weight) # apply decay + + # biases, no decay + if hasattr(v, 'bias') and isinstance(v.bias, nn.Parameter): + params_groups[2].append(v.bias) + + # Note: Make sure bias is in the last parameter group + optimizer_cfg['params'] = [] + # conv + optimizer_cfg['params'].append({ + 'params': params_groups[1], + 'weight_decay': weight_decay + }) + # bn ... + optimizer_cfg['params'].append({'params': params_groups[0]}) + # bias + optimizer_cfg['params'].append({'params': params_groups[2]}) + + print_log( + 'Optimizer groups: %g .bias, %g conv.weight, %g other' % + (len(params_groups[2]), len(params_groups[1]), len( + params_groups[0])), 'current') + del params_groups + + optimizer = OPTIMIZERS.build(optimizer_cfg) + optim_wrapper = OPTIM_WRAPPERS.build( + self.optim_wrapper_cfg, default_args=dict(optimizer=optimizer)) + return optim_wrapper diff --git a/mmyolo/models/__init__.py b/mmyolo/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..51c37f0436f131dcd26b9a8115e58fe49d59207e --- /dev/null +++ b/mmyolo/models/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .backbones import * # noqa: F401,F403 +from .data_preprocessors import * # noqa: F401,F403 +from .dense_heads import * # noqa: F401,F403 +from .detectors import * # noqa: F401,F403 +from .layers import * # noqa: F401,F403 +from .losses import * # noqa: F401,F403 +from .necks import * # noqa: F401,F403 +from .plugins import * # noqa: F401,F403 +from .task_modules import * # noqa: F401,F403 diff --git a/mmyolo/models/backbones/__init__.py b/mmyolo/models/backbones/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..48c8e28b1e7eb97e3f7cb064c75af0dc79b4cc8d --- /dev/null +++ b/mmyolo/models/backbones/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .base_backbone import BaseBackbone +from .csp_darknet import YOLOv5CSPDarknet, YOLOv8CSPDarknet, YOLOXCSPDarknet +from .csp_resnet import PPYOLOECSPResNet +from .cspnext import CSPNeXt +from .efficient_rep import YOLOv6CSPBep, YOLOv6EfficientRep +from .yolov7_backbone import YOLOv7Backbone + +__all__ = [ + 'YOLOv5CSPDarknet', 'BaseBackbone', 'YOLOv6EfficientRep', 'YOLOv6CSPBep', + 'YOLOXCSPDarknet', 'CSPNeXt', 'YOLOv7Backbone', 'PPYOLOECSPResNet', + 'YOLOv8CSPDarknet' +] diff --git a/mmyolo/models/backbones/base_backbone.py b/mmyolo/models/backbones/base_backbone.py new file mode 100644 index 0000000000000000000000000000000000000000..730c7095eccf66b0d563fad96122454c98dff0ac --- /dev/null +++ b/mmyolo/models/backbones/base_backbone.py @@ -0,0 +1,225 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta, abstractmethod +from typing import List, Sequence, Union + +import torch +import torch.nn as nn +from mmcv.cnn import build_plugin_layer +from mmdet.utils import ConfigType, OptMultiConfig +from mmengine.model import BaseModule +from torch.nn.modules.batchnorm import _BatchNorm + +from mmyolo.registry import MODELS + + +@MODELS.register_module() +class BaseBackbone(BaseModule, metaclass=ABCMeta): + """BaseBackbone backbone used in YOLO series. + + .. code:: text + + Backbone model structure diagram + +-----------+ + | input | + +-----------+ + v + +-----------+ + | stem | + | layer | + +-----------+ + v + +-----------+ + | stage | + | layer 1 | + +-----------+ + v + +-----------+ + | stage | + | layer 2 | + +-----------+ + v + ...... + v + +-----------+ + | stage | + | layer n | + +-----------+ + In P5 model, n=4 + In P6 model, n=5 + + Args: + arch_setting (list): Architecture of BaseBackbone. + plugins (list[dict]): List of plugins for stages, each dict contains: + + - cfg (dict, required): Cfg dict to build plugin. + - stages (tuple[bool], optional): Stages to apply plugin, length + should be same as 'num_stages'. + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + input_channels: Number of input image channels. Defaults to 3. + out_indices (Sequence[int]): Output from which stages. + Defaults to (2, 3, 4). + frozen_stages (int): Stages to be frozen (stop grad and set eval + mode). -1 means not freezing any parameters. Defaults to -1. + norm_cfg (dict): Dictionary to construct and config norm layer. + Defaults to None. + act_cfg (dict): Config dict for activation layer. + Defaults to None. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Defaults to False. 
+ init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + arch_setting: list, + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + input_channels: int = 3, + out_indices: Sequence[int] = (2, 3, 4), + frozen_stages: int = -1, + plugins: Union[dict, List[dict]] = None, + norm_cfg: ConfigType = None, + act_cfg: ConfigType = None, + norm_eval: bool = False, + init_cfg: OptMultiConfig = None): + super().__init__(init_cfg) + self.num_stages = len(arch_setting) + self.arch_setting = arch_setting + + assert set(out_indices).issubset( + i for i in range(len(arch_setting) + 1)) + + if frozen_stages not in range(-1, len(arch_setting) + 1): + raise ValueError('"frozen_stages" must be in range(-1, ' + 'len(arch_setting) + 1). But received ' + f'{frozen_stages}') + + self.input_channels = input_channels + self.out_indices = out_indices + self.frozen_stages = frozen_stages + self.widen_factor = widen_factor + self.deepen_factor = deepen_factor + self.norm_eval = norm_eval + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.plugins = plugins + + self.stem = self.build_stem_layer() + self.layers = ['stem'] + + for idx, setting in enumerate(arch_setting): + stage = [] + stage += self.build_stage_layer(idx, setting) + if plugins is not None: + stage += self.make_stage_plugins(plugins, idx, setting) + self.add_module(f'stage{idx + 1}', nn.Sequential(*stage)) + self.layers.append(f'stage{idx + 1}') + + @abstractmethod + def build_stem_layer(self): + """Build a stem layer.""" + pass + + @abstractmethod + def build_stage_layer(self, stage_idx: int, setting: list): + """Build a stage layer. + + Args: + stage_idx (int): The index of a stage layer. + setting (list): The architecture setting of a stage layer. + """ + pass + + def make_stage_plugins(self, plugins, stage_idx, setting): + """Make plugins for backbone ``stage_idx`` th stage. + + Currently we support to insert ``context_block``, + ``empirical_attention_block``, ``nonlocal_block``, ``dropout_block`` + into the backbone. + + + An example of plugins format could be: + + Examples: + >>> plugins=[ + ... dict(cfg=dict(type='xxx', arg1='xxx'), + ... stages=(False, True, True, True)), + ... dict(cfg=dict(type='yyy'), + ... stages=(True, True, True, True)), + ... ] + >>> model = YOLOv5CSPDarknet() + >>> stage_plugins = model.make_stage_plugins(plugins, 0, setting) + >>> assert len(stage_plugins) == 1 + + Suppose ``stage_idx=0``, the structure of blocks in the stage would be: + + .. code-block:: none + + conv1 -> conv2 -> conv3 -> yyy + + Suppose ``stage_idx=1``, the structure of blocks in the stage would be: + + .. code-block:: none + + conv1 -> conv2 -> conv3 -> xxx -> yyy + + + Args: + plugins (list[dict]): List of plugins cfg to build. The postfix is + required if multiple same type plugins are inserted. + stage_idx (int): Index of stage to build + If stages is missing, the plugin would be applied to all + stages. + setting (list): The architecture setting of a stage layer. 
+ + Returns: + list[nn.Module]: Plugins for current stage + """ + # TODO: It is not general enough to support any channel and needs + # to be refactored + in_channels = int(setting[1] * self.widen_factor) + plugin_layers = [] + for plugin in plugins: + plugin = plugin.copy() + stages = plugin.pop('stages', None) + assert stages is None or len(stages) == self.num_stages + if stages is None or stages[stage_idx]: + name, layer = build_plugin_layer( + plugin['cfg'], in_channels=in_channels) + plugin_layers.append(layer) + return plugin_layers + + def _freeze_stages(self): + """Freeze the parameters of the specified stage so that they are no + longer updated.""" + if self.frozen_stages >= 0: + for i in range(self.frozen_stages + 1): + m = getattr(self, self.layers[i]) + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def train(self, mode: bool = True): + """Convert the model into training mode while keep normalization layer + frozen.""" + super().train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() + + def forward(self, x: torch.Tensor) -> tuple: + """Forward batch_inputs from the data_preprocessor.""" + outs = [] + for i, layer_name in enumerate(self.layers): + layer = getattr(self, layer_name) + x = layer(x) + if i in self.out_indices: + outs.append(x) + + return tuple(outs) diff --git a/mmyolo/models/backbones/csp_darknet.py b/mmyolo/models/backbones/csp_darknet.py new file mode 100644 index 0000000000000000000000000000000000000000..92bd69a5a9378a37ed8fb50c52dfba0de6879083 --- /dev/null +++ b/mmyolo/models/backbones/csp_darknet.py @@ -0,0 +1,427 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple, Union + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule +from mmdet.models.backbones.csp_darknet import CSPLayer, Focus +from mmdet.utils import ConfigType, OptMultiConfig + +from mmyolo.registry import MODELS +from ..layers import CSPLayerWithTwoConv, SPPFBottleneck +from ..utils import make_divisible, make_round +from .base_backbone import BaseBackbone + + +@MODELS.register_module() +class YOLOv5CSPDarknet(BaseBackbone): + """CSP-Darknet backbone used in YOLOv5. + Args: + arch (str): Architecture of CSP-Darknet, from {P5, P6}. + Defaults to P5. + plugins (list[dict]): List of plugins for stages, each dict contains: + - cfg (dict, required): Cfg dict to build plugin. + - stages (tuple[bool], optional): Stages to apply plugin, length + should be same as 'num_stages'. + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + input_channels (int): Number of input image channels. Defaults to: 3. + out_indices (Tuple[int]): Output from which stages. + Defaults to (2, 3, 4). + frozen_stages (int): Stages to be frozen (stop grad and set eval + mode). -1 means not freezing any parameters. Defaults to -1. + norm_cfg (dict): Dictionary to construct and config norm layer. + Defaults to dict(type='BN', requires_grad=True). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Defaults to False. 
+ init_cfg (Union[dict,list[dict]], optional): Initialization config + dict. Defaults to None. + Example: + >>> from mmyolo.models import YOLOv5CSPDarknet + >>> import torch + >>> model = YOLOv5CSPDarknet() + >>> model.eval() + >>> inputs = torch.rand(1, 3, 416, 416) + >>> level_outputs = model(inputs) + >>> for level_out in level_outputs: + ... print(tuple(level_out.shape)) + ... + (1, 256, 52, 52) + (1, 512, 26, 26) + (1, 1024, 13, 13) + """ + # From left to right: + # in_channels, out_channels, num_blocks, add_identity, use_spp + arch_settings = { + 'P5': [[64, 128, 3, True, False], [128, 256, 6, True, False], + [256, 512, 9, True, False], [512, 1024, 3, True, True]], + 'P6': [[64, 128, 3, True, False], [128, 256, 6, True, False], + [256, 512, 9, True, False], [512, 768, 3, True, False], + [768, 1024, 3, True, True]] + } + + def __init__(self, + arch: str = 'P5', + plugins: Union[dict, List[dict]] = None, + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + input_channels: int = 3, + out_indices: Tuple[int] = (2, 3, 4), + frozen_stages: int = -1, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + norm_eval: bool = False, + init_cfg: OptMultiConfig = None): + super().__init__( + self.arch_settings[arch], + deepen_factor, + widen_factor, + input_channels=input_channels, + out_indices=out_indices, + plugins=plugins, + frozen_stages=frozen_stages, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + norm_eval=norm_eval, + init_cfg=init_cfg) + + def build_stem_layer(self) -> nn.Module: + """Build a stem layer.""" + return ConvModule( + self.input_channels, + make_divisible(self.arch_setting[0][0], self.widen_factor), + kernel_size=6, + stride=2, + padding=2, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def build_stage_layer(self, stage_idx: int, setting: list) -> list: + """Build a stage layer. + + Args: + stage_idx (int): The index of a stage layer. + setting (list): The architecture setting of a stage layer. + """ + in_channels, out_channels, num_blocks, add_identity, use_spp = setting + + in_channels = make_divisible(in_channels, self.widen_factor) + out_channels = make_divisible(out_channels, self.widen_factor) + num_blocks = make_round(num_blocks, self.deepen_factor) + stage = [] + conv_layer = ConvModule( + in_channels, + out_channels, + kernel_size=3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + stage.append(conv_layer) + csp_layer = CSPLayer( + out_channels, + out_channels, + num_blocks=num_blocks, + add_identity=add_identity, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + stage.append(csp_layer) + if use_spp: + spp = SPPFBottleneck( + out_channels, + out_channels, + kernel_sizes=5, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + stage.append(spp) + return stage + + def init_weights(self): + """Initialize the parameters.""" + if self.init_cfg is None: + for m in self.modules(): + if isinstance(m, torch.nn.Conv2d): + # In order to be consistent with the source code, + # reset the Conv2d initialization parameters + m.reset_parameters() + else: + super().init_weights() + + +@MODELS.register_module() +class YOLOv8CSPDarknet(BaseBackbone): + """CSP-Darknet backbone used in YOLOv8. + + Args: + arch (str): Architecture of CSP-Darknet, from {P5}. + Defaults to P5. + last_stage_out_channels (int): Final layer output channel. + Defaults to 1024. + plugins (list[dict]): List of plugins for stages, each dict contains: + - cfg (dict, required): Cfg dict to build plugin. 
+ - stages (tuple[bool], optional): Stages to apply plugin, length + should be same as 'num_stages'. + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + input_channels (int): Number of input image channels. Defaults to: 3. + out_indices (Tuple[int]): Output from which stages. + Defaults to (2, 3, 4). + frozen_stages (int): Stages to be frozen (stop grad and set eval + mode). -1 means not freezing any parameters. Defaults to -1. + norm_cfg (dict): Dictionary to construct and config norm layer. + Defaults to dict(type='BN', requires_grad=True). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Defaults to False. + init_cfg (Union[dict,list[dict]], optional): Initialization config + dict. Defaults to None. + + Example: + >>> from mmyolo.models import YOLOv8CSPDarknet + >>> import torch + >>> model = YOLOv8CSPDarknet() + >>> model.eval() + >>> inputs = torch.rand(1, 3, 416, 416) + >>> level_outputs = model(inputs) + >>> for level_out in level_outputs: + ... print(tuple(level_out.shape)) + ... + (1, 256, 52, 52) + (1, 512, 26, 26) + (1, 1024, 13, 13) + """ + # From left to right: + # in_channels, out_channels, num_blocks, add_identity, use_spp + # the final out_channels will be set according to the param. + arch_settings = { + 'P5': [[64, 128, 3, True, False], [128, 256, 6, True, False], + [256, 512, 6, True, False], [512, None, 3, True, True]], + } + + def __init__(self, + arch: str = 'P5', + last_stage_out_channels: int = 1024, + plugins: Union[dict, List[dict]] = None, + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + input_channels: int = 3, + out_indices: Tuple[int] = (2, 3, 4), + frozen_stages: int = -1, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + norm_eval: bool = False, + init_cfg: OptMultiConfig = None): + self.arch_settings[arch][-1][1] = last_stage_out_channels + super().__init__( + self.arch_settings[arch], + deepen_factor, + widen_factor, + input_channels=input_channels, + out_indices=out_indices, + plugins=plugins, + frozen_stages=frozen_stages, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + norm_eval=norm_eval, + init_cfg=init_cfg) + + def build_stem_layer(self) -> nn.Module: + """Build a stem layer.""" + return ConvModule( + self.input_channels, + make_divisible(self.arch_setting[0][0], self.widen_factor), + kernel_size=3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def build_stage_layer(self, stage_idx: int, setting: list) -> list: + """Build a stage layer. + + Args: + stage_idx (int): The index of a stage layer. + setting (list): The architecture setting of a stage layer. 
+ """ + in_channels, out_channels, num_blocks, add_identity, use_spp = setting + + in_channels = make_divisible(in_channels, self.widen_factor) + out_channels = make_divisible(out_channels, self.widen_factor) + num_blocks = make_round(num_blocks, self.deepen_factor) + stage = [] + conv_layer = ConvModule( + in_channels, + out_channels, + kernel_size=3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + stage.append(conv_layer) + csp_layer = CSPLayerWithTwoConv( + out_channels, + out_channels, + num_blocks=num_blocks, + add_identity=add_identity, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + stage.append(csp_layer) + if use_spp: + spp = SPPFBottleneck( + out_channels, + out_channels, + kernel_sizes=5, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + stage.append(spp) + return stage + + def init_weights(self): + """Initialize the parameters.""" + if self.init_cfg is None: + for m in self.modules(): + if isinstance(m, torch.nn.Conv2d): + # In order to be consistent with the source code, + # reset the Conv2d initialization parameters + m.reset_parameters() + else: + super().init_weights() + + +@MODELS.register_module() +class YOLOXCSPDarknet(BaseBackbone): + """CSP-Darknet backbone used in YOLOX. + + Args: + arch (str): Architecture of CSP-Darknet, from {P5, P6}. + Defaults to P5. + plugins (list[dict]): List of plugins for stages, each dict contains: + + - cfg (dict, required): Cfg dict to build plugin. + - stages (tuple[bool], optional): Stages to apply plugin, length + should be same as 'num_stages'. + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + input_channels (int): Number of input image channels. Defaults to 3. + out_indices (Tuple[int]): Output from which stages. + Defaults to (2, 3, 4). + frozen_stages (int): Stages to be frozen (stop grad and set eval + mode). -1 means not freezing any parameters. Defaults to -1. + use_depthwise (bool): Whether to use depthwise separable convolution. + Defaults to False. + spp_kernal_sizes: (tuple[int]): Sequential of kernel sizes of SPP + layers. Defaults to (5, 9, 13). + norm_cfg (dict): Dictionary to construct and config norm layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. + init_cfg (Union[dict,list[dict]], optional): Initialization config + dict. Defaults to None. + Example: + >>> from mmyolo.models import YOLOXCSPDarknet + >>> import torch + >>> model = YOLOXCSPDarknet() + >>> model.eval() + >>> inputs = torch.rand(1, 3, 416, 416) + >>> level_outputs = model(inputs) + >>> for level_out in level_outputs: + ... print(tuple(level_out.shape)) + ... 
+ (1, 256, 52, 52) + (1, 512, 26, 26) + (1, 1024, 13, 13) + """ + # From left to right: + # in_channels, out_channels, num_blocks, add_identity, use_spp + arch_settings = { + 'P5': [[64, 128, 3, True, False], [128, 256, 9, True, False], + [256, 512, 9, True, False], [512, 1024, 3, False, True]], + } + + def __init__(self, + arch: str = 'P5', + plugins: Union[dict, List[dict]] = None, + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + input_channels: int = 3, + out_indices: Tuple[int] = (2, 3, 4), + frozen_stages: int = -1, + use_depthwise: bool = False, + spp_kernal_sizes: Tuple[int] = (5, 9, 13), + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + norm_eval: bool = False, + init_cfg: OptMultiConfig = None): + self.use_depthwise = use_depthwise + self.spp_kernal_sizes = spp_kernal_sizes + super().__init__(self.arch_settings[arch], deepen_factor, widen_factor, + input_channels, out_indices, frozen_stages, plugins, + norm_cfg, act_cfg, norm_eval, init_cfg) + + def build_stem_layer(self) -> nn.Module: + """Build a stem layer.""" + return Focus( + 3, + make_divisible(64, self.widen_factor), + kernel_size=3, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def build_stage_layer(self, stage_idx: int, setting: list) -> list: + """Build a stage layer. + + Args: + stage_idx (int): The index of a stage layer. + setting (list): The architecture setting of a stage layer. + """ + in_channels, out_channels, num_blocks, add_identity, use_spp = setting + + in_channels = make_divisible(in_channels, self.widen_factor) + out_channels = make_divisible(out_channels, self.widen_factor) + num_blocks = make_round(num_blocks, self.deepen_factor) + stage = [] + conv = DepthwiseSeparableConvModule \ + if self.use_depthwise else ConvModule + conv_layer = conv( + in_channels, + out_channels, + kernel_size=3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + stage.append(conv_layer) + if use_spp: + spp = SPPFBottleneck( + out_channels, + out_channels, + kernel_sizes=self.spp_kernal_sizes, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + stage.append(spp) + csp_layer = CSPLayer( + out_channels, + out_channels, + num_blocks=num_blocks, + add_identity=add_identity, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + stage.append(csp_layer) + return stage diff --git a/mmyolo/models/backbones/csp_resnet.py b/mmyolo/models/backbones/csp_resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..a42ed489d8872913f4aacce08497c8e48fdace49 --- /dev/null +++ b/mmyolo/models/backbones/csp_resnet.py @@ -0,0 +1,169 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple, Union + +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmdet.utils import ConfigType, OptMultiConfig + +from mmyolo.models.backbones import BaseBackbone +from mmyolo.models.layers.yolo_bricks import CSPResLayer +from mmyolo.registry import MODELS + + +@MODELS.register_module() +class PPYOLOECSPResNet(BaseBackbone): + """CSP-ResNet backbone used in PPYOLOE. + + Args: + arch (str): Architecture of CSPNeXt, from {P5, P6}. + Defaults to P5. + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + out_indices (Sequence[int]): Output from which stages. + Defaults to (2, 3, 4). 
+ frozen_stages (int): Stages to be frozen (stop grad and set eval + mode). -1 means not freezing any parameters. Defaults to -1. + plugins (list[dict]): List of plugins for stages, each dict contains: + - cfg (dict, required): Cfg dict to build plugin. + - stages (tuple[bool], optional): Stages to apply plugin, length + should be same as 'num_stages'. + arch_ovewrite (list): Overwrite default arch settings. + Defaults to None. + block_cfg (dict): Config dict for block. Defaults to + dict(type='PPYOLOEBasicBlock', shortcut=True, use_alpha=True) + norm_cfg (:obj:`ConfigDict` or dict): Dictionary to construct and + config norm layer. Defaults to dict(type='BN', momentum=0.1, + eps=1e-5). + act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + attention_cfg (dict): Config dict for `EffectiveSELayer`. + Defaults to dict(type='EffectiveSELayer', + act_cfg=dict(type='HSigmoid')). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. + init_cfg (:obj:`ConfigDict` or dict or list[dict] or + list[:obj:`ConfigDict`]): Initialization config dict. + use_large_stem (bool): Whether to use large stem layer. + Defaults to False. + """ + # From left to right: + # in_channels, out_channels, num_blocks + arch_settings = { + 'P5': [[64, 128, 3], [128, 256, 6], [256, 512, 6], [512, 1024, 3]] + } + + def __init__(self, + arch: str = 'P5', + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + input_channels: int = 3, + out_indices: Tuple[int] = (2, 3, 4), + frozen_stages: int = -1, + plugins: Union[dict, List[dict]] = None, + arch_ovewrite: dict = None, + block_cfg: ConfigType = dict( + type='PPYOLOEBasicBlock', shortcut=True, use_alpha=True), + norm_cfg: ConfigType = dict( + type='BN', momentum=0.1, eps=1e-5), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + attention_cfg: ConfigType = dict( + type='EffectiveSELayer', act_cfg=dict(type='HSigmoid')), + norm_eval: bool = False, + init_cfg: OptMultiConfig = None, + use_large_stem: bool = False): + arch_setting = self.arch_settings[arch] + if arch_ovewrite: + arch_setting = arch_ovewrite + arch_setting = [[ + int(in_channels * widen_factor), + int(out_channels * widen_factor), + round(num_blocks * deepen_factor) + ] for in_channels, out_channels, num_blocks in arch_setting] + self.block_cfg = block_cfg + self.use_large_stem = use_large_stem + self.attention_cfg = attention_cfg + + super().__init__( + arch_setting, + deepen_factor, + widen_factor, + input_channels=input_channels, + out_indices=out_indices, + plugins=plugins, + frozen_stages=frozen_stages, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + norm_eval=norm_eval, + init_cfg=init_cfg) + + def build_stem_layer(self) -> nn.Module: + """Build a stem layer.""" + if self.use_large_stem: + stem = nn.Sequential( + ConvModule( + self.input_channels, + self.arch_setting[0][0] // 2, + 3, + stride=2, + padding=1, + act_cfg=self.act_cfg, + norm_cfg=self.norm_cfg), + ConvModule( + self.arch_setting[0][0] // 2, + self.arch_setting[0][0] // 2, + 3, + stride=1, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + ConvModule( + self.arch_setting[0][0] // 2, + self.arch_setting[0][0], + 3, + stride=1, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + else: + stem = nn.Sequential( + ConvModule( + self.input_channels, + self.arch_setting[0][0] // 2, + 3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg, + 
act_cfg=self.act_cfg), + ConvModule( + self.arch_setting[0][0] // 2, + self.arch_setting[0][0], + 3, + stride=1, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + return stem + + def build_stage_layer(self, stage_idx: int, setting: list) -> list: + """Build a stage layer. + + Args: + stage_idx (int): The index of a stage layer. + setting (list): The architecture setting of a stage layer. + """ + in_channels, out_channels, num_blocks = setting + + cspres_layer = CSPResLayer( + in_channels=in_channels, + out_channels=out_channels, + num_block=num_blocks, + block_cfg=self.block_cfg, + stride=2, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + attention_cfg=self.attention_cfg, + use_spp=False) + return [cspres_layer] diff --git a/mmyolo/models/backbones/cspnext.py b/mmyolo/models/backbones/cspnext.py new file mode 100644 index 0000000000000000000000000000000000000000..adca9dd9d11baecefda90a99a4188e78c2ca8188 --- /dev/null +++ b/mmyolo/models/backbones/cspnext.py @@ -0,0 +1,187 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from typing import List, Sequence, Union + +import torch.nn as nn +from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule +from mmdet.models.backbones.csp_darknet import CSPLayer +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig + +from mmyolo.registry import MODELS +from ..layers import SPPFBottleneck +from .base_backbone import BaseBackbone + + +@MODELS.register_module() +class CSPNeXt(BaseBackbone): + """CSPNeXt backbone used in RTMDet. + + Args: + arch (str): Architecture of CSPNeXt, from {P5, P6}. + Defaults to P5. + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + out_indices (Sequence[int]): Output from which stages. + Defaults to (2, 3, 4). + frozen_stages (int): Stages to be frozen (stop grad and set eval + mode). -1 means not freezing any parameters. Defaults to -1. + plugins (list[dict]): List of plugins for stages, each dict contains: + - cfg (dict, required): Cfg dict to build plugin.Defaults to + - stages (tuple[bool], optional): Stages to apply plugin, length + should be same as 'num_stages'. + use_depthwise (bool): Whether to use depthwise separable convolution. + Defaults to False. + expand_ratio (float): Ratio to adjust the number of channels of the + hidden layer. Defaults to 0.5. + arch_ovewrite (list): Overwrite default arch settings. + Defaults to None. + channel_attention (bool): Whether to add channel attention in each + stage. Defaults to True. + conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + convolution layer. Defaults to None. + norm_cfg (:obj:`ConfigDict` or dict): Dictionary to construct and + config norm layer. Defaults to dict(type='BN', requires_grad=True). + act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. + init_cfg (:obj:`ConfigDict` or dict or list[dict] or + list[:obj:`ConfigDict`]): Initialization config dict. 
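+
+    Example:
+        A usage sketch mirroring the other backbones in this repo; the
+        printed shapes assume the default ``P5`` setting with deepen and
+        widen factors of 1.0:
+
+        >>> from mmyolo.models import CSPNeXt
+        >>> import torch
+        >>> model = CSPNeXt()
+        >>> model.eval()
+        >>> inputs = torch.rand(1, 3, 416, 416)
+        >>> level_outputs = model(inputs)
+        >>> for level_out in level_outputs:
+        ...     print(tuple(level_out.shape))
+        ...
+        (1, 256, 52, 52)
+        (1, 512, 26, 26)
+        (1, 1024, 13, 13)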
+ """ + # From left to right: + # in_channels, out_channels, num_blocks, add_identity, use_spp + arch_settings = { + 'P5': [[64, 128, 3, True, False], [128, 256, 6, True, False], + [256, 512, 6, True, False], [512, 1024, 3, False, True]], + 'P6': [[64, 128, 3, True, False], [128, 256, 6, True, False], + [256, 512, 6, True, False], [512, 768, 3, True, False], + [768, 1024, 3, False, True]] + } + + def __init__( + self, + arch: str = 'P5', + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + input_channels: int = 3, + out_indices: Sequence[int] = (2, 3, 4), + frozen_stages: int = -1, + plugins: Union[dict, List[dict]] = None, + use_depthwise: bool = False, + expand_ratio: float = 0.5, + arch_ovewrite: dict = None, + channel_attention: bool = True, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict(type='BN'), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + norm_eval: bool = False, + init_cfg: OptMultiConfig = dict( + type='Kaiming', + layer='Conv2d', + a=math.sqrt(5), + distribution='uniform', + mode='fan_in', + nonlinearity='leaky_relu') + ) -> None: + arch_setting = self.arch_settings[arch] + if arch_ovewrite: + arch_setting = arch_ovewrite + self.channel_attention = channel_attention + self.use_depthwise = use_depthwise + self.conv = DepthwiseSeparableConvModule \ + if use_depthwise else ConvModule + self.expand_ratio = expand_ratio + self.conv_cfg = conv_cfg + + super().__init__( + arch_setting, + deepen_factor, + widen_factor, + input_channels, + out_indices, + frozen_stages=frozen_stages, + plugins=plugins, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + norm_eval=norm_eval, + init_cfg=init_cfg) + + def build_stem_layer(self) -> nn.Module: + """Build a stem layer.""" + stem = nn.Sequential( + ConvModule( + 3, + int(self.arch_setting[0][0] * self.widen_factor // 2), + 3, + padding=1, + stride=2, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + ConvModule( + int(self.arch_setting[0][0] * self.widen_factor // 2), + int(self.arch_setting[0][0] * self.widen_factor // 2), + 3, + padding=1, + stride=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + ConvModule( + int(self.arch_setting[0][0] * self.widen_factor // 2), + int(self.arch_setting[0][0] * self.widen_factor), + 3, + padding=1, + stride=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + return stem + + def build_stage_layer(self, stage_idx: int, setting: list) -> list: + """Build a stage layer. + + Args: + stage_idx (int): The index of a stage layer. + setting (list): The architecture setting of a stage layer. 
+ """ + in_channels, out_channels, num_blocks, add_identity, use_spp = setting + + in_channels = int(in_channels * self.widen_factor) + out_channels = int(out_channels * self.widen_factor) + num_blocks = max(round(num_blocks * self.deepen_factor), 1) + + stage = [] + conv_layer = self.conv( + in_channels, + out_channels, + 3, + stride=2, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + stage.append(conv_layer) + if use_spp: + spp = SPPFBottleneck( + out_channels, + out_channels, + kernel_sizes=5, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + stage.append(spp) + csp_layer = CSPLayer( + out_channels, + out_channels, + num_blocks=num_blocks, + add_identity=add_identity, + use_depthwise=self.use_depthwise, + use_cspnext_block=True, + expand_ratio=self.expand_ratio, + channel_attention=self.channel_attention, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + stage.append(csp_layer) + return stage diff --git a/mmyolo/models/backbones/efficient_rep.py b/mmyolo/models/backbones/efficient_rep.py new file mode 100644 index 0000000000000000000000000000000000000000..691c5b846a6453ff1dfbccb6785337f43e356bdc --- /dev/null +++ b/mmyolo/models/backbones/efficient_rep.py @@ -0,0 +1,287 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +from typing import List, Tuple, Union + +import torch +import torch.nn as nn +from mmdet.utils import ConfigType, OptMultiConfig + +from mmyolo.models.layers.yolo_bricks import SPPFBottleneck +from mmyolo.registry import MODELS +from ..layers import BepC3StageBlock, RepStageBlock +from ..utils import make_round +from .base_backbone import BaseBackbone + + +@MODELS.register_module() +class YOLOv6EfficientRep(BaseBackbone): + """EfficientRep backbone used in YOLOv6. + Args: + arch (str): Architecture of BaseDarknet, from {P5, P6}. + Defaults to P5. + plugins (list[dict]): List of plugins for stages, each dict contains: + - cfg (dict, required): Cfg dict to build plugin. + - stages (tuple[bool], optional): Stages to apply plugin, length + should be same as 'num_stages'. + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + input_channels (int): Number of input image channels. Defaults to 3. + out_indices (Tuple[int]): Output from which stages. + Defaults to (2, 3, 4). + frozen_stages (int): Stages to be frozen (stop grad and set eval + mode). -1 means not freezing any parameters. Defaults to -1. + norm_cfg (dict): Dictionary to construct and config norm layer. + Defaults to dict(type='BN', requires_grad=True). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='LeakyReLU', negative_slope=0.1). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Defaults to False. + block_cfg (dict): Config dict for the block used to build each + layer. Defaults to dict(type='RepVGGBlock'). + init_cfg (Union[dict, list[dict]], optional): Initialization config + dict. Defaults to None. + Example: + >>> from mmyolo.models import YOLOv6EfficientRep + >>> import torch + >>> model = YOLOv6EfficientRep() + >>> model.eval() + >>> inputs = torch.rand(1, 3, 416, 416) + >>> level_outputs = model(inputs) + >>> for level_out in level_outputs: + ... print(tuple(level_out.shape)) + ... 
+ (1, 256, 52, 52) + (1, 512, 26, 26) + (1, 1024, 13, 13) + """ + # From left to right: + # in_channels, out_channels, num_blocks, use_spp + arch_settings = { + 'P5': [[64, 128, 6, False], [128, 256, 12, False], + [256, 512, 18, False], [512, 1024, 6, True]] + } + + def __init__(self, + arch: str = 'P5', + plugins: Union[dict, List[dict]] = None, + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + input_channels: int = 3, + out_indices: Tuple[int] = (2, 3, 4), + frozen_stages: int = -1, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='ReLU', inplace=True), + norm_eval: bool = False, + block_cfg: ConfigType = dict(type='RepVGGBlock'), + init_cfg: OptMultiConfig = None): + self.block_cfg = block_cfg + super().__init__( + self.arch_settings[arch], + deepen_factor, + widen_factor, + input_channels=input_channels, + out_indices=out_indices, + plugins=plugins, + frozen_stages=frozen_stages, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + norm_eval=norm_eval, + init_cfg=init_cfg) + + def build_stem_layer(self) -> nn.Module: + """Build a stem layer.""" + + block_cfg = self.block_cfg.copy() + block_cfg.update( + dict( + in_channels=self.input_channels, + out_channels=int(self.arch_setting[0][0] * self.widen_factor), + kernel_size=3, + stride=2, + )) + return MODELS.build(block_cfg) + + def build_stage_layer(self, stage_idx: int, setting: list) -> list: + """Build a stage layer. + + Args: + stage_idx (int): The index of a stage layer. + setting (list): The architecture setting of a stage layer. + """ + in_channels, out_channels, num_blocks, use_spp = setting + + in_channels = int(in_channels * self.widen_factor) + out_channels = int(out_channels * self.widen_factor) + num_blocks = make_round(num_blocks, self.deepen_factor) + + rep_stage_block = RepStageBlock( + in_channels=out_channels, + out_channels=out_channels, + num_blocks=num_blocks, + block_cfg=self.block_cfg, + ) + + block_cfg = self.block_cfg.copy() + block_cfg.update( + dict( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=3, + stride=2)) + stage = [] + + ef_block = nn.Sequential(MODELS.build(block_cfg), rep_stage_block) + + stage.append(ef_block) + + if use_spp: + spp = SPPFBottleneck( + in_channels=out_channels, + out_channels=out_channels, + kernel_sizes=5, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + stage.append(spp) + return stage + + def init_weights(self): + if self.init_cfg is None: + """Initialize the parameters.""" + for m in self.modules(): + if isinstance(m, torch.nn.Conv2d): + # In order to be consistent with the source code, + # reset the Conv2d initialization parameters + m.reset_parameters() + else: + super().init_weights() + + +@MODELS.register_module() +class YOLOv6CSPBep(YOLOv6EfficientRep): + """CSPBep backbone used in YOLOv6. + Args: + arch (str): Architecture of BaseDarknet, from {P5, P6}. + Defaults to P5. + plugins (list[dict]): List of plugins for stages, each dict contains: + - cfg (dict, required): Cfg dict to build plugin. + - stages (tuple[bool], optional): Stages to apply plugin, length + should be same as 'num_stages'. + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + input_channels (int): Number of input image channels. Defaults to 3. + out_indices (Tuple[int]): Output from which stages. + Defaults to (2, 3, 4). 
+ frozen_stages (int): Stages to be frozen (stop grad and set eval + mode). -1 means not freezing any parameters. Defaults to -1. + norm_cfg (dict): Dictionary to construct and config norm layer. + Defaults to dict(type='BN', requires_grad=True). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='LeakyReLU', negative_slope=0.1). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Defaults to False. + block_cfg (dict): Config dict for the block used to build each + layer. Defaults to dict(type='RepVGGBlock'). + block_act_cfg (dict): Config dict for activation layer used in each + stage. Defaults to dict(type='SiLU', inplace=True). + init_cfg (Union[dict, list[dict]], optional): Initialization config + dict. Defaults to None. + Example: + >>> from mmyolo.models import YOLOv6CSPBep + >>> import torch + >>> model = YOLOv6CSPBep() + >>> model.eval() + >>> inputs = torch.rand(1, 3, 416, 416) + >>> level_outputs = model(inputs) + >>> for level_out in level_outputs: + ... print(tuple(level_out.shape)) + ... + (1, 256, 52, 52) + (1, 512, 26, 26) + (1, 1024, 13, 13) + """ + # From left to right: + # in_channels, out_channels, num_blocks, use_spp + arch_settings = { + 'P5': [[64, 128, 6, False], [128, 256, 12, False], + [256, 512, 18, False], [512, 1024, 6, True]] + } + + def __init__(self, + arch: str = 'P5', + plugins: Union[dict, List[dict]] = None, + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + input_channels: int = 3, + hidden_ratio: float = 0.5, + out_indices: Tuple[int] = (2, 3, 4), + frozen_stages: int = -1, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + norm_eval: bool = False, + block_cfg: ConfigType = dict(type='ConvWrapper'), + init_cfg: OptMultiConfig = None): + self.hidden_ratio = hidden_ratio + super().__init__( + arch=arch, + deepen_factor=deepen_factor, + widen_factor=widen_factor, + input_channels=input_channels, + out_indices=out_indices, + plugins=plugins, + frozen_stages=frozen_stages, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + norm_eval=norm_eval, + block_cfg=block_cfg, + init_cfg=init_cfg) + + def build_stage_layer(self, stage_idx: int, setting: list) -> list: + """Build a stage layer. + + Args: + stage_idx (int): The index of a stage layer. + setting (list): The architecture setting of a stage layer. 
+ """ + in_channels, out_channels, num_blocks, use_spp = setting + in_channels = int(in_channels * self.widen_factor) + out_channels = int(out_channels * self.widen_factor) + num_blocks = make_round(num_blocks, self.deepen_factor) + + rep_stage_block = BepC3StageBlock( + in_channels=out_channels, + out_channels=out_channels, + num_blocks=num_blocks, + hidden_ratio=self.hidden_ratio, + block_cfg=self.block_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + block_cfg = self.block_cfg.copy() + block_cfg.update( + dict( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=3, + stride=2)) + stage = [] + + ef_block = nn.Sequential(MODELS.build(block_cfg), rep_stage_block) + + stage.append(ef_block) + + if use_spp: + spp = SPPFBottleneck( + in_channels=out_channels, + out_channels=out_channels, + kernel_sizes=5, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + stage.append(spp) + return stage diff --git a/mmyolo/models/backbones/yolov7_backbone.py b/mmyolo/models/backbones/yolov7_backbone.py new file mode 100644 index 0000000000000000000000000000000000000000..bb9a5eed85ca1ee6884f7348ef3745a9ceaba032 --- /dev/null +++ b/mmyolo/models/backbones/yolov7_backbone.py @@ -0,0 +1,285 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Tuple, Union + +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmdet.models.backbones.csp_darknet import Focus +from mmdet.utils import ConfigType, OptMultiConfig + +from mmyolo.registry import MODELS +from ..layers import MaxPoolAndStrideConvBlock +from .base_backbone import BaseBackbone + + +@MODELS.register_module() +class YOLOv7Backbone(BaseBackbone): + """Backbone used in YOLOv7. + + Args: + arch (str): Architecture of YOLOv7Defaults to L. + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + out_indices (Sequence[int]): Output from which stages. + Defaults to (2, 3, 4). + frozen_stages (int): Stages to be frozen (stop grad and set eval + mode). -1 means not freezing any parameters. Defaults to -1. + plugins (list[dict]): List of plugins for stages, each dict contains: + + - cfg (dict, required): Cfg dict to build plugin. + - stages (tuple[bool], optional): Stages to apply plugin, length + should be same as 'num_stages'. + norm_cfg (:obj:`ConfigDict` or dict): Dictionary to construct and + config norm layer. Defaults to dict(type='BN', requires_grad=True). + act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. + init_cfg (:obj:`ConfigDict` or dict or list[dict] or + list[:obj:`ConfigDict`]): Initialization config dict. 
+ """ + _tiny_stage1_cfg = dict(type='TinyDownSampleBlock', middle_ratio=0.5) + _tiny_stage2_4_cfg = dict(type='TinyDownSampleBlock', middle_ratio=1.0) + _l_expand_channel_2x = dict( + type='ELANBlock', + middle_ratio=0.5, + block_ratio=0.5, + num_blocks=2, + num_convs_in_block=2) + _l_no_change_channel = dict( + type='ELANBlock', + middle_ratio=0.25, + block_ratio=0.25, + num_blocks=2, + num_convs_in_block=2) + _x_expand_channel_2x = dict( + type='ELANBlock', + middle_ratio=0.4, + block_ratio=0.4, + num_blocks=3, + num_convs_in_block=2) + _x_no_change_channel = dict( + type='ELANBlock', + middle_ratio=0.2, + block_ratio=0.2, + num_blocks=3, + num_convs_in_block=2) + _w_no_change_channel = dict( + type='ELANBlock', + middle_ratio=0.5, + block_ratio=0.5, + num_blocks=2, + num_convs_in_block=2) + _e_no_change_channel = dict( + type='ELANBlock', + middle_ratio=0.4, + block_ratio=0.4, + num_blocks=3, + num_convs_in_block=2) + _d_no_change_channel = dict( + type='ELANBlock', + middle_ratio=1 / 3, + block_ratio=1 / 3, + num_blocks=4, + num_convs_in_block=2) + _e2e_no_change_channel = dict( + type='EELANBlock', + num_elan_block=2, + middle_ratio=0.4, + block_ratio=0.4, + num_blocks=3, + num_convs_in_block=2) + + # From left to right: + # in_channels, out_channels, Block_params + arch_settings = { + 'Tiny': [[64, 64, _tiny_stage1_cfg], [64, 128, _tiny_stage2_4_cfg], + [128, 256, _tiny_stage2_4_cfg], + [256, 512, _tiny_stage2_4_cfg]], + 'L': [[64, 256, _l_expand_channel_2x], + [256, 512, _l_expand_channel_2x], + [512, 1024, _l_expand_channel_2x], + [1024, 1024, _l_no_change_channel]], + 'X': [[80, 320, _x_expand_channel_2x], + [320, 640, _x_expand_channel_2x], + [640, 1280, _x_expand_channel_2x], + [1280, 1280, _x_no_change_channel]], + 'W': + [[64, 128, _w_no_change_channel], [128, 256, _w_no_change_channel], + [256, 512, _w_no_change_channel], [512, 768, _w_no_change_channel], + [768, 1024, _w_no_change_channel]], + 'E': + [[80, 160, _e_no_change_channel], [160, 320, _e_no_change_channel], + [320, 640, _e_no_change_channel], [640, 960, _e_no_change_channel], + [960, 1280, _e_no_change_channel]], + 'D': [[96, 192, + _d_no_change_channel], [192, 384, _d_no_change_channel], + [384, 768, _d_no_change_channel], + [768, 1152, _d_no_change_channel], + [1152, 1536, _d_no_change_channel]], + 'E2E': [[80, 160, _e2e_no_change_channel], + [160, 320, _e2e_no_change_channel], + [320, 640, _e2e_no_change_channel], + [640, 960, _e2e_no_change_channel], + [960, 1280, _e2e_no_change_channel]], + } + + def __init__(self, + arch: str = 'L', + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + input_channels: int = 3, + out_indices: Tuple[int] = (2, 3, 4), + frozen_stages: int = -1, + plugins: Union[dict, List[dict]] = None, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + norm_eval: bool = False, + init_cfg: OptMultiConfig = None): + assert arch in self.arch_settings.keys() + self.arch = arch + super().__init__( + self.arch_settings[arch], + deepen_factor, + widen_factor, + input_channels=input_channels, + out_indices=out_indices, + plugins=plugins, + frozen_stages=frozen_stages, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + norm_eval=norm_eval, + init_cfg=init_cfg) + + def build_stem_layer(self) -> nn.Module: + """Build a stem layer.""" + if self.arch in ['L', 'X']: + stem = nn.Sequential( + ConvModule( + 3, + int(self.arch_setting[0][0] * self.widen_factor // 2), + 3, + padding=1, + stride=1, + norm_cfg=self.norm_cfg, + 
act_cfg=self.act_cfg), + ConvModule( + int(self.arch_setting[0][0] * self.widen_factor // 2), + int(self.arch_setting[0][0] * self.widen_factor), + 3, + padding=1, + stride=2, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + ConvModule( + int(self.arch_setting[0][0] * self.widen_factor), + int(self.arch_setting[0][0] * self.widen_factor), + 3, + padding=1, + stride=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + elif self.arch == 'Tiny': + stem = nn.Sequential( + ConvModule( + 3, + int(self.arch_setting[0][0] * self.widen_factor // 2), + 3, + padding=1, + stride=2, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + ConvModule( + int(self.arch_setting[0][0] * self.widen_factor // 2), + int(self.arch_setting[0][0] * self.widen_factor), + 3, + padding=1, + stride=2, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + elif self.arch in ['W', 'E', 'D', 'E2E']: + stem = Focus( + 3, + int(self.arch_setting[0][0] * self.widen_factor), + kernel_size=3, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + return stem + + def build_stage_layer(self, stage_idx: int, setting: list) -> list: + """Build a stage layer. + + Args: + stage_idx (int): The index of a stage layer. + setting (list): The architecture setting of a stage layer. + """ + in_channels, out_channels, stage_block_cfg = setting + in_channels = int(in_channels * self.widen_factor) + out_channels = int(out_channels * self.widen_factor) + + stage_block_cfg = stage_block_cfg.copy() + stage_block_cfg.setdefault('norm_cfg', self.norm_cfg) + stage_block_cfg.setdefault('act_cfg', self.act_cfg) + + stage_block_cfg['in_channels'] = in_channels + stage_block_cfg['out_channels'] = out_channels + + stage = [] + if self.arch in ['W', 'E', 'D', 'E2E']: + stage_block_cfg['in_channels'] = out_channels + elif self.arch in ['L', 'X']: + if stage_idx == 0: + stage_block_cfg['in_channels'] = out_channels // 2 + + downsample_layer = self._build_downsample_layer( + stage_idx, in_channels, out_channels) + stage.append(MODELS.build(stage_block_cfg)) + if downsample_layer is not None: + stage.insert(0, downsample_layer) + return stage + + def _build_downsample_layer(self, stage_idx: int, in_channels: int, + out_channels: int) -> Optional[nn.Module]: + """Build a downsample layer pre stage.""" + if self.arch in ['E', 'D', 'E2E']: + downsample_layer = MaxPoolAndStrideConvBlock( + in_channels, + out_channels, + use_in_channels_of_middle=True, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + elif self.arch == 'W': + downsample_layer = ConvModule( + in_channels, + out_channels, + 3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + elif self.arch == 'Tiny': + if stage_idx != 0: + downsample_layer = nn.MaxPool2d(2, 2) + else: + downsample_layer = None + elif self.arch in ['L', 'X']: + if stage_idx == 0: + downsample_layer = ConvModule( + in_channels, + out_channels // 2, + 3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + else: + downsample_layer = MaxPoolAndStrideConvBlock( + in_channels, + in_channels, + use_in_channels_of_middle=False, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + return downsample_layer diff --git a/mmyolo/models/data_preprocessors/__init__.py b/mmyolo/models/data_preprocessors/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3ef4f6d7d801cb8150ebca645ddb3cbf5d1b9599 --- /dev/null +++ b/mmyolo/models/data_preprocessors/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
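# --- Editor's sketch (not part of the patch): usage of the YOLOv7Backbone
# defined above, in the same doctest style as the other backbone docstrings.
# The printed shapes are an assumption derived from the 'L' arch_settings with
# the default widen_factor=1.0; verify against the installed mmyolo version.
import torch
from mmyolo.models import YOLOv7Backbone

model = YOLOv7Backbone(arch='L', out_indices=(2, 3, 4))
model.eval()
with torch.no_grad():
    level_outputs = model(torch.rand(1, 3, 640, 640))
for level_out in level_outputs:
    print(tuple(level_out.shape))
# assumed output: (1, 512, 80, 80) (1, 1024, 40, 40) (1, 1024, 20, 20)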
+from .data_preprocessor import (PPYOLOEBatchRandomResize, + PPYOLOEDetDataPreprocessor, + YOLOv5DetDataPreprocessor, + YOLOXBatchSyncRandomResize) + +__all__ = [ + 'YOLOv5DetDataPreprocessor', 'PPYOLOEDetDataPreprocessor', + 'PPYOLOEBatchRandomResize', 'YOLOXBatchSyncRandomResize' +] diff --git a/mmyolo/models/data_preprocessors/data_preprocessor.py b/mmyolo/models/data_preprocessors/data_preprocessor.py new file mode 100644 index 0000000000000000000000000000000000000000..f09fd8e741b9ec7d002391968eab40924ff7ab8d --- /dev/null +++ b/mmyolo/models/data_preprocessors/data_preprocessor.py @@ -0,0 +1,302 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import random +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +from mmdet.models import BatchSyncRandomResize +from mmdet.models.data_preprocessors import DetDataPreprocessor +from mmengine import MessageHub, is_list_of +from mmengine.structures import BaseDataElement +from torch import Tensor + +from mmyolo.registry import MODELS + +CastData = Union[tuple, dict, BaseDataElement, torch.Tensor, list, bytes, str, + None] + + +@MODELS.register_module() +class YOLOXBatchSyncRandomResize(BatchSyncRandomResize): + """YOLOX batch random resize. + + Args: + random_size_range (tuple): The multi-scale random range during + multi-scale training. + interval (int): The iter interval of change + image size. Defaults to 10. + size_divisor (int): Image size divisible factor. + Defaults to 32. + """ + + def forward(self, inputs: Tensor, data_samples: dict) -> Tensor and dict: + """resize a batch of images and bboxes to shape ``self._input_size``""" + h, w = inputs.shape[-2:] + inputs = inputs.float() + assert isinstance(data_samples, dict) + + if self._input_size is None: + self._input_size = (h, w) + scale_y = self._input_size[0] / h + scale_x = self._input_size[1] / w + if scale_x != 1 or scale_y != 1: + inputs = F.interpolate( + inputs, + size=self._input_size, + mode='bilinear', + align_corners=False) + + data_samples['bboxes_labels'][:, 2::2] *= scale_x + data_samples['bboxes_labels'][:, 3::2] *= scale_y + + message_hub = MessageHub.get_current_instance() + if (message_hub.get_info('iter') + 1) % self._interval == 0: + self._input_size = self._get_random_size( + aspect_ratio=float(w / h), device=inputs.device) + + return inputs, data_samples + + +@MODELS.register_module() +class YOLOv5DetDataPreprocessor(DetDataPreprocessor): + """Rewrite collate_fn to get faster training speed. + + Note: It must be used together with `mmyolo.datasets.utils.yolov5_collate` + """ + + def __init__(self, *args, non_blocking: Optional[bool] = True, **kwargs): + super().__init__(*args, non_blocking=non_blocking, **kwargs) + + def forward(self, data: dict, training: bool = False) -> dict: + """Perform normalization, padding and bgr2rgb conversion based on + ``DetDataPreprocessorr``. + + Args: + data (dict): Data sampled from dataloader. + training (bool): Whether to enable training time augmentation. + + Returns: + dict: Data in the same format as the model input. + """ + if not training: + return super().forward(data, training) + + data = self.cast_data(data) + inputs, data_samples = data['inputs'], data['data_samples'] + assert isinstance(data['data_samples'], dict) + + # TODO: Supports multi-scale training + if self._channel_conversion and inputs.shape[1] == 3: + inputs = inputs[:, [2, 1, 0], ...] 
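+ # NOTE: with `yolov5_collate`, `inputs` is already a single stacked
+ # (N, C, H, W) tensor at this point, so the channel flip above and the
+ # normalization below run once per batch rather than once per image.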
+ if self._enable_normalize: + inputs = (inputs - self.mean) / self.std + + if self.batch_augments is not None: + for batch_aug in self.batch_augments: + inputs, data_samples = batch_aug(inputs, data_samples) + + img_metas = [{'batch_input_shape': inputs.shape[2:]}] * len(inputs) + data_samples_output = { + 'bboxes_labels': data_samples['bboxes_labels'], + 'img_metas': img_metas + } + if 'masks' in data_samples: + data_samples_output['masks'] = data_samples['masks'] + + return {'inputs': inputs, 'data_samples': data_samples_output} + + +@MODELS.register_module() +class PPYOLOEDetDataPreprocessor(DetDataPreprocessor): + """Image pre-processor for detection tasks. + + The main difference between PPYOLOEDetDataPreprocessor and + DetDataPreprocessor is the normalization order. The official + PPYOLOE resize image first, and then normalize image. + In DetDataPreprocessor, the order is reversed. + + Note: It must be used together with + `mmyolo.datasets.utils.yolov5_collate` + """ + + def forward(self, data: dict, training: bool = False) -> dict: + """Perform normalization、padding and bgr2rgb conversion based on + ``BaseDataPreprocessor``. This class use batch_augments first, and then + normalize the image, which is different from the `DetDataPreprocessor` + . + + Args: + data (dict): Data sampled from dataloader. + training (bool): Whether to enable training time augmentation. + + Returns: + dict: Data in the same format as the model input. + """ + if not training: + return super().forward(data, training) + + assert isinstance(data['inputs'], list) and is_list_of( + data['inputs'], torch.Tensor), \ + '"inputs" should be a list of Tensor, but got ' \ + f'{type(data["inputs"])}. The possible reason for this ' \ + 'is that you are not using it with ' \ + '"mmyolo.datasets.utils.yolov5_collate". Please refer to ' \ + '"cconfigs/ppyoloe/ppyoloe_plus_s_fast_8xb8-80e_coco.py".' + + data = self.cast_data(data) + inputs, data_samples = data['inputs'], data['data_samples'] + assert isinstance(data['data_samples'], dict) + + # Process data. + batch_inputs = [] + for _input in inputs: + # channel transform + if self._channel_conversion: + _input = _input[[2, 1, 0], ...] + # Convert to float after channel conversion to ensure + # efficiency + _input = _input.float() + batch_inputs.append(_input) + + # Batch random resize image. + if self.batch_augments is not None: + for batch_aug in self.batch_augments: + inputs, data_samples = batch_aug(batch_inputs, data_samples) + + if self._enable_normalize: + inputs = (inputs - self.mean) / self.std + + img_metas = [{'batch_input_shape': inputs.shape[2:]}] * len(inputs) + data_samples = { + 'bboxes_labels': data_samples['bboxes_labels'], + 'img_metas': img_metas + } + + return {'inputs': inputs, 'data_samples': data_samples} + + +# TODO: No generality. Its input data format is different +# mmdet's batch aug, and it must be compatible in the future. +@MODELS.register_module() +class PPYOLOEBatchRandomResize(BatchSyncRandomResize): + """PPYOLOE batch random resize. + + Args: + random_size_range (tuple): The multi-scale random range during + multi-scale training. + interval (int): The iter interval of change + image size. Defaults to 10. + size_divisor (int): Image size divisible factor. + Defaults to 32. + random_interp (bool): Whether to choose interp_mode randomly. + If set to True, the type of `interp_mode` must be list. + If set to False, the type of `interp_mode` must be str. + Defaults to True. 
+ interp_mode (Union[List, str]): The modes available for resizing + are ('nearest', 'bilinear', 'bicubic', 'area'). + keep_ratio (bool): Whether to keep the aspect ratio when resizing + the image. Now we only support keep_ratio=False. + Defaults to False. + """ + + def __init__(self, + random_size_range: Tuple[int, int], + interval: int = 1, + size_divisor: int = 32, + random_interp=True, + interp_mode: Union[List[str], str] = [ + 'nearest', 'bilinear', 'bicubic', 'area' + ], + keep_ratio: bool = False) -> None: + super().__init__(random_size_range, interval, size_divisor) + self.random_interp = random_interp + self.keep_ratio = keep_ratio + # TODO: need to support keep_ratio==True + assert not self.keep_ratio, 'We do not yet support keep_ratio=True' + + if self.random_interp: + assert isinstance(interp_mode, list) and len(interp_mode) > 1,\ + 'While random_interp==True, the type of `interp_mode`' \ + ' must be list and len(interp_mode) must large than 1' + self.interp_mode_list = interp_mode + self.interp_mode = None + else: + assert isinstance(interp_mode, str),\ + 'While random_interp==False, the type of ' \ + '`interp_mode` must be str' + assert interp_mode in ['nearest', 'bilinear', 'bicubic', 'area'] + self.interp_mode_list = None + self.interp_mode = interp_mode + + def forward(self, inputs: list, + data_samples: dict) -> Tuple[Tensor, Tensor]: + """Resize a batch of images and bboxes to shape ``self._input_size``. + + The inputs and data_samples should be list, and + ``PPYOLOEBatchRandomResize`` must be used with + ``PPYOLOEDetDataPreprocessor`` and ``yolov5_collate`` with + ``use_ms_training == True``. + """ + assert isinstance(inputs, list),\ + 'The type of inputs must be list. The possible reason for this ' \ + 'is that you are not using it with `PPYOLOEDetDataPreprocessor` ' \ + 'and `yolov5_collate` with use_ms_training == True.' + + bboxes_labels = data_samples['bboxes_labels'] + + message_hub = MessageHub.get_current_instance() + if (message_hub.get_info('iter') + 1) % self._interval == 0: + # get current input size + self._input_size, interp_mode = self._get_random_size_and_interp() + if self.random_interp: + self.interp_mode = interp_mode + + # TODO: need to support type(inputs)==Tensor + if isinstance(inputs, list): + outputs = [] + for i in range(len(inputs)): + _batch_input = inputs[i] + h, w = _batch_input.shape[-2:] + scale_y = self._input_size[0] / h + scale_x = self._input_size[1] / w + if scale_x != 1. 
or scale_y != 1.: + if self.interp_mode in ('nearest', 'area'): + align_corners = None + else: + align_corners = False + _batch_input = F.interpolate( + _batch_input.unsqueeze(0), + size=self._input_size, + mode=self.interp_mode, + align_corners=align_corners) + + # rescale boxes + indexes = bboxes_labels[:, 0] == i + bboxes_labels[indexes, 2] *= scale_x + bboxes_labels[indexes, 3] *= scale_y + bboxes_labels[indexes, 4] *= scale_x + bboxes_labels[indexes, 5] *= scale_y + + data_samples['bboxes_labels'] = bboxes_labels + else: + _batch_input = _batch_input.unsqueeze(0) + + outputs.append(_batch_input) + + # convert to Tensor + return torch.cat(outputs, dim=0), data_samples + else: + raise NotImplementedError('Not implemented yet!') + + def _get_random_size_and_interp(self) -> Tuple[int, int]: + """Randomly generate a shape in ``_random_size_range`` and a + interp_mode in interp_mode_list.""" + size = random.randint(*self._random_size_range) + input_size = (self._size_divisor * size, self._size_divisor * size) + + if self.random_interp: + interp_ind = random.randint(0, len(self.interp_mode_list) - 1) + interp_mode = self.interp_mode_list[interp_ind] + else: + interp_mode = None + return input_size, interp_mode diff --git a/mmyolo/models/dense_heads/__init__.py b/mmyolo/models/dense_heads/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a95abd611db4115484c62fab610650a091c092cf --- /dev/null +++ b/mmyolo/models/dense_heads/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .ppyoloe_head import PPYOLOEHead, PPYOLOEHeadModule +from .rtmdet_head import RTMDetHead, RTMDetSepBNHeadModule +from .rtmdet_ins_head import RTMDetInsSepBNHead, RTMDetInsSepBNHeadModule +from .rtmdet_rotated_head import (RTMDetRotatedHead, + RTMDetRotatedSepBNHeadModule) +from .yolov5_head import YOLOv5Head, YOLOv5HeadModule +from .yolov6_head import YOLOv6Head, YOLOv6HeadModule +from .yolov7_head import YOLOv7Head, YOLOv7HeadModule, YOLOv7p6HeadModule +from .yolov8_head import YOLOv8Head, YOLOv8HeadModule +from .yolox_head import YOLOXHead, YOLOXHeadModule + +__all__ = [ + 'YOLOv5Head', 'YOLOv6Head', 'YOLOXHead', 'YOLOv5HeadModule', + 'YOLOv6HeadModule', 'YOLOXHeadModule', 'RTMDetHead', + 'RTMDetSepBNHeadModule', 'YOLOv7Head', 'PPYOLOEHead', 'PPYOLOEHeadModule', + 'YOLOv7HeadModule', 'YOLOv7p6HeadModule', 'YOLOv8Head', 'YOLOv8HeadModule', + 'RTMDetRotatedHead', 'RTMDetRotatedSepBNHeadModule', 'RTMDetInsSepBNHead', + 'RTMDetInsSepBNHeadModule' +] diff --git a/mmyolo/models/dense_heads/ppyoloe_head.py b/mmyolo/models/dense_heads/ppyoloe_head.py new file mode 100644 index 0000000000000000000000000000000000000000..72d820041cf2fc3d3f605fee1ae9cc87cf7cee4c --- /dev/null +++ b/mmyolo/models/dense_heads/ppyoloe_head.py @@ -0,0 +1,374 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
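# --- Editor's sketch (not part of the patch): the size arithmetic behind
# PPYOLOEBatchRandomResize._get_random_size_and_interp above.
# `random_size_range` is expressed in multiples of `size_divisor`, so a
# hypothetical range of (10, 20) with the default divisor of 32 yields square
# input sizes from 320 to 640 in steps of 32.
import random

random_size_range = (10, 20)  # hypothetical values, not taken from a config
size_divisor = 32
size = random.randint(*random_size_range)                # inclusive on both ends
input_size = (size_divisor * size, size_divisor * size)
print(input_size)                                        # e.g. (448, 448)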
+from typing import Sequence, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmdet.models.utils import multi_apply +from mmdet.utils import (ConfigType, OptConfigType, OptInstanceList, + OptMultiConfig, reduce_mean) +from mmengine import MessageHub +from mmengine.model import BaseModule, bias_init_with_prob +from mmengine.structures import InstanceData +from torch import Tensor + +from mmyolo.registry import MODELS +from ..layers.yolo_bricks import PPYOLOESELayer +from ..utils import gt_instances_preprocess +from .yolov6_head import YOLOv6Head + + +@MODELS.register_module() +class PPYOLOEHeadModule(BaseModule): + """PPYOLOEHead head module used in `PPYOLOE. + + `_. + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + num_base_priors (int): The number of priors (points) at a point + on the feature grid. + featmap_strides (Sequence[int]): Downsample factor of each feature map. + Defaults to (8, 16, 32). + reg_max (int): Max value of integral set :math: ``{0, ..., reg_max}`` + in QFL setting. Defaults to 16. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + num_classes: int, + in_channels: Union[int, Sequence], + widen_factor: float = 1.0, + num_base_priors: int = 1, + featmap_strides: Sequence[int] = (8, 16, 32), + reg_max: int = 16, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.1, eps=1e-5), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None): + super().__init__(init_cfg=init_cfg) + + self.num_classes = num_classes + self.featmap_strides = featmap_strides + self.num_levels = len(self.featmap_strides) + self.num_base_priors = num_base_priors + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.reg_max = reg_max + + if isinstance(in_channels, int): + self.in_channels = [int(in_channels * widen_factor) + ] * self.num_levels + else: + self.in_channels = [int(i * widen_factor) for i in in_channels] + + self._init_layers() + + def init_weights(self, prior_prob=0.01): + """Initialize the weight and bias of PPYOLOE head.""" + super().init_weights() + for conv in self.cls_preds: + conv.bias.data.fill_(bias_init_with_prob(prior_prob)) + conv.weight.data.fill_(0.) + + for conv in self.reg_preds: + conv.bias.data.fill_(1.0) + conv.weight.data.fill_(0.) 
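# --- Editor's note (not part of the patch): the classification bias above is
# filled with bias_init_with_prob(prior_prob=0.01). With zero-initialised
# weights, every location's initial class logit equals that bias, so
# sigmoid(logit) starts near the prior probability and early training is not
# dominated by easy negatives. Minimal re-derivation, assuming the usual
# -log((1 - p) / p) form of bias_init_with_prob:
import math

prior_prob = 0.01
bias = -math.log((1 - prior_prob) / prior_prob)
print(round(bias, 3))                           # ~ -4.595
print(round(1.0 / (1.0 + math.exp(-bias)), 3))  # sigmoid(bias) ~= 0.01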
+ + def _init_layers(self): + """initialize conv layers in PPYOLOE head.""" + self.cls_preds = nn.ModuleList() + self.reg_preds = nn.ModuleList() + self.cls_stems = nn.ModuleList() + self.reg_stems = nn.ModuleList() + + for in_channel in self.in_channels: + self.cls_stems.append( + PPYOLOESELayer( + in_channel, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg)) + self.reg_stems.append( + PPYOLOESELayer( + in_channel, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg)) + + for in_channel in self.in_channels: + self.cls_preds.append( + nn.Conv2d(in_channel, self.num_classes, 3, padding=1)) + self.reg_preds.append( + nn.Conv2d(in_channel, 4 * (self.reg_max + 1), 3, padding=1)) + + # init proj + proj = torch.linspace(0, self.reg_max, self.reg_max + 1).view( + [1, self.reg_max + 1, 1, 1]) + self.register_buffer('proj', proj, persistent=False) + + def forward(self, x: Tuple[Tensor]) -> Tensor: + """Forward features from the upstream network. + + Args: + x (Tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + Returns: + Tuple[List]: A tuple of multi-level classification scores, bbox + predictions. + """ + assert len(x) == self.num_levels + + return multi_apply(self.forward_single, x, self.cls_stems, + self.cls_preds, self.reg_stems, self.reg_preds) + + def forward_single(self, x: Tensor, cls_stem: nn.ModuleList, + cls_pred: nn.ModuleList, reg_stem: nn.ModuleList, + reg_pred: nn.ModuleList) -> Tensor: + """Forward feature of a single scale level.""" + b, _, h, w = x.shape + hw = h * w + avg_feat = F.adaptive_avg_pool2d(x, (1, 1)) + cls_logit = cls_pred(cls_stem(x, avg_feat) + x) + bbox_dist_preds = reg_pred(reg_stem(x, avg_feat)) + # TODO: Test whether use matmul instead of conv can speed up training. + bbox_dist_preds = bbox_dist_preds.reshape( + [-1, 4, self.reg_max + 1, hw]).permute(0, 2, 3, 1) + + bbox_preds = F.conv2d(F.softmax(bbox_dist_preds, dim=1), self.proj) + + if self.training: + return cls_logit, bbox_preds, bbox_dist_preds + else: + return cls_logit, bbox_preds + + +@MODELS.register_module() +class PPYOLOEHead(YOLOv6Head): + """PPYOLOEHead head used in `PPYOLOE `_. + The YOLOv6 head and the PPYOLOE head are only slightly different. + Distribution focal loss is extra used in PPYOLOE, but not in YOLOv6. + + Args: + head_module(ConfigType): Base module used for YOLOv5Head + prior_generator(dict): Points generator feature maps in + 2D points-based detectors. + bbox_coder (:obj:`ConfigDict` or dict): Config of bbox coder. + loss_cls (:obj:`ConfigDict` or dict): Config of classification loss. + loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss. + loss_dfl (:obj:`ConfigDict` or dict): Config of distribution focal + loss. + train_cfg (:obj:`ConfigDict` or dict, optional): Training config of + anchor head. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of + anchor head. Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. 
+ """ + + def __init__(self, + head_module: ConfigType, + prior_generator: ConfigType = dict( + type='mmdet.MlvlPointGenerator', + offset=0.5, + strides=[8, 16, 32]), + bbox_coder: ConfigType = dict(type='DistancePointBBoxCoder'), + loss_cls: ConfigType = dict( + type='mmdet.VarifocalLoss', + use_sigmoid=True, + alpha=0.75, + gamma=2.0, + iou_weighted=True, + reduction='sum', + loss_weight=1.0), + loss_bbox: ConfigType = dict( + type='IoULoss', + iou_mode='giou', + bbox_format='xyxy', + reduction='mean', + loss_weight=2.5, + return_iou=False), + loss_dfl: ConfigType = dict( + type='mmdet.DistributionFocalLoss', + reduction='mean', + loss_weight=0.5 / 4), + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + init_cfg: OptMultiConfig = None): + super().__init__( + head_module=head_module, + prior_generator=prior_generator, + bbox_coder=bbox_coder, + loss_cls=loss_cls, + loss_bbox=loss_bbox, + train_cfg=train_cfg, + test_cfg=test_cfg, + init_cfg=init_cfg) + self.loss_dfl = MODELS.build(loss_dfl) + # ppyoloe doesn't need loss_obj + self.loss_obj = None + + def loss_by_feat( + self, + cls_scores: Sequence[Tensor], + bbox_preds: Sequence[Tensor], + bbox_dist_preds: Sequence[Tensor], + batch_gt_instances: Sequence[InstanceData], + batch_img_metas: Sequence[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (Sequence[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_priors * num_classes. + bbox_preds (Sequence[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_priors * 4. + bbox_dist_preds (Sequence[Tensor]): Box distribution logits for + each scale level with shape (bs, reg_max + 1, H*W, 4). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + Returns: + dict[str, Tensor]: A dictionary of losses. 
+ """ + + # get epoch information from message hub + message_hub = MessageHub.get_current_instance() + current_epoch = message_hub.get_info('epoch') + + num_imgs = len(batch_img_metas) + + current_featmap_sizes = [ + cls_score.shape[2:] for cls_score in cls_scores + ] + # If the shape does not equal, generate new one + if current_featmap_sizes != self.featmap_sizes_train: + self.featmap_sizes_train = current_featmap_sizes + + mlvl_priors_with_stride = self.prior_generator.grid_priors( + self.featmap_sizes_train, + dtype=cls_scores[0].dtype, + device=cls_scores[0].device, + with_stride=True) + + self.num_level_priors = [len(n) for n in mlvl_priors_with_stride] + self.flatten_priors_train = torch.cat( + mlvl_priors_with_stride, dim=0) + self.stride_tensor = self.flatten_priors_train[..., [2]] + + # gt info + gt_info = gt_instances_preprocess(batch_gt_instances, num_imgs) + gt_labels = gt_info[:, :, :1] + gt_bboxes = gt_info[:, :, 1:] # xyxy + pad_bbox_flag = (gt_bboxes.sum(-1, keepdim=True) > 0).float() + + # pred info + flatten_cls_preds = [ + cls_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.num_classes) + for cls_pred in cls_scores + ] + flatten_pred_bboxes = [ + bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) + for bbox_pred in bbox_preds + ] + # (bs, reg_max+1, n, 4) -> (bs, n, 4, reg_max+1) + flatten_pred_dists = [ + bbox_pred_org.permute(0, 2, 3, 1).reshape( + num_imgs, -1, (self.head_module.reg_max + 1) * 4) + for bbox_pred_org in bbox_dist_preds + ] + + flatten_dist_preds = torch.cat(flatten_pred_dists, dim=1) + flatten_cls_preds = torch.cat(flatten_cls_preds, dim=1) + flatten_pred_bboxes = torch.cat(flatten_pred_bboxes, dim=1) + flatten_pred_bboxes = self.bbox_coder.decode( + self.flatten_priors_train[..., :2], flatten_pred_bboxes, + self.stride_tensor[..., 0]) + pred_scores = torch.sigmoid(flatten_cls_preds) + + if current_epoch < self.initial_epoch: + assigned_result = self.initial_assigner( + flatten_pred_bboxes.detach(), self.flatten_priors_train, + self.num_level_priors, gt_labels, gt_bboxes, pad_bbox_flag) + else: + assigned_result = self.assigner(flatten_pred_bboxes.detach(), + pred_scores.detach(), + self.flatten_priors_train, + gt_labels, gt_bboxes, + pad_bbox_flag) + + assigned_bboxes = assigned_result['assigned_bboxes'] + assigned_scores = assigned_result['assigned_scores'] + fg_mask_pre_prior = assigned_result['fg_mask_pre_prior'] + + # cls loss + with torch.cuda.amp.autocast(enabled=False): + loss_cls = self.loss_cls(flatten_cls_preds, assigned_scores) + + # rescale bbox + assigned_bboxes /= self.stride_tensor + flatten_pred_bboxes /= self.stride_tensor + + assigned_scores_sum = assigned_scores.sum() + # reduce_mean between all gpus + assigned_scores_sum = torch.clamp( + reduce_mean(assigned_scores_sum), min=1) + loss_cls /= assigned_scores_sum + + # select positive samples mask + num_pos = fg_mask_pre_prior.sum() + if num_pos > 0: + # when num_pos > 0, assigned_scores_sum will >0, so the loss_bbox + # will not report an error + # iou loss + prior_bbox_mask = fg_mask_pre_prior.unsqueeze(-1).repeat([1, 1, 4]) + pred_bboxes_pos = torch.masked_select( + flatten_pred_bboxes, prior_bbox_mask).reshape([-1, 4]) + assigned_bboxes_pos = torch.masked_select( + assigned_bboxes, prior_bbox_mask).reshape([-1, 4]) + bbox_weight = torch.masked_select( + assigned_scores.sum(-1), fg_mask_pre_prior).unsqueeze(-1) + loss_bbox = self.loss_bbox( + pred_bboxes_pos, + assigned_bboxes_pos, + weight=bbox_weight, + avg_factor=assigned_scores_sum) + + # dfl loss + dist_mask = 
fg_mask_pre_prior.unsqueeze(-1).repeat( + [1, 1, (self.head_module.reg_max + 1) * 4]) + + pred_dist_pos = torch.masked_select( + flatten_dist_preds, + dist_mask).reshape([-1, 4, self.head_module.reg_max + 1]) + assigned_ltrb = self.bbox_coder.encode( + self.flatten_priors_train[..., :2] / self.stride_tensor, + assigned_bboxes, + max_dis=self.head_module.reg_max, + eps=0.01) + assigned_ltrb_pos = torch.masked_select( + assigned_ltrb, prior_bbox_mask).reshape([-1, 4]) + loss_dfl = self.loss_dfl( + pred_dist_pos.reshape(-1, self.head_module.reg_max + 1), + assigned_ltrb_pos.reshape(-1), + weight=bbox_weight.expand(-1, 4).reshape(-1), + avg_factor=assigned_scores_sum) + else: + loss_bbox = flatten_pred_bboxes.sum() * 0 + loss_dfl = flatten_pred_bboxes.sum() * 0 + + return dict(loss_cls=loss_cls, loss_bbox=loss_bbox, loss_dfl=loss_dfl) diff --git a/mmyolo/models/dense_heads/rtmdet_head.py b/mmyolo/models/dense_heads/rtmdet_head.py new file mode 100644 index 0000000000000000000000000000000000000000..54245a97f404b66eba47e41f03302110c8894134 --- /dev/null +++ b/mmyolo/models/dense_heads/rtmdet_head.py @@ -0,0 +1,368 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Sequence, Tuple + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule, is_norm +from mmdet.models.task_modules.samplers import PseudoSampler +from mmdet.structures.bbox import distance2bbox +from mmdet.utils import (ConfigType, InstanceList, OptConfigType, + OptInstanceList, OptMultiConfig, reduce_mean) +from mmengine.model import (BaseModule, bias_init_with_prob, constant_init, + normal_init) +from torch import Tensor + +from mmyolo.registry import MODELS, TASK_UTILS +from ..utils import gt_instances_preprocess +from .yolov5_head import YOLOv5Head + + +@MODELS.register_module() +class RTMDetSepBNHeadModule(BaseModule): + """Detection Head of RTMDet. + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + num_base_priors (int): The number of priors (points) at a point + on the feature grid. Defaults to 1. + feat_channels (int): Number of hidden channels. Used in child classes. + Defaults to 256 + stacked_convs (int): Number of stacking convs of the head. + Defaults to 2. + featmap_strides (Sequence[int]): Downsample factor of each feature map. + Defaults to (8, 16, 32). + share_conv (bool): Whether to share conv layers between stages. + Defaults to True. + pred_kernel_size (int): Kernel size of ``nn.Conv2d``. Defaults to 1. + conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + convolution layer. Defaults to None. + norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization + layer. Defaults to ``dict(type='BN')``. + act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer. + Default: dict(type='SiLU', inplace=True). + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. 
+ """ + + def __init__( + self, + num_classes: int, + in_channels: int, + widen_factor: float = 1.0, + num_base_priors: int = 1, + feat_channels: int = 256, + stacked_convs: int = 2, + featmap_strides: Sequence[int] = [8, 16, 32], + share_conv: bool = True, + pred_kernel_size: int = 1, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict(type='BN'), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None, + ): + super().__init__(init_cfg=init_cfg) + self.share_conv = share_conv + self.num_classes = num_classes + self.pred_kernel_size = pred_kernel_size + self.feat_channels = int(feat_channels * widen_factor) + self.stacked_convs = stacked_convs + self.num_base_priors = num_base_priors + + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.featmap_strides = featmap_strides + + self.in_channels = int(in_channels * widen_factor) + + self._init_layers() + + def _init_layers(self): + """Initialize layers of the head.""" + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + + self.rtm_cls = nn.ModuleList() + self.rtm_reg = nn.ModuleList() + for n in range(len(self.featmap_strides)): + cls_convs = nn.ModuleList() + reg_convs = nn.ModuleList() + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + cls_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + reg_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + self.cls_convs.append(cls_convs) + self.reg_convs.append(reg_convs) + + self.rtm_cls.append( + nn.Conv2d( + self.feat_channels, + self.num_base_priors * self.num_classes, + self.pred_kernel_size, + padding=self.pred_kernel_size // 2)) + self.rtm_reg.append( + nn.Conv2d( + self.feat_channels, + self.num_base_priors * 4, + self.pred_kernel_size, + padding=self.pred_kernel_size // 2)) + + if self.share_conv: + for n in range(len(self.featmap_strides)): + for i in range(self.stacked_convs): + self.cls_convs[n][i].conv = self.cls_convs[0][i].conv + self.reg_convs[n][i].conv = self.reg_convs[0][i].conv + + def init_weights(self) -> None: + """Initialize weights of the head.""" + # Use prior in model initialization to improve stability + super().init_weights() + for m in self.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, mean=0, std=0.01) + if is_norm(m): + constant_init(m, 1) + bias_cls = bias_init_with_prob(0.01) + for rtm_cls, rtm_reg in zip(self.rtm_cls, self.rtm_reg): + normal_init(rtm_cls, std=0.01, bias=bias_cls) + normal_init(rtm_reg, std=0.01) + + def forward(self, feats: Tuple[Tensor, ...]) -> tuple: + """Forward features from the upstream network. + + Args: + feats (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: Usually a tuple of classification scores and bbox prediction + - cls_scores (list[Tensor]): Classification scores for all scale + levels, each is a 4D-tensor, the channels number is + num_base_priors * num_classes. + - bbox_preds (list[Tensor]): Box energies / deltas for all scale + levels, each is a 4D-tensor, the channels number is + num_base_priors * 4. 
+ """ + + cls_scores = [] + bbox_preds = [] + for idx, x in enumerate(feats): + cls_feat = x + reg_feat = x + + for cls_layer in self.cls_convs[idx]: + cls_feat = cls_layer(cls_feat) + cls_score = self.rtm_cls[idx](cls_feat) + + for reg_layer in self.reg_convs[idx]: + reg_feat = reg_layer(reg_feat) + + reg_dist = self.rtm_reg[idx](reg_feat) + cls_scores.append(cls_score) + bbox_preds.append(reg_dist) + return tuple(cls_scores), tuple(bbox_preds) + + +@MODELS.register_module() +class RTMDetHead(YOLOv5Head): + """RTMDet head. + + Args: + head_module(ConfigType): Base module used for RTMDetHead + prior_generator: Points generator feature maps in + 2D points-based detectors. + bbox_coder (:obj:`ConfigDict` or dict): Config of bbox coder. + loss_cls (:obj:`ConfigDict` or dict): Config of classification loss. + loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss. + train_cfg (:obj:`ConfigDict` or dict, optional): Training config of + anchor head. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of + anchor head. Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + head_module: ConfigType, + prior_generator: ConfigType = dict( + type='mmdet.MlvlPointGenerator', + offset=0, + strides=[8, 16, 32]), + bbox_coder: ConfigType = dict(type='DistancePointBBoxCoder'), + loss_cls: ConfigType = dict( + type='mmdet.QualityFocalLoss', + use_sigmoid=True, + beta=2.0, + loss_weight=1.0), + loss_bbox: ConfigType = dict( + type='mmdet.GIoULoss', loss_weight=2.0), + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + init_cfg: OptMultiConfig = None): + + super().__init__( + head_module=head_module, + prior_generator=prior_generator, + bbox_coder=bbox_coder, + loss_cls=loss_cls, + loss_bbox=loss_bbox, + train_cfg=train_cfg, + test_cfg=test_cfg, + init_cfg=init_cfg) + + self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False) + if self.use_sigmoid_cls: + self.cls_out_channels = self.num_classes + else: + self.cls_out_channels = self.num_classes + 1 + # rtmdet doesn't need loss_obj + self.loss_obj = None + + def special_init(self): + """Since YOLO series algorithms will inherit from YOLOv5Head, but + different algorithms have special initialization process. + + The special_init function is designed to deal with this situation. + """ + if self.train_cfg: + self.assigner = TASK_UTILS.build(self.train_cfg.assigner) + if self.train_cfg.get('sampler', None) is not None: + self.sampler = TASK_UTILS.build( + self.train_cfg.sampler, default_args=dict(context=self)) + else: + self.sampler = PseudoSampler(context=self) + + self.featmap_sizes_train = None + self.flatten_priors_train = None + + def forward(self, x: Tuple[Tensor]) -> Tuple[List]: + """Forward features from the upstream network. + + Args: + x (Tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + Returns: + Tuple[List]: A tuple of multi-level classification scores, bbox + predictions, and objectnesses. + """ + return self.head_module(x) + + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Compute losses of the head. 
+ + Args: + cls_scores (list[Tensor]): Box scores for each scale level + Has shape (N, num_anchors * num_classes, H, W) + bbox_preds (list[Tensor]): Decoded box for each scale + level with shape (N, num_anchors * 4, H, W) in + [tl_x, tl_y, br_x, br_y] format. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + num_imgs = len(batch_img_metas) + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.prior_generator.num_levels + + gt_info = gt_instances_preprocess(batch_gt_instances, num_imgs) + gt_labels = gt_info[:, :, :1] + gt_bboxes = gt_info[:, :, 1:] # xyxy + pad_bbox_flag = (gt_bboxes.sum(-1, keepdim=True) > 0).float() + + device = cls_scores[0].device + + # If the shape does not equal, generate new one + if featmap_sizes != self.featmap_sizes_train: + self.featmap_sizes_train = featmap_sizes + mlvl_priors_with_stride = self.prior_generator.grid_priors( + featmap_sizes, device=device, with_stride=True) + self.flatten_priors_train = torch.cat( + mlvl_priors_with_stride, dim=0) + + flatten_cls_scores = torch.cat([ + cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.cls_out_channels) + for cls_score in cls_scores + ], 1).contiguous() + + flatten_bboxes = torch.cat([ + bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) + for bbox_pred in bbox_preds + ], 1) + flatten_bboxes = flatten_bboxes * self.flatten_priors_train[..., -1, + None] + flatten_bboxes = distance2bbox(self.flatten_priors_train[..., :2], + flatten_bboxes) + + assigned_result = self.assigner(flatten_bboxes.detach(), + flatten_cls_scores.detach(), + self.flatten_priors_train, gt_labels, + gt_bboxes, pad_bbox_flag) + + labels = assigned_result['assigned_labels'].reshape(-1) + label_weights = assigned_result['assigned_labels_weights'].reshape(-1) + bbox_targets = assigned_result['assigned_bboxes'].reshape(-1, 4) + assign_metrics = assigned_result['assign_metrics'].reshape(-1) + cls_preds = flatten_cls_scores.reshape(-1, self.num_classes) + bbox_preds = flatten_bboxes.reshape(-1, 4) + + # FG cat_id: [0, num_classes -1], BG cat_id: num_classes + bg_class_ind = self.num_classes + pos_inds = ((labels >= 0) + & (labels < bg_class_ind)).nonzero().squeeze(1) + avg_factor = reduce_mean(assign_metrics.sum()).clamp_(min=1).item() + + loss_cls = self.loss_cls( + cls_preds, (labels, assign_metrics), + label_weights, + avg_factor=avg_factor) + + if len(pos_inds) > 0: + loss_bbox = self.loss_bbox( + bbox_preds[pos_inds], + bbox_targets[pos_inds], + weight=assign_metrics[pos_inds], + avg_factor=avg_factor) + else: + loss_bbox = bbox_preds.sum() * 0 + + return dict(loss_cls=loss_cls, loss_bbox=loss_bbox) diff --git a/mmyolo/models/dense_heads/rtmdet_ins_head.py b/mmyolo/models/dense_heads/rtmdet_ins_head.py new file mode 100644 index 0000000000000000000000000000000000000000..1d0562aad6fb977516924ef9cd72cdef54ff0016 --- /dev/null +++ b/mmyolo/models/dense_heads/rtmdet_ins_head.py @@ -0,0 +1,725 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
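# --- Editor's sketch (not part of the patch): the decode step used in
# RTMDetHead.loss_by_feat above. The head predicts (l, t, r, b) distances in
# stride units; scaling by the per-prior stride and applying distance2bbox
# yields xyxy boxes in input-image coordinates. The numbers below are toy
# values chosen only to show the arithmetic.
import torch
from mmdet.structures.bbox import distance2bbox

point = torch.tensor([[16.0, 16.0]])              # prior centre on a stride-8 level
pred_dist = torch.tensor([[1.0, 1.0, 2.0, 2.0]])  # predicted l, t, r, b
stride = 8.0
box = distance2bbox(point, pred_dist * stride)
print(box)                                        # tensor([[ 8.,  8., 32., 32.]])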
+import copy +from typing import List, Optional, Tuple + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule, is_norm +from mmcv.ops import batched_nms +from mmdet.models.utils import filter_scores_and_topk +from mmdet.structures.bbox import get_box_tensor, get_box_wh, scale_boxes +from mmdet.utils import (ConfigType, InstanceList, OptConfigType, + OptInstanceList, OptMultiConfig) +from mmengine import ConfigDict +from mmengine.model import (BaseModule, bias_init_with_prob, constant_init, + normal_init) +from mmengine.structures import InstanceData +from torch import Tensor + +from mmyolo.registry import MODELS +from .rtmdet_head import RTMDetHead, RTMDetSepBNHeadModule + + +class MaskFeatModule(BaseModule): + """Mask feature head used in RTMDet-Ins. Copy from mmdet. + + Args: + in_channels (int): Number of channels in the input feature map. + feat_channels (int): Number of hidden channels of the mask feature + map branch. + stacked_convs (int): Number of convs in mask feature branch. + num_levels (int): The starting feature map level from RPN that + will be used to predict the mask feature map. + num_prototypes (int): Number of output channel of the mask feature + map branch. This is the channel count of the mask + feature map that to be dynamically convolved with the predicted + kernel. + act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer. + Default: dict(type='ReLU', inplace=True) + norm_cfg (dict): Config dict for normalization layer. Default: None. + """ + + def __init__( + self, + in_channels: int, + feat_channels: int = 256, + stacked_convs: int = 4, + num_levels: int = 3, + num_prototypes: int = 8, + act_cfg: ConfigType = dict(type='ReLU', inplace=True), + norm_cfg: ConfigType = dict(type='BN') + ) -> None: + super().__init__(init_cfg=None) + self.num_levels = num_levels + self.fusion_conv = nn.Conv2d(num_levels * in_channels, in_channels, 1) + convs = [] + for i in range(stacked_convs): + in_c = in_channels if i == 0 else feat_channels + convs.append( + ConvModule( + in_c, + feat_channels, + 3, + padding=1, + act_cfg=act_cfg, + norm_cfg=norm_cfg)) + self.stacked_convs = nn.Sequential(*convs) + self.projection = nn.Conv2d( + feat_channels, num_prototypes, kernel_size=1) + + def forward(self, features: Tuple[Tensor, ...]) -> Tensor: + # multi-level feature fusion + fusion_feats = [features[0]] + size = features[0].shape[-2:] + for i in range(1, self.num_levels): + f = F.interpolate(features[i], size=size, mode='bilinear') + fusion_feats.append(f) + fusion_feats = torch.cat(fusion_feats, dim=1) + fusion_feats = self.fusion_conv(fusion_feats) + # pred mask feats + mask_features = self.stacked_convs(fusion_feats) + mask_features = self.projection(mask_features) + return mask_features + + +@MODELS.register_module() +class RTMDetInsSepBNHeadModule(RTMDetSepBNHeadModule): + """Detection and Instance Segmentation Head of RTMDet. + + Args: + num_classes (int): Number of categories excluding the background + category. + num_prototypes (int): Number of mask prototype features extracted + from the mask head. Defaults to 8. + dyconv_channels (int): Channel of the dynamic conv layers. + Defaults to 8. + num_dyconvs (int): Number of the dynamic convolution layers. + Defaults to 3. + use_sigmoid_cls (bool): Use sigmoid for class prediction. + Defaults to True. 
+ """ + + def __init__(self, + num_classes: int, + *args, + num_prototypes: int = 8, + dyconv_channels: int = 8, + num_dyconvs: int = 3, + use_sigmoid_cls: bool = True, + **kwargs): + self.num_prototypes = num_prototypes + self.num_dyconvs = num_dyconvs + self.dyconv_channels = dyconv_channels + self.use_sigmoid_cls = use_sigmoid_cls + if self.use_sigmoid_cls: + self.cls_out_channels = num_classes + else: + self.cls_out_channels = num_classes + 1 + super().__init__(num_classes=num_classes, *args, **kwargs) + + def _init_layers(self): + """Initialize layers of the head.""" + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + self.kernel_convs = nn.ModuleList() + + self.rtm_cls = nn.ModuleList() + self.rtm_reg = nn.ModuleList() + self.rtm_kernel = nn.ModuleList() + self.rtm_obj = nn.ModuleList() + + # calculate num dynamic parameters + weight_nums, bias_nums = [], [] + for i in range(self.num_dyconvs): + if i == 0: + weight_nums.append( + (self.num_prototypes + 2) * self.dyconv_channels) + bias_nums.append(self.dyconv_channels) + elif i == self.num_dyconvs - 1: + weight_nums.append(self.dyconv_channels) + bias_nums.append(1) + else: + weight_nums.append(self.dyconv_channels * self.dyconv_channels) + bias_nums.append(self.dyconv_channels) + self.weight_nums = weight_nums + self.bias_nums = bias_nums + self.num_gen_params = sum(weight_nums) + sum(bias_nums) + pred_pad_size = self.pred_kernel_size // 2 + + for n in range(len(self.featmap_strides)): + cls_convs = nn.ModuleList() + reg_convs = nn.ModuleList() + kernel_convs = nn.ModuleList() + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + cls_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + reg_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + kernel_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + self.cls_convs.append(cls_convs) + self.reg_convs.append(cls_convs) + self.kernel_convs.append(kernel_convs) + + self.rtm_cls.append( + nn.Conv2d( + self.feat_channels, + self.num_base_priors * self.cls_out_channels, + self.pred_kernel_size, + padding=pred_pad_size)) + self.rtm_reg.append( + nn.Conv2d( + self.feat_channels, + self.num_base_priors * 4, + self.pred_kernel_size, + padding=pred_pad_size)) + self.rtm_kernel.append( + nn.Conv2d( + self.feat_channels, + self.num_gen_params, + self.pred_kernel_size, + padding=pred_pad_size)) + + if self.share_conv: + for n in range(len(self.featmap_strides)): + for i in range(self.stacked_convs): + self.cls_convs[n][i].conv = self.cls_convs[0][i].conv + self.reg_convs[n][i].conv = self.reg_convs[0][i].conv + + self.mask_head = MaskFeatModule( + in_channels=self.in_channels, + feat_channels=self.feat_channels, + stacked_convs=4, + num_levels=len(self.featmap_strides), + num_prototypes=self.num_prototypes, + act_cfg=self.act_cfg, + norm_cfg=self.norm_cfg) + + def init_weights(self) -> None: + """Initialize weights of the head.""" + for m in self.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, mean=0, std=0.01) + if is_norm(m): + constant_init(m, 1) + bias_cls = bias_init_with_prob(0.01) + for rtm_cls, rtm_reg, rtm_kernel in zip(self.rtm_cls, self.rtm_reg, + self.rtm_kernel): + 
normal_init(rtm_cls, std=0.01, bias=bias_cls) + normal_init(rtm_reg, std=0.01, bias=1) + + def forward(self, feats: Tuple[Tensor, ...]) -> tuple: + """Forward features from the upstream network. + + Args: + feats (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: Usually a tuple of classification scores and bbox prediction + - cls_scores (list[Tensor]): Classification scores for all scale + levels, each is a 4D-tensor, the channels number is + num_base_priors * num_classes. + - bbox_preds (list[Tensor]): Box energies / deltas for all scale + levels, each is a 4D-tensor, the channels number is + num_base_priors * 4. + - kernel_preds (list[Tensor]): Dynamic conv kernels for all scale + levels, each is a 4D-tensor, the channels number is + num_gen_params. + - mask_feat (Tensor): Mask prototype features. + Has shape (batch_size, num_prototypes, H, W). + """ + mask_feat = self.mask_head(feats) + + cls_scores = [] + bbox_preds = [] + kernel_preds = [] + for idx, (x, stride) in enumerate(zip(feats, self.featmap_strides)): + cls_feat = x + reg_feat = x + kernel_feat = x + + for cls_layer in self.cls_convs[idx]: + cls_feat = cls_layer(cls_feat) + cls_score = self.rtm_cls[idx](cls_feat) + + for kernel_layer in self.kernel_convs[idx]: + kernel_feat = kernel_layer(kernel_feat) + kernel_pred = self.rtm_kernel[idx](kernel_feat) + + for reg_layer in self.reg_convs[idx]: + reg_feat = reg_layer(reg_feat) + reg_dist = self.rtm_reg[idx](reg_feat) + + cls_scores.append(cls_score) + bbox_preds.append(reg_dist) + kernel_preds.append(kernel_pred) + return tuple(cls_scores), tuple(bbox_preds), tuple( + kernel_preds), mask_feat + + +@MODELS.register_module() +class RTMDetInsSepBNHead(RTMDetHead): + """RTMDet Instance Segmentation head. + + Args: + head_module(ConfigType): Base module used for RTMDetInsSepBNHead + prior_generator: Points generator feature maps in + 2D points-based detectors. + bbox_coder (:obj:`ConfigDict` or dict): Config of bbox coder. + loss_cls (:obj:`ConfigDict` or dict): Config of classification loss. + loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss. + loss_mask (:obj:`ConfigDict` or dict): Config of mask loss. + train_cfg (:obj:`ConfigDict` or dict, optional): Training config of + anchor head. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of + anchor head. Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. 
+ """ + + def __init__(self, + head_module: ConfigType, + prior_generator: ConfigType = dict( + type='mmdet.MlvlPointGenerator', + offset=0, + strides=[8, 16, 32]), + bbox_coder: ConfigType = dict(type='DistancePointBBoxCoder'), + loss_cls: ConfigType = dict( + type='mmdet.QualityFocalLoss', + use_sigmoid=True, + beta=2.0, + loss_weight=1.0), + loss_bbox: ConfigType = dict( + type='mmdet.GIoULoss', loss_weight=2.0), + loss_mask=dict( + type='mmdet.DiceLoss', + loss_weight=2.0, + eps=5e-6, + reduction='mean'), + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + init_cfg: OptMultiConfig = None): + + super().__init__( + head_module=head_module, + prior_generator=prior_generator, + bbox_coder=bbox_coder, + loss_cls=loss_cls, + loss_bbox=loss_bbox, + train_cfg=train_cfg, + test_cfg=test_cfg, + init_cfg=init_cfg) + + self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False) + if isinstance(self.head_module, RTMDetInsSepBNHeadModule): + assert self.use_sigmoid_cls == self.head_module.use_sigmoid_cls + self.loss_mask = MODELS.build(loss_mask) + + def predict_by_feat(self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + kernel_preds: List[Tensor], + mask_feats: Tensor, + score_factors: Optional[List[Tensor]] = None, + batch_img_metas: Optional[List[dict]] = None, + cfg: Optional[ConfigDict] = None, + rescale: bool = True, + with_nms: bool = True) -> List[InstanceData]: + """Transform a batch of output features extracted from the head into + bbox results. + + Note: When score_factors is not None, the cls_scores are + usually multiplied by it then obtain the real score used in NMS. + + Args: + cls_scores (list[Tensor]): Classification scores for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * 4, H, W). + kernel_preds (list[Tensor]): Kernel predictions of dynamic + convs for all scale levels, each is a 4D-tensor, has shape + (batch_size, num_params, H, W). + mask_feats (Tensor): Mask prototype features extracted from the + mask head, has shape (batch_size, num_prototypes, H, W). + score_factors (list[Tensor], optional): Score factor for + all scale level, each is a 4D-tensor, has shape + (batch_size, num_priors * 1, H, W). Defaults to None. + batch_img_metas (list[dict], Optional): Batch image meta info. + Defaults to None. + cfg (ConfigDict, optional): Test / postprocessing + configuration, if None, test_cfg would be used. + Defaults to None. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + with_nms (bool): If True, do nms before return boxes. + Defaults to True. + + Returns: + list[:obj:`InstanceData`]: Object detection and instance + segmentation results of each image after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + - masks (Tensor): Has a shape (num_instances, h, w). 
+ """ + cfg = self.test_cfg if cfg is None else cfg + cfg = copy.deepcopy(cfg) + + multi_label = cfg.multi_label + multi_label &= self.num_classes > 1 + cfg.multi_label = multi_label + + num_imgs = len(batch_img_metas) + featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores] + + # If the shape does not change, use the previous mlvl_priors + if featmap_sizes != self.featmap_sizes: + self.mlvl_priors = self.prior_generator.grid_priors( + featmap_sizes, + dtype=cls_scores[0].dtype, + device=cls_scores[0].device, + with_stride=True) + self.featmap_sizes = featmap_sizes + flatten_priors = torch.cat(self.mlvl_priors) + + mlvl_strides = [ + flatten_priors.new_full( + (featmap_size.numel() * self.num_base_priors, ), stride) for + featmap_size, stride in zip(featmap_sizes, self.featmap_strides) + ] + flatten_stride = torch.cat(mlvl_strides) + + # flatten cls_scores, bbox_preds + flatten_cls_scores = [ + cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.num_classes) + for cls_score in cls_scores + ] + flatten_bbox_preds = [ + bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) + for bbox_pred in bbox_preds + ] + flatten_kernel_preds = [ + kernel_pred.permute(0, 2, 3, + 1).reshape(num_imgs, -1, + self.head_module.num_gen_params) + for kernel_pred in kernel_preds + ] + + flatten_cls_scores = torch.cat(flatten_cls_scores, dim=1).sigmoid() + flatten_bbox_preds = torch.cat(flatten_bbox_preds, dim=1) + flatten_decoded_bboxes = self.bbox_coder.decode( + flatten_priors[..., :2].unsqueeze(0), flatten_bbox_preds, + flatten_stride) + + flatten_kernel_preds = torch.cat(flatten_kernel_preds, dim=1) + + results_list = [] + for (bboxes, scores, kernel_pred, mask_feat, + img_meta) in zip(flatten_decoded_bboxes, flatten_cls_scores, + flatten_kernel_preds, mask_feats, + batch_img_metas): + ori_shape = img_meta['ori_shape'] + scale_factor = img_meta['scale_factor'] + if 'pad_param' in img_meta: + pad_param = img_meta['pad_param'] + else: + pad_param = None + + score_thr = cfg.get('score_thr', -1) + if scores.shape[0] == 0: + empty_results = InstanceData() + empty_results.bboxes = bboxes + empty_results.scores = scores[:, 0] + empty_results.labels = scores[:, 0].int() + h, w = ori_shape[:2] if rescale else img_meta['img_shape'][:2] + empty_results.masks = torch.zeros( + size=(0, h, w), dtype=torch.bool, device=bboxes.device) + results_list.append(empty_results) + continue + + nms_pre = cfg.get('nms_pre', 100000) + if cfg.multi_label is False: + scores, labels = scores.max(1, keepdim=True) + scores, _, keep_idxs, results = filter_scores_and_topk( + scores, + score_thr, + nms_pre, + results=dict( + labels=labels[:, 0], + kernel_pred=kernel_pred, + priors=flatten_priors)) + labels = results['labels'] + kernel_pred = results['kernel_pred'] + priors = results['priors'] + else: + out = filter_scores_and_topk( + scores, + score_thr, + nms_pre, + results=dict( + kernel_pred=kernel_pred, priors=flatten_priors)) + scores, labels, keep_idxs, filtered_results = out + kernel_pred = filtered_results['kernel_pred'] + priors = filtered_results['priors'] + + results = InstanceData( + scores=scores, + labels=labels, + bboxes=bboxes[keep_idxs], + kernels=kernel_pred, + priors=priors) + + if rescale: + if pad_param is not None: + results.bboxes -= results.bboxes.new_tensor([ + pad_param[2], pad_param[0], pad_param[2], pad_param[0] + ]) + results.bboxes /= results.bboxes.new_tensor( + scale_factor).repeat((1, 2)) + + if cfg.get('yolox_style', False): + # do not need max_per_img + cfg.max_per_img = len(results) + + 
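+            # `_bbox_mask_post_process` below runs NMS on the kept boxes and
+            # then builds per-instance masks from the dynamic-conv kernels;
+            # the surviving boxes are clamped to the original image afterwards.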
results = self._bbox_mask_post_process(
+                results=results,
+                mask_feat=mask_feat,
+                cfg=cfg,
+                rescale_bbox=False,
+                rescale_mask=rescale,
+                with_nms=with_nms,
+                pad_param=pad_param,
+                img_meta=img_meta)
+            results.bboxes[:, 0::2].clamp_(0, ori_shape[1])
+            results.bboxes[:, 1::2].clamp_(0, ori_shape[0])
+
+            results_list.append(results)
+        return results_list
+
+    def _bbox_mask_post_process(
+            self,
+            results: InstanceData,
+            mask_feat: Tensor,
+            cfg: ConfigDict,
+            rescale_bbox: bool = False,
+            rescale_mask: bool = True,
+            with_nms: bool = True,
+            pad_param: Optional[np.ndarray] = None,
+            img_meta: Optional[dict] = None) -> InstanceData:
+        """Bbox and mask post-processing method.
+
+        The boxes are rescaled to the original image scale and NMS is
+        applied. Usually ``with_nms`` is False when used for aug test.
+
+        Args:
+            results (:obj:`InstanceData`): Detection instance results,
+                each item has shape (num_bboxes, ).
+            mask_feat (Tensor): Mask prototype features extracted from the
+                mask head, has shape (batch_size, num_prototypes, H, W).
+            cfg (ConfigDict): Test / postprocessing configuration,
+                if None, test_cfg would be used.
+            rescale_bbox (bool): If True, return boxes in original image space.
+                Defaults to False.
+            rescale_mask (bool): If True, return masks in original image space.
+                Defaults to True.
+            with_nms (bool): If True, do nms before return boxes.
+                Defaults to True.
+            pad_param (np.ndarray, optional): Padding parameters arranged as
+                (top, bottom, left, right). Defaults to None.
+            img_meta (dict, optional): Image meta info. Defaults to None.
+
+        Returns:
+            :obj:`InstanceData`: Detection results of each image
+            after the post process.
+            Each item usually contains following keys.
+
+            - scores (Tensor): Classification scores, has a shape
+              (num_instance, )
+            - labels (Tensor): Labels of bboxes, has a shape
+              (num_instances, ).
+            - bboxes (Tensor): Has a shape (num_instances, 4),
+              the last dimension 4 arranged as (x1, y1, x2, y2).
+            - masks (Tensor): Has a shape (num_instances, h, w).
+        """
+        if rescale_bbox:
+            assert img_meta.get('scale_factor') is not None
+            scale_factor = [1 / s for s in img_meta['scale_factor']]
+            results.bboxes = scale_boxes(results.bboxes, scale_factor)
+
+        if hasattr(results, 'score_factors'):
+            # TODO: Add sqrt operation in order to be consistent with
+            # the paper.
+ score_factors = results.pop('score_factors') + results.scores = results.scores * score_factors + + # filter small size bboxes + if cfg.get('min_bbox_size', -1) >= 0: + w, h = get_box_wh(results.bboxes) + valid_mask = (w > cfg.min_bbox_size) & (h > cfg.min_bbox_size) + if not valid_mask.all(): + results = results[valid_mask] + + # TODO: deal with `with_nms` and `nms_cfg=None` in test_cfg + assert with_nms, 'with_nms must be True for RTMDet-Ins' + if results.bboxes.numel() > 0: + bboxes = get_box_tensor(results.bboxes) + det_bboxes, keep_idxs = batched_nms(bboxes, results.scores, + results.labels, cfg.nms) + results = results[keep_idxs] + # some nms would reweight the score, such as softnms + results.scores = det_bboxes[:, -1] + results = results[:cfg.max_per_img] + + # process masks + mask_logits = self._mask_predict_by_feat(mask_feat, + results.kernels, + results.priors) + + stride = self.prior_generator.strides[0][0] + mask_logits = F.interpolate( + mask_logits.unsqueeze(0), scale_factor=stride, mode='bilinear') + if rescale_mask: + # TODO: When use mmdet.Resize or mmdet.Pad, will meet bug + # Use img_meta to crop and resize + ori_h, ori_w = img_meta['ori_shape'][:2] + if isinstance(pad_param, np.ndarray): + pad_param = pad_param.astype(np.int32) + crop_y1, crop_y2 = pad_param[ + 0], mask_logits.shape[-2] - pad_param[1] + crop_x1, crop_x2 = pad_param[ + 2], mask_logits.shape[-1] - pad_param[3] + mask_logits = mask_logits[..., crop_y1:crop_y2, + crop_x1:crop_x2] + mask_logits = F.interpolate( + mask_logits, + size=[ori_h, ori_w], + mode='bilinear', + align_corners=False) + + masks = mask_logits.sigmoid().squeeze(0) + masks = masks > cfg.mask_thr_binary + results.masks = masks + else: + h, w = img_meta['ori_shape'][:2] if rescale_mask else img_meta[ + 'img_shape'][:2] + results.masks = torch.zeros( + size=(results.bboxes.shape[0], h, w), + dtype=torch.bool, + device=results.bboxes.device) + return results + + def _mask_predict_by_feat(self, mask_feat: Tensor, kernels: Tensor, + priors: Tensor) -> Tensor: + """Generate mask logits from mask features with dynamic convs. + + Args: + mask_feat (Tensor): Mask prototype features. + Has shape (num_prototypes, H, W). + kernels (Tensor): Kernel parameters for each instance. + Has shape (num_instance, num_params) + priors (Tensor): Center priors for each instance. + Has shape (num_instance, 4). + Returns: + Tensor: Instance segmentation masks for each instance. + Has shape (num_instance, H, W). 
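+
+        Shape walk-through with the default module settings
+        (``num_prototypes=8``, ``dyconv_channels=8``, ``num_dyconvs=3``)::
+
+            relative_coord : (num_inst,  2, H, W)
+            prototypes     : (num_inst,  8, H, W)   # mask_feat repeated
+            concat input   : (num_inst, 10, H, W)
+            dyconv 1       : 10 -> 8 channels (80 weights + 8 biases)
+            dyconv 2       :  8 -> 8 channels (64 weights + 8 biases)
+            dyconv 3       :  8 -> 1 channel  ( 8 weights + 1 bias)
+
+        i.e. 169 dynamic parameters per instance, which is exactly
+        ``self.head_module.num_gen_params``.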
+ """ + num_inst = kernels.shape[0] + h, w = mask_feat.size()[-2:] + if num_inst < 1: + return torch.empty( + size=(num_inst, h, w), + dtype=mask_feat.dtype, + device=mask_feat.device) + if len(mask_feat.shape) < 4: + mask_feat.unsqueeze(0) + + coord = self.prior_generator.single_level_grid_priors( + (h, w), level_idx=0, device=mask_feat.device).reshape(1, -1, 2) + num_inst = priors.shape[0] + points = priors[:, :2].reshape(-1, 1, 2) + strides = priors[:, 2:].reshape(-1, 1, 2) + relative_coord = (points - coord).permute(0, 2, 1) / ( + strides[..., 0].reshape(-1, 1, 1) * 8) + relative_coord = relative_coord.reshape(num_inst, 2, h, w) + + mask_feat = torch.cat( + [relative_coord, + mask_feat.repeat(num_inst, 1, 1, 1)], dim=1) + weights, biases = self.parse_dynamic_params(kernels) + + n_layers = len(weights) + x = mask_feat.reshape(1, -1, h, w) + for i, (weight, bias) in enumerate(zip(weights, biases)): + x = F.conv2d( + x, weight, bias=bias, stride=1, padding=0, groups=num_inst) + if i < n_layers - 1: + x = F.relu(x) + x = x.reshape(num_inst, h, w) + return x + + def parse_dynamic_params(self, flatten_kernels: Tensor) -> tuple: + """split kernel head prediction to conv weight and bias.""" + n_inst = flatten_kernels.size(0) + n_layers = len(self.head_module.weight_nums) + params_splits = list( + torch.split_with_sizes( + flatten_kernels, + self.head_module.weight_nums + self.head_module.bias_nums, + dim=1)) + weight_splits = params_splits[:n_layers] + bias_splits = params_splits[n_layers:] + for i in range(n_layers): + if i < n_layers - 1: + weight_splits[i] = weight_splits[i].reshape( + n_inst * self.head_module.dyconv_channels, -1, 1, 1) + bias_splits[i] = bias_splits[i].reshape( + n_inst * self.head_module.dyconv_channels) + else: + weight_splits[i] = weight_splits[i].reshape(n_inst, -1, 1, 1) + bias_splits[i] = bias_splits[i].reshape(n_inst) + + return weight_splits, bias_splits + + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + raise NotImplementedError diff --git a/mmyolo/models/dense_heads/rtmdet_rotated_head.py b/mmyolo/models/dense_heads/rtmdet_rotated_head.py new file mode 100644 index 0000000000000000000000000000000000000000..1428b4fd05065e3dba764313febc46d6125408ac --- /dev/null +++ b/mmyolo/models/dense_heads/rtmdet_rotated_head.py @@ -0,0 +1,641 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import copy +import warnings +from typing import List, Optional, Sequence, Tuple + +import torch +import torch.nn as nn +from mmdet.models.utils import filter_scores_and_topk +from mmdet.structures.bbox import HorizontalBoxes, distance2bbox +from mmdet.structures.bbox.transforms import bbox_cxcywh_to_xyxy, scale_boxes +from mmdet.utils import (ConfigType, InstanceList, OptConfigType, + OptInstanceList, OptMultiConfig, reduce_mean) +from mmengine.config import ConfigDict +from mmengine.model import normal_init +from mmengine.structures import InstanceData +from torch import Tensor + +from mmyolo.registry import MODELS, TASK_UTILS +from ..utils import gt_instances_preprocess +from .rtmdet_head import RTMDetHead, RTMDetSepBNHeadModule + +try: + from mmrotate.structures.bbox import RotatedBoxes, distance2obb + MMROTATE_AVAILABLE = True +except ImportError: + RotatedBoxes = None + distance2obb = None + MMROTATE_AVAILABLE = False + + +@MODELS.register_module() +class RTMDetRotatedSepBNHeadModule(RTMDetSepBNHeadModule): + """Detection Head Module of RTMDet-R. + + Compared with RTMDet Detection Head Module, RTMDet-R adds + a conv for angle prediction. + An `angle_out_dim` arg is added, which is generated by the + angle_coder module and controls the angle pred dim. + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + num_base_priors (int): The number of priors (points) at a point + on the feature grid. Defaults to 1. + feat_channels (int): Number of hidden channels. Used in child classes. + Defaults to 256 + stacked_convs (int): Number of stacking convs of the head. + Defaults to 2. + featmap_strides (Sequence[int]): Downsample factor of each feature map. + Defaults to (8, 16, 32). + share_conv (bool): Whether to share conv layers between stages. + Defaults to True. + pred_kernel_size (int): Kernel size of ``nn.Conv2d``. Defaults to 1. + angle_out_dim (int): Encoded length of angle, will passed by head. + Defaults to 1. + conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + convolution layer. Defaults to None. + norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization + layer. Defaults to ``dict(type='BN')``. + act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer. + Default: dict(type='SiLU', inplace=True). + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. 
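+
+    With the defaults (``num_base_priors=1``, ``angle_out_dim=1``), each
+    ``rtm_ang`` conv produces a ``(B, 1, H_i, W_i)`` angle map per level,
+    alongside the ``(B, num_classes, H_i, W_i)`` scores and the
+    ``(B, 4, H_i, W_i)`` tblr distances. When built through
+    ``RTMDetRotatedHead``, ``angle_out_dim`` is overwritten by
+    ``angle_coder.encode_size`` and therefore does not need to be set
+    manually.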
+ """ + + def __init__( + self, + num_classes: int, + in_channels: int, + widen_factor: float = 1.0, + num_base_priors: int = 1, + feat_channels: int = 256, + stacked_convs: int = 2, + featmap_strides: Sequence[int] = [8, 16, 32], + share_conv: bool = True, + pred_kernel_size: int = 1, + angle_out_dim: int = 1, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict(type='BN'), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None, + ): + self.angle_out_dim = angle_out_dim + super().__init__( + num_classes=num_classes, + in_channels=in_channels, + widen_factor=widen_factor, + num_base_priors=num_base_priors, + feat_channels=feat_channels, + stacked_convs=stacked_convs, + featmap_strides=featmap_strides, + share_conv=share_conv, + pred_kernel_size=pred_kernel_size, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + init_cfg=init_cfg) + + def _init_layers(self): + """Initialize layers of the head.""" + super()._init_layers() + self.rtm_ang = nn.ModuleList() + for _ in range(len(self.featmap_strides)): + self.rtm_ang.append( + nn.Conv2d( + self.feat_channels, + self.num_base_priors * self.angle_out_dim, + self.pred_kernel_size, + padding=self.pred_kernel_size // 2)) + + def init_weights(self) -> None: + """Initialize weights of the head.""" + # Use prior in model initialization to improve stability + super().init_weights() + for rtm_ang in self.rtm_ang: + normal_init(rtm_ang, std=0.01) + + def forward(self, feats: Tuple[Tensor, ...]) -> tuple: + """Forward features from the upstream network. + + Args: + feats (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: Usually a tuple of classification scores and bbox prediction + - cls_scores (list[Tensor]): Classification scores for all scale + levels, each is a 4D-tensor, the channels number is + num_base_priors * num_classes. + - bbox_preds (list[Tensor]): Box energies / deltas for all scale + levels, each is a 4D-tensor, the channels number is + num_base_priors * 4. + - angle_preds (list[Tensor]): Angle prediction for all scale + levels, each is a 4D-tensor, the channels number is + num_base_priors * angle_out_dim. + """ + + cls_scores = [] + bbox_preds = [] + angle_preds = [] + for idx, x in enumerate(feats): + cls_feat = x + reg_feat = x + + for cls_layer in self.cls_convs[idx]: + cls_feat = cls_layer(cls_feat) + cls_score = self.rtm_cls[idx](cls_feat) + + for reg_layer in self.reg_convs[idx]: + reg_feat = reg_layer(reg_feat) + + reg_dist = self.rtm_reg[idx](reg_feat) + angle_pred = self.rtm_ang[idx](reg_feat) + + cls_scores.append(cls_score) + bbox_preds.append(reg_dist) + angle_preds.append(angle_pred) + return tuple(cls_scores), tuple(bbox_preds), tuple(angle_preds) + + +@MODELS.register_module() +class RTMDetRotatedHead(RTMDetHead): + """RTMDet-R head. + + Compared with RTMDetHead, RTMDetRotatedHead add some args to support + rotated object detection. + + - `angle_version` used to limit angle_range during training. + - `angle_coder` used to encode and decode angle, which is similar + to bbox_coder. + - `use_hbbox_loss` and `loss_angle` allow custom regression loss + calculation for rotated box. + + There are three combination options for regression: + + 1. `use_hbbox_loss=False` and loss_angle is None. + + .. code:: text + + bbox_pred────(tblr)───┐ + ▼ + angle_pred decode──►rbox_pred──(xywha)─►loss_bbox + │ ▲ + └────►decode──(a)─┘ + + 2. `use_hbbox_loss=False` and loss_angle is specified. + A angle loss is added on angle_pred. + + .. 
code:: text + + bbox_pred────(tblr)───┐ + ▼ + angle_pred decode──►rbox_pred──(xywha)─►loss_bbox + │ ▲ + ├────►decode──(a)─┘ + │ + └───────────────────────────────────────────►loss_angle + + 3. `use_hbbox_loss=True` and loss_angle is specified. + In this case the loss_angle must be set. + + .. code:: text + + bbox_pred──(tblr)──►decode──►hbox_pred──(xyxy)──►loss_bbox + + angle_pred──────────────────────────────────────►loss_angle + + - There's a `decoded_with_angle` flag in test_cfg, which is similar + to training process. + + When `decoded_with_angle=True`: + + .. code:: text + + bbox_pred────(tblr)───┐ + ▼ + angle_pred decode──(xywha)──►rbox_pred + │ ▲ + └────►decode──(a)─┘ + + When `decoded_with_angle=False`: + + .. code:: text + + bbox_pred──(tblr)─►decode + │ (xyxy) + ▼ + format───(xywh)──►concat──(xywha)──►rbox_pred + ▲ + angle_pred────────►decode────(a)───────┘ + + Args: + head_module(ConfigType): Base module used for RTMDetRotatedHead. + prior_generator: Points generator feature maps in + 2D points-based detectors. + bbox_coder (:obj:`ConfigDict` or dict): Config of bbox coder. + loss_cls (:obj:`ConfigDict` or dict): Config of classification loss. + loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss. + angle_version (str): Angle representations. Defaults to 'le90'. + use_hbbox_loss (bool): If true, use horizontal bbox loss and + loss_angle should not be None. Default to False. + angle_coder (:obj:`ConfigDict` or dict): Config of angle coder. + loss_angle (:obj:`ConfigDict` or dict, optional): Config of angle loss. + train_cfg (:obj:`ConfigDict` or dict, optional): Training config of + anchor head. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of + anchor head. Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. 
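+
+    A hedged sketch of option 3 above (horizontal-box loss plus an explicit
+    angle loss). The loss types, weights and class count are placeholders,
+    not a tuned configuration::
+
+        bbox_head = dict(
+            type='RTMDetRotatedHead',
+            head_module=dict(
+                type='RTMDetRotatedSepBNHeadModule',
+                num_classes=15,
+                in_channels=256),
+            use_hbbox_loss=True,
+            loss_bbox=dict(type='mmdet.GIoULoss', loss_weight=2.0),
+            angle_coder=dict(type='mmrotate.PseudoAngleCoder'),
+            loss_angle=dict(type='mmdet.L1Loss', loss_weight=0.2))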
+ """ + + def __init__( + self, + head_module: ConfigType, + prior_generator: ConfigType = dict( + type='mmdet.MlvlPointGenerator', strides=[8, 16, 32], + offset=0), + bbox_coder: ConfigType = dict(type='DistanceAnglePointCoder'), + loss_cls: ConfigType = dict( + type='mmdet.QualityFocalLoss', + use_sigmoid=True, + beta=2.0, + loss_weight=1.0), + loss_bbox: ConfigType = dict( + type='mmrotate.RotatedIoULoss', mode='linear', + loss_weight=2.0), + angle_version: str = 'le90', + use_hbbox_loss: bool = False, + angle_coder: ConfigType = dict(type='mmrotate.PseudoAngleCoder'), + loss_angle: OptConfigType = None, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + init_cfg: OptMultiConfig = None): + if not MMROTATE_AVAILABLE: + raise ImportError( + 'Please run "mim install -r requirements/mmrotate.txt" ' + 'to install mmrotate first for rotated detection.') + + self.angle_version = angle_version + self.use_hbbox_loss = use_hbbox_loss + if self.use_hbbox_loss: + assert loss_angle is not None, \ + ('When use hbbox loss, loss_angle needs to be specified') + self.angle_coder = TASK_UTILS.build(angle_coder) + self.angle_out_dim = self.angle_coder.encode_size + if head_module.get('angle_out_dim') is not None: + warnings.warn('angle_out_dim will be overridden by angle_coder ' + 'and does not need to be set manually') + + head_module['angle_out_dim'] = self.angle_out_dim + super().__init__( + head_module=head_module, + prior_generator=prior_generator, + bbox_coder=bbox_coder, + loss_cls=loss_cls, + loss_bbox=loss_bbox, + train_cfg=train_cfg, + test_cfg=test_cfg, + init_cfg=init_cfg) + + if loss_angle is not None: + self.loss_angle = MODELS.build(loss_angle) + else: + self.loss_angle = None + + def predict_by_feat(self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + angle_preds: List[Tensor], + objectnesses: Optional[List[Tensor]] = None, + batch_img_metas: Optional[List[dict]] = None, + cfg: Optional[ConfigDict] = None, + rescale: bool = True, + with_nms: bool = True) -> List[InstanceData]: + """Transform a batch of output features extracted by the head into bbox + results. + + Args: + cls_scores (list[Tensor]): Classification scores for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * 4, H, W). + angle_preds (list[Tensor]): Box angle for each scale level + with shape (N, num_points * angle_dim, H, W) + objectnesses (list[Tensor], Optional): Score factor for + all scale level, each is a 4D-tensor, has shape + (batch_size, 1, H, W). + batch_img_metas (list[dict], Optional): Batch image meta info. + Defaults to None. + cfg (ConfigDict, optional): Test / postprocessing + configuration, if None, test_cfg would be used. + Defaults to None. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + with_nms (bool): If True, do nms before return boxes. + Defaults to True. + + Returns: + list[:obj:`InstanceData`]: Object detection results of each image + after the post process. Each item usually contains following keys. + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 5), + the last dimension 4 arrange as (x, y, w, h, angle). 
+ """ + assert len(cls_scores) == len(bbox_preds) + if objectnesses is None: + with_objectnesses = False + else: + with_objectnesses = True + assert len(cls_scores) == len(objectnesses) + + cfg = self.test_cfg if cfg is None else cfg + cfg = copy.deepcopy(cfg) + + multi_label = cfg.multi_label + multi_label &= self.num_classes > 1 + cfg.multi_label = multi_label + + # Whether to decode rbox with angle. + # different setting lead to different final results. + # Defaults to True. + decode_with_angle = cfg.get('decode_with_angle', True) + + num_imgs = len(batch_img_metas) + featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores] + + # If the shape does not change, use the previous mlvl_priors + if featmap_sizes != self.featmap_sizes: + self.mlvl_priors = self.prior_generator.grid_priors( + featmap_sizes, + dtype=cls_scores[0].dtype, + device=cls_scores[0].device) + self.featmap_sizes = featmap_sizes + flatten_priors = torch.cat(self.mlvl_priors) + + mlvl_strides = [ + flatten_priors.new_full( + (featmap_size.numel() * self.num_base_priors, ), stride) for + featmap_size, stride in zip(featmap_sizes, self.featmap_strides) + ] + flatten_stride = torch.cat(mlvl_strides) + + # flatten cls_scores, bbox_preds and objectness + flatten_cls_scores = [ + cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.num_classes) + for cls_score in cls_scores + ] + flatten_bbox_preds = [ + bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) + for bbox_pred in bbox_preds + ] + flatten_angle_preds = [ + angle_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.angle_out_dim) + for angle_pred in angle_preds + ] + + flatten_cls_scores = torch.cat(flatten_cls_scores, dim=1).sigmoid() + flatten_bbox_preds = torch.cat(flatten_bbox_preds, dim=1) + flatten_angle_preds = torch.cat(flatten_angle_preds, dim=1) + flatten_angle_preds = self.angle_coder.decode( + flatten_angle_preds, keepdim=True) + + if decode_with_angle: + flatten_rbbox_preds = torch.cat( + [flatten_bbox_preds, flatten_angle_preds], dim=-1) + flatten_decoded_bboxes = self.bbox_coder.decode( + flatten_priors[None], flatten_rbbox_preds, flatten_stride) + else: + flatten_decoded_hbboxes = self.bbox_coder.decode( + flatten_priors[None], flatten_bbox_preds, flatten_stride) + flatten_decoded_hbboxes = HorizontalBoxes.xyxy_to_cxcywh( + flatten_decoded_hbboxes) + flatten_decoded_bboxes = torch.cat( + [flatten_decoded_hbboxes, flatten_angle_preds], dim=-1) + + if with_objectnesses: + flatten_objectness = [ + objectness.permute(0, 2, 3, 1).reshape(num_imgs, -1) + for objectness in objectnesses + ] + flatten_objectness = torch.cat(flatten_objectness, dim=1).sigmoid() + else: + flatten_objectness = [None for _ in range(num_imgs)] + + results_list = [] + for (bboxes, scores, objectness, + img_meta) in zip(flatten_decoded_bboxes, flatten_cls_scores, + flatten_objectness, batch_img_metas): + scale_factor = img_meta['scale_factor'] + if 'pad_param' in img_meta: + pad_param = img_meta['pad_param'] + else: + pad_param = None + + score_thr = cfg.get('score_thr', -1) + # yolox_style does not require the following operations + if objectness is not None and score_thr > 0 and not cfg.get( + 'yolox_style', False): + conf_inds = objectness > score_thr + bboxes = bboxes[conf_inds, :] + scores = scores[conf_inds, :] + objectness = objectness[conf_inds] + + if objectness is not None: + # conf = obj_conf * cls_conf + scores *= objectness[:, None] + + if scores.shape[0] == 0: + empty_results = InstanceData() + empty_results.bboxes = RotatedBoxes(bboxes) + 
empty_results.scores = scores[:, 0] + empty_results.labels = scores[:, 0].int() + results_list.append(empty_results) + continue + + nms_pre = cfg.get('nms_pre', 100000) + if cfg.multi_label is False: + scores, labels = scores.max(1, keepdim=True) + scores, _, keep_idxs, results = filter_scores_and_topk( + scores, + score_thr, + nms_pre, + results=dict(labels=labels[:, 0])) + labels = results['labels'] + else: + scores, labels, keep_idxs, _ = filter_scores_and_topk( + scores, score_thr, nms_pre) + + results = InstanceData( + scores=scores, + labels=labels, + bboxes=RotatedBoxes(bboxes[keep_idxs])) + + if rescale: + if pad_param is not None: + results.bboxes.translate_([-pad_param[2], -pad_param[0]]) + + scale_factor = [1 / s for s in img_meta['scale_factor']] + results.bboxes = scale_boxes(results.bboxes, scale_factor) + + if cfg.get('yolox_style', False): + # do not need max_per_img + cfg.max_per_img = len(results) + + results = self._bbox_post_process( + results=results, + cfg=cfg, + rescale=False, + with_nms=with_nms, + img_meta=img_meta) + + results_list.append(results) + return results_list + + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + angle_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Compute losses of the head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level + Has shape (N, num_anchors * num_classes, H, W) + bbox_preds (list[Tensor]): Decoded box for each scale + level with shape (N, num_anchors * 4, H, W) in + [tl_x, tl_y, br_x, br_y] format. + angle_preds (list[Tensor]): Angle prediction for each scale + level with shape (N, num_anchors * angle_out_dim, H, W). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. 
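+
+        Note that the raw ``bbox_preds`` are tblr distances expressed in
+        units of the feature-map stride; they are multiplied by the
+        per-prior stride and decoded with ``distance2obb`` (or
+        ``distance2bbox`` when ``use_hbbox_loss=True``) before the assigner
+        and the box loss are applied.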
+ """ + num_imgs = len(batch_img_metas) + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.prior_generator.num_levels + + gt_info = gt_instances_preprocess(batch_gt_instances, num_imgs) + gt_labels = gt_info[:, :, :1] + gt_bboxes = gt_info[:, :, 1:] # xywha + pad_bbox_flag = (gt_bboxes.sum(-1, keepdim=True) > 0).float() + + device = cls_scores[0].device + + # If the shape does not equal, generate new one + if featmap_sizes != self.featmap_sizes_train: + self.featmap_sizes_train = featmap_sizes + mlvl_priors_with_stride = self.prior_generator.grid_priors( + featmap_sizes, device=device, with_stride=True) + self.flatten_priors_train = torch.cat( + mlvl_priors_with_stride, dim=0) + + flatten_cls_scores = torch.cat([ + cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.cls_out_channels) + for cls_score in cls_scores + ], 1).contiguous() + + flatten_tblrs = torch.cat([ + bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) + for bbox_pred in bbox_preds + ], 1) + flatten_tblrs = flatten_tblrs * self.flatten_priors_train[..., -1, + None] + flatten_angles = torch.cat([ + angle_pred.permute(0, 2, 3, 1).reshape( + num_imgs, -1, self.angle_out_dim) for angle_pred in angle_preds + ], 1) + flatten_decoded_angle = self.angle_coder.decode( + flatten_angles, keepdim=True) + flatten_tblra = torch.cat([flatten_tblrs, flatten_decoded_angle], + dim=-1) + flatten_rbboxes = distance2obb( + self.flatten_priors_train[..., :2], + flatten_tblra, + angle_version=self.angle_version) + if self.use_hbbox_loss: + flatten_hbboxes = distance2bbox(self.flatten_priors_train[..., :2], + flatten_tblrs) + + assigned_result = self.assigner(flatten_rbboxes.detach(), + flatten_cls_scores.detach(), + self.flatten_priors_train, gt_labels, + gt_bboxes, pad_bbox_flag) + + labels = assigned_result['assigned_labels'].reshape(-1) + label_weights = assigned_result['assigned_labels_weights'].reshape(-1) + bbox_targets = assigned_result['assigned_bboxes'].reshape(-1, 5) + assign_metrics = assigned_result['assign_metrics'].reshape(-1) + cls_preds = flatten_cls_scores.reshape(-1, self.num_classes) + + # FG cat_id: [0, num_classes -1], BG cat_id: num_classes + bg_class_ind = self.num_classes + pos_inds = ((labels >= 0) + & (labels < bg_class_ind)).nonzero().squeeze(1) + avg_factor = reduce_mean(assign_metrics.sum()).clamp_(min=1).item() + + loss_cls = self.loss_cls( + cls_preds, (labels, assign_metrics), + label_weights, + avg_factor=avg_factor) + + pos_bbox_targets = bbox_targets[pos_inds] + + if self.use_hbbox_loss: + bbox_preds = flatten_hbboxes.reshape(-1, 4) + pos_bbox_targets = bbox_cxcywh_to_xyxy(pos_bbox_targets[:, :4]) + else: + bbox_preds = flatten_rbboxes.reshape(-1, 5) + angle_preds = flatten_angles.reshape(-1, self.angle_out_dim) + + if len(pos_inds) > 0: + loss_bbox = self.loss_bbox( + bbox_preds[pos_inds], + pos_bbox_targets, + weight=assign_metrics[pos_inds], + avg_factor=avg_factor) + loss_angle = angle_preds.sum() * 0 + if self.loss_angle is not None: + pos_angle_targets = bbox_targets[pos_inds][:, 4:5] + pos_angle_targets = self.angle_coder.encode(pos_angle_targets) + loss_angle = self.loss_angle( + angle_preds[pos_inds], + pos_angle_targets, + weight=assign_metrics[pos_inds], + avg_factor=avg_factor) + else: + loss_bbox = bbox_preds.sum() * 0 + loss_angle = angle_preds.sum() * 0 + + losses = dict() + losses['loss_cls'] = loss_cls + losses['loss_bbox'] = loss_bbox + if self.loss_angle is not None: + losses['loss_angle'] = loss_angle + + return losses diff --git 
a/mmyolo/models/dense_heads/yolov5_head.py b/mmyolo/models/dense_heads/yolov5_head.py new file mode 100644 index 0000000000000000000000000000000000000000..c49d0851897c36fd68d9de1c6097ae58b532024f --- /dev/null +++ b/mmyolo/models/dense_heads/yolov5_head.py @@ -0,0 +1,890 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import math +from typing import List, Optional, Sequence, Tuple, Union + +import torch +import torch.nn as nn +from mmdet.models.dense_heads.base_dense_head import BaseDenseHead +from mmdet.models.utils import filter_scores_and_topk, multi_apply +from mmdet.structures.bbox import bbox_overlaps +from mmdet.utils import (ConfigType, OptConfigType, OptInstanceList, + OptMultiConfig) +from mmengine.config import ConfigDict +from mmengine.dist import get_dist_info +from mmengine.logging import print_log +from mmengine.model import BaseModule +from mmengine.structures import InstanceData +from torch import Tensor + +from mmyolo.registry import MODELS, TASK_UTILS +from ..utils import make_divisible + + +def get_prior_xy_info(index: int, num_base_priors: int, + featmap_sizes: int) -> Tuple[int, int, int]: + """Get prior index and xy index in feature map by flatten index.""" + _, featmap_w = featmap_sizes + priors = index % num_base_priors + xy_index = index // num_base_priors + grid_y = xy_index // featmap_w + grid_x = xy_index % featmap_w + return priors, grid_x, grid_y + + +@MODELS.register_module() +class YOLOv5HeadModule(BaseModule): + """YOLOv5Head head module used in `YOLOv5`. + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (Union[int, Sequence]): Number of channels in the input + feature map. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + num_base_priors (int): The number of priors (points) at a point + on the feature grid. + featmap_strides (Sequence[int]): Downsample factor of each feature map. + Defaults to (8, 16, 32). + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. 
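+
+    As a concrete shape example (not taken from any particular config): with
+    ``num_classes=80`` and ``num_base_priors=3`` each prediction conv outputs
+    ``3 * (5 + 80) = 255`` channels, which ``forward_single`` splits into a
+    ``(B, 240, H, W)`` class map, a ``(B, 12, H, W)`` box map and a
+    ``(B, 3, H, W)`` objectness map.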
+ """ + + def __init__(self, + num_classes: int, + in_channels: Union[int, Sequence], + widen_factor: float = 1.0, + num_base_priors: int = 3, + featmap_strides: Sequence[int] = (8, 16, 32), + init_cfg: OptMultiConfig = None): + super().__init__(init_cfg=init_cfg) + self.num_classes = num_classes + self.widen_factor = widen_factor + + self.featmap_strides = featmap_strides + self.num_out_attrib = 5 + self.num_classes + self.num_levels = len(self.featmap_strides) + self.num_base_priors = num_base_priors + + if isinstance(in_channels, int): + self.in_channels = [make_divisible(in_channels, widen_factor) + ] * self.num_levels + else: + self.in_channels = [ + make_divisible(i, widen_factor) for i in in_channels + ] + + self._init_layers() + + def _init_layers(self): + """initialize conv layers in YOLOv5 head.""" + self.convs_pred = nn.ModuleList() + for i in range(self.num_levels): + conv_pred = nn.Conv2d(self.in_channels[i], + self.num_base_priors * self.num_out_attrib, + 1) + + self.convs_pred.append(conv_pred) + + def init_weights(self): + """Initialize the bias of YOLOv5 head.""" + super().init_weights() + for mi, s in zip(self.convs_pred, self.featmap_strides): # from + b = mi.bias.data.view(self.num_base_priors, -1) + # obj (8 objects per 640 image) + b.data[:, 4] += math.log(8 / (640 / s)**2) + b.data[:, 5:] += math.log(0.6 / (self.num_classes - 0.999999)) + + mi.bias.data = b.view(-1) + + def forward(self, x: Tuple[Tensor]) -> Tuple[List]: + """Forward features from the upstream network. + + Args: + x (Tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + Returns: + Tuple[List]: A tuple of multi-level classification scores, bbox + predictions, and objectnesses. + """ + assert len(x) == self.num_levels + return multi_apply(self.forward_single, x, self.convs_pred) + + def forward_single(self, x: Tensor, + convs: nn.Module) -> Tuple[Tensor, Tensor, Tensor]: + """Forward feature of a single scale level.""" + + pred_map = convs(x) + bs, _, ny, nx = pred_map.shape + pred_map = pred_map.view(bs, self.num_base_priors, self.num_out_attrib, + ny, nx) + + cls_score = pred_map[:, :, 5:, ...].reshape(bs, -1, ny, nx) + bbox_pred = pred_map[:, :, :4, ...].reshape(bs, -1, ny, nx) + objectness = pred_map[:, :, 4:5, ...].reshape(bs, -1, ny, nx) + + return cls_score, bbox_pred, objectness + + +@MODELS.register_module() +class YOLOv5Head(BaseDenseHead): + """YOLOv5Head head used in `YOLOv5`. + + Args: + head_module(ConfigType): Base module used for YOLOv5Head + prior_generator(dict): Points generator feature maps in + 2D points-based detectors. + bbox_coder (:obj:`ConfigDict` or dict): Config of bbox coder. + loss_cls (:obj:`ConfigDict` or dict): Config of classification loss. + loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss. + loss_obj (:obj:`ConfigDict` or dict): Config of objectness loss. + prior_match_thr (float): Defaults to 4.0. + ignore_iof_thr (float): Defaults to -1.0. + obj_level_weights (List[float]): Defaults to [4.0, 1.0, 0.4]. + train_cfg (:obj:`ConfigDict` or dict, optional): Training config of + anchor head. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of + anchor head. Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. 
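+
+    As a worked example of the shape-matching rule used in ``loss_by_feat``:
+    a GT box is matched to a prior of the same level when
+    ``max(w_gt / w_prior, w_prior / w_gt, h_gt / h_prior, h_prior / h_gt)``
+    is below ``prior_match_thr`` (4.0 by default), so a 60x60 box can match
+    a 30x90 prior (ratios 2.0 and 1.5) but not a 10x60 prior (ratio 6.0).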
+ """ + + def __init__(self, + head_module: ConfigType, + prior_generator: ConfigType = dict( + type='mmdet.YOLOAnchorGenerator', + base_sizes=[[(10, 13), (16, 30), (33, 23)], + [(30, 61), (62, 45), (59, 119)], + [(116, 90), (156, 198), (373, 326)]], + strides=[8, 16, 32]), + bbox_coder: ConfigType = dict(type='YOLOv5BBoxCoder'), + loss_cls: ConfigType = dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='mean', + loss_weight=0.5), + loss_bbox: ConfigType = dict( + type='IoULoss', + iou_mode='ciou', + bbox_format='xywh', + eps=1e-7, + reduction='mean', + loss_weight=0.05, + return_iou=True), + loss_obj: ConfigType = dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='mean', + loss_weight=1.0), + prior_match_thr: float = 4.0, + near_neighbor_thr: float = 0.5, + ignore_iof_thr: float = -1.0, + obj_level_weights: List[float] = [4.0, 1.0, 0.4], + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + init_cfg: OptMultiConfig = None): + super().__init__(init_cfg=init_cfg) + + self.head_module = MODELS.build(head_module) + self.num_classes = self.head_module.num_classes + self.featmap_strides = self.head_module.featmap_strides + self.num_levels = len(self.featmap_strides) + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + self.loss_cls: nn.Module = MODELS.build(loss_cls) + self.loss_bbox: nn.Module = MODELS.build(loss_bbox) + self.loss_obj: nn.Module = MODELS.build(loss_obj) + + self.prior_generator = TASK_UTILS.build(prior_generator) + self.bbox_coder = TASK_UTILS.build(bbox_coder) + self.num_base_priors = self.prior_generator.num_base_priors[0] + + self.featmap_sizes = [torch.empty(1)] * self.num_levels + + self.prior_match_thr = prior_match_thr + self.near_neighbor_thr = near_neighbor_thr + self.obj_level_weights = obj_level_weights + self.ignore_iof_thr = ignore_iof_thr + + self.special_init() + + def special_init(self): + """Since YOLO series algorithms will inherit from YOLOv5Head, but + different algorithms have special initialization process. + + The special_init function is designed to deal with this situation. + """ + assert len(self.obj_level_weights) == len( + self.featmap_strides) == self.num_levels + if self.prior_match_thr != 4.0: + print_log( + "!!!Now, you've changed the prior_match_thr " + 'parameter to something other than 4.0. Please make sure ' + 'that you have modified both the regression formula in ' + 'bbox_coder and before loss_box computation, ' + 'otherwise the accuracy may be degraded!!!') + + if self.num_classes == 1: + print_log('!!!You are using `YOLOv5Head` with num_classes == 1.' + ' The loss_cls will be 0. This is a normal phenomenon.') + + priors_base_sizes = torch.tensor( + self.prior_generator.base_sizes, dtype=torch.float) + featmap_strides = torch.tensor( + self.featmap_strides, dtype=torch.float)[:, None, None] + self.register_buffer( + 'priors_base_sizes', + priors_base_sizes / featmap_strides, + persistent=False) + + grid_offset = torch.tensor([ + [0, 0], # center + [1, 0], # left + [0, 1], # up + [-1, 0], # right + [0, -1], # bottom + ]).float() + self.register_buffer( + 'grid_offset', grid_offset[:, None], persistent=False) + + prior_inds = torch.arange(self.num_base_priors).float().view( + self.num_base_priors, 1) + self.register_buffer('prior_inds', prior_inds, persistent=False) + + def forward(self, x: Tuple[Tensor]) -> Tuple[List]: + """Forward features from the upstream network. + + Args: + x (Tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. 
+ Returns: + Tuple[List]: A tuple of multi-level classification scores, bbox + predictions, and objectnesses. + """ + return self.head_module(x) + + def predict_by_feat(self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + objectnesses: Optional[List[Tensor]] = None, + batch_img_metas: Optional[List[dict]] = None, + cfg: Optional[ConfigDict] = None, + rescale: bool = True, + with_nms: bool = True) -> List[InstanceData]: + """Transform a batch of output features extracted by the head into + bbox results. + Args: + cls_scores (list[Tensor]): Classification scores for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * 4, H, W). + objectnesses (list[Tensor], Optional): Score factor for + all scale level, each is a 4D-tensor, has shape + (batch_size, 1, H, W). + batch_img_metas (list[dict], Optional): Batch image meta info. + Defaults to None. + cfg (ConfigDict, optional): Test / postprocessing + configuration, if None, test_cfg would be used. + Defaults to None. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + with_nms (bool): If True, do nms before return boxes. + Defaults to True. + + Returns: + list[:obj:`InstanceData`]: Object detection results of each image + after the post process. Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + assert len(cls_scores) == len(bbox_preds) + if objectnesses is None: + with_objectnesses = False + else: + with_objectnesses = True + assert len(cls_scores) == len(objectnesses) + + cfg = self.test_cfg if cfg is None else cfg + cfg = copy.deepcopy(cfg) + + multi_label = cfg.multi_label + multi_label &= self.num_classes > 1 + cfg.multi_label = multi_label + + num_imgs = len(batch_img_metas) + featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores] + + # If the shape does not change, use the previous mlvl_priors + if featmap_sizes != self.featmap_sizes: + self.mlvl_priors = self.prior_generator.grid_priors( + featmap_sizes, + dtype=cls_scores[0].dtype, + device=cls_scores[0].device) + self.featmap_sizes = featmap_sizes + flatten_priors = torch.cat(self.mlvl_priors) + + mlvl_strides = [ + flatten_priors.new_full( + (featmap_size.numel() * self.num_base_priors, ), stride) for + featmap_size, stride in zip(featmap_sizes, self.featmap_strides) + ] + flatten_stride = torch.cat(mlvl_strides) + + # flatten cls_scores, bbox_preds and objectness + flatten_cls_scores = [ + cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.num_classes) + for cls_score in cls_scores + ] + flatten_bbox_preds = [ + bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) + for bbox_pred in bbox_preds + ] + + flatten_cls_scores = torch.cat(flatten_cls_scores, dim=1).sigmoid() + flatten_bbox_preds = torch.cat(flatten_bbox_preds, dim=1) + flatten_decoded_bboxes = self.bbox_coder.decode( + flatten_priors[None], flatten_bbox_preds, flatten_stride) + + if with_objectnesses: + flatten_objectness = [ + objectness.permute(0, 2, 3, 1).reshape(num_imgs, -1) + for objectness in objectnesses + ] + flatten_objectness = torch.cat(flatten_objectness, dim=1).sigmoid() + else: + flatten_objectness = [None for _ in 
range(num_imgs)] + + results_list = [] + for (bboxes, scores, objectness, + img_meta) in zip(flatten_decoded_bboxes, flatten_cls_scores, + flatten_objectness, batch_img_metas): + ori_shape = img_meta['ori_shape'] + scale_factor = img_meta['scale_factor'] + if 'pad_param' in img_meta: + pad_param = img_meta['pad_param'] + else: + pad_param = None + + score_thr = cfg.get('score_thr', -1) + # yolox_style does not require the following operations + if objectness is not None and score_thr > 0 and not cfg.get( + 'yolox_style', False): + conf_inds = objectness > score_thr + bboxes = bboxes[conf_inds, :] + scores = scores[conf_inds, :] + objectness = objectness[conf_inds] + + if objectness is not None: + # conf = obj_conf * cls_conf + scores *= objectness[:, None] + + if scores.shape[0] == 0: + empty_results = InstanceData() + empty_results.bboxes = bboxes + empty_results.scores = scores[:, 0] + empty_results.labels = scores[:, 0].int() + results_list.append(empty_results) + continue + + nms_pre = cfg.get('nms_pre', 100000) + if cfg.multi_label is False: + scores, labels = scores.max(1, keepdim=True) + scores, _, keep_idxs, results = filter_scores_and_topk( + scores, + score_thr, + nms_pre, + results=dict(labels=labels[:, 0])) + labels = results['labels'] + else: + scores, labels, keep_idxs, _ = filter_scores_and_topk( + scores, score_thr, nms_pre) + + results = InstanceData( + scores=scores, labels=labels, bboxes=bboxes[keep_idxs]) + + if rescale: + if pad_param is not None: + results.bboxes -= results.bboxes.new_tensor([ + pad_param[2], pad_param[0], pad_param[2], pad_param[0] + ]) + results.bboxes /= results.bboxes.new_tensor( + scale_factor).repeat((1, 2)) + + if cfg.get('yolox_style', False): + # do not need max_per_img + cfg.max_per_img = len(results) + + results = self._bbox_post_process( + results=results, + cfg=cfg, + rescale=False, + with_nms=with_nms, + img_meta=img_meta) + results.bboxes[:, 0::2].clamp_(0, ori_shape[1]) + results.bboxes[:, 1::2].clamp_(0, ori_shape[0]) + + results_list.append(results) + return results_list + + def loss(self, x: Tuple[Tensor], batch_data_samples: Union[list, + dict]) -> dict: + """Perform forward propagation and loss calculation of the detection + head on the features of the upstream network. + + Args: + x (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + batch_data_samples (List[:obj:`DetDataSample`], dict): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + + Returns: + dict: A dictionary of loss components. + """ + + if isinstance(batch_data_samples, list): + losses = super().loss(x, batch_data_samples) + else: + outs = self(x) + # Fast version + loss_inputs = outs + (batch_data_samples['bboxes_labels'], + batch_data_samples['img_metas']) + losses = self.loss_by_feat(*loss_inputs) + + return losses + + def loss_by_feat( + self, + cls_scores: Sequence[Tensor], + bbox_preds: Sequence[Tensor], + objectnesses: Sequence[Tensor], + batch_gt_instances: Sequence[InstanceData], + batch_img_metas: Sequence[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (Sequence[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_priors * num_classes. + bbox_preds (Sequence[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_priors * 4. 
+ objectnesses (Sequence[Tensor]): Score factor for + all scale level, each is a 4D-tensor, has shape + (batch_size, 1, H, W). + batch_gt_instances (Sequence[InstanceData]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (Sequence[dict]): Meta information of each image, + e.g., image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + Returns: + dict[str, Tensor]: A dictionary of losses. + """ + if self.ignore_iof_thr != -1: + # TODO: Support fast version + # convert ignore gt + batch_target_ignore_list = [] + for i, gt_instances_ignore in enumerate(batch_gt_instances_ignore): + bboxes = gt_instances_ignore.bboxes + labels = gt_instances_ignore.labels + index = bboxes.new_full((len(bboxes), 1), i) + # (batch_idx, label, bboxes) + target = torch.cat((index, labels[:, None].float(), bboxes), + dim=1) + batch_target_ignore_list.append(target) + + # (num_bboxes, 6) + batch_gt_targets_ignore = torch.cat( + batch_target_ignore_list, dim=0) + if batch_gt_targets_ignore.shape[0] != 0: + # Consider regions with ignore in annotations + return self._loss_by_feat_with_ignore( + cls_scores, + bbox_preds, + objectnesses, + batch_gt_instances=batch_gt_instances, + batch_img_metas=batch_img_metas, + batch_gt_instances_ignore=batch_gt_targets_ignore) + + # 1. Convert gt to norm format + batch_targets_normed = self._convert_gt_to_norm_format( + batch_gt_instances, batch_img_metas) + + device = cls_scores[0].device + loss_cls = torch.zeros(1, device=device) + loss_box = torch.zeros(1, device=device) + loss_obj = torch.zeros(1, device=device) + scaled_factor = torch.ones(7, device=device) + + for i in range(self.num_levels): + batch_size, _, h, w = bbox_preds[i].shape + target_obj = torch.zeros_like(objectnesses[i]) + + # empty gt bboxes + if batch_targets_normed.shape[1] == 0: + loss_box += bbox_preds[i].sum() * 0 + loss_cls += cls_scores[i].sum() * 0 + loss_obj += self.loss_obj( + objectnesses[i], target_obj) * self.obj_level_weights[i] + continue + + priors_base_sizes_i = self.priors_base_sizes[i] + # feature map scale whwh + scaled_factor[2:6] = torch.tensor( + bbox_preds[i].shape)[[3, 2, 3, 2]] + # Scale batch_targets from range 0-1 to range 0-features_maps size. + # (num_base_priors, num_bboxes, 7) + batch_targets_scaled = batch_targets_normed * scaled_factor + + # 2. Shape match + wh_ratio = batch_targets_scaled[..., + 4:6] / priors_base_sizes_i[:, None] + match_inds = torch.max( + wh_ratio, 1 / wh_ratio).max(2)[0] < self.prior_match_thr + batch_targets_scaled = batch_targets_scaled[match_inds] + + # no gt bbox matches anchor + if batch_targets_scaled.shape[0] == 0: + loss_box += bbox_preds[i].sum() * 0 + loss_cls += cls_scores[i].sum() * 0 + loss_obj += self.loss_obj( + objectnesses[i], target_obj) * self.obj_level_weights[i] + continue + + # 3. Positive samples with additional neighbors + + # check the left, up, right, bottom sides of the + # targets grid, and determine whether assigned + # them as positive samples as well. 
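+            # Example with the default `near_neighbor_thr` of 0.5: a target
+            # centre at (12.3, 40.8) on this level's grid keeps its own cell
+            # (12, 40), also matches the cell to the left (11, 40) because
+            # 12.3 % 1 = 0.3 < 0.5, and the cell below (12, 41) because the
+            # distance to the bottom edge has a fractional part of 0.2 < 0.5.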
+ batch_targets_cxcy = batch_targets_scaled[:, 2:4] + grid_xy = scaled_factor[[2, 3]] - batch_targets_cxcy + left, up = ((batch_targets_cxcy % 1 < self.near_neighbor_thr) & + (batch_targets_cxcy > 1)).T + right, bottom = ((grid_xy % 1 < self.near_neighbor_thr) & + (grid_xy > 1)).T + offset_inds = torch.stack( + (torch.ones_like(left), left, up, right, bottom)) + + batch_targets_scaled = batch_targets_scaled.repeat( + (5, 1, 1))[offset_inds] + retained_offsets = self.grid_offset.repeat(1, offset_inds.shape[1], + 1)[offset_inds] + + # prepare pred results and positive sample indexes to + # calculate class loss and bbox lo + _chunk_targets = batch_targets_scaled.chunk(4, 1) + img_class_inds, grid_xy, grid_wh, priors_inds = _chunk_targets + priors_inds, (img_inds, class_inds) = priors_inds.long().view( + -1), img_class_inds.long().T + + grid_xy_long = (grid_xy - + retained_offsets * self.near_neighbor_thr).long() + grid_x_inds, grid_y_inds = grid_xy_long.T + bboxes_targets = torch.cat((grid_xy - grid_xy_long, grid_wh), 1) + + # 4. Calculate loss + # bbox loss + retained_bbox_pred = bbox_preds[i].reshape( + batch_size, self.num_base_priors, -1, h, + w)[img_inds, priors_inds, :, grid_y_inds, grid_x_inds] + priors_base_sizes_i = priors_base_sizes_i[priors_inds] + decoded_bbox_pred = self._decode_bbox_to_xywh( + retained_bbox_pred, priors_base_sizes_i) + loss_box_i, iou = self.loss_bbox(decoded_bbox_pred, bboxes_targets) + loss_box += loss_box_i + + # obj loss + iou = iou.detach().clamp(0) + target_obj[img_inds, priors_inds, grid_y_inds, + grid_x_inds] = iou.type(target_obj.dtype) + loss_obj += self.loss_obj(objectnesses[i], + target_obj) * self.obj_level_weights[i] + + # cls loss + if self.num_classes > 1: + pred_cls_scores = cls_scores[i].reshape( + batch_size, self.num_base_priors, -1, h, + w)[img_inds, priors_inds, :, grid_y_inds, grid_x_inds] + + target_class = torch.full_like(pred_cls_scores, 0.) + target_class[range(batch_targets_scaled.shape[0]), + class_inds] = 1. + loss_cls += self.loss_cls(pred_cls_scores, target_class) + else: + loss_cls += cls_scores[i].sum() * 0 + + _, world_size = get_dist_info() + return dict( + loss_cls=loss_cls * batch_size * world_size, + loss_obj=loss_obj * batch_size * world_size, + loss_bbox=loss_box * batch_size * world_size) + + def _convert_gt_to_norm_format(self, + batch_gt_instances: Sequence[InstanceData], + batch_img_metas: Sequence[dict]) -> Tensor: + if isinstance(batch_gt_instances, torch.Tensor): + # fast version + img_shape = batch_img_metas[0]['batch_input_shape'] + gt_bboxes_xyxy = batch_gt_instances[:, 2:] + xy1, xy2 = gt_bboxes_xyxy.split((2, 2), dim=-1) + gt_bboxes_xywh = torch.cat([(xy2 + xy1) / 2, (xy2 - xy1)], dim=-1) + gt_bboxes_xywh[:, 1::2] /= img_shape[0] + gt_bboxes_xywh[:, 0::2] /= img_shape[1] + batch_gt_instances[:, 2:] = gt_bboxes_xywh + + # (num_base_priors, num_bboxes, 6) + batch_targets_normed = batch_gt_instances.repeat( + self.num_base_priors, 1, 1) + else: + batch_target_list = [] + # Convert xyxy bbox to yolo format. 
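+            # Example: with a 640x640 `batch_input_shape`, a gt box
+            # (270, 220, 370, 420) with label 3 in image i becomes
+            # (i, 3.0, 0.5, 0.5, 0.15625, 0.3125), i.e. normalized
+            # (batch_idx, label, cx, cy, w, h).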
+ for i, gt_instances in enumerate(batch_gt_instances): + img_shape = batch_img_metas[i]['batch_input_shape'] + bboxes = gt_instances.bboxes + labels = gt_instances.labels + + xy1, xy2 = bboxes.split((2, 2), dim=-1) + bboxes = torch.cat([(xy2 + xy1) / 2, (xy2 - xy1)], dim=-1) + # normalized to 0-1 + bboxes[:, 1::2] /= img_shape[0] + bboxes[:, 0::2] /= img_shape[1] + + index = bboxes.new_full((len(bboxes), 1), i) + # (batch_idx, label, normed_bbox) + target = torch.cat((index, labels[:, None].float(), bboxes), + dim=1) + batch_target_list.append(target) + + # (num_base_priors, num_bboxes, 6) + batch_targets_normed = torch.cat( + batch_target_list, dim=0).repeat(self.num_base_priors, 1, 1) + + # (num_base_priors, num_bboxes, 1) + batch_targets_prior_inds = self.prior_inds.repeat( + 1, batch_targets_normed.shape[1])[..., None] + # (num_base_priors, num_bboxes, 7) + # (img_ind, labels, bbox_cx, bbox_cy, bbox_w, bbox_h, prior_ind) + batch_targets_normed = torch.cat( + (batch_targets_normed, batch_targets_prior_inds), 2) + return batch_targets_normed + + def _decode_bbox_to_xywh(self, bbox_pred, priors_base_sizes) -> Tensor: + bbox_pred = bbox_pred.sigmoid() + pred_xy = bbox_pred[:, :2] * 2 - 0.5 + pred_wh = (bbox_pred[:, 2:] * 2)**2 * priors_base_sizes + decoded_bbox_pred = torch.cat((pred_xy, pred_wh), dim=-1) + return decoded_bbox_pred + + def _loss_by_feat_with_ignore( + self, cls_scores: Sequence[Tensor], bbox_preds: Sequence[Tensor], + objectnesses: Sequence[Tensor], + batch_gt_instances: Sequence[InstanceData], + batch_img_metas: Sequence[dict], + batch_gt_instances_ignore: Sequence[Tensor]) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (Sequence[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_priors * num_classes. + bbox_preds (Sequence[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_priors * 4. + objectnesses (Sequence[Tensor]): Score factor for + all scale level, each is a 4D-tensor, has shape + (batch_size, 1, H, W). + batch_gt_instances (Sequence[InstanceData]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (Sequence[dict]): Meta information of each image, + e.g., image size, scaling factor, etc. + batch_gt_instances_ignore (Sequence[Tensor]): Ignore boxes with + batch_ids and labels, each is a 2D-tensor, the channel number + is 6, means that (batch_id, label, xmin, ymin, xmax, ymax). + Returns: + dict[str, Tensor]: A dictionary of losses. + """ + # 1. 
Convert gt to norm format + batch_targets_normed = self._convert_gt_to_norm_format( + batch_gt_instances, batch_img_metas) + + featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores] + if featmap_sizes != self.featmap_sizes: + self.mlvl_priors = self.prior_generator.grid_priors( + featmap_sizes, + dtype=cls_scores[0].dtype, + device=cls_scores[0].device) + self.featmap_sizes = featmap_sizes + + device = cls_scores[0].device + loss_cls = torch.zeros(1, device=device) + loss_box = torch.zeros(1, device=device) + loss_obj = torch.zeros(1, device=device) + scaled_factor = torch.ones(7, device=device) + + for i in range(self.num_levels): + batch_size, _, h, w = bbox_preds[i].shape + target_obj = torch.zeros_like(objectnesses[i]) + + not_ignore_flags = bbox_preds[i].new_ones(batch_size, + self.num_base_priors, h, + w) + + ignore_overlaps = bbox_overlaps(self.mlvl_priors[i], + batch_gt_instances_ignore[..., 2:], + 'iof') + ignore_max_overlaps, ignore_max_ignore_index = ignore_overlaps.max( + dim=1) + + batch_inds = batch_gt_instances_ignore[:, + 0][ignore_max_ignore_index] + ignore_inds = (ignore_max_overlaps > self.ignore_iof_thr).nonzero( + as_tuple=True)[0] + batch_inds = batch_inds[ignore_inds].long() + ignore_priors, ignore_grid_xs, ignore_grid_ys = get_prior_xy_info( + ignore_inds, self.num_base_priors, self.featmap_sizes[i]) + not_ignore_flags[batch_inds, ignore_priors, ignore_grid_ys, + ignore_grid_xs] = 0 + + # empty gt bboxes + if batch_targets_normed.shape[1] == 0: + loss_box += bbox_preds[i].sum() * 0 + loss_cls += cls_scores[i].sum() * 0 + loss_obj += self.loss_obj( + objectnesses[i], + target_obj, + weight=not_ignore_flags, + avg_factor=max(not_ignore_flags.sum(), + 1)) * self.obj_level_weights[i] + continue + + priors_base_sizes_i = self.priors_base_sizes[i] + # feature map scale whwh + scaled_factor[2:6] = torch.tensor( + bbox_preds[i].shape)[[3, 2, 3, 2]] + # Scale batch_targets from range 0-1 to range 0-features_maps size. + # (num_base_priors, num_bboxes, 7) + batch_targets_scaled = batch_targets_normed * scaled_factor + + # 2. Shape match + wh_ratio = batch_targets_scaled[..., + 4:6] / priors_base_sizes_i[:, None] + match_inds = torch.max( + wh_ratio, 1 / wh_ratio).max(2)[0] < self.prior_match_thr + batch_targets_scaled = batch_targets_scaled[match_inds] + + # no gt bbox matches anchor + if batch_targets_scaled.shape[0] == 0: + loss_box += bbox_preds[i].sum() * 0 + loss_cls += cls_scores[i].sum() * 0 + loss_obj += self.loss_obj( + objectnesses[i], + target_obj, + weight=not_ignore_flags, + avg_factor=max(not_ignore_flags.sum(), + 1)) * self.obj_level_weights[i] + continue + + # 3. Positive samples with additional neighbors + + # check the left, up, right, bottom sides of the + # targets grid, and determine whether assigned + # them as positive samples as well. 
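+            # The near-neighbor expansion below is identical to the one in
+            # loss_by_feat above; this branch only differs in that every
+            # loss term is additionally weighted by not_ignore_flags so that
+            # predictions falling on ignored regions do not contribute.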
+ batch_targets_cxcy = batch_targets_scaled[:, 2:4] + grid_xy = scaled_factor[[2, 3]] - batch_targets_cxcy + left, up = ((batch_targets_cxcy % 1 < self.near_neighbor_thr) & + (batch_targets_cxcy > 1)).T + right, bottom = ((grid_xy % 1 < self.near_neighbor_thr) & + (grid_xy > 1)).T + offset_inds = torch.stack( + (torch.ones_like(left), left, up, right, bottom)) + + batch_targets_scaled = batch_targets_scaled.repeat( + (5, 1, 1))[offset_inds] + retained_offsets = self.grid_offset.repeat(1, offset_inds.shape[1], + 1)[offset_inds] + + # prepare pred results and positive sample indexes to + # calculate class loss and bbox lo + _chunk_targets = batch_targets_scaled.chunk(4, 1) + img_class_inds, grid_xy, grid_wh, priors_inds = _chunk_targets + priors_inds, (img_inds, class_inds) = priors_inds.long().view( + -1), img_class_inds.long().T + + grid_xy_long = (grid_xy - + retained_offsets * self.near_neighbor_thr).long() + grid_x_inds, grid_y_inds = grid_xy_long.T + bboxes_targets = torch.cat((grid_xy - grid_xy_long, grid_wh), 1) + + # 4. Calculate loss + # bbox loss + retained_bbox_pred = bbox_preds[i].reshape( + batch_size, self.num_base_priors, -1, h, + w)[img_inds, priors_inds, :, grid_y_inds, grid_x_inds] + priors_base_sizes_i = priors_base_sizes_i[priors_inds] + decoded_bbox_pred = self._decode_bbox_to_xywh( + retained_bbox_pred, priors_base_sizes_i) + + not_ignore_weights = not_ignore_flags[img_inds, priors_inds, + grid_y_inds, grid_x_inds] + loss_box_i, iou = self.loss_bbox( + decoded_bbox_pred, + bboxes_targets, + weight=not_ignore_weights, + avg_factor=max(not_ignore_weights.sum(), 1)) + loss_box += loss_box_i + + # obj loss + iou = iou.detach().clamp(0) + target_obj[img_inds, priors_inds, grid_y_inds, + grid_x_inds] = iou.type(target_obj.dtype) + loss_obj += self.loss_obj( + objectnesses[i], + target_obj, + weight=not_ignore_flags, + avg_factor=max(not_ignore_flags.sum(), + 1)) * self.obj_level_weights[i] + + # cls loss + if self.num_classes > 1: + pred_cls_scores = cls_scores[i].reshape( + batch_size, self.num_base_priors, -1, h, + w)[img_inds, priors_inds, :, grid_y_inds, grid_x_inds] + + target_class = torch.full_like(pred_cls_scores, 0.) + target_class[range(batch_targets_scaled.shape[0]), + class_inds] = 1. + loss_cls += self.loss_cls( + pred_cls_scores, + target_class, + weight=not_ignore_weights[:, None].repeat( + 1, self.num_classes), + avg_factor=max(not_ignore_weights.sum(), 1)) + else: + loss_cls += cls_scores[i].sum() * 0 + + _, world_size = get_dist_info() + return dict( + loss_cls=loss_cls * batch_size * world_size, + loss_obj=loss_obj * batch_size * world_size, + loss_bbox=loss_box * batch_size * world_size) diff --git a/mmyolo/models/dense_heads/yolov6_head.py b/mmyolo/models/dense_heads/yolov6_head.py new file mode 100644 index 0000000000000000000000000000000000000000..4b492d121a02acc194ba45637adc9e8b3e26a22c --- /dev/null +++ b/mmyolo/models/dense_heads/yolov6_head.py @@ -0,0 +1,369 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
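+# Illustrative usage sketch (values are placeholders, not taken from any
+# particular config): the head defined in this file is normally built from a
+# registry config dict along the lines of
+#   bbox_head=dict(
+#       type='YOLOv6Head',
+#       head_module=dict(
+#           type='YOLOv6HeadModule',
+#           num_classes=80,
+#           in_channels=[128, 256, 512],
+#           widen_factor=0.5,
+#           featmap_strides=[8, 16, 32]))
+# and trained with an assigner pair (initial_assigner / assigner) supplied
+# via train_cfg, as handled in YOLOv6Head.special_init below.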
+from typing import List, Sequence, Tuple, Union + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmdet.models.utils import multi_apply +from mmdet.utils import (ConfigType, OptConfigType, OptInstanceList, + OptMultiConfig) +from mmengine import MessageHub +from mmengine.dist import get_dist_info +from mmengine.model import BaseModule, bias_init_with_prob +from mmengine.structures import InstanceData +from torch import Tensor + +from mmyolo.registry import MODELS, TASK_UTILS +from ..utils import gt_instances_preprocess +from .yolov5_head import YOLOv5Head + + +@MODELS.register_module() +class YOLOv6HeadModule(BaseModule): + """YOLOv6Head head module used in `YOLOv6. + + `_. + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (Union[int, Sequence]): Number of channels in the input + feature map. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + num_base_priors: (int): The number of priors (points) at a point + on the feature grid. + featmap_strides (Sequence[int]): Downsample factor of each feature map. + Defaults to [8, 16, 32]. + None, otherwise False. Defaults to "auto". + norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization + layer. Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer. + Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + num_classes: int, + in_channels: Union[int, Sequence], + widen_factor: float = 1.0, + num_base_priors: int = 1, + featmap_strides: Sequence[int] = (8, 16, 32), + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None): + super().__init__(init_cfg=init_cfg) + + self.num_classes = num_classes + self.featmap_strides = featmap_strides + self.num_levels = len(self.featmap_strides) + self.num_base_priors = num_base_priors + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + + if isinstance(in_channels, int): + self.in_channels = [int(in_channels * widen_factor) + ] * self.num_levels + else: + self.in_channels = [int(i * widen_factor) for i in in_channels] + + self._init_layers() + + def _init_layers(self): + """initialize conv layers in YOLOv6 head.""" + # Init decouple head + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + self.cls_preds = nn.ModuleList() + self.reg_preds = nn.ModuleList() + self.stems = nn.ModuleList() + for i in range(self.num_levels): + self.stems.append( + ConvModule( + in_channels=self.in_channels[i], + out_channels=self.in_channels[i], + kernel_size=1, + stride=1, + padding=1 // 2, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + self.cls_convs.append( + ConvModule( + in_channels=self.in_channels[i], + out_channels=self.in_channels[i], + kernel_size=3, + stride=1, + padding=3 // 2, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + self.reg_convs.append( + ConvModule( + in_channels=self.in_channels[i], + out_channels=self.in_channels[i], + kernel_size=3, + stride=1, + padding=3 // 2, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + self.cls_preds.append( + nn.Conv2d( + in_channels=self.in_channels[i], + out_channels=self.num_base_priors * self.num_classes, + kernel_size=1)) + self.reg_preds.append( + nn.Conv2d( + 
in_channels=self.in_channels[i], + out_channels=self.num_base_priors * 4, + kernel_size=1)) + + def init_weights(self): + super().init_weights() + bias_init = bias_init_with_prob(0.01) + for conv in self.cls_preds: + conv.bias.data.fill_(bias_init) + conv.weight.data.fill_(0.) + + for conv in self.reg_preds: + conv.bias.data.fill_(1.0) + conv.weight.data.fill_(0.) + + def forward(self, x: Tuple[Tensor]) -> Tuple[List]: + """Forward features from the upstream network. + + Args: + x (Tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + Returns: + Tuple[List]: A tuple of multi-level classification scores, bbox + predictions. + """ + assert len(x) == self.num_levels + return multi_apply(self.forward_single, x, self.stems, self.cls_convs, + self.cls_preds, self.reg_convs, self.reg_preds) + + def forward_single(self, x: Tensor, stem: nn.Module, cls_conv: nn.Module, + cls_pred: nn.Module, reg_conv: nn.Module, + reg_pred: nn.Module) -> Tuple[Tensor, Tensor]: + """Forward feature of a single scale level.""" + y = stem(x) + cls_x = y + reg_x = y + cls_feat = cls_conv(cls_x) + reg_feat = reg_conv(reg_x) + + cls_score = cls_pred(cls_feat) + bbox_pred = reg_pred(reg_feat) + + return cls_score, bbox_pred + + +@MODELS.register_module() +class YOLOv6Head(YOLOv5Head): + """YOLOv6Head head used in `YOLOv6 `_. + + Args: + head_module(ConfigType): Base module used for YOLOv6Head + prior_generator(dict): Points generator feature maps + in 2D points-based detectors. + loss_cls (:obj:`ConfigDict` or dict): Config of classification loss. + loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss. + train_cfg (:obj:`ConfigDict` or dict, optional): Training config of + anchor head. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of + anchor head. Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + head_module: ConfigType, + prior_generator: ConfigType = dict( + type='mmdet.MlvlPointGenerator', + offset=0.5, + strides=[8, 16, 32]), + bbox_coder: ConfigType = dict(type='DistancePointBBoxCoder'), + loss_cls: ConfigType = dict( + type='mmdet.VarifocalLoss', + use_sigmoid=True, + alpha=0.75, + gamma=2.0, + iou_weighted=True, + reduction='sum', + loss_weight=1.0), + loss_bbox: ConfigType = dict( + type='IoULoss', + iou_mode='giou', + bbox_format='xyxy', + reduction='mean', + loss_weight=2.5, + return_iou=False), + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + init_cfg: OptMultiConfig = None): + super().__init__( + head_module=head_module, + prior_generator=prior_generator, + bbox_coder=bbox_coder, + loss_cls=loss_cls, + loss_bbox=loss_bbox, + train_cfg=train_cfg, + test_cfg=test_cfg, + init_cfg=init_cfg) + # yolov6 doesn't need loss_obj + self.loss_obj = None + + def special_init(self): + """Since YOLO series algorithms will inherit from YOLOv5Head, but + different algorithms have special initialization process. + + The special_init function is designed to deal with this situation. 
+ """ + if self.train_cfg: + self.initial_epoch = self.train_cfg['initial_epoch'] + self.initial_assigner = TASK_UTILS.build( + self.train_cfg.initial_assigner) + self.assigner = TASK_UTILS.build(self.train_cfg.assigner) + + # Add common attributes to reduce calculation + self.featmap_sizes_train = None + self.num_level_priors = None + self.flatten_priors_train = None + self.stride_tensor = None + + def loss_by_feat( + self, + cls_scores: Sequence[Tensor], + bbox_preds: Sequence[Tensor], + batch_gt_instances: Sequence[InstanceData], + batch_img_metas: Sequence[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (Sequence[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_priors * num_classes. + bbox_preds (Sequence[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_priors * 4. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + Returns: + dict[str, Tensor]: A dictionary of losses. + """ + + # get epoch information from message hub + message_hub = MessageHub.get_current_instance() + current_epoch = message_hub.get_info('epoch') + + num_imgs = len(batch_img_metas) + if batch_gt_instances_ignore is None: + batch_gt_instances_ignore = [None] * num_imgs + + current_featmap_sizes = [ + cls_score.shape[2:] for cls_score in cls_scores + ] + # If the shape does not equal, generate new one + if current_featmap_sizes != self.featmap_sizes_train: + self.featmap_sizes_train = current_featmap_sizes + + mlvl_priors_with_stride = self.prior_generator.grid_priors( + self.featmap_sizes_train, + dtype=cls_scores[0].dtype, + device=cls_scores[0].device, + with_stride=True) + + self.num_level_priors = [len(n) for n in mlvl_priors_with_stride] + self.flatten_priors_train = torch.cat( + mlvl_priors_with_stride, dim=0) + self.stride_tensor = self.flatten_priors_train[..., [2]] + + # gt info + gt_info = gt_instances_preprocess(batch_gt_instances, num_imgs) + gt_labels = gt_info[:, :, :1] + gt_bboxes = gt_info[:, :, 1:] # xyxy + pad_bbox_flag = (gt_bboxes.sum(-1, keepdim=True) > 0).float() + + # pred info + flatten_cls_preds = [ + cls_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.num_classes) + for cls_pred in cls_scores + ] + + flatten_pred_bboxes = [ + bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) + for bbox_pred in bbox_preds + ] + + flatten_cls_preds = torch.cat(flatten_cls_preds, dim=1) + flatten_pred_bboxes = torch.cat(flatten_pred_bboxes, dim=1) + flatten_pred_bboxes = self.bbox_coder.decode( + self.flatten_priors_train[..., :2], flatten_pred_bboxes, + self.stride_tensor[:, 0]) + pred_scores = torch.sigmoid(flatten_cls_preds) + + if current_epoch < self.initial_epoch: + assigned_result = self.initial_assigner( + flatten_pred_bboxes.detach(), self.flatten_priors_train, + self.num_level_priors, gt_labels, gt_bboxes, pad_bbox_flag) + else: + assigned_result = self.assigner(flatten_pred_bboxes.detach(), + pred_scores.detach(), + self.flatten_priors_train, + gt_labels, gt_bboxes, + pad_bbox_flag) 
+ + assigned_bboxes = assigned_result['assigned_bboxes'] + assigned_scores = assigned_result['assigned_scores'] + fg_mask_pre_prior = assigned_result['fg_mask_pre_prior'] + + # cls loss + with torch.cuda.amp.autocast(enabled=False): + loss_cls = self.loss_cls(flatten_cls_preds, assigned_scores) + + # rescale bbox + assigned_bboxes /= self.stride_tensor + flatten_pred_bboxes /= self.stride_tensor + + # TODO: Add all_reduce makes training more stable + assigned_scores_sum = assigned_scores.sum() + if assigned_scores_sum > 0: + loss_cls /= assigned_scores_sum + + # select positive samples mask + num_pos = fg_mask_pre_prior.sum() + if num_pos > 0: + # when num_pos > 0, assigned_scores_sum will >0, so the loss_bbox + # will not report an error + # iou loss + prior_bbox_mask = fg_mask_pre_prior.unsqueeze(-1).repeat([1, 1, 4]) + pred_bboxes_pos = torch.masked_select( + flatten_pred_bboxes, prior_bbox_mask).reshape([-1, 4]) + assigned_bboxes_pos = torch.masked_select( + assigned_bboxes, prior_bbox_mask).reshape([-1, 4]) + bbox_weight = torch.masked_select( + assigned_scores.sum(-1), fg_mask_pre_prior).unsqueeze(-1) + loss_bbox = self.loss_bbox( + pred_bboxes_pos, + assigned_bboxes_pos, + weight=bbox_weight, + avg_factor=assigned_scores_sum) + else: + loss_bbox = flatten_pred_bboxes.sum() * 0 + + _, world_size = get_dist_info() + return dict( + loss_cls=loss_cls * world_size, loss_bbox=loss_bbox * world_size) diff --git a/mmyolo/models/dense_heads/yolov7_head.py b/mmyolo/models/dense_heads/yolov7_head.py new file mode 100644 index 0000000000000000000000000000000000000000..80e6aadd2880fbe95b7c897630ab9033183c2062 --- /dev/null +++ b/mmyolo/models/dense_heads/yolov7_head.py @@ -0,0 +1,404 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from typing import List, Optional, Sequence, Tuple, Union + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmdet.models.utils import multi_apply +from mmdet.utils import ConfigType, OptInstanceList +from mmengine.dist import get_dist_info +from mmengine.structures import InstanceData +from torch import Tensor + +from mmyolo.registry import MODELS +from ..layers import ImplicitA, ImplicitM +from ..task_modules.assigners.batch_yolov7_assigner import BatchYOLOv7Assigner +from .yolov5_head import YOLOv5Head, YOLOv5HeadModule + + +@MODELS.register_module() +class YOLOv7HeadModule(YOLOv5HeadModule): + """YOLOv7Head head module used in YOLOv7.""" + + def _init_layers(self): + """initialize conv layers in YOLOv7 head.""" + self.convs_pred = nn.ModuleList() + for i in range(self.num_levels): + conv_pred = nn.Sequential( + ImplicitA(self.in_channels[i]), + nn.Conv2d(self.in_channels[i], + self.num_base_priors * self.num_out_attrib, 1), + ImplicitM(self.num_base_priors * self.num_out_attrib), + ) + self.convs_pred.append(conv_pred) + + def init_weights(self): + """Initialize the bias of YOLOv7 head.""" + super(YOLOv5HeadModule, self).init_weights() + for mi, s in zip(self.convs_pred, self.featmap_strides): # from + mi = mi[1] # nn.Conv2d + + b = mi.bias.data.view(3, -1) + # obj (8 objects per 640 image) + b.data[:, 4] += math.log(8 / (640 / s)**2) + b.data[:, 5:] += math.log(0.6 / (self.num_classes - 0.99)) + + mi.bias.data = b.view(-1) + + +@MODELS.register_module() +class YOLOv7p6HeadModule(YOLOv5HeadModule): + """YOLOv7Head head module used in YOLOv7.""" + + def __init__(self, + *args, + main_out_channels: Sequence[int] = [256, 512, 768, 1024], + aux_out_channels: Sequence[int] = [320, 640, 960, 1280], + use_aux: bool = True, + 
norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + **kwargs): + self.main_out_channels = main_out_channels + self.aux_out_channels = aux_out_channels + self.use_aux = use_aux + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + super().__init__(*args, **kwargs) + + def _init_layers(self): + """initialize conv layers in YOLOv7 head.""" + self.main_convs_pred = nn.ModuleList() + for i in range(self.num_levels): + conv_pred = nn.Sequential( + ConvModule( + self.in_channels[i], + self.main_out_channels[i], + 3, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + ImplicitA(self.main_out_channels[i]), + nn.Conv2d(self.main_out_channels[i], + self.num_base_priors * self.num_out_attrib, 1), + ImplicitM(self.num_base_priors * self.num_out_attrib), + ) + self.main_convs_pred.append(conv_pred) + + if self.use_aux: + self.aux_convs_pred = nn.ModuleList() + for i in range(self.num_levels): + aux_pred = nn.Sequential( + ConvModule( + self.in_channels[i], + self.aux_out_channels[i], + 3, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + nn.Conv2d(self.aux_out_channels[i], + self.num_base_priors * self.num_out_attrib, 1)) + self.aux_convs_pred.append(aux_pred) + else: + self.aux_convs_pred = [None] * len(self.main_convs_pred) + + def init_weights(self): + """Initialize the bias of YOLOv5 head.""" + super(YOLOv5HeadModule, self).init_weights() + for mi, aux, s in zip(self.main_convs_pred, self.aux_convs_pred, + self.featmap_strides): # from + mi = mi[2] # nn.Conv2d + b = mi.bias.data.view(3, -1) + # obj (8 objects per 640 image) + b.data[:, 4] += math.log(8 / (640 / s)**2) + b.data[:, 5:] += math.log(0.6 / (self.num_classes - 0.99)) + mi.bias.data = b.view(-1) + + if self.use_aux: + aux = aux[1] # nn.Conv2d + b = aux.bias.data.view(3, -1) + # obj (8 objects per 640 image) + b.data[:, 4] += math.log(8 / (640 / s)**2) + b.data[:, 5:] += math.log(0.6 / (self.num_classes - 0.99)) + mi.bias.data = b.view(-1) + + def forward(self, x: Tuple[Tensor]) -> Tuple[List]: + """Forward features from the upstream network. + + Args: + x (Tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + Returns: + Tuple[List]: A tuple of multi-level classification scores, bbox + predictions, and objectnesses. 
+ """ + assert len(x) == self.num_levels + return multi_apply(self.forward_single, x, self.main_convs_pred, + self.aux_convs_pred) + + def forward_single(self, x: Tensor, convs: nn.Module, + aux_convs: Optional[nn.Module]) \ + -> Tuple[Union[Tensor, List], Union[Tensor, List], + Union[Tensor, List]]: + """Forward feature of a single scale level.""" + + pred_map = convs(x) + bs, _, ny, nx = pred_map.shape + pred_map = pred_map.view(bs, self.num_base_priors, self.num_out_attrib, + ny, nx) + + cls_score = pred_map[:, :, 5:, ...].reshape(bs, -1, ny, nx) + bbox_pred = pred_map[:, :, :4, ...].reshape(bs, -1, ny, nx) + objectness = pred_map[:, :, 4:5, ...].reshape(bs, -1, ny, nx) + + if not self.training or not self.use_aux: + return cls_score, bbox_pred, objectness + else: + aux_pred_map = aux_convs(x) + aux_pred_map = aux_pred_map.view(bs, self.num_base_priors, + self.num_out_attrib, ny, nx) + aux_cls_score = aux_pred_map[:, :, 5:, ...].reshape(bs, -1, ny, nx) + aux_bbox_pred = aux_pred_map[:, :, :4, ...].reshape(bs, -1, ny, nx) + aux_objectness = aux_pred_map[:, :, 4:5, + ...].reshape(bs, -1, ny, nx) + + return [cls_score, + aux_cls_score], [bbox_pred, aux_bbox_pred + ], [objectness, aux_objectness] + + +@MODELS.register_module() +class YOLOv7Head(YOLOv5Head): + """YOLOv7Head head used in `YOLOv7 `_. + + Args: + simota_candidate_topk (int): The candidate top-k which used to + get top-k ious to calculate dynamic-k in BatchYOLOv7Assigner. + Defaults to 10. + simota_iou_weight (float): The scale factor for regression + iou cost in BatchYOLOv7Assigner. Defaults to 3.0. + simota_cls_weight (float): The scale factor for classification + cost in BatchYOLOv7Assigner. Defaults to 1.0. + """ + + def __init__(self, + *args, + simota_candidate_topk: int = 20, + simota_iou_weight: float = 3.0, + simota_cls_weight: float = 1.0, + aux_loss_weights: float = 0.25, + **kwargs): + super().__init__(*args, **kwargs) + self.aux_loss_weights = aux_loss_weights + self.assigner = BatchYOLOv7Assigner( + num_classes=self.num_classes, + num_base_priors=self.num_base_priors, + featmap_strides=self.featmap_strides, + prior_match_thr=self.prior_match_thr, + candidate_topk=simota_candidate_topk, + iou_weight=simota_iou_weight, + cls_weight=simota_cls_weight) + + def loss_by_feat( + self, + cls_scores: Sequence[Union[Tensor, List]], + bbox_preds: Sequence[Union[Tensor, List]], + objectnesses: Sequence[Union[Tensor, List]], + batch_gt_instances: Sequence[InstanceData], + batch_img_metas: Sequence[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (Sequence[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_priors * num_classes. + bbox_preds (Sequence[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_priors * 4. + objectnesses (Sequence[Tensor]): Score factor for + all scale level, each is a 4D-tensor, has shape + (batch_size, 1, H, W). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. 
+ Returns: + dict[str, Tensor]: A dictionary of losses. + """ + + if isinstance(cls_scores[0], Sequence): + with_aux = True + batch_size = cls_scores[0][0].shape[0] + device = cls_scores[0][0].device + + bbox_preds_main, bbox_preds_aux = zip(*bbox_preds) + objectnesses_main, objectnesses_aux = zip(*objectnesses) + cls_scores_main, cls_scores_aux = zip(*cls_scores) + + head_preds = self._merge_predict_results(bbox_preds_main, + objectnesses_main, + cls_scores_main) + head_preds_aux = self._merge_predict_results( + bbox_preds_aux, objectnesses_aux, cls_scores_aux) + else: + with_aux = False + batch_size = cls_scores[0].shape[0] + device = cls_scores[0].device + + head_preds = self._merge_predict_results(bbox_preds, objectnesses, + cls_scores) + + # Convert gt to norm xywh format + # (num_base_priors, num_batch_gt, 7) + # 7 is mean (batch_idx, cls_id, x_norm, y_norm, + # w_norm, h_norm, prior_idx) + batch_targets_normed = self._convert_gt_to_norm_format( + batch_gt_instances, batch_img_metas) + + scaled_factors = [ + torch.tensor(head_pred.shape, device=device)[[3, 2, 3, 2]] + for head_pred in head_preds + ] + + loss_cls, loss_obj, loss_box = self._calc_loss( + head_preds=head_preds, + head_preds_aux=None, + batch_targets_normed=batch_targets_normed, + near_neighbor_thr=self.near_neighbor_thr, + scaled_factors=scaled_factors, + batch_img_metas=batch_img_metas, + device=device) + + if with_aux: + loss_cls_aux, loss_obj_aux, loss_box_aux = self._calc_loss( + head_preds=head_preds, + head_preds_aux=head_preds_aux, + batch_targets_normed=batch_targets_normed, + near_neighbor_thr=self.near_neighbor_thr * 2, + scaled_factors=scaled_factors, + batch_img_metas=batch_img_metas, + device=device) + loss_cls += self.aux_loss_weights * loss_cls_aux + loss_obj += self.aux_loss_weights * loss_obj_aux + loss_box += self.aux_loss_weights * loss_box_aux + + _, world_size = get_dist_info() + return dict( + loss_cls=loss_cls * batch_size * world_size, + loss_obj=loss_obj * batch_size * world_size, + loss_bbox=loss_box * batch_size * world_size) + + def _calc_loss(self, head_preds, head_preds_aux, batch_targets_normed, + near_neighbor_thr, scaled_factors, batch_img_metas, device): + loss_cls = torch.zeros(1, device=device) + loss_box = torch.zeros(1, device=device) + loss_obj = torch.zeros(1, device=device) + + assigner_results = self.assigner( + head_preds, + batch_targets_normed, + batch_img_metas[0]['batch_input_shape'], + self.priors_base_sizes, + self.grid_offset, + near_neighbor_thr=near_neighbor_thr) + # mlvl is mean multi_level + mlvl_positive_infos = assigner_results['mlvl_positive_infos'] + mlvl_priors = assigner_results['mlvl_priors'] + mlvl_targets_normed = assigner_results['mlvl_targets_normed'] + + if head_preds_aux is not None: + # This is mean calc aux branch loss + head_preds = head_preds_aux + + for i, head_pred in enumerate(head_preds): + batch_inds, proir_idx, grid_x, grid_y = mlvl_positive_infos[i].T + num_pred_positive = batch_inds.shape[0] + target_obj = torch.zeros_like(head_pred[..., 0]) + # empty positive sampler + if num_pred_positive == 0: + loss_box += head_pred[..., :4].sum() * 0 + loss_cls += head_pred[..., 5:].sum() * 0 + loss_obj += self.loss_obj( + head_pred[..., 4], target_obj) * self.obj_level_weights[i] + continue + + priors = mlvl_priors[i] + targets_normed = mlvl_targets_normed[i] + + head_pred_positive = head_pred[batch_inds, proir_idx, grid_y, + grid_x] + + # calc bbox loss + grid_xy = torch.stack([grid_x, grid_y], dim=1) + decoded_pred_bbox = self._decode_bbox_to_xywh( + 
head_pred_positive[:, :4], priors, grid_xy) + target_bbox_scaled = targets_normed[:, 2:6] * scaled_factors[i] + + loss_box_i, iou = self.loss_bbox(decoded_pred_bbox, + target_bbox_scaled) + loss_box += loss_box_i + + # calc obj loss + target_obj[batch_inds, proir_idx, grid_y, + grid_x] = iou.detach().clamp(0).type(target_obj.dtype) + loss_obj += self.loss_obj(head_pred[..., 4], + target_obj) * self.obj_level_weights[i] + + # calc cls loss + if self.num_classes > 1: + pred_cls_scores = targets_normed[:, 1].long() + target_class = torch.full_like( + head_pred_positive[:, 5:], 0., device=device) + target_class[range(num_pred_positive), pred_cls_scores] = 1. + loss_cls += self.loss_cls(head_pred_positive[:, 5:], + target_class) + else: + loss_cls += head_pred_positive[:, 5:].sum() * 0 + return loss_cls, loss_obj, loss_box + + def _merge_predict_results(self, bbox_preds: Sequence[Tensor], + objectnesses: Sequence[Tensor], + cls_scores: Sequence[Tensor]) -> List[Tensor]: + """Merge predict output from 3 heads. + + Args: + cls_scores (Sequence[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_priors * num_classes. + bbox_preds (Sequence[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_priors * 4. + objectnesses (Sequence[Tensor]): Score factor for + all scale level, each is a 4D-tensor, has shape + (batch_size, 1, H, W). + + Returns: + List[Tensor]: Merged output. + """ + head_preds = [] + for bbox_pred, objectness, cls_score in zip(bbox_preds, objectnesses, + cls_scores): + b, _, h, w = bbox_pred.shape + bbox_pred = bbox_pred.reshape(b, self.num_base_priors, -1, h, w) + objectness = objectness.reshape(b, self.num_base_priors, -1, h, w) + cls_score = cls_score.reshape(b, self.num_base_priors, -1, h, w) + head_pred = torch.cat([bbox_pred, objectness, cls_score], + dim=2).permute(0, 1, 3, 4, 2).contiguous() + head_preds.append(head_pred) + return head_preds + + def _decode_bbox_to_xywh(self, bbox_pred, priors_base_sizes, + grid_xy) -> Tensor: + bbox_pred = bbox_pred.sigmoid() + pred_xy = bbox_pred[:, :2] * 2 - 0.5 + grid_xy + pred_wh = (bbox_pred[:, 2:] * 2)**2 * priors_base_sizes + decoded_bbox_pred = torch.cat((pred_xy, pred_wh), dim=-1) + return decoded_bbox_pred diff --git a/mmyolo/models/dense_heads/yolov8_head.py b/mmyolo/models/dense_heads/yolov8_head.py new file mode 100644 index 0000000000000000000000000000000000000000..123d0dfb0d21e392dbffdc79a8cdcd4601e9e12a --- /dev/null +++ b/mmyolo/models/dense_heads/yolov8_head.py @@ -0,0 +1,398 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from typing import List, Sequence, Tuple, Union + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmdet.models.utils import multi_apply +from mmdet.utils import (ConfigType, OptConfigType, OptInstanceList, + OptMultiConfig) +from mmengine.dist import get_dist_info +from mmengine.model import BaseModule +from mmengine.structures import InstanceData +from torch import Tensor + +from mmyolo.registry import MODELS, TASK_UTILS +from ..utils import gt_instances_preprocess, make_divisible +from .yolov5_head import YOLOv5Head + + +@MODELS.register_module() +class YOLOv8HeadModule(BaseModule): + """YOLOv8HeadModule head module used in `YOLOv8`. + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (Union[int, Sequence]): Number of channels in the input + feature map. 
+ widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + num_base_priors (int): The number of priors (points) at a point + on the feature grid. + featmap_strides (Sequence[int]): Downsample factor of each feature map. + Defaults to [8, 16, 32]. + reg_max (int): Max value of integral set :math: ``{0, ..., reg_max-1}`` + in QFL setting. Defaults to 16. + norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization + layer. Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer. + Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + num_classes: int, + in_channels: Union[int, Sequence], + widen_factor: float = 1.0, + num_base_priors: int = 1, + featmap_strides: Sequence[int] = (8, 16, 32), + reg_max: int = 16, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None): + super().__init__(init_cfg=init_cfg) + self.num_classes = num_classes + self.featmap_strides = featmap_strides + self.num_levels = len(self.featmap_strides) + self.num_base_priors = num_base_priors + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.in_channels = in_channels + self.reg_max = reg_max + + in_channels = [] + for channel in self.in_channels: + channel = make_divisible(channel, widen_factor) + in_channels.append(channel) + self.in_channels = in_channels + + self._init_layers() + + def init_weights(self, prior_prob=0.01): + """Initialize the weight and bias of PPYOLOE head.""" + super().init_weights() + for reg_pred, cls_pred, stride in zip(self.reg_preds, self.cls_preds, + self.featmap_strides): + reg_pred[-1].bias.data[:] = 1.0 # box + # cls (.01 objects, 80 classes, 640 img) + cls_pred[-1].bias.data[:self.num_classes] = math.log( + 5 / self.num_classes / (640 / stride)**2) + + def _init_layers(self): + """initialize conv layers in YOLOv8 head.""" + # Init decouple head + self.cls_preds = nn.ModuleList() + self.reg_preds = nn.ModuleList() + + reg_out_channels = max( + (16, self.in_channels[0] // 4, self.reg_max * 4)) + cls_out_channels = max(self.in_channels[0], self.num_classes) + + for i in range(self.num_levels): + self.reg_preds.append( + nn.Sequential( + ConvModule( + in_channels=self.in_channels[i], + out_channels=reg_out_channels, + kernel_size=3, + stride=1, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + ConvModule( + in_channels=reg_out_channels, + out_channels=reg_out_channels, + kernel_size=3, + stride=1, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + nn.Conv2d( + in_channels=reg_out_channels, + out_channels=4 * self.reg_max, + kernel_size=1))) + self.cls_preds.append( + nn.Sequential( + ConvModule( + in_channels=self.in_channels[i], + out_channels=cls_out_channels, + kernel_size=3, + stride=1, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + ConvModule( + in_channels=cls_out_channels, + out_channels=cls_out_channels, + kernel_size=3, + stride=1, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + nn.Conv2d( + in_channels=cls_out_channels, + out_channels=self.num_classes, + kernel_size=1))) + + proj = torch.arange(self.reg_max, dtype=torch.float) + self.register_buffer('proj', proj, persistent=False) + + def forward(self, x: Tuple[Tensor]) -> 
Tuple[List]: + """Forward features from the upstream network. + + Args: + x (Tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + Returns: + Tuple[List]: A tuple of multi-level classification scores, bbox + predictions + """ + assert len(x) == self.num_levels + return multi_apply(self.forward_single, x, self.cls_preds, + self.reg_preds) + + def forward_single(self, x: torch.Tensor, cls_pred: nn.ModuleList, + reg_pred: nn.ModuleList) -> Tuple: + """Forward feature of a single scale level.""" + b, _, h, w = x.shape + cls_logit = cls_pred(x) + bbox_dist_preds = reg_pred(x) + if self.reg_max > 1: + bbox_dist_preds = bbox_dist_preds.reshape( + [-1, 4, self.reg_max, h * w]).permute(0, 3, 1, 2) + + # TODO: The get_flops script cannot handle the situation of + # matmul, and needs to be fixed later + # bbox_preds = bbox_dist_preds.softmax(3).matmul(self.proj) + bbox_preds = bbox_dist_preds.softmax(3).matmul( + self.proj.view([-1, 1])).squeeze(-1) + bbox_preds = bbox_preds.transpose(1, 2).reshape(b, -1, h, w) + else: + bbox_preds = bbox_dist_preds + if self.training: + return cls_logit, bbox_preds, bbox_dist_preds + else: + return cls_logit, bbox_preds + + +@MODELS.register_module() +class YOLOv8Head(YOLOv5Head): + """YOLOv8Head head used in `YOLOv8`. + + Args: + head_module(:obj:`ConfigDict` or dict): Base module used for YOLOv8Head + prior_generator(dict): Points generator feature maps + in 2D points-based detectors. + bbox_coder (:obj:`ConfigDict` or dict): Config of bbox coder. + loss_cls (:obj:`ConfigDict` or dict): Config of classification loss. + loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss. + loss_dfl (:obj:`ConfigDict` or dict): Config of Distribution Focal + Loss. + train_cfg (:obj:`ConfigDict` or dict, optional): Training config of + anchor head. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of + anchor head. Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + head_module: ConfigType, + prior_generator: ConfigType = dict( + type='mmdet.MlvlPointGenerator', + offset=0.5, + strides=[8, 16, 32]), + bbox_coder: ConfigType = dict(type='DistancePointBBoxCoder'), + loss_cls: ConfigType = dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='none', + loss_weight=0.5), + loss_bbox: ConfigType = dict( + type='IoULoss', + iou_mode='ciou', + bbox_format='xyxy', + reduction='sum', + loss_weight=7.5, + return_iou=False), + loss_dfl=dict( + type='mmdet.DistributionFocalLoss', + reduction='mean', + loss_weight=1.5 / 4), + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + init_cfg: OptMultiConfig = None + ): + super().__init__( + head_module=head_module, + prior_generator=prior_generator, + bbox_coder=bbox_coder, + loss_cls=loss_cls, + loss_bbox=loss_bbox, + train_cfg=train_cfg, + test_cfg=test_cfg, + init_cfg=init_cfg) + self.loss_dfl = MODELS.build(loss_dfl) + # YOLOv8 doesn't need loss_obj + self.loss_obj = None + + def special_init(self): + """Since YOLO series algorithms will inherit from YOLOv5Head, but + different algorithms have special initialization process. + + The special_init function is designed to deal with this situation. 
+ """ + + if self.train_cfg: + self.assigner = TASK_UTILS.build(self.train_cfg.assigner) + + # Add common attributes to reduce calculation + self.featmap_sizes_train = None + self.num_level_priors = None + self.flatten_priors_train = None + self.stride_tensor = None + + def loss_by_feat( + self, + cls_scores: Sequence[Tensor], + bbox_preds: Sequence[Tensor], + bbox_dist_preds: Sequence[Tensor], + batch_gt_instances: Sequence[InstanceData], + batch_img_metas: Sequence[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (Sequence[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_priors * num_classes. + bbox_preds (Sequence[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_priors * 4. + bbox_dist_preds (Sequence[Tensor]): Box distribution logits for + each scale level with shape (bs, reg_max + 1, H*W, 4). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + Returns: + dict[str, Tensor]: A dictionary of losses. + """ + num_imgs = len(batch_img_metas) + + current_featmap_sizes = [ + cls_score.shape[2:] for cls_score in cls_scores + ] + # If the shape does not equal, generate new one + if current_featmap_sizes != self.featmap_sizes_train: + self.featmap_sizes_train = current_featmap_sizes + + mlvl_priors_with_stride = self.prior_generator.grid_priors( + self.featmap_sizes_train, + dtype=cls_scores[0].dtype, + device=cls_scores[0].device, + with_stride=True) + + self.num_level_priors = [len(n) for n in mlvl_priors_with_stride] + self.flatten_priors_train = torch.cat( + mlvl_priors_with_stride, dim=0) + self.stride_tensor = self.flatten_priors_train[..., [2]] + + # gt info + gt_info = gt_instances_preprocess(batch_gt_instances, num_imgs) + gt_labels = gt_info[:, :, :1] + gt_bboxes = gt_info[:, :, 1:] # xyxy + pad_bbox_flag = (gt_bboxes.sum(-1, keepdim=True) > 0).float() + + # pred info + flatten_cls_preds = [ + cls_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.num_classes) + for cls_pred in cls_scores + ] + flatten_pred_bboxes = [ + bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) + for bbox_pred in bbox_preds + ] + # (bs, n, 4 * reg_max) + flatten_pred_dists = [ + bbox_pred_org.reshape(num_imgs, -1, self.head_module.reg_max * 4) + for bbox_pred_org in bbox_dist_preds + ] + + flatten_dist_preds = torch.cat(flatten_pred_dists, dim=1) + flatten_cls_preds = torch.cat(flatten_cls_preds, dim=1) + flatten_pred_bboxes = torch.cat(flatten_pred_bboxes, dim=1) + flatten_pred_bboxes = self.bbox_coder.decode( + self.flatten_priors_train[..., :2], flatten_pred_bboxes, + self.stride_tensor[..., 0]) + + assigned_result = self.assigner( + (flatten_pred_bboxes.detach()).type(gt_bboxes.dtype), + flatten_cls_preds.detach().sigmoid(), self.flatten_priors_train, + gt_labels, gt_bboxes, pad_bbox_flag) + + assigned_bboxes = assigned_result['assigned_bboxes'] + assigned_scores = assigned_result['assigned_scores'] + fg_mask_pre_prior = assigned_result['fg_mask_pre_prior'] + + assigned_scores_sum = 
assigned_scores.sum().clamp(min=1) + + loss_cls = self.loss_cls(flatten_cls_preds, assigned_scores).sum() + loss_cls /= assigned_scores_sum + + # rescale bbox + assigned_bboxes /= self.stride_tensor + flatten_pred_bboxes /= self.stride_tensor + + # select positive samples mask + num_pos = fg_mask_pre_prior.sum() + if num_pos > 0: + # when num_pos > 0, assigned_scores_sum will >0, so the loss_bbox + # will not report an error + # iou loss + prior_bbox_mask = fg_mask_pre_prior.unsqueeze(-1).repeat([1, 1, 4]) + pred_bboxes_pos = torch.masked_select( + flatten_pred_bboxes, prior_bbox_mask).reshape([-1, 4]) + assigned_bboxes_pos = torch.masked_select( + assigned_bboxes, prior_bbox_mask).reshape([-1, 4]) + bbox_weight = torch.masked_select( + assigned_scores.sum(-1), fg_mask_pre_prior).unsqueeze(-1) + loss_bbox = self.loss_bbox( + pred_bboxes_pos, assigned_bboxes_pos, + weight=bbox_weight) / assigned_scores_sum + + # dfl loss + pred_dist_pos = flatten_dist_preds[fg_mask_pre_prior] + assigned_ltrb = self.bbox_coder.encode( + self.flatten_priors_train[..., :2] / self.stride_tensor, + assigned_bboxes, + max_dis=self.head_module.reg_max - 1, + eps=0.01) + assigned_ltrb_pos = torch.masked_select( + assigned_ltrb, prior_bbox_mask).reshape([-1, 4]) + loss_dfl = self.loss_dfl( + pred_dist_pos.reshape(-1, self.head_module.reg_max), + assigned_ltrb_pos.reshape(-1), + weight=bbox_weight.expand(-1, 4).reshape(-1), + avg_factor=assigned_scores_sum) + else: + loss_bbox = flatten_pred_bboxes.sum() * 0 + loss_dfl = flatten_pred_bboxes.sum() * 0 + _, world_size = get_dist_info() + return dict( + loss_cls=loss_cls * num_imgs * world_size, + loss_bbox=loss_bbox * num_imgs * world_size, + loss_dfl=loss_dfl * num_imgs * world_size) diff --git a/mmyolo/models/dense_heads/yolox_head.py b/mmyolo/models/dense_heads/yolox_head.py new file mode 100644 index 0000000000000000000000000000000000000000..a203298d8536148a7022711eabeee7f04fea8ab4 --- /dev/null +++ b/mmyolo/models/dense_heads/yolox_head.py @@ -0,0 +1,514 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Sequence, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule +from mmdet.models.task_modules.samplers import PseudoSampler +from mmdet.models.utils import multi_apply +from mmdet.structures.bbox import bbox_xyxy_to_cxcywh +from mmdet.utils import (ConfigType, OptConfigType, OptInstanceList, + OptMultiConfig, reduce_mean) +from mmengine.model import BaseModule, bias_init_with_prob +from mmengine.structures import InstanceData +from torch import Tensor + +from mmyolo.registry import MODELS, TASK_UTILS +from .yolov5_head import YOLOv5Head + + +@MODELS.register_module() +class YOLOXHeadModule(BaseModule): + """YOLOXHead head module used in `YOLOX. + + ``_ + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (Union[int, Sequence]): Number of channels in the input + feature map. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + num_base_priors (int): The number of priors (points) at a point + on the feature grid + stacked_convs (int): Number of stacking convs of the head. + Defaults to 2. + featmap_strides (Sequence[int]): Downsample factor of each feature map. + Defaults to [8, 16, 32]. + use_depthwise (bool): Whether to depthwise separable convolution in + blocks. Defaults to False. 
+ dcn_on_last_conv (bool): If true, use dcn in the last layer of + towers. Defaults to False. + conv_bias (bool or str): If specified as `auto`, it will be decided by + the norm_cfg. Bias of conv will be set as True if `norm_cfg` is + None, otherwise False. Defaults to "auto". + conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + convolution layer. Defaults to None. + norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization + layer. Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer. + Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__( + self, + num_classes: int, + in_channels: Union[int, Sequence], + widen_factor: float = 1.0, + num_base_priors: int = 1, + feat_channels: int = 256, + stacked_convs: int = 2, + featmap_strides: Sequence[int] = [8, 16, 32], + use_depthwise: bool = False, + dcn_on_last_conv: bool = False, + conv_bias: Union[bool, str] = 'auto', + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict(type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None, + ): + super().__init__(init_cfg=init_cfg) + self.num_classes = num_classes + self.feat_channels = int(feat_channels * widen_factor) + self.stacked_convs = stacked_convs + self.use_depthwise = use_depthwise + self.dcn_on_last_conv = dcn_on_last_conv + assert conv_bias == 'auto' or isinstance(conv_bias, bool) + self.conv_bias = conv_bias + self.num_base_priors = num_base_priors + + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.featmap_strides = featmap_strides + + if isinstance(in_channels, int): + in_channels = int(in_channels * widen_factor) + self.in_channels = in_channels + + self._init_layers() + + def _init_layers(self): + """Initialize heads for all level feature maps.""" + self.multi_level_cls_convs = nn.ModuleList() + self.multi_level_reg_convs = nn.ModuleList() + self.multi_level_conv_cls = nn.ModuleList() + self.multi_level_conv_reg = nn.ModuleList() + self.multi_level_conv_obj = nn.ModuleList() + for _ in self.featmap_strides: + self.multi_level_cls_convs.append(self._build_stacked_convs()) + self.multi_level_reg_convs.append(self._build_stacked_convs()) + conv_cls, conv_reg, conv_obj = self._build_predictor() + self.multi_level_conv_cls.append(conv_cls) + self.multi_level_conv_reg.append(conv_reg) + self.multi_level_conv_obj.append(conv_obj) + + def _build_stacked_convs(self) -> nn.Sequential: + """Initialize conv layers of a single level head.""" + conv = DepthwiseSeparableConvModule \ + if self.use_depthwise else ConvModule + stacked_convs = [] + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + if self.dcn_on_last_conv and i == self.stacked_convs - 1: + conv_cfg = dict(type='DCNv2') + else: + conv_cfg = self.conv_cfg + stacked_convs.append( + conv( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + bias=self.conv_bias)) + return nn.Sequential(*stacked_convs) + + def _build_predictor(self) -> Tuple[nn.Module, nn.Module, nn.Module]: + """Initialize predictor layers of a single level head.""" + conv_cls = nn.Conv2d(self.feat_channels, self.num_classes, 1) + conv_reg = nn.Conv2d(self.feat_channels, 4, 1) + conv_obj = 
nn.Conv2d(self.feat_channels, 1, 1) + return conv_cls, conv_reg, conv_obj + + def init_weights(self): + """Initialize weights of the head.""" + # Use prior in model initialization to improve stability + super().init_weights() + bias_init = bias_init_with_prob(0.01) + for conv_cls, conv_obj in zip(self.multi_level_conv_cls, + self.multi_level_conv_obj): + conv_cls.bias.data.fill_(bias_init) + conv_obj.bias.data.fill_(bias_init) + + def forward(self, x: Tuple[Tensor]) -> Tuple[List]: + """Forward features from the upstream network. + + Args: + x (Tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + Returns: + Tuple[List]: A tuple of multi-level classification scores, bbox + predictions, and objectnesses. + """ + + return multi_apply(self.forward_single, x, self.multi_level_cls_convs, + self.multi_level_reg_convs, + self.multi_level_conv_cls, + self.multi_level_conv_reg, + self.multi_level_conv_obj) + + def forward_single(self, x: Tensor, cls_convs: nn.Module, + reg_convs: nn.Module, conv_cls: nn.Module, + conv_reg: nn.Module, + conv_obj: nn.Module) -> Tuple[Tensor, Tensor, Tensor]: + """Forward feature of a single scale level.""" + + cls_feat = cls_convs(x) + reg_feat = reg_convs(x) + + cls_score = conv_cls(cls_feat) + bbox_pred = conv_reg(reg_feat) + objectness = conv_obj(reg_feat) + + return cls_score, bbox_pred, objectness + + +@MODELS.register_module() +class YOLOXHead(YOLOv5Head): + """YOLOXHead head used in `YOLOX `_. + + Args: + head_module(ConfigType): Base module used for YOLOXHead + prior_generator: Points generator feature maps in + 2D points-based detectors. + loss_cls (:obj:`ConfigDict` or dict): Config of classification loss. + loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss. + loss_obj (:obj:`ConfigDict` or dict): Config of objectness loss. + loss_bbox_aux (:obj:`ConfigDict` or dict): Config of bbox aux loss. + train_cfg (:obj:`ConfigDict` or dict, optional): Training config of + anchor head. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of + anchor head. Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + head_module: ConfigType, + prior_generator: ConfigType = dict( + type='mmdet.MlvlPointGenerator', + offset=0, + strides=[8, 16, 32]), + bbox_coder: ConfigType = dict(type='YOLOXBBoxCoder'), + loss_cls: ConfigType = dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='sum', + loss_weight=1.0), + loss_bbox: ConfigType = dict( + type='mmdet.IoULoss', + mode='square', + eps=1e-16, + reduction='sum', + loss_weight=5.0), + loss_obj: ConfigType = dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='sum', + loss_weight=1.0), + loss_bbox_aux: ConfigType = dict( + type='mmdet.L1Loss', reduction='sum', loss_weight=1.0), + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + init_cfg: OptMultiConfig = None): + self.use_bbox_aux = False + self.loss_bbox_aux = loss_bbox_aux + + super().__init__( + head_module=head_module, + prior_generator=prior_generator, + bbox_coder=bbox_coder, + loss_cls=loss_cls, + loss_bbox=loss_bbox, + loss_obj=loss_obj, + train_cfg=train_cfg, + test_cfg=test_cfg, + init_cfg=init_cfg) + + def special_init(self): + """Since YOLO series algorithms will inherit from YOLOv5Head, but + different algorithms have special initialization process. 
+ + The special_init function is designed to deal with this situation. + """ + self.loss_bbox_aux: nn.Module = MODELS.build(self.loss_bbox_aux) + if self.train_cfg: + self.assigner = TASK_UTILS.build(self.train_cfg.assigner) + # YOLOX does not support sampling + self.sampler = PseudoSampler() + + def forward(self, x: Tuple[Tensor]) -> Tuple[List]: + return self.head_module(x) + + def loss_by_feat( + self, + cls_scores: Sequence[Tensor], + bbox_preds: Sequence[Tensor], + objectnesses: Sequence[Tensor], + batch_gt_instances: Tensor, + batch_img_metas: Sequence[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (Sequence[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_priors * num_classes. + bbox_preds (Sequence[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_priors * 4. + objectnesses (Sequence[Tensor]): Score factor for + all scale level, each is a 4D-tensor, has shape + (batch_size, 1, H, W). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + Returns: + dict[str, Tensor]: A dictionary of losses. + """ + num_imgs = len(batch_img_metas) + if batch_gt_instances_ignore is None: + batch_gt_instances_ignore = [None] * num_imgs + + batch_gt_instances = self.gt_instances_preprocess( + batch_gt_instances, len(batch_img_metas)) + + featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores] + mlvl_priors = self.prior_generator.grid_priors( + featmap_sizes, + dtype=cls_scores[0].dtype, + device=cls_scores[0].device, + with_stride=True) + + flatten_cls_preds = [ + cls_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.num_classes) + for cls_pred in cls_scores + ] + flatten_bbox_preds = [ + bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) + for bbox_pred in bbox_preds + ] + flatten_objectness = [ + objectness.permute(0, 2, 3, 1).reshape(num_imgs, -1) + for objectness in objectnesses + ] + + flatten_cls_preds = torch.cat(flatten_cls_preds, dim=1) + flatten_bbox_preds = torch.cat(flatten_bbox_preds, dim=1) + flatten_objectness = torch.cat(flatten_objectness, dim=1) + flatten_priors = torch.cat(mlvl_priors) + flatten_bboxes = self.bbox_coder.decode(flatten_priors[..., :2], + flatten_bbox_preds, + flatten_priors[..., 2]) + + (pos_masks, cls_targets, obj_targets, bbox_targets, bbox_aux_target, + num_fg_imgs) = multi_apply( + self._get_targets_single, + flatten_priors.unsqueeze(0).repeat(num_imgs, 1, 1), + flatten_cls_preds.detach(), flatten_bboxes.detach(), + flatten_objectness.detach(), batch_gt_instances, batch_img_metas, + batch_gt_instances_ignore) + + # The experimental results show that 'reduce_mean' can improve + # performance on the COCO dataset. 
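+        # reduce_mean averages the positive-sample count across all GPUs, so
+        # every rank normalizes its loss by the same factor in distributed
+        # training. For example (hypothetical numbers), with two GPUs seeing
+        # 12 and 8 positives respectively, both ranks divide by 10 instead
+        # of by their local counts.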
+ num_pos = torch.tensor( + sum(num_fg_imgs), + dtype=torch.float, + device=flatten_cls_preds.device) + num_total_samples = max(reduce_mean(num_pos), 1.0) + + pos_masks = torch.cat(pos_masks, 0) + cls_targets = torch.cat(cls_targets, 0) + obj_targets = torch.cat(obj_targets, 0) + bbox_targets = torch.cat(bbox_targets, 0) + if self.use_bbox_aux: + bbox_aux_target = torch.cat(bbox_aux_target, 0) + + loss_obj = self.loss_obj(flatten_objectness.view(-1, 1), + obj_targets) / num_total_samples + if num_pos > 0: + loss_cls = self.loss_cls( + flatten_cls_preds.view(-1, self.num_classes)[pos_masks], + cls_targets) / num_total_samples + loss_bbox = self.loss_bbox( + flatten_bboxes.view(-1, 4)[pos_masks], + bbox_targets) / num_total_samples + else: + # Avoid cls and reg branch not participating in the gradient + # propagation when there is no ground-truth in the images. + # For more details, please refer to + # https://github.com/open-mmlab/mmdetection/issues/7298 + loss_cls = flatten_cls_preds.sum() * 0 + loss_bbox = flatten_bboxes.sum() * 0 + + loss_dict = dict( + loss_cls=loss_cls, loss_bbox=loss_bbox, loss_obj=loss_obj) + + if self.use_bbox_aux: + if num_pos > 0: + loss_bbox_aux = self.loss_bbox_aux( + flatten_bbox_preds.view(-1, 4)[pos_masks], + bbox_aux_target) / num_total_samples + else: + # Avoid cls and reg branch not participating in the gradient + # propagation when there is no ground-truth in the images. + # For more details, please refer to + # https://github.com/open-mmlab/mmdetection/issues/7298 + loss_bbox_aux = flatten_bbox_preds.sum() * 0 + loss_dict.update(loss_bbox_aux=loss_bbox_aux) + + return loss_dict + + @torch.no_grad() + def _get_targets_single( + self, + priors: Tensor, + cls_preds: Tensor, + decoded_bboxes: Tensor, + objectness: Tensor, + gt_instances: InstanceData, + img_meta: dict, + gt_instances_ignore: Optional[InstanceData] = None) -> tuple: + """Compute classification, regression, and objectness targets for + priors in a single image. + + Args: + priors (Tensor): All priors of one image, a 2D-Tensor with shape + [num_priors, 4] in [cx, xy, stride_w, stride_y] format. + cls_preds (Tensor): Classification predictions of one image, + a 2D-Tensor with shape [num_priors, num_classes] + decoded_bboxes (Tensor): Decoded bboxes predictions of one image, + a 2D-Tensor with shape [num_priors, 4] in [tl_x, tl_y, + br_x, br_y] format. + objectness (Tensor): Objectness predictions of one image, + a 1D-Tensor with shape [num_priors] + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It should includes ``bboxes`` and ``labels`` + attributes. + img_meta (dict): Meta information for current image. + gt_instances_ignore (:obj:`InstanceData`, optional): Instances + to be ignored during training. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + Returns: + tuple: + foreground_mask (list[Tensor]): Binary mask of foreground + targets. + cls_target (list[Tensor]): Classification targets of an image. + obj_target (list[Tensor]): Objectness targets of an image. + bbox_target (list[Tensor]): BBox targets of an image. + bbox_aux_target (int): BBox aux targets of an image. + num_pos_per_img (int): Number of positive samples in an image. 
+ """ + + num_priors = priors.size(0) + num_gts = len(gt_instances) + # No target + if num_gts == 0: + cls_target = cls_preds.new_zeros((0, self.num_classes)) + bbox_target = cls_preds.new_zeros((0, 4)) + bbox_aux_target = cls_preds.new_zeros((0, 4)) + obj_target = cls_preds.new_zeros((num_priors, 1)) + foreground_mask = cls_preds.new_zeros(num_priors).bool() + return (foreground_mask, cls_target, obj_target, bbox_target, + bbox_aux_target, 0) + + # YOLOX uses center priors with 0.5 offset to assign targets, + # but use center priors without offset to regress bboxes. + offset_priors = torch.cat( + [priors[:, :2] + priors[:, 2:] * 0.5, priors[:, 2:]], dim=-1) + + scores = cls_preds.sigmoid() * objectness.unsqueeze(1).sigmoid() + pred_instances = InstanceData( + bboxes=decoded_bboxes, scores=scores.sqrt_(), priors=offset_priors) + assign_result = self.assigner.assign( + pred_instances=pred_instances, + gt_instances=gt_instances, + gt_instances_ignore=gt_instances_ignore) + + sampling_result = self.sampler.sample(assign_result, pred_instances, + gt_instances) + pos_inds = sampling_result.pos_inds + num_pos_per_img = pos_inds.size(0) + + pos_ious = assign_result.max_overlaps[pos_inds] + # IOU aware classification score + cls_target = F.one_hot(sampling_result.pos_gt_labels, + self.num_classes) * pos_ious.unsqueeze(-1) + obj_target = torch.zeros_like(objectness).unsqueeze(-1) + obj_target[pos_inds] = 1 + bbox_target = sampling_result.pos_gt_bboxes + bbox_aux_target = cls_preds.new_zeros((num_pos_per_img, 4)) + if self.use_bbox_aux: + bbox_aux_target = self._get_bbox_aux_target( + bbox_aux_target, bbox_target, priors[pos_inds]) + foreground_mask = torch.zeros_like(objectness).to(torch.bool) + foreground_mask[pos_inds] = 1 + return (foreground_mask, cls_target, obj_target, bbox_target, + bbox_aux_target, num_pos_per_img) + + def _get_bbox_aux_target(self, + bbox_aux_target: Tensor, + gt_bboxes: Tensor, + priors: Tensor, + eps: float = 1e-8) -> Tensor: + """Convert gt bboxes to center offset and log width height.""" + gt_cxcywh = bbox_xyxy_to_cxcywh(gt_bboxes) + bbox_aux_target[:, :2] = (gt_cxcywh[:, :2] - + priors[:, :2]) / priors[:, 2:] + bbox_aux_target[:, + 2:] = torch.log(gt_cxcywh[:, 2:] / priors[:, 2:] + eps) + return bbox_aux_target + + @staticmethod + def gt_instances_preprocess(batch_gt_instances: Tensor, + batch_size: int) -> List[InstanceData]: + """Split batch_gt_instances with batch size. + + Args: + batch_gt_instances (Tensor): Ground truth + a 2D-Tensor for whole batch, shape [all_gt_bboxes, 6] + batch_size (int): Batch size. + + Returns: + List: batch gt instances data, shape [batch_size, InstanceData] + """ + # faster version + batch_instance_list = [] + for i in range(batch_size): + batch_gt_instance_ = InstanceData() + single_batch_instance = \ + batch_gt_instances[batch_gt_instances[:, 0] == i, :] + batch_gt_instance_.bboxes = single_batch_instance[:, 2:] + batch_gt_instance_.labels = single_batch_instance[:, 1] + batch_instance_list.append(batch_gt_instance_) + + return batch_instance_list diff --git a/mmyolo/models/detectors/__init__.py b/mmyolo/models/detectors/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..74fb1c6c21c5840a5cd3f45a1a9f827c0e670604 --- /dev/null +++ b/mmyolo/models/detectors/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .yolo_detector import YOLODetector + +__all__ = ['YOLODetector'] diff --git a/mmyolo/models/detectors/yolo_detector.py b/mmyolo/models/detectors/yolo_detector.py new file mode 100644 index 0000000000000000000000000000000000000000..e6783fbab41287df54f136ea121e827d0603414f --- /dev/null +++ b/mmyolo/models/detectors/yolo_detector.py @@ -0,0 +1,53 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from mmdet.models.detectors.single_stage import SingleStageDetector +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from mmengine.dist import get_world_size +from mmengine.logging import print_log + +from mmyolo.registry import MODELS + + +@MODELS.register_module() +class YOLODetector(SingleStageDetector): + r"""Implementation of YOLO Series + + Args: + backbone (:obj:`ConfigDict` or dict): The backbone config. + neck (:obj:`ConfigDict` or dict): The neck config. + bbox_head (:obj:`ConfigDict` or dict): The bbox head config. + train_cfg (:obj:`ConfigDict` or dict, optional): The training config + of YOLO. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): The testing config + of YOLO. Defaults to None. + data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of + :class:`DetDataPreprocessor` to process the input data. + Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. + use_syncbn (bool): whether to use SyncBatchNorm. Defaults to True. + """ + + def __init__(self, + backbone: ConfigType, + neck: ConfigType, + bbox_head: ConfigType, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None, + use_syncbn: bool = True): + super().__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) + + # TODO: Waiting for mmengine support + if use_syncbn and get_world_size() > 1: + torch.nn.SyncBatchNorm.convert_sync_batchnorm(self) + print_log('Using SyncBatchNorm()', 'current') diff --git a/mmyolo/models/layers/__init__.py b/mmyolo/models/layers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f709dbb7e4bbd4c1a9d294a8d4cd28de2f2e457f --- /dev/null +++ b/mmyolo/models/layers/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .ema import ExpMomentumEMA +from .yolo_bricks import (BepC3StageBlock, CSPLayerWithTwoConv, + DarknetBottleneck, EELANBlock, EffectiveSELayer, + ELANBlock, ImplicitA, ImplicitM, + MaxPoolAndStrideConvBlock, PPYOLOEBasicBlock, + RepStageBlock, RepVGGBlock, SPPFBottleneck, + SPPFCSPBlock, TinyDownSampleBlock) + +__all__ = [ + 'SPPFBottleneck', 'RepVGGBlock', 'RepStageBlock', 'ExpMomentumEMA', + 'ELANBlock', 'MaxPoolAndStrideConvBlock', 'SPPFCSPBlock', + 'PPYOLOEBasicBlock', 'EffectiveSELayer', 'TinyDownSampleBlock', + 'EELANBlock', 'ImplicitA', 'ImplicitM', 'BepC3StageBlock', + 'CSPLayerWithTwoConv', 'DarknetBottleneck' +] diff --git a/mmyolo/models/layers/ema.py b/mmyolo/models/layers/ema.py new file mode 100644 index 0000000000000000000000000000000000000000..02ed204190ee4a5ab9395eddce5866545caac2c0 --- /dev/null +++ b/mmyolo/models/layers/ema.py @@ -0,0 +1,96 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
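The use_syncbn branch in YOLODetector above relies on PyTorch's built-in conversion helper; a minimal stand-alone sketch on a toy module is shown below (the swap itself does not need a distributed launch, although SyncBatchNorm only synchronizes statistics when training with more than one process, which is why the detector also checks get_world_size()).

import torch.nn as nn

toy = nn.Sequential(nn.Conv2d(3, 8, 3), nn.BatchNorm2d(8), nn.ReLU())
toy = nn.SyncBatchNorm.convert_sync_batchnorm(toy)   # BN layers replaced in place
print(type(toy[1]).__name__)                         # SyncBatchNorm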
+import math +from typing import Optional + +import torch +import torch.nn as nn +from mmdet.models.layers import ExpMomentumEMA as MMDET_ExpMomentumEMA +from torch import Tensor + +from mmyolo.registry import MODELS + + +@MODELS.register_module() +class ExpMomentumEMA(MMDET_ExpMomentumEMA): + """Exponential moving average (EMA) with exponential momentum strategy, + which is used in YOLO. + + Args: + model (nn.Module): The model to be averaged. + momentum (float): The momentum used for updating ema parameter. + Ema's parameters are updated with the formula: + `averaged_param = (1-momentum) * averaged_param + momentum * + source_param`. Defaults to 0.0002. + gamma (int): Use a larger momentum early in training and gradually + annealing to a smaller value to update the ema model smoothly. The + momentum is calculated as + `(1 - momentum) * exp(-(1 + steps) / gamma) + momentum`. + Defaults to 2000. + interval (int): Interval between two updates. Defaults to 1. + device (torch.device, optional): If provided, the averaged model will + be stored on the :attr:`device`. Defaults to None. + update_buffers (bool): if True, it will compute running averages for + both the parameters and the buffers of the model. Defaults to + False. + """ + + def __init__(self, + model: nn.Module, + momentum: float = 0.0002, + gamma: int = 2000, + interval=1, + device: Optional[torch.device] = None, + update_buffers: bool = False): + super().__init__( + model=model, + momentum=momentum, + interval=interval, + device=device, + update_buffers=update_buffers) + assert gamma > 0, f'gamma must be greater than 0, but got {gamma}' + self.gamma = gamma + + # Note: There is no need to re-fetch every update, + # as most models do not change their structure + # during the training process. + self.src_parameters = ( + model.state_dict() + if self.update_buffers else dict(model.named_parameters())) + if not self.update_buffers: + self.src_buffers = model.buffers() + + def avg_func(self, averaged_param: Tensor, source_param: Tensor, + steps: int): + """Compute the moving average of the parameters using the exponential + momentum strategy. + + Args: + averaged_param (Tensor): The averaged parameters. + source_param (Tensor): The source parameters. + steps (int): The number of times the parameters have been + updated. + """ + momentum = (1 - self.momentum) * math.exp( + -float(1 + steps) / self.gamma) + self.momentum + averaged_param.lerp_(source_param, momentum) + + def update_parameters(self, model: nn.Module): + """Update the parameters after each training step. + + Args: + model (nn.Module): The model of the parameter needs to be updated. + """ + if self.steps == 0: + for k, p_avg in self.avg_parameters.items(): + p_avg.data.copy_(self.src_parameters[k].data) + elif self.steps % self.interval == 0: + for k, p_avg in self.avg_parameters.items(): + if p_avg.dtype.is_floating_point: + self.avg_func(p_avg.data, self.src_parameters[k].data, + self.steps) + if not self.update_buffers: + # If not update the buffers, + # keep the buffers in sync with the source model. + for b_avg, b_src in zip(self.module.buffers(), self.src_buffers): + b_avg.data.copy_(b_src.data) + self.steps += 1 diff --git a/mmyolo/models/layers/yolo_bricks.py b/mmyolo/models/layers/yolo_bricks.py new file mode 100644 index 0000000000000000000000000000000000000000..2e69d528bf6268a895f913fac25d89f5f35e3301 --- /dev/null +++ b/mmyolo/models/layers/yolo_bricks.py @@ -0,0 +1,1510 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
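The momentum schedule implemented by ExpMomentumEMA.avg_func above starts near 1.0 (so the EMA tracks the source model closely early in training) and decays toward the small base momentum. A minimal sketch with the default hyper-parameters; the printed values are approximate.

import math

def ema_momentum(step, momentum=0.0002, gamma=2000):
    # momentum applied at a given update step by avg_func above
    return (1 - momentum) * math.exp(-float(1 + step) / gamma) + momentum

for step in (0, 1000, 5000, 20000):
    print(step, round(ema_momentum(step), 6))
# ≈ 0.9995, ≈ 0.606, ≈ 0.082, ≈ 0.00025 — decaying toward the 0.0002 floor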
+from typing import Optional, Sequence, Tuple, Union + +import numpy as np +import torch +import torch.nn as nn +from mmcv.cnn import (ConvModule, DepthwiseSeparableConvModule, MaxPool2d, + build_norm_layer) +from mmdet.models.layers.csp_layer import \ + DarknetBottleneck as MMDET_DarknetBottleneck +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from mmengine.model import BaseModule +from mmengine.utils import digit_version +from torch import Tensor + +from mmyolo.registry import MODELS + +if digit_version(torch.__version__) >= digit_version('1.7.0'): + MODELS.register_module(module=nn.SiLU, name='SiLU') +else: + + class SiLU(nn.Module): + """Sigmoid Weighted Liner Unit.""" + + def __init__(self, inplace=True): + super().__init__() + + def forward(self, inputs) -> Tensor: + return inputs * torch.sigmoid(inputs) + + MODELS.register_module(module=SiLU, name='SiLU') + + +class SPPFBottleneck(BaseModule): + """Spatial pyramid pooling - Fast (SPPF) layer for + YOLOv5, YOLOX and PPYOLOE by Glenn Jocher + + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The output channels of this Module. + kernel_sizes (int, tuple[int]): Sequential or number of kernel + sizes of pooling layers. Defaults to 5. + use_conv_first (bool): Whether to use conv before pooling layer. + In YOLOv5 and YOLOX, the para set to True. + In PPYOLOE, the para set to False. + Defaults to True. + mid_channels_scale (float): Channel multiplier, multiply in_channels + by this amount to get mid_channels. This parameter is valid only + when use_conv_fist=True.Defaults to 0.5. + conv_cfg (dict): Config dict for convolution layer. Defaults to None. + which means using conv2d. Defaults to None. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + in_channels: int, + out_channels: int, + kernel_sizes: Union[int, Sequence[int]] = 5, + use_conv_first: bool = True, + mid_channels_scale: float = 0.5, + conv_cfg: ConfigType = None, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None): + super().__init__(init_cfg) + + if use_conv_first: + mid_channels = int(in_channels * mid_channels_scale) + self.conv1 = ConvModule( + in_channels, + mid_channels, + 1, + stride=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + else: + mid_channels = in_channels + self.conv1 = None + self.kernel_sizes = kernel_sizes + if isinstance(kernel_sizes, int): + self.poolings = nn.MaxPool2d( + kernel_size=kernel_sizes, stride=1, padding=kernel_sizes // 2) + conv2_in_channels = mid_channels * 4 + else: + self.poolings = nn.ModuleList([ + nn.MaxPool2d(kernel_size=ks, stride=1, padding=ks // 2) + for ks in kernel_sizes + ]) + conv2_in_channels = mid_channels * (len(kernel_sizes) + 1) + + self.conv2 = ConvModule( + conv2_in_channels, + out_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + def forward(self, x: Tensor) -> Tensor: + """Forward process + Args: + x (Tensor): The input tensor. 
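The sequential-pooling trick used by SPPFBottleneck (see the forward body that follows) relies on repeated stride-1 5x5 max-poolings covering the same receptive fields as the parallel 5/9/13 poolings of the original SPP layer. This can be checked directly; the snippet below is an independent sanity sketch, not part of the module.

import torch
import torch.nn as nn

x = torch.randn(1, 4, 32, 32)
p5 = nn.MaxPool2d(5, stride=1, padding=2)
p9 = nn.MaxPool2d(9, stride=1, padding=4)
p13 = nn.MaxPool2d(13, stride=1, padding=6)

y1 = p5(x)
y2 = p5(y1)          # same as a single 9x9 pooling
y3 = p5(y2)          # same as a single 13x13 pooling
print(torch.equal(y2, p9(x)), torch.equal(y3, p13(x)))  # True True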
+ """ + if self.conv1: + x = self.conv1(x) + if isinstance(self.kernel_sizes, int): + y1 = self.poolings(x) + y2 = self.poolings(y1) + x = torch.cat([x, y1, y2, self.poolings(y2)], dim=1) + else: + x = torch.cat( + [x] + [pooling(x) for pooling in self.poolings], dim=1) + x = self.conv2(x) + return x + + +@MODELS.register_module() +class RepVGGBlock(nn.Module): + """RepVGGBlock is a basic rep-style block, including training and deploy + status This code is based on + https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py. + + Args: + in_channels (int): Number of channels in the input image + out_channels (int): Number of channels produced by the convolution + kernel_size (int or tuple): Size of the convolving kernel + stride (int or tuple): Stride of the convolution. Default: 1 + padding (int, tuple): Padding added to all four sides of + the input. Default: 1 + dilation (int or tuple): Spacing between kernel elements. Default: 1 + groups (int, optional): Number of blocked connections from input + channels to output channels. Default: 1 + padding_mode (string, optional): Default: 'zeros' + use_se (bool): Whether to use se. Default: False + use_alpha (bool): Whether to use `alpha` parameter at 1x1 conv. + In PPYOLOE+ model backbone, `use_alpha` will be set to True. + Default: False. + use_bn_first (bool): Whether to use bn layer before conv. + In YOLOv6 and YOLOv7, this will be set to True. + In PPYOLOE, this will be set to False. + Default: True. + deploy (bool): Whether in deploy mode. Default: False + """ + + def __init__(self, + in_channels: int, + out_channels: int, + kernel_size: Union[int, Tuple[int]] = 3, + stride: Union[int, Tuple[int]] = 1, + padding: Union[int, Tuple[int]] = 1, + dilation: Union[int, Tuple[int]] = 1, + groups: Optional[int] = 1, + padding_mode: Optional[str] = 'zeros', + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='ReLU', inplace=True), + use_se: bool = False, + use_alpha: bool = False, + use_bn_first=True, + deploy: bool = False): + super().__init__() + self.deploy = deploy + self.groups = groups + self.in_channels = in_channels + self.out_channels = out_channels + + assert kernel_size == 3 + assert padding == 1 + + padding_11 = padding - kernel_size // 2 + + self.nonlinearity = MODELS.build(act_cfg) + + if use_se: + raise NotImplementedError('se block not supported yet') + else: + self.se = nn.Identity() + + if use_alpha: + alpha = torch.ones([ + 1, + ], dtype=torch.float32, requires_grad=True) + self.alpha = nn.Parameter(alpha, requires_grad=True) + else: + self.alpha = None + + if deploy: + self.rbr_reparam = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias=True, + padding_mode=padding_mode) + + else: + if use_bn_first and (out_channels == in_channels) and stride == 1: + self.rbr_identity = build_norm_layer( + norm_cfg, num_features=in_channels)[1] + else: + self.rbr_identity = None + + self.rbr_dense = ConvModule( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + bias=False, + norm_cfg=norm_cfg, + act_cfg=None) + self.rbr_1x1 = ConvModule( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=stride, + padding=padding_11, + groups=groups, + bias=False, + norm_cfg=norm_cfg, + act_cfg=None) + + def forward(self, inputs: Tensor) -> Tensor: + """Forward process. 
+ Args: + inputs (Tensor): The input tensor. + + Returns: + Tensor: The output tensor. + """ + if hasattr(self, 'rbr_reparam'): + return self.nonlinearity(self.se(self.rbr_reparam(inputs))) + + if self.rbr_identity is None: + id_out = 0 + else: + id_out = self.rbr_identity(inputs) + if self.alpha: + return self.nonlinearity( + self.se( + self.rbr_dense(inputs) + + self.alpha * self.rbr_1x1(inputs) + id_out)) + else: + return self.nonlinearity( + self.se( + self.rbr_dense(inputs) + self.rbr_1x1(inputs) + id_out)) + + def get_equivalent_kernel_bias(self): + """Derives the equivalent kernel and bias in a differentiable way. + + Returns: + tuple: Equivalent kernel and bias + """ + kernel3x3, bias3x3 = self._fuse_bn_tensor(self.rbr_dense) + kernel1x1, bias1x1 = self._fuse_bn_tensor(self.rbr_1x1) + kernelid, biasid = self._fuse_bn_tensor(self.rbr_identity) + if self.alpha: + return kernel3x3 + self.alpha * self._pad_1x1_to_3x3_tensor( + kernel1x1) + kernelid, bias3x3 + self.alpha * bias1x1 + biasid + else: + return kernel3x3 + self._pad_1x1_to_3x3_tensor( + kernel1x1) + kernelid, bias3x3 + bias1x1 + biasid + + def _pad_1x1_to_3x3_tensor(self, kernel1x1): + """Pad 1x1 tensor to 3x3. + Args: + kernel1x1 (Tensor): The input 1x1 kernel need to be padded. + + Returns: + Tensor: 3x3 kernel after padded. + """ + if kernel1x1 is None: + return 0 + else: + return torch.nn.functional.pad(kernel1x1, [1, 1, 1, 1]) + + def _fuse_bn_tensor(self, branch: nn.Module) -> Tuple[np.ndarray, Tensor]: + """Derives the equivalent kernel and bias of a specific branch layer. + + Args: + branch (nn.Module): The layer that needs to be equivalently + transformed, which can be nn.Sequential or nn.Batchnorm2d + + Returns: + tuple: Equivalent kernel and bias + """ + if branch is None: + return 0, 0 + if isinstance(branch, ConvModule): + kernel = branch.conv.weight + running_mean = branch.bn.running_mean + running_var = branch.bn.running_var + gamma = branch.bn.weight + beta = branch.bn.bias + eps = branch.bn.eps + else: + assert isinstance(branch, (nn.SyncBatchNorm, nn.BatchNorm2d)) + if not hasattr(self, 'id_tensor'): + input_dim = self.in_channels // self.groups + kernel_value = np.zeros((self.in_channels, input_dim, 3, 3), + dtype=np.float32) + for i in range(self.in_channels): + kernel_value[i, i % input_dim, 1, 1] = 1 + self.id_tensor = torch.from_numpy(kernel_value).to( + branch.weight.device) + kernel = self.id_tensor + running_mean = branch.running_mean + running_var = branch.running_var + gamma = branch.weight + beta = branch.bias + eps = branch.eps + std = (running_var + eps).sqrt() + t = (gamma / std).reshape(-1, 1, 1, 1) + return kernel * t, beta - running_mean * gamma / std + + def switch_to_deploy(self): + """Switch to deploy mode.""" + if hasattr(self, 'rbr_reparam'): + return + kernel, bias = self.get_equivalent_kernel_bias() + self.rbr_reparam = nn.Conv2d( + in_channels=self.rbr_dense.conv.in_channels, + out_channels=self.rbr_dense.conv.out_channels, + kernel_size=self.rbr_dense.conv.kernel_size, + stride=self.rbr_dense.conv.stride, + padding=self.rbr_dense.conv.padding, + dilation=self.rbr_dense.conv.dilation, + groups=self.rbr_dense.conv.groups, + bias=True) + self.rbr_reparam.weight.data = kernel + self.rbr_reparam.bias.data = bias + for para in self.parameters(): + para.detach_() + self.__delattr__('rbr_dense') + self.__delattr__('rbr_1x1') + if hasattr(self, 'rbr_identity'): + self.__delattr__('rbr_identity') + if hasattr(self, 'id_tensor'): + self.__delattr__('id_tensor') + self.deploy = True + + 
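The heart of switch_to_deploy and _fuse_bn_tensor above is folding a BatchNorm into the preceding convolution. Below is a self-contained sketch of that algebra on a toy conv (the BN statistics are randomized so the equivalence check is non-trivial); it only illustrates the math, it is not the block itself.

import torch
import torch.nn as nn

conv = nn.Conv2d(8, 8, 3, padding=1, bias=False)
bn = nn.BatchNorm2d(8).eval()            # deployment uses running statistics
bn.running_mean.uniform_(-1, 1)
bn.running_var.uniform_(0.5, 1.5)
bn.weight.data.uniform_(0.5, 1.5)
bn.bias.data.uniform_(-1, 1)

fused = nn.Conv2d(8, 8, 3, padding=1, bias=True)
with torch.no_grad():
    std = (bn.running_var + bn.eps).sqrt()
    t = (bn.weight / std).reshape(-1, 1, 1, 1)
    fused.weight.copy_(conv.weight * t)                         # kernel * gamma / std
    fused.bias.copy_(bn.bias - bn.running_mean * bn.weight / std)

x = torch.randn(2, 8, 16, 16)
with torch.no_grad():
    print(torch.allclose(bn(conv(x)), fused(x), atol=1e-5))     # True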
+@MODELS.register_module() +class BepC3StageBlock(nn.Module): + """Beer-mug RepC3 Block. + + Args: + in_channels (int): Number of channels in the input image + out_channels (int): Number of channels produced by the convolution + num_blocks (int): Number of blocks. Defaults to 1 + hidden_ratio (float): Hidden channel expansion. + Default: 0.5 + concat_all_layer (bool): Concat all layer when forward calculate. + Default: True + block_cfg (dict): Config dict for the block used to build each + layer. Defaults to dict(type='RepVGGBlock'). + norm_cfg (ConfigType): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (ConfigType): Config dict for activation layer. + Defaults to dict(type='ReLU', inplace=True). + """ + + def __init__(self, + in_channels: int, + out_channels: int, + num_blocks: int = 1, + hidden_ratio: float = 0.5, + concat_all_layer: bool = True, + block_cfg: ConfigType = dict(type='RepVGGBlock'), + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='ReLU', inplace=True)): + super().__init__() + hidden_channels = int(out_channels * hidden_ratio) + + self.conv1 = ConvModule( + in_channels, + hidden_channels, + kernel_size=1, + stride=1, + groups=1, + bias=False, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.conv2 = ConvModule( + in_channels, + hidden_channels, + kernel_size=1, + stride=1, + groups=1, + bias=False, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.conv3 = ConvModule( + 2 * hidden_channels, + out_channels, + kernel_size=1, + stride=1, + groups=1, + bias=False, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.block = RepStageBlock( + in_channels=hidden_channels, + out_channels=hidden_channels, + num_blocks=num_blocks, + block_cfg=block_cfg, + bottle_block=BottleRep) + self.concat_all_layer = concat_all_layer + if not concat_all_layer: + self.conv3 = ConvModule( + hidden_channels, + out_channels, + kernel_size=1, + stride=1, + groups=1, + bias=False, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + def forward(self, x): + if self.concat_all_layer is True: + return self.conv3( + torch.cat((self.block(self.conv1(x)), self.conv2(x)), dim=1)) + else: + return self.conv3(self.block(self.conv1(x))) + + +class BottleRep(nn.Module): + """Bottle Rep Block. + + Args: + in_channels (int): Number of channels in the input image + out_channels (int): Number of channels produced by the convolution + block_cfg (dict): Config dict for the block used to build each + layer. Defaults to dict(type='RepVGGBlock'). + adaptive_weight (bool): Add adaptive_weight when forward calculate. + Defaults False. 
+ """ + + def __init__(self, + in_channels: int, + out_channels: int, + block_cfg: ConfigType = dict(type='RepVGGBlock'), + adaptive_weight: bool = False): + super().__init__() + conv1_cfg = block_cfg.copy() + conv2_cfg = block_cfg.copy() + + conv1_cfg.update( + dict(in_channels=in_channels, out_channels=out_channels)) + conv2_cfg.update( + dict(in_channels=out_channels, out_channels=out_channels)) + + self.conv1 = MODELS.build(conv1_cfg) + self.conv2 = MODELS.build(conv2_cfg) + + if in_channels != out_channels: + self.shortcut = False + else: + self.shortcut = True + if adaptive_weight: + self.alpha = nn.Parameter(torch.ones(1)) + else: + self.alpha = 1.0 + + def forward(self, x: Tensor) -> Tensor: + outputs = self.conv1(x) + outputs = self.conv2(outputs) + return outputs + self.alpha * x if self.shortcut else outputs + + +@MODELS.register_module() +class ConvWrapper(nn.Module): + """Wrapper for normal Conv with SiLU activation. + + Args: + in_channels (int): Number of channels in the input image + out_channels (int): Number of channels produced by the convolution + kernel_size (int or tuple): Size of the convolving kernel + stride (int or tuple): Stride of the convolution. Default: 1 + groups (int, optional): Number of blocked connections from input + channels to output channels. Default: 1 + bias (bool, optional): Conv bias. Default: True. + norm_cfg (ConfigType): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (ConfigType): Config dict for activation layer. + Defaults to dict(type='ReLU', inplace=True). + """ + + def __init__(self, + in_channels: int, + out_channels: int, + kernel_size: int = 3, + stride: int = 1, + groups: int = 1, + bias: bool = True, + norm_cfg: ConfigType = None, + act_cfg: ConfigType = dict(type='SiLU')): + super().__init__() + self.block = ConvModule( + in_channels, + out_channels, + kernel_size, + stride, + padding=kernel_size // 2, + groups=groups, + bias=bias, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + def forward(self, x: Tensor) -> Tensor: + return self.block(x) + + +@MODELS.register_module() +class EffectiveSELayer(nn.Module): + """Effective Squeeze-Excitation. + + From `CenterMask : Real-Time Anchor-Free Instance Segmentation` + arxiv (https://arxiv.org/abs/1911.06667) + This code referenced to + https://github.com/youngwanLEE/CenterMask/blob/72147e8aae673fcaf4103ee90a6a6b73863e7fa1/maskrcnn_benchmark/modeling/backbone/vovnet.py#L108-L121 # noqa + + Args: + channels (int): The input and output channels of this Module. + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='HSigmoid'). + """ + + def __init__(self, + channels: int, + act_cfg: ConfigType = dict(type='HSigmoid')): + super().__init__() + assert isinstance(act_cfg, dict) + self.fc = ConvModule(channels, channels, 1, act_cfg=None) + + act_cfg_ = act_cfg.copy() # type: ignore + self.activate = MODELS.build(act_cfg_) + + def forward(self, x: Tensor) -> Tensor: + """Forward process + Args: + x (Tensor): The input tensor. + """ + x_se = x.mean((2, 3), keepdim=True) + x_se = self.fc(x_se) + return x * self.activate(x_se) + + +class PPYOLOESELayer(nn.Module): + """Squeeze-and-Excitation Attention Module for PPYOLOE. + There are some differences between the current implementation and + SELayer in mmdet: + 1. For fast speed and avoiding double inference in ppyoloe, + use `F.adaptive_avg_pool2d` before PPYOLOESELayer. + 2. Special ways to init weights. + 3. Different convolution order. 
+ + Args: + feat_channels (int): The input (and output) channels of the SE layer. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.1, eps=1e-5). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + """ + + def __init__(self, + feat_channels: int, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.1, eps=1e-5), + act_cfg: ConfigType = dict(type='SiLU', inplace=True)): + super().__init__() + self.fc = nn.Conv2d(feat_channels, feat_channels, 1) + self.sig = nn.Sigmoid() + self.conv = ConvModule( + feat_channels, + feat_channels, + 1, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self._init_weights() + + def _init_weights(self): + """Init weights.""" + nn.init.normal_(self.fc.weight, mean=0, std=0.001) + + def forward(self, feat: Tensor, avg_feat: Tensor) -> Tensor: + """Forward process + Args: + feat (Tensor): The input tensor. + avg_feat (Tensor): Average pooling feature tensor. + """ + weight = self.sig(self.fc(avg_feat)) + return self.conv(feat * weight) + + +@MODELS.register_module() +class ELANBlock(BaseModule): + """Efficient layer aggregation networks for YOLOv7. + + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The out channels of this Module. + middle_ratio (float): The scaling ratio of the middle layer + based on the in_channels. + block_ratio (float): The scaling ratio of the block layer + based on the in_channels. + num_blocks (int): The number of blocks in the main branch. + Defaults to 2. + num_convs_in_block (int): The number of convs pre block. + Defaults to 1. + conv_cfg (dict): Config dict for convolution layer. Defaults to None. + which means using conv2d. Defaults to None. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. 
+ """ + + def __init__(self, + in_channels: int, + out_channels: int, + middle_ratio: float, + block_ratio: float, + num_blocks: int = 2, + num_convs_in_block: int = 1, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None): + super().__init__(init_cfg=init_cfg) + assert num_blocks >= 1 + assert num_convs_in_block >= 1 + + middle_channels = int(in_channels * middle_ratio) + block_channels = int(in_channels * block_ratio) + final_conv_in_channels = int( + num_blocks * block_channels) + 2 * middle_channels + + self.main_conv = ConvModule( + in_channels, + middle_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self.short_conv = ConvModule( + in_channels, + middle_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self.blocks = nn.ModuleList() + for _ in range(num_blocks): + if num_convs_in_block == 1: + internal_block = ConvModule( + middle_channels, + block_channels, + 3, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + else: + internal_block = [] + for _ in range(num_convs_in_block): + internal_block.append( + ConvModule( + middle_channels, + block_channels, + 3, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + middle_channels = block_channels + internal_block = nn.Sequential(*internal_block) + + middle_channels = block_channels + self.blocks.append(internal_block) + + self.final_conv = ConvModule( + final_conv_in_channels, + out_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + def forward(self, x: Tensor) -> Tensor: + """Forward process + Args: + x (Tensor): The input tensor. + """ + x_short = self.short_conv(x) + x_main = self.main_conv(x) + block_outs = [] + x_block = x_main + for block in self.blocks: + x_block = block(x_block) + block_outs.append(x_block) + x_final = torch.cat((*block_outs[::-1], x_main, x_short), dim=1) + return self.final_conv(x_final) + + +@MODELS.register_module() +class EELANBlock(BaseModule): + """Expand efficient layer aggregation networks for YOLOv7. + + Args: + num_elan_block (int): The number of ELANBlock. + """ + + def __init__(self, num_elan_block: int, **kwargs): + super().__init__() + assert num_elan_block >= 1 + self.e_elan_blocks = nn.ModuleList() + for _ in range(num_elan_block): + self.e_elan_blocks.append(ELANBlock(**kwargs)) + + def forward(self, x: Tensor) -> Tensor: + outs = [] + for elan_blocks in self.e_elan_blocks: + outs.append(elan_blocks(x)) + return sum(outs) + + +class MaxPoolAndStrideConvBlock(BaseModule): + """Max pooling and stride conv layer for YOLOv7. + + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The out channels of this Module. + maxpool_kernel_sizes (int): kernel sizes of pooling layers. + Defaults to 2. + use_in_channels_of_middle (bool): Whether to calculate middle channels + based on in_channels. Defaults to False. + conv_cfg (dict): Config dict for convolution layer. Defaults to None. + which means using conv2d. Defaults to None. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. 
+ """ + + def __init__(self, + in_channels: int, + out_channels: int, + maxpool_kernel_sizes: int = 2, + use_in_channels_of_middle: bool = False, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None): + super().__init__(init_cfg=init_cfg) + + middle_channels = in_channels if use_in_channels_of_middle \ + else out_channels // 2 + + self.maxpool_branches = nn.Sequential( + MaxPool2d( + kernel_size=maxpool_kernel_sizes, stride=maxpool_kernel_sizes), + ConvModule( + in_channels, + out_channels // 2, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + self.stride_conv_branches = nn.Sequential( + ConvModule( + in_channels, + middle_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg), + ConvModule( + middle_channels, + out_channels // 2, + 3, + stride=2, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + def forward(self, x: Tensor) -> Tensor: + """Forward process + Args: + x (Tensor): The input tensor. + """ + maxpool_out = self.maxpool_branches(x) + stride_conv_out = self.stride_conv_branches(x) + return torch.cat([stride_conv_out, maxpool_out], dim=1) + + +@MODELS.register_module() +class TinyDownSampleBlock(BaseModule): + """Down sample layer for YOLOv7-tiny. + + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The out channels of this Module. + middle_ratio (float): The scaling ratio of the middle layer + based on the in_channels. Defaults to 1.0. + kernel_sizes (int, tuple[int]): Sequential or number of kernel + sizes of pooling layers. Defaults to 3. + conv_cfg (dict): Config dict for convolution layer. Defaults to None. + which means using conv2d. Defaults to None. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='LeakyReLU', negative_slope=0.1). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. 
+ """ + + def __init__( + self, + in_channels: int, + out_channels: int, + middle_ratio: float = 1.0, + kernel_sizes: Union[int, Sequence[int]] = 3, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict(type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='LeakyReLU', negative_slope=0.1), + init_cfg: OptMultiConfig = None): + super().__init__(init_cfg) + + middle_channels = int(in_channels * middle_ratio) + + self.short_conv = ConvModule( + in_channels, + middle_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self.main_convs = nn.ModuleList() + for i in range(3): + if i == 0: + self.main_convs.append( + ConvModule( + in_channels, + middle_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + else: + self.main_convs.append( + ConvModule( + middle_channels, + middle_channels, + kernel_sizes, + padding=(kernel_sizes - 1) // 2, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + self.final_conv = ConvModule( + middle_channels * 4, + out_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + def forward(self, x) -> Tensor: + short_out = self.short_conv(x) + + main_outs = [] + for main_conv in self.main_convs: + main_out = main_conv(x) + main_outs.append(main_out) + x = main_out + + return self.final_conv(torch.cat([*main_outs[::-1], short_out], dim=1)) + + +@MODELS.register_module() +class SPPFCSPBlock(BaseModule): + """Spatial pyramid pooling - Fast (SPPF) layer with CSP for + YOLOv7 + + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The output channels of this Module. + expand_ratio (float): Expand ratio of SPPCSPBlock. + Defaults to 0.5. + kernel_sizes (int, tuple[int]): Sequential or number of kernel + sizes of pooling layers. Defaults to 5. + is_tiny_version (bool): Is tiny version of SPPFCSPBlock. If True, + it means it is a yolov7 tiny model. Defaults to False. + conv_cfg (dict): Config dict for convolution layer. Defaults to None. + which means using conv2d. Defaults to None. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. 
+ """ + + def __init__(self, + in_channels: int, + out_channels: int, + expand_ratio: float = 0.5, + kernel_sizes: Union[int, Sequence[int]] = 5, + is_tiny_version: bool = False, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None): + super().__init__(init_cfg=init_cfg) + self.is_tiny_version = is_tiny_version + + mid_channels = int(2 * out_channels * expand_ratio) + + if is_tiny_version: + self.main_layers = ConvModule( + in_channels, + mid_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + else: + self.main_layers = nn.Sequential( + ConvModule( + in_channels, + mid_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg), + ConvModule( + mid_channels, + mid_channels, + 3, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg), + ConvModule( + mid_channels, + mid_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg), + ) + + self.kernel_sizes = kernel_sizes + if isinstance(kernel_sizes, int): + self.poolings = nn.MaxPool2d( + kernel_size=kernel_sizes, stride=1, padding=kernel_sizes // 2) + else: + self.poolings = nn.ModuleList([ + nn.MaxPool2d(kernel_size=ks, stride=1, padding=ks // 2) + for ks in kernel_sizes + ]) + + if is_tiny_version: + self.fuse_layers = ConvModule( + 4 * mid_channels, + mid_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + else: + self.fuse_layers = nn.Sequential( + ConvModule( + 4 * mid_channels, + mid_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg), + ConvModule( + mid_channels, + mid_channels, + 3, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + self.short_layer = ConvModule( + in_channels, + mid_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self.final_conv = ConvModule( + 2 * mid_channels, + out_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + def forward(self, x) -> Tensor: + """Forward process + Args: + x (Tensor): The input tensor. + """ + x1 = self.main_layers(x) + if isinstance(self.kernel_sizes, int): + y1 = self.poolings(x1) + y2 = self.poolings(y1) + concat_list = [x1] + [y1, y2, self.poolings(y2)] + if self.is_tiny_version: + x1 = self.fuse_layers(torch.cat(concat_list[::-1], 1)) + else: + x1 = self.fuse_layers(torch.cat(concat_list, 1)) + else: + concat_list = [x1] + [m(x1) for m in self.poolings] + if self.is_tiny_version: + x1 = self.fuse_layers(torch.cat(concat_list[::-1], 1)) + else: + x1 = self.fuse_layers(torch.cat(concat_list, 1)) + + x2 = self.short_layer(x) + return self.final_conv(torch.cat((x1, x2), dim=1)) + + +class ImplicitA(nn.Module): + """Implicit add layer in YOLOv7. + + Args: + in_channels (int): The input channels of this Module. + mean (float): Mean value of implicit module. Defaults to 0. + std (float): Std value of implicit module. Defaults to 0.02 + """ + + def __init__(self, in_channels: int, mean: float = 0., std: float = .02): + super().__init__() + self.implicit = nn.Parameter(torch.zeros(1, in_channels, 1, 1)) + nn.init.normal_(self.implicit, mean=mean, std=std) + + def forward(self, x): + """Forward process + Args: + x (Tensor): The input tensor. + """ + return self.implicit + x + + +class ImplicitM(nn.Module): + """Implicit multiplier layer in YOLOv7. + + Args: + in_channels (int): The input channels of this Module. 
+ mean (float): Mean value of implicit module. Defaults to 1. + std (float): Std value of implicit module. Defaults to 0.02. + """ + + def __init__(self, in_channels: int, mean: float = 1., std: float = .02): + super().__init__() + self.implicit = nn.Parameter(torch.ones(1, in_channels, 1, 1)) + nn.init.normal_(self.implicit, mean=mean, std=std) + + def forward(self, x): + """Forward process + Args: + x (Tensor): The input tensor. + """ + return self.implicit * x + + +@MODELS.register_module() +class PPYOLOEBasicBlock(nn.Module): + """PPYOLOE Backbone BasicBlock. + + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The output channels of this Module. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.1, eps=1e-5). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + shortcut (bool): Whether to add inputs and outputs together + at the end of this layer. Defaults to True. + use_alpha (bool): Whether to use `alpha` parameter at 1x1 conv. + """ + + def __init__(self, + in_channels: int, + out_channels: int, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.1, eps=1e-5), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + shortcut: bool = True, + use_alpha: bool = False): + super().__init__() + assert act_cfg is None or isinstance(act_cfg, dict) + self.conv1 = ConvModule( + in_channels, + out_channels, + 3, + stride=1, + padding=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self.conv2 = RepVGGBlock( + out_channels, + out_channels, + use_alpha=use_alpha, + act_cfg=act_cfg, + norm_cfg=norm_cfg, + use_bn_first=False) + self.shortcut = shortcut + + def forward(self, x: Tensor) -> Tensor: + """Forward process. + Args: + inputs (Tensor): The input tensor. + + Returns: + Tensor: The output tensor. + """ + y = self.conv1(x) + y = self.conv2(y) + if self.shortcut: + return x + y + else: + return y + + +class CSPResLayer(nn.Module): + """PPYOLOE Backbone Stage. + + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The output channels of this Module. + num_block (int): Number of blocks in this stage. + block_cfg (dict): Config dict for block. Default config is + suitable for PPYOLOE+ backbone. And in PPYOLOE neck, + block_cfg is set to dict(type='PPYOLOEBasicBlock', + shortcut=False, use_alpha=False). Defaults to + dict(type='PPYOLOEBasicBlock', shortcut=True, use_alpha=True). + stride (int): Stride of the convolution. In backbone, the stride + must be set to 2. In neck, the stride must be set to 1. + Defaults to 1. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.1, eps=1e-5). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + attention_cfg (dict, optional): Config dict for `EffectiveSELayer`. + Defaults to dict(type='EffectiveSELayer', + act_cfg=dict(type='HSigmoid')). + use_spp (bool): Whether to use `SPPFBottleneck` layer. + Defaults to False. 
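For reference on the ImplicitA/ImplicitM layers above: each is just a broadcastable learnable tensor, one added to and one multiplied with the feature map; in the YOLOv7 head they are usually wrapped around the prediction conv, roughly im(conv(ia(x))). A minimal sketch (the 16-channel sizes are arbitrary):

import torch
import torch.nn as nn

ia = nn.Parameter(torch.zeros(1, 16, 1, 1))  # additive implicit knowledge, initialized near 0
im = nn.Parameter(torch.ones(1, 16, 1, 1))   # multiplicative implicit knowledge, initialized near 1
conv = nn.Conv2d(16, 16, 1)

x = torch.randn(2, 16, 8, 8)
y = im * conv(x + ia)
print(y.shape)  # torch.Size([2, 16, 8, 8])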
+ """ + + def __init__(self, + in_channels: int, + out_channels: int, + num_block: int, + block_cfg: ConfigType = dict( + type='PPYOLOEBasicBlock', shortcut=True, use_alpha=True), + stride: int = 1, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.1, eps=1e-5), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + attention_cfg: OptMultiConfig = dict( + type='EffectiveSELayer', act_cfg=dict(type='HSigmoid')), + use_spp: bool = False): + super().__init__() + + self.num_block = num_block + self.block_cfg = block_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.use_spp = use_spp + assert attention_cfg is None or isinstance(attention_cfg, dict) + + if stride == 2: + conv1_in_channels = conv2_in_channels = conv3_in_channels = ( + in_channels + out_channels) // 2 + blocks_channels = conv1_in_channels // 2 + self.conv_down = ConvModule( + in_channels, + conv1_in_channels, + 3, + stride=2, + padding=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + else: + conv1_in_channels = conv2_in_channels = in_channels + conv3_in_channels = out_channels + blocks_channels = out_channels // 2 + self.conv_down = None + + self.conv1 = ConvModule( + conv1_in_channels, + blocks_channels, + 1, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self.conv2 = ConvModule( + conv2_in_channels, + blocks_channels, + 1, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self.blocks = self.build_blocks_layer(blocks_channels) + + self.conv3 = ConvModule( + conv3_in_channels, + out_channels, + 1, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + if attention_cfg: + attention_cfg = attention_cfg.copy() + attention_cfg['channels'] = blocks_channels * 2 + self.attn = MODELS.build(attention_cfg) + else: + self.attn = None + + def build_blocks_layer(self, blocks_channels: int) -> nn.Module: + """Build blocks layer. + + Args: + blocks_channels: The channels of this Module. + """ + blocks = nn.Sequential() + block_cfg = self.block_cfg.copy() + block_cfg.update( + dict(in_channels=blocks_channels, out_channels=blocks_channels)) + block_cfg.setdefault('norm_cfg', self.norm_cfg) + block_cfg.setdefault('act_cfg', self.act_cfg) + + for i in range(self.num_block): + blocks.add_module(str(i), MODELS.build(block_cfg)) + + if i == (self.num_block - 1) // 2 and self.use_spp: + blocks.add_module( + 'spp', + SPPFBottleneck( + blocks_channels, + blocks_channels, + kernel_sizes=[5, 9, 13], + use_conv_first=False, + conv_cfg=None, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + + return blocks + + def forward(self, x: Tensor) -> Tensor: + """Forward process + Args: + x (Tensor): The input tensor. + """ + if self.conv_down is not None: + x = self.conv_down(x) + y1 = self.conv1(x) + y2 = self.blocks(self.conv2(x)) + y = torch.cat([y1, y2], axis=1) + if self.attn is not None: + y = self.attn(y) + y = self.conv3(y) + return y + + +@MODELS.register_module() +class RepStageBlock(nn.Module): + """RepStageBlock is a stage block with rep-style basic block. + + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The output channels of this Module. + num_blocks (int, tuple[int]): Number of blocks. Defaults to 1. + bottle_block (nn.Module): Basic unit of RepStage. + Defaults to RepVGGBlock. + block_cfg (ConfigType): Config of RepStage. + Defaults to 'RepVGGBlock'. 
+ """ + + def __init__(self, + in_channels: int, + out_channels: int, + num_blocks: int = 1, + bottle_block: nn.Module = RepVGGBlock, + block_cfg: ConfigType = dict(type='RepVGGBlock')): + super().__init__() + block_cfg = block_cfg.copy() + + block_cfg.update( + dict(in_channels=in_channels, out_channels=out_channels)) + + self.conv1 = MODELS.build(block_cfg) + + block_cfg.update( + dict(in_channels=out_channels, out_channels=out_channels)) + + self.block = None + if num_blocks > 1: + self.block = nn.Sequential(*(MODELS.build(block_cfg) + for _ in range(num_blocks - 1))) + + if bottle_block == BottleRep: + self.conv1 = BottleRep( + in_channels, + out_channels, + block_cfg=block_cfg, + adaptive_weight=True) + num_blocks = num_blocks // 2 + self.block = None + if num_blocks > 1: + self.block = nn.Sequential(*(BottleRep( + out_channels, + out_channels, + block_cfg=block_cfg, + adaptive_weight=True) for _ in range(num_blocks - 1))) + + def forward(self, x: Tensor) -> Tensor: + """Forward process. + + Args: + x (Tensor): The input tensor. + + Returns: + Tensor: The output tensor. + """ + x = self.conv1(x) + if self.block is not None: + x = self.block(x) + return x + + +class DarknetBottleneck(MMDET_DarknetBottleneck): + """The basic bottleneck block used in Darknet. + + Each ResBlock consists of two ConvModules and the input is added to the + final output. Each ConvModule is composed of Conv, BN, and LeakyReLU. + The first convLayer has filter size of k1Xk1 and the second one has the + filter size of k2Xk2. + + Note: + This DarknetBottleneck is little different from MMDet's, we can + change the kernel size and padding for each conv. + + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The output channels of this Module. + expansion (float): The kernel size for hidden channel. + Defaults to 0.5. + kernel_size (Sequence[int]): The kernel size of the convolution. + Defaults to (1, 3). + padding (Sequence[int]): The padding size of the convolution. + Defaults to (0, 1). + add_identity (bool): Whether to add identity to the out. + Defaults to True + use_depthwise (bool): Whether to use depthwise separable convolution. + Defaults to False + conv_cfg (dict): Config dict for convolution layer. Default: None, + which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='Swish'). 
+ """ + + def __init__(self, + in_channels: int, + out_channels: int, + expansion: float = 0.5, + kernel_size: Sequence[int] = (1, 3), + padding: Sequence[int] = (0, 1), + add_identity: bool = True, + use_depthwise: bool = False, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None) -> None: + super().__init__(in_channels, out_channels, init_cfg=init_cfg) + hidden_channels = int(out_channels * expansion) + conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule + assert isinstance(kernel_size, Sequence) and len(kernel_size) == 2 + + self.conv1 = ConvModule( + in_channels, + hidden_channels, + kernel_size[0], + padding=padding[0], + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.conv2 = conv( + hidden_channels, + out_channels, + kernel_size[1], + stride=1, + padding=padding[1], + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.add_identity = \ + add_identity and in_channels == out_channels + + +class CSPLayerWithTwoConv(BaseModule): + """Cross Stage Partial Layer with 2 convolutions. + + Args: + in_channels (int): The input channels of the CSP layer. + out_channels (int): The output channels of the CSP layer. + expand_ratio (float): Ratio to adjust the number of channels of the + hidden layer. Defaults to 0.5. + num_blocks (int): Number of blocks. Defaults to 1 + add_identity (bool): Whether to add identity in blocks. + Defaults to True. + conv_cfg (dict, optional): Config dict for convolution layer. + Defaults to None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + init_cfg (:obj:`ConfigDict` or dict or list[dict] or + list[:obj:`ConfigDict`], optional): Initialization config dict. + Defaults to None. 
+ """ + + def __init__( + self, + in_channels: int, + out_channels: int, + expand_ratio: float = 0.5, + num_blocks: int = 1, + add_identity: bool = True, # shortcut + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict(type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None) -> None: + super().__init__(init_cfg=init_cfg) + + self.mid_channels = int(out_channels * expand_ratio) + self.main_conv = ConvModule( + in_channels, + 2 * self.mid_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.final_conv = ConvModule( + (2 + num_blocks) * self.mid_channels, + out_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self.blocks = nn.ModuleList( + DarknetBottleneck( + self.mid_channels, + self.mid_channels, + expansion=1, + kernel_size=(3, 3), + padding=(1, 1), + add_identity=add_identity, + use_depthwise=False, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) for _ in range(num_blocks)) + + def forward(self, x: Tensor) -> Tensor: + """Forward process.""" + x_main = self.main_conv(x) + x_main = list(x_main.split((self.mid_channels, self.mid_channels), 1)) + x_main.extend(blocks(x_main[-1]) for blocks in self.blocks) + return self.final_conv(torch.cat(x_main, 1)) diff --git a/mmyolo/models/losses/__init__.py b/mmyolo/models/losses/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ee192921b0c4722a5334be10c192dfadcbe68f08 --- /dev/null +++ b/mmyolo/models/losses/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .iou_loss import IoULoss, bbox_overlaps + +__all__ = ['IoULoss', 'bbox_overlaps'] diff --git a/mmyolo/models/losses/iou_loss.py b/mmyolo/models/losses/iou_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..e3d3dc40ef3e678989db85ee8cfd0035a26a9f19 --- /dev/null +++ b/mmyolo/models/losses/iou_loss.py @@ -0,0 +1,232 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from typing import Optional, Tuple, Union + +import torch +import torch.nn as nn +from mmdet.models.losses.utils import weight_reduce_loss +from mmdet.structures.bbox import HorizontalBoxes + +from mmyolo.registry import MODELS + + +def bbox_overlaps(pred: torch.Tensor, + target: torch.Tensor, + iou_mode: str = 'ciou', + bbox_format: str = 'xywh', + siou_theta: float = 4.0, + eps: float = 1e-7) -> torch.Tensor: + r"""Calculate overlap between two set of bboxes. + `Implementation of paper `Enhancing Geometric Factors into + Model Learning and Inference for Object Detection and Instance + Segmentation `_. + + In the CIoU implementation of YOLOv5 and MMDetection, there is a slight + difference in the way the alpha parameter is computed. + + mmdet version: + alpha = (ious > 0.5).float() * v / (1 - ious + v) + YOLOv5 version: + alpha = v / (v - ious + (1 + eps) + + Args: + pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2) + or (x, y, w, h),shape (n, 4). + target (Tensor): Corresponding gt bboxes, shape (n, 4). + iou_mode (str): Options are ('iou', 'ciou', 'giou', 'siou'). + Defaults to "ciou". + bbox_format (str): Options are "xywh" and "xyxy". + Defaults to "xywh". + siou_theta (float): siou_theta for SIoU when calculate shape cost. + Defaults to 4.0. + eps (float): Eps to avoid log(0). + + Returns: + Tensor: shape (n, ). 
+ """ + assert iou_mode in ('iou', 'ciou', 'giou', 'siou') + assert bbox_format in ('xyxy', 'xywh') + if bbox_format == 'xywh': + pred = HorizontalBoxes.cxcywh_to_xyxy(pred) + target = HorizontalBoxes.cxcywh_to_xyxy(target) + + bbox1_x1, bbox1_y1 = pred[..., 0], pred[..., 1] + bbox1_x2, bbox1_y2 = pred[..., 2], pred[..., 3] + bbox2_x1, bbox2_y1 = target[..., 0], target[..., 1] + bbox2_x2, bbox2_y2 = target[..., 2], target[..., 3] + + # Overlap + overlap = (torch.min(bbox1_x2, bbox2_x2) - + torch.max(bbox1_x1, bbox2_x1)).clamp(0) * \ + (torch.min(bbox1_y2, bbox2_y2) - + torch.max(bbox1_y1, bbox2_y1)).clamp(0) + + # Union + w1, h1 = bbox1_x2 - bbox1_x1, bbox1_y2 - bbox1_y1 + w2, h2 = bbox2_x2 - bbox2_x1, bbox2_y2 - bbox2_y1 + union = (w1 * h1) + (w2 * h2) - overlap + eps + + h1 = bbox1_y2 - bbox1_y1 + eps + h2 = bbox2_y2 - bbox2_y1 + eps + + # IoU + ious = overlap / union + + # enclose area + enclose_x1y1 = torch.min(pred[..., :2], target[..., :2]) + enclose_x2y2 = torch.max(pred[..., 2:], target[..., 2:]) + enclose_wh = (enclose_x2y2 - enclose_x1y1).clamp(min=0) + + enclose_w = enclose_wh[..., 0] # cw + enclose_h = enclose_wh[..., 1] # ch + + if iou_mode == 'ciou': + # CIoU = IoU - ( (ρ^2(b_pred,b_gt) / c^2) + (alpha x v) ) + + # calculate enclose area (c^2) + enclose_area = enclose_w**2 + enclose_h**2 + eps + + # calculate ρ^2(b_pred,b_gt): + # euclidean distance between b_pred(bbox2) and b_gt(bbox1) + # center point, because bbox format is xyxy -> left-top xy and + # right-bottom xy, so need to / 4 to get center point. + rho2_left_item = ((bbox2_x1 + bbox2_x2) - (bbox1_x1 + bbox1_x2))**2 / 4 + rho2_right_item = ((bbox2_y1 + bbox2_y2) - + (bbox1_y1 + bbox1_y2))**2 / 4 + rho2 = rho2_left_item + rho2_right_item # rho^2 (ρ^2) + + # Width and height ratio (v) + wh_ratio = (4 / (math.pi**2)) * torch.pow( + torch.atan(w2 / h2) - torch.atan(w1 / h1), 2) + + with torch.no_grad(): + alpha = wh_ratio / (wh_ratio - ious + (1 + eps)) + + # CIoU + ious = ious - ((rho2 / enclose_area) + (alpha * wh_ratio)) + + elif iou_mode == 'giou': + # GIoU = IoU - ( (A_c - union) / A_c ) + convex_area = enclose_w * enclose_h + eps # convex area (A_c) + ious = ious - (convex_area - union) / convex_area + + elif iou_mode == 'siou': + # SIoU: https://arxiv.org/pdf/2205.12740.pdf + # SIoU = IoU - ( (Distance Cost + Shape Cost) / 2 ) + + # calculate sigma (σ): + # euclidean distance between bbox2(pred) and bbox1(gt) center point, + # sigma_cw = b_cx_gt - b_cx + sigma_cw = (bbox2_x1 + bbox2_x2) / 2 - (bbox1_x1 + bbox1_x2) / 2 + eps + # sigma_ch = b_cy_gt - b_cy + sigma_ch = (bbox2_y1 + bbox2_y2) / 2 - (bbox1_y1 + bbox1_y2) / 2 + eps + # sigma = √( (sigma_cw ** 2) - (sigma_ch ** 2) ) + sigma = torch.pow(sigma_cw**2 + sigma_ch**2, 0.5) + + # choose minimize alpha, sin(alpha) + sin_alpha = torch.abs(sigma_ch) / sigma + sin_beta = torch.abs(sigma_cw) / sigma + sin_alpha = torch.where(sin_alpha <= math.sin(math.pi / 4), sin_alpha, + sin_beta) + + # Angle cost = 1 - 2 * ( sin^2 ( arcsin(x) - (pi / 4) ) ) + angle_cost = torch.cos(torch.arcsin(sin_alpha) * 2 - math.pi / 2) + + # Distance cost = Σ_(t=x,y) (1 - e ^ (- γ ρ_t)) + rho_x = (sigma_cw / enclose_w)**2 # ρ_x + rho_y = (sigma_ch / enclose_h)**2 # ρ_y + gamma = 2 - angle_cost # γ + distance_cost = (1 - torch.exp(-1 * gamma * rho_x)) + ( + 1 - torch.exp(-1 * gamma * rho_y)) + + # Shape cost = Ω = Σ_(t=w,h) ( ( 1 - ( e ^ (-ω_t) ) ) ^ θ ) + omiga_w = torch.abs(w1 - w2) / torch.max(w1, w2) # ω_w + omiga_h = torch.abs(h1 - h2) / torch.max(h1, h2) # ω_h + shape_cost = torch.pow(1 
- torch.exp(-1 * omiga_w), + siou_theta) + torch.pow( + 1 - torch.exp(-1 * omiga_h), siou_theta) + + ious = ious - ((distance_cost + shape_cost) * 0.5) + + return ious.clamp(min=-1.0, max=1.0) + + +@MODELS.register_module() +class IoULoss(nn.Module): + """IoULoss. + + Computing the IoU loss between a set of predicted bboxes and target bboxes. + Args: + iou_mode (str): Options are "ciou". + Defaults to "ciou". + bbox_format (str): Options are "xywh" and "xyxy". + Defaults to "xywh". + eps (float): Eps to avoid log(0). + reduction (str): Options are "none", "mean" and "sum". + loss_weight (float): Weight of loss. + return_iou (bool): If True, return loss and iou. + """ + + def __init__(self, + iou_mode: str = 'ciou', + bbox_format: str = 'xywh', + eps: float = 1e-7, + reduction: str = 'mean', + loss_weight: float = 1.0, + return_iou: bool = True): + super().__init__() + assert bbox_format in ('xywh', 'xyxy') + assert iou_mode in ('ciou', 'siou', 'giou') + self.iou_mode = iou_mode + self.bbox_format = bbox_format + self.eps = eps + self.reduction = reduction + self.loss_weight = loss_weight + self.return_iou = return_iou + + def forward( + self, + pred: torch.Tensor, + target: torch.Tensor, + weight: Optional[torch.Tensor] = None, + avg_factor: Optional[float] = None, + reduction_override: Optional[Union[str, bool]] = None + ) -> Tuple[Union[torch.Tensor, torch.Tensor], torch.Tensor]: + """Forward function. + + Args: + pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2) + or (x, y, w, h),shape (n, 4). + target (Tensor): Corresponding gt bboxes, shape (n, 4). + weight (Tensor, optional): Element-wise weights. + avg_factor (float, optional): Average factor when computing the + mean of losses. + reduction_override (str, bool, optional): Same as built-in losses + of PyTorch. Defaults to None. + Returns: + loss or tuple(loss, iou): + """ + if weight is not None and not torch.any(weight > 0): + if pred.dim() == weight.dim() + 1: + weight = weight.unsqueeze(1) + return (pred * weight).sum() # 0 + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + + if weight is not None and weight.dim() > 1: + weight = weight.mean(-1) + + iou = bbox_overlaps( + pred, + target, + iou_mode=self.iou_mode, + bbox_format=self.bbox_format, + eps=self.eps) + loss = self.loss_weight * weight_reduce_loss(1.0 - iou, weight, + reduction, avg_factor) + + if self.return_iou: + return loss, iou + else: + return loss diff --git a/mmyolo/models/necks/__init__.py b/mmyolo/models/necks/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6da9641cee490a1708921bf93f4a07f95f2d6b7c --- /dev/null +++ b/mmyolo/models/necks/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
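A hedged usage sketch for the IoULoss module defined in iou_loss.py above (box values assumed; with the default return_iou=True the call returns a (loss, iou) tuple):

import torch
from mmyolo.models.losses import IoULoss

loss_fn = IoULoss(iou_mode='ciou', bbox_format='xyxy', reduction='mean')
pred = torch.tensor([[10., 10., 20., 20.]])
target = torch.tensor([[12., 12., 22., 22.]])
loss, iou = loss_fn(pred, target)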
+from .base_yolo_neck import BaseYOLONeck +from .cspnext_pafpn import CSPNeXtPAFPN +from .ppyoloe_csppan import PPYOLOECSPPAFPN +from .yolov5_pafpn import YOLOv5PAFPN +from .yolov6_pafpn import YOLOv6CSPRepPAFPN, YOLOv6RepPAFPN +from .yolov7_pafpn import YOLOv7PAFPN +from .yolov8_pafpn import YOLOv8PAFPN +from .yolox_pafpn import YOLOXPAFPN + +__all__ = [ + 'YOLOv5PAFPN', 'BaseYOLONeck', 'YOLOv6RepPAFPN', 'YOLOXPAFPN', + 'CSPNeXtPAFPN', 'YOLOv7PAFPN', 'PPYOLOECSPPAFPN', 'YOLOv6CSPRepPAFPN', + 'YOLOv8PAFPN' +] diff --git a/mmyolo/models/necks/base_yolo_neck.py b/mmyolo/models/necks/base_yolo_neck.py new file mode 100644 index 0000000000000000000000000000000000000000..54fddf79ce90bb0c023f9c445aced62551552174 --- /dev/null +++ b/mmyolo/models/necks/base_yolo_neck.py @@ -0,0 +1,261 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta, abstractmethod +from typing import List, Union + +import torch +import torch.nn as nn +from mmdet.utils import ConfigType, OptMultiConfig +from mmengine.model import BaseModule +from torch.nn.modules.batchnorm import _BatchNorm + +from mmyolo.registry import MODELS + + +@MODELS.register_module() +class BaseYOLONeck(BaseModule, metaclass=ABCMeta): + """Base neck used in YOLO series. + + .. code:: text + + P5 neck model structure diagram + +--------+ +-------+ + |top_down|----------+--------->| out |---> output0 + | layer1 | | | layer0| + +--------+ | +-------+ + stride=8 ^ | + idx=0 +------+ +--------+ | + -----> |reduce|--->| cat | | + |layer0| +--------+ | + +------+ ^ v + +--------+ +-----------+ + |upsample| |downsample | + | layer1 | | layer0 | + +--------+ +-----------+ + ^ | + +--------+ v + |top_down| +-----------+ + | layer2 |--->| cat | + +--------+ +-----------+ + stride=16 ^ v + idx=1 +------+ +--------+ +-----------+ +-------+ + -----> |reduce|--->| cat | | bottom_up |--->| out |---> output1 + |layer1| +--------+ | layer0 | | layer1| + +------+ ^ +-----------+ +-------+ + | v + +--------+ +-----------+ + |upsample| |downsample | + | layer2 | | layer1 | + stride=32 +--------+ +-----------+ + idx=2 +------+ ^ v + -----> |reduce| | +-----------+ + |layer2|---------+------->| cat | + +------+ +-----------+ + v + +-----------+ +-------+ + | bottom_up |--->| out |---> output2 + | layer1 | | layer2| + +-----------+ +-------+ + + .. 
code:: text + + P6 neck model structure diagram + +--------+ +-------+ + |top_down|----------+--------->| out |---> output0 + | layer1 | | | layer0| + +--------+ | +-------+ + stride=8 ^ | + idx=0 +------+ +--------+ | + -----> |reduce|--->| cat | | + |layer0| +--------+ | + +------+ ^ v + +--------+ +-----------+ + |upsample| |downsample | + | layer1 | | layer0 | + +--------+ +-----------+ + ^ | + +--------+ v + |top_down| +-----------+ + | layer2 |--->| cat | + +--------+ +-----------+ + stride=16 ^ v + idx=1 +------+ +--------+ +-----------+ +-------+ + -----> |reduce|--->| cat | | bottom_up |--->| out |---> output1 + |layer1| +--------+ | layer0 | | layer1| + +------+ ^ +-----------+ +-------+ + | v + +--------+ +-----------+ + |upsample| |downsample | + | layer2 | | layer1 | + +--------+ +-----------+ + ^ | + +--------+ v + |top_down| +-----------+ + | layer3 |--->| cat | + +--------+ +-----------+ + stride=32 ^ v + idx=2 +------+ +--------+ +-----------+ +-------+ + -----> |reduce|--->| cat | | bottom_up |--->| out |---> output2 + |layer2| +--------+ | layer1 | | layer2| + +------+ ^ +-----------+ +-------+ + | v + +--------+ +-----------+ + |upsample| |downsample | + | layer3 | | layer2 | + +--------+ +-----------+ + stride=64 ^ v + idx=3 +------+ | +-----------+ + -----> |reduce|---------+------->| cat | + |layer3| +-----------+ + +------+ v + +-----------+ +-------+ + | bottom_up |--->| out |---> output3 + | layer2 | | layer3| + +-----------+ +-------+ + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale) + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + upsample_feats_cat_first (bool): Whether the output features are + concat first after upsampling in the topdown module. + Defaults to True. Currently only YOLOv7 is false. + freeze_all(bool): Whether to freeze the model. Defaults to False + norm_cfg (dict): Config dict for normalization layer. + Defaults to None. + act_cfg (dict): Config dict for activation layer. + Defaults to None. + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. 
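The loops in __init__ below mirror these diagrams; as a counting sketch (the three-scale input is assumed), a P5 neck ends up with:

in_channels = [256, 512, 1024]         # assumed three-scale input
num_scales = len(in_channels)          # 3
num_reduce_layers = num_scales         # one per input scale
num_top_down_layers = num_scales - 1   # each paired with an upsample layer
num_bottom_up_layers = num_scales - 1  # each paired with a downsample layer
num_out_layers = num_scales            # one per output scale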
+ """ + + def __init__(self, + in_channels: List[int], + out_channels: Union[int, List[int]], + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + upsample_feats_cat_first: bool = True, + freeze_all: bool = False, + norm_cfg: ConfigType = None, + act_cfg: ConfigType = None, + init_cfg: OptMultiConfig = None, + **kwargs): + super().__init__(init_cfg) + self.in_channels = in_channels + self.out_channels = out_channels + self.deepen_factor = deepen_factor + self.widen_factor = widen_factor + self.upsample_feats_cat_first = upsample_feats_cat_first + self.freeze_all = freeze_all + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + + self.reduce_layers = nn.ModuleList() + for idx in range(len(in_channels)): + self.reduce_layers.append(self.build_reduce_layer(idx)) + + # build top-down blocks + self.upsample_layers = nn.ModuleList() + self.top_down_layers = nn.ModuleList() + for idx in range(len(in_channels) - 1, 0, -1): + self.upsample_layers.append(self.build_upsample_layer(idx=idx, n_layers=len(in_channels))) + self.top_down_layers.append(self.build_top_down_layer(idx)) + + # build bottom-up blocks + self.downsample_layers = nn.ModuleList() + self.bottom_up_layers = nn.ModuleList() + for idx in range(len(in_channels) - 1): + self.downsample_layers.append(self.build_downsample_layer(idx)) + self.bottom_up_layers.append(self.build_bottom_up_layer(idx)) + + self.out_layers = nn.ModuleList() + for idx in range(len(in_channels)): + self.out_layers.append(self.build_out_layer(idx)) + + @abstractmethod + def build_reduce_layer(self, idx: int): + """build reduce layer.""" + pass + + @abstractmethod + def build_upsample_layer(self, idx: int): + """build upsample layer.""" + pass + + @abstractmethod + def build_top_down_layer(self, idx: int): + """build top down layer.""" + pass + + @abstractmethod + def build_downsample_layer(self, idx: int): + """build downsample layer.""" + pass + + @abstractmethod + def build_bottom_up_layer(self, idx: int): + """build bottom up layer.""" + pass + + @abstractmethod + def build_out_layer(self, idx: int): + """build out layer.""" + pass + + def _freeze_all(self): + """Freeze the model.""" + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def train(self, mode=True): + """Convert the model into training mode while keep the normalization + layer freezed.""" + super().train(mode) + if self.freeze_all: + self._freeze_all() + + def forward(self, inputs: List[torch.Tensor]) -> tuple: + """Forward function.""" + assert len(inputs) == len(self.in_channels) + # reduce layers + reduce_outs = [] + for idx in range(len(self.in_channels)): + reduce_outs.append(self.reduce_layers[idx](inputs[idx])) + + # top-down path + inner_outs = [reduce_outs[-1]] + for idx in range(len(self.in_channels) - 1, 0, -1): + feat_high = inner_outs[0] + feat_low = reduce_outs[idx - 1] + upsample_feat = self.upsample_layers[len(self.in_channels) - 1 - + idx]( + feat_high) + if self.upsample_feats_cat_first: + top_down_layer_inputs = torch.cat([upsample_feat, feat_low], 1) + else: + top_down_layer_inputs = torch.cat([feat_low, upsample_feat], 1) + inner_out = self.top_down_layers[len(self.in_channels) - 1 - idx]( + top_down_layer_inputs) + inner_outs.insert(0, inner_out) + + # bottom-up path + outs = [inner_outs[0]] + for idx in range(len(self.in_channels) - 1): + feat_low = outs[-1] + feat_high = inner_outs[idx + 1] + downsample_feat = self.downsample_layers[idx](feat_low) + out = self.bottom_up_layers[idx]( + 
torch.cat([downsample_feat, feat_high], 1)) + outs.append(out) + + # out_layers + results = [] + for idx in range(len(self.in_channels)): + results.append(self.out_layers[idx](outs[idx])) + + return tuple(results) diff --git a/mmyolo/models/necks/cspnext_pafpn.py b/mmyolo/models/necks/cspnext_pafpn.py new file mode 100644 index 0000000000000000000000000000000000000000..310126f63e12f888daac50ca30674484f7b3a6ec --- /dev/null +++ b/mmyolo/models/necks/cspnext_pafpn.py @@ -0,0 +1,201 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from typing import Sequence + +import torch.nn as nn +from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule +from mmdet.models.backbones.csp_darknet import CSPLayer +from mmdet.utils import ConfigType, OptMultiConfig + +from mmyolo.registry import MODELS +from .base_yolo_neck import BaseYOLONeck + + +@MODELS.register_module() +class CSPNeXtPAFPN(BaseYOLONeck): + """Path Aggregation Network with CSPNeXt blocks. + + Args: + in_channels (Sequence[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale) + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + num_csp_blocks (int): Number of bottlenecks in CSPLayer. + Defaults to 3. + use_depthwise (bool): Whether to use depthwise separable convolution in + blocks. Defaults to False. + expand_ratio (float): Ratio to adjust the number of channels of the + hidden layer. Defaults to 0.5. + upsample_cfg (dict): Config dict for interpolate layer. + Default: `dict(scale_factor=2, mode='nearest')` + conv_cfg (dict, optional): Config dict for convolution layer. + Default: None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN') + act_cfg (dict): Config dict for activation layer. + Default: dict(type='SiLU', inplace=True) + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None. + """ + + def __init__( + self, + in_channels: Sequence[int], + out_channels: int, + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + num_csp_blocks: int = 3, + freeze_all: bool = False, + use_depthwise: bool = False, + expand_ratio: float = 0.5, + upsample_cfg: ConfigType = dict(scale_factor=2, mode='nearest'), + conv_cfg: bool = None, + norm_cfg: ConfigType = dict(type='BN'), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = dict( + type='Kaiming', + layer='Conv2d', + a=math.sqrt(5), + distribution='uniform', + mode='fan_in', + nonlinearity='leaky_relu') + ) -> None: + self.num_csp_blocks = round(num_csp_blocks * deepen_factor) + self.conv = DepthwiseSeparableConvModule \ + if use_depthwise else ConvModule + self.upsample_cfg = upsample_cfg + self.expand_ratio = expand_ratio + self.conv_cfg = conv_cfg + + super().__init__( + in_channels=[ + int(channel * widen_factor) for channel in in_channels + ], + out_channels=int(out_channels * widen_factor), + deepen_factor=deepen_factor, + widen_factor=widen_factor, + freeze_all=freeze_all, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + init_cfg=init_cfg) + + def build_reduce_layer(self, idx: int) -> nn.Module: + """build reduce layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The reduce layer. 
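Before the individual builder methods, a hedged end-to-end sketch of this neck (batch size, channel widths and resolutions are assumed and follow a stride 8/16/32 P5 layout):

import torch
from mmyolo.models.necks import CSPNeXtPAFPN

neck = CSPNeXtPAFPN(in_channels=[256, 512, 1024], out_channels=256)
feats = [
    torch.rand(1, 256, 80, 80),   # stride 8
    torch.rand(1, 512, 40, 40),   # stride 16
    torch.rand(1, 1024, 20, 20),  # stride 32
]
outs = neck(feats)
# each output keeps its input resolution with `out_channels` channels:
# (1, 256, 80, 80), (1, 256, 40, 40), (1, 256, 20, 20)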
+ """ + if idx == len(self.in_channels) - 1: + layer = self.conv( + self.in_channels[idx], + self.in_channels[idx - 1], + 1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + else: + layer = nn.Identity() + + return layer + + def build_upsample_layer(self, *args, **kwargs) -> nn.Module: + """build upsample layer.""" + return nn.Upsample(**self.upsample_cfg) + + def build_top_down_layer(self, idx: int) -> nn.Module: + """build top down layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The top down layer. + """ + if idx == 1: + return CSPLayer( + self.in_channels[idx - 1] * 2, + self.in_channels[idx - 1], + num_blocks=self.num_csp_blocks, + add_identity=False, + use_cspnext_block=True, + expand_ratio=self.expand_ratio, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + else: + return nn.Sequential( + CSPLayer( + self.in_channels[idx - 1] * 2, + self.in_channels[idx - 1], + num_blocks=self.num_csp_blocks, + add_identity=False, + use_cspnext_block=True, + expand_ratio=self.expand_ratio, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + self.conv( + self.in_channels[idx - 1], + self.in_channels[idx - 2], + kernel_size=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + + def build_downsample_layer(self, idx: int) -> nn.Module: + """build downsample layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The downsample layer. + """ + return self.conv( + self.in_channels[idx], + self.in_channels[idx], + kernel_size=3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def build_bottom_up_layer(self, idx: int) -> nn.Module: + """build bottom up layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The bottom up layer. + """ + return CSPLayer( + self.in_channels[idx] * 2, + self.in_channels[idx + 1], + num_blocks=self.num_csp_blocks, + add_identity=False, + use_cspnext_block=True, + expand_ratio=self.expand_ratio, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def build_out_layer(self, idx: int) -> nn.Module: + """build out layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The out layer. + """ + return self.conv( + self.in_channels[idx], + self.out_channels, + 3, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) diff --git a/mmyolo/models/necks/ppyoloe_csppan.py b/mmyolo/models/necks/ppyoloe_csppan.py new file mode 100644 index 0000000000000000000000000000000000000000..4e4ef7200bfc6784a7ce8d92bcfbc46314e518e9 --- /dev/null +++ b/mmyolo/models/necks/ppyoloe_csppan.py @@ -0,0 +1,216 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List + +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmdet.utils import ConfigType, OptMultiConfig + +from mmyolo.models.backbones.csp_resnet import CSPResLayer +from mmyolo.models.necks import BaseYOLONeck +from mmyolo.registry import MODELS + + +@MODELS.register_module() +class PPYOLOECSPPAFPN(BaseYOLONeck): + """CSPPAN in PPYOLOE. + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (List[int]): Number of output channels + (used at each scale). + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + freeze_all(bool): Whether to freeze the model. + num_csplayer (int): Number of `CSPResLayer` in per layer. + Defaults to 1. 
+ num_blocks_per_layer (int): Number of blocks per `CSPResLayer`. + Defaults to 3. + block_cfg (dict): Config dict for block. Defaults to + dict(type='PPYOLOEBasicBlock', shortcut=True, use_alpha=False) + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.1, eps=1e-5). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + drop_block_cfg (dict, optional): Drop block config. + Defaults to None. If you want to use Drop block after + `CSPResLayer`, you can set this para as + dict(type='mmdet.DropBlock', drop_prob=0.1, + block_size=3, warm_iters=0). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. + use_spp (bool): Whether to use `SPP` in reduce layer. + Defaults to False. + """ + + def __init__(self, + in_channels: List[int] = [256, 512, 1024], + out_channels: List[int] = [256, 512, 1024], + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + freeze_all: bool = False, + num_csplayer: int = 1, + num_blocks_per_layer: int = 3, + block_cfg: ConfigType = dict( + type='PPYOLOEBasicBlock', shortcut=False, + use_alpha=False), + norm_cfg: ConfigType = dict( + type='BN', momentum=0.1, eps=1e-5), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + drop_block_cfg: ConfigType = None, + init_cfg: OptMultiConfig = None, + use_spp: bool = False): + self.block_cfg = block_cfg + self.num_csplayer = num_csplayer + self.num_blocks_per_layer = round(num_blocks_per_layer * deepen_factor) + # Only use spp in last reduce_layer, if use_spp=True. + self.use_spp = use_spp + self.drop_block_cfg = drop_block_cfg + assert drop_block_cfg is None or isinstance(drop_block_cfg, dict) + + super().__init__( + in_channels=[ + int(channel * widen_factor) for channel in in_channels + ], + out_channels=[ + int(channel * widen_factor) for channel in out_channels + ], + deepen_factor=deepen_factor, + widen_factor=widen_factor, + freeze_all=freeze_all, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + init_cfg=init_cfg) + + def build_reduce_layer(self, idx: int): + """build reduce layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The reduce layer. + """ + if idx == len(self.in_channels) - 1: + # fpn_stage + in_channels = self.in_channels[idx] + out_channels = self.out_channels[idx] + + layer = [ + CSPResLayer( + in_channels=in_channels if i == 0 else out_channels, + out_channels=out_channels, + num_block=self.num_blocks_per_layer, + block_cfg=self.block_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + attention_cfg=None, + use_spp=self.use_spp) for i in range(self.num_csplayer) + ] + + if self.drop_block_cfg: + layer.append(MODELS.build(self.drop_block_cfg)) + layer = nn.Sequential(*layer) + else: + layer = nn.Identity() + + return layer + + def build_upsample_layer(self, idx: int) -> nn.Module: + """build upsample layer.""" + # fpn_route + in_channels = self.out_channels[idx] + return nn.Sequential( + ConvModule( + in_channels=in_channels, + out_channels=in_channels // 2, + kernel_size=1, + stride=1, + padding=0, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + nn.Upsample(scale_factor=2, mode='nearest')) + + def build_top_down_layer(self, idx: int) -> nn.Module: + """build top down layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The top down layer. 
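A sketch of the channel arithmetic used in the method below (the widths are assumed and widen_factor=1.0 is implied):

in_channels = [256, 512, 1024]    # assumed backbone widths
out_channels = [256, 512, 1024]
idx = 2
# the fpn_route halves the channels before concatenation, hence the `// 2`
top_down_in = in_channels[idx - 1] + out_channels[idx] // 2   # 512 + 512 = 1024
top_down_out = out_channels[idx - 1]                          # 512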
+ """ + # fpn_stage + in_channels = self.in_channels[idx - 1] + self.out_channels[idx] // 2 + out_channels = self.out_channels[idx - 1] + + layer = [ + CSPResLayer( + in_channels=in_channels if i == 0 else out_channels, + out_channels=out_channels, + num_block=self.num_blocks_per_layer, + block_cfg=self.block_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + attention_cfg=None, + use_spp=False) for i in range(self.num_csplayer) + ] + + if self.drop_block_cfg: + layer.append(MODELS.build(self.drop_block_cfg)) + + return nn.Sequential(*layer) + + def build_downsample_layer(self, idx: int) -> nn.Module: + """build downsample layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The downsample layer. + """ + # pan_route + return ConvModule( + in_channels=self.out_channels[idx], + out_channels=self.out_channels[idx], + kernel_size=3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def build_bottom_up_layer(self, idx: int) -> nn.Module: + """build bottom up layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The bottom up layer. + """ + # pan_stage + in_channels = self.out_channels[idx + 1] + self.out_channels[idx] + out_channels = self.out_channels[idx + 1] + + layer = [ + CSPResLayer( + in_channels=in_channels if i == 0 else out_channels, + out_channels=out_channels, + num_block=self.num_blocks_per_layer, + block_cfg=self.block_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + attention_cfg=None, + use_spp=False) for i in range(self.num_csplayer) + ] + + if self.drop_block_cfg: + layer.append(MODELS.build(self.drop_block_cfg)) + + return nn.Sequential(*layer) + + def build_out_layer(self, *args, **kwargs) -> nn.Module: + """build out layer.""" + return nn.Identity() diff --git a/mmyolo/models/necks/yolov5_pafpn.py b/mmyolo/models/necks/yolov5_pafpn.py new file mode 100644 index 0000000000000000000000000000000000000000..b95147fc512359442aeb1bbc88aadd07031bdadf --- /dev/null +++ b/mmyolo/models/necks/yolov5_pafpn.py @@ -0,0 +1,171 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Union + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmdet.models.backbones.csp_darknet import CSPLayer +from mmdet.utils import ConfigType, OptMultiConfig + +from mmyolo.registry import MODELS +from ..utils import make_divisible, make_round +from .base_yolo_neck import BaseYOLONeck + + +@MODELS.register_module() +class YOLOv5PAFPN(BaseYOLONeck): + """Path Aggregation Network used in YOLOv5. + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale) + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + num_csp_blocks (int): Number of bottlenecks in CSPLayer. Defaults to 1. + freeze_all(bool): Whether to freeze the model + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. 
+ """ + + def __init__(self, + in_channels: List[int], + out_channels: Union[List[int], int], + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + num_csp_blocks: int = 1, + freeze_all: bool = False, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None): + self.num_csp_blocks = num_csp_blocks + super().__init__( + in_channels=in_channels, + out_channels=out_channels, + deepen_factor=deepen_factor, + widen_factor=widen_factor, + freeze_all=freeze_all, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + init_cfg=init_cfg) + + def init_weights(self): + if self.init_cfg is None: + """Initialize the parameters.""" + for m in self.modules(): + if isinstance(m, torch.nn.Conv2d): + # In order to be consistent with the source code, + # reset the Conv2d initialization parameters + m.reset_parameters() + else: + super().init_weights() + + def build_reduce_layer(self, idx: int) -> nn.Module: + """build reduce layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The reduce layer. + """ + if idx == len(self.in_channels) - 1: + layer = ConvModule( + make_divisible(self.in_channels[idx], self.widen_factor), + make_divisible(self.in_channels[idx - 1], self.widen_factor), + 1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + else: + layer = nn.Identity() + + return layer + + def build_upsample_layer(self, *args, **kwargs) -> nn.Module: + """build upsample layer.""" + return nn.Upsample(scale_factor=2, mode='nearest') + + def build_top_down_layer(self, idx: int): + """build top down layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The top down layer. + """ + + if idx == 1: + return CSPLayer( + make_divisible(self.in_channels[idx - 1] * 2, + self.widen_factor), + make_divisible(self.in_channels[idx - 1], self.widen_factor), + num_blocks=make_round(self.num_csp_blocks, self.deepen_factor), + add_identity=False, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + else: + return nn.Sequential( + CSPLayer( + make_divisible(self.in_channels[idx - 1] * 2, + self.widen_factor), + make_divisible(self.in_channels[idx - 1], + self.widen_factor), + num_blocks=make_round(self.num_csp_blocks, + self.deepen_factor), + add_identity=False, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + ConvModule( + make_divisible(self.in_channels[idx - 1], + self.widen_factor), + make_divisible(self.in_channels[idx - 2], + self.widen_factor), + kernel_size=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + + def build_downsample_layer(self, idx: int) -> nn.Module: + """build downsample layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The downsample layer. + """ + return ConvModule( + make_divisible(self.in_channels[idx], self.widen_factor), + make_divisible(self.in_channels[idx], self.widen_factor), + kernel_size=3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def build_bottom_up_layer(self, idx: int) -> nn.Module: + """build bottom up layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The bottom up layer. 
+ """ + return CSPLayer( + make_divisible(self.in_channels[idx] * 2, self.widen_factor), + make_divisible(self.in_channels[idx + 1], self.widen_factor), + num_blocks=make_round(self.num_csp_blocks, self.deepen_factor), + add_identity=False, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def build_out_layer(self, *args, **kwargs) -> nn.Module: + """build out layer.""" + return nn.Identity() diff --git a/mmyolo/models/necks/yolov6_pafpn.py b/mmyolo/models/necks/yolov6_pafpn.py new file mode 100644 index 0000000000000000000000000000000000000000..74b7ce932ec5352db0fae9ed5f499fe447ac3d27 --- /dev/null +++ b/mmyolo/models/necks/yolov6_pafpn.py @@ -0,0 +1,285 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmdet.utils import ConfigType, OptMultiConfig + +from mmyolo.registry import MODELS +from ..layers import BepC3StageBlock, RepStageBlock +from ..utils import make_round +from .base_yolo_neck import BaseYOLONeck + + +@MODELS.register_module() +class YOLOv6RepPAFPN(BaseYOLONeck): + """Path Aggregation Network used in YOLOv6. + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale) + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + num_csp_blocks (int): Number of bottlenecks in CSPLayer. Defaults to 1. + freeze_all(bool): Whether to freeze the model. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='ReLU', inplace=True). + block_cfg (dict): Config dict for the block used to build each + layer. Defaults to dict(type='RepVGGBlock'). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + in_channels: List[int], + out_channels: int, + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + num_csp_blocks: int = 12, + freeze_all: bool = False, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='ReLU', inplace=True), + block_cfg: ConfigType = dict(type='RepVGGBlock'), + init_cfg: OptMultiConfig = None): + self.num_csp_blocks = num_csp_blocks + self.block_cfg = block_cfg + super().__init__( + in_channels=in_channels, + out_channels=out_channels, + deepen_factor=deepen_factor, + widen_factor=widen_factor, + freeze_all=freeze_all, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + init_cfg=init_cfg) + + def build_reduce_layer(self, idx: int) -> nn.Module: + """build reduce layer. + + Args: + idx (int): layer idx. + Returns: + nn.Module: The reduce layer. + """ + if idx == 2: + layer = ConvModule( + in_channels=int(self.in_channels[idx] * self.widen_factor), + out_channels=int(self.out_channels[idx - 1] * + self.widen_factor), + kernel_size=1, + stride=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + else: + layer = nn.Identity() + + return layer + + def build_upsample_layer(self, idx: int) -> nn.Module: + """build upsample layer. + + Args: + idx (int): layer idx. + Returns: + nn.Module: The upsample layer. 
+ """ + return nn.ConvTranspose2d( + in_channels=int(self.out_channels[idx - 1] * self.widen_factor), + out_channels=int(self.out_channels[idx - 1] * self.widen_factor), + kernel_size=2, + stride=2, + bias=True) + + def build_top_down_layer(self, idx: int) -> nn.Module: + """build top down layer. + + Args: + idx (int): layer idx. + Returns: + nn.Module: The top down layer. + """ + block_cfg = self.block_cfg.copy() + + layer0 = RepStageBlock( + in_channels=int( + (self.out_channels[idx - 1] + self.in_channels[idx - 1]) * + self.widen_factor), + out_channels=int(self.out_channels[idx - 1] * self.widen_factor), + num_blocks=make_round(self.num_csp_blocks, self.deepen_factor), + block_cfg=block_cfg) + + if idx == 1: + return layer0 + elif idx == 2: + layer1 = ConvModule( + in_channels=int(self.out_channels[idx - 1] * + self.widen_factor), + out_channels=int(self.out_channels[idx - 2] * + self.widen_factor), + kernel_size=1, + stride=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + return nn.Sequential(layer0, layer1) + + def build_downsample_layer(self, idx: int) -> nn.Module: + """build downsample layer. + + Args: + idx (int): layer idx. + Returns: + nn.Module: The downsample layer. + """ + return ConvModule( + in_channels=int(self.out_channels[idx] * self.widen_factor), + out_channels=int(self.out_channels[idx] * self.widen_factor), + kernel_size=3, + stride=2, + padding=3 // 2, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def build_bottom_up_layer(self, idx: int) -> nn.Module: + """build bottom up layer. + + Args: + idx (int): layer idx. + Returns: + nn.Module: The bottom up layer. + """ + block_cfg = self.block_cfg.copy() + + return RepStageBlock( + in_channels=int(self.out_channels[idx] * 2 * self.widen_factor), + out_channels=int(self.out_channels[idx + 1] * self.widen_factor), + num_blocks=make_round(self.num_csp_blocks, self.deepen_factor), + block_cfg=block_cfg) + + def build_out_layer(self, *args, **kwargs) -> nn.Module: + """build out layer.""" + return nn.Identity() + + def init_weights(self): + if self.init_cfg is None: + """Initialize the parameters.""" + for m in self.modules(): + if isinstance(m, torch.nn.Conv2d): + # In order to be consistent with the source code, + # reset the Conv2d initialization parameters + m.reset_parameters() + else: + super().init_weights() + + +@MODELS.register_module() +class YOLOv6CSPRepPAFPN(YOLOv6RepPAFPN): + """Path Aggregation Network used in YOLOv6. + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale) + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + num_csp_blocks (int): Number of bottlenecks in CSPLayer. Defaults to 1. + freeze_all(bool): Whether to freeze the model. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='ReLU', inplace=True). + block_cfg (dict): Config dict for the block used to build each + layer. Defaults to dict(type='RepVGGBlock'). + block_act_cfg (dict): Config dict for activation layer used in each + stage. Defaults to dict(type='SiLU', inplace=True). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. 
+ """ + + def __init__(self, + in_channels: List[int], + out_channels: int, + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + hidden_ratio: float = 0.5, + num_csp_blocks: int = 12, + freeze_all: bool = False, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='ReLU', inplace=True), + block_act_cfg: ConfigType = dict(type='SiLU', inplace=True), + block_cfg: ConfigType = dict(type='RepVGGBlock'), + init_cfg: OptMultiConfig = None): + self.hidden_ratio = hidden_ratio + self.block_act_cfg = block_act_cfg + super().__init__( + in_channels=in_channels, + out_channels=out_channels, + deepen_factor=deepen_factor, + widen_factor=widen_factor, + num_csp_blocks=num_csp_blocks, + freeze_all=freeze_all, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + block_cfg=block_cfg, + init_cfg=init_cfg) + + def build_top_down_layer(self, idx: int) -> nn.Module: + """build top down layer. + + Args: + idx (int): layer idx. + Returns: + nn.Module: The top down layer. + """ + block_cfg = self.block_cfg.copy() + + layer0 = BepC3StageBlock( + in_channels=int( + (self.out_channels[idx - 1] + self.in_channels[idx - 1]) * + self.widen_factor), + out_channels=int(self.out_channels[idx - 1] * self.widen_factor), + num_blocks=make_round(self.num_csp_blocks, self.deepen_factor), + block_cfg=block_cfg, + hidden_ratio=self.hidden_ratio, + norm_cfg=self.norm_cfg, + act_cfg=self.block_act_cfg) + + if idx == 1: + return layer0 + elif idx == 2: + layer1 = ConvModule( + in_channels=int(self.out_channels[idx - 1] * + self.widen_factor), + out_channels=int(self.out_channels[idx - 2] * + self.widen_factor), + kernel_size=1, + stride=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + return nn.Sequential(layer0, layer1) + + def build_bottom_up_layer(self, idx: int) -> nn.Module: + """build bottom up layer. + + Args: + idx (int): layer idx. + Returns: + nn.Module: The bottom up layer. + """ + block_cfg = self.block_cfg.copy() + + return BepC3StageBlock( + in_channels=int(self.out_channels[idx] * 2 * self.widen_factor), + out_channels=int(self.out_channels[idx + 1] * self.widen_factor), + num_blocks=make_round(self.num_csp_blocks, self.deepen_factor), + block_cfg=block_cfg, + hidden_ratio=self.hidden_ratio, + norm_cfg=self.norm_cfg, + act_cfg=self.block_act_cfg) diff --git a/mmyolo/models/necks/yolov7_pafpn.py b/mmyolo/models/necks/yolov7_pafpn.py new file mode 100644 index 0000000000000000000000000000000000000000..1d31f4623b50083ff820e6b20229b33ad0f41860 --- /dev/null +++ b/mmyolo/models/necks/yolov7_pafpn.py @@ -0,0 +1,216 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List + +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmdet.utils import ConfigType, OptMultiConfig + +from mmyolo.registry import MODELS +from ..layers import MaxPoolAndStrideConvBlock, RepVGGBlock, SPPFCSPBlock +from .base_yolo_neck import BaseYOLONeck + + +@MODELS.register_module() +class YOLOv7PAFPN(BaseYOLONeck): + """Path Aggregation Network used in YOLOv7. + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale). + block_cfg (dict): Config dict for block. + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + spp_expand_ratio (float): Expand ratio of SPPCSPBlock. + Defaults to 0.5. 
+ is_tiny_version (bool): Is tiny version of neck. If True, + it means it is a yolov7 tiny model. Defaults to False. + use_maxpool_in_downsample (bool): Whether maxpooling is + used in downsample layers. Defaults to True. + use_in_channels_in_downsample (bool): MaxPoolAndStrideConvBlock + module input parameters. Defaults to False. + use_repconv_outs (bool): Whether to use `repconv` in the output + layer. Defaults to True. + upsample_feats_cat_first (bool): Whether the output features are + concat first after upsampling in the topdown module. + Defaults to True. Currently only YOLOv7 is false. + freeze_all(bool): Whether to freeze the model. Defaults to False. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + in_channels: List[int], + out_channels: List[int], + block_cfg: dict = dict( + type='ELANBlock', + middle_ratio=0.5, + block_ratio=0.25, + num_blocks=4, + num_convs_in_block=1), + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + spp_expand_ratio: float = 0.5, + is_tiny_version: bool = False, + use_maxpool_in_downsample: bool = True, + use_in_channels_in_downsample: bool = False, + use_repconv_outs: bool = True, + upsample_feats_cat_first: bool = False, + freeze_all: bool = False, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None): + + self.is_tiny_version = is_tiny_version + self.use_maxpool_in_downsample = use_maxpool_in_downsample + self.use_in_channels_in_downsample = use_in_channels_in_downsample + self.spp_expand_ratio = spp_expand_ratio + self.use_repconv_outs = use_repconv_outs + self.block_cfg = block_cfg + self.block_cfg.setdefault('norm_cfg', norm_cfg) + self.block_cfg.setdefault('act_cfg', act_cfg) + + super().__init__( + in_channels=[ + int(channel * widen_factor) for channel in in_channels + ], + out_channels=[ + int(channel * widen_factor) for channel in out_channels + ], + deepen_factor=deepen_factor, + widen_factor=widen_factor, + upsample_feats_cat_first=upsample_feats_cat_first, + freeze_all=freeze_all, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + init_cfg=init_cfg) + + def build_reduce_layer(self, idx: int) -> nn.Module: + """build reduce layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The reduce layer. + """ + if idx == len(self.in_channels) - 1: + layer = SPPFCSPBlock( + self.in_channels[idx], + self.out_channels[idx], + expand_ratio=self.spp_expand_ratio, + is_tiny_version=self.is_tiny_version, + kernel_sizes=5, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + else: + layer = ConvModule( + self.in_channels[idx], + self.out_channels[idx], + 1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + return layer + + def build_upsample_layer(self, idx: int) -> nn.Module: + """build upsample layer.""" + return nn.Sequential( + ConvModule( + self.out_channels[idx], + self.out_channels[idx - 1], + 1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + nn.Upsample(scale_factor=2, mode='nearest')) + + def build_top_down_layer(self, idx: int) -> nn.Module: + """build top down layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The top down layer. 
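A sketch of how block_cfg is completed before MODELS.build in the method below (the per-scale widths are assumed; the ELANBlock fields match the default block_cfg above):

out_channels = [128, 256, 512]   # assumed widths after widen_factor
idx = 1
block_cfg = dict(
    type='ELANBlock',
    middle_ratio=0.5,
    block_ratio=0.25,
    num_blocks=4,
    num_convs_in_block=1)
block_cfg['in_channels'] = out_channels[idx - 1] * 2   # cat of upsampled + lateral
block_cfg['out_channels'] = out_channels[idx - 1]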
+ """ + block_cfg = self.block_cfg.copy() + block_cfg['in_channels'] = self.out_channels[idx - 1] * 2 + block_cfg['out_channels'] = self.out_channels[idx - 1] + return MODELS.build(block_cfg) + + def build_downsample_layer(self, idx: int) -> nn.Module: + """build downsample layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The downsample layer. + """ + if self.use_maxpool_in_downsample and not self.is_tiny_version: + return MaxPoolAndStrideConvBlock( + self.out_channels[idx], + self.out_channels[idx + 1], + use_in_channels_of_middle=self.use_in_channels_in_downsample, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + else: + return ConvModule( + self.out_channels[idx], + self.out_channels[idx + 1], + 3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def build_bottom_up_layer(self, idx: int) -> nn.Module: + """build bottom up layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The bottom up layer. + """ + block_cfg = self.block_cfg.copy() + block_cfg['in_channels'] = self.out_channels[idx + 1] * 2 + block_cfg['out_channels'] = self.out_channels[idx + 1] + return MODELS.build(block_cfg) + + def build_out_layer(self, idx: int) -> nn.Module: + """build out layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The out layer. + """ + if len(self.in_channels) == 4: + # P6 + return nn.Identity() + + out_channels = self.out_channels[idx] * 2 + + if self.use_repconv_outs: + return RepVGGBlock( + self.out_channels[idx], + out_channels, + 3, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + else: + return ConvModule( + self.out_channels[idx], + out_channels, + 3, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) diff --git a/mmyolo/models/necks/yolov8_pafpn.py b/mmyolo/models/necks/yolov8_pafpn.py new file mode 100644 index 0000000000000000000000000000000000000000..e26698bcc191b0141d89c1e965de811494a96539 --- /dev/null +++ b/mmyolo/models/necks/yolov8_pafpn.py @@ -0,0 +1,102 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Union + +import torch.nn as nn +from mmdet.utils import ConfigType, OptMultiConfig + +from mmyolo.registry import MODELS +from .. import CSPLayerWithTwoConv +from ..utils import make_divisible, make_round +from .yolov5_pafpn import YOLOv5PAFPN + + +@MODELS.register_module() +class YOLOv8PAFPN(YOLOv5PAFPN): + """Path Aggregation Network used in YOLOv8. + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale) + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + num_csp_blocks (int): Number of bottlenecks in CSPLayer. Defaults to 1. + freeze_all(bool): Whether to freeze the model + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. 
+ """ + + def __init__(self, + in_channels: List[int], + out_channels: Union[List[int], int], + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + num_csp_blocks: int = 3, + freeze_all: bool = False, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None): + super().__init__( + in_channels=in_channels, + out_channels=out_channels, + deepen_factor=deepen_factor, + widen_factor=widen_factor, + num_csp_blocks=num_csp_blocks, + freeze_all=freeze_all, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + init_cfg=init_cfg) + + def build_reduce_layer(self, idx: int) -> nn.Module: + """build reduce layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The reduce layer. + """ + return nn.Identity() + + def build_top_down_layer(self, idx: int) -> nn.Module: + """build top down layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The top down layer. + """ + return CSPLayerWithTwoConv( + make_divisible((self.in_channels[idx - 1] + self.in_channels[idx]), + self.widen_factor), + make_divisible(self.out_channels[idx - 1], self.widen_factor), + num_blocks=make_round(self.num_csp_blocks, self.deepen_factor), + add_identity=False, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def build_bottom_up_layer(self, idx: int) -> nn.Module: + """build bottom up layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The bottom up layer. + """ + return CSPLayerWithTwoConv( + make_divisible( + (self.out_channels[idx] + self.out_channels[idx + 1]), + self.widen_factor), + make_divisible(self.out_channels[idx + 1], self.widen_factor), + num_blocks=make_round(self.num_csp_blocks, self.deepen_factor), + add_identity=False, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) diff --git a/mmyolo/models/necks/yolox_pafpn.py b/mmyolo/models/necks/yolox_pafpn.py new file mode 100644 index 0000000000000000000000000000000000000000..bd2595e70fe47e38e68ebd0d878deb6f264bf2d1 --- /dev/null +++ b/mmyolo/models/necks/yolox_pafpn.py @@ -0,0 +1,172 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List + +import torch.nn as nn +from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule +from mmdet.models.backbones.csp_darknet import CSPLayer +from mmdet.utils import ConfigType, OptMultiConfig + +from mmyolo.registry import MODELS +from .base_yolo_neck import BaseYOLONeck + + +@MODELS.register_module() +class YOLOXPAFPN(BaseYOLONeck): + """Path Aggregation Network used in YOLOX. + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale). + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + num_csp_blocks (int): Number of bottlenecks in CSPLayer. Defaults to 1. + use_depthwise (bool): Whether to use depthwise separable convolution. + Defaults to False. + freeze_all(bool): Whether to freeze the model. Defaults to False. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. 
+ """ + + def __init__(self, + in_channels: List[int], + out_channels: int, + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + num_csp_blocks: int = 3, + use_depthwise: bool = False, + freeze_all: bool = False, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None): + self.num_csp_blocks = round(num_csp_blocks * deepen_factor) + self.use_depthwise = use_depthwise + + super().__init__( + in_channels=[ + int(channel * widen_factor) for channel in in_channels + ], + out_channels=int(out_channels * widen_factor), + deepen_factor=deepen_factor, + widen_factor=widen_factor, + freeze_all=freeze_all, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + init_cfg=init_cfg) + + def build_reduce_layer(self, idx: int) -> nn.Module: + """build reduce layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The reduce layer. + """ + if idx == 2: + layer = ConvModule( + self.in_channels[idx], + self.in_channels[idx - 1], + 1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + else: + layer = nn.Identity() + + return layer + + def build_upsample_layer(self, *args, **kwargs) -> nn.Module: + """build upsample layer.""" + return nn.Upsample(scale_factor=2, mode='nearest') + + def build_top_down_layer(self, idx: int) -> nn.Module: + """build top down layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The top down layer. + """ + if idx == 1: + return CSPLayer( + self.in_channels[idx - 1] * 2, + self.in_channels[idx - 1], + num_blocks=self.num_csp_blocks, + add_identity=False, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + elif idx == 2: + return nn.Sequential( + CSPLayer( + self.in_channels[idx - 1] * 2, + self.in_channels[idx - 1], + num_blocks=self.num_csp_blocks, + add_identity=False, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + ConvModule( + self.in_channels[idx - 1], + self.in_channels[idx - 2], + kernel_size=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + + def build_downsample_layer(self, idx: int) -> nn.Module: + """build downsample layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The downsample layer. + """ + conv = DepthwiseSeparableConvModule \ + if self.use_depthwise else ConvModule + return conv( + self.in_channels[idx], + self.in_channels[idx], + kernel_size=3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def build_bottom_up_layer(self, idx: int) -> nn.Module: + """build bottom up layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The bottom up layer. + """ + return CSPLayer( + self.in_channels[idx] * 2, + self.in_channels[idx + 1], + num_blocks=self.num_csp_blocks, + add_identity=False, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def build_out_layer(self, idx: int) -> nn.Module: + """build out layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The out layer. + """ + return ConvModule( + self.in_channels[idx], + self.out_channels, + 1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) diff --git a/mmyolo/models/plugins/__init__.py b/mmyolo/models/plugins/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..497233ac21a4dd1a6a2a3127c09435d8146eb553 --- /dev/null +++ b/mmyolo/models/plugins/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .cbam import CBAM + +__all__ = ['CBAM'] diff --git a/mmyolo/models/plugins/cbam.py b/mmyolo/models/plugins/cbam.py new file mode 100644 index 0000000000000000000000000000000000000000..e9559f2e2db951a5681ec9af5864928ed480361b --- /dev/null +++ b/mmyolo/models/plugins/cbam.py @@ -0,0 +1,119 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmdet.utils import OptMultiConfig +from mmengine.model import BaseModule + +from mmyolo.registry import MODELS + + +class ChannelAttention(BaseModule): + """ChannelAttention. + + Args: + channels (int): The input (and output) channels of the + ChannelAttention. + reduce_ratio (int): Squeeze ratio in ChannelAttention, the intermediate + channel will be ``int(channels/ratio)``. Defaults to 16. + act_cfg (dict): Config dict for activation layer + Defaults to dict(type='ReLU'). + """ + + def __init__(self, + channels: int, + reduce_ratio: int = 16, + act_cfg: dict = dict(type='ReLU')): + super().__init__() + + self.avg_pool = nn.AdaptiveAvgPool2d(1) + self.max_pool = nn.AdaptiveMaxPool2d(1) + + self.fc = nn.Sequential( + ConvModule( + in_channels=channels, + out_channels=int(channels / reduce_ratio), + kernel_size=1, + stride=1, + conv_cfg=None, + act_cfg=act_cfg), + ConvModule( + in_channels=int(channels / reduce_ratio), + out_channels=channels, + kernel_size=1, + stride=1, + conv_cfg=None, + act_cfg=None)) + self.sigmoid = nn.Sigmoid() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Forward function.""" + avgpool_out = self.fc(self.avg_pool(x)) + maxpool_out = self.fc(self.max_pool(x)) + out = self.sigmoid(avgpool_out + maxpool_out) + return out + + +class SpatialAttention(BaseModule): + """SpatialAttention + Args: + kernel_size (int): The size of the convolution kernel in + SpatialAttention. Defaults to 7. + """ + + def __init__(self, kernel_size: int = 7): + super().__init__() + + self.conv = ConvModule( + in_channels=2, + out_channels=1, + kernel_size=kernel_size, + stride=1, + padding=kernel_size // 2, + conv_cfg=None, + act_cfg=dict(type='Sigmoid')) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Forward function.""" + avg_out = torch.mean(x, dim=1, keepdim=True) + max_out, _ = torch.max(x, dim=1, keepdim=True) + out = torch.cat([avg_out, max_out], dim=1) + out = self.conv(out) + return out + + +@MODELS.register_module() +class CBAM(BaseModule): + """Convolutional Block Attention Module. arxiv link: + https://arxiv.org/abs/1807.06521v2. + + Args: + in_channels (int): The input (and output) channels of the CBAM. + reduce_ratio (int): Squeeze ratio in ChannelAttention, the intermediate + channel will be ``int(channels/ratio)``. Defaults to 16. + kernel_size (int): The size of the convolution kernel in + SpatialAttention. Defaults to 7. + act_cfg (dict): Config dict for activation layer in ChannelAttention + Defaults to dict(type='ReLU'). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. 
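A hedged usage sketch; both attention branches are multiplicative gates, so the input shape is preserved (batch size and channel count assumed):

import torch
from mmyolo.models.plugins import CBAM

cbam = CBAM(in_channels=256, reduce_ratio=16, kernel_size=7)
x = torch.rand(2, 256, 40, 40)
print(cbam(x).shape)  # torch.Size([2, 256, 40, 40])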
+ """ + + def __init__(self, + in_channels: int, + reduce_ratio: int = 16, + kernel_size: int = 7, + act_cfg: dict = dict(type='ReLU'), + init_cfg: OptMultiConfig = None): + super().__init__(init_cfg) + self.channel_attention = ChannelAttention( + channels=in_channels, reduce_ratio=reduce_ratio, act_cfg=act_cfg) + + self.spatial_attention = SpatialAttention(kernel_size) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Forward function.""" + out = self.channel_attention(x) * x + out = self.spatial_attention(out) * out + return out diff --git a/mmyolo/models/task_modules/__init__.py b/mmyolo/models/task_modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7dbdc25fa3cf16e85e0e99e7d302a98f2b4f13ce --- /dev/null +++ b/mmyolo/models/task_modules/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .assigners import BatchATSSAssigner, BatchTaskAlignedAssigner +from .coders import YOLOv5BBoxCoder, YOLOXBBoxCoder + +__all__ = [ + 'YOLOv5BBoxCoder', 'YOLOXBBoxCoder', 'BatchATSSAssigner', + 'BatchTaskAlignedAssigner' +] diff --git a/mmyolo/models/task_modules/assigners/__init__.py b/mmyolo/models/task_modules/assigners/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e74ab728b301b98eaa3702cae4befc82d62f0bc5 --- /dev/null +++ b/mmyolo/models/task_modules/assigners/__init__.py @@ -0,0 +1,12 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .batch_atss_assigner import BatchATSSAssigner +from .batch_dsl_assigner import BatchDynamicSoftLabelAssigner +from .batch_task_aligned_assigner import BatchTaskAlignedAssigner +from .utils import (select_candidates_in_gts, select_highest_overlaps, + yolov6_iou_calculator) + +__all__ = [ + 'BatchATSSAssigner', 'BatchTaskAlignedAssigner', + 'select_candidates_in_gts', 'select_highest_overlaps', + 'yolov6_iou_calculator', 'BatchDynamicSoftLabelAssigner' +] diff --git a/mmyolo/models/task_modules/assigners/batch_atss_assigner.py b/mmyolo/models/task_modules/assigners/batch_atss_assigner.py new file mode 100644 index 0000000000000000000000000000000000000000..45b3069afde73e240890273c58e3860da59ad854 --- /dev/null +++ b/mmyolo/models/task_modules/assigners/batch_atss_assigner.py @@ -0,0 +1,339 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmdet.utils import ConfigType +from torch import Tensor + +from mmyolo.registry import TASK_UTILS +from .utils import (select_candidates_in_gts, select_highest_overlaps, + yolov6_iou_calculator) + + +def bbox_center_distance(bboxes: Tensor, + priors: Tensor) -> Tuple[Tensor, Tensor]: + """Compute the center distance between bboxes and priors. + + Args: + bboxes (Tensor): Shape (n, 4) for bbox, "xyxy" format. + priors (Tensor): Shape (num_priors, 4) for priors, "xyxy" format. + + Returns: + distances (Tensor): Center distances between bboxes and priors, + shape (num_priors, n). + priors_points (Tensor): Priors cx cy points, + shape (num_priors, 2). 
+ """ + bbox_cx = (bboxes[:, 0] + bboxes[:, 2]) / 2.0 + bbox_cy = (bboxes[:, 1] + bboxes[:, 3]) / 2.0 + bbox_points = torch.stack((bbox_cx, bbox_cy), dim=1) + + priors_cx = (priors[:, 0] + priors[:, 2]) / 2.0 + priors_cy = (priors[:, 1] + priors[:, 3]) / 2.0 + priors_points = torch.stack((priors_cx, priors_cy), dim=1) + + distances = (bbox_points[:, None, :] - + priors_points[None, :, :]).pow(2).sum(-1).sqrt() + + return distances, priors_points + + +@TASK_UTILS.register_module() +class BatchATSSAssigner(nn.Module): + """Assign a batch of corresponding gt bboxes or background to each prior. + + This code is based on + https://github.com/meituan/YOLOv6/blob/main/yolov6/assigners/atss_assigner.py + + Each proposal will be assigned with `0` or a positive integer + indicating the ground truth index. + + - 0: negative sample, no assigned gt + - positive integer: positive sample, index (1-based) of assigned gt + + Args: + num_classes (int): number of class + iou_calculator (:obj:`ConfigDict` or dict): Config dict for iou + calculator. Defaults to ``dict(type='BboxOverlaps2D')`` + topk (int): number of priors selected in each level + """ + + def __init__( + self, + num_classes: int, + iou_calculator: ConfigType = dict(type='mmdet.BboxOverlaps2D'), + topk: int = 9): + super().__init__() + self.num_classes = num_classes + self.iou_calculator = TASK_UTILS.build(iou_calculator) + self.topk = topk + + @torch.no_grad() + def forward(self, pred_bboxes: Tensor, priors: Tensor, + num_level_priors: List, gt_labels: Tensor, gt_bboxes: Tensor, + pad_bbox_flag: Tensor) -> dict: + """Assign gt to priors. + + The assignment is done in following steps + + 1. compute iou between all prior (prior of all pyramid levels) and gt + 2. compute center distance between all prior and gt + 3. on each pyramid level, for each gt, select k prior whose center + are closest to the gt center, so we total select k*l prior as + candidates for each gt + 4. get corresponding iou for the these candidates, and compute the + mean and std, set mean + std as the iou threshold + 5. select these candidates whose iou are greater than or equal to + the threshold as positive + 6. 
limit the positive sample's center in gt + + Args: + pred_bboxes (Tensor): Predicted bounding boxes, + shape(batch_size, num_priors, 4) + priors (Tensor): Model priors with stride, shape(num_priors, 4) + num_level_priors (List): Number of bboxes in each level, len(3) + gt_labels (Tensor): Ground truth label, + shape(batch_size, num_gt, 1) + gt_bboxes (Tensor): Ground truth bbox, + shape(batch_size, num_gt, 4) + pad_bbox_flag (Tensor): Ground truth bbox mask, + 1 means bbox, 0 means no bbox, + shape(batch_size, num_gt, 1) + Returns: + assigned_result (dict): Assigned result + 'assigned_labels' (Tensor): shape(batch_size, num_gt) + 'assigned_bboxes' (Tensor): shape(batch_size, num_gt, 4) + 'assigned_scores' (Tensor): + shape(batch_size, num_gt, number_classes) + 'fg_mask_pre_prior' (Tensor): shape(bs, num_gt) + """ + # generate priors + cell_half_size = priors[:, 2:] * 2.5 + priors_gen = torch.zeros_like(priors) + priors_gen[:, :2] = priors[:, :2] - cell_half_size + priors_gen[:, 2:] = priors[:, :2] + cell_half_size + priors = priors_gen + + batch_size = gt_bboxes.size(0) + num_gt, num_priors = gt_bboxes.size(1), priors.size(0) + + assigned_result = { + 'assigned_labels': + gt_bboxes.new_full([batch_size, num_priors], self.num_classes), + 'assigned_bboxes': + gt_bboxes.new_full([batch_size, num_priors, 4], 0), + 'assigned_scores': + gt_bboxes.new_full([batch_size, num_priors, self.num_classes], 0), + 'fg_mask_pre_prior': + gt_bboxes.new_full([batch_size, num_priors], 0) + } + + if num_gt == 0: + return assigned_result + + # compute iou between all prior (prior of all pyramid levels) and gt + overlaps = self.iou_calculator(gt_bboxes.reshape([-1, 4]), priors) + overlaps = overlaps.reshape([batch_size, -1, num_priors]) + + # compute center distance between all prior and gt + distances, priors_points = bbox_center_distance( + gt_bboxes.reshape([-1, 4]), priors) + distances = distances.reshape([batch_size, -1, num_priors]) + + # Selecting candidates based on the center distance + is_in_candidate, candidate_idxs = self.select_topk_candidates( + distances, num_level_priors, pad_bbox_flag) + + # get corresponding iou for the these candidates, and compute the + # mean and std, set mean + std as the iou threshold + overlaps_thr_per_gt, iou_candidates = self.threshold_calculator( + is_in_candidate, candidate_idxs, overlaps, num_priors, batch_size, + num_gt) + + # select candidates iou >= threshold as positive + is_pos = torch.where( + iou_candidates > overlaps_thr_per_gt.repeat([1, 1, num_priors]), + is_in_candidate, torch.zeros_like(is_in_candidate)) + + is_in_gts = select_candidates_in_gts(priors_points, gt_bboxes) + pos_mask = is_pos * is_in_gts * pad_bbox_flag + + # if an anchor box is assigned to multiple gts, + # the one with the highest IoU will be selected. 
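# Annotation (added for clarity, not part of the original patch):
# select_highest_overlaps resolves priors that hit several gts. It collapses
# the (batch, num_gt, num_priors) pos_mask along the gt axis so each prior
# keeps at most one gt -- the one with the largest IoU. For example, a prior
# matched to gt#1 (IoU 0.55) and gt#3 (IoU 0.72) is kept for gt#3 only, and
# fg_mask_pre_prior then flags it as a foreground prior.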
+ gt_idx_pre_prior, fg_mask_pre_prior, pos_mask = \ + select_highest_overlaps(pos_mask, overlaps, num_gt) + + # assigned target + assigned_labels, assigned_bboxes, assigned_scores = self.get_targets( + gt_labels, gt_bboxes, gt_idx_pre_prior, fg_mask_pre_prior, + num_priors, batch_size, num_gt) + + # soft label with iou + if pred_bboxes is not None: + ious = yolov6_iou_calculator(gt_bboxes, pred_bboxes) * pos_mask + ious = ious.max(axis=-2)[0].unsqueeze(-1) + assigned_scores *= ious + + assigned_result['assigned_labels'] = assigned_labels.long() + assigned_result['assigned_bboxes'] = assigned_bboxes + assigned_result['assigned_scores'] = assigned_scores + assigned_result['fg_mask_pre_prior'] = fg_mask_pre_prior.bool() + return assigned_result + + def select_topk_candidates(self, distances: Tensor, + num_level_priors: List[int], + pad_bbox_flag: Tensor) -> Tuple[Tensor, Tensor]: + """Selecting candidates based on the center distance. + + Args: + distances (Tensor): Distance between all bbox and gt, + shape(batch_size, num_gt, num_priors) + num_level_priors (List[int]): Number of bboxes in each level, + len(3) + pad_bbox_flag (Tensor): Ground truth bbox mask, + shape(batch_size, num_gt, 1) + + Return: + is_in_candidate_list (Tensor): Flag show that each level have + topk candidates or not, shape(batch_size, num_gt, num_priors) + candidate_idxs (Tensor): Candidates index, + shape(batch_size, num_gt, num_gt) + """ + is_in_candidate_list = [] + candidate_idxs = [] + start_idx = 0 + + distances_dtype = distances.dtype + distances = torch.split(distances, num_level_priors, dim=-1) + pad_bbox_flag = pad_bbox_flag.repeat(1, 1, self.topk).bool() + + for distances_per_level, priors_per_level in zip( + distances, num_level_priors): + # on each pyramid level, for each gt, + # select k bbox whose center are closest to the gt center + end_index = start_idx + priors_per_level + selected_k = min(self.topk, priors_per_level) + + _, topk_idxs_per_level = distances_per_level.topk( + selected_k, dim=-1, largest=False) + candidate_idxs.append(topk_idxs_per_level + start_idx) + + topk_idxs_per_level = torch.where( + pad_bbox_flag, topk_idxs_per_level, + torch.zeros_like(topk_idxs_per_level)) + + is_in_candidate = F.one_hot(topk_idxs_per_level, + priors_per_level).sum(dim=-2) + is_in_candidate = torch.where(is_in_candidate > 1, + torch.zeros_like(is_in_candidate), + is_in_candidate) + is_in_candidate_list.append(is_in_candidate.to(distances_dtype)) + + start_idx = end_index + + is_in_candidate_list = torch.cat(is_in_candidate_list, dim=-1) + candidate_idxs = torch.cat(candidate_idxs, dim=-1) + + return is_in_candidate_list, candidate_idxs + + @staticmethod + def threshold_calculator(is_in_candidate: List, candidate_idxs: Tensor, + overlaps: Tensor, num_priors: int, + batch_size: int, + num_gt: int) -> Tuple[Tensor, Tensor]: + """Get corresponding iou for the these candidates, and compute the mean + and std, set mean + std as the iou threshold. + + Args: + is_in_candidate (Tensor): Flag show that each level have + topk candidates or not, shape(batch_size, num_gt, num_priors). + candidate_idxs (Tensor): Candidates index, + shape(batch_size, num_gt, num_gt) + overlaps (Tensor): Overlaps area, + shape(batch_size, num_gt, num_priors). + num_priors (int): Number of priors. + batch_size (int): Batch size. + num_gt (int): Number of ground truth. + + Return: + overlaps_thr_per_gt (Tensor): Overlap threshold of + per ground truth, shape(batch_size, num_gt, 1). 
+ candidate_overlaps (Tensor): Candidate overlaps, + shape(batch_size, num_gt, num_priors). + """ + + batch_size_num_gt = batch_size * num_gt + candidate_overlaps = torch.where(is_in_candidate > 0, overlaps, + torch.zeros_like(overlaps)) + candidate_idxs = candidate_idxs.reshape([batch_size_num_gt, -1]) + + assist_indexes = num_priors * torch.arange( + batch_size_num_gt, device=candidate_idxs.device) + assist_indexes = assist_indexes[:, None] + flatten_indexes = candidate_idxs + assist_indexes + + candidate_overlaps_reshape = candidate_overlaps.reshape( + -1)[flatten_indexes] + candidate_overlaps_reshape = candidate_overlaps_reshape.reshape( + [batch_size, num_gt, -1]) + + overlaps_mean_per_gt = candidate_overlaps_reshape.mean( + axis=-1, keepdim=True) + overlaps_std_per_gt = candidate_overlaps_reshape.std( + axis=-1, keepdim=True) + overlaps_thr_per_gt = overlaps_mean_per_gt + overlaps_std_per_gt + + return overlaps_thr_per_gt, candidate_overlaps + + def get_targets(self, gt_labels: Tensor, gt_bboxes: Tensor, + assigned_gt_inds: Tensor, fg_mask_pre_prior: Tensor, + num_priors: int, batch_size: int, + num_gt: int) -> Tuple[Tensor, Tensor, Tensor]: + """Get target info. + + Args: + gt_labels (Tensor): Ground true labels, + shape(batch_size, num_gt, 1) + gt_bboxes (Tensor): Ground true bboxes, + shape(batch_size, num_gt, 4) + assigned_gt_inds (Tensor): Assigned ground truth indexes, + shape(batch_size, num_priors) + fg_mask_pre_prior (Tensor): Force ground truth matching mask, + shape(batch_size, num_priors) + num_priors (int): Number of priors. + batch_size (int): Batch size. + num_gt (int): Number of ground truth. + + Return: + assigned_labels (Tensor): Assigned labels, + shape(batch_size, num_priors) + assigned_bboxes (Tensor): Assigned bboxes, + shape(batch_size, num_priors) + assigned_scores (Tensor): Assigned scores, + shape(batch_size, num_priors) + """ + + # assigned target labels + batch_index = torch.arange( + batch_size, dtype=gt_labels.dtype, device=gt_labels.device) + batch_index = batch_index[..., None] + assigned_gt_inds = (assigned_gt_inds + batch_index * num_gt).long() + assigned_labels = gt_labels.flatten()[assigned_gt_inds.flatten()] + assigned_labels = assigned_labels.reshape([batch_size, num_priors]) + assigned_labels = torch.where( + fg_mask_pre_prior > 0, assigned_labels, + torch.full_like(assigned_labels, self.num_classes)) + + # assigned target boxes + assigned_bboxes = gt_bboxes.reshape([-1, + 4])[assigned_gt_inds.flatten()] + assigned_bboxes = assigned_bboxes.reshape([batch_size, num_priors, 4]) + + # assigned target scores + assigned_scores = F.one_hot(assigned_labels.long(), + self.num_classes + 1).float() + assigned_scores = assigned_scores[:, :, :self.num_classes] + + return assigned_labels, assigned_bboxes, assigned_scores diff --git a/mmyolo/models/task_modules/assigners/batch_dsl_assigner.py b/mmyolo/models/task_modules/assigners/batch_dsl_assigner.py new file mode 100644 index 0000000000000000000000000000000000000000..5ae0f80239590f9c906778e6e4c7c6b4bd10c488 --- /dev/null +++ b/mmyolo/models/task_modules/assigners/batch_dsl_assigner.py @@ -0,0 +1,272 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
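Before the dynamic soft-label assigner that follows, a small standalone sketch of the mean-plus-std IoU thresholding performed by threshold_calculator above. The IoU values are invented purely for illustration; the snippet is not part of this patch.

# Toy example of the ATSS-style IoU threshold (mean + std of candidate IoUs).
import torch

candidate_ious = torch.tensor([[0.10, 0.30, 0.50, 0.70]])  # one gt, four candidate priors
iou_thr = candidate_ious.mean(-1, keepdim=True) + candidate_ious.std(-1, keepdim=True)
# iou_thr is roughly 0.40 + 0.26 = 0.66, so only the 0.70 candidate stays positive
is_pos = candidate_ious > iou_thr
print(iou_thr, is_pos)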
+from typing import Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmdet.structures.bbox import BaseBoxes +from mmdet.utils import ConfigType +from torch import Tensor + +from mmyolo.registry import TASK_UTILS + +INF = 100000000 +EPS = 1.0e-7 + + +def find_inside_points(boxes: Tensor, + points: Tensor, + box_dim: int = 4, + eps: float = 0.01) -> Tensor: + """Find inside box points in batches. Boxes dimension must be 3. + + Args: + boxes (Tensor): Boxes tensor. Must be batch input. + Has shape of (batch_size, n_boxes, box_dim). + points (Tensor): Points coordinates. Has shape of (n_points, 2). + box_dim (int): The dimension of box. 4 means horizontal box and + 5 means rotated box. Defaults to 4. + eps (float): Make sure the points are inside not on the boundary. + Only use in rotated boxes. Defaults to 0.01. + + Returns: + Tensor: A BoolTensor indicating whether a point is inside + boxes. The index has shape of (n_points, batch_size, n_boxes). + """ + if box_dim == 4: + # Horizontal Boxes + lt_ = points[:, None, None] - boxes[..., :2] + rb_ = boxes[..., 2:] - points[:, None, None] + + deltas = torch.cat([lt_, rb_], dim=-1) + is_in_gts = deltas.min(dim=-1).values > 0 + + elif box_dim == 5: + # Rotated Boxes + points = points[:, None, None] + ctrs, wh, t = torch.split(boxes, [2, 2, 1], dim=-1) + cos_value, sin_value = torch.cos(t), torch.sin(t) + matrix = torch.cat([cos_value, sin_value, -sin_value, cos_value], + dim=-1).reshape(*boxes.shape[:-1], 2, 2) + + offset = points - ctrs + offset = torch.matmul(matrix, offset[..., None]) + offset = offset.squeeze(-1) + offset_x, offset_y = offset[..., 0], offset[..., 1] + w, h = wh[..., 0], wh[..., 1] + is_in_gts = (offset_x <= w / 2 - eps) & (offset_x >= - w / 2 + eps) & \ + (offset_y <= h / 2 - eps) & (offset_y >= - h / 2 + eps) + else: + raise NotImplementedError(f'Unsupport box_dim:{box_dim}') + + return is_in_gts + + +def get_box_center(boxes: Tensor, box_dim: int = 4) -> Tensor: + """Return a tensor representing the centers of boxes. + + Args: + boxes (Tensor): Boxes tensor. Has shape of (b, n, box_dim) + box_dim (int): The dimension of box. 4 means horizontal box and + 5 means rotated box. Defaults to 4. + + Returns: + Tensor: Centers have shape of (b, n, 2) + """ + if box_dim == 4: + # Horizontal Boxes, (x1, y1, x2, y2) + return (boxes[..., :2] + boxes[..., 2:]) / 2.0 + elif box_dim == 5: + # Rotated Boxes, (x, y, w, h, a) + return boxes[..., :2] + else: + raise NotImplementedError(f'Unsupported box_dim:{box_dim}') + + +@TASK_UTILS.register_module() +class BatchDynamicSoftLabelAssigner(nn.Module): + """Computes matching between predictions and ground truth with dynamic soft + label assignment. + + Args: + num_classes (int): number of class + soft_center_radius (float): Radius of the soft center prior. + Defaults to 3.0. + topk (int): Select top-k predictions to calculate dynamic k + best matches for each gt. Defaults to 13. + iou_weight (float): The scale factor of iou cost. Defaults to 3.0. + iou_calculator (ConfigType): Config of overlaps Calculator. + Defaults to dict(type='BboxOverlaps2D'). + batch_iou (bool): Use batch input when calculate IoU. + If set to False use loop instead. Defaults to True. 
+ """ + + def __init__( + self, + num_classes, + soft_center_radius: float = 3.0, + topk: int = 13, + iou_weight: float = 3.0, + iou_calculator: ConfigType = dict(type='mmdet.BboxOverlaps2D'), + batch_iou: bool = True, + ) -> None: + super().__init__() + self.num_classes = num_classes + self.soft_center_radius = soft_center_radius + self.topk = topk + self.iou_weight = iou_weight + self.iou_calculator = TASK_UTILS.build(iou_calculator) + self.batch_iou = batch_iou + + @torch.no_grad() + def forward(self, pred_bboxes: Tensor, pred_scores: Tensor, priors: Tensor, + gt_labels: Tensor, gt_bboxes: Tensor, + pad_bbox_flag: Tensor) -> dict: + num_gt = gt_bboxes.size(1) + decoded_bboxes = pred_bboxes + batch_size, num_bboxes, box_dim = decoded_bboxes.size() + + if num_gt == 0 or num_bboxes == 0: + return { + 'assigned_labels': + gt_labels.new_full( + pred_scores[..., 0].shape, + self.num_classes, + dtype=torch.long), + 'assigned_labels_weights': + gt_bboxes.new_full(pred_scores[..., 0].shape, 1), + 'assigned_bboxes': + gt_bboxes.new_full(pred_bboxes.shape, 0), + 'assign_metrics': + gt_bboxes.new_full(pred_scores[..., 0].shape, 0) + } + + prior_center = priors[:, :2] + if isinstance(gt_bboxes, BaseBoxes): + raise NotImplementedError( + f'type of {type(gt_bboxes)} are not implemented !') + else: + is_in_gts = find_inside_points(gt_bboxes, prior_center, box_dim) + + # (N_points, B, N_boxes) + is_in_gts = is_in_gts * pad_bbox_flag[..., 0][None] + # (N_points, B, N_boxes) -> (B, N_points, N_boxes) + is_in_gts = is_in_gts.permute(1, 0, 2) + # (B, N_points) + valid_mask = is_in_gts.sum(dim=-1) > 0 + + gt_center = get_box_center(gt_bboxes, box_dim) + + strides = priors[..., 2] + distance = (priors[None].unsqueeze(2)[..., :2] - + gt_center[:, None, :, :] + ).pow(2).sum(-1).sqrt() / strides[None, :, None] + + # prevent overflow + distance = distance * valid_mask.unsqueeze(-1) + soft_center_prior = torch.pow(10, distance - self.soft_center_radius) + + if self.batch_iou: + pairwise_ious = self.iou_calculator(decoded_bboxes, gt_bboxes) + else: + ious = [] + for box, gt in zip(decoded_bboxes, gt_bboxes): + iou = self.iou_calculator(box, gt) + ious.append(iou) + pairwise_ious = torch.stack(ious, dim=0) + + iou_cost = -torch.log(pairwise_ious + EPS) * self.iou_weight + + # select the predicted scores corresponded to the gt_labels + pairwise_pred_scores = pred_scores.permute(0, 2, 1) + idx = torch.zeros([2, batch_size, num_gt], dtype=torch.long) + idx[0] = torch.arange(end=batch_size).view(-1, 1).repeat(1, num_gt) + idx[1] = gt_labels.long().squeeze(-1) + pairwise_pred_scores = pairwise_pred_scores[idx[0], + idx[1]].permute(0, 2, 1) + # classification cost + scale_factor = pairwise_ious - pairwise_pred_scores.sigmoid() + pairwise_cls_cost = F.binary_cross_entropy_with_logits( + pairwise_pred_scores, pairwise_ious, + reduction='none') * scale_factor.abs().pow(2.0) + + cost_matrix = pairwise_cls_cost + iou_cost + soft_center_prior + + max_pad_value = torch.ones_like(cost_matrix) * INF + cost_matrix = torch.where(valid_mask[..., None].repeat(1, 1, num_gt), + cost_matrix, max_pad_value) + + (matched_pred_ious, matched_gt_inds, + fg_mask_inboxes) = self.dynamic_k_matching(cost_matrix, pairwise_ious, + pad_bbox_flag) + + del pairwise_ious, cost_matrix + + batch_index = (fg_mask_inboxes > 0).nonzero(as_tuple=True)[0] + + assigned_labels = gt_labels.new_full(pred_scores[..., 0].shape, + self.num_classes) + assigned_labels[fg_mask_inboxes] = gt_labels[ + batch_index, matched_gt_inds].squeeze(-1) + assigned_labels = 
assigned_labels.long() + + assigned_labels_weights = gt_bboxes.new_full(pred_scores[..., 0].shape, + 1) + + assigned_bboxes = gt_bboxes.new_full(pred_bboxes.shape, 0) + assigned_bboxes[fg_mask_inboxes] = gt_bboxes[batch_index, + matched_gt_inds] + + assign_metrics = gt_bboxes.new_full(pred_scores[..., 0].shape, 0) + assign_metrics[fg_mask_inboxes] = matched_pred_ious + + return dict( + assigned_labels=assigned_labels, + assigned_labels_weights=assigned_labels_weights, + assigned_bboxes=assigned_bboxes, + assign_metrics=assign_metrics) + + def dynamic_k_matching( + self, cost_matrix: Tensor, pairwise_ious: Tensor, + pad_bbox_flag: int) -> Tuple[Tensor, Tensor, Tensor]: + """Use IoU and matching cost to calculate the dynamic top-k positive + targets. + + Args: + cost_matrix (Tensor): Cost matrix. + pairwise_ious (Tensor): Pairwise iou matrix. + num_gt (int): Number of gt. + valid_mask (Tensor): Mask for valid bboxes. + Returns: + tuple: matched ious and gt indexes. + """ + matching_matrix = torch.zeros_like(cost_matrix, dtype=torch.uint8) + # select candidate topk ious for dynamic-k calculation + candidate_topk = min(self.topk, pairwise_ious.size(1)) + topk_ious, _ = torch.topk(pairwise_ious, candidate_topk, dim=1) + # calculate dynamic k for each gt + dynamic_ks = torch.clamp(topk_ious.sum(1).int(), min=1) + + num_gts = pad_bbox_flag.sum((1, 2)).int() + # sorting the batch cost matirx is faster than topk + _, sorted_indices = torch.sort(cost_matrix, dim=1) + for b in range(pad_bbox_flag.shape[0]): + for gt_idx in range(num_gts[b]): + topk_ids = sorted_indices[b, :dynamic_ks[b, gt_idx], gt_idx] + matching_matrix[b, :, gt_idx][topk_ids] = 1 + + del topk_ious, dynamic_ks + + prior_match_gt_mask = matching_matrix.sum(2) > 1 + if prior_match_gt_mask.sum() > 0: + cost_min, cost_argmin = torch.min( + cost_matrix[prior_match_gt_mask, :], dim=1) + matching_matrix[prior_match_gt_mask, :] *= 0 + matching_matrix[prior_match_gt_mask, cost_argmin] = 1 + + # get foreground mask inside box and center prior + fg_mask_inboxes = matching_matrix.sum(2) > 0 + matched_pred_ious = (matching_matrix * + pairwise_ious).sum(2)[fg_mask_inboxes] + matched_gt_inds = matching_matrix[fg_mask_inboxes, :].argmax(1) + return matched_pred_ious, matched_gt_inds, fg_mask_inboxes diff --git a/mmyolo/models/task_modules/assigners/batch_task_aligned_assigner.py b/mmyolo/models/task_modules/assigners/batch_task_aligned_assigner.py new file mode 100644 index 0000000000000000000000000000000000000000..202d678986c3a398de63675c004592b98ea092e0 --- /dev/null +++ b/mmyolo/models/task_modules/assigners/batch_task_aligned_assigner.py @@ -0,0 +1,311 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor + +from mmyolo.models.losses import bbox_overlaps +from mmyolo.registry import TASK_UTILS +from .utils import (select_candidates_in_gts, select_highest_overlaps, + yolov6_iou_calculator) + + +@TASK_UTILS.register_module() +class BatchTaskAlignedAssigner(nn.Module): + """This code referenced to + https://github.com/meituan/YOLOv6/blob/main/yolov6/ + assigners/tal_assigner.py. + Batch Task aligned assigner base on the paper: + `TOOD: Task-aligned One-stage Object Detection. + `_. + Assign a corresponding gt bboxes or background to a batch of + predicted bboxes. Each bbox will be assigned with `0` or a + positive integer indicating the ground truth index. 
+ - 0: negative sample, no assigned gt + - positive integer: positive sample, index (1-based) of assigned gt + Args: + num_classes (int): number of class + topk (int): number of bbox selected in each level + alpha (float): Hyper-parameters related to alignment_metrics. + Defaults to 1.0 + beta (float): Hyper-parameters related to alignment_metrics. + Defaults to 6. + eps (float): Eps to avoid log(0). Default set to 1e-9 + use_ciou (bool): Whether to use ciou while calculating iou. + Defaults to False. + """ + + def __init__(self, + num_classes: int, + topk: int = 13, + alpha: float = 1.0, + beta: float = 6.0, + eps: float = 1e-7, + use_ciou: bool = False): + super().__init__() + self.num_classes = num_classes + self.topk = topk + self.alpha = alpha + self.beta = beta + self.eps = eps + self.use_ciou = use_ciou + + @torch.no_grad() + def forward( + self, + pred_bboxes: Tensor, + pred_scores: Tensor, + priors: Tensor, + gt_labels: Tensor, + gt_bboxes: Tensor, + pad_bbox_flag: Tensor, + ) -> dict: + """Assign gt to bboxes. + + The assignment is done in following steps + 1. compute alignment metric between all bbox (bbox of all pyramid + levels) and gt + 2. select top-k bbox as candidates for each gt + 3. limit the positive sample's center in gt (because the anchor-free + detector only can predict positive distance) + Args: + pred_bboxes (Tensor): Predict bboxes, + shape(batch_size, num_priors, 4) + pred_scores (Tensor): Scores of predict bboxes, + shape(batch_size, num_priors, num_classes) + priors (Tensor): Model priors, shape (num_priors, 4) + gt_labels (Tensor): Ground true labels, + shape(batch_size, num_gt, 1) + gt_bboxes (Tensor): Ground true bboxes, + shape(batch_size, num_gt, 4) + pad_bbox_flag (Tensor): Ground truth bbox mask, + 1 means bbox, 0 means no bbox, + shape(batch_size, num_gt, 1) + Returns: + assigned_result (dict) Assigned result: + assigned_labels (Tensor): Assigned labels, + shape(batch_size, num_priors) + assigned_bboxes (Tensor): Assigned boxes, + shape(batch_size, num_priors, 4) + assigned_scores (Tensor): Assigned scores, + shape(batch_size, num_priors, num_classes) + fg_mask_pre_prior (Tensor): Force ground truth matching mask, + shape(batch_size, num_priors) + """ + # (num_priors, 4) -> (num_priors, 2) + priors = priors[:, :2] + + batch_size = pred_scores.size(0) + num_gt = gt_bboxes.size(1) + + assigned_result = { + 'assigned_labels': + gt_bboxes.new_full(pred_scores[..., 0].shape, self.num_classes), + 'assigned_bboxes': + gt_bboxes.new_full(pred_bboxes.shape, 0), + 'assigned_scores': + gt_bboxes.new_full(pred_scores.shape, 0), + 'fg_mask_pre_prior': + gt_bboxes.new_full(pred_scores[..., 0].shape, 0) + } + + if num_gt == 0: + return assigned_result + + pos_mask, alignment_metrics, overlaps = self.get_pos_mask( + pred_bboxes, pred_scores, priors, gt_labels, gt_bboxes, + pad_bbox_flag, batch_size, num_gt) + + (assigned_gt_idxs, fg_mask_pre_prior, + pos_mask) = select_highest_overlaps(pos_mask, overlaps, num_gt) + + # assigned target + assigned_labels, assigned_bboxes, assigned_scores = self.get_targets( + gt_labels, gt_bboxes, assigned_gt_idxs, fg_mask_pre_prior, + batch_size, num_gt) + + # normalize + alignment_metrics *= pos_mask + pos_align_metrics = alignment_metrics.max(axis=-1, keepdim=True)[0] + pos_overlaps = (overlaps * pos_mask).max(axis=-1, keepdim=True)[0] + norm_align_metric = ( + alignment_metrics * pos_overlaps / + (pos_align_metrics + self.eps)).max(-2)[0].unsqueeze(-1) + assigned_scores = assigned_scores * norm_align_metric + + 
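# Annotation (added for clarity, not part of the original patch): the
# normalisation above rescales each prior's one-hot class target by its
# alignment metric, divided by its gt's best alignment metric and multiplied
# by that gt's best overlap (the max is taken across gts). Well-aligned
# priors therefore keep scores close to their gt's best IoU, while poorly
# aligned priors are softly down-weighted before the targets are packed into
# assigned_result below.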
assigned_result['assigned_labels'] = assigned_labels + assigned_result['assigned_bboxes'] = assigned_bboxes + assigned_result['assigned_scores'] = assigned_scores + assigned_result['fg_mask_pre_prior'] = fg_mask_pre_prior.bool() + return assigned_result + + def get_pos_mask(self, pred_bboxes: Tensor, pred_scores: Tensor, + priors: Tensor, gt_labels: Tensor, gt_bboxes: Tensor, + pad_bbox_flag: Tensor, batch_size: int, + num_gt: int) -> Tuple[Tensor, Tensor, Tensor]: + """Get possible mask. + + Args: + pred_bboxes (Tensor): Predict bboxes, + shape(batch_size, num_priors, 4) + pred_scores (Tensor): Scores of predict bbox, + shape(batch_size, num_priors, num_classes) + priors (Tensor): Model priors, shape (num_priors, 2) + gt_labels (Tensor): Ground true labels, + shape(batch_size, num_gt, 1) + gt_bboxes (Tensor): Ground true bboxes, + shape(batch_size, num_gt, 4) + pad_bbox_flag (Tensor): Ground truth bbox mask, + 1 means bbox, 0 means no bbox, + shape(batch_size, num_gt, 1) + batch_size (int): Batch size. + num_gt (int): Number of ground truth. + Returns: + pos_mask (Tensor): Possible mask, + shape(batch_size, num_gt, num_priors) + alignment_metrics (Tensor): Alignment metrics, + shape(batch_size, num_gt, num_priors) + overlaps (Tensor): Overlaps of gt_bboxes and pred_bboxes, + shape(batch_size, num_gt, num_priors) + """ + + # Compute alignment metric between all bbox and gt + alignment_metrics, overlaps = \ + self.get_box_metrics(pred_bboxes, pred_scores, gt_labels, + gt_bboxes, batch_size, num_gt) + + # get is_in_gts mask + is_in_gts = select_candidates_in_gts(priors, gt_bboxes) + + # get topk_metric mask + topk_metric = self.select_topk_candidates( + alignment_metrics * is_in_gts, + topk_mask=pad_bbox_flag.repeat([1, 1, self.topk]).bool()) + + # merge all mask to a final mask + pos_mask = topk_metric * is_in_gts * pad_bbox_flag + + return pos_mask, alignment_metrics, overlaps + + def get_box_metrics(self, pred_bboxes: Tensor, pred_scores: Tensor, + gt_labels: Tensor, gt_bboxes: Tensor, batch_size: int, + num_gt: int) -> Tuple[Tensor, Tensor]: + """Compute alignment metric between all bbox and gt. + + Args: + pred_bboxes (Tensor): Predict bboxes, + shape(batch_size, num_priors, 4) + pred_scores (Tensor): Scores of predict bbox, + shape(batch_size, num_priors, num_classes) + gt_labels (Tensor): Ground true labels, + shape(batch_size, num_gt, 1) + gt_bboxes (Tensor): Ground true bboxes, + shape(batch_size, num_gt, 4) + batch_size (int): Batch size. + num_gt (int): Number of ground truth. 
+ Returns: + alignment_metrics (Tensor): Align metric, + shape(batch_size, num_gt, num_priors) + overlaps (Tensor): Overlaps, shape(batch_size, num_gt, num_priors) + """ + pred_scores = pred_scores.permute(0, 2, 1) + gt_labels = gt_labels.to(torch.long) + idx = torch.zeros([2, batch_size, num_gt], dtype=torch.long) + idx[0] = torch.arange(end=batch_size).view(-1, 1).repeat(1, num_gt) + idx[1] = gt_labels.squeeze(-1) + bbox_scores = pred_scores[idx[0], idx[1]] + # TODO: need to replace the yolov6_iou_calculator function + if self.use_ciou: + overlaps = bbox_overlaps( + pred_bboxes.unsqueeze(1), + gt_bboxes.unsqueeze(2), + iou_mode='ciou', + bbox_format='xyxy').clamp(0) + else: + overlaps = yolov6_iou_calculator(gt_bboxes, pred_bboxes) + + alignment_metrics = bbox_scores.pow(self.alpha) * overlaps.pow( + self.beta) + + return alignment_metrics, overlaps + + def select_topk_candidates(self, + alignment_gt_metrics: Tensor, + using_largest_topk: bool = True, + topk_mask: Optional[Tensor] = None) -> Tensor: + """Compute alignment metric between all bbox and gt. + + Args: + alignment_gt_metrics (Tensor): Alignment metric of gt candidates, + shape(batch_size, num_gt, num_priors) + using_largest_topk (bool): Controls whether to using largest or + smallest elements. + topk_mask (Tensor): Topk mask, + shape(batch_size, num_gt, self.topk) + Returns: + Tensor: Topk candidates mask, + shape(batch_size, num_gt, num_priors) + """ + num_priors = alignment_gt_metrics.shape[-1] + topk_metrics, topk_idxs = torch.topk( + alignment_gt_metrics, + self.topk, + axis=-1, + largest=using_largest_topk) + if topk_mask is None: + topk_mask = (topk_metrics.max(axis=-1, keepdim=True) > + self.eps).tile([1, 1, self.topk]) + topk_idxs = torch.where(topk_mask, topk_idxs, + torch.zeros_like(topk_idxs)) + is_in_topk = F.one_hot(topk_idxs, num_priors).sum(axis=-2) + is_in_topk = torch.where(is_in_topk > 1, torch.zeros_like(is_in_topk), + is_in_topk) + return is_in_topk.to(alignment_gt_metrics.dtype) + + def get_targets(self, gt_labels: Tensor, gt_bboxes: Tensor, + assigned_gt_idxs: Tensor, fg_mask_pre_prior: Tensor, + batch_size: int, + num_gt: int) -> Tuple[Tensor, Tensor, Tensor]: + """Get assigner info. + + Args: + gt_labels (Tensor): Ground true labels, + shape(batch_size, num_gt, 1) + gt_bboxes (Tensor): Ground true bboxes, + shape(batch_size, num_gt, 4) + assigned_gt_idxs (Tensor): Assigned ground truth indexes, + shape(batch_size, num_priors) + fg_mask_pre_prior (Tensor): Force ground truth matching mask, + shape(batch_size, num_priors) + batch_size (int): Batch size. + num_gt (int): Number of ground truth. 
+ Returns: + assigned_labels (Tensor): Assigned labels, + shape(batch_size, num_priors) + assigned_bboxes (Tensor): Assigned bboxes, + shape(batch_size, num_priors) + assigned_scores (Tensor): Assigned scores, + shape(batch_size, num_priors) + """ + # assigned target labels + batch_ind = torch.arange( + end=batch_size, dtype=torch.int64, device=gt_labels.device)[..., + None] + assigned_gt_idxs = assigned_gt_idxs + batch_ind * num_gt + assigned_labels = gt_labels.long().flatten()[assigned_gt_idxs] + + # assigned target boxes + assigned_bboxes = gt_bboxes.reshape([-1, 4])[assigned_gt_idxs] + + # assigned target scores + assigned_labels[assigned_labels < 0] = 0 + assigned_scores = F.one_hot(assigned_labels, self.num_classes) + force_gt_scores_mask = fg_mask_pre_prior[:, :, None].repeat( + 1, 1, self.num_classes) + assigned_scores = torch.where(force_gt_scores_mask > 0, + assigned_scores, + torch.full_like(assigned_scores, 0)) + + return assigned_labels, assigned_bboxes, assigned_scores diff --git a/mmyolo/models/task_modules/assigners/batch_yolov7_assigner.py b/mmyolo/models/task_modules/assigners/batch_yolov7_assigner.py new file mode 100644 index 0000000000000000000000000000000000000000..6709968eeb1768fc4e6124f1f7a344f581dd43a7 --- /dev/null +++ b/mmyolo/models/task_modules/assigners/batch_yolov7_assigner.py @@ -0,0 +1,344 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Sequence + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmdet.structures.bbox import bbox_cxcywh_to_xyxy, bbox_overlaps + + +def _cat_multi_level_tensor_in_place(*multi_level_tensor, place_hold_var): + """concat multi-level tensor in place.""" + for level_tensor in multi_level_tensor: + for i, var in enumerate(level_tensor): + if len(var) > 0: + level_tensor[i] = torch.cat(var, dim=0) + else: + level_tensor[i] = place_hold_var + + +class BatchYOLOv7Assigner(nn.Module): + """Batch YOLOv7 Assigner. + + It consists of two assigning steps: + + 1. YOLOv5 cross-grid sample assigning + 2. SimOTA assigning + + This code referenced to + https://github.com/WongKinYiu/yolov7/blob/main/utils/loss.py. + + Args: + num_classes (int): Number of classes. + num_base_priors (int): Number of base priors. + featmap_strides (Sequence[int]): Feature map strides. + prior_match_thr (float): Threshold to match priors. + Defaults to 4.0. + candidate_topk (int): Number of topk candidates to + assign. Defaults to 10. + iou_weight (float): IOU weight. Defaults to 3.0. + cls_weight (float): Class weight. Defaults to 1.0. 
+ """ + + def __init__(self, + num_classes: int, + num_base_priors: int, + featmap_strides: Sequence[int], + prior_match_thr: float = 4.0, + candidate_topk: int = 10, + iou_weight: float = 3.0, + cls_weight: float = 1.0): + super().__init__() + self.num_classes = num_classes + self.num_base_priors = num_base_priors + self.featmap_strides = featmap_strides + # yolov5 param + self.prior_match_thr = prior_match_thr + # simota param + self.candidate_topk = candidate_topk + self.iou_weight = iou_weight + self.cls_weight = cls_weight + + @torch.no_grad() + def forward(self, + pred_results, + batch_targets_normed, + batch_input_shape, + priors_base_sizes, + grid_offset, + near_neighbor_thr=0.5) -> dict: + """Forward function.""" + # (num_base_priors, num_batch_gt, 7) + # 7 is mean (batch_idx, cls_id, x_norm, y_norm, + # w_norm, h_norm, prior_idx) + + # mlvl is mean multi_level + if batch_targets_normed.shape[1] == 0: + # empty gt of batch + num_levels = len(pred_results) + return dict( + mlvl_positive_infos=[pred_results[0].new_empty( + (0, 4))] * num_levels, + mlvl_priors=[] * num_levels, + mlvl_targets_normed=[] * num_levels) + + # if near_neighbor_thr = 0.5 are mean the nearest + # 3 neighbors are also considered positive samples. + # if near_neighbor_thr = 1.0 are mean the nearest + # 5 neighbors are also considered positive samples. + mlvl_positive_infos, mlvl_priors = self.yolov5_assigner( + pred_results, + batch_targets_normed, + priors_base_sizes, + grid_offset, + near_neighbor_thr=near_neighbor_thr) + + mlvl_positive_infos, mlvl_priors, \ + mlvl_targets_normed = self.simota_assigner( + pred_results, batch_targets_normed, mlvl_positive_infos, + mlvl_priors, batch_input_shape) + + place_hold_var = batch_targets_normed.new_empty((0, 4)) + _cat_multi_level_tensor_in_place( + mlvl_positive_infos, + mlvl_priors, + mlvl_targets_normed, + place_hold_var=place_hold_var) + + return dict( + mlvl_positive_infos=mlvl_positive_infos, + mlvl_priors=mlvl_priors, + mlvl_targets_normed=mlvl_targets_normed) + + def yolov5_assigner(self, + pred_results, + batch_targets_normed, + priors_base_sizes, + grid_offset, + near_neighbor_thr=0.5): + """YOLOv5 cross-grid sample assigner.""" + num_batch_gts = batch_targets_normed.shape[1] + assert num_batch_gts > 0 + + mlvl_positive_infos, mlvl_priors = [], [] + + scaled_factor = torch.ones(7, device=pred_results[0].device) + for i in range(len(pred_results)): # lever + priors_base_sizes_i = priors_base_sizes[i] + # (1, 1, feat_shape_w, feat_shape_h, feat_shape_w, feat_shape_h) + scaled_factor[2:6] = torch.tensor( + pred_results[i].shape)[[3, 2, 3, 2]] + + # Scale batch_targets from range 0-1 to range 0-features_maps size. + # (num_base_priors, num_batch_gts, 7) + batch_targets_scaled = batch_targets_normed * scaled_factor + + # Shape match + wh_ratio = batch_targets_scaled[..., + 4:6] / priors_base_sizes_i[:, None] + match_inds = torch.max( + wh_ratio, 1. 
/ wh_ratio).max(2)[0] < self.prior_match_thr + batch_targets_scaled = batch_targets_scaled[ + match_inds] # (num_matched_target, 7) + + # no gt bbox matches anchor + if batch_targets_scaled.shape[0] == 0: + mlvl_positive_infos.append( + batch_targets_scaled.new_empty((0, 4))) + mlvl_priors.append([]) + continue + + # Positive samples with additional neighbors + batch_targets_cxcy = batch_targets_scaled[:, 2:4] + grid_xy = scaled_factor[[2, 3]] - batch_targets_cxcy + left, up = ((batch_targets_cxcy % 1 < near_neighbor_thr) & + (batch_targets_cxcy > 1)).T + right, bottom = ((grid_xy % 1 < near_neighbor_thr) & + (grid_xy > 1)).T + offset_inds = torch.stack( + (torch.ones_like(left), left, up, right, bottom)) + batch_targets_scaled = batch_targets_scaled.repeat( + (5, 1, 1))[offset_inds] # () + retained_offsets = grid_offset.repeat(1, offset_inds.shape[1], + 1)[offset_inds] + + # batch_targets_scaled: (num_matched_target, 7) + # 7 is mean (batch_idx, cls_id, x_scaled, + # y_scaled, w_scaled, h_scaled, prior_idx) + + # mlvl_positive_info: (num_matched_target, 4) + # 4 is mean (batch_idx, prior_idx, x_scaled, y_scaled) + mlvl_positive_info = batch_targets_scaled[:, [0, 6, 2, 3]] + retained_offsets = retained_offsets * near_neighbor_thr + mlvl_positive_info[:, + 2:] = mlvl_positive_info[:, + 2:] - retained_offsets + mlvl_positive_info[:, 2].clamp_(0, scaled_factor[2] - 1) + mlvl_positive_info[:, 3].clamp_(0, scaled_factor[3] - 1) + mlvl_positive_info = mlvl_positive_info.long() + priors_inds = mlvl_positive_info[:, 1] + + mlvl_positive_infos.append(mlvl_positive_info) + mlvl_priors.append(priors_base_sizes_i[priors_inds]) + + return mlvl_positive_infos, mlvl_priors + + def simota_assigner(self, pred_results, batch_targets_normed, + mlvl_positive_infos, mlvl_priors, batch_input_shape): + """SimOTA assigner.""" + num_batch_gts = batch_targets_normed.shape[1] + assert num_batch_gts > 0 + num_levels = len(mlvl_positive_infos) + + mlvl_positive_infos_matched = [[] for _ in range(num_levels)] + mlvl_priors_matched = [[] for _ in range(num_levels)] + mlvl_targets_normed_matched = [[] for _ in range(num_levels)] + + for batch_idx in range(pred_results[0].shape[0]): + # (num_batch_gt, 7) + # 7 is mean (batch_idx, cls_id, x_norm, y_norm, + # w_norm, h_norm, prior_idx) + targets_normed = batch_targets_normed[0] + # (num_gt, 7) + targets_normed = targets_normed[targets_normed[:, 0] == batch_idx] + num_gts = targets_normed.shape[0] + + if num_gts == 0: + continue + + _mlvl_decoderd_bboxes = [] + _mlvl_obj_cls = [] + _mlvl_priors = [] + _mlvl_positive_infos = [] + _from_which_layer = [] + + for i, head_pred in enumerate(pred_results): + # (num_matched_target, 4) + # 4 is mean (batch_idx, prior_idx, grid_x, grid_y) + _mlvl_positive_info = mlvl_positive_infos[i] + if _mlvl_positive_info.shape[0] == 0: + continue + + idx = (_mlvl_positive_info[:, 0] == batch_idx) + _mlvl_positive_info = _mlvl_positive_info[idx] + _mlvl_positive_infos.append(_mlvl_positive_info) + + priors = mlvl_priors[i][idx] + _mlvl_priors.append(priors) + + _from_which_layer.append( + _mlvl_positive_info.new_full( + size=(_mlvl_positive_info.shape[0], ), fill_value=i)) + + # (n,85) + level_batch_idx, prior_ind, \ + grid_x, grid_y = _mlvl_positive_info.T + pred_positive = head_pred[level_batch_idx, prior_ind, grid_y, + grid_x] + _mlvl_obj_cls.append(pred_positive[:, 4:]) + + # decoded + grid = torch.stack([grid_x, grid_y], dim=1) + pred_positive_cxcy = (pred_positive[:, :2].sigmoid() * 2. 
- + 0.5 + grid) * self.featmap_strides[i] + pred_positive_wh = (pred_positive[:, 2:4].sigmoid() * 2) ** 2 \ + * priors * self.featmap_strides[i] + pred_positive_xywh = torch.cat( + [pred_positive_cxcy, pred_positive_wh], dim=-1) + _mlvl_decoderd_bboxes.append(pred_positive_xywh) + + if len(_mlvl_decoderd_bboxes) == 0: + continue + + # 1 calc pair_wise_iou_loss + _mlvl_decoderd_bboxes = torch.cat(_mlvl_decoderd_bboxes, dim=0) + num_pred_positive = _mlvl_decoderd_bboxes.shape[0] + + if num_pred_positive == 0: + continue + + # scaled xywh + batch_input_shape_wh = pred_results[0].new_tensor( + batch_input_shape[::-1]).repeat((1, 2)) + targets_scaled_bbox = targets_normed[:, 2:6] * batch_input_shape_wh + + targets_scaled_bbox = bbox_cxcywh_to_xyxy(targets_scaled_bbox) + _mlvl_decoderd_bboxes = bbox_cxcywh_to_xyxy(_mlvl_decoderd_bboxes) + pair_wise_iou = bbox_overlaps(targets_scaled_bbox, + _mlvl_decoderd_bboxes) + pair_wise_iou_loss = -torch.log(pair_wise_iou + 1e-8) + + # 2 calc pair_wise_cls_loss + _mlvl_obj_cls = torch.cat(_mlvl_obj_cls, dim=0).float().sigmoid() + _mlvl_positive_infos = torch.cat(_mlvl_positive_infos, dim=0) + _from_which_layer = torch.cat(_from_which_layer, dim=0) + _mlvl_priors = torch.cat(_mlvl_priors, dim=0) + + gt_cls_per_image = ( + F.one_hot(targets_normed[:, 1].to(torch.int64), + self.num_classes).float().unsqueeze(1).repeat( + 1, num_pred_positive, 1)) + # cls_score * obj + cls_preds_ = _mlvl_obj_cls[:, 1:]\ + .unsqueeze(0)\ + .repeat(num_gts, 1, 1) \ + * _mlvl_obj_cls[:, 0:1]\ + .unsqueeze(0).repeat(num_gts, 1, 1) + y = cls_preds_.sqrt_() + pair_wise_cls_loss = F.binary_cross_entropy_with_logits( + torch.log(y / (1 - y)), gt_cls_per_image, + reduction='none').sum(-1) + del cls_preds_ + + # calc cost + cost = ( + self.cls_weight * pair_wise_cls_loss + + self.iou_weight * pair_wise_iou_loss) + + # num_gt, num_match_pred + matching_matrix = torch.zeros_like(cost) + + top_k, _ = torch.topk( + pair_wise_iou, + min(self.candidate_topk, pair_wise_iou.shape[1]), + dim=1) + dynamic_ks = torch.clamp(top_k.sum(1).int(), min=1) + + # Select only topk matches per gt + for gt_idx in range(num_gts): + _, pos_idx = torch.topk( + cost[gt_idx], k=dynamic_ks[gt_idx].item(), largest=False) + matching_matrix[gt_idx][pos_idx] = 1.0 + del top_k, dynamic_ks + + # Each prediction box can match at most one gt box, + # and if there are more than one, + # only the least costly one can be taken + anchor_matching_gt = matching_matrix.sum(0) + if (anchor_matching_gt > 1).sum() > 0: + _, cost_argmin = torch.min( + cost[:, anchor_matching_gt > 1], dim=0) + matching_matrix[:, anchor_matching_gt > 1] *= 0.0 + matching_matrix[cost_argmin, anchor_matching_gt > 1] = 1.0 + fg_mask_inboxes = matching_matrix.sum(0) > 0.0 + matched_gt_inds = matching_matrix[:, fg_mask_inboxes].argmax(0) + + targets_normed = targets_normed[matched_gt_inds] + _mlvl_positive_infos = _mlvl_positive_infos[fg_mask_inboxes] + _from_which_layer = _from_which_layer[fg_mask_inboxes] + _mlvl_priors = _mlvl_priors[fg_mask_inboxes] + + # Rearranged in the order of the prediction layers + # to facilitate loss + for i in range(num_levels): + layer_idx = _from_which_layer == i + mlvl_positive_infos_matched[i].append( + _mlvl_positive_infos[layer_idx]) + mlvl_priors_matched[i].append(_mlvl_priors[layer_idx]) + mlvl_targets_normed_matched[i].append( + targets_normed[layer_idx]) + + results = mlvl_positive_infos_matched, \ + mlvl_priors_matched, \ + mlvl_targets_normed_matched + return results diff --git 
a/mmyolo/models/task_modules/assigners/utils.py b/mmyolo/models/task_modules/assigners/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..5843200936ef7a269109517e6d2952cceea02059 --- /dev/null +++ b/mmyolo/models/task_modules/assigners/utils.py @@ -0,0 +1,110 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +from typing import Tuple + +import torch +import torch.nn.functional as F +from torch import Tensor + + +def select_candidates_in_gts(priors_points: Tensor, + gt_bboxes: Tensor, + eps: float = 1e-9) -> Tensor: + """Select the positive priors' center in gt. + + Args: + priors_points (Tensor): Model priors points, + shape(num_priors, 2) + gt_bboxes (Tensor): Ground true bboxes, + shape(batch_size, num_gt, 4) + eps (float): Default to 1e-9. + Return: + (Tensor): shape(batch_size, num_gt, num_priors) + """ + batch_size, num_gt, _ = gt_bboxes.size() + gt_bboxes = gt_bboxes.reshape([-1, 4]) + + priors_number = priors_points.size(0) + priors_points = priors_points.unsqueeze(0).repeat(batch_size * num_gt, 1, + 1) + + # calculate the left, top, right, bottom distance between positive + # prior center and gt side + gt_bboxes_lt = gt_bboxes[:, 0:2].unsqueeze(1).repeat(1, priors_number, 1) + gt_bboxes_rb = gt_bboxes[:, 2:4].unsqueeze(1).repeat(1, priors_number, 1) + bbox_deltas = torch.cat( + [priors_points - gt_bboxes_lt, gt_bboxes_rb - priors_points], dim=-1) + bbox_deltas = bbox_deltas.reshape([batch_size, num_gt, priors_number, -1]) + + return (bbox_deltas.min(axis=-1)[0] > eps).to(gt_bboxes.dtype) + + +def select_highest_overlaps(pos_mask: Tensor, overlaps: Tensor, + num_gt: int) -> Tuple[Tensor, Tensor, Tensor]: + """If an anchor box is assigned to multiple gts, the one with the highest + iou will be selected. + + Args: + pos_mask (Tensor): The assigned positive sample mask, + shape(batch_size, num_gt, num_priors) + overlaps (Tensor): IoU between all bbox and ground truth, + shape(batch_size, num_gt, num_priors) + num_gt (int): Number of ground truth. + Return: + gt_idx_pre_prior (Tensor): Target ground truth index, + shape(batch_size, num_priors) + fg_mask_pre_prior (Tensor): Force matching ground truth, + shape(batch_size, num_priors) + pos_mask (Tensor): The assigned positive sample mask, + shape(batch_size, num_gt, num_priors) + """ + fg_mask_pre_prior = pos_mask.sum(axis=-2) + + # Make sure the positive sample matches the only one and is the largest IoU + if fg_mask_pre_prior.max() > 1: + mask_multi_gts = (fg_mask_pre_prior.unsqueeze(1) > 1).repeat( + [1, num_gt, 1]) + index = overlaps.argmax(axis=1) + is_max_overlaps = F.one_hot(index, num_gt) + is_max_overlaps = \ + is_max_overlaps.permute(0, 2, 1).to(overlaps.dtype) + + pos_mask = torch.where(mask_multi_gts, is_max_overlaps, pos_mask) + fg_mask_pre_prior = pos_mask.sum(axis=-2) + + gt_idx_pre_prior = pos_mask.argmax(axis=-2) + return gt_idx_pre_prior, fg_mask_pre_prior, pos_mask + + +# TODO:'mmdet.BboxOverlaps2D' will cause gradient inconsistency, +# which will be found and solved in a later version. +def yolov6_iou_calculator(bbox1: Tensor, + bbox2: Tensor, + eps: float = 1e-9) -> Tensor: + """Calculate iou for batch. + + Args: + bbox1 (Tensor): shape(batch size, num_gt, 4) + bbox2 (Tensor): shape(batch size, num_priors, 4) + eps (float): Default to 1e-9. 
+ Return: + (Tensor): IoU, shape(size, num_gt, num_priors) + """ + bbox1 = bbox1.unsqueeze(2) # [N, M1, 4] -> [N, M1, 1, 4] + bbox2 = bbox2.unsqueeze(1) # [N, M2, 4] -> [N, 1, M2, 4] + + # calculate xy info of predict and gt bbox + bbox1_x1y1, bbox1_x2y2 = bbox1[:, :, :, 0:2], bbox1[:, :, :, 2:4] + bbox2_x1y1, bbox2_x2y2 = bbox2[:, :, :, 0:2], bbox2[:, :, :, 2:4] + + # calculate overlap area + overlap = (torch.minimum(bbox1_x2y2, bbox2_x2y2) - + torch.maximum(bbox1_x1y1, bbox2_x1y1)).clip(0).prod(-1) + + # calculate bbox area + bbox1_area = (bbox1_x2y2 - bbox1_x1y1).clip(0).prod(-1) + bbox2_area = (bbox2_x2y2 - bbox2_x1y1).clip(0).prod(-1) + + union = bbox1_area + bbox2_area - overlap + eps + + return overlap / union diff --git a/mmyolo/models/task_modules/coders/__init__.py b/mmyolo/models/task_modules/coders/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..75b6e7d6b30afd3de21c738dfc8e75df2eae7120 --- /dev/null +++ b/mmyolo/models/task_modules/coders/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .distance_angle_point_coder import DistanceAnglePointCoder +from .distance_point_bbox_coder import DistancePointBBoxCoder +from .yolov5_bbox_coder import YOLOv5BBoxCoder +from .yolox_bbox_coder import YOLOXBBoxCoder + +__all__ = [ + 'YOLOv5BBoxCoder', 'YOLOXBBoxCoder', 'DistancePointBBoxCoder', + 'DistanceAnglePointCoder' +] diff --git a/mmyolo/models/task_modules/coders/distance_angle_point_coder.py b/mmyolo/models/task_modules/coders/distance_angle_point_coder.py new file mode 100644 index 0000000000000000000000000000000000000000..a7e322f94725ee548c9b261be6f5bae2f3d9b4d9 --- /dev/null +++ b/mmyolo/models/task_modules/coders/distance_angle_point_coder.py @@ -0,0 +1,94 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Sequence, Union + +import torch + +from mmyolo.registry import TASK_UTILS + +try: + from mmrotate.models.task_modules.coders import \ + DistanceAnglePointCoder as MMROTATE_DistanceAnglePointCoder + MMROTATE_AVAILABLE = True +except ImportError: + from mmdet.models.task_modules.coders import BaseBBoxCoder + MMROTATE_DistanceAnglePointCoder = BaseBBoxCoder + MMROTATE_AVAILABLE = False + + +@TASK_UTILS.register_module() +class DistanceAnglePointCoder(MMROTATE_DistanceAnglePointCoder): + """Distance Angle Point BBox coder. + + This coder encodes gt bboxes (x, y, w, h, theta) into (top, bottom, left, + right, theta) and decode it back to the original. + """ + + def __init__(self, clip_border=True, angle_version='oc'): + if not MMROTATE_AVAILABLE: + raise ImportError( + 'Please run "mim install -r requirements/mmrotate.txt" ' + 'to install mmrotate first for rotated detection.') + + super().__init__(clip_border=clip_border, angle_version=angle_version) + + def decode( + self, + points: torch.Tensor, + pred_bboxes: torch.Tensor, + stride: torch.Tensor, + max_shape: Optional[Union[Sequence[int], torch.Tensor, + Sequence[Sequence[int]]]] = None, + ) -> torch.Tensor: + """Decode distance prediction to bounding box. + + Args: + points (Tensor): Shape (B, N, 2) or (N, 2). + pred_bboxes (Tensor): Distance from the given point to 4 + boundaries and angle (left, top, right, bottom, angle). + Shape (B, N, 5) or (N, 5) + max_shape (Sequence[int] or torch.Tensor or Sequence[ + Sequence[int]],optional): Maximum bounds for boxes, specifies + (H, W, C) or (H, W). If priors shape is (B, N, 4), then + the max_shape should be a Sequence[Sequence[int]], + and the length of max_shape should also be B. 
+ Default None. + Returns: + Tensor: Boxes with shape (N, 5) or (B, N, 5) + """ + assert points.size(-2) == pred_bboxes.size(-2) + assert points.size(-1) == 2 + assert pred_bboxes.size(-1) == 5 + if self.clip_border is False: + max_shape = None + + if pred_bboxes.dim() == 2: + stride = stride[:, None] + else: + stride = stride[None, :, None] + pred_bboxes[..., :4] = pred_bboxes[..., :4] * stride + + return self.distance2obb(points, pred_bboxes, max_shape, + self.angle_version) + + def encode(self, + points: torch.Tensor, + gt_bboxes: torch.Tensor, + max_dis: float = 16., + eps: float = 0.01) -> torch.Tensor: + """Encode bounding box to distances. + + Args: + points (Tensor): Shape (N, 2), The format is [x, y]. + gt_bboxes (Tensor): Shape (N, 5), The format is "xywha" + max_dis (float): Upper bound of the distance. Default None. + eps (float): a small value to ensure target < max_dis, instead <=. + Default 0.1. + + Returns: + Tensor: Box transformation deltas. The shape is (N, 5). + """ + + assert points.size(-2) == gt_bboxes.size(-2) + assert points.size(-1) == 2 + assert gt_bboxes.size(-1) == 5 + return self.obb2distance(points, gt_bboxes, max_dis, eps) diff --git a/mmyolo/models/task_modules/coders/distance_point_bbox_coder.py b/mmyolo/models/task_modules/coders/distance_point_bbox_coder.py new file mode 100644 index 0000000000000000000000000000000000000000..16417b8ab209c57880cfcfe0ba2a955e78c0a3f0 --- /dev/null +++ b/mmyolo/models/task_modules/coders/distance_point_bbox_coder.py @@ -0,0 +1,79 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Sequence, Union + +import torch +from mmdet.models.task_modules.coders import \ + DistancePointBBoxCoder as MMDET_DistancePointBBoxCoder +from mmdet.structures.bbox import bbox2distance, distance2bbox + +from mmyolo.registry import TASK_UTILS + + +@TASK_UTILS.register_module() +class DistancePointBBoxCoder(MMDET_DistancePointBBoxCoder): + """Distance Point BBox coder. + + This coder encodes gt bboxes (x1, y1, x2, y2) into (top, bottom, left, + right) and decode it back to the original. + """ + + def decode( + self, + points: torch.Tensor, + pred_bboxes: torch.Tensor, + stride: torch.Tensor, + max_shape: Optional[Union[Sequence[int], torch.Tensor, + Sequence[Sequence[int]]]] = None + ) -> torch.Tensor: + """Decode distance prediction to bounding box. + + Args: + points (Tensor): Shape (B, N, 2) or (N, 2). + pred_bboxes (Tensor): Distance from the given point to 4 + boundaries (left, top, right, bottom). Shape (B, N, 4) + or (N, 4) + stride (Tensor): Featmap stride. + max_shape (Sequence[int] or torch.Tensor or Sequence[ + Sequence[int]],optional): Maximum bounds for boxes, specifies + (H, W, C) or (H, W). If priors shape is (B, N, 4), then + the max_shape should be a Sequence[Sequence[int]], + and the length of max_shape should also be B. + Default None. + Returns: + Tensor: Boxes with shape (N, 4) or (B, N, 4) + """ + assert points.size(-2) == pred_bboxes.size(-2) + assert points.size(-1) == 2 + assert pred_bboxes.size(-1) == 4 + if self.clip_border is False: + max_shape = None + + pred_bboxes = pred_bboxes * stride[None, :, None] + + return distance2bbox(points, pred_bboxes, max_shape) + + def encode(self, + points: torch.Tensor, + gt_bboxes: torch.Tensor, + max_dis: float = 16., + eps: float = 0.01) -> torch.Tensor: + """Encode bounding box to distances. The rewrite is to support batch + operations. + + Args: + points (Tensor): Shape (B, N, 2) or (N, 2), The format is [x, y]. 
+ gt_bboxes (Tensor or :obj:`BaseBoxes`): Shape (N, 4), The format + is "xyxy" + max_dis (float): Upper bound of the distance. Default to 16.. + eps (float): a small value to ensure target < max_dis, instead <=. + Default 0.01. + + Returns: + Tensor: Box transformation deltas. The shape is (N, 4) or + (B, N, 4). + """ + + assert points.size(-2) == gt_bboxes.size(-2) + assert points.size(-1) == 2 + assert gt_bboxes.size(-1) == 4 + return bbox2distance(points, gt_bboxes, max_dis, eps) diff --git a/mmyolo/models/task_modules/coders/yolov5_bbox_coder.py b/mmyolo/models/task_modules/coders/yolov5_bbox_coder.py new file mode 100644 index 0000000000000000000000000000000000000000..bab5f0e0fe06c1930497bdece7c7a06636fe9c37 --- /dev/null +++ b/mmyolo/models/task_modules/coders/yolov5_bbox_coder.py @@ -0,0 +1,55 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Union + +import torch +from mmdet.models.task_modules.coders.base_bbox_coder import BaseBBoxCoder + +from mmyolo.registry import TASK_UTILS + + +@TASK_UTILS.register_module() +class YOLOv5BBoxCoder(BaseBBoxCoder): + """YOLOv5 BBox coder. + + This decoder decodes pred bboxes (delta_x, delta_x, w, h) to bboxes (tl_x, + tl_y, br_x, br_y). + """ + + def encode(self, **kwargs): + """Encode deltas between bboxes and ground truth boxes.""" + pass + + def decode(self, priors: torch.Tensor, pred_bboxes: torch.Tensor, + stride: Union[torch.Tensor, int]) -> torch.Tensor: + """Decode regression results (delta_x, delta_x, w, h) to bboxes (tl_x, + tl_y, br_x, br_y). + + Args: + priors (torch.Tensor): Basic boxes or points, e.g. anchors. + pred_bboxes (torch.Tensor): Encoded boxes with shape + stride (torch.Tensor | int): Strides of bboxes. + + Returns: + torch.Tensor: Decoded boxes. + """ + assert pred_bboxes.size(-1) == priors.size(-1) == 4 + + pred_bboxes = pred_bboxes.sigmoid() + + x_center = (priors[..., 0] + priors[..., 2]) * 0.5 + y_center = (priors[..., 1] + priors[..., 3]) * 0.5 + w = priors[..., 2] - priors[..., 0] + h = priors[..., 3] - priors[..., 1] + + # The anchor of mmdet has been offset by 0.5 + x_center_pred = (pred_bboxes[..., 0] - 0.5) * 2 * stride + x_center + y_center_pred = (pred_bboxes[..., 1] - 0.5) * 2 * stride + y_center + w_pred = (pred_bboxes[..., 2] * 2)**2 * w + h_pred = (pred_bboxes[..., 3] * 2)**2 * h + + decoded_bboxes = torch.stack( + (x_center_pred - w_pred / 2, y_center_pred - h_pred / 2, + x_center_pred + w_pred / 2, y_center_pred + h_pred / 2), + dim=-1) + + return decoded_bboxes diff --git a/mmyolo/models/task_modules/coders/yolox_bbox_coder.py b/mmyolo/models/task_modules/coders/yolox_bbox_coder.py new file mode 100644 index 0000000000000000000000000000000000000000..02c898d814e89e5c8ef4db792831a7ba80c7c0cc --- /dev/null +++ b/mmyolo/models/task_modules/coders/yolox_bbox_coder.py @@ -0,0 +1,45 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Union + +import torch +from mmdet.models.task_modules.coders.base_bbox_coder import BaseBBoxCoder + +from mmyolo.registry import TASK_UTILS + + +@TASK_UTILS.register_module() +class YOLOXBBoxCoder(BaseBBoxCoder): + """YOLOX BBox coder. + + This decoder decodes pred bboxes (delta_x, delta_x, w, h) to bboxes (tl_x, + tl_y, br_x, br_y). 
+ """ + + def encode(self, **kwargs): + """Encode deltas between bboxes and ground truth boxes.""" + pass + + def decode(self, priors: torch.Tensor, pred_bboxes: torch.Tensor, + stride: Union[torch.Tensor, int]) -> torch.Tensor: + """Decode regression results (delta_x, delta_x, w, h) to bboxes (tl_x, + tl_y, br_x, br_y). + + Args: + priors (torch.Tensor): Basic boxes or points, e.g. anchors. + pred_bboxes (torch.Tensor): Encoded boxes with shape + stride (torch.Tensor | int): Strides of bboxes. + + Returns: + torch.Tensor: Decoded boxes. + """ + stride = stride[None, :, None] + xys = (pred_bboxes[..., :2] * stride) + priors + whs = pred_bboxes[..., 2:].exp() * stride + + tl_x = (xys[..., 0] - whs[..., 0] / 2) + tl_y = (xys[..., 1] - whs[..., 1] / 2) + br_x = (xys[..., 0] + whs[..., 0] / 2) + br_y = (xys[..., 1] + whs[..., 1] / 2) + + decoded_bboxes = torch.stack([tl_x, tl_y, br_x, br_y], -1) + return decoded_bboxes diff --git a/mmyolo/models/utils/__init__.py b/mmyolo/models/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..cdfeaaf0f206fd62dda27cbf44f519777da56ea8 --- /dev/null +++ b/mmyolo/models/utils/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .misc import gt_instances_preprocess, make_divisible, make_round + +__all__ = ['make_divisible', 'make_round', 'gt_instances_preprocess'] diff --git a/mmyolo/models/utils/misc.py b/mmyolo/models/utils/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..531558b69bc14141fb6299aea17b54b432fd5f59 --- /dev/null +++ b/mmyolo/models/utils/misc.py @@ -0,0 +1,97 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from typing import Sequence, Union + +import torch +from mmdet.structures.bbox.transforms import get_box_tensor +from torch import Tensor + + +def make_divisible(x: float, + widen_factor: float = 1.0, + divisor: int = 8) -> int: + """Make sure that x*widen_factor is divisible by divisor.""" + return math.ceil(x * widen_factor / divisor) * divisor + + +def make_round(x: float, deepen_factor: float = 1.0) -> int: + """Make sure that x*deepen_factor becomes an integer not less than 1.""" + return max(round(x * deepen_factor), 1) if x > 1 else x + + +def gt_instances_preprocess(batch_gt_instances: Union[Tensor, Sequence], + batch_size: int) -> Tensor: + """Split batch_gt_instances with batch size. + + From [all_gt_bboxes, box_dim+2] to [batch_size, number_gt, box_dim+1]. + For horizontal box, box_dim=4, for rotated box, box_dim=5 + + If some shape of single batch smaller than + gt bbox len, then using zeros to fill. + + Args: + batch_gt_instances (Sequence[Tensor]): Ground truth + instances for whole batch, shape [all_gt_bboxes, box_dim+2] + batch_size (int): Batch size. 
+ + Returns: + Tensor: batch gt instances data, shape + [batch_size, number_gt, box_dim+1] + """ + if isinstance(batch_gt_instances, Sequence): + max_gt_bbox_len = max( + [len(gt_instances) for gt_instances in batch_gt_instances]) + # fill zeros with length box_dim+1 if some shape of + # single batch not equal max_gt_bbox_len + batch_instance_list = [] + for index, gt_instance in enumerate(batch_gt_instances): + bboxes = gt_instance.bboxes + labels = gt_instance.labels + box_dim = get_box_tensor(bboxes).size(-1) + batch_instance_list.append( + torch.cat((labels[:, None], bboxes), dim=-1)) + + if bboxes.shape[0] >= max_gt_bbox_len: + continue + + fill_tensor = bboxes.new_full( + [max_gt_bbox_len - bboxes.shape[0], box_dim + 1], 0) + batch_instance_list[index] = torch.cat( + (batch_instance_list[index], fill_tensor), dim=0) + + return torch.stack(batch_instance_list) + else: + # faster version + # format of batch_gt_instances: [img_ind, cls_ind, (box)] + # For example horizontal box should be: + # [img_ind, cls_ind, x1, y1, x2, y2] + # Rotated box should be + # [img_ind, cls_ind, x, y, w, h, a] + + # sqlit batch gt instance [all_gt_bboxes, box_dim+2] -> + # [batch_size, max_gt_bbox_len, box_dim+1] + assert isinstance(batch_gt_instances, Tensor) + box_dim = batch_gt_instances.size(-1) - 2 + if len(batch_gt_instances) > 0: + gt_images_indexes = batch_gt_instances[:, 0] + max_gt_bbox_len = gt_images_indexes.unique( + return_counts=True)[1].max() + # fill zeros with length box_dim+1 if some shape of + # single batch not equal max_gt_bbox_len + batch_instance = torch.zeros( + (batch_size, max_gt_bbox_len, box_dim + 1), + dtype=batch_gt_instances.dtype, + device=batch_gt_instances.device) + + for i in range(batch_size): + match_indexes = gt_images_indexes == i + gt_num = match_indexes.sum() + if gt_num: + batch_instance[i, :gt_num] = batch_gt_instances[ + match_indexes, 1:] + else: + batch_instance = torch.zeros((batch_size, 0, box_dim + 1), + dtype=batch_gt_instances.dtype, + device=batch_gt_instances.device) + + return batch_instance diff --git a/mmyolo/registry.py b/mmyolo/registry.py new file mode 100644 index 0000000000000000000000000000000000000000..71f43e6cf53d92917b7aea6175ae0540613ff720 --- /dev/null +++ b/mmyolo/registry.py @@ -0,0 +1,103 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""MMYOLO provides 17 registry nodes to support using modules across projects. +Each node is a child of the root registry in MMEngine. + +More details can be found at +https://mmengine.readthedocs.io/en/latest/tutorials/registry.html. 
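+
+A minimal, illustrative sketch of the usual pattern (the class name below is
+hypothetical and not part of MMYOLO): a module is registered on a node and
+later built from a config dict::
+
+    from mmyolo.registry import MODELS
+
+    @MODELS.register_module()
+    class MyToyNeck:
+        def __init__(self, scale: float = 1.0):
+            self.scale = scale
+
+    neck = MODELS.build(dict(type='MyToyNeck', scale=0.5))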
+""" + +from mmengine.registry import DATA_SAMPLERS as MMENGINE_DATA_SAMPLERS +from mmengine.registry import DATASETS as MMENGINE_DATASETS +from mmengine.registry import HOOKS as MMENGINE_HOOKS +from mmengine.registry import LOOPS as MMENGINE_LOOPS +from mmengine.registry import METRICS as MMENGINE_METRICS +from mmengine.registry import MODEL_WRAPPERS as MMENGINE_MODEL_WRAPPERS +from mmengine.registry import MODELS as MMENGINE_MODELS +from mmengine.registry import \ + OPTIM_WRAPPER_CONSTRUCTORS as MMENGINE_OPTIM_WRAPPER_CONSTRUCTORS +from mmengine.registry import OPTIM_WRAPPERS as MMENGINE_OPTIM_WRAPPERS +from mmengine.registry import OPTIMIZERS as MMENGINE_OPTIMIZERS +from mmengine.registry import PARAM_SCHEDULERS as MMENGINE_PARAM_SCHEDULERS +from mmengine.registry import \ + RUNNER_CONSTRUCTORS as MMENGINE_RUNNER_CONSTRUCTORS +from mmengine.registry import RUNNERS as MMENGINE_RUNNERS +from mmengine.registry import TASK_UTILS as MMENGINE_TASK_UTILS +from mmengine.registry import TRANSFORMS as MMENGINE_TRANSFORMS +from mmengine.registry import VISBACKENDS as MMENGINE_VISBACKENDS +from mmengine.registry import VISUALIZERS as MMENGINE_VISUALIZERS +from mmengine.registry import \ + WEIGHT_INITIALIZERS as MMENGINE_WEIGHT_INITIALIZERS +from mmengine.registry import Registry + +# manage all kinds of runners like `EpochBasedRunner` and `IterBasedRunner` +RUNNERS = Registry( + 'runner', parent=MMENGINE_RUNNERS, locations=['mmyolo.engine']) +# manage runner constructors that define how to initialize runners +RUNNER_CONSTRUCTORS = Registry( + 'runner constructor', + parent=MMENGINE_RUNNER_CONSTRUCTORS, + locations=['mmyolo.engine']) +# manage all kinds of loops like `EpochBasedTrainLoop` +LOOPS = Registry('loop', parent=MMENGINE_LOOPS, locations=['mmyolo.engine']) +# manage all kinds of hooks like `CheckpointHook` +HOOKS = Registry( + 'hook', parent=MMENGINE_HOOKS, locations=['mmyolo.engine.hooks']) + +# manage data-related modules +DATASETS = Registry( + 'dataset', parent=MMENGINE_DATASETS, locations=['mmyolo.datasets']) +DATA_SAMPLERS = Registry( + 'data sampler', + parent=MMENGINE_DATA_SAMPLERS, + locations=['mmyolo.datasets']) +TRANSFORMS = Registry( + 'transform', + parent=MMENGINE_TRANSFORMS, + locations=['mmyolo.datasets.transforms']) + +# manage all kinds of modules inheriting `nn.Module` +MODELS = Registry('model', parent=MMENGINE_MODELS, locations=['mmyolo.models']) +# manage all kinds of model wrappers like 'MMDistributedDataParallel' +MODEL_WRAPPERS = Registry( + 'model_wrapper', + parent=MMENGINE_MODEL_WRAPPERS, + locations=['mmyolo.models']) +# manage all kinds of weight initialization modules like `Uniform` +WEIGHT_INITIALIZERS = Registry( + 'weight initializer', + parent=MMENGINE_WEIGHT_INITIALIZERS, + locations=['mmyolo.models']) + +# manage all kinds of optimizers like `SGD` and `Adam` +OPTIMIZERS = Registry( + 'optimizer', + parent=MMENGINE_OPTIMIZERS, + locations=['mmyolo.engine.optimizers']) +OPTIM_WRAPPERS = Registry( + 'optim_wrapper', + parent=MMENGINE_OPTIM_WRAPPERS, + locations=['mmyolo.engine.optimizers']) +# manage constructors that customize the optimization hyperparameters. 
+OPTIM_WRAPPER_CONSTRUCTORS = Registry( + 'optimizer constructor', + parent=MMENGINE_OPTIM_WRAPPER_CONSTRUCTORS, + locations=['mmyolo.engine.optimizers']) +# manage all kinds of parameter schedulers like `MultiStepLR` +PARAM_SCHEDULERS = Registry( + 'parameter scheduler', + parent=MMENGINE_PARAM_SCHEDULERS, + locations=['mmyolo.engine.optimizers']) +# manage all kinds of metrics +METRICS = Registry( + 'metric', parent=MMENGINE_METRICS, locations=['mmyolo.engine']) + +# manage task-specific modules like anchor generators and box coders +TASK_UTILS = Registry( + 'task util', parent=MMENGINE_TASK_UTILS, locations=['mmyolo.models']) + +# manage visualizer +VISUALIZERS = Registry( + 'visualizer', parent=MMENGINE_VISUALIZERS, locations=['mmyolo.utils']) +# manage visualizer backend +VISBACKENDS = Registry( + 'vis_backend', parent=MMENGINE_VISBACKENDS, locations=['mmyolo.utils']) diff --git a/mmyolo/testing/__init__.py b/mmyolo/testing/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b6d7a010ee27b2822d44ad099f46f65bf6f0c00a --- /dev/null +++ b/mmyolo/testing/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from ._utils import get_detector_cfg + +__all__ = ['get_detector_cfg'] diff --git a/mmyolo/testing/_utils.py b/mmyolo/testing/_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..9ccf2fe0cfd7baa3aeb7f3793c3db025d8889d5f --- /dev/null +++ b/mmyolo/testing/_utils.py @@ -0,0 +1,53 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from os.path import dirname, exists, join + +import numpy as np +from mmengine.config import Config + + +def _get_config_directory(): + """Find the predefined detector config directory.""" + try: + # Assume we are running in the source mmyolo repo + repo_dpath = dirname(dirname(dirname(__file__))) + except NameError: + # For IPython development when this __file__ is not defined + import mmyolo + repo_dpath = dirname(dirname(mmyolo.__file__)) + config_dpath = join(repo_dpath, 'configs') + if not exists(config_dpath): + raise Exception('Cannot find config path') + return config_dpath + + +def _get_config_module(fname): + """Load a configuration as a python module.""" + config_dpath = _get_config_directory() + config_fpath = join(config_dpath, fname) + config_mod = Config.fromfile(config_fpath) + return config_mod + + +def get_detector_cfg(fname): + """Grab configs necessary to create a detector. + + These are deep copied to allow for safe modification of parameters without + influencing other tests. + """ + config = _get_config_module(fname) + model = copy.deepcopy(config.model) + return model + + +def _rand_bboxes(rng, num_boxes, w, h): + """Randomly generate a specified number of bboxes.""" + cx, cy, bw, bh = rng.rand(num_boxes, 4).T + + tl_x = ((cx * w) - (w * bw / 2)).clip(0, w) + tl_y = ((cy * h) - (h * bh / 2)).clip(0, h) + br_x = ((cx * w) + (w * bw / 2)).clip(0, w) + br_y = ((cy * h) + (h * bh / 2)).clip(0, h) + + bboxes = np.vstack([tl_x, tl_y, br_x, br_y]).T + return bboxes diff --git a/mmyolo/utils/__init__.py b/mmyolo/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f4e968494892ccefb60d0c7b713c131ddc6fb869 --- /dev/null +++ b/mmyolo/utils/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .collect_env import collect_env +from .misc import is_metainfo_lower, switch_to_deploy +from .setup_env import register_all_modules + +__all__ = [ + 'register_all_modules', 'collect_env', 'switch_to_deploy', + 'is_metainfo_lower' +] diff --git a/mmyolo/utils/boxam_utils.py b/mmyolo/utils/boxam_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..4a46f21c1b5b40e7bc106ae7a15281816ae3efcc --- /dev/null +++ b/mmyolo/utils/boxam_utils.py @@ -0,0 +1,512 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import bisect +import copy +import warnings +from pathlib import Path +from typing import Callable, List, Optional, Tuple, Union + +import cv2 +import numpy as np +import torch +import torch.nn as nn +import torchvision +from mmcv.transforms import Compose +from mmdet.evaluation import get_classes +from mmdet.utils import ConfigType +from mmengine.config import Config +from mmengine.registry import init_default_scope +from mmengine.runner import load_checkpoint +from mmengine.structures import InstanceData +from torch import Tensor + +from mmyolo.registry import MODELS + +try: + from pytorch_grad_cam import (AblationCAM, AblationLayer, + ActivationsAndGradients) + from pytorch_grad_cam import GradCAM as Base_GradCAM + from pytorch_grad_cam import GradCAMPlusPlus as Base_GradCAMPlusPlus + from pytorch_grad_cam.base_cam import BaseCAM + from pytorch_grad_cam.utils.image import scale_cam_image, show_cam_on_image + from pytorch_grad_cam.utils.svd_on_activations import get_2d_projection +except ImportError: + pass + + +def init_detector( + config: Union[str, Path, Config], + checkpoint: Optional[str] = None, + palette: str = 'coco', + device: str = 'cuda:0', + cfg_options: Optional[dict] = None, +) -> nn.Module: + """Initialize a detector from config file. + + Args: + config (str, :obj:`Path`, or :obj:`mmengine.Config`): Config file path, + :obj:`Path`, or the config object. + checkpoint (str, optional): Checkpoint path. If left as None, the model + will not load any weights. + palette (str): Color palette used for visualization. If palette + is stored in checkpoint, use checkpoint's palette first, otherwise + use externally passed palette. Currently, supports 'coco', 'voc', + 'citys' and 'random'. Defaults to coco. + device (str): The device where the anchors will be put on. + Defaults to cuda:0. + cfg_options (dict, optional): Options to override some settings in + the used config. + + Returns: + nn.Module: The constructed detector. + """ + if isinstance(config, (str, Path)): + config = Config.fromfile(config) + elif not isinstance(config, Config): + raise TypeError('config must be a filename or Config object, ' + f'but got {type(config)}') + if cfg_options is not None: + config.merge_from_dict(cfg_options) + elif 'init_cfg' in config.model.backbone: + config.model.backbone.init_cfg = None + + # only change this + # grad based method requires train_cfg + # config.model.train_cfg = None + init_default_scope(config.get('default_scope', 'mmyolo')) + + model = MODELS.build(config.model) + if checkpoint is not None: + checkpoint = load_checkpoint(model, checkpoint, map_location='cpu') + # Weights converted from elsewhere may not have meta fields. 
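+        # Fall back to an empty dict so the branches below can pick the
+        # dataset meta from an mmdet 3.x checkpoint, from a legacy
+        # 'CLASSES' field, or default to COCO classes as a last resort.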
+ checkpoint_meta = checkpoint.get('meta', {}) + # save the dataset_meta in the model for convenience + if 'dataset_meta' in checkpoint_meta: + # mmdet 3.x, all keys should be lowercase + model.dataset_meta = { + k.lower(): v + for k, v in checkpoint_meta['dataset_meta'].items() + } + elif 'CLASSES' in checkpoint_meta: + # < mmdet 3.x + classes = checkpoint_meta['CLASSES'] + model.dataset_meta = {'classes': classes, 'palette': palette} + else: + warnings.simplefilter('once') + warnings.warn( + 'dataset_meta or class names are not saved in the ' + 'checkpoint\'s meta data, use COCO classes by default.') + model.dataset_meta = { + 'classes': get_classes('coco'), + 'palette': palette + } + + model.cfg = config # save the config in the model for convenience + model.to(device) + model.eval() + return model + + +def reshape_transform(feats: Union[Tensor, List[Tensor]], + max_shape: Tuple[int, int] = (20, 20), + is_need_grad: bool = False): + """Reshape and aggregate feature maps when the input is a multi-layer + feature map. + + Takes these tensors with different sizes, resizes them to a common shape, + and concatenates them. + """ + if len(max_shape) == 1: + max_shape = max_shape * 2 + + if isinstance(feats, torch.Tensor): + feats = [feats] + else: + if is_need_grad: + raise NotImplementedError('The `grad_base` method does not ' + 'support output multi-activation layers') + + max_h = max([im.shape[-2] for im in feats]) + max_w = max([im.shape[-1] for im in feats]) + if -1 in max_shape: + max_shape = (max_h, max_w) + else: + max_shape = (min(max_h, max_shape[0]), min(max_w, max_shape[1])) + + activations = [] + for feat in feats: + activations.append( + torch.nn.functional.interpolate( + torch.abs(feat), max_shape, mode='bilinear')) + + activations = torch.cat(activations, axis=1) + return activations + + +class BoxAMDetectorWrapper(nn.Module): + """Wrap the mmdet model class to facilitate handling of non-tensor + situations during inference.""" + + def __init__(self, + cfg: ConfigType, + checkpoint: str, + score_thr: float, + device: str = 'cuda:0'): + super().__init__() + self.cfg = cfg + self.device = device + self.score_thr = score_thr + self.checkpoint = checkpoint + self.detector = init_detector(self.cfg, self.checkpoint, device=device) + + pipeline_cfg = copy.deepcopy(self.cfg.test_dataloader.dataset.pipeline) + pipeline_cfg[0].type = 'mmdet.LoadImageFromNDArray' + + new_test_pipeline = [] + for pipeline in pipeline_cfg: + if not pipeline['type'].endswith('LoadAnnotations'): + new_test_pipeline.append(pipeline) + self.test_pipeline = Compose(new_test_pipeline) + + self.is_need_loss = False + self.input_data = None + self.image = None + + def need_loss(self, is_need_loss: bool): + """Grad-based methods require loss.""" + self.is_need_loss = is_need_loss + + def set_input_data(self, + image: np.ndarray, + pred_instances: Optional[InstanceData] = None): + """Set the input data to be used in the next step.""" + self.image = image + + if self.is_need_loss: + assert pred_instances is not None + pred_instances = pred_instances.numpy() + data = dict( + img=self.image, + img_id=0, + gt_bboxes=pred_instances.bboxes, + gt_bboxes_labels=pred_instances.labels) + data = self.test_pipeline(data) + else: + data = dict(img=self.image, img_id=0) + data = self.test_pipeline(data) + data['inputs'] = [data['inputs']] + data['data_samples'] = [data['data_samples']] + self.input_data = data + + def __call__(self, *args, **kwargs): + assert self.input_data is not None + if self.is_need_loss: + # Maybe this is a 
direction that can be optimized + # self.detector.init_weights() + + self.detector.bbox_head.head_module.training = True + if hasattr(self.detector.bbox_head, 'featmap_sizes'): + # Prevent the model algorithm error when calculating loss + self.detector.bbox_head.featmap_sizes = None + + data_ = {} + data_['inputs'] = [self.input_data['inputs']] + data_['data_samples'] = [self.input_data['data_samples']] + data = self.detector.data_preprocessor(data_, training=False) + loss = self.detector._run_forward(data, mode='loss') + + if hasattr(self.detector.bbox_head, 'featmap_sizes'): + self.detector.bbox_head.featmap_sizes = None + + return [loss] + else: + self.detector.bbox_head.head_module.training = False + with torch.no_grad(): + results = self.detector.test_step(self.input_data) + return results + + +class BoxAMDetectorVisualizer: + """Box AM visualization class.""" + + def __init__(self, + method_class, + model: nn.Module, + target_layers: List, + reshape_transform: Optional[Callable] = None, + is_need_grad: bool = False, + extra_params: Optional[dict] = None): + self.target_layers = target_layers + self.reshape_transform = reshape_transform + self.is_need_grad = is_need_grad + + if method_class.__name__ == 'AblationCAM': + batch_size = extra_params.get('batch_size', 1) + ratio_channels_to_ablate = extra_params.get( + 'ratio_channels_to_ablate', 1.) + self.cam = AblationCAM( + model, + target_layers, + use_cuda=True if 'cuda' in model.device else False, + reshape_transform=reshape_transform, + batch_size=batch_size, + ablation_layer=extra_params['ablation_layer'], + ratio_channels_to_ablate=ratio_channels_to_ablate) + else: + self.cam = method_class( + model, + target_layers, + use_cuda=True if 'cuda' in model.device else False, + reshape_transform=reshape_transform, + ) + if self.is_need_grad: + self.cam.activations_and_grads.release() + + self.classes = model.detector.dataset_meta['classes'] + self.COLORS = np.random.uniform(0, 255, size=(len(self.classes), 3)) + + def switch_activations_and_grads(self, model) -> None: + """In the grad-based method, we need to switch + ``ActivationsAndGradients`` layer, otherwise an error will occur.""" + self.cam.model = model + + if self.is_need_grad is True: + self.cam.activations_and_grads = ActivationsAndGradients( + model, self.target_layers, self.reshape_transform) + self.is_need_grad = False + else: + self.cam.activations_and_grads.release() + self.is_need_grad = True + + def __call__(self, img, targets, aug_smooth=False, eigen_smooth=False): + img = torch.from_numpy(img)[None].permute(0, 3, 1, 2) + return self.cam(img, targets, aug_smooth, eigen_smooth)[0, :] + + def show_am(self, + image: np.ndarray, + pred_instance: InstanceData, + grayscale_am: np.ndarray, + with_norm_in_bboxes: bool = False): + """Normalize the AM to be in the range [0, 1] inside every bounding + boxes, and zero outside of the bounding boxes.""" + + boxes = pred_instance.bboxes + labels = pred_instance.labels + + if with_norm_in_bboxes is True: + boxes = boxes.astype(np.int32) + renormalized_am = np.zeros(grayscale_am.shape, dtype=np.float32) + images = [] + for x1, y1, x2, y2 in boxes: + img = renormalized_am * 0 + img[y1:y2, x1:x2] = scale_cam_image( + [grayscale_am[y1:y2, x1:x2].copy()])[0] + images.append(img) + + renormalized_am = np.max(np.float32(images), axis=0) + renormalized_am = scale_cam_image([renormalized_am])[0] + else: + renormalized_am = grayscale_am + + am_image_renormalized = show_cam_on_image( + image / 255, renormalized_am, use_rgb=False) + + 
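+        # Draw the predicted boxes (and scores, if available) on top of the
+        # blended activation-map image before returning it.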
image_with_bounding_boxes = self._draw_boxes( + boxes, labels, am_image_renormalized, pred_instance.get('scores')) + return image_with_bounding_boxes + + def _draw_boxes(self, + boxes: List, + labels: List, + image: np.ndarray, + scores: Optional[List] = None): + """draw boxes on image.""" + for i, box in enumerate(boxes): + label = labels[i] + color = self.COLORS[label] + cv2.rectangle(image, (int(box[0]), int(box[1])), + (int(box[2]), int(box[3])), color, 2) + if scores is not None: + score = scores[i] + text = str(self.classes[label]) + ': ' + str( + round(score * 100, 1)) + else: + text = self.classes[label] + + cv2.putText( + image, + text, (int(box[0]), int(box[1] - 5)), + cv2.FONT_HERSHEY_SIMPLEX, + 0.5, + color, + 1, + lineType=cv2.LINE_AA) + return image + + +class DetAblationLayer(AblationLayer): + """Det AblationLayer.""" + + def __init__(self): + super().__init__() + self.activations = None + + def set_next_batch(self, input_batch_index, activations, + num_channels_to_ablate): + """Extract the next batch member from activations, and repeat it + num_channels_to_ablate times.""" + if isinstance(activations, torch.Tensor): + return super().set_next_batch(input_batch_index, activations, + num_channels_to_ablate) + + self.activations = [] + for activation in activations: + activation = activation[ + input_batch_index, :, :, :].clone().unsqueeze(0) + self.activations.append( + activation.repeat(num_channels_to_ablate, 1, 1, 1)) + + def __call__(self, x): + """Go over the activation indices to be ablated, stored in + self.indices.""" + result = self.activations + + if isinstance(result, torch.Tensor): + return super().__call__(x) + + channel_cumsum = np.cumsum([r.shape[1] for r in result]) + num_channels_to_ablate = result[0].size(0) # batch + for i in range(num_channels_to_ablate): + pyramid_layer = bisect.bisect_right(channel_cumsum, + self.indices[i]) + if pyramid_layer > 0: + index_in_pyramid_layer = self.indices[i] - channel_cumsum[ + pyramid_layer - 1] + else: + index_in_pyramid_layer = self.indices[i] + result[pyramid_layer][i, index_in_pyramid_layer, :, :] = -1000 + return result + + +class DetBoxScoreTarget: + """Det Score calculation class. + + In the case of the grad-free method, the calculation method is that + for every original detected bounding box specified in "bboxes", + assign a score on how the current bounding boxes match it, + + 1. In Bbox IoU + 2. In the classification score. + 3. In Mask IoU if ``segms`` exist. + + If there is not a large enough overlap, or the category changed, + assign a score of 0. The total score is the sum of all the box scores. + + In the case of the grad-based method, the calculation method is + the sum of losses after excluding a specific key. 
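+
+    For instance (illustrative only), passing
+    ``ignore_loss_params=['loss_obj']`` would exclude the objectness loss
+    from the summed score while keeping the remaining loss terms.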
+ """ + + def __init__(self, + pred_instance: InstanceData, + match_iou_thr: float = 0.5, + device: str = 'cuda:0', + ignore_loss_params: Optional[List] = None): + self.focal_bboxes = pred_instance.bboxes + self.focal_labels = pred_instance.labels + self.match_iou_thr = match_iou_thr + self.device = device + self.ignore_loss_params = ignore_loss_params + if ignore_loss_params is not None: + assert isinstance(self.ignore_loss_params, list) + + def __call__(self, results): + output = torch.tensor([0.], device=self.device) + + if 'loss_cls' in results: + # grad-based method + # results is dict + for loss_key, loss_value in results.items(): + if 'loss' not in loss_key or \ + loss_key in self.ignore_loss_params: + continue + if isinstance(loss_value, list): + output += sum(loss_value) + else: + output += loss_value + return output + else: + # grad-free method + # results is DetDataSample + pred_instances = results.pred_instances + if len(pred_instances) == 0: + return output + + pred_bboxes = pred_instances.bboxes + pred_scores = pred_instances.scores + pred_labels = pred_instances.labels + + for focal_box, focal_label in zip(self.focal_bboxes, + self.focal_labels): + ious = torchvision.ops.box_iou(focal_box[None], + pred_bboxes[..., :4]) + index = ious.argmax() + if ious[0, index] > self.match_iou_thr and pred_labels[ + index] == focal_label: + # TODO: Adaptive adjustment of weights based on algorithms + score = ious[0, index] + pred_scores[index] + output = output + score + return output + + +class SpatialBaseCAM(BaseCAM): + """CAM that maintains spatial information. + + Gradients are often averaged over the spatial dimension in CAM + visualization for classification, but this is unreasonable in detection + tasks. There is no need to average the gradients in the detection task. + """ + + def get_cam_image(self, + input_tensor: torch.Tensor, + target_layer: torch.nn.Module, + targets: List[torch.nn.Module], + activations: torch.Tensor, + grads: torch.Tensor, + eigen_smooth: bool = False) -> np.ndarray: + + weights = self.get_cam_weights(input_tensor, target_layer, targets, + activations, grads) + weighted_activations = weights * activations + if eigen_smooth: + cam = get_2d_projection(weighted_activations) + else: + cam = weighted_activations.sum(axis=1) + return cam + + +class GradCAM(SpatialBaseCAM, Base_GradCAM): + """Gradients are no longer averaged over the spatial dimension.""" + + def get_cam_weights(self, input_tensor, target_layer, target_category, + activations, grads): + return grads + + +class GradCAMPlusPlus(SpatialBaseCAM, Base_GradCAMPlusPlus): + """Gradients are no longer averaged over the spatial dimension.""" + + def get_cam_weights(self, input_tensor, target_layers, target_category, + activations, grads): + grads_power_2 = grads**2 + grads_power_3 = grads_power_2 * grads + # Equation 19 in https://arxiv.org/abs/1710.11063 + sum_activations = np.sum(activations, axis=(2, 3)) + eps = 0.000001 + aij = grads_power_2 / ( + 2 * grads_power_2 + + sum_activations[:, :, None, None] * grads_power_3 + eps) + # Now bring back the ReLU from eq.7 in the paper, + # And zero out aijs where the activations are 0 + aij = np.where(grads != 0, aij, 0) + + weights = np.maximum(grads, 0) * aij + return weights diff --git a/mmyolo/utils/collect_env.py b/mmyolo/utils/collect_env.py new file mode 100644 index 0000000000000000000000000000000000000000..89bad658cb7d4f1b602690d8d888a309166283ee --- /dev/null +++ b/mmyolo/utils/collect_env.py @@ -0,0 +1,21 @@ +# Copyright (c) OpenMMLab. 
All rights reserved. +import mmcv +import mmdet +from mmengine.utils import get_git_hash +from mmengine.utils.dl_utils import collect_env as collect_base_env + +import mmyolo + + +def collect_env() -> dict: + """Collect the information of the running environments.""" + env_info = collect_base_env() + env_info['MMCV'] = mmcv.__version__ + env_info['MMDetection'] = mmdet.__version__ + env_info['MMYOLO'] = mmyolo.__version__ + '+' + get_git_hash()[:7] + return env_info + + +if __name__ == '__main__': + for name, val in collect_env().items(): + print(f'{name}: {val}') diff --git a/mmyolo/utils/labelme_utils.py b/mmyolo/utils/labelme_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..0981919771a617ca79b29c3ddf96ea14c82fccc6 --- /dev/null +++ b/mmyolo/utils/labelme_utils.py @@ -0,0 +1,92 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import json +import os.path + +from mmengine.structures import InstanceData + + +class LabelmeFormat: + """Predict results save into labelme file. + + Base on https://github.com/wkentaro/labelme/blob/main/labelme/label_file.py + + Args: + classes (tuple): Model classes name. + """ + + def __init__(self, classes: tuple): + super().__init__() + self.classes = classes + + def __call__(self, pred_instances: InstanceData, metainfo: dict, + output_path: str, selected_classes: list): + """Get image data field for labelme. + + Args: + pred_instances (InstanceData): Candidate prediction info. + metainfo (dict): Meta info of prediction. + output_path (str): Image file path. + selected_classes (list): Selected class name. + + Labelme file eg. + { + "version": "5.1.1", + "flags": {}, + "imagePath": "/data/cat/1.jpg", + "imageData": null, + "imageHeight": 3000, + "imageWidth": 4000, + "shapes": [ + { + "label": "cat", + "points": [ + [ + 1148.076923076923, + 1188.4615384615383 + ], + [ + 2471.1538461538457, + 2176.923076923077 + ] + ], + "group_id": null, + "shape_type": "rectangle", + "flags": {} + }, + {...} + ] + } + """ + + image_path = os.path.abspath(metainfo['img_path']) + + json_info = { + 'version': '5.1.1', + 'flags': {}, + 'imagePath': image_path, + 'imageData': None, + 'imageHeight': metainfo['ori_shape'][0], + 'imageWidth': metainfo['ori_shape'][1], + 'shapes': [] + } + + for pred_instance in pred_instances: + pred_bbox = pred_instance.bboxes.cpu().numpy().tolist()[0] + pred_label = self.classes[pred_instance.labels] + + if selected_classes is not None and \ + pred_label not in selected_classes: + # filter class name + continue + + sub_dict = { + 'label': pred_label, + 'points': [pred_bbox[:2], pred_bbox[2:]], + 'group_id': None, + 'shape_type': 'rectangle', + 'flags': {} + } + json_info['shapes'].append(sub_dict) + + with open(output_path, 'w', encoding='utf-8') as f_json: + json.dump(json_info, f_json, ensure_ascii=False, indent=2) diff --git a/mmyolo/utils/large_image.py b/mmyolo/utils/large_image.py new file mode 100644 index 0000000000000000000000000000000000000000..8670804684f6dcdc6dc1846cf85260d900b3474e --- /dev/null +++ b/mmyolo/utils/large_image.py @@ -0,0 +1,103 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Sequence, Tuple + +import torch +from mmcv.ops import batched_nms +from mmdet.structures import DetDataSample, SampleList +from mmengine.structures import InstanceData + + +def shift_rbboxes(bboxes: torch.Tensor, offset: Sequence[int]): + """Shift rotated bboxes with offset. + + Args: + bboxes (Tensor): The rotated bboxes need to be translated. + With shape (n, 5), which means (x, y, w, h, a). 
+        offset (Sequence[int]): The translation offsets with shape (2, ).
+    Returns:
+        Tensor: Shifted rotated bboxes.
+    """
+    offset_tensor = bboxes.new_tensor(offset)
+    shifted_bboxes = bboxes.clone()
+    shifted_bboxes[:, 0:2] = shifted_bboxes[:, 0:2] + offset_tensor
+    return shifted_bboxes
+
+
+def shift_predictions(det_data_samples: SampleList,
+                      offsets: Sequence[Tuple[int, int]],
+                      src_image_shape: Tuple[int, int]) -> InstanceData:
+    """Shift predictions to the original image.
+
+    Args:
+        det_data_samples (List[:obj:`DetDataSample`]): A list of patch
+            results.
+        offsets (Sequence[Tuple[int, int]]): Positions of the left top points
+            of patches.
+        src_image_shape (Tuple[int, int]): The (height, width) of the large
+            source image.
+    Returns:
+        :obj:`InstanceData`: Shifted predictions of all patches, concatenated
+        into a single instance set.
+    """
+    try:
+        from sahi.slicing import shift_bboxes, shift_masks
+    except ImportError:
+        raise ImportError('Please run "pip install -U sahi" '
+                          'to install sahi first for large image inference.')
+
+    assert len(det_data_samples) == len(offsets), \
+        'The `det_data_samples` should have the same length as `offsets`.'
+    shifted_predictions = []
+    for det_data_sample, offset in zip(det_data_samples, offsets):
+        pred_inst = det_data_sample.pred_instances.clone()
+
+        # Check bbox type
+        if pred_inst.bboxes.size(-1) == 4:
+            # Horizontal bboxes
+            shifted_bboxes = shift_bboxes(pred_inst.bboxes, offset)
+        elif pred_inst.bboxes.size(-1) == 5:
+            # Rotated bboxes
+            shifted_bboxes = shift_rbboxes(pred_inst.bboxes, offset)
+        else:
+            raise NotImplementedError
+
+        # shift bboxes and masks
+        pred_inst.bboxes = shifted_bboxes
+        if 'masks' in det_data_sample:
+            pred_inst.masks = shift_masks(pred_inst.masks, offset,
+                                          src_image_shape)
+
+        shifted_predictions.append(pred_inst.clone())
+
+    shifted_predictions = InstanceData.cat(shifted_predictions)
+
+    return shifted_predictions
+
+
+def merge_results_by_nms(results: SampleList, offsets: Sequence[Tuple[int,
+                                                                      int]],
+                         src_image_shape: Tuple[int, int],
+                         nms_cfg: dict) -> DetDataSample:
+    """Merge patch results by NMS.
+
+    Args:
+        results (List[:obj:`DetDataSample`]): A list of patch results.
+        offsets (Sequence[Tuple[int, int]]): Positions of the left top points
+            of patches.
+        src_image_shape (Tuple[int, int]): The (height, width) of the large
+            source image.
+        nms_cfg (dict): It should specify the NMS type and other parameters
+            like `iou_threshold`.
+    Returns:
+        :obj:`DetDataSample`: Merged results.
+    """
+    shifted_instances = shift_predictions(results, offsets, src_image_shape)
+
+    _, keeps = batched_nms(
+        boxes=shifted_instances.bboxes,
+        scores=shifted_instances.scores,
+        idxs=shifted_instances.labels,
+        nms_cfg=nms_cfg)
+    merged_instances = shifted_instances[keeps]
+
+    merged_result = results[0].clone()
+    merged_result.pred_instances = merged_instances
+    return merged_result
diff --git a/mmyolo/utils/misc.py b/mmyolo/utils/misc.py
new file mode 100644
index 0000000000000000000000000000000000000000..c90f52b94ee9e174c3a289122b6bc7fc58e6e6f1
--- /dev/null
+++ b/mmyolo/utils/misc.py
@@ -0,0 +1,133 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os +import urllib + +import numpy as np +import torch +from mmengine.utils import scandir +from prettytable import PrettyTable + +from mmyolo.models import RepVGGBlock + +IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif', + '.tiff', '.webp') + + +def switch_to_deploy(model): + """Model switch to deploy status.""" + for layer in model.modules(): + if isinstance(layer, RepVGGBlock): + layer.switch_to_deploy() + + print('Switch model to deploy modality.') + + +def auto_arrange_images(image_list: list, image_column: int = 2) -> np.ndarray: + """Auto arrange image to image_column x N row. + + Args: + image_list (list): cv2 image list. + image_column (int): Arrange to N column. Default: 2. + Return: + (np.ndarray): image_column x N row merge image + """ + img_count = len(image_list) + if img_count <= image_column: + # no need to arrange + image_show = np.concatenate(image_list, axis=1) + else: + # arrange image according to image_column + image_row = round(img_count / image_column) + fill_img_list = [np.ones(image_list[0].shape, dtype=np.uint8) * 255 + ] * ( + image_row * image_column - img_count) + image_list.extend(fill_img_list) + merge_imgs_col = [] + for i in range(image_row): + start_col = image_column * i + end_col = image_column * (i + 1) + merge_col = np.hstack(image_list[start_col:end_col]) + merge_imgs_col.append(merge_col) + + # merge to one image + image_show = np.vstack(merge_imgs_col) + + return image_show + + +def get_file_list(source_root: str) -> [list, dict]: + """Get file list. + + Args: + source_root (str): image or video source path + + Return: + source_file_path_list (list): A list for all source file. + source_type (dict): Source type: file or url or dir. + """ + is_dir = os.path.isdir(source_root) + is_url = source_root.startswith(('http:/', 'https:/')) + is_file = os.path.splitext(source_root)[-1].lower() in IMG_EXTENSIONS + + source_file_path_list = [] + if is_dir: + # when input source is dir + for file in scandir(source_root, IMG_EXTENSIONS, recursive=True): + source_file_path_list.append(os.path.join(source_root, file)) + elif is_url: + # when input source is url + filename = os.path.basename( + urllib.parse.unquote(source_root).split('?')[0]) + file_save_path = os.path.join(os.getcwd(), filename) + print(f'Downloading source file to {file_save_path}') + torch.hub.download_url_to_file(source_root, file_save_path) + source_file_path_list = [file_save_path] + elif is_file: + # when input source is single image + source_file_path_list = [source_root] + else: + print('Cannot find image file.') + + source_type = dict(is_dir=is_dir, is_url=is_url, is_file=is_file) + + return source_file_path_list, source_type + + +def show_data_classes(data_classes): + """When printing an error, all class names of the dataset.""" + print('\n\nThe name of the class contained in the dataset:') + data_classes_info = PrettyTable() + data_classes_info.title = 'Information of dataset class' + # List Print Settings + # If the quantity is too large, 25 rows will be displayed in each column + if len(data_classes) < 25: + data_classes_info.add_column('Class name', data_classes) + elif len(data_classes) % 25 != 0 and len(data_classes) > 25: + col_num = int(len(data_classes) / 25) + 1 + data_name_list = list(data_classes) + for i in range(0, (col_num * 25) - len(data_classes)): + data_name_list.append('') + for i in range(0, len(data_name_list), 25): + data_classes_info.add_column('Class name', + data_name_list[i:i + 25]) + + # Align display data to the left + 
data_classes_info.align['Class name'] = 'l' + print(data_classes_info) + + +def is_metainfo_lower(cfg): + """Determine whether the custom metainfo fields are all lowercase.""" + + def judge_keys(dataloader_cfg): + while 'dataset' in dataloader_cfg: + dataloader_cfg = dataloader_cfg['dataset'] + if 'metainfo' in dataloader_cfg: + all_keys = dataloader_cfg['metainfo'].keys() + all_is_lower = all([str(k).islower() for k in all_keys]) + assert all_is_lower, f'The keys in dataset metainfo must be all lowercase, but got {all_keys}. ' \ + f'Please refer to https://github.com/open-mmlab/mmyolo/blob/e62c8c4593/configs/yolov5/yolov5_s-v61_syncbn_fast_1xb4-300e_balloon.py#L8' # noqa + + judge_keys(cfg.get('train_dataloader', {})) + judge_keys(cfg.get('val_dataloader', {})) + judge_keys(cfg.get('test_dataloader', {})) diff --git a/mmyolo/utils/setup_env.py b/mmyolo/utils/setup_env.py new file mode 100644 index 0000000000000000000000000000000000000000..f51ed928cbddb98c2274e09b5acea1d70dfd1abd --- /dev/null +++ b/mmyolo/utils/setup_env.py @@ -0,0 +1,41 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import datetime +import warnings + +from mmengine import DefaultScope + + +def register_all_modules(init_default_scope: bool = True): + """Register all modules in mmdet into the registries. + + Args: + init_default_scope (bool): Whether initialize the mmdet default scope. + When `init_default_scope=True`, the global default scope will be + set to `mmyolo`, and all registries will build modules from mmdet's + registry node. To understand more about the registry, please refer + to https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/registry.md + Defaults to True. + """ # noqa + import mmdet.engine # noqa: F401,F403 + import mmdet.visualization # noqa: F401,F403 + + import mmyolo.datasets # noqa: F401,F403 + import mmyolo.engine # noqa: F401,F403 + import mmyolo.models # noqa: F401,F403 + + if init_default_scope: + never_created = DefaultScope.get_current_instance() is None \ + or not DefaultScope.check_instance_created('mmyolo') + if never_created: + DefaultScope.get_instance('mmyolo', scope_name='mmyolo') + return + current_scope = DefaultScope.get_current_instance() + if current_scope.scope_name != 'mmyolo': + warnings.warn('The current default scope ' + f'"{current_scope.scope_name}" is not "mmyolo", ' + '`register_all_modules` will force the current' + 'default scope to be "mmyolo". If this is not ' + 'expected, please set `init_default_scope=False`.') + # avoid name conflict + new_instance_name = f'mmyolo-{datetime.datetime.now()}' + DefaultScope.get_instance(new_instance_name, scope_name='mmyolo') diff --git a/mmyolo/version.py b/mmyolo/version.py new file mode 100644 index 0000000000000000000000000000000000000000..75c44c7b2a4abf1652db1f9878aef80ec52b5ec9 --- /dev/null +++ b/mmyolo/version.py @@ -0,0 +1,23 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +__version__ = '0.5.0' + +from typing import Tuple + +short_version = __version__ + + +def parse_version_info(version_str: str) -> Tuple: + """Parse version info of MMYOLO.""" + version_info = [] + for x in version_str.split('.'): + if x.isdigit(): + version_info.append(int(x)) + elif x.find('rc') != -1: + patch_version = x.split('rc') + version_info.append(int(patch_version[0])) + version_info.append(f'rc{patch_version[1]}') + return tuple(version_info) + + +version_info = parse_version_info(__version__)
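Taken together, the new modules are typically wired up roughly as in the
following minimal sketch (not part of the patch above; the anchor box,
prediction and stride values are made up for illustration, and a full MMYOLO
installation with its mmdet/mmengine dependencies is assumed):

    import torch

    from mmyolo.registry import TASK_UTILS
    from mmyolo.utils import register_all_modules

    register_all_modules()  # set the 'mmyolo' default scope

    # Build a bbox coder from a config dict via the TASK_UTILS registry.
    coder = TASK_UTILS.build(dict(type='YOLOv5BBoxCoder'))

    prior = torch.tensor([[100., 100., 116., 116.]])  # one 16x16 anchor
    raw_pred = torch.zeros(1, 4)  # raw head outputs, before sigmoid

    # With all-zero raw predictions the decoded box reproduces the prior:
    # tensor([[100., 100., 116., 116.]])
    print(coder.decode(prior, raw_pred, stride=8))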