diff --git a/app.py b/app.py index 585d97c991f7c243cbcd743a6317599248e2993d..15cf9536c53960e84faf3f1b0dfaec85773e89be 100644 --- a/app.py +++ b/app.py @@ -13,29 +13,22 @@ try: except: os.system('pip install /home/user/app/main/transformer_utils') hf_hub_download(repo_id="caizhongang/SMPLer-X", filename="smpler_x_h32.pth.tar", local_dir="/home/user/app/pretrained_models") -os.system('cp -rf /home/user/app/assets/conversions.py /home/user/.pyenv/versions/3.9.18/lib/python3.9/site-packages/torchgeometry/core/conversions.py') +os.system('cp -rf /home/user/app/assets/conversions.py /usr/local/lib/python3.10/site-packages/torchgeometry/core/conversions.py') DEFAULT_MODEL='smpler_x_h32' OUT_FOLDER = '/home/user/app/demo_out' os.makedirs(OUT_FOLDER, exist_ok=True) -# num_gpus = 1 if torch.cuda.is_available() else -1 -# print("!!!", torch.cuda.is_available()) -# print(torch.cuda.device_count()) -# print(torch.version.cuda) -# index = torch.cuda.current_device() -# print(index) -# print(torch.cuda.get_device_name(index)) +num_gpus = 1 if torch.cuda.is_available() else -1 +print("!!!", torch.cuda.is_available()) +print(torch.cuda.device_count()) +print(torch.version.cuda) +index = torch.cuda.current_device() +print(index) +print(torch.cuda.get_device_name(index)) # from main.inference import Inferer # inferer = Inferer(DEFAULT_MODEL, num_gpus, OUT_FOLDER) @spaces.GPU(enable_queue=True) def infer(video_input, in_threshold=0.5, num_people="Single person", render_mesh=False): - num_gpus = 1 if torch.cuda.is_available() else -1 - print("!!!", torch.cuda.is_available()) - print(torch.cuda.device_count()) - print(torch.version.cuda) - index = torch.cuda.current_device() - print(index) - print(torch.cuda.get_device_name(index)) from main.inference import Inferer inferer = Inferer(DEFAULT_MODEL, num_gpus, OUT_FOLDER) os.system(f'rm -rf {OUT_FOLDER}/*') diff --git a/common/base.py b/common/base.py index 21fe9d3f36f39167879aa84c248c32ec80b12927..8434f1b8e6c49988df0175649a5cfaf4c6a16269 100644 --- a/common/base.py +++ b/common/base.py @@ -17,7 +17,7 @@ import torch.utils.data.distributed from utils.distribute_utils import ( get_rank, is_main_process, time_synchronized, get_group_idx, get_process_groups ) -from mmcv.runner import get_dist_info + class Base(object): __metaclass__ = abc.ABCMeta diff --git a/common/utils/distribute_utils.py b/common/utils/distribute_utils.py index 5b1c71cd3863fe1f99370d7d072d6389f663959e..a6c928a5a3c66885e1311949a82feb181aee60a3 100644 --- a/common/utils/distribute_utils.py +++ b/common/utils/distribute_utils.py @@ -7,7 +7,7 @@ import tempfile import time import torch import torch.distributed as dist -from mmcv.runner import get_dist_info +from mmengine.dist import get_dist_info import random import numpy as np import subprocess diff --git a/main/SMPLer_X.py b/main/SMPLer_X.py index 1ca9477babbb0eee26f296b47359f7b0911d0a31..f1c71cc6d0d3d6ada4d164da94cda0784ab9aaad 100644 --- a/main/SMPLer_X.py +++ b/main/SMPLer_X.py @@ -9,7 +9,7 @@ from config import cfg import math import copy from mmpose.models import build_posenet -from mmcv import Config +from mmengine.config import Config class Model(nn.Module): def __init__(self, encoder, body_position_net, body_rotation_net, box_net, hand_position_net, hand_roi_net, diff --git a/main/config.py b/main/config.py index d9a53874b62f35492e5a50034f82420c9595eed3..dd5fd00b8e66a2ee782b3005075c5d23d032b1b9 100644 --- a/main/config.py +++ b/main/config.py @@ -2,7 +2,8 @@ import os import os.path as osp import sys import datetime -from mmcv import 
Config as MMConfig +from mmengine.config import Config as MMConfig + class Config: def get_config_fromfile(self, config_path): diff --git a/main/inference.py b/main/inference.py index a0dc4a161590d1ec3de8c7b528e2f6fea3db6683..10a4f0ad6c7058096237a32037ed43b89f9d8030 100644 --- a/main/inference.py +++ b/main/inference.py @@ -53,8 +53,14 @@ class Inferer: ## mmdet inference mmdet_results = inference_detector(self.model, original_img) - mmdet_box = process_mmdet_results(mmdet_results, cat_id=0, multi_person=True) + pred_instance = mmdet_results.pred_instances.cpu().numpy() + bboxes = np.concatenate( + (pred_instance.bboxes, pred_instance.scores[:, None]), axis=1) + bboxes = bboxes[pred_instance.labels == 0] + bboxes = np.expand_dims(bboxes, axis=0) + mmdet_box = process_mmdet_results(bboxes, cat_id=0, multi_person=True) + # save original image if no bbox if len(mmdet_box[0])<1: return original_img, [], [] diff --git a/main/transformer_utils/mmpose/__init__.py b/main/transformer_utils/mmpose/__init__.py index abcf8693e279f59c8c80f55e1797841e593dbd72..690da3a78ba0033e7dc820b3d9a681da3ca39706 100644 --- a/main/transformer_utils/mmpose/__init__.py +++ b/main/transformer_utils/mmpose/__init__.py @@ -17,7 +17,7 @@ def digit_version(version_str): mmcv_minimum_version = '1.3.8' -mmcv_maximum_version = '1.8.0' +mmcv_maximum_version = '2.3.0' mmcv_version = digit_version(mmcv.__version__) diff --git a/main/transformer_utils/mmpose/core/camera/camera_base.py b/main/transformer_utils/mmpose/core/camera/camera_base.py index 28b23e7c6279e3613265a949df91f6ced0413b99..092dc20d6b1f1d2db785ad720f67fd9184930ad5 100644 --- a/main/transformer_utils/mmpose/core/camera/camera_base.py +++ b/main/transformer_utils/mmpose/core/camera/camera_base.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. from abc import ABCMeta, abstractmethod -from mmcv.utils import Registry +from mmengine import Registry CAMERAS = Registry('camera') diff --git a/main/transformer_utils/mmpose/core/distributed_wrapper.py b/main/transformer_utils/mmpose/core/distributed_wrapper.py index c67aceec992085e9952ea70c62009e9ec1db30ca..12122e71cb4fd46f0e23bb6df127339325f47520 100644 --- a/main/transformer_utils/mmpose/core/distributed_wrapper.py +++ b/main/transformer_utils/mmpose/core/distributed_wrapper.py @@ -4,7 +4,7 @@ import torch.nn as nn from mmcv.parallel import MODULE_WRAPPERS as MMCV_MODULE_WRAPPERS from mmcv.parallel import MMDistributedDataParallel from mmcv.parallel.scatter_gather import scatter_kwargs -from mmcv.utils import Registry +from mmengine import Registry from torch.cuda._utils import _get_device_index MODULE_WRAPPERS = Registry('module wrapper', parent=MMCV_MODULE_WRAPPERS) diff --git a/main/transformer_utils/mmpose/core/evaluation/eval_hooks.py b/main/transformer_utils/mmpose/core/evaluation/eval_hooks.py index b35a9c6a990c69b2beac9e73f893f97c237e4783..e94cb9a914d5e18816c81545c329958f7630877e 100644 --- a/main/transformer_utils/mmpose/core/evaluation/eval_hooks.py +++ b/main/transformer_utils/mmpose/core/evaluation/eval_hooks.py @@ -1,8 +1,18 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import warnings +import os.path as osp +import warnings +from math import inf +from typing import Callable, List, Optional + +import torch.distributed as dist +from torch.nn.modules.batchnorm import _BatchNorm +from torch.utils.data import DataLoader + +from mmengine.fileio import FileClient +from mmengine.utils import is_seq_of +from mmengine.hooks import Hook, LoggerHook -from mmcv.runner import DistEvalHook as _DistEvalHook -from mmcv.runner import EvalHook as _EvalHook MMPOSE_GREATER_KEYS = [ 'acc', 'ap', 'ar', 'pck', 'auc', '3dpck', 'p-3dpck', '3dauc', 'p-3dauc', @@ -10,6 +20,505 @@ MMPOSE_GREATER_KEYS = [ ] MMPOSE_LESS_KEYS = ['loss', 'epe', 'nme', 'mpjpe', 'p-mpjpe', 'n-mpjpe'] +class _EvalHook(Hook): + """Non-Distributed evaluation hook. + + This hook will regularly perform evaluation in a given interval when + performing in non-distributed environment. + + Args: + dataloader (DataLoader): A PyTorch dataloader, whose dataset has + implemented ``evaluate`` function. + start (int | None, optional): Evaluation starting epoch or iteration. + It enables evaluation before the training starts if ``start`` <= + the resuming epoch or iteration. If None, whether to evaluate is + merely decided by ``interval``. Default: None. + interval (int): Evaluation interval. Default: 1. + by_epoch (bool): Determine perform evaluation by epoch or by iteration. + If set to True, it will perform by epoch. Otherwise, by iteration. + Default: True. + save_best (str, optional): If a metric is specified, it would measure + the best checkpoint during evaluation. The information about best + checkpoint would be saved in ``runner.meta['hook_msgs']`` to keep + best score value and best checkpoint path, which will be also + loaded when resume checkpoint. Options are the evaluation metrics + on the test dataset. e.g., ``bbox_mAP``, ``segm_mAP`` for bbox + detection and instance segmentation. ``AR@100`` for proposal + recall. If ``save_best`` is ``auto``, the first key of the returned + ``OrderedDict`` result will be used. Default: None. + rule (str | None, optional): Comparison rule for best score. If set to + None, it will infer a reasonable rule. Keys such as 'acc', 'top' + .etc will be inferred by 'greater' rule. Keys contain 'loss' will + be inferred by 'less' rule. Options are 'greater', 'less', None. + Default: None. + test_fn (callable, optional): test a model with samples from a + dataloader, and return the test results. If ``None``, the default + test function ``mmcv.engine.single_gpu_test`` will be used. + (default: ``None``) + greater_keys (List[str] | None, optional): Metric keys that will be + inferred by 'greater' comparison rule. If ``None``, + _default_greater_keys will be used. (default: ``None``) + less_keys (List[str] | None, optional): Metric keys that will be + inferred by 'less' comparison rule. If ``None``, _default_less_keys + will be used. (default: ``None``) + out_dir (str, optional): The root directory to save checkpoints. If not + specified, `runner.work_dir` will be used by default. If specified, + the `out_dir` will be the concatenation of `out_dir` and the last + level directory of `runner.work_dir`. + `New in version 1.3.16.` + file_client_args (dict): Arguments to instantiate a FileClient. + See :class:`mmcv.fileio.FileClient` for details. Default: None. + `New in version 1.3.16.` + **eval_kwargs: Evaluation arguments fed into the evaluate function of + the dataset. + + Note: + If new arguments are added for EvalHook, tools/test.py, + tools/eval_metric.py may be affected. 
+ """ + + # Since the key for determine greater or less is related to the downstream + # tasks, downstream repos may need to overwrite the following inner + # variable accordingly. + + rule_map = {'greater': lambda x, y: x > y, 'less': lambda x, y: x < y} + init_value_map = {'greater': -inf, 'less': inf} + _default_greater_keys = [ + 'acc', 'top', 'AR@', 'auc', 'precision', 'mAP', 'mDice', 'mIoU', + 'mAcc', 'aAcc' + ] + _default_less_keys = ['loss'] + + def __init__(self, + dataloader: DataLoader, + start: Optional[int] = None, + interval: int = 1, + by_epoch: bool = True, + save_best: Optional[str] = None, + rule: Optional[str] = None, + test_fn: Optional[Callable] = None, + greater_keys: Optional[List[str]] = None, + less_keys: Optional[List[str]] = None, + out_dir: Optional[str] = None, + file_client_args: Optional[dict] = None, + **eval_kwargs): + if not isinstance(dataloader, DataLoader): + raise TypeError(f'dataloader must be a pytorch DataLoader, ' + f'but got {type(dataloader)}') + + if interval <= 0: + raise ValueError(f'interval must be a positive number, ' + f'but got {interval}') + + assert isinstance(by_epoch, bool), '``by_epoch`` should be a boolean' + + if start is not None and start < 0: + raise ValueError(f'The evaluation start epoch {start} is smaller ' + f'than 0') + + self.dataloader = dataloader + self.interval = interval + self.start = start + self.by_epoch = by_epoch + + assert isinstance(save_best, str) or save_best is None, \ + '""save_best"" should be a str or None ' \ + f'rather than {type(save_best)}' + self.save_best = save_best + self.eval_kwargs = eval_kwargs + self.initial_flag = True + + if test_fn is None: + from mmcv.engine import single_gpu_test + self.test_fn = single_gpu_test + else: + self.test_fn = test_fn + + if greater_keys is None: + self.greater_keys = self._default_greater_keys + else: + if not isinstance(greater_keys, (list, tuple)): + assert isinstance(greater_keys, str) + greater_keys = (greater_keys, ) + assert is_seq_of(greater_keys, str) + self.greater_keys = greater_keys + + if less_keys is None: + self.less_keys = self._default_less_keys + else: + if not isinstance(less_keys, (list, tuple)): + assert isinstance(greater_keys, str) + less_keys = (less_keys, ) + assert is_seq_of(less_keys, str) + self.less_keys = less_keys + + if self.save_best is not None: + self.best_ckpt_path = None + self._init_rule(rule, self.save_best) + + self.out_dir = out_dir + self.file_client_args = file_client_args + + def _init_rule(self, rule: Optional[str], key_indicator: str): + """Initialize rule, key_indicator, comparison_func, and best score. + + Here is the rule to determine which rule is used for key indicator + when the rule is not specific (note that the key indicator matching + is case-insensitive): + 1. If the key indicator is in ``self.greater_keys``, the rule will be + specified as 'greater'. + 2. Or if the key indicator is in ``self.less_keys``, the rule will be + specified as 'less'. + 3. Or if any one item in ``self.greater_keys`` is a substring of + key_indicator , the rule will be specified as 'greater'. + 4. Or if any one item in ``self.less_keys`` is a substring of + key_indicator , the rule will be specified as 'less'. + + Args: + rule (str | None): Comparison rule for best score. + key_indicator (str | None): Key indicator to determine the + comparison rule. 
+ """ + if rule not in self.rule_map and rule is not None: + raise KeyError(f'rule must be greater, less or None, ' + f'but got {rule}.') + + if rule is None: + if key_indicator != 'auto': + # `_lc` here means we use the lower case of keys for + # case-insensitive matching + assert isinstance(key_indicator, str) + key_indicator_lc = key_indicator.lower() + greater_keys = [key.lower() for key in self.greater_keys] + less_keys = [key.lower() for key in self.less_keys] + + if key_indicator_lc in greater_keys: + rule = 'greater' + elif key_indicator_lc in less_keys: + rule = 'less' + elif any(key in key_indicator_lc for key in greater_keys): + rule = 'greater' + elif any(key in key_indicator_lc for key in less_keys): + rule = 'less' + else: + raise ValueError(f'Cannot infer the rule for key ' + f'{key_indicator}, thus a specific rule ' + f'must be specified.') + self.rule = rule + self.key_indicator = key_indicator + if self.rule is not None: + self.compare_func = self.rule_map[self.rule] + + def before_run(self, runner): + if not self.out_dir: + self.out_dir = runner.work_dir + + self.file_client = FileClient.infer_client(self.file_client_args, + self.out_dir) + + # if `self.out_dir` is not equal to `runner.work_dir`, it means that + # `self.out_dir` is set so the final `self.out_dir` is the + # concatenation of `self.out_dir` and the last level directory of + # `runner.work_dir` + if self.out_dir != runner.work_dir: + basename = osp.basename(runner.work_dir.rstrip(osp.sep)) + self.out_dir = self.file_client.join_path(self.out_dir, basename) + runner.logger.info( + f'The best checkpoint will be saved to {self.out_dir} by ' + f'{self.file_client.name}') + + if self.save_best is not None: + if runner.meta is None: + warnings.warn('runner.meta is None. Creating an empty one.') + runner.meta = dict() + runner.meta.setdefault('hook_msgs', dict()) + self.best_ckpt_path = runner.meta['hook_msgs'].get( + 'best_ckpt', None) + + def before_train_iter(self, runner): + """Evaluate the model only at the start of training by iteration.""" + if self.by_epoch or not self.initial_flag: + return + if self.start is not None and runner.iter >= self.start: + self.after_train_iter(runner) + self.initial_flag = False + + def before_train_epoch(self, runner): + """Evaluate the model only at the start of training by epoch.""" + if not (self.by_epoch and self.initial_flag): + return + if self.start is not None and runner.epoch >= self.start: + self.after_train_epoch(runner) + self.initial_flag = False + + def after_train_iter(self, runner): + """Called after every training iter to evaluate the results.""" + if not self.by_epoch and self._should_evaluate(runner): + # Because the priority of EvalHook is higher than LoggerHook, the + # training log and the evaluating log are mixed. Therefore, + # we need to dump the training log and clear it before evaluating + # log is generated. In addition, this problem will only appear in + # `IterBasedRunner` whose `self.by_epoch` is False, because + # `EpochBasedRunner` whose `self.by_epoch` is True calls + # `_do_evaluate` in `after_train_epoch` stage, and at this stage + # the training log has been printed, so it will not cause any + # problem. 
more details at
+            # https://github.com/open-mmlab/mmsegmentation/issues/694
+            for hook in runner._hooks:
+                if isinstance(hook, LoggerHook):
+                    hook.after_train_iter(runner)
+            runner.log_buffer.clear()
+
+            self._do_evaluate(runner)
+
+    def after_train_epoch(self, runner):
+        """Called after every training epoch to evaluate the results."""
+        if self.by_epoch and self._should_evaluate(runner):
+            self._do_evaluate(runner)
+
+    def _do_evaluate(self, runner):
+        """perform evaluation and save ckpt."""
+        results = self.test_fn(runner.model, self.dataloader)
+        runner.log_buffer.output['eval_iter_num'] = len(self.dataloader)
+        key_score = self.evaluate(runner, results)
+        # the key_score may be `None` so it needs to skip the action to save
+        # the best checkpoint
+        if self.save_best and key_score:
+            self._save_ckpt(runner, key_score)
+
+    def _should_evaluate(self, runner):
+        """Judge whether to perform evaluation.
+
+        Here is the rule to judge whether to perform evaluation:
+        1. It will not perform evaluation during the epoch/iteration interval,
+           which is determined by ``self.interval``.
+        2. It will not perform evaluation if the start time is larger than
+           the current time.
+        3. It will not perform evaluation when the current time is larger
+           than the start time but still falls within the epoch/iteration
+           interval.
+
+        Returns:
+            bool: The flag indicating whether to perform evaluation.
+        """
+        if self.by_epoch:
+            current = runner.epoch
+            check_time = self.every_n_epochs
+        else:
+            current = runner.iter
+            check_time = self.every_n_iters
+
+        if self.start is None:
+            if not check_time(runner, self.interval):
+                # No evaluation during the interval.
+                return False
+        elif (current + 1) < self.start:
+            # No evaluation if start is larger than the current time.
+            return False
+        else:
+            # Evaluation only at epochs/iters 3, 5, 7...
+            # if start==3 and interval==2
+            if (current + 1 - self.start) % self.interval:
+                return False
+        return True
+
+    def _save_ckpt(self, runner, key_score):
+        """Save the best checkpoint.
+
+        It will compare the score according to the compare function, write
+        related information (best score, best checkpoint path) and save the
+        best checkpoint into ``work_dir``.
+        """
+        if self.by_epoch:
+            current = f'epoch_{runner.epoch + 1}'
+            cur_type, cur_time = 'epoch', runner.epoch + 1
+        else:
+            current = f'iter_{runner.iter + 1}'
+            cur_type, cur_time = 'iter', runner.iter + 1
+
+        best_score = runner.meta['hook_msgs'].get(
+            'best_score', self.init_value_map[self.rule])
+        if self.compare_func(key_score, best_score):
+            best_score = key_score
+            runner.meta['hook_msgs']['best_score'] = best_score
+
+            if self.best_ckpt_path and self.file_client.isfile(
+                    self.best_ckpt_path):
+                self.file_client.remove(self.best_ckpt_path)
+                runner.logger.info(
+                    f'The previous best checkpoint {self.best_ckpt_path} was '
+                    'removed')
+
+            best_ckpt_name = f'best_{self.key_indicator}_{current}.pth'
+            self.best_ckpt_path = self.file_client.join_path(
+                self.out_dir, best_ckpt_name)
+            runner.meta['hook_msgs']['best_ckpt'] = self.best_ckpt_path
+
+            runner.save_checkpoint(
+                self.out_dir,
+                filename_tmpl=best_ckpt_name,
+                create_symlink=False)
+            runner.logger.info(
+                f'Now best checkpoint is saved as {best_ckpt_name}.')
+            runner.logger.info(
+                f'Best {self.key_indicator} is {best_score:0.4f} '
+                f'at {cur_time} {cur_type}.')
+
+    def evaluate(self, runner, results):
+        """Evaluate the results.
+
+        Args:
+            runner (:obj:`mmcv.Runner`): The underlying training runner.
+            results (list): Output results.
+ """ + eval_res = self.dataloader.dataset.evaluate( + results, logger=runner.logger, **self.eval_kwargs) + + for name, val in eval_res.items(): + runner.log_buffer.output[name] = val + runner.log_buffer.ready = True + + if self.save_best is not None: + # If the performance of model is poor, the `eval_res` may be an + # empty dict and it will raise exception when `self.save_best` is + # not None. More details at + # https://github.com/open-mmlab/mmdetection/issues/6265. + if not eval_res: + warnings.warn( + 'Since `eval_res` is an empty dict, the behavior to save ' + 'the best checkpoint will be skipped in this evaluation.') + return None + + if self.key_indicator == 'auto': + # infer from eval_results + self._init_rule(self.rule, list(eval_res.keys())[0]) + return eval_res[self.key_indicator] + + return None + + +class _DistEvalHook(_EvalHook): + """Distributed evaluation hook. + + This hook will regularly perform evaluation in a given interval when + performing in distributed environment. + + Args: + dataloader (DataLoader): A PyTorch dataloader, whose dataset has + implemented ``evaluate`` function. + start (int | None, optional): Evaluation starting epoch. It enables + evaluation before the training starts if ``start`` <= the resuming + epoch. If None, whether to evaluate is merely decided by + ``interval``. Default: None. + interval (int): Evaluation interval. Default: 1. + by_epoch (bool): Determine perform evaluation by epoch or by iteration. + If set to True, it will perform by epoch. Otherwise, by iteration. + default: True. + save_best (str, optional): If a metric is specified, it would measure + the best checkpoint during evaluation. The information about best + checkpoint would be saved in ``runner.meta['hook_msgs']`` to keep + best score value and best checkpoint path, which will be also + loaded when resume checkpoint. Options are the evaluation metrics + on the test dataset. e.g., ``bbox_mAP``, ``segm_mAP`` for bbox + detection and instance segmentation. ``AR@100`` for proposal + recall. If ``save_best`` is ``auto``, the first key of the returned + ``OrderedDict`` result will be used. Default: None. + rule (str | None, optional): Comparison rule for best score. If set to + None, it will infer a reasonable rule. Keys such as 'acc', 'top' + .etc will be inferred by 'greater' rule. Keys contain 'loss' will + be inferred by 'less' rule. Options are 'greater', 'less', None. + Default: None. + test_fn (callable, optional): test a model with samples from a + dataloader in a multi-gpu manner, and return the test results. If + ``None``, the default test function ``mmcv.engine.multi_gpu_test`` + will be used. (default: ``None``) + tmpdir (str | None): Temporary directory to save the results of all + processes. Default: None. + gpu_collect (bool): Whether to use gpu or cpu to collect results. + Default: False. + broadcast_bn_buffer (bool): Whether to broadcast the + buffer(running_mean and running_var) of rank 0 to other rank + before evaluation. Default: True. + out_dir (str, optional): The root directory to save checkpoints. If not + specified, `runner.work_dir` will be used by default. If specified, + the `out_dir` will be the concatenation of `out_dir` and the last + level directory of `runner.work_dir`. + file_client_args (dict): Arguments to instantiate a FileClient. + See :class:`mmcv.fileio.FileClient` for details. Default: None. + **eval_kwargs: Evaluation arguments fed into the evaluate function of + the dataset. 
+ """ + + def __init__(self, + dataloader: DataLoader, + start: Optional[int] = None, + interval: int = 1, + by_epoch: bool = True, + save_best: Optional[str] = None, + rule: Optional[str] = None, + test_fn: Optional[Callable] = None, + greater_keys: Optional[List[str]] = None, + less_keys: Optional[List[str]] = None, + broadcast_bn_buffer: bool = True, + tmpdir: Optional[str] = None, + gpu_collect: bool = False, + out_dir: Optional[str] = None, + file_client_args: Optional[dict] = None, + **eval_kwargs): + + if test_fn is None: + from mmcv.engine import multi_gpu_test + test_fn = multi_gpu_test + + super().__init__( + dataloader, + start=start, + interval=interval, + by_epoch=by_epoch, + save_best=save_best, + rule=rule, + test_fn=test_fn, + greater_keys=greater_keys, + less_keys=less_keys, + out_dir=out_dir, + file_client_args=file_client_args, + **eval_kwargs) + + self.broadcast_bn_buffer = broadcast_bn_buffer + self.tmpdir = tmpdir + self.gpu_collect = gpu_collect + + def _do_evaluate(self, runner): + """perform evaluation and save ckpt.""" + # Synchronization of BatchNorm's buffer (running_mean + # and running_var) is not supported in the DDP of pytorch, + # which may cause the inconsistent performance of models in + # different ranks, so we broadcast BatchNorm's buffers + # of rank 0 to other ranks to avoid this. + if self.broadcast_bn_buffer: + model = runner.model + for name, module in model.named_modules(): + if isinstance(module, + _BatchNorm) and module.track_running_stats: + dist.broadcast(module.running_var, 0) + dist.broadcast(module.running_mean, 0) + + tmpdir = self.tmpdir + if tmpdir is None: + tmpdir = osp.join(runner.work_dir, '.eval_hook') + + results = self.test_fn( + runner.model, + self.dataloader, + tmpdir=tmpdir, + gpu_collect=self.gpu_collect) + if runner.rank == 0: + print('\n') + runner.log_buffer.output['eval_iter_num'] = len(self.dataloader) + key_score = self.evaluate(runner, results) + # the key_score may be `None` so it needs to skip the action to + # save the best checkpoint + if self.save_best and key_score: + self._save_ckpt(runner, key_score) class EvalHook(_EvalHook): diff --git a/main/transformer_utils/mmpose/core/fp16/hooks.py b/main/transformer_utils/mmpose/core/fp16/hooks.py index 74081a9b73b95ebb20cabf07cfaeab86cc874780..c4e414396925b9d15b5958d4831bec06f0d0f7bf 100644 --- a/main/transformer_utils/mmpose/core/fp16/hooks.py +++ b/main/transformer_utils/mmpose/core/fp16/hooks.py @@ -1,15 +1,90 @@ # Copyright (c) OpenMMLab. All rights reserved. import copy +import logging +from typing import Optional import torch import torch.nn as nn -from mmcv.runner import OptimizerHook -from mmcv.utils import _BatchNorm +from torch import Tensor +from torch.nn.utils import clip_grad +from mmengine.hooks import Hook +from torch.nn.modules.batchnorm import _BatchNorm from ..utils.dist_utils import allreduce_grads from .utils import cast_tensor_type +class OptimizerHook(Hook): + """A hook contains custom operations for the optimizer. + + Args: + grad_clip (dict, optional): A config dict to control the clip_grad. + Default: None. + detect_anomalous_params (bool): This option is only used for + debugging which will slow down the training speed. + Detect anomalous parameters that are not included in + the computational graph with `loss` as the root. + There are two cases + + - Parameters were not used during + forward pass. + - Parameters were not used to produce + loss. + Default: False. 
+ """ + + def __init__(self, + grad_clip: Optional[dict] = None, + detect_anomalous_params: bool = False): + self.grad_clip = grad_clip + self.detect_anomalous_params = detect_anomalous_params + + def clip_grads(self, params): + params = list( + filter(lambda p: p.requires_grad and p.grad is not None, params)) + if len(params) > 0: + return clip_grad.clip_grad_norm_(params, **self.grad_clip) + + def after_train_iter(self, runner): + runner.optimizer.zero_grad() + if self.detect_anomalous_params: + self.detect_anomalous_parameters(runner.outputs['loss'], runner) + runner.outputs['loss'].backward() + + if self.grad_clip is not None: + grad_norm = self.clip_grads(runner.model.parameters()) + if grad_norm is not None: + # Add grad norm to the logger + runner.log_buffer.update({'grad_norm': float(grad_norm)}, + runner.outputs['num_samples']) + runner.optimizer.step() + + def detect_anomalous_parameters(self, loss: Tensor, runner) -> None: + logger = runner.logger + parameters_in_graph = set() + visited = set() + + def traverse(grad_fn): + if grad_fn is None: + return + if grad_fn not in visited: + visited.add(grad_fn) + if hasattr(grad_fn, 'variable'): + parameters_in_graph.add(grad_fn.variable) + parents = grad_fn.next_functions + if parents is not None: + for parent in parents: + grad_fn = parent[0] + traverse(grad_fn) + + traverse(loss.grad_fn) + for n, p in runner.model.named_parameters(): + if p not in parameters_in_graph and p.requires_grad: + logger.log( + level=logging.ERROR, + msg=f'{n} with shape {p.size()} is not ' + f'in the computational graph \n') + class Fp16OptimizerHook(OptimizerHook): """FP16 optimizer hook. diff --git a/main/transformer_utils/mmpose/core/optimizers/builder.py b/main/transformer_utils/mmpose/core/optimizers/builder.py index cd2cf49133c57f28261b555d30a5cee18ae105af..aa9d2c7ab4b464b2900a7bd14076e601a4d1168c 100644 --- a/main/transformer_utils/mmpose/core/optimizers/builder.py +++ b/main/transformer_utils/mmpose/core/optimizers/builder.py @@ -1,24 +1,37 @@ # Copyright (c) OpenMMLab. All rights reserved. -from mmcv.runner import build_optimizer -from mmcv.runner.optimizer import OPTIMIZER_BUILDERS as MMCV_OPTIMIZER_BUILDERS -from mmcv.utils import Registry, build_from_cfg +import copy +from typing import Dict +# from mmcv.runner.optimizer import OPTIMIZER_BUILDERS as MMCV_OPTIMIZER_BUILDERS +from mmengine import Registry +from mmengine.registry import build_from_cfg OPTIMIZERS = Registry('optimizers') -OPTIMIZER_BUILDERS = Registry( - 'optimizer builder', parent=MMCV_OPTIMIZER_BUILDERS) +OPTIMIZER_BUILDERS = Registry('optimizer builder') def build_optimizer_constructor(cfg): constructor_type = cfg.get('type') if constructor_type in OPTIMIZER_BUILDERS: return build_from_cfg(cfg, OPTIMIZER_BUILDERS) - elif constructor_type in MMCV_OPTIMIZER_BUILDERS: - return build_from_cfg(cfg, MMCV_OPTIMIZER_BUILDERS) else: raise KeyError(f'{constructor_type} is not registered ' 'in the optimizer builder registry.') +def build_optimizer(model, cfg: Dict): + optimizer_cfg = copy.deepcopy(cfg) + constructor_type = optimizer_cfg.pop('constructor', + 'DefaultOptimizerConstructor') + paramwise_cfg = optimizer_cfg.pop('paramwise_cfg', None) + optim_constructor = build_optimizer_constructor( + dict( + type=constructor_type, + optimizer_cfg=optimizer_cfg, + paramwise_cfg=paramwise_cfg)) + optimizer = optim_constructor(model) + return optimizer + + def build_optimizers(model, cfgs): """Build multiple optimizers from configs. 
diff --git a/main/transformer_utils/mmpose/core/optimizers/layer_decay_optimizer_constructor.py b/main/transformer_utils/mmpose/core/optimizers/layer_decay_optimizer_constructor.py index 1ab6a82548c046483b7c412cefa0762cdbc531f8..958b50ae4839f5fd4dc0aa864a1723c9cbc9d8c8 100644 --- a/main/transformer_utils/mmpose/core/optimizers/layer_decay_optimizer_constructor.py +++ b/main/transformer_utils/mmpose/core/optimizers/layer_decay_optimizer_constructor.py @@ -1,8 +1,8 @@ # Copyright (c) OpenMMLab. All rights reserved. import json import warnings - -from mmcv.runner import DefaultOptimizerConstructor, get_dist_info +from mmengine.dist import get_dist_info +from mmcv.runner import DefaultOptimizerConstructor from mmpose.utils import get_root_logger from .builder import OPTIMIZER_BUILDERS diff --git a/main/transformer_utils/mmpose/core/post_processing/smoother.py b/main/transformer_utils/mmpose/core/post_processing/smoother.py index 6b57768c03b48ff84877acbceb6e27b82832c04d..083e360a15f38660eea19a8115412ff70fcd1b80 100644 --- a/main/transformer_utils/mmpose/core/post_processing/smoother.py +++ b/main/transformer_utils/mmpose/core/post_processing/smoother.py @@ -4,8 +4,8 @@ import warnings from typing import Dict, Union import numpy as np -from mmcv import Config, is_seq_of - +from mmengine.config import Config +from mmengine.utils import is_seq_of from mmpose.core.post_processing.temporal_filters import build_filter diff --git a/main/transformer_utils/mmpose/core/post_processing/temporal_filters/builder.py b/main/transformer_utils/mmpose/core/post_processing/temporal_filters/builder.py index adb914c5222db967c9cdb56fa9f469ff47792f79..cd429df5106ff7a27dc4f63cb510b442ed48bb87 100644 --- a/main/transformer_utils/mmpose/core/post_processing/temporal_filters/builder.py +++ b/main/transformer_utils/mmpose/core/post_processing/temporal_filters/builder.py @@ -1,5 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-from mmcv.utils import Registry +from mmengine import Registry FILTERS = Registry('filters') diff --git a/main/transformer_utils/mmpose/core/post_processing/temporal_filters/smoothnet_filter.py b/main/transformer_utils/mmpose/core/post_processing/temporal_filters/smoothnet_filter.py index c7f8df520ad9457722f738c33b79d69d3a99fb9e..dd73b09717c25d2fef9839f6f9869bc45d8958ef 100644 --- a/main/transformer_utils/mmpose/core/post_processing/temporal_filters/smoothnet_filter.py +++ b/main/transformer_utils/mmpose/core/post_processing/temporal_filters/smoothnet_filter.py @@ -3,7 +3,7 @@ from typing import Optional import numpy as np import torch -from mmcv.runner import load_checkpoint +from mmengine.runner import load_checkpoint from torch import Tensor, nn from .builder import FILTERS diff --git a/main/transformer_utils/mmpose/core/utils/dist_utils.py b/main/transformer_utils/mmpose/core/utils/dist_utils.py index b81f925ad7aa51ce800e27bead8eb8ba021c2592..b6273bab4870ac646edbddcc21e2c30de462f2a2 100644 --- a/main/transformer_utils/mmpose/core/utils/dist_utils.py +++ b/main/transformer_utils/mmpose/core/utils/dist_utils.py @@ -4,7 +4,7 @@ from collections import OrderedDict import numpy as np import torch import torch.distributed as dist -from mmcv.runner import get_dist_info +from mmengine.dist import get_dist_info from torch._utils import (_flatten_dense_tensors, _take_tensors, _unflatten_dense_tensors) diff --git a/main/transformer_utils/mmpose/core/utils/model_util_hooks.py b/main/transformer_utils/mmpose/core/utils/model_util_hooks.py index d308a8a57a04f1a2acaa841ac2e8ad42439bb633..f03e3178309b08e7969dd6793e39d8bb743115cf 100644 --- a/main/transformer_utils/mmpose/core/utils/model_util_hooks.py +++ b/main/transformer_utils/mmpose/core/utils/model_util_hooks.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-from mmcv.runner import HOOKS, Hook - +from mmengine.registry import HOOKS +from mmengine.hooks import Hook @HOOKS.register_module() class ModelSetEpochHook(Hook): diff --git a/main/transformer_utils/mmpose/core/visualization/image.py b/main/transformer_utils/mmpose/core/visualization/image.py index d244b2b12cff970c810ae0798164e835dd6226e4..8188ccb0ed42427dc4311d20e135d115d9e5e6fc 100644 --- a/main/transformer_utils/mmpose/core/visualization/image.py +++ b/main/transformer_utils/mmpose/core/visualization/image.py @@ -7,7 +7,7 @@ import cv2 import mmcv import numpy as np from matplotlib import pyplot as plt -from mmcv.utils.misc import deprecated_api_warning +from mmengine.utils import deprecated_api_warning from mmcv.visualization.color import color_val try: diff --git a/main/transformer_utils/mmpose/models/__init__.py b/main/transformer_utils/mmpose/models/__init__.py index 641d115a693abff882fa7604811430f8e6b605ab..fa68fc72fbce4204da6bc576daf8d04a9819bf52 100644 --- a/main/transformer_utils/mmpose/models/__init__.py +++ b/main/transformer_utils/mmpose/models/__init__.py @@ -3,9 +3,9 @@ from .builder import (BACKBONES, HEADS, LOSSES, MESH_MODELS, NECKS, POSENETS, build_backbone, build_head, build_loss, build_mesh_model, build_neck, build_posenet) from .detectors import * # noqa +from .backbones import * from .heads import * # noqa from .losses import * # noqa -from .necks import * # noqa from .utils import * # noqa diff --git a/main/transformer_utils/mmpose/models/backbones/__init__.py b/main/transformer_utils/mmpose/models/backbones/__init__.py index 06717917a2dbd08800587d3ffa193149e42a653c..2003ee3af7c44b6fcbf3b46e0ac1e00785f7a6f1 100644 --- a/main/transformer_utils/mmpose/models/backbones/__init__.py +++ b/main/transformer_utils/mmpose/models/backbones/__init__.py @@ -1,41 +1,42 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-from .alexnet import AlexNet -from .cpm import CPM -from .hourglass import HourglassNet -from .hourglass_ae import HourglassAENet -from .hrformer import HRFormer -from .hrnet import HRNet -from .i3d import I3D -from .litehrnet import LiteHRNet -from .mobilenet_v2 import MobileNetV2 -from .mobilenet_v3 import MobileNetV3 -from .mspn import MSPN -from .pvt import PyramidVisionTransformer, PyramidVisionTransformerV2 -from .regnet import RegNet -from .resnest import ResNeSt -from .resnet import ResNet, ResNetV1d -from .resnext import ResNeXt -from .rsn import RSN -from .scnet import SCNet -from .seresnet import SEResNet -from .seresnext import SEResNeXt -from .shufflenet_v1 import ShuffleNetV1 -from .shufflenet_v2 import ShuffleNetV2 -from .swin import SwinTransformer -from .tcformer import TCFormer -from .tcn import TCN -from .v2v_net import V2VNet -from .vgg import VGG -from .vipnas_mbv3 import ViPNAS_MobileNetV3 -from .vipnas_resnet import ViPNAS_ResNet -from .hrt import HRT +# from .alexnet import AlexNet +# from .cpm import CPM +# from .hourglass import HourglassNet +# from .hourglass_ae import HourglassAENet +# from .hrformer import HRFormer +# from .hrnet import HRNet +# from .i3d import I3D +# from .litehrnet import LiteHRNet +# from .mobilenet_v2 import MobileNetV2 +# from .mobilenet_v3 import MobileNetV3 +# from .mspn import MSPN +# from .pvt import PyramidVisionTransformer, PyramidVisionTransformerV2 +# from .regnet import RegNet +# from .resnest import ResNeSt +# from .resnet import ResNet, ResNetV1d +# from .resnext import ResNeXt +# from .rsn import RSN +# from .scnet import SCNet +# from .seresnet import SEResNet +# from .seresnext import SEResNeXt +# from .shufflenet_v1 import ShuffleNetV1 +# from .shufflenet_v2 import ShuffleNetV2 +# from .swin import SwinTransformer +# from .tcformer import TCFormer +# from .tcn import TCN +# from .v2v_net import V2VNet +# from .vgg import VGG +# from .vipnas_mbv3 import ViPNAS_MobileNetV3 +# from .vipnas_resnet import ViPNAS_ResNet +# from .hrt import HRT from .vit import ViT -__all__ = [ - 'AlexNet', 'HourglassNet', 'HourglassAENet', 'HRNet', 'MobileNetV2', - 'MobileNetV3', 'RegNet', 'ResNet', 'ResNetV1d', 'ResNeXt', 'SCNet', - 'SEResNet', 'SEResNeXt', 'ShuffleNetV1', 'ShuffleNetV2', 'CPM', 'RSN', - 'MSPN', 'ResNeSt', 'VGG', 'TCN', 'ViPNAS_ResNet', 'ViPNAS_MobileNetV3', - 'LiteHRNet', 'V2VNet', 'HRFormer', 'PyramidVisionTransformer', - 'PyramidVisionTransformerV2', 'SwinTransformer', 'I3D', 'TCFormer', 'ViT' -] +# __all__ = [ +# 'AlexNet', 'HourglassNet', 'HourglassAENet', 'HRNet', 'MobileNetV2', +# 'MobileNetV3', 'RegNet', 'ResNet', 'ResNetV1d', 'ResNeXt', 'SCNet', +# 'SEResNet', 'SEResNeXt', 'ShuffleNetV1', 'ShuffleNetV2', 'CPM', 'RSN', +# 'MSPN', 'ResNeSt', 'VGG', 'TCN', 'ViPNAS_ResNet', 'ViPNAS_MobileNetV3', +# 'LiteHRNet', 'V2VNet', 'HRFormer', 'PyramidVisionTransformer', +# 'PyramidVisionTransformerV2', 'SwinTransformer', 'I3D', 'TCFormer', 'ViT' +# ] +__all__ = ['ViT'] \ No newline at end of file diff --git a/main/transformer_utils/mmpose/models/backbones/alexnet.py b/main/transformer_utils/mmpose/models/backbones/alexnet.py deleted file mode 100644 index a8efd74d118f5abe4d9c880ebe80ce7cbd58c6b2..0000000000000000000000000000000000000000 --- a/main/transformer_utils/mmpose/models/backbones/alexnet.py +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-import torch.nn as nn - -from ..builder import BACKBONES -from .base_backbone import BaseBackbone - - -@BACKBONES.register_module() -class AlexNet(BaseBackbone): - """`AlexNet `__ backbone. - - The input for AlexNet is a 224x224 RGB image. - - Args: - num_classes (int): number of classes for classification. - The default value is -1, which uses the backbone as - a feature extractor without the top classifier. - """ - - def __init__(self, num_classes=-1): - super().__init__() - self.num_classes = num_classes - self.features = nn.Sequential( - nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2), - nn.ReLU(inplace=True), - nn.MaxPool2d(kernel_size=3, stride=2), - nn.Conv2d(64, 192, kernel_size=5, padding=2), - nn.ReLU(inplace=True), - nn.MaxPool2d(kernel_size=3, stride=2), - nn.Conv2d(192, 384, kernel_size=3, padding=1), - nn.ReLU(inplace=True), - nn.Conv2d(384, 256, kernel_size=3, padding=1), - nn.ReLU(inplace=True), - nn.Conv2d(256, 256, kernel_size=3, padding=1), - nn.ReLU(inplace=True), - nn.MaxPool2d(kernel_size=3, stride=2), - ) - if self.num_classes > 0: - self.classifier = nn.Sequential( - nn.Dropout(), - nn.Linear(256 * 6 * 6, 4096), - nn.ReLU(inplace=True), - nn.Dropout(), - nn.Linear(4096, 4096), - nn.ReLU(inplace=True), - nn.Linear(4096, num_classes), - ) - - def forward(self, x): - - x = self.features(x) - if self.num_classes > 0: - x = x.view(x.size(0), 256 * 6 * 6) - x = self.classifier(x) - - return x diff --git a/main/transformer_utils/mmpose/models/backbones/cpm.py b/main/transformer_utils/mmpose/models/backbones/cpm.py deleted file mode 100644 index 458245d755f930f4ff625a754aadbab5c13494a6..0000000000000000000000000000000000000000 --- a/main/transformer_utils/mmpose/models/backbones/cpm.py +++ /dev/null @@ -1,186 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import copy - -import torch -import torch.nn as nn -from mmcv.cnn import ConvModule, constant_init, normal_init -from torch.nn.modules.batchnorm import _BatchNorm - -from mmpose.utils import get_root_logger -from ..builder import BACKBONES -from .base_backbone import BaseBackbone -from .utils import load_checkpoint - - -class CpmBlock(nn.Module): - """CpmBlock for Convolutional Pose Machine. - - Args: - in_channels (int): Input channels of this block. - channels (list): Output channels of each conv module. - kernels (list): Kernel sizes of each conv module. - """ - - def __init__(self, - in_channels, - channels=(128, 128, 128), - kernels=(11, 11, 11), - norm_cfg=None): - super().__init__() - - assert len(channels) == len(kernels) - layers = [] - for i in range(len(channels)): - if i == 0: - input_channels = in_channels - else: - input_channels = channels[i - 1] - layers.append( - ConvModule( - input_channels, - channels[i], - kernels[i], - padding=(kernels[i] - 1) // 2, - norm_cfg=norm_cfg)) - self.model = nn.Sequential(*layers) - - def forward(self, x): - """Model forward function.""" - out = self.model(x) - return out - - -@BACKBONES.register_module() -class CPM(BaseBackbone): - """CPM backbone. - - Convolutional Pose Machines. - More details can be found in the `paper - `__ . - - Args: - in_channels (int): The input channels of the CPM. - out_channels (int): The output channels of the CPM. - feat_channels (int): Feature channel of each CPM stage. - middle_channels (int): Feature channel of conv after the middle stage. - num_stages (int): Number of stages. - norm_cfg (dict): Dictionary to construct and config norm layer. 
- - Example: - >>> from mmpose.models import CPM - >>> import torch - >>> self = CPM(3, 17) - >>> self.eval() - >>> inputs = torch.rand(1, 3, 368, 368) - >>> level_outputs = self.forward(inputs) - >>> for level_output in level_outputs: - ... print(tuple(level_output.shape)) - (1, 17, 46, 46) - (1, 17, 46, 46) - (1, 17, 46, 46) - (1, 17, 46, 46) - (1, 17, 46, 46) - (1, 17, 46, 46) - """ - - def __init__(self, - in_channels, - out_channels, - feat_channels=128, - middle_channels=32, - num_stages=6, - norm_cfg=dict(type='BN', requires_grad=True)): - # Protect mutable default arguments - norm_cfg = copy.deepcopy(norm_cfg) - super().__init__() - - assert in_channels == 3 - - self.num_stages = num_stages - assert self.num_stages >= 1 - - self.stem = nn.Sequential( - ConvModule(in_channels, 128, 9, padding=4, norm_cfg=norm_cfg), - nn.MaxPool2d(kernel_size=3, stride=2, padding=1), - ConvModule(128, 128, 9, padding=4, norm_cfg=norm_cfg), - nn.MaxPool2d(kernel_size=3, stride=2, padding=1), - ConvModule(128, 128, 9, padding=4, norm_cfg=norm_cfg), - nn.MaxPool2d(kernel_size=3, stride=2, padding=1), - ConvModule(128, 32, 5, padding=2, norm_cfg=norm_cfg), - ConvModule(32, 512, 9, padding=4, norm_cfg=norm_cfg), - ConvModule(512, 512, 1, padding=0, norm_cfg=norm_cfg), - ConvModule(512, out_channels, 1, padding=0, act_cfg=None)) - - self.middle = nn.Sequential( - ConvModule(in_channels, 128, 9, padding=4, norm_cfg=norm_cfg), - nn.MaxPool2d(kernel_size=3, stride=2, padding=1), - ConvModule(128, 128, 9, padding=4, norm_cfg=norm_cfg), - nn.MaxPool2d(kernel_size=3, stride=2, padding=1), - ConvModule(128, 128, 9, padding=4, norm_cfg=norm_cfg), - nn.MaxPool2d(kernel_size=3, stride=2, padding=1)) - - self.cpm_stages = nn.ModuleList([ - CpmBlock( - middle_channels + out_channels, - channels=[feat_channels, feat_channels, feat_channels], - kernels=[11, 11, 11], - norm_cfg=norm_cfg) for _ in range(num_stages - 1) - ]) - - self.middle_conv = nn.ModuleList([ - nn.Sequential( - ConvModule( - 128, middle_channels, 5, padding=2, norm_cfg=norm_cfg)) - for _ in range(num_stages - 1) - ]) - - self.out_convs = nn.ModuleList([ - nn.Sequential( - ConvModule( - feat_channels, - feat_channels, - 1, - padding=0, - norm_cfg=norm_cfg), - ConvModule(feat_channels, out_channels, 1, act_cfg=None)) - for _ in range(num_stages - 1) - ]) - - def init_weights(self, pretrained=None): - """Initialize the weights in backbone. - - Args: - pretrained (str, optional): Path to pre-trained weights. - Defaults to None. 
- """ - if isinstance(pretrained, str): - logger = get_root_logger() - load_checkpoint(self, pretrained, strict=False, logger=logger) - elif pretrained is None: - for m in self.modules(): - if isinstance(m, nn.Conv2d): - normal_init(m, std=0.001) - elif isinstance(m, (_BatchNorm, nn.GroupNorm)): - constant_init(m, 1) - else: - raise TypeError('pretrained must be a str or None') - - def forward(self, x): - """Model forward function.""" - stage1_out = self.stem(x) - middle_out = self.middle(x) - out_feats = [] - - out_feats.append(stage1_out) - - for ind in range(self.num_stages - 1): - single_stage = self.cpm_stages[ind] - out_conv = self.out_convs[ind] - - inp_feat = torch.cat( - [out_feats[-1], self.middle_conv[ind](middle_out)], 1) - cpm_feat = single_stage(inp_feat) - out_feat = out_conv(cpm_feat) - out_feats.append(out_feat) - - return out_feats diff --git a/main/transformer_utils/mmpose/models/backbones/hourglass.py b/main/transformer_utils/mmpose/models/backbones/hourglass.py deleted file mode 100644 index bf75fad9895ebfd3f3c2a6bffedb3d7e4cc77cba..0000000000000000000000000000000000000000 --- a/main/transformer_utils/mmpose/models/backbones/hourglass.py +++ /dev/null @@ -1,212 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import copy - -import torch.nn as nn -from mmcv.cnn import ConvModule, constant_init, normal_init -from torch.nn.modules.batchnorm import _BatchNorm - -from mmpose.utils import get_root_logger -from ..builder import BACKBONES -from .base_backbone import BaseBackbone -from .resnet import BasicBlock, ResLayer -from .utils import load_checkpoint - - -class HourglassModule(nn.Module): - """Hourglass Module for HourglassNet backbone. - - Generate module recursively and use BasicBlock as the base unit. - - Args: - depth (int): Depth of current HourglassModule. - stage_channels (list[int]): Feature channels of sub-modules in current - and follow-up HourglassModule. - stage_blocks (list[int]): Number of sub-modules stacked in current and - follow-up HourglassModule. - norm_cfg (dict): Dictionary to construct and config norm layer. - """ - - def __init__(self, - depth, - stage_channels, - stage_blocks, - norm_cfg=dict(type='BN', requires_grad=True)): - # Protect mutable default arguments - norm_cfg = copy.deepcopy(norm_cfg) - super().__init__() - - self.depth = depth - - cur_block = stage_blocks[0] - next_block = stage_blocks[1] - - cur_channel = stage_channels[0] - next_channel = stage_channels[1] - - self.up1 = ResLayer( - BasicBlock, cur_block, cur_channel, cur_channel, norm_cfg=norm_cfg) - - self.low1 = ResLayer( - BasicBlock, - cur_block, - cur_channel, - next_channel, - stride=2, - norm_cfg=norm_cfg) - - if self.depth > 1: - self.low2 = HourglassModule(depth - 1, stage_channels[1:], - stage_blocks[1:]) - else: - self.low2 = ResLayer( - BasicBlock, - next_block, - next_channel, - next_channel, - norm_cfg=norm_cfg) - - self.low3 = ResLayer( - BasicBlock, - cur_block, - next_channel, - cur_channel, - norm_cfg=norm_cfg, - downsample_first=False) - - self.up2 = nn.Upsample(scale_factor=2) - - def forward(self, x): - """Model forward function.""" - up1 = self.up1(x) - low1 = self.low1(x) - low2 = self.low2(low1) - low3 = self.low3(low2) - up2 = self.up2(low3) - return up1 + up2 - - -@BACKBONES.register_module() -class HourglassNet(BaseBackbone): - """HourglassNet backbone. - - Stacked Hourglass Networks for Human Pose Estimation. - More details can be found in the `paper - `__ . - - Args: - downsample_times (int): Downsample times in a HourglassModule. 
- num_stacks (int): Number of HourglassModule modules stacked, - 1 for Hourglass-52, 2 for Hourglass-104. - stage_channels (list[int]): Feature channel of each sub-module in a - HourglassModule. - stage_blocks (list[int]): Number of sub-modules stacked in a - HourglassModule. - feat_channel (int): Feature channel of conv after a HourglassModule. - norm_cfg (dict): Dictionary to construct and config norm layer. - - Example: - >>> from mmpose.models import HourglassNet - >>> import torch - >>> self = HourglassNet() - >>> self.eval() - >>> inputs = torch.rand(1, 3, 511, 511) - >>> level_outputs = self.forward(inputs) - >>> for level_output in level_outputs: - ... print(tuple(level_output.shape)) - (1, 256, 128, 128) - (1, 256, 128, 128) - """ - - def __init__(self, - downsample_times=5, - num_stacks=2, - stage_channels=(256, 256, 384, 384, 384, 512), - stage_blocks=(2, 2, 2, 2, 2, 4), - feat_channel=256, - norm_cfg=dict(type='BN', requires_grad=True)): - # Protect mutable default arguments - norm_cfg = copy.deepcopy(norm_cfg) - super().__init__() - - self.num_stacks = num_stacks - assert self.num_stacks >= 1 - assert len(stage_channels) == len(stage_blocks) - assert len(stage_channels) > downsample_times - - cur_channel = stage_channels[0] - - self.stem = nn.Sequential( - ConvModule(3, 128, 7, padding=3, stride=2, norm_cfg=norm_cfg), - ResLayer(BasicBlock, 1, 128, 256, stride=2, norm_cfg=norm_cfg)) - - self.hourglass_modules = nn.ModuleList([ - HourglassModule(downsample_times, stage_channels, stage_blocks) - for _ in range(num_stacks) - ]) - - self.inters = ResLayer( - BasicBlock, - num_stacks - 1, - cur_channel, - cur_channel, - norm_cfg=norm_cfg) - - self.conv1x1s = nn.ModuleList([ - ConvModule( - cur_channel, cur_channel, 1, norm_cfg=norm_cfg, act_cfg=None) - for _ in range(num_stacks - 1) - ]) - - self.out_convs = nn.ModuleList([ - ConvModule( - cur_channel, feat_channel, 3, padding=1, norm_cfg=norm_cfg) - for _ in range(num_stacks) - ]) - - self.remap_convs = nn.ModuleList([ - ConvModule( - feat_channel, cur_channel, 1, norm_cfg=norm_cfg, act_cfg=None) - for _ in range(num_stacks - 1) - ]) - - self.relu = nn.ReLU(inplace=True) - - def init_weights(self, pretrained=None): - """Initialize the weights in backbone. - - Args: - pretrained (str, optional): Path to pre-trained weights. - Defaults to None. 
- """ - if isinstance(pretrained, str): - logger = get_root_logger() - load_checkpoint(self, pretrained, strict=False, logger=logger) - elif pretrained is None: - for m in self.modules(): - if isinstance(m, nn.Conv2d): - normal_init(m, std=0.001) - elif isinstance(m, (_BatchNorm, nn.GroupNorm)): - constant_init(m, 1) - else: - raise TypeError('pretrained must be a str or None') - - def forward(self, x): - """Model forward function.""" - inter_feat = self.stem(x) - out_feats = [] - - for ind in range(self.num_stacks): - single_hourglass = self.hourglass_modules[ind] - out_conv = self.out_convs[ind] - - hourglass_feat = single_hourglass(inter_feat) - out_feat = out_conv(hourglass_feat) - out_feats.append(out_feat) - - if ind < self.num_stacks - 1: - inter_feat = self.conv1x1s[ind]( - inter_feat) + self.remap_convs[ind]( - out_feat) - inter_feat = self.inters[ind](self.relu(inter_feat)) - - return out_feats diff --git a/main/transformer_utils/mmpose/models/backbones/hourglass_ae.py b/main/transformer_utils/mmpose/models/backbones/hourglass_ae.py deleted file mode 100644 index 5a700e5cb2157fd1dc16771145f065e991b270ea..0000000000000000000000000000000000000000 --- a/main/transformer_utils/mmpose/models/backbones/hourglass_ae.py +++ /dev/null @@ -1,212 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import copy - -import torch.nn as nn -from mmcv.cnn import ConvModule, MaxPool2d, constant_init, normal_init -from torch.nn.modules.batchnorm import _BatchNorm - -from mmpose.utils import get_root_logger -from ..builder import BACKBONES -from .base_backbone import BaseBackbone -from .utils import load_checkpoint - - -class HourglassAEModule(nn.Module): - """Modified Hourglass Module for HourglassNet_AE backbone. - - Generate module recursively and use BasicBlock as the base unit. - - Args: - depth (int): Depth of current HourglassModule. - stage_channels (list[int]): Feature channels of sub-modules in current - and follow-up HourglassModule. - norm_cfg (dict): Dictionary to construct and config norm layer. - """ - - def __init__(self, - depth, - stage_channels, - norm_cfg=dict(type='BN', requires_grad=True)): - # Protect mutable default arguments - norm_cfg = copy.deepcopy(norm_cfg) - super().__init__() - - self.depth = depth - - cur_channel = stage_channels[0] - next_channel = stage_channels[1] - - self.up1 = ConvModule( - cur_channel, cur_channel, 3, padding=1, norm_cfg=norm_cfg) - - self.pool1 = MaxPool2d(2, 2) - - self.low1 = ConvModule( - cur_channel, next_channel, 3, padding=1, norm_cfg=norm_cfg) - - if self.depth > 1: - self.low2 = HourglassAEModule(depth - 1, stage_channels[1:]) - else: - self.low2 = ConvModule( - next_channel, next_channel, 3, padding=1, norm_cfg=norm_cfg) - - self.low3 = ConvModule( - next_channel, cur_channel, 3, padding=1, norm_cfg=norm_cfg) - - self.up2 = nn.UpsamplingNearest2d(scale_factor=2) - - def forward(self, x): - """Model forward function.""" - up1 = self.up1(x) - pool1 = self.pool1(x) - low1 = self.low1(pool1) - low2 = self.low2(low1) - low3 = self.low3(low2) - up2 = self.up2(low3) - return up1 + up2 - - -@BACKBONES.register_module() -class HourglassAENet(BaseBackbone): - """Hourglass-AE Network proposed by Newell et al. - - Associative Embedding: End-to-End Learning for Joint - Detection and Grouping. - - More details can be found in the `paper - `__ . - - Args: - downsample_times (int): Downsample times in a HourglassModule. - num_stacks (int): Number of HourglassModule modules stacked, - 1 for Hourglass-52, 2 for Hourglass-104. 
- stage_channels (list[int]): Feature channel of each sub-module in a - HourglassModule. - stage_blocks (list[int]): Number of sub-modules stacked in a - HourglassModule. - feat_channels (int): Feature channel of conv after a HourglassModule. - norm_cfg (dict): Dictionary to construct and config norm layer. - - Example: - >>> from mmpose.models import HourglassAENet - >>> import torch - >>> self = HourglassAENet() - >>> self.eval() - >>> inputs = torch.rand(1, 3, 512, 512) - >>> level_outputs = self.forward(inputs) - >>> for level_output in level_outputs: - ... print(tuple(level_output.shape)) - (1, 34, 128, 128) - """ - - def __init__(self, - downsample_times=4, - num_stacks=1, - out_channels=34, - stage_channels=(256, 384, 512, 640, 768), - feat_channels=256, - norm_cfg=dict(type='BN', requires_grad=True)): - # Protect mutable default arguments - norm_cfg = copy.deepcopy(norm_cfg) - super().__init__() - - self.num_stacks = num_stacks - assert self.num_stacks >= 1 - assert len(stage_channels) > downsample_times - - cur_channels = stage_channels[0] - - self.stem = nn.Sequential( - ConvModule(3, 64, 7, padding=3, stride=2, norm_cfg=norm_cfg), - ConvModule(64, 128, 3, padding=1, norm_cfg=norm_cfg), - MaxPool2d(2, 2), - ConvModule(128, 128, 3, padding=1, norm_cfg=norm_cfg), - ConvModule(128, feat_channels, 3, padding=1, norm_cfg=norm_cfg), - ) - - self.hourglass_modules = nn.ModuleList([ - nn.Sequential( - HourglassAEModule( - downsample_times, stage_channels, norm_cfg=norm_cfg), - ConvModule( - feat_channels, - feat_channels, - 3, - padding=1, - norm_cfg=norm_cfg), - ConvModule( - feat_channels, - feat_channels, - 3, - padding=1, - norm_cfg=norm_cfg)) for _ in range(num_stacks) - ]) - - self.out_convs = nn.ModuleList([ - ConvModule( - cur_channels, - out_channels, - 1, - padding=0, - norm_cfg=None, - act_cfg=None) for _ in range(num_stacks) - ]) - - self.remap_out_convs = nn.ModuleList([ - ConvModule( - out_channels, - feat_channels, - 1, - norm_cfg=norm_cfg, - act_cfg=None) for _ in range(num_stacks - 1) - ]) - - self.remap_feature_convs = nn.ModuleList([ - ConvModule( - feat_channels, - feat_channels, - 1, - norm_cfg=norm_cfg, - act_cfg=None) for _ in range(num_stacks - 1) - ]) - - self.relu = nn.ReLU(inplace=True) - - def init_weights(self, pretrained=None): - """Initialize the weights in backbone. - - Args: - pretrained (str, optional): Path to pre-trained weights. - Defaults to None. 
- """ - if isinstance(pretrained, str): - logger = get_root_logger() - load_checkpoint(self, pretrained, strict=False, logger=logger) - elif pretrained is None: - for m in self.modules(): - if isinstance(m, nn.Conv2d): - normal_init(m, std=0.001) - elif isinstance(m, (_BatchNorm, nn.GroupNorm)): - constant_init(m, 1) - else: - raise TypeError('pretrained must be a str or None') - - def forward(self, x): - """Model forward function.""" - inter_feat = self.stem(x) - out_feats = [] - - for ind in range(self.num_stacks): - single_hourglass = self.hourglass_modules[ind] - out_conv = self.out_convs[ind] - - hourglass_feat = single_hourglass(inter_feat) - out_feat = out_conv(hourglass_feat) - out_feats.append(out_feat) - - if ind < self.num_stacks - 1: - inter_feat = inter_feat + self.remap_out_convs[ind]( - out_feat) + self.remap_feature_convs[ind]( - hourglass_feat) - - return out_feats diff --git a/main/transformer_utils/mmpose/models/backbones/hrformer.py b/main/transformer_utils/mmpose/models/backbones/hrformer.py deleted file mode 100644 index b843300a9fdb85908678c5a3fd45ce19e97ce2fe..0000000000000000000000000000000000000000 --- a/main/transformer_utils/mmpose/models/backbones/hrformer.py +++ /dev/null @@ -1,746 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. - -import math - -import torch -import torch.nn as nn -# from timm.models.layers import to_2tuple, trunc_normal_ -from mmcv.cnn import (build_activation_layer, build_conv_layer, - build_norm_layer, trunc_normal_init) -from mmcv.cnn.bricks.transformer import build_dropout -from mmcv.runner import BaseModule -from torch.nn.functional import pad - -from ..builder import BACKBONES -from .hrnet import Bottleneck, HRModule, HRNet - - -def nlc_to_nchw(x, hw_shape): - """Convert [N, L, C] shape tensor to [N, C, H, W] shape tensor. - - Args: - x (Tensor): The input tensor of shape [N, L, C] before conversion. - hw_shape (Sequence[int]): The height and width of output feature map. - - Returns: - Tensor: The output tensor of shape [N, C, H, W] after conversion. - """ - H, W = hw_shape - assert len(x.shape) == 3 - B, L, C = x.shape - assert L == H * W, 'The seq_len doesn\'t match H, W' - return x.transpose(1, 2).reshape(B, C, H, W) - - -def nchw_to_nlc(x): - """Flatten [N, C, H, W] shape tensor to [N, L, C] shape tensor. - - Args: - x (Tensor): The input tensor of shape [N, C, H, W] before conversion. - - Returns: - Tensor: The output tensor of shape [N, L, C] after conversion. - """ - assert len(x.shape) == 4 - return x.flatten(2).transpose(1, 2).contiguous() - - -def build_drop_path(drop_path_rate): - """Build drop path layer.""" - return build_dropout(dict(type='DropPath', drop_prob=drop_path_rate)) - - -class WindowMSA(BaseModule): - """Window based multi-head self-attention (W-MSA) module with relative - position bias. - - Args: - embed_dims (int): Number of input channels. - num_heads (int): Number of attention heads. - window_size (tuple[int]): The height and width of the window. - qkv_bias (bool, optional): If True, add a learnable bias to q, k, v. - Default: True. - qk_scale (float | None, optional): Override default qk scale of - head_dim ** -0.5 if set. Default: None. - attn_drop_rate (float, optional): Dropout ratio of attention weight. - Default: 0.0 - proj_drop_rate (float, optional): Dropout ratio of output. Default: 0. - with_rpe (bool, optional): If True, use relative position bias. - Default: True. - init_cfg (dict | None, optional): The Config for initialization. - Default: None. 
- """ - - def __init__(self, - embed_dims, - num_heads, - window_size, - qkv_bias=True, - qk_scale=None, - attn_drop_rate=0., - proj_drop_rate=0., - with_rpe=True, - init_cfg=None): - - super().__init__(init_cfg=init_cfg) - self.embed_dims = embed_dims - self.window_size = window_size # Wh, Ww - self.num_heads = num_heads - head_embed_dims = embed_dims // num_heads - self.scale = qk_scale or head_embed_dims**-0.5 - - self.with_rpe = with_rpe - if self.with_rpe: - # define a parameter table of relative position bias - self.relative_position_bias_table = nn.Parameter( - torch.zeros( - (2 * window_size[0] - 1) * (2 * window_size[1] - 1), - num_heads)) # 2*Wh-1 * 2*Ww-1, nH - - Wh, Ww = self.window_size - rel_index_coords = self.double_step_seq(2 * Ww - 1, Wh, 1, Ww) - rel_position_index = rel_index_coords + rel_index_coords.T - rel_position_index = rel_position_index.flip(1).contiguous() - self.register_buffer('relative_position_index', rel_position_index) - - self.qkv = nn.Linear(embed_dims, embed_dims * 3, bias=qkv_bias) - self.attn_drop = nn.Dropout(attn_drop_rate) - self.proj = nn.Linear(embed_dims, embed_dims) - self.proj_drop = nn.Dropout(proj_drop_rate) - - self.softmax = nn.Softmax(dim=-1) - - def init_weights(self): - trunc_normal_init(self.relative_position_bias_table, std=0.02) - - def forward(self, x, mask=None): - """ - Args: - - x (tensor): input features with shape of (B*num_windows, N, C) - mask (tensor | None, Optional): mask with shape of (num_windows, - Wh*Ww, Wh*Ww), value should be between (-inf, 0]. - """ - B, N, C = x.shape - qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, - C // self.num_heads).permute(2, 0, 3, 1, 4) - q, k, v = qkv[0], qkv[1], qkv[2] - - q = q * self.scale - attn = (q @ k.transpose(-2, -1)) - - if self.with_rpe: - relative_position_bias = self.relative_position_bias_table[ - self.relative_position_index.view(-1)].view( - self.window_size[0] * self.window_size[1], - self.window_size[0] * self.window_size[1], - -1) # Wh*Ww,Wh*Ww,nH - relative_position_bias = relative_position_bias.permute( - 2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww - attn = attn + relative_position_bias.unsqueeze(0) - - if mask is not None: - nW = mask.shape[0] - attn = attn.view(B // nW, nW, self.num_heads, N, - N) + mask.unsqueeze(1).unsqueeze(0) - attn = attn.view(-1, self.num_heads, N, N) - attn = self.softmax(attn) - - attn = self.attn_drop(attn) - - x = (attn @ v).transpose(1, 2).reshape(B, N, C) - x = self.proj(x) - x = self.proj_drop(x) - return x - - @staticmethod - def double_step_seq(step1, len1, step2, len2): - seq1 = torch.arange(0, step1 * len1, step1) - seq2 = torch.arange(0, step2 * len2, step2) - return (seq1[:, None] + seq2[None, :]).reshape(1, -1) - - -class LocalWindowSelfAttention(BaseModule): - r""" Local-window Self Attention (LSA) module with relative position bias. - - This module is the short-range self-attention module in the - Interlaced Sparse Self-Attention `_. - - Args: - embed_dims (int): Number of input channels. - num_heads (int): Number of attention heads. - window_size (tuple[int] | int): The height and width of the window. - qkv_bias (bool, optional): If True, add a learnable bias to q, k, v. - Default: True. - qk_scale (float | None, optional): Override default qk scale of - head_dim ** -0.5 if set. Default: None. - attn_drop_rate (float, optional): Dropout ratio of attention weight. - Default: 0.0 - proj_drop_rate (float, optional): Dropout ratio of output. Default: 0. - with_rpe (bool, optional): If True, use relative position bias. 
- Default: True. - with_pad_mask (bool, optional): If True, mask out the padded tokens in - the attention process. Default: False. - init_cfg (dict | None, optional): The Config for initialization. - Default: None. - """ - - def __init__(self, - embed_dims, - num_heads, - window_size, - qkv_bias=True, - qk_scale=None, - attn_drop_rate=0., - proj_drop_rate=0., - with_rpe=True, - with_pad_mask=False, - init_cfg=None): - super().__init__(init_cfg=init_cfg) - if isinstance(window_size, int): - window_size = (window_size, window_size) - self.window_size = window_size - self.with_pad_mask = with_pad_mask - self.attn = WindowMSA( - embed_dims=embed_dims, - num_heads=num_heads, - window_size=window_size, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - attn_drop_rate=attn_drop_rate, - proj_drop_rate=proj_drop_rate, - with_rpe=with_rpe, - init_cfg=init_cfg) - - def forward(self, x, H, W, **kwargs): - """Forward function.""" - B, N, C = x.shape - x = x.view(B, H, W, C) - Wh, Ww = self.window_size - - # center-pad the feature on H and W axes - pad_h = math.ceil(H / Wh) * Wh - H - pad_w = math.ceil(W / Ww) * Ww - W - x = pad(x, (0, 0, pad_w // 2, pad_w - pad_w // 2, pad_h // 2, - pad_h - pad_h // 2)) - - # permute - x = x.view(B, math.ceil(H / Wh), Wh, math.ceil(W / Ww), Ww, C) - x = x.permute(0, 1, 3, 2, 4, 5) - x = x.reshape(-1, Wh * Ww, C) # (B*num_window, Wh*Ww, C) - - # attention - if self.with_pad_mask and pad_h > 0 and pad_w > 0: - pad_mask = x.new_zeros(1, H, W, 1) - pad_mask = pad( - pad_mask, [ - 0, 0, pad_w // 2, pad_w - pad_w // 2, pad_h // 2, - pad_h - pad_h // 2 - ], - value=-float('inf')) - pad_mask = pad_mask.view(1, math.ceil(H / Wh), Wh, - math.ceil(W / Ww), Ww, 1) - pad_mask = pad_mask.permute(1, 3, 0, 2, 4, 5) - pad_mask = pad_mask.reshape(-1, Wh * Ww) - pad_mask = pad_mask[:, None, :].expand([-1, Wh * Ww, -1]) - out = self.attn(x, pad_mask, **kwargs) - else: - out = self.attn(x, **kwargs) - - # reverse permutation - out = out.reshape(B, math.ceil(H / Wh), math.ceil(W / Ww), Wh, Ww, C) - out = out.permute(0, 1, 3, 2, 4, 5) - out = out.reshape(B, H + pad_h, W + pad_w, C) - - # de-pad - out = out[:, pad_h // 2:H + pad_h // 2, pad_w // 2:W + pad_w // 2] - return out.reshape(B, N, C) - - -class CrossFFN(BaseModule): - r"""FFN with Depthwise Conv of HRFormer. - - Args: - in_features (int): The feature dimension. - hidden_features (int, optional): The hidden dimension of FFNs. - Defaults: The same as in_features. - act_cfg (dict, optional): Config of activation layer. - Default: dict(type='GELU'). - dw_act_cfg (dict, optional): Config of activation layer appended - right after DW Conv. Default: dict(type='GELU'). - norm_cfg (dict, optional): Config of norm layer. - Default: dict(type='SyncBN'). - init_cfg (dict | list | None, optional): The init config. - Default: None. 
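A sketch, not part of the patch: the LocalWindowSelfAttention forward above center-pads H and W to multiples of the window size, then folds windows into the batch axis before calling WindowMSA. The reshape logic in isolation (window_partition / window_reverse are illustrative helper names):

import torch

def window_partition(x, wh, ww):
    """(B, H, W, C) -> (B * num_windows, wh * ww, C); H and W already padded."""
    b, h, w, c = x.shape
    x = x.view(b, h // wh, wh, w // ww, ww, c)
    x = x.permute(0, 1, 3, 2, 4, 5).contiguous()
    return x.view(-1, wh * ww, c)

def window_reverse(windows, h, w, wh, ww):
    """Inverse of window_partition: (B * num_windows, wh * ww, C) -> (B, H, W, C)."""
    b = windows.shape[0] // ((h // wh) * (w // ww))
    x = windows.view(b, h // wh, w // ww, wh, ww, -1)
    return x.permute(0, 1, 3, 2, 4, 5).contiguous().view(b, h, w, -1)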
- """ - - def __init__(self, - in_features, - hidden_features=None, - out_features=None, - act_cfg=dict(type='GELU'), - dw_act_cfg=dict(type='GELU'), - norm_cfg=dict(type='SyncBN'), - init_cfg=None): - super().__init__(init_cfg=init_cfg) - out_features = out_features or in_features - hidden_features = hidden_features or in_features - self.fc1 = nn.Conv2d(in_features, hidden_features, kernel_size=1) - self.act1 = build_activation_layer(act_cfg) - self.norm1 = build_norm_layer(norm_cfg, hidden_features)[1] - self.dw3x3 = nn.Conv2d( - hidden_features, - hidden_features, - kernel_size=3, - stride=1, - groups=hidden_features, - padding=1) - self.act2 = build_activation_layer(dw_act_cfg) - self.norm2 = build_norm_layer(norm_cfg, hidden_features)[1] - self.fc2 = nn.Conv2d(hidden_features, out_features, kernel_size=1) - self.act3 = build_activation_layer(act_cfg) - self.norm3 = build_norm_layer(norm_cfg, out_features)[1] - - # put the modules togather - self.layers = [ - self.fc1, self.norm1, self.act1, self.dw3x3, self.norm2, self.act2, - self.fc2, self.norm3, self.act3 - ] - - def forward(self, x, H, W): - """Forward function.""" - x = nlc_to_nchw(x, (H, W)) - for layer in self.layers: - x = layer(x) - x = nchw_to_nlc(x) - return x - - -class HRFormerBlock(BaseModule): - """High-Resolution Block for HRFormer. - - Args: - in_features (int): The input dimension. - out_features (int): The output dimension. - num_heads (int): The number of head within each LSA. - window_size (int, optional): The window size for the LSA. - Default: 7 - mlp_ratio (int, optional): The expansion ration of FFN. - Default: 4 - act_cfg (dict, optional): Config of activation layer. - Default: dict(type='GELU'). - norm_cfg (dict, optional): Config of norm layer. - Default: dict(type='SyncBN'). - transformer_norm_cfg (dict, optional): Config of transformer norm - layer. Default: dict(type='LN', eps=1e-6). - init_cfg (dict | list | None, optional): The init config. - Default: None. 
- """ - - expansion = 1 - - def __init__(self, - in_features, - out_features, - num_heads, - window_size=7, - mlp_ratio=4.0, - drop_path=0.0, - act_cfg=dict(type='GELU'), - norm_cfg=dict(type='SyncBN'), - transformer_norm_cfg=dict(type='LN', eps=1e-6), - init_cfg=None, - **kwargs): - super(HRFormerBlock, self).__init__(init_cfg=init_cfg) - self.num_heads = num_heads - self.window_size = window_size - self.mlp_ratio = mlp_ratio - - self.norm1 = build_norm_layer(transformer_norm_cfg, in_features)[1] - self.attn = LocalWindowSelfAttention( - in_features, - num_heads=num_heads, - window_size=window_size, - init_cfg=None, - **kwargs) - - self.norm2 = build_norm_layer(transformer_norm_cfg, out_features)[1] - self.ffn = CrossFFN( - in_features=in_features, - hidden_features=int(in_features * mlp_ratio), - out_features=out_features, - norm_cfg=norm_cfg, - act_cfg=act_cfg, - dw_act_cfg=act_cfg, - init_cfg=None) - - self.drop_path = build_drop_path( - drop_path) if drop_path > 0.0 else nn.Identity() - - def forward(self, x): - """Forward function.""" - B, C, H, W = x.size() - # Attention - x = x.view(B, C, -1).permute(0, 2, 1) - x = x + self.drop_path(self.attn(self.norm1(x), H, W)) - # FFN - x = x + self.drop_path(self.ffn(self.norm2(x), H, W)) - x = x.permute(0, 2, 1).view(B, C, H, W) - return x - - def extra_repr(self): - """(Optional) Set the extra information about this module.""" - return 'num_heads={}, window_size={}, mlp_ratio={}'.format( - self.num_heads, self.window_size, self.mlp_ratio) - - -class HRFomerModule(HRModule): - """High-Resolution Module for HRFormer. - - Args: - num_branches (int): The number of branches in the HRFormerModule. - block (nn.Module): The building block of HRFormer. - The block should be the HRFormerBlock. - num_blocks (tuple): The number of blocks in each branch. - The length must be equal to num_branches. - num_inchannels (tuple): The number of input channels in each branch. - The length must be equal to num_branches. - num_channels (tuple): The number of channels in each branch. - The length must be equal to num_branches. - num_heads (tuple): The number of heads within the LSAs. - num_window_sizes (tuple): The window size for the LSAs. - num_mlp_ratios (tuple): The expansion ratio for the FFNs. - drop_path (int, optional): The drop path rate of HRFomer. - Default: 0.0 - multiscale_output (bool, optional): Whether to output multi-level - features produced by multiple branches. If False, only the first - level feature will be output. Default: True. - conv_cfg (dict, optional): Config of the conv layers. - Default: None. - norm_cfg (dict, optional): Config of the norm layers appended - right after conv. Default: dict(type='SyncBN', requires_grad=True) - transformer_norm_cfg (dict, optional): Config of the norm layers. - Default: dict(type='LN', eps=1e-6) - with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. Default: False - upsample_cfg(dict, optional): The config of upsample layers in fuse - layers. 
Default: dict(mode='bilinear', align_corners=False) - """ - - def __init__(self, - num_branches, - block, - num_blocks, - num_inchannels, - num_channels, - num_heads, - num_window_sizes, - num_mlp_ratios, - multiscale_output=True, - drop_paths=0.0, - with_rpe=True, - with_pad_mask=False, - conv_cfg=None, - norm_cfg=dict(type='SyncBN', requires_grad=True), - transformer_norm_cfg=dict(type='LN', eps=1e-6), - with_cp=False, - upsample_cfg=dict(mode='bilinear', align_corners=False)): - - self.transformer_norm_cfg = transformer_norm_cfg - self.drop_paths = drop_paths - self.num_heads = num_heads - self.num_window_sizes = num_window_sizes - self.num_mlp_ratios = num_mlp_ratios - self.with_rpe = with_rpe - self.with_pad_mask = with_pad_mask - - super().__init__(num_branches, block, num_blocks, num_inchannels, - num_channels, multiscale_output, with_cp, conv_cfg, - norm_cfg, upsample_cfg) - - def _make_one_branch(self, - branch_index, - block, - num_blocks, - num_channels, - stride=1): - """Build one branch.""" - # HRFormerBlock does not support down sample layer yet. - assert stride == 1 and self.in_channels[branch_index] == num_channels[ - branch_index] - layers = [] - layers.append( - block( - self.in_channels[branch_index], - num_channels[branch_index], - num_heads=self.num_heads[branch_index], - window_size=self.num_window_sizes[branch_index], - mlp_ratio=self.num_mlp_ratios[branch_index], - drop_path=self.drop_paths[0], - norm_cfg=self.norm_cfg, - transformer_norm_cfg=self.transformer_norm_cfg, - init_cfg=None, - with_rpe=self.with_rpe, - with_pad_mask=self.with_pad_mask)) - - self.in_channels[ - branch_index] = self.in_channels[branch_index] * block.expansion - for i in range(1, num_blocks[branch_index]): - layers.append( - block( - self.in_channels[branch_index], - num_channels[branch_index], - num_heads=self.num_heads[branch_index], - window_size=self.num_window_sizes[branch_index], - mlp_ratio=self.num_mlp_ratios[branch_index], - drop_path=self.drop_paths[i], - norm_cfg=self.norm_cfg, - transformer_norm_cfg=self.transformer_norm_cfg, - init_cfg=None, - with_rpe=self.with_rpe, - with_pad_mask=self.with_pad_mask)) - return nn.Sequential(*layers) - - def _make_fuse_layers(self): - """Build fuse layers.""" - if self.num_branches == 1: - return None - num_branches = self.num_branches - num_inchannels = self.in_channels - fuse_layers = [] - for i in range(num_branches if self.multiscale_output else 1): - fuse_layer = [] - for j in range(num_branches): - if j > i: - fuse_layer.append( - nn.Sequential( - build_conv_layer( - self.conv_cfg, - num_inchannels[j], - num_inchannels[i], - kernel_size=1, - stride=1, - bias=False), - build_norm_layer(self.norm_cfg, - num_inchannels[i])[1], - nn.Upsample( - scale_factor=2**(j - i), - mode=self.upsample_cfg['mode'], - align_corners=self. 
- upsample_cfg['align_corners']))) - elif j == i: - fuse_layer.append(None) - else: - conv3x3s = [] - for k in range(i - j): - if k == i - j - 1: - num_outchannels_conv3x3 = num_inchannels[i] - with_out_act = False - else: - num_outchannels_conv3x3 = num_inchannels[j] - with_out_act = True - sub_modules = [ - build_conv_layer( - self.conv_cfg, - num_inchannels[j], - num_inchannels[j], - kernel_size=3, - stride=2, - padding=1, - groups=num_inchannels[j], - bias=False, - ), - build_norm_layer(self.norm_cfg, - num_inchannels[j])[1], - build_conv_layer( - self.conv_cfg, - num_inchannels[j], - num_outchannels_conv3x3, - kernel_size=1, - stride=1, - bias=False, - ), - build_norm_layer(self.norm_cfg, - num_outchannels_conv3x3)[1] - ] - if with_out_act: - sub_modules.append(nn.ReLU(False)) - conv3x3s.append(nn.Sequential(*sub_modules)) - fuse_layer.append(nn.Sequential(*conv3x3s)) - fuse_layers.append(nn.ModuleList(fuse_layer)) - - return nn.ModuleList(fuse_layers) - - def get_num_inchannels(self): - """Return the number of input channels.""" - return self.in_channels - - -@BACKBONES.register_module() -class HRFormer(HRNet): - """HRFormer backbone. - - This backbone is the implementation of `HRFormer: High-Resolution - Transformer for Dense Prediction `_. - - Args: - extra (dict): Detailed configuration for each stage of HRNet. - There must be 4 stages, the configuration for each stage must have - 5 keys: - - - num_modules (int): The number of HRModule in this stage. - - num_branches (int): The number of branches in the HRModule. - - block (str): The type of block. - - num_blocks (tuple): The number of blocks in each branch. - The length must be equal to num_branches. - - num_channels (tuple): The number of channels in each branch. - The length must be equal to num_branches. - in_channels (int): Number of input image channels. Normally 3. - conv_cfg (dict): Dictionary to construct and config conv layer. - Default: None. - norm_cfg (dict): Config of norm layer. - Use `SyncBN` by default. - transformer_norm_cfg (dict): Config of transformer norm layer. - Use `LN` by default. - norm_eval (bool): Whether to set norm layers to eval mode, namely, - freeze running stats (mean and var). Note: Effect on Batch Norm - and its variants only. Default: False. - zero_init_residual (bool): Whether to use zero init for last norm layer - in resblocks to let them behave as identity. Default: False. - frozen_stages (int): Stages to be frozen (stop grad and set eval mode). - -1 means not freezing any parameters. Default: -1. 
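A sketch, not part of the patch: the HRFormer constructor further down in this hunk spreads a linearly increasing stochastic-depth rate over every transformer block of stages 2-4 and slices the flat list per stage. The schedule in isolation (drop_path_schedule is an illustrative name):

import torch

def drop_path_schedule(depths, max_rate):
    """E.g. depths=[4, 12, 4] -> three per-stage lists of linearly growing drop rates."""
    rates = [x.item() for x in torch.linspace(0, max_rate, sum(depths))]
    splits, start = [], 0
    for d in depths:
        splits.append(rates[start:start + d])
        start += d
    return splits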
- Example: - >>> from mmpose.models import HRFormer - >>> import torch - >>> extra = dict( - >>> stage1=dict( - >>> num_modules=1, - >>> num_branches=1, - >>> block='BOTTLENECK', - >>> num_blocks=(2, ), - >>> num_channels=(64, )), - >>> stage2=dict( - >>> num_modules=1, - >>> num_branches=2, - >>> block='HRFORMER', - >>> window_sizes=(7, 7), - >>> num_heads=(1, 2), - >>> mlp_ratios=(4, 4), - >>> num_blocks=(2, 2), - >>> num_channels=(32, 64)), - >>> stage3=dict( - >>> num_modules=4, - >>> num_branches=3, - >>> block='HRFORMER', - >>> window_sizes=(7, 7, 7), - >>> num_heads=(1, 2, 4), - >>> mlp_ratios=(4, 4, 4), - >>> num_blocks=(2, 2, 2), - >>> num_channels=(32, 64, 128)), - >>> stage4=dict( - >>> num_modules=2, - >>> num_branches=4, - >>> block='HRFORMER', - >>> window_sizes=(7, 7, 7, 7), - >>> num_heads=(1, 2, 4, 8), - >>> mlp_ratios=(4, 4, 4, 4), - >>> num_blocks=(2, 2, 2, 2), - >>> num_channels=(32, 64, 128, 256))) - >>> self = HRFormer(extra, in_channels=1) - >>> self.eval() - >>> inputs = torch.rand(1, 1, 32, 32) - >>> level_outputs = self.forward(inputs) - >>> for level_out in level_outputs: - ... print(tuple(level_out.shape)) - (1, 32, 8, 8) - (1, 64, 4, 4) - (1, 128, 2, 2) - (1, 256, 1, 1) - """ - - blocks_dict = {'BOTTLENECK': Bottleneck, 'HRFORMERBLOCK': HRFormerBlock} - - def __init__(self, - extra, - in_channels=3, - conv_cfg=None, - norm_cfg=dict(type='BN', requires_grad=True), - transformer_norm_cfg=dict(type='LN', eps=1e-6), - norm_eval=False, - with_cp=False, - zero_init_residual=False, - frozen_stages=-1): - - # stochastic depth - depths = [ - extra[stage]['num_blocks'][0] * extra[stage]['num_modules'] - for stage in ['stage2', 'stage3', 'stage4'] - ] - depth_s2, depth_s3, _ = depths - drop_path_rate = extra['drop_path_rate'] - dpr = [ - x.item() for x in torch.linspace(0, drop_path_rate, sum(depths)) - ] - extra['stage2']['drop_path_rates'] = dpr[0:depth_s2] - extra['stage3']['drop_path_rates'] = dpr[depth_s2:depth_s2 + depth_s3] - extra['stage4']['drop_path_rates'] = dpr[depth_s2 + depth_s3:] - - # HRFormer use bilinear upsample as default - upsample_cfg = extra.get('upsample', { - 'mode': 'bilinear', - 'align_corners': False - }) - extra['upsample'] = upsample_cfg - self.transformer_norm_cfg = transformer_norm_cfg - self.with_rpe = extra.get('with_rpe', True) - self.with_pad_mask = extra.get('with_pad_mask', False) - - super().__init__(extra, in_channels, conv_cfg, norm_cfg, norm_eval, - with_cp, zero_init_residual, frozen_stages) - - def _make_stage(self, - layer_config, - num_inchannels, - multiscale_output=True): - """Make each stage.""" - num_modules = layer_config['num_modules'] - num_branches = layer_config['num_branches'] - num_blocks = layer_config['num_blocks'] - num_channels = layer_config['num_channels'] - block = self.blocks_dict[layer_config['block']] - num_heads = layer_config['num_heads'] - num_window_sizes = layer_config['window_sizes'] - num_mlp_ratios = layer_config['mlp_ratios'] - drop_path_rates = layer_config['drop_path_rates'] - - modules = [] - for i in range(num_modules): - # multiscale_output is only used at the last module - if not multiscale_output and i == num_modules - 1: - reset_multiscale_output = False - else: - reset_multiscale_output = True - - modules.append( - HRFomerModule( - num_branches, - block, - num_blocks, - num_inchannels, - num_channels, - num_heads, - num_window_sizes, - num_mlp_ratios, - reset_multiscale_output, - drop_paths=drop_path_rates[num_blocks[0] * - i:num_blocks[0] * (i + 1)], - with_rpe=self.with_rpe, - 
with_pad_mask=self.with_pad_mask, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - transformer_norm_cfg=self.transformer_norm_cfg, - with_cp=self.with_cp, - upsample_cfg=self.upsample_cfg)) - num_inchannels = modules[-1].get_num_inchannels() - - return nn.Sequential(*modules), num_inchannels diff --git a/main/transformer_utils/mmpose/models/backbones/hrnet.py b/main/transformer_utils/mmpose/models/backbones/hrnet.py deleted file mode 100644 index 87dc8cef555b5e8d78fcc69293047b0cbe2ea8a6..0000000000000000000000000000000000000000 --- a/main/transformer_utils/mmpose/models/backbones/hrnet.py +++ /dev/null @@ -1,604 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import copy - -import torch.nn as nn -from mmcv.cnn import (build_conv_layer, build_norm_layer, constant_init, - normal_init) -from torch.nn.modules.batchnorm import _BatchNorm - -from mmpose.utils import get_root_logger -from ..builder import BACKBONES -from .resnet import BasicBlock, Bottleneck, get_expansion -from .utils import load_checkpoint - - -class HRModule(nn.Module): - """High-Resolution Module for HRNet. - - In this module, every branch has 4 BasicBlocks/Bottlenecks. Fusion/Exchange - is in this module. - """ - - def __init__(self, - num_branches, - blocks, - num_blocks, - in_channels, - num_channels, - multiscale_output=False, - with_cp=False, - conv_cfg=None, - norm_cfg=dict(type='BN'), - upsample_cfg=dict(mode='nearest', align_corners=None)): - - # Protect mutable default arguments - norm_cfg = copy.deepcopy(norm_cfg) - super().__init__() - self._check_branches(num_branches, num_blocks, in_channels, - num_channels) - - self.in_channels = in_channels - self.num_branches = num_branches - - self.multiscale_output = multiscale_output - self.norm_cfg = norm_cfg - self.conv_cfg = conv_cfg - self.upsample_cfg = upsample_cfg - self.with_cp = with_cp - self.branches = self._make_branches(num_branches, blocks, num_blocks, - num_channels) - self.fuse_layers = self._make_fuse_layers() - self.relu = nn.ReLU(inplace=True) - - @staticmethod - def _check_branches(num_branches, num_blocks, in_channels, num_channels): - """Check input to avoid ValueError.""" - if num_branches != len(num_blocks): - error_msg = f'NUM_BRANCHES({num_branches}) ' \ - f'!= NUM_BLOCKS({len(num_blocks)})' - raise ValueError(error_msg) - - if num_branches != len(num_channels): - error_msg = f'NUM_BRANCHES({num_branches}) ' \ - f'!= NUM_CHANNELS({len(num_channels)})' - raise ValueError(error_msg) - - if num_branches != len(in_channels): - error_msg = f'NUM_BRANCHES({num_branches}) ' \ - f'!= NUM_INCHANNELS({len(in_channels)})' - raise ValueError(error_msg) - - def _make_one_branch(self, - branch_index, - block, - num_blocks, - num_channels, - stride=1): - """Make one branch.""" - downsample = None - if stride != 1 or \ - self.in_channels[branch_index] != \ - num_channels[branch_index] * get_expansion(block): - downsample = nn.Sequential( - build_conv_layer( - self.conv_cfg, - self.in_channels[branch_index], - num_channels[branch_index] * get_expansion(block), - kernel_size=1, - stride=stride, - bias=False), - build_norm_layer( - self.norm_cfg, - num_channels[branch_index] * get_expansion(block))[1]) - - layers = [] - layers.append( - block( - self.in_channels[branch_index], - num_channels[branch_index] * get_expansion(block), - stride=stride, - downsample=downsample, - with_cp=self.with_cp, - norm_cfg=self.norm_cfg, - conv_cfg=self.conv_cfg)) - self.in_channels[branch_index] = \ - num_channels[branch_index] * get_expansion(block) - for _ in range(1, 
num_blocks[branch_index]): - layers.append( - block( - self.in_channels[branch_index], - num_channels[branch_index] * get_expansion(block), - with_cp=self.with_cp, - norm_cfg=self.norm_cfg, - conv_cfg=self.conv_cfg)) - - return nn.Sequential(*layers) - - def _make_branches(self, num_branches, block, num_blocks, num_channels): - """Make branches.""" - branches = [] - - for i in range(num_branches): - branches.append( - self._make_one_branch(i, block, num_blocks, num_channels)) - - return nn.ModuleList(branches) - - def _make_fuse_layers(self): - """Make fuse layer.""" - if self.num_branches == 1: - return None - - num_branches = self.num_branches - in_channels = self.in_channels - fuse_layers = [] - num_out_branches = num_branches if self.multiscale_output else 1 - - for i in range(num_out_branches): - fuse_layer = [] - for j in range(num_branches): - if j > i: - fuse_layer.append( - nn.Sequential( - build_conv_layer( - self.conv_cfg, - in_channels[j], - in_channels[i], - kernel_size=1, - stride=1, - padding=0, - bias=False), - build_norm_layer(self.norm_cfg, in_channels[i])[1], - nn.Upsample( - scale_factor=2**(j - i), - mode=self.upsample_cfg['mode'], - align_corners=self. - upsample_cfg['align_corners']))) - elif j == i: - fuse_layer.append(None) - else: - conv_downsamples = [] - for k in range(i - j): - if k == i - j - 1: - conv_downsamples.append( - nn.Sequential( - build_conv_layer( - self.conv_cfg, - in_channels[j], - in_channels[i], - kernel_size=3, - stride=2, - padding=1, - bias=False), - build_norm_layer(self.norm_cfg, - in_channels[i])[1])) - else: - conv_downsamples.append( - nn.Sequential( - build_conv_layer( - self.conv_cfg, - in_channels[j], - in_channels[j], - kernel_size=3, - stride=2, - padding=1, - bias=False), - build_norm_layer(self.norm_cfg, - in_channels[j])[1], - nn.ReLU(inplace=True))) - fuse_layer.append(nn.Sequential(*conv_downsamples)) - fuse_layers.append(nn.ModuleList(fuse_layer)) - - return nn.ModuleList(fuse_layers) - - def forward(self, x): - """Forward function.""" - if self.num_branches == 1: - return [self.branches[0](x[0])] - - for i in range(self.num_branches): - x[i] = self.branches[i](x[i]) - - x_fuse = [] - for i in range(len(self.fuse_layers)): - y = 0 - for j in range(self.num_branches): - if i == j: - y += x[j] - else: - y += self.fuse_layers[i][j](x[j]) - x_fuse.append(self.relu(y)) - return x_fuse - - -@BACKBONES.register_module() -class HRNet(nn.Module): - """HRNet backbone. - - `High-Resolution Representations for Labeling Pixels and Regions - `__ - - Args: - extra (dict): detailed configuration for each stage of HRNet. - in_channels (int): Number of input image channels. Default: 3. - conv_cfg (dict): dictionary to construct and config conv layer. - norm_cfg (dict): dictionary to construct and config norm layer. - norm_eval (bool): Whether to set norm layers to eval mode, namely, - freeze running stats (mean and var). Note: Effect on Batch Norm - and its variants only. Default: False - with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. - zero_init_residual (bool): whether to use zero init for last norm layer - in resblocks to let them behave as identity. - frozen_stages (int): Stages to be frozen (stop grad and set eval mode). - -1 means not freezing any parameters. Default: -1. 
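A sketch, not part of the patch: HRModule._make_fuse_layers above sums every branch into every output, with identity for j == i, a 1x1 conv plus 2**(j-i) upsampling when branch j is coarser (j > i), and a chain of stride-2 3x3 convs when it is finer (j < i). A minimal two-branch version of that fusion (TwoBranchFuse is an illustrative name, and even H, W are assumed):

import torch.nn as nn
import torch.nn.functional as F

class TwoBranchFuse(nn.Module):
    """Fuse a high-res (c_hi, H, W) branch with a low-res (c_lo, H/2, W/2) branch."""
    def __init__(self, c_hi, c_lo):
        super().__init__()
        self.lo_to_hi = nn.Conv2d(c_lo, c_hi, 1, bias=False)                 # 1x1, then upsample
        self.hi_to_lo = nn.Conv2d(c_hi, c_lo, 3, stride=2, padding=1, bias=False)

    def forward(self, x_hi, x_lo):
        y_hi = x_hi + F.interpolate(self.lo_to_hi(x_lo), scale_factor=2, mode='nearest')
        y_lo = x_lo + self.hi_to_lo(x_hi)
        return F.relu(y_hi), F.relu(y_lo)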
- - Example: - >>> from mmpose.models import HRNet - >>> import torch - >>> extra = dict( - >>> stage1=dict( - >>> num_modules=1, - >>> num_branches=1, - >>> block='BOTTLENECK', - >>> num_blocks=(4, ), - >>> num_channels=(64, )), - >>> stage2=dict( - >>> num_modules=1, - >>> num_branches=2, - >>> block='BASIC', - >>> num_blocks=(4, 4), - >>> num_channels=(32, 64)), - >>> stage3=dict( - >>> num_modules=4, - >>> num_branches=3, - >>> block='BASIC', - >>> num_blocks=(4, 4, 4), - >>> num_channels=(32, 64, 128)), - >>> stage4=dict( - >>> num_modules=3, - >>> num_branches=4, - >>> block='BASIC', - >>> num_blocks=(4, 4, 4, 4), - >>> num_channels=(32, 64, 128, 256))) - >>> self = HRNet(extra, in_channels=1) - >>> self.eval() - >>> inputs = torch.rand(1, 1, 32, 32) - >>> level_outputs = self.forward(inputs) - >>> for level_out in level_outputs: - ... print(tuple(level_out.shape)) - (1, 32, 8, 8) - """ - - blocks_dict = {'BASIC': BasicBlock, 'BOTTLENECK': Bottleneck} - - def __init__(self, - extra, - in_channels=3, - conv_cfg=None, - norm_cfg=dict(type='BN'), - norm_eval=False, - with_cp=False, - zero_init_residual=False, - frozen_stages=-1): - # Protect mutable default arguments - norm_cfg = copy.deepcopy(norm_cfg) - super().__init__() - self.extra = extra - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg - self.norm_eval = norm_eval - self.with_cp = with_cp - self.zero_init_residual = zero_init_residual - self.frozen_stages = frozen_stages - - # stem net - self.norm1_name, norm1 = build_norm_layer(self.norm_cfg, 64, postfix=1) - self.norm2_name, norm2 = build_norm_layer(self.norm_cfg, 64, postfix=2) - - self.conv1 = build_conv_layer( - self.conv_cfg, - in_channels, - 64, - kernel_size=3, - stride=2, - padding=1, - bias=False) - - self.add_module(self.norm1_name, norm1) - self.conv2 = build_conv_layer( - self.conv_cfg, - 64, - 64, - kernel_size=3, - stride=2, - padding=1, - bias=False) - - self.add_module(self.norm2_name, norm2) - self.relu = nn.ReLU(inplace=True) - - self.upsample_cfg = self.extra.get('upsample', { - 'mode': 'nearest', - 'align_corners': None - }) - - # stage 1 - self.stage1_cfg = self.extra['stage1'] - num_channels = self.stage1_cfg['num_channels'][0] - block_type = self.stage1_cfg['block'] - num_blocks = self.stage1_cfg['num_blocks'][0] - - block = self.blocks_dict[block_type] - stage1_out_channels = num_channels * get_expansion(block) - self.layer1 = self._make_layer(block, 64, stage1_out_channels, - num_blocks) - - # stage 2 - self.stage2_cfg = self.extra['stage2'] - num_channels = self.stage2_cfg['num_channels'] - block_type = self.stage2_cfg['block'] - - block = self.blocks_dict[block_type] - num_channels = [ - channel * get_expansion(block) for channel in num_channels - ] - self.transition1 = self._make_transition_layer([stage1_out_channels], - num_channels) - self.stage2, pre_stage_channels = self._make_stage( - self.stage2_cfg, num_channels) - - # stage 3 - self.stage3_cfg = self.extra['stage3'] - num_channels = self.stage3_cfg['num_channels'] - block_type = self.stage3_cfg['block'] - - block = self.blocks_dict[block_type] - num_channels = [ - channel * get_expansion(block) for channel in num_channels - ] - self.transition2 = self._make_transition_layer(pre_stage_channels, - num_channels) - self.stage3, pre_stage_channels = self._make_stage( - self.stage3_cfg, num_channels) - - # stage 4 - self.stage4_cfg = self.extra['stage4'] - num_channels = self.stage4_cfg['num_channels'] - block_type = self.stage4_cfg['block'] - - block = self.blocks_dict[block_type] - 
num_channels = [ - channel * get_expansion(block) for channel in num_channels - ] - self.transition3 = self._make_transition_layer(pre_stage_channels, - num_channels) - - self.stage4, pre_stage_channels = self._make_stage( - self.stage4_cfg, - num_channels, - multiscale_output=self.stage4_cfg.get('multiscale_output', False)) - - self._freeze_stages() - - @property - def norm1(self): - """nn.Module: the normalization layer named "norm1" """ - return getattr(self, self.norm1_name) - - @property - def norm2(self): - """nn.Module: the normalization layer named "norm2" """ - return getattr(self, self.norm2_name) - - def _make_transition_layer(self, num_channels_pre_layer, - num_channels_cur_layer): - """Make transition layer.""" - num_branches_cur = len(num_channels_cur_layer) - num_branches_pre = len(num_channels_pre_layer) - - transition_layers = [] - for i in range(num_branches_cur): - if i < num_branches_pre: - if num_channels_cur_layer[i] != num_channels_pre_layer[i]: - transition_layers.append( - nn.Sequential( - build_conv_layer( - self.conv_cfg, - num_channels_pre_layer[i], - num_channels_cur_layer[i], - kernel_size=3, - stride=1, - padding=1, - bias=False), - build_norm_layer(self.norm_cfg, - num_channels_cur_layer[i])[1], - nn.ReLU(inplace=True))) - else: - transition_layers.append(None) - else: - conv_downsamples = [] - for j in range(i + 1 - num_branches_pre): - in_channels = num_channels_pre_layer[-1] - out_channels = num_channels_cur_layer[i] \ - if j == i - num_branches_pre else in_channels - conv_downsamples.append( - nn.Sequential( - build_conv_layer( - self.conv_cfg, - in_channels, - out_channels, - kernel_size=3, - stride=2, - padding=1, - bias=False), - build_norm_layer(self.norm_cfg, out_channels)[1], - nn.ReLU(inplace=True))) - transition_layers.append(nn.Sequential(*conv_downsamples)) - - return nn.ModuleList(transition_layers) - - def _make_layer(self, block, in_channels, out_channels, blocks, stride=1): - """Make layer.""" - downsample = None - if stride != 1 or in_channels != out_channels: - downsample = nn.Sequential( - build_conv_layer( - self.conv_cfg, - in_channels, - out_channels, - kernel_size=1, - stride=stride, - bias=False), - build_norm_layer(self.norm_cfg, out_channels)[1]) - - layers = [] - layers.append( - block( - in_channels, - out_channels, - stride=stride, - downsample=downsample, - with_cp=self.with_cp, - norm_cfg=self.norm_cfg, - conv_cfg=self.conv_cfg)) - for _ in range(1, blocks): - layers.append( - block( - out_channels, - out_channels, - with_cp=self.with_cp, - norm_cfg=self.norm_cfg, - conv_cfg=self.conv_cfg)) - - return nn.Sequential(*layers) - - def _make_stage(self, layer_config, in_channels, multiscale_output=True): - """Make stage.""" - num_modules = layer_config['num_modules'] - num_branches = layer_config['num_branches'] - num_blocks = layer_config['num_blocks'] - num_channels = layer_config['num_channels'] - block = self.blocks_dict[layer_config['block']] - - hr_modules = [] - for i in range(num_modules): - # multi_scale_output is only used for the last module - if not multiscale_output and i == num_modules - 1: - reset_multiscale_output = False - else: - reset_multiscale_output = True - - hr_modules.append( - HRModule( - num_branches, - block, - num_blocks, - in_channels, - num_channels, - reset_multiscale_output, - with_cp=self.with_cp, - norm_cfg=self.norm_cfg, - conv_cfg=self.conv_cfg, - upsample_cfg=self.upsample_cfg)) - - in_channels = hr_modules[-1].in_channels - - return nn.Sequential(*hr_modules), in_channels - - def 
_freeze_stages(self): - """Freeze parameters.""" - if self.frozen_stages >= 0: - self.norm1.eval() - self.norm2.eval() - - for m in [self.conv1, self.norm1, self.conv2, self.norm2]: - for param in m.parameters(): - param.requires_grad = False - - for i in range(1, self.frozen_stages + 1): - if i == 1: - m = getattr(self, 'layer1') - else: - m = getattr(self, f'stage{i}') - - m.eval() - for param in m.parameters(): - param.requires_grad = False - - if i < 4: - m = getattr(self, f'transition{i}') - m.eval() - for param in m.parameters(): - param.requires_grad = False - - def init_weights(self, pretrained=None): - """Initialize the weights in backbone. - - Args: - pretrained (str, optional): Path to pre-trained weights. - Defaults to None. - """ - if isinstance(pretrained, str): - logger = get_root_logger() - load_checkpoint(self, pretrained, strict=False, logger=logger) - elif pretrained is None: - for m in self.modules(): - if isinstance(m, nn.Conv2d): - normal_init(m, std=0.001) - elif isinstance(m, (_BatchNorm, nn.GroupNorm)): - constant_init(m, 1) - - if self.zero_init_residual: - for m in self.modules(): - if isinstance(m, Bottleneck): - constant_init(m.norm3, 0) - elif isinstance(m, BasicBlock): - constant_init(m.norm2, 0) - else: - raise TypeError('pretrained must be a str or None') - - def forward(self, x): - """Forward function.""" - x = self.conv1(x) - x = self.norm1(x) - x = self.relu(x) - x = self.conv2(x) - x = self.norm2(x) - x = self.relu(x) - x = self.layer1(x) - - x_list = [] - for i in range(self.stage2_cfg['num_branches']): - if self.transition1[i] is not None: - x_list.append(self.transition1[i](x)) - else: - x_list.append(x) - y_list = self.stage2(x_list) - - x_list = [] - for i in range(self.stage3_cfg['num_branches']): - if self.transition2[i] is not None: - x_list.append(self.transition2[i](y_list[-1])) - else: - x_list.append(y_list[i]) - y_list = self.stage3(x_list) - - x_list = [] - for i in range(self.stage4_cfg['num_branches']): - if self.transition3[i] is not None: - x_list.append(self.transition3[i](y_list[-1])) - else: - x_list.append(y_list[i]) - y_list = self.stage4(x_list) - - return y_list - - def train(self, mode=True): - """Convert the model into training mode.""" - super().train(mode) - self._freeze_stages() - if mode and self.norm_eval: - for m in self.modules(): - if isinstance(m, _BatchNorm): - m.eval() diff --git a/main/transformer_utils/mmpose/models/backbones/hrt.py b/main/transformer_utils/mmpose/models/backbones/hrt.py deleted file mode 100644 index 67be3d4429d03360698701b7cd6e67e7c7a0b4ad..0000000000000000000000000000000000000000 --- a/main/transformer_utils/mmpose/models/backbones/hrt.py +++ /dev/null @@ -1,676 +0,0 @@ -# -------------------------------------------------------- -# High Resolution Transformer -# Copyright (c) 2021 Microsoft -# Licensed under The MIT License [see LICENSE for details] -# Written by Rao Fu, RainbowSecret -# -------------------------------------------------------- - -import pdb -import torch -import torch.nn as nn -from mmcv.cnn import ( - build_conv_layer, - build_norm_layer, - constant_init, - kaiming_init, - normal_init, -) -# from mmcv.runner import load_checkpoint -from .hrt_checkpoint import load_checkpoint -from mmcv.runner.checkpoint import load_state_dict -from mmcv.utils.parrots_wrapper import _BatchNorm - -from mmpose.models.utils.ops import resize -from mmpose.utils import get_root_logger -from ..builder import BACKBONES -from .modules.bottleneck_block import Bottleneck -from .modules.transformer_block 
import GeneralTransformerBlock - - -class HighResolutionTransformerModule(nn.Module): - def __init__( - self, - num_branches, - blocks, - num_blocks, - in_channels, - num_channels, - multiscale_output, - with_cp=False, - conv_cfg=None, - norm_cfg=dict(type="BN", requires_grad=True), - num_heads=None, - num_window_sizes=None, - num_mlp_ratios=None, - drop_paths=0.0, - ): - super(HighResolutionTransformerModule, self).__init__() - self._check_branches(num_branches, num_blocks, in_channels, num_channels) - - self.in_channels = in_channels - self.num_branches = num_branches - - self.multiscale_output = multiscale_output - self.norm_cfg = norm_cfg - self.conv_cfg = conv_cfg - self.with_cp = with_cp - self.branches = self._make_branches( - num_branches, - blocks, - num_blocks, - num_channels, - num_heads, - num_window_sizes, - num_mlp_ratios, - drop_paths, - ) - self.fuse_layers = self._make_fuse_layers() - self.relu = nn.ReLU(inplace=True) - - # MHSA parameters - self.num_heads = num_heads - self.num_window_sizes = num_window_sizes - self.num_mlp_ratios = num_mlp_ratios - - def _check_branches(self, num_branches, num_blocks, in_channels, num_channels): - logger = get_root_logger() - if num_branches != len(num_blocks): - error_msg = "NUM_BRANCHES({}) <> NUM_BLOCKS({})".format( - num_branches, len(num_blocks) - ) - logger.error(error_msg) - raise ValueError(error_msg) - - if num_branches != len(num_channels): - error_msg = "NUM_BRANCHES({}) <> NUM_CHANNELS({})".format( - num_branches, len(num_channels) - ) - logger.error(error_msg) - raise ValueError(error_msg) - - if num_branches != len(in_channels): - error_msg = "NUM_BRANCHES({}) <> IN_CHANNELS({})".format( - num_branches, len(in_channels) - ) - logger.error(error_msg) - raise ValueError(error_msg) - - def _make_one_branch( - self, - branch_index, - block, - num_blocks, - num_channels, - num_heads, - num_window_sizes, - num_mlp_ratios, - drop_paths, - stride=1, - ): - """Make one branch.""" - downsample = None - if ( - stride != 1 - or self.in_channels[branch_index] - != num_channels[branch_index] * block.expansion - ): - downsample = nn.Sequential( - build_conv_layer( - self.conv_cfg, - self.in_channels[branch_index], - num_channels[branch_index] * block.expansion, - kernel_size=1, - stride=stride, - bias=False, - ), - build_norm_layer( - self.norm_cfg, num_channels[branch_index] * block.expansion - )[1], - ) - - layers = [] - - layers.append( - block( - self.in_channels[branch_index], - num_channels[branch_index], - num_heads=num_heads[branch_index], - window_size=num_window_sizes[branch_index], - mlp_ratio=num_mlp_ratios[branch_index], - drop_path=drop_paths[0], - norm_cfg=self.norm_cfg, - conv_cfg=self.conv_cfg, - ) - ) - self.in_channels[branch_index] = num_channels[branch_index] * block.expansion - for i in range(1, num_blocks[branch_index]): - layers.append( - block( - self.in_channels[branch_index], - num_channels[branch_index], - num_heads=num_heads[branch_index], - window_size=num_window_sizes[branch_index], - mlp_ratio=num_mlp_ratios[branch_index], - drop_path=drop_paths[i], - norm_cfg=self.norm_cfg, - conv_cfg=self.conv_cfg, - ) - ) - - return nn.Sequential(*layers) - - def _make_branches( - self, - num_branches, - block, - num_blocks, - num_channels, - num_heads, - num_window_sizes, - num_mlp_ratios, - drop_paths, - ): - """Make branches.""" - branches = [] - - for i in range(num_branches): - branches.append( - self._make_one_branch( - i, - block, - num_blocks, - num_channels, - num_heads, - num_window_sizes, - num_mlp_ratios, - 
drop_paths, - ) - ) - - return nn.ModuleList(branches) - - def _make_fuse_layers(self): - """Build fuse layer.""" - if self.num_branches == 1: - return None - - num_branches = self.num_branches - in_channels = self.in_channels - fuse_layers = [] - num_out_branches = num_branches if self.multiscale_output else 1 - for i in range(num_out_branches): - fuse_layer = [] - for j in range(num_branches): - if j > i: - fuse_layer.append( - nn.Sequential( - build_conv_layer( - self.conv_cfg, - in_channels[j], - in_channels[i], - kernel_size=1, - stride=1, - padding=0, - bias=False, - ), - build_norm_layer(self.norm_cfg, in_channels[i])[1], - nn.Upsample( - scale_factor=2 ** (j - i), - mode="bilinear", - align_corners=False, - ), - ) - ) - elif j == i: - fuse_layer.append(None) - else: - conv_downsamples = [] - for k in range(i - j): - if k == i - j - 1: - conv_downsamples.append( - nn.Sequential( - build_conv_layer( - self.conv_cfg, - in_channels[j], - in_channels[j], - kernel_size=3, - stride=2, - padding=1, - groups=in_channels[j], - bias=False, - ), - build_norm_layer(self.norm_cfg, in_channels[j])[1], - build_conv_layer( - self.conv_cfg, - in_channels[j], - in_channels[i], - kernel_size=1, - stride=1, - bias=False, - ), - build_norm_layer(self.norm_cfg, in_channels[i])[1], - ) - ) - else: - conv_downsamples.append( - nn.Sequential( - build_conv_layer( - self.conv_cfg, - in_channels[j], - in_channels[j], - kernel_size=3, - stride=2, - padding=1, - groups=in_channels[j], - bias=False, - ), - build_norm_layer(self.norm_cfg, in_channels[j])[1], - build_conv_layer( - self.conv_cfg, - in_channels[j], - in_channels[j], - kernel_size=1, - stride=1, - bias=False, - ), - build_norm_layer(self.norm_cfg, in_channels[j])[1], - nn.ReLU(inplace=True), - ) - ) - fuse_layer.append(nn.Sequential(*conv_downsamples)) - fuse_layers.append(nn.ModuleList(fuse_layer)) - return nn.ModuleList(fuse_layers) - - def forward(self, x): - """Forward function.""" - if self.num_branches == 1: - return [self.branches[0](x[0])] - - for i in range(self.num_branches): - x[i] = self.branches[i](x[i]) - - x_fuse = [] - for i in range(len(self.fuse_layers)): - y = x[0] if i == 0 else self.fuse_layers[i][0](x[0]) - for j in range(1, self.num_branches): - if i == j: - y += x[j] - elif j > i: - y = y + resize( - self.fuse_layers[i][j](x[j]), - size=x[i].shape[2:], - mode="bilinear", - align_corners=False, - ) - else: - y += self.fuse_layers[i][j](x[j]) - x_fuse.append(self.relu(y)) - return x_fuse - - -@BACKBONES.register_module() -class HRT(nn.Module): - """HRT backbone. 
- High Resolution Transformer Backbone - """ - - blocks_dict = { - "BOTTLENECK": Bottleneck, - "TRANSFORMER_BLOCK": GeneralTransformerBlock, - } - - def __init__( - self, - extra, - in_channels=3, - conv_cfg=None, - norm_cfg=dict(type="BN", requires_grad=True), - norm_eval=False, - with_cp=False, - zero_init_residual=False, - ): - super(HRT, self).__init__() - self.extra = extra - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg - self.norm_eval = norm_eval - self.with_cp = with_cp - self.zero_init_residual = zero_init_residual - - # stem net - self.norm1_name, norm1 = build_norm_layer(self.norm_cfg, 64, postfix=1) - self.norm2_name, norm2 = build_norm_layer(self.norm_cfg, 64, postfix=2) - - self.conv1 = build_conv_layer( - self.conv_cfg, - in_channels, - 64, - kernel_size=3, - stride=2, - padding=1, - bias=False, - ) - self.add_module(self.norm1_name, norm1) - - self.conv2 = build_conv_layer( - self.conv_cfg, 64, 64, kernel_size=3, stride=2, padding=1, bias=False - ) - self.add_module(self.norm2_name, norm2) - self.relu = nn.ReLU(inplace=True) - - # generat drop path rate list - depth_s2 = ( - self.extra["stage2"]["num_blocks"][0] * self.extra["stage2"]["num_modules"] - ) - depth_s3 = ( - self.extra["stage3"]["num_blocks"][0] * self.extra["stage3"]["num_modules"] - ) - depth_s4 = ( - self.extra["stage4"]["num_blocks"][0] * self.extra["stage4"]["num_modules"] - ) - depths = [depth_s2, depth_s3, depth_s4] - drop_path_rate = self.extra["drop_path_rate"] - dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] - - logger = get_root_logger() - logger.info(dpr) - - # stage 1 - self.stage1_cfg = self.extra["stage1"] - num_channels = self.stage1_cfg["num_channels"][0] - block_type = self.stage1_cfg["block"] - num_blocks = self.stage1_cfg["num_blocks"][0] - - block = self.blocks_dict[block_type] - stage1_out_channels = num_channels * block.expansion - self.layer1 = self._make_layer(block, 64, num_channels, num_blocks) - - # stage 2 - self.stage2_cfg = self.extra["stage2"] - num_channels = self.stage2_cfg["num_channels"] - block_type = self.stage2_cfg["block"] - - block = self.blocks_dict[block_type] - num_channels = [channel * block.expansion for channel in num_channels] - self.transition1 = self._make_transition_layer( - [stage1_out_channels], num_channels - ) - self.stage2, pre_stage_channels = self._make_stage( - self.stage2_cfg, num_channels, drop_paths=dpr[0:depth_s2] - ) - - # stage 3 - self.stage3_cfg = self.extra["stage3"] - num_channels = self.stage3_cfg["num_channels"] - block_type = self.stage3_cfg["block"] - - block = self.blocks_dict[block_type] - num_channels = [channel * block.expansion for channel in num_channels] - self.transition2 = self._make_transition_layer(pre_stage_channels, num_channels) - self.stage3, pre_stage_channels = self._make_stage( - self.stage3_cfg, - num_channels, - drop_paths=dpr[depth_s2 : depth_s2 + depth_s3], - ) - - # stage 4 - self.stage4_cfg = self.extra["stage4"] - num_channels = self.stage4_cfg["num_channels"] - block_type = self.stage4_cfg["block"] - - block = self.blocks_dict[block_type] - num_channels = [channel * block.expansion for channel in num_channels] - self.transition3 = self._make_transition_layer(pre_stage_channels, num_channels) - self.stage4, pre_stage_channels = self._make_stage( - self.stage4_cfg, - num_channels, - multiscale_output=self.stage4_cfg.get("multiscale_output", True), - drop_paths=dpr[depth_s2 + depth_s3 :], - ) - - @property - def norm1(self): - """nn.Module: the normalization layer named "norm1" """ - 
return getattr(self, self.norm1_name) - - @property - def norm2(self): - """nn.Module: the normalization layer named "norm2" """ - return getattr(self, self.norm2_name) - - def _make_transition_layer(self, num_channels_pre_layer, num_channels_cur_layer): - """Make transition layer.""" - num_branches_cur = len(num_channels_cur_layer) - num_branches_pre = len(num_channels_pre_layer) - - transition_layers = [] - for i in range(num_branches_cur): - if i < num_branches_pre: - if num_channels_cur_layer[i] != num_channels_pre_layer[i]: - transition_layers.append( - nn.Sequential( - build_conv_layer( - self.conv_cfg, - num_channels_pre_layer[i], - num_channels_cur_layer[i], - kernel_size=3, - stride=1, - padding=1, - bias=False, - ), - build_norm_layer(self.norm_cfg, num_channels_cur_layer[i])[ - 1 - ], - nn.ReLU(inplace=True), - ) - ) - else: - transition_layers.append(None) - else: - conv_downsamples = [] - for j in range(i + 1 - num_branches_pre): - in_channels = num_channels_pre_layer[-1] - out_channels = ( - num_channels_cur_layer[i] - if j == i - num_branches_pre - else in_channels - ) - conv_downsamples.append( - nn.Sequential( - build_conv_layer( - self.conv_cfg, - in_channels, - out_channels, - kernel_size=3, - stride=2, - padding=1, - bias=False, - ), - build_norm_layer(self.norm_cfg, out_channels)[1], - nn.ReLU(inplace=True), - ) - ) - transition_layers.append(nn.Sequential(*conv_downsamples)) - - return nn.ModuleList(transition_layers) - - def _make_layer( - self, - block, - inplanes, - planes, - blocks, - stride=1, - num_heads=1, - window_size=7, - mlp_ratio=4.0, - ): - """Make each layer.""" - downsample = None - if stride != 1 or inplanes != planes * block.expansion: - downsample = nn.Sequential( - build_conv_layer( - self.conv_cfg, - inplanes, - planes * block.expansion, - kernel_size=1, - stride=stride, - bias=False, - ), - build_norm_layer(self.norm_cfg, planes * block.expansion)[1], - ) - - layers = [] - if isinstance(block, GeneralTransformerBlock): - layers.append( - block( - inplanes, - planes, - num_heads=num_heads, - window_size=window_size, - mlp_ratio=mlp_ratio, - norm_cfg=self.norm_cfg, - conv_cfg=self.conv_cfg, - ) - ) - else: - layers.append( - block( - inplanes, - planes, - stride, - downsample=downsample, - with_cp=self.with_cp, - norm_cfg=self.norm_cfg, - conv_cfg=self.conv_cfg, - ) - ) - inplanes = planes * block.expansion - for i in range(1, blocks): - layers.append( - block( - inplanes, - planes, - with_cp=self.with_cp, - norm_cfg=self.norm_cfg, - conv_cfg=self.conv_cfg, - ) - ) - - return nn.Sequential(*layers) - - def _make_stage( - self, layer_config, in_channels, multiscale_output=True, drop_paths=0.0 - ): - """Make each stage.""" - num_modules = layer_config["num_modules"] - num_branches = layer_config["num_branches"] - num_blocks = layer_config["num_blocks"] - num_channels = layer_config["num_channels"] - block = self.blocks_dict[layer_config["block"]] - - num_heads = layer_config["num_heads"] - num_window_sizes = layer_config["num_window_sizes"] - num_mlp_ratios = layer_config["num_mlp_ratios"] - - hr_modules = [] - for i in range(num_modules): - # multi_scale_output is only used for the last module - if not multiscale_output and i == num_modules - 1: - reset_multiscale_output = False - else: - reset_multiscale_output = True - - hr_modules.append( - HighResolutionTransformerModule( - num_branches, - block, - num_blocks, - in_channels, - num_channels, - reset_multiscale_output, - with_cp=self.with_cp, - norm_cfg=self.norm_cfg, - conv_cfg=self.conv_cfg, - 
num_heads=num_heads, - num_window_sizes=num_window_sizes, - num_mlp_ratios=num_mlp_ratios, - drop_paths=drop_paths[num_blocks[0] * i : num_blocks[0] * (i + 1)], - ) - ) - - return nn.Sequential(*hr_modules), in_channels - - def init_weights(self, pretrained=None): - """Initialize the weights in backbone. - - Args: - pretrained (str, optional): Path to pre-trained weights. - Defaults to None. - """ - if isinstance(pretrained, str): - logger = get_root_logger() - ckpt = load_checkpoint(self, pretrained, strict=False) - if "model" in ckpt: - msg = self.load_state_dict(ckpt["model"], strict=False) - logger.info(msg) - elif pretrained is None: - for m in self.modules(): - if isinstance(m, nn.Conv2d): - """mmseg: kaiming_init(m)""" - normal_init(m, std=0.001) - elif isinstance(m, (_BatchNorm, nn.GroupNorm)): - constant_init(m, 1) - - if self.zero_init_residual: - for m in self.modules(): - if isinstance(m, Bottleneck): - constant_init(m.norm3, 0) - elif isinstance(m, BasicBlock): - constant_init(m.norm2, 0) - else: - raise TypeError("pretrained must be a str or None") - - def forward(self, x): - """Forward function.""" - x = self.conv1(x) - x = self.norm1(x) - x = self.relu(x) - x = self.conv2(x) - x = self.norm2(x) - x = self.relu(x) - x = self.layer1(x) - - x_list = [] - for i in range(self.stage2_cfg["num_branches"]): - if self.transition1[i] is not None: - x_list.append(self.transition1[i](x)) - else: - x_list.append(x) - y_list = self.stage2(x_list) - - x_list = [] - for i in range(self.stage3_cfg["num_branches"]): - if self.transition2[i] is not None: - x_list.append(self.transition2[i](y_list[-1])) - else: - x_list.append(y_list[i]) - y_list = self.stage3(x_list) - - x_list = [] - for i in range(self.stage4_cfg["num_branches"]): - if self.transition3[i] is not None: - x_list.append(self.transition3[i](y_list[-1])) - else: - x_list.append(y_list[i]) - y_list = self.stage4(x_list) - - return y_list - - def train(self, mode=True): - """Convert the model into training mode.""" - super(HRT, self).train(mode) - if mode and self.norm_eval: - for m in self.modules(): - if isinstance(m, _BatchNorm): - m.eval() diff --git a/main/transformer_utils/mmpose/models/backbones/hrt_checkpoint.py b/main/transformer_utils/mmpose/models/backbones/hrt_checkpoint.py deleted file mode 100644 index e27749d45ad2e1b24e50de8b85af90b4464e91ba..0000000000000000000000000000000000000000 --- a/main/transformer_utils/mmpose/models/backbones/hrt_checkpoint.py +++ /dev/null @@ -1,500 +0,0 @@ -# Copyright (c) Open-MMLab. All rights reserved. -import io -import os -import os.path as osp -import pkgutil -import time -import warnings -from collections import OrderedDict -from importlib import import_module -from tempfile import TemporaryDirectory - -import torch -import torchvision -from torch.optim import Optimizer -from torch.utils import model_zoo -from torch.nn import functional as F - -import mmcv -from mmcv.fileio import FileClient -from mmcv.fileio import load as load_file -from mmcv.parallel import is_module_wrapper -from mmcv.utils import mkdir_or_exist -from mmcv.runner import get_dist_info - -ENV_MMCV_HOME = 'MMCV_HOME' -ENV_XDG_CACHE_HOME = 'XDG_CACHE_HOME' -DEFAULT_CACHE_DIR = '~/.cache' - - -def _get_mmcv_home(): - mmcv_home = os.path.expanduser( - os.getenv( - ENV_MMCV_HOME, - os.path.join( - os.getenv(ENV_XDG_CACHE_HOME, DEFAULT_CACHE_DIR), 'mmcv'))) - - mkdir_or_exist(mmcv_home) - return mmcv_home - - -def load_state_dict(module, state_dict, strict=False, logger=None): - """Load state_dict to a module. 
- - This method is modified from :meth:`torch.nn.Module.load_state_dict`. - Default value for ``strict`` is set to ``False`` and the message for - param mismatch will be shown even if strict is False. - - Args: - module (Module): Module that receives the state_dict. - state_dict (OrderedDict): Weights. - strict (bool): whether to strictly enforce that the keys - in :attr:`state_dict` match the keys returned by this module's - :meth:`~torch.nn.Module.state_dict` function. Default: ``False``. - logger (:obj:`logging.Logger`, optional): Logger to log the error - message. If not specified, print function will be used. - """ - unexpected_keys = [] - all_missing_keys = [] - err_msg = [] - - metadata = getattr(state_dict, '_metadata', None) - state_dict = state_dict.copy() - if metadata is not None: - state_dict._metadata = metadata - - # use _load_from_state_dict to enable checkpoint version control - def load(module, prefix=''): - # recursively check parallel module in case that the model has a - # complicated structure, e.g., nn.Module(nn.Module(DDP)) - if is_module_wrapper(module): - module = module.module - local_metadata = {} if metadata is None else metadata.get( - prefix[:-1], {}) - module._load_from_state_dict(state_dict, prefix, local_metadata, True, - all_missing_keys, unexpected_keys, - err_msg) - for name, child in module._modules.items(): - if child is not None: - load(child, prefix + name + '.') - - load(module) - load = None # break load->load reference cycle - - # ignore "num_batches_tracked" of BN layers - missing_keys = [ - key for key in all_missing_keys if 'num_batches_tracked' not in key - ] - - if unexpected_keys: - err_msg.append('unexpected key in source ' - f'state_dict: {", ".join(unexpected_keys)}\n') - if missing_keys: - err_msg.append( - f'missing keys in source state_dict: {", ".join(missing_keys)}\n') - - rank, _ = get_dist_info() - if len(err_msg) > 0 and rank == 0: - err_msg.insert( - 0, 'The model and loaded state dict do not match exactly\n') - err_msg = '\n'.join(err_msg) - if strict: - raise RuntimeError(err_msg) - elif logger is not None: - logger.warning(err_msg) - else: - print(err_msg) - - -def load_url_dist(url, model_dir=None): - """In distributed setting, this function only download checkpoint at local - rank 0.""" - rank, world_size = get_dist_info() - rank = int(os.environ.get('LOCAL_RANK', rank)) - if rank == 0: - checkpoint = model_zoo.load_url(url, model_dir=model_dir) - if world_size > 1: - torch.distributed.barrier() - if rank > 0: - checkpoint = model_zoo.load_url(url, model_dir=model_dir) - return checkpoint - - -def load_pavimodel_dist(model_path, map_location=None): - """In distributed setting, this function only download checkpoint at local - rank 0.""" - try: - from pavi import modelcloud - except ImportError: - raise ImportError( - 'Please install pavi to load checkpoint from modelcloud.') - rank, world_size = get_dist_info() - rank = int(os.environ.get('LOCAL_RANK', rank)) - if rank == 0: - model = modelcloud.get(model_path) - with TemporaryDirectory() as tmp_dir: - downloaded_file = osp.join(tmp_dir, model.name) - model.download(downloaded_file) - checkpoint = torch.load(downloaded_file, map_location=map_location) - if world_size > 1: - torch.distributed.barrier() - if rank > 0: - model = modelcloud.get(model_path) - with TemporaryDirectory() as tmp_dir: - downloaded_file = osp.join(tmp_dir, model.name) - model.download(downloaded_file) - checkpoint = torch.load( - downloaded_file, map_location=map_location) - return checkpoint - - -def 
load_fileclient_dist(filename, backend, map_location): - """In distributed setting, this function only download checkpoint at local - rank 0.""" - rank, world_size = get_dist_info() - rank = int(os.environ.get('LOCAL_RANK', rank)) - allowed_backends = ['ceph'] - if backend not in allowed_backends: - raise ValueError(f'Load from Backend {backend} is not supported.') - if rank == 0: - fileclient = FileClient(backend=backend) - buffer = io.BytesIO(fileclient.get(filename)) - checkpoint = torch.load(buffer, map_location=map_location) - if world_size > 1: - torch.distributed.barrier() - if rank > 0: - fileclient = FileClient(backend=backend) - buffer = io.BytesIO(fileclient.get(filename)) - checkpoint = torch.load(buffer, map_location=map_location) - return checkpoint - - -def get_torchvision_models(): - model_urls = dict() - for _, name, ispkg in pkgutil.walk_packages(torchvision.models.__path__): - if ispkg: - continue - _zoo = import_module(f'torchvision.models.{name}') - if hasattr(_zoo, 'model_urls'): - _urls = getattr(_zoo, 'model_urls') - model_urls.update(_urls) - return model_urls - - -def get_external_models(): - mmcv_home = _get_mmcv_home() - default_json_path = osp.join(mmcv.__path__[0], 'model_zoo/open_mmlab.json') - default_urls = load_file(default_json_path) - assert isinstance(default_urls, dict) - external_json_path = osp.join(mmcv_home, 'open_mmlab.json') - if osp.exists(external_json_path): - external_urls = load_file(external_json_path) - assert isinstance(external_urls, dict) - default_urls.update(external_urls) - - return default_urls - - -def get_mmcls_models(): - mmcls_json_path = osp.join(mmcv.__path__[0], 'model_zoo/mmcls.json') - mmcls_urls = load_file(mmcls_json_path) - - return mmcls_urls - - -def get_deprecated_model_names(): - deprecate_json_path = osp.join(mmcv.__path__[0], - 'model_zoo/deprecated.json') - deprecate_urls = load_file(deprecate_json_path) - assert isinstance(deprecate_urls, dict) - - return deprecate_urls - - -def _process_mmcls_checkpoint(checkpoint): - state_dict = checkpoint['state_dict'] - new_state_dict = OrderedDict() - for k, v in state_dict.items(): - if k.startswith('backbone.'): - new_state_dict[k[9:]] = v - new_checkpoint = dict(state_dict=new_state_dict) - - return new_checkpoint - - -def _load_checkpoint(filename, map_location=None): - """Load checkpoint from somewhere (modelzoo, file, url). - - Args: - filename (str): Accept local filepath, URL, ``torchvision://xxx``, - ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for - details. - map_location (str | None): Same as :func:`torch.load`. Default: None. - - Returns: - dict | OrderedDict: The loaded checkpoint. It can be either an - OrderedDict storing model weights or a dict containing other - information, which depends on the checkpoint. 
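-
-    Example:
-        A minimal, illustrative sketch (not part of the original docstring);
-        the local path is a hypothetical placeholder, and
-        ``torchvision://resnet50`` assumes that torchvision model-zoo entry.
-
-        >>> ckpt = _load_checkpoint('torchvision://resnet50')
-        >>> ckpt = _load_checkpoint('/tmp/weights.pth', map_location='cpu')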
- """ - if filename.startswith('modelzoo://'): - warnings.warn('The URL scheme of "modelzoo://" is deprecated, please ' - 'use "torchvision://" instead') - model_urls = get_torchvision_models() - model_name = filename[11:] - checkpoint = load_url_dist(model_urls[model_name]) - elif filename.startswith('torchvision://'): - model_urls = get_torchvision_models() - model_name = filename[14:] - checkpoint = load_url_dist(model_urls[model_name]) - elif filename.startswith('open-mmlab://'): - model_urls = get_external_models() - model_name = filename[13:] - deprecated_urls = get_deprecated_model_names() - if model_name in deprecated_urls: - warnings.warn(f'open-mmlab://{model_name} is deprecated in favor ' - f'of open-mmlab://{deprecated_urls[model_name]}') - model_name = deprecated_urls[model_name] - model_url = model_urls[model_name] - # check if is url - if model_url.startswith(('http://', 'https://')): - checkpoint = load_url_dist(model_url) - else: - filename = osp.join(_get_mmcv_home(), model_url) - if not osp.isfile(filename): - raise IOError(f'{filename} is not a checkpoint file') - checkpoint = torch.load(filename, map_location=map_location) - elif filename.startswith('mmcls://'): - model_urls = get_mmcls_models() - model_name = filename[8:] - checkpoint = load_url_dist(model_urls[model_name]) - checkpoint = _process_mmcls_checkpoint(checkpoint) - elif filename.startswith(('http://', 'https://')): - checkpoint = load_url_dist(filename) - elif filename.startswith('pavi://'): - model_path = filename[7:] - checkpoint = load_pavimodel_dist(model_path, map_location=map_location) - elif filename.startswith('s3://'): - checkpoint = load_fileclient_dist( - filename, backend='ceph', map_location=map_location) - else: - if not osp.isfile(filename): - raise IOError(f'{filename} is not a checkpoint file') - checkpoint = torch.load(filename, map_location=map_location) - return checkpoint - - -def load_checkpoint(model, - filename, - map_location='cpu', - strict=False, - logger=None): - """Load checkpoint from a file or URI. - - Args: - model (Module): Module to load checkpoint. - filename (str): Accept local filepath, URL, ``torchvision://xxx``, - ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for - details. - map_location (str): Same as :func:`torch.load`. - strict (bool): Whether to allow different params for the model and - checkpoint. - logger (:mod:`logging.Logger` or None): The logger for error message. - - Returns: - dict or OrderedDict: The loaded checkpoint. 
- """ - checkpoint = _load_checkpoint(filename, map_location) - # OrderedDict is a subclass of dict - if not isinstance(checkpoint, dict): - raise RuntimeError( - f'No state_dict found in checkpoint file {filename}') - # get state_dict from checkpoint - if 'state_dict' in checkpoint: - state_dict = checkpoint['state_dict'] - elif 'model' in checkpoint: - state_dict = checkpoint['model'] - else: - state_dict = checkpoint - # strip prefix of state_dict - if list(state_dict.keys())[0].startswith('module.'): - state_dict = {k[7:]: v for k, v in state_dict.items()} - - # for MoBY, load model of online branch - if sorted(list(state_dict.keys()))[0].startswith('encoder'): - state_dict = {k.replace('encoder.', ''): v for k, v in state_dict.items() if k.startswith('encoder.')} - - # reshape absolute position embedding - if state_dict.get('absolute_pos_embed') is not None: - absolute_pos_embed = state_dict['absolute_pos_embed'] - N1, L, C1 = absolute_pos_embed.size() - N2, C2, H, W = model.absolute_pos_embed.size() - if N1 != N2 or C1 != C2 or L != H*W: - logger.warning("Error in loading absolute_pos_embed, pass") - else: - state_dict['absolute_pos_embed'] = absolute_pos_embed.view(N2, H, W, C2).permute(0, 3, 1, 2) - - # interpolate position bias table if needed - # relative_position_bias_table_keys = [k for k in state_dict.keys() if "relative_position_bias_table" in k] - # for table_key in relative_position_bias_table_keys: - # table_pretrained = state_dict[table_key] - # table_current = model.state_dict()[table_key] - # L1, nH1 = table_pretrained.size() - # L2, nH2 = table_current.size() - # if nH1 != nH2: - # logger.warning(f"Error in loading {table_key}, pass") - # else: - # if L1 != L2: - # S1 = int(L1 ** 0.5) - # S2 = int(L2 ** 0.5) - # table_pretrained_resized = F.interpolate( - # table_pretrained.permute(1, 0).view(1, nH1, S1, S1), - # size=(S2, S2), mode='bicubic') - # state_dict[table_key] = table_pretrained_resized.view(nH2, L2).permute(1, 0) - - # load state_dict - load_state_dict(model, state_dict, strict, logger) - return checkpoint - - -def weights_to_cpu(state_dict): - """Copy a model state_dict to cpu. - - Args: - state_dict (OrderedDict): Model weights on GPU. - - Returns: - OrderedDict: Model weights on GPU. - """ - state_dict_cpu = OrderedDict() - for key, val in state_dict.items(): - state_dict_cpu[key] = val.cpu() - return state_dict_cpu - - -def _save_to_state_dict(module, destination, prefix, keep_vars): - """Saves module state to `destination` dictionary. - - This method is modified from :meth:`torch.nn.Module._save_to_state_dict`. - - Args: - module (nn.Module): The module to generate state_dict. - destination (dict): A dict where state will be stored. - prefix (str): The prefix for parameters and buffers used in this - module. - """ - for name, param in module._parameters.items(): - if param is not None: - destination[prefix + name] = param if keep_vars else param.detach() - for name, buf in module._buffers.items(): - # remove check of _non_persistent_buffers_set to allow nn.BatchNorm2d - if buf is not None: - destination[prefix + name] = buf if keep_vars else buf.detach() - - -def get_state_dict(module, destination=None, prefix='', keep_vars=False): - """Returns a dictionary containing a whole state of the module. - - Both parameters and persistent buffers (e.g. running averages) are - included. Keys are corresponding parameter and buffer names. 
- - This method is modified from :meth:`torch.nn.Module.state_dict` to - recursively check parallel module in case that the model has a complicated - structure, e.g., nn.Module(nn.Module(DDP)). - - Args: - module (nn.Module): The module to generate state_dict. - destination (OrderedDict): Returned dict for the state of the - module. - prefix (str): Prefix of the key. - keep_vars (bool): Whether to keep the variable property of the - parameters. Default: False. - - Returns: - dict: A dictionary containing a whole state of the module. - """ - # recursively check parallel module in case that the model has a - # complicated structure, e.g., nn.Module(nn.Module(DDP)) - if is_module_wrapper(module): - module = module.module - - # below is the same as torch.nn.Module.state_dict() - if destination is None: - destination = OrderedDict() - destination._metadata = OrderedDict() - destination._metadata[prefix[:-1]] = local_metadata = dict( - version=module._version) - _save_to_state_dict(module, destination, prefix, keep_vars) - for name, child in module._modules.items(): - if child is not None: - get_state_dict( - child, destination, prefix + name + '.', keep_vars=keep_vars) - for hook in module._state_dict_hooks.values(): - hook_result = hook(module, destination, prefix, local_metadata) - if hook_result is not None: - destination = hook_result - return destination - - -def save_checkpoint(model, filename, optimizer=None, meta=None): - """Save checkpoint to file. - - The checkpoint will have 3 fields: ``meta``, ``state_dict`` and - ``optimizer``. By default ``meta`` will contain version and time info. - - Args: - model (Module): Module whose params are to be saved. - filename (str): Checkpoint filename. - optimizer (:obj:`Optimizer`, optional): Optimizer to be saved. - meta (dict, optional): Metadata to be saved in checkpoint. 
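-
-    Example:
-        A hedged sketch (added for illustration); ``model``, ``optimizer``
-        and the output path below are hypothetical.
-
-        >>> meta = dict(epoch=5, iter=10000)
-        >>> save_checkpoint(model, 'work_dirs/epoch_5.pth',
-        ...                 optimizer=optimizer, meta=meta)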
- """ - if meta is None: - meta = {} - elif not isinstance(meta, dict): - raise TypeError(f'meta must be a dict or None, but got {type(meta)}') - meta.update(mmcv_version=mmcv.__version__, time=time.asctime()) - - if is_module_wrapper(model): - model = model.module - - if hasattr(model, 'CLASSES') and model.CLASSES is not None: - # save class name to the meta - meta.update(CLASSES=model.CLASSES) - - checkpoint = { - 'meta': meta, - 'state_dict': weights_to_cpu(get_state_dict(model)) - } - # save optimizer state dict in the checkpoint - if isinstance(optimizer, Optimizer): - checkpoint['optimizer'] = optimizer.state_dict() - elif isinstance(optimizer, dict): - checkpoint['optimizer'] = {} - for name, optim in optimizer.items(): - checkpoint['optimizer'][name] = optim.state_dict() - - if filename.startswith('pavi://'): - try: - from pavi import modelcloud - from pavi.exception import NodeNotFoundError - except ImportError: - raise ImportError( - 'Please install pavi to load checkpoint from modelcloud.') - model_path = filename[7:] - root = modelcloud.Folder() - model_dir, model_name = osp.split(model_path) - try: - model = modelcloud.get(model_dir) - except NodeNotFoundError: - model = root.create_training_model(model_dir) - with TemporaryDirectory() as tmp_dir: - checkpoint_file = osp.join(tmp_dir, model_name) - with open(checkpoint_file, 'wb') as f: - torch.save(checkpoint, f) - f.flush() - model.create_file(checkpoint_file, name=model_name) - else: - mmcv.mkdir_or_exist(osp.dirname(filename)) - # immediately flush buffer - with open(filename, 'wb') as f: - torch.save(checkpoint, f) - f.flush() \ No newline at end of file diff --git a/main/transformer_utils/mmpose/models/backbones/i3d.py b/main/transformer_utils/mmpose/models/backbones/i3d.py deleted file mode 100644 index 64f330abac1facc16db743ef3ffbcd23248d6865..0000000000000000000000000000000000000000 --- a/main/transformer_utils/mmpose/models/backbones/i3d.py +++ /dev/null @@ -1,215 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -# Code is modified from `Third-party pytorch implementation of i3d -# `. - -import torch -import torch.nn as nn - -from ..builder import BACKBONES -from .base_backbone import BaseBackbone - - -class Conv3dBlock(nn.Module): - """Basic 3d convolution block for I3D. - - Args: - in_channels (int): Input channels of this block. - out_channels (int): Output channels of this block. - expansion (float): The multiplier of in_channels and out_channels. - Default: 1. - kernel_size (tuple[int]): kernel size of the 3d convolution layer. - Default: (1, 1, 1). - stride (tuple[int]): stride of the block. Default: (1, 1, 1) - padding (tuple[int]): padding of the input tensor. Default: (0, 0, 0) - use_bias (bool): whether to enable bias in 3d convolution layer. - Default: False - use_bn (bool): whether to use Batch Normalization after 3d convolution - layer. Default: True - use_relu (bool): whether to use ReLU after Batch Normalization layer. 
- Default: True - """ - - def __init__(self, - in_channels, - out_channels, - expansion=1.0, - kernel_size=(1, 1, 1), - stride=(1, 1, 1), - padding=(0, 0, 0), - use_bias=False, - use_bn=True, - use_relu=True): - super().__init__() - - in_channels = int(in_channels * expansion) - out_channels = int(out_channels * expansion) - - self.conv3d = nn.Conv3d( - in_channels, - out_channels, - kernel_size, - padding=padding, - stride=stride, - bias=use_bias) - - self.use_bn = use_bn - self.use_relu = use_relu - - if self.use_bn: - self.batch3d = nn.BatchNorm3d(out_channels) - - if self.use_relu: - self.activation = nn.ReLU(inplace=True) - - def forward(self, x): - """Forward function.""" - out = self.conv3d(x) - if self.use_bn: - out = self.batch3d(out) - if self.use_relu: - out = self.activation(out) - return out - - -class Mixed(nn.Module): - """Inception block for I3D. - - Args: - in_channels (int): Input channels of this block. - out_channels (int): Output channels of this block. - expansion (float): The multiplier of in_channels and out_channels. - Default: 1. - """ - - def __init__(self, in_channels, out_channels, expansion=1.0): - super(Mixed, self).__init__() - # Branch 0 - self.branch_0 = Conv3dBlock( - in_channels, out_channels[0], expansion, kernel_size=(1, 1, 1)) - - # Branch 1 - branch_1_conv1 = Conv3dBlock( - in_channels, out_channels[1], expansion, kernel_size=(1, 1, 1)) - branch_1_conv2 = Conv3dBlock( - out_channels[1], - out_channels[2], - expansion, - kernel_size=(3, 3, 3), - padding=(1, 1, 1)) - self.branch_1 = nn.Sequential(branch_1_conv1, branch_1_conv2) - - # Branch 2 - branch_2_conv1 = Conv3dBlock( - in_channels, out_channels[3], expansion, kernel_size=(1, 1, 1)) - branch_2_conv2 = Conv3dBlock( - out_channels[3], - out_channels[4], - expansion, - kernel_size=(3, 3, 3), - padding=(1, 1, 1)) - self.branch_2 = nn.Sequential(branch_2_conv1, branch_2_conv2) - - # Branch3 - branch_3_pool = nn.MaxPool3d( - kernel_size=(3, 3, 3), - stride=(1, 1, 1), - padding=(1, 1, 1), - ceil_mode=True) - branch_3_conv2 = Conv3dBlock( - in_channels, out_channels[5], expansion, kernel_size=(1, 1, 1)) - self.branch_3 = nn.Sequential(branch_3_pool, branch_3_conv2) - - def forward(self, x): - """Forward function.""" - out_0 = self.branch_0(x) - out_1 = self.branch_1(x) - out_2 = self.branch_2(x) - out_3 = self.branch_3(x) - out = torch.cat((out_0, out_1, out_2, out_3), 1) - return out - - -@BACKBONES.register_module() -class I3D(BaseBackbone): - """I3D backbone. - - Please refer to the `paper `__ for - details. - - Args: - in_channels (int): Input channels of the backbone, which is decided - on the input modality. - expansion (float): The multiplier of in_channels and out_channels. - Default: 1. 
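-
-    Example:
-        An illustrative sketch in the style of the other backbone docstrings
-        (added here; the input size is an assumption, and the expected
-        output shape follows from the pooling strides of the network).
-
-        >>> import torch
-        >>> from mmpose.models import I3D
-        >>> model = I3D(in_channels=3, expansion=1.0)
-        >>> inputs = torch.rand(1, 3, 16, 224, 224)
-        >>> out = model(inputs)
-        >>> tuple(out.shape)  # expected: (1, 1024, 2, 7, 7)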
- """ - - def __init__(self, in_channels=3, expansion=1.0): - super(I3D, self).__init__() - - # expansion must be an integer multiple of 1/8 - expansion = round(8 * expansion) / 8.0 - - # xut Layer - self.conv3d_1a_7x7 = Conv3dBlock( - out_channels=64, - in_channels=in_channels / expansion, - expansion=expansion, - kernel_size=(7, 7, 7), - stride=(2, 2, 2), - padding=(2, 3, 3)) - self.maxPool3d_2a_3x3 = nn.MaxPool3d( - kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1)) - - # Layer 2 - self.conv3d_2b_1x1 = Conv3dBlock( - out_channels=64, - in_channels=64, - expansion=expansion, - kernel_size=(1, 1, 1)) - self.conv3d_2c_3x3 = Conv3dBlock( - out_channels=192, - in_channels=64, - expansion=expansion, - kernel_size=(3, 3, 3), - padding=(1, 1, 1)) - self.maxPool3d_3a_3x3 = nn.MaxPool3d( - kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1)) - - # Mixed_3b - self.mixed_3b = Mixed(192, [64, 96, 128, 16, 32, 32], expansion) - self.mixed_3c = Mixed(256, [128, 128, 192, 32, 96, 64], expansion) - self.maxPool3d_4a_3x3 = nn.MaxPool3d( - kernel_size=(3, 3, 3), stride=(2, 2, 2), padding=(1, 1, 1)) - - # Mixed 4 - self.mixed_4b = Mixed(480, [192, 96, 208, 16, 48, 64], expansion) - self.mixed_4c = Mixed(512, [160, 112, 224, 24, 64, 64], expansion) - self.mixed_4d = Mixed(512, [128, 128, 256, 24, 64, 64], expansion) - self.mixed_4e = Mixed(512, [112, 144, 288, 32, 64, 64], expansion) - self.mixed_4f = Mixed(528, [256, 160, 320, 32, 128, 128], expansion) - - self.maxPool3d_5a_2x2 = nn.MaxPool3d( - kernel_size=(2, 2, 2), stride=(2, 2, 2), padding=(0, 0, 0)) - - # Mixed 5 - self.mixed_5b = Mixed(832, [256, 160, 320, 32, 128, 128], expansion) - self.mixed_5c = Mixed(832, [384, 192, 384, 48, 128, 128], expansion) - - def forward(self, x): - out = self.conv3d_1a_7x7(x) - out = self.maxPool3d_2a_3x3(out) - out = self.conv3d_2b_1x1(out) - out = self.conv3d_2c_3x3(out) - out = self.maxPool3d_3a_3x3(out) - out = self.mixed_3b(out) - out = self.mixed_3c(out) - out = self.maxPool3d_4a_3x3(out) - out = self.mixed_4b(out) - out = self.mixed_4c(out) - out = self.mixed_4d(out) - out = self.mixed_4e(out) - out = self.mixed_4f(out) - out = self.maxPool3d_5a_2x2(out) - out = self.mixed_5b(out) - out = self.mixed_5c(out) - return out diff --git a/main/transformer_utils/mmpose/models/backbones/litehrnet.py b/main/transformer_utils/mmpose/models/backbones/litehrnet.py deleted file mode 100644 index 954368841eb631e3dc6c77e9810f6980f3739bf3..0000000000000000000000000000000000000000 --- a/main/transformer_utils/mmpose/models/backbones/litehrnet.py +++ /dev/null @@ -1,984 +0,0 @@ -# ------------------------------------------------------------------------------ -# Adapted from https://github.com/HRNet/Lite-HRNet -# Original licence: Apache License 2.0. -# ------------------------------------------------------------------------------ - -import mmcv -import torch -import torch.nn as nn -import torch.nn.functional as F -import torch.utils.checkpoint as cp -from mmcv.cnn import (ConvModule, DepthwiseSeparableConvModule, - build_conv_layer, build_norm_layer, constant_init, - normal_init) -from torch.nn.modules.batchnorm import _BatchNorm - -from mmpose.utils import get_root_logger -from ..builder import BACKBONES -from .utils import channel_shuffle, load_checkpoint - - -class SpatialWeighting(nn.Module): - """Spatial weighting module. - - Args: - channels (int): The channels of the module. - ratio (int): channel reduction ratio. - conv_cfg (dict): Config dict for convolution layer. 
- Default: None, which means using conv2d. - norm_cfg (dict): Config dict for normalization layer. - Default: None. - act_cfg (dict): Config dict for activation layer. - Default: (dict(type='ReLU'), dict(type='Sigmoid')). - The last ConvModule uses Sigmoid by default. - """ - - def __init__(self, - channels, - ratio=16, - conv_cfg=None, - norm_cfg=None, - act_cfg=(dict(type='ReLU'), dict(type='Sigmoid'))): - super().__init__() - if isinstance(act_cfg, dict): - act_cfg = (act_cfg, act_cfg) - assert len(act_cfg) == 2 - assert mmcv.is_tuple_of(act_cfg, dict) - self.global_avgpool = nn.AdaptiveAvgPool2d(1) - self.conv1 = ConvModule( - in_channels=channels, - out_channels=int(channels / ratio), - kernel_size=1, - stride=1, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg[0]) - self.conv2 = ConvModule( - in_channels=int(channels / ratio), - out_channels=channels, - kernel_size=1, - stride=1, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg[1]) - - def forward(self, x): - out = self.global_avgpool(x) - out = self.conv1(out) - out = self.conv2(out) - return x * out - - -class CrossResolutionWeighting(nn.Module): - """Cross-resolution channel weighting module. - - Args: - channels (int): The channels of the module. - ratio (int): channel reduction ratio. - conv_cfg (dict): Config dict for convolution layer. - Default: None, which means using conv2d. - norm_cfg (dict): Config dict for normalization layer. - Default: None. - act_cfg (dict): Config dict for activation layer. - Default: (dict(type='ReLU'), dict(type='Sigmoid')). - The last ConvModule uses Sigmoid by default. - """ - - def __init__(self, - channels, - ratio=16, - conv_cfg=None, - norm_cfg=None, - act_cfg=(dict(type='ReLU'), dict(type='Sigmoid'))): - super().__init__() - if isinstance(act_cfg, dict): - act_cfg = (act_cfg, act_cfg) - assert len(act_cfg) == 2 - assert mmcv.is_tuple_of(act_cfg, dict) - self.channels = channels - total_channel = sum(channels) - self.conv1 = ConvModule( - in_channels=total_channel, - out_channels=int(total_channel / ratio), - kernel_size=1, - stride=1, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg[0]) - self.conv2 = ConvModule( - in_channels=int(total_channel / ratio), - out_channels=total_channel, - kernel_size=1, - stride=1, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg[1]) - - def forward(self, x): - mini_size = x[-1].size()[-2:] - out = [F.adaptive_avg_pool2d(s, mini_size) for s in x[:-1]] + [x[-1]] - out = torch.cat(out, dim=1) - out = self.conv1(out) - out = self.conv2(out) - out = torch.split(out, self.channels, dim=1) - out = [ - s * F.interpolate(a, size=s.size()[-2:], mode='nearest') - for s, a in zip(x, out) - ] - return out - - -class ConditionalChannelWeighting(nn.Module): - """Conditional channel weighting block. - - Args: - in_channels (int): The input channels of the block. - stride (int): Stride of the 3x3 convolution layer. - reduce_ratio (int): channel reduction ratio. - conv_cfg (dict): Config dict for convolution layer. - Default: None, which means using conv2d. - norm_cfg (dict): Config dict for normalization layer. - Default: dict(type='BN'). - with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. Default: False. 
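-
-    Example:
-        A minimal sketch (added for illustration; the channel layout is an
-        assumption). The block takes one feature map per branch and, with
-        stride 1, keeps every shape unchanged.
-
-        >>> import torch
-        >>> block = ConditionalChannelWeighting(
-        ...     in_channels=[40, 80], stride=1, reduce_ratio=8)
-        >>> feats = [torch.rand(1, 40, 32, 32), torch.rand(1, 80, 16, 16)]
-        >>> outs = block(feats)  # same shapes as the inputs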
- """ - - def __init__(self, - in_channels, - stride, - reduce_ratio, - conv_cfg=None, - norm_cfg=dict(type='BN'), - with_cp=False): - super().__init__() - self.with_cp = with_cp - self.stride = stride - assert stride in [1, 2] - - branch_channels = [channel // 2 for channel in in_channels] - - self.cross_resolution_weighting = CrossResolutionWeighting( - branch_channels, - ratio=reduce_ratio, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg) - - self.depthwise_convs = nn.ModuleList([ - ConvModule( - channel, - channel, - kernel_size=3, - stride=self.stride, - padding=1, - groups=channel, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=None) for channel in branch_channels - ]) - - self.spatial_weighting = nn.ModuleList([ - SpatialWeighting(channels=channel, ratio=4) - for channel in branch_channels - ]) - - def forward(self, x): - - def _inner_forward(x): - x = [s.chunk(2, dim=1) for s in x] - x1 = [s[0] for s in x] - x2 = [s[1] for s in x] - - x2 = self.cross_resolution_weighting(x2) - x2 = [dw(s) for s, dw in zip(x2, self.depthwise_convs)] - x2 = [sw(s) for s, sw in zip(x2, self.spatial_weighting)] - - out = [torch.cat([s1, s2], dim=1) for s1, s2 in zip(x1, x2)] - out = [channel_shuffle(s, 2) for s in out] - - return out - - if self.with_cp and x.requires_grad: - out = cp.checkpoint(_inner_forward, x) - else: - out = _inner_forward(x) - - return out - - -class Stem(nn.Module): - """Stem network block. - - Args: - in_channels (int): The input channels of the block. - stem_channels (int): Output channels of the stem layer. - out_channels (int): The output channels of the block. - expand_ratio (int): adjusts number of channels of the hidden layer - in InvertedResidual by this amount. - conv_cfg (dict): Config dict for convolution layer. - Default: None, which means using conv2d. - norm_cfg (dict): Config dict for normalization layer. - Default: dict(type='BN'). - with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. Default: False. 
- """ - - def __init__(self, - in_channels, - stem_channels, - out_channels, - expand_ratio, - conv_cfg=None, - norm_cfg=dict(type='BN'), - with_cp=False): - super().__init__() - self.in_channels = in_channels - self.out_channels = out_channels - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg - self.with_cp = with_cp - - self.conv1 = ConvModule( - in_channels=in_channels, - out_channels=stem_channels, - kernel_size=3, - stride=2, - padding=1, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=dict(type='ReLU')) - - mid_channels = int(round(stem_channels * expand_ratio)) - branch_channels = stem_channels // 2 - if stem_channels == self.out_channels: - inc_channels = self.out_channels - branch_channels - else: - inc_channels = self.out_channels - stem_channels - - self.branch1 = nn.Sequential( - ConvModule( - branch_channels, - branch_channels, - kernel_size=3, - stride=2, - padding=1, - groups=branch_channels, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=None), - ConvModule( - branch_channels, - inc_channels, - kernel_size=1, - stride=1, - padding=0, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=dict(type='ReLU')), - ) - - self.expand_conv = ConvModule( - branch_channels, - mid_channels, - kernel_size=1, - stride=1, - padding=0, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=dict(type='ReLU')) - self.depthwise_conv = ConvModule( - mid_channels, - mid_channels, - kernel_size=3, - stride=2, - padding=1, - groups=mid_channels, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=None) - self.linear_conv = ConvModule( - mid_channels, - branch_channels - if stem_channels == self.out_channels else stem_channels, - kernel_size=1, - stride=1, - padding=0, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=dict(type='ReLU')) - - def forward(self, x): - - def _inner_forward(x): - x = self.conv1(x) - x1, x2 = x.chunk(2, dim=1) - - x2 = self.expand_conv(x2) - x2 = self.depthwise_conv(x2) - x2 = self.linear_conv(x2) - - out = torch.cat((self.branch1(x1), x2), dim=1) - - out = channel_shuffle(out, 2) - - return out - - if self.with_cp and x.requires_grad: - out = cp.checkpoint(_inner_forward, x) - else: - out = _inner_forward(x) - - return out - - -class IterativeHead(nn.Module): - """Extra iterative head for feature learning. - - Args: - in_channels (int): The input channels of the block. - norm_cfg (dict): Config dict for normalization layer. - Default: dict(type='BN'). 
- """ - - def __init__(self, in_channels, norm_cfg=dict(type='BN')): - super().__init__() - projects = [] - num_branchs = len(in_channels) - self.in_channels = in_channels[::-1] - - for i in range(num_branchs): - if i != num_branchs - 1: - projects.append( - DepthwiseSeparableConvModule( - in_channels=self.in_channels[i], - out_channels=self.in_channels[i + 1], - kernel_size=3, - stride=1, - padding=1, - norm_cfg=norm_cfg, - act_cfg=dict(type='ReLU'), - dw_act_cfg=None, - pw_act_cfg=dict(type='ReLU'))) - else: - projects.append( - DepthwiseSeparableConvModule( - in_channels=self.in_channels[i], - out_channels=self.in_channels[i], - kernel_size=3, - stride=1, - padding=1, - norm_cfg=norm_cfg, - act_cfg=dict(type='ReLU'), - dw_act_cfg=None, - pw_act_cfg=dict(type='ReLU'))) - self.projects = nn.ModuleList(projects) - - def forward(self, x): - x = x[::-1] - - y = [] - last_x = None - for i, s in enumerate(x): - if last_x is not None: - last_x = F.interpolate( - last_x, - size=s.size()[-2:], - mode='bilinear', - align_corners=True) - s = s + last_x - s = self.projects[i](s) - y.append(s) - last_x = s - - return y[::-1] - - -class ShuffleUnit(nn.Module): - """InvertedResidual block for ShuffleNetV2 backbone. - - Args: - in_channels (int): The input channels of the block. - out_channels (int): The output channels of the block. - stride (int): Stride of the 3x3 convolution layer. Default: 1 - conv_cfg (dict): Config dict for convolution layer. - Default: None, which means using conv2d. - norm_cfg (dict): Config dict for normalization layer. - Default: dict(type='BN'). - act_cfg (dict): Config dict for activation layer. - Default: dict(type='ReLU'). - with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. Default: False. 
- """ - - def __init__(self, - in_channels, - out_channels, - stride=1, - conv_cfg=None, - norm_cfg=dict(type='BN'), - act_cfg=dict(type='ReLU'), - with_cp=False): - super().__init__() - self.stride = stride - self.with_cp = with_cp - - branch_features = out_channels // 2 - if self.stride == 1: - assert in_channels == branch_features * 2, ( - f'in_channels ({in_channels}) should equal to ' - f'branch_features * 2 ({branch_features * 2}) ' - 'when stride is 1') - - if in_channels != branch_features * 2: - assert self.stride != 1, ( - f'stride ({self.stride}) should not equal 1 when ' - f'in_channels != branch_features * 2') - - if self.stride > 1: - self.branch1 = nn.Sequential( - ConvModule( - in_channels, - in_channels, - kernel_size=3, - stride=self.stride, - padding=1, - groups=in_channels, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=None), - ConvModule( - in_channels, - branch_features, - kernel_size=1, - stride=1, - padding=0, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg), - ) - - self.branch2 = nn.Sequential( - ConvModule( - in_channels if (self.stride > 1) else branch_features, - branch_features, - kernel_size=1, - stride=1, - padding=0, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg), - ConvModule( - branch_features, - branch_features, - kernel_size=3, - stride=self.stride, - padding=1, - groups=branch_features, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=None), - ConvModule( - branch_features, - branch_features, - kernel_size=1, - stride=1, - padding=0, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg)) - - def forward(self, x): - - def _inner_forward(x): - if self.stride > 1: - out = torch.cat((self.branch1(x), self.branch2(x)), dim=1) - else: - x1, x2 = x.chunk(2, dim=1) - out = torch.cat((x1, self.branch2(x2)), dim=1) - - out = channel_shuffle(out, 2) - - return out - - if self.with_cp and x.requires_grad: - out = cp.checkpoint(_inner_forward, x) - else: - out = _inner_forward(x) - - return out - - -class LiteHRModule(nn.Module): - """High-Resolution Module for LiteHRNet. - - It contains conditional channel weighting blocks and - shuffle blocks. - - - Args: - num_branches (int): Number of branches in the module. - num_blocks (int): Number of blocks in the module. - in_channels (list(int)): Number of input image channels. - reduce_ratio (int): Channel reduction ratio. - module_type (str): 'LITE' or 'NAIVE' - multiscale_output (bool): Whether to output multi-scale features. - with_fuse (bool): Whether to use fuse layers. - conv_cfg (dict): dictionary to construct and config conv layer. - norm_cfg (dict): dictionary to construct and config norm layer. - with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. 
- """ - - def __init__( - self, - num_branches, - num_blocks, - in_channels, - reduce_ratio, - module_type, - multiscale_output=False, - with_fuse=True, - conv_cfg=None, - norm_cfg=dict(type='BN'), - with_cp=False, - ): - super().__init__() - self._check_branches(num_branches, in_channels) - - self.in_channels = in_channels - self.num_branches = num_branches - - self.module_type = module_type - self.multiscale_output = multiscale_output - self.with_fuse = with_fuse - self.norm_cfg = norm_cfg - self.conv_cfg = conv_cfg - self.with_cp = with_cp - - if self.module_type.upper() == 'LITE': - self.layers = self._make_weighting_blocks(num_blocks, reduce_ratio) - elif self.module_type.upper() == 'NAIVE': - self.layers = self._make_naive_branches(num_branches, num_blocks) - else: - raise ValueError("module_type should be either 'LITE' or 'NAIVE'.") - if self.with_fuse: - self.fuse_layers = self._make_fuse_layers() - self.relu = nn.ReLU() - - def _check_branches(self, num_branches, in_channels): - """Check input to avoid ValueError.""" - if num_branches != len(in_channels): - error_msg = f'NUM_BRANCHES({num_branches}) ' \ - f'!= NUM_INCHANNELS({len(in_channels)})' - raise ValueError(error_msg) - - def _make_weighting_blocks(self, num_blocks, reduce_ratio, stride=1): - """Make channel weighting blocks.""" - layers = [] - for i in range(num_blocks): - layers.append( - ConditionalChannelWeighting( - self.in_channels, - stride=stride, - reduce_ratio=reduce_ratio, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - with_cp=self.with_cp)) - - return nn.Sequential(*layers) - - def _make_one_branch(self, branch_index, num_blocks, stride=1): - """Make one branch.""" - layers = [] - layers.append( - ShuffleUnit( - self.in_channels[branch_index], - self.in_channels[branch_index], - stride=stride, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=dict(type='ReLU'), - with_cp=self.with_cp)) - for i in range(1, num_blocks): - layers.append( - ShuffleUnit( - self.in_channels[branch_index], - self.in_channels[branch_index], - stride=1, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=dict(type='ReLU'), - with_cp=self.with_cp)) - - return nn.Sequential(*layers) - - def _make_naive_branches(self, num_branches, num_blocks): - """Make branches.""" - branches = [] - - for i in range(num_branches): - branches.append(self._make_one_branch(i, num_blocks)) - - return nn.ModuleList(branches) - - def _make_fuse_layers(self): - """Make fuse layer.""" - if self.num_branches == 1: - return None - - num_branches = self.num_branches - in_channels = self.in_channels - fuse_layers = [] - num_out_branches = num_branches if self.multiscale_output else 1 - for i in range(num_out_branches): - fuse_layer = [] - for j in range(num_branches): - if j > i: - fuse_layer.append( - nn.Sequential( - build_conv_layer( - self.conv_cfg, - in_channels[j], - in_channels[i], - kernel_size=1, - stride=1, - padding=0, - bias=False), - build_norm_layer(self.norm_cfg, in_channels[i])[1], - nn.Upsample( - scale_factor=2**(j - i), mode='nearest'))) - elif j == i: - fuse_layer.append(None) - else: - conv_downsamples = [] - for k in range(i - j): - if k == i - j - 1: - conv_downsamples.append( - nn.Sequential( - build_conv_layer( - self.conv_cfg, - in_channels[j], - in_channels[j], - kernel_size=3, - stride=2, - padding=1, - groups=in_channels[j], - bias=False), - build_norm_layer(self.norm_cfg, - in_channels[j])[1], - build_conv_layer( - self.conv_cfg, - in_channels[j], - in_channels[i], - kernel_size=1, - stride=1, - padding=0, - 
bias=False), - build_norm_layer(self.norm_cfg, - in_channels[i])[1])) - else: - conv_downsamples.append( - nn.Sequential( - build_conv_layer( - self.conv_cfg, - in_channels[j], - in_channels[j], - kernel_size=3, - stride=2, - padding=1, - groups=in_channels[j], - bias=False), - build_norm_layer(self.norm_cfg, - in_channels[j])[1], - build_conv_layer( - self.conv_cfg, - in_channels[j], - in_channels[j], - kernel_size=1, - stride=1, - padding=0, - bias=False), - build_norm_layer(self.norm_cfg, - in_channels[j])[1], - nn.ReLU(inplace=True))) - fuse_layer.append(nn.Sequential(*conv_downsamples)) - fuse_layers.append(nn.ModuleList(fuse_layer)) - - return nn.ModuleList(fuse_layers) - - def forward(self, x): - """Forward function.""" - if self.num_branches == 1: - return [self.layers[0](x[0])] - - if self.module_type.upper() == 'LITE': - out = self.layers(x) - elif self.module_type.upper() == 'NAIVE': - for i in range(self.num_branches): - x[i] = self.layers[i](x[i]) - out = x - - if self.with_fuse: - out_fuse = [] - for i in range(len(self.fuse_layers)): - # `y = 0` will lead to decreased accuracy (0.5~1 mAP) - y = out[0] if i == 0 else self.fuse_layers[i][0](out[0]) - for j in range(self.num_branches): - if i == j: - y += out[j] - else: - y += self.fuse_layers[i][j](out[j]) - out_fuse.append(self.relu(y)) - out = out_fuse - if not self.multiscale_output: - out = [out[0]] - return out - - -@BACKBONES.register_module() -class LiteHRNet(nn.Module): - """Lite-HRNet backbone. - - `Lite-HRNet: A Lightweight High-Resolution Network - `_. - - Code adapted from 'https://github.com/HRNet/Lite-HRNet'. - - Args: - extra (dict): detailed configuration for each stage of HRNet. - in_channels (int): Number of input image channels. Default: 3. - conv_cfg (dict): dictionary to construct and config conv layer. - norm_cfg (dict): dictionary to construct and config norm layer. - norm_eval (bool): Whether to set norm layers to eval mode, namely, - freeze running stats (mean and var). Note: Effect on Batch Norm - and its variants only. Default: False - with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. - - Example: - >>> from mmpose.models import LiteHRNet - >>> import torch - >>> extra=dict( - >>> stem=dict(stem_channels=32, out_channels=32, expand_ratio=1), - >>> num_stages=3, - >>> stages_spec=dict( - >>> num_modules=(2, 4, 2), - >>> num_branches=(2, 3, 4), - >>> num_blocks=(2, 2, 2), - >>> module_type=('LITE', 'LITE', 'LITE'), - >>> with_fuse=(True, True, True), - >>> reduce_ratios=(8, 8, 8), - >>> num_channels=( - >>> (40, 80), - >>> (40, 80, 160), - >>> (40, 80, 160, 320), - >>> )), - >>> with_head=False) - >>> self = LiteHRNet(extra, in_channels=1) - >>> self.eval() - >>> inputs = torch.rand(1, 1, 32, 32) - >>> level_outputs = self.forward(inputs) - >>> for level_out in level_outputs: - ... 
print(tuple(level_out.shape)) - (1, 40, 8, 8) - """ - - def __init__(self, - extra, - in_channels=3, - conv_cfg=None, - norm_cfg=dict(type='BN'), - norm_eval=False, - with_cp=False): - super().__init__() - self.extra = extra - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg - self.norm_eval = norm_eval - self.with_cp = with_cp - - self.stem = Stem( - in_channels, - stem_channels=self.extra['stem']['stem_channels'], - out_channels=self.extra['stem']['out_channels'], - expand_ratio=self.extra['stem']['expand_ratio'], - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg) - - self.num_stages = self.extra['num_stages'] - self.stages_spec = self.extra['stages_spec'] - - num_channels_last = [ - self.stem.out_channels, - ] - for i in range(self.num_stages): - num_channels = self.stages_spec['num_channels'][i] - num_channels = [num_channels[i] for i in range(len(num_channels))] - setattr( - self, f'transition{i}', - self._make_transition_layer(num_channels_last, num_channels)) - - stage, num_channels_last = self._make_stage( - self.stages_spec, i, num_channels, multiscale_output=True) - setattr(self, f'stage{i}', stage) - - self.with_head = self.extra['with_head'] - if self.with_head: - self.head_layer = IterativeHead( - in_channels=num_channels_last, - norm_cfg=self.norm_cfg, - ) - - def _make_transition_layer(self, num_channels_pre_layer, - num_channels_cur_layer): - """Make transition layer.""" - num_branches_cur = len(num_channels_cur_layer) - num_branches_pre = len(num_channels_pre_layer) - - transition_layers = [] - for i in range(num_branches_cur): - if i < num_branches_pre: - if num_channels_cur_layer[i] != num_channels_pre_layer[i]: - transition_layers.append( - nn.Sequential( - build_conv_layer( - self.conv_cfg, - num_channels_pre_layer[i], - num_channels_pre_layer[i], - kernel_size=3, - stride=1, - padding=1, - groups=num_channels_pre_layer[i], - bias=False), - build_norm_layer(self.norm_cfg, - num_channels_pre_layer[i])[1], - build_conv_layer( - self.conv_cfg, - num_channels_pre_layer[i], - num_channels_cur_layer[i], - kernel_size=1, - stride=1, - padding=0, - bias=False), - build_norm_layer(self.norm_cfg, - num_channels_cur_layer[i])[1], - nn.ReLU())) - else: - transition_layers.append(None) - else: - conv_downsamples = [] - for j in range(i + 1 - num_branches_pre): - in_channels = num_channels_pre_layer[-1] - out_channels = num_channels_cur_layer[i] \ - if j == i - num_branches_pre else in_channels - conv_downsamples.append( - nn.Sequential( - build_conv_layer( - self.conv_cfg, - in_channels, - in_channels, - kernel_size=3, - stride=2, - padding=1, - groups=in_channels, - bias=False), - build_norm_layer(self.norm_cfg, in_channels)[1], - build_conv_layer( - self.conv_cfg, - in_channels, - out_channels, - kernel_size=1, - stride=1, - padding=0, - bias=False), - build_norm_layer(self.norm_cfg, out_channels)[1], - nn.ReLU())) - transition_layers.append(nn.Sequential(*conv_downsamples)) - - return nn.ModuleList(transition_layers) - - def _make_stage(self, - stages_spec, - stage_index, - in_channels, - multiscale_output=True): - num_modules = stages_spec['num_modules'][stage_index] - num_branches = stages_spec['num_branches'][stage_index] - num_blocks = stages_spec['num_blocks'][stage_index] - reduce_ratio = stages_spec['reduce_ratios'][stage_index] - with_fuse = stages_spec['with_fuse'][stage_index] - module_type = stages_spec['module_type'][stage_index] - - modules = [] - for i in range(num_modules): - # multi_scale_output is only used last module - if not multiscale_output and i == 
num_modules - 1: - reset_multiscale_output = False - else: - reset_multiscale_output = True - - modules.append( - LiteHRModule( - num_branches, - num_blocks, - in_channels, - reduce_ratio, - module_type, - multiscale_output=reset_multiscale_output, - with_fuse=with_fuse, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - with_cp=self.with_cp)) - in_channels = modules[-1].in_channels - - return nn.Sequential(*modules), in_channels - - def init_weights(self, pretrained=None): - """Initialize the weights in backbone. - - Args: - pretrained (str, optional): Path to pre-trained weights. - Defaults to None. - """ - if isinstance(pretrained, str): - logger = get_root_logger() - load_checkpoint(self, pretrained, strict=False, logger=logger) - elif pretrained is None: - for m in self.modules(): - if isinstance(m, nn.Conv2d): - normal_init(m, std=0.001) - elif isinstance(m, (_BatchNorm, nn.GroupNorm)): - constant_init(m, 1) - else: - raise TypeError('pretrained must be a str or None') - - def forward(self, x): - """Forward function.""" - x = self.stem(x) - - y_list = [x] - for i in range(self.num_stages): - x_list = [] - transition = getattr(self, f'transition{i}') - for j in range(self.stages_spec['num_branches'][i]): - if transition[j]: - if j >= len(y_list): - x_list.append(transition[j](y_list[-1])) - else: - x_list.append(transition[j](y_list[j])) - else: - x_list.append(y_list[j]) - y_list = getattr(self, f'stage{i}')(x_list) - - x = y_list - if self.with_head: - x = self.head_layer(x) - - return [x[0]] - - def train(self, mode=True): - """Convert the model into training mode.""" - super().train(mode) - if mode and self.norm_eval: - for m in self.modules(): - if isinstance(m, _BatchNorm): - m.eval() diff --git a/main/transformer_utils/mmpose/models/backbones/mobilenet_v2.py b/main/transformer_utils/mmpose/models/backbones/mobilenet_v2.py deleted file mode 100644 index 5dc0cd1b7dfdec2aa751861e39fc1c1a45ec488e..0000000000000000000000000000000000000000 --- a/main/transformer_utils/mmpose/models/backbones/mobilenet_v2.py +++ /dev/null @@ -1,275 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import copy -import logging - -import torch.nn as nn -import torch.utils.checkpoint as cp -from mmcv.cnn import ConvModule, constant_init, kaiming_init -from torch.nn.modules.batchnorm import _BatchNorm - -from ..builder import BACKBONES -from .base_backbone import BaseBackbone -from .utils import load_checkpoint, make_divisible - - -class InvertedResidual(nn.Module): - """InvertedResidual block for MobileNetV2. - - Args: - in_channels (int): The input channels of the InvertedResidual block. - out_channels (int): The output channels of the InvertedResidual block. - stride (int): Stride of the middle (first) 3x3 convolution. - expand_ratio (int): adjusts number of channels of the hidden layer - in InvertedResidual by this amount. - conv_cfg (dict): Config dict for convolution layer. - Default: None, which means using conv2d. - norm_cfg (dict): Config dict for normalization layer. - Default: dict(type='BN'). - act_cfg (dict): Config dict for activation layer. - Default: dict(type='ReLU6'). - with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. Default: False. 
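-
-    Example:
-        A short sketch (added; sizes are assumptions). With stride 1 and
-        matching channels the block uses a residual connection.
-
-        >>> import torch
-        >>> block = InvertedResidual(32, 32, stride=1, expand_ratio=6)
-        >>> out = block(torch.rand(1, 32, 56, 56))  # (1, 32, 56, 56)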
- """ - - def __init__(self, - in_channels, - out_channels, - stride, - expand_ratio, - conv_cfg=None, - norm_cfg=dict(type='BN'), - act_cfg=dict(type='ReLU6'), - with_cp=False): - # Protect mutable default arguments - norm_cfg = copy.deepcopy(norm_cfg) - act_cfg = copy.deepcopy(act_cfg) - super().__init__() - self.stride = stride - assert stride in [1, 2], f'stride must in [1, 2]. ' \ - f'But received {stride}.' - self.with_cp = with_cp - self.use_res_connect = self.stride == 1 and in_channels == out_channels - hidden_dim = int(round(in_channels * expand_ratio)) - - layers = [] - if expand_ratio != 1: - layers.append( - ConvModule( - in_channels=in_channels, - out_channels=hidden_dim, - kernel_size=1, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg)) - layers.extend([ - ConvModule( - in_channels=hidden_dim, - out_channels=hidden_dim, - kernel_size=3, - stride=stride, - padding=1, - groups=hidden_dim, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg), - ConvModule( - in_channels=hidden_dim, - out_channels=out_channels, - kernel_size=1, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=None) - ]) - self.conv = nn.Sequential(*layers) - - def forward(self, x): - - def _inner_forward(x): - if self.use_res_connect: - return x + self.conv(x) - return self.conv(x) - - if self.with_cp and x.requires_grad: - out = cp.checkpoint(_inner_forward, x) - else: - out = _inner_forward(x) - - return out - - -@BACKBONES.register_module() -class MobileNetV2(BaseBackbone): - """MobileNetV2 backbone. - - Args: - widen_factor (float): Width multiplier, multiply number of - channels in each layer by this amount. Default: 1.0. - out_indices (None or Sequence[int]): Output from which stages. - Default: (7, ). - frozen_stages (int): Stages to be frozen (all param fixed). - Default: -1, which means not freezing any parameters. - conv_cfg (dict): Config dict for convolution layer. - Default: None, which means using conv2d. - norm_cfg (dict): Config dict for normalization layer. - Default: dict(type='BN'). - act_cfg (dict): Config dict for activation layer. - Default: dict(type='ReLU6'). - norm_eval (bool): Whether to set norm layers to eval mode, namely, - freeze running stats (mean and var). Note: Effect on Batch Norm - and its variants only. Default: False. - with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. Default: False. - """ - - # Parameters to build layers. 4 parameters are needed to construct a - # layer, from left to right: expand_ratio, channel, num_blocks, stride. - arch_settings = [[1, 16, 1, 1], [6, 24, 2, 2], [6, 32, 3, 2], - [6, 64, 4, 2], [6, 96, 3, 1], [6, 160, 3, 2], - [6, 320, 1, 1]] - - def __init__(self, - widen_factor=1., - out_indices=(7, ), - frozen_stages=-1, - conv_cfg=None, - norm_cfg=dict(type='BN'), - act_cfg=dict(type='ReLU6'), - norm_eval=False, - with_cp=False): - # Protect mutable default arguments - norm_cfg = copy.deepcopy(norm_cfg) - act_cfg = copy.deepcopy(act_cfg) - super().__init__() - self.widen_factor = widen_factor - self.out_indices = out_indices - for index in out_indices: - if index not in range(0, 8): - raise ValueError('the item in out_indices must in ' - f'range(0, 8). But received {index}') - - if frozen_stages not in range(-1, 8): - raise ValueError('frozen_stages must be in range(-1, 8). 
' - f'But received {frozen_stages}') - self.out_indices = out_indices - self.frozen_stages = frozen_stages - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg - self.act_cfg = act_cfg - self.norm_eval = norm_eval - self.with_cp = with_cp - - self.in_channels = make_divisible(32 * widen_factor, 8) - - self.conv1 = ConvModule( - in_channels=3, - out_channels=self.in_channels, - kernel_size=3, - stride=2, - padding=1, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg) - - self.layers = [] - - for i, layer_cfg in enumerate(self.arch_settings): - expand_ratio, channel, num_blocks, stride = layer_cfg - out_channels = make_divisible(channel * widen_factor, 8) - inverted_res_layer = self.make_layer( - out_channels=out_channels, - num_blocks=num_blocks, - stride=stride, - expand_ratio=expand_ratio) - layer_name = f'layer{i + 1}' - self.add_module(layer_name, inverted_res_layer) - self.layers.append(layer_name) - - if widen_factor > 1.0: - self.out_channel = int(1280 * widen_factor) - else: - self.out_channel = 1280 - - layer = ConvModule( - in_channels=self.in_channels, - out_channels=self.out_channel, - kernel_size=1, - stride=1, - padding=0, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg) - self.add_module('conv2', layer) - self.layers.append('conv2') - - def make_layer(self, out_channels, num_blocks, stride, expand_ratio): - """Stack InvertedResidual blocks to build a layer for MobileNetV2. - - Args: - out_channels (int): out_channels of block. - num_blocks (int): number of blocks. - stride (int): stride of the first block. Default: 1 - expand_ratio (int): Expand the number of channels of the - hidden layer in InvertedResidual by this ratio. Default: 6. - """ - layers = [] - for i in range(num_blocks): - if i >= 1: - stride = 1 - layers.append( - InvertedResidual( - self.in_channels, - out_channels, - stride, - expand_ratio=expand_ratio, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg, - with_cp=self.with_cp)) - self.in_channels = out_channels - - return nn.Sequential(*layers) - - def init_weights(self, pretrained=None): - if isinstance(pretrained, str): - logger = logging.getLogger() - load_checkpoint(self, pretrained, strict=False, logger=logger) - elif pretrained is None: - for m in self.modules(): - if isinstance(m, nn.Conv2d): - kaiming_init(m) - elif isinstance(m, (_BatchNorm, nn.GroupNorm)): - constant_init(m, 1) - else: - raise TypeError('pretrained must be a str or None') - - def forward(self, x): - x = self.conv1(x) - - outs = [] - for i, layer_name in enumerate(self.layers): - layer = getattr(self, layer_name) - x = layer(x) - if i in self.out_indices: - outs.append(x) - - if len(outs) == 1: - return outs[0] - return tuple(outs) - - def _freeze_stages(self): - if self.frozen_stages >= 0: - for param in self.conv1.parameters(): - param.requires_grad = False - for i in range(1, self.frozen_stages + 1): - layer = getattr(self, f'layer{i}') - layer.eval() - for param in layer.parameters(): - param.requires_grad = False - - def train(self, mode=True): - super().train(mode) - self._freeze_stages() - if mode and self.norm_eval: - for m in self.modules(): - if isinstance(m, _BatchNorm): - m.eval() diff --git a/main/transformer_utils/mmpose/models/backbones/mobilenet_v3.py b/main/transformer_utils/mmpose/models/backbones/mobilenet_v3.py deleted file mode 100644 index d640abec79f06d689f2d4bc1e92999946bc07261..0000000000000000000000000000000000000000 --- 
a/main/transformer_utils/mmpose/models/backbones/mobilenet_v3.py +++ /dev/null @@ -1,188 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import copy -import logging - -import torch.nn as nn -from mmcv.cnn import ConvModule, constant_init, kaiming_init -from torch.nn.modules.batchnorm import _BatchNorm - -from ..builder import BACKBONES -from .base_backbone import BaseBackbone -from .utils import InvertedResidual, load_checkpoint - - -@BACKBONES.register_module() -class MobileNetV3(BaseBackbone): - """MobileNetV3 backbone. - - Args: - arch (str): Architecture of mobilnetv3, from {small, big}. - Default: small. - conv_cfg (dict): Config dict for convolution layer. - Default: None, which means using conv2d. - norm_cfg (dict): Config dict for normalization layer. - Default: dict(type='BN'). - out_indices (None or Sequence[int]): Output from which stages. - Default: (-1, ), which means output tensors from final stage. - frozen_stages (int): Stages to be frozen (all param fixed). - Default: -1, which means not freezing any parameters. - norm_eval (bool): Whether to set norm layers to eval mode, namely, - freeze running stats (mean and var). Note: Effect on Batch Norm - and its variants only. Default: False. - with_cp (bool): Use checkpoint or not. Using checkpoint will save - some memory while slowing down the training speed. - Default: False. - """ - # Parameters to build each block: - # [kernel size, mid channels, out channels, with_se, act type, stride] - arch_settings = { - 'small': [[3, 16, 16, True, 'ReLU', 2], - [3, 72, 24, False, 'ReLU', 2], - [3, 88, 24, False, 'ReLU', 1], - [5, 96, 40, True, 'HSwish', 2], - [5, 240, 40, True, 'HSwish', 1], - [5, 240, 40, True, 'HSwish', 1], - [5, 120, 48, True, 'HSwish', 1], - [5, 144, 48, True, 'HSwish', 1], - [5, 288, 96, True, 'HSwish', 2], - [5, 576, 96, True, 'HSwish', 1], - [5, 576, 96, True, 'HSwish', 1]], - 'big': [[3, 16, 16, False, 'ReLU', 1], - [3, 64, 24, False, 'ReLU', 2], - [3, 72, 24, False, 'ReLU', 1], - [5, 72, 40, True, 'ReLU', 2], - [5, 120, 40, True, 'ReLU', 1], - [5, 120, 40, True, 'ReLU', 1], - [3, 240, 80, False, 'HSwish', 2], - [3, 200, 80, False, 'HSwish', 1], - [3, 184, 80, False, 'HSwish', 1], - [3, 184, 80, False, 'HSwish', 1], - [3, 480, 112, True, 'HSwish', 1], - [3, 672, 112, True, 'HSwish', 1], - [5, 672, 160, True, 'HSwish', 1], - [5, 672, 160, True, 'HSwish', 2], - [5, 960, 160, True, 'HSwish', 1]] - } # yapf: disable - - def __init__(self, - arch='small', - conv_cfg=None, - norm_cfg=dict(type='BN'), - out_indices=(-1, ), - frozen_stages=-1, - norm_eval=False, - with_cp=False): - # Protect mutable default arguments - norm_cfg = copy.deepcopy(norm_cfg) - super().__init__() - assert arch in self.arch_settings - for index in out_indices: - if index not in range(-len(self.arch_settings[arch]), - len(self.arch_settings[arch])): - raise ValueError('the item in out_indices must in ' - f'range(0, {len(self.arch_settings[arch])}). ' - f'But received {index}') - - if frozen_stages not in range(-1, len(self.arch_settings[arch])): - raise ValueError('frozen_stages must be in range(-1, ' - f'{len(self.arch_settings[arch])}). 
' - f'But received {frozen_stages}') - self.arch = arch - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg - self.out_indices = out_indices - self.frozen_stages = frozen_stages - self.norm_eval = norm_eval - self.with_cp = with_cp - - self.in_channels = 16 - self.conv1 = ConvModule( - in_channels=3, - out_channels=self.in_channels, - kernel_size=3, - stride=2, - padding=1, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=dict(type='HSwish')) - - self.layers = self._make_layer() - self.feat_dim = self.arch_settings[arch][-1][2] - - def _make_layer(self): - layers = [] - layer_setting = self.arch_settings[self.arch] - for i, params in enumerate(layer_setting): - (kernel_size, mid_channels, out_channels, with_se, act, - stride) = params - if with_se: - se_cfg = dict( - channels=mid_channels, - ratio=4, - act_cfg=(dict(type='ReLU'), dict(type='HSigmoid'))) - else: - se_cfg = None - - layer = InvertedResidual( - in_channels=self.in_channels, - out_channels=out_channels, - mid_channels=mid_channels, - kernel_size=kernel_size, - stride=stride, - se_cfg=se_cfg, - with_expand_conv=True, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=dict(type=act), - with_cp=self.with_cp) - self.in_channels = out_channels - layer_name = f'layer{i + 1}' - self.add_module(layer_name, layer) - layers.append(layer_name) - return layers - - def init_weights(self, pretrained=None): - if isinstance(pretrained, str): - logger = logging.getLogger() - load_checkpoint(self, pretrained, strict=False, logger=logger) - elif pretrained is None: - for m in self.modules(): - if isinstance(m, nn.Conv2d): - kaiming_init(m) - elif isinstance(m, nn.BatchNorm2d): - constant_init(m, 1) - else: - raise TypeError('pretrained must be a str or None') - - def forward(self, x): - x = self.conv1(x) - - outs = [] - for i, layer_name in enumerate(self.layers): - layer = getattr(self, layer_name) - x = layer(x) - if i in self.out_indices or \ - i - len(self.layers) in self.out_indices: - outs.append(x) - - if len(outs) == 1: - return outs[0] - return tuple(outs) - - def _freeze_stages(self): - if self.frozen_stages >= 0: - for param in self.conv1.parameters(): - param.requires_grad = False - for i in range(1, self.frozen_stages + 1): - layer = getattr(self, f'layer{i}') - layer.eval() - for param in layer.parameters(): - param.requires_grad = False - - def train(self, mode=True): - super().train(mode) - self._freeze_stages() - if mode and self.norm_eval: - for m in self.modules(): - if isinstance(m, _BatchNorm): - m.eval() diff --git a/main/transformer_utils/mmpose/models/backbones/modules/basic_block.py b/main/transformer_utils/mmpose/models/backbones/modules/basic_block.py index 44feef44dfc43a7b40b82752d9a82df35f1108ba..7f93a99db49704b7e1aeb71fb5e209298465dcb0 100644 --- a/main/transformer_utils/mmpose/models/backbones/modules/basic_block.py +++ b/main/transformer_utils/mmpose/models/backbones/modules/basic_block.py @@ -12,13 +12,11 @@ import torch.nn as nn import torch.nn.functional as F import torch.utils.checkpoint as cp from .transformer_block import TransformerBlock - +from mmengine.model import constant_init, kaiming_init from mmcv.cnn import ( build_conv_layer, build_norm_layer, build_plugin_layer, - constant_init, - kaiming_init, ) diff --git a/main/transformer_utils/mmpose/models/backbones/mspn.py b/main/transformer_utils/mmpose/models/backbones/mspn.py deleted file mode 100644 index 71cee34e399780e8b67eac43d862b65a3ce05412..0000000000000000000000000000000000000000 --- 
a/main/transformer_utils/mmpose/models/backbones/mspn.py +++ /dev/null @@ -1,513 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import copy as cp -from collections import OrderedDict - -import torch.nn as nn -import torch.nn.functional as F -from mmcv.cnn import (ConvModule, MaxPool2d, constant_init, kaiming_init, - normal_init) -from mmcv.runner.checkpoint import load_state_dict - -from mmpose.utils import get_root_logger -from ..builder import BACKBONES -from .base_backbone import BaseBackbone -from .resnet import Bottleneck as _Bottleneck -from .utils.utils import get_state_dict - - -class Bottleneck(_Bottleneck): - expansion = 4 - """Bottleneck block for MSPN. - - Args: - in_channels (int): Input channels of this block. - out_channels (int): Output channels of this block. - stride (int): stride of the block. Default: 1 - downsample (nn.Module): downsample operation on identity branch. - Default: None - norm_cfg (dict): dictionary to construct and config norm layer. - Default: dict(type='BN') - """ - - def __init__(self, in_channels, out_channels, **kwargs): - super().__init__(in_channels, out_channels * 4, **kwargs) - - -class DownsampleModule(nn.Module): - """Downsample module for MSPN. - - Args: - block (nn.Module): Downsample block. - num_blocks (list): Number of blocks in each downsample unit. - num_units (int): Numbers of downsample units. Default: 4 - has_skip (bool): Have skip connections from prior upsample - module or not. Default:False - norm_cfg (dict): dictionary to construct and config norm layer. - Default: dict(type='BN') - in_channels (int): Number of channels of the input feature to - downsample module. Default: 64 - """ - - def __init__(self, - block, - num_blocks, - num_units=4, - has_skip=False, - norm_cfg=dict(type='BN'), - in_channels=64): - # Protect mutable default arguments - norm_cfg = cp.deepcopy(norm_cfg) - super().__init__() - self.has_skip = has_skip - self.in_channels = in_channels - assert len(num_blocks) == num_units - self.num_blocks = num_blocks - self.num_units = num_units - self.norm_cfg = norm_cfg - self.layer1 = self._make_layer(block, in_channels, num_blocks[0]) - for i in range(1, num_units): - module_name = f'layer{i + 1}' - self.add_module( - module_name, - self._make_layer( - block, in_channels * pow(2, i), num_blocks[i], stride=2)) - - def _make_layer(self, block, out_channels, blocks, stride=1): - downsample = None - if stride != 1 or self.in_channels != out_channels * block.expansion: - downsample = ConvModule( - self.in_channels, - out_channels * block.expansion, - kernel_size=1, - stride=stride, - padding=0, - norm_cfg=self.norm_cfg, - act_cfg=None, - inplace=True) - - units = list() - units.append( - block( - self.in_channels, - out_channels, - stride=stride, - downsample=downsample, - norm_cfg=self.norm_cfg)) - self.in_channels = out_channels * block.expansion - for _ in range(1, blocks): - units.append(block(self.in_channels, out_channels)) - - return nn.Sequential(*units) - - def forward(self, x, skip1, skip2): - out = list() - for i in range(self.num_units): - module_name = f'layer{i + 1}' - module_i = getattr(self, module_name) - x = module_i(x) - if self.has_skip: - x = x + skip1[i] + skip2[i] - out.append(x) - out.reverse() - - return tuple(out) - - -class UpsampleUnit(nn.Module): - """Upsample unit for upsample module. - - Args: - ind (int): Indicates whether to interpolate (>0) and whether to - generate feature map for the next hourglass-like module. - num_units (int): Number of units that form a upsample module. 
Along - with ind and gen_cross_conv, num_units is used to decide whether - to generate feature map for the next hourglass-like module. - in_channels (int): Channel number of the skip-in feature maps from - the corresponding downsample unit. - unit_channels (int): Channel number in this unit. Default:256. - gen_skip (bool): Whether or not to generate skips for the posterior - downsample module. Default:False - gen_cross_conv (bool): Whether to generate feature map for the next - hourglass-like module. Default:False - norm_cfg (dict): dictionary to construct and config norm layer. - Default: dict(type='BN') - out_channels (int): Number of channels of feature output by upsample - module. Must equal to in_channels of downsample module. Default:64 - """ - - def __init__(self, - ind, - num_units, - in_channels, - unit_channels=256, - gen_skip=False, - gen_cross_conv=False, - norm_cfg=dict(type='BN'), - out_channels=64): - # Protect mutable default arguments - norm_cfg = cp.deepcopy(norm_cfg) - super().__init__() - self.num_units = num_units - self.norm_cfg = norm_cfg - self.in_skip = ConvModule( - in_channels, - unit_channels, - kernel_size=1, - stride=1, - padding=0, - norm_cfg=self.norm_cfg, - act_cfg=None, - inplace=True) - self.relu = nn.ReLU(inplace=True) - - self.ind = ind - if self.ind > 0: - self.up_conv = ConvModule( - unit_channels, - unit_channels, - kernel_size=1, - stride=1, - padding=0, - norm_cfg=self.norm_cfg, - act_cfg=None, - inplace=True) - - self.gen_skip = gen_skip - if self.gen_skip: - self.out_skip1 = ConvModule( - in_channels, - in_channels, - kernel_size=1, - stride=1, - padding=0, - norm_cfg=self.norm_cfg, - inplace=True) - - self.out_skip2 = ConvModule( - unit_channels, - in_channels, - kernel_size=1, - stride=1, - padding=0, - norm_cfg=self.norm_cfg, - inplace=True) - - self.gen_cross_conv = gen_cross_conv - if self.ind == num_units - 1 and self.gen_cross_conv: - self.cross_conv = ConvModule( - unit_channels, - out_channels, - kernel_size=1, - stride=1, - padding=0, - norm_cfg=self.norm_cfg, - inplace=True) - - def forward(self, x, up_x): - out = self.in_skip(x) - - if self.ind > 0: - up_x = F.interpolate( - up_x, - size=(x.size(2), x.size(3)), - mode='bilinear', - align_corners=True) - up_x = self.up_conv(up_x) - out = out + up_x - out = self.relu(out) - - skip1 = None - skip2 = None - if self.gen_skip: - skip1 = self.out_skip1(x) - skip2 = self.out_skip2(out) - - cross_conv = None - if self.ind == self.num_units - 1 and self.gen_cross_conv: - cross_conv = self.cross_conv(out) - - return out, skip1, skip2, cross_conv - - -class UpsampleModule(nn.Module): - """Upsample module for MSPN. - - Args: - unit_channels (int): Channel number in the upsample units. - Default:256. - num_units (int): Number of upsample units. Default: 4 - gen_skip (bool): Whether to generate skip for posterior downsample - module or not. Default:False - gen_cross_conv (bool): Whether to generate feature map for the next - hourglass-like module. Default:False - norm_cfg (dict): dictionary to construct and config norm layer. - Default: dict(type='BN') - out_channels (int): Number of channels of feature output by upsample - module. Must equal to in_channels of downsample module.
Default:64 - """ - - def __init__(self, - unit_channels=256, - num_units=4, - gen_skip=False, - gen_cross_conv=False, - norm_cfg=dict(type='BN'), - out_channels=64): - # Protect mutable default arguments - norm_cfg = cp.deepcopy(norm_cfg) - super().__init__() - self.in_channels = list() - for i in range(num_units): - self.in_channels.append(Bottleneck.expansion * out_channels * - pow(2, i)) - self.in_channels.reverse() - self.num_units = num_units - self.gen_skip = gen_skip - self.gen_cross_conv = gen_cross_conv - self.norm_cfg = norm_cfg - for i in range(num_units): - module_name = f'up{i + 1}' - self.add_module( - module_name, - UpsampleUnit( - i, - self.num_units, - self.in_channels[i], - unit_channels, - self.gen_skip, - self.gen_cross_conv, - norm_cfg=self.norm_cfg, - out_channels=64)) - - def forward(self, x): - out = list() - skip1 = list() - skip2 = list() - cross_conv = None - for i in range(self.num_units): - module_i = getattr(self, f'up{i + 1}') - if i == 0: - outi, skip1_i, skip2_i, _ = module_i(x[i], None) - elif i == self.num_units - 1: - outi, skip1_i, skip2_i, cross_conv = module_i(x[i], out[i - 1]) - else: - outi, skip1_i, skip2_i, _ = module_i(x[i], out[i - 1]) - out.append(outi) - skip1.append(skip1_i) - skip2.append(skip2_i) - skip1.reverse() - skip2.reverse() - - return out, skip1, skip2, cross_conv - - -class SingleStageNetwork(nn.Module): - """Single_stage Network. - - Args: - unit_channels (int): Channel number in the upsample units. Default:256. - num_units (int): Numbers of downsample/upsample units. Default: 4 - gen_skip (bool): Whether to generate skip for posterior downsample - module or not. Default:False - gen_cross_conv (bool): Whether to generate feature map for the next - hourglass-like module. Default:False - has_skip (bool): Have skip connections from prior upsample - module or not. Default:False - num_blocks (list): Number of blocks in each downsample unit. - Default: [2, 2, 2, 2] Note: Make sure num_units==len(num_blocks) - norm_cfg (dict): dictionary to construct and config norm layer. - Default: dict(type='BN') - in_channels (int): Number of channels of the feature from ResNetTop. - Default: 64. - """ - - def __init__(self, - has_skip=False, - gen_skip=False, - gen_cross_conv=False, - unit_channels=256, - num_units=4, - num_blocks=[2, 2, 2, 2], - norm_cfg=dict(type='BN'), - in_channels=64): - # Protect mutable default arguments - norm_cfg = cp.deepcopy(norm_cfg) - num_blocks = cp.deepcopy(num_blocks) - super().__init__() - assert len(num_blocks) == num_units - self.has_skip = has_skip - self.gen_skip = gen_skip - self.gen_cross_conv = gen_cross_conv - self.num_units = num_units - self.unit_channels = unit_channels - self.num_blocks = num_blocks - self.norm_cfg = norm_cfg - - self.downsample = DownsampleModule(Bottleneck, num_blocks, num_units, - has_skip, norm_cfg, in_channels) - self.upsample = UpsampleModule(unit_channels, num_units, gen_skip, - gen_cross_conv, norm_cfg, in_channels) - - def forward(self, x, skip1, skip2): - mid = self.downsample(x, skip1, skip2) - out, skip1, skip2, cross_conv = self.upsample(mid) - - return out, skip1, skip2, cross_conv - - -class ResNetTop(nn.Module): - """ResNet top for MSPN. - - Args: - norm_cfg (dict): dictionary to construct and config norm layer. - Default: dict(type='BN') - channels (int): Number of channels of the feature output by ResNetTop. 
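To keep the deleted MSPN wiring readable: `SingleStageNetwork` above consumes a feature plus two skip lists and returns per-unit outputs, fresh skips, and a `cross_conv` feature that seeds the next stage. Below is a minimal sketch of that threading in plain PyTorch; `ToyStage` is a hypothetical stand-in, not part of the removed code.

```python
import torch
import torch.nn as nn

class ToyStage(nn.Module):
    """Hypothetical stand-in for SingleStageNetwork: one hourglass-like stage."""

    def __init__(self, channels=64):
        super().__init__()
        self.body = nn.Conv2d(channels, channels, 3, padding=1)

    def forward(self, x, skip1, skip2):
        if skip1 is not None:  # has_skip=True for every stage but the first
            x = x + skip1 + skip2
        out = self.body(x)
        # out feeds the stage losses; the skips and cross_conv feed the next stage
        return out, out, out, out

stages = nn.ModuleList(ToyStage() for _ in range(4))
x, skip1, skip2 = torch.randn(1, 64, 64, 64), None, None
out_feats = []
for stage in stages:  # mirrors the loop in MSPN.forward
    out, skip1, skip2, x = stage(x, skip1, skip2)
    out_feats.append(out)
```

The real modules pass lists of per-unit tensors rather than single feature maps, but the stage-to-stage handoff is exactly this four-tuple.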
- """ - - def __init__(self, norm_cfg=dict(type='BN'), channels=64): - # Protect mutable default arguments - norm_cfg = cp.deepcopy(norm_cfg) - super().__init__() - self.top = nn.Sequential( - ConvModule( - 3, - channels, - kernel_size=7, - stride=2, - padding=3, - norm_cfg=norm_cfg, - inplace=True), MaxPool2d(kernel_size=3, stride=2, padding=1)) - - def forward(self, img): - return self.top(img) - - -@BACKBONES.register_module() -class MSPN(BaseBackbone): - """MSPN backbone. Paper ref: Li et al. "Rethinking on Multi-Stage Networks - for Human Pose Estimation" (CVPR 2020). - - Args: - unit_channels (int): Number of Channels in an upsample unit. - Default: 256 - num_stages (int): Number of stages in a multi-stage MSPN. Default: 4 - num_units (int): Number of downsample/upsample units in a single-stage - network. Default: 4 - Note: Make sure num_units == len(self.num_blocks) - num_blocks (list): Number of bottlenecks in each - downsample unit. Default: [2, 2, 2, 2] - norm_cfg (dict): dictionary to construct and config norm layer. - Default: dict(type='BN') - res_top_channels (int): Number of channels of feature from ResNetTop. - Default: 64. - - Example: - >>> from mmpose.models import MSPN - >>> import torch - >>> self = MSPN(num_stages=2,num_units=2,num_blocks=[2,2]) - >>> self.eval() - >>> inputs = torch.rand(1, 3, 511, 511) - >>> level_outputs = self.forward(inputs) - >>> for level_output in level_outputs: - ... for feature in level_output: - ... print(tuple(feature.shape)) - ... - (1, 256, 64, 64) - (1, 256, 128, 128) - (1, 256, 64, 64) - (1, 256, 128, 128) - """ - - def __init__(self, - unit_channels=256, - num_stages=4, - num_units=4, - num_blocks=[2, 2, 2, 2], - norm_cfg=dict(type='BN'), - res_top_channels=64): - # Protect mutable default arguments - norm_cfg = cp.deepcopy(norm_cfg) - num_blocks = cp.deepcopy(num_blocks) - super().__init__() - self.unit_channels = unit_channels - self.num_stages = num_stages - self.num_units = num_units - self.num_blocks = num_blocks - self.norm_cfg = norm_cfg - - assert self.num_stages > 0 - assert self.num_units > 1 - assert self.num_units == len(self.num_blocks) - self.top = ResNetTop(norm_cfg=norm_cfg) - self.multi_stage_mspn = nn.ModuleList([]) - for i in range(self.num_stages): - if i == 0: - has_skip = False - else: - has_skip = True - if i != self.num_stages - 1: - gen_skip = True - gen_cross_conv = True - else: - gen_skip = False - gen_cross_conv = False - self.multi_stage_mspn.append( - SingleStageNetwork(has_skip, gen_skip, gen_cross_conv, - unit_channels, num_units, num_blocks, - norm_cfg, res_top_channels)) - - def forward(self, x): - """Model forward function.""" - out_feats = [] - skip1 = None - skip2 = None - x = self.top(x) - for i in range(self.num_stages): - out, skip1, skip2, x = self.multi_stage_mspn[i](x, skip1, skip2) - out_feats.append(out) - - return out_feats - - def init_weights(self, pretrained=None): - """Initialize model weights.""" - if isinstance(pretrained, str): - logger = get_root_logger() - state_dict_tmp = get_state_dict(pretrained) - state_dict = OrderedDict() - state_dict['top'] = OrderedDict() - state_dict['bottlenecks'] = OrderedDict() - for k, v in state_dict_tmp.items(): - if k.startswith('layer'): - if 'downsample.0' in k: - state_dict['bottlenecks'][k.replace( - 'downsample.0', 'downsample.conv')] = v - elif 'downsample.1' in k: - state_dict['bottlenecks'][k.replace( - 'downsample.1', 'downsample.bn')] = v - else: - state_dict['bottlenecks'][k] = v - elif k.startswith('conv1'): - 
state_dict['top'][k.replace('conv1', 'top.0.conv')] = v - elif k.startswith('bn1'): - state_dict['top'][k.replace('bn1', 'top.0.bn')] = v - - load_state_dict( - self.top, state_dict['top'], strict=False, logger=logger) - for i in range(self.num_stages): - load_state_dict( - self.multi_stage_mspn[i].downsample, - state_dict['bottlenecks'], - strict=False, - logger=logger) - else: - for m in self.multi_stage_mspn.modules(): - if isinstance(m, nn.Conv2d): - kaiming_init(m) - elif isinstance(m, nn.BatchNorm2d): - constant_init(m, 1) - elif isinstance(m, nn.Linear): - normal_init(m, std=0.01) - - for m in self.top.modules(): - if isinstance(m, nn.Conv2d): - kaiming_init(m) diff --git a/main/transformer_utils/mmpose/models/backbones/pvt.py b/main/transformer_utils/mmpose/models/backbones/pvt.py deleted file mode 100644 index 62527a7dc817513c08f42ccbb166c75cab514873..0000000000000000000000000000000000000000 --- a/main/transformer_utils/mmpose/models/backbones/pvt.py +++ /dev/null @@ -1,592 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import math -import warnings - -import numpy as np -import torch -import torch.nn as nn -import torch.nn.functional as F -from mmcv.cnn import (Conv2d, build_activation_layer, build_norm_layer, - constant_init, normal_init, trunc_normal_init) -from mmcv.cnn.bricks.drop import build_dropout -from mmcv.cnn.bricks.transformer import MultiheadAttention -from mmcv.cnn.utils.weight_init import trunc_normal_ -from mmcv.runner import (BaseModule, ModuleList, Sequential, _load_checkpoint, - load_state_dict) -from torch.nn.modules.utils import _pair as to_2tuple - -from ...utils import get_root_logger -from ..builder import BACKBONES -from ..utils import PatchEmbed, nchw_to_nlc, nlc_to_nchw, pvt_convert - - -class MixFFN(BaseModule): - """An implementation of MixFFN of PVT. - - The differences between MixFFN & FFN: - 1. Use 1X1 Conv to replace Linear layer. - 2. Introduce 3X3 Depth-wise Conv to encode positional information. - - Args: - embed_dims (int): The feature dimension. Same as - `MultiheadAttention`. - feedforward_channels (int): The hidden dimension of FFNs. - act_cfg (dict, optional): The activation config for FFNs. - Default: dict(type='GELU'). - ffn_drop (float, optional): Probability of an element to be - zeroed in FFN. Default 0.0. - dropout_layer (obj:`ConfigDict`): The dropout_layer used - when adding the shortcut. - Default: None. - use_conv (bool): If True, add 3x3 DWConv between two Linear layers. - Defaults: False. - init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. - Default: None. 
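The MixFFN documented above replaces both Linear layers of a standard transformer FFN with 1x1 convolutions and can insert a 3x3 depth-wise convolution to encode position. A minimal pure-PyTorch sketch of that forward path follows; `ToyMixFFN` is an illustrative name, and the real module's dropout and ConvModule plumbing are elided.

```python
import torch
import torch.nn as nn

class ToyMixFFN(nn.Module):
    # 1x1 convs stand in for the usual Linear layers; the 3x3 depth-wise
    # conv injects positional information (the "Mix" in MixFFN).
    def __init__(self, dim=64, hidden=256):
        super().__init__()
        self.fc1 = nn.Conv2d(dim, hidden, 1)
        self.dw = nn.Conv2d(hidden, hidden, 3, padding=1, groups=hidden)
        self.act = nn.GELU()
        self.fc2 = nn.Conv2d(hidden, dim, 1)

    def forward(self, x, hw_shape):
        B, L, C = x.shape
        H, W = hw_shape
        out = x.transpose(1, 2).reshape(B, C, H, W)   # NLC -> NCHW
        out = self.fc2(self.act(self.dw(self.fc1(out))))
        out = out.flatten(2).transpose(1, 2)          # NCHW -> NLC
        return x + out                                # residual shortcut

y = ToyMixFFN()(torch.randn(2, 16 * 16, 64), (16, 16))
```

The NLC/NCHW round trip is the same job `nlc_to_nchw`/`nchw_to_nlc` do in the deleted code.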
- """ - - def __init__(self, - embed_dims, - feedforward_channels, - act_cfg=dict(type='GELU'), - ffn_drop=0., - dropout_layer=None, - use_conv=False, - init_cfg=None): - super(MixFFN, self).__init__(init_cfg=init_cfg) - - self.embed_dims = embed_dims - self.feedforward_channels = feedforward_channels - self.act_cfg = act_cfg - activate = build_activation_layer(act_cfg) - - in_channels = embed_dims - fc1 = Conv2d( - in_channels=in_channels, - out_channels=feedforward_channels, - kernel_size=1, - stride=1, - bias=True) - if use_conv: - # 3x3 depth wise conv to provide positional encode information - dw_conv = Conv2d( - in_channels=feedforward_channels, - out_channels=feedforward_channels, - kernel_size=3, - stride=1, - padding=(3 - 1) // 2, - bias=True, - groups=feedforward_channels) - fc2 = Conv2d( - in_channels=feedforward_channels, - out_channels=in_channels, - kernel_size=1, - stride=1, - bias=True) - drop = nn.Dropout(ffn_drop) - layers = [fc1, activate, drop, fc2, drop] - if use_conv: - layers.insert(1, dw_conv) - self.layers = Sequential(*layers) - self.dropout_layer = build_dropout( - dropout_layer) if dropout_layer else torch.nn.Identity() - - def forward(self, x, hw_shape, identity=None): - out = nlc_to_nchw(x, hw_shape) - out = self.layers(out) - out = nchw_to_nlc(out) - if identity is None: - identity = x - return identity + self.dropout_layer(out) - - -class SpatialReductionAttention(MultiheadAttention): - """An implementation of Spatial Reduction Attention of PVT. - - This module is modified from MultiheadAttention which is a module from - mmcv.cnn.bricks.transformer. - - Args: - embed_dims (int): The embedding dimension. - num_heads (int): Parallel attention heads. - attn_drop (float): A Dropout layer on attn_output_weights. - Default: 0.0. - proj_drop (float): A Dropout layer after `nn.MultiheadAttention`. - Default: 0.0. - dropout_layer (obj:`ConfigDict`): The dropout_layer used - when adding the shortcut. Default: None. - batch_first (bool): Key, Query and Value are shape of - (batch, n, embed_dim) - or (n, batch, embed_dim). Default: False. - qkv_bias (bool): enable bias for qkv if True. Default: True. - norm_cfg (dict): Config dict for normalization layer. - Default: dict(type='LN'). - sr_ratio (int): The ratio of spatial reduction of Spatial Reduction - Attention of PVT. Default: 1. - init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. - Default: None. - """ - - def __init__(self, - embed_dims, - num_heads, - attn_drop=0., - proj_drop=0., - dropout_layer=None, - batch_first=True, - qkv_bias=True, - norm_cfg=dict(type='LN'), - sr_ratio=1, - init_cfg=None): - super().__init__( - embed_dims, - num_heads, - attn_drop, - proj_drop, - batch_first=batch_first, - dropout_layer=dropout_layer, - bias=qkv_bias, - init_cfg=init_cfg) - - self.sr_ratio = sr_ratio - if sr_ratio > 1: - self.sr = Conv2d( - in_channels=embed_dims, - out_channels=embed_dims, - kernel_size=sr_ratio, - stride=sr_ratio) - # The ret[0] of build_norm_layer is norm name. - self.norm = build_norm_layer(norm_cfg, embed_dims)[1] - - # handle the BC-breaking from https://github.com/open-mmlab/mmcv/pull/1418 # noqa - from mmpose import digit_version, mmcv_version - if mmcv_version < digit_version('1.3.17'): - warnings.warn('The legacy version of forward function in' - 'SpatialReductionAttention is deprecated in' - 'mmcv>=1.3.17 and will no longer support in the' - 'future. 
Please upgrade your mmcv.') - self.forward = self.legacy_forward - - def forward(self, x, hw_shape, identity=None): - - x_q = x - if self.sr_ratio > 1: - x_kv = nlc_to_nchw(x, hw_shape) - x_kv = self.sr(x_kv) - x_kv = nchw_to_nlc(x_kv) - x_kv = self.norm(x_kv) - else: - x_kv = x - - if identity is None: - identity = x_q - - # Because the dataflow('key', 'query', 'value') of - # ``torch.nn.MultiheadAttention`` is (num_query, batch, - # embed_dims), We should adjust the shape of dataflow from - # batch_first (batch, num_query, embed_dims) to num_query_first - # (num_query ,batch, embed_dims), and recover ``attn_output`` - # from num_query_first to batch_first. - if self.batch_first: - x_q = x_q.transpose(0, 1) - x_kv = x_kv.transpose(0, 1) - - out = self.attn(query=x_q, key=x_kv, value=x_kv)[0] - - if self.batch_first: - out = out.transpose(0, 1) - - return identity + self.dropout_layer(self.proj_drop(out)) - - def legacy_forward(self, x, hw_shape, identity=None): - """multi head attention forward in mmcv version < 1.3.17.""" - x_q = x - if self.sr_ratio > 1: - x_kv = nlc_to_nchw(x, hw_shape) - x_kv = self.sr(x_kv) - x_kv = nchw_to_nlc(x_kv) - x_kv = self.norm(x_kv) - else: - x_kv = x - - if identity is None: - identity = x_q - - out = self.attn(query=x_q, key=x_kv, value=x_kv)[0] - - return identity + self.dropout_layer(self.proj_drop(out)) - - -class PVTEncoderLayer(BaseModule): - """Implements one encoder layer in PVT. - - Args: - embed_dims (int): The feature dimension. - num_heads (int): Parallel attention heads. - feedforward_channels (int): The hidden dimension for FFNs. - drop_rate (float): Probability of an element to be zeroed. - after the feed forward layer. Default: 0.0. - attn_drop_rate (float): The drop out rate for attention layer. - Default: 0.0. - drop_path_rate (float): stochastic depth rate. Default: 0.0. - qkv_bias (bool): enable bias for qkv if True. - Default: True. - act_cfg (dict): The activation config for FFNs. - Default: dict(type='GELU'). - norm_cfg (dict): Config dict for normalization layer. - Default: dict(type='LN'). - sr_ratio (int): The ratio of spatial reduction of Spatial Reduction - Attention of PVT. Default: 1. - use_conv_ffn (bool): If True, use Convolutional FFN to replace FFN. - Default: False. - init_cfg (dict, optional): Initialization config dict. - Default: None. - """ - - def __init__(self, - embed_dims, - num_heads, - feedforward_channels, - drop_rate=0., - attn_drop_rate=0., - drop_path_rate=0., - qkv_bias=True, - act_cfg=dict(type='GELU'), - norm_cfg=dict(type='LN'), - sr_ratio=1, - use_conv_ffn=False, - init_cfg=None): - super(PVTEncoderLayer, self).__init__(init_cfg=init_cfg) - - # The ret[0] of build_norm_layer is norm name. - self.norm1 = build_norm_layer(norm_cfg, embed_dims)[1] - - self.attn = SpatialReductionAttention( - embed_dims=embed_dims, - num_heads=num_heads, - attn_drop=attn_drop_rate, - proj_drop=drop_rate, - dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate), - qkv_bias=qkv_bias, - norm_cfg=norm_cfg, - sr_ratio=sr_ratio) - - # The ret[0] of build_norm_layer is norm name. 
- self.norm2 = build_norm_layer(norm_cfg, embed_dims)[1] - - self.ffn = MixFFN( - embed_dims=embed_dims, - feedforward_channels=feedforward_channels, - ffn_drop=drop_rate, - dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate), - use_conv=use_conv_ffn, - act_cfg=act_cfg) - - def forward(self, x, hw_shape): - x = self.attn(self.norm1(x), hw_shape, identity=x) - x = self.ffn(self.norm2(x), hw_shape, identity=x) - - return x - - -class AbsolutePositionEmbedding(BaseModule): - """An implementation of the absolute position embedding in PVT. - - Args: - pos_shape (int): The shape of the absolute position embedding. - pos_dim (int): The dimension of the absolute position embedding. - drop_rate (float): Probability of an element to be zeroed. - Default: 0.0. - """ - - def __init__(self, pos_shape, pos_dim, drop_rate=0., init_cfg=None): - super().__init__(init_cfg=init_cfg) - - if isinstance(pos_shape, int): - pos_shape = to_2tuple(pos_shape) - elif isinstance(pos_shape, tuple): - if len(pos_shape) == 1: - pos_shape = to_2tuple(pos_shape[0]) - assert len(pos_shape) == 2, \ - f'The size of image should have length 1 or 2, ' \ - f'but got {len(pos_shape)}' - self.pos_shape = pos_shape - self.pos_dim = pos_dim - - self.pos_embed = nn.Parameter( - torch.zeros(1, pos_shape[0] * pos_shape[1], pos_dim)) - self.drop = nn.Dropout(p=drop_rate) - - def init_weights(self): - trunc_normal_(self.pos_embed, std=0.02) - - def resize_pos_embed(self, pos_embed, input_shape, mode='bilinear'): - """Resize pos_embed weights. - - Resize pos_embed using bilinear interpolate method. - - Args: - pos_embed (torch.Tensor): Position embedding weights. - input_shape (tuple): Tuple for (downsampled input image height, - downsampled input image width). - mode (str): Algorithm used for upsampling: - ``'nearest'`` | ``'linear'`` | ``'bilinear'`` | ``'bicubic'`` | - ``'trilinear'``. Default: ``'bilinear'``. - - Return: - torch.Tensor: The resized pos_embed of shape [B, L_new, C]. - """ - assert pos_embed.ndim == 3, 'shape of pos_embed must be [B, L, C]' - pos_h, pos_w = self.pos_shape - pos_embed_weight = pos_embed[:, (-1 * pos_h * pos_w):] - pos_embed_weight = pos_embed_weight.reshape( - 1, pos_h, pos_w, self.pos_dim).permute(0, 3, 1, 2).contiguous() - pos_embed_weight = F.interpolate( - pos_embed_weight, size=input_shape, mode=mode) - pos_embed_weight = torch.flatten(pos_embed_weight, - 2).transpose(1, 2).contiguous() - pos_embed = pos_embed_weight - - return pos_embed - - def forward(self, x, hw_shape, mode='bilinear'): - pos_embed = self.resize_pos_embed(self.pos_embed, hw_shape, mode) - return self.drop(x + pos_embed) - - -@BACKBONES.register_module() -class PyramidVisionTransformer(BaseModule): - """Pyramid Vision Transformer (PVT) - - Implementation of `Pyramid Vision Transformer: A Versatile Backbone for - Dense Prediction without Convolutions - `_. - - Args: - pretrain_img_size (int | tuple[int]): The size of input image when - pretrain. Defaults: 224. - in_channels (int): Number of input channels. Default: 3. - embed_dims (int): Embedding dimension. Default: 64. - num_stags (int): The num of stages. Default: 4. - num_layers (Sequence[int]): The layer number of each transformer encode - layer. Default: [3, 4, 6, 3]. - num_heads (Sequence[int]): The attention heads of each transformer - encode layer. Default: [1, 2, 5, 8]. - patch_sizes (Sequence[int]): The patch_size of each patch embedding. - Default: [4, 2, 2, 2]. - strides (Sequence[int]): The stride of each patch embedding. - Default: [4, 2, 2, 2]. 
- paddings (Sequence[int]): The padding of each patch embedding. - Default: [0, 0, 0, 0]. - sr_ratios (Sequence[int]): The spatial reduction rate of each - transformer encode layer. Default: [8, 4, 2, 1]. - out_indices (Sequence[int] | int): Output from which stages. - Default: (0, 1, 2, 3). - mlp_ratios (Sequence[int]): The ratio of the mlp hidden dim to the - embedding dim of each transformer encode layer. - Default: [8, 8, 4, 4]. - qkv_bias (bool): Enable bias for qkv if True. Default: True. - drop_rate (float): Probability of an element to be zeroed. - Default 0.0. - attn_drop_rate (float): The drop out rate for attention layer. - Default 0.0. - drop_path_rate (float): stochastic depth rate. Default 0.1. - use_abs_pos_embed (bool): If True, add absolute position embedding to - the patch embedding. Defaults: True. - use_conv_ffn (bool): If True, use Convolutional FFN to replace FFN. - Default: False. - act_cfg (dict): The activation config for FFNs. - Default: dict(type='GELU'). - norm_cfg (dict): Config dict for normalization layer. - Default: dict(type='LN'). - pretrained (str, optional): model pretrained path. Default: None. - convert_weights (bool): The flag indicates whether the - pre-trained model is from the original repo. We may need - to convert some keys to make it compatible. - Default: True. - init_cfg (dict or list[dict], optional): Initialization config dict. - Default: None. - """ - - def __init__(self, - pretrain_img_size=224, - in_channels=3, - embed_dims=64, - num_stages=4, - num_layers=[3, 4, 6, 3], - num_heads=[1, 2, 5, 8], - patch_sizes=[4, 2, 2, 2], - strides=[4, 2, 2, 2], - paddings=[0, 0, 0, 0], - sr_ratios=[8, 4, 2, 1], - out_indices=(0, 1, 2, 3), - mlp_ratios=[8, 8, 4, 4], - qkv_bias=True, - drop_rate=0., - attn_drop_rate=0., - drop_path_rate=0.1, - use_abs_pos_embed=True, - norm_after_stage=False, - use_conv_ffn=False, - act_cfg=dict(type='GELU'), - norm_cfg=dict(type='LN', eps=1e-6), - pretrained=None, - convert_weights=True, - init_cfg=None): - super().__init__(init_cfg=init_cfg) - - self.convert_weights = convert_weights - if isinstance(pretrain_img_size, int): - pretrain_img_size = to_2tuple(pretrain_img_size) - elif isinstance(pretrain_img_size, tuple): - if len(pretrain_img_size) == 1: - pretrain_img_size = to_2tuple(pretrain_img_size[0]) - assert len(pretrain_img_size) == 2, \ - f'The size of image should have length 1 or 2, ' \ - f'but got {len(pretrain_img_size)}' - - assert not (init_cfg and pretrained), \ - 'init_cfg and pretrained cannot be setting at the same time' - if isinstance(pretrained, str): - self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) - elif pretrained is None: - self.init_cfg = init_cfg - else: - raise TypeError('pretrained must be a str or None') - - self.embed_dims = embed_dims - - self.num_stages = num_stages - self.num_layers = num_layers - self.num_heads = num_heads - self.patch_sizes = patch_sizes - self.strides = strides - self.sr_ratios = sr_ratios - assert num_stages == len(num_layers) == len(num_heads) \ - == len(patch_sizes) == len(strides) == len(sr_ratios) - - self.out_indices = out_indices - assert max(out_indices) < self.num_stages - self.pretrained = pretrained - - # transformer encoder - dpr = [ - x.item() - for x in torch.linspace(0, drop_path_rate, sum(num_layers)) - ] # stochastic num_layer decay rule - - cur = 0 - self.layers = ModuleList() - for i, num_layer in enumerate(num_layers): - embed_dims_i = embed_dims * num_heads[i] - patch_embed = PatchEmbed( - in_channels=in_channels, - 
embed_dims=embed_dims_i, - kernel_size=patch_sizes[i], - stride=strides[i], - padding=paddings[i], - bias=True, - norm_cfg=norm_cfg) - - layers = ModuleList() - if use_abs_pos_embed: - pos_shape = pretrain_img_size // np.prod(patch_sizes[:i + 1]) - pos_embed = AbsolutePositionEmbedding( - pos_shape=pos_shape, - pos_dim=embed_dims_i, - drop_rate=drop_rate) - layers.append(pos_embed) - layers.extend([ - PVTEncoderLayer( - embed_dims=embed_dims_i, - num_heads=num_heads[i], - feedforward_channels=mlp_ratios[i] * embed_dims_i, - drop_rate=drop_rate, - attn_drop_rate=attn_drop_rate, - drop_path_rate=dpr[cur + idx], - qkv_bias=qkv_bias, - act_cfg=act_cfg, - norm_cfg=norm_cfg, - sr_ratio=sr_ratios[i], - use_conv_ffn=use_conv_ffn) for idx in range(num_layer) - ]) - in_channels = embed_dims_i - # The ret[0] of build_norm_layer is norm name. - if norm_after_stage: - norm = build_norm_layer(norm_cfg, embed_dims_i)[1] - else: - norm = nn.Identity() - self.layers.append(ModuleList([patch_embed, layers, norm])) - cur += num_layer - - def init_weights(self, pretrained=None): - if isinstance(pretrained, str): - self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) - - logger = get_root_logger() - if self.init_cfg is None: - logger.warn(f'No pre-trained weights for ' - f'{self.__class__.__name__}, ' - f'training start from scratch') - for m in self.modules(): - if isinstance(m, nn.Linear): - trunc_normal_init(m, std=.02, bias=0.) - elif isinstance(m, nn.LayerNorm): - constant_init(m, 1.0) - elif isinstance(m, nn.Conv2d): - fan_out = m.kernel_size[0] * m.kernel_size[ - 1] * m.out_channels - fan_out //= m.groups - normal_init(m, 0, math.sqrt(2.0 / fan_out)) - elif isinstance(m, AbsolutePositionEmbedding): - m.init_weights() - else: - assert 'checkpoint' in self.init_cfg, f'Only support ' \ - f'specify `Pretrained` in ' \ - f'`init_cfg` in ' \ - f'{self.__class__.__name__} ' - checkpoint = _load_checkpoint( - self.init_cfg['checkpoint'], logger=logger, map_location='cpu') - logger.warn(f'Load pre-trained model for ' - f'{self.__class__.__name__} from original repo') - if 'state_dict' in checkpoint: - state_dict = checkpoint['state_dict'] - elif 'model' in checkpoint: - state_dict = checkpoint['model'] - else: - state_dict = checkpoint - if self.convert_weights: - # Because pvt backbones are not supported by mmcls, - # so we need to convert pre-trained weights to match this - # implementation. 
- state_dict = pvt_convert(state_dict) - load_state_dict(self, state_dict, strict=False, logger=logger) - - def forward(self, x): - outs = [] - - for i, layer in enumerate(self.layers): - x, hw_shape = layer[0](x) - - for block in layer[1]: - x = block(x, hw_shape) - x = layer[2](x) - x = nlc_to_nchw(x, hw_shape) - if i in self.out_indices: - outs.append(x) - - return outs - - -@BACKBONES.register_module() -class PyramidVisionTransformerV2(PyramidVisionTransformer): - """Implementation of `PVTv2: Improved Baselines with Pyramid Vision - Transformer `_.""" - - def __init__(self, **kwargs): - super(PyramidVisionTransformerV2, self).__init__( - patch_sizes=[7, 3, 3, 3], - paddings=[3, 1, 1, 1], - use_abs_pos_embed=False, - norm_after_stage=True, - use_conv_ffn=True, - **kwargs) diff --git a/main/transformer_utils/mmpose/models/backbones/regnet.py b/main/transformer_utils/mmpose/models/backbones/regnet.py deleted file mode 100644 index 693417c2d61066e4e9a90989ad61700448028e58..0000000000000000000000000000000000000000 --- a/main/transformer_utils/mmpose/models/backbones/regnet.py +++ /dev/null @@ -1,317 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import copy - -import numpy as np -import torch.nn as nn -from mmcv.cnn import build_conv_layer, build_norm_layer - -from ..builder import BACKBONES -from .resnet import ResNet -from .resnext import Bottleneck - - -@BACKBONES.register_module() -class RegNet(ResNet): - """RegNet backbone. - - More details can be found in `paper `__ . - - Args: - arch (dict): The parameter of RegNets. - - w0 (int): initial width - - wa (float): slope of width - - wm (float): quantization parameter to quantize the width - - depth (int): depth of the backbone - - group_w (int): width of group - - bot_mul (float): bottleneck ratio, i.e. expansion of bottleneck. - strides (Sequence[int]): Strides of the first block of each stage. - base_channels (int): Base channels after stem layer. - in_channels (int): Number of input image channels. Default: 3. - dilations (Sequence[int]): Dilation of each stage. - out_indices (Sequence[int]): Output from which stages. - style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two - layer is the 3x3 conv layer, otherwise the stride-two layer is - the first 1x1 conv layer. Default: "pytorch". - frozen_stages (int): Stages to be frozen (all param fixed). -1 means - not freezing any parameters. Default: -1. - norm_cfg (dict): dictionary to construct and config norm layer. - Default: dict(type='BN', requires_grad=True). - norm_eval (bool): Whether to set norm layers to eval mode, namely, - freeze running stats (mean and var). Note: Effect on Batch Norm - and its variants only. Default: False. - with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. Default: False. - zero_init_residual (bool): whether to use zero init for last norm layer - in resblocks to let them behave as identity. Default: True. - - Example: - >>> from mmpose.models import RegNet - >>> import torch - >>> self = RegNet( - arch=dict( - w0=88, - wa=26.31, - wm=2.25, - group_w=48, - depth=25, - bot_mul=1.0), - out_indices=(0, 1, 2, 3)) - >>> self.eval() - >>> inputs = torch.rand(1, 3, 32, 32) - >>> level_outputs = self.forward(inputs) - >>> for level_out in level_outputs: - ... 
print(tuple(level_out.shape)) - (1, 96, 8, 8) - (1, 192, 4, 4) - (1, 432, 2, 2) - (1, 1008, 1, 1) - """ - arch_settings = { - 'regnetx_400mf': - dict(w0=24, wa=24.48, wm=2.54, group_w=16, depth=22, bot_mul=1.0), - 'regnetx_800mf': - dict(w0=56, wa=35.73, wm=2.28, group_w=16, depth=16, bot_mul=1.0), - 'regnetx_1.6gf': - dict(w0=80, wa=34.01, wm=2.25, group_w=24, depth=18, bot_mul=1.0), - 'regnetx_3.2gf': - dict(w0=88, wa=26.31, wm=2.25, group_w=48, depth=25, bot_mul=1.0), - 'regnetx_4.0gf': - dict(w0=96, wa=38.65, wm=2.43, group_w=40, depth=23, bot_mul=1.0), - 'regnetx_6.4gf': - dict(w0=184, wa=60.83, wm=2.07, group_w=56, depth=17, bot_mul=1.0), - 'regnetx_8.0gf': - dict(w0=80, wa=49.56, wm=2.88, group_w=120, depth=23, bot_mul=1.0), - 'regnetx_12gf': - dict(w0=168, wa=73.36, wm=2.37, group_w=112, depth=19, bot_mul=1.0), - } - - def __init__(self, - arch, - in_channels=3, - stem_channels=32, - base_channels=32, - strides=(2, 2, 2, 2), - dilations=(1, 1, 1, 1), - out_indices=(3, ), - style='pytorch', - deep_stem=False, - avg_down=False, - frozen_stages=-1, - conv_cfg=None, - norm_cfg=dict(type='BN', requires_grad=True), - norm_eval=False, - with_cp=False, - zero_init_residual=True): - # Protect mutable default arguments - norm_cfg = copy.deepcopy(norm_cfg) - super(ResNet, self).__init__() - - # Generate RegNet parameters first - if isinstance(arch, str): - assert arch in self.arch_settings, \ - f'"arch": "{arch}" is not one of the' \ - ' arch_settings' - arch = self.arch_settings[arch] - elif not isinstance(arch, dict): - raise TypeError('Expect "arch" to be either a string ' - f'or a dict, got {type(arch)}') - - widths, num_stages = self.generate_regnet( - arch['w0'], - arch['wa'], - arch['wm'], - arch['depth'], - ) - # Convert to per stage format - stage_widths, stage_blocks = self.get_stages_from_blocks(widths) - # Generate group widths and bot muls - group_widths = [arch['group_w'] for _ in range(num_stages)] - self.bottleneck_ratio = [arch['bot_mul'] for _ in range(num_stages)] - # Adjust the compatibility of stage_widths and group_widths - stage_widths, group_widths = self.adjust_width_group( - stage_widths, self.bottleneck_ratio, group_widths) - - # Group params by stage - self.stage_widths = stage_widths - self.group_widths = group_widths - self.depth = sum(stage_blocks) - self.stem_channels = stem_channels - self.base_channels = base_channels - self.num_stages = num_stages - assert 1 <= num_stages <= 4 - self.strides = strides - self.dilations = dilations - assert len(strides) == len(dilations) == num_stages - self.out_indices = out_indices - assert max(out_indices) < num_stages - self.style = style - self.deep_stem = deep_stem - if self.deep_stem: - raise NotImplementedError( - 'deep_stem has not been implemented for RegNet') - self.avg_down = avg_down - self.frozen_stages = frozen_stages - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg - self.with_cp = with_cp - self.norm_eval = norm_eval - self.zero_init_residual = zero_init_residual - self.stage_blocks = stage_blocks[:num_stages] - - self._make_stem_layer(in_channels, stem_channels) - - _in_channels = stem_channels - self.res_layers = [] - for i, num_blocks in enumerate(self.stage_blocks): - stride = self.strides[i] - dilation = self.dilations[i] - group_width = self.group_widths[i] - width = int(round(self.stage_widths[i] * self.bottleneck_ratio[i])) - stage_groups = width // group_width - - res_layer = self.make_res_layer( - block=Bottleneck, - num_blocks=num_blocks, - in_channels=_in_channels, - 
out_channels=self.stage_widths[i], - expansion=1, - stride=stride, - dilation=dilation, - style=self.style, - avg_down=self.avg_down, - with_cp=self.with_cp, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - base_channels=self.stage_widths[i], - groups=stage_groups, - width_per_group=group_width) - _in_channels = self.stage_widths[i] - layer_name = f'layer{i + 1}' - self.add_module(layer_name, res_layer) - self.res_layers.append(layer_name) - - self._freeze_stages() - - self.feat_dim = stage_widths[-1] - - def _make_stem_layer(self, in_channels, base_channels): - self.conv1 = build_conv_layer( - self.conv_cfg, - in_channels, - base_channels, - kernel_size=3, - stride=2, - padding=1, - bias=False) - self.norm1_name, norm1 = build_norm_layer( - self.norm_cfg, base_channels, postfix=1) - self.add_module(self.norm1_name, norm1) - self.relu = nn.ReLU(inplace=True) - - @staticmethod - def generate_regnet(initial_width, - width_slope, - width_parameter, - depth, - divisor=8): - """Generates per block width from RegNet parameters. - - Args: - initial_width ([int]): Initial width of the backbone - width_slope ([float]): Slope of the quantized linear function - width_parameter ([int]): Parameter used to quantize the width. - depth ([int]): Depth of the backbone. - divisor (int, optional): The divisor of channels. Defaults to 8. - - Returns: - list, int: return a list of widths of each stage and the number of - stages - """ - assert width_slope >= 0 - assert initial_width > 0 - assert width_parameter > 1 - assert initial_width % divisor == 0 - widths_cont = np.arange(depth) * width_slope + initial_width - ks = np.round( - np.log(widths_cont / initial_width) / np.log(width_parameter)) - widths = initial_width * np.power(width_parameter, ks) - widths = np.round(np.divide(widths, divisor)) * divisor - num_stages = len(np.unique(widths)) - widths, widths_cont = widths.astype(int).tolist(), widths_cont.tolist() - return widths, num_stages - - @staticmethod - def quantize_float(number, divisor): - """Converts a float to the closest non-zero int divisible by divisor. - - Args: - number (int): Original number to be quantized. - divisor (int): Divisor used to quantize the number. - - Returns: - int: quantized number that is divisible by divisor. - """ - return int(round(number / divisor) * divisor) - - def adjust_width_group(self, widths, bottleneck_ratio, groups): - """Adjusts the compatibility of widths and groups. - - Args: - widths (list[int]): Width of each stage. - bottleneck_ratio (list[float]): Bottleneck ratio of each stage. - groups (list[int]): Number of groups in each stage. - - Returns: - tuple(list): The adjusted widths and groups of each stage. - """ - bottleneck_width = [ - int(w * b) for w, b in zip(widths, bottleneck_ratio) - ] - groups = [min(g, w_bot) for g, w_bot in zip(groups, bottleneck_width)] - bottleneck_width = [ - self.quantize_float(w_bot, g) - for w_bot, g in zip(bottleneck_width, groups) - ] - widths = [ - int(w_bot / b) - for w_bot, b in zip(bottleneck_width, bottleneck_ratio) - ] - return widths, groups - - def get_stages_from_blocks(self, widths): - """Gets widths/stage_blocks of network at each stage. - - Args: - widths (list[int]): Width in each stage.
- - Returns: - tuple(list): width and depth of each stage - """ - width_diff = [ - width != width_prev - for width, width_prev in zip(widths + [0], [0] + widths) - ] - stage_widths = [ - width for width, diff in zip(widths, width_diff[:-1]) if diff - ] - stage_blocks = np.diff([ - depth for depth, diff in zip(range(len(width_diff)), width_diff) - if diff - ]).tolist() - return stage_widths, stage_blocks - - def forward(self, x): - x = self.conv1(x) - x = self.norm1(x) - x = self.relu(x) - - outs = [] - for i, layer_name in enumerate(self.res_layers): - res_layer = getattr(self, layer_name) - x = res_layer(x) - if i in self.out_indices: - outs.append(x) - - if len(outs) == 1: - return outs[0] - return tuple(outs) diff --git a/main/transformer_utils/mmpose/models/backbones/resnest.py b/main/transformer_utils/mmpose/models/backbones/resnest.py deleted file mode 100644 index 0a2d4081df1417155f0626646f5fe3d0dbfc2864..0000000000000000000000000000000000000000 --- a/main/transformer_utils/mmpose/models/backbones/resnest.py +++ /dev/null @@ -1,338 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch -import torch.nn as nn -import torch.nn.functional as F -import torch.utils.checkpoint as cp -from mmcv.cnn import build_conv_layer, build_norm_layer - -from ..builder import BACKBONES -from .resnet import Bottleneck as _Bottleneck -from .resnet import ResLayer, ResNetV1d - - -class RSoftmax(nn.Module): - """Radix Softmax module in ``SplitAttentionConv2d``. - - Args: - radix (int): Radix of input. - groups (int): Groups of input. - """ - - def __init__(self, radix, groups): - super().__init__() - self.radix = radix - self.groups = groups - - def forward(self, x): - batch = x.size(0) - if self.radix > 1: - x = x.view(batch, self.groups, self.radix, -1).transpose(1, 2) - x = F.softmax(x, dim=1) - x = x.reshape(batch, -1) - else: - x = torch.sigmoid(x) - return x - - -class SplitAttentionConv2d(nn.Module): - """Split-Attention Conv2d. - - Args: - in_channels (int): Same as nn.Conv2d. - out_channels (int): Same as nn.Conv2d. - kernel_size (int | tuple[int]): Same as nn.Conv2d. - stride (int | tuple[int]): Same as nn.Conv2d. - padding (int | tuple[int]): Same as nn.Conv2d. - dilation (int | tuple[int]): Same as nn.Conv2d. - groups (int): Same as nn.Conv2d. - radix (int): Radix of SpltAtConv2d. Default: 2 - reduction_factor (int): Reduction factor of SplitAttentionConv2d. - Default: 4. - conv_cfg (dict): Config dict for convolution layer. Default: None, - which means using conv2d. - norm_cfg (dict): Config dict for normalization layer. Default: None. 
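The split-attention operator documented above (and implemented just below) splits the channel dimension into `radix` groups and re-weights them with a radix-wise softmax over globally pooled statistics. A stripped-down sketch of just that re-weighting follows, assuming the `fc1`/`fc2` bottleneck is elided, so the weights here degenerate to uniform; the real module derives them from the pooled `gap` tensor through those two 1x1 convs.

```python
import torch
import torch.nn.functional as F

def toy_split_attention(x, radix=2):
    """x: (B, radix*C, H, W) -> (B, C, H, W), re-weighting the radix splits."""
    B, RC, H, W = x.shape
    C = RC // radix
    splits = x.view(B, radix, C, H, W)
    gap = F.adaptive_avg_pool2d(splits.sum(dim=1), 1)      # (B, C, 1, 1)
    # the real module maps gap through fc1/norm/relu/fc2 here (elided)
    atten = gap.repeat(1, radix, 1, 1).view(B, radix, C, 1, 1)
    atten = F.softmax(atten, dim=1)                        # radix softmax
    return (atten * splits).sum(dim=1)

out = toy_split_attention(torch.randn(2, 128, 8, 8), radix=2)
```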
- """ - - def __init__(self, - in_channels, - channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - groups=1, - radix=2, - reduction_factor=4, - conv_cfg=None, - norm_cfg=dict(type='BN')): - super().__init__() - inter_channels = max(in_channels * radix // reduction_factor, 32) - self.radix = radix - self.groups = groups - self.channels = channels - self.conv = build_conv_layer( - conv_cfg, - in_channels, - channels * radix, - kernel_size, - stride=stride, - padding=padding, - dilation=dilation, - groups=groups * radix, - bias=False) - self.norm0_name, norm0 = build_norm_layer( - norm_cfg, channels * radix, postfix=0) - self.add_module(self.norm0_name, norm0) - self.relu = nn.ReLU(inplace=True) - self.fc1 = build_conv_layer( - None, channels, inter_channels, 1, groups=self.groups) - self.norm1_name, norm1 = build_norm_layer( - norm_cfg, inter_channels, postfix=1) - self.add_module(self.norm1_name, norm1) - self.fc2 = build_conv_layer( - None, inter_channels, channels * radix, 1, groups=self.groups) - self.rsoftmax = RSoftmax(radix, groups) - - @property - def norm0(self): - return getattr(self, self.norm0_name) - - @property - def norm1(self): - return getattr(self, self.norm1_name) - - def forward(self, x): - x = self.conv(x) - x = self.norm0(x) - x = self.relu(x) - - batch, rchannel = x.shape[:2] - if self.radix > 1: - splits = x.view(batch, self.radix, -1, *x.shape[2:]) - gap = splits.sum(dim=1) - else: - gap = x - gap = F.adaptive_avg_pool2d(gap, 1) - gap = self.fc1(gap) - - gap = self.norm1(gap) - gap = self.relu(gap) - - atten = self.fc2(gap) - atten = self.rsoftmax(atten).view(batch, -1, 1, 1) - - if self.radix > 1: - attens = atten.view(batch, self.radix, -1, *atten.shape[2:]) - out = torch.sum(attens * splits, dim=1) - else: - out = atten * x - return out.contiguous() - - -class Bottleneck(_Bottleneck): - """Bottleneck block for ResNeSt. - - Args: - in_channels (int): Input channels of this block. - out_channels (int): Output channels of this block. - groups (int): Groups of conv2. - width_per_group (int): Width per group of conv2. 64x4d indicates - ``groups=64, width_per_group=4`` and 32x8d indicates - ``groups=32, width_per_group=8``. - radix (int): Radix of SpltAtConv2d. Default: 2 - reduction_factor (int): Reduction factor of SplitAttentionConv2d. - Default: 4. - avg_down_stride (bool): Whether to use average pool for stride in - Bottleneck. Default: True. - stride (int): stride of the block. Default: 1 - dilation (int): dilation of convolution. Default: 1 - downsample (nn.Module): downsample operation on identity branch. - Default: None - style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two - layer is the 3x3 conv layer, otherwise the stride-two layer is - the first 1x1 conv layer. - conv_cfg (dict): dictionary to construct and config conv layer. - Default: None - norm_cfg (dict): dictionary to construct and config norm layer. - Default: dict(type='BN') - with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. 
- """ - - def __init__(self, - in_channels, - out_channels, - groups=1, - width_per_group=4, - base_channels=64, - radix=2, - reduction_factor=4, - avg_down_stride=True, - **kwargs): - super().__init__(in_channels, out_channels, **kwargs) - - self.groups = groups - self.width_per_group = width_per_group - - # For ResNet bottleneck, middle channels are determined by expansion - # and out_channels, but for ResNeXt bottleneck, it is determined by - # groups and width_per_group and the stage it is located in. - if groups != 1: - assert self.mid_channels % base_channels == 0 - self.mid_channels = ( - groups * width_per_group * self.mid_channels // base_channels) - - self.avg_down_stride = avg_down_stride and self.conv2_stride > 1 - - self.norm1_name, norm1 = build_norm_layer( - self.norm_cfg, self.mid_channels, postfix=1) - self.norm3_name, norm3 = build_norm_layer( - self.norm_cfg, self.out_channels, postfix=3) - - self.conv1 = build_conv_layer( - self.conv_cfg, - self.in_channels, - self.mid_channels, - kernel_size=1, - stride=self.conv1_stride, - bias=False) - self.add_module(self.norm1_name, norm1) - self.conv2 = SplitAttentionConv2d( - self.mid_channels, - self.mid_channels, - kernel_size=3, - stride=1 if self.avg_down_stride else self.conv2_stride, - padding=self.dilation, - dilation=self.dilation, - groups=groups, - radix=radix, - reduction_factor=reduction_factor, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg) - delattr(self, self.norm2_name) - - if self.avg_down_stride: - self.avd_layer = nn.AvgPool2d(3, self.conv2_stride, padding=1) - - self.conv3 = build_conv_layer( - self.conv_cfg, - self.mid_channels, - self.out_channels, - kernel_size=1, - bias=False) - self.add_module(self.norm3_name, norm3) - - def forward(self, x): - - def _inner_forward(x): - identity = x - - out = self.conv1(x) - out = self.norm1(out) - out = self.relu(out) - - out = self.conv2(out) - - if self.avg_down_stride: - out = self.avd_layer(out) - - out = self.conv3(out) - out = self.norm3(out) - - if self.downsample is not None: - identity = self.downsample(x) - - out += identity - - return out - - if self.with_cp and x.requires_grad: - out = cp.checkpoint(_inner_forward, x) - else: - out = _inner_forward(x) - - out = self.relu(out) - - return out - - -@BACKBONES.register_module() -class ResNeSt(ResNetV1d): - """ResNeSt backbone. - - Please refer to the `paper `__ - for details. - - Args: - depth (int): Network depth, from {50, 101, 152, 200}. - groups (int): Groups of conv2 in Bottleneck. Default: 32. - width_per_group (int): Width per group of conv2 in Bottleneck. - Default: 4. - radix (int): Radix of SpltAtConv2d. Default: 2 - reduction_factor (int): Reduction factor of SplitAttentionConv2d. - Default: 4. - avg_down_stride (bool): Whether to use average pool for stride in - Bottleneck. Default: True. - in_channels (int): Number of input image channels. Default: 3. - stem_channels (int): Output channels of the stem layer. Default: 64. - num_stages (int): Stages of the network. Default: 4. - strides (Sequence[int]): Strides of the first block of each stage. - Default: ``(1, 2, 2, 2)``. - dilations (Sequence[int]): Dilation of each stage. - Default: ``(1, 1, 1, 1)``. - out_indices (Sequence[int]): Output from which stages. If only one - stage is specified, a single tensor (feature map) is returned, - otherwise multiple stages are specified, a tuple of tensors will - be returned. Default: ``(3, )``. - style (str): `pytorch` or `caffe`. 
If set to "pytorch", the stride-two - layer is the 3x3 conv layer, otherwise the stride-two layer is - the first 1x1 conv layer. - deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv. - Default: False. - avg_down (bool): Use AvgPool instead of stride conv when - downsampling in the bottleneck. Default: False. - frozen_stages (int): Stages to be frozen (stop grad and set eval mode). - -1 means not freezing any parameters. Default: -1. - conv_cfg (dict | None): The config dict for conv layers. Default: None. - norm_cfg (dict): The config dict for norm layers. - norm_eval (bool): Whether to set norm layers to eval mode, namely, - freeze running stats (mean and var). Note: Effect on Batch Norm - and its variants only. Default: False. - with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. Default: False. - zero_init_residual (bool): Whether to use zero init for last norm layer - in resblocks to let them behave as identity. Default: True. - """ - - arch_settings = { - 50: (Bottleneck, (3, 4, 6, 3)), - 101: (Bottleneck, (3, 4, 23, 3)), - 152: (Bottleneck, (3, 8, 36, 3)), - 200: (Bottleneck, (3, 24, 36, 3)), - 269: (Bottleneck, (3, 30, 48, 8)) - } - - def __init__(self, - depth, - groups=1, - width_per_group=4, - radix=2, - reduction_factor=4, - avg_down_stride=True, - **kwargs): - self.groups = groups - self.width_per_group = width_per_group - self.radix = radix - self.reduction_factor = reduction_factor - self.avg_down_stride = avg_down_stride - super().__init__(depth=depth, **kwargs) - - def make_res_layer(self, **kwargs): - return ResLayer( - groups=self.groups, - width_per_group=self.width_per_group, - base_channels=self.base_channels, - radix=self.radix, - reduction_factor=self.reduction_factor, - avg_down_stride=self.avg_down_stride, - **kwargs) diff --git a/main/transformer_utils/mmpose/models/backbones/resnet.py b/main/transformer_utils/mmpose/models/backbones/resnet.py index 649496a755020140d94eb32fbe79d1ff135c86ca..376796046ba1634e3acdb3d26a3f33a3d8528522 100644 --- a/main/transformer_utils/mmpose/models/backbones/resnet.py +++ b/main/transformer_utils/mmpose/models/backbones/resnet.py @@ -3,9 +3,9 @@ import copy import torch.nn as nn import torch.utils.checkpoint as cp -from mmcv.cnn import (ConvModule, build_conv_layer, build_norm_layer, - constant_init, kaiming_init) -from mmcv.utils.parrots_wrapper import _BatchNorm +from mmengine.model import constant_init, kaiming_init +from mmcv.cnn import (ConvModule, build_conv_layer, build_norm_layer) +from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm from ..builder import BACKBONES from .base_backbone import BaseBackbone diff --git a/main/transformer_utils/mmpose/models/backbones/resnext.py b/main/transformer_utils/mmpose/models/backbones/resnext.py deleted file mode 100644 index c10dc33f98ac3229c77bf306acf19950c295f904..0000000000000000000000000000000000000000 --- a/main/transformer_utils/mmpose/models/backbones/resnext.py +++ /dev/null @@ -1,162 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from mmcv.cnn import build_conv_layer, build_norm_layer - -from ..builder import BACKBONES -from .resnet import Bottleneck as _Bottleneck -from .resnet import ResLayer, ResNet - - -class Bottleneck(_Bottleneck): - """Bottleneck block for ResNeXt. - - Args: - in_channels (int): Input channels of this block. - out_channels (int): Output channels of this block. - groups (int): Groups of conv2. - width_per_group (int): Width per group of conv2. 
64x4d indicates - ``groups=64, width_per_group=4`` and 32x8d indicates - ``groups=32, width_per_group=8``. - stride (int): stride of the block. Default: 1 - dilation (int): dilation of convolution. Default: 1 - downsample (nn.Module): downsample operation on identity branch. - Default: None - style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two - layer is the 3x3 conv layer, otherwise the stride-two layer is - the first 1x1 conv layer. - conv_cfg (dict): dictionary to construct and config conv layer. - Default: None - norm_cfg (dict): dictionary to construct and config norm layer. - Default: dict(type='BN') - with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. - """ - - def __init__(self, - in_channels, - out_channels, - base_channels=64, - groups=32, - width_per_group=4, - **kwargs): - super().__init__(in_channels, out_channels, **kwargs) - self.groups = groups - self.width_per_group = width_per_group - - # For ResNet bottleneck, middle channels are determined by expansion - # and out_channels, but for ResNeXt bottleneck, it is determined by - # groups and width_per_group and the stage it is located in. - if groups != 1: - assert self.mid_channels % base_channels == 0 - self.mid_channels = ( - groups * width_per_group * self.mid_channels // base_channels) - - self.norm1_name, norm1 = build_norm_layer( - self.norm_cfg, self.mid_channels, postfix=1) - self.norm2_name, norm2 = build_norm_layer( - self.norm_cfg, self.mid_channels, postfix=2) - self.norm3_name, norm3 = build_norm_layer( - self.norm_cfg, self.out_channels, postfix=3) - - self.conv1 = build_conv_layer( - self.conv_cfg, - self.in_channels, - self.mid_channels, - kernel_size=1, - stride=self.conv1_stride, - bias=False) - self.add_module(self.norm1_name, norm1) - self.conv2 = build_conv_layer( - self.conv_cfg, - self.mid_channels, - self.mid_channels, - kernel_size=3, - stride=self.conv2_stride, - padding=self.dilation, - dilation=self.dilation, - groups=groups, - bias=False) - - self.add_module(self.norm2_name, norm2) - self.conv3 = build_conv_layer( - self.conv_cfg, - self.mid_channels, - self.out_channels, - kernel_size=1, - bias=False) - self.add_module(self.norm3_name, norm3) - - -@BACKBONES.register_module() -class ResNeXt(ResNet): - """ResNeXt backbone. - - Please refer to the `paper `__ for - details. - - Args: - depth (int): Network depth, from {50, 101, 152}. - groups (int): Groups of conv2 in Bottleneck. Default: 32. - width_per_group (int): Width per group of conv2 in Bottleneck. - Default: 4. - in_channels (int): Number of input image channels. Default: 3. - stem_channels (int): Output channels of the stem layer. Default: 64. - num_stages (int): Stages of the network. Default: 4. - strides (Sequence[int]): Strides of the first block of each stage. - Default: ``(1, 2, 2, 2)``. - dilations (Sequence[int]): Dilation of each stage. - Default: ``(1, 1, 1, 1)``. - out_indices (Sequence[int]): Output from which stages. If only one - stage is specified, a single tensor (feature map) is returned, - otherwise multiple stages are specified, a tuple of tensors will - be returned. Default: ``(3, )``. - style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two - layer is the 3x3 conv layer, otherwise the stride-two layer is - the first 1x1 conv layer. - deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv. - Default: False. - avg_down (bool): Use AvgPool instead of stride conv when - downsampling in the bottleneck. 
Default: False. - frozen_stages (int): Stages to be frozen (stop grad and set eval mode). - -1 means not freezing any parameters. Default: -1. - conv_cfg (dict | None): The config dict for conv layers. Default: None. - norm_cfg (dict): The config dict for norm layers. - norm_eval (bool): Whether to set norm layers to eval mode, namely, - freeze running stats (mean and var). Note: Effect on Batch Norm - and its variants only. Default: False. - with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. Default: False. - zero_init_residual (bool): Whether to use zero init for last norm layer - in resblocks to let them behave as identity. Default: True. - - Example: - >>> from mmpose.models import ResNeXt - >>> import torch - >>> self = ResNeXt(depth=50, out_indices=(0, 1, 2, 3)) - >>> self.eval() - >>> inputs = torch.rand(1, 3, 32, 32) - >>> level_outputs = self.forward(inputs) - >>> for level_out in level_outputs: - ... print(tuple(level_out.shape)) - (1, 256, 8, 8) - (1, 512, 4, 4) - (1, 1024, 2, 2) - (1, 2048, 1, 1) - """ - - arch_settings = { - 50: (Bottleneck, (3, 4, 6, 3)), - 101: (Bottleneck, (3, 4, 23, 3)), - 152: (Bottleneck, (3, 8, 36, 3)) - } - - def __init__(self, depth, groups=32, width_per_group=4, **kwargs): - self.groups = groups - self.width_per_group = width_per_group - super().__init__(depth, **kwargs) - - def make_res_layer(self, **kwargs): - return ResLayer( - groups=self.groups, - width_per_group=self.width_per_group, - base_channels=self.base_channels, - **kwargs) diff --git a/main/transformer_utils/mmpose/models/backbones/rsn.py b/main/transformer_utils/mmpose/models/backbones/rsn.py deleted file mode 100644 index 29038afe2a77dcb3d3b027b1549d478916a50727..0000000000000000000000000000000000000000 --- a/main/transformer_utils/mmpose/models/backbones/rsn.py +++ /dev/null @@ -1,616 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import copy as cp - -import torch -import torch.nn as nn -import torch.nn.functional as F -from mmcv.cnn import (ConvModule, MaxPool2d, constant_init, kaiming_init, - normal_init) - -from ..builder import BACKBONES -from .base_backbone import BaseBackbone - - -class RSB(nn.Module): - """Residual Steps block for RSN. Paper ref: Cai et al. "Learning Delicate - Local Representations for Multi-Person Pose Estimation" (ECCV 2020). - - Args: - in_channels (int): Input channels of this block. - out_channels (int): Output channels of this block. - num_steps (int): Numbers of steps in RSB - stride (int): stride of the block. Default: 1 - downsample (nn.Module): downsample operation on identity branch. - Default: None. - norm_cfg (dict): dictionary to construct and config norm layer. - Default: dict(type='BN') - expand_times (int): Times by which the in_channels are expanded. - Default:26. - res_top_channels (int): Number of channels of feature output by - ResNet_top. Default:64. 
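# A quick check of the RSB branch-width arithmetic from the definition
# below, using the documented defaults (in_channels=64, expand_times=26,
# res_top_channels=64):
branch_channels = 64 * 26 // 64
print(branch_channels)  # 26: each of the num_steps branches is 26 channels wide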
- """ - - expansion = 1 - - def __init__(self, - in_channels, - out_channels, - num_steps=4, - stride=1, - downsample=None, - with_cp=False, - norm_cfg=dict(type='BN'), - expand_times=26, - res_top_channels=64): - # Protect mutable default arguments - norm_cfg = cp.deepcopy(norm_cfg) - super().__init__() - assert num_steps > 1 - self.in_channels = in_channels - self.branch_channels = self.in_channels * expand_times - self.branch_channels //= res_top_channels - self.out_channels = out_channels - self.stride = stride - self.downsample = downsample - self.with_cp = with_cp - self.norm_cfg = norm_cfg - self.num_steps = num_steps - self.conv_bn_relu1 = ConvModule( - self.in_channels, - self.num_steps * self.branch_channels, - kernel_size=1, - stride=self.stride, - padding=0, - norm_cfg=self.norm_cfg, - inplace=False) - for i in range(self.num_steps): - for j in range(i + 1): - module_name = f'conv_bn_relu2_{i + 1}_{j + 1}' - self.add_module( - module_name, - ConvModule( - self.branch_channels, - self.branch_channels, - kernel_size=3, - stride=1, - padding=1, - norm_cfg=self.norm_cfg, - inplace=False)) - self.conv_bn3 = ConvModule( - self.num_steps * self.branch_channels, - self.out_channels * self.expansion, - kernel_size=1, - stride=1, - padding=0, - act_cfg=None, - norm_cfg=self.norm_cfg, - inplace=False) - self.relu = nn.ReLU(inplace=False) - - def forward(self, x): - """Forward function.""" - - identity = x - x = self.conv_bn_relu1(x) - spx = torch.split(x, self.branch_channels, 1) - outputs = list() - outs = list() - for i in range(self.num_steps): - outputs_i = list() - outputs.append(outputs_i) - for j in range(i + 1): - if j == 0: - inputs = spx[i] - else: - inputs = outputs[i][j - 1] - if i > j: - inputs = inputs + outputs[i - 1][j] - module_name = f'conv_bn_relu2_{i + 1}_{j + 1}' - module_i_j = getattr(self, module_name) - outputs[i].append(module_i_j(inputs)) - - outs.append(outputs[i][i]) - out = torch.cat(tuple(outs), 1) - out = self.conv_bn3(out) - - if self.downsample is not None: - identity = self.downsample(identity) - out = out + identity - - out = self.relu(out) - - return out - - -class Downsample_module(nn.Module): - """Downsample module for RSN. - - Args: - block (nn.Module): Downsample block. - num_blocks (list): Number of blocks in each downsample unit. - num_units (int): Numbers of downsample units. Default: 4 - has_skip (bool): Have skip connections from prior upsample - module or not. Default:False - num_steps (int): Number of steps in a block. Default:4 - norm_cfg (dict): dictionary to construct and config norm layer. - Default: dict(type='BN') - in_channels (int): Number of channels of the input feature to - downsample module. Default: 64 - expand_times (int): Times by which the in_channels are expanded. - Default:26. 
- """ - - def __init__(self, - block, - num_blocks, - num_steps=4, - num_units=4, - has_skip=False, - norm_cfg=dict(type='BN'), - in_channels=64, - expand_times=26): - # Protect mutable default arguments - norm_cfg = cp.deepcopy(norm_cfg) - super().__init__() - self.has_skip = has_skip - self.in_channels = in_channels - assert len(num_blocks) == num_units - self.num_blocks = num_blocks - self.num_units = num_units - self.num_steps = num_steps - self.norm_cfg = norm_cfg - self.layer1 = self._make_layer( - block, - in_channels, - num_blocks[0], - expand_times=expand_times, - res_top_channels=in_channels) - for i in range(1, num_units): - module_name = f'layer{i + 1}' - self.add_module( - module_name, - self._make_layer( - block, - in_channels * pow(2, i), - num_blocks[i], - stride=2, - expand_times=expand_times, - res_top_channels=in_channels)) - - def _make_layer(self, - block, - out_channels, - blocks, - stride=1, - expand_times=26, - res_top_channels=64): - downsample = None - if stride != 1 or self.in_channels != out_channels * block.expansion: - downsample = ConvModule( - self.in_channels, - out_channels * block.expansion, - kernel_size=1, - stride=stride, - padding=0, - norm_cfg=self.norm_cfg, - act_cfg=None, - inplace=True) - - units = list() - units.append( - block( - self.in_channels, - out_channels, - num_steps=self.num_steps, - stride=stride, - downsample=downsample, - norm_cfg=self.norm_cfg, - expand_times=expand_times, - res_top_channels=res_top_channels)) - self.in_channels = out_channels * block.expansion - for _ in range(1, blocks): - units.append( - block( - self.in_channels, - out_channels, - num_steps=self.num_steps, - expand_times=expand_times, - res_top_channels=res_top_channels)) - - return nn.Sequential(*units) - - def forward(self, x, skip1, skip2): - out = list() - for i in range(self.num_units): - module_name = f'layer{i + 1}' - module_i = getattr(self, module_name) - x = module_i(x) - if self.has_skip: - x = x + skip1[i] + skip2[i] - out.append(x) - out.reverse() - - return tuple(out) - - -class Upsample_unit(nn.Module): - """Upsample unit for upsample module. - - Args: - ind (int): Indicates whether to interpolate (>0) and whether to - generate feature map for the next hourglass-like module. - num_units (int): Number of units that form a upsample module. Along - with ind and gen_cross_conv, nm_units is used to decide whether - to generate feature map for the next hourglass-like module. - in_channels (int): Channel number of the skip-in feature maps from - the corresponding downsample unit. - unit_channels (int): Channel number in this unit. Default:256. - gen_skip: (bool): Whether or not to generate skips for the posterior - downsample module. Default:False - gen_cross_conv (bool): Whether to generate feature map for the next - hourglass-like module. Default:False - norm_cfg (dict): dictionary to construct and config norm layer. - Default: dict(type='BN') - out_channels (in): Number of channels of feature output by upsample - module. Must equal to in_channels of downsample module. 
Default:64 - """ - - def __init__(self, - ind, - num_units, - in_channels, - unit_channels=256, - gen_skip=False, - gen_cross_conv=False, - norm_cfg=dict(type='BN'), - out_channels=64): - # Protect mutable default arguments - norm_cfg = cp.deepcopy(norm_cfg) - super().__init__() - self.num_units = num_units - self.norm_cfg = norm_cfg - self.in_skip = ConvModule( - in_channels, - unit_channels, - kernel_size=1, - stride=1, - padding=0, - norm_cfg=self.norm_cfg, - act_cfg=None, - inplace=True) - self.relu = nn.ReLU(inplace=True) - - self.ind = ind - if self.ind > 0: - self.up_conv = ConvModule( - unit_channels, - unit_channels, - kernel_size=1, - stride=1, - padding=0, - norm_cfg=self.norm_cfg, - act_cfg=None, - inplace=True) - - self.gen_skip = gen_skip - if self.gen_skip: - self.out_skip1 = ConvModule( - in_channels, - in_channels, - kernel_size=1, - stride=1, - padding=0, - norm_cfg=self.norm_cfg, - inplace=True) - - self.out_skip2 = ConvModule( - unit_channels, - in_channels, - kernel_size=1, - stride=1, - padding=0, - norm_cfg=self.norm_cfg, - inplace=True) - - self.gen_cross_conv = gen_cross_conv - if self.ind == num_units - 1 and self.gen_cross_conv: - self.cross_conv = ConvModule( - unit_channels, - out_channels, - kernel_size=1, - stride=1, - padding=0, - norm_cfg=self.norm_cfg, - inplace=True) - - def forward(self, x, up_x): - out = self.in_skip(x) - - if self.ind > 0: - up_x = F.interpolate( - up_x, - size=(x.size(2), x.size(3)), - mode='bilinear', - align_corners=True) - up_x = self.up_conv(up_x) - out = out + up_x - out = self.relu(out) - - skip1 = None - skip2 = None - if self.gen_skip: - skip1 = self.out_skip1(x) - skip2 = self.out_skip2(out) - - cross_conv = None - if self.ind == self.num_units - 1 and self.gen_cross_conv: - cross_conv = self.cross_conv(out) - - return out, skip1, skip2, cross_conv - - -class Upsample_module(nn.Module): - """Upsample module for RSN. - - Args: - unit_channels (int): Channel number in the upsample units. - Default:256. - num_units (int): Numbers of upsample units. Default: 4 - gen_skip (bool): Whether to generate skip for posterior downsample - module or not. Default:False - gen_cross_conv (bool): Whether to generate feature map for the next - hourglass-like module. Default:False - norm_cfg (dict): dictionary to construct and config norm layer. - Default: dict(type='BN') - out_channels (int): Number of channels of feature output by upsample - module. Must equal to in_channels of downsample module. 
Default:64 - """ - - def __init__(self, - unit_channels=256, - num_units=4, - gen_skip=False, - gen_cross_conv=False, - norm_cfg=dict(type='BN'), - out_channels=64): - # Protect mutable default arguments - norm_cfg = cp.deepcopy(norm_cfg) - super().__init__() - self.in_channels = list() - for i in range(num_units): - self.in_channels.append(RSB.expansion * out_channels * pow(2, i)) - self.in_channels.reverse() - self.num_units = num_units - self.gen_skip = gen_skip - self.gen_cross_conv = gen_cross_conv - self.norm_cfg = norm_cfg - for i in range(num_units): - module_name = f'up{i + 1}' - self.add_module( - module_name, - Upsample_unit( - i, - self.num_units, - self.in_channels[i], - unit_channels, - self.gen_skip, - self.gen_cross_conv, - norm_cfg=self.norm_cfg, - out_channels=64)) - - def forward(self, x): - out = list() - skip1 = list() - skip2 = list() - cross_conv = None - for i in range(self.num_units): - module_i = getattr(self, f'up{i + 1}') - if i == 0: - outi, skip1_i, skip2_i, _ = module_i(x[i], None) - elif i == self.num_units - 1: - outi, skip1_i, skip2_i, cross_conv = module_i(x[i], out[i - 1]) - else: - outi, skip1_i, skip2_i, _ = module_i(x[i], out[i - 1]) - out.append(outi) - skip1.append(skip1_i) - skip2.append(skip2_i) - skip1.reverse() - skip2.reverse() - - return out, skip1, skip2, cross_conv - - -class Single_stage_RSN(nn.Module): - """Single_stage Residual Steps Network. - - Args: - unit_channels (int): Channel number in the upsample units. Default:256. - num_units (int): Numbers of downsample/upsample units. Default: 4 - gen_skip (bool): Whether to generate skip for posterior downsample - module or not. Default:False - gen_cross_conv (bool): Whether to generate feature map for the next - hourglass-like module. Default:False - has_skip (bool): Have skip connections from prior upsample - module or not. Default:False - num_steps (int): Number of steps in RSB. Default: 4 - num_blocks (list): Number of blocks in each downsample unit. - Default: [2, 2, 2, 2] Note: Make sure num_units==len(num_blocks) - norm_cfg (dict): dictionary to construct and config norm layer. - Default: dict(type='BN') - in_channels (int): Number of channels of the feature from ResNet_Top. - Default: 64. - expand_times (int): Times by which the in_channels are expanded in RSB. - Default:26. - """ - - def __init__(self, - has_skip=False, - gen_skip=False, - gen_cross_conv=False, - unit_channels=256, - num_units=4, - num_steps=4, - num_blocks=[2, 2, 2, 2], - norm_cfg=dict(type='BN'), - in_channels=64, - expand_times=26): - # Protect mutable default arguments - norm_cfg = cp.deepcopy(norm_cfg) - num_blocks = cp.deepcopy(num_blocks) - super().__init__() - assert len(num_blocks) == num_units - self.has_skip = has_skip - self.gen_skip = gen_skip - self.gen_cross_conv = gen_cross_conv - self.num_units = num_units - self.num_steps = num_steps - self.unit_channels = unit_channels - self.num_blocks = num_blocks - self.norm_cfg = norm_cfg - - self.downsample = Downsample_module(RSB, num_blocks, num_steps, - num_units, has_skip, norm_cfg, - in_channels, expand_times) - self.upsample = Upsample_module(unit_channels, num_units, gen_skip, - gen_cross_conv, norm_cfg, in_channels) - - def forward(self, x, skip1, skip2): - mid = self.downsample(x, skip1, skip2) - out, skip1, skip2, cross_conv = self.upsample(mid) - - return out, skip1, skip2, cross_conv - - -class ResNet_top(nn.Module): - """ResNet top for RSN. - - Args: - norm_cfg (dict): dictionary to construct and config norm layer. 
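# The Upsample_module above consumes the downsample pyramid deepest-first;
# with RSB.expansion == 1, out_channels == 64 and num_units == 4, the
# skip-in widths computed in its __init__ come out as:
widths = [1 * 64 * 2 ** i for i in range(4)]
widths.reverse()
print(widths)  # [512, 256, 128, 64]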
- Default: dict(type='BN') - channels (int): Number of channels of the feature output by ResNet_top. - """ - - def __init__(self, norm_cfg=dict(type='BN'), channels=64): - # Protect mutable default arguments - norm_cfg = cp.deepcopy(norm_cfg) - super().__init__() - self.top = nn.Sequential( - ConvModule( - 3, - channels, - kernel_size=7, - stride=2, - padding=3, - norm_cfg=norm_cfg, - inplace=True), MaxPool2d(kernel_size=3, stride=2, padding=1)) - - def forward(self, img): - return self.top(img) - - -@BACKBONES.register_module() -class RSN(BaseBackbone): - """Residual Steps Network backbone. Paper ref: Cai et al. "Learning - Delicate Local Representations for Multi-Person Pose Estimation" (ECCV - 2020). - - Args: - unit_channels (int): Number of Channels in an upsample unit. - Default: 256 - num_stages (int): Number of stages in a multi-stage RSN. Default: 4 - num_units (int): NUmber of downsample/upsample units in a single-stage - RSN. Default: 4 Note: Make sure num_units == len(self.num_blocks) - num_blocks (list): Number of RSBs (Residual Steps Block) in each - downsample unit. Default: [2, 2, 2, 2] - num_steps (int): Number of steps in a RSB. Default:4 - norm_cfg (dict): dictionary to construct and config norm layer. - Default: dict(type='BN') - res_top_channels (int): Number of channels of feature from ResNet_top. - Default: 64. - expand_times (int): Times by which the in_channels are expanded in RSB. - Default:26. - Example: - >>> from mmpose.models import RSN - >>> import torch - >>> self = RSN(num_stages=2,num_units=2,num_blocks=[2,2]) - >>> self.eval() - >>> inputs = torch.rand(1, 3, 511, 511) - >>> level_outputs = self.forward(inputs) - >>> for level_output in level_outputs: - ... for feature in level_output: - ... print(tuple(feature.shape)) - ... 
- (1, 256, 64, 64) - (1, 256, 128, 128) - (1, 256, 64, 64) - (1, 256, 128, 128) - """ - - def __init__(self, - unit_channels=256, - num_stages=4, - num_units=4, - num_blocks=[2, 2, 2, 2], - num_steps=4, - norm_cfg=dict(type='BN'), - res_top_channels=64, - expand_times=26): - # Protect mutable default arguments - norm_cfg = cp.deepcopy(norm_cfg) - num_blocks = cp.deepcopy(num_blocks) - super().__init__() - self.unit_channels = unit_channels - self.num_stages = num_stages - self.num_units = num_units - self.num_blocks = num_blocks - self.num_steps = num_steps - self.norm_cfg = norm_cfg - - assert self.num_stages > 0 - assert self.num_steps > 1 - assert self.num_units > 1 - assert self.num_units == len(self.num_blocks) - self.top = ResNet_top(norm_cfg=norm_cfg) - self.multi_stage_rsn = nn.ModuleList([]) - for i in range(self.num_stages): - if i == 0: - has_skip = False - else: - has_skip = True - if i != self.num_stages - 1: - gen_skip = True - gen_cross_conv = True - else: - gen_skip = False - gen_cross_conv = False - self.multi_stage_rsn.append( - Single_stage_RSN(has_skip, gen_skip, gen_cross_conv, - unit_channels, num_units, num_steps, - num_blocks, norm_cfg, res_top_channels, - expand_times)) - - def forward(self, x): - """Model forward function.""" - out_feats = [] - skip1 = None - skip2 = None - x = self.top(x) - for i in range(self.num_stages): - out, skip1, skip2, x = self.multi_stage_rsn[i](x, skip1, skip2) - out_feats.append(out) - - return out_feats - - def init_weights(self, pretrained=None): - """Initialize model weights.""" - for m in self.multi_stage_rsn.modules(): - if isinstance(m, nn.Conv2d): - kaiming_init(m) - elif isinstance(m, nn.BatchNorm2d): - constant_init(m, 1) - elif isinstance(m, nn.Linear): - normal_init(m, std=0.01) - - for m in self.top.modules(): - if isinstance(m, nn.Conv2d): - kaiming_init(m) diff --git a/main/transformer_utils/mmpose/models/backbones/scnet.py b/main/transformer_utils/mmpose/models/backbones/scnet.py deleted file mode 100644 index 3786c5731d685638cfa64a83e5d4a5e2eee545de..0000000000000000000000000000000000000000 --- a/main/transformer_utils/mmpose/models/backbones/scnet.py +++ /dev/null @@ -1,248 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import copy - -import torch -import torch.nn as nn -import torch.nn.functional as F -import torch.utils.checkpoint as cp -from mmcv.cnn import build_conv_layer, build_norm_layer - -from ..builder import BACKBONES -from .resnet import Bottleneck, ResNet - - -class SCConv(nn.Module): - """SCConv (Self-calibrated Convolution) - - Args: - in_channels (int): The input channels of the SCConv. - out_channels (int): The output channel of the SCConv. - stride (int): stride of SCConv. - pooling_r (int): size of pooling for scconv. - conv_cfg (dict): dictionary to construct and config conv layer. - Default: None - norm_cfg (dict): dictionary to construct and config norm layer. 
- Default: dict(type='BN') - """ - - def __init__(self, - in_channels, - out_channels, - stride, - pooling_r, - conv_cfg=None, - norm_cfg=dict(type='BN', momentum=0.1)): - # Protect mutable default arguments - norm_cfg = copy.deepcopy(norm_cfg) - super().__init__() - - assert in_channels == out_channels - - self.k2 = nn.Sequential( - nn.AvgPool2d(kernel_size=pooling_r, stride=pooling_r), - build_conv_layer( - conv_cfg, - in_channels, - in_channels, - kernel_size=3, - stride=1, - padding=1, - bias=False), - build_norm_layer(norm_cfg, in_channels)[1], - ) - self.k3 = nn.Sequential( - build_conv_layer( - conv_cfg, - in_channels, - in_channels, - kernel_size=3, - stride=1, - padding=1, - bias=False), - build_norm_layer(norm_cfg, in_channels)[1], - ) - self.k4 = nn.Sequential( - build_conv_layer( - conv_cfg, - in_channels, - in_channels, - kernel_size=3, - stride=stride, - padding=1, - bias=False), - build_norm_layer(norm_cfg, out_channels)[1], - nn.ReLU(inplace=True), - ) - - def forward(self, x): - """Forward function.""" - identity = x - - out = torch.sigmoid( - torch.add(identity, F.interpolate(self.k2(x), - identity.size()[2:]))) - out = torch.mul(self.k3(x), out) - out = self.k4(out) - - return out - - -class SCBottleneck(Bottleneck): - """SC(Self-calibrated) Bottleneck. - - Args: - in_channels (int): The input channels of the SCBottleneck block. - out_channels (int): The output channel of the SCBottleneck block. - """ - - pooling_r = 4 - - def __init__(self, in_channels, out_channels, **kwargs): - super().__init__(in_channels, out_channels, **kwargs) - self.mid_channels = out_channels // self.expansion // 2 - - self.norm1_name, norm1 = build_norm_layer( - self.norm_cfg, self.mid_channels, postfix=1) - self.norm2_name, norm2 = build_norm_layer( - self.norm_cfg, self.mid_channels, postfix=2) - self.norm3_name, norm3 = build_norm_layer( - self.norm_cfg, out_channels, postfix=3) - - self.conv1 = build_conv_layer( - self.conv_cfg, - in_channels, - self.mid_channels, - kernel_size=1, - stride=1, - bias=False) - self.add_module(self.norm1_name, norm1) - - self.k1 = nn.Sequential( - build_conv_layer( - self.conv_cfg, - self.mid_channels, - self.mid_channels, - kernel_size=3, - stride=self.stride, - padding=1, - bias=False), - build_norm_layer(self.norm_cfg, self.mid_channels)[1], - nn.ReLU(inplace=True)) - - self.conv2 = build_conv_layer( - self.conv_cfg, - in_channels, - self.mid_channels, - kernel_size=1, - stride=1, - bias=False) - self.add_module(self.norm2_name, norm2) - - self.scconv = SCConv(self.mid_channels, self.mid_channels, self.stride, - self.pooling_r, self.conv_cfg, self.norm_cfg) - - self.conv3 = build_conv_layer( - self.conv_cfg, - self.mid_channels * 2, - out_channels, - kernel_size=1, - stride=1, - bias=False) - self.add_module(self.norm3_name, norm3) - - def forward(self, x): - """Forward function.""" - - def _inner_forward(x): - identity = x - - out_a = self.conv1(x) - out_a = self.norm1(out_a) - out_a = self.relu(out_a) - - out_a = self.k1(out_a) - - out_b = self.conv2(x) - out_b = self.norm2(out_b) - out_b = self.relu(out_b) - - out_b = self.scconv(out_b) - - out = self.conv3(torch.cat([out_a, out_b], dim=1)) - out = self.norm3(out) - - if self.downsample is not None: - identity = self.downsample(x) - - out += identity - - return out - - if self.with_cp and x.requires_grad: - out = cp.checkpoint(_inner_forward, x) - else: - out = _inner_forward(x) - - out = self.relu(out) - - return out - - -@BACKBONES.register_module() -class SCNet(ResNet): - """SCNet backbone. 
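# The core of the SCConv defined above, written out functionally: a pooled,
# low-resolution context branch gates the full-resolution branch. A sketch
# under the module's own branch names (k2, k3; the trailing k4 conv and the
# norm layers are omitted for brevity):
import torch
import torch.nn as nn
import torch.nn.functional as F

def self_calibrate(x, k2, k3, pooling_r=4):
    ctx = F.interpolate(k2(F.avg_pool2d(x, pooling_r)), size=x.shape[2:])
    gate = torch.sigmoid(x + ctx)  # self-calibration weights in (0, 1)
    return k3(x) * gate            # modulate the full-resolution features

# e.g. with identity stand-ins for the conv branches:
# self_calibrate(torch.rand(1, 8, 16, 16), nn.Identity(), nn.Identity())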
- - Improving Convolutional Networks with Self-Calibrated Convolutions, - Jiang-Jiang Liu, Qibin Hou, Ming-Ming Cheng, Changhu Wang, Jiashi Feng, - IEEE CVPR, 2020. - http://mftp.mmcheng.net/Papers/20cvprSCNet.pdf - - Args: - depth (int): Depth of scnet, from {50, 101}. - in_channels (int): Number of input image channels. Normally 3. - base_channels (int): Number of base channels of hidden layer. - num_stages (int): SCNet stages, normally 4. - strides (Sequence[int]): Strides of the first block of each stage. - dilations (Sequence[int]): Dilation of each stage. - out_indices (Sequence[int]): Output from which stages. - style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two - layer is the 3x3 conv layer, otherwise the stride-two layer is - the first 1x1 conv layer. - deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv - avg_down (bool): Use AvgPool instead of stride conv when - downsampling in the bottleneck. - frozen_stages (int): Stages to be frozen (stop grad and set eval mode). - -1 means not freezing any parameters. - norm_cfg (dict): Dictionary to construct and config norm layer. - norm_eval (bool): Whether to set norm layers to eval mode, namely, - freeze running stats (mean and var). Note: Effect on Batch Norm - and its variants only. - with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. - zero_init_residual (bool): Whether to use zero init for last norm layer - in resblocks to let them behave as identity. - - Example: - >>> from mmpose.models import SCNet - >>> import torch - >>> self = SCNet(depth=50, out_indices=(0, 1, 2, 3)) - >>> self.eval() - >>> inputs = torch.rand(1, 3, 224, 224) - >>> level_outputs = self.forward(inputs) - >>> for level_out in level_outputs: - ... print(tuple(level_out.shape)) - (1, 256, 56, 56) - (1, 512, 28, 28) - (1, 1024, 14, 14) - (1, 2048, 7, 7) - """ - - arch_settings = { - 50: (SCBottleneck, [3, 4, 6, 3]), - 101: (SCBottleneck, [3, 4, 23, 3]) - } - - def __init__(self, depth, **kwargs): - if depth not in self.arch_settings: - raise KeyError(f'invalid depth {depth} for SCNet') - super().__init__(depth, **kwargs) diff --git a/main/transformer_utils/mmpose/models/backbones/seresnet.py b/main/transformer_utils/mmpose/models/backbones/seresnet.py deleted file mode 100644 index ac2d53b40a4593bce96d5c7c3bb4e06d38353d0b..0000000000000000000000000000000000000000 --- a/main/transformer_utils/mmpose/models/backbones/seresnet.py +++ /dev/null @@ -1,125 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch.utils.checkpoint as cp - -from ..builder import BACKBONES -from .resnet import Bottleneck, ResLayer, ResNet -from .utils.se_layer import SELayer - - -class SEBottleneck(Bottleneck): - """SEBottleneck block for SEResNet. - - Args: - in_channels (int): The input channels of the SEBottleneck block. - out_channels (int): The output channel of the SEBottleneck block. - se_ratio (int): Squeeze ratio in SELayer. 
Default: 16 - """ - - def __init__(self, in_channels, out_channels, se_ratio=16, **kwargs): - super().__init__(in_channels, out_channels, **kwargs) - self.se_layer = SELayer(out_channels, ratio=se_ratio) - - def forward(self, x): - - def _inner_forward(x): - identity = x - - out = self.conv1(x) - out = self.norm1(out) - out = self.relu(out) - - out = self.conv2(out) - out = self.norm2(out) - out = self.relu(out) - - out = self.conv3(out) - out = self.norm3(out) - - out = self.se_layer(out) - - if self.downsample is not None: - identity = self.downsample(x) - - out += identity - - return out - - if self.with_cp and x.requires_grad: - out = cp.checkpoint(_inner_forward, x) - else: - out = _inner_forward(x) - - out = self.relu(out) - - return out - - -@BACKBONES.register_module() -class SEResNet(ResNet): - """SEResNet backbone. - - Please refer to the `paper `__ for - details. - - Args: - depth (int): Network depth, from {50, 101, 152}. - se_ratio (int): Squeeze ratio in SELayer. Default: 16. - in_channels (int): Number of input image channels. Default: 3. - stem_channels (int): Output channels of the stem layer. Default: 64. - num_stages (int): Stages of the network. Default: 4. - strides (Sequence[int]): Strides of the first block of each stage. - Default: ``(1, 2, 2, 2)``. - dilations (Sequence[int]): Dilation of each stage. - Default: ``(1, 1, 1, 1)``. - out_indices (Sequence[int]): Output from which stages. If only one - stage is specified, a single tensor (feature map) is returned, - otherwise multiple stages are specified, a tuple of tensors will - be returned. Default: ``(3, )``. - style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two - layer is the 3x3 conv layer, otherwise the stride-two layer is - the first 1x1 conv layer. - deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv. - Default: False. - avg_down (bool): Use AvgPool instead of stride conv when - downsampling in the bottleneck. Default: False. - frozen_stages (int): Stages to be frozen (stop grad and set eval mode). - -1 means not freezing any parameters. Default: -1. - conv_cfg (dict | None): The config dict for conv layers. Default: None. - norm_cfg (dict): The config dict for norm layers. - norm_eval (bool): Whether to set norm layers to eval mode, namely, - freeze running stats (mean and var). Note: Effect on Batch Norm - and its variants only. Default: False. - with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. Default: False. - zero_init_residual (bool): Whether to use zero init for last norm layer - in resblocks to let them behave as identity. Default: True. - - Example: - >>> from mmpose.models import SEResNet - >>> import torch - >>> self = SEResNet(depth=50, out_indices=(0, 1, 2, 3)) - >>> self.eval() - >>> inputs = torch.rand(1, 3, 224, 224) - >>> level_outputs = self.forward(inputs) - >>> for level_out in level_outputs: - ... 
print(tuple(level_out.shape)) - (1, 256, 56, 56) - (1, 512, 28, 28) - (1, 1024, 14, 14) - (1, 2048, 7, 7) - """ - - arch_settings = { - 50: (SEBottleneck, (3, 4, 6, 3)), - 101: (SEBottleneck, (3, 4, 23, 3)), - 152: (SEBottleneck, (3, 8, 36, 3)) - } - - def __init__(self, depth, se_ratio=16, **kwargs): - if depth not in self.arch_settings: - raise KeyError(f'invalid depth {depth} for SEResNet') - self.se_ratio = se_ratio - super().__init__(depth, **kwargs) - - def make_res_layer(self, **kwargs): - return ResLayer(se_ratio=self.se_ratio, **kwargs) diff --git a/main/transformer_utils/mmpose/models/backbones/seresnext.py b/main/transformer_utils/mmpose/models/backbones/seresnext.py deleted file mode 100644 index c5c4e4ce03684f8a9bd0c6166969c01bace54bd2..0000000000000000000000000000000000000000 --- a/main/transformer_utils/mmpose/models/backbones/seresnext.py +++ /dev/null @@ -1,168 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from mmcv.cnn import build_conv_layer, build_norm_layer - -from ..builder import BACKBONES -from .resnet import ResLayer -from .seresnet import SEBottleneck as _SEBottleneck -from .seresnet import SEResNet - - -class SEBottleneck(_SEBottleneck): - """SEBottleneck block for SEResNeXt. - - Args: - in_channels (int): Input channels of this block. - out_channels (int): Output channels of this block. - base_channels (int): Middle channels of the first stage. Default: 64. - groups (int): Groups of conv2. - width_per_group (int): Width per group of conv2. 64x4d indicates - ``groups=64, width_per_group=4`` and 32x8d indicates - ``groups=32, width_per_group=8``. - stride (int): stride of the block. Default: 1 - dilation (int): dilation of convolution. Default: 1 - downsample (nn.Module): downsample operation on identity branch. - Default: None - se_ratio (int): Squeeze ratio in SELayer. Default: 16 - style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two - layer is the 3x3 conv layer, otherwise the stride-two layer is - the first 1x1 conv layer. - conv_cfg (dict): dictionary to construct and config conv layer. - Default: None - norm_cfg (dict): dictionary to construct and config norm layer. - Default: dict(type='BN') - with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. - """ - - def __init__(self, - in_channels, - out_channels, - base_channels=64, - groups=32, - width_per_group=4, - se_ratio=16, - **kwargs): - super().__init__(in_channels, out_channels, se_ratio, **kwargs) - self.groups = groups - self.width_per_group = width_per_group - - # We follow the same rational of ResNext to compute mid_channels. - # For SEResNet bottleneck, middle channels are determined by expansion - # and out_channels, but for SEResNeXt bottleneck, it is determined by - # groups and width_per_group and the stage it is located in. 
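# SELayer itself is imported from .utils.se_layer and is not part of this
# diff. A minimal squeeze-and-excitation gate of the conventional form
# (illustrative; the mmpose version is more configurable) looks like:
import torch
import torch.nn as nn

class SimpleSELayer(nn.Module):
    def __init__(self, channels, ratio=16):
        super().__init__()
        self.pool = nn.AdaptiveAvgPool2d(1)  # squeeze: global spatial context
        self.fc = nn.Sequential(
            nn.Conv2d(channels, channels // ratio, 1),
            nn.ReLU(inplace=True),
            nn.Conv2d(channels // ratio, channels, 1),
            nn.Sigmoid())  # excitation: per-channel gates in (0, 1)

    def forward(self, x):
        return x * self.fc(self.pool(x))

# SimpleSELayer(256)(torch.rand(1, 256, 14, 14)) keeps the input shape.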
- if groups != 1: - assert self.mid_channels % base_channels == 0 - self.mid_channels = ( - groups * width_per_group * self.mid_channels // base_channels) - - self.norm1_name, norm1 = build_norm_layer( - self.norm_cfg, self.mid_channels, postfix=1) - self.norm2_name, norm2 = build_norm_layer( - self.norm_cfg, self.mid_channels, postfix=2) - self.norm3_name, norm3 = build_norm_layer( - self.norm_cfg, self.out_channels, postfix=3) - - self.conv1 = build_conv_layer( - self.conv_cfg, - self.in_channels, - self.mid_channels, - kernel_size=1, - stride=self.conv1_stride, - bias=False) - self.add_module(self.norm1_name, norm1) - self.conv2 = build_conv_layer( - self.conv_cfg, - self.mid_channels, - self.mid_channels, - kernel_size=3, - stride=self.conv2_stride, - padding=self.dilation, - dilation=self.dilation, - groups=groups, - bias=False) - - self.add_module(self.norm2_name, norm2) - self.conv3 = build_conv_layer( - self.conv_cfg, - self.mid_channels, - self.out_channels, - kernel_size=1, - bias=False) - self.add_module(self.norm3_name, norm3) - - -@BACKBONES.register_module() -class SEResNeXt(SEResNet): - """SEResNeXt backbone. - - Please refer to the `paper `__ for - details. - - Args: - depth (int): Network depth, from {50, 101, 152}. - groups (int): Groups of conv2 in Bottleneck. Default: 32. - width_per_group (int): Width per group of conv2 in Bottleneck. - Default: 4. - se_ratio (int): Squeeze ratio in SELayer. Default: 16. - in_channels (int): Number of input image channels. Default: 3. - stem_channels (int): Output channels of the stem layer. Default: 64. - num_stages (int): Stages of the network. Default: 4. - strides (Sequence[int]): Strides of the first block of each stage. - Default: ``(1, 2, 2, 2)``. - dilations (Sequence[int]): Dilation of each stage. - Default: ``(1, 1, 1, 1)``. - out_indices (Sequence[int]): Output from which stages. If only one - stage is specified, a single tensor (feature map) is returned, - otherwise multiple stages are specified, a tuple of tensors will - be returned. Default: ``(3, )``. - style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two - layer is the 3x3 conv layer, otherwise the stride-two layer is - the first 1x1 conv layer. - deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv. - Default: False. - avg_down (bool): Use AvgPool instead of stride conv when - downsampling in the bottleneck. Default: False. - frozen_stages (int): Stages to be frozen (stop grad and set eval mode). - -1 means not freezing any parameters. Default: -1. - conv_cfg (dict | None): The config dict for conv layers. Default: None. - norm_cfg (dict): The config dict for norm layers. - norm_eval (bool): Whether to set norm layers to eval mode, namely, - freeze running stats (mean and var). Note: Effect on Batch Norm - and its variants only. Default: False. - with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. Default: False. - zero_init_residual (bool): Whether to use zero init for last norm layer - in resblocks to let them behave as identity. Default: True. - - Example: - >>> from mmpose.models import SEResNeXt - >>> import torch - >>> self = SEResNet(depth=50, out_indices=(0, 1, 2, 3)) - >>> self.eval() - >>> inputs = torch.rand(1, 3, 224, 224) - >>> level_outputs = self.forward(inputs) - >>> for level_out in level_outputs: - ... 
print(tuple(level_out.shape)) - (1, 256, 56, 56) - (1, 512, 28, 28) - (1, 1024, 14, 14) - (1, 2048, 7, 7) - """ - - arch_settings = { - 50: (SEBottleneck, (3, 4, 6, 3)), - 101: (SEBottleneck, (3, 4, 23, 3)), - 152: (SEBottleneck, (3, 8, 36, 3)) - } - - def __init__(self, depth, groups=32, width_per_group=4, **kwargs): - self.groups = groups - self.width_per_group = width_per_group - super().__init__(depth, **kwargs) - - def make_res_layer(self, **kwargs): - return ResLayer( - groups=self.groups, - width_per_group=self.width_per_group, - base_channels=self.base_channels, - **kwargs) diff --git a/main/transformer_utils/mmpose/models/backbones/shufflenet_v1.py b/main/transformer_utils/mmpose/models/backbones/shufflenet_v1.py deleted file mode 100644 index 9f98cbd2132250ec13adcce6e642c966b0dbd7cc..0000000000000000000000000000000000000000 --- a/main/transformer_utils/mmpose/models/backbones/shufflenet_v1.py +++ /dev/null @@ -1,329 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import copy -import logging - -import torch -import torch.nn as nn -import torch.utils.checkpoint as cp -from mmcv.cnn import (ConvModule, build_activation_layer, constant_init, - normal_init) -from torch.nn.modules.batchnorm import _BatchNorm - -from ..builder import BACKBONES -from .base_backbone import BaseBackbone -from .utils import channel_shuffle, load_checkpoint, make_divisible - - -class ShuffleUnit(nn.Module): - """ShuffleUnit block. - - ShuffleNet unit with pointwise group convolution (GConv) and channel - shuffle. - - Args: - in_channels (int): The input channels of the ShuffleUnit. - out_channels (int): The output channels of the ShuffleUnit. - groups (int, optional): The number of groups to be used in grouped 1x1 - convolutions in each ShuffleUnit. Default: 3 - first_block (bool, optional): Whether it is the first ShuffleUnit of a - sequential ShuffleUnits. Default: True, which means not using the - grouped 1x1 convolution. - combine (str, optional): The ways to combine the input and output - branches. Default: 'add'. - conv_cfg (dict): Config dict for convolution layer. Default: None, - which means using conv2d. - norm_cfg (dict): Config dict for normalization layer. - Default: dict(type='BN'). - act_cfg (dict): Config dict for activation layer. - Default: dict(type='ReLU'). - with_cp (bool, optional): Use checkpoint or not. Using checkpoint - will save some memory while slowing down the training speed. - Default: False. - - Returns: - Tensor: The output tensor. - """ - - def __init__(self, - in_channels, - out_channels, - groups=3, - first_block=True, - combine='add', - conv_cfg=None, - norm_cfg=dict(type='BN'), - act_cfg=dict(type='ReLU'), - with_cp=False): - # Protect mutable default arguments - norm_cfg = copy.deepcopy(norm_cfg) - act_cfg = copy.deepcopy(act_cfg) - super().__init__() - self.in_channels = in_channels - self.out_channels = out_channels - self.first_block = first_block - self.combine = combine - self.groups = groups - self.bottleneck_channels = self.out_channels // 4 - self.with_cp = with_cp - - if self.combine == 'add': - self.depthwise_stride = 1 - self._combine_func = self._add - assert in_channels == out_channels, ( - 'in_channels must be equal to out_channels when combine ' - 'is add') - elif self.combine == 'concat': - self.depthwise_stride = 2 - self._combine_func = self._concat - self.out_channels -= self.in_channels - self.avgpool = nn.AvgPool2d(kernel_size=3, stride=2, padding=1) - else: - raise ValueError(f'Cannot combine tensors with {self.combine}. 
' - 'Only "add" and "concat" are supported') - - self.first_1x1_groups = 1 if first_block else self.groups - self.g_conv_1x1_compress = ConvModule( - in_channels=self.in_channels, - out_channels=self.bottleneck_channels, - kernel_size=1, - groups=self.first_1x1_groups, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg) - - self.depthwise_conv3x3_bn = ConvModule( - in_channels=self.bottleneck_channels, - out_channels=self.bottleneck_channels, - kernel_size=3, - stride=self.depthwise_stride, - padding=1, - groups=self.bottleneck_channels, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=None) - - self.g_conv_1x1_expand = ConvModule( - in_channels=self.bottleneck_channels, - out_channels=self.out_channels, - kernel_size=1, - groups=self.groups, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=None) - - self.act = build_activation_layer(act_cfg) - - @staticmethod - def _add(x, out): - # residual connection - return x + out - - @staticmethod - def _concat(x, out): - # concatenate along channel axis - return torch.cat((x, out), 1) - - def forward(self, x): - - def _inner_forward(x): - residual = x - - out = self.g_conv_1x1_compress(x) - out = self.depthwise_conv3x3_bn(out) - - if self.groups > 1: - out = channel_shuffle(out, self.groups) - - out = self.g_conv_1x1_expand(out) - - if self.combine == 'concat': - residual = self.avgpool(residual) - out = self.act(out) - out = self._combine_func(residual, out) - else: - out = self._combine_func(residual, out) - out = self.act(out) - return out - - if self.with_cp and x.requires_grad: - out = cp.checkpoint(_inner_forward, x) - else: - out = _inner_forward(x) - - return out - - -@BACKBONES.register_module() -class ShuffleNetV1(BaseBackbone): - """ShuffleNetV1 backbone. - - Args: - groups (int, optional): The number of groups to be used in grouped 1x1 - convolutions in each ShuffleUnit. Default: 3. - widen_factor (float, optional): Width multiplier - adjusts the number - of channels in each layer by this amount. Default: 1.0. - out_indices (Sequence[int]): Output from which stages. - Default: (2, ) - frozen_stages (int): Stages to be frozen (all param fixed). - Default: -1, which means not freezing any parameters. - conv_cfg (dict): Config dict for convolution layer. Default: None, - which means using conv2d. - norm_cfg (dict): Config dict for normalization layer. - Default: dict(type='BN'). - act_cfg (dict): Config dict for activation layer. - Default: dict(type='ReLU'). - norm_eval (bool): Whether to set norm layers to eval mode, namely, - freeze running stats (mean and var). Note: Effect on Batch Norm - and its variants only. Default: False. - with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. Default: False. - """ - - def __init__(self, - groups=3, - widen_factor=1.0, - out_indices=(2, ), - frozen_stages=-1, - conv_cfg=None, - norm_cfg=dict(type='BN'), - act_cfg=dict(type='ReLU'), - norm_eval=False, - with_cp=False): - # Protect mutable default arguments - norm_cfg = copy.deepcopy(norm_cfg) - act_cfg = copy.deepcopy(act_cfg) - super().__init__() - self.stage_blocks = [4, 8, 4] - self.groups = groups - - for index in out_indices: - if index not in range(0, 3): - raise ValueError('the item in out_indices must in ' - f'range(0, 3). But received {index}') - - if frozen_stages not in range(-1, 3): - raise ValueError('frozen_stages must be in range(-1, 3). 
' - f'But received {frozen_stages}') - self.out_indices = out_indices - self.frozen_stages = frozen_stages - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg - self.act_cfg = act_cfg - self.norm_eval = norm_eval - self.with_cp = with_cp - - if groups == 1: - channels = (144, 288, 576) - elif groups == 2: - channels = (200, 400, 800) - elif groups == 3: - channels = (240, 480, 960) - elif groups == 4: - channels = (272, 544, 1088) - elif groups == 8: - channels = (384, 768, 1536) - else: - raise ValueError(f'{groups} groups is not supported for 1x1 ' - 'Grouped Convolutions') - - channels = [make_divisible(ch * widen_factor, 8) for ch in channels] - - self.in_channels = int(24 * widen_factor) - - self.conv1 = ConvModule( - in_channels=3, - out_channels=self.in_channels, - kernel_size=3, - stride=2, - padding=1, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg) - self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) - - self.layers = nn.ModuleList() - for i, num_blocks in enumerate(self.stage_blocks): - first_block = (i == 0) - layer = self.make_layer(channels[i], num_blocks, first_block) - self.layers.append(layer) - - def _freeze_stages(self): - if self.frozen_stages >= 0: - for param in self.conv1.parameters(): - param.requires_grad = False - for i in range(self.frozen_stages): - layer = self.layers[i] - layer.eval() - for param in layer.parameters(): - param.requires_grad = False - - def init_weights(self, pretrained=None): - if isinstance(pretrained, str): - logger = logging.getLogger() - load_checkpoint(self, pretrained, strict=False, logger=logger) - elif pretrained is None: - for name, m in self.named_modules(): - if isinstance(m, nn.Conv2d): - if 'conv1' in name: - normal_init(m, mean=0, std=0.01) - else: - normal_init(m, mean=0, std=1.0 / m.weight.shape[1]) - elif isinstance(m, (_BatchNorm, nn.GroupNorm)): - constant_init(m, val=1, bias=0.0001) - if isinstance(m, _BatchNorm): - if m.running_mean is not None: - nn.init.constant_(m.running_mean, 0) - else: - raise TypeError('pretrained must be a str or None. But received ' - f'{type(pretrained)}') - - def make_layer(self, out_channels, num_blocks, first_block=False): - """Stack ShuffleUnit blocks to make a layer. - - Args: - out_channels (int): out_channels of the block. - num_blocks (int): Number of blocks. - first_block (bool, optional): Whether is the first ShuffleUnit of a - sequential ShuffleUnits. Default: False, which means using - the grouped 1x1 convolution. 
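# channel_shuffle is imported from .utils and not shown in this diff. A
# reference version of the interleave it performs (assumed to match the
# conventional ShuffleNet formulation):
import torch

def channel_shuffle_ref(x, groups):
    b, c, h, w = x.size()
    assert c % groups == 0, 'channels must divide evenly into groups'
    x = x.view(b, groups, c // groups, h, w)
    x = x.transpose(1, 2).contiguous()  # interleave across the groups
    return x.view(b, c, h, w)

# With groups=2, channel order [0, 1, 2, 3, 4, 5] becomes [0, 3, 1, 4, 2, 5].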
- """ - layers = [] - for i in range(num_blocks): - first_block = first_block if i == 0 else False - combine_mode = 'concat' if i == 0 else 'add' - layers.append( - ShuffleUnit( - self.in_channels, - out_channels, - groups=self.groups, - first_block=first_block, - combine=combine_mode, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg, - with_cp=self.with_cp)) - self.in_channels = out_channels - - return nn.Sequential(*layers) - - def forward(self, x): - x = self.conv1(x) - x = self.maxpool(x) - - outs = [] - for i, layer in enumerate(self.layers): - x = layer(x) - if i in self.out_indices: - outs.append(x) - - if len(outs) == 1: - return outs[0] - return tuple(outs) - - def train(self, mode=True): - super().train(mode) - self._freeze_stages() - if mode and self.norm_eval: - for m in self.modules(): - if isinstance(m, _BatchNorm): - m.eval() diff --git a/main/transformer_utils/mmpose/models/backbones/shufflenet_v2.py b/main/transformer_utils/mmpose/models/backbones/shufflenet_v2.py deleted file mode 100644 index e93533367afe4efa01fa67d14cafcca006c990e8..0000000000000000000000000000000000000000 --- a/main/transformer_utils/mmpose/models/backbones/shufflenet_v2.py +++ /dev/null @@ -1,302 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import copy -import logging - -import torch -import torch.nn as nn -import torch.utils.checkpoint as cp -from mmcv.cnn import ConvModule, constant_init, normal_init -from torch.nn.modules.batchnorm import _BatchNorm - -from ..builder import BACKBONES -from .base_backbone import BaseBackbone -from .utils import channel_shuffle, load_checkpoint - - -class InvertedResidual(nn.Module): - """InvertedResidual block for ShuffleNetV2 backbone. - - Args: - in_channels (int): The input channels of the block. - out_channels (int): The output channels of the block. - stride (int): Stride of the 3x3 convolution layer. Default: 1 - conv_cfg (dict): Config dict for convolution layer. - Default: None, which means using conv2d. - norm_cfg (dict): Config dict for normalization layer. - Default: dict(type='BN'). - act_cfg (dict): Config dict for activation layer. - Default: dict(type='ReLU'). - with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. Default: False. 
- """ - - def __init__(self, - in_channels, - out_channels, - stride=1, - conv_cfg=None, - norm_cfg=dict(type='BN'), - act_cfg=dict(type='ReLU'), - with_cp=False): - # Protect mutable default arguments - norm_cfg = copy.deepcopy(norm_cfg) - act_cfg = copy.deepcopy(act_cfg) - super().__init__() - self.stride = stride - self.with_cp = with_cp - - branch_features = out_channels // 2 - if self.stride == 1: - assert in_channels == branch_features * 2, ( - f'in_channels ({in_channels}) should equal to ' - f'branch_features * 2 ({branch_features * 2}) ' - 'when stride is 1') - - if in_channels != branch_features * 2: - assert self.stride != 1, ( - f'stride ({self.stride}) should not equal 1 when ' - f'in_channels != branch_features * 2') - - if self.stride > 1: - self.branch1 = nn.Sequential( - ConvModule( - in_channels, - in_channels, - kernel_size=3, - stride=self.stride, - padding=1, - groups=in_channels, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=None), - ConvModule( - in_channels, - branch_features, - kernel_size=1, - stride=1, - padding=0, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg), - ) - - self.branch2 = nn.Sequential( - ConvModule( - in_channels if (self.stride > 1) else branch_features, - branch_features, - kernel_size=1, - stride=1, - padding=0, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg), - ConvModule( - branch_features, - branch_features, - kernel_size=3, - stride=self.stride, - padding=1, - groups=branch_features, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=None), - ConvModule( - branch_features, - branch_features, - kernel_size=1, - stride=1, - padding=0, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg)) - - def forward(self, x): - - def _inner_forward(x): - if self.stride > 1: - out = torch.cat((self.branch1(x), self.branch2(x)), dim=1) - else: - x1, x2 = x.chunk(2, dim=1) - out = torch.cat((x1, self.branch2(x2)), dim=1) - - out = channel_shuffle(out, 2) - - return out - - if self.with_cp and x.requires_grad: - out = cp.checkpoint(_inner_forward, x) - else: - out = _inner_forward(x) - - return out - - -@BACKBONES.register_module() -class ShuffleNetV2(BaseBackbone): - """ShuffleNetV2 backbone. - - Args: - widen_factor (float): Width multiplier - adjusts the number of - channels in each layer by this amount. Default: 1.0. - out_indices (Sequence[int]): Output from which stages. - Default: (0, 1, 2, 3). - frozen_stages (int): Stages to be frozen (all param fixed). - Default: -1, which means not freezing any parameters. - conv_cfg (dict): Config dict for convolution layer. - Default: None, which means using conv2d. - norm_cfg (dict): Config dict for normalization layer. - Default: dict(type='BN'). - act_cfg (dict): Config dict for activation layer. - Default: dict(type='ReLU'). - norm_eval (bool): Whether to set norm layers to eval mode, namely, - freeze running stats (mean and var). Note: Effect on Batch Norm - and its variants only. Default: False. - with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. Default: False. 
- """ - - def __init__(self, - widen_factor=1.0, - out_indices=(3, ), - frozen_stages=-1, - conv_cfg=None, - norm_cfg=dict(type='BN'), - act_cfg=dict(type='ReLU'), - norm_eval=False, - with_cp=False): - # Protect mutable default arguments - norm_cfg = copy.deepcopy(norm_cfg) - act_cfg = copy.deepcopy(act_cfg) - super().__init__() - self.stage_blocks = [4, 8, 4] - for index in out_indices: - if index not in range(0, 4): - raise ValueError('the item in out_indices must in ' - f'range(0, 4). But received {index}') - - if frozen_stages not in range(-1, 4): - raise ValueError('frozen_stages must be in range(-1, 4). ' - f'But received {frozen_stages}') - self.out_indices = out_indices - self.frozen_stages = frozen_stages - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg - self.act_cfg = act_cfg - self.norm_eval = norm_eval - self.with_cp = with_cp - - if widen_factor == 0.5: - channels = [48, 96, 192, 1024] - elif widen_factor == 1.0: - channels = [116, 232, 464, 1024] - elif widen_factor == 1.5: - channels = [176, 352, 704, 1024] - elif widen_factor == 2.0: - channels = [244, 488, 976, 2048] - else: - raise ValueError('widen_factor must be in [0.5, 1.0, 1.5, 2.0]. ' - f'But received {widen_factor}') - - self.in_channels = 24 - self.conv1 = ConvModule( - in_channels=3, - out_channels=self.in_channels, - kernel_size=3, - stride=2, - padding=1, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg) - - self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) - - self.layers = nn.ModuleList() - for i, num_blocks in enumerate(self.stage_blocks): - layer = self._make_layer(channels[i], num_blocks) - self.layers.append(layer) - - output_channels = channels[-1] - self.layers.append( - ConvModule( - in_channels=self.in_channels, - out_channels=output_channels, - kernel_size=1, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg)) - - def _make_layer(self, out_channels, num_blocks): - """Stack blocks to make a layer. - - Args: - out_channels (int): out_channels of the block. - num_blocks (int): number of blocks. - """ - layers = [] - for i in range(num_blocks): - stride = 2 if i == 0 else 1 - layers.append( - InvertedResidual( - in_channels=self.in_channels, - out_channels=out_channels, - stride=stride, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg, - with_cp=self.with_cp)) - self.in_channels = out_channels - - return nn.Sequential(*layers) - - def _freeze_stages(self): - if self.frozen_stages >= 0: - for param in self.conv1.parameters(): - param.requires_grad = False - - for i in range(self.frozen_stages): - m = self.layers[i] - m.eval() - for param in m.parameters(): - param.requires_grad = False - - def init_weights(self, pretrained=None): - if isinstance(pretrained, str): - logger = logging.getLogger() - load_checkpoint(self, pretrained, strict=False, logger=logger) - elif pretrained is None: - for name, m in self.named_modules(): - if isinstance(m, nn.Conv2d): - if 'conv1' in name: - normal_init(m, mean=0, std=0.01) - else: - normal_init(m, mean=0, std=1.0 / m.weight.shape[1]) - elif isinstance(m, (_BatchNorm, nn.GroupNorm)): - constant_init(m.weight, val=1, bias=0.0001) - if isinstance(m, _BatchNorm): - if m.running_mean is not None: - nn.init.constant_(m.running_mean, 0) - else: - raise TypeError('pretrained must be a str or None. 
But received ' - f'{type(pretrained)}') - - def forward(self, x): - x = self.conv1(x) - x = self.maxpool(x) - - outs = [] - for i, layer in enumerate(self.layers): - x = layer(x) - if i in self.out_indices: - outs.append(x) - - if len(outs) == 1: - return outs[0] - return tuple(outs) - - def train(self, mode=True): - super().train(mode) - self._freeze_stages() - if mode and self.norm_eval: - for m in self.modules(): - if isinstance(m, nn.BatchNorm2d): - m.eval() diff --git a/main/transformer_utils/mmpose/models/backbones/swin.py b/main/transformer_utils/mmpose/models/backbones/swin.py deleted file mode 100644 index 2449cdca591bc0bbf601295bde11efe834b49f8a..0000000000000000000000000000000000000000 --- a/main/transformer_utils/mmpose/models/backbones/swin.py +++ /dev/null @@ -1,733 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from collections import OrderedDict -from copy import deepcopy - -import torch -import torch.nn as nn -import torch.nn.functional as F -import torch.utils.checkpoint as cp -from mmcv.cnn import build_norm_layer, constant_init, trunc_normal_init -from mmcv.cnn.bricks.transformer import FFN, build_dropout -from mmcv.cnn.utils.weight_init import trunc_normal_ -from mmcv.runner import _load_checkpoint -from mmcv.utils import to_2tuple - -from ...utils import get_root_logger -from ..builder import BACKBONES -from ..utils.transformer import PatchEmbed, PatchMerging -from .base_backbone import BaseBackbone -from .utils.ckpt_convert import swin_converter - - -class WindowMSA(nn.Module): - """Window based multi-head self-attention (W-MSA) module with relative - position bias. - - Args: - embed_dims (int): Number of input channels. - num_heads (int): Number of attention heads. - window_size (tuple[int]): The height and width of the window. - qkv_bias (bool, optional): If True, add a learnable bias to q, k, v. - Default: True. - qk_scale (float | None, optional): Override default qk scale of - head_dim ** -0.5 if set. Default: None. - attn_drop_rate (float, optional): Dropout ratio of attention weight. - Default: 0.0 - proj_drop_rate (float, optional): Dropout ratio of output. Default: 0. 
- """ - - def __init__(self, - embed_dims, - num_heads, - window_size, - qkv_bias=True, - qk_scale=None, - attn_drop_rate=0., - proj_drop_rate=0.): - - super().__init__() - self.embed_dims = embed_dims - self.window_size = window_size # Wh, Ww - self.num_heads = num_heads - head_embed_dims = embed_dims // num_heads - self.scale = qk_scale or head_embed_dims**-0.5 - - # define a parameter table of relative position bias - self.relative_position_bias_table = nn.Parameter( - torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), - num_heads)) # 2*Wh-1 * 2*Ww-1, nH - - # About 2x faster than original impl - Wh, Ww = self.window_size - rel_index_coords = self.double_step_seq(2 * Ww - 1, Wh, 1, Ww) - rel_position_index = rel_index_coords + rel_index_coords.T - rel_position_index = rel_position_index.flip(1).contiguous() - self.register_buffer('relative_position_index', rel_position_index) - - self.qkv = nn.Linear(embed_dims, embed_dims * 3, bias=qkv_bias) - self.attn_drop = nn.Dropout(attn_drop_rate) - self.proj = nn.Linear(embed_dims, embed_dims) - self.proj_drop = nn.Dropout(proj_drop_rate) - - self.softmax = nn.Softmax(dim=-1) - - def init_weights(self): - trunc_normal_(self.relative_position_bias_table, std=0.02) - - def forward(self, x, mask=None): - """ - Args: - - x (tensor): input features with shape of (num_windows*B, N, C) - mask (tensor | None, Optional): mask with shape of (num_windows, - Wh*Ww, Wh*Ww), value should be between (-inf, 0]. - """ - B, N, C = x.shape - qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, - C // self.num_heads).permute(2, 0, 3, 1, 4) - # make torchscript happy (cannot use tensor as tuple) - q, k, v = qkv[0], qkv[1], qkv[2] - - q = q * self.scale - attn = (q @ k.transpose(-2, -1)) - - relative_position_bias = self.relative_position_bias_table[ - self.relative_position_index.view(-1)].view( - self.window_size[0] * self.window_size[1], - self.window_size[0] * self.window_size[1], - -1) # Wh*Ww,Wh*Ww,nH - relative_position_bias = relative_position_bias.permute( - 2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww - attn = attn + relative_position_bias.unsqueeze(0) - - if mask is not None: - nW = mask.shape[0] - attn = attn.view(B // nW, nW, self.num_heads, N, - N) + mask.unsqueeze(1).unsqueeze(0) - attn = attn.view(-1, self.num_heads, N, N) - attn = self.softmax(attn) - - attn = self.attn_drop(attn) - - x = (attn @ v).transpose(1, 2).reshape(B, N, C) - x = self.proj(x) - x = self.proj_drop(x) - return x - - @staticmethod - def double_step_seq(step1, len1, step2, len2): - seq1 = torch.arange(0, step1 * len1, step1) - seq2 = torch.arange(0, step2 * len2, step2) - return (seq1[:, None] + seq2[None, :]).reshape(1, -1) - - -class ShiftWindowMSA(nn.Module): - """Shifted Window Multihead Self-Attention Module. - - Args: - embed_dims (int): Number of input channels. - num_heads (int): Number of attention heads. - window_size (int): The height and width of the window. - shift_size (int, optional): The shift step of each window towards - right-bottom. If zero, act as regular window-msa. Defaults to 0. - qkv_bias (bool, optional): If True, add a learnable bias to q, k, v. - Default: True - qk_scale (float | None, optional): Override default qk scale of - head_dim ** -0.5 if set. Defaults: None. - attn_drop_rate (float, optional): Dropout ratio of attention weight. - Defaults: 0. - proj_drop_rate (float, optional): Dropout ratio of output. - Defaults: 0. - dropout_layer (dict, optional): The dropout_layer used before output. - Defaults: dict(type='DropPath', drop_prob=0.). 
- """ - - def __init__(self, - embed_dims, - num_heads, - window_size, - shift_size=0, - qkv_bias=True, - qk_scale=None, - attn_drop_rate=0, - proj_drop_rate=0, - dropout_layer=dict(type='DropPath', drop_prob=0.)): - super().__init__() - - self.window_size = window_size - self.shift_size = shift_size - assert 0 <= self.shift_size < self.window_size - - self.w_msa = WindowMSA( - embed_dims=embed_dims, - num_heads=num_heads, - window_size=to_2tuple(window_size), - qkv_bias=qkv_bias, - qk_scale=qk_scale, - attn_drop_rate=attn_drop_rate, - proj_drop_rate=proj_drop_rate) - - self.drop = build_dropout(dropout_layer) - - def forward(self, query, hw_shape): - B, L, C = query.shape - H, W = hw_shape - assert L == H * W, 'input feature has wrong size' - query = query.view(B, H, W, C) - - # pad feature maps to multiples of window size - pad_r = (self.window_size - W % self.window_size) % self.window_size - pad_b = (self.window_size - H % self.window_size) % self.window_size - query = F.pad(query, (0, 0, 0, pad_r, 0, pad_b)) - H_pad, W_pad = query.shape[1], query.shape[2] - - # cyclic shift - if self.shift_size > 0: - shifted_query = torch.roll( - query, - shifts=(-self.shift_size, -self.shift_size), - dims=(1, 2)) - - # calculate attention mask for SW-MSA - img_mask = torch.zeros((1, H_pad, W_pad, 1), device=query.device) - h_slices = (slice(0, -self.window_size), - slice(-self.window_size, - -self.shift_size), slice(-self.shift_size, None)) - w_slices = (slice(0, -self.window_size), - slice(-self.window_size, - -self.shift_size), slice(-self.shift_size, None)) - cnt = 0 - for h in h_slices: - for w in w_slices: - img_mask[:, h, w, :] = cnt - cnt += 1 - - # nW, window_size, window_size, 1 - mask_windows = self.window_partition(img_mask) - mask_windows = mask_windows.view( - -1, self.window_size * self.window_size) - attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) - attn_mask = attn_mask.masked_fill(attn_mask != 0, - float(-100.0)).masked_fill( - attn_mask == 0, float(0.0)) - else: - shifted_query = query - attn_mask = None - - # nW*B, window_size, window_size, C - query_windows = self.window_partition(shifted_query) - # nW*B, window_size*window_size, C - query_windows = query_windows.view(-1, self.window_size**2, C) - - # W-MSA/SW-MSA (nW*B, window_size*window_size, C) - attn_windows = self.w_msa(query_windows, mask=attn_mask) - - # merge windows - attn_windows = attn_windows.view(-1, self.window_size, - self.window_size, C) - - # B H' W' C - shifted_x = self.window_reverse(attn_windows, H_pad, W_pad) - # reverse cyclic shift - if self.shift_size > 0: - x = torch.roll( - shifted_x, - shifts=(self.shift_size, self.shift_size), - dims=(1, 2)) - else: - x = shifted_x - - if pad_r > 0 or pad_b: - x = x[:, :H, :W, :].contiguous() - - x = x.view(B, H * W, C) - - x = self.drop(x) - return x - - def window_reverse(self, windows, H, W): - """ - Args: - windows: (num_windows*B, window_size, window_size, C) - H (int): Height of image - W (int): Width of image - Returns: - x: (B, H, W, C) - """ - window_size = self.window_size - B = int(windows.shape[0] / (H * W / window_size / window_size)) - x = windows.view(B, H // window_size, W // window_size, window_size, - window_size, -1) - x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) - return x - - def window_partition(self, x): - """ - Args: - x: (B, H, W, C) - Returns: - windows: (num_windows*B, window_size, window_size, C) - """ - B, H, W, C = x.shape - window_size = self.window_size - x = x.view(B, H // window_size, window_size, W 
// window_size, - window_size, C) - windows = x.permute(0, 1, 3, 2, 4, 5).contiguous() - windows = windows.view(-1, window_size, window_size, C) - return windows - - -class SwinBlock(nn.Module): - """" - Args: - embed_dims (int): The feature dimension. - num_heads (int): Parallel attention heads. - feedforward_channels (int): The hidden dimension for FFNs. - window_size (int, optional): The local window scale. Default: 7. - shift (bool, optional): whether to shift window or not. Default False. - qkv_bias (bool, optional): enable bias for qkv if True. Default: True. - qk_scale (float | None, optional): Override default qk scale of - head_dim ** -0.5 if set. Default: None. - drop_rate (float, optional): Dropout rate. Default: 0. - attn_drop_rate (float, optional): Attention dropout rate. Default: 0. - drop_path_rate (float, optional): Stochastic depth rate. Default: 0. - act_cfg (dict, optional): The config dict of activation function. - Default: dict(type='GELU'). - norm_cfg (dict, optional): The config dict of normalization. - Default: dict(type='LN'). - with_cp (bool, optional): Use checkpoint or not. Using checkpoint - will save some memory while slowing down the training speed. - Default: False. - """ - - def __init__(self, - embed_dims, - num_heads, - feedforward_channels, - window_size=7, - shift=False, - qkv_bias=True, - qk_scale=None, - drop_rate=0., - attn_drop_rate=0., - drop_path_rate=0., - act_cfg=dict(type='GELU'), - norm_cfg=dict(type='LN'), - with_cp=False): - - super(SwinBlock, self).__init__() - - self.with_cp = with_cp - - self.norm1 = build_norm_layer(norm_cfg, embed_dims)[1] - self.attn = ShiftWindowMSA( - embed_dims=embed_dims, - num_heads=num_heads, - window_size=window_size, - shift_size=window_size // 2 if shift else 0, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - attn_drop_rate=attn_drop_rate, - proj_drop_rate=drop_rate, - dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate)) - - self.norm2 = build_norm_layer(norm_cfg, embed_dims)[1] - self.ffn = FFN( - embed_dims=embed_dims, - feedforward_channels=feedforward_channels, - num_fcs=2, - ffn_drop=drop_rate, - dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate), - act_cfg=act_cfg, - add_identity=True, - init_cfg=None) - - def forward(self, x, hw_shape): - - def _inner_forward(x): - identity = x - x = self.norm1(x) - x = self.attn(x, hw_shape) - - x = x + identity - - identity = x - x = self.norm2(x) - x = self.ffn(x, identity=identity) - - return x - - if self.with_cp and x.requires_grad: - x = cp.checkpoint(_inner_forward, x) - else: - x = _inner_forward(x) - - return x - - -class SwinBlockSequence(nn.Module): - """Implements one stage in Swin Transformer. - - Args: - embed_dims (int): The feature dimension. - num_heads (int): Parallel attention heads. - feedforward_channels (int): The hidden dimension for FFNs. - depth (int): The number of blocks in this stage. - window_size (int, optional): The local window scale. Default: 7. - qkv_bias (bool, optional): enable bias for qkv if True. Default: True. - qk_scale (float | None, optional): Override default qk scale of - head_dim ** -0.5 if set. Default: None. - drop_rate (float, optional): Dropout rate. Default: 0. - attn_drop_rate (float, optional): Attention dropout rate. Default: 0. - drop_path_rate (float | list[float], optional): Stochastic depth - rate. Default: 0. - downsample (nn.Module | None, optional): The downsample operation - module. Default: None. - act_cfg (dict, optional): The config dict of activation function. 
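`window_partition` and `window_reverse` above are exact inverses on padded inputs, which is what lets `ShiftWindowMSA` reassemble the feature map after attention. A quick round-trip check, written as a standalone sketch:

```python
import torch

def window_partition(x, ws):
    # (B, H, W, C) -> (num_windows*B, ws, ws, C)
    B, H, W, C = x.shape
    x = x.view(B, H // ws, ws, W // ws, ws, C)
    return x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, ws, ws, C)

def window_reverse(windows, ws, H, W):
    # (num_windows*B, ws, ws, C) -> (B, H, W, C)
    B = int(windows.shape[0] / (H * W / ws / ws))
    x = windows.view(B, H // ws, W // ws, ws, ws, -1)
    return x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)

x = torch.randn(2, 14, 14, 96)   # H, W already padded to multiples of ws
w = window_partition(x, 7)       # (2 * 4, 7, 7, 96): four windows per image
assert torch.equal(window_reverse(w, 7, 14, 14), x)
```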
- Default: dict(type='GELU'). - norm_cfg (dict, optional): The config dict of normalization. - Default: dict(type='LN'). - with_cp (bool, optional): Use checkpoint or not. Using checkpoint - will save some memory while slowing down the training speed. - Default: False. - """ - - def __init__(self, - embed_dims, - num_heads, - feedforward_channels, - depth, - window_size=7, - qkv_bias=True, - qk_scale=None, - drop_rate=0., - attn_drop_rate=0., - drop_path_rate=0., - downsample=None, - act_cfg=dict(type='GELU'), - norm_cfg=dict(type='LN'), - with_cp=False): - super().__init__() - - if isinstance(drop_path_rate, list): - drop_path_rates = drop_path_rate - assert len(drop_path_rates) == depth - else: - drop_path_rates = [deepcopy(drop_path_rate) for _ in range(depth)] - - self.blocks = nn.ModuleList() - for i in range(depth): - block = SwinBlock( - embed_dims=embed_dims, - num_heads=num_heads, - feedforward_channels=feedforward_channels, - window_size=window_size, - shift=False if i % 2 == 0 else True, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - drop_rate=drop_rate, - attn_drop_rate=attn_drop_rate, - drop_path_rate=drop_path_rates[i], - act_cfg=act_cfg, - norm_cfg=norm_cfg, - with_cp=with_cp) - self.blocks.append(block) - - self.downsample = downsample - - def forward(self, x, hw_shape): - for block in self.blocks: - x = block(x, hw_shape) - - if self.downsample: - x_down, down_hw_shape = self.downsample(x, hw_shape) - return x_down, down_hw_shape, x, hw_shape - else: - return x, hw_shape, x, hw_shape - - -@BACKBONES.register_module() -class SwinTransformer(BaseBackbone): - """ Swin Transformer - A PyTorch implement of : `Swin Transformer: - Hierarchical Vision Transformer using Shifted Windows` - - https://arxiv.org/abs/2103.14030 - - Inspiration from - https://github.com/microsoft/Swin-Transformer - - Args: - pretrain_img_size (int | tuple[int]): The size of input image when - pretrain. Defaults: 224. - in_channels (int): The num of input channels. - Defaults: 3. - embed_dims (int): The feature dimension. Default: 96. - patch_size (int | tuple[int]): Patch size. Default: 4. - window_size (int): Window size. Default: 7. - mlp_ratio (int): Ratio of mlp hidden dim to embedding dim. - Default: 4. - depths (tuple[int]): Depths of each Swin Transformer stage. - Default: (2, 2, 6, 2). - num_heads (tuple[int]): Parallel attention heads of each Swin - Transformer stage. Default: (3, 6, 12, 24). - strides (tuple[int]): The patch merging or patch embedding stride of - each Swin Transformer stage. (In swin, we set kernel size equal to - stride.) Default: (4, 2, 2, 2). - out_indices (tuple[int]): Output from which stages. - Default: (0, 1, 2, 3). - qkv_bias (bool, optional): If True, add a learnable bias to query, key, - value. Default: True - qk_scale (float | None, optional): Override default qk scale of - head_dim ** -0.5 if set. Default: None. - patch_norm (bool): If add a norm layer for patch embed and patch - merging. Default: True. - drop_rate (float): Dropout rate. Defaults: 0. - attn_drop_rate (float): Attention dropout rate. Default: 0. - drop_path_rate (float): Stochastic depth rate. Defaults: 0.1. - use_abs_pos_embed (bool): If True, add absolute position embedding to - the patch embedding. Defaults: False. - act_cfg (dict): Config dict for activation layer. - Default: dict(type='LN'). - norm_cfg (dict): Config dict for normalization layer at - output of backone. Defaults: dict(type='LN'). - with_cp (bool, optional): Use checkpoint or not. 
Using checkpoint - will save some memory while slowing down the training speed. - Default: False. - pretrained (str, optional): model pretrained path. Default: None. - convert_weights (bool): The flag indicates whether the - pre-trained model is from the original repo. We may need - to convert some keys to make it compatible. - Default: False. - frozen_stages (int): Stages to be frozen (stop grad and set eval mode). - Default: -1 (-1 means not freezing any parameters). - """ - - def __init__( - self, - pretrain_img_size=224, - in_channels=3, - embed_dims=96, - patch_size=4, - window_size=7, - mlp_ratio=4, - depths=(2, 2, 6, 2), - num_heads=(3, 6, 12, 24), - strides=(4, 2, 2, 2), - out_indices=(0, 1, 2, 3), - qkv_bias=True, - qk_scale=None, - patch_norm=True, - drop_rate=0., - attn_drop_rate=0., - drop_path_rate=0.1, - use_abs_pos_embed=False, - act_cfg=dict(type='GELU'), - norm_cfg=dict(type='LN'), - with_cp=False, - convert_weights=False, - frozen_stages=-1, - ): - self.convert_weights = convert_weights - self.frozen_stages = frozen_stages - if isinstance(pretrain_img_size, int): - pretrain_img_size = to_2tuple(pretrain_img_size) - elif isinstance(pretrain_img_size, tuple): - if len(pretrain_img_size) == 1: - pretrain_img_size = to_2tuple(pretrain_img_size[0]) - assert len(pretrain_img_size) == 2, \ - f'The size of image should have length 1 or 2, ' \ - f'but got {len(pretrain_img_size)}' - - super(SwinTransformer, self).__init__() - - num_layers = len(depths) - self.out_indices = out_indices - self.use_abs_pos_embed = use_abs_pos_embed - - assert strides[0] == patch_size, 'Use non-overlapping patch embed.' - - self.patch_embed = PatchEmbed( - in_channels=in_channels, - embed_dims=embed_dims, - conv_type='Conv2d', - kernel_size=patch_size, - stride=strides[0], - norm_cfg=norm_cfg if patch_norm else None, - init_cfg=None) - - if self.use_abs_pos_embed: - patch_row = pretrain_img_size[0] // patch_size - patch_col = pretrain_img_size[1] // patch_size - num_patches = patch_row * patch_col - self.absolute_pos_embed = nn.Parameter( - torch.zeros((1, num_patches, embed_dims))) - - self.drop_after_pos = nn.Dropout(p=drop_rate) - - # set stochastic depth decay rule - total_depth = sum(depths) - dpr = [ - x.item() for x in torch.linspace(0, drop_path_rate, total_depth) - ] - - self.stages = nn.ModuleList() - in_channels = embed_dims - for i in range(num_layers): - if i < num_layers - 1: - downsample = PatchMerging( - in_channels=in_channels, - out_channels=2 * in_channels, - stride=strides[i + 1], - norm_cfg=norm_cfg if patch_norm else None, - init_cfg=None) - else: - downsample = None - - stage = SwinBlockSequence( - embed_dims=in_channels, - num_heads=num_heads[i], - feedforward_channels=mlp_ratio * in_channels, - depth=depths[i], - window_size=window_size, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - drop_rate=drop_rate, - attn_drop_rate=attn_drop_rate, - drop_path_rate=dpr[sum(depths[:i]):sum(depths[:i + 1])], - downsample=downsample, - act_cfg=act_cfg, - norm_cfg=norm_cfg, - with_cp=with_cp) - self.stages.append(stage) - if downsample: - in_channels = downsample.out_channels - - self.num_features = [int(embed_dims * 2**i) for i in range(num_layers)] - # Add a norm layer for each output - for i in out_indices: - layer = build_norm_layer(norm_cfg, self.num_features[i])[1] - layer_name = f'norm{i}' - self.add_module(layer_name, layer) - - def train(self, mode=True): - """Convert the model into training mode while keep layers freezed.""" - super(SwinTransformer, self).train(mode) - 
self._freeze_stages() - - def _freeze_stages(self): - if self.frozen_stages >= 0: - self.patch_embed.eval() - for param in self.patch_embed.parameters(): - param.requires_grad = False - if self.use_abs_pos_embed: - self.absolute_pos_embed.requires_grad = False - self.drop_after_pos.eval() - - for i in range(1, self.frozen_stages + 1): - - if (i - 1) in self.out_indices: - norm_layer = getattr(self, f'norm{i-1}') - norm_layer.eval() - for param in norm_layer.parameters(): - param.requires_grad = False - - m = self.stages[i - 1] - m.eval() - for param in m.parameters(): - param.requires_grad = False - - def init_weights(self, pretrained=None): - """Initialize the weights in backbone. - - Args: - pretrained (str, optional): Path to pre-trained weights. - Defaults to None. - """ - if isinstance(pretrained, str): - logger = get_root_logger() - ckpt = _load_checkpoint( - pretrained, logger=logger, map_location='cpu') - if 'state_dict' in ckpt: - _state_dict = ckpt['state_dict'] - elif 'model' in ckpt: - _state_dict = ckpt['model'] - else: - _state_dict = ckpt - if self.convert_weights: - # supported loading weight from original repo, - _state_dict = swin_converter(_state_dict) - - state_dict = OrderedDict() - for k, v in _state_dict.items(): - if k.startswith('backbone.'): - state_dict[k[9:]] = v - - # strip prefix of state_dict - if list(state_dict.keys())[0].startswith('module.'): - state_dict = {k[7:]: v for k, v in state_dict.items()} - - # reshape absolute position embedding - if state_dict.get('absolute_pos_embed') is not None: - absolute_pos_embed = state_dict['absolute_pos_embed'] - N1, L, C1 = absolute_pos_embed.size() - N2, C2, H, W = self.absolute_pos_embed.size() - if N1 != N2 or C1 != C2 or L != H * W: - logger.warning('Error in loading absolute_pos_embed, pass') - else: - state_dict['absolute_pos_embed'] = absolute_pos_embed.view( - N2, H, W, C2).permute(0, 3, 1, 2).contiguous() - - # interpolate position bias table if needed - relative_position_bias_table_keys = [ - k for k in state_dict.keys() - if 'relative_position_bias_table' in k - ] - for table_key in relative_position_bias_table_keys: - table_pretrained = state_dict[table_key] - table_current = self.state_dict()[table_key] - L1, nH1 = table_pretrained.size() - L2, nH2 = table_current.size() - if nH1 != nH2: - logger.warning(f'Error in loading {table_key}, pass') - elif L1 != L2: - S1 = int(L1**0.5) - S2 = int(L2**0.5) - table_pretrained_resized = F.interpolate( - table_pretrained.permute(1, 0).reshape(1, nH1, S1, S1), - size=(S2, S2), - mode='bicubic') - state_dict[table_key] = table_pretrained_resized.view( - nH2, L2).permute(1, 0).contiguous() - - # load state_dict - self.load_state_dict(state_dict, False) - elif pretrained is None: - if self.use_abs_pos_embed: - trunc_normal_(self.absolute_pos_embed, std=0.02) - for m in self.modules(): - if isinstance(m, nn.Linear): - trunc_normal_init(m, std=.02, bias=0.) 
- elif isinstance(m, nn.LayerNorm): - constant_init(m, 1.0) - else: - raise TypeError('pretrained must be a str or None') - - def forward(self, x): - x, hw_shape = self.patch_embed(x) - - if self.use_abs_pos_embed: - x = x + self.absolute_pos_embed - x = self.drop_after_pos(x) - - outs = [] - for i, stage in enumerate(self.stages): - x, hw_shape, out, out_hw_shape = stage(x, hw_shape) - if i in self.out_indices: - norm_layer = getattr(self, f'norm{i}') - out = norm_layer(out) - out = out.view(-1, *out_hw_shape, - self.num_features[i]).permute(0, 3, 1, - 2).contiguous() - outs.append(out) - - return outs diff --git a/main/transformer_utils/mmpose/models/backbones/tcformer.py b/main/transformer_utils/mmpose/models/backbones/tcformer.py deleted file mode 100644 index a0805cdddd17bbba50bf203e2bc9012efd86ba03..0000000000000000000000000000000000000000 --- a/main/transformer_utils/mmpose/models/backbones/tcformer.py +++ /dev/null @@ -1,283 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import math - -import torch -import torch.nn as nn -from mmcv.cnn import (build_norm_layer, constant_init, normal_init, - trunc_normal_init) -from mmcv.runner import _load_checkpoint, load_state_dict - -from ...utils import get_root_logger -from ..builder import BACKBONES -from ..utils import (PatchEmbed, TCFormerDynamicBlock, TCFormerRegularBlock, - TokenConv, cluster_dpc_knn, merge_tokens, - tcformer_convert, token2map) - - -class CTM(nn.Module): - """Clustering-based Token Merging module in TCFormer. - - Args: - sample_ratio (float): The sample ratio of tokens. - embed_dim (int): Input token feature dimension. - dim_out (int): Output token feature dimension. - k (int): number of the nearest neighbor used i DPC-knn algorithm. - """ - - def __init__(self, sample_ratio, embed_dim, dim_out, k=5): - super().__init__() - self.sample_ratio = sample_ratio - self.dim_out = dim_out - self.conv = TokenConv( - in_channels=embed_dim, - out_channels=dim_out, - kernel_size=3, - stride=2, - padding=1) - self.norm = nn.LayerNorm(self.dim_out) - self.score = nn.Linear(self.dim_out, 1) - self.k = k - - def forward(self, token_dict): - token_dict = token_dict.copy() - x = self.conv(token_dict) - x = self.norm(x) - token_score = self.score(x) - token_weight = token_score.exp() - - token_dict['x'] = x - B, N, C = x.shape - token_dict['token_score'] = token_score - - cluster_num = max(math.ceil(N * self.sample_ratio), 1) - idx_cluster, cluster_num = cluster_dpc_knn(token_dict, cluster_num, - self.k) - down_dict = merge_tokens(token_dict, idx_cluster, cluster_num, - token_weight) - - H, W = token_dict['map_size'] - H = math.floor((H - 1) / 2 + 1) - W = math.floor((W - 1) / 2 + 1) - down_dict['map_size'] = [H, W] - - return down_dict, token_dict - - -@BACKBONES.register_module() -class TCFormer(nn.Module): - """Token Clustering Transformer (TCFormer) - - Implementation of `Not All Tokens Are Equal: Human-centric Visual - Analysis via Token Clustering Transformer - ` - - Args: - in_channels (int): Number of input channels. Default: 3. - embed_dims (list[int]): Embedding dimension. Default: - [64, 128, 256, 512]. - num_heads (Sequence[int]): The attention heads of each transformer - encode layer. Default: [1, 2, 5, 8]. - mlp_ratios (Sequence[int]): The ratio of the mlp hidden dim to the - embedding dim of each transformer block. - qkv_bias (bool): Enable bias for qkv if True. Default: True. - qk_scale (float | None, optional): Override default qk scale of - head_dim ** -0.5 if set. Default: None. 
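Both `SwinTransformer` above and `TCFormer` below build their stochastic-depth schedule the same way: one linspace over the total block count, sliced per stage, so deeper blocks get larger drop-path probabilities. A sketch with Swin-T style depths (the numbers here are illustrative):

```python
import torch

depths = (2, 2, 6, 2)               # per-stage block counts, Swin-T style
drop_path_rate = 0.1
# Rates grow linearly from 0 to drop_path_rate over all blocks.
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]
for i in range(len(depths)):
    stage_rates = dpr[sum(depths[:i]):sum(depths[:i + 1])]
    print(f'stage {i}:', [round(r, 3) for r in stage_rates])
# stage 0 gets the smallest rates, stage 3 ends at drop_path_rate itself.
```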
- drop_rate (float): Probability of an element to be zeroed. - Default 0.0. - attn_drop_rate (float): The drop out rate for attention layer. - Default 0.0. - drop_path_rate (float): stochastic depth rate. Default 0. - norm_cfg (dict): Config dict for normalization layer. - Default: dict(type='LN', eps=1e-6). - num_layers (Sequence[int]): The layer number of each transformer encode - layer. Default: [3, 4, 6, 3]. - sr_ratios (Sequence[int]): The spatial reduction rate of each - transformer block. Default: [8, 4, 2, 1]. - num_stages (int): The num of stages. Default: 4. - pretrained (str, optional): model pretrained path. Default: None. - k (int): number of the nearest neighbor used for local density. - sample_ratios (list[float]): The sample ratios of CTM modules. - Default: [0.25, 0.25, 0.25] - return_map (bool): If True, transfer dynamic tokens to feature map at - last. Default: False - convert_weights (bool): The flag indicates whether the - pre-trained model is from the original repo. We may need - to convert some keys to make it compatible. - Default: True. - """ - - def __init__(self, - in_channels=3, - embed_dims=[64, 128, 256, 512], - num_heads=[1, 2, 4, 8], - mlp_ratios=[4, 4, 4, 4], - qkv_bias=True, - qk_scale=None, - drop_rate=0., - attn_drop_rate=0., - drop_path_rate=0., - norm_cfg=dict(type='LN', eps=1e-6), - num_layers=[3, 4, 6, 3], - sr_ratios=[8, 4, 2, 1], - num_stages=4, - pretrained=None, - k=5, - sample_ratios=[0.25, 0.25, 0.25], - return_map=False, - convert_weights=True): - super().__init__() - - self.num_layers = num_layers - self.num_stages = num_stages - self.grid_stride = sr_ratios[0] - self.embed_dims = embed_dims - self.sr_ratios = sr_ratios - self.mlp_ratios = mlp_ratios - self.sample_ratios = sample_ratios - self.return_map = return_map - self.convert_weights = convert_weights - - # stochastic depth decay rule - dpr = [ - x.item() - for x in torch.linspace(0, drop_path_rate, sum(num_layers)) - ] - cur = 0 - - # In stage 1, use the standard transformer blocks - for i in range(1): - patch_embed = PatchEmbed( - in_channels=in_channels if i == 0 else embed_dims[i - 1], - embed_dims=embed_dims[i], - kernel_size=7, - stride=4, - padding=3, - bias=True, - norm_cfg=dict(type='LN', eps=1e-6)) - - block = nn.ModuleList([ - TCFormerRegularBlock( - dim=embed_dims[i], - num_heads=num_heads[i], - mlp_ratio=mlp_ratios[i], - qkv_bias=qkv_bias, - qk_scale=qk_scale, - drop=drop_rate, - attn_drop=attn_drop_rate, - drop_path=dpr[cur + j], - norm_cfg=norm_cfg, - sr_ratio=sr_ratios[i]) for j in range(num_layers[i]) - ]) - norm = build_norm_layer(norm_cfg, embed_dims[i])[1] - - cur += num_layers[i] - - setattr(self, f'patch_embed{i + 1}', patch_embed) - setattr(self, f'block{i + 1}', block) - setattr(self, f'norm{i + 1}', norm) - - # In stage 2~4, use TCFormerDynamicBlock for dynamic tokens - for i in range(1, num_stages): - ctm = CTM(sample_ratios[i - 1], embed_dims[i - 1], embed_dims[i], - k) - - block = nn.ModuleList([ - TCFormerDynamicBlock( - dim=embed_dims[i], - num_heads=num_heads[i], - mlp_ratio=mlp_ratios[i], - qkv_bias=qkv_bias, - qk_scale=qk_scale, - drop=drop_rate, - attn_drop=attn_drop_rate, - drop_path=dpr[cur + j], - norm_cfg=norm_cfg, - sr_ratio=sr_ratios[i]) for j in range(num_layers[i]) - ]) - norm = build_norm_layer(norm_cfg, embed_dims[i])[1] - cur += num_layers[i] - - setattr(self, f'ctm{i}', ctm) - setattr(self, f'block{i + 1}', block) - setattr(self, f'norm{i + 1}', norm) - - self.init_weights(pretrained) - - def init_weights(self, pretrained=None): - if 
isinstance(pretrained, str): - logger = get_root_logger() - - checkpoint = _load_checkpoint( - pretrained, logger=logger, map_location='cpu') - logger.warning(f'Load pre-trained model for ' - f'{self.__class__.__name__} from original repo') - if 'state_dict' in checkpoint: - state_dict = checkpoint['state_dict'] - elif 'model' in checkpoint: - state_dict = checkpoint['model'] - else: - state_dict = checkpoint - - if self.convert_weights: - # We need to convert pre-trained weights to match this - # implementation. - state_dict = tcformer_convert(state_dict) - load_state_dict(self, state_dict, strict=False, logger=logger) - - elif pretrained is None: - for m in self.modules(): - if isinstance(m, nn.Linear): - trunc_normal_init(m, std=.02, bias=0.) - elif isinstance(m, nn.LayerNorm): - constant_init(m, 1.0) - elif isinstance(m, nn.Conv2d): - fan_out = m.kernel_size[0] * m.kernel_size[ - 1] * m.out_channels - fan_out //= m.groups - normal_init(m, 0, math.sqrt(2.0 / fan_out)) - else: - raise TypeError('pretrained must be a str or None') - - def forward(self, x): - outs = [] - - i = 0 - patch_embed = getattr(self, f'patch_embed{i + 1}') - block = getattr(self, f'block{i + 1}') - norm = getattr(self, f'norm{i + 1}') - x, (H, W) = patch_embed(x) - for blk in block: - x = blk(x, H, W) - x = norm(x) - - # init token dict - B, N, _ = x.shape - device = x.device - idx_token = torch.arange(N)[None, :].repeat(B, 1).to(device) - agg_weight = x.new_ones(B, N, 1) - token_dict = { - 'x': x, - 'token_num': N, - 'map_size': [H, W], - 'init_grid_size': [H, W], - 'idx_token': idx_token, - 'agg_weight': agg_weight - } - outs.append(token_dict.copy()) - - # stage 2~4 - for i in range(1, self.num_stages): - ctm = getattr(self, f'ctm{i}') - block = getattr(self, f'block{i + 1}') - norm = getattr(self, f'norm{i + 1}') - - token_dict = ctm(token_dict) # down sample - for j, blk in enumerate(block): - token_dict = blk(token_dict) - - token_dict['x'] = norm(token_dict['x']) - outs.append(token_dict) - - if self.return_map: - outs = [token2map(token_dict) for token_dict in outs] - return outs diff --git a/main/transformer_utils/mmpose/models/backbones/tcn.py b/main/transformer_utils/mmpose/models/backbones/tcn.py deleted file mode 100644 index deca2290aeb1830bc3e241b819157369371aaf27..0000000000000000000000000000000000000000 --- a/main/transformer_utils/mmpose/models/backbones/tcn.py +++ /dev/null @@ -1,267 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import copy - -import torch.nn as nn -from mmcv.cnn import ConvModule, build_conv_layer, constant_init, kaiming_init -from mmcv.utils.parrots_wrapper import _BatchNorm - -from mmpose.core import WeightNormClipHook -from ..builder import BACKBONES -from .base_backbone import BaseBackbone - - -class BasicTemporalBlock(nn.Module): - """Basic block for VideoPose3D. - - Args: - in_channels (int): Input channels of this block. - out_channels (int): Output channels of this block. - mid_channels (int): The output channels of conv1. Default: 1024. - kernel_size (int): Size of the convolving kernel. Default: 3. - dilation (int): Spacing between kernel elements. Default: 3. - dropout (float): Dropout rate. Default: 0.25. - causal (bool): Use causal convolutions instead of symmetric - convolutions (for real-time applications). Default: False. - residual (bool): Use residual connection. Default: True. - use_stride_conv (bool): Use optimized TCN that designed - specifically for single-frame batching, i.e. 
where batches have - input length = receptive field, and output length = 1. This - implementation replaces dilated convolutions with strided - convolutions to avoid generating unused intermediate results. - Default: False. - conv_cfg (dict): dictionary to construct and config conv layer. - Default: dict(type='Conv1d'). - norm_cfg (dict): dictionary to construct and config norm layer. - Default: dict(type='BN1d'). - """ - - def __init__(self, - in_channels, - out_channels, - mid_channels=1024, - kernel_size=3, - dilation=3, - dropout=0.25, - causal=False, - residual=True, - use_stride_conv=False, - conv_cfg=dict(type='Conv1d'), - norm_cfg=dict(type='BN1d')): - # Protect mutable default arguments - conv_cfg = copy.deepcopy(conv_cfg) - norm_cfg = copy.deepcopy(norm_cfg) - super().__init__() - self.in_channels = in_channels - self.out_channels = out_channels - self.mid_channels = mid_channels - self.kernel_size = kernel_size - self.dilation = dilation - self.dropout = dropout - self.causal = causal - self.residual = residual - self.use_stride_conv = use_stride_conv - - self.pad = (kernel_size - 1) * dilation // 2 - if use_stride_conv: - self.stride = kernel_size - self.causal_shift = kernel_size // 2 if causal else 0 - self.dilation = 1 - else: - self.stride = 1 - self.causal_shift = kernel_size // 2 * dilation if causal else 0 - - self.conv1 = nn.Sequential( - ConvModule( - in_channels, - mid_channels, - kernel_size=kernel_size, - stride=self.stride, - dilation=self.dilation, - bias='auto', - conv_cfg=conv_cfg, - norm_cfg=norm_cfg)) - self.conv2 = nn.Sequential( - ConvModule( - mid_channels, - out_channels, - kernel_size=1, - bias='auto', - conv_cfg=conv_cfg, - norm_cfg=norm_cfg)) - - if residual and in_channels != out_channels: - self.short_cut = build_conv_layer(conv_cfg, in_channels, - out_channels, 1) - else: - self.short_cut = None - - self.dropout = nn.Dropout(dropout) if dropout > 0 else None - - def forward(self, x): - """Forward function.""" - if self.use_stride_conv: - assert self.causal_shift + self.kernel_size // 2 < x.shape[2] - else: - assert 0 <= self.pad + self.causal_shift < x.shape[2] - \ - self.pad + self.causal_shift <= x.shape[2] - - out = self.conv1(x) - if self.dropout is not None: - out = self.dropout(out) - - out = self.conv2(out) - if self.dropout is not None: - out = self.dropout(out) - - if self.residual: - if self.use_stride_conv: - res = x[:, :, self.causal_shift + - self.kernel_size // 2::self.kernel_size] - else: - res = x[:, :, - (self.pad + self.causal_shift):(x.shape[2] - self.pad + - self.causal_shift)] - - if self.short_cut is not None: - res = self.short_cut(res) - out = out + res - - return out - - -@BACKBONES.register_module() -class TCN(BaseBackbone): - """TCN backbone. - - Temporal Convolutional Networks. - More details can be found in the - `paper `__ . - - Args: - in_channels (int): Number of input channels, which equals to - num_keypoints * num_features. - stem_channels (int): Number of feature channels. Default: 1024. - num_blocks (int): NUmber of basic temporal convolutional blocks. - Default: 2. - kernel_sizes (Sequence[int]): Sizes of the convolving kernel of - each basic block. Default: ``(3, 3, 3)``. - dropout (float): Dropout rate. Default: 0.25. - causal (bool): Use causal convolutions instead of symmetric - convolutions (for real-time applications). - Default: False. - residual (bool): Use residual connection. Default: True. - use_stride_conv (bool): Use TCN backbone optimized for - single-frame batching, i.e. 
where batches have input length = - receptive field, and output length = 1. This implementation - replaces dilated convolutions with strided convolutions to avoid - generating unused intermediate results. The weights are - interchangeable with the reference implementation. Default: False - conv_cfg (dict): dictionary to construct and config conv layer. - Default: dict(type='Conv1d'). - norm_cfg (dict): dictionary to construct and config norm layer. - Default: dict(type='BN1d'). - max_norm (float|None): if not None, the weight of convolution layers - will be clipped to have a maximum norm of max_norm. - - Example: - >>> from mmpose.models import TCN - >>> import torch - >>> self = TCN(in_channels=34) - >>> self.eval() - >>> inputs = torch.rand(1, 34, 243) - >>> level_outputs = self.forward(inputs) - >>> for level_out in level_outputs: - ... print(tuple(level_out.shape)) - (1, 1024, 235) - (1, 1024, 217) - """ - - def __init__(self, - in_channels, - stem_channels=1024, - num_blocks=2, - kernel_sizes=(3, 3, 3), - dropout=0.25, - causal=False, - residual=True, - use_stride_conv=False, - conv_cfg=dict(type='Conv1d'), - norm_cfg=dict(type='BN1d'), - max_norm=None): - # Protect mutable default arguments - conv_cfg = copy.deepcopy(conv_cfg) - norm_cfg = copy.deepcopy(norm_cfg) - super().__init__() - self.in_channels = in_channels - self.stem_channels = stem_channels - self.num_blocks = num_blocks - self.kernel_sizes = kernel_sizes - self.dropout = dropout - self.causal = causal - self.residual = residual - self.use_stride_conv = use_stride_conv - self.max_norm = max_norm - - assert num_blocks == len(kernel_sizes) - 1 - for ks in kernel_sizes: - assert ks % 2 == 1, 'Only odd filter widths are supported.' - - self.expand_conv = ConvModule( - in_channels, - stem_channels, - kernel_size=kernel_sizes[0], - stride=kernel_sizes[0] if use_stride_conv else 1, - bias='auto', - conv_cfg=conv_cfg, - norm_cfg=norm_cfg) - - dilation = kernel_sizes[0] - self.tcn_blocks = nn.ModuleList() - for i in range(1, num_blocks + 1): - self.tcn_blocks.append( - BasicTemporalBlock( - in_channels=stem_channels, - out_channels=stem_channels, - mid_channels=stem_channels, - kernel_size=kernel_sizes[i], - dilation=dilation, - dropout=dropout, - causal=causal, - residual=residual, - use_stride_conv=use_stride_conv, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg)) - dilation *= kernel_sizes[i] - - if self.max_norm is not None: - # Apply weight norm clip to conv layers - weight_clip = WeightNormClipHook(self.max_norm) - for module in self.modules(): - if isinstance(module, nn.modules.conv._ConvNd): - weight_clip.register(module) - - self.dropout = nn.Dropout(dropout) if dropout > 0 else None - - def forward(self, x): - """Forward function.""" - x = self.expand_conv(x) - - if self.dropout is not None: - x = self.dropout(x) - - outs = [] - for i in range(self.num_blocks): - x = self.tcn_blocks[i](x) - outs.append(x) - - return tuple(outs) - - def init_weights(self, pretrained=None): - """Initialize the weights.""" - super().init_weights(pretrained) - if pretrained is None: - for m in self.modules(): - if isinstance(m, nn.modules.conv._ConvNd): - kaiming_init(m, mode='fan_in', nonlinearity='relu') - elif isinstance(m, _BatchNorm): - constant_init(m, 1) diff --git a/main/transformer_utils/mmpose/models/backbones/utils/utils.py b/main/transformer_utils/mmpose/models/backbones/utils/utils.py index 2a53c94a90a1802cc0c4dcfceba241711c989640..5f6ab9b43202b6491911f4e0a713cbf3f210566d 100644 --- 
a/main/transformer_utils/mmpose/models/backbones/utils/utils.py +++ b/main/transformer_utils/mmpose/models/backbones/utils/utils.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. from collections import OrderedDict -from mmcv.runner.checkpoint import _load_checkpoint, load_state_dict +from mmengine.runner import load_state_dict # Copyright (c) Open-MMLab. All rights reserved. @@ -22,11 +22,11 @@ from torch.utils import model_zoo from torch.nn import functional as F import mmcv -from mmcv.fileio import FileClient -from mmcv.fileio import load as load_file -from mmcv.parallel import is_module_wrapper -from mmcv.utils import mkdir_or_exist -from mmcv.runner import get_dist_info +from mmengine.fileio import FileClient +from mmengine.fileio import load as load_file +# from mmengine.model.wrappers.utils import is_module_wrapper +from mmengine.utils import mkdir_or_exist +from mmengine.dist import get_dist_info from scipy import interpolate import numpy as np @@ -75,8 +75,8 @@ def load_state_dict(module, state_dict, strict=False, logger=None): def load(module, prefix=''): # recursively check parallel module in case that the model has a # complicated structure, e.g., nn.Module(nn.Module(DDP)) - if is_module_wrapper(module): - module = module.module + # if is_module_wrapper(module): + # module = module.module local_metadata = {} if metadata is None else metadata.get( prefix[:-1], {}) module._load_from_state_dict(state_dict, prefix, local_metadata, True, @@ -445,8 +445,8 @@ def get_state_dict(module, destination=None, prefix='', keep_vars=False): """ # recursively check parallel module in case that the model has a # complicated structure, e.g., nn.Module(nn.Module(DDP)) - if is_module_wrapper(module): - module = module.module + # if is_module_wrapper(module): + # module = module.module # below is the same as torch.nn.Module.state_dict() if destination is None: @@ -482,8 +482,8 @@ def save_checkpoint(model, filename, optimizer=None, meta=None): raise TypeError(f'meta must be a dict or None, but got {type(meta)}') meta.update(mmcv_version=mmcv.__version__, time=time.asctime()) - if is_module_wrapper(model): - model = model.module + # if is_module_wrapper(model): + # model = model.module if hasattr(model, 'CLASSES') and model.CLASSES is not None: # save class name to the meta diff --git a/main/transformer_utils/mmpose/models/backbones/v2v_net.py b/main/transformer_utils/mmpose/models/backbones/v2v_net.py deleted file mode 100644 index 99462af711069a34c13628364e2c466163507861..0000000000000000000000000000000000000000 --- a/main/transformer_utils/mmpose/models/backbones/v2v_net.py +++ /dev/null @@ -1,257 +0,0 @@ -# ------------------------------------------------------------------------------ -# Copyright and License Information -# Adapted from -# https://github.com/microsoft/voxelpose-pytorch/blob/main/lib/models/v2v_net.py -# Original Licence: MIT License -# ------------------------------------------------------------------------------ - -import torch.nn as nn -import torch.nn.functional as F -from mmcv.cnn import ConvModule - -from ..builder import BACKBONES -from .base_backbone import BaseBackbone - - -class Basic3DBlock(nn.Module): - """A basic 3D convolutional block. - - Args: - in_channels (int): Input channels of this block. - out_channels (int): Output channels of this block. - kernel_size (int): Kernel size of the convolution operation - conv_cfg (dict): Dictionary to construct and config conv layer. 
- Default: dict(type='Conv3d') - norm_cfg (dict): Dictionary to construct and config norm layer. - Default: dict(type='BN3d') - """ - - def __init__(self, - in_channels, - out_channels, - kernel_size, - conv_cfg=dict(type='Conv3d'), - norm_cfg=dict(type='BN3d')): - super(Basic3DBlock, self).__init__() - self.block = ConvModule( - in_channels, - out_channels, - kernel_size, - stride=1, - padding=((kernel_size - 1) // 2), - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - bias=True) - - def forward(self, x): - """Forward function.""" - return self.block(x) - - -class Res3DBlock(nn.Module): - """A residual 3D convolutional block. - - Args: - in_channels (int): Input channels of this block. - out_channels (int): Output channels of this block. - kernel_size (int): Kernel size of the convolution operation - Default: 3 - conv_cfg (dict): Dictionary to construct and config conv layer. - Default: dict(type='Conv3d') - norm_cfg (dict): Dictionary to construct and config norm layer. - Default: dict(type='BN3d') - """ - - def __init__(self, - in_channels, - out_channels, - kernel_size=3, - conv_cfg=dict(type='Conv3d'), - norm_cfg=dict(type='BN3d')): - super(Res3DBlock, self).__init__() - self.res_branch = nn.Sequential( - ConvModule( - in_channels, - out_channels, - kernel_size, - stride=1, - padding=((kernel_size - 1) // 2), - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - bias=True), - ConvModule( - out_channels, - out_channels, - kernel_size, - stride=1, - padding=((kernel_size - 1) // 2), - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=None, - bias=True)) - - if in_channels == out_channels: - self.skip_con = nn.Sequential() - else: - self.skip_con = ConvModule( - in_channels, - out_channels, - 1, - stride=1, - padding=0, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=None, - bias=True) - - def forward(self, x): - """Forward function.""" - res = self.res_branch(x) - skip = self.skip_con(x) - return F.relu(res + skip, True) - - -class Pool3DBlock(nn.Module): - """A 3D max-pool block. - - Args: - pool_size (int): Pool size of the 3D max-pool layer - """ - - def __init__(self, pool_size): - super(Pool3DBlock, self).__init__() - self.pool_size = pool_size - - def forward(self, x): - """Forward function.""" - return F.max_pool3d( - x, kernel_size=self.pool_size, stride=self.pool_size) - - -class Upsample3DBlock(nn.Module): - """A 3D upsample block. - - Args: - in_channels (int): Input channels of this block. - out_channels (int): Output channels of this block. - kernel_size (int): Kernel size of the transposed convolution operation. - Default: 2 - stride (int): Kernel size of the transposed convolution operation. - Default: 2 - """ - - def __init__(self, in_channels, out_channels, kernel_size=2, stride=2): - super(Upsample3DBlock, self).__init__() - assert kernel_size == 2 - assert stride == 2 - self.block = nn.Sequential( - nn.ConvTranspose3d( - in_channels, - out_channels, - kernel_size=kernel_size, - stride=stride, - padding=0, - output_padding=0), nn.BatchNorm3d(out_channels), nn.ReLU(True)) - - def forward(self, x): - """Forward function.""" - return self.block(x) - - -class EncoderDecorder(nn.Module): - """An encoder-decoder block. 
- - Args: - in_channels (int): Input channels of this block - """ - - def __init__(self, in_channels=32): - super(EncoderDecorder, self).__init__() - - self.encoder_pool1 = Pool3DBlock(2) - self.encoder_res1 = Res3DBlock(in_channels, in_channels * 2) - self.encoder_pool2 = Pool3DBlock(2) - self.encoder_res2 = Res3DBlock(in_channels * 2, in_channels * 4) - - self.mid_res = Res3DBlock(in_channels * 4, in_channels * 4) - - self.decoder_res2 = Res3DBlock(in_channels * 4, in_channels * 4) - self.decoder_upsample2 = Upsample3DBlock(in_channels * 4, - in_channels * 2, 2, 2) - self.decoder_res1 = Res3DBlock(in_channels * 2, in_channels * 2) - self.decoder_upsample1 = Upsample3DBlock(in_channels * 2, in_channels, - 2, 2) - - self.skip_res1 = Res3DBlock(in_channels, in_channels) - self.skip_res2 = Res3DBlock(in_channels * 2, in_channels * 2) - - def forward(self, x): - """Forward function.""" - skip_x1 = self.skip_res1(x) - x = self.encoder_pool1(x) - x = self.encoder_res1(x) - - skip_x2 = self.skip_res2(x) - x = self.encoder_pool2(x) - x = self.encoder_res2(x) - - x = self.mid_res(x) - - x = self.decoder_res2(x) - x = self.decoder_upsample2(x) - x = x + skip_x2 - - x = self.decoder_res1(x) - x = self.decoder_upsample1(x) - x = x + skip_x1 - - return x - - -@BACKBONES.register_module() -class V2VNet(BaseBackbone): - """V2VNet. - - Please refer to the `paper ` - for details. - - Args: - input_channels (int): - Number of channels of the input feature volume. - output_channels (int): - Number of channels of the output volume. - mid_channels (int): - Input and output channels of the encoder-decoder block. - """ - - def __init__(self, input_channels, output_channels, mid_channels=32): - super(V2VNet, self).__init__() - - self.front_layers = nn.Sequential( - Basic3DBlock(input_channels, mid_channels // 2, 7), - Res3DBlock(mid_channels // 2, mid_channels), - ) - - self.encoder_decoder = EncoderDecorder(in_channels=mid_channels) - - self.output_layer = nn.Conv3d( - mid_channels, output_channels, kernel_size=1, stride=1, padding=0) - - self._initialize_weights() - - def forward(self, x): - """Forward function.""" - x = self.front_layers(x) - x = self.encoder_decoder(x) - x = self.output_layer(x) - - return x - - def _initialize_weights(self): - for m in self.modules(): - if isinstance(m, nn.Conv3d): - nn.init.normal_(m.weight, 0, 0.001) - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.ConvTranspose3d): - nn.init.normal_(m.weight, 0, 0.001) - nn.init.constant_(m.bias, 0) diff --git a/main/transformer_utils/mmpose/models/backbones/vgg.py b/main/transformer_utils/mmpose/models/backbones/vgg.py deleted file mode 100644 index f7d467017a5520f399c84b1235ec64c99b805b42..0000000000000000000000000000000000000000 --- a/main/transformer_utils/mmpose/models/backbones/vgg.py +++ /dev/null @@ -1,193 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
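The `EncoderDecorder` deleted above is a compact 3D U-Net-like module: resolution halves at each pooling level while channels double, and decoder features are merged with encoder skips by addition rather than concatenation. A toy sketch of the same wiring (plain convs stand in for the Res3DBlocks; channel counts and sizes are illustrative):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class TinyEncoderDecoder3D(nn.Module):
    # Two pooling levels with additive skip connections, mirroring the
    # deleted EncoderDecorder: resolution /2 twice, channels x2 twice.
    def __init__(self, c=8):
        super().__init__()
        self.enc1 = nn.Conv3d(c, 2 * c, 3, padding=1)
        self.enc2 = nn.Conv3d(2 * c, 4 * c, 3, padding=1)
        self.mid = nn.Conv3d(4 * c, 4 * c, 3, padding=1)
        self.up2 = nn.ConvTranspose3d(4 * c, 2 * c, 2, stride=2)
        self.up1 = nn.ConvTranspose3d(2 * c, c, 2, stride=2)

    def forward(self, x):
        skip1 = x
        x = self.enc1(F.max_pool3d(x, 2))
        skip2 = x
        x = self.enc2(F.max_pool3d(x, 2))
        x = self.mid(x)
        x = self.up2(x) + skip2      # add, not concatenate, as in V2VNet
        x = self.up1(x) + skip1
        return x

vol = torch.randn(1, 8, 16, 16, 16)
print(TinyEncoderDecoder3D()(vol).shape)  # torch.Size([1, 8, 16, 16, 16])
```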
-import torch.nn as nn -from mmcv.cnn import ConvModule, constant_init, kaiming_init, normal_init -from mmcv.utils.parrots_wrapper import _BatchNorm - -from ..builder import BACKBONES -from .base_backbone import BaseBackbone - - -def make_vgg_layer(in_channels, - out_channels, - num_blocks, - conv_cfg=None, - norm_cfg=None, - act_cfg=dict(type='ReLU'), - dilation=1, - with_norm=False, - ceil_mode=False): - layers = [] - for _ in range(num_blocks): - layer = ConvModule( - in_channels=in_channels, - out_channels=out_channels, - kernel_size=3, - dilation=dilation, - padding=dilation, - bias=True, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg) - layers.append(layer) - in_channels = out_channels - layers.append(nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=ceil_mode)) - - return layers - - -@BACKBONES.register_module() -class VGG(BaseBackbone): - """VGG backbone. - - Args: - depth (int): Depth of vgg, from {11, 13, 16, 19}. - with_norm (bool): Use BatchNorm or not. - num_classes (int): number of classes for classification. - num_stages (int): VGG stages, normally 5. - dilations (Sequence[int]): Dilation of each stage. - out_indices (Sequence[int]): Output from which stages. If only one - stage is specified, a single tensor (feature map) is returned, - otherwise multiple stages are specified, a tuple of tensors will - be returned. When it is None, the default behavior depends on - whether num_classes is specified. If num_classes <= 0, the default - value is (4, ), outputting the last feature map before classifier. - If num_classes > 0, the default value is (5, ), outputting the - classification score. Default: None. - frozen_stages (int): Stages to be frozen (all param fixed). -1 means - not freezing any parameters. - norm_eval (bool): Whether to set norm layers to eval mode, namely, - freeze running stats (mean and var). Note: Effect on Batch Norm - and its variants only. Default: False. - ceil_mode (bool): Whether to use ceil_mode of MaxPool. Default: False. - with_last_pool (bool): Whether to keep the last pooling before - classifier. Default: True. - """ - - # Parameters to build layers. Each element specifies the number of conv in - # each stage. For example, VGG11 contains 11 layers with learnable - # parameters. 11 is computed as 11 = (1 + 1 + 2 + 2 + 2) + 3, - # where 3 indicates the last three fully-connected layers. 
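`make_vgg_layer` above produces one stage as `num_blocks` 3x3 convs followed by a max-pool, and the stage widths double up to 512. A condensed sketch of how `stage_blocks` translates into the 13 conv layers of VGG-16 (standalone, using plain torch modules instead of ConvModule):

```python
import torch.nn as nn

def make_stage(in_c, out_c, num_blocks):
    # num_blocks 3x3 convs, then one 2x2 max-pool, as in make_vgg_layer.
    layers = []
    for _ in range(num_blocks):
        layers += [nn.Conv2d(in_c, out_c, 3, padding=1), nn.ReLU(inplace=True)]
        in_c = out_c
    layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
    return layers

stage_blocks = (2, 2, 3, 3, 3)      # VGG-16: 13 convs + 3 FC layers = 16
layers, in_c = [], 3
for i, n in enumerate(stage_blocks):
    out_c = 64 * 2 ** i if i < 4 else 512   # 64, 128, 256, 512, 512
    layers += make_stage(in_c, out_c, n)
    in_c = out_c
features = nn.Sequential(*layers)
print(sum(isinstance(m, nn.Conv2d) for m in features))  # 13
```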
- arch_settings = { - 11: (1, 1, 2, 2, 2), - 13: (2, 2, 2, 2, 2), - 16: (2, 2, 3, 3, 3), - 19: (2, 2, 4, 4, 4) - } - - def __init__(self, - depth, - num_classes=-1, - num_stages=5, - dilations=(1, 1, 1, 1, 1), - out_indices=None, - frozen_stages=-1, - conv_cfg=None, - norm_cfg=None, - act_cfg=dict(type='ReLU'), - norm_eval=False, - ceil_mode=False, - with_last_pool=True): - super().__init__() - if depth not in self.arch_settings: - raise KeyError(f'invalid depth {depth} for vgg') - assert num_stages >= 1 and num_stages <= 5 - stage_blocks = self.arch_settings[depth] - self.stage_blocks = stage_blocks[:num_stages] - assert len(dilations) == num_stages - - self.num_classes = num_classes - self.frozen_stages = frozen_stages - self.norm_eval = norm_eval - with_norm = norm_cfg is not None - - if out_indices is None: - out_indices = (5, ) if num_classes > 0 else (4, ) - assert max(out_indices) <= num_stages - self.out_indices = out_indices - - self.in_channels = 3 - start_idx = 0 - vgg_layers = [] - self.range_sub_modules = [] - for i, num_blocks in enumerate(self.stage_blocks): - num_modules = num_blocks + 1 - end_idx = start_idx + num_modules - dilation = dilations[i] - out_channels = 64 * 2**i if i < 4 else 512 - vgg_layer = make_vgg_layer( - self.in_channels, - out_channels, - num_blocks, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg, - dilation=dilation, - with_norm=with_norm, - ceil_mode=ceil_mode) - vgg_layers.extend(vgg_layer) - self.in_channels = out_channels - self.range_sub_modules.append([start_idx, end_idx]) - start_idx = end_idx - if not with_last_pool: - vgg_layers.pop(-1) - self.range_sub_modules[-1][1] -= 1 - self.module_name = 'features' - self.add_module(self.module_name, nn.Sequential(*vgg_layers)) - - if self.num_classes > 0: - self.classifier = nn.Sequential( - nn.Linear(512 * 7 * 7, 4096), - nn.ReLU(True), - nn.Dropout(), - nn.Linear(4096, 4096), - nn.ReLU(True), - nn.Dropout(), - nn.Linear(4096, num_classes), - ) - - def init_weights(self, pretrained=None): - super().init_weights(pretrained) - if pretrained is None: - for m in self.modules(): - if isinstance(m, nn.Conv2d): - kaiming_init(m) - elif isinstance(m, _BatchNorm): - constant_init(m, 1) - elif isinstance(m, nn.Linear): - normal_init(m, std=0.01) - - def forward(self, x): - outs = [] - vgg_layers = getattr(self, self.module_name) - for i in range(len(self.stage_blocks)): - for j in range(*self.range_sub_modules[i]): - vgg_layer = vgg_layers[j] - x = vgg_layer(x) - if i in self.out_indices: - outs.append(x) - if self.num_classes > 0: - x = x.view(x.size(0), -1) - x = self.classifier(x) - outs.append(x) - if len(outs) == 1: - return outs[0] - else: - return tuple(outs) - - def _freeze_stages(self): - vgg_layers = getattr(self, self.module_name) - for i in range(self.frozen_stages): - for j in range(*self.range_sub_modules[i]): - m = vgg_layers[j] - m.eval() - for param in m.parameters(): - param.requires_grad = False - - def train(self, mode=True): - super().train(mode) - self._freeze_stages() - if mode and self.norm_eval: - for m in self.modules(): - # trick: eval have effect on BatchNorm only - if isinstance(m, _BatchNorm): - m.eval() diff --git a/main/transformer_utils/mmpose/models/backbones/vipnas_mbv3.py b/main/transformer_utils/mmpose/models/backbones/vipnas_mbv3.py deleted file mode 100644 index ed990e3966b27301dbaf081e3ec0e908704dfc8b..0000000000000000000000000000000000000000 --- a/main/transformer_utils/mmpose/models/backbones/vipnas_mbv3.py +++ /dev/null @@ -1,179 +0,0 @@ -# Copyright (c) 
OpenMMLab. All rights reserved. -import copy -import logging - -import torch.nn as nn -from mmcv.cnn import ConvModule -from torch.nn.modules.batchnorm import _BatchNorm - -from ..builder import BACKBONES -from .base_backbone import BaseBackbone -from .utils import InvertedResidual, load_checkpoint - - -@BACKBONES.register_module() -class ViPNAS_MobileNetV3(BaseBackbone): - """ViPNAS_MobileNetV3 backbone. - - "ViPNAS: Efficient Video Pose Estimation via Neural Architecture Search" - More details can be found in the `paper - `__ . - - Args: - wid (list(int)): Searched width config for each stage. - expan (list(int)): Searched expansion ratio config for each stage. - dep (list(int)): Searched depth config for each stage. - ks (list(int)): Searched kernel size config for each stage. - group (list(int)): Searched group number config for each stage. - att (list(bool)): Searched attention config for each stage. - stride (list(int)): Stride config for each stage. - act (list(dict)): Activation config for each stage. - conv_cfg (dict): Config dict for convolution layer. - Default: None, which means using conv2d. - norm_cfg (dict): Config dict for normalization layer. - Default: dict(type='BN'). - frozen_stages (int): Stages to be frozen (all param fixed). - Default: -1, which means not freezing any parameters. - norm_eval (bool): Whether to set norm layers to eval mode, namely, - freeze running stats (mean and var). Note: Effect on Batch Norm - and its variants only. Default: False. - with_cp (bool): Use checkpoint or not. Using checkpoint will save - some memory while slowing down the training speed. - Default: False. - """ - - def __init__(self, - wid=[16, 16, 24, 40, 80, 112, 160], - expan=[None, 1, 5, 4, 5, 5, 6], - dep=[None, 1, 4, 4, 4, 4, 4], - ks=[3, 3, 7, 7, 5, 7, 5], - group=[None, 8, 120, 20, 100, 280, 240], - att=[None, True, True, False, True, True, True], - stride=[2, 1, 2, 2, 2, 1, 2], - act=[ - 'HSwish', 'ReLU', 'ReLU', 'ReLU', 'HSwish', 'HSwish', - 'HSwish' - ], - conv_cfg=None, - norm_cfg=dict(type='BN'), - frozen_stages=-1, - norm_eval=False, - with_cp=False): - # Protect mutable default arguments - norm_cfg = copy.deepcopy(norm_cfg) - super().__init__() - self.wid = wid - self.expan = expan - self.dep = dep - self.ks = ks - self.group = group - self.att = att - self.stride = stride - self.act = act - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg - self.frozen_stages = frozen_stages - self.norm_eval = norm_eval - self.with_cp = with_cp - - self.conv1 = ConvModule( - in_channels=3, - out_channels=self.wid[0], - kernel_size=self.ks[0], - stride=self.stride[0], - padding=self.ks[0] // 2, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=dict(type=self.act[0])) - - self.layers = self._make_layer() - - def _make_layer(self): - layers = [] - layer_index = 0 - for i, dep in enumerate(self.dep[1:]): - mid_channels = self.wid[i + 1] * self.expan[i + 1] - - if self.att[i + 1]: - se_cfg = dict( - channels=mid_channels, - ratio=4, - act_cfg=(dict(type='ReLU'), dict(type='HSigmoid'))) - else: - se_cfg = None - - if self.expan[i + 1] == 1: - with_expand_conv = False - else: - with_expand_conv = True - - for j in range(dep): - if j == 0: - stride = self.stride[i + 1] - in_channels = self.wid[i] - else: - stride = 1 - in_channels = self.wid[i + 1] - - layer = InvertedResidual( - in_channels=in_channels, - out_channels=self.wid[i + 1], - mid_channels=mid_channels, - kernel_size=self.ks[i + 1], - groups=self.group[i + 1], - stride=stride, - se_cfg=se_cfg, - 
-                    with_expand_conv=with_expand_conv,
-                    conv_cfg=self.conv_cfg,
-                    norm_cfg=self.norm_cfg,
-                    act_cfg=dict(type=self.act[i + 1]),
-                    with_cp=self.with_cp)
-                layer_index += 1
-                layer_name = f'layer{layer_index}'
-                self.add_module(layer_name, layer)
-                layers.append(layer_name)
-        return layers
-
-    def init_weights(self, pretrained=None):
-        if isinstance(pretrained, str):
-            logger = logging.getLogger()
-            load_checkpoint(self, pretrained, strict=False, logger=logger)
-        elif pretrained is None:
-            for m in self.modules():
-                if isinstance(m, nn.Conv2d):
-                    nn.init.normal_(m.weight, std=0.001)
-                    for name, _ in m.named_parameters():
-                        if name in ['bias']:
-                            nn.init.constant_(m.bias, 0)
-                elif isinstance(m, nn.BatchNorm2d):
-                    nn.init.constant_(m.weight, 1)
-                    nn.init.constant_(m.bias, 0)
-        else:
-            raise TypeError('pretrained must be a str or None')
-
-    def forward(self, x):
-        x = self.conv1(x)
-
-        for i, layer_name in enumerate(self.layers):
-            layer = getattr(self, layer_name)
-            x = layer(x)
-
-        return x
-
-    def _freeze_stages(self):
-        if self.frozen_stages >= 0:
-            for param in self.conv1.parameters():
-                param.requires_grad = False
-            for i in range(1, self.frozen_stages + 1):
-                layer = getattr(self, f'layer{i}')
-                layer.eval()
-                for param in layer.parameters():
-                    param.requires_grad = False
-
-    def train(self, mode=True):
-        super().train(mode)
-        self._freeze_stages()
-        if mode and self.norm_eval:
-            for m in self.modules():
-                if isinstance(m, _BatchNorm):
-                    m.eval()
diff --git a/main/transformer_utils/mmpose/models/backbones/vipnas_resnet.py b/main/transformer_utils/mmpose/models/backbones/vipnas_resnet.py
deleted file mode 100644
index 81b028ed5f5caad5f59c68b7f82c1a4661cf4d6f..0000000000000000000000000000000000000000
--- a/main/transformer_utils/mmpose/models/backbones/vipnas_resnet.py
+++ /dev/null
@@ -1,589 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-import copy
-
-import torch.nn as nn
-import torch.utils.checkpoint as cp
-from mmcv.cnn import ConvModule, build_conv_layer, build_norm_layer
-from mmcv.cnn.bricks import ContextBlock
-from mmcv.utils.parrots_wrapper import _BatchNorm
-
-from ..builder import BACKBONES
-from .base_backbone import BaseBackbone
-
-
-class ViPNAS_Bottleneck(nn.Module):
-    """Bottleneck block for ViPNAS_ResNet.
-
-    Args:
-        in_channels (int): Input channels of this block.
-        out_channels (int): Output channels of this block.
-        expansion (int): The ratio of ``out_channels/mid_channels`` where
-            ``mid_channels`` is the input/output channels of conv2. Default: 4.
-        stride (int): stride of the block. Default: 1
-        dilation (int): dilation of convolution. Default: 1
-        downsample (nn.Module): downsample operation on identity branch.
-            Default: None.
-        style (str): ``"pytorch"`` or ``"caffe"``. If set to "pytorch", the
-            stride-two layer is the 3x3 conv layer, otherwise the stride-two
-            layer is the first 1x1 conv layer. Default: "pytorch".
-        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
-            memory while slowing down the training speed.
-        conv_cfg (dict): dictionary to construct and config conv layer.
-            Default: None
-        norm_cfg (dict): dictionary to construct and config norm layer.
-            Default: dict(type='BN')
-        kernel_size (int): kernel size of conv2 searched in ViPANS.
-        groups (int): group number of conv2 searched in ViPNAS.
-        attention (bool): whether to use attention module in the end of
-            the block.
- """ - - def __init__(self, - in_channels, - out_channels, - expansion=4, - stride=1, - dilation=1, - downsample=None, - style='pytorch', - with_cp=False, - conv_cfg=None, - norm_cfg=dict(type='BN'), - kernel_size=3, - groups=1, - attention=False): - # Protect mutable default arguments - norm_cfg = copy.deepcopy(norm_cfg) - super().__init__() - assert style in ['pytorch', 'caffe'] - - self.in_channels = in_channels - self.out_channels = out_channels - self.expansion = expansion - assert out_channels % expansion == 0 - self.mid_channels = out_channels // expansion - self.stride = stride - self.dilation = dilation - self.style = style - self.with_cp = with_cp - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg - - if self.style == 'pytorch': - self.conv1_stride = 1 - self.conv2_stride = stride - else: - self.conv1_stride = stride - self.conv2_stride = 1 - - self.norm1_name, norm1 = build_norm_layer( - norm_cfg, self.mid_channels, postfix=1) - self.norm2_name, norm2 = build_norm_layer( - norm_cfg, self.mid_channels, postfix=2) - self.norm3_name, norm3 = build_norm_layer( - norm_cfg, out_channels, postfix=3) - - self.conv1 = build_conv_layer( - conv_cfg, - in_channels, - self.mid_channels, - kernel_size=1, - stride=self.conv1_stride, - bias=False) - self.add_module(self.norm1_name, norm1) - self.conv2 = build_conv_layer( - conv_cfg, - self.mid_channels, - self.mid_channels, - kernel_size=kernel_size, - stride=self.conv2_stride, - padding=kernel_size // 2, - groups=groups, - dilation=dilation, - bias=False) - - self.add_module(self.norm2_name, norm2) - self.conv3 = build_conv_layer( - conv_cfg, - self.mid_channels, - out_channels, - kernel_size=1, - bias=False) - self.add_module(self.norm3_name, norm3) - - if attention: - self.attention = ContextBlock(out_channels, - max(1.0 / 16, 16.0 / out_channels)) - else: - self.attention = None - - self.relu = nn.ReLU(inplace=True) - self.downsample = downsample - - @property - def norm1(self): - """nn.Module: the normalization layer named "norm1" """ - return getattr(self, self.norm1_name) - - @property - def norm2(self): - """nn.Module: the normalization layer named "norm2" """ - return getattr(self, self.norm2_name) - - @property - def norm3(self): - """nn.Module: the normalization layer named "norm3" """ - return getattr(self, self.norm3_name) - - def forward(self, x): - """Forward function.""" - - def _inner_forward(x): - identity = x - - out = self.conv1(x) - out = self.norm1(out) - out = self.relu(out) - - out = self.conv2(out) - out = self.norm2(out) - out = self.relu(out) - - out = self.conv3(out) - out = self.norm3(out) - - if self.attention is not None: - out = self.attention(out) - - if self.downsample is not None: - identity = self.downsample(x) - - out += identity - - return out - - if self.with_cp and x.requires_grad: - out = cp.checkpoint(_inner_forward, x) - else: - out = _inner_forward(x) - - out = self.relu(out) - - return out - - -def get_expansion(block, expansion=None): - """Get the expansion of a residual block. - - The block expansion will be obtained by the following order: - - 1. If ``expansion`` is given, just return it. - 2. If ``block`` has the attribute ``expansion``, then return - ``block.expansion``. - 3. Return the default value according the the block type: - 4 for ``ViPNAS_Bottleneck``. - - Args: - block (class): The block class. - expansion (int | None): The given expansion ratio. - - Returns: - int: The expansion of the block. 
- """ - if isinstance(expansion, int): - assert expansion > 0 - elif expansion is None: - if hasattr(block, 'expansion'): - expansion = block.expansion - elif issubclass(block, ViPNAS_Bottleneck): - expansion = 1 - else: - raise TypeError(f'expansion is not specified for {block.__name__}') - else: - raise TypeError('expansion must be an integer or None') - - return expansion - - -class ViPNAS_ResLayer(nn.Sequential): - """ViPNAS_ResLayer to build ResNet style backbone. - - Args: - block (nn.Module): Residual block used to build ViPNAS ResLayer. - num_blocks (int): Number of blocks. - in_channels (int): Input channels of this block. - out_channels (int): Output channels of this block. - expansion (int, optional): The expansion for BasicBlock/Bottleneck. - If not specified, it will firstly be obtained via - ``block.expansion``. If the block has no attribute "expansion", - the following default values will be used: 1 for BasicBlock and - 4 for Bottleneck. Default: None. - stride (int): stride of the first block. Default: 1. - avg_down (bool): Use AvgPool instead of stride conv when - downsampling in the bottleneck. Default: False - conv_cfg (dict): dictionary to construct and config conv layer. - Default: None - norm_cfg (dict): dictionary to construct and config norm layer. - Default: dict(type='BN') - downsample_first (bool): Downsample at the first block or last block. - False for Hourglass, True for ResNet. Default: True - kernel_size (int): Kernel Size of the corresponding convolution layer - searched in the block. - groups (int): Group number of the corresponding convolution layer - searched in the block. - attention (bool): Whether to use attention module in the end of the - block. - """ - - def __init__(self, - block, - num_blocks, - in_channels, - out_channels, - expansion=None, - stride=1, - avg_down=False, - conv_cfg=None, - norm_cfg=dict(type='BN'), - downsample_first=True, - kernel_size=3, - groups=1, - attention=False, - **kwargs): - # Protect mutable default arguments - norm_cfg = copy.deepcopy(norm_cfg) - self.block = block - self.expansion = get_expansion(block, expansion) - - downsample = None - if stride != 1 or in_channels != out_channels: - downsample = [] - conv_stride = stride - if avg_down and stride != 1: - conv_stride = 1 - downsample.append( - nn.AvgPool2d( - kernel_size=stride, - stride=stride, - ceil_mode=True, - count_include_pad=False)) - downsample.extend([ - build_conv_layer( - conv_cfg, - in_channels, - out_channels, - kernel_size=1, - stride=conv_stride, - bias=False), - build_norm_layer(norm_cfg, out_channels)[1] - ]) - downsample = nn.Sequential(*downsample) - - layers = [] - if downsample_first: - layers.append( - block( - in_channels=in_channels, - out_channels=out_channels, - expansion=self.expansion, - stride=stride, - downsample=downsample, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - kernel_size=kernel_size, - groups=groups, - attention=attention, - **kwargs)) - in_channels = out_channels - for _ in range(1, num_blocks): - layers.append( - block( - in_channels=in_channels, - out_channels=out_channels, - expansion=self.expansion, - stride=1, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - kernel_size=kernel_size, - groups=groups, - attention=attention, - **kwargs)) - else: # downsample_first=False is for HourglassModule - for i in range(0, num_blocks - 1): - layers.append( - block( - in_channels=in_channels, - out_channels=in_channels, - expansion=self.expansion, - stride=1, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - kernel_size=kernel_size, - 
-                        groups=groups,
-                        attention=attention,
-                        **kwargs))
-            layers.append(
-                block(
-                    in_channels=in_channels,
-                    out_channels=out_channels,
-                    expansion=self.expansion,
-                    stride=stride,
-                    downsample=downsample,
-                    conv_cfg=conv_cfg,
-                    norm_cfg=norm_cfg,
-                    kernel_size=kernel_size,
-                    groups=groups,
-                    attention=attention,
-                    **kwargs))
-
-        super().__init__(*layers)
-
-
-@BACKBONES.register_module()
-class ViPNAS_ResNet(BaseBackbone):
-    """ViPNAS_ResNet backbone.
-
-    "ViPNAS: Efficient Video Pose Estimation via Neural Architecture Search"
-    More details can be found in the `paper
-    `__ .
-
-    Args:
-        depth (int): Network depth, from {18, 34, 50, 101, 152}.
-        in_channels (int): Number of input image channels. Default: 3.
-        num_stages (int): Stages of the network. Default: 4.
-        strides (Sequence[int]): Strides of the first block of each stage.
-            Default: ``(1, 2, 2, 2)``.
-        dilations (Sequence[int]): Dilation of each stage.
-            Default: ``(1, 1, 1, 1)``.
-        out_indices (Sequence[int]): Output from which stages. If only one
-            stage is specified, a single tensor (feature map) is returned,
-            otherwise multiple stages are specified, a tuple of tensors will
-            be returned. Default: ``(3, )``.
-        style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two
-            layer is the 3x3 conv layer, otherwise the stride-two layer is
-            the first 1x1 conv layer.
-        deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv.
-            Default: False.
-        avg_down (bool): Use AvgPool instead of stride conv when
-            downsampling in the bottleneck. Default: False.
-        frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
-            -1 means not freezing any parameters. Default: -1.
-        conv_cfg (dict | None): The config dict for conv layers. Default: None.
-        norm_cfg (dict): The config dict for norm layers.
-        norm_eval (bool): Whether to set norm layers to eval mode, namely,
-            freeze running stats (mean and var). Note: Effect on Batch Norm
-            and its variants only. Default: False.
-        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
-            memory while slowing down the training speed. Default: False.
-        zero_init_residual (bool): Whether to use zero init for last norm layer
-            in resblocks to let them behave as identity. Default: True.
-        wid (list(int)): Searched width config for each stage.
-        expan (list(int)): Searched expansion ratio config for each stage.
-        dep (list(int)): Searched depth config for each stage.
-        ks (list(int)): Searched kernel size config for each stage.
-        group (list(int)): Searched group number config for each stage.
-        att (list(bool)): Searched attention config for each stage.
- """ - - arch_settings = { - 50: ViPNAS_Bottleneck, - } - - def __init__(self, - depth, - in_channels=3, - num_stages=4, - strides=(1, 2, 2, 2), - dilations=(1, 1, 1, 1), - out_indices=(3, ), - style='pytorch', - deep_stem=False, - avg_down=False, - frozen_stages=-1, - conv_cfg=None, - norm_cfg=dict(type='BN', requires_grad=True), - norm_eval=False, - with_cp=False, - zero_init_residual=True, - wid=[48, 80, 160, 304, 608], - expan=[None, 1, 1, 1, 1], - dep=[None, 4, 6, 7, 3], - ks=[7, 3, 5, 5, 5], - group=[None, 16, 16, 16, 16], - att=[None, True, False, True, True]): - # Protect mutable default arguments - norm_cfg = copy.deepcopy(norm_cfg) - super().__init__() - if depth not in self.arch_settings: - raise KeyError(f'invalid depth {depth} for resnet') - self.depth = depth - self.stem_channels = dep[0] - self.num_stages = num_stages - assert 1 <= num_stages <= 4 - self.strides = strides - self.dilations = dilations - assert len(strides) == len(dilations) == num_stages - self.out_indices = out_indices - assert max(out_indices) < num_stages - self.style = style - self.deep_stem = deep_stem - self.avg_down = avg_down - self.frozen_stages = frozen_stages - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg - self.with_cp = with_cp - self.norm_eval = norm_eval - self.zero_init_residual = zero_init_residual - self.block = self.arch_settings[depth] - self.stage_blocks = dep[1:1 + num_stages] - - self._make_stem_layer(in_channels, wid[0], ks[0]) - - self.res_layers = [] - _in_channels = wid[0] - for i, num_blocks in enumerate(self.stage_blocks): - expansion = get_expansion(self.block, expan[i + 1]) - _out_channels = wid[i + 1] * expansion - stride = strides[i] - dilation = dilations[i] - res_layer = self.make_res_layer( - block=self.block, - num_blocks=num_blocks, - in_channels=_in_channels, - out_channels=_out_channels, - expansion=expansion, - stride=stride, - dilation=dilation, - style=self.style, - avg_down=self.avg_down, - with_cp=with_cp, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - kernel_size=ks[i + 1], - groups=group[i + 1], - attention=att[i + 1]) - _in_channels = _out_channels - layer_name = f'layer{i + 1}' - self.add_module(layer_name, res_layer) - self.res_layers.append(layer_name) - - self._freeze_stages() - - self.feat_dim = res_layer[-1].out_channels - - def make_res_layer(self, **kwargs): - """Make a ViPNAS ResLayer.""" - return ViPNAS_ResLayer(**kwargs) - - @property - def norm1(self): - """nn.Module: the normalization layer named "norm1" """ - return getattr(self, self.norm1_name) - - def _make_stem_layer(self, in_channels, stem_channels, kernel_size): - """Make stem layer.""" - if self.deep_stem: - self.stem = nn.Sequential( - ConvModule( - in_channels, - stem_channels // 2, - kernel_size=3, - stride=2, - padding=1, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - inplace=True), - ConvModule( - stem_channels // 2, - stem_channels // 2, - kernel_size=3, - stride=1, - padding=1, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - inplace=True), - ConvModule( - stem_channels // 2, - stem_channels, - kernel_size=3, - stride=1, - padding=1, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - inplace=True)) - else: - self.conv1 = build_conv_layer( - self.conv_cfg, - in_channels, - stem_channels, - kernel_size=kernel_size, - stride=2, - padding=kernel_size // 2, - bias=False) - self.norm1_name, norm1 = build_norm_layer( - self.norm_cfg, stem_channels, postfix=1) - self.add_module(self.norm1_name, norm1) - self.relu = nn.ReLU(inplace=True) - self.maxpool = 
-        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
-
-    def _freeze_stages(self):
-        """Freeze parameters."""
-        if self.frozen_stages >= 0:
-            if self.deep_stem:
-                self.stem.eval()
-                for param in self.stem.parameters():
-                    param.requires_grad = False
-            else:
-                self.norm1.eval()
-                for m in [self.conv1, self.norm1]:
-                    for param in m.parameters():
-                        param.requires_grad = False
-
-        for i in range(1, self.frozen_stages + 1):
-            m = getattr(self, f'layer{i}')
-            m.eval()
-            for param in m.parameters():
-                param.requires_grad = False
-
-    def init_weights(self, pretrained=None):
-        """Initialize model weights."""
-        super().init_weights(pretrained)
-        if pretrained is None:
-            for m in self.modules():
-                if isinstance(m, nn.Conv2d):
-                    nn.init.normal_(m.weight, std=0.001)
-                    for name, _ in m.named_parameters():
-                        if name in ['bias']:
-                            nn.init.constant_(m.bias, 0)
-                elif isinstance(m, nn.BatchNorm2d):
-                    nn.init.constant_(m.weight, 1)
-                    nn.init.constant_(m.bias, 0)
-
-    def forward(self, x):
-        """Forward function."""
-        if self.deep_stem:
-            x = self.stem(x)
-        else:
-            x = self.conv1(x)
-            x = self.norm1(x)
-            x = self.relu(x)
-        x = self.maxpool(x)
-        outs = []
-        for i, layer_name in enumerate(self.res_layers):
-            res_layer = getattr(self, layer_name)
-            x = res_layer(x)
-            if i in self.out_indices:
-                outs.append(x)
-        if len(outs) == 1:
-            return outs[0]
-        return tuple(outs)
-
-    def train(self, mode=True):
-        """Convert the model into training mode."""
-        super().train(mode)
-        self._freeze_stages()
-        if mode and self.norm_eval:
-            for m in self.modules():
-                # trick: eval have effect on BatchNorm only
-                if isinstance(m, _BatchNorm):
-                    m.eval()
diff --git a/main/transformer_utils/mmpose/models/builder.py b/main/transformer_utils/mmpose/models/builder.py
index 47f0a53121633fb6185a4d514c05a5862a9d74cf..5fa61ca08374a3e0af8d937fdc197eb51207881c 100644
--- a/main/transformer_utils/mmpose/models/builder.py
+++ b/main/transformer_utils/mmpose/models/builder.py
@@ -1,10 +1,9 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from mmcv.cnn import MODELS as MMCV_MODELS
-from mmcv.cnn import build_model_from_cfg
-from mmcv.utils import Registry, build_from_cfg
+from mmengine.registry import MODELS as MMCV_MODELS
+from mmengine import Registry
+from mmengine.registry import build_from_cfg, build_model_from_cfg
 
-MODELS = Registry(
-    'models', build_func=build_model_from_cfg, parent=MMCV_MODELS)
+MODELS = Registry('models', parent=MMCV_MODELS, locations=['mmpose.models'])
 
 BACKBONES = MODELS
 NECKS = MODELS
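For reference, the `locations` argument used in the rewritten builder.py is how mmengine registries lazily import the modules that register their components. A minimal self-contained sketch of the parent/child pattern; `TinyBackbone` and `my_pkg.backbones` are hypothetical names for illustration, not part of this repository:

from mmengine.registry import Registry

# A child registry inherits its parent's entries and, on the first build,
# imports the packages listed in `locations` so their register_module()
# decorators run.
MODELS = Registry('models')
BACKBONES = Registry('backbone', parent=MODELS, locations=['my_pkg.backbones'])

@BACKBONES.register_module()
class TinyBackbone:
    def __init__(self, depth=18):
        self.depth = depth

# Configs are plain dicts; `type` selects the registered class by name.
backbone = BACKBONES.build(dict(type='TinyBackbone', depth=50))
assert backbone.depth == 50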
diff --git a/main/transformer_utils/mmpose/models/detectors/poseur.py b/main/transformer_utils/mmpose/models/detectors/poseur.py
index b5c98ea95af4ee114e2dc731bf1b3e83489b8563..455803bbd78dca98ba80814d19dbbdb4ccfda8ab 100644
--- a/main/transformer_utils/mmpose/models/detectors/poseur.py
+++ b/main/transformer_utils/mmpose/models/detectors/poseur.py
@@ -12,12 +12,7 @@ from .base import BasePose
 import torch
 from config import cfg
 
-try:
-    from mmcv.runner import auto_fp16
-except ImportError:
-    warnings.warn('auto_fp16 from mmpose will be deprecated from v0.15.0'
-                  'Please install mmcv>=1.1.4')
-    from mmpose.core import auto_fp16
+from mmpose.core import auto_fp16
 
 from .top_down import TopDown
diff --git a/main/transformer_utils/mmpose/models/detectors/top_down.py b/main/transformer_utils/mmpose/models/detectors/top_down.py
index 99215ec70b2381fbc01be6e448e30a09f83cda2b..20bf504e9bfa20a7470203033b2265e067ad33e6 100644
--- a/main/transformer_utils/mmpose/models/detectors/top_down.py
+++ b/main/transformer_utils/mmpose/models/detectors/top_down.py
@@ -4,7 +4,7 @@ import warnings
 import mmcv
 import numpy as np
 from mmcv.image import imwrite
-from mmcv.utils.misc import deprecated_api_warning
+from mmengine.utils import deprecated_api_warning
 from mmcv.visualization.image import imshow
 
 from mmpose.core import imshow_bboxes, imshow_keypoints
@@ -12,12 +12,7 @@ from .. import builder
 from ..builder import POSENETS
 from .base import BasePose
 
-try:
-    from mmcv.runner import auto_fp16
-except ImportError:
-    warnings.warn('auto_fp16 from mmpose will be deprecated from v0.15.0'
-                  'Please install mmcv>=1.1.4')
-    from mmpose.core import auto_fp16
+from mmpose.core import auto_fp16
 
 
 @POSENETS.register_module()
diff --git a/main/transformer_utils/mmpose/models/heads/poseur_head.py b/main/transformer_utils/mmpose/models/heads/poseur_head.py
index d01232247db1d687144d8fff2a3b226dd66fdcf5..db4d8a62acb9991b908d894e515befd0e7f414e8 100644
--- a/main/transformer_utils/mmpose/models/heads/poseur_head.py
+++ b/main/transformer_utils/mmpose/models/heads/poseur_head.py
@@ -4,9 +4,9 @@ import torch.nn as nn
 import copy
 import math
 import warnings
-from mmcv.cnn import build_upsample_layer, Linear, bias_init_with_prob, constant_init, normal_init
+from mmengine.model import constant_init, normal_init, bias_init_with_prob
+from mmcv.cnn import build_upsample_layer, Linear
 import torch.nn.functional as F
-from mmcv.cnn import normal_init
 
 from mmpose.core.evaluation import (keypoint_pck_accuracy,
                                     keypoints_from_regression)
diff --git a/main/transformer_utils/mmpose/models/heads/rle_regression_head.py b/main/transformer_utils/mmpose/models/heads/rle_regression_head.py
index b96a19155f6ec13f86e069d75d15ea4b70f133fa..20f702a573b5fa777c58755e1cc8270885a8bebc 100644
--- a/main/transformer_utils/mmpose/models/heads/rle_regression_head.py
+++ b/main/transformer_utils/mmpose/models/heads/rle_regression_head.py
@@ -1,7 +1,6 @@
 import numpy as np
 import torch.nn as nn
-from mmcv.cnn import normal_init
-
+from mmengine.model import normal_init
 from mmpose.core.evaluation import (keypoint_pck_accuracy,
                                     keypoints_from_regression)
 from mmpose.core.post_processing import fliplr_regression
diff --git a/main/transformer_utils/mmpose/models/heads/topdown_heatmap_multi_stage_head.py b/main/transformer_utils/mmpose/models/heads/topdown_heatmap_multi_stage_head.py
index c439f5b6332d72a66db75bf599035411c4e1e0d1..ac7b42a078a210053150bc353e7c9426285d9599 100644
--- a/main/transformer_utils/mmpose/models/heads/topdown_heatmap_multi_stage_head.py
+++ b/main/transformer_utils/mmpose/models/heads/topdown_heatmap_multi_stage_head.py
@@ -2,10 +2,10 @@ import copy as cp
 
 import torch.nn as nn
+from mmengine.model import constant_init, normal_init, kaiming_init
 from mmcv.cnn import (ConvModule, DepthwiseSeparableConvModule, Linear,
                       build_activation_layer, build_conv_layer,
-                      build_norm_layer, build_upsample_layer, constant_init,
-                      kaiming_init, normal_init)
+                      build_norm_layer, build_upsample_layer)
 
 from mmpose.core.evaluation import pose_pck_accuracy
 from mmpose.core.post_processing import flip_back
diff --git a/main/transformer_utils/mmpose/models/heads/topdown_heatmap_simple_head.py b/main/transformer_utils/mmpose/models/heads/topdown_heatmap_simple_head.py
index 5ddc058d5634a5c63970a1efb8eaa66b158da1ec..1fe95548d7ff2c20516f22e14b24a51da24d7654 100644
--- a/main/transformer_utils/mmpose/models/heads/topdown_heatmap_simple_head.py
+++ b/main/transformer_utils/mmpose/models/heads/topdown_heatmap_simple_head.py
@@ -1,8 +1,8 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import torch
 import torch.nn as nn
-from mmcv.cnn import (build_conv_layer, build_norm_layer, build_upsample_layer,
-                      constant_init, normal_init)
+from mmengine.model import constant_init, normal_init
+from mmcv.cnn import (build_conv_layer, build_norm_layer, build_upsample_layer)
 
 from mmpose.core.evaluation import pose_pck_accuracy
 from mmpose.core.post_processing import flip_back
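The head files above swap mmcv.cnn's weight-init helpers for their mmengine.model equivalents; the call signatures are unchanged. A small standalone sketch of the helpers in use (not taken from this repo):

import torch.nn as nn
from mmengine.model import bias_init_with_prob, constant_init, normal_init

conv = nn.Conv2d(3, 17, 3)
normal_init(conv, mean=0, std=0.001)      # weight ~ N(0, 0.001), bias = 0
bn = nn.BatchNorm2d(17)
constant_init(bn, 1)                      # weight = 1, bias = 0
prior = bias_init_with_prob(0.01)         # focal-loss-style bias prior
normal_init(conv, std=0.01, bias=prior)   # re-init with the prior bias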
diff --git a/main/transformer_utils/mmpose/models/necks/__init__.py b/main/transformer_utils/mmpose/models/necks/__init__.py
deleted file mode 100644
index 0593f61c01fa9968260b939f7ccd50311c058595..0000000000000000000000000000000000000000
--- a/main/transformer_utils/mmpose/models/necks/__init__.py
+++ /dev/null
@@ -1,8 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-from .fpn import FPN
-from .gap_neck import GlobalAveragePooling
-from .posewarper_neck import PoseWarperNeck
-from .tcformer_mta_neck import MTA
-from .channel_mapper import ChannelMapper
-
-__all__ = ['GlobalAveragePooling', 'PoseWarperNeck', 'FPN', 'MTA']
diff --git a/main/transformer_utils/mmpose/models/necks/channel_mapper.py b/main/transformer_utils/mmpose/models/necks/channel_mapper.py
deleted file mode 100644
index 113d170e9d55b9e2d3984c6838a86e4c659fa75c..0000000000000000000000000000000000000000
--- a/main/transformer_utils/mmpose/models/necks/channel_mapper.py
+++ /dev/null
@@ -1,76 +0,0 @@
-import torch.nn as nn
-from mmcv.cnn import ConvModule, xavier_init
-
-from ..builder import NECKS
-
-
-@NECKS.register_module()
-class ChannelMapper(nn.Module):
-    r"""Channel Mapper to reduce/increase channels of backbone features.
-
-    This is used to reduce/increase channels of backbone features.
-
-    Args:
-        in_channels (List[int]): Number of input channels per scale.
-        out_channels (int): Number of output channels (used at each scale).
-        kernel_size (int, optional): kernel_size for reducing channels (used
-            at each scale). Default: 3.
-        conv_cfg (dict, optional): Config dict for convolution layer.
-            Default: None.
-        norm_cfg (dict, optional): Config dict for normalization layer.
-            Default: None.
-        act_cfg (dict, optional): Config dict for activation layer in
-            ConvModule. Default: dict(type='ReLU').
-
-    Example:
-        >>> import torch
-        >>> in_channels = [2, 3, 5, 7]
-        >>> scales = [340, 170, 84, 43]
-        >>> inputs = [torch.rand(1, c, s, s)
-        ...           for c, s in zip(in_channels, scales)]
-        >>> self = ChannelMapper(in_channels, 11, 3).eval()
-        >>> outputs = self.forward(inputs)
-        >>> for i in range(len(outputs)):
-        ...     print(f'outputs[{i}].shape = {outputs[i].shape}')
-        outputs[0].shape = torch.Size([1, 11, 340, 340])
-        outputs[1].shape = torch.Size([1, 11, 170, 170])
-        outputs[2].shape = torch.Size([1, 11, 84, 84])
-        outputs[3].shape = torch.Size([1, 11, 43, 43])
-    """
-
-    def __init__(self,
-                 in_channels,
-                 out_channels,
-                 kernel_size=3,
-                 conv_cfg=None,
-                 norm_cfg=None,
-                 act_cfg=dict(type='ReLU')):
-        super(ChannelMapper, self).__init__()
-        assert isinstance(in_channels, list)
-
-        self.convs = nn.ModuleList()
-        for in_channel in in_channels:
-            self.convs.append(
-                ConvModule(
-                    in_channel,
-                    out_channels,
-                    kernel_size,
-                    padding=(kernel_size - 1) // 2,
-                    conv_cfg=conv_cfg,
-                    norm_cfg=norm_cfg,
-                    act_cfg=act_cfg))
-
-    # default init_weights for conv(msra) and norm in ConvModule
-    def init_weights(self):
-        """Initialize the weights of ChannelMapper module."""
-        for m in self.modules():
-            if isinstance(m, nn.Conv2d):
-                xavier_init(m, distribution='uniform')
-
-    def forward(self, inputs):
-        """Forward function."""
-
-
-        assert len(inputs) == len(self.convs)
-        outs = [self.convs[i](inputs[i]) for i in range(len(inputs))]
-        return tuple(outs)
diff --git a/main/transformer_utils/mmpose/models/necks/fpn.py b/main/transformer_utils/mmpose/models/necks/fpn.py
deleted file mode 100644
index 795a8af0b6904153a9b4e1a41d7b803381874162..0000000000000000000000000000000000000000
--- a/main/transformer_utils/mmpose/models/necks/fpn.py
+++ /dev/null
@@ -1,207 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-import torch.nn as nn
-import torch.nn.functional as F
-from mmcv.cnn import ConvModule, xavier_init
-from mmcv.runner import auto_fp16
-
-from ..builder import NECKS
-
-
-@NECKS.register_module()
-class FPN(nn.Module):
-    r"""Feature Pyramid Network.
-
-    This is an implementation of paper `Feature Pyramid Networks for Object
-    Detection `_.
-
-    Args:
-        in_channels (list[int]): Number of input channels per scale.
-        out_channels (int): Number of output channels (used at each scale).
-        num_outs (int): Number of output scales.
-        start_level (int): Index of the start input backbone level used to
-            build the feature pyramid. Default: 0.
-        end_level (int): Index of the end input backbone level (exclusive) to
-            build the feature pyramid. Default: -1, which means the last level.
-        add_extra_convs (bool | str): If bool, it decides whether to add conv
-            layers on top of the original feature maps. Default to False.
-            If True, it is equivalent to `add_extra_convs='on_input'`.
-            If str, it specifies the source feature map of the extra convs.
-            Only the following options are allowed
-
-            - 'on_input': Last feat map of neck inputs (i.e. backbone feature).
-            - 'on_lateral': Last feature map after lateral convs.
-            - 'on_output': The last output feature map after fpn convs.
-        relu_before_extra_convs (bool): Whether to apply relu before the extra
-            conv. Default: False.
-        no_norm_on_lateral (bool): Whether to apply norm on lateral.
-            Default: False.
-        conv_cfg (dict): Config dict for convolution layer. Default: None.
-        norm_cfg (dict): Config dict for normalization layer. Default: None.
-        act_cfg (dict): Config dict for activation layer in ConvModule.
-            Default: None.
-        upsample_cfg (dict): Config dict for interpolate layer.
-            Default: dict(mode='nearest').
-
-    Example:
-        >>> import torch
-        >>> in_channels = [2, 3, 5, 7]
-        >>> scales = [340, 170, 84, 43]
-        >>> inputs = [torch.rand(1, c, s, s)
-        ...           for c, s in zip(in_channels, scales)]
-        >>> self = FPN(in_channels, 11, len(in_channels)).eval()
-        >>> outputs = self.forward(inputs)
-        >>> for i in range(len(outputs)):
-        ...     print(f'outputs[{i}].shape = {outputs[i].shape}')
-        outputs[0].shape = torch.Size([1, 11, 340, 340])
-        outputs[1].shape = torch.Size([1, 11, 170, 170])
-        outputs[2].shape = torch.Size([1, 11, 84, 84])
-        outputs[3].shape = torch.Size([1, 11, 43, 43])
-    """
-
-    def __init__(self,
-                 in_channels,
-                 out_channels,
-                 num_outs,
-                 start_level=0,
-                 end_level=-1,
-                 add_extra_convs=False,
-                 relu_before_extra_convs=False,
-                 no_norm_on_lateral=False,
-                 conv_cfg=None,
-                 norm_cfg=None,
-                 act_cfg=None,
-                 upsample_cfg=dict(mode='nearest')):
-        super().__init__()
-        assert isinstance(in_channels, list)
-        self.in_channels = in_channels
-        self.out_channels = out_channels
-        self.num_ins = len(in_channels)
-        self.num_outs = num_outs
-        self.relu_before_extra_convs = relu_before_extra_convs
-        self.no_norm_on_lateral = no_norm_on_lateral
-        self.fp16_enabled = False
-        self.upsample_cfg = upsample_cfg.copy()
-
-        if end_level == -1 or end_level == self.num_ins - 1:
-            self.backbone_end_level = self.num_ins
-            assert num_outs >= self.num_ins - start_level
-        else:
-            # if end_level is not the last level, no extra level is allowed
-            self.backbone_end_level = end_level + 1
-            assert end_level < self.num_ins
-            assert num_outs == end_level - start_level + 1
-        self.start_level = start_level
-        self.end_level = end_level
-        self.add_extra_convs = add_extra_convs
-        assert isinstance(add_extra_convs, (str, bool))
-        if isinstance(add_extra_convs, str):
-            # Extra_convs_source choices: 'on_input', 'on_lateral', 'on_output'
-            assert add_extra_convs in ('on_input', 'on_lateral', 'on_output')
-        elif add_extra_convs:  # True
-            self.add_extra_convs = 'on_input'
-
-        self.lateral_convs = nn.ModuleList()
-        self.fpn_convs = nn.ModuleList()
-
-        for i in range(self.start_level, self.backbone_end_level):
-            l_conv = ConvModule(
-                in_channels[i],
-                out_channels,
-                1,
-                conv_cfg=conv_cfg,
-                norm_cfg=norm_cfg if not self.no_norm_on_lateral else None,
-                act_cfg=act_cfg,
-                inplace=False)
-            fpn_conv = ConvModule(
-                out_channels,
-                out_channels,
-                3,
-                padding=1,
-                conv_cfg=conv_cfg,
-                norm_cfg=norm_cfg,
-                act_cfg=act_cfg,
-                inplace=False)
-
-            self.lateral_convs.append(l_conv)
-            self.fpn_convs.append(fpn_conv)
-
-        # add extra conv layers (e.g., RetinaNet)
-        extra_levels = num_outs - self.backbone_end_level + self.start_level
-        if self.add_extra_convs and extra_levels >= 1:
-            for i in range(extra_levels):
-                if i == 0 and self.add_extra_convs == 'on_input':
-                    in_channels = self.in_channels[self.backbone_end_level - 1]
-                else:
-                    in_channels = out_channels
-                extra_fpn_conv = ConvModule(
-                    in_channels,
-                    out_channels,
-                    3,
-                    stride=2,
-                    padding=1,
-                    conv_cfg=conv_cfg,
-                    norm_cfg=norm_cfg,
-                    act_cfg=act_cfg,
-                    inplace=False)
-                self.fpn_convs.append(extra_fpn_conv)
-
-    def init_weights(self):
-        """Initialize model weights."""
-        for m in self.modules():
-            if isinstance(m, nn.Conv2d):
-                xavier_init(m, distribution='uniform')
-
-    @auto_fp16()
-    def forward(self, inputs):
-        """Forward function."""
-        assert len(inputs) == len(self.in_channels)
-
-        # build laterals
-        laterals = [
-            lateral_conv(inputs[i + self.start_level])
-            for i, lateral_conv in enumerate(self.lateral_convs)
-        ]
-
-        # build top-down path
-        used_backbone_levels = len(laterals)
-        for i in range(used_backbone_levels - 1, 0, -1):
-            # In some cases, fixing `scale factor` (e.g. 2) is preferred, but
-            # it cannot co-exist with `size` in `F.interpolate`.
-            if 'scale_factor' in self.upsample_cfg:
-                # fix runtime error of "+=" inplace operation in PyTorch 1.10
-                laterals[i - 1] = laterals[i - 1] + F.interpolate(
-                    laterals[i], **self.upsample_cfg)
-            else:
-                prev_shape = laterals[i - 1].shape[2:]
-                laterals[i - 1] = laterals[i - 1] + F.interpolate(
-                    laterals[i], size=prev_shape, **self.upsample_cfg)
-
-        # build outputs
-        # part 1: from original levels
-        outs = [
-            self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels)
-        ]
-        # part 2: add extra levels
-        if self.num_outs > len(outs):
-            # use max pool to get more levels on top of outputs
-            # (e.g., Faster R-CNN, Mask R-CNN)
-            if not self.add_extra_convs:
-                for i in range(self.num_outs - used_backbone_levels):
-                    outs.append(F.max_pool2d(outs[-1], 1, stride=2))
-            # add conv layers on top of original feature maps (RetinaNet)
-            else:
-                if self.add_extra_convs == 'on_input':
-                    extra_source = inputs[self.backbone_end_level - 1]
-                elif self.add_extra_convs == 'on_lateral':
-                    extra_source = laterals[-1]
-                elif self.add_extra_convs == 'on_output':
-                    extra_source = outs[-1]
-                else:
-                    raise NotImplementedError
-                outs.append(self.fpn_convs[used_backbone_levels](extra_source))
-                for i in range(used_backbone_levels + 1, self.num_outs):
-                    if self.relu_before_extra_convs:
-                        outs.append(self.fpn_convs[i](F.relu(outs[-1])))
-                    else:
-                        outs.append(self.fpn_convs[i](outs[-1]))
-        return outs
diff --git a/main/transformer_utils/mmpose/models/necks/gap_neck.py b/main/transformer_utils/mmpose/models/necks/gap_neck.py
deleted file mode 100644
index 5e6ad68ec11110daaad3a66e09d67efb355c4b93..0000000000000000000000000000000000000000
--- a/main/transformer_utils/mmpose/models/necks/gap_neck.py
+++ /dev/null
@@ -1,37 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-import torch
-import torch.nn as nn
-
-from ..builder import NECKS
-
-
-@NECKS.register_module()
-class GlobalAveragePooling(nn.Module):
-    """Global Average Pooling neck.
-
-    Note that we use `view` to remove extra channel after pooling. We do not
-    use `squeeze` as it will also remove the batch dimension when the tensor
-    has a batch dimension of size 1, which can lead to unexpected errors.
-    """
-
-    def __init__(self):
-        super().__init__()
-        self.gap = nn.AdaptiveAvgPool2d((1, 1))
-
-    def init_weights(self):
-        pass
-
-    def forward(self, inputs):
-        if isinstance(inputs, tuple):
-            outs = tuple([self.gap(x) for x in inputs])
-            outs = tuple(
-                [out.view(x.size(0), -1) for out, x in zip(outs, inputs)])
-        elif isinstance(inputs, list):
-            outs = [self.gap(x) for x in inputs]
-            outs = [out.view(x.size(0), -1) for out, x in zip(outs, inputs)]
-        elif isinstance(inputs, torch.Tensor):
-            outs = self.gap(inputs)
-            outs = outs.view(inputs.size(0), -1)
-        else:
-            raise TypeError('neck inputs should be tuple or torch.tensor')
-        return outs
diff --git a/main/transformer_utils/mmpose/models/necks/posewarper_neck.py b/main/transformer_utils/mmpose/models/necks/posewarper_neck.py
deleted file mode 100644
index dd4ddfbf8984857a6110f19b0a7d703b53f1c433..0000000000000000000000000000000000000000
--- a/main/transformer_utils/mmpose/models/necks/posewarper_neck.py
+++ /dev/null
@@ -1,329 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-import mmcv
-import torch
-import torch.nn as nn
-from mmcv.cnn import (build_conv_layer, build_norm_layer, constant_init,
-                      normal_init)
-from mmcv.utils import digit_version
-from torch.nn.modules.batchnorm import _BatchNorm
-
-from mmpose.models.utils.ops import resize
-from ..backbones.resnet import BasicBlock, Bottleneck
-from ..builder import NECKS
-
-try:
-    from mmcv.ops import DeformConv2d
-    has_mmcv_full = True
-except (ImportError, ModuleNotFoundError):
-    has_mmcv_full = False
-
-
-@NECKS.register_module()
-class PoseWarperNeck(nn.Module):
-    """PoseWarper neck.
-
-    `"Learning temporal pose estimation from sparsely-labeled videos"
-    `_.
-
-    Args:
-        in_channels (int): Number of input channels from backbone
-        out_channels (int): Number of output channels
-        inner_channels (int): Number of intermediate channels of the res block
-        deform_groups (int): Number of groups in the deformable conv
-        dilations (list|tuple): different dilations of the offset conv layers
-        trans_conv_kernel (int): the kernel of the trans conv layer, which is
-            used to get heatmap from the output of backbone. Default: 1
-        res_blocks_cfg (dict|None): config of residual blocks. If None,
-            use the default values. If not None, it should contain the
-            following keys:
-
-            - block (str): the type of residual block, Default: 'BASIC'.
-            - num_blocks (int): the number of blocks, Default: 20.
-
-        offsets_kernel (int): the kernel of offset conv layer.
-        deform_conv_kernel (int): the kernel of defomrable conv layer.
-        in_index (int|Sequence[int]): Input feature index. Default: 0
-        input_transform (str|None): Transformation type of input features.
-            Options: 'resize_concat', 'multiple_select', None.
-            Default: None.
-
-            - 'resize_concat': Multiple feature maps will be resize to \
-                the same size as first one and than concat together. \
-                Usually used in FCN head of HRNet.
-            - 'multiple_select': Multiple feature maps will be bundle into \
-                a list and passed into decode head.
-            - None: Only one select feature map is allowed.
-
-        freeze_trans_layer (bool): Whether to freeze the transition layer
-            (stop grad and set eval mode). Default: True.
-        norm_eval (bool): Whether to set norm layers to eval mode, namely,
-            freeze running stats (mean and var). Note: Effect on Batch Norm
-            and its variants only. Default: False.
-        im2col_step (int): the argument `im2col_step` in deformable conv,
-            Default: 80.
- """ - blocks_dict = {'BASIC': BasicBlock, 'BOTTLENECK': Bottleneck} - minimum_mmcv_version = '1.3.17' - - def __init__(self, - in_channels, - out_channels, - inner_channels, - deform_groups=17, - dilations=(3, 6, 12, 18, 24), - trans_conv_kernel=1, - res_blocks_cfg=None, - offsets_kernel=3, - deform_conv_kernel=3, - in_index=0, - input_transform=None, - freeze_trans_layer=True, - norm_eval=False, - im2col_step=80): - super().__init__() - self.in_channels = in_channels - self.out_channels = out_channels - self.inner_channels = inner_channels - self.deform_groups = deform_groups - self.dilations = dilations - self.trans_conv_kernel = trans_conv_kernel - self.res_blocks_cfg = res_blocks_cfg - self.offsets_kernel = offsets_kernel - self.deform_conv_kernel = deform_conv_kernel - self.in_index = in_index - self.input_transform = input_transform - self.freeze_trans_layer = freeze_trans_layer - self.norm_eval = norm_eval - self.im2col_step = im2col_step - - identity_trans_layer = False - - assert trans_conv_kernel in [0, 1, 3] - kernel_size = trans_conv_kernel - if kernel_size == 3: - padding = 1 - elif kernel_size == 1: - padding = 0 - else: - # 0 for Identity mapping. - identity_trans_layer = True - - if identity_trans_layer: - self.trans_layer = nn.Identity() - else: - self.trans_layer = build_conv_layer( - cfg=dict(type='Conv2d'), - in_channels=in_channels, - out_channels=out_channels, - kernel_size=kernel_size, - stride=1, - padding=padding) - - # build chain of residual blocks - if res_blocks_cfg is not None and not isinstance(res_blocks_cfg, dict): - raise TypeError('res_blocks_cfg should be dict or None.') - - if res_blocks_cfg is None: - block_type = 'BASIC' - num_blocks = 20 - else: - block_type = res_blocks_cfg.get('block', 'BASIC') - num_blocks = res_blocks_cfg.get('num_blocks', 20) - - block = self.blocks_dict[block_type] - - res_layers = [] - downsample = nn.Sequential( - build_conv_layer( - cfg=dict(type='Conv2d'), - in_channels=out_channels, - out_channels=inner_channels, - kernel_size=1, - stride=1, - bias=False), - build_norm_layer(dict(type='BN'), inner_channels)[1]) - res_layers.append( - block( - in_channels=out_channels, - out_channels=inner_channels, - downsample=downsample)) - - for _ in range(1, num_blocks): - res_layers.append(block(inner_channels, inner_channels)) - self.offset_feats = nn.Sequential(*res_layers) - - # build offset layers - self.num_offset_layers = len(dilations) - assert self.num_offset_layers > 0, 'Number of offset layers ' \ - 'should be larger than 0.' - - target_offset_channels = 2 * offsets_kernel**2 * deform_groups - - offset_layers = [ - build_conv_layer( - cfg=dict(type='Conv2d'), - in_channels=inner_channels, - out_channels=target_offset_channels, - kernel_size=offsets_kernel, - stride=1, - dilation=dilations[i], - padding=dilations[i], - bias=False, - ) for i in range(self.num_offset_layers) - ] - self.offset_layers = nn.ModuleList(offset_layers) - - # build deformable conv layers - assert digit_version(mmcv.__version__) >= \ - digit_version(self.minimum_mmcv_version), \ - f'Current MMCV version: {mmcv.__version__}, ' \ - f'but MMCV >= {self.minimum_mmcv_version} is required, see ' \ - f'https://github.com/open-mmlab/mmcv/issues/1440, ' \ - f'Please install the latest MMCV.' 
-
-        if has_mmcv_full:
-            deform_conv_layers = [
-                DeformConv2d(
-                    in_channels=out_channels,
-                    out_channels=out_channels,
-                    kernel_size=deform_conv_kernel,
-                    stride=1,
-                    padding=int(deform_conv_kernel / 2) * dilations[i],
-                    dilation=dilations[i],
-                    deform_groups=deform_groups,
-                    im2col_step=self.im2col_step,
-                ) for i in range(self.num_offset_layers)
-            ]
-        else:
-            raise ImportError('Please install the full version of mmcv '
-                              'to use `DeformConv2d`.')
-
-        self.deform_conv_layers = nn.ModuleList(deform_conv_layers)
-
-        self.freeze_layers()
-
-    def freeze_layers(self):
-        if self.freeze_trans_layer:
-            self.trans_layer.eval()
-
-            for param in self.trans_layer.parameters():
-                param.requires_grad = False
-
-    def init_weights(self):
-        for m in self.modules():
-            if isinstance(m, nn.Conv2d):
-                normal_init(m, std=0.001)
-            elif isinstance(m, (_BatchNorm, nn.GroupNorm)):
-                constant_init(m, 1)
-            elif isinstance(m, DeformConv2d):
-                filler = torch.zeros([
-                    m.weight.size(0),
-                    m.weight.size(1),
-                    m.weight.size(2),
-                    m.weight.size(3)
-                ],
-                                     dtype=torch.float32,
-                                     device=m.weight.device)
-                for k in range(m.weight.size(0)):
-                    filler[k, k,
-                           int(m.weight.size(2) / 2),
-                           int(m.weight.size(3) / 2)] = 1.0
-                m.weight = torch.nn.Parameter(filler)
-                m.weight.requires_grad = True
-
-        # posewarper offset layer weight initialization
-        for m in self.offset_layers.modules():
-            constant_init(m, 0)
-
-    def _transform_inputs(self, inputs):
-        """Transform inputs for decoder.
-
-        Args:
-            inputs (list[Tensor] | Tensor): multi-level img features.
-
-        Returns:
-            Tensor: The transformed inputs
-        """
-        if not isinstance(inputs, list):
-            return inputs
-
-        if self.input_transform == 'resize_concat':
-            inputs = [inputs[i] for i in self.in_index]
-            upsampled_inputs = [
-                resize(
-                    input=x,
-                    size=inputs[0].shape[2:],
-                    mode='bilinear',
-                    align_corners=self.align_corners) for x in inputs
-            ]
-            inputs = torch.cat(upsampled_inputs, dim=1)
-        elif self.input_transform == 'multiple_select':
-            inputs = [inputs[i] for i in self.in_index]
-        else:
-            inputs = inputs[self.in_index]
-
-        return inputs
-
-    def forward(self, inputs, frame_weight):
-        assert isinstance(inputs, (list, tuple)), 'PoseWarperNeck inputs ' \
-            'should be list or tuple, even though the length is 1, ' \
-            'for unified processing.'
-
-        output_heatmap = 0
-        if len(inputs) > 1:
-            inputs = [self._transform_inputs(input) for input in inputs]
-            inputs = [self.trans_layer(input) for input in inputs]
-
-            # calculate difference features
-            diff_features = [
-                self.offset_feats(inputs[0] - input) for input in inputs
-            ]
-
-            for i in range(len(inputs)):
-                if frame_weight[i] == 0:
-                    continue
-                warped_heatmap = 0
-                for j in range(self.num_offset_layers):
-                    offset = (self.offset_layers[j](diff_features[i]))
-                    warped_heatmap_tmp = self.deform_conv_layers[j](inputs[i],
-                                                                    offset)
-                    warped_heatmap += warped_heatmap_tmp / \
-                        self.num_offset_layers
-
-                output_heatmap += warped_heatmap * frame_weight[i]
-
-        else:
-            inputs = inputs[0]
-            inputs = self._transform_inputs(inputs)
-            inputs = self.trans_layer(inputs)
-
-            num_frames = len(frame_weight)
-            batch_size = inputs.size(0) // num_frames
-            ref_x = inputs[:batch_size]
-            ref_x_tiled = ref_x.repeat(num_frames, 1, 1, 1)
-
-            offset_features = self.offset_feats(ref_x_tiled - inputs)
-
-            warped_heatmap = 0
-            for j in range(self.num_offset_layers):
-                offset = self.offset_layers[j](offset_features)
-
-                warped_heatmap_tmp = self.deform_conv_layers[j](inputs, offset)
-                warped_heatmap += warped_heatmap_tmp / self.num_offset_layers
-
-            for i in range(num_frames):
-                if frame_weight[i] == 0:
-                    continue
-                output_heatmap += warped_heatmap[i * batch_size:(i + 1) *
-                                                 batch_size] * frame_weight[i]
-
-        return output_heatmap
-
-    def train(self, mode=True):
-        """Convert the model into training mode."""
-        super().train(mode)
-        self.freeze_layers()
-        if mode and self.norm_eval:
-            for m in self.modules():
-                if isinstance(m, _BatchNorm):
-                    m.eval()
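PoseWarperNeck (deleted above) predicts per-pixel offsets and feeds them to mmcv's DeformConv2d. A minimal sketch of that call pattern, assuming an mmcv build with CUDA ops compiled; the offset tensor carries 2 * deform_groups * kernel_h * kernel_w channels, and all-zero offsets reduce to a plain convolution:

import torch
from mmcv.ops import DeformConv2d

x = torch.randn(1, 17, 8, 8, device='cuda')
conv = DeformConv2d(17, 17, kernel_size=3, padding=1, deform_groups=1).cuda()
offset = torch.zeros(1, 2 * 1 * 3 * 3, 8, 8, device='cuda')  # zero offsets
out = conv(x, offset)
assert out.shape == (1, 17, 8, 8)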
diff --git a/main/transformer_utils/mmpose/models/necks/tcformer_mta_neck.py b/main/transformer_utils/mmpose/models/necks/tcformer_mta_neck.py
deleted file mode 100644
index 6723fb018e7799c1c0104868b1ca87c56cd28351..0000000000000000000000000000000000000000
--- a/main/transformer_utils/mmpose/models/necks/tcformer_mta_neck.py
+++ /dev/null
@@ -1,224 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-import math
-
-import torch.nn as nn
-import torch.nn.functional as F
-from mmcv.cnn import ConvModule, constant_init, normal_init, trunc_normal_init
-from mmcv.runner import BaseModule
-
-from ..builder import NECKS
-from ..utils import TCFormerDynamicBlock, token2map, token_interp
-
-
-@NECKS.register_module()
-class MTA(BaseModule):
-    """Multi-stage Token feature Aggregation (MTA) module in TCFormer.
-
-    Args:
-        in_channels (list[int]): Number of input channels per stage.
-            Default: [64, 128, 256, 512].
-        out_channels (int): Number of output channels (used at each scale).
-        num_outs (int): Number of output scales. Default: 4.
-        start_level (int): Index of the start input backbone level used to
-            build the feature pyramid. Default: 0.
-        end_level (int): Index of the end input backbone level (exclusive) to
-            build the feature pyramid. Default: -1, which means the last level.
-        add_extra_convs (bool | str): If bool, it decides whether to add conv
-            layers on top of the original feature maps. Default to False.
-            If True, it is equivalent to `add_extra_convs='on_input'`.
-            If str, it specifies the source feature map of the extra convs.
-            Only the following options are allowed
-
-            - 'on_input': Last feat map of neck inputs (i.e. backbone feature).
-            - 'on_output': The last output feature map after fpn convs.
-        relu_before_extra_convs (bool): Whether to apply relu before the extra
-            conv. Default: False.
-        no_norm_on_lateral (bool): Whether to apply norm on lateral.
-            Default: False.
-        conv_cfg (dict): Config dict for convolution layer. Default: None.
-        norm_cfg (dict): Config dict for normalization layer. Default: None.
-        act_cfg (dict): Config dict for activation layer in ConvModule.
-        num_heads (Sequence[int]): The attention heads of each transformer
-            block. Default: [2, 2, 2, 2].
-        mlp_ratios (Sequence[int]): The ratio of the mlp hidden dim to the
-            embedding dim of each transformer block.
-        sr_ratios (Sequence[int]): The spatial reduction rate of each
-            transformer block. Default: [8, 4, 2, 1].
-        qkv_bias (bool): Enable bias for qkv if True. Default: True.
-        qk_scale (float | None, optional): Override default qk scale of
-            head_dim ** -0.5 if set. Default: None.
-        drop_rate (float): Probability of an element to be zeroed.
-            Default 0.0.
-        attn_drop_rate (float): The drop out rate for attention layer.
-            Default 0.0.
-        drop_path_rate (float): stochastic depth rate. Default 0.
-        transformer_norm_cfg (dict): Config dict for normalization layer
-            in transformer blocks. Default: dict(type='LN').
-        use_sr_conv (bool): If True, use a conv layer for spatial reduction.
-            If False, use a pooling process for spatial reduction. Defaults:
-            False.
-    """
-
-    def __init__(
-            self,
-            in_channels=[64, 128, 256, 512],
-            out_channels=128,
-            num_outs=4,
-            start_level=0,
-            end_level=-1,
-            add_extra_convs=False,
-            relu_before_extra_convs=False,
-            no_norm_on_lateral=False,
-            conv_cfg=None,
-            norm_cfg=None,
-            act_cfg=None,
-            num_heads=[2, 2, 2, 2],
-            mlp_ratios=[4, 4, 4, 4],
-            sr_ratios=[8, 4, 2, 1],
-            qkv_bias=True,
-            qk_scale=None,
-            drop_rate=0.,
-            attn_drop_rate=0.,
-            drop_path_rate=0.,
-            transformer_norm_cfg=dict(type='LN'),
-            use_sr_conv=False,
-    ):
-        super().__init__()
-        assert isinstance(in_channels, list)
-        self.in_channels = in_channels
-        self.out_channels = out_channels
-        self.num_ins = len(in_channels)
-        self.num_outs = num_outs
-        self.no_norm_on_lateral = no_norm_on_lateral
-        self.fp16_enabled = False
-        self.norm_cfg = norm_cfg
-        self.conv_cfg = conv_cfg
-        self.act_cfg = act_cfg
-        self.mlp_ratios = mlp_ratios
-
-        if end_level == -1 or end_level == self.num_ins - 1:
-            self.backbone_end_level = self.num_ins
-            assert num_outs >= self.num_ins - start_level
-        else:
-            # if end_level is not the last level, no extra level is allowed
-            self.backbone_end_level = end_level + 1
-            assert end_level < self.num_ins
-            assert num_outs == end_level - start_level + 1
-        self.start_level = start_level
-        self.end_level = end_level
-
-        self.lateral_convs = nn.ModuleList()
-        self.merge_blocks = nn.ModuleList()
-
-        for i in range(self.start_level, self.backbone_end_level):
-            l_conv = ConvModule(
-                in_channels[i],
-                out_channels,
-                1,
-                conv_cfg=conv_cfg,
-                norm_cfg=norm_cfg if not self.no_norm_on_lateral else None,
-                act_cfg=act_cfg,
-                inplace=False)
-            self.lateral_convs.append(l_conv)
-
-        for i in range(self.start_level, self.backbone_end_level - 1):
-            merge_block = TCFormerDynamicBlock(
-                dim=out_channels,
-                num_heads=num_heads[i],
-                mlp_ratio=mlp_ratios[i],
-                qkv_bias=qkv_bias,
-                qk_scale=qk_scale,
-                drop=drop_rate,
-                attn_drop=attn_drop_rate,
-                drop_path=drop_path_rate,
-                norm_cfg=transformer_norm_cfg,
-                sr_ratio=sr_ratios[i],
-                use_sr_conv=use_sr_conv)
-            self.merge_blocks.append(merge_block)
-
-        # add extra conv layers (e.g., RetinaNet)
-        self.relu_before_extra_convs = relu_before_extra_convs
-
-        self.add_extra_convs = add_extra_convs
-        assert isinstance(add_extra_convs, (str, bool))
-        if isinstance(add_extra_convs, str):
-            # Extra_convs_source choices: 'on_input', 'on_output'
-            assert add_extra_convs in ('on_input', 'on_output')
-        elif add_extra_convs:  # True
-            self.add_extra_convs = 'on_input'
-
-        self.extra_convs = nn.ModuleList()
-        extra_levels = num_outs - (self.end_level + 1 - self.start_level)
-        if self.add_extra_convs and extra_levels >= 1:
-            for i in range(extra_levels):
-                if i == 0 and self.add_extra_convs == 'on_input':
-                    in_channels = self.in_channels[self.end_level]
-                else:
-                    in_channels = out_channels
-                extra_fpn_conv = ConvModule(
-                    in_channels,
-                    out_channels,
-                    3,
-                    stride=2,
-                    padding=1,
-                    conv_cfg=conv_cfg,
-                    norm_cfg=norm_cfg,
-                    act_cfg=act_cfg,
-                    inplace=False)
-                self.extra_convs.append(extra_fpn_conv)
-
-    def init_weights(self):
-        for m in self.modules():
-            if isinstance(m, nn.Linear):
-                trunc_normal_init(m, std=.02, bias=0.)
-            elif isinstance(m, nn.LayerNorm):
-                constant_init(m, 1.0)
-            elif isinstance(m, nn.Conv2d):
-                fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
-                fan_out //= m.groups
-                normal_init(m, 0, math.sqrt(2.0 / fan_out))
-
-    def forward(self, inputs):
-        """Forward function."""
-        assert len(inputs) == len(self.in_channels)
-
-        # build lateral tokens
-        input_dicts = []
-        for i, lateral_conv in enumerate(self.lateral_convs):
-            tmp = inputs[i + self.start_level].copy()
-            tmp['x'] = lateral_conv(tmp['x'].unsqueeze(2).permute(
-                0, 3, 1, 2)).permute(0, 2, 3, 1).squeeze(2)
-            input_dicts.append(tmp)
-
-        # merge from high level to low level
-        for i in range(len(input_dicts) - 2, -1, -1):
-            input_dicts[i]['x'] = input_dicts[i]['x'] + token_interp(
-                input_dicts[i], input_dicts[i + 1])
-            input_dicts[i] = self.merge_blocks[i](input_dicts[i])
-
-        # transform to feature map
-        outs = [token2map(token_dict) for token_dict in input_dicts]
-
-        # part 2: add extra levels
-        used_backbone_levels = len(outs)
-        if self.num_outs > len(outs):
-            # use max pool to get more levels on top of outputs
-            if not self.add_extra_convs:
-                for i in range(self.num_outs - used_backbone_levels):
-                    outs.append(F.max_pool2d(outs[-1], 1, stride=2))
-            # add conv layers on top of original feature maps
-            else:
-                if self.add_extra_convs == 'on_input':
-                    tmp = inputs[self.backbone_end_level - 1]
-                    extra_source = token2map(tmp)
-                elif self.add_extra_convs == 'on_output':
-                    extra_source = outs[-1]
-                else:
-                    raise NotImplementedError
-
-                outs.append(self.extra_convs[0](extra_source))
-                for i in range(1, self.num_outs - used_backbone_levels):
-                    if self.relu_before_extra_convs:
-                        outs.append(self.extra_convs[i](F.relu(outs[-1])))
-                    else:
-                        outs.append(self.extra_convs[i](outs[-1]))
-        return outs
diff --git a/main/transformer_utils/mmpose/models/utils/positional_encoding.py b/main/transformer_utils/mmpose/models/utils/positional_encoding.py
index 3c7e6bab9f5b3a1a71895f068bcbee47a891de68..3ceb81ac078894f747d97a2ba6d78199addab3e5 100644
--- a/main/transformer_utils/mmpose/models/utils/positional_encoding.py
+++ b/main/transformer_utils/mmpose/models/utils/positional_encoding.py
@@ -2,11 +2,11 @@ import math
 
 import torch
 import torch.nn as nn
-from mmcv.cnn.bricks.transformer import POSITIONAL_ENCODING
-from mmcv.runner import BaseModule
+# from mmcv.cnn.bricks.transformer import POSITIONAL_ENCODING
+from mmengine.model import BaseModule
 
 
-@POSITIONAL_ENCODING.register_module(force=True)
+# @POSITIONAL_ENCODING.register_module(force=True)
 class SinePositionalEncoding(BaseModule):
     """Position encoding with sine and cosine functions.
     See `End-to-End Object Detection with Transformers
@@ -98,7 +98,7 @@ class SinePositionalEncoding(BaseModule):
         return repr_str
 
 
-@POSITIONAL_ENCODING.register_module(force=True)
+# @POSITIONAL_ENCODING.register_module(force=True)
 class LearnedPositionalEncoding(BaseModule):
     """Position embedding with learnable embedding weights.
 
     Args:
diff --git a/main/transformer_utils/mmpose/models/utils/tcformer_utils.py b/main/transformer_utils/mmpose/models/utils/tcformer_utils.py
index 8d3a28534c83d60e52ed0382f54a4d9f4902e018..85fae8cd46f44b536ab396df669f041fe1f18094 100644
--- a/main/transformer_utils/mmpose/models/utils/tcformer_utils.py
+++ b/main/transformer_utils/mmpose/models/utils/tcformer_utils.py
@@ -4,7 +4,8 @@ import math
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from mmcv.cnn import build_norm_layer, trunc_normal_init
+from mmcv.cnn import build_norm_layer
+from mmengine.model import trunc_normal_init
 from mmcv.cnn.bricks.transformer import build_dropout
 
 try:
diff --git a/main/transformer_utils/mmpose/models/utils/transformer.py b/main/transformer_utils/mmpose/models/utils/transformer.py
index 42205707347e57c433e66eda728cc82a6df7455a..f256c4c8ffade75f020686a369d35157ba6d6b5c 100644
--- a/main/transformer_utils/mmpose/models/utils/transformer.py
+++ b/main/transformer_utils/mmpose/models/utils/transformer.py
@@ -6,30 +6,32 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from mmcv.cnn import build_conv_layer, build_norm_layer
-from mmcv.runner.base_module import BaseModule
-from mmcv.utils import to_2tuple
+from mmengine.model import BaseModule, ModuleList
+from mmengine.utils import digit_version, to_2tuple
 
 from mmpose.models.builder import TRANSFORMER
 from easydict import EasyDict
 from einops import rearrange, repeat
-from mmcv.runner import force_fp32
+from mmpose.core import force_fp32
 from mmcv.cnn.bricks.transformer import (BaseTransformerLayer,
                                          TransformerLayerSequence,
                                          build_transformer_layer_sequence)
-from mmcv.cnn.bricks.registry import (TRANSFORMER_LAYER,
-                                      TRANSFORMER_LAYER_SEQUENCE)
+# from mmcv.cnn.bricks.registry import (TRANSFORMER_LAYER,
+#                                       TRANSFORMER_LAYER_SEQUENCE)
 import torch.distributions as distributions
 from mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention
 from torch.nn.init import normal_
 import copy
 import warnings
-from mmcv.cnn import build_activation_layer, build_norm_layer, xavier_init
-
+from mmcv.cnn import build_activation_layer, build_norm_layer
+from mmengine.model import xavier_init
 from utils.human_models import smpl_x
 from config import cfg
-
+from mmengine import Registry
+TRANSFORMER_LAYER = Registry('transformerLayer')
+TRANSFORMER_LAYER_SEQUENCE = Registry('transformer-layers sequence')
 
 def point_sample(input, point_coords, **kwargs):
     """
     A wrapper around :function:`torch.nn.functional.grid_sample` to support 3D
     point_coords tensors.
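mmcv 2.x removed mmcv.cnn.bricks.registry, so transformer.py above recreates TRANSFORMER_LAYER and TRANSFORMER_LAYER_SEQUENCE as private mmengine registries. A sketch of how such a stand-in registry behaves; `MyDecoderLayer` is hypothetical, and note that classes registered this way are visible only to these private registries, not to mmcv's own build_transformer_layer_sequence:

from mmengine import Registry

TRANSFORMER_LAYER = Registry('transformerLayer')

@TRANSFORMER_LAYER.register_module()
class MyDecoderLayer:
    def __init__(self, embed_dims=256):
        self.embed_dims = embed_dims

layer = TRANSFORMER_LAYER.build(dict(type='MyDecoderLayer', embed_dims=512))
assert layer.embed_dims == 512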
diff --git a/main/transformer_utils/mmpose/ops/multi_scale_deform_attn.py b/main/transformer_utils/mmpose/ops/multi_scale_deform_attn.py
index e58ca98ebd0cef0498607270b7650a4ad1f6ec27..0d46db5038dcb24d605b1705ecc64b1dcafb6d96 100644
--- a/main/transformer_utils/mmpose/ops/multi_scale_deform_attn.py
+++ b/main/transformer_utils/mmpose/ops/multi_scale_deform_attn.py
@@ -6,13 +6,14 @@ import torch.nn as nn
 import torch.nn.functional as F
 from torch.autograd.function import Function, once_differentiable
 
-from mmcv import deprecated_api_warning
-from mmcv.cnn import constant_init, xavier_init
-from mmcv.cnn.bricks.registry import ATTENTION
-from mmcv.runner import BaseModule
+from mmengine.utils import deprecated_api_warning
+from mmengine.model import constant_init, xavier_init
+# from mmcv.cnn.bricks.registry import ATTENTION
+from mmengine.model import BaseModule
 from mmcv.utils import ext_loader
 from mmcv.ops.multi_scale_deform_attn import ext_module
-
+from mmengine import Registry
+ATTENTION = Registry('attention')
 
 class MultiScaleDeformableAttnFunction(Function):
     @staticmethod
diff --git a/main/transformer_utils/mmpose/utils/collect_env.py b/main/transformer_utils/mmpose/utils/collect_env.py
index f75c5ea73383ccef367632cf497227498ac50078..1433f0bcb1555b550e06b5e933b2755dbc56e24c 100644
--- a/main/transformer_utils/mmpose/utils/collect_env.py
+++ b/main/transformer_utils/mmpose/utils/collect_env.py
@@ -1,16 +1,16 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from mmcv.utils import collect_env as collect_basic_env
-from mmcv.utils import get_git_hash
+from mmengine.utils import get_git_hash
+from mmengine.utils.dl_utils import collect_env as collect_base_env
 
 import mmpose
 
 def collect_env():
-    env_info = collect_basic_env()
+    env_info = collect_base_env()
     env_info['MMPose'] = (mmpose.__version__ + '+' +
                           get_git_hash(digits=7))
     return env_info
 
 
 if __name__ == '__main__':
     for name, val in collect_env().items():
-        print(f'{name}: {val}')
+        print(f'{name}: {val}')
\ No newline at end of file
diff --git a/main/transformer_utils/mmpose/utils/logger.py b/main/transformer_utils/mmpose/utils/logger.py
index 294837fa6aec1e1896de8c8accf470f366f81296..2e60cc5c2d59c15adb963645137d54900d998a60 100644
--- a/main/transformer_utils/mmpose/utils/logger.py
+++ b/main/transformer_utils/mmpose/utils/logger.py
@@ -1,11 +1,11 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import logging
 
-from mmcv.utils import get_logger
+from mmengine.logging import MMLogger
 
 
 def get_root_logger(log_file=None, log_level=logging.INFO):
-    """Use `get_logger` method in mmcv to get the root logger.
+    """Use `MMLogger` class in mmengine to get the root logger.
 
     The logger will be initialized if it has not been initialized. By default a
     StreamHandler will be added. If `log_file` is specified, a FileHandler will
@@ -22,4 +22,4 @@ def get_root_logger(log_file=None, log_level=logging.INFO):
     Returns:
         logging.Logger: The root logger.
     """
-    return get_logger(__name__.split('.')[0], log_file, log_level)
+    return MMLogger('MMLogger', __name__.split('.')[0], log_file, log_level)
\ No newline at end of file
diff --git a/main/transformer_utils/mmpose/utils/setup_env.py b/main/transformer_utils/mmpose/utils/setup_env.py
index 21def2f0809153a5f755af2431f7e702db625e5c..4c862d9e67a869aa8fa3624110c4d7f3ac60fa90 100644
--- a/main/transformer_utils/mmpose/utils/setup_env.py
+++ b/main/transformer_utils/mmpose/utils/setup_env.py
@@ -45,3 +45,34 @@ def setup_multi_processes(cfg):
             f'overloaded, please further tune the variable for optimal '
             f'performance in your application as needed.')
         os.environ['MKL_NUM_THREADS'] = str(mkl_num_threads)
+
+# def register_all_modules(init_default_scope: bool = True) -> None:
+#     """Register all modules in mmpose into the registries.
+
+#     Args:
+#         init_default_scope (bool): Whether initialize the mmpose default scope.
+#             When `init_default_scope=True`, the global default scope will be
+#             set to `mmpose`, and all registries will build modules from mmpose's
+#             registry node. To understand more about the registry, please refer
+#             to https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/registry.md
+#             Defaults to True.
+#     """  # noqa
+
+#     import mmpose.models  # noqa: F401,F403
+
+#     if init_default_scope:
+#         never_created = DefaultScope.get_current_instance() is None \
+#             or not DefaultScope.check_instance_created('mmpose')
+#         if never_created:
+#             DefaultScope.get_instance('mmpose', scope_name='mmpose')
+#             return
+#         current_scope = DefaultScope.get_current_instance()
+#         if current_scope.scope_name != 'mmpose':
+#             warnings.warn('The current default scope '
+#                           f'"{current_scope.scope_name}" is not "mmpose", '
+#                           '`register_all_modules` will force the current'
+#                           'default scope to be "mmpose". If this is not '
+#                           'expected, please set `init_default_scope=False`.')
+#             # avoid name conflict
+#             new_instance_name = f'mmpose-{datetime.datetime.now()}'
+#             DefaultScope.get_instance(new_instance_name, scope_name='mmpose')
\ No newline at end of file
diff --git a/main/transformer_utils/mmpose/utils/timer.py b/main/transformer_utils/mmpose/utils/timer.py
index 5a3185c5e89ce73bd33591c22ce74fc73ef8e770..cec6aff6226e249edca2e2ac64b7cd2db8557e19 100644
--- a/main/transformer_utils/mmpose/utils/timer.py
+++ b/main/transformer_utils/mmpose/utils/timer.py
@@ -4,7 +4,7 @@ from contextlib import contextmanager
 from functools import partial
 
 import numpy as np
-from mmcv import Timer
+from mmengine import Timer
 
 
 class RunningAverage():
@@ -114,4 +114,4 @@ class StopWatch:
 
     def reset(self):
         self._record = defaultdict(list)
-        self._active_timer_stack = []
+        self._active_timer_stack = []
\ No newline at end of file
diff --git a/pre-requirements.txt b/pre-requirements.txt
index 4e1c5f6a2c1603941b6cc4be0c03ca373e6327ec..ba1cf2dec43cf1bd737527402e561cfb15db9b8c 100644
--- a/pre-requirements.txt
+++ b/pre-requirements.txt
@@ -1,6 +1,4 @@
 numpy==1.23
-
---extra-index-url https://download.pytorch.org/whl/cu118
-torch==2.0.0+cu118
-torchvision==0.15.0+cu118
-torchaudio==2.0.0+cu118
\ No newline at end of file
+torch==2.0.0
+torchvision
+torchaudio
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 34511268aa0abff3c76c7eb66397dfa2f277acc3..fb3a3646f5962e098c3521a29a4b73342157ecd8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
---extra-index-url https://download.openmmlab.com/mmcv/dist/cu118/torch2.0/index.html
-https://download.openmmlab.com/mmcv/dist/cu118/torch2.0.0/mmcv_full-1.7.2-cp39-cp39-manylinux1_x86_64.whl
+--extra-index-url https://download.openmmlab.com/mmcv/dist/cu117/torch2.0.0/index.html
+https://download.openmmlab.com/mmcv/dist/cu117/torch2.0.0/mmcv-2.1.0-cp310-cp310-manylinux1_x86_64.whl
 
 scikit-image
 scipy
@@ -28,5 +28,6 @@ pycocotools
 plyfile
 timm
 pyglet
-mmdet==2.26.0
+mmcv
+mmdet==3.2.0
 eval_type_backport
\ No newline at end of file
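Closing note: with both requirement files updated, a short sanity check confirms the mmcv 2.x stack is the one actually imported (a sketch; the expected versions follow from the pins above):

    import mmcv
    import mmdet
    import mmengine
    import torch

    print('torch   :', torch.__version__)     # 2.0.0 per pre-requirements.txt
    print('mmcv    :', mmcv.__version__)      # 2.1.0 per the pinned wheel
    print('mmdet   :', mmdet.__version__)     # 3.2.0
    print('mmengine:', mmengine.__version__)  # installed as an mmcv 2.x dependency

If the quick check passes but runtime behavior still looks off, the relocated `collect_env` from `mmengine.utils.dl_utils` (patched in `collect_env.py` above) prints a fuller environment table.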