diff --git a/app.py b/app.py index 585d97c991f7c243cbcd743a6317599248e2993d..15cf9536c53960e84faf3f1b0dfaec85773e89be 100644 --- a/app.py +++ b/app.py @@ -13,29 +13,22 @@ try: except: os.system('pip install /home/user/app/main/transformer_utils') hf_hub_download(repo_id="caizhongang/SMPLer-X", filename="smpler_x_h32.pth.tar", local_dir="/home/user/app/pretrained_models") -os.system('cp -rf /home/user/app/assets/conversions.py /home/user/.pyenv/versions/3.9.18/lib/python3.9/site-packages/torchgeometry/core/conversions.py') +os.system('cp -rf /home/user/app/assets/conversions.py /usr/local/lib/python3.10/site-packages/torchgeometry/core/conversions.py') DEFAULT_MODEL='smpler_x_h32' OUT_FOLDER = '/home/user/app/demo_out' os.makedirs(OUT_FOLDER, exist_ok=True) -# num_gpus = 1 if torch.cuda.is_available() else -1 -# print("!!!", torch.cuda.is_available()) -# print(torch.cuda.device_count()) -# print(torch.version.cuda) -# index = torch.cuda.current_device() -# print(index) -# print(torch.cuda.get_device_name(index)) +num_gpus = 1 if torch.cuda.is_available() else -1 +print("!!!", torch.cuda.is_available()) +print(torch.cuda.device_count()) +print(torch.version.cuda) +index = torch.cuda.current_device() +print(index) +print(torch.cuda.get_device_name(index)) # from main.inference import Inferer # inferer = Inferer(DEFAULT_MODEL, num_gpus, OUT_FOLDER) @spaces.GPU(enable_queue=True) def infer(video_input, in_threshold=0.5, num_people="Single person", render_mesh=False): - num_gpus = 1 if torch.cuda.is_available() else -1 - print("!!!", torch.cuda.is_available()) - print(torch.cuda.device_count()) - print(torch.version.cuda) - index = torch.cuda.current_device() - print(index) - print(torch.cuda.get_device_name(index)) from main.inference import Inferer inferer = Inferer(DEFAULT_MODEL, num_gpus, OUT_FOLDER) os.system(f'rm -rf {OUT_FOLDER}/*') diff --git a/common/base.py b/common/base.py index 21fe9d3f36f39167879aa84c248c32ec80b12927..8434f1b8e6c49988df0175649a5cfaf4c6a16269 100644 --- a/common/base.py +++ b/common/base.py @@ -17,7 +17,7 @@ import torch.utils.data.distributed from utils.distribute_utils import ( get_rank, is_main_process, time_synchronized, get_group_idx, get_process_groups ) -from mmcv.runner import get_dist_info + class Base(object): __metaclass__ = abc.ABCMeta diff --git a/common/utils/distribute_utils.py b/common/utils/distribute_utils.py index 5b1c71cd3863fe1f99370d7d072d6389f663959e..a6c928a5a3c66885e1311949a82feb181aee60a3 100644 --- a/common/utils/distribute_utils.py +++ b/common/utils/distribute_utils.py @@ -7,7 +7,7 @@ import tempfile import time import torch import torch.distributed as dist -from mmcv.runner import get_dist_info +from mmengine.dist import get_dist_info import random import numpy as np import subprocess diff --git a/main/SMPLer_X.py b/main/SMPLer_X.py index 1ca9477babbb0eee26f296b47359f7b0911d0a31..f1c71cc6d0d3d6ada4d164da94cda0784ab9aaad 100644 --- a/main/SMPLer_X.py +++ b/main/SMPLer_X.py @@ -9,7 +9,7 @@ from config import cfg import math import copy from mmpose.models import build_posenet -from mmcv import Config +from mmengine.config import Config class Model(nn.Module): def __init__(self, encoder, body_position_net, body_rotation_net, box_net, hand_position_net, hand_roi_net, diff --git a/main/config.py b/main/config.py index d9a53874b62f35492e5a50034f82420c9595eed3..dd5fd00b8e66a2ee782b3005075c5d23d032b1b9 100644 --- a/main/config.py +++ b/main/config.py @@ -2,7 +2,8 @@ import os import os.path as osp import sys import datetime -from mmcv import 
Config as MMConfig +from mmengine.config import Config as MMConfig + class Config: def get_config_fromfile(self, config_path): diff --git a/main/inference.py b/main/inference.py index a0dc4a161590d1ec3de8c7b528e2f6fea3db6683..10a4f0ad6c7058096237a32037ed43b89f9d8030 100644 --- a/main/inference.py +++ b/main/inference.py @@ -53,8 +53,14 @@ class Inferer: ## mmdet inference mmdet_results = inference_detector(self.model, original_img) - mmdet_box = process_mmdet_results(mmdet_results, cat_id=0, multi_person=True) + pred_instance = mmdet_results.pred_instances.cpu().numpy() + bboxes = np.concatenate( + (pred_instance.bboxes, pred_instance.scores[:, None]), axis=1) + bboxes = bboxes[pred_instance.labels == 0] + bboxes = np.expand_dims(bboxes, axis=0) + mmdet_box = process_mmdet_results(bboxes, cat_id=0, multi_person=True) + # save original image if no bbox if len(mmdet_box[0])<1: return original_img, [], [] diff --git a/main/transformer_utils/mmpose/__init__.py b/main/transformer_utils/mmpose/__init__.py index abcf8693e279f59c8c80f55e1797841e593dbd72..690da3a78ba0033e7dc820b3d9a681da3ca39706 100644 --- a/main/transformer_utils/mmpose/__init__.py +++ b/main/transformer_utils/mmpose/__init__.py @@ -17,7 +17,7 @@ def digit_version(version_str): mmcv_minimum_version = '1.3.8' -mmcv_maximum_version = '1.8.0' +mmcv_maximum_version = '2.3.0' mmcv_version = digit_version(mmcv.__version__) diff --git a/main/transformer_utils/mmpose/core/camera/camera_base.py b/main/transformer_utils/mmpose/core/camera/camera_base.py index 28b23e7c6279e3613265a949df91f6ced0413b99..092dc20d6b1f1d2db785ad720f67fd9184930ad5 100644 --- a/main/transformer_utils/mmpose/core/camera/camera_base.py +++ b/main/transformer_utils/mmpose/core/camera/camera_base.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. from abc import ABCMeta, abstractmethod -from mmcv.utils import Registry +from mmengine import Registry CAMERAS = Registry('camera') diff --git a/main/transformer_utils/mmpose/core/distributed_wrapper.py b/main/transformer_utils/mmpose/core/distributed_wrapper.py index c67aceec992085e9952ea70c62009e9ec1db30ca..12122e71cb4fd46f0e23bb6df127339325f47520 100644 --- a/main/transformer_utils/mmpose/core/distributed_wrapper.py +++ b/main/transformer_utils/mmpose/core/distributed_wrapper.py @@ -4,7 +4,7 @@ import torch.nn as nn from mmcv.parallel import MODULE_WRAPPERS as MMCV_MODULE_WRAPPERS from mmcv.parallel import MMDistributedDataParallel from mmcv.parallel.scatter_gather import scatter_kwargs -from mmcv.utils import Registry +from mmengine import Registry from torch.cuda._utils import _get_device_index MODULE_WRAPPERS = Registry('module wrapper', parent=MMCV_MODULE_WRAPPERS) diff --git a/main/transformer_utils/mmpose/core/evaluation/eval_hooks.py b/main/transformer_utils/mmpose/core/evaluation/eval_hooks.py index b35a9c6a990c69b2beac9e73f893f97c237e4783..e94cb9a914d5e18816c81545c329958f7630877e 100644 --- a/main/transformer_utils/mmpose/core/evaluation/eval_hooks.py +++ b/main/transformer_utils/mmpose/core/evaluation/eval_hooks.py @@ -1,8 +1,18 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import warnings +import os.path as osp +import warnings +from math import inf +from typing import Callable, List, Optional + +import torch.distributed as dist +from torch.nn.modules.batchnorm import _BatchNorm +from torch.utils.data import DataLoader + +from mmengine.fileio import FileClient +from mmengine.utils import is_seq_of +from mmengine.hooks import Hook, LoggerHook -from mmcv.runner import DistEvalHook as _DistEvalHook -from mmcv.runner import EvalHook as _EvalHook MMPOSE_GREATER_KEYS = [ 'acc', 'ap', 'ar', 'pck', 'auc', '3dpck', 'p-3dpck', '3dauc', 'p-3dauc', @@ -10,6 +20,505 @@ MMPOSE_GREATER_KEYS = [ ] MMPOSE_LESS_KEYS = ['loss', 'epe', 'nme', 'mpjpe', 'p-mpjpe', 'n-mpjpe'] +class _EvalHook(Hook): + """Non-Distributed evaluation hook. + + This hook will regularly perform evaluation in a given interval when + performing in non-distributed environment. + + Args: + dataloader (DataLoader): A PyTorch dataloader, whose dataset has + implemented ``evaluate`` function. + start (int | None, optional): Evaluation starting epoch or iteration. + It enables evaluation before the training starts if ``start`` <= + the resuming epoch or iteration. If None, whether to evaluate is + merely decided by ``interval``. Default: None. + interval (int): Evaluation interval. Default: 1. + by_epoch (bool): Determine perform evaluation by epoch or by iteration. + If set to True, it will perform by epoch. Otherwise, by iteration. + Default: True. + save_best (str, optional): If a metric is specified, it would measure + the best checkpoint during evaluation. The information about best + checkpoint would be saved in ``runner.meta['hook_msgs']`` to keep + best score value and best checkpoint path, which will be also + loaded when resume checkpoint. Options are the evaluation metrics + on the test dataset. e.g., ``bbox_mAP``, ``segm_mAP`` for bbox + detection and instance segmentation. ``AR@100`` for proposal + recall. If ``save_best`` is ``auto``, the first key of the returned + ``OrderedDict`` result will be used. Default: None. + rule (str | None, optional): Comparison rule for best score. If set to + None, it will infer a reasonable rule. Keys such as 'acc', 'top' + .etc will be inferred by 'greater' rule. Keys contain 'loss' will + be inferred by 'less' rule. Options are 'greater', 'less', None. + Default: None. + test_fn (callable, optional): test a model with samples from a + dataloader, and return the test results. If ``None``, the default + test function ``mmcv.engine.single_gpu_test`` will be used. + (default: ``None``) + greater_keys (List[str] | None, optional): Metric keys that will be + inferred by 'greater' comparison rule. If ``None``, + _default_greater_keys will be used. (default: ``None``) + less_keys (List[str] | None, optional): Metric keys that will be + inferred by 'less' comparison rule. If ``None``, _default_less_keys + will be used. (default: ``None``) + out_dir (str, optional): The root directory to save checkpoints. If not + specified, `runner.work_dir` will be used by default. If specified, + the `out_dir` will be the concatenation of `out_dir` and the last + level directory of `runner.work_dir`. + `New in version 1.3.16.` + file_client_args (dict): Arguments to instantiate a FileClient. + See :class:`mmcv.fileio.FileClient` for details. Default: None. + `New in version 1.3.16.` + **eval_kwargs: Evaluation arguments fed into the evaluate function of + the dataset. + + Note: + If new arguments are added for EvalHook, tools/test.py, + tools/eval_metric.py may be affected. 
+ """ + + # Since the key for determine greater or less is related to the downstream + # tasks, downstream repos may need to overwrite the following inner + # variable accordingly. + + rule_map = {'greater': lambda x, y: x > y, 'less': lambda x, y: x < y} + init_value_map = {'greater': -inf, 'less': inf} + _default_greater_keys = [ + 'acc', 'top', 'AR@', 'auc', 'precision', 'mAP', 'mDice', 'mIoU', + 'mAcc', 'aAcc' + ] + _default_less_keys = ['loss'] + + def __init__(self, + dataloader: DataLoader, + start: Optional[int] = None, + interval: int = 1, + by_epoch: bool = True, + save_best: Optional[str] = None, + rule: Optional[str] = None, + test_fn: Optional[Callable] = None, + greater_keys: Optional[List[str]] = None, + less_keys: Optional[List[str]] = None, + out_dir: Optional[str] = None, + file_client_args: Optional[dict] = None, + **eval_kwargs): + if not isinstance(dataloader, DataLoader): + raise TypeError(f'dataloader must be a pytorch DataLoader, ' + f'but got {type(dataloader)}') + + if interval <= 0: + raise ValueError(f'interval must be a positive number, ' + f'but got {interval}') + + assert isinstance(by_epoch, bool), '``by_epoch`` should be a boolean' + + if start is not None and start < 0: + raise ValueError(f'The evaluation start epoch {start} is smaller ' + f'than 0') + + self.dataloader = dataloader + self.interval = interval + self.start = start + self.by_epoch = by_epoch + + assert isinstance(save_best, str) or save_best is None, \ + '""save_best"" should be a str or None ' \ + f'rather than {type(save_best)}' + self.save_best = save_best + self.eval_kwargs = eval_kwargs + self.initial_flag = True + + if test_fn is None: + from mmcv.engine import single_gpu_test + self.test_fn = single_gpu_test + else: + self.test_fn = test_fn + + if greater_keys is None: + self.greater_keys = self._default_greater_keys + else: + if not isinstance(greater_keys, (list, tuple)): + assert isinstance(greater_keys, str) + greater_keys = (greater_keys, ) + assert is_seq_of(greater_keys, str) + self.greater_keys = greater_keys + + if less_keys is None: + self.less_keys = self._default_less_keys + else: + if not isinstance(less_keys, (list, tuple)): + assert isinstance(greater_keys, str) + less_keys = (less_keys, ) + assert is_seq_of(less_keys, str) + self.less_keys = less_keys + + if self.save_best is not None: + self.best_ckpt_path = None + self._init_rule(rule, self.save_best) + + self.out_dir = out_dir + self.file_client_args = file_client_args + + def _init_rule(self, rule: Optional[str], key_indicator: str): + """Initialize rule, key_indicator, comparison_func, and best score. + + Here is the rule to determine which rule is used for key indicator + when the rule is not specific (note that the key indicator matching + is case-insensitive): + 1. If the key indicator is in ``self.greater_keys``, the rule will be + specified as 'greater'. + 2. Or if the key indicator is in ``self.less_keys``, the rule will be + specified as 'less'. + 3. Or if any one item in ``self.greater_keys`` is a substring of + key_indicator , the rule will be specified as 'greater'. + 4. Or if any one item in ``self.less_keys`` is a substring of + key_indicator , the rule will be specified as 'less'. + + Args: + rule (str | None): Comparison rule for best score. + key_indicator (str | None): Key indicator to determine the + comparison rule. 
+ """ + if rule not in self.rule_map and rule is not None: + raise KeyError(f'rule must be greater, less or None, ' + f'but got {rule}.') + + if rule is None: + if key_indicator != 'auto': + # `_lc` here means we use the lower case of keys for + # case-insensitive matching + assert isinstance(key_indicator, str) + key_indicator_lc = key_indicator.lower() + greater_keys = [key.lower() for key in self.greater_keys] + less_keys = [key.lower() for key in self.less_keys] + + if key_indicator_lc in greater_keys: + rule = 'greater' + elif key_indicator_lc in less_keys: + rule = 'less' + elif any(key in key_indicator_lc for key in greater_keys): + rule = 'greater' + elif any(key in key_indicator_lc for key in less_keys): + rule = 'less' + else: + raise ValueError(f'Cannot infer the rule for key ' + f'{key_indicator}, thus a specific rule ' + f'must be specified.') + self.rule = rule + self.key_indicator = key_indicator + if self.rule is not None: + self.compare_func = self.rule_map[self.rule] + + def before_run(self, runner): + if not self.out_dir: + self.out_dir = runner.work_dir + + self.file_client = FileClient.infer_client(self.file_client_args, + self.out_dir) + + # if `self.out_dir` is not equal to `runner.work_dir`, it means that + # `self.out_dir` is set so the final `self.out_dir` is the + # concatenation of `self.out_dir` and the last level directory of + # `runner.work_dir` + if self.out_dir != runner.work_dir: + basename = osp.basename(runner.work_dir.rstrip(osp.sep)) + self.out_dir = self.file_client.join_path(self.out_dir, basename) + runner.logger.info( + f'The best checkpoint will be saved to {self.out_dir} by ' + f'{self.file_client.name}') + + if self.save_best is not None: + if runner.meta is None: + warnings.warn('runner.meta is None. Creating an empty one.') + runner.meta = dict() + runner.meta.setdefault('hook_msgs', dict()) + self.best_ckpt_path = runner.meta['hook_msgs'].get( + 'best_ckpt', None) + + def before_train_iter(self, runner): + """Evaluate the model only at the start of training by iteration.""" + if self.by_epoch or not self.initial_flag: + return + if self.start is not None and runner.iter >= self.start: + self.after_train_iter(runner) + self.initial_flag = False + + def before_train_epoch(self, runner): + """Evaluate the model only at the start of training by epoch.""" + if not (self.by_epoch and self.initial_flag): + return + if self.start is not None and runner.epoch >= self.start: + self.after_train_epoch(runner) + self.initial_flag = False + + def after_train_iter(self, runner): + """Called after every training iter to evaluate the results.""" + if not self.by_epoch and self._should_evaluate(runner): + # Because the priority of EvalHook is higher than LoggerHook, the + # training log and the evaluating log are mixed. Therefore, + # we need to dump the training log and clear it before evaluating + # log is generated. In addition, this problem will only appear in + # `IterBasedRunner` whose `self.by_epoch` is False, because + # `EpochBasedRunner` whose `self.by_epoch` is True calls + # `_do_evaluate` in `after_train_epoch` stage, and at this stage + # the training log has been printed, so it will not cause any + # problem. 
more details at
+            # https://github.com/open-mmlab/mmsegmentation/issues/694
+            for hook in runner._hooks:
+                if isinstance(hook, LoggerHook):
+                    hook.after_train_iter(runner)
+            runner.log_buffer.clear()
+
+            self._do_evaluate(runner)
+
+    def after_train_epoch(self, runner):
+        """Called after every training epoch to evaluate the results."""
+        if self.by_epoch and self._should_evaluate(runner):
+            self._do_evaluate(runner)
+
+    def _do_evaluate(self, runner):
+        """perform evaluation and save ckpt."""
+        results = self.test_fn(runner.model, self.dataloader)
+        runner.log_buffer.output['eval_iter_num'] = len(self.dataloader)
+        key_score = self.evaluate(runner, results)
+        # the key_score may be `None` so it needs to skip the action to save
+        # the best checkpoint
+        if self.save_best and key_score:
+            self._save_ckpt(runner, key_score)
+
+    def _should_evaluate(self, runner):
+        """Judge whether to perform evaluation.
+
+        Here is the rule to judge whether to perform evaluation:
+        1. It will not perform evaluation during the epoch/iteration interval,
+           which is determined by ``self.interval``.
+        2. It will not perform evaluation if the start time is larger than
+           the current time.
+        3. It will not perform evaluation when the current time is larger
+           than the start time but still falls within the epoch/iteration
+           interval.
+
+        Returns:
+            bool: The flag indicating whether to perform evaluation.
+        """
+        if self.by_epoch:
+            current = runner.epoch
+            check_time = self.every_n_epochs
+        else:
+            current = runner.iter
+            check_time = self.every_n_iters
+
+        if self.start is None:
+            if not check_time(runner, self.interval):
+                # No evaluation during the interval.
+                return False
+        elif (current + 1) < self.start:
+            # No evaluation if start is larger than the current time.
+            return False
+        else:
+            # Evaluation only at epochs/iters 3, 5, 7...
+            # if start==3 and interval==2
+            if (current + 1 - self.start) % self.interval:
+                return False
+        return True
+
+    def _save_ckpt(self, runner, key_score):
+        """Save the best checkpoint.
+
+        It will compare the score according to the compare function, write
+        related information (best score, best checkpoint path) and save the
+        best checkpoint into ``work_dir``.
+        """
+        if self.by_epoch:
+            current = f'epoch_{runner.epoch + 1}'
+            cur_type, cur_time = 'epoch', runner.epoch + 1
+        else:
+            current = f'iter_{runner.iter + 1}'
+            cur_type, cur_time = 'iter', runner.iter + 1
+
+        best_score = runner.meta['hook_msgs'].get(
+            'best_score', self.init_value_map[self.rule])
+        if self.compare_func(key_score, best_score):
+            best_score = key_score
+            runner.meta['hook_msgs']['best_score'] = best_score
+
+            if self.best_ckpt_path and self.file_client.isfile(
+                    self.best_ckpt_path):
+                self.file_client.remove(self.best_ckpt_path)
+                runner.logger.info(
+                    f'The previous best checkpoint {self.best_ckpt_path} was '
+                    'removed')
+
+            best_ckpt_name = f'best_{self.key_indicator}_{current}.pth'
+            self.best_ckpt_path = self.file_client.join_path(
+                self.out_dir, best_ckpt_name)
+            runner.meta['hook_msgs']['best_ckpt'] = self.best_ckpt_path
+
+            runner.save_checkpoint(
+                self.out_dir,
+                filename_tmpl=best_ckpt_name,
+                create_symlink=False)
+            runner.logger.info(
+                f'Now best checkpoint is saved as {best_ckpt_name}.')
+            runner.logger.info(
+                f'Best {self.key_indicator} is {best_score:0.4f} '
+                f'at {cur_time} {cur_type}.')
+
+    def evaluate(self, runner, results):
+        """Evaluate the results.
+
+        Args:
+            runner (:obj:`mmcv.Runner`): The underlying training runner.
+            results (list): Output results.
+ """ + eval_res = self.dataloader.dataset.evaluate( + results, logger=runner.logger, **self.eval_kwargs) + + for name, val in eval_res.items(): + runner.log_buffer.output[name] = val + runner.log_buffer.ready = True + + if self.save_best is not None: + # If the performance of model is poor, the `eval_res` may be an + # empty dict and it will raise exception when `self.save_best` is + # not None. More details at + # https://github.com/open-mmlab/mmdetection/issues/6265. + if not eval_res: + warnings.warn( + 'Since `eval_res` is an empty dict, the behavior to save ' + 'the best checkpoint will be skipped in this evaluation.') + return None + + if self.key_indicator == 'auto': + # infer from eval_results + self._init_rule(self.rule, list(eval_res.keys())[0]) + return eval_res[self.key_indicator] + + return None + + +class _DistEvalHook(_EvalHook): + """Distributed evaluation hook. + + This hook will regularly perform evaluation in a given interval when + performing in distributed environment. + + Args: + dataloader (DataLoader): A PyTorch dataloader, whose dataset has + implemented ``evaluate`` function. + start (int | None, optional): Evaluation starting epoch. It enables + evaluation before the training starts if ``start`` <= the resuming + epoch. If None, whether to evaluate is merely decided by + ``interval``. Default: None. + interval (int): Evaluation interval. Default: 1. + by_epoch (bool): Determine perform evaluation by epoch or by iteration. + If set to True, it will perform by epoch. Otherwise, by iteration. + default: True. + save_best (str, optional): If a metric is specified, it would measure + the best checkpoint during evaluation. The information about best + checkpoint would be saved in ``runner.meta['hook_msgs']`` to keep + best score value and best checkpoint path, which will be also + loaded when resume checkpoint. Options are the evaluation metrics + on the test dataset. e.g., ``bbox_mAP``, ``segm_mAP`` for bbox + detection and instance segmentation. ``AR@100`` for proposal + recall. If ``save_best`` is ``auto``, the first key of the returned + ``OrderedDict`` result will be used. Default: None. + rule (str | None, optional): Comparison rule for best score. If set to + None, it will infer a reasonable rule. Keys such as 'acc', 'top' + .etc will be inferred by 'greater' rule. Keys contain 'loss' will + be inferred by 'less' rule. Options are 'greater', 'less', None. + Default: None. + test_fn (callable, optional): test a model with samples from a + dataloader in a multi-gpu manner, and return the test results. If + ``None``, the default test function ``mmcv.engine.multi_gpu_test`` + will be used. (default: ``None``) + tmpdir (str | None): Temporary directory to save the results of all + processes. Default: None. + gpu_collect (bool): Whether to use gpu or cpu to collect results. + Default: False. + broadcast_bn_buffer (bool): Whether to broadcast the + buffer(running_mean and running_var) of rank 0 to other rank + before evaluation. Default: True. + out_dir (str, optional): The root directory to save checkpoints. If not + specified, `runner.work_dir` will be used by default. If specified, + the `out_dir` will be the concatenation of `out_dir` and the last + level directory of `runner.work_dir`. + file_client_args (dict): Arguments to instantiate a FileClient. + See :class:`mmcv.fileio.FileClient` for details. Default: None. + **eval_kwargs: Evaluation arguments fed into the evaluate function of + the dataset. 
+ """ + + def __init__(self, + dataloader: DataLoader, + start: Optional[int] = None, + interval: int = 1, + by_epoch: bool = True, + save_best: Optional[str] = None, + rule: Optional[str] = None, + test_fn: Optional[Callable] = None, + greater_keys: Optional[List[str]] = None, + less_keys: Optional[List[str]] = None, + broadcast_bn_buffer: bool = True, + tmpdir: Optional[str] = None, + gpu_collect: bool = False, + out_dir: Optional[str] = None, + file_client_args: Optional[dict] = None, + **eval_kwargs): + + if test_fn is None: + from mmcv.engine import multi_gpu_test + test_fn = multi_gpu_test + + super().__init__( + dataloader, + start=start, + interval=interval, + by_epoch=by_epoch, + save_best=save_best, + rule=rule, + test_fn=test_fn, + greater_keys=greater_keys, + less_keys=less_keys, + out_dir=out_dir, + file_client_args=file_client_args, + **eval_kwargs) + + self.broadcast_bn_buffer = broadcast_bn_buffer + self.tmpdir = tmpdir + self.gpu_collect = gpu_collect + + def _do_evaluate(self, runner): + """perform evaluation and save ckpt.""" + # Synchronization of BatchNorm's buffer (running_mean + # and running_var) is not supported in the DDP of pytorch, + # which may cause the inconsistent performance of models in + # different ranks, so we broadcast BatchNorm's buffers + # of rank 0 to other ranks to avoid this. + if self.broadcast_bn_buffer: + model = runner.model + for name, module in model.named_modules(): + if isinstance(module, + _BatchNorm) and module.track_running_stats: + dist.broadcast(module.running_var, 0) + dist.broadcast(module.running_mean, 0) + + tmpdir = self.tmpdir + if tmpdir is None: + tmpdir = osp.join(runner.work_dir, '.eval_hook') + + results = self.test_fn( + runner.model, + self.dataloader, + tmpdir=tmpdir, + gpu_collect=self.gpu_collect) + if runner.rank == 0: + print('\n') + runner.log_buffer.output['eval_iter_num'] = len(self.dataloader) + key_score = self.evaluate(runner, results) + # the key_score may be `None` so it needs to skip the action to + # save the best checkpoint + if self.save_best and key_score: + self._save_ckpt(runner, key_score) class EvalHook(_EvalHook): diff --git a/main/transformer_utils/mmpose/core/fp16/hooks.py b/main/transformer_utils/mmpose/core/fp16/hooks.py index 74081a9b73b95ebb20cabf07cfaeab86cc874780..c4e414396925b9d15b5958d4831bec06f0d0f7bf 100644 --- a/main/transformer_utils/mmpose/core/fp16/hooks.py +++ b/main/transformer_utils/mmpose/core/fp16/hooks.py @@ -1,15 +1,90 @@ # Copyright (c) OpenMMLab. All rights reserved. import copy +import logging +from typing import Optional import torch import torch.nn as nn -from mmcv.runner import OptimizerHook -from mmcv.utils import _BatchNorm +from torch import Tensor +from torch.nn.utils import clip_grad +from mmengine.hooks import Hook +from torch.nn.modules.batchnorm import _BatchNorm from ..utils.dist_utils import allreduce_grads from .utils import cast_tensor_type +class OptimizerHook(Hook): + """A hook contains custom operations for the optimizer. + + Args: + grad_clip (dict, optional): A config dict to control the clip_grad. + Default: None. + detect_anomalous_params (bool): This option is only used for + debugging which will slow down the training speed. + Detect anomalous parameters that are not included in + the computational graph with `loss` as the root. + There are two cases + + - Parameters were not used during + forward pass. + - Parameters were not used to produce + loss. + Default: False. 
+ """ + + def __init__(self, + grad_clip: Optional[dict] = None, + detect_anomalous_params: bool = False): + self.grad_clip = grad_clip + self.detect_anomalous_params = detect_anomalous_params + + def clip_grads(self, params): + params = list( + filter(lambda p: p.requires_grad and p.grad is not None, params)) + if len(params) > 0: + return clip_grad.clip_grad_norm_(params, **self.grad_clip) + + def after_train_iter(self, runner): + runner.optimizer.zero_grad() + if self.detect_anomalous_params: + self.detect_anomalous_parameters(runner.outputs['loss'], runner) + runner.outputs['loss'].backward() + + if self.grad_clip is not None: + grad_norm = self.clip_grads(runner.model.parameters()) + if grad_norm is not None: + # Add grad norm to the logger + runner.log_buffer.update({'grad_norm': float(grad_norm)}, + runner.outputs['num_samples']) + runner.optimizer.step() + + def detect_anomalous_parameters(self, loss: Tensor, runner) -> None: + logger = runner.logger + parameters_in_graph = set() + visited = set() + + def traverse(grad_fn): + if grad_fn is None: + return + if grad_fn not in visited: + visited.add(grad_fn) + if hasattr(grad_fn, 'variable'): + parameters_in_graph.add(grad_fn.variable) + parents = grad_fn.next_functions + if parents is not None: + for parent in parents: + grad_fn = parent[0] + traverse(grad_fn) + + traverse(loss.grad_fn) + for n, p in runner.model.named_parameters(): + if p not in parameters_in_graph and p.requires_grad: + logger.log( + level=logging.ERROR, + msg=f'{n} with shape {p.size()} is not ' + f'in the computational graph \n') + class Fp16OptimizerHook(OptimizerHook): """FP16 optimizer hook. diff --git a/main/transformer_utils/mmpose/core/optimizers/builder.py b/main/transformer_utils/mmpose/core/optimizers/builder.py index cd2cf49133c57f28261b555d30a5cee18ae105af..aa9d2c7ab4b464b2900a7bd14076e601a4d1168c 100644 --- a/main/transformer_utils/mmpose/core/optimizers/builder.py +++ b/main/transformer_utils/mmpose/core/optimizers/builder.py @@ -1,24 +1,37 @@ # Copyright (c) OpenMMLab. All rights reserved. -from mmcv.runner import build_optimizer -from mmcv.runner.optimizer import OPTIMIZER_BUILDERS as MMCV_OPTIMIZER_BUILDERS -from mmcv.utils import Registry, build_from_cfg +import copy +from typing import Dict +# from mmcv.runner.optimizer import OPTIMIZER_BUILDERS as MMCV_OPTIMIZER_BUILDERS +from mmengine import Registry +from mmengine.registry import build_from_cfg OPTIMIZERS = Registry('optimizers') -OPTIMIZER_BUILDERS = Registry( - 'optimizer builder', parent=MMCV_OPTIMIZER_BUILDERS) +OPTIMIZER_BUILDERS = Registry('optimizer builder') def build_optimizer_constructor(cfg): constructor_type = cfg.get('type') if constructor_type in OPTIMIZER_BUILDERS: return build_from_cfg(cfg, OPTIMIZER_BUILDERS) - elif constructor_type in MMCV_OPTIMIZER_BUILDERS: - return build_from_cfg(cfg, MMCV_OPTIMIZER_BUILDERS) else: raise KeyError(f'{constructor_type} is not registered ' 'in the optimizer builder registry.') +def build_optimizer(model, cfg: Dict): + optimizer_cfg = copy.deepcopy(cfg) + constructor_type = optimizer_cfg.pop('constructor', + 'DefaultOptimizerConstructor') + paramwise_cfg = optimizer_cfg.pop('paramwise_cfg', None) + optim_constructor = build_optimizer_constructor( + dict( + type=constructor_type, + optimizer_cfg=optimizer_cfg, + paramwise_cfg=paramwise_cfg)) + optimizer = optim_constructor(model) + return optimizer + + def build_optimizers(model, cfgs): """Build multiple optimizers from configs. 
diff --git a/main/transformer_utils/mmpose/core/optimizers/layer_decay_optimizer_constructor.py b/main/transformer_utils/mmpose/core/optimizers/layer_decay_optimizer_constructor.py index 1ab6a82548c046483b7c412cefa0762cdbc531f8..958b50ae4839f5fd4dc0aa864a1723c9cbc9d8c8 100644 --- a/main/transformer_utils/mmpose/core/optimizers/layer_decay_optimizer_constructor.py +++ b/main/transformer_utils/mmpose/core/optimizers/layer_decay_optimizer_constructor.py @@ -1,8 +1,8 @@ # Copyright (c) OpenMMLab. All rights reserved. import json import warnings - -from mmcv.runner import DefaultOptimizerConstructor, get_dist_info +from mmengine.dist import get_dist_info +from mmcv.runner import DefaultOptimizerConstructor from mmpose.utils import get_root_logger from .builder import OPTIMIZER_BUILDERS diff --git a/main/transformer_utils/mmpose/core/post_processing/smoother.py b/main/transformer_utils/mmpose/core/post_processing/smoother.py index 6b57768c03b48ff84877acbceb6e27b82832c04d..083e360a15f38660eea19a8115412ff70fcd1b80 100644 --- a/main/transformer_utils/mmpose/core/post_processing/smoother.py +++ b/main/transformer_utils/mmpose/core/post_processing/smoother.py @@ -4,8 +4,8 @@ import warnings from typing import Dict, Union import numpy as np -from mmcv import Config, is_seq_of - +from mmengine.config import Config +from mmengine.utils import is_seq_of from mmpose.core.post_processing.temporal_filters import build_filter diff --git a/main/transformer_utils/mmpose/core/post_processing/temporal_filters/builder.py b/main/transformer_utils/mmpose/core/post_processing/temporal_filters/builder.py index adb914c5222db967c9cdb56fa9f469ff47792f79..cd429df5106ff7a27dc4f63cb510b442ed48bb87 100644 --- a/main/transformer_utils/mmpose/core/post_processing/temporal_filters/builder.py +++ b/main/transformer_utils/mmpose/core/post_processing/temporal_filters/builder.py @@ -1,5 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-from mmcv.utils import Registry +from mmengine import Registry FILTERS = Registry('filters') diff --git a/main/transformer_utils/mmpose/core/post_processing/temporal_filters/smoothnet_filter.py b/main/transformer_utils/mmpose/core/post_processing/temporal_filters/smoothnet_filter.py index c7f8df520ad9457722f738c33b79d69d3a99fb9e..dd73b09717c25d2fef9839f6f9869bc45d8958ef 100644 --- a/main/transformer_utils/mmpose/core/post_processing/temporal_filters/smoothnet_filter.py +++ b/main/transformer_utils/mmpose/core/post_processing/temporal_filters/smoothnet_filter.py @@ -3,7 +3,7 @@ from typing import Optional import numpy as np import torch -from mmcv.runner import load_checkpoint +from mmengine.runner import load_checkpoint from torch import Tensor, nn from .builder import FILTERS diff --git a/main/transformer_utils/mmpose/core/utils/dist_utils.py b/main/transformer_utils/mmpose/core/utils/dist_utils.py index b81f925ad7aa51ce800e27bead8eb8ba021c2592..b6273bab4870ac646edbddcc21e2c30de462f2a2 100644 --- a/main/transformer_utils/mmpose/core/utils/dist_utils.py +++ b/main/transformer_utils/mmpose/core/utils/dist_utils.py @@ -4,7 +4,7 @@ from collections import OrderedDict import numpy as np import torch import torch.distributed as dist -from mmcv.runner import get_dist_info +from mmengine.dist import get_dist_info from torch._utils import (_flatten_dense_tensors, _take_tensors, _unflatten_dense_tensors) diff --git a/main/transformer_utils/mmpose/core/utils/model_util_hooks.py b/main/transformer_utils/mmpose/core/utils/model_util_hooks.py index d308a8a57a04f1a2acaa841ac2e8ad42439bb633..f03e3178309b08e7969dd6793e39d8bb743115cf 100644 --- a/main/transformer_utils/mmpose/core/utils/model_util_hooks.py +++ b/main/transformer_utils/mmpose/core/utils/model_util_hooks.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-from mmcv.runner import HOOKS, Hook - +from mmengine.registry import HOOKS +from mmengine.hooks import Hook @HOOKS.register_module() class ModelSetEpochHook(Hook): diff --git a/main/transformer_utils/mmpose/core/visualization/image.py b/main/transformer_utils/mmpose/core/visualization/image.py index d244b2b12cff970c810ae0798164e835dd6226e4..8188ccb0ed42427dc4311d20e135d115d9e5e6fc 100644 --- a/main/transformer_utils/mmpose/core/visualization/image.py +++ b/main/transformer_utils/mmpose/core/visualization/image.py @@ -7,7 +7,7 @@ import cv2 import mmcv import numpy as np from matplotlib import pyplot as plt -from mmcv.utils.misc import deprecated_api_warning +from mmengine.utils import deprecated_api_warning from mmcv.visualization.color import color_val try: diff --git a/main/transformer_utils/mmpose/models/__init__.py b/main/transformer_utils/mmpose/models/__init__.py index 641d115a693abff882fa7604811430f8e6b605ab..fa68fc72fbce4204da6bc576daf8d04a9819bf52 100644 --- a/main/transformer_utils/mmpose/models/__init__.py +++ b/main/transformer_utils/mmpose/models/__init__.py @@ -3,9 +3,9 @@ from .builder import (BACKBONES, HEADS, LOSSES, MESH_MODELS, NECKS, POSENETS, build_backbone, build_head, build_loss, build_mesh_model, build_neck, build_posenet) from .detectors import * # noqa +from .backbones import * from .heads import * # noqa from .losses import * # noqa -from .necks import * # noqa from .utils import * # noqa diff --git a/main/transformer_utils/mmpose/models/backbones/__init__.py b/main/transformer_utils/mmpose/models/backbones/__init__.py index 06717917a2dbd08800587d3ffa193149e42a653c..2003ee3af7c44b6fcbf3b46e0ac1e00785f7a6f1 100644 --- a/main/transformer_utils/mmpose/models/backbones/__init__.py +++ b/main/transformer_utils/mmpose/models/backbones/__init__.py @@ -1,41 +1,42 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-from .alexnet import AlexNet -from .cpm import CPM -from .hourglass import HourglassNet -from .hourglass_ae import HourglassAENet -from .hrformer import HRFormer -from .hrnet import HRNet -from .i3d import I3D -from .litehrnet import LiteHRNet -from .mobilenet_v2 import MobileNetV2 -from .mobilenet_v3 import MobileNetV3 -from .mspn import MSPN -from .pvt import PyramidVisionTransformer, PyramidVisionTransformerV2 -from .regnet import RegNet -from .resnest import ResNeSt -from .resnet import ResNet, ResNetV1d -from .resnext import ResNeXt -from .rsn import RSN -from .scnet import SCNet -from .seresnet import SEResNet -from .seresnext import SEResNeXt -from .shufflenet_v1 import ShuffleNetV1 -from .shufflenet_v2 import ShuffleNetV2 -from .swin import SwinTransformer -from .tcformer import TCFormer -from .tcn import TCN -from .v2v_net import V2VNet -from .vgg import VGG -from .vipnas_mbv3 import ViPNAS_MobileNetV3 -from .vipnas_resnet import ViPNAS_ResNet -from .hrt import HRT +# from .alexnet import AlexNet +# from .cpm import CPM +# from .hourglass import HourglassNet +# from .hourglass_ae import HourglassAENet +# from .hrformer import HRFormer +# from .hrnet import HRNet +# from .i3d import I3D +# from .litehrnet import LiteHRNet +# from .mobilenet_v2 import MobileNetV2 +# from .mobilenet_v3 import MobileNetV3 +# from .mspn import MSPN +# from .pvt import PyramidVisionTransformer, PyramidVisionTransformerV2 +# from .regnet import RegNet +# from .resnest import ResNeSt +# from .resnet import ResNet, ResNetV1d +# from .resnext import ResNeXt +# from .rsn import RSN +# from .scnet import SCNet +# from .seresnet import SEResNet +# from .seresnext import SEResNeXt +# from .shufflenet_v1 import ShuffleNetV1 +# from .shufflenet_v2 import ShuffleNetV2 +# from .swin import SwinTransformer +# from .tcformer import TCFormer +# from .tcn import TCN +# from .v2v_net import V2VNet +# from .vgg import VGG +# from .vipnas_mbv3 import ViPNAS_MobileNetV3 +# from .vipnas_resnet import ViPNAS_ResNet +# from .hrt import HRT from .vit import ViT -__all__ = [ - 'AlexNet', 'HourglassNet', 'HourglassAENet', 'HRNet', 'MobileNetV2', - 'MobileNetV3', 'RegNet', 'ResNet', 'ResNetV1d', 'ResNeXt', 'SCNet', - 'SEResNet', 'SEResNeXt', 'ShuffleNetV1', 'ShuffleNetV2', 'CPM', 'RSN', - 'MSPN', 'ResNeSt', 'VGG', 'TCN', 'ViPNAS_ResNet', 'ViPNAS_MobileNetV3', - 'LiteHRNet', 'V2VNet', 'HRFormer', 'PyramidVisionTransformer', - 'PyramidVisionTransformerV2', 'SwinTransformer', 'I3D', 'TCFormer', 'ViT' -] +# __all__ = [ +# 'AlexNet', 'HourglassNet', 'HourglassAENet', 'HRNet', 'MobileNetV2', +# 'MobileNetV3', 'RegNet', 'ResNet', 'ResNetV1d', 'ResNeXt', 'SCNet', +# 'SEResNet', 'SEResNeXt', 'ShuffleNetV1', 'ShuffleNetV2', 'CPM', 'RSN', +# 'MSPN', 'ResNeSt', 'VGG', 'TCN', 'ViPNAS_ResNet', 'ViPNAS_MobileNetV3', +# 'LiteHRNet', 'V2VNet', 'HRFormer', 'PyramidVisionTransformer', +# 'PyramidVisionTransformerV2', 'SwinTransformer', 'I3D', 'TCFormer', 'ViT' +# ] +__all__ = ['ViT'] \ No newline at end of file diff --git a/main/transformer_utils/mmpose/models/backbones/alexnet.py b/main/transformer_utils/mmpose/models/backbones/alexnet.py deleted file mode 100644 index a8efd74d118f5abe4d9c880ebe80ce7cbd58c6b2..0000000000000000000000000000000000000000 --- a/main/transformer_utils/mmpose/models/backbones/alexnet.py +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-import torch.nn as nn - -from ..builder import BACKBONES -from .base_backbone import BaseBackbone - - -@BACKBONES.register_module() -class AlexNet(BaseBackbone): - """`AlexNet `__ backbone. - - The input for AlexNet is a 224x224 RGB image. - - Args: - num_classes (int): number of classes for classification. - The default value is -1, which uses the backbone as - a feature extractor without the top classifier. - """ - - def __init__(self, num_classes=-1): - super().__init__() - self.num_classes = num_classes - self.features = nn.Sequential( - nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2), - nn.ReLU(inplace=True), - nn.MaxPool2d(kernel_size=3, stride=2), - nn.Conv2d(64, 192, kernel_size=5, padding=2), - nn.ReLU(inplace=True), - nn.MaxPool2d(kernel_size=3, stride=2), - nn.Conv2d(192, 384, kernel_size=3, padding=1), - nn.ReLU(inplace=True), - nn.Conv2d(384, 256, kernel_size=3, padding=1), - nn.ReLU(inplace=True), - nn.Conv2d(256, 256, kernel_size=3, padding=1), - nn.ReLU(inplace=True), - nn.MaxPool2d(kernel_size=3, stride=2), - ) - if self.num_classes > 0: - self.classifier = nn.Sequential( - nn.Dropout(), - nn.Linear(256 * 6 * 6, 4096), - nn.ReLU(inplace=True), - nn.Dropout(), - nn.Linear(4096, 4096), - nn.ReLU(inplace=True), - nn.Linear(4096, num_classes), - ) - - def forward(self, x): - - x = self.features(x) - if self.num_classes > 0: - x = x.view(x.size(0), 256 * 6 * 6) - x = self.classifier(x) - - return x diff --git a/main/transformer_utils/mmpose/models/backbones/cpm.py b/main/transformer_utils/mmpose/models/backbones/cpm.py deleted file mode 100644 index 458245d755f930f4ff625a754aadbab5c13494a6..0000000000000000000000000000000000000000 --- a/main/transformer_utils/mmpose/models/backbones/cpm.py +++ /dev/null @@ -1,186 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import copy - -import torch -import torch.nn as nn -from mmcv.cnn import ConvModule, constant_init, normal_init -from torch.nn.modules.batchnorm import _BatchNorm - -from mmpose.utils import get_root_logger -from ..builder import BACKBONES -from .base_backbone import BaseBackbone -from .utils import load_checkpoint - - -class CpmBlock(nn.Module): - """CpmBlock for Convolutional Pose Machine. - - Args: - in_channels (int): Input channels of this block. - channels (list): Output channels of each conv module. - kernels (list): Kernel sizes of each conv module. - """ - - def __init__(self, - in_channels, - channels=(128, 128, 128), - kernels=(11, 11, 11), - norm_cfg=None): - super().__init__() - - assert len(channels) == len(kernels) - layers = [] - for i in range(len(channels)): - if i == 0: - input_channels = in_channels - else: - input_channels = channels[i - 1] - layers.append( - ConvModule( - input_channels, - channels[i], - kernels[i], - padding=(kernels[i] - 1) // 2, - norm_cfg=norm_cfg)) - self.model = nn.Sequential(*layers) - - def forward(self, x): - """Model forward function.""" - out = self.model(x) - return out - - -@BACKBONES.register_module() -class CPM(BaseBackbone): - """CPM backbone. - - Convolutional Pose Machines. - More details can be found in the `paper - `__ . - - Args: - in_channels (int): The input channels of the CPM. - out_channels (int): The output channels of the CPM. - feat_channels (int): Feature channel of each CPM stage. - middle_channels (int): Feature channel of conv after the middle stage. - num_stages (int): Number of stages. - norm_cfg (dict): Dictionary to construct and config norm layer. 
- - Example: - >>> from mmpose.models import CPM - >>> import torch - >>> self = CPM(3, 17) - >>> self.eval() - >>> inputs = torch.rand(1, 3, 368, 368) - >>> level_outputs = self.forward(inputs) - >>> for level_output in level_outputs: - ... print(tuple(level_output.shape)) - (1, 17, 46, 46) - (1, 17, 46, 46) - (1, 17, 46, 46) - (1, 17, 46, 46) - (1, 17, 46, 46) - (1, 17, 46, 46) - """ - - def __init__(self, - in_channels, - out_channels, - feat_channels=128, - middle_channels=32, - num_stages=6, - norm_cfg=dict(type='BN', requires_grad=True)): - # Protect mutable default arguments - norm_cfg = copy.deepcopy(norm_cfg) - super().__init__() - - assert in_channels == 3 - - self.num_stages = num_stages - assert self.num_stages >= 1 - - self.stem = nn.Sequential( - ConvModule(in_channels, 128, 9, padding=4, norm_cfg=norm_cfg), - nn.MaxPool2d(kernel_size=3, stride=2, padding=1), - ConvModule(128, 128, 9, padding=4, norm_cfg=norm_cfg), - nn.MaxPool2d(kernel_size=3, stride=2, padding=1), - ConvModule(128, 128, 9, padding=4, norm_cfg=norm_cfg), - nn.MaxPool2d(kernel_size=3, stride=2, padding=1), - ConvModule(128, 32, 5, padding=2, norm_cfg=norm_cfg), - ConvModule(32, 512, 9, padding=4, norm_cfg=norm_cfg), - ConvModule(512, 512, 1, padding=0, norm_cfg=norm_cfg), - ConvModule(512, out_channels, 1, padding=0, act_cfg=None)) - - self.middle = nn.Sequential( - ConvModule(in_channels, 128, 9, padding=4, norm_cfg=norm_cfg), - nn.MaxPool2d(kernel_size=3, stride=2, padding=1), - ConvModule(128, 128, 9, padding=4, norm_cfg=norm_cfg), - nn.MaxPool2d(kernel_size=3, stride=2, padding=1), - ConvModule(128, 128, 9, padding=4, norm_cfg=norm_cfg), - nn.MaxPool2d(kernel_size=3, stride=2, padding=1)) - - self.cpm_stages = nn.ModuleList([ - CpmBlock( - middle_channels + out_channels, - channels=[feat_channels, feat_channels, feat_channels], - kernels=[11, 11, 11], - norm_cfg=norm_cfg) for _ in range(num_stages - 1) - ]) - - self.middle_conv = nn.ModuleList([ - nn.Sequential( - ConvModule( - 128, middle_channels, 5, padding=2, norm_cfg=norm_cfg)) - for _ in range(num_stages - 1) - ]) - - self.out_convs = nn.ModuleList([ - nn.Sequential( - ConvModule( - feat_channels, - feat_channels, - 1, - padding=0, - norm_cfg=norm_cfg), - ConvModule(feat_channels, out_channels, 1, act_cfg=None)) - for _ in range(num_stages - 1) - ]) - - def init_weights(self, pretrained=None): - """Initialize the weights in backbone. - - Args: - pretrained (str, optional): Path to pre-trained weights. - Defaults to None. 
- """ - if isinstance(pretrained, str): - logger = get_root_logger() - load_checkpoint(self, pretrained, strict=False, logger=logger) - elif pretrained is None: - for m in self.modules(): - if isinstance(m, nn.Conv2d): - normal_init(m, std=0.001) - elif isinstance(m, (_BatchNorm, nn.GroupNorm)): - constant_init(m, 1) - else: - raise TypeError('pretrained must be a str or None') - - def forward(self, x): - """Model forward function.""" - stage1_out = self.stem(x) - middle_out = self.middle(x) - out_feats = [] - - out_feats.append(stage1_out) - - for ind in range(self.num_stages - 1): - single_stage = self.cpm_stages[ind] - out_conv = self.out_convs[ind] - - inp_feat = torch.cat( - [out_feats[-1], self.middle_conv[ind](middle_out)], 1) - cpm_feat = single_stage(inp_feat) - out_feat = out_conv(cpm_feat) - out_feats.append(out_feat) - - return out_feats diff --git a/main/transformer_utils/mmpose/models/backbones/hourglass.py b/main/transformer_utils/mmpose/models/backbones/hourglass.py deleted file mode 100644 index bf75fad9895ebfd3f3c2a6bffedb3d7e4cc77cba..0000000000000000000000000000000000000000 --- a/main/transformer_utils/mmpose/models/backbones/hourglass.py +++ /dev/null @@ -1,212 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import copy - -import torch.nn as nn -from mmcv.cnn import ConvModule, constant_init, normal_init -from torch.nn.modules.batchnorm import _BatchNorm - -from mmpose.utils import get_root_logger -from ..builder import BACKBONES -from .base_backbone import BaseBackbone -from .resnet import BasicBlock, ResLayer -from .utils import load_checkpoint - - -class HourglassModule(nn.Module): - """Hourglass Module for HourglassNet backbone. - - Generate module recursively and use BasicBlock as the base unit. - - Args: - depth (int): Depth of current HourglassModule. - stage_channels (list[int]): Feature channels of sub-modules in current - and follow-up HourglassModule. - stage_blocks (list[int]): Number of sub-modules stacked in current and - follow-up HourglassModule. - norm_cfg (dict): Dictionary to construct and config norm layer. - """ - - def __init__(self, - depth, - stage_channels, - stage_blocks, - norm_cfg=dict(type='BN', requires_grad=True)): - # Protect mutable default arguments - norm_cfg = copy.deepcopy(norm_cfg) - super().__init__() - - self.depth = depth - - cur_block = stage_blocks[0] - next_block = stage_blocks[1] - - cur_channel = stage_channels[0] - next_channel = stage_channels[1] - - self.up1 = ResLayer( - BasicBlock, cur_block, cur_channel, cur_channel, norm_cfg=norm_cfg) - - self.low1 = ResLayer( - BasicBlock, - cur_block, - cur_channel, - next_channel, - stride=2, - norm_cfg=norm_cfg) - - if self.depth > 1: - self.low2 = HourglassModule(depth - 1, stage_channels[1:], - stage_blocks[1:]) - else: - self.low2 = ResLayer( - BasicBlock, - next_block, - next_channel, - next_channel, - norm_cfg=norm_cfg) - - self.low3 = ResLayer( - BasicBlock, - cur_block, - next_channel, - cur_channel, - norm_cfg=norm_cfg, - downsample_first=False) - - self.up2 = nn.Upsample(scale_factor=2) - - def forward(self, x): - """Model forward function.""" - up1 = self.up1(x) - low1 = self.low1(x) - low2 = self.low2(low1) - low3 = self.low3(low2) - up2 = self.up2(low3) - return up1 + up2 - - -@BACKBONES.register_module() -class HourglassNet(BaseBackbone): - """HourglassNet backbone. - - Stacked Hourglass Networks for Human Pose Estimation. - More details can be found in the `paper - `__ . - - Args: - downsample_times (int): Downsample times in a HourglassModule. 
- num_stacks (int): Number of HourglassModule modules stacked, - 1 for Hourglass-52, 2 for Hourglass-104. - stage_channels (list[int]): Feature channel of each sub-module in a - HourglassModule. - stage_blocks (list[int]): Number of sub-modules stacked in a - HourglassModule. - feat_channel (int): Feature channel of conv after a HourglassModule. - norm_cfg (dict): Dictionary to construct and config norm layer. - - Example: - >>> from mmpose.models import HourglassNet - >>> import torch - >>> self = HourglassNet() - >>> self.eval() - >>> inputs = torch.rand(1, 3, 511, 511) - >>> level_outputs = self.forward(inputs) - >>> for level_output in level_outputs: - ... print(tuple(level_output.shape)) - (1, 256, 128, 128) - (1, 256, 128, 128) - """ - - def __init__(self, - downsample_times=5, - num_stacks=2, - stage_channels=(256, 256, 384, 384, 384, 512), - stage_blocks=(2, 2, 2, 2, 2, 4), - feat_channel=256, - norm_cfg=dict(type='BN', requires_grad=True)): - # Protect mutable default arguments - norm_cfg = copy.deepcopy(norm_cfg) - super().__init__() - - self.num_stacks = num_stacks - assert self.num_stacks >= 1 - assert len(stage_channels) == len(stage_blocks) - assert len(stage_channels) > downsample_times - - cur_channel = stage_channels[0] - - self.stem = nn.Sequential( - ConvModule(3, 128, 7, padding=3, stride=2, norm_cfg=norm_cfg), - ResLayer(BasicBlock, 1, 128, 256, stride=2, norm_cfg=norm_cfg)) - - self.hourglass_modules = nn.ModuleList([ - HourglassModule(downsample_times, stage_channels, stage_blocks) - for _ in range(num_stacks) - ]) - - self.inters = ResLayer( - BasicBlock, - num_stacks - 1, - cur_channel, - cur_channel, - norm_cfg=norm_cfg) - - self.conv1x1s = nn.ModuleList([ - ConvModule( - cur_channel, cur_channel, 1, norm_cfg=norm_cfg, act_cfg=None) - for _ in range(num_stacks - 1) - ]) - - self.out_convs = nn.ModuleList([ - ConvModule( - cur_channel, feat_channel, 3, padding=1, norm_cfg=norm_cfg) - for _ in range(num_stacks) - ]) - - self.remap_convs = nn.ModuleList([ - ConvModule( - feat_channel, cur_channel, 1, norm_cfg=norm_cfg, act_cfg=None) - for _ in range(num_stacks - 1) - ]) - - self.relu = nn.ReLU(inplace=True) - - def init_weights(self, pretrained=None): - """Initialize the weights in backbone. - - Args: - pretrained (str, optional): Path to pre-trained weights. - Defaults to None. 
- """ - if isinstance(pretrained, str): - logger = get_root_logger() - load_checkpoint(self, pretrained, strict=False, logger=logger) - elif pretrained is None: - for m in self.modules(): - if isinstance(m, nn.Conv2d): - normal_init(m, std=0.001) - elif isinstance(m, (_BatchNorm, nn.GroupNorm)): - constant_init(m, 1) - else: - raise TypeError('pretrained must be a str or None') - - def forward(self, x): - """Model forward function.""" - inter_feat = self.stem(x) - out_feats = [] - - for ind in range(self.num_stacks): - single_hourglass = self.hourglass_modules[ind] - out_conv = self.out_convs[ind] - - hourglass_feat = single_hourglass(inter_feat) - out_feat = out_conv(hourglass_feat) - out_feats.append(out_feat) - - if ind < self.num_stacks - 1: - inter_feat = self.conv1x1s[ind]( - inter_feat) + self.remap_convs[ind]( - out_feat) - inter_feat = self.inters[ind](self.relu(inter_feat)) - - return out_feats diff --git a/main/transformer_utils/mmpose/models/backbones/hourglass_ae.py b/main/transformer_utils/mmpose/models/backbones/hourglass_ae.py deleted file mode 100644 index 5a700e5cb2157fd1dc16771145f065e991b270ea..0000000000000000000000000000000000000000 --- a/main/transformer_utils/mmpose/models/backbones/hourglass_ae.py +++ /dev/null @@ -1,212 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import copy - -import torch.nn as nn -from mmcv.cnn import ConvModule, MaxPool2d, constant_init, normal_init -from torch.nn.modules.batchnorm import _BatchNorm - -from mmpose.utils import get_root_logger -from ..builder import BACKBONES -from .base_backbone import BaseBackbone -from .utils import load_checkpoint - - -class HourglassAEModule(nn.Module): - """Modified Hourglass Module for HourglassNet_AE backbone. - - Generate module recursively and use BasicBlock as the base unit. - - Args: - depth (int): Depth of current HourglassModule. - stage_channels (list[int]): Feature channels of sub-modules in current - and follow-up HourglassModule. - norm_cfg (dict): Dictionary to construct and config norm layer. - """ - - def __init__(self, - depth, - stage_channels, - norm_cfg=dict(type='BN', requires_grad=True)): - # Protect mutable default arguments - norm_cfg = copy.deepcopy(norm_cfg) - super().__init__() - - self.depth = depth - - cur_channel = stage_channels[0] - next_channel = stage_channels[1] - - self.up1 = ConvModule( - cur_channel, cur_channel, 3, padding=1, norm_cfg=norm_cfg) - - self.pool1 = MaxPool2d(2, 2) - - self.low1 = ConvModule( - cur_channel, next_channel, 3, padding=1, norm_cfg=norm_cfg) - - if self.depth > 1: - self.low2 = HourglassAEModule(depth - 1, stage_channels[1:]) - else: - self.low2 = ConvModule( - next_channel, next_channel, 3, padding=1, norm_cfg=norm_cfg) - - self.low3 = ConvModule( - next_channel, cur_channel, 3, padding=1, norm_cfg=norm_cfg) - - self.up2 = nn.UpsamplingNearest2d(scale_factor=2) - - def forward(self, x): - """Model forward function.""" - up1 = self.up1(x) - pool1 = self.pool1(x) - low1 = self.low1(pool1) - low2 = self.low2(low1) - low3 = self.low3(low2) - up2 = self.up2(low3) - return up1 + up2 - - -@BACKBONES.register_module() -class HourglassAENet(BaseBackbone): - """Hourglass-AE Network proposed by Newell et al. - - Associative Embedding: End-to-End Learning for Joint - Detection and Grouping. - - More details can be found in the `paper - `__ . - - Args: - downsample_times (int): Downsample times in a HourglassModule. - num_stacks (int): Number of HourglassModule modules stacked, - 1 for Hourglass-52, 2 for Hourglass-104. 
- stage_channels (list[int]): Feature channel of each sub-module in a - HourglassModule. - stage_blocks (list[int]): Number of sub-modules stacked in a - HourglassModule. - feat_channels (int): Feature channel of conv after a HourglassModule. - norm_cfg (dict): Dictionary to construct and config norm layer. - - Example: - >>> from mmpose.models import HourglassAENet - >>> import torch - >>> self = HourglassAENet() - >>> self.eval() - >>> inputs = torch.rand(1, 3, 512, 512) - >>> level_outputs = self.forward(inputs) - >>> for level_output in level_outputs: - ... print(tuple(level_output.shape)) - (1, 34, 128, 128) - """ - - def __init__(self, - downsample_times=4, - num_stacks=1, - out_channels=34, - stage_channels=(256, 384, 512, 640, 768), - feat_channels=256, - norm_cfg=dict(type='BN', requires_grad=True)): - # Protect mutable default arguments - norm_cfg = copy.deepcopy(norm_cfg) - super().__init__() - - self.num_stacks = num_stacks - assert self.num_stacks >= 1 - assert len(stage_channels) > downsample_times - - cur_channels = stage_channels[0] - - self.stem = nn.Sequential( - ConvModule(3, 64, 7, padding=3, stride=2, norm_cfg=norm_cfg), - ConvModule(64, 128, 3, padding=1, norm_cfg=norm_cfg), - MaxPool2d(2, 2), - ConvModule(128, 128, 3, padding=1, norm_cfg=norm_cfg), - ConvModule(128, feat_channels, 3, padding=1, norm_cfg=norm_cfg), - ) - - self.hourglass_modules = nn.ModuleList([ - nn.Sequential( - HourglassAEModule( - downsample_times, stage_channels, norm_cfg=norm_cfg), - ConvModule( - feat_channels, - feat_channels, - 3, - padding=1, - norm_cfg=norm_cfg), - ConvModule( - feat_channels, - feat_channels, - 3, - padding=1, - norm_cfg=norm_cfg)) for _ in range(num_stacks) - ]) - - self.out_convs = nn.ModuleList([ - ConvModule( - cur_channels, - out_channels, - 1, - padding=0, - norm_cfg=None, - act_cfg=None) for _ in range(num_stacks) - ]) - - self.remap_out_convs = nn.ModuleList([ - ConvModule( - out_channels, - feat_channels, - 1, - norm_cfg=norm_cfg, - act_cfg=None) for _ in range(num_stacks - 1) - ]) - - self.remap_feature_convs = nn.ModuleList([ - ConvModule( - feat_channels, - feat_channels, - 1, - norm_cfg=norm_cfg, - act_cfg=None) for _ in range(num_stacks - 1) - ]) - - self.relu = nn.ReLU(inplace=True) - - def init_weights(self, pretrained=None): - """Initialize the weights in backbone. - - Args: - pretrained (str, optional): Path to pre-trained weights. - Defaults to None. 
- """ - if isinstance(pretrained, str): - logger = get_root_logger() - load_checkpoint(self, pretrained, strict=False, logger=logger) - elif pretrained is None: - for m in self.modules(): - if isinstance(m, nn.Conv2d): - normal_init(m, std=0.001) - elif isinstance(m, (_BatchNorm, nn.GroupNorm)): - constant_init(m, 1) - else: - raise TypeError('pretrained must be a str or None') - - def forward(self, x): - """Model forward function.""" - inter_feat = self.stem(x) - out_feats = [] - - for ind in range(self.num_stacks): - single_hourglass = self.hourglass_modules[ind] - out_conv = self.out_convs[ind] - - hourglass_feat = single_hourglass(inter_feat) - out_feat = out_conv(hourglass_feat) - out_feats.append(out_feat) - - if ind < self.num_stacks - 1: - inter_feat = inter_feat + self.remap_out_convs[ind]( - out_feat) + self.remap_feature_convs[ind]( - hourglass_feat) - - return out_feats diff --git a/main/transformer_utils/mmpose/models/backbones/hrformer.py b/main/transformer_utils/mmpose/models/backbones/hrformer.py deleted file mode 100644 index b843300a9fdb85908678c5a3fd45ce19e97ce2fe..0000000000000000000000000000000000000000 --- a/main/transformer_utils/mmpose/models/backbones/hrformer.py +++ /dev/null @@ -1,746 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. - -import math - -import torch -import torch.nn as nn -# from timm.models.layers import to_2tuple, trunc_normal_ -from mmcv.cnn import (build_activation_layer, build_conv_layer, - build_norm_layer, trunc_normal_init) -from mmcv.cnn.bricks.transformer import build_dropout -from mmcv.runner import BaseModule -from torch.nn.functional import pad - -from ..builder import BACKBONES -from .hrnet import Bottleneck, HRModule, HRNet - - -def nlc_to_nchw(x, hw_shape): - """Convert [N, L, C] shape tensor to [N, C, H, W] shape tensor. - - Args: - x (Tensor): The input tensor of shape [N, L, C] before conversion. - hw_shape (Sequence[int]): The height and width of output feature map. - - Returns: - Tensor: The output tensor of shape [N, C, H, W] after conversion. - """ - H, W = hw_shape - assert len(x.shape) == 3 - B, L, C = x.shape - assert L == H * W, 'The seq_len doesn\'t match H, W' - return x.transpose(1, 2).reshape(B, C, H, W) - - -def nchw_to_nlc(x): - """Flatten [N, C, H, W] shape tensor to [N, L, C] shape tensor. - - Args: - x (Tensor): The input tensor of shape [N, C, H, W] before conversion. - - Returns: - Tensor: The output tensor of shape [N, L, C] after conversion. - """ - assert len(x.shape) == 4 - return x.flatten(2).transpose(1, 2).contiguous() - - -def build_drop_path(drop_path_rate): - """Build drop path layer.""" - return build_dropout(dict(type='DropPath', drop_prob=drop_path_rate)) - - -class WindowMSA(BaseModule): - """Window based multi-head self-attention (W-MSA) module with relative - position bias. - - Args: - embed_dims (int): Number of input channels. - num_heads (int): Number of attention heads. - window_size (tuple[int]): The height and width of the window. - qkv_bias (bool, optional): If True, add a learnable bias to q, k, v. - Default: True. - qk_scale (float | None, optional): Override default qk scale of - head_dim ** -0.5 if set. Default: None. - attn_drop_rate (float, optional): Dropout ratio of attention weight. - Default: 0.0 - proj_drop_rate (float, optional): Dropout ratio of output. Default: 0. - with_rpe (bool, optional): If True, use relative position bias. - Default: True. - init_cfg (dict | None, optional): The Config for initialization. - Default: None. 
- """ - - def __init__(self, - embed_dims, - num_heads, - window_size, - qkv_bias=True, - qk_scale=None, - attn_drop_rate=0., - proj_drop_rate=0., - with_rpe=True, - init_cfg=None): - - super().__init__(init_cfg=init_cfg) - self.embed_dims = embed_dims - self.window_size = window_size # Wh, Ww - self.num_heads = num_heads - head_embed_dims = embed_dims // num_heads - self.scale = qk_scale or head_embed_dims**-0.5 - - self.with_rpe = with_rpe - if self.with_rpe: - # define a parameter table of relative position bias - self.relative_position_bias_table = nn.Parameter( - torch.zeros( - (2 * window_size[0] - 1) * (2 * window_size[1] - 1), - num_heads)) # 2*Wh-1 * 2*Ww-1, nH - - Wh, Ww = self.window_size - rel_index_coords = self.double_step_seq(2 * Ww - 1, Wh, 1, Ww) - rel_position_index = rel_index_coords + rel_index_coords.T - rel_position_index = rel_position_index.flip(1).contiguous() - self.register_buffer('relative_position_index', rel_position_index) - - self.qkv = nn.Linear(embed_dims, embed_dims * 3, bias=qkv_bias) - self.attn_drop = nn.Dropout(attn_drop_rate) - self.proj = nn.Linear(embed_dims, embed_dims) - self.proj_drop = nn.Dropout(proj_drop_rate) - - self.softmax = nn.Softmax(dim=-1) - - def init_weights(self): - trunc_normal_init(self.relative_position_bias_table, std=0.02) - - def forward(self, x, mask=None): - """ - Args: - - x (tensor): input features with shape of (B*num_windows, N, C) - mask (tensor | None, Optional): mask with shape of (num_windows, - Wh*Ww, Wh*Ww), value should be between (-inf, 0]. - """ - B, N, C = x.shape - qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, - C // self.num_heads).permute(2, 0, 3, 1, 4) - q, k, v = qkv[0], qkv[1], qkv[2] - - q = q * self.scale - attn = (q @ k.transpose(-2, -1)) - - if self.with_rpe: - relative_position_bias = self.relative_position_bias_table[ - self.relative_position_index.view(-1)].view( - self.window_size[0] * self.window_size[1], - self.window_size[0] * self.window_size[1], - -1) # Wh*Ww,Wh*Ww,nH - relative_position_bias = relative_position_bias.permute( - 2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww - attn = attn + relative_position_bias.unsqueeze(0) - - if mask is not None: - nW = mask.shape[0] - attn = attn.view(B // nW, nW, self.num_heads, N, - N) + mask.unsqueeze(1).unsqueeze(0) - attn = attn.view(-1, self.num_heads, N, N) - attn = self.softmax(attn) - - attn = self.attn_drop(attn) - - x = (attn @ v).transpose(1, 2).reshape(B, N, C) - x = self.proj(x) - x = self.proj_drop(x) - return x - - @staticmethod - def double_step_seq(step1, len1, step2, len2): - seq1 = torch.arange(0, step1 * len1, step1) - seq2 = torch.arange(0, step2 * len2, step2) - return (seq1[:, None] + seq2[None, :]).reshape(1, -1) - - -class LocalWindowSelfAttention(BaseModule): - r""" Local-window Self Attention (LSA) module with relative position bias. - - This module is the short-range self-attention module in the - Interlaced Sparse Self-Attention `_. - - Args: - embed_dims (int): Number of input channels. - num_heads (int): Number of attention heads. - window_size (tuple[int] | int): The height and width of the window. - qkv_bias (bool, optional): If True, add a learnable bias to q, k, v. - Default: True. - qk_scale (float | None, optional): Override default qk scale of - head_dim ** -0.5 if set. Default: None. - attn_drop_rate (float, optional): Dropout ratio of attention weight. - Default: 0.0 - proj_drop_rate (float, optional): Dropout ratio of output. Default: 0. - with_rpe (bool, optional): If True, use relative position bias. 
- Default: True. - with_pad_mask (bool, optional): If True, mask out the padded tokens in - the attention process. Default: False. - init_cfg (dict | None, optional): The Config for initialization. - Default: None. - """ - - def __init__(self, - embed_dims, - num_heads, - window_size, - qkv_bias=True, - qk_scale=None, - attn_drop_rate=0., - proj_drop_rate=0., - with_rpe=True, - with_pad_mask=False, - init_cfg=None): - super().__init__(init_cfg=init_cfg) - if isinstance(window_size, int): - window_size = (window_size, window_size) - self.window_size = window_size - self.with_pad_mask = with_pad_mask - self.attn = WindowMSA( - embed_dims=embed_dims, - num_heads=num_heads, - window_size=window_size, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - attn_drop_rate=attn_drop_rate, - proj_drop_rate=proj_drop_rate, - with_rpe=with_rpe, - init_cfg=init_cfg) - - def forward(self, x, H, W, **kwargs): - """Forward function.""" - B, N, C = x.shape - x = x.view(B, H, W, C) - Wh, Ww = self.window_size - - # center-pad the feature on H and W axes - pad_h = math.ceil(H / Wh) * Wh - H - pad_w = math.ceil(W / Ww) * Ww - W - x = pad(x, (0, 0, pad_w // 2, pad_w - pad_w // 2, pad_h // 2, - pad_h - pad_h // 2)) - - # permute - x = x.view(B, math.ceil(H / Wh), Wh, math.ceil(W / Ww), Ww, C) - x = x.permute(0, 1, 3, 2, 4, 5) - x = x.reshape(-1, Wh * Ww, C) # (B*num_window, Wh*Ww, C) - - # attention - if self.with_pad_mask and pad_h > 0 and pad_w > 0: - pad_mask = x.new_zeros(1, H, W, 1) - pad_mask = pad( - pad_mask, [ - 0, 0, pad_w // 2, pad_w - pad_w // 2, pad_h // 2, - pad_h - pad_h // 2 - ], - value=-float('inf')) - pad_mask = pad_mask.view(1, math.ceil(H / Wh), Wh, - math.ceil(W / Ww), Ww, 1) - pad_mask = pad_mask.permute(1, 3, 0, 2, 4, 5) - pad_mask = pad_mask.reshape(-1, Wh * Ww) - pad_mask = pad_mask[:, None, :].expand([-1, Wh * Ww, -1]) - out = self.attn(x, pad_mask, **kwargs) - else: - out = self.attn(x, **kwargs) - - # reverse permutation - out = out.reshape(B, math.ceil(H / Wh), math.ceil(W / Ww), Wh, Ww, C) - out = out.permute(0, 1, 3, 2, 4, 5) - out = out.reshape(B, H + pad_h, W + pad_w, C) - - # de-pad - out = out[:, pad_h // 2:H + pad_h // 2, pad_w // 2:W + pad_w // 2] - return out.reshape(B, N, C) - - -class CrossFFN(BaseModule): - r"""FFN with Depthwise Conv of HRFormer. - - Args: - in_features (int): The feature dimension. - hidden_features (int, optional): The hidden dimension of FFNs. - Defaults: The same as in_features. - act_cfg (dict, optional): Config of activation layer. - Default: dict(type='GELU'). - dw_act_cfg (dict, optional): Config of activation layer appended - right after DW Conv. Default: dict(type='GELU'). - norm_cfg (dict, optional): Config of norm layer. - Default: dict(type='SyncBN'). - init_cfg (dict | list | None, optional): The init config. - Default: None. 
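A sketch, not part of the patch: the LocalWindowSelfAttention forward above center-pads H and W to multiples of the window size, then folds windows into the batch axis before calling WindowMSA. The reshape logic in isolation (window_partition / window_reverse are illustrative helper names):

import torch

def window_partition(x, wh, ww):
    """(B, H, W, C) -> (B * num_windows, wh * ww, C); H and W already padded."""
    b, h, w, c = x.shape
    x = x.view(b, h // wh, wh, w // ww, ww, c)
    x = x.permute(0, 1, 3, 2, 4, 5).contiguous()
    return x.view(-1, wh * ww, c)

def window_reverse(windows, h, w, wh, ww):
    """Inverse of window_partition: (B * num_windows, wh * ww, C) -> (B, H, W, C)."""
    b = windows.shape[0] // ((h // wh) * (w // ww))
    x = windows.view(b, h // wh, w // ww, wh, ww, -1)
    return x.permute(0, 1, 3, 2, 4, 5).contiguous().view(b, h, w, -1)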
- """ - - def __init__(self, - in_features, - hidden_features=None, - out_features=None, - act_cfg=dict(type='GELU'), - dw_act_cfg=dict(type='GELU'), - norm_cfg=dict(type='SyncBN'), - init_cfg=None): - super().__init__(init_cfg=init_cfg) - out_features = out_features or in_features - hidden_features = hidden_features or in_features - self.fc1 = nn.Conv2d(in_features, hidden_features, kernel_size=1) - self.act1 = build_activation_layer(act_cfg) - self.norm1 = build_norm_layer(norm_cfg, hidden_features)[1] - self.dw3x3 = nn.Conv2d( - hidden_features, - hidden_features, - kernel_size=3, - stride=1, - groups=hidden_features, - padding=1) - self.act2 = build_activation_layer(dw_act_cfg) - self.norm2 = build_norm_layer(norm_cfg, hidden_features)[1] - self.fc2 = nn.Conv2d(hidden_features, out_features, kernel_size=1) - self.act3 = build_activation_layer(act_cfg) - self.norm3 = build_norm_layer(norm_cfg, out_features)[1] - - # put the modules togather - self.layers = [ - self.fc1, self.norm1, self.act1, self.dw3x3, self.norm2, self.act2, - self.fc2, self.norm3, self.act3 - ] - - def forward(self, x, H, W): - """Forward function.""" - x = nlc_to_nchw(x, (H, W)) - for layer in self.layers: - x = layer(x) - x = nchw_to_nlc(x) - return x - - -class HRFormerBlock(BaseModule): - """High-Resolution Block for HRFormer. - - Args: - in_features (int): The input dimension. - out_features (int): The output dimension. - num_heads (int): The number of head within each LSA. - window_size (int, optional): The window size for the LSA. - Default: 7 - mlp_ratio (int, optional): The expansion ration of FFN. - Default: 4 - act_cfg (dict, optional): Config of activation layer. - Default: dict(type='GELU'). - norm_cfg (dict, optional): Config of norm layer. - Default: dict(type='SyncBN'). - transformer_norm_cfg (dict, optional): Config of transformer norm - layer. Default: dict(type='LN', eps=1e-6). - init_cfg (dict | list | None, optional): The init config. - Default: None. 
- """ - - expansion = 1 - - def __init__(self, - in_features, - out_features, - num_heads, - window_size=7, - mlp_ratio=4.0, - drop_path=0.0, - act_cfg=dict(type='GELU'), - norm_cfg=dict(type='SyncBN'), - transformer_norm_cfg=dict(type='LN', eps=1e-6), - init_cfg=None, - **kwargs): - super(HRFormerBlock, self).__init__(init_cfg=init_cfg) - self.num_heads = num_heads - self.window_size = window_size - self.mlp_ratio = mlp_ratio - - self.norm1 = build_norm_layer(transformer_norm_cfg, in_features)[1] - self.attn = LocalWindowSelfAttention( - in_features, - num_heads=num_heads, - window_size=window_size, - init_cfg=None, - **kwargs) - - self.norm2 = build_norm_layer(transformer_norm_cfg, out_features)[1] - self.ffn = CrossFFN( - in_features=in_features, - hidden_features=int(in_features * mlp_ratio), - out_features=out_features, - norm_cfg=norm_cfg, - act_cfg=act_cfg, - dw_act_cfg=act_cfg, - init_cfg=None) - - self.drop_path = build_drop_path( - drop_path) if drop_path > 0.0 else nn.Identity() - - def forward(self, x): - """Forward function.""" - B, C, H, W = x.size() - # Attention - x = x.view(B, C, -1).permute(0, 2, 1) - x = x + self.drop_path(self.attn(self.norm1(x), H, W)) - # FFN - x = x + self.drop_path(self.ffn(self.norm2(x), H, W)) - x = x.permute(0, 2, 1).view(B, C, H, W) - return x - - def extra_repr(self): - """(Optional) Set the extra information about this module.""" - return 'num_heads={}, window_size={}, mlp_ratio={}'.format( - self.num_heads, self.window_size, self.mlp_ratio) - - -class HRFomerModule(HRModule): - """High-Resolution Module for HRFormer. - - Args: - num_branches (int): The number of branches in the HRFormerModule. - block (nn.Module): The building block of HRFormer. - The block should be the HRFormerBlock. - num_blocks (tuple): The number of blocks in each branch. - The length must be equal to num_branches. - num_inchannels (tuple): The number of input channels in each branch. - The length must be equal to num_branches. - num_channels (tuple): The number of channels in each branch. - The length must be equal to num_branches. - num_heads (tuple): The number of heads within the LSAs. - num_window_sizes (tuple): The window size for the LSAs. - num_mlp_ratios (tuple): The expansion ratio for the FFNs. - drop_path (int, optional): The drop path rate of HRFomer. - Default: 0.0 - multiscale_output (bool, optional): Whether to output multi-level - features produced by multiple branches. If False, only the first - level feature will be output. Default: True. - conv_cfg (dict, optional): Config of the conv layers. - Default: None. - norm_cfg (dict, optional): Config of the norm layers appended - right after conv. Default: dict(type='SyncBN', requires_grad=True) - transformer_norm_cfg (dict, optional): Config of the norm layers. - Default: dict(type='LN', eps=1e-6) - with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. Default: False - upsample_cfg(dict, optional): The config of upsample layers in fuse - layers. 
Default: dict(mode='bilinear', align_corners=False) - """ - - def __init__(self, - num_branches, - block, - num_blocks, - num_inchannels, - num_channels, - num_heads, - num_window_sizes, - num_mlp_ratios, - multiscale_output=True, - drop_paths=0.0, - with_rpe=True, - with_pad_mask=False, - conv_cfg=None, - norm_cfg=dict(type='SyncBN', requires_grad=True), - transformer_norm_cfg=dict(type='LN', eps=1e-6), - with_cp=False, - upsample_cfg=dict(mode='bilinear', align_corners=False)): - - self.transformer_norm_cfg = transformer_norm_cfg - self.drop_paths = drop_paths - self.num_heads = num_heads - self.num_window_sizes = num_window_sizes - self.num_mlp_ratios = num_mlp_ratios - self.with_rpe = with_rpe - self.with_pad_mask = with_pad_mask - - super().__init__(num_branches, block, num_blocks, num_inchannels, - num_channels, multiscale_output, with_cp, conv_cfg, - norm_cfg, upsample_cfg) - - def _make_one_branch(self, - branch_index, - block, - num_blocks, - num_channels, - stride=1): - """Build one branch.""" - # HRFormerBlock does not support down sample layer yet. - assert stride == 1 and self.in_channels[branch_index] == num_channels[ - branch_index] - layers = [] - layers.append( - block( - self.in_channels[branch_index], - num_channels[branch_index], - num_heads=self.num_heads[branch_index], - window_size=self.num_window_sizes[branch_index], - mlp_ratio=self.num_mlp_ratios[branch_index], - drop_path=self.drop_paths[0], - norm_cfg=self.norm_cfg, - transformer_norm_cfg=self.transformer_norm_cfg, - init_cfg=None, - with_rpe=self.with_rpe, - with_pad_mask=self.with_pad_mask)) - - self.in_channels[ - branch_index] = self.in_channels[branch_index] * block.expansion - for i in range(1, num_blocks[branch_index]): - layers.append( - block( - self.in_channels[branch_index], - num_channels[branch_index], - num_heads=self.num_heads[branch_index], - window_size=self.num_window_sizes[branch_index], - mlp_ratio=self.num_mlp_ratios[branch_index], - drop_path=self.drop_paths[i], - norm_cfg=self.norm_cfg, - transformer_norm_cfg=self.transformer_norm_cfg, - init_cfg=None, - with_rpe=self.with_rpe, - with_pad_mask=self.with_pad_mask)) - return nn.Sequential(*layers) - - def _make_fuse_layers(self): - """Build fuse layers.""" - if self.num_branches == 1: - return None - num_branches = self.num_branches - num_inchannels = self.in_channels - fuse_layers = [] - for i in range(num_branches if self.multiscale_output else 1): - fuse_layer = [] - for j in range(num_branches): - if j > i: - fuse_layer.append( - nn.Sequential( - build_conv_layer( - self.conv_cfg, - num_inchannels[j], - num_inchannels[i], - kernel_size=1, - stride=1, - bias=False), - build_norm_layer(self.norm_cfg, - num_inchannels[i])[1], - nn.Upsample( - scale_factor=2**(j - i), - mode=self.upsample_cfg['mode'], - align_corners=self. 
- upsample_cfg['align_corners']))) - elif j == i: - fuse_layer.append(None) - else: - conv3x3s = [] - for k in range(i - j): - if k == i - j - 1: - num_outchannels_conv3x3 = num_inchannels[i] - with_out_act = False - else: - num_outchannels_conv3x3 = num_inchannels[j] - with_out_act = True - sub_modules = [ - build_conv_layer( - self.conv_cfg, - num_inchannels[j], - num_inchannels[j], - kernel_size=3, - stride=2, - padding=1, - groups=num_inchannels[j], - bias=False, - ), - build_norm_layer(self.norm_cfg, - num_inchannels[j])[1], - build_conv_layer( - self.conv_cfg, - num_inchannels[j], - num_outchannels_conv3x3, - kernel_size=1, - stride=1, - bias=False, - ), - build_norm_layer(self.norm_cfg, - num_outchannels_conv3x3)[1] - ] - if with_out_act: - sub_modules.append(nn.ReLU(False)) - conv3x3s.append(nn.Sequential(*sub_modules)) - fuse_layer.append(nn.Sequential(*conv3x3s)) - fuse_layers.append(nn.ModuleList(fuse_layer)) - - return nn.ModuleList(fuse_layers) - - def get_num_inchannels(self): - """Return the number of input channels.""" - return self.in_channels - - -@BACKBONES.register_module() -class HRFormer(HRNet): - """HRFormer backbone. - - This backbone is the implementation of `HRFormer: High-Resolution - Transformer for Dense Prediction `_. - - Args: - extra (dict): Detailed configuration for each stage of HRNet. - There must be 4 stages, the configuration for each stage must have - 5 keys: - - - num_modules (int): The number of HRModule in this stage. - - num_branches (int): The number of branches in the HRModule. - - block (str): The type of block. - - num_blocks (tuple): The number of blocks in each branch. - The length must be equal to num_branches. - - num_channels (tuple): The number of channels in each branch. - The length must be equal to num_branches. - in_channels (int): Number of input image channels. Normally 3. - conv_cfg (dict): Dictionary to construct and config conv layer. - Default: None. - norm_cfg (dict): Config of norm layer. - Use `SyncBN` by default. - transformer_norm_cfg (dict): Config of transformer norm layer. - Use `LN` by default. - norm_eval (bool): Whether to set norm layers to eval mode, namely, - freeze running stats (mean and var). Note: Effect on Batch Norm - and its variants only. Default: False. - zero_init_residual (bool): Whether to use zero init for last norm layer - in resblocks to let them behave as identity. Default: False. - frozen_stages (int): Stages to be frozen (stop grad and set eval mode). - -1 means not freezing any parameters. Default: -1. 
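A sketch, not part of the patch: the HRFormer constructor further down in this hunk spreads a linearly increasing stochastic-depth rate over every transformer block of stages 2-4 and slices the flat list per stage. The schedule in isolation (drop_path_schedule is an illustrative name):

import torch

def drop_path_schedule(depths, max_rate):
    """E.g. depths=[4, 12, 4] -> three per-stage lists of linearly growing drop rates."""
    rates = [x.item() for x in torch.linspace(0, max_rate, sum(depths))]
    splits, start = [], 0
    for d in depths:
        splits.append(rates[start:start + d])
        start += d
    return splits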
- Example: - >>> from mmpose.models import HRFormer - >>> import torch - >>> extra = dict( - >>> stage1=dict( - >>> num_modules=1, - >>> num_branches=1, - >>> block='BOTTLENECK', - >>> num_blocks=(2, ), - >>> num_channels=(64, )), - >>> stage2=dict( - >>> num_modules=1, - >>> num_branches=2, - >>> block='HRFORMER', - >>> window_sizes=(7, 7), - >>> num_heads=(1, 2), - >>> mlp_ratios=(4, 4), - >>> num_blocks=(2, 2), - >>> num_channels=(32, 64)), - >>> stage3=dict( - >>> num_modules=4, - >>> num_branches=3, - >>> block='HRFORMER', - >>> window_sizes=(7, 7, 7), - >>> num_heads=(1, 2, 4), - >>> mlp_ratios=(4, 4, 4), - >>> num_blocks=(2, 2, 2), - >>> num_channels=(32, 64, 128)), - >>> stage4=dict( - >>> num_modules=2, - >>> num_branches=4, - >>> block='HRFORMER', - >>> window_sizes=(7, 7, 7, 7), - >>> num_heads=(1, 2, 4, 8), - >>> mlp_ratios=(4, 4, 4, 4), - >>> num_blocks=(2, 2, 2, 2), - >>> num_channels=(32, 64, 128, 256))) - >>> self = HRFormer(extra, in_channels=1) - >>> self.eval() - >>> inputs = torch.rand(1, 1, 32, 32) - >>> level_outputs = self.forward(inputs) - >>> for level_out in level_outputs: - ... print(tuple(level_out.shape)) - (1, 32, 8, 8) - (1, 64, 4, 4) - (1, 128, 2, 2) - (1, 256, 1, 1) - """ - - blocks_dict = {'BOTTLENECK': Bottleneck, 'HRFORMERBLOCK': HRFormerBlock} - - def __init__(self, - extra, - in_channels=3, - conv_cfg=None, - norm_cfg=dict(type='BN', requires_grad=True), - transformer_norm_cfg=dict(type='LN', eps=1e-6), - norm_eval=False, - with_cp=False, - zero_init_residual=False, - frozen_stages=-1): - - # stochastic depth - depths = [ - extra[stage]['num_blocks'][0] * extra[stage]['num_modules'] - for stage in ['stage2', 'stage3', 'stage4'] - ] - depth_s2, depth_s3, _ = depths - drop_path_rate = extra['drop_path_rate'] - dpr = [ - x.item() for x in torch.linspace(0, drop_path_rate, sum(depths)) - ] - extra['stage2']['drop_path_rates'] = dpr[0:depth_s2] - extra['stage3']['drop_path_rates'] = dpr[depth_s2:depth_s2 + depth_s3] - extra['stage4']['drop_path_rates'] = dpr[depth_s2 + depth_s3:] - - # HRFormer use bilinear upsample as default - upsample_cfg = extra.get('upsample', { - 'mode': 'bilinear', - 'align_corners': False - }) - extra['upsample'] = upsample_cfg - self.transformer_norm_cfg = transformer_norm_cfg - self.with_rpe = extra.get('with_rpe', True) - self.with_pad_mask = extra.get('with_pad_mask', False) - - super().__init__(extra, in_channels, conv_cfg, norm_cfg, norm_eval, - with_cp, zero_init_residual, frozen_stages) - - def _make_stage(self, - layer_config, - num_inchannels, - multiscale_output=True): - """Make each stage.""" - num_modules = layer_config['num_modules'] - num_branches = layer_config['num_branches'] - num_blocks = layer_config['num_blocks'] - num_channels = layer_config['num_channels'] - block = self.blocks_dict[layer_config['block']] - num_heads = layer_config['num_heads'] - num_window_sizes = layer_config['window_sizes'] - num_mlp_ratios = layer_config['mlp_ratios'] - drop_path_rates = layer_config['drop_path_rates'] - - modules = [] - for i in range(num_modules): - # multiscale_output is only used at the last module - if not multiscale_output and i == num_modules - 1: - reset_multiscale_output = False - else: - reset_multiscale_output = True - - modules.append( - HRFomerModule( - num_branches, - block, - num_blocks, - num_inchannels, - num_channels, - num_heads, - num_window_sizes, - num_mlp_ratios, - reset_multiscale_output, - drop_paths=drop_path_rates[num_blocks[0] * - i:num_blocks[0] * (i + 1)], - with_rpe=self.with_rpe, - 
with_pad_mask=self.with_pad_mask, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - transformer_norm_cfg=self.transformer_norm_cfg, - with_cp=self.with_cp, - upsample_cfg=self.upsample_cfg)) - num_inchannels = modules[-1].get_num_inchannels() - - return nn.Sequential(*modules), num_inchannels diff --git a/main/transformer_utils/mmpose/models/backbones/hrnet.py b/main/transformer_utils/mmpose/models/backbones/hrnet.py deleted file mode 100644 index 87dc8cef555b5e8d78fcc69293047b0cbe2ea8a6..0000000000000000000000000000000000000000 --- a/main/transformer_utils/mmpose/models/backbones/hrnet.py +++ /dev/null @@ -1,604 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import copy - -import torch.nn as nn -from mmcv.cnn import (build_conv_layer, build_norm_layer, constant_init, - normal_init) -from torch.nn.modules.batchnorm import _BatchNorm - -from mmpose.utils import get_root_logger -from ..builder import BACKBONES -from .resnet import BasicBlock, Bottleneck, get_expansion -from .utils import load_checkpoint - - -class HRModule(nn.Module): - """High-Resolution Module for HRNet. - - In this module, every branch has 4 BasicBlocks/Bottlenecks. Fusion/Exchange - is in this module. - """ - - def __init__(self, - num_branches, - blocks, - num_blocks, - in_channels, - num_channels, - multiscale_output=False, - with_cp=False, - conv_cfg=None, - norm_cfg=dict(type='BN'), - upsample_cfg=dict(mode='nearest', align_corners=None)): - - # Protect mutable default arguments - norm_cfg = copy.deepcopy(norm_cfg) - super().__init__() - self._check_branches(num_branches, num_blocks, in_channels, - num_channels) - - self.in_channels = in_channels - self.num_branches = num_branches - - self.multiscale_output = multiscale_output - self.norm_cfg = norm_cfg - self.conv_cfg = conv_cfg - self.upsample_cfg = upsample_cfg - self.with_cp = with_cp - self.branches = self._make_branches(num_branches, blocks, num_blocks, - num_channels) - self.fuse_layers = self._make_fuse_layers() - self.relu = nn.ReLU(inplace=True) - - @staticmethod - def _check_branches(num_branches, num_blocks, in_channels, num_channels): - """Check input to avoid ValueError.""" - if num_branches != len(num_blocks): - error_msg = f'NUM_BRANCHES({num_branches}) ' \ - f'!= NUM_BLOCKS({len(num_blocks)})' - raise ValueError(error_msg) - - if num_branches != len(num_channels): - error_msg = f'NUM_BRANCHES({num_branches}) ' \ - f'!= NUM_CHANNELS({len(num_channels)})' - raise ValueError(error_msg) - - if num_branches != len(in_channels): - error_msg = f'NUM_BRANCHES({num_branches}) ' \ - f'!= NUM_INCHANNELS({len(in_channels)})' - raise ValueError(error_msg) - - def _make_one_branch(self, - branch_index, - block, - num_blocks, - num_channels, - stride=1): - """Make one branch.""" - downsample = None - if stride != 1 or \ - self.in_channels[branch_index] != \ - num_channels[branch_index] * get_expansion(block): - downsample = nn.Sequential( - build_conv_layer( - self.conv_cfg, - self.in_channels[branch_index], - num_channels[branch_index] * get_expansion(block), - kernel_size=1, - stride=stride, - bias=False), - build_norm_layer( - self.norm_cfg, - num_channels[branch_index] * get_expansion(block))[1]) - - layers = [] - layers.append( - block( - self.in_channels[branch_index], - num_channels[branch_index] * get_expansion(block), - stride=stride, - downsample=downsample, - with_cp=self.with_cp, - norm_cfg=self.norm_cfg, - conv_cfg=self.conv_cfg)) - self.in_channels[branch_index] = \ - num_channels[branch_index] * get_expansion(block) - for _ in range(1, 
num_blocks[branch_index]): - layers.append( - block( - self.in_channels[branch_index], - num_channels[branch_index] * get_expansion(block), - with_cp=self.with_cp, - norm_cfg=self.norm_cfg, - conv_cfg=self.conv_cfg)) - - return nn.Sequential(*layers) - - def _make_branches(self, num_branches, block, num_blocks, num_channels): - """Make branches.""" - branches = [] - - for i in range(num_branches): - branches.append( - self._make_one_branch(i, block, num_blocks, num_channels)) - - return nn.ModuleList(branches) - - def _make_fuse_layers(self): - """Make fuse layer.""" - if self.num_branches == 1: - return None - - num_branches = self.num_branches - in_channels = self.in_channels - fuse_layers = [] - num_out_branches = num_branches if self.multiscale_output else 1 - - for i in range(num_out_branches): - fuse_layer = [] - for j in range(num_branches): - if j > i: - fuse_layer.append( - nn.Sequential( - build_conv_layer( - self.conv_cfg, - in_channels[j], - in_channels[i], - kernel_size=1, - stride=1, - padding=0, - bias=False), - build_norm_layer(self.norm_cfg, in_channels[i])[1], - nn.Upsample( - scale_factor=2**(j - i), - mode=self.upsample_cfg['mode'], - align_corners=self. - upsample_cfg['align_corners']))) - elif j == i: - fuse_layer.append(None) - else: - conv_downsamples = [] - for k in range(i - j): - if k == i - j - 1: - conv_downsamples.append( - nn.Sequential( - build_conv_layer( - self.conv_cfg, - in_channels[j], - in_channels[i], - kernel_size=3, - stride=2, - padding=1, - bias=False), - build_norm_layer(self.norm_cfg, - in_channels[i])[1])) - else: - conv_downsamples.append( - nn.Sequential( - build_conv_layer( - self.conv_cfg, - in_channels[j], - in_channels[j], - kernel_size=3, - stride=2, - padding=1, - bias=False), - build_norm_layer(self.norm_cfg, - in_channels[j])[1], - nn.ReLU(inplace=True))) - fuse_layer.append(nn.Sequential(*conv_downsamples)) - fuse_layers.append(nn.ModuleList(fuse_layer)) - - return nn.ModuleList(fuse_layers) - - def forward(self, x): - """Forward function.""" - if self.num_branches == 1: - return [self.branches[0](x[0])] - - for i in range(self.num_branches): - x[i] = self.branches[i](x[i]) - - x_fuse = [] - for i in range(len(self.fuse_layers)): - y = 0 - for j in range(self.num_branches): - if i == j: - y += x[j] - else: - y += self.fuse_layers[i][j](x[j]) - x_fuse.append(self.relu(y)) - return x_fuse - - -@BACKBONES.register_module() -class HRNet(nn.Module): - """HRNet backbone. - - `High-Resolution Representations for Labeling Pixels and Regions - `__ - - Args: - extra (dict): detailed configuration for each stage of HRNet. - in_channels (int): Number of input image channels. Default: 3. - conv_cfg (dict): dictionary to construct and config conv layer. - norm_cfg (dict): dictionary to construct and config norm layer. - norm_eval (bool): Whether to set norm layers to eval mode, namely, - freeze running stats (mean and var). Note: Effect on Batch Norm - and its variants only. Default: False - with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. - zero_init_residual (bool): whether to use zero init for last norm layer - in resblocks to let them behave as identity. - frozen_stages (int): Stages to be frozen (stop grad and set eval mode). - -1 means not freezing any parameters. Default: -1. 
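A sketch, not part of the patch: HRModule._make_fuse_layers above sums every branch into every output, with identity for j == i, a 1x1 conv plus 2**(j-i) upsampling when branch j is coarser (j > i), and a chain of stride-2 3x3 convs when it is finer (j < i). A minimal two-branch version of that fusion (TwoBranchFuse is an illustrative name, and even H, W are assumed):

import torch.nn as nn
import torch.nn.functional as F

class TwoBranchFuse(nn.Module):
    """Fuse a high-res (c_hi, H, W) branch with a low-res (c_lo, H/2, W/2) branch."""
    def __init__(self, c_hi, c_lo):
        super().__init__()
        self.lo_to_hi = nn.Conv2d(c_lo, c_hi, 1, bias=False)                 # 1x1, then upsample
        self.hi_to_lo = nn.Conv2d(c_hi, c_lo, 3, stride=2, padding=1, bias=False)

    def forward(self, x_hi, x_lo):
        y_hi = x_hi + F.interpolate(self.lo_to_hi(x_lo), scale_factor=2, mode='nearest')
        y_lo = x_lo + self.hi_to_lo(x_hi)
        return F.relu(y_hi), F.relu(y_lo)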
- - Example: - >>> from mmpose.models import HRNet - >>> import torch - >>> extra = dict( - >>> stage1=dict( - >>> num_modules=1, - >>> num_branches=1, - >>> block='BOTTLENECK', - >>> num_blocks=(4, ), - >>> num_channels=(64, )), - >>> stage2=dict( - >>> num_modules=1, - >>> num_branches=2, - >>> block='BASIC', - >>> num_blocks=(4, 4), - >>> num_channels=(32, 64)), - >>> stage3=dict( - >>> num_modules=4, - >>> num_branches=3, - >>> block='BASIC', - >>> num_blocks=(4, 4, 4), - >>> num_channels=(32, 64, 128)), - >>> stage4=dict( - >>> num_modules=3, - >>> num_branches=4, - >>> block='BASIC', - >>> num_blocks=(4, 4, 4, 4), - >>> num_channels=(32, 64, 128, 256))) - >>> self = HRNet(extra, in_channels=1) - >>> self.eval() - >>> inputs = torch.rand(1, 1, 32, 32) - >>> level_outputs = self.forward(inputs) - >>> for level_out in level_outputs: - ... print(tuple(level_out.shape)) - (1, 32, 8, 8) - """ - - blocks_dict = {'BASIC': BasicBlock, 'BOTTLENECK': Bottleneck} - - def __init__(self, - extra, - in_channels=3, - conv_cfg=None, - norm_cfg=dict(type='BN'), - norm_eval=False, - with_cp=False, - zero_init_residual=False, - frozen_stages=-1): - # Protect mutable default arguments - norm_cfg = copy.deepcopy(norm_cfg) - super().__init__() - self.extra = extra - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg - self.norm_eval = norm_eval - self.with_cp = with_cp - self.zero_init_residual = zero_init_residual - self.frozen_stages = frozen_stages - - # stem net - self.norm1_name, norm1 = build_norm_layer(self.norm_cfg, 64, postfix=1) - self.norm2_name, norm2 = build_norm_layer(self.norm_cfg, 64, postfix=2) - - self.conv1 = build_conv_layer( - self.conv_cfg, - in_channels, - 64, - kernel_size=3, - stride=2, - padding=1, - bias=False) - - self.add_module(self.norm1_name, norm1) - self.conv2 = build_conv_layer( - self.conv_cfg, - 64, - 64, - kernel_size=3, - stride=2, - padding=1, - bias=False) - - self.add_module(self.norm2_name, norm2) - self.relu = nn.ReLU(inplace=True) - - self.upsample_cfg = self.extra.get('upsample', { - 'mode': 'nearest', - 'align_corners': None - }) - - # stage 1 - self.stage1_cfg = self.extra['stage1'] - num_channels = self.stage1_cfg['num_channels'][0] - block_type = self.stage1_cfg['block'] - num_blocks = self.stage1_cfg['num_blocks'][0] - - block = self.blocks_dict[block_type] - stage1_out_channels = num_channels * get_expansion(block) - self.layer1 = self._make_layer(block, 64, stage1_out_channels, - num_blocks) - - # stage 2 - self.stage2_cfg = self.extra['stage2'] - num_channels = self.stage2_cfg['num_channels'] - block_type = self.stage2_cfg['block'] - - block = self.blocks_dict[block_type] - num_channels = [ - channel * get_expansion(block) for channel in num_channels - ] - self.transition1 = self._make_transition_layer([stage1_out_channels], - num_channels) - self.stage2, pre_stage_channels = self._make_stage( - self.stage2_cfg, num_channels) - - # stage 3 - self.stage3_cfg = self.extra['stage3'] - num_channels = self.stage3_cfg['num_channels'] - block_type = self.stage3_cfg['block'] - - block = self.blocks_dict[block_type] - num_channels = [ - channel * get_expansion(block) for channel in num_channels - ] - self.transition2 = self._make_transition_layer(pre_stage_channels, - num_channels) - self.stage3, pre_stage_channels = self._make_stage( - self.stage3_cfg, num_channels) - - # stage 4 - self.stage4_cfg = self.extra['stage4'] - num_channels = self.stage4_cfg['num_channels'] - block_type = self.stage4_cfg['block'] - - block = self.blocks_dict[block_type] - 
num_channels = [ - channel * get_expansion(block) for channel in num_channels - ] - self.transition3 = self._make_transition_layer(pre_stage_channels, - num_channels) - - self.stage4, pre_stage_channels = self._make_stage( - self.stage4_cfg, - num_channels, - multiscale_output=self.stage4_cfg.get('multiscale_output', False)) - - self._freeze_stages() - - @property - def norm1(self): - """nn.Module: the normalization layer named "norm1" """ - return getattr(self, self.norm1_name) - - @property - def norm2(self): - """nn.Module: the normalization layer named "norm2" """ - return getattr(self, self.norm2_name) - - def _make_transition_layer(self, num_channels_pre_layer, - num_channels_cur_layer): - """Make transition layer.""" - num_branches_cur = len(num_channels_cur_layer) - num_branches_pre = len(num_channels_pre_layer) - - transition_layers = [] - for i in range(num_branches_cur): - if i < num_branches_pre: - if num_channels_cur_layer[i] != num_channels_pre_layer[i]: - transition_layers.append( - nn.Sequential( - build_conv_layer( - self.conv_cfg, - num_channels_pre_layer[i], - num_channels_cur_layer[i], - kernel_size=3, - stride=1, - padding=1, - bias=False), - build_norm_layer(self.norm_cfg, - num_channels_cur_layer[i])[1], - nn.ReLU(inplace=True))) - else: - transition_layers.append(None) - else: - conv_downsamples = [] - for j in range(i + 1 - num_branches_pre): - in_channels = num_channels_pre_layer[-1] - out_channels = num_channels_cur_layer[i] \ - if j == i - num_branches_pre else in_channels - conv_downsamples.append( - nn.Sequential( - build_conv_layer( - self.conv_cfg, - in_channels, - out_channels, - kernel_size=3, - stride=2, - padding=1, - bias=False), - build_norm_layer(self.norm_cfg, out_channels)[1], - nn.ReLU(inplace=True))) - transition_layers.append(nn.Sequential(*conv_downsamples)) - - return nn.ModuleList(transition_layers) - - def _make_layer(self, block, in_channels, out_channels, blocks, stride=1): - """Make layer.""" - downsample = None - if stride != 1 or in_channels != out_channels: - downsample = nn.Sequential( - build_conv_layer( - self.conv_cfg, - in_channels, - out_channels, - kernel_size=1, - stride=stride, - bias=False), - build_norm_layer(self.norm_cfg, out_channels)[1]) - - layers = [] - layers.append( - block( - in_channels, - out_channels, - stride=stride, - downsample=downsample, - with_cp=self.with_cp, - norm_cfg=self.norm_cfg, - conv_cfg=self.conv_cfg)) - for _ in range(1, blocks): - layers.append( - block( - out_channels, - out_channels, - with_cp=self.with_cp, - norm_cfg=self.norm_cfg, - conv_cfg=self.conv_cfg)) - - return nn.Sequential(*layers) - - def _make_stage(self, layer_config, in_channels, multiscale_output=True): - """Make stage.""" - num_modules = layer_config['num_modules'] - num_branches = layer_config['num_branches'] - num_blocks = layer_config['num_blocks'] - num_channels = layer_config['num_channels'] - block = self.blocks_dict[layer_config['block']] - - hr_modules = [] - for i in range(num_modules): - # multi_scale_output is only used for the last module - if not multiscale_output and i == num_modules - 1: - reset_multiscale_output = False - else: - reset_multiscale_output = True - - hr_modules.append( - HRModule( - num_branches, - block, - num_blocks, - in_channels, - num_channels, - reset_multiscale_output, - with_cp=self.with_cp, - norm_cfg=self.norm_cfg, - conv_cfg=self.conv_cfg, - upsample_cfg=self.upsample_cfg)) - - in_channels = hr_modules[-1].in_channels - - return nn.Sequential(*hr_modules), in_channels - - def 
_freeze_stages(self): - """Freeze parameters.""" - if self.frozen_stages >= 0: - self.norm1.eval() - self.norm2.eval() - - for m in [self.conv1, self.norm1, self.conv2, self.norm2]: - for param in m.parameters(): - param.requires_grad = False - - for i in range(1, self.frozen_stages + 1): - if i == 1: - m = getattr(self, 'layer1') - else: - m = getattr(self, f'stage{i}') - - m.eval() - for param in m.parameters(): - param.requires_grad = False - - if i < 4: - m = getattr(self, f'transition{i}') - m.eval() - for param in m.parameters(): - param.requires_grad = False - - def init_weights(self, pretrained=None): - """Initialize the weights in backbone. - - Args: - pretrained (str, optional): Path to pre-trained weights. - Defaults to None. - """ - if isinstance(pretrained, str): - logger = get_root_logger() - load_checkpoint(self, pretrained, strict=False, logger=logger) - elif pretrained is None: - for m in self.modules(): - if isinstance(m, nn.Conv2d): - normal_init(m, std=0.001) - elif isinstance(m, (_BatchNorm, nn.GroupNorm)): - constant_init(m, 1) - - if self.zero_init_residual: - for m in self.modules(): - if isinstance(m, Bottleneck): - constant_init(m.norm3, 0) - elif isinstance(m, BasicBlock): - constant_init(m.norm2, 0) - else: - raise TypeError('pretrained must be a str or None') - - def forward(self, x): - """Forward function.""" - x = self.conv1(x) - x = self.norm1(x) - x = self.relu(x) - x = self.conv2(x) - x = self.norm2(x) - x = self.relu(x) - x = self.layer1(x) - - x_list = [] - for i in range(self.stage2_cfg['num_branches']): - if self.transition1[i] is not None: - x_list.append(self.transition1[i](x)) - else: - x_list.append(x) - y_list = self.stage2(x_list) - - x_list = [] - for i in range(self.stage3_cfg['num_branches']): - if self.transition2[i] is not None: - x_list.append(self.transition2[i](y_list[-1])) - else: - x_list.append(y_list[i]) - y_list = self.stage3(x_list) - - x_list = [] - for i in range(self.stage4_cfg['num_branches']): - if self.transition3[i] is not None: - x_list.append(self.transition3[i](y_list[-1])) - else: - x_list.append(y_list[i]) - y_list = self.stage4(x_list) - - return y_list - - def train(self, mode=True): - """Convert the model into training mode.""" - super().train(mode) - self._freeze_stages() - if mode and self.norm_eval: - for m in self.modules(): - if isinstance(m, _BatchNorm): - m.eval() diff --git a/main/transformer_utils/mmpose/models/backbones/hrt.py b/main/transformer_utils/mmpose/models/backbones/hrt.py deleted file mode 100644 index 67be3d4429d03360698701b7cd6e67e7c7a0b4ad..0000000000000000000000000000000000000000 --- a/main/transformer_utils/mmpose/models/backbones/hrt.py +++ /dev/null @@ -1,676 +0,0 @@ -# -------------------------------------------------------- -# High Resolution Transformer -# Copyright (c) 2021 Microsoft -# Licensed under The MIT License [see LICENSE for details] -# Written by Rao Fu, RainbowSecret -# -------------------------------------------------------- - -import pdb -import torch -import torch.nn as nn -from mmcv.cnn import ( - build_conv_layer, - build_norm_layer, - constant_init, - kaiming_init, - normal_init, -) -# from mmcv.runner import load_checkpoint -from .hrt_checkpoint import load_checkpoint -from mmcv.runner.checkpoint import load_state_dict -from mmcv.utils.parrots_wrapper import _BatchNorm - -from mmpose.models.utils.ops import resize -from mmpose.utils import get_root_logger -from ..builder import BACKBONES -from .modules.bottleneck_block import Bottleneck -from .modules.transformer_block 
import GeneralTransformerBlock - - -class HighResolutionTransformerModule(nn.Module): - def __init__( - self, - num_branches, - blocks, - num_blocks, - in_channels, - num_channels, - multiscale_output, - with_cp=False, - conv_cfg=None, - norm_cfg=dict(type="BN", requires_grad=True), - num_heads=None, - num_window_sizes=None, - num_mlp_ratios=None, - drop_paths=0.0, - ): - super(HighResolutionTransformerModule, self).__init__() - self._check_branches(num_branches, num_blocks, in_channels, num_channels) - - self.in_channels = in_channels - self.num_branches = num_branches - - self.multiscale_output = multiscale_output - self.norm_cfg = norm_cfg - self.conv_cfg = conv_cfg - self.with_cp = with_cp - self.branches = self._make_branches( - num_branches, - blocks, - num_blocks, - num_channels, - num_heads, - num_window_sizes, - num_mlp_ratios, - drop_paths, - ) - self.fuse_layers = self._make_fuse_layers() - self.relu = nn.ReLU(inplace=True) - - # MHSA parameters - self.num_heads = num_heads - self.num_window_sizes = num_window_sizes - self.num_mlp_ratios = num_mlp_ratios - - def _check_branches(self, num_branches, num_blocks, in_channels, num_channels): - logger = get_root_logger() - if num_branches != len(num_blocks): - error_msg = "NUM_BRANCHES({}) <> NUM_BLOCKS({})".format( - num_branches, len(num_blocks) - ) - logger.error(error_msg) - raise ValueError(error_msg) - - if num_branches != len(num_channels): - error_msg = "NUM_BRANCHES({}) <> NUM_CHANNELS({})".format( - num_branches, len(num_channels) - ) - logger.error(error_msg) - raise ValueError(error_msg) - - if num_branches != len(in_channels): - error_msg = "NUM_BRANCHES({}) <> IN_CHANNELS({})".format( - num_branches, len(in_channels) - ) - logger.error(error_msg) - raise ValueError(error_msg) - - def _make_one_branch( - self, - branch_index, - block, - num_blocks, - num_channels, - num_heads, - num_window_sizes, - num_mlp_ratios, - drop_paths, - stride=1, - ): - """Make one branch.""" - downsample = None - if ( - stride != 1 - or self.in_channels[branch_index] - != num_channels[branch_index] * block.expansion - ): - downsample = nn.Sequential( - build_conv_layer( - self.conv_cfg, - self.in_channels[branch_index], - num_channels[branch_index] * block.expansion, - kernel_size=1, - stride=stride, - bias=False, - ), - build_norm_layer( - self.norm_cfg, num_channels[branch_index] * block.expansion - )[1], - ) - - layers = [] - - layers.append( - block( - self.in_channels[branch_index], - num_channels[branch_index], - num_heads=num_heads[branch_index], - window_size=num_window_sizes[branch_index], - mlp_ratio=num_mlp_ratios[branch_index], - drop_path=drop_paths[0], - norm_cfg=self.norm_cfg, - conv_cfg=self.conv_cfg, - ) - ) - self.in_channels[branch_index] = num_channels[branch_index] * block.expansion - for i in range(1, num_blocks[branch_index]): - layers.append( - block( - self.in_channels[branch_index], - num_channels[branch_index], - num_heads=num_heads[branch_index], - window_size=num_window_sizes[branch_index], - mlp_ratio=num_mlp_ratios[branch_index], - drop_path=drop_paths[i], - norm_cfg=self.norm_cfg, - conv_cfg=self.conv_cfg, - ) - ) - - return nn.Sequential(*layers) - - def _make_branches( - self, - num_branches, - block, - num_blocks, - num_channels, - num_heads, - num_window_sizes, - num_mlp_ratios, - drop_paths, - ): - """Make branches.""" - branches = [] - - for i in range(num_branches): - branches.append( - self._make_one_branch( - i, - block, - num_blocks, - num_channels, - num_heads, - num_window_sizes, - num_mlp_ratios, - 
drop_paths, - ) - ) - - return nn.ModuleList(branches) - - def _make_fuse_layers(self): - """Build fuse layer.""" - if self.num_branches == 1: - return None - - num_branches = self.num_branches - in_channels = self.in_channels - fuse_layers = [] - num_out_branches = num_branches if self.multiscale_output else 1 - for i in range(num_out_branches): - fuse_layer = [] - for j in range(num_branches): - if j > i: - fuse_layer.append( - nn.Sequential( - build_conv_layer( - self.conv_cfg, - in_channels[j], - in_channels[i], - kernel_size=1, - stride=1, - padding=0, - bias=False, - ), - build_norm_layer(self.norm_cfg, in_channels[i])[1], - nn.Upsample( - scale_factor=2 ** (j - i), - mode="bilinear", - align_corners=False, - ), - ) - ) - elif j == i: - fuse_layer.append(None) - else: - conv_downsamples = [] - for k in range(i - j): - if k == i - j - 1: - conv_downsamples.append( - nn.Sequential( - build_conv_layer( - self.conv_cfg, - in_channels[j], - in_channels[j], - kernel_size=3, - stride=2, - padding=1, - groups=in_channels[j], - bias=False, - ), - build_norm_layer(self.norm_cfg, in_channels[j])[1], - build_conv_layer( - self.conv_cfg, - in_channels[j], - in_channels[i], - kernel_size=1, - stride=1, - bias=False, - ), - build_norm_layer(self.norm_cfg, in_channels[i])[1], - ) - ) - else: - conv_downsamples.append( - nn.Sequential( - build_conv_layer( - self.conv_cfg, - in_channels[j], - in_channels[j], - kernel_size=3, - stride=2, - padding=1, - groups=in_channels[j], - bias=False, - ), - build_norm_layer(self.norm_cfg, in_channels[j])[1], - build_conv_layer( - self.conv_cfg, - in_channels[j], - in_channels[j], - kernel_size=1, - stride=1, - bias=False, - ), - build_norm_layer(self.norm_cfg, in_channels[j])[1], - nn.ReLU(inplace=True), - ) - ) - fuse_layer.append(nn.Sequential(*conv_downsamples)) - fuse_layers.append(nn.ModuleList(fuse_layer)) - return nn.ModuleList(fuse_layers) - - def forward(self, x): - """Forward function.""" - if self.num_branches == 1: - return [self.branches[0](x[0])] - - for i in range(self.num_branches): - x[i] = self.branches[i](x[i]) - - x_fuse = [] - for i in range(len(self.fuse_layers)): - y = x[0] if i == 0 else self.fuse_layers[i][0](x[0]) - for j in range(1, self.num_branches): - if i == j: - y += x[j] - elif j > i: - y = y + resize( - self.fuse_layers[i][j](x[j]), - size=x[i].shape[2:], - mode="bilinear", - align_corners=False, - ) - else: - y += self.fuse_layers[i][j](x[j]) - x_fuse.append(self.relu(y)) - return x_fuse - - -@BACKBONES.register_module() -class HRT(nn.Module): - """HRT backbone. 
- High Resolution Transformer Backbone - """ - - blocks_dict = { - "BOTTLENECK": Bottleneck, - "TRANSFORMER_BLOCK": GeneralTransformerBlock, - } - - def __init__( - self, - extra, - in_channels=3, - conv_cfg=None, - norm_cfg=dict(type="BN", requires_grad=True), - norm_eval=False, - with_cp=False, - zero_init_residual=False, - ): - super(HRT, self).__init__() - self.extra = extra - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg - self.norm_eval = norm_eval - self.with_cp = with_cp - self.zero_init_residual = zero_init_residual - - # stem net - self.norm1_name, norm1 = build_norm_layer(self.norm_cfg, 64, postfix=1) - self.norm2_name, norm2 = build_norm_layer(self.norm_cfg, 64, postfix=2) - - self.conv1 = build_conv_layer( - self.conv_cfg, - in_channels, - 64, - kernel_size=3, - stride=2, - padding=1, - bias=False, - ) - self.add_module(self.norm1_name, norm1) - - self.conv2 = build_conv_layer( - self.conv_cfg, 64, 64, kernel_size=3, stride=2, padding=1, bias=False - ) - self.add_module(self.norm2_name, norm2) - self.relu = nn.ReLU(inplace=True) - - # generat drop path rate list - depth_s2 = ( - self.extra["stage2"]["num_blocks"][0] * self.extra["stage2"]["num_modules"] - ) - depth_s3 = ( - self.extra["stage3"]["num_blocks"][0] * self.extra["stage3"]["num_modules"] - ) - depth_s4 = ( - self.extra["stage4"]["num_blocks"][0] * self.extra["stage4"]["num_modules"] - ) - depths = [depth_s2, depth_s3, depth_s4] - drop_path_rate = self.extra["drop_path_rate"] - dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] - - logger = get_root_logger() - logger.info(dpr) - - # stage 1 - self.stage1_cfg = self.extra["stage1"] - num_channels = self.stage1_cfg["num_channels"][0] - block_type = self.stage1_cfg["block"] - num_blocks = self.stage1_cfg["num_blocks"][0] - - block = self.blocks_dict[block_type] - stage1_out_channels = num_channels * block.expansion - self.layer1 = self._make_layer(block, 64, num_channels, num_blocks) - - # stage 2 - self.stage2_cfg = self.extra["stage2"] - num_channels = self.stage2_cfg["num_channels"] - block_type = self.stage2_cfg["block"] - - block = self.blocks_dict[block_type] - num_channels = [channel * block.expansion for channel in num_channels] - self.transition1 = self._make_transition_layer( - [stage1_out_channels], num_channels - ) - self.stage2, pre_stage_channels = self._make_stage( - self.stage2_cfg, num_channels, drop_paths=dpr[0:depth_s2] - ) - - # stage 3 - self.stage3_cfg = self.extra["stage3"] - num_channels = self.stage3_cfg["num_channels"] - block_type = self.stage3_cfg["block"] - - block = self.blocks_dict[block_type] - num_channels = [channel * block.expansion for channel in num_channels] - self.transition2 = self._make_transition_layer(pre_stage_channels, num_channels) - self.stage3, pre_stage_channels = self._make_stage( - self.stage3_cfg, - num_channels, - drop_paths=dpr[depth_s2 : depth_s2 + depth_s3], - ) - - # stage 4 - self.stage4_cfg = self.extra["stage4"] - num_channels = self.stage4_cfg["num_channels"] - block_type = self.stage4_cfg["block"] - - block = self.blocks_dict[block_type] - num_channels = [channel * block.expansion for channel in num_channels] - self.transition3 = self._make_transition_layer(pre_stage_channels, num_channels) - self.stage4, pre_stage_channels = self._make_stage( - self.stage4_cfg, - num_channels, - multiscale_output=self.stage4_cfg.get("multiscale_output", True), - drop_paths=dpr[depth_s2 + depth_s3 :], - ) - - @property - def norm1(self): - """nn.Module: the normalization layer named "norm1" """ - 
return getattr(self, self.norm1_name) - - @property - def norm2(self): - """nn.Module: the normalization layer named "norm2" """ - return getattr(self, self.norm2_name) - - def _make_transition_layer(self, num_channels_pre_layer, num_channels_cur_layer): - """Make transition layer.""" - num_branches_cur = len(num_channels_cur_layer) - num_branches_pre = len(num_channels_pre_layer) - - transition_layers = [] - for i in range(num_branches_cur): - if i < num_branches_pre: - if num_channels_cur_layer[i] != num_channels_pre_layer[i]: - transition_layers.append( - nn.Sequential( - build_conv_layer( - self.conv_cfg, - num_channels_pre_layer[i], - num_channels_cur_layer[i], - kernel_size=3, - stride=1, - padding=1, - bias=False, - ), - build_norm_layer(self.norm_cfg, num_channels_cur_layer[i])[ - 1 - ], - nn.ReLU(inplace=True), - ) - ) - else: - transition_layers.append(None) - else: - conv_downsamples = [] - for j in range(i + 1 - num_branches_pre): - in_channels = num_channels_pre_layer[-1] - out_channels = ( - num_channels_cur_layer[i] - if j == i - num_branches_pre - else in_channels - ) - conv_downsamples.append( - nn.Sequential( - build_conv_layer( - self.conv_cfg, - in_channels, - out_channels, - kernel_size=3, - stride=2, - padding=1, - bias=False, - ), - build_norm_layer(self.norm_cfg, out_channels)[1], - nn.ReLU(inplace=True), - ) - ) - transition_layers.append(nn.Sequential(*conv_downsamples)) - - return nn.ModuleList(transition_layers) - - def _make_layer( - self, - block, - inplanes, - planes, - blocks, - stride=1, - num_heads=1, - window_size=7, - mlp_ratio=4.0, - ): - """Make each layer.""" - downsample = None - if stride != 1 or inplanes != planes * block.expansion: - downsample = nn.Sequential( - build_conv_layer( - self.conv_cfg, - inplanes, - planes * block.expansion, - kernel_size=1, - stride=stride, - bias=False, - ), - build_norm_layer(self.norm_cfg, planes * block.expansion)[1], - ) - - layers = [] - if isinstance(block, GeneralTransformerBlock): - layers.append( - block( - inplanes, - planes, - num_heads=num_heads, - window_size=window_size, - mlp_ratio=mlp_ratio, - norm_cfg=self.norm_cfg, - conv_cfg=self.conv_cfg, - ) - ) - else: - layers.append( - block( - inplanes, - planes, - stride, - downsample=downsample, - with_cp=self.with_cp, - norm_cfg=self.norm_cfg, - conv_cfg=self.conv_cfg, - ) - ) - inplanes = planes * block.expansion - for i in range(1, blocks): - layers.append( - block( - inplanes, - planes, - with_cp=self.with_cp, - norm_cfg=self.norm_cfg, - conv_cfg=self.conv_cfg, - ) - ) - - return nn.Sequential(*layers) - - def _make_stage( - self, layer_config, in_channels, multiscale_output=True, drop_paths=0.0 - ): - """Make each stage.""" - num_modules = layer_config["num_modules"] - num_branches = layer_config["num_branches"] - num_blocks = layer_config["num_blocks"] - num_channels = layer_config["num_channels"] - block = self.blocks_dict[layer_config["block"]] - - num_heads = layer_config["num_heads"] - num_window_sizes = layer_config["num_window_sizes"] - num_mlp_ratios = layer_config["num_mlp_ratios"] - - hr_modules = [] - for i in range(num_modules): - # multi_scale_output is only used for the last module - if not multiscale_output and i == num_modules - 1: - reset_multiscale_output = False - else: - reset_multiscale_output = True - - hr_modules.append( - HighResolutionTransformerModule( - num_branches, - block, - num_blocks, - in_channels, - num_channels, - reset_multiscale_output, - with_cp=self.with_cp, - norm_cfg=self.norm_cfg, - conv_cfg=self.conv_cfg, - 
num_heads=num_heads, - num_window_sizes=num_window_sizes, - num_mlp_ratios=num_mlp_ratios, - drop_paths=drop_paths[num_blocks[0] * i : num_blocks[0] * (i + 1)], - ) - ) - - return nn.Sequential(*hr_modules), in_channels - - def init_weights(self, pretrained=None): - """Initialize the weights in backbone. - - Args: - pretrained (str, optional): Path to pre-trained weights. - Defaults to None. - """ - if isinstance(pretrained, str): - logger = get_root_logger() - ckpt = load_checkpoint(self, pretrained, strict=False) - if "model" in ckpt: - msg = self.load_state_dict(ckpt["model"], strict=False) - logger.info(msg) - elif pretrained is None: - for m in self.modules(): - if isinstance(m, nn.Conv2d): - """mmseg: kaiming_init(m)""" - normal_init(m, std=0.001) - elif isinstance(m, (_BatchNorm, nn.GroupNorm)): - constant_init(m, 1) - - if self.zero_init_residual: - for m in self.modules(): - if isinstance(m, Bottleneck): - constant_init(m.norm3, 0) - elif isinstance(m, BasicBlock): - constant_init(m.norm2, 0) - else: - raise TypeError("pretrained must be a str or None") - - def forward(self, x): - """Forward function.""" - x = self.conv1(x) - x = self.norm1(x) - x = self.relu(x) - x = self.conv2(x) - x = self.norm2(x) - x = self.relu(x) - x = self.layer1(x) - - x_list = [] - for i in range(self.stage2_cfg["num_branches"]): - if self.transition1[i] is not None: - x_list.append(self.transition1[i](x)) - else: - x_list.append(x) - y_list = self.stage2(x_list) - - x_list = [] - for i in range(self.stage3_cfg["num_branches"]): - if self.transition2[i] is not None: - x_list.append(self.transition2[i](y_list[-1])) - else: - x_list.append(y_list[i]) - y_list = self.stage3(x_list) - - x_list = [] - for i in range(self.stage4_cfg["num_branches"]): - if self.transition3[i] is not None: - x_list.append(self.transition3[i](y_list[-1])) - else: - x_list.append(y_list[i]) - y_list = self.stage4(x_list) - - return y_list - - def train(self, mode=True): - """Convert the model into training mode.""" - super(HRT, self).train(mode) - if mode and self.norm_eval: - for m in self.modules(): - if isinstance(m, _BatchNorm): - m.eval() diff --git a/main/transformer_utils/mmpose/models/backbones/hrt_checkpoint.py b/main/transformer_utils/mmpose/models/backbones/hrt_checkpoint.py deleted file mode 100644 index e27749d45ad2e1b24e50de8b85af90b4464e91ba..0000000000000000000000000000000000000000 --- a/main/transformer_utils/mmpose/models/backbones/hrt_checkpoint.py +++ /dev/null @@ -1,500 +0,0 @@ -# Copyright (c) Open-MMLab. All rights reserved. -import io -import os -import os.path as osp -import pkgutil -import time -import warnings -from collections import OrderedDict -from importlib import import_module -from tempfile import TemporaryDirectory - -import torch -import torchvision -from torch.optim import Optimizer -from torch.utils import model_zoo -from torch.nn import functional as F - -import mmcv -from mmcv.fileio import FileClient -from mmcv.fileio import load as load_file -from mmcv.parallel import is_module_wrapper -from mmcv.utils import mkdir_or_exist -from mmcv.runner import get_dist_info - -ENV_MMCV_HOME = 'MMCV_HOME' -ENV_XDG_CACHE_HOME = 'XDG_CACHE_HOME' -DEFAULT_CACHE_DIR = '~/.cache' - - -def _get_mmcv_home(): - mmcv_home = os.path.expanduser( - os.getenv( - ENV_MMCV_HOME, - os.path.join( - os.getenv(ENV_XDG_CACHE_HOME, DEFAULT_CACHE_DIR), 'mmcv'))) - - mkdir_or_exist(mmcv_home) - return mmcv_home - - -def load_state_dict(module, state_dict, strict=False, logger=None): - """Load state_dict to a module. 
- - This method is modified from :meth:`torch.nn.Module.load_state_dict`. - Default value for ``strict`` is set to ``False`` and the message for - param mismatch will be shown even if strict is False. - - Args: - module (Module): Module that receives the state_dict. - state_dict (OrderedDict): Weights. - strict (bool): whether to strictly enforce that the keys - in :attr:`state_dict` match the keys returned by this module's - :meth:`~torch.nn.Module.state_dict` function. Default: ``False``. - logger (:obj:`logging.Logger`, optional): Logger to log the error - message. If not specified, print function will be used. - """ - unexpected_keys = [] - all_missing_keys = [] - err_msg = [] - - metadata = getattr(state_dict, '_metadata', None) - state_dict = state_dict.copy() - if metadata is not None: - state_dict._metadata = metadata - - # use _load_from_state_dict to enable checkpoint version control - def load(module, prefix=''): - # recursively check parallel module in case that the model has a - # complicated structure, e.g., nn.Module(nn.Module(DDP)) - if is_module_wrapper(module): - module = module.module - local_metadata = {} if metadata is None else metadata.get( - prefix[:-1], {}) - module._load_from_state_dict(state_dict, prefix, local_metadata, True, - all_missing_keys, unexpected_keys, - err_msg) - for name, child in module._modules.items(): - if child is not None: - load(child, prefix + name + '.') - - load(module) - load = None # break load->load reference cycle - - # ignore "num_batches_tracked" of BN layers - missing_keys = [ - key for key in all_missing_keys if 'num_batches_tracked' not in key - ] - - if unexpected_keys: - err_msg.append('unexpected key in source ' - f'state_dict: {", ".join(unexpected_keys)}\n') - if missing_keys: - err_msg.append( - f'missing keys in source state_dict: {", ".join(missing_keys)}\n') - - rank, _ = get_dist_info() - if len(err_msg) > 0 and rank == 0: - err_msg.insert( - 0, 'The model and loaded state dict do not match exactly\n') - err_msg = '\n'.join(err_msg) - if strict: - raise RuntimeError(err_msg) - elif logger is not None: - logger.warning(err_msg) - else: - print(err_msg) - - -def load_url_dist(url, model_dir=None): - """In distributed setting, this function only download checkpoint at local - rank 0.""" - rank, world_size = get_dist_info() - rank = int(os.environ.get('LOCAL_RANK', rank)) - if rank == 0: - checkpoint = model_zoo.load_url(url, model_dir=model_dir) - if world_size > 1: - torch.distributed.barrier() - if rank > 0: - checkpoint = model_zoo.load_url(url, model_dir=model_dir) - return checkpoint - - -def load_pavimodel_dist(model_path, map_location=None): - """In distributed setting, this function only download checkpoint at local - rank 0.""" - try: - from pavi import modelcloud - except ImportError: - raise ImportError( - 'Please install pavi to load checkpoint from modelcloud.') - rank, world_size = get_dist_info() - rank = int(os.environ.get('LOCAL_RANK', rank)) - if rank == 0: - model = modelcloud.get(model_path) - with TemporaryDirectory() as tmp_dir: - downloaded_file = osp.join(tmp_dir, model.name) - model.download(downloaded_file) - checkpoint = torch.load(downloaded_file, map_location=map_location) - if world_size > 1: - torch.distributed.barrier() - if rank > 0: - model = modelcloud.get(model_path) - with TemporaryDirectory() as tmp_dir: - downloaded_file = osp.join(tmp_dir, model.name) - model.download(downloaded_file) - checkpoint = torch.load( - downloaded_file, map_location=map_location) - return checkpoint - - -def 
load_fileclient_dist(filename, backend, map_location): - """In distributed setting, this function only download checkpoint at local - rank 0.""" - rank, world_size = get_dist_info() - rank = int(os.environ.get('LOCAL_RANK', rank)) - allowed_backends = ['ceph'] - if backend not in allowed_backends: - raise ValueError(f'Load from Backend {backend} is not supported.') - if rank == 0: - fileclient = FileClient(backend=backend) - buffer = io.BytesIO(fileclient.get(filename)) - checkpoint = torch.load(buffer, map_location=map_location) - if world_size > 1: - torch.distributed.barrier() - if rank > 0: - fileclient = FileClient(backend=backend) - buffer = io.BytesIO(fileclient.get(filename)) - checkpoint = torch.load(buffer, map_location=map_location) - return checkpoint - - -def get_torchvision_models(): - model_urls = dict() - for _, name, ispkg in pkgutil.walk_packages(torchvision.models.__path__): - if ispkg: - continue - _zoo = import_module(f'torchvision.models.{name}') - if hasattr(_zoo, 'model_urls'): - _urls = getattr(_zoo, 'model_urls') - model_urls.update(_urls) - return model_urls - - -def get_external_models(): - mmcv_home = _get_mmcv_home() - default_json_path = osp.join(mmcv.__path__[0], 'model_zoo/open_mmlab.json') - default_urls = load_file(default_json_path) - assert isinstance(default_urls, dict) - external_json_path = osp.join(mmcv_home, 'open_mmlab.json') - if osp.exists(external_json_path): - external_urls = load_file(external_json_path) - assert isinstance(external_urls, dict) - default_urls.update(external_urls) - - return default_urls - - -def get_mmcls_models(): - mmcls_json_path = osp.join(mmcv.__path__[0], 'model_zoo/mmcls.json') - mmcls_urls = load_file(mmcls_json_path) - - return mmcls_urls - - -def get_deprecated_model_names(): - deprecate_json_path = osp.join(mmcv.__path__[0], - 'model_zoo/deprecated.json') - deprecate_urls = load_file(deprecate_json_path) - assert isinstance(deprecate_urls, dict) - - return deprecate_urls - - -def _process_mmcls_checkpoint(checkpoint): - state_dict = checkpoint['state_dict'] - new_state_dict = OrderedDict() - for k, v in state_dict.items(): - if k.startswith('backbone.'): - new_state_dict[k[9:]] = v - new_checkpoint = dict(state_dict=new_state_dict) - - return new_checkpoint - - -def _load_checkpoint(filename, map_location=None): - """Load checkpoint from somewhere (modelzoo, file, url). - - Args: - filename (str): Accept local filepath, URL, ``torchvision://xxx``, - ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for - details. - map_location (str | None): Same as :func:`torch.load`. Default: None. - - Returns: - dict | OrderedDict: The loaded checkpoint. It can be either an - OrderedDict storing model weights or a dict containing other - information, which depends on the checkpoint. 
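-
-    Example:
-        A minimal, illustrative sketch (not part of the original docstring);
-        the local path is a hypothetical placeholder, and
-        ``torchvision://resnet50`` assumes that torchvision model-zoo entry.
-
-        >>> ckpt = _load_checkpoint('torchvision://resnet50')
-        >>> ckpt = _load_checkpoint('/tmp/weights.pth', map_location='cpu')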
- """ - if filename.startswith('modelzoo://'): - warnings.warn('The URL scheme of "modelzoo://" is deprecated, please ' - 'use "torchvision://" instead') - model_urls = get_torchvision_models() - model_name = filename[11:] - checkpoint = load_url_dist(model_urls[model_name]) - elif filename.startswith('torchvision://'): - model_urls = get_torchvision_models() - model_name = filename[14:] - checkpoint = load_url_dist(model_urls[model_name]) - elif filename.startswith('open-mmlab://'): - model_urls = get_external_models() - model_name = filename[13:] - deprecated_urls = get_deprecated_model_names() - if model_name in deprecated_urls: - warnings.warn(f'open-mmlab://{model_name} is deprecated in favor ' - f'of open-mmlab://{deprecated_urls[model_name]}') - model_name = deprecated_urls[model_name] - model_url = model_urls[model_name] - # check if is url - if model_url.startswith(('http://', 'https://')): - checkpoint = load_url_dist(model_url) - else: - filename = osp.join(_get_mmcv_home(), model_url) - if not osp.isfile(filename): - raise IOError(f'{filename} is not a checkpoint file') - checkpoint = torch.load(filename, map_location=map_location) - elif filename.startswith('mmcls://'): - model_urls = get_mmcls_models() - model_name = filename[8:] - checkpoint = load_url_dist(model_urls[model_name]) - checkpoint = _process_mmcls_checkpoint(checkpoint) - elif filename.startswith(('http://', 'https://')): - checkpoint = load_url_dist(filename) - elif filename.startswith('pavi://'): - model_path = filename[7:] - checkpoint = load_pavimodel_dist(model_path, map_location=map_location) - elif filename.startswith('s3://'): - checkpoint = load_fileclient_dist( - filename, backend='ceph', map_location=map_location) - else: - if not osp.isfile(filename): - raise IOError(f'{filename} is not a checkpoint file') - checkpoint = torch.load(filename, map_location=map_location) - return checkpoint - - -def load_checkpoint(model, - filename, - map_location='cpu', - strict=False, - logger=None): - """Load checkpoint from a file or URI. - - Args: - model (Module): Module to load checkpoint. - filename (str): Accept local filepath, URL, ``torchvision://xxx``, - ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for - details. - map_location (str): Same as :func:`torch.load`. - strict (bool): Whether to allow different params for the model and - checkpoint. - logger (:mod:`logging.Logger` or None): The logger for error message. - - Returns: - dict or OrderedDict: The loaded checkpoint. 
- """ - checkpoint = _load_checkpoint(filename, map_location) - # OrderedDict is a subclass of dict - if not isinstance(checkpoint, dict): - raise RuntimeError( - f'No state_dict found in checkpoint file {filename}') - # get state_dict from checkpoint - if 'state_dict' in checkpoint: - state_dict = checkpoint['state_dict'] - elif 'model' in checkpoint: - state_dict = checkpoint['model'] - else: - state_dict = checkpoint - # strip prefix of state_dict - if list(state_dict.keys())[0].startswith('module.'): - state_dict = {k[7:]: v for k, v in state_dict.items()} - - # for MoBY, load model of online branch - if sorted(list(state_dict.keys()))[0].startswith('encoder'): - state_dict = {k.replace('encoder.', ''): v for k, v in state_dict.items() if k.startswith('encoder.')} - - # reshape absolute position embedding - if state_dict.get('absolute_pos_embed') is not None: - absolute_pos_embed = state_dict['absolute_pos_embed'] - N1, L, C1 = absolute_pos_embed.size() - N2, C2, H, W = model.absolute_pos_embed.size() - if N1 != N2 or C1 != C2 or L != H*W: - logger.warning("Error in loading absolute_pos_embed, pass") - else: - state_dict['absolute_pos_embed'] = absolute_pos_embed.view(N2, H, W, C2).permute(0, 3, 1, 2) - - # interpolate position bias table if needed - # relative_position_bias_table_keys = [k for k in state_dict.keys() if "relative_position_bias_table" in k] - # for table_key in relative_position_bias_table_keys: - # table_pretrained = state_dict[table_key] - # table_current = model.state_dict()[table_key] - # L1, nH1 = table_pretrained.size() - # L2, nH2 = table_current.size() - # if nH1 != nH2: - # logger.warning(f"Error in loading {table_key}, pass") - # else: - # if L1 != L2: - # S1 = int(L1 ** 0.5) - # S2 = int(L2 ** 0.5) - # table_pretrained_resized = F.interpolate( - # table_pretrained.permute(1, 0).view(1, nH1, S1, S1), - # size=(S2, S2), mode='bicubic') - # state_dict[table_key] = table_pretrained_resized.view(nH2, L2).permute(1, 0) - - # load state_dict - load_state_dict(model, state_dict, strict, logger) - return checkpoint - - -def weights_to_cpu(state_dict): - """Copy a model state_dict to cpu. - - Args: - state_dict (OrderedDict): Model weights on GPU. - - Returns: - OrderedDict: Model weights on GPU. - """ - state_dict_cpu = OrderedDict() - for key, val in state_dict.items(): - state_dict_cpu[key] = val.cpu() - return state_dict_cpu - - -def _save_to_state_dict(module, destination, prefix, keep_vars): - """Saves module state to `destination` dictionary. - - This method is modified from :meth:`torch.nn.Module._save_to_state_dict`. - - Args: - module (nn.Module): The module to generate state_dict. - destination (dict): A dict where state will be stored. - prefix (str): The prefix for parameters and buffers used in this - module. - """ - for name, param in module._parameters.items(): - if param is not None: - destination[prefix + name] = param if keep_vars else param.detach() - for name, buf in module._buffers.items(): - # remove check of _non_persistent_buffers_set to allow nn.BatchNorm2d - if buf is not None: - destination[prefix + name] = buf if keep_vars else buf.detach() - - -def get_state_dict(module, destination=None, prefix='', keep_vars=False): - """Returns a dictionary containing a whole state of the module. - - Both parameters and persistent buffers (e.g. running averages) are - included. Keys are corresponding parameter and buffer names. 
- - This method is modified from :meth:`torch.nn.Module.state_dict` to - recursively check parallel module in case that the model has a complicated - structure, e.g., nn.Module(nn.Module(DDP)). - - Args: - module (nn.Module): The module to generate state_dict. - destination (OrderedDict): Returned dict for the state of the - module. - prefix (str): Prefix of the key. - keep_vars (bool): Whether to keep the variable property of the - parameters. Default: False. - - Returns: - dict: A dictionary containing a whole state of the module. - """ - # recursively check parallel module in case that the model has a - # complicated structure, e.g., nn.Module(nn.Module(DDP)) - if is_module_wrapper(module): - module = module.module - - # below is the same as torch.nn.Module.state_dict() - if destination is None: - destination = OrderedDict() - destination._metadata = OrderedDict() - destination._metadata[prefix[:-1]] = local_metadata = dict( - version=module._version) - _save_to_state_dict(module, destination, prefix, keep_vars) - for name, child in module._modules.items(): - if child is not None: - get_state_dict( - child, destination, prefix + name + '.', keep_vars=keep_vars) - for hook in module._state_dict_hooks.values(): - hook_result = hook(module, destination, prefix, local_metadata) - if hook_result is not None: - destination = hook_result - return destination - - -def save_checkpoint(model, filename, optimizer=None, meta=None): - """Save checkpoint to file. - - The checkpoint will have 3 fields: ``meta``, ``state_dict`` and - ``optimizer``. By default ``meta`` will contain version and time info. - - Args: - model (Module): Module whose params are to be saved. - filename (str): Checkpoint filename. - optimizer (:obj:`Optimizer`, optional): Optimizer to be saved. - meta (dict, optional): Metadata to be saved in checkpoint. 
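-
-    Example:
-        A hedged sketch (added for illustration); ``model``, ``optimizer``
-        and the output path below are hypothetical.
-
-        >>> meta = dict(epoch=5, iter=10000)
-        >>> save_checkpoint(model, 'work_dirs/epoch_5.pth',
-        ...                 optimizer=optimizer, meta=meta)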
- """ - if meta is None: - meta = {} - elif not isinstance(meta, dict): - raise TypeError(f'meta must be a dict or None, but got {type(meta)}') - meta.update(mmcv_version=mmcv.__version__, time=time.asctime()) - - if is_module_wrapper(model): - model = model.module - - if hasattr(model, 'CLASSES') and model.CLASSES is not None: - # save class name to the meta - meta.update(CLASSES=model.CLASSES) - - checkpoint = { - 'meta': meta, - 'state_dict': weights_to_cpu(get_state_dict(model)) - } - # save optimizer state dict in the checkpoint - if isinstance(optimizer, Optimizer): - checkpoint['optimizer'] = optimizer.state_dict() - elif isinstance(optimizer, dict): - checkpoint['optimizer'] = {} - for name, optim in optimizer.items(): - checkpoint['optimizer'][name] = optim.state_dict() - - if filename.startswith('pavi://'): - try: - from pavi import modelcloud - from pavi.exception import NodeNotFoundError - except ImportError: - raise ImportError( - 'Please install pavi to load checkpoint from modelcloud.') - model_path = filename[7:] - root = modelcloud.Folder() - model_dir, model_name = osp.split(model_path) - try: - model = modelcloud.get(model_dir) - except NodeNotFoundError: - model = root.create_training_model(model_dir) - with TemporaryDirectory() as tmp_dir: - checkpoint_file = osp.join(tmp_dir, model_name) - with open(checkpoint_file, 'wb') as f: - torch.save(checkpoint, f) - f.flush() - model.create_file(checkpoint_file, name=model_name) - else: - mmcv.mkdir_or_exist(osp.dirname(filename)) - # immediately flush buffer - with open(filename, 'wb') as f: - torch.save(checkpoint, f) - f.flush() \ No newline at end of file diff --git a/main/transformer_utils/mmpose/models/backbones/i3d.py b/main/transformer_utils/mmpose/models/backbones/i3d.py deleted file mode 100644 index 64f330abac1facc16db743ef3ffbcd23248d6865..0000000000000000000000000000000000000000 --- a/main/transformer_utils/mmpose/models/backbones/i3d.py +++ /dev/null @@ -1,215 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -# Code is modified from `Third-party pytorch implementation of i3d -# `. - -import torch -import torch.nn as nn - -from ..builder import BACKBONES -from .base_backbone import BaseBackbone - - -class Conv3dBlock(nn.Module): - """Basic 3d convolution block for I3D. - - Args: - in_channels (int): Input channels of this block. - out_channels (int): Output channels of this block. - expansion (float): The multiplier of in_channels and out_channels. - Default: 1. - kernel_size (tuple[int]): kernel size of the 3d convolution layer. - Default: (1, 1, 1). - stride (tuple[int]): stride of the block. Default: (1, 1, 1) - padding (tuple[int]): padding of the input tensor. Default: (0, 0, 0) - use_bias (bool): whether to enable bias in 3d convolution layer. - Default: False - use_bn (bool): whether to use Batch Normalization after 3d convolution - layer. Default: True - use_relu (bool): whether to use ReLU after Batch Normalization layer. 
- Default: True - """ - - def __init__(self, - in_channels, - out_channels, - expansion=1.0, - kernel_size=(1, 1, 1), - stride=(1, 1, 1), - padding=(0, 0, 0), - use_bias=False, - use_bn=True, - use_relu=True): - super().__init__() - - in_channels = int(in_channels * expansion) - out_channels = int(out_channels * expansion) - - self.conv3d = nn.Conv3d( - in_channels, - out_channels, - kernel_size, - padding=padding, - stride=stride, - bias=use_bias) - - self.use_bn = use_bn - self.use_relu = use_relu - - if self.use_bn: - self.batch3d = nn.BatchNorm3d(out_channels) - - if self.use_relu: - self.activation = nn.ReLU(inplace=True) - - def forward(self, x): - """Forward function.""" - out = self.conv3d(x) - if self.use_bn: - out = self.batch3d(out) - if self.use_relu: - out = self.activation(out) - return out - - -class Mixed(nn.Module): - """Inception block for I3D. - - Args: - in_channels (int): Input channels of this block. - out_channels (int): Output channels of this block. - expansion (float): The multiplier of in_channels and out_channels. - Default: 1. - """ - - def __init__(self, in_channels, out_channels, expansion=1.0): - super(Mixed, self).__init__() - # Branch 0 - self.branch_0 = Conv3dBlock( - in_channels, out_channels[0], expansion, kernel_size=(1, 1, 1)) - - # Branch 1 - branch_1_conv1 = Conv3dBlock( - in_channels, out_channels[1], expansion, kernel_size=(1, 1, 1)) - branch_1_conv2 = Conv3dBlock( - out_channels[1], - out_channels[2], - expansion, - kernel_size=(3, 3, 3), - padding=(1, 1, 1)) - self.branch_1 = nn.Sequential(branch_1_conv1, branch_1_conv2) - - # Branch 2 - branch_2_conv1 = Conv3dBlock( - in_channels, out_channels[3], expansion, kernel_size=(1, 1, 1)) - branch_2_conv2 = Conv3dBlock( - out_channels[3], - out_channels[4], - expansion, - kernel_size=(3, 3, 3), - padding=(1, 1, 1)) - self.branch_2 = nn.Sequential(branch_2_conv1, branch_2_conv2) - - # Branch3 - branch_3_pool = nn.MaxPool3d( - kernel_size=(3, 3, 3), - stride=(1, 1, 1), - padding=(1, 1, 1), - ceil_mode=True) - branch_3_conv2 = Conv3dBlock( - in_channels, out_channels[5], expansion, kernel_size=(1, 1, 1)) - self.branch_3 = nn.Sequential(branch_3_pool, branch_3_conv2) - - def forward(self, x): - """Forward function.""" - out_0 = self.branch_0(x) - out_1 = self.branch_1(x) - out_2 = self.branch_2(x) - out_3 = self.branch_3(x) - out = torch.cat((out_0, out_1, out_2, out_3), 1) - return out - - -@BACKBONES.register_module() -class I3D(BaseBackbone): - """I3D backbone. - - Please refer to the `paper `__ for - details. - - Args: - in_channels (int): Input channels of the backbone, which is decided - on the input modality. - expansion (float): The multiplier of in_channels and out_channels. - Default: 1. 
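-
-    Example:
-        An illustrative sketch in the style of the other backbone docstrings
-        (added here; the input size is an assumption, and the expected
-        output shape follows from the pooling strides of the network).
-
-        >>> import torch
-        >>> from mmpose.models import I3D
-        >>> model = I3D(in_channels=3, expansion=1.0)
-        >>> inputs = torch.rand(1, 3, 16, 224, 224)
-        >>> out = model(inputs)
-        >>> tuple(out.shape)  # expected: (1, 1024, 2, 7, 7)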
- """ - - def __init__(self, in_channels=3, expansion=1.0): - super(I3D, self).__init__() - - # expansion must be an integer multiple of 1/8 - expansion = round(8 * expansion) / 8.0 - - # xut Layer - self.conv3d_1a_7x7 = Conv3dBlock( - out_channels=64, - in_channels=in_channels / expansion, - expansion=expansion, - kernel_size=(7, 7, 7), - stride=(2, 2, 2), - padding=(2, 3, 3)) - self.maxPool3d_2a_3x3 = nn.MaxPool3d( - kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1)) - - # Layer 2 - self.conv3d_2b_1x1 = Conv3dBlock( - out_channels=64, - in_channels=64, - expansion=expansion, - kernel_size=(1, 1, 1)) - self.conv3d_2c_3x3 = Conv3dBlock( - out_channels=192, - in_channels=64, - expansion=expansion, - kernel_size=(3, 3, 3), - padding=(1, 1, 1)) - self.maxPool3d_3a_3x3 = nn.MaxPool3d( - kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1)) - - # Mixed_3b - self.mixed_3b = Mixed(192, [64, 96, 128, 16, 32, 32], expansion) - self.mixed_3c = Mixed(256, [128, 128, 192, 32, 96, 64], expansion) - self.maxPool3d_4a_3x3 = nn.MaxPool3d( - kernel_size=(3, 3, 3), stride=(2, 2, 2), padding=(1, 1, 1)) - - # Mixed 4 - self.mixed_4b = Mixed(480, [192, 96, 208, 16, 48, 64], expansion) - self.mixed_4c = Mixed(512, [160, 112, 224, 24, 64, 64], expansion) - self.mixed_4d = Mixed(512, [128, 128, 256, 24, 64, 64], expansion) - self.mixed_4e = Mixed(512, [112, 144, 288, 32, 64, 64], expansion) - self.mixed_4f = Mixed(528, [256, 160, 320, 32, 128, 128], expansion) - - self.maxPool3d_5a_2x2 = nn.MaxPool3d( - kernel_size=(2, 2, 2), stride=(2, 2, 2), padding=(0, 0, 0)) - - # Mixed 5 - self.mixed_5b = Mixed(832, [256, 160, 320, 32, 128, 128], expansion) - self.mixed_5c = Mixed(832, [384, 192, 384, 48, 128, 128], expansion) - - def forward(self, x): - out = self.conv3d_1a_7x7(x) - out = self.maxPool3d_2a_3x3(out) - out = self.conv3d_2b_1x1(out) - out = self.conv3d_2c_3x3(out) - out = self.maxPool3d_3a_3x3(out) - out = self.mixed_3b(out) - out = self.mixed_3c(out) - out = self.maxPool3d_4a_3x3(out) - out = self.mixed_4b(out) - out = self.mixed_4c(out) - out = self.mixed_4d(out) - out = self.mixed_4e(out) - out = self.mixed_4f(out) - out = self.maxPool3d_5a_2x2(out) - out = self.mixed_5b(out) - out = self.mixed_5c(out) - return out diff --git a/main/transformer_utils/mmpose/models/backbones/litehrnet.py b/main/transformer_utils/mmpose/models/backbones/litehrnet.py deleted file mode 100644 index 954368841eb631e3dc6c77e9810f6980f3739bf3..0000000000000000000000000000000000000000 --- a/main/transformer_utils/mmpose/models/backbones/litehrnet.py +++ /dev/null @@ -1,984 +0,0 @@ -# ------------------------------------------------------------------------------ -# Adapted from https://github.com/HRNet/Lite-HRNet -# Original licence: Apache License 2.0. -# ------------------------------------------------------------------------------ - -import mmcv -import torch -import torch.nn as nn -import torch.nn.functional as F -import torch.utils.checkpoint as cp -from mmcv.cnn import (ConvModule, DepthwiseSeparableConvModule, - build_conv_layer, build_norm_layer, constant_init, - normal_init) -from torch.nn.modules.batchnorm import _BatchNorm - -from mmpose.utils import get_root_logger -from ..builder import BACKBONES -from .utils import channel_shuffle, load_checkpoint - - -class SpatialWeighting(nn.Module): - """Spatial weighting module. - - Args: - channels (int): The channels of the module. - ratio (int): channel reduction ratio. - conv_cfg (dict): Config dict for convolution layer. 
- Default: None, which means using conv2d. - norm_cfg (dict): Config dict for normalization layer. - Default: None. - act_cfg (dict): Config dict for activation layer. - Default: (dict(type='ReLU'), dict(type='Sigmoid')). - The last ConvModule uses Sigmoid by default. - """ - - def __init__(self, - channels, - ratio=16, - conv_cfg=None, - norm_cfg=None, - act_cfg=(dict(type='ReLU'), dict(type='Sigmoid'))): - super().__init__() - if isinstance(act_cfg, dict): - act_cfg = (act_cfg, act_cfg) - assert len(act_cfg) == 2 - assert mmcv.is_tuple_of(act_cfg, dict) - self.global_avgpool = nn.AdaptiveAvgPool2d(1) - self.conv1 = ConvModule( - in_channels=channels, - out_channels=int(channels / ratio), - kernel_size=1, - stride=1, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg[0]) - self.conv2 = ConvModule( - in_channels=int(channels / ratio), - out_channels=channels, - kernel_size=1, - stride=1, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg[1]) - - def forward(self, x): - out = self.global_avgpool(x) - out = self.conv1(out) - out = self.conv2(out) - return x * out - - -class CrossResolutionWeighting(nn.Module): - """Cross-resolution channel weighting module. - - Args: - channels (int): The channels of the module. - ratio (int): channel reduction ratio. - conv_cfg (dict): Config dict for convolution layer. - Default: None, which means using conv2d. - norm_cfg (dict): Config dict for normalization layer. - Default: None. - act_cfg (dict): Config dict for activation layer. - Default: (dict(type='ReLU'), dict(type='Sigmoid')). - The last ConvModule uses Sigmoid by default. - """ - - def __init__(self, - channels, - ratio=16, - conv_cfg=None, - norm_cfg=None, - act_cfg=(dict(type='ReLU'), dict(type='Sigmoid'))): - super().__init__() - if isinstance(act_cfg, dict): - act_cfg = (act_cfg, act_cfg) - assert len(act_cfg) == 2 - assert mmcv.is_tuple_of(act_cfg, dict) - self.channels = channels - total_channel = sum(channels) - self.conv1 = ConvModule( - in_channels=total_channel, - out_channels=int(total_channel / ratio), - kernel_size=1, - stride=1, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg[0]) - self.conv2 = ConvModule( - in_channels=int(total_channel / ratio), - out_channels=total_channel, - kernel_size=1, - stride=1, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg[1]) - - def forward(self, x): - mini_size = x[-1].size()[-2:] - out = [F.adaptive_avg_pool2d(s, mini_size) for s in x[:-1]] + [x[-1]] - out = torch.cat(out, dim=1) - out = self.conv1(out) - out = self.conv2(out) - out = torch.split(out, self.channels, dim=1) - out = [ - s * F.interpolate(a, size=s.size()[-2:], mode='nearest') - for s, a in zip(x, out) - ] - return out - - -class ConditionalChannelWeighting(nn.Module): - """Conditional channel weighting block. - - Args: - in_channels (int): The input channels of the block. - stride (int): Stride of the 3x3 convolution layer. - reduce_ratio (int): channel reduction ratio. - conv_cfg (dict): Config dict for convolution layer. - Default: None, which means using conv2d. - norm_cfg (dict): Config dict for normalization layer. - Default: dict(type='BN'). - with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. Default: False. 
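-
-    Example:
-        A minimal sketch (added for illustration; the channel layout is an
-        assumption). The block takes one feature map per branch and, with
-        stride 1, keeps every shape unchanged.
-
-        >>> import torch
-        >>> block = ConditionalChannelWeighting(
-        ...     in_channels=[40, 80], stride=1, reduce_ratio=8)
-        >>> feats = [torch.rand(1, 40, 32, 32), torch.rand(1, 80, 16, 16)]
-        >>> outs = block(feats)  # same shapes as the inputs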
- """ - - def __init__(self, - in_channels, - stride, - reduce_ratio, - conv_cfg=None, - norm_cfg=dict(type='BN'), - with_cp=False): - super().__init__() - self.with_cp = with_cp - self.stride = stride - assert stride in [1, 2] - - branch_channels = [channel // 2 for channel in in_channels] - - self.cross_resolution_weighting = CrossResolutionWeighting( - branch_channels, - ratio=reduce_ratio, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg) - - self.depthwise_convs = nn.ModuleList([ - ConvModule( - channel, - channel, - kernel_size=3, - stride=self.stride, - padding=1, - groups=channel, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=None) for channel in branch_channels - ]) - - self.spatial_weighting = nn.ModuleList([ - SpatialWeighting(channels=channel, ratio=4) - for channel in branch_channels - ]) - - def forward(self, x): - - def _inner_forward(x): - x = [s.chunk(2, dim=1) for s in x] - x1 = [s[0] for s in x] - x2 = [s[1] for s in x] - - x2 = self.cross_resolution_weighting(x2) - x2 = [dw(s) for s, dw in zip(x2, self.depthwise_convs)] - x2 = [sw(s) for s, sw in zip(x2, self.spatial_weighting)] - - out = [torch.cat([s1, s2], dim=1) for s1, s2 in zip(x1, x2)] - out = [channel_shuffle(s, 2) for s in out] - - return out - - if self.with_cp and x.requires_grad: - out = cp.checkpoint(_inner_forward, x) - else: - out = _inner_forward(x) - - return out - - -class Stem(nn.Module): - """Stem network block. - - Args: - in_channels (int): The input channels of the block. - stem_channels (int): Output channels of the stem layer. - out_channels (int): The output channels of the block. - expand_ratio (int): adjusts number of channels of the hidden layer - in InvertedResidual by this amount. - conv_cfg (dict): Config dict for convolution layer. - Default: None, which means using conv2d. - norm_cfg (dict): Config dict for normalization layer. - Default: dict(type='BN'). - with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. Default: False. 
- """ - - def __init__(self, - in_channels, - stem_channels, - out_channels, - expand_ratio, - conv_cfg=None, - norm_cfg=dict(type='BN'), - with_cp=False): - super().__init__() - self.in_channels = in_channels - self.out_channels = out_channels - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg - self.with_cp = with_cp - - self.conv1 = ConvModule( - in_channels=in_channels, - out_channels=stem_channels, - kernel_size=3, - stride=2, - padding=1, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=dict(type='ReLU')) - - mid_channels = int(round(stem_channels * expand_ratio)) - branch_channels = stem_channels // 2 - if stem_channels == self.out_channels: - inc_channels = self.out_channels - branch_channels - else: - inc_channels = self.out_channels - stem_channels - - self.branch1 = nn.Sequential( - ConvModule( - branch_channels, - branch_channels, - kernel_size=3, - stride=2, - padding=1, - groups=branch_channels, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=None), - ConvModule( - branch_channels, - inc_channels, - kernel_size=1, - stride=1, - padding=0, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=dict(type='ReLU')), - ) - - self.expand_conv = ConvModule( - branch_channels, - mid_channels, - kernel_size=1, - stride=1, - padding=0, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=dict(type='ReLU')) - self.depthwise_conv = ConvModule( - mid_channels, - mid_channels, - kernel_size=3, - stride=2, - padding=1, - groups=mid_channels, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=None) - self.linear_conv = ConvModule( - mid_channels, - branch_channels - if stem_channels == self.out_channels else stem_channels, - kernel_size=1, - stride=1, - padding=0, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=dict(type='ReLU')) - - def forward(self, x): - - def _inner_forward(x): - x = self.conv1(x) - x1, x2 = x.chunk(2, dim=1) - - x2 = self.expand_conv(x2) - x2 = self.depthwise_conv(x2) - x2 = self.linear_conv(x2) - - out = torch.cat((self.branch1(x1), x2), dim=1) - - out = channel_shuffle(out, 2) - - return out - - if self.with_cp and x.requires_grad: - out = cp.checkpoint(_inner_forward, x) - else: - out = _inner_forward(x) - - return out - - -class IterativeHead(nn.Module): - """Extra iterative head for feature learning. - - Args: - in_channels (int): The input channels of the block. - norm_cfg (dict): Config dict for normalization layer. - Default: dict(type='BN'). 
- """ - - def __init__(self, in_channels, norm_cfg=dict(type='BN')): - super().__init__() - projects = [] - num_branchs = len(in_channels) - self.in_channels = in_channels[::-1] - - for i in range(num_branchs): - if i != num_branchs - 1: - projects.append( - DepthwiseSeparableConvModule( - in_channels=self.in_channels[i], - out_channels=self.in_channels[i + 1], - kernel_size=3, - stride=1, - padding=1, - norm_cfg=norm_cfg, - act_cfg=dict(type='ReLU'), - dw_act_cfg=None, - pw_act_cfg=dict(type='ReLU'))) - else: - projects.append( - DepthwiseSeparableConvModule( - in_channels=self.in_channels[i], - out_channels=self.in_channels[i], - kernel_size=3, - stride=1, - padding=1, - norm_cfg=norm_cfg, - act_cfg=dict(type='ReLU'), - dw_act_cfg=None, - pw_act_cfg=dict(type='ReLU'))) - self.projects = nn.ModuleList(projects) - - def forward(self, x): - x = x[::-1] - - y = [] - last_x = None - for i, s in enumerate(x): - if last_x is not None: - last_x = F.interpolate( - last_x, - size=s.size()[-2:], - mode='bilinear', - align_corners=True) - s = s + last_x - s = self.projects[i](s) - y.append(s) - last_x = s - - return y[::-1] - - -class ShuffleUnit(nn.Module): - """InvertedResidual block for ShuffleNetV2 backbone. - - Args: - in_channels (int): The input channels of the block. - out_channels (int): The output channels of the block. - stride (int): Stride of the 3x3 convolution layer. Default: 1 - conv_cfg (dict): Config dict for convolution layer. - Default: None, which means using conv2d. - norm_cfg (dict): Config dict for normalization layer. - Default: dict(type='BN'). - act_cfg (dict): Config dict for activation layer. - Default: dict(type='ReLU'). - with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. Default: False. 
- """ - - def __init__(self, - in_channels, - out_channels, - stride=1, - conv_cfg=None, - norm_cfg=dict(type='BN'), - act_cfg=dict(type='ReLU'), - with_cp=False): - super().__init__() - self.stride = stride - self.with_cp = with_cp - - branch_features = out_channels // 2 - if self.stride == 1: - assert in_channels == branch_features * 2, ( - f'in_channels ({in_channels}) should equal to ' - f'branch_features * 2 ({branch_features * 2}) ' - 'when stride is 1') - - if in_channels != branch_features * 2: - assert self.stride != 1, ( - f'stride ({self.stride}) should not equal 1 when ' - f'in_channels != branch_features * 2') - - if self.stride > 1: - self.branch1 = nn.Sequential( - ConvModule( - in_channels, - in_channels, - kernel_size=3, - stride=self.stride, - padding=1, - groups=in_channels, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=None), - ConvModule( - in_channels, - branch_features, - kernel_size=1, - stride=1, - padding=0, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg), - ) - - self.branch2 = nn.Sequential( - ConvModule( - in_channels if (self.stride > 1) else branch_features, - branch_features, - kernel_size=1, - stride=1, - padding=0, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg), - ConvModule( - branch_features, - branch_features, - kernel_size=3, - stride=self.stride, - padding=1, - groups=branch_features, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=None), - ConvModule( - branch_features, - branch_features, - kernel_size=1, - stride=1, - padding=0, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg)) - - def forward(self, x): - - def _inner_forward(x): - if self.stride > 1: - out = torch.cat((self.branch1(x), self.branch2(x)), dim=1) - else: - x1, x2 = x.chunk(2, dim=1) - out = torch.cat((x1, self.branch2(x2)), dim=1) - - out = channel_shuffle(out, 2) - - return out - - if self.with_cp and x.requires_grad: - out = cp.checkpoint(_inner_forward, x) - else: - out = _inner_forward(x) - - return out - - -class LiteHRModule(nn.Module): - """High-Resolution Module for LiteHRNet. - - It contains conditional channel weighting blocks and - shuffle blocks. - - - Args: - num_branches (int): Number of branches in the module. - num_blocks (int): Number of blocks in the module. - in_channels (list(int)): Number of input image channels. - reduce_ratio (int): Channel reduction ratio. - module_type (str): 'LITE' or 'NAIVE' - multiscale_output (bool): Whether to output multi-scale features. - with_fuse (bool): Whether to use fuse layers. - conv_cfg (dict): dictionary to construct and config conv layer. - norm_cfg (dict): dictionary to construct and config norm layer. - with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. 
- """ - - def __init__( - self, - num_branches, - num_blocks, - in_channels, - reduce_ratio, - module_type, - multiscale_output=False, - with_fuse=True, - conv_cfg=None, - norm_cfg=dict(type='BN'), - with_cp=False, - ): - super().__init__() - self._check_branches(num_branches, in_channels) - - self.in_channels = in_channels - self.num_branches = num_branches - - self.module_type = module_type - self.multiscale_output = multiscale_output - self.with_fuse = with_fuse - self.norm_cfg = norm_cfg - self.conv_cfg = conv_cfg - self.with_cp = with_cp - - if self.module_type.upper() == 'LITE': - self.layers = self._make_weighting_blocks(num_blocks, reduce_ratio) - elif self.module_type.upper() == 'NAIVE': - self.layers = self._make_naive_branches(num_branches, num_blocks) - else: - raise ValueError("module_type should be either 'LITE' or 'NAIVE'.") - if self.with_fuse: - self.fuse_layers = self._make_fuse_layers() - self.relu = nn.ReLU() - - def _check_branches(self, num_branches, in_channels): - """Check input to avoid ValueError.""" - if num_branches != len(in_channels): - error_msg = f'NUM_BRANCHES({num_branches}) ' \ - f'!= NUM_INCHANNELS({len(in_channels)})' - raise ValueError(error_msg) - - def _make_weighting_blocks(self, num_blocks, reduce_ratio, stride=1): - """Make channel weighting blocks.""" - layers = [] - for i in range(num_blocks): - layers.append( - ConditionalChannelWeighting( - self.in_channels, - stride=stride, - reduce_ratio=reduce_ratio, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - with_cp=self.with_cp)) - - return nn.Sequential(*layers) - - def _make_one_branch(self, branch_index, num_blocks, stride=1): - """Make one branch.""" - layers = [] - layers.append( - ShuffleUnit( - self.in_channels[branch_index], - self.in_channels[branch_index], - stride=stride, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=dict(type='ReLU'), - with_cp=self.with_cp)) - for i in range(1, num_blocks): - layers.append( - ShuffleUnit( - self.in_channels[branch_index], - self.in_channels[branch_index], - stride=1, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=dict(type='ReLU'), - with_cp=self.with_cp)) - - return nn.Sequential(*layers) - - def _make_naive_branches(self, num_branches, num_blocks): - """Make branches.""" - branches = [] - - for i in range(num_branches): - branches.append(self._make_one_branch(i, num_blocks)) - - return nn.ModuleList(branches) - - def _make_fuse_layers(self): - """Make fuse layer.""" - if self.num_branches == 1: - return None - - num_branches = self.num_branches - in_channels = self.in_channels - fuse_layers = [] - num_out_branches = num_branches if self.multiscale_output else 1 - for i in range(num_out_branches): - fuse_layer = [] - for j in range(num_branches): - if j > i: - fuse_layer.append( - nn.Sequential( - build_conv_layer( - self.conv_cfg, - in_channels[j], - in_channels[i], - kernel_size=1, - stride=1, - padding=0, - bias=False), - build_norm_layer(self.norm_cfg, in_channels[i])[1], - nn.Upsample( - scale_factor=2**(j - i), mode='nearest'))) - elif j == i: - fuse_layer.append(None) - else: - conv_downsamples = [] - for k in range(i - j): - if k == i - j - 1: - conv_downsamples.append( - nn.Sequential( - build_conv_layer( - self.conv_cfg, - in_channels[j], - in_channels[j], - kernel_size=3, - stride=2, - padding=1, - groups=in_channels[j], - bias=False), - build_norm_layer(self.norm_cfg, - in_channels[j])[1], - build_conv_layer( - self.conv_cfg, - in_channels[j], - in_channels[i], - kernel_size=1, - stride=1, - padding=0, - 
bias=False), - build_norm_layer(self.norm_cfg, - in_channels[i])[1])) - else: - conv_downsamples.append( - nn.Sequential( - build_conv_layer( - self.conv_cfg, - in_channels[j], - in_channels[j], - kernel_size=3, - stride=2, - padding=1, - groups=in_channels[j], - bias=False), - build_norm_layer(self.norm_cfg, - in_channels[j])[1], - build_conv_layer( - self.conv_cfg, - in_channels[j], - in_channels[j], - kernel_size=1, - stride=1, - padding=0, - bias=False), - build_norm_layer(self.norm_cfg, - in_channels[j])[1], - nn.ReLU(inplace=True))) - fuse_layer.append(nn.Sequential(*conv_downsamples)) - fuse_layers.append(nn.ModuleList(fuse_layer)) - - return nn.ModuleList(fuse_layers) - - def forward(self, x): - """Forward function.""" - if self.num_branches == 1: - return [self.layers[0](x[0])] - - if self.module_type.upper() == 'LITE': - out = self.layers(x) - elif self.module_type.upper() == 'NAIVE': - for i in range(self.num_branches): - x[i] = self.layers[i](x[i]) - out = x - - if self.with_fuse: - out_fuse = [] - for i in range(len(self.fuse_layers)): - # `y = 0` will lead to decreased accuracy (0.5~1 mAP) - y = out[0] if i == 0 else self.fuse_layers[i][0](out[0]) - for j in range(self.num_branches): - if i == j: - y += out[j] - else: - y += self.fuse_layers[i][j](out[j]) - out_fuse.append(self.relu(y)) - out = out_fuse - if not self.multiscale_output: - out = [out[0]] - return out - - -@BACKBONES.register_module() -class LiteHRNet(nn.Module): - """Lite-HRNet backbone. - - `Lite-HRNet: A Lightweight High-Resolution Network - `_. - - Code adapted from 'https://github.com/HRNet/Lite-HRNet'. - - Args: - extra (dict): detailed configuration for each stage of HRNet. - in_channels (int): Number of input image channels. Default: 3. - conv_cfg (dict): dictionary to construct and config conv layer. - norm_cfg (dict): dictionary to construct and config norm layer. - norm_eval (bool): Whether to set norm layers to eval mode, namely, - freeze running stats (mean and var). Note: Effect on Batch Norm - and its variants only. Default: False - with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. - - Example: - >>> from mmpose.models import LiteHRNet - >>> import torch - >>> extra=dict( - >>> stem=dict(stem_channels=32, out_channels=32, expand_ratio=1), - >>> num_stages=3, - >>> stages_spec=dict( - >>> num_modules=(2, 4, 2), - >>> num_branches=(2, 3, 4), - >>> num_blocks=(2, 2, 2), - >>> module_type=('LITE', 'LITE', 'LITE'), - >>> with_fuse=(True, True, True), - >>> reduce_ratios=(8, 8, 8), - >>> num_channels=( - >>> (40, 80), - >>> (40, 80, 160), - >>> (40, 80, 160, 320), - >>> )), - >>> with_head=False) - >>> self = LiteHRNet(extra, in_channels=1) - >>> self.eval() - >>> inputs = torch.rand(1, 1, 32, 32) - >>> level_outputs = self.forward(inputs) - >>> for level_out in level_outputs: - ... 
print(tuple(level_out.shape)) - (1, 40, 8, 8) - """ - - def __init__(self, - extra, - in_channels=3, - conv_cfg=None, - norm_cfg=dict(type='BN'), - norm_eval=False, - with_cp=False): - super().__init__() - self.extra = extra - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg - self.norm_eval = norm_eval - self.with_cp = with_cp - - self.stem = Stem( - in_channels, - stem_channels=self.extra['stem']['stem_channels'], - out_channels=self.extra['stem']['out_channels'], - expand_ratio=self.extra['stem']['expand_ratio'], - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg) - - self.num_stages = self.extra['num_stages'] - self.stages_spec = self.extra['stages_spec'] - - num_channels_last = [ - self.stem.out_channels, - ] - for i in range(self.num_stages): - num_channels = self.stages_spec['num_channels'][i] - num_channels = [num_channels[i] for i in range(len(num_channels))] - setattr( - self, f'transition{i}', - self._make_transition_layer(num_channels_last, num_channels)) - - stage, num_channels_last = self._make_stage( - self.stages_spec, i, num_channels, multiscale_output=True) - setattr(self, f'stage{i}', stage) - - self.with_head = self.extra['with_head'] - if self.with_head: - self.head_layer = IterativeHead( - in_channels=num_channels_last, - norm_cfg=self.norm_cfg, - ) - - def _make_transition_layer(self, num_channels_pre_layer, - num_channels_cur_layer): - """Make transition layer.""" - num_branches_cur = len(num_channels_cur_layer) - num_branches_pre = len(num_channels_pre_layer) - - transition_layers = [] - for i in range(num_branches_cur): - if i < num_branches_pre: - if num_channels_cur_layer[i] != num_channels_pre_layer[i]: - transition_layers.append( - nn.Sequential( - build_conv_layer( - self.conv_cfg, - num_channels_pre_layer[i], - num_channels_pre_layer[i], - kernel_size=3, - stride=1, - padding=1, - groups=num_channels_pre_layer[i], - bias=False), - build_norm_layer(self.norm_cfg, - num_channels_pre_layer[i])[1], - build_conv_layer( - self.conv_cfg, - num_channels_pre_layer[i], - num_channels_cur_layer[i], - kernel_size=1, - stride=1, - padding=0, - bias=False), - build_norm_layer(self.norm_cfg, - num_channels_cur_layer[i])[1], - nn.ReLU())) - else: - transition_layers.append(None) - else: - conv_downsamples = [] - for j in range(i + 1 - num_branches_pre): - in_channels = num_channels_pre_layer[-1] - out_channels = num_channels_cur_layer[i] \ - if j == i - num_branches_pre else in_channels - conv_downsamples.append( - nn.Sequential( - build_conv_layer( - self.conv_cfg, - in_channels, - in_channels, - kernel_size=3, - stride=2, - padding=1, - groups=in_channels, - bias=False), - build_norm_layer(self.norm_cfg, in_channels)[1], - build_conv_layer( - self.conv_cfg, - in_channels, - out_channels, - kernel_size=1, - stride=1, - padding=0, - bias=False), - build_norm_layer(self.norm_cfg, out_channels)[1], - nn.ReLU())) - transition_layers.append(nn.Sequential(*conv_downsamples)) - - return nn.ModuleList(transition_layers) - - def _make_stage(self, - stages_spec, - stage_index, - in_channels, - multiscale_output=True): - num_modules = stages_spec['num_modules'][stage_index] - num_branches = stages_spec['num_branches'][stage_index] - num_blocks = stages_spec['num_blocks'][stage_index] - reduce_ratio = stages_spec['reduce_ratios'][stage_index] - with_fuse = stages_spec['with_fuse'][stage_index] - module_type = stages_spec['module_type'][stage_index] - - modules = [] - for i in range(num_modules): - # multi_scale_output is only used last module - if not multiscale_output and i == 
num_modules - 1: - reset_multiscale_output = False - else: - reset_multiscale_output = True - - modules.append( - LiteHRModule( - num_branches, - num_blocks, - in_channels, - reduce_ratio, - module_type, - multiscale_output=reset_multiscale_output, - with_fuse=with_fuse, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - with_cp=self.with_cp)) - in_channels = modules[-1].in_channels - - return nn.Sequential(*modules), in_channels - - def init_weights(self, pretrained=None): - """Initialize the weights in backbone. - - Args: - pretrained (str, optional): Path to pre-trained weights. - Defaults to None. - """ - if isinstance(pretrained, str): - logger = get_root_logger() - load_checkpoint(self, pretrained, strict=False, logger=logger) - elif pretrained is None: - for m in self.modules(): - if isinstance(m, nn.Conv2d): - normal_init(m, std=0.001) - elif isinstance(m, (_BatchNorm, nn.GroupNorm)): - constant_init(m, 1) - else: - raise TypeError('pretrained must be a str or None') - - def forward(self, x): - """Forward function.""" - x = self.stem(x) - - y_list = [x] - for i in range(self.num_stages): - x_list = [] - transition = getattr(self, f'transition{i}') - for j in range(self.stages_spec['num_branches'][i]): - if transition[j]: - if j >= len(y_list): - x_list.append(transition[j](y_list[-1])) - else: - x_list.append(transition[j](y_list[j])) - else: - x_list.append(y_list[j]) - y_list = getattr(self, f'stage{i}')(x_list) - - x = y_list - if self.with_head: - x = self.head_layer(x) - - return [x[0]] - - def train(self, mode=True): - """Convert the model into training mode.""" - super().train(mode) - if mode and self.norm_eval: - for m in self.modules(): - if isinstance(m, _BatchNorm): - m.eval() diff --git a/main/transformer_utils/mmpose/models/backbones/mobilenet_v2.py b/main/transformer_utils/mmpose/models/backbones/mobilenet_v2.py deleted file mode 100644 index 5dc0cd1b7dfdec2aa751861e39fc1c1a45ec488e..0000000000000000000000000000000000000000 --- a/main/transformer_utils/mmpose/models/backbones/mobilenet_v2.py +++ /dev/null @@ -1,275 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import copy -import logging - -import torch.nn as nn -import torch.utils.checkpoint as cp -from mmcv.cnn import ConvModule, constant_init, kaiming_init -from torch.nn.modules.batchnorm import _BatchNorm - -from ..builder import BACKBONES -from .base_backbone import BaseBackbone -from .utils import load_checkpoint, make_divisible - - -class InvertedResidual(nn.Module): - """InvertedResidual block for MobileNetV2. - - Args: - in_channels (int): The input channels of the InvertedResidual block. - out_channels (int): The output channels of the InvertedResidual block. - stride (int): Stride of the middle (first) 3x3 convolution. - expand_ratio (int): adjusts number of channels of the hidden layer - in InvertedResidual by this amount. - conv_cfg (dict): Config dict for convolution layer. - Default: None, which means using conv2d. - norm_cfg (dict): Config dict for normalization layer. - Default: dict(type='BN'). - act_cfg (dict): Config dict for activation layer. - Default: dict(type='ReLU6'). - with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. Default: False. 
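-
-    Example:
-        A short sketch (added; sizes are assumptions). With stride 1 and
-        matching channels the block uses a residual connection.
-
-        >>> import torch
-        >>> block = InvertedResidual(32, 32, stride=1, expand_ratio=6)
-        >>> out = block(torch.rand(1, 32, 56, 56))  # (1, 32, 56, 56)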
- """ - - def __init__(self, - in_channels, - out_channels, - stride, - expand_ratio, - conv_cfg=None, - norm_cfg=dict(type='BN'), - act_cfg=dict(type='ReLU6'), - with_cp=False): - # Protect mutable default arguments - norm_cfg = copy.deepcopy(norm_cfg) - act_cfg = copy.deepcopy(act_cfg) - super().__init__() - self.stride = stride - assert stride in [1, 2], f'stride must in [1, 2]. ' \ - f'But received {stride}.' - self.with_cp = with_cp - self.use_res_connect = self.stride == 1 and in_channels == out_channels - hidden_dim = int(round(in_channels * expand_ratio)) - - layers = [] - if expand_ratio != 1: - layers.append( - ConvModule( - in_channels=in_channels, - out_channels=hidden_dim, - kernel_size=1, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg)) - layers.extend([ - ConvModule( - in_channels=hidden_dim, - out_channels=hidden_dim, - kernel_size=3, - stride=stride, - padding=1, - groups=hidden_dim, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg), - ConvModule( - in_channels=hidden_dim, - out_channels=out_channels, - kernel_size=1, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=None) - ]) - self.conv = nn.Sequential(*layers) - - def forward(self, x): - - def _inner_forward(x): - if self.use_res_connect: - return x + self.conv(x) - return self.conv(x) - - if self.with_cp and x.requires_grad: - out = cp.checkpoint(_inner_forward, x) - else: - out = _inner_forward(x) - - return out - - -@BACKBONES.register_module() -class MobileNetV2(BaseBackbone): - """MobileNetV2 backbone. - - Args: - widen_factor (float): Width multiplier, multiply number of - channels in each layer by this amount. Default: 1.0. - out_indices (None or Sequence[int]): Output from which stages. - Default: (7, ). - frozen_stages (int): Stages to be frozen (all param fixed). - Default: -1, which means not freezing any parameters. - conv_cfg (dict): Config dict for convolution layer. - Default: None, which means using conv2d. - norm_cfg (dict): Config dict for normalization layer. - Default: dict(type='BN'). - act_cfg (dict): Config dict for activation layer. - Default: dict(type='ReLU6'). - norm_eval (bool): Whether to set norm layers to eval mode, namely, - freeze running stats (mean and var). Note: Effect on Batch Norm - and its variants only. Default: False. - with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. Default: False. - """ - - # Parameters to build layers. 4 parameters are needed to construct a - # layer, from left to right: expand_ratio, channel, num_blocks, stride. - arch_settings = [[1, 16, 1, 1], [6, 24, 2, 2], [6, 32, 3, 2], - [6, 64, 4, 2], [6, 96, 3, 1], [6, 160, 3, 2], - [6, 320, 1, 1]] - - def __init__(self, - widen_factor=1., - out_indices=(7, ), - frozen_stages=-1, - conv_cfg=None, - norm_cfg=dict(type='BN'), - act_cfg=dict(type='ReLU6'), - norm_eval=False, - with_cp=False): - # Protect mutable default arguments - norm_cfg = copy.deepcopy(norm_cfg) - act_cfg = copy.deepcopy(act_cfg) - super().__init__() - self.widen_factor = widen_factor - self.out_indices = out_indices - for index in out_indices: - if index not in range(0, 8): - raise ValueError('the item in out_indices must in ' - f'range(0, 8). But received {index}') - - if frozen_stages not in range(-1, 8): - raise ValueError('frozen_stages must be in range(-1, 8). 
' - f'But received {frozen_stages}') - self.out_indices = out_indices - self.frozen_stages = frozen_stages - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg - self.act_cfg = act_cfg - self.norm_eval = norm_eval - self.with_cp = with_cp - - self.in_channels = make_divisible(32 * widen_factor, 8) - - self.conv1 = ConvModule( - in_channels=3, - out_channels=self.in_channels, - kernel_size=3, - stride=2, - padding=1, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg) - - self.layers = [] - - for i, layer_cfg in enumerate(self.arch_settings): - expand_ratio, channel, num_blocks, stride = layer_cfg - out_channels = make_divisible(channel * widen_factor, 8) - inverted_res_layer = self.make_layer( - out_channels=out_channels, - num_blocks=num_blocks, - stride=stride, - expand_ratio=expand_ratio) - layer_name = f'layer{i + 1}' - self.add_module(layer_name, inverted_res_layer) - self.layers.append(layer_name) - - if widen_factor > 1.0: - self.out_channel = int(1280 * widen_factor) - else: - self.out_channel = 1280 - - layer = ConvModule( - in_channels=self.in_channels, - out_channels=self.out_channel, - kernel_size=1, - stride=1, - padding=0, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg) - self.add_module('conv2', layer) - self.layers.append('conv2') - - def make_layer(self, out_channels, num_blocks, stride, expand_ratio): - """Stack InvertedResidual blocks to build a layer for MobileNetV2. - - Args: - out_channels (int): out_channels of block. - num_blocks (int): number of blocks. - stride (int): stride of the first block. Default: 1 - expand_ratio (int): Expand the number of channels of the - hidden layer in InvertedResidual by this ratio. Default: 6. - """ - layers = [] - for i in range(num_blocks): - if i >= 1: - stride = 1 - layers.append( - InvertedResidual( - self.in_channels, - out_channels, - stride, - expand_ratio=expand_ratio, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg, - with_cp=self.with_cp)) - self.in_channels = out_channels - - return nn.Sequential(*layers) - - def init_weights(self, pretrained=None): - if isinstance(pretrained, str): - logger = logging.getLogger() - load_checkpoint(self, pretrained, strict=False, logger=logger) - elif pretrained is None: - for m in self.modules(): - if isinstance(m, nn.Conv2d): - kaiming_init(m) - elif isinstance(m, (_BatchNorm, nn.GroupNorm)): - constant_init(m, 1) - else: - raise TypeError('pretrained must be a str or None') - - def forward(self, x): - x = self.conv1(x) - - outs = [] - for i, layer_name in enumerate(self.layers): - layer = getattr(self, layer_name) - x = layer(x) - if i in self.out_indices: - outs.append(x) - - if len(outs) == 1: - return outs[0] - return tuple(outs) - - def _freeze_stages(self): - if self.frozen_stages >= 0: - for param in self.conv1.parameters(): - param.requires_grad = False - for i in range(1, self.frozen_stages + 1): - layer = getattr(self, f'layer{i}') - layer.eval() - for param in layer.parameters(): - param.requires_grad = False - - def train(self, mode=True): - super().train(mode) - self._freeze_stages() - if mode and self.norm_eval: - for m in self.modules(): - if isinstance(m, _BatchNorm): - m.eval() diff --git a/main/transformer_utils/mmpose/models/backbones/mobilenet_v3.py b/main/transformer_utils/mmpose/models/backbones/mobilenet_v3.py deleted file mode 100644 index d640abec79f06d689f2d4bc1e92999946bc07261..0000000000000000000000000000000000000000 --- 
a/main/transformer_utils/mmpose/models/backbones/mobilenet_v3.py +++ /dev/null @@ -1,188 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import copy -import logging - -import torch.nn as nn -from mmcv.cnn import ConvModule, constant_init, kaiming_init -from torch.nn.modules.batchnorm import _BatchNorm - -from ..builder import BACKBONES -from .base_backbone import BaseBackbone -from .utils import InvertedResidual, load_checkpoint - - -@BACKBONES.register_module() -class MobileNetV3(BaseBackbone): - """MobileNetV3 backbone. - - Args: - arch (str): Architecture of mobilnetv3, from {small, big}. - Default: small. - conv_cfg (dict): Config dict for convolution layer. - Default: None, which means using conv2d. - norm_cfg (dict): Config dict for normalization layer. - Default: dict(type='BN'). - out_indices (None or Sequence[int]): Output from which stages. - Default: (-1, ), which means output tensors from final stage. - frozen_stages (int): Stages to be frozen (all param fixed). - Default: -1, which means not freezing any parameters. - norm_eval (bool): Whether to set norm layers to eval mode, namely, - freeze running stats (mean and var). Note: Effect on Batch Norm - and its variants only. Default: False. - with_cp (bool): Use checkpoint or not. Using checkpoint will save - some memory while slowing down the training speed. - Default: False. - """ - # Parameters to build each block: - # [kernel size, mid channels, out channels, with_se, act type, stride] - arch_settings = { - 'small': [[3, 16, 16, True, 'ReLU', 2], - [3, 72, 24, False, 'ReLU', 2], - [3, 88, 24, False, 'ReLU', 1], - [5, 96, 40, True, 'HSwish', 2], - [5, 240, 40, True, 'HSwish', 1], - [5, 240, 40, True, 'HSwish', 1], - [5, 120, 48, True, 'HSwish', 1], - [5, 144, 48, True, 'HSwish', 1], - [5, 288, 96, True, 'HSwish', 2], - [5, 576, 96, True, 'HSwish', 1], - [5, 576, 96, True, 'HSwish', 1]], - 'big': [[3, 16, 16, False, 'ReLU', 1], - [3, 64, 24, False, 'ReLU', 2], - [3, 72, 24, False, 'ReLU', 1], - [5, 72, 40, True, 'ReLU', 2], - [5, 120, 40, True, 'ReLU', 1], - [5, 120, 40, True, 'ReLU', 1], - [3, 240, 80, False, 'HSwish', 2], - [3, 200, 80, False, 'HSwish', 1], - [3, 184, 80, False, 'HSwish', 1], - [3, 184, 80, False, 'HSwish', 1], - [3, 480, 112, True, 'HSwish', 1], - [3, 672, 112, True, 'HSwish', 1], - [5, 672, 160, True, 'HSwish', 1], - [5, 672, 160, True, 'HSwish', 2], - [5, 960, 160, True, 'HSwish', 1]] - } # yapf: disable - - def __init__(self, - arch='small', - conv_cfg=None, - norm_cfg=dict(type='BN'), - out_indices=(-1, ), - frozen_stages=-1, - norm_eval=False, - with_cp=False): - # Protect mutable default arguments - norm_cfg = copy.deepcopy(norm_cfg) - super().__init__() - assert arch in self.arch_settings - for index in out_indices: - if index not in range(-len(self.arch_settings[arch]), - len(self.arch_settings[arch])): - raise ValueError('the item in out_indices must in ' - f'range(0, {len(self.arch_settings[arch])}). ' - f'But received {index}') - - if frozen_stages not in range(-1, len(self.arch_settings[arch])): - raise ValueError('frozen_stages must be in range(-1, ' - f'{len(self.arch_settings[arch])}). 
' - f'But received {frozen_stages}') - self.arch = arch - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg - self.out_indices = out_indices - self.frozen_stages = frozen_stages - self.norm_eval = norm_eval - self.with_cp = with_cp - - self.in_channels = 16 - self.conv1 = ConvModule( - in_channels=3, - out_channels=self.in_channels, - kernel_size=3, - stride=2, - padding=1, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=dict(type='HSwish')) - - self.layers = self._make_layer() - self.feat_dim = self.arch_settings[arch][-1][2] - - def _make_layer(self): - layers = [] - layer_setting = self.arch_settings[self.arch] - for i, params in enumerate(layer_setting): - (kernel_size, mid_channels, out_channels, with_se, act, - stride) = params - if with_se: - se_cfg = dict( - channels=mid_channels, - ratio=4, - act_cfg=(dict(type='ReLU'), dict(type='HSigmoid'))) - else: - se_cfg = None - - layer = InvertedResidual( - in_channels=self.in_channels, - out_channels=out_channels, - mid_channels=mid_channels, - kernel_size=kernel_size, - stride=stride, - se_cfg=se_cfg, - with_expand_conv=True, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=dict(type=act), - with_cp=self.with_cp) - self.in_channels = out_channels - layer_name = f'layer{i + 1}' - self.add_module(layer_name, layer) - layers.append(layer_name) - return layers - - def init_weights(self, pretrained=None): - if isinstance(pretrained, str): - logger = logging.getLogger() - load_checkpoint(self, pretrained, strict=False, logger=logger) - elif pretrained is None: - for m in self.modules(): - if isinstance(m, nn.Conv2d): - kaiming_init(m) - elif isinstance(m, nn.BatchNorm2d): - constant_init(m, 1) - else: - raise TypeError('pretrained must be a str or None') - - def forward(self, x): - x = self.conv1(x) - - outs = [] - for i, layer_name in enumerate(self.layers): - layer = getattr(self, layer_name) - x = layer(x) - if i in self.out_indices or \ - i - len(self.layers) in self.out_indices: - outs.append(x) - - if len(outs) == 1: - return outs[0] - return tuple(outs) - - def _freeze_stages(self): - if self.frozen_stages >= 0: - for param in self.conv1.parameters(): - param.requires_grad = False - for i in range(1, self.frozen_stages + 1): - layer = getattr(self, f'layer{i}') - layer.eval() - for param in layer.parameters(): - param.requires_grad = False - - def train(self, mode=True): - super().train(mode) - self._freeze_stages() - if mode and self.norm_eval: - for m in self.modules(): - if isinstance(m, _BatchNorm): - m.eval() diff --git a/main/transformer_utils/mmpose/models/backbones/modules/basic_block.py b/main/transformer_utils/mmpose/models/backbones/modules/basic_block.py index 44feef44dfc43a7b40b82752d9a82df35f1108ba..7f93a99db49704b7e1aeb71fb5e209298465dcb0 100644 --- a/main/transformer_utils/mmpose/models/backbones/modules/basic_block.py +++ b/main/transformer_utils/mmpose/models/backbones/modules/basic_block.py @@ -12,13 +12,11 @@ import torch.nn as nn import torch.nn.functional as F import torch.utils.checkpoint as cp from .transformer_block import TransformerBlock - +from mmengine.model import constant_init, kaiming_init from mmcv.cnn import ( build_conv_layer, build_norm_layer, build_plugin_layer, - constant_init, - kaiming_init, ) diff --git a/main/transformer_utils/mmpose/models/backbones/mspn.py b/main/transformer_utils/mmpose/models/backbones/mspn.py deleted file mode 100644 index 71cee34e399780e8b67eac43d862b65a3ce05412..0000000000000000000000000000000000000000 --- 
a/main/transformer_utils/mmpose/models/backbones/mspn.py +++ /dev/null @@ -1,513 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import copy as cp -from collections import OrderedDict - -import torch.nn as nn -import torch.nn.functional as F -from mmcv.cnn import (ConvModule, MaxPool2d, constant_init, kaiming_init, - normal_init) -from mmcv.runner.checkpoint import load_state_dict - -from mmpose.utils import get_root_logger -from ..builder import BACKBONES -from .base_backbone import BaseBackbone -from .resnet import Bottleneck as _Bottleneck -from .utils.utils import get_state_dict - - -class Bottleneck(_Bottleneck): - expansion = 4 - """Bottleneck block for MSPN. - - Args: - in_channels (int): Input channels of this block. - out_channels (int): Output channels of this block. - stride (int): stride of the block. Default: 1 - downsample (nn.Module): downsample operation on identity branch. - Default: None - norm_cfg (dict): dictionary to construct and config norm layer. - Default: dict(type='BN') - """ - - def __init__(self, in_channels, out_channels, **kwargs): - super().__init__(in_channels, out_channels * 4, **kwargs) - - -class DownsampleModule(nn.Module): - """Downsample module for MSPN. - - Args: - block (nn.Module): Downsample block. - num_blocks (list): Number of blocks in each downsample unit. - num_units (int): Numbers of downsample units. Default: 4 - has_skip (bool): Have skip connections from prior upsample - module or not. Default:False - norm_cfg (dict): dictionary to construct and config norm layer. - Default: dict(type='BN') - in_channels (int): Number of channels of the input feature to - downsample module. Default: 64 - """ - - def __init__(self, - block, - num_blocks, - num_units=4, - has_skip=False, - norm_cfg=dict(type='BN'), - in_channels=64): - # Protect mutable default arguments - norm_cfg = cp.deepcopy(norm_cfg) - super().__init__() - self.has_skip = has_skip - self.in_channels = in_channels - assert len(num_blocks) == num_units - self.num_blocks = num_blocks - self.num_units = num_units - self.norm_cfg = norm_cfg - self.layer1 = self._make_layer(block, in_channels, num_blocks[0]) - for i in range(1, num_units): - module_name = f'layer{i + 1}' - self.add_module( - module_name, - self._make_layer( - block, in_channels * pow(2, i), num_blocks[i], stride=2)) - - def _make_layer(self, block, out_channels, blocks, stride=1): - downsample = None - if stride != 1 or self.in_channels != out_channels * block.expansion: - downsample = ConvModule( - self.in_channels, - out_channels * block.expansion, - kernel_size=1, - stride=stride, - padding=0, - norm_cfg=self.norm_cfg, - act_cfg=None, - inplace=True) - - units = list() - units.append( - block( - self.in_channels, - out_channels, - stride=stride, - downsample=downsample, - norm_cfg=self.norm_cfg)) - self.in_channels = out_channels * block.expansion - for _ in range(1, blocks): - units.append(block(self.in_channels, out_channels)) - - return nn.Sequential(*units) - - def forward(self, x, skip1, skip2): - out = list() - for i in range(self.num_units): - module_name = f'layer{i + 1}' - module_i = getattr(self, module_name) - x = module_i(x) - if self.has_skip: - x = x + skip1[i] + skip2[i] - out.append(x) - out.reverse() - - return tuple(out) - - -class UpsampleUnit(nn.Module): - """Upsample unit for upsample module. - - Args: - ind (int): Indicates whether to interpolate (>0) and whether to - generate feature map for the next hourglass-like module. - num_units (int): Number of units that form a upsample module. 
Along - with ind and gen_cross_conv, num_units is used to decide whether - to generate feature map for the next hourglass-like module. - in_channels (int): Channel number of the skip-in feature maps from - the corresponding downsample unit. - unit_channels (int): Channel number in this unit. Default:256. - gen_skip (bool): Whether or not to generate skips for the posterior - downsample module. Default:False - gen_cross_conv (bool): Whether to generate feature map for the next - hourglass-like module. Default:False - norm_cfg (dict): dictionary to construct and config norm layer. - Default: dict(type='BN') - out_channels (int): Number of channels of feature output by upsample - module. Must equal to in_channels of downsample module. Default:64 - """ - - def __init__(self, - ind, - num_units, - in_channels, - unit_channels=256, - gen_skip=False, - gen_cross_conv=False, - norm_cfg=dict(type='BN'), - out_channels=64): - # Protect mutable default arguments - norm_cfg = cp.deepcopy(norm_cfg) - super().__init__() - self.num_units = num_units - self.norm_cfg = norm_cfg - self.in_skip = ConvModule( - in_channels, - unit_channels, - kernel_size=1, - stride=1, - padding=0, - norm_cfg=self.norm_cfg, - act_cfg=None, - inplace=True) - self.relu = nn.ReLU(inplace=True) - - self.ind = ind - if self.ind > 0: - self.up_conv = ConvModule( - unit_channels, - unit_channels, - kernel_size=1, - stride=1, - padding=0, - norm_cfg=self.norm_cfg, - act_cfg=None, - inplace=True) - - self.gen_skip = gen_skip - if self.gen_skip: - self.out_skip1 = ConvModule( - in_channels, - in_channels, - kernel_size=1, - stride=1, - padding=0, - norm_cfg=self.norm_cfg, - inplace=True) - - self.out_skip2 = ConvModule( - unit_channels, - in_channels, - kernel_size=1, - stride=1, - padding=0, - norm_cfg=self.norm_cfg, - inplace=True) - - self.gen_cross_conv = gen_cross_conv - if self.ind == num_units - 1 and self.gen_cross_conv: - self.cross_conv = ConvModule( - unit_channels, - out_channels, - kernel_size=1, - stride=1, - padding=0, - norm_cfg=self.norm_cfg, - inplace=True) - - def forward(self, x, up_x): - out = self.in_skip(x) - - if self.ind > 0: - up_x = F.interpolate( - up_x, - size=(x.size(2), x.size(3)), - mode='bilinear', - align_corners=True) - up_x = self.up_conv(up_x) - out = out + up_x - out = self.relu(out) - - skip1 = None - skip2 = None - if self.gen_skip: - skip1 = self.out_skip1(x) - skip2 = self.out_skip2(out) - - cross_conv = None - if self.ind == self.num_units - 1 and self.gen_cross_conv: - cross_conv = self.cross_conv(out) - - return out, skip1, skip2, cross_conv - - -class UpsampleModule(nn.Module): - """Upsample module for MSPN. - - Args: - unit_channels (int): Channel number in the upsample units. - Default:256. - num_units (int): Number of upsample units. Default: 4 - gen_skip (bool): Whether to generate skip for posterior downsample - module or not. Default:False - gen_cross_conv (bool): Whether to generate feature map for the next - hourglass-like module. Default:False - norm_cfg (dict): dictionary to construct and config norm layer. - Default: dict(type='BN') - out_channels (int): Number of channels of feature output by upsample - module. Must equal to in_channels of downsample module.
Default:64 - """ - - def __init__(self, - unit_channels=256, - num_units=4, - gen_skip=False, - gen_cross_conv=False, - norm_cfg=dict(type='BN'), - out_channels=64): - # Protect mutable default arguments - norm_cfg = cp.deepcopy(norm_cfg) - super().__init__() - self.in_channels = list() - for i in range(num_units): - self.in_channels.append(Bottleneck.expansion * out_channels * - pow(2, i)) - self.in_channels.reverse() - self.num_units = num_units - self.gen_skip = gen_skip - self.gen_cross_conv = gen_cross_conv - self.norm_cfg = norm_cfg - for i in range(num_units): - module_name = f'up{i + 1}' - self.add_module( - module_name, - UpsampleUnit( - i, - self.num_units, - self.in_channels[i], - unit_channels, - self.gen_skip, - self.gen_cross_conv, - norm_cfg=self.norm_cfg, - out_channels=64)) - - def forward(self, x): - out = list() - skip1 = list() - skip2 = list() - cross_conv = None - for i in range(self.num_units): - module_i = getattr(self, f'up{i + 1}') - if i == 0: - outi, skip1_i, skip2_i, _ = module_i(x[i], None) - elif i == self.num_units - 1: - outi, skip1_i, skip2_i, cross_conv = module_i(x[i], out[i - 1]) - else: - outi, skip1_i, skip2_i, _ = module_i(x[i], out[i - 1]) - out.append(outi) - skip1.append(skip1_i) - skip2.append(skip2_i) - skip1.reverse() - skip2.reverse() - - return out, skip1, skip2, cross_conv - - -class SingleStageNetwork(nn.Module): - """Single_stage Network. - - Args: - unit_channels (int): Channel number in the upsample units. Default:256. - num_units (int): Numbers of downsample/upsample units. Default: 4 - gen_skip (bool): Whether to generate skip for posterior downsample - module or not. Default:False - gen_cross_conv (bool): Whether to generate feature map for the next - hourglass-like module. Default:False - has_skip (bool): Have skip connections from prior upsample - module or not. Default:False - num_blocks (list): Number of blocks in each downsample unit. - Default: [2, 2, 2, 2] Note: Make sure num_units==len(num_blocks) - norm_cfg (dict): dictionary to construct and config norm layer. - Default: dict(type='BN') - in_channels (int): Number of channels of the feature from ResNetTop. - Default: 64. - """ - - def __init__(self, - has_skip=False, - gen_skip=False, - gen_cross_conv=False, - unit_channels=256, - num_units=4, - num_blocks=[2, 2, 2, 2], - norm_cfg=dict(type='BN'), - in_channels=64): - # Protect mutable default arguments - norm_cfg = cp.deepcopy(norm_cfg) - num_blocks = cp.deepcopy(num_blocks) - super().__init__() - assert len(num_blocks) == num_units - self.has_skip = has_skip - self.gen_skip = gen_skip - self.gen_cross_conv = gen_cross_conv - self.num_units = num_units - self.unit_channels = unit_channels - self.num_blocks = num_blocks - self.norm_cfg = norm_cfg - - self.downsample = DownsampleModule(Bottleneck, num_blocks, num_units, - has_skip, norm_cfg, in_channels) - self.upsample = UpsampleModule(unit_channels, num_units, gen_skip, - gen_cross_conv, norm_cfg, in_channels) - - def forward(self, x, skip1, skip2): - mid = self.downsample(x, skip1, skip2) - out, skip1, skip2, cross_conv = self.upsample(mid) - - return out, skip1, skip2, cross_conv - - -class ResNetTop(nn.Module): - """ResNet top for MSPN. - - Args: - norm_cfg (dict): dictionary to construct and config norm layer. - Default: dict(type='BN') - channels (int): Number of channels of the feature output by ResNetTop. 
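To keep the deleted MSPN wiring readable: `SingleStageNetwork` above consumes a feature plus two skip lists and returns per-unit outputs, fresh skips, and a `cross_conv` feature that seeds the next stage. Below is a minimal sketch of that threading in plain PyTorch; `ToyStage` is a hypothetical stand-in, not part of the removed code.

```python
import torch
import torch.nn as nn

class ToyStage(nn.Module):
    """Hypothetical stand-in for SingleStageNetwork: one hourglass-like stage."""

    def __init__(self, channels=64):
        super().__init__()
        self.body = nn.Conv2d(channels, channels, 3, padding=1)

    def forward(self, x, skip1, skip2):
        if skip1 is not None:  # has_skip=True for every stage but the first
            x = x + skip1 + skip2
        out = self.body(x)
        # out feeds the stage losses; the skips and cross_conv feed the next stage
        return out, out, out, out

stages = nn.ModuleList(ToyStage() for _ in range(4))
x, skip1, skip2 = torch.randn(1, 64, 64, 64), None, None
out_feats = []
for stage in stages:  # mirrors the loop in MSPN.forward
    out, skip1, skip2, x = stage(x, skip1, skip2)
    out_feats.append(out)
```

The real modules pass lists of per-unit tensors rather than single feature maps, but the stage-to-stage handoff is exactly this four-tuple.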
- """ - - def __init__(self, norm_cfg=dict(type='BN'), channels=64): - # Protect mutable default arguments - norm_cfg = cp.deepcopy(norm_cfg) - super().__init__() - self.top = nn.Sequential( - ConvModule( - 3, - channels, - kernel_size=7, - stride=2, - padding=3, - norm_cfg=norm_cfg, - inplace=True), MaxPool2d(kernel_size=3, stride=2, padding=1)) - - def forward(self, img): - return self.top(img) - - -@BACKBONES.register_module() -class MSPN(BaseBackbone): - """MSPN backbone. Paper ref: Li et al. "Rethinking on Multi-Stage Networks - for Human Pose Estimation" (CVPR 2020). - - Args: - unit_channels (int): Number of Channels in an upsample unit. - Default: 256 - num_stages (int): Number of stages in a multi-stage MSPN. Default: 4 - num_units (int): Number of downsample/upsample units in a single-stage - network. Default: 4 - Note: Make sure num_units == len(self.num_blocks) - num_blocks (list): Number of bottlenecks in each - downsample unit. Default: [2, 2, 2, 2] - norm_cfg (dict): dictionary to construct and config norm layer. - Default: dict(type='BN') - res_top_channels (int): Number of channels of feature from ResNetTop. - Default: 64. - - Example: - >>> from mmpose.models import MSPN - >>> import torch - >>> self = MSPN(num_stages=2,num_units=2,num_blocks=[2,2]) - >>> self.eval() - >>> inputs = torch.rand(1, 3, 511, 511) - >>> level_outputs = self.forward(inputs) - >>> for level_output in level_outputs: - ... for feature in level_output: - ... print(tuple(feature.shape)) - ... - (1, 256, 64, 64) - (1, 256, 128, 128) - (1, 256, 64, 64) - (1, 256, 128, 128) - """ - - def __init__(self, - unit_channels=256, - num_stages=4, - num_units=4, - num_blocks=[2, 2, 2, 2], - norm_cfg=dict(type='BN'), - res_top_channels=64): - # Protect mutable default arguments - norm_cfg = cp.deepcopy(norm_cfg) - num_blocks = cp.deepcopy(num_blocks) - super().__init__() - self.unit_channels = unit_channels - self.num_stages = num_stages - self.num_units = num_units - self.num_blocks = num_blocks - self.norm_cfg = norm_cfg - - assert self.num_stages > 0 - assert self.num_units > 1 - assert self.num_units == len(self.num_blocks) - self.top = ResNetTop(norm_cfg=norm_cfg) - self.multi_stage_mspn = nn.ModuleList([]) - for i in range(self.num_stages): - if i == 0: - has_skip = False - else: - has_skip = True - if i != self.num_stages - 1: - gen_skip = True - gen_cross_conv = True - else: - gen_skip = False - gen_cross_conv = False - self.multi_stage_mspn.append( - SingleStageNetwork(has_skip, gen_skip, gen_cross_conv, - unit_channels, num_units, num_blocks, - norm_cfg, res_top_channels)) - - def forward(self, x): - """Model forward function.""" - out_feats = [] - skip1 = None - skip2 = None - x = self.top(x) - for i in range(self.num_stages): - out, skip1, skip2, x = self.multi_stage_mspn[i](x, skip1, skip2) - out_feats.append(out) - - return out_feats - - def init_weights(self, pretrained=None): - """Initialize model weights.""" - if isinstance(pretrained, str): - logger = get_root_logger() - state_dict_tmp = get_state_dict(pretrained) - state_dict = OrderedDict() - state_dict['top'] = OrderedDict() - state_dict['bottlenecks'] = OrderedDict() - for k, v in state_dict_tmp.items(): - if k.startswith('layer'): - if 'downsample.0' in k: - state_dict['bottlenecks'][k.replace( - 'downsample.0', 'downsample.conv')] = v - elif 'downsample.1' in k: - state_dict['bottlenecks'][k.replace( - 'downsample.1', 'downsample.bn')] = v - else: - state_dict['bottlenecks'][k] = v - elif k.startswith('conv1'): - 
state_dict['top'][k.replace('conv1', 'top.0.conv')] = v - elif k.startswith('bn1'): - state_dict['top'][k.replace('bn1', 'top.0.bn')] = v - - load_state_dict( - self.top, state_dict['top'], strict=False, logger=logger) - for i in range(self.num_stages): - load_state_dict( - self.multi_stage_mspn[i].downsample, - state_dict['bottlenecks'], - strict=False, - logger=logger) - else: - for m in self.multi_stage_mspn.modules(): - if isinstance(m, nn.Conv2d): - kaiming_init(m) - elif isinstance(m, nn.BatchNorm2d): - constant_init(m, 1) - elif isinstance(m, nn.Linear): - normal_init(m, std=0.01) - - for m in self.top.modules(): - if isinstance(m, nn.Conv2d): - kaiming_init(m) diff --git a/main/transformer_utils/mmpose/models/backbones/pvt.py b/main/transformer_utils/mmpose/models/backbones/pvt.py deleted file mode 100644 index 62527a7dc817513c08f42ccbb166c75cab514873..0000000000000000000000000000000000000000 --- a/main/transformer_utils/mmpose/models/backbones/pvt.py +++ /dev/null @@ -1,592 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import math -import warnings - -import numpy as np -import torch -import torch.nn as nn -import torch.nn.functional as F -from mmcv.cnn import (Conv2d, build_activation_layer, build_norm_layer, - constant_init, normal_init, trunc_normal_init) -from mmcv.cnn.bricks.drop import build_dropout -from mmcv.cnn.bricks.transformer import MultiheadAttention -from mmcv.cnn.utils.weight_init import trunc_normal_ -from mmcv.runner import (BaseModule, ModuleList, Sequential, _load_checkpoint, - load_state_dict) -from torch.nn.modules.utils import _pair as to_2tuple - -from ...utils import get_root_logger -from ..builder import BACKBONES -from ..utils import PatchEmbed, nchw_to_nlc, nlc_to_nchw, pvt_convert - - -class MixFFN(BaseModule): - """An implementation of MixFFN of PVT. - - The differences between MixFFN & FFN: - 1. Use 1X1 Conv to replace Linear layer. - 2. Introduce 3X3 Depth-wise Conv to encode positional information. - - Args: - embed_dims (int): The feature dimension. Same as - `MultiheadAttention`. - feedforward_channels (int): The hidden dimension of FFNs. - act_cfg (dict, optional): The activation config for FFNs. - Default: dict(type='GELU'). - ffn_drop (float, optional): Probability of an element to be - zeroed in FFN. Default 0.0. - dropout_layer (obj:`ConfigDict`): The dropout_layer used - when adding the shortcut. - Default: None. - use_conv (bool): If True, add 3x3 DWConv between two Linear layers. - Defaults: False. - init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. - Default: None. 
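The MixFFN documented above replaces both Linear layers of a standard transformer FFN with 1x1 convolutions and can insert a 3x3 depth-wise convolution to encode position. A minimal pure-PyTorch sketch of that forward path follows; `ToyMixFFN` is an illustrative name, and the real module's dropout and ConvModule plumbing are elided.

```python
import torch
import torch.nn as nn

class ToyMixFFN(nn.Module):
    # 1x1 convs stand in for the usual Linear layers; the 3x3 depth-wise
    # conv injects positional information (the "Mix" in MixFFN).
    def __init__(self, dim=64, hidden=256):
        super().__init__()
        self.fc1 = nn.Conv2d(dim, hidden, 1)
        self.dw = nn.Conv2d(hidden, hidden, 3, padding=1, groups=hidden)
        self.act = nn.GELU()
        self.fc2 = nn.Conv2d(hidden, dim, 1)

    def forward(self, x, hw_shape):
        B, L, C = x.shape
        H, W = hw_shape
        out = x.transpose(1, 2).reshape(B, C, H, W)   # NLC -> NCHW
        out = self.fc2(self.act(self.dw(self.fc1(out))))
        out = out.flatten(2).transpose(1, 2)          # NCHW -> NLC
        return x + out                                # residual shortcut

y = ToyMixFFN()(torch.randn(2, 16 * 16, 64), (16, 16))
```

The NLC/NCHW round trip is the same job `nlc_to_nchw`/`nchw_to_nlc` do in the deleted code.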
- """ - - def __init__(self, - embed_dims, - feedforward_channels, - act_cfg=dict(type='GELU'), - ffn_drop=0., - dropout_layer=None, - use_conv=False, - init_cfg=None): - super(MixFFN, self).__init__(init_cfg=init_cfg) - - self.embed_dims = embed_dims - self.feedforward_channels = feedforward_channels - self.act_cfg = act_cfg - activate = build_activation_layer(act_cfg) - - in_channels = embed_dims - fc1 = Conv2d( - in_channels=in_channels, - out_channels=feedforward_channels, - kernel_size=1, - stride=1, - bias=True) - if use_conv: - # 3x3 depth wise conv to provide positional encode information - dw_conv = Conv2d( - in_channels=feedforward_channels, - out_channels=feedforward_channels, - kernel_size=3, - stride=1, - padding=(3 - 1) // 2, - bias=True, - groups=feedforward_channels) - fc2 = Conv2d( - in_channels=feedforward_channels, - out_channels=in_channels, - kernel_size=1, - stride=1, - bias=True) - drop = nn.Dropout(ffn_drop) - layers = [fc1, activate, drop, fc2, drop] - if use_conv: - layers.insert(1, dw_conv) - self.layers = Sequential(*layers) - self.dropout_layer = build_dropout( - dropout_layer) if dropout_layer else torch.nn.Identity() - - def forward(self, x, hw_shape, identity=None): - out = nlc_to_nchw(x, hw_shape) - out = self.layers(out) - out = nchw_to_nlc(out) - if identity is None: - identity = x - return identity + self.dropout_layer(out) - - -class SpatialReductionAttention(MultiheadAttention): - """An implementation of Spatial Reduction Attention of PVT. - - This module is modified from MultiheadAttention which is a module from - mmcv.cnn.bricks.transformer. - - Args: - embed_dims (int): The embedding dimension. - num_heads (int): Parallel attention heads. - attn_drop (float): A Dropout layer on attn_output_weights. - Default: 0.0. - proj_drop (float): A Dropout layer after `nn.MultiheadAttention`. - Default: 0.0. - dropout_layer (obj:`ConfigDict`): The dropout_layer used - when adding the shortcut. Default: None. - batch_first (bool): Key, Query and Value are shape of - (batch, n, embed_dim) - or (n, batch, embed_dim). Default: False. - qkv_bias (bool): enable bias for qkv if True. Default: True. - norm_cfg (dict): Config dict for normalization layer. - Default: dict(type='LN'). - sr_ratio (int): The ratio of spatial reduction of Spatial Reduction - Attention of PVT. Default: 1. - init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. - Default: None. - """ - - def __init__(self, - embed_dims, - num_heads, - attn_drop=0., - proj_drop=0., - dropout_layer=None, - batch_first=True, - qkv_bias=True, - norm_cfg=dict(type='LN'), - sr_ratio=1, - init_cfg=None): - super().__init__( - embed_dims, - num_heads, - attn_drop, - proj_drop, - batch_first=batch_first, - dropout_layer=dropout_layer, - bias=qkv_bias, - init_cfg=init_cfg) - - self.sr_ratio = sr_ratio - if sr_ratio > 1: - self.sr = Conv2d( - in_channels=embed_dims, - out_channels=embed_dims, - kernel_size=sr_ratio, - stride=sr_ratio) - # The ret[0] of build_norm_layer is norm name. - self.norm = build_norm_layer(norm_cfg, embed_dims)[1] - - # handle the BC-breaking from https://github.com/open-mmlab/mmcv/pull/1418 # noqa - from mmpose import digit_version, mmcv_version - if mmcv_version < digit_version('1.3.17'): - warnings.warn('The legacy version of forward function in' - 'SpatialReductionAttention is deprecated in' - 'mmcv>=1.3.17 and will no longer support in the' - 'future. 
Please upgrade your mmcv.') - self.forward = self.legacy_forward - - def forward(self, x, hw_shape, identity=None): - - x_q = x - if self.sr_ratio > 1: - x_kv = nlc_to_nchw(x, hw_shape) - x_kv = self.sr(x_kv) - x_kv = nchw_to_nlc(x_kv) - x_kv = self.norm(x_kv) - else: - x_kv = x - - if identity is None: - identity = x_q - - # Because the dataflow('key', 'query', 'value') of - # ``torch.nn.MultiheadAttention`` is (num_query, batch, - # embed_dims), We should adjust the shape of dataflow from - # batch_first (batch, num_query, embed_dims) to num_query_first - # (num_query ,batch, embed_dims), and recover ``attn_output`` - # from num_query_first to batch_first. - if self.batch_first: - x_q = x_q.transpose(0, 1) - x_kv = x_kv.transpose(0, 1) - - out = self.attn(query=x_q, key=x_kv, value=x_kv)[0] - - if self.batch_first: - out = out.transpose(0, 1) - - return identity + self.dropout_layer(self.proj_drop(out)) - - def legacy_forward(self, x, hw_shape, identity=None): - """multi head attention forward in mmcv version < 1.3.17.""" - x_q = x - if self.sr_ratio > 1: - x_kv = nlc_to_nchw(x, hw_shape) - x_kv = self.sr(x_kv) - x_kv = nchw_to_nlc(x_kv) - x_kv = self.norm(x_kv) - else: - x_kv = x - - if identity is None: - identity = x_q - - out = self.attn(query=x_q, key=x_kv, value=x_kv)[0] - - return identity + self.dropout_layer(self.proj_drop(out)) - - -class PVTEncoderLayer(BaseModule): - """Implements one encoder layer in PVT. - - Args: - embed_dims (int): The feature dimension. - num_heads (int): Parallel attention heads. - feedforward_channels (int): The hidden dimension for FFNs. - drop_rate (float): Probability of an element to be zeroed. - after the feed forward layer. Default: 0.0. - attn_drop_rate (float): The drop out rate for attention layer. - Default: 0.0. - drop_path_rate (float): stochastic depth rate. Default: 0.0. - qkv_bias (bool): enable bias for qkv if True. - Default: True. - act_cfg (dict): The activation config for FFNs. - Default: dict(type='GELU'). - norm_cfg (dict): Config dict for normalization layer. - Default: dict(type='LN'). - sr_ratio (int): The ratio of spatial reduction of Spatial Reduction - Attention of PVT. Default: 1. - use_conv_ffn (bool): If True, use Convolutional FFN to replace FFN. - Default: False. - init_cfg (dict, optional): Initialization config dict. - Default: None. - """ - - def __init__(self, - embed_dims, - num_heads, - feedforward_channels, - drop_rate=0., - attn_drop_rate=0., - drop_path_rate=0., - qkv_bias=True, - act_cfg=dict(type='GELU'), - norm_cfg=dict(type='LN'), - sr_ratio=1, - use_conv_ffn=False, - init_cfg=None): - super(PVTEncoderLayer, self).__init__(init_cfg=init_cfg) - - # The ret[0] of build_norm_layer is norm name. - self.norm1 = build_norm_layer(norm_cfg, embed_dims)[1] - - self.attn = SpatialReductionAttention( - embed_dims=embed_dims, - num_heads=num_heads, - attn_drop=attn_drop_rate, - proj_drop=drop_rate, - dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate), - qkv_bias=qkv_bias, - norm_cfg=norm_cfg, - sr_ratio=sr_ratio) - - # The ret[0] of build_norm_layer is norm name. 
- self.norm2 = build_norm_layer(norm_cfg, embed_dims)[1] - - self.ffn = MixFFN( - embed_dims=embed_dims, - feedforward_channels=feedforward_channels, - ffn_drop=drop_rate, - dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate), - use_conv=use_conv_ffn, - act_cfg=act_cfg) - - def forward(self, x, hw_shape): - x = self.attn(self.norm1(x), hw_shape, identity=x) - x = self.ffn(self.norm2(x), hw_shape, identity=x) - - return x - - -class AbsolutePositionEmbedding(BaseModule): - """An implementation of the absolute position embedding in PVT. - - Args: - pos_shape (int): The shape of the absolute position embedding. - pos_dim (int): The dimension of the absolute position embedding. - drop_rate (float): Probability of an element to be zeroed. - Default: 0.0. - """ - - def __init__(self, pos_shape, pos_dim, drop_rate=0., init_cfg=None): - super().__init__(init_cfg=init_cfg) - - if isinstance(pos_shape, int): - pos_shape = to_2tuple(pos_shape) - elif isinstance(pos_shape, tuple): - if len(pos_shape) == 1: - pos_shape = to_2tuple(pos_shape[0]) - assert len(pos_shape) == 2, \ - f'The size of image should have length 1 or 2, ' \ - f'but got {len(pos_shape)}' - self.pos_shape = pos_shape - self.pos_dim = pos_dim - - self.pos_embed = nn.Parameter( - torch.zeros(1, pos_shape[0] * pos_shape[1], pos_dim)) - self.drop = nn.Dropout(p=drop_rate) - - def init_weights(self): - trunc_normal_(self.pos_embed, std=0.02) - - def resize_pos_embed(self, pos_embed, input_shape, mode='bilinear'): - """Resize pos_embed weights. - - Resize pos_embed using bilinear interpolate method. - - Args: - pos_embed (torch.Tensor): Position embedding weights. - input_shape (tuple): Tuple for (downsampled input image height, - downsampled input image width). - mode (str): Algorithm used for upsampling: - ``'nearest'`` | ``'linear'`` | ``'bilinear'`` | ``'bicubic'`` | - ``'trilinear'``. Default: ``'bilinear'``. - - Return: - torch.Tensor: The resized pos_embed of shape [B, L_new, C]. - """ - assert pos_embed.ndim == 3, 'shape of pos_embed must be [B, L, C]' - pos_h, pos_w = self.pos_shape - pos_embed_weight = pos_embed[:, (-1 * pos_h * pos_w):] - pos_embed_weight = pos_embed_weight.reshape( - 1, pos_h, pos_w, self.pos_dim).permute(0, 3, 1, 2).contiguous() - pos_embed_weight = F.interpolate( - pos_embed_weight, size=input_shape, mode=mode) - pos_embed_weight = torch.flatten(pos_embed_weight, - 2).transpose(1, 2).contiguous() - pos_embed = pos_embed_weight - - return pos_embed - - def forward(self, x, hw_shape, mode='bilinear'): - pos_embed = self.resize_pos_embed(self.pos_embed, hw_shape, mode) - return self.drop(x + pos_embed) - - -@BACKBONES.register_module() -class PyramidVisionTransformer(BaseModule): - """Pyramid Vision Transformer (PVT) - - Implementation of `Pyramid Vision Transformer: A Versatile Backbone for - Dense Prediction without Convolutions - `_. - - Args: - pretrain_img_size (int | tuple[int]): The size of input image when - pretrain. Defaults: 224. - in_channels (int): Number of input channels. Default: 3. - embed_dims (int): Embedding dimension. Default: 64. - num_stags (int): The num of stages. Default: 4. - num_layers (Sequence[int]): The layer number of each transformer encode - layer. Default: [3, 4, 6, 3]. - num_heads (Sequence[int]): The attention heads of each transformer - encode layer. Default: [1, 2, 5, 8]. - patch_sizes (Sequence[int]): The patch_size of each patch embedding. - Default: [4, 2, 2, 2]. - strides (Sequence[int]): The stride of each patch embedding. - Default: [4, 2, 2, 2]. 
- paddings (Sequence[int]): The padding of each patch embedding. - Default: [0, 0, 0, 0]. - sr_ratios (Sequence[int]): The spatial reduction rate of each - transformer encode layer. Default: [8, 4, 2, 1]. - out_indices (Sequence[int] | int): Output from which stages. - Default: (0, 1, 2, 3). - mlp_ratios (Sequence[int]): The ratio of the mlp hidden dim to the - embedding dim of each transformer encode layer. - Default: [8, 8, 4, 4]. - qkv_bias (bool): Enable bias for qkv if True. Default: True. - drop_rate (float): Probability of an element to be zeroed. - Default 0.0. - attn_drop_rate (float): The drop out rate for attention layer. - Default 0.0. - drop_path_rate (float): stochastic depth rate. Default 0.1. - use_abs_pos_embed (bool): If True, add absolute position embedding to - the patch embedding. Defaults: True. - use_conv_ffn (bool): If True, use Convolutional FFN to replace FFN. - Default: False. - act_cfg (dict): The activation config for FFNs. - Default: dict(type='GELU'). - norm_cfg (dict): Config dict for normalization layer. - Default: dict(type='LN'). - pretrained (str, optional): model pretrained path. Default: None. - convert_weights (bool): The flag indicates whether the - pre-trained model is from the original repo. We may need - to convert some keys to make it compatible. - Default: True. - init_cfg (dict or list[dict], optional): Initialization config dict. - Default: None. - """ - - def __init__(self, - pretrain_img_size=224, - in_channels=3, - embed_dims=64, - num_stages=4, - num_layers=[3, 4, 6, 3], - num_heads=[1, 2, 5, 8], - patch_sizes=[4, 2, 2, 2], - strides=[4, 2, 2, 2], - paddings=[0, 0, 0, 0], - sr_ratios=[8, 4, 2, 1], - out_indices=(0, 1, 2, 3), - mlp_ratios=[8, 8, 4, 4], - qkv_bias=True, - drop_rate=0., - attn_drop_rate=0., - drop_path_rate=0.1, - use_abs_pos_embed=True, - norm_after_stage=False, - use_conv_ffn=False, - act_cfg=dict(type='GELU'), - norm_cfg=dict(type='LN', eps=1e-6), - pretrained=None, - convert_weights=True, - init_cfg=None): - super().__init__(init_cfg=init_cfg) - - self.convert_weights = convert_weights - if isinstance(pretrain_img_size, int): - pretrain_img_size = to_2tuple(pretrain_img_size) - elif isinstance(pretrain_img_size, tuple): - if len(pretrain_img_size) == 1: - pretrain_img_size = to_2tuple(pretrain_img_size[0]) - assert len(pretrain_img_size) == 2, \ - f'The size of image should have length 1 or 2, ' \ - f'but got {len(pretrain_img_size)}' - - assert not (init_cfg and pretrained), \ - 'init_cfg and pretrained cannot be setting at the same time' - if isinstance(pretrained, str): - self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) - elif pretrained is None: - self.init_cfg = init_cfg - else: - raise TypeError('pretrained must be a str or None') - - self.embed_dims = embed_dims - - self.num_stages = num_stages - self.num_layers = num_layers - self.num_heads = num_heads - self.patch_sizes = patch_sizes - self.strides = strides - self.sr_ratios = sr_ratios - assert num_stages == len(num_layers) == len(num_heads) \ - == len(patch_sizes) == len(strides) == len(sr_ratios) - - self.out_indices = out_indices - assert max(out_indices) < self.num_stages - self.pretrained = pretrained - - # transformer encoder - dpr = [ - x.item() - for x in torch.linspace(0, drop_path_rate, sum(num_layers)) - ] # stochastic num_layer decay rule - - cur = 0 - self.layers = ModuleList() - for i, num_layer in enumerate(num_layers): - embed_dims_i = embed_dims * num_heads[i] - patch_embed = PatchEmbed( - in_channels=in_channels, - 
embed_dims=embed_dims_i, - kernel_size=patch_sizes[i], - stride=strides[i], - padding=paddings[i], - bias=True, - norm_cfg=norm_cfg) - - layers = ModuleList() - if use_abs_pos_embed: - pos_shape = pretrain_img_size // np.prod(patch_sizes[:i + 1]) - pos_embed = AbsolutePositionEmbedding( - pos_shape=pos_shape, - pos_dim=embed_dims_i, - drop_rate=drop_rate) - layers.append(pos_embed) - layers.extend([ - PVTEncoderLayer( - embed_dims=embed_dims_i, - num_heads=num_heads[i], - feedforward_channels=mlp_ratios[i] * embed_dims_i, - drop_rate=drop_rate, - attn_drop_rate=attn_drop_rate, - drop_path_rate=dpr[cur + idx], - qkv_bias=qkv_bias, - act_cfg=act_cfg, - norm_cfg=norm_cfg, - sr_ratio=sr_ratios[i], - use_conv_ffn=use_conv_ffn) for idx in range(num_layer) - ]) - in_channels = embed_dims_i - # The ret[0] of build_norm_layer is norm name. - if norm_after_stage: - norm = build_norm_layer(norm_cfg, embed_dims_i)[1] - else: - norm = nn.Identity() - self.layers.append(ModuleList([patch_embed, layers, norm])) - cur += num_layer - - def init_weights(self, pretrained=None): - if isinstance(pretrained, str): - self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) - - logger = get_root_logger() - if self.init_cfg is None: - logger.warn(f'No pre-trained weights for ' - f'{self.__class__.__name__}, ' - f'training start from scratch') - for m in self.modules(): - if isinstance(m, nn.Linear): - trunc_normal_init(m, std=.02, bias=0.) - elif isinstance(m, nn.LayerNorm): - constant_init(m, 1.0) - elif isinstance(m, nn.Conv2d): - fan_out = m.kernel_size[0] * m.kernel_size[ - 1] * m.out_channels - fan_out //= m.groups - normal_init(m, 0, math.sqrt(2.0 / fan_out)) - elif isinstance(m, AbsolutePositionEmbedding): - m.init_weights() - else: - assert 'checkpoint' in self.init_cfg, f'Only support ' \ - f'specify `Pretrained` in ' \ - f'`init_cfg` in ' \ - f'{self.__class__.__name__} ' - checkpoint = _load_checkpoint( - self.init_cfg['checkpoint'], logger=logger, map_location='cpu') - logger.warn(f'Load pre-trained model for ' - f'{self.__class__.__name__} from original repo') - if 'state_dict' in checkpoint: - state_dict = checkpoint['state_dict'] - elif 'model' in checkpoint: - state_dict = checkpoint['model'] - else: - state_dict = checkpoint - if self.convert_weights: - # Because pvt backbones are not supported by mmcls, - # so we need to convert pre-trained weights to match this - # implementation. 
- state_dict = pvt_convert(state_dict) - load_state_dict(self, state_dict, strict=False, logger=logger) - - def forward(self, x): - outs = [] - - for i, layer in enumerate(self.layers): - x, hw_shape = layer[0](x) - - for block in layer[1]: - x = block(x, hw_shape) - x = layer[2](x) - x = nlc_to_nchw(x, hw_shape) - if i in self.out_indices: - outs.append(x) - - return outs - - -@BACKBONES.register_module() -class PyramidVisionTransformerV2(PyramidVisionTransformer): - """Implementation of `PVTv2: Improved Baselines with Pyramid Vision - Transformer `_.""" - - def __init__(self, **kwargs): - super(PyramidVisionTransformerV2, self).__init__( - patch_sizes=[7, 3, 3, 3], - paddings=[3, 1, 1, 1], - use_abs_pos_embed=False, - norm_after_stage=True, - use_conv_ffn=True, - **kwargs) diff --git a/main/transformer_utils/mmpose/models/backbones/regnet.py b/main/transformer_utils/mmpose/models/backbones/regnet.py deleted file mode 100644 index 693417c2d61066e4e9a90989ad61700448028e58..0000000000000000000000000000000000000000 --- a/main/transformer_utils/mmpose/models/backbones/regnet.py +++ /dev/null @@ -1,317 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import copy - -import numpy as np -import torch.nn as nn -from mmcv.cnn import build_conv_layer, build_norm_layer - -from ..builder import BACKBONES -from .resnet import ResNet -from .resnext import Bottleneck - - -@BACKBONES.register_module() -class RegNet(ResNet): - """RegNet backbone. - - More details can be found in `paper `__ . - - Args: - arch (dict): The parameter of RegNets. - - w0 (int): initial width - - wa (float): slope of width - - wm (float): quantization parameter to quantize the width - - depth (int): depth of the backbone - - group_w (int): width of group - - bot_mul (float): bottleneck ratio, i.e. expansion of bottleneck. - strides (Sequence[int]): Strides of the first block of each stage. - base_channels (int): Base channels after stem layer. - in_channels (int): Number of input image channels. Default: 3. - dilations (Sequence[int]): Dilation of each stage. - out_indices (Sequence[int]): Output from which stages. - style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two - layer is the 3x3 conv layer, otherwise the stride-two layer is - the first 1x1 conv layer. Default: "pytorch". - frozen_stages (int): Stages to be frozen (all param fixed). -1 means - not freezing any parameters. Default: -1. - norm_cfg (dict): dictionary to construct and config norm layer. - Default: dict(type='BN', requires_grad=True). - norm_eval (bool): Whether to set norm layers to eval mode, namely, - freeze running stats (mean and var). Note: Effect on Batch Norm - and its variants only. Default: False. - with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. Default: False. - zero_init_residual (bool): whether to use zero init for last norm layer - in resblocks to let them behave as identity. Default: True. - - Example: - >>> from mmpose.models import RegNet - >>> import torch - >>> self = RegNet( - arch=dict( - w0=88, - wa=26.31, - wm=2.25, - group_w=48, - depth=25, - bot_mul=1.0), - out_indices=(0, 1, 2, 3)) - >>> self.eval() - >>> inputs = torch.rand(1, 3, 32, 32) - >>> level_outputs = self.forward(inputs) - >>> for level_out in level_outputs: - ... 
print(tuple(level_out.shape)) - (1, 96, 8, 8) - (1, 192, 4, 4) - (1, 432, 2, 2) - (1, 1008, 1, 1) - """ - arch_settings = { - 'regnetx_400mf': - dict(w0=24, wa=24.48, wm=2.54, group_w=16, depth=22, bot_mul=1.0), - 'regnetx_800mf': - dict(w0=56, wa=35.73, wm=2.28, group_w=16, depth=16, bot_mul=1.0), - 'regnetx_1.6gf': - dict(w0=80, wa=34.01, wm=2.25, group_w=24, depth=18, bot_mul=1.0), - 'regnetx_3.2gf': - dict(w0=88, wa=26.31, wm=2.25, group_w=48, depth=25, bot_mul=1.0), - 'regnetx_4.0gf': - dict(w0=96, wa=38.65, wm=2.43, group_w=40, depth=23, bot_mul=1.0), - 'regnetx_6.4gf': - dict(w0=184, wa=60.83, wm=2.07, group_w=56, depth=17, bot_mul=1.0), - 'regnetx_8.0gf': - dict(w0=80, wa=49.56, wm=2.88, group_w=120, depth=23, bot_mul=1.0), - 'regnetx_12gf': - dict(w0=168, wa=73.36, wm=2.37, group_w=112, depth=19, bot_mul=1.0), - } - - def __init__(self, - arch, - in_channels=3, - stem_channels=32, - base_channels=32, - strides=(2, 2, 2, 2), - dilations=(1, 1, 1, 1), - out_indices=(3, ), - style='pytorch', - deep_stem=False, - avg_down=False, - frozen_stages=-1, - conv_cfg=None, - norm_cfg=dict(type='BN', requires_grad=True), - norm_eval=False, - with_cp=False, - zero_init_residual=True): - # Protect mutable default arguments - norm_cfg = copy.deepcopy(norm_cfg) - super(ResNet, self).__init__() - - # Generate RegNet parameters first - if isinstance(arch, str): - assert arch in self.arch_settings, \ - f'"arch": "{arch}" is not one of the' \ - ' arch_settings' - arch = self.arch_settings[arch] - elif not isinstance(arch, dict): - raise TypeError('Expect "arch" to be either a string ' - f'or a dict, got {type(arch)}') - - widths, num_stages = self.generate_regnet( - arch['w0'], - arch['wa'], - arch['wm'], - arch['depth'], - ) - # Convert to per stage format - stage_widths, stage_blocks = self.get_stages_from_blocks(widths) - # Generate group widths and bot muls - group_widths = [arch['group_w'] for _ in range(num_stages)] - self.bottleneck_ratio = [arch['bot_mul'] for _ in range(num_stages)] - # Adjust the compatibility of stage_widths and group_widths - stage_widths, group_widths = self.adjust_width_group( - stage_widths, self.bottleneck_ratio, group_widths) - - # Group params by stage - self.stage_widths = stage_widths - self.group_widths = group_widths - self.depth = sum(stage_blocks) - self.stem_channels = stem_channels - self.base_channels = base_channels - self.num_stages = num_stages - assert 1 <= num_stages <= 4 - self.strides = strides - self.dilations = dilations - assert len(strides) == len(dilations) == num_stages - self.out_indices = out_indices - assert max(out_indices) < num_stages - self.style = style - self.deep_stem = deep_stem - if self.deep_stem: - raise NotImplementedError( - 'deep_stem has not been implemented for RegNet') - self.avg_down = avg_down - self.frozen_stages = frozen_stages - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg - self.with_cp = with_cp - self.norm_eval = norm_eval - self.zero_init_residual = zero_init_residual - self.stage_blocks = stage_blocks[:num_stages] - - self._make_stem_layer(in_channels, stem_channels) - - _in_channels = stem_channels - self.res_layers = [] - for i, num_blocks in enumerate(self.stage_blocks): - stride = self.strides[i] - dilation = self.dilations[i] - group_width = self.group_widths[i] - width = int(round(self.stage_widths[i] * self.bottleneck_ratio[i])) - stage_groups = width // group_width - - res_layer = self.make_res_layer( - block=Bottleneck, - num_blocks=num_blocks, - in_channels=_in_channels, - 
out_channels=self.stage_widths[i], - expansion=1, - stride=stride, - dilation=dilation, - style=self.style, - avg_down=self.avg_down, - with_cp=self.with_cp, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - base_channels=self.stage_widths[i], - groups=stage_groups, - width_per_group=group_width) - _in_channels = self.stage_widths[i] - layer_name = f'layer{i + 1}' - self.add_module(layer_name, res_layer) - self.res_layers.append(layer_name) - - self._freeze_stages() - - self.feat_dim = stage_widths[-1] - - def _make_stem_layer(self, in_channels, base_channels): - self.conv1 = build_conv_layer( - self.conv_cfg, - in_channels, - base_channels, - kernel_size=3, - stride=2, - padding=1, - bias=False) - self.norm1_name, norm1 = build_norm_layer( - self.norm_cfg, base_channels, postfix=1) - self.add_module(self.norm1_name, norm1) - self.relu = nn.ReLU(inplace=True) - - @staticmethod - def generate_regnet(initial_width, - width_slope, - width_parameter, - depth, - divisor=8): - """Generates per block width from RegNet parameters. - - Args: - initial_width ([int]): Initial width of the backbone - width_slope ([float]): Slope of the quantized linear function - width_parameter ([int]): Parameter used to quantize the width. - depth ([int]): Depth of the backbone. - divisor (int, optional): The divisor of channels. Defaults to 8. - - Returns: - list, int: return a list of widths of each stage and the number of - stages - """ - assert width_slope >= 0 - assert initial_width > 0 - assert width_parameter > 1 - assert initial_width % divisor == 0 - widths_cont = np.arange(depth) * width_slope + initial_width - ks = np.round( - np.log(widths_cont / initial_width) / np.log(width_parameter)) - widths = initial_width * np.power(width_parameter, ks) - widths = np.round(np.divide(widths, divisor)) * divisor - num_stages = len(np.unique(widths)) - widths, widths_cont = widths.astype(int).tolist(), widths_cont.tolist() - return widths, num_stages - - @staticmethod - def quantize_float(number, divisor): - """Converts a float to the closest non-zero int divisible by divisor. - - Args: - number (int): Original number to be quantized. - divisor (int): Divisor used to quantize the number. - - Returns: - int: quantized number that is divisible by divisor. - """ - return int(round(number / divisor) * divisor) - - def adjust_width_group(self, widths, bottleneck_ratio, groups): - """Adjusts the compatibility of widths and groups. - - Args: - widths (list[int]): Width of each stage. - bottleneck_ratio (list[float]): Bottleneck ratio of each stage. - groups (list[int]): Number of groups in each stage. - - Returns: - tuple(list): The adjusted widths and groups of each stage. - """ - bottleneck_width = [ - int(w * b) for w, b in zip(widths, bottleneck_ratio) - ] - groups = [min(g, w_bot) for g, w_bot in zip(groups, bottleneck_width)] - bottleneck_width = [ - self.quantize_float(w_bot, g) - for w_bot, g in zip(bottleneck_width, groups) - ] - widths = [ - int(w_bot / b) - for w_bot, b in zip(bottleneck_width, bottleneck_ratio) - ] - return widths, groups - - def get_stages_from_blocks(self, widths): - """Gets widths/stage_blocks of network at each stage. - - Args: - widths (list[int]): Width in each stage.
- - Returns: - tuple(list): width and depth of each stage - """ - width_diff = [ - width != width_prev - for width, width_prev in zip(widths + [0], [0] + widths) - ] - stage_widths = [ - width for width, diff in zip(widths, width_diff[:-1]) if diff - ] - stage_blocks = np.diff([ - depth for depth, diff in zip(range(len(width_diff)), width_diff) - if diff - ]).tolist() - return stage_widths, stage_blocks - - def forward(self, x): - x = self.conv1(x) - x = self.norm1(x) - x = self.relu(x) - - outs = [] - for i, layer_name in enumerate(self.res_layers): - res_layer = getattr(self, layer_name) - x = res_layer(x) - if i in self.out_indices: - outs.append(x) - - if len(outs) == 1: - return outs[0] - return tuple(outs) diff --git a/main/transformer_utils/mmpose/models/backbones/resnest.py b/main/transformer_utils/mmpose/models/backbones/resnest.py deleted file mode 100644 index 0a2d4081df1417155f0626646f5fe3d0dbfc2864..0000000000000000000000000000000000000000 --- a/main/transformer_utils/mmpose/models/backbones/resnest.py +++ /dev/null @@ -1,338 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch -import torch.nn as nn -import torch.nn.functional as F -import torch.utils.checkpoint as cp -from mmcv.cnn import build_conv_layer, build_norm_layer - -from ..builder import BACKBONES -from .resnet import Bottleneck as _Bottleneck -from .resnet import ResLayer, ResNetV1d - - -class RSoftmax(nn.Module): - """Radix Softmax module in ``SplitAttentionConv2d``. - - Args: - radix (int): Radix of input. - groups (int): Groups of input. - """ - - def __init__(self, radix, groups): - super().__init__() - self.radix = radix - self.groups = groups - - def forward(self, x): - batch = x.size(0) - if self.radix > 1: - x = x.view(batch, self.groups, self.radix, -1).transpose(1, 2) - x = F.softmax(x, dim=1) - x = x.reshape(batch, -1) - else: - x = torch.sigmoid(x) - return x - - -class SplitAttentionConv2d(nn.Module): - """Split-Attention Conv2d. - - Args: - in_channels (int): Same as nn.Conv2d. - out_channels (int): Same as nn.Conv2d. - kernel_size (int | tuple[int]): Same as nn.Conv2d. - stride (int | tuple[int]): Same as nn.Conv2d. - padding (int | tuple[int]): Same as nn.Conv2d. - dilation (int | tuple[int]): Same as nn.Conv2d. - groups (int): Same as nn.Conv2d. - radix (int): Radix of SpltAtConv2d. Default: 2 - reduction_factor (int): Reduction factor of SplitAttentionConv2d. - Default: 4. - conv_cfg (dict): Config dict for convolution layer. Default: None, - which means using conv2d. - norm_cfg (dict): Config dict for normalization layer. Default: None. 
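The split-attention operator documented above (and implemented just below) splits the channel dimension into `radix` groups and re-weights them with a radix-wise softmax over globally pooled statistics. A stripped-down sketch of just that re-weighting follows, assuming the `fc1`/`fc2` bottleneck is elided, so the weights here degenerate to uniform; the real module derives them from the pooled `gap` tensor through those two 1x1 convs.

```python
import torch
import torch.nn.functional as F

def toy_split_attention(x, radix=2):
    """x: (B, radix*C, H, W) -> (B, C, H, W), re-weighting the radix splits."""
    B, RC, H, W = x.shape
    C = RC // radix
    splits = x.view(B, radix, C, H, W)
    gap = F.adaptive_avg_pool2d(splits.sum(dim=1), 1)      # (B, C, 1, 1)
    # the real module maps gap through fc1/norm/relu/fc2 here (elided)
    atten = gap.repeat(1, radix, 1, 1).view(B, radix, C, 1, 1)
    atten = F.softmax(atten, dim=1)                        # radix softmax
    return (atten * splits).sum(dim=1)

out = toy_split_attention(torch.randn(2, 128, 8, 8), radix=2)
```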
- """ - - def __init__(self, - in_channels, - channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - groups=1, - radix=2, - reduction_factor=4, - conv_cfg=None, - norm_cfg=dict(type='BN')): - super().__init__() - inter_channels = max(in_channels * radix // reduction_factor, 32) - self.radix = radix - self.groups = groups - self.channels = channels - self.conv = build_conv_layer( - conv_cfg, - in_channels, - channels * radix, - kernel_size, - stride=stride, - padding=padding, - dilation=dilation, - groups=groups * radix, - bias=False) - self.norm0_name, norm0 = build_norm_layer( - norm_cfg, channels * radix, postfix=0) - self.add_module(self.norm0_name, norm0) - self.relu = nn.ReLU(inplace=True) - self.fc1 = build_conv_layer( - None, channels, inter_channels, 1, groups=self.groups) - self.norm1_name, norm1 = build_norm_layer( - norm_cfg, inter_channels, postfix=1) - self.add_module(self.norm1_name, norm1) - self.fc2 = build_conv_layer( - None, inter_channels, channels * radix, 1, groups=self.groups) - self.rsoftmax = RSoftmax(radix, groups) - - @property - def norm0(self): - return getattr(self, self.norm0_name) - - @property - def norm1(self): - return getattr(self, self.norm1_name) - - def forward(self, x): - x = self.conv(x) - x = self.norm0(x) - x = self.relu(x) - - batch, rchannel = x.shape[:2] - if self.radix > 1: - splits = x.view(batch, self.radix, -1, *x.shape[2:]) - gap = splits.sum(dim=1) - else: - gap = x - gap = F.adaptive_avg_pool2d(gap, 1) - gap = self.fc1(gap) - - gap = self.norm1(gap) - gap = self.relu(gap) - - atten = self.fc2(gap) - atten = self.rsoftmax(atten).view(batch, -1, 1, 1) - - if self.radix > 1: - attens = atten.view(batch, self.radix, -1, *atten.shape[2:]) - out = torch.sum(attens * splits, dim=1) - else: - out = atten * x - return out.contiguous() - - -class Bottleneck(_Bottleneck): - """Bottleneck block for ResNeSt. - - Args: - in_channels (int): Input channels of this block. - out_channels (int): Output channels of this block. - groups (int): Groups of conv2. - width_per_group (int): Width per group of conv2. 64x4d indicates - ``groups=64, width_per_group=4`` and 32x8d indicates - ``groups=32, width_per_group=8``. - radix (int): Radix of SpltAtConv2d. Default: 2 - reduction_factor (int): Reduction factor of SplitAttentionConv2d. - Default: 4. - avg_down_stride (bool): Whether to use average pool for stride in - Bottleneck. Default: True. - stride (int): stride of the block. Default: 1 - dilation (int): dilation of convolution. Default: 1 - downsample (nn.Module): downsample operation on identity branch. - Default: None - style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two - layer is the 3x3 conv layer, otherwise the stride-two layer is - the first 1x1 conv layer. - conv_cfg (dict): dictionary to construct and config conv layer. - Default: None - norm_cfg (dict): dictionary to construct and config norm layer. - Default: dict(type='BN') - with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. 
- """ - - def __init__(self, - in_channels, - out_channels, - groups=1, - width_per_group=4, - base_channels=64, - radix=2, - reduction_factor=4, - avg_down_stride=True, - **kwargs): - super().__init__(in_channels, out_channels, **kwargs) - - self.groups = groups - self.width_per_group = width_per_group - - # For ResNet bottleneck, middle channels are determined by expansion - # and out_channels, but for ResNeXt bottleneck, it is determined by - # groups and width_per_group and the stage it is located in. - if groups != 1: - assert self.mid_channels % base_channels == 0 - self.mid_channels = ( - groups * width_per_group * self.mid_channels // base_channels) - - self.avg_down_stride = avg_down_stride and self.conv2_stride > 1 - - self.norm1_name, norm1 = build_norm_layer( - self.norm_cfg, self.mid_channels, postfix=1) - self.norm3_name, norm3 = build_norm_layer( - self.norm_cfg, self.out_channels, postfix=3) - - self.conv1 = build_conv_layer( - self.conv_cfg, - self.in_channels, - self.mid_channels, - kernel_size=1, - stride=self.conv1_stride, - bias=False) - self.add_module(self.norm1_name, norm1) - self.conv2 = SplitAttentionConv2d( - self.mid_channels, - self.mid_channels, - kernel_size=3, - stride=1 if self.avg_down_stride else self.conv2_stride, - padding=self.dilation, - dilation=self.dilation, - groups=groups, - radix=radix, - reduction_factor=reduction_factor, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg) - delattr(self, self.norm2_name) - - if self.avg_down_stride: - self.avd_layer = nn.AvgPool2d(3, self.conv2_stride, padding=1) - - self.conv3 = build_conv_layer( - self.conv_cfg, - self.mid_channels, - self.out_channels, - kernel_size=1, - bias=False) - self.add_module(self.norm3_name, norm3) - - def forward(self, x): - - def _inner_forward(x): - identity = x - - out = self.conv1(x) - out = self.norm1(out) - out = self.relu(out) - - out = self.conv2(out) - - if self.avg_down_stride: - out = self.avd_layer(out) - - out = self.conv3(out) - out = self.norm3(out) - - if self.downsample is not None: - identity = self.downsample(x) - - out += identity - - return out - - if self.with_cp and x.requires_grad: - out = cp.checkpoint(_inner_forward, x) - else: - out = _inner_forward(x) - - out = self.relu(out) - - return out - - -@BACKBONES.register_module() -class ResNeSt(ResNetV1d): - """ResNeSt backbone. - - Please refer to the `paper `__ - for details. - - Args: - depth (int): Network depth, from {50, 101, 152, 200}. - groups (int): Groups of conv2 in Bottleneck. Default: 32. - width_per_group (int): Width per group of conv2 in Bottleneck. - Default: 4. - radix (int): Radix of SpltAtConv2d. Default: 2 - reduction_factor (int): Reduction factor of SplitAttentionConv2d. - Default: 4. - avg_down_stride (bool): Whether to use average pool for stride in - Bottleneck. Default: True. - in_channels (int): Number of input image channels. Default: 3. - stem_channels (int): Output channels of the stem layer. Default: 64. - num_stages (int): Stages of the network. Default: 4. - strides (Sequence[int]): Strides of the first block of each stage. - Default: ``(1, 2, 2, 2)``. - dilations (Sequence[int]): Dilation of each stage. - Default: ``(1, 1, 1, 1)``. - out_indices (Sequence[int]): Output from which stages. If only one - stage is specified, a single tensor (feature map) is returned, - otherwise multiple stages are specified, a tuple of tensors will - be returned. Default: ``(3, )``. - style (str): `pytorch` or `caffe`. 
If set to "pytorch", the stride-two - layer is the 3x3 conv layer, otherwise the stride-two layer is - the first 1x1 conv layer. - deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv. - Default: False. - avg_down (bool): Use AvgPool instead of stride conv when - downsampling in the bottleneck. Default: False. - frozen_stages (int): Stages to be frozen (stop grad and set eval mode). - -1 means not freezing any parameters. Default: -1. - conv_cfg (dict | None): The config dict for conv layers. Default: None. - norm_cfg (dict): The config dict for norm layers. - norm_eval (bool): Whether to set norm layers to eval mode, namely, - freeze running stats (mean and var). Note: Effect on Batch Norm - and its variants only. Default: False. - with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. Default: False. - zero_init_residual (bool): Whether to use zero init for last norm layer - in resblocks to let them behave as identity. Default: True. - """ - - arch_settings = { - 50: (Bottleneck, (3, 4, 6, 3)), - 101: (Bottleneck, (3, 4, 23, 3)), - 152: (Bottleneck, (3, 8, 36, 3)), - 200: (Bottleneck, (3, 24, 36, 3)), - 269: (Bottleneck, (3, 30, 48, 8)) - } - - def __init__(self, - depth, - groups=1, - width_per_group=4, - radix=2, - reduction_factor=4, - avg_down_stride=True, - **kwargs): - self.groups = groups - self.width_per_group = width_per_group - self.radix = radix - self.reduction_factor = reduction_factor - self.avg_down_stride = avg_down_stride - super().__init__(depth=depth, **kwargs) - - def make_res_layer(self, **kwargs): - return ResLayer( - groups=self.groups, - width_per_group=self.width_per_group, - base_channels=self.base_channels, - radix=self.radix, - reduction_factor=self.reduction_factor, - avg_down_stride=self.avg_down_stride, - **kwargs) diff --git a/main/transformer_utils/mmpose/models/backbones/resnet.py b/main/transformer_utils/mmpose/models/backbones/resnet.py index 649496a755020140d94eb32fbe79d1ff135c86ca..376796046ba1634e3acdb3d26a3f33a3d8528522 100644 --- a/main/transformer_utils/mmpose/models/backbones/resnet.py +++ b/main/transformer_utils/mmpose/models/backbones/resnet.py @@ -3,9 +3,9 @@ import copy import torch.nn as nn import torch.utils.checkpoint as cp -from mmcv.cnn import (ConvModule, build_conv_layer, build_norm_layer, - constant_init, kaiming_init) -from mmcv.utils.parrots_wrapper import _BatchNorm +from mmengine.model import constant_init, kaiming_init +from mmcv.cnn import (ConvModule, build_conv_layer, build_norm_layer) +from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm from ..builder import BACKBONES from .base_backbone import BaseBackbone diff --git a/main/transformer_utils/mmpose/models/backbones/resnext.py b/main/transformer_utils/mmpose/models/backbones/resnext.py deleted file mode 100644 index c10dc33f98ac3229c77bf306acf19950c295f904..0000000000000000000000000000000000000000 --- a/main/transformer_utils/mmpose/models/backbones/resnext.py +++ /dev/null @@ -1,162 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from mmcv.cnn import build_conv_layer, build_norm_layer - -from ..builder import BACKBONES -from .resnet import Bottleneck as _Bottleneck -from .resnet import ResLayer, ResNet - - -class Bottleneck(_Bottleneck): - """Bottleneck block for ResNeXt. - - Args: - in_channels (int): Input channels of this block. - out_channels (int): Output channels of this block. - groups (int): Groups of conv2. - width_per_group (int): Width per group of conv2. 
64x4d indicates - ``groups=64, width_per_group=4`` and 32x8d indicates - ``groups=32, width_per_group=8``. - stride (int): stride of the block. Default: 1 - dilation (int): dilation of convolution. Default: 1 - downsample (nn.Module): downsample operation on identity branch. - Default: None - style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two - layer is the 3x3 conv layer, otherwise the stride-two layer is - the first 1x1 conv layer. - conv_cfg (dict): dictionary to construct and config conv layer. - Default: None - norm_cfg (dict): dictionary to construct and config norm layer. - Default: dict(type='BN') - with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. - """ - - def __init__(self, - in_channels, - out_channels, - base_channels=64, - groups=32, - width_per_group=4, - **kwargs): - super().__init__(in_channels, out_channels, **kwargs) - self.groups = groups - self.width_per_group = width_per_group - - # For ResNet bottleneck, middle channels are determined by expansion - # and out_channels, but for ResNeXt bottleneck, it is determined by - # groups and width_per_group and the stage it is located in. - if groups != 1: - assert self.mid_channels % base_channels == 0 - self.mid_channels = ( - groups * width_per_group * self.mid_channels // base_channels) - - self.norm1_name, norm1 = build_norm_layer( - self.norm_cfg, self.mid_channels, postfix=1) - self.norm2_name, norm2 = build_norm_layer( - self.norm_cfg, self.mid_channels, postfix=2) - self.norm3_name, norm3 = build_norm_layer( - self.norm_cfg, self.out_channels, postfix=3) - - self.conv1 = build_conv_layer( - self.conv_cfg, - self.in_channels, - self.mid_channels, - kernel_size=1, - stride=self.conv1_stride, - bias=False) - self.add_module(self.norm1_name, norm1) - self.conv2 = build_conv_layer( - self.conv_cfg, - self.mid_channels, - self.mid_channels, - kernel_size=3, - stride=self.conv2_stride, - padding=self.dilation, - dilation=self.dilation, - groups=groups, - bias=False) - - self.add_module(self.norm2_name, norm2) - self.conv3 = build_conv_layer( - self.conv_cfg, - self.mid_channels, - self.out_channels, - kernel_size=1, - bias=False) - self.add_module(self.norm3_name, norm3) - - -@BACKBONES.register_module() -class ResNeXt(ResNet): - """ResNeXt backbone. - - Please refer to the `paper `__ for - details. - - Args: - depth (int): Network depth, from {50, 101, 152}. - groups (int): Groups of conv2 in Bottleneck. Default: 32. - width_per_group (int): Width per group of conv2 in Bottleneck. - Default: 4. - in_channels (int): Number of input image channels. Default: 3. - stem_channels (int): Output channels of the stem layer. Default: 64. - num_stages (int): Stages of the network. Default: 4. - strides (Sequence[int]): Strides of the first block of each stage. - Default: ``(1, 2, 2, 2)``. - dilations (Sequence[int]): Dilation of each stage. - Default: ``(1, 1, 1, 1)``. - out_indices (Sequence[int]): Output from which stages. If only one - stage is specified, a single tensor (feature map) is returned, - otherwise multiple stages are specified, a tuple of tensors will - be returned. Default: ``(3, )``. - style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two - layer is the 3x3 conv layer, otherwise the stride-two layer is - the first 1x1 conv layer. - deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv. - Default: False. - avg_down (bool): Use AvgPool instead of stride conv when - downsampling in the bottleneck. 
Default: False. - frozen_stages (int): Stages to be frozen (stop grad and set eval mode). - -1 means not freezing any parameters. Default: -1. - conv_cfg (dict | None): The config dict for conv layers. Default: None. - norm_cfg (dict): The config dict for norm layers. - norm_eval (bool): Whether to set norm layers to eval mode, namely, - freeze running stats (mean and var). Note: Effect on Batch Norm - and its variants only. Default: False. - with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. Default: False. - zero_init_residual (bool): Whether to use zero init for last norm layer - in resblocks to let them behave as identity. Default: True. - - Example: - >>> from mmpose.models import ResNeXt - >>> import torch - >>> self = ResNeXt(depth=50, out_indices=(0, 1, 2, 3)) - >>> self.eval() - >>> inputs = torch.rand(1, 3, 32, 32) - >>> level_outputs = self.forward(inputs) - >>> for level_out in level_outputs: - ... print(tuple(level_out.shape)) - (1, 256, 8, 8) - (1, 512, 4, 4) - (1, 1024, 2, 2) - (1, 2048, 1, 1) - """ - - arch_settings = { - 50: (Bottleneck, (3, 4, 6, 3)), - 101: (Bottleneck, (3, 4, 23, 3)), - 152: (Bottleneck, (3, 8, 36, 3)) - } - - def __init__(self, depth, groups=32, width_per_group=4, **kwargs): - self.groups = groups - self.width_per_group = width_per_group - super().__init__(depth, **kwargs) - - def make_res_layer(self, **kwargs): - return ResLayer( - groups=self.groups, - width_per_group=self.width_per_group, - base_channels=self.base_channels, - **kwargs) diff --git a/main/transformer_utils/mmpose/models/backbones/rsn.py b/main/transformer_utils/mmpose/models/backbones/rsn.py deleted file mode 100644 index 29038afe2a77dcb3d3b027b1549d478916a50727..0000000000000000000000000000000000000000 --- a/main/transformer_utils/mmpose/models/backbones/rsn.py +++ /dev/null @@ -1,616 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import copy as cp - -import torch -import torch.nn as nn -import torch.nn.functional as F -from mmcv.cnn import (ConvModule, MaxPool2d, constant_init, kaiming_init, - normal_init) - -from ..builder import BACKBONES -from .base_backbone import BaseBackbone - - -class RSB(nn.Module): - """Residual Steps block for RSN. Paper ref: Cai et al. "Learning Delicate - Local Representations for Multi-Person Pose Estimation" (ECCV 2020). - - Args: - in_channels (int): Input channels of this block. - out_channels (int): Output channels of this block. - num_steps (int): Numbers of steps in RSB - stride (int): stride of the block. Default: 1 - downsample (nn.Module): downsample operation on identity branch. - Default: None. - norm_cfg (dict): dictionary to construct and config norm layer. - Default: dict(type='BN') - expand_times (int): Times by which the in_channels are expanded. - Default:26. - res_top_channels (int): Number of channels of feature output by - ResNet_top. Default:64. 
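# A quick check of the RSB branch-width arithmetic from the definition
# below, using the documented defaults (in_channels=64, expand_times=26,
# res_top_channels=64):
branch_channels = 64 * 26 // 64
print(branch_channels)  # 26: each of the num_steps branches is 26 channels wide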
- """ - - expansion = 1 - - def __init__(self, - in_channels, - out_channels, - num_steps=4, - stride=1, - downsample=None, - with_cp=False, - norm_cfg=dict(type='BN'), - expand_times=26, - res_top_channels=64): - # Protect mutable default arguments - norm_cfg = cp.deepcopy(norm_cfg) - super().__init__() - assert num_steps > 1 - self.in_channels = in_channels - self.branch_channels = self.in_channels * expand_times - self.branch_channels //= res_top_channels - self.out_channels = out_channels - self.stride = stride - self.downsample = downsample - self.with_cp = with_cp - self.norm_cfg = norm_cfg - self.num_steps = num_steps - self.conv_bn_relu1 = ConvModule( - self.in_channels, - self.num_steps * self.branch_channels, - kernel_size=1, - stride=self.stride, - padding=0, - norm_cfg=self.norm_cfg, - inplace=False) - for i in range(self.num_steps): - for j in range(i + 1): - module_name = f'conv_bn_relu2_{i + 1}_{j + 1}' - self.add_module( - module_name, - ConvModule( - self.branch_channels, - self.branch_channels, - kernel_size=3, - stride=1, - padding=1, - norm_cfg=self.norm_cfg, - inplace=False)) - self.conv_bn3 = ConvModule( - self.num_steps * self.branch_channels, - self.out_channels * self.expansion, - kernel_size=1, - stride=1, - padding=0, - act_cfg=None, - norm_cfg=self.norm_cfg, - inplace=False) - self.relu = nn.ReLU(inplace=False) - - def forward(self, x): - """Forward function.""" - - identity = x - x = self.conv_bn_relu1(x) - spx = torch.split(x, self.branch_channels, 1) - outputs = list() - outs = list() - for i in range(self.num_steps): - outputs_i = list() - outputs.append(outputs_i) - for j in range(i + 1): - if j == 0: - inputs = spx[i] - else: - inputs = outputs[i][j - 1] - if i > j: - inputs = inputs + outputs[i - 1][j] - module_name = f'conv_bn_relu2_{i + 1}_{j + 1}' - module_i_j = getattr(self, module_name) - outputs[i].append(module_i_j(inputs)) - - outs.append(outputs[i][i]) - out = torch.cat(tuple(outs), 1) - out = self.conv_bn3(out) - - if self.downsample is not None: - identity = self.downsample(identity) - out = out + identity - - out = self.relu(out) - - return out - - -class Downsample_module(nn.Module): - """Downsample module for RSN. - - Args: - block (nn.Module): Downsample block. - num_blocks (list): Number of blocks in each downsample unit. - num_units (int): Numbers of downsample units. Default: 4 - has_skip (bool): Have skip connections from prior upsample - module or not. Default:False - num_steps (int): Number of steps in a block. Default:4 - norm_cfg (dict): dictionary to construct and config norm layer. - Default: dict(type='BN') - in_channels (int): Number of channels of the input feature to - downsample module. Default: 64 - expand_times (int): Times by which the in_channels are expanded. - Default:26. 
- """ - - def __init__(self, - block, - num_blocks, - num_steps=4, - num_units=4, - has_skip=False, - norm_cfg=dict(type='BN'), - in_channels=64, - expand_times=26): - # Protect mutable default arguments - norm_cfg = cp.deepcopy(norm_cfg) - super().__init__() - self.has_skip = has_skip - self.in_channels = in_channels - assert len(num_blocks) == num_units - self.num_blocks = num_blocks - self.num_units = num_units - self.num_steps = num_steps - self.norm_cfg = norm_cfg - self.layer1 = self._make_layer( - block, - in_channels, - num_blocks[0], - expand_times=expand_times, - res_top_channels=in_channels) - for i in range(1, num_units): - module_name = f'layer{i + 1}' - self.add_module( - module_name, - self._make_layer( - block, - in_channels * pow(2, i), - num_blocks[i], - stride=2, - expand_times=expand_times, - res_top_channels=in_channels)) - - def _make_layer(self, - block, - out_channels, - blocks, - stride=1, - expand_times=26, - res_top_channels=64): - downsample = None - if stride != 1 or self.in_channels != out_channels * block.expansion: - downsample = ConvModule( - self.in_channels, - out_channels * block.expansion, - kernel_size=1, - stride=stride, - padding=0, - norm_cfg=self.norm_cfg, - act_cfg=None, - inplace=True) - - units = list() - units.append( - block( - self.in_channels, - out_channels, - num_steps=self.num_steps, - stride=stride, - downsample=downsample, - norm_cfg=self.norm_cfg, - expand_times=expand_times, - res_top_channels=res_top_channels)) - self.in_channels = out_channels * block.expansion - for _ in range(1, blocks): - units.append( - block( - self.in_channels, - out_channels, - num_steps=self.num_steps, - expand_times=expand_times, - res_top_channels=res_top_channels)) - - return nn.Sequential(*units) - - def forward(self, x, skip1, skip2): - out = list() - for i in range(self.num_units): - module_name = f'layer{i + 1}' - module_i = getattr(self, module_name) - x = module_i(x) - if self.has_skip: - x = x + skip1[i] + skip2[i] - out.append(x) - out.reverse() - - return tuple(out) - - -class Upsample_unit(nn.Module): - """Upsample unit for upsample module. - - Args: - ind (int): Indicates whether to interpolate (>0) and whether to - generate feature map for the next hourglass-like module. - num_units (int): Number of units that form a upsample module. Along - with ind and gen_cross_conv, nm_units is used to decide whether - to generate feature map for the next hourglass-like module. - in_channels (int): Channel number of the skip-in feature maps from - the corresponding downsample unit. - unit_channels (int): Channel number in this unit. Default:256. - gen_skip: (bool): Whether or not to generate skips for the posterior - downsample module. Default:False - gen_cross_conv (bool): Whether to generate feature map for the next - hourglass-like module. Default:False - norm_cfg (dict): dictionary to construct and config norm layer. - Default: dict(type='BN') - out_channels (in): Number of channels of feature output by upsample - module. Must equal to in_channels of downsample module. 
Default:64 - """ - - def __init__(self, - ind, - num_units, - in_channels, - unit_channels=256, - gen_skip=False, - gen_cross_conv=False, - norm_cfg=dict(type='BN'), - out_channels=64): - # Protect mutable default arguments - norm_cfg = cp.deepcopy(norm_cfg) - super().__init__() - self.num_units = num_units - self.norm_cfg = norm_cfg - self.in_skip = ConvModule( - in_channels, - unit_channels, - kernel_size=1, - stride=1, - padding=0, - norm_cfg=self.norm_cfg, - act_cfg=None, - inplace=True) - self.relu = nn.ReLU(inplace=True) - - self.ind = ind - if self.ind > 0: - self.up_conv = ConvModule( - unit_channels, - unit_channels, - kernel_size=1, - stride=1, - padding=0, - norm_cfg=self.norm_cfg, - act_cfg=None, - inplace=True) - - self.gen_skip = gen_skip - if self.gen_skip: - self.out_skip1 = ConvModule( - in_channels, - in_channels, - kernel_size=1, - stride=1, - padding=0, - norm_cfg=self.norm_cfg, - inplace=True) - - self.out_skip2 = ConvModule( - unit_channels, - in_channels, - kernel_size=1, - stride=1, - padding=0, - norm_cfg=self.norm_cfg, - inplace=True) - - self.gen_cross_conv = gen_cross_conv - if self.ind == num_units - 1 and self.gen_cross_conv: - self.cross_conv = ConvModule( - unit_channels, - out_channels, - kernel_size=1, - stride=1, - padding=0, - norm_cfg=self.norm_cfg, - inplace=True) - - def forward(self, x, up_x): - out = self.in_skip(x) - - if self.ind > 0: - up_x = F.interpolate( - up_x, - size=(x.size(2), x.size(3)), - mode='bilinear', - align_corners=True) - up_x = self.up_conv(up_x) - out = out + up_x - out = self.relu(out) - - skip1 = None - skip2 = None - if self.gen_skip: - skip1 = self.out_skip1(x) - skip2 = self.out_skip2(out) - - cross_conv = None - if self.ind == self.num_units - 1 and self.gen_cross_conv: - cross_conv = self.cross_conv(out) - - return out, skip1, skip2, cross_conv - - -class Upsample_module(nn.Module): - """Upsample module for RSN. - - Args: - unit_channels (int): Channel number in the upsample units. - Default:256. - num_units (int): Numbers of upsample units. Default: 4 - gen_skip (bool): Whether to generate skip for posterior downsample - module or not. Default:False - gen_cross_conv (bool): Whether to generate feature map for the next - hourglass-like module. Default:False - norm_cfg (dict): dictionary to construct and config norm layer. - Default: dict(type='BN') - out_channels (int): Number of channels of feature output by upsample - module. Must equal to in_channels of downsample module. 
Default:64 - """ - - def __init__(self, - unit_channels=256, - num_units=4, - gen_skip=False, - gen_cross_conv=False, - norm_cfg=dict(type='BN'), - out_channels=64): - # Protect mutable default arguments - norm_cfg = cp.deepcopy(norm_cfg) - super().__init__() - self.in_channels = list() - for i in range(num_units): - self.in_channels.append(RSB.expansion * out_channels * pow(2, i)) - self.in_channels.reverse() - self.num_units = num_units - self.gen_skip = gen_skip - self.gen_cross_conv = gen_cross_conv - self.norm_cfg = norm_cfg - for i in range(num_units): - module_name = f'up{i + 1}' - self.add_module( - module_name, - Upsample_unit( - i, - self.num_units, - self.in_channels[i], - unit_channels, - self.gen_skip, - self.gen_cross_conv, - norm_cfg=self.norm_cfg, - out_channels=64)) - - def forward(self, x): - out = list() - skip1 = list() - skip2 = list() - cross_conv = None - for i in range(self.num_units): - module_i = getattr(self, f'up{i + 1}') - if i == 0: - outi, skip1_i, skip2_i, _ = module_i(x[i], None) - elif i == self.num_units - 1: - outi, skip1_i, skip2_i, cross_conv = module_i(x[i], out[i - 1]) - else: - outi, skip1_i, skip2_i, _ = module_i(x[i], out[i - 1]) - out.append(outi) - skip1.append(skip1_i) - skip2.append(skip2_i) - skip1.reverse() - skip2.reverse() - - return out, skip1, skip2, cross_conv - - -class Single_stage_RSN(nn.Module): - """Single_stage Residual Steps Network. - - Args: - unit_channels (int): Channel number in the upsample units. Default:256. - num_units (int): Numbers of downsample/upsample units. Default: 4 - gen_skip (bool): Whether to generate skip for posterior downsample - module or not. Default:False - gen_cross_conv (bool): Whether to generate feature map for the next - hourglass-like module. Default:False - has_skip (bool): Have skip connections from prior upsample - module or not. Default:False - num_steps (int): Number of steps in RSB. Default: 4 - num_blocks (list): Number of blocks in each downsample unit. - Default: [2, 2, 2, 2] Note: Make sure num_units==len(num_blocks) - norm_cfg (dict): dictionary to construct and config norm layer. - Default: dict(type='BN') - in_channels (int): Number of channels of the feature from ResNet_Top. - Default: 64. - expand_times (int): Times by which the in_channels are expanded in RSB. - Default:26. - """ - - def __init__(self, - has_skip=False, - gen_skip=False, - gen_cross_conv=False, - unit_channels=256, - num_units=4, - num_steps=4, - num_blocks=[2, 2, 2, 2], - norm_cfg=dict(type='BN'), - in_channels=64, - expand_times=26): - # Protect mutable default arguments - norm_cfg = cp.deepcopy(norm_cfg) - num_blocks = cp.deepcopy(num_blocks) - super().__init__() - assert len(num_blocks) == num_units - self.has_skip = has_skip - self.gen_skip = gen_skip - self.gen_cross_conv = gen_cross_conv - self.num_units = num_units - self.num_steps = num_steps - self.unit_channels = unit_channels - self.num_blocks = num_blocks - self.norm_cfg = norm_cfg - - self.downsample = Downsample_module(RSB, num_blocks, num_steps, - num_units, has_skip, norm_cfg, - in_channels, expand_times) - self.upsample = Upsample_module(unit_channels, num_units, gen_skip, - gen_cross_conv, norm_cfg, in_channels) - - def forward(self, x, skip1, skip2): - mid = self.downsample(x, skip1, skip2) - out, skip1, skip2, cross_conv = self.upsample(mid) - - return out, skip1, skip2, cross_conv - - -class ResNet_top(nn.Module): - """ResNet top for RSN. - - Args: - norm_cfg (dict): dictionary to construct and config norm layer. 
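# The Upsample_module above consumes the downsample pyramid deepest-first;
# with RSB.expansion == 1, out_channels == 64 and num_units == 4, the
# skip-in widths computed in its __init__ come out as:
widths = [1 * 64 * 2 ** i for i in range(4)]
widths.reverse()
print(widths)  # [512, 256, 128, 64]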
- Default: dict(type='BN') - channels (int): Number of channels of the feature output by ResNet_top. - """ - - def __init__(self, norm_cfg=dict(type='BN'), channels=64): - # Protect mutable default arguments - norm_cfg = cp.deepcopy(norm_cfg) - super().__init__() - self.top = nn.Sequential( - ConvModule( - 3, - channels, - kernel_size=7, - stride=2, - padding=3, - norm_cfg=norm_cfg, - inplace=True), MaxPool2d(kernel_size=3, stride=2, padding=1)) - - def forward(self, img): - return self.top(img) - - -@BACKBONES.register_module() -class RSN(BaseBackbone): - """Residual Steps Network backbone. Paper ref: Cai et al. "Learning - Delicate Local Representations for Multi-Person Pose Estimation" (ECCV - 2020). - - Args: - unit_channels (int): Number of Channels in an upsample unit. - Default: 256 - num_stages (int): Number of stages in a multi-stage RSN. Default: 4 - num_units (int): NUmber of downsample/upsample units in a single-stage - RSN. Default: 4 Note: Make sure num_units == len(self.num_blocks) - num_blocks (list): Number of RSBs (Residual Steps Block) in each - downsample unit. Default: [2, 2, 2, 2] - num_steps (int): Number of steps in a RSB. Default:4 - norm_cfg (dict): dictionary to construct and config norm layer. - Default: dict(type='BN') - res_top_channels (int): Number of channels of feature from ResNet_top. - Default: 64. - expand_times (int): Times by which the in_channels are expanded in RSB. - Default:26. - Example: - >>> from mmpose.models import RSN - >>> import torch - >>> self = RSN(num_stages=2,num_units=2,num_blocks=[2,2]) - >>> self.eval() - >>> inputs = torch.rand(1, 3, 511, 511) - >>> level_outputs = self.forward(inputs) - >>> for level_output in level_outputs: - ... for feature in level_output: - ... print(tuple(feature.shape)) - ... 
- (1, 256, 64, 64) - (1, 256, 128, 128) - (1, 256, 64, 64) - (1, 256, 128, 128) - """ - - def __init__(self, - unit_channels=256, - num_stages=4, - num_units=4, - num_blocks=[2, 2, 2, 2], - num_steps=4, - norm_cfg=dict(type='BN'), - res_top_channels=64, - expand_times=26): - # Protect mutable default arguments - norm_cfg = cp.deepcopy(norm_cfg) - num_blocks = cp.deepcopy(num_blocks) - super().__init__() - self.unit_channels = unit_channels - self.num_stages = num_stages - self.num_units = num_units - self.num_blocks = num_blocks - self.num_steps = num_steps - self.norm_cfg = norm_cfg - - assert self.num_stages > 0 - assert self.num_steps > 1 - assert self.num_units > 1 - assert self.num_units == len(self.num_blocks) - self.top = ResNet_top(norm_cfg=norm_cfg) - self.multi_stage_rsn = nn.ModuleList([]) - for i in range(self.num_stages): - if i == 0: - has_skip = False - else: - has_skip = True - if i != self.num_stages - 1: - gen_skip = True - gen_cross_conv = True - else: - gen_skip = False - gen_cross_conv = False - self.multi_stage_rsn.append( - Single_stage_RSN(has_skip, gen_skip, gen_cross_conv, - unit_channels, num_units, num_steps, - num_blocks, norm_cfg, res_top_channels, - expand_times)) - - def forward(self, x): - """Model forward function.""" - out_feats = [] - skip1 = None - skip2 = None - x = self.top(x) - for i in range(self.num_stages): - out, skip1, skip2, x = self.multi_stage_rsn[i](x, skip1, skip2) - out_feats.append(out) - - return out_feats - - def init_weights(self, pretrained=None): - """Initialize model weights.""" - for m in self.multi_stage_rsn.modules(): - if isinstance(m, nn.Conv2d): - kaiming_init(m) - elif isinstance(m, nn.BatchNorm2d): - constant_init(m, 1) - elif isinstance(m, nn.Linear): - normal_init(m, std=0.01) - - for m in self.top.modules(): - if isinstance(m, nn.Conv2d): - kaiming_init(m) diff --git a/main/transformer_utils/mmpose/models/backbones/scnet.py b/main/transformer_utils/mmpose/models/backbones/scnet.py deleted file mode 100644 index 3786c5731d685638cfa64a83e5d4a5e2eee545de..0000000000000000000000000000000000000000 --- a/main/transformer_utils/mmpose/models/backbones/scnet.py +++ /dev/null @@ -1,248 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import copy - -import torch -import torch.nn as nn -import torch.nn.functional as F -import torch.utils.checkpoint as cp -from mmcv.cnn import build_conv_layer, build_norm_layer - -from ..builder import BACKBONES -from .resnet import Bottleneck, ResNet - - -class SCConv(nn.Module): - """SCConv (Self-calibrated Convolution) - - Args: - in_channels (int): The input channels of the SCConv. - out_channels (int): The output channel of the SCConv. - stride (int): stride of SCConv. - pooling_r (int): size of pooling for scconv. - conv_cfg (dict): dictionary to construct and config conv layer. - Default: None - norm_cfg (dict): dictionary to construct and config norm layer. 
- Default: dict(type='BN') - """ - - def __init__(self, - in_channels, - out_channels, - stride, - pooling_r, - conv_cfg=None, - norm_cfg=dict(type='BN', momentum=0.1)): - # Protect mutable default arguments - norm_cfg = copy.deepcopy(norm_cfg) - super().__init__() - - assert in_channels == out_channels - - self.k2 = nn.Sequential( - nn.AvgPool2d(kernel_size=pooling_r, stride=pooling_r), - build_conv_layer( - conv_cfg, - in_channels, - in_channels, - kernel_size=3, - stride=1, - padding=1, - bias=False), - build_norm_layer(norm_cfg, in_channels)[1], - ) - self.k3 = nn.Sequential( - build_conv_layer( - conv_cfg, - in_channels, - in_channels, - kernel_size=3, - stride=1, - padding=1, - bias=False), - build_norm_layer(norm_cfg, in_channels)[1], - ) - self.k4 = nn.Sequential( - build_conv_layer( - conv_cfg, - in_channels, - in_channels, - kernel_size=3, - stride=stride, - padding=1, - bias=False), - build_norm_layer(norm_cfg, out_channels)[1], - nn.ReLU(inplace=True), - ) - - def forward(self, x): - """Forward function.""" - identity = x - - out = torch.sigmoid( - torch.add(identity, F.interpolate(self.k2(x), - identity.size()[2:]))) - out = torch.mul(self.k3(x), out) - out = self.k4(out) - - return out - - -class SCBottleneck(Bottleneck): - """SC(Self-calibrated) Bottleneck. - - Args: - in_channels (int): The input channels of the SCBottleneck block. - out_channels (int): The output channel of the SCBottleneck block. - """ - - pooling_r = 4 - - def __init__(self, in_channels, out_channels, **kwargs): - super().__init__(in_channels, out_channels, **kwargs) - self.mid_channels = out_channels // self.expansion // 2 - - self.norm1_name, norm1 = build_norm_layer( - self.norm_cfg, self.mid_channels, postfix=1) - self.norm2_name, norm2 = build_norm_layer( - self.norm_cfg, self.mid_channels, postfix=2) - self.norm3_name, norm3 = build_norm_layer( - self.norm_cfg, out_channels, postfix=3) - - self.conv1 = build_conv_layer( - self.conv_cfg, - in_channels, - self.mid_channels, - kernel_size=1, - stride=1, - bias=False) - self.add_module(self.norm1_name, norm1) - - self.k1 = nn.Sequential( - build_conv_layer( - self.conv_cfg, - self.mid_channels, - self.mid_channels, - kernel_size=3, - stride=self.stride, - padding=1, - bias=False), - build_norm_layer(self.norm_cfg, self.mid_channels)[1], - nn.ReLU(inplace=True)) - - self.conv2 = build_conv_layer( - self.conv_cfg, - in_channels, - self.mid_channels, - kernel_size=1, - stride=1, - bias=False) - self.add_module(self.norm2_name, norm2) - - self.scconv = SCConv(self.mid_channels, self.mid_channels, self.stride, - self.pooling_r, self.conv_cfg, self.norm_cfg) - - self.conv3 = build_conv_layer( - self.conv_cfg, - self.mid_channels * 2, - out_channels, - kernel_size=1, - stride=1, - bias=False) - self.add_module(self.norm3_name, norm3) - - def forward(self, x): - """Forward function.""" - - def _inner_forward(x): - identity = x - - out_a = self.conv1(x) - out_a = self.norm1(out_a) - out_a = self.relu(out_a) - - out_a = self.k1(out_a) - - out_b = self.conv2(x) - out_b = self.norm2(out_b) - out_b = self.relu(out_b) - - out_b = self.scconv(out_b) - - out = self.conv3(torch.cat([out_a, out_b], dim=1)) - out = self.norm3(out) - - if self.downsample is not None: - identity = self.downsample(x) - - out += identity - - return out - - if self.with_cp and x.requires_grad: - out = cp.checkpoint(_inner_forward, x) - else: - out = _inner_forward(x) - - out = self.relu(out) - - return out - - -@BACKBONES.register_module() -class SCNet(ResNet): - """SCNet backbone. 
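# The core of the SCConv defined above, written out functionally: a pooled,
# low-resolution context branch gates the full-resolution branch. A sketch
# under the module's own branch names (k2, k3; the trailing k4 conv and the
# norm layers are omitted for brevity):
import torch
import torch.nn as nn
import torch.nn.functional as F

def self_calibrate(x, k2, k3, pooling_r=4):
    ctx = F.interpolate(k2(F.avg_pool2d(x, pooling_r)), size=x.shape[2:])
    gate = torch.sigmoid(x + ctx)  # self-calibration weights in (0, 1)
    return k3(x) * gate            # modulate the full-resolution features

# e.g. with identity stand-ins for the conv branches:
# self_calibrate(torch.rand(1, 8, 16, 16), nn.Identity(), nn.Identity())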
- - Improving Convolutional Networks with Self-Calibrated Convolutions, - Jiang-Jiang Liu, Qibin Hou, Ming-Ming Cheng, Changhu Wang, Jiashi Feng, - IEEE CVPR, 2020. - http://mftp.mmcheng.net/Papers/20cvprSCNet.pdf - - Args: - depth (int): Depth of scnet, from {50, 101}. - in_channels (int): Number of input image channels. Normally 3. - base_channels (int): Number of base channels of hidden layer. - num_stages (int): SCNet stages, normally 4. - strides (Sequence[int]): Strides of the first block of each stage. - dilations (Sequence[int]): Dilation of each stage. - out_indices (Sequence[int]): Output from which stages. - style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two - layer is the 3x3 conv layer, otherwise the stride-two layer is - the first 1x1 conv layer. - deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv - avg_down (bool): Use AvgPool instead of stride conv when - downsampling in the bottleneck. - frozen_stages (int): Stages to be frozen (stop grad and set eval mode). - -1 means not freezing any parameters. - norm_cfg (dict): Dictionary to construct and config norm layer. - norm_eval (bool): Whether to set norm layers to eval mode, namely, - freeze running stats (mean and var). Note: Effect on Batch Norm - and its variants only. - with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. - zero_init_residual (bool): Whether to use zero init for last norm layer - in resblocks to let them behave as identity. - - Example: - >>> from mmpose.models import SCNet - >>> import torch - >>> self = SCNet(depth=50, out_indices=(0, 1, 2, 3)) - >>> self.eval() - >>> inputs = torch.rand(1, 3, 224, 224) - >>> level_outputs = self.forward(inputs) - >>> for level_out in level_outputs: - ... print(tuple(level_out.shape)) - (1, 256, 56, 56) - (1, 512, 28, 28) - (1, 1024, 14, 14) - (1, 2048, 7, 7) - """ - - arch_settings = { - 50: (SCBottleneck, [3, 4, 6, 3]), - 101: (SCBottleneck, [3, 4, 23, 3]) - } - - def __init__(self, depth, **kwargs): - if depth not in self.arch_settings: - raise KeyError(f'invalid depth {depth} for SCNet') - super().__init__(depth, **kwargs) diff --git a/main/transformer_utils/mmpose/models/backbones/seresnet.py b/main/transformer_utils/mmpose/models/backbones/seresnet.py deleted file mode 100644 index ac2d53b40a4593bce96d5c7c3bb4e06d38353d0b..0000000000000000000000000000000000000000 --- a/main/transformer_utils/mmpose/models/backbones/seresnet.py +++ /dev/null @@ -1,125 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch.utils.checkpoint as cp - -from ..builder import BACKBONES -from .resnet import Bottleneck, ResLayer, ResNet -from .utils.se_layer import SELayer - - -class SEBottleneck(Bottleneck): - """SEBottleneck block for SEResNet. - - Args: - in_channels (int): The input channels of the SEBottleneck block. - out_channels (int): The output channel of the SEBottleneck block. - se_ratio (int): Squeeze ratio in SELayer. 
Default: 16 - """ - - def __init__(self, in_channels, out_channels, se_ratio=16, **kwargs): - super().__init__(in_channels, out_channels, **kwargs) - self.se_layer = SELayer(out_channels, ratio=se_ratio) - - def forward(self, x): - - def _inner_forward(x): - identity = x - - out = self.conv1(x) - out = self.norm1(out) - out = self.relu(out) - - out = self.conv2(out) - out = self.norm2(out) - out = self.relu(out) - - out = self.conv3(out) - out = self.norm3(out) - - out = self.se_layer(out) - - if self.downsample is not None: - identity = self.downsample(x) - - out += identity - - return out - - if self.with_cp and x.requires_grad: - out = cp.checkpoint(_inner_forward, x) - else: - out = _inner_forward(x) - - out = self.relu(out) - - return out - - -@BACKBONES.register_module() -class SEResNet(ResNet): - """SEResNet backbone. - - Please refer to the `paper `__ for - details. - - Args: - depth (int): Network depth, from {50, 101, 152}. - se_ratio (int): Squeeze ratio in SELayer. Default: 16. - in_channels (int): Number of input image channels. Default: 3. - stem_channels (int): Output channels of the stem layer. Default: 64. - num_stages (int): Stages of the network. Default: 4. - strides (Sequence[int]): Strides of the first block of each stage. - Default: ``(1, 2, 2, 2)``. - dilations (Sequence[int]): Dilation of each stage. - Default: ``(1, 1, 1, 1)``. - out_indices (Sequence[int]): Output from which stages. If only one - stage is specified, a single tensor (feature map) is returned, - otherwise multiple stages are specified, a tuple of tensors will - be returned. Default: ``(3, )``. - style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two - layer is the 3x3 conv layer, otherwise the stride-two layer is - the first 1x1 conv layer. - deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv. - Default: False. - avg_down (bool): Use AvgPool instead of stride conv when - downsampling in the bottleneck. Default: False. - frozen_stages (int): Stages to be frozen (stop grad and set eval mode). - -1 means not freezing any parameters. Default: -1. - conv_cfg (dict | None): The config dict for conv layers. Default: None. - norm_cfg (dict): The config dict for norm layers. - norm_eval (bool): Whether to set norm layers to eval mode, namely, - freeze running stats (mean and var). Note: Effect on Batch Norm - and its variants only. Default: False. - with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. Default: False. - zero_init_residual (bool): Whether to use zero init for last norm layer - in resblocks to let them behave as identity. Default: True. - - Example: - >>> from mmpose.models import SEResNet - >>> import torch - >>> self = SEResNet(depth=50, out_indices=(0, 1, 2, 3)) - >>> self.eval() - >>> inputs = torch.rand(1, 3, 224, 224) - >>> level_outputs = self.forward(inputs) - >>> for level_out in level_outputs: - ... 
print(tuple(level_out.shape)) - (1, 256, 56, 56) - (1, 512, 28, 28) - (1, 1024, 14, 14) - (1, 2048, 7, 7) - """ - - arch_settings = { - 50: (SEBottleneck, (3, 4, 6, 3)), - 101: (SEBottleneck, (3, 4, 23, 3)), - 152: (SEBottleneck, (3, 8, 36, 3)) - } - - def __init__(self, depth, se_ratio=16, **kwargs): - if depth not in self.arch_settings: - raise KeyError(f'invalid depth {depth} for SEResNet') - self.se_ratio = se_ratio - super().__init__(depth, **kwargs) - - def make_res_layer(self, **kwargs): - return ResLayer(se_ratio=self.se_ratio, **kwargs) diff --git a/main/transformer_utils/mmpose/models/backbones/seresnext.py b/main/transformer_utils/mmpose/models/backbones/seresnext.py deleted file mode 100644 index c5c4e4ce03684f8a9bd0c6166969c01bace54bd2..0000000000000000000000000000000000000000 --- a/main/transformer_utils/mmpose/models/backbones/seresnext.py +++ /dev/null @@ -1,168 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from mmcv.cnn import build_conv_layer, build_norm_layer - -from ..builder import BACKBONES -from .resnet import ResLayer -from .seresnet import SEBottleneck as _SEBottleneck -from .seresnet import SEResNet - - -class SEBottleneck(_SEBottleneck): - """SEBottleneck block for SEResNeXt. - - Args: - in_channels (int): Input channels of this block. - out_channels (int): Output channels of this block. - base_channels (int): Middle channels of the first stage. Default: 64. - groups (int): Groups of conv2. - width_per_group (int): Width per group of conv2. 64x4d indicates - ``groups=64, width_per_group=4`` and 32x8d indicates - ``groups=32, width_per_group=8``. - stride (int): stride of the block. Default: 1 - dilation (int): dilation of convolution. Default: 1 - downsample (nn.Module): downsample operation on identity branch. - Default: None - se_ratio (int): Squeeze ratio in SELayer. Default: 16 - style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two - layer is the 3x3 conv layer, otherwise the stride-two layer is - the first 1x1 conv layer. - conv_cfg (dict): dictionary to construct and config conv layer. - Default: None - norm_cfg (dict): dictionary to construct and config norm layer. - Default: dict(type='BN') - with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. - """ - - def __init__(self, - in_channels, - out_channels, - base_channels=64, - groups=32, - width_per_group=4, - se_ratio=16, - **kwargs): - super().__init__(in_channels, out_channels, se_ratio, **kwargs) - self.groups = groups - self.width_per_group = width_per_group - - # We follow the same rational of ResNext to compute mid_channels. - # For SEResNet bottleneck, middle channels are determined by expansion - # and out_channels, but for SEResNeXt bottleneck, it is determined by - # groups and width_per_group and the stage it is located in. 
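# SELayer itself is imported from .utils.se_layer and is not part of this
# diff. A minimal squeeze-and-excitation gate of the conventional form
# (illustrative; the mmpose version is more configurable) looks like:
import torch
import torch.nn as nn

class SimpleSELayer(nn.Module):
    def __init__(self, channels, ratio=16):
        super().__init__()
        self.pool = nn.AdaptiveAvgPool2d(1)  # squeeze: global spatial context
        self.fc = nn.Sequential(
            nn.Conv2d(channels, channels // ratio, 1),
            nn.ReLU(inplace=True),
            nn.Conv2d(channels // ratio, channels, 1),
            nn.Sigmoid())  # excitation: per-channel gates in (0, 1)

    def forward(self, x):
        return x * self.fc(self.pool(x))

# SimpleSELayer(256)(torch.rand(1, 256, 14, 14)) keeps the input shape.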
- if groups != 1: - assert self.mid_channels % base_channels == 0 - self.mid_channels = ( - groups * width_per_group * self.mid_channels // base_channels) - - self.norm1_name, norm1 = build_norm_layer( - self.norm_cfg, self.mid_channels, postfix=1) - self.norm2_name, norm2 = build_norm_layer( - self.norm_cfg, self.mid_channels, postfix=2) - self.norm3_name, norm3 = build_norm_layer( - self.norm_cfg, self.out_channels, postfix=3) - - self.conv1 = build_conv_layer( - self.conv_cfg, - self.in_channels, - self.mid_channels, - kernel_size=1, - stride=self.conv1_stride, - bias=False) - self.add_module(self.norm1_name, norm1) - self.conv2 = build_conv_layer( - self.conv_cfg, - self.mid_channels, - self.mid_channels, - kernel_size=3, - stride=self.conv2_stride, - padding=self.dilation, - dilation=self.dilation, - groups=groups, - bias=False) - - self.add_module(self.norm2_name, norm2) - self.conv3 = build_conv_layer( - self.conv_cfg, - self.mid_channels, - self.out_channels, - kernel_size=1, - bias=False) - self.add_module(self.norm3_name, norm3) - - -@BACKBONES.register_module() -class SEResNeXt(SEResNet): - """SEResNeXt backbone. - - Please refer to the `paper `__ for - details. - - Args: - depth (int): Network depth, from {50, 101, 152}. - groups (int): Groups of conv2 in Bottleneck. Default: 32. - width_per_group (int): Width per group of conv2 in Bottleneck. - Default: 4. - se_ratio (int): Squeeze ratio in SELayer. Default: 16. - in_channels (int): Number of input image channels. Default: 3. - stem_channels (int): Output channels of the stem layer. Default: 64. - num_stages (int): Stages of the network. Default: 4. - strides (Sequence[int]): Strides of the first block of each stage. - Default: ``(1, 2, 2, 2)``. - dilations (Sequence[int]): Dilation of each stage. - Default: ``(1, 1, 1, 1)``. - out_indices (Sequence[int]): Output from which stages. If only one - stage is specified, a single tensor (feature map) is returned, - otherwise multiple stages are specified, a tuple of tensors will - be returned. Default: ``(3, )``. - style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two - layer is the 3x3 conv layer, otherwise the stride-two layer is - the first 1x1 conv layer. - deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv. - Default: False. - avg_down (bool): Use AvgPool instead of stride conv when - downsampling in the bottleneck. Default: False. - frozen_stages (int): Stages to be frozen (stop grad and set eval mode). - -1 means not freezing any parameters. Default: -1. - conv_cfg (dict | None): The config dict for conv layers. Default: None. - norm_cfg (dict): The config dict for norm layers. - norm_eval (bool): Whether to set norm layers to eval mode, namely, - freeze running stats (mean and var). Note: Effect on Batch Norm - and its variants only. Default: False. - with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. Default: False. - zero_init_residual (bool): Whether to use zero init for last norm layer - in resblocks to let them behave as identity. Default: True. - - Example: - >>> from mmpose.models import SEResNeXt - >>> import torch - >>> self = SEResNet(depth=50, out_indices=(0, 1, 2, 3)) - >>> self.eval() - >>> inputs = torch.rand(1, 3, 224, 224) - >>> level_outputs = self.forward(inputs) - >>> for level_out in level_outputs: - ... 
print(tuple(level_out.shape)) - (1, 256, 56, 56) - (1, 512, 28, 28) - (1, 1024, 14, 14) - (1, 2048, 7, 7) - """ - - arch_settings = { - 50: (SEBottleneck, (3, 4, 6, 3)), - 101: (SEBottleneck, (3, 4, 23, 3)), - 152: (SEBottleneck, (3, 8, 36, 3)) - } - - def __init__(self, depth, groups=32, width_per_group=4, **kwargs): - self.groups = groups - self.width_per_group = width_per_group - super().__init__(depth, **kwargs) - - def make_res_layer(self, **kwargs): - return ResLayer( - groups=self.groups, - width_per_group=self.width_per_group, - base_channels=self.base_channels, - **kwargs) diff --git a/main/transformer_utils/mmpose/models/backbones/shufflenet_v1.py b/main/transformer_utils/mmpose/models/backbones/shufflenet_v1.py deleted file mode 100644 index 9f98cbd2132250ec13adcce6e642c966b0dbd7cc..0000000000000000000000000000000000000000 --- a/main/transformer_utils/mmpose/models/backbones/shufflenet_v1.py +++ /dev/null @@ -1,329 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import copy -import logging - -import torch -import torch.nn as nn -import torch.utils.checkpoint as cp -from mmcv.cnn import (ConvModule, build_activation_layer, constant_init, - normal_init) -from torch.nn.modules.batchnorm import _BatchNorm - -from ..builder import BACKBONES -from .base_backbone import BaseBackbone -from .utils import channel_shuffle, load_checkpoint, make_divisible - - -class ShuffleUnit(nn.Module): - """ShuffleUnit block. - - ShuffleNet unit with pointwise group convolution (GConv) and channel - shuffle. - - Args: - in_channels (int): The input channels of the ShuffleUnit. - out_channels (int): The output channels of the ShuffleUnit. - groups (int, optional): The number of groups to be used in grouped 1x1 - convolutions in each ShuffleUnit. Default: 3 - first_block (bool, optional): Whether it is the first ShuffleUnit of a - sequential ShuffleUnits. Default: True, which means not using the - grouped 1x1 convolution. - combine (str, optional): The ways to combine the input and output - branches. Default: 'add'. - conv_cfg (dict): Config dict for convolution layer. Default: None, - which means using conv2d. - norm_cfg (dict): Config dict for normalization layer. - Default: dict(type='BN'). - act_cfg (dict): Config dict for activation layer. - Default: dict(type='ReLU'). - with_cp (bool, optional): Use checkpoint or not. Using checkpoint - will save some memory while slowing down the training speed. - Default: False. - - Returns: - Tensor: The output tensor. - """ - - def __init__(self, - in_channels, - out_channels, - groups=3, - first_block=True, - combine='add', - conv_cfg=None, - norm_cfg=dict(type='BN'), - act_cfg=dict(type='ReLU'), - with_cp=False): - # Protect mutable default arguments - norm_cfg = copy.deepcopy(norm_cfg) - act_cfg = copy.deepcopy(act_cfg) - super().__init__() - self.in_channels = in_channels - self.out_channels = out_channels - self.first_block = first_block - self.combine = combine - self.groups = groups - self.bottleneck_channels = self.out_channels // 4 - self.with_cp = with_cp - - if self.combine == 'add': - self.depthwise_stride = 1 - self._combine_func = self._add - assert in_channels == out_channels, ( - 'in_channels must be equal to out_channels when combine ' - 'is add') - elif self.combine == 'concat': - self.depthwise_stride = 2 - self._combine_func = self._concat - self.out_channels -= self.in_channels - self.avgpool = nn.AvgPool2d(kernel_size=3, stride=2, padding=1) - else: - raise ValueError(f'Cannot combine tensors with {self.combine}. 
' - 'Only "add" and "concat" are supported') - - self.first_1x1_groups = 1 if first_block else self.groups - self.g_conv_1x1_compress = ConvModule( - in_channels=self.in_channels, - out_channels=self.bottleneck_channels, - kernel_size=1, - groups=self.first_1x1_groups, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg) - - self.depthwise_conv3x3_bn = ConvModule( - in_channels=self.bottleneck_channels, - out_channels=self.bottleneck_channels, - kernel_size=3, - stride=self.depthwise_stride, - padding=1, - groups=self.bottleneck_channels, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=None) - - self.g_conv_1x1_expand = ConvModule( - in_channels=self.bottleneck_channels, - out_channels=self.out_channels, - kernel_size=1, - groups=self.groups, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=None) - - self.act = build_activation_layer(act_cfg) - - @staticmethod - def _add(x, out): - # residual connection - return x + out - - @staticmethod - def _concat(x, out): - # concatenate along channel axis - return torch.cat((x, out), 1) - - def forward(self, x): - - def _inner_forward(x): - residual = x - - out = self.g_conv_1x1_compress(x) - out = self.depthwise_conv3x3_bn(out) - - if self.groups > 1: - out = channel_shuffle(out, self.groups) - - out = self.g_conv_1x1_expand(out) - - if self.combine == 'concat': - residual = self.avgpool(residual) - out = self.act(out) - out = self._combine_func(residual, out) - else: - out = self._combine_func(residual, out) - out = self.act(out) - return out - - if self.with_cp and x.requires_grad: - out = cp.checkpoint(_inner_forward, x) - else: - out = _inner_forward(x) - - return out - - -@BACKBONES.register_module() -class ShuffleNetV1(BaseBackbone): - """ShuffleNetV1 backbone. - - Args: - groups (int, optional): The number of groups to be used in grouped 1x1 - convolutions in each ShuffleUnit. Default: 3. - widen_factor (float, optional): Width multiplier - adjusts the number - of channels in each layer by this amount. Default: 1.0. - out_indices (Sequence[int]): Output from which stages. - Default: (2, ) - frozen_stages (int): Stages to be frozen (all param fixed). - Default: -1, which means not freezing any parameters. - conv_cfg (dict): Config dict for convolution layer. Default: None, - which means using conv2d. - norm_cfg (dict): Config dict for normalization layer. - Default: dict(type='BN'). - act_cfg (dict): Config dict for activation layer. - Default: dict(type='ReLU'). - norm_eval (bool): Whether to set norm layers to eval mode, namely, - freeze running stats (mean and var). Note: Effect on Batch Norm - and its variants only. Default: False. - with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. Default: False. - """ - - def __init__(self, - groups=3, - widen_factor=1.0, - out_indices=(2, ), - frozen_stages=-1, - conv_cfg=None, - norm_cfg=dict(type='BN'), - act_cfg=dict(type='ReLU'), - norm_eval=False, - with_cp=False): - # Protect mutable default arguments - norm_cfg = copy.deepcopy(norm_cfg) - act_cfg = copy.deepcopy(act_cfg) - super().__init__() - self.stage_blocks = [4, 8, 4] - self.groups = groups - - for index in out_indices: - if index not in range(0, 3): - raise ValueError('the item in out_indices must in ' - f'range(0, 3). But received {index}') - - if frozen_stages not in range(-1, 3): - raise ValueError('frozen_stages must be in range(-1, 3). 
' - f'But received {frozen_stages}') - self.out_indices = out_indices - self.frozen_stages = frozen_stages - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg - self.act_cfg = act_cfg - self.norm_eval = norm_eval - self.with_cp = with_cp - - if groups == 1: - channels = (144, 288, 576) - elif groups == 2: - channels = (200, 400, 800) - elif groups == 3: - channels = (240, 480, 960) - elif groups == 4: - channels = (272, 544, 1088) - elif groups == 8: - channels = (384, 768, 1536) - else: - raise ValueError(f'{groups} groups is not supported for 1x1 ' - 'Grouped Convolutions') - - channels = [make_divisible(ch * widen_factor, 8) for ch in channels] - - self.in_channels = int(24 * widen_factor) - - self.conv1 = ConvModule( - in_channels=3, - out_channels=self.in_channels, - kernel_size=3, - stride=2, - padding=1, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg) - self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) - - self.layers = nn.ModuleList() - for i, num_blocks in enumerate(self.stage_blocks): - first_block = (i == 0) - layer = self.make_layer(channels[i], num_blocks, first_block) - self.layers.append(layer) - - def _freeze_stages(self): - if self.frozen_stages >= 0: - for param in self.conv1.parameters(): - param.requires_grad = False - for i in range(self.frozen_stages): - layer = self.layers[i] - layer.eval() - for param in layer.parameters(): - param.requires_grad = False - - def init_weights(self, pretrained=None): - if isinstance(pretrained, str): - logger = logging.getLogger() - load_checkpoint(self, pretrained, strict=False, logger=logger) - elif pretrained is None: - for name, m in self.named_modules(): - if isinstance(m, nn.Conv2d): - if 'conv1' in name: - normal_init(m, mean=0, std=0.01) - else: - normal_init(m, mean=0, std=1.0 / m.weight.shape[1]) - elif isinstance(m, (_BatchNorm, nn.GroupNorm)): - constant_init(m, val=1, bias=0.0001) - if isinstance(m, _BatchNorm): - if m.running_mean is not None: - nn.init.constant_(m.running_mean, 0) - else: - raise TypeError('pretrained must be a str or None. But received ' - f'{type(pretrained)}') - - def make_layer(self, out_channels, num_blocks, first_block=False): - """Stack ShuffleUnit blocks to make a layer. - - Args: - out_channels (int): out_channels of the block. - num_blocks (int): Number of blocks. - first_block (bool, optional): Whether is the first ShuffleUnit of a - sequential ShuffleUnits. Default: False, which means using - the grouped 1x1 convolution. 
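# channel_shuffle is imported from .utils and not shown in this diff. A
# reference version of the interleave it performs (assumed to match the
# conventional ShuffleNet formulation):
import torch

def channel_shuffle_ref(x, groups):
    b, c, h, w = x.size()
    assert c % groups == 0, 'channels must divide evenly into groups'
    x = x.view(b, groups, c // groups, h, w)
    x = x.transpose(1, 2).contiguous()  # interleave across the groups
    return x.view(b, c, h, w)

# With groups=2, channel order [0, 1, 2, 3, 4, 5] becomes [0, 3, 1, 4, 2, 5].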
- """ - layers = [] - for i in range(num_blocks): - first_block = first_block if i == 0 else False - combine_mode = 'concat' if i == 0 else 'add' - layers.append( - ShuffleUnit( - self.in_channels, - out_channels, - groups=self.groups, - first_block=first_block, - combine=combine_mode, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg, - with_cp=self.with_cp)) - self.in_channels = out_channels - - return nn.Sequential(*layers) - - def forward(self, x): - x = self.conv1(x) - x = self.maxpool(x) - - outs = [] - for i, layer in enumerate(self.layers): - x = layer(x) - if i in self.out_indices: - outs.append(x) - - if len(outs) == 1: - return outs[0] - return tuple(outs) - - def train(self, mode=True): - super().train(mode) - self._freeze_stages() - if mode and self.norm_eval: - for m in self.modules(): - if isinstance(m, _BatchNorm): - m.eval() diff --git a/main/transformer_utils/mmpose/models/backbones/shufflenet_v2.py b/main/transformer_utils/mmpose/models/backbones/shufflenet_v2.py deleted file mode 100644 index e93533367afe4efa01fa67d14cafcca006c990e8..0000000000000000000000000000000000000000 --- a/main/transformer_utils/mmpose/models/backbones/shufflenet_v2.py +++ /dev/null @@ -1,302 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import copy -import logging - -import torch -import torch.nn as nn -import torch.utils.checkpoint as cp -from mmcv.cnn import ConvModule, constant_init, normal_init -from torch.nn.modules.batchnorm import _BatchNorm - -from ..builder import BACKBONES -from .base_backbone import BaseBackbone -from .utils import channel_shuffle, load_checkpoint - - -class InvertedResidual(nn.Module): - """InvertedResidual block for ShuffleNetV2 backbone. - - Args: - in_channels (int): The input channels of the block. - out_channels (int): The output channels of the block. - stride (int): Stride of the 3x3 convolution layer. Default: 1 - conv_cfg (dict): Config dict for convolution layer. - Default: None, which means using conv2d. - norm_cfg (dict): Config dict for normalization layer. - Default: dict(type='BN'). - act_cfg (dict): Config dict for activation layer. - Default: dict(type='ReLU'). - with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. Default: False. 
- """ - - def __init__(self, - in_channels, - out_channels, - stride=1, - conv_cfg=None, - norm_cfg=dict(type='BN'), - act_cfg=dict(type='ReLU'), - with_cp=False): - # Protect mutable default arguments - norm_cfg = copy.deepcopy(norm_cfg) - act_cfg = copy.deepcopy(act_cfg) - super().__init__() - self.stride = stride - self.with_cp = with_cp - - branch_features = out_channels // 2 - if self.stride == 1: - assert in_channels == branch_features * 2, ( - f'in_channels ({in_channels}) should equal to ' - f'branch_features * 2 ({branch_features * 2}) ' - 'when stride is 1') - - if in_channels != branch_features * 2: - assert self.stride != 1, ( - f'stride ({self.stride}) should not equal 1 when ' - f'in_channels != branch_features * 2') - - if self.stride > 1: - self.branch1 = nn.Sequential( - ConvModule( - in_channels, - in_channels, - kernel_size=3, - stride=self.stride, - padding=1, - groups=in_channels, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=None), - ConvModule( - in_channels, - branch_features, - kernel_size=1, - stride=1, - padding=0, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg), - ) - - self.branch2 = nn.Sequential( - ConvModule( - in_channels if (self.stride > 1) else branch_features, - branch_features, - kernel_size=1, - stride=1, - padding=0, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg), - ConvModule( - branch_features, - branch_features, - kernel_size=3, - stride=self.stride, - padding=1, - groups=branch_features, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=None), - ConvModule( - branch_features, - branch_features, - kernel_size=1, - stride=1, - padding=0, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg)) - - def forward(self, x): - - def _inner_forward(x): - if self.stride > 1: - out = torch.cat((self.branch1(x), self.branch2(x)), dim=1) - else: - x1, x2 = x.chunk(2, dim=1) - out = torch.cat((x1, self.branch2(x2)), dim=1) - - out = channel_shuffle(out, 2) - - return out - - if self.with_cp and x.requires_grad: - out = cp.checkpoint(_inner_forward, x) - else: - out = _inner_forward(x) - - return out - - -@BACKBONES.register_module() -class ShuffleNetV2(BaseBackbone): - """ShuffleNetV2 backbone. - - Args: - widen_factor (float): Width multiplier - adjusts the number of - channels in each layer by this amount. Default: 1.0. - out_indices (Sequence[int]): Output from which stages. - Default: (0, 1, 2, 3). - frozen_stages (int): Stages to be frozen (all param fixed). - Default: -1, which means not freezing any parameters. - conv_cfg (dict): Config dict for convolution layer. - Default: None, which means using conv2d. - norm_cfg (dict): Config dict for normalization layer. - Default: dict(type='BN'). - act_cfg (dict): Config dict for activation layer. - Default: dict(type='ReLU'). - norm_eval (bool): Whether to set norm layers to eval mode, namely, - freeze running stats (mean and var). Note: Effect on Batch Norm - and its variants only. Default: False. - with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. Default: False. 
- """ - - def __init__(self, - widen_factor=1.0, - out_indices=(3, ), - frozen_stages=-1, - conv_cfg=None, - norm_cfg=dict(type='BN'), - act_cfg=dict(type='ReLU'), - norm_eval=False, - with_cp=False): - # Protect mutable default arguments - norm_cfg = copy.deepcopy(norm_cfg) - act_cfg = copy.deepcopy(act_cfg) - super().__init__() - self.stage_blocks = [4, 8, 4] - for index in out_indices: - if index not in range(0, 4): - raise ValueError('the item in out_indices must in ' - f'range(0, 4). But received {index}') - - if frozen_stages not in range(-1, 4): - raise ValueError('frozen_stages must be in range(-1, 4). ' - f'But received {frozen_stages}') - self.out_indices = out_indices - self.frozen_stages = frozen_stages - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg - self.act_cfg = act_cfg - self.norm_eval = norm_eval - self.with_cp = with_cp - - if widen_factor == 0.5: - channels = [48, 96, 192, 1024] - elif widen_factor == 1.0: - channels = [116, 232, 464, 1024] - elif widen_factor == 1.5: - channels = [176, 352, 704, 1024] - elif widen_factor == 2.0: - channels = [244, 488, 976, 2048] - else: - raise ValueError('widen_factor must be in [0.5, 1.0, 1.5, 2.0]. ' - f'But received {widen_factor}') - - self.in_channels = 24 - self.conv1 = ConvModule( - in_channels=3, - out_channels=self.in_channels, - kernel_size=3, - stride=2, - padding=1, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg) - - self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) - - self.layers = nn.ModuleList() - for i, num_blocks in enumerate(self.stage_blocks): - layer = self._make_layer(channels[i], num_blocks) - self.layers.append(layer) - - output_channels = channels[-1] - self.layers.append( - ConvModule( - in_channels=self.in_channels, - out_channels=output_channels, - kernel_size=1, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg)) - - def _make_layer(self, out_channels, num_blocks): - """Stack blocks to make a layer. - - Args: - out_channels (int): out_channels of the block. - num_blocks (int): number of blocks. - """ - layers = [] - for i in range(num_blocks): - stride = 2 if i == 0 else 1 - layers.append( - InvertedResidual( - in_channels=self.in_channels, - out_channels=out_channels, - stride=stride, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg, - with_cp=self.with_cp)) - self.in_channels = out_channels - - return nn.Sequential(*layers) - - def _freeze_stages(self): - if self.frozen_stages >= 0: - for param in self.conv1.parameters(): - param.requires_grad = False - - for i in range(self.frozen_stages): - m = self.layers[i] - m.eval() - for param in m.parameters(): - param.requires_grad = False - - def init_weights(self, pretrained=None): - if isinstance(pretrained, str): - logger = logging.getLogger() - load_checkpoint(self, pretrained, strict=False, logger=logger) - elif pretrained is None: - for name, m in self.named_modules(): - if isinstance(m, nn.Conv2d): - if 'conv1' in name: - normal_init(m, mean=0, std=0.01) - else: - normal_init(m, mean=0, std=1.0 / m.weight.shape[1]) - elif isinstance(m, (_BatchNorm, nn.GroupNorm)): - constant_init(m.weight, val=1, bias=0.0001) - if isinstance(m, _BatchNorm): - if m.running_mean is not None: - nn.init.constant_(m.running_mean, 0) - else: - raise TypeError('pretrained must be a str or None. 
But received ' - f'{type(pretrained)}') - - def forward(self, x): - x = self.conv1(x) - x = self.maxpool(x) - - outs = [] - for i, layer in enumerate(self.layers): - x = layer(x) - if i in self.out_indices: - outs.append(x) - - if len(outs) == 1: - return outs[0] - return tuple(outs) - - def train(self, mode=True): - super().train(mode) - self._freeze_stages() - if mode and self.norm_eval: - for m in self.modules(): - if isinstance(m, nn.BatchNorm2d): - m.eval() diff --git a/main/transformer_utils/mmpose/models/backbones/swin.py b/main/transformer_utils/mmpose/models/backbones/swin.py deleted file mode 100644 index 2449cdca591bc0bbf601295bde11efe834b49f8a..0000000000000000000000000000000000000000 --- a/main/transformer_utils/mmpose/models/backbones/swin.py +++ /dev/null @@ -1,733 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from collections import OrderedDict -from copy import deepcopy - -import torch -import torch.nn as nn -import torch.nn.functional as F -import torch.utils.checkpoint as cp -from mmcv.cnn import build_norm_layer, constant_init, trunc_normal_init -from mmcv.cnn.bricks.transformer import FFN, build_dropout -from mmcv.cnn.utils.weight_init import trunc_normal_ -from mmcv.runner import _load_checkpoint -from mmcv.utils import to_2tuple - -from ...utils import get_root_logger -from ..builder import BACKBONES -from ..utils.transformer import PatchEmbed, PatchMerging -from .base_backbone import BaseBackbone -from .utils.ckpt_convert import swin_converter - - -class WindowMSA(nn.Module): - """Window based multi-head self-attention (W-MSA) module with relative - position bias. - - Args: - embed_dims (int): Number of input channels. - num_heads (int): Number of attention heads. - window_size (tuple[int]): The height and width of the window. - qkv_bias (bool, optional): If True, add a learnable bias to q, k, v. - Default: True. - qk_scale (float | None, optional): Override default qk scale of - head_dim ** -0.5 if set. Default: None. - attn_drop_rate (float, optional): Dropout ratio of attention weight. - Default: 0.0 - proj_drop_rate (float, optional): Dropout ratio of output. Default: 0. 
- """ - - def __init__(self, - embed_dims, - num_heads, - window_size, - qkv_bias=True, - qk_scale=None, - attn_drop_rate=0., - proj_drop_rate=0.): - - super().__init__() - self.embed_dims = embed_dims - self.window_size = window_size # Wh, Ww - self.num_heads = num_heads - head_embed_dims = embed_dims // num_heads - self.scale = qk_scale or head_embed_dims**-0.5 - - # define a parameter table of relative position bias - self.relative_position_bias_table = nn.Parameter( - torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), - num_heads)) # 2*Wh-1 * 2*Ww-1, nH - - # About 2x faster than original impl - Wh, Ww = self.window_size - rel_index_coords = self.double_step_seq(2 * Ww - 1, Wh, 1, Ww) - rel_position_index = rel_index_coords + rel_index_coords.T - rel_position_index = rel_position_index.flip(1).contiguous() - self.register_buffer('relative_position_index', rel_position_index) - - self.qkv = nn.Linear(embed_dims, embed_dims * 3, bias=qkv_bias) - self.attn_drop = nn.Dropout(attn_drop_rate) - self.proj = nn.Linear(embed_dims, embed_dims) - self.proj_drop = nn.Dropout(proj_drop_rate) - - self.softmax = nn.Softmax(dim=-1) - - def init_weights(self): - trunc_normal_(self.relative_position_bias_table, std=0.02) - - def forward(self, x, mask=None): - """ - Args: - - x (tensor): input features with shape of (num_windows*B, N, C) - mask (tensor | None, Optional): mask with shape of (num_windows, - Wh*Ww, Wh*Ww), value should be between (-inf, 0]. - """ - B, N, C = x.shape - qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, - C // self.num_heads).permute(2, 0, 3, 1, 4) - # make torchscript happy (cannot use tensor as tuple) - q, k, v = qkv[0], qkv[1], qkv[2] - - q = q * self.scale - attn = (q @ k.transpose(-2, -1)) - - relative_position_bias = self.relative_position_bias_table[ - self.relative_position_index.view(-1)].view( - self.window_size[0] * self.window_size[1], - self.window_size[0] * self.window_size[1], - -1) # Wh*Ww,Wh*Ww,nH - relative_position_bias = relative_position_bias.permute( - 2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww - attn = attn + relative_position_bias.unsqueeze(0) - - if mask is not None: - nW = mask.shape[0] - attn = attn.view(B // nW, nW, self.num_heads, N, - N) + mask.unsqueeze(1).unsqueeze(0) - attn = attn.view(-1, self.num_heads, N, N) - attn = self.softmax(attn) - - attn = self.attn_drop(attn) - - x = (attn @ v).transpose(1, 2).reshape(B, N, C) - x = self.proj(x) - x = self.proj_drop(x) - return x - - @staticmethod - def double_step_seq(step1, len1, step2, len2): - seq1 = torch.arange(0, step1 * len1, step1) - seq2 = torch.arange(0, step2 * len2, step2) - return (seq1[:, None] + seq2[None, :]).reshape(1, -1) - - -class ShiftWindowMSA(nn.Module): - """Shifted Window Multihead Self-Attention Module. - - Args: - embed_dims (int): Number of input channels. - num_heads (int): Number of attention heads. - window_size (int): The height and width of the window. - shift_size (int, optional): The shift step of each window towards - right-bottom. If zero, act as regular window-msa. Defaults to 0. - qkv_bias (bool, optional): If True, add a learnable bias to q, k, v. - Default: True - qk_scale (float | None, optional): Override default qk scale of - head_dim ** -0.5 if set. Defaults: None. - attn_drop_rate (float, optional): Dropout ratio of attention weight. - Defaults: 0. - proj_drop_rate (float, optional): Dropout ratio of output. - Defaults: 0. - dropout_layer (dict, optional): The dropout_layer used before output. - Defaults: dict(type='DropPath', drop_prob=0.). 
- """ - - def __init__(self, - embed_dims, - num_heads, - window_size, - shift_size=0, - qkv_bias=True, - qk_scale=None, - attn_drop_rate=0, - proj_drop_rate=0, - dropout_layer=dict(type='DropPath', drop_prob=0.)): - super().__init__() - - self.window_size = window_size - self.shift_size = shift_size - assert 0 <= self.shift_size < self.window_size - - self.w_msa = WindowMSA( - embed_dims=embed_dims, - num_heads=num_heads, - window_size=to_2tuple(window_size), - qkv_bias=qkv_bias, - qk_scale=qk_scale, - attn_drop_rate=attn_drop_rate, - proj_drop_rate=proj_drop_rate) - - self.drop = build_dropout(dropout_layer) - - def forward(self, query, hw_shape): - B, L, C = query.shape - H, W = hw_shape - assert L == H * W, 'input feature has wrong size' - query = query.view(B, H, W, C) - - # pad feature maps to multiples of window size - pad_r = (self.window_size - W % self.window_size) % self.window_size - pad_b = (self.window_size - H % self.window_size) % self.window_size - query = F.pad(query, (0, 0, 0, pad_r, 0, pad_b)) - H_pad, W_pad = query.shape[1], query.shape[2] - - # cyclic shift - if self.shift_size > 0: - shifted_query = torch.roll( - query, - shifts=(-self.shift_size, -self.shift_size), - dims=(1, 2)) - - # calculate attention mask for SW-MSA - img_mask = torch.zeros((1, H_pad, W_pad, 1), device=query.device) - h_slices = (slice(0, -self.window_size), - slice(-self.window_size, - -self.shift_size), slice(-self.shift_size, None)) - w_slices = (slice(0, -self.window_size), - slice(-self.window_size, - -self.shift_size), slice(-self.shift_size, None)) - cnt = 0 - for h in h_slices: - for w in w_slices: - img_mask[:, h, w, :] = cnt - cnt += 1 - - # nW, window_size, window_size, 1 - mask_windows = self.window_partition(img_mask) - mask_windows = mask_windows.view( - -1, self.window_size * self.window_size) - attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) - attn_mask = attn_mask.masked_fill(attn_mask != 0, - float(-100.0)).masked_fill( - attn_mask == 0, float(0.0)) - else: - shifted_query = query - attn_mask = None - - # nW*B, window_size, window_size, C - query_windows = self.window_partition(shifted_query) - # nW*B, window_size*window_size, C - query_windows = query_windows.view(-1, self.window_size**2, C) - - # W-MSA/SW-MSA (nW*B, window_size*window_size, C) - attn_windows = self.w_msa(query_windows, mask=attn_mask) - - # merge windows - attn_windows = attn_windows.view(-1, self.window_size, - self.window_size, C) - - # B H' W' C - shifted_x = self.window_reverse(attn_windows, H_pad, W_pad) - # reverse cyclic shift - if self.shift_size > 0: - x = torch.roll( - shifted_x, - shifts=(self.shift_size, self.shift_size), - dims=(1, 2)) - else: - x = shifted_x - - if pad_r > 0 or pad_b: - x = x[:, :H, :W, :].contiguous() - - x = x.view(B, H * W, C) - - x = self.drop(x) - return x - - def window_reverse(self, windows, H, W): - """ - Args: - windows: (num_windows*B, window_size, window_size, C) - H (int): Height of image - W (int): Width of image - Returns: - x: (B, H, W, C) - """ - window_size = self.window_size - B = int(windows.shape[0] / (H * W / window_size / window_size)) - x = windows.view(B, H // window_size, W // window_size, window_size, - window_size, -1) - x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) - return x - - def window_partition(self, x): - """ - Args: - x: (B, H, W, C) - Returns: - windows: (num_windows*B, window_size, window_size, C) - """ - B, H, W, C = x.shape - window_size = self.window_size - x = x.view(B, H // window_size, window_size, W 
// window_size, - window_size, C) - windows = x.permute(0, 1, 3, 2, 4, 5).contiguous() - windows = windows.view(-1, window_size, window_size, C) - return windows - - -class SwinBlock(nn.Module): - """" - Args: - embed_dims (int): The feature dimension. - num_heads (int): Parallel attention heads. - feedforward_channels (int): The hidden dimension for FFNs. - window_size (int, optional): The local window scale. Default: 7. - shift (bool, optional): whether to shift window or not. Default False. - qkv_bias (bool, optional): enable bias for qkv if True. Default: True. - qk_scale (float | None, optional): Override default qk scale of - head_dim ** -0.5 if set. Default: None. - drop_rate (float, optional): Dropout rate. Default: 0. - attn_drop_rate (float, optional): Attention dropout rate. Default: 0. - drop_path_rate (float, optional): Stochastic depth rate. Default: 0. - act_cfg (dict, optional): The config dict of activation function. - Default: dict(type='GELU'). - norm_cfg (dict, optional): The config dict of normalization. - Default: dict(type='LN'). - with_cp (bool, optional): Use checkpoint or not. Using checkpoint - will save some memory while slowing down the training speed. - Default: False. - """ - - def __init__(self, - embed_dims, - num_heads, - feedforward_channels, - window_size=7, - shift=False, - qkv_bias=True, - qk_scale=None, - drop_rate=0., - attn_drop_rate=0., - drop_path_rate=0., - act_cfg=dict(type='GELU'), - norm_cfg=dict(type='LN'), - with_cp=False): - - super(SwinBlock, self).__init__() - - self.with_cp = with_cp - - self.norm1 = build_norm_layer(norm_cfg, embed_dims)[1] - self.attn = ShiftWindowMSA( - embed_dims=embed_dims, - num_heads=num_heads, - window_size=window_size, - shift_size=window_size // 2 if shift else 0, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - attn_drop_rate=attn_drop_rate, - proj_drop_rate=drop_rate, - dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate)) - - self.norm2 = build_norm_layer(norm_cfg, embed_dims)[1] - self.ffn = FFN( - embed_dims=embed_dims, - feedforward_channels=feedforward_channels, - num_fcs=2, - ffn_drop=drop_rate, - dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate), - act_cfg=act_cfg, - add_identity=True, - init_cfg=None) - - def forward(self, x, hw_shape): - - def _inner_forward(x): - identity = x - x = self.norm1(x) - x = self.attn(x, hw_shape) - - x = x + identity - - identity = x - x = self.norm2(x) - x = self.ffn(x, identity=identity) - - return x - - if self.with_cp and x.requires_grad: - x = cp.checkpoint(_inner_forward, x) - else: - x = _inner_forward(x) - - return x - - -class SwinBlockSequence(nn.Module): - """Implements one stage in Swin Transformer. - - Args: - embed_dims (int): The feature dimension. - num_heads (int): Parallel attention heads. - feedforward_channels (int): The hidden dimension for FFNs. - depth (int): The number of blocks in this stage. - window_size (int, optional): The local window scale. Default: 7. - qkv_bias (bool, optional): enable bias for qkv if True. Default: True. - qk_scale (float | None, optional): Override default qk scale of - head_dim ** -0.5 if set. Default: None. - drop_rate (float, optional): Dropout rate. Default: 0. - attn_drop_rate (float, optional): Attention dropout rate. Default: 0. - drop_path_rate (float | list[float], optional): Stochastic depth - rate. Default: 0. - downsample (nn.Module | None, optional): The downsample operation - module. Default: None. - act_cfg (dict, optional): The config dict of activation function. 
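`window_partition` and `window_reverse` above are exact inverses on padded inputs, which is what lets `ShiftWindowMSA` reassemble the feature map after attention. A quick round-trip check, written as a standalone sketch:

```python
import torch

def window_partition(x, ws):
    # (B, H, W, C) -> (num_windows*B, ws, ws, C)
    B, H, W, C = x.shape
    x = x.view(B, H // ws, ws, W // ws, ws, C)
    return x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, ws, ws, C)

def window_reverse(windows, ws, H, W):
    # (num_windows*B, ws, ws, C) -> (B, H, W, C)
    B = int(windows.shape[0] / (H * W / ws / ws))
    x = windows.view(B, H // ws, W // ws, ws, ws, -1)
    return x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)

x = torch.randn(2, 14, 14, 96)   # H, W already padded to multiples of ws
w = window_partition(x, 7)       # (2 * 4, 7, 7, 96): four windows per image
assert torch.equal(window_reverse(w, 7, 14, 14), x)
```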
- Default: dict(type='GELU'). - norm_cfg (dict, optional): The config dict of normalization. - Default: dict(type='LN'). - with_cp (bool, optional): Use checkpoint or not. Using checkpoint - will save some memory while slowing down the training speed. - Default: False. - """ - - def __init__(self, - embed_dims, - num_heads, - feedforward_channels, - depth, - window_size=7, - qkv_bias=True, - qk_scale=None, - drop_rate=0., - attn_drop_rate=0., - drop_path_rate=0., - downsample=None, - act_cfg=dict(type='GELU'), - norm_cfg=dict(type='LN'), - with_cp=False): - super().__init__() - - if isinstance(drop_path_rate, list): - drop_path_rates = drop_path_rate - assert len(drop_path_rates) == depth - else: - drop_path_rates = [deepcopy(drop_path_rate) for _ in range(depth)] - - self.blocks = nn.ModuleList() - for i in range(depth): - block = SwinBlock( - embed_dims=embed_dims, - num_heads=num_heads, - feedforward_channels=feedforward_channels, - window_size=window_size, - shift=False if i % 2 == 0 else True, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - drop_rate=drop_rate, - attn_drop_rate=attn_drop_rate, - drop_path_rate=drop_path_rates[i], - act_cfg=act_cfg, - norm_cfg=norm_cfg, - with_cp=with_cp) - self.blocks.append(block) - - self.downsample = downsample - - def forward(self, x, hw_shape): - for block in self.blocks: - x = block(x, hw_shape) - - if self.downsample: - x_down, down_hw_shape = self.downsample(x, hw_shape) - return x_down, down_hw_shape, x, hw_shape - else: - return x, hw_shape, x, hw_shape - - -@BACKBONES.register_module() -class SwinTransformer(BaseBackbone): - """ Swin Transformer - A PyTorch implement of : `Swin Transformer: - Hierarchical Vision Transformer using Shifted Windows` - - https://arxiv.org/abs/2103.14030 - - Inspiration from - https://github.com/microsoft/Swin-Transformer - - Args: - pretrain_img_size (int | tuple[int]): The size of input image when - pretrain. Defaults: 224. - in_channels (int): The num of input channels. - Defaults: 3. - embed_dims (int): The feature dimension. Default: 96. - patch_size (int | tuple[int]): Patch size. Default: 4. - window_size (int): Window size. Default: 7. - mlp_ratio (int): Ratio of mlp hidden dim to embedding dim. - Default: 4. - depths (tuple[int]): Depths of each Swin Transformer stage. - Default: (2, 2, 6, 2). - num_heads (tuple[int]): Parallel attention heads of each Swin - Transformer stage. Default: (3, 6, 12, 24). - strides (tuple[int]): The patch merging or patch embedding stride of - each Swin Transformer stage. (In swin, we set kernel size equal to - stride.) Default: (4, 2, 2, 2). - out_indices (tuple[int]): Output from which stages. - Default: (0, 1, 2, 3). - qkv_bias (bool, optional): If True, add a learnable bias to query, key, - value. Default: True - qk_scale (float | None, optional): Override default qk scale of - head_dim ** -0.5 if set. Default: None. - patch_norm (bool): If add a norm layer for patch embed and patch - merging. Default: True. - drop_rate (float): Dropout rate. Defaults: 0. - attn_drop_rate (float): Attention dropout rate. Default: 0. - drop_path_rate (float): Stochastic depth rate. Defaults: 0.1. - use_abs_pos_embed (bool): If True, add absolute position embedding to - the patch embedding. Defaults: False. - act_cfg (dict): Config dict for activation layer. - Default: dict(type='LN'). - norm_cfg (dict): Config dict for normalization layer at - output of backone. Defaults: dict(type='LN'). - with_cp (bool, optional): Use checkpoint or not. 
Using checkpoint - will save some memory while slowing down the training speed. - Default: False. - pretrained (str, optional): model pretrained path. Default: None. - convert_weights (bool): The flag indicates whether the - pre-trained model is from the original repo. We may need - to convert some keys to make it compatible. - Default: False. - frozen_stages (int): Stages to be frozen (stop grad and set eval mode). - Default: -1 (-1 means not freezing any parameters). - """ - - def __init__( - self, - pretrain_img_size=224, - in_channels=3, - embed_dims=96, - patch_size=4, - window_size=7, - mlp_ratio=4, - depths=(2, 2, 6, 2), - num_heads=(3, 6, 12, 24), - strides=(4, 2, 2, 2), - out_indices=(0, 1, 2, 3), - qkv_bias=True, - qk_scale=None, - patch_norm=True, - drop_rate=0., - attn_drop_rate=0., - drop_path_rate=0.1, - use_abs_pos_embed=False, - act_cfg=dict(type='GELU'), - norm_cfg=dict(type='LN'), - with_cp=False, - convert_weights=False, - frozen_stages=-1, - ): - self.convert_weights = convert_weights - self.frozen_stages = frozen_stages - if isinstance(pretrain_img_size, int): - pretrain_img_size = to_2tuple(pretrain_img_size) - elif isinstance(pretrain_img_size, tuple): - if len(pretrain_img_size) == 1: - pretrain_img_size = to_2tuple(pretrain_img_size[0]) - assert len(pretrain_img_size) == 2, \ - f'The size of image should have length 1 or 2, ' \ - f'but got {len(pretrain_img_size)}' - - super(SwinTransformer, self).__init__() - - num_layers = len(depths) - self.out_indices = out_indices - self.use_abs_pos_embed = use_abs_pos_embed - - assert strides[0] == patch_size, 'Use non-overlapping patch embed.' - - self.patch_embed = PatchEmbed( - in_channels=in_channels, - embed_dims=embed_dims, - conv_type='Conv2d', - kernel_size=patch_size, - stride=strides[0], - norm_cfg=norm_cfg if patch_norm else None, - init_cfg=None) - - if self.use_abs_pos_embed: - patch_row = pretrain_img_size[0] // patch_size - patch_col = pretrain_img_size[1] // patch_size - num_patches = patch_row * patch_col - self.absolute_pos_embed = nn.Parameter( - torch.zeros((1, num_patches, embed_dims))) - - self.drop_after_pos = nn.Dropout(p=drop_rate) - - # set stochastic depth decay rule - total_depth = sum(depths) - dpr = [ - x.item() for x in torch.linspace(0, drop_path_rate, total_depth) - ] - - self.stages = nn.ModuleList() - in_channels = embed_dims - for i in range(num_layers): - if i < num_layers - 1: - downsample = PatchMerging( - in_channels=in_channels, - out_channels=2 * in_channels, - stride=strides[i + 1], - norm_cfg=norm_cfg if patch_norm else None, - init_cfg=None) - else: - downsample = None - - stage = SwinBlockSequence( - embed_dims=in_channels, - num_heads=num_heads[i], - feedforward_channels=mlp_ratio * in_channels, - depth=depths[i], - window_size=window_size, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - drop_rate=drop_rate, - attn_drop_rate=attn_drop_rate, - drop_path_rate=dpr[sum(depths[:i]):sum(depths[:i + 1])], - downsample=downsample, - act_cfg=act_cfg, - norm_cfg=norm_cfg, - with_cp=with_cp) - self.stages.append(stage) - if downsample: - in_channels = downsample.out_channels - - self.num_features = [int(embed_dims * 2**i) for i in range(num_layers)] - # Add a norm layer for each output - for i in out_indices: - layer = build_norm_layer(norm_cfg, self.num_features[i])[1] - layer_name = f'norm{i}' - self.add_module(layer_name, layer) - - def train(self, mode=True): - """Convert the model into training mode while keep layers freezed.""" - super(SwinTransformer, self).train(mode) - 
self._freeze_stages() - - def _freeze_stages(self): - if self.frozen_stages >= 0: - self.patch_embed.eval() - for param in self.patch_embed.parameters(): - param.requires_grad = False - if self.use_abs_pos_embed: - self.absolute_pos_embed.requires_grad = False - self.drop_after_pos.eval() - - for i in range(1, self.frozen_stages + 1): - - if (i - 1) in self.out_indices: - norm_layer = getattr(self, f'norm{i-1}') - norm_layer.eval() - for param in norm_layer.parameters(): - param.requires_grad = False - - m = self.stages[i - 1] - m.eval() - for param in m.parameters(): - param.requires_grad = False - - def init_weights(self, pretrained=None): - """Initialize the weights in backbone. - - Args: - pretrained (str, optional): Path to pre-trained weights. - Defaults to None. - """ - if isinstance(pretrained, str): - logger = get_root_logger() - ckpt = _load_checkpoint( - pretrained, logger=logger, map_location='cpu') - if 'state_dict' in ckpt: - _state_dict = ckpt['state_dict'] - elif 'model' in ckpt: - _state_dict = ckpt['model'] - else: - _state_dict = ckpt - if self.convert_weights: - # supported loading weight from original repo, - _state_dict = swin_converter(_state_dict) - - state_dict = OrderedDict() - for k, v in _state_dict.items(): - if k.startswith('backbone.'): - state_dict[k[9:]] = v - - # strip prefix of state_dict - if list(state_dict.keys())[0].startswith('module.'): - state_dict = {k[7:]: v for k, v in state_dict.items()} - - # reshape absolute position embedding - if state_dict.get('absolute_pos_embed') is not None: - absolute_pos_embed = state_dict['absolute_pos_embed'] - N1, L, C1 = absolute_pos_embed.size() - N2, C2, H, W = self.absolute_pos_embed.size() - if N1 != N2 or C1 != C2 or L != H * W: - logger.warning('Error in loading absolute_pos_embed, pass') - else: - state_dict['absolute_pos_embed'] = absolute_pos_embed.view( - N2, H, W, C2).permute(0, 3, 1, 2).contiguous() - - # interpolate position bias table if needed - relative_position_bias_table_keys = [ - k for k in state_dict.keys() - if 'relative_position_bias_table' in k - ] - for table_key in relative_position_bias_table_keys: - table_pretrained = state_dict[table_key] - table_current = self.state_dict()[table_key] - L1, nH1 = table_pretrained.size() - L2, nH2 = table_current.size() - if nH1 != nH2: - logger.warning(f'Error in loading {table_key}, pass') - elif L1 != L2: - S1 = int(L1**0.5) - S2 = int(L2**0.5) - table_pretrained_resized = F.interpolate( - table_pretrained.permute(1, 0).reshape(1, nH1, S1, S1), - size=(S2, S2), - mode='bicubic') - state_dict[table_key] = table_pretrained_resized.view( - nH2, L2).permute(1, 0).contiguous() - - # load state_dict - self.load_state_dict(state_dict, False) - elif pretrained is None: - if self.use_abs_pos_embed: - trunc_normal_(self.absolute_pos_embed, std=0.02) - for m in self.modules(): - if isinstance(m, nn.Linear): - trunc_normal_init(m, std=.02, bias=0.) 
- elif isinstance(m, nn.LayerNorm): - constant_init(m, 1.0) - else: - raise TypeError('pretrained must be a str or None') - - def forward(self, x): - x, hw_shape = self.patch_embed(x) - - if self.use_abs_pos_embed: - x = x + self.absolute_pos_embed - x = self.drop_after_pos(x) - - outs = [] - for i, stage in enumerate(self.stages): - x, hw_shape, out, out_hw_shape = stage(x, hw_shape) - if i in self.out_indices: - norm_layer = getattr(self, f'norm{i}') - out = norm_layer(out) - out = out.view(-1, *out_hw_shape, - self.num_features[i]).permute(0, 3, 1, - 2).contiguous() - outs.append(out) - - return outs diff --git a/main/transformer_utils/mmpose/models/backbones/tcformer.py b/main/transformer_utils/mmpose/models/backbones/tcformer.py deleted file mode 100644 index a0805cdddd17bbba50bf203e2bc9012efd86ba03..0000000000000000000000000000000000000000 --- a/main/transformer_utils/mmpose/models/backbones/tcformer.py +++ /dev/null @@ -1,283 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import math - -import torch -import torch.nn as nn -from mmcv.cnn import (build_norm_layer, constant_init, normal_init, - trunc_normal_init) -from mmcv.runner import _load_checkpoint, load_state_dict - -from ...utils import get_root_logger -from ..builder import BACKBONES -from ..utils import (PatchEmbed, TCFormerDynamicBlock, TCFormerRegularBlock, - TokenConv, cluster_dpc_knn, merge_tokens, - tcformer_convert, token2map) - - -class CTM(nn.Module): - """Clustering-based Token Merging module in TCFormer. - - Args: - sample_ratio (float): The sample ratio of tokens. - embed_dim (int): Input token feature dimension. - dim_out (int): Output token feature dimension. - k (int): number of the nearest neighbor used i DPC-knn algorithm. - """ - - def __init__(self, sample_ratio, embed_dim, dim_out, k=5): - super().__init__() - self.sample_ratio = sample_ratio - self.dim_out = dim_out - self.conv = TokenConv( - in_channels=embed_dim, - out_channels=dim_out, - kernel_size=3, - stride=2, - padding=1) - self.norm = nn.LayerNorm(self.dim_out) - self.score = nn.Linear(self.dim_out, 1) - self.k = k - - def forward(self, token_dict): - token_dict = token_dict.copy() - x = self.conv(token_dict) - x = self.norm(x) - token_score = self.score(x) - token_weight = token_score.exp() - - token_dict['x'] = x - B, N, C = x.shape - token_dict['token_score'] = token_score - - cluster_num = max(math.ceil(N * self.sample_ratio), 1) - idx_cluster, cluster_num = cluster_dpc_knn(token_dict, cluster_num, - self.k) - down_dict = merge_tokens(token_dict, idx_cluster, cluster_num, - token_weight) - - H, W = token_dict['map_size'] - H = math.floor((H - 1) / 2 + 1) - W = math.floor((W - 1) / 2 + 1) - down_dict['map_size'] = [H, W] - - return down_dict, token_dict - - -@BACKBONES.register_module() -class TCFormer(nn.Module): - """Token Clustering Transformer (TCFormer) - - Implementation of `Not All Tokens Are Equal: Human-centric Visual - Analysis via Token Clustering Transformer - ` - - Args: - in_channels (int): Number of input channels. Default: 3. - embed_dims (list[int]): Embedding dimension. Default: - [64, 128, 256, 512]. - num_heads (Sequence[int]): The attention heads of each transformer - encode layer. Default: [1, 2, 5, 8]. - mlp_ratios (Sequence[int]): The ratio of the mlp hidden dim to the - embedding dim of each transformer block. - qkv_bias (bool): Enable bias for qkv if True. Default: True. - qk_scale (float | None, optional): Override default qk scale of - head_dim ** -0.5 if set. Default: None. 
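Both `SwinTransformer` above and `TCFormer` below build their stochastic-depth schedule the same way: one linspace over the total block count, sliced per stage, so deeper blocks get larger drop-path probabilities. A sketch with Swin-T style depths (the numbers here are illustrative):

```python
import torch

depths = (2, 2, 6, 2)               # per-stage block counts, Swin-T style
drop_path_rate = 0.1
# Rates grow linearly from 0 to drop_path_rate over all blocks.
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]
for i in range(len(depths)):
    stage_rates = dpr[sum(depths[:i]):sum(depths[:i + 1])]
    print(f'stage {i}:', [round(r, 3) for r in stage_rates])
# stage 0 gets the smallest rates, stage 3 ends at drop_path_rate itself.
```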
- drop_rate (float): Probability of an element to be zeroed. - Default 0.0. - attn_drop_rate (float): The drop out rate for attention layer. - Default 0.0. - drop_path_rate (float): stochastic depth rate. Default 0. - norm_cfg (dict): Config dict for normalization layer. - Default: dict(type='LN', eps=1e-6). - num_layers (Sequence[int]): The layer number of each transformer encode - layer. Default: [3, 4, 6, 3]. - sr_ratios (Sequence[int]): The spatial reduction rate of each - transformer block. Default: [8, 4, 2, 1]. - num_stages (int): The num of stages. Default: 4. - pretrained (str, optional): model pretrained path. Default: None. - k (int): number of the nearest neighbor used for local density. - sample_ratios (list[float]): The sample ratios of CTM modules. - Default: [0.25, 0.25, 0.25] - return_map (bool): If True, transfer dynamic tokens to feature map at - last. Default: False - convert_weights (bool): The flag indicates whether the - pre-trained model is from the original repo. We may need - to convert some keys to make it compatible. - Default: True. - """ - - def __init__(self, - in_channels=3, - embed_dims=[64, 128, 256, 512], - num_heads=[1, 2, 4, 8], - mlp_ratios=[4, 4, 4, 4], - qkv_bias=True, - qk_scale=None, - drop_rate=0., - attn_drop_rate=0., - drop_path_rate=0., - norm_cfg=dict(type='LN', eps=1e-6), - num_layers=[3, 4, 6, 3], - sr_ratios=[8, 4, 2, 1], - num_stages=4, - pretrained=None, - k=5, - sample_ratios=[0.25, 0.25, 0.25], - return_map=False, - convert_weights=True): - super().__init__() - - self.num_layers = num_layers - self.num_stages = num_stages - self.grid_stride = sr_ratios[0] - self.embed_dims = embed_dims - self.sr_ratios = sr_ratios - self.mlp_ratios = mlp_ratios - self.sample_ratios = sample_ratios - self.return_map = return_map - self.convert_weights = convert_weights - - # stochastic depth decay rule - dpr = [ - x.item() - for x in torch.linspace(0, drop_path_rate, sum(num_layers)) - ] - cur = 0 - - # In stage 1, use the standard transformer blocks - for i in range(1): - patch_embed = PatchEmbed( - in_channels=in_channels if i == 0 else embed_dims[i - 1], - embed_dims=embed_dims[i], - kernel_size=7, - stride=4, - padding=3, - bias=True, - norm_cfg=dict(type='LN', eps=1e-6)) - - block = nn.ModuleList([ - TCFormerRegularBlock( - dim=embed_dims[i], - num_heads=num_heads[i], - mlp_ratio=mlp_ratios[i], - qkv_bias=qkv_bias, - qk_scale=qk_scale, - drop=drop_rate, - attn_drop=attn_drop_rate, - drop_path=dpr[cur + j], - norm_cfg=norm_cfg, - sr_ratio=sr_ratios[i]) for j in range(num_layers[i]) - ]) - norm = build_norm_layer(norm_cfg, embed_dims[i])[1] - - cur += num_layers[i] - - setattr(self, f'patch_embed{i + 1}', patch_embed) - setattr(self, f'block{i + 1}', block) - setattr(self, f'norm{i + 1}', norm) - - # In stage 2~4, use TCFormerDynamicBlock for dynamic tokens - for i in range(1, num_stages): - ctm = CTM(sample_ratios[i - 1], embed_dims[i - 1], embed_dims[i], - k) - - block = nn.ModuleList([ - TCFormerDynamicBlock( - dim=embed_dims[i], - num_heads=num_heads[i], - mlp_ratio=mlp_ratios[i], - qkv_bias=qkv_bias, - qk_scale=qk_scale, - drop=drop_rate, - attn_drop=attn_drop_rate, - drop_path=dpr[cur + j], - norm_cfg=norm_cfg, - sr_ratio=sr_ratios[i]) for j in range(num_layers[i]) - ]) - norm = build_norm_layer(norm_cfg, embed_dims[i])[1] - cur += num_layers[i] - - setattr(self, f'ctm{i}', ctm) - setattr(self, f'block{i + 1}', block) - setattr(self, f'norm{i + 1}', norm) - - self.init_weights(pretrained) - - def init_weights(self, pretrained=None): - if 
isinstance(pretrained, str): - logger = get_root_logger() - - checkpoint = _load_checkpoint( - pretrained, logger=logger, map_location='cpu') - logger.warning(f'Load pre-trained model for ' - f'{self.__class__.__name__} from original repo') - if 'state_dict' in checkpoint: - state_dict = checkpoint['state_dict'] - elif 'model' in checkpoint: - state_dict = checkpoint['model'] - else: - state_dict = checkpoint - - if self.convert_weights: - # We need to convert pre-trained weights to match this - # implementation. - state_dict = tcformer_convert(state_dict) - load_state_dict(self, state_dict, strict=False, logger=logger) - - elif pretrained is None: - for m in self.modules(): - if isinstance(m, nn.Linear): - trunc_normal_init(m, std=.02, bias=0.) - elif isinstance(m, nn.LayerNorm): - constant_init(m, 1.0) - elif isinstance(m, nn.Conv2d): - fan_out = m.kernel_size[0] * m.kernel_size[ - 1] * m.out_channels - fan_out //= m.groups - normal_init(m, 0, math.sqrt(2.0 / fan_out)) - else: - raise TypeError('pretrained must be a str or None') - - def forward(self, x): - outs = [] - - i = 0 - patch_embed = getattr(self, f'patch_embed{i + 1}') - block = getattr(self, f'block{i + 1}') - norm = getattr(self, f'norm{i + 1}') - x, (H, W) = patch_embed(x) - for blk in block: - x = blk(x, H, W) - x = norm(x) - - # init token dict - B, N, _ = x.shape - device = x.device - idx_token = torch.arange(N)[None, :].repeat(B, 1).to(device) - agg_weight = x.new_ones(B, N, 1) - token_dict = { - 'x': x, - 'token_num': N, - 'map_size': [H, W], - 'init_grid_size': [H, W], - 'idx_token': idx_token, - 'agg_weight': agg_weight - } - outs.append(token_dict.copy()) - - # stage 2~4 - for i in range(1, self.num_stages): - ctm = getattr(self, f'ctm{i}') - block = getattr(self, f'block{i + 1}') - norm = getattr(self, f'norm{i + 1}') - - token_dict = ctm(token_dict) # down sample - for j, blk in enumerate(block): - token_dict = blk(token_dict) - - token_dict['x'] = norm(token_dict['x']) - outs.append(token_dict) - - if self.return_map: - outs = [token2map(token_dict) for token_dict in outs] - return outs diff --git a/main/transformer_utils/mmpose/models/backbones/tcn.py b/main/transformer_utils/mmpose/models/backbones/tcn.py deleted file mode 100644 index deca2290aeb1830bc3e241b819157369371aaf27..0000000000000000000000000000000000000000 --- a/main/transformer_utils/mmpose/models/backbones/tcn.py +++ /dev/null @@ -1,267 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import copy - -import torch.nn as nn -from mmcv.cnn import ConvModule, build_conv_layer, constant_init, kaiming_init -from mmcv.utils.parrots_wrapper import _BatchNorm - -from mmpose.core import WeightNormClipHook -from ..builder import BACKBONES -from .base_backbone import BaseBackbone - - -class BasicTemporalBlock(nn.Module): - """Basic block for VideoPose3D. - - Args: - in_channels (int): Input channels of this block. - out_channels (int): Output channels of this block. - mid_channels (int): The output channels of conv1. Default: 1024. - kernel_size (int): Size of the convolving kernel. Default: 3. - dilation (int): Spacing between kernel elements. Default: 3. - dropout (float): Dropout rate. Default: 0.25. - causal (bool): Use causal convolutions instead of symmetric - convolutions (for real-time applications). Default: False. - residual (bool): Use residual connection. Default: True. - use_stride_conv (bool): Use optimized TCN that designed - specifically for single-frame batching, i.e. 
where batches have - input length = receptive field, and output length = 1. This - implementation replaces dilated convolutions with strided - convolutions to avoid generating unused intermediate results. - Default: False. - conv_cfg (dict): dictionary to construct and config conv layer. - Default: dict(type='Conv1d'). - norm_cfg (dict): dictionary to construct and config norm layer. - Default: dict(type='BN1d'). - """ - - def __init__(self, - in_channels, - out_channels, - mid_channels=1024, - kernel_size=3, - dilation=3, - dropout=0.25, - causal=False, - residual=True, - use_stride_conv=False, - conv_cfg=dict(type='Conv1d'), - norm_cfg=dict(type='BN1d')): - # Protect mutable default arguments - conv_cfg = copy.deepcopy(conv_cfg) - norm_cfg = copy.deepcopy(norm_cfg) - super().__init__() - self.in_channels = in_channels - self.out_channels = out_channels - self.mid_channels = mid_channels - self.kernel_size = kernel_size - self.dilation = dilation - self.dropout = dropout - self.causal = causal - self.residual = residual - self.use_stride_conv = use_stride_conv - - self.pad = (kernel_size - 1) * dilation // 2 - if use_stride_conv: - self.stride = kernel_size - self.causal_shift = kernel_size // 2 if causal else 0 - self.dilation = 1 - else: - self.stride = 1 - self.causal_shift = kernel_size // 2 * dilation if causal else 0 - - self.conv1 = nn.Sequential( - ConvModule( - in_channels, - mid_channels, - kernel_size=kernel_size, - stride=self.stride, - dilation=self.dilation, - bias='auto', - conv_cfg=conv_cfg, - norm_cfg=norm_cfg)) - self.conv2 = nn.Sequential( - ConvModule( - mid_channels, - out_channels, - kernel_size=1, - bias='auto', - conv_cfg=conv_cfg, - norm_cfg=norm_cfg)) - - if residual and in_channels != out_channels: - self.short_cut = build_conv_layer(conv_cfg, in_channels, - out_channels, 1) - else: - self.short_cut = None - - self.dropout = nn.Dropout(dropout) if dropout > 0 else None - - def forward(self, x): - """Forward function.""" - if self.use_stride_conv: - assert self.causal_shift + self.kernel_size // 2 < x.shape[2] - else: - assert 0 <= self.pad + self.causal_shift < x.shape[2] - \ - self.pad + self.causal_shift <= x.shape[2] - - out = self.conv1(x) - if self.dropout is not None: - out = self.dropout(out) - - out = self.conv2(out) - if self.dropout is not None: - out = self.dropout(out) - - if self.residual: - if self.use_stride_conv: - res = x[:, :, self.causal_shift + - self.kernel_size // 2::self.kernel_size] - else: - res = x[:, :, - (self.pad + self.causal_shift):(x.shape[2] - self.pad + - self.causal_shift)] - - if self.short_cut is not None: - res = self.short_cut(res) - out = out + res - - return out - - -@BACKBONES.register_module() -class TCN(BaseBackbone): - """TCN backbone. - - Temporal Convolutional Networks. - More details can be found in the - `paper `__ . - - Args: - in_channels (int): Number of input channels, which equals to - num_keypoints * num_features. - stem_channels (int): Number of feature channels. Default: 1024. - num_blocks (int): NUmber of basic temporal convolutional blocks. - Default: 2. - kernel_sizes (Sequence[int]): Sizes of the convolving kernel of - each basic block. Default: ``(3, 3, 3)``. - dropout (float): Dropout rate. Default: 0.25. - causal (bool): Use causal convolutions instead of symmetric - convolutions (for real-time applications). - Default: False. - residual (bool): Use residual connection. Default: True. - use_stride_conv (bool): Use TCN backbone optimized for - single-frame batching, i.e. 
where batches have input length = - receptive field, and output length = 1. This implementation - replaces dilated convolutions with strided convolutions to avoid - generating unused intermediate results. The weights are - interchangeable with the reference implementation. Default: False - conv_cfg (dict): dictionary to construct and config conv layer. - Default: dict(type='Conv1d'). - norm_cfg (dict): dictionary to construct and config norm layer. - Default: dict(type='BN1d'). - max_norm (float|None): if not None, the weight of convolution layers - will be clipped to have a maximum norm of max_norm. - - Example: - >>> from mmpose.models import TCN - >>> import torch - >>> self = TCN(in_channels=34) - >>> self.eval() - >>> inputs = torch.rand(1, 34, 243) - >>> level_outputs = self.forward(inputs) - >>> for level_out in level_outputs: - ... print(tuple(level_out.shape)) - (1, 1024, 235) - (1, 1024, 217) - """ - - def __init__(self, - in_channels, - stem_channels=1024, - num_blocks=2, - kernel_sizes=(3, 3, 3), - dropout=0.25, - causal=False, - residual=True, - use_stride_conv=False, - conv_cfg=dict(type='Conv1d'), - norm_cfg=dict(type='BN1d'), - max_norm=None): - # Protect mutable default arguments - conv_cfg = copy.deepcopy(conv_cfg) - norm_cfg = copy.deepcopy(norm_cfg) - super().__init__() - self.in_channels = in_channels - self.stem_channels = stem_channels - self.num_blocks = num_blocks - self.kernel_sizes = kernel_sizes - self.dropout = dropout - self.causal = causal - self.residual = residual - self.use_stride_conv = use_stride_conv - self.max_norm = max_norm - - assert num_blocks == len(kernel_sizes) - 1 - for ks in kernel_sizes: - assert ks % 2 == 1, 'Only odd filter widths are supported.' - - self.expand_conv = ConvModule( - in_channels, - stem_channels, - kernel_size=kernel_sizes[0], - stride=kernel_sizes[0] if use_stride_conv else 1, - bias='auto', - conv_cfg=conv_cfg, - norm_cfg=norm_cfg) - - dilation = kernel_sizes[0] - self.tcn_blocks = nn.ModuleList() - for i in range(1, num_blocks + 1): - self.tcn_blocks.append( - BasicTemporalBlock( - in_channels=stem_channels, - out_channels=stem_channels, - mid_channels=stem_channels, - kernel_size=kernel_sizes[i], - dilation=dilation, - dropout=dropout, - causal=causal, - residual=residual, - use_stride_conv=use_stride_conv, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg)) - dilation *= kernel_sizes[i] - - if self.max_norm is not None: - # Apply weight norm clip to conv layers - weight_clip = WeightNormClipHook(self.max_norm) - for module in self.modules(): - if isinstance(module, nn.modules.conv._ConvNd): - weight_clip.register(module) - - self.dropout = nn.Dropout(dropout) if dropout > 0 else None - - def forward(self, x): - """Forward function.""" - x = self.expand_conv(x) - - if self.dropout is not None: - x = self.dropout(x) - - outs = [] - for i in range(self.num_blocks): - x = self.tcn_blocks[i](x) - outs.append(x) - - return tuple(outs) - - def init_weights(self, pretrained=None): - """Initialize the weights.""" - super().init_weights(pretrained) - if pretrained is None: - for m in self.modules(): - if isinstance(m, nn.modules.conv._ConvNd): - kaiming_init(m, mode='fan_in', nonlinearity='relu') - elif isinstance(m, _BatchNorm): - constant_init(m, 1) diff --git a/main/transformer_utils/mmpose/models/backbones/utils/utils.py b/main/transformer_utils/mmpose/models/backbones/utils/utils.py index 2a53c94a90a1802cc0c4dcfceba241711c989640..5f6ab9b43202b6491911f4e0a713cbf3f210566d 100644 --- 
a/main/transformer_utils/mmpose/models/backbones/utils/utils.py +++ b/main/transformer_utils/mmpose/models/backbones/utils/utils.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. from collections import OrderedDict -from mmcv.runner.checkpoint import _load_checkpoint, load_state_dict +from mmengine.runner import load_state_dict # Copyright (c) Open-MMLab. All rights reserved. @@ -22,11 +22,11 @@ from torch.utils import model_zoo from torch.nn import functional as F import mmcv -from mmcv.fileio import FileClient -from mmcv.fileio import load as load_file -from mmcv.parallel import is_module_wrapper -from mmcv.utils import mkdir_or_exist -from mmcv.runner import get_dist_info +from mmengine.fileio import FileClient +from mmengine.fileio import load as load_file +# from mmengine.model.wrappers.utils import is_module_wrapper +from mmengine.utils import mkdir_or_exist +from mmengine.dist import get_dist_info from scipy import interpolate import numpy as np @@ -75,8 +75,8 @@ def load_state_dict(module, state_dict, strict=False, logger=None): def load(module, prefix=''): # recursively check parallel module in case that the model has a # complicated structure, e.g., nn.Module(nn.Module(DDP)) - if is_module_wrapper(module): - module = module.module + # if is_module_wrapper(module): + # module = module.module local_metadata = {} if metadata is None else metadata.get( prefix[:-1], {}) module._load_from_state_dict(state_dict, prefix, local_metadata, True, @@ -445,8 +445,8 @@ def get_state_dict(module, destination=None, prefix='', keep_vars=False): """ # recursively check parallel module in case that the model has a # complicated structure, e.g., nn.Module(nn.Module(DDP)) - if is_module_wrapper(module): - module = module.module + # if is_module_wrapper(module): + # module = module.module # below is the same as torch.nn.Module.state_dict() if destination is None: @@ -482,8 +482,8 @@ def save_checkpoint(model, filename, optimizer=None, meta=None): raise TypeError(f'meta must be a dict or None, but got {type(meta)}') meta.update(mmcv_version=mmcv.__version__, time=time.asctime()) - if is_module_wrapper(model): - model = model.module + # if is_module_wrapper(model): + # model = model.module if hasattr(model, 'CLASSES') and model.CLASSES is not None: # save class name to the meta diff --git a/main/transformer_utils/mmpose/models/backbones/v2v_net.py b/main/transformer_utils/mmpose/models/backbones/v2v_net.py deleted file mode 100644 index 99462af711069a34c13628364e2c466163507861..0000000000000000000000000000000000000000 --- a/main/transformer_utils/mmpose/models/backbones/v2v_net.py +++ /dev/null @@ -1,257 +0,0 @@ -# ------------------------------------------------------------------------------ -# Copyright and License Information -# Adapted from -# https://github.com/microsoft/voxelpose-pytorch/blob/main/lib/models/v2v_net.py -# Original Licence: MIT License -# ------------------------------------------------------------------------------ - -import torch.nn as nn -import torch.nn.functional as F -from mmcv.cnn import ConvModule - -from ..builder import BACKBONES -from .base_backbone import BaseBackbone - - -class Basic3DBlock(nn.Module): - """A basic 3D convolutional block. - - Args: - in_channels (int): Input channels of this block. - out_channels (int): Output channels of this block. - kernel_size (int): Kernel size of the convolution operation - conv_cfg (dict): Dictionary to construct and config conv layer. 
- Default: dict(type='Conv3d') - norm_cfg (dict): Dictionary to construct and config norm layer. - Default: dict(type='BN3d') - """ - - def __init__(self, - in_channels, - out_channels, - kernel_size, - conv_cfg=dict(type='Conv3d'), - norm_cfg=dict(type='BN3d')): - super(Basic3DBlock, self).__init__() - self.block = ConvModule( - in_channels, - out_channels, - kernel_size, - stride=1, - padding=((kernel_size - 1) // 2), - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - bias=True) - - def forward(self, x): - """Forward function.""" - return self.block(x) - - -class Res3DBlock(nn.Module): - """A residual 3D convolutional block. - - Args: - in_channels (int): Input channels of this block. - out_channels (int): Output channels of this block. - kernel_size (int): Kernel size of the convolution operation - Default: 3 - conv_cfg (dict): Dictionary to construct and config conv layer. - Default: dict(type='Conv3d') - norm_cfg (dict): Dictionary to construct and config norm layer. - Default: dict(type='BN3d') - """ - - def __init__(self, - in_channels, - out_channels, - kernel_size=3, - conv_cfg=dict(type='Conv3d'), - norm_cfg=dict(type='BN3d')): - super(Res3DBlock, self).__init__() - self.res_branch = nn.Sequential( - ConvModule( - in_channels, - out_channels, - kernel_size, - stride=1, - padding=((kernel_size - 1) // 2), - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - bias=True), - ConvModule( - out_channels, - out_channels, - kernel_size, - stride=1, - padding=((kernel_size - 1) // 2), - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=None, - bias=True)) - - if in_channels == out_channels: - self.skip_con = nn.Sequential() - else: - self.skip_con = ConvModule( - in_channels, - out_channels, - 1, - stride=1, - padding=0, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=None, - bias=True) - - def forward(self, x): - """Forward function.""" - res = self.res_branch(x) - skip = self.skip_con(x) - return F.relu(res + skip, True) - - -class Pool3DBlock(nn.Module): - """A 3D max-pool block. - - Args: - pool_size (int): Pool size of the 3D max-pool layer - """ - - def __init__(self, pool_size): - super(Pool3DBlock, self).__init__() - self.pool_size = pool_size - - def forward(self, x): - """Forward function.""" - return F.max_pool3d( - x, kernel_size=self.pool_size, stride=self.pool_size) - - -class Upsample3DBlock(nn.Module): - """A 3D upsample block. - - Args: - in_channels (int): Input channels of this block. - out_channels (int): Output channels of this block. - kernel_size (int): Kernel size of the transposed convolution operation. - Default: 2 - stride (int): Kernel size of the transposed convolution operation. - Default: 2 - """ - - def __init__(self, in_channels, out_channels, kernel_size=2, stride=2): - super(Upsample3DBlock, self).__init__() - assert kernel_size == 2 - assert stride == 2 - self.block = nn.Sequential( - nn.ConvTranspose3d( - in_channels, - out_channels, - kernel_size=kernel_size, - stride=stride, - padding=0, - output_padding=0), nn.BatchNorm3d(out_channels), nn.ReLU(True)) - - def forward(self, x): - """Forward function.""" - return self.block(x) - - -class EncoderDecorder(nn.Module): - """An encoder-decoder block. 
- - Args: - in_channels (int): Input channels of this block - """ - - def __init__(self, in_channels=32): - super(EncoderDecorder, self).__init__() - - self.encoder_pool1 = Pool3DBlock(2) - self.encoder_res1 = Res3DBlock(in_channels, in_channels * 2) - self.encoder_pool2 = Pool3DBlock(2) - self.encoder_res2 = Res3DBlock(in_channels * 2, in_channels * 4) - - self.mid_res = Res3DBlock(in_channels * 4, in_channels * 4) - - self.decoder_res2 = Res3DBlock(in_channels * 4, in_channels * 4) - self.decoder_upsample2 = Upsample3DBlock(in_channels * 4, - in_channels * 2, 2, 2) - self.decoder_res1 = Res3DBlock(in_channels * 2, in_channels * 2) - self.decoder_upsample1 = Upsample3DBlock(in_channels * 2, in_channels, - 2, 2) - - self.skip_res1 = Res3DBlock(in_channels, in_channels) - self.skip_res2 = Res3DBlock(in_channels * 2, in_channels * 2) - - def forward(self, x): - """Forward function.""" - skip_x1 = self.skip_res1(x) - x = self.encoder_pool1(x) - x = self.encoder_res1(x) - - skip_x2 = self.skip_res2(x) - x = self.encoder_pool2(x) - x = self.encoder_res2(x) - - x = self.mid_res(x) - - x = self.decoder_res2(x) - x = self.decoder_upsample2(x) - x = x + skip_x2 - - x = self.decoder_res1(x) - x = self.decoder_upsample1(x) - x = x + skip_x1 - - return x - - -@BACKBONES.register_module() -class V2VNet(BaseBackbone): - """V2VNet. - - Please refer to the `paper ` - for details. - - Args: - input_channels (int): - Number of channels of the input feature volume. - output_channels (int): - Number of channels of the output volume. - mid_channels (int): - Input and output channels of the encoder-decoder block. - """ - - def __init__(self, input_channels, output_channels, mid_channels=32): - super(V2VNet, self).__init__() - - self.front_layers = nn.Sequential( - Basic3DBlock(input_channels, mid_channels // 2, 7), - Res3DBlock(mid_channels // 2, mid_channels), - ) - - self.encoder_decoder = EncoderDecorder(in_channels=mid_channels) - - self.output_layer = nn.Conv3d( - mid_channels, output_channels, kernel_size=1, stride=1, padding=0) - - self._initialize_weights() - - def forward(self, x): - """Forward function.""" - x = self.front_layers(x) - x = self.encoder_decoder(x) - x = self.output_layer(x) - - return x - - def _initialize_weights(self): - for m in self.modules(): - if isinstance(m, nn.Conv3d): - nn.init.normal_(m.weight, 0, 0.001) - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.ConvTranspose3d): - nn.init.normal_(m.weight, 0, 0.001) - nn.init.constant_(m.bias, 0) diff --git a/main/transformer_utils/mmpose/models/backbones/vgg.py b/main/transformer_utils/mmpose/models/backbones/vgg.py deleted file mode 100644 index f7d467017a5520f399c84b1235ec64c99b805b42..0000000000000000000000000000000000000000 --- a/main/transformer_utils/mmpose/models/backbones/vgg.py +++ /dev/null @@ -1,193 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
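The `EncoderDecorder` deleted above is a compact 3D U-Net-like module: resolution halves at each pooling level while channels double, and decoder features are merged with encoder skips by addition rather than concatenation. A toy sketch of the same wiring (plain convs stand in for the Res3DBlocks; channel counts and sizes are illustrative):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class TinyEncoderDecoder3D(nn.Module):
    # Two pooling levels with additive skip connections, mirroring the
    # deleted EncoderDecorder: resolution /2 twice, channels x2 twice.
    def __init__(self, c=8):
        super().__init__()
        self.enc1 = nn.Conv3d(c, 2 * c, 3, padding=1)
        self.enc2 = nn.Conv3d(2 * c, 4 * c, 3, padding=1)
        self.mid = nn.Conv3d(4 * c, 4 * c, 3, padding=1)
        self.up2 = nn.ConvTranspose3d(4 * c, 2 * c, 2, stride=2)
        self.up1 = nn.ConvTranspose3d(2 * c, c, 2, stride=2)

    def forward(self, x):
        skip1 = x
        x = self.enc1(F.max_pool3d(x, 2))
        skip2 = x
        x = self.enc2(F.max_pool3d(x, 2))
        x = self.mid(x)
        x = self.up2(x) + skip2      # add, not concatenate, as in V2VNet
        x = self.up1(x) + skip1
        return x

vol = torch.randn(1, 8, 16, 16, 16)
print(TinyEncoderDecoder3D()(vol).shape)  # torch.Size([1, 8, 16, 16, 16])
```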
-import torch.nn as nn -from mmcv.cnn import ConvModule, constant_init, kaiming_init, normal_init -from mmcv.utils.parrots_wrapper import _BatchNorm - -from ..builder import BACKBONES -from .base_backbone import BaseBackbone - - -def make_vgg_layer(in_channels, - out_channels, - num_blocks, - conv_cfg=None, - norm_cfg=None, - act_cfg=dict(type='ReLU'), - dilation=1, - with_norm=False, - ceil_mode=False): - layers = [] - for _ in range(num_blocks): - layer = ConvModule( - in_channels=in_channels, - out_channels=out_channels, - kernel_size=3, - dilation=dilation, - padding=dilation, - bias=True, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg) - layers.append(layer) - in_channels = out_channels - layers.append(nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=ceil_mode)) - - return layers - - -@BACKBONES.register_module() -class VGG(BaseBackbone): - """VGG backbone. - - Args: - depth (int): Depth of vgg, from {11, 13, 16, 19}. - with_norm (bool): Use BatchNorm or not. - num_classes (int): number of classes for classification. - num_stages (int): VGG stages, normally 5. - dilations (Sequence[int]): Dilation of each stage. - out_indices (Sequence[int]): Output from which stages. If only one - stage is specified, a single tensor (feature map) is returned, - otherwise multiple stages are specified, a tuple of tensors will - be returned. When it is None, the default behavior depends on - whether num_classes is specified. If num_classes <= 0, the default - value is (4, ), outputting the last feature map before classifier. - If num_classes > 0, the default value is (5, ), outputting the - classification score. Default: None. - frozen_stages (int): Stages to be frozen (all param fixed). -1 means - not freezing any parameters. - norm_eval (bool): Whether to set norm layers to eval mode, namely, - freeze running stats (mean and var). Note: Effect on Batch Norm - and its variants only. Default: False. - ceil_mode (bool): Whether to use ceil_mode of MaxPool. Default: False. - with_last_pool (bool): Whether to keep the last pooling before - classifier. Default: True. - """ - - # Parameters to build layers. Each element specifies the number of conv in - # each stage. For example, VGG11 contains 11 layers with learnable - # parameters. 11 is computed as 11 = (1 + 1 + 2 + 2 + 2) + 3, - # where 3 indicates the last three fully-connected layers. 
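`make_vgg_layer` above produces one stage as `num_blocks` 3x3 convs followed by a max-pool, and the stage widths double up to 512. A condensed sketch of how `stage_blocks` translates into the 13 conv layers of VGG-16 (standalone, using plain torch modules instead of ConvModule):

```python
import torch.nn as nn

def make_stage(in_c, out_c, num_blocks):
    # num_blocks 3x3 convs, then one 2x2 max-pool, as in make_vgg_layer.
    layers = []
    for _ in range(num_blocks):
        layers += [nn.Conv2d(in_c, out_c, 3, padding=1), nn.ReLU(inplace=True)]
        in_c = out_c
    layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
    return layers

stage_blocks = (2, 2, 3, 3, 3)      # VGG-16: 13 convs + 3 FC layers = 16
layers, in_c = [], 3
for i, n in enumerate(stage_blocks):
    out_c = 64 * 2 ** i if i < 4 else 512   # 64, 128, 256, 512, 512
    layers += make_stage(in_c, out_c, n)
    in_c = out_c
features = nn.Sequential(*layers)
print(sum(isinstance(m, nn.Conv2d) for m in features))  # 13
```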
- arch_settings = { - 11: (1, 1, 2, 2, 2), - 13: (2, 2, 2, 2, 2), - 16: (2, 2, 3, 3, 3), - 19: (2, 2, 4, 4, 4) - } - - def __init__(self, - depth, - num_classes=-1, - num_stages=5, - dilations=(1, 1, 1, 1, 1), - out_indices=None, - frozen_stages=-1, - conv_cfg=None, - norm_cfg=None, - act_cfg=dict(type='ReLU'), - norm_eval=False, - ceil_mode=False, - with_last_pool=True): - super().__init__() - if depth not in self.arch_settings: - raise KeyError(f'invalid depth {depth} for vgg') - assert num_stages >= 1 and num_stages <= 5 - stage_blocks = self.arch_settings[depth] - self.stage_blocks = stage_blocks[:num_stages] - assert len(dilations) == num_stages - - self.num_classes = num_classes - self.frozen_stages = frozen_stages - self.norm_eval = norm_eval - with_norm = norm_cfg is not None - - if out_indices is None: - out_indices = (5, ) if num_classes > 0 else (4, ) - assert max(out_indices) <= num_stages - self.out_indices = out_indices - - self.in_channels = 3 - start_idx = 0 - vgg_layers = [] - self.range_sub_modules = [] - for i, num_blocks in enumerate(self.stage_blocks): - num_modules = num_blocks + 1 - end_idx = start_idx + num_modules - dilation = dilations[i] - out_channels = 64 * 2**i if i < 4 else 512 - vgg_layer = make_vgg_layer( - self.in_channels, - out_channels, - num_blocks, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg, - dilation=dilation, - with_norm=with_norm, - ceil_mode=ceil_mode) - vgg_layers.extend(vgg_layer) - self.in_channels = out_channels - self.range_sub_modules.append([start_idx, end_idx]) - start_idx = end_idx - if not with_last_pool: - vgg_layers.pop(-1) - self.range_sub_modules[-1][1] -= 1 - self.module_name = 'features' - self.add_module(self.module_name, nn.Sequential(*vgg_layers)) - - if self.num_classes > 0: - self.classifier = nn.Sequential( - nn.Linear(512 * 7 * 7, 4096), - nn.ReLU(True), - nn.Dropout(), - nn.Linear(4096, 4096), - nn.ReLU(True), - nn.Dropout(), - nn.Linear(4096, num_classes), - ) - - def init_weights(self, pretrained=None): - super().init_weights(pretrained) - if pretrained is None: - for m in self.modules(): - if isinstance(m, nn.Conv2d): - kaiming_init(m) - elif isinstance(m, _BatchNorm): - constant_init(m, 1) - elif isinstance(m, nn.Linear): - normal_init(m, std=0.01) - - def forward(self, x): - outs = [] - vgg_layers = getattr(self, self.module_name) - for i in range(len(self.stage_blocks)): - for j in range(*self.range_sub_modules[i]): - vgg_layer = vgg_layers[j] - x = vgg_layer(x) - if i in self.out_indices: - outs.append(x) - if self.num_classes > 0: - x = x.view(x.size(0), -1) - x = self.classifier(x) - outs.append(x) - if len(outs) == 1: - return outs[0] - else: - return tuple(outs) - - def _freeze_stages(self): - vgg_layers = getattr(self, self.module_name) - for i in range(self.frozen_stages): - for j in range(*self.range_sub_modules[i]): - m = vgg_layers[j] - m.eval() - for param in m.parameters(): - param.requires_grad = False - - def train(self, mode=True): - super().train(mode) - self._freeze_stages() - if mode and self.norm_eval: - for m in self.modules(): - # trick: eval have effect on BatchNorm only - if isinstance(m, _BatchNorm): - m.eval() diff --git a/main/transformer_utils/mmpose/models/backbones/vipnas_mbv3.py b/main/transformer_utils/mmpose/models/backbones/vipnas_mbv3.py deleted file mode 100644 index ed990e3966b27301dbaf081e3ec0e908704dfc8b..0000000000000000000000000000000000000000 --- a/main/transformer_utils/mmpose/models/backbones/vipnas_mbv3.py +++ /dev/null @@ -1,179 +0,0 @@ -# Copyright (c) 
OpenMMLab. All rights reserved. -import copy -import logging - -import torch.nn as nn -from mmcv.cnn import ConvModule -from torch.nn.modules.batchnorm import _BatchNorm - -from ..builder import BACKBONES -from .base_backbone import BaseBackbone -from .utils import InvertedResidual, load_checkpoint - - -@BACKBONES.register_module() -class ViPNAS_MobileNetV3(BaseBackbone): - """ViPNAS_MobileNetV3 backbone. - - "ViPNAS: Efficient Video Pose Estimation via Neural Architecture Search" - More details can be found in the `paper - `__ . - - Args: - wid (list(int)): Searched width config for each stage. - expan (list(int)): Searched expansion ratio config for each stage. - dep (list(int)): Searched depth config for each stage. - ks (list(int)): Searched kernel size config for each stage. - group (list(int)): Searched group number config for each stage. - att (list(bool)): Searched attention config for each stage. - stride (list(int)): Stride config for each stage. - act (list(dict)): Activation config for each stage. - conv_cfg (dict): Config dict for convolution layer. - Default: None, which means using conv2d. - norm_cfg (dict): Config dict for normalization layer. - Default: dict(type='BN'). - frozen_stages (int): Stages to be frozen (all param fixed). - Default: -1, which means not freezing any parameters. - norm_eval (bool): Whether to set norm layers to eval mode, namely, - freeze running stats (mean and var). Note: Effect on Batch Norm - and its variants only. Default: False. - with_cp (bool): Use checkpoint or not. Using checkpoint will save - some memory while slowing down the training speed. - Default: False. - """ - - def __init__(self, - wid=[16, 16, 24, 40, 80, 112, 160], - expan=[None, 1, 5, 4, 5, 5, 6], - dep=[None, 1, 4, 4, 4, 4, 4], - ks=[3, 3, 7, 7, 5, 7, 5], - group=[None, 8, 120, 20, 100, 280, 240], - att=[None, True, True, False, True, True, True], - stride=[2, 1, 2, 2, 2, 1, 2], - act=[ - 'HSwish', 'ReLU', 'ReLU', 'ReLU', 'HSwish', 'HSwish', - 'HSwish' - ], - conv_cfg=None, - norm_cfg=dict(type='BN'), - frozen_stages=-1, - norm_eval=False, - with_cp=False): - # Protect mutable default arguments - norm_cfg = copy.deepcopy(norm_cfg) - super().__init__() - self.wid = wid - self.expan = expan - self.dep = dep - self.ks = ks - self.group = group - self.att = att - self.stride = stride - self.act = act - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg - self.frozen_stages = frozen_stages - self.norm_eval = norm_eval - self.with_cp = with_cp - - self.conv1 = ConvModule( - in_channels=3, - out_channels=self.wid[0], - kernel_size=self.ks[0], - stride=self.stride[0], - padding=self.ks[0] // 2, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=dict(type=self.act[0])) - - self.layers = self._make_layer() - - def _make_layer(self): - layers = [] - layer_index = 0 - for i, dep in enumerate(self.dep[1:]): - mid_channels = self.wid[i + 1] * self.expan[i + 1] - - if self.att[i + 1]: - se_cfg = dict( - channels=mid_channels, - ratio=4, - act_cfg=(dict(type='ReLU'), dict(type='HSigmoid'))) - else: - se_cfg = None - - if self.expan[i + 1] == 1: - with_expand_conv = False - else: - with_expand_conv = True - - for j in range(dep): - if j == 0: - stride = self.stride[i + 1] - in_channels = self.wid[i] - else: - stride = 1 - in_channels = self.wid[i + 1] - - layer = InvertedResidual( - in_channels=in_channels, - out_channels=self.wid[i + 1], - mid_channels=mid_channels, - kernel_size=self.ks[i + 1], - groups=self.group[i + 1], - stride=stride, - se_cfg=se_cfg, - 
-                    with_expand_conv=with_expand_conv,
-                    conv_cfg=self.conv_cfg,
-                    norm_cfg=self.norm_cfg,
-                    act_cfg=dict(type=self.act[i + 1]),
-                    with_cp=self.with_cp)
-                layer_index += 1
-                layer_name = f'layer{layer_index}'
-                self.add_module(layer_name, layer)
-                layers.append(layer_name)
-        return layers
-
-    def init_weights(self, pretrained=None):
-        if isinstance(pretrained, str):
-            logger = logging.getLogger()
-            load_checkpoint(self, pretrained, strict=False, logger=logger)
-        elif pretrained is None:
-            for m in self.modules():
-                if isinstance(m, nn.Conv2d):
-                    nn.init.normal_(m.weight, std=0.001)
-                    for name, _ in m.named_parameters():
-                        if name in ['bias']:
-                            nn.init.constant_(m.bias, 0)
-                elif isinstance(m, nn.BatchNorm2d):
-                    nn.init.constant_(m.weight, 1)
-                    nn.init.constant_(m.bias, 0)
-        else:
-            raise TypeError('pretrained must be a str or None')
-
-    def forward(self, x):
-        x = self.conv1(x)
-
-        for i, layer_name in enumerate(self.layers):
-            layer = getattr(self, layer_name)
-            x = layer(x)
-
-        return x
-
-    def _freeze_stages(self):
-        if self.frozen_stages >= 0:
-            for param in self.conv1.parameters():
-                param.requires_grad = False
-            for i in range(1, self.frozen_stages + 1):
-                layer = getattr(self, f'layer{i}')
-                layer.eval()
-                for param in layer.parameters():
-                    param.requires_grad = False
-
-    def train(self, mode=True):
-        super().train(mode)
-        self._freeze_stages()
-        if mode and self.norm_eval:
-            for m in self.modules():
-                if isinstance(m, _BatchNorm):
-                    m.eval()
diff --git a/main/transformer_utils/mmpose/models/backbones/vipnas_resnet.py b/main/transformer_utils/mmpose/models/backbones/vipnas_resnet.py
deleted file mode 100644
index 81b028ed5f5caad5f59c68b7f82c1a4661cf4d6f..0000000000000000000000000000000000000000
--- a/main/transformer_utils/mmpose/models/backbones/vipnas_resnet.py
+++ /dev/null
@@ -1,589 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-import copy
-
-import torch.nn as nn
-import torch.utils.checkpoint as cp
-from mmcv.cnn import ConvModule, build_conv_layer, build_norm_layer
-from mmcv.cnn.bricks import ContextBlock
-from mmcv.utils.parrots_wrapper import _BatchNorm
-
-from ..builder import BACKBONES
-from .base_backbone import BaseBackbone
-
-
-class ViPNAS_Bottleneck(nn.Module):
-    """Bottleneck block for ViPNAS_ResNet.
-
-    Args:
-        in_channels (int): Input channels of this block.
-        out_channels (int): Output channels of this block.
-        expansion (int): The ratio of ``out_channels/mid_channels`` where
-            ``mid_channels`` is the input/output channels of conv2. Default: 4.
-        stride (int): stride of the block. Default: 1
-        dilation (int): dilation of convolution. Default: 1
-        downsample (nn.Module): downsample operation on identity branch.
-            Default: None.
-        style (str): ``"pytorch"`` or ``"caffe"``. If set to "pytorch", the
-            stride-two layer is the 3x3 conv layer, otherwise the stride-two
-            layer is the first 1x1 conv layer. Default: "pytorch".
-        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
-            memory while slowing down the training speed.
-        conv_cfg (dict): dictionary to construct and config conv layer.
-            Default: None
-        norm_cfg (dict): dictionary to construct and config norm layer.
-            Default: dict(type='BN')
-        kernel_size (int): kernel size of conv2 searched in ViPANS.
-        groups (int): group number of conv2 searched in ViPNAS.
-        attention (bool): whether to use attention module in the end of
-            the block.
- """ - - def __init__(self, - in_channels, - out_channels, - expansion=4, - stride=1, - dilation=1, - downsample=None, - style='pytorch', - with_cp=False, - conv_cfg=None, - norm_cfg=dict(type='BN'), - kernel_size=3, - groups=1, - attention=False): - # Protect mutable default arguments - norm_cfg = copy.deepcopy(norm_cfg) - super().__init__() - assert style in ['pytorch', 'caffe'] - - self.in_channels = in_channels - self.out_channels = out_channels - self.expansion = expansion - assert out_channels % expansion == 0 - self.mid_channels = out_channels // expansion - self.stride = stride - self.dilation = dilation - self.style = style - self.with_cp = with_cp - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg - - if self.style == 'pytorch': - self.conv1_stride = 1 - self.conv2_stride = stride - else: - self.conv1_stride = stride - self.conv2_stride = 1 - - self.norm1_name, norm1 = build_norm_layer( - norm_cfg, self.mid_channels, postfix=1) - self.norm2_name, norm2 = build_norm_layer( - norm_cfg, self.mid_channels, postfix=2) - self.norm3_name, norm3 = build_norm_layer( - norm_cfg, out_channels, postfix=3) - - self.conv1 = build_conv_layer( - conv_cfg, - in_channels, - self.mid_channels, - kernel_size=1, - stride=self.conv1_stride, - bias=False) - self.add_module(self.norm1_name, norm1) - self.conv2 = build_conv_layer( - conv_cfg, - self.mid_channels, - self.mid_channels, - kernel_size=kernel_size, - stride=self.conv2_stride, - padding=kernel_size // 2, - groups=groups, - dilation=dilation, - bias=False) - - self.add_module(self.norm2_name, norm2) - self.conv3 = build_conv_layer( - conv_cfg, - self.mid_channels, - out_channels, - kernel_size=1, - bias=False) - self.add_module(self.norm3_name, norm3) - - if attention: - self.attention = ContextBlock(out_channels, - max(1.0 / 16, 16.0 / out_channels)) - else: - self.attention = None - - self.relu = nn.ReLU(inplace=True) - self.downsample = downsample - - @property - def norm1(self): - """nn.Module: the normalization layer named "norm1" """ - return getattr(self, self.norm1_name) - - @property - def norm2(self): - """nn.Module: the normalization layer named "norm2" """ - return getattr(self, self.norm2_name) - - @property - def norm3(self): - """nn.Module: the normalization layer named "norm3" """ - return getattr(self, self.norm3_name) - - def forward(self, x): - """Forward function.""" - - def _inner_forward(x): - identity = x - - out = self.conv1(x) - out = self.norm1(out) - out = self.relu(out) - - out = self.conv2(out) - out = self.norm2(out) - out = self.relu(out) - - out = self.conv3(out) - out = self.norm3(out) - - if self.attention is not None: - out = self.attention(out) - - if self.downsample is not None: - identity = self.downsample(x) - - out += identity - - return out - - if self.with_cp and x.requires_grad: - out = cp.checkpoint(_inner_forward, x) - else: - out = _inner_forward(x) - - out = self.relu(out) - - return out - - -def get_expansion(block, expansion=None): - """Get the expansion of a residual block. - - The block expansion will be obtained by the following order: - - 1. If ``expansion`` is given, just return it. - 2. If ``block`` has the attribute ``expansion``, then return - ``block.expansion``. - 3. Return the default value according the the block type: - 4 for ``ViPNAS_Bottleneck``. - - Args: - block (class): The block class. - expansion (int | None): The given expansion ratio. - - Returns: - int: The expansion of the block. 
- """ - if isinstance(expansion, int): - assert expansion > 0 - elif expansion is None: - if hasattr(block, 'expansion'): - expansion = block.expansion - elif issubclass(block, ViPNAS_Bottleneck): - expansion = 1 - else: - raise TypeError(f'expansion is not specified for {block.__name__}') - else: - raise TypeError('expansion must be an integer or None') - - return expansion - - -class ViPNAS_ResLayer(nn.Sequential): - """ViPNAS_ResLayer to build ResNet style backbone. - - Args: - block (nn.Module): Residual block used to build ViPNAS ResLayer. - num_blocks (int): Number of blocks. - in_channels (int): Input channels of this block. - out_channels (int): Output channels of this block. - expansion (int, optional): The expansion for BasicBlock/Bottleneck. - If not specified, it will firstly be obtained via - ``block.expansion``. If the block has no attribute "expansion", - the following default values will be used: 1 for BasicBlock and - 4 for Bottleneck. Default: None. - stride (int): stride of the first block. Default: 1. - avg_down (bool): Use AvgPool instead of stride conv when - downsampling in the bottleneck. Default: False - conv_cfg (dict): dictionary to construct and config conv layer. - Default: None - norm_cfg (dict): dictionary to construct and config norm layer. - Default: dict(type='BN') - downsample_first (bool): Downsample at the first block or last block. - False for Hourglass, True for ResNet. Default: True - kernel_size (int): Kernel Size of the corresponding convolution layer - searched in the block. - groups (int): Group number of the corresponding convolution layer - searched in the block. - attention (bool): Whether to use attention module in the end of the - block. - """ - - def __init__(self, - block, - num_blocks, - in_channels, - out_channels, - expansion=None, - stride=1, - avg_down=False, - conv_cfg=None, - norm_cfg=dict(type='BN'), - downsample_first=True, - kernel_size=3, - groups=1, - attention=False, - **kwargs): - # Protect mutable default arguments - norm_cfg = copy.deepcopy(norm_cfg) - self.block = block - self.expansion = get_expansion(block, expansion) - - downsample = None - if stride != 1 or in_channels != out_channels: - downsample = [] - conv_stride = stride - if avg_down and stride != 1: - conv_stride = 1 - downsample.append( - nn.AvgPool2d( - kernel_size=stride, - stride=stride, - ceil_mode=True, - count_include_pad=False)) - downsample.extend([ - build_conv_layer( - conv_cfg, - in_channels, - out_channels, - kernel_size=1, - stride=conv_stride, - bias=False), - build_norm_layer(norm_cfg, out_channels)[1] - ]) - downsample = nn.Sequential(*downsample) - - layers = [] - if downsample_first: - layers.append( - block( - in_channels=in_channels, - out_channels=out_channels, - expansion=self.expansion, - stride=stride, - downsample=downsample, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - kernel_size=kernel_size, - groups=groups, - attention=attention, - **kwargs)) - in_channels = out_channels - for _ in range(1, num_blocks): - layers.append( - block( - in_channels=in_channels, - out_channels=out_channels, - expansion=self.expansion, - stride=1, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - kernel_size=kernel_size, - groups=groups, - attention=attention, - **kwargs)) - else: # downsample_first=False is for HourglassModule - for i in range(0, num_blocks - 1): - layers.append( - block( - in_channels=in_channels, - out_channels=in_channels, - expansion=self.expansion, - stride=1, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - kernel_size=kernel_size, - 
-                        groups=groups,
-                        attention=attention,
-                        **kwargs))
-            layers.append(
-                block(
-                    in_channels=in_channels,
-                    out_channels=out_channels,
-                    expansion=self.expansion,
-                    stride=stride,
-                    downsample=downsample,
-                    conv_cfg=conv_cfg,
-                    norm_cfg=norm_cfg,
-                    kernel_size=kernel_size,
-                    groups=groups,
-                    attention=attention,
-                    **kwargs))
-
-        super().__init__(*layers)
-
-
-@BACKBONES.register_module()
-class ViPNAS_ResNet(BaseBackbone):
-    """ViPNAS_ResNet backbone.
-
-    "ViPNAS: Efficient Video Pose Estimation via Neural Architecture Search"
-    More details can be found in the `paper
-    `__ .
-
-    Args:
-        depth (int): Network depth, from {18, 34, 50, 101, 152}.
-        in_channels (int): Number of input image channels. Default: 3.
-        num_stages (int): Stages of the network. Default: 4.
-        strides (Sequence[int]): Strides of the first block of each stage.
-            Default: ``(1, 2, 2, 2)``.
-        dilations (Sequence[int]): Dilation of each stage.
-            Default: ``(1, 1, 1, 1)``.
-        out_indices (Sequence[int]): Output from which stages. If only one
-            stage is specified, a single tensor (feature map) is returned,
-            otherwise multiple stages are specified, a tuple of tensors will
-            be returned. Default: ``(3, )``.
-        style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two
-            layer is the 3x3 conv layer, otherwise the stride-two layer is
-            the first 1x1 conv layer.
-        deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv.
-            Default: False.
-        avg_down (bool): Use AvgPool instead of stride conv when
-            downsampling in the bottleneck. Default: False.
-        frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
-            -1 means not freezing any parameters. Default: -1.
-        conv_cfg (dict | None): The config dict for conv layers. Default: None.
-        norm_cfg (dict): The config dict for norm layers.
-        norm_eval (bool): Whether to set norm layers to eval mode, namely,
-            freeze running stats (mean and var). Note: Effect on Batch Norm
-            and its variants only. Default: False.
-        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
-            memory while slowing down the training speed. Default: False.
-        zero_init_residual (bool): Whether to use zero init for last norm layer
-            in resblocks to let them behave as identity. Default: True.
-        wid (list(int)): Searched width config for each stage.
-        expan (list(int)): Searched expansion ratio config for each stage.
-        dep (list(int)): Searched depth config for each stage.
-        ks (list(int)): Searched kernel size config for each stage.
-        group (list(int)): Searched group number config for each stage.
-        att (list(bool)): Searched attention config for each stage.
- """ - - arch_settings = { - 50: ViPNAS_Bottleneck, - } - - def __init__(self, - depth, - in_channels=3, - num_stages=4, - strides=(1, 2, 2, 2), - dilations=(1, 1, 1, 1), - out_indices=(3, ), - style='pytorch', - deep_stem=False, - avg_down=False, - frozen_stages=-1, - conv_cfg=None, - norm_cfg=dict(type='BN', requires_grad=True), - norm_eval=False, - with_cp=False, - zero_init_residual=True, - wid=[48, 80, 160, 304, 608], - expan=[None, 1, 1, 1, 1], - dep=[None, 4, 6, 7, 3], - ks=[7, 3, 5, 5, 5], - group=[None, 16, 16, 16, 16], - att=[None, True, False, True, True]): - # Protect mutable default arguments - norm_cfg = copy.deepcopy(norm_cfg) - super().__init__() - if depth not in self.arch_settings: - raise KeyError(f'invalid depth {depth} for resnet') - self.depth = depth - self.stem_channels = dep[0] - self.num_stages = num_stages - assert 1 <= num_stages <= 4 - self.strides = strides - self.dilations = dilations - assert len(strides) == len(dilations) == num_stages - self.out_indices = out_indices - assert max(out_indices) < num_stages - self.style = style - self.deep_stem = deep_stem - self.avg_down = avg_down - self.frozen_stages = frozen_stages - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg - self.with_cp = with_cp - self.norm_eval = norm_eval - self.zero_init_residual = zero_init_residual - self.block = self.arch_settings[depth] - self.stage_blocks = dep[1:1 + num_stages] - - self._make_stem_layer(in_channels, wid[0], ks[0]) - - self.res_layers = [] - _in_channels = wid[0] - for i, num_blocks in enumerate(self.stage_blocks): - expansion = get_expansion(self.block, expan[i + 1]) - _out_channels = wid[i + 1] * expansion - stride = strides[i] - dilation = dilations[i] - res_layer = self.make_res_layer( - block=self.block, - num_blocks=num_blocks, - in_channels=_in_channels, - out_channels=_out_channels, - expansion=expansion, - stride=stride, - dilation=dilation, - style=self.style, - avg_down=self.avg_down, - with_cp=with_cp, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - kernel_size=ks[i + 1], - groups=group[i + 1], - attention=att[i + 1]) - _in_channels = _out_channels - layer_name = f'layer{i + 1}' - self.add_module(layer_name, res_layer) - self.res_layers.append(layer_name) - - self._freeze_stages() - - self.feat_dim = res_layer[-1].out_channels - - def make_res_layer(self, **kwargs): - """Make a ViPNAS ResLayer.""" - return ViPNAS_ResLayer(**kwargs) - - @property - def norm1(self): - """nn.Module: the normalization layer named "norm1" """ - return getattr(self, self.norm1_name) - - def _make_stem_layer(self, in_channels, stem_channels, kernel_size): - """Make stem layer.""" - if self.deep_stem: - self.stem = nn.Sequential( - ConvModule( - in_channels, - stem_channels // 2, - kernel_size=3, - stride=2, - padding=1, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - inplace=True), - ConvModule( - stem_channels // 2, - stem_channels // 2, - kernel_size=3, - stride=1, - padding=1, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - inplace=True), - ConvModule( - stem_channels // 2, - stem_channels, - kernel_size=3, - stride=1, - padding=1, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - inplace=True)) - else: - self.conv1 = build_conv_layer( - self.conv_cfg, - in_channels, - stem_channels, - kernel_size=kernel_size, - stride=2, - padding=kernel_size // 2, - bias=False) - self.norm1_name, norm1 = build_norm_layer( - self.norm_cfg, stem_channels, postfix=1) - self.add_module(self.norm1_name, norm1) - self.relu = nn.ReLU(inplace=True) - self.maxpool = 
-        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
-
-    def _freeze_stages(self):
-        """Freeze parameters."""
-        if self.frozen_stages >= 0:
-            if self.deep_stem:
-                self.stem.eval()
-                for param in self.stem.parameters():
-                    param.requires_grad = False
-            else:
-                self.norm1.eval()
-                for m in [self.conv1, self.norm1]:
-                    for param in m.parameters():
-                        param.requires_grad = False
-
-        for i in range(1, self.frozen_stages + 1):
-            m = getattr(self, f'layer{i}')
-            m.eval()
-            for param in m.parameters():
-                param.requires_grad = False
-
-    def init_weights(self, pretrained=None):
-        """Initialize model weights."""
-        super().init_weights(pretrained)
-        if pretrained is None:
-            for m in self.modules():
-                if isinstance(m, nn.Conv2d):
-                    nn.init.normal_(m.weight, std=0.001)
-                    for name, _ in m.named_parameters():
-                        if name in ['bias']:
-                            nn.init.constant_(m.bias, 0)
-                elif isinstance(m, nn.BatchNorm2d):
-                    nn.init.constant_(m.weight, 1)
-                    nn.init.constant_(m.bias, 0)
-
-    def forward(self, x):
-        """Forward function."""
-        if self.deep_stem:
-            x = self.stem(x)
-        else:
-            x = self.conv1(x)
-            x = self.norm1(x)
-            x = self.relu(x)
-        x = self.maxpool(x)
-        outs = []
-        for i, layer_name in enumerate(self.res_layers):
-            res_layer = getattr(self, layer_name)
-            x = res_layer(x)
-            if i in self.out_indices:
-                outs.append(x)
-        if len(outs) == 1:
-            return outs[0]
-        return tuple(outs)
-
-    def train(self, mode=True):
-        """Convert the model into training mode."""
-        super().train(mode)
-        self._freeze_stages()
-        if mode and self.norm_eval:
-            for m in self.modules():
-                # trick: eval have effect on BatchNorm only
-                if isinstance(m, _BatchNorm):
-                    m.eval()
diff --git a/main/transformer_utils/mmpose/models/builder.py b/main/transformer_utils/mmpose/models/builder.py
index 47f0a53121633fb6185a4d514c05a5862a9d74cf..5fa61ca08374a3e0af8d937fdc197eb51207881c 100644
--- a/main/transformer_utils/mmpose/models/builder.py
+++ b/main/transformer_utils/mmpose/models/builder.py
@@ -1,10 +1,9 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from mmcv.cnn import MODELS as MMCV_MODELS
-from mmcv.cnn import build_model_from_cfg
-from mmcv.utils import Registry, build_from_cfg
+from mmengine.registry import MODELS as MMCV_MODELS
+from mmengine import Registry
+from mmengine.registry import build_from_cfg, build_model_from_cfg
 
-MODELS = Registry(
-    'models', build_func=build_model_from_cfg, parent=MMCV_MODELS)
+MODELS = Registry('models', parent=MMCV_MODELS, locations=['mmpose.models'])
 
 BACKBONES = MODELS
 NECKS = MODELS
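For reference, the `locations` argument used in the rewritten builder.py is how mmengine registries lazily import the modules that register their components. A minimal self-contained sketch of the parent/child pattern; `TinyBackbone` and `my_pkg.backbones` are hypothetical names for illustration, not part of this repository:

from mmengine.registry import Registry

# A child registry inherits its parent's entries and, on the first build,
# imports the packages listed in `locations` so their register_module()
# decorators run.
MODELS = Registry('models')
BACKBONES = Registry('backbone', parent=MODELS, locations=['my_pkg.backbones'])

@BACKBONES.register_module()
class TinyBackbone:
    def __init__(self, depth=18):
        self.depth = depth

# Configs are plain dicts; `type` selects the registered class by name.
backbone = BACKBONES.build(dict(type='TinyBackbone', depth=50))
assert backbone.depth == 50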
diff --git a/main/transformer_utils/mmpose/models/detectors/poseur.py b/main/transformer_utils/mmpose/models/detectors/poseur.py
index b5c98ea95af4ee114e2dc731bf1b3e83489b8563..455803bbd78dca98ba80814d19dbbdb4ccfda8ab 100644
--- a/main/transformer_utils/mmpose/models/detectors/poseur.py
+++ b/main/transformer_utils/mmpose/models/detectors/poseur.py
@@ -12,12 +12,7 @@ from .base import BasePose
 import torch
 from config import cfg
 
-try:
-    from mmcv.runner import auto_fp16
-except ImportError:
-    warnings.warn('auto_fp16 from mmpose will be deprecated from v0.15.0'
-                  'Please install mmcv>=1.1.4')
-    from mmpose.core import auto_fp16
+from mmpose.core import auto_fp16
 
 from .top_down import TopDown
diff --git a/main/transformer_utils/mmpose/models/detectors/top_down.py b/main/transformer_utils/mmpose/models/detectors/top_down.py
index 99215ec70b2381fbc01be6e448e30a09f83cda2b..20bf504e9bfa20a7470203033b2265e067ad33e6 100644
--- a/main/transformer_utils/mmpose/models/detectors/top_down.py
+++ b/main/transformer_utils/mmpose/models/detectors/top_down.py
@@ -4,7 +4,7 @@ import warnings
 import mmcv
 import numpy as np
 from mmcv.image import imwrite
-from mmcv.utils.misc import deprecated_api_warning
+from mmengine.utils import deprecated_api_warning
 from mmcv.visualization.image import imshow
 
 from mmpose.core import imshow_bboxes, imshow_keypoints
@@ -12,12 +12,7 @@ from .. import builder
 from ..builder import POSENETS
 from .base import BasePose
 
-try:
-    from mmcv.runner import auto_fp16
-except ImportError:
-    warnings.warn('auto_fp16 from mmpose will be deprecated from v0.15.0'
-                  'Please install mmcv>=1.1.4')
-    from mmpose.core import auto_fp16
+from mmpose.core import auto_fp16
 
 
 @POSENETS.register_module()
diff --git a/main/transformer_utils/mmpose/models/heads/poseur_head.py b/main/transformer_utils/mmpose/models/heads/poseur_head.py
index d01232247db1d687144d8fff2a3b226dd66fdcf5..db4d8a62acb9991b908d894e515befd0e7f414e8 100644
--- a/main/transformer_utils/mmpose/models/heads/poseur_head.py
+++ b/main/transformer_utils/mmpose/models/heads/poseur_head.py
@@ -4,9 +4,9 @@ import torch.nn as nn
 import copy
 import math
 import warnings
-from mmcv.cnn import build_upsample_layer, Linear, bias_init_with_prob, constant_init, normal_init
+from mmengine.model import constant_init, normal_init, bias_init_with_prob
+from mmcv.cnn import build_upsample_layer, Linear
 import torch.nn.functional as F
-from mmcv.cnn import normal_init
 
 from mmpose.core.evaluation import (keypoint_pck_accuracy,
                                     keypoints_from_regression)
diff --git a/main/transformer_utils/mmpose/models/heads/rle_regression_head.py b/main/transformer_utils/mmpose/models/heads/rle_regression_head.py
index b96a19155f6ec13f86e069d75d15ea4b70f133fa..20f702a573b5fa777c58755e1cc8270885a8bebc 100644
--- a/main/transformer_utils/mmpose/models/heads/rle_regression_head.py
+++ b/main/transformer_utils/mmpose/models/heads/rle_regression_head.py
@@ -1,7 +1,6 @@
 import numpy as np
 import torch.nn as nn
-from mmcv.cnn import normal_init
-
+from mmengine.model import normal_init
 from mmpose.core.evaluation import (keypoint_pck_accuracy,
                                     keypoints_from_regression)
 from mmpose.core.post_processing import fliplr_regression
diff --git a/main/transformer_utils/mmpose/models/heads/topdown_heatmap_multi_stage_head.py b/main/transformer_utils/mmpose/models/heads/topdown_heatmap_multi_stage_head.py
index c439f5b6332d72a66db75bf599035411c4e1e0d1..ac7b42a078a210053150bc353e7c9426285d9599 100644
--- a/main/transformer_utils/mmpose/models/heads/topdown_heatmap_multi_stage_head.py
+++ b/main/transformer_utils/mmpose/models/heads/topdown_heatmap_multi_stage_head.py
@@ -2,10 +2,10 @@ import copy as cp
 
 import torch.nn as nn
+from mmengine.model import constant_init, normal_init, kaiming_init
 from mmcv.cnn import (ConvModule, DepthwiseSeparableConvModule, Linear,
                       build_activation_layer, build_conv_layer,
-                      build_norm_layer, build_upsample_layer, constant_init,
-                      kaiming_init, normal_init)
+                      build_norm_layer, build_upsample_layer)
 
 from mmpose.core.evaluation import pose_pck_accuracy
 from mmpose.core.post_processing import flip_back
diff --git a/main/transformer_utils/mmpose/models/heads/topdown_heatmap_simple_head.py b/main/transformer_utils/mmpose/models/heads/topdown_heatmap_simple_head.py
index 5ddc058d5634a5c63970a1efb8eaa66b158da1ec..1fe95548d7ff2c20516f22e14b24a51da24d7654 100644
--- a/main/transformer_utils/mmpose/models/heads/topdown_heatmap_simple_head.py
+++ b/main/transformer_utils/mmpose/models/heads/topdown_heatmap_simple_head.py
@@ -1,8 +1,8 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import torch
 import torch.nn as nn
-from mmcv.cnn import (build_conv_layer, build_norm_layer, build_upsample_layer,
-                      constant_init, normal_init)
+from mmengine.model import constant_init, normal_init
+from mmcv.cnn import (build_conv_layer, build_norm_layer, build_upsample_layer)
 
 from mmpose.core.evaluation import pose_pck_accuracy
 from mmpose.core.post_processing import flip_back
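The head files above swap mmcv.cnn's weight-init helpers for their mmengine.model equivalents; the call signatures are unchanged. A small standalone sketch of the helpers in use (not taken from this repo):

import torch.nn as nn
from mmengine.model import bias_init_with_prob, constant_init, normal_init

conv = nn.Conv2d(3, 17, 3)
normal_init(conv, mean=0, std=0.001)      # weight ~ N(0, 0.001), bias = 0
bn = nn.BatchNorm2d(17)
constant_init(bn, 1)                      # weight = 1, bias = 0
prior = bias_init_with_prob(0.01)         # focal-loss-style bias prior
normal_init(conv, std=0.01, bias=prior)   # re-init with the prior bias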
diff --git a/main/transformer_utils/mmpose/models/necks/__init__.py b/main/transformer_utils/mmpose/models/necks/__init__.py
deleted file mode 100644
index 0593f61c01fa9968260b939f7ccd50311c058595..0000000000000000000000000000000000000000
--- a/main/transformer_utils/mmpose/models/necks/__init__.py
+++ /dev/null
@@ -1,8 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-from .fpn import FPN
-from .gap_neck import GlobalAveragePooling
-from .posewarper_neck import PoseWarperNeck
-from .tcformer_mta_neck import MTA
-from .channel_mapper import ChannelMapper
-
-__all__ = ['GlobalAveragePooling', 'PoseWarperNeck', 'FPN', 'MTA']
diff --git a/main/transformer_utils/mmpose/models/necks/channel_mapper.py b/main/transformer_utils/mmpose/models/necks/channel_mapper.py
deleted file mode 100644
index 113d170e9d55b9e2d3984c6838a86e4c659fa75c..0000000000000000000000000000000000000000
--- a/main/transformer_utils/mmpose/models/necks/channel_mapper.py
+++ /dev/null
@@ -1,76 +0,0 @@
-import torch.nn as nn
-from mmcv.cnn import ConvModule, xavier_init
-
-from ..builder import NECKS
-
-
-@NECKS.register_module()
-class ChannelMapper(nn.Module):
-    r"""Channel Mapper to reduce/increase channels of backbone features.
-
-    This is used to reduce/increase channels of backbone features.
-
-    Args:
-        in_channels (List[int]): Number of input channels per scale.
-        out_channels (int): Number of output channels (used at each scale).
-        kernel_size (int, optional): kernel_size for reducing channels (used
-            at each scale). Default: 3.
-        conv_cfg (dict, optional): Config dict for convolution layer.
-            Default: None.
-        norm_cfg (dict, optional): Config dict for normalization layer.
-            Default: None.
-        act_cfg (dict, optional): Config dict for activation layer in
-            ConvModule. Default: dict(type='ReLU').
-
-    Example:
-        >>> import torch
-        >>> in_channels = [2, 3, 5, 7]
-        >>> scales = [340, 170, 84, 43]
-        >>> inputs = [torch.rand(1, c, s, s)
-        ...           for c, s in zip(in_channels, scales)]
-        >>> self = ChannelMapper(in_channels, 11, 3).eval()
-        >>> outputs = self.forward(inputs)
-        >>> for i in range(len(outputs)):
-        ...     print(f'outputs[{i}].shape = {outputs[i].shape}')
-        outputs[0].shape = torch.Size([1, 11, 340, 340])
-        outputs[1].shape = torch.Size([1, 11, 170, 170])
-        outputs[2].shape = torch.Size([1, 11, 84, 84])
-        outputs[3].shape = torch.Size([1, 11, 43, 43])
-    """
-
-    def __init__(self,
-                 in_channels,
-                 out_channels,
-                 kernel_size=3,
-                 conv_cfg=None,
-                 norm_cfg=None,
-                 act_cfg=dict(type='ReLU')):
-        super(ChannelMapper, self).__init__()
-        assert isinstance(in_channels, list)
-
-        self.convs = nn.ModuleList()
-        for in_channel in in_channels:
-            self.convs.append(
-                ConvModule(
-                    in_channel,
-                    out_channels,
-                    kernel_size,
-                    padding=(kernel_size - 1) // 2,
-                    conv_cfg=conv_cfg,
-                    norm_cfg=norm_cfg,
-                    act_cfg=act_cfg))
-
-    # default init_weights for conv(msra) and norm in ConvModule
-    def init_weights(self):
-        """Initialize the weights of ChannelMapper module."""
-        for m in self.modules():
-            if isinstance(m, nn.Conv2d):
-                xavier_init(m, distribution='uniform')
-
-    def forward(self, inputs):
-        """Forward function."""
-
-
-        assert len(inputs) == len(self.convs)
-        outs = [self.convs[i](inputs[i]) for i in range(len(inputs))]
-        return tuple(outs)
diff --git a/main/transformer_utils/mmpose/models/necks/fpn.py b/main/transformer_utils/mmpose/models/necks/fpn.py
deleted file mode 100644
index 795a8af0b6904153a9b4e1a41d7b803381874162..0000000000000000000000000000000000000000
--- a/main/transformer_utils/mmpose/models/necks/fpn.py
+++ /dev/null
@@ -1,207 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-import torch.nn as nn
-import torch.nn.functional as F
-from mmcv.cnn import ConvModule, xavier_init
-from mmcv.runner import auto_fp16
-
-from ..builder import NECKS
-
-
-@NECKS.register_module()
-class FPN(nn.Module):
-    r"""Feature Pyramid Network.
-
-    This is an implementation of paper `Feature Pyramid Networks for Object
-    Detection `_.
-
-    Args:
-        in_channels (list[int]): Number of input channels per scale.
-        out_channels (int): Number of output channels (used at each scale).
-        num_outs (int): Number of output scales.
-        start_level (int): Index of the start input backbone level used to
-            build the feature pyramid. Default: 0.
-        end_level (int): Index of the end input backbone level (exclusive) to
-            build the feature pyramid. Default: -1, which means the last level.
-        add_extra_convs (bool | str): If bool, it decides whether to add conv
-            layers on top of the original feature maps. Default to False.
-            If True, it is equivalent to `add_extra_convs='on_input'`.
-            If str, it specifies the source feature map of the extra convs.
-            Only the following options are allowed
-
-            - 'on_input': Last feat map of neck inputs (i.e. backbone feature).
-            - 'on_lateral': Last feature map after lateral convs.
-            - 'on_output': The last output feature map after fpn convs.
-        relu_before_extra_convs (bool): Whether to apply relu before the extra
-            conv. Default: False.
-        no_norm_on_lateral (bool): Whether to apply norm on lateral.
-            Default: False.
-        conv_cfg (dict): Config dict for convolution layer. Default: None.
-        norm_cfg (dict): Config dict for normalization layer. Default: None.
-        act_cfg (dict): Config dict for activation layer in ConvModule.
-            Default: None.
-        upsample_cfg (dict): Config dict for interpolate layer.
-            Default: dict(mode='nearest').
-
-    Example:
-        >>> import torch
-        >>> in_channels = [2, 3, 5, 7]
-        >>> scales = [340, 170, 84, 43]
-        >>> inputs = [torch.rand(1, c, s, s)
-        ...           for c, s in zip(in_channels, scales)]
-        >>> self = FPN(in_channels, 11, len(in_channels)).eval()
-        >>> outputs = self.forward(inputs)
-        >>> for i in range(len(outputs)):
-        ...     print(f'outputs[{i}].shape = {outputs[i].shape}')
-        outputs[0].shape = torch.Size([1, 11, 340, 340])
-        outputs[1].shape = torch.Size([1, 11, 170, 170])
-        outputs[2].shape = torch.Size([1, 11, 84, 84])
-        outputs[3].shape = torch.Size([1, 11, 43, 43])
-    """
-
-    def __init__(self,
-                 in_channels,
-                 out_channels,
-                 num_outs,
-                 start_level=0,
-                 end_level=-1,
-                 add_extra_convs=False,
-                 relu_before_extra_convs=False,
-                 no_norm_on_lateral=False,
-                 conv_cfg=None,
-                 norm_cfg=None,
-                 act_cfg=None,
-                 upsample_cfg=dict(mode='nearest')):
-        super().__init__()
-        assert isinstance(in_channels, list)
-        self.in_channels = in_channels
-        self.out_channels = out_channels
-        self.num_ins = len(in_channels)
-        self.num_outs = num_outs
-        self.relu_before_extra_convs = relu_before_extra_convs
-        self.no_norm_on_lateral = no_norm_on_lateral
-        self.fp16_enabled = False
-        self.upsample_cfg = upsample_cfg.copy()
-
-        if end_level == -1 or end_level == self.num_ins - 1:
-            self.backbone_end_level = self.num_ins
-            assert num_outs >= self.num_ins - start_level
-        else:
-            # if end_level is not the last level, no extra level is allowed
-            self.backbone_end_level = end_level + 1
-            assert end_level < self.num_ins
-            assert num_outs == end_level - start_level + 1
-        self.start_level = start_level
-        self.end_level = end_level
-        self.add_extra_convs = add_extra_convs
-        assert isinstance(add_extra_convs, (str, bool))
-        if isinstance(add_extra_convs, str):
-            # Extra_convs_source choices: 'on_input', 'on_lateral', 'on_output'
-            assert add_extra_convs in ('on_input', 'on_lateral', 'on_output')
-        elif add_extra_convs:  # True
-            self.add_extra_convs = 'on_input'
-
-        self.lateral_convs = nn.ModuleList()
-        self.fpn_convs = nn.ModuleList()
-
-        for i in range(self.start_level, self.backbone_end_level):
-            l_conv = ConvModule(
-                in_channels[i],
-                out_channels,
-                1,
-                conv_cfg=conv_cfg,
-                norm_cfg=norm_cfg if not self.no_norm_on_lateral else None,
-                act_cfg=act_cfg,
-                inplace=False)
-            fpn_conv = ConvModule(
-                out_channels,
-                out_channels,
-                3,
-                padding=1,
-                conv_cfg=conv_cfg,
-                norm_cfg=norm_cfg,
-                act_cfg=act_cfg,
-                inplace=False)
-
-            self.lateral_convs.append(l_conv)
-            self.fpn_convs.append(fpn_conv)
-
-        # add extra conv layers (e.g., RetinaNet)
-        extra_levels = num_outs - self.backbone_end_level + self.start_level
-        if self.add_extra_convs and extra_levels >= 1:
-            for i in range(extra_levels):
-                if i == 0 and self.add_extra_convs == 'on_input':
-                    in_channels = self.in_channels[self.backbone_end_level - 1]
-                else:
-                    in_channels = out_channels
-                extra_fpn_conv = ConvModule(
-                    in_channels,
-                    out_channels,
-                    3,
-                    stride=2,
-                    padding=1,
-                    conv_cfg=conv_cfg,
-                    norm_cfg=norm_cfg,
-                    act_cfg=act_cfg,
-                    inplace=False)
-                self.fpn_convs.append(extra_fpn_conv)
-
-    def init_weights(self):
-        """Initialize model weights."""
-        for m in self.modules():
-            if isinstance(m, nn.Conv2d):
-                xavier_init(m, distribution='uniform')
-
-    @auto_fp16()
-    def forward(self, inputs):
-        """Forward function."""
-        assert len(inputs) == len(self.in_channels)
-
-        # build laterals
-        laterals = [
-            lateral_conv(inputs[i + self.start_level])
-            for i, lateral_conv in enumerate(self.lateral_convs)
-        ]
-
-        # build top-down path
-        used_backbone_levels = len(laterals)
-        for i in range(used_backbone_levels - 1, 0, -1):
-            # In some cases, fixing `scale factor` (e.g. 2) is preferred, but
-            # it cannot co-exist with `size` in `F.interpolate`.
-            if 'scale_factor' in self.upsample_cfg:
-                # fix runtime error of "+=" inplace operation in PyTorch 1.10
-                laterals[i - 1] = laterals[i - 1] + F.interpolate(
-                    laterals[i], **self.upsample_cfg)
-            else:
-                prev_shape = laterals[i - 1].shape[2:]
-                laterals[i - 1] = laterals[i - 1] + F.interpolate(
-                    laterals[i], size=prev_shape, **self.upsample_cfg)
-
-        # build outputs
-        # part 1: from original levels
-        outs = [
-            self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels)
-        ]
-        # part 2: add extra levels
-        if self.num_outs > len(outs):
-            # use max pool to get more levels on top of outputs
-            # (e.g., Faster R-CNN, Mask R-CNN)
-            if not self.add_extra_convs:
-                for i in range(self.num_outs - used_backbone_levels):
-                    outs.append(F.max_pool2d(outs[-1], 1, stride=2))
-            # add conv layers on top of original feature maps (RetinaNet)
-            else:
-                if self.add_extra_convs == 'on_input':
-                    extra_source = inputs[self.backbone_end_level - 1]
-                elif self.add_extra_convs == 'on_lateral':
-                    extra_source = laterals[-1]
-                elif self.add_extra_convs == 'on_output':
-                    extra_source = outs[-1]
-                else:
-                    raise NotImplementedError
-                outs.append(self.fpn_convs[used_backbone_levels](extra_source))
-                for i in range(used_backbone_levels + 1, self.num_outs):
-                    if self.relu_before_extra_convs:
-                        outs.append(self.fpn_convs[i](F.relu(outs[-1])))
-                    else:
-                        outs.append(self.fpn_convs[i](outs[-1]))
-        return outs
diff --git a/main/transformer_utils/mmpose/models/necks/gap_neck.py b/main/transformer_utils/mmpose/models/necks/gap_neck.py
deleted file mode 100644
index 5e6ad68ec11110daaad3a66e09d67efb355c4b93..0000000000000000000000000000000000000000
--- a/main/transformer_utils/mmpose/models/necks/gap_neck.py
+++ /dev/null
@@ -1,37 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-import torch
-import torch.nn as nn
-
-from ..builder import NECKS
-
-
-@NECKS.register_module()
-class GlobalAveragePooling(nn.Module):
-    """Global Average Pooling neck.
-
-    Note that we use `view` to remove extra channel after pooling. We do not
-    use `squeeze` as it will also remove the batch dimension when the tensor
-    has a batch dimension of size 1, which can lead to unexpected errors.
-    """
-
-    def __init__(self):
-        super().__init__()
-        self.gap = nn.AdaptiveAvgPool2d((1, 1))
-
-    def init_weights(self):
-        pass
-
-    def forward(self, inputs):
-        if isinstance(inputs, tuple):
-            outs = tuple([self.gap(x) for x in inputs])
-            outs = tuple(
-                [out.view(x.size(0), -1) for out, x in zip(outs, inputs)])
-        elif isinstance(inputs, list):
-            outs = [self.gap(x) for x in inputs]
-            outs = [out.view(x.size(0), -1) for out, x in zip(outs, inputs)]
-        elif isinstance(inputs, torch.Tensor):
-            outs = self.gap(inputs)
-            outs = outs.view(inputs.size(0), -1)
-        else:
-            raise TypeError('neck inputs should be tuple or torch.tensor')
-        return outs
diff --git a/main/transformer_utils/mmpose/models/necks/posewarper_neck.py b/main/transformer_utils/mmpose/models/necks/posewarper_neck.py
deleted file mode 100644
index dd4ddfbf8984857a6110f19b0a7d703b53f1c433..0000000000000000000000000000000000000000
--- a/main/transformer_utils/mmpose/models/necks/posewarper_neck.py
+++ /dev/null
@@ -1,329 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-import mmcv
-import torch
-import torch.nn as nn
-from mmcv.cnn import (build_conv_layer, build_norm_layer, constant_init,
-                      normal_init)
-from mmcv.utils import digit_version
-from torch.nn.modules.batchnorm import _BatchNorm
-
-from mmpose.models.utils.ops import resize
-from ..backbones.resnet import BasicBlock, Bottleneck
-from ..builder import NECKS
-
-try:
-    from mmcv.ops import DeformConv2d
-    has_mmcv_full = True
-except (ImportError, ModuleNotFoundError):
-    has_mmcv_full = False
-
-
-@NECKS.register_module()
-class PoseWarperNeck(nn.Module):
-    """PoseWarper neck.
-
-    `"Learning temporal pose estimation from sparsely-labeled videos"
-    `_.
-
-    Args:
-        in_channels (int): Number of input channels from backbone
-        out_channels (int): Number of output channels
-        inner_channels (int): Number of intermediate channels of the res block
-        deform_groups (int): Number of groups in the deformable conv
-        dilations (list|tuple): different dilations of the offset conv layers
-        trans_conv_kernel (int): the kernel of the trans conv layer, which is
-            used to get heatmap from the output of backbone. Default: 1
-        res_blocks_cfg (dict|None): config of residual blocks. If None,
-            use the default values. If not None, it should contain the
-            following keys:
-
-            - block (str): the type of residual block, Default: 'BASIC'.
-            - num_blocks (int): the number of blocks, Default: 20.
-
-        offsets_kernel (int): the kernel of offset conv layer.
-        deform_conv_kernel (int): the kernel of defomrable conv layer.
-        in_index (int|Sequence[int]): Input feature index. Default: 0
-        input_transform (str|None): Transformation type of input features.
-            Options: 'resize_concat', 'multiple_select', None.
-            Default: None.
-
-            - 'resize_concat': Multiple feature maps will be resize to \
-                the same size as first one and than concat together. \
-                Usually used in FCN head of HRNet.
-            - 'multiple_select': Multiple feature maps will be bundle into \
-                a list and passed into decode head.
-            - None: Only one select feature map is allowed.
-
-        freeze_trans_layer (bool): Whether to freeze the transition layer
-            (stop grad and set eval mode). Default: True.
-        norm_eval (bool): Whether to set norm layers to eval mode, namely,
-            freeze running stats (mean and var). Note: Effect on Batch Norm
-            and its variants only. Default: False.
-        im2col_step (int): the argument `im2col_step` in deformable conv,
-            Default: 80.
- """ - blocks_dict = {'BASIC': BasicBlock, 'BOTTLENECK': Bottleneck} - minimum_mmcv_version = '1.3.17' - - def __init__(self, - in_channels, - out_channels, - inner_channels, - deform_groups=17, - dilations=(3, 6, 12, 18, 24), - trans_conv_kernel=1, - res_blocks_cfg=None, - offsets_kernel=3, - deform_conv_kernel=3, - in_index=0, - input_transform=None, - freeze_trans_layer=True, - norm_eval=False, - im2col_step=80): - super().__init__() - self.in_channels = in_channels - self.out_channels = out_channels - self.inner_channels = inner_channels - self.deform_groups = deform_groups - self.dilations = dilations - self.trans_conv_kernel = trans_conv_kernel - self.res_blocks_cfg = res_blocks_cfg - self.offsets_kernel = offsets_kernel - self.deform_conv_kernel = deform_conv_kernel - self.in_index = in_index - self.input_transform = input_transform - self.freeze_trans_layer = freeze_trans_layer - self.norm_eval = norm_eval - self.im2col_step = im2col_step - - identity_trans_layer = False - - assert trans_conv_kernel in [0, 1, 3] - kernel_size = trans_conv_kernel - if kernel_size == 3: - padding = 1 - elif kernel_size == 1: - padding = 0 - else: - # 0 for Identity mapping. - identity_trans_layer = True - - if identity_trans_layer: - self.trans_layer = nn.Identity() - else: - self.trans_layer = build_conv_layer( - cfg=dict(type='Conv2d'), - in_channels=in_channels, - out_channels=out_channels, - kernel_size=kernel_size, - stride=1, - padding=padding) - - # build chain of residual blocks - if res_blocks_cfg is not None and not isinstance(res_blocks_cfg, dict): - raise TypeError('res_blocks_cfg should be dict or None.') - - if res_blocks_cfg is None: - block_type = 'BASIC' - num_blocks = 20 - else: - block_type = res_blocks_cfg.get('block', 'BASIC') - num_blocks = res_blocks_cfg.get('num_blocks', 20) - - block = self.blocks_dict[block_type] - - res_layers = [] - downsample = nn.Sequential( - build_conv_layer( - cfg=dict(type='Conv2d'), - in_channels=out_channels, - out_channels=inner_channels, - kernel_size=1, - stride=1, - bias=False), - build_norm_layer(dict(type='BN'), inner_channels)[1]) - res_layers.append( - block( - in_channels=out_channels, - out_channels=inner_channels, - downsample=downsample)) - - for _ in range(1, num_blocks): - res_layers.append(block(inner_channels, inner_channels)) - self.offset_feats = nn.Sequential(*res_layers) - - # build offset layers - self.num_offset_layers = len(dilations) - assert self.num_offset_layers > 0, 'Number of offset layers ' \ - 'should be larger than 0.' - - target_offset_channels = 2 * offsets_kernel**2 * deform_groups - - offset_layers = [ - build_conv_layer( - cfg=dict(type='Conv2d'), - in_channels=inner_channels, - out_channels=target_offset_channels, - kernel_size=offsets_kernel, - stride=1, - dilation=dilations[i], - padding=dilations[i], - bias=False, - ) for i in range(self.num_offset_layers) - ] - self.offset_layers = nn.ModuleList(offset_layers) - - # build deformable conv layers - assert digit_version(mmcv.__version__) >= \ - digit_version(self.minimum_mmcv_version), \ - f'Current MMCV version: {mmcv.__version__}, ' \ - f'but MMCV >= {self.minimum_mmcv_version} is required, see ' \ - f'https://github.com/open-mmlab/mmcv/issues/1440, ' \ - f'Please install the latest MMCV.' 
-
-        if has_mmcv_full:
-            deform_conv_layers = [
-                DeformConv2d(
-                    in_channels=out_channels,
-                    out_channels=out_channels,
-                    kernel_size=deform_conv_kernel,
-                    stride=1,
-                    padding=int(deform_conv_kernel / 2) * dilations[i],
-                    dilation=dilations[i],
-                    deform_groups=deform_groups,
-                    im2col_step=self.im2col_step,
-                ) for i in range(self.num_offset_layers)
-            ]
-        else:
-            raise ImportError('Please install the full version of mmcv '
-                              'to use `DeformConv2d`.')
-
-        self.deform_conv_layers = nn.ModuleList(deform_conv_layers)
-
-        self.freeze_layers()
-
-    def freeze_layers(self):
-        if self.freeze_trans_layer:
-            self.trans_layer.eval()
-
-            for param in self.trans_layer.parameters():
-                param.requires_grad = False
-
-    def init_weights(self):
-        for m in self.modules():
-            if isinstance(m, nn.Conv2d):
-                normal_init(m, std=0.001)
-            elif isinstance(m, (_BatchNorm, nn.GroupNorm)):
-                constant_init(m, 1)
-            elif isinstance(m, DeformConv2d):
-                filler = torch.zeros([
-                    m.weight.size(0),
-                    m.weight.size(1),
-                    m.weight.size(2),
-                    m.weight.size(3)
-                ],
-                                     dtype=torch.float32,
-                                     device=m.weight.device)
-                for k in range(m.weight.size(0)):
-                    filler[k, k,
-                           int(m.weight.size(2) / 2),
-                           int(m.weight.size(3) / 2)] = 1.0
-                m.weight = torch.nn.Parameter(filler)
-                m.weight.requires_grad = True
-
-        # posewarper offset layer weight initialization
-        for m in self.offset_layers.modules():
-            constant_init(m, 0)
-
-    def _transform_inputs(self, inputs):
-        """Transform inputs for decoder.
-
-        Args:
-            inputs (list[Tensor] | Tensor): multi-level img features.
-
-        Returns:
-            Tensor: The transformed inputs
-        """
-        if not isinstance(inputs, list):
-            return inputs
-
-        if self.input_transform == 'resize_concat':
-            inputs = [inputs[i] for i in self.in_index]
-            upsampled_inputs = [
-                resize(
-                    input=x,
-                    size=inputs[0].shape[2:],
-                    mode='bilinear',
-                    align_corners=self.align_corners) for x in inputs
-            ]
-            inputs = torch.cat(upsampled_inputs, dim=1)
-        elif self.input_transform == 'multiple_select':
-            inputs = [inputs[i] for i in self.in_index]
-        else:
-            inputs = inputs[self.in_index]
-
-        return inputs
-
-    def forward(self, inputs, frame_weight):
-        assert isinstance(inputs, (list, tuple)), 'PoseWarperNeck inputs ' \
-            'should be list or tuple, even though the length is 1, ' \
-            'for unified processing.'
-
-        output_heatmap = 0
-        if len(inputs) > 1:
-            inputs = [self._transform_inputs(input) for input in inputs]
-            inputs = [self.trans_layer(input) for input in inputs]
-
-            # calculate difference features
-            diff_features = [
-                self.offset_feats(inputs[0] - input) for input in inputs
-            ]
-
-            for i in range(len(inputs)):
-                if frame_weight[i] == 0:
-                    continue
-                warped_heatmap = 0
-                for j in range(self.num_offset_layers):
-                    offset = (self.offset_layers[j](diff_features[i]))
-                    warped_heatmap_tmp = self.deform_conv_layers[j](inputs[i],
-                                                                    offset)
-                    warped_heatmap += warped_heatmap_tmp / \
-                        self.num_offset_layers
-
-                output_heatmap += warped_heatmap * frame_weight[i]
-
-        else:
-            inputs = inputs[0]
-            inputs = self._transform_inputs(inputs)
-            inputs = self.trans_layer(inputs)
-
-            num_frames = len(frame_weight)
-            batch_size = inputs.size(0) // num_frames
-            ref_x = inputs[:batch_size]
-            ref_x_tiled = ref_x.repeat(num_frames, 1, 1, 1)
-
-            offset_features = self.offset_feats(ref_x_tiled - inputs)
-
-            warped_heatmap = 0
-            for j in range(self.num_offset_layers):
-                offset = self.offset_layers[j](offset_features)
-
-                warped_heatmap_tmp = self.deform_conv_layers[j](inputs, offset)
-                warped_heatmap += warped_heatmap_tmp / self.num_offset_layers
-
-            for i in range(num_frames):
-                if frame_weight[i] == 0:
-                    continue
-                output_heatmap += warped_heatmap[i * batch_size:(i + 1) *
-                                                 batch_size] * frame_weight[i]
-
-        return output_heatmap
-
-    def train(self, mode=True):
-        """Convert the model into training mode."""
-        super().train(mode)
-        self.freeze_layers()
-        if mode and self.norm_eval:
-            for m in self.modules():
-                if isinstance(m, _BatchNorm):
-                    m.eval()
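PoseWarperNeck (deleted above) predicts per-pixel offsets and feeds them to mmcv's DeformConv2d. A minimal sketch of that call pattern, assuming an mmcv build with CUDA ops compiled; the offset tensor carries 2 * deform_groups * kernel_h * kernel_w channels, and all-zero offsets reduce to a plain convolution:

import torch
from mmcv.ops import DeformConv2d

x = torch.randn(1, 17, 8, 8, device='cuda')
conv = DeformConv2d(17, 17, kernel_size=3, padding=1, deform_groups=1).cuda()
offset = torch.zeros(1, 2 * 1 * 3 * 3, 8, 8, device='cuda')  # zero offsets
out = conv(x, offset)
assert out.shape == (1, 17, 8, 8)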
diff --git a/main/transformer_utils/mmpose/models/necks/tcformer_mta_neck.py b/main/transformer_utils/mmpose/models/necks/tcformer_mta_neck.py
deleted file mode 100644
index 6723fb018e7799c1c0104868b1ca87c56cd28351..0000000000000000000000000000000000000000
--- a/main/transformer_utils/mmpose/models/necks/tcformer_mta_neck.py
+++ /dev/null
@@ -1,224 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-import math
-
-import torch.nn as nn
-import torch.nn.functional as F
-from mmcv.cnn import ConvModule, constant_init, normal_init, trunc_normal_init
-from mmcv.runner import BaseModule
-
-from ..builder import NECKS
-from ..utils import TCFormerDynamicBlock, token2map, token_interp
-
-
-@NECKS.register_module()
-class MTA(BaseModule):
-    """Multi-stage Token feature Aggregation (MTA) module in TCFormer.
-
-    Args:
-        in_channels (list[int]): Number of input channels per stage.
-            Default: [64, 128, 256, 512].
-        out_channels (int): Number of output channels (used at each scale).
-        num_outs (int): Number of output scales. Default: 4.
-        start_level (int): Index of the start input backbone level used to
-            build the feature pyramid. Default: 0.
-        end_level (int): Index of the end input backbone level (exclusive) to
-            build the feature pyramid. Default: -1, which means the last level.
-        add_extra_convs (bool | str): If bool, it decides whether to add conv
-            layers on top of the original feature maps. Default to False.
-            If True, it is equivalent to `add_extra_convs='on_input'`.
-            If str, it specifies the source feature map of the extra convs.
-            Only the following options are allowed
-
-            - 'on_input': Last feat map of neck inputs (i.e. backbone feature).
-            - 'on_output': The last output feature map after fpn convs.
-        relu_before_extra_convs (bool): Whether to apply relu before the extra
-            conv. Default: False.
-        no_norm_on_lateral (bool): Whether to apply norm on lateral.
-            Default: False.
-        conv_cfg (dict): Config dict for convolution layer. Default: None.
-        norm_cfg (dict): Config dict for normalization layer. Default: None.
-        act_cfg (dict): Config dict for activation layer in ConvModule.
-        num_heads (Sequence[int]): The attention heads of each transformer
-            block. Default: [2, 2, 2, 2].
-        mlp_ratios (Sequence[int]): The ratio of the mlp hidden dim to the
-            embedding dim of each transformer block.
-        sr_ratios (Sequence[int]): The spatial reduction rate of each
-            transformer block. Default: [8, 4, 2, 1].
-        qkv_bias (bool): Enable bias for qkv if True. Default: True.
-        qk_scale (float | None, optional): Override default qk scale of
-            head_dim ** -0.5 if set. Default: None.
-        drop_rate (float): Probability of an element to be zeroed.
-            Default 0.0.
-        attn_drop_rate (float): The drop out rate for attention layer.
-            Default 0.0.
-        drop_path_rate (float): stochastic depth rate. Default 0.
-        transformer_norm_cfg (dict): Config dict for normalization layer
-            in transformer blocks. Default: dict(type='LN').
-        use_sr_conv (bool): If True, use a conv layer for spatial reduction.
-            If False, use a pooling process for spatial reduction. Defaults:
-            False.
-    """
-
-    def __init__(
-            self,
-            in_channels=[64, 128, 256, 512],
-            out_channels=128,
-            num_outs=4,
-            start_level=0,
-            end_level=-1,
-            add_extra_convs=False,
-            relu_before_extra_convs=False,
-            no_norm_on_lateral=False,
-            conv_cfg=None,
-            norm_cfg=None,
-            act_cfg=None,
-            num_heads=[2, 2, 2, 2],
-            mlp_ratios=[4, 4, 4, 4],
-            sr_ratios=[8, 4, 2, 1],
-            qkv_bias=True,
-            qk_scale=None,
-            drop_rate=0.,
-            attn_drop_rate=0.,
-            drop_path_rate=0.,
-            transformer_norm_cfg=dict(type='LN'),
-            use_sr_conv=False,
-    ):
-        super().__init__()
-        assert isinstance(in_channels, list)
-        self.in_channels = in_channels
-        self.out_channels = out_channels
-        self.num_ins = len(in_channels)
-        self.num_outs = num_outs
-        self.no_norm_on_lateral = no_norm_on_lateral
-        self.fp16_enabled = False
-        self.norm_cfg = norm_cfg
-        self.conv_cfg = conv_cfg
-        self.act_cfg = act_cfg
-        self.mlp_ratios = mlp_ratios
-
-        if end_level == -1 or end_level == self.num_ins - 1:
-            self.backbone_end_level = self.num_ins
-            assert num_outs >= self.num_ins - start_level
-        else:
-            # if end_level is not the last level, no extra level is allowed
-            self.backbone_end_level = end_level + 1
-            assert end_level < self.num_ins
-            assert num_outs == end_level - start_level + 1
-        self.start_level = start_level
-        self.end_level = end_level
-
-        self.lateral_convs = nn.ModuleList()
-        self.merge_blocks = nn.ModuleList()
-
-        for i in range(self.start_level, self.backbone_end_level):
-            l_conv = ConvModule(
-                in_channels[i],
-                out_channels,
-                1,
-                conv_cfg=conv_cfg,
-                norm_cfg=norm_cfg if not self.no_norm_on_lateral else None,
-                act_cfg=act_cfg,
-                inplace=False)
-            self.lateral_convs.append(l_conv)
-
-        for i in range(self.start_level, self.backbone_end_level - 1):
-            merge_block = TCFormerDynamicBlock(
-                dim=out_channels,
-                num_heads=num_heads[i],
-                mlp_ratio=mlp_ratios[i],
-                qkv_bias=qkv_bias,
-                qk_scale=qk_scale,
-                drop=drop_rate,
-                attn_drop=attn_drop_rate,
-                drop_path=drop_path_rate,
-                norm_cfg=transformer_norm_cfg,
-                sr_ratio=sr_ratios[i],
-                use_sr_conv=use_sr_conv)
-            self.merge_blocks.append(merge_block)
-
-        # add extra conv layers (e.g., RetinaNet)
-        self.relu_before_extra_convs = relu_before_extra_convs
-
-        self.add_extra_convs = add_extra_convs
-        assert isinstance(add_extra_convs, (str, bool))
-        if isinstance(add_extra_convs, str):
-            # Extra_convs_source choices: 'on_input', 'on_output'
-            assert add_extra_convs in ('on_input', 'on_output')
-        elif add_extra_convs:  # True
-            self.add_extra_convs = 'on_input'
-
-        self.extra_convs = nn.ModuleList()
-        extra_levels = num_outs - (self.end_level + 1 - self.start_level)
-        if self.add_extra_convs and extra_levels >= 1:
-            for i in range(extra_levels):
-                if i == 0 and self.add_extra_convs == 'on_input':
-                    in_channels = self.in_channels[self.end_level]
-                else:
-                    in_channels = out_channels
-                extra_fpn_conv = ConvModule(
-                    in_channels,
-                    out_channels,
-                    3,
-                    stride=2,
-                    padding=1,
-                    conv_cfg=conv_cfg,
-                    norm_cfg=norm_cfg,
-                    act_cfg=act_cfg,
-                    inplace=False)
-                self.extra_convs.append(extra_fpn_conv)
-
-    def init_weights(self):
-        for m in self.modules():
-            if isinstance(m, nn.Linear):
-                trunc_normal_init(m, std=.02, bias=0.)
-            elif isinstance(m, nn.LayerNorm):
-                constant_init(m, 1.0)
-            elif isinstance(m, nn.Conv2d):
-                fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
-                fan_out //= m.groups
-                normal_init(m, 0, math.sqrt(2.0 / fan_out))
-
-    def forward(self, inputs):
-        """Forward function."""
-        assert len(inputs) == len(self.in_channels)
-
-        # build lateral tokens
-        input_dicts = []
-        for i, lateral_conv in enumerate(self.lateral_convs):
-            tmp = inputs[i + self.start_level].copy()
-            tmp['x'] = lateral_conv(tmp['x'].unsqueeze(2).permute(
-                0, 3, 1, 2)).permute(0, 2, 3, 1).squeeze(2)
-            input_dicts.append(tmp)
-
-        # merge from high level to low level
-        for i in range(len(input_dicts) - 2, -1, -1):
-            input_dicts[i]['x'] = input_dicts[i]['x'] + token_interp(
-                input_dicts[i], input_dicts[i + 1])
-            input_dicts[i] = self.merge_blocks[i](input_dicts[i])
-
-        # transform to feature map
-        outs = [token2map(token_dict) for token_dict in input_dicts]
-
-        # part 2: add extra levels
-        used_backbone_levels = len(outs)
-        if self.num_outs > len(outs):
-            # use max pool to get more levels on top of outputs
-            if not self.add_extra_convs:
-                for i in range(self.num_outs - used_backbone_levels):
-                    outs.append(F.max_pool2d(outs[-1], 1, stride=2))
-            # add conv layers on top of original feature maps
-            else:
-                if self.add_extra_convs == 'on_input':
-                    tmp = inputs[self.backbone_end_level - 1]
-                    extra_source = token2map(tmp)
-                elif self.add_extra_convs == 'on_output':
-                    extra_source = outs[-1]
-                else:
-                    raise NotImplementedError
-
-                outs.append(self.extra_convs[0](extra_source))
-                for i in range(1, self.num_outs - used_backbone_levels):
-                    if self.relu_before_extra_convs:
-                        outs.append(self.extra_convs[i](F.relu(outs[-1])))
-                    else:
-                        outs.append(self.extra_convs[i](outs[-1]))
-        return outs
diff --git a/main/transformer_utils/mmpose/models/utils/positional_encoding.py b/main/transformer_utils/mmpose/models/utils/positional_encoding.py
index 3c7e6bab9f5b3a1a71895f068bcbee47a891de68..3ceb81ac078894f747d97a2ba6d78199addab3e5 100644
--- a/main/transformer_utils/mmpose/models/utils/positional_encoding.py
+++ b/main/transformer_utils/mmpose/models/utils/positional_encoding.py
@@ -2,11 +2,11 @@ import math
 
 import torch
 import torch.nn as nn
-from mmcv.cnn.bricks.transformer import POSITIONAL_ENCODING
-from mmcv.runner import BaseModule
+# from mmcv.cnn.bricks.transformer import POSITIONAL_ENCODING
+from mmengine.model import BaseModule
 
 
-@POSITIONAL_ENCODING.register_module(force=True)
+# @POSITIONAL_ENCODING.register_module(force=True)
 class SinePositionalEncoding(BaseModule):
     """Position encoding with sine and cosine functions.
     See `End-to-End Object Detection with Transformers
@@ -98,7 +98,7 @@ class SinePositionalEncoding(BaseModule):
         return repr_str
 
 
-@POSITIONAL_ENCODING.register_module(force=True)
+# @POSITIONAL_ENCODING.register_module(force=True)
 class LearnedPositionalEncoding(BaseModule):
     """Position embedding with learnable embedding weights.
 
     Args:
diff --git a/main/transformer_utils/mmpose/models/utils/tcformer_utils.py b/main/transformer_utils/mmpose/models/utils/tcformer_utils.py
index 8d3a28534c83d60e52ed0382f54a4d9f4902e018..85fae8cd46f44b536ab396df669f041fe1f18094 100644
--- a/main/transformer_utils/mmpose/models/utils/tcformer_utils.py
+++ b/main/transformer_utils/mmpose/models/utils/tcformer_utils.py
@@ -4,7 +4,8 @@ import math
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from mmcv.cnn import build_norm_layer, trunc_normal_init
+from mmcv.cnn import build_norm_layer
+from mmengine.model import trunc_normal_init
 from mmcv.cnn.bricks.transformer import build_dropout
 
 try:
diff --git a/main/transformer_utils/mmpose/models/utils/transformer.py b/main/transformer_utils/mmpose/models/utils/transformer.py
index 42205707347e57c433e66eda728cc82a6df7455a..f256c4c8ffade75f020686a369d35157ba6d6b5c 100644
--- a/main/transformer_utils/mmpose/models/utils/transformer.py
+++ b/main/transformer_utils/mmpose/models/utils/transformer.py
@@ -6,30 +6,32 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from mmcv.cnn import build_conv_layer, build_norm_layer
-from mmcv.runner.base_module import BaseModule
-from mmcv.utils import to_2tuple
+from mmengine.model import BaseModule, ModuleList
+from mmengine.utils import digit_version, to_2tuple
 
 from mmpose.models.builder import TRANSFORMER
 from easydict import EasyDict
 from einops import rearrange, repeat
-from mmcv.runner import force_fp32
+from mmpose.core import force_fp32
 from mmcv.cnn.bricks.transformer import (BaseTransformerLayer,
                                          TransformerLayerSequence,
                                          build_transformer_layer_sequence)
-from mmcv.cnn.bricks.registry import (TRANSFORMER_LAYER,
-                                      TRANSFORMER_LAYER_SEQUENCE)
+# from mmcv.cnn.bricks.registry import (TRANSFORMER_LAYER,
+#                                       TRANSFORMER_LAYER_SEQUENCE)
 import torch.distributions as distributions
 from mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention
 from torch.nn.init import normal_
 import copy
 import warnings
-from mmcv.cnn import build_activation_layer, build_norm_layer, xavier_init
-
+from mmcv.cnn import build_activation_layer, build_norm_layer
+from mmengine.model import xavier_init
 from utils.human_models import smpl_x
 from config import cfg
-
+from mmengine import Registry
+TRANSFORMER_LAYER = Registry('transformerLayer')
+TRANSFORMER_LAYER_SEQUENCE = Registry('transformer-layers sequence')
 
 def point_sample(input, point_coords, **kwargs):
     """
     A wrapper around :function:`torch.nn.functional.grid_sample` to support 3D
     point_coords tensors.
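mmcv 2.x removed mmcv.cnn.bricks.registry, so transformer.py above recreates TRANSFORMER_LAYER and TRANSFORMER_LAYER_SEQUENCE as private mmengine registries. A sketch of how such a stand-in registry behaves; `MyDecoderLayer` is hypothetical, and note that classes registered this way are visible only to these private registries, not to mmcv's own build_transformer_layer_sequence:

from mmengine import Registry

TRANSFORMER_LAYER = Registry('transformerLayer')

@TRANSFORMER_LAYER.register_module()
class MyDecoderLayer:
    def __init__(self, embed_dims=256):
        self.embed_dims = embed_dims

layer = TRANSFORMER_LAYER.build(dict(type='MyDecoderLayer', embed_dims=512))
assert layer.embed_dims == 512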
diff --git a/main/transformer_utils/mmpose/ops/multi_scale_deform_attn.py b/main/transformer_utils/mmpose/ops/multi_scale_deform_attn.py
index e58ca98ebd0cef0498607270b7650a4ad1f6ec27..0d46db5038dcb24d605b1705ecc64b1dcafb6d96 100644
--- a/main/transformer_utils/mmpose/ops/multi_scale_deform_attn.py
+++ b/main/transformer_utils/mmpose/ops/multi_scale_deform_attn.py
@@ -6,13 +6,14 @@ import torch.nn as nn
 import torch.nn.functional as F
 from torch.autograd.function import Function, once_differentiable
 
-from mmcv import deprecated_api_warning
-from mmcv.cnn import constant_init, xavier_init
-from mmcv.cnn.bricks.registry import ATTENTION
-from mmcv.runner import BaseModule
+from mmengine.utils import deprecated_api_warning
+from mmengine.model import constant_init, xavier_init
+# from mmcv.cnn.bricks.registry import ATTENTION
+from mmengine.model import BaseModule
 from mmcv.utils import ext_loader
 from mmcv.ops.multi_scale_deform_attn import ext_module
-
+from mmengine import Registry
+ATTENTION = Registry('attention')
 
 class MultiScaleDeformableAttnFunction(Function):
     @staticmethod
diff --git a/main/transformer_utils/mmpose/utils/collect_env.py b/main/transformer_utils/mmpose/utils/collect_env.py
index f75c5ea73383ccef367632cf497227498ac50078..1433f0bcb1555b550e06b5e933b2755dbc56e24c 100644
--- a/main/transformer_utils/mmpose/utils/collect_env.py
+++ b/main/transformer_utils/mmpose/utils/collect_env.py
@@ -1,16 +1,16 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from mmcv.utils import collect_env as collect_basic_env
-from mmcv.utils import get_git_hash
+from mmengine.utils import get_git_hash
+from mmengine.utils.dl_utils import collect_env as collect_base_env
 
 import mmpose
 
 def collect_env():
-    env_info = collect_basic_env()
+    env_info = collect_base_env()
     env_info['MMPose'] = (mmpose.__version__ + '+' +
                           get_git_hash(digits=7))
     return env_info
 
 
 if __name__ == '__main__':
     for name, val in collect_env().items():
-        print(f'{name}: {val}')
+        print(f'{name}: {val}')
\ No newline at end of file
diff --git a/main/transformer_utils/mmpose/utils/logger.py b/main/transformer_utils/mmpose/utils/logger.py
index 294837fa6aec1e1896de8c8accf470f366f81296..2e60cc5c2d59c15adb963645137d54900d998a60 100644
--- a/main/transformer_utils/mmpose/utils/logger.py
+++ b/main/transformer_utils/mmpose/utils/logger.py
@@ -1,11 +1,11 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import logging
 
-from mmcv.utils import get_logger
+from mmengine.logging import MMLogger
 
 
 def get_root_logger(log_file=None, log_level=logging.INFO):
-    """Use `get_logger` method in mmcv to get the root logger.
+    """Use `MMLogger` class in mmengine to get the root logger.
 
     The logger will be initialized if it has not been initialized. By default a
     StreamHandler will be added. If `log_file` is specified, a FileHandler will
@@ -22,4 +22,4 @@ def get_root_logger(log_file=None, log_level=logging.INFO):
     Returns:
         logging.Logger: The root logger.
     """
-    return get_logger(__name__.split('.')[0], log_file, log_level)
+    return MMLogger('MMLogger', __name__.split('.')[0], log_file, log_level)
\ No newline at end of file
diff --git a/main/transformer_utils/mmpose/utils/setup_env.py b/main/transformer_utils/mmpose/utils/setup_env.py
index 21def2f0809153a5f755af2431f7e702db625e5c..4c862d9e67a869aa8fa3624110c4d7f3ac60fa90 100644
--- a/main/transformer_utils/mmpose/utils/setup_env.py
+++ b/main/transformer_utils/mmpose/utils/setup_env.py
@@ -45,3 +45,34 @@ def setup_multi_processes(cfg):
             f'overloaded, please further tune the variable for optimal '
             f'performance in your application as needed.')
         os.environ['MKL_NUM_THREADS'] = str(mkl_num_threads)
+
+# def register_all_modules(init_default_scope: bool = True) -> None:
+#     """Register all modules in mmpose into the registries.
+
+#     Args:
+#         init_default_scope (bool): Whether initialize the mmpose default scope.
+#             When `init_default_scope=True`, the global default scope will be
+#             set to `mmpose`, and all registries will build modules from mmpose's
+#             registry node. To understand more about the registry, please refer
+#             to https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/registry.md
+#             Defaults to True.
+#     """  # noqa
+
+#     import mmpose.models  # noqa: F401,F403
+
+#     if init_default_scope:
+#         never_created = DefaultScope.get_current_instance() is None \
+#             or not DefaultScope.check_instance_created('mmpose')
+#         if never_created:
+#             DefaultScope.get_instance('mmpose', scope_name='mmpose')
+#             return
+#         current_scope = DefaultScope.get_current_instance()
+#         if current_scope.scope_name != 'mmpose':
+#             warnings.warn('The current default scope '
+#                           f'"{current_scope.scope_name}" is not "mmpose", '
+#                           '`register_all_modules` will force the current'
+#                           'default scope to be "mmpose". If this is not '
+#                           'expected, please set `init_default_scope=False`.')
+#             # avoid name conflict
+#             new_instance_name = f'mmpose-{datetime.datetime.now()}'
+#             DefaultScope.get_instance(new_instance_name, scope_name='mmpose')
\ No newline at end of file
diff --git a/main/transformer_utils/mmpose/utils/timer.py b/main/transformer_utils/mmpose/utils/timer.py
index 5a3185c5e89ce73bd33591c22ce74fc73ef8e770..cec6aff6226e249edca2e2ac64b7cd2db8557e19 100644
--- a/main/transformer_utils/mmpose/utils/timer.py
+++ b/main/transformer_utils/mmpose/utils/timer.py
@@ -4,7 +4,7 @@ from contextlib import contextmanager
 from functools import partial
 
 import numpy as np
-from mmcv import Timer
+from mmengine import Timer
 
 
 class RunningAverage():
@@ -114,4 +114,4 @@ class StopWatch:
 
     def reset(self):
         self._record = defaultdict(list)
-        self._active_timer_stack = []
+        self._active_timer_stack = []
\ No newline at end of file
diff --git a/pre-requirements.txt b/pre-requirements.txt
index 4e1c5f6a2c1603941b6cc4be0c03ca373e6327ec..ba1cf2dec43cf1bd737527402e561cfb15db9b8c 100644
--- a/pre-requirements.txt
+++ b/pre-requirements.txt
@@ -1,6 +1,4 @@
 numpy==1.23
-
---extra-index-url https://download.pytorch.org/whl/cu118
-torch==2.0.0+cu118
-torchvision==0.15.0+cu118
-torchaudio==2.0.0+cu118
\ No newline at end of file
+torch==2.0.0
+torchvision
+torchaudio
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 34511268aa0abff3c76c7eb66397dfa2f277acc3..fb3a3646f5962e098c3521a29a4b73342157ecd8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
---extra-index-url https://download.openmmlab.com/mmcv/dist/cu118/torch2.0/index.html
-https://download.openmmlab.com/mmcv/dist/cu118/torch2.0.0/mmcv_full-1.7.2-cp39-cp39-manylinux1_x86_64.whl
+--extra-index-url https://download.openmmlab.com/mmcv/dist/cu117/torch2.0.0/index.html
+https://download.openmmlab.com/mmcv/dist/cu117/torch2.0.0/mmcv-2.1.0-cp310-cp310-manylinux1_x86_64.whl
 
 scikit-image
 scipy
@@ -28,5 +28,6 @@ pycocotools
 plyfile
 timm
 pyglet
-mmdet==2.26.0
+mmcv
+mmdet==3.2.0
 eval_type_backport
\ No newline at end of file
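Closing note: with both requirement files updated, a short sanity check confirms the mmcv 2.x stack is the one actually imported (a sketch; the expected versions follow from the pins above):

    import mmcv
    import mmdet
    import mmengine
    import torch

    print('torch   :', torch.__version__)     # 2.0.0 per pre-requirements.txt
    print('mmcv    :', mmcv.__version__)      # 2.1.0 per the pinned wheel
    print('mmdet   :', mmdet.__version__)     # 3.2.0
    print('mmengine:', mmengine.__version__)  # installed as an mmcv 2.x dependency

If the quick check passes but runtime behavior still looks off, the relocated `collect_env` from `mmengine.utils.dl_utils` (patched in `collect_env.py` above) prints a fuller environment table.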