diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..2223bbb5d31e9ba827cac883c0555838155f622a --- /dev/null +++ b/.gitignore @@ -0,0 +1,127 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/en/_build/ +docs/zh_cn/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +data/ +data +.vscode +.idea +.DS_Store + +# custom +*.pkl +*.pkl.json +*.log.json +docs/modelzoo_statistics.md +mmdet/.mim +work_dirs + +# Pytorch +*.pth +*.py~ +*.sh~ + +# venus +venus_run.sh + diff --git a/README.md b/README.md index 9a2cfb82c5429a261f1e713fea200e33215573fd..f32bdc4472920c5d87a0deff90dbc8ea6500a71e 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,141 @@ ---- -title: YOLO World -emoji: 🔥 -colorFrom: pink -colorTo: blue -sdk: docker -pinned: false -license: apache-2.0 ---- - -Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference +
+
+ +
+
+Tianheng Cheng*2,3, +Lin Song*1, +Yixiao Ge1,2, +Xinggang Wang3, + Wenyu Liu3, +Ying Shan1,2 +
+ +1 Tencent AI Lab, 2 ARC Lab, Tencent PCG +3 Huazhong University of Science and Technology +
+
+
+[![arxiv paper](https://img.shields.io/badge/arXiv-Paper-red)](https://arxiv.org/abs/)
+[![demo](https://img.shields.io/badge/🤗HuggingFace-Spaces-orange)](https://huggingface.co/)
+[![license](https://img.shields.io/badge/License-GPLv3.0-blue)](LICENSE)
+
+
+
+
+## Updates
+
+`[2024-1-25]:` We are excited to launch **YOLO-World**, a cutting-edge real-time open-vocabulary object detector.
+
+## Highlights
+
+This repo contains the PyTorch implementation, pre-trained weights, and pre-training/fine-tuning code for YOLO-World.
+
+* YOLO-World is pre-trained on large-scale datasets, including detection, grounding, and image-text datasets.
+
+* YOLO-World is the next-generation YOLO detector, with strong open-vocabulary detection and grounding abilities.
+
+* YOLO-World presents a *prompt-then-detect* paradigm for efficient user-vocabulary inference: vocabulary embeddings are re-parameterized as model parameters, which yields superior inference speed (see the sketch below). You can export your own detection model without extra training or fine-tuning in our [online demo]()!
+
+
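+Below is a minimal, illustrative sketch of the *prompt-then-detect* idea. It uses the `transformers` CLIP text encoder rather than this repository's modules; the example vocabulary, tensor shapes, and the `region_feats` placeholder are assumptions for illustration only, not the actual YOLO-World head.
+
+```python
+# Sketch: encode a user vocabulary ONCE with a frozen CLIP text encoder and
+# cache the embeddings; at inference time, region features are scored against
+# the cached embeddings, so no text encoder needs to run per image.
+import torch
+import torch.nn.functional as F
+from transformers import AutoTokenizer, CLIPTextModelWithProjection
+
+vocabulary = ["person", "bicycle", "red backpack", "helmet"]  # any user prompts
+
+tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")
+text_encoder = CLIPTextModelWithProjection.from_pretrained(
+    "openai/clip-vit-base-patch32").eval()
+
+with torch.no_grad():
+    tokens = tokenizer(vocabulary, padding=True, return_tensors="pt")
+    text_embeds = F.normalize(text_encoder(**tokens).text_embeds, dim=-1)  # (C, 512)
+
+# "Re-parameterize": treat the cached embeddings as fixed classifier weights.
+# `region_feats` stands in for the per-anchor embeddings produced by the detector.
+region_feats = F.normalize(torch.randn(100, 512), dim=-1)  # (N, 512), dummy features
+scores = region_feats @ text_embeds.t()                    # (N, C) region-text scores
+print(scores.shape)  # torch.Size([100, 4])
+```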
+ +
+
+
+## Abstract
+
+The You Only Look Once (YOLO) series of detectors have established themselves as efficient and practical tools. However, their reliance on predefined and trained object categories limits their applicability in open scenarios. Addressing this limitation, we introduce YOLO-World, an innovative approach that enhances YOLO with open-vocabulary detection capabilities through vision-language modeling and pre-training on large-scale datasets. Specifically, we propose a new Re-parameterizable Vision-Language Path Aggregation Network (RepVL-PAN) and region-text contrastive loss to facilitate the interaction between visual and linguistic information. Our method excels in detecting a wide range of objects in a zero-shot manner with high efficiency. On the challenging LVIS dataset, YOLO-World achieves 35.4 AP with 52.0 FPS on V100, which outperforms many state-of-the-art methods in terms of both accuracy and speed. Furthermore, the fine-tuned YOLO-World achieves remarkable performance on several downstream tasks, including object detection and open-vocabulary instance segmentation.
+
+
+## Demo
+
+
+## Main Results
+
+We've pre-trained YOLO-World-S/M/L from scratch and evaluated them on `LVIS val-1.0` and `LVIS minival`. We provide the pre-trained model weights and training logs for applications/research or reproducing the results.
+
+### Zero-shot Inference on LVIS dataset
+
+| model | Pre-train Data | AP | APr | APc | APf | FPS (V100) | weights | log |
+| :---- | :------------- | :-: | :---: | :---: | :---: | :-------: | :-----: | :---: |
+| [YOLO-World-S](./configs/pretrain/yolo_world_s_t2i_bn_2e-4_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) | O365+GoldG | 17.6 | 11.9 | 14.5 | 23.2 | - | [wecom](https://drive.weixin.qq.com/s?k=AJEAIQdfAAoREsieRl) | [log]() |
+| [YOLO-World-M](./configs/pretrain/yolo_world_m_t2i_bn_2e-4_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) | O365+GoldG | 23.5 | 17.2 | 20.4 | 29.6 | - | [wecom](https://drive.weixin.qq.com/s?k=AJEAIQdfAAoj0byBC0) | [log]() |
+| [YOLO-World-L](./configs/pretrain/yolo_world_l_t2i_bn_2e-4_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py) | O365+GoldG | 25.7 | 18.7 | 22.6 | 32.2 | - | [wecom](https://drive.weixin.qq.com/s?k=AJEAIQdfAAoK06oxO2) | [log]() |
+
+**NOTE:**
+1. The results are evaluated on LVIS minival in a zero-shot manner.
+
+
+## Getting started
+
+### 1. Installation
+
+YOLO-World is developed based on `torch==1.11.0`, `mmyolo==0.6.0`, and `mmdetection==3.0.0`.
+
+```bash
+# install key dependencies
+pip install mmdet==3.0.0 mmengine transformers
+
+# clone the repo
+git clone https://xxxx.YOLO-World.git
+cd YOLO-World
+
+# install mmyolo
+mkdir third_party
+cd third_party
+git clone https://github.com/open-mmlab/mmyolo.git
+cd ..
+
+```
+
+### 2. Preparing Data
+
+We provide the details about the pre-training data in [docs/data](./docs/data.md).
+
+
+## Training & Evaluation
+
+We adopt the default [training](./tools/train.py) and [evaluation](./tools/test.py) scripts of [mmyolo](https://github.com/open-mmlab/mmyolo).
+We provide the configs for pre-training and fine-tuning in `configs/pretrain` and `configs/finetune_coco`.
+Training YOLO-World is easy:
+
+```bash
+chmod +x tools/dist_train.sh
+# sample command for pre-training; use AMP for mixed-precision training
+./tools/dist_train.sh configs/pretrain/yolo_world_l_t2i_bn_2e-4_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py 8 --amp
+```
+**NOTE:** YOLO-World is pre-trained on 4 nodes with 8 GPUs per node (32 GPUs in total).
For pre-training, the `node_rank` and `nnodes` for multi-node training should be specified. + +Evalutating YOLO-World is also easy: + +```bash +chmod +x tools/dist_test.sh +./tools/dist_test.sh path/to/config path/to/weights 8 +``` + +**NOTE:** We mainly evaluate the performance on LVIS-minival for pre-training. + +## Deployment + +We provide the details about deployment for downstream applications in [docs/deployment](./docs/deploy.md). +You can directly download the ONNX model through the online [demo]() in Huggingface Spaces 🤗. + +## Acknowledgement + +We sincerely thank [mmyolo](https://github.com/open-mmlab/mmyolo), [mmdetection](https://github.com/open-mmlab/mmdetection), and [transformers](https://github.com/huggingface/transformers) for providing their wonderful code to the community! + +## Citations +If you find YOLO-World is useful in your research or applications, please consider giving us a star 🌟 and citing it. + +```bibtex +@article{cheng2024yolow, + title={YOLO-World: Real-Time Open-Vocabulary Object Detection}, + author={Cheng, Tianheng and Song, Lin and Ge, Yixiao and Liu, Wenyu and Wang, Xinggang and Shan, Ying}, + journal={arXiv preprint arXiv:}, + year={2024} +} +``` + +## Licence +YOLO-World is under the GPL-v3 Licence and is supported for comercial usage. \ No newline at end of file diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..1af48f945a820a102f8ea7363dfbdf3678851af8 --- /dev/null +++ b/app.py @@ -0,0 +1,61 @@ +import argparse +import os.path as osp + +from mmengine.config import Config, DictAction +from mmengine.runner import Runner +from mmengine.dataset import Compose +from mmyolo.registry import RUNNERS + +from tools.demo import demo + + +def parse_args(): + parser = argparse.ArgumentParser( + description='YOLO-World Demo') + parser.add_argument('--config', default='configs/pretrain/yolo_world_l_t2i_bn_2e-4_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py') + parser.add_argument('--checkpoint', default='model_zoo/yolow-v8_l_clipv2_frozen_t2iv2_bn_o365_goldg_pretrain.pth') + parser.add_argument( + '--work-dir', + help='the directory to save the file containing evaluation metrics') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. 
key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + args = parser.parse_args() + return args + + +if __name__ == '__main__': + args = parse_args() + + # load config + cfg = Config.fromfile(args.config) + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + + if args.work_dir is not None: + cfg.work_dir = args.work_dir + elif cfg.get('work_dir', None) is None: + cfg.work_dir = osp.join('./work_dirs', + osp.splitext(osp.basename(args.config))[0]) + + cfg.load_from = args.checkpoint + + if 'runner_type' not in cfg: + runner = Runner.from_cfg(cfg) + else: + runner = RUNNERS.build(cfg) + + runner.call_hook('before_run') + runner.load_or_resume() + pipeline = cfg.test_dataloader.dataset.pipeline + runner.pipeline = Compose(pipeline) + runner.model.eval() + demo(runner, args) + diff --git a/assets/yolo_arch.png b/assets/yolo_arch.png new file mode 100644 index 0000000000000000000000000000000000000000..96c7caada7113c783458595fb2f52a5e95fba0aa Binary files /dev/null and b/assets/yolo_arch.png differ diff --git a/assets/yolo_logo.png b/assets/yolo_logo.png new file mode 100644 index 0000000000000000000000000000000000000000..0a9321f37d9a53e69c9f3e56b3c274cc6c80fe8e Binary files /dev/null and b/assets/yolo_logo.png differ diff --git a/configs/deploy/detection_onnxruntime-fp16_dynamic.py b/configs/deploy/detection_onnxruntime-fp16_dynamic.py new file mode 100644 index 0000000000000000000000000000000000000000..9ed6af743bbcd8fc360b6ba9c573f5b17f19f3c2 --- /dev/null +++ b/configs/deploy/detection_onnxruntime-fp16_dynamic.py @@ -0,0 +1,18 @@ +_base_ = ( + '../../third_party/mmdeploy/configs/mmdet/detection/' + 'detection_onnxruntime-fp16_dynamic.py') +codebase_config = dict( + type='mmyolo', + task='ObjectDetection', + model_type='end2end', + post_processing=dict( + score_threshold=0.1, + confidence_threshold=0.005, + iou_threshold=0.3, + max_output_boxes_per_class=100, + pre_top_k=1000, + keep_top_k=100, + background_label_id=-1), + module=['mmyolo.deploy']) +backend_config = dict( + type='onnxruntime') diff --git a/configs/deploy/detection_onnxruntime-int8_dynamic.py b/configs/deploy/detection_onnxruntime-int8_dynamic.py new file mode 100644 index 0000000000000000000000000000000000000000..994fd25b97aa34e2f5013258583cff9e4d9e7116 --- /dev/null +++ b/configs/deploy/detection_onnxruntime-int8_dynamic.py @@ -0,0 +1,20 @@ +_base_ = ( + '../../third_party/mmdeploy/configs/mmdet/detection/' + 'detection_onnxruntime-fp16_dynamic.py') +backend_config = dict( + precision='int8') +codebase_config = dict( + type='mmyolo', + task='ObjectDetection', + model_type='end2end', + post_processing=dict( + score_threshold=0.1, + confidence_threshold=0.005, + iou_threshold=0.3, + max_output_boxes_per_class=100, + pre_top_k=1000, + keep_top_k=100, + background_label_id=-1), + module=['mmyolo.deploy']) +backend_config = dict( + type='onnxruntime') diff --git a/configs/deploy/detection_onnxruntime_static.py b/configs/deploy/detection_onnxruntime_static.py new file mode 100644 index 0000000000000000000000000000000000000000..aba96c7297e9988d285c729d2bdbf86de5f7bbe7 --- /dev/null +++ b/configs/deploy/detection_onnxruntime_static.py @@ -0,0 +1,18 @@ +_base_ = ( + '../../third_party/mmyolo/configs/deploy/' + 'detection_onnxruntime_static.py') +codebase_config = dict( + type='mmyolo', + task='ObjectDetection', + model_type='end2end', + post_processing=dict( + score_threshold=0.25, + confidence_threshold=0.005, + iou_threshold=0.65, + 
max_output_boxes_per_class=200, + pre_top_k=1000, + keep_top_k=100, + background_label_id=-1), + module=['mmyolo.deploy']) +backend_config = dict( + type='onnxruntime') diff --git a/configs/deploy/detection_tensorrt-fp16_static-640x640.py b/configs/deploy/detection_tensorrt-fp16_static-640x640.py new file mode 100644 index 0000000000000000000000000000000000000000..c513cb51f0f7bbda77c39dfd459666a97fedb8e7 --- /dev/null +++ b/configs/deploy/detection_tensorrt-fp16_static-640x640.py @@ -0,0 +1,38 @@ +_base_ = ( + '../../third_party/mmyolo/configs/deploy/' + 'detection_tensorrt-fp16_static-640x640.py') +onnx_config = dict( + type='onnx', + export_params=True, + keep_initializers_as_inputs=False, + opset_version=11, + save_file='end2end.onnx', + input_names=['input'], + output_names=['dets', 'labels'], + input_shape=(640, 640), + optimize=True) +backend_config = dict( + type='tensorrt', + common_config=dict(fp16_mode=True, max_workspace_size=1 << 34), + model_inputs=[ + dict( + input_shapes=dict( + input=dict( + min_shape=[1, 3, 640, 640], + opt_shape=[1, 3, 640, 640], + max_shape=[1, 3, 640, 640]))) + ]) +use_efficientnms = False # whether to replace TRTBatchedNMS plugin with EfficientNMS plugin # noqa E501 +codebase_config = dict( + type='mmyolo', + task='ObjectDetection', + model_type='end2end', + post_processing=dict( + score_threshold=0.25, + confidence_threshold=0.005, + iou_threshold=0.65, + max_output_boxes_per_class=100, + pre_top_k=1, + keep_top_k=1, + background_label_id=-1), + module=['mmyolo.deploy']) diff --git a/configs/deploy/detection_tensorrt-int8_static-640x640.py b/configs/deploy/detection_tensorrt-int8_static-640x640.py new file mode 100644 index 0000000000000000000000000000000000000000..09d0b4d8b48ad95ebff6f0dd29e66da42185833f --- /dev/null +++ b/configs/deploy/detection_tensorrt-int8_static-640x640.py @@ -0,0 +1,30 @@ +_base_ = [ + '../../third_party/mmdeploy/configs/mmdet/_base_/base_static.py', + '../../third_party/mmdeploy/configs/_base_/backends/tensorrt-int8.py'] + +onnx_config = dict(input_shape=(640, 640)) + +backend_config = dict( + common_config=dict(max_workspace_size=1 << 30), + model_inputs=[ + dict( + input_shapes=dict( + input=dict( + min_shape=[1, 3, 640, 640], + opt_shape=[1, 3, 640, 640], + max_shape=[1, 3, 640, 640]))) + ]) + +codebase_config = dict( + type='mmyolo', + task='ObjectDetection', + model_type='end2end', + post_processing=dict( + score_threshold=0.1, + confidence_threshold=0.005, + iou_threshold=0.3, + max_output_boxes_per_class=100, + pre_top_k=1000, + keep_top_k=100, + background_label_id=-1), + module=['mmyolo.deploy']) diff --git a/configs/finetune_coco/yolo_world_l_t2i_bn_2e-4_100e_4x8gpus_coco_finetune.py b/configs/finetune_coco/yolo_world_l_t2i_bn_2e-4_100e_4x8gpus_coco_finetune.py new file mode 100644 index 0000000000000000000000000000000000000000..bd7fce4d8e56e9b7031770848110556357026a11 --- /dev/null +++ b/configs/finetune_coco/yolo_world_l_t2i_bn_2e-4_100e_4x8gpus_coco_finetune.py @@ -0,0 +1,183 @@ +_base_ = ('../../third_party/mmyolo/configs/yolov8/' + 'yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco.py') +custom_imports = dict(imports=['yolo_world'], + allow_failed_imports=False) + +# hyper-parameters +num_classes = 80 +num_training_classes = 80 +max_epochs = 80 # Maximum training epochs +close_mosaic_epochs = 10 +save_epoch_intervals = 5 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 2e-4 +weight_decay = 
0.05 +train_batch_size_per_gpu = 16 +load_from = 'weights/yolow-v8_l_clipv2_frozen_t2iv2_bn_o365_goldg_pretrain.pth' +persistent_workers = False + + +# model settings +model = dict( + type='YOLOWorldDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), + backbone=dict( + _delete_=True, + type='MultiModalYOLOBackbone', + image_model={{_base_.model.backbone}}, + text_model=dict( + type='HuggingCLIPLanguageBackbone', + model_name='pretrained_models/clip-vit-base-patch32-projection', + frozen_modules=['all'])), + neck=dict(type='YOLOWorldPAFPN', + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + num_csp_blocks=2), + bbox_head=dict(type='YOLOWorldHead', + head_module=dict(type='YOLOWorldHeadModule', + embed_dims=text_channels, + use_bn_head=True, + num_classes=num_training_classes)), + train_cfg=dict(assigner=dict(num_classes=num_training_classes))) + +# dataset settings +text_transform = [ + dict(type='RandomLoadText', + num_neg_samples=(num_classes, num_classes), + max_num_samples=num_training_classes, + padding_to_max=True, + padding_value=''), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction', 'texts')) +] +mosaic_affine_transform = [ + dict( + type='MultiModalMosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict(type='YOLOv5CopyPaste', prob=_base_.copypaste_prob), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_aspect_ratio=100., + scaling_ratio_range=(1 - _base_.affine_scale, + 1 + _base_.affine_scale), + # img_scale is (width, height) + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=_base_.min_area_ratio, + use_mask_refine=_base_.use_mask2refine) +] +train_pipeline = [ + *_base_.pre_transform, + *mosaic_affine_transform, + dict( + type='YOLOv5MultiModalMixUp', + prob=_base_.mixup_prob, + pre_transform=[*_base_.pre_transform, + *mosaic_affine_transform]), + *_base_.last_transform[:-1], + *text_transform +] +train_pipeline_stage2 = [ + *_base_.train_pipeline_stage2[:-1], + *text_transform +] +coco_train_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict( + type='YOLOv5CocoDataset', + data_root='data/coco', + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/captions/coco_class_captions.json', + pipeline=train_pipeline) +train_dataloader = dict( + persistent_workers=persistent_workers, + batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=coco_train_dataset) +test_pipeline = [ + *_base_.test_pipeline[:-1], + dict(type='LoadTextFixed'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'texts')) +] +coco_val_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict( + type='YOLOv5CocoDataset', + data_root='data/coco', + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/captions/coco_class_captions.json', + pipeline=test_pipeline) +val_dataloader = dict(dataset=coco_val_dataset) 
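+# NOTE: testing reuses the COCO validation dataloader defined above, and the
+# evaluator at the end of this config is the standard COCO bbox metric
+# (mmdet.CocoMetric), so fine-tuning is measured as ordinary 80-class detection.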
+test_dataloader = val_dataloader +# training settings +default_hooks = dict( + param_scheduler=dict( + scheduler_type='linear', + lr_factor=0.01, + max_epochs=max_epochs), + checkpoint=dict( + max_keep_ckpts=-1, + save_best=None, + interval=save_epoch_intervals)) +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict( + max_epochs=max_epochs, + val_interval=5, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict( + optimizer=dict( + _delete_=True, + type='AdamW', + lr=base_lr, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + paramwise_cfg=dict( + bias_decay_mult=0.0, + norm_decay_mult=0.0, + custom_keys={'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0)}), + constructor='YOLOWv5OptimizerConstructor') + +# evaluation settings +val_evaluator = dict( + _delete_=True, + type='mmdet.CocoMetric', + proposal_nums=(100, 1, 10), + ann_file='data/coco/annotations/instances_val2017.json', + metric='bbox') + +test_evaluator = val_evaluator diff --git a/configs/finetune_coco/yolo_world_m_t2i_bn_2e-4_100e_4x8gpus_coco_finetune.py b/configs/finetune_coco/yolo_world_m_t2i_bn_2e-4_100e_4x8gpus_coco_finetune.py new file mode 100644 index 0000000000000000000000000000000000000000..7cf820ecf7bd49d7fdde2388bbd2480b448763d6 --- /dev/null +++ b/configs/finetune_coco/yolo_world_m_t2i_bn_2e-4_100e_4x8gpus_coco_finetune.py @@ -0,0 +1,183 @@ +_base_ = ('../../third_party/mmyolo/configs/yolov8/' + 'yolov8_m_mask-refine_syncbn_fast_8xb16-500e_coco.py') +custom_imports = dict(imports=['yolo_world'], + allow_failed_imports=False) + +# hyper-parameters +num_classes = 80 +num_training_classes = 80 +max_epochs = 80 # Maximum training epochs +close_mosaic_epochs = 10 +save_epoch_intervals = 5 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 2e-4 +weight_decay = 0.05 +train_batch_size_per_gpu = 16 +load_from = 'weights/yolow-v8_m_clipv2_frozen_t2iv2_bn_o365_goldg_pretrain.pth' +persistent_workers = False + + +# model settings +model = dict( + type='YOLOWorldDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), + backbone=dict( + _delete_=True, + type='MultiModalYOLOBackbone', + image_model={{_base_.model.backbone}}, + text_model=dict( + type='HuggingCLIPLanguageBackbone', + model_name='pretrained_models/clip-vit-base-patch32-projection', + frozen_modules=['all'])), + neck=dict(type='YOLOWorldPAFPN', + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + num_csp_blocks=2), + bbox_head=dict(type='YOLOWorldHead', + head_module=dict(type='YOLOWorldHeadModule', + embed_dims=text_channels, + use_bn_head=True, + num_classes=num_training_classes)), + train_cfg=dict(assigner=dict(num_classes=num_training_classes))) + +# dataset settings +text_transform = [ + dict(type='RandomLoadText', + num_neg_samples=(num_classes, num_classes), + max_num_samples=num_training_classes, + padding_to_max=True, + padding_value=''), + 
dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction', 'texts')) +] +mosaic_affine_transform = [ + dict( + type='MultiModalMosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict(type='YOLOv5CopyPaste', prob=_base_.copypaste_prob), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_aspect_ratio=100., + scaling_ratio_range=(1 - _base_.affine_scale, + 1 + _base_.affine_scale), + # img_scale is (width, height) + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=_base_.min_area_ratio, + use_mask_refine=_base_.use_mask2refine) +] +train_pipeline = [ + *_base_.pre_transform, + *mosaic_affine_transform, + dict( + type='YOLOv5MultiModalMixUp', + prob=_base_.mixup_prob, + pre_transform=[*_base_.pre_transform, + *mosaic_affine_transform]), + *_base_.last_transform[:-1], + *text_transform +] +train_pipeline_stage2 = [ + *_base_.train_pipeline_stage2[:-1], + *text_transform +] +coco_train_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict( + type='YOLOv5CocoDataset', + data_root='data/coco', + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/captions/coco_class_captions.json', + pipeline=train_pipeline) +train_dataloader = dict( + persistent_workers=persistent_workers, + batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=coco_train_dataset) +test_pipeline = [ + *_base_.test_pipeline[:-1], + dict(type='LoadTextFixed'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'texts')) +] +coco_val_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict( + type='YOLOv5CocoDataset', + data_root='data/coco', + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/captions/coco_class_captions.json', + pipeline=test_pipeline) +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader +# training settings +default_hooks = dict( + param_scheduler=dict( + scheduler_type='linear', + lr_factor=0.01, + max_epochs=max_epochs), + checkpoint=dict( + max_keep_ckpts=-1, + save_best=None, + interval=save_epoch_intervals)) +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict( + max_epochs=max_epochs, + val_interval=5, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict( + optimizer=dict( + _delete_=True, + type='AdamW', + lr=base_lr, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + paramwise_cfg=dict( + bias_decay_mult=0.0, + norm_decay_mult=0.0, + custom_keys={'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0)}), + constructor='YOLOWv5OptimizerConstructor') + +# evaluation settings +val_evaluator = dict( + _delete_=True, + type='mmdet.CocoMetric', + proposal_nums=(100, 1, 10), + ann_file='data/coco/annotations/instances_val2017.json', + 
metric='bbox') + +test_evaluator = val_evaluator diff --git a/configs/finetune_coco/yolo_world_s_t2i_bn_2e-4_100e_4x8gpus_coco_finetune.py b/configs/finetune_coco/yolo_world_s_t2i_bn_2e-4_100e_4x8gpus_coco_finetune.py new file mode 100644 index 0000000000000000000000000000000000000000..b929db6f9771299f6a6765a7cfa4403ded358a50 --- /dev/null +++ b/configs/finetune_coco/yolo_world_s_t2i_bn_2e-4_100e_4x8gpus_coco_finetune.py @@ -0,0 +1,183 @@ +_base_ = ('../../third_party/mmyolo/configs/yolov8/' + 'yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco.py') +custom_imports = dict(imports=['yolo_world'], + allow_failed_imports=False) + +# hyper-parameters +num_classes = 80 +num_training_classes = 80 +max_epochs = 80 # Maximum training epochs +close_mosaic_epochs = 10 +save_epoch_intervals = 5 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 2e-4 +weight_decay = 0.05 +train_batch_size_per_gpu = 16 +load_from = 'weights/yolow-v8_s_clipv2_frozen_t2iv2_bn_o365_goldg_pretrain.pth' +persistent_workers = False + + +# model settings +model = dict( + type='YOLOWorldDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), + backbone=dict( + _delete_=True, + type='MultiModalYOLOBackbone', + image_model={{_base_.model.backbone}}, + text_model=dict( + type='HuggingCLIPLanguageBackbone', + model_name='pretrained_models/clip-vit-base-patch32-projection', + frozen_modules=['all'])), + neck=dict(type='YOLOWorldPAFPN', + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + num_csp_blocks=2), + bbox_head=dict(type='YOLOWorldHead', + head_module=dict(type='YOLOWorldHeadModule', + embed_dims=text_channels, + use_bn_head=True, + num_classes=num_training_classes)), + train_cfg=dict(assigner=dict(num_classes=num_training_classes))) + +# dataset settings +text_transform = [ + dict(type='RandomLoadText', + num_neg_samples=(num_classes, num_classes), + max_num_samples=num_training_classes, + padding_to_max=True, + padding_value=''), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction', 'texts')) +] +mosaic_affine_transform = [ + dict( + type='MultiModalMosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict(type='YOLOv5CopyPaste', prob=_base_.copypaste_prob), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_aspect_ratio=100., + scaling_ratio_range=(1 - _base_.affine_scale, + 1 + _base_.affine_scale), + # img_scale is (width, height) + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=_base_.min_area_ratio, + use_mask_refine=_base_.use_mask2refine) +] +train_pipeline = [ + *_base_.pre_transform, + *mosaic_affine_transform, + dict( + type='YOLOv5MultiModalMixUp', + prob=_base_.mixup_prob, + pre_transform=[*_base_.pre_transform, + *mosaic_affine_transform]), + *_base_.last_transform[:-1], + *text_transform +] +train_pipeline_stage2 = [ + *_base_.train_pipeline_stage2[:-1], + *text_transform +] +coco_train_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict( + type='YOLOv5CocoDataset', + data_root='data/coco', + ann_file='annotations/instances_train2017.json', + 
data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/captions/coco_class_captions.json', + pipeline=train_pipeline) +train_dataloader = dict( + persistent_workers=persistent_workers, + batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=coco_train_dataset) +test_pipeline = [ + *_base_.test_pipeline[:-1], + dict(type='LoadTextFixed'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'texts')) +] +coco_val_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict( + type='YOLOv5CocoDataset', + data_root='data/coco', + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/captions/coco_class_captions.json', + pipeline=test_pipeline) +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader +# training settings +default_hooks = dict( + param_scheduler=dict( + scheduler_type='linear', + lr_factor=0.01, + max_epochs=max_epochs), + checkpoint=dict( + max_keep_ckpts=-1, + save_best=None, + interval=save_epoch_intervals)) +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict( + max_epochs=max_epochs, + val_interval=5, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict( + optimizer=dict( + _delete_=True, + type='AdamW', + lr=base_lr, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + paramwise_cfg=dict( + bias_decay_mult=0.0, + norm_decay_mult=0.0, + custom_keys={'backbone.text_model': dict(lr_mult=0.01), + 'logit_scale': dict(weight_decay=0.0)}), + constructor='YOLOWv5OptimizerConstructor') + +# evaluation settings +val_evaluator = dict( + _delete_=True, + type='mmdet.CocoMetric', + proposal_nums=(100, 1, 10), + ann_file='data/coco/annotations/instances_val2017.json', + metric='bbox') + +test_evaluator = val_evaluator diff --git a/configs/pretrain/yolo_world_l_dual_3block_l2norm_2e-4_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py b/configs/pretrain/yolo_world_l_dual_3block_l2norm_2e-4_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py new file mode 100644 index 0000000000000000000000000000000000000000..7ca8ce851ccb5711251e0e8840516a0aa8e67d59 --- /dev/null +++ b/configs/pretrain/yolo_world_l_dual_3block_l2norm_2e-4_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py @@ -0,0 +1,173 @@ +_base_ = ('../../third_party/mmyolo/configs/yolov8/' + 'yolov8_l_syncbn_fast_8xb16-500e_coco.py') +custom_imports = dict(imports=['yolo_world'], + allow_failed_imports=False) + +# hyper-parameters +num_classes = 1203 +num_training_classes = 80 +max_epochs = 100 # Maximum training epochs +close_mosaic_epochs = 2 +save_epoch_intervals = 2 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 2e-3 +weight_decay = 0.05 / 2 +train_batch_size_per_gpu = 16 + +# model settings +model = dict( + type='YOLOWorldDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + 
data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), + backbone=dict( + _delete_=True, + type='MultiModalYOLOBackbone', + image_model={{_base_.model.backbone}}, + text_model=dict( + type='HuggingCLIPLanguageBackbone', + model_name='pretrained_models/clip-vit-base-patch32-projection', + frozen_modules=['all'])), + neck=dict(type='YOLOWolrdDualPAFPN', + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + text_enhancder=dict(type='ImagePoolingAttentionModule', + embed_channels=256, + num_heads=8)), + bbox_head=dict(type='YOLOWorldHead', + head_module=dict(type='YOLOWorldHeadModule', + embed_dims=text_channels, + num_classes=num_training_classes)), + train_cfg=dict(assigner=dict(num_classes=num_training_classes))) + +# dataset settings +text_transform = [ + dict(type='RandomLoadText', + num_neg_samples=(num_classes, num_classes), + max_num_samples=num_training_classes, + padding_to_max=True, + padding_value=''), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction', 'texts')) +] +train_pipeline = [ + *_base_.pre_transform, + dict(type='MultiModalMosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114)), + *_base_.last_transform[:-1], + *text_transform, +] +train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *text_transform] +obj365v1_train_dataset = dict( + type='MultiModalDataset', + dataset=dict( + type='YOLOv5Objects365V1Dataset', + data_root='data/objects365v1/', + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/captions/obj365v1_class_captions.json', + pipeline=train_pipeline) + +mg_train_dataset = dict(type='YOLOv5MixedGroundingDataset', + data_root='data/mixed_grounding/', + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline) + +flickr_train_dataset = dict( + type='YOLOv5MixedGroundingDataset', + data_root='data/flickr/', + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline) + +train_dataloader = dict(batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=dict(_delete_=True, + type='ConcatDataset', + datasets=[ + obj365v1_train_dataset, + flickr_train_dataset, mg_train_dataset + ], + ignore_keys=['classes', 'palette'])) + +test_pipeline = [ + *_base_.test_pipeline[:-1], + dict(type='LoadText'), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'texts')) +] +coco_val_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict(type='YOLOv5LVISV1Dataset', + data_root='data/coco/', + test_mode=True, + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + data_prefix=dict(img=''), + batch_shapes_cfg=None), + class_text_path='data/captions/lvis_v1_class_captions.json', + pipeline=test_pipeline) 
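+# NOTE: zero-shot evaluation loads the full LVIS v1 vocabulary (1203 category
+# captions) via LoadText, whereas training samples at most
+# `num_training_classes` (80) text prompts per image with RandomLoadText above.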
+val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader + +val_evaluator = dict(type='mmdet.LVISMetric', + ann_file='data/coco/lvis/\ + lvis_v1_minival_inserted_image_name.json', + metric='bbox') +test_evaluator = val_evaluator + +# training settings +default_hooks = dict(param_scheduler=dict(max_epochs=max_epochs), + checkpoint=dict(interval=save_epoch_intervals, + rule='greater')) +custom_hooks = [ + dict(type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict(type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict(max_epochs=max_epochs, + val_interval=10, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict(optimizer=dict( + _delete_=True, + type='AdamW', + lr=base_lr, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + paramwise_cfg=dict(bias_decay_mult=0.0, + norm_decay_mult=0.0, + custom_keys={ + 'backbone.text_model': + dict(lr_mult=0.01), + 'logit_scale': + dict(weight_decay=0.0) + }), + constructor='YOLOWv5OptimizerConstructor') diff --git a/configs/pretrain/yolo_world_l_t2i_bn_2e-4_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py b/configs/pretrain/yolo_world_l_t2i_bn_2e-4_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py new file mode 100644 index 0000000000000000000000000000000000000000..84a80fc742ed5b108528aa8004b6631d1ad1bbf4 --- /dev/null +++ b/configs/pretrain/yolo_world_l_t2i_bn_2e-4_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py @@ -0,0 +1,182 @@ +_base_ = ('../../third_party/mmyolo/configs/yolov8/' + 'yolov8_l_syncbn_fast_8xb16-500e_coco.py') +custom_imports = dict(imports=['yolo_world'], + allow_failed_imports=False) + +# hyper-parameters +num_classes = 1203 +num_training_classes = 80 +max_epochs = 100 # Maximum training epochs +close_mosaic_epochs = 2 +save_epoch_intervals = 2 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 2e-3 +weight_decay = 0.05 / 2 +train_batch_size_per_gpu = 16 + +# model settings +model = dict( + type='YOLOWorldDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), + backbone=dict( + _delete_=True, + type='MultiModalYOLOBackbone', + image_model={{_base_.model.backbone}}, + text_model=dict( + type='HuggingCLIPLanguageBackbone', + model_name='openai/clip-vit-base-patch32', + frozen_modules=['all'])), + neck=dict(type='YOLOWorldPAFPN', + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + num_csp_blocks=2), + bbox_head=dict(type='YOLOWorldHead', + head_module=dict(type='YOLOWorldHeadModule', + embed_dims=text_channels, + use_bn_head=True, + num_classes=num_training_classes)), + train_cfg=dict(assigner=dict(num_classes=num_training_classes))) + +# dataset settings +text_transform = [ + dict(type='RandomLoadText', + num_neg_samples=(num_classes, num_classes), + max_num_samples=num_training_classes, + padding_to_max=True, + padding_value=''), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction', 'texts')) +] +train_pipeline = [ + *_base_.pre_transform, + dict(type='MultiModalMosaic', 
+ img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114)), + *_base_.last_transform[:-1], + *text_transform, +] +train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *text_transform] +obj365v1_train_dataset = dict( + type='MultiModalDataset', + dataset=dict( + type='YOLOv5Objects365V1Dataset', + data_root='data/objects365v1/', + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/captions/obj365v1_class_captions.json', + pipeline=train_pipeline) + +mg_train_dataset = dict( + type='YOLOv5MixedGroundingDataset', + data_root='data/mixed_grounding/', + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline) + +flickr_train_dataset = dict( + type='YOLOv5MixedGroundingDataset', + data_root='data/flickr/', + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='images/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline) + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + _delete_=True, + type='ConcatDataset', + datasets=[ + obj365v1_train_dataset, + flickr_train_dataset, + mg_train_dataset + ], + ignore_keys=['classes', 'palette'])) + +test_pipeline = [ + *_base_.test_pipeline[:-1], + dict(type='LoadText'), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'texts')) +] +coco_val_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict( + type='YOLOv5LVISV1Dataset', + data_root='data/lvis/', + test_mode=True, + ann_file='annotations/' + 'lvis_v1_minival_inserted_image_name.json', + data_prefix=dict(img=''), + batch_shapes_cfg=None), + class_text_path='data/captions/lvis_v1_class_captions.json', + pipeline=test_pipeline) +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='mmdet.LVISMetric', + ann_file='data/lvis/annotations/' + 'lvis_v1_minival_inserted_image_name.json', + metric='bbox') +test_evaluator = val_evaluator + +# training settings +default_hooks = dict( + param_scheduler=dict(max_epochs=max_epochs), + checkpoint=dict(interval=save_epoch_intervals, + rule='greater')) +custom_hooks = [ + dict(type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict(type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict( + max_epochs=max_epochs, + val_interval=10, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict(optimizer=dict( + _delete_=True, + type='AdamW', + lr=base_lr, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + paramwise_cfg=dict( + bias_decay_mult=0.0, + norm_decay_mult=0.0, + custom_keys={ + 'backbone.text_model': + dict(lr_mult=0.01), + 'logit_scale': + dict(weight_decay=0.0) + }), + 
constructor='YOLOWv5OptimizerConstructor') diff --git a/configs/pretrain/yolo_world_m_dual_3block_l2norm_2e-4_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py b/configs/pretrain/yolo_world_m_dual_3block_l2norm_2e-4_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py new file mode 100644 index 0000000000000000000000000000000000000000..ca03fc28cea8d018c284e97a3373b2284fa033da --- /dev/null +++ b/configs/pretrain/yolo_world_m_dual_3block_l2norm_2e-4_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py @@ -0,0 +1,173 @@ +_base_ = ('../../third_party/mmyolo/configs/yolov8/' + 'yolov8_m_syncbn_fast_8xb16-500e_coco.py') +custom_imports = dict(imports=['yolo_world'], + allow_failed_imports=False) + +# hyper-parameters +num_classes = 1203 +num_training_classes = 80 +max_epochs = 100 # Maximum training epochs +close_mosaic_epochs = 2 +save_epoch_intervals = 2 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 2e-3 +weight_decay = 0.05 / 2 +train_batch_size_per_gpu = 16 + +# model settings +model = dict( + type='YOLOWorldDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), + backbone=dict( + _delete_=True, + type='MultiModalYOLOBackbone', + image_model={{_base_.model.backbone}}, + text_model=dict( + type='HuggingCLIPLanguageBackbone', + model_name='pretrained_models/clip-vit-base-patch32-projection', + frozen_modules=['all'])), + neck=dict(type='YOLOWolrdDualPAFPN', + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + text_enhancder=dict(type='ImagePoolingAttentionModule', + embed_channels=256, + num_heads=8)), + bbox_head=dict(type='YOLOWorldHead', + head_module=dict(type='YOLOWorldHeadModule', + embed_dims=text_channels, + num_classes=num_training_classes)), + train_cfg=dict(assigner=dict(num_classes=num_training_classes))) + +# dataset settings +text_transform = [ + dict(type='RandomLoadText', + num_neg_samples=(num_classes, num_classes), + max_num_samples=num_training_classes, + padding_to_max=True, + padding_value=''), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction', 'texts')) +] +train_pipeline = [ + *_base_.pre_transform, + dict(type='MultiModalMosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114)), + *_base_.last_transform[:-1], + *text_transform, +] +train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *text_transform] +obj365v1_train_dataset = dict( + type='MultiModalDataset', + dataset=dict( + type='YOLOv5Objects365V1Dataset', + data_root='data/objects365v1/', + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/captions/obj365v1_class_captions.json', + pipeline=train_pipeline) + +mg_train_dataset = dict(type='YOLOv5MixedGroundingDataset', + data_root='data/mixed_grounding/', + ann_file='annotations/final_mixed_train_no_coco.json', + 
data_prefix=dict(img='gqa/images/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline) + +flickr_train_dataset = dict( + type='YOLOv5MixedGroundingDataset', + data_root='data/flickr/', + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline) + +train_dataloader = dict(batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=dict(_delete_=True, + type='ConcatDataset', + datasets=[ + obj365v1_train_dataset, + flickr_train_dataset, mg_train_dataset + ], + ignore_keys=['classes', 'palette'])) + +test_pipeline = [ + *_base_.test_pipeline[:-1], + dict(type='LoadText'), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'texts')) +] +coco_val_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict(type='YOLOv5LVISV1Dataset', + data_root='data/coco/', + test_mode=True, + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + data_prefix=dict(img=''), + batch_shapes_cfg=None), + class_text_path='data/captions/lvis_v1_class_captions.json', + pipeline=test_pipeline) +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader + +val_evaluator = dict(type='mmdet.LVISMetric', + ann_file='data/coco/lvis/\ + lvis_v1_minival_inserted_image_name.json', + metric='bbox') +test_evaluator = val_evaluator + +# training settings +default_hooks = dict(param_scheduler=dict(max_epochs=max_epochs), + checkpoint=dict(interval=save_epoch_intervals, + rule='greater')) +custom_hooks = [ + dict(type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict(type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict(max_epochs=max_epochs, + val_interval=10, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict(optimizer=dict( + _delete_=True, + type='AdamW', + lr=base_lr, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + paramwise_cfg=dict(bias_decay_mult=0.0, + norm_decay_mult=0.0, + custom_keys={ + 'backbone.text_model': + dict(lr_mult=0.01), + 'logit_scale': + dict(weight_decay=0.0) + }), + constructor='YOLOWv5OptimizerConstructor') diff --git a/configs/pretrain/yolo_world_m_t2i_bn_2e-4_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py b/configs/pretrain/yolo_world_m_t2i_bn_2e-4_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py new file mode 100644 index 0000000000000000000000000000000000000000..c3aa4ba98e94b6cd2b2af79801cba8927b215926 --- /dev/null +++ b/configs/pretrain/yolo_world_m_t2i_bn_2e-4_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py @@ -0,0 +1,171 @@ +_base_ = ('../../third_party/mmyolo/configs/yolov8/' + 'yolov8_m_syncbn_fast_8xb16-500e_coco.py') +custom_imports = dict(imports=['yolo_world'], + allow_failed_imports=False) + +# hyper-parameters +num_classes = 1203 +num_training_classes = 80 +max_epochs = 100 # Maximum training epochs +close_mosaic_epochs = 2 +save_epoch_intervals = 2 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 2e-3 +weight_decay = 0.05 / 2 +train_batch_size_per_gpu = 16 + +# model settings +model = dict( + 
type='YOLOWorldDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), + backbone=dict( + _delete_=True, + type='MultiModalYOLOBackbone', + image_model={{_base_.model.backbone}}, + text_model=dict( + type='HuggingCLIPLanguageBackbone', + model_name='pretrained_models/clip-vit-base-patch32-projection', + frozen_modules=['all'])), + neck=dict(type='YOLOWorldPAFPN', + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + num_csp_blocks=2), + bbox_head=dict(type='YOLOWorldHead', + head_module=dict(type='YOLOWorldHeadModule', + embed_dims=text_channels, + use_bn_head=True, + num_classes=num_training_classes)), + train_cfg=dict(assigner=dict(num_classes=num_training_classes))) + +# dataset settings +text_transform = [ + dict(type='RandomLoadText', + num_neg_samples=(num_classes, num_classes), + max_num_samples=num_training_classes, + padding_to_max=True, + padding_value=''), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction', 'texts')) +] +train_pipeline = [ + *_base_.pre_transform, + dict(type='MultiModalMosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114)), + *_base_.last_transform[:-1], + *text_transform, +] +train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *text_transform] +obj365v1_train_dataset = dict( + type='MultiModalDataset', + dataset=dict( + type='YOLOv5Objects365V1Dataset', + data_root='data/objects365v1/', + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/captions/obj365v1_class_captions.json', + pipeline=train_pipeline) + +mg_train_dataset = dict(type='YOLOv5MixedGroundingDataset', + data_root='data/mixed_grounding/', + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline) + +flickr_train_dataset = dict( + type='YOLOv5MixedGroundingDataset', + data_root='data/flickr/', + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline) + +train_dataloader = dict(batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=dict(_delete_=True, + type='ConcatDataset', + datasets=[ + obj365v1_train_dataset, + flickr_train_dataset, mg_train_dataset + ], + ignore_keys=['classes', 'palette'])) + +test_pipeline = [ + *_base_.test_pipeline[:-1], + dict(type='LoadText'), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'texts')) +] +coco_val_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict(type='YOLOv5LVISV1Dataset', + data_root='data/coco/', + test_mode=True, + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + data_prefix=dict(img=''), + batch_shapes_cfg=None), + 
class_text_path='data/captions/lvis_v1_class_captions.json', + pipeline=test_pipeline) +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader + +val_evaluator = dict(type='mmdet.LVISMetric', + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox') +test_evaluator = val_evaluator + +# training settings +default_hooks = dict(param_scheduler=dict(max_epochs=max_epochs), + checkpoint=dict(interval=save_epoch_intervals, + rule='greater')) +custom_hooks = [ + dict(type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict(type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict(max_epochs=max_epochs, + val_interval=10, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict(optimizer=dict( + _delete_=True, + type='AdamW', + lr=base_lr, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + paramwise_cfg=dict(bias_decay_mult=0.0, + norm_decay_mult=0.0, + custom_keys={ + 'backbone.text_model': + dict(lr_mult=0.01), + 'logit_scale': + dict(weight_decay=0.0) + }), + constructor='YOLOWv5OptimizerConstructor') diff --git a/configs/pretrain/yolo_world_s_dual_l2norm_3block_2e-4_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py b/configs/pretrain/yolo_world_s_dual_l2norm_3block_2e-4_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py new file mode 100644 index 0000000000000000000000000000000000000000..72621d6c405e1fbc9ba6bbf72ed8f226725ef24a --- /dev/null +++ b/configs/pretrain/yolo_world_s_dual_l2norm_3block_2e-4_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py @@ -0,0 +1,173 @@ +_base_ = ('../../third_party/mmyolo/configs/yolov8/' + 'yolov8_s_syncbn_fast_8xb16-500e_coco.py') +custom_imports = dict(imports=['yolo_world'], + allow_failed_imports=False) + +# hyper-parameters +num_classes = 1203 +num_training_classes = 80 +max_epochs = 100 # Maximum training epochs +close_mosaic_epochs = 2 +save_epoch_intervals = 2 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 2e-3 +weight_decay = 0.05 / 2 +train_batch_size_per_gpu = 16 + +# model settings +model = dict( + type='YOLOWorldDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), + backbone=dict( + _delete_=True, + type='MultiModalYOLOBackbone', + image_model={{_base_.model.backbone}}, + text_model=dict( + type='HuggingCLIPLanguageBackbone', + model_name='pretrained_models/clip-vit-base-patch32-projection', + frozen_modules=['all'])), + neck=dict(type='YOLOWolrdDualPAFPN', + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + text_enhancder=dict(type='ImagePoolingAttentionModule', + embed_channels=256, + num_heads=8)), + bbox_head=dict(type='YOLOWorldHead', + head_module=dict(type='YOLOWorldHeadModule', + embed_dims=text_channels, + num_classes=num_training_classes)), + train_cfg=dict(assigner=dict(num_classes=num_training_classes))) + +# dataset settings +text_transform = [ + dict(type='RandomLoadText', + num_neg_samples=(num_classes, num_classes), + max_num_samples=num_training_classes, + padding_to_max=True, + padding_value=''), + 
dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction', 'texts')) +] +train_pipeline = [ + *_base_.pre_transform, + dict(type='MultiModalMosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114)), + *_base_.last_transform[:-1], + *text_transform, +] +train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *text_transform] +obj365v1_train_dataset = dict( + type='MultiModalDataset', + dataset=dict( + type='YOLOv5Objects365V1Dataset', + data_root='data/objects365v1/', + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/captions/obj365v1_class_captions.json', + pipeline=train_pipeline) + +mg_train_dataset = dict(type='YOLOv5MixedGroundingDataset', + data_root='data/mixed_grounding/', + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline) + +flickr_train_dataset = dict( + type='YOLOv5MixedGroundingDataset', + data_root='data/flickr/', + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline) + +train_dataloader = dict(batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=dict(_delete_=True, + type='ConcatDataset', + datasets=[ + obj365v1_train_dataset, + flickr_train_dataset, mg_train_dataset + ], + ignore_keys=['classes', 'palette'])) + +test_pipeline = [ + *_base_.test_pipeline[:-1], + dict(type='LoadText'), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'texts')) +] +coco_val_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict(type='YOLOv5LVISV1Dataset', + data_root='data/coco/', + test_mode=True, + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + data_prefix=dict(img=''), + batch_shapes_cfg=None), + class_text_path='data/captions/lvis_v1_class_captions.json', + pipeline=test_pipeline) +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader + +val_evaluator = dict(type='mmdet.LVISMetric', + ann_file='data/coco/lvis/\ + lvis_v1_minival_inserted_image_name.json', + metric='bbox') +test_evaluator = val_evaluator + +# training settings +default_hooks = dict(param_scheduler=dict(max_epochs=max_epochs), + checkpoint=dict(interval=save_epoch_intervals, + rule='greater')) +custom_hooks = [ + dict(type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict(type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict(max_epochs=max_epochs, + val_interval=10, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict(optimizer=dict( + _delete_=True, + type='AdamW', + lr=base_lr, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + 
paramwise_cfg=dict(bias_decay_mult=0.0, + norm_decay_mult=0.0, + custom_keys={ + 'backbone.text_model': + dict(lr_mult=0.01), + 'logit_scale': + dict(weight_decay=0.0) + }), + constructor='YOLOWv5OptimizerConstructor') diff --git a/configs/pretrain/yolo_world_s_t2i_bn_2e-4_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py b/configs/pretrain/yolo_world_s_t2i_bn_2e-4_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py new file mode 100644 index 0000000000000000000000000000000000000000..870ba1df9b79511068dd9f0573cc1cfd8fd13f3c --- /dev/null +++ b/configs/pretrain/yolo_world_s_t2i_bn_2e-4_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py @@ -0,0 +1,172 @@ +_base_ = ('../../third_party/mmyolo/configs/yolov8/' + 'yolov8_s_syncbn_fast_8xb16-500e_coco.py') +custom_imports = dict(imports=['yolo_world'], + allow_failed_imports=False) + +# hyper-parameters +num_classes = 1203 +num_training_classes = 80 +max_epochs = 100 # Maximum training epochs +close_mosaic_epochs = 2 +save_epoch_intervals = 2 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 2e-3 +# for 4 nodes, 8 gpus per node, 32 total gpus +weight_decay = 0.05 / 2 +train_batch_size_per_gpu = 16 + +# model settings +model = dict( + type='YOLOWorldDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), + backbone=dict( + _delete_=True, + type='MultiModalYOLOBackbone', + image_model={{_base_.model.backbone}}, + text_model=dict( + type='HuggingCLIPLanguageBackbone', + model_name='pretrained_models/clip-vit-base-patch32-projection', + frozen_modules=['all'])), + neck=dict(type='YOLOWorldPAFPN', + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + num_csp_blocks=2), + bbox_head=dict(type='YOLOWorldHead', + head_module=dict(type='YOLOWorldHeadModule', + embed_dims=text_channels, + use_bn_head=True, + num_classes=num_training_classes)), + train_cfg=dict(assigner=dict(num_classes=num_training_classes))) + +# dataset settings +text_transform = [ + dict(type='RandomLoadText', + num_neg_samples=(num_classes, num_classes), + max_num_samples=num_training_classes, + padding_to_max=True, + padding_value=''), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction', 'texts')) +] +train_pipeline = [ + *_base_.pre_transform, + dict(type='MultiModalMosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114)), + *_base_.last_transform[:-1], + *text_transform, +] +train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *text_transform] +obj365v1_train_dataset = dict( + type='MultiModalDataset', + dataset=dict( + type='YOLOv5Objects365V1Dataset', + data_root='data/objects365v1/', + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/captions/obj365v1_class_captions.json', + pipeline=train_pipeline) + +mg_train_dataset = 
dict(type='YOLOv5MixedGroundingDataset', + data_root='data/mixed_grounding/', + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline) + +flickr_train_dataset = dict( + type='YOLOv5MixedGroundingDataset', + data_root='data/flickr/', + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='full_images/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline) + +train_dataloader = dict(batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=dict(_delete_=True, + type='ConcatDataset', + datasets=[ + obj365v1_train_dataset, + flickr_train_dataset, mg_train_dataset + ], + ignore_keys=['classes', 'palette'])) + +test_pipeline = [ + *_base_.test_pipeline[:-1], + dict(type='LoadText'), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'texts')) +] +coco_val_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict(type='YOLOv5LVISV1Dataset', + data_root='data/coco/', + test_mode=True, + ann_file='lvis/lvis_v1_minival_inserted_image_name.json', + data_prefix=dict(img=''), + batch_shapes_cfg=None), + class_text_path='data/captions/lvis_v1_class_captions.json', + pipeline=test_pipeline) +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader + +val_evaluator = dict(type='mmdet.LVISMetric', + ann_file='data/coco/lvis/lvis_v1_minival_inserted_image_name.json', + metric='bbox') +test_evaluator = val_evaluator + +# training settings +default_hooks = dict(param_scheduler=dict(max_epochs=max_epochs), + checkpoint=dict(interval=save_epoch_intervals, + rule='greater')) +custom_hooks = [ + dict(type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict(type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict(max_epochs=max_epochs, + val_interval=10, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict(optimizer=dict( + _delete_=True, + type='AdamW', + lr=base_lr, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + paramwise_cfg=dict(bias_decay_mult=0.0, + norm_decay_mult=0.0, + custom_keys={ + 'backbone.text_model': + dict(lr_mult=0.01), + 'logit_scale': + dict(weight_decay=0.0) + }), + constructor='YOLOWv5OptimizerConstructor') diff --git a/configs/scaleup/yolo_world_l_t2i_bn_2e-4_20e_4x8gpus_obj365v1_goldg_train_lvis_minival_s1024.py b/configs/scaleup/yolo_world_l_t2i_bn_2e-4_20e_4x8gpus_obj365v1_goldg_train_lvis_minival_s1024.py new file mode 100644 index 0000000000000000000000000000000000000000..f6990618b82b2d3a67337b32af641c6e19794dfb --- /dev/null +++ b/configs/scaleup/yolo_world_l_t2i_bn_2e-4_20e_4x8gpus_obj365v1_goldg_train_lvis_minival_s1024.py @@ -0,0 +1,216 @@ +_base_ = ('../../third_party/mmyolo/configs/yolov8/' + 'yolov8_l_syncbn_fast_8xb16-500e_coco.py') +custom_imports = dict(imports=['yolo_world'], + allow_failed_imports=False) + +# hyper-parameters +num_classes = 1203 +num_training_classes = 80 +max_epochs = 20 # Maximum training epochs +close_mosaic_epochs = 2 +save_epoch_intervals = 2 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, 
_base_.last_stage_out_channels // 2 // 32] +base_lr = 2e-4 +weight_decay = 0.05 / 2 +train_batch_size_per_gpu = 8 +img_scale = (1024, 1024) +load_from = 'work_dirs/model_zoo/yolow-v8_l_clipv2_frozen_t2iv2_bn_o365_goldg_pretrain.pth' # noqa + +# model settings +model = dict( + type='YOLOWorldDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), + backbone=dict( + _delete_=True, + type='MultiModalYOLOBackbone', + image_model={{_base_.model.backbone}}, + text_model=dict( + type='HuggingCLIPLanguageBackbone', + model_name='openai/clip-vit-base-patch32', + frozen_modules=['all'])), + neck=dict(type='YOLOWorldPAFPN', + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + num_csp_blocks=2), + bbox_head=dict(type='YOLOWorldHead', + head_module=dict(type='YOLOWorldHeadModule', + embed_dims=text_channels, + use_bn_head=True, + num_classes=num_training_classes)), + train_cfg=dict(assigner=dict(num_classes=num_training_classes))) + +# dataset settings +text_transform = [ + dict(type='RandomLoadText', + num_neg_samples=(num_classes, num_classes), + max_num_samples=num_training_classes, + padding_to_max=True, + padding_value=''), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction', 'texts')) +] +mosaic_affine_transform = [ + dict(type='MultiModalMosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)) +] +train_pipeline = [ + *_base_.pre_transform, + *mosaic_affine_transform, + dict( + type='YOLOv5MultiModalMixUp', + prob=_base_.mixup_prob, + pre_transform=[*_base_.pre_transform, + *mosaic_affine_transform]), + *_base_.last_transform[:-1], + *text_transform, +] +train_pipeline_stage2 = [ + *_base_.pre_transform, + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=True, + pad_val=dict(img=114.0)), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + border_val=(114, 114, 114)), + *_base_.last_transform[:-1], + *text_transform, +] +obj365v1_train_dataset = dict( + type='MultiModalDataset', + dataset=dict( + type='YOLOv5Objects365V1Dataset', + data_root='data/objects365v1/', + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/captions/obj365v1_class_captions.json', + pipeline=train_pipeline) + +mg_train_dataset = dict( + type='YOLOv5MixedGroundingDataset', + data_root='data/mixed_grounding/', + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline) + +flickr_train_dataset = dict( + type='YOLOv5MixedGroundingDataset', + data_root='data/flickr/', + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='images/'), + filter_cfg=dict(filter_empty_gt=True, 
min_size=32), + pipeline=train_pipeline) + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + _delete_=True, + type='ConcatDataset', + datasets=[ + obj365v1_train_dataset, + flickr_train_dataset, + mg_train_dataset + ], + ignore_keys=['classes', 'palette'])) + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=False, + pad_val=dict(img=114)), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict(type='LoadText'), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'texts')) +] +coco_val_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict( + type='YOLOv5LVISV1Dataset', + data_root='data/lvis/', + test_mode=True, + ann_file='annotations/' + 'lvis_v1_minival_inserted_image_name.json', + data_prefix=dict(img=''), + batch_shapes_cfg=None), + class_text_path='data/captions/lvis_v1_class_captions.json', + pipeline=test_pipeline) +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='mmdet.LVISMetric', + ann_file='data/lvis/annotations/' + 'lvis_v1_minival_inserted_image_name.json', + metric='bbox') +test_evaluator = val_evaluator + +# training settings +default_hooks = dict( + param_scheduler=dict(max_epochs=max_epochs), + checkpoint=dict(interval=save_epoch_intervals, + rule='greater')) +custom_hooks = [ + dict(type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict(type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict( + max_epochs=max_epochs, + val_interval=10, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict(optimizer=dict( + _delete_=True, + type='AdamW', + lr=base_lr, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + paramwise_cfg=dict( + bias_decay_mult=0.0, + norm_decay_mult=0.0, + custom_keys={ + 'backbone.text_model': + dict(lr_mult=0.0), + 'logit_scale': + dict(weight_decay=0.0) + }), + constructor='YOLOWv5OptimizerConstructor') diff --git a/configs/scaleup/yolo_world_l_t2i_bn_2e-4_20e_4x8gpus_obj365v1_goldg_train_lvis_minival_s1280.py b/configs/scaleup/yolo_world_l_t2i_bn_2e-4_20e_4x8gpus_obj365v1_goldg_train_lvis_minival_s1280.py new file mode 100644 index 0000000000000000000000000000000000000000..98b67a89e8ea5c9056ef41e1245e55b0c596f6d9 --- /dev/null +++ b/configs/scaleup/yolo_world_l_t2i_bn_2e-4_20e_4x8gpus_obj365v1_goldg_train_lvis_minival_s1280.py @@ -0,0 +1,216 @@ +_base_ = ('../../third_party/mmyolo/configs/yolov8/' + 'yolov8_l_syncbn_fast_8xb16-500e_coco.py') +custom_imports = dict(imports=['yolo_world'], + allow_failed_imports=False) + +# hyper-parameters +num_classes = 1203 +num_training_classes = 80 +max_epochs = 20 # Maximum training epochs +close_mosaic_epochs = 2 +save_epoch_intervals = 2 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 2e-4 +weight_decay = 0.05 / 2 +train_batch_size_per_gpu = 4 +img_scale = (1280, 1280) +load_from = 
'work_dirs/model_zoo/yolow-v8_l_clipv2_frozen_t2iv2_bn_o365_goldg_pretrain.pth' # noqa + +# model settings +model = dict( + type='YOLOWorldDetector', + mm_neck=True, + num_train_classes=num_training_classes, + num_test_classes=num_classes, + data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), + backbone=dict( + _delete_=True, + type='MultiModalYOLOBackbone', + image_model={{_base_.model.backbone}}, + text_model=dict( + type='HuggingCLIPLanguageBackbone', + model_name='openai/clip-vit-base-patch32', + frozen_modules=['all'])), + neck=dict(type='YOLOWorldPAFPN', + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + num_csp_blocks=2), + bbox_head=dict(type='YOLOWorldHead', + head_module=dict(type='YOLOWorldHeadModule', + embed_dims=text_channels, + use_bn_head=True, + num_classes=num_training_classes)), + train_cfg=dict(assigner=dict(num_classes=num_training_classes))) + +# dataset settings +text_transform = [ + dict(type='RandomLoadText', + num_neg_samples=(num_classes, num_classes), + max_num_samples=num_training_classes, + padding_to_max=True, + padding_value=''), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction', 'texts')) +] +mosaic_affine_transform = [ + dict(type='MultiModalMosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)) +] +train_pipeline = [ + *_base_.pre_transform, + *mosaic_affine_transform, + dict( + type='YOLOv5MultiModalMixUp', + prob=_base_.mixup_prob, + pre_transform=[*_base_.pre_transform, + *mosaic_affine_transform]), + *_base_.last_transform[:-1], + *text_transform, +] +train_pipeline_stage2 = [ + *_base_.pre_transform, + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=True, + pad_val=dict(img=114.0)), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + border_val=(114, 114, 114)), + *_base_.last_transform[:-1], + *text_transform, +] +obj365v1_train_dataset = dict( + type='MultiModalDataset', + dataset=dict( + type='YOLOv5Objects365V1Dataset', + data_root='data/objects365v1/', + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/captions/obj365v1_class_captions.json', + pipeline=train_pipeline) + +mg_train_dataset = dict( + type='YOLOv5MixedGroundingDataset', + data_root='data/mixed_grounding/', + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline) + +flickr_train_dataset = dict( + type='YOLOv5MixedGroundingDataset', + data_root='data/flickr/', + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='images/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline) + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + 
dataset=dict( + _delete_=True, + type='ConcatDataset', + datasets=[ + obj365v1_train_dataset, + flickr_train_dataset, + mg_train_dataset + ], + ignore_keys=['classes', 'palette'])) + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=False, + pad_val=dict(img=114)), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict(type='LoadText'), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'texts')) +] +coco_val_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict( + type='YOLOv5LVISV1Dataset', + data_root='data/lvis/', + test_mode=True, + ann_file='annotations/' + 'lvis_v1_minival_inserted_image_name.json', + data_prefix=dict(img=''), + batch_shapes_cfg=None), + class_text_path='data/captions/lvis_v1_class_captions.json', + pipeline=test_pipeline) +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='mmdet.LVISMetric', + ann_file='data/lvis/annotations/' + 'lvis_v1_minival_inserted_image_name.json', + metric='bbox') +test_evaluator = val_evaluator + +# training settings +default_hooks = dict( + param_scheduler=dict(max_epochs=max_epochs), + checkpoint=dict(interval=save_epoch_intervals, + rule='greater')) +custom_hooks = [ + dict(type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict(type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict( + max_epochs=max_epochs, + val_interval=10, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict(optimizer=dict( + _delete_=True, + type='AdamW', + lr=base_lr, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + paramwise_cfg=dict( + bias_decay_mult=0.0, + norm_decay_mult=0.0, + custom_keys={ + 'backbone.text_model': + dict(lr_mult=0.0), + 'logit_scale': + dict(weight_decay=0.0) + }), + constructor='YOLOWv5OptimizerConstructor') diff --git a/configs/scaleup/yolo_world_l_t2i_bn_2e-4_20e_4x8gpus_obj365v1_goldg_train_lvis_minival_s1280_v2.py b/configs/scaleup/yolo_world_l_t2i_bn_2e-4_20e_4x8gpus_obj365v1_goldg_train_lvis_minival_s1280_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..63bab46be6b6ed35a7e6e0683f428ddf1b6df65d --- /dev/null +++ b/configs/scaleup/yolo_world_l_t2i_bn_2e-4_20e_4x8gpus_obj365v1_goldg_train_lvis_minival_s1280_v2.py @@ -0,0 +1,216 @@ +_base_ = ('../../third_party/mmyolo/configs/yolov8/' + 'yolov8_l_syncbn_fast_8xb16-500e_coco.py') +custom_imports = dict(imports=['yolo_world'], + allow_failed_imports=False) + +# hyper-parameters +num_classes = 1203 +num_training_classes = 80 +max_epochs = 20 # Maximum training epochs +close_mosaic_epochs = 2 +save_epoch_intervals = 2 +text_channels = 512 +neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2] +neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32] +base_lr = 2e-4 +weight_decay = 0.05 / 2 +train_batch_size_per_gpu = 6 +img_scale = (1280, 1280) +load_from = 'work_dirs/yolo_world_l_t2i_bn_2e-4_20e_4x8gpus_obj365v1_goldg_train_lvis_minival_s1280/epoch_20.pth' # noqa + +# model settings +model = dict( + type='YOLOWorldDetector', + mm_neck=True, + 
num_train_classes=num_training_classes, + num_test_classes=num_classes, + data_preprocessor=dict(type='YOLOWDetDataPreprocessor'), + backbone=dict( + _delete_=True, + type='MultiModalYOLOBackbone', + image_model={{_base_.model.backbone}}, + text_model=dict( + type='HuggingCLIPLanguageBackbone', + model_name='openai/clip-vit-base-patch32', + frozen_modules=['all'])), + neck=dict(type='YOLOWorldPAFPN', + guide_channels=text_channels, + embed_channels=neck_embed_channels, + num_heads=neck_num_heads, + block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'), + num_csp_blocks=2), + bbox_head=dict(type='YOLOWorldHead', + head_module=dict(type='YOLOWorldHeadModule', + embed_dims=text_channels, + use_bn_head=True, + num_classes=num_training_classes)), + train_cfg=dict(assigner=dict(num_classes=num_training_classes))) + +# dataset settings +text_transform = [ + dict(type='RandomLoadText', + num_neg_samples=(num_classes, num_classes), + max_num_samples=num_training_classes, + padding_to_max=True, + padding_value=''), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction', 'texts')) +] +mosaic_affine_transform = [ + dict(type='MultiModalMosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)) +] +train_pipeline = [ + *_base_.pre_transform, + *mosaic_affine_transform, + dict( + type='YOLOv5MultiModalMixUp', + prob=_base_.mixup_prob, + pre_transform=[*_base_.pre_transform, + *mosaic_affine_transform]), + *_base_.last_transform[:-1], + *text_transform, +] +train_pipeline_stage2 = [ + *_base_.pre_transform, + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=True, + pad_val=dict(img=114.0)), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + border_val=(114, 114, 114)), + *_base_.last_transform[:-1], + *text_transform, +] +obj365v1_train_dataset = dict( + type='MultiModalDataset', + dataset=dict( + type='YOLOv5Objects365V1Dataset', + data_root='data/objects365v1/', + ann_file='annotations/objects365_train.json', + data_prefix=dict(img='train/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32)), + class_text_path='data/captions/obj365v1_class_captions.json', + pipeline=train_pipeline) + +mg_train_dataset = dict( + type='YOLOv5MixedGroundingDataset', + data_root='data/mixed_grounding/', + ann_file='annotations/final_mixed_train_no_coco.json', + data_prefix=dict(img='gqa/images/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline) + +flickr_train_dataset = dict( + type='YOLOv5MixedGroundingDataset', + data_root='data/flickr/', + ann_file='annotations/final_flickr_separateGT_train.json', + data_prefix=dict(img='images/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline) + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + collate_fn=dict(type='yolow_collate'), + dataset=dict( + _delete_=True, + type='ConcatDataset', + datasets=[ + obj365v1_train_dataset, + flickr_train_dataset, + mg_train_dataset + ], + ignore_keys=['classes', 
'palette'])) + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=False, + pad_val=dict(img=114)), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict(type='LoadText'), + dict(type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'texts')) +] +coco_val_dataset = dict( + _delete_=True, + type='MultiModalDataset', + dataset=dict( + type='YOLOv5LVISV1Dataset', + data_root='data/lvis/', + test_mode=True, + ann_file='annotations/' + 'lvis_v1_minival_inserted_image_name.json', + data_prefix=dict(img=''), + batch_shapes_cfg=None), + class_text_path='data/captions/lvis_v1_class_captions.json', + pipeline=test_pipeline) +val_dataloader = dict(dataset=coco_val_dataset) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='mmdet.LVISMetric', + ann_file='data/lvis/annotations/' + 'lvis_v1_minival_inserted_image_name.json', + metric='bbox') +test_evaluator = val_evaluator + +# training settings +default_hooks = dict( + param_scheduler=dict(max_epochs=max_epochs), + checkpoint=dict(interval=save_epoch_intervals, + rule='greater')) +custom_hooks = [ + dict(type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict(type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] +train_cfg = dict( + max_epochs=max_epochs, + val_interval=10, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + _base_.val_interval_stage2)]) +optim_wrapper = dict(optimizer=dict( + _delete_=True, + type='AdamW', + lr=base_lr, + weight_decay=weight_decay, + batch_size_per_gpu=train_batch_size_per_gpu), + paramwise_cfg=dict( + bias_decay_mult=0.0, + norm_decay_mult=0.0, + custom_keys={ + 'backbone.text_model': + dict(lr_mult=0.0), + 'logit_scale': + dict(weight_decay=0.0) + }), + constructor='YOLOWv5OptimizerConstructor') diff --git a/deploy/__init__.py b/deploy/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2abf2fc45881af06c5b06b8e07a5f87b4dea69a1 --- /dev/null +++ b/deploy/__init__.py @@ -0,0 +1 @@ +from .models import * # noqa diff --git a/deploy/models/__init__.py b/deploy/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..50ac5d4ba3c4164ff28e7a67aa6532b31f34ea30 --- /dev/null +++ b/deploy/models/__init__.py @@ -0,0 +1,4 @@ +from .detectors import * # noqa +from .dense_heads import * # noqa +from .layers import * # noqa +from .necks import * # noqa diff --git a/docs/data.md b/docs/data.md new file mode 100644 index 0000000000000000000000000000000000000000..da5f4d5fb7222fb5ea656d8894e5f85d21deefd8 --- /dev/null +++ b/docs/data.md @@ -0,0 +1,19 @@ +## Preparing Data for YOLO-World + + +### Overview + + + +### Pre-training Data + +| Data | Samples | Type | Boxes | Annotations | +| :-- | :-----: | :---:| :---: | :---------: | +| Objects365v1 | | detection | | | +| GQA | | ground | | | +| Flickr | | ground | | | + + + + + diff --git a/docs/deploy.md b/docs/deploy.md new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/install.md b/docs/install.md new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/training.md 
b/docs/training.md new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/requirements.txt b/requirements.txt index 0a13ffe1597488103b013a8995be1cd10cc0271f..45215f638613fc5a28750b1acea96cd3098c62f7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,3 +15,4 @@ regex pot sentencepiece tokenizers + diff --git a/setup.py b/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..a76fd6451041275becc397335b0453903ba88b38 --- /dev/null +++ b/setup.py @@ -0,0 +1,190 @@ +# Copyright (c) Tencent Inc. All rights reserved. +import os +import os.path as osp +import shutil +import sys +import warnings +from setuptools import find_packages, setup + + +def readme(): + with open('README.md', encoding='utf-8') as f: + content = f.read() + return content + + +def get_version(): + version_file = 'yolo_world/version.py' + with open(version_file, 'r', encoding='utf-8') as f: + exec(compile(f.read(), version_file, 'exec')) + return locals()['__version__'] + + +def parse_requirements(fname='requirements.txt', with_version=True): + """Parse the package dependencies listed in a requirements file but strips + specific versioning information. + + Args: + fname (str): path to requirements file + with_version (bool, default=False): if True include version specs + + Returns: + List[str]: list of requirements items + + CommandLine: + python -c "import setup; print(setup.parse_requirements())" + """ + import re + import sys + from os.path import exists + require_fpath = fname + + def parse_line(line): + """Parse information from a line in a requirements text file.""" + if line.startswith('-r '): + # Allow specifying requirements in other files + target = line.split(' ')[1] + for info in parse_require_file(target): + yield info + else: + info = {'line': line} + if line.startswith('-e '): + info['package'] = line.split('#egg=')[1] + else: + # Remove versioning from the package + pat = '(' + '|'.join(['>=', '==', '>']) + ')' + parts = re.split(pat, line, maxsplit=1) + parts = [p.strip() for p in parts] + + info['package'] = parts[0] + if len(parts) > 1: + op, rest = parts[1:] + if ';' in rest: + # Handle platform specific dependencies + # http://setuptools.readthedocs.io/en/latest/setuptools.html#declaring-platform-specific-dependencies + version, platform_deps = map(str.strip, + rest.split(';')) + info['platform_deps'] = platform_deps + else: + version = rest # NOQA + if '--' in version: + # the `extras_require` doesn't accept options. + version = version.split('--')[0].strip() + info['version'] = (op, version) + yield info + + def parse_require_file(fpath): + with open(fpath, 'r') as f: + for line in f.readlines(): + line = line.strip() + if line and not line.startswith('#'): + for info in parse_line(line): + yield info + + def gen_packages_items(): + if exists(require_fpath): + for info in parse_require_file(require_fpath): + parts = [info['package']] + if with_version and 'version' in info: + parts.extend(info['version']) + if not sys.version.startswith('3.4'): + # apparently package_deps are broken in 3.4 + platform_deps = info.get('platform_deps') + if platform_deps is not None: + parts.append(';' + platform_deps) + item = ''.join(parts) + yield item + + packages = list(gen_packages_items()) + return packages + + +def add_mim_extension(): + """Add extra files that are required to support MIM into the package. 
+ + These files will be added by creating a symlink to the originals if the + package is installed in `editable` mode (e.g. pip install -e .), or by + copying from the originals otherwise. + """ + + # parse installment mode + if 'develop' in sys.argv: + # installed by `pip install -e .` + mode = 'symlink' + elif 'sdist' in sys.argv or 'bdist_wheel' in sys.argv: + # installed by `pip install .` + # or create source distribution by `python setup.py sdist` + mode = 'copy' + else: + return + + filenames = ['tools', 'configs', 'model-index.yml', 'dataset-index.yml'] + repo_path = osp.dirname(__file__) + mim_path = osp.join(repo_path, 'yolo_world', '.mim') + os.makedirs(mim_path, exist_ok=True) + + for filename in filenames: + if osp.exists(filename): + src_path = osp.join(repo_path, filename) + tar_path = osp.join(mim_path, filename) + + if osp.isfile(tar_path) or osp.islink(tar_path): + os.remove(tar_path) + elif osp.isdir(tar_path): + shutil.rmtree(tar_path) + + if mode == 'symlink': + src_relpath = osp.relpath(src_path, osp.dirname(tar_path)) + try: + os.symlink(src_relpath, tar_path) + except OSError: + # Creating a symbolic link on windows may raise an + # `OSError: [WinError 1314]` due to privilege. If + # the error happens, the src file will be copied + mode = 'copy' + warnings.warn( + f'Failed to create a symbolic link for {src_relpath}, ' + f'and it will be copied to {tar_path}') + else: + continue + + if mode == 'copy': + if osp.isfile(src_path): + shutil.copyfile(src_path, tar_path) + elif osp.isdir(src_path): + shutil.copytree(src_path, tar_path) + else: + warnings.warn(f'Cannot copy file {src_path}.') + else: + raise ValueError(f'Invalid mode {mode}') + + +if __name__ == '__main__': + setup( + name='yolo_world', + version=get_version(), + description='YOLO-World: Real-time Open Vocabulary Object Detection', + long_description=readme(), + long_description_content_type='text/markdown', + keywords='object detection', + packages=find_packages(exclude=( + 'data', 'third_party', 'tools')), + include_package_data=True, + python_requires='>=3.7', + classifiers=[ + 'Development Status :: 4 - Beta', + 'License :: OSI Approved :: Apache Software License', + 'Operating System :: OS Independent', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.7', + 'Programming Language :: Python :: 3.8', + 'Programming Language :: Python :: 3.9', + 'Programming Language :: Python :: 3.10', + 'Programming Language :: Python :: 3.11', + 'Topic :: Scientific/Engineering :: Artificial Intelligence', + ], + author='Tencent AILab', + author_email='ronnysong@tencent.com', + license='Apache License 2.0', + install_requires=parse_requirements('requirements.txt'), + zip_safe=False) diff --git a/taiji/drun b/taiji/drun new file mode 100755 index 0000000000000000000000000000000000000000..15a0b914b26ac3687e9d0717fb5df68dc376b691 --- /dev/null +++ b/taiji/drun @@ -0,0 +1,35 @@ +#!/bin/bash +DOCKER_IMAGE="mirrors.tencent.com/ronnysong_rd/fastdet:torch2.0.1-cuda11.7" + +if [ ! -n "$DEBUG" ]; then + COMMAND_PREFIX="pip3 install -e ." +else + COMMAND_PREFIX="pip3 install -q -e third_party/mmengine; + pip3 install -q -e third_party/mmdetection; + pip3 install -q -e third_party/mmcv; + pip3 install -q -e third_party/mmyolo; + pip3 install -q -e ." 
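+# With DEBUG unset, only this repository is installed (editable) inside the
+# container; with DEBUG set, the locally vendored third_party forks
+# (mmengine, mmdetection, mmcv, mmyolo) are installed first, then the repo.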
+fi + +sudo nvidia-docker run \ + --rm \ + -it \ + -e NVIDIA_VISIBLE_DEVICES=all \ + --env="DISPLAY" \ + --env="QT_X11_NO_MITSHM=1" \ + --volume="$HOME/.Xauthority:/root/.Xauthority:rw" \ + --shm-size=20gb \ + --network=host \ + -v /apdcephfs/:/apdcephfs/ \ + -v /apdcephfs_cq2/:/apdcephfs_cq2/ \ + -v /apdcephfs_cq3/:/apdcephfs_cq3/ \ + -v /data/:/data/ \ + -w $PWD \ + $DOCKER_IMAGE \ + bash -c "export TRANSFORMERS_CACHE=$PWD/work_dirs/.cache/transformers; + export TORCH_HOME=$PWD/work_dirs/.cache/torch; + export CLIP_CACHE=$PWD/work_dirs/.cache/clip; + export HF_HOME=$PWD/work_dirs/.cache/hf; + export TOKENIZERS_PARALLELISM=false; + $COMMAND_PREFIX + $*" diff --git a/taiji/erun b/taiji/erun new file mode 100755 index 0000000000000000000000000000000000000000..1bc67d49cc7f4c867a9d0ffd872172df0624ec84 --- /dev/null +++ b/taiji/erun @@ -0,0 +1,23 @@ +#!/bin/bash +export NCCL_IB_GID_INDEX=3 + +export TRANSFORMERS_CACHE=$PWD/work_dirs/.cache/transformers +export TORCH_HOME=$PWD/work_dirs/.cache/torch +export CLIP_CACHE=$PWD/work_dirs/.cache/clip +export HF_HOME=$PWD/work_dirs/.cache/hf +export TOKENIZERS_PARALLELISM=false +export MKL_NUM_THREADS=1 +export OMP_NUM_THREADS=1 +export TORCH_DISTRIBUTED_DEBUG=INFO +export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 +export http_proxy="http://star-proxy.oa.com:3128" +export https_proxy="http://star-proxy.oa.com:3128" +export ftp_proxy="http://star-proxy.oa.com:3128" +export no_proxy=".woa.com,mirrors.cloud.tencent.com,tlinux-mirror.tencent-cloud.com,tlinux-mirrorlist.tencent-cloud.com,localhost,127.0.0.1,mirrors-tlinux.tencentyun.com,.oa.com,.local,.3gqq.com,.7700.org,.ad.com,.ada_sixjoy.com,.addev.com,.app.local,.apps.local,.aurora.com,.autotest123.com,.bocaiwawa.com,.boss.com,.cdc.com,.cdn.com,.cds.com,.cf.com,.cjgc.local,.cm.com,.code.com,.datamine.com,.dvas.com,.dyndns.tv,.ecc.com,.expochart.cn,.expovideo.cn,.fms.com,.great.com,.hadoop.sec,.heme.com,.home.com,.hotbar.com,.ibg.com,.ied.com,.ieg.local,.ierd.com,.imd.com,.imoss.com,.isd.com,.isoso.com,.itil.com,.kao5.com,.kf.com,.kitty.com,.lpptp.com,.m.com,.matrix.cloud,.matrix.net,.mickey.com,.mig.local,.mqq.com,.oiweb.com,.okbuy.isddev.com,.oss.com,.otaworld.com,.paipaioa.com,.qqbrowser.local,.qqinternal.com,.qqwork.com,.rtpre.com,.sc.oa.com,.sec.com,.server.com,.service.com,.sjkxinternal.com,.sllwrnm5.cn,.sng.local,.soc.com,.t.km,.tcna.com,.teg.local,.tencentvoip.com,.tenpayoa.com,.test.air.tenpay.com,.tr.com,.tr_autotest123.com,.vpn.com,.wb.local,.webdev.com,.webdev2.com,.wizard.com,.wqq.com,.wsd.com,.sng.com,.music.lan,.mnet2.com,.tencentb2.com,.tmeoa.com,.pcg.com,www.wip3.adobe.com,www-mm.wip3.adobe.com,mirrors.tencent.com,csighub.tencentyun.com" +sed -i 's/np.float/float/g' /usr/local/python/lib/python3.8/site-packages/lvis/eval.py +touch /tmp/.unhold + +pip3 install -e . +$* +rm /tmp/.unhold diff --git a/taiji/etorchrun b/taiji/etorchrun new file mode 100755 index 0000000000000000000000000000000000000000..4a3a97be21afe452f9a74dc2b1b823ba0a51ae8a --- /dev/null +++ b/taiji/etorchrun @@ -0,0 +1,51 @@ +#!/bin/bash +if [ ! 
-n "$SH" ]; then + #export NCCL_IB_GID_INDEX=3 + export NCCL_IB_DISABLE=1 + export NCCL_P2P_DISABLE=1 + export NCCL_SOCKET_IFNAME=eth1 +else + export NCCL_IB_GID_INDEX=3 + export NCCL_IB_SL=3 + export NCCL_CHECKS_DISABLE=1 + export NCCL_P2P_DISABLE=0 + export NCCL_IB_DISABLE=0 + export NCCL_LL_THRESHOLD=16384 + export NCCL_IB_CUDA_SUPPORT=1 + export NCCL_SOCKET_IFNAME=bond1 + export UCX_NET_DEVICES=bond1 + export NCCL_IB_HCA=mlx5_bond_1,mlx5_bond_5,mlx5_bond_3,mlx5_bond_7,mlx5_bond_4,mlx5_bond_8,mlx5_bond_2,mlx5_bond_6 + export NCCL_COLLNET_ENABLE=0 + export SHARP_COLL_ENABLE_SAT=0 + export NCCL_NET_GDR_LEVEL=2 + export NCCL_IB_QPS_PER_CONNECTION=4 + export NCCL_IB_TC=160 + export NCCL_PXN_DISABLE=1 + export GLOO_SOCKET_IFNAME=bond1 + export NCCL_DEBUG=info +fi + +export TRANSFORMERS_CACHE=$PWD/work_dirs/.cache/transformers +export TORCH_HOME=$PWD/work_dirs/.cache/torch +export CLIP_CACHE=$PWD/work_dirs/.cache/clip +export HF_HOME=$PWD/work_dirs/.cache/hf +export TOKENIZERS_PARALLELISM=false +export MKL_NUM_THREADS=1 +export OMP_NUM_THREADS=1 +export TORCH_DISTRIBUTED_DEBUG=INFO +export HF_DATASETS_OFFLINE=1 +export TRANSFORMERS_OFFLINE=1 + +export http_proxy="http://star-proxy.oa.com:3128" +export https_proxy="http://star-proxy.oa.com:3128" +export ftp_proxy="http://star-proxy.oa.com:3128" +export no_proxy=".woa.com,mirrors.cloud.tencent.com,tlinux-mirror.tencent-cloud.com,tlinux-mirrorlist.tencent-cloud.com,localhost,127.0.0.1,mirrors-tlinux.tencentyun.com,.oa.com,.local,.3gqq.com,.7700.org,.ad.com,.ada_sixjoy.com,.addev.com,.app.local,.apps.local,.aurora.com,.autotest123.com,.bocaiwawa.com,.boss.com,.cdc.com,.cdn.com,.cds.com,.cf.com,.cjgc.local,.cm.com,.code.com,.datamine.com,.dvas.com,.dyndns.tv,.ecc.com,.expochart.cn,.expovideo.cn,.fms.com,.great.com,.hadoop.sec,.heme.com,.home.com,.hotbar.com,.ibg.com,.ied.com,.ieg.local,.ierd.com,.imd.com,.imoss.com,.isd.com,.isoso.com,.itil.com,.kao5.com,.kf.com,.kitty.com,.lpptp.com,.m.com,.matrix.cloud,.matrix.net,.mickey.com,.mig.local,.mqq.com,.oiweb.com,.okbuy.isddev.com,.oss.com,.otaworld.com,.paipaioa.com,.qqbrowser.local,.qqinternal.com,.qqwork.com,.rtpre.com,.sc.oa.com,.sec.com,.server.com,.service.com,.sjkxinternal.com,.sllwrnm5.cn,.sng.local,.soc.com,.t.km,.tcna.com,.teg.local,.tencentvoip.com,.tenpayoa.com,.test.air.tenpay.com,.tr.com,.tr_autotest123.com,.vpn.com,.wb.local,.webdev.com,.webdev2.com,.wizard.com,.wqq.com,.wsd.com,.sng.com,.music.lan,.mnet2.com,.tencentb2.com,.tmeoa.com,.pcg.com,www.wip3.adobe.com,www-mm.wip3.adobe.com,mirrors.tencent.com,csighub.tencentyun.com" + +sed -i 's/np.float/float/g' /usr/local/python/lib/python3.8/site-packages/lvis/eval.py + +touch /tmp/.unhold + +pip3 install -e . +torchrun --nnodes=$1 --nproc_per_node=$2 --node_rank=$INDEX --master_addr=$CHIEF_IP ${@:3} + +rm /tmp/.unhold diff --git a/taiji/jizhi_run_vanilla b/taiji/jizhi_run_vanilla new file mode 100755 index 0000000000000000000000000000000000000000..ad1d3d2cc856128d6db537efe79ea46805225fdc --- /dev/null +++ b/taiji/jizhi_run_vanilla @@ -0,0 +1,105 @@ +#!/bin/bash +if [[ $1 = "--help" ]] || [[ $1 = "-h" ]] +then + echo "Usage: jizhi_run NUM_MECHINES NUM_GPUS TASK_NAME " +fi + +# user configuration +TOKEN=$TOKEN +if [ ! -n "$IMAGE_FULL_NAME" ]; then + IMAGE_FULL_NAME="mirrors.tencent.com/ronnysong_rd/fastdet:torch2.0.1-cuda11.7" +fi +if [ ! -n "$BUSINESS_FLAG" ]; then + BUSINESS_FLAG="TEG_AILab_CVC_chongqing" +fi +if [ ! -n "$CEPH_BUSINESS_FLAG" ]; then + CEPH_BUSINESS_FLAG="TEG_AILab_CVC_chongqing" +fi +if [ ! 
-n "$GPU_NAME" ]; then + GPU_NAME="V100" +fi +if [ ! -n "$PRIORITY_LEVEL" ]; then + PRIORITY_LEVEL="HIGH" +fi +if [ ! -n "$ELASTIC_LEVEL" ]; then + ELASTIC_LEVEL=1 +fi +if [ ! -n "$RDMA" ]; then + RDMA="false" +fi +if [ ! -n "$CUDA" ]; then + CUDA="11.0" +fi + +CMD_PATH="start.sh" +CONF_PATH="jizhi_conf.json" +ROOT_PATH=$PWD +UUID=$(date +%s) + +rm -f $CMD_PATH + +echo 'cd '$ROOT_PATH >> $CMD_PATH +echo 'export HF_HOME="'$ROOT_PATH'/work_dirs/.cache/hf"' >> $CMD_PATH +echo 'export TORCH_HOME="'$ROOT_PATH'/work_dirs/.cache/torch"' >> $CMD_PATH +echo 'export CLIP_CACHE="'$ROOT_PATH'/work_dirs/.cache/clip"' >> $CMD_PATH +echo 'export TRANSFORMERS_CACHE="'$ROOT_PATH'/work_dirs/.cache/transformers"' >> $CMD_PATH +echo 'export MKL_NUM_THREADS=1' >> $CMD_PATH +echo 'export OMP_NUM_THREADS=1' >> $CMD_PATH +echo 'export TOKENIZERS_PARALLELISM=false' >> $CMD_PATH +echo 'export TORCH_DISTRIBUTED_DEBUG=INFO' >> $CMD_PATH +echo 'export NCCL_IB_GID_INDEX=3' >> $CMD_PATH +if [ $BUSINESS_FLAG = "TaiJi_HYAide_BUFFER_SH_A800H" ]; then + echo 'export NCCL_IB_GID_INDEX=3' >> $CMD_PATH + echo 'export NCCL_IB_SL=3' >> $CMD_PATH + echo 'export NCCL_CHECKS_DISABLE=1' >> $CMD_PATH + echo 'export NCCL_P2P_DISABLE=0' >> $CMD_PATH + echo 'export NCCL_IB_DISABLE=0' >> $CMD_PATH + echo 'export NCCL_LL_THRESHOLD=16384' >> $CMD_PATH + echo 'export NCCL_IB_CUDA_SUPPORT=1' >> $CMD_PATH + echo 'export NCCL_SOCKET_IFNAME=bond1' >> $CMD_PATH + echo 'export UCX_NET_DEVICES=bond1' >> $CMD_PATH + echo 'export NCCL_IB_HCA=mlx5_bond_1,mlx5_bond_5,mlx5_bond_3,mlx5_bond_7,mlx5_bond_4,mlx5_bond_8,mlx5_bond_2,mlx5_bond_6' >> $CMD_PATH + echo 'export NCCL_COLLNET_ENABLE=0' >> $CMD_PATH + echo 'export SHARP_COLL_ENABLE_SAT=0' >> $CMD_PATH + echo 'export NCCL_NET_GDR_LEVEL=2' >> $CMD_PATH + echo 'export NCCL_IB_QPS_PER_CONNECTION=4' >> $CMD_PATH + echo 'export NCCL_IB_TC=160' >> $CMD_PATH + echo 'export NCCL_PXN_DISABLE=1' >> $CMD_PATH +fi +echo ${@:4} >> $CMD_PATH + +chmod +x $CMD_PATH + +rm -f $CONF_PATH + +#INIT_CMD="jizhi_client mount -bf TEG_AILab_CVC_chongqing -tk $TOKEN" +INIT_CMD="" + +echo '{' > $CONF_PATH +echo '"Token": "'$TOKEN'",' >> $CONF_PATH +echo '"business_flag": "'$BUSINESS_FLAG'",' >> $CONF_PATH +echo '"model_local_file_path": "'$ROOT_PATH'/'$CMD_PATH'",' >> $CONF_PATH +echo '"host_num": '$1',' >> $CONF_PATH +echo '"host_gpu_num": '$2',' >> $CONF_PATH +echo '"task_flag": "'$3'_'$UUID'",' >> $CONF_PATH +echo '"priority_level": "'$PRIORITY_LEVEL'",' >> $CONF_PATH +echo '"elastic_level": '$ELASTIC_LEVEL',' >> $CONF_PATH +echo '"cuda_version": "'$CUDA'",' >> $CONF_PATH +echo '"image_full_name": "'$IMAGE_FULL_NAME'",' >> $CONF_PATH +echo '"GPUName": "'$GPU_NAME'",' >> $CONF_PATH +echo '"mount_ceph_business_flag": "'$CEPH_BUSINESS_FLAG'",' >> $CONF_PATH +echo '"exec_start_in_all_mpi_pods": true,' >> $CONF_PATH +echo '"enable_rdma": '$RDMA',' >> $CONF_PATH +echo '"init_cmd": "'$INIT_CMD'",' >> $CONF_PATH +echo '"envs": {' >> $CONF_PATH +echo ' "HUNYUAN_TASK_CATEGORY": "LLM",' >> $CONF_PATH +echo ' "HUNYUAN_TASK_MODEL_TYPE": "SFT",' >> $CONF_PATH +echo ' "HUNYUAN_TASK_DOMAIN": "NLP",' >> $CONF_PATH +echo ' "HUNYUAN_TASK_START_MODEL_TYPE": "7B冷启"}' >> $CONF_PATH +echo '}' >> $CONF_PATH + +jizhi_client start -scfg $CONF_PATH + +rm -f $CMD_PATH +rm -f $CONF_PATH + diff --git a/third_party/mmyolo/.circleci/config.yml b/third_party/mmyolo/.circleci/config.yml new file mode 100644 index 0000000000000000000000000000000000000000..59ba321aeec5dd3904c8df29e2833a41dbc676f7 --- /dev/null +++ b/third_party/mmyolo/.circleci/config.yml @@ 
-0,0 +1,34 @@ +version: 2.1 + +# this allows you to use CircleCI's dynamic configuration feature +setup: true + +# the path-filtering orb is required to continue a pipeline based on +# the path of an updated fileset +orbs: + path-filtering: circleci/path-filtering@0.1.2 + +workflows: + # the always-run workflow is always triggered, regardless of the pipeline parameters. + always-run: + jobs: + # the path-filtering/filter job determines which pipeline + # parameters to update. + - path-filtering/filter: + name: check-updated-files + # 3-column, whitespace-delimited mapping. One mapping per + # line: + # + mapping: | + mmyolo/.* lint_only false + requirements/.* lint_only false + tests/.* lint_only false + tools/.* lint_only false + configs/.* lint_only false + .circleci/.* lint_only false + base-revision: main + # this is the path of the configuration we should trigger once + # path filtering and pipeline parameter value updates are + # complete. In this case, we are using the parent dynamic + # configuration itself. + config-path: .circleci/test.yml diff --git a/third_party/mmyolo/.circleci/docker/Dockerfile b/third_party/mmyolo/.circleci/docker/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..d9cf8cc7712d5241975c3b748fb0d01a5545b4fd --- /dev/null +++ b/third_party/mmyolo/.circleci/docker/Dockerfile @@ -0,0 +1,11 @@ +ARG PYTORCH="1.8.1" +ARG CUDA="10.2" +ARG CUDNN="7" + +FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel + +# To fix GPG key error when running apt-get update +RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub +RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub + +RUN apt-get update && apt-get install -y ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 libgl1-mesa-glx diff --git a/third_party/mmyolo/.circleci/test.yml b/third_party/mmyolo/.circleci/test.yml new file mode 100644 index 0000000000000000000000000000000000000000..149d6cac15ff9643a21535638a6cd5f961a17d4a --- /dev/null +++ b/third_party/mmyolo/.circleci/test.yml @@ -0,0 +1,213 @@ +version: 2.1 + +# the default pipeline parameters, which will be updated according to +# the results of the path-filtering orb +parameters: + lint_only: + type: boolean + default: true + +jobs: + lint: + docker: + - image: cimg/python:3.7.4 + steps: + - checkout + - run: + name: Install pre-commit hook + command: | + pip install pre-commit + pre-commit install + - run: + name: Linting + command: pre-commit run --all-files + - run: + name: Check docstring coverage + command: | + pip install interrogate + interrogate -v --ignore-init-method --ignore-module --ignore-nested-functions --ignore-magic --ignore-regex "__repr__" --fail-under 90 mmyolo + build_cpu: + parameters: + # The python version must match available image tags in + # https://circleci.com/developer/images/image/cimg/python + python: + type: string + torch: + type: string + torchvision: + type: string + docker: + - image: cimg/python:<< parameters.python >> + resource_class: large + steps: + - checkout + - run: + name: Install Libraries + command: | + sudo apt-get update + sudo apt-get install -y ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 libgl1-mesa-glx libjpeg-dev zlib1g-dev libtinfo-dev libncurses5 + - run: + name: Configure Python & pip + command: | + pip install --upgrade pip + pip install wheel + - run: + name: Install PyTorch + command: | + python -V + pip install 
torch==<< parameters.torch >>+cpu torchvision==<< parameters.torchvision >>+cpu -f https://download.pytorch.org/whl/torch_stable.html + - run: + name: Install ONNXRuntime + command: | + pip install onnxruntime==1.8.1 + wget https://github.com/microsoft/onnxruntime/releases/download/v1.8.1/onnxruntime-linux-x64-1.8.1.tgz + tar xvf onnxruntime-linux-x64-1.8.1.tgz + - run: + name: Install mmyolo dependencies + command: | + pip install -U openmim + mim install git+https://github.com/open-mmlab/mmengine.git@main + mim install 'mmcv >= 2.0.0' + mim install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x + pip install -r requirements/albu.txt + pip install -r requirements/tests.txt + - run: + name: Install mmdeploy + command: | + pip install setuptools + git clone -b dev-1.x --depth 1 https://github.com/open-mmlab/mmdeploy.git mmdeploy --recurse-submodules + wget https://github.com/Kitware/CMake/releases/download/v3.20.0/cmake-3.20.0-linux-x86_64.tar.gz + tar -xzvf cmake-3.20.0-linux-x86_64.tar.gz + sudo ln -sf $(pwd)/cmake-3.20.0-linux-x86_64/bin/* /usr/bin/ + cd mmdeploy && mkdir build && cd build && cmake .. -DMMDEPLOY_TARGET_BACKENDS=ort -DONNXRUNTIME_DIR=/home/circleci/project/onnxruntime-linux-x64-1.8.1 && make -j8 && make install + export LD_LIBRARY_PATH=/home/circleci/project/onnxruntime-linux-x64-1.8.1/lib:${LD_LIBRARY_PATH} + cd /home/circleci/project/mmdeploy && python -m pip install -v -e . + - run: + name: Build and install + command: | + pip install -e . + - run: + name: Run unittests + command: | + export LD_LIBRARY_PATH=/home/circleci/project/onnxruntime-linux-x64-1.8.1/lib:${LD_LIBRARY_PATH} + pytest tests/ +# coverage run --branch --source mmyolo -m pytest tests/ +# coverage xml +# coverage report -m + build_cuda: + parameters: + torch: + type: string + cuda: + type: enum + enum: ["10.1", "10.2", "11.0", "11.7"] + cudnn: + type: integer + default: 7 + machine: + image: ubuntu-2004-cuda-11.4:202110-01 + # docker_layer_caching: true + resource_class: gpu.nvidia.small + steps: + - checkout + - run: + # Cloning repos in VM since Docker doesn't have access to the private key + name: Clone Repos + command: | + git clone -b main --depth 1 https://github.com/open-mmlab/mmengine.git /home/circleci/mmengine + git clone -b dev-3.x --depth 1 https://github.com/open-mmlab/mmdetection.git /home/circleci/mmdetection + - run: + name: Build Docker image + command: | + docker build .circleci/docker -t mmyolo:gpu --build-arg PYTORCH=<< parameters.torch >> --build-arg CUDA=<< parameters.cuda >> --build-arg CUDNN=<< parameters.cudnn >> + docker run --gpus all -t -d -v /home/circleci/project:/mmyolo -v /home/circleci/mmengine:/mmengine -v /home/circleci/mmdetection:/mmdetection -w /mmyolo --name mmyolo mmyolo:gpu + - run: + name: Install mmyolo dependencies + command: | + docker exec mmyolo pip install -U openmim + docker exec mmyolo mim install -e /mmengine + docker exec mmyolo mim install 'mmcv >= 2.0.0' + docker exec mmyolo pip install -e /mmdetection + docker exec mmyolo pip install -r requirements/albu.txt + docker exec mmyolo pip install -r requirements/tests.txt + - run: + name: Build and install + command: | + docker exec mmyolo pip install -e . 
+ - run: + name: Run unittests + command: | + docker exec mmyolo pytest tests/ + +workflows: + pr_stage_lint: + when: << pipeline.parameters.lint_only >> + jobs: + - lint: + name: lint + filters: + branches: + ignore: + - main + + pr_stage_test: + when: + not: << pipeline.parameters.lint_only >> + jobs: + - lint: + name: lint + filters: + branches: + ignore: + - main + - build_cpu: + name: minimum_version_cpu + torch: 1.8.0 + torchvision: 0.9.0 + python: 3.8.0 # The lowest python 3.7.x version available on CircleCI images + requires: + - lint + - build_cpu: + name: maximum_version_cpu + # mmdeploy not supported +# torch: 2.0.0 +# torchvision: 0.15.1 + torch: 1.12.1 + torchvision: 0.13.1 + python: 3.9.0 + requires: + - minimum_version_cpu + - hold: + type: approval + requires: + - maximum_version_cpu + - build_cuda: + name: mainstream_version_gpu + torch: 1.8.1 + # Use double quotation mark to explicitly specify its type + # as string instead of number + cuda: "10.2" + requires: + - hold + - build_cuda: + name: maximum_version_gpu + torch: 2.0.0 + cuda: "11.7" + cudnn: 8 + requires: + - hold + merge_stage_test: + when: + not: << pipeline.parameters.lint_only >> + jobs: + - build_cuda: + name: minimum_version_gpu + torch: 1.7.0 + # Use double quotation mark to explicitly specify its type + # as string instead of number + cuda: "11.0" + cudnn: 8 + filters: + branches: + only: + - main diff --git a/third_party/mmyolo/.dev_scripts/gather_models.py b/third_party/mmyolo/.dev_scripts/gather_models.py new file mode 100644 index 0000000000000000000000000000000000000000..f05e2b5b31329e12f1bd62196de6592fade0a7c8 --- /dev/null +++ b/third_party/mmyolo/.dev_scripts/gather_models.py @@ -0,0 +1,312 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import glob +import os +import os.path as osp +import shutil +import subprocess +import time +from collections import OrderedDict + +import torch +import yaml +from mmengine.config import Config +from mmengine.fileio import dump +from mmengine.utils import mkdir_or_exist, scandir + + +def ordered_yaml_dump(data, stream=None, Dumper=yaml.SafeDumper, **kwds): + + class OrderedDumper(Dumper): + pass + + def _dict_representer(dumper, data): + return dumper.represent_mapping( + yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG, data.items()) + + OrderedDumper.add_representer(OrderedDict, _dict_representer) + return yaml.dump(data, stream, OrderedDumper, **kwds) + + +def process_checkpoint(in_file, out_file): + checkpoint = torch.load(in_file, map_location='cpu') + # remove optimizer for smaller file size + if 'optimizer' in checkpoint: + del checkpoint['optimizer'] + if 'message_hub' in checkpoint: + del checkpoint['message_hub'] + if 'ema_state_dict' in checkpoint: + del checkpoint['ema_state_dict'] + + for key in list(checkpoint['state_dict']): + if key.startswith('data_preprocessor'): + checkpoint['state_dict'].pop(key) + elif 'priors_base_sizes' in key: + checkpoint['state_dict'].pop(key) + elif 'grid_offset' in key: + checkpoint['state_dict'].pop(key) + elif 'prior_inds' in key: + checkpoint['state_dict'].pop(key) + + # if it is necessary to remove some sensitive data in checkpoint['meta'], + # add the code here. 
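+    # The stripped checkpoint is re-saved below (with the legacy, non-zipfile
+    # serialization on torch >= 1.6), its sha256 sum is computed, and the
+    # first 8 hex characters are embedded in the final filename
+    # (e.g. model.pth -> model-abcd1234.pth), the usual convention for
+    # published OpenMMLab weights.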
+ if torch.__version__ >= '1.6': + torch.save(checkpoint, out_file, _use_new_zipfile_serialization=False) + else: + torch.save(checkpoint, out_file) + sha = subprocess.check_output(['sha256sum', out_file]).decode() + final_file = out_file.rstrip('.pth') + f'-{sha[:8]}.pth' + subprocess.Popen(['mv', out_file, final_file]) + return final_file + + +def is_by_epoch(config): + cfg = Config.fromfile('./configs/' + config) + return cfg.train_cfg.type == 'EpochBasedTrainLoop' + + +def get_final_epoch_or_iter(config): + cfg = Config.fromfile('./configs/' + config) + if cfg.train_cfg.type == 'EpochBasedTrainLoop': + return cfg.train_cfg.max_epochs + else: + return cfg.train_cfg.max_iters + + +def get_best_epoch_or_iter(exp_dir): + best_epoch_iter_full_path = list( + sorted(glob.glob(osp.join(exp_dir, 'best_*.pth'))))[-1] + best_epoch_or_iter_model_path = best_epoch_iter_full_path.split('/')[-1] + best_epoch_or_iter = best_epoch_or_iter_model_path. \ + split('_')[-1].split('.')[0] + return best_epoch_or_iter_model_path, int(best_epoch_or_iter) + + +def get_real_epoch_or_iter(config): + cfg = Config.fromfile('./configs/' + config) + if cfg.train_cfg.type == 'EpochBasedTrainLoop': + epoch = cfg.train_cfg.max_epochs + return epoch + else: + return cfg.runner.max_iters + + +def get_final_results(log_json_path, + epoch_or_iter, + results_lut='coco/bbox_mAP', + by_epoch=True): + result_dict = dict() + with open(log_json_path) as f: + r = f.readlines()[-1] + last_metric = r.split(',')[0].split(': ')[-1].strip() + result_dict[results_lut] = last_metric + return result_dict + + +def get_dataset_name(config): + # If there are more dataset, add here. + name_map = dict( + CityscapesDataset='Cityscapes', + CocoDataset='COCO', + PoseCocoDataset='COCO Person', + YOLOv5CocoDataset='COCO', + CocoPanopticDataset='COCO', + YOLOv5DOTADataset='DOTA 1.0', + DeepFashionDataset='Deep Fashion', + LVISV05Dataset='LVIS v0.5', + LVISV1Dataset='LVIS v1', + VOCDataset='Pascal VOC', + YOLOv5VOCDataset='Pascal VOC', + WIDERFaceDataset='WIDER Face', + OpenImagesDataset='OpenImagesDataset', + OpenImagesChallengeDataset='OpenImagesChallengeDataset') + cfg = Config.fromfile('./configs/' + config) + return name_map[cfg.dataset_type] + + +def find_last_dir(model_dir): + dst_times = [] + for time_stamp in os.scandir(model_dir): + if osp.isdir(time_stamp): + dst_time = time.mktime( + time.strptime(time_stamp.name, '%Y%m%d_%H%M%S')) + dst_times.append([dst_time, time_stamp.name]) + return max(dst_times, key=lambda x: x[0])[1] + + +def convert_model_info_to_pwc(model_infos): + pwc_files = {} + for model in model_infos: + cfg_folder_name = osp.split(model['config'])[-2] + pwc_model_info = OrderedDict() + pwc_model_info['Name'] = osp.split(model['config'])[-1].split('.')[0] + pwc_model_info['In Collection'] = 'Please fill in Collection name' + pwc_model_info['Config'] = osp.join('configs', model['config']) + + # get metadata + meta_data = OrderedDict() + if 'epochs' in model: + meta_data['Epochs'] = get_real_epoch_or_iter(model['config']) + else: + meta_data['Iterations'] = get_real_epoch_or_iter(model['config']) + pwc_model_info['Metadata'] = meta_data + + # get dataset name + dataset_name = get_dataset_name(model['config']) + + # get results + results = [] + # if there are more metrics, add here. 
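+ # each metric found below is converted into a Papers-with-Code style record (Task / Dataset / Metrics) for the metafile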
+ if 'bbox_mAP' in model['results']: + metric = round(model['results']['bbox_mAP'] * 100, 1) + results.append( + OrderedDict( + Task='Object Detection', + Dataset=dataset_name, + Metrics={'box AP': metric})) + if 'segm_mAP' in model['results']: + metric = round(model['results']['segm_mAP'] * 100, 1) + results.append( + OrderedDict( + Task='Instance Segmentation', + Dataset=dataset_name, + Metrics={'mask AP': metric})) + if 'PQ' in model['results']: + metric = round(model['results']['PQ'], 1) + results.append( + OrderedDict( + Task='Panoptic Segmentation', + Dataset=dataset_name, + Metrics={'PQ': metric})) + pwc_model_info['Results'] = results + + link_string = 'https://download.openmmlab.com/mmyolo/v0/' + link_string += '{}/{}'.format(model['config'].rstrip('.py'), + osp.split(model['model_path'])[-1]) + pwc_model_info['Weights'] = link_string + if cfg_folder_name in pwc_files: + pwc_files[cfg_folder_name].append(pwc_model_info) + else: + pwc_files[cfg_folder_name] = [pwc_model_info] + return pwc_files + + +def parse_args(): + parser = argparse.ArgumentParser(description='Gather benchmarked models') + parser.add_argument( + 'root', + type=str, + help='root path of benchmarked models to be gathered') + parser.add_argument( + 'out', type=str, help='output path of gathered models to be stored') + parser.add_argument( + '--best', + action='store_true', + help='whether to gather the best model.') + + args = parser.parse_args() + return args + + +# TODO: Refine +def main(): + args = parse_args() + models_root = args.root + models_out = args.out + mkdir_or_exist(models_out) + + # find all models in the root directory to be gathered + raw_configs = list(scandir('./configs', '.py', recursive=True)) + + # filter configs that is not trained in the experiments dir + used_configs = [] + for raw_config in raw_configs: + if osp.exists(osp.join(models_root, raw_config)): + used_configs.append(raw_config) + print(f'Find {len(used_configs)} models to be gathered') + + # find final_ckpt and log file for trained each config + # and parse the best performance + model_infos = [] + for used_config in used_configs: + exp_dir = osp.join(models_root, used_config) + by_epoch = is_by_epoch(used_config) + # check whether the exps is finished + if args.best is True: + final_model, final_epoch_or_iter = get_best_epoch_or_iter(exp_dir) + else: + final_epoch_or_iter = get_final_epoch_or_iter(used_config) + final_model = '{}_{}.pth'.format('epoch' if by_epoch else 'iter', + final_epoch_or_iter) + + model_path = osp.join(exp_dir, final_model) + # skip if the model is still training + if not osp.exists(model_path): + continue + + # get the latest logs + latest_exp_name = find_last_dir(exp_dir) + latest_exp_json = osp.join(exp_dir, latest_exp_name, 'vis_data', + latest_exp_name + '.json') + + model_performance = get_final_results( + latest_exp_json, final_epoch_or_iter, by_epoch=by_epoch) + + if model_performance is None: + continue + + model_info = dict( + config=used_config, + results=model_performance, + final_model=final_model, + latest_exp_json=latest_exp_json, + latest_exp_name=latest_exp_name) + model_info['epochs' if by_epoch else 'iterations'] = \ + final_epoch_or_iter + model_infos.append(model_info) + + # publish model for each checkpoint + publish_model_infos = [] + for model in model_infos: + model_publish_dir = osp.join(models_out, model['config'].rstrip('.py')) + mkdir_or_exist(model_publish_dir) + + model_name = osp.split(model['config'])[-1].split('.')[0] + + model_name += '_' + model['latest_exp_name'] + 
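+ # process_checkpoint() strips training-only states (optimizer, EMA, message hub) and appends the first 8 characters of the sha256 hash to the published filename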
publish_model_path = osp.join(model_publish_dir, model_name) + trained_model_path = osp.join(models_root, model['config'], + model['final_model']) + + # convert model + final_model_path = process_checkpoint(trained_model_path, + publish_model_path) + + # copy log + shutil.copy(model['latest_exp_json'], + osp.join(model_publish_dir, f'{model_name}.log.json')) + + # copy config to guarantee reproducibility + config_path = model['config'] + config_path = osp.join( + 'configs', + config_path) if 'configs' not in config_path else config_path + target_config_path = osp.split(config_path)[-1] + shutil.copy(config_path, osp.join(model_publish_dir, + target_config_path)) + + model['model_path'] = final_model_path + publish_model_infos.append(model) + + models = dict(models=publish_model_infos) + print(f'Totally gathered {len(publish_model_infos)} models') + dump(models, osp.join(models_out, 'model_info.json')) + + pwc_files = convert_model_info_to_pwc(publish_model_infos) + for name in pwc_files: + with open(osp.join(models_out, name + '_metafile.yml'), 'w') as f: + ordered_yaml_dump(pwc_files[name], f, encoding='utf-8') + + +if __name__ == '__main__': + main() diff --git a/third_party/mmyolo/.dev_scripts/print_registers.py b/third_party/mmyolo/.dev_scripts/print_registers.py new file mode 100644 index 0000000000000000000000000000000000000000..52646da205969db62d3d59dc2736be00954510e2 --- /dev/null +++ b/third_party/mmyolo/.dev_scripts/print_registers.py @@ -0,0 +1,448 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import importlib +import os +import os.path as osp +import pkgutil +import sys +import tempfile +from multiprocessing import Pool +from pathlib import Path + +import numpy as np +import pandas as pd + +# host_addr = 'https://gitee.com/open-mmlab' +host_addr = 'https://github.com/open-mmlab' +tools_list = ['tools', '.dev_scripts'] +proxy_names = { + 'mmdet': 'mmdetection', + 'mmseg': 'mmsegmentation', + 'mmcls': 'mmclassification' +} +merge_module_keys = {'mmcv': ['mmengine']} +# exclude_prefix = {'mmcv': ['{_k}') + table_data.append((registry_name, registry_strings)) + + # sort the data list + table_data = sorted(table_data, key=lambda x: len(x[1])) + # split multi parts + table_data_multi_parts = [] + for (registry_name, registry_strings) in table_data: + multi_parts = False + if len(registry_strings) > max_size_per_cell: + multi_parts = True + for cell_idx, registry_cell in enumerate( + divide_list_into_groups(registry_strings, max_size_per_cell)): + registry_str = ''.join(registry_cell.tolist()) + registry_str = f'
    {registry_str}
' + table_data_multi_parts.append([ + registry_name if not multi_parts else + f'{registry_name} (part {cell_idx + 1})', registry_str + ]) + + for table_data in divide_list_into_groups(table_data_multi_parts, + max_col_per_row): + table_data = list(zip(*table_data.tolist())) + html += dataframe_to_html( + pd.DataFrame([table_data[1]], columns=table_data[0])) + if html: + html = f'
{title}
\n{html}' + html = f'
{html}
\n' + return html + + +def tools_to_html(tools_dict, repo_name=''): + + def _recurse(_dict, _connector, _result): + assert isinstance(_dict, dict), \ + f'unknown recurse type: {_dict} ({type(_dict)})' + for _k, _v in _dict.items(): + if _v is None: + if _connector not in _result: + _result[_connector] = [] + _result[_connector].append(_k) + else: + _recurse(_v, osp.join(_connector, _k), _result) + + table_data = {} + title = f'{capitalize(repo_name)} Tools' + _recurse(tools_dict, '', table_data) + return registries_to_html(table_data, title) + + +def dataframe_to_html(dataframe): + styler = dataframe.style + styler = styler.hide(axis='index') + styler = styler.format(na_rep='-') + styler = styler.set_properties(**{ + 'text-align': 'left', + 'align': 'center', + 'vertical-align': 'top' + }) + styler = styler.set_table_styles([{ + 'selector': + 'thead th', + 'props': + 'align:center;text-align:center;vertical-align:bottom' + }]) + html = styler.to_html() + html = f'
\n{html}
' + return html + + +def generate_markdown_by_repository(repo_name, + module_name, + branch, + pulldir, + throw_error=False): + # add the pull dir to the system path so that it can be found + if pulldir not in sys.path: + sys.path.insert(0, pulldir) + module_list, error_dict = load_modules_from_dir( + module_name, pulldir, throw_error=throw_error) + registries_tree = get_registries_from_modules(module_list) + if error_dict: + error_dict_name = 'error_modules' + assert (error_dict_name not in registries_tree), \ + f'duplicate module name was found: {error_dict_name}' + registries_tree.update({error_dict_name: error_dict}) + # get the tools files + for tools_name in tools_list: + assert (tools_name not in registries_tree), \ + f'duplicate tools name was found: {tools_name}' + tools_tree = osp.join(pulldir, tools_name) + tools_tree = get_scripts_from_dir(tools_tree) + registries_tree.update({tools_name: tools_tree}) + # print_tree(registries_tree) + # get registries markdown string + module_registries = registries_tree.get(module_name, {}) + for merge_key in merge_module_keys.get(module_name, []): + merge_dict = registries_tree.get(merge_key, {}) + merge_registries(module_registries, merge_dict) + for exclude_key in exclude_prefix.get(module_name, []): + exclude_registries(module_registries, exclude_key) + markdown_str = registries_to_html( + module_registries, title=f'{capitalize(repo_name)} Module Components') + # get tools markdown string + tools_registries = {} + for tools_name in tools_list: + tools_registries.update( + {tools_name: registries_tree.get(tools_name, {})}) + markdown_str += tools_to_html(tools_registries, repo_name=repo_name) + version_str = get_version_from_module_name(module_name, branch) + title_str = f'\n\n## {capitalize(repo_name)}{version_str}\n' + # remove the pull dir from system path + if pulldir in sys.path: + sys.path.remove(pulldir) + return f'{title_str}{markdown_str}' + + +def parse_args(): + parser = argparse.ArgumentParser( + description='print registries in openmmlab repositories') + parser.add_argument( + '-r', + '--repositories', + nargs='+', + default=['mmdet', 'mmcls', 'mmseg', 'mmengine', 'mmcv'], + type=str, + help='git repositories name in OpenMMLab') + parser.add_argument( + '-b', + '--branches', + nargs='+', + default=['3.x', '1.x', '1.x', 'main', '2.x'], + type=str, + help='the branch names of git repositories, the length of branches ' + 'must be same as the length of repositories') + parser.add_argument( + '-o', '--out', type=str, default='.', help='output path of the file') + parser.add_argument( + '--throw-error', + action='store_true', + default=False, + help='whether to throw error when trying to import modules') + args = parser.parse_args() + return args + + +# TODO: Refine +def main(): + args = parse_args() + repositories = args.repositories + branches = args.branches + assert isinstance(repositories, list), \ + 'Type of repositories must be list' + if branches is None: + branches = [None] * len(repositories) + assert isinstance(branches, list) and \ + len(branches) == len(repositories), \ + 'The length of branches must be same as ' \ + 'that of repositories' + assert isinstance(args.out, str), \ + 'The type of output path must be string' + # save path of file + mkdir_or_exist(args.out) + save_path = osp.join(args.out, 'registries_info.md') + with tempfile.TemporaryDirectory() as tmpdir: + # multi process init + pool = Pool(processes=len(repositories)) + multi_proc_input_list = [] + multi_proc_output_list = [] + # get the git repositories + 
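+ # each repository is cloned into the temporary directory; its registry markdown is then generated by the worker pool below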
for branch, repository in zip(branches, repositories): + repo_name, module_name = parse_repo_name(repository) + pulldir = osp.join(tmpdir, f'tmp_{repo_name}') + git_pull_branch( + repo_name=repo_name, branch_name=branch, pulldir=pulldir) + multi_proc_input_list.append( + (repo_name, module_name, branch, pulldir, args.throw_error)) + print('starting the multi process to get the registries') + for multi_proc_input in multi_proc_input_list: + multi_proc_output_list.append( + pool.apply_async(generate_markdown_by_repository, + multi_proc_input)) + pool.close() + pool.join() + with open(save_path, 'w', encoding='utf-8') as fw: + fw.write(f'{markdown_title}\n') + for multi_proc_output in multi_proc_output_list: + markdown_str = multi_proc_output.get() + fw.write(f'{markdown_str}\n') + print(f'saved registries to the path: {save_path}') + + +if __name__ == '__main__': + main() diff --git a/third_party/mmyolo/.github/CODE_OF_CONDUCT.md b/third_party/mmyolo/.github/CODE_OF_CONDUCT.md new file mode 100644 index 0000000000000000000000000000000000000000..92afad1c5ab5d5781115dee45c131d3751d3cd31 --- /dev/null +++ b/third_party/mmyolo/.github/CODE_OF_CONDUCT.md @@ -0,0 +1,76 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +In the interest of fostering an open and welcoming environment, we as +contributors and maintainers pledge to making participation in our project and +our community a harassment-free experience for everyone, regardless of age, body +size, disability, ethnicity, sex characteristics, gender identity and expression, +level of experience, education, socio-economic status, nationality, personal +appearance, race, religion, or sexual identity and orientation. + +## Our Standards + +Examples of behavior that contributes to creating a positive environment +include: + +- Using welcoming and inclusive language +- Being respectful of differing viewpoints and experiences +- Gracefully accepting constructive criticism +- Focusing on what is best for the community +- Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +- The use of sexualized language or imagery and unwelcome sexual attention or + advances +- Trolling, insulting/derogatory comments, and personal or political attacks +- Public or private harassment +- Publishing others' private information, such as a physical or electronic + address, without explicit permission +- Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable +behavior and are expected to take appropriate and fair corrective action in +response to any instances of unacceptable behavior. + +Project maintainers have the right and responsibility to remove, edit, or +reject comments, commits, code, wiki edits, issues, and other contributions +that are not aligned to this Code of Conduct, or to ban temporarily or +permanently any contributor for other behaviors that they deem inappropriate, +threatening, offensive, or harmful. + +## Scope + +This Code of Conduct applies both within project spaces and in public spaces +when an individual is representing the project or its community. Examples of +representing a project or community include using an official project e-mail +address, posting via an official social media account, or acting as an appointed +representative at an online or offline event. 
Representation of a project may be +further defined and clarified by project maintainers. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported by contacting the project team at chenkaidev@gmail.com. All +complaints will be reviewed and investigated and will result in a response that +is deemed necessary and appropriate to the circumstances. The project team is +obligated to maintain confidentiality with regard to the reporter of an incident. +Further details of specific enforcement policies may be posted separately. + +Project maintainers who do not follow or enforce the Code of Conduct in good +faith may face temporary or permanent repercussions as determined by other +members of the project's leadership. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, +available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html + +For answers to common questions about this code of conduct, see +https://www.contributor-covenant.org/faq + +[homepage]: https://www.contributor-covenant.org diff --git a/third_party/mmyolo/.github/CONTRIBUTING.md b/third_party/mmyolo/.github/CONTRIBUTING.md new file mode 100644 index 0000000000000000000000000000000000000000..4ac764f10587497cb6da5ba453c08056d5bc9df7 --- /dev/null +++ b/third_party/mmyolo/.github/CONTRIBUTING.md @@ -0,0 +1 @@ +We appreciate all contributions to improve MMYOLO. Please refer to [CONTRIBUTING.md](https://github.com/open-mmlab/mmcv/blob/master/CONTRIBUTING.md) in MMCV for more details about the contributing guideline. diff --git a/third_party/mmyolo/.github/ISSUE_TEMPLATE/1-bug-report.yml b/third_party/mmyolo/.github/ISSUE_TEMPLATE/1-bug-report.yml new file mode 100644 index 0000000000000000000000000000000000000000..0cec5853ebbde572c2c6322f9d7123cac5a97df7 --- /dev/null +++ b/third_party/mmyolo/.github/ISSUE_TEMPLATE/1-bug-report.yml @@ -0,0 +1,67 @@ +name: "🐞 Bug report" +description: "Create a report to help us reproduce and fix the bug" + + +body: + - type: markdown + attributes: + value: | + Thank you for reporting this issue to help us improve! + If you have already identified the reason, we strongly appreciate you creating a new PR to fix it [here](https://github.com/open-mmlab/mmyolo/pulls)! + If this issue is about installing MMCV, please file an issue at [MMCV](https://github.com/open-mmlab/mmcv/issues/new/choose). + If you need our help, please fill in as much of the following form as you're able. + + - type: checkboxes + attributes: + label: Prerequisite + description: Please check the following items before creating a new issue. + options: + - label: I have searched [the existing and past issues](https://github.com/open-mmlab/mmyolo/issues) but cannot get the expected help. + required: true + - label: I have read the [FAQ documentation](https://mmyolo.readthedocs.io/en/latest/faq.html) but cannot get the expected help. + required: true + - label: The bug has not been fixed in the [latest version](https://github.com/open-mmlab/mmyolo). + required: true + + - type: textarea + attributes: + label: 🐞 Describe the bug + description: | + Please provide a clear and concise description of what the bug is. + Preferably a simple and minimal code snippet that we can reproduce the error by running the code. + placeholder: | + A clear and concise description of what the bug is. + + ```python + # Sample code to reproduce the problem + ``` + + ```shell + The command or script you run. 
+ ``` + + ``` + The error message or logs you got, with the full traceback. + ``` + validations: + required: true + + - type: textarea + attributes: + label: Environment + description: | + Please run `python mmyolo/utils/collect_env.py` to collect necessary environment information and paste it here. + You may add additional information that may be helpful for locating the problem, such as + - How you installed PyTorch \[e.g., pip, conda, source\] + - Other environment variables that may be related (such as `$PATH`, `$LD_LIBRARY_PATH`, `$PYTHONPATH`, etc.) + validations: + required: true + + - type: textarea + attributes: + label: Additional information + description: Tell us anything else you think we should know. + placeholder: | + 1. Did you make any modifications on the code or config? Did you understand what you have modified? + 2. What dataset did you use? + 3. What do you think might be the reason? diff --git a/third_party/mmyolo/.github/ISSUE_TEMPLATE/2-feature-request.yml b/third_party/mmyolo/.github/ISSUE_TEMPLATE/2-feature-request.yml new file mode 100644 index 0000000000000000000000000000000000000000..8b24846777e89685bcb99c5d79663839536b6607 --- /dev/null +++ b/third_party/mmyolo/.github/ISSUE_TEMPLATE/2-feature-request.yml @@ -0,0 +1,32 @@ +name: 🚀 Feature request +description: Suggest an idea for this project +labels: [feature request] + +body: + - type: markdown + attributes: + value: | + Thank you for suggesting an idea to make MMYOLO better. + We strongly appreciate you creating a PR to implement this feature [here](https://github.com/open-mmlab/mmyolo/pulls)! + + If you need our help, please fill in as much of the following form as you're able. + + - type: textarea + attributes: + label: What is the problem this feature will solve? + placeholder: | + E.g., It is inconvenient when \[....\]. + validations: + required: true + + - type: textarea + attributes: + label: What is the feature you are proposing to solve the problem? + validations: + required: true + + - type: textarea + attributes: + label: What alternatives have you considered? + description: | + Add any other context or screenshots about the feature request here. diff --git a/third_party/mmyolo/.github/ISSUE_TEMPLATE/3-new-model.yml b/third_party/mmyolo/.github/ISSUE_TEMPLATE/3-new-model.yml new file mode 100644 index 0000000000000000000000000000000000000000..2aacff4abc353c1e999c8e5952c86ffcac38b063 --- /dev/null +++ b/third_party/mmyolo/.github/ISSUE_TEMPLATE/3-new-model.yml @@ -0,0 +1,30 @@ +name: "\U0001F31F New model/dataset addition" +description: Submit a proposal/request to implement a new model / dataset +labels: [ "New model/dataset" ] + +body: + - type: textarea + id: description-request + validations: + required: true + attributes: + label: Model/Dataset description + description: | + Put any and all important information relevant to the model/dataset + + - type: checkboxes + attributes: + label: Open source status + description: | + Please provide the open-source status, which would be very helpful + options: + - label: "The model implementation is available" + - label: "The model weights are available." + + - type: textarea + id: additional-info + attributes: + label: Provide useful links for the implementation + description: | + Please provide information regarding the implementation, the weights, and the authors. + Please mention the authors by @gh-username if you're aware of their usernames.
diff --git a/third_party/mmyolo/.github/ISSUE_TEMPLATE/4-documentation.yml b/third_party/mmyolo/.github/ISSUE_TEMPLATE/4-documentation.yml new file mode 100644 index 0000000000000000000000000000000000000000..dbf1ef8107a33c41067743097ba78e047be43cdb --- /dev/null +++ b/third_party/mmyolo/.github/ISSUE_TEMPLATE/4-documentation.yml @@ -0,0 +1,22 @@ +name: 📚 Documentation +description: Report an issue related to https://mmyolo.readthedocs.io/en/latest/. + +body: +- type: textarea + attributes: + label: 📚 The doc issue + description: > + A clear and concise description of what content in https://mmyolo.readthedocs.io/en/latest/ is an issue. + validations: + required: true + +- type: textarea + attributes: + label: Suggest a potential alternative/fix + description: > + Tell us how we could improve the documentation in this regard. + +- type: markdown + attributes: + value: > + Thanks for contributing 🎉! diff --git a/third_party/mmyolo/.github/ISSUE_TEMPLATE/5-reimplementation.yml b/third_party/mmyolo/.github/ISSUE_TEMPLATE/5-reimplementation.yml new file mode 100644 index 0000000000000000000000000000000000000000..1240aa896a50151ad47cc1bf0813d0b40d7e7169 --- /dev/null +++ b/third_party/mmyolo/.github/ISSUE_TEMPLATE/5-reimplementation.yml @@ -0,0 +1,87 @@ +name: "💥 Reimplementation Questions" +description: "Ask questions about model reimplementation" + + +body: + - type: markdown + attributes: + value: | + If you have already identified the reason, we strongly appreciate you creating a new PR to fix it [here](https://github.com/open-mmlab/mmyolo/pulls)! + + - type: checkboxes + attributes: + label: Prerequisite + description: Please check the following items before creating a new issue. + options: + - label: I have searched [the existing and past issues](https://github.com/open-mmlab/mmyolo/issues) but cannot get the expected help. + required: true + - label: I have read the [FAQ documentation](https://mmyolo.readthedocs.io/en/latest/faq.html) but cannot get the expected help. + required: true + - label: The bug has not been fixed in the [latest version](https://github.com/open-mmlab/mmyolo). + required: true + validations: + required: true + + - type: textarea + attributes: + label: 💬 Describe the reimplementation questions + description: | + A clear and concise description of the problem you met and what you have done. + There are several common situations in reimplementation issues, as below + + 1. Reimplement a model in the model zoo using the provided configs + 2. Reimplement a model in the model zoo on another dataset (e.g., custom datasets) + 3. Reimplement a custom model but all the components are implemented in MMDetection + 4. Reimplement a custom model with new modules implemented by yourself + + There are several things to do for different cases, as below. + + - For case 1 & 3, please follow the steps in the following sections so that we can quickly identify the issue. + - For case 2 & 4, please understand that we are not able to help much here because we usually do not know the full code, and users are responsible for the code they write. + - One suggestion for case 2 & 4 is that the users should first check whether the bug lies in the self-implemented code or the original code. For example, users can first make sure that the same model runs well on supported datasets.
If you still need help, please describe in the issue what you have done and what you obtained, follow the steps in the following sections, and be as clear as possible so that we can better help you. + placeholder: | + A clear and concise description of what the bug is. + Which config did you run? + + ```none + A placeholder for the config. + ``` + + ```shell + The command or script you run. + ``` + + ``` + The error message or logs you got, with the full traceback. + ``` + validations: + required: true + + - type: textarea + attributes: + label: Environment + description: | + Please run `python mmyolo/utils/collect_env.py` to collect necessary environment information and paste it here. + You may add additional information that may be helpful for locating the problem, such as + - How you installed PyTorch \[e.g., pip, conda, source\] + - Other environment variables that may be related (such as `$PATH`, `$LD_LIBRARY_PATH`, `$PYTHONPATH`, etc.) + validations: + required: true + + - type: textarea + attributes: + label: Expected results + description: If applicable, paste the related results here, e.g., what you expect and what you get. + placeholder: | + ```none + A placeholder for results comparison + ``` + + - type: textarea + attributes: + label: Additional information + description: Tell us anything else you think we should know. + placeholder: | + 1. Did you make any modifications on the code or config? Did you understand what you have modified? + 2. What dataset did you use? + 3. What do you think might be the reason? diff --git a/third_party/mmyolo/.github/ISSUE_TEMPLATE/config.yml b/third_party/mmyolo/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 0000000000000000000000000000000000000000..585c786b50b3692e996a1d150470852e876a24dc --- /dev/null +++ b/third_party/mmyolo/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,9 @@ +blank_issues_enabled: true + +contact_links: + - name: 💬 Forum + url: https://github.com/open-mmlab/mmyolo/discussions + about: Ask general usage questions and discuss with other MMYOLO community members + - name: 🌐 Explore OpenMMLab + url: https://openmmlab.com/ + about: Get to know more about OpenMMLab diff --git a/third_party/mmyolo/.github/pull_request_template.md b/third_party/mmyolo/.github/pull_request_template.md new file mode 100644 index 0000000000000000000000000000000000000000..2997d883eec5e36302b7a4505f2d218f5cdf7c91 --- /dev/null +++ b/third_party/mmyolo/.github/pull_request_template.md @@ -0,0 +1,25 @@ +Thanks for your contribution; we appreciate it a lot. The following instructions will make your pull request healthier and help it get feedback more easily. If you do not understand some items, don't worry, just make the pull request and seek help from maintainers. + +## Motivation + +Please describe the motivation for this PR and the goal you want to achieve through this PR. + +## Modification + +Please briefly describe what modification is made in this PR. + +## BC-breaking (Optional) + +Does the modification introduce changes that break the backward compatibility of the downstream repos? +If so, please describe how it breaks the compatibility and how the downstream projects should modify their code to keep compatibility with this PR. + +## Use cases (Optional) + +If this PR introduces a new feature, it is better to list some use cases here and update the documentation. + +## Checklist + +1. Pre-commit or other linting tools are used to fix potential lint issues. +2. The modification is covered by complete unit tests.
If not, please add more unit tests to ensure the correctness. +3. If the modification has a potential influence on downstream projects, this PR should be tested with downstream projects, like MMDetection or MMClassification. +4. The documentation has been modified accordingly, like docstring or example tutorials. diff --git a/third_party/mmyolo/.github/workflows/deploy.yml b/third_party/mmyolo/.github/workflows/deploy.yml new file mode 100644 index 0000000000000000000000000000000000000000..08f542bbaaae1a1f0f33712544e1ff08c7aa2e85 --- /dev/null +++ b/third_party/mmyolo/.github/workflows/deploy.yml @@ -0,0 +1,28 @@ +name: deploy + +on: push + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + build-n-publish: + runs-on: ubuntu-latest + if: startsWith(github.event.ref, 'refs/tags') + steps: + - uses: actions/checkout@v2 + - name: Set up Python 3.7 + uses: actions/setup-python@v2 + with: + python-version: 3.7 + - name: Install torch + run: pip install torch + - name: Install wheel + run: pip install wheel + - name: Build MMYOLO + run: python setup.py sdist bdist_wheel + - name: Publish distribution to PyPI + run: | + pip install twine + twine upload dist/* -u __token__ -p ${{ secrets.pypi_password }} diff --git a/third_party/mmyolo/.gitignore b/third_party/mmyolo/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..195f1940ad4e0cd1c73a8192c474b816dea93978 --- /dev/null +++ b/third_party/mmyolo/.gitignore @@ -0,0 +1,126 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/en/_build/ +docs/zh_cn/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +data/ +data +.vscode +.idea +.DS_Store + +# custom +*.pkl +*.pkl.json +*.log.json +docs/modelzoo_statistics.md +mmyolo/.mim +output/ +work_dirs +yolov5-6.1/ + +# Pytorch +*.pth +*.pt +*.py~ +*.sh~ diff --git a/third_party/mmyolo/.pre-commit-config-zh-cn.yaml b/third_party/mmyolo/.pre-commit-config-zh-cn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..52bb607e86cedc4f0ac9d188bb7ec717d88b35fb --- /dev/null +++ b/third_party/mmyolo/.pre-commit-config-zh-cn.yaml @@ -0,0 +1,60 @@ +exclude: ^tests/data/ +repos: + - repo: https://gitee.com/openmmlab/mirrors-flake8 + rev: 5.0.4 + hooks: + - id: flake8 + - repo: https://gitee.com/openmmlab/mirrors-isort + rev: 5.11.5 + hooks: + - id: isort + - repo: https://gitee.com/openmmlab/mirrors-yapf + rev: v0.32.0 + hooks: + - id: yapf + - repo: https://gitee.com/openmmlab/mirrors-pre-commit-hooks + rev: v4.3.0 + hooks: + - id: trailing-whitespace + - id: check-yaml + - id: end-of-file-fixer + - id: requirements-txt-fixer + - id: double-quote-string-fixer + - id: check-merge-conflict + - id: fix-encoding-pragma + args: ["--remove"] + - id: mixed-line-ending + args: ["--fix=lf"] + - repo: https://gitee.com/openmmlab/mirrors-mdformat + rev: 0.7.9 + hooks: + - id: mdformat + args: ["--number"] + additional_dependencies: + - mdformat-openmmlab + - mdformat_frontmatter + - linkify-it-py + - repo: https://gitee.com/openmmlab/mirrors-codespell + rev: v2.2.1 + hooks: + - id: codespell + - repo: https://gitee.com/openmmlab/mirrors-docformatter + rev: v1.3.1 + hooks: + - id: docformatter + args: ["--in-place", "--wrap-descriptions", "79"] + - repo: https://gitee.com/openmmlab/mirrors-pyupgrade + rev: v3.0.0 + hooks: + - id: pyupgrade + args: ["--py36-plus"] + - repo: https://github.com/open-mmlab/pre-commit-hooks + rev: v0.2.0 + hooks: + - id: check-copyright + args: ["mmyolo", "tests"] +# - repo: https://gitee.com/openmmlab/mirrors-mypy +# rev: v0.812 +# hooks: +# - id: mypy +# exclude: "docs" diff --git a/third_party/mmyolo/.pre-commit-config.yaml b/third_party/mmyolo/.pre-commit-config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ffae20d2d3941607fd541e03e22c0e351f296d88 --- /dev/null +++ b/third_party/mmyolo/.pre-commit-config.yaml @@ -0,0 +1,60 @@ +exclude: ^tests/data/ +repos: + - repo: https://github.com/PyCQA/flake8 + rev: 5.0.4 + hooks: + - id: flake8 + - repo: https://github.com/PyCQA/isort + rev: 5.11.5 + hooks: + - id: isort + - repo: https://github.com/pre-commit/mirrors-yapf + rev: v0.32.0 + hooks: + - id: yapf + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.3.0 + hooks: + - id: trailing-whitespace + - id: check-yaml + - id: 
end-of-file-fixer + - id: requirements-txt-fixer + - id: double-quote-string-fixer + - id: check-merge-conflict + - id: fix-encoding-pragma + args: ["--remove"] + - id: mixed-line-ending + args: ["--fix=lf"] + - repo: https://github.com/executablebooks/mdformat + rev: 0.7.9 + hooks: + - id: mdformat + args: ["--number"] + additional_dependencies: + - mdformat-openmmlab + - mdformat_frontmatter + - linkify-it-py + - repo: https://github.com/codespell-project/codespell + rev: v2.2.1 + hooks: + - id: codespell + - repo: https://github.com/myint/docformatter + rev: v1.3.1 + hooks: + - id: docformatter + args: ["--in-place", "--wrap-descriptions", "79"] + - repo: https://github.com/asottile/pyupgrade + rev: v3.0.0 + hooks: + - id: pyupgrade + args: ["--py36-plus"] + - repo: https://github.com/open-mmlab/pre-commit-hooks + rev: v0.2.0 + hooks: + - id: check-copyright + args: ["mmyolo", "tests"] +# - repo: https://github.com/pre-commit/mirrors-mypy +# rev: v0.812 +# hooks: +# - id: mypy +# exclude: "docs" diff --git a/third_party/mmyolo/.readthedocs.yml b/third_party/mmyolo/.readthedocs.yml new file mode 100644 index 0000000000000000000000000000000000000000..c9ab01ce18caeebce129472bd63b0465405d6a50 --- /dev/null +++ b/third_party/mmyolo/.readthedocs.yml @@ -0,0 +1,8 @@ +version: 2 + +formats: all + +python: + version: 3.7 + install: + - requirements: requirements/docs.txt diff --git a/third_party/mmyolo/LICENSE b/third_party/mmyolo/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..f288702d2fa16d3cdf0035b15a9fcbc552cd88e7 --- /dev/null +++ b/third_party/mmyolo/LICENSE @@ -0,0 +1,674 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. 
+ + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. 
If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. 
+ + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. 
+ + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. 
+ + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. 
+ + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. 
If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). 
To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. 
+ + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. 
+ + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + Copyright (C) + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +. diff --git a/third_party/mmyolo/MANIFEST.in b/third_party/mmyolo/MANIFEST.in new file mode 100644 index 0000000000000000000000000000000000000000..5bf1d9ebabcc5ca1f28207b62eab10141474db51 --- /dev/null +++ b/third_party/mmyolo/MANIFEST.in @@ -0,0 +1,6 @@ +include requirements/*.txt +include mmyolo/VERSION +include mmyolo/.mim/model-index.yml +include mmyolo/.mim/demo/*/* +recursive-include mmyolo/.mim/configs *.py *.yml +recursive-include mmyolo/.mim/tools *.sh *.py diff --git a/third_party/mmyolo/README.md b/third_party/mmyolo/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b799a759c367938cbeea728b0763a36cda5b2544 --- /dev/null +++ b/third_party/mmyolo/README.md @@ -0,0 +1,428 @@ +
+ +
 
+
+ OpenMMLab website + + + HOT + + +      + OpenMMLab platform + + + TRY IT OUT + + +
+
 
+ +[![PyPI](https://img.shields.io/pypi/v/mmyolo)](https://pypi.org/project/mmyolo) +[![docs](https://img.shields.io/badge/docs-latest-blue)](https://mmyolo.readthedocs.io/en/latest/) +[![deploy](https://github.com/open-mmlab/mmyolo/workflows/deploy/badge.svg)](https://github.com/open-mmlab/mmyolo/actions) +[![codecov](https://codecov.io/gh/open-mmlab/mmyolo/branch/main/graph/badge.svg)](https://codecov.io/gh/open-mmlab/mmyolo) +[![license](https://img.shields.io/github/license/open-mmlab/mmyolo.svg)](https://github.com/open-mmlab/mmyolo/blob/main/LICENSE) +[![open issues](https://isitmaintained.com/badge/open/open-mmlab/mmyolo.svg)](https://github.com/open-mmlab/mmyolo/issues) +[![issue resolution](https://isitmaintained.com/badge/resolution/open-mmlab/mmyolo.svg)](https://github.com/open-mmlab/mmyolo/issues) + +[📘Documentation](https://mmyolo.readthedocs.io/en/latest/) | +[🛠️Installation](https://mmyolo.readthedocs.io/en/latest/get_started/installation.html) | +[👀Model Zoo](https://mmyolo.readthedocs.io/en/latest/model_zoo.html) | +[🆕Update News](https://mmyolo.readthedocs.io/en/latest/notes/changelog.html) | +[🤔Reporting Issues](https://github.com/open-mmlab/mmyolo/issues/new/choose) + +
+ +
+ +English | [简体中文](README_zh-CN.md) + +
+ +
+ + + + + + + + + + + + + + + + + +
+ +## 📄 Table of Contents + +- [🥳 🚀 What's New](#--whats-new-) + - [✨ Highlight](#-highlight-) +- [📖 Introduction](#-introduction-) +- [🛠️ Installation](#%EF%B8%8F-installation-) +- [👨‍🏫 Tutorial](#-tutorial-) +- [📊 Overview of Benchmark and Model Zoo](#-overview-of-benchmark-and-model-zoo-) +- [❓ FAQ](#-faq-) +- [🙌 Contributing](#-contributing-) +- [🤝 Acknowledgement](#-acknowledgement-) +- [🖊️ Citation](#️-citation-) +- [🎫 License](#-license-) +- [🏗️ Projects in OpenMMLab](#%EF%B8%8F-projects-in-openmmlab-) + +## 🥳 🚀 What's New [🔝](#-table-of-contents) + +💎 **v0.6.0** was released on 15/8/2023: + +- Support YOLOv5 instance segmentation +- Support YOLOX-Pose based on MMPose +- Add 15 minutes instance segmentation tutorial. +- YOLOv5 supports using mask annotation to optimize bbox +- Add Multi-scale training and testing docs + +For release history and update details, please refer to [changelog](https://mmyolo.readthedocs.io/en/latest/notes/changelog.html). + +### ✨ Highlight [🔝](#-table-of-contents) + +We are excited to announce our latest work on real-time object recognition tasks, **RTMDet**, a family of fully convolutional single-stage detectors. RTMDet not only achieves the best parameter-accuracy trade-off on object detection from tiny to extra-large model sizes but also obtains new state-of-the-art performance on instance segmentation and rotated object detection tasks. Details can be found in the [technical report](https://arxiv.org/abs/2212.07784). Pre-trained models are [here](configs/rtmdet). + +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/rtmdet-an-empirical-study-of-designing-real/real-time-instance-segmentation-on-mscoco)](https://paperswithcode.com/sota/real-time-instance-segmentation-on-mscoco?p=rtmdet-an-empirical-study-of-designing-real) +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/rtmdet-an-empirical-study-of-designing-real/object-detection-in-aerial-images-on-dota-1)](https://paperswithcode.com/sota/object-detection-in-aerial-images-on-dota-1?p=rtmdet-an-empirical-study-of-designing-real) +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/rtmdet-an-empirical-study-of-designing-real/object-detection-in-aerial-images-on-hrsc2016)](https://paperswithcode.com/sota/object-detection-in-aerial-images-on-hrsc2016?p=rtmdet-an-empirical-study-of-designing-real) + +| Task | Dataset | AP | FPS(TRT FP16 BS1 3090) | +| ------------------------ | ------- | ------------------------------------ | ---------------------- | +| Object Detection | COCO | 52.8 | 322 | +| Instance Segmentation | COCO | 44.6 | 188 | +| Rotated Object Detection | DOTA | 78.9(single-scale)/81.3(multi-scale) | 121 | + +
+ +
+
+MMYOLO currently implements the object detection and rotated object detection algorithms, and it brings a significant training speedup over the MMDetection versions: training is 2.6 times faster than the previous implementation.
+
+## 📖 Introduction [🔝](#-table-of-contents)
+
+MMYOLO is an open source toolbox for YOLO series algorithms based on PyTorch and [MMDetection](https://github.com/open-mmlab/mmdetection). It is a part of the [OpenMMLab](https://openmmlab.com/) project.
+
+The master branch works with **PyTorch 1.6+**.
+
+
+
+Major features + +- 🕹️ **Unified and convenient benchmark** + + MMYOLO unifies the implementation of modules in various YOLO algorithms and provides a unified benchmark. Users can compare and analyze in a fair and convenient way. + +- 📚 **Rich and detailed documentation** + + MMYOLO provides rich documentation for getting started, model deployment, advanced usages, and algorithm analysis, making it easy for users at different levels to get started and make extensions quickly. + +- 🧩 **Modular Design** + + MMYOLO decomposes the framework into different components where users can easily customize a model by combining different modules with various training and testing strategies. + +BaseModule-P5 + The figure above is contributed by RangeKing@GitHub, thank you very much! + +And the figure of P6 model is in [model_design.md](docs/en/recommended_topics/model_design.md). + +
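+
+The Modular Design described above is what the config system exposes in practice: a model is assembled from interchangeable backbone, neck, and head components that can be swapped or re-scaled from a config file. The fragment below is an illustrative sketch rather than a config shipped with the repo; it assumes a YOLOv5-S base config such as the one under `configs/yolov5/` and only overrides a few component parameters.
+
+```python
+# Illustrative MMYOLO config fragment (a sketch, not an official config file).
+# It inherits an assumed YOLOv5-S base config and overrides individual
+# components, which is how modules are combined in MMYOLO configs.
+_base_ = './yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py'
+
+# Re-scale the model by adjusting the depth/width multipliers; every other
+# setting (datasets, schedules, hooks) is inherited from the base config.
+deepen_factor = 0.67
+widen_factor = 0.75
+
+model = dict(
+    backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor),
+    neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor),
+    bbox_head=dict(head_module=dict(widen_factor=widen_factor)))
+```
+
+Because every component is resolved through the registry, custom or third-party modules registered with MMYOLO can be dropped in through the same mechanism.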
+
+## 🛠️ Installation [🔝](#-table-of-contents)
+
+MMYOLO relies on PyTorch, MMCV, MMEngine, and MMDetection. Below are quick steps for installation. Please refer to the [Install Guide](docs/en/get_started/installation.md) for more detailed instructions.
+
+```shell
+conda create -n mmyolo python=3.8 pytorch==1.10.1 torchvision==0.11.2 cudatoolkit=11.3 -c pytorch -y
+conda activate mmyolo
+pip install openmim
+mim install "mmengine>=0.6.0"
+mim install "mmcv>=2.0.0rc4,<2.1.0"
+mim install "mmdet>=3.0.0,<4.0.0"
+git clone https://github.com/open-mmlab/mmyolo.git
+cd mmyolo
+# Install albumentations
+pip install -r requirements/albu.txt
+# Install MMYOLO
+mim install -v -e .
+```
+
+## 👨‍🏫 Tutorial [🔝](#-table-of-contents)
+
+MMYOLO is based on MMDetection and adopts the same code structure and design approach. To make better use of it, please read the [MMDetection Overview](https://mmdetection.readthedocs.io/en/latest/get_started.html) first for an initial understanding of MMDetection.
+
+The usage of MMYOLO is almost identical to that of MMDetection, and all MMDetection tutorials apply; you can also consult the [MMDetection User Guide and Advanced Guide](https://mmdetection.readthedocs.io/en/3.x/).
+
+For the parts that differ from MMDetection, we have also prepared dedicated user guides and advanced guides; please read our [documentation](https://mmyolo.readthedocs.io/en/latest/).
+
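+A quick way to confirm that the installation steps above succeeded before working through the guides below is to import the installed packages and print their versions. This is a minimal sketch that assumes only the environment created above:
+
+```python
+# Post-install sanity check (a sketch; it assumes the `mmyolo` conda environment
+# created above is active and MMYOLO was installed with `mim install -v -e .`).
+import mmcv
+import mmdet
+import mmengine
+import mmyolo
+
+for pkg in (mmengine, mmcv, mmdet, mmyolo):
+    print(f'{pkg.__name__}: {pkg.__version__}')
+```
+
+If the imports succeed and the printed versions satisfy the constraints used in the install commands, the training and testing entry points under `tools/` should work as described in the guides below.
+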
+Get Started + +- [Overview](docs/en/get_started/overview.md) +- [Dependencies](docs/en/get_started/dependencies.md) +- [Installation](docs/en/get_started/installation.md) +- [15 minutes object detection](docs/en/get_started/15_minutes_object_detection.md) +- [15 minutes rotated object detection](docs/en/get_started/15_minutes_rotated_object_detection.md) +- [15 minutes instance segmentation](docs/en/get_started/15_minutes_instance_segmentation.md) +- [Resources summary](docs/en/get_started/article.md) + +
+ +
+Recommended Topics + +- [How to contribute code to MMYOLO](docs/en/recommended_topics/contributing.md) +- [Training testing tricks](docs/en/recommended_topics/training_testing_tricks.md) +- [MMYOLO model design](docs/en/recommended_topics/model_design.md) +- [Algorithm principles and implementation](docs/en/recommended_topics/algorithm_descriptions/) +- [Replace the backbone network](docs/en/recommended_topics/replace_backbone.md) +- [MMYOLO model complexity analysis](docs/en/recommended_topics/complexity_analysis.md) +- [Annotation-to-deployment workflow for custom dataset](docs/en/recommended_topics/labeling_to_deployment_tutorials.md) +- [Visualization](docs/en/recommended_topics/visualization.md) +- [Model deployment](docs/en/recommended_topics/deploy/) +- [Troubleshooting steps](docs/en/recommended_topics/troubleshooting_steps.md) +- [MMYOLO application examples](docs/en/recommended_topics/application_examples/) +- [MM series repo essential basics](docs/en/recommended_topics/mm_basics.md) +- [Dataset preparation and description](docs/en/recommended_topics/dataset_preparation.md) + +
+ +
+Common Usage + +- [Resume training](docs/en/common_usage/resume_training.md) +- [Enabling and disabling SyncBatchNorm](docs/en/common_usage/syncbn.md) +- [Enabling AMP](docs/en/common_usage/amp_training.md) +- [Multi-scale training and testing](docs/en/common_usage/ms_training_testing.md) +- [TTA Related Notes](docs/en/common_usage/tta.md) +- [Add plugins to the backbone network](docs/en/common_usage/plugins.md) +- [Freeze layers](docs/en/common_usage/freeze_layers.md) +- [Output model predictions](docs/en/common_usage/output_predictions.md) +- [Set random seed](docs/en/common_usage/set_random_seed.md) +- [Module combination](docs/en/common_usage/module_combination.md) +- [Cross-library calls using mim](docs/en/common_usage/mim_usage.md) +- [Apply multiple Necks](docs/en/common_usage/multi_necks.md) +- [Specify specific device training or inference](docs/en/common_usage/specify_device.md) +- [Single and multi-channel application examples](docs/en/common_usage/single_multi_channel_applications.md) + +
+ +
+Useful Tools + +- [Browse coco json](docs/en/useful_tools/browse_coco_json.md) +- [Browse dataset](docs/en/useful_tools/browse_dataset.md) +- [Print config](docs/en/useful_tools/print_config.md) +- [Dataset analysis](docs/en/useful_tools/dataset_analysis.md) +- [Optimize anchors](docs/en/useful_tools/optimize_anchors.md) +- [Extract subcoco](docs/en/useful_tools/extract_subcoco.md) +- [Visualization scheduler](docs/en/useful_tools/vis_scheduler.md) +- [Dataset converters](docs/en/useful_tools/dataset_converters.md) +- [Download dataset](docs/en/useful_tools/download_dataset.md) +- [Log analysis](docs/en/useful_tools/log_analysis.md) +- [Model converters](docs/en/useful_tools/model_converters.md) + +
+ +
+Basic Tutorials + +- [Learn about configs with YOLOv5](docs/en/tutorials/config.md) +- [Data flow](docs/en/tutorials/data_flow.md) +- [Rotated detection](docs/en/tutorials/rotated_detection.md) +- [Custom Installation](docs/en/tutorials/custom_installation.md) +- [Common Warning Notes](docs/zh_cn/tutorials/warning_notes.md) +- [FAQ](docs/en/tutorials/faq.md) + +
+ +
+Advanced Tutorials + +- [MMYOLO cross-library application](docs/en/advanced_guides/cross-library_application.md) + +
+ +
+Descriptions + +- [Changelog](docs/en/notes/changelog.md) +- [Compatibility](docs/en/notes/compatibility.md) +- [Conventions](docs/en/notes/conventions.md) +- [Code Style](docs/en/notes/code_style.md) + +
+ +## 📊 Overview of Benchmark and Model Zoo [🔝](#-table-of-contents) + +
+ +
+ +Results and models are available in the [model zoo](docs/en/model_zoo.md). + +
+Supported Tasks + +- [x] Object detection +- [x] Rotated object detection + +
+ +
+Supported Algorithms + +- [x] [YOLOv5](configs/yolov5) +- [ ] [YOLOv5u](configs/yolov5/yolov5u) (Inference only) +- [x] [YOLOX](configs/yolox) +- [x] [RTMDet](configs/rtmdet) +- [x] [RTMDet-Rotated](configs/rtmdet) +- [x] [YOLOv6](configs/yolov6) +- [x] [YOLOv7](configs/yolov7) +- [x] [PPYOLOE](configs/ppyoloe) +- [x] [YOLOv8](configs/yolov8) + +
+ +
+Supported Datasets + +- [x] COCO Dataset +- [x] VOC Dataset +- [x] CrowdHuman Dataset +- [x] DOTA 1.0 Dataset + +
+ +
+
+ Module Components +
+ + + + + + + + + + + + + + + + + +
+ Backbones + + Necks + + Loss + + Common +
+
    +
  • YOLOv5CSPDarknet
  • +
  • YOLOv8CSPDarknet
  • +
  • YOLOXCSPDarknet
  • +
  • EfficientRep
  • +
  • CSPNeXt
  • +
  • YOLOv7Backbone
  • +
  • PPYOLOECSPResNet
  • +
  • mmdet backbone
  • +
  • mmcls backbone
  • +
  • timm
  • +
+
+
    +
  • YOLOv5PAFPN
  • +
  • YOLOv8PAFPN
  • +
  • YOLOv6RepPAFPN
  • +
  • YOLOXPAFPN
  • +
  • CSPNeXtPAFPN
  • +
  • YOLOv7PAFPN
  • +
  • PPYOLOECSPPAFPN
  • +
+
+
    +
  • IoULoss
  • +
  • mmdet loss
  • +
+
+
    +
+
+ +
+
+## ❓ FAQ [🔝](#-table-of-contents)
+
+Please refer to the [FAQ](docs/en/tutorials/faq.md) for frequently asked questions.
+
+## 🙌 Contributing [🔝](#-table-of-contents)
+
+We appreciate all contributions to improving MMYOLO. Ongoing projects can be found in our [GitHub Projects](https://github.com/open-mmlab/mmyolo/projects), and community users are welcome to participate in them. Please refer to [CONTRIBUTING.md](.github/CONTRIBUTING.md) for the contributing guidelines.
+
+## 🤝 Acknowledgement [🔝](#-table-of-contents)
+
+MMYOLO is an open source project contributed to by researchers and engineers from various colleges and companies. We appreciate all the contributors who implement their methods or add new features, as well as the users who give valuable feedback.
+We hope that the toolbox and benchmark can serve the growing research community by providing a flexible toolkit for re-implementing existing methods and developing new detectors.
+ +
+ +## 🖊️ Citation [🔝](#-table-of-contents) + +If you find this project useful in your research, please consider citing: + +```latex +@misc{mmyolo2022, + title={{MMYOLO: OpenMMLab YOLO} series toolbox and benchmark}, + author={MMYOLO Contributors}, + howpublished = {\url{https://github.com/open-mmlab/mmyolo}}, + year={2022} +} +``` + +## 🎫 License [🔝](#-table-of-contents) + +This project is released under the [GPL 3.0 license](LICENSE). + +## 🏗️ Projects in OpenMMLab [🔝](#-table-of-contents) + +- [MMEngine](https://github.com/open-mmlab/mmengine): OpenMMLab foundational library for training deep learning models. +- [MMCV](https://github.com/open-mmlab/mmcv): OpenMMLab foundational library for computer vision. +- [MMPreTrain](https://github.com/open-mmlab/mmpretrain): OpenMMLab pre-training toolbox and benchmark. +- [MMagic](https://github.com/open-mmlab/mmagic): Open**MM**Lab **A**dvanced, **G**enerative and **I**ntelligent **C**reation toolbox. +- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab detection toolbox and benchmark. +- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab's next-generation platform for general 3D object detection. +- [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab rotated object detection toolbox and benchmark. +- [MMYOLO](https://github.com/open-mmlab/mmyolo): OpenMMLab YOLO series toolbox and benchmark. +- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab semantic segmentation toolbox and benchmark. +- [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab text detection, recognition, and understanding toolbox. +- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab pose estimation toolbox and benchmark. +- [MMHuman3D](https://github.com/open-mmlab/mmhuman3d): OpenMMLab 3D human parametric model toolbox and benchmark. +- [MMSelfSup](https://github.com/open-mmlab/mmselfsup): OpenMMLab self-supervised learning toolbox and benchmark. +- [MMRazor](https://github.com/open-mmlab/mmrazor): OpenMMLab model compression toolbox and benchmark. +- [MMFewShot](https://github.com/open-mmlab/mmfewshot): OpenMMLab fewshot learning toolbox and benchmark. +- [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab's next-generation action understanding toolbox and benchmark. +- [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab video perception toolbox and benchmark. +- [MMFlow](https://github.com/open-mmlab/mmflow): OpenMMLab optical flow toolbox and benchmark. +- [MMEditing](https://github.com/open-mmlab/mmediting): OpenMMLab image and video editing toolbox. +- [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab image and video generative models toolbox. +- [MMDeploy](https://github.com/open-mmlab/mmdeploy): OpenMMLab model deployment framework. +- [MIM](https://github.com/open-mmlab/mim): MIM installs OpenMMLab packages. +- [MMEval](https://github.com/open-mmlab/mmeval): OpenMMLab machine learning evaluation library. +- [Playground](https://github.com/open-mmlab/playground): A central hub for gathering and showcasing amazing projects built upon OpenMMLab. diff --git a/third_party/mmyolo/README_zh-CN.md b/third_party/mmyolo/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..6eb4d95fe5c6d013d677482762d722b20ce826f0 --- /dev/null +++ b/third_party/mmyolo/README_zh-CN.md @@ -0,0 +1,468 @@ +
+ +
 
+
+ OpenMMLab 官网 + + + HOT + + +      + OpenMMLab 开放平台 + + + TRY IT OUT + + +
+
 
+ +[![PyPI](https://img.shields.io/pypi/v/mmyolo)](https://pypi.org/project/mmyolo) +[![docs](https://img.shields.io/badge/docs-latest-blue)](https://mmyolo.readthedocs.io/zh_CN/latest/) +[![deploy](https://github.com/open-mmlab/mmyolo/workflows/deploy/badge.svg)](https://github.com/open-mmlab/mmyolo/actions) +[![codecov](https://codecov.io/gh/open-mmlab/mmyolo/branch/main/graph/badge.svg)](https://codecov.io/gh/open-mmlab/mmyolo) +[![license](https://img.shields.io/github/license/open-mmlab/mmyolo.svg)](https://github.com/open-mmlab/mmyolo/blob/main/LICENSE) +[![open issues](https://isitmaintained.com/badge/open/open-mmlab/mmyolo.svg)](https://github.com/open-mmlab/mmyolo/issues) +[![issue resolution](https://isitmaintained.com/badge/resolution/open-mmlab/mmyolo.svg)](https://github.com/open-mmlab/mmyolo/issues) + +[📘使用文档](https://mmyolo.readthedocs.io/zh_CN/latest/) | +[🛠️安装教程](https://mmyolo.readthedocs.io/zh_CN/latest/get_started/installation.html) | +[👀模型库](https://mmyolo.readthedocs.io/zh_CN/latest/model_zoo.html) | +[🆕更新日志](https://mmyolo.readthedocs.io/zh_CN/latest/notes/changelog.html) | +[🤔报告问题](https://github.com/open-mmlab/mmyolo/issues/new/choose) + +
+ +
+ +[English](README.md) | 简体中文 + +
+ +
+ + + + + + + + + + + + + + + + + +
+ +## 📄 Table of Contents + +- [🥳 🚀 最新进展](#--最新进展-) + - [✨ 亮点](#-亮点-) +- [📖 简介](#-简介-) +- [🛠️ 安装](#️%EF%B8%8F-安装-) +- [👨‍🏫 教程](#-教程-) +- [📊 基准测试和模型库](#-基准测试和模型库-) +- [❓ 常见问题](#-常见问题-) +- [🙌 贡献指南](#-贡献指南-) +- [🤝 致谢](#🤝-致谢-) +- [🖊️ 引用](#️-引用-) +- [🎫 开源许可证](#-开源许可证-) +- [🏗️ OpenMMLab 的其他项目](#%EF%B8%8F-openmmlab-的其他项目-) +- [❤️ 欢迎加入 OpenMMLab 社区](#%EF%B8%8F-欢迎加入-openmmlab-社区-) + +## 🥳 🚀 最新进展 [🔝](#-table-of-contents) + +💎 **v0.6.0** 版本已经在 2023.8.15 发布: + +- 支持 YOLOv5 实例分割 +- 基于 MMPose 支持 YOLOX-Pose +- 添加 15 分钟的实例分割教程 +- YOLOv5 支持使用 mask 标注来优化边界框 +- 添加多尺度训练和测试文档 + +我们提供了实用的**脚本命令速查表** + +
+ +
+ +你可以点击[链接](https://pan.baidu.com/s/1QEaqT7YayUdEvh1an0gjHg?pwd=yolo),下载高清版 PDF 文件。 + +同时我们也推出了解读视频: + +| | 内容 | 视频 | 课程中的代码 | +| :-: | :--------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| 🌟 | 特征图可视化 | [![Link](https://i2.hdslb.com/bfs/archive/480a0eb41fce26e0acb65f82a74501418eee1032.jpg@112w_63h_1c.webp)](https://www.bilibili.com/video/BV188411s7o8) [![bilibili](https://img.shields.io/badge/dynamic/json?label=views&style=social&logo=bilibili&query=data.stat.view&url=https%3A%2F%2Fapi.bilibili.com%2Fx%2Fweb-interface%2Fview%3Fbvid%3DBV188411s7o8)](https://www.bilibili.com/video/BV188411s7o8) | [特征图可视化.ipynb](https://github.com/open-mmlab/OpenMMLabCourse/blob/main/codes/MMYOLO_tutorials/%5B%E5%B7%A5%E5%85%B7%E7%B1%BB%E7%AC%AC%E4%B8%80%E6%9C%9F%5D%E7%89%B9%E5%BE%81%E5%9B%BE%E5%8F%AF%E8%A7%86%E5%8C%96.ipynb) | +| 🌟 | 源码阅读和调试「必备」技巧 | [![Link](https://i2.hdslb.com/bfs/archive/790d2422c879ff20488910da1c4422b667ea6af7.jpg@112w_63h_1c.webp)](https://www.bilibili.com/video/BV1N14y1V7mB) [![bilibili](https://img.shields.io/badge/dynamic/json?label=views&style=social&logo=bilibili&query=data.stat.view&url=https%3A%2F%2Fapi.bilibili.com%2Fx%2Fweb-interface%2Fview%3Fbvid%3DBV1N14y1V7mB)](https://www.bilibili.com/video/BV1N14y1V7mB) | [源码阅读和调试「必备」技巧文档](https://zhuanlan.zhihu.com/p/580885852) | +| 🌟 | 10分钟换遍主干网络 | [![Link](http://i0.hdslb.com/bfs/archive/c51f1aef7c605856777249a7b4478f44bd69f3bd.jpg@112w_63h_1c.webp)](https://www.bilibili.com/video/BV1JG4y1d7GC) [![bilibili](https://img.shields.io/badge/dynamic/json?label=views&style=social&logo=bilibili&query=data.stat.view&url=https%3A%2F%2Fapi.bilibili.com%2Fx%2Fweb-interface%2Fview%3Fbvid%3DBV1JG4y1d7GC)](https://www.bilibili.com/video/BV1JG4y1d7GC) | [10分钟换遍主干网络文档](https://zhuanlan.zhihu.com/p/585641598)
[10分钟换遍主干网络.ipynb](https://github.com/open-mmlab/OpenMMLabCourse/blob/main/codes/MMYOLO_tutorials/[实用类第二期]10分钟换遍主干网络.ipynb) | +| 🌟 | 自定义数据集从标注到部署保姆级教程 | [![Link](https://i2.hdslb.com/bfs/archive/13f566c89a18c9c881713b63ec14da952d4c0b14.jpg@112w_63h_1c.webp)](https://www.bilibili.com/video/BV1RG4y137i5) [![bilibili](https://img.shields.io/badge/dynamic/json?label=views&style=social&logo=bilibili&query=data.stat.view&url=https%3A%2F%2Fapi.bilibili.com%2Fx%2Fweb-interface%2Fview%3Fbvid%3DBV1RG4y137i5)](https://www.bilibili.com/video/BV1JG4y1d7GC) | [自定义数据集从标注到部署保姆级教程](https://github.com/open-mmlab/mmyolo/blob/dev/docs/zh_cn/user_guides/custom_dataset.md) | +| 🌟 | 顶会第一步 · 模块自定义 | [![Link](http://i2.hdslb.com/bfs/archive/5b23d41ac57466824eaf185ef806ef734414e93b.jpg@112w_63h_1c.webp)](https://www.bilibili.com/video/BV1yd4y1j7VD) [![bilibili](https://img.shields.io/badge/dynamic/json?label=views&style=social&logo=bilibili&query=data.stat.view&url=https%3A%2F%2Fapi.bilibili.com%2Fx%2Fweb-interface%2Fview%3Fbvid%3DBV1yd4y1j7VD)](https://www.bilibili.com/video/BV1yd4y1j7VD) | [顶会第一步·模块自定义.ipynb](https://github.com/open-mmlab/OpenMMLabCourse/blob/main/codes/MMYOLO_tutorials/[实用类第四期]顶会第一步·模块自定义.ipynb) | + +完整视频列表请参考 [中文解读资源汇总 - 视频](https://mmyolo.readthedocs.io/zh_CN/latest/get_started/article.html) + +发布历史和更新细节请参考 [更新日志](https://mmyolo.readthedocs.io/zh_CN/latest/notes/changelog.html) + +### ✨ 亮点 [🔝](#-table-of-contents) + +我们很高兴向大家介绍我们在实时目标识别任务方面的最新成果 RTMDet,包含了一系列的全卷积单阶段检测模型。 RTMDet 不仅在从 tiny 到 extra-large 尺寸的目标检测模型上实现了最佳的参数量和精度的平衡,而且在实时实例分割和旋转目标检测任务上取得了最先进的成果。 更多细节请参阅[技术报告](https://arxiv.org/abs/2212.07784)。 预训练模型可以在[这里](configs/rtmdet)找到。 + +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/rtmdet-an-empirical-study-of-designing-real/real-time-instance-segmentation-on-mscoco)](https://paperswithcode.com/sota/real-time-instance-segmentation-on-mscoco?p=rtmdet-an-empirical-study-of-designing-real) +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/rtmdet-an-empirical-study-of-designing-real/object-detection-in-aerial-images-on-dota-1)](https://paperswithcode.com/sota/object-detection-in-aerial-images-on-dota-1?p=rtmdet-an-empirical-study-of-designing-real) +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/rtmdet-an-empirical-study-of-designing-real/object-detection-in-aerial-images-on-hrsc2016)](https://paperswithcode.com/sota/object-detection-in-aerial-images-on-hrsc2016?p=rtmdet-an-empirical-study-of-designing-real) + +| Task | Dataset | AP | FPS(TRT FP16 BS1 3090) | +| ------------------------ | ------- | ------------------------------------ | ---------------------- | +| Object Detection | COCO | 52.8 | 322 | +| Instance Segmentation | COCO | 44.6 | 188 | +| Rotated Object Detection | DOTA | 78.9(single-scale)/81.3(multi-scale) | 121 | + +
+ +
+ +MMYOLO 中目前实现了目标检测和旋转框目标检测算法,但是相比 MMDeteciton 版本有显著训练加速,训练速度相比原先版本提升 2.6 倍。 + +## 📖 简介 [🔝](#-table-of-contents) + +MMYOLO 是一个基于 PyTorch 和 MMDetection 的 YOLO 系列算法开源工具箱。它是 [OpenMMLab](https://openmmlab.com/) 项目的一部分。 + +主分支代码目前支持 PyTorch 1.6 以上的版本。 + + +
+主要特性 + +- 🕹️ **统一便捷的算法评测** + + MMYOLO 统一了各类 YOLO 算法模块的实现, 并提供了统一的评测流程,用户可以公平便捷地进行对比分析。 + +- 📚 **丰富的入门和进阶文档** + + MMYOLO 提供了从入门到部署到进阶和算法解析等一系列文档,方便不同用户快速上手和扩展。 + +- 🧩 **模块化设计** + + MMYOLO 将框架解耦成不同的模块组件,通过组合不同的模块和训练测试策略,用户可以便捷地构建自定义模型。 + +基类-P5 + 图为 RangeKing@GitHub 提供,非常感谢! + +P6 模型图详见 [model_design.md](docs/zh_cn/recommended_topics/model_design.md)。 + +
+ +## 🛠️ 安装 [🔝](#-table-of-contents) + +MMYOLO 依赖 PyTorch, MMCV, MMEngine 和 MMDetection,以下是安装的简要步骤。 更详细的安装指南请参考[安装文档](docs/zh_cn/get_started/installation.md)。 + +```shell +conda create -n mmyolo python=3.8 pytorch==1.10.1 torchvision==0.11.2 cudatoolkit=11.3 -c pytorch -y +conda activate mmyolo +pip install openmim +mim install "mmengine>=0.6.0" +mim install "mmcv>=2.0.0rc4,<2.1.0" +mim install "mmdet>=3.0.0,<4.0.0" +git clone https://github.com/open-mmlab/mmyolo.git +cd mmyolo +# Install albumentations +pip install -r requirements/albu.txt +# Install MMYOLO +mim install -v -e . +``` + +## 👨‍🏫 教程 [🔝](#-table-of-contents) + +MMYOLO 基于 MMDetection 开源库,并且采用相同的代码组织和设计方式。为了更好的使用本开源库,请先阅读 [MMDetection 概述](https://mmdetection.readthedocs.io/zh_CN/latest/get_started.html) 对 MMDetection 进行初步地了解。 + +MMYOLO 用法和 MMDetection 几乎一致,所有教程都是通用的,你也可以了解 [MMDetection 用户指南和进阶指南](https://mmdetection.readthedocs.io/zh_CN/3.x/) 。 + +针对和 MMDetection 不同的部分,我们也准备了用户指南和进阶指南,请阅读我们的 [文档](https://mmyolo.readthedocs.io/zh_CN/latest/) 。 + +
+开启 MMYOLO 之旅 + +- [概述](docs/zh_cn/get_started/overview.md) +- [依赖](docs/zh_cn/get_started/dependencies.md) +- [安装和验证](docs/zh_cn/get_started/installation.md) +- [15 分钟上手 MMYOLO 目标检测](docs/zh_cn/get_started/15_minutes_object_detection.md) +- [15 分钟上手 MMYOLO 旋转框目标检测](docs/zh_cn/get_started/15_minutes_rotated_object_detection.md) +- [15 分钟上手 MMYOLO 实例分割](docs/zh_cn/get_started/15_minutes_instance_segmentation.md) +- [中文解读资源汇总](docs/zh_cn/get_started/article.md) + +
+ +
+推荐专题 + +- [如何给 MMYOLO 贡献代码](docs/zh_cn/recommended_topics/contributing.md) +- [训练和测试技巧](docs/zh_cn/recommended_topics/training_testing_tricks.md) +- [MMYOLO 模型结构设计](docs/zh_cn/recommended_topics/model_design.md) +- [原理和实现全解析](docs/zh_cn/recommended_topics/algorithm_descriptions/) +- [轻松更换主干网络](docs/zh_cn/recommended_topics/replace_backbone.md) +- [MMYOLO 模型复杂度分析](docs/zh_cn/recommended_topics/complexity_analysis.md) +- [标注+训练+测试+部署全流程](docs/zh_cn/recommended_topics/labeling_to_deployment_tutorials.md) +- [关于可视化的一切](docs/zh_cn/recommended_topics/visualization.md) +- [模型部署流程](docs/zh_cn/recommended_topics/deploy/) +- [常见错误排查步骤](docs/zh_cn/recommended_topics/troubleshooting_steps.md) +- [MMYOLO 应用范例介绍](docs/zh_cn/recommended_topics/application_examples/) +- [MM 系列 Repo 必备基础](docs/zh_cn/recommended_topics/mm_basics.md) +- [数据集准备和说明](docs/zh_cn/recommended_topics/dataset_preparation.md) + +
+ +
+常用功能 + +- [恢复训练](docs/zh_cn/common_usage/resume_training.md) +- [开启和关闭 SyncBatchNorm](docs/zh_cn/common_usage/syncbn.md) +- [开启混合精度训练](docs/zh_cn/common_usage/amp_training.md) +- [多尺度训练和测试](docs/zh_cn/common_usage/ms_training_testing.md) +- [测试时增强相关说明](docs/zh_cn/common_usage/tta.md) +- [给主干网络增加插件](docs/zh_cn/common_usage/plugins.md) +- [冻结指定网络层权重](docs/zh_cn/common_usage/freeze_layers.md) +- [输出模型预测结果](docs/zh_cn/common_usage/output_predictions.md) +- [设置随机种子](docs/zh_cn/common_usage/set_random_seed.md) +- [算法组合替换教程](docs/zh_cn/common_usage/module_combination.md) +- [使用 mim 跨库调用其他 OpenMMLab 仓库的脚本](docs/zh_cn/common_usage/mim_usage.md) +- [应用多个 Neck](docs/zh_cn/common_usage/multi_necks.md) +- [指定特定设备训练或推理](docs/zh_cn/common_usage/specify_device.md) +- [单通道和多通道应用案例](docs/zh_cn/common_usage/single_multi_channel_applications.md) +- [MM 系列开源库注册表](docs/zh_cn/common_usage/registries_info.md) + +
+ +
+实用工具 + +- [可视化 COCO 标签](docs/zh_cn/useful_tools/browse_coco_json.md) +- [可视化数据集](docs/zh_cn/useful_tools/browse_dataset.md) +- [打印完整配置文件](docs/zh_cn/useful_tools/print_config.md) +- [可视化数据集分析结果](docs/zh_cn/useful_tools/dataset_analysis.md) +- [优化锚框尺寸](docs/zh_cn/useful_tools/optimize_anchors.md) +- [提取 COCO 子集](docs/zh_cn/useful_tools/extract_subcoco.md) +- [可视化优化器参数策略](docs/zh_cn/useful_tools/vis_scheduler.md) +- [数据集转换](docs/zh_cn/useful_tools/dataset_converters.md) +- [数据集下载](docs/zh_cn/useful_tools/download_dataset.md) +- [日志分析](docs/zh_cn/useful_tools/log_analysis.md) +- [模型转换](docs/zh_cn/useful_tools/model_converters.md) + +
+ +
+基础教程 + +- [学习 YOLOv5 配置文件](docs/zh_cn/tutorials/config.md) +- [数据流](docs/zh_cn/tutorials/data_flow.md) +- [旋转目标检测](docs/zh_cn/tutorials/rotated_detection.md) +- [自定义安装](docs/zh_cn/tutorials/custom_installation.md) +- [常见警告说明](docs/zh_cn/tutorials/warning_notes.md) +- [常见问题](docs/zh_cn/tutorials/faq.md) + +
+ +
+进阶教程 + +- [MMYOLO 跨库应用解析](docs/zh_cn/advanced_guides/cross-library_application.md) + +
+ +
+说明 + +- [更新日志](docs/zh_cn/notes/changelog.md) +- [兼容性说明](docs/zh_cn/notes/compatibility.md) +- [默认约定](docs/zh_cn/notes/conventions.md) +- [代码规范](docs/zh_cn/notes/code_style.md) + +
+ +## 📊 基准测试和模型库 [🔝](#-table-of-contents) + +
+ +
+ +测试结果和模型可以在 [模型库](docs/zh_cn/model_zoo.md) 中找到。 + +
+支持的任务 + +- [x] 目标检测 +- [x] 旋转框目标检测 + +
+ +
+支持的算法 + +- [x] [YOLOv5](configs/yolov5) +- [ ] [YOLOv5u](configs/yolov5/yolov5u) (仅推理) +- [x] [YOLOX](configs/yolox) +- [x] [RTMDet](configs/rtmdet) +- [x] [RTMDet-Rotated](configs/rtmdet) +- [x] [YOLOv6](configs/yolov6) +- [x] [YOLOv7](configs/yolov7) +- [x] [PPYOLOE](configs/ppyoloe) +- [x] [YOLOv8](configs/yolov8) + +
+ +
+支持的数据集 + +- [x] COCO Dataset +- [x] VOC Dataset +- [x] CrowdHuman Dataset +- [x] DOTA 1.0 Dataset + +
+ +
+
+ 模块组件 +
+ + + + + + + + + + + + + + + + + +
+ Backbones + + Necks + + Loss + + Common +
+
    +
  • YOLOv5CSPDarknet
  • +
  • YOLOv8CSPDarknet
  • +
  • YOLOXCSPDarknet
  • +
  • EfficientRep
  • +
  • CSPNeXt
  • +
  • YOLOv7Backbone
  • +
  • PPYOLOECSPResNet
  • +
  • mmdet backbone
  • +
  • mmcls backbone
  • +
  • timm
  • +
+
+
    +
  • YOLOv5PAFPN
  • +
  • YOLOv8PAFPN
  • +
  • YOLOv6RepPAFPN
  • +
  • YOLOXPAFPN
  • +
  • CSPNeXtPAFPN
  • +
  • YOLOv7PAFPN
  • +
  • PPYOLOECSPPAFPN
  • +
+
+
    +
  • IoULoss
  • +
  • mmdet loss
  • +
+
+
    +
+
+ +
+ +## ❓ 常见问题 [🔝](#-table-of-contents) + +请参考 [FAQ](docs/zh_cn/tutorials/faq.md) 了解其他用户的常见问题。 + +## 🙌 贡献指南 [🔝](#-table-of-contents) + +我们感谢所有的贡献者为改进和提升 MMYOLO 所作出的努力。我们将正在进行中的项目添加进了[GitHub Projects](https://github.com/open-mmlab/mmyolo/projects)页面,非常欢迎社区用户能参与进这些项目中来。请参考[贡献指南](.github/CONTRIBUTING.md)来了解参与项目贡献的相关指引。 + +## 🤝 致谢 [🔝](#-table-of-contents) + +MMYOLO 是一款由来自不同高校和企业的研发人员共同参与贡献的开源项目。我们感谢所有为项目提供算法复现和新功能支持的贡献者,以及提供宝贵反馈的用户。 我们希望这个工具箱和基准测试可以为社区提供灵活的代码工具,供用户复现已有算法并开发自己的新模型,从而不断为开源社区提供贡献。 + +
+ +
+ +## 🖊️ 引用 [🔝](#-table-of-contents) + +如果你觉得本项目对你的研究工作有所帮助,请参考如下 bibtex 引用 MMYOLO + +```latex +@misc{mmyolo2022, + title={{MMYOLO: OpenMMLab YOLO} series toolbox and benchmark}, + author={MMYOLO Contributors}, + howpublished = {\url{https://github.com/open-mmlab/mmyolo}}, + year={2022} +} +``` + +## 🎫 开源许可证 [🔝](#-table-of-contents) + +该项目采用 [GPL 3.0 开源许可证](LICENSE)。 + +## 🏗️ OpenMMLab 的其他项目 [🔝](#-table-of-contents) + +- [MMEngine](https://github.com/open-mmlab/mmengine): OpenMMLab 深度学习模型训练基础库 +- [MMCV](https://github.com/open-mmlab/mmcv): OpenMMLab 计算机视觉基础库 +- [MMPreTrain](https://github.com/open-mmlab/mmpretrain): OpenMMLab 深度学习预训练工具箱 +- [MMagic](https://github.com/open-mmlab/mmagic): OpenMMLab 新一代人工智能内容生成(AIGC)工具箱 +- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab 目标检测工具箱 +- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab 新一代通用 3D 目标检测平台 +- [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab 旋转框检测工具箱与测试基准 +- [MMYOLO](https://github.com/open-mmlab/mmyolo): OpenMMLab YOLO 系列工具箱 +- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab 语义分割工具箱 +- [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab 全流程文字检测识别理解工具包 +- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab 姿态估计工具箱 +- [MMHuman3D](https://github.com/open-mmlab/mmhuman3d): OpenMMLab 人体参数化模型工具箱与测试基准 +- [MMSelfSup](https://github.com/open-mmlab/mmselfsup): OpenMMLab 自监督学习工具箱与测试基准 +- [MMRazor](https://github.com/open-mmlab/mmrazor): OpenMMLab 模型压缩工具箱与测试基准 +- [MMFewShot](https://github.com/open-mmlab/mmfewshot): OpenMMLab 少样本学习工具箱与测试基准 +- [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab 新一代视频理解工具箱 +- [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab 一体化视频目标感知平台 +- [MMFlow](https://github.com/open-mmlab/mmflow): OpenMMLab 光流估计工具箱与测试基准 +- [MMEditing](https://github.com/open-mmlab/mmediting): OpenMMLab 图像视频编辑工具箱 +- [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab 图片视频生成模型工具箱 +- [MMDeploy](https://github.com/open-mmlab/mmdeploy): OpenMMLab 模型部署框架 +- [MIM](https://github.com/open-mmlab/mim): MIM 是 OpenMMlab 项目、算法、模型的统一入口 +- [MMEval](https://github.com/open-mmlab/mmeval): OpenMMLab 机器学习算法评测库 +- [Playground](https://github.com/open-mmlab/playground): 收集和展示 OpenMMLab 相关的前沿、有趣的社区项目 + +## ❤️ 欢迎加入 OpenMMLab 社区 [🔝](#-table-of-contents) + +扫描下方的二维码可关注 OpenMMLab 团队的 [知乎官方账号](https://www.zhihu.com/people/openmmlab),加入 OpenMMLab 团队的 [官方交流 QQ 群](https://jq.qq.com/?_wv=1027&k=aCvMxdr3) + +
+ +
+ +我们会在 OpenMMLab 社区为大家 + +- 📢 分享 AI 框架的前沿核心技术 +- 💻 解读 PyTorch 常用模块源码 +- 📰 发布 OpenMMLab 的相关新闻 +- 🚀 介绍 OpenMMLab 开发的前沿算法 +- 🏃 获取更高效的问题答疑和意见反馈 +- 🔥 提供与各行各业开发者充分交流的平台 + +干货满满 📘,等你来撩 💗,OpenMMLab 社区期待您的加入 👬 diff --git a/third_party/mmyolo/configs/_base_/default_runtime.py b/third_party/mmyolo/configs/_base_/default_runtime.py new file mode 100644 index 0000000000000000000000000000000000000000..098f220573cf481056f2f55f0621198270d51c49 --- /dev/null +++ b/third_party/mmyolo/configs/_base_/default_runtime.py @@ -0,0 +1,43 @@ +default_scope = 'mmyolo' + +default_hooks = dict( + timer=dict(type='IterTimerHook'), + logger=dict(type='LoggerHook', interval=50), + param_scheduler=dict(type='ParamSchedulerHook'), + checkpoint=dict(type='CheckpointHook', interval=1), + sampler_seed=dict(type='DistSamplerSeedHook'), + visualization=dict(type='mmdet.DetVisualizationHook')) + +env_cfg = dict( + cudnn_benchmark=False, + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + dist_cfg=dict(backend='nccl'), +) + +vis_backends = [dict(type='LocalVisBackend')] +visualizer = dict( + type='mmdet.DetLocalVisualizer', + vis_backends=vis_backends, + name='visualizer') +log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True) + +log_level = 'INFO' +load_from = None +resume = False + +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +# data_root = 's3://openmmlab/datasets/detection/coco/' + +# Method 2: Use `backend_args`, `file_client_args` in versions +# before MMDet 3.0.0rc6 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) + +backend_args = None diff --git a/third_party/mmyolo/configs/_base_/det_p5_tta.py b/third_party/mmyolo/configs/_base_/det_p5_tta.py new file mode 100644 index 0000000000000000000000000000000000000000..8df0d5ea8db46fe748cc8fe1074aa928c64b4309 --- /dev/null +++ b/third_party/mmyolo/configs/_base_/det_p5_tta.py @@ -0,0 +1,58 @@ +# TODO: Need to solve the problem of multiple backend_args parameters +# _backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) + +_backend_args = None + +tta_model = dict( + type='mmdet.DetTTAModel', + tta_cfg=dict(nms=dict(type='nms', iou_threshold=0.65), max_per_img=300)) + +img_scales = [(640, 640), (320, 320), (960, 960)] + +# LoadImageFromFile +# / | \ +# (RatioResize,LetterResize) (RatioResize,LetterResize) (RatioResize,LetterResize) # noqa +# / \ / \ / \ +# RandomFlip RandomFlip RandomFlip RandomFlip RandomFlip RandomFlip # noqa +# | | | | | | +# LoadAnn LoadAnn LoadAnn LoadAnn LoadAnn LoadAnn +# | | | | | | +# PackDetIn PackDetIn PackDetIn PackDetIn PackDetIn PackDetIn # noqa + +_multiscale_resize_transforms = [ + dict( + type='Compose', + transforms=[ + dict(type='YOLOv5KeepRatioResize', scale=s), + dict( + type='LetterResize', + scale=s, + allow_scale_up=False, + pad_val=dict(img=114)) + ]) for s in img_scales +] + +tta_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_backend_args), + dict( + type='TestTimeAug', + transforms=[ + _multiscale_resize_transforms, + [ + dict(type='mmdet.RandomFlip', prob=1.), + dict(type='mmdet.RandomFlip', prob=0.) 
+ ], [dict(type='mmdet.LoadAnnotations', with_bbox=True)], + [ + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'flip', + 'flip_direction')) + ] + ]) +] diff --git a/third_party/mmyolo/configs/_base_/pose/coco.py b/third_party/mmyolo/configs/_base_/pose/coco.py new file mode 100644 index 0000000000000000000000000000000000000000..865a95bc02fedd318f32d2e7aa8397147d78fdb5 --- /dev/null +++ b/third_party/mmyolo/configs/_base_/pose/coco.py @@ -0,0 +1,181 @@ +dataset_info = dict( + dataset_name='coco', + paper_info=dict( + author='Lin, Tsung-Yi and Maire, Michael and ' + 'Belongie, Serge and Hays, James and ' + 'Perona, Pietro and Ramanan, Deva and ' + r'Doll{\'a}r, Piotr and Zitnick, C Lawrence', + title='Microsoft coco: Common objects in context', + container='European conference on computer vision', + year='2014', + homepage='http://cocodataset.org/', + ), + keypoint_info={ + 0: + dict(name='nose', id=0, color=[51, 153, 255], type='upper', swap=''), + 1: + dict( + name='left_eye', + id=1, + color=[51, 153, 255], + type='upper', + swap='right_eye'), + 2: + dict( + name='right_eye', + id=2, + color=[51, 153, 255], + type='upper', + swap='left_eye'), + 3: + dict( + name='left_ear', + id=3, + color=[51, 153, 255], + type='upper', + swap='right_ear'), + 4: + dict( + name='right_ear', + id=4, + color=[51, 153, 255], + type='upper', + swap='left_ear'), + 5: + dict( + name='left_shoulder', + id=5, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 6: + dict( + name='right_shoulder', + id=6, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 7: + dict( + name='left_elbow', + id=7, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 8: + dict( + name='right_elbow', + id=8, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 9: + dict( + name='left_wrist', + id=9, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 10: + dict( + name='right_wrist', + id=10, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 11: + dict( + name='left_hip', + id=11, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 12: + dict( + name='right_hip', + id=12, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 13: + dict( + name='left_knee', + id=13, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 14: + dict( + name='right_knee', + id=14, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 15: + dict( + name='left_ankle', + id=15, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 16: + dict( + name='right_ankle', + id=16, + color=[255, 128, 0], + type='lower', + swap='left_ankle') + }, + skeleton_info={ + 0: + dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]), + 1: + dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]), + 2: + dict(link=('right_ankle', 'right_knee'), id=2, color=[255, 128, 0]), + 3: + dict(link=('right_knee', 'right_hip'), id=3, color=[255, 128, 0]), + 4: + dict(link=('left_hip', 'right_hip'), id=4, color=[51, 153, 255]), + 5: + dict(link=('left_shoulder', 'left_hip'), id=5, color=[51, 153, 255]), + 6: + dict(link=('right_shoulder', 'right_hip'), id=6, color=[51, 153, 255]), + 7: + dict( + link=('left_shoulder', 'right_shoulder'), + id=7, + color=[51, 153, 255]), + 8: + dict(link=('left_shoulder', 'left_elbow'), id=8, color=[0, 255, 0]), + 9: + dict( + link=('right_shoulder', 'right_elbow'), id=9, color=[255, 128, 0]), + 10: + dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 
255, 0]), + 11: + dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]), + 12: + dict(link=('left_eye', 'right_eye'), id=12, color=[51, 153, 255]), + 13: + dict(link=('nose', 'left_eye'), id=13, color=[51, 153, 255]), + 14: + dict(link=('nose', 'right_eye'), id=14, color=[51, 153, 255]), + 15: + dict(link=('left_eye', 'left_ear'), id=15, color=[51, 153, 255]), + 16: + dict(link=('right_eye', 'right_ear'), id=16, color=[51, 153, 255]), + 17: + dict(link=('left_ear', 'left_shoulder'), id=17, color=[51, 153, 255]), + 18: + dict( + link=('right_ear', 'right_shoulder'), id=18, color=[51, 153, 255]) + }, + joint_weights=[ + 1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.5, 1.5, 1., 1., 1.2, 1.2, 1.5, + 1.5 + ], + sigmas=[ + 0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072, 0.062, + 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089 + ]) diff --git a/third_party/mmyolo/configs/deploy/base_dynamic.py b/third_party/mmyolo/configs/deploy/base_dynamic.py new file mode 100644 index 0000000000000000000000000000000000000000..747c21fd2bf0523c7d1e2ace67cff3f3d6612c2a --- /dev/null +++ b/third_party/mmyolo/configs/deploy/base_dynamic.py @@ -0,0 +1,17 @@ +_base_ = ['./base_static.py'] +onnx_config = dict( + dynamic_axes={ + 'input': { + 0: 'batch', + 2: 'height', + 3: 'width' + }, + 'dets': { + 0: 'batch', + 1: 'num_dets' + }, + 'labels': { + 0: 'batch', + 1: 'num_dets' + } + }) diff --git a/third_party/mmyolo/configs/deploy/base_static.py b/third_party/mmyolo/configs/deploy/base_static.py new file mode 100644 index 0000000000000000000000000000000000000000..dee01dd5dde1185b5e156b036f72fb3ccb0bf5bc --- /dev/null +++ b/third_party/mmyolo/configs/deploy/base_static.py @@ -0,0 +1,23 @@ +onnx_config = dict( + type='onnx', + export_params=True, + keep_initializers_as_inputs=False, + opset_version=11, + save_file='end2end.onnx', + input_names=['input'], + output_names=['dets', 'labels'], + input_shape=None, + optimize=True) +codebase_config = dict( + type='mmyolo', + task='ObjectDetection', + model_type='end2end', + post_processing=dict( + score_threshold=0.05, + confidence_threshold=0.005, + iou_threshold=0.5, + max_output_boxes_per_class=200, + pre_top_k=5000, + keep_top_k=100, + background_label_id=-1), + module=['mmyolo.deploy']) diff --git a/third_party/mmyolo/configs/deploy/detection_onnxruntime_dynamic.py b/third_party/mmyolo/configs/deploy/detection_onnxruntime_dynamic.py new file mode 100644 index 0000000000000000000000000000000000000000..14f4a12115f403fb4d091db9c07f925ba2ad83ec --- /dev/null +++ b/third_party/mmyolo/configs/deploy/detection_onnxruntime_dynamic.py @@ -0,0 +1,15 @@ +_base_ = ['./base_dynamic.py'] +codebase_config = dict( + type='mmyolo', + task='ObjectDetection', + model_type='end2end', + post_processing=dict( + score_threshold=0.05, + confidence_threshold=0.005, + iou_threshold=0.5, + max_output_boxes_per_class=200, + pre_top_k=5000, + keep_top_k=100, + background_label_id=-1), + module=['mmyolo.deploy']) +backend_config = dict(type='onnxruntime') diff --git a/third_party/mmyolo/configs/deploy/detection_onnxruntime_static.py b/third_party/mmyolo/configs/deploy/detection_onnxruntime_static.py new file mode 100644 index 0000000000000000000000000000000000000000..3eac8ca75715b711bdf03784dbb977a81bf444d3 --- /dev/null +++ b/third_party/mmyolo/configs/deploy/detection_onnxruntime_static.py @@ -0,0 +1,15 @@ +_base_ = ['./base_static.py'] +codebase_config = dict( + type='mmyolo', + task='ObjectDetection', + model_type='end2end', + post_processing=dict( + score_threshold=0.05, + 
confidence_threshold=0.005, + iou_threshold=0.5, + max_output_boxes_per_class=200, + pre_top_k=5000, + keep_top_k=100, + background_label_id=-1), + module=['mmyolo.deploy']) +backend_config = dict(type='onnxruntime') diff --git a/third_party/mmyolo/configs/deploy/detection_rknn-fp16_static-320x320.py b/third_party/mmyolo/configs/deploy/detection_rknn-fp16_static-320x320.py new file mode 100644 index 0000000000000000000000000000000000000000..b7bd31331ebae8374dc06f9ed4e0e82a3204e36f --- /dev/null +++ b/third_party/mmyolo/configs/deploy/detection_rknn-fp16_static-320x320.py @@ -0,0 +1,9 @@ +_base_ = ['./base_static.py'] +onnx_config = dict( + input_shape=[320, 320], output_names=['feat0', 'feat1', 'feat2']) +codebase_config = dict(model_type='rknn') +backend_config = dict( + type='rknn', + common_config=dict(target_platform='rv1126', optimization_level=1), + quantization_config=dict(do_quantization=False, dataset=None), + input_size_list=[[3, 320, 320]]) diff --git a/third_party/mmyolo/configs/deploy/detection_rknn-int8_static-320x320.py b/third_party/mmyolo/configs/deploy/detection_rknn-int8_static-320x320.py new file mode 100644 index 0000000000000000000000000000000000000000..10c96b2f26d27be28b384612d9ae8ee2cae84983 --- /dev/null +++ b/third_party/mmyolo/configs/deploy/detection_rknn-int8_static-320x320.py @@ -0,0 +1,9 @@ +_base_ = ['./base_static.py'] +onnx_config = dict( + input_shape=[320, 320], output_names=['feat0', 'feat1', 'feat2']) +codebase_config = dict(model_type='rknn') +backend_config = dict( + type='rknn', + common_config=dict(target_platform='rv1126', optimization_level=1), + quantization_config=dict(do_quantization=True, dataset=None), + input_size_list=[[3, 320, 320]]) diff --git a/third_party/mmyolo/configs/deploy/detection_tensorrt-fp16_dynamic-192x192-960x960.py b/third_party/mmyolo/configs/deploy/detection_tensorrt-fp16_dynamic-192x192-960x960.py new file mode 100644 index 0000000000000000000000000000000000000000..da565b6c341add02a74579a734eb4cb123847e6d --- /dev/null +++ b/third_party/mmyolo/configs/deploy/detection_tensorrt-fp16_dynamic-192x192-960x960.py @@ -0,0 +1,13 @@ +_base_ = ['./base_dynamic.py'] +backend_config = dict( + type='tensorrt', + common_config=dict(fp16_mode=True, max_workspace_size=1 << 30), + model_inputs=[ + dict( + input_shapes=dict( + input=dict( + min_shape=[1, 3, 192, 192], + opt_shape=[1, 3, 640, 640], + max_shape=[1, 3, 960, 960]))) + ]) +use_efficientnms = False # whether to replace TRTBatchedNMS plugin with EfficientNMS plugin # noqa E501 diff --git a/third_party/mmyolo/configs/deploy/detection_tensorrt-fp16_dynamic-64x64-1344x1344.py b/third_party/mmyolo/configs/deploy/detection_tensorrt-fp16_dynamic-64x64-1344x1344.py new file mode 100644 index 0000000000000000000000000000000000000000..bad8521afa6ebd4f9bb24a137b66fd1c66668361 --- /dev/null +++ b/third_party/mmyolo/configs/deploy/detection_tensorrt-fp16_dynamic-64x64-1344x1344.py @@ -0,0 +1,13 @@ +_base_ = ['./base_dynamic.py'] +backend_config = dict( + type='tensorrt', + common_config=dict(fp16_mode=True, max_workspace_size=1 << 32), + model_inputs=[ + dict( + input_shapes=dict( + input=dict( + min_shape=[1, 3, 64, 64], + opt_shape=[1, 3, 640, 640], + max_shape=[1, 3, 1344, 1344]))) + ]) +use_efficientnms = False # whether to replace TRTBatchedNMS plugin with EfficientNMS plugin # noqa E501 diff --git a/third_party/mmyolo/configs/deploy/detection_tensorrt-fp16_static-640x640.py b/third_party/mmyolo/configs/deploy/detection_tensorrt-fp16_static-640x640.py new file mode 100644 index 
0000000000000000000000000000000000000000..24d2a00d9340b2e3cd3392ab2881b68cccd75e8a --- /dev/null +++ b/third_party/mmyolo/configs/deploy/detection_tensorrt-fp16_static-640x640.py @@ -0,0 +1,14 @@ +_base_ = ['./base_static.py'] +onnx_config = dict(input_shape=(640, 640)) +backend_config = dict( + type='tensorrt', + common_config=dict(fp16_mode=True, max_workspace_size=1 << 30), + model_inputs=[ + dict( + input_shapes=dict( + input=dict( + min_shape=[1, 3, 640, 640], + opt_shape=[1, 3, 640, 640], + max_shape=[1, 3, 640, 640]))) + ]) +use_efficientnms = False # whether to replace TRTBatchedNMS plugin with EfficientNMS plugin # noqa E501 diff --git a/third_party/mmyolo/configs/deploy/detection_tensorrt-int8_dynamic-192x192-960x960.py b/third_party/mmyolo/configs/deploy/detection_tensorrt-int8_dynamic-192x192-960x960.py new file mode 100644 index 0000000000000000000000000000000000000000..21591c4d4e72a867392adf9c49cd60c6bb994e35 --- /dev/null +++ b/third_party/mmyolo/configs/deploy/detection_tensorrt-int8_dynamic-192x192-960x960.py @@ -0,0 +1,15 @@ +_base_ = ['./base_dynamic.py'] +backend_config = dict( + type='tensorrt', + common_config=dict( + fp16_mode=True, max_workspace_size=1 << 30, int8_mode=True), + model_inputs=[ + dict( + input_shapes=dict( + input=dict( + min_shape=[1, 3, 192, 192], + opt_shape=[1, 3, 640, 640], + max_shape=[1, 3, 960, 960]))) + ]) +calib_config = dict(create_calib=True, calib_file='calib_data.h5') +use_efficientnms = False # whether to replace TRTBatchedNMS plugin with EfficientNMS plugin # noqa E501 diff --git a/third_party/mmyolo/configs/deploy/detection_tensorrt-int8_static-640x640.py b/third_party/mmyolo/configs/deploy/detection_tensorrt-int8_static-640x640.py new file mode 100644 index 0000000000000000000000000000000000000000..ac394a6b3f854a0d23a1d37ff07d87c523c9784a --- /dev/null +++ b/third_party/mmyolo/configs/deploy/detection_tensorrt-int8_static-640x640.py @@ -0,0 +1,16 @@ +_base_ = ['./base_static.py'] +onnx_config = dict(input_shape=(640, 640)) +backend_config = dict( + type='tensorrt', + common_config=dict( + fp16_mode=True, max_workspace_size=1 << 30, int8_mode=True), + model_inputs=[ + dict( + input_shapes=dict( + input=dict( + min_shape=[1, 3, 640, 640], + opt_shape=[1, 3, 640, 640], + max_shape=[1, 3, 640, 640]))) + ]) +calib_config = dict(create_calib=True, calib_file='calib_data.h5') +use_efficientnms = False # whether to replace TRTBatchedNMS plugin with EfficientNMS plugin # noqa E501 diff --git a/third_party/mmyolo/configs/deploy/detection_tensorrt_dynamic-192x192-960x960.py b/third_party/mmyolo/configs/deploy/detection_tensorrt_dynamic-192x192-960x960.py new file mode 100644 index 0000000000000000000000000000000000000000..17047d7380043da537f2f6029bb4373986062c04 --- /dev/null +++ b/third_party/mmyolo/configs/deploy/detection_tensorrt_dynamic-192x192-960x960.py @@ -0,0 +1,13 @@ +_base_ = ['./base_dynamic.py'] +backend_config = dict( + type='tensorrt', + common_config=dict(fp16_mode=False, max_workspace_size=1 << 30), + model_inputs=[ + dict( + input_shapes=dict( + input=dict( + min_shape=[1, 3, 192, 192], + opt_shape=[1, 3, 640, 640], + max_shape=[1, 3, 960, 960]))) + ]) +use_efficientnms = False # whether to replace TRTBatchedNMS plugin with EfficientNMS plugin # noqa E501 diff --git a/third_party/mmyolo/configs/deploy/detection_tensorrt_static-640x640.py b/third_party/mmyolo/configs/deploy/detection_tensorrt_static-640x640.py new file mode 100644 index 0000000000000000000000000000000000000000..9ec49cc114cc0025310766be17bb5c45af56c516 --- 
/dev/null +++ b/third_party/mmyolo/configs/deploy/detection_tensorrt_static-640x640.py @@ -0,0 +1,14 @@ +_base_ = ['./base_static.py'] +onnx_config = dict(input_shape=(640, 640)) +backend_config = dict( + type='tensorrt', + common_config=dict(fp16_mode=False, max_workspace_size=1 << 30), + model_inputs=[ + dict( + input_shapes=dict( + input=dict( + min_shape=[1, 3, 640, 640], + opt_shape=[1, 3, 640, 640], + max_shape=[1, 3, 640, 640]))) + ]) +use_efficientnms = False # whether to replace TRTBatchedNMS plugin with EfficientNMS plugin # noqa E501 diff --git a/third_party/mmyolo/configs/deploy/model/yolov5_s-static.py b/third_party/mmyolo/configs/deploy/model/yolov5_s-static.py new file mode 100644 index 0000000000000000000000000000000000000000..11b7f6a040271f4c82fce8e8240b23ad54fd18c7 --- /dev/null +++ b/third_party/mmyolo/configs/deploy/model/yolov5_s-static.py @@ -0,0 +1,19 @@ +_base_ = '../../yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py' + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict( + type='LetterResize', + scale=_base_.img_scale, + allow_scale_up=False, + use_mini_pad=False, + ), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] + +test_dataloader = dict( + dataset=dict(pipeline=test_pipeline, batch_shapes_cfg=None)) diff --git a/third_party/mmyolo/configs/deploy/model/yolov6_s-static.py b/third_party/mmyolo/configs/deploy/model/yolov6_s-static.py new file mode 100644 index 0000000000000000000000000000000000000000..4f64438ca3d3ba1699e514bc2c8ee900d5095d4d --- /dev/null +++ b/third_party/mmyolo/configs/deploy/model/yolov6_s-static.py @@ -0,0 +1,19 @@ +_base_ = '../../yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco.py' + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict( + type='LetterResize', + scale=_base_.img_scale, + allow_scale_up=False, + use_mini_pad=False, + ), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] + +test_dataloader = dict( + dataset=dict(pipeline=test_pipeline, batch_shapes_cfg=None)) diff --git a/third_party/mmyolo/configs/ppyoloe/README.md b/third_party/mmyolo/configs/ppyoloe/README.md new file mode 100644 index 0000000000000000000000000000000000000000..70a5b2055bbbc79cc6e4817cc3d936780b09f73e --- /dev/null +++ b/third_party/mmyolo/configs/ppyoloe/README.md @@ -0,0 +1,43 @@ +# PPYOLOE + + + +## Abstract + +PP-YOLOE is an excellent single-stage anchor-free model based on PP-YOLOv2, surpassing a variety of popular YOLO models. PP-YOLOE has a series of models, named s/m/l/x, which are configured through width multiplier and depth multiplier. PP-YOLOE avoids using special operators, such as Deformable Convolution or Matrix NMS, to be deployed friendly on various hardware. + +
+ +
+ +
+ +PPYOLOE-PLUS-l model structure +
+ +## Results and models + +### PPYOLOE+ COCO + +| Backbone | Arch | Size | Epoch | SyncBN | Mem (GB) | Box AP | Config | Download | +| :---------: | :--: | :--: | :---: | :----: | :------: | :----: | :----------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| PPYOLOE+ -s | P5 | 640 | 80 | Yes | 4.7 | 43.5 | [config](./ppyoloe_plus_s_fast_8xb8-80e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_plus_s_fast_8xb8-80e_coco/ppyoloe_plus_s_fast_8xb8-80e_coco_20230101_154052-9fee7619.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_plus_s_fast_8xb8-80e_coco/ppyoloe_plus_s_fast_8xb8-80e_coco_20230101_154052.log.json) | +| PPYOLOE+ -m | P5 | 640 | 80 | Yes | 8.4 | 49.5 | [config](./ppyoloe_plus_m_fast_8xb8-80e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_plus_m_fast_8xb8-80e_coco/ppyoloe_plus_m_fast_8xb8-80e_coco_20230104_193132-e4325ada.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_plus_m_fast_8xb8-80e_coco/ppyoloe_plus_m_fast_8xb8-80e_coco_20230104_193132.log.json) | +| PPYOLOE+ -l | P5 | 640 | 80 | Yes | 13.2 | 52.6 | [config](./ppyoloe_plus_l_fast_8xb8-80e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_plus_l_fast_8xb8-80e_coco/ppyoloe_plus_l_fast_8xb8-80e_coco_20230102_203825-1864e7b3.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_plus_l_fast_8xb8-80e_coco/ppyoloe_plus_l_fast_8xb8-80e_coco_20230102_203825.log.json) | +| PPYOLOE+ -x | P5 | 640 | 80 | Yes | 19.1 | 54.2 | [config](./ppyoloe_plus_x_fast_8xb8-80e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_plus_x_fast_8xb8-80e_coco/ppyoloe_plus_x_fast_8xb8-80e_coco_20230104_194921-8c953949.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_plus_x_fast_8xb8-80e_coco/ppyoloe_plus_x_fast_8xb8-80e_coco_20230104_194921.log.json) | + +**Note**: + +1. The above Box APs are all models with the best performance in COCO +2. The gap between the above performance and the official release is about 0.3. To speed up training in mmyolo, we use pytorch to implement the image resizing in `PPYOLOEBatchRandomResize` for multi-scale training, while official PPYOLOE use opencv. And `lanczos4` is not yet supported in `PPYOLOEBatchRandomResize`. The above two reasons lead to the gap. We will continue to experiment and address the gap in future releases. +3. The mAP of the non-Plus version needs more verification, and we will update more details of the non-Plus version in future versions. 
+ +```latex +@article{Xu2022PPYOLOEAE, + title={PP-YOLOE: An evolved version of YOLO}, + author={Shangliang Xu and Xinxin Wang and Wenyu Lv and Qinyao Chang and Cheng Cui and Kaipeng Deng and Guanzhong Wang and Qingqing Dang and Shengyun Wei and Yuning Du and Baohua Lai}, + journal={ArXiv}, + year={2022}, + volume={abs/2203.16250} +} +``` diff --git a/third_party/mmyolo/configs/ppyoloe/metafile.yml b/third_party/mmyolo/configs/ppyoloe/metafile.yml new file mode 100644 index 0000000000000000000000000000000000000000..5b7ed9487b60afecbd9db87f0ad89d9b3be8c93d --- /dev/null +++ b/third_party/mmyolo/configs/ppyoloe/metafile.yml @@ -0,0 +1,69 @@ +Collections: + - Name: PPYOLOE + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Nesterov + - Weight Decay + - Synchronize BN + Training Resources: 8x A100 GPUs + Architecture: + - PPYOLOECSPResNet + - PPYOLOECSPPAFPN + Paper: + URL: https://arxiv.org/abs/2203.16250 + Title: 'PP-YOLOE: An evolved version of YOLO' + README: configs/ppyoloe/README.md + Code: + URL: https://github.com/open-mmlab/mmyolo/blob/v0.0.1/mmyolo/models/detectors/yolo_detector.py#L12 + Version: v0.0.1 + +Models: + - Name: ppyoloe_plus_s_fast_8xb8-80e_coco + In Collection: PPYOLOE + Config: configs/ppyoloe/ppyoloe_plus_s_fast_8xb8-80e_coco.py + Metadata: + Training Memory (GB): 4.7 + Epochs: 80 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 43.5 + Weights: https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_plus_s_fast_8xb8-80e_coco/ppyoloe_plus_s_fast_8xb8-80e_coco_20230101_154052-9fee7619.pth + - Name: ppyoloe_plus_m_fast_8xb8-80e_coco + In Collection: PPYOLOE + Config: configs/ppyoloe/ppyoloe_plus_m_fast_8xb8-80e_coco.py + Metadata: + Training Memory (GB): 8.4 + Epochs: 80 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 49.5 + Weights: https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_plus_m_fast_8xb8-80e_coco/ppyoloe_plus_m_fast_8xb8-80e_coco_20230104_193132-e4325ada.pth + - Name: ppyoloe_plus_L_fast_8xb8-80e_coco + In Collection: PPYOLOE + Config: configs/ppyoloe/ppyoloe_plus_L_fast_8xb8-80e_coco.py + Metadata: + Training Memory (GB): 13.2 + Epochs: 80 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 52.6 + Weights: https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_plus_l_fast_8xb8-80e_coco/ppyoloe_plus_l_fast_8xb8-80e_coco_20230102_203825-1864e7b3.pth + - Name: ppyoloe_plus_x_fast_8xb8-80e_coco + In Collection: PPYOLOE + Config: configs/ppyoloe/ppyoloe_plus_x_fast_8xb8-80e_coco.py + Metadata: + Training Memory (GB): 19.1 + Epochs: 80 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 54.2 + Weights: https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_plus_x_fast_8xb8-80e_coco/ppyoloe_plus_x_fast_8xb8-80e_coco_20230104_194921-8c953949.pth diff --git a/third_party/mmyolo/configs/ppyoloe/ppyoloe_l_fast_8xb20-300e_coco.py b/third_party/mmyolo/configs/ppyoloe/ppyoloe_l_fast_8xb20-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..ef1b4eaae7240e07a5e8450f35b6f71f2271e09f --- /dev/null +++ b/third_party/mmyolo/configs/ppyoloe/ppyoloe_l_fast_8xb20-300e_coco.py @@ -0,0 +1,23 @@ +_base_ = './ppyoloe_s_fast_8xb32-300e_coco.py' + +# The pretrained model is geted and converted from official PPYOLOE. 
+# https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.5/configs/ppyoloe/README.md +checkpoint = 'https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_pretrain/cspresnet_l_imagenet1k_pretrained-c0010e6c.pth' # noqa + +deepen_factor = 1.0 +widen_factor = 1.0 + +train_batch_size_per_gpu = 20 + +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + init_cfg=dict(checkpoint=checkpoint)), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) + +train_dataloader = dict(batch_size=train_batch_size_per_gpu) diff --git a/third_party/mmyolo/configs/ppyoloe/ppyoloe_m_fast_8xb28-300e_coco.py b/third_party/mmyolo/configs/ppyoloe/ppyoloe_m_fast_8xb28-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..abcfd7833016164fbef84a70366b958f28ea6648 --- /dev/null +++ b/third_party/mmyolo/configs/ppyoloe/ppyoloe_m_fast_8xb28-300e_coco.py @@ -0,0 +1,23 @@ +_base_ = './ppyoloe_s_fast_8xb32-300e_coco.py' + +# The pretrained model is geted and converted from official PPYOLOE. +# https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.5/configs/ppyoloe/README.md +checkpoint = 'https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_pretrain/cspresnet_m_imagenet1k_pretrained-09f1eba2.pth' # noqa + +deepen_factor = 0.67 +widen_factor = 0.75 + +train_batch_size_per_gpu = 28 + +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + init_cfg=dict(checkpoint=checkpoint)), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) + +train_dataloader = dict(batch_size=train_batch_size_per_gpu) diff --git a/third_party/mmyolo/configs/ppyoloe/ppyoloe_plus_l_fast_8xb8-80e_coco.py b/third_party/mmyolo/configs/ppyoloe/ppyoloe_plus_l_fast_8xb8-80e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..9db53e26f4168e82b6cd760e1b8f41c0bebfae8f --- /dev/null +++ b/third_party/mmyolo/configs/ppyoloe/ppyoloe_plus_l_fast_8xb8-80e_coco.py @@ -0,0 +1,16 @@ +_base_ = './ppyoloe_plus_s_fast_8xb8-80e_coco.py' + +# The pretrained model is geted and converted from official PPYOLOE. +# https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.5/configs/ppyoloe/README.md +load_from = 'https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_pretrain/ppyoloe_plus_l_obj365_pretrained-3dd89562.pth' # noqa + +deepen_factor = 1.0 +widen_factor = 1.0 + +model = dict( + backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/third_party/mmyolo/configs/ppyoloe/ppyoloe_plus_m_fast_8xb8-80e_coco.py b/third_party/mmyolo/configs/ppyoloe/ppyoloe_plus_m_fast_8xb8-80e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..17cb33556f7ff111a4d702e6798abda1aaafeb01 --- /dev/null +++ b/third_party/mmyolo/configs/ppyoloe/ppyoloe_plus_m_fast_8xb8-80e_coco.py @@ -0,0 +1,16 @@ +_base_ = './ppyoloe_plus_s_fast_8xb8-80e_coco.py' + +# The pretrained model is geted and converted from official PPYOLOE. 
+# https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.5/configs/ppyoloe/README.md +load_from = 'https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_pretrain/ppyoloe_plus_m_ojb365_pretrained-03206892.pth' # noqa + +deepen_factor = 0.67 +widen_factor = 0.75 + +model = dict( + backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/third_party/mmyolo/configs/ppyoloe/ppyoloe_plus_s_fast_1xb12-40e_cat.py b/third_party/mmyolo/configs/ppyoloe/ppyoloe_plus_s_fast_1xb12-40e_cat.py new file mode 100644 index 0000000000000000000000000000000000000000..752ff63388cee00156dc729b68242eae68e4d052 --- /dev/null +++ b/third_party/mmyolo/configs/ppyoloe/ppyoloe_plus_s_fast_1xb12-40e_cat.py @@ -0,0 +1,56 @@ +# Compared to other same scale models, this configuration consumes too much +# GPU memory and is not validated for now +_base_ = 'ppyoloe_plus_s_fast_8xb8-80e_coco.py' + +data_root = './data/cat/' +class_name = ('cat', ) +num_classes = len(class_name) +metainfo = dict(classes=class_name, palette=[(20, 220, 60)]) + +num_last_epochs = 5 + +max_epochs = 40 +train_batch_size_per_gpu = 12 +train_num_workers = 2 + +load_from = 'https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_plus_s_fast_8xb8-80e_coco/ppyoloe_plus_s_fast_8xb8-80e_coco_20230101_154052-9fee7619.pth' # noqa + +model = dict( + backbone=dict(frozen_stages=4), + bbox_head=dict(head_module=dict(num_classes=num_classes)), + train_cfg=dict( + initial_assigner=dict(num_classes=num_classes), + assigner=dict(num_classes=num_classes))) + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + data_root=data_root, + metainfo=metainfo, + ann_file='annotations/trainval.json', + data_prefix=dict(img='images/'))) + +val_dataloader = dict( + dataset=dict( + metainfo=metainfo, + data_root=data_root, + ann_file='annotations/test.json', + data_prefix=dict(img='images/'))) + +test_dataloader = val_dataloader + +default_hooks = dict( + param_scheduler=dict( + warmup_min_iter=10, + warmup_epochs=3, + total_epochs=int(max_epochs * 1.2))) + +val_evaluator = dict(ann_file=data_root + 'annotations/test.json') +test_evaluator = val_evaluator + +default_hooks = dict( + checkpoint=dict(interval=10, max_keep_ckpts=2, save_best='auto'), + logger=dict(type='LoggerHook', interval=5)) +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +# visualizer = dict(vis_backends = [dict(type='LocalVisBackend'), dict(type='WandbVisBackend')]) # noqa diff --git a/third_party/mmyolo/configs/ppyoloe/ppyoloe_plus_s_fast_8xb8-80e_coco.py b/third_party/mmyolo/configs/ppyoloe/ppyoloe_plus_s_fast_8xb8-80e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..3d98252ccaec23c75b3e8aa3ddb095ee85010bd8 --- /dev/null +++ b/third_party/mmyolo/configs/ppyoloe/ppyoloe_plus_s_fast_8xb8-80e_coco.py @@ -0,0 +1,239 @@ +_base_ = ['../_base_/default_runtime.py', '../_base_/det_p5_tta.py'] + +# dataset settings +data_root = 'data/coco/' +dataset_type = 'YOLOv5CocoDataset' + +# parameters that often need to be modified +img_scale = (640, 640) # width, height +deepen_factor = 0.33 +widen_factor = 0.5 +max_epochs = 80 +num_classes = 80 +save_epoch_intervals = 5 +train_batch_size_per_gpu = 8 +train_num_workers = 8 +val_batch_size_per_gpu = 1 +val_num_workers = 2 + +# The pretrained model is geted and converted from official PPYOLOE. 
+# https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.5/configs/ppyoloe/README.md +load_from = 'https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_pretrain/ppyoloe_plus_s_obj365_pretrained-bcfe8478.pth' # noqa + +# persistent_workers must be False if num_workers is 0. +persistent_workers = True + +# Base learning rate for optim_wrapper +base_lr = 0.001 + +strides = [8, 16, 32] + +model = dict( + type='YOLODetector', + data_preprocessor=dict( + # use this to support multi_scale training + type='PPYOLOEDetDataPreprocessor', + pad_size_divisor=32, + batch_augments=[ + dict( + type='PPYOLOEBatchRandomResize', + random_size_range=(320, 800), + interval=1, + size_divisor=32, + random_interp=True, + keep_ratio=False) + ], + mean=[0., 0., 0.], + std=[255., 255., 255.], + bgr_to_rgb=True), + backbone=dict( + type='PPYOLOECSPResNet', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + block_cfg=dict( + type='PPYOLOEBasicBlock', shortcut=True, use_alpha=True), + norm_cfg=dict(type='BN', momentum=0.1, eps=1e-5), + act_cfg=dict(type='SiLU', inplace=True), + attention_cfg=dict( + type='EffectiveSELayer', act_cfg=dict(type='HSigmoid')), + use_large_stem=True), + neck=dict( + type='PPYOLOECSPPAFPN', + in_channels=[256, 512, 1024], + out_channels=[192, 384, 768], + deepen_factor=deepen_factor, + widen_factor=widen_factor, + num_csplayer=1, + num_blocks_per_layer=3, + block_cfg=dict( + type='PPYOLOEBasicBlock', shortcut=False, use_alpha=False), + norm_cfg=dict(type='BN', momentum=0.1, eps=1e-5), + act_cfg=dict(type='SiLU', inplace=True), + drop_block_cfg=None, + use_spp=True), + bbox_head=dict( + type='PPYOLOEHead', + head_module=dict( + type='PPYOLOEHeadModule', + num_classes=num_classes, + in_channels=[192, 384, 768], + widen_factor=widen_factor, + featmap_strides=strides, + reg_max=16, + norm_cfg=dict(type='BN', momentum=0.1, eps=1e-5), + act_cfg=dict(type='SiLU', inplace=True), + num_base_priors=1), + prior_generator=dict( + type='mmdet.MlvlPointGenerator', offset=0.5, strides=strides), + bbox_coder=dict(type='DistancePointBBoxCoder'), + loss_cls=dict( + type='mmdet.VarifocalLoss', + use_sigmoid=True, + alpha=0.75, + gamma=2.0, + iou_weighted=True, + reduction='sum', + loss_weight=1.0), + loss_bbox=dict( + type='IoULoss', + iou_mode='giou', + bbox_format='xyxy', + reduction='mean', + loss_weight=2.5, + return_iou=False), + # Since the dflloss is implemented differently in the official + # and mmdet, we're going to divide loss_weight by 4. 
+ loss_dfl=dict( + type='mmdet.DistributionFocalLoss', + reduction='mean', + loss_weight=0.5 / 4)), + train_cfg=dict( + initial_epoch=30, + initial_assigner=dict( + type='BatchATSSAssigner', + num_classes=num_classes, + topk=9, + iou_calculator=dict(type='mmdet.BboxOverlaps2D')), + assigner=dict( + type='BatchTaskAlignedAssigner', + num_classes=num_classes, + topk=13, + alpha=1, + beta=6, + eps=1e-9)), + test_cfg=dict( + multi_label=True, + nms_pre=1000, + score_thr=0.01, + nms=dict(type='nms', iou_threshold=0.7), + max_per_img=300)) + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='PPYOLOERandomDistort'), + dict(type='mmdet.Expand', mean=(103.53, 116.28, 123.675)), + dict(type='PPYOLOERandomCrop'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + persistent_workers=persistent_workers, + pin_memory=True, + sampler=dict(type='DefaultSampler', shuffle=True), + collate_fn=dict(type='yolov5_collate', use_ms_training=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=0), + pipeline=train_pipeline)) + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict( + type='mmdet.FixShapeResize', + width=img_scale[0], + height=img_scale[1], + keep_ratio=False, + interpolation='bicubic'), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +val_dataloader = dict( + batch_size=val_batch_size_per_gpu, + num_workers=val_num_workers, + persistent_workers=persistent_workers, + pin_memory=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + test_mode=True, + data_prefix=dict(img='val2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=0), + ann_file='annotations/instances_val2017.json', + pipeline=test_pipeline)) + +test_dataloader = val_dataloader + +param_scheduler = None +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict( + type='SGD', + lr=base_lr, + momentum=0.9, + weight_decay=5e-4, + nesterov=False), + paramwise_cfg=dict(norm_decay_mult=0.)) + +default_hooks = dict( + param_scheduler=dict( + type='PPYOLOEParamSchedulerHook', + warmup_min_iter=1000, + start_factor=0., + warmup_epochs=5, + min_lr_ratio=0.0, + total_epochs=int(max_epochs * 1.2)), + checkpoint=dict( + type='CheckpointHook', + interval=save_epoch_intervals, + save_best='auto', + max_keep_ckpts=3)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + strict_load=False, + priority=49) +] + +val_evaluator = dict( + type='mmdet.CocoMetric', + proposal_nums=(100, 1, 10), + ann_file=data_root + 'annotations/instances_val2017.json', + metric='bbox') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', + max_epochs=max_epochs, + val_interval=save_epoch_intervals) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') diff --git 
a/third_party/mmyolo/configs/ppyoloe/ppyoloe_plus_x_fast_8xb8-80e_coco.py b/third_party/mmyolo/configs/ppyoloe/ppyoloe_plus_x_fast_8xb8-80e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..b8e61120bee63c67da1ae31e492709381b365b47 --- /dev/null +++ b/third_party/mmyolo/configs/ppyoloe/ppyoloe_plus_x_fast_8xb8-80e_coco.py @@ -0,0 +1,16 @@ +_base_ = './ppyoloe_plus_s_fast_8xb8-80e_coco.py' + +# The pretrained model is geted and converted from official PPYOLOE. +# https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.5/configs/ppyoloe/README.md +load_from = 'https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_pretrain/ppyoloe_plus_x_obj365_pretrained-43a8000d.pth' # noqa + +deepen_factor = 1.33 +widen_factor = 1.25 + +model = dict( + backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/third_party/mmyolo/configs/ppyoloe/ppyoloe_s_fast_8xb32-300e_coco.py b/third_party/mmyolo/configs/ppyoloe/ppyoloe_s_fast_8xb32-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..622332899cd4f8589559ed3484fb5affb6a7963b --- /dev/null +++ b/third_party/mmyolo/configs/ppyoloe/ppyoloe_s_fast_8xb32-300e_coco.py @@ -0,0 +1,36 @@ +_base_ = './ppyoloe_plus_s_fast_8xb8-80e_coco.py' + +# The pretrained model is geted and converted from official PPYOLOE. +# https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.5/configs/ppyoloe/README.md +checkpoint = 'https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_pretrain/cspresnet_s_imagenet1k_pretrained-2be81763.pth' # noqa + +train_batch_size_per_gpu = 32 +max_epochs = 300 + +# Base learning rate for optim_wrapper +base_lr = 0.01 + +model = dict( + data_preprocessor=dict( + mean=[0.485 * 255, 0.456 * 255, 0.406 * 255], + std=[0.229 * 255., 0.224 * 255., 0.225 * 255.]), + backbone=dict( + block_cfg=dict(use_alpha=False), + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint=checkpoint, + map_location='cpu')), + train_cfg=dict(initial_epoch=100)) + +train_dataloader = dict(batch_size=train_batch_size_per_gpu) + +optim_wrapper = dict(optimizer=dict(lr=base_lr)) + +default_hooks = dict(param_scheduler=dict(total_epochs=int(max_epochs * 1.2))) + +train_cfg = dict(max_epochs=max_epochs) + +# PPYOLOE plus use obj365 pretrained model, but PPYOLOE not, +# `load_from` need to set to None. 
+load_from = None diff --git a/third_party/mmyolo/configs/ppyoloe/ppyoloe_s_fast_8xb32-400e_coco.py b/third_party/mmyolo/configs/ppyoloe/ppyoloe_s_fast_8xb32-400e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..bef9e9130d6194fceeb6471369941050110ace2d --- /dev/null +++ b/third_party/mmyolo/configs/ppyoloe/ppyoloe_s_fast_8xb32-400e_coco.py @@ -0,0 +1,9 @@ +_base_ = './ppyoloe_s_fast_8xb32-300e_coco.py' + +max_epochs = 400 + +model = dict(train_cfg=dict(initial_epoch=133)) + +default_hooks = dict(param_scheduler=dict(total_epochs=int(max_epochs * 1.2))) + +train_cfg = dict(max_epochs=max_epochs) diff --git a/third_party/mmyolo/configs/ppyoloe/ppyoloe_x_fast_8xb16-300e_coco.py b/third_party/mmyolo/configs/ppyoloe/ppyoloe_x_fast_8xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..fed594f0d08acf2fa64feffa419d0143d1036c55 --- /dev/null +++ b/third_party/mmyolo/configs/ppyoloe/ppyoloe_x_fast_8xb16-300e_coco.py @@ -0,0 +1,23 @@ +_base_ = './ppyoloe_s_fast_8xb32-300e_coco.py' + +# The pretrained model is geted and converted from official PPYOLOE. +# https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.5/configs/ppyoloe/README.md +checkpoint = 'https://download.openmmlab.com/mmyolo/v0/ppyoloe/ppyoloe_pretrain/cspresnet_x_imagenet1k_pretrained-81c33ccb.pth' # noqa + +deepen_factor = 1.33 +widen_factor = 1.25 + +train_batch_size_per_gpu = 16 + +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + init_cfg=dict(checkpoint=checkpoint)), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) + +train_dataloader = dict(batch_size=train_batch_size_per_gpu) diff --git a/third_party/mmyolo/configs/razor/subnets/README.md b/third_party/mmyolo/configs/razor/subnets/README.md new file mode 100644 index 0000000000000000000000000000000000000000..456021bdd32036a31ca9863194dd74a174fcdd76 --- /dev/null +++ b/third_party/mmyolo/configs/razor/subnets/README.md @@ -0,0 +1,79 @@ +# Projecs Based on MMRazor + +There are many research works and pre-trained models built on MMRazor. We list some of them as examples of how to use MMRazor slimmable models for downstream frameworks. As the page might not be completed, please feel free to contribute more efficient mmrazor-models to update this page. + +## Description + +This is an implementation of MMRazor Searchable Backbone Application, we provide detection configs and models for MMRazor in MMYOLO. + +### Backbone support + +Here are the Neural Architecture Search(NAS) Models that come from MMRazor which support YOLO Series. If you are looking for MMRazor models only for Backbone, you could refer to MMRazor [ModelZoo](https://github.com/open-mmlab/mmrazor/blob/dev-1.x/docs/en/get_started/model_zoo.md) and corresponding repository. + +- [x] [AttentiveMobileNetV3](https://github.com/open-mmlab/mmrazor/blob/dev-1.x/configs/_base_/nas_backbones/attentive_mobilenetv3_supernet.py) +- [x] [SearchableShuffleNetV2](https://github.com/open-mmlab/mmrazor/blob/dev-1.x/configs/_base_/nas_backbones/spos_shufflenet_supernet.py) +- [x] [SearchableMobileNetV2](https://github.com/open-mmlab/mmrazor/blob/dev-1.x/configs/_base_/nas_backbones/spos_mobilenet_supernet.py) + +## Usage + +### Prerequisites + +- [MMRazor v1.0.0rc2](https://github.com/open-mmlab/mmrazor/tree/v1.0.0rc2) or higher (dev-1.x) + +Install MMRazor using MIM. 
+ +```shell +mim install mmengine +mim install "mmrazor>=1.0.0rc2" +``` + +Install MMRazor from source + +``` +git clone -b dev-1.x https://github.com/open-mmlab/mmrazor.git +cd mmrazor +# Install MMRazor +mim install -v -e . +``` + +### Training commands + +In MMYOLO's root directory, if you want to use single GPU for training, run the following command to train the model: + +```bash +CUDA_VISIBLE_DEVICES=0 PORT=29500 ./tools/dist_train.sh configs/razor/subnets/yolov5_s_spos_shufflenetv2_syncbn_8xb16-300e_coco.py +``` + +If you want to use several of these GPUs to train in parallel, you can use the following command: + +```bash +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 PORT=29500 ./tools/dist_train.sh configs/razor/subnets/yolov5_s_spos_shufflenetv2_syncbn_8xb16-300e_coco.py +``` + +### Testing commands + +In MMYOLO's root directory, run the following command to test the model: + +```bash +CUDA_VISIBLE_DEVICES=0 PORT=29500 ./tools/dist_test.sh configs/razor/subnets/yolov5_s_spos_shufflenetv2_syncbn_8xb16-300e_coco.py ${CHECKPOINT_PATH} +``` + +## Results and Models + +Here we provide the baseline version of YOLO Series with NAS backbone. + +| Model | size | box AP | Params(M) | FLOPs(G) | Config | Download | +| :------------------------: | :--: | :----: | :----------: | :------: | :---------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| yolov5-s | 640 | 37.7 | 7.235 | 8.265 | [config](../../yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700.log.json) | +| yolov5_s_spos_shufflenetv2 | 640 | 38.0 | 7.04(-2.7%) | 7.03 | [config](./yolov5_s_spos_shufflenetv2_syncbn_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmrazor/v1/yolo_nas_backbone/yolov5_s_spos_shufflenetv2_syncbn_8xb16-300e_coco_20230211_220635-578be9a9.pth) \| log | +| yolov6-s | 640 | 44.0 | 18.869 | 24.253 | [config](../../yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco/yolov6_s_syncbn_fast_8xb32-400e_coco_20221102_203035-932e1d91.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco/yolov6_s_syncbn_fast_8xb32-400e_coco_20221102_203035.log.json) | +| yolov6_l_attentivenas_a6 | 640 | 45.3 | 18.38(-2.6%) | 8.49 | [config](./yolov6_l_attentivenas_a6_d12_syncbn_fast_8xb32-300e_coco.py) | [model](https://download.openmmlab.com/mmrazor/v1/yolo_nas_backbone/yolov6_l_attentivenas_a6_d12_syncbn_fast_8xb32-300e_coco_20230211_222409-dcc72668.pth) \| log | +| RTMDet-tiny | 640 | 41.0 | 4.8 | 8.1 | [config](../../rtmdet/rtmdet_l_syncbn_fast_8xb32-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_tiny_syncbn_fast_8xb32-300e_coco/rtmdet_tiny_syncbn_fast_8xb32-300e_coco_20230102_140117-dbb1dc83.pth) \| 
[log](https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_tiny_syncbn_fast_8xb32-300e_coco/rtmdet_tiny_syncbn_fast_8xb32-300e_coco_20230102_140117.log.json) | +| rtmdet_tiny_ofa_lat31 | 960 | 41.3 | 3.91(-18.5%) | 6.09 | [config](./rtmdet_tiny_ofa_lat31_syncbn_16xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmrazor/v1/yolo_nas_backbone/rtmdet_tiny_ofa_lat31_syncbn_16xb16-300e_coco_20230214_210623-449bb2a0.pth) \| log | + +**Note**: + +1. For fair comparison, the training configuration is consistent with the original configuration and results in an improvement of about 0.2-0.5% AP. +2. `yolov5_s_spos_shufflenetv2` achieves 38.0% AP with only 7.042M parameters, directly instead of the backbone, and outperforms `yolov5_s` with a similar size by more than 0.3% AP. +3. With the efficient backbone of `yolov6_l_attentivenas_a6`, the input channels of `YOLOv6RepPAFPN` are reduced. Meanwhile, modify the **deepen_factor** and the neck is made deeper to restore the AP. +4. with the `rtmdet_tiny_ofa_lat31` backbone with only 3.315M parameters and 3.634G flops, we can modify the input resolution to 960, with a similar model size compared to `rtmdet_tiny` and exceeds `rtmdet_tiny` by 0.4% AP, reducing the size of the whole model to 3.91 MB. diff --git a/third_party/mmyolo/configs/razor/subnets/rtmdet_tiny_ofa_lat31_syncbn_16xb16-300e_coco.py b/third_party/mmyolo/configs/razor/subnets/rtmdet_tiny_ofa_lat31_syncbn_16xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..2f9da6685ef0ef920ceb137a165dfb8adcd36254 --- /dev/null +++ b/third_party/mmyolo/configs/razor/subnets/rtmdet_tiny_ofa_lat31_syncbn_16xb16-300e_coco.py @@ -0,0 +1,124 @@ +_base_ = [ + 'mmrazor::_base_/nas_backbones/ofa_mobilenetv3_supernet.py', + '../../rtmdet/rtmdet_s_syncbn_fast_8xb32-300e_coco.py' +] + +checkpoint_file = 'https://download.openmmlab.com/mmrazor/v1/ofa/ofa_mobilenet_subnet_8xb256_in1k_note8_lat%4031ms_top1%4072.8_finetune%4025.py_20221214_0939-981a8b2a.pth' # noqa +fix_subnet = 'https://download.openmmlab.com/mmrazor/v1/yolo_nas_backbone/OFA_SUBNET_NOTE8_LAT31.yaml' # noqa +deepen_factor = 0.167 +widen_factor = 1.0 +channels = [40, 112, 160] +train_batch_size_per_gpu = 16 +img_scale = (960, 960) + +_base_.nas_backbone.out_indices = (2, 4, 5) +_base_.nas_backbone.conv_cfg = dict(type='mmrazor.OFAConv2d') +_base_.nas_backbone.init_cfg = dict( + type='Pretrained', + checkpoint=checkpoint_file, + prefix='architecture.backbone.') +nas_backbone = dict( + type='mmrazor.sub_model', + fix_subnet=fix_subnet, + cfg=_base_.nas_backbone, + extra_prefix='backbone.') + +_base_.model.backbone = nas_backbone +_base_.model.neck.widen_factor = widen_factor +_base_.model.neck.deepen_factor = deepen_factor +_base_.model.neck.in_channels = channels +_base_.model.neck.out_channels = channels[0] +_base_.model.bbox_head.head_module.in_channels = channels[0] +_base_.model.bbox_head.head_module.feat_channels = channels[0] +_base_.model.bbox_head.head_module.widen_factor = widen_factor + +_base_.model.test_cfg = dict( + multi_label=True, + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=100) + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Mosaic', + img_scale=img_scale, + use_cached=True, + max_cached_images=20, + random_pop=False, + pad_val=114.0), + dict( + type='mmdet.RandomResize', + scale=(1280, 1280), + ratio_range=(0.5, 2.0), # note 
+ resize_type='mmdet.Resize', + keep_ratio=True), + dict(type='mmdet.RandomCrop', crop_size=img_scale), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict(type='mmdet.Pad', size=img_scale, pad_val=dict(img=(114, 114, 114))), + dict( + type='YOLOXMixUp', + img_scale=(960, 960), + ratio_range=(1.0, 1.0), + max_cached_images=10, + use_cached=True, + random_pop=False, + pad_val=(114, 114, 114), + prob=0.5), + dict(type='mmdet.PackDetInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='mmdet.RandomResize', + scale=img_scale, + ratio_range=(0.5, 2.0), # note + resize_type='mmdet.Resize', + keep_ratio=True), + dict(type='mmdet.RandomCrop', crop_size=img_scale), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict(type='mmdet.Pad', size=img_scale, pad_val=dict(img=(114, 114, 114))), + dict(type='mmdet.PackDetInputs') +] + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, dataset=dict(pipeline=train_pipeline)) + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='mmdet.Resize', scale=(960, 960), keep_ratio=True), + dict(type='mmdet.Pad', size=(960, 960), pad_val=dict(img=(114, 114, 114))), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +val_dataloader = dict( + dataset=dict(pipeline=test_pipeline, batch_shapes_cfg=None)) + +test_dataloader = val_dataloader + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + strict_load=False, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=_base_.max_epochs - _base_.num_epochs_stage2, + switch_pipeline=train_pipeline_stage2) +] + +find_unused_parameters = True diff --git a/third_party/mmyolo/configs/razor/subnets/yolov5_s_spos_shufflenetv2_syncbn_8xb16-300e_coco.py b/third_party/mmyolo/configs/razor/subnets/yolov5_s_spos_shufflenetv2_syncbn_8xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..beb4941cfa482ec52e83abc67df70d9734fa3d3a --- /dev/null +++ b/third_party/mmyolo/configs/razor/subnets/yolov5_s_spos_shufflenetv2_syncbn_8xb16-300e_coco.py @@ -0,0 +1,29 @@ +_base_ = [ + 'mmrazor::_base_/nas_backbones/spos_shufflenet_supernet.py', + '../../yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py' +] + +checkpoint_file = 'https://download.openmmlab.com/mmrazor/v1/spos/spos_shufflenetv2_subnet_8xb128_in1k_flops_0.33M_acc_73.87_20211222-1f0a0b4d_v3.pth' # noqa +fix_subnet = 'https://download.openmmlab.com/mmrazor/v1/spos/spos_shufflenetv2_subnet_8xb128_in1k_flops_0.33M_acc_73.87_20211222-1f0a0b4d_subnet_cfg_v3.yaml' # noqa +widen_factor = 1.0 +channels = [160, 320, 640] + +_base_.nas_backbone.out_indices = (1, 2, 3) +_base_.nas_backbone.init_cfg = dict( + type='Pretrained', + checkpoint=checkpoint_file, + prefix='architecture.backbone.') +nas_backbone = dict( + type='mmrazor.sub_model', + fix_subnet=fix_subnet, + cfg=_base_.nas_backbone, + extra_prefix='architecture.backbone.') + +_base_.model.backbone = nas_backbone +_base_.model.neck.widen_factor = widen_factor +_base_.model.neck.in_channels = channels +_base_.model.neck.out_channels = channels +_base_.model.bbox_head.head_module.in_channels = channels +_base_.model.bbox_head.head_module.widen_factor 
= widen_factor + +find_unused_parameters = True diff --git a/third_party/mmyolo/configs/razor/subnets/yolov6_l_attentivenas_a6_d12_syncbn_fast_8xb32-300e_coco.py b/third_party/mmyolo/configs/razor/subnets/yolov6_l_attentivenas_a6_d12_syncbn_fast_8xb32-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..0ab64a6460b3fbb29cc1a47a1bd1a2456bb11ac3 --- /dev/null +++ b/third_party/mmyolo/configs/razor/subnets/yolov6_l_attentivenas_a6_d12_syncbn_fast_8xb32-300e_coco.py @@ -0,0 +1,35 @@ +_base_ = [ + 'mmrazor::_base_/nas_backbones/attentive_mobilenetv3_supernet.py', + '../../yolov6/yolov6_l_syncbn_fast_8xb32-300e_coco.py' +] + +checkpoint_file = 'https://download.openmmlab.com/mmrazor/v1/bignas/attentive_mobilenet_subnet_8xb256_in1k_flops-0.93G_acc-80.81_20221229_200440-73d92cc6.pth' # noqa +fix_subnet = 'https://download.openmmlab.com/mmrazor/v1/bignas/ATTENTIVE_SUBNET_A6.yaml' # noqa +deepen_factor = 1.2 +widen_factor = 1 +channels = [40, 128, 224] +mid_channels = [40, 128, 224] + +_base_.train_dataloader.batch_size = 16 +_base_.nas_backbone.out_indices = (2, 4, 6) +_base_.nas_backbone.conv_cfg = dict(type='mmrazor.BigNasConv2d') +_base_.nas_backbone.norm_cfg = dict(type='mmrazor.DynamicBatchNorm2d') +_base_.nas_backbone.init_cfg = dict( + type='Pretrained', + checkpoint=checkpoint_file, + prefix='architecture.backbone.') +nas_backbone = dict( + type='mmrazor.sub_model', + fix_subnet=fix_subnet, + cfg=_base_.nas_backbone, + extra_prefix='backbone.') + +_base_.model.backbone = nas_backbone +_base_.model.neck.widen_factor = widen_factor +_base_.model.neck.deepen_factor = deepen_factor +_base_.model.neck.in_channels = channels +_base_.model.neck.out_channels = mid_channels +_base_.model.bbox_head.head_module.in_channels = mid_channels +_base_.model.bbox_head.head_module.widen_factor = widen_factor + +find_unused_parameters = True diff --git a/third_party/mmyolo/configs/rtmdet/README.md b/third_party/mmyolo/configs/rtmdet/README.md new file mode 100644 index 0000000000000000000000000000000000000000..94e86546a34c3d70da4b51d81ff46e8ee7d5f242 --- /dev/null +++ b/third_party/mmyolo/configs/rtmdet/README.md @@ -0,0 +1,83 @@ +# RTMDet: An Empirical Study of Designing Real-Time Object Detectors + +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/rtmdet-an-empirical-study-of-designing-real/real-time-instance-segmentation-on-mscoco)](https://paperswithcode.com/sota/real-time-instance-segmentation-on-mscoco?p=rtmdet-an-empirical-study-of-designing-real) +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/rtmdet-an-empirical-study-of-designing-real/object-detection-in-aerial-images-on-dota-1)](https://paperswithcode.com/sota/object-detection-in-aerial-images-on-dota-1?p=rtmdet-an-empirical-study-of-designing-real) +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/rtmdet-an-empirical-study-of-designing-real/object-detection-in-aerial-images-on-hrsc2016)](https://paperswithcode.com/sota/object-detection-in-aerial-images-on-hrsc2016?p=rtmdet-an-empirical-study-of-designing-real) + + + +## Abstract + +In this paper, we aim to design an efficient real-time object detector that exceeds the YOLO series and is easily extensible for many object recognition tasks such as instance segmentation and rotated object detection. 
To obtain a more efficient model architecture, we explore an architecture that has compatible capacities in the backbone and neck, constructed by a basic building block that consists of large-kernel depth-wise convolutions. We further introduce soft labels when calculating matching costs in the dynamic label assignment to improve accuracy. Together with better training techniques, the resulting object detector, named RTMDet, achieves 52.8% AP on COCO with 300+ FPS on an NVIDIA 3090 GPU, outperforming the current mainstream industrial detectors. RTMDet achieves the best parameter-accuracy trade-off with tiny/small/medium/large/extra-large model sizes for various application scenarios, and obtains new state-of-the-art performance on real-time instance segmentation and rotated object detection. We hope the experimental results can provide new insights into designing versatile real-time object detectors for many object recognition tasks. + +
+ +
+ +
+ +RTMDet-l model structure +
+ +## Results and Models + +### Object Detection + +| Model | size | Params(M) | FLOPs(G) | TRT-FP16-Latency(ms) | box AP | TTA box AP | Config | Download | +| :------------: | :--: | :-------: | :------: | :------------------: | :---------: | :---------: | :---------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| RTMDet-tiny | 640 | 4.8 | 8.1 | 0.98 | 41.0 | 42.7 | [config](./rtmdet_tiny_syncbn_fast_8xb32-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_tiny_syncbn_fast_8xb32-300e_coco/rtmdet_tiny_syncbn_fast_8xb32-300e_coco_20230102_140117-dbb1dc83.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_tiny_syncbn_fast_8xb32-300e_coco/rtmdet_tiny_syncbn_fast_8xb32-300e_coco_20230102_140117.log.json) | +| RTMDet-tiny \* | 640 | 4.8 | 8.1 | 0.98 | 41.8 (+0.8) | 43.2 (+0.5) | [config](./distillation/kd_tiny_rtmdet_s_neck_300e_coco.py) | [model](https://download.openmmlab.com/mmrazor/v1/rtmdet_distillation/kd_tiny_rtmdet_s_neck_300e_coco/kd_tiny_rtmdet_s_neck_300e_coco_20230213_104240-e1e4197c.pth) \| [log](https://download.openmmlab.com/mmrazor/v1/rtmdet_distillation/kd_tiny_rtmdet_s_neck_300e_coco/kd_tiny_rtmdet_s_neck_300e_coco_20230213_104240-176901d8.json) | +| RTMDet-s | 640 | 8.89 | 14.8 | 1.22 | 44.6 | 45.8 | [config](./rtmdet_s_syncbn_fast_8xb32-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_s_syncbn_fast_8xb32-300e_coco/rtmdet_s_syncbn_fast_8xb32-300e_coco_20221230_182329-0a8c901a.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_s_syncbn_fast_8xb32-300e_coco/rtmdet_s_syncbn_fast_8xb32-300e_coco_20221230_182329.log.json) | +| RTMDet-s \* | 640 | 8.89 | 14.8 | 1.22 | 45.7 (+1.1) | 47.3 (+1.5) | [config](./distillation/kd_s_rtmdet_m_neck_300e_coco.py) | [model](https://download.openmmlab.com/mmrazor/v1/rtmdet_distillation/kd_s_rtmdet_m_neck_300e_coco/kd_s_rtmdet_m_neck_300e_coco_20230220_140647-446ff003.pth) \| [log](https://download.openmmlab.com/mmrazor/v1/rtmdet_distillation/kd_s_rtmdet_m_neck_300e_coco/kd_s_rtmdet_m_neck_300e_coco_20230220_140647-89862269.json) | +| RTMDet-m | 640 | 24.71 | 39.27 | 1.62 | 49.3 | 50.9 | [config](./rtmdet_m_syncbn_fast_8xb32-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_m_syncbn_fast_8xb32-300e_coco/rtmdet_m_syncbn_fast_8xb32-300e_coco_20230102_135952-40af4fe8.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_m_syncbn_fast_8xb32-300e_coco/rtmdet_m_syncbn_fast_8xb32-300e_coco_20230102_135952.log.json) | +| RTMDet-m \* | 640 | 24.71 | 39.27 | 1.62 | 50.2 (+0.9) | 51.9 (+1.0) | [config](./distillation/kd_m_rtmdet_l_neck_300e_coco.py) | [model](https://download.openmmlab.com/mmrazor/v1/rtmdet_distillation/kd_m_rtmdet_l_neck_300e_coco/kd_m_rtmdet_l_neck_300e_coco_20230220_141313-b806f503.pth) \| [log](https://download.openmmlab.com/mmrazor/v1/rtmdet_distillation/kd_m_rtmdet_l_neck_300e_coco/kd_m_rtmdet_l_neck_300e_coco_20230220_141313-bd028fd3.json) | +| RTMDet-l | 640 | 52.3 | 80.23 | 2.44 | 51.4 | 53.1 | [config](./rtmdet_l_syncbn_fast_8xb32-300e_coco.py) | 
[model](https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_l_syncbn_fast_8xb32-300e_coco/rtmdet_l_syncbn_fast_8xb32-300e_coco_20230102_135928-ee3abdc4.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_l_syncbn_fast_8xb32-300e_coco/rtmdet_l_syncbn_fast_8xb32-300e_coco_20230102_135928.log.json) |
+| RTMDet-l \* | 640 | 52.3 | 80.23 | 2.44 | 52.3 (+0.9) | 53.7 (+0.6) | [config](./distillation/kd_l_rtmdet_x_neck_300e_coco.py) | [model](https://download.openmmlab.com/mmrazor/v1/rtmdet_distillation/kd_l_rtmdet_x_neck_300e_coco/kd_l_rtmdet_x_neck_300e_coco_20230220_141912-c9979722.pth) \| [log](https://download.openmmlab.com/mmrazor/v1/rtmdet_distillation/kd_l_rtmdet_x_neck_300e_coco/kd_l_rtmdet_x_neck_300e_coco_20230220_141912-c5c4e17b.json) |
+| RTMDet-x | 640 | 94.86 | 141.67 | 3.10 | 52.8 | 54.2 | [config](./rtmdet_x_syncbn_fast_8xb32-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_x_syncbn_fast_8xb32-300e_coco/rtmdet_x_syncbn_fast_8xb32-300e_coco_20221231_100345-b85cd476.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_x_syncbn_fast_8xb32-300e_coco/rtmdet_x_syncbn_fast_8xb32-300e_coco_20221231_100345.log.json) |
+
+**Note**:
+
+1. The inference speed of RTMDet is measured on an NVIDIA 3090 GPU with TensorRT 8.4.3, cuDNN 8.2.0, FP16, batch size=1, and without NMS.
+2. For a fair comparison, the bbox post-processing config was changed to be consistent with YOLOv5/6/7 after [PR#9494](https://github.com/open-mmlab/mmdetection/pull/9494), which improves AP by about 0.1~0.3%.
+3. `TTA` means Test Time Augmentation. It performs 3 multi-scale transformations on the image, followed by 2 flipping transformations (flipping and not flipping). You only need to specify `--tta` when testing to enable it. See [TTA](https://github.com/open-mmlab/mmyolo/blob/dev/docs/en/common_usage/tta.md) for details.
+4. \* means checkpoints are trained with knowledge distillation. More details can be found in [RTMDet distillation](./distillation).
+
+### Rotated Object Detection
+
+RTMDet-R achieves state-of-the-art performance on various remote sensing datasets.
+ +| Backbone | pretrain | Epoch | Batch Size | Aug | mmAP | mAP50 | mAP75 | Mem (GB) | Params(M) | FLOPS(G) | TRT-FP16-Latency(ms) | Config | Download | +| :---------: | :------: | :---: | :--------: | :-------------: | :---: | :---: | :---: | :------: | :-------: | :------: | :------------------: | :--------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| RTMDet-tiny | IN | 36 | 1xb8 | RR | 46.94 | 75.07 | 50.11 | 12.7 | 4.88 | 20.45 | 4.40 | [config](./rotated/rtmdet-r_tiny_fast_1xb8-36e_dota.py) | [model](https://download.openmmlab.com/mmyolo/v0/rtmdet/rotated/rtmdet-r_tiny_fast_1xb8-36e_dota/rtmdet-r_tiny_fast_1xb8-36e_dota_20230228_162210-e8ccfb1c.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/rtmdet/rotated/rtmdet-r_tiny_fast_1xb8-36e_dota/rtmdet-r_tiny_fast_1xb8-36e_dota_20230228_162210.log.json) | +| RTMDet-s | IN | 36 | 1xb8 | RR | 48.99 | 77.33 | 52.65 | 16.6 | 8.86 | 37.62 | 4.86 | [config](./rotated/rtmdet-r_s_fast_1xb8-36e_dota.py) | [model](https://download.openmmlab.com/mmyolo/v0/rtmdet/rotated/rtmdet-r_s_fast_1xb8-36e_dota/rtmdet-r_s_fast_1xb8-36e_dota_20230224_110307-3946a5aa.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/rtmdet/rotated/rtmdet-r_s_fast_1xb8-36e_dota/rtmdet-r_s_fast_1xb8-36e_dota_20230224_110307.log.json) | +| RTMDet-m | IN | 36 | 2xb4 | RR | 50.38 | 78.43 | 54.28 | 10.9 | 24.67 | 99.76 | 7.82 | [config](./rotated/rtmdet-r_m_syncbn_fast_2xb4-36e_dota.py) | [model](https://download.openmmlab.com/mmyolo/v0/rtmdet/rotated/rtmdet-r_m_syncbn_fast_2xb4-36e_dota/rtmdet-r_m_syncbn_fast_2xb4-36e_dota_20230224_124237-29ae1619.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/rtmdet/rotated/rtmdet-r_m_syncbn_fast_2xb4-36e_dota/rtmdet-r_m_syncbn_fast_2xb4-36e_dota_20230224_124237.log.json) | +| RTMDet-l | IN | 36 | 2xb4 | RR | 50.61 | 78.66 | 54.95 | 16.1 | 52.27 | 204.21 | 10.82 | [config](./rotated/rtmdet-r_l_syncbn_fast_2xb4-36e_dota.py) | [model](https://download.openmmlab.com/mmyolo/v0/rtmdet/rotated/rtmdet-r_l_syncbn_fast_2xb4-36e_dota/rtmdet-r_l_syncbn_fast_2xb4-36e_dota_20230224_124544-38bc5f08.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/rtmdet/rotated/rtmdet-r_l_syncbn_fast_2xb4-36e_dota/rtmdet-r_l_syncbn_fast_2xb4-36e_dota_20230224_124544.log.json) | +| RTMDet-tiny | IN | 36 | 1xb8 | MS+RR | - | - | - | | 4.88 | 20.45 | 4.40 | [config](./rotated/rtmdet-r_tiny_fast_1xb8-36e_dota-ms.py) | \| | +| RTMDet-s | IN | 36 | 1xb8 | MS+RR | - | - | - | | 8.86 | 37.62 | 4.86 | [config](./rotated/rtmdet-r_s_fast_1xb8-36e_dota-ms.py) | \| | +| RTMDet-m | IN | 36 | 2xb4 | MS+RR | - | - | - | | 24.67 | 99.76 | 7.82 | [config](./rotated/rtmdet-r_m_syncbn_fast_2xb4-36e_dota-ms.py) | \| | +| RTMDet-l | IN | 36 | 2xb4 | MS+RR | - | - | - | | 52.27 | 204.21 | 10.82 | [config](./rotated/rtmdet-r_l_syncbn_fast_2xb4-36e_dota-ms.py) | \| | +| RTMDet-l | COCO | 36 | 2xb4 | MS+RR | - | - | - | | 52.27 | 204.21 | 10.82 | [config](./rotated/rtmdet-r_l_syncbn_fast_coco-pretrain_2xb4-36e_dota-ms.py) | \| | +| RTMDet-l | IN | 100 | 2xb4 | Mixup+Mosaic+RR | 55.05 | 80.14 | 61.32 | 19.6 | 52.27 | 204.21 | 10.82 | [config](./rotated/rtmdet-r_l_syncbn_fast_2xb4-aug-100e_dota.py) | 
[model](https://download.openmmlab.com/mmyolo/v0/rtmdet/rotated/rtmdet-r_l_syncbn_fast_2xb4-aug-100e_dota/rtmdet-r_l_syncbn_fast_2xb4-aug-100e_dota_20230224_124735-ed4ea966.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/rtmdet/rotated/rtmdet-r_l_syncbn_fast_2xb4-aug-100e_dota/rtmdet-r_l_syncbn_fast_2xb4-aug-100e_dota_20230224_124735.log.json) |
+
+**Note**:
+
+1. Please follow the [Rotated Object Detection](../../docs/zh_cn/tutorials/rotated_detection.md) doc to get started with rotated detection.
+2. We follow the latest metrics from the DOTA evaluation server; the original VOC-format mAP is now mAP50.
+3. All models are trained with image size 1024\*1024.
+4. `IN` means ImageNet pretrain, `COCO` means COCO pretrain.
+5. For Aug, RR means `RandomRotate` and MS means multi-scale augmentation during data preparation.
+6. The inference speed here is measured on an NVIDIA 2080Ti GPU with TensorRT 8.4.3, cuDNN 8.2.0, FP16, batch size=1, and with NMS.
+7. Currently, the training process of RTMDet-R tiny is unstable and may have a 1% accuracy fluctuation; we will continue to investigate why.
+
+## Citation
+
+```latex
+@misc{lyu2022rtmdet,
+  title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors},
+  author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen},
+  year={2022},
+  eprint={2212.07784},
+  archivePrefix={arXiv},
+  primaryClass={cs.CV}
+}
+```
diff --git a/third_party/mmyolo/configs/rtmdet/cspnext_imagenet_pretrain/README.md b/third_party/mmyolo/configs/rtmdet/cspnext_imagenet_pretrain/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..2db5a50ec5ed0d3b499ca7d3c83bc4963c95af3f
--- /dev/null
+++ b/third_party/mmyolo/configs/rtmdet/cspnext_imagenet_pretrain/README.md
@@ -0,0 +1,53 @@
+# CSPNeXt ImageNet Pre-training
+
+In this folder, we provide the ImageNet pre-training config of RTMDet's backbone CSPNeXt.
+
+## Requirements
+
+To train with these configs, please install [MMClassification 1.x](https://github.com/open-mmlab/mmclassification/tree/1.x) first.
+
+Install by MIM:
+
+```shell
+mim install mmcls>=1.0.0rc0
+```
+
+or install by pip:
+
+```shell
+pip install mmcls>=1.0.0rc0
+```
+
+## Prepare Dataset
+
+To pre-train on ImageNet, you need to prepare the dataset first. Please refer to the [guide](https://mmclassification.readthedocs.io/en/1.x/user_guides/dataset_prepare.html#imagenet).
+
+## How to Train
+
+You can use the classification config in the same way as the detection config.
+
+For single-GPU training, run:
+
+```shell
+python tools/train.py \
+    ${CONFIG_FILE} \
+    [optional arguments]
+```
+
+For multi-GPU training, run:
+
+```shell
+bash ./tools/dist_train.sh \
+    ${CONFIG_FILE} \
+    ${GPU_NUM} \
+    [optional arguments]
+```
+
+More details can be found in [user guides](https://mmdetection.readthedocs.io/en/3.x/user_guides/train.html).
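+
+As a concrete example, assuming the configs in this folder and launching from the MMYOLO root, pre-training CSPNeXt-tiny on 8 GPUs could look like this (adjust the config path and GPU number to your setup):
+
+```shell
+bash ./tools/dist_train.sh \
+    configs/rtmdet/cspnext_imagenet_pretrain/cspnext-tiny_8xb256-rsb-a1-600e_in1k.py \
+    8
+```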
+ +## Results and Models + +| Model | resolution | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Download | +| :----------: | :--------: | :-------: | :------: | :-------: | :-------: | :-----------------------------------------------------------------------------------------------------------------: | +| CSPNeXt-tiny | 224x224 | 2.73 | 0.339 | 69.44 | 89.45 | [model](https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-tiny_imagenet_600e.pth) | +| CSPNeXt-s | 224x224 | 4.89 | 0.664 | 74.41 | 92.23 | [model](https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-s_imagenet_600e.pth) | diff --git a/third_party/mmyolo/configs/rtmdet/cspnext_imagenet_pretrain/cspnext-s_8xb256-rsb-a1-600e_in1k.py b/third_party/mmyolo/configs/rtmdet/cspnext_imagenet_pretrain/cspnext-s_8xb256-rsb-a1-600e_in1k.py new file mode 100644 index 0000000000000000000000000000000000000000..4281f9cd7d260f22d7b0e8d18d2c4f56866ad840 --- /dev/null +++ b/third_party/mmyolo/configs/rtmdet/cspnext_imagenet_pretrain/cspnext-s_8xb256-rsb-a1-600e_in1k.py @@ -0,0 +1,67 @@ +_base_ = [ + 'mmcls::_base_/datasets/imagenet_bs256_rsb_a12.py', + 'mmcls::_base_/schedules/imagenet_bs2048_rsb.py', + 'mmcls::_base_/default_runtime.py' +] + +custom_imports = dict( + imports=['mmdet.models', 'mmyolo.models'], allow_failed_imports=False) + +model = dict( + type='ImageClassifier', + backbone=dict( + type='mmyolo.CSPNeXt', + arch='P5', + out_indices=(4, ), + expand_ratio=0.5, + deepen_factor=0.33, + widen_factor=0.5, + channel_attention=True, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='mmyolo.SiLU')), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=512, + loss=dict( + type='LabelSmoothLoss', + label_smooth_val=0.1, + mode='original', + loss_weight=1.0), + topk=(1, 5)), + train_cfg=dict(augments=[ + dict(type='Mixup', alpha=0.2, num_classes=1000), + dict(type='CutMix', alpha=1.0, num_classes=1000) + ])) + +# dataset settings +train_dataloader = dict(sampler=dict(type='RepeatAugSampler', shuffle=True)) + +# schedule settings +optim_wrapper = dict( + optimizer=dict(weight_decay=0.01), + paramwise_cfg=dict(bias_decay_mult=0., norm_decay_mult=0.), +) + +param_scheduler = [ + # warm up learning rate scheduler + dict( + type='LinearLR', + start_factor=0.0001, + by_epoch=True, + begin=0, + end=5, + # update by iter + convert_to_iter_based=True), + # main learning rate scheduler + dict( + type='CosineAnnealingLR', + T_max=595, + eta_min=1.0e-6, + by_epoch=True, + begin=5, + end=600) +] + +train_cfg = dict(by_epoch=True, max_epochs=600) diff --git a/third_party/mmyolo/configs/rtmdet/cspnext_imagenet_pretrain/cspnext-tiny_8xb256-rsb-a1-600e_in1k.py b/third_party/mmyolo/configs/rtmdet/cspnext_imagenet_pretrain/cspnext-tiny_8xb256-rsb-a1-600e_in1k.py new file mode 100644 index 0000000000000000000000000000000000000000..af3170bdc51778c4601d4426aa88cc27c608f100 --- /dev/null +++ b/third_party/mmyolo/configs/rtmdet/cspnext_imagenet_pretrain/cspnext-tiny_8xb256-rsb-a1-600e_in1k.py @@ -0,0 +1,5 @@ +_base_ = './cspnext-s_8xb256-rsb-a1-600e_in1k.py' + +model = dict( + backbone=dict(deepen_factor=0.167, widen_factor=0.375), + head=dict(in_channels=384)) diff --git a/third_party/mmyolo/configs/rtmdet/distillation/README.md b/third_party/mmyolo/configs/rtmdet/distillation/README.md new file mode 100644 index 0000000000000000000000000000000000000000..452a46cb9904a1782c0fee9cd7d469c0749caadb --- /dev/null +++ 
b/third_party/mmyolo/configs/rtmdet/distillation/README.md
@@ -0,0 +1,146 @@
+# Distill RTM Detectors Based on MMRazor
+
+## Description
+
+To further improve the model accuracy while not introducing much additional
+computation cost, we apply feature-based distillation to the training phase
+of these RTM detectors. In summary, our distillation strategy is threefold:
+
+(1) Inspired by [PKD](https://arxiv.org/abs/2207.02039), we first normalize
+the intermediate feature maps to have zero mean and unit variance before calculating
+the distillation loss.
+
+(2) Inspired by [CWD](https://arxiv.org/abs/2011.13256), we adopt the channel-wise
+distillation paradigm, which pays more attention to the most salient regions
+of each channel.
+
+(3) Inspired by [DAMO-YOLO](https://arxiv.org/abs/2211.15444), the distillation
+process is split into two stages. 1) The teacher distills the student during the
+first stage (280 epochs) on the strong mosaic domain. 2) The student fine-tunes itself
+on the no-mosaic domain during the second stage (20 epochs).
+
+## Results and Models
+
+| Location | Dataset | Teacher | Student | mAP | mAP(T) | mAP(S) | Config | Download |
+| :------: | :-----: | :-----: | :-----: | :-: | :----: | :----: | :----: | :------: |
+| FPN | COCO | [RTMDet-s](https://github.com/open-mmlab/mmyolo/blob/main/configs/rtmdet/rtmdet_s_syncbn_fast_8xb32-300e_coco.py) | [RTMDet-tiny](https://github.com/open-mmlab/mmyolo/blob/main/configs/rtmdet/rtmdet_tiny_syncbn_fast_8xb32-300e_coco.py) | 41.8 (+0.8) | 44.6 | 41.0 | [config](kd_tiny_rtmdet_s_neck_300e_coco.py) | [teacher](https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_s_syncbn_fast_8xb32-300e_coco/rtmdet_s_syncbn_fast_8xb32-300e_coco_20221230_182329-0a8c901a.pth) \| [model](https://download.openmmlab.com/mmrazor/v1/rtmdet_distillation/kd_tiny_rtmdet_s_neck_300e_coco/kd_tiny_rtmdet_s_neck_300e_coco_20230213_104240-e1e4197c.pth) \| [log](https://download.openmmlab.com/mmrazor/v1/rtmdet_distillation/kd_tiny_rtmdet_s_neck_300e_coco/kd_tiny_rtmdet_s_neck_300e_coco_20230213_104240-176901d8.json) |
+| FPN | COCO | [RTMDet-m](https://github.com/open-mmlab/mmyolo/blob/main/configs/rtmdet/rtmdet_m_syncbn_fast_8xb32-300e_coco.py) | [RTMDet-s](https://github.com/open-mmlab/mmyolo/blob/main/configs/rtmdet/rtmdet_s_syncbn_fast_8xb32-300e_coco.py) | 45.7 (+1.1) | 49.3 | 44.6 | [config](kd_s_rtmdet_m_neck_300e_coco.py) | [teacher](https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_m_syncbn_fast_8xb32-300e_coco/rtmdet_m_syncbn_fast_8xb32-300e_coco_20230102_135952-40af4fe8.pth) \| [model](https://download.openmmlab.com/mmrazor/v1/rtmdet_distillation/kd_s_rtmdet_m_neck_300e_coco/kd_s_rtmdet_m_neck_300e_coco_20230220_140647-446ff003.pth) \|
[log](https://download.openmmlab.com/mmrazor/v1/rtmdet_distillation/kd_s_rtmdet_m_neck_300e_coco/kd_s_rtmdet_m_neck_300e_coco_20230220_140647-89862269.json) |
+| FPN | COCO | [RTMDet-l](https://github.com/open-mmlab/mmyolo/blob/main/configs/rtmdet/rtmdet_l_syncbn_fast_8xb32-300e_coco.py) | [RTMDet-m](https://github.com/open-mmlab/mmyolo/blob/main/configs/rtmdet/rtmdet_m_syncbn_fast_8xb32-300e_coco.py) | 50.2 (+0.9) | 51.4 | 49.3 | [config](kd_m_rtmdet_l_neck_300e_coco.py) | [teacher](https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_l_syncbn_fast_8xb32-300e_coco/rtmdet_l_syncbn_fast_8xb32-300e_coco_20230102_135928-ee3abdc4.pth) \| [model](https://download.openmmlab.com/mmrazor/v1/rtmdet_distillation/kd_m_rtmdet_l_neck_300e_coco/kd_m_rtmdet_l_neck_300e_coco_20230220_141313-b806f503.pth) \| [log](https://download.openmmlab.com/mmrazor/v1/rtmdet_distillation/kd_m_rtmdet_l_neck_300e_coco/kd_m_rtmdet_l_neck_300e_coco_20230220_141313-bd028fd3.json) |
+| FPN | COCO | [RTMDet-x](https://github.com/open-mmlab/mmyolo/blob/main/configs/rtmdet/rtmdet_x_syncbn_fast_8xb32-300e_coco.py) | [RTMDet-l](https://github.com/open-mmlab/mmyolo/blob/main/configs/rtmdet/rtmdet_l_syncbn_fast_8xb32-300e_coco.py) | 52.3 (+0.9) | 52.8 | 51.4 | [config](kd_l_rtmdet_x_neck_300e_coco.py) | [teacher](https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_x_syncbn_fast_8xb32-300e_coco/rtmdet_x_syncbn_fast_8xb32-300e_coco_20221231_100345-b85cd476.pth) \| [model](https://download.openmmlab.com/mmrazor/v1/rtmdet_distillation/kd_l_rtmdet_x_neck_300e_coco/kd_l_rtmdet_x_neck_300e_coco_20230220_141912-c9979722.pth) \| [log](https://download.openmmlab.com/mmrazor/v1/rtmdet_distillation/kd_l_rtmdet_x_neck_300e_coco/kd_l_rtmdet_x_neck_300e_coco_20230220_141912-c5c4e17b.json) |
+
+## Usage
+
+### Prerequisites
+
+- [MMRazor dev-1.x](https://github.com/open-mmlab/mmrazor/tree/dev-1.x)
+
+Install MMRazor from source:
+
+```shell
+git clone -b dev-1.x https://github.com/open-mmlab/mmrazor.git
+cd mmrazor
+# Install MMRazor
+mim install -v -e .
+```
+
+### Training commands
+
+In MMYOLO's root directory, run the following command to train RTMDet-tiny
+with 8 GPUs, using RTMDet-s as the teacher:
+
+```bash
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 PORT=29500 ./tools/dist_train.sh configs/rtmdet/distillation/kd_tiny_rtmdet_s_neck_300e_coco.py 8
+```
+
+### Testing commands
+
+In MMYOLO's root directory, run the following command to test the model on 1 GPU:
+
+```bash
+CUDA_VISIBLE_DEVICES=0 PORT=29500 ./tools/dist_test.sh configs/rtmdet/distillation/kd_tiny_rtmdet_s_neck_300e_coco.py ${CHECKPOINT_PATH} 1
+```
+
+### Getting student-only checkpoint
+
+After training, the checkpoint contains parameters for both the student and teacher models.
+Run the following command to convert it to a student-only checkpoint:
+
+```bash
+python ./tools/model_converters/convert_kd_ckpt_to_student.py ${CHECKPOINT_PATH} --out-path ${OUTPUT_CHECKPOINT_PATH}
+```
+
+## Configs
+
+Here we provide detection configs and models for MMRazor in MMYOLO. For clarity,
+we take `./kd_tiny_rtmdet_s_neck_300e_coco.py` as an example to show how to
+distill an RTM detector based on MMRazor.
+
+Here is the main part of `./kd_tiny_rtmdet_s_neck_300e_coco.py`:
+
+```python
+norm_cfg = dict(type='BN', affine=False, track_running_stats=False)
+
+distiller=dict(
+    type='ConfigurableDistiller',
+    student_recorders=dict(
+        fpn0=dict(type='ModuleOutputs', source='neck.out_layers.0.conv'),
+        fpn1=dict(type='ModuleOutputs', source='neck.out_layers.1.conv'),
+        fpn2=dict(type='ModuleOutputs', source='neck.out_layers.2.conv'),
+    ),
+    teacher_recorders=dict(
+        fpn0=dict(type='ModuleOutputs', source='neck.out_layers.0.conv'),
+        fpn1=dict(type='ModuleOutputs', source='neck.out_layers.1.conv'),
+        fpn2=dict(type='ModuleOutputs', source='neck.out_layers.2.conv')),
+    connectors=dict(
+        fpn0_s=dict(type='ConvModuleConnector', in_channel=96,
+                    out_channel=128, bias=False, norm_cfg=norm_cfg,
+                    act_cfg=None),
+        fpn0_t=dict(type='NormConnector', in_channels=128, norm_cfg=norm_cfg),
+        fpn1_s=dict(type='ConvModuleConnector', in_channel=96,
+                    out_channel=128, bias=False, norm_cfg=norm_cfg,
+                    act_cfg=None),
+        fpn1_t=dict(type='NormConnector', in_channels=128, norm_cfg=norm_cfg),
+        fpn2_s=dict(type='ConvModuleConnector', in_channel=96,
+                    out_channel=128, bias=False, norm_cfg=norm_cfg,
+                    act_cfg=None),
+        fpn2_t=dict(type='NormConnector', in_channels=128, norm_cfg=norm_cfg)),
+    distill_losses=dict(
+        loss_fpn0=dict(type='ChannelWiseDivergence', loss_weight=1),
+        loss_fpn1=dict(type='ChannelWiseDivergence', loss_weight=1),
+        loss_fpn2=dict(type='ChannelWiseDivergence', loss_weight=1)),
+    loss_forward_mappings=dict(
+        loss_fpn0=dict(
+            preds_S=dict(from_student=True, recorder='fpn0', connector='fpn0_s'),
+            preds_T=dict(from_student=False, recorder='fpn0', connector='fpn0_t')),
+        loss_fpn1=dict(
+            preds_S=dict(from_student=True, recorder='fpn1', connector='fpn1_s'),
+            preds_T=dict(from_student=False, recorder='fpn1', connector='fpn1_t')),
+        loss_fpn2=dict(
+            preds_S=dict(from_student=True, recorder='fpn2', connector='fpn2_s'),
+            preds_T=dict(from_student=False, recorder='fpn2', connector='fpn2_t'))))
+
+```
+
+`recorders` are used to record various intermediate results during the model forward.
+In this example, they can help record the outputs of 3 `nn.Module`s of the teacher
+and the student. Details are listed in [Recorder](https://github.com/open-mmlab/mmrazor/blob/dev-1.x/docs/en/advanced_guides/recorder.md) and [MMRazor Distillation](https://zhuanlan.zhihu.com/p/596582609) (in Chinese).
+
+`connectors` are adaptive layers which usually map the teacher's and student's features
+to the same dimension.
+
+`distill_losses` are configs for multiple distill losses.
+
+`loss_forward_mappings` are mappings between distill loss forward arguments and records.
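+
+To give an intuition for what the `ChannelWiseDivergence` losses above compute, here is a rough, self-contained PyTorch sketch of channel-wise distillation combined with a PKD-style normalization step. The function names, temperature and reduction are assumptions for illustration; this is not the MMRazor implementation.
+
+```python
+# Sketch only: PKD-style feature normalization + channel-wise KL divergence (CWD).
+import torch
+import torch.nn.functional as F
+
+
+def normalize_feat(feat: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
+    """Normalize each channel map to zero mean and unit variance (PKD-style)."""
+    n, c, h, w = feat.shape
+    feat = feat.reshape(n, c, -1)
+    mean = feat.mean(dim=-1, keepdim=True)
+    std = feat.std(dim=-1, keepdim=True)
+    return ((feat - mean) / (std + eps)).reshape(n, c, h, w)
+
+
+def channel_wise_divergence(pred_s: torch.Tensor,
+                            pred_t: torch.Tensor,
+                            tau: float = 1.0,
+                            loss_weight: float = 1.0) -> torch.Tensor:
+    """KL divergence between the per-channel spatial distributions of T and S."""
+    n, c, h, w = pred_s.shape
+    log_p_s = F.log_softmax(pred_s.reshape(n * c, h * w) / tau, dim=-1)
+    p_t = F.softmax(pred_t.reshape(n * c, h * w) / tau, dim=-1)
+    # sum over spatial positions, then average over (batch x channels)
+    loss = F.kl_div(log_p_s, p_t, reduction='none').sum(-1).mean()
+    return loss_weight * (tau**2) * loss
+
+
+student_fpn = torch.randn(2, 128, 20, 20)  # e.g. student feature after its connector
+teacher_fpn = torch.randn(2, 128, 20, 20)  # e.g. teacher feature of the same level
+print(channel_wise_divergence(normalize_feat(student_fpn),
+                              normalize_feat(teacher_fpn)))
+```
+
+In the config above, the normalization role is played by the BN-based `NormConnector` (note `affine=False` in `norm_cfg`), while the student features are first projected by `ConvModuleConnector` so that both sides have the same number of channels before the loss is applied.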
+
+In addition, the student fine-tunes itself on the no-mosaic domain for the last 20 epochs,
+so we add a new hook named `StopDistillHook` to stop distillation at the right time.
+We need to add this hook to the `custom_hooks` list like this:
+
+```python
+custom_hooks = [..., dict(type='mmrazor.StopDistillHook', stop_epoch=280)]
+```
diff --git a/third_party/mmyolo/configs/rtmdet/distillation/kd_l_rtmdet_x_neck_300e_coco.py b/third_party/mmyolo/configs/rtmdet/distillation/kd_l_rtmdet_x_neck_300e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..2bab26a0d20342c38d7d1ec0a8221fdc426f016b
--- /dev/null
+++ b/third_party/mmyolo/configs/rtmdet/distillation/kd_l_rtmdet_x_neck_300e_coco.py
@@ -0,0 +1,99 @@
+_base_ = '../rtmdet_l_syncbn_fast_8xb32-300e_coco.py'
+
+teacher_ckpt = 'https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_x_syncbn_fast_8xb32-300e_coco/rtmdet_x_syncbn_fast_8xb32-300e_coco_20221231_100345-b85cd476.pth'  # noqa: E501
+
+norm_cfg = dict(type='BN', affine=False, track_running_stats=False)
+
+model = dict(
+    _delete_=True,
+    _scope_='mmrazor',
+    type='FpnTeacherDistill',
+    architecture=dict(
+        cfg_path='mmyolo::rtmdet/rtmdet_l_syncbn_fast_8xb32-300e_coco.py'),
+    teacher=dict(
+        cfg_path='mmyolo::rtmdet/rtmdet_x_syncbn_fast_8xb32-300e_coco.py'),
+    teacher_ckpt=teacher_ckpt,
+    distiller=dict(
+        type='ConfigurableDistiller',
+        # `recorders` are used to record various intermediate results during
+        # the model forward.
+        student_recorders=dict(
+            fpn0=dict(type='ModuleOutputs', source='neck.out_layers.0.conv'),
+            fpn1=dict(type='ModuleOutputs', source='neck.out_layers.1.conv'),
+            fpn2=dict(type='ModuleOutputs', source='neck.out_layers.2.conv'),
+        ),
+        teacher_recorders=dict(
+            fpn0=dict(type='ModuleOutputs', source='neck.out_layers.0.conv'),
+            fpn1=dict(type='ModuleOutputs', source='neck.out_layers.1.conv'),
+            fpn2=dict(type='ModuleOutputs', source='neck.out_layers.2.conv')),
+        # `connectors` are adaptive layers which usually map teacher's and
+        # students features to the same dimension.
+        connectors=dict(
+            fpn0_s=dict(
+                type='ConvModuleConnector',
+                in_channel=256,
+                out_channel=320,
+                bias=False,
+                norm_cfg=norm_cfg,
+                act_cfg=None),
+            fpn0_t=dict(
+                type='NormConnector', in_channels=320, norm_cfg=norm_cfg),
+            fpn1_s=dict(
+                type='ConvModuleConnector',
+                in_channel=256,
+                out_channel=320,
+                bias=False,
+                norm_cfg=norm_cfg,
+                act_cfg=None),
+            fpn1_t=dict(
+                type='NormConnector', in_channels=320, norm_cfg=norm_cfg),
+            fpn2_s=dict(
+                type='ConvModuleConnector',
+                in_channel=256,
+                out_channel=320,
+                bias=False,
+                norm_cfg=norm_cfg,
+                act_cfg=None),
+            fpn2_t=dict(
+                type='NormConnector', in_channels=320, norm_cfg=norm_cfg)),
+        distill_losses=dict(
+            loss_fpn0=dict(type='ChannelWiseDivergence', loss_weight=1),
+            loss_fpn1=dict(type='ChannelWiseDivergence', loss_weight=1),
+            loss_fpn2=dict(type='ChannelWiseDivergence', loss_weight=1)),
+        # `loss_forward_mappings` are mappings between distill loss forward
+        # arguments and records.
+ loss_forward_mappings=dict( + loss_fpn0=dict( + preds_S=dict( + from_student=True, recorder='fpn0', connector='fpn0_s'), + preds_T=dict( + from_student=False, recorder='fpn0', connector='fpn0_t')), + loss_fpn1=dict( + preds_S=dict( + from_student=True, recorder='fpn1', connector='fpn1_s'), + preds_T=dict( + from_student=False, recorder='fpn1', connector='fpn1_t')), + loss_fpn2=dict( + preds_S=dict( + from_student=True, recorder='fpn2', connector='fpn2_s'), + preds_T=dict( + from_student=False, recorder='fpn2', + connector='fpn2_t'))))) + +find_unused_parameters = True + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + strict_load=False, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=_base_.max_epochs - _base_.num_epochs_stage2, + switch_pipeline=_base_.train_pipeline_stage2), + # stop distillation after the 280th epoch + dict(type='mmrazor.StopDistillHook', stop_epoch=280) +] diff --git a/third_party/mmyolo/configs/rtmdet/distillation/kd_m_rtmdet_l_neck_300e_coco.py b/third_party/mmyolo/configs/rtmdet/distillation/kd_m_rtmdet_l_neck_300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..f7d7f9211f1f77c4d83677f7f6c485a5c6212252 --- /dev/null +++ b/third_party/mmyolo/configs/rtmdet/distillation/kd_m_rtmdet_l_neck_300e_coco.py @@ -0,0 +1,99 @@ +_base_ = '../rtmdet_m_syncbn_fast_8xb32-300e_coco.py' + +teacher_ckpt = 'https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_l_syncbn_fast_8xb32-300e_coco/rtmdet_l_syncbn_fast_8xb32-300e_coco_20230102_135928-ee3abdc4.pth' # noqa: E501 + +norm_cfg = dict(type='BN', affine=False, track_running_stats=False) + +model = dict( + _delete_=True, + _scope_='mmrazor', + type='FpnTeacherDistill', + architecture=dict( + cfg_path='mmyolo::rtmdet/rtmdet_m_syncbn_fast_8xb32-300e_coco.py'), + teacher=dict( + cfg_path='mmyolo::rtmdet/rtmdet_l_syncbn_fast_8xb32-300e_coco.py'), + teacher_ckpt=teacher_ckpt, + distiller=dict( + type='ConfigurableDistiller', + # `recorders` are used to record various intermediate results during + # the model forward. + student_recorders=dict( + fpn0=dict(type='ModuleOutputs', source='neck.out_layers.0.conv'), + fpn1=dict(type='ModuleOutputs', source='neck.out_layers.1.conv'), + fpn2=dict(type='ModuleOutputs', source='neck.out_layers.2.conv'), + ), + teacher_recorders=dict( + fpn0=dict(type='ModuleOutputs', source='neck.out_layers.0.conv'), + fpn1=dict(type='ModuleOutputs', source='neck.out_layers.1.conv'), + fpn2=dict(type='ModuleOutputs', source='neck.out_layers.2.conv')), + # `connectors` are adaptive layers which usually map teacher's and + # students features to the same dimension. 
+ connectors=dict( + fpn0_s=dict( + type='ConvModuleConnector', + in_channel=192, + out_channel=256, + bias=False, + norm_cfg=norm_cfg, + act_cfg=None), + fpn0_t=dict( + type='NormConnector', in_channels=256, norm_cfg=norm_cfg), + fpn1_s=dict( + type='ConvModuleConnector', + in_channel=192, + out_channel=256, + bias=False, + norm_cfg=norm_cfg, + act_cfg=None), + fpn1_t=dict( + type='NormConnector', in_channels=256, norm_cfg=norm_cfg), + fpn2_s=dict( + type='ConvModuleConnector', + in_channel=192, + out_channel=256, + bias=False, + norm_cfg=norm_cfg, + act_cfg=None), + fpn2_t=dict( + type='NormConnector', in_channels=256, norm_cfg=norm_cfg)), + distill_losses=dict( + loss_fpn0=dict(type='ChannelWiseDivergence', loss_weight=1), + loss_fpn1=dict(type='ChannelWiseDivergence', loss_weight=1), + loss_fpn2=dict(type='ChannelWiseDivergence', loss_weight=1)), + # `loss_forward_mappings` are mappings between distill loss forward + # arguments and records. + loss_forward_mappings=dict( + loss_fpn0=dict( + preds_S=dict( + from_student=True, recorder='fpn0', connector='fpn0_s'), + preds_T=dict( + from_student=False, recorder='fpn0', connector='fpn0_t')), + loss_fpn1=dict( + preds_S=dict( + from_student=True, recorder='fpn1', connector='fpn1_s'), + preds_T=dict( + from_student=False, recorder='fpn1', connector='fpn1_t')), + loss_fpn2=dict( + preds_S=dict( + from_student=True, recorder='fpn2', connector='fpn2_s'), + preds_T=dict( + from_student=False, recorder='fpn2', + connector='fpn2_t'))))) + +find_unused_parameters = True + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + strict_load=False, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=_base_.max_epochs - _base_.num_epochs_stage2, + switch_pipeline=_base_.train_pipeline_stage2), + # stop distillation after the 280th epoch + dict(type='mmrazor.StopDistillHook', stop_epoch=280) +] diff --git a/third_party/mmyolo/configs/rtmdet/distillation/kd_s_rtmdet_m_neck_300e_coco.py b/third_party/mmyolo/configs/rtmdet/distillation/kd_s_rtmdet_m_neck_300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..99b5dc5e48d04fed927cbd80c1538ca99912fc1b --- /dev/null +++ b/third_party/mmyolo/configs/rtmdet/distillation/kd_s_rtmdet_m_neck_300e_coco.py @@ -0,0 +1,99 @@ +_base_ = '../rtmdet_s_syncbn_fast_8xb32-300e_coco.py' + +teacher_ckpt = 'https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_m_syncbn_fast_8xb32-300e_coco/rtmdet_m_syncbn_fast_8xb32-300e_coco_20230102_135952-40af4fe8.pth' # noqa: E501 + +norm_cfg = dict(type='BN', affine=False, track_running_stats=False) + +model = dict( + _delete_=True, + _scope_='mmrazor', + type='FpnTeacherDistill', + architecture=dict( + cfg_path='mmyolo::rtmdet/rtmdet_s_syncbn_fast_8xb32-300e_coco.py'), + teacher=dict( + cfg_path='mmyolo::rtmdet/rtmdet_m_syncbn_fast_8xb32-300e_coco.py'), + teacher_ckpt=teacher_ckpt, + distiller=dict( + type='ConfigurableDistiller', + # `recorders` are used to record various intermediate results during + # the model forward. 
+ student_recorders=dict( + fpn0=dict(type='ModuleOutputs', source='neck.out_layers.0.conv'), + fpn1=dict(type='ModuleOutputs', source='neck.out_layers.1.conv'), + fpn2=dict(type='ModuleOutputs', source='neck.out_layers.2.conv'), + ), + teacher_recorders=dict( + fpn0=dict(type='ModuleOutputs', source='neck.out_layers.0.conv'), + fpn1=dict(type='ModuleOutputs', source='neck.out_layers.1.conv'), + fpn2=dict(type='ModuleOutputs', source='neck.out_layers.2.conv')), + # `connectors` are adaptive layers which usually map teacher's and + # students features to the same dimension. + connectors=dict( + fpn0_s=dict( + type='ConvModuleConnector', + in_channel=128, + out_channel=192, + bias=False, + norm_cfg=norm_cfg, + act_cfg=None), + fpn0_t=dict( + type='NormConnector', in_channels=192, norm_cfg=norm_cfg), + fpn1_s=dict( + type='ConvModuleConnector', + in_channel=128, + out_channel=192, + bias=False, + norm_cfg=norm_cfg, + act_cfg=None), + fpn1_t=dict( + type='NormConnector', in_channels=192, norm_cfg=norm_cfg), + fpn2_s=dict( + type='ConvModuleConnector', + in_channel=128, + out_channel=192, + bias=False, + norm_cfg=norm_cfg, + act_cfg=None), + fpn2_t=dict( + type='NormConnector', in_channels=192, norm_cfg=norm_cfg)), + distill_losses=dict( + loss_fpn0=dict(type='ChannelWiseDivergence', loss_weight=1), + loss_fpn1=dict(type='ChannelWiseDivergence', loss_weight=1), + loss_fpn2=dict(type='ChannelWiseDivergence', loss_weight=1)), + # `loss_forward_mappings` are mappings between distill loss forward + # arguments and records. + loss_forward_mappings=dict( + loss_fpn0=dict( + preds_S=dict( + from_student=True, recorder='fpn0', connector='fpn0_s'), + preds_T=dict( + from_student=False, recorder='fpn0', connector='fpn0_t')), + loss_fpn1=dict( + preds_S=dict( + from_student=True, recorder='fpn1', connector='fpn1_s'), + preds_T=dict( + from_student=False, recorder='fpn1', connector='fpn1_t')), + loss_fpn2=dict( + preds_S=dict( + from_student=True, recorder='fpn2', connector='fpn2_s'), + preds_T=dict( + from_student=False, recorder='fpn2', + connector='fpn2_t'))))) + +find_unused_parameters = True + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + strict_load=False, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=_base_.max_epochs - _base_.num_epochs_stage2, + switch_pipeline=_base_.train_pipeline_stage2), + # stop distillation after the 280th epoch + dict(type='mmrazor.StopDistillHook', stop_epoch=280) +] diff --git a/third_party/mmyolo/configs/rtmdet/distillation/kd_tiny_rtmdet_s_neck_300e_coco.py b/third_party/mmyolo/configs/rtmdet/distillation/kd_tiny_rtmdet_s_neck_300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..50c23580bf6b7c1a120267a65bc7cc334513c475 --- /dev/null +++ b/third_party/mmyolo/configs/rtmdet/distillation/kd_tiny_rtmdet_s_neck_300e_coco.py @@ -0,0 +1,99 @@ +_base_ = '../rtmdet_tiny_syncbn_fast_8xb32-300e_coco.py' + +teacher_ckpt = 'https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_s_syncbn_fast_8xb32-300e_coco/rtmdet_s_syncbn_fast_8xb32-300e_coco_20221230_182329-0a8c901a.pth' # noqa: E501 + +norm_cfg = dict(type='BN', affine=False, track_running_stats=False) + +model = dict( + _delete_=True, + _scope_='mmrazor', + type='FpnTeacherDistill', + architecture=dict( + cfg_path='mmyolo::rtmdet/rtmdet_tiny_syncbn_fast_8xb32-300e_coco.py'), + teacher=dict( + cfg_path='mmyolo::rtmdet/rtmdet_s_syncbn_fast_8xb32-300e_coco.py'), + teacher_ckpt=teacher_ckpt, + 
distiller=dict( + type='ConfigurableDistiller', + # `recorders` are used to record various intermediate results during + # the model forward. + student_recorders=dict( + fpn0=dict(type='ModuleOutputs', source='neck.out_layers.0.conv'), + fpn1=dict(type='ModuleOutputs', source='neck.out_layers.1.conv'), + fpn2=dict(type='ModuleOutputs', source='neck.out_layers.2.conv'), + ), + teacher_recorders=dict( + fpn0=dict(type='ModuleOutputs', source='neck.out_layers.0.conv'), + fpn1=dict(type='ModuleOutputs', source='neck.out_layers.1.conv'), + fpn2=dict(type='ModuleOutputs', source='neck.out_layers.2.conv')), + # `connectors` are adaptive layers which usually map teacher's and + # students features to the same dimension. + connectors=dict( + fpn0_s=dict( + type='ConvModuleConnector', + in_channel=96, + out_channel=128, + bias=False, + norm_cfg=norm_cfg, + act_cfg=None), + fpn0_t=dict( + type='NormConnector', in_channels=128, norm_cfg=norm_cfg), + fpn1_s=dict( + type='ConvModuleConnector', + in_channel=96, + out_channel=128, + bias=False, + norm_cfg=norm_cfg, + act_cfg=None), + fpn1_t=dict( + type='NormConnector', in_channels=128, norm_cfg=norm_cfg), + fpn2_s=dict( + type='ConvModuleConnector', + in_channel=96, + out_channel=128, + bias=False, + norm_cfg=norm_cfg, + act_cfg=None), + fpn2_t=dict( + type='NormConnector', in_channels=128, norm_cfg=norm_cfg)), + distill_losses=dict( + loss_fpn0=dict(type='ChannelWiseDivergence', loss_weight=1), + loss_fpn1=dict(type='ChannelWiseDivergence', loss_weight=1), + loss_fpn2=dict(type='ChannelWiseDivergence', loss_weight=1)), + # `loss_forward_mappings` are mappings between distill loss forward + # arguments and records. + loss_forward_mappings=dict( + loss_fpn0=dict( + preds_S=dict( + from_student=True, recorder='fpn0', connector='fpn0_s'), + preds_T=dict( + from_student=False, recorder='fpn0', connector='fpn0_t')), + loss_fpn1=dict( + preds_S=dict( + from_student=True, recorder='fpn1', connector='fpn1_s'), + preds_T=dict( + from_student=False, recorder='fpn1', connector='fpn1_t')), + loss_fpn2=dict( + preds_S=dict( + from_student=True, recorder='fpn2', connector='fpn2_s'), + preds_T=dict( + from_student=False, recorder='fpn2', + connector='fpn2_t'))))) + +find_unused_parameters = True + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + strict_load=False, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=_base_.max_epochs - _base_.num_epochs_stage2, + switch_pipeline=_base_.train_pipeline_stage2), + # stop distillation after the 280th epoch + dict(type='mmrazor.StopDistillHook', stop_epoch=280) +] diff --git a/third_party/mmyolo/configs/rtmdet/metafile.yml b/third_party/mmyolo/configs/rtmdet/metafile.yml new file mode 100644 index 0000000000000000000000000000000000000000..704a44ba83c90d1c639d4bcbabf88b72fa867553 --- /dev/null +++ b/third_party/mmyolo/configs/rtmdet/metafile.yml @@ -0,0 +1,215 @@ +Collections: + - Name: RTMDet + Metadata: + Training Data: COCO + Training Techniques: + - AdamW + - Flat Cosine Annealing + Training Resources: 8x A100 GPUs + Architecture: + - CSPNeXt + - CSPNeXtPAFPN + README: configs/rtmdet/README.md + Code: + URL: https://github.com/open-mmlab/mmyolo/blob/main/mmyolo/models/detectors/yolo_detector.py#L12 + Version: v0.1.1 + - Name: Rotated_RTMDet + Metadata: + Training Data: DOTAv1.0 + Training Techniques: + - AdamW + - Flat Cosine Annealing + Training Resources: 1x A100 GPUs + Architecture: + - CSPNeXt + - CSPNeXtPAFPN + README: 
configs/rtmdet/README.md + Code: + URL: https://github.com/open-mmlab/mmyolo/blob/main/mmyolo/models/detectors/yolo_detector.py#L12 + Version: v0.1.1 + +Models: + - Name: rtmdet_tiny_syncbn_fast_8xb32-300e_coco + In Collection: RTMDet + Config: configs/rtmdet/rtmdet_tiny_syncbn_fast_8xb32-300e_coco.py + Metadata: + Training Memory (GB): 11.7 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.0 + Weights: https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_tiny_syncbn_fast_8xb32-300e_coco/rtmdet_tiny_syncbn_fast_8xb32-300e_coco_20230102_140117-dbb1dc83.pth + + - Name: kd_tiny_rtmdet_s_neck_300e_coco + In Collection: RTMDet + Config: configs/rtmdet/distillation/kd_tiny_rtmdet_s_neck_300e_coco.py + Metadata: + Training Memory (GB): 11.9 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.8 + Weights: https://download.openmmlab.com/mmrazor/v1/rtmdet_distillation/kd_tiny_rtmdet_s_neck_300e_coco/kd_tiny_rtmdet_s_neck_300e_coco_20230213_104240-e1e4197c.pth + + - Name: rtmdet_s_syncbn_fast_8xb32-300e_coco + In Collection: RTMDet + Config: configs/rtmdet/rtmdet_s_syncbn_fast_8xb32-300e_coco.py + Metadata: + Training Memory (GB): 15.9 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.6 + Weights: https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_s_syncbn_fast_8xb32-300e_coco/rtmdet_s_syncbn_fast_8xb32-300e_coco_20221230_182329-0a8c901a.pth + + - Name: kd_s_rtmdet_m_neck_300e_coco + In Collection: RTMDet + Config: configs/rtmdet/distillation/kd_s_rtmdet_m_neck_300e_coco.py + Metadata: + Training Memory (GB): 16.3 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 45.7 + Weights: https://download.openmmlab.com/mmrazor/v1/rtmdet_distillation/kd_s_rtmdet_m_neck_300e_coco/kd_s_rtmdet_m_neck_300e_coco_20230220_140647-446ff003.pth + + - Name: rtmdet_m_syncbn_fast_8xb32-300e_coco + In Collection: RTMDet + Config: configs/rtmdet/rtmdet_m_syncbn_fast_8xb32-300e_coco.py + Metadata: + Training Memory (GB): 27.8 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 49.3 + Weights: https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_m_syncbn_fast_8xb32-300e_coco/rtmdet_m_syncbn_fast_8xb32-300e_coco_20230102_135952-40af4fe8.pth + + - Name: kd_m_rtmdet_l_neck_300e_coco + In Collection: RTMDet + Config: configs/rtmdet/distillation/kd_m_rtmdet_l_neck_300e_coco.py + Metadata: + Training Memory (GB): 29.0 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 50.2 + Weights: https://download.openmmlab.com/mmrazor/v1/rtmdet_distillation/kd_m_rtmdet_l_neck_300e_coco/kd_m_rtmdet_l_neck_300e_coco_20230220_141313-b806f503.pth + + - Name: rtmdet_l_syncbn_fast_8xb32-300e_coco + In Collection: RTMDet + Config: configs/rtmdet/rtmdet_l_syncbn_fast_8xb32-300e_coco.py + Metadata: + Training Memory (GB): 43.2 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 51.4 + Weights: https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_l_syncbn_fast_8xb32-300e_coco/rtmdet_l_syncbn_fast_8xb32-300e_coco_20230102_135928-ee3abdc4.pth + + - Name: kd_l_rtmdet_x_neck_300e_coco + In Collection: RTMDet + Config: configs/rtmdet/distillation/kd_l_rtmdet_x_neck_300e_coco.py + Metadata: + Training Memory (GB): 45.2 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 52.3 + Weights: 
https://download.openmmlab.com/mmrazor/v1/rtmdet_distillation/kd_l_rtmdet_x_neck_300e_coco/kd_l_rtmdet_x_neck_300e_coco_20230220_141912-c9979722.pth + + - Name: rtmdet_x_syncbn_fast_8xb32-300e_coco + In Collection: RTMDet + Config: configs/rtmdet/rtmdet_x_syncbn_fast_8xb32-300e_coco.py + Metadata: + Training Memory (GB): 63.4 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 52.8 + Weights: https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_x_syncbn_fast_8xb32-300e_coco/rtmdet_x_syncbn_fast_8xb32-300e_coco_20221231_100345-b85cd476.pth + + - Name: rtmdet-r_tiny_fast_1xb8-36e_dota + In Collection: Rotated_RTMDet + Config: configs/rtmdet/rotated/rtmdet-r_tiny_fast_1xb8-36e_dota.py + Metadata: + Training Memory (GB): 12.7 + Epochs: 36 + Results: + - Task: Oriented Object Detection + Dataset: DOTAv1.0 + Metrics: + mAP: 75.07 + Weights: https://download.openmmlab.com/mmyolo/v0/rtmdet/rotated/rtmdet-r_tiny_fast_1xb8-36e_dota/rtmdet-r_tiny_fast_1xb8-36e_dota_20230228_162210-e8ccfb1c.pth + + - Name: rtmdet-r_s_fast_1xb8-36e_dota + In Collection: Rotated_RTMDet + Config: configs/rtmdet/rotated/rtmdet-r_s_fast_1xb8-36e_dota.py + Metadata: + Training Memory (GB): 16.6 + Epochs: 36 + Results: + - Task: Oriented Object Detection + Dataset: DOTAv1.0 + Metrics: + mAP: 77.33 + Weights: https://download.openmmlab.com/mmyolo/v0/rtmdet/rotated/rtmdet-r_s_fast_1xb8-36e_dota/rtmdet-r_s_fast_1xb8-36e_dota_20230224_110307-3946a5aa.pth + + - Name: rtmdet-r_m_syncbn_fast_2xb4-36e_dota + In Collection: Rotated_RTMDet + Config: configs/rtmdet/rotated/rtmdet-r_m_syncbn_fast_2xb4-36e_dota.py + Metadata: + Training Resources: 2x A100 GPUs + Training Memory (GB): 10.9 + Epochs: 36 + Results: + - Task: Oriented Object Detection + Dataset: DOTAv1.0 + Metrics: + mAP: 78.43 + Weights: https://download.openmmlab.com/mmyolo/v0/rtmdet/rotated/rtmdet-r_m_syncbn_fast_2xb4-36e_dota/rtmdet-r_m_syncbn_fast_2xb4-36e_dota_20230224_124237-29ae1619.pth + + - Name: rtmdet-r_l_syncbn_fast_2xb4-36e_dota + In Collection: Rotated_RTMDet + Config: configs/rtmdet/rotated/rtmdet-r_l_syncbn_fast_2xb4-36e_dota.py + Metadata: + Training Resources: 2x A100 GPUs + Training Memory (GB): 16.1 + Epochs: 36 + Results: + - Task: Oriented Object Detection + Dataset: DOTAv1.0 + Metrics: + mAP: 78.66 + Weights: https://download.openmmlab.com/mmyolo/v0/rtmdet/rotated/rtmdet-r_l_syncbn_fast_2xb4-36e_dota/rtmdet-r_l_syncbn_fast_2xb4-36e_dota_20230224_124544-38bc5f08.pth + + - Name: rtmdet-r_l_syncbn_fast_2xb4-aug-100e_dota + In Collection: Rotated_RTMDet + Config: configs/rtmdet/rotated/rtmdet-r_l_syncbn_fast_2xb4-aug-100e_dota.py + Metadata: + Training Resources: 2x A100 GPUs + Training Memory (GB): 19.6 + Epochs: 100 + Results: + - Task: Oriented Object Detection + Dataset: DOTAv1.0 + Metrics: + mAP: 80.14 + Weights: https://download.openmmlab.com/mmyolo/v0/rtmdet/rotated/rtmdet-r_l_syncbn_fast_2xb4-aug-100e_dota/rtmdet-r_l_syncbn_fast_2xb4-aug-100e_dota_20230224_124735-ed4ea966.pth diff --git a/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_l_syncbn_fast_2xb4-36e_dota-ms.py b/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_l_syncbn_fast_2xb4-36e_dota-ms.py new file mode 100644 index 0000000000000000000000000000000000000000..ef29a1d051b84d8c546edb3cabb958ec586e1261 --- /dev/null +++ b/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_l_syncbn_fast_2xb4-36e_dota-ms.py @@ -0,0 +1,30 @@ +_base_ = './rtmdet-r_l_syncbn_fast_2xb4-36e_dota.py' + +# ========================modified parameters====================== 
+data_root = 'data/split_ms_dota/' +# Path of test images folder +test_data_prefix = 'test/images/' +# Submission dir for result submit +submission_dir = './work_dirs/{{fileBasenameNoExtension}}/submission' + +# =======================Unmodified in most cases================== +train_dataloader = dict(dataset=dict(data_root=data_root)) + +val_dataloader = dict(dataset=dict(data_root=data_root)) + +# Inference on val dataset +test_dataloader = val_dataloader + +# Inference on test dataset and format the output results +# for submission. Note: the test set has no annotation. +# test_dataloader = dict( +# dataset=dict( +# data_root=data_root, +# ann_file='', # test set has no annotation +# data_prefix=dict(img_path=test_data_prefix), +# pipeline=_base_.test_pipeline)) +# test_evaluator = dict( +# type='mmrotate.DOTAMetric', +# format_only=True, +# merge_patches=True, +# outfile_prefix=submission_dir) diff --git a/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_l_syncbn_fast_2xb4-36e_dota.py b/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_l_syncbn_fast_2xb4-36e_dota.py new file mode 100644 index 0000000000000000000000000000000000000000..cbb2ae77a370a73e463068e11291afb4a59cda02 --- /dev/null +++ b/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_l_syncbn_fast_2xb4-36e_dota.py @@ -0,0 +1,331 @@ +_base_ = '../../_base_/default_runtime.py' + +checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-l_8xb256-rsb-a1-600e_in1k-6a760974.pth' # noqa + +# ========================Frequently modified parameters====================== +# -----data related----- +data_root = 'data/split_ss_dota/' +# Path of train annotation folder +train_ann_file = 'trainval/annfiles/' +train_data_prefix = 'trainval/images/' # Prefix of train image path +# Path of val annotation folder +val_ann_file = 'trainval/annfiles/' +val_data_prefix = 'trainval/images/' # Prefix of val image path +# Path of test images folder +test_data_prefix = 'test/images/' + +# Submission dir for result submit +submission_dir = './work_dirs/{{fileBasenameNoExtension}}/submission' + +num_classes = 15 # Number of classes for classification +# Batch size of a single GPU during training +train_batch_size_per_gpu = 4 +# Worker to pre-fetch data for each single GPU during training +train_num_workers = 8 +# persistent_workers must be False if num_workers is 0. +persistent_workers = True + +# -----train val related----- +# Base learning rate for optim_wrapper. Corresponding to 1xb8=8 bs +base_lr = 0.00025 # 0.004 / 16 +max_epochs = 36 # Maximum training epochs + +model_test_cfg = dict( + # The config of multi-label for multi-class prediction. + multi_label=True, + # Decode rbox with angle, For RTMDet-R, Defaults to True. + # When set to True, use rbox coder such as DistanceAnglePointCoder + # When set to False, use hbox coder such as DistancePointBBoxCoder + # different setting lead to different AP. + decode_with_angle=True, + # The number of boxes before NMS + nms_pre=30000, + score_thr=0.05, # Threshold to filter out boxes. 
+ nms=dict(type='nms_rotated', iou_threshold=0.1), # NMS type and threshold + max_per_img=2000) # Max number of detections of each image + +# ========================Possible modified parameters======================== +# -----data related----- +img_scale = (1024, 1024) # width, height +# ratio for random rotate +random_rotate_ratio = 0.5 +# label ids for rect objs +rotate_rect_obj_labels = [9, 11] +# Dataset type, this will be used to define the dataset +dataset_type = 'YOLOv5DOTADataset' +# Batch size of a single GPU during validation +val_batch_size_per_gpu = 8 +# Worker to pre-fetch data for each single GPU during validation +val_num_workers = 8 + +# Config of batch shapes. Only on val. Not use in RTMDet-R +batch_shapes_cfg = None + +# -----model related----- +# The scaling factor that controls the depth of the network structure +deepen_factor = 1.0 +# The scaling factor that controls the width of the network structure +widen_factor = 1.0 +# Strides of multi-scale prior box +strides = [8, 16, 32] +# The angle definition for model +angle_version = 'le90' # le90, le135, oc are available options + +norm_cfg = dict(type='BN') # Normalization config + +# -----train val related----- +lr_start_factor = 1.0e-5 +dsl_topk = 13 # Number of bbox selected in each level +loss_cls_weight = 1.0 +loss_bbox_weight = 2.0 +qfl_beta = 2.0 # beta of QualityFocalLoss +weight_decay = 0.05 + +# Save model checkpoint and validation intervals +save_checkpoint_intervals = 1 +# The maximum checkpoints to keep. +max_keep_ckpts = 3 +# single-scale training is recommended to +# be turned on, which can speed up training. +env_cfg = dict(cudnn_benchmark=True) + +# ===============================Unmodified in most cases==================== +model = dict( + type='YOLODetector', + data_preprocessor=dict( + type='YOLOv5DetDataPreprocessor', + mean=[103.53, 116.28, 123.675], + std=[57.375, 57.12, 58.395], + bgr_to_rgb=False), + backbone=dict( + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=deepen_factor, + widen_factor=widen_factor, + channel_attention=True, + norm_cfg=norm_cfg, + act_cfg=dict(type='SiLU', inplace=True), + init_cfg=dict( + type='Pretrained', prefix='backbone.', checkpoint=checkpoint)), + neck=dict( + type='CSPNeXtPAFPN', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + in_channels=[256, 512, 1024], + out_channels=256, + num_csp_blocks=3, + expand_ratio=0.5, + norm_cfg=norm_cfg, + act_cfg=dict(type='SiLU', inplace=True)), + bbox_head=dict( + type='RTMDetRotatedHead', + head_module=dict( + type='RTMDetRotatedSepBNHeadModule', + num_classes=num_classes, + widen_factor=widen_factor, + in_channels=256, + stacked_convs=2, + feat_channels=256, + norm_cfg=norm_cfg, + act_cfg=dict(type='SiLU', inplace=True), + share_conv=True, + pred_kernel_size=1, + featmap_strides=strides), + prior_generator=dict( + type='mmdet.MlvlPointGenerator', offset=0, strides=strides), + bbox_coder=dict( + type='DistanceAnglePointCoder', angle_version=angle_version), + loss_cls=dict( + type='mmdet.QualityFocalLoss', + use_sigmoid=True, + beta=qfl_beta, + loss_weight=loss_cls_weight), + loss_bbox=dict( + type='mmrotate.RotatedIoULoss', + mode='linear', + loss_weight=loss_bbox_weight), + angle_version=angle_version, + # Used for angle encode and decode, similar to bbox coder + angle_coder=dict(type='mmrotate.PseudoAngleCoder'), + # If true, it will apply loss_bbox on horizontal box, and angle_loss + # needs to be specified. In this case the loss_bbox should use + # horizontal box loss e.g. IoULoss. 
Arg details can be seen in + # `docs/zh_cn/tutorials/rotated_detection.md` + use_hbbox_loss=False, + loss_angle=None), + train_cfg=dict( + assigner=dict( + type='BatchDynamicSoftLabelAssigner', + num_classes=num_classes, + topk=dsl_topk, + iou_calculator=dict(type='mmrotate.RBboxOverlaps2D'), + # RBboxOverlaps2D doesn't support batch input, use loop instead. + batch_iou=False), + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=model_test_cfg, +) + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True, box_type='qbox'), + dict( + type='mmrotate.ConvertBoxType', + box_type_mapping=dict(gt_bboxes='rbox')), + dict(type='mmdet.Resize', scale=img_scale, keep_ratio=True), + dict( + type='mmdet.RandomFlip', + prob=0.75, + direction=['horizontal', 'vertical', 'diagonal']), + dict( + type='mmrotate.RandomRotate', + prob=random_rotate_ratio, + angle_range=180, + rotate_type='mmrotate.Rotate', + rect_obj_labels=rotate_rect_obj_labels), + dict(type='mmdet.Pad', size=img_scale, pad_val=dict(img=(114, 114, 114))), + dict(type='RegularizeRotatedBox', angle_version=angle_version), + dict(type='mmdet.PackDetInputs') +] + +val_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='mmdet.Resize', scale=img_scale, keep_ratio=True), + dict(type='mmdet.Pad', size=img_scale, pad_val=dict(img=(114, 114, 114))), + dict( + type='LoadAnnotations', + with_bbox=True, + box_type='qbox', + _scope_='mmdet'), + dict( + type='mmrotate.ConvertBoxType', + box_type_mapping=dict(gt_bboxes='rbox')), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='mmdet.Resize', scale=img_scale, keep_ratio=True), + dict(type='mmdet.Pad', size=img_scale, pad_val=dict(img=(114, 114, 114))), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + persistent_workers=persistent_workers, + pin_memory=True, + collate_fn=dict(type='yolov5_collate'), + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=train_ann_file, + data_prefix=dict(img_path=train_data_prefix), + filter_cfg=dict(filter_empty_gt=True), + pipeline=train_pipeline)) + +val_dataloader = dict( + batch_size=val_batch_size_per_gpu, + num_workers=val_num_workers, + persistent_workers=persistent_workers, + pin_memory=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=val_ann_file, + data_prefix=dict(img_path=val_data_prefix), + test_mode=True, + batch_shapes_cfg=batch_shapes_cfg, + pipeline=val_pipeline)) + +val_evaluator = dict(type='mmrotate.DOTAMetric', metric='mAP') + +# Inference on val dataset +test_dataloader = val_dataloader +test_evaluator = val_evaluator + +# Inference on test dataset and format the output results +# for submission. Note: the test set has no annotation. 
+# test_dataloader = dict( +# batch_size=val_batch_size_per_gpu, +# num_workers=val_num_workers, +# persistent_workers=True, +# drop_last=False, +# sampler=dict(type='DefaultSampler', shuffle=False), +# dataset=dict( +# type=dataset_type, +# data_root=data_root, +# data_prefix=dict(img_path=test_data_prefix), +# test_mode=True, +# batch_shapes_cfg=batch_shapes_cfg, +# pipeline=test_pipeline)) +# test_evaluator = dict( +# type='mmrotate.DOTAMetric', +# format_only=True, +# merge_patches=True, +# outfile_prefix=submission_dir) + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=weight_decay), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=lr_start_factor, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 150 to 300 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# hooks +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + interval=save_checkpoint_intervals, + max_keep_ckpts=max_keep_ckpts, # only keep latest 3 checkpoints + save_best='auto')) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + strict_load=False, + priority=49) +] + +train_cfg = dict( + type='EpochBasedTrainLoop', + max_epochs=max_epochs, + val_interval=save_checkpoint_intervals) + +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +visualizer = dict(type='mmrotate.RotLocalVisualizer') diff --git a/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_l_syncbn_fast_2xb4-aug-100e_dota.py b/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_l_syncbn_fast_2xb4-aug-100e_dota.py new file mode 100644 index 0000000000000000000000000000000000000000..dcafa55db97ffd543af3bc382d15de361cadbd75 --- /dev/null +++ b/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_l_syncbn_fast_2xb4-aug-100e_dota.py @@ -0,0 +1,168 @@ +_base_ = './rtmdet-r_l_syncbn_fast_2xb4-36e_dota.py' + +# This config use longer schedule with Mixup, Mosaic and Random Rotate. + +checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-l_8xb256-rsb-a1-600e_in1k-6a760974.pth' # noqa + +# ========================modified parameters====================== + +# Base learning rate for optim_wrapper. Corresponding to 1xb8=8 bs +base_lr = 0.00025 # 0.004 / 16 +lr_start_factor = 1.0e-5 +max_epochs = 100 # Maximum training epochs +# Change train_pipeline for final 10 epochs (stage 2) +num_epochs_stage2 = 10 + +img_scale = (1024, 1024) # width, height +# ratio range for random resize +random_resize_ratio_range = (0.1, 2.0) +# Cached images number in mosaic +mosaic_max_cached_images = 40 +# Number of cached images in mixup +mixup_max_cached_images = 20 +# ratio for random rotate +random_rotate_ratio = 0.5 +# label ids for rect objs +rotate_rect_obj_labels = [9, 11] + +# Save model checkpoint and validation intervals +save_checkpoint_intervals = 1 +# validation intervals in stage 2 +val_interval_stage2 = 1 +# The maximum checkpoints to keep. 
+max_keep_ckpts = 3 + +# Submission dir for result submit +submission_dir = './work_dirs/{{fileBasenameNoExtension}}/submission' + +# =======================Unmodified in most cases================== + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True, box_type='qbox'), + dict( + type='mmrotate.ConvertBoxType', + box_type_mapping=dict(gt_bboxes='rbox')), + dict( + type='Mosaic', + img_scale=img_scale, + use_cached=True, + max_cached_images=mosaic_max_cached_images, + pad_val=114.0), + dict( + type='mmdet.RandomResize', + # img_scale is (width, height) + scale=(img_scale[0] * 2, img_scale[1] * 2), + ratio_range=random_resize_ratio_range, + resize_type='mmdet.Resize', + keep_ratio=True), + dict( + type='mmrotate.RandomRotate', + prob=random_rotate_ratio, + angle_range=180, + rotate_type='mmrotate.Rotate', + rect_obj_labels=rotate_rect_obj_labels), + dict(type='mmdet.RandomCrop', crop_size=img_scale), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='mmdet.RandomFlip', + prob=0.75, + direction=['horizontal', 'vertical', 'diagonal']), + dict(type='mmdet.Pad', size=img_scale, pad_val=dict(img=(114, 114, 114))), + dict( + type='YOLOv5MixUp', + use_cached=True, + max_cached_images=mixup_max_cached_images), + dict(type='mmdet.PackDetInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True, box_type='qbox'), + dict( + type='mmrotate.ConvertBoxType', + box_type_mapping=dict(gt_bboxes='rbox')), + dict( + type='mmdet.RandomResize', + scale=img_scale, + ratio_range=random_resize_ratio_range, + resize_type='mmdet.Resize', + keep_ratio=True), + dict( + type='mmrotate.RandomRotate', + prob=random_rotate_ratio, + angle_range=180, + rotate_type='mmrotate.Rotate', + rect_obj_labels=rotate_rect_obj_labels), + dict(type='mmdet.RandomCrop', crop_size=img_scale), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + type='mmdet.RandomFlip', + prob=0.75, + direction=['horizontal', 'vertical', 'diagonal']), + dict(type='mmdet.Pad', size=img_scale, pad_val=dict(img=(114, 114, 114))), + dict(type='mmdet.PackDetInputs') +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=lr_start_factor, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 150 to 300 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# hooks +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + interval=save_checkpoint_intervals, + max_keep_ckpts=max_keep_ckpts, # only keep latest 3 checkpoints + save_best='auto')) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + strict_load=False, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - num_epochs_stage2, + switch_pipeline=train_pipeline_stage2) +] + +train_cfg = dict( + type='EpochBasedTrainLoop', + max_epochs=max_epochs, + val_interval=save_checkpoint_intervals, + dynamic_intervals=[(max_epochs - num_epochs_stage2, val_interval_stage2)]) + +# Inference on test dataset and format the output results +# for submission. Note: the test set has no annotation. 
+# test_dataloader = dict( +# dataset=dict( +# data_root=_base_.data_root, +# ann_file='', # test set has no annotation +# data_prefix=dict(img_path=_base_.test_data_prefix), +# pipeline=_base_.test_pipeline)) +# test_evaluator = dict( +# type='mmrotate.DOTAMetric', +# format_only=True, +# merge_patches=True, +# outfile_prefix=submission_dir) diff --git a/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_l_syncbn_fast_coco-pretrain_2xb4-36e_dota-ms.py b/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_l_syncbn_fast_coco-pretrain_2xb4-36e_dota-ms.py new file mode 100644 index 0000000000000000000000000000000000000000..1a9f50cdded21c36f9b76b49e291b60e0a2dff07 --- /dev/null +++ b/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_l_syncbn_fast_coco-pretrain_2xb4-36e_dota-ms.py @@ -0,0 +1,20 @@ +_base_ = './rtmdet-r_l_syncbn_fast_2xb4-36e_dota-ms.py' + +load_from = 'https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_l_syncbn_fast_8xb32-300e_coco/rtmdet_l_syncbn_fast_8xb32-300e_coco_20230102_135928-ee3abdc4.pth' # noqa + +# Submission dir for result submit +submission_dir = './work_dirs/{{fileBasenameNoExtension}}/submission' + +# Inference on test dataset and format the output results +# for submission. Note: the test set has no annotation. +# test_dataloader = dict( +# dataset=dict( +# data_root=_base_.data_root, +# ann_file='', # test set has no annotation +# data_prefix=dict(img_path=_base_.test_data_prefix), +# pipeline=_base_.test_pipeline)) +# test_evaluator = dict( +# type='mmrotate.DOTAMetric', +# format_only=True, +# merge_patches=True, +# outfile_prefix=submission_dir) diff --git a/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_m_syncbn_fast_2xb4-36e_dota-ms.py b/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_m_syncbn_fast_2xb4-36e_dota-ms.py new file mode 100644 index 0000000000000000000000000000000000000000..4be8605f6de383c4e39edae6cfdc19f5ea005353 --- /dev/null +++ b/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_m_syncbn_fast_2xb4-36e_dota-ms.py @@ -0,0 +1,33 @@ +_base_ = './rtmdet-r_l_syncbn_fast_2xb4-36e_dota-ms.py' + +checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-m_8xb256-rsb-a1-600e_in1k-ecb3bbd9.pth' # noqa + +# ========================modified parameters====================== +deepen_factor = 0.67 +widen_factor = 0.75 + +# Submission dir for result submit +submission_dir = './work_dirs/{{fileBasenameNoExtension}}/submission' + +# =======================Unmodified in most cases================== +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + init_cfg=dict(checkpoint=checkpoint)), + neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) + +# Inference on test dataset and format the output results +# for submission. Note: the test set has no annotation. 
+# test_dataloader = dict( +# dataset=dict( +# data_root=_base_.data_root, +# ann_file='', # test set has no annotation +# data_prefix=dict(img_path=_base_.test_data_prefix), +# pipeline=_base_.test_pipeline)) +# test_evaluator = dict( +# type='mmrotate.DOTAMetric', +# format_only=True, +# merge_patches=True, +# outfile_prefix=submission_dir) diff --git a/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_m_syncbn_fast_2xb4-36e_dota.py b/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_m_syncbn_fast_2xb4-36e_dota.py new file mode 100644 index 0000000000000000000000000000000000000000..8df61cffd6e165e36965b2622735abb93fbe8d83 --- /dev/null +++ b/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_m_syncbn_fast_2xb4-36e_dota.py @@ -0,0 +1,33 @@ +_base_ = './rtmdet-r_l_syncbn_fast_2xb4-36e_dota.py' + +checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-m_8xb256-rsb-a1-600e_in1k-ecb3bbd9.pth' # noqa + +# ========================modified parameters====================== +deepen_factor = 0.67 +widen_factor = 0.75 + +# Submission dir for result submit +submission_dir = './work_dirs/{{fileBasenameNoExtension}}/submission' + +# =======================Unmodified in most cases================== +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + init_cfg=dict(checkpoint=checkpoint)), + neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) + +# Inference on test dataset and format the output results +# for submission. Note: the test set has no annotation. +# test_dataloader = dict( +# dataset=dict( +# data_root=_base_.data_root, +# ann_file='', # test set has no annotation +# data_prefix=dict(img_path=_base_.test_data_prefix), +# pipeline=_base_.test_pipeline)) +# test_evaluator = dict( +# type='mmrotate.DOTAMetric', +# format_only=True, +# merge_patches=True, +# outfile_prefix=submission_dir) diff --git a/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_s_fast_1xb8-36e_dota-ms.py b/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_s_fast_1xb8-36e_dota-ms.py new file mode 100644 index 0000000000000000000000000000000000000000..2b7b0b6ffee9cdf2720696ce6fe51b87927ada6e --- /dev/null +++ b/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_s_fast_1xb8-36e_dota-ms.py @@ -0,0 +1,38 @@ +_base_ = './rtmdet-r_l_syncbn_fast_2xb4-36e_dota-ms.py' + +checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-s_imagenet_600e.pth' # noqa + +# ========================modified parameters====================== +deepen_factor = 0.33 +widen_factor = 0.5 + +# Batch size of a single GPU during training +train_batch_size_per_gpu = 8 + +# Submission dir for result submit +submission_dir = './work_dirs/{{fileBasenameNoExtension}}/submission' + +# =======================Unmodified in most cases================== +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + init_cfg=dict(checkpoint=checkpoint)), + neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) + +train_dataloader = dict(batch_size=train_batch_size_per_gpu) + +# Inference on test dataset and format the output results +# for submission. Note: the test set has no annotation. 
+# test_dataloader = dict( +# dataset=dict( +# data_root=_base_.data_root, +# ann_file='', # test set has no annotation +# data_prefix=dict(img_path=_base_.test_data_prefix), +# pipeline=_base_.test_pipeline)) +# test_evaluator = dict( +# type='mmrotate.DOTAMetric', +# format_only=True, +# merge_patches=True, +# outfile_prefix=submission_dir) diff --git a/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_s_fast_1xb8-36e_dota.py b/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_s_fast_1xb8-36e_dota.py new file mode 100644 index 0000000000000000000000000000000000000000..d200dd76491dafb306900de23a25359224205d13 --- /dev/null +++ b/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_s_fast_1xb8-36e_dota.py @@ -0,0 +1,38 @@ +_base_ = './rtmdet-r_l_syncbn_fast_2xb4-36e_dota.py' + +checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-s_imagenet_600e.pth' # noqa + +# ========================modified parameters====================== +deepen_factor = 0.33 +widen_factor = 0.5 + +# Batch size of a single GPU during training +train_batch_size_per_gpu = 8 + +# Submission dir for result submit +submission_dir = './work_dirs/{{fileBasenameNoExtension}}/submission' + +# =======================Unmodified in most cases================== +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + init_cfg=dict(checkpoint=checkpoint)), + neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) + +train_dataloader = dict(batch_size=train_batch_size_per_gpu) + +# Inference on test dataset and format the output results +# for submission. Note: the test set has no annotation. +# test_dataloader = dict( +# dataset=dict( +# data_root=_base_.data_root, +# ann_file='', # test set has no annotation +# data_prefix=dict(img_path=_base_.test_data_prefix), +# pipeline=_base_.test_pipeline)) +# test_evaluator = dict( +# type='mmrotate.DOTAMetric', +# format_only=True, +# merge_patches=True, +# outfile_prefix=submission_dir) diff --git a/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_tiny_fast_1xb8-36e_dota-ms.py b/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_tiny_fast_1xb8-36e_dota-ms.py new file mode 100644 index 0000000000000000000000000000000000000000..56bf038b6500bb0640160e680ddbb5e4c34fd3f8 --- /dev/null +++ b/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_tiny_fast_1xb8-36e_dota-ms.py @@ -0,0 +1,38 @@ +_base_ = './rtmdet-r_l_syncbn_fast_2xb4-36e_dota-ms.py' + +checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-tiny_imagenet_600e.pth' # noqa + +# ========================modified parameters====================== +deepen_factor = 0.167 +widen_factor = 0.375 + +# Batch size of a single GPU during training +train_batch_size_per_gpu = 8 + +# Submission dir for result submit +submission_dir = './work_dirs/{{fileBasenameNoExtension}}/submission' + +# =======================Unmodified in most cases================== +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + init_cfg=dict(checkpoint=checkpoint)), + neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) + +train_dataloader = dict(batch_size=train_batch_size_per_gpu) + +# Inference on test dataset and format the output results +# for submission. Note: the test set has no annotation. 
+# test_dataloader = dict( +# dataset=dict( +# data_root=_base_.data_root, +# ann_file='', # test set has no annotation +# data_prefix=dict(img_path=_base_.test_data_prefix), +# pipeline=_base_.test_pipeline)) +# test_evaluator = dict( +# type='mmrotate.DOTAMetric', +# format_only=True, +# merge_patches=True, +# outfile_prefix=submission_dir) diff --git a/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_tiny_fast_1xb8-36e_dota.py b/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_tiny_fast_1xb8-36e_dota.py new file mode 100644 index 0000000000000000000000000000000000000000..739a2de8020ad6879a8401255395df2e807f66c4 --- /dev/null +++ b/third_party/mmyolo/configs/rtmdet/rotated/rtmdet-r_tiny_fast_1xb8-36e_dota.py @@ -0,0 +1,38 @@ +_base_ = './rtmdet-r_l_syncbn_fast_2xb4-36e_dota.py' + +checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-tiny_imagenet_600e.pth' # noqa + +# ========================modified parameters====================== +deepen_factor = 0.167 +widen_factor = 0.375 + +# Batch size of a single GPU during training +train_batch_size_per_gpu = 8 + +# Submission dir for result submit +submission_dir = './work_dirs/{{fileBasenameNoExtension}}/submission' + +# =======================Unmodified in most cases================== +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + init_cfg=dict(checkpoint=checkpoint)), + neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) + +train_dataloader = dict(batch_size=train_batch_size_per_gpu) + +# Inference on test dataset and format the output results +# for submission. Note: the test set has no annotation. +# test_dataloader = dict( +# dataset=dict( +# data_root=_base_.data_root, +# ann_file='', # test set has no annotation +# data_prefix=dict(img_path=_base_.test_data_prefix), +# pipeline=_base_.test_pipeline)) +# test_evaluator = dict( +# type='mmrotate.DOTAMetric', +# format_only=True, +# merge_patches=True, +# outfile_prefix=submission_dir) diff --git a/third_party/mmyolo/configs/rtmdet/rtmdet-ins_s_syncbn_fast_8xb32-300e_coco.py b/third_party/mmyolo/configs/rtmdet/rtmdet-ins_s_syncbn_fast_8xb32-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..279a7990bc4a58a5c10bfc3dd29e570c7e3a14cc --- /dev/null +++ b/third_party/mmyolo/configs/rtmdet/rtmdet-ins_s_syncbn_fast_8xb32-300e_coco.py @@ -0,0 +1,31 @@ +_base_ = './rtmdet_s_syncbn_fast_8xb32-300e_coco.py' + +widen_factor = 0.5 + +model = dict( + bbox_head=dict( + type='RTMDetInsSepBNHead', + head_module=dict( + type='RTMDetInsSepBNHeadModule', + use_sigmoid_cls=True, + widen_factor=widen_factor), + loss_mask=dict( + type='mmdet.DiceLoss', loss_weight=2.0, eps=5e-6, + reduction='mean')), + test_cfg=dict( + multi_label=True, + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=100, + mask_thr_binary=0.5)) + +_base_.test_pipeline[-2] = dict( + type='LoadAnnotations', with_bbox=True, with_mask=True, _scope_='mmdet') + +val_dataloader = dict(dataset=dict(pipeline=_base_.test_pipeline)) +test_dataloader = val_dataloader + +val_evaluator = dict(metric=['bbox', 'segm']) +test_evaluator = val_evaluator diff --git a/third_party/mmyolo/configs/rtmdet/rtmdet_l_syncbn_fast_8xb32-300e_coco.py b/third_party/mmyolo/configs/rtmdet/rtmdet_l_syncbn_fast_8xb32-300e_coco.py new file mode 100644 index 
0000000000000000000000000000000000000000..c36ac38ce16db6bbd66fe0c2271c34c252a538ab --- /dev/null +++ b/third_party/mmyolo/configs/rtmdet/rtmdet_l_syncbn_fast_8xb32-300e_coco.py @@ -0,0 +1,304 @@ +_base_ = ['../_base_/default_runtime.py', '../_base_/det_p5_tta.py'] + +# ========================Frequently modified parameters====================== +# -----data related----- +data_root = 'data/coco/' +# Path of train annotation file +train_ann_file = 'annotations/instances_train2017.json' +train_data_prefix = 'train2017/' # Prefix of train image path +# Path of val annotation file +val_ann_file = 'annotations/instances_val2017.json' +val_data_prefix = 'val2017/' # Prefix of val image path + +num_classes = 80 # Number of classes for classification +# Batch size of a single GPU during training +train_batch_size_per_gpu = 32 +# Worker to pre-fetch data for each single GPU during training +train_num_workers = 10 +# persistent_workers must be False if num_workers is 0. +persistent_workers = True + +# -----train val related----- +# Base learning rate for optim_wrapper. Corresponding to 8xb16=64 bs +base_lr = 0.004 +max_epochs = 300 # Maximum training epochs +# Change train_pipeline for final 20 epochs (stage 2) +num_epochs_stage2 = 20 + +model_test_cfg = dict( + # The config of multi-label for multi-class prediction. + multi_label=True, + # The number of boxes before NMS + nms_pre=30000, + score_thr=0.001, # Threshold to filter out boxes. + nms=dict(type='nms', iou_threshold=0.65), # NMS type and threshold + max_per_img=300) # Max number of detections of each image + +# ========================Possible modified parameters======================== +# -----data related----- +img_scale = (640, 640) # width, height +# ratio range for random resize +random_resize_ratio_range = (0.1, 2.0) +# Cached images number in mosaic +mosaic_max_cached_images = 40 +# Number of cached images in mixup +mixup_max_cached_images = 20 +# Dataset type, this will be used to define the dataset +dataset_type = 'YOLOv5CocoDataset' +# Batch size of a single GPU during validation +val_batch_size_per_gpu = 32 +# Worker to pre-fetch data for each single GPU during validation +val_num_workers = 10 + +# Config of batch shapes. Only on val. +batch_shapes_cfg = dict( + type='BatchShapePolicy', + batch_size=val_batch_size_per_gpu, + img_size=img_scale[0], + size_divisor=32, + extra_pad_ratio=0.5) + +# -----model related----- +# The scaling factor that controls the depth of the network structure +deepen_factor = 1.0 +# The scaling factor that controls the width of the network structure +widen_factor = 1.0 +# Strides of multi-scale prior box +strides = [8, 16, 32] + +norm_cfg = dict(type='BN') # Normalization config + +# -----train val related----- +lr_start_factor = 1.0e-5 +dsl_topk = 13 # Number of bbox selected in each level +loss_cls_weight = 1.0 +loss_bbox_weight = 2.0 +qfl_beta = 2.0 # beta of QualityFocalLoss +weight_decay = 0.05 + +# Save model checkpoint and validation intervals +save_checkpoint_intervals = 10 +# validation intervals in stage 2 +val_interval_stage2 = 1 +# The maximum checkpoints to keep. +max_keep_ckpts = 3 +# single-scale training is recommended to +# be turned on, which can speed up training. 
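+# (`cudnn_benchmark=True` assumes a fixed input resolution; if you switch to
+# multi-scale training it is usually better to turn it off.)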
+env_cfg = dict(cudnn_benchmark=True) + +# ===============================Unmodified in most cases==================== +model = dict( + type='YOLODetector', + data_preprocessor=dict( + type='YOLOv5DetDataPreprocessor', + mean=[103.53, 116.28, 123.675], + std=[57.375, 57.12, 58.395], + bgr_to_rgb=False), + backbone=dict( + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=deepen_factor, + widen_factor=widen_factor, + channel_attention=True, + norm_cfg=norm_cfg, + act_cfg=dict(type='SiLU', inplace=True)), + neck=dict( + type='CSPNeXtPAFPN', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + in_channels=[256, 512, 1024], + out_channels=256, + num_csp_blocks=3, + expand_ratio=0.5, + norm_cfg=norm_cfg, + act_cfg=dict(type='SiLU', inplace=True)), + bbox_head=dict( + type='RTMDetHead', + head_module=dict( + type='RTMDetSepBNHeadModule', + num_classes=num_classes, + in_channels=256, + stacked_convs=2, + feat_channels=256, + norm_cfg=norm_cfg, + act_cfg=dict(type='SiLU', inplace=True), + share_conv=True, + pred_kernel_size=1, + featmap_strides=strides), + prior_generator=dict( + type='mmdet.MlvlPointGenerator', offset=0, strides=strides), + bbox_coder=dict(type='DistancePointBBoxCoder'), + loss_cls=dict( + type='mmdet.QualityFocalLoss', + use_sigmoid=True, + beta=qfl_beta, + loss_weight=loss_cls_weight), + loss_bbox=dict(type='mmdet.GIoULoss', loss_weight=loss_bbox_weight)), + train_cfg=dict( + assigner=dict( + type='BatchDynamicSoftLabelAssigner', + num_classes=num_classes, + topk=dsl_topk, + iou_calculator=dict(type='mmdet.BboxOverlaps2D')), + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=model_test_cfg, +) + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Mosaic', + img_scale=img_scale, + use_cached=True, + max_cached_images=mosaic_max_cached_images, + pad_val=114.0), + dict( + type='mmdet.RandomResize', + # img_scale is (width, height) + scale=(img_scale[0] * 2, img_scale[1] * 2), + ratio_range=random_resize_ratio_range, + resize_type='mmdet.Resize', + keep_ratio=True), + dict(type='mmdet.RandomCrop', crop_size=img_scale), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict(type='mmdet.Pad', size=img_scale, pad_val=dict(img=(114, 114, 114))), + dict( + type='YOLOv5MixUp', + use_cached=True, + max_cached_images=mixup_max_cached_images), + dict(type='mmdet.PackDetInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='mmdet.RandomResize', + scale=img_scale, + ratio_range=random_resize_ratio_range, + resize_type='mmdet.Resize', + keep_ratio=True), + dict(type='mmdet.RandomCrop', crop_size=img_scale), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict(type='mmdet.Pad', size=img_scale, pad_val=dict(img=(114, 114, 114))), + dict(type='mmdet.PackDetInputs') +] + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=False, + pad_val=dict(img=114)), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + 
num_workers=train_num_workers, + persistent_workers=persistent_workers, + pin_memory=True, + collate_fn=dict(type='yolov5_collate'), + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=train_ann_file, + data_prefix=dict(img=train_data_prefix), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline)) + +val_dataloader = dict( + batch_size=val_batch_size_per_gpu, + num_workers=val_num_workers, + persistent_workers=persistent_workers, + pin_memory=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=val_ann_file, + data_prefix=dict(img=val_data_prefix), + test_mode=True, + batch_shapes_cfg=batch_shapes_cfg, + pipeline=test_pipeline)) + +test_dataloader = val_dataloader + +# Reduce evaluation time +val_evaluator = dict( + type='mmdet.CocoMetric', + proposal_nums=(100, 1, 10), + ann_file=data_root + val_ann_file, + metric='bbox') +test_evaluator = val_evaluator + +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=weight_decay), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', + start_factor=lr_start_factor, + by_epoch=False, + begin=0, + end=1000), + dict( + # use cosine lr from 150 to 300 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# hooks +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + interval=save_checkpoint_intervals, + max_keep_ckpts=max_keep_ckpts # only keep latest 3 checkpoints + )) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + strict_load=False, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - num_epochs_stage2, + switch_pipeline=train_pipeline_stage2) +] + +train_cfg = dict( + type='EpochBasedTrainLoop', + max_epochs=max_epochs, + val_interval=save_checkpoint_intervals, + dynamic_intervals=[(max_epochs - num_epochs_stage2, val_interval_stage2)]) + +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') diff --git a/third_party/mmyolo/configs/rtmdet/rtmdet_m_syncbn_fast_8xb32-300e_coco.py b/third_party/mmyolo/configs/rtmdet/rtmdet_m_syncbn_fast_8xb32-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..52576bf41689f462e46e83e6236de91ead43e97c --- /dev/null +++ b/third_party/mmyolo/configs/rtmdet/rtmdet_m_syncbn_fast_8xb32-300e_coco.py @@ -0,0 +1,11 @@ +_base_ = './rtmdet_l_syncbn_fast_8xb32-300e_coco.py' + +# ========================modified parameters====================== +deepen_factor = 0.67 +widen_factor = 0.75 + +# =======================Unmodified in most cases================== +model = dict( + backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/third_party/mmyolo/configs/rtmdet/rtmdet_s_syncbn_fast_8xb32-300e_coco.py b/third_party/mmyolo/configs/rtmdet/rtmdet_s_syncbn_fast_8xb32-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..8cead7805974a0a9434f41623ab92beb87fadc60 --- /dev/null +++ 
b/third_party/mmyolo/configs/rtmdet/rtmdet_s_syncbn_fast_8xb32-300e_coco.py @@ -0,0 +1,92 @@ +_base_ = './rtmdet_l_syncbn_fast_8xb32-300e_coco.py' +checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-s_imagenet_600e.pth' # noqa + +# ========================modified parameters====================== +deepen_factor = 0.33 +widen_factor = 0.5 +img_scale = _base_.img_scale + +# ratio range for random resize +random_resize_ratio_range = (0.5, 2.0) +# Number of cached images in mosaic +mosaic_max_cached_images = 40 +# Number of cached images in mixup +mixup_max_cached_images = 20 + +# =======================Unmodified in most cases================== +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + # Since the checkpoint includes CUDA:0 data, + # it must be forced to set map_location. + # Once checkpoint is fixed, it can be removed. + init_cfg=dict( + type='Pretrained', + prefix='backbone.', + checkpoint=checkpoint, + map_location='cpu')), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Mosaic', + img_scale=img_scale, + use_cached=True, + max_cached_images=mosaic_max_cached_images, + pad_val=114.0), + dict( + type='mmdet.RandomResize', + # img_scale is (width, height) + scale=(img_scale[0] * 2, img_scale[1] * 2), + ratio_range=random_resize_ratio_range, # note + resize_type='mmdet.Resize', + keep_ratio=True), + dict(type='mmdet.RandomCrop', crop_size=img_scale), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict(type='mmdet.Pad', size=img_scale, pad_val=dict(img=(114, 114, 114))), + dict( + type='YOLOv5MixUp', + use_cached=True, + max_cached_images=mixup_max_cached_images), + dict(type='mmdet.PackDetInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='mmdet.RandomResize', + scale=img_scale, + ratio_range=random_resize_ratio_range, # note + resize_type='mmdet.Resize', + keep_ratio=True), + dict(type='mmdet.RandomCrop', crop_size=img_scale), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict(type='mmdet.Pad', size=img_scale, pad_val=dict(img=(114, 114, 114))), + dict(type='mmdet.PackDetInputs') +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + strict_load=False, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=_base_.max_epochs - _base_.num_epochs_stage2, + switch_pipeline=train_pipeline_stage2) +] diff --git a/third_party/mmyolo/configs/rtmdet/rtmdet_tiny_fast_1xb12-40e_cat.py b/third_party/mmyolo/configs/rtmdet/rtmdet_tiny_fast_1xb12-40e_cat.py new file mode 100644 index 0000000000000000000000000000000000000000..8d1182c5ef663efdf06801c6cc22991b9545b2ea --- /dev/null +++ b/third_party/mmyolo/configs/rtmdet/rtmdet_tiny_fast_1xb12-40e_cat.py @@ -0,0 +1,70 @@ +_base_ = 'rtmdet_tiny_syncbn_fast_8xb32-300e_coco.py' + +data_root = './data/cat/' +class_name = ('cat', ) +num_classes = len(class_name) +metainfo = dict(classes=class_name, palette=[(20, 220, 60)]) + +num_epochs_stage2 = 5 + +max_epochs = 40 
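+# The overrides in this file (data paths, `metainfo`/`num_classes`, frozen
+# backbone stages, epochs and batch size) are the typical knobs to adjust
+# when fine-tuning on a small custom COCO-format dataset.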
+train_batch_size_per_gpu = 12 +train_num_workers = 4 +val_batch_size_per_gpu = 1 +val_num_workers = 2 + +load_from = 'https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_tiny_syncbn_fast_8xb32-300e_coco/rtmdet_tiny_syncbn_fast_8xb32-300e_coco_20230102_140117-dbb1dc83.pth' # noqa + +model = dict( + backbone=dict(frozen_stages=4), + bbox_head=dict(head_module=dict(num_classes=num_classes)), + train_cfg=dict(assigner=dict(num_classes=num_classes))) + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + data_root=data_root, + metainfo=metainfo, + ann_file='annotations/trainval.json', + data_prefix=dict(img='images/'))) + +val_dataloader = dict( + batch_size=val_batch_size_per_gpu, + num_workers=val_num_workers, + dataset=dict( + metainfo=metainfo, + data_root=data_root, + ann_file='annotations/test.json', + data_prefix=dict(img='images/'))) + +test_dataloader = val_dataloader + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=_base_.lr_start_factor, + by_epoch=False, + begin=0, + end=30), + dict( + # use cosine lr from 150 to 300 epoch + type='CosineAnnealingLR', + eta_min=_base_.base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +_base_.custom_hooks[1].switch_epoch = max_epochs - num_epochs_stage2 + +val_evaluator = dict(ann_file=data_root + 'annotations/test.json') +test_evaluator = val_evaluator + +default_hooks = dict( + checkpoint=dict(interval=10, max_keep_ckpts=2, save_best='auto'), + logger=dict(type='LoggerHook', interval=5)) +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +# visualizer = dict(vis_backends = [dict(type='LocalVisBackend'), dict(type='WandbVisBackend')]) # noqa diff --git a/third_party/mmyolo/configs/rtmdet/rtmdet_tiny_syncbn_fast_8xb32-300e_coco.py b/third_party/mmyolo/configs/rtmdet/rtmdet_tiny_syncbn_fast_8xb32-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..257110d22e9f2330e4c5378001eaf72f6bb885d1 --- /dev/null +++ b/third_party/mmyolo/configs/rtmdet/rtmdet_tiny_syncbn_fast_8xb32-300e_coco.py @@ -0,0 +1,58 @@ +_base_ = './rtmdet_s_syncbn_fast_8xb32-300e_coco.py' +checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-tiny_imagenet_600e.pth' # noqa + +# ========================modified parameters====================== +deepen_factor = 0.167 +widen_factor = 0.375 +img_scale = _base_.img_scale + +# ratio range for random resize +random_resize_ratio_range = (0.5, 2.0) +# Number of cached images in mosaic +mosaic_max_cached_images = 20 +# Number of cached images in mixup +mixup_max_cached_images = 10 + +# =======================Unmodified in most cases================== +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + init_cfg=dict(checkpoint=checkpoint)), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Mosaic', + img_scale=img_scale, + use_cached=True, + max_cached_images=mosaic_max_cached_images, # note + random_pop=False, # note + pad_val=114.0), + dict( + type='mmdet.RandomResize', + # img_scale is (width, height) + scale=(img_scale[0] * 2, img_scale[1] * 2), + ratio_range=random_resize_ratio_range, + resize_type='mmdet.Resize', + 
keep_ratio=True), + dict(type='mmdet.RandomCrop', crop_size=img_scale), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict(type='mmdet.Pad', size=img_scale, pad_val=dict(img=(114, 114, 114))), + dict( + type='YOLOv5MixUp', + use_cached=True, + random_pop=False, + max_cached_images=mixup_max_cached_images, + prob=0.5), + dict(type='mmdet.PackDetInputs') +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) diff --git a/third_party/mmyolo/configs/rtmdet/rtmdet_x_syncbn_fast_8xb32-300e_coco.py b/third_party/mmyolo/configs/rtmdet/rtmdet_x_syncbn_fast_8xb32-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..7fc9001f99ef3d468994c8201d43f08500bdeef9 --- /dev/null +++ b/third_party/mmyolo/configs/rtmdet/rtmdet_x_syncbn_fast_8xb32-300e_coco.py @@ -0,0 +1,11 @@ +_base_ = './rtmdet_l_syncbn_fast_8xb32-300e_coco.py' + +# ========================modified parameters====================== +deepen_factor = 1.33 +widen_factor = 1.25 + +# =======================Unmodified in most cases================== +model = dict( + backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/third_party/mmyolo/configs/yolov5/README.md b/third_party/mmyolo/configs/yolov5/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bd33e83f430b9309e4c0e95902a61db0dd7ae002 --- /dev/null +++ b/third_party/mmyolo/configs/yolov5/README.md @@ -0,0 +1,146 @@ +# YOLOv5 + + + +## Abstract + +YOLOv5 is a family of object detection architectures and models pretrained on the COCO dataset, and represents Ultralytics open-source research into future vision AI methods, incorporating lessons learned and best practices evolved over thousands of hours of research and development. + +
+ +YOLOv5-l-P5 model structure +
+ +
+ +YOLOv5-l-P6 model structure +
+ +## Results and models + +### COCO + +| Backbone | Arch | size | Mask Refine | SyncBN | AMP | Mem (GB) | box AP | TTA box AP | Config | Download | +| :-------: | :--: | :--: | :---------: | :----: | :-: | :------: | :---------: | :--------: | :-----------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| YOLOv5-n | P5 | 640 | No | Yes | Yes | 1.5 | 28.0 | 30.7 | [config](./yolov5_n-v61_syncbn_fast_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_n-v61_syncbn_fast_8xb16-300e_coco/yolov5_n-v61_syncbn_fast_8xb16-300e_coco_20220919_090739-b804c1ad.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_n-v61_syncbn_fast_8xb16-300e_coco/yolov5_n-v61_syncbn_fast_8xb16-300e_coco_20220919_090739.log.json) | +| YOLOv5-n | P5 | 640 | Yes | Yes | Yes | 1.5 | 28.0 | | [config](./mask_refine/yolov5_n_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/mask_refine/yolov5_n_mask-refine-v61_syncbn_fast_8xb16-300e_coco/yolov5_n_mask-refine-v61_syncbn_fast_8xb16-300e_coco_20230305_152706-712fb1b2.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/mask_refine/yolov5_n_mask-refine-v61_syncbn_fast_8xb16-300e_coco/yolov5_n_mask-refine-v61_syncbn_fast_8xb16-300e_coco_20230305_152706.log.json) | +| YOLOv5u-n | P5 | 640 | Yes | Yes | Yes | | | | [config](./yolov5/yolov5u/yolov5_n_mask-refine_syncbn_fast_8xb16-300e_coco.py) | [model](<>) \| [log](<>) | +| YOLOv5-s | P5 | 640 | No | Yes | Yes | 2.7 | 37.7 | 40.2 | [config](./yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700.log.json) | +| YOLOv5-s | P5 | 640 | Yes | Yes | Yes | 2.7 | 38.0 (+0.3) | | [config](./mask_refine/yolov5_s_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/mask_refine/yolov5_s_mask-refine-v61_syncbn_fast_8xb16-300e_coco/yolov5_s_mask-refine-v61_syncbn_fast_8xb16-300e_coco_20230304_033134-8e0cd271.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/mask_refine/yolov5_s_mask-refine-v61_syncbn_fast_8xb16-300e_coco/yolov5_s_mask-refine-v61_syncbn_fast_8xb16-300e_coco_20230304_033134.log.json) | +| YOLOv5u-s | P5 | 640 | Yes | Yes | Yes | | | | [config](./yolov5/yolov5u/yolov5_s_mask-refine_syncbn_fast_8xb16-300e_coco.py) | [model](<>) \| [log](<>) | +| YOLOv5-m | P5 | 640 | No | Yes | Yes | 5.0 | 45.3 | 46.9 | [config](./yolov5_m-v61_syncbn_fast_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_m-v61_syncbn_fast_8xb16-300e_coco/yolov5_m-v61_syncbn_fast_8xb16-300e_coco_20220917_204944-516a710f.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_m-v61_syncbn_fast_8xb16-300e_coco/yolov5_m-v61_syncbn_fast_8xb16-300e_coco_20220917_204944.log.json) | +| YOLOv5-m | P5 | 640 | Yes | Yes | 
Yes | 5.0 | 45.3 | | [config](./mask_refine/yolov5_m_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/mask_refine/yolov5_m_mask-refine-v61_syncbn_fast_8xb16-300e_coco/yolov5_m_mask-refine-v61_syncbn_fast_8xb16-300e_coco_20230305_153946-44e96155.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/mask_refine/yolov5_m_mask-refine-v61_syncbn_fast_8xb16-300e_coco/yolov5_m_mask-refine-v61_syncbn_fast_8xb16-300e_coco_20230305_153946.log.json) | +| YOLOv5u-m | P5 | 640 | Yes | Yes | Yes | | | | [config](./yolov5/yolov5u/yolov5_m_mask-refine_syncbn_fast_8xb16-300e_coco.py) | [model](<>) \| [log](<>) | +| YOLOv5-l | P5 | 640 | No | Yes | Yes | 8.1 | 48.8 | 49.9 | [config](./yolov5_l-v61_syncbn_fast_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_l-v61_syncbn_fast_8xb16-300e_coco/yolov5_l-v61_syncbn_fast_8xb16-300e_coco_20220917_031007-096ef0eb.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_l-v61_syncbn_fast_8xb16-300e_coco/yolov5_l-v61_syncbn_fast_8xb16-300e_coco_20220917_031007.log.json) | +| YOLOv5-l | P5 | 640 | Yes | Yes | Yes | 8.1 | 49.3 (+0.5) | | [config](./mask_refine/yolov5_l_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/mask_refine/yolov5_l_mask-refine-v61_syncbn_fast_8xb16-300e_coco/yolov5_l_mask-refine-v61_syncbn_fast_8xb16-300e_coco_20230305_154301-2c1d912a.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/mask_refine/yolov5_l_mask-refine-v61_syncbn_fast_8xb16-300e_coco/yolov5_l_mask-refine-v61_syncbn_fast_8xb16-300e_coco_20230305_154301.log.json) | +| YOLOv5u-l | P5 | 640 | Yes | Yes | Yes | | | | [config](./yolov5/yolov5u/yolov5_l_mask-refine_syncbn_fast_8xb16-300e_coco.py) | [model](<>) \| [log](<>) | +| YOLOv5-x | P5 | 640 | No | Yes | Yes | 12.2 | 50.2 | | [config](./yolov5_x-v61_syncbn_fast_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_x-v61_syncbn_fast_8xb16-300e_coco/yolov5_x-v61_syncbn_fast_8xb16-300e_coco_20230305_152943-00776a4b.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_x-v61_syncbn_fast_8xb16-300e_coco/yolov5_x-v61_syncbn_fast_8xb16-300e_coco_20230305_152943.log.json) | +| YOLOv5-x | P5 | 640 | Yes | Yes | Yes | 12.2 | 50.9 (+0.7) | | [config](./mask_refine/yolov5_x_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/mask_refine/yolov5_x_mask-refine-v61_syncbn_fast_8xb16-300e_coco/yolov5_x_mask-refine-v61_syncbn_fast_8xb16-300e_coco_20230305_154321-07edeb62.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/mask_refine/yolov5_x_mask-refine-v61_syncbn_fast_8xb16-300e_coco/yolov5_x_mask-refine-v61_syncbn_fast_8xb16-300e_coco_20230305_154321.log.json) | +| YOLOv5u-x | P5 | 640 | Yes | Yes | Yes | | | | [config](./yolov5/yolov5u/yolov5_x_mask-refine_syncbn_fast_8xb16-300e_coco.py) | [model](<>) \| [log](<>) | +| YOLOv5-n | P6 | 1280 | No | Yes | Yes | 5.8 | 35.9 | | [config](./yolov5_n-p6-v62_syncbn_fast_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_n-p6-v62_syncbn_fast_8xb16-300e_coco/yolov5_n-p6-v62_syncbn_fast_8xb16-300e_coco_20221027_224705-d493c5f3.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_n-p6-v62_syncbn_fast_8xb16-300e_coco/yolov5_n-p6-v62_syncbn_fast_8xb16-300e_coco_20221027_224705.log.json) | +| YOLOv5-s | P6 | 1280 | No | Yes | Yes | 10.5 | 44.4 | | 
[config](./yolov5_s-p6-v62_syncbn_fast_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-p6-v62_syncbn_fast_8xb16-300e_coco/yolov5_s-p6-v62_syncbn_fast_8xb16-300e_coco_20221027_215044-58865c19.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-p6-v62_syncbn_fast_8xb16-300e_coco/yolov5_s-p6-v62_syncbn_fast_8xb16-300e_coco_20221027_215044.log.json) | +| YOLOv5-m | P6 | 1280 | No | Yes | Yes | 19.1 | 51.3 | | [config](./yolov5_m-p6-v62_syncbn_fast_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_m-p6-v62_syncbn_fast_8xb16-300e_coco/yolov5_m-p6-v62_syncbn_fast_8xb16-300e_coco_20221027_230453-49564d58.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_m-p6-v62_syncbn_fast_8xb16-300e_coco/yolov5_m-p6-v62_syncbn_fast_8xb16-300e_coco_20221027_230453.log.json) | +| YOLOv5-l | P6 | 1280 | No | Yes | Yes | 30.5 | 53.7 | | [config](./yolov5_l-p6-v62_syncbn_fast_8xb16-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_l-p6-v62_syncbn_fast_8xb16-300e_coco/yolov5_l-p6-v62_syncbn_fast_8xb16-300e_coco_20221027_234308-7a2ba6bf.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_l-p6-v62_syncbn_fast_8xb16-300e_coco/yolov5_l-p6-v62_syncbn_fast_8xb16-300e_coco_20221027_234308.log.json) | + +**Note**: + +1. `fast` means that `YOLOv5DetDataPreprocessor` and `yolov5_collate` are used for data preprocessing, which is faster for training, but less flexible for multitasking. Recommended to use fast version config if you only care about object detection. +2. `detect` means that the network input is fixed to `640x640` and the post-processing thresholds is modified. +3. `SyncBN` means use SyncBN, `AMP` indicates training with mixed precision. +4. We use 8x A100 for training, and the single-GPU batch size is 16. This is different from the official code. +5. The performance is unstable and may fluctuate by about 0.4 mAP and the highest performance weight in `COCO` training in `YOLOv5` may not be the last epoch. +6. `TTA` means that Test Time Augmentation. It's perform 3 multi-scaling transformations on the image, followed by 2 flipping transformations (flipping and not flipping). You only need to specify `--tta` when testing to enable. see [TTA](https://github.com/open-mmlab/mmyolo/blob/dev/docs/en/common_usage/tta.md) for details. +7. The performance of `Mask Refine` training is for the weight performance officially released by YOLOv5. `Mask Refine` means refining bbox by mask while loading annotations and transforming after `YOLOv5RandomAffine`, `Copy Paste` means using `YOLOv5CopyPaste`. +8. `YOLOv5u` models use the same loss functions and split Detect head as `YOLOv8` models for improved performance, but only requires 300 epochs. 
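+
+As a usage sketch for note 6, TTA is enabled from the command line with the standard `tools/test.py` script; the checkpoint path below is a placeholder:
+
+```bash
+python tools/test.py configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py \
+    path/to/yolov5_s_checkpoint.pth --tta
+```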
+ +### COCO Instance segmentation + +| Backbone | Arch | size | SyncBN | AMP | Mem (GB) | Box AP | Mask AP | Config | Download | +| :-------------------: | :--: | :--: | :----: | :-: | :------: | :----: | :-----: | :--------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| YOLOv5-n | P5 | 640 | Yes | Yes | 3.3 | 27.9 | 23.7 | [config](./ins_seg/yolov5_ins_n-v61_syncbn_fast_8xb16-300e_coco_instance.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_n-v61_syncbn_fast_8xb16-300e_coco_instance/yolov5_ins_n-v61_syncbn_fast_8xb16-300e_coco_instance_20230424_104807-84cc9240.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_n-v61_syncbn_fast_8xb16-300e_coco_instance/yolov5_ins_n-v61_syncbn_fast_8xb16-300e_coco_instance_20230424_104807.log.json) | +| YOLOv5-s | P5 | 640 | Yes | Yes | 4.8 | 38.1 | 32.0 | [config](./ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance_20230426_012542-3e570436.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance_20230426_012542.log.json) | +| YOLOv5-s(non-overlap) | P5 | 640 | Yes | Yes | 4.8 | 38.0 | 32.1 | [config](./ins_seg/yolov5_ins_s-v61_syncbn_fast_non_overlap_8xb16-300e_coco_instance.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_non_overlap_8xb16-300e_coco_instance/yolov5_ins_s-v61_syncbn_fast_non_overlap_8xb16-300e_coco_instance_20230424_104642-6780d34e.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_non_overlap_8xb16-300e_coco_instance/yolov5_ins_s-v61_syncbn_fast_non_overlap_8xb16-300e_coco_instance_20230424_104642.log.json) | +| YOLOv5-m | P5 | 640 | Yes | Yes | 7.3 | 45.1 | 37.3 | [config](./ins_seg/yolov5_ins_m-v61_syncbn_fast_8xb16-300e_coco_instance.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_m-v61_syncbn_fast_8xb16-300e_coco_instance/yolov5_ins_m-v61_syncbn_fast_8xb16-300e_coco_instance_20230424_111529-ef5ba1a9.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_m-v61_syncbn_fast_8xb16-300e_coco_instance/yolov5_ins_m-v61_syncbn_fast_8xb16-300e_coco_instance_20230424_111529.log.json) | +| YOLOv5-l | P5 | 640 | Yes | Yes | 10.7 | 48.8 | 39.9 | [config](./ins_seg/yolov5_ins_l-v61_syncbn_fast_8xb16-300e_coco_instance.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_l-v61_syncbn_fast_8xb16-300e_coco_instance/yolov5_ins_l-v61_syncbn_fast_8xb16-300e_coco_instance_20230508_104049-daa09f70.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_l-v61_syncbn_fast_8xb16-300e_coco_instance/yolov5_ins_l-v61_syncbn_fast_8xb16-300e_coco_instance_20230508_104049.log.json) | +| YOLOv5-x | P5 | 640 | Yes 
| Yes | 15.0 | 50.6 | 41.4 | [config](./ins_seg/yolov5_ins_x-v61_syncbn_fast_8xb16-300e_coco_instance.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_x-v61_syncbn_fast_8xb16-300e_coco_instance/yolov5_ins_x-v61_syncbn_fast_8xb16-300e_coco_instance_20230508_103925-a260c798.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_x-v61_syncbn_fast_8xb16-300e_coco_instance/yolov5_ins_x-v61_syncbn_fast_8xb16-300e_coco_instance_20230508_103925.log.json) | + +**Note**: + +1. `Non-overlap` refers to the instance-level masks being stored in the format (num_instances, h, w) instead of (h, w). Storing masks in overlap format consumes less memory and GPU memory. +2. For the M model, the `affine_scale` parameter should be 0.9, but due to some reason, we set it to 0.5 and found that the mAP did not change. Therefore, the released M model has an `affine_scale` parameter of 0.5, which is inconsistent with the value of 0.9 in the configuration. + +### VOC + +| Backbone | size | Batchsize | AMP | Mem (GB) | box AP(COCO metric) | Config | Download | +| :------: | :--: | :-------: | :-: | :------: | :-----------------: | :-------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| YOLOv5-n | 512 | 64 | Yes | 3.5 | 51.2 | [config](./yolov5/voc/yolov5_n-v61_fast_1xb64-50e_voc.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_n-v61_fast_1xb64-50e_voc/yolov5_n-v61_fast_1xb64-50e_voc_20221017_234254-f1493430.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_n-v61_fast_1xb64-50e_voc/yolov5_n-v61_fast_1xb64-50e_voc_20221017_234254.log.json) | +| YOLOv5-s | 512 | 64 | Yes | 6.5 | 62.7 | [config](./yolov5/voc/yolov5_s-v61_fast_1xb64-50e_voc.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_fast_1xb64-50e_voc/yolov5_s-v61_fast_1xb64-50e_voc_20221017_234156-0009b33e.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_fast_1xb64-50e_voc/yolov5_s-v61_fast_1xb64-50e_voc_20221017_234156.log.json) | +| YOLOv5-m | 512 | 64 | Yes | 12.0 | 70.1 | [config](./yolov5/voc/yolov5_m-v61_fast_1xb64-50e_voc.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_m-v61_fast_1xb64-50e_voc/yolov5_m-v61_fast_1xb64-50e_voc_20221017_114138-815c143a.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_m-v61_fast_1xb64-50e_voc/yolov5_m-v61_fast_1xb64-50e_voc_20221017_114138.log.json) | +| YOLOv5-l | 512 | 32 | Yes | 10.0 | 73.1 | [config](./yolov5/voc/yolov5_l-v61_fast_1xb32-50e_voc.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_l-v61_fast_1xb32-50e_voc/yolov5_l-v61_fast_1xb32-50e_voc_20221017_045500-edc7e0d8.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_l-v61_fast_1xb32-50e_voc/yolov5_l-v61_fast_1xb32-50e_voc_20221017_045500.log.json) | + +**Note**: + +1. Training on VOC dataset need pretrained model which trained on COCO. +2. The performance is unstable and may fluctuate by about 0.4 mAP. +3. Official YOLOv5 use COCO metric, while training VOC dataset. +4. We converted the VOC test dataset to COCO format offline, while reproducing mAP result as shown above. 
We will support to use COCO metric while training VOC dataset in later version. +5. Hyperparameter reference from `https://wandb.ai/glenn-jocher/YOLOv5_VOC_official`. + +### CrowdHuman + +Since the `iscrowd` annotation of the COCO dataset is not equivalent to `ignore`, we use the CrowdHuman dataset to verify that the YOLOv5 ignore logic is correct. + +| Backbone | size | SyncBN | AMP | Mem (GB) | ignore_iof_thr | box AP50(CrowDHuman Metric) | MR | JI | Config | Download | +| :------: | :--: | :----: | :-: | :------: | :------------: | :-------------------------: | :--: | :---: | :------------------------------------------------------------------------: | :------: | +| YOLOv5-s | 640 | Yes | Yes | 2.6 | -1 | 85.79 | 48.7 | 75.33 | [config](./yolov5/crowdhuman/yolov5_s-v61_fast_8xb16-300e_crowdhuman.py) | | +| YOLOv5-s | 640 | Yes | Yes | 2.6 | 0.5 | 86.17 | 48.8 | 75.87 | [config](./yolov5/crowdhuman/yolov5_s-v61_8xb16-300e_ignore_crowdhuman.py) | | + +**Note**: + +1. `ignore_iof_thr` is -1 indicating that the ignore tag is not considered. We adjusted with `ignore_iof_thr` thresholds of 0.5, 0.8, 0.9, and the results show that 0.5 has the best performance. +2. The above table shows the performance of the model with the best performance on the validation set. The best performing models are around 160+ epoch which means that there is no need to train so many epochs. +3. This is a very simple implementation that simply replaces COCO's anchor with the `tools/analysis_tools/optimize_anchors.py` script. We'll adjust other parameters later to improve performance. + +## Citation + +```latex +@software{glenn_jocher_2022_7002879, + author = {Glenn Jocher and + Ayush Chaurasia and + Alex Stoken and + Jirka Borovec and + NanoCode012 and + Yonghye Kwon and + TaoXie and + Kalen Michael and + Jiacong Fang and + imyhxy and + Lorna and + Colin Wong and + 曾逸夫(Zeng Yifu) and + Abhiram V and + Diego Montes and + Zhiqiang Wang and + Cristi Fati and + Jebastin Nadar and + Laughing and + UnglvKitDe and + tkianai and + yxNONG and + Piotr Skalski and + Adam Hogan and + Max Strobel and + Mrinal Jain and + Lorenzo Mammana and + xylieong}, + title = {{ultralytics/yolov5: v6.2 - YOLOv5 Classification + Models, Apple M1, Reproducibility, ClearML and + Deci.ai integrations}}, + month = aug, + year = 2022, + publisher = {Zenodo}, + version = {v6.2}, + doi = {10.5281/zenodo.7002879}, + url = {https://doi.org/10.5281/zenodo.7002879} +} +``` diff --git a/third_party/mmyolo/configs/yolov5/crowdhuman/yolov5_s-v61_8xb16-300e_ignore_crowdhuman.py b/third_party/mmyolo/configs/yolov5/crowdhuman/yolov5_s-v61_8xb16-300e_ignore_crowdhuman.py new file mode 100644 index 0000000000000000000000000000000000000000..85b371929acd68bfd06cc257d20978c3fcc36db7 --- /dev/null +++ b/third_party/mmyolo/configs/yolov5/crowdhuman/yolov5_s-v61_8xb16-300e_ignore_crowdhuman.py @@ -0,0 +1,63 @@ +_base_ = 'yolov5_s-v61_fast_8xb16-300e_crowdhuman.py' + +model = dict( + data_preprocessor=dict( + _delete_=True, + type='mmdet.DetDataPreprocessor', + mean=[0., 0., 0.], + std=[255., 255., 255.], + bgr_to_rgb=True), + bbox_head=dict(ignore_iof_thr=0.5)) + +img_scale = _base_.img_scale + +albu_train_transforms = [ + dict(type='Blur', p=0.01), + dict(type='MedianBlur', p=0.01), + dict(type='ToGray', p=0.01), + dict(type='CLAHE', p=0.01) +] + +pre_transform = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + # only change this + dict(type='mmdet.LoadAnnotations', with_bbox=True) +] + +train_pipeline = [ + *pre_transform, + dict( + 
type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(0.5, 1.5), + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)), + dict( + type='mmdet.Albu', + transforms=albu_train_transforms, + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes' + }), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_dataloader = dict( + collate_fn=dict(type='pseudo_collate'), + dataset=dict(pipeline=train_pipeline)) diff --git a/third_party/mmyolo/configs/yolov5/crowdhuman/yolov5_s-v61_fast_8xb16-300e_crowdhuman.py b/third_party/mmyolo/configs/yolov5/crowdhuman/yolov5_s-v61_fast_8xb16-300e_crowdhuman.py new file mode 100644 index 0000000000000000000000000000000000000000..a61859fa0f2c0ea8a08ffd7783adc4ccac8540dd --- /dev/null +++ b/third_party/mmyolo/configs/yolov5/crowdhuman/yolov5_s-v61_fast_8xb16-300e_crowdhuman.py @@ -0,0 +1,47 @@ +_base_ = '../yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py' + +# Use the model trained on the COCO as the pretrained model +load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth' # noqa + +# dataset settings +data_root = 'data/CrowdHuman/' +dataset_type = 'YOLOv5CrowdHumanDataset' + +# parameters that often need to be modified +num_classes = 1 + +anchors = [ + [(6, 14), (12, 28), (19, 48)], # P3/8 + [(29, 79), (46, 124), (142, 54)], # P4/16 + [(73, 198), (124, 330), (255, 504)] # P5/32 +] + +model = dict( + bbox_head=dict( + head_module=dict(num_classes=num_classes), + prior_generator=dict(base_sizes=anchors))) + +train_dataloader = dict( + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotation_train.odgt', + data_prefix=dict(img='Images/'))) + +val_dataloader = dict( + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotation_val.odgt', + data_prefix=dict(img='Images/'), + # CrowdHumanMetric does not support out-of-order output images + # for the time being. batch_shapes_cfg does not support. + batch_shapes_cfg=None)) +test_dataloader = val_dataloader + +val_evaluator = dict( + _delete_=True, + type='mmdet.CrowdHumanMetric', + ann_file=data_root + 'annotation_val.odgt', + metric=['AP', 'MR', 'JI']) +test_evaluator = val_evaluator diff --git a/third_party/mmyolo/configs/yolov5/ins_seg/yolov5_ins_l-v61_syncbn_fast_8xb16-300e_coco_instance.py b/third_party/mmyolo/configs/yolov5/ins_seg/yolov5_ins_l-v61_syncbn_fast_8xb16-300e_coco_instance.py new file mode 100644 index 0000000000000000000000000000000000000000..6b27c7647bd233172e11df8e5a736946d70acfe0 --- /dev/null +++ b/third_party/mmyolo/configs/yolov5/ins_seg/yolov5_ins_l-v61_syncbn_fast_8xb16-300e_coco_instance.py @@ -0,0 +1,81 @@ +_base_ = './yolov5_ins_m-v61_syncbn_fast_8xb16-300e_coco_instance.py' # noqa + +# This config use refining bbox and `YOLOv5CopyPaste`. 
+# Refining bbox means refining bbox by mask while loading annotations and +# transforming after `YOLOv5RandomAffine` +# ========================modified parameters====================== +deepen_factor = 1.0 +widen_factor = 1.0 + +mixup_prob = 0.1 +copypaste_prob = 0.1 + +# =======================Unmodified in most cases================== +img_scale = _base_.img_scale + +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) + +pre_transform = _base_.pre_transform +albu_train_transforms = _base_.albu_train_transforms +mosaic_affine_pipeline = [ + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict(type='YOLOv5CopyPaste', prob=copypaste_prob), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=_base_.min_area_ratio, + max_aspect_ratio=_base_.max_aspect_ratio, + use_mask_refine=_base_.use_mask2refine), +] + +# enable mixup +train_pipeline = [ + *pre_transform, + *mosaic_affine_pipeline, + dict( + type='YOLOv5MixUp', + prob=mixup_prob, + pre_transform=[*pre_transform, *mosaic_affine_pipeline]), + # TODO: support mask transform in albu + # Geometric transformations are not supported in albu now. + dict( + type='mmdet.Albu', + transforms=albu_train_transforms, + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes' + }), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='Polygon2Mask', + downsample_ratio=_base_.downsample_ratio, + mask_overlap=_base_.mask_overlap), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) diff --git a/third_party/mmyolo/configs/yolov5/ins_seg/yolov5_ins_m-v61_syncbn_fast_8xb16-300e_coco_instance.py b/third_party/mmyolo/configs/yolov5/ins_seg/yolov5_ins_m-v61_syncbn_fast_8xb16-300e_coco_instance.py new file mode 100644 index 0000000000000000000000000000000000000000..831e815cb2f982e92c9995bd6e012bcce95950f6 --- /dev/null +++ b/third_party/mmyolo/configs/yolov5/ins_seg/yolov5_ins_m-v61_syncbn_fast_8xb16-300e_coco_instance.py @@ -0,0 +1,89 @@ +_base_ = './yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance.py' # noqa + +# ========================modified parameters====================== +deepen_factor = 0.67 +widen_factor = 0.75 +lr_factor = 0.1 +loss_cls_weight = 0.3 +loss_obj_weight = 0.7 + +affine_scale = 0.9 +mixup_prob = 0.1 + +# =======================Unmodified in most cases================== +num_classes = _base_.num_classes +num_det_layers = _base_.num_det_layers +img_scale = _base_.img_scale + +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict( + head_module=dict(widen_factor=widen_factor), + loss_cls=dict(loss_weight=loss_cls_weight * + (num_classes / 80 * 3 / num_det_layers)), + loss_obj=dict(loss_weight=loss_obj_weight * + ((img_scale[0] / 640)**2 * 3 / num_det_layers)))) + +pre_transform = 
_base_.pre_transform +albu_train_transforms = _base_.albu_train_transforms + +mosaic_affine_pipeline = [ + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=_base_.min_area_ratio, + max_aspect_ratio=_base_.max_aspect_ratio, + use_mask_refine=_base_.use_mask2refine), +] + +# enable mixup +train_pipeline = [ + *pre_transform, + *mosaic_affine_pipeline, + dict( + type='YOLOv5MixUp', + prob=mixup_prob, + pre_transform=[*pre_transform, *mosaic_affine_pipeline]), + # TODO: support mask transform in albu + # Geometric transformations are not supported in albu now. + dict( + type='mmdet.Albu', + transforms=albu_train_transforms, + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes' + }), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='Polygon2Mask', + downsample_ratio=_base_.downsample_ratio, + mask_overlap=_base_.mask_overlap), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +default_hooks = dict(param_scheduler=dict(lr_factor=lr_factor)) diff --git a/third_party/mmyolo/configs/yolov5/ins_seg/yolov5_ins_n-v61_syncbn_fast_8xb16-300e_coco_instance.py b/third_party/mmyolo/configs/yolov5/ins_seg/yolov5_ins_n-v61_syncbn_fast_8xb16-300e_coco_instance.py new file mode 100644 index 0000000000000000000000000000000000000000..e06130bd317dba004a7fa1d5de0750f5b1cd21cf --- /dev/null +++ b/third_party/mmyolo/configs/yolov5/ins_seg/yolov5_ins_n-v61_syncbn_fast_8xb16-300e_coco_instance.py @@ -0,0 +1,15 @@ +_base_ = './yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance.py' # noqa + +deepen_factor = 0.33 +widen_factor = 0.25 + +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/third_party/mmyolo/configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py b/third_party/mmyolo/configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py new file mode 100644 index 0000000000000000000000000000000000000000..82e2ae6d059df466940fc3df84ce53102ffec081 --- /dev/null +++ b/third_party/mmyolo/configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py @@ -0,0 +1,42 @@ +_base_ = './yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance.py' # noqa + +data_root = 'data/balloon/' +# Path of train annotation file +train_ann_file = 'train.json' +train_data_prefix = 'train/' # Prefix of train image path +# Path of val annotation file +val_ann_file = 'val.json' +val_data_prefix = 'val/' # Prefix of val image path +metainfo = { + 'classes': ('balloon', ), + 'palette': [ + (220, 20, 60), + ] +} +num_classes = 1 + +train_batch_size_per_gpu = 4 +train_num_workers = 2 +log_interval = 1 +##################### +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + data_root=data_root, + metainfo=metainfo, + 
data_prefix=dict(img=train_data_prefix), + ann_file=train_ann_file)) +val_dataloader = dict( + dataset=dict( + data_root=data_root, + metainfo=metainfo, + data_prefix=dict(img=val_data_prefix), + ann_file=val_ann_file)) +test_dataloader = val_dataloader +val_evaluator = dict(ann_file=data_root + val_ann_file) +test_evaluator = val_evaluator +default_hooks = dict(logger=dict(interval=log_interval)) +##################### + +model = dict(bbox_head=dict(head_module=dict(num_classes=num_classes))) diff --git a/third_party/mmyolo/configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance.py b/third_party/mmyolo/configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance.py new file mode 100644 index 0000000000000000000000000000000000000000..0ab980ca7dfdd9c2feaba660f8745c92b49e6bbc --- /dev/null +++ b/third_party/mmyolo/configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance.py @@ -0,0 +1,126 @@ +_base_ = '../yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py' # noqa + +# ========================modified parameters====================== +# YOLOv5RandomAffine +use_mask2refine = True +max_aspect_ratio = 100 +min_area_ratio = 0.01 +# Polygon2Mask +downsample_ratio = 4 +mask_overlap = True +# LetterResize +# half_pad_param: if set to True, left and right pad_param will +# be given by dividing padding_h by 2. If set to False, pad_param is +# in int format. We recommend setting this to False for object +# detection tasks, and True for instance segmentation tasks. +# Defaults to False. +half_pad_param = True + +# Testing takes a long time due to model_test_cfg. +# If you want to speed it up, you can increase score_thr +# or decrease nms_pre and max_per_img +model_test_cfg = dict( + multi_label=True, + nms_pre=30000, + min_bbox_size=0, + score_thr=0.001, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=300, + mask_thr_binary=0.5, + # fast_test: Whether to use the fast test method. When set + # to False, the implementation here is the same as the + # official one, with higher mAP. If set to True, the mask is first + # upsampled to the original image shape through PyTorch, and + # then mask_thr_binary is used to determine which pixels belong + # to the object. If set to False, mask_thr_binary is applied first + # to determine which pixels belong to the + # object, and then OpenCV upsamples the mask to the original + # image shape. Defaults to False. 
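+    # This config opts for the fast path; per the note above, this trades
+    # some mask AP for faster evaluation compared to fast_test=False.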
+ fast_test=True) + +# ===============================Unmodified in most cases==================== +model = dict( + type='YOLODetector', + bbox_head=dict( + type='YOLOv5InsHead', + head_module=dict( + type='YOLOv5InsHeadModule', mask_channels=32, proto_channels=256), + mask_overlap=mask_overlap, + loss_mask=dict( + type='mmdet.CrossEntropyLoss', use_sigmoid=True, reduction='none'), + loss_mask_weight=0.05), + test_cfg=model_test_cfg) + +pre_transform = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict( + type='LoadAnnotations', + with_bbox=True, + with_mask=True, + mask2bbox=use_mask2refine) +] + +train_pipeline = [ + *pre_transform, + dict( + type='Mosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=min_area_ratio, + max_aspect_ratio=max_aspect_ratio, + use_mask_refine=use_mask2refine), + # TODO: support mask transform in albu + # Geometric transformations are not supported in albu now. + dict( + type='mmdet.Albu', + transforms=_base_.albu_train_transforms, + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes' + }), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='Polygon2Mask', + downsample_ratio=downsample_ratio, + mask_overlap=mask_overlap), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='YOLOv5KeepRatioResize', scale=_base_.img_scale), + dict( + type='LetterResize', + scale=_base_.img_scale, + allow_scale_up=False, + half_pad_param=half_pad_param, + pad_val=dict(img=114)), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = val_dataloader + +val_evaluator = dict(metric=['bbox', 'segm']) +test_evaluator = val_evaluator diff --git a/third_party/mmyolo/configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_non_overlap_8xb16-300e_coco_instance.py b/third_party/mmyolo/configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_non_overlap_8xb16-300e_coco_instance.py new file mode 100644 index 0000000000000000000000000000000000000000..83b48cab69ade156f69864d11b37af597dd82da2 --- /dev/null +++ b/third_party/mmyolo/configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_non_overlap_8xb16-300e_coco_instance.py @@ -0,0 +1,49 @@ +_base_ = './yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance.py' # noqa + +# ========================modified parameters====================== +mask_overlap = False # Polygon2Mask + +# ===============================Unmodified in most cases==================== +model = dict(bbox_head=dict(mask_overlap=mask_overlap)) + +train_pipeline = [ + *_base_.pre_transform, + dict( + type='Mosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict( + type='YOLOv5RandomAffine', + 
max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=_base_.min_area_ratio, + max_aspect_ratio=_base_.max_aspect_ratio, + use_mask_refine=True), + dict( + type='mmdet.Albu', + transforms=_base_.albu_train_transforms, + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes', + }), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='Polygon2Mask', + downsample_ratio=_base_.downsample_ratio, + mask_overlap=mask_overlap), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) diff --git a/third_party/mmyolo/configs/yolov5/ins_seg/yolov5_ins_x-v61_syncbn_fast_8xb16-300e_coco_instance.py b/third_party/mmyolo/configs/yolov5/ins_seg/yolov5_ins_x-v61_syncbn_fast_8xb16-300e_coco_instance.py new file mode 100644 index 0000000000000000000000000000000000000000..a18170ccc30c541f583ca3f4eaf829b853ed2816 --- /dev/null +++ b/third_party/mmyolo/configs/yolov5/ins_seg/yolov5_ins_x-v61_syncbn_fast_8xb16-300e_coco_instance.py @@ -0,0 +1,15 @@ +_base_ = './yolov5_ins_l-v61_syncbn_fast_8xb16-300e_coco_instance.py' # noqa + +deepen_factor = 1.33 +widen_factor = 1.25 + +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/third_party/mmyolo/configs/yolov5/mask_refine/yolov5_l_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py b/third_party/mmyolo/configs/yolov5/mask_refine/yolov5_l_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..206eec3c41542958ae105764fbf3991935b30bc8 --- /dev/null +++ b/third_party/mmyolo/configs/yolov5/mask_refine/yolov5_l_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,77 @@ +_base_ = './yolov5_m_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py' + +# This config use refining bbox and `YOLOv5CopyPaste`. 
+# Refining bbox means refining bbox by mask while loading annotations and +# transforming after `YOLOv5RandomAffine` + +# ========================modified parameters====================== +deepen_factor = 1.0 +widen_factor = 1.0 + +mixup_prob = 0.1 +copypaste_prob = 0.1 + +# =======================Unmodified in most cases================== +img_scale = _base_.img_scale + +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) + +pre_transform = _base_.pre_transform +albu_train_transforms = _base_.albu_train_transforms + +mosaic_affine_pipeline = [ + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict(type='YOLOv5CopyPaste', prob=copypaste_prob), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=_base_.min_area_ratio, + use_mask_refine=_base_.use_mask2refine), + dict(type='RemoveDataElement', keys=['gt_masks']) +] + +# enable mixup and copypaste +train_pipeline = [ + *pre_transform, *mosaic_affine_pipeline, + dict( + type='YOLOv5MixUp', + prob=mixup_prob, + pre_transform=[*pre_transform, *mosaic_affine_pipeline]), + dict( + type='mmdet.Albu', + transforms=albu_train_transforms, + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes' + }), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) diff --git a/third_party/mmyolo/configs/yolov5/mask_refine/yolov5_m_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py b/third_party/mmyolo/configs/yolov5/mask_refine/yolov5_m_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..4af27a917e6113f33ff72781eeee911381bbed53 --- /dev/null +++ b/third_party/mmyolo/configs/yolov5/mask_refine/yolov5_m_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,86 @@ +_base_ = './yolov5_s_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py' + +# This config will refine bbox by mask while loading annotations and +# transforming after `YOLOv5RandomAffine` + +# ========================modified parameters====================== +deepen_factor = 0.67 +widen_factor = 0.75 +lr_factor = 0.1 +loss_cls_weight = 0.3 +loss_obj_weight = 0.7 + +affine_scale = 0.9 +mixup_prob = 0.1 + +# =======================Unmodified in most cases================== +num_classes = _base_.num_classes +num_det_layers = _base_.num_det_layers +img_scale = _base_.img_scale + +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict( + head_module=dict(widen_factor=widen_factor), + loss_cls=dict(loss_weight=loss_cls_weight * + (num_classes / 80 * 3 / num_det_layers)), + loss_obj=dict(loss_weight=loss_obj_weight * + ((img_scale[0] / 640)**2 * 3 / num_det_layers)))) + +pre_transform = _base_.pre_transform 
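+# The base pre_transform loads instance masks (mask2bbox=True), so boxes are
+# refined by masks both at load time and after YOLOv5RandomAffine; gt_masks
+# are then dropped by RemoveDataElement since only the refined boxes are kept.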
+albu_train_transforms = _base_.albu_train_transforms + +mosaic_affine_pipeline = [ + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=_base_.min_area_ratio, + use_mask_refine=_base_.use_mask2refine), + dict(type='RemoveDataElement', keys=['gt_masks']) +] + +# enable mixup +train_pipeline = [ + *pre_transform, *mosaic_affine_pipeline, + dict( + type='YOLOv5MixUp', + prob=mixup_prob, + pre_transform=[*pre_transform, *mosaic_affine_pipeline]), + dict( + type='mmdet.Albu', + transforms=albu_train_transforms, + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes' + }), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +default_hooks = dict(param_scheduler=dict(lr_factor=lr_factor)) diff --git a/third_party/mmyolo/configs/yolov5/mask_refine/yolov5_n_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py b/third_party/mmyolo/configs/yolov5/mask_refine/yolov5_n_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..3fe8dc32ceaf687940596f6b8094d79857921deb --- /dev/null +++ b/third_party/mmyolo/configs/yolov5/mask_refine/yolov5_n_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,20 @@ +_base_ = './yolov5_s_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py' + +# This config will refine bbox by mask while loading annotations and +# transforming after `YOLOv5RandomAffine` + +# ========================modified parameters====================== +deepen_factor = 0.33 +widen_factor = 0.25 + +# ===============================Unmodified in most cases==================== +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/third_party/mmyolo/configs/yolov5/mask_refine/yolov5_s_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py b/third_party/mmyolo/configs/yolov5/mask_refine/yolov5_s_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..74febbb7764435d7ab4d9a8014fb6977a269da68 --- /dev/null +++ b/third_party/mmyolo/configs/yolov5/mask_refine/yolov5_s_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,62 @@ +_base_ = '../yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py' + +# This config will refine bbox by mask while loading annotations and +# transforming after `YOLOv5RandomAffine` + +# ========================modified parameters====================== +use_mask2refine = True +min_area_ratio = 0.01 # YOLOv5RandomAffine + +# ===============================Unmodified in most cases==================== +pre_transform = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict( + type='LoadAnnotations', + with_bbox=True, + with_mask=True, + mask2bbox=use_mask2refine) +] + +last_transform = [ + # Delete gt_masks to avoid more 
computation + dict(type='RemoveDataElement', keys=['gt_masks']), + dict( + type='mmdet.Albu', + transforms=_base_.albu_train_transforms, + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes' + }), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_pipeline = [ + *pre_transform, + dict( + type='Mosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + # img_scale is (width, height) + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=min_area_ratio, + use_mask_refine=use_mask2refine), + *last_transform +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) diff --git a/third_party/mmyolo/configs/yolov5/mask_refine/yolov5_x_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py b/third_party/mmyolo/configs/yolov5/mask_refine/yolov5_x_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..fb76f1057872d81f52ac9369a689545194a61bb7 --- /dev/null +++ b/third_party/mmyolo/configs/yolov5/mask_refine/yolov5_x_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,21 @@ +_base_ = './yolov5_l_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py' + +# This config use refining bbox and `YOLOv5CopyPaste`. +# Refining bbox means refining bbox by mask while loading annotations and +# transforming after `YOLOv5RandomAffine` + +# ========================modified parameters====================== +deepen_factor = 1.33 +widen_factor = 1.25 + +# ===============================Unmodified in most cases==================== +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/third_party/mmyolo/configs/yolov5/metafile.yml b/third_party/mmyolo/configs/yolov5/metafile.yml new file mode 100644 index 0000000000000000000000000000000000000000..bfe5add4fa0f268a8a6566c7ddc2e9b46a92ffe7 --- /dev/null +++ b/third_party/mmyolo/configs/yolov5/metafile.yml @@ -0,0 +1,346 @@ +Collections: + - Name: YOLOv5 + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Nesterov + - Weight Decay + - AMP + - Synchronize BN + Training Resources: 8x A100 GPUs + Architecture: + - CSPDarkNet + - PAFPN + README: configs/yolov5/README.md + Code: + URL: https://github.com/open-mmlab/mmyolo/blob/v0.1.0/mmyolo/models/detectors/yolo_detector.py#L12 + Version: v0.1.0 + - Name: YOLOv5_VOC + Metadata: + Training Data: VOC + Training Techniques: + - SGD with Nesterov + - Weight Decay + - AMP + Training Resources: 1x A100 GPU + Architecture: + - CSPDarkNet + - PAFPN + README: configs/yolov5/README.md + Code: + URL: https://github.com/open-mmlab/mmyolo/blob/v0.1.0/mmyolo/models/detectors/yolo_detector.py#L12 + Version: v0.1.0 + +Models: + - Name: yolov5_n-v61_syncbn_fast_8xb16-300e_coco + In Collection: YOLOv5 + Config: configs/yolov5/yolov5_n-v61_syncbn_fast_8xb16-300e_coco.py + Metadata: + Training Memory (GB): 1.5 + Epochs: 300 + Results: + - Task: Object 
Detection + Dataset: COCO + Metrics: + box AP: 28.0 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_n-v61_syncbn_fast_8xb16-300e_coco/yolov5_n-v61_syncbn_fast_8xb16-300e_coco_20220919_090739-b804c1ad.pth + - Name: yolov5_s-v61_syncbn_fast_8xb16-300e_coco + In Collection: YOLOv5 + Config: configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py + Metadata: + Training Memory (GB): 2.7 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.7 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth + - Name: yolov5_m-v61_syncbn_fast_8xb16-300e_coco + In Collection: YOLOv5 + Config: configs/yolov5/yolov5_m-v61_syncbn_fast_8xb16-300e_coco.py + Metadata: + Training Memory (GB): 5.0 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 45.3 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_m-v61_syncbn_fast_8xb16-300e_coco/yolov5_m-v61_syncbn_fast_8xb16-300e_coco_20220917_204944-516a710f.pth + - Name: yolov5_l-v61_syncbn_fast_8xb16-300e_coco + In Collection: YOLOv5 + Config: configs/yolov5/yolov5_l-v61_syncbn_fast_8xb16-300e_coco.py + Metadata: + Training Memory (GB): 8.1 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 48.8 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_l-v61_syncbn_fast_8xb16-300e_coco/yolov5_l-v61_syncbn_fast_8xb16-300e_coco_20220917_031007-096ef0eb.pth + - Name: yolov5_x-v61_syncbn_fast_8xb16-300e_coco + In Collection: YOLOv5 + Config: configs/yolov5/yolov5_x-v61_syncbn_fast_8xb16-300e_coco.py + Metadata: + Training Memory (GB): 12.2 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 50.2 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_x-v61_syncbn_fast_8xb16-300e_coco/yolov5_x-v61_syncbn_fast_8xb16-300e_coco_20230305_152943-00776a4b.pth + - Name: yolov5_n-p6-v62_syncbn_fast_8xb16-300e_coco + In Collection: YOLOv5 + Config: configs/yolov5/yolov5_n-p6-v62_syncbn_fast_8xb16-300e_coco.py + Metadata: + Training Memory (GB): 5.8 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 35.9 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_n-p6-v62_syncbn_fast_8xb16-300e_coco/yolov5_n-p6-v62_syncbn_fast_8xb16-300e_coco_20221027_224705-d493c5f3.pth + - Name: yolov5_s-p6-v62_syncbn_fast_8xb16-300e_coco + In Collection: YOLOv5 + Config: configs/yolov5/yolov5_s-p6-v62_syncbn_fast_8xb16-300e_coco.py + Metadata: + Training Memory (GB): 10.5 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.4 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-p6-v62_syncbn_fast_8xb16-300e_coco/yolov5_s-p6-v62_syncbn_fast_8xb16-300e_coco_20221027_215044-58865c19.pth + - Name: yolov5_m-p6-v62_syncbn_fast_8xb16-300e_coco + In Collection: YOLOv5 + Config: configs/yolov5/yolov5_m-p6-v62_syncbn_fast_8xb16-300e_coco.py + Metadata: + Training Memory (GB): 19.1 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 51.3 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_m-p6-v62_syncbn_fast_8xb16-300e_coco/yolov5_m-p6-v62_syncbn_fast_8xb16-300e_coco_20221027_230453-49564d58.pth + - Name: yolov5_l-p6-v62_syncbn_fast_8xb16-300e_coco + In Collection: YOLOv5 + Config: configs/yolov5/yolov5_l-p6-v62_syncbn_fast_8xb16-300e_coco.py + 
Metadata: + Training Memory (GB): 30.5 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 53.7 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_l-p6-v62_syncbn_fast_8xb16-300e_coco/yolov5_l-p6-v62_syncbn_fast_8xb16-300e_coco_20221027_234308-7a2ba6bf.pth + - Name: yolov5_n-v61_fast_1xb64-50e_voc + In Collection: YOLOv5_VOC + Config: configs/yolov5/voc/yolov5_n-v61_fast_1xb64-50e_voc.py + Metadata: + Training Memory (GB): 3.5 + Epochs: 50 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 51.2 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_n-v61_fast_1xb64-50e_voc/yolov5_n-v61_fast_1xb64-50e_voc_20221017_234254-f1493430.pth + - Name: yolov5_s-v61_fast_1xb64-50e_voc + In Collection: YOLOv5_VOC + Config: configs/yolov5/voc/yolov5_s-v61_fast_1xb64-50e_voc.py + Metadata: + Training Memory (GB): 6.5 + Epochs: 50 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 62.7 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_fast_1xb64-50e_voc/yolov5_s-v61_fast_1xb64-50e_voc_20221017_234156-0009b33e.pth + - Name: yolov5_m-v61_fast_1xb64-50e_voc + In Collection: YOLOv5_VOC + Config: configs/yolov5/voc/yolov5_m-v61_fast_1xb64-50e_voc.py + Metadata: + Training Memory (GB): 12.0 + Epochs: 50 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 70.1 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_m-v61_fast_1xb64-50e_voc/yolov5_m-v61_fast_1xb64-50e_voc_20221017_114138-815c143a.pth + - Name: yolov5_l-v61_fast_1xb32-50e_voc + In Collection: YOLOv5_VOC + Config: configs/yolov5/voc/yolov5_l-v61_fast_1xb32-50e_voc.py + Metadata: + Training Memory (GB): 10.0 + Epochs: 50 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 73.1 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_l-v61_fast_1xb32-50e_voc/yolov5_l-v61_fast_1xb32-50e_voc_20221017_045500-edc7e0d8.pth + - Name: yolov5_n_mask-refine-v61_syncbn_fast_8xb16-300e_coco + In Collection: YOLOv5 + Config: configs/yolov5/mask_refine/yolov5_n_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py + Metadata: + Training Memory (GB): 1.5 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 28.0 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/mask_refine/yolov5_n_mask-refine-v61_syncbn_fast_8xb16-300e_coco/yolov5_n_mask-refine-v61_syncbn_fast_8xb16-300e_coco_20230305_152706-712fb1b2.pth + - Name: yolov5_s_mask-refine-v61_syncbn_fast_8xb16-300e_coco + In Collection: YOLOv5 + Config: configs/yolov5/mask_refine/yolov5_s_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py + Metadata: + Training Memory (GB): 2.7 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.0 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/mask_refine/yolov5_s_mask-refine-v61_syncbn_fast_8xb16-300e_coco/yolov5_s_mask-refine-v61_syncbn_fast_8xb16-300e_coco_20230304_033134-8e0cd271.pth + - Name: yolov5_m_mask-refine-v61_syncbn_fast_8xb16-300e_coco + In Collection: YOLOv5 + Config: configs/yolov5/mask_refine/yolov5_m_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py + Metadata: + Training Memory (GB): 5.0 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 45.3 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/mask_refine/yolov5_m_mask-refine-v61_syncbn_fast_8xb16-300e_coco/yolov5_m_mask-refine-v61_syncbn_fast_8xb16-300e_coco_20230305_153946-44e96155.pth + - 
Name: yolov5_l_mask-refine-v61_syncbn_fast_8xb16-300e_coco + In Collection: YOLOv5 + Config: configs/yolov5/mask_refine/yolov5_l_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py + Metadata: + Training Memory (GB): 8.1 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 49.3 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/mask_refine/yolov5_l_mask-refine-v61_syncbn_fast_8xb16-300e_coco/yolov5_l_mask-refine-v61_syncbn_fast_8xb16-300e_coco_20230305_154301-2c1d912a.pth + - Name: yolov5_x_mask-refine-v61_syncbn_fast_8xb16-300e_coco + In Collection: YOLOv5 + Config: configs/yolov5/mask_refine/yolov5_x_mask-refine-v61_syncbn_fast_8xb16-300e_coco.py + Metadata: + Training Memory (GB): 12.2 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 50.9 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/mask_refine/yolov5_x_mask-refine-v61_syncbn_fast_8xb16-300e_coco/yolov5_x_mask-refine-v61_syncbn_fast_8xb16-300e_coco_20230305_154321-07edeb62.pth + - Name: yolov5_ins_n-v61_syncbn_fast_8xb16-300e_coco_instance + In Collection: YOLOv5 + Config: configs/yolov5/ins_seg/yolov5_ins_n-v61_syncbn_fast_8xb16-300e_coco_instance.py + Metadata: + Training Memory (GB): 3.3 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 27.9 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 23.7 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_n-v61_syncbn_fast_8xb16-300e_coco_instance/yolov5_ins_n-v61_syncbn_fast_8xb16-300e_coco_instance_20230424_104807-84cc9240.pth + - Name: yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance + In Collection: YOLOv5 + Config: configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance.py + Metadata: + Training Memory (GB): 4.8 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.1 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 32.0 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance_20230426_012542-3e570436.pth + - Name: yolov5_ins_s-v61_syncbn_fast_non_overlap_8xb16-300e_coco_instance + In Collection: YOLOv5 + Config: configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_non_overlap_8xb16-300e_coco_instance.py + Metadata: + Training Memory (GB): 4.8 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 38.0 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 32.1 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_non_overlap_8xb16-300e_coco_instance/yolov5_ins_s-v61_syncbn_fast_non_overlap_8xb16-300e_coco_instance_20230424_104642-6780d34e.pth + - Name: yolov5_ins_m-v61_syncbn_fast_8xb16-300e_coco_instance + In Collection: YOLOv5 + Config: configs/yolov5/ins_seg/yolov5_ins_m-v61_syncbn_fast_8xb16-300e_coco_instance.py + Metadata: + Training Memory (GB): 7.3 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 45.1 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 37.3 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_m-v61_syncbn_fast_8xb16-300e_coco_instance/yolov5_ins_m-v61_syncbn_fast_8xb16-300e_coco_instance_20230424_111529-ef5ba1a9.pth + - Name: yolov5_ins_l-v61_syncbn_fast_8xb16-300e_coco_instance + In Collection: YOLOv5 + Config: 
configs/yolov5/ins_seg/yolov5_ins_l-v61_syncbn_fast_8xb16-300e_coco_instance.py + Metadata: + Training Memory (GB): 10.7 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 48.8 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 39.9 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_l-v61_syncbn_fast_8xb16-300e_coco_instance/yolov5_ins_l-v61_syncbn_fast_8xb16-300e_coco_instance_20230508_104049-daa09f70.pth + - Name: yolov5_ins_x-v61_syncbn_fast_8xb16-300e_coco_instance + In Collection: YOLOv5 + Config: configs/yolov5/ins_seg/yolov5_ins_x-v61_syncbn_fast_8xb16-300e_coco_instance.py + Metadata: + Training Memory (GB): 15.0 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 50.6 + - Task: Instance Segmentation + Dataset: COCO + Metrics: + mask AP: 41.4 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov5/ins_seg/yolov5_ins_x-v61_syncbn_fast_8xb16-300e_coco_instance/yolov5_ins_x-v61_syncbn_fast_8xb16-300e_coco_instance_20230508_103925-a260c798.pth diff --git a/third_party/mmyolo/configs/yolov5/voc/yolov5_l-v61_fast_1xb32-50e_voc.py b/third_party/mmyolo/configs/yolov5/voc/yolov5_l-v61_fast_1xb32-50e_voc.py new file mode 100644 index 0000000000000000000000000000000000000000..4b470973c46073748803bac2f736eca615e3cb00 --- /dev/null +++ b/third_party/mmyolo/configs/yolov5/voc/yolov5_l-v61_fast_1xb32-50e_voc.py @@ -0,0 +1,25 @@ +_base_ = './yolov5_s-v61_fast_1xb64-50e_voc.py' + +deepen_factor = 1.0 +widen_factor = 1.0 +train_batch_size_per_gpu = 32 +train_num_workers = 8 + +load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_l-v61_syncbn_fast_8xb16-300e_coco/yolov5_l-v61_syncbn_fast_8xb16-300e_coco_20220917_031007-096ef0eb.pth' # noqa + +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, num_workers=train_num_workers) + +optim_wrapper = dict( + optimizer=dict(batch_size_per_gpu=train_batch_size_per_gpu)) diff --git a/third_party/mmyolo/configs/yolov5/voc/yolov5_m-v61_fast_1xb64-50e_voc.py b/third_party/mmyolo/configs/yolov5/voc/yolov5_m-v61_fast_1xb64-50e_voc.py new file mode 100644 index 0000000000000000000000000000000000000000..2ed2127a19854fde1b6fa0c80f4d6fd2ba818f0a --- /dev/null +++ b/third_party/mmyolo/configs/yolov5/voc/yolov5_m-v61_fast_1xb64-50e_voc.py @@ -0,0 +1,17 @@ +_base_ = './yolov5_s-v61_fast_1xb64-50e_voc.py' + +deepen_factor = 0.67 +widen_factor = 0.75 + +load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_m-v61_syncbn_fast_8xb16-300e_coco/yolov5_m-v61_syncbn_fast_8xb16-300e_coco_20220917_204944-516a710f.pth' # noqa + +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/third_party/mmyolo/configs/yolov5/voc/yolov5_n-v61_fast_1xb64-50e_voc.py b/third_party/mmyolo/configs/yolov5/voc/yolov5_n-v61_fast_1xb64-50e_voc.py new file mode 100644 index 0000000000000000000000000000000000000000..041f6537d03a4f13402b1bb7e2665443793e4681 --- /dev/null +++ b/third_party/mmyolo/configs/yolov5/voc/yolov5_n-v61_fast_1xb64-50e_voc.py @@ -0,0 +1,17 @@ +_base_ = './yolov5_s-v61_fast_1xb64-50e_voc.py' + 
+deepen_factor = 0.33 +widen_factor = 0.25 + +load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_n-v61_syncbn_fast_8xb16-300e_coco/yolov5_n-v61_syncbn_fast_8xb16-300e_coco_20220919_090739-b804c1ad.pth' # noqa + +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/third_party/mmyolo/configs/yolov5/voc/yolov5_s-v61_fast_1xb64-50e_voc.py b/third_party/mmyolo/configs/yolov5/voc/yolov5_s-v61_fast_1xb64-50e_voc.py new file mode 100644 index 0000000000000000000000000000000000000000..f777fff9697dfbd315a0b8f762a2bf31a1118ca8 --- /dev/null +++ b/third_party/mmyolo/configs/yolov5/voc/yolov5_s-v61_fast_1xb64-50e_voc.py @@ -0,0 +1,270 @@ +_base_ = '../yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py' + +# dataset settings +data_root = 'data/VOCdevkit/' +dataset_type = 'YOLOv5VOCDataset' + +# parameters that often need to be modified +num_classes = 20 +img_scale = (512, 512) # width, height +max_epochs = 50 +train_batch_size_per_gpu = 64 +train_num_workers = 8 +val_batch_size_per_gpu = 1 +val_num_workers = 2 + +# persistent_workers must be False if num_workers is 0. +persistent_workers = True + +lr_factor = 0.15135 +affine_scale = 0.75544 + +# only on Val +batch_shapes_cfg = dict(img_size=img_scale[0]) + +anchors = [[(26, 44), (67, 57), (61, 130)], [(121, 118), (120, 239), + (206, 182)], + [(376, 161), (234, 324), (428, 322)]] +num_det_layers = 3 + +load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth' # noqa + +tta_img_scales = [img_scale, (416, 416), (640, 640)] + +# Hyperparameter reference from: +# https://github.com/ultralytics/yolov5/blob/master/data/hyps/hyp.VOC.yaml +model = dict( + bbox_head=dict( + head_module=dict(num_classes=num_classes), + prior_generator=dict(base_sizes=anchors), + loss_cls=dict( + loss_weight=0.21638 * (num_classes / 80 * 3 / num_det_layers), + class_weight=0.5), + loss_bbox=dict(loss_weight=0.02 * (3 / num_det_layers)), + loss_obj=dict( + loss_weight=0.51728 * + ((img_scale[0] / 640)**2 * 3 / num_det_layers), + class_weight=0.67198), + # Different from COCO + prior_match_thr=3.3744), + test_cfg=dict(nms=dict(iou_threshold=0.6))) + +albu_train_transforms = _base_.albu_train_transforms +pre_transform = _base_.pre_transform + +with_mosiac_pipeline = [ + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_translate_ratio=0.04591, + max_shear_degree=0.0, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)), + dict( + type='YOLOv5MixUp', + prob=0.04266, + pre_transform=[ + *pre_transform, + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_translate_ratio=0.04591, + max_shear_degree=0.0, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)) + ]) +] + +without_mosaic_pipeline = [ + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_translate_ratio=0.04591, + max_shear_degree=0.0, + 
scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + border=(0, 0), + border_val=(114, 114, 114)), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=True, + pad_val=dict(img=114)) +] + +# Because the border parameter is inconsistent when +# using mosaic or not, `RandomChoice` is used here. +randchoice_mosaic_pipeline = dict( + type='RandomChoice', + transforms=[with_mosiac_pipeline, without_mosaic_pipeline], + prob=[0.85834, 0.14166]) + +train_pipeline = [ + *pre_transform, randchoice_mosaic_pipeline, + dict( + type='mmdet.Albu', + transforms=albu_train_transforms, + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes' + }), + dict( + type='YOLOv5HSVRandomAug', + hue_delta=0.01041, + saturation_delta=0.54703, + value_delta=0.27739), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_dataloader = dict( + _delete_=True, + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + persistent_workers=persistent_workers, + pin_memory=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='ConcatDataset', + datasets=[ + dict( + type=dataset_type, + data_root=data_root, + ann_file='VOC2007/ImageSets/Main/trainval.txt', + data_prefix=dict(sub_data_root='VOC2007/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline), + dict( + type=dataset_type, + data_root=data_root, + ann_file='VOC2012/ImageSets/Main/trainval.txt', + data_prefix=dict(sub_data_root='VOC2012/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline) + ], + # Use ignore_keys to avoid judging metainfo is + # not equal in `ConcatDataset`. 
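+        # ('dataset_type' differs between the VOC2007 and VOC2012 entries,
+        # so that key is skipped when checking metainfo consistency)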
+ ignore_keys='dataset_type'), + collate_fn=dict(type='yolov5_collate')) + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=False, + pad_val=dict(img=114)), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] + +val_dataloader = dict( + batch_size=val_batch_size_per_gpu, + num_workers=val_num_workers, + persistent_workers=persistent_workers, + pin_memory=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='VOC2007/ImageSets/Main/test.txt', + data_prefix=dict(sub_data_root='VOC2007/'), + test_mode=True, + pipeline=test_pipeline, + batch_shapes_cfg=batch_shapes_cfg)) + +test_dataloader = val_dataloader + +param_scheduler = None +optim_wrapper = dict( + optimizer=dict( + lr=0.00334, + momentum=0.74832, + weight_decay=0.00025, + batch_size_per_gpu=train_batch_size_per_gpu)) + +default_hooks = dict( + param_scheduler=dict( + lr_factor=lr_factor, + max_epochs=max_epochs, + warmup_epochs=3.3835, + warmup_momentum=0.59462, + warmup_bias_lr=0.18657)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + # To load COCO pretrained model, need to set `strict_load=False` + strict_load=False, + priority=49) +] + +# TODO: Support using coco metric in voc dataset +val_evaluator = dict( + _delete_=True, type='mmdet.VOCMetric', metric='mAP', eval_mode='area') + +test_evaluator = val_evaluator + +train_cfg = dict(max_epochs=max_epochs) + +# Config for Test Time Augmentation. (TTA) +_multiscale_resize_transforms = [ + dict( + type='Compose', + transforms=[ + dict(type='YOLOv5KeepRatioResize', scale=s), + dict( + type='LetterResize', + scale=s, + allow_scale_up=False, + pad_val=dict(img=114)) + ]) for s in tta_img_scales +] + +tta_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict( + type='TestTimeAug', + transforms=[ + _multiscale_resize_transforms, + [ + dict(type='mmdet.RandomFlip', prob=1.), + dict(type='mmdet.RandomFlip', prob=0.) 
+ ], [dict(type='mmdet.LoadAnnotations', with_bbox=True)], + [ + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'flip', + 'flip_direction')) + ] + ]) +] diff --git a/third_party/mmyolo/configs/yolov5/voc/yolov5_x-v61_fast_1xb32-50e_voc.py b/third_party/mmyolo/configs/yolov5/voc/yolov5_x-v61_fast_1xb32-50e_voc.py new file mode 100644 index 0000000000000000000000000000000000000000..2fc4d79f86b40c45d3f7692f32adc88295bbb4a4 --- /dev/null +++ b/third_party/mmyolo/configs/yolov5/voc/yolov5_x-v61_fast_1xb32-50e_voc.py @@ -0,0 +1,26 @@ +_base_ = './yolov5_s-v61_fast_1xb64-50e_voc.py' + +deepen_factor = 1.33 +widen_factor = 1.25 +train_batch_size_per_gpu = 32 +train_num_workers = 8 + +# TODO: need to add pretrained_model +load_from = None + +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, num_workers=train_num_workers) + +optim_wrapper = dict( + optimizer=dict(batch_size_per_gpu=train_batch_size_per_gpu)) diff --git a/third_party/mmyolo/configs/yolov5/yolov5_l-p6-v62_syncbn_fast_8xb16-300e_coco.py b/third_party/mmyolo/configs/yolov5/yolov5_l-p6-v62_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..6a84fdbebc11dd4eafadc34be1e98bfb6f9b2f43 --- /dev/null +++ b/third_party/mmyolo/configs/yolov5/yolov5_l-p6-v62_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,15 @@ +_base_ = './yolov5_m-p6-v62_syncbn_fast_8xb16-300e_coco.py' + +deepen_factor = 1.0 +widen_factor = 1.0 + +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/third_party/mmyolo/configs/yolov5/yolov5_l-v61_syncbn_fast_8xb16-300e_coco.py b/third_party/mmyolo/configs/yolov5/yolov5_l-v61_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..60a11a375c3dd8ead1d3f6a04340aed2acb20b20 --- /dev/null +++ b/third_party/mmyolo/configs/yolov5/yolov5_l-v61_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,15 @@ +_base_ = './yolov5_m-v61_syncbn_fast_8xb16-300e_coco.py' + +deepen_factor = 1.0 +widen_factor = 1.0 + +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/third_party/mmyolo/configs/yolov5/yolov5_m-p6-v62_syncbn_fast_8xb16-300e_coco.py b/third_party/mmyolo/configs/yolov5/yolov5_m-p6-v62_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..f593e378a9fbbf1381e48a186a645a559b1f129a --- /dev/null +++ b/third_party/mmyolo/configs/yolov5/yolov5_m-p6-v62_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,79 @@ +_base_ = './yolov5_s-p6-v62_syncbn_fast_8xb16-300e_coco.py' + +# ========================modified parameters====================== +deepen_factor = 0.67 +widen_factor = 0.75 +lr_factor = 0.1 +affine_scale = 0.9 +loss_cls_weight = 0.3 +loss_obj_weight = 0.7 +mixup_prob = 0.1 + +# =======================Unmodified in most cases================== +num_classes = _base_.num_classes +num_det_layers = 
_base_.num_det_layers +img_scale = _base_.img_scale + +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict( + head_module=dict(widen_factor=widen_factor), + loss_cls=dict(loss_weight=loss_cls_weight * + (num_classes / 80 * 3 / num_det_layers)), + loss_obj=dict(loss_weight=loss_obj_weight * + ((img_scale[0] / 640)**2 * 3 / num_det_layers)))) + +pre_transform = _base_.pre_transform +albu_train_transforms = _base_.albu_train_transforms + +mosaic_affine_pipeline = [ + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)) +] + +# enable mixup +train_pipeline = [ + *pre_transform, *mosaic_affine_pipeline, + dict( + type='YOLOv5MixUp', + prob=mixup_prob, + pre_transform=[*pre_transform, *mosaic_affine_pipeline]), + dict( + type='mmdet.Albu', + transforms=albu_train_transforms, + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes' + }), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +default_hooks = dict(param_scheduler=dict(lr_factor=lr_factor)) diff --git a/third_party/mmyolo/configs/yolov5/yolov5_m-v61_syncbn_fast_8xb16-300e_coco.py b/third_party/mmyolo/configs/yolov5/yolov5_m-v61_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..d2ef324ed097a30d5a04fba2bb85641e7857f353 --- /dev/null +++ b/third_party/mmyolo/configs/yolov5/yolov5_m-v61_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,79 @@ +_base_ = './yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py' + +# ========================modified parameters====================== +deepen_factor = 0.67 +widen_factor = 0.75 +lr_factor = 0.1 +affine_scale = 0.9 +loss_cls_weight = 0.3 +loss_obj_weight = 0.7 +mixup_prob = 0.1 + +# =======================Unmodified in most cases================== +num_classes = _base_.num_classes +num_det_layers = _base_.num_det_layers +img_scale = _base_.img_scale + +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict( + head_module=dict(widen_factor=widen_factor), + loss_cls=dict(loss_weight=loss_cls_weight * + (num_classes / 80 * 3 / num_det_layers)), + loss_obj=dict(loss_weight=loss_obj_weight * + ((img_scale[0] / 640)**2 * 3 / num_det_layers)))) + +pre_transform = _base_.pre_transform +albu_train_transforms = _base_.albu_train_transforms + +mosaic_affine_pipeline = [ + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)) +] + +# enable mixup +train_pipeline = [ + *pre_transform, 
*mosaic_affine_pipeline, + dict( + type='YOLOv5MixUp', + prob=mixup_prob, + pre_transform=[*pre_transform, *mosaic_affine_pipeline]), + dict( + type='mmdet.Albu', + transforms=albu_train_transforms, + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes' + }), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +default_hooks = dict(param_scheduler=dict(lr_factor=lr_factor)) diff --git a/third_party/mmyolo/configs/yolov5/yolov5_n-p6-v62_syncbn_fast_8xb16-300e_coco.py b/third_party/mmyolo/configs/yolov5/yolov5_n-p6-v62_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..3cd2d6b7be817f4f8e6729acc1d3f9e450457e07 --- /dev/null +++ b/third_party/mmyolo/configs/yolov5/yolov5_n-p6-v62_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,15 @@ +_base_ = 'yolov5_s-p6-v62_syncbn_fast_8xb16-300e_coco.py' + +deepen_factor = 0.33 +widen_factor = 0.25 + +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/third_party/mmyolo/configs/yolov5/yolov5_n-v61_syncbn_fast_8xb16-300e_coco.py b/third_party/mmyolo/configs/yolov5/yolov5_n-v61_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..b6f93428fc8d6dc1b94a8d447671ffc1a877dbb8 --- /dev/null +++ b/third_party/mmyolo/configs/yolov5/yolov5_n-v61_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,15 @@ +_base_ = './yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py' + +deepen_factor = 0.33 +widen_factor = 0.25 + +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/third_party/mmyolo/configs/yolov5/yolov5_s-p6-v62_syncbn_fast_8xb16-300e_coco.py b/third_party/mmyolo/configs/yolov5/yolov5_s-p6-v62_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..f64df69fd4ea0f4c8d30b9e8928bcd1c4e1d9d35 --- /dev/null +++ b/third_party/mmyolo/configs/yolov5/yolov5_s-p6-v62_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,138 @@ +_base_ = 'yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py' + +# ========================modified parameters====================== +img_scale = (1280, 1280) # width, height +num_classes = 80 # Number of classes for classification +# Config of batch shapes. Only on val. +# It means not used if batch_shapes_cfg is None. 
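+# The P6 model adds a stride-64 output (see strides below), so padded
+# validation shapes should be divisible by 64 (size_divisor=64 below).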
+batch_shapes_cfg = dict( + img_size=img_scale[0], + # The image scale of padding should be divided by pad_size_divisor + size_divisor=64) +# Basic size of multi-scale prior box +anchors = [ + [(19, 27), (44, 40), (38, 94)], # P3/8 + [(96, 68), (86, 152), (180, 137)], # P4/16 + [(140, 301), (303, 264), (238, 542)], # P5/32 + [(436, 615), (739, 380), (925, 792)] # P6/64 +] +# Strides of multi-scale prior box +strides = [8, 16, 32, 64] +num_det_layers = 4 # The number of model output scales +loss_cls_weight = 0.5 +loss_bbox_weight = 0.05 +loss_obj_weight = 1.0 +# The obj loss weights of the three output layers +obj_level_weights = [4.0, 1.0, 0.25, 0.06] +affine_scale = 0.5 # YOLOv5RandomAffine scaling ratio + +tta_img_scales = [(1280, 1280), (1024, 1024), (1536, 1536)] +# =======================Unmodified in most cases================== +model = dict( + backbone=dict(arch='P6', out_indices=(2, 3, 4, 5)), + neck=dict( + in_channels=[256, 512, 768, 1024], out_channels=[256, 512, 768, 1024]), + bbox_head=dict( + head_module=dict( + in_channels=[256, 512, 768, 1024], featmap_strides=strides), + prior_generator=dict(base_sizes=anchors, strides=strides), + # scaled based on number of detection layers + loss_cls=dict(loss_weight=loss_cls_weight * + (num_classes / 80 * 3 / num_det_layers)), + loss_bbox=dict(loss_weight=loss_bbox_weight * (3 / num_det_layers)), + loss_obj=dict(loss_weight=loss_obj_weight * + ((img_scale[0] / 640)**2 * 3 / num_det_layers)), + obj_level_weights=obj_level_weights)) + +pre_transform = _base_.pre_transform +albu_train_transforms = _base_.albu_train_transforms + +train_pipeline = [ + *pre_transform, + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)), + dict( + type='mmdet.Albu', + transforms=albu_train_transforms, + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes' + }), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=False, + pad_val=dict(img=114)), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] + +val_dataloader = dict( + dataset=dict(pipeline=test_pipeline, batch_shapes_cfg=batch_shapes_cfg)) + +test_dataloader = val_dataloader + +# Config for Test Time Augmentation. 
(TTA) +_multiscale_resize_transforms = [ + dict( + type='Compose', + transforms=[ + dict(type='YOLOv5KeepRatioResize', scale=s), + dict( + type='LetterResize', + scale=s, + allow_scale_up=False, + pad_val=dict(img=114)) + ]) for s in tta_img_scales +] + +tta_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict( + type='TestTimeAug', + transforms=[ + _multiscale_resize_transforms, + [ + dict(type='mmdet.RandomFlip', prob=1.), + dict(type='mmdet.RandomFlip', prob=0.) + ], [dict(type='mmdet.LoadAnnotations', with_bbox=True)], + [ + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'flip', + 'flip_direction')) + ] + ]) +] diff --git a/third_party/mmyolo/configs/yolov5/yolov5_s-v61_fast_1xb12-40e_608x352_cat.py b/third_party/mmyolo/configs/yolov5/yolov5_s-v61_fast_1xb12-40e_608x352_cat.py new file mode 100644 index 0000000000000000000000000000000000000000..5bbd13e0859abb7a9fa315a8b0f956f959a560d7 --- /dev/null +++ b/third_party/mmyolo/configs/yolov5/yolov5_s-v61_fast_1xb12-40e_608x352_cat.py @@ -0,0 +1,70 @@ +_base_ = 'yolov5_s-v61_fast_1xb12-40e_cat.py' + +# This configuration is used to provide non-square training examples +# Must be a multiple of 32 +img_scale = (608, 352) # w h + +anchors = [ + [(65, 35), (159, 45), (119, 80)], # P3/8 + [(215, 77), (224, 116), (170, 166)], # P4/16 + [(376, 108), (339, 176), (483, 190)] # P5/32 +] + +# ===============================Unmodified in most cases==================== +_base_.model.bbox_head.loss_obj.loss_weight = 1.0 * ((img_scale[1] / 640)**2) +_base_.model.bbox_head.prior_generator.base_sizes = anchors + +train_pipeline = [ + *_base_.pre_transform, + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)), + dict( + type='mmdet.Albu', + transforms=_base_.albu_train_transforms, + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes' + }), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +_base_.train_dataloader.dataset.pipeline = train_pipeline + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=False, + pad_val=dict(img=114)), + dict(type='mmdet.LoadAnnotations', with_bbox=True), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] + +val_dataloader = dict( + dataset=dict(pipeline=test_pipeline, batch_shapes_cfg=None)) +test_dataloader = val_dataloader diff --git a/third_party/mmyolo/configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py b/third_party/mmyolo/configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py new file mode 100644 index 0000000000000000000000000000000000000000..7b7e4f227bbc6aa37873dc306009d1af842c166c --- /dev/null +++ b/third_party/mmyolo/configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py @@ -0,0 +1,56 
@@ +_base_ = 'yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py' + +data_root = './data/cat/' +class_name = ('cat', ) +num_classes = len(class_name) +metainfo = dict(classes=class_name, palette=[(20, 220, 60)]) + +anchors = [ + [(68, 69), (154, 91), (143, 162)], # P3/8 + [(242, 160), (189, 287), (391, 207)], # P4/16 + [(353, 337), (539, 341), (443, 432)] # P5/32 +] + +max_epochs = 40 +train_batch_size_per_gpu = 12 +train_num_workers = 4 + +load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth' # noqa + +model = dict( + backbone=dict(frozen_stages=4), + bbox_head=dict( + head_module=dict(num_classes=num_classes), + prior_generator=dict(base_sizes=anchors))) + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + data_root=data_root, + metainfo=metainfo, + ann_file='annotations/trainval.json', + data_prefix=dict(img='images/'))) + +val_dataloader = dict( + dataset=dict( + metainfo=metainfo, + data_root=data_root, + ann_file='annotations/test.json', + data_prefix=dict(img='images/'))) + +test_dataloader = val_dataloader + +_base_.optim_wrapper.optimizer.batch_size_per_gpu = train_batch_size_per_gpu + +val_evaluator = dict(ann_file=data_root + 'annotations/test.json') +test_evaluator = val_evaluator + +default_hooks = dict( + checkpoint=dict(interval=10, max_keep_ckpts=2, save_best='auto'), + # The warmup_mim_iter parameter is critical. + # The default value is 1000 which is not suitable for cat datasets. + param_scheduler=dict(max_epochs=max_epochs, warmup_mim_iter=10), + logger=dict(type='LoggerHook', interval=5)) +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +# visualizer = dict(vis_backends = [dict(type='LocalVisBackend'), dict(type='WandbVisBackend')]) # noqa diff --git a/third_party/mmyolo/configs/yolov5/yolov5_s-v61_fast_1xb12-ms-40e_cat.py b/third_party/mmyolo/configs/yolov5/yolov5_s-v61_fast_1xb12-ms-40e_cat.py new file mode 100644 index 0000000000000000000000000000000000000000..dc460fa9802d34ece214482bcda7a6bdf7435b39 --- /dev/null +++ b/third_party/mmyolo/configs/yolov5/yolov5_s-v61_fast_1xb12-ms-40e_cat.py @@ -0,0 +1,13 @@ +_base_ = 'yolov5_s-v61_fast_1xb12-40e_cat.py' + +model = dict( + data_preprocessor=dict( + type='YOLOv5DetDataPreprocessor', + pad_size_divisor=32, + batch_augments=[ + dict( + type='YOLOXBatchSyncRandomResize', + random_size_range=(480, 800), + size_divisor=32, + interval=1) + ])) diff --git a/third_party/mmyolo/configs/yolov5/yolov5_s-v61_syncbn-detect_8xb16-300e_coco.py b/third_party/mmyolo/configs/yolov5/yolov5_s-v61_syncbn-detect_8xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..d8238c1377cb2f56f4c3bf0c5cd6d4227b2d70a5 --- /dev/null +++ b/third_party/mmyolo/configs/yolov5/yolov5_s-v61_syncbn-detect_8xb16-300e_coco.py @@ -0,0 +1,23 @@ +_base_ = 'yolov5_s-v61_syncbn_8xb16-300e_coco.py' + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict( + type='LetterResize', + scale=_base_.img_scale, + allow_scale_up=True, + use_mini_pad=True), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] + +val_dataloader = dict( + dataset=dict(pipeline=test_pipeline, batch_shapes_cfg=None)) +test_dataloader = val_dataloader + +model = dict( + test_cfg=dict( + multi_label=False, 
score_thr=0.25, nms=dict(iou_threshold=0.45))) diff --git a/third_party/mmyolo/configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py b/third_party/mmyolo/configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..7e81a0385587df40c588dcb44202a7f5d82478c1 --- /dev/null +++ b/third_party/mmyolo/configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py @@ -0,0 +1,292 @@ +_base_ = ['../_base_/default_runtime.py', '../_base_/det_p5_tta.py'] + +# ========================Frequently modified parameters====================== +# -----data related----- +data_root = 'data/coco/' # Root path of data +# Path of train annotation file +train_ann_file = 'annotations/instances_train2017.json' +train_data_prefix = 'train2017/' # Prefix of train image path +# Path of val annotation file +val_ann_file = 'annotations/instances_val2017.json' +val_data_prefix = 'val2017/' # Prefix of val image path + +num_classes = 80 # Number of classes for classification +# Batch size of a single GPU during training +train_batch_size_per_gpu = 16 +# Worker to pre-fetch data for each single GPU during training +train_num_workers = 8 +# persistent_workers must be False if num_workers is 0 +persistent_workers = True + +# -----model related----- +# Basic size of multi-scale prior box +anchors = [ + [(10, 13), (16, 30), (33, 23)], # P3/8 + [(30, 61), (62, 45), (59, 119)], # P4/16 + [(116, 90), (156, 198), (373, 326)] # P5/32 +] + +# -----train val related----- +# Base learning rate for optim_wrapper. Corresponding to 8xb16=128 bs +base_lr = 0.01 +max_epochs = 300 # Maximum training epochs + +model_test_cfg = dict( + # The config of multi-label for multi-class prediction. + multi_label=True, + # The number of boxes before NMS + nms_pre=30000, + score_thr=0.001, # Threshold to filter out boxes. + nms=dict(type='nms', iou_threshold=0.65), # NMS type and threshold + max_per_img=300) # Max number of detections of each image + +# ========================Possible modified parameters======================== +# -----data related----- +img_scale = (640, 640) # width, height +# Dataset type, this will be used to define the dataset +dataset_type = 'YOLOv5CocoDataset' +# Batch size of a single GPU during validation +val_batch_size_per_gpu = 1 +# Worker to pre-fetch data for each single GPU during validation +val_num_workers = 2 + +# Config of batch shapes. Only on val. +# It means not used if batch_shapes_cfg is None. +batch_shapes_cfg = dict( + type='BatchShapePolicy', + batch_size=val_batch_size_per_gpu, + img_size=img_scale[0], + # The image scale of padding should be divided by pad_size_divisor + size_divisor=32, + # Additional paddings for pixel scale + extra_pad_ratio=0.5) + +# -----model related----- +# The scaling factor that controls the depth of the network structure +deepen_factor = 0.33 +# The scaling factor that controls the width of the network structure +widen_factor = 0.5 +# Strides of multi-scale prior box +strides = [8, 16, 32] +num_det_layers = 3 # The number of model output scales +norm_cfg = dict(type='BN', momentum=0.03, eps=0.001) # Normalization config + +# -----train val related----- +affine_scale = 0.5 # YOLOv5RandomAffine scaling ratio +loss_cls_weight = 0.5 +loss_bbox_weight = 0.05 +loss_obj_weight = 1.0 +prior_match_thr = 4. 
# Priori box matching threshold +# The obj loss weights of the three output layers +obj_level_weights = [4., 1., 0.4] +lr_factor = 0.01 # Learning rate scaling factor +weight_decay = 0.0005 +# Save model checkpoint and validation intervals +save_checkpoint_intervals = 10 +# The maximum checkpoints to keep. +max_keep_ckpts = 3 +# Single-scale training is recommended to +# be turned on, which can speed up training. +env_cfg = dict(cudnn_benchmark=True) + +# ===============================Unmodified in most cases==================== +model = dict( + type='YOLODetector', + data_preprocessor=dict( + type='mmdet.DetDataPreprocessor', + mean=[0., 0., 0.], + std=[255., 255., 255.], + bgr_to_rgb=True), + backbone=dict( + type='YOLOv5CSPDarknet', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + norm_cfg=norm_cfg, + act_cfg=dict(type='SiLU', inplace=True)), + neck=dict( + type='YOLOv5PAFPN', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + in_channels=[256, 512, 1024], + out_channels=[256, 512, 1024], + num_csp_blocks=3, + norm_cfg=norm_cfg, + act_cfg=dict(type='SiLU', inplace=True)), + bbox_head=dict( + type='YOLOv5Head', + head_module=dict( + type='YOLOv5HeadModule', + num_classes=num_classes, + in_channels=[256, 512, 1024], + widen_factor=widen_factor, + featmap_strides=strides, + num_base_priors=3), + prior_generator=dict( + type='mmdet.YOLOAnchorGenerator', + base_sizes=anchors, + strides=strides), + # scaled based on number of detection layers + loss_cls=dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='mean', + loss_weight=loss_cls_weight * + (num_classes / 80 * 3 / num_det_layers)), + loss_bbox=dict( + type='IoULoss', + iou_mode='ciou', + bbox_format='xywh', + eps=1e-7, + reduction='mean', + loss_weight=loss_bbox_weight * (3 / num_det_layers), + return_iou=True), + loss_obj=dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='mean', + loss_weight=loss_obj_weight * + ((img_scale[0] / 640)**2 * 3 / num_det_layers)), + prior_match_thr=prior_match_thr, + obj_level_weights=obj_level_weights), + test_cfg=model_test_cfg) + +albu_train_transforms = [ + dict(type='Blur', p=0.01), + dict(type='MedianBlur', p=0.01), + dict(type='ToGray', p=0.01), + dict(type='CLAHE', p=0.01) +] + +pre_transform = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True) +] + +train_pipeline = [ + *pre_transform, + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)), + dict( + type='mmdet.Albu', + transforms=albu_train_transforms, + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes' + }), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + persistent_workers=persistent_workers, + pin_memory=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + 
ann_file=train_ann_file, + data_prefix=dict(img=train_data_prefix), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline)) + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=False, + pad_val=dict(img=114)), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] + +val_dataloader = dict( + batch_size=val_batch_size_per_gpu, + num_workers=val_num_workers, + persistent_workers=persistent_workers, + pin_memory=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + test_mode=True, + data_prefix=dict(img=val_data_prefix), + ann_file=val_ann_file, + pipeline=test_pipeline, + batch_shapes_cfg=batch_shapes_cfg)) + +test_dataloader = val_dataloader + +param_scheduler = None +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict( + type='SGD', + lr=base_lr, + momentum=0.937, + weight_decay=weight_decay, + nesterov=True, + batch_size_per_gpu=train_batch_size_per_gpu), + constructor='YOLOv5OptimizerConstructor') + +default_hooks = dict( + param_scheduler=dict( + type='YOLOv5ParamSchedulerHook', + scheduler_type='linear', + lr_factor=lr_factor, + max_epochs=max_epochs), + checkpoint=dict( + type='CheckpointHook', + interval=save_checkpoint_intervals, + save_best='auto', + max_keep_ckpts=max_keep_ckpts)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49) +] + +val_evaluator = dict( + type='mmdet.CocoMetric', + proposal_nums=(100, 1, 10), + ann_file=data_root + val_ann_file, + metric='bbox') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', + max_epochs=max_epochs, + val_interval=save_checkpoint_intervals) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') diff --git a/third_party/mmyolo/configs/yolov5/yolov5_s-v61_syncbn_fast_1xb4-300e_balloon.py b/third_party/mmyolo/configs/yolov5/yolov5_s-v61_syncbn_fast_1xb4-300e_balloon.py new file mode 100644 index 0000000000000000000000000000000000000000..2c585ceb92e9bfb1984b49ce02f86f4d3cd4532d --- /dev/null +++ b/third_party/mmyolo/configs/yolov5/yolov5_s-v61_syncbn_fast_1xb4-300e_balloon.py @@ -0,0 +1,42 @@ +_base_ = './yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py' + +# ========================modified parameters====================== +data_root = 'data/balloon/' +# Path of train annotation file +train_ann_file = 'train.json' +train_data_prefix = 'train/' # Prefix of train image path +# Path of val annotation file +val_ann_file = 'val.json' +val_data_prefix = 'val/' # Prefix of val image path +metainfo = { + 'classes': ('balloon', ), + 'palette': [ + (220, 20, 60), + ] +} +num_classes = 1 + +train_batch_size_per_gpu = 4 +train_num_workers = 2 +log_interval = 1 + +# =======================Unmodified in most cases================== +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + data_root=data_root, + metainfo=metainfo, + data_prefix=dict(img=train_data_prefix), + ann_file=train_ann_file)) +val_dataloader = dict( + dataset=dict( + data_root=data_root, + metainfo=metainfo, + data_prefix=dict(img=val_data_prefix), + 
ann_file=val_ann_file)) +test_dataloader = val_dataloader +val_evaluator = dict(ann_file=data_root + val_ann_file) +test_evaluator = val_evaluator +model = dict(bbox_head=dict(head_module=dict(num_classes=num_classes))) +default_hooks = dict(logger=dict(interval=log_interval)) diff --git a/third_party/mmyolo/configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py b/third_party/mmyolo/configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..17b4a73b092fda1b98a088a83619697702859f71 --- /dev/null +++ b/third_party/mmyolo/configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,12 @@ +_base_ = 'yolov5_s-v61_syncbn_8xb16-300e_coco.py' + +# fast means faster training speed, +# but less flexibility for multitasking +model = dict( + data_preprocessor=dict( + type='YOLOv5DetDataPreprocessor', + mean=[0., 0., 0.], + std=[255., 255., 255.], + bgr_to_rgb=True)) + +train_dataloader = dict(collate_fn=dict(type='yolov5_collate')) diff --git a/third_party/mmyolo/configs/yolov5/yolov5_x-p6-v62_syncbn_fast_8xb16-300e_coco.py b/third_party/mmyolo/configs/yolov5/yolov5_x-p6-v62_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..9fe5c0103520280ba26bb3f56a4a30658576b74b --- /dev/null +++ b/third_party/mmyolo/configs/yolov5/yolov5_x-p6-v62_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,14 @@ +_base_ = './yolov5_m-p6-v62_syncbn_fast_8xb16-300e_coco.py' +deepen_factor = 1.33 +widen_factor = 1.25 + +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/third_party/mmyolo/configs/yolov5/yolov5_x-v61_syncbn_fast_8xb16-300e_coco.py b/third_party/mmyolo/configs/yolov5/yolov5_x-v61_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..8782eed8df6318b3aad6333809a04f639fd0cefb --- /dev/null +++ b/third_party/mmyolo/configs/yolov5/yolov5_x-v61_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,14 @@ +_base_ = './yolov5_m-v61_syncbn_fast_8xb16-300e_coco.py' +deepen_factor = 1.33 +widen_factor = 1.25 + +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_l_mask-refine_syncbn_fast_8xb16-300e_coco.py b/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_l_mask-refine_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..60c11feb3d4e6f8db5f3e70af5d3afdbc5f65535 --- /dev/null +++ b/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_l_mask-refine_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,59 @@ +_base_ = './yolov5u_m_mask-refine_syncbn_fast_8xb16-300e_coco.py' + +# This config will refine bbox by mask while loading annotations and +# transforming after `YOLOv5RandomAffine` + +# ========================modified parameters====================== +deepen_factor = 1.00 +widen_factor = 1.00 + +mixup_prob = 0.15 +copypaste_prob = 0.3 + +# =======================Unmodified in most cases================== +img_scale = _base_.img_scale +pre_transform = _base_.pre_transform +last_transform = _base_.last_transform +affine_scale = _base_.affine_scale + +model = dict( + backbone=dict( + 
deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) + +mosaic_affine_transform = [ + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict(type='YOLOv5CopyPaste', prob=copypaste_prob), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_aspect_ratio=100., + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=_base_.min_area_ratio, + use_mask_refine=_base_.use_mask2refine) +] + +train_pipeline = [ + *pre_transform, *mosaic_affine_transform, + dict( + type='YOLOv5MixUp', + prob=mixup_prob, + pre_transform=[*pre_transform, *mosaic_affine_transform]), + *last_transform +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) diff --git a/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_l_syncbn_fast_8xb16-300e_coco.py b/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_l_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..22b9e881d024bfc781b1328913b50439ac80a2f3 --- /dev/null +++ b/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_l_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,18 @@ +_base_ = './yolov5u_s_syncbn_fast_8xb16-300e_coco.py' + +# ========================modified parameters====================== +# TODO: Update the training hyperparameters +deepen_factor = 1.0 +widen_factor = 1.0 + +# =======================Unmodified in most cases================== +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_m_mask-refine_syncbn_fast_8xb16-300e_coco.py b/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_m_mask-refine_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..ecc86fdd2d9ae362477f4edc5e5f9dd497222946 --- /dev/null +++ b/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_m_mask-refine_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,79 @@ +_base_ = './yolov5u_s_mask-refine_syncbn_fast_8xb16-300e_coco.py' + +# This config will refine bbox by mask while loading annotations and +# transforming after `YOLOv5RandomAffine` + +# ========================modified parameters====================== +deepen_factor = 0.67 +widen_factor = 0.75 + +affine_scale = 0.9 +mixup_prob = 0.1 +copypaste_prob = 0.1 + +# =======================Unmodified in most cases================== +img_scale = _base_.img_scale +pre_transform = _base_.pre_transform +last_transform = _base_.last_transform + +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) + +mosaic_affine_transform = [ + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict(type='YOLOv5CopyPaste', prob=copypaste_prob), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_aspect_ratio=100., + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + # img_scale is (width, 
height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=_base_.min_area_ratio, + use_mask_refine=_base_.use_mask2refine) +] + +train_pipeline = [ + *pre_transform, *mosaic_affine_transform, + dict( + type='YOLOv5MixUp', + prob=mixup_prob, + pre_transform=[*pre_transform, *mosaic_affine_transform]), + *last_transform +] + +train_pipeline_stage2 = [ + *pre_transform, + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=True, + pad_val=dict(img=114.0)), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + border_val=(114, 114, 114), + min_area_ratio=_base_.min_area_ratio, + use_mask_refine=_base_.use_mask2refine), *last_transform +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +_base_.custom_hooks[1].switch_pipeline = train_pipeline_stage2 diff --git a/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_m_syncbn_fast_8xb16-300e_coco.py b/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_m_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..0cfb332488ba41c5e0880bd91d8c73fccde52f36 --- /dev/null +++ b/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_m_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,18 @@ +_base_ = './yolov5u_s_syncbn_fast_8xb16-300e_coco.py' + +# ========================modified parameters====================== +# TODO: Update the training hyperparameters +deepen_factor = 0.67 +widen_factor = 0.75 + +# =======================Unmodified in most cases================== +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_n_mask-refine_syncbn_fast_8xb16-300e_coco.py b/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_n_mask-refine_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..1ca21b65147e830b04b0e70e61011f6a9371d637 --- /dev/null +++ b/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_n_mask-refine_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,20 @@ +_base_ = './yolov5u_s_mask-refine_syncbn_fast_8xb16-300e_coco.py' + +# This config will refine bbox by mask while loading annotations and +# transforming after `YOLOv5RandomAffine` + +# ========================modified parameters====================== +deepen_factor = 0.33 +widen_factor = 0.25 + +# ===============================Unmodified in most cases==================== +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_n_syncbn_fast_8xb16-300e_coco.py b/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_n_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..ad6a9f2eba7ac8fc56c12fab52a3a8f9b24acba1 --- /dev/null +++ b/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_n_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,17 @@ +_base_ = './yolov5u_s_syncbn_fast_8xb16-300e_coco.py' + +# ========================modified 
parameters====================== +deepen_factor = 0.33 +widen_factor = 0.25 + +# =======================Unmodified in most cases================== +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_s_mask-refine_syncbn_fast_8xb16-300e_coco.py b/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_s_mask-refine_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..d6840bc288b2cb9d26ebc06d0b888926035ce8b9 --- /dev/null +++ b/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_s_mask-refine_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,80 @@ +_base_ = './yolov5u_s_syncbn_fast_8xb16-300e_coco.py' + +# This config will refine bbox by mask while loading annotations and +# transforming after `YOLOv5RandomAffine` + +# ========================modified parameters====================== +use_mask2refine = True +min_area_ratio = 0.01 # YOLOv5RandomAffine + +# ===============================Unmodified in most cases==================== +pre_transform = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict( + type='LoadAnnotations', + with_bbox=True, + with_mask=True, + mask2bbox=use_mask2refine) +] + +last_transform = [ + # Delete gt_masks to avoid more computation + dict(type='RemoveDataElement', keys=['gt_masks']), + dict( + type='mmdet.Albu', + transforms=_base_.albu_train_transforms, + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes' + }), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_pipeline = [ + *pre_transform, + dict( + type='Mosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + # img_scale is (width, height) + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=min_area_ratio, + use_mask_refine=use_mask2refine), + *last_transform +] + +train_pipeline_stage2 = [ + *pre_transform, + dict(type='YOLOv5KeepRatioResize', scale=_base_.img_scale), + dict( + type='LetterResize', + scale=_base_.img_scale, + allow_scale_up=True, + pad_val=dict(img=114.0)), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + border_val=(114, 114, 114)), *last_transform +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +_base_.custom_hooks[1].switch_pipeline = train_pipeline_stage2 diff --git a/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_s_syncbn_fast_8xb16-300e_coco.py b/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_s_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..81d3a981c281af0f4cd9596c4a7349cb2e1bf367 --- /dev/null +++ b/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_s_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,326 @@ +_base_ = 
['../../_base_/default_runtime.py', '../../_base_/det_p5_tta.py'] + +# ========================Frequently modified parameters====================== +# -----data related----- +data_root = 'data/coco/' # Root path of data +# Path of train annotation file +train_ann_file = 'annotations/instances_train2017.json' +train_data_prefix = 'train2017/' # Prefix of train image path +# Path of val annotation file +val_ann_file = 'annotations/instances_val2017.json' +val_data_prefix = 'val2017/' # Prefix of val image path + +num_classes = 80 # Number of classes for classification +# Batch size of a single GPU during training +train_batch_size_per_gpu = 16 +# Worker to pre-fetch data for each single GPU during training +train_num_workers = 8 +# persistent_workers must be False if num_workers is 0 +persistent_workers = True + +# -----train val related----- +# Base learning rate for optim_wrapper. Corresponding to 8xb16=128 bs +base_lr = 0.01 +max_epochs = 300 # Maximum training epochs +# Disable mosaic augmentation for final 10 epochs (stage 2) +close_mosaic_epochs = 10 + +model_test_cfg = dict( + # The config of multi-label for multi-class prediction. + multi_label=True, + # The number of boxes before NMS + nms_pre=30000, + score_thr=0.001, # Threshold to filter out boxes. + nms=dict(type='nms', iou_threshold=0.7), # NMS type and threshold + max_per_img=300) # Max number of detections of each image + +# ========================Possible modified parameters======================== +# -----data related----- +img_scale = (640, 640) # width, height +# Dataset type, this will be used to define the dataset +dataset_type = 'YOLOv5CocoDataset' +# Batch size of a single GPU during validation +val_batch_size_per_gpu = 1 +# Worker to pre-fetch data for each single GPU during validation +val_num_workers = 2 + +# Config of batch shapes. Only on val. +# It means not used if batch_shapes_cfg is None. +batch_shapes_cfg = dict( + type='BatchShapePolicy', + batch_size=val_batch_size_per_gpu, + img_size=img_scale[0], + # The image scale of padding should be divided by pad_size_divisor + size_divisor=32, + # Additional paddings for pixel scale + extra_pad_ratio=0.5) + +# -----model related----- +# The scaling factor that controls the depth of the network structure +deepen_factor = 0.33 +# The scaling factor that controls the width of the network structure +widen_factor = 0.5 +# Strides of multi-scale prior box +strides = [8, 16, 32] +num_det_layers = 3 # The number of model output scales +norm_cfg = dict(type='BN', momentum=0.03, eps=0.001) # Normalization config + +# -----train val related----- +tal_topk = 10 # Number of bbox selected in each level +tal_alpha = 0.5 # A Hyper-parameter related to alignment_metrics +tal_beta = 6.0 # A Hyper-parameter related to alignment_metrics + +affine_scale = 0.5 # YOLOv5RandomAffine scaling ratio +# YOLOv5RandomAffine aspect ratio of width and height thres to filter bboxes +max_aspect_ratio = 100 +# TODO: Automatically scale loss_weight based on number of detection layers +loss_cls_weight = 0.5 +loss_bbox_weight = 7.5 +# Since the dfloss is implemented differently in the official +# and mmdet, we're going to divide loss_weight by 4. +loss_dfl_weight = 1.5 / 4 +lr_factor = 0.01 # Learning rate scaling factor +weight_decay = 0.001 +# Save model checkpoint and validation intervals +save_checkpoint_intervals = 10 +# The maximum checkpoints to keep. +max_keep_ckpts = 3 +# Single-scale training is recommended to +# be turned on, which can speed up training. 
+env_cfg = dict(cudnn_benchmark=True) + +# ===============================Unmodified in most cases==================== +model = dict( + type='YOLODetector', + data_preprocessor=dict( + type='YOLOv5DetDataPreprocessor', + mean=[0., 0., 0.], + std=[255., 255., 255.], + bgr_to_rgb=True), + backbone=dict( + type='YOLOv5CSPDarknet', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + norm_cfg=norm_cfg, + act_cfg=dict(type='SiLU', inplace=True)), + neck=dict( + type='YOLOv5PAFPN', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + in_channels=[256, 512, 1024], + out_channels=[256, 512, 1024], + num_csp_blocks=3, + norm_cfg=norm_cfg, + act_cfg=dict(type='SiLU', inplace=True)), + bbox_head=dict( + type='YOLOv8Head', + head_module=dict( + type='YOLOv8HeadModule', + num_classes=num_classes, + in_channels=[256, 512, 1024], + widen_factor=widen_factor, + reg_max=16, + norm_cfg=norm_cfg, + act_cfg=dict(type='SiLU', inplace=True), + featmap_strides=strides), + prior_generator=dict( + type='mmdet.MlvlPointGenerator', offset=0.5, strides=strides), + bbox_coder=dict(type='DistancePointBBoxCoder'), + # scaled based on number of detection layers + loss_cls=dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='none', + loss_weight=loss_cls_weight), + loss_bbox=dict( + type='IoULoss', + iou_mode='ciou', + bbox_format='xyxy', + reduction='sum', + loss_weight=loss_bbox_weight, + return_iou=False), + loss_dfl=dict( + type='mmdet.DistributionFocalLoss', + reduction='mean', + loss_weight=loss_dfl_weight)), + train_cfg=dict( + assigner=dict( + type='BatchTaskAlignedAssigner', + num_classes=num_classes, + use_ciou=True, + topk=tal_topk, + alpha=tal_alpha, + beta=tal_beta, + eps=1e-9)), + test_cfg=model_test_cfg) + +albu_train_transforms = [ + dict(type='Blur', p=0.01), + dict(type='MedianBlur', p=0.01), + dict(type='ToGray', p=0.01), + dict(type='CLAHE', p=0.01) +] + +pre_transform = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True) +] + +last_transform = [ + dict( + type='mmdet.Albu', + transforms=albu_train_transforms, + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes' + }), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_pipeline = [ + *pre_transform, + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + max_aspect_ratio=max_aspect_ratio, + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)), + *last_transform +] + +train_pipeline_stage2 = [ + *pre_transform, + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=True, + pad_val=dict(img=114.0)), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + max_aspect_ratio=max_aspect_ratio, + border_val=(114, 114, 114)), *last_transform +] + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + 
persistent_workers=persistent_workers, + pin_memory=True, + sampler=dict(type='DefaultSampler', shuffle=True), + collate_fn=dict(type='yolov5_collate'), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=train_ann_file, + data_prefix=dict(img=train_data_prefix), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline)) + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=False, + pad_val=dict(img=114)), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] + +val_dataloader = dict( + batch_size=val_batch_size_per_gpu, + num_workers=val_num_workers, + persistent_workers=persistent_workers, + pin_memory=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + test_mode=True, + data_prefix=dict(img=val_data_prefix), + ann_file=val_ann_file, + pipeline=test_pipeline, + batch_shapes_cfg=batch_shapes_cfg)) + +test_dataloader = val_dataloader + +param_scheduler = None +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict( + type='SGD', + lr=base_lr, + momentum=0.937, + weight_decay=weight_decay, + nesterov=True, + batch_size_per_gpu=train_batch_size_per_gpu), + constructor='YOLOv5OptimizerConstructor') + +default_hooks = dict( + param_scheduler=dict( + type='YOLOv5ParamSchedulerHook', + scheduler_type='linear', + lr_factor=lr_factor, + max_epochs=max_epochs, + warmup_epochs=3.0, + warmup_momentum=0.8, + warmup_bias_lr=0.1), + checkpoint=dict( + type='CheckpointHook', + interval=save_checkpoint_intervals, + save_best='auto', + max_keep_ckpts=max_keep_ckpts)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] + +val_evaluator = dict( + type='mmdet.CocoMetric', + proposal_nums=(100, 1, 10), + ann_file=data_root + val_ann_file, + metric='bbox') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', + max_epochs=max_epochs, + val_interval=save_checkpoint_intervals) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') diff --git a/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_x_mask-refine_syncbn_fast_8xb16-300e_coco.py b/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_x_mask-refine_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..33092aa6a47e6053c8ce83dcdf820828619077bc --- /dev/null +++ b/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_x_mask-refine_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,17 @@ +_base_ = './yolov5u_l_mask-refine_syncbn_fast_8xb16-300e_coco.py' + +# ========================modified parameters====================== +deepen_factor = 1.33 +widen_factor = 1.25 + +# =======================Unmodified in most cases================== +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git 
a/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_x_syncbn_fast_8xb16-300e_coco.py b/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_x_syncbn_fast_8xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..fd471fd46f3e19c4e0a4176703d4ab5eeee3aa0b --- /dev/null +++ b/third_party/mmyolo/configs/yolov5/yolov5u/yolov5u_x_syncbn_fast_8xb16-300e_coco.py @@ -0,0 +1,18 @@ +_base_ = './yolov5u_l_syncbn_fast_8xb16-300e_coco.py' + +# ========================modified parameters====================== +# TODO: Update the training hyperparameters +deepen_factor = 1.33 +widen_factor = 1.25 + +# =======================Unmodified in most cases================== +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/third_party/mmyolo/configs/yolov6/README.md b/third_party/mmyolo/configs/yolov6/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7ecda276988ff87702e902be8799d85b2dfdc79f --- /dev/null +++ b/third_party/mmyolo/configs/yolov6/README.md @@ -0,0 +1,53 @@ +# YOLOv6 + +> [YOLOv6: A Single-Stage Object Detection Framework for Industrial Applications](https://arxiv.org/abs/2209.02976) + + + +## Abstract + +For years, YOLO series have been de facto industry-level standard for efficient object detection. The YOLO community has prospered overwhelmingly to enrich its use in a multitude of hardware platforms and abundant scenarios. In this technical report, we strive to push its limits to the next level, stepping forward with an unwavering mindset for industry application. Considering the diverse requirements for speed and accuracy in the real environment, we extensively examine the up-to-date object detection advancements either from industry or academy. Specifically, we heavily assimilate ideas from recent network design, training strategies, testing techniques, quantization and optimization methods. On top of this, we integrate our thoughts and practice to build a suite of deployment-ready networks at various scales to accommodate diversified use cases. With the generous permission of YOLO authors, we name it YOLOv6. We also express our warm welcome to users and contributors for further enhancement. For a glimpse of performance, our YOLOv6-N hits 35.9% AP on COCO dataset at a throughput of 1234 FPS on an NVIDIA Tesla T4 GPU. YOLOv6-S strikes 43.5% AP at 495 FPS, outperforming other mainstream detectors at the same scale (YOLOv5-S, YOLOX-S and PPYOLOE-S). Our quantized version of YOLOv6-S even brings a new state-of-the-art 43.3% AP at 869 FPS. Furthermore, YOLOv6-M/L also achieves better accuracy performance (i.e., 49.5%/52.3%) than other detectors with the similar inference speed. We carefully conducted experiments to validate the effectiveness of each component. + +
+YOLOv6-s model structure
+
+YOLOv6-l model structure
+ +## Results and models + +### COCO + +| Backbone | Arch | Size | Epoch | SyncBN | AMP | Mem (GB) | Box AP | Config | Download | +| :------: | :--: | :--: | :---: | :----: | :-: | :------: | :----: | :-------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| YOLOv6-n | P5 | 640 | 400 | Yes | Yes | 6.04 | 36.2 | [config](./yolov6_n_syncbn_fast_8xb32-400e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_n_syncbn_fast_8xb32-400e_coco/yolov6_n_syncbn_fast_8xb32-400e_coco_20221030_202726-d99b2e82.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_n_syncbn_fast_8xb32-400e_coco/yolov6_n_syncbn_fast_8xb32-400e_coco_20221030_202726.log.json) | +| YOLOv6-t | P5 | 640 | 400 | Yes | Yes | 8.13 | 41.0 | [config](./yolov6_t_syncbn_fast_8xb32-400e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_t_syncbn_fast_8xb32-400e_coco/yolov6_t_syncbn_fast_8xb32-400e_coco_20221030_143755-cf0d278f.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_t_syncbn_fast_8xb32-400e_coco/yolov6_t_syncbn_fast_8xb32-400e_coco_20221030_143755.log.json) | +| YOLOv6-s | P5 | 640 | 400 | Yes | Yes | 8.88 | 44.0 | [config](./yolov6_s_syncbn_fast_8xb32-400e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco/yolov6_s_syncbn_fast_8xb32-400e_coco_20221102_203035-932e1d91.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco/yolov6_s_syncbn_fast_8xb32-400e_coco_20221102_203035.log.json) | +| YOLOv6-m | P5 | 640 | 300 | Yes | Yes | 16.69 | 48.4 | [config](./yolov6_m_syncbn_fast_8xb32-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_m_syncbn_fast_8xb32-300e_coco/yolov6_m_syncbn_fast_8xb32-300e_coco_20221109_182658-85bda3f4.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_m_syncbn_fast_8xb32-300e_coco/yolov6_m_syncbn_fast_8xb32-300e_coco_20221109_182658.log.json) | +| YOLOv6-l | P5 | 640 | 300 | Yes | Yes | 20.86 | 51.0 | [config](./yolov6_l_syncbn_fast_8xb32-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_l_syncbn_fast_8xb32-300e_coco/yolov6_l_syncbn_fast_8xb32-300e_coco_20221109_183156-91e3c447.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_l_syncbn_fast_8xb32-300e_coco/yolov6_l_syncbn_fast_8xb32-300e_coco_20221109_183156.log.json) | + +**Note**: + +1. The official m and l models use knowledge distillation, but our version does not support it, which will be implemented in [MMRazor](https://github.com/open-mmlab/mmrazor) in the future. +2. The performance is unstable and may fluctuate by about 0.3 mAP. +3. If users need the weight of 300 epoch for nano, tiny and small model, they can train according to the configs of 300 epoch provided by us, or convert the official weight according to the [converter script](../../tools/model_converters/). +4. We have observed that the [base model](https://github.com/meituan/YOLOv6/tree/main/configs/base) has been officially released in v6 recently. Although the accuracy has decreased, it is more efficient. We will also provide the base model configuration in the future. 
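+### Quick inference check
+
+The configs and checkpoints listed above can be exercised directly from Python. The snippet below is a minimal sketch, not an official example: it assumes the standard `mmdet` 3.x high-level APIs (`init_detector` / `inference_detector`) and `mmyolo.utils.register_all_modules`, and that the YOLOv6-s config and checkpoint from the table are available locally; the demo image path is a placeholder you must replace.
+
+```python
+# Hedged sketch: single-image inference with the pre-trained YOLOv6-s model.
+# Paths below are assumptions for illustration; point them at your local copies
+# of the config and checkpoint listed in the table above.
+from mmdet.apis import inference_detector, init_detector
+from mmyolo.utils import register_all_modules
+
+register_all_modules()  # make mmyolo models/transforms visible to the registry
+
+config_file = 'configs/yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco.py'
+checkpoint_file = 'yolov6_s_syncbn_fast_8xb32-400e_coco_20221102_203035-932e1d91.pth'
+
+model = init_detector(config_file, checkpoint_file, device='cuda:0')
+result = inference_detector(model, 'path/to/demo.jpg')  # placeholder image path
+print(result.pred_instances)  # bboxes, scores, labels for the detections
+```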
+ +## Citation + +```latex +@article{li2022yolov6, + title={YOLOv6: A Single-Stage Object Detection Framework for Industrial Applications}, + author={Li, Chuyi and Li, Lulu and Jiang, Hongliang and Weng, Kaiheng and Geng, Yifei and Li, Liang and Ke, Zaidan and Li, Qingyuan and Cheng, Meng and Nie, Weiqiang and others}, + journal={arXiv preprint arXiv:2209.02976}, + year={2022} +} +``` diff --git a/third_party/mmyolo/configs/yolov6/metafile.yml b/third_party/mmyolo/configs/yolov6/metafile.yml new file mode 100644 index 0000000000000000000000000000000000000000..df451526957c08d5956db33fe5e180cd7d5fcd66 --- /dev/null +++ b/third_party/mmyolo/configs/yolov6/metafile.yml @@ -0,0 +1,83 @@ +Collections: + - Name: YOLOv6 + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Nesterov + - Weight Decay + - AMP + - Synchronize BN + Training Resources: 8x A100 GPUs + Architecture: + - CSPDarkNet + - PAFPN + - RepVGG + Paper: + URL: https://arxiv.org/abs/2209.02976 + Title: 'YOLOv6: A Single-Stage Object Detection Framework for Industrial Applications' + README: configs/yolov6/README.md + Code: + URL: https://github.com/open-mmlab/mmyolo/blob/v0.0.1/mmyolo/models/detectors/yolo_detector.py#L12 + Version: v0.0.1 + +Models: + - Name: yolov6_s_syncbn_fast_8xb32-400e_coco + In Collection: YOLOv6 + Config: configs/yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco.py + Metadata: + Training Memory (GB): 8.88 + Epochs: 400 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.0 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco/yolov6_s_syncbn_fast_8xb32-400e_coco_20221102_203035-932e1d91.pth + - Name: yolov6_n_syncbn_fast_8xb32-400e_coco + In Collection: YOLOv6 + Config: configs/yolov6/yolov6_n_syncbn_fast_8xb32-400e_coco.py + Metadata: + Training Memory (GB): 6.04 + Epochs: 400 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 36.2 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_n_syncbn_fast_8xb32-400e_coco/yolov6_n_syncbn_fast_8xb32-400e_coco_20221030_202726-d99b2e82.pth + - Name: yolov6_t_syncbn_fast_8xb32-400e_coco + In Collection: YOLOv6 + Config: configs/yolov6/yolov6_t_syncbn_fast_8xb32-400e_coco.py + Metadata: + Training Memory (GB): 8.13 + Epochs: 400 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.0 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_t_syncbn_fast_8xb32-400e_coco/yolov6_t_syncbn_fast_8xb32-400e_coco_20221030_143755-cf0d278f.pth + - Name: yolov6_m_syncbn_fast_8xb32-300e_coco + In Collection: YOLOv6 + Config: configs/yolov6/yolov6_m_syncbn_fast_8xb32-300e_coco.py + Metadata: + Training Memory (GB): 16.69 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 48.4 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_m_syncbn_fast_8xb32-300e_coco/yolov6_m_syncbn_fast_8xb32-300e_coco_20221109_182658-85bda3f4.pth + - Name: yolov6_l_syncbn_fast_8xb32-300e_coco + In Collection: YOLOv6 + Config: configs/yolov6/yolov6_l_syncbn_fast_8xb32-300e_coco.py + Metadata: + Training Memory (GB): 20.86 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 51.0 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_l_syncbn_fast_8xb32-300e_coco/yolov6_l_syncbn_fast_8xb32-300e_coco_20221109_183156-91e3c447.pth diff --git a/third_party/mmyolo/configs/yolov6/yolov6_l_syncbn_fast_8xb32-300e_coco.py 
b/third_party/mmyolo/configs/yolov6/yolov6_l_syncbn_fast_8xb32-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..ad5ecf347e4aa0b3194b8be33d9c294915dd9e56 --- /dev/null +++ b/third_party/mmyolo/configs/yolov6/yolov6_l_syncbn_fast_8xb32-300e_coco.py @@ -0,0 +1,28 @@ +_base_ = './yolov6_m_syncbn_fast_8xb32-300e_coco.py' + +# ======================= Possible modified parameters ======================= +# -----model related----- +# The scaling factor that controls the depth of the network structure +deepen_factor = 1 +# The scaling factor that controls the width of the network structure +widen_factor = 1 + +# ============================== Unmodified in most cases =================== +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + hidden_ratio=1. / 2, + block_cfg=dict( + type='ConvWrapper', + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001)), + act_cfg=dict(type='SiLU', inplace=True)), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + hidden_ratio=1. / 2, + block_cfg=dict( + type='ConvWrapper', + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001)), + block_act_cfg=dict(type='SiLU', inplace=True)), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/third_party/mmyolo/configs/yolov6/yolov6_m_syncbn_fast_8xb32-300e_coco.py b/third_party/mmyolo/configs/yolov6/yolov6_m_syncbn_fast_8xb32-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..09811c8c06fb81a061ac4da7904c8d7d1e248411 --- /dev/null +++ b/third_party/mmyolo/configs/yolov6/yolov6_m_syncbn_fast_8xb32-300e_coco.py @@ -0,0 +1,62 @@ +_base_ = './yolov6_s_syncbn_fast_8xb32-300e_coco.py' + +# ======================= Possible modified parameters ======================= +# -----model related----- +# The scaling factor that controls the depth of the network structure +deepen_factor = 0.6 +# The scaling factor that controls the width of the network structure +widen_factor = 0.75 + +# -----train val related----- +affine_scale = 0.9 # YOLOv5RandomAffine scaling ratio + +# ============================== Unmodified in most cases =================== +model = dict( + backbone=dict( + type='YOLOv6CSPBep', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + hidden_ratio=2. / 3, + block_cfg=dict(type='RepVGGBlock'), + act_cfg=dict(type='ReLU', inplace=True)), + neck=dict( + type='YOLOv6CSPRepPAFPN', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + block_cfg=dict(type='RepVGGBlock'), + hidden_ratio=2. 
/ 3, + block_act_cfg=dict(type='ReLU', inplace=True)), + bbox_head=dict( + type='YOLOv6Head', head_module=dict(widen_factor=widen_factor))) + +mosaic_affine_pipeline = [ + dict( + type='Mosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + # img_scale is (width, height) + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114)) +] + +train_pipeline = [ + *_base_.pre_transform, *mosaic_affine_pipeline, + dict( + type='YOLOv5MixUp', + prob=0.1, + pre_transform=[*_base_.pre_transform, *mosaic_affine_pipeline]), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) diff --git a/third_party/mmyolo/configs/yolov6/yolov6_n_syncbn_fast_8xb32-300e_coco.py b/third_party/mmyolo/configs/yolov6/yolov6_n_syncbn_fast_8xb32-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..bc2db4b6c03277a7c62ba3ed505d54f54267328f --- /dev/null +++ b/third_party/mmyolo/configs/yolov6/yolov6_n_syncbn_fast_8xb32-300e_coco.py @@ -0,0 +1,21 @@ +_base_ = './yolov6_s_syncbn_fast_8xb32-300e_coco.py' + +# ======================= Possible modified parameters ======================= +# -----model related----- +# The scaling factor that controls the depth of the network structure +deepen_factor = 0.33 +# The scaling factor that controls the width of the network structure +widen_factor = 0.25 + +# -----train val related----- +lr_factor = 0.02 # Learning rate scaling factor + +# ============================== Unmodified in most cases =================== +model = dict( + backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + bbox_head=dict( + head_module=dict(widen_factor=widen_factor), + loss_bbox=dict(iou_mode='siou'))) + +default_hooks = dict(param_scheduler=dict(lr_factor=lr_factor)) diff --git a/third_party/mmyolo/configs/yolov6/yolov6_n_syncbn_fast_8xb32-400e_coco.py b/third_party/mmyolo/configs/yolov6/yolov6_n_syncbn_fast_8xb32-400e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..f66aa15fc447bce5f510a60bdda1914a8a7b5a76 --- /dev/null +++ b/third_party/mmyolo/configs/yolov6/yolov6_n_syncbn_fast_8xb32-400e_coco.py @@ -0,0 +1,21 @@ +_base_ = './yolov6_s_syncbn_fast_8xb32-400e_coco.py' + +# ======================= Possible modified parameters ======================= +# -----model related----- +# The scaling factor that controls the depth of the network structure +deepen_factor = 0.33 +# The scaling factor that controls the width of the network structure +widen_factor = 0.25 + +# -----train val related----- +lr_factor = 0.02 # Learning rate scaling factor + +# ============================== Unmodified in most cases =================== +model = dict( + backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + bbox_head=dict( + head_module=dict(widen_factor=widen_factor), + loss_bbox=dict(iou_mode='siou'))) + +default_hooks = dict(param_scheduler=dict(lr_factor=lr_factor)) diff --git a/third_party/mmyolo/configs/yolov6/yolov6_s_fast_1xb12-40e_cat.py 
b/third_party/mmyolo/configs/yolov6/yolov6_s_fast_1xb12-40e_cat.py new file mode 100644 index 0000000000000000000000000000000000000000..82578fccf7fffb8e4bb4ac21170543a7f71bc63e --- /dev/null +++ b/third_party/mmyolo/configs/yolov6/yolov6_s_fast_1xb12-40e_cat.py @@ -0,0 +1,56 @@ +_base_ = './yolov6_s_syncbn_fast_8xb32-400e_coco.py' + +data_root = './data/cat/' +class_name = ('cat', ) +num_classes = len(class_name) +metainfo = dict(classes=class_name, palette=[(20, 220, 60)]) + +max_epochs = 40 +train_batch_size_per_gpu = 12 +train_num_workers = 4 +num_last_epochs = 5 + +load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco/yolov6_s_syncbn_fast_8xb32-400e_coco_20221102_203035-932e1d91.pth' # noqa + +model = dict( + backbone=dict(frozen_stages=4), + bbox_head=dict(head_module=dict(num_classes=num_classes)), + train_cfg=dict( + initial_assigner=dict(num_classes=num_classes), + assigner=dict(num_classes=num_classes))) + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + data_root=data_root, + metainfo=metainfo, + ann_file='annotations/trainval.json', + data_prefix=dict(img='images/'))) + +val_dataloader = dict( + dataset=dict( + metainfo=metainfo, + data_root=data_root, + ann_file='annotations/test.json', + data_prefix=dict(img='images/'))) + +test_dataloader = val_dataloader + +val_evaluator = dict(ann_file=data_root + 'annotations/test.json') +test_evaluator = val_evaluator + +_base_.optim_wrapper.optimizer.batch_size_per_gpu = train_batch_size_per_gpu +_base_.custom_hooks[1].switch_epoch = max_epochs - num_last_epochs + +default_hooks = dict( + checkpoint=dict(interval=10, max_keep_ckpts=2, save_best='auto'), + # The warmup_mim_iter parameter is critical. + # The default value is 1000 which is not suitable for cat datasets. 
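+    # (For a dataset this small, a 1000-iteration warmup could span most of the
+    # 40-epoch run, so it is shortened to 10 iterations here.)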
+ param_scheduler=dict(max_epochs=max_epochs, warmup_mim_iter=10), + logger=dict(type='LoggerHook', interval=5)) +train_cfg = dict( + max_epochs=max_epochs, + val_interval=10, + dynamic_intervals=[(max_epochs - num_last_epochs, 1)]) +# visualizer = dict(vis_backends = [dict(type='LocalVisBackend'), dict(type='WandbVisBackend')]) # noqa diff --git a/third_party/mmyolo/configs/yolov6/yolov6_s_syncbn_fast_8xb32-300e_coco.py b/third_party/mmyolo/configs/yolov6/yolov6_s_syncbn_fast_8xb32-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..dbffaeb3362883d8a70f43c0722dd6c99b8b8352 --- /dev/null +++ b/third_party/mmyolo/configs/yolov6/yolov6_s_syncbn_fast_8xb32-300e_coco.py @@ -0,0 +1,33 @@ +_base_ = './yolov6_s_syncbn_fast_8xb32-400e_coco.py' + +# ======================= Frequently modified parameters ===================== +# -----train val related----- +# Base learning rate for optim_wrapper +max_epochs = 300 # Maximum training epochs +num_last_epochs = 15 # Last epoch number to switch training pipeline + +# ============================== Unmodified in most cases =================== +default_hooks = dict( + param_scheduler=dict( + type='YOLOv5ParamSchedulerHook', + scheduler_type='cosine', + lr_factor=0.01, + max_epochs=max_epochs)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - num_last_epochs, + switch_pipeline=_base_.train_pipeline_stage2) +] + +train_cfg = dict( + max_epochs=max_epochs, + dynamic_intervals=[(max_epochs - num_last_epochs, 1)]) diff --git a/third_party/mmyolo/configs/yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco.py b/third_party/mmyolo/configs/yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..eb564c07a906185f6702aac88cbb4d53493f168c --- /dev/null +++ b/third_party/mmyolo/configs/yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco.py @@ -0,0 +1,280 @@ +_base_ = ['../_base_/default_runtime.py', '../_base_/det_p5_tta.py'] + +# ======================= Frequently modified parameters ===================== +# -----data related----- +data_root = 'data/coco/' # Root path of data +# Path of train annotation file +train_ann_file = 'annotations/instances_train2017.json' +train_data_prefix = 'train2017/' # Prefix of train image path +# Path of val annotation file +val_ann_file = 'annotations/instances_val2017.json' +val_data_prefix = 'val2017/' # Prefix of val image path + +num_classes = 80 # Number of classes for classification +# Batch size of a single GPU during training +train_batch_size_per_gpu = 32 +# Worker to pre-fetch data for each single GPU during training +train_num_workers = 8 +# persistent_workers must be False if num_workers is 0 +persistent_workers = True + +# -----train val related----- +# Base learning rate for optim_wrapper +base_lr = 0.01 +max_epochs = 400 # Maximum training epochs +num_last_epochs = 15 # Last epoch number to switch training pipeline + +# ======================= Possible modified parameters ======================= +# -----data related----- +img_scale = (640, 640) # width, height +# Dataset type, this will be used to define the dataset +dataset_type = 'YOLOv5CocoDataset' +# Batch size of a single GPU during validation +val_batch_size_per_gpu = 1 +# Worker to pre-fetch data for each single GPU during validation +val_num_workers = 2 + +# Config of batch shapes. Only on val. 
+# It means not used if batch_shapes_cfg is None. +batch_shapes_cfg = dict( + type='BatchShapePolicy', + batch_size=val_batch_size_per_gpu, + img_size=img_scale[0], + size_divisor=32, + extra_pad_ratio=0.5) + +# -----model related----- +# The scaling factor that controls the depth of the network structure +deepen_factor = 0.33 +# The scaling factor that controls the width of the network structure +widen_factor = 0.5 + +# -----train val related----- +affine_scale = 0.5 # YOLOv5RandomAffine scaling ratio +lr_factor = 0.01 # Learning rate scaling factor +weight_decay = 0.0005 +# Save model checkpoint and validation intervals +save_epoch_intervals = 10 +# The maximum checkpoints to keep. +max_keep_ckpts = 3 +# Single-scale training is recommended to +# be turned on, which can speed up training. +env_cfg = dict(cudnn_benchmark=True) + +# ============================== Unmodified in most cases =================== +model = dict( + type='YOLODetector', + data_preprocessor=dict( + type='YOLOv5DetDataPreprocessor', + mean=[0., 0., 0.], + std=[255., 255., 255.], + bgr_to_rgb=True), + backbone=dict( + type='YOLOv6EfficientRep', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='ReLU', inplace=True)), + neck=dict( + type='YOLOv6RepPAFPN', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + in_channels=[256, 512, 1024], + out_channels=[128, 256, 512], + num_csp_blocks=12, + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='ReLU', inplace=True), + ), + bbox_head=dict( + type='YOLOv6Head', + head_module=dict( + type='YOLOv6HeadModule', + num_classes=num_classes, + in_channels=[128, 256, 512], + widen_factor=widen_factor, + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='SiLU', inplace=True), + featmap_strides=[8, 16, 32]), + loss_bbox=dict( + type='IoULoss', + iou_mode='giou', + bbox_format='xyxy', + reduction='mean', + loss_weight=2.5, + return_iou=False)), + train_cfg=dict( + initial_epoch=4, + initial_assigner=dict( + type='BatchATSSAssigner', + num_classes=num_classes, + topk=9, + iou_calculator=dict(type='mmdet.BboxOverlaps2D')), + assigner=dict( + type='BatchTaskAlignedAssigner', + num_classes=num_classes, + topk=13, + alpha=1, + beta=6), + ), + test_cfg=dict( + multi_label=True, + nms_pre=30000, + score_thr=0.001, + nms=dict(type='nms', iou_threshold=0.65), + max_per_img=300)) + +# The training pipeline of YOLOv6 is basically the same as YOLOv5. +# The difference is that Mosaic and RandomAffine will be closed in the last 15 epochs. 
# noqa +pre_transform = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True) +] + +train_pipeline = [ + *pre_transform, + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_translate_ratio=0.1, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114), + max_shear_degree=0.0), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_pipeline_stage2 = [ + *pre_transform, + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=True, + pad_val=dict(img=114)), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_translate_ratio=0.1, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + max_shear_degree=0.0, + ), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + collate_fn=dict(type='yolov5_collate'), + persistent_workers=persistent_workers, + pin_memory=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=train_ann_file, + data_prefix=dict(img=train_data_prefix), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline)) + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=False, + pad_val=dict(img=114)), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] + +val_dataloader = dict( + batch_size=val_batch_size_per_gpu, + num_workers=val_num_workers, + persistent_workers=persistent_workers, + pin_memory=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + test_mode=True, + data_prefix=dict(img=val_data_prefix), + ann_file=val_ann_file, + pipeline=test_pipeline, + batch_shapes_cfg=batch_shapes_cfg)) + +test_dataloader = val_dataloader + +# Optimizer and learning rate scheduler of YOLOv6 are basically the same as YOLOv5. # noqa +# The difference is that the scheduler_type of YOLOv6 is cosine. 
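+# (A sketch of the assumed 'cosine' schedule in YOLOv5ParamSchedulerHook:
+#  factor(epoch) = ((1 - cos(epoch * pi / max_epochs)) / 2) * (lr_factor - 1) + 1,
+#  i.e. the learning rate anneals from base_lr down to base_lr * lr_factor.)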
+optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict( + type='SGD', + lr=base_lr, + momentum=0.937, + weight_decay=weight_decay, + nesterov=True, + batch_size_per_gpu=train_batch_size_per_gpu), + constructor='YOLOv5OptimizerConstructor') + +default_hooks = dict( + param_scheduler=dict( + type='YOLOv5ParamSchedulerHook', + scheduler_type='cosine', + lr_factor=lr_factor, + max_epochs=max_epochs), + checkpoint=dict( + type='CheckpointHook', + interval=save_epoch_intervals, + max_keep_ckpts=max_keep_ckpts, + save_best='auto')) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - num_last_epochs, + switch_pipeline=train_pipeline_stage2) +] + +val_evaluator = dict( + type='mmdet.CocoMetric', + proposal_nums=(100, 1, 10), + ann_file=data_root + val_ann_file, + metric='bbox') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', + max_epochs=max_epochs, + val_interval=save_epoch_intervals, + dynamic_intervals=[(max_epochs - num_last_epochs, 1)]) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') diff --git a/third_party/mmyolo/configs/yolov6/yolov6_t_syncbn_fast_8xb32-300e_coco.py b/third_party/mmyolo/configs/yolov6/yolov6_t_syncbn_fast_8xb32-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..aa9da63f6984a9a23bc7ca78780db5be5a782399 --- /dev/null +++ b/third_party/mmyolo/configs/yolov6/yolov6_t_syncbn_fast_8xb32-300e_coco.py @@ -0,0 +1,17 @@ +_base_ = './yolov6_s_syncbn_fast_8xb32-300e_coco.py' + +# ======================= Possible modified parameters ======================= +# -----model related----- +# The scaling factor that controls the depth of the network structure +deepen_factor = 0.33 +# The scaling factor that controls the width of the network structure +widen_factor = 0.375 + +# ============================== Unmodified in most cases =================== +model = dict( + backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + bbox_head=dict( + type='YOLOv6Head', + head_module=dict(widen_factor=widen_factor), + loss_bbox=dict(iou_mode='siou'))) diff --git a/third_party/mmyolo/configs/yolov6/yolov6_t_syncbn_fast_8xb32-400e_coco.py b/third_party/mmyolo/configs/yolov6/yolov6_t_syncbn_fast_8xb32-400e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..75755555a58b45309df9213b6262cee030e41a9d --- /dev/null +++ b/third_party/mmyolo/configs/yolov6/yolov6_t_syncbn_fast_8xb32-400e_coco.py @@ -0,0 +1,17 @@ +_base_ = './yolov6_s_syncbn_fast_8xb32-400e_coco.py' + +# ======================= Possible modified parameters ======================= +# -----model related----- +# The scaling factor that controls the depth of the network structure +deepen_factor = 0.33 +# The scaling factor that controls the width of the network structure +widen_factor = 0.375 + +# ============================== Unmodified in most cases =================== +model = dict( + backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + bbox_head=dict( + type='YOLOv6Head', + head_module=dict(widen_factor=widen_factor), + loss_bbox=dict(iou_mode='siou'))) diff --git a/third_party/mmyolo/configs/yolov6/yolov6_v3_l_syncbn_fast_8xb32-300e_coco.py 
b/third_party/mmyolo/configs/yolov6/yolov6_v3_l_syncbn_fast_8xb32-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..7ed4b05538c077d6f49036c6399942d5f8b3f627 --- /dev/null +++ b/third_party/mmyolo/configs/yolov6/yolov6_v3_l_syncbn_fast_8xb32-300e_coco.py @@ -0,0 +1,28 @@ +_base_ = './yolov6_v3_m_syncbn_fast_8xb32-300e_coco.py' + +# ======================= Possible modified parameters ======================= +# -----model related----- +# The scaling factor that controls the depth of the network structure +deepen_factor = 1 +# The scaling factor that controls the width of the network structure +widen_factor = 1 + +# ============================== Unmodified in most cases =================== +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + hidden_ratio=1. / 2, + block_cfg=dict( + type='ConvWrapper', + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001)), + act_cfg=dict(type='SiLU', inplace=True)), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + hidden_ratio=1. / 2, + block_cfg=dict( + type='ConvWrapper', + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001)), + block_act_cfg=dict(type='SiLU', inplace=True)), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/third_party/mmyolo/configs/yolov6/yolov6_v3_m_syncbn_fast_8xb32-300e_coco.py b/third_party/mmyolo/configs/yolov6/yolov6_v3_m_syncbn_fast_8xb32-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..982b0c8865a557c9970c1f50e3b84acba89bf93f --- /dev/null +++ b/third_party/mmyolo/configs/yolov6/yolov6_v3_m_syncbn_fast_8xb32-300e_coco.py @@ -0,0 +1,63 @@ +_base_ = './yolov6_v3_s_syncbn_fast_8xb32-300e_coco.py' + +# ======================= Possible modified parameters ======================= +# -----model related----- +# The scaling factor that controls the depth of the network structure +deepen_factor = 0.6 +# The scaling factor that controls the width of the network structure +widen_factor = 0.75 + +# -----train val related----- +affine_scale = 0.9 # YOLOv5RandomAffine scaling ratio + +# ============================== Unmodified in most cases =================== +model = dict( + backbone=dict( + type='YOLOv6CSPBep', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + hidden_ratio=2. / 3, + block_cfg=dict(type='RepVGGBlock'), + act_cfg=dict(type='ReLU', inplace=True)), + neck=dict( + type='YOLOv6CSPRepBiPAFPN', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + block_cfg=dict(type='RepVGGBlock'), + hidden_ratio=2. 
/ 3, + block_act_cfg=dict(type='ReLU', inplace=True)), + bbox_head=dict( + type='YOLOv6Head', + head_module=dict(reg_max=16, widen_factor=widen_factor))) + +mosaic_affine_pipeline = [ + dict( + type='Mosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + # img_scale is (width, height) + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114)) +] + +train_pipeline = [ + *_base_.pre_transform, *mosaic_affine_pipeline, + dict( + type='YOLOv5MixUp', + prob=0.1, + pre_transform=[*_base_.pre_transform, *mosaic_affine_pipeline]), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) diff --git a/third_party/mmyolo/configs/yolov6/yolov6_v3_n_syncbn_fast_8xb32-300e_coco.py b/third_party/mmyolo/configs/yolov6/yolov6_v3_n_syncbn_fast_8xb32-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..96469f026e253b76a293f8f3ef81148af5d258a8 --- /dev/null +++ b/third_party/mmyolo/configs/yolov6/yolov6_v3_n_syncbn_fast_8xb32-300e_coco.py @@ -0,0 +1,21 @@ +_base_ = './yolov6_v3_s_syncbn_fast_8xb32-300e_coco.py' + +# ======================= Possible modified parameters ======================= +# -----model related----- +# The scaling factor that controls the depth of the network structure +deepen_factor = 0.33 +# The scaling factor that controls the width of the network structure +widen_factor = 0.25 + +# -----train val related----- +lr_factor = 0.02 # Learning rate scaling factor + +# ============================== Unmodified in most cases =================== +model = dict( + backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + bbox_head=dict( + head_module=dict(widen_factor=widen_factor), + loss_bbox=dict(iou_mode='siou'))) + +default_hooks = dict(param_scheduler=dict(lr_factor=lr_factor)) diff --git a/third_party/mmyolo/configs/yolov6/yolov6_v3_s_syncbn_fast_8xb32-300e_coco.py b/third_party/mmyolo/configs/yolov6/yolov6_v3_s_syncbn_fast_8xb32-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..8b0ad190139fa199918752cb8b531352db942fc0 --- /dev/null +++ b/third_party/mmyolo/configs/yolov6/yolov6_v3_s_syncbn_fast_8xb32-300e_coco.py @@ -0,0 +1,282 @@ +_base_ = ['../_base_/default_runtime.py', '../_base_/det_p5_tta.py'] + +# ======================= Frequently modified parameters ===================== +# -----data related----- +data_root = 'data/coco/' # Root path of data +# Path of train annotation file +train_ann_file = 'annotations/instances_train2017.json' +train_data_prefix = 'train2017/' # Prefix of train image path +# Path of val annotation file +val_ann_file = 'annotations/instances_val2017.json' +val_data_prefix = 'val2017/' # Prefix of val image path + +num_classes = 80 # Number of classes for classification +# Batch size of a single GPU during training +train_batch_size_per_gpu = 32 +# Worker to pre-fetch data for each single GPU during training +train_num_workers = 8 +# persistent_workers must be False if num_workers is 0 +persistent_workers = True + +# -----train val related----- +# Base learning rate for optim_wrapper 
+base_lr = 0.01 +max_epochs = 300 # Maximum training epochs +num_last_epochs = 15 # Last epoch number to switch training pipeline + +# ======================= Possible modified parameters ======================= +# -----data related----- +img_scale = (640, 640) # width, height +# Dataset type, this will be used to define the dataset +dataset_type = 'YOLOv5CocoDataset' +# Batch size of a single GPU during validation +val_batch_size_per_gpu = 1 +# Worker to pre-fetch data for each single GPU during validation +val_num_workers = 2 + +# Config of batch shapes. Only on val. +# It means not used if batch_shapes_cfg is None. +batch_shapes_cfg = dict( + type='BatchShapePolicy', + batch_size=val_batch_size_per_gpu, + img_size=img_scale[0], + size_divisor=32, + extra_pad_ratio=0.5) + +# -----model related----- +# The scaling factor that controls the depth of the network structure +deepen_factor = 0.33 +# The scaling factor that controls the width of the network structure +widen_factor = 0.5 + +# -----train val related----- +affine_scale = 0.5 # YOLOv5RandomAffine scaling ratio +lr_factor = 0.01 # Learning rate scaling factor +weight_decay = 0.0005 +# Save model checkpoint and validation intervals +save_epoch_intervals = 10 +# The maximum checkpoints to keep. +max_keep_ckpts = 3 +# Single-scale training is recommended to +# be turned on, which can speed up training. +env_cfg = dict(cudnn_benchmark=True) + +# ============================== Unmodified in most cases =================== +model = dict( + type='YOLODetector', + data_preprocessor=dict( + type='YOLOv5DetDataPreprocessor', + mean=[0., 0., 0.], + std=[255., 255., 255.], + bgr_to_rgb=True), + backbone=dict( + type='YOLOv6EfficientRep', + out_indices=[1, 2, 3, 4], + use_cspsppf=True, + deepen_factor=deepen_factor, + widen_factor=widen_factor, + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='ReLU', inplace=True)), + neck=dict( + type='YOLOv6RepBiPAFPN', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + in_channels=[128, 256, 512, 1024], + out_channels=[128, 256, 512], + num_csp_blocks=12, + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='ReLU', inplace=True), + ), + bbox_head=dict( + type='YOLOv6Head', + head_module=dict( + type='YOLOv6HeadModule', + num_classes=num_classes, + in_channels=[128, 256, 512], + widen_factor=widen_factor, + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='SiLU', inplace=True), + featmap_strides=[8, 16, 32]), + loss_bbox=dict( + type='IoULoss', + iou_mode='giou', + bbox_format='xyxy', + reduction='mean', + loss_weight=2.5, + return_iou=False)), + train_cfg=dict( + initial_epoch=4, + initial_assigner=dict( + type='BatchATSSAssigner', + num_classes=num_classes, + topk=9, + iou_calculator=dict(type='mmdet.BboxOverlaps2D')), + assigner=dict( + type='BatchTaskAlignedAssigner', + num_classes=num_classes, + topk=13, + alpha=1, + beta=6), + ), + test_cfg=dict( + multi_label=True, + nms_pre=30000, + score_thr=0.001, + nms=dict(type='nms', iou_threshold=0.65), + max_per_img=300)) + +# The training pipeline of YOLOv6 is basically the same as YOLOv5. +# The difference is that Mosaic and RandomAffine will be closed in the last 15 epochs. 
# noqa +pre_transform = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True) +] + +train_pipeline = [ + *pre_transform, + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_translate_ratio=0.1, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114), + max_shear_degree=0.0), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_pipeline_stage2 = [ + *pre_transform, + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=True, + pad_val=dict(img=114)), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_translate_ratio=0.1, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + max_shear_degree=0.0, + ), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + collate_fn=dict(type='yolov5_collate'), + persistent_workers=persistent_workers, + pin_memory=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=train_ann_file, + data_prefix=dict(img=train_data_prefix), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline)) + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=False, + pad_val=dict(img=114)), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] + +val_dataloader = dict( + batch_size=val_batch_size_per_gpu, + num_workers=val_num_workers, + persistent_workers=persistent_workers, + pin_memory=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + test_mode=True, + data_prefix=dict(img=val_data_prefix), + ann_file=val_ann_file, + pipeline=test_pipeline, + batch_shapes_cfg=batch_shapes_cfg)) + +test_dataloader = val_dataloader + +# Optimizer and learning rate scheduler of YOLOv6 are basically the same as YOLOv5. # noqa +# The difference is that the scheduler_type of YOLOv6 is cosine. 
+optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict( + type='SGD', + lr=base_lr, + momentum=0.937, + weight_decay=weight_decay, + nesterov=True, + batch_size_per_gpu=train_batch_size_per_gpu), + constructor='YOLOv5OptimizerConstructor') + +default_hooks = dict( + param_scheduler=dict( + type='YOLOv5ParamSchedulerHook', + scheduler_type='cosine', + lr_factor=lr_factor, + max_epochs=max_epochs), + checkpoint=dict( + type='CheckpointHook', + interval=save_epoch_intervals, + max_keep_ckpts=max_keep_ckpts, + save_best='auto')) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - num_last_epochs, + switch_pipeline=train_pipeline_stage2) +] + +val_evaluator = dict( + type='mmdet.CocoMetric', + proposal_nums=(100, 1, 10), + ann_file=data_root + val_ann_file, + metric='bbox') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', + max_epochs=max_epochs, + val_interval=save_epoch_intervals, + dynamic_intervals=[(max_epochs - num_last_epochs, 1)]) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') diff --git a/third_party/mmyolo/configs/yolov6/yolov6_v3_t_syncbn_fast_8xb32-300e_coco.py b/third_party/mmyolo/configs/yolov6/yolov6_v3_t_syncbn_fast_8xb32-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..d088b6b6629345f6f086f67373206b6d6f9b7e31 --- /dev/null +++ b/third_party/mmyolo/configs/yolov6/yolov6_v3_t_syncbn_fast_8xb32-300e_coco.py @@ -0,0 +1,17 @@ +_base_ = './yolov6_v3_s_syncbn_fast_8xb32-300e_coco.py' + +# ======================= Possible modified parameters ======================= +# -----model related----- +# The scaling factor that controls the depth of the network structure +deepen_factor = 0.33 +# The scaling factor that controls the width of the network structure +widen_factor = 0.375 + +# ============================== Unmodified in most cases =================== +model = dict( + backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + bbox_head=dict( + type='YOLOv6Head', + head_module=dict(widen_factor=widen_factor), + loss_bbox=dict(iou_mode='siou'))) diff --git a/third_party/mmyolo/configs/yolov7/README.md b/third_party/mmyolo/configs/yolov7/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f8f87f8358e25b7c8004aabfe7229d7941b6919a --- /dev/null +++ b/third_party/mmyolo/configs/yolov7/README.md @@ -0,0 +1,50 @@ +# YOLOv7 + +> [YOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time object detectors](https://arxiv.org/abs/2207.02696) + + + +## Abstract + +YOLOv7 surpasses all known object detectors in both speed and accuracy in the range from 5 FPS to 160 FPS and has the highest accuracy 56.8% AP among all known real-time object detectors with 30 FPS or higher on GPU V100. YOLOv7-E6 object detector (56 FPS V100, 55.9% AP) outperforms both transformer-based detector SWIN-L Cascade-Mask R-CNN (9.2 FPS A100, 53.9% AP) by 509% in speed and 2% in accuracy, and convolutional-based detector ConvNeXt-XL Cascade-Mask R-CNN (8.6 FPS A100, 55.2% AP) by 551% in speed and 0.7% AP in accuracy, as well as YOLOv7 outperforms: YOLOR, YOLOX, Scaled-YOLOv4, YOLOv5, DETR, Deformable DETR, DINO-5scale-R50, ViT-Adapter-B and many other object detectors in speed and accuracy. 
Moreover, we train YOLOv7 only on MS COCO dataset from scratch without using any other datasets or pre-trained weights. Source code is released in [this https URL](https://github.com/WongKinYiu/yolov7). + +
+
+*Figure: YOLOv7-l-P5 model structure*
+
+ +## Results and models + +### COCO + +| Backbone | Arch | Size | SyncBN | AMP | Mem (GB) | Box AP | Config | Download | +| :---------: | :--: | :--: | :----: | :-: | :------: | :----: | :----------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| YOLOv7-tiny | P5 | 640 | Yes | Yes | 2.7 | 37.5 | [config](./yolov7_tiny_syncbn_fast_8x16b-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_tiny_syncbn_fast_8x16b-300e_coco/yolov7_tiny_syncbn_fast_8x16b-300e_coco_20221126_102719-0ee5bbdf.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_tiny_syncbn_fast_8x16b-300e_coco/yolov7_tiny_syncbn_fast_8x16b-300e_coco_20221126_102719.log.json) | +| YOLOv7-l | P5 | 640 | Yes | Yes | 10.3 | 50.9 | [config](./yolov7_l_syncbn_fast_8x16b-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_l_syncbn_fast_8x16b-300e_coco/yolov7_l_syncbn_fast_8x16b-300e_coco_20221123_023601-8113c0eb.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_l_syncbn_fast_8x16b-300e_coco/yolov7_l_syncbn_fast_8x16b-300e_coco_20221123_023601.log.json) | +| YOLOv7-x | P5 | 640 | Yes | Yes | 13.7 | 52.8 | [config](./yolov7_x_syncbn_fast_8x16b-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_x_syncbn_fast_8x16b-300e_coco/yolov7_x_syncbn_fast_8x16b-300e_coco_20221124_215331-ef949a68.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_x_syncbn_fast_8x16b-300e_coco/yolov7_x_syncbn_fast_8x16b-300e_coco_20221124_215331.log.json) | +| YOLOv7-w | P6 | 1280 | Yes | Yes | 27.0 | 54.1 | [config](./yolov7_w-p6_syncbn_fast_8x16b-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_w-p6_syncbn_fast_8x16b-300e_coco/yolov7_w-p6_syncbn_fast_8x16b-300e_coco_20221123_053031-a68ef9d2.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_w-p6_syncbn_fast_8x16b-300e_coco/yolov7_w-p6_syncbn_fast_8x16b-300e_coco_20221123_053031.log.json) | +| YOLOv7-e | P6 | 1280 | Yes | Yes | 42.5 | 55.1 | [config](./yolov7_e-p6_syncbn_fast_8x16b-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_e-p6_syncbn_fast_8x16b-300e_coco/yolov7_e-p6_syncbn_fast_8x16b-300e_coco_20221126_102636-34425033.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_e-p6_syncbn_fast_8x16b-300e_coco/yolov7_e-p6_syncbn_fast_8x16b-300e_coco_20221126_102636.log.json) | + +**Note**: +In the official YOLOv7 code, the `random_perspective` data augmentation in COCO object detection task training uses mask annotation information, which leads to higher performance. Object detection should not use mask annotation, so only box annotation information is used in `MMYOLO`. We will use the mask annotation information in the instance segmentation task. + +1. The performance is unstable and may fluctuate by about 0.3 mAP. The performance shown above is the best model. +2. If users need the weight of `YOLOv7-e2e`, they can train according to the configs provided by us, or convert the official weight according to the [converter script](https://github.com/open-mmlab/mmyolo/blob/main/tools/model_converters/yolov7_to_mmyolo.py). +3. 
`fast` means that `YOLOv5DetDataPreprocessor` and `yolov5_collate` are used for data preprocessing, which is faster for training, but less flexible for multitasking. Recommended to use fast version config if you only care about object detection. +4. `SyncBN` means use SyncBN, `AMP` indicates training with mixed precision. +5. We use 8x A100 for training, and the single-GPU batch size is 16. This is different from the official code. + +## Citation + +```latex +@article{wang2022yolov7, + title={{YOLOv7}: Trainable bag-of-freebies sets new state-of-the-art for real-time object detectors}, + author={Wang, Chien-Yao and Bochkovskiy, Alexey and Liao, Hong-Yuan Mark}, + journal={arXiv preprint arXiv:2207.02696}, + year={2022} +} +``` diff --git a/third_party/mmyolo/configs/yolov7/metafile.yml b/third_party/mmyolo/configs/yolov7/metafile.yml new file mode 100644 index 0000000000000000000000000000000000000000..067ec6b45afefa2ae444b0343ad327b94f1507d2 --- /dev/null +++ b/third_party/mmyolo/configs/yolov7/metafile.yml @@ -0,0 +1,83 @@ +Collections: + - Name: YOLOv7 + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Nesterov + - Weight Decay + - AMP + - Synchronize BN + Training Resources: 8x A100 GPUs + Architecture: + - EELAN + - PAFPN + - RepVGG + Paper: + URL: https://arxiv.org/abs/2207.02696 + Title: 'YOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time object detectors' + README: configs/yolov7/README.md + Code: + URL: https://github.com/open-mmlab/mmyolo/blob/v0.0.1/mmyolo/models/detectors/yolo_detector.py#L12 + Version: v0.0.1 + +Models: + - Name: yolov7_tiny_syncbn_fast_8x16b-300e_coco + In Collection: YOLOv7 + Config: configs/yolov7/yolov7_tiny_syncbn_fast_8x16b-300e_coco.py + Metadata: + Training Memory (GB): 2.7 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.5 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_tiny_syncbn_fast_8x16b-300e_coco/yolov7_tiny_syncbn_fast_8x16b-300e_coco_20221126_102719-0ee5bbdf.pth + - Name: yolov7_l_syncbn_fast_8x16b-300e_coco + In Collection: YOLOv7 + Config: configs/yolov7/yolov7_l_syncbn_fast_8x16b-300e_coco.py + Metadata: + Training Memory (GB): 10.3 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 50.9 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_l_syncbn_fast_8x16b-300e_coco/yolov7_l_syncbn_fast_8x16b-300e_coco_20221123_023601-8113c0eb.pth + - Name: yolov7_x_syncbn_fast_8x16b-300e_coco + In Collection: YOLOv7 + Config: configs/yolov7/yolov7_x_syncbn_fast_8x16b-300e_coco.py + Metadata: + Training Memory (GB): 13.7 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 52.8 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_x_syncbn_fast_8x16b-300e_coco/yolov7_x_syncbn_fast_8x16b-300e_coco_20221124_215331-ef949a68.pth + - Name: yolov7_w-p6_syncbn_fast_8x16b-300e_coco + In Collection: YOLOv7 + Config: configs/yolov7/yolov7_w-p6_syncbn_fast_8x16b-300e_coco.py + Metadata: + Training Memory (GB): 27.0 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 54.1 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_w-p6_syncbn_fast_8x16b-300e_coco/yolov7_w-p6_syncbn_fast_8x16b-300e_coco_20221123_053031-a68ef9d2.pth + - Name: yolov7_e-p6_syncbn_fast_8x16b-300e_coco + In Collection: YOLOv7 + Config: configs/yolov7/yolov7_e-p6_syncbn_fast_8x16b-300e_coco.py + Metadata: + Training Memory (GB): 42.5 + Epochs: 300 + 
Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 55.1 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_e-p6_syncbn_fast_8x16b-300e_coco/yolov7_e-p6_syncbn_fast_8x16b-300e_coco_20221126_102636-34425033.pth diff --git a/third_party/mmyolo/configs/yolov7/yolov7_d-p6_syncbn_fast_8x16b-300e_coco.py b/third_party/mmyolo/configs/yolov7/yolov7_d-p6_syncbn_fast_8x16b-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..a68715264d59c16ef2b31010ede44310d97a3a7e --- /dev/null +++ b/third_party/mmyolo/configs/yolov7/yolov7_d-p6_syncbn_fast_8x16b-300e_coco.py @@ -0,0 +1,21 @@ +_base_ = './yolov7_w-p6_syncbn_fast_8x16b-300e_coco.py' + +model = dict( + backbone=dict(arch='D'), + neck=dict( + use_maxpool_in_downsample=True, + use_in_channels_in_downsample=True, + block_cfg=dict( + type='ELANBlock', + middle_ratio=0.4, + block_ratio=0.2, + num_blocks=6, + num_convs_in_block=1), + in_channels=[384, 768, 1152, 1536], + out_channels=[192, 384, 576, 768]), + bbox_head=dict( + head_module=dict( + in_channels=[192, 384, 576, 768], + main_out_channels=[384, 768, 1152, 1536], + aux_out_channels=[384, 768, 1152, 1536], + ))) diff --git a/third_party/mmyolo/configs/yolov7/yolov7_e-p6_syncbn_fast_8x16b-300e_coco.py b/third_party/mmyolo/configs/yolov7/yolov7_e-p6_syncbn_fast_8x16b-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..3d1463dc487e05eabfd3f586a28262017a9dc566 --- /dev/null +++ b/third_party/mmyolo/configs/yolov7/yolov7_e-p6_syncbn_fast_8x16b-300e_coco.py @@ -0,0 +1,19 @@ +_base_ = './yolov7_w-p6_syncbn_fast_8x16b-300e_coco.py' + +model = dict( + backbone=dict(arch='E'), + neck=dict( + use_maxpool_in_downsample=True, + use_in_channels_in_downsample=True, + block_cfg=dict( + type='ELANBlock', + middle_ratio=0.4, + block_ratio=0.2, + num_blocks=6, + num_convs_in_block=1), + in_channels=[320, 640, 960, 1280], + out_channels=[160, 320, 480, 640]), + bbox_head=dict( + head_module=dict( + in_channels=[160, 320, 480, 640], + main_out_channels=[320, 640, 960, 1280]))) diff --git a/third_party/mmyolo/configs/yolov7/yolov7_e2e-p6_syncbn_fast_8x16b-300e_coco.py b/third_party/mmyolo/configs/yolov7/yolov7_e2e-p6_syncbn_fast_8x16b-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..6af81051b72977410d5b51cf7a02a476d55ceb24 --- /dev/null +++ b/third_party/mmyolo/configs/yolov7/yolov7_e2e-p6_syncbn_fast_8x16b-300e_coco.py @@ -0,0 +1,20 @@ +_base_ = './yolov7_w-p6_syncbn_fast_8x16b-300e_coco.py' + +model = dict( + backbone=dict(arch='E2E'), + neck=dict( + use_maxpool_in_downsample=True, + use_in_channels_in_downsample=True, + block_cfg=dict( + type='EELANBlock', + num_elan_block=2, + middle_ratio=0.4, + block_ratio=0.2, + num_blocks=6, + num_convs_in_block=1), + in_channels=[320, 640, 960, 1280], + out_channels=[160, 320, 480, 640]), + bbox_head=dict( + head_module=dict( + in_channels=[160, 320, 480, 640], + main_out_channels=[320, 640, 960, 1280]))) diff --git a/third_party/mmyolo/configs/yolov7/yolov7_l_syncbn_fast_8x16b-300e_coco.py b/third_party/mmyolo/configs/yolov7/yolov7_l_syncbn_fast_8x16b-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..e8a756c27e5366e3a83658132b0e330a5f68ad22 --- /dev/null +++ b/third_party/mmyolo/configs/yolov7/yolov7_l_syncbn_fast_8x16b-300e_coco.py @@ -0,0 +1,324 @@ +_base_ = ['../_base_/default_runtime.py', '../_base_/det_p5_tta.py'] + +# ========================Frequently modified parameters====================== +# -----data 
related----- +data_root = 'data/coco/' # Root path of data +# Path of train annotation file +train_ann_file = 'annotations/instances_train2017.json' +train_data_prefix = 'train2017/' # Prefix of train image path +# Path of val annotation file +val_ann_file = 'annotations/instances_val2017.json' +val_data_prefix = 'val2017/' # Prefix of val image path + +num_classes = 80 # Number of classes for classification +# Batch size of a single GPU during training +train_batch_size_per_gpu = 16 +# Worker to pre-fetch data for each single GPU during training +train_num_workers = 8 +# persistent_workers must be False if num_workers is 0 +persistent_workers = True + +# -----model related----- +# Basic size of multi-scale prior box +anchors = [ + [(12, 16), (19, 36), (40, 28)], # P3/8 + [(36, 75), (76, 55), (72, 146)], # P4/16 + [(142, 110), (192, 243), (459, 401)] # P5/32 +] +# -----train val related----- +# Base learning rate for optim_wrapper. Corresponding to 8xb16=128 bs +base_lr = 0.01 +max_epochs = 300 # Maximum training epochs + +num_epoch_stage2 = 30 # The last 30 epochs switch evaluation interval +val_interval_stage2 = 1 # Evaluation interval + +model_test_cfg = dict( + # The config of multi-label for multi-class prediction. + multi_label=True, + # The number of boxes before NMS. + nms_pre=30000, + score_thr=0.001, # Threshold to filter out boxes. + nms=dict(type='nms', iou_threshold=0.65), # NMS type and threshold + max_per_img=300) # Max number of detections of each image + +# ========================Possible modified parameters======================== +# -----data related----- +img_scale = (640, 640) # width, height +# Dataset type, this will be used to define the dataset +dataset_type = 'YOLOv5CocoDataset' +# Batch size of a single GPU during validation +val_batch_size_per_gpu = 1 +# Worker to pre-fetch data for each single GPU during validation +val_num_workers = 2 + +# Config of batch shapes. Only on val. +# It means not used if batch_shapes_cfg is None. +batch_shapes_cfg = dict( + type='BatchShapePolicy', + batch_size=val_batch_size_per_gpu, + img_size=img_scale[0], + # The image scale of padding should be divided by pad_size_divisor + size_divisor=32, + # Additional paddings for pixel scale + extra_pad_ratio=0.5) + +# -----model related----- +strides = [8, 16, 32] # Strides of multi-scale prior box +num_det_layers = 3 # The number of model output scales +norm_cfg = dict(type='BN', momentum=0.03, eps=0.001) + +# Data augmentation +max_translate_ratio = 0.2 # YOLOv5RandomAffine +scaling_ratio_range = (0.1, 2.0) # YOLOv5RandomAffine +mixup_prob = 0.15 # YOLOv5MixUp +randchoice_mosaic_prob = [0.8, 0.2] +mixup_alpha = 8.0 # YOLOv5MixUp +mixup_beta = 8.0 # YOLOv5MixUp + +# -----train val related----- +loss_cls_weight = 0.3 +loss_bbox_weight = 0.05 +loss_obj_weight = 0.7 +# BatchYOLOv7Assigner params +simota_candidate_topk = 10 +simota_iou_weight = 3.0 +simota_cls_weight = 1.0 +prior_match_thr = 4. # Priori box matching threshold +obj_level_weights = [4., 1., + 0.4] # The obj loss weights of the three output layers + +lr_factor = 0.1 # Learning rate scaling factor +weight_decay = 0.0005 +save_epoch_intervals = 1 # Save model checkpoint and validation intervals +max_keep_ckpts = 3 # The maximum checkpoints to keep. + +# Single-scale training is recommended to +# be turned on, which can speed up training. 
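+# (cudnn_benchmark caches the fastest convolution algorithms per input shape,
+# so it helps most when input shapes stay fixed, i.e. single-scale training.)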
+env_cfg = dict(cudnn_benchmark=True) + +# ===============================Unmodified in most cases==================== +model = dict( + type='YOLODetector', + data_preprocessor=dict( + type='YOLOv5DetDataPreprocessor', + mean=[0., 0., 0.], + std=[255., 255., 255.], + bgr_to_rgb=True), + backbone=dict( + type='YOLOv7Backbone', + arch='L', + norm_cfg=norm_cfg, + act_cfg=dict(type='SiLU', inplace=True)), + neck=dict( + type='YOLOv7PAFPN', + block_cfg=dict( + type='ELANBlock', + middle_ratio=0.5, + block_ratio=0.25, + num_blocks=4, + num_convs_in_block=1), + upsample_feats_cat_first=False, + in_channels=[512, 1024, 1024], + # The real output channel will be multiplied by 2 + out_channels=[128, 256, 512], + norm_cfg=norm_cfg, + act_cfg=dict(type='SiLU', inplace=True)), + bbox_head=dict( + type='YOLOv7Head', + head_module=dict( + type='YOLOv7HeadModule', + num_classes=num_classes, + in_channels=[256, 512, 1024], + featmap_strides=strides, + num_base_priors=3), + prior_generator=dict( + type='mmdet.YOLOAnchorGenerator', + base_sizes=anchors, + strides=strides), + # scaled based on number of detection layers + loss_cls=dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='mean', + loss_weight=loss_cls_weight * + (num_classes / 80 * 3 / num_det_layers)), + loss_bbox=dict( + type='IoULoss', + iou_mode='ciou', + bbox_format='xywh', + reduction='mean', + loss_weight=loss_bbox_weight * (3 / num_det_layers), + return_iou=True), + loss_obj=dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='mean', + loss_weight=loss_obj_weight * + ((img_scale[0] / 640)**2 * 3 / num_det_layers)), + prior_match_thr=prior_match_thr, + obj_level_weights=obj_level_weights, + # BatchYOLOv7Assigner params + simota_candidate_topk=simota_candidate_topk, + simota_iou_weight=simota_iou_weight, + simota_cls_weight=simota_cls_weight), + test_cfg=model_test_cfg) + +pre_transform = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True) +] + +mosiac4_pipeline = [ + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_translate_ratio=max_translate_ratio, # note + scaling_ratio_range=scaling_ratio_range, # note + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)), +] + +mosiac9_pipeline = [ + dict( + type='Mosaic9', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_translate_ratio=max_translate_ratio, # note + scaling_ratio_range=scaling_ratio_range, # note + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)), +] + +randchoice_mosaic_pipeline = dict( + type='RandomChoice', + transforms=[mosiac4_pipeline, mosiac9_pipeline], + prob=randchoice_mosaic_prob) + +train_pipeline = [ + *pre_transform, + randchoice_mosaic_pipeline, + dict( + type='YOLOv5MixUp', + alpha=mixup_alpha, # note + beta=mixup_beta, # note + prob=mixup_prob, + pre_transform=[*pre_transform, randchoice_mosaic_pipeline]), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + 
num_workers=train_num_workers, + persistent_workers=persistent_workers, + pin_memory=True, + sampler=dict(type='DefaultSampler', shuffle=True), + collate_fn=dict(type='yolov5_collate'), # FASTER + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=train_ann_file, + data_prefix=dict(img=train_data_prefix), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline)) + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=False, + pad_val=dict(img=114)), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] + +val_dataloader = dict( + batch_size=val_batch_size_per_gpu, + num_workers=val_num_workers, + persistent_workers=persistent_workers, + pin_memory=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + test_mode=True, + data_prefix=dict(img=val_data_prefix), + ann_file=val_ann_file, + pipeline=test_pipeline, + batch_shapes_cfg=batch_shapes_cfg)) + +test_dataloader = val_dataloader + +param_scheduler = None +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict( + type='SGD', + lr=base_lr, + momentum=0.937, + weight_decay=weight_decay, + nesterov=True, + batch_size_per_gpu=train_batch_size_per_gpu), + constructor='YOLOv7OptimWrapperConstructor') + +default_hooks = dict( + param_scheduler=dict( + type='YOLOv5ParamSchedulerHook', + scheduler_type='cosine', + lr_factor=lr_factor, # note + max_epochs=max_epochs), + checkpoint=dict( + type='CheckpointHook', + save_param_scheduler=False, + interval=save_epoch_intervals, + save_best='auto', + max_keep_ckpts=max_keep_ckpts)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49) +] + +val_evaluator = dict( + type='mmdet.CocoMetric', + proposal_nums=(100, 1, 10), # Can be accelerated + ann_file=data_root + val_ann_file, + metric='bbox') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', + max_epochs=max_epochs, + val_interval=save_epoch_intervals, + dynamic_intervals=[(max_epochs - num_epoch_stage2, val_interval_stage2)]) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') diff --git a/third_party/mmyolo/configs/yolov7/yolov7_tiny_fast_1xb12-40e_cat.py b/third_party/mmyolo/configs/yolov7/yolov7_tiny_fast_1xb12-40e_cat.py new file mode 100644 index 0000000000000000000000000000000000000000..eb0446760eeb39951ad2bf6a8cbb1fe3cc19870a --- /dev/null +++ b/third_party/mmyolo/configs/yolov7/yolov7_tiny_fast_1xb12-40e_cat.py @@ -0,0 +1,56 @@ +_base_ = 'yolov7_tiny_syncbn_fast_8x16b-300e_coco.py' + +data_root = './data/cat/' +class_name = ('cat', ) +num_classes = len(class_name) +metainfo = dict(classes=class_name, palette=[(20, 220, 60)]) + +anchors = [ + [(68, 69), (154, 91), (143, 162)], # P3/8 + [(242, 160), (189, 287), (391, 207)], # P4/16 + [(353, 337), (539, 341), (443, 432)] # P5/32 +] + +max_epochs = 40 +train_batch_size_per_gpu = 12 +train_num_workers = 4 + +load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_tiny_syncbn_fast_8x16b-300e_coco/yolov7_tiny_syncbn_fast_8x16b-300e_coco_20221126_102719-0ee5bbdf.pth' # noqa + +model = dict( + 
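+    # (frozen_stages=4 is assumed to freeze the whole backbone, so fine-tuning on
+    # this small dataset only updates the neck and head.)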
backbone=dict(frozen_stages=4), + bbox_head=dict( + head_module=dict(num_classes=num_classes), + prior_generator=dict(base_sizes=anchors))) + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + data_root=data_root, + metainfo=metainfo, + ann_file='annotations/trainval.json', + data_prefix=dict(img='images/'))) + +val_dataloader = dict( + dataset=dict( + metainfo=metainfo, + data_root=data_root, + ann_file='annotations/test.json', + data_prefix=dict(img='images/'))) + +test_dataloader = val_dataloader + +_base_.optim_wrapper.optimizer.batch_size_per_gpu = train_batch_size_per_gpu + +val_evaluator = dict(ann_file=data_root + 'annotations/test.json') +test_evaluator = val_evaluator + +default_hooks = dict( + checkpoint=dict(interval=10, max_keep_ckpts=2, save_best='auto'), + # The warmup_mim_iter parameter is critical. + # The default value is 1000 which is not suitable for cat datasets. + param_scheduler=dict(max_epochs=max_epochs, warmup_mim_iter=10), + logger=dict(type='LoggerHook', interval=5)) +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +# visualizer = dict(vis_backends = [dict(type='LocalVisBackend'), dict(type='WandbVisBackend')]) # noqa diff --git a/third_party/mmyolo/configs/yolov7/yolov7_tiny_syncbn_fast_8x16b-300e_coco.py b/third_party/mmyolo/configs/yolov7/yolov7_tiny_syncbn_fast_8x16b-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..b9e9f10e2926a840d2af7a9e27b0e2047710343d --- /dev/null +++ b/third_party/mmyolo/configs/yolov7/yolov7_tiny_syncbn_fast_8x16b-300e_coco.py @@ -0,0 +1,98 @@ +_base_ = './yolov7_l_syncbn_fast_8x16b-300e_coco.py' + +# ========================modified parameters======================== + +# -----model related----- +# Data augmentation +max_translate_ratio = 0.1 # YOLOv5RandomAffine +scaling_ratio_range = (0.5, 1.6) # YOLOv5RandomAffine +mixup_prob = 0.05 # YOLOv5MixUp +randchoice_mosaic_prob = [0.8, 0.2] +mixup_alpha = 8.0 # YOLOv5MixUp +mixup_beta = 8.0 # YOLOv5MixUp + +# -----train val related----- +loss_cls_weight = 0.5 +loss_obj_weight = 1.0 + +lr_factor = 0.01 # Learning rate scaling factor +# ===============================Unmodified in most cases==================== +num_classes = _base_.num_classes +num_det_layers = _base_.num_det_layers +img_scale = _base_.img_scale +pre_transform = _base_.pre_transform +model = dict( + backbone=dict( + arch='Tiny', act_cfg=dict(type='LeakyReLU', negative_slope=0.1)), + neck=dict( + is_tiny_version=True, + in_channels=[128, 256, 512], + out_channels=[64, 128, 256], + block_cfg=dict( + _delete_=True, type='TinyDownSampleBlock', middle_ratio=0.25), + act_cfg=dict(type='LeakyReLU', negative_slope=0.1), + use_repconv_outs=False), + bbox_head=dict( + head_module=dict(in_channels=[128, 256, 512]), + loss_cls=dict(loss_weight=loss_cls_weight * + (num_classes / 80 * 3 / num_det_layers)), + loss_obj=dict(loss_weight=loss_obj_weight * + ((img_scale[0] / 640)**2 * 3 / num_det_layers)))) + +mosiac4_pipeline = [ + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_translate_ratio=max_translate_ratio, # change + scaling_ratio_range=scaling_ratio_range, # change + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)), +] + +mosiac9_pipeline = [ + dict( + type='Mosaic9', + img_scale=img_scale, + pad_val=114.0, + 
pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_translate_ratio=max_translate_ratio, # change + scaling_ratio_range=scaling_ratio_range, # change + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)), +] + +randchoice_mosaic_pipeline = dict( + type='RandomChoice', + transforms=[mosiac4_pipeline, mosiac9_pipeline], + prob=randchoice_mosaic_prob) + +train_pipeline = [ + *pre_transform, + randchoice_mosaic_pipeline, + dict( + type='YOLOv5MixUp', + alpha=mixup_alpha, + beta=mixup_beta, + prob=mixup_prob, # change + pre_transform=[*pre_transform, randchoice_mosaic_pipeline]), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +default_hooks = dict(param_scheduler=dict(lr_factor=lr_factor)) diff --git a/third_party/mmyolo/configs/yolov7/yolov7_w-p6_syncbn_fast_8x16b-300e_coco.py b/third_party/mmyolo/configs/yolov7/yolov7_w-p6_syncbn_fast_8x16b-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..9758b871785050ef41303082aab745a6568e373b --- /dev/null +++ b/third_party/mmyolo/configs/yolov7/yolov7_w-p6_syncbn_fast_8x16b-300e_coco.py @@ -0,0 +1,182 @@ +_base_ = './yolov7_l_syncbn_fast_8x16b-300e_coco.py' + +# ========================modified parameters======================== +# -----data related----- +img_scale = (1280, 1280) # height, width +num_classes = 80 # Number of classes for classification +# Config of batch shapes. Only on val +# It means not used if batch_shapes_cfg is None. +batch_shapes_cfg = dict( + img_size=img_scale[ + 0], # The image scale of padding should be divided by pad_size_divisor + size_divisor=64) # Additional paddings for pixel scale +tta_img_scales = [(1280, 1280), (1024, 1024), (1536, 1536)] + +# -----model related----- +# Basic size of multi-scale prior box +anchors = [ + [(19, 27), (44, 40), (38, 94)], # P3/8 + [(96, 68), (86, 152), (180, 137)], # P4/16 + [(140, 301), (303, 264), (238, 542)], # P5/32 + [(436, 615), (739, 380), (925, 792)] # P6/64 +] +strides = [8, 16, 32, 64] # Strides of multi-scale prior box +num_det_layers = 4 # # The number of model output scales +norm_cfg = dict(type='BN', momentum=0.03, eps=0.001) + +# Data augmentation +max_translate_ratio = 0.2 # YOLOv5RandomAffine +scaling_ratio_range = (0.1, 2.0) # YOLOv5RandomAffine +mixup_prob = 0.15 # YOLOv5MixUp +randchoice_mosaic_prob = [0.8, 0.2] +mixup_alpha = 8.0 # YOLOv5MixUp +mixup_beta = 8.0 # YOLOv5MixUp + +# -----train val related----- +loss_cls_weight = 0.3 +loss_bbox_weight = 0.05 +loss_obj_weight = 0.7 +obj_level_weights = [4.0, 1.0, 0.25, 0.06] +simota_candidate_topk = 20 + +# The only difference between P6 and P5 in terms of +# hyperparameters is lr_factor +lr_factor = 0.2 + +# ===============================Unmodified in most cases==================== +pre_transform = _base_.pre_transform + +model = dict( + backbone=dict(arch='W', out_indices=(2, 3, 4, 5)), + neck=dict( + in_channels=[256, 512, 768, 1024], + out_channels=[128, 256, 384, 512], + use_maxpool_in_downsample=False, + use_repconv_outs=False), + bbox_head=dict( + head_module=dict( + type='YOLOv7p6HeadModule', + in_channels=[128, 256, 384, 512], + featmap_strides=strides, + norm_cfg=norm_cfg, + act_cfg=dict(type='SiLU', inplace=True)), + prior_generator=dict(base_sizes=anchors, 
strides=strides), + simota_candidate_topk=simota_candidate_topk, # note + # scaled based on number of detection layers + loss_cls=dict(loss_weight=loss_cls_weight * + (num_classes / 80 * 3 / num_det_layers)), + loss_bbox=dict(loss_weight=loss_bbox_weight * (3 / num_det_layers)), + loss_obj=dict(loss_weight=loss_obj_weight * + ((img_scale[0] / 640)**2 * 3 / num_det_layers)), + obj_level_weights=obj_level_weights)) + +mosiac4_pipeline = [ + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_translate_ratio=max_translate_ratio, # note + scaling_ratio_range=scaling_ratio_range, # note + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)), +] + +mosiac9_pipeline = [ + dict( + type='Mosaic9', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_translate_ratio=max_translate_ratio, # note + scaling_ratio_range=scaling_ratio_range, # note + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)), +] + +randchoice_mosaic_pipeline = dict( + type='RandomChoice', + transforms=[mosiac4_pipeline, mosiac9_pipeline], + prob=randchoice_mosaic_prob) + +train_pipeline = [ + *pre_transform, + randchoice_mosaic_pipeline, + dict( + type='YOLOv5MixUp', + alpha=mixup_alpha, # note + beta=mixup_beta, # note + prob=mixup_prob, + pre_transform=[*pre_transform, randchoice_mosaic_pipeline]), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=False, + pad_val=dict(img=114)), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] +val_dataloader = dict( + dataset=dict(pipeline=test_pipeline, batch_shapes_cfg=batch_shapes_cfg)) +test_dataloader = val_dataloader + +default_hooks = dict(param_scheduler=dict(lr_factor=lr_factor)) + +# Config for Test Time Augmentation. (TTA) +_multiscale_resize_transforms = [ + dict( + type='Compose', + transforms=[ + dict(type='YOLOv5KeepRatioResize', scale=s), + dict( + type='LetterResize', + scale=s, + allow_scale_up=False, + pad_val=dict(img=114)) + ]) for s in tta_img_scales +] + +tta_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict( + type='TestTimeAug', + transforms=[ + _multiscale_resize_transforms, + [ + dict(type='mmdet.RandomFlip', prob=1.), + dict(type='mmdet.RandomFlip', prob=0.) 
+ ], [dict(type='mmdet.LoadAnnotations', with_bbox=True)], + [ + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'flip', + 'flip_direction')) + ] + ]) +] diff --git a/third_party/mmyolo/configs/yolov7/yolov7_x_syncbn_fast_8x16b-300e_coco.py b/third_party/mmyolo/configs/yolov7/yolov7_x_syncbn_fast_8x16b-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..9929705962c918392af12dd0a8275321f89fd361 --- /dev/null +++ b/third_party/mmyolo/configs/yolov7/yolov7_x_syncbn_fast_8x16b-300e_coco.py @@ -0,0 +1,15 @@ +_base_ = './yolov7_l_syncbn_fast_8x16b-300e_coco.py' + +model = dict( + backbone=dict(arch='X'), + neck=dict( + in_channels=[640, 1280, 1280], + out_channels=[160, 320, 640], + block_cfg=dict( + type='ELANBlock', + middle_ratio=0.4, + block_ratio=0.4, + num_blocks=3, + num_convs_in_block=2), + use_repconv_outs=False), + bbox_head=dict(head_module=dict(in_channels=[320, 640, 1280]))) diff --git a/third_party/mmyolo/configs/yolov8/README.md b/third_party/mmyolo/configs/yolov8/README.md new file mode 100644 index 0000000000000000000000000000000000000000..766aa99163c97bff5206724febd41c3e484faa55 --- /dev/null +++ b/third_party/mmyolo/configs/yolov8/README.md @@ -0,0 +1,45 @@ +# YOLOv8 + + + +## Abstract + +Ultralytics YOLOv8, developed by Ultralytics, is a cutting-edge, state-of-the-art (SOTA) model that builds upon the success of previous YOLO versions and introduces new features and improvements to further boost performance and flexibility. YOLOv8 is designed to be fast, accurate, and easy to use, making it an excellent choice for a wide range of object detection, image segmentation and image classification tasks. + +
+ +YOLOv8 performance +
+ +
+ +YOLOv8-P5 model structure +
+ +## Results and models + +### COCO + +| Backbone | Arch | size | Mask Refine | SyncBN | AMP | Mem (GB) | box AP | TTA box AP | Config | Download | +| :------: | :--: | :--: | :---------: | :----: | :-: | :------: | :---------: | :--------: | :-------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| YOLOv8-n | P5 | 640 | No | Yes | Yes | 2.8 | 37.2 | | [config](./yolov8_n_syncbn_fast_8xb16-500e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_n_syncbn_fast_8xb16-500e_coco/yolov8_n_syncbn_fast_8xb16-500e_coco_20230114_131804-88c11cdb.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_n_syncbn_fast_8xb16-500e_coco/yolov8_n_syncbn_fast_8xb16-500e_coco_20230114_131804.log.json) | +| YOLOv8-n | P5 | 640 | Yes | Yes | Yes | 2.5 | 37.4 (+0.2) | 39.9 | [config](./yolov8_n_mask-refine_syncbn_fast_8xb16-500e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_n_mask-refine_syncbn_fast_8xb16-500e_coco/yolov8_n_mask-refine_syncbn_fast_8xb16-500e_coco_20230216_101206-b975b1cd.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_n_mask-refine_syncbn_fast_8xb16-500e_coco/yolov8_n_mask-refine_syncbn_fast_8xb16-500e_coco_20230216_101206.log.json) | +| YOLOv8-s | P5 | 640 | No | Yes | Yes | 4.0 | 44.2 | | [config](./yolov8_s_syncbn_fast_8xb16-500e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_s_syncbn_fast_8xb16-500e_coco/yolov8_s_syncbn_fast_8xb16-500e_coco_20230117_180101-5aa5f0f1.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_s_syncbn_fast_8xb16-500e_coco/yolov8_s_syncbn_fast_8xb16-500e_coco_20230117_180101.log.json) | +| YOLOv8-s | P5 | 640 | Yes | Yes | Yes | 4.0 | 45.1 (+0.9) | 46.8 | [config](./yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco/yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco_20230216_095938-ce3c1b3f.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco/yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco_20230216_095938.log.json) | +| YOLOv8-m | P5 | 640 | No | Yes | Yes | 7.2 | 49.8 | | [config](./yolov8_m_syncbn_fast_8xb16-500e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_m_syncbn_fast_8xb16-500e_coco/yolov8_m_syncbn_fast_8xb16-500e_coco_20230115_192200-c22e560a.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_m_syncbn_fast_8xb16-500e_coco/yolov8_m_syncbn_fast_8xb16-500e_coco_20230115_192200.log.json) | +| YOLOv8-m | P5 | 640 | Yes | Yes | Yes | 7.0 | 50.6 (+0.8) | 52.3 | [config](./yolov8_m_mask-refine_syncbn_fast_8xb16-500e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_m_mask-refine_syncbn_fast_8xb16-500e_coco/yolov8_m_mask-refine_syncbn_fast_8xb16-500e_coco_20230216_223400-f40abfcd.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_m_mask-refine_syncbn_fast_8xb16-500e_coco/yolov8_m_mask-refine_syncbn_fast_8xb16-500e_coco_20230216_223400.log.json) | +| YOLOv8-l | P5 | 640 | No | Yes | Yes | 9.8 | 52.1 | | 
[config](./yolov8_l_syncbn_fast_8xb16-500e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_l_syncbn_fast_8xb16-500e_coco/yolov8_l_syncbn_fast_8xb16-500e_coco_20230217_182526-189611b6.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_l_syncbn_fast_8xb16-500e_coco/yolov8_l_syncbn_fast_8xb16-500e_coco_20230217_182526.log.json) | +| YOLOv8-l | P5 | 640 | Yes | Yes | Yes | 9.1 | 53.0 (+0.9) | 54.4 | [config](./yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco/yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco_20230217_120100-5881dec4.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco/yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco_20230217_120100.log.json) | +| YOLOv8-x | P5 | 640 | No | Yes | Yes | 12.2 | 52.7 | | [config](./yolov8_x_syncbn_fast_8xb16-500e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_x_syncbn_fast_8xb16-500e_coco/yolov8_x_syncbn_fast_8xb16-500e_coco_20230218_023338-5674673c.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_x_syncbn_fast_8xb16-500e_coco/yolov8_x_syncbn_fast_8xb16-500e_coco_20230218_023338.log.json) | +| YOLOv8-x | P5 | 640 | Yes | Yes | Yes | 12.4 | 54.0 (+1.3) | 55.0 | [config](./yolov8_x_mask-refine_syncbn_fast_8xb16-500e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_x_mask-refine_syncbn_fast_8xb16-500e_coco/yolov8_x_mask-refine_syncbn_fast_8xb16-500e_coco_20230217_120411-079ca8d1.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_x_mask-refine_syncbn_fast_8xb16-500e_coco/yolov8_x_mask-refine_syncbn_fast_8xb16-500e_coco_20230217_120411.log.json) | + +**Note** + +1. We use 8x A100 for training, and the single-GPU batch size is 16. This is different from the official code, but has no effect on performance. +2. The performance is unstable and may fluctuate by about 0.3 mAP; the best-performing weight during `COCO` training of `YOLOv8` may not be the one from the last epoch. The performance shown above is from the best model. +3. We provide [scripts](https://github.com/open-mmlab/mmyolo/tree/dev/tools/model_converters/yolov8_to_mmyolo.py) to convert official weights to MMYOLO. +4. `SyncBN` means training with SyncBN, and `AMP` indicates training with mixed precision. +5. The performance of `Mask Refine` training corresponds to the weights officially released by YOLOv8. `Mask Refine` means refining bboxes by masks while loading annotations and transforming after `YOLOv5RandomAffine`; the L and X models additionally use `Copy Paste`. +6. `TTA` means Test Time Augmentation: 3 multi-scale transformations are applied to the image, followed by 2 flipping transformations (flipped and not flipped). You only need to pass `--tta` when testing to enable it. See [TTA](https://github.com/open-mmlab/mmyolo/blob/dev/docs/en/common_usage/tta.md) for details.
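To make note 6 concrete, the sketch below shows the shape of a multi-scale + flip TTA pipeline in this codebase. It mirrors the `tta_pipeline` of the YOLOv7-W6 config earlier in this diff; the YOLOv8 configs inherit an equivalent pipeline from `../_base_/det_p5_tta.py`, which is not reproduced in this diff. The three scales below are illustrative placeholders rather than the values of that base file, and `backend_args` is omitted for brevity.

```python
# Hedged sketch of a multi-scale + flip TTA pipeline in the mmyolo config
# style used elsewhere in this diff. The scales are illustrative only.
tta_img_scales = [(640, 640), (320, 320), (960, 960)]  # assumed example scales

# One resize branch per test scale: keep-ratio resize followed by letterbox
# padding to the target shape.
_multiscale_resize_transforms = [
    dict(
        type='Compose',
        transforms=[
            dict(type='YOLOv5KeepRatioResize', scale=s),
            dict(
                type='LetterResize',
                scale=s,
                allow_scale_up=False,
                pad_val=dict(img=114))
        ]) for s in tta_img_scales
]

tta_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='TestTimeAug',
        transforms=[
            _multiscale_resize_transforms,  # 3 resize branches
            [
                dict(type='mmdet.RandomFlip', prob=1.),  # flipped view
                dict(type='mmdet.RandomFlip', prob=0.)   # original view
            ],
            [dict(type='mmdet.LoadAnnotations', with_bbox=True)],
            [
                dict(
                    type='mmdet.PackDetInputs',
                    meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                               'scale_factor', 'pad_param', 'flip',
                               'flip_direction'))
            ]
        ])
]
```

With such a pipeline configured, passing `--tta` to the test script switches evaluation to the augmented pipeline, which is where the "TTA box AP" column above comes from.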
+ +## Citation diff --git a/third_party/mmyolo/configs/yolov8/metafile.yml b/third_party/mmyolo/configs/yolov8/metafile.yml new file mode 100644 index 0000000000000000000000000000000000000000..33cd22bc69114f39c4b2a1fcaeabf5228534bb68 --- /dev/null +++ b/third_party/mmyolo/configs/yolov8/metafile.yml @@ -0,0 +1,140 @@ +Collections: + - Name: YOLOv8 + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Nesterov + - Weight Decay + - AMP + - Synchronize BN + Training Resources: 8x A100 GPUs + Architecture: + - CSPDarkNet + - PAFPN + - Decoupled Head + README: configs/yolov8/README.md + Code: + URL: https://github.com/open-mmlab/mmyolo/blob/v0.0.1/mmyolo/models/detectors/yolo_detector.py#L12 + Version: v0.0.1 + +Models: + - Name: yolov8_n_syncbn_fast_8xb16-500e_coco + In Collection: YOLOv8 + Config: configs/yolov8/yolov8_n_syncbn_fast_8xb16-500e_coco.py + Metadata: + Training Memory (GB): 2.8 + Epochs: 500 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.2 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_n_syncbn_fast_8xb16-500e_coco/yolov8_n_syncbn_fast_8xb16-500e_coco_20230114_131804-88c11cdb.pth + - Name: yolov8_s_syncbn_fast_8xb16-500e_coco + In Collection: YOLOv8 + Config: configs/yolov8/yolov8_s_syncbn_fast_8xb16-500e_coco.py + Metadata: + Training Memory (GB): 4.0 + Epochs: 500 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 44.2 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_s_syncbn_fast_8xb16-500e_coco/yolov8_s_syncbn_fast_8xb16-500e_coco_20230117_180101-5aa5f0f1.pth + - Name: yolov8_m_syncbn_fast_8xb16-500e_coco + In Collection: YOLOv8 + Config: configs/yolov8/yolov8_m_syncbn_fast_8xb16-500e_coco.py + Metadata: + Training Memory (GB): 7.2 + Epochs: 500 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 49.8 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_m_syncbn_fast_8xb16-500e_coco/yolov8_m_syncbn_fast_8xb16-500e_coco_20230115_192200-c22e560a.pth + - Name: yolov8_l_syncbn_fast_8xb16-500e_coco + In Collection: YOLOv8 + Config: configs/yolov8/yolov8_l_syncbn_fast_8xb16-500e_coco.py + Metadata: + Training Memory (GB): 9.8 + Epochs: 500 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 52.1 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_l_syncbn_fast_8xb16-500e_coco/yolov8_l_syncbn_fast_8xb16-500e_coco_20230217_182526-189611b6.pth + - Name: yolov8_x_syncbn_fast_8xb16-500e_coco + In Collection: YOLOv8 + Config: configs/yolov8/yolov8_x_syncbn_fast_8xb16-500e_coco.py + Metadata: + Training Memory (GB): 12.2 + Epochs: 500 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 52.7 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_x_syncbn_fast_8xb16-500e_coco/yolov8_x_syncbn_fast_8xb16-500e_coco_20230218_023338-5674673c.pth + - Name: yolov8_n_mask-refine_syncbn_fast_8xb16-500e_coco + In Collection: YOLOv8 + Config: configs/yolov8/yolov8_n_mask-refine_syncbn_fast_8xb16-500e_coco.py + Metadata: + Training Memory (GB): 2.5 + Epochs: 500 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 37.4 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_n_mask-refine_syncbn_fast_8xb16-500e_coco/yolov8_n_mask-refine_syncbn_fast_8xb16-500e_coco_20230216_101206-b975b1cd.pth + - Name: yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco + In Collection: YOLOv8 + Config: configs/yolov8/yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco.py + Metadata: 
+ Training Memory (GB): 4.0 + Epochs: 500 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 45.1 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco/yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco_20230216_095938-ce3c1b3f.pth + - Name: yolov8_m_mask-refine_syncbn_fast_8xb16-500e_coco + In Collection: YOLOv8 + Config: configs/yolov8/yolov8_m_mask-refine_syncbn_fast_8xb16-500e_coco.py + Metadata: + Training Memory (GB): 7.0 + Epochs: 500 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 50.6 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_m_mask-refine_syncbn_fast_8xb16-500e_coco/yolov8_m_mask-refine_syncbn_fast_8xb16-500e_coco_20230216_223400-f40abfcd.pth + - Name: yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco + In Collection: YOLOv8 + Config: configs/yolov8/yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco.py + Metadata: + Training Memory (GB): 9.1 + Epochs: 500 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 53.0 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco/yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco_20230217_120100-5881dec4.pth + - Name: yolov8_x_mask-refine_syncbn_fast_8xb16-500e_coco + In Collection: YOLOv8 + Config: configs/yolov8/yolov8_x_mask-refine_syncbn_fast_8xb16-500e_coco.py + Metadata: + Training Memory (GB): 12.4 + Epochs: 500 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 54.0 + Weights: https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_x_mask-refine_syncbn_fast_8xb16-500e_coco/yolov8_x_mask-refine_syncbn_fast_8xb16-500e_coco_20230217_120411-079ca8d1.pth diff --git a/third_party/mmyolo/configs/yolov8/yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco.py b/third_party/mmyolo/configs/yolov8/yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..e25b6bcb63d1bad084f7c2175a6983dadb591fc4 --- /dev/null +++ b/third_party/mmyolo/configs/yolov8/yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco.py @@ -0,0 +1,65 @@ +_base_ = './yolov8_m_mask-refine_syncbn_fast_8xb16-500e_coco.py' + +# This config use refining bbox and `YOLOv5CopyPaste`. 
+# Refining bbox means refining bbox by mask while loading annotations and +# transforming after `YOLOv5RandomAffine` + +# ========================modified parameters====================== +deepen_factor = 1.00 +widen_factor = 1.00 +last_stage_out_channels = 512 + +mixup_prob = 0.15 +copypaste_prob = 0.3 + +# =======================Unmodified in most cases================== +img_scale = _base_.img_scale +pre_transform = _base_.pre_transform +last_transform = _base_.last_transform +affine_scale = _base_.affine_scale + +model = dict( + backbone=dict( + last_stage_out_channels=last_stage_out_channels, + deepen_factor=deepen_factor, + widen_factor=widen_factor), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + in_channels=[256, 512, last_stage_out_channels], + out_channels=[256, 512, last_stage_out_channels]), + bbox_head=dict( + head_module=dict( + widen_factor=widen_factor, + in_channels=[256, 512, last_stage_out_channels]))) + +mosaic_affine_transform = [ + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict(type='YOLOv5CopyPaste', prob=copypaste_prob), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_aspect_ratio=100., + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=_base_.min_area_ratio, + use_mask_refine=_base_.use_mask2refine) +] + +train_pipeline = [ + *pre_transform, *mosaic_affine_transform, + dict( + type='YOLOv5MixUp', + prob=mixup_prob, + pre_transform=[*pre_transform, *mosaic_affine_transform]), + *last_transform +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) diff --git a/third_party/mmyolo/configs/yolov8/yolov8_l_syncbn_fast_8xb16-500e_coco.py b/third_party/mmyolo/configs/yolov8/yolov8_l_syncbn_fast_8xb16-500e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..bea8b2d56fecd46beddd0370732e8b83309528e5 --- /dev/null +++ b/third_party/mmyolo/configs/yolov8/yolov8_l_syncbn_fast_8xb16-500e_coco.py @@ -0,0 +1,39 @@ +_base_ = './yolov8_m_syncbn_fast_8xb16-500e_coco.py' + +# ========================modified parameters====================== +deepen_factor = 1.00 +widen_factor = 1.00 +last_stage_out_channels = 512 + +mixup_prob = 0.15 + +# =======================Unmodified in most cases================== +pre_transform = _base_.pre_transform +mosaic_affine_transform = _base_.mosaic_affine_transform +last_transform = _base_.last_transform + +model = dict( + backbone=dict( + last_stage_out_channels=last_stage_out_channels, + deepen_factor=deepen_factor, + widen_factor=widen_factor), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + in_channels=[256, 512, last_stage_out_channels], + out_channels=[256, 512, last_stage_out_channels]), + bbox_head=dict( + head_module=dict( + widen_factor=widen_factor, + in_channels=[256, 512, last_stage_out_channels]))) + +train_pipeline = [ + *pre_transform, *mosaic_affine_transform, + dict( + type='YOLOv5MixUp', + prob=mixup_prob, + pre_transform=[*pre_transform, *mosaic_affine_transform]), + *last_transform +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) diff --git a/third_party/mmyolo/configs/yolov8/yolov8_m_mask-refine_syncbn_fast_8xb16-500e_coco.py b/third_party/mmyolo/configs/yolov8/yolov8_m_mask-refine_syncbn_fast_8xb16-500e_coco.py new file mode 100644 index 
0000000000000000000000000000000000000000..2884daeb436e321c2c256687e0f063780d680f37 --- /dev/null +++ b/third_party/mmyolo/configs/yolov8/yolov8_m_mask-refine_syncbn_fast_8xb16-500e_coco.py @@ -0,0 +1,85 @@ +_base_ = './yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco.py' + +# This config use refining bbox and `YOLOv5CopyPaste`. +# Refining bbox means refining bbox by mask while loading annotations and +# transforming after `YOLOv5RandomAffine` + +# ========================modified parameters====================== +deepen_factor = 0.67 +widen_factor = 0.75 +last_stage_out_channels = 768 + +affine_scale = 0.9 +mixup_prob = 0.1 +copypaste_prob = 0.1 + +# ===============================Unmodified in most cases==================== +img_scale = _base_.img_scale +pre_transform = _base_.pre_transform +last_transform = _base_.last_transform + +model = dict( + backbone=dict( + last_stage_out_channels=last_stage_out_channels, + deepen_factor=deepen_factor, + widen_factor=widen_factor), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + in_channels=[256, 512, last_stage_out_channels], + out_channels=[256, 512, last_stage_out_channels]), + bbox_head=dict( + head_module=dict( + widen_factor=widen_factor, + in_channels=[256, 512, last_stage_out_channels]))) + +mosaic_affine_transform = [ + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict(type='YOLOv5CopyPaste', prob=copypaste_prob), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_aspect_ratio=100., + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=_base_.min_area_ratio, + use_mask_refine=_base_.use_mask2refine) +] + +train_pipeline = [ + *pre_transform, *mosaic_affine_transform, + dict( + type='YOLOv5MixUp', + prob=mixup_prob, + pre_transform=[*pre_transform, *mosaic_affine_transform]), + *last_transform +] + +train_pipeline_stage2 = [ + *pre_transform, + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=True, + pad_val=dict(img=114.0)), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + border_val=(114, 114, 114), + min_area_ratio=_base_.min_area_ratio, + use_mask_refine=_base_.use_mask2refine), *last_transform +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +_base_.custom_hooks[1].switch_pipeline = train_pipeline_stage2 diff --git a/third_party/mmyolo/configs/yolov8/yolov8_m_syncbn_fast_8xb16-500e_coco.py b/third_party/mmyolo/configs/yolov8/yolov8_m_syncbn_fast_8xb16-500e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..840d32ccff78db31d9945bfe32531c1970845ee7 --- /dev/null +++ b/third_party/mmyolo/configs/yolov8/yolov8_m_syncbn_fast_8xb16-500e_coco.py @@ -0,0 +1,76 @@ +_base_ = './yolov8_s_syncbn_fast_8xb16-500e_coco.py' + +# ========================modified parameters====================== +deepen_factor = 0.67 +widen_factor = 0.75 +last_stage_out_channels = 768 + +affine_scale = 0.9 +mixup_prob = 0.1 + +# =======================Unmodified in most cases================== +img_scale = _base_.img_scale +pre_transform = _base_.pre_transform +last_transform = _base_.last_transform + +model = dict( + backbone=dict( + 
last_stage_out_channels=last_stage_out_channels, + deepen_factor=deepen_factor, + widen_factor=widen_factor), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + in_channels=[256, 512, last_stage_out_channels], + out_channels=[256, 512, last_stage_out_channels]), + bbox_head=dict( + head_module=dict( + widen_factor=widen_factor, + in_channels=[256, 512, last_stage_out_channels]))) + +mosaic_affine_transform = [ + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_aspect_ratio=100, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)) +] + +# enable mixup +train_pipeline = [ + *pre_transform, *mosaic_affine_transform, + dict( + type='YOLOv5MixUp', + prob=mixup_prob, + pre_transform=[*pre_transform, *mosaic_affine_transform]), + *last_transform +] + +train_pipeline_stage2 = [ + *pre_transform, + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=True, + pad_val=dict(img=114.0)), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + max_aspect_ratio=100, + border_val=(114, 114, 114)), *last_transform +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +_base_.custom_hooks[1].switch_pipeline = train_pipeline_stage2 diff --git a/third_party/mmyolo/configs/yolov8/yolov8_n_mask-refine_syncbn_fast_8xb16-500e_coco.py b/third_party/mmyolo/configs/yolov8/yolov8_n_mask-refine_syncbn_fast_8xb16-500e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..50d3774267fd89b747574f72b34e6d7d2237c5ef --- /dev/null +++ b/third_party/mmyolo/configs/yolov8/yolov8_n_mask-refine_syncbn_fast_8xb16-500e_coco.py @@ -0,0 +1,12 @@ +_base_ = './yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco.py' + +# This config will refine bbox by mask while loading annotations and +# transforming after `YOLOv5RandomAffine` + +deepen_factor = 0.33 +widen_factor = 0.25 + +model = dict( + backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/third_party/mmyolo/configs/yolov8/yolov8_n_syncbn_fast_8xb16-500e_coco.py b/third_party/mmyolo/configs/yolov8/yolov8_n_syncbn_fast_8xb16-500e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..5833df3a157151bca2d2ce29380962e43f1ec876 --- /dev/null +++ b/third_party/mmyolo/configs/yolov8/yolov8_n_syncbn_fast_8xb16-500e_coco.py @@ -0,0 +1,9 @@ +_base_ = './yolov8_s_syncbn_fast_8xb16-500e_coco.py' + +deepen_factor = 0.33 +widen_factor = 0.25 + +model = dict( + backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/third_party/mmyolo/configs/yolov8/yolov8_s_fast_1xb12-40e_cat.py b/third_party/mmyolo/configs/yolov8/yolov8_s_fast_1xb12-40e_cat.py new file mode 100644 index 0000000000000000000000000000000000000000..e54bff03358c4138ea175187f6617735e80f185e --- /dev/null +++ b/third_party/mmyolo/configs/yolov8/yolov8_s_fast_1xb12-40e_cat.py @@ -0,0 +1,52 @@ +_base_ = 
'yolov8_s_syncbn_fast_8xb16-500e_coco.py' + +data_root = './data/cat/' +class_name = ('cat', ) +num_classes = len(class_name) +metainfo = dict(classes=class_name, palette=[(20, 220, 60)]) + +close_mosaic_epochs = 5 + +max_epochs = 40 +train_batch_size_per_gpu = 12 +train_num_workers = 4 + +load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov8/yolov8_s_syncbn_fast_8xb16-500e_coco/yolov8_s_syncbn_fast_8xb16-500e_coco_20230117_180101-5aa5f0f1.pth' # noqa + +model = dict( + backbone=dict(frozen_stages=4), + bbox_head=dict(head_module=dict(num_classes=num_classes)), + train_cfg=dict(assigner=dict(num_classes=num_classes))) + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + data_root=data_root, + metainfo=metainfo, + ann_file='annotations/trainval.json', + data_prefix=dict(img='images/'))) + +val_dataloader = dict( + dataset=dict( + metainfo=metainfo, + data_root=data_root, + ann_file='annotations/test.json', + data_prefix=dict(img='images/'))) + +test_dataloader = val_dataloader + +_base_.optim_wrapper.optimizer.batch_size_per_gpu = train_batch_size_per_gpu +_base_.custom_hooks[1].switch_epoch = max_epochs - close_mosaic_epochs + +val_evaluator = dict(ann_file=data_root + 'annotations/test.json') +test_evaluator = val_evaluator + +default_hooks = dict( + checkpoint=dict(interval=10, max_keep_ckpts=2, save_best='auto'), + # The warmup_mim_iter parameter is critical. + # The default value is 1000 which is not suitable for cat datasets. + param_scheduler=dict(max_epochs=max_epochs, warmup_mim_iter=10), + logger=dict(type='LoggerHook', interval=5)) +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +# visualizer = dict(vis_backends = [dict(type='LocalVisBackend'), dict(type='WandbVisBackend')]) # noqa diff --git a/third_party/mmyolo/configs/yolov8/yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco.py b/third_party/mmyolo/configs/yolov8/yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..769a698e4b52886797e08169cdc6da8eedea204d --- /dev/null +++ b/third_party/mmyolo/configs/yolov8/yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco.py @@ -0,0 +1,83 @@ +_base_ = './yolov8_s_syncbn_fast_8xb16-500e_coco.py' + +# This config will refine bbox by mask while loading annotations and +# transforming after `YOLOv5RandomAffine` + +# ========================modified parameters====================== +use_mask2refine = True +min_area_ratio = 0.01 # YOLOv5RandomAffine + +# ===============================Unmodified in most cases==================== +pre_transform = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict( + type='LoadAnnotations', + with_bbox=True, + with_mask=True, + mask2bbox=use_mask2refine) +] + +last_transform = [ + # Delete gt_masks to avoid more computation + dict(type='RemoveDataElement', keys=['gt_masks']), + dict( + type='mmdet.Albu', + transforms=_base_.albu_train_transforms, + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes' + }), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_pipeline = [ + *pre_transform, + dict( + type='Mosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + 
type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + # img_scale is (width, height) + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114), + min_area_ratio=min_area_ratio, + use_mask_refine=use_mask2refine), + *last_transform +] + +train_pipeline_stage2 = [ + *pre_transform, + dict(type='YOLOv5KeepRatioResize', scale=_base_.img_scale), + dict( + type='LetterResize', + scale=_base_.img_scale, + allow_scale_up=True, + pad_val=dict(img=114.0)), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale), + max_aspect_ratio=_base_.max_aspect_ratio, + border_val=(114, 114, 114), + min_area_ratio=min_area_ratio, + use_mask_refine=use_mask2refine), *last_transform +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +_base_.custom_hooks[1].switch_pipeline = train_pipeline_stage2 diff --git a/third_party/mmyolo/configs/yolov8/yolov8_s_syncbn_fast_8xb16-500e_coco.py b/third_party/mmyolo/configs/yolov8/yolov8_s_syncbn_fast_8xb16-500e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..7e4127efbfd549803d8794b0bdf9fbcc9565e55c --- /dev/null +++ b/third_party/mmyolo/configs/yolov8/yolov8_s_syncbn_fast_8xb16-500e_coco.py @@ -0,0 +1,334 @@ +_base_ = ['../_base_/default_runtime.py', '../_base_/det_p5_tta.py'] + +# ========================Frequently modified parameters====================== +# -----data related----- +data_root = 'data/coco/' # Root path of data +# Path of train annotation file +train_ann_file = 'annotations/instances_train2017.json' +train_data_prefix = 'train2017/' # Prefix of train image path +# Path of val annotation file +val_ann_file = 'annotations/instances_val2017.json' +val_data_prefix = 'val2017/' # Prefix of val image path + +num_classes = 80 # Number of classes for classification +# Batch size of a single GPU during training +train_batch_size_per_gpu = 16 +# Worker to pre-fetch data for each single GPU during training +train_num_workers = 8 +# persistent_workers must be False if num_workers is 0 +persistent_workers = True + +# -----train val related----- +# Base learning rate for optim_wrapper. Corresponding to 8xb16=64 bs +base_lr = 0.01 +max_epochs = 500 # Maximum training epochs +# Disable mosaic augmentation for final 10 epochs (stage 2) +close_mosaic_epochs = 10 + +model_test_cfg = dict( + # The config of multi-label for multi-class prediction. + multi_label=True, + # The number of boxes before NMS + nms_pre=30000, + score_thr=0.001, # Threshold to filter out boxes. + nms=dict(type='nms', iou_threshold=0.7), # NMS type and threshold + max_per_img=300) # Max number of detections of each image + +# ========================Possible modified parameters======================== +# -----data related----- +img_scale = (640, 640) # width, height +# Dataset type, this will be used to define the dataset +dataset_type = 'YOLOv5CocoDataset' +# Batch size of a single GPU during validation +val_batch_size_per_gpu = 1 +# Worker to pre-fetch data for each single GPU during validation +val_num_workers = 2 + +# Config of batch shapes. Only on val. +# We tested YOLOv8-m will get 0.02 higher than not using it. +batch_shapes_cfg = None +# You can turn on `batch_shapes_cfg` by uncommenting the following lines. 
+# batch_shapes_cfg = dict( +# type='BatchShapePolicy', +# batch_size=val_batch_size_per_gpu, +# img_size=img_scale[0], +# # The image scale of padding should be divided by pad_size_divisor +# size_divisor=32, +# # Additional paddings for pixel scale +# extra_pad_ratio=0.5) + +# -----model related----- +# The scaling factor that controls the depth of the network structure +deepen_factor = 0.33 +# The scaling factor that controls the width of the network structure +widen_factor = 0.5 +# Strides of multi-scale prior box +strides = [8, 16, 32] +# The output channel of the last stage +last_stage_out_channels = 1024 +num_det_layers = 3 # The number of model output scales +norm_cfg = dict(type='BN', momentum=0.03, eps=0.001) # Normalization config + +# -----train val related----- +affine_scale = 0.5 # YOLOv5RandomAffine scaling ratio +# YOLOv5RandomAffine aspect ratio of width and height thres to filter bboxes +max_aspect_ratio = 100 +tal_topk = 10 # Number of bbox selected in each level +tal_alpha = 0.5 # A Hyper-parameter related to alignment_metrics +tal_beta = 6.0 # A Hyper-parameter related to alignment_metrics +# TODO: Automatically scale loss_weight based on number of detection layers +loss_cls_weight = 0.5 +loss_bbox_weight = 7.5 +# Since the dfloss is implemented differently in the official +# and mmdet, we're going to divide loss_weight by 4. +loss_dfl_weight = 1.5 / 4 +lr_factor = 0.01 # Learning rate scaling factor +weight_decay = 0.0005 +# Save model checkpoint and validation intervals in stage 1 +save_epoch_intervals = 10 +# validation intervals in stage 2 +val_interval_stage2 = 1 +# The maximum checkpoints to keep. +max_keep_ckpts = 2 +# Single-scale training is recommended to +# be turned on, which can speed up training. +env_cfg = dict(cudnn_benchmark=True) + +# ===============================Unmodified in most cases==================== +model = dict( + type='YOLODetector', + data_preprocessor=dict( + type='YOLOv5DetDataPreprocessor', + mean=[0., 0., 0.], + std=[255., 255., 255.], + bgr_to_rgb=True), + backbone=dict( + type='YOLOv8CSPDarknet', + arch='P5', + last_stage_out_channels=last_stage_out_channels, + deepen_factor=deepen_factor, + widen_factor=widen_factor, + norm_cfg=norm_cfg, + act_cfg=dict(type='SiLU', inplace=True)), + neck=dict( + type='YOLOv8PAFPN', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + in_channels=[256, 512, last_stage_out_channels], + out_channels=[256, 512, last_stage_out_channels], + num_csp_blocks=3, + norm_cfg=norm_cfg, + act_cfg=dict(type='SiLU', inplace=True)), + bbox_head=dict( + type='YOLOv8Head', + head_module=dict( + type='YOLOv8HeadModule', + num_classes=num_classes, + in_channels=[256, 512, last_stage_out_channels], + widen_factor=widen_factor, + reg_max=16, + norm_cfg=norm_cfg, + act_cfg=dict(type='SiLU', inplace=True), + featmap_strides=strides), + prior_generator=dict( + type='mmdet.MlvlPointGenerator', offset=0.5, strides=strides), + bbox_coder=dict(type='DistancePointBBoxCoder'), + # scaled based on number of detection layers + loss_cls=dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='none', + loss_weight=loss_cls_weight), + loss_bbox=dict( + type='IoULoss', + iou_mode='ciou', + bbox_format='xyxy', + reduction='sum', + loss_weight=loss_bbox_weight, + return_iou=False), + loss_dfl=dict( + type='mmdet.DistributionFocalLoss', + reduction='mean', + loss_weight=loss_dfl_weight)), + train_cfg=dict( + assigner=dict( + type='BatchTaskAlignedAssigner', + num_classes=num_classes, + use_ciou=True, + 
topk=tal_topk, + alpha=tal_alpha, + beta=tal_beta, + eps=1e-9)), + test_cfg=model_test_cfg) + +albu_train_transforms = [ + dict(type='Blur', p=0.01), + dict(type='MedianBlur', p=0.01), + dict(type='ToGray', p=0.01), + dict(type='CLAHE', p=0.01) +] + +pre_transform = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True) +] + +last_transform = [ + dict( + type='mmdet.Albu', + transforms=albu_train_transforms, + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes' + }), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_pipeline = [ + *pre_transform, + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + max_aspect_ratio=max_aspect_ratio, + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)), + *last_transform +] + +train_pipeline_stage2 = [ + *pre_transform, + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=True, + pad_val=dict(img=114.0)), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + max_aspect_ratio=max_aspect_ratio, + border_val=(114, 114, 114)), *last_transform +] + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + persistent_workers=persistent_workers, + pin_memory=True, + sampler=dict(type='DefaultSampler', shuffle=True), + collate_fn=dict(type='yolov5_collate'), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=train_ann_file, + data_prefix=dict(img=train_data_prefix), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline)) + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=False, + pad_val=dict(img=114)), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] + +val_dataloader = dict( + batch_size=val_batch_size_per_gpu, + num_workers=val_num_workers, + persistent_workers=persistent_workers, + pin_memory=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + test_mode=True, + data_prefix=dict(img=val_data_prefix), + ann_file=val_ann_file, + pipeline=test_pipeline, + batch_shapes_cfg=batch_shapes_cfg)) + +test_dataloader = val_dataloader + +param_scheduler = None +optim_wrapper = dict( + type='OptimWrapper', + clip_grad=dict(max_norm=10.0), + optimizer=dict( + type='SGD', + lr=base_lr, + momentum=0.937, + weight_decay=weight_decay, + nesterov=True, + batch_size_per_gpu=train_batch_size_per_gpu), + constructor='YOLOv5OptimizerConstructor') + +default_hooks = dict( + param_scheduler=dict( + type='YOLOv5ParamSchedulerHook', + scheduler_type='linear', 
+ lr_factor=lr_factor, + max_epochs=max_epochs), + checkpoint=dict( + type='CheckpointHook', + interval=save_epoch_intervals, + save_best='auto', + max_keep_ckpts=max_keep_ckpts)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - close_mosaic_epochs, + switch_pipeline=train_pipeline_stage2) +] + +val_evaluator = dict( + type='mmdet.CocoMetric', + proposal_nums=(100, 1, 10), + ann_file=data_root + val_ann_file, + metric='bbox') +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', + max_epochs=max_epochs, + val_interval=save_epoch_intervals, + dynamic_intervals=[((max_epochs - close_mosaic_epochs), + val_interval_stage2)]) + +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') diff --git a/third_party/mmyolo/configs/yolov8/yolov8_x_mask-refine_syncbn_fast_8xb16-500e_coco.py b/third_party/mmyolo/configs/yolov8/yolov8_x_mask-refine_syncbn_fast_8xb16-500e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..8c27b9619d288f222ea0ce351f9e4578c31934a7 --- /dev/null +++ b/third_party/mmyolo/configs/yolov8/yolov8_x_mask-refine_syncbn_fast_8xb16-500e_coco.py @@ -0,0 +1,13 @@ +_base_ = './yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco.py' + +# This config use refining bbox and `YOLOv5CopyPaste`. +# Refining bbox means refining bbox by mask while loading annotations and +# transforming after `YOLOv5RandomAffine` + +deepen_factor = 1.00 +widen_factor = 1.25 + +model = dict( + backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/third_party/mmyolo/configs/yolov8/yolov8_x_syncbn_fast_8xb16-500e_coco.py b/third_party/mmyolo/configs/yolov8/yolov8_x_syncbn_fast_8xb16-500e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..3d8e6653278db54745aa3a3a606bc63aa40328b7 --- /dev/null +++ b/third_party/mmyolo/configs/yolov8/yolov8_x_syncbn_fast_8xb16-500e_coco.py @@ -0,0 +1,9 @@ +_base_ = './yolov8_l_syncbn_fast_8xb16-500e_coco.py' + +deepen_factor = 1.00 +widen_factor = 1.25 + +model = dict( + backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/third_party/mmyolo/configs/yolox/README.md b/third_party/mmyolo/configs/yolox/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7d5dc683c1b2e912ee27c7492bf7f869c103bb15 --- /dev/null +++ b/third_party/mmyolo/configs/yolox/README.md @@ -0,0 +1,86 @@ +# YOLOX + +> [YOLOX: Exceeding YOLO Series in 2021](https://arxiv.org/abs/2107.08430) + + + +## Abstract + +In this report, we present some experienced improvements to YOLO series, forming a new high-performance detector -- YOLOX. 
We switch the YOLO detector to an anchor-free manner and conduct other advanced detection techniques, i.e., a decoupled head and the leading label assignment strategy SimOTA to achieve state-of-the-art results across a large scale range of models: For YOLO-Nano with only 0.91M parameters and 1.08G FLOPs, we get 25.3% AP on COCO, surpassing NanoDet by 1.8% AP; for YOLOv3, one of the most widely used detectors in industry, we boost it to 47.3% AP on COCO, outperforming the current best practice by 3.0% AP; for YOLOX-L with roughly the same amount of parameters as YOLOv4-CSP, YOLOv5-L, we achieve 50.0% AP on COCO at a speed of 68.9 FPS on Tesla V100, exceeding YOLOv5-L by 1.8% AP. Further, we won the 1st Place on Streaming Perception Challenge (Workshop on Autonomous Driving at CVPR 2021) using a single YOLOX-L model. We hope this report can provide useful experience for developers and researchers in practical scenes, and we also provide deploy versions with ONNX, TensorRT, NCNN, and Openvino supported. + +
+ +
+ +
+ +YOLOX-l model structure +
+ +## 🥳 🚀 Results and Models + +| Backbone | Size | Batch Size | AMP | RTMDet-Hyp | Mem (GB) | Box AP | Config | Download | +| :--------: | :--: | :--------: | :-: | :--------: | :------: | :---------: | :-------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| YOLOX-tiny | 416 | 8xb8 | No | No | 2.8 | 32.7 | [config](./yolox_tiny_fast_8xb8-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_tiny_8xb8-300e_coco/yolox_tiny_8xb8-300e_coco_20220919_090908-0e40a6fc.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_tiny_8xb8-300e_coco/yolox_tiny_8xb8-300e_coco_20220919_090908.log.json) | +| YOLOX-tiny | 416 | 8xb32 | Yes | Yes | 4.9 | 34.3 (+1.6) | [config](./yolox_tiny_fast_8xb32-300e-rtmdet-hyp_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_tiny_fast_8xb32-300e-rtmdet-hyp_coco/yolox_tiny_fast_8xb32-300e-rtmdet-hyp_coco_20230210_143637-4c338102.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_tiny_fast_8xb32-300e-rtmdet-hyp_coco/yolox_tiny_fast_8xb32-300e-rtmdet-hyp_coco_20230210_143637.log.json) | +| YOLOX-s | 640 | 8xb8 | Yes | No | 2.9 | 40.7 | [config](./yolox_s_fast_8xb8-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_s_fast_8xb8-300e_coco/yolox_s_fast_8xb8-300e_coco_20230213_142600-2b224d8b.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_s_fast_8xb8-300e_coco/yolox_s_fast_8xb8-300e_coco_20230213_142600.log.json) | +| YOLOX-s | 640 | 8xb32 | Yes | Yes | 9.8 | 41.9 (+1.2) | [config](./yolox_s_fast_8xb32-300e-rtmdet-hyp_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_s_fast_8xb32-300e-rtmdet-hyp_coco/yolox_s_fast_8xb32-300e-rtmdet-hyp_coco_20230210_134645-3a8dfbd7.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_s_fast_8xb32-300e-rtmdet-hyp_coco/yolox_s_fast_8xb32-300e-rtmdet-hyp_coco_20230210_134645.log.json) | +| YOLOX-m | 640 | 8xb8 | Yes | No | 4.9 | 46.9 | [config](./yolox_m_fast_8xb8-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_m_fast_8xb8-300e_coco/yolox_m_fast_8xb8-300e_coco_20230213_160218-a71a6b25.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_m_fast_8xb8-300e_coco/yolox_m_fast_8xb8-300e_coco_20230213_160218.log.json) | +| YOLOX-m | 640 | 8xb32 | Yes | Yes | 17.6 | 47.5 (+0.6) | [config](./yolox_m_fast_8xb32-300e-rtmdet-hyp_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_m_fast_8xb32-300e-rtmdet-hyp_coco/yolox_m_fast_8xb32-300e-rtmdet-hyp_coco_20230210_144328-e657e182.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_m_fast_8xb32-300e-rtmdet-hyp_coco/yolox_m_fast_8xb32-300e-rtmdet-hyp_coco_20230210_144328.log.json) | +| YOLOX-l | 640 | 8xb8 | Yes | No | 8.0 | 50.1 | [config](./yolox_l_fast_8xb8-300e_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_l_fast_8xb8-300e_coco/yolox_l_fast_8xb8-300e_coco_20230213_160715-c731eb1c.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_l_fast_8xb8-300e_coco/yolox_l_fast_8xb8-300e_coco_20230213_160715.log.json) | +| YOLOX-x | 640 | 8xb8 | Yes | No | 9.8 | 51.4 | [config](./yolox_x_fast_8xb8-300e_coco.py) | 
[model](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_x_fast_8xb8-300e_coco/yolox_x_fast_8xb8-300e_coco_20230215_133950-1d509fab.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolox/yolox_x_fast_8xb8-300e_coco/yolox_x_fast_8xb8-300e_coco_20230215_133950.log.json) | + +YOLOX's default training configuration of `8xb8` results in a long training time, so we expected that switching to `8xb32` would speed up training without causing a drop in mAP. Based on the linear scaling rule, we modified `train_batch_size_per_gpu` from 8 to 32, `batch_augments_interval` from 10 to 1, and `base_lr` from 0.01 to 0.04 on top of the YOLOX-s default configuration, but this resulted in mAP degradation. We finally found that adopting RTMDet's training hyperparameters improves performance for YOLOX Tiny/S/M, which also validates the strength of RTMDet's training hyperparameters. + +The modified training parameters are as follows (a hedged config sketch of these overrides is shown after the YOLOX-Pose results below): + +1. train_batch_size_per_gpu: 8 -> 32 +2. batch_augments_interval: 10 -> 1 +3. num_last_epochs: 15 -> 20 +4. optim cfg: SGD -> AdamW, base_lr 0.01 -> 0.004, weight_decay 0.0005 -> 0.05 +5. ema momentum: 0.0001 -> 0.0002 + +**Note**: + +1. The test score threshold is 0.001. +2. Due to the need for pre-training weights, we cannot reproduce the performance of the `yolox-nano` model. Please refer to https://github.com/Megvii-BaseDetection/YOLOX/issues/674 for more information. + +## YOLOX-Pose + +Based on [MMPose](https://github.com/open-mmlab/mmpose/blob/main/projects/yolox-pose/README.md), we have implemented a YOLOX-based human pose estimator, utilizing the approach outlined in **YOLO-Pose: Enhancing YOLO for Multi Person Pose Estimation Using Object Keypoint Similarity Loss (CVPRW 2022)**. This pose estimator is lightweight and quick, making it well-suited for crowded scenes. + +
+ +
+ +### Results + +| Backbone | Size | Batch Size | AMP | RTMDet-Hyp | Mem (GB) | AP | Config | Download | +| :--------: | :--: | :--------: | :-: | :--------: | :------: | :--: | :------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| YOLOX-tiny | 416 | 8xb32 | Yes | Yes | 5.3 | 52.8 | [config](./pose/yolox-pose_tiny_8xb32-300e-rtmdet-hyp_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolox/pose/yolox-pose_tiny_8xb32-300e-rtmdet-hyp_coco/yolox-pose_tiny_8xb32-300e-rtmdet-hyp_coco_20230427_080351-2117af67.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolox/pose/yolox-pose_tiny_8xb32-300e-rtmdet-hyp_coco/yolox-pose_tiny_8xb32-300e-rtmdet-hyp_coco_20230427_080351.log.json) | +| YOLOX-s | 640 | 8xb32 | Yes | Yes | 10.7 | 63.7 | [config](./pose/yolox-pose_s_8xb32-300e-rtmdet-hyp_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolox/pose/yolox-pose_s_8xb32-300e-rtmdet-hyp_coco/yolox-pose_s_8xb32-300e-rtmdet-hyp_coco_20230427_005150-e87d843a.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolox/pose/yolox-pose_s_8xb32-300e-rtmdet-hyp_coco/yolox-pose_s_8xb32-300e-rtmdet-hyp_coco_20230427_005150.log.json) | +| YOLOX-m | 640 | 8xb32 | Yes | Yes | 19.2 | 69.3 | [config](./pose/yolox-pose_m_8xb32-300e-rtmdet-hyp_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolox/pose/yolox-pose_m_8xb32-300e-rtmdet-hyp_coco/yolox-pose_m_8xb32-300e-rtmdet-hyp_coco_20230427_094024-bbeacc1c.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolox/pose/yolox-pose_m_8xb32-300e-rtmdet-hyp_coco/yolox-pose_m_8xb32-300e-rtmdet-hyp_coco_20230427_094024.log.json) | +| YOLOX-l | 640 | 8xb32 | Yes | Yes | 30.3 | 71.1 | [config](./pose/yolox-pose_l_8xb32-300e-rtmdet-hyp_coco.py) | [model](https://download.openmmlab.com/mmyolo/v0/yolox/pose/yolox-pose_l_8xb32-300e-rtmdet-hyp_coco/yolox-pose_l_8xb32-300e-rtmdet-hyp_coco_20230427_041140-82d65ac8.pth) \| [log](https://download.openmmlab.com/mmyolo/v0/yolox/pose/yolox-pose_l_8xb32-300e-rtmdet-hyp_coco/yolox-pose_l_8xb32-300e-rtmdet-hyp_coco_20230427_041140.log.json) | + +**Note** + +1. The performance is unstable and may fluctuate and the highest performance weight in `COCO` training may not be the last epoch. The performance shown above is the best model. 
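As referenced in the YOLOX section above, the RTMDet-style hyperparameter changes are plain config overrides. The sketch below is a rough illustration only: the actual `yolox_*_fast_8xb32-300e-rtmdet-hyp_coco.py` files are not part of this diff, so the keys follow the config style used elsewhere in this diff and the values come from the numbered list above; the `batch_augments_interval` change is omitted because its exact key path is not shown here. The `CheckpointHook` with `save_best='auto'` at the end is also how the "best model, not last epoch" caveat in the notes is usually handled.

```python
# Hedged sketch only -- not the actual *-rtmdet-hyp config shipped with mmyolo.
_base_ = './yolox_s_fast_8xb8-300e_coco.py'  # base config from the table above

train_batch_size_per_gpu = 32  # 8 -> 32
num_last_epochs = 20           # 15 -> 20; wiring into the final no-aug stage omitted
base_lr = 0.004                # SGD lr 0.01 -> AdamW lr 0.004
weight_decay = 0.05            # 0.0005 -> 0.05
ema_momentum = 0.0002          # 0.0001 -> 0.0002

train_dataloader = dict(batch_size=train_batch_size_per_gpu)

# Optimizer switched from SGD to AdamW, following RTMDet's recipe.
optim_wrapper = dict(
    type='OptimWrapper',
    optimizer=dict(type='AdamW', lr=base_lr, weight_decay=weight_decay))

# EMA momentum raised from 0.0001 to 0.0002; hook structure as used elsewhere
# in this diff.
custom_hooks = [
    dict(
        type='EMAHook',
        ema_type='ExpMomentumEMA',
        momentum=ema_momentum,
        update_buffers=True,
        priority=49)
]

# Keep the best checkpoint as well as the latest ones, since the best COCO
# epoch is often not the last one.
default_hooks = dict(
    checkpoint=dict(type='CheckpointHook', interval=10, save_best='auto'))
```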
+ +### Installation + +Install MMPose + +``` +mim install -r requirements/mmpose.txt +``` + +## Citation + +```latex +@article{yolox2021, + title={{YOLOX}: Exceeding YOLO Series in 2021}, + author={Ge, Zheng and Liu, Songtao and Wang, Feng and Li, Zeming and Sun, Jian}, + journal={arXiv preprint arXiv:2107.08430}, + year={2021} +} +``` diff --git a/third_party/mmyolo/configs/yolox/metafile.yml b/third_party/mmyolo/configs/yolox/metafile.yml new file mode 100644 index 0000000000000000000000000000000000000000..78ede704a629fa44957bc2b24e05e6559fc17710 --- /dev/null +++ b/third_party/mmyolo/configs/yolox/metafile.yml @@ -0,0 +1,166 @@ +Collections: + - Name: YOLOX + Metadata: + Training Data: COCO + Training Techniques: + - SGD with Nesterov + - Weight Decay + - Cosine Annealing Lr Updater + Training Resources: 8x A100 GPUs + Architecture: + - CSPDarkNet + - PAFPN + Paper: + URL: https://arxiv.org/abs/2107.08430 + Title: 'YOLOX: Exceeding YOLO Series in 2021' + README: configs/yolox/README.md + Code: + URL: https://github.com/open-mmlab/mmyolo/blob/v0.1.0/mmyolo/models/detectors/yolo_detector.py#L12 + Version: v0.1.0 + + +Models: + - Name: yolox_tiny_fast_8xb8-300e_coco + In Collection: YOLOX + Config: configs/yolox/yolox_tiny_fast_8xb8-300e_coco.py + Metadata: + Training Memory (GB): 2.8 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 32.7 + Weights: https://download.openmmlab.com/mmyolo/v0/yolox/yolox_tiny_8xb8-300e_coco/yolox_tiny_8xb8-300e_coco_20220919_090908-0e40a6fc.pth + - Name: yolox_s_fast_8xb8-300e_coco + In Collection: YOLOX + Config: configs/yolox/yolox_s_fast_8xb8-300e_coco.py + Metadata: + Training Memory (GB): 2.9 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 40.7 + Weights: https://download.openmmlab.com/mmyolo/v0/yolox/yolox_s_fast_8xb8-300e_coco/yolox_s_fast_8xb8-300e_coco_20230213_142600-2b224d8b.pth + - Name: yolox_m_fast_8xb8-300e_coco + In Collection: YOLOX + Config: configs/yolox/yolox_m_fast_8xb8-300e_coco.py + Metadata: + Training Memory (GB): 4.9 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 46.9 + Weights: https://download.openmmlab.com/mmyolo/v0/yolox/yolox_m_fast_8xb8-300e_coco/yolox_m_fast_8xb8-300e_coco_20230213_160218-a71a6b25.pth + - Name: yolox_l_fast_8xb8-300e_coco + In Collection: YOLOX + Config: configs/yolox/yolox_l_fast_8xb8-300e_coco.py + Metadata: + Training Memory (GB): 8.0 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 50.1 + Weights: https://download.openmmlab.com/mmyolo/v0/yolox/yolox_l_fast_8xb8-300e_coco/yolox_l_fast_8xb8-300e_coco_20230213_160715-c731eb1c.pth + - Name: yolox_x_fast_8xb8-300e_coco + In Collection: YOLOX + Config: configs/yolox/yolox_x_fast_8xb8-300e_coco.py + Metadata: + Training Memory (GB): 9.8 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 51.4 + Weights: https://download.openmmlab.com/mmyolo/v0/yolox/yolox_x_fast_8xb8-300e_coco/yolox_x_fast_8xb8-300e_coco_20230215_133950-1d509fab.pth + - Name: yolox_tiny_fast_8xb32-300e-rtmdet-hyp_coco + In Collection: YOLOX + Config: configs/yolox/yolox_tiny_fast_8xb32-300e-rtmdet-hyp_coco.py + Metadata: + Training Memory (GB): 4.9 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 34.3 + Weights: 
https://download.openmmlab.com/mmyolo/v0/yolox/yolox_tiny_fast_8xb32-300e-rtmdet-hyp_coco/yolox_tiny_fast_8xb32-300e-rtmdet-hyp_coco_20230210_143637-4c338102.pth + - Name: yolox_s_fast_8xb32-300e-rtmdet-hyp_coco + In Collection: YOLOX + Config: configs/yolox/yolox_s_fast_8xb32-300e-rtmdet-hyp_coco.py + Metadata: + Training Memory (GB): 9.8 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 41.9 + Weights: https://download.openmmlab.com/mmyolo/v0/yolox/yolox_s_fast_8xb32-300e-rtmdet-hyp_coco/yolox_s_fast_8xb32-300e-rtmdet-hyp_coco_20230210_134645-3a8dfbd7.pth + - Name: yolox_m_fast_8xb32-300e-rtmdet-hyp_coco + In Collection: YOLOX + Config: configs/yolox/yolox_m_fast_8xb32-300e-rtmdet-hyp_coco.py + Metadata: + Training Memory (GB): 17.6 + Epochs: 300 + Results: + - Task: Object Detection + Dataset: COCO + Metrics: + box AP: 47.5 + Weights: https://download.openmmlab.com/mmyolo/v0/yolox/yolox_m_fast_8xb32-300e-rtmdet-hyp_coco/yolox_m_fast_8xb32-300e-rtmdet-hyp_coco_20230210_144328-e657e182.pth + - Name: yolox-pose_tiny_8xb32-300e-rtmdet-hyp_coco + In Collection: YOLOX + Config: yolox-pose_tiny_8xb32-300e-rtmdet-hyp_coco.py + Metadata: + Training Memory (GB): 5.3 + Epochs: 300 + Results: + - Task: Human Pose Estimation + Dataset: COCO + Metrics: + AP: 52.8 + Weights: https://download.openmmlab.com/mmyolo/v0/yolox/pose/yolox-pose_tiny_8xb32-300e-rtmdet-hyp_coco/yolox-pose_tiny_8xb32-300e-rtmdet-hyp_coco_20230427_080351-2117af67.pth + - Name: yolox-pose_s_8xb32-300e-rtmdet-hyp_coco + In Collection: YOLOX + Config: yolox-pose_s_8xb32-300e-rtmdet-hyp_coco.py + Metadata: + Training Memory (GB): 10.7 + Epochs: 300 + Results: + - Task: Human Pose Estimation + Dataset: COCO + Metrics: + AP: 63.7 + Weights: https://download.openmmlab.com/mmyolo/v0/yolox/pose/yolox-pose_s_8xb32-300e-rtmdet-hyp_coco/yolox-pose_s_8xb32-300e-rtmdet-hyp_coco_20230427_005150-e87d843a.pth + - Name: yolox-pose_m_8xb32-300e-rtmdet-hyp_coco + In Collection: YOLOX + Config: yolox-pose_m_8xb32-300e-rtmdet-hyp_coco.py + Metadata: + Training Memory (GB): 19.2 + Epochs: 300 + Results: + - Task: Human Pose Estimation + Dataset: COCO + Metrics: + AP: 69.3 + Weights: https://download.openmmlab.com/mmyolo/v0/yolox/pose/yolox-pose_m_8xb32-300e-rtmdet-hyp_coco/yolox-pose_m_8xb32-300e-rtmdet-hyp_coco_20230427_094024-bbeacc1c.pth + - Name: yolox-pose_l_8xb32-300e-rtmdet-hyp_coco + In Collection: YOLOX + Config: yolox-pose_l_8xb32-300e-rtmdet-hyp_coco.py + Metadata: + Training Memory (GB): 30.3 + Epochs: 300 + Results: + - Task: Human Pose Estimation + Dataset: COCO + Metrics: + AP: 71.1 + Weights: https://download.openmmlab.com/mmyolo/v0/yolox/pose/yolox-pose_l_8xb32-300e-rtmdet-hyp_coco/yolox-pose_l_8xb32-300e-rtmdet-hyp_coco_20230427_041140-82d65ac8.pth diff --git a/third_party/mmyolo/configs/yolox/pose/yolox-pose_l_8xb32-300e-rtmdet-hyp_coco.py b/third_party/mmyolo/configs/yolox/pose/yolox-pose_l_8xb32-300e-rtmdet-hyp_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..96de5e98183b33d6c19865547e7f7e217be31ea5 --- /dev/null +++ b/third_party/mmyolo/configs/yolox/pose/yolox-pose_l_8xb32-300e-rtmdet-hyp_coco.py @@ -0,0 +1,14 @@ +_base_ = ['./yolox-pose_m_8xb32-300e-rtmdet-hyp_coco.py'] + +load_from = 'https://download.openmmlab.com/mmyolo/v0/yolox/yolox_l_fast_8xb8-300e_coco/yolox_l_fast_8xb8-300e_coco_20230213_160715-c731eb1c.pth' # noqa + +# ========================modified parameters====================== +deepen_factor = 1.0 +widen_factor = 1.0 + +# 
=======================Unmodified in most cases================== +# model settings +model = dict( + backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/third_party/mmyolo/configs/yolox/pose/yolox-pose_m_8xb32-300e-rtmdet-hyp_coco.py b/third_party/mmyolo/configs/yolox/pose/yolox-pose_m_8xb32-300e-rtmdet-hyp_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..f78d6a3a2f8ce2828839073f1fe2582f49bb5a69 --- /dev/null +++ b/third_party/mmyolo/configs/yolox/pose/yolox-pose_m_8xb32-300e-rtmdet-hyp_coco.py @@ -0,0 +1,14 @@ +_base_ = ['./yolox-pose_s_8xb32-300e-rtmdet-hyp_coco.py'] + +load_from = 'https://download.openmmlab.com/mmyolo/v0/yolox/yolox_m_fast_8xb32-300e-rtmdet-hyp_coco/yolox_m_fast_8xb32-300e-rtmdet-hyp_coco_20230210_144328-e657e182.pth' # noqa + +# ========================modified parameters====================== +deepen_factor = 0.67 +widen_factor = 0.75 + +# =======================Unmodified in most cases================== +# model settings +model = dict( + backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/third_party/mmyolo/configs/yolox/pose/yolox-pose_s_8xb32-300e-rtmdet-hyp_coco.py b/third_party/mmyolo/configs/yolox/pose/yolox-pose_s_8xb32-300e-rtmdet-hyp_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..8fa2172c989ddfa6c6b28e33654e1c14b8cbbc91 --- /dev/null +++ b/third_party/mmyolo/configs/yolox/pose/yolox-pose_s_8xb32-300e-rtmdet-hyp_coco.py @@ -0,0 +1,136 @@ +_base_ = '../yolox_s_fast_8xb32-300e-rtmdet-hyp_coco.py' + +load_from = 'https://download.openmmlab.com/mmyolo/v0/yolox/yolox_s_fast_8xb32-300e-rtmdet-hyp_coco/yolox_s_fast_8xb32-300e-rtmdet-hyp_coco_20230210_134645-3a8dfbd7.pth' # noqa + +num_keypoints = 17 +scaling_ratio_range = (0.75, 1.0) +mixup_ratio_range = (0.8, 1.6) +num_last_epochs = 20 + +# model settings +model = dict( + bbox_head=dict( + type='YOLOXPoseHead', + head_module=dict( + type='YOLOXPoseHeadModule', + num_classes=1, + num_keypoints=num_keypoints, + ), + loss_pose=dict( + type='OksLoss', + metainfo='configs/_base_/pose/coco.py', + loss_weight=30.0)), + train_cfg=dict( + assigner=dict( + type='PoseSimOTAAssigner', + center_radius=2.5, + oks_weight=3.0, + iou_calculator=dict(type='mmdet.BboxOverlaps2D'), + oks_calculator=dict( + type='OksLoss', metainfo='configs/_base_/pose/coco.py'))), + test_cfg=dict(score_thr=0.01)) + +# pipelines +pre_transform = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_keypoints=True) +] + +img_scale = _base_.img_scale + +train_pipeline_stage1 = [ + *pre_transform, + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='RandomAffine', + scaling_ratio_range=scaling_ratio_range, + border=(-img_scale[0] // 2, -img_scale[1] // 2)), + dict( + type='YOLOXMixUp', + img_scale=img_scale, + ratio_range=mixup_ratio_range, + pad_val=114.0, + pre_transform=pre_transform), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict(type='RandomFlip', prob=0.5), + dict(type='FilterAnnotations', by_keypoints=True, keep_empty=False), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape')) +] + +train_pipeline_stage2 = [ + 
*pre_transform, + dict(type='Resize', scale=img_scale, keep_ratio=True), + dict( + type='mmdet.Pad', + pad_to_square=True, + pad_val=dict(img=(114.0, 114.0, 114.0))), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict(type='RandomFlip', prob=0.5), + dict(type='FilterAnnotations', by_keypoints=True, keep_empty=False), + dict(type='PackDetInputs') +] + +test_pipeline = [ + *pre_transform, + dict(type='Resize', scale=img_scale, keep_ratio=True), + dict( + type='mmdet.Pad', + pad_to_square=True, + pad_val=dict(img=(114.0, 114.0, 114.0))), + dict( + type='PackDetInputs', + meta_keys=('id', 'img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'flip_indices')) +] + +# dataset settings +dataset_type = 'PoseCocoDataset' + +train_dataloader = dict( + dataset=dict( + type=dataset_type, + data_mode='bottomup', + ann_file='annotations/person_keypoints_train2017.json', + pipeline=train_pipeline_stage1)) + +val_dataloader = dict( + dataset=dict( + type=dataset_type, + data_mode='bottomup', + ann_file='annotations/person_keypoints_val2017.json', + pipeline=test_pipeline)) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + _delete_=True, + type='mmpose.CocoMetric', + ann_file=_base_.data_root + 'annotations/person_keypoints_val2017.json', + score_mode='bbox') +test_evaluator = val_evaluator + +default_hooks = dict(checkpoint=dict(save_best='coco/AP', rule='greater')) + +visualizer = dict(type='mmpose.PoseLocalVisualizer') + +custom_hooks = [ + dict( + type='YOLOXModeSwitchHook', + num_last_epochs=num_last_epochs, + new_train_pipeline=train_pipeline_stage2, + priority=48), + dict(type='mmdet.SyncNormHook', priority=48), + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + strict_load=False, + priority=49) +] diff --git a/third_party/mmyolo/configs/yolox/pose/yolox-pose_tiny_8xb32-300e-rtmdet-hyp_coco.py b/third_party/mmyolo/configs/yolox/pose/yolox-pose_tiny_8xb32-300e-rtmdet-hyp_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..a7399065e70f40f4142abc943b572cbd93954222 --- /dev/null +++ b/third_party/mmyolo/configs/yolox/pose/yolox-pose_tiny_8xb32-300e-rtmdet-hyp_coco.py @@ -0,0 +1,70 @@ +_base_ = './yolox-pose_s_8xb32-300e-rtmdet-hyp_coco.py' + +load_from = 'https://download.openmmlab.com/mmyolo/v0/yolox/yolox_tiny_fast_8xb32-300e-rtmdet-hyp_coco/yolox_tiny_fast_8xb32-300e-rtmdet-hyp_coco_20230210_143637-4c338102.pth' # noqa + +deepen_factor = 0.33 +widen_factor = 0.375 +scaling_ratio_range = (0.75, 1.0) + +# model settings +model = dict( + data_preprocessor=dict(batch_augments=[ + dict( + type='YOLOXBatchSyncRandomResize', + random_size_range=(320, 640), + size_divisor=32, + interval=1) + ]), + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) + +# data settings +img_scale = _base_.img_scale +pre_transform = _base_.pre_transform + +train_pipeline_stage1 = [ + *pre_transform, + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='RandomAffine', + scaling_ratio_range=scaling_ratio_range, + border=(-img_scale[0] // 2, -img_scale[1] // 2)), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict(type='RandomFlip', prob=0.5), + dict( + type='FilterAnnotations', + by_keypoints=True, + min_gt_bbox_wh=(1, 1), + keep_empty=False), + dict( + type='PackDetInputs', + meta_keys=('img_id', 
'img_path', 'ori_shape', 'img_shape')) +] + +test_pipeline = [ + *pre_transform, + dict(type='Resize', scale=(416, 416), keep_ratio=True), + dict( + type='mmdet.Pad', + pad_to_square=True, + pad_val=dict(img=(114.0, 114.0, 114.0))), + dict( + type='PackDetInputs', + meta_keys=('id', 'img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'flip_indices')) +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline_stage1)) +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = val_dataloader diff --git a/third_party/mmyolo/configs/yolox/yolox_l_fast_8xb8-300e_coco.py b/third_party/mmyolo/configs/yolox/yolox_l_fast_8xb8-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..39198d2e245b00445f0a5d38e41a1ffe389b17de --- /dev/null +++ b/third_party/mmyolo/configs/yolox/yolox_l_fast_8xb8-300e_coco.py @@ -0,0 +1,12 @@ +_base_ = './yolox_s_fast_8xb8-300e_coco.py' + +# ========================modified parameters====================== +deepen_factor = 1.0 +widen_factor = 1.0 + +# =======================Unmodified in most cases================== +# model settings +model = dict( + backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/third_party/mmyolo/configs/yolox/yolox_m_fast_8xb32-300e-rtmdet-hyp_coco.py b/third_party/mmyolo/configs/yolox/yolox_m_fast_8xb32-300e-rtmdet-hyp_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..4a4743c2dd4bcbe9e692aff54e3af1909d540c60 --- /dev/null +++ b/third_party/mmyolo/configs/yolox/yolox_m_fast_8xb32-300e-rtmdet-hyp_coco.py @@ -0,0 +1,12 @@ +_base_ = './yolox_s_fast_8xb32-300e-rtmdet-hyp_coco.py' + +# ========================modified parameters====================== +deepen_factor = 0.67 +widen_factor = 0.75 + +# =======================Unmodified in most cases================== +# model settings +model = dict( + backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/third_party/mmyolo/configs/yolox/yolox_m_fast_8xb8-300e_coco.py b/third_party/mmyolo/configs/yolox/yolox_m_fast_8xb8-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..ec8fd2c854bc2d41d53ba481fa3ad7f23ba3c54a --- /dev/null +++ b/third_party/mmyolo/configs/yolox/yolox_m_fast_8xb8-300e_coco.py @@ -0,0 +1,12 @@ +_base_ = './yolox_s_fast_8xb8-300e_coco.py' + +# ========================modified parameters====================== +deepen_factor = 0.67 +widen_factor = 0.75 + +# =======================Unmodified in most cases================== +# model settings +model = dict( + backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/third_party/mmyolo/configs/yolox/yolox_nano_fast_8xb32-300e-rtmdet-hyp_coco.py b/third_party/mmyolo/configs/yolox/yolox_nano_fast_8xb32-300e-rtmdet-hyp_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..851664fb3cb03dc24c4ea03e158b08db011684e9 --- /dev/null +++ b/third_party/mmyolo/configs/yolox/yolox_nano_fast_8xb32-300e-rtmdet-hyp_coco.py @@ -0,0 +1,21 @@ +_base_ = './yolox_tiny_fast_8xb32-300e-rtmdet-hyp_coco.py' + +# ========================modified 
parameters====================== +deepen_factor = 0.33 +widen_factor = 0.25 +use_depthwise = True + +# =======================Unmodified in most cases================== +# model settings +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + use_depthwise=use_depthwise), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + use_depthwise=use_depthwise), + bbox_head=dict( + head_module=dict( + widen_factor=widen_factor, use_depthwise=use_depthwise))) diff --git a/third_party/mmyolo/configs/yolox/yolox_nano_fast_8xb8-300e_coco.py b/third_party/mmyolo/configs/yolox/yolox_nano_fast_8xb8-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..a0a5d373856343af82259f9c165f851be49de16d --- /dev/null +++ b/third_party/mmyolo/configs/yolox/yolox_nano_fast_8xb8-300e_coco.py @@ -0,0 +1,21 @@ +_base_ = './yolox_tiny_fast_8xb8-300e_coco.py' + +# ========================modified parameters====================== +deepen_factor = 0.33 +widen_factor = 0.25 +use_depthwise = True + +# =======================Unmodified in most cases================== +# model settings +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + use_depthwise=use_depthwise), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + use_depthwise=use_depthwise), + bbox_head=dict( + head_module=dict( + widen_factor=widen_factor, use_depthwise=use_depthwise))) diff --git a/third_party/mmyolo/configs/yolox/yolox_p5_tta.py b/third_party/mmyolo/configs/yolox/yolox_p5_tta.py new file mode 100644 index 0000000000000000000000000000000000000000..7ffe3490ca3f7f059d498201277f4df86fbcd3da --- /dev/null +++ b/third_party/mmyolo/configs/yolox/yolox_p5_tta.py @@ -0,0 +1,56 @@ +# TODO: Need to solve the problem of multiple backend_args parameters +# _backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) + +_backend_args = None + +tta_model = dict( + type='mmdet.DetTTAModel', + tta_cfg=dict(nms=dict(type='nms', iou_threshold=0.65), max_per_img=300)) + +img_scales = [(640, 640), (320, 320), (960, 960)] + +# LoadImageFromFile +# / | \ +# Resize Resize Resize # noqa +# / \ / \ / \ +# RandomFlip RandomFlip RandomFlip RandomFlip RandomFlip RandomFlip # noqa +# | | | | | | +# LoadAnn LoadAnn LoadAnn LoadAnn LoadAnn LoadAnn +# | | | | | | +# PackDetIn PackDetIn PackDetIn PackDetIn PackDetIn PackDetIn # noqa + +tta_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_backend_args), + dict( + type='TestTimeAug', + transforms=[ + [ + dict(type='mmdet.Resize', scale=s, keep_ratio=True) + for s in img_scales + ], + [ + # ``RandomFlip`` must be placed before ``Pad``, otherwise + # bounding box coordinates after flipping cannot be + # recovered correctly. + dict(type='mmdet.RandomFlip', prob=1.), + dict(type='mmdet.RandomFlip', prob=0.) 
+ ], + [ + dict( + type='mmdet.Pad', + pad_to_square=True, + pad_val=dict(img=(114.0, 114.0, 114.0))), + ], + [ + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'flip', 'flip_direction')) + ] + ]) +] diff --git a/third_party/mmyolo/configs/yolox/yolox_s_fast_1xb12-40e-rtmdet-hyp_cat.py b/third_party/mmyolo/configs/yolox/yolox_s_fast_1xb12-40e-rtmdet-hyp_cat.py new file mode 100644 index 0000000000000000000000000000000000000000..f7eac58fb548a034e22acccef72a32951bb80dee --- /dev/null +++ b/third_party/mmyolo/configs/yolox/yolox_s_fast_1xb12-40e-rtmdet-hyp_cat.py @@ -0,0 +1,76 @@ +_base_ = './yolox_s_fast_8xb32-300e-rtmdet-hyp_coco.py' + +data_root = './data/cat/' +class_name = ('cat', ) +num_classes = len(class_name) +metainfo = dict(classes=class_name, palette=[(20, 220, 60)]) + +num_last_epochs = 5 + +max_epochs = 40 +train_batch_size_per_gpu = 12 +train_num_workers = 4 + +load_from = 'https://download.openmmlab.com/mmyolo/v0/yolox/yolox_s_fast_8xb32-300e-rtmdet-hyp_coco/yolox_s_fast_8xb32-300e-rtmdet-hyp_coco_20230210_134645-3a8dfbd7.pth' # noqa + +model = dict( + backbone=dict(frozen_stages=4), + bbox_head=dict(head_module=dict(num_classes=num_classes))) + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + data_root=data_root, + metainfo=metainfo, + ann_file='annotations/trainval.json', + data_prefix=dict(img='images/'))) + +val_dataloader = dict( + dataset=dict( + metainfo=metainfo, + data_root=data_root, + ann_file='annotations/test.json', + data_prefix=dict(img='images/'))) + +test_dataloader = val_dataloader + +param_scheduler = [ + dict( + # use quadratic formula to warm up 3 epochs + # and lr is updated by iteration + # TODO: fix default scope in get function + type='mmdet.QuadraticWarmupLR', + by_epoch=True, + begin=0, + end=3, + convert_to_iter_based=True), + dict( + # use cosine lr from 5 to 35 epoch + type='CosineAnnealingLR', + eta_min=_base_.base_lr * 0.05, + begin=5, + T_max=max_epochs - num_last_epochs, + end=max_epochs - num_last_epochs, + by_epoch=True, + convert_to_iter_based=True), + dict( + # use fixed lr during last num_last_epochs epochs + type='ConstantLR', + by_epoch=True, + factor=1, + begin=max_epochs - num_last_epochs, + end=max_epochs, + ) +] + +_base_.custom_hooks[0].num_last_epochs = num_last_epochs + +val_evaluator = dict(ann_file=data_root + 'annotations/test.json') +test_evaluator = val_evaluator + +default_hooks = dict( + checkpoint=dict(interval=10, max_keep_ckpts=2, save_best='auto'), + logger=dict(type='LoggerHook', interval=5)) +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +# visualizer = dict(vis_backends = [dict(type='LocalVisBackend'), dict(type='WandbVisBackend')]) # noqa diff --git a/third_party/mmyolo/configs/yolox/yolox_s_fast_8xb32-300e-rtmdet-hyp_coco.py b/third_party/mmyolo/configs/yolox/yolox_s_fast_8xb32-300e-rtmdet-hyp_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..167023da94815e13a782b85209e1116aeac7803d --- /dev/null +++ b/third_party/mmyolo/configs/yolox/yolox_s_fast_8xb32-300e-rtmdet-hyp_coco.py @@ -0,0 +1,87 @@ +_base_ = './yolox_s_fast_8xb8-300e_coco.py' + +# ========================modified parameters====================== +# Batch size of a single GPU during training +# 8 -> 32 +train_batch_size_per_gpu = 32 + +# Multi-scale training intervals +# 10 -> 1 +batch_augments_interval = 1 + +# Last epoch number to switch training pipeline +# 15 -> 20 
+num_last_epochs = 20 + +# Base learning rate for optim_wrapper. Corresponding to 8xb32=256 bs +base_lr = 0.004 + +# SGD -> AdamW +optim_wrapper = dict( + _delete_=True, + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# 0.0001 -> 0.0002 +ema_momentum = 0.0002 + +# ============================== Unmodified in most cases =================== +model = dict( + data_preprocessor=dict(batch_augments=[ + dict( + type='YOLOXBatchSyncRandomResize', + random_size_range=(480, 800), + size_divisor=32, + interval=batch_augments_interval) + ])) + +param_scheduler = [ + dict( + # use quadratic formula to warm up 5 epochs + # and lr is updated by iteration + # TODO: fix default scope in get function + type='mmdet.QuadraticWarmupLR', + by_epoch=True, + begin=0, + end=5, + convert_to_iter_based=True), + dict( + # use cosine lr from 5 to 285 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=5, + T_max=_base_.max_epochs - num_last_epochs, + end=_base_.max_epochs - num_last_epochs, + by_epoch=True, + convert_to_iter_based=True), + dict( + # use fixed lr during last num_last_epochs epochs + type='ConstantLR', + by_epoch=True, + factor=1, + begin=_base_.max_epochs - num_last_epochs, + end=_base_.max_epochs, + ) +] + +custom_hooks = [ + dict( + type='YOLOXModeSwitchHook', + num_last_epochs=num_last_epochs, + new_train_pipeline=_base_.train_pipeline_stage2, + priority=48), + dict(type='mmdet.SyncNormHook', priority=48), + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=ema_momentum, + update_buffers=True, + strict_load=False, + priority=49) +] + +train_dataloader = dict(batch_size=train_batch_size_per_gpu) +train_cfg = dict(dynamic_intervals=[(_base_.max_epochs - num_last_epochs, 1)]) +auto_scale_lr = dict(base_batch_size=8 * train_batch_size_per_gpu) diff --git a/third_party/mmyolo/configs/yolox/yolox_s_fast_8xb8-300e_coco.py b/third_party/mmyolo/configs/yolox/yolox_s_fast_8xb8-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..b371ea11d2dd0900476d88a9de626e881297d790 --- /dev/null +++ b/third_party/mmyolo/configs/yolox/yolox_s_fast_8xb8-300e_coco.py @@ -0,0 +1,331 @@ +_base_ = ['../_base_/default_runtime.py', 'yolox_p5_tta.py'] + +# ========================Frequently modified parameters====================== +# -----data related----- +data_root = 'data/coco/' # Root path of data +# path of train annotation file +train_ann_file = 'annotations/instances_train2017.json' +train_data_prefix = 'train2017/' # Prefix of train image path +# path of val annotation file +val_ann_file = 'annotations/instances_val2017.json' +val_data_prefix = 'val2017/' # Prefix of train image path + +num_classes = 80 # Number of classes for classification +# Batch size of a single GPU during training +train_batch_size_per_gpu = 8 +# Worker to pre-fetch data for each single GPU during tarining +train_num_workers = 8 +# Presistent_workers must be False if num_workers is 0 +persistent_workers = True + +# -----train val related----- +# Base learning rate for optim_wrapper. 
Corresponding to 8xb16=64 bs +base_lr = 0.01 +max_epochs = 300 # Maximum training epochs + +model_test_cfg = dict( + yolox_style=True, # better + # The config of multi-label for multi-class prediction + multi_label=True, # 40.5 -> 40.7 + score_thr=0.001, # Threshold to filter out boxes + max_per_img=300, # Max number of detections of each image + nms=dict(type='nms', iou_threshold=0.65)) # NMS type and threshold + +# ========================Possible modified parameters======================== +# -----data related----- +img_scale = (640, 640) # width, height +# Dataset type, this will be used to define the dataset +dataset_type = 'YOLOv5CocoDataset' +# Batch size of a single GPU during validation +val_batch_size_per_gpu = 1 +# Worker to pre-fetch data for each single GPU during validation +val_num_workers = 2 + +# -----model related----- +# The scaling factor that controls the depth of the network structure +deepen_factor = 0.33 +# The scaling factor that controls the width of the network structure +widen_factor = 0.5 +norm_cfg = dict(type='BN', momentum=0.03, eps=0.001) +# generate new random resize shape interval +batch_augments_interval = 10 + +# -----train val related----- +weight_decay = 0.0005 +loss_cls_weight = 1.0 +loss_bbox_weight = 5.0 +loss_obj_weight = 1.0 +loss_bbox_aux_weight = 1.0 +center_radius = 2.5 # SimOTAAssigner +num_last_epochs = 15 +random_affine_scaling_ratio_range = (0.1, 2) +mixup_ratio_range = (0.8, 1.6) +# Save model checkpoint and validation intervals +save_epoch_intervals = 10 +# The maximum checkpoints to keep. +max_keep_ckpts = 3 + +ema_momentum = 0.0001 + +# ===============================Unmodified in most cases==================== +# model settings +model = dict( + type='YOLODetector', + init_cfg=dict( + type='Kaiming', + layer='Conv2d', + a=2.23606797749979, # math.sqrt(5) + distribution='uniform', + mode='fan_in', + nonlinearity='leaky_relu'), + # TODO: Waiting for mmengine support + use_syncbn=False, + data_preprocessor=dict( + type='YOLOv5DetDataPreprocessor', + pad_size_divisor=32, + batch_augments=[ + dict( + type='YOLOXBatchSyncRandomResize', + random_size_range=(480, 800), + size_divisor=32, + interval=batch_augments_interval) + ]), + backbone=dict( + type='YOLOXCSPDarknet', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + out_indices=(2, 3, 4), + spp_kernal_sizes=(5, 9, 13), + norm_cfg=norm_cfg, + act_cfg=dict(type='SiLU', inplace=True), + ), + neck=dict( + type='YOLOXPAFPN', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + in_channels=[256, 512, 1024], + out_channels=256, + norm_cfg=norm_cfg, + act_cfg=dict(type='SiLU', inplace=True)), + bbox_head=dict( + type='YOLOXHead', + head_module=dict( + type='YOLOXHeadModule', + num_classes=num_classes, + in_channels=256, + feat_channels=256, + widen_factor=widen_factor, + stacked_convs=2, + featmap_strides=(8, 16, 32), + use_depthwise=False, + norm_cfg=norm_cfg, + act_cfg=dict(type='SiLU', inplace=True), + ), + loss_cls=dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='sum', + loss_weight=loss_cls_weight), + loss_bbox=dict( + type='mmdet.IoULoss', + mode='square', + eps=1e-16, + reduction='sum', + loss_weight=loss_bbox_weight), + loss_obj=dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='sum', + loss_weight=loss_obj_weight), + loss_bbox_aux=dict( + type='mmdet.L1Loss', + reduction='sum', + loss_weight=loss_bbox_aux_weight)), + train_cfg=dict( + assigner=dict( + type='mmdet.SimOTAAssigner', + center_radius=center_radius, + 
iou_calculator=dict(type='mmdet.BboxOverlaps2D'))), + test_cfg=model_test_cfg) + +pre_transform = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True) +] + +train_pipeline_stage1 = [ + *pre_transform, + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='mmdet.RandomAffine', + scaling_ratio_range=random_affine_scaling_ratio_range, + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2)), + dict( + type='YOLOXMixUp', + img_scale=img_scale, + ratio_range=mixup_ratio_range, + pad_val=114.0, + pre_transform=pre_transform), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.FilterAnnotations', + min_gt_bbox_wh=(1, 1), + keep_empty=False), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_pipeline_stage2 = [ + *pre_transform, + dict(type='mmdet.Resize', scale=img_scale, keep_ratio=True), + dict( + type='mmdet.Pad', + pad_to_square=True, + # If the image is three-channel, the pad value needs + # to be set separately for each channel. + pad_val=dict(img=(114.0, 114.0, 114.0))), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.FilterAnnotations', + min_gt_bbox_wh=(1, 1), + keep_empty=False), + dict(type='mmdet.PackDetInputs') +] + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + persistent_workers=persistent_workers, + pin_memory=True, + collate_fn=dict(type='yolov5_collate'), + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=train_ann_file, + data_prefix=dict(img=train_data_prefix), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline_stage1)) + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='mmdet.Resize', scale=img_scale, keep_ratio=True), + dict( + type='mmdet.Pad', + pad_to_square=True, + pad_val=dict(img=(114.0, 114.0, 114.0))), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +val_dataloader = dict( + batch_size=val_batch_size_per_gpu, + num_workers=val_num_workers, + persistent_workers=persistent_workers, + pin_memory=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=val_ann_file, + data_prefix=dict(img=val_data_prefix), + test_mode=True, + pipeline=test_pipeline)) +test_dataloader = val_dataloader + +# Reduce evaluation time +val_evaluator = dict( + type='mmdet.CocoMetric', + proposal_nums=(100, 1, 10), + ann_file=data_root + val_ann_file, + metric='bbox') + +test_evaluator = val_evaluator + +# optimizer +# default 8 gpu +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict( + type='SGD', + lr=base_lr, + momentum=0.9, + weight_decay=weight_decay, + nesterov=True), + paramwise_cfg=dict(norm_decay_mult=0., bias_decay_mult=0.)) + +# learning rate +param_scheduler = [ + dict( + # use quadratic formula to warm up 5 epochs + # and lr is updated by iteration + # TODO: fix default scope in get function + type='mmdet.QuadraticWarmupLR', + by_epoch=True, + begin=0, + end=5, + convert_to_iter_based=True), + dict( 
+ # use cosine lr from 5 to 285 epoch + type='CosineAnnealingLR', + eta_min=base_lr * 0.05, + begin=5, + T_max=max_epochs - num_last_epochs, + end=max_epochs - num_last_epochs, + by_epoch=True, + convert_to_iter_based=True), + dict( + # use fixed lr during last 15 epochs + type='ConstantLR', + by_epoch=True, + factor=1, + begin=max_epochs - num_last_epochs, + end=max_epochs, + ) +] + +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + interval=save_epoch_intervals, + max_keep_ckpts=max_keep_ckpts, + save_best='auto')) + +custom_hooks = [ + dict( + type='YOLOXModeSwitchHook', + num_last_epochs=num_last_epochs, + new_train_pipeline=train_pipeline_stage2, + priority=48), + dict(type='mmdet.SyncNormHook', priority=48), + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=ema_momentum, + update_buffers=True, + strict_load=False, + priority=49) +] + +train_cfg = dict( + type='EpochBasedTrainLoop', + max_epochs=max_epochs, + val_interval=save_epoch_intervals, + dynamic_intervals=[(max_epochs - num_last_epochs, 1)]) + +auto_scale_lr = dict(base_batch_size=8 * train_batch_size_per_gpu) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') diff --git a/third_party/mmyolo/configs/yolox/yolox_tiny_fast_8xb32-300e-rtmdet-hyp_coco.py b/third_party/mmyolo/configs/yolox/yolox_tiny_fast_8xb32-300e-rtmdet-hyp_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..28e539c9472d20fe2e28b49659ec523c098bb170 --- /dev/null +++ b/third_party/mmyolo/configs/yolox/yolox_tiny_fast_8xb32-300e-rtmdet-hyp_coco.py @@ -0,0 +1,70 @@ +_base_ = './yolox_s_fast_8xb32-300e-rtmdet-hyp_coco.py' + +# ========================modified parameters====================== +deepen_factor = 0.33 +widen_factor = 0.375 + +# Multi-scale training intervals +# 10 -> 1 +batch_augments_interval = 1 + +scaling_ratio_range = (0.5, 1.5) + +# =======================Unmodified in most cases================== +img_scale = _base_.img_scale +pre_transform = _base_.pre_transform + +# model settings +model = dict( + data_preprocessor=dict(batch_augments=[ + dict( + type='YOLOXBatchSyncRandomResize', + random_size_range=(320, 640), + size_divisor=32, + interval=batch_augments_interval) + ]), + backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) + +train_pipeline_stage1 = [ + *pre_transform, + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='mmdet.RandomAffine', + scaling_ratio_range=scaling_ratio_range, # note + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2)), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.FilterAnnotations', + min_gt_bbox_wh=(1, 1), + keep_empty=False), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='mmdet.Resize', scale=(416, 416), keep_ratio=True), # note + dict( + type='mmdet.Pad', + pad_to_square=True, + pad_val=dict(img=(114.0, 114.0, 114.0))), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +train_dataloader = 
dict(dataset=dict(pipeline=train_pipeline_stage1)) +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = val_dataloader diff --git a/third_party/mmyolo/configs/yolox/yolox_tiny_fast_8xb8-300e_coco.py b/third_party/mmyolo/configs/yolox/yolox_tiny_fast_8xb8-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..fd175a6c73ccc55df697ccbf04dfb46a3fbdc0ee --- /dev/null +++ b/third_party/mmyolo/configs/yolox/yolox_tiny_fast_8xb8-300e_coco.py @@ -0,0 +1,100 @@ +_base_ = './yolox_s_fast_8xb8-300e_coco.py' + +# ========================modified parameters====================== +deepen_factor = 0.33 +widen_factor = 0.375 +scaling_ratio_range = (0.5, 1.5) + +# =======================Unmodified in most cases================== +img_scale = _base_.img_scale +pre_transform = _base_.pre_transform + +test_img_scale = (416, 416) +tta_img_scales = [test_img_scale, (320, 320), (640, 640)] + +# model settings +model = dict( + data_preprocessor=dict(batch_augments=[ + dict( + type='YOLOXBatchSyncRandomResize', + random_size_range=(320, 640), + size_divisor=32, + interval=10) + ]), + backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) + +train_pipeline_stage1 = [ + *pre_transform, + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='mmdet.RandomAffine', + scaling_ratio_range=scaling_ratio_range, # note + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2)), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.FilterAnnotations', + min_gt_bbox_wh=(1, 1), + keep_empty=False), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='mmdet.Resize', scale=test_img_scale, keep_ratio=True), # note + dict( + type='mmdet.Pad', + pad_to_square=True, + pad_val=dict(img=(114.0, 114.0, 114.0))), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline_stage1)) +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = val_dataloader + +# Config for Test Time Augmentation. (TTA) +tta_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict( + type='TestTimeAug', + transforms=[ + [ + dict(type='mmdet.Resize', scale=s, keep_ratio=True) + for s in tta_img_scales + ], + [ + # ``RandomFlip`` must be placed before ``Pad``, otherwise + # bounding box coordinates after flipping cannot be + # recovered correctly. + dict(type='mmdet.RandomFlip', prob=1.), + dict(type='mmdet.RandomFlip', prob=0.) 
+ ], + [ + dict( + type='mmdet.Pad', + pad_to_square=True, + pad_val=dict(img=(114.0, 114.0, 114.0))), + ], + [ + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'flip', 'flip_direction')) + ] + ]) +] diff --git a/third_party/mmyolo/configs/yolox/yolox_x_fast_8xb8-300e_coco.py b/third_party/mmyolo/configs/yolox/yolox_x_fast_8xb8-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..0759d468be70f9af026fef2ae0dbf2308082ad96 --- /dev/null +++ b/third_party/mmyolo/configs/yolox/yolox_x_fast_8xb8-300e_coco.py @@ -0,0 +1,12 @@ +_base_ = './yolox_s_fast_8xb8-300e_coco.py' + +# ========================modified parameters====================== +deepen_factor = 1.33 +widen_factor = 1.25 + +# =======================Unmodified in most cases================== +# model settings +model = dict( + backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/third_party/mmyolo/demo/15_minutes_instance_segmentation.ipynb b/third_party/mmyolo/demo/15_minutes_instance_segmentation.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..a09a1a10512c15abd611c35cefdfbeda64090268 --- /dev/null +++ b/third_party/mmyolo/demo/15_minutes_instance_segmentation.ipynb @@ -0,0 +1,658 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "x7seefPduh36" + }, + "source": [ + "
\n", + " \n", + "
 
\n", + "
\n", + " OpenMMLab website\n", + " \n", + " \n", + " HOT\n", + " \n", + " \n", + "     \n", + " OpenMMLab platform\n", + " \n", + " \n", + " TRY IT OUT\n", + " \n", + " \n", + "
\n", + "
 
\n", + "\n", + "\"Open\n", + "\n", + "[![PyPI](https://img.shields.io/pypi/v/mmyolo)](https://pypi.org/project/mmyolo)\n", + "[![docs](https://img.shields.io/badge/docs-latest-blue)](https://mmyolo.readthedocs.io/en/latest/)\n", + "[![deploy](https://github.com/open-mmlab/mmyolo/workflows/deploy/badge.svg)](https://github.com/open-mmlab/mmyolo/actions)\n", + "[![codecov](https://codecov.io/gh/open-mmlab/mmyolo/branch/main/graph/badge.svg)](https://codecov.io/gh/open-mmlab/mmyolo)\n", + "[![license](https://img.shields.io/github/license/open-mmlab/mmyolo.svg)](https://github.com/open-mmlab/mmyolo/blob/main/LICENSE)\n", + "[![open issues](https://isitmaintained.com/badge/open/open-mmlab/mmyolo.svg)](https://github.com/open-mmlab/mmyolo/issues)\n", + "[![issue resolution](https://isitmaintained.com/badge/resolution/open-mmlab/mmyolo.svg)](https://github.com/open-mmlab/mmyolo/issues)\n", + "\n", + "[📘Documentation](https://mmyolo.readthedocs.io/en/latest/) |\n", + "[🛠️Installation](https://mmyolo.readthedocs.io/en/latest/get_started/installation.html) |\n", + "[👀Model Zoo](https://mmyolo.readthedocs.io/en/latest/model_zoo.html) |\n", + "[🆕Update News](https://mmyolo.readthedocs.io/en/latest/notes/changelog.html) |\n", + "[🤔Reporting Issues](https://github.com/open-mmlab/mmyolo/issues/new/choose)\n", + "\n", + "
\n", + "\n", + "
\n", + " \n", + " \"\"\n", + " \"\"\n", + " \n", + " \"\"\n", + " \"\"\n", + " \n", + " \"\"\n", + " \"\"\n", + " \n", + " \"\"\n", + " \"\"\n", + " \n", + " \"\"\n", + " \"\"\n", + " \n", + " \"\"\n", + "
" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "V6W8P5XEJGoc" + }, + "source": [ + "# 15 minutes to get started with MMYOLO instance segmentation\n", + "\n", + "Instance segmentation is a task in computer vision that aims to segment each object in an image and assign each object a unique identifier.\n", + "\n", + "Unlike semantic segmentation, instance segmentation not only segments out different categories in an image, but also separates different instances of the same category.\n", + "\n", + "
\n", + "\"Instance\n", + "
\n", + "\n", + "Taking the downloadable balloon dataset as an example, I will guide you through a 15-minute easy introduction to MMYOLO instance segmentation. The entire process includes the following steps:\n", + "\n", + "- [Installation](#installation)\n", + "- [Dataset](#dataset)\n", + "- [Config](#config)\n", + "- [Training](#training)\n", + "- [Testing](#testing)\n", + "- [EasyDeploy](#easydeploy-deployment)\n", + "\n", + "In this tutorial, we will use YOLOv5-s as an example. For the demo configuration of the balloon dataset with other YOLO series algorithms, please refer to the corresponding algorithm configuration folder." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "Ae5SqsA7wYGQ" + }, + "source": [ + "## Installation\n", + "\n", + "Assuming you've already installed Conda in advance, then install PyTorch using the following commands." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "XVLRaEIzwW-6", + "outputId": "901b5db6-b1d7-4830-e746-485ee76d6648" + }, + "outputs": [], + "source": [ + "# -----------------------------------------------------------------------------------------\n", + "# If you are using colab, you can skip this cell for PyTorch is pre-installed on the colab.\n", + "# -----------------------------------------------------------------------------------------\n", + "!python -V\n", + "# Check nvcc version\n", + "!nvcc -V\n", + "# Check GCC version\n", + "!gcc --version\n", + "# Create a new Conda environment\n", + "%conda create -n mmyolo python=3.8 -y\n", + "%conda activate mmyolo\n", + "# If you have GPU\n", + "%conda install pytorch torchvision -c pytorch\n", + "# If you only have CPU\n", + "# %conda install pytorch torchvision cpuonly -c pytorch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Check PyTorch version\n", + "import torch\n", + "print(torch.__version__)\n", + "print(torch.cuda.is_available())" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install MMYOLO and dependency libraries using the following commands.\n", + "For details about how to configure the environment, see [Installation and verification](https://mmyolo.readthedocs.io/en/latest/get_started/installation.html).\n", + "```{note}\n", + "Note: Since this repo uses OpenMMLab 2.0, it is better to create a new conda virtual environment to prevent conflicts with the repo installed in OpenMMLab 1.0.\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "-qATUuntwmfD", + "outputId": "24be577b-efce-46f2-8b2f-a65d02824467" + }, + "outputs": [], + "source": [ + "!git clone https://github.com/open-mmlab/mmyolo.git\n", + "%cd mmyolo\n", + "%pip install -U openmim\n", + "!mim install -r requirements/mminstall.txt\n", + "# Install albumentations\n", + "!mim install -r requirements/albu.txt\n", + "# Install MMYOLO\n", + "!mim install -v -e .\n", + "# \"-v\" means verbose, or more output\n", + "# \"-e\" means installing a project in editable mode,\n", + "# thus any local modifications made to the code will take effect without reinstallation." 
+ ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Dataset\n", + "\n", + "The Balloon dataset is a single-class dataset that consists of 74 images and includes annotated information required for training. Here is an example image from the dataset:\n", + "\n", + "
\n", + "\"balloon\n", + "
\n", + "\n", + "You can download and use it directly by the following command:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "gMQXwWuIw3ef", + "outputId": "c8efeac7-5b0c-4342-b5af-d3e790e358c3" + }, + "outputs": [], + "source": [ + "!python tools/misc/download_dataset.py --dataset-name balloon --save-dir ./data/balloon --unzip --delete\n", + "!python ./tools/dataset_converters/balloon2coco.py" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "covQskXXw2ul" + }, + "source": [ + "The data for the MMYOLO project is located in the MMYOLO project directory. The `train.json` and `val.json` files store the annotations in COCO format, while the `data/balloon/train` and `data/balloon/val` directories contain all the images for the dataset.\n", + "\n", + "## Config\n", + "\n", + "Taking YOLOv5 algorithm as an example, considering the limited GPU memory of users, we need to modify some default training parameters to make them run smoothly. The key parameters to be modified are as follows:\n", + "\n", + "- YOLOv5 is an Anchor-Based algorithm, and different datasets need to calculate suitable anchors adaptively.\n", + "- The default config uses 8 GPUs with a batch size of 16 per GPU. Now change it to a single GPU with a batch size of 12.\n", + "- In principle, the learning rate should be linearly scaled accordingly when the batch size is changed, but actual measurements have found that this is not necessary.\n", + "\n", + "To perform the specific operation, create a new configuration file named `yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py` in the `configs/yolov5/ins_seg` folder. For convenience, we have already provided this configuration file. 
Copy the following contents into the configuration file.\n", + "\n", + "```python\n", + "_base_ = './yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance.py' # noqa\n", + "\n", + "data_root = 'data/balloon/' # dataset root\n", + "# Training set annotation file of json path\n", + "train_ann_file = 'train.json'\n", + "train_data_prefix = 'train/' # Dataset prefix\n", + "# Validation set annotation file of json path\n", + "val_ann_file = 'val.json'\n", + "val_data_prefix = 'val/'\n", + "metainfo = {\n", + " 'classes': ('balloon', ), # dataset category name\n", + " 'palette': [\n", + " (220, 20, 60),\n", + " ]\n", + "}\n", + "num_classes = 1\n", + "# Set batch size to 4\n", + "train_batch_size_per_gpu = 4\n", + "# dataloader num workers\n", + "train_num_workers = 2\n", + "log_interval = 1\n", + "#####################\n", + "train_dataloader = dict(\n", + " batch_size=train_batch_size_per_gpu,\n", + " num_workers=train_num_workers,\n", + " dataset=dict(\n", + " data_root=data_root,\n", + " metainfo=metainfo,\n", + " data_prefix=dict(img=train_data_prefix),\n", + " ann_file=train_ann_file))\n", + "val_dataloader = dict(\n", + " dataset=dict(\n", + " data_root=data_root,\n", + " metainfo=metainfo,\n", + " data_prefix=dict(img=val_data_prefix),\n", + " ann_file=val_ann_file))\n", + "test_dataloader = val_dataloader\n", + "val_evaluator = dict(ann_file=data_root + val_ann_file)\n", + "test_evaluator = val_evaluator\n", + "default_hooks = dict(logger=dict(interval=log_interval))\n", + "#####################\n", + "\n", + "model = dict(bbox_head=dict(head_module=dict(num_classes=num_classes)))\n", + "```\n", + "\n", + "The above configuration inherits from `yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance.py` and updates configurations such as `data_root`, `metainfo`, `train_dataloader`, `val_dataloader`, `num_classes`, etc., based on the characteristics of the balloon dataset.\n", + "\n", + "## Training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python tools/train.py configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "TQ0h6sv_rJxq" + }, + "source": [ + "After running the training command mentioned above, the folder `work_dirs/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance` will be automatically generated. The weight files and the training configuration file for this session will be saved in this folder. On a lower-end GPU like the GTX 1660, the entire training process will take approximately 30 minutes.\n", + "\n", + "
\n", + "\"image\"/\n", + "
\n", + "\n", + "The performance on `val.json` is as follows:\n", + "\n", + "```text\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.330\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.509\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.317\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.000\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.103\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.417\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.150\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.396\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.454\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.000\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.317\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.525\n", + "```\n", + "\n", + "The above performance is obtained by printing using the COCO API, where -1 indicates the absence of objects of that scale.\n", + "\n", + "### Some Notes\n", + "\n", + "Two key warnings are printed during training:\n", + "\n", + "- You are using `YOLOv5Head` with num_classes == 1. The loss_cls will be 0. This is a normal phenomenon.\n", + "\n", + "The warning is because the `num_classes` currently trained is 1, the loss of the classification branch is always 0 according to the community of the YOLOv5 algorithm, which is a normal phenomenon.\n", + "\n", + "### Training is resumed after the interruption\n", + "\n", + "If you stop training, you can add `--resume` to the end of the training command and the program will automatically resume training with the latest weights file from `work_dirs`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python tools/train.py configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py --resume" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "3sJxvQoUrMhX" + }, + "source": [ + "### Save GPU memory strategy\n", + "\n", + "The above config requires about 3G RAM, so if you don't have enough, consider turning on mixed-precision training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python tools/train.py configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py --amp" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "jVJdyHTxrQ9a" + }, + "source": [ + "### Training visualization\n", + "\n", + "MMYOLO currently supports local, TensorBoard, WandB and other back-end visualization. The default is to use local visualization, and you can switch to WandB and other real-time visualization of various indicators in the training process.\n", + "\n", + "#### 1 WandB\n", + "\n", + "WandB visualization need registered in website, and in the https://wandb.ai/settings for wandb API Keys.\n", + "\n", + "
\n", + "\"image\"/\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install wandb\n", + "# After running wandb login, enter the API Keys obtained above, and the login is successful.\n", + "!wandb login" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "Yu0_4YYRrbyY" + }, + "source": [ + "Add the wandb config at the end of config file we just created: `configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py`.\n", + "\n", + "```python\n", + "visualizer = dict(vis_backends = [dict(type='LocalVisBackend'), dict(type='WandbVisBackend')])\n", + "```\n", + "\n", + "Running the training command and you will see the loss, learning rate, and coco/bbox_mAP visualizations in the link." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python tools/train.py configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "f_DyzfDIzwMa" + }, + "source": [ + "
\n", + "\"image\"/\n", + "
\n", + "
\n", + "\"image\"/\n", + "
\n", + "\n", + "#### 2 Tensorboard\n", + "\n", + "Install Tensorboard using the following command." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "gHkGlii3n29Q" + }, + "outputs": [], + "source": [ + "%pip install tensorboard" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "bE-nx9TY1P-M" + }, + "source": [ + "Add the `tensorboard` config at the end of config file we just created: `configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py`.\n", + "\n", + "```python\n", + "visualizer = dict(vis_backends=[dict(type='LocalVisBackend'),dict(type='TensorboardVisBackend')])\n", + "```\n", + "\n", + "After re-running the training command, Tensorboard file will be generated in the visualization folder `work_dirs/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance/{timestamp}/vis_data`.\n", + "We can use Tensorboard to view the loss, learning rate, and coco/bbox_mAP visualizations from a web link by running the following command:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "g8fZgokho5CE" + }, + "outputs": [], + "source": [ + "!tensorboard --logdir=work_dirs/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "GUZ7MPoaro-o" + }, + "source": [ + "## Testing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "VYmxtE0GunTB", + "outputId": "f440807c-1931-4810-b76d-617f73fde227" + }, + "outputs": [], + "source": [ + "!python tools/test.py configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py work_dirs/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance best_coco_bbox_mAP_epoch_300.pth --show-dir show_results" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "_cFocUqN0BCb" + }, + "source": [ + "Run the above test command, you can not only get the AP performance printed in the **Training** section, You can also automatically save the result images to the `work_dirs/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance/{timestamp}/show_results` folder. Below is one of the result images, the left image is the actual annotation, and the right image is the inference result of the model.\n", + "\n", + "
\n", + "\"result_img\"/\n", + "
\n", + "\n", + "You can also visualize model inference results in a browser window if you use `WandbVisBackend` or `TensorboardVisBackend`.\n", + "\n", + "## Feature map visualization\n", + "\n", + "MMYOLO provides visualization scripts for feature map to analyze the current model training. Please refer to [Feature Map Visualization](../recommended_topics/visualization.md)\n", + "\n", + "Due to the bias of direct visualization of `test_pipeline`, we need to modify the `test_pipeline` of `configs/yolov5/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py`\n", + "\n", + "```python\n", + "test_pipeline = [\n", + " dict(\n", + " type='LoadImageFromFile',\n", + " file_client_args=_base_.file_client_args),\n", + " dict(type='YOLOv5KeepRatioResize', scale=img_scale),\n", + " dict(\n", + " type='LetterResize',\n", + " scale=img_scale,\n", + " allow_scale_up=False,\n", + " pad_val=dict(img=114)),\n", + " dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'),\n", + " dict(\n", + " type='mmdet.PackDetInputs',\n", + " meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',\n", + " 'scale_factor', 'pad_param'))\n", + "]\n", + "```\n", + "\n", + "to the following config:\n", + "\n", + "```python\n", + "test_pipeline = [\n", + " dict(\n", + " type='LoadImageFromFile',\n", + " file_client_args=_base_.file_client_args),\n", + " dict(type='mmdet.Resize', scale=img_scale, keep_ratio=False), # modify the LetterResize to mmdet.Resize\n", + " dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'),\n", + " dict(\n", + " type='mmdet.PackDetInputs',\n", + " meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',\n", + " 'scale_factor'))\n", + "]\n", + "```\n", + "\n", + "Let's choose the `data/balloon/train/3927754171_9011487133_b.jpg` image as an example to visualize the output feature maps of YOLOv5 backbone and neck layers.\n", + "\n", + "**1. Visualize the three channels of YOLOv5 backbone**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python demo/featmap_vis_demo.py data/balloon/train/3927754171_9011487133_b.jpg onfigs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py work_dirs/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance/best_coco_bbox_mAP_epoch_300.pth --target-layers backbone --channel-reduction squeeze_mean" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + "\"image\"/\n", + "
\n", + "\n", + "The result will be saved to the output folder in current path. Three output feature maps plotted in the above figure correspond to small, medium and large output feature maps.\n", + "\n", + "**2. Visualize the three channels of YOLOv5 neck**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python demo/featmap_vis_demo.py data/balloon/train/3927754171_9011487133_b.jpg \\\n", + " configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py \\\n", + " work_dirs/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance/best_coco_bbox_mAP_epoch_300.pth \\\n", + " --target-layers neck \\\n", + " --channel-reduction squeeze_mean" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + "\"image\"/\n", + "
\n", + "\n", + "**3. Grad-Based CAM visualization**\n", + "TODO" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## EasyDeploy deployment\n", + "TODO\n", + "\n", + "This completes the transformation deployment of the trained model and checks the inference results. This is the end of the tutorial.\n", + "\n", + "If you encounter problems during training or testing, please check the [common troubleshooting steps](https://mmyolo.readthedocs.io/en/dev/recommended_topics/troubleshooting_steps.html) first and feel free to open an [issue](https://github.com/open-mmlab/mmyolo/issues/new/choose) if you still can't solve it." + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "provenance": [], + "toc_visible": true + }, + "gpuClass": "standard", + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/third_party/mmyolo/demo/15_minutes_object_detection.ipynb b/third_party/mmyolo/demo/15_minutes_object_detection.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..47e0ccbd803c808982b2a30d55b640f0b1bd48da --- /dev/null +++ b/third_party/mmyolo/demo/15_minutes_object_detection.ipynb @@ -0,0 +1,1002 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "x7seefPduh36" + }, + "source": [ + "
\n", + " \n", + "
 
\n", + "
\n", + " OpenMMLab website\n", + " \n", + " \n", + " HOT\n", + " \n", + " \n", + "     \n", + " OpenMMLab platform\n", + " \n", + " \n", + " TRY IT OUT\n", + " \n", + " \n", + "
\n", + "
 
\n", + "\n", + "\"Open\n", + "\n", + "[![PyPI](https://img.shields.io/pypi/v/mmyolo)](https://pypi.org/project/mmyolo)\n", + "[![docs](https://img.shields.io/badge/docs-latest-blue)](https://mmyolo.readthedocs.io/en/latest/)\n", + "[![deploy](https://github.com/open-mmlab/mmyolo/workflows/deploy/badge.svg)](https://github.com/open-mmlab/mmyolo/actions)\n", + "[![codecov](https://codecov.io/gh/open-mmlab/mmyolo/branch/main/graph/badge.svg)](https://codecov.io/gh/open-mmlab/mmyolo)\n", + "[![license](https://img.shields.io/github/license/open-mmlab/mmyolo.svg)](https://github.com/open-mmlab/mmyolo/blob/main/LICENSE)\n", + "[![open issues](https://isitmaintained.com/badge/open/open-mmlab/mmyolo.svg)](https://github.com/open-mmlab/mmyolo/issues)\n", + "[![issue resolution](https://isitmaintained.com/badge/resolution/open-mmlab/mmyolo.svg)](https://github.com/open-mmlab/mmyolo/issues)\n", + "\n", + "[📘Documentation](https://mmyolo.readthedocs.io/en/latest/) |\n", + "[🛠️Installation](https://mmyolo.readthedocs.io/en/latest/get_started/installation.html) |\n", + "[👀Model Zoo](https://mmyolo.readthedocs.io/en/latest/model_zoo.html) |\n", + "[🆕Update News](https://mmyolo.readthedocs.io/en/latest/notes/changelog.html) |\n", + "[🤔Reporting Issues](https://github.com/open-mmlab/mmyolo/issues/new/choose)\n", + "\n", + "
\n", + "\n", + "
\n", + " \n", + " \"\"\n", + " \"\"\n", + " \n", + " \"\"\n", + " \"\"\n", + " \n", + " \"\"\n", + " \"\"\n", + " \n", + " \"\"\n", + " \"\"\n", + " \n", + " \"\"\n", + " \"\"\n", + " \n", + " \"\"\n", + "
" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "V6W8P5XEJGoc" + }, + "source": [ + "# 15 minutes to get started with MMYOLO object detection\n", + "\n", + "Object detection task refers to that given a picture, the network predicts all the categories of objects included in the picture and the corresponding boundary boxes\n", + "\n", + "
\n", + "\"object\n", + "
\n", + "\n", + "Take the small dataset of cat as an example, you can easily learn MMYOLO object detection in 15 minutes. The whole process consists of the following steps:\n", + "\n", + "- [Installation](#installation)\n", + "- [Dataset](#dataset)\n", + "- [Config](#config)\n", + "- [Training](#training)\n", + "- [Testing](#testing)\n", + "- [EasyDeploy](#easydeploy-deployment)\n", + "\n", + "In this tutorial, we take YOLOv5-s as an example. For the rest of the YOLO series algorithms, please see the corresponding algorithm configuration folder." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "Ae5SqsA7wYGQ" + }, + "source": [ + "## Installation\n", + "\n", + "Assuming you've already installed Conda in advance, then install PyTorch using the following commands." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "XVLRaEIzwW-6", + "outputId": "901b5db6-b1d7-4830-e746-485ee76d6648" + }, + "outputs": [], + "source": [ + "# -----------------------------------------------------------------------------------------\n", + "# If you are using colab, you can skip this cell for PyTorch is pre-installed on the colab.\n", + "# -----------------------------------------------------------------------------------------\n", + "!python -V\n", + "# Check nvcc version\n", + "!nvcc -V\n", + "# Check GCC version\n", + "!gcc --version\n", + "# Create a new Conda environment\n", + "%conda create -n mmyolo python=3.8 -y\n", + "%conda activate mmyolo\n", + "# If you have GPU\n", + "%conda install pytorch torchvision -c pytorch\n", + "# If you only have CPU\n", + "# %conda install pytorch torchvision cpuonly -c pytorch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Check PyTorch version\n", + "import torch\n", + "print(torch.__version__)\n", + "print(torch.cuda.is_available())" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install MMYOLO and dependency libraries using the following commands.\n", + "For details about how to configure the environment, see [Installation and verification](https://mmyolo.readthedocs.io/en/latest/get_started/installation.html).\n", + "```{note}\n", + "Note: Since this repo uses OpenMMLab 2.0, it is better to create a new conda virtual environment to prevent conflicts with the repo installed in OpenMMLab 1.0.\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "-qATUuntwmfD", + "outputId": "24be577b-efce-46f2-8b2f-a65d02824467" + }, + "outputs": [], + "source": [ + "!git clone https://github.com/open-mmlab/mmyolo.git\n", + "%cd mmyolo\n", + "%pip install -U openmim\n", + "!mim install -r requirements/mminstall.txt\n", + "# Install albumentations\n", + "!mim install -r requirements/albu.txt\n", + "# Install MMYOLO\n", + "!mim install -v -e .\n", + "# \"-v\" means verbose, or more output\n", + "# \"-e\" means installing a project in editable mode,\n", + "# thus any local modifications made to the code will take effect without reinstallation." 
+ ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Dataset\n", + "\n", + "The Cat dataset is a single-category dataset consisting of 144 pictures (the original pictures are provided by @RangeKing, and cleaned by @PeterH0323), which contains the annotation information required for training. The sample image is shown below:\n", + "\n", + "
\n", + "\"cat\n", + "
\n", + "\n", + "You can download and use it directly by the following command:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "gMQXwWuIw3ef", + "outputId": "c8efeac7-5b0c-4342-b5af-d3e790e358c3" + }, + "outputs": [], + "source": [ + "!python tools/misc/download_dataset.py --dataset-name cat --save-dir ./data/cat --unzip --delete" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "covQskXXw2ul" + }, + "source": [ + "This dataset is automatically downloaded to the `./data/cat` dir with the following directory structure:\n", + "\n", + "
\n", + "\"image\"/\n", + "
\n", + "\n", + "The cat dataset is located in the mmyolo project dir, and `data/cat/annotations` stores annotations in COCO format, and `data/cat/images` stores all images\n", + "\n", + "## Config\n", + "\n", + "Taking YOLOv5 algorithm as an example, considering the limited GPU memory of users, we need to modify some default training parameters to make them run smoothly. The key parameters to be modified are as follows:\n", + "\n", + "- YOLOv5 is an Anchor-Based algorithm, and different datasets need to calculate suitable anchors adaptively\n", + "- The default config uses 8 GPUs with a batch size of 16 per GPU. Now change it to a single GPU with a batch size of 12.\n", + "- The default training epoch is 300. Change it to 40 epoch\n", + "- Given the small size of the dataset, we opted to use fixed backbone weights\n", + "- In principle, the learning rate should be linearly scaled accordingly when the batch size is changed, but actual measurements have found that this is not necessary\n", + "\n", + "Create a `yolov5_s-v61_fast_1xb12-40e_cat.py` config file in the `configs/yolov5` folder (we have provided this config for you to use directly) and copy the following into the config file.\n", + "\n", + "```python\n", + "# Inherit and overwrite part of the config based on this config\n", + "_base_ = 'yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py'\n", + "\n", + "data_root = './data/cat/' # dataset root\n", + "class_name = ('cat', ) # dataset category name\n", + "num_classes = len(class_name) # dataset category number\n", + "# metainfo is a configuration that must be passed to the dataloader, otherwise it is invalid\n", + "# palette is a display color for category at visualization\n", + "# The palette length must be greater than or equal to the length of the classes\n", + "metainfo = dict(classes=class_name, palette=[(20, 220, 60)])\n", + "\n", + "# Adaptive anchor based on tools/analysis_tools/optimize_anchors.py\n", + "anchors = [\n", + " [(68, 69), (154, 91), (143, 162)], # P3/8\n", + " [(242, 160), (189, 287), (391, 207)], # P4/16\n", + " [(353, 337), (539, 341), (443, 432)] # P5/32\n", + "]\n", + "# Max training 40 epoch\n", + "max_epochs = 40\n", + "# bs = 12\n", + "train_batch_size_per_gpu = 12\n", + "# dataloader num workers\n", + "train_num_workers = 4\n", + "\n", + "# load COCO pre-trained weight\n", + "load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth' # noqa\n", + "\n", + "model = dict(\n", + " # Fixed the weight of the entire backbone without training\n", + " backbone=dict(frozen_stages=4),\n", + " bbox_head=dict(\n", + " head_module=dict(num_classes=num_classes),\n", + " prior_generator=dict(base_sizes=anchors)\n", + " ))\n", + "\n", + "train_dataloader = dict(\n", + " batch_size=train_batch_size_per_gpu,\n", + " num_workers=train_num_workers,\n", + " dataset=dict(\n", + " data_root=data_root,\n", + " metainfo=metainfo,\n", + " # Dataset annotation file of json path\n", + " ann_file='annotations/trainval.json',\n", + " # Dataset prefix\n", + " data_prefix=dict(img='images/')))\n", + "\n", + "val_dataloader = dict(\n", + " dataset=dict(\n", + " metainfo=metainfo,\n", + " data_root=data_root,\n", + " ann_file='annotations/test.json',\n", + " data_prefix=dict(img='images/')))\n", + "\n", + "test_dataloader = val_dataloader\n", + "\n", + "_base_.optim_wrapper.optimizer.batch_size_per_gpu = train_batch_size_per_gpu\n", + "\n", + "val_evaluator = dict(ann_file=data_root + 
'annotations/test.json')\n", + "test_evaluator = val_evaluator\n", + "\n", + "default_hooks = dict(\n", + " # Save weights every 10 epochs and a maximum of two weights can be saved.\n", + " # The best model is saved automatically during model evaluation\n", + " checkpoint=dict(interval=10, max_keep_ckpts=2, save_best='auto'),\n", + " # The warmup_mim_iter parameter is critical.\n", + " # The default value is 1000 which is not suitable for cat datasets.\n", + " param_scheduler=dict(max_epochs=max_epochs, warmup_mim_iter=10),\n", + " # The log printing interval is 5\n", + " logger=dict(type='LoggerHook', interval=5))\n", + "# The evaluation interval is 10\n", + "train_cfg = dict(max_epochs=max_epochs, val_interval=10)\n", + "```\n", + "\n", + "The above config is inherited from `yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py`. According to the characteristics of cat dataset updated `data_root`, `metainfo`, `train_dataloader`, `val_dataloader`, `num_classes` and other config.\n", + "\n", + "## Training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python tools/train.py configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "TQ0h6sv_rJxq" + }, + "source": [ + "Run the above training command, `work_dirs/yolov5_s-v61_fast_1xb12-40e_cat` folder will be automatically generated, the checkpoint file and the training config file will be saved in this folder. On a low-end 1660 GPU, the entire training process takes about eight minutes.\n", + "\n", + "
\n", + "\"image\"/\n", + "
\n", + "\n", + "The performance on `test.json` is as follows:\n", + "\n", + "```text\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.631\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.909\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.747\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = -1.000\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = -1.000\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.631\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.627\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.703\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.703\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = -1.000\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = -1.000\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.703\n", + "```\n", + "\n", + "The above properties are printed via the COCO API, where -1 indicates that no object exists for the scale. According to the rules defined by COCO, the Cat dataset contains all large sized objects, and there are no small or medium-sized objects.\n", + "\n", + "### Some Notes\n", + "\n", + "Two key warnings are printed during training:\n", + "\n", + "- You are using `YOLOv5Head` with num_classes == 1. The loss_cls will be 0. This is a normal phenomenon.\n", + "- The model and loaded state dict do not match exactly\n", + "\n", + "Neither of these warnings will have any impact on performance. The first warning is because the `num_classes` currently trained is 1, the loss of the classification branch is always 0 according to the community of the YOLOv5 algorithm, which is a normal phenomenon. The second warning is because we are currently training in fine-tuning mode, we load the COCO pre-trained weights for 80 classes,\n", + "This will lead to the final Head module convolution channel number does not correspond, resulting in this part of the weight can not be loaded, which is also a normal phenomenon.\n", + "\n", + "### Training is resumed after the interruption\n", + "\n", + "If you stop training, you can add `--resume` to the end of the training command and the program will automatically resume training with the latest weights file from `work_dirs`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python tools/train.py configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py --resume" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "3sJxvQoUrMhX" + }, + "source": [ + "### Save GPU memory strategy\n", + "\n", + "The above config requires about 3G RAM, so if you don't have enough, consider turning on mixed-precision training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python tools/train.py configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py --amp" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "jVJdyHTxrQ9a" + }, + "source": [ + "### Training visualization\n", + "\n", + "MMYOLO currently supports local, TensorBoard, WandB and other back-end visualization. 
The default is local visualization; you can switch to WandB or other backends to visualize various training metrics in real time.\n",
+    "\n",
+    "#### 1 WandB\n",
+    "\n",
+    "WandB visualization requires registering an account on the website first; you can then obtain your API key from https://wandb.ai/settings.\n",
+    "\n",
+    "<div align=center>
\n", + "\"image\"/\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install wandb\n", + "# After running wandb login, enter the API Keys obtained above, and the login is successful.\n", + "!wandb login" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "Yu0_4YYRrbyY" + }, + "source": [ + "Add the wandb config at the end of config file we just created: `configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py`.\n", + "\n", + "```python\n", + "visualizer = dict(vis_backends = [dict(type='LocalVisBackend'), dict(type='WandbVisBackend')])\n", + "```\n", + "\n", + "Running the training command and you will see the loss, learning rate, and coco/bbox_mAP visualizations in the link." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python tools/train.py configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "f_DyzfDIzwMa" + }, + "source": [ + "
\n", + "\"image\"/\n", + "
\n", + "
\n", + "\"image\"/\n", + "
\n", + "\n", + "#### 2 Tensorboard\n", + "\n", + "Install Tensorboard using the following command." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "gHkGlii3n29Q" + }, + "outputs": [], + "source": [ + "%pip install tensorboard" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "bE-nx9TY1P-M" + }, + "source": [ + "Add the `tensorboard` config at the end of config file we just created: `configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py`.\n", + "\n", + "```python\n", + "visualizer = dict(vis_backends=[dict(type='LocalVisBackend'),dict(type='TensorboardVisBackend')])\n", + "```\n", + "\n", + "After re-running the training command, Tensorboard file will be generated in the visualization folder `work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/{timestamp}/vis_data`.\n", + "We can use Tensorboard to view the loss, learning rate, and coco/bbox_mAP visualizations from a web link by running the following command:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "g8fZgokho5CE" + }, + "outputs": [], + "source": [ + "!tensorboard --logdir=work_dirs/yolov5_s-v61_fast_1xb12-40e_cat" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "GUZ7MPoaro-o" + }, + "source": [ + "## Testing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "VYmxtE0GunTB", + "outputId": "f440807c-1931-4810-b76d-617f73fde227" + }, + "outputs": [], + "source": [ + "!python tools/test.py configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \\\n", + " work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/epoch_40.pth \\\n", + " --show-dir show_results" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "_cFocUqN0BCb" + }, + "source": [ + "Run the above test command, you can not only get the AP performance printed in the **Training** section, You can also automatically save the result images to the `work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/{timestamp}/show_results` folder. Below is one of the result images, the left image is the actual annotation, and the right image is the inference result of the model.\n", + "\n", + "
\n", + "\"result_img\"/\n", + "
\n", + "\n", + "You can also visualize model inference results in a browser window if you use 'WandbVisBackend' or 'TensorboardVisBackend'.\n", + "\n", + "## Feature map visualization\n", + "\n", + "MMYOLO provides visualization scripts for feature map to analyze the current model training. Please refer to [Feature Map Visualization](../recommended_topics/visualization.md)\n", + "\n", + "Due to the bias of direct visualization of `test_pipeline`, we need modify the `test_pipeline` of `configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py`,\n", + "\n", + "```python\n", + "test_pipeline = [\n", + " dict(\n", + " type='LoadImageFromFile',\n", + " file_client_args=_base_.file_client_args),\n", + " dict(type='YOLOv5KeepRatioResize', scale=img_scale),\n", + " dict(\n", + " type='LetterResize',\n", + " scale=img_scale,\n", + " allow_scale_up=False,\n", + " pad_val=dict(img=114)),\n", + " dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'),\n", + " dict(\n", + " type='mmdet.PackDetInputs',\n", + " meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',\n", + " 'scale_factor', 'pad_param'))\n", + "]\n", + "```\n", + "\n", + "to the following config:\n", + "\n", + "```python\n", + "test_pipeline = [\n", + " dict(\n", + " type='LoadImageFromFile',\n", + " file_client_args=_base_.file_client_args),\n", + " dict(type='mmdet.Resize', scale=img_scale, keep_ratio=False), # modify the LetterResize to mmdet.Resize\n", + " dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'),\n", + " dict(\n", + " type='mmdet.PackDetInputs',\n", + " meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',\n", + " 'scale_factor'))\n", + "]\n", + "```\n", + "\n", + "Let's choose the `data/cat/images/IMG_20221020_112705.jpg` image as an example to visualize the output feature maps of YOLOv5 backbone and neck layers.\n", + "\n", + "**1. Visualize the three channels of YOLOv5 backbone**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python demo/featmap_vis_demo.py data/cat/images/IMG_20221020_112705.jpg \\\n", + " configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \\\n", + " work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/epoch_40.pth \\\n", + " --target-layers backbone \\\n", + " --channel-reduction squeeze_mean" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + "\"image\"/\n", + "
\n", + "\n", + "The result will be saved to the output folder in current path. Three output feature maps plotted in the above figure correspond to small, medium and large output feature maps. As the backbone of this training is not actually involved in training, it can be seen from the above figure that the big object cat is predicted on the small feature map, which is in line with the idea of hierarchical detection of object detection.\n", + "\n", + "**2. Visualize the three channels of YOLOv5 neck**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python demo/featmap_vis_demo.py data/cat/images/IMG_20221020_112705.jpg \\\n", + " configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \\\n", + " work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/epoch_40.pth \\\n", + " --target-layers neck \\\n", + " --channel-reduction squeeze_mean" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + "\"image\"/\n", + "
\n", + "\n", + "As can be seen from the above figure, because neck is involved in training, and we also reset anchor, the three output feature maps are forced to simulate the same scale object, resulting in the three output maps of neck are similar, which destroys the original pre-training distribution of backbone. At the same time, it can also be seen that 40 epochs are not enough to train the above dataset, and the feature maps do not perform well.\n", + "\n", + "**3. Grad-Based CAM visualization**\n", + "\n", + "Based on the above feature map visualization, we can analyze Grad CAM at the feature layer of bbox level.\n", + "\n", + "Install `grad-cam` package:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install \"grad-cam\"" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "(a) View Grad CAM of the minimum output feature map of the neck" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python demo/boxam_vis_demo.py data/cat/images/IMG_20221020_112705.jpg \\\n", + " configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \\\n", + " work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/epoch_40.pth \\\n", + " --target-layer neck.out_layers[2]" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "9v-dMkePvHMg" + }, + "source": [ + "
\n", + "\"image\"/\n", + "
\n", + "\n", + "(b) View Grad CAM of the medium output feature map of the neck" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "p9H9u0A-3KAD", + "outputId": "32ca5a56-052f-4930-f53c-41cc3a9dc619" + }, + "outputs": [], + "source": [ + "!python demo/boxam_vis_demo.py data/cat/images/IMG_20221020_112705.jpg \\\n", + " configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \\\n", + " work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/epoch_40.pth \\\n", + " --target-layer neck.out_layers[1]" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "(c) View Grad CAM of the maximum output feature map of the neck" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "MrKan1U43uUY", + "outputId": "690f8414-a76b-4fa6-e600-7cc874ce1914" + }, + "outputs": [], + "source": [ + "!python demo/boxam_vis_demo.py data/cat/images/IMG_20221020_112705.jpg \\\n", + " configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \\\n", + " work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/epoch_40.pth \\\n", + " --target-layer neck.out_layers[0]" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + "\"image\"/\n", + "
\n", + "\n", + "## EasyDeploy deployment\n", + "\n", + "Here we'll use MMYOLO's [EasyDeploy](../../../projects/easydeploy/) to demonstrate the transformation deployment and basic inference of model.\n", + "\n", + "First you need to follow EasyDeploy's [basic documentation](../../../projects/easydeploy/docs/model_convert.md) controls own equipment installed for each library.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install onnx\n", + "%pip install onnx-simplifier # Install if you want to use simplify\n", + "%pip install tensorrt # If you have GPU environment and need to output TensorRT model you need to continue execution" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once installed, you can use the following command to transform and deploy the trained model on the cat dataset with one click. The current ONNX version is 1.13.0 and TensorRT version is 8.5.3.1, so keep the `--opset` value of 11. The remaining parameters need to be adjusted according to the config used. Here we export the CPU version of ONNX with the `--backend` set to 1." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 534 + }, + "id": "YsRFEecU5C0w", + "outputId": "c26011d4-2836-4715-cd6b-68836294db33" + }, + "outputs": [], + "source": [ + "!python projects/easydeploy/tools/export.py \\\n", + "\t configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \\\n", + "\t work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/epoch_40.pth \\\n", + "\t --work-dir work_dirs/yolov5_s-v61_fast_1xb12-40e_cat \\\n", + " --img-size 640 640 \\\n", + " --batch 1 \\\n", + " --device cpu \\\n", + " --simplify \\\n", + "\t --opset 11 \\\n", + "\t --backend 1 \\\n", + "\t --pre-topk 1000 \\\n", + "\t --keep-topk 100 \\\n", + "\t --iou-threshold 0.65 \\\n", + "\t --score-threshold 0.25\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "q1EY415x3Idx" + }, + "source": [ + "On success, you will get the converted ONNX model under `work-dir`, which is named `end2end.onnx` by default.\n", + "\n", + "Let's use `end2end.onnx` model to perform a basic image inference:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python projects/easydeploy/tools/image-demo.py \\\n", + " data/cat/images/IMG_20210728_205312.jpg \\\n", + " configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \\\n", + " work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/end2end.onnx \\\n", + " --device cpu" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "IrjiBa5YwDQM" + }, + "source": [ + "After successful inference, the result image will be generated in the `output` folder of the default MMYOLO root directory. If you want to see the result without saving it, you can add `--show` to the end of the above command. For convenience, the following is the generated result.\n", + "\n", + "
\n", + "\"image\"/\n", + "
\n", + "\n", + "Let's go on to convert the engine file for TensorRT, because TensorRT needs to be specific to the current environment and deployment version, so make sure to export the parameters, here we export the TensorRT8 file, the `--backend` is 2." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "d8zxczqiBLoB" + }, + "outputs": [], + "source": [ + "!python projects/easydeploy/tools/export.py \\\n", + " configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \\\n", + " work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/epoch_40.pth \\\n", + " --work-dir work_dirs/yolov5_s-v61_fast_1xb12-40e_cat \\\n", + " --img-size 640 640 \\\n", + " --batch 1 \\\n", + " --device cuda:0 \\\n", + " --simplify \\\n", + " --opset 11 \\\n", + " --backend 2 \\\n", + " --pre-topk 1000 \\\n", + " --keep-topk 100 \\\n", + " --iou-threshold 0.65 \\\n", + " --score-threshold 0.25" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The resulting `end2end.onnx` is the ONNX file for the TensorRT8 deployment, which we will use to complete the TensorRT engine transformation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "QFh8rIsX_kVw", + "outputId": "c5bd6929-03a8-400e-be1e-581f32b23f61" + }, + "outputs": [], + "source": [ + "!python projects/easydeploy/tools/build_engine.py \\\n", + " work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/end2end.onnx \\\n", + " --img-size 640 640 \\\n", + " --device cuda:0" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Successful execution will generate the `end2end.engine` file under `work-dir`:\n", + "\n", + "```shell\n", + "work_dirs/yolov5_s-v61_fast_1xb12-40e_cat\n", + "├── 202302XX_XXXXXX\n", + "│ ├── 202302XX_XXXXXX.log\n", + "│ └── vis_data\n", + "│ ├── 202302XX_XXXXXX.json\n", + "│ ├── config.py\n", + "│ └── scalars.json\n", + "├── best_coco\n", + "│ └── bbox_mAP_epoch_40.pth\n", + "├── end2end.engine\n", + "├── end2end.onnx\n", + "├── epoch_30.pth\n", + "├── epoch_40.pth\n", + "├── last_checkpoint\n", + "└── yolov5_s-v61_fast_1xb12-40e_cat.py\n", + "```\n", + "\n", + "Let's continue use `image-demo.py` for image inference:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "rOqXEi-jAI7Y", + "outputId": "2a21aaaa-d4ba-498a-f985-2a6a2b8d348f" + }, + "outputs": [], + "source": [ + "!python projects/easydeploy/tools/image-demo.py \\\n", + " data/cat/images/IMG_20210728_205312.jpg \\\n", + " configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \\\n", + " work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/end2end.engine \\\n", + " --device cuda:0" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "ocHGUUEA_TjI" + }, + "source": [ + "
\n", + "\"image\"/\n", + "
\n", + "\n", + "This completes the transformation deployment of the trained model and checks the inference results. This is the end of the tutorial.\n", + "\n", + "If you encounter problems during training or testing, please check the [common troubleshooting steps](https://mmyolo.readthedocs.io/en/dev/recommended_topics/troubleshooting_steps.html) first and feel free to open an [issue](https://github.com/open-mmlab/mmyolo/issues/new/choose) if you still can't solve it.\n" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "provenance": [], + "toc_visible": true + }, + "gpuClass": "standard", + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/third_party/mmyolo/demo/boxam_vis_demo.py b/third_party/mmyolo/demo/boxam_vis_demo.py new file mode 100644 index 0000000000000000000000000000000000000000..278574f89fe5427cb5be7b9a7fd99f70de090bd4 --- /dev/null +++ b/third_party/mmyolo/demo/boxam_vis_demo.py @@ -0,0 +1,276 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""This script is in the experimental verification stage and cannot be +guaranteed to be completely correct. Currently Grad-based CAM and Grad-free CAM +are supported. + +The target detection task is different from the classification task. It not +only includes the AM map of the category, but also includes information such as +bbox and mask, so this script is named bboxam. +""" + +import argparse +import os.path +import warnings +from functools import partial + +import cv2 +import mmcv +from mmengine import Config, DictAction, MessageHub +from mmengine.utils import ProgressBar + +try: + from pytorch_grad_cam import AblationCAM, EigenCAM +except ImportError: + raise ImportError('Please run `pip install "grad-cam"` to install ' + 'pytorch_grad_cam package.') + +from mmyolo.utils.boxam_utils import (BoxAMDetectorVisualizer, + BoxAMDetectorWrapper, DetAblationLayer, + DetBoxScoreTarget, GradCAM, + GradCAMPlusPlus, reshape_transform) +from mmyolo.utils.misc import get_file_list + +GRAD_FREE_METHOD_MAP = { + 'ablationcam': AblationCAM, + 'eigencam': EigenCAM, + # 'scorecam': ScoreCAM, # consumes too much memory +} + +GRAD_BASED_METHOD_MAP = {'gradcam': GradCAM, 'gradcam++': GradCAMPlusPlus} + +ALL_SUPPORT_METHODS = list(GRAD_FREE_METHOD_MAP.keys() + | GRAD_BASED_METHOD_MAP.keys()) + +IGNORE_LOSS_PARAMS = { + 'yolov5': ['loss_obj'], + 'yolov6': ['loss_cls'], + 'yolox': ['loss_obj'], + 'rtmdet': ['loss_cls'], + 'yolov7': ['loss_obj'], + 'yolov8': ['loss_cls'], + 'ppyoloe': ['loss_cls'], +} + +# This parameter is required in some algorithms +# for calculating Loss +message_hub = MessageHub.get_current_instance() +message_hub.runtime_info['epoch'] = 0 + + +def parse_args(): + parser = argparse.ArgumentParser(description='Visualize Box AM') + parser.add_argument( + 'img', help='Image path, include image file, dir and URL.') + parser.add_argument('config', help='Config file') + parser.add_argument('checkpoint', help='Checkpoint file') + parser.add_argument( + '--method', + default='gradcam', + choices=ALL_SUPPORT_METHODS, + help='Type of method to use, supports ' + f'{", ".join(ALL_SUPPORT_METHODS)}.') + parser.add_argument( + '--target-layers', + default=['neck.out_layers[2]'], + nargs='+', + type=str, + help='The target layers to get Box AM, if not set, the tool will ' + 'specify the neck.out_layers[2]') + parser.add_argument( + '--out-dir', default='./output', help='Path to output file') + 
parser.add_argument( + '--show', action='store_true', help='Show the CAM results') + parser.add_argument( + '--device', default='cuda:0', help='Device used for inference') + parser.add_argument( + '--score-thr', type=float, default=0.3, help='Bbox score threshold') + parser.add_argument( + '--topk', + type=int, + default=-1, + help='Select topk predict resutls to show. -1 are mean all.') + parser.add_argument( + '--max-shape', + nargs='+', + type=int, + default=-1, + help='max shapes. Its purpose is to save GPU memory. ' + 'The activation map is scaled and then evaluated. ' + 'If set to -1, it means no scaling.') + parser.add_argument( + '--preview-model', + default=False, + action='store_true', + help='To preview all the model layers') + parser.add_argument( + '--norm-in-bbox', action='store_true', help='Norm in bbox of am image') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + # Only used by AblationCAM + parser.add_argument( + '--batch-size', + type=int, + default=1, + help='batch of inference of AblationCAM') + parser.add_argument( + '--ratio-channels-to-ablate', + type=int, + default=0.5, + help='Making it much faster of AblationCAM. ' + 'The parameter controls how many channels should be ablated') + + args = parser.parse_args() + return args + + +def init_detector_and_visualizer(args, cfg): + max_shape = args.max_shape + if not isinstance(max_shape, list): + max_shape = [args.max_shape] + assert len(max_shape) == 1 or len(max_shape) == 2 + + model_wrapper = BoxAMDetectorWrapper( + cfg, args.checkpoint, args.score_thr, device=args.device) + + if args.preview_model: + print(model_wrapper.detector) + print('\n Please remove `--preview-model` to get the BoxAM.') + return None, None + + target_layers = [] + for target_layer in args.target_layers: + try: + target_layers.append( + eval(f'model_wrapper.detector.{target_layer}')) + except Exception as e: + print(model_wrapper.detector) + raise RuntimeError('layer does not exist', e) + + ablationcam_extra_params = { + 'batch_size': args.batch_size, + 'ablation_layer': DetAblationLayer(), + 'ratio_channels_to_ablate': args.ratio_channels_to_ablate + } + + if args.method in GRAD_BASED_METHOD_MAP: + method_class = GRAD_BASED_METHOD_MAP[args.method] + is_need_grad = True + else: + method_class = GRAD_FREE_METHOD_MAP[args.method] + is_need_grad = False + + boxam_detector_visualizer = BoxAMDetectorVisualizer( + method_class, + model_wrapper, + target_layers, + reshape_transform=partial( + reshape_transform, max_shape=max_shape, is_need_grad=is_need_grad), + is_need_grad=is_need_grad, + extra_params=ablationcam_extra_params) + return model_wrapper, boxam_detector_visualizer + + +def main(): + args = parse_args() + + # hard code + ignore_loss_params = None + for param_keys in IGNORE_LOSS_PARAMS: + if param_keys in args.config: + print(f'The algorithm currently used is {param_keys}') + ignore_loss_params = IGNORE_LOSS_PARAMS[param_keys] + break + + cfg = Config.fromfile(args.config) + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + + if not os.path.exists(args.out_dir) and not args.show: + os.mkdir(args.out_dir) + + 
model_wrapper, boxam_detector_visualizer = init_detector_and_visualizer( + args, cfg) + + # get file list + image_list, source_type = get_file_list(args.img) + + progress_bar = ProgressBar(len(image_list)) + + for image_path in image_list: + image = cv2.imread(image_path) + model_wrapper.set_input_data(image) + + # forward detection results + result = model_wrapper()[0] + + pred_instances = result.pred_instances + # Get candidate predict info with score threshold + pred_instances = pred_instances[pred_instances.scores > args.score_thr] + + if len(pred_instances) == 0: + warnings.warn('empty detection results! skip this') + continue + + if args.topk > 0: + pred_instances = pred_instances[:args.topk] + + targets = [ + DetBoxScoreTarget( + pred_instances, + device=args.device, + ignore_loss_params=ignore_loss_params) + ] + + if args.method in GRAD_BASED_METHOD_MAP: + model_wrapper.need_loss(True) + model_wrapper.set_input_data(image, pred_instances) + boxam_detector_visualizer.switch_activations_and_grads( + model_wrapper) + + # get box am image + grayscale_boxam = boxam_detector_visualizer(image, targets=targets) + + # draw cam on image + pred_instances = pred_instances.numpy() + image_with_bounding_boxes = boxam_detector_visualizer.show_am( + image, + pred_instances, + grayscale_boxam, + with_norm_in_bboxes=args.norm_in_bbox) + + if source_type['is_dir']: + filename = os.path.relpath(image_path, args.img).replace('/', '_') + else: + filename = os.path.basename(image_path) + out_file = None if args.show else os.path.join(args.out_dir, filename) + + if out_file: + mmcv.imwrite(image_with_bounding_boxes, out_file) + else: + cv2.namedWindow(filename, 0) + cv2.imshow(filename, image_with_bounding_boxes) + cv2.waitKey(0) + + # switch + if args.method in GRAD_BASED_METHOD_MAP: + model_wrapper.need_loss(False) + boxam_detector_visualizer.switch_activations_and_grads( + model_wrapper) + + progress_bar.update() + + if not args.show: + print(f'All done!' + f'\nResults have been saved at {os.path.abspath(args.out_dir)}') + + +if __name__ == '__main__': + main() diff --git a/third_party/mmyolo/demo/demo.jpg b/third_party/mmyolo/demo/demo.jpg new file mode 100644 index 0000000000000000000000000000000000000000..dd613cee3bc13a3677908d7d6f1899e8278a4b47 Binary files /dev/null and b/third_party/mmyolo/demo/demo.jpg differ diff --git a/third_party/mmyolo/demo/demo.mp4 b/third_party/mmyolo/demo/demo.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..6c06d15d941c640e15785e0416818181313d83b7 Binary files /dev/null and b/third_party/mmyolo/demo/demo.mp4 differ diff --git a/third_party/mmyolo/demo/deploy_demo.py b/third_party/mmyolo/demo/deploy_demo.py new file mode 100644 index 0000000000000000000000000000000000000000..f5d08df47fc9740bc1d2ca837d5188f8b4eac267 --- /dev/null +++ b/third_party/mmyolo/demo/deploy_demo.py @@ -0,0 +1,120 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""Deploy demo for mmdeploy. + +This script help user to run mmdeploy demo after convert the +checkpoint to backends. 
+ +Usage: + python deploy_demo.py img \ + config \ + checkpoint \ + [--deploy-cfg DEPLOY_CFG] \ + [--device DEVICE] \ + [--out-dir OUT_DIR] \ + [--show] \ + [--score-thr SCORE_THR] + +Example: + python deploy_demo.py \ + ${MMYOLO_PATH}/data/cat/images \ + ./yolov5_s-v61_syncbn_fast_1xb32-100e_cat.py \ + ./end2end.engine \ + --deploy-cfg ./detection_tensorrt-fp16_dynamic-192x192-960x960.py \ + --out-dir ${MMYOLO_PATH}/work_dirs/deploy_predict_out \ + --device cuda:0 \ + --score-thr 0.5 +""" +import argparse +import os + +import torch +from mmengine import ProgressBar + +from mmyolo.utils.misc import get_file_list + +try: + from mmdeploy.apis.utils import build_task_processor + from mmdeploy.utils import get_input_shape, load_config +except ImportError: + raise ImportError( + 'mmdeploy is not installed, please see ' + 'https://mmdeploy.readthedocs.io/en/1.x/01-how-to-build/build_from_source.html' # noqa + ) + + +def parse_args(): + parser = argparse.ArgumentParser(description='For mmdeploy predict') + parser.add_argument( + 'img', help='Image path, include image file, dir and URL.') + parser.add_argument('config', help='model config root') + parser.add_argument('checkpoint', help='checkpoint backend model path') + parser.add_argument('--deploy-cfg', help='deploy config path') + parser.add_argument( + '--device', default='cuda:0', help='device used for conversion') + parser.add_argument( + '--out-dir', default='./output', help='Path to output file') + parser.add_argument( + '--show', action='store_true', help='Show the detection results') + parser.add_argument( + '--score-thr', type=float, default=0.3, help='Bbox score threshold') + args = parser.parse_args() + return args + + +# TODO Still need to refactor to not building dataset. +def main(): + args = parse_args() + + if not os.path.exists(args.out_dir) and not args.show: + os.mkdir(args.out_dir) + + # read deploy_cfg and config + deploy_cfg, model_cfg = load_config(args.deploy_cfg, args.config) + + # build task and backend model + task_processor = build_task_processor(model_cfg, deploy_cfg, args.device) + model = task_processor.build_backend_model([args.checkpoint]) + + # get model input shape + input_shape = get_input_shape(deploy_cfg) + + # get file list + files, source_type = get_file_list(args.img) + + # start detector inference + progress_bar = ProgressBar(len(files)) + for file in files: + # process input image + model_inputs, _ = task_processor.create_input(file, input_shape) + + # do model inference + with torch.no_grad(): + result = model.test_step(model_inputs) + + if source_type['is_dir']: + filename = os.path.relpath(file, args.img).replace('/', '_') + else: + filename = os.path.basename(file) + out_file = None if args.show else os.path.join(args.out_dir, filename) + + # filter score + result = result[0] + result.pred_instances = result.pred_instances[ + result.pred_instances.scores > args.score_thr] + + # visualize results + task_processor.visualize( + image=file, + model=model, + result=result, + show_result=args.show, + window_name=os.path.basename(filename), + output_file=out_file) + + progress_bar.update() + + print('All done!') + + +if __name__ == '__main__': + main() diff --git a/third_party/mmyolo/demo/dog.jpg b/third_party/mmyolo/demo/dog.jpg new file mode 100644 index 0000000000000000000000000000000000000000..77b0381222eaed50867643f4166092c781e56d5b Binary files /dev/null and b/third_party/mmyolo/demo/dog.jpg differ diff --git a/third_party/mmyolo/demo/featmap_vis_demo.py b/third_party/mmyolo/demo/featmap_vis_demo.py 
new file mode 100644 index 0000000000000000000000000000000000000000..892e73d616b0e629ddfcc276e8eb4ca289f5085b --- /dev/null +++ b/third_party/mmyolo/demo/featmap_vis_demo.py @@ -0,0 +1,199 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import os +from typing import Sequence + +import mmcv +from mmdet.apis import inference_detector, init_detector +from mmengine import Config, DictAction +from mmengine.registry import init_default_scope +from mmengine.utils import ProgressBar + +from mmyolo.registry import VISUALIZERS +from mmyolo.utils.misc import auto_arrange_images, get_file_list + + +def parse_args(): + parser = argparse.ArgumentParser(description='Visualize feature map') + parser.add_argument( + 'img', help='Image path, include image file, dir and URL.') + parser.add_argument('config', help='Config file') + parser.add_argument('checkpoint', help='Checkpoint file') + parser.add_argument( + '--out-dir', default='./output', help='Path to output file') + parser.add_argument( + '--target-layers', + default=['backbone'], + nargs='+', + type=str, + help='The target layers to get feature map, if not set, the tool will ' + 'specify the backbone') + parser.add_argument( + '--preview-model', + default=False, + action='store_true', + help='To preview all the model layers') + parser.add_argument( + '--device', default='cuda:0', help='Device used for inference') + parser.add_argument( + '--score-thr', type=float, default=0.3, help='Bbox score threshold') + parser.add_argument( + '--show', action='store_true', help='Show the featmap results') + parser.add_argument( + '--channel-reduction', + default='select_max', + help='Reduce multiple channels to a single channel') + parser.add_argument( + '--topk', + type=int, + default=4, + help='Select topk channel to show by the sum of each channel') + parser.add_argument( + '--arrangement', + nargs='+', + type=int, + default=[2, 2], + help='The arrangement of featmap when channel_reduction is ' + 'not None and topk > 0') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. 
key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + args = parser.parse_args() + return args + + +class ActivationsWrapper: + + def __init__(self, model, target_layers): + self.model = model + self.activations = [] + self.handles = [] + self.image = None + for target_layer in target_layers: + self.handles.append( + target_layer.register_forward_hook(self.save_activation)) + + def save_activation(self, module, input, output): + self.activations.append(output) + + def __call__(self, img_path): + self.activations = [] + results = inference_detector(self.model, img_path) + return results, self.activations + + def release(self): + for handle in self.handles: + handle.remove() + + +def main(): + args = parse_args() + + cfg = Config.fromfile(args.config) + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + + init_default_scope(cfg.get('default_scope', 'mmyolo')) + + channel_reduction = args.channel_reduction + if channel_reduction == 'None': + channel_reduction = None + assert len(args.arrangement) == 2 + + model = init_detector(args.config, args.checkpoint, device=args.device) + + if not os.path.exists(args.out_dir) and not args.show: + os.mkdir(args.out_dir) + + if args.preview_model: + print(model) + print('\n This flag is only show model, if you want to continue, ' + 'please remove `--preview-model` to get the feature map.') + return + + target_layers = [] + for target_layer in args.target_layers: + try: + target_layers.append(eval(f'model.{target_layer}')) + except Exception as e: + print(model) + raise RuntimeError('layer does not exist', e) + + activations_wrapper = ActivationsWrapper(model, target_layers) + + # init visualizer + visualizer = VISUALIZERS.build(model.cfg.visualizer) + visualizer.dataset_meta = model.dataset_meta + + # get file list + image_list, source_type = get_file_list(args.img) + + progress_bar = ProgressBar(len(image_list)) + for image_path in image_list: + result, featmaps = activations_wrapper(image_path) + if not isinstance(featmaps, Sequence): + featmaps = [featmaps] + + flatten_featmaps = [] + for featmap in featmaps: + if isinstance(featmap, Sequence): + flatten_featmaps.extend(featmap) + else: + flatten_featmaps.append(featmap) + + img = mmcv.imread(image_path) + img = mmcv.imconvert(img, 'bgr', 'rgb') + + if source_type['is_dir']: + filename = os.path.relpath(image_path, args.img).replace('/', '_') + else: + filename = os.path.basename(image_path) + out_file = None if args.show else os.path.join(args.out_dir, filename) + + # show the results + shown_imgs = [] + visualizer.add_datasample( + 'result', + img, + data_sample=result, + draw_gt=False, + show=False, + wait_time=0, + out_file=None, + pred_score_thr=args.score_thr) + drawn_img = visualizer.get_image() + + for featmap in flatten_featmaps: + shown_img = visualizer.draw_featmap( + featmap[0], + drawn_img, + channel_reduction=channel_reduction, + topk=args.topk, + arrangement=args.arrangement) + shown_imgs.append(shown_img) + + shown_imgs = auto_arrange_images(shown_imgs) + + progress_bar.update() + if out_file: + mmcv.imwrite(shown_imgs[..., ::-1], out_file) + + if args.show: + visualizer.show(shown_imgs) + + if not args.show: + print(f'All done!' 
+ f'\nResults have been saved at {os.path.abspath(args.out_dir)}') + + +# Please refer to the usage tutorial: +# https://github.com/open-mmlab/mmyolo/blob/main/docs/zh_cn/user_guides/visualization.md # noqa +if __name__ == '__main__': + main() diff --git a/third_party/mmyolo/demo/image_demo.py b/third_party/mmyolo/demo/image_demo.py new file mode 100644 index 0000000000000000000000000000000000000000..fa2cfb2a03f7e8328dd068851433d69c9f4a0db5 --- /dev/null +++ b/third_party/mmyolo/demo/image_demo.py @@ -0,0 +1,168 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +from argparse import ArgumentParser +from pathlib import Path + +import mmcv +from mmdet.apis import inference_detector, init_detector +from mmengine.config import Config, ConfigDict +from mmengine.logging import print_log +from mmengine.utils import ProgressBar, path + +from mmyolo.registry import VISUALIZERS +from mmyolo.utils import switch_to_deploy +from mmyolo.utils.labelme_utils import LabelmeFormat +from mmyolo.utils.misc import get_file_list, show_data_classes + + +def parse_args(): + parser = ArgumentParser() + parser.add_argument( + 'img', help='Image path, include image file, dir and URL.') + parser.add_argument('config', help='Config file') + parser.add_argument('checkpoint', help='Checkpoint file') + parser.add_argument( + '--out-dir', default='./output', help='Path to output file') + parser.add_argument( + '--device', default='cuda:0', help='Device used for inference') + parser.add_argument( + '--show', action='store_true', help='Show the detection results') + parser.add_argument( + '--deploy', + action='store_true', + help='Switch model to deployment mode') + parser.add_argument( + '--tta', + action='store_true', + help='Whether to use test time augmentation') + parser.add_argument( + '--score-thr', type=float, default=0.3, help='Bbox score threshold') + parser.add_argument( + '--class-name', + nargs='+', + type=str, + help='Only Save those classes if set') + parser.add_argument( + '--to-labelme', + action='store_true', + help='Output labelme style label file') + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + + if args.to_labelme and args.show: + raise RuntimeError('`--to-labelme` or `--show` only ' + 'can choose one at the same time.') + config = args.config + + if isinstance(config, (str, Path)): + config = Config.fromfile(config) + elif not isinstance(config, Config): + raise TypeError('config must be a filename or Config object, ' + f'but got {type(config)}') + if 'init_cfg' in config.model.backbone: + config.model.backbone.init_cfg = None + + if args.tta: + assert 'tta_model' in config, 'Cannot find ``tta_model`` in config.' \ + " Can't use tta !" + assert 'tta_pipeline' in config, 'Cannot find ``tta_pipeline`` ' \ + "in config. Can't use tta !" + config.model = ConfigDict(**config.tta_model, module=config.model) + test_data_cfg = config.test_dataloader.dataset + while 'dataset' in test_data_cfg: + test_data_cfg = test_data_cfg['dataset'] + + # batch_shapes_cfg will force control the size of the output image, + # it is not compatible with tta. + if 'batch_shapes_cfg' in test_data_cfg: + test_data_cfg.batch_shapes_cfg = None + test_data_cfg.pipeline = config.tta_pipeline + + # TODO: TTA mode will error if cfg_options is not set. + # This is an mmdet issue and needs to be fixed later. 
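+    # Passing an empty cfg_options dict below keeps the argument set and avoids that error.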
+ # build the model from a config file and a checkpoint file + model = init_detector( + config, args.checkpoint, device=args.device, cfg_options={}) + + if args.deploy: + switch_to_deploy(model) + + if not args.show: + path.mkdir_or_exist(args.out_dir) + + # init visualizer + visualizer = VISUALIZERS.build(model.cfg.visualizer) + visualizer.dataset_meta = model.dataset_meta + + # get file list + files, source_type = get_file_list(args.img) + + # get model class name + dataset_classes = model.dataset_meta.get('classes') + + # ready for labelme format if it is needed + to_label_format = LabelmeFormat(classes=dataset_classes) + + # check class name + if args.class_name is not None: + for class_name in args.class_name: + if class_name in dataset_classes: + continue + show_data_classes(dataset_classes) + raise RuntimeError( + 'Expected args.class_name to be one of the list, ' + f'but got "{class_name}"') + + # start detector inference + progress_bar = ProgressBar(len(files)) + for file in files: + result = inference_detector(model, file) + + img = mmcv.imread(file) + img = mmcv.imconvert(img, 'bgr', 'rgb') + + if source_type['is_dir']: + filename = os.path.relpath(file, args.img).replace('/', '_') + else: + filename = os.path.basename(file) + out_file = None if args.show else os.path.join(args.out_dir, filename) + + progress_bar.update() + + # Get candidate predict info with score threshold + pred_instances = result.pred_instances[ + result.pred_instances.scores > args.score_thr] + + if args.to_labelme: + # save result to labelme files + out_file = out_file.replace( + os.path.splitext(out_file)[-1], '.json') + to_label_format(pred_instances, result.metainfo, out_file, + args.class_name) + continue + + visualizer.add_datasample( + filename, + img, + data_sample=result, + draw_gt=False, + show=args.show, + wait_time=0, + out_file=out_file, + pred_score_thr=args.score_thr) + + if not args.show and not args.to_labelme: + print_log( + f'\nResults have been saved at {os.path.abspath(args.out_dir)}') + + elif args.to_labelme: + print_log('\nLabelme format label files ' + f'had all been saved in {args.out_dir}') + + +if __name__ == '__main__': + main() diff --git a/third_party/mmyolo/demo/large_image.jpg b/third_party/mmyolo/demo/large_image.jpg new file mode 100644 index 0000000000000000000000000000000000000000..1abbc5d9bb9cf1656ff95fb813fee0db4a40d74e Binary files /dev/null and b/third_party/mmyolo/demo/large_image.jpg differ diff --git a/third_party/mmyolo/demo/large_image_demo.py b/third_party/mmyolo/demo/large_image_demo.py new file mode 100644 index 0000000000000000000000000000000000000000..bdbc3a56d0056c3965fac28c49e18b31355a2029 --- /dev/null +++ b/third_party/mmyolo/demo/large_image_demo.py @@ -0,0 +1,294 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""Perform MMYOLO inference on large images (as satellite imagery) as: + +```shell +wget -P checkpoint https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth # noqa: E501, E261. 
+ +python demo/large_image_demo.py \ + demo/large_image.jpg \ + configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py \ + checkpoint/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth +``` +""" + +import os +import random +from argparse import ArgumentParser +from pathlib import Path + +import mmcv +import numpy as np +from mmdet.apis import inference_detector, init_detector +from mmengine.config import Config, ConfigDict +from mmengine.logging import print_log +from mmengine.utils import ProgressBar + +try: + from sahi.slicing import slice_image +except ImportError: + raise ImportError('Please run "pip install -U sahi" ' + 'to install sahi first for large image inference.') + +from mmyolo.registry import VISUALIZERS +from mmyolo.utils import switch_to_deploy +from mmyolo.utils.large_image import merge_results_by_nms, shift_predictions +from mmyolo.utils.misc import get_file_list + + +def parse_args(): + parser = ArgumentParser( + description='Perform MMYOLO inference on large images.') + parser.add_argument( + 'img', help='Image path, include image file, dir and URL.') + parser.add_argument('config', help='Config file') + parser.add_argument('checkpoint', help='Checkpoint file') + parser.add_argument( + '--out-dir', default='./output', help='Path to output file') + parser.add_argument( + '--device', default='cuda:0', help='Device used for inference') + parser.add_argument( + '--show', action='store_true', help='Show the detection results') + parser.add_argument( + '--deploy', + action='store_true', + help='Switch model to deployment mode') + parser.add_argument( + '--tta', + action='store_true', + help='Whether to use test time augmentation') + parser.add_argument( + '--score-thr', type=float, default=0.3, help='Bbox score threshold') + parser.add_argument( + '--patch-size', type=int, default=640, help='The size of patches') + parser.add_argument( + '--patch-overlap-ratio', + type=float, + default=0.25, + help='Ratio of overlap between two patches') + parser.add_argument( + '--merge-iou-thr', + type=float, + default=0.25, + help='IoU threshould for merging results') + parser.add_argument( + '--merge-nms-type', + type=str, + default='nms', + help='NMS type for merging results') + parser.add_argument( + '--batch-size', + type=int, + default=1, + help='Batch size, must greater than or equal to 1') + parser.add_argument( + '--debug', + action='store_true', + help='Export debug results before merging') + parser.add_argument( + '--save-patch', + action='store_true', + help='Save the results of each patch. ' + 'The `--debug` must be enabled.') + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + + config = args.config + + if isinstance(config, (str, Path)): + config = Config.fromfile(config) + elif not isinstance(config, Config): + raise TypeError('config must be a filename or Config object, ' + f'but got {type(config)}') + if 'init_cfg' in config.model.backbone: + config.model.backbone.init_cfg = None + + if args.tta: + assert 'tta_model' in config, 'Cannot find ``tta_model`` in config.' \ + " Can't use tta !" + assert 'tta_pipeline' in config, 'Cannot find ``tta_pipeline`` ' \ + "in config. Can't use tta !" + config.model = ConfigDict(**config.tta_model, module=config.model) + test_data_cfg = config.test_dataloader.dataset + while 'dataset' in test_data_cfg: + test_data_cfg = test_data_cfg['dataset'] + + # batch_shapes_cfg will force control the size of the output image, + # it is not compatible with tta. 
+ if 'batch_shapes_cfg' in test_data_cfg: + test_data_cfg.batch_shapes_cfg = None + test_data_cfg.pipeline = config.tta_pipeline + + # TODO: TTA mode will error if cfg_options is not set. + # This is an mmdet issue and needs to be fixed later. + # build the model from a config file and a checkpoint file + model = init_detector( + config, args.checkpoint, device=args.device, cfg_options={}) + + if args.deploy: + switch_to_deploy(model) + + if not os.path.exists(args.out_dir) and not args.show: + os.mkdir(args.out_dir) + + # init visualizer + visualizer = VISUALIZERS.build(model.cfg.visualizer) + visualizer.dataset_meta = model.dataset_meta + + # get file list + files, source_type = get_file_list(args.img) + + # start detector inference + print(f'Performing inference on {len(files)} images.... ' + 'This may take a while.') + progress_bar = ProgressBar(len(files)) + for file in files: + # read image + img = mmcv.imread(file) + + # arrange slices + height, width = img.shape[:2] + sliced_image_object = slice_image( + img, + slice_height=args.patch_size, + slice_width=args.patch_size, + auto_slice_resolution=False, + overlap_height_ratio=args.patch_overlap_ratio, + overlap_width_ratio=args.patch_overlap_ratio, + ) + + # perform sliced inference + slice_results = [] + start = 0 + while True: + # prepare batch slices + end = min(start + args.batch_size, len(sliced_image_object)) + images = [] + for sliced_image in sliced_image_object.images[start:end]: + images.append(sliced_image) + + # forward the model + slice_results.extend(inference_detector(model, images)) + + if end >= len(sliced_image_object): + break + start += args.batch_size + + if source_type['is_dir']: + filename = os.path.relpath(file, args.img).replace('/', '_') + else: + filename = os.path.basename(file) + + img = mmcv.imconvert(img, 'bgr', 'rgb') + out_file = None if args.show else os.path.join(args.out_dir, filename) + + # export debug images + if args.debug: + # export sliced image results + name, suffix = os.path.splitext(filename) + + shifted_instances = shift_predictions( + slice_results, + sliced_image_object.starting_pixels, + src_image_shape=(height, width)) + merged_result = slice_results[0].clone() + merged_result.pred_instances = shifted_instances + + debug_file_name = name + '_debug' + suffix + debug_out_file = None if args.show else os.path.join( + args.out_dir, debug_file_name) + visualizer.set_image(img.copy()) + + debug_grids = [] + for starting_point in sliced_image_object.starting_pixels: + start_point_x = starting_point[0] + start_point_y = starting_point[1] + end_point_x = start_point_x + args.patch_size + end_point_y = start_point_y + args.patch_size + debug_grids.append( + [start_point_x, start_point_y, end_point_x, end_point_y]) + debug_grids = np.array(debug_grids) + debug_grids[:, 0::2] = np.clip(debug_grids[:, 0::2], 1, + img.shape[1] - 1) + debug_grids[:, 1::2] = np.clip(debug_grids[:, 1::2], 1, + img.shape[0] - 1) + + palette = np.random.randint(0, 256, size=(len(debug_grids), 3)) + palette = [tuple(c) for c in palette] + line_styles = random.choices(['-', '-.', ':'], k=len(debug_grids)) + visualizer.draw_bboxes( + debug_grids, + edge_colors=palette, + alpha=1, + line_styles=line_styles) + visualizer.draw_bboxes( + debug_grids, face_colors=palette, alpha=0.15) + + visualizer.draw_texts( + list(range(len(debug_grids))), + debug_grids[:, :2] + 5, + colors='w') + + visualizer.add_datasample( + debug_file_name, + visualizer.get_image(), + data_sample=merged_result, + draw_gt=False, + show=args.show, + 
wait_time=0, + out_file=debug_out_file, + pred_score_thr=args.score_thr, + ) + + if args.save_patch: + debug_patch_out_dir = os.path.join(args.out_dir, + f'{name}_patch') + for i, slice_result in enumerate(slice_results): + patch_out_file = os.path.join( + debug_patch_out_dir, + f'{filename}_slice_{i}_result.jpg') + image = mmcv.imconvert(sliced_image_object.images[i], + 'bgr', 'rgb') + + visualizer.add_datasample( + 'patch_result', + image, + data_sample=slice_result, + draw_gt=False, + show=False, + wait_time=0, + out_file=patch_out_file, + pred_score_thr=args.score_thr, + ) + + image_result = merge_results_by_nms( + slice_results, + sliced_image_object.starting_pixels, + src_image_shape=(height, width), + nms_cfg={ + 'type': args.merge_nms_type, + 'iou_threshold': args.merge_iou_thr + }) + + visualizer.add_datasample( + filename, + img, + data_sample=image_result, + draw_gt=False, + show=args.show, + wait_time=0, + out_file=out_file, + pred_score_thr=args.score_thr, + ) + progress_bar.update() + + if not args.show or (args.debug and args.save_patch): + print_log( + f'\nResults have been saved at {os.path.abspath(args.out_dir)}') + + +if __name__ == '__main__': + main() diff --git a/third_party/mmyolo/demo/video_demo.py b/third_party/mmyolo/demo/video_demo.py new file mode 100644 index 0000000000000000000000000000000000000000..d8317a2c6c777eaa9cc6aab27e55bf53efe9e8fd --- /dev/null +++ b/third_party/mmyolo/demo/video_demo.py @@ -0,0 +1,96 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""Perform MMYOLO inference on a video as: + +```shell +wget -P checkpoint https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth # noqa: E501, E261. + +python demo/video_demo.py \ + demo/video_demo.mp4 \ + configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py \ + checkpoint/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth \ + --out demo_result.mp4 +``` +""" +import argparse + +import cv2 +import mmcv +from mmcv.transforms import Compose +from mmdet.apis import inference_detector, init_detector +from mmengine.utils import track_iter_progress + +from mmyolo.registry import VISUALIZERS + + +def parse_args(): + parser = argparse.ArgumentParser(description='MMYOLO video demo') + parser.add_argument('video', help='Video file') + parser.add_argument('config', help='Config file') + parser.add_argument('checkpoint', help='Checkpoint file') + parser.add_argument( + '--device', default='cuda:0', help='Device used for inference') + parser.add_argument( + '--score-thr', type=float, default=0.3, help='Bbox score threshold') + parser.add_argument('--out', type=str, help='Output video file') + parser.add_argument('--show', action='store_true', help='Show video') + parser.add_argument( + '--wait-time', + type=float, + default=1, + help='The interval of show (s), 0 is block') + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + assert args.out or args.show, \ + ('Please specify at least one operation (save/show the ' + 'video) with the argument "--out" or "--show"') + + # build the model from a config file and a checkpoint file + model = init_detector(args.config, args.checkpoint, device=args.device) + + # build test pipeline + model.cfg.test_dataloader.dataset.pipeline[ + 0].type = 'mmdet.LoadImageFromNDArray' + test_pipeline = Compose(model.cfg.test_dataloader.dataset.pipeline) + + # init visualizer + visualizer = VISUALIZERS.build(model.cfg.visualizer) + # 
the dataset_meta is loaded from the checkpoint and + # then pass to the model in init_detector + visualizer.dataset_meta = model.dataset_meta + + video_reader = mmcv.VideoReader(args.video) + video_writer = None + if args.out: + fourcc = cv2.VideoWriter_fourcc(*'mp4v') + video_writer = cv2.VideoWriter( + args.out, fourcc, video_reader.fps, + (video_reader.width, video_reader.height)) + + for frame in track_iter_progress(video_reader): + result = inference_detector(model, frame, test_pipeline=test_pipeline) + visualizer.add_datasample( + name='video', + image=frame, + data_sample=result, + draw_gt=False, + show=False, + pred_score_thr=args.score_thr) + frame = visualizer.get_image() + + if args.show: + cv2.namedWindow('video', 0) + mmcv.imshow(frame, 'video', args.wait_time) + if args.out: + video_writer.write(frame) + + if video_writer: + video_writer.release() + cv2.destroyAllWindows() + + +if __name__ == '__main__': + main() diff --git a/third_party/mmyolo/docker/Dockerfile b/third_party/mmyolo/docker/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..fc65431a2940604118aaf747290442da78741365 --- /dev/null +++ b/third_party/mmyolo/docker/Dockerfile @@ -0,0 +1,36 @@ +ARG PYTORCH="1.9.0" +ARG CUDA="11.1" +ARG CUDNN="8" + +FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel + +ENV TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0 7.5 8.0 8.6+PTX" \ + TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \ + CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" \ + FORCE_CUDA="1" + +RUN rm /etc/apt/sources.list.d/cuda.list \ + && rm /etc/apt/sources.list.d/nvidia-ml.list \ + && apt-key del 7fa2af80 \ + && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub \ + && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub + +# (Optional) +# RUN sed -i 's/http:\/\/archive.ubuntu.com\/ubuntu\//http:\/\/mirrors.aliyun.com\/ubuntu\//g' /etc/apt/sources.list && \ +# pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple + +RUN apt-get update \ + && apt-get install -y ffmpeg libsm6 libxext6 git ninja-build libglib2.0-0 libxrender-dev \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +# Install MMEngine , MMCV and MMDet +RUN pip install --no-cache-dir openmim && \ + mim install --no-cache-dir "mmengine>=0.6.0" "mmcv>=2.0.0rc4,<2.1.0" "mmdet>=3.0.0,<4.0.0" + +# Install MMYOLO +RUN git clone https://github.com/open-mmlab/mmyolo.git /mmyolo && \ + cd /mmyolo && \ + mim install --no-cache-dir -e . 
+ +WORKDIR /mmyolo diff --git a/third_party/mmyolo/docker/Dockerfile_deployment b/third_party/mmyolo/docker/Dockerfile_deployment new file mode 100644 index 0000000000000000000000000000000000000000..8ea1e380b0fab494047f9e2f94545f4e4b0b72e9 --- /dev/null +++ b/third_party/mmyolo/docker/Dockerfile_deployment @@ -0,0 +1,65 @@ +FROM nvcr.io/nvidia/pytorch:22.04-py3 + +WORKDIR /openmmlab +ARG ONNXRUNTIME_VERSION=1.8.1 +ENV DEBIAN_FRONTEND=noninteractive \ + APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=DontWarn \ + FORCE_CUDA="1" + +RUN apt-key del 7fa2af80 \ + && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub \ + && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub + +# (Optional) +# RUN sed -i 's/http:\/\/archive.ubuntu.com\/ubuntu\//http:\/\/mirrors.aliyun.com\/ubuntu\//g' /etc/apt/sources.list \ +# && pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple + +RUN apt-get update \ + && apt-get install -y ffmpeg git libgl1-mesa-glx libopencv-dev \ + libsm6 libspdlog-dev libssl-dev ninja-build libxext6 libxrender-dev \ + libglib2.0-0 vim wget --no-install-recommends \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +# get onnxruntime +RUN wget -q https://github.com/microsoft/onnxruntime/releases/download/v${ONNXRUNTIME_VERSION}/onnxruntime-linux-x64-${ONNXRUNTIME_VERSION}.tgz \ + && tar -zxvf onnxruntime-linux-x64-${ONNXRUNTIME_VERSION}.tgz \ + && pip install --no-cache-dir onnxruntime-gpu==${ONNXRUNTIME_VERSION} \ + && pip install pycuda + + +# Install OPENMIM MMENGINE MMDET +RUN pip install --no-cache-dir openmim \ + && mim install --no-cache-dir "mmengine>=0.6.0" "mmdet>=3.0.0,<4.0.0" \ + && mim install --no-cache-dir opencv-python==4.5.5.64 opencv-python-headless==4.5.5.64 + +RUN git clone https://github.com/open-mmlab/mmcv.git -b 2.x mmcv \ + && cd mmcv \ + && mim install --no-cache-dir -r requirements/optional.txt \ + && MMCV_WITH_OPS=1 mim install --no-cache-dir -e . -v \ + && cd .. + +# Install MMYOLO +RUN git clone https://github.com/open-mmlab/mmyolo.git -b dev mmyolo \ + && cd mmyolo \ + && mim install --no-cache-dir -e . \ + && cd .. + +# Install MMDEPLOY +ENV ONNXRUNTIME_DIR=/openmmlab/onnxruntime-linux-x64-${ONNXRUNTIME_VERSION} \ + TENSORRT_DIR=/usr/lib/x86_64-linux-gnu \ + CUDNN_DIR=/usr/lib/x86_64-linux-gnu + +RUN git clone https://github.com/open-mmlab/mmdeploy -b dev-1.x mmdeploy \ + && cd mmdeploy \ + && git submodule update --init --recursive \ + && mkdir -p build \ + && cd build \ + && cmake -DMMDEPLOY_TARGET_BACKENDS="ort;trt" -DONNXRUNTIME_DIR=${ONNXRUNTIME_DIR} -DTENSORRT_DIR=${TENSORRT_DIR} -DCUDNN_DIR=${CUDNN_DIR} .. \ + && make -j$(nproc) \ + && make install \ + && cd .. \ + && mim install --no-cache-dir -e . + +# Fix undefined symbol bug + RUN echo -e "\nexport LD_LIBRARY_PATH=${ONNXRUNTIME_DIR}/lib:${TENSORRT_DIR}/lib:${CUDNN_DIR}/lib64:${LD_LIBRARY_PATH}\nldconfig" >> /root/.bashrc diff --git a/third_party/mmyolo/docs/README.md b/third_party/mmyolo/docs/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f0b79699be51033fec6b1defb413f4abd48220d1 --- /dev/null +++ b/third_party/mmyolo/docs/README.md @@ -0,0 +1,28 @@ +## Build Documentation + +1. Clone MMYOLO + + ```bash + git clone https://github.com/open-mmlab/mmyolo.git + cd mmyolo + ``` + +2. Install the building dependencies of documentation + + ```bash + pip install -r requirements/docs.txt + ``` + +3. 
Change directory to `docs/en` or `docs/zh_cn` + + ```bash + cd docs/en # or docs/zh_cn + ``` + +4. Build documentation + + ```bash + make html + ``` + +5. Open `_build/html/index.html` with browser diff --git a/third_party/mmyolo/docs/en/Makefile b/third_party/mmyolo/docs/en/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..d4bb2cbb9eddb1bb1b4f366623044af8e4830919 --- /dev/null +++ b/third_party/mmyolo/docs/en/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/third_party/mmyolo/docs/en/_static/css/readthedocs.css b/third_party/mmyolo/docs/en/_static/css/readthedocs.css new file mode 100644 index 0000000000000000000000000000000000000000..353aa9e285a5639b0f34ecb3b16115cff1ad25ed --- /dev/null +++ b/third_party/mmyolo/docs/en/_static/css/readthedocs.css @@ -0,0 +1,6 @@ +.header-logo { + background-image: url("../image/mmyolo-logo.png"); + background-size: 115px 40px; + height: 40px; + width: 115px; +} diff --git a/third_party/mmyolo/docs/en/_static/image/mmyolo-logo.png b/third_party/mmyolo/docs/en/_static/image/mmyolo-logo.png new file mode 100644 index 0000000000000000000000000000000000000000..41318aec92d86749d327bc5f9b9c689632ffc735 Binary files /dev/null and b/third_party/mmyolo/docs/en/_static/image/mmyolo-logo.png differ diff --git a/third_party/mmyolo/docs/en/advanced_guides/cross-library_application.md b/third_party/mmyolo/docs/en/advanced_guides/cross-library_application.md new file mode 100644 index 0000000000000000000000000000000000000000..271d1290a5e772bb20fd26a72035aafc5e7d7e21 --- /dev/null +++ b/third_party/mmyolo/docs/en/advanced_guides/cross-library_application.md @@ -0,0 +1 @@ +# MMYOLO cross-library application diff --git a/third_party/mmyolo/docs/en/api.rst b/third_party/mmyolo/docs/en/api.rst new file mode 100644 index 0000000000000000000000000000000000000000..a45f66ad7ea5e8eb89888ad131468c606393fe41 --- /dev/null +++ b/third_party/mmyolo/docs/en/api.rst @@ -0,0 +1,80 @@ +mmyolo.datasets +------------------ + +datasets +^^^^^^^^^^ +.. automodule:: mmyolo.datasets + :members: + +transforms +^^^^^^^^^^^^ +.. automodule:: mmyolo.datasets.transforms + :members: + +mmyolo.engine +-------------- + +hooks +^^^^^^^^^^ +.. automodule:: mmyolo.engine.hooks + :members: + +optimizers +^^^^^^^^^^ +.. automodule:: mmyolo.engine.optimizers + :members: + +mmyolo.models +-------------- + +backbones +^^^^^^^^^^ +.. automodule:: mmyolo.models.backbones + :members: + +data_preprocessor +^^^^^^^^^^^^^^^^^^^^ +.. automodule:: mmyolo.models.data_preprocessor + :members: + +dense_heads +^^^^^^^^^^^^ +.. automodule:: mmyolo.models.dense_heads + :members: + +detectors +^^^^^^^^^^ +.. automodule:: mmyolo.models.detectors + :members: + +layers +^^^^^^^^^^ +.. automodule:: mmyolo.models.layers + :members: + +losses +^^^^^^^^^^ +.. automodule:: mmyolo.models.losses + :members: + +necks +^^^^^^^^^^^^ +.. 
automodule:: mmyolo.models.necks
+    :members:
+
+
+task_modules
+^^^^^^^^^^^^^^^
+.. automodule:: mmyolo.models.task_modules
+    :members:
+
+utils
+^^^^^^^^^^
+.. automodule:: mmyolo.models.utils
+    :members:
+
+
+mmyolo.utils
+--------------
+.. automodule:: mmyolo.utils
+    :members:
diff --git a/third_party/mmyolo/docs/en/common_usage/amp_training.md b/third_party/mmyolo/docs/en/common_usage/amp_training.md
new file mode 100644
index 0000000000000000000000000000000000000000..ac1fddd817f8f11f44c44918ecea9283c74edb20
--- /dev/null
+++ b/third_party/mmyolo/docs/en/common_usage/amp_training.md
@@ -0,0 +1,13 @@
+# Automatic mixed precision (AMP) training
+
+To enable Automatic Mixed Precision (AMP) training, add `--amp` to the end of the training command, which is as follows:
+
+```shell
+python tools/train.py ${CONFIG} --amp
+```
+
+Specific examples are as follows:
+
+```shell
+python tools/train.py configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py --amp
+```
diff --git a/third_party/mmyolo/docs/en/common_usage/freeze_layers.md b/third_party/mmyolo/docs/en/common_usage/freeze_layers.md
new file mode 100644
index 0000000000000000000000000000000000000000..4614f324572319e360d9ed90f09b31fdd36ab6b0
--- /dev/null
+++ b/third_party/mmyolo/docs/en/common_usage/freeze_layers.md
@@ -0,0 +1,28 @@
+# Freeze layers
+
+## Freeze the weight of backbone
+
+In MMYOLO, we can freeze some `stages` of the backbone network by setting the `frozen_stages` parameter, so that these `stage` parameters do not participate in model updating.
+It should be noted that `frozen_stages = i` means that all parameters from the initial `stage` to the `i`th `stage` will be frozen. The following is an example of `YOLOv5`; other algorithms follow the same logic.
+
+```python
+_base_ = './yolov5_s-v61_syncbn_8xb16-300e_coco.py'
+
+model = dict(
+    backbone=dict(
+        frozen_stages=1 # Indicates that the parameters in the first stage and all stages before it are frozen
+    ))
+```
+
+## Freeze the weight of neck
+
+In addition, the whole `neck` can be frozen with the parameter `freeze_all` in MMYOLO. The following is an example of `YOLOv5`; other algorithms follow the same logic.
+
+```python
+_base_ = './yolov5_s-v61_syncbn_8xb16-300e_coco.py'
+
+model = dict(
+    neck=dict(
+        freeze_all=True # If freeze_all=True, all parameters of the neck will be frozen
+    ))
+```
diff --git a/third_party/mmyolo/docs/en/common_usage/mim_usage.md b/third_party/mmyolo/docs/en/common_usage/mim_usage.md
new file mode 100644
index 0000000000000000000000000000000000000000..2752ea5f9a8a4e28bccd4b3b9617cbeff265b9df
--- /dev/null
+++ b/third_party/mmyolo/docs/en/common_usage/mim_usage.md
@@ -0,0 +1,89 @@
+# Use mim to run scripts from other OpenMMLab repositories
+
+```{note}
+1. All script calls across libraries are currently not supported and are being fixed. More examples will be added to this document when the fix is complete.
+2. mAP plotting and average training speed calculation are fixed in the MMDetection dev-3.x branch, which currently needs to be installed from source to run successfully.
+```
+
+## Log Analysis
+
+### Curve plotting
+
+`tools/analysis_tools/analyze_logs.py` plots loss/mAP curves given a training log file. Run `pip install seaborn` first to install the dependency.
+ +```shell +mim run mmdet analyze_logs plot_curve \ + ${LOG} \ # path of train log in json format + [--keys ${KEYS}] \ # the metric that you want to plot, default to 'bbox_mAP' + [--start-epoch ${START_EPOCH}] # the epoch that you want to start, default to 1 + [--eval-interval ${EVALUATION_INTERVAL}] \ # the evaluation interval when training, default to 1 + [--title ${TITLE}] \ # title of figure + [--legend ${LEGEND}] \ # legend of each plot, default to None + [--backend ${BACKEND}] \ # backend of plt, default to None + [--style ${STYLE}] \ # style of plt, default to 'dark' + [--out ${OUT_FILE}] # the path of output file +# [] stands for optional parameters, when actually entering the command line, you do not need to enter [] +``` + +Examples: + +- Plot the classification loss of some run. + + ```shell + mim run mmdet analyze_logs plot_curve \ + yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700.log.json \ + --keys loss_cls \ + --legend loss_cls + ``` + + + +- Plot the classification and regression loss of some run, and save the figure to a pdf. + + ```shell + mim run mmdet analyze_logs plot_curve \ + yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700.log.json \ + --keys loss_cls loss_bbox \ + --legend loss_cls loss_bbox \ + --out losses_yolov5_s.pdf + ``` + + + +- Compare the bbox mAP of two runs in the same figure. + + ```shell + mim run mmdet analyze_logs plot_curve \ + yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700.log.json \ + yolov5_n-v61_syncbn_fast_8xb16-300e_coco_20220919_090739.log.json \ + --keys bbox_mAP \ + --legend yolov5_s yolov5_n \ + --eval-interval 10 # Note that the evaluation interval must be the same as during training. Otherwise, it will raise an error. + ``` + + + +### Compute the average training speed + +```shell +mim run mmdet analyze_logs cal_train_time \ + ${LOG} \ # path of train log in json format + [--include-outliers] # include the first value of every epoch when computing the average time +``` + +Examples: + +```shell +mim run mmdet analyze_logs cal_train_time \ + yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700.log.json +``` + +The output is expected to be like the following. + +```text +-----Analyze train time of yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700.log.json----- +slowest epoch 278, average time is 0.1705 s/iter +fastest epoch 300, average time is 0.1510 s/iter +time std over epochs is 0.0026 +average iter time: 0.1556 s/iter +``` diff --git a/third_party/mmyolo/docs/en/common_usage/module_combination.md b/third_party/mmyolo/docs/en/common_usage/module_combination.md new file mode 100644 index 0000000000000000000000000000000000000000..3f9ffa4c38559fbcc806f3132dc2a91ae0f0dad7 --- /dev/null +++ b/third_party/mmyolo/docs/en/common_usage/module_combination.md @@ -0,0 +1 @@ +# Module combination diff --git a/third_party/mmyolo/docs/en/common_usage/ms_training_testing.md b/third_party/mmyolo/docs/en/common_usage/ms_training_testing.md new file mode 100644 index 0000000000000000000000000000000000000000..b7d88f63217343b7c9c3c3a512f9e2a9e822fe28 --- /dev/null +++ b/third_party/mmyolo/docs/en/common_usage/ms_training_testing.md @@ -0,0 +1,39 @@ +# Multi-scale training and testing + +## Multi-scale training + +The popular YOLOv5, YOLOv6, YOLOv7, YOLOv8 and RTMDet algorithms are supported in MMYOLO currently, and their default configuration is single-scale 640x640 training. There are two implementations of multi-scale training commonly used in the MM family of open source libraries + +1. 
Each image output in `train_pipeline` is at variable scale, and pad different scales of input images to the same scale by [stack_batch](https://github.com/open-mmlab/mmengine/blob/dbae83c52fa54d6dda08b6692b124217fe3b2135/mmengine/model/base_model/data_preprocessor.py#L260-L261) function in [DataPreprocessor](https://github.com/open-mmlab/mmdetection/blob/3.x/mmdet/models/data_preprocessors/data_preprocessor.py). Most of the algorithms in MMDet are implemented using this approach. +2. Each image output in `train_pipeline` is at a fixed scale, and `DataPreprocessor` performs up- and down-sampling of image batches for multi-scale training directly. + +Both two multi-scale training approaches are supported in MMYOLO. Theoretically, the first implementation can generate richer scales, but its training efficiency is not as good as the second one due to its independent augmentation of a single image. Therefore, we recommend using the second approach. + +Take `configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py` configuration as an example, its default configuration is 640x640 fixed scale training, suppose you want to implement training in multiples of 32 and multi-scale range (480, 800), you can refer to YOLOX practice by [YOLOXBatchSyncRandomResize](https://github.com/open-mmlab/mmyolo/blob/dc85144fab20a970341550794857a2f2f9b11564/mmyolo/models/data_preprocessors/data_preprocessor.py#L20) in the DataPreprocessor. + +Create a new configuration under the `configs/yolov5` path named `configs/yolov5/yolov5_s-v61_fast_1xb12-ms-40e_cat.py` with the following contents. + +```python +_base_ = 'yolov5_s-v61_fast_1xb12-40e_cat.py' + +model = dict( + data_preprocessor=dict( + type='YOLOv5DetDataPreprocessor', + pad_size_divisor=32, + batch_augments=[ + dict( + type='YOLOXBatchSyncRandomResize', + # multi-scale range (480, 800) + random_size_range=(480, 800), + # The output scale needs to be divisible by 32 + size_divisor=32, + interval=1) + ]) +) +``` + +The above configuration will enable multi-scale training. We have already provided this configuration under `configs/yolov5/` for convenience. The rest of the YOLO family of algorithms are similar. + +## Multi-scale testing + +MMYOLO multi-scale testing is equivalent to Test-Time Enhancement TTA and is currently supported, see [Test-Time Augmentation TTA](./tta.md). diff --git a/third_party/mmyolo/docs/en/common_usage/multi_necks.md b/third_party/mmyolo/docs/en/common_usage/multi_necks.md new file mode 100644 index 0000000000000000000000000000000000000000..b6f2bc252b2f151d80e0c500d3513651b09a704f --- /dev/null +++ b/third_party/mmyolo/docs/en/common_usage/multi_necks.md @@ -0,0 +1,37 @@ +# Apply multiple Necks + +If you want to stack multiple Necks, you can directly set the Neck parameters in the config. MMYOLO supports concatenating multiple Necks in the form of `List`. You need to ensure that the output channel of the previous Neck matches the input channel of the next Neck. If you need to adjust the number of channels, you can insert the `mmdet.ChannelMapper` module to align the number of channels between multiple Necks. 
The specific configuration is as follows:
+
+```python
+_base_ = './yolov5_s-v61_syncbn_8xb16-300e_coco.py'
+
+deepen_factor = _base_.deepen_factor
+widen_factor = _base_.widen_factor
+model = dict(
+    type='YOLODetector',
+    neck=[
+        dict(
+            type='YOLOv5PAFPN',
+            deepen_factor=deepen_factor,
+            widen_factor=widen_factor,
+            in_channels=[256, 512, 1024],
+            out_channels=[256, 512, 1024], # The out_channels is controlled by widen_factor, so the YOLOv5PAFPN's actual out_channels equals out_channels * widen_factor
+            num_csp_blocks=3,
+            norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
+            act_cfg=dict(type='SiLU', inplace=True)),
+        dict(
+            type='mmdet.ChannelMapper',
+            in_channels=[128, 256, 512],
+            out_channels=128,
+        ),
+        dict(
+            type='mmdet.DyHead',
+            in_channels=128,
+            out_channels=256,
+            num_blocks=2,
+            # disable zero_init_offset to follow official implementation
+            zero_init_offset=False)
+    ],
+    bbox_head=dict(head_module=dict(in_channels=[512, 512, 512])) # The in_channels is controlled by widen_factor, so the YOLOv5HeadModule's in_channels * widen_factor must equal the last neck's out_channels
+)
+```
diff --git a/third_party/mmyolo/docs/en/common_usage/output_predictions.md b/third_party/mmyolo/docs/en/common_usage/output_predictions.md
new file mode 100644
index 0000000000000000000000000000000000000000..571929900a1d516262cc17e0918c63a61f83c305
--- /dev/null
+++ b/third_party/mmyolo/docs/en/common_usage/output_predictions.md
@@ -0,0 +1,40 @@
+# Output prediction results
+
+If you want to save the prediction results as a specific file for offline evaluation, MMYOLO currently supports both json and pkl formats.
+
+```{note}
+The json file only saves `image_id`, `bbox`, `score` and `category_id`. The json file can be read using the json library.
+The pkl file holds more content than the json file, and also holds information such as the file name and size of the predicted image; the pkl file can be read using the pickle library.
+```
+
+## Output into json file
+
+If you want to output the prediction results as a json file, the command is as follows.
+
+```shell
+python tools/test.py {path_to_config} {path_to_checkpoint} --json-prefix {json_prefix}
+```
+
+The argument after `--json-prefix` should be a filename prefix (no need to enter the `.json` suffix) and can also contain a path. For a concrete example:
+
+```shell
+python tools/test.py configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth --json-prefix work_dirs/demo/json_demo
+```
+
+Running the above command will output the `json_demo.bbox.json` file in the `work_dirs/demo` folder.
+
+## Output into pkl file
+
+If you want to output the prediction results as a pkl file, the command is as follows.
+
+```shell
+python tools/test.py {path_to_config} {path_to_checkpoint} --out {path_to_output_file}
+```
+
+The argument after `--out` should be a full filename (**must be** with a `.pkl` or `.pickle` suffix) and can also contain a path. For a concrete example:
+
+```shell
+python tools/test.py configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth --out work_dirs/demo/pkl_demo.pkl
+```
+
+Running the above command will output the `pkl_demo.pkl` file in the `work_dirs/demo` folder.
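+
+As a quick sanity check, both files can be loaded back with the standard libraries mentioned in the note above. This is only a minimal sketch; the file names assume the example commands above, so replace them with your own `--json-prefix` / `--out` values.
+
+```python
+import json
+import pickle
+
+# json results: a list of dicts with `image_id`, `bbox`, `score` and `category_id`
+with open('work_dirs/demo/json_demo.bbox.json') as f:
+    json_results = json.load(f)
+
+# pkl results: per-image results that additionally keep file name, image size, etc.
+with open('work_dirs/demo/pkl_demo.pkl', 'rb') as f:
+    pkl_results = pickle.load(f)
+
+print(len(json_results), len(pkl_results))
+```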
diff --git a/third_party/mmyolo/docs/en/common_usage/plugins.md b/third_party/mmyolo/docs/en/common_usage/plugins.md new file mode 100644 index 0000000000000000000000000000000000000000..5a0b32364308acf9f08eb369cccae183ad6cc121 --- /dev/null +++ b/third_party/mmyolo/docs/en/common_usage/plugins.md @@ -0,0 +1,34 @@ +# Plugins + +MMYOLO supports adding plugins such as `none_local` and `dropblock` after different stages of Backbone. Users can directly manage plugins by modifying the plugins parameter of the backbone in the config. For example, add `GeneralizedAttention` plugins for `YOLOv5`. The configuration files are as follows: + +```python +_base_ = './yolov5_s-v61_syncbn_8xb16-300e_coco.py' + +model = dict( + backbone=dict( + plugins=[ + dict( + cfg=dict( + type='GeneralizedAttention', + spatial_range=-1, + num_heads=8, + attention_type='0011', + kv_stride=2), + stages=(False, False, True, True)) + ])) +``` + +`cfg` parameter indicates the specific configuration of the plugin. The `stages` parameter indicates whether to add plug-ins after the corresponding stage of the backbone. The length of the list `stages` must be the same as the number of backbone stages. + +MMYOLO currently supports the following plugins: + +
+Supported Plugins + +1. [CBAM](https://github.com/open-mmlab/mmyolo/blob/dev/mmyolo/models/plugins/cbam.py#L86) +2. [GeneralizedAttention](https://github.com/open-mmlab/mmcv/blob/2.x/mmcv/cnn/bricks/generalized_attention.py#L13) +3. [NonLocal2d](https://github.com/open-mmlab/mmcv/blob/2.x/mmcv/cnn/bricks/non_local.py#L250) +4. [ContextBlock](https://github.com/open-mmlab/mmcv/blob/2.x/mmcv/cnn/bricks/context_block.py#L18) + +
diff --git a/third_party/mmyolo/docs/en/common_usage/resume_training.md b/third_party/mmyolo/docs/en/common_usage/resume_training.md new file mode 100644 index 0000000000000000000000000000000000000000..1e1184a728f2d22a71f52a2c2f9a1e3671bc3c41 --- /dev/null +++ b/third_party/mmyolo/docs/en/common_usage/resume_training.md @@ -0,0 +1,9 @@ +# Resume training + +Resume training means to continue training from the state saved from one of the previous trainings, where the state includes the model weights, the state of the optimizer and the optimizer parameter adjustment strategy. + +The user can add `--resume` at the end of the training command to resume training, and the program will automatically load the latest weight file from `work_dirs` to resume training. If there is an updated checkpoint in `work_dir` (e.g. the training was interrupted during the last training), the training will be resumed from that checkpoint, otherwise (e.g. the last training did not have time to save the checkpoint or a new training task was started) the training will be restarted. Here is an example of resuming training: + +```shell +python tools/train.py configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py --resume +``` diff --git a/third_party/mmyolo/docs/en/common_usage/set_random_seed.md b/third_party/mmyolo/docs/en/common_usage/set_random_seed.md new file mode 100644 index 0000000000000000000000000000000000000000..c45c165f4323e5e522daccf0b1fbbb9bbf1f4b2a --- /dev/null +++ b/third_party/mmyolo/docs/en/common_usage/set_random_seed.md @@ -0,0 +1,18 @@ +# Set the random seed + +If you want to set the random seed during training, you can use the following command. + +```shell +python ./tools/train.py \ + ${CONFIG} \ # path of the config file + --cfg-options randomness.seed=2023 \ # set seed to 2023 + [randomness.diff_rank_seed=True] \ # set different seeds according to global rank + [randomness.deterministic=True] # set the deterministic option for CUDNN backend +# [] stands for optional parameters, when actually entering the command line, you do not need to enter [] +``` + +`randomness` has three parameters that can be set, with the following meanings. + +- `randomness.seed=2023`, set the random seed to 2023. +- `randomness.diff_rank_seed=True`, set different seeds according to global rank. Defaults to False. +- `randomness.deterministic=True`, set the deterministic option for cuDNN backend, i.e., set `torch.backends.cudnn.deterministic` to True and `torch.backends.cudnn.benchmark` to False. Defaults to False. See https://pytorch.org/docs/stable/notes/randomness.html for more details. diff --git a/third_party/mmyolo/docs/en/common_usage/set_syncbn.md b/third_party/mmyolo/docs/en/common_usage/set_syncbn.md new file mode 100644 index 0000000000000000000000000000000000000000..dba33be6e39b268c7a286b2c3d54469b5665d42c --- /dev/null +++ b/third_party/mmyolo/docs/en/common_usage/set_syncbn.md @@ -0,0 +1 @@ +# Enabling and disabling SyncBatchNorm diff --git a/third_party/mmyolo/docs/en/common_usage/single_multi_channel_applications.md b/third_party/mmyolo/docs/en/common_usage/single_multi_channel_applications.md new file mode 100644 index 0000000000000000000000000000000000000000..30932708bb59ae226e1282ca70dbdca023f32a0f --- /dev/null +++ b/third_party/mmyolo/docs/en/common_usage/single_multi_channel_applications.md @@ -0,0 +1,188 @@ +# Single and multi-channel application examples + +## Training example on a single-channel image dataset + +The default training images in MMYOLO are all color three-channel data. 
If you want to use a single-channel dataset for training and testing, the following modifications are needed.
+
+1. All image processing pipelines have to support single channel operations
+2. The input channel of the first convolutional layer of the backbone network of the model needs to be changed from 3 to 1
+3. If you wish to load COCO pre-training weights, you need to handle the first convolutional layer weight size mismatch
+
+The following uses the `cat` dataset as an example to describe the entire modification process. If you are using a custom grayscale image dataset, you can skip the dataset preprocessing step.
+
+### 1 Dataset pre-processing
+
+The processing and training of a custom dataset are described in [Annotation-to-deployment workflow for custom dataset](../recommended_topics/labeling_to_deployment_tutorials.md).
+
+`cat` is a three-channel color image dataset. For demonstration purposes, you can run the following code and commands to replace the dataset images with single-channel images for subsequent validation.
+
+**1. Download and unzip the `cat` dataset**
+
+```shell
+python tools/misc/download_dataset.py --dataset-name cat --save-dir ./data/cat --unzip --delete
+```
+
+**2. Convert the dataset to grayscale images**
+
+```python
+import argparse
+import imghdr
+import os
+from typing import List
+import cv2
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='data_path')
+    parser.add_argument('path', type=str, help='Original dataset path')
+    return parser.parse_args()
+
+def main():
+    args = parse_args()
+
+    path = args.path + '/images/'
+    save_path = path
+    file_list: List[str] = os.listdir(path)
+    # Grayscale conversion of each image
+    for file in file_list:
+        if imghdr.what(path + '/' + file) != 'jpeg':
+            continue
+        img = cv2.imread(path + '/' + file)
+        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+        cv2.imwrite(save_path + '/' + file, img)
+
+if __name__ == '__main__':
+    main()
+```
+
+Name the above script as `cvt_single_channel.py`, and run the command as:
+
+```shell
+python cvt_single_channel.py data/cat
+```
+
+### 2 Modify the base configuration file
+
+**At present, some image processing functions of MMYOLO, such as color space transformation, are not compatible with single-channel images, so if we use single-channel data for training directly, we need to modify part of the pipeline, which is a large amount of work**. In order to solve the incompatibility problem, the recommended approach is to load the single-channel image as three-channel data, but convert it to single-channel format before it is input to the network. This approach slightly increases the computational burden, but the user basically does not need to modify any code to use it.
+
+Take `projects/misc/custom_dataset/yolov5_s-v61_syncbn_fast_1xb32-100e_cat.py` as the `base` configuration, copy it to the `configs/yolov5` directory, and add a `yolov5_s-v61_syncbn_fast_1xb32-100e_cat_single_channel.py` file. We can inherit `YOLOv5DetDataPreprocessor` from the `mmyolo/models/data_preprocessors/data_preprocessor.py` file and name the new class `YOLOv5SCDetDataPreprocessor`, convert the image to a single channel in it, add the dependency library, and register the new class in `mmyolo/models/data_preprocessors/__init__.py`. The `YOLOv5SCDetDataPreprocessor` sample code is:
+
+```python
+@MODELS.register_module()
+class YOLOv5SCDetDataPreprocessor(YOLOv5DetDataPreprocessor):
+    """Rewrite collate_fn to get faster training speed.
+ + Note: It must be used together with `mmyolo.datasets.utils.yolov5_collate` + """ + + def forward(self, data: dict, training: bool = False) -> dict: + """Perform normalization, padding, bgr2rgb conversion and convert to single channel image based on ``DetDataPreprocessor``. + + Args: + data (dict): Data sampled from dataloader. + training (bool): Whether to enable training time augmentation. + + Returns: + dict: Data in the same format as the model input. + """ + if not training: + return super().forward(data, training) + + data = self.cast_data(data) + inputs, data_samples = data['inputs'], data['data_samples'] + assert isinstance(data['data_samples'], dict) + + # TODO: Supports multi-scale training + if self._channel_conversion and inputs.shape[1] == 3: + inputs = inputs[:, [2, 1, 0], ...] + + if self._enable_normalize: + inputs = (inputs - self.mean) / self.std + + if self.batch_augments is not None: + for batch_aug in self.batch_augments: + inputs, data_samples = batch_aug(inputs, data_samples) + + img_metas = [{'batch_input_shape': inputs.shape[2:]}] * len(inputs) + data_samples = { + 'bboxes_labels': data_samples['bboxes_labels'], + 'img_metas': img_metas + } + + # Convert to single channel image + inputs = inputs.mean(dim=1, keepdim=True) + + return {'inputs': inputs, 'data_samples': data_samples} +``` + +At this point, the `yolov5_s-v61_syncbn_fast_1xb32-100e_cat_single_channel.py` configuration file reads as follows. + +```python +_base_ = 'yolov5_s-v61_syncbn_fast_1xb32-100e_cat.py' + +_base_.model.data_preprocessor.type = 'YOLOv5SCDetDataPreprocessor' +``` + +### 3 Pre-training model loading problem + +When using a pre-trained 3-channel model directly, it's theoretically possible to experience a decrease in accuracy, though this has not been experimentally verified. To mitigate this potential issue, there are several solutions, including adjusting the weight of each channel in the input layer. One approach is to set the weight of each channel in the input layer to the average of the weights of the original 3 channels. Alternatively, the weight of each channel could be set to one of the weights of the original 3 channels, or the input layer could be trained directly without modifying the weights, depending on the specific circumstances. In this work, we chose to adjust the weights of the 3 channels in the input layer to the average of the weights of the pre-trained 3 channels. 
+ +```python +import torch + +def main(): + # Load weights file + state_dict = torch.load( + 'checkpoints/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth' + ) + + # Modify input layer weights + weights = state_dict['state_dict']['backbone.stem.conv.weight'] + avg_weight = weights.mean(dim=1, keepdim=True) + state_dict['state_dict']['backbone.stem.conv.weight'] = avg_weight + + # Save the modified weights to a new file + torch.save( + state_dict, + 'checkpoints/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187_single_channel.pth' + ) + +if __name__ == '__main__': + main() +``` + +At this point, the `yolov5_s-v61_syncbn_fast_1xb32-100e_cat_single_channel.py` configuration file reads as follows: + +```python +_base_ = 'yolov5_s-v61_syncbn_fast_1xb32-100e_cat.py' + +_base_.model.data_preprocessor.type = 'YOLOv5SCDetDataPreprocessor' + +load_from = './checkpoints/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187_single_channel.pth' +``` + +### 4 Model training effect + + + +The left figure shows the actual label and the right figure shows the target detection result. + +```shell + Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.958 + Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 1.000 + Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 1.000 + Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = -1.000 + Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = -1.000 + Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.958 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.881 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.969 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.969 + Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = -1.000 + Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = -1.000 + Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.969 +bbox_mAP_copypaste: 0.958 1.000 1.000 -1.000 -1.000 0.958 +Epoch(val) [100][116/116] coco/bbox_mAP: 0.9580 coco/bbox_mAP_50: 1.0000 coco/bbox_mAP_75: 1.0000 coco/bbox_mAP_s: -1.0000 coco/bbox_mAP_m: -1.0000 coco/bbox_mAP_l: 0.9580 +``` + +## Training example on a multi-channel image dataset + +TODO diff --git a/third_party/mmyolo/docs/en/common_usage/specify_device.md b/third_party/mmyolo/docs/en/common_usage/specify_device.md new file mode 100644 index 0000000000000000000000000000000000000000..72c8017e552040413e118a85ad7785fb854a8d59 --- /dev/null +++ b/third_party/mmyolo/docs/en/common_usage/specify_device.md @@ -0,0 +1,23 @@ +# Specify specific GPUs during training or inference + +If you have multiple GPUs, such as 8 GPUs, numbered `0, 1, 2, 3, 4, 5, 6, 7`, GPU 0 will be used by default for training or inference. If you want to specify other GPUs for training or inference, you can use the following commands: + +```shell +CUDA_VISIBLE_DEVICES=5 python ./tools/train.py ${CONFIG} #train +CUDA_VISIBLE_DEVICES=5 python ./tools/test.py ${CONFIG} ${CHECKPOINT_FILE} #test +``` + +If you set `CUDA_VISIBLE_DEVICES` to -1 or a number greater than the maximum GPU number, such as 8, the CPU will be used for training or inference. + +If you want to use several of these GPUs to train in parallel, you can use the following command: + +```shell +CUDA_VISIBLE_DEVICES=0,1,2,3 ./tools/dist_train.sh ${CONFIG} ${GPU_NUM} +``` + +Here the `GPU_NUM` is 4. 
In addition, if multiple tasks are trained in parallel on one machine and each task requires multiple GPUs, the PORT of each task need to be set differently to avoid communication conflict, like the following commands: + +```shell +CUDA_VISIBLE_DEVICES=0,1,2,3 PORT=29500 ./tools/dist_train.sh ${CONFIG} 4 +CUDA_VISIBLE_DEVICES=4,5,6,7 PORT=29501 ./tools/dist_train.sh ${CONFIG} 4 +``` diff --git a/third_party/mmyolo/docs/en/common_usage/tta.md b/third_party/mmyolo/docs/en/common_usage/tta.md new file mode 100644 index 0000000000000000000000000000000000000000..517d34b8b67f4336c1e2acd93304c0e47af36571 --- /dev/null +++ b/third_party/mmyolo/docs/en/common_usage/tta.md @@ -0,0 +1,87 @@ +# TTA Related Notes + +## Test Time Augmentation (TTA) + +MMYOLO support for TTA in v0.5.0+, so that users can specify the `-tta` parameter to enable it during evaluation. Take `YOLOv5-s` as an example, its single GPU TTA test command is as follows + +```shell +python tools/test.py configs/yolov5/yolov5_n-v61_syncbn_fast_8xb16-300e_coco.py https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_n-v61_syncbn_fast_8xb16-300e_coco/yolov5_n-v61_syncbn_fast_8xb16-300e_coco_20220919_090739-b804c1ad.pth --tta +``` + +For TTA to work properly, you must ensure that the variables `tta_model` and `tta_pipeline` are present in the configuration, see [det_p5_tta.py](https://github.com/open-mmlab/mmyolo/blob/dev/configs/_base_/det_p5_tta.py) for details. + +The default TTA in MMYOLO performs 3 multi-scale enhancements, followed by 2 horizontal flip enhancements, for a total of 6 parallel pipelines. take `YOLOv5-s` as an example, its TTA configuration is as follows + +```python +img_scales = [(640, 640), (320, 320), (960, 960)] + +_multiscale_resize_transforms = [ + dict( + type='Compose', + transforms=[ + dict(type='YOLOv5KeepRatioResize', scale=s), + dict( + type='LetterResize', + scale=s, + allow_scale_up=False, + pad_val=dict(img=114)) + ]) for s in img_scales +] + +tta_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='TestTimeAug', + transforms=[ + _multiscale_resize_transforms, + [ + dict(type='mmdet.RandomFlip', prob=1.), + dict(type='mmdet.RandomFlip', prob=0.) + ], [dict(type='mmdet.LoadAnnotations', with_bbox=True)], + [ + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'flip', + 'flip_direction')) + ] + ]) +] +``` + +The schematic diagram is shown below. + +```text + LoadImageFromFile + / | \ +(RatioResize,LetterResize) (RatioResize,LetterResize) (RatioResize,LetterResize) + / \ / \ / \ + RandomFlip RandomFlip RandomFlip RandomFlip RandomFlip RandomFlip + | | | | | | + LoadAnn LoadAnn LoadAnn LoadAnn LoadAnn LoadAnn + | | | | | | + PackDetIn PackDetIn PackDetIn PackDetIn PackDetIn PackDetIn +``` + +You can modify `img_scales` to support different multi-scale enhancements, or you can insert a new pipeline to implement custom TTA requirements. Assuming you only want to do horizontal flip enhancements, the configuration should be modified as follows. + +```python +tta_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='TestTimeAug', + transforms=[ + [ + dict(type='mmdet.RandomFlip', prob=1.), + dict(type='mmdet.RandomFlip', prob=0.) 
+ ], [dict(type='mmdet.LoadAnnotations', with_bbox=True)], + [ + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'flip', + 'flip_direction')) + ] + ]) +] +``` diff --git a/third_party/mmyolo/docs/en/conf.py b/third_party/mmyolo/docs/en/conf.py new file mode 100644 index 0000000000000000000000000000000000000000..437a257a34618f2d7022dbbe0b58928c671b800e --- /dev/null +++ b/third_party/mmyolo/docs/en/conf.py @@ -0,0 +1,115 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +import os +import subprocess +import sys + +import pytorch_sphinx_theme + +sys.path.insert(0, os.path.abspath('../../')) + +# -- Project information ----------------------------------------------------- + +project = 'MMYOLO' +copyright = '2022, OpenMMLab' +author = 'MMYOLO Authors' +version_file = '../../mmyolo/version.py' + + +def get_version(): + with open(version_file) as f: + exec(compile(f.read(), version_file, 'exec')) + return locals()['__version__'] + + +# The full version, including alpha/beta/rc tags +release = get_version() + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.napoleon', + 'sphinx.ext.viewcode', + 'myst_parser', + 'sphinx_markdown_tables', + 'sphinx_copybutton', +] + +myst_enable_extensions = ['colon_fence'] +myst_heading_anchors = 3 + +autodoc_mock_imports = [ + 'matplotlib', 'pycocotools', 'terminaltables', 'mmyolo.version', 'mmcv.ops' +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +source_suffix = { + '.rst': 'restructuredtext', + '.md': 'markdown', +} + +# The master toctree document. +master_doc = 'index' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +# html_theme = 'sphinx_rtd_theme' +html_theme = 'pytorch_sphinx_theme' +html_theme_path = [pytorch_sphinx_theme.get_html_theme_path()] + +html_theme_options = { + 'menu': [ + { + 'name': 'GitHub', + 'url': 'https://github.com/open-mmlab/mmyolo' + }, + ], + # Specify the language of shared menu + 'menu_lang': 'en', +} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". 
+html_static_path = ['_static'] +html_css_files = ['css/readthedocs.css'] + +# -- Extension configuration ------------------------------------------------- +# Ignore >>> when copying code +copybutton_prompt_text = r'>>> |\.\.\. ' +copybutton_prompt_is_regexp = True + + +def builder_inited_handler(app): + subprocess.run(['./stat.py']) + + +def setup(app): + app.connect('builder-inited', builder_inited_handler) diff --git a/third_party/mmyolo/docs/en/get_started/15_minutes_instance_segmentation.md b/third_party/mmyolo/docs/en/get_started/15_minutes_instance_segmentation.md new file mode 100644 index 0000000000000000000000000000000000000000..b42e25f646f7adbc49f1b323e0016d62dd14a3ab --- /dev/null +++ b/third_party/mmyolo/docs/en/get_started/15_minutes_instance_segmentation.md @@ -0,0 +1,332 @@ +# 15 minutes to get started with MMYOLO instance segmentation + +Instance segmentation is a task in computer vision that aims to segment each object in an image and assign each object a unique identifier. + +Unlike semantic segmentation, instance segmentation not only segments out different categories in an image, but also separates different instances of the same category. + +
+Instance Segmentation +
+ +Taking the downloadable balloon dataset as an example, I will guide you through a 15-minute easy introduction to MMYOLO instance segmentation. The entire process includes the following steps: + +- [Installation](#installation) +- [Dataset](#dataset) +- [Config](#config) +- [Training](#training) +- [Testing](#testing) +- [EasyDeploy](#easydeploy-deployment) + +In this tutorial, we will use YOLOv5-s as an example. For the demo configuration of the balloon dataset with other YOLO series algorithms, please refer to the corresponding algorithm configuration folder. + +## Installation + +Assuming you've already installed Conda in advance, then install PyTorch using the following commands. + +```{note} +Note: Since this repo uses OpenMMLab 2.0, it is better to create a new conda virtual environment to prevent conflicts with the repo installed in OpenMMLab 1.0. +``` + +```shell +conda create -n mmyolo python=3.8 -y +conda activate mmyolo +# If you have GPU +conda install pytorch torchvision -c pytorch +# If you only have CPU +# conda install pytorch torchvision cpuonly -c pytorch +``` + +Install MMYOLO and dependency libraries using the following commands. + +```shell +git clone https://github.com/open-mmlab/mmyolo.git +cd mmyolo +pip install -U openmim +mim install -r requirements/mminstall.txt +# Install albumentations +mim install -r requirements/albu.txt +# Install MMYOLO +mim install -v -e . +# "-v" means verbose, or more output +# "-e" means installing a project in editable mode, +# thus any local modifications made to the code will take effect without reinstallation. +``` + +For details about how to configure the environment, see [Installation and verification](./installation.md). + +## Dataset + +The Balloon dataset is a single-class dataset that consists of 74 images and includes annotated information required for training. Here is an example image from the dataset: + +
+balloon dataset +
+ +You can download and use it directly by the following command: + +```shell +python tools/misc/download_dataset.py --dataset-name balloon --save-dir ./data/balloon --unzip --delete +python ./tools/dataset_converters/balloon2coco.py +``` + +The data for the MMYOLO project is located in the MMYOLO project directory. The `train.json` and `val.json` files store the annotations in COCO format, while the `data/balloon/train` and `data/balloon/val` directories contain all the images for the dataset. + +## Config + +Taking YOLOv5 algorithm as an example, considering the limited GPU memory of users, we need to modify some default training parameters to make them run smoothly. The key parameters to be modified are as follows: + +- YOLOv5 is an Anchor-Based algorithm, and different datasets need to calculate suitable anchors adaptively. +- The default config uses 8 GPUs with a batch size of 16 per GPU. Now change it to a single GPU with a batch size of 12. +- In principle, the learning rate should be linearly scaled accordingly when the batch size is changed, but actual measurements have found that this is not necessary. + +To perform the specific operation, create a new configuration file named `yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py` in the `configs/yolov5/ins_seg` folder. For convenience, we have already provided this configuration file. Copy the following contents into the configuration file. + +```python +_base_ = './yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance.py' # noqa + +data_root = 'data/balloon/' # dataset root +# Training set annotation file of json path +train_ann_file = 'train.json' +train_data_prefix = 'train/' # Dataset prefix +# Validation set annotation file of json path +val_ann_file = 'val.json' +val_data_prefix = 'val/' +metainfo = { + 'classes': ('balloon', ), # dataset category name + 'palette': [ + (220, 20, 60), + ] +} +num_classes = 1 +# Set batch size to 4 +train_batch_size_per_gpu = 4 +# dataloader num workers +train_num_workers = 2 +log_interval = 1 +##################### +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + data_root=data_root, + metainfo=metainfo, + data_prefix=dict(img=train_data_prefix), + ann_file=train_ann_file)) +val_dataloader = dict( + dataset=dict( + data_root=data_root, + metainfo=metainfo, + data_prefix=dict(img=val_data_prefix), + ann_file=val_ann_file)) +test_dataloader = val_dataloader +val_evaluator = dict(ann_file=data_root + val_ann_file) +test_evaluator = val_evaluator +default_hooks = dict(logger=dict(interval=log_interval)) +##################### + +model = dict(bbox_head=dict(head_module=dict(num_classes=num_classes))) +``` + +The above configuration inherits from `yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance.py` and updates configurations such as `data_root`, `metainfo`, `train_dataloader`, `val_dataloader`, `num_classes`, etc., based on the characteristics of the balloon dataset. + +## Training + +```shell +python tools/train.py configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py +``` + +After running the training command mentioned above, the folder `work_dirs/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance` will be automatically generated. The weight files and the training configuration file for this session will be saved in this folder. On a lower-end GPU like the GTX 1660, the entire training process will take approximately 30 minutes. + +
+image +
+ +The performance on `val.json` is as follows: + +```text + Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.330 + Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.509 + Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.317 + Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.000 + Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.103 + Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.417 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.150 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.396 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.454 + Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.000 + Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.317 + Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.525 +``` + +The above performance is obtained by printing using the COCO API, where -1 indicates the absence of objects of that scale. + +### Some Notes + +The key warnings are printed during training: + +- You are using `YOLOv5Head` with num_classes == 1. The loss_cls will be 0. This is a normal phenomenon. + +The warning is because the `num_classes` currently trained is 1, the loss of the classification branch is always 0 according to the community of the YOLOv5 algorithm, which is a normal phenomenon. + +### Training is resumed after the interruption + +If you stop training, you can add `--resume` to the end of the training command and the program will automatically resume training with the latest weights file from `work_dirs`. + +```shell +python tools/train.py configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py --resume +``` + +### Save GPU memory strategy + +The above config requires about 3G RAM, so if you don't have enough, consider turning on mixed-precision training + +```shell +python tools/train.py configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py --amp +``` + +### Training visualization + +MMYOLO currently supports local, TensorBoard, WandB and other back-end visualization. The default is to use local visualization, and you can switch to WandB and other real-time visualization of various indicators in the training process. + +#### 1 WandB + +WandB visualization need registered in website, and in the https://wandb.ai/settings for wandb API Keys. + +
+image +
+ +```shell +pip install wandb +# After running wandb login, enter the API Keys obtained above, and the login is successful. +wandb login +``` + +Add the wandb config at the end of config file we just created: `configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py`. + +```python +visualizer = dict(vis_backends = [dict(type='LocalVisBackend'), dict(type='WandbVisBackend')]) +``` + +Running the training command and you will see the loss, learning rate, and coco/bbox_mAP visualizations in the link. + +```shell +python tools/train.py configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py +``` + +#### 2 Tensorboard + +Install Tensorboard package using the following command: + +```shell +pip install tensorboard +``` + +Add the `tensorboard` config at the end of config file we just created: `configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py`. + +```python +visualizer = dict(vis_backends=[dict(type='LocalVisBackend'),dict(type='TensorboardVisBackend')]) +``` + +After re-running the training command, Tensorboard file will be generated in the visualization folder `work_dirs/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance/{timestamp}/vis_data`. +We can use Tensorboard to view the loss, learning rate, and coco/bbox_mAP visualizations from a web link by running the following command: + +```shell +tensorboard --logdir=work_dirs/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance +``` + +## Testing + +```shell +python tools/test.py configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py \ + work_dirs/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance/best_coco_bbox_mAP_epoch_300.pth \ + --show-dir show_results +``` + +Run the above test command, you can not only get the AP performance printed in the **Training** section, You can also automatically save the result images to the `work_dirs/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance/{timestamp}/show_results` folder. Below is one of the result images, the left image is the actual annotation, and the right image is the inference result of the model. + +
+result_img +
+ +You can also visualize model inference results in a browser window if you use `WandbVisBackend` or `TensorboardVisBackend`. + +## Feature map visualization + +MMYOLO provides visualization scripts for feature map to analyze the current model training. Please refer to [Feature Map Visualization](../recommended_topics/visualization.md) + +Due to the bias of direct visualization of `test_pipeline`, we need to modify the `test_pipeline` of `configs/yolov5/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py` + +```python +test_pipeline = [ + dict( + type='LoadImageFromFile', + backend_args=_base_.backend_args), + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=False, + pad_val=dict(img=114)), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] +``` + +to the following config: + +```python +test_pipeline = [ + dict( + type='LoadImageFromFile', + backend_args=_base_.backend_args), + dict(type='mmdet.Resize', scale=img_scale, keep_ratio=False), # modify the LetterResize to mmdet.Resize + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +``` + +Let's choose the `data/balloon/train/3927754171_9011487133_b.jpg` image as an example to visualize the output feature maps of YOLOv5 backbone and neck layers. + +**1. Visualize the three channels of YOLOv5s backbone** + +```shell +python demo/featmap_vis_demo.py data/balloon/train/3927754171_9011487133_b.jpg \ + configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py \ + work_dirs/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance/best_coco_bbox_mAP_epoch_300.pth \ --target-layers backbone \ + --channel-reduction squeeze_mean +``` + +
+image +
+ +The result will be saved to the output folder in current path. Three output feature maps plotted in the above figure correspond to small, medium and large output feature maps. + +**2. Visualize the three channels of YOLOv5 neck** + +```shell +python demo/featmap_vis_demo.py data/balloon/train/3927754171_9011487133_b.jpg \ + configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py \ + work_dirs/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance/best_coco_bbox_mAP_epoch_300.pth \ --target-layers neck \ + --channel-reduction squeeze_mean +``` + +
+image +
+**3. Grad-Based CAM visualization** + +TODO + +## EasyDeploy deployment + +TODO + +The full content above can be viewed in [15_minutes_object_detection.ipynb](../../../demo/15_minutes_object_detection.ipynb). This is the end of the tutorial. If you encounter problems during training or testing, please check the [common troubleshooting steps](../recommended_topics/troubleshooting_steps.md) first and feel free to open an [issue](https://github.com/open-mmlab/mmyolo/issues/new/choose) if you still can't solve it. diff --git a/third_party/mmyolo/docs/en/get_started/15_minutes_object_detection.md b/third_party/mmyolo/docs/en/get_started/15_minutes_object_detection.md new file mode 100644 index 0000000000000000000000000000000000000000..354b2e7080d727d9ccd91b48b904b4fb59772888 --- /dev/null +++ b/third_party/mmyolo/docs/en/get_started/15_minutes_object_detection.md @@ -0,0 +1,535 @@ +# 15 minutes to get started with MMYOLO object detection + +Object detection task refers to that given a picture, the network predicts all the categories of objects included in the picture and the corresponding boundary boxes + +
+object detection +
+ +Take the small dataset of cat as an example, you can easily learn MMYOLO object detection in 15 minutes. The whole process consists of the following steps: + +- [Installation](#installation) +- [Dataset](#dataset) +- [Config](#config) +- [Training](#training) +- [Testing](#testing) +- [EasyDeploy](#easydeploy-deployment) + +In this tutorial, we take YOLOv5-s as an example. For the rest of the YOLO series algorithms, please see the corresponding algorithm configuration folder. + +## Installation + +Assuming you've already installed Conda in advance, then install PyTorch using the following commands. + +```{note} +Note: Since this repo uses OpenMMLab 2.0, it is better to create a new conda virtual environment to prevent conflicts with the repo installed in OpenMMLab 1.0. +``` + +```shell +conda create -n mmyolo python=3.8 -y +conda activate mmyolo +# If you have GPU +conda install pytorch torchvision -c pytorch +# If you only have CPU +# conda install pytorch torchvision cpuonly -c pytorch +``` + +Install MMYOLO and dependency libraries using the following commands. + +```shell +git clone https://github.com/open-mmlab/mmyolo.git +cd mmyolo +pip install -U openmim +mim install -r requirements/mminstall.txt +# Install albumentations +mim install -r requirements/albu.txt +# Install MMYOLO +mim install -v -e . +# "-v" means verbose, or more output +# "-e" means installing a project in editable mode, +# thus any local modifications made to the code will take effect without reinstallation. +``` + +For details about how to configure the environment, see [Installation and verification](./installation.md). + +## Dataset + +The Cat dataset is a single-category dataset consisting of 144 pictures (the original pictures are provided by @RangeKing, and cleaned by @PeterH0323), which contains the annotation information required for training. The sample image is shown below: + +
+cat dataset +
+ +You can download and use it directly by the following command: + +```shell +python tools/misc/download_dataset.py --dataset-name cat --save-dir ./data/cat --unzip --delete +``` + +This dataset is automatically downloaded to the `./data/cat` dir with the following directory structure: + +
+image +
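+
+If the screenshot above does not load, the layout is roughly as follows (a sketch based on the annotation and image paths used by the config below; the downloaded archive may contain a few extra files and the image names will differ):
+
+```text
+data/cat
+├── annotations
+│   ├── trainval.json
+│   └── test.json
+└── images
+    ├── IMG_20221020_112705.jpg
+    └── ...
+```
+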
+ +The cat dataset is located in the mmyolo project dir, and `data/cat/annotations` stores annotations in COCO format, and `data/cat/images` stores all images + +## Config + +Taking YOLOv5 algorithm as an example, considering the limited GPU memory of users, we need to modify some default training parameters to make them run smoothly. The key parameters to be modified are as follows: + +- YOLOv5 is an Anchor-Based algorithm, and different datasets need to calculate suitable anchors adaptively +- The default config uses 8 GPUs with a batch size of 16 per GPU. Now change it to a single GPU with a batch size of 12. +- The default training epoch is 300. Change it to 40 epoch +- Given the small size of the dataset, we opted to use fixed backbone weights +- In principle, the learning rate should be linearly scaled accordingly when the batch size is changed, but actual measurements have found that this is not necessary + +Create a `yolov5_s-v61_fast_1xb12-40e_cat.py` config file in the `configs/yolov5` folder (we have provided this config for you to use directly) and copy the following into the config file. + +```python +# Inherit and overwrite part of the config based on this config +_base_ = 'yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py' + +data_root = './data/cat/' # dataset root +class_name = ('cat', ) # dataset category name +num_classes = len(class_name) # dataset category number +# metainfo is a configuration that must be passed to the dataloader, otherwise it is invalid +# palette is a display color for category at visualization +# The palette length must be greater than or equal to the length of the classes +metainfo = dict(classes=class_name, palette=[(20, 220, 60)]) + +# Adaptive anchor based on tools/analysis_tools/optimize_anchors.py +anchors = [ + [(68, 69), (154, 91), (143, 162)], # P3/8 + [(242, 160), (189, 287), (391, 207)], # P4/16 + [(353, 337), (539, 341), (443, 432)] # P5/32 +] +# Max training 40 epoch +max_epochs = 40 +# Set batch size to 12 +train_batch_size_per_gpu = 12 +# dataloader num workers +train_num_workers = 4 + +# load COCO pre-trained weight +load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth' # noqa + +model = dict( + # Fixed the weight of the entire backbone without training + backbone=dict(frozen_stages=4), + bbox_head=dict( + head_module=dict(num_classes=num_classes), + prior_generator=dict(base_sizes=anchors) + )) + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + data_root=data_root, + metainfo=metainfo, + # Dataset annotation file of json path + ann_file='annotations/trainval.json', + # Dataset prefix + data_prefix=dict(img='images/'))) + +val_dataloader = dict( + dataset=dict( + metainfo=metainfo, + data_root=data_root, + ann_file='annotations/test.json', + data_prefix=dict(img='images/'))) + +test_dataloader = val_dataloader + +_base_.optim_wrapper.optimizer.batch_size_per_gpu = train_batch_size_per_gpu + +val_evaluator = dict(ann_file=data_root + 'annotations/test.json') +test_evaluator = val_evaluator + +default_hooks = dict( + # Save weights every 10 epochs and a maximum of two weights can be saved. + # The best model is saved automatically during model evaluation + checkpoint=dict(interval=10, max_keep_ckpts=2, save_best='auto'), + # The warmup_mim_iter parameter is critical. + # The default value is 1000 which is not suitable for cat datasets. 
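+    # warmup_mim_iter is the minimum number of warmup iterations; 10 is enough here,
+    # while the default 1000 would cover most of this short 40-epoch schedule.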
+ param_scheduler=dict(max_epochs=max_epochs, warmup_mim_iter=10), + # The log printing interval is 5 + logger=dict(type='LoggerHook', interval=5)) +# The evaluation interval is 10 +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +``` + +The above config is inherited from `yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py`. According to the characteristics of cat dataset updated `data_root`, `metainfo`, `train_dataloader`, `val_dataloader`, `num_classes` and other config. + +## Training + +```shell +python tools/train.py configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py +``` + +Run the above training command, `work_dirs/yolov5_s-v61_fast_1xb12-40e_cat` folder will be automatically generated, the checkpoint file and the training config file will be saved in this folder. On a low-end 1660 GPU, the entire training process takes about eight minutes. + +
+image +
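+
+Besides the console output shown above, each run also writes its logged metrics to `vis_data/scalars.json` inside the timestamped folder under `work_dirs`, one JSON record per logging step. A minimal sketch for extracting the training loss from it (the timestamped folder name, and which keys are present, depend on your run):
+
+```python
+import json
+from pathlib import Path
+
+# Replace the timestamp with the folder created by your own run.
+log_file = Path('work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/20230202_000000/vis_data/scalars.json')
+
+for line in log_file.read_text().splitlines():
+    record = json.loads(line)
+    # Training records carry the loss; validation records carry coco/bbox_mAP instead.
+    if 'loss' in record:
+        print(record.get('step'), record['loss'])
+```
+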
+ +The performance on `test.json` is as follows: + +```text + Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.631 + Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.909 + Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.747 + Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = -1.000 + Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = -1.000 + Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.631 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.627 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.703 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.703 + Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = -1.000 + Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = -1.000 + Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.703 +``` + +The above properties are printed via the COCO API, where -1 indicates that no object exists for the scale. According to the rules defined by COCO, the Cat dataset contains all large sized objects, and there are no small or medium-sized objects. + +### Some Notes + +Two key warnings are printed during training: + +- You are using `YOLOv5Head` with num_classes == 1. The loss_cls will be 0. This is a normal phenomenon. +- The model and loaded state dict do not match exactly + +Neither of these warnings will have any impact on performance. The first warning is because the `num_classes` currently trained is 1, the loss of the classification branch is always 0 according to the community of the YOLOv5 algorithm, which is a normal phenomenon. The second warning is because we are currently training in fine-tuning mode, we load the COCO pre-trained weights for 80 classes, +This will lead to the final Head module convolution channel number does not correspond, resulting in this part of the weight can not be loaded, which is also a normal phenomenon. + +### Training is resumed after the interruption + +If you stop training, you can add `--resume` to the end of the training command and the program will automatically resume training with the latest weights file from `work_dirs`. + +```shell +python tools/train.py configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py --resume +``` + +### Save GPU memory strategy + +The above config requires about 3G RAM, so if you don't have enough, consider turning on mixed-precision training + +```shell +python tools/train.py configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py --amp +``` + +### Training visualization + +MMYOLO currently supports local, TensorBoard, WandB and other back-end visualization. The default is to use local visualization, and you can switch to WandB and other real-time visualization of various indicators in the training process. + +#### 1 WandB + +WandB visualization need registered in website, and in the https://wandb.ai/settings for wandb API Keys. + +
+image +
+ +```shell +pip install wandb +# After running wandb login, enter the API Keys obtained above, and the login is successful. +wandb login +``` + +Add the wandb config at the end of config file we just created: `configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py`. + +```python +visualizer = dict(vis_backends = [dict(type='LocalVisBackend'), dict(type='WandbVisBackend')]) +``` + +Running the training command and you will see the loss, learning rate, and coco/bbox_mAP visualizations in the link. + +```shell +python tools/train.py configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py +``` + +
+image +
+
+image +
+ +#### 2 Tensorboard + +Install Tensorboard package: + +```shell +pip install tensorboard +``` + +Add the `tensorboard` config at the end of config file we just created: `configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py`. + +```python +visualizer = dict(vis_backends=[dict(type='LocalVisBackend'),dict(type='TensorboardVisBackend')]) +``` + +After re-running the training command, Tensorboard file will be generated in the visualization folder `work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/{timestamp}/vis_data`. +We can use Tensorboard to view the loss, learning rate, and coco/bbox_mAP visualizations from a web link by running the following command: + +```shell +tensorboard --logdir=work_dirs/yolov5_s-v61_fast_1xb12-40e_cat +``` + +## Testing + +```shell +python tools/test.py configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \ + work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/epoch_40.pth \ + --show-dir show_results +``` + +Run the above test command, you can not only get the AP performance printed in the **Training** section, You can also automatically save the result images to the `work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/{timestamp}/show_results` folder. Below is one of the result images, the left image is the actual annotation, and the right image is the inference result of the model. + +
+result_img +
+ +You can also visualize model inference results in a browser window if you use 'WandbVisBackend' or 'TensorboardVisBackend'. + +## Feature map visualization + +MMYOLO provides visualization scripts for feature map to analyze the current model training. Please refer to [Feature Map Visualization](../recommended_topics/visualization.md) + +Due to the bias of direct visualization of `test_pipeline`, we need to modify the `test_pipeline` of `configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py` + +```python +test_pipeline = [ + dict( + type='LoadImageFromFile', + backend_args=_base_.backend_args), + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=False, + pad_val=dict(img=114)), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] +``` + +to the following config: + +```python +test_pipeline = [ + dict( + type='LoadImageFromFile', + backend_args=_base_.backend_args), + dict(type='mmdet.Resize', scale=img_scale, keep_ratio=False), # modify the LetterResize to mmdet.Resize + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +``` + +Let's choose the `data/cat/images/IMG_20221020_112705.jpg` image as an example to visualize the output feature maps of YOLOv5 backbone and neck layers. + +**1. Visualize the three channels of YOLOv5 backbone** + +```shell +python demo/featmap_vis_demo.py data/cat/images/IMG_20221020_112705.jpg \ + configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \ + work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/epoch_40.pth \ + --target-layers backbone \ + --channel-reduction squeeze_mean +``` + +
+image +
+ +The result will be saved to the output folder in current path. Three output feature maps plotted in the above figure correspond to small, medium and large output feature maps. As the backbone of this training is not actually involved in training, it can be seen from the above figure that the big object cat is predicted on the small feature map, which is in line with the idea of hierarchical detection of object detection. + +**2. Visualize the three channels of YOLOv5 neck** + +```shell +python demo/featmap_vis_demo.py data/cat/images/IMG_20221020_112705.jpg \ + configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \ + work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/epoch_40.pth \ + --target-layers neck \ + --channel-reduction squeeze_mean +``` + +
+image +
+ +As can be seen from the above figure, because neck is involved in training, and we also reset anchor, the three output feature maps are forced to simulate the same scale object, resulting in the three output maps of neck are similar, which destroys the original pre-training distribution of backbone. At the same time, it can also be seen that 40 epochs are not enough to train the above dataset, and the feature maps do not perform well. + +**3. Grad-Based CAM visualization** + +Based on the above feature map visualization, we can analyze Grad CAM at the feature layer of bbox level. + +Install `grad-cam` package: + +```shell +pip install "grad-cam" +``` + +(a) View Grad CAM of the minimum output feature map of the neck + +```shell +python demo/boxam_vis_demo.py data/cat/images/IMG_20221020_112705.jpg \ + configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \ + work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/epoch_40.pth \ + --target-layer neck.out_layers[2] +``` + +
+image +
+ +(b) View Grad CAM of the medium output feature map of the neck + +```shell +python demo/boxam_vis_demo.py data/cat/images/IMG_20221020_112705.jpg \ + configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \ + work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/epoch_40.pth \ + --target-layer neck.out_layers[1] +``` + +
+image +
+ +(c) View Grad CAM of the maximum output feature map of the neck + +```shell +python demo/boxam_vis_demo.py data/cat/images/IMG_20221020_112705.jpg \ + configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \ + work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/epoch_40.pth \ + --target-layer neck.out_layers[0] +``` + +
+image +
+ +## EasyDeploy deployment + +Here we'll use MMYOLO's [EasyDeploy](../../../projects/easydeploy/) to demonstrate the transformation deployment and basic inference of model. + +First you need to follow EasyDeploy's [basic documentation](../../../projects/easydeploy/docs/model_convert.md) controls own equipment installed for each library. + +```shell +pip install onnx +pip install onnx-simplifier # Install if you want to use simplify +pip install tensorrt # If you have GPU environment and need to output TensorRT model you need to continue execution +``` + +Once installed, you can use the following command to transform and deploy the trained model on the cat dataset with one click. The current ONNX version is 1.13.0 and TensorRT version is 8.5.3.1, so keep the `--opset` value of 11. The remaining parameters need to be adjusted according to the config used. Here we export the CPU version of ONNX with the `--backend` set to 1. + +```shell +python projects/easydeploy/tools/export.py \ + configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \ + work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/epoch_40.pth \ + --work-dir work_dirs/yolov5_s-v61_fast_1xb12-40e_cat \ + --img-size 640 640 \ + --batch 1 \ + --device cpu \ + --simplify \ + --opset 11 \ + --backend 1 \ + --pre-topk 1000 \ + --keep-topk 100 \ + --iou-threshold 0.65 \ + --score-threshold 0.25 +``` + +On success, you will get the converted ONNX model under `work-dir`, which is named `end2end.onnx` by default. + +Let's use `end2end.onnx` model to perform a basic image inference: + +```shell +python projects/easydeploy/tools/image-demo.py \ + data/cat/images/IMG_20210728_205312.jpg \ + configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \ + work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/end2end.onnx \ + --device cpu +``` + +After successful inference, the result image will be generated in the `output` folder of the default MMYOLO root directory. If you want to see the result without saving it, you can add `--show` to the end of the above command. For convenience, the following is the generated result. + +
+image +
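+
+Before moving on to TensorRT, you can optionally sanity-check the exported ONNX file with the `onnx` package installed above. This only verifies that the graph is well-formed, it does not test accuracy (a minimal sketch):
+
+```python
+import onnx
+
+onnx_path = 'work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/end2end.onnx'
+model = onnx.load(onnx_path)
+# Raises an exception if the graph is malformed.
+onnx.checker.check_model(model)
+print('inputs:', [i.name for i in model.graph.input])
+print('outputs:', [o.name for o in model.graph.output])
+```
+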
+ +Let's go on to convert the engine file for TensorRT, because TensorRT needs to be specific to the current environment and deployment version, so make sure to export the parameters, here we export the TensorRT8 file, the `--backend` is 2. + +```shell +python projects/easydeploy/tools/export.py \ + configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \ + work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/epoch_40.pth \ + --work-dir work_dirs/yolov5_s-v61_fast_1xb12-40e_cat \ + --img-size 640 640 \ + --batch 1 \ + --device cuda:0 \ + --simplify \ + --opset 11 \ + --backend 2 \ + --pre-topk 1000 \ + --keep-topk 100 \ + --iou-threshold 0.65 \ + --score-threshold 0.25 +``` + +The resulting `end2end.onnx` is the ONNX file for the TensorRT8 deployment, which we will use to complete the TensorRT engine transformation. + +```shell +python projects/easydeploy/tools/build_engine.py \ + work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/end2end.onnx \ + --img-size 640 640 \ + --device cuda:0 +``` + +Successful execution will generate the `end2end.engine` file under `work-dir`: + +```shell +work_dirs/yolov5_s-v61_fast_1xb12-40e_cat +├── 202302XX_XXXXXX +│ ├── 202302XX_XXXXXX.log +│ └── vis_data +│ ├── 202302XX_XXXXXX.json +│ ├── config.py +│ └── scalars.json +├── best_coco +│ └── bbox_mAP_epoch_40.pth +├── end2end.engine +├── end2end.onnx +├── epoch_30.pth +├── epoch_40.pth +├── last_checkpoint +└── yolov5_s-v61_fast_1xb12-40e_cat.py +``` + +Let's continue use `image-demo.py` for image inference: + +```shell +python projects/easydeploy/tools/image-demo.py \ + data/cat/images/IMG_20210728_205312.jpg \ + configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \ + work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/end2end.engine \ + --device cuda:0 +``` + +Here we choose to save the inference results under `output` instead of displaying them directly. The following shows the inference results. + +
+image +
+ +This completes the transformation deployment of the trained model and checks the inference results. This is the end of the tutorial. + +The full content above can be viewed in [15_minutes_object_detection.ipynb](https://github.com/open-mmlab/mmyolo/blob/dev/demo/15_minutes_object_detection.ipynb). If you encounter problems during training or testing, please check the [common troubleshooting steps](../recommended_topics/troubleshooting_steps.md) first and feel free to open an [issue](https://github.com/open-mmlab/mmyolo/issues/new/choose) if you still can't solve it. diff --git a/third_party/mmyolo/docs/en/get_started/15_minutes_rotated_object_detection.md b/third_party/mmyolo/docs/en/get_started/15_minutes_rotated_object_detection.md new file mode 100644 index 0000000000000000000000000000000000000000..6e04c8c0a8fbda5266e2cd488fc4ca584fc8cfb2 --- /dev/null +++ b/third_party/mmyolo/docs/en/get_started/15_minutes_rotated_object_detection.md @@ -0,0 +1,3 @@ +# 15 minutes to get started with MMYOLO rotated object detection + +TODO diff --git a/third_party/mmyolo/docs/en/get_started/dependencies.md b/third_party/mmyolo/docs/en/get_started/dependencies.md new file mode 100644 index 0000000000000000000000000000000000000000..0d7fc6ad0c3c9d1295201f9cefe423928e44caec --- /dev/null +++ b/third_party/mmyolo/docs/en/get_started/dependencies.md @@ -0,0 +1,60 @@ +# Prerequisites + +Compatible MMEngine, MMCV and MMDetection versions are shown as below. Please install the correct version to avoid installation issues. + +| MMYOLO version | MMDetection version | MMEngine version | MMCV version | +| :------------: | :----------------------: | :----------------------: | :---------------------: | +| main | mmdet>=3.0.0, \<3.1.0 | mmengine>=0.7.1, \<1.0.0 | mmcv>=2.0.0rc4, \<2.1.0 | +| 0.6.0 | mmdet>=3.0.0, \<3.1.0 | mmengine>=0.7.1, \<1.0.0 | mmcv>=2.0.0rc4, \<2.1.0 | +| 0.5.0 | mmdet>=3.0.0rc6, \<3.1.0 | mmengine>=0.6.0, \<1.0.0 | mmcv>=2.0.0rc4, \<2.1.0 | +| 0.4.0 | mmdet>=3.0.0rc5, \<3.1.0 | mmengine>=0.3.1, \<1.0.0 | mmcv>=2.0.0rc0, \<2.1.0 | +| 0.3.0 | mmdet>=3.0.0rc5, \<3.1.0 | mmengine>=0.3.1, \<1.0.0 | mmcv>=2.0.0rc0, \<2.1.0 | +| 0.2.0 | mmdet>=3.0.0rc3, \<3.1.0 | mmengine>=0.3.1, \<1.0.0 | mmcv>=2.0.0rc0, \<2.1.0 | +| 0.1.3 | mmdet>=3.0.0rc3, \<3.1.0 | mmengine>=0.3.1, \<1.0.0 | mmcv>=2.0.0rc0, \<2.1.0 | +| 0.1.2 | mmdet>=3.0.0rc2, \<3.1.0 | mmengine>=0.3.0, \<1.0.0 | mmcv>=2.0.0rc0, \<2.1.0 | +| 0.1.1 | mmdet==3.0.0rc1 | mmengine>=0.1.0, \<0.2.0 | mmcv>=2.0.0rc0, \<2.1.0 | +| 0.1.0 | mmdet==3.0.0rc0 | mmengine>=0.1.0, \<0.2.0 | mmcv>=2.0.0rc0, \<2.1.0 | + +In this section, we demonstrate how to prepare an environment with PyTorch. + +MMDetection works on Linux, Windows, and macOS. It requires: + +- Python 3.7+ +- PyTorch 1.7+ +- CUDA 9.2+ +- GCC 5.4+ + +```{note} +If you are experienced with PyTorch and have already installed it, just skip this part and jump to the [next section](#installation). Otherwise, you can follow these steps for the preparation. +``` + +**Step 0.** Download and install Miniconda from the [official website](https://docs.conda.io/en/latest/miniconda.html). + +**Step 1.** Create a conda environment and activate it. + +```shell +conda create --name openmmlab python=3.8 -y +conda activate openmmlab +``` + +**Step 2.** Install PyTorch following [official commands](https://pytorch.org/get-started/locally/), e.g. 
+ +On GPU platforms: + +```shell +conda install pytorch torchvision -c pytorch +``` + +On CPU platforms: + +```shell +conda install pytorch torchvision cpuonly -c pytorch +``` + +**Step 3.** Verify PyTorch installation + +```shell +python -c "import torch; print(torch.__version__); print(torch.cuda.is_available())" +``` + +If the GPU is used, the version information and `True` are printed; otherwise, the version information and `False` are printed. diff --git a/third_party/mmyolo/docs/en/get_started/installation.md b/third_party/mmyolo/docs/en/get_started/installation.md new file mode 100644 index 0000000000000000000000000000000000000000..3259acfbb6f0326844a27d72275cec53e4cf6395 --- /dev/null +++ b/third_party/mmyolo/docs/en/get_started/installation.md @@ -0,0 +1,131 @@ +# Installation + +## Best Practices + +**Step 0.** Install [MMEngine](https://github.com/open-mmlab/mmengine) and [MMCV](https://github.com/open-mmlab/mmcv) using [MIM](https://github.com/open-mmlab/mim). + +```shell +pip install -U openmim +mim install "mmengine>=0.6.0" +mim install "mmcv>=2.0.0rc4,<2.1.0" +mim install "mmdet>=3.0.0,<4.0.0" +``` + +If you are currently in the mmyolo project directory, you can use the following simplified commands + +```shell +cd mmyolo +pip install -U openmim +mim install -r requirements/mminstall.txt +``` + +**Note:** + +a. In MMCV-v2.x, `mmcv-full` is rename to `mmcv`, if you want to install `mmcv` without CUDA ops, you can use `mim install "mmcv-lite>=2.0.0rc1"` to install the lite version. + +b. If you would like to use `albumentations`, we suggest using `pip install -r requirements/albu.txt` or `pip install -U albumentations --no-binary qudida,albumentations`. If you simply use `pip install albumentations==1.0.1`, it will install `opencv-python-headless` simultaneously (even though you have already installed `opencv-python`). We recommended checking the environment after installing albumentation to ensure that `opencv-python` and `opencv-python-headless` are not installed at the same time, because it might cause unexpected issues if they both installed. Please refer to [official documentation](https://albumentations.ai/docs/getting_started/installation/#note-on-opencv-dependencies) for more details. + +**Step 1.** Install MMYOLO. + +Case a: If you develop and run mmdet directly, install it from source: + +```shell +git clone https://github.com/open-mmlab/mmyolo.git +cd mmyolo +# Install albumentations +pip install -r requirements/albu.txt +# Install MMYOLO +mim install -v -e . +# "-v" means verbose, or more output +# "-e" means installing a project in editable mode, +# thus any local modifications made to the code will take effect without reinstallation. +``` + +Case b: If you use MMYOLO as a dependency or third-party package, install it with MIM: + +```shell +mim install "mmyolo" +``` + +## Verify the installation + +To verify whether MMYOLO is installed correctly, we provide an inference demo. + +**Step 1.** We need to download config and checkpoint files. + +```shell +mim download mmyolo --config yolov5_s-v61_syncbn_fast_8xb16-300e_coco --dest . +``` + +The downloading will take several seconds or more, depending on your network environment. When it is done, you will find two files `yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py` and `yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth` in your current folder. + +**Step 2.** Verify the inference demo. + +Option (a). If you install MMYOLO from source, just run the following command. 
+ +```shell +python demo/image_demo.py demo/demo.jpg \ + yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py \ + yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth + +# Optional parameters +# --out-dir ./output *The detection results are output to the specified directory. When args have action --show, the script do not save results. Default: ./output +# --device cuda:0 *The computing resources used, including cuda and cpu. Default: cuda:0 +# --show *Display the results on the screen. Default: False +# --score-thr 0.3 *Confidence threshold. Default: 0.3 +``` + +You will see a new image on your `output` folder, where bounding boxes are plotted. + +Supported input types: + +- Single image, include `jpg`, `jpeg`, `png`, `ppm`, `bmp`, `pgm`, `tif`, `tiff`, `webp`. +- Folder, all image files in the folder will be traversed and the corresponding results will be output. +- URL, will automatically download from the URL and the corresponding results will be output. + +Option (b). If you install MMYOLO with MIM, open your python interpreter and copy&paste the following codes. + +```python +from mmdet.apis import init_detector, inference_detector + +config_file = 'yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py' +checkpoint_file = 'yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth' +model = init_detector(config_file, checkpoint_file, device='cpu') # or device='cuda:0' +inference_detector(model, 'demo/demo.jpg') +``` + +You will see a list of `DetDataSample`, and the predictions are in the `pred_instance`, indicating the detected bounding boxes, labels, and scores. + +## Using MMYOLO with Docker + +We provide a [Dockerfile](https://github.com/open-mmlab/mmyolo/blob/main/docker/Dockerfile) to build an image. Ensure that your [docker version](https://docs.docker.com/engine/install/) >=19.03. + +Reminder: If you find out that your download speed is very slow, we suggest canceling the comments in the last two lines of `Optional` in the [Dockerfile](https://github.com/open-mmlab/mmyolo/blob/main/docker/Dockerfile#L19-L20) to obtain a rocket like download speed: + +```dockerfile +# (Optional) +RUN sed -i 's/http:\/\/archive.ubuntu.com\/ubuntu\//http:\/\/mirrors.aliyun.com\/ubuntu\//g' /etc/apt/sources.list && \ + pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple +``` + +Build Command: + +```shell +# build an image with PyTorch 1.9, CUDA 11.1 +# If you prefer other versions, just modified the Dockerfile +docker build -t mmyolo docker/ +``` + +Run it with: + +```shell +export DATA_DIR=/path/to/your/dataset +docker run --gpus all --shm-size=8g -it -v ${DATA_DIR}:/mmyolo/data mmyolo +``` + +For other customized inatallation, see [Customized Installation](../tutorials/custom_installation.md) + +## Troubleshooting + +If you have some issues during the installation, please first view the [FAQ](../tutorials/faq.md) page. +You may [open an issue](https://github.com/open-mmlab/mmyolo/issues/new/choose) on GitHub if no solution is found. diff --git a/third_party/mmyolo/docs/en/get_started/overview.md b/third_party/mmyolo/docs/en/get_started/overview.md new file mode 100644 index 0000000000000000000000000000000000000000..52bcbd1716674a42e2155e27a948250777fe958f --- /dev/null +++ b/third_party/mmyolo/docs/en/get_started/overview.md @@ -0,0 +1,81 @@ +# Overview + +## MMYOLO Introduction + +
+image +
+ +MMYOLO is an open-source algorithms toolkit of YOLO based on PyTorch and MMDetection, part of the [OpenMMLab](https://openmmlab.com/) project. MMYOLO is positioned as a popular open-source library of YOLO series and core library of industrial applications. Its vision diagram is shown as follows: + +
+vision diagram +
+ +The following tasks are currently supported: + +
+Tasks currently supported + +- Object detection +- Rotated object detection + +
+ +The YOLO series of algorithms currently supported are as follows: + +
+Algorithms currently supported + +- YOLOv5 +- YOLOX +- RTMDet +- RTMDet-Rotated +- YOLOv6 +- YOLOv7 +- PPYOLOE +- YOLOv8 + +
+ +The datasets currently supported are as follows: + +
+Datasets currently supported + +- COCO Dataset +- VOC Dataset +- CrowdHuman Dataset +- DOTA 1.0 Dataset + +
+ +MMYOLO runs on Linux, Windows, macOS, and supports PyTorch 1.7 or later. It has the following three characteristics: + +- 🕹️ **Unified and convenient algorithm evaluation** + + MMYOLO unifies various YOLO algorithm modules and provides a unified evaluation process, so that users can compare and analyze fairly and conveniently. + +- 📚 **Extensive documentation for started and advanced** + + MMYOLO provides a series of documents, including getting started, deployment, advanced practice and algorithm analysis, which is convenient for different users to get started and expand. + +- 🧩 **Modular Design** + + MMYOLO disentangled the framework into modular components, and users can easily build custom models by combining different modules and training and testing strategies. + +Base module-P5 + This image is provided by RangeKing@GitHub, thanks very much! + +## User guide for this documentation + +MMYOLO divides the document structure into 6 parts, corresponding to different user needs. + +- **Get started with MMYOLO**. This part is must read for first-time MMYOLO users, so please read it carefully. +- **Recommend Topics**. This part is the essence documentation provided in MMYOLO by topics, including lots of MMYOLO features, etc. Highly recommended reading for all MMYOLO users. +- **Common functions**. This part provides a list of common features that you will use during the training and testing process, so you can refer back to them when you need. +- **Useful tools**. This part is useful tools summary under `tools`, so that you can quickly and happily use the various scripts provided in MMYOLO. +- **Basic and advanced tutorials**. This part introduces some basic concepts and advanced tutorials in MMYOLO. It is suitable for users who want to understand the design idea and structure design of MMYOLO in detail. +- **Others**. The rest includes model repositories, specifications and interface documentation, etc. + +Users with different needs can choose your favorite content to read. If you have any questions about this documentation or a better idea to improve it, welcome to post a Pull Request to MMYOLO ~. Please refer to [How to Contribute to MMYOLO](../recommended_topics/contributing.md) diff --git a/third_party/mmyolo/docs/en/index.rst b/third_party/mmyolo/docs/en/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..1a0ab6c3b3d170479f096487c21871a2e273beb4 --- /dev/null +++ b/third_party/mmyolo/docs/en/index.rst @@ -0,0 +1,120 @@ +Welcome to MMYOLO's documentation! +======================================= +You can switch between Chinese and English documents in the top-right corner of the layout. + +.. toctree:: + :maxdepth: 2 + :caption: Get Started + + get_started/overview.md + get_started/dependencies.md + get_started/installation.md + get_started/15_minutes_object_detection.md + get_started/15_minutes_rotated_object_detection.md + get_started/15_minutes_instance_segmentation.md + get_started/article.md + +.. 
toctree:: + :maxdepth: 2 + :caption: Recommended Topics + + recommended_topics/contributing.md + recommended_topics/training_testing_tricks.md + recommended_topics/model_design.md + recommended_topics/algorithm_descriptions/index.rst + recommended_topics/application_examples/index.rst + recommended_topics/replace_backbone.md + recommended_topics/complexity_analysis.md + recommended_topics/labeling_to_deployment_tutorials.md + recommended_topics/visualization.md + recommended_topics/deploy/index.rst + recommended_topics/troubleshooting_steps.md + recommended_topics/mm_basics.md + recommended_topics/dataset_preparation.md + +.. toctree:: + :maxdepth: 2 + :caption: Common Usage + + common_usage/resume_training.md + common_usage/syncbn.md + common_usage/amp_training.md + common_usage/ms_training_testing.md + common_usage/tta.md + common_usage/plugins.md + common_usage/freeze_layers.md + common_usage/output_predictions.md + common_usage/set_random_seed.md + common_usage/module_combination.md + common_usage/mim_usage.md + common_usage/multi_necks.md + common_usage/specify_device.md + common_usage/single_multi_channel_applications.md + + +.. toctree:: + :maxdepth: 2 + :caption: Useful Tools + + useful_tools/browse_coco_json.md + useful_tools/browse_dataset.md + useful_tools/print_config.md + useful_tools/dataset_analysis.md + useful_tools/optimize_anchors.md + useful_tools/extract_subcoco.md + useful_tools/vis_scheduler.md + useful_tools/dataset_converters.md + useful_tools/download_dataset.md + useful_tools/log_analysis.md + useful_tools/model_converters.md + +.. toctree:: + :maxdepth: 2 + :caption: Basic Tutorials + + tutorials/config.md + tutorials/data_flow.md + tutorials/custom_installation.md + tutorials/warning_notes.md + tutorials/faq.md + + +.. toctree:: + :maxdepth: 2 + :caption: Advanced Tutorials + + advanced_guides/cross-library_application.md + + +.. toctree:: + :maxdepth: 2 + :caption: Model Zoo + + model_zoo.md + +.. toctree:: + :maxdepth: 1 + :caption: Notes + + notes/changelog.md + notes/compatibility.md + notes/conventions.md + notes/code_style.md + +.. toctree:: + :maxdepth: 1 + :caption: API Reference + + api.rst + +.. toctree:: + :caption: Switch Language + + switch_language.md + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`search` diff --git a/third_party/mmyolo/docs/en/make.bat b/third_party/mmyolo/docs/en/make.bat new file mode 100644 index 0000000000000000000000000000000000000000..922152e96a04a242e6fc40f124261d74890617d8 --- /dev/null +++ b/third_party/mmyolo/docs/en/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. 
+ echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/third_party/mmyolo/docs/en/model_zoo.md b/third_party/mmyolo/docs/en/model_zoo.md new file mode 100644 index 0000000000000000000000000000000000000000..1547bb9d090c3f7aa65c2c1a39ae10fa096cb0f8 --- /dev/null +++ b/third_party/mmyolo/docs/en/model_zoo.md @@ -0,0 +1,94 @@ +# Model Zoo and Benchmark + +This page is used to summarize the performance and related evaluation metrics of various models supported in MMYOLO for users to compare and analyze. + +## COCO dataset + +
+ +
+ +| Model | Arch | Size | Batch Size | Epoch | SyncBN | AMP | Mem (GB) | Params(M) | FLOPs(G) | TRT-FP16-GPU-Latency(ms) | Box AP | TTA Box AP | +| :--------------: | :--: | :--: | :--------: | :---: | :----: | :-: | :------: | :-------: | :------: | :----------------------: | :----: | :--------: | +| YOLOv5-n | P5 | 640 | 8xb16 | 300 | Yes | Yes | 1.5 | 1.87 | 2.26 | 1.14 | 28.0 | 30.7 | +| YOLOv6-v2.0-n | P5 | 640 | 8xb32 | 400 | Yes | Yes | 6.04 | 4.32 | 5.52 | 1.37 | 36.2 | | +| YOLOv8-n | P5 | 640 | 8xb16 | 500 | Yes | Yes | 2.5 | 3.16 | 4.4 | 1.53 | 37.4 | 39.9 | +| RTMDet-tiny | P5 | 640 | 8xb32 | 300 | Yes | No | 11.9 | 4.90 | 8.09 | 2.31 | 41.8 | 43.2 | +| YOLOv6-v2.0-tiny | P5 | 640 | 8xb32 | 400 | Yes | Yes | 8.13 | 9.70 | 12.37 | 2.19 | 41.0 | | +| YOLOv7-tiny | P5 | 640 | 8xb16 | 300 | Yes | Yes | 2.7 | 6.23 | 6.89 | 1.88 | 37.5 | | +| YOLOX-tiny | P5 | 416 | 8xb32 | 300 | No | Yes | 4.9 | 5.06 | 7.63 | 1.19 | 34.3 | | +| RTMDet-s | P5 | 640 | 8xb32 | 300 | Yes | No | 16.3 | 8.89 | 14.84 | 2.89 | 45.7 | 47.3 | +| YOLOv5-s | P5 | 640 | 8xb16 | 300 | Yes | Yes | 2.7 | 7.24 | 8.27 | 1.89 | 37.7 | 40.2 | +| YOLOv6-v2.0-s | P5 | 640 | 8xb32 | 400 | Yes | Yes | 8.88 | 17.22 | 21.94 | 2.67 | 44.0 | | +| YOLOv8-s | P5 | 640 | 8xb16 | 500 | Yes | Yes | 4.0 | 11.17 | 14.36 | 2.61 | 45.1 | 46.8 | +| YOLOX-s | P5 | 640 | 8xb32 | 300 | No | Yes | 9.8 | 8.97 | 13.40 | 2.38 | 41.9 | | +| PPYOLOE+ -s | P5 | 640 | 8xb8 | 80 | Yes | No | 4.7 | 7.93 | 8.68 | 2.54 | 43.5 | | +| RTMDet-m | P5 | 640 | 8xb32 | 300 | Yes | No | 29.0 | 24.71 | 39.21 | 6.23 | 50.2 | 51.9 | +| YOLOv5-m | P5 | 640 | 8xb16 | 300 | Yes | Yes | 5.0 | 21.19 | 24.53 | 4.28 | 45.3 | 46.9 | +| YOLOv6-v2.0-m | P5 | 640 | 8xb32 | 300 | Yes | Yes | 16.69 | 34.25 | 40.7 | 5.12 | 48.4 | | +| YOLOv8-m | P5 | 640 | 8xb16 | 500 | Yes | Yes | 7.0 | 25.9 | 39.57 | 5.78 | 50.6 | 52.3 | +| YOLOX-m | P5 | 640 | 8xb32 | 300 | No | Yes | 17.6 | 25.33 | 36.88 | 5.31 | 47.5 | | +| PPYOLOE+ -m | P5 | 640 | 8xb8 | 80 | Yes | No | 8.4 | 23.43 | 24.97 | 5.47 | 49.5 | | +| RTMDet-l | P5 | 640 | 8xb32 | 300 | Yes | No | 45.2 | 52.32 | 80.12 | 10.13 | 52.3 | 53.7 | +| YOLOv5-l | P5 | 640 | 8xb16 | 300 | Yes | Yes | 8.1 | 46.56 | 54.65 | 6.8 | 48.8 | 49.9 | +| YOLOv6-v2.0-l | P5 | 640 | 8xb32 | 300 | Yes | Yes | 20.86 | 58.53 | 71.43 | 8.78 | 51.0 | | +| YOLOv7-l | P5 | 640 | 8xb16 | 300 | Yes | Yes | 10.3 | 36.93 | 52.42 | 6.63 | 50.9 | | +| YOLOv8-l | P5 | 640 | 8xb16 | 500 | Yes | Yes | 9.1 | 43.69 | 82.73 | 8.97 | 53.0 | 54.4 | +| YOLOX-l | P5 | 640 | 8xb8 | 300 | No | Yes | 8.0 | 54.21 | 77.83 | 9.23 | 50.1 | | +| PPYOLOE+ -l | P5 | 640 | 8xb8 | 80 | Yes | No | 13.2 | 52.20 | 55.05 | 8.2 | 52.6 | | +| RTMDet-x | P5 | 640 | 8xb32 | 300 | Yes | No | 63.4 | 94.86 | 145.41 | 17.89 | 52.8 | 54.2 | +| YOLOv7-x | P5 | 640 | 8xb16 | 300 | Yes | Yes | 13.7 | 71.35 | 95.06 | 11.63 | 52.8 | | +| YOLOv8-x | P5 | 640 | 8xb16 | 500 | Yes | Yes | 12.4 | 68.23 | 132.10 | 14.22 | 54.0 | 55.0 | +| YOLOX-x | P5 | 640 | 8xb8 | 300 | No | Yes | 9.8 | 99.07 | 144.39 | 15.35 | 51.4 | | +| PPYOLOE+ -x | P5 | 640 | 8xb8 | 80 | Yes | No | 19.1 | 98.42 | 105.48 | 14.02 | 54.2 | | +| YOLOv5-n | P6 | 1280 | 8xb16 | 300 | Yes | Yes | 5.8 | 3.25 | 2.30 | | 35.9 | | +| YOLOv5-s | P6 | 1280 | 8xb16 | 300 | Yes | Yes | 10.5 | 12.63 | 8.45 | | 44.4 | | +| YOLOv5-m | P6 | 1280 | 8xb16 | 300 | Yes | Yes | 19.1 | 35.73 | 25.05 | | 51.3 | | +| YOLOv5-l | P6 | 1280 | 8xb16 | 300 | Yes | Yes | 30.5 | 76.77 | 55.77 | | 53.7 | | +| YOLOv7-w | P6 | 1280 | 8xb16 | 300 | Yes | Yes | 27.0 | 
82.31 | 45.07 | | 54.1 | | +| YOLOv7-e | P6 | 1280 | 8xb16 | 300 | Yes | Yes | 42.5 | 114.69 | 64.48 | | 55.1 | | + +- All the models are trained on COCO train2017 dataset and evaluated on val2017 dataset. +- TRT-FP16-GPU-Latency(ms) is the GPU Compute time on NVIDIA Tesla T4 device with TensorRT 8.4, a batch size of 1, a test shape of 640x640 and only model forward (The test shape for YOLOX-tiny is 416x416) +- The number of model parameters and FLOPs are obtained using the [get_flops](https://github.com/open-mmlab/mmyolo/blob/dev/tools/analysis_tools/get_flops.py) script. Different calculation methods may vary slightly +- RTMDet performance is the result of training with [MMRazor Knowledge Distillation](https://github.com/open-mmlab/mmyolo/blob/dev/configs/rtmdet/distillation/README.md) +- Only YOLOv6 version 2.0 is implemented in MMYOLO for now, and L and M are the results without knowledge distillation +- YOLOv8 results are optimized using mask instance annotation, but YOLOv5, YOLOv6 and YOLOv7 do not use +- PPYOLOE+ uses Obj365 as pre-training weights, so the number of epochs for COCO training only needs 80 +- YOLOX-tiny, YOLOX-s and YOLOX-m are trained with the optimizer parameters proposed in RTMDet, with different degrees of performance improvement compared to the original implementation. + +Please see below items for more details + +- [RTMDet](https://github.com/open-mmlab/mmyolo/blob/main/configs/rtmdet) +- [YOLOv5](https://github.com/open-mmlab/mmyolo/blob/main/configs/yolov5) +- [YOLOv6](https://github.com/open-mmlab/mmyolo/blob/main/configs/yolov6) +- [YOLOv7](https://github.com/open-mmlab/mmyolo/blob/main/configs/yolov7) +- [YOLOv8](https://github.com/open-mmlab/mmyolo/blob/main/configs/yolov8) +- [YOLOX](https://github.com/open-mmlab/mmyolo/blob/main/configs/yolox) +- [PPYOLO-E](https://github.com/open-mmlab/mmyolo/blob/main/configs/ppyoloe) + +## VOC dataset + +| Backbone | size | Batchsize | AMP | Mem (GB) | box AP(COCO metric) | +| :------: | :--: | :-------: | :-: | :------: | :-----------------: | +| YOLOv5-n | 512 | 64 | Yes | 3.5 | 51.2 | +| YOLOv5-s | 512 | 64 | Yes | 6.5 | 62.7 | +| YOLOv5-m | 512 | 64 | Yes | 12.0 | 70.1 | +| YOLOv5-l | 512 | 32 | Yes | 10.0 | 73.1 | + +Please see below items for more details + +- [YOLOv5](https://github.com/open-mmlab/mmyolo/blob/main/configs/yolov5) + +## CrowdHuman dataset + +| Backbone | size | SyncBN | AMP | Mem (GB) | ignore_iof_thr | box AP50(CrowDHuman Metric) | MR | JI | +| :------: | :--: | :----: | :-: | :------: | :------------: | :-------------------------: | :--: | :---: | +| YOLOv5-s | 640 | Yes | Yes | 2.6 | -1 | 85.79 | 48.7 | 75.33 | +| YOLOv5-s | 640 | Yes | Yes | 2.6 | 0.5 | 86.17 | 48.8 | 75.87 | + +Please see below items for more details + +- [YOLOv5](https://github.com/open-mmlab/mmyolo/blob/main/configs/yolov5) + +## DOTA 1.0 dataset diff --git a/third_party/mmyolo/docs/en/notes/changelog.md b/third_party/mmyolo/docs/en/notes/changelog.md new file mode 100644 index 0000000000000000000000000000000000000000..fa3e1a776423df5c5a05d36870350e5b2fcd0bb1 --- /dev/null +++ b/third_party/mmyolo/docs/en/notes/changelog.md @@ -0,0 +1,342 @@ +# Changelog + +## v0.6.0 (15/8/2023) + +### Highlights + +- Support YOLOv5 instance segmentation +- Support YOLOX-Pose based on MMPose +- Add 15 minutes instance segmentation tutorial. 
+- YOLOv5 supports using mask annotation to optimize bbox +- Add Multi-scale training and testing docs + +### New Features + +- Add training and testing tricks doc (#659) +- Support setting the cache_size_limit parameter and support mmdet 3.0.0 (#707) +- Support YOLOv5u and YOLOv6 3.0 inference (#624, #744) +- Support model-only inference (#733) +- Add YOLOv8 deepstream config (#633) +- Add ionogram example in MMYOLO application (#643) + +### Bug Fixes + +- Fix the browse_dataset for visualization of test and val (#641) +- Fix installation doc error (#662) +- Fix yolox-l ckpt link (#677) +- Fix typos in the YOLOv7 and YOLOv8 diagram (#621, #710) +- Adjust the order of package imports in `boxam_vis_demo.py` (#655) + +### Improvements + +- Optimize the `convert_kd_ckpt_to_student.py` file (#647) +- Add en doc of `FAQ` and `training_testing_tricks` (#691,#693) + +### Contributors + +A total of 21 developers contributed to this release. + +Thank @Lum1104,@azure-wings,@FeiGeChuanShu,@Lingrui Gu,@Nioolek,@huayuan4396,@RangeKing,@danielhonies,@yechenzhi,@JosonChan1998,@kitecats,@Qingrenn,@triple-Mu,@kikefdezl,@zhangrui-wolf,@xin-li-67,@Ben-Louis,@zgzhengSEU,@VoyagerXvoyagerx,@tang576225574,@hhaAndroid + +## v0.5.0 (2/3/2023) + +### Highlights + +1. Support [RTMDet-R](https://github.com/open-mmlab/mmyolo/blob/dev/configs/rtmdet/README.md#rotated-object-detection) rotated object detection +2. Support for using mask annotation to improve [YOLOv8](https://github.com/open-mmlab/mmyolo/blob/dev/configs/yolov8/README.md) object detection performance +3. Support [MMRazor](https://github.com/open-mmlab/mmyolo/blob/dev/configs/razor/subnets/README.md) searchable NAS sub-network as the backbone of YOLO series algorithm +4. Support calling [MMRazor](https://github.com/open-mmlab/mmyolo/blob/dev/configs/rtmdet/distillation/README.md) to distill the knowledge of RTMDet +5. [MMYOLO](https://mmyolo.readthedocs.io/zh_CN/dev/) document structure optimization, comprehensive content upgrade +6. Improve YOLOX mAP and training speed based on RTMDet training hyperparameters +7. Support calculation of model parameters and FLOPs, provide GPU latency data on T4 devices, and update [Model Zoo](https://github.com/open-mmlab/mmyolo/blob/dev/docs/en/model_zoo.md) +8. Support test-time augmentation (TTA) +9. Support RTMDet, YOLOv8 and YOLOv7 assigner visualization + +### New Features + +01. Support inference for RTMDet instance segmentation tasks (#583) +02. Beautify the configuration file in MMYOLO and add more comments (#501, #506, #516, #529, #531, #539) +03. Refactor and optimize documentation (#568, #573, #579, #584, #587, #589, #596, #599, #600) +04. Support fast version of YOLOX (#518) +05. Support DeepStream in EasyDeploy and add documentation (#485, #545, #571) +06. Add confusion matrix drawing script (#572) +07. Add single channel application case (#460) +08. Support auto registration (#597) +09. Support Box CAM of YOLOv7, YOLOv8 and PPYOLOE (#601) +10. Add automated generation of MM series repo registration information and tools scripts (#559) +11. Added YOLOv7 model structure diagram (#504) +12. Add how to specify specific GPU training and inference files (#503) +13. Add check if `metainfo` is all lowercase when training or testing (#535) +14. Add links to Twitter, Discord, Medium, YouTube, etc. (#555) + +### Bug Fixes + +1. Fix isort version issue (#492, #497) +2. Fix type error of assigner visualization (#509) +3. Fix YOLOv8 documentation link error (#517) +4. Fix RTMDet Decoder error in EasyDeploy (#519) +5. 
Fix some document linking errors (#537) +6. Fix RTMDet-Tiny weight path error (#580) + +### Improvements + +1. Update `contributing.md` +2. Optimize `DetDataPreprocessor` branch to support multitasking (#511) +3. Optimize `gt_instances_preprocess` so it can be used for other YOLO algorithms (#532) +4. Add `yolov7-e6e` weight conversion script (#570) +5. Reference YOLOv8 inference code modification PPYOLOE + +### Contributors + +A total of 22 developers contributed to this release. + +Thank @triple-Mu, @isLinXu, @Audrey528, @TianWen580, @yechenzhi, @RangeKing, @lyviva, @Nioolek, @PeterH0323, @tianleiSHI, @aptsunny, @satuoqaq, @vansin, @xin-li-67, @VoyagerXvoyagerx, +@landhill, @kitecats, @tang576225574, @HIT-cwh, @AI-Tianlong, @RangiLyu, @hhaAndroid + +## v0.4.0 (18/1/2023) + +### Highlights + +1. Implemented [YOLOv8](https://github.com/open-mmlab/mmyolo/blob/dev/configs/yolov8/README.md) object detection model, and supports model deployment in [projects/easydeploy](https://github.com/open-mmlab/mmyolo/blob/dev/projects/easydeploy) +2. Added Chinese and English versions of [Algorithm principles and implementation with YOLOv8](https://github.com/open-mmlab/mmyolo/blob/dev/docs/en/algorithm_descriptions/yolov8_description.md) + +### New Features + +1. Added YOLOv8 and PPYOLOE model structure diagrams (#459, #471) +2. Adjust the minimum supported Python version from 3.6 to 3.7 (#449) +3. Added a new YOLOX decoder in TensorRT-8 (#450) +4. Add a tool for scheduler visualization (#479) + +### Bug Fixes + +1. Fix `optimize_anchors.py` script import error (#452) +2. Fix the wrong installation steps in `get_started.md` (#474) +3. Fix the neck error when using the `RTMDet` P6 model (#480) + +### Contributors + +A total of 9 developers contributed to this release. + +Thank @VoyagerXvoyagerx, @tianleiSHI, @RangeKing, @PeterH0323, @Nioolek, @triple-Mu, @lyviva, @Zheng-LinXiao, @hhaAndroid + +## v0.3.0 (8/1/2023) + +### Highlights + +1. Implement fast version of [RTMDet](https://github.com/open-mmlab/mmyolo/blob/dev/configs/rtmdet/README.md). RTMDet-s 8xA100 training takes only 14 hours. The training speed is 2.6 times faster than the previous version. +2. Support [PPYOLOE](https://github.com/open-mmlab/mmyolo/blob/dev/configs/ppyoloe/README.md) training +3. Support `iscrowd` attribute training in [YOLOv5](https://github.com/open-mmlab/mmyolo/blob/dev/configs/yolov5/crowdhuman/yolov5_s-v61_8xb16-300e_ignore_crowdhuman.py) +4. Support [YOLOv5 assigner result visualization](https://github.com/open-mmlab/mmyolo/blob/dev/projects/assigner_visualization/README.md) + +### New Features + +01. Add `crowdhuman` dataset (#368) +02. Easydeploy support TensorRT inference (#377) +03. Add `YOLOX` structure description (#402) +04. Add a feature for the video demo (#392) +05. Support `YOLOv7` easy deploy (#427) +06. Add resume from specific checkpoint in CLI (#393) +07. Set `metainfo` fields to lower case (#362, #412) +08. Add module combination doc (#349, #352, #345) +09. Add docs about how to freeze the weight of backbone or neck (#418) +10. Add don't used pre-training weights doc in `how_to.md` (#404) +11. Add docs about how to set the random seed (#386) +12. Translate `rtmdet_description.md` document to English (#353) +13. Add doc of `yolov6_description.md` (#382, #372) + +### Bug Fixes + +01. Fix bugs in the output annotation file when `--class-id-txt` is set (#430) +02. Fix batch inference bug in `YOLOv5` head (#413) +03. Fix typehint in some heads (#415, #416, #443) +04. 
Fix RuntimeError of `torch.cat()` expected a non-empty list of Tensors (#376) +05. Fix the device inconsistency error in `YOLOv7` training (#397) +06. Fix the `scale_factor` and `pad_param` value in `LetterResize` (#387) +07. Fix docstring graph rendering error of readthedocs (#400) +08. Fix AssertionError when `YOLOv6` from training to val (#378) +09. Fix CI error due to `np.int` and legacy builder.py (#389) +10. Fix MMDeploy rewriter (#366) +11. Fix MMYOLO unittest scope bug (#351) +12. Fix `pad_param` error (#354) +13. Fix twice head inference bug (#342) +14. Fix customize dataset training (#428) + +### Improvements + +01. Update `useful_tools.md` (#384) +02. update the English version of `custom_dataset.md` (#381) +03. Remove context argument from the rewriter function (#395) +04. deprecating `np.bool` type alias (#396) +05. Add new video link for custom dataset (#365) +06. Export onnx for model only (#361) +07. Add MMYOLO regression test yml (#359) +08. Update video tutorials in `article.md` (#350) +09. Add deploy demo (#343) +10. Optimize the vis results of large images in debug mode (#346) +11. Improve args for `browse_dataset` and support `RepeatDataset` (#340, #338) + +### Contributors + +A total of 28 developers contributed to this release. + +Thank @RangeKing, @PeterH0323, @Nioolek, @triple-Mu, @matrixgame2018, @xin-li-67, @tang576225574, @kitecats, @Seperendity, @diplomatist, @vaew, @wzr-skn, @VoyagerXvoyagerx, @MambaWong, @tianleiSHI, @caj-github, @zhubochao, @lvhan028, @dsghaonan, @lyviva, @yuewangg, @wang-tf, @satuoqaq, @grimoire, @RunningLeon, @hanrui1sensetime, @RangiLyu, @hhaAndroid + +## v0.2.0(1/12/2022) + +### Highlights + +1. Support [YOLOv7](https://github.com/open-mmlab/mmyolo/tree/dev/configs/yolov7) P5 and P6 model +2. Support [YOLOv6](https://github.com/open-mmlab/mmyolo/blob/dev/configs/yolov6/README.md) ML model +3. Support [Grad-Based CAM and Grad-Free CAM](https://github.com/open-mmlab/mmyolo/blob/dev/demo/boxam_vis_demo.py) +4. Support [large image inference](https://github.com/open-mmlab/mmyolo/blob/dev/demo/large_image_demo.py) based on sahi +5. Add [easydeploy](https://github.com/open-mmlab/mmyolo/blob/dev/projects/easydeploy/README.md) project under the projects folder +6. Add [custom dataset guide](https://github.com/open-mmlab/mmyolo/blob/dev/docs/zh_cn/user_guides/custom_dataset.md) + +### New Features + +1. `browse_dataset.py` script supports visualization of original image, data augmentation and intermediate results (#304) +2. Add flag to output labelme label file in `image_demo.py` (#288, #314) +3. Add `labelme2coco` script (#308, #313) +4. Add split COCO dataset script (#311) +5. Add two examples of backbone replacement in `how-to.md` and update `plugin.md` (#291) +6. Add `contributing.md` and `code_style.md` (#322) +7. Add docs about how to use mim to run scripts across libraries (#321) +8. Support `YOLOv5` deployment at RV1126 device (#262) + +### Bug Fixes + +1. Fix MixUp padding error (#319) +2. Fix scale factor order error of `LetterResize` and `YOLOv5KeepRatioResize` (#305) +3. Fix training errors of `YOLOX Nano` model (#285) +4. Fix `RTMDet` deploy error (#287) +5. Fix int8 deploy config (#315) +6. Fix `make_stage_plugins` doc in `basebackbone` (#296) +7. Enable switch to deploy when create pytorch model in deployment (#324) +8. Fix some errors in `RTMDet` model graph (#317) + +### Improvements + +1. Add option of json output in `test.py` (#316) +2. Add area condition in `extract_subcoco.py` script (#286) +3. 
Deployment doc translation (#289)
+4. Add YOLOv6 description overview doc (#252)
+5. Improve `config.md` (#297, #303)
+6. Add mosaic9 graph in docstring (#307)
+7. Improve `browse_coco_json.py` script args (#309)
+8. Refactor some functions in `dataset_analysis.py` to be more general (#294)
+
+#### Contributors
+
+A total of 14 developers contributed to this release.
+
+Thank @fcakyon, @matrixgame2018, @MambaWong, @imAzhou, @triple-Mu, @RangeKing, @PeterH0323, @xin-li-67, @kitecats, @hanrui1sensetime, @AllentDan, @Zheng-LinXiao, @hhaAndroid, @wanghonglie
+
+## v0.1.3(10/11/2022)
+
+### New Features
+
+1. Support CBAM plug-in and provide plug-in documentation (#246)
+2. Add YOLOv5 P6 model structure diagram and related descriptions (#273)
+
+### Bug Fixes
+
+1. Fix training failure when saving best weights based on mmengine 0.3.1
+2. Fix `add_dump_metric` error based on mmdet 3.0.0rc3 (#253)
+3. Fix the issue that the backbone does not support `init_cfg` (#272)
+4. Change typing import method based on mmdet 3.0.0rc3 (#261)
+
+### Improvements
+
+1. `featmap_vis_demo` support for folder and url input (#248)
+2. Deploy docker file refinement (#242)
+
+#### Contributors
+
+A total of 10 developers contributed to this release.
+
+Thank @kitecats, @triple-Mu, @RangeKing, @PeterH0323, @Zheng-LinXiao, @tkhe, @weikai520, @zytx121, @wanghonglie, @hhaAndroid
+
+## v0.1.2(3/11/2022)
+
+### Highlights
+
+1. Support [YOLOv5/YOLOv6/YOLOX/RTMDet deployments](https://github.com/open-mmlab/mmyolo/blob/main/configs/deploy) for ONNXRuntime and TensorRT
+2. Support [YOLOv6](https://github.com/open-mmlab/mmyolo/blob/main/configs/yolov6) s/t/n model training
+3. YOLOv5 supports [P6 model training which can input 1280-scale images](https://github.com/open-mmlab/mmyolo/blob/main/configs/yolov5)
+4. YOLOv5 supports [VOC dataset training](https://github.com/open-mmlab/mmyolo/blob/main/configs/yolov5/voc)
+5. Support [PPYOLOE](https://github.com/open-mmlab/mmyolo/blob/main/configs/ppyoloe) and [YOLOv7](https://github.com/open-mmlab/mmyolo/blob/main/configs/yolov7) model inference and official weight conversion
+6. Add YOLOv5 replacement [backbone tutorial](https://github.com/open-mmlab/mmyolo/blob/dev/docs/en/advanced_guides/how_to.md#use-backbone-network-implemented-in-other-openmmlab-repositories) in How-to documentation
+
+### New Features
+
+1. Add `optimize_anchors` script (#175)
+2. Add `extract_subcoco` script (#186)
+3. Add `yolo2coco` conversion script (#161)
+4. Add `dataset_analysis` script (#172)
+5. Remove Albu version restrictions (#187)
+
+### Bug Fixes
+
+1. Fix the problem that `cfg.resume` does not work when set (#221)
+2. Fix the problem of not showing bbox in feature map visualization script (#204)
+3. Update the metafile of RTMDet (#188)
+4. Fix a visualization error in `test_pipeline` (#166)
+5. Update badges (#140)
+
+### Improvements
+
+1. Optimize the Readthedocs display page (#209)
+2. Add docstring for module structure diagram for base model (#196)
+3. Support for not including any instance logic in LoadAnnotations (#161)
+4. Update `image_demo` script to support folder and url paths (#128)
+5. Update pre-commit hook (#129)
+
+### Documentation
+
+1. Translate `yolov5_description.md`, `yolov5_tutorial.md` and `visualization.md` into English (#138, #198, #206)
+2. Add deployment-related Chinese documentation (#220)
+3. Update `config.md`, `faq.md` and `pull_request_template.md` (#190, #191, #200)
+4. Update the `article` page (#133)
+
+#### Contributors
+
+A total of 14 developers contributed to this release.
+ +Thank @imAzhou, @triple-Mu, @RangeKing, @PeterH0323, @xin-li-67, @Nioolek, @kitecats, @Bin-ze, @JiayuXu0, @cydiachen, @zhiqwang, @Zheng-LinXiao, @hhaAndroid, @wanghonglie + +## v0.1.1(29/9/2022) + +Based on MMDetection's RTMDet high precision and low latency object detection algorithm, we have also released RTMDet and provided a Chinese document on the principle and implementation of RTMDet. + +### Highlights + +1. Support [RTMDet](https://github.com/open-mmlab/mmyolo/blob/main/configs/rtmdet) +2. Support for backbone customization plugins and update How-to documentation (#75) + +### Bug Fixes + +1. Fix some documentation errors (#66, #72, #76, #83, #86) +2. Fix checkpoints link error (#63) +3. Fix the bug that the output of `LetterResize` does not meet the expectation when using `imscale` (#105) + +### Improvements + +1. Reducing the size of docker images (#67) +2. Simplifying `Compose` Logic in `BaseMixImageTransform` (#71) +3. Supports dump results in `test.py` (#84) + +#### Contributors + +A total of 13 developers contributed to this release. + +Thank @wanghonglie, @hhaAndroid, @yang-0201, @PeterH0323, @RangeKing, @satuoqaq, @Zheng-LinXiao, @xin-li-67, @suibe-qingtian, @MambaWong, @MichaelCai0912, @rimoire, @Nioolek + +## v0.1.0(21/9/2022) + +We have released MMYOLO open source library, which is based on MMEngine, MMCV 2.x and MMDetection 3.x libraries. At present, the object detection has been realized, and it will be expanded to multi-task in the future. + +### Highlights + +1. Support YOLOv5/YOLOX training, support YOLOv6 inference. Deployment will be supported soon. +2. Refactored YOLOX from MMDetection to accelerate training and inference. +3. Detailed introduction and advanced tutorials are provided, see the [English tutorial](https://mmyolo.readthedocs.io/en/latest). diff --git a/third_party/mmyolo/docs/en/notes/code_style.md b/third_party/mmyolo/docs/en/notes/code_style.md new file mode 100644 index 0000000000000000000000000000000000000000..3bc8291e24cdc998a0a412ec8b70ba23be4821b8 --- /dev/null +++ b/third_party/mmyolo/docs/en/notes/code_style.md @@ -0,0 +1,3 @@ +# Code Style + +Coming soon. Please refer to [chinese documentation](https://mmyolo.readthedocs.io/zh_CN/latest/community/code_style.html). diff --git a/third_party/mmyolo/docs/en/notes/compatibility.md b/third_party/mmyolo/docs/en/notes/compatibility.md new file mode 100644 index 0000000000000000000000000000000000000000..7e6ad3da3e116d055d7cc5d7039fad8a9ecdaee6 --- /dev/null +++ b/third_party/mmyolo/docs/en/notes/compatibility.md @@ -0,0 +1,46 @@ +# Compatibility of MMYOLO + +## MMYOLO 0.3.0 + +### METAINFO modification + +To unify with other OpenMMLab repositories, change all keys of `METAINFO` in Dataset from upper case to lower case. + +| Before v0.3.0 | after v0.3.0 | +| :-----------: | :----------: | +| CLASSES | classes | +| PALETTE | palette | +| DATASET_TYPE | dataset_type | + +### About the order of image shape + +In OpenMMLab 2.0, to be consistent with the input argument of OpenCV, the argument about image shape in the data transformation pipeline is always in the `(width, height)` order. On the contrary, for computation convenience, the order of the field going through the data pipeline and the model is `(height, width)`. 
Specifically, in the results processed by each data transform pipeline, the fields and their value meaning is as below: + +- img_shape: (height, width) +- ori_shape: (height, width) +- pad_shape: (height, width) +- batch_input_shape: (height, width) + +As an example, the initialization arguments of `Mosaic` are as below: + +```python +@TRANSFORMS.register_module() +class Mosaic(BaseTransform): + def __init__(self, + img_scale: Tuple[int, int] = (640, 640), + center_ratio_range: Tuple[float, float] = (0.5, 1.5), + bbox_clip_border: bool = True, + pad_val: float = 114.0, + prob: float = 1.0) -> None: + ... + + # img_scale order should be (width, height) + self.img_scale = img_scale + + def transform(self, results: dict) -> dict: + ... + + results['img'] = mosaic_img + # (height, width) + results['img_shape'] = mosaic_img.shape[:2] +``` diff --git a/third_party/mmyolo/docs/en/notes/conventions.md b/third_party/mmyolo/docs/en/notes/conventions.md new file mode 100644 index 0000000000000000000000000000000000000000..40ca991c6cb845df4ee6f5a9a879bf6ff1d58765 --- /dev/null +++ b/third_party/mmyolo/docs/en/notes/conventions.md @@ -0,0 +1,36 @@ +# Conventions + +Please check the following conventions if you would like to modify MMYOLO as your own project. + +## About the order of image shape + +In OpenMMLab 2.0, to be consistent with the input argument of OpenCV, the argument about image shape in the data transformation pipeline is always in the `(width, height)` order. On the contrary, for computation convenience, the order of the field going through the data pipeline and the model is `(height, width)`. Specifically, in the results processed by each data transform pipeline, the fields and their value meaning is as below: + +- img_shape: (height, width) +- ori_shape: (height, width) +- pad_shape: (height, width) +- batch_input_shape: (height, width) + +As an example, the initialization arguments of `Mosaic` are as below: + +```python +@TRANSFORMS.register_module() +class Mosaic(BaseTransform): + def __init__(self, + img_scale: Tuple[int, int] = (640, 640), + center_ratio_range: Tuple[float, float] = (0.5, 1.5), + bbox_clip_border: bool = True, + pad_val: float = 114.0, + prob: float = 1.0) -> None: + ... + + # img_scale order should be (width, height) + self.img_scale = img_scale + + def transform(self, results: dict) -> dict: + ... + + results['img'] = mosaic_img + # (height, width) + results['img_shape'] = mosaic_img.shape[:2] +``` diff --git a/third_party/mmyolo/docs/en/recommended_topics/algorithm_descriptions/index.rst b/third_party/mmyolo/docs/en/recommended_topics/algorithm_descriptions/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..e51d04cb36c92e88976c201ffb2a543987eb717f --- /dev/null +++ b/third_party/mmyolo/docs/en/recommended_topics/algorithm_descriptions/index.rst @@ -0,0 +1,9 @@ +Algorithm principles and implementation +****************************************** + +.. 
toctree:: + :maxdepth: 1 + + yolov5_description.md + yolov8_description.md + rtmdet_description.md diff --git a/third_party/mmyolo/docs/en/recommended_topics/algorithm_descriptions/rtmdet_description.md b/third_party/mmyolo/docs/en/recommended_topics/algorithm_descriptions/rtmdet_description.md new file mode 100644 index 0000000000000000000000000000000000000000..1cd62828341bc78a67255c1e807992a24d3f82b9 --- /dev/null +++ b/third_party/mmyolo/docs/en/recommended_topics/algorithm_descriptions/rtmdet_description.md @@ -0,0 +1,191 @@ +# Algorithm principles and implementation with RTMDet + +## 0 Introduction + +High performance, low latency one-stage object detection + +
+RTMDet_structure_v1.3 +
+ +RangeKing@github provides the graph above. Thanks, RangeKing! + +Recently,the open-source community has spring up a large number of high-precision object detection projects, one of the most prominent projects is YOLO series. OpenMMLab has also launched MMYOLO in collaboration with the community. +After investigating many improved models in current YOLO series, MMDetection core developers empirically summarized these designs and training methods, and optimized them to launch a single-stage object detector with high accuracy and low latency RTMDet, **R**eal-**t**ime **M**odels for Object **Det**ection +(**R**elease **t**o **M**anufacture) + +RTMDet consists of a series of tiny/s/m/l/x models of different sizes, which provide different choices for different application scenarios. +Specifically, RTMDet-x achieves a 300+ FPS inference speed with an accuracy of 52.6 mAP. + +```{note} +Note: Inference speed and accuracy test (excluding NMS) were performed on `TensorRT 8.4.3, cuDNN 8.2.0, FP16, batch size=1` on 1 NVIDIA 3090 GPU. +``` + +The lightest model, RTMDet-tiny, can achieve 40.9 mAP with only 4M parameters and inference speed \< 1 ms. + +
+RTMDet_accuracy_graph +
+ +The accuracy in this figure is a fair comparison to 300 training epochs, without distillation. + +| | mAP | Params | Flops | Inference speed | +| -------------------------- | --------------- | -------------- | ------------ | --------------- | +| Baseline(YOLOX) | 40.2 | 9M | 13.4G | 1.2ms | +| + AdamW + Flat Cosine | 40.6 (+0.4) | 9M | 13.4G | 1.2ms | +| + CSPNeXt backbone & PAFPN | 41.8 (+1.2) | 10.07M (+1.07) | 14.8G (+1.4) | 1.22ms (+0.02) | +| + SepBNHead | 41.8 (+0) | 8.89M (-1.18) | 14.8G | 1.22ms | +| + Label Assign & Loss | 42.9 (+1.1) | 8.89M | 14.8G | 1.22ms | +| + Cached Mosaic & MixUp | 44.2 (+1.3) | 8.89M | 14.8G | 1.22ms | +| + RSB-pretrained backbone | **44.5 (+0.3)** | 8.89M | 14.8G | 1.22ms | + +- Official repository: https://github.com/open-mmlab/mmdetection/blob/3.x/configs/rtmdet/README.md +- MMYOLO repository: https://github.com/open-mmlab/mmyolo/blob/main/configs/rtmdet/README.md + +## 1 v1.0 algorithm principle and MMYOLO implementation analysis + +### 1.1 Data augmentation + +Many data augmentation methods are used in RTMDet, mainly include single image data augmentation: + +- **RandomResize** +- **RandomCrop** +- **HSVRandomAug** +- **RandomFlip** + +and mixed image data augmentation: + +- **Mosaic** +- **MixUp** + +The following picture demonstrates the data augmentation process: + +
+image +
+ +The RandomResize hyperparameters are different on the large models M,L,X and the small models S, Tiny. Due to the number of parameters,the large models can use the `large jitter scale strategy` with parameters of (0.1,2.0). The small model adopts the `stand scale jitter` strategy with parameters of (0.5, 2.0). + +The single image data augmentation has been packaged in `MMDetection` so users can directly use all methods through simple configurations. As a very ordinary and common processing method, this part will not be further introduced now. The implementation of mixed image data augmentation is described in the following. + +Unlike YOLOv5, which considers the use of MixUp on S and Nano models is excessive. Small models don't need such strong data augmentation. However, RTMDet also uses MixUp on S and Tiny, because RTMDet will switch to normal aug at last 20 epochs, and this operation was proved to be effective by training. Moreover, RTMDet introduces a Cache scheme for mixed image data augmentation, which effectively reduces the image processing time and introduces adjustable hyperparameters. + +`max_cached_images`, which is similar to `repeated augmentation` when using a smaller cache. The details are as follows: + +| | Use cache | ms / 100 imgs | +| ------ | --------- | ------------- | +| Mosaic | | 87.1 | +| Mosaic | √ | **24.0** | +| MixUp | | 19.3 | +| MixUp | √ | **12.4** | + +| | RTMDet-s | RTMDet-l | +| ----------------------------- | -------- | -------- | +| Mosaic + MixUp + 20e finetune | 43.9 | **51.3** | + +#### 1.1.1 Introducing Cache for mixins data augmentation + +Mosaic&MixUp needs to blend multiple images, which takes k times longer than common data augmentation (k is the number of images mixed in). For example, in YOLOv5, every time Mosaic is done, the information of four images needs to be reloaded from the hard disk. RTMDet only needs to reload the current image, and the rest images participating in the mixed augmentation are obtained from the cache queue, which greatly improves the efficiency by sacrificing a certain memory space. Moreover, we can modify the cache size and pop mode to adjust the strength of augmentation. + +
+data cache +
+ +As shown in the figure, N loaded images and labels are stored in the cache queue in advance. In each training step, only a new image and its label need to be loaded and updated to the cache queue (the images in the cache queue can be repeated, as shown in the figure for img3 twice). Meanwhile, if the cache queue length exceeds the preset length, it will pop a random image (in order to make the Tiny model more stable, the Tiny model doesn't use the random pop, but removes the first added image). When mixed data augmentation is needed, only the required images need to be randomly selected from the cache for splicing and other processing, instead of loading them all from the hard disk, which saves the time of image loading. + +```{note} +The maximum length N of the cache queue is an adjustable parameter. According to the empirical principle, when ten caches are provided for each image to be blended, it can be considered to provide enough randomness, while the Mosaic enhancement is four image blends, so the number of caches defaults to N=40. Similarly, MixUp has a default cache size of 20, but tiny model requires more stable training conditions, so it has half cache size of other specs (10 for MixUp and 20 for Mosaic). +``` + +In the implementation, MMYOLO designed the `BaseMiximageTransform` class to support mixed data augmentation of multiple images: + +```python +if self.use_cached: + # Be careful: deep copying can be very time-consuming + # if results includes dataset. + dataset = results.pop('dataset', None) + self.results_cache.append(copy.deepcopy(results)) # Cache the currently loaded data + if len(self.results_cache) > self.max_cached_images: + if self.random_pop: # Except for the tiny model, self.random_pop=True + index = random.randint(0, len(self.results_cache) - 1) + else: + index = 0 + self.results_cache.pop(index) + + if len(self.results_cache) <= 4: + return results +else: + assert 'dataset' in results + # Be careful: deep copying can be very time-consuming + # if results includes dataset. + dataset = results.pop('dataset', None) +``` + +#### 1.1.2 Mosaic + +Mosaic concatenates four images into a large image, which is equivalent to increasing the batch size, as follows: + +1. Randomly resample three images from customize datasets based on the index, possibly repeated. + +```python +def get_indexes(self, dataset: Union[BaseDataset, list]) -> list: + """Call function to collect indexes. + + Args: + dataset (:obj:`Dataset` or list): The dataset or cached list. + + Returns: + list: indexes. + """ + indexes = [random.randint(0, len(dataset)) for _ in range(3)] + return indexes +``` + +2. Randomly select the midpoint of the intersection of four images. + +```python +# mosaic center x, y +center_x = int( + random.uniform(*self.center_ratio_range) * self.img_scale[1]) +center_y = int( + random.uniform(*self.center_ratio_range) * self.img_scale[0]) +center_position = (center_x, center_y) +``` + +3. Read and concatenate images based on the sampled index. Using the `keep-ratio` resize image (i.e. the maximum edge must be 640) before concatenating. + +```python +# keep_ratio resize +scale_ratio_i = min(self.img_scale[0] / h_i, + self.img_scale[1] / w_i) +img_i = mmcv.imresize( + img_i, (int(w_i * scale_ratio_i), int(h_i * scale_ratio_i))) +``` + +4. After concatenating images, the bboxes and labels are all concatenated together, and then the bboxes are cropped but not filtered (some invalid bboxes may appear). 
+ +```python +mosaic_bboxes.clip_([2 * self.img_scale[0], 2 * self.img_scale[1]]) +``` + +Please reference the Mosaic theory of [YOLOv5](./yolov5_description.md) for more details. + +#### 1.1.3 MixUp + +The MixUp implementation of RTMDet is the same as YOLOX, with the addition of cache function similar to above mentioned. + +Please reference the MixUp theory of [YOLOv5](./yolov5_description.md) for more details. + +#### 1.1.4 Strong and weak two-stage training + +Mosaic + MixUp has high distortion. Continuously using strong data augmentation isn't beneficial. YOLOX use strong and weak two-stage training mode firstly. However, the introduction of rotation and shear result in box annotation errors, which needs to introduce L1 loss to correct the performance of regression branch. + +In order to make the data augmentation method more general, RTMDet uses Mosaic + MixUp without rotation during the first 280 epochs, and increases the intensity and positive samples by mixing eight images. During the last 20 epochs, a relatively small learning rate is used to fine-tune under weak agumentation, and slowly update parameters to model by EMA, which could obtain a large improvement. + +| | RTMDet-s | RTMDet-l | +| ----------------------------- | -------- | -------- | +| LSJ + rand crop | 42.3 | 46.7 | +| Mosaic+MixUp | 41.9 | 49.8 | +| Mosaic + MixUp + 20e finetune | 43.9 | **51.3** | + +### 1.2 Model structure diff --git a/third_party/mmyolo/docs/en/recommended_topics/algorithm_descriptions/yolov5_description.md b/third_party/mmyolo/docs/en/recommended_topics/algorithm_descriptions/yolov5_description.md new file mode 100644 index 0000000000000000000000000000000000000000..4d2ed512e5022e94da9e1b87593df3536c366a24 --- /dev/null +++ b/third_party/mmyolo/docs/en/recommended_topics/algorithm_descriptions/yolov5_description.md @@ -0,0 +1,651 @@ +# Algorithm principles and implementation with YOLOv5 + +## 0 Introduction + +
+YOLOv5-P5_structure_v3.4 +Figure 1: YOLOv5-l-P5 model structure +
+ +
+YOLOv5-P6_structure_v1.1 +Figure 2: YOLOv5-l-P6 model structure +
+ +RangeKing@github provides the graph above. Thanks, RangeKing! + +YOLOv5 is an open-source object detection algorithm for real-time industrial applications which has received extensive attention. The reason for the explosion of YOLOv5 is not simply due to its excellent performance. It is more about the overall utility and robustness of its library. +In short, the main features of YOLOv5 are: + +1. **Friendly and perfect deployment supports** +2. **Fast training speed**: the training time in the case of 300 epochs is similar to most of the one-stage and two-stage algorithms under 12 epochs, such as RetinaNet, ATSS, and Faster R-CNN. +3. **Abundant optimization for corner cases**: YOLOv5 has implemented many optimizations. The functions and documentation are richer as well. + +Figures 1 and 2 show that the main differences between the P5 and P6 versions of YOLOv5 are the network structure and the image input resolution. Other differences, such as the number of anchors and loss weights, can be found in the [configuration file](https://github.com/open-mmlab/mmyolo/blob/main/configs/yolov5/yolov5_s-p6-v62_syncbn_fast_8xb16-300e_coco.py). This article will start with the principle of the YOLOv5 algorithm and then focus on analyzing the implementation in MMYOLO. The follow-up part includes the guide and speed benchmark of YOLOv5. + +```{hint} +Unless specified, the P5 model is described by default in this documentation. +``` + +We hope this article becomes your core document to start and master YOLOv5. Since YOLOv5 is still constantly updated, we will also keep updating this document. So please always catch up with the latest version. + +MMYOLO implementation configuration: https://github.com/open-mmlab/mmyolo/blob/main/configs/yolov5/ + +YOLOv5 official repository: https://github.com/ultralytics/yolov5 + +## 1 v6.1 algorithm principle and MMYOLO implementation analysis + +YOLOv5 official release: https://github.com/ultralytics/yolov5/releases/tag/v6.1 + +
+YOLOv5 accuracy +
+ +
+YOLOv5 benchmark +
+ +The performance is shown in the table above. YOLOv5 has two models with different scales. P6 is larger with a 1280x1280 input size, whereas P5 is the model used more often. This article focuses on the structure of the P5 model. + +Usually, we divide the object detection algorithm into different parts, such as data augmentation, model structure, loss calculation, etc. It is the same as YOLOv5: + +
+Strategy +
+ +Now we will briefly analyze the principle and our specific implementation in MMYOLO. + +### 1.1 Data augmentation + +Many data augmentation methods are used in YOLOv5, including: + +- **Mosaic** +- **RandomAffine** +- **MixUp** +- **Image blur and other transformations using Albu** +- **HSV color space enhancement** +- **Random horizontal flips** + +The mosaic probability is set to `1`, so it will always be triggered. MixUp is not used for the small and nano models, and the probability is `0.1` for other l/m/x series models. As small models have limited capabilities, we generally do not use strong data augmentations like MixUp. + +The following picture demonstrates the `Mosaic + RandomAffine + MixUp` process. + +
+image +
+ +#### 1.1.1 Mosaic + +
+image +
+ +Mosaic is a hybrid data augmentation method requiring four images to be stitched together, which is equivalent to increasing the training batch size. + +We can summarize the process as: + +1. Randomly generates coordinates of the intersection point of the four spliced images. +2. Randomly select the indexes of the other three images and read the corresponding annotations. +3. Resizes each image to the specified size by maintaining its aspect ratio. +4. Calculate the position of each image in the output image according to the top, bottom, left, and right rule. You also need to calculate the crop coordinates because the image may be out of bounds. +5. Uses the crop coordinates to crop the scaled image and paste it to the position calculated. The rest of the places will be pad with `114 pixels`. +6. Process the label of each image accordingly. + +Note: since four images are stitched together, the output image area will be enlarged four times (from 640x640 to 1280x1280). Therefore, to revert to 640x640, you must add a **RandomAffine** transformation. Otherwise, the image area will always be four times larger. + +#### 1.1.2 RandomAffine + +
+image +
+ +RandomAffine has two purposes: + +1. Performs a stochastic geometric affine transformation to the image. +2. Reduces the size of the image generated by Mosaic back to 640x640. + +RandomAffine includes geometric augmentations such as translation, rotation, scaling, misalignment, etc. Since Mosaic and RandomAffine are strong augmentations, they will introduce considerable noise. Therefore, the enhanced annotations need to be processed. The rules are + +1. The width and height of the enhanced gt bbox should be larger than wh_thr; +2. The ratio of the area of gt bbox after and before the enhancement should be greater than ar_thr to prevent it from changing too much. +3. The maximum aspect ratio should be smaller than area_thr to prevent it from changing too much. + +Object detection algorithms will rarely use this augmentation method as the annotation box becomes larger after the rotation, resulting in inaccuracy. + +#### 1.1.3 MixUp + +
+image +
+ +MixUp, similar to Mosaic, is also a hybrid image augmentation. It randomly selects another image and mixes the two images together. There are various ways to do this, and the typical approach is to either stitch the label together directly or mix the label using `alpha` method. +The original author's approach is straightforward: the label is directly stitched, and the images are mixed by distributional sampling. + +Note: **In YOLOv5's implementation of MixUP, the other random image must be processed by Mosaic+RandomAffine before the mixing process.** This may not be the same as implementations in other open-source libraries. + +#### 1.1.4 Image blur and other augmentations + +
+image +
+ +The rest of the augmentations are: + +- **Image blur and other transformations using Albu** +- **HSV color space enhancement** +- **Random horizontal flips** + +The Albu library has been packaged in MMDetection so users can directly use all Albu's methods through simple configurations. As a very ordinary and common processing method, HSV will not be further introduced now. + +#### 1.1.5 The implementations in MMYOLO + +While conventional single-image augmentations such as random flip are relatively easy to implement, hybrid data augmentations like Mosaic are more complicated. Therefore, in MMDetection's reimplementation of YOLOX, a dataset wrapper called `MultiImageMixDataset` was introduced. The process is as follows: + +
+image +
+ +For hybrid data augmentations such as Mosaic, you need to implement an additional `get_indexes` method to retrieve the index information of other images and then perform the enhancement. +Take the YOLOX implementation in MMDetection as an example. The configuration file is like this: + +```python +train_pipeline = [ + dict(type='Mosaic', img_scale=img_scale, pad_val=114.0), + dict( + type='RandomAffine', + scaling_ratio_range=(0.1, 2), + border=(-img_scale[0] // 2, -img_scale[1] // 2)), + dict( + type='MixUp', + img_scale=img_scale, + ratio_range=(0.8, 1.6), + pad_val=114.0), + ... +] + +train_dataset = dict( + # use MultiImageMixDataset wrapper to support mosaic and mixup + type='MultiImageMixDataset', + dataset=dict( + type='CocoDataset', + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True) + ]), + pipeline=train_pipeline) +``` + +MultiImageMixDataset passes in a data augmentation method, including Mosaic and RandomAffine. CocoDataset also adds a pipeline to load the images and the annotations. This way, it is possible to quickly achieve a hybrid data augmentation method. + +However, the above implementation has one drawback: **For users unfamiliar with MMDetection, they often forget that Mosaic must be used with MultiImageMixDataset. Otherwise, it will return an error. Plus, this approach increases the complexity and difficulty of understanding**. + +To solve this problem, we have simplified it further in MMYOLO. By making the dataset object directly accessible to the pipeline, the implementation and the use of hybrid data augmentations can be the same as random flipping. + +The configuration of YOLOX in MMYOLO is written as follows: + +```python +pre_transform = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True) +] + +train_pipeline = [ + *pre_transform, + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='mmdet.RandomAffine', + scaling_ratio_range=(0.1, 2), + border=(-img_scale[0] // 2, -img_scale[1] // 2)), + dict( + type='YOLOXMixUp', + img_scale=img_scale, + ratio_range=(0.8, 1.6), + pad_val=114.0, + pre_transform=pre_transform), + ... +] +``` + +This eliminates the need for the MultiImageMixDataset and makes it much easier to use and understand. + +Back to the YOLOv5 configuration, since the other randomly selected image in the MixUp also needs to be enhanced by Mosaic+RandomAffine before it can be used, the YOLOv5-m data enhancement configuration is as follows. + +```python +pre_transform = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True) +] + +mosaic_transform= [ + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(0.1, 1.9), # scale = 0.9 + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)) +] + +train_pipeline = [ + *pre_transform, + *mosaic_transform, + dict( + type='YOLOv5MixUp', + prob=0.1, + pre_transform=[ + *pre_transform, + *mosaic_transform + ]), + ... +] +``` + +### 1.2 Network structure + +This section was written by RangeKing@github. Thanks a lot! + +The YOLOv5 network structure is the standard `CSPDarknet` + `PAFPN` + `non-decoupled Head`. + +The size of the YOLOv5 network structure is determined by the `deepen_factor` and `widen_factor` parameters. 
`deepen_factor` controls the depth of the network structure, that is, the number of stacks of `DarknetBottleneck` modules in `CSPLayer`. `widen_factor` controls the width of the network structure, that is, the number of channels of the module output feature map. Take YOLOv5-l as an example: its `deepen_factor = widen_factor = 1.0`, and the overall structure is shown in the graph above.
+
+The upper part of the figure is an overview of the model; the lower part is the specific network structure, in which the modules are numbered in sequence so that users can map them to the configuration files of the YOLOv5 official repository. The middle part is the detailed composition of each sub-module.
+
+If you want to use **netron** to visualize the details of the network structure, open the ONNX file exported by MMDeploy in netron.
+
+```{hint}
+The shapes of the feature map in Section 1.2 are (B, C, H, W) by default.
+```
+
+#### 1.2.1 Backbone
+
+`CSPDarknet` in MMYOLO inherits from `BaseBackbone`. The overall structure is similar to `ResNet` with a total of 5 layers, including one `Stem Layer` and four `Stage Layer`:
+
+- `Stem Layer` is a `ConvModule` whose kernel size is 6x6. It is more efficient than the `Focus` module used before v6.1.
+- Except for the last `Stage Layer`, each `Stage Layer` consists of one `ConvModule` and one `CSPLayer`, as shown in the Details part in the graph above. `ConvModule` is a 3x3 `Conv2d` + `BatchNorm` + `SiLU activation function` module. `CSPLayer` is the C3 module in the official YOLOv5 repository, consisting of three `ConvModule` + n `DarknetBottleneck` with residual connections.
+- The last `Stage Layer` adds an `SPPF` module at the end. The `SPPF` module passes the input serially through multiple 5x5 `MaxPool2d` layers, which has the same effect as the `SPP` module but is faster.
+- The P5 model passes the corresponding results from the second to the fourth `Stage Layer` to the `Neck` structure and extracts three output feature maps. Take a 640x640 input image as an example. The output features are (B, 256, 80, 80), (B,512,40,40), and (B,1024,20,20). The corresponding stride is 8/16/32.
+- The P6 model passes the corresponding results from the second to the fifth `Stage Layer` to the `Neck` structure and extracts four output feature maps. Take a 1280x1280 input image as an example. The output features are (B, 256, 160, 160), (B,512,80,80), (B,768,40,40), and (B,1024,20,20). The corresponding stride is 8/16/32/64.
+
+#### 1.2.2 Neck
+
+There is no **Neck** part in the official YOLOv5. However, to make it easier for users to map YOLOv5 to other object detection networks, we split the `Head` of the official repository into `PAFPN` and `Head`.
+
+Based on the `BaseYOLONeck` structure, YOLOv5's `Neck` also follows the same build process. However, for modules that do not exist, we use `nn.Identity` instead.
+
+The feature maps output by the Neck module have the same shapes as those of the Backbone. For the P5 model they are (B,256,80,80), (B,512,40,40) and (B,1024,20,20); for the P6 model they are (B,256,160,160), (B,512,80,80), (B,768,40,40) and (B,1024,20,20).
+
+#### 1.2.3 Head
+
+The `Head` structure of YOLOv5 is the same as YOLOv3, which is a `non-decoupled Head`. The Head module includes three convolution modules that do not share weights. They are used only for input feature map transformation.
+
+The `PAFPN` outputs three feature maps of different scales, whose shapes are (B,256,80,80), (B,512,40,40), and (B,1024,20,20) accordingly.
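+
+For intuition, the sketch below shows what such a non-decoupled head looks like in PyTorch: each scale is transformed by a single convolution (1x1 here) whose output channels pack the bbox, objectness, and class predictions of every anchor; the exact channel layout is explained in the next paragraph. This is only an illustrative sketch under those assumptions, not the MMYOLO implementation, and the class and variable names are made up.
+
+```python
+import torch
+import torch.nn as nn
+
+
+class ToyYOLOv5Head(nn.Module):
+    """Illustrative non-decoupled head: one conv per scale, no shared weights."""
+
+    def __init__(self, in_channels=(256, 512, 1024), num_classes=80, num_anchors=3):
+        super().__init__()
+        # bbox (4) + objectness (1) + classes, repeated for every anchor
+        out_channels = num_anchors * (4 + 1 + num_classes)
+        self.convs = nn.ModuleList(
+            nn.Conv2d(c, out_channels, kernel_size=1) for c in in_channels)
+
+    def forward(self, feats):
+        # feats: e.g. (B,256,80,80), (B,512,40,40), (B,1024,20,20) from the PAFPN
+        return [conv(feat) for conv, feat in zip(self.convs, feats)]
+
+
+head = ToyYOLOv5Head()
+outs = head([torch.rand(1, 256, 80, 80),
+             torch.rand(1, 512, 40, 40),
+             torch.rand(1, 1024, 20, 20)])
+print([o.shape for o in outs])  # 3 x (4 + 1 + 80) = 255 channels per scale for COCO
+```
+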
+
+YOLOv5 uses a non-decoupled output; that is, the classification and bbox detection results all sit in different channels of the same convolution module. Taking the COCO dataset as an example:
+
+- When the input of the P5 model is 640x640 resolution, the output shapes of the Head module are `(B, 3x(4+1+80),80,80)`, `(B, 3x(4+1+80),40,40)` and `(B, 3x(4+1+80),20,20)`.
+
+- When the input of the P6 model is 1280x1280 resolution, the output shapes of the Head module are `(B, 3x(4+1+80),160,160)`, `(B, 3x(4+1+80),80,80)`, `(B, 3x(4+1+80),40,40)` and `(B, 3x(4+1+80),20,20)`.
+
+  `3` represents three anchors, `4` represents the bbox prediction branch, `1` represents the obj prediction branch, and `80` represents the class prediction branch of the COCO dataset.
+
+### 1.3 Positive and negative sample assignment strategy
+
+The core of the positive and negative sample assignment strategy is to determine which positions of the predicted feature map should be treated as positive or negative, and which samples should be ignored.
+
+This is one of the most significant components of the object detection algorithm because a good strategy can improve the algorithm's performance.
+
+The assignment strategy of YOLOv5 can be briefly summarized as calculating the shape-matching ratio between the anchor and the gt_bbox. In addition, cross-neighborhood grids are introduced to obtain more positive samples.
+
+It consists of the following two main steps:
+
+1. For any output layer, instead of the commonly used strategy based on Max IoU matching, YOLOv5 switched to comparing the shape-matching ratio. First, the GT Bbox and the anchor of the current layer are used to calculate the aspect ratio. If the ratio is greater than the threshold, the GT Bbox and Anchor are considered not matched. The current GT Bbox is then temporarily discarded, and the predicted position in the grid of this GT Bbox in the current layer is regarded as a negative sample.
+2. For the remaining GT Bboxes (the matched GT Bboxes), YOLOv5 calculates which grid they fall in. The rounding rule is used to find the nearest two grids, and all three grids are considered as a group responsible for predicting the GT Bbox. The number of positive samples has increased by at least three times compared to the previous YOLO series algorithms.
+
+Now we will explain each part of the assignment strategy in detail. Some descriptions and illustrations are directly or indirectly referenced from the official [repo](https://github.com/ultralytics/YOLOv5/issues/6998#44).
+
+#### 1.3.1 Anchor settings
+
+YOLOv5 is an anchor-based object detection algorithm. Similar to YOLOv3, the anchor sizes are still obtained by clustering. However, the difference compared with YOLOv3 is that instead of clustering based on IoU, YOLOv5 switched to using the aspect ratio on the width and height (shape-match based method).
+
+When training on custom data, users can use the tool in MMYOLO to analyze and get the appropriate anchor sizes of the dataset.
+ +```shell +python tools/analysis_tools/optimize_anchors.py ${CONFIG} --algorithm v5-k-means + --input-shape ${INPUT_SHAPE [WIDTH HEIGHT]} --output-dir ${OUTPUT_DIR} +``` + +Then modify the default anchor size setting in the [config file](https://github.com/open-mmlab/mmyolo/blob/main/configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py): + +```python +anchors = [[(10, 13), (16, 30), (33, 23)], [(30, 61), (62, 45), (59, 119)], + [(116, 90), (156, 198), (373, 326)]] +``` + +#### 1.3.2 Bbox encoding and decoding process + +The predicted bounding box will transform based on the pre-set anchors in anchor-based algorithms. Then, the transformation amount is predicted, known as the GT Bbox encoding process. Finally, the Pred Bbox decoding needs to be performed after the prediction to restore the bboxes to the original scale, known as the Pred Bbox decoding process. + +In YOLOv3, the bbox regression formula is: + +```{math} +b_x=\sigma(t_x)+c_x \\ +b_y=\sigma(t_y)+c_y \\ +b_w=a_w\cdot e^{t_w} \\ +b_h=a_h\cdot e^{t_h} \\ +``` + +In the above formula, + +```{math} +a_w represents the width of the anchor \\ +c_x represents the coordinate of the grid \\ +\sigma represents the Sigmoid function. +``` + +However, the regression formula in YOLOv5 is: + +```{math} +b_x=(2\cdot\sigma(t_x)-0.5)+c_x \\ +b_y=(2\cdot\sigma(t_y)-0.5)+c_y \\ +b_w=a_w\cdot(2\cdot\sigma(t_w))^2 \\ +b_h=a_h\cdot(2\cdot\sigma(t_h))^2 +``` + +Two main changes are: + +- adjusted the range of the center point coordinate from (0, 1) to (-0.5, 1.5); +- adjusted the width and height from + +```{math} +(0,+\infty) +``` + +to + +```{math} +(0,4a_{wh}) +``` + +The changes have the two benefits: + +- It will be **better to predict zero and one** with the changed center point range, which makes the bbox coordinate regression more accurate. + +
+image +
+ +- `exp(x)` in the width and height regression formula is unbounded, which may cause the **gradient out of control** and make the training stage unstable. The revised width-height regression in YOLOv5 optimizes this problem. + +
+image +
+ +#### 1.3.3 Assignment strategy + +Note: in MMYOLO, **we call anchor as prior** for both anchor-based and anchor-free networks. + +Positive sample assignment consists of the following two steps: + +(1) Scale comparison + +Compare the scale of the WH in the GT BBox and the WH in the Prior: + +```{math} +r_w = w\_{gt} / w\_{pt} \\ +r_h = h\_{gt} / h\_{pt} \\ +r_w^{max}=max(r_w, 1/r_w) \\ +r_h^{max}=max(r_h, 1/r_h) \\ +r^{max}=max(r_w^{max}, r_h^{max}) \\ +if\ \ r_{max} < prior\_match\_thr: match! +``` + +Taking the assignment process of the GT Bbox and the Prior of the P3 feature map as the example: + +
+image +
+ +The reason why Prior 1 fails to match the GT Bbox is because: + +```{math} +h\_{gt}\ /\ h\_{prior}\ =\ 4.8\ >\ prior\_match\_thr +``` + +(2) Assign corresponded positive samples to the matched GT BBox in step 1 + +We still use the example in the previous step. + +The value of (cx, cy, w, h) of the GT BBox is (26, 37, 36, 24), and the WH value of the Prior is \[(15, 5), (24, 16), (16, 24)\]. In the P3 feature map, the stride is eight. Prior 2 and prior 3 are matched. + +The detailed process can be described as: + +(2.1) Map the center point coordinates of the GT Bbox to the grid of P3. + +```{math} +GT_x^{center_grid}=26/8=3.25 \\ +GT_y^{center_grid}=37/8=4.625 +``` + +
+image +
+ +(2.2) Divide the grid where the center point of GT Bbox locates into four quadrants. **Since the center point falls in the lower left quadrant, the left and lower grids of the object will also be considered positive samples**. + +
+image +
+ +The following picture shows the distribution of positive samples when the center point falls to different positions: + +
+image +
+ +So what improvements does the Assign method bring to YOLOv5? + +- One GT Bbox can match multiple Priors. + +- When a GT Bbox matches a Prior, at most three positive samples can be assigned. + +- These strategies can **moderately alleviate the problem of unbalanced positive and negative samples, which is very common in object detection algorithms**. + +The regression method in YOLOv5 corresponds to the Assign method: + +1. Center point regression: + +
+image +
+ +2. WH regression: + +
+image +
+ +### 1.4 Loss design + +YOLOv5 contains a total of three Loss, which are: + +- Classes loss: BCE loss +- Objectness loss: BCE loss +- Location loss: CIoU loss + +These three losses are aggregated according to a certain proportion: + +```{math} +Loss=\lambda_1L_{cls}+\lambda_2L_{obj}+\lambda_3L_{loc} +``` + +The Objectness loss corresponding to the P3, P4, and P5 layers are added according to different weights. The default setting is + +```python +obj_level_weights=[4., 1., 0.4] +``` + +```{math} +L_{obj}=4.0\cdot L_{obj}^{small}+1.0\cdot L_{obj}^{medium}+0.4\cdot L_{obj}^{large} +``` + +In the reimplementation, we found a certain gap between the CIoU used in YOLOv5 and the latest official CIoU, which is reflected in the calculation of the alpha parameter. + +In the official version: + +Reference: https://github.com/Zzh-tju/CIoU/blob/master/layers/modules/multibox_loss.py#L53-L55 + +```python +alpha = (ious > 0.5).float() * v / (1 - ious + v) +``` + +In YOLOv5's version: + +```python +alpha = v / (v - ious + (1 + eps)) +``` + +This is an interesting detail, and we need to test the accuracy gap caused by different alpha calculation methods in our follow-up development. + +### 1.5 Optimization and training strategies + +YOLOv5 has very fine-grained control over the parameter groups of each optimizer, which briefly includes the following sections. + +#### 1.5.1 Optimizer grouping + +The optimization parameters are divided into three groups: Conv/Bias/BN. In the WarmUp stage, different groups use different lr and momentum update curves. +At the same time, the iter-based update strategy is adopted in the WarmUp stage, and it becomes an epoch-based update strategy in the non-WarmUp stage, which is quite tricky. + +In MMYOLO, the YOLOv5OptimizerConstructor optimizer constructor is used to implement optimizer parameter grouping. The role of an optimizer constructor is to control the initialization process of some special parameter groups finely so that it can meet the needs well. + +Different parameter groups use different scheduling curve functions through YOLOv5ParamSchedulerHook. + +#### 1.5.2 weight decay parameter auto-adaptation + +The author adopts different weight decay strategies for different batch sizes, specifically: + +1. When the training batch size does not exceed 64, weight decay remains unchanged. +2. When the training batch size exceeds 64, weight decay will be linearly scaled according to the total batch size. + +MMYOLO also implements through the YOLOv5OptimizerConstructor. + +#### 1.5.3 Gradient accumulation + +To maximize the performance under different batch sizes, the author sets the gradient accumulation function automatically when the total batch size is less than 64. + +The training process is similar to most YOLO, including the following strategies: + +1. Not using pre-trained weights. +2. There is no multi-scale training strategy, and cudnn.benchmark can be turned on to accelerate training further. +3. The EMA strategy is used to smooth the model. +4. Automatic mixed-precision training with AMP by default. + +What needs to be reminded is that the official YOLOv5 repository uses single-card v100 training for the small model with a bs is 128. However, m/l/x models are trained with different numbers of multi-cards. +This training strategy is not relatively standard, **For this reason, eight cards are used in MMYOLO, and each card sets the bs to 16. At the same time, in order to avoid performance differences, SyncBN is turned on during training**. 
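+
+The two batch-size-dependent rules above fit in a few lines. The following is a minimal, self-contained sketch of the weight decay scaling and automatic gradient accumulation described in Sections 1.5.2 and 1.5.3, assuming a nominal total batch size of 64; the function and argument names are illustrative and do not correspond to MMYOLO's actual API.
+
+```python
+def scale_optim_hyperparams(total_batch_size, base_weight_decay=0.0005,
+                            nominal_batch_size=64):
+    """Illustrative YOLOv5-style weight decay scaling and gradient accumulation."""
+    # accumulate gradients over several iterations when the total batch size
+    # is smaller than the nominal one, e.g. bs=16 -> accumulate over 4 iters
+    accumulate = max(round(nominal_batch_size / total_batch_size), 1)
+    # weight decay stays unchanged up to the nominal batch size and is scaled
+    # linearly with the effective total batch size beyond it
+    weight_decay = base_weight_decay * total_batch_size * accumulate / nominal_batch_size
+    return accumulate, weight_decay
+
+
+print(scale_optim_hyperparams(16))   # (4, 0.0005): effective batch size is still 64
+print(scale_optim_hyperparams(128))  # (1, 0.001): weight decay doubled
+```
+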
+ +### 1.6 Inference and post-processing + +The YOLOv5 post-processing is very similar to YOLOv3. In fact, all post-processing stages of the YOLO series are similar. + +#### 1.6.1 Core parameters + +1. **multi_label** + +For multi-category prediction, you need to consider whether it is a multi-label case or not. Multi-label case predicts probabilities of more than one category at one location. As YOLOv5 uses sigmoid, it is possible that one object may have two different predictions. It is good to evaluate mAP, but not good to use. +Therefore, multi_label is set to `True` during the evaluation and changed to `False` for inferencing and practical usage. + +2. **score_thr and nms_thr** + +The score_thr threshold is used for the score of each category, and the detection boxes with a score below the threshold are treated as background. nms_thr is used for nms process. During the evaluation, score_thr can be set very low, which improves the recall and the mAP. However, it is meaningless for practical usage and leads to a very slow inference performance. For this reason, different thresholds are set in the testing and inference phases. + +3. **nms_pre and max_per_img** + +nms_pre is the maximum number of frames to be preserved before NMS, which is used to prevent slowdown caused by too many input frames during the NMS process. max_per_img is the final maximum number of frames to be reserved, usually set to 300. + +Take the COCO dataset as an example. It has 80 classes, and the input size is 640x640. + +
+image +
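These core parameters map onto the `test_cfg` section of the MMYOLO config. The snippet below is an illustrative excerpt; the concrete values follow the default YOLOv5 evaluation settings at the time of writing and may differ between releases, so treat it as an example rather than a reference:

```python
model = dict(
    test_cfg=dict(
        multi_label=True,   # keep multiple labels per box for mAP evaluation
        nms_pre=30000,      # maximum number of boxes kept before NMS
        score_thr=0.001,    # very low score threshold for evaluation
        nms=dict(type='nms', iou_threshold=0.65),
        max_per_img=300))   # final maximum number of boxes per image
```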
+ +The inference and post-processing include: + +**(1) Dimensional transformation** + +YOLOv5 outputs three feature maps. Each feature map is scaled at 80x80, 40x40, and 20x20. As three anchors are at each position, the output feature map channel is 3x(5+80)=255. +YOLOv5 uses a non-decoupled Head, while most other algorithms use decoupled Head. Therefore, to unify the post-processing logic, we decouple YOLOv5's Head into the category prediction branch, the bbox prediction branch, and the obj prediction branch. + +The three scales of category prediction, bbox prediction, and obj prediction are stitched together and dimensionally transformed. For subsequent processing, the original channel dimensions are replaced at the end, and the shapes of the category prediction branch, bbox prediction branch, and obj prediction branch are (b, 3x80x80+3x40x40+3x20x20, 80)=(b,25200,80), (b,25200,4), and (b,25200,1), respectively. + +**(2) Decoding to the original graph scale** + +The classification branch and obj branch need to be computed with the sigmoid function, while the bbox prediction branch needs to be decoded and reduced to the original image in xyxy format. + +**(3) First filtering** + +Iterate through each graph in the batch, and then use score_thr to threshold filter the category prediction scores to remove the prediction results below score_thr. + +**(4) Second filtering** + +Multiply the obj prediction scores and the filtered category prediction scores, and then still use score_thr for threshold filtering. +It is also necessary to consider **multi_label and nms_pre in this process to ensure that the number of detected boxes after filtering is no more than nms_pre**. + +**(5) Rescale to original size and NMS** + +Based on the pre-processing process, restore the remaining detection frames to the original graph scale before the network output and perform NMS. The final output detection frame cannot be more than **max_per_img**. + +#### 1.6.2 batch shape strategy + +To speed up the inference process on the validation set, the authors propose the batch shape strategy, whose principle is to **ensure that the images within the same batch have the least number of pad pixels in the batch inference process and do not require all the images in the batch to have the same scale throughout the validation process**. + +It first sorts images according to their aspect ratio of the entire test or validation set, and then forms a batch of the sorted images based on the settings. +At the same time, the batch shape of the current batch is calculated to prevent too many pad pixels. We focus on padding with the original aspect ratio but not padding the image to a perfect square. 
+ +```python + image_shapes = [] + for data_info in data_list: + image_shapes.append((data_info['width'], data_info['height'])) + + image_shapes = np.array(image_shapes, dtype=np.float64) + + n = len(image_shapes) # number of images + batch_index = np.floor(np.arange(n) / self.batch_size).astype( + np.int64) # batch index + number_of_batches = batch_index[-1] + 1 # number of batches + + aspect_ratio = image_shapes[:, 1] / image_shapes[:, 0] # aspect ratio + irect = aspect_ratio.argsort() + + data_list = [data_list[i] for i in irect] + + aspect_ratio = aspect_ratio[irect] + # Set training image shapes + shapes = [[1, 1]] * number_of_batches + for i in range(number_of_batches): + aspect_ratio_index = aspect_ratio[batch_index == i] + min_index, max_index = aspect_ratio_index.min( + ), aspect_ratio_index.max() + if max_index < 1: + shapes[i] = [max_index, 1] + elif min_index > 1: + shapes[i] = [1, 1 / min_index] + + batch_shapes = np.ceil( + np.array(shapes) * self.img_size / self.size_divisor + + self.pad).astype(np.int64) * self.size_divisor + + for i, data_info in enumerate(data_list): + data_info['batch_shape'] = batch_shapes[batch_index[i]] +``` + +## 2 Sum up + +This article focuses on the principle of YOLOv5 and our implementation in MMYOLO in detail, hoping to help users understand the algorithm and the implementation process. At the same time, again, please note that since YOLOv5 itself is constantly being updated, this open-source library will also be continuously iterated. So please always check the latest version. diff --git a/third_party/mmyolo/docs/en/recommended_topics/algorithm_descriptions/yolov8_description.md b/third_party/mmyolo/docs/en/recommended_topics/algorithm_descriptions/yolov8_description.md new file mode 100644 index 0000000000000000000000000000000000000000..70f1686b4f461bc07fe101dd8e011deb220d3767 --- /dev/null +++ b/third_party/mmyolo/docs/en/recommended_topics/algorithm_descriptions/yolov8_description.md @@ -0,0 +1,241 @@ +# Algorithm principles and implementation with YOLOv8 + +## 0 Introduction + +
+YOLOv8-P5_structure +Figure 1:YOLOv8-P5 +
+ +RangeKing@github provides the graph above. Thanks, RangeKing! + +YOLOv8 is the next major update from YOLOv5, open sourced by Ultralytics on 2023.1.10, and now supports image classification, object detection and instance segmentation tasks. + +
+YOLOv8-logo +Figure 2:YOLOv8-logo +
+According to the official description, Ultralytics YOLOv8 is the latest version of the YOLO object detection and image segmentation model developed by Ultralytics. YOLOv8 is a cutting-edge, state-of-the-art (SOTA) model that builds upon the success of previous YOLO versions and introduces new features and improvements to further boost performance and flexibility. These include a new backbone network, a new anchor-free detection head, and a new loss function. YOLOv8 is also highly efficient and can be run on a variety of hardware platforms, from CPUs to GPUs. + +However, instead of naming the open source library YOLOv8, ultralytics uses the word ultralytics directly because ultralytics positions the library as an algorithmic framework rather than a specific algorithm, with a major focus on scalability. It is expected that the library can be used not only for the YOLO model family, but also for non-YOLO models and various tasks such as classification segmentation pose estimation. + +Overall, YOLOv8 is a powerful and flexible tool for object detection and image segmentation that offers the best of both worlds: **the SOTA technology and the ability to use and compare all previous YOLO versions.** + +
+YOLOv8-table +Figure 3:YOLOv8-performance +
+ +YOLOv8 official open source address: [this](https://github.com/ultralytics/ultralytics) + +MMYOLO open source address for YOLOv8: [this](https://github.com/open-mmlab/mmyolo/blob/dev/configs/yolov8/) + +The following table shows the official results of mAP, number of parameters and FLOPs tested on the COCO Val 2017 dataset. It is evident that YOLOv8 has significantly improved precision compared to YOLOv5. However, the number of parameters and FLOPs of the N/S/M models have significantly increased. Additionally, it can be observed that the inference speed of YOLOv8 is slower in comparison to most of the YOLOv5 models. + +| **model** | **YOLOv5** | **params(M)** | **FLOPs@640 (B)** | **YOLOv8** | **params(M)** | **FLOPs@640 (B)** | +| --------- | ----------- | ------------- | ----------------- | ----------- | ------------- | ----------------- | +| n | 28.0(300e) | 1.9 | 4.5 | 37.3 (500e) | 3.2 | 8.7 | +| s | 37.4 (300e) | 7.2 | 16.5 | 44.9 (500e) | 11.2 | 28.6 | +| m | 45.4 (300e) | 21.2 | 49.0 | 50.2 (500e) | 25.9 | 78.9 | +| l | 49.0 (300e) | 46.5 | 109.1 | 52.9 (500e) | 43.7 | 165.2 | +| x | 50.7 (300e) | 86.7 | 205.7 | 53.9 (500e) | 68.2 | 257.8 | + +It is worth mentioning that the recent YOLO series have shown significant performance improvements on the COCO dataset. However, their generalizability on custom datasets has not been extensively tested, which thereby will be a focus in the future development of MMYOLO. + +Before reading this article, if you are not familiar with YOLOv5, YOLOv6 and RTMDet, you can read the detailed explanation of [YOLOv5 and its implementation](https://mmyolo.readthedocs.io/en/latest/algorithm_descriptions/yolov5_description.html). + +## 1 YOLOv8 Overview + +The core features and modifications of YOLOv8 can be summarized as follows: + +1. **A new state-of-the-art (SOTA) model is proposed, featuring an object detection model for P5 640 and P6 1280 resolutions, as well as a YOLACT-based instance segmentation model. The model also includes different size options with N/S/M/L/X scales, similar to YOLOv5, to cater to various scenarios.** +2. **The backbone network and neck module are based on the YOLOv7 ELAN design concept, replacing the C3 module of YOLOv5 with the C2f module. However, there are a lot of operations such as Split and Concat in this C2f module that are not as deployment-friendly as before.** +3. **The Head module has been updated to the current mainstream decoupled structure, separating the classification and detection heads, and switching from Anchor-Based to Anchor-Free.** +4. **The loss calculation adopts the TaskAlignedAssigner in TOOD and introduces the Distribution Focal Loss to the regression loss.** +5. **In the data augmentation part, Mosaic is closed in the last 10 training epoch, which is the same as YOLOX training part.** + **As can be seen from the above summaries, YOLOv8 mainly refers to the design of recently proposed algorithms such as YOLOX, YOLOv6, YOLOv7 and PPYOLOE.** + +Next, we will introduce various improvements in the YOLOv8 model in detail by 5 parts: model structure design, loss calculation, training strategy, model inference process and data augmentation. + +## 2 Model structure design + +The Figure 1 is the model structure diagram based on the official code of YOLOv8. 
**If you like this style of model structure diagram, feel free to check out the model structure diagrams in the algorithm READMEs of MMYOLO, which currently cover YOLOv5, YOLOv6, YOLOX, RTMDet and YOLOv8.** + +Comparing the YOLOv5 and YOLOv8 YAML configuration files without considering the head module, you can see that the changes are minor. + +<div align=center >
+yaml +Figure 4:YOLOv5 and YOLOv8 YAML diff +
+ +The structure on the left is YOLOv5-s and the other side is YOLOv8-s. The specific changes in the backbone network and neck module are: + +- The kernel of the first convolutional layer has been changed from 6x6 to 3x3 +- All C3 modules are replaced by C2f, and the structure is as follows, with more skip connections and additional split operations. + +
+module +Figure 5:YOLOv5 and YOLOv8 module diff +
+ +- Removed 2 convolutional connection layers from neck module +- The block number has been changed from 3-6-9-3 to 3-6-6-3. +- **If we look at the N/S/M/L/X models, we can see that of the N/S and L/X models only changed the scaling factors, but the number of channels in the S/ML backbone network is not the same and does not follow the same scaling factor principle. The main reason for this design is that the channel settings under the same set of scaling factors are not the most optimal, and the YOLOv7 network design does not follow one set of scaling factors for all models either.** + +The most significant changes in the model lay in the head module. The head module has been changed from the original coupling structure to the decoupling one, and its style has been changed from **YOLOv5's Anchor-Based to Anchor-Free**. The structure is shown below. + +
+head +Figure 6:YOLOv8 Head +
+ +As demonstrated, the removal of the objectness branch and the retention of only the decoupled classification and regression branches stand as the major differences. Additionally, the regression branch now employs integral form representation as proposed in the Distribution Focal Loss. + +## 3 Loss calculation + +The loss calculation process consists of 2 parts: the sample assignment strategy and loss calculation. + +The majority of contemporary detectors employ dynamic sample assignment strategies, such as YOLOX's simOTA, TOOD's TaskAlignedAssigner, and RTMDet's DynamicSoftLabelAssigner. Given the superiority of dynamic assignment strategies, the YOLOv8 algorithm directly incorporates the one employed in TOOD's TaskAlignedAssigner. + +The matching strategy of TaskAlignedAssigner can be summarized as follows: positive samples are selected based on the weighted scores of classification and regression. + +```{math} +t=s^\alpha+u^\beta +``` + +`s` is the prediction score corresponding to the ground truth category, `u` is the IoU of the prediction bounding box and the gt bounding box. + +1. For each ground truth, the task-aligned assigner calculates the `alignment metric` for each anchor by taking the weighted product of two values: the predicted classification score of the corresponding class, and the Intersection over Union (IoU) between the predicted bounding box and the Ground Truth bounding box. +2. For each Ground Truth, the larger top-k samples are selected as positive based on the `alignment_metrics` values directly. + +The loss calculation consists of 2 parts: the classification and regression, without the objectness loss in the previous model. + +- The classification branch still uses BCE Loss. +- The regression branch employs both Distribution Focal Loss and CIoU Loss. + +The 3 Losses are weighted by a specific weight ratio. + +## 4 Data augmentation + +YOLOv8's data augmentation is similar to YOLOv5, whereas it stops the Mosaic augmentation in the final 10 epochs as proposed in YOLOX. The data process pipelines are illustrated in the diagram below. + +
+head +Figure 7:pipeline +
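In MMYOLO, turning Mosaic off for the last few epochs is typically done by switching to a second, weaker training pipeline via a hook. The snippet below is a simplified configuration sketch: the hook name follows MMDetection's `PipelineSwitchHook`, and `train_pipeline_stage2` is a placeholder for the non-Mosaic pipeline defined elsewhere in the config, so please check the actual YOLOv8 config files for the exact fields:

```python
max_epochs = 500
close_mosaic_epochs = 10

# train_pipeline_stage2 is the weaker pipeline (no Mosaic/MixUp) defined
# elsewhere in the config; shown here only as a placeholder name.
custom_hooks = [
    dict(
        type='mmdet.PipelineSwitchHook',
        switch_epoch=max_epochs - close_mosaic_epochs,
        switch_pipeline=train_pipeline_stage2)
]
```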
+ +The intensity of data augmentation required for different scale models varies, therefore the hyperparameters for the scaled models are adjusted depending on the situation. For larger models, techniques such as MixUp and CopyPaste are typically employed. The result of data augmentation can be seen in the example below: + +
+head +Figure 8:results +
+ +The above visualization result can be obtained by running the [browse_dataset](https://github.com/open-mmlab/mmyolo/blob/dev/tools/analysis_tools/browse_dataset.py) script. + +As the data augmentation process utilized in YOLOv8 is similar to YOLOv5, we will not delve into the specifics within this article. For a more in-depth understanding of each data transformation, we recommend reviewing the [YOLOv5 algorithm analysis document](https://mmyolo.readthedocs.io/en/latest/algorithm_descriptions/yolov5_description.html#id2) in MMYOLO. + +## 5 Training strategy + +The distinctions between the training strategy of YOLOv8 and YOLOv5 are minimal. The most notable variation is that the overall number of training epochs for YOLOv8 has been raised from 300 to 500, resulting in a significant expansion in the duration of training. As an illustration, the training strategy for YOLOv8-S can be succinctly outlined as follows: + +| config | YOLOv8-s P5 hyp | +| ---------------------- | ------------------------------- | +| optimizer | SGD | +| base learning rate | 0.01 | +| Base weight decay | 0.0005 | +| optimizer momentum | 0.937 | +| batch size | 128 | +| learning rate schedule | linear | +| training epochs | **500** | +| warmup iterations | max(1000,3 * iters_per_epochs) | +| input size | 640x640 | +| EMA decay | 0.9999 | + +## 6 Inference process + +The inference process of YOLOv8 is almost the same as YOLOv5. The only difference is that the integral representation bbox in Distribution Focal Loss needs to be decoded into a regular 4-dimensional bbox, and the subsequent calculation process is the same as YOLOv5. + +Taking COCO 80 class as an example, assuming that the input image size is 640x640, the inference process implemented in MMYOLO is shown as follows. + +
+head +Figure 9:results +
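Step (1) below relies on the integral-form (DFL) representation. As a concrete illustration of how such a prediction is turned into four distances, here is a minimal PyTorch sketch; this is not the MMYOLO implementation, and `reg_max = 16` is assumed as the typical number of bins:

```python
import torch
import torch.nn.functional as F

def dfl_decode(reg_logits: torch.Tensor, reg_max: int = 16) -> torch.Tensor:
    """Turn integral-form box predictions into (left, top, right, bottom) distances.

    reg_logits: tensor of shape (num_priors, 4 * reg_max), raw head outputs.
    Returns a tensor of shape (num_priors, 4) holding the expected distance
    (in units of the feature-map stride) for each side of the box.
    """
    num_priors = reg_logits.shape[0]
    # One discrete distribution over reg_max bins per box side.
    probs = F.softmax(reg_logits.view(num_priors, 4, reg_max), dim=-1)
    # The decoded distance is the expectation over the bin indices 0..reg_max-1.
    bins = torch.arange(reg_max, dtype=probs.dtype)
    return (probs * bins).sum(dim=-1)
```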
+The inference and post-processing process is: + +**(1) Decoding bounding box** +Integrate the probability of the distance between the center and the boundary of the box into the mathematical expectation of the distances. + +**(2) Dimensional transformation** +YOLOv8 outputs three feature maps with `80x80`, `40x40` and `20x20` scales. A total of 6 classification and regression different scales of feature map are output by the head module. +The 3 different scales of category prediction branch and bbox prediction branch are combined and dimensionally transformed. For the convenience of subsequent processing, the original channel dimensions are transposed to the end, and the category prediction branch and bbox prediction branch shapes are (b, 80x80+40x40+20x20, 80)=(b,8400,80), (b,8400,4), respectively. + +**(3) Scale Restroation** +The classification prediction branch utilizes sigmoid calculations, whereas the bbox prediction branch requires decoding to xyxy format and conversion to the original scale of the input images. + +**(4) Thresholding** +Iterate through each graph in the batch and use `score_thr` to perform thresholding. In this process, we also need to consider multi_label and nms_pre to ensure that the number of detected bboxs after filtering is no more than nms_pre. + +**(5) Reduction to the original image scale and NMS** +Reusing the parameters for preprocessing, the remaining bboxs are first resized to the original image scale and then NMS is performed. The final number of bboxes cannot be more than `max_per_img`. + +Special Note: **The Batch shape inference strategy, which is present in YOLOv5, is currently not activated in YOLOv8. By performing a quick test in MMYOLO, it can be observed that activating the Batch shape strategy can result in an approximate AP increase of around 0.1% to 0.2%.** + +## 7 Feature map visualization + +A comprehensive set of feature map visualization tools are provided in MMYOLO to help users visualize the feature maps. + +Take the YOLOv8-s model as an example. The first step is to download the official weights, and then convert them to MMYOLO by using the [yolov8_to_mmyolo](https://github.com/open-mmlab/mmyolo/blob/dev/tools/model_converters/yolov8_to_mmyolo.py) script. Note that the script must be placed under the official repository in order to run correctly. + +Assuming that you want to visualize the effect of the 3 feature maps output by backbone and the weights are named 'mmyolov8s.pth'. Run the following command: + +```bash +cd mmyolo +python demo/featmap_vis_demo.py demo/demo.jpg configs/yolov8/yolov8_s_syncbn_fast_8xb16-500e_coco.py mmyolov8s.pth --channel-reductio squeeze_mean +``` + +In particular, to ensure that the feature map and image are shown aligned, the original `test_pipeline` configuration needs to be replaced with the following: + +```Python +test_pipeline = [ + dict( + type='LoadImageFromFile', + backend_args=_base_.backend_args), + dict(type='mmdet.Resize', scale=img_scale, keep_ratio=False), # change + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +``` + +
+head +Figure 10:featmap +
+From the above figure, we can see that the different output feature maps are mainly responsible for predicting objects at different scales. +We can also visualize the 3 output feature maps of the neck layer. + +```bash +cd mmyolo +python demo/featmap_vis_demo.py demo/demo.jpg configs/yolov8/yolov8_s_syncbn_fast_8xb16-500e_coco.py mmyolov8s.pth --channel-reductio squeeze_mean --target-layers neck +``` + +
+head +Figure 11:featmap +
+ +From the above figure, we can find the features at the object are more focused. + +## Summary + +This article delves into the intricacies of the YOLOv8 algorithm, offering a comprehensive examination of its overall design, model structure, loss function, training data enhancement techniques, and inference process. To aid in comprehension, a plethora of diagrams are provided. + +In summary, YOLOv8 is a highly efficient algorithm that incorporates image classification, Anchor-Free object detection, and instance segmentation. Its detection component incorporates numerous state-of-the-art YOLO algorithms to achieve new levels of performance. + +MMYOLO open source address for YOLOV8 [this](https://github.com/open-mmlab/mmyolo/blob/dev/configs/yolov8/) + +MMYOLO Algorithm Analysis Tutorial address is [yolov5_description](https://mmyolo.readthedocs.io/en/latest/algorithm_descriptions/yolov5_description.html) diff --git a/third_party/mmyolo/docs/en/recommended_topics/application_examples/index.rst b/third_party/mmyolo/docs/en/recommended_topics/application_examples/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..03c091d19f7376b804d505ec9187cdbc5602adfc --- /dev/null +++ b/third_party/mmyolo/docs/en/recommended_topics/application_examples/index.rst @@ -0,0 +1,7 @@ +MMYOLO application examples +******************** + +.. toctree:: + :maxdepth: 1 + + ionogram_detection.md diff --git a/third_party/mmyolo/docs/en/recommended_topics/application_examples/ionogram_detection.md b/third_party/mmyolo/docs/en/recommended_topics/application_examples/ionogram_detection.md new file mode 100644 index 0000000000000000000000000000000000000000..a1bc7cc919ac6dd52e1e781f3eda2d4773eb7207 --- /dev/null +++ b/third_party/mmyolo/docs/en/recommended_topics/application_examples/ionogram_detection.md @@ -0,0 +1,307 @@ +# A benchmark for ionogram real-time object detection based on MMYOLO + +## Dataset + +Digital ionogram is the most important way to obtain real-time ionospheric information. +Ionospheric structure detection is of great research significance for accurate extraction of ionospheric key parameters. + +This study utilize 4311 ionograms with different seasons obtained by the Chinese Academy of Sciences in Hainan, Wuhan, and Huailai to establish a dataset. The six structures, including Layer E, Es-l, Es-c, F1, F2, and Spread F are manually annotated using [labelme](https://github.com/wkentaro/labelme). [Dataset Download](https://github.com/VoyagerXvoyagerx/Ionogram_detection/releases/download/Dataset/Iono4311.zip) + +
+ + +Preview of annotated images + +
+ +1. Dataset prepration + +After downloading the data, put it in the root directory of the MMYOLO repository, and use `unzip test.zip` (for Linux) to unzip it to the current folder. The structure of the unzipped folder is as follows: + +```shell +Iono4311/ +├── images +| ├── 20130401005200.png +| └── ... +└── labels + ├── 20130401005200.json + └── ... +``` + +The `images` directory contains input images,while the `labels` directory contains annotation files generated by labelme. + +2. Convert the dataset into COCO format + +Use the script `tools/dataset_converters/labelme2coco.py` to convert labelme labels to COCO labels. + +```shell +python tools/dataset_converters/labelme2coco.py --img-dir ./Iono4311/images \ + --labels-dir ./Iono4311/labels \ + --out ./Iono4311/annotations/annotations_all.json +``` + +3. Check the converted COCO labels + +To confirm that the conversion process went successfully, use the following command to display the COCO labels on the images. + +```shell +python tools/analysis_tools/browse_coco_json.py --img-dir ./Iono4311/images \ + --ann-file ./Iono4311/annotations/annotations_all.json +``` + +4. Divide dataset into training set, validation set and test set + +Set 70% of the images in the dataset as the training set, 15% as the validation set, and 15% as the test set. + +```shell +python tools/misc/coco_split.py --json ./Iono4311/annotations/annotations_all.json \ + --out-dir ./Iono4311/annotations \ + --ratios 0.7 0.15 0.15 \ + --shuffle \ + --seed 14 +``` + +The file tree after division is as follows: + +```shell +Iono4311/ +├── annotations +│ ├── annotations_all.json +│ ├── class_with_id.txt +│ ├── test.json +│ ├── train.json +│ └── val.json +├── classes_with_id.txt +├── images +├── labels +├── test_images +├── train_images +└── val_images +``` + +## Config files + +The configuration files are stored in the directory `/projects/misc/ionogram_detection/`. + +1. Dataset analysis + +To perform a dataset analysis, a sample of 200 images from the dataset can be analyzed using the `tools/analysis_tools/dataset_analysis.py` script. + +```shell +python tools/analysis_tools/dataset_analysis.py projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram.py \ + --out-dir output +``` + +Part of the output is as follows: + +```shell +The information obtained is as follows: ++------------------------------+ +| Information of dataset class | ++---------------+--------------+ +| Class name | Bbox num | ++---------------+--------------+ +| E | 98 | +| Es-l | 27 | +| Es-c | 46 | +| F1 | 100 | +| F2 | 194 | +| Spread-F | 6 | ++---------------+--------------+ +``` + +This indicates that the distribution of categories in the dataset is unbalanced. + +
+ + +Statistics of object sizes for each category + +
+ +According to the statistics, small objects are predominant in the E, Es-l, Es-c, and F1 categories, while medium-sized objects are more common in the F2 and Spread F categories. + +2. Visualization of the data processing part in the config + +Taking YOLOv5-s as an example, according to the `train_pipeline` in the config file, the data augmentation strategies used during training include: + +- Mosaic augmentation +- Random affine +- Albumentations (include various digital image processing methods) +- HSV augmentation +- Random affine + +Use the **'pipeline'** mode of the script `tools/analysis_tools/browse_dataset.py` to obtains all intermediate images in the data pipeline. + +```shell +python tools/analysis_tools/browse_dataset.py projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram.py \ + -m pipeline \ + --out-dir output +``` + +
+ + +Visualization for intermediate images in the data pipeline + +
+ +3. Optimize anchor size + +Use the script `tools/analysis_tools/optimize_anchors.py` to obtain prior anchor box sizes suitable for the dataset. + +```shell +python tools/analysis_tools/optimize_anchors.py projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram.py \ + --algorithm v5-k-means \ + --input-shape 640 640 \ + --prior-match-thr 4.0 \ + --out-dir work_dirs/dataset_analysis_5_s +``` + +4. Model complexity analysis + +With the config file, the parameters and FLOPs can be calculated by the script `tools/analysis_tools/get_flops.py`. Take yolov5-s as an example: + +```shell +python tools/analysis_tools/get_flops.py projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram.py +``` + +The following output indicates that the model has 7.947G FLOPs with the input shape (640, 640), and a total of 7.036M learnable parameters. + +```shell +============================== +Input shape: torch.Size([640, 640]) +Model Flops: 7.947G +Model Parameters: 7.036M +============================== +``` + +## Train and test + +1. Train + +**Training visualization**: By following the tutorial of [Annotation-to-deployment workflow for custom dataset](https://mmyolo.readthedocs.io/en/dev/recommended_topics/labeling_to_deployment_tutorials.html#id11), this example uses [wandb](https://wandb.ai/site) to visulize training. + +**Debug tricks**: During the process of debugging code, sometimes it is necessary to train for several epochs, such as debugging the validation process or checking whether the checkpoint saving meets expectations. For datasets inherited from `BaseDataset` (such as `YOLOv5CocoDataset` in this example), setting `indices` in the `dataset` field can specify the number of samples per epoch to reduce the iteration time. + +```python +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + _delete_=True, + type='RepeatDataset', + times=1, + dataset=dict( + type=_base_.dataset_type, + indices=200, # set indices=200,represent every epoch only iterator 200 samples + data_root=data_root, + metainfo=metainfo, + ann_file=train_ann_file, + data_prefix=dict(img=train_data_prefix), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=_base_.train_pipeline))) +``` + +**Start training**: + +```shell +python tools/train.py projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram.py +``` + +2. Test + +Specify the path of the config file and the model to start the test: + +```shell +python tools/test.py projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram.py \ + work_dirs/yolov5_s-v61_fast_1xb96-100e_ionogram/xxx +``` + +## Experiments and results + +### Choose a suitable batch size + +- Often, the batch size governs the training speed, and the ideal batch size will be the largest batch size supported by the available hardware. +- If the video memory is not yet fully utilized, doubling the batch size should result in a corresponding doubling (or close to doubling) of the training throughput. This is equivalent to maintaining a constant (or nearly constant) time per step as the batch size increases. +- Automatic Mixed Precision (AMP) is a technique to accelerate the training with minimal loss in accuracy. To enable AMP training, add `--amp` to the end of the training command. 
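For example, with the YOLOv5-s config used throughout this study, an AMP training run can be launched as follows (same entry script as in the training section above):

```shell
python tools/train.py projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram.py --amp
```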
+ +Hardware information: + +- GPU:V100 with 32GB memory +- CPU:10-core CPU with 40GB memory + +Results: + +| Model | Epoch(best) | AMP | Batchsize | Num workers | Memory Allocated | Training Time | Val mAP | +| -------- | ----------- | ----- | --------- | ----------- | ---------------- | ------------- | ------- | +| YOLOv5-s | 100(82) | False | 32 | 6 | 35.07% | 54 min | 0.575 | +| YOLOv5-s | 100(96) | True | 32 | 6 | 24.93% | 49 min | 0.578 | +| YOLOv5-s | 100(100) | False | 96 | 6 | 96.64% | 48 min | 0.571 | +| YOLOv5-s | 100(100) | True | 96 | 6 | 54.66% | **37** min | 0.575 | +| YOLOv5-s | 100(90) | True | 144 | 6 | 77.06% | 39 min | 0.573 | +| YOLOv5-s | 200(148) | True | 96 | 6 | 54.66% | 72 min | 0.575 | +| YOLOv5-s | 200(188) | True | 96 | **8** | 54.66% | 67 min | 0.576 | + +
+ + +The proportion of data loading time to the total time of each step. + +
+ +Based on the results above, we can conclude that + +- AMP has little impact on the accuracy of the model, but can significantly reduce memory usage while training. +- Increasing batch size by three times does not reduce the training time by a corresponding factor of three. According to the `data_time` recorded during training, the larger the batch size, the larger the `data_time`, indicating that data loading has become the bottleneck limiting the training speed. Increasing `num_workers`, the number of processes used to load data, can accelerate the training speed. + +### Ablation studies + +In order to obtain a training pipeline applicable to the dataset, the following ablation studies with the YOLOv5-s model as an example are performed. + +#### Data augmentation + +| Aug Method | [config](/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram_aug0.py) | [config](/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb32-100e_ionogram_mosaic.py) | [config](/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram_mosaic_affine.py) | [config](/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram_mosaic_affine_albu_hsv.py) | [config](/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram.py) | +| ---------- | ------------------------------------------------------------------------------------------------ | -------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------- | +| Mosaic | | √ | √ | √ | √ | +| Affine | | | √ | √ | √ | +| Albu | | | | √ | √ | +| HSV | | | | √ | √ | +| Flip | | | | | √ | +| Val mAP | 0.507 | 0.550 | 0.572 | 0.567 | 0.575 | + +The results indicate that mosaic augmentation and random affine transformation can significantly improve the performance on the validation set. + +#### Using pre-trained models + +If you prefer not to use pre-trained weights, you can simply set `load_from = None` in the config file. For experiments that do not use pre-trained weights, it is recommended to increase the base learning rate by a factor of four and extend the number of training epochs to 200 to ensure adequate model training. + +| Model | Epoch(best) | FLOPs(G) | Params(M) | Pretrain | Val mAP | Config | +| -------- | ----------- | -------- | --------- | -------- | ------- | ------------------------------------------------------------------------------------------------ | +| YOLOv5-s | 100(82) | 7.95 | 7.04 | Coco | 0.575 | [config](/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram.py) | +| YOLOv5-s | 200(145) | 7.95 | 7.04 | None | 0.565 | [config](/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-200e_ionogram_pre0.py) | +| YOLOv6-s | 100(54) | 24.2 | 18.84 | Coco | 0.584 | [config](/projects/misc/ionogram_detection/yolov6/yolov6_s_fast_1xb32-100e_ionogram.py) | +| YOLOv6-s | 200(188) | 24.2 | 18.84 | None | 0.557 | [config](/projects/misc/ionogram_detection/yolov6/yolov6_s_fast_1xb32-200e_ionogram_pre0.py) | + +
+ + +Comparison of loss reduction during training + +
+ +The loss reduction curve shows that when using pre-trained weights, the loss decreases faster. It can be seen that even using models pre-trained on natural image datasets can accelerate model convergence when fine-tuned on radar image datasets. + +### Benchmark for ionogram object detection + +| Model | epoch(best) | FLOPs(G) | Params(M) | pretrain | val mAP | test mAP | Config | Log | +| ----------- | ----------- | -------- | --------- | -------- | ------- | -------- | ------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------- | +| YOLOv5-s | 100(82) | 7.95 | 7.04 | Coco | 0.575 | 0.584 | [config](/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram.py) | [log](https://github.com/VoyagerXvoyagerx/Ionogram_detection/blob/main/logs/yolov5_s_20230105_213510.json) | +| YOLOv5-m | 100(70) | 24.05 | 20.89 | Coco | 0.587 | 0.586 | [config](/projects/misc/ionogram_detection/yolov5/yolov5_m-v61_fast_1xb32-100e_ionogram.py) | [log](https://github.com/VoyagerXvoyagerx/Ionogram_detection/blob/main/logs/yolov5_m_20230106_004642.json) | +| YOLOv6-s | 100(54) | 24.2 | 18.84 | Coco | 0.584 | 0.594 | [config](/projects/misc/ionogram_detection/yolov6/yolov6_s_fast_1xb32-100e_ionogram.py) | [log](https://github.com/VoyagerXvoyagerx/Ionogram_detection/blob/main/logs/yolov6_s_20230107_003207.json) | +| YOLOv6-m | 100(76) | 37.08 | 44.42 | Coco | 0.590 | 0.590 | [config](/projects/misc/ionogram_detection/yolov6/yolov6_m_fast_1xb32-100e_ionogram.py) | [log](https://github.com/VoyagerXvoyagerx/Ionogram_detection/blob/main/logs/yolov6_m_20230107_201029.json) | +| YOLOv6-l | 100(76) | 71.33 | 58.47 | Coco | 0.605 | 0.597 | [config](/projects/misc/ionogram_detection/yolov6/yolov6_l_fast_1xb32-100e_ionogram.py) | [log](https://github.com/VoyagerXvoyagerx/Ionogram_detection/blob/main/logs/yolov6_l_20230108_005634.json) | +| YOLOv7-tiny | 100(78) | 6.57 | 6.02 | Coco | 0.549 | 0.568 | [config](/projects/misc/ionogram_detection/yolov7/yolov7_tiny_fast_1xb16-100e_ionogram.py) | [log](https://github.com/VoyagerXvoyagerx/Ionogram_detection/blob/main/logs/yolov7_tiny_20230215_202837.json) | +| YOLOv7-x | 100(58) | 94.27 | 70.85 | Coco | 0.602 | 0.595 | [config](/projects/misc/ionogram_detection/yolov7/yolov7_x_fast_1xb16-100e_ionogram.py) | [log](https://github.com/VoyagerXvoyagerx/Ionogram_detection/blob/main/logs/yolov7_x_20230110_165832.json) | +| rtmdet-tiny | 100(100) | 8.03 | 4.88 | Coco | 0.582 | 0.589 | [config](/projects/misc/ionogram_detection/rtmdet/rtmdet_tiny_fast_1xb32-100e_ionogram.py) | [log](https://github.com/VoyagerXvoyagerx/Ionogram_detection/blob/main/logs/rtmdet_tiny_20230310_125440.json) | +| rtmdet-s | 100(92) | 14.76 | 8.86 | Coco | 0.588 | 0.585 | [config](/projects/misc/ionogram_detection/rtmdet/rtmdet_s_fast_1xb32-100e_ionogram.py) | [log](https://github.com/VoyagerXvoyagerx/Ionogram_detection/blob/main/logs/rtmdet_s_20230310_163853.json) | diff --git a/third_party/mmyolo/docs/en/recommended_topics/complexity_analysis.md b/third_party/mmyolo/docs/en/recommended_topics/complexity_analysis.md new file mode 100644 index 0000000000000000000000000000000000000000..ae7989df280f54c74a4dc355305b1407be14965f --- /dev/null +++ b/third_party/mmyolo/docs/en/recommended_topics/complexity_analysis.md @@ -0,0 +1,120 @@ +# Model Complexity Analysis + +We provide a `tools/analysis_tools/get_flops.py` script to help with the complexity 
analysis for models of MMYOLO. +Currently, it provides the interfaces to compute parameter, activation and flops of the given model, +and supports printing the related information layer-by-layer in terms of network structure or table. + +The commands as follows: + +```shell +python tools/analysis_tools/get_flops.py + ${CONFIG_FILE} \ # config file path + [--shape ${IMAGE_SIZE}] \ # input image size (int), default 640*640 + [--show-arch ${ARCH_DISPLAY}] \ # print related information by network layers + [--not-show-table ${TABLE_DISPLAY}] \ # print related information by table + [--cfg-options ${CFG_OPTIONS}] # config file option +# [] stands for optional parameter, do not type [] when actually entering the command line +``` + +Let's take the `rtmdet_s_syncbn_fast_8xb32-300e_coco.py` config file in RTMDet as an example to show how this script can be used: + +## Usage Example 1: Print Flops, Parameters and related information by table + +```shell +python tools/analysis_tools/get_flops.py configs/rtmdet/rtmdet_s_syncbn_fast_8xb32-300e_coco.py +``` + +Output: + +```python +============================== +Input shape: torch.Size([640, 640]) +Model Flops: 14.835G +Model Parameters: 8.887M +============================== +``` + +| module | #parameters or shape | #flops | #activations | +| :-------------------------------- | :------------------- | :------ | :----------: | +| model | 8.887M | 14.835G | 35.676M | +| backbone | 4.378M | 5.416G | 22.529M | +| backbone.stem | 7.472K | 0.765G | 6.554M | +| backbone.stem.0 | 0.464K | 47.514M | 1.638M | +| backbone.stem.1 | 2.336K | 0.239G | 1.638M | +| backbone.stem.2 | 4.672K | 0.478G | 3.277M | +| backbone.stage1 | 42.4K | 0.981G | 7.373M | +| backbone.stage1.0 | 18.56K | 0.475G | 1.638M | +| backbone.stage1.1 | 23.84K | 0.505G | 5.734M | +| backbone.stage2 | 0.21M | 1.237G | 4.915M | +| backbone.stage2.0 | 73.984K | 0.473G | 0.819M | +| backbone.stage2.1 | 0.136M | 0.764G | 4.096M | +| backbone.stage3 | 0.829M | 1.221G | 2.458M | +| backbone.stage3.0 | 0.295M | 0.473G | 0.41M | +| backbone.stage3.1 | 0.534M | 0.749G | 2.048M | +| backbone.stage4 | 3.29M | 1.211G | 1.229M | +| backbone.stage4.0 | 1.181M | 0.472G | 0.205M | +| backbone.stage4.1 | 0.657M | 0.263G | 0.307M | +| backbone.stage4.2 | 1.452M | 0.476G | 0.717M | +| neck | 3.883M | 4.366G | 8.141M | +| neck.reduce_layers.2 | 0.132M | 52.634M | 0.102M | +| neck.reduce_layers.2.conv | 0.131M | 52.429M | 0.102M | +| neck.reduce_layers.2.bn | 0.512K | 0.205M | 0 | +| neck.top_down_layers | 0.491M | 1.23G | 4.506M | +| neck.top_down_layers.0 | 0.398M | 0.638G | 1.638M | +| neck.top_down_layers.1 | 92.608K | 0.593G | 2.867M | +| neck.downsample_layers | 0.738M | 0.472G | 0.307M | +| neck.downsample_layers.0 | 0.148M | 0.236G | 0.205M | +| neck.downsample_layers.1 | 0.59M | 0.236G | 0.102M | +| neck.bottom_up_layers | 1.49M | 0.956G | 2.15M | +| neck.bottom_up_layers.0 | 0.3M | 0.48G | 1.434M | +| neck.bottom_up_layers.1 | 1.19M | 0.476G | 0.717M | +| neck.out_layers | 1.033M | 1.654G | 1.075M | +| neck.out_layers.0 | 0.148M | 0.945G | 0.819M | +| neck.out_layers.1 | 0.295M | 0.472G | 0.205M | +| neck.out_layers.2 | 0.59M | 0.236G | 51.2K | +| neck.upsample_layers | | 1.229M | 0 | +| neck.upsample_layers.0 | | 0.41M | 0 | +| neck.upsample_layers.1 | | 0.819M | 0 | +| bbox_head.head_module | 0.625M | 5.053G | 5.006M | +| bbox_head.head_module.cls_convs | 0.296M | 2.482G | 2.15M | +| bbox_head.head_module.cls_convs.0 | 0.295M | 2.481G | 2.15M | +| bbox_head.head_module.cls_convs.1 | 0.512K | 0.819M | 0 | 
+| bbox_head.head_module.cls_convs.2 | 0.512K | 0.205M | 0 | +| bbox_head.head_module.reg_convs | 0.296M | 2.482G | 2.15M | +| bbox_head.head_module.reg_convs.0 | 0.295M | 2.481G | 2.15M | +| bbox_head.head_module.reg_convs.1 | 0.512K | 0.819M | 0 | +| bbox_head.head_module.reg_convs.2 | 0.512K | 0.205M | 0 | +| bbox_head.head_module.rtm_cls | 30.96K | 86.016M | 0.672M | +| bbox_head.head_module.rtm_cls.0 | 10.32K | 65.536M | 0.512M | +| bbox_head.head_module.rtm_cls.1 | 10.32K | 16.384M | 0.128M | +| bbox_head.head_module.rtm_cls.2 | 10.32K | 4.096M | 32K | +| bbox_head.head_module.rtm_reg | 1.548K | 4.301M | 33.6K | +| bbox_head.head_module.rtm_reg.0 | 0.516K | 3.277M | 25.6K | +| bbox_head.head_module.rtm_reg.1 | 0.516K | 0.819M | 6.4K | +| bbox_head.head_module.rtm_reg.2 | 0.516K | 0.205M | 1.6K | + +## Usage Example 2: Print related information by network layers + +```shell +python tools/analysis_tools/get_flops.py configs/rtmdet/rtmdet_s_syncbn_fast_8xb32-300e_coco.py --show-arch +``` + +Due to the complex structure of RTMDet, the output is long. +The following shows only the output from bbox_head.head_module.rtm_reg section: + +```python +(rtm_reg): ModuleList( + #params: 1.55K, #flops: 4.3M, #acts: 33.6K + (0): Conv2d( + 128, 4, kernel_size=(1, 1), stride=(1, 1) + #params: 0.52K, #flops: 3.28M, #acts: 25.6K + ) + (1): Conv2d( + 128, 4, kernel_size=(1, 1), stride=(1, 1) + #params: 0.52K, #flops: 0.82M, #acts: 6.4K + ) + (2): Conv2d( + 128, 4, kernel_size=(1, 1), stride=(1, 1) + #params: 0.52K, #flops: 0.2M, #acts: 1.6K + ) +``` diff --git a/third_party/mmyolo/docs/en/recommended_topics/contributing.md b/third_party/mmyolo/docs/en/recommended_topics/contributing.md new file mode 100644 index 0000000000000000000000000000000000000000..9efb8871b2ca2dbd637867aa24979380662be07d --- /dev/null +++ b/third_party/mmyolo/docs/en/recommended_topics/contributing.md @@ -0,0 +1,314 @@ +# Contributing to OpenMMLab + +Welcome to the MMYOLO community, we are committed to building a cutting-edge computer vision foundational library, and all kinds of contributions are welcomed, including but not limited to + +**Fix bug** + +You can directly post a Pull Request to fix typos in code or documents + +The steps to fix the bug of code implementation are as follows. + +1. If the modification involves significant changes, you should create an issue first and describe the error information and how to trigger the bug. Other developers will discuss it with you and propose a proper solution. + +2. Posting a pull request after fixing the bug and adding the corresponding unit test. + +**New Feature or Enhancement** + +1. If the modification involves significant changes, you should create an issue to discuss with our developers to propose a proper design. +2. Post a Pull Request after implementing the new feature or enhancement and add the corresponding unit test. + +**Document** + +You can directly post a pull request to fix documents. If you want to add a document, you should first create an issue to check if it is reasonable. + +## Preparation + +The commands for processing pull requests are implemented using Git, and this chapter details `Git Configuration` and `associated GitHub`. + +### 1. Git Configuration + +First, make sure you have Git installed on your computer. For Linux systems and macOS systems, Git is generally installed by default. If it is not installed, it can be downloaded at [Git-Downloads](https://git-scm.com/downloads). 
+ +```shell +# view the Git version +git --version +``` + +Second, check your `Git Config` + +```shell +# view the Git config +git config --global --list +``` + +If `user.name` and `user.email` are empty, run the command. + +```shell +git config --global user.name "Change your username here" +git config --global user.email "Change your useremail here" +``` + +Finally, run the command in `git bash` or `terminal` to generate the key file. After the generation is successful, a `.ssh` file will appear in the user directory, and `id_rsa.pub` is the public key file. + +```shell +# useremail is GitHub's email address +ssh-keygen -t rsa -C "useremail" +``` + +### 2. Associated GitHub + +First, open `id_rsa.pub` and copy the entire contents. + +Second, log in to your GitHub account to set it up. + + + +Click `New SSH key` to add a new SSH keys, and paste the copied content into Key. + + + +Finally, verify that SSH matches the GitHub account by running the command in `git bash` or `terminal`. If it matches, enter `yes` to succeed. + +```shell +ssh -T git@github.com +``` + + + +## Pull Request Workflow + +If you're not familiar with Pull Request, don't worry! The following guidance will tell you how to create a Pull Request step by step. If you want to dive into the development mode of Pull Request, you can refer to the [official documents](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/about-pull-requests) + +### 1. Fork and clone + +If you are posting a pull request for the first time, you should fork the OpenMMLab repositories by clicking the **Fork** button in the top right corner of the GitHub page, and the forked repositories will appear under your GitHub profile. + + + +Then, you can clone the repositories to local: + +```shell +git clone git@github.com:{username}/mmyolo.git +``` + +After that, you should get into the project folder and add official repository as the upstream repository. + +```bash +cd mmyolo +git remote add upstream git@github.com:open-mmlab/mmyolo +``` + +Check whether the remote repository has been added successfully by `git remote -v` + +```bash +origin git@github.com:{username}/mmyolo.git (fetch) +origin git@github.com:{username}/mmyolo.git (push) +upstream git@github.com:open-mmlab/mmyolo (fetch) +upstream git@github.com:open-mmlab/mmyolo (push) +``` + +```{note} +Here's a brief introduction to the origin and upstream. When we use "git clone", we create an "origin" remote by default, which points to the repository cloned from. As for "upstream", we add it ourselves to point to the target repository. Of course, if you don't like the name "upstream", you could name it as you wish. Usually, we'll push the code to "origin". If the pushed code conflicts with the latest code in official("upstream"), we should pull the latest code from upstream to resolve the conflicts, and then push to "origin" again. The posted Pull Request will be updated automatically. +``` + +### 2. Configure pre-commit + +You should configure [pre-commit](https://pre-commit.com/#intro) in the local development environment to make sure the code style matches that of OpenMMLab. **Note**: The following code should be executed under the MMYOLO directory. + +```shell +pip install -U pre-commit +pre-commit install +``` + +Check that pre-commit is configured successfully, and install the hooks defined in `.pre-commit-config.yaml`. + +```shell +pre-commit run --all-files +``` + + + + + +```{note} +Chinese users may fail to download the pre-commit hooks due to the network issue. 
In this case, you could download these hooks from gitee by setting the .pre-commit-config-zh-cn.yaml + +pre-commit install -c .pre-commit-config-zh-cn.yaml +pre-commit run --all-files -c .pre-commit-config-zh-cn.yaml +``` + +If the installation process is interrupted, you can repeatedly run `pre-commit run ... ` to continue the installation. + +If the code does not conform to the code style specification, pre-commit will raise a warning and fixes some of the errors automatically. + + + +If we want to commit our code bypassing the pre-commit hook, we can use the `--no-verify` option(**only for temporarily commit**). + +```shell +git commit -m "xxx" --no-verify +``` + +### 3. Create a development branch + +After configuring the pre-commit, we should create a branch based on the dev branch to develop the new feature or fix the bug. The proposed branch name is `username/pr_name` + +```shell +git checkout -b yhc/refactor_contributing_doc +``` + +In subsequent development, if the dev branch of the local repository is behind the dev branch of "upstream", we need to pull the upstream for synchronization, and then execute the above command: + +```shell +git pull upstream dev +``` + +### 4. Commit the code and pass the unit test + +- MMYOLO introduces mypy to do static type checking to increase the robustness of the code. Therefore, we need to add Type Hints to our code and pass the mypy check. If you are not familiar with Type Hints, you can refer to [this tutorial](https://docs.python.org/3/library/typing.html). + +- The committed code should pass through the unit test + + ```shell + # Pass all unit tests + pytest tests + + # Pass the unit test of yolov5_coco dataset + pytest tests/test_datasets/test_yolov5_coco.py + ``` + + If the unit test fails for lack of dependencies, you can install the dependencies referring to the [guidance](#unit-test) + +- If the documents are modified/added, we should check the rendering result referring to [guidance](#document-rendering) + +### 5. Push the code to remote + +We could push the local commits to remote after passing through the check of unit test and pre-commit. You can associate the local branch with remote branch by adding `-u` option. + +```shell +git push -u origin {branch_name} +``` + +This will allow you to use the `git push` command to push code directly next time, without having to specify a branch or the remote repository. + +### 6. Create a Pull Request + +(1) Create a pull request in GitHub's Pull request interface + + + +(2) Modify the PR description according to the guidelines so that other developers can better understand your changes. + +```{note} +The *base* branch should be modified to *dev* branch. +``` + + + +Find more details about Pull Request description in [pull request guidelines](#pr-specs). + +**note** + +(a) The Pull Request description should contain the reason for the change, the content of the change, and the impact of the change, and be associated with the relevant Issue (see [documentation](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue)) + +(b) If it is your first contribution, please sign the CLA + + + +(c) Check whether the Pull Request pass through the CI + + + +MMYOLO will run unit test for the posted Pull Request on Linux, based on different versions of Python, and PyTorch to make sure the code is correct. We can see the specific test information by clicking `Details` in the above image so that we can modify the code. 
+ +(3) If the Pull Request passes the CI, then you can wait for the review from other developers. You'll modify the code based on the reviewer's comments, and repeat the steps [4](#4-commit-the-code-and-pass-the-unit-test)-[5](#5-push-the-code-to-remote) until all reviewers approve it. Then, we will merge it ASAP. + + + +### 7. Resolve conflicts + +If your local branch conflicts with the latest dev branch of "upstream", you'll need to resolove them. There are two ways to do this: + +```shell +git fetch --all --prune +git rebase upstream/dev +``` + +or + +```shell +git fetch --all --prune +git merge upstream/dev +``` + +If you are very good at handling conflicts, then you can use rebase to resolve conflicts, as this will keep your commit logs tidy. If you are unfamiliar with `rebase`, you can use `merge` to resolve conflicts. + +## Guidance + +### Unit test + +We should also make sure the committed code will not decrease the coverage of unit test, we could run the following command to check the coverage of unit test: + +```shell +python -m coverage run -m pytest /path/to/test_file +python -m coverage html +# check file in htmlcov/index.html +``` + +### Document rendering + +If the documents are modified/added, we should check the rendering result. We could install the dependencies and run the following command to render the documents and check the results: + +```shell +pip install -r requirements/docs.txt +cd docs/zh_cn/ +# or docs/en +make html +# check file in ./docs/zh_cn/_build/html/index.html +``` + +## Code style + +### Python + +We adopt [PEP8](https://www.python.org/dev/peps/pep-0008/) as the preferred code style. + +We use the following tools for linting and formatting: + +- [flake8](https://github.com/PyCQA/flake8): A wrapper around some linter tools. +- [isort](https://github.com/timothycrosley/isort): A Python utility to sort imports. +- [yapf](https://github.com/google/yapf): A formatter for Python files. +- [codespell](https://github.com/codespell-project/codespell): A Python utility to fix common misspellings in text files. +- [mdformat](https://github.com/executablebooks/mdformat): Mdformat is an opinionated Markdown formatter that can be used to enforce a consistent style in Markdown files. +- [docformatter](https://github.com/myint/docformatter): A formatter to format docstring. + +Style configurations of yapf and isort can be found in [setup.cfg](../../../setup.cfg). + +We use [pre-commit hook](https://pre-commit.com/) that checks and formats for `flake8`, `yapf`, `isort`, `trailing whitespaces`, `markdown files`, +fixes `end-of-files`, `double-quoted-strings`, `python-encoding-pragma`, `mixed-line-ending`, sorts `requirments.txt` automatically on every commit. +The config for a pre-commit hook is stored in [.pre-commit-config](../../../.pre-commit-config.yaml). + +### C++ and CUDA + +We follow the [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html). + +## PR Specs + +1. Use [pre-commit](https://pre-commit.com) hook to avoid issues of code style + +2. One short-time branch should be matched with only one PR + +3. Accomplish a detailed change in one PR. Avoid large PR + + - Bad: Support Faster R-CNN + - Acceptable: Add a box head to Faster R-CNN + - Good: Add a parameter to box head to support custom conv-layer number + +4. Provide clear and significant commit message + +5. Provide clear and meaningful PR description + + - Task name should be clarified in title. 
The general format is: \[Prefix\] Short description of the PR (Suffix) + - Prefix: add new feature \[Feature\], fix bug \[Fix\], related to documents \[Docs\], in developing \[WIP\] (which will not be reviewed temporarily) + - Introduce main changes, results and influences on other modules in short description + - Associate related issues and pull requests with a milestone diff --git a/third_party/mmyolo/docs/en/recommended_topics/dataset_preparation.md b/third_party/mmyolo/docs/en/recommended_topics/dataset_preparation.md new file mode 100644 index 0000000000000000000000000000000000000000..af670d89a4214bd440bd39b4faf9fc37fe7d5286 --- /dev/null +++ b/third_party/mmyolo/docs/en/recommended_topics/dataset_preparation.md @@ -0,0 +1,145 @@ +# Dataset preparation and description + +## DOTA Dataset + +### Download dataset + +The DOTA dataset can be downloaded from [DOTA](https://captain-whu.github.io/DOTA/dataset.html) +or [OpenDataLab](https://opendatalab.org.cn/DOTA_V1.0). + +We recommend using [OpenDataLab](https://opendatalab.org.cn/DOTA_V1.0) to download the dataset, as the folder structure has already been arranged as needed and can be directly extracted without the need to adjust the folder structure. + +Please unzip the file and place it in the following structure. + +```none +${DATA_ROOT} +├── train +│ ├── images +│ │ ├── P0000.png +│ │ ├── ... +│ ├── labelTxt-v1.0 +│ │ ├── labelTxt +│ │ │ ├── P0000.txt +│ │ │ ├── ... +│ │ ├── trainset_reclabelTxt +│ │ │ ├── P0000.txt +│ │ │ ├── ... +├── val +│ ├── images +│ │ ├── P0003.png +│ │ ├── ... +│ ├── labelTxt-v1.0 +│ │ ├── labelTxt +│ │ │ ├── P0003.txt +│ │ │ ├── ... +│ │ ├── valset_reclabelTxt +│ │ │ ├── P0003.txt +│ │ │ ├── ... +├── test +│ ├── images +│ │ ├── P0006.png +│ │ ├── ... + +``` + +The folder ending with reclabelTxt stores the labels for the horizontal boxes and is not used when slicing. + +### Split DOTA dataset + +Script `tools/dataset_converters/dota/dota_split.py` can split and prepare DOTA dataset. + +```shell +python tools/dataset_converters/dota/dota_split.py \ + [--splt-config ${SPLIT_CONFIG}] \ + [--data-root ${DATA_ROOT}] \ + [--out-dir ${OUT_DIR}] \ + [--ann-subdir ${ANN_SUBDIR}] \ + [--phase ${DATASET_PHASE}] \ + [--nproc ${NPROC}] \ + [--save-ext ${SAVE_EXT}] \ + [--overwrite] +``` + +shapely is required, please install shapely first by `pip install shapely`. + +**Description of all parameters**: + +- `--split-config` : The split config for image slicing. +- `--data-root`: Root dir of DOTA dataset. +- `--out-dir`: Output dir for split result. +- `--ann-subdir`: The subdir name for annotation. Defaults to `labelTxt-v1.0`. +- `--phase`: Phase of the data set to be prepared. Defaults to `trainval test` +- `--nproc`: Number of processes. Defaults to 8. +- `--save-ext`: Extension of the saved image. Defaults to `png` +- `--overwrite`: Whether to allow overwrite if annotation folder exist. + +Based on the configuration in the DOTA paper, we provide two commonly used split config. + +- `./split_config/single_scale.json` means single-scale split. +- `./split_config/multi_scale.json` means multi-scale split. + +DOTA dataset usually uses the trainval set for training and the test set for online evaluation, since most papers +provide the results of online evaluation. If you want to evaluate the model performance locally firstly, please split +the train set and val set. + +Examples: + +Split DOTA trainval set and test set with single scale. 
+ +```shell +python tools/dataset_converters/dota/dota_split.py + --split-config 'tools/dataset_converters/dota/split_config/single_scale.json' + --data-root ${DATA_ROOT} \ + --out-dir ${OUT_DIR} +``` + +If you want to split DOTA-v1.5 dataset, which have different annotation dir 'labelTxt-v1.5'. + +```shell +python tools/dataset_converters/dota/dota_split.py + --split-config 'tools/dataset_converters/dota/split_config/single_scale.json' + --data-root ${DATA_ROOT} \ + --out-dir ${OUT_DIR} \ + --ann-subdir 'labelTxt-v1.5' +``` + +If you want to split DOTA train and val set with single scale. + +```shell +python tools/dataset_converters/dota/dota_split.py + --split-config 'tools/dataset_converters/dota/split_config/single_scale.json' + --data-root ${DATA_ROOT} \ + --phase train val \ + --out-dir ${OUT_DIR} +``` + +For multi scale split: + +```shell +python tools/dataset_converters/dota/dota_split.py + --split-config 'tools/dataset_converters/dota/split_config/multi_scale.json' + --data-root ${DATA_ROOT} \ + --out-dir ${OUT_DIR} +``` + +The new data structure is as follows: + +```none +${OUT_DIR} +├── trainval +│ ├── images +│ │ ├── P0000__1024__0___0.png +│ │ ├── ... +│ ├── annfiles +│ │ ├── P0000__1024__0___0.txt +│ │ ├── ... +├── test +│ ├── images +│ │ ├── P0006__1024__0___0.png +│ │ ├── ... +│ ├── annfiles +│ │ ├── P0006__1024__0___0.txt +│ │ ├── ... +``` + +Then change `data_root` to ${OUT_DIR}. diff --git a/third_party/mmyolo/docs/en/recommended_topics/deploy/easydeploy_guide.md b/third_party/mmyolo/docs/en/recommended_topics/deploy/easydeploy_guide.md new file mode 100644 index 0000000000000000000000000000000000000000..46fab865340f6db1eb52146de9459078b68f1319 --- /dev/null +++ b/third_party/mmyolo/docs/en/recommended_topics/deploy/easydeploy_guide.md @@ -0,0 +1 @@ +# EasyDeploy Deployment diff --git a/third_party/mmyolo/docs/en/recommended_topics/deploy/index.rst b/third_party/mmyolo/docs/en/recommended_topics/deploy/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..f21f353c8c64d457e21c79b4af1c75eae9715174 --- /dev/null +++ b/third_party/mmyolo/docs/en/recommended_topics/deploy/index.rst @@ -0,0 +1,16 @@ +MMDeploy deployment tutorial +******************************** + +.. toctree:: + :maxdepth: 1 + + mmdeploy_guide.md + mmdeploy_yolov5.md + +EasyDeploy deployment tutorial +************************************ + +.. toctree:: + :maxdepth: 1 + + easydeploy_guide.md diff --git a/third_party/mmyolo/docs/en/recommended_topics/deploy/mmdeploy_guide.md b/third_party/mmyolo/docs/en/recommended_topics/deploy/mmdeploy_guide.md new file mode 100644 index 0000000000000000000000000000000000000000..096d39fbc9bd6ee46309332339cbcdca2009c098 --- /dev/null +++ b/third_party/mmyolo/docs/en/recommended_topics/deploy/mmdeploy_guide.md @@ -0,0 +1,414 @@ +# Basic Deployment Guide + +## Introduction of MMDeploy + +MMDeploy is an open-source deep learning model deployment toolset. It is a part of the [OpenMMLab](https://openmmlab.com/) project, and provides **a unified experience of exporting different models** to various platforms and devices of the OpenMMLab series libraries. Using MMDeploy, developers can easily export the specific compiled SDK they need from the training result, which saves a lot of effort. 
+ +More detailed introduction and guides can be found [here](https://mmdeploy.readthedocs.io/en/latest/get_started.html) + +## Supported Algorithms + +Currently our deployment kit supports on the following models and backends: + +| Model | Task | OnnxRuntime | TensorRT | Model config | +| :----- | :-------------- | :---------: | :------: | :---------------------------------------------------------------------: | +| YOLOv5 | ObjectDetection | Y | Y | [config](https://github.com/open-mmlab/mmyolo/tree/main/configs/yolov5) | +| YOLOv6 | ObjectDetection | Y | Y | [config](https://github.com/open-mmlab/mmyolo/tree/main/configs/yolov6) | +| YOLOX | ObjectDetection | Y | Y | [config](https://github.com/open-mmlab/mmyolo/tree/main/configs/yolox) | +| RTMDet | ObjectDetection | Y | Y | [config](https://github.com/open-mmlab/mmyolo/tree/main/configs/rtmdet) | + +Note: ncnn and other inference backends support are coming soon. + +## Installation + +Please install mmdeploy by following [this](https://mmdeploy.readthedocs.io/en/latest/get_started.html) guide. + +```{note} +If you install mmdeploy prebuilt package, please also clone its repository by 'git clone https://github.com/open-mmlab/mmdeploy.git --depth=1' to get the 'tools' file for deployment. +``` + +## How to Write Config for MMYOLO + +All config files related to the deployment are located at [`configs/deploy`](../../../configs/deploy/). + +You only need to change the relative data processing part in the model config file to support either static or dynamic input for your model. Besides, MMDeploy integrates the post-processing parts as customized ops, you can modify the strategy in `post_processing` parameter in `codebase_config`. + +Here is the detail description: + +```python +codebase_config = dict( + type='mmyolo', + task='ObjectDetection', + model_type='end2end', + post_processing=dict( + score_threshold=0.05, + confidence_threshold=0.005, + iou_threshold=0.5, + max_output_boxes_per_class=200, + pre_top_k=5000, + keep_top_k=100, + background_label_id=-1), + module=['mmyolo.deploy']) +``` + +- `score_threshold`: set the score threshold to filter candidate bboxes before `nms` +- `confidence_threshold`: set the confidence threshold to filter candidate bboxes before `nms` +- `iou_threshold`: set the `iou` threshold for removing duplicates in `nms` +- `max_output_boxes_per_class`: set the maximum number of bboxes for each class +- `pre_top_k`: set the number of fixedcandidate bboxes before `nms`, sorted by scores +- `keep_top_k`: set the number of output candidate bboxs after `nms` +- `background_label_id`: set to `-1` as MMYOLO has no background class information + +### Configuration for Static Inputs + +#### 1. Model Config + +Taking `YOLOv5` of MMYOLO as an example, here are the details: + +```python +_base_ = '../../yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py' + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict( + type='LetterResize', + scale=_base_.img_scale, + allow_scale_up=False, + use_mini_pad=False, + ), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] + +test_dataloader = dict( + dataset=dict(pipeline=test_pipeline, batch_shapes_cfg=None)) +``` + +`_base_ = '../../yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py'` inherits the model config in the training stage. 
+ +`test_pipeline` adds the data processing piple for the deployment, `LetterResize` controls the size of the input images and the input for the converted model + +`test_dataloader` adds the dataloader config for the deployment, `batch_shapes_cfg` decides whether to use the `batch_shapes` strategy. More details can be found at [yolov5 configs](../user_guides/config.md) + +#### 2. Deployment Config + +Here we still use the `YOLOv5` in MMYOLO as the example. We can use [`detection_onnxruntime_static.py`](https://github.com/open-mmlab/mmyolo/blob/main/configs/deploy/detection_onnxruntime_static.py) as the config to deploy `YOLOv5` to `ONNXRuntime` with static inputs. + +```python +_base_ = ['./base_static.py'] +codebase_config = dict( + type='mmyolo', + task='ObjectDetection', + model_type='end2end', + post_processing=dict( + score_threshold=0.05, + confidence_threshold=0.005, + iou_threshold=0.5, + max_output_boxes_per_class=200, + pre_top_k=5000, + keep_top_k=100, + background_label_id=-1), + module=['mmyolo.deploy']) +backend_config = dict(type='onnxruntime') +``` + +`backend_config` indicates the deployment backend with `type='onnxruntime'`, other information can be referred from the third section. + +To deploy the `YOLOv5` to `TensorRT`, please refer to the [`detection_tensorrt_static-640x640.py`](https://github.com/open-mmlab/mmyolo/blob/main/configs/deploy/detection_tensorrt_static-640x640.py) as follows. + +```python +_base_ = ['./base_static.py'] +onnx_config = dict(input_shape=(640, 640)) +backend_config = dict( + type='tensorrt', + common_config=dict(fp16_mode=False, max_workspace_size=1 << 30), + model_inputs=[ + dict( + input_shapes=dict( + input=dict( + min_shape=[1, 3, 640, 640], + opt_shape=[1, 3, 640, 640], + max_shape=[1, 3, 640, 640]))) + ]) +use_efficientnms = False +``` + +`backend_config` indices the backend with `type='tensorrt'`. + +Different from `ONNXRuntime` deployment configuration, `TensorRT` needs to specify the input image size and the parameters required to build the engine file, including: + +- `onnx_config` specifies the input shape as `input_shape=(640, 640)` +- `fp16_mode=False` and `max_workspace_size=1 << 30` in `backend_config['common_config']` indicates whether to build the engine in the parameter format of `fp16`, and the maximum video memory for the current `gpu` device, respectively. The unit is in `GB`. For detailed configuration of `fp16`, please refer to the [`detection_tensorrt-fp16_static-640x640.py`](https://github.com/open-mmlab/mmyolo/blob/main/configs/deploy/detection_tensorrt-fp16_static-640x640.py) +- The `min_shape`/`opt_shape`/`max_shape` in `backend_config['model_inputs']['input_shapes']['input']` should remain the same under static input, the default is `[1, 3, 640, 640]`. + +`use_efficientnms` is a new configuration introduced by the `MMYOLO` series, indicating whether to enable `Efficient NMS Plugin` to replace `TRTBatchedNMS plugin` in `MMDeploy` when exporting `onnx`. + +You can refer to the official [efficient NMS plugins](https://github.com/NVIDIA/TensorRT/blob/main/plugin/efficientNMSPlugin/README.md) by `TensorRT` for more details. + +Note: this out-of-box feature is **only available in TensorRT>=8.0**, no need to compile it by yourself. + +### Configuration for Dynamic Inputs + +#### 1. Model Config + +When you deploy a dynamic input model, you don't need to modify any model configuration files but the deployment configuration files. + +#### 2. 
Deployment Config + +To deploy the `YOLOv5` in MMYOLO to `ONNXRuntime`, please refer to the [`detection_onnxruntime_dynamic.py`](https://github.com/open-mmlab/mmyolo/blob/main/configs/deploy/detection_onnxruntime_dynamic.py). + +```python +_base_ = ['./base_dynamic.py'] +codebase_config = dict( + type='mmyolo', + task='ObjectDetection', + model_type='end2end', + post_processing=dict( + score_threshold=0.05, + confidence_threshold=0.005, + iou_threshold=0.5, + max_output_boxes_per_class=200, + pre_top_k=5000, + keep_top_k=100, + background_label_id=-1), + module=['mmyolo.deploy']) +backend_config = dict(type='onnxruntime') +``` + +`backend_config` indicates the backend with `type='onnxruntime'`. Other parameters stay the same as the static input section. + +To deploy the `YOLOv5` to `TensorRT`, please refer to the [`detection_tensorrt_dynamic-192x192-960x960.py`](https://github.com/open-mmlab/mmyolo/blob/main/configs/deploy/detection_tensorrt_dynamic-192x192-960x960.py). + +```python +_base_ = ['./base_dynamic.py'] +backend_config = dict( + type='tensorrt', + common_config=dict(fp16_mode=False, max_workspace_size=1 << 30), + model_inputs=[ + dict( + input_shapes=dict( + input=dict( + min_shape=[1, 3, 192, 192], + opt_shape=[1, 3, 640, 640], + max_shape=[1, 3, 960, 960]))) + ]) +use_efficientnms = False +``` + +`backend_config` indicates the backend with `type='tensorrt'`. Since the dynamic and static inputs are different in `TensorRT`, please check the details at [TensorRT dynamic input official introduction](https://docs.nvidia.com/deeplearning/tensorrt/archives/tensorrt-843/developer-guide/index.html#work_dynamic_shapes). + +`TensorRT` deployment requires you to specify `min_shape`, `opt_shape` , and `max_shape`. `TensorRT` limits the size of the input image between `min_shape` and `max_shape`. + +`min_shape` is the minimum size of the input image. `opt_shape` is the common size of the input image, inference performance is best under this size. `max_shape` is the maximum size of the input image. + +`use_efficientnms` configuration is the same as the `TensorRT` static input configuration in the previous section. + +### INT8 Quantization Support + +Note: Int8 quantization support will soon be released. + +## How to Convert Model + +### Usage + +#### Deploy with MMDeploy Tools + +Set the root directory of `MMDeploy` as an env parameter `MMDEPLOY_DIR` using `export MMDEPLOY_DIR=/the/root/path/of/MMDeploy` command. + +```shell +python3 ${MMDEPLOY_DIR}/tools/deploy.py \ + ${DEPLOY_CFG_PATH} \ + ${MODEL_CFG_PATH} \ + ${MODEL_CHECKPOINT_PATH} \ + ${INPUT_IMG} \ + --test-img ${TEST_IMG} \ + --work-dir ${WORK_DIR} \ + --calib-dataset-cfg ${CALIB_DATA_CFG} \ + --device ${DEVICE} \ + --log-level INFO \ + --show \ + --dump-info +``` + +### Parameter Description + +- `deploy_cfg`: set the deployment config path of MMDeploy for the model, including the type of inference framework, whether quantize, whether the input shape is dynamic, etc. There may be a reference relationship between configuration files, e.g. `configs/deploy/detection_onnxruntime_static.py` +- `model_cfg`: set the MMYOLO model config path, e.g. `configs/deploy/model/yolov5_s-deploy.py`, regardless of the path to MMDeploy +- `checkpoint`: set the torch model path. It can start with `http/https`, more details are available in `mmengine.fileio` apis +- `img`: set the path to the image or point cloud file used for testing during model conversion +- `--test-img`: set the image file that used to test model. 
If not specified, it will be set to `None` +- `--work-dir`: set the work directory that used to save logs and models +- `--calib-dataset-cfg`: use for calibration only for INT8 mode. If not specified, it will be set to None and use “val” dataset in model config for calibration +- `--device`: set the device used for model conversion. The default is `cpu`, for TensorRT used `cuda:0` +- `--log-level`: set log level which in `'CRITICAL', 'FATAL', 'ERROR', 'WARN', 'WARNING', 'INFO', 'DEBUG', 'NOTSET'`. If not specified, it will be set to `INFO` +- `--show`: show the result on screen or not +- `--dump-info`: output SDK information or not + +#### Deploy with MMDeploy API + +Suppose the working directory is the root path of mmyolo. Take [YoloV5](https://github.com/open-mmlab/mmyolo/blob/main/configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py) model as an example. You can download its checkpoint from [here](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth), and then convert it to onnx model as follows: + +```python +from mmdeploy.apis import torch2onnx +from mmdeploy.backend.sdk.export_info import export2SDK + +img = 'demo/demo.jpg' +work_dir = 'mmdeploy_models/mmyolo/onnx' +save_file = 'end2end.onnx' +deploy_cfg = 'configs/deploy/detection_onnxruntime_dynamic.py' +model_cfg = 'configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py' +model_checkpoint = 'checkpoints/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth' +device = 'cpu' + +# 1. convert model to onnx +torch2onnx(img, work_dir, save_file, deploy_cfg, model_cfg, + model_checkpoint, device) + +# 2. extract pipeline info for inference by MMDeploy SDK +export2SDK(deploy_cfg, model_cfg, work_dir, pth=model_checkpoint, + device=device) +``` + +## Model specification + +Before moving on to model inference chapter, let's know more about the converted result structure which is very important for model inference. It is saved in the directory specified with `--wodk_dir`. + +The converted results are saved in the working directory `mmdeploy_models/mmyolo/onnx` in the previous example. It includes: + +``` +mmdeploy_models/mmyolo/onnx +├── deploy.json +├── detail.json +├── end2end.onnx +└── pipeline.json +``` + +in which, + +- **end2end.onnx**: backend model which can be inferred by ONNX Runtime +- ***xxx*.json**: the necessary information for mmdeploy SDK + +The whole package **mmdeploy_models/mmyolo/onnx** is defined as **mmdeploy SDK model**, i.e., **mmdeploy SDK model** includes both backend model and inference meta information. + +## Model inference + +### Backend model inference + +Take the previous converted `end2end.onnx` model as an example, you can use the following code to inference the model and visualize the results. 
+ +```python +from mmdeploy.apis.utils import build_task_processor +from mmdeploy.utils import get_input_shape, load_config +import torch + +deploy_cfg = 'configs/deploy/detection_onnxruntime_dynamic.py' +model_cfg = 'configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py' +device = 'cpu' +backend_model = ['mmdeploy_models/mmyolo/onnx/end2end.onnx'] +image = 'demo/demo.jpg' + +# read deploy_cfg and model_cfg +deploy_cfg, model_cfg = load_config(deploy_cfg, model_cfg) + +# build task and backend model +task_processor = build_task_processor(model_cfg, deploy_cfg, device) +model = task_processor.build_backend_model(backend_model) + +# process input image +input_shape = get_input_shape(deploy_cfg) +model_inputs, _ = task_processor.create_input(image, input_shape) + +# do model inference +with torch.no_grad(): + result = model.test_step(model_inputs) + +# visualize results +task_processor.visualize( + image=image, + model=model, + result=result[0], + window_name='visualize', + output_file='work_dir/output_detection.png') +``` + +With the above code, you can find the inference result `output_detection.png` in `work_dir`. + +### SDK model inference + +You can also perform SDK model inference like following, + +```python +from mmdeploy_runtime import Detector +import cv2 + +img = cv2.imread('demo/demo.jpg') + +# create a detector +detector = Detector(model_path='mmdeploy_models/mmyolo/onnx', + device_name='cpu', device_id=0) +# perform inference +bboxes, labels, masks = detector(img) + +# visualize inference result +indices = [i for i in range(len(bboxes))] +for index, bbox, label_id in zip(indices, bboxes, labels): + [left, top, right, bottom], score = bbox[0:4].astype(int), bbox[4] + if score < 0.3: + continue + + cv2.rectangle(img, (left, top), (right, bottom), (0, 255, 0)) + +cv2.imwrite('work_dir/output_detection.png', img) +``` + +Besides python API, mmdeploy SDK also provides other FFI (Foreign Function Interface), such as C, C++, C#, Java and so on. You can learn their usage from [demos](https://github.com/open-mmlab/mmdeploy/tree/main/demo). + +## How to Evaluate Model + +### Usage + +After the model is converted to your backend, you can use `${MMDEPLOY_DIR}/tools/test.py` to evaluate the performance. + +```shell +python3 ${MMDEPLOY_DIR}/tools/test.py \ + ${DEPLOY_CFG} \ + ${MODEL_CFG} \ + --model ${BACKEND_MODEL_FILES} \ + --device ${DEVICE} \ + --work-dir ${WORK_DIR} \ + [--cfg-options ${CFG_OPTIONS}] \ + [--show] \ + [--show-dir ${OUTPUT_IMAGE_DIR}] \ + [--interval ${INTERVAL}] \ + [--wait-time ${WAIT_TIME}] \ + [--log2file work_dirs/output.txt] + [--speed-test] \ + [--warmup ${WARM_UP}] \ + [--log-interval ${LOG_INTERVERL}] \ + [--batch-size ${BATCH_SIZE}] \ + [--uri ${URI}] +``` + +### Parameter Description + +- `deploy_cfg`: set the deployment config file path. +- `model_cfg`: set the MMYOLO model config file path. +- `--model`: set the converted model. For example, if we exported a TensorRT model, we need to pass in the file path with the suffix ".engine". +- `--device`: indicate the device to run the model. Note that some backends limit the running devices. For example, TensorRT must run on CUDA. +- `--work-dir`: the directory to save the file containing evaluation metrics. +- `--cfg-options`: pass in additional configs, which will override the current deployment configs. +- `--show`: show the evaluation result on screen or not. +- `--show-dir`: save the evaluation result to this directory, valid only when specified. 
+- `--interval`: set the display interval between each two evaluation results. +- `--wait-time`: set the display time of each window. +- `--log2file`: log evaluation results and speed to file. +- `--speed-test`: test the inference speed or not. +- `--warmup`: warm up before speed test or not, works only when `speed-test` is specified. +- `--log-interval`: the interval between each log, works only when `speed-test` is specified. +- `--batch-size`: set the batch size for inference, which will override the `samples_per_gpu` in data config. The default value is `1`, however, not every model supports `batch_size > 1`. +- `--uri`: Remote ipv4:port or ipv6:port for inference on edge device. + +Note: other parameters in `${MMDEPLOY_DIR}/tools/test.py` are used for speed test, they will not affect the evaluation results. diff --git a/third_party/mmyolo/docs/en/recommended_topics/deploy/mmdeploy_yolov5.md b/third_party/mmyolo/docs/en/recommended_topics/deploy/mmdeploy_yolov5.md new file mode 100644 index 0000000000000000000000000000000000000000..321a6734fe0a18f35e88b5f31e28be6b3abc7ee5 --- /dev/null +++ b/third_party/mmyolo/docs/en/recommended_topics/deploy/mmdeploy_yolov5.md @@ -0,0 +1,572 @@ +# YOLOv5 Deployment + +Please check the [basic_deployment_guide](mmdeploy_guide.md) to get familiar with the configurations. + +## Model Training and Validation + +TODO + +## MMDeploy Environment Setup + +Please check the installation document of `MMDeploy` at [build_from_source](https://github.com/open-mmlab/mmdeploy/blob/dev-1.x/docs/en/01-how-to-build/build_from_source.md). Please build both `MMDeploy` and the customized Ops to your specific platform. + +Note: please check at `MMDeploy` [FAQ](https://github.com/open-mmlab/mmdeploy/blob/dev-1.x/docs/en/faq.md) or create new issues in `MMDeploy` when you come across any problems. + +## How to Prepare Configuration File + +This deployment guide uses the `YOLOv5` model trained on `COCO` dataset in MMYOLO to illustrate the whole process, including both static and dynamic inputs and different procedures for `TensorRT` and `ONNXRuntime`. + +### For Static Input + +#### 1. Model Config + +To deploy the model with static inputs, you need to ensure that the model inputs are in fixed size, e.g. the input size is set to `640x640` while uploading data in the test pipeline and test dataloader. + +Here is a example in [`yolov5_s-static.py`](https://github.com/open-mmlab/mmyolo/tree/main/configs/deploy/model/yolov5_s-static.py) + +```python +_base_ = '../../yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py' + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict( + type='LetterResize', + scale=_base_.img_scale, + allow_scale_up=False, + use_mini_pad=False, + ), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] + +test_dataloader = dict( + dataset=dict(pipeline=test_pipeline, batch_shapes_cfg=None)) +``` + +As the `YOLOv5` will turn on `allow_scale_up` and `use_mini_pad` during the test to change the size of the input image in order to achieve higher accuracy. However, it will cause the input size mismatch problem when deploying in the static input model. + +Compared with the original configuration file, this configuration has been modified as follows: + +- turn off the settings related to reshaping the image in `test_pipeline`, e.g. 
setting `allow_scale_up=False` and `use_mini_pad=False` in `LetterResize` +- turn off the `batch_shapes` in `test_dataloader` as `batch_shapes_cfg=None`. + +#### 2. Deployment Cofnig + +To deploy the model to `ONNXRuntime`, please refer to the [`detection_onnxruntime_static.py`](https://github.com/open-mmlab/mmyolo/tree/main/configs/deploy/detection_onnxruntime_static.py) as follows: + +```python +_base_ = ['./base_static.py'] +codebase_config = dict( + type='mmyolo', + task='ObjectDetection', + model_type='end2end', + post_processing=dict( + score_threshold=0.05, + confidence_threshold=0.005, + iou_threshold=0.5, + max_output_boxes_per_class=200, + pre_top_k=5000, + keep_top_k=100, + background_label_id=-1), + module=['mmyolo.deploy']) +backend_config = dict(type='onnxruntime') +``` + +The `post_processing` in the default configuration aligns the accuracy of the current model with the trained `pytorch` model. If you need to modify the relevant parameters, you can refer to the detailed introduction of [dasic_deployment_guide](mmdeploy_guide.md). + +To deploy the model to `TensorRT`, please refer to the [`detection_tensorrt_static-640x640.py`](https://github.com/open-mmlab/mmyolo/tree/main/configs/deploy/detection_tensorrt_static-640x640.p). + +```python +_base_ = ['./base_static.py'] +onnx_config = dict(input_shape=(640, 640)) +backend_config = dict( + type='tensorrt', + common_config=dict(fp16_mode=False, max_workspace_size=1 << 30), + model_inputs=[ + dict( + input_shapes=dict( + input=dict( + min_shape=[1, 3, 640, 640], + opt_shape=[1, 3, 640, 640], + max_shape=[1, 3, 640, 640]))) + ]) +use_efficientnms = False +``` + +In this guide, we use the default settings such as `input_shape=(640, 640)` and `fp16_mode=False` to build in network in `fp32` mode. Moreover, we set `max_workspace_size=1 << 30` for the gpu memory which allows `TensorRT` to build the engine with maximum `1GB` memory. + +### For Dynamic Input + +#### 1. Model Confige + +As `TensorRT` limits the minimum and maximum input size, we can use any size for the inputs when deploy the model in dynamic mode. In this way, we can keep the default settings in [`yolov5_s-v61_syncbn_8xb16-300e_coco.py`](https://github.com/open-mmlab/mmyolo/tree/main/configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py). The data processing and dataloader parts are as follows. 
+ +```python +batch_shapes_cfg = dict( + type='BatchShapePolicy', + batch_size=val_batch_size_per_gpu, + img_size=img_scale[0], + size_divisor=32, + extra_pad_ratio=0.5) + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=False, + pad_val=dict(img=114)), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] + +val_dataloader = dict( + batch_size=val_batch_size_per_gpu, + num_workers=val_num_workers, + persistent_workers=persistent_workers, + pin_memory=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + test_mode=True, + data_prefix=dict(img='val2017/'), + ann_file='annotations/instances_val2017.json', + pipeline=test_pipeline, + batch_shapes_cfg=batch_shapes_cfg)) +``` + +We use `allow_scale_up=False` to control when the input small images will be upsampled or not in the initialization of `LetterResize`. At the same time, the default `use_mini_pad=False` turns off the minimum padding strategy of the image, and `val_dataloader['dataset']` is passed in` batch_shapes_cfg=batch_shapes_cfg` to ensure that the minimum padding is performed according to the input size in `batch`. These configs will change the dimensions of the input image, so the converted model can support dynamic inputs according to the above dataset loader when testing. + +#### 2. Deployment Cofnig + +To deploy the model to `ONNXRuntime`, please refer to the [`detection_onnxruntime_dynamic.py`](https://github.com/open-mmlab/mmyolo/blob/main/configs/deploy/detection_onnxruntime_dynamic.py) for more details. + +```python +_base_ = ['./base_dynamic.py'] +codebase_config = dict( + type='mmyolo', + task='ObjectDetection', + model_type='end2end', + post_processing=dict( + score_threshold=0.05, + confidence_threshold=0.005, + iou_threshold=0.5, + max_output_boxes_per_class=200, + pre_top_k=5000, + keep_top_k=100, + background_label_id=-1), + module=['mmyolo.deploy']) +backend_config = dict(type='onnxruntime') +``` + +Differs from the static input config we introduced in previous section, dynamic input config additionally inherits the `dynamic_axes`. The rest of the configuration stays the same as the static inputs. + +To deploy the model to `TensorRT`, please refer to the [`detection_tensorrt_dynamic-192x192-960x960.py`](https://github.com/open-mmlab/mmyolo/tree/main/configs/deploy/detection_tensorrt_dynamic-192x192-960x960.py) for more details. + +```python +_base_ = ['./base_dynamic.py'] +backend_config = dict( + type='tensorrt', + common_config=dict(fp16_mode=False, max_workspace_size=1 << 30), + model_inputs=[ + dict( + input_shapes=dict( + input=dict( + min_shape=[1, 3, 192, 192], + opt_shape=[1, 3, 640, 640], + max_shape=[1, 3, 960, 960]))) + ]) +use_efficientnms = False +``` + +In our example, the network is built in `fp32` mode as `fp16_mode=False`, and the maximum graphic memory is `1GB` for building the `TensorRT` engine as `max_workspace_size=1 << 30`. + +At the same time, `min_shape=[1, 3, 192, 192]`, `opt_shape=[1, 3, 640, 640]`, and `max_shape=[1, 3, 960, 960]` in the default setting set the model with minimum input size to `192x192`, the maximum size to `960x960`, and the most common size to `640x640`. 
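+
+To make this constraint concrete, here is a small standalone sketch (not part of MMDeploy; the helper name and the printed examples are only illustrative) that checks whether a candidate input shape falls inside the dynamic range configured above. Anything outside `min_shape`/`max_shape` has to be resized, e.g. by `LetterResize`, before it can be fed to the engine.
+
+```python
+# Illustrative only: mirrors the min/opt/max constraint of the TensorRT config above.
+MIN_SHAPE = (1, 3, 192, 192)
+MAX_SHAPE = (1, 3, 960, 960)
+
+
+def shape_in_dynamic_range(shape, min_shape=MIN_SHAPE, max_shape=MAX_SHAPE):
+    """Return True if every dimension lies between min_shape and max_shape."""
+    return all(lo <= dim <= hi for lo, dim, hi in zip(min_shape, shape, max_shape))
+
+
+print(shape_in_dynamic_range((1, 3, 640, 640)))    # True: matches opt_shape
+print(shape_in_dynamic_range((1, 3, 1024, 1024)))  # False: larger than max_shape
+```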
+ +When you deploy the model, it can adopt to the input image dimensions automatically. + +## How to Convert Model + +Note: The `MMDeploy` root directory used in this guide is `/home/openmmlab/dev/mmdeploy`, please modify it to your `MMDeploy` directory. + +Use the following command to download the pretrained YOLOv5 weight and save it to your device: + +```shell +wget https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth -O /home/openmmlab/dev/mmdeploy/yolov5s.pth +``` + +Set the relevant env parameters using the following command as well: + +```shell +export MMDEPLOY_DIR=/home/openmmlab/dev/mmdeploy +export PATH_TO_CHECKPOINTS=/home/openmmlab/dev/mmdeploy/yolov5s.pth +``` + +### YOLOv5 Static Model Deployment + +#### ONNXRuntime + +```shell +python3 ${MMDEPLOY_DIR}/tools/deploy.py \ + configs/deploy/detection_onnxruntime_static.py \ + configs/deploy/model/yolov5_s-static.py \ + ${PATH_TO_CHECKPOINTS} \ + demo/demo.jpg \ + --work-dir work_dir \ + --show \ + --device cpu +``` + +#### TensorRT + +```shell +python3 ${MMDEPLOY_DIR}/tools/deploy.py \ + configs/deploy/detection_tensorrt_static-640x640.py \ + configs/deploy/model/yolov5_s-static.py \ + ${PATH_TO_CHECKPOINTS} \ + demo/demo.jpg \ + --work-dir work_dir \ + --show \ + --device cuda:0 +``` + +### YOLOv5 Dynamic Model Deployment + +#### ONNXRuntime + +```shell +python3 ${MMDEPLOY_DIR}/tools/deploy.py \ + configs/deploy/detection_onnxruntime_dynamic.py \ + configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py \ + ${PATH_TO_CHECKPOINTS} \ + demo/demo.jpg \ + --work-dir work_dir \ + --show \ + --device cpu + --dump-info +``` + +#### TensorRT + +```shell +python3 ${MMDEPLOY_DIR}/tools/deploy.py \ + configs/deploy/detection_tensorrt_dynamic-192x192-960x960.py \ + configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py \ + ${PATH_TO_CHECKPOINTS} \ + demo/demo.jpg \ + --work-dir work_dir \ + --show \ + --device cuda:0 + --dump-info +``` + +When convert the model using the above commands, you will find the following files under the `work_dir` folder: + +![image](https://github.com/open-mmlab/mmdeploy/assets/110151316/760f3f7f-aa23-46cf-987c-717d3490246f) + +or + +![image](https://github.com/open-mmlab/mmdeploy/assets/110151316/732bcd9a-fca0-40ba-b5af-540a47eb9c35) + +After exporting to `onnxruntime`, you will get six files as shown in Figure 1, where `end2end.onnx` represents the exported `onnxruntime` model. The `xxx.json` are the meta info for `MMDeploy SDK` inference. + +After exporting to `TensorRT`, you will get the seven files as shown in Figure 2, where `end2end.onnx` represents the exported intermediate model. `MMDeploy` uses this model to automatically continue to convert the `end2end.engine` model for `TensorRT `Deployment. The `xxx.json` are the meta info for `MMDeploy SDK` inference. + +## How to Evaluate Model + +After successfully convert the model, you can use `${MMDEPLOY_DIR}/tools/test.py` to evaluate the converted model. The following part shows how to evaluate the static models of `ONNXRuntime` and `TensorRT`. For dynamic model evaluation, please modify the configuration of the inputs. 
+ +### ONNXRuntime + +```shell +python3 ${MMDEPLOY_DIR}/tools/test.py \ + configs/deploy/detection_onnxruntime_static.py \ + configs/deploy/model/yolov5_s-static.py \ + --model work_dir/end2end.onnx \ + --device cpu \ + --work-dir work_dir +``` + +Once the process is done, you can get the output results as this: + +![image](https://user-images.githubusercontent.com/92794867/199380483-cf8d867b-7309-4994-938a-f743f4cada77.png) + +### TensorRT + +Note: `TensorRT` must run on `CUDA` devices! + +```shell +python3 ${MMDEPLOY_DIR}/tools/test.py \ + configs/deploy/detection_tensorrt_static-640x640.py \ + configs/deploy/model/yolov5_s-static.py \ + --model work_dir/end2end.engine \ + --device cuda:0 \ + --work-dir work_dir +``` + +Once the process is done, you can get the output results as this: + +![image](https://user-images.githubusercontent.com/92794867/199380370-da15cfca-2723-4e5b-b6cf-0afb5f44a66a.png) + +More useful evaluation tools will be released in the future. + +# Deploy using Docker + +`MMYOLO` provides a deployment [`Dockerfile`](https://github.com/open-mmlab/mmyolo/blob/main/docker/Dockerfile_deployment) for deployment purpose. Please make sure your local docker version is greater than `19.03`. + +Note: users in mainland China can comment out the `Optional` part in the dockerfile for better experience. + +```dockerfile +# (Optional) +RUN sed -i 's/http:\/\/archive.ubuntu.com\/ubuntu\//http:\/\/mirrors.aliyun.com\/ubuntu\//g' /etc/apt/sources.list && \ + pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple +``` + +To build the docker image, + +```bash +# build an image with PyTorch 1.12, CUDA 11.6, TensorRT 8.2.4 ONNXRuntime 1.8.1 +docker build -f docker/Dockerfile_deployment -t mmyolo:v1 . +``` + +To run the docker image, + +```bash +export DATA_DIR=/path/to/your/dataset +docker run --gpus all --shm-size=8g -it --name mmyolo -v ${DATA_DIR}:/openmmlab/mmyolo/data/coco mmyolo:v1 +``` + +`DATA_DIR` is the path of your `COCO` dataset. + +We provide a `script.sh` file for you which runs the whole pipeline. Create the script under `/openmmlab/mmyolo` directory in your docker container using the following content. + +```bash +#!/bin/bash +wget -q https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth \ + -O yolov5s.pth +export MMDEPLOY_DIR=/openmmlab/mmdeploy +export PATH_TO_CHECKPOINTS=/openmmlab/mmyolo/yolov5s.pth + +python3 ${MMDEPLOY_DIR}/tools/deploy.py \ + configs/deploy/detection_tensorrt_static-640x640.py \ + configs/deploy/model/yolov5_s-static.py \ + ${PATH_TO_CHECKPOINTS} \ + demo/demo.jpg \ + --work-dir work_dir_trt \ + --device cuda:0 + +python3 ${MMDEPLOY_DIR}/tools/test.py \ + configs/deploy/detection_tensorrt_static-640x640.py \ + configs/deploy/model/yolov5_s-static.py \ + --model work_dir_trt/end2end.engine \ + --device cuda:0 \ + --work-dir work_dir_trt + +python3 ${MMDEPLOY_DIR}/tools/deploy.py \ + configs/deploy/detection_onnxruntime_static.py \ + configs/deploy/model/yolov5_s-static.py \ + ${PATH_TO_CHECKPOINTS} \ + demo/demo.jpg \ + --work-dir work_dir_ort \ + --device cpu + +python3 ${MMDEPLOY_DIR}/tools/test.py \ + configs/deploy/detection_onnxruntime_static.py \ + configs/deploy/model/yolov5_s-static.py \ + --model work_dir_ort/end2end.onnx \ + --device cpu \ + --work-dir work_dir_ort +``` + +Then run the script under `/openmmlab/mmyolo`. 
+ +```bash +sh script.sh +``` + +This script automatically downloads the `YOLOv5` pretrained weights in `MMYOLO` and convert the model using `MMDeploy`. You will get the output result as follows. + +- TensorRT: + + ![image](https://user-images.githubusercontent.com/92794867/199657349-1bad9196-c00b-4a65-84f5-80f51e65a2bd.png) + +- ONNXRuntime: + + ![image](https://user-images.githubusercontent.com/92794867/199657283-95412e84-3ba4-463f-b4b2-4bf52ec4acbd.png) + +We can see from the above images that the accuracy of converted models shrink within 1% compared with the pytorch [MMYOLO-YOLOv5](https://github.com/open-mmlab/mmyolo/tree/main/configs/yolov5#results-and-models) models. + +If you need to test the inference speed of the converted model, you can use the following commands. + +- TensorRT + +```shell +python3 ${MMDEPLOY_DIR}/tools/profiler.py \ + configs/deploy/detection_tensorrt_static-640x640.py \ + configs/deploy/model/yolov5_s-static.py \ + data/coco/val2017 \ + --model work_dir_trt/end2end.engine \ + --device cuda:0 +``` + +- ONNXRuntime + +```shell +python3 ${MMDEPLOY_DIR}/tools/profiler.py \ + configs/deploy/detection_onnxruntime_static.py \ + configs/deploy/model/yolov5_s-static.py \ + data/coco/val2017 \ + --model work_dir_ort/end2end.onnx \ + --device cpu +``` + +## Model Inference + +### Backend Model Inference + +#### ONNXRuntime + +For the converted model `end2end.onnx`,you can do the inference with the following code: + +```python +from mmdeploy.apis.utils import build_task_processor +from mmdeploy.utils import get_input_shape, load_config +import torch + +deploy_cfg = './configs/deploy/detection_onnxruntime_dynamic.py' +model_cfg = '../mmyolo/configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py' +device = 'cpu' +backend_model = ['./work_dir/end2end.onnx'] +image = '../mmyolo/demo/demo.jpg' + +# read deploy_cfg and model_cfg +deploy_cfg, model_cfg = load_config(deploy_cfg, model_cfg) + +# build task and backend model +task_processor = build_task_processor(model_cfg, deploy_cfg, device) +model = task_processor.build_backend_model(backend_model) + +# process input image +input_shape = get_input_shape(deploy_cfg) +model_inputs, _ = task_processor.create_input(image, input_shape) + +# do model inference +with torch.no_grad(): + result = model.test_step(model_inputs) + +# visualize results +task_processor.visualize( + image=image, + model=model, + result=result[0], + window_name='visualize', + output_file='work_dir/output_detection.png') +``` + +#### TensorRT + +For the converted model `end2end.engine`,you can do the inference with the following code: + +```python +from mmdeploy.apis.utils import build_task_processor +from mmdeploy.utils import get_input_shape, load_config +import torch + +deploy_cfg = './configs/deploy/detection_tensorrt_dynamic-192x192-960x960.py' +model_cfg = '../mmyolo/configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py' +device = 'cuda:0' +backend_model = ['./work_dir/end2end.engine'] +image = '../mmyolo/demo/demo.jpg' + +# read deploy_cfg and model_cfg +deploy_cfg, model_cfg = load_config(deploy_cfg, model_cfg) + +# build task and backend model +task_processor = build_task_processor(model_cfg, deploy_cfg, device) +model = task_processor.build_backend_model(backend_model) + +# process input image +input_shape = get_input_shape(deploy_cfg) +model_inputs, _ = task_processor.create_input(image, input_shape) + +# do model inference +with torch.no_grad(): + result = model.test_step(model_inputs) + +# visualize results +task_processor.visualize( + image=image, 
+ model=model, + result=result[0], + window_name='visualize', + output_file='work_dir/output_detection.png') +``` + +### SDK Model Inference + +#### ONNXRuntime + +For the converted model `end2end.onnx`,you can do the SDK inference with the following code: + +```python +from mmdeploy_runtime import Detector +import cv2 + +img = cv2.imread('../mmyolo/demo/demo.jpg') + +# create a detector +detector = Detector(model_path='work_dir', + device_name='cpu', device_id=0) +# perform inference +bboxes, labels, masks = detector(img) + +# visualize inference result +indices = [i for i in range(len(bboxes))] +for index, bbox, label_id in zip(indices, bboxes, labels): + [left, top, right, bottom], score = bbox[0:4].astype(int), bbox[4] + if score < 0.3: + continue + + cv2.rectangle(img, (left, top), (right, bottom), (0, 255, 0)) + +cv2.imwrite('work_dir/output_detection.png', img) +``` + +#### TensorRT + +For the converted model `end2end.engine`,you can do the SDK inference with the following code: + +```python +from mmdeploy_runtime import Detector +import cv2 + +img = cv2.imread('../mmyolo/demo/demo.jpg') + +# create a detector +detector = Detector(model_path='work_dir', + device_name='cuda', device_id=0) +# perform inference +bboxes, labels, masks = detector(img) + +# visualize inference result +indices = [i for i in range(len(bboxes))] +for index, bbox, label_id in zip(indices, bboxes, labels): + [left, top, right, bottom], score = bbox[0:4].astype(int), bbox[4] + if score < 0.3: + continue + + cv2.rectangle(img, (left, top), (right, bottom), (0, 255, 0)) + +cv2.imwrite('work_dir/output_detection.png', img) +``` + +Besides python API, mmdeploy SDK also provides other FFI (Foreign Function Interface), such as C, C++, C#, Java and so on. You can learn their usage from [demos](https://github.com/open-mmlab/mmdeploy/tree/main/demo). diff --git a/third_party/mmyolo/docs/en/recommended_topics/labeling_to_deployment_tutorials.md b/third_party/mmyolo/docs/en/recommended_topics/labeling_to_deployment_tutorials.md new file mode 100644 index 0000000000000000000000000000000000000000..bce5d53f57e6dad4baafc843ad6f86cb19540eb1 --- /dev/null +++ b/third_party/mmyolo/docs/en/recommended_topics/labeling_to_deployment_tutorials.md @@ -0,0 +1,1331 @@ +# Annotation-to-deployment workflow for custom dataset + +In our daily work and study, we often encounter some tasks that need to train custom dataset. There are few scenarios in which open-source datasets can be used as online models, so we need to carry out a series of operations on our custom datasets to ensure that the models can be put into production and serve users. + +```{SeeAlso} +The video of this document has been posted on Bilibili: [A nanny level tutorials for custom datasets from annotationt to deployment](https://www.bilibili.com/video/BV1RG4y137i5) +``` + +```{Note} +All instructions in this document are done on Linux and are fully available on Windows, only slightly different in commands and operations. +``` + +Default that you have completed the installation of MMYOLO, if not installed, please refer to the document [GET STARTED](https://mmyolo.readthedocs.io/en/latest/get_started.html) for installation. + +In this tutorial, we will introduce the whole process from annotating custom dataset to final training, testing and deployment. The overview steps are as below: + +01. Prepare dataset: `tools/misc/download_dataset.py` +02. Use the software of [labelme](https://github.com/wkentaro/labelme) to annotate: `demo/image_demo.py` + labelme +03. 
Convert the dataset into COCO format: `tools/dataset_converters/labelme2coco.py`
+04. Split the dataset: `tools/misc/coco_split.py`
+05. Create a config file based on the dataset
+06. Dataset visualization analysis: `tools/analysis_tools/dataset_analysis.py`
+07. Optimize the anchor sizes: `tools/analysis_tools/optimize_anchors.py`
+08. Visualize the data processing part of the config: `tools/analysis_tools/browse_dataset.py`
+09. Train: `tools/train.py`
+10. Inference: `demo/image_demo.py`
+11. Deployment
+
+```{Note}
+After obtaining the model weights and the mAP on the validation set, users need to analyse the bad cases of incorrect predictions in depth in order to optimize the model. MMYOLO will add this function in the future, so stay tuned.
+```
+
+Each step is described in detail below.
+
+## 1. Prepare custom dataset
+
+- If you don't have your own dataset, or want to use a small dataset to run through the whole process, you can use the 144-image `cat` dataset provided with this tutorial (the raw pictures of this dataset were supplied by @RangeKing and cleaned by @PeterH0323). This `cat` dataset will be used as an example throughout the rest of the tutorial.
+cat dataset +
+ +The download is also very simple, requiring only one command (dataset compression package size `217 MB`): + +```shell +python tools/misc/download_dataset.py --dataset-name cat --save-dir ./data/cat --unzip --delete +``` + +This dataset is automatically downloaded to the `./data/cat` dir with the following directory structure: + +```shell +. +└── ./data/cat + ├── images # image files + │ ├── image1.jpg + │ ├── image2.png + │ └── ... + ├── labels # labelme files + │ ├── image1.json + │ ├── image2.json + │ └── ... + ├── annotations # annotated files of COCO + │ ├── annotations_all.json # all labels of COCO + │ ├── trainval.json # 80% labels of the dataset + │ └── test.json # 20% labels of the dataset + └── class_with_id.txt # id + class_name file +``` + +This dataset can be trained directly. You can remove everything **outside** the `images` dir if you want to go through the whole process. + +- If you already have a dataset, you can compose it into the following structure: + +```shell +. +└── $DATA_ROOT + └── images + ├── image1.jpg + ├── image2.png + └── ... +``` + +## 2. Use the software of labelme to annotate + +In general, there are two annotation methods: + +- Software or algorithmic assistance + manual correction (Recommend, reduce costs and speed up) +- Only manual annotation + +```{Note} +At present, we also consider to access third-party libraries to support the integration of algorithm-assisted annotation and manual optimized annotation by calling MMYOLO inference API through GUI interface. +If you have any interest or ideas, please leave a comment in the issue or contact us directly! +``` + +### 2.1 Software or algorithmic assistance + manual correction + +The principle is using the existing model to inference, and save the result as label file. Manually operating the software and loading the generated label files, you only need to check whether each image is correctly labeled and whether there are missing objects.【assistance + manual correction】you can save a lot of time in order to **reduce costs and speed up** by this way. + +```{Note} +If the existing model doesn't have the categories defined in your dataset, such as COCO pre-trained model, you can manually annotate 100 images to train an initial model, and then software assistance. +``` + +The process is described below: + +#### 2.1.1 Software or algorithmic assistance + +MMYOLO provide model inference script `demo/image_demo.py`. Setting `--to-labelme` to generate labelme format label file: + +```shell +python demo/image_demo.py img \ + config \ + checkpoint + [--out-dir OUT_DIR] \ + [--device DEVICE] \ + [--show] \ + [--deploy] \ + [--score-thr SCORE_THR] \ + [--class-name CLASS_NAME] + [--to-labelme] +``` + +These include: + +- `img`: image path, supported by dir, file, URL; +- `config`:config file path of model; +- `checkpoint`:weight file path of model; +- `--out-dir`:inference results saved in this dir, default as `./output`, if set this `--show` parameter, the detection results are not saved; +- `--device`:cumputing resources, including `CUDA`, `CPU` etc., default as `cuda:0`; +- `--show`:display the detection results, default as `False`; +- `--deploy`:whether to switch to deploy mode; +- `--score-thr`:confidence threshold, default as `0.3`; +- `--to-labelme`:whether to export label files in `labelme` format, shouldn't exist with the `--show` at the same time. + +For example: + +Here, we'll use YOLOv5-s as an example to help us label the 'cat' dataset we just downloaded. 
First, download the weights for YOLOv5-s:
+
+```shell
+mkdir work_dirs
+wget https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth -P ./work_dirs
+```
+
+Since the 80 classes of the COCO dataset already include the `cat` class, we can directly load the COCO pre-trained model for assisted annotation.
+
+```shell
+python demo/image_demo.py ./data/cat/images \
+                          ./configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py \
+                          ./work_dirs/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth \
+                          --out-dir ./data/cat/labels \
+                          --class-name cat \
+                          --to-labelme
+```
+
+```{Tip}
+- If your dataset contains multiple classes to label, use the `--class-name class1 class2` format;
+- Remove the `--class-name` flag to output all classes.
+```
+
+The generated label files are saved in `--out-dir`:
+
+```shell
+.
+└── $OUT_DIR
+    ├── image1.json
+    ├── image2.json
+    └── ...
+```
+
+Here is an example of an original image and its generated json file:
+ Image + Image +
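+
+The screenshots above cannot be shown in plain text, so the optional sketch below prints the content of one generated file instead. The field names follow the standard labelme format, and `image1.json` is just a placeholder file name from the directory structure above.
+
+```python
+# Optional check: print the boxes written by image_demo.py --to-labelme.
+import json
+
+with open('./data/cat/labels/image1.json') as f:
+    label = json.load(f)
+
+print(label['imagePath'], label['imageHeight'], label['imageWidth'])
+for shape in label['shapes']:
+    # a rectangle is stored as two diagonal corner points: [[x1, y1], [x2, y2]]
+    print(shape['label'], shape['shape_type'], shape['points'])
+```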
+
+#### 2.1.2 Manual annotation
+
+In this tutorial, we use [labelme](https://github.com/wkentaro/labelme) to annotate.
+
+- Install labelme
+
+```shell
+conda create -n labelme python=3.8
+conda activate labelme
+pip install labelme==5.1.1
+```
+
+- Start labelme
+
+```shell
+labelme ${image dir path (same as the previous step)} \
+        --output ${the dir path of label file (same as --out-dir)} \
+        --autosave \
+        --nodata
+```
+
+These include:
+
+- `--output`: the saved path of the labelme files. If label files already exist for some images, they will be loaded;
+- `--autosave`: auto-save the label files, which omits some tedious manual saving steps;
+- `--nodata`: doesn't store the base64 encoding of each image, so setting this flag will greatly reduce the size of the label files.
+
+For example:
+
+```shell
+cd /path/to/mmyolo
+labelme ./data/cat/images --output ./data/cat/labels --autosave --nodata
+```
+
+Type the command and labelme will start; then check the labels. If labelme fails to start, type `export QT_DEBUG_PLUGINS=1` in the command line to see which libraries are missing and install them.
+
+label UI +
+ +```{warning} +Make sure to use `rectangle` with the shortcut `Ctrl + R` (see below). + +
+rectangle +
+``` + +### 2.2 Only manual annotation + +The procedure is the same as 【2.1.2 Manual annotation】, except that this is a direct labeling, there is no pre-generated label. + +## 3. Convert the dataset into COCO format + +### 3.1 Using scripts to convert + +MMYOLO provides scripts to convert labelme labels to COCO labels + +```shell +python tools/dataset_converters/labelme2coco.py --img-dir ${image dir path} \ + --labels-dir ${label dir location} \ + --out ${output COCO label json path} \ + [--class-id-txt ${class_with_id.txt path}] +``` + +These include: +`--class-id-txt`: is the `.txt` file of `id class_name` dataset: + +- If not specified, the script will be generated automatically in the same directory as `--out`, and save it as `class_with_id.txt`; + +- If specified, the script will read but not add or overwrite. It will also check if there are any other classes in the `.txt` file and will give you an error if there are any. Please check the `.txt` file and add the new class and its `id`. + +An example `.txt` file looks like this (`id` start at `1`, just like COCO): + +```text +1 cat +2 dog +3 bicycle +4 motorcycle + +``` + +For example: + +Coonsider the `cat` dataset for this tutorial: + +```shell +python tools/dataset_converters/labelme2coco.py --img-dir ./data/cat/images \ + --labels-dir ./data/cat/labels \ + --out ./data/cat/annotations/annotations_all.json +``` + +For the `cat` dataset in this demo (note that we don't need to include the background class), we can see that the generated `class_with_id.txt` has only the `1` class: + +```text +1 cat + +``` + +### 3.2 Check the converted COCO label + +Using the following command, we can display the COCO label on the image, which will verify that there are no problems with the conversion: + +```shell +python tools/analysis_tools/browse_coco_json.py --img-dir ${image dir path} \ + --ann-file ${COCO label json path} +``` + +For example: + +```shell +python tools/analysis_tools/browse_coco_json.py --img-dir ./data/cat/images \ + --ann-file ./data/cat/annotations/annotations_all.json +``` + +
+Image +
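+
+Besides the visual check, a short script such as the following sketch (plain `json` handling of the standard COCO keys, not an MMYOLO tool) can confirm that the conversion produced consistent counts:
+
+```python
+# Optional sanity check of the converted COCO file.
+import json
+
+with open('./data/cat/annotations/annotations_all.json') as f:
+    coco = json.load(f)
+
+print('images:', len(coco['images']))
+print('annotations:', len(coco['annotations']))
+print('categories:', [c['name'] for c in coco['categories']])
+
+# every annotation should reference an existing image id
+image_ids = {img['id'] for img in coco['images']}
+assert all(ann['image_id'] in image_ids for ann in coco['annotations'])
+```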
+ +```{SeeAlso} +See [Visualizing COCO label](https://mmyolo.readthedocs.io/en/latest/user_guides/useful_tools.html#coco) for more information on `tools/analysis_tools/browse_coco_json.py`. +``` + +## 4. Divide dataset into training set, validation set and test set + +Usually, custom dataset is a large folder with full of images. We need to divide the dataset into training set, validation set and test set by ourselves. If the amount of data is small, we can not divide the validation set. Here's how the split script works: + +```shell +python tools/misc/coco_split.py --json ${COCO label json path} \ + --out-dir ${divide label json saved path} \ + --ratios ${ratio of division} \ + [--shuffle] \ + [--seed ${random seed for division}] +``` + +These include: + +- `--ratios`: ratio of division. If only 2 are set, the split is `trainval + test`, and if 3 are set, the split is `train + val + test`. Two formats are supported - integer and decimal: + + - Integer: divide the dataset in proportion after normalization. Example: `--ratio 2 1 1` (the code will convert to `0.5 0.25 0.25`) or `--ratio 3 1`(the code will convert to `0.75 0.25`) + + - Decimal: divide the dataset in proportion. **If the sum does not add up to 1, the script performs an automatic normalization correction.** Example: `--ratio 0.8 0.1 0.1` or `--ratio 0.8 0.2` + +- `--shuffle`: whether to shuffle the dataset before splitting. + +- `--seed`: the random seed of dataset division. If not set, this will be generated automatically. + +For example: + +```shell +python tools/misc/coco_split.py --json ./data/cat/annotations/annotations_all.json \ + --out-dir ./data/cat/annotations \ + --ratios 0.8 0.2 \ + --shuffle \ + --seed 10 +``` + +
+Image +
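+
+If you want to double-check the resulting split before moving on, a minimal sketch (plain `json`, file names as produced by the command above) is:
+
+```python
+# Count images in each split and verify that the requested 0.8 / 0.2 ratio roughly holds.
+import json
+
+
+def num_images(path):
+    with open(path) as f:
+        return len(json.load(f)['images'])
+
+
+n_trainval = num_images('./data/cat/annotations/trainval.json')
+n_test = num_images('./data/cat/annotations/test.json')
+print(n_trainval, n_test, round(n_trainval / (n_trainval + n_test), 2))
+```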
+ +## 5. Create a new config file based on the dataset + +Make sure the dataset directory looks like this: + +```shell +. +└── $DATA_ROOT + ├── annotations + │ ├── trainval.json # only divide into trainval + test according to the above commands; If you use 3 groups to divide the ratio, here is train.json、val.json、test.json + │ └── test.json + ├── images + │ ├── image1.jpg + │ ├── image1.png + │ └── ... + └── ... +``` + +Since this is custom dataset, we need to create a new config and add some information we want to change. + +About naming the new config: + +- This config inherits from `yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py`; +- We will train the class `cat` from the dataset provided with this tutorial (if you are using your own dataset, you can define the class name of your own dataset); +- The GPU tested in this tutorial is 1 x 3080Ti with 12G video memory, and the computer memory is 32G. The maximum batch size for YOLOv5-s training is `batch size = 32` (see the Appendix for detailed machine information); +- Training epoch is `100 epoch`. + +To sum up: you can name it `yolov5_s-v61_syncbn_fast_1xb32-100e_cat.py` and place it into the dir of `configs/custom_dataset`. + +Create a new directory named `custom_dataset` inside configs dir, and add config file with the following content: + +
+Image +
+ +```python +_base_ = '../yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py' + +max_epochs = 100 # maximum epochs for training +data_root = './data/cat/' # absolute path to the dataset directory +# data_root = '/root/workspace/mmyolo/data/cat/' # absolute path to the dataset dir inside the Docker container + +# the path of result save, can be omitted, omitted save file name is located under work_dirs with the same name of config file. +# If a config variable changes only part of its parameters, changing this variable will save the new training file elsewhere +work_dir = './work_dirs/yolov5_s-v61_syncbn_fast_1xb32-100e_cat' + +# load_from can specify a local path or URL, setting the URL will automatically download, because the above has been downloaded, we set the local path here +# since this tutorial is fine-tuning on the cat dataset, we need to use `load_from` to load the pre-trained model from MMYOLO. This allows for faster convergence and accuracy +load_from = './work_dirs/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth' # noqa + +# according to your GPU situation, modify the batch size, and YOLOv5-s defaults to 8 cards x 16bs +train_batch_size_per_gpu = 32 +train_num_workers = 4 # recommend to use train_num_workers = nGPU x 4 + +save_epoch_intervals = 2 # save weights every interval round + +# according to your GPU situation, modify the base_lr, modification ratio is base_lr_default * (your_bs / default_bs) +base_lr = _base_.base_lr / 4 + +anchors = [ # the anchor has been updated according to the characteristics of dataset. The generation of anchor will be explained in the following section. + [(68, 69), (154, 91), (143, 162)], # P3/8 + [(242, 160), (189, 287), (391, 207)], # P4/16 + [(353, 337), (539, 341), (443, 432)] # P5/32 +] + +class_name = ('cat', ) # according to the label information of class_with_id.txt, set the class_name +num_classes = len(class_name) +metainfo = dict( + classes=class_name, + palette=[(220, 20, 60)] # the color of drawing, free to set +) + +train_cfg = dict( + max_epochs=max_epochs, + val_begin=20, # number of epochs to start validation. Here 20 is set because the accuracy of the first 20 epochs is not high and the test is not meaningful, so it is skipped + val_interval=save_epoch_intervals # the test evaluation is performed iteratively every val_interval round +) + +model = dict( + bbox_head=dict( + head_module=dict(num_classes=num_classes), + prior_generator=dict(base_sizes=anchors), + + # loss_cls is dynamically adjusted based on num_classes, but when num_classes = 1, loss_cls is always 0 + loss_cls=dict(loss_weight=0.5 * + (num_classes / 80 * 3 / _base_.num_det_layers)))) + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + _delete_=True, + type='RepeatDataset', + # if the dataset is too small, you can use RepeatDataset, which repeats the current dataset n times per epoch, where 5 is set. 
+ times=5, + dataset=dict( + type=_base_.dataset_type, + data_root=data_root, + metainfo=metainfo, + ann_file='annotations/trainval.json', + data_prefix=dict(img='images/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=_base_.train_pipeline))) + +val_dataloader = dict( + dataset=dict( + metainfo=metainfo, + data_root=data_root, + ann_file='annotations/trainval.json', + data_prefix=dict(img='images/'))) + +test_dataloader = val_dataloader + +val_evaluator = dict(ann_file=data_root + 'annotations/trainval.json') +test_evaluator = val_evaluator + +optim_wrapper = dict(optimizer=dict(lr=base_lr)) + +default_hooks = dict( + # set how many epochs to save the model, and the maximum number of models to save,`save_best` is also the best model (recommended). + checkpoint=dict( + type='CheckpointHook', + interval=save_epoch_intervals, + max_keep_ckpts=5, + save_best='auto'), + param_scheduler=dict(max_epochs=max_epochs), + # logger output interval + logger=dict(type='LoggerHook', interval=10)) + +``` + +```{Note} +We put an identical config file in `projects/misc/custom_dataset/yolov5_s-v61_syncbn_fast_1xb32-100e_cat.py`. You can choose to copy to `configs/custom_dataset/yolov5_s-v61_syncbn_fast_1xb32-100e_cat.py` to start training directly. +``` + +## 6. Visual analysis of datasets + +The script `tools/analysis_tools/dataset_analysis.py` will helo you get a plot of your dataset. The script can generate four types of analysis graphs: + +- A distribution plot showing categories and the number of bbox instances: `show_bbox_num` +- A distribution plot showing categories and the width and height of bbox instances: `show_bbox_wh` +- A distribution plot showing categories and the width/height ratio of bbox instances: `show_bbox_wh_ratio` +- A distribution plot showing categories and the area of bbox instances based on the area rule: `show_bbox_area` + +Here's how the script works: + +```shell +python tools/analysis_tools/dataset_analysis.py ${CONFIG} \ + [--val-dataset ${TYPE}] \ + [--class-name ${CLASS_NAME}] \ + [--area-rule ${AREA_RULE}] \ + [--func ${FUNC}] \ + [--out-dir ${OUT_DIR}] +``` + +For example: + +Consider the config of `cat` dataset in this tutorial: + +Check the distribution of the training data: + +```shell +python tools/analysis_tools/dataset_analysis.py configs/custom_dataset/yolov5_s-v61_syncbn_fast_1xb32-100e_cat.py \ + --out-dir work_dirs/dataset_analysis_cat/train_dataset +``` + +Check the distribution of the validation data: + +```shell +python tools/analysis_tools/dataset_analysis.py configs/custom_dataset/yolov5_s-v61_syncbn_fast_1xb32-100e_cat.py \ + --out-dir work_dirs/dataset_analysis_cat/val_dataset \ + --val-dataset +``` + +Effect (click on the image to view a larger image): + + + + + + + + + + + + + + + + + + + + +
+ A distribution plot showing categories and the area of bbox instances based on the area rule + + A distribution plot showing categories and the width and height of bbox instances +
+ YOLOv5CocoDataset_bbox_area + + YOLOv5CocoDataset_bbox_wh +
+ A distribution plot showing categories and the number of bbox instances + + A distribution plot showing categories and the width/height ratio of bbox instances +
+ YOLOv5CocoDataset_bbox_num + + YOLOv5CocoDataset_bbox_ratio +
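+
+If you only need one of the four plots rather than all of them, the `--func` argument listed in the usage above can restrict the analysis. As a sketch (reusing the function names and output directory shown earlier), the bbox count distribution of the training set alone can be generated with:
+
+```shell
+python tools/analysis_tools/dataset_analysis.py configs/custom_dataset/yolov5_s-v61_syncbn_fast_1xb32-100e_cat.py \
+    --func show_bbox_num \
+    --out-dir work_dirs/dataset_analysis_cat/train_dataset
+```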
+
+```{Note}
+Because the cat dataset used in this tutorial is relatively small, we use RepeatDataset in the config, so the numbers shown in the plots are repeated five times. If you want a repeat-free analysis, you can temporarily change the `times` argument of RepeatDataset from `5` to `1`.
+```
+
+From the analysis output, we can conclude that the training set of the `cat` dataset used in this tutorial has the following characteristics:
+
+- All objects are `large object`s according to the area rule;
+- The number of instances of the category `cat` is `655`;
+- The width/height ratio of the bboxes is mostly concentrated in `1.0 ~ 1.11`, with a minimum ratio of `0.36` and a maximum ratio of `2.9`;
+- The width of the bboxes is about `500 ~ 600`, and the height is about `500 ~ 600`.
+
+```{SeeAlso}
+See [Visualizing Dataset Analysis](https://mmyolo.readthedocs.io/en/latest/user_guides/useful_tools.html#id4) for more information on `tools/analysis_tools/dataset_analysis.py`
+```
+
+## 7. Optimize Anchor size
+
+```{Warning}
+This step only works for anchor-based models such as YOLOv5;
+
+This step can be skipped for anchor-free models such as YOLOv6 and YOLOX.
+```
+
+The `tools/analysis_tools/optimize_anchors.py` script supports three anchor generation methods used in the YOLO series: `k-means`, `Differential Evolution` and `v5-k-means`.
+
+In this tutorial, we train YOLOv5 with an input size of `640 x 640` and use `v5-k-means` to optimize the anchors:
+
+```shell
+python tools/analysis_tools/optimize_anchors.py configs/custom_dataset/yolov5_s-v61_syncbn_fast_1xb32-100e_cat.py \
+    --algorithm v5-k-means \
+    --input-shape 640 640 \
+    --prior-match-thr 4.0 \
+    --out-dir work_dirs/dataset_analysis_cat
+```
+
+```{Note}
+Because this command uses k-means clustering, there is some randomness related to the initialization, so the anchors obtained by each run will differ slightly. They are always generated from the dataset you pass in, so this has no adverse effect.
+```
+
+The calculated anchors are as follows:
+
+<div align=center>
+Anchor +
+
+Modify the `anchors` variable in the config file:
+
+```python
+anchors = [
+    [(68, 69), (154, 91), (143, 162)],  # P3/8
+    [(242, 160), (189, 287), (391, 207)],  # P4/16
+    [(353, 337), (539, 341), (443, 432)]  # P5/32
+]
+```
+
+```{SeeAlso}
+See [Optimize Anchor Sizes](https://mmyolo.readthedocs.io/en/latest/user_guides/useful_tools.html#id8) for more information on `tools/analysis_tools/optimize_anchors.py`
+```
+
+## 8. Visualize the data processing part of the config
+
+The script `tools/analysis_tools/browse_dataset.py` lets you visualize the data processing pipeline defined in the config directly in pop-up windows, with the option to save the visualizations to a specific directory.
+
+Let's use the config file we just created, `configs/custom_dataset/yolov5_s-v61_syncbn_fast_1xb32-100e_cat.py`, to visualize the images. Each image is displayed for `3` seconds, and the images are not saved:
+
+```shell
+python tools/analysis_tools/browse_dataset.py configs/custom_dataset/yolov5_s-v61_syncbn_fast_1xb32-100e_cat.py \
+    --show-interval 3
+```
+
+<div align=center>
+image +
+ +
+image +
+
+```{SeeAlso}
+See [Visualizing Datasets](https://mmyolo.readthedocs.io/en/latest/user_guides/useful_tools.html#id3) for more information on `tools/analysis_tools/browse_dataset.py`
+```
+
+## 9. Train
+
+This section covers three points:
+
+1. Training visualization
+2. YOLOv5 model training
+3. Switching to other YOLO models for training
+
+### 9.1 Training visualization
+
+If you want to visualize the training process in a browser, MMYOLO currently offers two options, [wandb](https://wandb.ai/site) and [TensorBoard](https://tensorflow.google.cn/tensorboard). Pick one according to your own situation (support for more visualization backends will be added in the future).
+
+#### 9.1.1 wandb
+
+To use wandb visualization, you first need to register an account on the [wandb website](https://wandb.ai/site) and then obtain your API Keys from https://wandb.ai/settings.
+
+<div align=center>
+image +
+
+Then install wandb and log in from the command line:
+
+```shell
+pip install wandb
+# after running `wandb login`, enter the API Keys obtained above to complete the login
+wandb login
+```
+
+<div align=center>
+Image +
+
+Add the `wandb` configuration at the end of the config file we just created, `configs/custom_dataset/yolov5_s-v61_syncbn_fast_1xb32-100e_cat.py`:
+
+```python
+visualizer = dict(vis_backends=[dict(type='LocalVisBackend'), dict(type='WandbVisBackend')])
+```
+
+#### 9.1.2 TensorBoard
+
+Install the TensorBoard environment:
+
+```shell
+pip install tensorboard
+```
+
+Add the `tensorboard` configuration at the end of the config file we just created, `configs/custom_dataset/yolov5_s-v61_syncbn_fast_1xb32-100e_cat.py`:
+
+```python
+visualizer = dict(vis_backends=[dict(type='LocalVisBackend'), dict(type='TensorboardVisBackend')])
+```
+
+After running the training command, TensorBoard files will be generated in the visualization folder `work_dirs/yolov5_s-v61_syncbn_fast_1xb32-100e_cat/${TIMESTAMP}/vis_data`. We can then view the loss, learning rate, and coco/bbox_mAP curves in a browser by running the following command:
+
+```shell
+tensorboard --logdir=work_dirs/yolov5_s-v61_syncbn_fast_1xb32-100e_cat
+```
+
+### 9.2 Perform training
+
+Let's start the training with the following command (training takes about 2.5 hours):
+
+```shell
+python tools/train.py configs/custom_dataset/yolov5_s-v61_syncbn_fast_1xb32-100e_cat.py
+```
+
+If you have enabled wandb, you can log in to your account to view the details of this training run:
+
+<div align=center>
+Image +
+ +
+Image +
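+
+If the training process is interrupted, it can usually be resumed from the latest checkpoint in the work directory instead of starting over. A sketch, assuming the standard MMEngine-style `--resume` option of `tools/train.py` (check `python tools/train.py --help` if your version differs):
+
+```shell
+python tools/train.py configs/custom_dataset/yolov5_s-v61_syncbn_fast_1xb32-100e_cat.py --resume
+```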
+
+The following is the accuracy obtained with the best-precision weights `work_dirs/yolov5_s-v61_syncbn_fast_1xb32-100e_cat/best_coco/bbox_mAP_epoch_98.pth` after training for `100 epochs` with `batch size = 32` on `1 x 3080Ti` (see the Appendix for detailed machine information):
+
+```shell
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.968
+ Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 1.000
+ Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 1.000
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = -1.000
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = -1.000
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.968
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.886
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.977
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.977
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = -1.000
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = -1.000
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.977
+
+bbox_mAP_copypaste: 0.968 1.000 1.000 -1.000 -1.000 0.968
+Epoch(val) [98][116/116]  coco/bbox_mAP: 0.9680  coco/bbox_mAP_50: 1.0000  coco/bbox_mAP_75: 1.0000  coco/bbox_mAP_s: -1.0000  coco/bbox_mAP_m: -1.0000  coco/bbox_mAP_l: 0.9680
+```
+
+```{Tip}
+As a general fine-tuning best practice, it is usually recommended to freeze the backbone and scale the learning rate `lr` accordingly. However, in this tutorial we found that this approach falls somewhat short. The likely reason is that the `cat` category already exists in the COCO dataset, and the cat dataset used in this tutorial is relatively small.
+```
+
+The following table shows the test accuracy of the MMYOLO YOLOv5 pre-trained model `yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth` on the cat dataset without fine-tuning.
It can be seen that the mAP of the `cat` category is only `0.866`, which improve to `0.968` after finetune, improved by '10.2%', which proves that the training was very successful: + +```shell ++---------------+-------+--------------+-----+----------------+------+ +| category | AP | category | AP | category | AP | ++---------------+-------+--------------+-----+----------------+------+ +| person | nan | bicycle | nan | car | nan | +| motorcycle | nan | airplane | nan | bus | nan | +| train | nan | truck | nan | boat | nan | +| traffic light | nan | fire hydrant | nan | stop sign | nan | +| parking meter | nan | bench | nan | bird | nan | +| cat | 0.866 | dog | nan | horse | nan | +| sheep | nan | cow | nan | elephant | nan | +| bear | nan | zebra | nan | giraffe | nan | +| backpack | nan | umbrella | nan | handbag | nan | +| tie | nan | suitcase | nan | frisbee | nan | +| skis | nan | snowboard | nan | sports ball | nan | +| kite | nan | baseball bat | nan | baseball glove | nan | +| skateboard | nan | surfboard | nan | tennis racket | nan | +| bottle | nan | wine glass | nan | cup | nan | +| fork | nan | knife | nan | spoon | nan | +| bowl | nan | banana | nan | apple | nan | +| sandwich | nan | orange | nan | broccoli | nan | +| carrot | nan | hot dog | nan | pizza | nan | +| donut | nan | cake | nan | chair | nan | +| couch | nan | potted plant | nan | bed | nan | +| dining table | nan | toilet | nan | tv | nan | +| laptop | nan | mouse | nan | remote | nan | +| keyboard | nan | cell phone | nan | microwave | nan | +| oven | nan | toaster | nan | sink | nan | +| refrigerator | nan | book | nan | clock | nan | +| vase | nan | scissors | nan | teddy bear | nan | +| hair drier | nan | toothbrush | nan | None | None | ++---------------+-------+--------------+-----+----------------+------+ +``` + +```{SeeAlso} +For details on how to get the accuracy of the pre-trained weights, see the appendix【2. How to test the accuracy of dataset on pre-trained weights】 +``` + +### 9.3 Switch other models in MMYOLO + +MMYOLO integrates multiple YOLO algorithms, which makes switching between YOLO models very easy. There is no need to reacquaint with a new repo. You can easily switch between YOLO models by simply modifying the config file: + +1. Create a new config file +2. Download the pre-trained weights +3. Starting training + +Let's take YOLOv6-s as an example. + +1. Create a new config file: + +```python +_base_ = '../yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco.py' + +max_epochs = 100 # maximum of training epoch +data_root = './data/cat/' # absolute path to the dataset directory + +# the path of result save, can be omitted, omitted save file name is located under work_dirs with the same name of config file. +# If a config variable changes only part of its parameters, changing this variable will save the new training file elsewhere +work_dir = './work_dirs/yolov6_s_syncbn_fast_1xb32-100e_cat' + +# load_from can specify a local path or URL, setting the URL will automatically download, because the above has been downloaded, we set the local path here +# since this tutorial is fine-tuning on the cat dataset, we need to use `load_from` to load the pre-trained model from MMYOLO. 
This allows for faster convergence and accuracy +load_from = './work_dirs/yolov6_s_syncbn_fast_8xb32-400e_coco_20221102_203035-932e1d91.pth' # noqa + +# according to your GPU situation, modify the batch size, and YOLOv6-s defaults to 8 cards x 32bs +train_batch_size_per_gpu = 32 +train_num_workers = 4 # recommend to use train_num_workers = nGPU x 4 + +save_epoch_intervals = 2 # save weights every interval round + +# according to your GPU situation, modify the base_lr, modification ratio is base_lr_default * (your_bs / default_bs) +base_lr = _base_.base_lr / 8 + +class_name = ('cat', ) # according to the label information of class_with_id.txt, set the class_name +num_classes = len(class_name) +metainfo = dict( + classes=class_name, + palette=[(220, 20, 60)] # the color of drawing, free to set +) + +train_cfg = dict( + max_epochs=max_epochs, + val_begin=20, # number of epochs to start validation. Here 20 is set because the accuracy of the first 20 epochs is not high and the test is not meaningful, so it is skipped + val_interval=save_epoch_intervals, # the test evaluation is performed iteratively every val_interval round + dynamic_intervals=[(max_epochs - _base_.num_last_epochs, 1)] +) + +model = dict( + bbox_head=dict( + head_module=dict(num_classes=num_classes)), + train_cfg=dict( + initial_assigner=dict(num_classes=num_classes), + assigner=dict(num_classes=num_classes)) +) + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + _delete_=True, + type='RepeatDataset', + # if the dataset is too small, you can use RepeatDataset, which repeats the current dataset n times per epoch, where 5 is set. + times=5, + dataset=dict( + type=_base_.dataset_type, + data_root=data_root, + metainfo=metainfo, + ann_file='annotations/trainval.json', + data_prefix=dict(img='images/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=_base_.train_pipeline))) + +val_dataloader = dict( + dataset=dict( + metainfo=metainfo, + data_root=data_root, + ann_file='annotations/trainval.json', + data_prefix=dict(img='images/'))) + +test_dataloader = val_dataloader + +val_evaluator = dict(ann_file=data_root + 'annotations/trainval.json') +test_evaluator = val_evaluator + +optim_wrapper = dict(optimizer=dict(lr=base_lr)) + +default_hooks = dict( + # set how many epochs to save the model, and the maximum number of models to save,`save_best` is also the best model (recommended). + checkpoint=dict( + type='CheckpointHook', + interval=save_epoch_intervals, + max_keep_ckpts=5, + save_best='auto'), + param_scheduler=dict(max_epochs=max_epochs), + # logger output interval + logger=dict(type='LoggerHook', interval=10)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - _base_.num_last_epochs, + switch_pipeline=_base_.train_pipeline_stage2) +] + +``` + +```{Note} +Similarly, We put an identical config file in `projects/misc/custom_dataset/yolov6_s_syncbn_fast_1xb32-100e_cat.py`. You can choose to copy to `configs/custom_dataset/yolov6_s_syncbn_fast_1xb32-100e_cat.py` to start training directly. + +Even though the new config looks like a lot of stuff, it's actually a lot of duplication. You can use a comparison software to see that most of the configuration is identical to 'yolov5_s-v61_syncbn_fast_1xb32-100e_cat.py'. 
Because the two config files need to inherit from different config files, you still need to add the necessary configuration. +``` + +2. Download the pre-trained weights + +```bash +wget https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco/yolov6_s_syncbn_fast_8xb32-400e_coco_20221102_203035-932e1d91.pth -P work_dirs/ +``` + +3. Starting training + +```shell +python tools/train.py configs/custom_dataset/yolov6_s_syncbn_fast_1xb32-100e_cat.py +``` + +In my experiments, the best model is `work_dirs/yolov6_s_syncbn_fast_1xb32-100e_cat/best_coco/bbox_mAP_epoch_96.pth`,which accuracy is as follows: + +```bash + Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.987 + Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 1.000 + Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 1.000 + Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = -1.000 + Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = -1.000 + Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.987 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.895 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.989 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.989 + Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = -1.000 + Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = -1.000 + Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.989 + +bbox_mAP_copypaste: 0.987 1.000 1.000 -1.000 -1.000 0.987 +Epoch(val) [96][116/116] coco/bbox_mAP: 0.9870 coco/bbox_mAP_50: 1.0000 coco/bbox_mAP_75: 1.0000 coco/bbox_mAP_s: -1.0000 coco/bbox_mAP_m: -1.0000 coco/bbox_mAP_l: 0.9870 +``` + +The above demonstrates how to switch models in MMYOLO, you can quickly compare the accuracy of different models, and the model with high accuracy can be put into production. In my experiment, the best accuracy of YOLOv6 `0.9870` is `1.9 %` higher than the best accuracy of YOLOv5 `0.9680` , so we will use YOLOv6 for explanation. + +## 10. Inference + +Using the best model for inference, the best model path in the following command is `./work_dirs/yolov6_s_syncbn_fast_1xb32-100e_cat/best_coco/bbox_mAP_epoch_96.pth`, please modify the best model path you trained. + +```shell +python demo/image_demo.py ./data/cat/images \ + ./configs/custom_dataset/yolov6_s_syncbn_fast_1xb32-100e_cat.py \ + ./work_dirs/yolov6_s_syncbn_fast_1xb32-100e_cat/best_coco/bbox_mAP_epoch_96.pth \ + --out-dir ./data/cat/pred_images +``` + +
+Image +
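+
+To spot-check a single image rather than the whole folder, you can pass an image file as the first argument (assuming `demo/image_demo.py` accepts a single image path as well as a directory; the file name below is one of the images referenced later in this tutorial):
+
+```shell
+python demo/image_demo.py ./data/cat/images/mmexport1633684751291.jpg \
+    ./configs/custom_dataset/yolov6_s_syncbn_fast_1xb32-100e_cat.py \
+    ./work_dirs/yolov6_s_syncbn_fast_1xb32-100e_cat/best_coco/bbox_mAP_epoch_96.pth \
+    --out-dir ./data/cat/pred_images
+```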
+
+```{Tip}
+If the inference results are not ideal, there are usually two cases:
+
+1. Model underfitting:
+
+   First, determine whether the model is underfitting because it has not been trained for enough epochs. If so, increase `max_epochs` and change `work_dir` in the config file (or create a new config file named as above) and start the training again.
+
+2. The dataset needs to be optimized:
+   If adding epochs still does not help, enlarge the dataset and re-examine and refine its annotations before retraining.
+```
+
+## 11. Deployment
+
+MMYOLO provides two deployment options:
+
+1. Deployment with the [MMDeploy](https://github.com/open-mmlab/mmdeploy) framework
+2. Deployment with `projects/easydeploy`
+
+### 11.1 MMDeploy framework for deployment
+
+Given the wide variety of deployment machines, a setup that works on a local machine often does not work in production. We therefore recommend using Docker, so that the environment only needs to be built once and can then be reused, saving the operation and maintenance effort of rebuilding the environment for every production deployment.
+
+In this part, we will introduce the following steps:
+
+1. Building a Docker image
+2. Creating a Docker container
+3. Transforming TensorRT models
+4. Deploying model and performing inference
+
+```{SeeAlso}
+If you are not familiar with Docker, you can refer to the MMDeploy [source manual installation](https://mmdeploy.readthedocs.io/en/latest/01-how-to-build/build_from_source.html) documentation and compile directly on your local machine instead. Once installed, you can skip to【11.1.3 Transforming TensorRT models】
+```
+
+#### 11.1.1 Building a Docker image
+
+```shell
+git clone -b dev-1.x https://github.com/open-mmlab/mmdeploy.git
+cd mmdeploy
+docker build docker/GPU/ -t mmdeploy:gpu --build-arg USE_SRC_INSIDE=true
+```
+
+Here `USE_SRC_INSIDE=true` switches the base image to a domestic (Chinese) mirror source before pulling dependencies, which makes the build faster.
+
+After executing the script, the build will start, which will take a while:
+
+<div align=center>
+Image +
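+
+Once the build finishes, a quick sanity check with the standard Docker CLI confirms that the image is available (the exact output depends on your machine):
+
+```shell
+docker images | grep mmdeploy
+```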
+ +#### 11.1.2 Creating a Docker container + +```shell +export MMYOLO_PATH=/path/to/local/mmyolo # write the path to MMYOLO on your machine to an environment variable +docker run --gpus all --name mmyolo-deploy -v ${MMYOLO_PATH}:/root/workspace/mmyolo -it mmdeploy:gpu /bin/bash +``` + +
+Image +
+ +You can see your local MMYOLO environment mounted inside the container + +
+Image +
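+
+Note that if you exit the container shell later, the container stops but is not deleted. You can restart and re-enter it with the standard Docker commands instead of creating it again:
+
+```shell
+docker start mmyolo-deploy
+docker exec -it mmyolo-deploy /bin/bash
+```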
+ +```{SeeAlso} +You can read more about this in the MMDeploy official documentation [Using Docker Images](https://mmdeploy.readthedocs.io/en/latest/01-how-to-build/build_from_docker.html#docker) +``` + +#### 11.1.3 Transforming TensorRT models + +The first step is to install MMYOLO and `pycuda` in a Docker container: + +```shell +export MMYOLO_PATH=/root/workspace/mmyolo # path in the image, which doesn't need to modify +cd ${MMYOLO_PATH} +export MMYOLO_VERSION=$(python -c "import mmyolo.version as v; print(v.__version__)") # Check the version number of MMYOLO used for training +echo "Using MMYOLO ${MMYOLO_VERSION}" +mim install --no-cache-dir mmyolo==${MMYOLO_VERSION} +pip install --no-cache-dir pycuda==2022.2 +``` + +Performing model transformations + +```shell +cd /root/workspace/mmdeploy +python ./tools/deploy.py \ + ${MMYOLO_PATH}/configs/deploy/detection_tensorrt-fp16_dynamic-192x192-960x960.py \ + ${MMYOLO_PATH}/configs/custom_dataset/yolov6_s_syncbn_fast_1xb32-100e_cat.py \ + ${MMYOLO_PATH}/work_dirs/yolov6_s_syncbn_fast_1xb32-100e_cat/best_coco/bbox_mAP_epoch_96.pth \ + ${MMYOLO_PATH}/data/cat/images/mmexport1633684751291.jpg \ + --test-img ${MMYOLO_PATH}/data/cat/images/mmexport1633684751291.jpg \ + --work-dir ./work_dir/yolov6_s_syncbn_fast_1xb32-100e_cat_deploy_dynamic_fp16 \ + --device cuda:0 \ + --log-level INFO \ + --show \ + --dump-info +``` + +
+Image +
+
+Wait a few minutes; when `All process success.` appears, the conversion has succeeded:
+
+<div align=center>
+Image +
+ +Looking at the exported path, you can see the file structure as shown in the following screenshot: + +```shell +$WORK_DIR + ├── deploy.json + ├── detail.json + ├── end2end.engine + ├── end2end.onnx + └── pipeline.json +``` + +```{SeeAlso} +For a detailed description of transforming models, see [How to Transform Models](https://mmdeploy.readthedocs.io/en/latest/02-how-to-run/convert_model.html) +``` + +#### 11.1.4 Deploying model and performing inference + +We need to change the `data_root` in `${MMYOLO_PATH}/configs/custom_dataset/yolov6_s_syncbn_fast_1xb32-100e_cat.py` to the path in the Docker container: + +```python +data_root = '/root/workspace/mmyolo/data/cat/' # absolute path of the dataset dir in the Docker container. +``` + +Execute speed and accuracy tests: + +```shell +python tools/test.py \ + ${MMYOLO_PATH}/configs/deploy/detection_tensorrt-fp16_dynamic-192x192-960x960.py \ + ${MMYOLO_PATH}/configs/custom_dataset/yolov6_s_syncbn_fast_1xb32-100e_cat.py \ + --model ./work_dir/yolov6_s_syncbn_fast_1xb32-100e_cat_deploy_dynamic_fp16/end2end.engine \ + --speed-test \ + --device cuda +``` + +The speed test is as follows, we can see that the average inference speed is `24.10ms`, which is a speed improvement compared to PyTorch inference, but also reduce lots of video memory usage: + +```shell +Epoch(test) [ 10/116] eta: 0:00:20 time: 0.1919 data_time: 0.1330 memory: 12 +Epoch(test) [ 20/116] eta: 0:00:15 time: 0.1220 data_time: 0.0939 memory: 12 +Epoch(test) [ 30/116] eta: 0:00:12 time: 0.1168 data_time: 0.0850 memory: 12 +Epoch(test) [ 40/116] eta: 0:00:10 time: 0.1241 data_time: 0.0940 memory: 12 +Epoch(test) [ 50/116] eta: 0:00:08 time: 0.0974 data_time: 0.0696 memory: 12 +Epoch(test) [ 60/116] eta: 0:00:06 time: 0.0865 data_time: 0.0547 memory: 16 +Epoch(test) [ 70/116] eta: 0:00:05 time: 0.1521 data_time: 0.1226 memory: 16 +Epoch(test) [ 80/116] eta: 0:00:04 time: 0.1364 data_time: 0.1056 memory: 12 +Epoch(test) [ 90/116] eta: 0:00:03 time: 0.0923 data_time: 0.0627 memory: 12 +Epoch(test) [100/116] eta: 0:00:01 time: 0.0844 data_time: 0.0583 memory: 12 +[tensorrt]-110 times per count: 24.10 ms, 41.50 FPS +Epoch(test) [110/116] eta: 0:00:00 time: 0.1085 data_time: 0.0832 memory: 12 +``` + +Accuracy test is as follows. 
This configuration uses FP16 inference, which trades a small accuracy drop for higher speed and lower video-memory usage:
+
+```shell
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.954
+ Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 1.000
+ Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.975
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = -1.000
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = -1.000
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.954
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.860
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.965
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.965
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = -1.000
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = -1.000
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.965
+
+INFO - bbox_mAP_copypaste: 0.954 1.000 0.975 -1.000 -1.000 0.954
+INFO - Epoch(test) [116/116]  coco/bbox_mAP: 0.9540  coco/bbox_mAP_50: 1.0000  coco/bbox_mAP_75: 0.9750  coco/bbox_mAP_s: -1.0000  coco/bbox_mAP_m: -1.0000  coco/bbox_mAP_l: 0.9540
+```
+
+Deploy the model and run the inference demo:
+
+```{Note}
+You can use the MMDeploy SDK for deployment and use C++ to further improve inference speed.
+```
+
+```shell
+cd ${MMYOLO_PATH}/demo
+python deploy_demo.py \
+    ${MMYOLO_PATH}/data/cat/images/mmexport1633684900217.jpg \
+    ${MMYOLO_PATH}/configs/custom_dataset/yolov6_s_syncbn_fast_1xb32-100e_cat.py \
+    /root/workspace/mmdeploy/work_dir/yolov6_s_syncbn_fast_1xb32-100e_cat_deploy_dynamic_fp16/end2end.engine \
+    --deploy-cfg ${MMYOLO_PATH}/configs/deploy/detection_tensorrt-fp16_dynamic-192x192-960x960.py \
+    --out-dir ${MMYOLO_PATH}/work_dirs/deploy_predict_out \
+    --device cuda:0 \
+    --score-thr 0.5
+```
+
+```{Warning}
+The script `deploy_demo.py` does not support batch inference yet, and its pre-processing code still needs to be improved, so it cannot fully reflect the inference speed at the moment and only demonstrates the inference results. We will optimize it in the future.
+```
+
+After executing, you can see the inference image results in `--out-dir`:
+
+<div align=center>
+Image +
+ +```{Note} +You can also use other optimizations like increasing batch size, int8 quantization, etc. +``` + +#### 11.1.5 Save and load the Docker container + +It would be a waste of time to build a docker image every time. At this point you can consider using docker's packaging api for packaging and loading. + +```shell +# save, the result tar package can be placed on mobile hard disk +docker save mmyolo-deploy > mmyolo-deploy.tar + +# load image to system +docker load < /path/to/mmyolo-deploy.tar +``` + +### 11.2 Using `projects/easydeploy` to deploy + +```{SeeAlso} +See [deployment documentation](https://github.com/open-mmlab/mmyolo/blob/dev/projects/easydeploy/README.md) for details. +``` + +TODO: This part will be improved in the next version... + +## Appendix + +### 1. The detailed environment for training the machine in this tutorial is as follows: + +```shell +sys.platform: linux +Python: 3.9.13 | packaged by conda-forge | (main, May 27 2022, 16:58:50) [GCC 10.3.0] +CUDA available: True +numpy_random_seed: 2147483648 +GPU 0: NVIDIA GeForce RTX 3080 Ti +CUDA_HOME: /usr/local/cuda +NVCC: Cuda compilation tools, release 11.5, V11.5.119 +GCC: gcc (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0 +PyTorch: 1.10.0 +PyTorch compiling details: PyTorch built with: + - GCC 7.3 + - C++ Version: 201402 + - Intel(R) oneAPI Math Kernel Library Version 2021.4-Product Build 20210904 for Intel(R) 64 architecture applications + - Intel(R) MKL-DNN v2.2.3 (Git Hash 7336ca9f055cf1bfa13efb658fe15dc9b41f0740) + - OpenMP 201511 (a.k.a. OpenMP 4.5) + - LAPACK is enabled (usually provided by MKL) + - NNPACK is enabled + - CPU capability usage: AVX2 + - CUDA Runtime 11.3 + - NVCC architecture flags: -gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode; + arch=compute_60,code=sm_60;-gencode;arch=compute_61,code=sm_61;-gencode;arch=compute_70,code=sm_70; + -gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode; + arch=compute_86,code=sm_86;-gencode;arch=compute_37,code=compute_37 + - CuDNN 8.2 + - Magma 2.5.2 + - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=11.3, CUDNN_VERSION=8.2.0, + CXX_COMPILER=/opt/rh/devtoolset-7/root/usr/bin/c++, CXX_FLAGS= -Wno-deprecated -fvisibility-inlines-hidden + -DUSE_PTHREADPOOL -fopenmp -DNDEBUG -DUSE_KINETO -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK + -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -DEDGE_PROFILER_USE_KINETO -O2 -fPIC -Wno-narrowing -Wall -Wextra + -Werror=return-type -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas + -Wno-sign-compare -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic + -Wno-error=redundant-decls -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new + -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format + -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, + TORCH_VERSION=1.10.0, USE_CUDA=ON, USE_CUDNN=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, + USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=ON, USE_NNPACK=ON, USE_OPENMP=ON, + +TorchVision: 0.11.0 +OpenCV: 4.6.0 +MMEngine: 0.3.1 +MMCV: 2.0.0rc3 +MMDetection: 3.0.0rc3 +MMYOLO: 0.2.0+cf279a5 +``` + +### 2. How to test the accuracy of our dataset on the pre-trained weights: + +```{Warning} +Premise: The class is in the COCO 80 class! 
+``` + +In this part, we will use the `cat` dataset as an example, using: + +- config file: `configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py` +- weight `yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth` + +1. modify the path in config file + +Because `configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py` is inherited from `configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py`. Therefore, you can mainly modify the `configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py` file. + +| before modification | after modification | +| ------------------------------------------------------------------------------ | --------------------------------------------------------------------------- | +| `data_root = 'data/coco/'` | `data_root = './data/cat/'` | +| `ann_file='annotations/instances_train2017.json'` | `ann_file='annotations/trainval.json'` | +| data_prefix=dict(img='train2017/')\` | `data_prefix=dict(img='images/')` | +| `val_evaluator` of `ann_file=data_root + 'annotations/instances_val2017.json'` | `val_evaluator` of `dict(ann_file=data_root + 'annotations/trainval.json')` | + +2. modify label + +```{note} +It is recommended to make a copy of the label directly to prevent damage to original label +``` + +Change the `categories` in `trainval.json` to COCO's original: + +```json + "categories": [{"supercategory": "person","id": 1,"name": "person"},{"supercategory": "vehicle","id": 2,"name": "bicycle"},{"supercategory": "vehicle","id": 3,"name": "car"},{"supercategory": "vehicle","id": 4,"name": "motorcycle"},{"supercategory": "vehicle","id": 5,"name": "airplane"},{"supercategory": "vehicle","id": 6,"name": "bus"},{"supercategory": "vehicle","id": 7,"name": "train"},{"supercategory": "vehicle","id": 8,"name": "truck"},{"supercategory": "vehicle","id": 9,"name": "boat"},{"supercategory": "outdoor","id": 10,"name": "traffic light"},{"supercategory": "outdoor","id": 11,"name": "fire hydrant"},{"supercategory": "outdoor","id": 13,"name": "stop sign"},{"supercategory": "outdoor","id": 14,"name": "parking meter"},{"supercategory": "outdoor","id": 15,"name": "bench"},{"supercategory": "animal","id": 16,"name": "bird"},{"supercategory": "animal","id": 17,"name": "cat"},{"supercategory": "animal","id": 18,"name": "dog"},{"supercategory": "animal","id": 19,"name": "horse"},{"supercategory": "animal","id": 20,"name": "sheep"},{"supercategory": "animal","id": 21,"name": "cow"},{"supercategory": "animal","id": 22,"name": "elephant"},{"supercategory": "animal","id": 23,"name": "bear"},{"supercategory": "animal","id": 24,"name": "zebra"},{"supercategory": "animal","id": 25,"name": "giraffe"},{"supercategory": "accessory","id": 27,"name": "backpack"},{"supercategory": "accessory","id": 28,"name": "umbrella"},{"supercategory": "accessory","id": 31,"name": "handbag"},{"supercategory": "accessory","id": 32,"name": "tie"},{"supercategory": "accessory","id": 33,"name": "suitcase"},{"supercategory": "sports","id": 34,"name": "frisbee"},{"supercategory": "sports","id": 35,"name": "skis"},{"supercategory": "sports","id": 36,"name": "snowboard"},{"supercategory": "sports","id": 37,"name": "sports ball"},{"supercategory": "sports","id": 38,"name": "kite"},{"supercategory": "sports","id": 39,"name": "baseball bat"},{"supercategory": "sports","id": 40,"name": "baseball glove"},{"supercategory": "sports","id": 41,"name": "skateboard"},{"supercategory": "sports","id": 42,"name": "surfboard"},{"supercategory": "sports","id": 43,"name": "tennis racket"},{"supercategory": "kitchen","id": 
44,"name": "bottle"},{"supercategory": "kitchen","id": 46,"name": "wine glass"},{"supercategory": "kitchen","id": 47,"name": "cup"},{"supercategory": "kitchen","id": 48,"name": "fork"},{"supercategory": "kitchen","id": 49,"name": "knife"},{"supercategory": "kitchen","id": 50,"name": "spoon"},{"supercategory": "kitchen","id": 51,"name": "bowl"},{"supercategory": "food","id": 52,"name": "banana"},{"supercategory": "food","id": 53,"name": "apple"},{"supercategory": "food","id": 54,"name": "sandwich"},{"supercategory": "food","id": 55,"name": "orange"},{"supercategory": "food","id": 56,"name": "broccoli"},{"supercategory": "food","id": 57,"name": "carrot"},{"supercategory": "food","id": 58,"name": "hot dog"},{"supercategory": "food","id": 59,"name": "pizza"},{"supercategory": "food","id": 60,"name": "donut"},{"supercategory": "food","id": 61,"name": "cake"},{"supercategory": "furniture","id": 62,"name": "chair"},{"supercategory": "furniture","id": 63,"name": "couch"},{"supercategory": "furniture","id": 64,"name": "potted plant"},{"supercategory": "furniture","id": 65,"name": "bed"},{"supercategory": "furniture","id": 67,"name": "dining table"},{"supercategory": "furniture","id": 70,"name": "toilet"},{"supercategory": "electronic","id": 72,"name": "tv"},{"supercategory": "electronic","id": 73,"name": "laptop"},{"supercategory": "electronic","id": 74,"name": "mouse"},{"supercategory": "electronic","id": 75,"name": "remote"},{"supercategory": "electronic","id": 76,"name": "keyboard"},{"supercategory": "electronic","id": 77,"name": "cell phone"},{"supercategory": "appliance","id": 78,"name": "microwave"},{"supercategory": "appliance","id": 79,"name": "oven"},{"supercategory": "appliance","id": 80,"name": "toaster"},{"supercategory": "appliance","id": 81,"name": "sink"},{"supercategory": "appliance","id": 82,"name": "refrigerator"},{"supercategory": "indoor","id": 84,"name": "book"},{"supercategory": "indoor","id": 85,"name": "clock"},{"supercategory": "indoor","id": 86,"name": "vase"},{"supercategory": "indoor","id": 87,"name": "scissors"},{"supercategory": "indoor","id": 88,"name": "teddy bear"},{"supercategory": "indoor","id": 89,"name": "hair drier"},{"supercategory": "indoor","id": 90,"name": "toothbrush"}], +``` + +Also, change the `category_id` in the `annotations` to the `id` corresponding to COCO, for example, `cat` is `17` in this example. Here are some of the results: + +```json + "annotations": [ + { + "iscrowd": 0, + "category_id": 17, # This "category_id" is changed to the id corresponding to COCO, for example, cat is 17 + "id": 32, + "image_id": 32, + "bbox": [ + 822.49072265625, + 958.3897094726562, + 1513.693115234375, + 988.3231811523438 + ], + "area": 1496017.9949368387, + "segmentation": [ + [ + 822.49072265625, + 958.3897094726562, + 822.49072265625, + 1946.712890625, + 2336.183837890625, + 1946.712890625, + 2336.183837890625, + 958.3897094726562 + ] + ] + } + ] +``` + +3. 
executive command + +```shell +python tools\test.py configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py \ + work_dirs/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth \ + --cfg-options test_evaluator.classwise=True +``` + +After executing it, we can see the test metrics: + +```shell ++---------------+-------+--------------+-----+----------------+------+ +| category | AP | category | AP | category | AP | ++---------------+-------+--------------+-----+----------------+------+ +| person | nan | bicycle | nan | car | nan | +| motorcycle | nan | airplane | nan | bus | nan | +| train | nan | truck | nan | boat | nan | +| traffic light | nan | fire hydrant | nan | stop sign | nan | +| parking meter | nan | bench | nan | bird | nan | +| cat | 0.866 | dog | nan | horse | nan | +| sheep | nan | cow | nan | elephant | nan | +| bear | nan | zebra | nan | giraffe | nan | +| backpack | nan | umbrella | nan | handbag | nan | +| tie | nan | suitcase | nan | frisbee | nan | +| skis | nan | snowboard | nan | sports ball | nan | +| kite | nan | baseball bat | nan | baseball glove | nan | +| skateboard | nan | surfboard | nan | tennis racket | nan | +| bottle | nan | wine glass | nan | cup | nan | +| fork | nan | knife | nan | spoon | nan | +| bowl | nan | banana | nan | apple | nan | +| sandwich | nan | orange | nan | broccoli | nan | +| carrot | nan | hot dog | nan | pizza | nan | +| donut | nan | cake | nan | chair | nan | +| couch | nan | potted plant | nan | bed | nan | +| dining table | nan | toilet | nan | tv | nan | +| laptop | nan | mouse | nan | remote | nan | +| keyboard | nan | cell phone | nan | microwave | nan | +| oven | nan | toaster | nan | sink | nan | +| refrigerator | nan | book | nan | clock | nan | +| vase | nan | scissors | nan | teddy bear | nan | +| hair drier | nan | toothbrush | nan | None | None | ++---------------+-------+--------------+-----+----------------+------+ +``` diff --git a/third_party/mmyolo/docs/en/recommended_topics/mm_basics.md b/third_party/mmyolo/docs/en/recommended_topics/mm_basics.md new file mode 100644 index 0000000000000000000000000000000000000000..9f23cfe6606a6a7adfa20b2e532c8f804820ce12 --- /dev/null +++ b/third_party/mmyolo/docs/en/recommended_topics/mm_basics.md @@ -0,0 +1 @@ +# MM series repo essential basics diff --git a/third_party/mmyolo/docs/en/recommended_topics/model_design.md b/third_party/mmyolo/docs/en/recommended_topics/model_design.md new file mode 100644 index 0000000000000000000000000000000000000000..e1fc5b822abb9b033f582ea2df5c70d3fd708b95 --- /dev/null +++ b/third_party/mmyolo/docs/en/recommended_topics/model_design.md @@ -0,0 +1,106 @@ +# Model design instructions + +## YOLO series model basic class + +The structural figure is provided by RangeKing@GitHub. Thank you RangeKing! + +
+BaseModule-P5 +Figure 1: P5 model structure +
+ +
+BaseModule-P6 +Figure 2: P6 model structure +
+
+Most YOLO series algorithms adopt a unified model-building structure, typically Darknet + PAFPN. To help users quickly understand the architecture of the YOLO series, we deliberately designed the `BaseBackbone` + `BaseYOLONeck` structure, as shown in the above figure.
+
+The benefits of the abstract `BaseBackbone` include:
+
+1. Subclasses do not need to be concerned with the forward process; they only assemble the model in a builder-like pattern.
+2. Custom plug-in functions can be enabled through configuration, so users can easily insert attention-like modules.
+3. All subclasses automatically support freezing certain stages and freezing BN layers.
+
+`BaseYOLONeck` has the same benefits as `BaseBackbone`.
+
+### BaseBackbone
+
+- As shown in Figure 1, for P5, `BaseBackbone` includes 1 stem layer and 4 stage layers, which is similar to the basic structure of ResNet.
+- As shown in Figure 2, for P6, `BaseBackbone` includes 1 stem layer and 5 stage layers.
+  Different backbone network algorithms inherit `BaseBackbone`. Users can build each layer of the whole network by implementing customized basic modules through the internal `build_xx` methods.
+
+### BaseYOLONeck
+
+We reproduce the YOLO series Neck components in a similar way to `BaseBackbone`, and we can mainly divide them into the `Reduce layer`, `UpSample layer`, `TopDown layer`, `DownSample layer`, `BottomUp layer` and `output convolution layer`. The internal construction of each layer can be customized by inheriting and overriding the corresponding `build_xx` method.
+
+### BaseDenseHead
+
+MMYOLO uses the `BaseDenseHead` designed in MMDetection as the base class of the Head structure. Taking YOLOv5 as an example, the forward function of its [HeadModule](https://github.com/open-mmlab/mmyolo/blob/main/mmyolo/models/dense_heads/yolov5_head.py#L2) replaces the original forward method.
+
+## HeadModule
+
+<div align=center>
+image +
+ +As shown in the above graph, the solid line is the implementation in [MMYOLO](https://github.com/open-mmlab/mmyolo/blob/main/mmyolo/models/dense_heads/yolov5_head.py), whereas the original implementation in [MMDetection](https://github.com/open-mmlab/mmdetection) is shown in the dotted line. MMYOLO has the following advantages over the original implementation: + +1. In MMDetection, `bbox_head` is split into three large components: `assigner` + `box coder` + `sampler`. But because the transfer between these three components is universal, it is necessary to encapsulate additional objects. With the unification in MMYOLO, users do not need to separate them. The advantages of not deliberately forcing the division of the three components are: data encapsulation of internal data is no longer required, code logic is simplified, and the difficulty of community use and algorithm reproduction is reduced. +2. MMYOLO is Faster. When users customize the implementation algorithm, they can deeply optimize part of the code without relying on the original framework. + +In general, with the partly decoupled model + `loss_by_feat` part in MMYOLO, users can construct any model with any `loss_by_feat` by modifying the configuration. For example, applying the `loss_by_feat` of YOLOX to the YOLOv5 model, etc. + +Take the YOLOX configuration in MMDetection as an example, the Head module configuration is written as follows: + +```python +bbox_head=dict( + type='YOLOXHead', + num_classes=80, + in_channels=128, + feat_channels=128, + stacked_convs=2, + strides=(8, 16, 32), + use_depthwise=False, + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='Swish'), + ... + loss_obj=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + reduction='sum', + loss_weight=1.0), + loss_l1=dict(type='L1Loss', reduction='sum', loss_weight=1.0)), +train_cfg=dict(assigner=dict(type='SimOTAAssigner', center_radius=2.5)), +``` + +For the head_module in MMYOLO, the new configuration is written as follows: + +```python +bbox_head=dict( + type='YOLOXHead', + head_module=dict( + type='YOLOXHeadModule', + num_classes=80, + in_channels=256, + feat_channels=256, + widen_factor=widen_factor, + stacked_convs=2, + featmap_strides=(8, 16, 32), + use_depthwise=False, + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='SiLU', inplace=True), + ), + ... + loss_obj=dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='sum', + loss_weight=1.0), + loss_bbox_aux=dict(type='mmdet.L1Loss', reduction='sum', loss_weight=1.0)), +train_cfg=dict( + assigner=dict( + type='mmdet.SimOTAAssigner', + center_radius=2.5, + iou_calculator=dict(type='mmdet.BboxOverlaps2D'))), +``` diff --git a/third_party/mmyolo/docs/en/recommended_topics/replace_backbone.md b/third_party/mmyolo/docs/en/recommended_topics/replace_backbone.md new file mode 100644 index 0000000000000000000000000000000000000000..82d2046b8e8906a2d186d1ccffd775ab0f23f3ad --- /dev/null +++ b/third_party/mmyolo/docs/en/recommended_topics/replace_backbone.md @@ -0,0 +1,306 @@ +# Replace the backbone network + +```{note} +1. When using other backbone networks, you need to ensure that the output channels of the backbone network match the input channels of the neck network. +2. The configuration files given below only ensure that the training will work correctly, and their training performance may not be optimal. Because some backbones require specific learning rates, optimizers, and other hyperparameters. 
Related contents will be added in the "Training Tips" section later. +``` + +## Use backbone network implemented in MMYOLO + +Suppose you want to use `YOLOv6EfficientRep` as the backbone network of `YOLOv5`, the example config is as the following: + +```python +_base_ = './yolov5_s-v61_syncbn_8xb16-300e_coco.py' + +model = dict( + backbone=dict( + type='YOLOv6EfficientRep', + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='ReLU', inplace=True)) +) +``` + +## Use backbone network implemented in other OpenMMLab repositories + +The model registry in MMYOLO, MMDetection, MMClassification, and MMSegmentation all inherit from the root registry in MMEngine in the OpenMMLab 2.0 system, allowing these repositories to directly use modules already implemented by each other. Therefore, in MMYOLO, users can use backbone networks from MMDetection and MMClassification without reimplementation. + +### Use backbone network implemented in MMDetection + +1. Suppose you want to use `ResNet-50` as the backbone network of `YOLOv5`, the example config is as the following: + +```python +_base_ = './yolov5_s-v61_syncbn_8xb16-300e_coco.py' + +deepen_factor = _base_.deepen_factor +widen_factor = 1.0 +channels = [512, 1024, 2048] + +model = dict( + backbone=dict( + _delete_=True, # Delete the backbone field in _base_ + type='mmdet.ResNet', # Using ResNet from mmdet + depth=50, + num_stages=4, + out_indices=(1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='YOLOv5PAFPN', + widen_factor=widen_factor, + in_channels=channels, # Note: The 3 channels of ResNet-50 output are [512, 1024, 2048], which do not match the original yolov5-s neck and need to be changed. + out_channels=channels), + bbox_head=dict( + type='YOLOv5Head', + head_module=dict( + type='YOLOv5HeadModule', + in_channels=channels, # input channels of head need to be changed accordingly + widen_factor=widen_factor)) +) +``` + +2. Suppose you want to use `SwinTransformer-Tiny` as the backbone network of `YOLOv5`, the example config is as the following: + +```python +_base_ = './yolov5_s-v61_syncbn_8xb16-300e_coco.py' + +deepen_factor = _base_.deepen_factor +widen_factor = 1.0 +channels = [192, 384, 768] +checkpoint_file = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth' # noqa + +model = dict( + backbone=dict( + _delete_=True, # Delete the backbone field in _base_ + type='mmdet.SwinTransformer', # Using SwinTransformer from mmdet + embed_dims=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=7, + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.2, + patch_norm=True, + out_indices=(1, 2, 3), + with_cp=False, + convert_weights=True, + init_cfg=dict(type='Pretrained', checkpoint=checkpoint_file)), + neck=dict( + type='YOLOv5PAFPN', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + in_channels=channels, # Note: The 3 channels of SwinTransformer-Tiny output are [192, 384, 768], which do not match the original yolov5-s neck and need to be changed. + out_channels=channels), + bbox_head=dict( + type='YOLOv5Head', + head_module=dict( + type='YOLOv5HeadModule', + in_channels=channels, # input channels of head need to be changed accordingly + widen_factor=widen_factor)) +) +``` + +### Use backbone network implemented in MMClassification + +1. 
Suppose you want to use `ConvNeXt-Tiny` as the backbone network of `YOLOv5`, the example config is as the following: + +```python +_base_ = './yolov5_s-v61_syncbn_8xb16-300e_coco.py' + +# please run the command, mim install "mmcls>=1.0.0rc2", to install mmcls +# import mmcls.models to trigger register_module in mmcls +custom_imports = dict(imports=['mmcls.models'], allow_failed_imports=False) +checkpoint_file = 'https://download.openmmlab.com/mmclassification/v0/convnext/downstream/convnext-tiny_3rdparty_32xb128-noema_in1k_20220301-795e9634.pth' # noqa +deepen_factor = _base_.deepen_factor +widen_factor = 1.0 +channels = [192, 384, 768] + +model = dict( + backbone=dict( + _delete_=True, # Delete the backbone field in _base_ + type='mmcls.ConvNeXt', # Using ConvNeXt from mmcls + arch='tiny', + out_indices=(1, 2, 3), + drop_path_rate=0.4, + layer_scale_init_value=1.0, + gap_before_final_norm=False, + init_cfg=dict( + type='Pretrained', checkpoint=checkpoint_file, + prefix='backbone.')), # The pre-trained weights of backbone network in MMCls have prefix='backbone.'. The prefix in the keys will be removed so that these weights can be normally loaded. + neck=dict( + type='YOLOv5PAFPN', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + in_channels=channels, # Note: The 3 channels of ConvNeXt-Tiny output are [192, 384, 768], which do not match the original yolov5-s neck and need to be changed. + out_channels=channels), + bbox_head=dict( + type='YOLOv5Head', + head_module=dict( + type='YOLOv5HeadModule', + in_channels=channels, # input channels of head need to be changed accordingly + widen_factor=widen_factor)) +) +``` + +2. Suppose you want to use `MobileNetV3-small` as the backbone network of `YOLOv5`, the example config is as the following: + +```python +_base_ = './yolov5_s-v61_syncbn_8xb16-300e_coco.py' + +# please run the command, mim install "mmcls>=1.0.0rc2", to install mmcls +# import mmcls.models to trigger register_module in mmcls +custom_imports = dict(imports=['mmcls.models'], allow_failed_imports=False) +checkpoint_file = 'https://download.openmmlab.com/mmclassification/v0/mobilenet_v3/convert/mobilenet_v3_small-8427ecf0.pth' # noqa +deepen_factor = _base_.deepen_factor +widen_factor = 1.0 +channels = [24, 48, 96] + +model = dict( + backbone=dict( + _delete_=True, # Delete the backbone field in _base_ + type='mmcls.MobileNetV3', # Using MobileNetV3 from mmcls + arch='small', + out_indices=(3, 8, 11), # Modify out_indices + init_cfg=dict( + type='Pretrained', + checkpoint=checkpoint_file, + prefix='backbone.')), # The pre-trained weights of backbone network in MMCls have prefix='backbone.'. The prefix in the keys will be removed so that these weights can be normally loaded. + neck=dict( + type='YOLOv5PAFPN', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + in_channels=channels, # Note: The 3 channels of MobileNetV3 output are [24, 48, 96], which do not match the original yolov5-s neck and need to be changed. + out_channels=channels), + bbox_head=dict( + type='YOLOv5Head', + head_module=dict( + type='YOLOv5HeadModule', + in_channels=channels, # input channels of head need to be changed accordingly + widen_factor=widen_factor)) +) +``` + +### Use backbone network in `timm` through MMClassification + +MMClassification also provides a wrapper for the Py**T**orch **Im**age **M**odels (`timm`) backbone network, users can directly use the backbone network in `timm` through MMClassification. 
Suppose you want to use `EfficientNet-B1` as the backbone network of `YOLOv5`, the example config is as the following: + +```python +_base_ = './yolov5_s-v61_syncbn_8xb16-300e_coco.py' + +# please run the command, mim install "mmcls>=1.0.0rc2", to install mmcls +# and the command, pip install timm, to install timm +# import mmcls.models to trigger register_module in mmcls +custom_imports = dict(imports=['mmcls.models'], allow_failed_imports=False) + +deepen_factor = _base_.deepen_factor +widen_factor = 1.0 +channels = [40, 112, 320] + +model = dict( + backbone=dict( + _delete_=True, # Delete the backbone field in _base_ + type='mmcls.TIMMBackbone', # Using timm from mmcls + model_name='efficientnet_b1', # Using efficientnet_b1 in timm + features_only=True, + pretrained=True, + out_indices=(2, 3, 4)), + neck=dict( + type='YOLOv5PAFPN', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + in_channels=channels, # Note: The 3 channels of EfficientNet-B1 output are [40, 112, 320], which do not match the original yolov5-s neck and need to be changed. + out_channels=channels), + bbox_head=dict( + type='YOLOv5Head', + head_module=dict( + type='YOLOv5HeadModule', + in_channels=channels, # input channels of head need to be changed accordingly + widen_factor=widen_factor)) +) +``` + +### Use backbone network implemented in MMSelfSup + +Suppose you want to use `ResNet-50` which is self-supervised trained by `MoCo v3` in MMSelfSup as the backbone network of `YOLOv5`, the example config is as the following: + +```python +_base_ = './yolov5_s-v61_syncbn_8xb16-300e_coco.py' + +# please run the command, mim install "mmselfsup>=1.0.0rc3", to install mmselfsup +# import mmselfsup.models to trigger register_module in mmselfsup +custom_imports = dict(imports=['mmselfsup.models'], allow_failed_imports=False) +checkpoint_file = 'https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_resnet50_8xb512-amp-coslr-800e_in1k/mocov3_resnet50_8xb512-amp-coslr-800e_in1k_20220927-e043f51a.pth' # noqa +deepen_factor = _base_.deepen_factor +widen_factor = 1.0 +channels = [512, 1024, 2048] + +model = dict( + backbone=dict( + _delete_=True, # Delete the backbone field in _base_ + type='mmselfsup.ResNet', + depth=50, + num_stages=4, + out_indices=(2, 3, 4), # Note: out_indices of ResNet in MMSelfSup are 1 larger than those in MMdet and MMCls + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint=checkpoint_file)), + neck=dict( + type='YOLOv5PAFPN', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + in_channels=channels, # Note: The 3 channels of ResNet-50 output are [512, 1024, 2048], which do not match the original yolov5-s neck and need to be changed. + out_channels=channels), + bbox_head=dict( + type='YOLOv5Head', + head_module=dict( + type='YOLOv5HeadModule', + in_channels=channels, # input channels of head need to be changed accordingly + widen_factor=widen_factor)) +) +``` + +### Don't used pre-training weights + +When we replace the backbone network, the model initialization is trained by default loading the pre-training weight of the backbone network. Instead of using the pre-training weights of the backbone network, if you want to train the time model from scratch, +You can set `init_cfg` in 'backbone' to 'None'. In this case, the backbone network will be initialized with the default initialization method, instead of using the trained pre-training weight. 
```python
_base_ = './yolov5_s-v61_syncbn_8xb16-300e_coco.py'

deepen_factor = _base_.deepen_factor
widen_factor = 1.0
channels = [512, 1024, 2048]

model = dict(
    backbone=dict(
        _delete_=True,  # Delete the backbone field in _base_
        type='mmdet.ResNet',  # Using ResNet from mmdet
        depth=50,
        num_stages=4,
        out_indices=(1, 2, 3),
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=True,
        style='pytorch',
        init_cfg=None  # If init_cfg is set to None, backbone will not be initialized with pre-trained weights
    ),
    neck=dict(
        type='YOLOv5PAFPN',
        widen_factor=widen_factor,
        in_channels=channels,  # Note: The 3 channels of ResNet-50 output are [512, 1024, 2048], which do not match the original yolov5-s neck and need to be changed.
        out_channels=channels),
    bbox_head=dict(
        type='YOLOv5Head',
        head_module=dict(
            type='YOLOv5HeadModule',
            in_channels=channels,  # input channels of head need to be changed accordingly
            widen_factor=widen_factor))
)
```
diff --git a/third_party/mmyolo/docs/en/recommended_topics/training_testing_tricks.md b/third_party/mmyolo/docs/en/recommended_topics/training_testing_tricks.md new file mode 100644 index 0000000000000000000000000000000000000000..48ce25f8bd1708727e2738ff2e81035e20b16466 --- /dev/null +++ b/third_party/mmyolo/docs/en/recommended_topics/training_testing_tricks.md @@ -0,0 +1,310 @@

# Training and testing tricks

MMYOLO already supports most of the YOLO series object detection algorithms, and different algorithms may involve different practical tricks. This section describes in detail the commonly used training and testing tricks supported by MMYOLO, based on the implemented object detection algorithms.

## Training tricks

### Improve performance of detection

#### 1 Multi-scale training

In the field of object detection, multi-scale training is a very common trick. However, in YOLO, most models are trained with a single-scale input of 640x640. There are two reasons for this:

1. Single-scale training is faster than multi-scale training. When training for 300 or 500 epochs, training efficiency is a major concern for users, and multi-scale training would be slower.
2. Multi-scale augmentation is already implied in the training pipeline through transforms such as `Mosaic`, `RandomAffine` and `Resize`, which is equivalent to applying multi-scale training, so there is no need to introduce multi-scale model inputs again.

Through experiments on the COCO dataset, we found that when multi-scale training is introduced directly after the output of YOLOv5's DataLoader, the actual performance improvement is very small. If you want to start multi-scale training for YOLO series algorithms in MMYOLO, you can refer to [ms_training_testing](../common_usage/ms_training_testing.md); however, this does not mean that it cannot bring significant gains when fine-tuning on a user-defined dataset.

#### 2 Use Mask annotation to optimize object detection performance

When the dataset annotations are complete, for example when both bounding box and instance segmentation annotations exist but only part of them is required by the task, training with the complete annotations can improve performance. In object detection, we can also learn from instance segmentation annotations to improve detection performance. The following shows the detection results of YOLOv8 after introducing additional instance segmentation annotations for optimization.
The performance gains are shown below: + +
+ +
As shown in the figure, models of different scales gain different degrees of performance improvement.
Note that `Mask Refine` only takes effect in the data augmentation phase; it requires no changes to other parts of training and does not affect training speed. The details are as follows:
+ +
The `Mask` mentioned above refers to a data augmentation transformation in which the instance segmentation annotations play a key role. Applying this technique to other YOLO series models brings gains of varying degrees.

#### 3 Turn off strong augmentation in the later stage of training to improve detection performance

This strategy was first proposed in the YOLOX algorithm and can greatly improve detection performance. The paper points out that although Mosaic+MixUp can greatly improve object detection performance, the training images they produce are far from the real distribution of natural images, and Mosaic's large number of cropping operations introduces many inaccurate label boxes. YOLOX therefore proposes to turn off the strong augmentation in the last 15 epochs and use weaker augmentation instead, so that the detector avoids the influence of the inaccurate boxes and finishes converging under the data distribution of natural images.

This strategy has been applied to most YOLO algorithms. Taking YOLOv8 as an example, its data augmentation pipeline is shown as follows:
+ +
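A minimal sketch of this switch, assuming the `mmdet.PipelineSwitchHook` used by the MMYOLO YOLOv8 configs (the epoch count and the weaker `train_pipeline_stage2`, assumed to be defined elsewhere in the config, are illustrative):

```python
# Illustrative: switch from the strong Mosaic/MixUp pipeline to a weaker one
# for the final epochs, so the model converges on "natural" images.
max_epochs = 300
close_mosaic_epochs = 10  # number of final epochs trained without strong augmentation

custom_hooks = [
    dict(
        type='mmdet.PipelineSwitchHook',
        switch_epoch=max_epochs - close_mosaic_epochs,
        switch_pipeline=train_pipeline_stage2)  # assumed weaker pipeline (e.g. resize + flip only)
]
```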
However, when to turn off the strong augmentation is a hyperparameter. If you turn it off too early, it may not give full play to Mosaic and the other strong augmentations. If you turn it off too late, there will be no gain because the model has already overfitted. This phenomenon can be observed in the YOLOv8 experiments below:

| Backbone | Mask Refine | box AP | Epoch of best mAP |
| :------: | :---------: | :---------: | :---------------: |
| YOLOv8-n | No | 37.2 | 500 |
| YOLOv8-n | Yes | 37.4 (+0.2) | 499 |
| YOLOv8-s | No | 44.2 | 430 |
| YOLOv8-s | Yes | 45.1 (+0.9) | 460 |
| YOLOv8-m | No | 49.8 | 460 |
| YOLOv8-m | Yes | 50.6 (+0.8) | 480 |
| YOLOv8-l | No | 52.1 | 460 |
| YOLOv8-l | Yes | 53.0 (+0.9) | 491 |
| YOLOv8-x | No | 52.7 | 450 |
| YOLOv8-x | Yes | 54.0 (+1.3) | 460 |

As can be seen from the above table:

- Large models trained on the COCO dataset for 500 epochs are prone to overfitting, and disabling strong augmentations such as Mosaic may not be effective in reducing overfitting in such cases.
- Using Mask annotations can alleviate overfitting and improve performance.

#### 4 Add pure background images to suppress false positives

For non-open-world datasets in object detection, both training and testing are conducted on a fixed set of classes, and false positives may be produced when the model is applied to images containing classes it has never been trained on. A common mitigation strategy is to add a certain proportion of pure background images.
In most YOLO series, suppressing false positives with pure background images is enabled by default. Users only need to set `train_dataloader.dataset.filter_cfg.filter_empty_gt` to `False`, indicating that pure background images should not be filtered out during training.

#### 5 Maybe the AdamW works wonders

YOLOv5, YOLOv6, YOLOv7 and YOLOv8 all adopt the SGD optimizer, which is demanding about parameter settings, whereas AdamW is much less sensitive to the learning rate. If you fine-tune on a custom dataset, you can try the AdamW optimizer. We did a simple trial on YOLOX and found that replacing the optimizer with AdamW brought some improvement on the tiny, s and m scale models.

| Backbone | Size | Batch Size | RTMDet-Hyp | Box AP |
| :--------: | :--: | :--------: | :--------: | :---------: |
| YOLOX-tiny | 416 | 8xb8 | No | 32.7 |
| YOLOX-tiny | 416 | 8xb32 | Yes | 34.3 (+1.6) |
| YOLOX-s | 640 | 8xb8 | No | 40.7 |
| YOLOX-s | 640 | 8xb32 | Yes | 41.9 (+1.2) |
| YOLOX-m | 640 | 8xb8 | No | 46.9 |
| YOLOX-m | 640 | 8xb32 | Yes | 47.5 (+0.6) |

More details can be found in [configs/yolox/README.md](https://github.com/open-mmlab/mmyolo/blob/main/configs/yolox/README.md#--results-and-models).

#### 6 Consider ignore scenarios to avoid uncertain annotations

Take CrowdHuman, a crowded pedestrian detection dataset, as an example. Here's a typical image:
+ +
+ +The image is sourced from [detectron2 issue](https://github.com/facebookresearch/detectron2/issues/1909). The area marked with a yellow cross indicates the `iscrowd` label. There are two reasons for this: + +- This area is not a real person, such as the person on the poster +- The area is too crowded to mark + +In this scenario, you cannot simply delete such annotations, because once you delete them, it means treating them as background areas during training. However, they are different from the background. Firstly, the people on the posters are very similar to real people, and there are indeed people in crowded areas that are difficult to annotate. If you simply train them as background, it will cause false negatives. The best approach is to treat the crowded area as an ignored region, where any output in this area is directly ignored, with no loss calculated and no model fitting enforced. + +MMYOLO quickly and easily verifies the function of 'iscrowd' annotation on YOLOv5. The performance is as follows: + +| Backbone | ignore_iof_thr | box AP50(CrowDHuman Metric) | MR | JI | +| :------: | :------------: | :-------------------------: | :--: | :---: | +| YOLOv5-s | -1 | 85.79 | 48.7 | 75.33 | +| YOLOv5-s | 0.5 | 86.17 | 48.8 | 75.87 | + +`ignore_iof_thr` set to -1 indicates that the ignored labels are not considered, and it can be seen that the performance is improved to a certain extent, more details can be found in [CrowdHuman results](https://github.com/open-mmlab/mmyolo/blob/main/configs/yolov5/README.md#crowdhuman). If you encounter similar situations in your custom dataset, it is recommended that you consider using `ignore` labels to avoid uncertain annotations. + +#### 7 Use knowledge distillation + +Knowledge distillation is a widely used technique that can transfer the performance of a large model to a smaller model, thereby improving the detection performance of the smaller model. Currently, MMYOLO and MMRazor have supported this feature and conducted initial verification on RTMDet. + +| Model | box AP | +| :------------: | :---------: | +| RTMDet-tiny | 41.0 | +| RTMDet-tiny \* | 41.8 (+0.8) | +| RTMDet-s | 44.6 | +| RTMDet-s \* | 45.7 (+1.1) | +| RTMDet-m | 49.3 | +| RTMDet-m \* | 50.2 (+0.9) | +| RTMDet-l | 51.4 | +| RTMDet-l \* | 52.3 (+0.9) | + +`*` indicates the result of using the large model distillation, more details can be found in [Distill RTMDet](https://github.com/open-mmlab/mmyolo/tree/main/configs/rtmdet/distillation). + +#### 8 Stronger augmentation parameters are used for larger models + +If you have modified the model based on the default configuration or replaced the backbone network, it is recommended to scale the data augmentation parameters based on the current model size. Generally, larger models require stronger augmentation parameters, otherwise they may not fully leverage the benefits of large models. Conversely, if strong augmentations are applied to small models, it may result in underfitting. Taking RTMDet as an example, we can observe the data augmentation parameters for different model sizes. + +
+ +
+ +`random_resize_ratio_range` represents the random scaling range of `RandomResize`, and `mosaic_max_cached_images/mixup_max_cached_images` represents the number of cached images during `Mosaic/MixUp` augmentation, which can be used to adjust the strength of augmentation. The YOLO series models all follow the same set of parameter settings principles. + +### Accelerate training speed + +#### 1 Enable cudnn_benchmark for single-scale training + +Most of the input image sizes in the YOLO series algorithms are fixed, which is single-scale training. In this case, you can turn on cudnn_benchmark to accelerate the training speed. This parameter is mainly set for PyTorch's cuDNN underlying library, and setting this flag can allow the built-in cuDNN to automatically find the most efficient algorithm that is best suited for the current configuration to optimize the running efficiency. If this flag is turned on in multi-scale mode, it will continuously search for the optimal algorithm, which may slow down the training speed instead. + +To enable `cudnn_benchmark` in MMYOLO, you can set `env_cfg = dict(cudnn_benchmark=True)` in the configuration. + +#### 2 Use Mosaic and MixUp with caching + +If you have applied Mosaic and MixUp in your data augmentation, and after investigating the training bottleneck, it is found that the random image reading is causing the issue, then it is recommended to replace the regular Mosaic and MixUp with the cache-enabled versions proposed in RTMDet. + +| Data Aug | Use cache | ms/100 imgs | +| :------: | :-------: | :---------: | +| Mosaic | No | 87.1 | +| Mosaic | Yes | 24.0 | +| MixUp | No | 19.3 | +| MixUp | Yes | 12.4 | + +Mosaic and MixUp involve mixing multiple images, and their time consumption is K times that of ordinary data augmentation (K is the number of images mixed). For example, in YOLOv5, when doing Mosaic each time, the information of 4 images needs to be reloaded from the hard disk. However, the cached version of Mosaic and MixUp only needs to reload the current image, while the remaining images involved in the mixed augmentation are obtained from the cache queue, greatly improving efficiency by sacrificing a certain amount of memory space. + +
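A minimal pipeline sketch of the cache-enabled transforms (assuming the `use_cached`/`max_cached_images` options of MMYOLO's `Mosaic` and `YOLOXMixUp`, as used in the RTMDet configs; `pre_transform`, `img_scale` and the cache sizes below are illustrative):

```python
# Illustrative fragment: cache-enabled Mosaic/MixUp as proposed in RTMDet.
train_pipeline = [
    *pre_transform,
    dict(
        type='Mosaic',
        img_scale=img_scale,
        use_cached=True,        # take the other 3 images from the cache queue
        max_cached_images=40,   # length of the cache queue
        pad_val=114.0),
    dict(
        type='YOLOXMixUp',
        use_cached=True,
        max_cached_images=20,
        prob=0.5),
    dict(type='mmdet.PackDetInputs')
]
```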
+data cache +
As shown in the figure, N preloaded images and their label data are stored in the cache queue. In each training step, only one new image and its label data need to be loaded and used to update the cache queue (images in the cache queue can be duplicated, as shown in the figure with img3 appearing twice). If the length of the cache queue exceeds the preset length, a random image is popped out. When mixed data augmentation needs to be performed, only the required images are randomly selected from the cache for concatenation or other processing, without having to load all of them from the hard disk, thus saving image loading time.

### Reduce the number of hyperparameters

YOLOv5 provides some practical methods for reducing the number of hyperparameters, which are described below.

#### 1 Adaptive loss weighting, reducing one hyperparameter

In general, it can be challenging to set hyperparameters specifically for different tasks or categories. Based on practical experience, YOLOv5 adopts adaptive methods that scale the loss weights with the number of classes and the number of detection output layers, as shown below:

```python
# scaled based on number of detection layers
loss_cls=dict(
    type='mmdet.CrossEntropyLoss',
    use_sigmoid=True,
    reduction='mean',
    loss_weight=loss_cls_weight *
    (num_classes / 80 * 3 / num_det_layers)),
loss_bbox=dict(
    type='IoULoss',
    iou_mode='ciou',
    bbox_format='xywh',
    eps=1e-7,
    reduction='mean',
    loss_weight=loss_bbox_weight * (3 / num_det_layers),
    return_iou=True),
loss_obj=dict(
    type='mmdet.CrossEntropyLoss',
    use_sigmoid=True,
    reduction='mean',
    loss_weight=loss_obj_weight *
    ((img_scale[0] / 640)**2 * 3 / num_det_layers)),
```

`loss_cls` adaptively scales its `loss_weight` based on the custom number of classes and the number of detection layers, `loss_bbox` adaptively scales based on the number of detection layers, and `loss_obj` adaptively scales based on the input image size and the number of detection layers. This strategy lets users avoid setting loss weight hyperparameters.
Note that this is only an empirical rule and not necessarily the optimal combination; it should be used as a reference.

#### 2 Adaptive Weight Decay and loss output values based on Batch Size, reducing two hyperparameters

In general, when training with different `Batch Size`s, it is necessary to follow the rule of automatic learning rate scaling. However, validation on various datasets shows that YOLOv5 can achieve good results without scaling the learning rate when the batch size changes, and sometimes scaling even leads to worse results. The reason lies in the adaptation of `Weight Decay` and the loss outputs to the `Batch Size` in the code. In YOLOv5, `Weight Decay` and the loss output values are scaled based on the total batch size being trained. The corresponding code is:

```python
# https://github.com/open-mmlab/mmyolo/blob/dev/mmyolo/engine/optimizers/yolov5_optim_constructor.py#L86
if 'batch_size_per_gpu' in optimizer_cfg:
    batch_size_per_gpu = optimizer_cfg.pop('batch_size_per_gpu')
    # No scaling if total_batch_size is less than
    # base_total_batch_size, otherwise linear scaling.
    total_batch_size = get_world_size() * batch_size_per_gpu
    accumulate = max(
        round(self.base_total_batch_size / total_batch_size), 1)
    scale_factor = total_batch_size * \
        accumulate / self.base_total_batch_size
    if scale_factor != 1:
        weight_decay *= scale_factor
        print_log(f'Scaled weight_decay to {weight_decay}', 'current')
```

```python
# https://github.com/open-mmlab/mmyolo/blob/dev/mmyolo/models/dense_heads/yolov5_head.py#L635
_, world_size = get_dist_info()
return dict(
    loss_cls=loss_cls * batch_size * world_size,
    loss_obj=loss_obj * batch_size * world_size,
    loss_bbox=loss_box * batch_size * world_size)
```

The loss weight varies with the batch size: in general, a larger batch size means a larger loss and larger gradients. I personally speculate that this is equivalent to linearly increasing the learning rate as the batch size increases.
In fact, the YOLOv5 discussion [YOLOv5 Study: mAP vs Batch-Size](https://github.com/ultralytics/yolov5/discussions/2452) shows that the goal is for users to obtain similar performance without modifying other parameters when they change the batch size. The above two strategies are very good training techniques.

### Save memory on GPU

How to reduce training memory usage is a frequently discussed issue, and there are many techniques involved. The training executor of MMYOLO comes from MMEngine, so you can refer to the MMEngine documentation for how to reduce training memory usage. Currently, MMEngine supports gradient accumulation, gradient checkpointing, and large model training techniques, details of which can be found in
[SAVE MEMORY ON GPU](https://mmengine.readthedocs.io/zh_CN/latest/common_usage/save_gpu_memory.html).

## Testing tricks

### Balance between inference speed and testing accuracy

During model performance testing, we generally pursue a higher mAP, but in practical applications or inference, we want the model to run faster while keeping the false positive and false negative rates low. In other words, testing only focuses on mAP and ignores post-processing and evaluation speed, while practical applications pursue a balance between speed and accuracy.
In the YOLO series, this balance can be achieved by controlling certain parameters. In this example, we describe it in detail using YOLOv5.

#### 1 Avoiding multiple class outputs for a single detection box during inference

YOLOv5 uses BCE Loss (`use_sigmoid=True`) when training the classification branch. Assuming there are 4 object categories, the classification branch outputs 4 categories rather than 5. Moreover, because sigmoid is used instead of a softmax prediction, multiple predictions at the same position may pass the filtering threshold, which means that one predicted bbox may correspond to multiple predicted labels. This is shown in the figure below:
+multi-label +
Generally, when calculating mAP, the filtering threshold is set to 0.001. Due to the non-competitive nature of sigmoid predictions, one box may correspond to multiple labels. This calculation method can increase the recall when computing mAP, but it is not convenient for practical applications.

One common approach is to increase the filtering threshold. However, if you don't want to cause many missed detections by doing so, it is recommended to set the `multi_label` parameter to `False` instead. It is located at `model.test_cfg.multi_label` in the configuration file and its default value is `True`, which allows one detection box to correspond to multiple labels.

#### 2 Simplify test pipeline

The default test pipeline of YOLOv5 is as follows:

```python
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='YOLOv5KeepRatioResize', scale=img_scale),
    dict(
        type='LetterResize',
        scale=img_scale,
        allow_scale_up=False,
        pad_val=dict(img=114)),
    dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'),
    dict(
        type='mmdet.PackDetInputs',
        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                   'scale_factor', 'pad_param'))
]
```

It uses two different resize transforms with different functions, with the aim of improving the mAP value during evaluation. For actual deployment, you can simplify this pipeline as shown below:

```python
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='LetterResize',
        scale=_base_.img_scale,
        allow_scale_up=True,
        use_mini_pad=True),
    dict(type='LoadAnnotations', with_bbox=True),
    dict(
        type='mmdet.PackDetInputs',
        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                   'scale_factor', 'pad_param'))
]
```

In practical applications, the YOLOv5 algorithm uses this simplified pipeline with `multi_label` set to `False`, `score_thr` increased to 0.25, and `iou_threshold` reduced to 0.45.
In the YOLOv5 configuration, we provide a set of configuration parameters intended for real-world deployment, as detailed in [yolov5_s-v61_syncbn-detect_8xb16-300e_coco.py](https://github.com/open-mmlab/mmyolo/blob/main/configs/yolov5/yolov5_s-v61_syncbn-detect_8xb16-300e_coco.py).

#### 3 Batch Shape speeds up testing

Batch Shape is a testing technique proposed in YOLOv5 that can speed up inference. The idea is to no longer require all test images to be 640x640, but to test at variable scales, as long as the shapes within the current batch are the same. This reduces the extra pixel padding and speeds up inference. The specific implementation of Batch Shape can be found in this [link](https://github.com/open-mmlab/mmyolo/blob/main/mmyolo/datasets/utils.py#L55).
Almost all algorithms in MMYOLO enable the Batch Shape strategy by default during testing. If you want to disable this feature, you can set `val_dataloader.dataset.batch_shapes_cfg=None`.

In real deployment, however, a fixed shape is usually faster and more efficient than dynamic shapes, so this strategy is generally not used in real-world scenarios.

### TTA improves test accuracy

TTA (Test Time Augmentation) is a versatile trick that can improve the performance of object detection models and is particularly useful in competition scenarios. MMYOLO already supports TTA, and it can be enabled simply by adding `--tta` when testing. For more details, please refer to the [TTA](https://github.com/open-mmlab/mmyolo/blob/dev/docs/zh_cn/common_usage/tta.md) documentation.
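For example, a minimal sketch of enabling it (assuming MMYOLO's standard `tools/test.py` entry point; the config and checkpoint paths are illustrative):

```shell
python tools/test.py configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py \
    yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth \
    --tta
```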
diff --git a/third_party/mmyolo/docs/en/recommended_topics/troubleshooting_steps.md b/third_party/mmyolo/docs/en/recommended_topics/troubleshooting_steps.md new file mode 100644 index 0000000000000000000000000000000000000000..60cc1143f3db6556b8491ec0037df174c2fe823b --- /dev/null +++ b/third_party/mmyolo/docs/en/recommended_topics/troubleshooting_steps.md @@ -0,0 +1 @@ +# Troubleshooting steps for common errors diff --git a/third_party/mmyolo/docs/en/recommended_topics/visualization.md b/third_party/mmyolo/docs/en/recommended_topics/visualization.md new file mode 100644 index 0000000000000000000000000000000000000000..f986648f385d1798663209812163dc3d87bce755 --- /dev/null +++ b/third_party/mmyolo/docs/en/recommended_topics/visualization.md @@ -0,0 +1,346 @@ +# Visualization + +This article includes feature map visualization and Grad-Based and Grad-Free CAM visualization + +## Feature map visualization + +
+image +
+ +Visualization provides an intuitive explanation of the training and testing process of the deep learning model. + +In MMYOLO, you can use the `Visualizer` provided in MMEngine for feature map visualization, which has the following features: + +- Support basic drawing interfaces and feature map visualization. +- Support selecting different layers in the model to get the feature map. The display methods include `squeeze_mean`, `select_max`, and `topk`. Users can also customize the layout of the feature map display with `arrangement`. + +### Feature map generation + +You can use `demo/featmap_vis_demo.py` to get a quick view of the visualization results. To better understand all functions, we list all primary parameters and their features here as follows: + +- `img`: the image to visualize. Can be either a single image file or a list of image file paths. + +- `config`: the configuration file for the algorithm. + +- `checkpoint`: the weight file of the corresponding algorithm. + +- `--out-file`: the file path to save the obtained feature map on your device. + +- `--device`: the hardware used for image inference. For example, `--device cuda:0` means use the first GPU, whereas `--device cpu` means use CPU. + +- `--score-thr`: the confidence score threshold. Only bboxes whose confidence scores are higher than this threshold will be displayed. + +- `--preview-model`: if there is a need to preview the model. This could make users understand the structure of the feature layer more straightforwardly. + +- `--target-layers`: the specific layer to get the visualized feature map result. + + - If there is only one parameter, the feature map of that specific layer will be visualized. For example, `--target-layers backbone` , `--target-layers neck` , `--target-layers backbone.stage4`, etc. + - If the parameter is a list, all feature maps of the corresponding layers will be visualized. For example, `--target-layers backbone.stage4 neck` means that the stage4 layer of the backbone and the three layers of the neck are output simultaneously, a total of four layers of feature maps. + +- `--channel-reduction`: if needs to compress multiple channels into a single channel and then display it overlaid with the picture as the input tensor usually has multiple channels. Three parameters can be used here: + + - `squeeze_mean`: The input channel C will be compressed into one channel using the mean function, and the output dimension becomes (1, H, W). + - `select_max`: Sum the input channel C in the spatial space, and the dimension becomes (C, ). Then select the channel with the largest value. + - `None`: Indicates that no compression is required. In this case, the `topk` feature maps with the highest activation degree can be selected to display through the `topk` parameter. + +- `--topk`: only valid when the `channel_reduction` parameter is `None`. It selects the `topk` channels according to the activation degree and then displays it overlaid with the image. The display layout can be specified using the `--arrangement` parameter, which is an array of two numbers separated by space. For example, `--topk 5 --arrangement 2 3` means the five feature maps with the highest activation degree are displayed in `2 rows and 3 columns`. Similarly, `--topk 7 --arrangement 3 3` means the seven feature maps with the highest activation degree are displayed in `3 rows and 3 columns`. + + - If `topk` is not -1, topk channels will be selected to display in order of the activation degree. 
+ - If `topk` is -1, channel number C must be either 1 or 3 to indicate that the input data is a picture. Otherwise, an error will prompt the user to compress the channel with `channel_reduction`. + +- Considering that the input feature map is usually very small, the function will upsample the feature map by default for easy visualization. + +**Note: When the image and feature map scales are different, the `draw_featmap` function will automatically perform an upsampling alignment. If your image has an operation such as `Pad` in the preprocessing during the inference, the feature map obtained is processed with `Pad`, which may cause misalignment problems if you directly upsample the image.** + +### Usage examples + +Take the pre-trained YOLOv5-s model as an example. Please download the model weight file to the root directory. + +```shell +cd mmyolo +wget https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth +``` + +(1) Compress the multi-channel feature map into a single channel with `select_max` and display it. By extracting the output of the `backbone` layer for visualization, the feature maps of the three output layers in the `backbone` will be generated: + +```shell +python demo/featmap_vis_demo.py demo/dog.jpg \ + configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py \ + yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth \ + --target-layers backbone \ + --channel-reduction select_max +``` + +
+image +
+ +The above code has the problem that the image and the feature map need to be aligned. There are two solutions for this: + +1. Change the post-process to simple `Resize` in the YOLOv5 configuration, which does not affect visualization. + +2. Use the images after the pre-process stage instead of before the pre-process when visualizing. + +**For simplicity purposes, we take the first solution in this demo. However, the second solution will be made in the future so that everyone can use it without extra modification on the configuration file**. More specifically, change the original `test_pipeline` with the version with Resize process only. + +The original `test_pipeline` is: + +```python +test_pipeline = [ + dict( + type='LoadImageFromFile', + backend_args=_base_.backend_args), + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=False, + pad_val=dict(img=114)), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] +``` + +Change to the following version: + +```python +test_pipeline = [ + dict( + type='LoadImageFromFile', + backend_args=_base_.backend_args), + dict(type='mmdet.Resize', scale=img_scale, keep_ratio=False), # change the LetterResize to mmdet.Resize + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +``` + +The correct result is shown as follows: + +
+image +
+ +(2) Compress the multi-channel feature map into a single channel using the `squeeze_mean` parameter and display it. By extracting the output of the `neck` layer for visualization, the feature maps of the three output layers of `neck` will be generated: + +```shell +python demo/featmap_vis_demo.py demo/dog.jpg \ + configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py \ + yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth \ + --target-layers neck \ + --channel-reduction squeeze_mean +``` + +
+image +
+ +(3) Compress the multi-channel feature map into a single channel using the `squeeze_mean` parameter and display it. Then, visualize the feature map by extracting the outputs of the `backbone.stage4` and `backbone.stage3` layers, and the feature maps of the two output layers will be generated: + +```shell +python demo/featmap_vis_demo.py demo/dog.jpg \ + configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py \ + yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth \ + --target-layers backbone.stage4 backbone.stage3 \ + --channel-reduction squeeze_mean +``` + +
+image +
+ +(4) Use the `--topk 3 --arrangement 2 2` parameter to select the top 3 channels with the highest activation degree in the multi-channel feature map and display them in a `2x2` layout. Users can change the layout to what they want through the `arrangement` parameter, and the feature map will be automatically formatted. First, the `top3` feature map in each layer is formatted in a `2x2` shape, and then each layer is formatted in `2x2` as well: + +```shell +python demo/featmap_vis_demo.py demo/dog.jpg \ + configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py \ + yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth \ + --target-layers backbone.stage3 backbone.stage4 \ + --channel-reduction None \ + --topk 3 \ + --arrangement 2 2 +``` + +
+image +
+ +(5) When the visualization process finishes, you can choose to display the result or store it locally. You only need to add the parameter `--out-file xxx.jpg`: + +```shell +python demo/featmap_vis_demo.py demo/dog.jpg \ + configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py \ + yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth \ + --target-layers backbone \ + --channel-reduction select_max \ + --out-file featmap_backbone.jpg +``` + +## Grad-Based and Grad-Free CAM Visualization + +Object detection CAM visualization is much more complex and different than classification CAM. +This article only briefly explains the usage, and a separate document will be opened to describe the implementation principles and precautions in detail later. + +You can call `demo/boxmap_vis_demo.py` to get the AM visualization results at the Box level easily and quickly. Currently, `YOLOv5/YOLOv6/YOLOX/RTMDet` is supported. + +Taking YOLOv5 as an example, as with the feature map visualization, you need to modify the `test_pipeline` first, otherwise there will be a problem of misalignment between the feature map and the original image. + +The original `test_pipeline` is: + +```python +test_pipeline = [ + dict( + type='LoadImageFromFile', + backend_args=_base_.backend_args), + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=False, + pad_val=dict(img=114)), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] +``` + +Change to the following version: + +```python +test_pipeline = [ + dict( + type='LoadImageFromFile', + backend_args=_base_.backend_args), + dict(type='mmdet.Resize', scale=img_scale, keep_ratio=False), # change the LetterResize to mmdet.Resize + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +``` + +(1) Use the `GradCAM` method to visualize the AM of the last output layer of the neck module + +```shell +python demo/boxam_vis_demo.py \ + demo/dog.jpg \ + configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py \ + yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth +``` + +
+image +
+ +The corresponding feature AM is as follows: + +
+image +
+ +It can be seen that the `GradCAM` effect can highlight the AM information at the box level. + +You can choose to visualize only the top prediction boxes with the highest prediction scores via the `--topk` parameter + +```shell +python demo/boxam_vis_demo.py \ + demo/dog.jpg \ + configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py \ + yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth \ + --topk 2 +``` + +
+image +
+ +(2) Use the AblationCAM method to visualize the AM of the last output layer of the neck module + +```shell +python demo/boxam_vis_demo.py \ + demo/dog.jpg \ + configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py \ + yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth \ + --method ablationcam +``` + +
+image +
Since `AblationCAM` is weighted by the contribution of each channel to the score, it cannot visualize the AM information only at the box level like `GradCAM` does. However, you can use `--norm-in-bbox` to show only the AM inside each bbox:

```shell
python demo/boxam_vis_demo.py \
      demo/dog.jpg \
      configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py \
      yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth \
      --method ablationcam \
      --norm-in-bbox
```
+image +
+ +## Perform inference on large images + +First install [`sahi`](https://github.com/obss/sahi) with: + +```shell +pip install -U sahi>=0.11.4 +``` + +Perform MMYOLO inference on large images (as satellite imagery) as: + +```shell +wget -P checkpoint https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_m-v61_syncbn_fast_8xb16-300e_coco/yolov5_m-v61_syncbn_fast_8xb16-300e_coco_20220917_204944-516a710f.pth + +python demo/large_image_demo.py \ + demo/large_image.jpg \ + configs/yolov5/yolov5_m-v61_syncbn_fast_8xb16-300e_coco.py \ + checkpoint/yolov5_m-v61_syncbn_fast_8xb16-300e_coco_20220917_204944-516a710f.pth \ +``` + +Arrange slicing parameters as: + +```shell +python demo/large_image_demo.py \ + demo/large_image.jpg \ + configs/yolov5/yolov5_m-v61_syncbn_fast_8xb16-300e_coco.py \ + checkpoint/yolov5_m-v61_syncbn_fast_8xb16-300e_coco_20220917_204944-516a710f.pth \ + --patch-size 512 + --patch-overlap-ratio 0.25 +``` + +Export debug visuals while performing inference on large images as: + +```shell +python demo/large_image_demo.py \ + demo/large_image.jpg \ + configs/yolov5/yolov5_m-v61_syncbn_fast_8xb16-300e_coco.py \ + checkpoint/yolov5_m-v61_syncbn_fast_8xb16-300e_coco_20220917_204944-516a710f.pth \ + --debug +``` + +[`sahi`](https://github.com/obss/sahi) citation: + +``` +@article{akyon2022sahi, + title={Slicing Aided Hyper Inference and Fine-tuning for Small Object Detection}, + author={Akyon, Fatih Cagatay and Altinuc, Sinan Onur and Temizel, Alptekin}, + journal={2022 IEEE International Conference on Image Processing (ICIP)}, + doi={10.1109/ICIP46576.2022.9897990}, + pages={966-970}, + year={2022} +} +``` diff --git a/third_party/mmyolo/docs/en/stat.py b/third_party/mmyolo/docs/en/stat.py new file mode 100755 index 0000000000000000000000000000000000000000..6c8afcc7bd287b3287452095cbeb3cfa0aaf0fef --- /dev/null +++ b/third_party/mmyolo/docs/en/stat.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python +import functools as func +import glob +import os.path as osp +import re + +import numpy as np + +url_prefix = 'https://github.com/open-mmlab/mmdetection/blob/3.x/configs' + +files = sorted(glob.glob('../../configs/*/README.md')) + +stats = [] +titles = [] +num_ckpts = 0 + +for f in files: + url = osp.dirname(f.replace('../../configs', url_prefix)) + + with open(f) as content_file: + content = content_file.read() + + title = content.split('\n')[0].replace('# ', '').strip() + ckpts = { + x.lower().strip() + for x in re.findall(r'\[model\]\((https?.*)\)', content) + } + + if len(ckpts) == 0: + continue + + _papertype = [x for x in re.findall(r'\[([A-Z]+)\]', content)] + assert len(_papertype) > 0 + papertype = _papertype[0] + + paper = {(papertype, title)} + + titles.append(title) + num_ckpts += len(ckpts) + + statsmsg = f""" +\t* [{papertype}] [{title}]({url}) ({len(ckpts)} ckpts) +""" + stats.append((paper, ckpts, statsmsg)) + +allpapers = func.reduce(lambda a, b: a.union(b), [p for p, _, _ in stats]) +msglist = '\n'.join(x for _, _, x in stats) + +papertypes, papercounts = np.unique([t for t, _ in allpapers], + return_counts=True) +countstr = '\n'.join( + [f' - {t}: {c}' for t, c in zip(papertypes, papercounts)]) + +modelzoo = f""" +# Model Zoo Statistics + +* Number of papers: {len(set(titles))} +{countstr} + +* Number of checkpoints: {num_ckpts} + +{msglist} +""" + +with open('modelzoo_statistics.md', 'w') as f: + f.write(modelzoo) diff --git a/third_party/mmyolo/docs/en/switch_language.md b/third_party/mmyolo/docs/en/switch_language.md new file mode 100644 index 
0000000000000000000000000000000000000000..57b71ebfe41843c8bc8ad29d01d4657f0770465e --- /dev/null +++ b/third_party/mmyolo/docs/en/switch_language.md @@ -0,0 +1,3 @@ +## English + +## 简体中文 diff --git a/third_party/mmyolo/docs/en/tutorials/config.md b/third_party/mmyolo/docs/en/tutorials/config.md new file mode 100644 index 0000000000000000000000000000000000000000..448452243ec9f6dd9bf6e2ef2fee7c2451b48e7e --- /dev/null +++ b/third_party/mmyolo/docs/en/tutorials/config.md @@ -0,0 +1,556 @@ +# Learn about Configs with YOLOv5 + +MMYOLO and other OpenMMLab repositories use [MMEngine's config system](https://mmengine.readthedocs.io/en/latest/tutorials/config.html). It has a modular and inheritance design, which is convenient to conduct various experiments. + +## Config file content + +MMYOLO uses a modular design, all modules with different functions can be configured through the config. Taking [yolov5_s-v61_syncbn_8xb16-300e_coco.py](https://github.com/open-mmlab/mmyolo/blob/main/configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py) as an example, we will introduce each field in the config according to different function modules: + +### Important parameters + +When changing the training configuration, it is usually necessary to modify the following parameters. For example, the scaling factors `deepen_factor` and `widen_factor` are used by the network to control the size of the model in MMYOLO. So we recommend defining these parameters separately in the configuration file. + +```python +img_scale = (640, 640) # height of image, width of image +deepen_factor = 0.33 # The scaling factor that controls the depth of the network structure, 0.33 for YOLOv5-s +widen_factor = 0.5 # The scaling factor that controls the width of the network structure, 0.5 for YOLOv5-s +max_epochs = 300 # Maximum training epochs: 300 epochs +save_epoch_intervals = 10 # Validation intervals. Run validation every 10 epochs. +train_batch_size_pre_gpu = 16 # Batch size of a single GPU during training +train_num_workers = 8 # Worker to pre-fetch data for each single GPU +val_batch_size_pre_gpu = 1 # Batch size of a single GPU during validation. +val_num_workers = 2 # Worker to pre-fetch data for each single GPU during validation +``` + +### Model config + +In MMYOLO's config, we use `model` to set up detection algorithm components. In addition to neural network components such as `backbone`, `neck`, etc, it also requires `data_preprocessor`, `train_cfg`, and `test_cfg`. `data_preprocessor` is responsible for processing a batch of data output by the dataloader. `train_cfg` and `test_cfg` in the model config are for training and testing hyperparameters of the components. + +```python +anchors = [[(10, 13), (16, 30), (33, 23)], # Basic size of multi-scale prior box + [(30, 61), (62, 45), (59, 119)], + [(116, 90), (156, 198), (373, 326)]] +strides = [8, 16, 32] # Strides of multi-scale prior box + +model = dict( + type='YOLODetector', # The name of detector + data_preprocessor=dict( # The config of data preprocessor, usually includes image normalization and padding + type='mmdet.DetDataPreprocessor', # The type of the data preprocessor, refer to https://mmdetection.readthedocs.io/en/dev-3.x/api.html#module-mmdet.models.data_preprocessors. It is worth noticing that using `YOLOv5DetDataPreprocessor` achieves faster training speed. 
+ mean=[0., 0., 0.], # Mean values used to pre-training the pre-trained backbone models, ordered in R, G, B + std=[255., 255., 255.], # Standard variance used to pre-training the pre-trained backbone models, ordered in R, G, B + bgr_to_rgb=True), # whether to convert image from BGR to RGB + backbone=dict( # The config of backbone + type='YOLOv5CSPDarknet', # The type of backbone, currently it is available candidates are 'YOLOv5CSPDarknet', 'YOLOv6EfficientRep', 'YOLOXCSPDarknet' + deepen_factor=deepen_factor, # The scaling factor that controls the depth of the network structure + widen_factor=widen_factor, # The scaling factor that controls the width of the network structure + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), # The config of normalization layers. + act_cfg=dict(type='SiLU', inplace=True)), # The config of activation function + neck=dict( + type='YOLOv5PAFPN', # The neck of detector is YOLOv5FPN, We also support 'YOLOv6RepPAFPN', 'YOLOXPAFPN'. + deepen_factor=deepen_factor, # The scaling factor that controls the depth of the network structure + widen_factor=widen_factor, # The scaling factor that controls the width of the network structure + in_channels=[256, 512, 1024], # The input channels, this is consistent with the output channels of backbone + out_channels=[256, 512, 1024], # The output channels of each level of the pyramid feature map, this is consistent with the input channels of head + num_csp_blocks=3, # The number of bottlenecks of CSPLayer + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), # The config of normalization layers. + act_cfg=dict(type='SiLU', inplace=True)), # The config of activation function + bbox_head=dict( + type='YOLOv5Head', # The type of BBox head is 'YOLOv5Head', we also support 'YOLOv6Head', 'YOLOXHead' + head_module=dict( + type='YOLOv5HeadModule', # The type of Head module is 'YOLOv5HeadModule', we also support 'YOLOv6HeadModule', 'YOLOXHeadModule' + num_classes=80, # Number of classes for classification + in_channels=[256, 512, 1024], # The input channels, this is consistent with the input channels of neck + widen_factor=widen_factor, # The scaling factor that controls the width of the network structure + featmap_strides=[8, 16, 32], # The strides of the multi-scale feature maps + num_base_priors=3), # The number of prior boxes on a certain point + prior_generator=dict( # The config of prior generator + type='mmdet.YOLOAnchorGenerator', # The prior generator uses 'YOLOAnchorGenerator. Refer to https://github.com/open-mmlab/mmdetection/blob/dev-3.x/mmdet/models/task_modules/prior_generators/anchor_generator.py for more details + base_sizes=anchors, # Basic scale of the anchor + strides=strides), # The strides of the anchor generator. This is consistent with the FPN feature strides. The strides will be taken as base_sizes if base_sizes is not set. + ), + test_cfg=dict( + multi_label=True, # The config of multi-label for multi-clas prediction. The default setting is True. + nms_pre=30000, # The number of boxes before NMS + score_thr=0.001, # Threshold to filter out boxes. + nms=dict(type='nms', # Type of NMS + iou_threshold=0.65), # NMS threshold + max_per_img=300)) # Max number of detections of each image +``` + +### Dataset and evaluator config + +[Dataloaders](https://pytorch.org/docs/stable/data.html?highlight=data%20loader#torch.utils.data.DataLoader) are required for the training, validation, and testing of the [runner](https://mmengine.readthedocs.io/en/latest/tutorials/runner.html). 
Dataset and data pipeline need to be set to build the dataloader. Due to the complexity of this part, we use intermediate variables to simplify the writing of dataloader configs. More complex data augmentation methods are adopted for the lightweight object detection algorithms in MMYOLO. Therefore, MMYOLO has a wider range of dataset configurations than other models in MMDetection. + +The training and testing data flow of YOLOv5 have a certain difference. We will introduce them separately here. + +```python +dataset_type = 'CocoDataset' # Dataset type, this will be used to define the dataset +data_root = 'data/coco/' # Root path of data + +pre_transform = [ # Training data loading pipeline + dict( + type='LoadImageFromFile'), # First pipeline to load images from file path + dict(type='LoadAnnotations', # Second pipeline to load annotations for current image + with_bbox=True) # Whether to use bounding box, True for detection +] + +albu_train_transforms = [ # Albumentation is introduced for image data augmentation. We follow the code of YOLOv5-v6.1, please make sure its version is 1.0.+ + dict(type='Blur', p=0.01), # Blur augmentation, the probability is 0.01 + dict(type='MedianBlur', p=0.01), # Median blue augmentation, the probability is 0.01 + dict(type='ToGray', p=0.01), # Randomly convert RGB to gray-scale image, the probability is 0.01 + dict(type='CLAHE', p=0.01) # CLAHE(Limited Contrast Adaptive Histogram Equalization) augmentation, the probability is 0.01 +] +train_pipeline = [ # Training data processing pipeline + *pre_transform, # Introduce the pre-defined training data loading processing + dict( + type='Mosaic', # Mosaic augmentation + img_scale=img_scale, # The image scale after Mosaic augmentation + pad_val=114.0, # Pixel values filled with empty areas + pre_transform=pre_transform), # Pre-defined training data loading pipeline + dict( + type='YOLOv5RandomAffine', # Random Affine augmentation for YOLOv5 + max_rotate_degree=0.0, # Maximum degrees of rotation transform + max_shear_degree=0.0, # Maximum degrees of shear transform + scaling_ratio_range=(0.5, 1.5), # Minimum and maximum ratio of scaling transform + border=(-img_scale[0] // 2, -img_scale[1] // 2), # Distance from height and width sides of input image to adjust output shape. Only used in mosaic dataset. + border_val=(114, 114, 114)), # Border padding values of 3 channels. + dict( + type='mmdet.Albu', # Albumentation of MMDetection + transforms=albu_train_transforms, # Pre-defined albu_train_transforms + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes' + }), + dict(type='YOLOv5HSVRandomAug'), # Random augmentation on HSV channel + dict(type='mmdet.RandomFlip', prob=0.5), # Random flip, the probability is 0.5 + dict( + type='mmdet.PackDetInputs', # Pipeline that formats the annotation data and decides which keys in the data should be packed into data_samples + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] +train_dataloader = dict( # Train dataloader config + batch_size=train_batch_size_pre_gpu, # Batch size of a single GPU during training + num_workers=train_num_workers, # Worker to pre-fetch data for each single GPU during training + persistent_workers=True, # If ``True``, the dataloader will not shut down the worker processes after an epoch end, which can accelerate training speed. 
+ pin_memory=True, # If ``True``, the dataloader will allow pinned memory, which can reduce copy time between CPU and memory + sampler=dict( # training data sampler + type='DefaultSampler', # DefaultSampler which supports both distributed and non-distributed training. Refer to https://github.com/open-mmlab/mmengine/blob/main/mmengine/dataset/sampler.py + shuffle=True), # randomly shuffle the training data in each epoch + dataset=dict( # Train dataset config + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_train2017.json', # Path of annotation file + data_prefix=dict(img='train2017/'), # Prefix of image path + filter_cfg=dict(filter_empty_gt=False, min_size=32), # Config of filtering images and annotations + pipeline=train_pipeline)) +``` + +In the testing phase of YOLOv5, the [Letter Resize](https://github.com/open-mmlab/mmyolo/blob/main/mmyolo/datasets/transforms/transforms.py#L116) method resizes all the test images to the same scale, which preserves the aspect ratio of all testing images. Therefore, the validation and testing phases share the same data pipeline. + +```python +test_pipeline = [ # Validation/ Testing dataloader config + dict( + type='LoadImageFromFile'), # First pipeline to load images from file path + dict(type='YOLOv5KeepRatioResize', # Second pipeline to resize images with the same aspect ratio + scale=img_scale), # Pipeline that resizes the images + dict( + type='LetterResize', # Third pipeline to rescale images to meet the requirements of different strides + scale=img_scale, # Target scale of image + allow_scale_up=False, # Allow scale up when radio > 1 + pad_val=dict(img=114)), # Padding value + dict(type='LoadAnnotations', with_bbox=True), # Forth pipeline to load annotations for current image + dict( + type='mmdet.PackDetInputs', # Pipeline that formats the annotation data and decides which keys in the data should be packed into data_samples + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] + +val_dataloader = dict( + batch_size=val_batch_size_pre_gpu, # Batch size of a single GPU + num_workers=val_num_workers, # Worker to pre-fetch data for each single GPU + persistent_workers=True, # If ``True``, the dataloader will not shut down the worker processes after an epoch end, which can accelerate training speed. + pin_memory=True, # If ``True``, the dataloader will allow pinned memory, which can reduce copy time between CPU and memory + drop_last=False, # IF ``True``, the dataloader will drop data, which fails to make a batch + sampler=dict( + type='DefaultSampler', # Default sampler for both distributed and normal training + shuffle=False), # not shuffle during validation and testing + dataset=dict( + type=dataset_type, + data_root=data_root, + test_mode=True, # # Turn on test mode of the dataset to avoid filtering annotations or images + data_prefix=dict(img='val2017/'), # Prefix of image path + ann_file='annotations/instances_val2017.json', # Path of annotation file + pipeline=test_pipeline, + batch_shapes_cfg=dict( # Config of batch shapes + type='BatchShapePolicy', # Policy that makes paddings with least pixels during batch inference process, which does not require the image scales of all batches to be the same throughout validation. 
+ batch_size=val_batch_size_pre_gpu, # Batch size for batch shapes strategy, equals to validation batch size on single GPU + img_size=img_scale[0], # Image scale + size_divisor=32, # The image scale of padding should be divided by pad_size_divisor + extra_pad_ratio=0.5))) # additional paddings for pixel scale + +test_dataloader = val_dataloader +``` + +[Evaluators](https://mmengine.readthedocs.io/en/latest/design/evaluation.html) are used to compute the metrics of the trained model on the validation and testing datasets. The config of evaluators consists of one or a list of metric configs: + +```python +val_evaluator = dict( # Validation evaluator config + type='mmdet.CocoMetric', # The coco metric used to evaluate AR, AP, and mAP for detection + proposal_nums=(100, 1, 10), # The number of proposal used to evaluate for detection + ann_file=data_root + 'annotations/instances_val2017.json', # Annotation file path + metric='bbox', # Metrics to be evaluated, `bbox` for detection +) +test_evaluator = val_evaluator # Testing evaluator config +``` + +Since the test dataset has no annotation files, the test_dataloader and test_evaluator config in MMYOLO are generally the same as the val's. If you want to save the detection results on the test dataset, you can write the config like this: + +```python +# inference on test dataset and +# format the output results for submission. +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'annotations/image_info_test-dev2017.json', + data_prefix=dict(img='test2017/'), + test_mode=True, + pipeline=test_pipeline)) +test_evaluator = dict( + type='mmdet.CocoMetric', + ann_file=data_root + 'annotations/image_info_test-dev2017.json', + metric='bbox', + format_only=True, # Only format and save the results to coco json file + outfile_prefix='./work_dirs/coco_detection/test') # The prefix of output json files +``` + +### Training and testing config + +MMEngine's runner uses Loop to control the training, validation, and testing processes. +Users can set the maximum training epochs and validation intervals with these fields. + +```python +max_epochs = 300 # Maximum training epochs: 300 epochs +save_epoch_intervals = 10 # Validation intervals. Run validation every 10 epochs. + +train_cfg = dict( + type='EpochBasedTrainLoop', # The training loop type. Refer to https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/loops.py + max_epochs=max_epochs, # Maximum training epochs: 300 epochs + val_interval=save_epoch_intervals) # Validation intervals. Run validation every 10 epochs. +val_cfg = dict(type='ValLoop') # The validation loop type +test_cfg = dict(type='TestLoop') # The testing loop type +``` + +MMEngine also supports dynamic intervals for evaluation. For example, you can run validation every 10 epochs on the first 280 epochs, and run validation every epoch on the final 20 epochs. The configurations are as follows. + +```python +max_epochs = 300 # Maximum training epochs: 300 epochs +save_epoch_intervals = 10 # Validation intervals. Run validation every 10 epochs. + +train_cfg = dict( + type='EpochBasedTrainLoop', # The training loop type. Refer to https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/loops.py + max_epochs=max_epochs, # Maximum training epochs: 300 epochs + val_interval=save_epoch_intervals, # Validation intervals. Run validation every 10 epochs. 
+ dynamic_intervals=[(280, 1)]) # Switch evaluation on 280 epoch and switch the interval to 1. +val_cfg = dict(type='ValLoop') # The validation loop type +test_cfg = dict(type='TestLoop') # The testing loop type +``` + +### Optimization config + +`optim_wrapper` is the field to configure optimization-related settings. The optimizer wrapper not only provides the functions of the optimizer but also supports functions such as gradient clipping, mixed precision training, etc. Find out more in the [optimizer wrapper tutorial](https://mmengine.readthedocs.io/en/latest/tutorials/optim_wrapper.html). + +```python +optim_wrapper = dict( # Optimizer wrapper config + type='OptimWrapper', # Optimizer wrapper type, switch to AmpOptimWrapper to enable mixed precision training. + optimizer=dict( # Optimizer config. Support all kinds of optimizers in PyTorch. Refer to https://pytorch.org/docs/stable/optim.html#algorithms + type='SGD', # Stochastic gradient descent optimizer + lr=0.01, # The base learning rate + momentum=0.937, # Stochastic gradient descent with momentum + weight_decay=0.0005, # Weight decay of SGD + nesterov=True, # Enable Nesterov momentum, Refer to http://www.cs.toronto.edu/~hinton/absps/momentum.pdf + batch_size_pre_gpu=train_batch_size_pre_gpu), # Enable automatic learning rate scaling + clip_grad=None, # Gradient clip option. Set None to disable gradient clip. Find usage in https://mmengine.readthedocs.io/en/latest/tutorials/optim_wrapper.html + constructor='YOLOv5OptimizerConstructor') # The constructor for YOLOv5 optimizer +``` + +`param_scheduler` is the field that configures methods of adjusting optimization hyperparameters such as learning rate and momentum. Users can combine multiple schedulers to create a desired parameter adjustment strategy. Find more in the [parameter scheduler tutorial](https://mmengine.readthedocs.io/en/latest/tutorials/param_scheduler.html). In YOLOv5, parameter scheduling is complex to implement and difficult to implement with `param_scheduler`. So we use `YOLOv5ParamSchedulerHook` to implement it (see next section), which is simpler but less versatile. + +```python +param_scheduler = None +``` + +### Hook config + +Users can attach hooks to training, validation, and testing loops to insert some operations during running. There are two different hook fields, one is `default_hooks` and the other is `custom_hooks`. + +`default_hooks` is a dict of hook configs for the hooks that must be required at the runtime. They have default priority which should not be modified. If not set, the runner will use the default values. To disable a default hook, users can set its config to `None`. + +```python +default_hooks = dict( + param_scheduler=dict( + type='YOLOv5ParamSchedulerHook', # MMYOLO uses `YOLOv5ParamSchedulerHook` to adjust hyper-parameters in optimizers + scheduler_type='linear', + lr_factor=0.01, + max_epochs=max_epochs), + checkpoint=dict( + type='CheckpointHook', # Hook to save model checkpoint on specific intervals + interval=save_epoch_intervals, # Save model checkpoint every 10 epochs. + max_keep_ckpts=3)) # The maximum checkpoints to keep. +``` + +`custom_hooks` is a list of hook configs. Users can develop their hooks and insert them in this field. + +```python +custom_hooks = [ + dict( + type='EMAHook', # A Hook to apply Exponential Moving Average (EMA) on the model during training. + ema_type='ExpMomentumEMA', # The type of EMA strategy to use. 
+ momentum=0.0001, # The momentum of EMA
+ update_buffers=True, # If ``True``, calculate the running averages of model parameters
+ priority=49) # Priority higher than NORMAL(50)
+]
+```
+
+### Runtime config
+
+```python
+default_scope = 'mmyolo' # The default registry scope to find modules. Refer to https://mmengine.readthedocs.io/en/latest/tutorials/registry.html
+
+env_cfg = dict(
+ cudnn_benchmark=True, # Whether to enable cudnn benchmark
+ mp_cfg=dict( # Multi-processing config
+ mp_start_method='fork', # Use fork to start multi-processing threads. 'fork' is usually faster than 'spawn' but may be unsafe. See discussion in https://github.com/pytorch/pytorch/issues/1355
+ opencv_num_threads=0), # Disable opencv multi-threads to avoid system being overloaded
+ dist_cfg=dict(backend='nccl'), # Distribution configs
+)
+
+vis_backends = [dict(type='LocalVisBackend')] # Visualization backends. Refer to: https://mmengine.readthedocs.io/zh_CN/latest/advanced_tutorials/visualization.html
+visualizer = dict(
+ type='mmdet.DetLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+log_processor = dict(
+ type='LogProcessor', # Log processor to process runtime logs
+ window_size=50, # Smooth interval of log values
+ by_epoch=True) # Whether to format logs with epoch style. Should be consistent with the train loop's type.
+
+log_level = 'INFO' # The level of logging.
+load_from = None # Load model checkpoint as a pre-trained model from a given path. This will not resume training.
+resume = False # Whether to resume from the checkpoint defined in `load_from`. If `load_from` is None, it will resume the latest checkpoint in the `work_dir`.
+```
+
+## Config file inheritance
+
+`config/_base_` contains the default runtime settings. The configs that are composed of components from `_base_` are called _primitive_.
+
+For all configs under the same folder, it is recommended to have only **one** _primitive_ config. All other configs should be inherited from the _primitive_ config. In this way, the maximum inheritance level is 3.
+
+For easy understanding, we recommend contributors inherit from existing methods.
+For example, if some modification is made based on YOLOv5-s, such as modifying the depth of the network, users may first inherit `_base_ = './yolov5_s-v61_syncbn_8xb16-300e_coco.py'`, then modify the necessary fields in the config files.
+
+If you are building an entirely new method that does not share the structure with any of the existing methods, you may create a folder `yolov100` under `configs` and place your configs there.
+
+Please refer to the [mmengine config tutorial](https://mmengine.readthedocs.io/en/latest/tutorials/config.html) for more details.
+
+The `_base_` field specifies which files the current configuration file inherits from.
+
+When `_base_` is a string of a file path, it means inheriting the contents of one config file.
+
+```python
+_base_ = '../_base_/default_runtime.py'
+```
+
+When `_base_` is a list of multiple file paths, it means inheriting multiple files.
+
+```python
+_base_ = [
+ './yolov5_s-v61_syncbn_8xb16-300e_coco.py',
+ '../_base_/default_runtime.py'
+]
+```
+
+If you wish to inspect the config file, you may run `mim run mmdet print_config /PATH/TO/CONFIG` to see the complete config.
+
+### Ignore some fields in the base configs
+
+Sometimes, you may set `_delete_=True` to ignore some of the fields in base configs.
+You may refer to the [mmengine config tutorial](https://mmengine.readthedocs.io/en/latest/tutorials/config.html) for a simple illustration.
+ +In MMYOLO, for example, to change the backbone of RTMDet with the following config. + +```python +model = dict( + type='YOLODetector', + data_preprocessor=dict(...), + backbone=dict( + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=deepen_factor, + widen_factor=widen_factor, + channel_attention=True, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='SiLU', inplace=True)), + neck=dict(...), + bbox_head=dict(...)) +``` + +If you want to change `CSPNeXt` to `YOLOv6EfficientRep` for the RTMDet backbone, because there are different fields (`channel_attention` and `expand_ratio`) in `CSPNeXt` and `YOLOv6EfficientRep`, you need to use `_delete_=True` to replace all the old keys in the `backbone` field with the new keys. + +```python +_base_ = '../rtmdet/rtmdet_l_syncbn_8xb32-300e_coco.py' +model = dict( + backbone=dict( + _delete_=True, + type='YOLOv6EfficientRep', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='ReLU', inplace=True)), + neck=dict(...), + bbox_head=dict(...)) +``` + +### Use intermediate variables in configs + +Some intermediate variables are used in the configs files, like `train_pipeline` and `test_pipeline` in datasets. It's worth noting that when modifying intermediate variables in the children configs, users need to pass the intermediate variables into corresponding fields again. +For example, we would like to change the `image_scale` during training and add `YOLOv5MixUp` data augmentation, `img_scale/train_pipeline/test_pipeline` are intermediate variables we would like to modify. + +```python +_base_ = './yolov5_s-v61_syncbn_8xb16-300e_coco.py' + +img_scale = (1280, 1280) # image height, image width +affine_scale = 0.9 + +mosaic_affine_pipeline = [ + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)) +] + +train_pipeline = [ + *pre_transform, *mosaic_affine_pipeline, + dict( + type='YOLOv5MixUp', # MixUp augmentation of YOLOv5 + prob=0.1, # the probability of YOLOv5MixUp + pre_transform=[*pre_transform,*mosaic_affine_pipeline]), # Pre-defined Training data pipeline and MixUp augmentation. + dict( + type='mmdet.Albu', + transforms=albu_train_transforms, + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes' + }), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +test_pipeline = [ + dict( + type='LoadImageFromFile'), + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=False, + pad_val=dict(img=114)), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +``` + +We first define a new `train_pipeline`/`test_pipeline` and pass it into `data`. 
+ +Likewise, if we want to switch from `SyncBN` to `BN` or `MMSyncBN`, we need to modify every `norm_cfg` in the configuration file. + +```python +_base_ = './yolov5_s-v61_syncbn_8xb16-300e_coco.py' +norm_cfg = dict(type='BN', requires_grad=True) +model = dict( + backbone=dict(norm_cfg=norm_cfg), + neck=dict(norm_cfg=norm_cfg), + ...) +``` + +### Reuse variables in \_base\_ file + +If the users want to reuse the variables in the base file, they can get a copy of the corresponding variable by using `{{_base_.xxx}}`. The latest version of MMEngine also supports reusing variables without `{{}}` usage. + +E.g: + +```python +_base_ = '../_base_/default_runtime.py' + +pre_transform = _base_.pre_transform # `pre_transform` equals to `pre_transform` in the _base_ config +``` + +## Modify config through script arguments + +When submitting jobs using `tools/train.py` or `tools/test.py`, you may specify `--cfg-options` to in-place modify the config. + +- Update config keys of dict chains. + + The config options can be specified following the order of the dict keys in the original config. + For example, `--cfg-options model.backbone.norm_eval=False` changes the all BN modules in model backbones to `train` mode. + +- Update keys inside a list of configs. + + Some config dicts are composed as a list in your config. For example, the training pipeline `train_dataloader.dataset.pipeline` is normally a list, e.g. `[dict(type='LoadImageFromFile'), ...]`. If you want to change `'LoadImageFromFile'` to `'LoadImageFromNDArray'` in the pipeline, you may specify `--cfg-options data.train.pipeline.0.type=LoadImageFromNDArray`. + +- Update values of list/tuples. + + Sometimes the value to update is a list or a tuple, for example, the config file normally sets `model.data_preprocessor.mean=[123.675, 116.28, 103.53]`. If you want to change the mean values, you may specify `--cfg-options model.data_preprocessor.mean="[127,127,127]"`. Note that the quotation mark `"` is necessary to support list/tuple data types, and that **NO** white space is allowed inside the quotation marks in the specified value. + +## Config name style + +We follow the below style to name config files. Contributors are advised to follow the same style. + +``` +{algorithm name}_{model component names [component1]_[component2]_[...]}-[version id]_[norm setting]_[data preprocessor type]_{training settings}_{training dataset information}_[testing dataset information].py +``` + +The file name is divided into 8 name fields, which have 4 required parts and 4 optional parts. All parts and components are connected with `_` and words of each part or component should be connected with `-`. `{}` indicates the required name field, and `[]` indicates the optional name field. + +- `{algorithm name}`: The name of the algorithm. It can be a detector name such as `yolov5`, `yolov6`, `yolox`, etc. +- `{component names}`: Names of the components used in the algorithm such as backbone, neck, etc. For example, `yolov5_s` means its `deepen_factor` is `0.33` and its `widen_factor` is `0.5`. +- `[version_id]` (optional): Since the evolution of the YOLO series is much faster than traditional object detection algorithms, `version id` is used to distinguish the differences between different sub-versions. E.g, YOLOv5-3.0 uses the `Focus` layer as the stem layer, and YOLOv5-6.0 uses the `Conv` layer as the stem layer. 
+- `[norm_setting]` (optional): `bn` indicates `Batch Normalization`, `syncbn` indicates `Synchronized Batch Normalization`.
+- `[data preprocessor type]` (optional): `fast` incorporates [YOLOv5DetDataPreprocessor](https://github.com/open-mmlab/mmyolo/blob/main/mmyolo/models/data_preprocessors/data_preprocessor.py#L9) and [yolov5_collate](https://github.com/open-mmlab/mmyolo/blob/main/mmyolo/datasets/utils.py#L12) to preprocess data. The training speed is faster than with the default `mmdet.DetDataPreprocessor`, at the cost of reduced flexibility when extending the pipeline to multi-task learning.
+- `{training settings}`: Information of training settings such as batch size, augmentations, loss trick, scheduler, and epochs/iterations. For example: `8xb16-300e_coco` means using 8-GPUs x 16-images-per-GPU and training for 300 epochs.
+ Some abbreviations:
+ - `{gpu x batch_per_gpu}`: GPUs and samples per GPU. For example, `4xb4` is the short form of 4-GPUs x 4-images-per-GPU.
+ - `{schedule}`: training schedule, default option in MMYOLO is 300 epochs.
+- `{training dataset information}`: Training dataset names like `coco`, `cityscapes`, `voc-0712`, `wider-face`, and `balloon`.
+- `[testing dataset information]` (optional): Testing dataset name for models trained on one dataset but tested on another. If not mentioned, it means the model was trained and tested on the same dataset type.
diff --git a/third_party/mmyolo/docs/en/tutorials/custom_installation.md b/third_party/mmyolo/docs/en/tutorials/custom_installation.md
new file mode 100644
index 0000000000000000000000000000000000000000..604a77a305c590ffb598d582208732b136f99cf3
--- /dev/null
+++ b/third_party/mmyolo/docs/en/tutorials/custom_installation.md
@@ -0,0 +1,109 @@
+# Customize Installation
+
+## CUDA versions
+
+When installing PyTorch, you need to specify the version of CUDA. If you are not clear on which to choose, follow our recommendations:
+
+- For Ampere-based NVIDIA GPUs, such as GeForce 30 series and NVIDIA A100, CUDA 11 is a must.
+- For older NVIDIA GPUs, CUDA 11 is backward compatible, but CUDA 10.2 offers better compatibility and is more lightweight.
+
+Please make sure the GPU driver satisfies the minimum version requirements. See [this table](https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cuda-major-component-versions__table-cuda-toolkit-driver-versions) for more information.
+
+```{note}
+Installing CUDA runtime libraries is enough if you follow our best practices, because no CUDA code will be compiled locally. However, if you hope to compile MMCV from source or develop other CUDA operators, you need to install the complete CUDA toolkit from NVIDIA's [website](https://developer.nvidia.com/cuda-downloads), and its version should match the CUDA version of PyTorch, i.e., the version of cudatoolkit specified in the `conda install` command.
+```
+
+## Install MMEngine without MIM
+
+To install MMEngine with pip instead of MIM, please follow the [MMEngine installation guides](https://mmengine.readthedocs.io/en/latest/get_started/installation.html).
+
+For example, you can install MMEngine by the following command.
+
+```shell
+pip install "mmengine>=0.6.0"
+```
+
+## Install MMCV without MIM
+
+MMCV contains C++ and CUDA extensions, thus depending on PyTorch in a complex way. MIM solves such dependencies automatically and makes the installation easier. However, it is not a must.
+
+To install MMCV with pip instead of MIM, please follow the [MMCV installation guides](https://mmcv.readthedocs.io/en/2.x/get_started/installation.html).
This requires manually specifying a find-url based on the PyTorch version and its CUDA version.
+
+For example, the following command installs MMCV built for PyTorch 1.12.x and CUDA 11.6.
+
+```shell
+pip install "mmcv>=2.0.0rc4" -f https://download.openmmlab.com/mmcv/dist/cu116/torch1.12.0/index.html
+```
+
+## Install on CPU-only platforms
+
+MMDetection can be built for a CPU-only environment. In CPU mode you can train (requires MMCV version >= `2.0.0rc1`), test, or infer a model.
+
+However, some functionalities are unavailable in this mode:
+
+- Deformable Convolution
+- Modulated Deformable Convolution
+- ROI pooling
+- Deformable ROI pooling
+- CARAFE
+- SyncBatchNorm
+- CrissCrossAttention
+- MaskedConv2d
+- Temporal Interlace Shift
+- nms_cuda
+- sigmoid_focal_loss_cuda
+- bbox_overlaps
+
+If you try to train/test/infer a model containing the above ops, an error will be raised.
+The following table lists affected algorithms.
+
+| Operator | Model |
+| :-----------------------------------------------------: | :--------------------------------------------------------------------------------------: |
+| Deformable Convolution/Modulated Deformable Convolution | DCN, Guided Anchoring, RepPoints, CentripetalNet, VFNet, CascadeRPN, NAS-FCOS, DetectoRS |
+| MaskedConv2d | Guided Anchoring |
+| CARAFE | CARAFE |
+| SyncBatchNorm | ResNeSt |
+
+## Install on Google Colab
+
+[Google Colab](https://research.google.com/) usually has PyTorch installed,
+thus we only need to install MMEngine, MMCV, MMDetection, and MMYOLO with the following commands.
+
+**Step 1.** Install [MMEngine](https://github.com/open-mmlab/mmengine) and [MMCV](https://github.com/open-mmlab/mmcv) using [MIM](https://github.com/open-mmlab/mim).
+
+```shell
+!pip3 install openmim
+!mim install "mmengine>=0.6.0"
+!mim install "mmcv>=2.0.0rc4,<2.1.0"
+!mim install "mmdet>=3.0.0,<4.0.0"
+```
+
+**Step 2.** Install MMYOLO from source.
+
+```shell
+!git clone https://github.com/open-mmlab/mmyolo.git
+%cd mmyolo
+!pip install -e .
+```
+
+**Step 3.** Verification.
+
+```python
+import mmyolo
+print(mmyolo.__version__)
+# Example output: 0.1.0, or another version.
+```
+
+```{note}
+Within Jupyter, the exclamation mark `!` is used to call external executables and `%cd` is a [magic command](https://ipython.readthedocs.io/en/stable/interactive/magics.html#magic-cd) to change the current working directory of Python.
+```
+
+## Develop using multiple MMYOLO versions
+
+The training and testing scripts modify `PYTHONPATH` to ensure that they use the MMYOLO in the current directory.
+
+To use the default MMYOLO installed in your environment instead of the one in the current directory, remove the following line from the relevant scripts:
+
+```shell
+PYTHONPATH="$(dirname $0)/..":$PYTHONPATH
+```
diff --git a/third_party/mmyolo/docs/en/tutorials/data_flow.md b/third_party/mmyolo/docs/en/tutorials/data_flow.md
new file mode 100644
index 0000000000000000000000000000000000000000..ab0e2e64a6a47592e8109d468bb8e9109cc08073
--- /dev/null
+++ b/third_party/mmyolo/docs/en/tutorials/data_flow.md
@@ -0,0 +1,121 @@
+# Mixed image data augmentation update
+
+Mixed image data augmentation, such as Mosaic and MixUp, needs to fetch the annotation information of several other images for fusion while it is running. However, in the OpenMMLab data augmentation pipeline, other indexes of the dataset are generally not available.
In order to achieve the above function, in the YOLOX reproduced in MMDetection, the concept of [MultiImageMixDataset](https://github.com/open-mmlab/mmdetection/blob/master/mmdet/datasets/dataset_wrappers.py#L338) dataset wrapper is proposed. + +`MultiImageMixDataset` dataset wrapper will include some data augmentation methods such as `Mosaic` and `RandAffine`, while `CocoDataset` will also need to include a `pipeline` to achieve the image and annotation loading function. In this way, we can achieve mixed data augmentation quickly. The configuration method is as follows: + +```python +train_pipeline = [ + dict(type='Mosaic', img_scale=img_scale, pad_val=114.0), + dict( + type='RandomAffine', + scaling_ratio_range=(0.1, 2), + border=(-img_scale[0] // 2, -img_scale[1] // 2)), + dict( + type='MixUp', + img_scale=img_scale, + ratio_range=(0.8, 1.6), + pad_val=114.0), + ... +] +train_dataset = dict( + # use MultiImageMixDataset wrapper to support mosaic and mixup + type='MultiImageMixDataset', + dataset=dict( + type='CocoDataset', + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True) + ]), + pipeline=train_pipeline) + +``` + +However, this implementation has a disadvantage: users unfamiliar with MMDetection will forget those data augmentation methods like Mosaic must be used together with `MultiImageMixDataset`, increasing the usage complexity. Moreover, it is hard to understand as well. + +To address this problem, further simplifications are made in MMYOLO, which directly lets `pipeline` get `dataset`. In this way, the implementation of `Mosaic` and other data augmentation methods can be achieved and used just as the random flip, without a data wrapper anymore. The new configuration method is as follows: + +```python +pre_transform = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True) +] +train_pipeline = [ + *pre_transform, + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='mmdet.RandomAffine', + scaling_ratio_range=(0.1, 2), + border=(-img_scale[0] // 2, -img_scale[1] // 2)), + dict( + type='YOLOXMixUp', + img_scale=img_scale, + ratio_range=(0.8, 1.6), + pad_val=114.0, + pre_transform=pre_transform), + ... +] +``` + +A more complex YOLOv5-m configuration including MixUp is shown as follows: + +```python +mosaic_affine_pipeline = [ + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)) +] + +# enable mixup +train_pipeline = [ + *pre_transform, *mosaic_affine_pipeline, + dict( + type='YOLOv5MixUp', + prob=0.1, + pre_transform=[*pre_transform, *mosaic_affine_pipeline]), + dict( + type='mmdet.Albu', + transforms=albu_train_transforms, + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes' + }), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] +``` + +It is very easy to use, just pass the object of Dataset to the pipeline. 
+
+```python
+def prepare_data(self, idx) -> Any:
+ """Pass the dataset to the pipeline during training to support mixed
+ data augmentation, such as Mosaic and MixUp."""
+ if self.test_mode is False:
+ data_info = self.get_data_info(idx)
+ data_info['dataset'] = self
+ return self.pipeline(data_info)
+ else:
+ return super().prepare_data(idx)
+```
diff --git a/third_party/mmyolo/docs/en/tutorials/faq.md b/third_party/mmyolo/docs/en/tutorials/faq.md
new file mode 100644
index 0000000000000000000000000000000000000000..ca2a0b25fa54a928df81ce7214625d7cd7df4977
--- /dev/null
+++ b/third_party/mmyolo/docs/en/tutorials/faq.md
@@ -0,0 +1,101 @@
+# Frequently Asked Questions
+
+We list some common problems many users face and their corresponding solutions here. Feel free to enrich the list if you find any frequent issues and have ways to help others solve them. If the contents here do not cover your issue, please create an [issue](https://github.com/open-mmlab/mmyolo/issues/new/choose) and make sure you fill in all the required information in the template.
+
+## Why do we need to launch MMYOLO?
+
+Why do we need to launch MMYOLO? Why do we need to open a separate repository instead of putting it directly into MMDetection? Since the project was open-sourced, we have kept receiving similar questions from our community partners, and the answers can be summarized in the following three points.
+
+**(1) Unified operation and inference platform**
+
+At present, the field of object detection has many popular improved YOLO algorithms, but they are implemented in different frameworks with different back ends and differ significantly from one another, so there is no unified, convenient, and fair evaluation process from training to deployment.
+
+**(2) License limitations**
+
+As we all know, YOLOv5 and its derived algorithms, such as YOLOv6 and YOLOv7, are released under the GPL-3.0 license, which differs from the Apache-2.0 license of MMDetection. Therefore, due to this license conflict, it is not possible to incorporate MMYOLO directly into MMDetection.
+
+**(3) Multitasking support**
+
+There is another far-reaching reason: **the tasks in MMYOLO are not limited to those of MMDetection**, and more tasks will be supported in the future, such as MMPose-based keypoint applications and MMTracking-based tracking applications, so it is not suitable to incorporate MMYOLO directly into MMDetection.
+
+## What is the projects folder used for?
+
+The `projects` folder is newly introduced in OpenMMLab 2.0. There are three primary purposes:
+
+1. facilitate community contributors: Since OpenMMLab series codebases have a rigorous code management process, this inevitably leads to long algorithm reproduction cycles, which is not friendly to community contributions.
+2. facilitate rapid support for new algorithms: A long development cycle can also lead to another problem: users may not be able to experience the latest algorithms as soon as possible.
+3. facilitate rapid support for new approaches and features: New approaches or new features may be incompatible with the current design of the codebases and cannot be quickly incorporated.
+
+In summary, the `projects` folder solves the problems of slow support for new algorithms and complicated support for new features caused by the long algorithm reproduction cycle. Each folder in `projects` is an entirely independent project, and community users can quickly support some algorithms in the current version through `projects`.
This allows the community to quickly use new algorithms and features that are difficult to adapt in the current version. When the design is stable or the code meets the merge specification, it will be considered to merge into the main branch. + +## Why does the performance drop significantly by switching the YOLOv5 backbone to Swin? + +In [Replace the backbone network](../recommended_topics/replace_backbone.md), we provide many tutorials on replacing the backbone module. However, you may not get a desired result once you replace the module and start directly training the model. This is because different networks have very distinct hyperparameters. Take the backbones of Swin and YOLOv5 as an example. Swin belongs to the transformer family, and the YOLOv5 is a convolutional network. Their training optimizers, learning rates, and other hyperparameters are different. If we force using Swin as the backbone of YOLOv5 and try to get a moderate performance, we must modify many parameters. + +## How to use the components implemented in all MM series repositories? + +In OpenMMLab 2.0, we have enhanced the ability to use different modules across MM series libraries. Currently, users can call any module that has been registered in MM series algorithm libraries via `MM Algorithm Library A. Module Name`. We demonstrated using MMClassification backbones in the [Replace the backbone network](../recommended_topics/replace_backbone.md). Other modules can be used in the same way. + +## Can pure background pictures be added in MMYOLO for training? + +Adding pure background images to training can suppress the false positive rate in most scenarios, and this feature has already been supported for most datasets. Take `YOLOv5CocoDataset` as an example. The control parameter is `train_dataloader.dataset.filter_cfg.filter_empty_gt`. If `filter_empty_gt` is True, the pure background images will be filtered out and not used in training, and vice versa. Most of the algorithms in MMYOLO have added this feature by default. + +## Is there a script to calculate the inference FPS in MMYOLO? + +MMYOLO is based on MMDet 3.x, which provides a [benchmark script](https://github.com/open-mmlab/mmdetection/blob/3.x/tools/analysis_tools/benchmark.py) to calculate the inference FPS. We recommend using `mim` to run the script in MMDet directly across the library instead of copying them to MMYOLO. More details about `mim` usages can be found at [Use mim to run scripts from other OpenMMLab repositories](../common_usage/mim_usage.md). + +## What is the difference between MMDeploy and EasyDeploy? + +MMDeploy is developed and maintained by the OpenMMLab deployment team to provide model deployment solutions for the OpenMMLab series algorithms, which support various inference backends and customization features. EasyDeploy is an easier and more lightweight deployment project provided by the community. However, it does not support as many features as MMDeploy. Users can choose which one to use in MMYOLO according to their needs. + +## How to check the AP of every category in COCOMetric? + +Just set `test_evaluator.classwise` to True or add `--cfg-options test_evaluator.classwise=True` when running the test script. + +## Why doesn't MMYOLO support the auto-learning rate scaling feature as MMDet? + +It is because the YOLO series algorithms are not very well suited for linear scaling. We have verified on several datasets that the performance is better without the auto-scaling based on batch size. 
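+
+For reference, the `classwise` switch mentioned in the `COCOMetric` question above can also be written directly in the config rather than passed via `--cfg-options`. The snippet below is only a sketch; the annotation file path is an assumed example and should match your own dataset settings.
+
+```python
+# Sketch only: report per-category AP with the COCO metric.
+# The `ann_file` path is an assumption for illustration.
+val_evaluator = dict(
+    type='mmdet.CocoMetric',
+    ann_file='data/coco/annotations/instances_val2017.json',
+    metric='bbox',
+    classwise=True)  # print the AP of every category after evaluation
+test_evaluator = val_evaluator
+```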
+
+## Why is the weight size of my trained model larger than the official one?
+
+The reason is that user-trained weights usually include extra data such as the `optimizer`, `ema_state_dict`, and `message_hub` states. These are removed from the weights we publish, but are kept in the checkpoints users train themselves. You can use [publish_model.py](https://github.com/open-mmlab/mmyolo/blob/main/tools/misc/publish_model.py) to remove these unnecessary components.
+
+## Why does RTMDet cost more graphics memory during training than YOLOv5?
+
+It is due to the assigner in RTMDet. YOLOv5 uses a simple and efficient shape-matching assigner, while RTMDet uses a dynamic soft label assigner computed over the entire batch. Therefore, it consumes more memory in its internal cost matrix, especially when there are many labeled bboxes in the current batch. We are considering solving this problem soon.
+
+## Do I need to reinstall MMYOLO after modifying some code?
+
+If you installed MMYOLO with `mim install -v -e .` and only modify existing Python code, the changes take effect without reinstalling. However, if you add new Python files and use them, you need to reinstall with `mim install -v -e .`.
+
+## How to use multiple versions of MMYOLO to develop?
+
+If users have multiple versions of MMYOLO, such as mmyolo-v1 and mmyolo-v2, they can specify the target version of MMYOLO by using this command in the shell:
+
+```shell
+PYTHONPATH="$(dirname $0)/..":$PYTHONPATH
+```
+
+Users can unset `PYTHONPATH` when they want to switch back to the default MMYOLO with this command:
+
+```shell
+unset PYTHONPATH
+```
+
+## How to save the best checkpoints during training?
+
+Users can choose which metric to use for filtering the best models by setting `default_hooks.checkpoint.save_best` in the configuration. Take the COCO dataset detection task as an example. Users can customize `default_hooks.checkpoint.save_best` with these parameters:
+
+1. `auto` works based on the first evaluation metric in the validation set.
+2. `coco/bbox_mAP` works based on `bbox_mAP`.
+3. `coco/bbox_mAP_50` works based on `bbox_mAP_50`.
+4. `coco/bbox_mAP_75` works based on `bbox_mAP_75`.
+5. `coco/bbox_mAP_s` works based on `bbox_mAP_s`.
+6. `coco/bbox_mAP_m` works based on `bbox_mAP_m`.
+7. `coco/bbox_mAP_l` works based on `bbox_mAP_l`.
+
+In addition, users can also choose the filtering logic by setting `default_hooks.checkpoint.rule` in the configuration. For example, `default_hooks.checkpoint.rule=greater` means that a larger value of the metric is considered better. More details can be found at [checkpoint_hook](https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/checkpoint_hook.py).
+
+## How to train and test with non-square input sizes?
+
+The default configurations of the YOLO series algorithms are mostly squares like 640x640 or 1280x1280. However, if users want to train with a non-square shape, they can modify `img_scale` to the desired value in the configuration. A more detailed example can be found at [yolov5_s-v61_fast_1xb12-40e_608x352_cat.py](https://github.com/open-mmlab/mmyolo/tree/dev/configs/yolov5/yolov5_s-v61_fast_1xb12-40e_608x352_cat.py).
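+
+Relatedly, the best-checkpoint filtering described a couple of questions above can be written directly in the config. The values below (interval, metric, and rule) are illustrative assumptions, not a snippet from a released config.
+
+```python
+# Sketch only: keep the checkpoint with the best COCO bbox mAP on the validation set.
+default_hooks = dict(
+    checkpoint=dict(
+        type='CheckpointHook',
+        interval=10,                # still save a regular checkpoint every 10 epochs
+        save_best='coco/bbox_mAP',  # metric used to select the best checkpoint
+        rule='greater',             # a larger value of this metric is better
+        max_keep_ckpts=3))          # limit the number of regular checkpoints kept
+```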
diff --git a/third_party/mmyolo/docs/en/tutorials/rotated_detection.md b/third_party/mmyolo/docs/en/tutorials/rotated_detection.md
new file mode 100644
index 0000000000000000000000000000000000000000..c0addb015f92b9f98926eacfcc82192b3a9c63ac
--- /dev/null
+++ b/third_party/mmyolo/docs/en/tutorials/rotated_detection.md
@@ -0,0 +1,3 @@
+# Rotated Object Detection
+
+TODO
diff --git a/third_party/mmyolo/docs/en/tutorials/warning_notes.md b/third_party/mmyolo/docs/en/tutorials/warning_notes.md
new file mode 100644
index 0000000000000000000000000000000000000000..791cd9d4bbf6d5f20a36d9f00a88097cfabde5e7
--- /dev/null
+++ b/third_party/mmyolo/docs/en/tutorials/warning_notes.md
@@ -0,0 +1,24 @@
+# Common Warning Notes
+
+The purpose of this document is to collect warning messages that users often find confusing, and provide explanations to facilitate understanding.
+
+## xxx registry in mmyolo did not set import location
+
+The complete warning message is: "The xxx registry in mmyolo did not set import location. Fallback to call `mmyolo.utils.register_all_modules` instead."
+
+This warning means that no import location was set for the registry, so the location of the module cannot be determined automatically. Therefore, `mmyolo.utils.register_all_modules` is called as a fallback to trigger the package import.
+This is a low-level warning from MMEngine that may be hard to interpret, but it has no impact on actual use and can be safely ignored.
+
+## save_param_schedulers is true but self.param_schedulers is None
+
+Taking YOLOv5 as an example: the warning appears because the parameter scheduling strategy has been re-implemented as `YOLOv5ParamSchedulerHook`, so the ParamScheduler designed in MMEngine is not used, while `save_param_schedulers` is not set to False in the YOLOv5 configuration.
+
+This warning has no impact on performance or on resuming training. If it bothers you, set `default_hooks.checkpoint.save_param_scheduler` to False in the config, or pass `--cfg-options default_hooks.checkpoint.save_param_scheduler=False` on the command line when training.
+
+## The loss_cls will be 0. This is a normal phenomenon.
+
+This is related to specific algorithms. Taking YOLOv5 as an example, its classification loss only considers positive samples. If the number of classes is 1, the classification loss and the objectness loss are functionally redundant. Therefore, by design, loss_cls is not calculated and is always 0 when the number of classes is 1. This is a normal phenomenon.
+
+## The model and loaded state dict do not match exactly
+
+Whether this warning affects performance needs to be judged from more information. If it occurs during fine-tuning, it is normal: the COCO pre-trained weights of the head module cannot be loaded because the user's custom classes differ, and performance is not affected.
diff --git a/third_party/mmyolo/docs/en/useful_tools/browse_coco_json.md b/third_party/mmyolo/docs/en/useful_tools/browse_coco_json.md
new file mode 100644
index 0000000000000000000000000000000000000000..772b8a56ff143676a0c05249203d3bffb3f33527
--- /dev/null
+++ b/third_party/mmyolo/docs/en/useful_tools/browse_coco_json.md
@@ -0,0 +1,62 @@
+# Visualize COCO labels
+
+`tools/analysis_tools/browse_coco_json.py` is a script that visualizes COCO annotations on the images.
+
+```shell
+python tools/analysis_tools/browse_coco_json.py [--data-root ${DATA_ROOT}] \
+ [--img-dir ${IMG_DIR}] \
+ [--ann-file ${ANN_FILE}] \
+ [--wait-time ${WAIT_TIME}] \
+ [--disp-all] [--category-names CATEGORY_NAMES [CATEGORY_NAMES ...]] \
+ [--shuffle]
+```
+
+If images and labels are in the same folder, you can set `--data-root` to that folder and then set `--img-dir` and `--ann-file` to paths relative to it; the script joins them automatically.
+If the image and label files are not in the same folder, do not specify `--data-root`; instead, pass absolute paths directly to `--img-dir` and `--ann-file`.
+
+E.g.:
+
+1. Visualize all categories of `COCO` and display all types of annotations such as `bbox` and `mask`:
+
+```shell
+python tools/analysis_tools/browse_coco_json.py --data-root './data/coco' \
+ --img-dir 'train2017' \
+ --ann-file 'annotations/instances_train2017.json' \
+ --disp-all
+```
+
+If images and labels are not in the same folder, you can use absolute paths:
+
+```shell
+python tools/analysis_tools/browse_coco_json.py --img-dir '/dataset/image/coco/train2017' \
+ --ann-file '/label/instances_train2017.json' \
+ --disp-all
+```
+
+2. Visualize all categories of `COCO`, display only the `bbox` type annotations, and shuffle the images before showing them:
+
+```shell
+python tools/analysis_tools/browse_coco_json.py --data-root './data/coco' \
+ --img-dir 'train2017' \
+ --ann-file 'annotations/instances_train2017.json' \
+ --shuffle
+```
+
+3. Visualize only the `bicycle` and `person` categories of `COCO`, displaying only the `bbox` type annotations:
+
+```shell
+python tools/analysis_tools/browse_coco_json.py --data-root './data/coco' \
+ --img-dir 'train2017' \
+ --ann-file 'annotations/instances_train2017.json' \
+ --category-names 'bicycle' 'person'
+```
+
+4. Visualize all categories of `COCO`, display all types of annotations such as `bbox` and `mask`, and shuffle the images before showing them:
+
+```shell
+python tools/analysis_tools/browse_coco_json.py --data-root './data/coco' \
+ --img-dir 'train2017' \
+ --ann-file 'annotations/instances_train2017.json' \
+ --disp-all \
+ --shuffle
+```
diff --git a/third_party/mmyolo/docs/en/useful_tools/browse_dataset.md b/third_party/mmyolo/docs/en/useful_tools/browse_dataset.md
new file mode 100644
index 0000000000000000000000000000000000000000..f066d22545f9896f6c60ab4cf3303b7137b26629
--- /dev/null
+++ b/third_party/mmyolo/docs/en/useful_tools/browse_dataset.md
@@ -0,0 +1,42 @@
+# Visualize Datasets
+
+`tools/analysis_tools/browse_dataset.py` helps users browse a detection dataset (both images and bounding box annotations) visually, or save the visualized images to a designated directory.
+
+```shell
+python tools/analysis_tools/browse_dataset.py ${CONFIG} \
+ [--out-dir ${OUT_DIR}] \
+ [--not-show] \
+ [--show-interval ${SHOW_INTERVAL}]
+```
+
+E.g.:
+
+1. Use the config file `configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py` to visualize the pictures. The pictures will pop up directly and be saved to the directory `work_dirs/browse_dataset` at the same time:
+
+```shell
+python tools/analysis_tools/browse_dataset.py 'configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py' \
+ --out-dir 'work_dirs/browse_dataset'
+```
+
+2. Use the config file `configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py` to visualize the pictures. The pictures will pop up and each one is displayed for `10` seconds.
At the same time, they will be saved to the directory `work_dirs/browse_dataset`:
+
+```shell
+python tools/analysis_tools/browse_dataset.py 'configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py' \
+ --out-dir 'work_dirs/browse_dataset' \
+ --show-interval 10
+```
+
+3. Use the config file `configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py` to visualize the pictures. The pictures will pop up and each one is displayed for `10` seconds; they will not be saved:
+
+```shell
+python tools/analysis_tools/browse_dataset.py 'configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py' \
+ --show-interval 10
+```
+
+4. Use the config file `configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py` to visualize the pictures. The pictures will not pop up; they are only saved to the directory `work_dirs/browse_dataset`:
+
+```shell
+python tools/analysis_tools/browse_dataset.py 'configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py' \
+ --out-dir 'work_dirs/browse_dataset' \
+ --not-show
+```
diff --git a/third_party/mmyolo/docs/en/useful_tools/dataset_analysis.md b/third_party/mmyolo/docs/en/useful_tools/dataset_analysis.md
new file mode 100644
index 0000000000000000000000000000000000000000..c6149e9435f92a911daafe3ce2ba963d6bd4619b
--- /dev/null
+++ b/third_party/mmyolo/docs/en/useful_tools/dataset_analysis.md
@@ -0,0 +1,79 @@
+# Visualize dataset analysis
+
+`tools/analysis_tools/dataset_analysis.py` helps users obtain the renderings of the four functions below and saves the pictures to the `dataset_analysis` folder under the current running directory.
+
+Description of the script's functions:
+
+The data required by each sub-function is obtained through the data preparation in `main()`.
+
+Function 1: generated by the sub-function `show_bbox_num` to display the distribution of categories and bbox instances.
+
+
+
+Function 2: generated by the sub-function `show_bbox_wh` to display the width and height distribution of categories and bbox instances.
+
+
+
+Function 3: generated by the sub-function `show_bbox_wh_ratio` to display the width-to-height ratio distribution of categories and bbox instances.
+
+
+
+Function 4: generated by the sub-function `show_bbox_area` to display the distribution map of category and bbox instance area based on area rules.
+
+
+
+Print list: generated by the sub-functions `show_class_list` and `show_data_list`.
+
+
+
+```shell
+python tools/analysis_tools/dataset_analysis.py ${CONFIG} \
+ [--type ${TYPE}] \
+ [--class-name ${CLASS_NAME}] \
+ [--area-rule ${AREA_RULE}] \
+ [--func ${FUNC}] \
+ [--out-dir ${OUT_DIR}]
+```
+
+E.g.:
+
+1. Use the config file `configs/yolov5/voc/yolov5_s-v61_fast_1xb64-50e_voc.py` to analyze the dataset. By default, the data loading type is `train_dataset` and the area rule is `[0,32,96,1e5]`; a result graph containing all functions is generated and saved to the `./dataset_analysis` folder under the current running directory:
+
+```shell
+python tools/analysis_tools/dataset_analysis.py configs/yolov5/voc/yolov5_s-v61_fast_1xb64-50e_voc.py
+```
+
+2. Use the config file `configs/yolov5/voc/yolov5_s-v61_fast_1xb64-50e_voc.py` to analyze the dataset, changing the data loading type from the default `train_dataset` to `val_dataset` through the `--val-dataset` option:
+
+```shell
+python tools/analysis_tools/dataset_analysis.py configs/yolov5/voc/yolov5_s-v61_fast_1xb64-50e_voc.py \
+ --val-dataset
+```
+
+3. Use the config file `configs/yolov5/voc/yolov5_s-v61_fast_1xb64-50e_voc.py` to analyze the dataset, restricting the plots from all classes to specific classes.
Take the display of `person` classes as an example: + +```shell +python tools/analysis_tools/dataset_analysis.py configs/yolov5/voc/yolov5_s-v61_fast_1xb64-50e_voc.py \ + --class-name person +``` + +4.Use `config` file `configs/yolov5/voc/yolov5_s-v61_fast_1xb64-50e_voc.py` analyze the dataset, redefine the area rule through `--area-rule` . Take `30 70 125` as an example, the area rule becomes `[0,30,70,125,1e5]`: + +```shell +python tools/analysis_tools/dataset_analysis.py configs/yolov5/voc/yolov5_s-v61_fast_1xb64-50e_voc.py \ + --area-rule 30 70 125 +``` + +5.Use `config` file `configs/yolov5/voc/yolov5_s-v61_fast_1xb64-50e_voc.py` analyze the dataset, change the display of four function renderings to only display `Function 1` as an example: + +```shell +python tools/analysis_tools/dataset_analysis.py configs/yolov5/voc/yolov5_s-v61_fast_1xb64-50e_voc.py \ + --func show_bbox_num +``` + +6.Use `config` file `configs/yolov5/voc/yolov5_s-v61_fast_1xb64-50e_voc.py` analyze the dataset, modify the picture saving address to `work_dirs/dataset_analysis`: + +```shell +python tools/analysis_tools/dataset_analysis.py configs/yolov5/voc/yolov5_s-v61_fast_1xb64-50e_voc.py \ + --out-dir work_dirs/dataset_analysis +``` diff --git a/third_party/mmyolo/docs/en/useful_tools/dataset_converters.md b/third_party/mmyolo/docs/en/useful_tools/dataset_converters.md new file mode 100644 index 0000000000000000000000000000000000000000..72ad968c14a0c4a8445b8fa57772903b823faa10 --- /dev/null +++ b/third_party/mmyolo/docs/en/useful_tools/dataset_converters.md @@ -0,0 +1,55 @@ +# Dataset Conversion + +The folder `tools/data_converters` currently contains `ballon2coco.py`, `yolo2coco.py`, and `labelme2coco.py` - three dataset conversion tools. + +- `ballon2coco.py` converts the `balloon` dataset (this small dataset is for starters only) to COCO format. + +```shell +python tools/dataset_converters/balloon2coco.py +``` + +- `yolo2coco.py` converts a dataset from `yolo-style` **.txt** format to COCO format, please use it as follows: + +```shell +python tools/dataset_converters/yolo2coco.py /path/to/the/root/dir/of/your_dataset +``` + +Instructions: + +1. `image_dir` is the root directory of the yolo-style dataset you need to pass to the script, which should contain `images`, `labels`, and `classes.txt`. `classes.txt` is the class declaration corresponding to the current dataset. One class a line. The structure of the root directory should be formatted as this example shows: + +```bash +. +└── $ROOT_PATH + ├── classes.txt + ├── labels + │ ├── a.txt + │ ├── b.txt + │ └── ... + ├── images + │ ├── a.jpg + │ ├── b.png + │ └── ... + └── ... +``` + +2. The script will automatically check if `train.txt`, `val.txt`, and `test.txt` have already existed under `image_dir`. If these files are located, the script will organize the dataset accordingly. Otherwise, the script will convert the dataset into one file. The image paths in these files must be **ABSOLUTE** paths. +3. By default, the script will create a folder called `annotations` in the `image_dir` directory which stores the converted JSON file. If `train.txt`, `val.txt`, and `test.txt` are not found, the output file is `result.json`. Otherwise, the corresponding JSON file will be generated, named as `train.json`, `val.json`, and `test.json`. The `annotations` folder may look similar to this: + +```bash +. +└── $ROOT_PATH + ├── annotations + │ ├── result.json + │ └── ... + ├── classes.txt + ├── labels + │ ├── a.txt + │ ├── b.txt + │ └── ... 
+ ├── images
+ │ ├── a.jpg
+ │ ├── b.png
+ │ └── ...
+ └── ...
+```
diff --git a/third_party/mmyolo/docs/en/useful_tools/download_dataset.md b/third_party/mmyolo/docs/en/useful_tools/download_dataset.md
new file mode 100644
index 0000000000000000000000000000000000000000..8a3e57ec6d14036813ccc7c9e586b99f939126d1
--- /dev/null
+++ b/third_party/mmyolo/docs/en/useful_tools/download_dataset.md
@@ -0,0 +1,11 @@
+# Download Dataset
+
+`tools/misc/download_dataset.py` supports downloading datasets such as `COCO`, `VOC`, `LVIS` and `Balloon`.
+
+```shell
+python tools/misc/download_dataset.py --dataset-name coco2017
+python tools/misc/download_dataset.py --dataset-name voc2007
+python tools/misc/download_dataset.py --dataset-name voc2012
+python tools/misc/download_dataset.py --dataset-name lvis
+python tools/misc/download_dataset.py --dataset-name balloon [--save-dir ${SAVE_DIR}] [--unzip]
+```
diff --git a/third_party/mmyolo/docs/en/useful_tools/extract_subcoco.md b/third_party/mmyolo/docs/en/useful_tools/extract_subcoco.md
new file mode 100644
index 0000000000000000000000000000000000000000..b2c7e06cf36c9b56d4aa91ec128601ef39674abc
--- /dev/null
+++ b/third_party/mmyolo/docs/en/useful_tools/extract_subcoco.md
@@ -0,0 +1,60 @@
+# Extract a subset of COCO
+
+The COCO2017 training set includes 118K images and the validation set includes 5K images, which is relatively large. Loading the JSON annotations in debugging or quick-verification scenarios consumes more resources and slows down startup.
+
+The `extract_subcoco.py` script can extract a subset of images by number, classes, or area size. Use the `--num-img`, `--classes`, and `--area-size` parameters to obtain a COCO subset that satisfies the specified conditions.
+
+For example, extract images with the script as follows:
+
+```shell
+python tools/misc/extract_subcoco.py \
+ ${ROOT} \
+ ${OUT_DIR} \
+ --num-img 20 \
+ --classes cat dog person \
+ --area-size small
+```
+
+This extracts 20 images and keeps only the annotations that belong to cat, dog, or person and whose bbox area size is small. After filtering by class and area size, images left without annotations are not chosen, which guarantees that every extracted image has annotation information.
+
+Currently, only COCO2017 is supported. User-defined datasets in the standard COCO JSON format will be supported in the future.
+
+The root path folder format is as follows:
+
+```text
+├── root
+│ ├── annotations
+│ ├── train2017
+│ ├── val2017
+│ ├── test2017
+```
+
+1. Extract 10 training images and 10 validation images using only the 5K validation set.
+
+```shell
+python tools/misc/extract_subcoco.py ${ROOT} ${OUT_DIR} --num-img 10
+```
+
+2. Extract 20 training images using the training set and 20 validation images using the validation set.
+
+```shell
+python tools/misc/extract_subcoco.py ${ROOT} ${OUT_DIR} --num-img 20 --use-training-set
+```
+
+3. Set the global seed to 1. By default, no seed is set.
+
+```shell
+python tools/misc/extract_subcoco.py ${ROOT} ${OUT_DIR} --num-img 20 --use-training-set --seed 1
+```
+
+4. Extract images by specifying classes
+
+```shell
+python tools/misc/extract_subcoco.py ${ROOT} ${OUT_DIR} --classes cat dog person
+```
+
+5. 
Extract images by specify anchor size + +```shell +python tools/misc/extract_subcoco.py ${ROOT} ${OUT_DIR} --area-size small +``` diff --git a/third_party/mmyolo/docs/en/useful_tools/log_analysis.md b/third_party/mmyolo/docs/en/useful_tools/log_analysis.md new file mode 100644 index 0000000000000000000000000000000000000000..c45170aaaadb97855c51e67819df52ce3868a141 --- /dev/null +++ b/third_party/mmyolo/docs/en/useful_tools/log_analysis.md @@ -0,0 +1,82 @@ +# Log Analysis + +## Curve plotting + +`tools/analysis_tools/analyze_logs.py` in MMDetection plots loss/mAP curves given a training log file. Run `pip install seaborn` first to install the dependency. + +```shell +mim run mmdet analyze_logs plot_curve \ + ${LOG} \ # path of train log in json format + [--keys ${KEYS}] \ # the metric that you want to plot, default to 'bbox_mAP' + [--start-epoch ${START_EPOCH}] # the epoch that you want to start, default to 1 + [--eval-interval ${EVALUATION_INTERVAL}] \ # the evaluation interval when training, default to 1 + [--title ${TITLE}] \ # title of figure + [--legend ${LEGEND}] \ # legend of each plot, default to None + [--backend ${BACKEND}] \ # backend of plt, default to None + [--style ${STYLE}] \ # style of plt, default to 'dark' + [--out ${OUT_FILE}] # the path of output file +# [] stands for optional parameters, when actually entering the command line, you do not need to enter [] +``` + +Examples: + +- Plot the classification loss of some run. + + ```shell + mim run mmdet analyze_logs plot_curve \ + yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700.log.json \ + --keys loss_cls \ + --legend loss_cls + ``` + + + +- Plot the classification and regression loss of some run, and save the figure to a pdf. + + ```shell + mim run mmdet analyze_logs plot_curve \ + yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700.log.json \ + --keys loss_cls loss_bbox \ + --legend loss_cls loss_bbox \ + --out losses_yolov5_s.pdf + ``` + + + +- Compare the bbox mAP of two runs in the same figure. + + ```shell + mim run mmdet analyze_logs plot_curve \ + yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700.log.json \ + yolov5_n-v61_syncbn_fast_8xb16-300e_coco_20220919_090739.log.json \ + --keys bbox_mAP \ + --legend yolov5_s yolov5_n \ + --eval-interval 10 # Note that the evaluation interval must be the same as during training. Otherwise, it will raise an error. + ``` + + + +## Compute the average training speed + +```shell +mim run mmdet analyze_logs cal_train_time \ + ${LOG} \ # path of train log in json format + [--include-outliers] # include the first value of every epoch when computing the average time +``` + +Examples: + +```shell +mim run mmdet analyze_logs cal_train_time \ + yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700.log.json +``` + +The output is expected to be like the following. 
+ +```text +-----Analyze train time of yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700.log.json----- +slowest epoch 278, average time is 0.1705 s/iter +fastest epoch 300, average time is 0.1510 s/iter +time std over epochs is 0.0026 +average iter time: 0.1556 s/iter +``` diff --git a/third_party/mmyolo/docs/en/useful_tools/model_converters.md b/third_party/mmyolo/docs/en/useful_tools/model_converters.md new file mode 100644 index 0000000000000000000000000000000000000000..09fb52df13c2861f691672d7fe1d27e69af5d0e3 --- /dev/null +++ b/third_party/mmyolo/docs/en/useful_tools/model_converters.md @@ -0,0 +1,54 @@ +# Convert Model + +The six scripts under the `tools/model_converters` directory can help users convert the keys in the official pre-trained model of YOLO to the format of MMYOLO, and use MMYOLO to fine-tune the model. + +## YOLOv5 + +Take conversion `yolov5s.pt` as an example: + +1. Clone the official YOLOv5 code to the local (currently the maximum supported version is `v6.1`): + +```shell +git clone -b v6.1 https://github.com/ultralytics/yolov5.git +cd yolov5 +``` + +2. Download official weight file: + +```shell +wget https://github.com/ultralytics/yolov5/releases/download/v6.1/yolov5s.pt +``` + +3. Copy file `tools/model_converters/yolov5_to_mmyolo.py` to the path of YOLOv5 official code clone: + +```shell +cp ${MMDET_YOLO_PATH}/tools/model_converters/yolov5_to_mmyolo.py yolov5_to_mmyolo.py +``` + +4. Conversion + +```shell +python yolov5_to_mmyolo.py --src ${WEIGHT_FILE_PATH} --dst mmyolov5.pt +``` + +The converted `mmyolov5.pt` can be used by MMYOLO. The official weight conversion of YOLOv6 is also used in the same way. + +## YOLOX + +The conversion of YOLOX model **does not need** to download the official YOLOX code, just download the weight. + +Take conversion `yolox_s.pth` as an example: + +1. Download official weight file: + +```shell +wget https://github.com/Megvii-BaseDetection/YOLOX/releases/download/0.1.1rc0/yolox_s.pth +``` + +2. Conversion + +```shell +python tools/model_converters/yolox_to_mmyolo.py --src yolox_s.pth --dst mmyolox.pt +``` + +The converted `mmyolox.pt` can be used by MMYOLO. diff --git a/third_party/mmyolo/docs/en/useful_tools/optimize_anchors.md b/third_party/mmyolo/docs/en/useful_tools/optimize_anchors.md new file mode 100644 index 0000000000000000000000000000000000000000..460bc6e2fa3f4bec41b39901f1e66c442802911c --- /dev/null +++ b/third_party/mmyolo/docs/en/useful_tools/optimize_anchors.md @@ -0,0 +1,38 @@ +# Optimize anchors size + +Script `tools/analysis_tools/optimize_anchors.py` supports three methods to optimize YOLO anchors including `k-means` +anchor cluster, `Differential Evolution` and `v5-k-means`. 
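+
+Whichever algorithm you choose below, the optimized anchors only take effect once they are written back into the model config. The following is a minimal sketch assuming a YOLOv5-style, anchor-based config; the base config path and the anchor values are placeholders, not real output of the script.
+
+```python
+# Sketch only: plug optimized anchors back into an anchor-based YOLO config.
+# Replace the values below with the anchors reported by optimize_anchors.py.
+_base_ = './yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py'
+
+anchors = [
+    [(12, 16), (19, 36), (40, 28)],       # P3/8
+    [(36, 75), (76, 55), (72, 146)],      # P4/16
+    [(142, 110), (192, 243), (459, 401)]  # P5/32
+]
+
+model = dict(
+    bbox_head=dict(
+        prior_generator=dict(base_sizes=anchors)))
+```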
+
+## k-means
+
+The `k-means` method uses IoU as the distance criterion. Run it as follows:
+
+```shell
+python tools/analysis_tools/optimize_anchors.py ${CONFIG} \
+    --algorithm k-means \
+    --input-shape ${INPUT_SHAPE [WIDTH HEIGHT]} \
+    --out-dir ${OUT_DIR}
+```
+
+## Differential Evolution
+
+The `Differential Evolution` method minimizes `avg_iou_cost` with the differential evolution algorithm. Run it as follows:
+
+```shell
+python tools/analysis_tools/optimize_anchors.py ${CONFIG} \
+    --algorithm DE \
+    --input-shape ${INPUT_SHAPE [WIDTH HEIGHT]} \
+    --out-dir ${OUT_DIR}
+```
+
+## v5-k-means
+
+The `v5-k-means` method uses the same shape-match clustering criterion as YOLOv5. Run it as follows:
+
+```shell
+python tools/analysis_tools/optimize_anchors.py ${CONFIG} \
+    --algorithm v5-k-means \
+    --input-shape ${INPUT_SHAPE [WIDTH HEIGHT]} \
+    --prior_match_thr ${PRIOR_MATCH_THR} \
+    --out-dir ${OUT_DIR}
+```
diff --git a/third_party/mmyolo/docs/en/useful_tools/print_config.md b/third_party/mmyolo/docs/en/useful_tools/print_config.md
new file mode 100644
index 0000000000000000000000000000000000000000..2a6ee79f36c749491a1b5095792b708755fca279
--- /dev/null
+++ b/third_party/mmyolo/docs/en/useful_tools/print_config.md
@@ -0,0 +1,20 @@
+# Print the whole config
+
+`print_config.py` in MMDetection prints the whole config verbatim, expanding all of its imports. The command is as follows.
+
+```shell
+mim run mmdet print_config \
+    ${CONFIG} \                              # path of the config file
+    [--save-path] \                          # save path of whole config, suffixed with .py, .json or .yml
+    [--cfg-options ${OPTIONS [OPTIONS...]}]  # override some settings in the used config
+```
+
+Examples:
+
+```shell
+mim run mmdet print_config \
+    configs/yolov5/yolov5_s-v61_syncbn_fast_1xb4-300e_balloon.py \
+    --save-path ./work_dirs/yolov5_s-v61_syncbn_fast_1xb4-300e_balloon.py
+```
+
+Running the above command will save the `yolov5_s-v61_syncbn_fast_1xb4-300e_balloon.py` config file, with its inheritance relationships expanded, as `yolov5_s-v61_syncbn_fast_1xb4-300e_balloon_whole.py` in the `./work_dirs` folder.
diff --git a/third_party/mmyolo/docs/en/useful_tools/vis_scheduler.md b/third_party/mmyolo/docs/en/useful_tools/vis_scheduler.md
new file mode 100644
index 0000000000000000000000000000000000000000..f1526342c9ee80236f7c146231430818811e2a82
--- /dev/null
+++ b/third_party/mmyolo/docs/en/useful_tools/vis_scheduler.md
@@ -0,0 +1,44 @@
+# Hyper-parameter Scheduler Visualization
+
+`tools/analysis_tools/vis_scheduler` helps users check the hyper-parameter schedulers of the optimizer (without training). It supports "learning rate", "momentum", and "weight_decay".
+
+```bash
+python tools/analysis_tools/vis_scheduler.py \
+    ${CONFIG_FILE} \
+    [-p, --parameter ${PARAMETER_NAME}] \
+    [-d, --dataset-size ${DATASET_SIZE}] \
+    [-n, --ngpus ${NUM_GPUs}] \
+    [-o, --out-dir ${OUT_DIR}] \
+    [--title ${TITLE}] \
+    [--style ${STYLE}] \
+    [--window-size ${WINDOW_SIZE}] \
+    [--cfg-options]
+```
+
+**Description of all arguments**:
+
+- `config`: The path of a model config file.
+- **`-p, --parameter`**: The parameter whose curve is visualized; choose from "lr", "momentum" or "wd". Defaults to "lr".
+- **`-d, --dataset-size`**: The size of the dataset. If set, `DATASETS.build` will be skipped and `${DATASET_SIZE}` will be used as the size; otherwise the size is obtained via `DATASETS.build`.
+- **`-n, --ngpus`**: The number of GPUs used in training. Defaults to 1.
+- **`-o, --out-dir`**: The output path of the curve plot, default not to output. +- `--title`: Title of figure. If not set, default to be config file name. +- `--style`: Style of plt. If not set, default to be `whitegrid`. +- `--window-size`: The shape of the display window. If not specified, it will be set to `12*7`. If used, it must be in the format `'W*H'`. +- `--cfg-options`: Modifications to the configuration file, refer to [Learn about Configs](../tutorials/config.md). + +```{note} +Loading annotations maybe consume much time, you can directly specify the size of the dataset with `-d, dataset-size` to save time. +``` + +You can use the following command to plot the step learning rate schedule used in the config `configs/rtmdet/rtmdet_s_syncbn_fast_8xb32-300e_coco.py`: + +```shell +python tools/analysis_tools/vis_scheduler.py \ + configs/rtmdet/rtmdet_s_syncbn_fast_8xb32-300e_coco.py \ + --dataset-size 118287 \ + --ngpus 8 \ + --out-dir ./output +``` + +
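+
+If you only need a quick sanity check of a schedule outside of MMYOLO, the same idea can be reproduced with plain
+PyTorch: build a throwaway optimizer, step a scheduler for the expected number of iterations, and record the values.
+This is a minimal sketch (the optimizer, scheduler and iteration count below are illustrative assumptions, not taken
+from any MMYOLO config; `matplotlib` is required for plotting):
+
+```python
+import torch
+import matplotlib.pyplot as plt
+
+params = [torch.nn.Parameter(torch.zeros(1))]      # dummy parameter, no real model needed
+optimizer = torch.optim.SGD(params, lr=0.01, momentum=0.937)
+total_iters = 300 * 100                            # assumed epochs * iters_per_epoch
+scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=total_iters)
+
+lrs = []
+for _ in range(total_iters):
+    lrs.append(optimizer.param_groups[0]['lr'])    # record the current learning rate
+    optimizer.step()                               # PyTorch expects step() before scheduler.step()
+    scheduler.step()
+
+plt.plot(lrs)
+plt.xlabel('iteration')
+plt.ylabel('learning rate')
+plt.savefig('lr_curve.png')
+```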
diff --git a/third_party/mmyolo/docs/zh_cn/Makefile b/third_party/mmyolo/docs/zh_cn/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..d4bb2cbb9eddb1bb1b4f366623044af8e4830919 --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/third_party/mmyolo/docs/zh_cn/_static/css/readthedocs.css b/third_party/mmyolo/docs/zh_cn/_static/css/readthedocs.css new file mode 100644 index 0000000000000000000000000000000000000000..353aa9e285a5639b0f34ecb3b16115cff1ad25ed --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/_static/css/readthedocs.css @@ -0,0 +1,6 @@ +.header-logo { + background-image: url("../image/mmyolo-logo.png"); + background-size: 115px 40px; + height: 40px; + width: 115px; +} diff --git a/third_party/mmyolo/docs/zh_cn/_static/image/mmyolo-logo.png b/third_party/mmyolo/docs/zh_cn/_static/image/mmyolo-logo.png new file mode 100644 index 0000000000000000000000000000000000000000..41318aec92d86749d327bc5f9b9c689632ffc735 Binary files /dev/null and b/third_party/mmyolo/docs/zh_cn/_static/image/mmyolo-logo.png differ diff --git a/third_party/mmyolo/docs/zh_cn/advanced_guides/cross-library_application.md b/third_party/mmyolo/docs/zh_cn/advanced_guides/cross-library_application.md new file mode 100644 index 0000000000000000000000000000000000000000..d95f68cd22cdfc6218c24c7c02936f7fb04fd247 --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/advanced_guides/cross-library_application.md @@ -0,0 +1 @@ +# MMYOLO 跨库应用解析 diff --git a/third_party/mmyolo/docs/zh_cn/api.rst b/third_party/mmyolo/docs/zh_cn/api.rst new file mode 100644 index 0000000000000000000000000000000000000000..39223a34f849b4b66dafea7fe9c9fdd34d06ecfe --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/api.rst @@ -0,0 +1,80 @@ +mmyolo.datasets +-------------------- + +datasets +^^^^^^^^^^ +.. automodule:: mmyolo.datasets + :members: + +transforms +^^^^^^^^^^^^ +.. automodule:: mmyolo.datasets.transforms + :members: + +mmyolo.engine +-------------- + +hooks +^^^^^^^^^^ +.. automodule:: mmyolo.engine.hooks + :members: + +optimizers +^^^^^^^^^^ +.. automodule:: mmyolo.engine.optimizers + :members: + +mmyolo.models +-------------- + +backbones +^^^^^^^^^^ +.. automodule:: mmyolo.models.backbones + :members: + +data_preprocessor +^^^^^^^^^^^^^^^^^^^ +.. automodule:: mmyolo.models.data_preprocessor + :members: + +dense_heads +^^^^^^^^^^^^ +.. automodule:: mmyolo.models.dense_heads + :members: + +detectors +^^^^^^^^^^ +.. automodule:: mmyolo.models.detectors + :members: + +layers +^^^^^^^^^^ +.. automodule:: mmyolo.models.layers + :members: + +losses +^^^^^^^^^^ +.. automodule:: mmyolo.models.losses + :members: + +necks +^^^^^^^^^^^^ +.. automodule:: mmyolo.models.necks + :members: + + +task_modules +^^^^^^^^^^^^^^^ +.. automodule:: mmyolo.models.task_modules + :members: + +utils +^^^^^^^^^^ +.. 
automodule:: mmyolo.models.utils
+    :members:
+
+
+mmyolo.utils
+--------------
+.. automodule:: mmyolo.utils
+    :members:
diff --git a/third_party/mmyolo/docs/zh_cn/common_usage/amp_training.md b/third_party/mmyolo/docs/zh_cn/common_usage/amp_training.md
new file mode 100644
index 0000000000000000000000000000000000000000..c7803abfea4487734b05de80705689d30c796e1a
--- /dev/null
+++ b/third_party/mmyolo/docs/zh_cn/common_usage/amp_training.md
@@ -0,0 +1,13 @@
+# 自动混合精度(AMP)训练
+
+如果要开启自动混合精度(AMP)训练,在训练命令最后加上 `--amp` 即可,命令如下:
+
+```shell
+python tools/train.py ${CONFIG} --amp
+```
+
+具体例子如下:
+
+```shell
+python tools/train.py configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py --amp
+```
diff --git a/third_party/mmyolo/docs/zh_cn/common_usage/freeze_layers.md b/third_party/mmyolo/docs/zh_cn/common_usage/freeze_layers.md
new file mode 100644
index 0000000000000000000000000000000000000000..ca0613903b65a6b2ba6986b3dea2830ee4465a2b
--- /dev/null
+++ b/third_party/mmyolo/docs/zh_cn/common_usage/freeze_layers.md
@@ -0,0 +1,28 @@
+# 冻结指定网络层权重
+
+## 冻结 backbone 权重
+
+在 MMYOLO 中我们可以通过设置 `frozen_stages` 参数去冻结主干网络的部分 `stage`,使这些 `stage` 的参数不参与模型的更新。
+需要注意的是:`frozen_stages = i` 表示从最开始的 `stage` 到第 `i` 层 `stage` 的所有参数都会被冻结。下面是 `YOLOv5` 的例子,其他算法也是同样的逻辑:
+
+```python
+_base_ = './yolov5_s-v61_syncbn_8xb16-300e_coco.py'
+
+model = dict(
+    backbone=dict(
+        frozen_stages=1 # 表示第一层 stage 以及它之前的所有 stage 中的参数都会被冻结
+    ))
+```
+
+## 冻结 neck 权重
+
+MMYOLO 中也可以通过参数 `freeze_all` 去冻结整个 `neck` 的参数。下面是 `YOLOv5` 的例子,其他算法也是同样的逻辑:
+
+```python
+_base_ = './yolov5_s-v61_syncbn_8xb16-300e_coco.py'
+
+model = dict(
+    neck=dict(
+        freeze_all=True # freeze_all=True 时表示整个 neck 的参数都会被冻结
+    ))
+```
diff --git a/third_party/mmyolo/docs/zh_cn/common_usage/mim_usage.md b/third_party/mmyolo/docs/zh_cn/common_usage/mim_usage.md
new file mode 100644
index 0000000000000000000000000000000000000000..aaf26920e15e0856c2f0fcb0a7fdd845766c44b7
--- /dev/null
+++ b/third_party/mmyolo/docs/zh_cn/common_usage/mim_usage.md
@@ -0,0 +1,89 @@
+# 使用 mim 跨库调用其他 OpenMMLab 仓库的脚本
+
+```{note}
+1. 目前暂不支持跨库调用所有脚本,正在修复中。等修复完成,本文档会添加更多的例子。
+2. 
绘制 mAP 和 计算平均训练速度 两项功能在 MMDetection dev-3.x 分支中修复,目前需要通过源码安装该分支才能成功调用。 +``` + +## 日志分析 + +### 曲线图绘制 + +MMDetection 中的 `tools/analysis_tools/analyze_logs.py` 可利用指定的训练 log 文件绘制 loss/mAP 曲线图, 第一次运行前请先运行 `pip install seaborn` 安装必要依赖。 + +```shell +mim run mmdet analyze_logs plot_curve \ + ${LOG} \ # 日志文件路径 + [--keys ${KEYS}] \ # 需要绘制的指标,默认为 'bbox_mAP' + [--start-epoch ${START_EPOCH}] # 起始的 epoch,默认为 1 + [--eval-interval ${EVALUATION_INTERVAL}] \ # 评估间隔,默认为 1 + [--title ${TITLE}] \ # 图片标题,无默认值 + [--legend ${LEGEND}] \ # 图例,默认为 None + [--backend ${BACKEND}] \ # 绘制后端,默认为 None + [--style ${STYLE}] \ # 绘制风格,默认为 'dark' + [--out ${OUT_FILE}] # 输出文件路径 +# [] 代表可选参数,实际输入命令行时,不用输入 [] +``` + +样例: + +- 绘制分类损失曲线图 + + ```shell + mim run mmdet analyze_logs plot_curve \ + yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700.log.json \ + --keys loss_cls \ + --legend loss_cls + ``` + + + +- 绘制分类损失、回归损失曲线图,保存图片为对应的 pdf 文件 + + ```shell + mim run mmdet analyze_logs plot_curve \ + yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700.log.json \ + --keys loss_cls loss_bbox \ + --legend loss_cls loss_bbox \ + --out losses_yolov5_s.pdf + ``` + + + +- 在同一图像中比较两次运行结果的 bbox mAP + + ```shell + mim run mmdet analyze_logs plot_curve \ + yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700.log.json \ + yolov5_n-v61_syncbn_fast_8xb16-300e_coco_20220919_090739.log.json \ + --keys bbox_mAP \ + --legend yolov5_s yolov5_n \ + --eval-interval 10 # 注意评估间隔必须和训练时设置的一致,否则会报错 + ``` + + + +### 计算平均训练速度 + +```shell +mim run mmdet analyze_logs cal_train_time \ + ${LOG} \ # 日志文件路径 + [--include-outliers] # 计算时包含每个 epoch 的第一个数据 +``` + +样例: + +```shell +mim run mmdet analyze_logs cal_train_time \ + yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700.log.json +``` + +输出以如下形式展示: + +```text +-----Analyze train time of yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700.log.json----- +slowest epoch 278, average time is 0.1705 s/iter +fastest epoch 300, average time is 0.1510 s/iter +time std over epochs is 0.0026 +average iter time: 0.1556 s/iter +``` diff --git a/third_party/mmyolo/docs/zh_cn/common_usage/module_combination.md b/third_party/mmyolo/docs/zh_cn/common_usage/module_combination.md new file mode 100644 index 0000000000000000000000000000000000000000..011836f68fb9b35434d7e823c382ce5357dd2f9f --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/common_usage/module_combination.md @@ -0,0 +1,280 @@ +# 算法组合替换教程 + +## Loss 组合替换教程 + +OpenMMLab 2.0 体系中 MMYOLO、MMDetection、MMClassification 中的 loss 注册表都继承自 MMEngine 中的根注册表。 因此用户可以在 MMYOLO 中使用来自 MMDetection、MMClassification 中实现的 loss 而无需重新实现。 + +### 替换 YOLOv5 Head 中的 loss_cls 函数 + +1. 假设我们想使用 `LabelSmoothLoss` 作为 `loss_cls` 的损失函数。因为 `LabelSmoothLoss` 已经在 MMClassification 中实现了,所以可以直接在配置文件中进行替换。配置文件如下: + +```python +# 请先使用命令: mim install "mmcls>=1.0.0rc2",安装 mmcls +_base_ = './yolov5_s-v61_syncbn_8xb16-300e_coco.py' +model = dict( + bbox_head=dict( + loss_cls=dict( + _delete_=True, + _scope_='mmcls', # 临时替换 scope 为 mmcls + type='LabelSmoothLoss', + label_smooth_val=0.1, + mode='multi_label', + reduction='mean', + loss_weight=0.5))) +``` + +2. 假设我们想使用 `VarifocalLoss` 作为 `loss_cls` 的损失函数。因为 `VarifocalLoss` 在 MMDetection 已经实现好了,所以可以直接替换。配置文件如下: + +```python +model = dict( + bbox_head=dict( + loss_cls=dict( + _delete_=True, + _scope_='mmdet', + type='VarifocalLoss', + loss_weight=1.0))) +``` + +3. 
假设我们想使用 `FocalLoss` 作为 `loss_cls` 的损失函数。配置文件如下: + +```python +_base_ = './yolov5_s-v61_syncbn_8xb16-300e_coco.py' +model = dict( + bbox_head=dict( + loss_cls= dict( + _delete_=True, + _scope_='mmdet', + type='FocalLoss', + loss_weight=1.0))) +``` + +4. 假设我们想使用 `QualityFocalLoss` 作为 `loss_cls` 的损失函数。配置文件如下: + +```python +_base_ = './yolov5_s-v61_syncbn_8xb16-300e_coco.py' +model = dict( + bbox_head=dict( + loss_cls= dict( + _delete_=True, + _scope_='mmdet', + type='QualityFocalLoss', + loss_weight=1.0))) +``` + +### 替换 YOLOv5 Head 中的 loss_obj 函数 + +`loss_obj` 的替换与 `loss_cls` 的替换类似,我们可以使用已经实现好的损失函数对 `loss_obj` 的损失函数进行替换 + +1. 假设我们想使用 `VarifocalLoss` 作为 `loss_obj` 的损失函数 + +```python +model = dict( + bbox_head=dict( + loss_obj=dict( + _delete_=True, + _scope_='mmdet', + type='VarifocalLoss', + loss_weight=1.0))) +``` + +2. 假设我们想使用 `FocalLoss` 作为 `loss_obj` 的损失函数。 + +```python +_base_ = './yolov5_s-v61_syncbn_8xb16-300e_coco.py' +model = dict( + bbox_head=dict( + loss_cls= dict( + _delete_=True, + _scope_='mmdet', + type='FocalLoss', + loss_weight=1.0))) +``` + +3. 假设我们想使用 `QualityFocalLoss` 作为 `loss_obj` 的损失函数。 + +```python +_base_ = './yolov5_s-v61_syncbn_8xb16-300e_coco.py' +model = dict( + bbox_head=dict( + loss_cls= dict( + _delete_=True, + _scope_='mmdet', + type='QualityFocalLoss', + loss_weight=1.0))) +``` + +#### 注意 + +1. 在本教程中损失函数的替换是运行不报错的,但无法保证性能一定会上升。 +2. 本次损失函数的替换都是以 YOLOv5 算法作为例子的,但是 MMYOLO 下的多个算法,如 YOLOv6,YOLOX 等算法都可以按照上述的例子进行替换。 + +## Model 和 Loss 组合替换 + +在 MMYOLO 中,model 即网络本身和 loss 是解耦的,用户可以简单的通过修改配置文件中 model 和 loss 来组合不同模块。下面给出两个具体例子。 + +(1) YOLOv5 model 组合 YOLOv7 loss,配置文件如下: + +```python +_base_ = './yolov5_s-v61_syncbn_8xb16-300e_coco.py' +model = dict( + bbox_head=dict( + _delete_=True, + type='YOLOv7Head', + head_module=dict( + type='YOLOv5HeadModule', + num_classes=80, + in_channels=[256, 512, 1024], + widen_factor=0.5, + featmap_strides=[8, 16, 32], + num_base_priors=3))) +``` + +(2) RTMDet model 组合 YOLOv6 loss,配置文件如下: + +```python +_base_ = './rtmdet_l_syncbn_8xb32-300e_coco.py' +model = dict( + bbox_head=dict( + _delete_=True, + type='YOLOv6Head', + head_module=dict( + type='RTMDetSepBNHeadModule', + num_classes=80, + in_channels=256, + stacked_convs=2, + feat_channels=256, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='SiLU', inplace=True), + share_conv=True, + pred_kernel_size=1, + featmap_strides=[8, 16, 32]), + loss_bbox=dict( + type='IoULoss', + iou_mode='giou', + bbox_format='xyxy', + reduction='mean', + loss_weight=2.5, + return_iou=False)), + train_cfg=dict( + _delete_=True, + initial_epoch=4, + initial_assigner=dict( + type='BatchATSSAssigner', + num_classes=80, + topk=9, + iou_calculator=dict(type='mmdet.BboxOverlaps2D')), + assigner=dict( + type='BatchTaskAlignedAssigner', + num_classes=80, + topk=13, + alpha=1, + beta=6) + )) +``` + +## Backbone + Neck + HeadModule 的组合替换 + +### 1. 
YOLOv5 Backbone 替换 + +(1) 假设想将 `RTMDet backbone + yolov5 neck + yolov5 head` 作为 `YOLOv5` 的完整网络,则配置文件如下: + +```python +_base_ = './yolov5_s-v61_syncbn_8xb16-300e_coco.py' + +widen_factor = 0.5 +deepen_factor = 0.33 + +model = dict( + backbone=dict( + _delete_=True, + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=deepen_factor, + widen_factor=widen_factor, + channel_attention=True, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='SiLU', inplace=True)) +) +``` + +(2) `YOLOv6EfficientRep backbone + yolov5 neck + yolov5 head` 作为 `YOLOv5` 的完整网络,则配置文件如下: + +```python +_base_ = './yolov5_s-v61_syncbn_8xb16-300e_coco.py' + +model = dict( + backbone=dict( + type='YOLOv6EfficientRep', + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='ReLU', inplace=True)) +) +``` + +### 2. YOLOv5 Neck 替换 + +(1) 假设想将 `yolov5 backbone + yolov6 neck + yolov5 head` 作为 `YOLOv5` 的完整网络,则配置文件如下: + +```python +_base_ = './yolov5_s-v61_syncbn_8xb16-300e_coco.py' + +model = dict( + neck = dict( + type = 'YOLOv6RepPAFPN', + in_channels = [256, 512, 1024], + out_channels = [128, 256, 512], # 注意 YOLOv6RepPAFPN 的输出通道是[128, 256, 512] + num_csp_blocks = 12, + act_cfg = dict(type='ReLU', inplace = True), + ), + bbox_head = dict( + head_module = dict( + in_channels = [128, 256, 512])) # head 部分输入通道要做相应更改 +) +``` + +(2) 假设想将 `yolov5 backbone + yolov7 neck + yolov5 head` 作为 `YOLOv5` 的完整网络,则配置文件如下: + +```python +_base_ = './yolov5_s-v61_syncbn_8xb16-300e_coco.py' + +deepen_factor = _base_.deepen_factor +widen_factor = _base_.widen_factor + +model = dict( + neck = dict( + _delete_=True, # 将 _base_ 中关于 neck 的字段删除 + type = 'YOLOv7PAFPN', + deepen_factor = deepen_factor, + widen_factor = widen_factor, + upsample_feats_cat_first = False, + in_channels = [256, 512, 1024], + out_channels = [128, 256, 512], + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg = dict(type='SiLU', inplace=True), + ), + bbox_head = dict( + head_module = dict( + in_channels = [256, 512, 1024])) # 注意使用 YOLOv7PAFPN 后 head 部分输入通道数是 neck 输出通道数的两倍 +) +``` + +### 3. YOLOv5 HeadModule 替换 + +(1) 假设想将 `yolov5 backbone + yolov5 neck + yolo7 headmodule` 作为 `YOLOv5` 的完整网络,则配置文件如下: + +```python +_base_ = './yolov5_s-v61_syncbn_8xb16-300e_coco.py' + +strides = [8, 16, 32] +num_classes = 1 # 根据自己的数据集调整 + +model = dict( + bbox_head=dict( + type='YOLOv7Head', + head_module=dict( + type='YOLOv7HeadModule', + num_classes=num_classes, + in_channels=[256, 512, 1024], + featmap_strides=strides, + num_base_priors=3))) +``` diff --git a/third_party/mmyolo/docs/zh_cn/common_usage/ms_training_testing.md b/third_party/mmyolo/docs/zh_cn/common_usage/ms_training_testing.md new file mode 100644 index 0000000000000000000000000000000000000000..1f271c54df6517bb515b4312ee7b921beeb7b6ba --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/common_usage/ms_training_testing.md @@ -0,0 +1,41 @@ +# 多尺度训练和测试 + +## 多尺度训练 + +MMYOLO 中目前支持了主流的 YOLOv5、YOLOv6、YOLOv7、YOLOv8 和 RTMDet 等算法,其默认配置均为单尺度 640x640 训练。 在 MM 系列开源库中常用的多尺度训练有两种实现方式: + +1. 在 `train_pipeline` 中输出的每张图都是不定尺度的,然后在 [DataPreprocessor](https://github.com/open-mmlab/mmdetection/blob/3.x/mmdet/models/data_preprocessors/data_preprocessor.py) 中将不同尺度的输入图片 + 通过 [stack_batch](https://github.com/open-mmlab/mmengine/blob/dbae83c52fa54d6dda08b6692b124217fe3b2135/mmengine/model/base_model/data_preprocessor.py#L260-L261) 函数填充到同一尺度,从而组成 batch 进行训练。MMDet 中大部分算法都是采用这个实现方式。 +2. 
在 `train_pipeline` 中输出的每张图都是固定尺度的,然后直接在 `DataPreprocessor` 中进行 batch 张图片的上下采样,从而实现多尺度训练功能 + +在 MMYOLO 中两种多尺度训练方式都是支持的。理论上第一种实现方式所生成的尺度会更加丰富,但是由于其对单张图进行独立增强,训练效率不如第二种方式。所以我们更推荐使用第二种方式。 + +以 `configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py` 配置为例,其默认配置采用的是 640x640 固定尺度训练,假设想实现以 32 为倍数,且多尺度范围为 (480, 800) 的训练方式,则可以参考 YOLOX 做法通过 DataPreprocessor 中的 [YOLOXBatchSyncRandomResize](https://github.com/open-mmlab/mmyolo/blob/dc85144fab20a970341550794857a2f2f9b11564/mmyolo/models/data_preprocessors/data_preprocessor.py#L20) 实现。 + +在 `configs/yolov5` 路径下新建配置,命名为 `configs/yolov5/yolov5_s-v61_fast_1xb12-ms-40e_cat.py`,其内容如下: + +```python +_base_ = 'yolov5_s-v61_fast_1xb12-40e_cat.py' + +model = dict( + data_preprocessor=dict( + type='YOLOv5DetDataPreprocessor', + pad_size_divisor=32, + batch_augments=[ + dict( + type='YOLOXBatchSyncRandomResize', + # 多尺度范围是 480~800 + random_size_range=(480, 800), + # 输出尺度需要被 32 整除 + size_divisor=32, + # 每隔 1 个迭代改变一次输出输出 + interval=1) + ]) +) +``` + +上述配置就可以实现多尺度训练了。为了方便,我们已经在 `configs/yolov5/` 下已经提供了该配置。其余 YOLO 系列算法也是类似做法。 + +## 多尺度测试 + +MMYOLO 多尺度测试功能等同于测试时增强 TTA,目前已经支持,详情请查看 [测试时增强 TTA](./tta.md) 。 diff --git a/third_party/mmyolo/docs/zh_cn/common_usage/multi_necks.md b/third_party/mmyolo/docs/zh_cn/common_usage/multi_necks.md new file mode 100644 index 0000000000000000000000000000000000000000..a4a17052729205884c6259b2087cd2a51044c7b0 --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/common_usage/multi_necks.md @@ -0,0 +1,40 @@ +# 应用多个 Neck + +如果你想堆叠多个 Neck,可以直接在配置文件中的 Neck 参数,MMYOLO 支持以 `List` 形式拼接多个 Neck 配置,你需要保证上一个 Neck 的输出通道与下一个 Neck +的输入通道相匹配。如需要调整通道,可以插入 `mmdet.ChannelMapper` 模块用来对齐多个 Neck 之间的通道数量。具体配置如下: + +```python +_base_ = './yolov5_s-v61_syncbn_8xb16-300e_coco.py' + +deepen_factor = _base_.deepen_factor +widen_factor = _base_.widen_factor +model = dict( + type='YOLODetector', + neck=[ + dict( + type='YOLOv5PAFPN', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + in_channels=[256, 512, 1024], + out_channels=[256, 512, 1024], + # 因为 out_channels 由 widen_factor 控制,YOLOv5PAFPN 的 out_channels = out_channels * widen_factor + num_csp_blocks=3, + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='SiLU', inplace=True)), + dict( + type='mmdet.ChannelMapper', + in_channels=[128, 256, 512], + out_channels=128, + ), + dict( + type='mmdet.DyHead', + in_channels=128, + out_channels=256, + num_blocks=2, + # disable zero_init_offset to follow official implementation + zero_init_offset=False) + ], + bbox_head=dict(head_module=dict(in_channels=[512, 512, 512])) + # 因为 out_channels 由 widen_factor 控制,YOLOv5HeadModuled 的 in_channels * widen_factor 才会等于最后一个 neck 的 out_channels +) +``` diff --git a/third_party/mmyolo/docs/zh_cn/common_usage/output_predictions.md b/third_party/mmyolo/docs/zh_cn/common_usage/output_predictions.md new file mode 100644 index 0000000000000000000000000000000000000000..b11f856d674852582bbac3b50ac1d48148c366b8 --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/common_usage/output_predictions.md @@ -0,0 +1,40 @@ +# 输出模型预测结果 + +如果想将预测结果保存为特定的文件,用于离线评估,目前 MMYOLO 支持 json 和 pkl 两种格式。 + +```{note} +json 文件仅保存 `image_id`、`bbox`、`score` 和 `category_id`; json 文件可以使用 json 库读取。 +pkl 保存内容比 json 文件更多,还会保存预测图片的文件名和尺寸等一系列信息; pkl 文件可以使用 pickle 库读取。 +``` + +## 输出为 json 文件 + +如果想将预测结果输出为 json 文件,则命令如下: + +```shell +python tools/test.py ${CONFIG} ${CHECKPOINT} --json-prefix ${JSON_PREFIX} +``` + +`--json-prefix` 后的参数输入为文件名前缀(无需输入 `.json` 后缀),也可以包含路径。举一个具体例子: + +```shell +python tools/test.py 
configs\yolov5\yolov5_s-v61_syncbn_8xb16-300e_coco.py yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth --json-prefix work_dirs/demo/json_demo +``` + +运行以上命令会在 `work_dirs/demo` 文件夹下,输出 `json_demo.bbox.json` 文件。 + +## 输出为 pkl 文件 + +如果想将预测结果输出为 pkl 文件,则命令如下: + +```shell +python tools/test.py ${CONFIG} ${CHECKPOINT} --out ${OUTPUT_FILE} [--cfg-options ${OPTIONS [OPTIONS...]}] +``` + +`--out` 后的参数输入为完整文件名(**必须输入** `.pkl` 或 `.pickle` 后缀),也可以包含路径。举一个具体例子: + +```shell +python tools/test.py configs\yolov5\yolov5_s-v61_syncbn_8xb16-300e_coco.py yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth --out work_dirs/demo/pkl_demo.pkl +``` + +运行以上命令会在 `work_dirs/demo` 文件夹下,输出 `pkl_demo.pkl` 文件。 diff --git a/third_party/mmyolo/docs/zh_cn/common_usage/plugins.md b/third_party/mmyolo/docs/zh_cn/common_usage/plugins.md new file mode 100644 index 0000000000000000000000000000000000000000..337111f9975393bdc4804ebaf64860e40dfa9fc5 --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/common_usage/plugins.md @@ -0,0 +1,34 @@ +# 给主干网络增加插件 + +MMYOLO 支持在 Backbone 的不同 Stage 后增加如 `none_local`、`dropblock` 等插件,用户可以直接通过修改 config 文件中 `backbone` 的 `plugins`参数来实现对插件的管理。例如为 `YOLOv5` 增加`GeneralizedAttention` 插件,其配置文件如下: + +```python +_base_ = './yolov5_s-v61_syncbn_8xb16-300e_coco.py' + +model = dict( + backbone=dict( + plugins=[ + dict( + cfg=dict( + type='GeneralizedAttention', + spatial_range=-1, + num_heads=8, + attention_type='0011', + kv_stride=2), + stages=(False, False, True, True)) + ])) +``` + +`cfg` 参数表示插件的具体配置, `stages` 参数表示是否在 backbone 对应的 stage 后面增加插件,长度需要和 backbone 的 stage 数量相同。 + +目前 `MMYOLO` 支持了如下插件: + +
+支持的插件 + +1. [CBAM](https://github.com/open-mmlab/mmyolo/blob/dev/mmyolo/models/plugins/cbam.py#L86) +2. [GeneralizedAttention](https://github.com/open-mmlab/mmcv/blob/2.x/mmcv/cnn/bricks/generalized_attention.py#L13) +3. [NonLocal2d](https://github.com/open-mmlab/mmcv/blob/2.x/mmcv/cnn/bricks/non_local.py#L250) +4. [ContextBlock](https://github.com/open-mmlab/mmcv/blob/2.x/mmcv/cnn/bricks/context_block.py#L18) + +
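+
+例如,若想改用 MMYOLO 自带的 `CBAM` 插件,可以参考下面的最小示例(仅作示意,具体参数名与取值请以 CBAM 的实现为准):
+
+```python
+_base_ = './yolov5_s-v61_syncbn_8xb16-300e_coco.py'
+
+model = dict(
+    backbone=dict(
+        plugins=[
+            dict(
+                # reduce_ratio 为假设的示例取值
+                cfg=dict(type='CBAM', reduce_ratio=16),
+                # 只在最后两个 stage 后增加插件
+                stages=(False, False, True, True))
+        ]))
+```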
diff --git a/third_party/mmyolo/docs/zh_cn/common_usage/registries_info.md b/third_party/mmyolo/docs/zh_cn/common_usage/registries_info.md new file mode 100644 index 0000000000000000000000000000000000000000..4a9d184cd56b69262bf3831f0d175ab1ca52eb13 --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/common_usage/registries_info.md @@ -0,0 +1,788 @@ +# MM 系列开源库注册表 + +(注意:本文档是通过 .dev_scripts/print_registers.py 脚本自动生成) + +## MMdetection (3.0.0rc6) + +
MMdetection Module Components
+
+ + + + + + + + + + + + + + + + + + + + +
visualizeroptimizer constructorloopparameter schedulerdata sampler
  • DetLocalVisualizer
  • LearningRateDecayOptimizerConstructor
  • TeacherStudentValLoop
  • QuadraticWarmupParamScheduler
  • QuadraticWarmupLR
  • QuadraticWarmupMomentum
  • AspectRatioBatchSampler
  • ClassAwareSampler
  • MultiSourceSampler
  • GroupMultiSourceSampler
+
+ + + + + + + + + + + + + + + + + + + + +
metrichookdatasettask util (part 1)task util (part 2)
  • CityScapesMetric
  • CocoMetric
  • CocoOccludedSeparatedMetric
  • CocoPanopticMetric
  • CrowdHumanMetric
  • DumpDetResults
  • DumpProposals
  • LVISMetric
  • OpenImagesMetric
  • VOCMetric
  • CheckInvalidLossHook
  • MeanTeacherHook
  • MemoryProfilerHook
  • NumClassCheckHook
  • PipelineSwitchHook
  • SetEpochInfoHook
  • SyncNormHook
  • DetVisualizationHook
  • YOLOXModeSwitchHook
  • FastStopTrainingHook
  • BaseDetDataset
  • CocoDataset
  • CityscapesDataset
  • CocoPanopticDataset
  • CrowdHumanDataset
  • MultiImageMixDataset
  • DeepFashionDataset
  • LVISV05Dataset
  • LVISDataset
  • LVISV1Dataset
  • Objects365V1Dataset
  • Objects365V2Dataset
  • OpenImagesDataset
  • OpenImagesChallengeDataset
  • XMLDataset
  • VOCDataset
  • WIDERFaceDataset
  • MaxIoUAssigner
  • ApproxMaxIoUAssigner
  • ATSSAssigner
  • CenterRegionAssigner
  • DynamicSoftLabelAssigner
  • GridAssigner
  • HungarianAssigner
  • BboxOverlaps2D
  • BBoxL1Cost
  • IoUCost
  • ClassificationCost
  • FocalLossCost
  • DiceCost
  • CrossEntropyLossCost
  • MultiInstanceAssigner
  • PointAssigner
  • AnchorGenerator
  • SSDAnchorGenerator
  • LegacyAnchorGenerator
  • LegacySSDAnchorGenerator
  • YOLOAnchorGenerator
  • PointGenerator
  • MlvlPointGenerator
  • RegionAssigner
  • SimOTAAssigner
  • TaskAlignedAssigner
  • UniformAssigner
  • BucketingBBoxCoder
  • DeltaXYWHBBoxCoder
+
+ + + + + + + + + + + + + + + + + + +
task util (part 3)transform (part 1)transform (part 2)transform (part 3)
  • DistancePointBBoxCoder
  • LegacyDeltaXYWHBBoxCoder
  • PseudoBBoxCoder
  • TBLRBBoxCoder
  • YOLOBBoxCoder
  • CombinedSampler
  • RandomSampler
  • InstanceBalancedPosSampler
  • IoUBalancedNegSampler
  • MaskPseudoSampler
  • MultiInsRandomSampler
  • OHEMSampler
  • PseudoSampler
  • ScoreHLRSampler
  • AutoAugment
  • RandAugment
  • ColorTransform
  • Color
  • Brightness
  • Contrast
  • Sharpness
  • Solarize
  • SolarizeAdd
  • Posterize
  • Equalize
  • AutoContrast
  • Invert
  • PackDetInputs
  • ToTensor
  • ImageToTensor
  • Transpose
  • WrapFieldsToLists
  • GeomTransform
  • ShearX
  • ShearY
  • Rotate
  • TranslateX
  • TranslateY
  • InstaBoost
  • LoadImageFromNDArray
  • LoadMultiChannelImageFromFiles
  • LoadAnnotations
  • LoadPanopticAnnotations
  • LoadProposals
  • FilterAnnotations
  • LoadEmptyAnnotations
  • InferencerLoader
  • Resize
  • FixShapeResize
  • RandomFlip
  • RandomShift
  • Pad
  • RandomCrop
  • SegRescale
  • PhotoMetricDistortion
  • Expand
  • MinIoURandomCrop
  • Corrupt
  • Albu
  • RandomCenterCropPad
  • CutOut
  • Mosaic
  • MixUp
  • RandomAffine
  • YOLOXHSVRandomAug
  • CopyPaste
  • RandomErasing
  • CachedMosaic
  • CachedMixUp
  • MultiBranch
  • RandomOrder
  • ProposalBroadcaster
+
+ + + + + + + + + + + + + + + + + + +
model (part 1)model (part 2)model (part 3)model (part 4)
  • SiLU
  • DropBlock
  • ExpMomentumEMA
  • SinePositionalEncoding
  • LearnedPositionalEncoding
  • DynamicConv
  • MSDeformAttnPixelDecoder
  • Linear
  • NormedLinear
  • NormedConv2d
  • PixelDecoder
  • TransformerEncoderPixelDecoder
  • CSPDarknet
  • CSPNeXt
  • Darknet
  • ResNet
  • ResNetV1d
  • DetectoRS_ResNet
  • DetectoRS_ResNeXt
  • EfficientNet
  • HourglassNet
  • HRNet
  • MobileNetV2
  • PyramidVisionTransformer
  • PyramidVisionTransformerV2
  • ResNeXt
  • RegNet
  • Res2Net
  • ResNeSt
  • BFP
  • ChannelMapper
  • CSPNeXtPAFPN
  • CTResNetNeck
  • DilatedEncoder
  • DyHead
  • FPG
  • FPN
  • FPN_CARAFE
  • HRFPN
  • NASFPN
  • NASFCOS_FPN
  • PAFPN
  • RFP
  • SSDNeck
  • SSH
  • YOLOV3Neck
  • YOLOXPAFPN
  • SSDVGG
  • SwinTransformer
  • TridentResNet
  • DetDataPreprocessor
  • BatchSyncRandomResize
  • BatchFixedSizePad
  • MultiBranchDataPreprocessor
  • BatchResize
  • BoxInstDataPreprocessor
  • AnchorFreeHead
  • AnchorHead
  • ATSSHead
  • FCOSHead
  • AutoAssignHead
  • CondInstBboxHead
  • CondInstMaskHead
  • BoxInstBboxHead
  • BoxInstMaskHead
  • RPNHead
  • StageCascadeRPNHead
  • CascadeRPNHead
  • CenterNetHead
  • CenterNetUpdateHead
  • CornerHead
  • CentripetalHead
  • DETRHead
  • ConditionalDETRHead
  • DABDETRHead
  • DDODHead
  • DeformableDETRHead
  • DINOHead
  • EmbeddingRPNHead
  • FoveaHead
+
+ + + + + + + + + + + + + + + + + + +
model (part 5)model (part 6)model (part 7)model (part 8)
  • RetinaHead
  • FreeAnchorRetinaHead
  • AssociativeEmbeddingLoss
  • BalancedL1Loss
  • CrossEntropyLoss
  • DiceLoss
  • FocalLoss
  • GaussianFocalLoss
  • QualityFocalLoss
  • DistributionFocalLoss
  • GHMC
  • GHMR
  • IoULoss
  • BoundedIoULoss
  • GIoULoss
  • DIoULoss
  • CIoULoss
  • EIoULoss
  • KnowledgeDistillationKLDivLoss
  • MSELoss
  • SeesawLoss
  • SmoothL1Loss
  • L1Loss
  • VarifocalLoss
  • FSAFHead
  • GuidedAnchorHead
  • GARetinaHead
  • GARPNHead
  • GFLHead
  • PAAHead
  • LADHead
  • LDHead
  • MaskFormerHead
  • Mask2FormerHead
  • NASFCOSHead
  • PISARetinaHead
  • SSDHead
  • PISASSDHead
  • RepPointsHead
  • RetinaSepBNHead
  • RTMDetHead
  • RTMDetSepBNHead
  • RTMDetInsHead
  • RTMDetInsSepBNHead
  • SABLRetinaHead
  • SOLOHead
  • DecoupledSOLOHead
  • DecoupledSOLOLightHead
  • SOLOV2Head
  • TOODHead
  • VFNetHead
  • YOLACTHead
  • YOLACTProtonet
  • YOLOV3Head
  • YOLOFHead
  • YOLOXHead
  • SingleStageDetector
  • ATSS
  • AutoAssign
  • DetectionTransformer
  • SingleStageInstanceSegmentor
  • BoxInst
  • TwoStageDetector
  • CascadeRCNN
  • CenterNet
  • CondInst
  • DETR
  • ConditionalDETR
  • CornerNet
  • CrowdDet
  • Detectron2Wrapper
  • DABDETR
  • DDOD
  • DeformableDETR
  • DINO
  • FastRCNN
  • FasterRCNN
  • FCOS
+
+ + + + + + + + + + + + + + + + + + +
model (part 9)model (part 10)model (part 11)model (part 12)
  • FOVEA
  • FSAF
  • GFL
  • GridRCNN
  • HybridTaskCascade
  • KnowledgeDistillationSingleStageDetector
  • LAD
  • MaskFormer
  • Mask2Former
  • MaskRCNN
  • MaskScoringRCNN
  • NASFCOS
  • PAA
  • TwoStagePanopticSegmentor
  • PanopticFPN
  • PointRend
  • SparseRCNN
  • QueryInst
  • RepPointsDetector
  • RetinaNet
  • RPN
  • RTMDet
  • SCNet
  • SemiBaseDetector
  • SoftTeacher
  • SOLO
  • SOLOv2
  • TOOD
  • TridentFasterRCNN
  • VFNet
  • YOLACT
  • YOLOV3
  • YOLOF
  • YOLOX
  • BBoxHead
  • ConvFCBBoxHead
  • Shared2FCBBoxHead
  • Shared4Conv1FCBBoxHead
  • DIIHead
  • DoubleConvFCBBoxHead
  • MultiInstanceBBoxHead
  • SABLHead
  • SCNetBBoxHead
  • CascadeRoIHead
  • StandardRoIHead
  • DoubleHeadRoIHead
  • DynamicRoIHead
  • GridRoIHead
  • HybridTaskCascadeRoIHead
  • FCNMaskHead
  • CoarseMaskHead
  • DynamicMaskHead
  • FeatureRelayHead
  • FusedSemanticHead
  • GlobalContextHead
  • GridHead
  • HTCMaskHead
  • MaskPointHead
  • MaskIoUHead
  • SCNetMaskHead
  • SCNetSemanticHead
  • MaskScoringRoIHead
  • MultiInstanceRoIHead
  • PISARoIHead
  • PointRendRoIHead
  • GenericRoIExtractor
  • SingleRoIExtractor
  • SCNetRoIHead
  • ResLayer
  • SparseRoIHead
  • TridentRoIHead
  • BaseSemanticHead
  • PanopticFPNHead
  • BasePanopticFusionHead
  • HeuristicFusionHead
  • MaskFormerFusionHead
+
+
MMdetection Tools
+
+ + + + + + + + + + + + + + + + + + +
tools/dataset_converterstools/deploymenttoolstools/misc
  • pascal_voc.py
  • images2coco.py
  • cityscapes.py
  • mmdet2torchserve.py
  • test_torchserver.py
  • mmdet_handler.py
  • dist_test.sh
  • slurm_test.sh
  • test.py
  • dist_train.sh
  • train.py
  • slurm_train.sh
  • download_dataset.py
  • get_image_metas.py
  • gen_coco_panoptic_test_info.py
  • split_coco.py
  • get_crowdhuman_id_hw.py
  • print_config.py
+
+ + + + + + + + + + + + + + + + + + +
tools/model_converterstools/analysis_tools.dev_scripts (part 1).dev_scripts (part 2)
  • upgrade_model_version.py
  • upgrade_ssd_version.py
  • detectron2_to_mmdet.py
  • selfsup2mmdet.py
  • detectron2pytorch.py
  • regnet2mmdet.py
  • publish_model.py
  • benchmark.py
  • eval_metric.py
  • robustness_eval.py
  • confusion_matrix.py
  • optimize_anchors.py
  • browse_dataset.py
  • test_robustness.py
  • coco_error_analysis.py
  • coco_occluded_separated_recall.py
  • analyze_results.py
  • analyze_logs.py
  • get_flops.py
  • convert_test_benchmark_script.py
  • gather_test_benchmark_metric.py
  • benchmark_valid_flops.py
  • benchmark_train.py
  • test_benchmark.sh
  • download_checkpoints.py
  • benchmark_test_image.py
  • covignore.cfg
  • benchmark_full_models.txt
  • test_init_backbone.py
  • batch_train_list.txt
  • diff_coverage_test.sh
  • batch_test_list.py
  • linter.sh
  • gather_train_benchmark_metric.py
  • train_benchmark.sh
  • benchmark_inference_fps.py
  • benchmark_options.py
  • check_links.py
  • benchmark_test.py
  • benchmark_train_models.txt
  • convert_train_benchmark_script.py
  • gather_models.py
  • benchmark_filter.py
+
+ +## MMclassification (1.0.0rc5) + +
MMclassification Module Components
+
+ + + + + + + + + + + + + + + + + + + + +
visualizerdata sampleroptimizerbatch augmentmetric
  • ClsVisualizer
  • RepeatAugSampler
  • Adan
  • Lamb
  • Mixup
  • CutMix
  • ResizeMix
  • Accuracy
  • SingleLabelMetric
  • MultiLabelMetric
  • AveragePrecision
  • MultiTasksMetric
  • VOCMultiLabelMetric
  • VOCAveragePrecision
+
+ + + + + + + + + + + + + + + + + + +
hookdatasettransform (part 1)transform (part 2)
  • ClassNumCheckHook
  • EMAHook
  • SetAdaptiveMarginsHook
  • PreciseBNHook
  • PrepareProtoBeforeValLoopHook
  • SwitchRecipeHook
  • VisualizationHook
  • BaseDataset
  • CIFAR10
  • CIFAR100
  • CUB
  • CustomDataset
  • KFoldDataset
  • ImageNet
  • ImageNet21k
  • MNIST
  • FashionMNIST
  • MultiLabelDataset
  • MultiTaskDataset
  • VOC
  • AutoAugment
  • RandAugment
  • Shear
  • Translate
  • Rotate
  • AutoContrast
  • Invert
  • Equalize
  • Solarize
  • SolarizeAdd
  • Posterize
  • Contrast
  • ColorTransform
  • Brightness
  • Sharpness
  • Cutout
  • PackClsInputs
  • PackMultiTaskInputs
  • Transpose
  • ToPIL
  • ToNumpy
  • Collect
  • RandomCrop
  • RandomResizedCrop
  • EfficientNetRandomCrop
  • RandomErasing
  • EfficientNetCenterCrop
  • ResizeEdge
  • ColorJitter
  • Lighting
  • Albumentations
  • Albu
+
+ + + + + + + + + + + + + + + + + + +
model (part 1)model (part 2)model (part 3)model (part 4)
  • AlexNet
  • ShiftWindowMSA
  • ClsDataPreprocessor
  • VisionTransformer
  • BEiT
  • Conformer
  • ConvMixer
  • ResNet
  • ResNetV1c
  • ResNetV1d
  • ResNeXt
  • CSPDarkNet
  • CSPResNet
  • CSPResNeXt
  • DaViT
  • DistilledVisionTransformer
  • DeiT3
  • DenseNet
  • PoolFormer
  • EfficientFormer
  • EfficientNet
  • EfficientNetV2
  • HorNet
  • HRNet
  • InceptionV3
  • LeNet5
  • MixMIMTransformer
  • MlpMixer
  • MobileNetV2
  • MobileNetV3
  • MobileOne
  • MViT
  • RegNet
  • RepLKNet
  • RepMLPNet
  • RepVGG
  • Res2Net
  • ResNeSt
  • ResNet_CIFAR
  • RevVisionTransformer
  • SEResNet
  • SEResNeXt
  • ShuffleNetV1
  • ShuffleNetV2
  • SwinTransformer
  • SwinTransformerV2
  • T2T_ViT
  • TIMMBackbone
  • TNT
  • PCPVT
  • SVT
  • VAN
  • VGG
  • HuggingFaceClassifier
  • ImageClassifier
  • TimmClassifier
  • ClsHead
  • ConformerHead
  • VisionTransformerClsHead
  • DeiTClsHead
  • EfficientFormerClsHead
  • LinearClsHead
  • AsymmetricLoss
  • CrossEntropyLoss
  • FocalLoss
  • LabelSmoothLoss
  • SeesawLoss
  • ArcFaceClsHead
  • MultiLabelClsHead
  • CSRAClsHead
  • MultiLabelLinearClsHead
  • MultiTaskHead
  • StackedLinearClsHead
  • GlobalAveragePooling
  • GeneralizedMeanPooling
  • HRFuseScales
  • LinearReduction
  • ImageToImageRetriever
  • AverageClsScoreTTA
+
+
MMclassification Tools
+
+ + + + + + + + + + + + + + + + + + + + +
tools/misctools/visualizationstools/torchserve.dev_scriptstools/analysis_tools
  • verify_dataset.py
  • print_config.py
  • browse_dataset.py
  • vis_scheduler.py
  • vis_cam.py
  • mmcls_handler.py
  • mmcls2torchserve.py
  • test_torchserver.py
  • compare_init.py
  • ckpt_tree.py
  • generate_readme.py
  • eval_metric.py
  • analyze_results.py
  • analyze_logs.py
  • get_flops.py
+
+ + + + + + + + + + + + + + + + + + +
.dev_scripts/benchmark_regressiontoolstools/model_converters (part 1)tools/model_converters (part 2)
  • bench_train.yml
  • 4-benchmark_speed.py
  • 3-benchmark_train.py
  • 1-benchmark_valid.py
  • 2-benchmark_test.py
  • dist_test.sh
  • slurm_test.sh
  • test.py
  • dist_train.sh
  • train.py
  • slurm_train.sh
  • kfold-cross-valid.py
  • efficientnet_to_mmcls.py
  • repvgg_to_mmcls.py
  • clip_to_mmcls.py
  • reparameterize_model.py
  • shufflenetv2_to_mmcls.py
  • van2mmcls.py
  • hornet2mmcls.py
  • mixmimx_to_mmcls.py
  • edgenext_to_mmcls.py
  • torchvision_to_mmcls.py
  • twins2mmcls.py
  • revvit_to_mmcls.py
  • convnext_to_mmcls.py
  • replknet_to_mmcls.py
  • efficientnetv2_to_mmcls.py
  • mobilenetv2_to_mmcls.py
  • mlpmixer_to_mmcls.py
  • davit_to_mmcls.py
  • vgg_to_mmcls.py
  • deit3_to_mmcls.py
  • eva_to_mmcls.py
  • publish_model.py
  • tinyvit_to_mmcls.py
+
+ +## MMsegmentation (1.0.0rc5) + +
MMsegmentation Module Components
+
+ + + + + + + + + + + + + + + + + + + + +
task utilvisualizerhookoptimizer wrapper constructormetric
  • OHEMPixelSampler
  • SegLocalVisualizer
  • SegVisualizationHook
  • LearningRateDecayOptimizerConstructor
  • LayerDecayOptimizerConstructor
  • CitysMetric
  • IoUMetric
+
+ + + + + + + + + + + + + + + + + + +
dataset (part 1)dataset (part 2)transform (part 1)transform (part 2)
  • BaseSegDataset
  • ADE20KDataset
  • ChaseDB1Dataset
  • CityscapesDataset
  • COCOStuffDataset
  • DarkZurichDataset
  • MultiImageMixDataset
  • DecathlonDataset
  • DRIVEDataset
  • HRFDataset
  • iSAIDDataset
  • ISPRSDataset
  • LIPDataset
  • LoveDADataset
  • NightDrivingDataset
  • PascalContextDataset
  • PascalContextDataset59
  • PotsdamDataset
  • STAREDataset
  • SynapseDataset
  • PascalVOCDataset
  • PackSegInputs
  • LoadAnnotations
  • LoadImageFromNDArray
  • LoadBiomedicalImageFromFile
  • LoadBiomedicalAnnotation
  • LoadBiomedicalData
  • ResizeToMultiple
  • Rerange
  • CLAHE
  • RandomCrop
  • RandomRotate
  • RGB2Gray
  • AdjustGamma
  • SegRescale
  • PhotoMetricDistortion
  • RandomCutOut
  • RandomRotFlip
  • RandomMosaic
  • GenerateEdge
  • ResizeShortestEdge
  • BioMedical3DRandomCrop
  • BioMedicalGaussianNoise
  • BioMedicalGaussianBlur
  • BioMedicalRandomGamma
  • BioMedical3DPad
  • BioMedical3DRandomFlip
+
+ + + + + + + + + + + + + + + + + + +
model (part 1)model (part 2)model (part 3)model (part 4)
  • VisionTransformer
  • BEiT
  • BiSeNetV1
  • BiSeNetV2
  • CGNet
  • ERFNet
  • CrossEntropyLoss
  • DiceLoss
  • FocalLoss
  • LovaszLoss
  • TverskyLoss
  • ANNHead
  • APCHead
  • ASPPHead
  • FCNHead
  • CCHead
  • DAHead
  • DMHead
  • DNLHead
  • DPTHead
  • EMAHead
  • EncHead
  • FPNHead
  • GCHead
  • ISAHead
  • KernelUpdator
  • KernelUpdateHead
  • IterativeDecodeHead
  • LRASPPHead
  • Mask2FormerHead
  • MaskFormerHead
  • NLHead
  • OCRHead
  • PointHead
  • PSAHead
  • PSPHead
  • SegformerHead
  • SegmenterMaskTransformerHead
  • DepthwiseSeparableASPPHead
  • DepthwiseSeparableFCNHead
  • SETRMLAHead
  • SETRUPHead
  • STDCHead
  • UPerHead
  • FastSCNN
  • ResNet
  • ResNetV1c
  • ResNetV1d
  • HRNet
  • ICNet
  • MAE
  • MixVisionTransformer
  • MobileNetV2
  • MobileNetV3
  • ResNeSt
  • ResNeXt
  • STDCNet
  • STDCContextPathNet
  • SwinTransformer
  • TIMMBackbone
  • PCPVT
  • SVT
  • DeconvModule
  • InterpConv
  • UNet
  • SegDataPreProcessor
  • Feature2Pyramid
  • FPN
  • ICNeck
  • JPU
  • MLANeck
  • MultiLevelNeck
  • EncoderDecoder
  • CascadeEncoderDecoder
  • SegTTAModel
+
+
MMsegmentation Tools
+
+ + + + + + + + + + + + + + + + + + +
tools/deploymenttools/misctools/torchservetools/analysis_tools
  • pytorch2torchscript.py
  • browse_dataset.py
  • publish_model.py
  • print_config.py
  • mmseg_handler.py
  • mmseg2torchserve.py
  • test_torchserve.py
  • benchmark.py
  • confusion_matrix.py
  • analyze_logs.py
  • get_flops.py
+
+ + + + + + + + + + + + + + + + +
toolstools/model_converterstools/dataset_converters
  • dist_test.sh
  • slurm_test.sh
  • test.py
  • dist_train.sh
  • train.py
  • slurm_train.sh
  • swin2mmseg.py
  • vitjax2mmseg.py
  • twins2mmseg.py
  • stdc2mmseg.py
  • vit2mmseg.py
  • mit2mmseg.py
  • beit2mmseg.py
  • voc_aug.py
  • hrf.py
  • drive.py
  • pascal_context.py
  • vaihingen.py
  • stare.py
  • synapse.py
  • isaid.py
  • cityscapes.py
  • loveda.py
  • potsdam.py
  • chase_db1.py
  • coco_stuff164k.py
  • coco_stuff10k.py
+
+ +## MMengine (0.6.0) + +
MMengine Module Components
+
+ + + + + + + + + + + + + + + + + + + + +
log_processorvisualizermetricevaluatorrunner
  • LogProcessor
  • Visualizer
  • DumpResults
  • Evaluator
  • Runner
+
+ + + + + + + + + + + + + + + + + + + + +
optimizer wrapper constructorCollate Functionsdata samplervis_backenddataset
  • DefaultOptimWrapperConstructor
  • pseudo_collate
  • default_collate
  • DefaultSampler
  • InfiniteSampler
  • LocalVisBackend
  • WandbVisBackend
  • TensorboardVisBackend
  • ConcatDataset
  • RepeatDataset
  • ClassBalancedDataset
+
+ + + + + + + + + + + + + + + + + + + + +
optim_wrapperloopmodel_wrappermodelweight initializer
  • OptimWrapper
  • AmpOptimWrapper
  • ApexOptimWrapper
  • EpochBasedTrainLoop
  • IterBasedTrainLoop
  • ValLoop
  • TestLoop
  • DistributedDataParallel
  • DataParallel
  • MMDistributedDataParallel
  • MMSeparateDistributedDataParallel
  • StochasticWeightAverage
  • ExponentialMovingAverage
  • MomentumAnnealingEMA
  • BaseDataPreprocessor
  • ImgDataPreprocessor
  • BaseTTAModel
  • ToyModel
  • Constant
  • Xavier
  • Normal
  • TruncNormal
  • Uniform
  • Kaiming
  • Caffe2Xavier
  • Pretrained
+
+ + + + + + + + + + + + + + + + + + +
hookoptimizerparameter scheduler (part 1)parameter scheduler (part 2)
  • CheckpointHook
  • EMAHook
  • EmptyCacheHook
  • IterTimerHook
  • LoggerHook
  • NaiveVisualizationHook
  • ParamSchedulerHook
  • ProfilerHook
  • NPUProfilerHook
  • RuntimeInfoHook
  • DistSamplerSeedHook
  • SyncBuffersHook
  • PrepareTTAHook
  • ASGD
  • Adadelta
  • Adagrad
  • Adam
  • AdamW
  • Adamax
  • LBFGS
  • Optimizer
  • RMSprop
  • Rprop
  • SGD
  • SparseAdam
  • ZeroRedundancyOptimizer
  • StepParamScheduler
  • MultiStepParamScheduler
  • ConstantParamScheduler
  • ExponentialParamScheduler
  • CosineAnnealingParamScheduler
  • LinearParamScheduler
  • PolyParamScheduler
  • OneCycleParamScheduler
  • CosineRestartParamScheduler
  • ReduceOnPlateauParamScheduler
  • ConstantLR
  • CosineAnnealingLR
  • ExponentialLR
  • LinearLR
  • MultiStepLR
  • StepLR
  • PolyLR
  • OneCycleLR
  • CosineRestartLR
  • ReduceOnPlateauLR
  • ConstantMomentum
  • CosineAnnealingMomentum
  • ExponentialMomentum
  • LinearMomentum
  • MultiStepMomentum
  • StepMomentum
  • PolyMomentum
  • CosineRestartMomentum
  • ReduceOnPlateauMomentum
+
+ +## MMCV (2.0.0rc4) + +
MMCV Module Components
+
+ + + + + + + + + + + + + + + + + + +
transformmodel (part 1)model (part 2)model (part 3)
  • LoadImageFromFile
  • LoadAnnotations
  • Compose
  • KeyMapper
  • TransformBroadcaster
  • RandomChoice
  • RandomApply
  • Normalize
  • Resize
  • Pad
  • CenterCrop
  • RandomGrayscale
  • MultiScaleFlipAug
  • TestTimeAug
  • RandomChoiceResize
  • RandomFlip
  • RandomResize
  • ToTensor
  • ImageToTensor
  • ReLU
  • LeakyReLU
  • PReLU
  • RReLU
  • ReLU6
  • ELU
  • Sigmoid
  • Tanh
  • SiLU
  • Clamp
  • Clip
  • GELU
  • ContextBlock
  • Conv1d
  • Conv2d
  • Conv3d
  • Conv
  • Conv2dAdaptivePadding
  • BN
  • BN1d
  • BN2d
  • BN3d
  • SyncBN
  • GN
  • LN
  • IN
  • IN1d
  • IN2d
  • IN3d
  • zero
  • reflect
  • replicate
  • ConvModule
  • ConvWS
  • ConvAWS
  • DropPath
  • Dropout
  • GeneralizedAttention
  • HSigmoid
  • HSwish
  • NonLocal2d
  • Swish
  • nearest
  • bilinear
  • pixel_shuffle
  • deconv
  • ConvTranspose2d
  • deconv3d
  • ConvTranspose3d
  • MultiheadAttention
  • FFN
  • BaseTransformerLayer
  • TransformerLayerSequence
+
+
MMCV Tools
+
+ + + + + + + + + + + + +
.dev_scripts
  • check_installation.py
+
diff --git a/third_party/mmyolo/docs/zh_cn/common_usage/resume_training.md b/third_party/mmyolo/docs/zh_cn/common_usage/resume_training.md new file mode 100644 index 0000000000000000000000000000000000000000..36431e32dc89ea6d38333e547d11173d3c6c1996 --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/common_usage/resume_training.md @@ -0,0 +1,9 @@ +# 恢复训练 + +恢复训练是指从之前某次训练保存下来的状态开始继续训练,这里的状态包括模型的权重、优化器和优化器参数调整策略的状态。 + +用户可以在训练命令最后加上 `--resume` 恢复训练,程序会自动从 `work_dirs` 中加载最新的权重文件恢复训练。如果 `work_dir` 中有最新的 checkpoint(例如该训练在上一次训练时被中断),则会从该 checkpoint 恢复训练,否则(例如上一次训练还没来得及保存 checkpoint 或者启动了新的训练任务)会重新开始训练。下面是一个恢复训练的示例: + +```shell +python tools/train.py configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py --resume +``` diff --git a/third_party/mmyolo/docs/zh_cn/common_usage/set_random_seed.md b/third_party/mmyolo/docs/zh_cn/common_usage/set_random_seed.md new file mode 100644 index 0000000000000000000000000000000000000000..6f747c54e890fae5816fbd5632cde8ac61f38f29 --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/common_usage/set_random_seed.md @@ -0,0 +1,20 @@ +# 设置随机种子 + +如果想要在训练时指定随机种子,可以使用以下命令: + +```shell +python ./tools/train.py \ + ${CONFIG} \ # 配置文件路径 + --cfg-options randomness.seed=2023 \ # 设置随机种子为 2023 + [randomness.diff_rank_seed=True] \ # 根据 rank 来设置不同的种子。 + [randomness.deterministic=True] # 把 cuDNN 后端确定性选项设置为 True +# [] 代表可选参数,实际输入命令行时,不用输入 [] +``` + +`randomness` 有三个参数可设置,具体含义如下: + +- `randomness.seed=2023` ,设置随机种子为 2023。 + +- `randomness.diff_rank_seed=True`,根据 rank 来设置不同的种子,`diff_rank_seed` 默认为 False。 + +- `randomness.deterministic=True`,把 cuDNN 后端确定性选项设置为 True,即把`torch.backends.cudnn.deterministic` 设为 True,把 `torch.backends.cudnn.benchmark` 设为False。`deterministic` 默认为 False。更多细节见 https://pytorch.org/docs/stable/notes/randomness.html。 diff --git a/third_party/mmyolo/docs/zh_cn/common_usage/set_syncbn.md b/third_party/mmyolo/docs/zh_cn/common_usage/set_syncbn.md new file mode 100644 index 0000000000000000000000000000000000000000..a654a2b46f388371d031767de01fba7a618ceb70 --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/common_usage/set_syncbn.md @@ -0,0 +1 @@ +# 开启和关闭 SyncBatchNorm diff --git a/third_party/mmyolo/docs/zh_cn/common_usage/single_multi_channel_applications.md b/third_party/mmyolo/docs/zh_cn/common_usage/single_multi_channel_applications.md new file mode 100644 index 0000000000000000000000000000000000000000..a20ef90445a2333994395c1638e2eb528e78762e --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/common_usage/single_multi_channel_applications.md @@ -0,0 +1,188 @@ +# 单通道和多通道应用案例 + +## 在单通道图像数据集上训练示例 + +MMYOLO 中默认的训练图片均为彩色三通道数据,如果希望采用单通道数据集进行训练和测试,预计需要修改的地方包括: + +1. 所有的图片处理 pipeline 都要支持单通道运算 +2. 模型的骨干网络的第一个卷积层输入通道需要从 3 改成 1 +3. 如果希望加载 COCO 预训练权重,则需要处理第一个卷积层权重尺寸不匹配问题 + +下面以 `cat` 数据集为例,描述整个修改过程,如果你使用的是自定义灰度图像数据集,你可以跳过数据集预处理这一步。 + +### 1 数据集预处理 + +自定义数据集的处理训练可参照[自定义数据集 标注+训练+测试+部署 全流程](../recommended_topics/labeling_to_deployment_tutorials.md)。 + +`cat` 是一个三通道彩色图片数据集,为了方便演示,你可以运行下面的代码和命令,将数据集图片替换为单通道图片,方便后续验证。 + +**1. 下载 `cat` 数据集进行解压** + +```shell +python tools/misc/download_dataset.py --dataset-name cat --save-dir ./data/cat --unzip --delete +``` + +**2. 
将数据集转换为灰度图** + +```python +import argparse +import imghdr +import os +from typing import List +import cv2 + +def parse_args(): + parser = argparse.ArgumentParser(description='data_path') + parser.add_argument('path', type=str, help='Original dataset path') + return parser.parse_args() + +def main(): + args = parse_args() + + path = args.path + '/images/' + save_path = path + file_list: List[str] = os.listdir(path) + # Grayscale conversion of each imager + for file in file_list: + if imghdr.what(path + '/' + file) != 'jpeg': + continue + img = cv2.imread(path + '/' + file) + img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + cv2.imwrite(save_path + '/' + file, img) + +if __name__ == '__main__': + main() +``` + +将上述脚本命名为 `cvt_single_channel.py`, 运行命令为: + +```shell +python cvt_single_channel.py data/cat +``` + +### 2 修改 base 配置文件 + +**目前 MMYOLO 的一些图像处理函数例如颜色空间变换还不兼容单通道图片,如果直接采用单通道数据训练需要修改部分 pipeline,工作量较大**。为了解决不兼容问题,推荐的做法是将单通道图片作为采用三通道图片方式读取将其加载为三通道数据,但是在输入到网络前将其转换为单通道格式。这种做法会稍微增加一些运算负担,但是用户基本不需要修改代码即可使用。 + +以 `projects/misc/custom_dataset/yolov5_s-v61_syncbn_fast_1xb32-100e_cat.py`为 `base` 配置,将其复制到 `configs/yolov5` 目录下,在同级配置路径下新增 `yolov5_s-v61_syncbn_fast_1xb32-100e_cat_single_channel.py` 文件。 我们可以 `mmyolo/models/data_preprocessors/data_preprocessor.py` 文件中继承 `YOLOv5DetDataPreprocessor` 并命名新类为 `YOLOv5SCDetDataPreprocessor`, 在其中将图片转成单通道,添加依赖库并在`mmyolo/models/data_preprocessors/__init__.py`中注册新类。 `YOLOv5SCDetDataPreprocessor` 示例代码为: + +```python +@MODELS.register_module() +class YOLOv5SCDetDataPreprocessor(YOLOv5DetDataPreprocessor): + """Rewrite collate_fn to get faster training speed. + + Note: It must be used together with `mmyolo.datasets.utils.yolov5_collate` + """ + + def forward(self, data: dict, training: bool = False) -> dict: + """Perform normalization, padding, bgr2rgb conversion and convert to single channel image based on ``DetDataPreprocessor``. + + Args: + data (dict): Data sampled from dataloader. + training (bool): Whether to enable training time augmentation. + + Returns: + dict: Data in the same format as the model input. + """ + if not training: + return super().forward(data, training) + + data = self.cast_data(data) + inputs, data_samples = data['inputs'], data['data_samples'] + assert isinstance(data['data_samples'], dict) + + # TODO: Supports multi-scale training + if self._channel_conversion and inputs.shape[1] == 3: + inputs = inputs[:, [2, 1, 0], ...] 
+ + if self._enable_normalize: + inputs = (inputs - self.mean) / self.std + + if self.batch_augments is not None: + for batch_aug in self.batch_augments: + inputs, data_samples = batch_aug(inputs, data_samples) + + img_metas = [{'batch_input_shape': inputs.shape[2:]}] * len(inputs) + data_samples = { + 'bboxes_labels': data_samples['bboxes_labels'], + 'img_metas': img_metas + } + + # Convert to single channel image + inputs = inputs.mean(dim=1, keepdim=True) + + return {'inputs': inputs, 'data_samples': data_samples} +``` + +此时 `yolov5_s-v61_syncbn_fast_1xb32-100e_cat_single_channel.py`配置文件内容为如下所示: + +```python +_base_ = 'yolov5_s-v61_syncbn_fast_1xb32-100e_cat.py' + +_base_.model.data_preprocessor.type = 'YOLOv5SCDetDataPreprocessor' +``` + +### 3 预训练模型加载问题 + +直接使用原三通道的预训练模型,理论上会导致精度有所降低(未实验验证)。可采用的解决思路:将输入层 3 通道每个通道的权重调整为原 3 通道权重的平均值, 或将输入层每个通道的权重调整为原3通道某一通道权重,也可以对输入层权重不做修改直接训练,具体效果根据实际情况有所不同。这里采用将输入层 3 个通道权重调整为预训练 3 通道权重平均值的方式。 + +```python +import torch + +def main(): + # 加载权重文件 + state_dict = torch.load( + 'checkpoints/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth' + ) + + # 修改输入层权重 + weights = state_dict['state_dict']['backbone.stem.conv.weight'] + avg_weight = weights.mean(dim=1, keepdim=True) + state_dict['state_dict']['backbone.stem.conv.weight'] = avg_weight + + # 保存修改后的权重到新文件 + torch.save( + state_dict, + 'checkpoints/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187_single_channel.pth' + ) + +if __name__ == '__main__': + main() +``` + +此时 `yolov5_s-v61_syncbn_fast_1xb32-100e_cat_single_channel.py`配置文件内容为如下所示: + +```python +_base_ = 'yolov5_s-v61_syncbn_fast_1xb32-100e_cat.py' + +_base_.model.data_preprocessor.type = 'YOLOv5SCDetDataPreprocessor' + +load_from = './checkpoints/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187_single_channel.pth' +``` + +### 4 模型训练效果 + + + +左图是实际标签,右图是目标检测结果。 + +```shell + Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.958 + Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 1.000 + Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 1.000 + Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = -1.000 + Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = -1.000 + Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.958 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.881 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.969 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.969 + Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = -1.000 + Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = -1.000 + Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.969 +bbox_mAP_copypaste: 0.958 1.000 1.000 -1.000 -1.000 0.958 +Epoch(val) [100][116/116] coco/bbox_mAP: 0.9580 coco/bbox_mAP_50: 1.0000 coco/bbox_mAP_75: 1.0000 coco/bbox_mAP_s: -1.0000 coco/bbox_mAP_m: -1.0000 coco/bbox_mAP_l: 0.9580 +``` + +## 在多通道图像数据集上训练示例 + +TODO diff --git a/third_party/mmyolo/docs/zh_cn/common_usage/specify_device.md b/third_party/mmyolo/docs/zh_cn/common_usage/specify_device.md new file mode 100644 index 0000000000000000000000000000000000000000..772e43dfc60f800223c68bc2e9196665d043b17e --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/common_usage/specify_device.md @@ -0,0 +1,23 @@ +# 指定特定设备训练或推理 + +如果你有多张 GPU,比如 8 张,其编号分别为 `0, 1, 2, 3, 4, 5, 6, 7`,使用单卡训练或推理时会默认使用卡 0。如果想指定其他卡进行训练或推理,可以使用以下命令: + 
+```shell +CUDA_VISIBLE_DEVICES=5 python ./tools/train.py ${CONFIG} #train +CUDA_VISIBLE_DEVICES=5 python ./tools/test.py ${CONFIG} ${CHECKPOINT_FILE} #test +``` + +如果设置`CUDA_VISIBLE_DEVICES`为 -1 或者一个大于 GPU 最大编号的数,比如 8,将会使用 CPU 进行训练或者推理。 + +如果你想使用其中几张卡并行训练,可以使用如下命令: + +```shell +CUDA_VISIBLE_DEVICES=0,1,2,3 ./tools/dist_train.sh ${CONFIG} ${GPU_NUM} +``` + +这里 `GPU_NUM` 为 4。另外如果在一台机器上多个任务同时多卡训练,需要设置不同的端口,比如以下命令: + +```shell +CUDA_VISIBLE_DEVICES=0,1,2,3 PORT=29500 ./tools/dist_train.sh ${CONFIG} 4 +CUDA_VISIBLE_DEVICES=4,5,6,7 PORT=29501 ./tools/dist_train.sh ${CONFIG} 4 +``` diff --git a/third_party/mmyolo/docs/zh_cn/common_usage/tta.md b/third_party/mmyolo/docs/zh_cn/common_usage/tta.md new file mode 100644 index 0000000000000000000000000000000000000000..7baf7a4e04a7a985438288a10a6557130888786e --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/common_usage/tta.md @@ -0,0 +1,87 @@ +# 测试时增强相关说明 + +## 测试时增强 TTA + +MMYOLO 在 v0.5.0+ 版本中增加对 TTA 的支持,用户可以在进行评估时候指定 `--tta` 参数使用。 以 `YOLOv5-s` 为例,其单卡 TTA 测试命令为: + +```shell +python tools/test.py configs/yolov5/yolov5_n-v61_syncbn_fast_8xb16-300e_coco.py https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_n-v61_syncbn_fast_8xb16-300e_coco/yolov5_n-v61_syncbn_fast_8xb16-300e_coco_20220919_090739-b804c1ad.pth --tta +``` + +TTA 功能的正常运行必须确保配置中存在 `tta_model` 和 `tta_pipeline` 两个变量,详情可以参考 [det_p5_tta.py](https://github.com/open-mmlab/mmyolo/blob/dev/configs/_base_/det_p5_tta.py)。 + +MMYOLO 中默认的 TTA 会先执行 3 个多尺度增强,然后在每个尺度中执行 2 种水平翻转增强,一共 6 个并行的 pipeline。以 `YOLOv5-s` 为例,其 TTA 配置为: + +```python +img_scales = [(640, 640), (320, 320), (960, 960)] + +_multiscale_resize_transforms = [ + dict( + type='Compose', + transforms=[ + dict(type='YOLOv5KeepRatioResize', scale=s), + dict( + type='LetterResize', + scale=s, + allow_scale_up=False, + pad_val=dict(img=114)) + ]) for s in img_scales +] + +tta_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='TestTimeAug', + transforms=[ + _multiscale_resize_transforms, + [ + dict(type='mmdet.RandomFlip', prob=1.), + dict(type='mmdet.RandomFlip', prob=0.) + ], [dict(type='mmdet.LoadAnnotations', with_bbox=True)], + [ + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'flip', + 'flip_direction')) + ] + ]) +] +``` + +其示意图如下所示: + +```text + LoadImageFromFile + / | \ +(RatioResize,LetterResize) (RatioResize,LetterResize) (RatioResize,LetterResize) + / \ / \ / \ + RandomFlip RandomFlip RandomFlip RandomFlip RandomFlip RandomFlip + | | | | | | + LoadAnn LoadAnn LoadAnn LoadAnn LoadAnn LoadAnn + | | | | | | + PackDetIn PackDetIn PackDetIn PackDetIn PackDetIn PackDetIn +``` + +你可以修改 `img_scales` 来支持不同的多尺度增强,也可以插入新的 pipeline 从而实现自定义 TTA 需求。 假设你只想进行水平翻转增强,则配置应该修改为如下: + +```python +tta_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='TestTimeAug', + transforms=[ + [ + dict(type='mmdet.RandomFlip', prob=1.), + dict(type='mmdet.RandomFlip', prob=0.) + ], [dict(type='mmdet.LoadAnnotations', with_bbox=True)], + [ + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param', 'flip', + 'flip_direction')) + ] + ]) +] +``` diff --git a/third_party/mmyolo/docs/zh_cn/conf.py b/third_party/mmyolo/docs/zh_cn/conf.py new file mode 100644 index 0000000000000000000000000000000000000000..70eb0c0e95eca1321a18853ee8da2a5f0807f986 --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/conf.py @@ -0,0 +1,117 @@ +# Configuration file for the Sphinx documentation builder. 
+# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +import os +import subprocess +import sys + +import pytorch_sphinx_theme + +sys.path.insert(0, os.path.abspath('../../')) + +# -- Project information ----------------------------------------------------- + +project = 'MMYOLO' +copyright = '2022, OpenMMLab' +author = 'MMYOLO Authors' +version_file = '../../mmyolo/version.py' + + +def get_version(): + with open(version_file) as f: + exec(compile(f.read(), version_file, 'exec')) + return locals()['__version__'] + + +# The full version, including alpha/beta/rc tags +release = get_version() + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.napoleon', + 'sphinx.ext.viewcode', + 'myst_parser', + 'sphinx_markdown_tables', + 'sphinx_copybutton', +] + +myst_enable_extensions = ['colon_fence'] +myst_heading_anchors = 3 + +autodoc_mock_imports = [ + 'matplotlib', 'pycocotools', 'terminaltables', 'mmyolo.version', 'mmcv.ops' +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +source_suffix = { + '.rst': 'restructuredtext', + '.md': 'markdown', +} + +# The master toctree document. +master_doc = 'index' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +# html_theme = 'sphinx_rtd_theme' +html_theme = 'pytorch_sphinx_theme' +html_theme_path = [pytorch_sphinx_theme.get_html_theme_path()] + +html_theme_options = { + 'menu': [ + { + 'name': 'GitHub', + 'url': 'https://github.com/open-mmlab/mmyolo' + }, + ], + # Specify the language of shared menu + 'menu_lang': 'cn', +} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] +html_css_files = ['css/readthedocs.css'] + +language = 'zh_CN' + +# -- Extension configuration ------------------------------------------------- +# Ignore >>> when copying code +copybutton_prompt_text = r'>>> |\.\.\. 
' +copybutton_prompt_is_regexp = True + + +def builder_inited_handler(app): + subprocess.run(['./stat.py']) + + +def setup(app): + app.connect('builder-inited', builder_inited_handler) diff --git a/third_party/mmyolo/docs/zh_cn/get_started/15_minutes_instance_segmentation.md b/third_party/mmyolo/docs/zh_cn/get_started/15_minutes_instance_segmentation.md new file mode 100644 index 0000000000000000000000000000000000000000..2b9e6aab8d54e39d260d56a2c2327ad7a6d9c80b --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/get_started/15_minutes_instance_segmentation.md @@ -0,0 +1,330 @@ +# 15 分钟上手 MMYOLO 实例分割 + +实例分割是计算机视觉中的一个任务,旨在将图像中的每个对象都分割出来,并为每个对象分配一个唯一的标识符。与语义分割不同,实例分割不仅分割出图像中的不同类别,还将同一类别的不同实例分开。 + +
+Instance Segmentation +
+ +以可供下载的气球 balloon 小数据集为例,带大家 15 分钟轻松上手 MMYOLO 实例分割。整个流程包含如下步骤: + +- [环境安装](#环境安装) +- [数据集准备](#数据集准备) +- [配置准备](#配置准备) +- [模型训练](#模型训练) +- [模型测试](#模型测试) +- [EasyDeploy 模型部署](#easydeploy-模型部署) + +本文以 YOLOv5-s 为例,其余 YOLO 系列算法的气球 balloon 小数据集 demo 配置请查看对应的算法配置文件夹下。 + +## 环境安装 + +假设你已经提前安装好了 Conda,接下来安装 PyTorch + +```shell +conda create -n mmyolo python=3.8 -y +conda activate mmyolo +# 如果你有 GPU +conda install pytorch torchvision -c pytorch +# 如果你是 CPU +# conda install pytorch torchvision cpuonly -c pytorch +``` + +安装 MMYOLO 和依赖库 + +```shell +git clone https://github.com/open-mmlab/mmyolo.git +cd mmyolo +pip install -U openmim +mim install -r requirements/mminstall.txt +# Install albumentations +mim install -r requirements/albu.txt +# Install MMYOLO +mim install -v -e . +# "-v" 指详细说明,或更多的输出 +# "-e" 表示在可编辑模式下安装项目,因此对代码所做的任何本地修改都会生效,从而无需重新安装。 +``` + +```{note} +温馨提醒:由于本仓库采用的是 OpenMMLab 2.0,请最好新建一个 conda 虚拟环境,防止和 OpenMMLab 1.0 已经安装的仓库冲突。 +``` + +详细环境配置操作请查看 [安装和验证](./installation.md) + +## 数据集准备 + +Balloon 数据集是一个包括 74 张图片的单类别数据集, 包括了训练所需的标注信息。 样例图片如下所示: + +
+balloon dataset +
+ +你只需执行如下命令即可下载并且直接用起来 + +```shell +python tools/misc/download_dataset.py --dataset-name balloon --save-dir ./data/balloon --unzip --delete +python ./tools/dataset_converters/balloon2coco.py +``` + +data 位于 mmyolo 工程目录下, `train.json`, `val.json` 中存放的是 COCO 格式的标注,`data/balloon/train`, `data/balloon/val` 中存放的是所有图片 + +## 配置准备 + +以 YOLOv5 算法为例,考虑到用户显存和内存有限,我们需要修改一些默认训练参数来让大家愉快的跑起来,核心需要修改的参数如下 + +- YOLOv5 是 Anchor-Based 类算法,不同的数据集需要自适应计算合适的 Anchor +- 默认配置是 8 卡,每张卡 batch size 为 16,现将其改成单卡,每张卡 batch size 为 4 +- 原则上 batch size 改变后,学习率也需要进行线性缩放,但是实测发现不需要 + +具体操作为在 `configs/yolov5/ins_seg` 文件夹下新建 `yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py` 配置文件(为了方便大家直接使用,我们已经提供了该配置),并把以下内容复制配置文件中。 + +```python +_base_ = './yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance.py' # noqa + +data_root = 'data/balloon/' +# 训练集标注路径 +train_ann_file = 'train.json' +train_data_prefix = 'train/' # 训练集图片路径 +# 测试集标注路径 +val_ann_file = 'val.json' +val_data_prefix = 'val/' # 验证集图片路径 +metainfo = { + 'classes': ('balloon', ), # 数据集类别名称 + 'palette': [ + (220, 20, 60), + ] +} +num_classes = 1 +# 批处理大小batch size设置为 4 +train_batch_size_per_gpu = 4 +# dataloader 加载进程数 +train_num_workers = 2 +log_interval = 1 +##################### +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + data_root=data_root, + metainfo=metainfo, + data_prefix=dict(img=train_data_prefix), + ann_file=train_ann_file)) +val_dataloader = dict( + dataset=dict( + data_root=data_root, + metainfo=metainfo, + data_prefix=dict(img=val_data_prefix), + ann_file=val_ann_file)) +test_dataloader = val_dataloader +val_evaluator = dict(ann_file=data_root + val_ann_file) +test_evaluator = val_evaluator +default_hooks = dict(logger=dict(interval=log_interval)) +##################### + +model = dict(bbox_head=dict(head_module=dict(num_classes=num_classes))) +``` + +以上配置从 `yolov5_ins_s-v61_syncbn_fast_8xb16-300e_coco_instance.py` 中继承,并根据 balloon 数据的特点更新了 `data_root`、`metainfo`、`train_dataloader`、`val_dataloader`、`num_classes` 等配置。 + +## 模型训练 + +```shell +python tools/train.py configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py +``` + +运行以上训练命令 `work_dirs/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance` 文件夹会被自动生成,权重文件以及此次的训练配置文件将会保存在此文件夹中。 在 1660 低端显卡上,整个训练过程大概需要 30 分钟。 + +
+image +
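+</div>
+
+顺带补充配置准备一节提到的学习率线性缩放:本教程实测不做缩放也能正常收敛,如果你仍希望按照线性缩放原则显式调整,可以在配置文件末尾追加类似下面的示意写法(假设 base 配置为 8 卡、每卡 batch size 16、初始学习率 0.01,请以实际继承的 base 配置为准):
+
+```python
+# 示意写法:按线性缩放原则调整学习率,base 配置的总 batch size 假设为 8 x 16
+base_total_batch_size = 8 * 16
+scaled_lr = 0.01 * (1 * train_batch_size_per_gpu) / base_total_batch_size
+
+optim_wrapper = dict(optimizer=dict(lr=scaled_lr))
+```
+
+<div align=center>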
+ +在 `val.json` 上性能如下所示: + +```text + Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.330 + Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.509 + Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.317 + Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.000 + Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.103 + Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.417 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.150 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.396 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.454 + Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.000 + Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.317 + Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.525 +``` + +上述性能是通过 COCO API 打印,其中 -1 表示不存在对于尺度的物体。 + +### 一些注意事项 + +在训练过程中会打印如下关键警告: + +- You are using `YOLOv5Head` with num_classes == 1. The loss_cls will be 0. This is a normal phenomenon. + +这个警告都不会对性能有任何影响。第一个警告是说明由于当前训练的类别数是 1,根据 YOLOv5 算法的社区, 分类分支的 loss 始终是 0,这是正常现象。 + +### 中断后恢复训练 + +如果训练中途停止,可以在训练命令最后加上 `--resume` ,程序会自动从 `work_dirs` 中加载最新的权重文件恢复训练。 + +```shell +python tools/train.py configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py --resume +``` + +### 节省显存策略 + +上述配置大概需要 1.0G 显存,如果你的显存不够,可以考虑开启混合精度训练 + +```shell +python tools/train.py configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py --amp +``` + +### 训练可视化 + +MMYOLO 目前支持本地、TensorBoard 以及 WandB 等多种后端可视化,默认是采用本地可视化方式,你可以切换为 WandB 等实时可视化训练过程中各类指标。 + +#### 1 WandB 可视化使用 + +WandB 官网注册并在 https://wandb.ai/settings 获取到 WandB 的 API Keys。 + +
+image +
+ +```shell +pip install wandb +# 运行了 wandb login 后输入上文中获取到的 API Keys ,便登录成功。 +wandb login +``` + +在 `configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py` 配置文件最后添加 WandB 配置 + +```python +visualizer = dict(vis_backends = [dict(type='LocalVisBackend'), dict(type='WandbVisBackend')]) +``` + +重新运行训练命令便可以在命令行中提示的网页链接中看到 loss、学习率和 coco/bbox_mAP 等数据可视化了。 + +```shell +python tools/train.py configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py +``` + +#### 2 Tensorboard 可视化使用 + +安装 Tensorboard 环境 + +```shell +pip install tensorboard +``` + +同上述在配置文件 `configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py`配置的最后添加 `tensorboard` 配置 + +```python +visualizer = dict(vis_backends=[dict(type='LocalVisBackend'), dict(type='TensorboardVisBackend')]) +``` + +重新运行训练命令后,Tensorboard 文件会生成在可视化文件夹 `work_dirs/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance/{timestamp}/vis_data` 下, +运行下面的命令便可以在网页链接使用 Tensorboard 查看 loss、学习率和 coco/bbox_mAP 等可视化数据了: + +```shell +tensorboard --logdir=work_dirs/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance +``` + +## 模型测试 + +```shell +python tools/test.py configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py \ + work_dirs/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance/best_coco_bbox_mAP_epoch_300.pth \ + --show-dir show_results +``` + +运行以上测试命令, 你不仅可以得到**模型训练**部分所打印的 AP 性能,还可以将推理结果图片自动保存至 `work_dirs/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance/{timestamp}/show_results` 文件夹中。下面为其中一张结果图片,左图为实际标注,右图为模型推理结果。 + +
+result_img +
+ +如果你使用了 `WandbVisBackend` 或者 `TensorboardVisBackend`,则还可以在浏览器窗口可视化模型推理结果。 + +## 特征图相关可视化 + +MMYOLO 中提供了特征图相关可视化脚本,用于分析当前模型训练效果。 详细使用流程请参考 [特征图可视化](../recommended_topics/visualization.md) + +由于 `test_pipeline` 直接可视化会存在偏差,故将需要 `configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py` 中 `test_pipeline` + +```python +test_pipeline = [ + dict( + type='LoadImageFromFile', + backend_args=_base_.backend_args), + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=False, + pad_val=dict(img=114)), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] +``` + +修改为如下配置: + +```python +test_pipeline = [ + dict( + type='LoadImageFromFile', + backend_args=_base_.backend_args), + dict(type='mmdet.Resize', scale=img_scale, keep_ratio=False), # 删除 YOLOv5KeepRatioResize, 将 LetterResize 修改成 mmdet.Resize + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) # 删除 pad_param +] +``` + +我们选择 `data/balloon/train/3927754171_9011487133_b.jpg` 图片作为例子,可视化 YOLOv5 backbone 和 neck 层的输出特征图。 + +```shell +python demo/featmap_vis_demo.py data/balloon/train/3927754171_9011487133_b.jpg \ + configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py \ + work_dirs/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance/best_coco_bbox_mAP_epoch_300.pth \ --target-layers backbone \ + --channel-reduction squeeze_mean +``` + +
+image +
+ +结果会保存到当前路径的 output 文件夹下。上图中绘制的 3 个输出特征图对应大中小输出特征图。 + +**2. 可视化 YOLOv5 neck 输出的 3 个通道** + +```shell +python demo/featmap_vis_demo.py data/balloon/train/3927754171_9011487133_b.jpg \ + configs/yolov5/ins_seg/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance.py \ + work_dirs/yolov5_ins_s-v61_syncbn_fast_8xb16-300e_balloon_instance/best_coco_bbox_mAP_epoch_300.pth \ --target-layers neck \ + --channel-reduction squeeze_mean +``` + +
+image +
+ +**3. Grad-Based CAM 可视化** + +TODO + +## EasyDeploy 模型部署 + +TODO + +至此本教程结束。 + +以上完整内容可以查看 [15_minutes_instance_segmentation.ipynb](../../../demo/15_minutes_instance_segmentation.ipynb)。 如果你在训练或者测试过程中碰到问题,请先查看 [常见错误排除步骤](../recommended_topics/troubleshooting_steps.md), 如果依然无法解决欢迎提 issue。 diff --git a/third_party/mmyolo/docs/zh_cn/get_started/15_minutes_object_detection.md b/third_party/mmyolo/docs/zh_cn/get_started/15_minutes_object_detection.md new file mode 100644 index 0000000000000000000000000000000000000000..51022baa94c4fdd5b4dc65479f23a7cdbd349fb2 --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/get_started/15_minutes_object_detection.md @@ -0,0 +1,533 @@ +# 15 分钟上手 MMYOLO 目标检测 + +目标检测任务是指给定一张图片,网络预测出图片中所包括的所有物体类别和对应的边界框 + +
+object detection +
+ +以我们提供的猫 cat 小数据集为例,带大家 15 分钟轻松上手 MMYOLO 目标检测。整个流程包含如下步骤: + +- [环境安装](#环境安装) +- [数据集准备](#数据集准备) +- [配置准备](#配置准备) +- [模型训练](#模型训练) +- [模型测试](#模型测试) +- [EasyDeploy 模型部署](#easydeploy-模型部署) + +本文以 YOLOv5-s 为例,其余 YOLO 系列算法的猫 cat 小数据集 demo 配置请查看对应的算法配置文件夹下。 + +## 环境安装 + +假设你已经提前安装好了 Conda,接下来安装 PyTorch + +```shell +conda create -n mmyolo python=3.8 -y +conda activate mmyolo +# 如果你有 GPU +conda install pytorch torchvision -c pytorch +# 如果你是 CPU +# conda install pytorch torchvision cpuonly -c pytorch +``` + +安装 MMYOLO 和依赖库 + +```shell +git clone https://github.com/open-mmlab/mmyolo.git +cd mmyolo +pip install -U openmim +mim install -r requirements/mminstall.txt +# Install albumentations +mim install -r requirements/albu.txt +# Install MMYOLO +mim install -v -e . +# "-v" 指详细说明,或更多的输出 +# "-e" 表示在可编辑模式下安装项目,因此对代码所做的任何本地修改都会生效,从而无需重新安装。 +``` + +```{note} +温馨提醒:由于本仓库采用的是 OpenMMLab 2.0,请最好新建一个 conda 虚拟环境,防止和 OpenMMLab 1.0 已经安装的仓库冲突。 +``` + +详细环境配置操作请查看 [安装和验证](./installation.md) + +## 数据集准备 + +Cat 数据集是一个包括 144 张图片的单类别数据集(本 cat 数据集由 @RangeKing 提供原始图片,由 @PeterH0323 进行数据清洗), 包括了训练所需的标注信息。 样例图片如下所示: + +
+cat dataset +
+ +你只需执行如下命令即可下载并且直接用起来 + +```shell +python tools/misc/download_dataset.py --dataset-name cat --save-dir ./data/cat --unzip --delete +``` + +数据集组织格式如下所示: + +
+image +
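+</div>
+
+上图目录结构对应的文字版大致如下,方便复制核对(仅列出本教程会用到的文件,实际解压内容可能还包含其他辅助文件):
+
+```text
+data/cat/
+├── annotations
+│   ├── trainval.json  # 训练集 COCO 格式标注
+│   └── test.json      # 测试集 COCO 格式标注
+└── images             # 所有图片
+    ├── xxx.jpg
+    └── ...
+```
+
+<div align=center>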
+ +data 位于 mmyolo 工程目录下, `data/cat/annotations` 中存放的是 COCO 格式的标注,`data/cat/images` 中存放的是所有图片 + +## 配置准备 + +以 YOLOv5 算法为例,考虑到用户显存和内存有限,我们需要修改一些默认训练参数来让大家愉快的跑起来,核心需要修改的参数如下 + +- YOLOv5 是 Anchor-Based 类算法,不同的数据集需要自适应计算合适的 Anchor +- 默认配置是 8 卡,每张卡 batch size 为 16,现将其改成单卡,每张卡 batch size 为 12 +- 默认训练 epoch 是 300,将其改成 40 epoch +- 由于数据集太小,我们选择固定 backbone 网络权重 +- 原则上 batch size 改变后,学习率也需要进行线性缩放,但是实测发现不需要 + +具体操作为在 `configs/yolov5` 文件夹下新建 `yolov5_s-v61_fast_1xb12-40e_cat.py` 配置文件(为了方便大家直接使用,我们已经提供了该配置),并把以下内容复制配置文件中。 + +```python +# 基于该配置进行继承并重写部分配置 +_base_ = 'yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py' + +data_root = './data/cat/' # 数据集根路径 +class_name = ('cat', ) # 数据集类别名称 +num_classes = len(class_name) # 数据集类别数 +# metainfo 必须要传给后面的 dataloader 配置,否则无效 +# palette 是可视化时候对应类别的显示颜色 +# palette 长度必须大于或等于 classes 长度 +metainfo = dict(classes=class_name, palette=[(20, 220, 60)]) + +# 基于 tools/analysis_tools/optimize_anchors.py 自适应计算的 anchor +anchors = [ + [(68, 69), (154, 91), (143, 162)], # P3/8 + [(242, 160), (189, 287), (391, 207)], # P4/16 + [(353, 337), (539, 341), (443, 432)] # P5/32 +] +# 最大训练 40 epoch +max_epochs = 40 +# bs 为 12 +train_batch_size_per_gpu = 12 +# dataloader 加载进程数 +train_num_workers = 4 + +# 加载 COCO 预训练权重 +load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth' # noqa + +model = dict( + # 固定整个 backbone 权重,不进行训练 + backbone=dict(frozen_stages=4), + bbox_head=dict( + head_module=dict(num_classes=num_classes), + prior_generator=dict(base_sizes=anchors) + )) + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + data_root=data_root, + metainfo=metainfo, + # 数据集标注文件 json 路径 + ann_file='annotations/trainval.json', + # 数据集前缀 + data_prefix=dict(img='images/'))) + +val_dataloader = dict( + dataset=dict( + metainfo=metainfo, + data_root=data_root, + ann_file='annotations/test.json', + data_prefix=dict(img='images/'))) + +test_dataloader = val_dataloader + +_base_.optim_wrapper.optimizer.batch_size_per_gpu = train_batch_size_per_gpu + +val_evaluator = dict(ann_file=data_root + 'annotations/test.json') +test_evaluator = val_evaluator + +default_hooks = dict( + # 每隔 10 个 epoch 保存一次权重,并且最多保存 2 个权重 + # 模型评估时候自动保存最佳模型 + checkpoint=dict(interval=10, max_keep_ckpts=2, save_best='auto'), + # warmup_mim_iter 参数非常关键,因为 cat 数据集非常小,默认的最小 warmup_mim_iter 是 1000,导致训练过程学习率偏小 + param_scheduler=dict(max_epochs=max_epochs, warmup_mim_iter=10), + # 日志打印间隔为 5 + logger=dict(type='LoggerHook', interval=5)) +# 评估间隔为 10 +train_cfg = dict(max_epochs=max_epochs, val_interval=10) +``` + +以上配置从 `yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py` 中继承,并根据 cat 数据的特点更新了 `data_root`、`metainfo`、`train_dataloader`、`val_dataloader`、`num_classes` 等配置。 + +## 模型训练 + +```shell +python tools/train.py configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py +``` + +运行以上训练命令 `work_dirs/yolov5_s-v61_fast_1xb12-40e_cat` 文件夹会被自动生成,权重文件以及此次的训练配置文件将会保存在此文件夹中。 在 1660 低端显卡上,整个训练过程大概需要 8 分钟。 + +
+image +
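+</div>
+
+这里补充一下上面配置中 `anchors` 的来源:注释中提到它是用 `tools/analysis_tools/optimize_anchors.py` 在 cat 数据集上自适应聚类得到的。如果换成自己的数据集,可以参考下面的示意命令重新计算(参数名和可选算法请以 `python tools/analysis_tools/optimize_anchors.py --help` 的输出为准),再把结果填回配置中的 `anchors`:
+
+```shell
+python tools/analysis_tools/optimize_anchors.py \
+    configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \
+    --algorithm v5-k-means \
+    --input-shape 640 640 \
+    --out-dir work_dirs/anchor_optimize
+```
+
+<div align=center>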
+ +在 `test.json` 上性能如下所示: + +```text + Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.631 + Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.909 + Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.747 + Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = -1.000 + Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = -1.000 + Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.631 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.627 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.703 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.703 + Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = -1.000 + Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = -1.000 + Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.703 +``` + +上述性能是通过 COCO API 打印,其中 -1 表示不存在对于尺度的物体。根据 COCO 定义的规则,Cat 数据集里面全部是大物体,不存在小和中等规模物体。 + +### 一些注意事项 + +在训练过程中会打印如下两个关键警告: + +- You are using `YOLOv5Head` with num_classes == 1. The loss_cls will be 0. This is a normal phenomenon. +- The model and loaded state dict do not match exactly + +这两个警告都不会对性能有任何影响。第一个警告是说明由于当前训练的类别数是 1,根据 YOLOv5 算法的社区, 分类分支的 loss 始终是 0,这是正常现象。第二个警告是因为目前是采用微调模式进行训练,我们加载了 COCO 80 个类的预训练权重, +这会导致最后的 Head 模块卷积通道数不对应,从而导致这部分权重无法加载,这也是正常现象。 + +### 中断后恢复训练 + +如果训练中途停止,可以在训练命令最后加上 `--resume` ,程序会自动从 `work_dirs` 中加载最新的权重文件恢复训练。 + +```shell +python tools/train.py configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py --resume +``` + +### 节省显存策略 + +上述配置大概需要 3.0G 显存,如果你的显存不够,可以考虑开启混合精度训练 + +```shell +python tools/train.py configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py --amp +``` + +### 训练可视化 + +MMYOLO 目前支持本地、TensorBoard 以及 WandB 等多种后端可视化,默认是采用本地可视化方式,你可以切换为 WandB 等实时可视化训练过程中各类指标。 + +#### 1 WandB 可视化使用 + +WandB 官网注册并在 https://wandb.ai/settings 获取到 WandB 的 API Keys。 + +
+image +
+ +```shell +pip install wandb +# 运行了 wandb login 后输入上文中获取到的 API Keys ,便登录成功。 +wandb login +``` + +在 `configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py` 配置文件最后添加 WandB 配置 + +```python +visualizer = dict(vis_backends = [dict(type='LocalVisBackend'), dict(type='WandbVisBackend')]) +``` + +重新运行训练命令便可以在命令行中提示的网页链接中看到 loss、学习率和 coco/bbox_mAP 等数据可视化了。 + +```shell +python tools/train.py configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py +``` + +
+image +
+
+image +
+ +#### 2 Tensorboard 可视化使用 + +安装 Tensorboard 依赖 + +```shell +pip install tensorboard +``` + +同上述在配置文件 `configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py`配置的最后添加 `tensorboard` 配置 + +```python +visualizer = dict(vis_backends=[dict(type='LocalVisBackend'), dict(type='TensorboardVisBackend')]) +``` + +重新运行训练命令后,Tensorboard 文件会生成在可视化文件夹 `work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/{timestamp}/vis_data` 下, +运行下面的命令便可以在网页链接使用 Tensorboard 查看 loss、学习率和 coco/bbox_mAP 等可视化数据了: + +```shell +tensorboard --logdir=work_dirs/yolov5_s-v61_fast_1xb12-40e_cat +``` + +## 模型测试 + +```shell +python tools/test.py configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \ + work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/epoch_40.pth \ + --show-dir show_results +``` + +运行以上测试命令, 你不仅可以得到**模型训练**部分所打印的 AP 性能,还可以将推理结果图片自动保存至 `work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/{timestamp}/show_results` 文件夹中。下面为其中一张结果图片,左图为实际标注,右图为模型推理结果。 + +
+result_img +
+ +如果你使用了 `WandbVisBackend` 或者 `TensorboardVisBackend`,则还可以在浏览器窗口可视化模型推理结果。 + +## 特征图相关可视化 + +MMYOLO 中提供了特征图相关可视化脚本,用于分析当前模型训练效果。 详细使用流程请参考 [特征图可视化](../recommended_topics/visualization.md) + +由于 `test_pipeline` 直接可视化会存在偏差,故将需要 `configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py` 中 `test_pipeline` + +```python +test_pipeline = [ + dict( + type='LoadImageFromFile', + backend_args=_base_.backend_args), + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=False, + pad_val=dict(img=114)), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] +``` + +修改为如下配置: + +```python +test_pipeline = [ + dict( + type='LoadImageFromFile', + backend_args=_base_.backend_args), + dict(type='mmdet.Resize', scale=img_scale, keep_ratio=False), # 删除 YOLOv5KeepRatioResize, 将 LetterResize 修改成 mmdet.Resize + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) # 删除 pad_param +] +``` + +我们选择 `data/cat/images/IMG_20221020_112705.jpg` 图片作为例子,可视化 YOLOv5 backbone 和 neck 层的输出特征图。 + +**1. 可视化 YOLOv5 backbone 输出的 3 个通道** + +```shell +python demo/featmap_vis_demo.py data/cat/images/IMG_20221020_112705.jpg \ + configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \ + work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/epoch_40.pth \ + --target-layers backbone \ + --channel-reduction squeeze_mean +``` + +
+image +
+ +结果会保存到当前路径的 output 文件夹下。上图中绘制的 3 个输出特征图对应大中小输出特征图。由于本次训练的 backbone 实际上没有参与训练,从上图可以看到,大物体 cat 是在小特征图进行预测,这符合目标检测分层检测思想。 + +**2. 可视化 YOLOv5 neck 输出的 3 个通道** + +```shell +python demo/featmap_vis_demo.py data/cat/images/IMG_20221020_112705.jpg \ + configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \ + work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/epoch_40.pth \ + --target-layers neck \ + --channel-reduction squeeze_mean +``` + +
+image +
+ +从上图可以看出,由于 neck 是参与训练的,并且由于我们重新设置了 anchor, 强行让 3 个输出特征图都拟合同一个尺度的物体,导致 neck 输出的 3 个图类似,破坏了 backbone 原先的预训练分布。同时也可以看出 40 epoch 训练上述数据集是不够的,特征图效果不佳。 + +**3. Grad-Based CAM 可视化** + +基于上述特征图可视化效果,我们可以分析特征层 bbox 级别的 Grad CAM。 + +安装 `grad-cam` 依赖: + +```shell +pip install "grad-cam" +``` + +(a) 查看 neck 输出的最小输出特征图的 Grad CAM + +```shell +python demo/boxam_vis_demo.py data/cat/images/IMG_20221020_112705.jpg \ + configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \ + work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/epoch_40.pth \ + --target-layer neck.out_layers[2] +``` + +
+image +
+ +(b) 查看 neck 输出的中等输出特征图的 Grad CAM + +```shell +python demo/boxam_vis_demo.py data/cat/images/IMG_20221020_112705.jpg \ + configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \ + work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/epoch_40.pth \ + --target-layer neck.out_layers[1] +``` + +
+image +
+ +(c) 查看 neck 输出的最大输出特征图的 Grad CAM + +```shell +python demo/boxam_vis_demo.py data/cat/images/IMG_20221020_112705.jpg \ + configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \ + work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/epoch_40.pth \ + --target-layer neck.out_layers[0] +``` + +
+image +
+ +## EasyDeploy 模型部署 + +此处我们将通过 MMYOLO 的 [EasyDeploy](../../../projects/easydeploy/) 来演示模型的转换部署和基本推理。 + +首先需要在当前 MMYOLO 的虚拟环境中按照 EasyDeploy 的 [基本文档](../../../projects/easydeploy/docs/model_convert.md) 对照自己的设备安装好所需的各个库。 + +```shell +pip install onnx onnxruntime +pip install onnx-simplifier # 如果需要使用 simplify 功能需要安装 +pip install tensorrt # 如果有 GPU 环境并且需要输出 TensorRT 模型需要继续执行 +``` + +完成安装后就可以用以下命令对已经训练好的针对 cat 数据集的模型一键转换部署,当前设备的 ONNX 版本为 1.13.0,TensorRT 版本为 8.5.3.1,故可保持 `--opset` 为 11,其余各项参数的具体含义和参数值需要对照使用的 config 文件进行调整。此处我们先导出 CPU 版本的 ONNX 模型,`--backend` 为 ONNXRUNTIME。 + +```shell +python projects/easydeploy/tools/export_onnx.py \ + configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \ + work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/epoch_40.pth \ + --work-dir work_dirs/yolov5_s-v61_fast_1xb12-40e_cat \ + --img-size 640 640 \ + --batch 1 \ + --device cpu \ + --simplify \ + --opset 11 \ + --backend ONNXRUNTIME \ + --pre-topk 1000 \ + --keep-topk 100 \ + --iou-threshold 0.65 \ + --score-threshold 0.25 +``` + +成功运行后就可以在 `work-dir` 下得到转换后的 ONNX 模型,默认使用 `end2end.onnx` 命名。 + +接下来我们使用此 `end2end.onnx` 模型来进行一个基本的图片推理: + +```shell +python projects/easydeploy/tools/image-demo.py \ + data/cat/images/IMG_20210728_205117.jpg \ + configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \ + work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/end2end.onnx \ + --device cpu +``` + +成功完成推理后会在默认的 MMYOLO 根目录下的 `output` 文件夹生成推理结果图,如果想直观看到结果而不需要保存,可以在上述命令结尾加上 `--show` ,为了方便展示,下图是生成结果的截取部分。 + +
+image +
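+</div>
+
+如果你想在自己的代码中直接加载 `end2end.onnx` 做进一步处理,也可以先用 onnxruntime 检查模型的输入输出并跑一次前向,下面是一个最小示意脚本(文件路径仅为示例,输出的个数、名称以你实际导出的模型为准):
+
+```python
+import numpy as np
+import onnxruntime as ort
+
+# 加载导出的 end2end.onnx(路径仅为示意,请替换为实际 work-dir 下的文件)
+session = ort.InferenceSession(
+    'work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/end2end.onnx',
+    providers=['CPUExecutionProvider'])
+
+# 打印输入输出的名称和形状,确认与导出参数(1x3x640x640、静态 batch)一致
+for inp in session.get_inputs():
+    print('input :', inp.name, inp.shape, inp.type)
+for out in session.get_outputs():
+    print('output:', out.name, out.shape, out.type)
+
+# 用随机数据跑一次前向,确认模型可以正常推理
+dummy = np.random.rand(1, 3, 640, 640).astype(np.float32)
+outputs = session.run(None, {session.get_inputs()[0].name: dummy})
+for node, arr in zip(session.get_outputs(), outputs):
+    print(node.name, arr.shape)
+```
+
+<div align=center>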
+ +我们继续转换对应 TensorRT 的 engine 文件,因为 TensorRT 需要对应当前环境以及部署使用的版本进行,所以一定要确认导出参数,这里我们导出对应 TensorRT8 版本的文件,`--backend` 为 2。 + +```shell +python projects/easydeploy/tools/export.py \ + configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \ + work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/epoch_40.pth \ + --work-dir work_dirs/yolov5_s-v61_fast_1xb12-40e_cat \ + --img-size 640 640 \ + --batch 1 \ + --device cuda:0 \ + --simplify \ + --opset 11 \ + --backend 2 \ + --pre-topk 1000 \ + --keep-topk 100 \ + --iou-threshold 0.65 \ + --score-threshold 0.25 +``` + +成功执行后得到的 `end2end.onnx` 就是对应 TensorRT8 部署需要的 ONNX 文件,我们使用这个文件完成 TensorRT engine 的转换。 + +```shell +python projects/easydeploy/tools/build_engine.py \ + work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/end2end.onnx \ + --img-size 640 640 \ + --device cuda:0 +``` + +成功执行后会在 `work-dir` 下生成 `end2end.engine` 文件: + +```text +work_dirs/yolov5_s-v61_fast_1xb12-40e_cat +├── 202302XX_XXXXXX +│ ├── 202302XX_XXXXXX.log +│ └── vis_data +│ ├── 202302XX_XXXXXX.json +│ ├── config.py +│ └── scalars.json +├── best_coco +│ └── bbox_mAP_epoch_40.pth +├── end2end.engine +├── end2end.onnx +├── epoch_30.pth +├── epoch_40.pth +├── last_checkpoint +└── yolov5_s-v61_fast_1xb12-40e_cat.py +``` + +我们继续使用 `image-demo.py` 进行图片推理: + +```shell +python projects/easydeploy/tools/image-demo.py \ + data/cat/images/IMG_20210728_205312.jpg \ + configs/yolov5/yolov5_s-v61_fast_1xb12-40e_cat.py \ + work_dirs/yolov5_s-v61_fast_1xb12-40e_cat/end2end.engine \ + --device cuda:0 +``` + +此处依旧选择在 `output` 下保存推理结果而非直接显示结果,同样为了方便展示,下图是生成结果的截取部分。 + +
+image +
+ +这样我们就完成了将训练完成的模型进行转换部署并且检查推理结果的工作。至此本教程结束。 + +以上完整内容可以查看 [15_minutes_object_detection.ipynb](../../..//demo/15_minutes_object_detection.ipynb)。 如果你在训练或者测试过程中碰到问题,请先查看 [常见错误排除步骤](../recommended_topics/troubleshooting_steps.md),如果依然无法解决欢迎提 [issue](https://github.com/open-mmlab/mmyolo/issues/new/choose)。 diff --git a/third_party/mmyolo/docs/zh_cn/get_started/15_minutes_rotated_object_detection.md b/third_party/mmyolo/docs/zh_cn/get_started/15_minutes_rotated_object_detection.md new file mode 100644 index 0000000000000000000000000000000000000000..ce4455c244c7a00b76a9d0ca974978a16e611d9f --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/get_started/15_minutes_rotated_object_detection.md @@ -0,0 +1,3 @@ +# 15 分钟上手 MMYOLO 旋转框目标检测 + +TODO diff --git a/third_party/mmyolo/docs/zh_cn/get_started/article.md b/third_party/mmyolo/docs/zh_cn/get_started/article.md new file mode 100644 index 0000000000000000000000000000000000000000..07f75e42b2ed57655161dff9377abcbdc5b074e3 --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/get_started/article.md @@ -0,0 +1,105 @@ +# 中文解读资源汇总 + +本文汇总了 MMYOLO 或相关的 [OpenMMLab](https://www.zhihu.com/people/openmmlab) 解读的部分文章(更多文章和视频见 [OpenMMLabCourse](https://github.com/open-mmlab/OpenMMLabCourse) ),如果您有推荐的文章(不一定是 OpenMMLab 发布的文章,可以是自己写的文章),非常欢迎提 Pull Request 添加到这里。 + +## MMYOLO 解读文章和资源 + +### 脚本命令速查表 + +
+ +
+ +你可以点击[链接](https://pan.baidu.com/s/1QEaqT7YayUdEvh1an0gjHg?pwd=yolo),下载高清版 PDF 文件。 + +### 文章 + +- [社区协作,简洁易用,快来开箱新一代 YOLO 系列开源库](https://zhuanlan.zhihu.com/p/575615805) +- [MMYOLO 社区倾情贡献,RTMDet 原理社区开发者解读来啦!](https://zhuanlan.zhihu.com/p/569777684) +- [MMYOLO 自定义数据集从标注到部署保姆级教程](https://zhuanlan.zhihu.com/p/595497726) +- [满足一切需求的 MMYOLO 可视化:测试过程可视化](https://zhuanlan.zhihu.com/p/593179372) +- [MMYOLO 想你所想: 训练过程可视化](https://zhuanlan.zhihu.com/p/608586878) +- [YOLOv8 深度详解!一文看懂,快速上手](https://zhuanlan.zhihu.com/p/598566644) +- [玩转 MMYOLO 基础类第一期: 配置文件太复杂?继承用法看不懂?配置全解读来了](https://zhuanlan.zhihu.com/p/577715188) +- [玩转 MMYOLO 工具类第一期: 特征图可视化](https://zhuanlan.zhihu.com/p/578141381?) +- [玩转 MMYOLO 实用类第一期:源码阅读和调试「必备」技巧文档](https://zhuanlan.zhihu.com/p/580885852) +- [玩转 MMYOLO 基础类第二期:工程文件结构简析](https://zhuanlan.zhihu.com/p/584807195) +- [玩转 MMYOLO 实用类第二期:10分钟换遍主干网络文档](https://zhuanlan.zhihu.com/p/585641598) + +### 视频 + +#### 工具类 + +| | 内容 | 视频 | 课程中的代码/文档 | +| :---: | :------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| 第1讲 | 特征图可视化 | [![Link](https://i2.hdslb.com/bfs/archive/480a0eb41fce26e0acb65f82a74501418eee1032.jpg@112w_63h_1c.webp)](https://www.bilibili.com/video/BV188411s7o8) [![bilibili](https://img.shields.io/badge/dynamic/json?label=views&style=social&logo=bilibili&query=data.stat.view&url=https%3A%2F%2Fapi.bilibili.com%2Fx%2Fweb-interface%2Fview%3Fbvid%3DBV188411s7o8)](https://www.bilibili.com/video/BV188411s7o8) | [特征图可视化文档](https://zhuanlan.zhihu.com/p/578141381)
[特征图可视化.ipynb](https://github.com/open-mmlab/OpenMMLabCourse/blob/main/codes/MMYOLO_tutorials/%5B%E5%B7%A5%E5%85%B7%E7%B1%BB%E7%AC%AC%E4%B8%80%E6%9C%9F%5D%E7%89%B9%E5%BE%81%E5%9B%BE%E5%8F%AF%E8%A7%86%E5%8C%96.ipynb) | +| 第2讲 | 基于 sahi 的大图推理 | [![Link](https://i0.hdslb.com/bfs/archive/62c41f508dbcf63a4c721738171612d2d7069ac2.jpg@112w_63h_1c.webp)](https://www.bilibili.com/video/BV1EK411R7Ws/) [![bilibili](https://img.shields.io/badge/dynamic/json?label=views&style=social&logo=bilibili&query=data.stat.view&url=https%3A%2F%2Fapi.bilibili.com%2Fx%2Fweb-interface%2Fview%3Fbvid%3DBV1EK411R7Ws)](https://www.bilibili.com/video/BV1EK411R7Ws/) | [10分钟轻松掌握大图推理.ipynb](https://github.com/open-mmlab/OpenMMLabCourse/blob/main/codes/MMYOLO_tutorials/[工具类第二期]10分钟轻松掌握大图推理.ipynb) | + +#### 基础类 + +| | 内容 | 视频 | 课程中的代码/文档 | +| :---: | :--------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------: | +| 第1讲 | 配置全解读 | [![Link](https://i1.hdslb.com/bfs/archive/e06daf640ea39b3c0700bb4dc758f1a253f33e13.jpg@112w_63h_1c.webp)](https://www.bilibili.com/video/BV1214y157ck) [![bilibili](https://img.shields.io/badge/dynamic/json?label=views&style=social&logo=bilibili&query=data.stat.view&url=https%3A%2F%2Fapi.bilibili.com%2Fx%2Fweb-interface%2Fview%3Fbvid%3DBV1214y157ck)](https://www.bilibili.com/video/BV1214y157ck) | [配置全解读文档](https://zhuanlan.zhihu.com/p/577715188) | +| 第2讲 | 工程文件结构简析 | [![Link](https://i2.hdslb.com/bfs/archive/41030efb84d0cada06d5451c1e6e9bccc0cdb5a3.jpg@112w_63h_1c.webp)](https://www.bilibili.com/video/BV1LP4y117jS)[![bilibili](https://img.shields.io/badge/dynamic/json?label=views&style=social&logo=bilibili&query=data.stat.view&url=https%3A%2F%2Fapi.bilibili.com%2Fx%2Fweb-interface%2Fview%3Fbvid%3DBV1LP4y117jS)](https://www.bilibili.com/video/BV1LP4y117jS) | [工程文件结构简析文档](https://zhuanlan.zhihu.com/p/584807195) | + +#### 实用类 + +| | 内容 | 视频 | 课程中的代码/文档 | +| :---: | :--------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| 第1讲 | 源码阅读和调试「必备」技巧 | [![Link](https://i2.hdslb.com/bfs/archive/790d2422c879ff20488910da1c4422b667ea6af7.jpg@112w_63h_1c.webp)](https://www.bilibili.com/video/BV1N14y1V7mB) [![bilibili](https://img.shields.io/badge/dynamic/json?label=views&style=social&logo=bilibili&query=data.stat.view&url=https%3A%2F%2Fapi.bilibili.com%2Fx%2Fweb-interface%2Fview%3Fbvid%3DBV1N14y1V7mB)](https://www.bilibili.com/video/BV1N14y1V7mB) | [源码阅读和调试「必备」技巧文档](https://zhuanlan.zhihu.com/p/580885852) | +| 第2讲 | 10分钟换遍主干网络 | 
[![Link](https://i0.hdslb.com/bfs/archive/c51f1aef7c605856777249a7b4478f44bd69f3bd.jpg@112w_63h_1c.webp)](https://www.bilibili.com/video/BV1JG4y1d7GC) [![bilibili](https://img.shields.io/badge/dynamic/json?label=views&style=social&logo=bilibili&query=data.stat.view&url=https%3A%2F%2Fapi.bilibili.com%2Fx%2Fweb-interface%2Fview%3Fbvid%3DBV1JG4y1d7GC)](https://www.bilibili.com/video/BV1JG4y1d7GC) | [10分钟换遍主干网络文档](https://zhuanlan.zhihu.com/p/585641598)
[10分钟换遍主干网络.ipynb](https://github.com/open-mmlab/OpenMMLabCourse/blob/main/codes/MMYOLO_tutorials/[实用类第二期]10分钟换遍主干网络.ipynb) | +| 第3讲 | 自定义数据集从标注到部署保姆级教程 | [![Link](https://i2.hdslb.com/bfs/archive/13f566c89a18c9c881713b63ec14da952d4c0b14.jpg@112w_63h_1c.webp)](https://www.bilibili.com/video/BV1RG4y137i5) [![bilibili](https://img.shields.io/badge/dynamic/json?label=views&style=social&logo=bilibili&query=data.stat.view&url=https%3A%2F%2Fapi.bilibili.com%2Fx%2Fweb-interface%2Fview%3Fbvid%3DBV1RG4y137i5)](https://www.bilibili.com/video/BV1RG4y137i5) | [自定义数据集从标注到部署保姆级教程](../recommended_topics/labeling_to_deployment_tutorials.md) | +| 第4讲 | 顶会第一步 · 模块自定义 | [![Link](http://i2.hdslb.com/bfs/archive/5b23d41ac57466824eaf185ef806ef734414e93b.jpg@112w_63h_1c.webp)](https://www.bilibili.com/video/BV1yd4y1j7VD) [![bilibili](https://img.shields.io/badge/dynamic/json?label=views&style=social&logo=bilibili&query=data.stat.view&url=https%3A%2F%2Fapi.bilibili.com%2Fx%2Fweb-interface%2Fview%3Fbvid%3DBV1yd4y1j7VD)](https://www.bilibili.com/video/BV1yd4y1j7VD) | [顶会第一步·模块自定义.ipynb](https://github.com/open-mmlab/OpenMMLabCourse/blob/main/codes/MMYOLO_tutorials/[实用类第四期]顶会第一步·模块自定义.ipynb) | + +#### 源码解读类 + +#### 演示类 + +| | 内容 | 视频 | +| :---: | :----------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| 第1期 | 特征图可视化 | [![Link](https://i0.hdslb.com/bfs/archive/081f300c84d6556f40d984cfbe801fc0644ff449.jpg@112w_63h_1c.webp)](https://www.bilibili.com/video/BV1je4y1478R/) [![bilibili](https://img.shields.io/badge/dynamic/json?label=views&style=social&logo=bilibili&query=data.stat.view&url=https%3A%2F%2Fapi.bilibili.com%2Fx%2Fweb-interface%2Fview%3Fbvid%3DBV1je4y1478R)](https://www.bilibili.com/video/BV1je4y1478R/) | + +## MMDetection 解读文章和资源 + +### 文章 + +- [MMDetection 3.0:目标检测新基准与前沿](https://zhuanlan.zhihu.com/p/575246786) +- [目标检测、实例分割、旋转框样样精通!详解高性能检测算法 RTMDet](https://zhuanlan.zhihu.com/p/598846422) +- [MMDetection 支持数据增强神器 Simple Copy Paste 全过程](https://zhuanlan.zhihu.com/p/559940982) + +### 知乎问答和资源 + +- [深度学习科研,如何高效进行代码和实验管理?](https://www.zhihu.com/question/269707221/answer/2480772257) +- [深度学习方面的科研工作中的实验代码有什么规范和写作技巧?如何妥善管理实验数据?](https://www.zhihu.com/question/268193800/answer/2586000037) +- [COCO 数据集上 1x 模式下为什么不采用多尺度训练?](https://www.zhihu.com/question/462170786/answer/1915119662) +- [MMDetection 中 SOTA 论文源码中将训练过程中 BN 层的 eval 打开?](https://www.zhihu.com/question/471189603/answer/2195540892) +- [基于 PyTorch 的 MMDetection 中训练的随机性来自何处?](https://www.zhihu.com/question/453511684/answer/1839683634) + +## MMEngine 解读文章和资源 + +- [从 MMCV 到 MMEngine,架构升级,体验升级!](https://zhuanlan.zhihu.com/p/571830155) + +## MMCV 解读文章和资源 + +- [MMCV 全新升级,新增超全数据变换功能,还有两大变化](https://zhuanlan.zhihu.com/p/572550592) +- [手把手教你如何高效地在 MMCV 中贡献算子](https://zhuanlan.zhihu.com/p/464492627) + +## PyTorch 解读文章和资源 + +- [PyTorch1.11 亮点一览:TorchData、functorch、DDP 静态图](https://zhuanlan.zhihu.com/p/486222256) +- [PyTorch1.12 亮点一览:DataPipe + TorchArrow 新的数据加载与处理范式](https://zhuanlan.zhihu.com/p/537868554) +- [PyTorch 源码解读之 nn.Module:核心网络模块接口详解](https://zhuanlan.zhihu.com/p/340453841) +- [PyTorch 源码解读之 torch.autograd:梯度计算详解](https://zhuanlan.zhihu.com/p/321449610) +- [PyTorch 源码解读之 
torch.utils.data:解析数据处理全流程](https://zhuanlan.zhihu.com/p/337850513) +- [PyTorch 源码解读之 torch.optim:优化算法接口详解](https://zhuanlan.zhihu.com/p/346205754) +- [PyTorch 源码解读之 DP & DDP:模型并行和分布式训练解析](https://zhuanlan.zhihu.com/p/343951042) +- [PyTorch 源码解读之 BN & SyncBN:BN 与 多卡同步 BN 详解](https://zhuanlan.zhihu.com/p/337732517) +- [PyTorch 源码解读之 torch.cuda.amp: 自动混合精度详解](https://zhuanlan.zhihu.com/p/348554267) +- [PyTorch 源码解读之 cpp_extension:揭秘 C++/CUDA 算子实现和调用全流程](https://zhuanlan.zhihu.com/p/348555597) +- [PyTorch 源码解读之即时编译篇](https://zhuanlan.zhihu.com/p/361101354) +- [PyTorch 源码解读之分布式训练了解一下?](https://zhuanlan.zhihu.com/p/361314953) +- [PyTorch 源码解读之 torch.serialization & torch.hub](https://zhuanlan.zhihu.com/p/364239544) + +## 其他 + +- [Type Hints 入门教程,让代码更加规范整洁](https://zhuanlan.zhihu.com/p/519335398) diff --git a/third_party/mmyolo/docs/zh_cn/get_started/dependencies.md b/third_party/mmyolo/docs/zh_cn/get_started/dependencies.md new file mode 100644 index 0000000000000000000000000000000000000000..8713c139379e4a5103200af2ee66cf792c6a7887 --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/get_started/dependencies.md @@ -0,0 +1,60 @@ +# 依赖 + +下表为 MMYOLO 和 MMEngine, MMCV, MMDetection 依赖库的版本要求,请安装正确的版本以避免安装问题。 + +| MMYOLO version | MMDetection version | MMEngine version | MMCV version | +| :------------: | :----------------------: | :----------------------: | :---------------------: | +| main | mmdet>=3.0.0, \<3.1.0 | mmengine>=0.7.1, \<1.0.0 | mmcv>=2.0.0rc4, \<2.1.0 | +| 0.6.0 | mmdet>=3.0.0, \<3.1.0 | mmengine>=0.7.1, \<1.0.0 | mmcv>=2.0.0rc4, \<2.1.0 | +| 0.5.0 | mmdet>=3.0.0rc6, \<3.1.0 | mmengine>=0.6.0, \<1.0.0 | mmcv>=2.0.0rc4, \<2.1.0 | +| 0.4.0 | mmdet>=3.0.0rc5, \<3.1.0 | mmengine>=0.3.1, \<1.0.0 | mmcv>=2.0.0rc0, \<2.1.0 | +| 0.3.0 | mmdet>=3.0.0rc5, \<3.1.0 | mmengine>=0.3.1, \<1.0.0 | mmcv>=2.0.0rc0, \<2.1.0 | +| 0.2.0 | mmdet>=3.0.0rc3, \<3.1.0 | mmengine>=0.3.1, \<1.0.0 | mmcv>=2.0.0rc0, \<2.1.0 | +| 0.1.3 | mmdet>=3.0.0rc3, \<3.1.0 | mmengine>=0.3.1, \<1.0.0 | mmcv>=2.0.0rc0, \<2.1.0 | +| 0.1.2 | mmdet>=3.0.0rc2, \<3.1.0 | mmengine>=0.3.0, \<1.0.0 | mmcv>=2.0.0rc0, \<2.1.0 | +| 0.1.1 | mmdet==3.0.0rc1 | mmengine>=0.1.0, \<0.2.0 | mmcv>=2.0.0rc0, \<2.1.0 | +| 0.1.0 | mmdet==3.0.0rc0 | mmengine>=0.1.0, \<0.2.0 | mmcv>=2.0.0rc0, \<2.1.0 | + +本节中,我们将演示如何用 PyTorch 准备一个环境。 + +MMYOLO 支持在 Linux,Windows 和 macOS 上运行。它的基本环境依赖为: + +- Python 3.7+ +- PyTorch 1.7+ +- CUDA 9.2+ +- GCC 5.4+ + +```{note} +如果你对 PyTorch 有经验并且已经安装了它,你可以直接跳转到下一小节。否则,你可以按照下述步骤进行准备 +``` + +**步骤 0.** 从 [官方网站](https://docs.conda.io/en/latest/miniconda.html) 下载并安装 Miniconda。 + +**步骤 1.** 创建并激活一个 conda 环境。 + +```shell +conda create -n mmyolo python=3.8 -y +conda activate mmyolo +``` + +**步骤 2.** 基于 [PyTorch 官方说明](https://pytorch.org/get-started/locally/) 安装 PyTorch。 + +在 GPU 平台上: + +```shell +conda install pytorch torchvision -c pytorch +``` + +在 CPU 平台上: + +```shell +conda install pytorch torchvision cpuonly -c pytorch +``` + +**步骤 3.** 验证 PyTorch 安装 + +```shell +python -c "import torch; print(torch.__version__); print(torch.cuda.is_available())" +``` + +如果是在 GPU 平台上,那么会打印版本信息和 True 字符,否则打印版本信息和 False 字符。 diff --git a/third_party/mmyolo/docs/zh_cn/get_started/installation.md b/third_party/mmyolo/docs/zh_cn/get_started/installation.md new file mode 100644 index 0000000000000000000000000000000000000000..be77bccc961cfb21e49570d54df24c326c0b77ad --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/get_started/installation.md @@ -0,0 +1,129 @@ +# 安装和验证 + +## 最佳实践 + +**步骤 0.** 使用 [MIM](https://github.com/open-mmlab/mim) 安装 
[MMEngine](https://github.com/open-mmlab/mmengine)、 [MMCV](https://github.com/open-mmlab/mmcv) 和 [MMDetection](https://github.com/open-mmlab/mmdetection) 。 + +```shell +pip install -U openmim +mim install "mmengine>=0.6.0" +mim install "mmcv>=2.0.0rc4,<2.1.0" +mim install "mmdet>=3.0.0,<4.0.0" +``` + +如果你当前已经处于 mmyolo 工程目录下,则可以采用如下简化写法 + +```shell +cd mmyolo +pip install -U openmim +mim install -r requirements/mminstall.txt +``` + +**注意:** + +a. 在 MMCV-v2.x 中,`mmcv-full` 改名为 `mmcv`,如果你想安装不包含 CUDA 算子精简版,可以通过 `mim install mmcv-lite>=2.0.0rc1` 来安装。 + +b. 如果使用 `albumentations`,我们建议使用 `pip install -r requirements/albu.txt` 或者 `pip install -U albumentations --no-binary qudida,albumentations` 进行安装。 如果简单地使用 `pip install albumentations==1.0.1` 进行安装,则会同时安装 `opencv-python-headless`(即便已经安装了 `opencv-python` 也会再次安装)。我们建议在安装 albumentations 后检查环境,以确保没有同时安装 `opencv-python` 和 `opencv-python-headless`,因为同时安装可能会导致一些问题。更多细节请参考 [官方文档](https://albumentations.ai/docs/getting_started/installation/#note-on-opencv-dependencies) 。 + +**步骤 1.** 安装 MMYOLO + +方案 1. 如果你基于 MMYOLO 框架开发自己的任务,建议从源码安装 + +```shell +git clone https://github.com/open-mmlab/mmyolo.git +cd mmyolo +# Install albumentations +mim install -r requirements/albu.txt +# Install MMYOLO +mim install -v -e . +# "-v" 指详细说明,或更多的输出 +# "-e" 表示在可编辑模式下安装项目,因此对代码所做的任何本地修改都会生效,从而无需重新安装。 +``` + +方案 2. 如果你将 MMYOLO 作为依赖或第三方 Python 包,使用 MIM 安装 + +```shell +mim install "mmyolo" +``` + +## 验证安装 + +为了验证 MMYOLO 是否安装正确,我们提供了一些示例代码来执行模型推理。 + +**步骤 1.** 我们需要下载配置文件和模型权重文件。 + +```shell +mim download mmyolo --config yolov5_s-v61_syncbn_fast_8xb16-300e_coco --dest . +``` + +下载将需要几秒钟或更长时间,这取决于你的网络环境。完成后,你会在当前文件夹中发现两个文件 `yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py` 和 `yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth`。 + +**步骤 2.** 推理验证 + +方案 1. 如果你通过源码安装的 MMYOLO,那么直接运行以下命令进行验证: + +```shell +python demo/image_demo.py demo/demo.jpg \ + yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py \ + yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth + +# 可选参数 +# --out-dir ./output *检测结果输出到指定目录下,默认为./output, 当--show参数存在时,不保存检测结果 +# --device cuda:0 *使用的计算资源,包括cuda, cpu等,默认为cuda:0 +# --show *使用该参数表示在屏幕上显示检测结果,默认为False +# --score-thr 0.3 *置信度阈值,默认为0.3 +``` + +运行结束后,在 `output` 文件夹中可以看到检测结果图像,图像中包含有网络预测的检测框。 + +支持输入类型包括 + +- 单张图片, 支持 `jpg`, `jpeg`, `png`, `ppm`, `bmp`, `pgm`, `tif`, `tiff`, `webp`。 +- 文件目录,会遍历文件目录下所有图片文件,并输出对应结果。 +- 网址,会自动从对应网址下载图片,并输出结果。 + +方案 2. 
如果你通过 MIM 安装的 MMYOLO, 那么可以打开你的 Python 解析器,复制并粘贴以下代码: + +```python +from mmdet.apis import init_detector, inference_detector + +config_file = 'yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py' +checkpoint_file = 'yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth' +model = init_detector(config_file, checkpoint_file, device='cpu') # or device='cuda:0' +inference_detector(model, 'demo/demo.jpg') +``` + +你将会看到一个包含 `DetDataSample` 的列表,预测结果在 `pred_instance` 里,包含有预测框、预测分数 和 预测类别。 + +## 通过 Docker 使用 MMYOLO + +我们提供了一个 [Dockerfile](https://github.com/open-mmlab/mmyolo/blob/main/docker/Dockerfile) 来构建一个镜像。请确保你的 [docker 版本](https://docs.docker.com/engine/install/) >=`19.03`。 + +温馨提示;国内用户建议取消掉 [Dockerfile](https://github.com/open-mmlab/mmyolo/blob/main/docker/Dockerfile#L19-L20) 里面 `Optional` 后两行的注释,可以获得火箭一般的下载提速: + +```dockerfile +# (Optional) +RUN sed -i 's/http:\/\/archive.ubuntu.com\/ubuntu\//http:\/\/mirrors.aliyun.com\/ubuntu\//g' /etc/apt/sources.list && \ + pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple +``` + +构建命令: + +```shell +# build an image with PyTorch 1.9, CUDA 11.1 +# If you prefer other versions, just modified the Dockerfile +docker build -t mmyolo docker/ +``` + +用以下命令运行 Docker 镜像: + +```shell +export DATA_DIR=/path/to/your/dataset +docker run --gpus all --shm-size=8g -it -v ${DATA_DIR}:/mmyolo/data mmyolo +``` + +其余自定义安装流程请查看 [自定义安装](../tutorials/custom_installation.md) + +## 排除故障 + +如果你在安装过程中遇到一些问题,你可以在 GitHub 上 [打开一个问题](https://github.com/open-mmlab/mmyolo/issues/new/choose)。 diff --git a/third_party/mmyolo/docs/zh_cn/get_started/overview.md b/third_party/mmyolo/docs/zh_cn/get_started/overview.md new file mode 100644 index 0000000000000000000000000000000000000000..a6adc41748326cc9f9e34929b31a2abc07fc5a7c --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/get_started/overview.md @@ -0,0 +1,81 @@ +# 概述 + +## MMYOLO 介绍 + +
+图片 +
+ +MMYOLO 是一个基于 PyTorch 和 MMDetection 的 YOLO 系列算法开源工具箱,它是 [OpenMMLab](https://openmmlab.com/) 项目的一部分。 MMYOLO 定位为 YOLO 系列热门开源库以及工业应用核心库,其愿景图如下所示: + +
+愿景图 +
+ +目前支持的任务如下: + +
+支持的任务 + +- 目标检测 +- 旋转框目标检测 + +
+ +目前支持的 YOLO 系列算法如下: + +
+支持的算法 + +- YOLOv5 +- YOLOX +- RTMDet +- RTMDet-Rotated +- YOLOv6 +- YOLOv7 +- PPYOLOE +- YOLOv8 + +
+ +目前支持的数据集如下: + +
+支持的数据集 + +- COCO Dataset +- VOC Dataset +- CrowdHuman Dataset +- DOTA 1.0 Dataset + +
+ +MMYOLO 支持在 Linux、Windows、macOS 上运行, 支持 PyTorch 1.7 及其以上版本运行。它具有如下三个特性: + +- 🕹️ **统一便捷的算法评测** + + MMYOLO 统一了各类 YOLO 算法模块的实现,并提供了统一的评测流程,用户可以公平便捷地进行对比分析。 + +- 📚 **丰富的入门和进阶文档** + + MMYOLO 提供了从入门到部署到进阶和算法解析等一系列文档,方便不同用户快速上手和扩展。 + +- 🧩 **模块化设计** + + MMYOLO 将框架解耦成不同的模块组件,通过组合不同的模块和训练测试策略,用户可以便捷地构建自定义模型。 + +基类-P5 + 图为 RangeKing@GitHub 提供,非常感谢! + +## 本文档使用指南 + +MMYOLO 中将文档结构分成 6 个部分,对应不同需求的用户。 + +- **开启 MMYOLO 之旅**。本部分是第一次使用 MMYOLO 用户的必读文档,请全文仔细阅读 +- **推荐专题**。本部分是 MMYOLO 中提供的以主题形式的精华文档,包括了 MMYOLO 中大量的特性等。强烈推荐使用 MMYOLO 的所有用户阅读 +- **常用功能**。本部分提供了训练测试过程中用户经常会用到的各类常用功能,用户可以在用到时候再次查阅 +- **实用工具**。本部分是 tools 下使用工具的汇总文档,便于大家能够快速的愉快使用 MMYOLO 中提供的各类脚本 +- **基础和进阶教程**。本部分涉及到 MMYOLO 中的一些基本概念和进阶教程等,适合想详细了解 MMYOLO 设计思想和结构设计的用户 +- **其他**。其余部分包括模型仓库、说明和接口文档等等 + +不同需求的用户可以按需选择你心怡的内容阅读。如果你对本文档有异议或者更好的优化办法,欢迎给 MMYOLO 提 PR ~, 请参考 [如何给 MMYOLO 贡献代码](../recommended_topics/contributing.md) diff --git a/third_party/mmyolo/docs/zh_cn/index.rst b/third_party/mmyolo/docs/zh_cn/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..9f150ac6c8d66e6c64603cff69e7d6be15b6ab01 --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/index.rst @@ -0,0 +1,122 @@ +欢迎来到 MMYOLO 中文文档! +======================================= +您可以在页面右上角切换中英文文档。 + +.. toctree:: + :maxdepth: 2 + :caption: 开启 MMYOLO 之旅 + + get_started/overview.md + get_started/dependencies.md + get_started/installation.md + get_started/15_minutes_object_detection.md + get_started/15_minutes_rotated_object_detection.md + get_started/15_minutes_instance_segmentation.md + get_started/article.md + +.. toctree:: + :maxdepth: 2 + :caption: 推荐专题 + + recommended_topics/contributing.md + recommended_topics/training_testing_tricks.md + recommended_topics/model_design.md + recommended_topics/algorithm_descriptions/index.rst + recommended_topics/application_examples/index.rst + recommended_topics/replace_backbone.md + recommended_topics/complexity_analysis.md + recommended_topics/labeling_to_deployment_tutorials.md + recommended_topics/visualization.md + recommended_topics/deploy/index.rst + recommended_topics/troubleshooting_steps.md + recommended_topics/mm_basics.md + recommended_topics/dataset_preparation.md + +.. toctree:: + :maxdepth: 2 + :caption: 常用功能 + + common_usage/resume_training.md + common_usage/syncbn.md + common_usage/amp_training.md + common_usage/ms_training_testing.md + common_usage/tta.md + common_usage/plugins.md + common_usage/freeze_layers.md + common_usage/output_predictions.md + common_usage/set_random_seed.md + common_usage/module_combination.md + common_usage/mim_usage.md + common_usage/multi_necks.md + common_usage/specify_device.md + common_usage/single_multi_channel_applications.md + common_usage/registries_info.md + + +.. toctree:: + :maxdepth: 2 + :caption: 实用工具 + + useful_tools/browse_coco_json.md + useful_tools/browse_dataset.md + useful_tools/print_config.md + useful_tools/dataset_analysis.md + useful_tools/optimize_anchors.md + useful_tools/extract_subcoco.md + useful_tools/vis_scheduler.md + useful_tools/dataset_converters.md + useful_tools/download_dataset.md + useful_tools/log_analysis.md + useful_tools/model_converters.md + +.. toctree:: + :maxdepth: 2 + :caption: 基础教程 + + tutorials/config.md + tutorials/data_flow.md + tutorials/rotated_detection.md + tutorials/custom_installation.md + tutorials/warning_notes.md + tutorials/faq.md + + +.. toctree:: + :maxdepth: 2 + :caption: 进阶教程 + + advanced_guides/cross-library_application.md + + +.. toctree:: + :maxdepth: 2 + :caption: 模型仓库 + + model_zoo.md + +.. 
toctree:: + :maxdepth: 1 + :caption: 说明 + + notes/changelog.md + notes/compatibility.md + notes/conventions.md + notes/code_style.md + +.. toctree:: + :maxdepth: 1 + :caption: 接口文档(英文) + + api.rst + +.. toctree:: + :caption: 语言切换 + + switch_language.md + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`search` diff --git a/third_party/mmyolo/docs/zh_cn/make.bat b/third_party/mmyolo/docs/zh_cn/make.bat new file mode 100644 index 0000000000000000000000000000000000000000..922152e96a04a242e6fc40f124261d74890617d8 --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/third_party/mmyolo/docs/zh_cn/model_zoo.md b/third_party/mmyolo/docs/zh_cn/model_zoo.md new file mode 100644 index 0000000000000000000000000000000000000000..1091f9f5719f9b3e373cd7a04d32efd0097023f6 --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/model_zoo.md @@ -0,0 +1,94 @@ +# 模型库和评测 + +本页面用于汇总 MMYOLO 中支持的各类模型性能和相关评测指标,方便用户对比分析。 + +## COCO 数据集 + +
+ +
+ +| Model | Arch | Size | Batch Size | Epoch | SyncBN | AMP | Mem (GB) | Params(M) | FLOPs(G) | TRT-FP16-GPU-Latency(ms) | Box AP | TTA Box AP | +| :--------------: | :--: | :--: | :--------: | :---: | :----: | :-: | :------: | :-------: | :------: | :----------------------: | :----: | :--------: | +| YOLOv5-n | P5 | 640 | 8xb16 | 300 | Yes | Yes | 1.5 | 1.87 | 2.26 | 1.14 | 28.0 | 30.7 | +| YOLOv6-v2.0-n | P5 | 640 | 8xb32 | 400 | Yes | Yes | 6.04 | 4.32 | 5.52 | 1.37 | 36.2 | | +| YOLOv8-n | P5 | 640 | 8xb16 | 500 | Yes | Yes | 2.5 | 3.16 | 4.4 | 1.53 | 37.4 | 39.9 | +| RTMDet-tiny | P5 | 640 | 8xb32 | 300 | Yes | No | 11.9 | 4.90 | 8.09 | 2.31 | 41.8 | 43.2 | +| YOLOv6-v2.0-tiny | P5 | 640 | 8xb32 | 400 | Yes | Yes | 8.13 | 9.70 | 12.37 | 2.19 | 41.0 | | +| YOLOv7-tiny | P5 | 640 | 8xb16 | 300 | Yes | Yes | 2.7 | 6.23 | 6.89 | 1.88 | 37.5 | | +| YOLOX-tiny | P5 | 416 | 8xb32 | 300 | No | Yes | 4.9 | 5.06 | 7.63 | 1.19 | 34.3 | | +| RTMDet-s | P5 | 640 | 8xb32 | 300 | Yes | No | 16.3 | 8.89 | 14.84 | 2.89 | 45.7 | 47.3 | +| YOLOv5-s | P5 | 640 | 8xb16 | 300 | Yes | Yes | 2.7 | 7.24 | 8.27 | 1.89 | 37.7 | 40.2 | +| YOLOv6-v2.0-s | P5 | 640 | 8xb32 | 400 | Yes | Yes | 8.88 | 17.22 | 21.94 | 2.67 | 44.0 | | +| YOLOv8-s | P5 | 640 | 8xb16 | 500 | Yes | Yes | 4.0 | 11.17 | 14.36 | 2.61 | 45.1 | 46.8 | +| YOLOX-s | P5 | 640 | 8xb32 | 300 | No | Yes | 9.8 | 8.97 | 13.40 | 2.38 | 41.9 | | +| PPYOLOE+ -s | P5 | 640 | 8xb8 | 80 | Yes | No | 4.7 | 7.93 | 8.68 | 2.54 | 43.5 | | +| RTMDet-m | P5 | 640 | 8xb32 | 300 | Yes | No | 29.0 | 24.71 | 39.21 | 6.23 | 50.2 | 51.9 | +| YOLOv5-m | P5 | 640 | 8xb16 | 300 | Yes | Yes | 5.0 | 21.19 | 24.53 | 4.28 | 45.3 | 46.9 | +| YOLOv6-v2.0-m | P5 | 640 | 8xb32 | 300 | Yes | Yes | 16.69 | 34.25 | 40.7 | 5.12 | 48.4 | | +| YOLOv8-m | P5 | 640 | 8xb16 | 500 | Yes | Yes | 7.0 | 25.9 | 39.57 | 5.78 | 50.6 | 52.3 | +| YOLOX-m | P5 | 640 | 8xb32 | 300 | No | Yes | 17.6 | 25.33 | 36.88 | 5.31 | 47.5 | | +| PPYOLOE+ -m | P5 | 640 | 8xb8 | 80 | Yes | No | 8.4 | 23.43 | 24.97 | 5.47 | 49.5 | | +| RTMDet-l | P5 | 640 | 8xb32 | 300 | Yes | No | 45.2 | 52.32 | 80.12 | 10.13 | 52.3 | 53.7 | +| YOLOv5-l | P5 | 640 | 8xb16 | 300 | Yes | Yes | 8.1 | 46.56 | 54.65 | 6.8 | 48.8 | 49.9 | +| YOLOv6-v2.0-l | P5 | 640 | 8xb32 | 300 | Yes | Yes | 20.86 | 58.53 | 71.43 | 8.78 | 51.0 | | +| YOLOv7-l | P5 | 640 | 8xb16 | 300 | Yes | Yes | 10.3 | 36.93 | 52.42 | 6.63 | 50.9 | | +| YOLOv8-l | P5 | 640 | 8xb16 | 500 | Yes | Yes | 9.1 | 43.69 | 82.73 | 8.97 | 53.0 | 54.4 | +| YOLOX-l | P5 | 640 | 8xb8 | 300 | No | Yes | 8.0 | 54.21 | 77.83 | 9.23 | 50.1 | | +| PPYOLOE+ -l | P5 | 640 | 8xb8 | 80 | Yes | No | 13.2 | 52.20 | 55.05 | 8.2 | 52.6 | | +| RTMDet-x | P5 | 640 | 8xb32 | 300 | Yes | No | 63.4 | 94.86 | 145.41 | 17.89 | 52.8 | 54.2 | +| YOLOv7-x | P5 | 640 | 8xb16 | 300 | Yes | Yes | 13.7 | 71.35 | 95.06 | 11.63 | 52.8 | | +| YOLOv8-x | P5 | 640 | 8xb16 | 500 | Yes | Yes | 12.4 | 68.23 | 132.10 | 14.22 | 54.0 | 55.0 | +| YOLOX-x | P5 | 640 | 8xb8 | 300 | No | Yes | 9.8 | 99.07 | 144.39 | 15.35 | 51.4 | | +| PPYOLOE+ -x | P5 | 640 | 8xb8 | 80 | Yes | No | 19.1 | 98.42 | 105.48 | 14.02 | 54.2 | | +| YOLOv5-n | P6 | 1280 | 8xb16 | 300 | Yes | Yes | 5.8 | 3.25 | 2.30 | | 35.9 | | +| YOLOv5-s | P6 | 1280 | 8xb16 | 300 | Yes | Yes | 10.5 | 12.63 | 8.45 | | 44.4 | | +| YOLOv5-m | P6 | 1280 | 8xb16 | 300 | Yes | Yes | 19.1 | 35.73 | 25.05 | | 51.3 | | +| YOLOv5-l | P6 | 1280 | 8xb16 | 300 | Yes | Yes | 30.5 | 76.77 | 55.77 | | 53.7 | | +| YOLOv7-w | P6 | 1280 | 8xb16 | 300 | Yes | Yes | 27.0 | 
82.31 | 45.07 | | 54.1 | | +| YOLOv7-e | P6 | 1280 | 8xb16 | 300 | Yes | Yes | 42.5 | 114.69 | 64.48 | | 55.1 | | + +- 所有模型均使用 COCO train2017 作为训练集,在 COCO val2017 上验证精度 +- TRT-FP16-GPU-Latency(ms) 是指在 NVIDIA Tesla T4 设备上采用 TensorRT 8.4,batch size 为 1, 测试 shape 为 640x640 且仅包括模型 forward 的 GPU Compute time (YOLOX-tiny 测试 shape 是 416x416) +- 模型参数量和 FLOPs 是采用 [get_flops](https://github.com/open-mmlab/mmyolo/blob/dev/tools/analysis_tools/get_flops.py) 脚本得到,不同的运算方式可能略有不同 +- RTMDet 性能是通过 [MMRazor 知识蒸馏](https://github.com/open-mmlab/mmyolo/blob/dev/configs/rtmdet/distillation/README.md) 训练后的结果 +- MMYOLO 中暂时只实现了 YOLOv6 2.0 版本,并且 L 和 M 为没有经过知识蒸馏的结果 +- YOLOv8 是引入了实例分割标注优化后的结果,YOLOv5、YOLOv6 和 YOLOv7 没有采用实例分割标注优化 +- PPYOLOE+ 使用 Obj365 作为预训练权重,因此 COCO 训练的 epoch 数只需要 80 +- YOLOX-tiny、YOLOX-s 和 YOLOX-m 为采用了 RTMDet 中所提优化器参数训练所得,性能相比原始实现有不同程度提升 + +详情见如下内容 + +- [RTMDet](https://github.com/open-mmlab/mmyolo/blob/main/configs/rtmdet) +- [YOLOv5](https://github.com/open-mmlab/mmyolo/blob/main/configs/yolov5) +- [YOLOv6](https://github.com/open-mmlab/mmyolo/blob/main/configs/yolov6) +- [YOLOv7](https://github.com/open-mmlab/mmyolo/blob/main/configs/yolov7) +- [YOLOv8](https://github.com/open-mmlab/mmyolo/blob/main/configs/yolov8) +- [YOLOX](https://github.com/open-mmlab/mmyolo/blob/main/configs/yolox) +- [PPYOLO-E](https://github.com/open-mmlab/mmyolo/blob/main/configs/ppyoloe) + +## VOC 数据集 + +| Backbone | size | Batchsize | AMP | Mem (GB) | box AP(COCO metric) | +| :------: | :--: | :-------: | :-: | :------: | :-----------------: | +| YOLOv5-n | 512 | 64 | Yes | 3.5 | 51.2 | +| YOLOv5-s | 512 | 64 | Yes | 6.5 | 62.7 | +| YOLOv5-m | 512 | 64 | Yes | 12.0 | 70.1 | +| YOLOv5-l | 512 | 32 | Yes | 10.0 | 73.1 | + +详情见如下内容 + +- [YOLOv5](https://github.com/open-mmlab/mmyolo/blob/main/configs/yolov5) + +## CrowdHuman 数据集 + +| Backbone | size | SyncBN | AMP | Mem (GB) | ignore_iof_thr | box AP50(CrowDHuman Metric) | MR | JI | +| :------: | :--: | :----: | :-: | :------: | :------------: | :-------------------------: | :--: | :---: | +| YOLOv5-s | 640 | Yes | Yes | 2.6 | -1 | 85.79 | 48.7 | 75.33 | +| YOLOv5-s | 640 | Yes | Yes | 2.6 | 0.5 | 86.17 | 48.8 | 75.87 | + +详情见如下内容 + +- [YOLOv5](https://github.com/open-mmlab/mmyolo/blob/main/configs/yolov5) + +## DOTA 1.0 数据集 diff --git a/third_party/mmyolo/docs/zh_cn/notes/changelog.md b/third_party/mmyolo/docs/zh_cn/notes/changelog.md new file mode 100644 index 0000000000000000000000000000000000000000..90fef595865d7e41d8864da582de98efab943948 --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/notes/changelog.md @@ -0,0 +1,365 @@ +# 更新日志 + +## v0.6.0 (15/8/2023) + +### 亮点 + +- 支持 YOLOv5 实例分割 +- 基于 MMPose 支持 YOLOX-Pose +- 添加 15 分钟的实例分割教程 +- YOLOv5 支持使用 mask 标注来优化边界框 +- 添加多尺度训练和测试文档 + +### 新特性 + +- 添加训练和测试技巧文档 (#659) +- 支持设置 `cache_size_limit` 参数,并支持 mmdet 3.0.0 (#707) +- 支持 YOLOv5u 和 YOLOv6 3.0 推理 (#624, #744) +- 支持仅模型推断 (#733) +- 添加 YOLOv8 deepstream 配置 (#633) +- 在 MMYOLO 应用程序中添加电离图示例 (#643) + +### Bug 修复 + +- 修复 browse_dataset 以可视化测试和验证集的问题 (#641) +- 修复安装文档错误 (#662) +- 修复 yolox-l ckpt 链接 (#677) +- 修正 YOLOv7 和 YOLOv8 图表中的拼写错误 (#621, #710) +- 调整 `boxam_vis_demo.py` 中包导入的顺序 (#655) + +### 完善 + +- 优化 `convert_kd_ckpt_to_student.py` 文件 (#647) +- 添加 FAQ 和 training_testing_tricks 的英文文档 (#691, #693) + +### 贡献者 + +总共 21 位开发者参与了本次版本 + +感谢 @Lum1104,@azure-wings,@FeiGeChuanShu,@Lingrui 
Gu,@Nioolek,@huayuan4396,@RangeKing,@danielhonies,@yechenzhi,@JosonChan1998,@kitecats,@Qingrenn,@triple-Mu,@kikefdezl,@zhangrui-wolf,@xin-li-67,@Ben-Louis,@zgzhengSEU,@VoyagerXvoyagerx,@tang576225574,@hhaAndroid + +## v0.5.0 (2/3/2023) + +### 亮点 + +1. 支持了 [RTMDet-R](https://github.com/open-mmlab/mmyolo/blob/dev/configs/rtmdet/README.md#rotated-object-detection) 旋转框目标检测任务和算法 +2. [YOLOv8](https://github.com/open-mmlab/mmyolo/blob/dev/configs/yolov8/README.md) 支持使用 mask 标注提升目标检测模型性能 +3. 支持 [MMRazor](https://github.com/open-mmlab/mmyolo/blob/dev/configs/razor/subnets/README.md) 搜索的 NAS 子网络作为 YOLO 系列算法的 backbone +4. 支持调用 [MMRazor](https://github.com/open-mmlab/mmyolo/blob/dev/configs/rtmdet/distillation/README.md) 对 RTMDet 进行知识蒸馏 +5. [MMYOLO](https://mmyolo.readthedocs.io/zh_CN/dev/) 文档结构优化,内容全面升级 +6. 基于 RTMDet 训练超参提升 YOLOX 精度和训练速度 +7. 支持模型参数量、FLOPs 计算和提供 T4 设备上 GPU 延时数据,并更新了 [Model Zoo](https://github.com/open-mmlab/mmyolo/blob/dev/docs/zh_cn/model_zoo.md) +8. 支持测试时增强 TTA +9. 支持 RTMDet、YOLOv8 和 YOLOv7 assigner 可视化 + +### 新特性 + +01. 支持 RTMDet 实例分割任务的推理 (#583) +02. 美化 MMYOLO 中配置文件并增加更多注释 (#501, #506, #516, #529, #531, #539) +03. 重构并优化中英文文档 (#568, #573, #579, #584, #587, #589, #596, #599, #600) +04. 支持 fast 版本的 YOLOX (#518) +05. EasyDeploy 中支持 DeepStream,并添加说明文档 (#485, #545, #571) +06. 新增混淆矩阵绘制脚本 (#572) +07. 新增单通道应用案例 (#460) +08. 支持 auto registration (#597) +09. Box CAM 支持 YOLOv7、YOLOv8 和 PPYOLOE (#601) +10. 新增自动化生成 MM 系列 repo 注册信息和 tools 脚本 (#559) +11. 新增 YOLOv7 模型结构图 (#504) +12. 新增如何指定特定 GPU 训练和推理文档 (#503) +13. 新增训练或者测试时检查 `metainfo` 是否全为小写 (#535) +14. 增加 Twitter、Discord、Medium 和 YouTube 等链接 (#555) + +### Bug 修复 + +1. 修复 isort 版本问题 (#492, #497) +2. 修复 assigner 可视化模块的 type 错误 (#509) +3. 修复 YOLOv8 文档链接错误 (#517) +4. 修复 EasyDeploy 中的 RTMDet Decoder 错误 (#519) +5. 修复一些文档链接错误 (#537) +6. 修复 RTMDet-Tiny 权重路径错误 (#580) + +### 完善 + +1. 完善更新 `contributing.md` +2. 优化 `DetDataPreprocessor` 支使其支持多任务 (#511) +3. 优化 `gt_instances_preprocess` 使其可以用于其他 YOLO 算法 (#532) +4. 新增 `yolov7-e6e` 权重转换脚本 (#570) +5. 参考 YOLOv8 推理代码修改 PPYOLOE (#614) + +### 贡献者 + +总共 22 位开发者参与了本次版本 + +@triple-Mu, @isLinXu, @Audrey528, @TianWen580, @yechenzhi, @RangeKing, @lyviva, @Nioolek, @PeterH0323, @tianleiSHI, @aptsunny, @satuoqaq, @vansin, @xin-li-67, @VoyagerXvoyagerx, +@landhill, @kitecats, @tang576225574, @HIT-cwh, @AI-Tianlong, @RangiLyu, @hhaAndroid + +## v0.4.0 (18/1/2023) + +### 亮点 + +1. 实现了 [YOLOv8](https://github.com/open-mmlab/mmyolo/blob/dev/configs/yolov8/README.md) 目标检测模型,并通过 [projects/easydeploy](https://github.com/open-mmlab/mmyolo/blob/dev/projects/easydeploy) 支持了模型部署 +2. 新增了中英文版本的 [YOLOv8 原理和实现全解析文档](https://github.com/open-mmlab/mmyolo/blob/dev/docs/zh_cn/algorithm_descriptions/yolov8_description.md) + +### 新特性 + +1. 新增 YOLOv8 和 PPYOLOE 模型结构图 (#459, #471) +2. 调整最低支持 Python 版本从 3.6 升级为 3.7 (#449) +3. TensorRT-8 中新增新的 YOLOX decoder 写法 (#450) +4. 新增学习率可视化曲线脚本 (#479) +5. 新增脚本命令速查表 (#481) + +### Bug 修复 + +1. 修复 `optimize_anchors.py` 脚本导入错误问题 (#452) +2. 修复 `get_started.md` 中安装步骤错误问题 (#474) +3. 修复使用 `RTMDet` P6 模型时候 neck 报错问题 (#480) + +### 视频 + +1. 发布了 [玩转 MMYOLO 之实用篇(四):顶会第一步 · 模块自定义](https://www.bilibili.com/video/BV1yd4y1j7VD/) + +### 贡献者 + +总共 9 位开发者参与了本次版本 + +谢谢 @VoyagerXvoyagerx, @tianleiSHI, @RangeKing, @PeterH0323, @Nioolek, @triple-Mu, @lyviva, @Zheng-LinXiao, @hhaAndroid + +## v0.3.0 (8/1/2023) + +### 亮点 + +1. 实现了 [RTMDet](https://github.com/open-mmlab/mmyolo/blob/dev/configs/rtmdet/README.md) 的快速版本。RTMDet-s 8xA100 训练只需要 14 个小时,训练速度相比原先版本提升 2.6 倍。 +2. 
支持 [PPYOLOE](https://github.com/open-mmlab/mmyolo/blob/dev/configs/ppyoloe/README.md) 训练。 +3. 支持 [YOLOv5](https://github.com/open-mmlab/mmyolo/blob/dev/configs/yolov5/crowdhuman/yolov5_s-v61_8xb16-300e_ignore_crowdhuman.py) 的 `iscrowd` 属性训练。 +4. 支持 [YOLOv5 正样本分配结果可视化](https://github.com/open-mmlab/mmyolo/blob/dev/projects/assigner_visualization/README.md) +5. 新增 [YOLOv6 原理和实现全解析文档](https://github.com/open-mmlab/mmyolo/blob/dev/docs/zh_cn/algorithm_descriptions/yolov6_description.md) + +### 新特性 + +01. 新增 `crowdhuman` 数据集 (#368) +02. EasyDeploy 中支持 TensorRT 推理 (#377) +03. 新增 `YOLOX` 结构图描述 (#402) +04. 新增视频推理脚本 (#392) +05. EasyDeploy 中支持 `YOLOv7` 部署 (#427) +06. 支持从 CLI 中的特定检查点恢复训练 (#393) +07. 将元信息字段设置为小写(#362、#412) +08. 新增模块组合文档 (#349, #352, #345) +09. 新增关于如何冻结 backbone 或 neck 权重的文档 (#418) +10. 在 `how_to.md` 中添加不使用预训练权重的文档 (#404) +11. 新增关于如何设置随机种子的文档 (#386) +12. 将 `rtmdet_description.md` 文档翻译成英文 (#353) + +### Bug 修复 + +01. 修复设置 `--class-id-txt` 时输出注释文件中的错误 (#430) +02. 修复 `YOLOv5` head 中的批量推理错误 (#413) +03. 修复某些 head 的类型提示(#415、#416、#443) +04. 修复 expected a non-empty list of Tensors 错误 (#376) +05. 修复 `YOLOv7` 训练中的设备不一致错误(#397) +06. 修复 `LetterResize` 中的 `scale_factor` 和 `pad_param` 值 (#387) +07. 修复 readthedocs 的 docstring 图形渲染错误 (#400) +08. 修复 `YOLOv6` 从训练到验证时的断言错误 (#378) +09. 修复 `np.int` 和旧版 builder.py 导致的 CI 错误 (#389) +10. 修复 MMDeploy 重写器 (#366) +11. 修复 MMYOLO 单元测试错误 (#351) +12. 修复 `pad_param` 错误 (#354) +13. 修复 head 推理两次的错误(#342) +14. 修复自定义数据集训练 (#428) + +### 完善 + +01. 更新 `useful_tools.md` (#384) +02. 更新英文版 `custom_dataset.md` (#381) +03. 重写函数删除上下文参数 (#395) +04. 弃用 `np.bool` 类型别名 (#396) +05. 为自定义数据集添加新的视频链接 (#365) +06. 仅为模型导出 onnx (#361) +07. 添加 MMYOLO 回归测试 yml (#359) +08. 更新 `article.md` 中的视频教程 (#350) +09. 添加部署 demo (#343) +10. 优化 debug 模式下大图的可视化效果(#346) +11. 改进 `browse_dataset` 的参数并支持 `RepeatDataset` (#340, #338) + +### 视频 + +1. 发布了 [基于 sahi 的大图推理](https://www.bilibili.com/video/BV1EK411R7Ws/) +2. 发布了 [自定义数据集从标注到部署保姆级教程](https://www.bilibili.com/video/BV1RG4y137i5) + +### 贡献者 + +总共 28 位开发者参与了本次版本 + +谢谢 @RangeKing, @PeterH0323, @Nioolek, @triple-Mu, @matrixgame2018, @xin-li-67, @tang576225574, @kitecats, @Seperendity, @diplomatist, @vaew, @wzr-skn, @VoyagerXvoyagerx, @MambaWong, @tianleiSHI, @caj-github, @zhubochao, @lvhan028, @dsghaonan, @lyviva, @yuewangg, @wang-tf, @satuoqaq, @grimoire, @RunningLeon, @hanrui1sensetime, @RangiLyu, @hhaAndroid + +## v0.2.0(1/12/2022) + +### 亮点 + +1. 支持 [YOLOv7](https://github.com/open-mmlab/mmyolo/tree/dev/configs/yolov7) P5 和 P6 模型 +2. 支持 [YOLOv6](https://github.com/open-mmlab/mmyolo/blob/dev/configs/yolov6/README.md) 中的 ML 大模型 +3. 支持 [Grad-Based CAM 和 Grad-Free CAM](https://github.com/open-mmlab/mmyolo/blob/dev/demo/boxam_vis_demo.py) +4. 基于 sahi 支持 [大图推理](https://github.com/open-mmlab/mmyolo/blob/dev/demo/large_image_demo.py) +5. projects 文件夹下新增 [easydeploy](https://github.com/open-mmlab/mmyolo/blob/dev/projects/easydeploy/README.md) 项目 +6. 新增 [自定义数据集教程](https://github.com/open-mmlab/mmyolo/blob/dev/docs/zh_cn/user_guides/custom_dataset.md) + +### 新特性 + +1. `browse_dataset.py` 脚本支持可视化原图、数据增强后和中间结果功能 (#304) +2. `image_demo.py` 新增预测结果保存为 labelme 格式功能 (#288, #314) +3. 新增 labelme 格式转 COCO 格式脚本 `labelme2coco` (#308, #313) +4. 新增 COCO 数据集切分脚本 `coco_split.py` (#311) +5. `how-to.md` 文档中新增两个 backbone 替换案例以及更新 `plugin.md` (#291) +6. 新增贡献者文档 `contributing.md` and 代码规范文档 `code_style.md` (#322) +7. 新增如何通过 mim 跨库调用脚本文档 (#321) +8. `YOLOv5` 支持 RV1126 设备部署 (#262) + +### Bug 修复 + +1. 修复 `MixUp` padding 错误 (#319) +2. 
修复 `LetterResize` 和 `YOLOv5KeepRatioResize` 中 `scale_factor` 参数顺序错误 (#305) +3. 修复 `YOLOX Nano` 模型训练错误问题 (#285) +4. 修复 `RTMDet` 部署没有导包的错误 (#287) +5. 修复 int8 部署配置错误 (#315) +6. 修复 `basebackbone` 中 `make_stage_plugins` 注释 (#296) +7. 部署模块支持切换为 deploy 模式功能 (#324) +8. 修正 `RTMDet` 模型结构图中的错误 (#317) + +### 完善 + +1. `test.py` 中新增 json 格式导出选项 (#316) +2. `extract_subcoco.py` 脚本中新增基于面积阈值过滤规则 (#286) +3. 部署相关中文文档翻译为英文 (#289) +4. 新增 `YOLOv6` 算法描述大纲文档 (#252) +5. 完善 `config.md` (#297, #303) +6. 完善 `mosiac9` 的 docstring (#307) +7. 完善 `browse_coco_json.py` 脚本输入参数 (#309) +8. 重构 `dataset_analysis.py` 中部分函数使其更加通用 (#294) + +### 视频 + +1. 发布了 [工程文件结构简析](https://www.bilibili.com/video/BV1LP4y117jS) +2. 发布了 [10分钟换遍主干网络文档](https://www.bilibili.com/video/BV1JG4y1d7GC) + +### 贡献者 + +总共 14 位开发者参与了本次版本 + +谢谢 @fcakyon, @matrixgame2018, @MambaWong, @imAzhou, @triple-Mu, @RangeKing, @PeterH0323, @xin-li-67, @kitecats, @hanrui1sensetime, @AllentDan, @Zheng-LinXiao, @hhaAndroid, @wanghonglie + +## v0.1.3(10/11/2022) + +### 新特性 + +1. 支持 CBAM 插件并提供插件文档 (#246) +2. 新增 YOLOv5 P6 模型结构图和相关说明 (#273) + +### Bug 修复 + +1. 基于 mmengine 0.3.1 修复保存最好权重时训练失败问题 +2. 基于 mmdet 3.0.0rc3 修复 `add_dump_metric` 报错 (#253) +3. 修复 backbone 不支持 `init_cfg` 问题 (#272) +4. 基于 mmdet 3.0.0rc3 改变 typing 导入方式 (#261) + +### 完善 + +1. `featmap_vis_demo` 支持文件夹和 url 输入 (#248) +2. 部署 docker 文件完善 (#242) + +### 贡献者 + +总共 10 位开发者参与了本次版本 + +谢谢 @kitecats, @triple-Mu, @RangeKing, @PeterH0323, @Zheng-LinXiao, @tkhe, @weikai520, @zytx121, @wanghonglie, @hhaAndroid + +## v0.1.2(3/11/2022) + +### 亮点 + +1. 支持 ONNXRuntime 和 TensorRT 的 [YOLOv5/YOLOv6/YOLOX/RTMDet 部署](https://github.com/open-mmlab/mmyolo/blob/main/configs/deploy) +2. 支持 [YOLOv6](https://github.com/open-mmlab/mmyolo/blob/main/configs/yolov6) s/t/n 模型训练 +3. YOLOv5 支持 [P6 大分辨率 1280 尺度训练](https://github.com/open-mmlab/mmyolo/blob/main/configs/yolov5) +4. YOLOv5 支持 [VOC 数据集训练](https://github.com/open-mmlab/mmyolo/blob/main/configs/yolov5/voc) +5. 支持 [PPYOLOE](https://github.com/open-mmlab/mmyolo/blob/main/configs/ppyoloe) 和 [YOLOv7](https://github.com/open-mmlab/mmyolo/blob/main/configs/yolov7) 模型推理和官方权重转化 +6. How-to 文档中新增 YOLOv5 替换 [backbone 教程](https://github.com/open-mmlab/mmyolo/blob/dev/docs/zh_cn/advanced_guides/how_to.md#%E8%B7%A8%E5%BA%93%E4%BD%BF%E7%94%A8%E4%B8%BB%E5%B9%B2%E7%BD%91%E7%BB%9C) + +### 新特性 + +1. 新增 `optimize_anchors` 脚本 (#175) +2. 新增 `extract_subcoco` 脚本 (#186) +3. 新增 `yolo2coco` 转换脚本 (#161) +4. 新增 `dataset_analysis` 脚本 (#172) +5. 移除 Albu 版本限制 (#187) + +### Bug 修复 + +1. 修复当设置 `cfg.resume` 时候不生效问题 (#221) +2. 修复特征图可视化脚本中不显示 bbox 问题 (#204) +3. 更新 RTMDet 的 metafile (#188) +4. 修复 test_pipeline 中的可视化错误 (#166) +5. 更新 badges (#140) + +### 完善 + +1. 优化 Readthedoc 显示页面 (#209) +2. 为 base model 添加模块结构图的 docstring (#196) +3. 支持 LoadAnnotations 中不包括任何实例逻辑 (#161) +4. 更新 `image_demo` 脚本以支持文件夹和 url 路径 (#128) +5. 更新 pre-commit hook (#129) + +### 文档 + +1. 将 `yolov5_description.md`、 `yolov5_tutorial.md` 和 `visualization.md` 翻译为英文 (#138, #198, #206) +2. 新增部署相关中文文档 (#220) +3. 更新 `config.md`、`faq.md` 和 `pull_request_template.md` (#190, #191, #200) +4. 更新 `article` 页面 (#133) + +### 视频 + +1. 发布了[特征图可视化视频](https://www.bilibili.com/video/BV188411s7o8) +2. 发布了 [YOLOv5 配置文件解读视频](https://www.bilibili.com/video/BV1214y157ck) +3. 发布了 [RTMDet-s 特征图可视化 demo 视频](https://www.bilibili.com/video/BV1je4y1478R) +4. 
发布了[源码解读和必备调试技巧视频](https://www.bilibili.com/video/BV1N14y1V7mB) + +### 贡献者 + +总共 14 位开发者参与了本次版本 + +谢谢 @imAzhou, @triple-Mu, @RangeKing, @PeterH0323, @xin-li-67, @Nioolek, @kitecats, @Bin-ze, @JiayuXu0, @cydiachen, @zhiqwang, @Zheng-LinXiao, @hhaAndroid, @wanghonglie + +## v0.1.1(29/9/2022) + +基于 MMDetection 的 RTMDet 高精度低延时目标检测算法,我们也同步发布了 RTMDet,并提供了 RTMDet 原理和实现全解析中文文档 + +### 亮点 + +1. 支持了 [RTMDet](https://github.com/open-mmlab/mmyolo/blob/main/configs/rtmdet) +2. 新增了 [RTMDet 原理和实现全解析中文文档](https://github.com/open-mmlab/mmyolo/blob/main/docs/zh_cn/algorithm_descriptions/rtmdet_description.md) +3. 支持对 backbone 自定义插件,并更新了 How-to 文档 (#75) + +### Bug 修复 + +1. 修复一些文档错误 (#66, #72, #76, #83, #86) +2. 修复权重链接错误 (#63) +3. 修复 `LetterResize` 使用 `imscale` api 时候输出不符合预期的 bug (#105) + +### 完善 + +1. 缩减 docker 镜像尺寸 (#67) +2. 简化 BaseMixImageTransform 中 Compose 逻辑 (#71) +3. test 脚本支持 dump 结果 (#84) + +#### 贡献者 + +总共 13 位开发者参与了本次版本 + +谢谢 @wanghonglie, @hhaAndroid, @yang-0201, @PeterH0323, @RangeKing, @satuoqaq, @Zheng-LinXiao, @xin-li-67, @suibe-qingtian, @MambaWong, @MichaelCai0912, @rimoire, @Nioolek + +## v0.1.0(21/9/2022) + +我们发布了 MMYOLO 开源库,其基于 MMEngine, MMCV 2.x 和 MMDetection 3.x 库. 目前实现了目标检测功能,后续会扩展为多任务。 + +### 亮点 + +1. 支持 YOLOv5/YOLOX 训练,支持 YOLOv6 推理。部署即将支持。 +2. 重构了 MMDetection 的 YOLOX,提供了更快的训练和推理速度。 +3. 提供了详细入门和进阶教程, 包括 YOLOv5 从入门到部署、YOLOv5 算法原理和实现全解析、 特征图可视化等教程。 diff --git a/third_party/mmyolo/docs/zh_cn/notes/code_style.md b/third_party/mmyolo/docs/zh_cn/notes/code_style.md new file mode 100644 index 0000000000000000000000000000000000000000..6e169b3713e1f0c31c8fc8a46ad3e225088c35a4 --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/notes/code_style.md @@ -0,0 +1,606 @@ +# 代码规范 + +## 代码规范标准 + +### PEP 8 —— Python 官方代码规范 + +[Python 官方的代码风格指南](https://www.python.org/dev/peps/pep-0008/),包含了以下几个方面的内容: + +- 代码布局,介绍了 Python 中空行、断行以及导入相关的代码风格规范。比如一个常见的问题:当我的代码较长,无法在一行写下时,何处可以断行? + +- 表达式,介绍了 Python 中表达式空格相关的一些风格规范。 + +- 尾随逗号相关的规范。当列表较长,无法一行写下而写成如下逐行列表时,推荐在末项后加逗号,从而便于追加选项、版本控制等。 + + ```python + # Correct: + FILES = ['setup.cfg', 'tox.ini'] + # Correct: + FILES = [ + 'setup.cfg', + 'tox.ini', + ] + # Wrong: + FILES = ['setup.cfg', 'tox.ini',] + # Wrong: + FILES = [ + 'setup.cfg', + 'tox.ini' + ] + ``` + +- 命名相关规范、注释相关规范、类型注解相关规范,我们将在后续章节中做详细介绍。 + + "A style guide is about consistency. Consistency with this style guide is important. Consistency within a project is more important. Consistency within one module or function is the most important." 
PEP 8 -- Style Guide for Python Code + +:::{note} +PEP 8 的代码规范并不是绝对的,项目内的一致性要优先于 PEP 8 的规范。OpenMMLab 各个项目都在 setup.cfg 设定了一些代码规范的设置,请遵照这些设置。一个例子是在 PEP 8 中有如下一个例子: + +```python +# Correct: +hypot2 = x*x + y*y +# Wrong: +hypot2 = x * x + y * y +``` + +这一规范是为了指示不同优先级,但 OpenMMLab 的设置中通常没有启用 yapf 的 `ARITHMETIC_PRECEDENCE_INDICATION` 选项,因而格式规范工具不会按照推荐样式格式化,以设置为准。 +::: + +### Google 开源项目风格指南 + +[Google 使用的编程风格指南](https://google.github.io/styleguide/pyguide.html),包括了 Python 相关的章节。相较于 PEP 8,该指南提供了更为详尽的代码指南。该指南包括了语言规范和风格规范两个部分。 + +其中,语言规范对 Python 中很多语言特性进行了优缺点的分析,并给出了使用指导意见,如异常、Lambda 表达式、列表推导式、metaclass 等。 + +风格规范的内容与 PEP 8 较为接近,大部分约定建立在 PEP 8 的基础上,也有一些更为详细的约定,如函数长度、TODO 注释、文件与 socket 对象的访问等。 + +推荐将该指南作为参考进行开发,但不必严格遵照,一来该指南存在一些 Python 2 兼容需求,例如指南中要求所有无基类的类应当显式地继承 Object, 而在仅使用 Python 3 的环境中,这一要求是不必要的,依本项目中的惯例即可。二来 OpenMMLab 的项目作为框架级的开源软件,不必对一些高级技巧过于避讳,尤其是 MMCV。但尝试使用这些技巧前应当认真考虑是否真的有必要,并寻求其他开发人员的广泛评估。 + +另外需要注意的一处规范是关于包的导入,在该指南中,要求导入本地包时必须使用路径全称,且导入的每一个模块都应当单独成行,通常这是不必要的,而且也不符合目前项目的开发惯例,此处进行如下约定: + +```python +# Correct +from mmcv.cnn.bricks import (Conv2d, build_norm_layer, DropPath, MaxPool2d, + Linear) +from ..utils import ext_loader + +# Wrong +from mmcv.cnn.bricks import Conv2d, build_norm_layer, DropPath, MaxPool2d, \ + Linear # 使用括号进行连接,而不是反斜杠 +from ...utils import is_str # 最多向上回溯一层,过多的回溯容易导致结构混乱 +``` + +OpenMMLab 项目使用 pre-commit 工具自动格式化代码,详情见[贡献代码](../recommended_topics/contributing.md#代码风格)。 + +## 命名规范 + +### 命名规范的重要性 + +优秀的命名是良好代码可读的基础。基础的命名规范对各类变量的命名做了要求,使读者可以方便地根据代码名了解变量是一个类 / 局部变量 / 全局变量等。而优秀的命名则需要代码作者对于变量的功能有清晰的认识,以及良好的表达能力,从而使读者根据名称就能了解其含义,甚至帮助了解该段代码的功能。 + +### 基础命名规范 + +| 类型 | 公有 | 私有 | +| --------------- | ---------------- | ------------------ | +| 模块 | lower_with_under | \_lower_with_under | +| 包 | lower_with_under | | +| 类 | CapWords | \_CapWords | +| 异常 | CapWordsError | | +| 函数(方法) | lower_with_under | \_lower_with_under | +| 函数 / 方法参数 | lower_with_under | | +| 全局 / 类内常量 | CAPS_WITH_UNDER | \_CAPS_WITH_UNDER | +| 全局 / 类内变量 | lower_with_under | \_lower_with_under | +| 变量 | lower_with_under | \_lower_with_under | +| 局部变量 | lower_with_under | | + +注意: + +- 尽量避免变量名与保留字冲突,特殊情况下如不可避免,可使用一个后置下划线,如 class\_ +- 尽量不要使用过于简单的命名,除了约定俗成的循环变量 i,文件变量 f,错误变量 e 等。 +- 不会被用到的变量可以命名为 \_,逻辑检查器会将其忽略。 + +### 命名技巧 + +良好的变量命名需要保证三点: + +1. 含义准确,没有歧义 +2. 长短适中 +3. 前后统一 + +```python +# Wrong +class Masks(metaclass=ABCMeta): # 命名无法表现基类;Instance or Semantic? + pass + +# Correct +class BaseInstanceMasks(metaclass=ABCMeta): + pass + +# Wrong,不同地方含义相同的变量尽量用统一的命名 +def __init__(self, inplanes, planes): + pass + +def __init__(self, in_channels, out_channels): + pass +``` + +常见的函数命名方法: + +- 动宾命名法:crop_img, init_weights +- 动宾倒置命名法:imread, bbox_flip + +注意函数命名与参数的顺序,保证主语在前,符合语言习惯: + +- check_keys_exist(key, container) +- check_keys_contain(container, key) + +注意避免非常规或统一约定的缩写,如 nb -> num_blocks,in_nc -> in_channels + +## docstring 规范 + +### 为什么要写 docstring + +docstring 是对一个类、一个函数功能与 API 接口的详细描述,有两个功能,一是帮助其他开发者了解代码功能,方便 debug 和复用代码;二是在 Readthedocs 文档中自动生成相关的 API reference 文档,帮助不了解源代码的社区用户使用相关功能。 + +### 如何写 docstring + +与注释不同,一份规范的 docstring 有着严格的格式要求,以便于 Python 解释器以及 sphinx 进行文档解析,详细的 docstring 约定参见 [PEP 257](https://www.python.org/dev/peps/pep-0257/)。此处以例子的形式介绍各种文档的标准格式,参考格式为 [Google 风格](https://zh-google-styleguide.readthedocs.io/en/latest/google-python-styleguide/python_style_rules/#comments)。 + +1. 模块文档 + + 代码风格规范推荐为每一个模块(即 Python 文件)编写一个 docstring,但目前 OpenMMLab 项目大部分没有此类 docstring,因此不做硬性要求。 + + ```python + """A one line summary of the module or program, terminated by a period. + + Leave one blank line. 
The rest of this docstring should contain an + overall description of the module or program. Optionally, it may also + contain a brief description of exported classes and functions and/or usage + examples. + + Typical usage example: + + foo = ClassFoo() + bar = foo.FunctionBar() + """ + ``` + +2. 类文档 + + 类文档是我们最常需要编写的,此处,按照 OpenMMLab 的惯例,我们使用了与 Google 风格不同的写法。如下例所示,文档中没有使用 Attributes 描述类属性,而是使用 Args 描述 __init__ 函数的参数。 + + 在 Args 中,遵照 `parameter (type): Description.` 的格式,描述每一个参数类型和功能。其中,多种类型可使用 `(float or str)` 的写法,可以为 None 的参数可以写为 `(int, optional)`。 + + ```python + class BaseRunner(metaclass=ABCMeta): + """The base class of Runner, a training helper for PyTorch. + + All subclasses should implement the following APIs: + + - ``run()`` + - ``train()`` + - ``val()`` + - ``save_checkpoint()`` + + Args: + model (:obj:`torch.nn.Module`): The model to be run. + batch_processor (callable, optional): A callable method that process + a data batch. The interface of this method should be + ``batch_processor(model, data, train_mode) -> dict``. + Defaults to None. + optimizer (dict or :obj:`torch.optim.Optimizer`, optional): It can be + either an optimizer (in most cases) or a dict of optimizers + (in models that requires more than one optimizer, e.g., GAN). + Defaults to None. + work_dir (str, optional): The working directory to save checkpoints + and logs. Defaults to None. + logger (:obj:`logging.Logger`): Logger used during training. + Defaults to None. (The default value is just for backward + compatibility) + meta (dict, optional): A dict records some import information such as + environment info and seed, which will be logged in logger hook. + Defaults to None. + max_epochs (int, optional): Total training epochs. Defaults to None. + max_iters (int, optional): Total training iterations. Defaults to None. + """ + + def __init__(self, + model, + batch_processor=None, + optimizer=None, + work_dir=None, + logger=None, + meta=None, + max_iters=None, + max_epochs=None): + ... + ``` + + 另外,在一些算法实现的主体类中,建议加入原论文的链接;如果参考了其他开源代码的实现,则应加入 modified from,而如果是直接复制了其他代码库的实现,则应加入 copied from ,并注意源码的 License。如有必要,也可以通过 .. math:: 来加入数学公式 + + ```python + # 参考实现 + # This func is modified from `detectron2 + # `_. + + # 复制代码 + # This code was copied from the `ubelt + # library`_. + + # 引用论文 & 添加公式 + class LabelSmoothLoss(nn.Module): + r"""Initializer for the label smoothed cross entropy loss. + + Refers to `Rethinking the Inception Architecture for Computer Vision + `_. + + This decreases gap between output scores and encourages generalization. + Labels provided to forward can be one-hot like vectors (NxC) or class + indices (Nx1). + And this accepts linear combination of one-hot like labels from mixup or + cutmix except multi-label task. + + Args: + label_smooth_val (float): The degree of label smoothing. + num_classes (int, optional): Number of classes. Defaults to None. + mode (str): Refers to notes, Options are "original", "classy_vision", + "multi_label". Defaults to "classy_vision". + reduction (str): The method used to reduce the loss. + Options are "none", "mean" and "sum". Defaults to 'mean'. + loss_weight (float): Weight of the loss. Defaults to 1.0. + + Note: + if the ``mode`` is "original", this will use the same label smooth + method as the original paper as: + + .. math:: + (1-\epsilon)\delta_{k, y} + \frac{\epsilon}{K} + + where :math:`\epsilon` is the ``label_smooth_val``, :math:`K` is + the ``num_classes`` and :math:`\delta_{k,y}` is Dirac delta, + which equals 1 for k=y and 0 otherwise. 
+ + if the ``mode`` is "classy_vision", this will use the same label + smooth method as the `facebookresearch/ClassyVision + `_ repo as: + + .. math:: + \frac{\delta_{k, y} + \epsilon/K}{1+\epsilon} + + if the ``mode`` is "multi_label", this will accept labels from + multi-label task and smoothing them as: + + .. math:: + (1-2\epsilon)\delta_{k, y} + \epsilon + ``` + +```{note} +注意 \`\`here\`\`、\`here\`、"here" 三种引号功能是不同。 + +在 reStructured 语法中,\`\`here\`\` 表示一段代码;\`here\` 表示斜体;"here" 无特殊含义,一般可用来表示字符串。其中 \`here\` 的用法与 Markdown 中不同,需要多加留意。 +另外还有 :obj:\`type\` 这种更规范的表示类的写法,但鉴于长度,不做特别要求,一般仅用于表示非常用类型。 +``` + +3. 方法(函数)文档 + + 函数文档与类文档的结构基本一致,但需要加入返回值文档。对于较为复杂的函数和类,可以使用 Examples 字段加入示例;如果需要对参数加入一些较长的备注,可以加入 Note 字段进行说明。 + + 对于使用较为复杂的类或函数,比起看大段大段的说明文字和参数文档,添加合适的示例更能帮助用户迅速了解其用法。需要注意的是,这些示例最好是能够直接在 Python 交互式环境中运行的,并给出一些相对应的结果。如果存在多个示例,可以使用注释简单说明每段示例,也能起到分隔作用。 + + ```python + def import_modules_from_strings(imports, allow_failed_imports=False): + """Import modules from the given list of strings. + + Args: + imports (list | str | None): The given module names to be imported. + allow_failed_imports (bool): If True, the failed imports will return + None. Otherwise, an ImportError is raise. Defaults to False. + + Returns: + List[module] | module | None: The imported modules. + All these three lines in docstring will be compiled into the same + line in readthedocs. + + Examples: + >>> osp, sys = import_modules_from_strings( + ... ['os.path', 'sys']) + >>> import os.path as osp_ + >>> import sys as sys_ + >>> assert osp == osp_ + >>> assert sys == sys_ + """ + ... + ``` + + 如果函数接口在某个版本发生了变化,需要在 docstring 中加入相关的说明,必要时添加 Note 或者 Warning 进行说明,例如: + + ```python + class CheckpointHook(Hook): + """Save checkpoints periodically. + + Args: + out_dir (str, optional): The root directory to save checkpoints. If + not specified, ``runner.work_dir`` will be used by default. If + specified, the ``out_dir`` will be the concatenation of + ``out_dir`` and the last level directory of ``runner.work_dir``. + Defaults to None. `Changed in version 1.3.15.` + + Warning: + Before v1.3.15, the ``out_dir`` argument indicates the path where the + checkpoint is stored. However, in v1.3.15 and later, ``out_dir`` + indicates the root directory and the final path to save checkpoint is + the concatenation of out_dir and the last level directory of + ``runner.work_dir``. Suppose the value of ``out_dir`` is + "/path/of/A" and the value of ``runner.work_dir`` is "/path/of/B", + then the final path will be "/path/of/A/B". + ``` + + 如果参数或返回值里带有需要展开描述字段的 dict,则应该采用如下格式: + + ```python + def func(x): + r""" + Args: + x (None): A dict with 2 keys, ``padded_targets``, and ``targets``. + + - ``targets`` (list[Tensor]): A list of tensors. + Each tensor has the shape of :math:`(T_i)`. Each + element is the index of a character. + - ``padded_targets`` (Tensor): A tensor of shape :math:`(N)`. + Each item is the length of a word. + + Returns: + dict: A dict with 2 keys, ``padded_targets``, and ``targets``. + + - ``targets`` (list[Tensor]): A list of tensors. + Each tensor has the shape of :math:`(T_i)`. Each + element is the index of a character. + - ``padded_targets`` (Tensor): A tensor of shape :math:`(N)`. + Each item is the length of a word. 
+ """ + return x + ``` + +```{important} +为了生成 readthedocs 文档,文档的编写需要按照 ReStructrued 文档格式,否则会产生文档渲染错误,在提交 PR 前,最好生成并预览一下文档效果。 +语法规范参考: + +- [reStructuredText Primer - Sphinx documentation](https://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html#) +- [Example Google Style Python Docstrings ‒ napoleon 0.7 documentation](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html#example-google) +``` + +## 注释规范 + +### 为什么要写注释 + +对于一个开源项目,团队合作以及社区之间的合作是必不可少的,因而尤其要重视合理的注释。不写注释的代码,很有可能过几个月自己也难以理解,造成额外的阅读和修改成本。 + +### 如何写注释 + +最需要写注释的是代码中那些技巧性的部分。如果你在下次代码审查的时候必须解释一下,那么你应该现在就给它写注释。对于复杂的操作,应该在其操作开始前写上若干行注释。对于不是一目了然的代码,应在其行尾添加注释。 +—— Google 开源项目风格指南 + +```python +# We use a weighted dictionary search to find out where i is in +# the array. We extrapolate position based on the largest num +# in the array and the array size and then do binary search to +# get the exact number. +if i & (i-1) == 0: # True if i is 0 or a power of 2. +``` + +为了提高可读性, 注释应该至少离开代码2个空格. +另一方面, 绝不要描述代码. 假设阅读代码的人比你更懂Python, 他只是不知道你的代码要做什么. +—— Google 开源项目风格指南 + +```python +# Wrong: +# Now go through the b array and make sure whenever i occurs +# the next element is i+1 + +# Wrong: +if i & (i-1) == 0: # True if i bitwise and i-1 is 0. +``` + +在注释中,可以使用 Markdown 语法,因为开发人员通常熟悉 Markdown 语法,这样可以便于交流理解,如可使用单反引号表示代码和变量(注意不要和 docstring 中的 ReStructured 语法混淆) + +```python +# `_reversed_padding_repeated_twice` is the padding to be passed to +# `F.pad` if needed (e.g., for non-zero padding types that are +# implemented as two ops: padding + conv). `F.pad` accepts paddings in +# reverse order than the dimension. +self._reversed_padding_repeated_twice = _reverse_repeat_tuple(self.padding, 2) +``` + +### 注释示例 + +1. 出自 `mmcv/utils/registry.py`,对于较为复杂的逻辑结构,通过注释,明确了优先级关系。 + + ```python + # self.build_func will be set with the following priority: + # 1. build_func + # 2. parent.build_func + # 3. build_from_cfg + if build_func is None: + if parent is not None: + self.build_func = parent.build_func + else: + self.build_func = build_from_cfg + else: + self.build_func = build_func + ``` + +2. 出自 `mmcv/runner/checkpoint.py`,对于 bug 修复中的一些特殊处理,可以附带相关的 issue 链接,帮助其他人了解 bug 背景。 + + ```python + def _save_ckpt(checkpoint, file): + # The 1.6 release of PyTorch switched torch.save to use a new + # zipfile-based file format. It will cause RuntimeError when a + # checkpoint was saved in high version (PyTorch version>=1.6.0) but + # loaded in low version (PyTorch version<1.6.0). More details at + # https://github.com/open-mmlab/mmpose/issues/904 + if digit_version(TORCH_VERSION) >= digit_version('1.6.0'): + torch.save(checkpoint, file, _use_new_zipfile_serialization=False) + else: + torch.save(checkpoint, file) + ``` + +## 类型注解 + +### 为什么要写类型注解 + +类型注解是对函数中变量的类型做限定或提示,为代码的安全性提供保障、增强代码的可读性、避免出现类型相关的错误。 +Python 没有对类型做强制限制,类型注解只起到一个提示作用,通常你的 IDE 会解析这些类型注解,然后在你调用相关代码时对类型做提示。另外也有类型注解检查工具,这些工具会根据类型注解,对代码中可能出现的问题进行检查,减少 bug 的出现。 +需要注意的是,通常我们不需要注释模块中的所有函数: + +1. 公共的 API 需要注释 +2. 在代码的安全性,清晰性和灵活性上进行权衡是否注释 +3. 对于容易出现类型相关的错误的代码进行注释 +4. 难以理解的代码请进行注释 +5. 若代码中的类型已经稳定,可以进行注释. 对于一份成熟的代码,多数情况下,即使注释了所有的函数,也不会丧失太多的灵活性. + +### 如何写类型注解 + +1. 
函数 / 方法类型注解,通常不对 self 和 cls 注释。 + + ```python + from typing import Optional, List, Tuple + + # 全部位于一行 + def my_method(self, first_var: int) -> int: + pass + + # 另起一行 + def my_method( + self, first_var: int, + second_var: float) -> Tuple[MyLongType1, MyLongType1, MyLongType1]: + pass + + # 单独成行(具体的应用场合与行宽有关,建议结合 yapf 自动化格式使用) + def my_method( + self, first_var: int, second_var: float + ) -> Tuple[MyLongType1, MyLongType1, MyLongType1]: + pass + + # 引用尚未被定义的类型 + class MyClass: + def __init__(self, + stack: List["MyClass"]) -> None: + pass + ``` + + 注:类型注解中的类型可以是 Python 内置类型,也可以是自定义类,还可以使用 Python 提供的 wrapper 类对类型注解进行装饰,一些常见的注解如下: + + ```python + # 数值类型 + from numbers import Number + + # 可选类型,指参数可以为 None + from typing import Optional + def foo(var: Optional[int] = None): + pass + + # 联合类型,指同时接受多种类型 + from typing import Union + def foo(var: Union[float, str]): + pass + + from typing import Sequence # 序列类型 + from typing import Iterable # 可迭代类型 + from typing import Any # 任意类型 + from typing import Callable # 可调用类型 + + from typing import List, Dict # 列表和字典的泛型类型 + from typing import Tuple # 元组的特殊格式 + # 虽然在 Python 3.9 中,list, tuple 和 dict 本身已支持泛型,但为了支持之前的版本 + # 我们在进行类型注解时还是需要使用 List, Tuple, Dict 类型 + # 另外,在对参数类型进行注解时,尽量使用 Sequence & Iterable & Mapping + # List, Tuple, Dict 主要用于返回值类型注解 + # 参见 https://docs.python.org/3/library/typing.html#typing.List + ``` + +2. 变量类型注解,一般用于难以直接推断其类型时 + + ```python + # Recommend: 带类型注解的赋值 + a: Foo = SomeUndecoratedFunction() + a: List[int]: [1, 2, 3] # List 只支持单一类型泛型,可使用 Union + b: Tuple[int, int] = (1, 2) # 长度固定为 2 + c: Tuple[int, ...] = (1, 2, 3) # 变长 + d: Dict[str, int] = {'a': 1, 'b': 2} + + # Not Recommend:行尾类型注释 + # 虽然这种方式被写在了 Google 开源指南中,但这是一种为了支持 Python 2.7 版本 + # 而补充的注释方式,鉴于我们只支持 Python 3, 为了风格统一,不推荐使用这种方式。 + a = SomeUndecoratedFunction() # type: Foo + a = [1, 2, 3] # type: List[int] + b = (1, 2, 3) # type: Tuple[int, ...] + c = (1, "2", 3.5) # type: Tuple[int, Text, float] + ``` + +3. 泛型 + + 上文中我们知道,typing 中提供了 list 和 dict 的泛型类型,那么我们自己是否可以定义类似的泛型呢? 
+ + ```python + from typing import TypeVar, Generic + + KT = TypeVar('KT') + VT = TypeVar('VT') + + class Mapping(Generic[KT, VT]): + def __init__(self, data: Dict[KT, VT]): + self._data = data + + def __getitem__(self, key: KT) -> VT: + return self._data[key] + ``` + + 使用上述方法,我们定义了一个拥有泛型能力的映射类,实际用法如下: + + ```python + mapping = Mapping[str, float]({'a': 0.5}) + value: float = example['a'] + ``` + + 另外,我们也可以利用 TypeVar 在函数签名中指定联动的多个类型: + + ```python + from typing import TypeVar, List + + T = TypeVar('T') # Can be anything + A = TypeVar('A', str, bytes) # Must be str or bytes + + + def repeat(x: T, n: int) -> List[T]: + """Return a list containing n references to x.""" + return [x]*n + + + def longest(x: A, y: A) -> A: + """Return the longest of two strings.""" + return x if len(x) >= len(y) else y + ``` + +更多关于类型注解的写法请参考 [typing](https://docs.python.org/3/library/typing.html)。 + +### 类型注解检查工具 + +[mypy](https://mypy.readthedocs.io/en/stable/) 是一个 Python 静态类型检查工具。根据你的类型注解,mypy 会检查传参、赋值等操作是否符合类型注解,从而避免可能出现的 bug。 + +例如如下的一个 Python 脚本文件 test.py: + +```python +def foo(var: int) -> float: + return float(var) + +a: str = foo('2.0') +b: int = foo('3.0') # type: ignore +``` + +运行 mypy test.py 可以得到如下检查结果,分别指出了第 4 行在函数调用和返回值赋值两处类型错误。而第 5 行同样存在两个类型错误,由于使用了 type: ignore 而被忽略了,只有部分特殊情况可能需要此类忽略。 + +``` +test.py:4: error: Incompatible types in assignment (expression has type "float", variable has type "int") +test.py:4: error: Argument 1 to "foo" has incompatible type "str"; expected "int" +Found 2 errors in 1 file (checked 1 source file) +``` diff --git a/third_party/mmyolo/docs/zh_cn/notes/compatibility.md b/third_party/mmyolo/docs/zh_cn/notes/compatibility.md new file mode 100644 index 0000000000000000000000000000000000000000..a92521efc6583bb6ee548b6668efad06c5890453 --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/notes/compatibility.md @@ -0,0 +1,47 @@ +# MMYOLO 兼容性说明 + +## MMYOLO v0.3.0 + +### METAINFO 修改 + +为了和 OpenMMLab 其他仓库统一,将 Dataset 里 `METAINFO` 的所有键从大写改为小写。 + +| 在 v0.3.0 之前 | v0.3.0 及之后 | +| :------------: | :-----------: | +| CLASSES | classes | +| PALETTE | palette | +| DATASET_TYPE | dataset_type | + +### 关于图片 shape 顺序的说明 + +在 OpenMMLab 2.0 中, 为了与 OpenCV 的输入参数相一致,图片处理 pipeline 中关于图像 shape 的输入参数总是以 `(width, height)` 的顺序排列。 +相反,为了计算方便,经过 pipeline 和 model 的字段的顺序是 `(height, width)`。具体来说在每个数据 pipeline 处理的结果中,字段和它们的值含义如下: + +- img_shape: (height, width) +- ori_shape: (height, width) +- pad_shape: (height, width) +- batch_input_shape: (height, width) + +以 `Mosaic` 为例,其初始化参数如下所示: + +```python +@TRANSFORMS.register_module() +class Mosaic(BaseTransform): + def __init__(self, + img_scale: Tuple[int, int] = (640, 640), + center_ratio_range: Tuple[float, float] = (0.5, 1.5), + bbox_clip_border: bool = True, + pad_val: float = 114.0, + prob: float = 1.0) -> None: + ... + + # img_scale 顺序应该是 (width, height) + self.img_scale = img_scale + + def transform(self, results: dict) -> dict: + ... 
+ + results['img'] = mosaic_img + # (height, width) + results['img_shape'] = mosaic_img.shape[:2] +``` diff --git a/third_party/mmyolo/docs/zh_cn/notes/conventions.md b/third_party/mmyolo/docs/zh_cn/notes/conventions.md new file mode 100644 index 0000000000000000000000000000000000000000..7c2370ffbb7a8d8752b7917f476ab1a96e52bbd6 --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/notes/conventions.md @@ -0,0 +1,37 @@ +# 默认约定 + +如果你想把 MMYOLO 修改为自己的项目,请遵循下面的约定。 + +## 关于图片 shape 顺序的说明 + +在OpenMMLab 2.0中, 为了与 OpenCV 的输入参数相一致,图片处理 pipeline 中关于图像 shape 的输入参数总是以 `(width, height)` 的顺序排列。 +相反,为了计算方便,经过 pipeline 和 model 的字段的顺序是 `(height, width)`。具体来说在每个数据 pipeline 处理的结果中,字段和它们的值含义如下: + +- img_shape: (height, width) +- ori_shape: (height, width) +- pad_shape: (height, width) +- batch_input_shape: (height, width) + +以 `Mosaic` 为例,其初始化参数如下所示: + +```python +@TRANSFORMS.register_module() +class Mosaic(BaseTransform): + def __init__(self, + img_scale: Tuple[int, int] = (640, 640), + center_ratio_range: Tuple[float, float] = (0.5, 1.5), + bbox_clip_border: bool = True, + pad_val: float = 114.0, + prob: float = 1.0) -> None: + ... + + # img_scale 顺序应该是 (width, height) + self.img_scale = img_scale + + def transform(self, results: dict) -> dict: + ... + + results['img'] = mosaic_img + # (height, width) + results['img_shape'] = mosaic_img.shape[:2] +``` diff --git a/third_party/mmyolo/docs/zh_cn/recommended_topics/algorithm_descriptions/index.rst b/third_party/mmyolo/docs/zh_cn/recommended_topics/algorithm_descriptions/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..54bc8b8e7762ee80680b7c95ad4a411badc7b135 --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/recommended_topics/algorithm_descriptions/index.rst @@ -0,0 +1,10 @@ +算法原理和实现全解析 +******************** + +.. toctree:: + :maxdepth: 1 + + yolov5_description.md + yolov6_description.md + rtmdet_description.md + yolov8_description.md diff --git a/third_party/mmyolo/docs/zh_cn/recommended_topics/algorithm_descriptions/rtmdet_description.md b/third_party/mmyolo/docs/zh_cn/recommended_topics/algorithm_descriptions/rtmdet_description.md new file mode 100644 index 0000000000000000000000000000000000000000..a13c7f5744af006ee72a181a7afbe3bdfb91b0e4 --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/recommended_topics/algorithm_descriptions/rtmdet_description.md @@ -0,0 +1,645 @@ +# RTMDet 原理和实现全解析 + +## 0 简介 + +高性能,低延时的单阶段目标检测器 + +
+(图:RTMDet 模型结构图)
+ +以上结构图由 RangeKing@github 绘制。 + +最近一段时间,开源界涌现出了大量的高精度目标检测项目,其中最突出的就是 YOLO 系列,OpenMMLab 也在与社区的合作下推出了 MMYOLO。 +在调研了当前 YOLO 系列的诸多改进模型后,MMDetection 核心开发者针对这些设计以及训练方式进行了经验性的总结,并进行了优化,推出了高精度、低延时的单阶段目标检测器 RTMDet, **R**eal-**t**ime **M**odels for Object **Det**ection +(**R**elease **t**o **M**anufacture) + +RTMDet 由 tiny/s/m/l/x 一系列不同大小的模型组成,为不同的应用场景提供了不同的选择。 +其中,RTMDet-x 在 52.6 mAP 的精度下达到了 300+ FPS 的推理速度。 + +```{note} +注:推理速度和精度测试(不包含 NMS)是在 1 块 NVIDIA 3090 GPU 上的 `TensorRT 8.4.3, cuDNN 8.2.0, FP16, batch size=1` 条件里测试的。 +``` + +而最轻量的模型 RTMDet-tiny,在仅有 4M 参数量的情况下也能够达到 40.9 mAP,且推理速度 \< 1 ms。 + +
+(图:RTMDet 精度对比图)
+ +上图中的精度是和 300 epoch 训练下的公平对比,为不使用蒸馏的结果。 + +| | mAP | Params | Flops | Inference speed | +| -------------------------- | --------------- | -------------- | ------------ | --------------- | +| Baseline(YOLOX) | 40.2 | 9M | 13.4G | 1.2ms | +| + AdamW + Flat Cosine | 40.6 (+0.4) | 9M | 13.4G | 1.2ms | +| + CSPNeXt backbone & PAFPN | 41.8 (+1.2) | 10.07M (+1.07) | 14.8G (+1.4) | 1.22ms (+0.02) | +| + SepBNHead | 41.8 (+0) | 8.89M (-1.18) | 14.8G | 1.22ms | +| + Label Assign & Loss | 42.9 (+1.1) | 8.89M | 14.8G | 1.22ms | +| + Cached Mosaic & MixUp | 44.2 (+1.3) | 8.89M | 14.8G | 1.22ms | +| + RSB-pretrained backbone | **44.5 (+0.3)** | 8.89M | 14.8G | 1.22ms | + +- 官方开源地址: https://github.com/open-mmlab/mmdetection/blob/3.x/configs/rtmdet/README.md +- MMYOLO 开源地址: https://github.com/open-mmlab/mmyolo/blob/main/configs/rtmdet/README.md + +## 1 v1.0 算法原理和 MMYOLO 实现解析 + +### 1.1 数据增强模块 + +RTMDet 采用了多种数据增强的方式来增加模型的性能,主要包括单图数据增强: + +- **RandomResize 随机尺度变换** +- **RandomCrop 随机裁剪** +- **HSVRandomAug 颜色空间增强** +- **RandomFlip 随机水平翻转** + +以及混合类数据增强: + +- **Mosaic 马赛克** +- **MixUp 图像混合** + +数据增强流程如下: + +
+(图:RTMDet 数据增强流程)
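+
+为便于与上图对照,下面给出一个示意性的训练 pipeline 配置草图。注意这只是草图:各 transform 的名称、顺序与具体参数均为本文假设(其中 cache 相关取值来自下文介绍的默认设置),实际请以 `configs/pretrain` 或 MMYOLO `configs/rtmdet` 目录下的配置文件为准:
+
+```python
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    # 混合类增强:Mosaic 走缓存队列,默认缓存 40 张图(见下文 Cache 介绍)
+    dict(type='Mosaic', img_scale=(640, 640), use_cached=True,
+         max_cached_images=40, pad_val=114.0),
+    # 单图增强:随机尺度变换 / 随机裁剪 / HSV 颜色空间增强 / 随机水平翻转
+    dict(type='mmdet.RandomResize', scale=(1280, 1280),
+         ratio_range=(0.1, 2.0), keep_ratio=True),  # 大模型使用 large scale jitter
+    dict(type='mmdet.RandomCrop', crop_size=(640, 640)),
+    dict(type='mmdet.YOLOXHSVRandomAug'),
+    dict(type='mmdet.RandomFlip', prob=0.5),
+    # 混合类增强:MixUp 同样走缓存队列,默认缓存 20 张图
+    dict(type='YOLOXMixUp', use_cached=True, max_cached_images=20, pad_val=114.0),
+    dict(type='PackDetInputs'),
+]
+```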
+ +其中 RandomResize 超参在大模型 M,L,X 和小模型 S, Tiny 上是不一样的,大模型由于参数较多,可以使用 large scale jitter 策略即参数为 (0.1,2.0),而小模型采用 stand scale jitter 策略即 (0.5, 2.0) 策略。 +MMDetection 开源库中已经对单图数据增强进行了封装,用户通过简单的修改配置即可使用库中提供的任何数据增强功能,且都是属于比较常规的数据增强,不需要特殊介绍。下面将具体介绍混合类数据增强的具体实现。 + +与 YOLOv5 不同的是,YOLOv5 认为在 S 和 Nano 模型上使用 MixUp 是过剩的,小模型不需要这么强的数据增强。而 RTMDet 在 S 和 Tiny 上也使用了 MixUp,这是因为 RTMDet 在最后 20 epoch 会切换为正常的 aug, 并通过训练证明这个操作是有效的。 并且 RTMDet 为混合类数据增强引入了 Cache 方案,有效地减少了图像处理的时间, 和引入了可调超参 `max_cached_images` ,当使用较小的 cache 时,其效果类似 `repeated augmentation`。具体介绍如下: + +| | Use cache | ms / 100 imgs | +| ------ | --------- | ------------- | +| Mosaic | | 87.1 | +| Mosaic | √ | **24.0** | +| MixUp | | 19.3 | +| MixUp | √ | **12.4** | + +| | RTMDet-s | RTMDet-l | +| ----------------------------- | -------- | -------- | +| Mosaic + MixUp + 20e finetune | 43.9 | **51.3** | + +#### 1.1.1 为图像混合数据增强引入 Cache + +Mosaic&MixUp 涉及到多张图片的混合,它们的耗时会是普通数据增强的 K 倍(K 为混入图片的数量)。 如在 YOLOv5 中,每次做 Mosaic 时, 4 张图片的信息都需要从硬盘中重新加载。 而 RTMDet 只需要重新载入当前的一张图片,其余参与混合增强的图片则从缓存队列中获取,通过牺牲一定内存空间的方式大幅提升了效率。 另外通过调整 cache 的大小以及 pop 的方式,也可以调整增强的强度。 + +
+(图:混合类数据增强的 cache 队列示意)
+ +如图所示,cache 队列中预先储存了 N 张已加载的图像与标签数据,每一个训练 step 中只需加载一张新的图片及其标签数据并更新到 cache 队列中(cache 队列中的图像可重复,如图中出现两次 img3),同时如果 cache 队列长度超过预设长度,则随机 pop 一张图(为了 Tiny 模型训练更稳定,在 Tiny 模型中不采用随机 pop 的方式, 而是移除最先加入的图片),当需要进行混合数据增强时,只需要从 cache 中随机选择需要的图像进行拼接等处理,而不需要全部从硬盘中加载,节省了图像加载的时间。 + +```{note} +cache 队列的最大长度 N 为可调整参数,根据经验性的原则,当为每一张需要混合的图片提供十个缓存时,可以认为提供了足够的随机性,而 Mosaic 增强是四张图混合,因此 cache 数量默认 N=40, 同理 MixUp 的 cache 数量默认为20, tiny 模型需要更稳定的训练条件,因此其 cache 数量也为其余规格模型的一半( MixUp 为10,Mosaic 为20) +``` + +在具体实现中,MMYOLO 设计了 `BaseMiximageTransform` 类来支持多张图像混合数据增强: + +```python +if self.use_cached: + # Be careful: deep copying can be very time-consuming + # if results includes dataset. + dataset = results.pop('dataset', None) + self.results_cache.append(copy.deepcopy(results)) # 将当前加载的图片数据缓存到 cache 中 + if len(self.results_cache) > self.max_cached_images: + if self.random_pop: # 除了tiny模型,self.random_pop=True + index = random.randint(0, len(self.results_cache) - 1) + else: + index = 0 + self.results_cache.pop(index) + + if len(self.results_cache) <= 4: + return results +else: + assert 'dataset' in results + # Be careful: deep copying can be very time-consuming + # if results includes dataset. + dataset = results.pop('dataset', None) +``` + +#### 1.1.2 Mosaic + +Mosaic 是将 4 张图拼接为 1 张大图,相当于变相的增加了 batch size,具体步骤为: + +1. 根据索引随机从自定义数据集中再采样3个图像,可能重复 + +```python +def get_indexes(self, dataset: Union[BaseDataset, list]) -> list: + """Call function to collect indexes. + + Args: + dataset (:obj:`Dataset` or list): The dataset or cached list. + + Returns: + list: indexes. + """ + indexes = [random.randint(0, len(dataset)) for _ in range(3)] + return indexes +``` + +2. 随机选出 4 幅图像相交的中点。 + +```python +# mosaic center x, y +center_x = int( + random.uniform(*self.center_ratio_range) * self.img_scale[1]) +center_y = int( + random.uniform(*self.center_ratio_range) * self.img_scale[0]) +center_position = (center_x, center_y) +``` + +3. 根据采样的 index 读取图片并拼接, 拼接前会先进行 `keep-ratio` 的 resize 图片(即为最大边一定是 640)。 + +```python +# keep_ratio resize +scale_ratio_i = min(self.img_scale[0] / h_i, + self.img_scale[1] / w_i) +img_i = mmcv.imresize( + img_i, (int(w_i * scale_ratio_i), int(h_i * scale_ratio_i))) +``` + +4. 
拼接后,把 bbox 和 label 全部拼接起来,然后对 bbox 进行裁剪但是不过滤(可能出现一些无效框) + +```python +mosaic_bboxes.clip_([2 * self.img_scale[0], 2 * self.img_scale[1]]) +``` + +更多的关于 Mosaic 原理的详情可以参考 [YOLOv5 原理和实现全解析](./yolov5_description.md) 中的 Mosaic 原理分析。 + +#### 1.1.3 MixUp + +RTMDet 的 MixUp 实现方式与 YOLOX 中一样,只不过增加了类似上文中提到的 cache 功能。 + +更多的关于 MixUp 原理的详情也可以参考 [YOLOv5 原理和实现全解析](./yolov5_description.md) 中的 MixUp 原理分析。 + +#### 1.1.4 强弱两阶段训练 + +Mosaic+MixUp 失真度比较高,持续用太强的数据增强对模型并不一定有益。YOLOX 中率先使用了强弱两阶段的训练方式,但由于引入了旋转,错切导致 box 标注产生误差,需要在第二阶段引入额外的 L1 loss 来纠正回归分支的性能。 + +为了使数据增强的方式更为通用,RTMDet 在前 280 epoch 使用不带旋转的 Mosaic+MixUp, 且通过混入 8 张图片来提升强度以及正样本数。后 20 epoch 使用比较小的学习率在比较弱的增强下进行微调,同时在 EMA 的作用下将参数缓慢更新至模型,能够得到比较大的提升。 + +| | RTMDet-s | RTMDet-l | +| ----------------------------- | -------- | -------- | +| LSJ + rand crop | 42.3 | 46.7 | +| Mosaic+MixUp | 41.9 | 49.8 | +| Mosaic + MixUp + 20e finetune | 43.9 | **51.3** | + +### 1.2 模型结构 + +RTMDet 模型整体结构和 [YOLOX](https://arxiv.org/abs/2107.08430) 几乎一致,由 `CSPNeXt` + `CSPNeXtPAFPN` + `共享卷积权重但分别计算 BN 的 SepBNHead` 构成。内部核心模块也是 `CSPLayer`,但对其中的 `Basic Block` 进行了改进,提出了 `CSPNeXt Block`。 + +#### 1.2.1 Backbone + +`CSPNeXt` 整体以 `CSPDarknet` 为基础,共 5 层结构,包含 1 个 `Stem Layer` 和 4 个 `Stage Layer`: + +- `Stem Layer` 是 3 层 3x3 kernel 的 `ConvModule` ,不同于之前的 `Focus` 模块或者 1 层 6x6 kernel 的 `ConvModule` 。 + +- `Stage Layer` 总体结构与已有模型类似,前 3 个 `Stage Layer` 由 1 个 `ConvModule` 和 1 个 `CSPLayer` 组成。第 4 个 `Stage Layer` 在 `ConvModule` 和 `CSPLayer` 中间增加了 `SPPF` 模块(MMDetection 版本为 `SPP` 模块)。 + +- 如模型图 Details 部分所示,`CSPLayer` 由 3 个 `ConvModule` + n 个 `CSPNeXt Block`(带残差连接) + 1 个 `Channel Attention` 模块组成。`ConvModule` 为 1 层 3x3 `Conv2d` + `BatchNorm` + `SiLU` 激活函数。`Channel Attention` 模块为 1 层 `AdaptiveAvgPool2d` + 1 层 1x1 `Conv2d` + `Hardsigmoid` 激活函数。`CSPNeXt Block` 模块在下节详细讲述。 + +- 如果想阅读 Backbone - `CSPNeXt` 的源码,可以 [**点此**](https://github.com/open-mmlab/mmyolo/blob/main/mmyolo/models/backbones/cspnext.py#L16-L171) 跳转。 + +#### 1.2.2 CSPNeXt Block + +Darknet (图 a)使用 1x1 与 3x3 卷积的 `Basic Block`。[YOLOv6](https://arxiv.org/abs/2209.02976) 、[YOLOv7](https://arxiv.org/abs/2207.02696) 、[PPYOLO-E](https://arxiv.org/abs/2203.16250) (图 b & c)使用了重参数化 Block。但重参数化的训练代价高,且不易量化,需要其他方式来弥补量化误差。 +RTMDet 则借鉴了最近比较热门的 [ConvNeXt](https://arxiv.org/abs/2201.03545) 、[RepLKNet](https://arxiv.org/abs/2203.06717) 的做法,为 `Basic Block` 加入了大 kernel 的 `depth-wise` 卷积(图 d),并将其命名为 `CSPNeXt Block`。 + +
+(图:Basic Block 与 CSPNeXt Block 结构对比)
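+
+下面用一个极简的 PyTorch 草图说明“在 Basic Block 中引入大 kernel depth-wise 卷积”这一思路。注意这只是示意实现,并非 MMDetection/MMYOLO 中 `CSPNeXtBlock` 的源码,通道数与 kernel 大小均为示例取值:
+
+```python
+import torch
+import torch.nn as nn
+
+
+class CSPNeXtBlockSketch(nn.Module):
+    """示意版 CSPNeXt Block:3x3 普通卷积 + 5x5 depth-wise 卷积,带残差连接。
+
+    每个卷积后接 BN + SiLU,对应文中 ConvModule 的 Conv2d + BatchNorm + SiLU 组合。
+    """
+
+    def __init__(self, channels: int = 256, kernel_size: int = 5):
+        super().__init__()
+        self.conv1 = nn.Sequential(
+            nn.Conv2d(channels, channels, 3, padding=1, bias=False),
+            nn.BatchNorm2d(channels), nn.SiLU(inplace=True))
+        # 大 kernel depth-wise 卷积(groups=channels)+ 1x1 point-wise 卷积
+        self.conv2 = nn.Sequential(
+            nn.Conv2d(channels, channels, kernel_size,
+                      padding=kernel_size // 2, groups=channels, bias=False),
+            nn.BatchNorm2d(channels), nn.SiLU(inplace=True),
+            nn.Conv2d(channels, channels, 1, bias=False),
+            nn.BatchNorm2d(channels), nn.SiLU(inplace=True))
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return x + self.conv2(self.conv1(x))  # 残差连接
+
+
+# 用法示意:输入输出 shape 保持一致
+out = CSPNeXtBlockSketch(256)(torch.randn(1, 256, 40, 40))
+```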
+ +关于不同 kernel 大小的实验结果,如下表所示。 + +| Kernel size | params | flops | latency-bs1-TRT-FP16 / ms | mAP | +| ------------ | ---------- | --------- | ------------------------- | -------- | +| 3x3 | 50.8 | 79.61G | 2.1 | 50.0 | +| **5x5** | **50.92M** | **79.7G** | **2.11** | **50.9** | +| 7x7 | 51.1 | 80.34G | 2.73 | 51.1 | + +如果想阅读 `Basic Block` 和 `CSPNeXt Block` 源码,可以[**点此**](https://github.com/open-mmlab/mmdetection/blob/3.x/mmdet/models/layers/csp_layer.py#L79-L146)跳转。 + +#### 1.2.3 调整检测器不同 stage 间的 block 数 + +由于 `CSPNeXt Block` 内使用了 `depth-wise` 卷积,单个 block 内的层数增多。如果保持原有的 stage 内的 block 数,则会导致模型的推理速度大幅降低。 + +RTMDet 重新调整了不同 stage 间的 block 数,并调整了通道的超参,在保证了精度的情况下提升了推理速度。 + +关于不同 block 数的实验结果,如下表所示。 + +| Num blocks | params | flops | latency-bs1-TRT-FP16 / ms | mAP | +| ---------------------------------- | --------- | --------- | ------------------------- | -------- | +| L+3-9-9-3 | 53.4 | 86.28 | 2.6 | 51.4 | +| L+3-6-6-3 | 50.92M | 79.7G | 2.11 | 50.9 | +| **L+3-6-6-3 + channel attention** | **52.3M** | **79.9G** | **2.4** | **51.3** | + +最后不同大小模型的 block 数设置,可以参见[源码](https://github.com/open-mmlab/mmyolo/blob/main/mmyolo/models/backbones/cspnext.py#L50-L56) 。 + +#### 1.2.4 Neck + +Neck 模型结构和 YOLOX 几乎一样,只不过内部的 block 进行了替换。 + +#### 1.2.5 Backbone 与 Neck 之间的参数量和计算量的均衡 + +[EfficientDet](https://arxiv.org/abs/1911.09070) 、[NASFPN](https://arxiv.org/abs/1904.07392) 等工作在改进 Neck 时往往聚焦于如何修改特征融合的方式。 但引入过多的连接会增加检测器的延时,并增加内存开销。 + +所以 RTMDet 选择不引入额外的连接,而是改变 Backbone 与 Neck 间参数量的配比。该配比是通过手动调整 Backbone 和 Neck 的 `expand_ratio` 参数来实现的,其数值在 Backbone 和 Neck 中都为 0.5。`expand_ratio` 实际上是改变 `CSPLayer` 中各层通道数的参数(具体可见模型图 `CSPLayer` 部分)。如果想进行不同配比的实验,可以通过调整配置文件中的 [backbone {expand_ratio}](https://github.com/open-mmlab/mmyolo/blob/main/configs/rtmdet/rtmdet_l_8xb32-300e_coco.py#L32) 和 [neck {expand_ratio}](https://github.com/open-mmlab/mmyolo/blob/main/configs/rtmdet/rtmdet_l_8xb32-300e_coco.py#L45) 参数完成。 + +实验发现,当 Neck 在整个模型中的参数量占比更高时,延时更低,且对精度的影响很小。作者在直播答疑时回复,RTMDet 在 Neck 这一部分的实验参考了 [GiraffeDet](https://arxiv.org/abs/2202.04256) 的做法,但没有像 GiraffeDet 一样引入额外连接(详细可参见 [RTMDet 发布视频](https://www.bilibili.com/video/BV1e841147GD) 31分40秒左右的内容)。 + +关于不同参数量配比的实验结果,如下表所示。 + +| Model size | Backbone | Neck | params | flops | latency / ms | mAP | +| ----------- | -------- | ------- | ---------- | ---------- | ------------- | -------- | +| **S** | **47%** | **45%** | **8.54M** | **15.76G** | **1.21** | **43.9** | +| S | 63% | 29% | 9.01M | 15.85G | 1.37 | 43.7 | +| **L** | **47%** | **45%** | **50.92M** | **79.7G** | **2.11** | **50.9** | +| L | 63% | 29% | 57.43M | 93.73 | 2.57 | 51.0 | + +如果想阅读 Neck - `CSPNeXtPAFPN` 的源码,可以[**点此**](https://github.com/open-mmlab/mmyolo/blob/main/mmyolo/models/necks/cspnext_pafpn.py#L15-L201) 跳转。 + +#### 1.2.6 Head + +传统的 YOLO 系列都使用同一 Head 进行分类和回归。YOLOX 则将分类和回归分支解耦,PPYOLO-E 和 YOLOv6 则引入了 [TOOD](https://arxiv.org/abs/2108.07755) 中的结构。它们在不同特征层级之间都使用独立的 Head,因此 Head 在模型中也占有较多的参数量。 + +RTMDet 参考了 [NAS-FPN](https://arxiv.org/abs/1904.07392) 中的做法,使用了 `SepBNHead`,在不同层之间共享卷积权重,但是独立计算 BN(BatchNorm) 的统计量。 + +关于不同结构 Head 的实验结果,如下表所示。 + +| Head type | params | flops | latency / ms | mAP | +| ------------------ | --------- | --------- | ------------- | -------- | +| Fully-shared head | 52.32 | 80.23 | 2.44 | 48.0 | +| Separated head | 57.03 | 80.23 | 2.44 | 51.2 | +| **SepBN** **head** | **52.32** | **80.23** | **2.44** | **51.3** | + +同时,RTMDet 也延续了作者之前在 [NanoDet](https://zhuanlan.zhihu.com/p/306530300) 中的思想,使用 [Quality Focal Loss](https://arxiv.org/abs/2011.12885) ,并去掉 Objectness 分支,进一步将 Head 轻量化。 + +如果想阅读 Head 中 
`RTMDetSepBNHeadModule` 的源码,可以[**点此**](https://github.com/open-mmlab/mmyolo/blob/main/mmyolo/models/dense_heads/rtmdet_head.py#L24-L189) 跳转。 + +```{note} +注:MMYOLO 和 MMDetection 中 Neck 和 Head 的具体实现稍有不同。 +``` + +### 1.3 正负样本匹配策略 + +正负样本匹配策略或者称为标签匹配策略 `Label Assignment` 是目标检测模型训练中最核心的问题之一, 更好的标签匹配策略往往能够使得网络更好学习到物体的特征以提高检测能力。 + +早期的样本标签匹配策略一般都是基于 `空间以及尺度信息的先验` 来决定样本的选取。 典型案例如下: + +- `FCOS` 中先限定网格中心点在 `GT` 内筛选后然后再通过不同特征层限制尺寸来决定正负样本 +- `RetinaNet` 则是通过 `Anchor` 与 `GT` 的最大 `IOU` 匹配来划分正负样本 +- `YOLOV5` 的正负样本则是通过样本的宽高比先筛选一部分, 然后通过位置信息选取 `GT` 中心落在的 `Grid` 以及临近的两个作为正样本 + +但是上述方法都是属于基于 `先验` 的静态匹配策略, 就是样本的选取方式是根据人的经验规定的。 不会随着网络的优化而进行自动优化选取到更好的样本, 近些年涌现了许多优秀的动态标签匹配策略: + +- `OTA` 提出使用 `Sinkhorn` 迭代求解匹配中的最优传输问题 +- `YOLOX` 中使用 `OTA` 的近似算法 `SimOTA` , `TOOD` 将分类分数以及 `IOU` 相乘计算 `Cost` 矩阵进行标签匹配等等 + +这些算法将 `预测的 Bboxes 与 GT 的 IOU ` 和 `分类分数` 或者是对应 `分类 Loss` 和 `回归 Loss` 拿来计算 `Matching Cost` 矩阵再通过 `top-k` 的方式动态决定样本选取以及样本个数。通过这种方式, +在网络优化的过程中会自动选取对分类或者回归更加敏感有效的位置的样本, 它不再只依赖先验的静态的信息, 而是使用当前的预测结果去动态寻找最优的匹配, 只要模型的预测越准确, 匹配算法求得的结果也会更优秀。但是在网络训练的初期, 网络的分类以及回归是随机初始化, 这个时候还是需要 `先验` 来约束, 以达到 `冷启动` 的效果。 + +`RTMDet` 作者也是采用了动态的 `SimOTA` 做法,不过其对动态的正负样本分配策略进行了改进。 之前的动态匹配策略( `HungarianAssigner` 、`OTA` )往往使用与 `Loss` 完全一致的代价函数作为匹配的依据,但我们经过实验发现这并不一定时最优的。 使用更多 `Soften` 的 `Cost` 以及先验,能够提升性能。 + +#### 1.3.1 Bbox 编解码过程 + +RTMDet 的 BBox Coder 采用的是 `mmdet.DistancePointBBoxCoder`。 + +该类的 docstring 为 `This coder encodes gt bboxes (x1, y1, x2, y2) into (top, bottom, left, right) and decode it back to the original.` + +编码器将 gt bboxes (x1, y1, x2, y2) 编码为 (top, bottom, left, right),并且解码至原图像上。 + +MMDet 编码的核心源码: + +```python +def bbox2distance(points: Tensor, bbox: Tensor, ...) -> Tensor: + """ + points (Tensor): 相当于 scale 值 stride ,且每个预测点仅为一个正方形 anchor 的 anchor point [x, y],Shape (n, 2) or (b, n, 2). + bbox (Tensor): Bbox 为乘上 stride 的网络预测值,格式为 xyxy,Shape (n, 4) or (b, n, 4). + """ + # 计算点距离四边的距离 + left = points[..., 0] - bbox[..., 0] + top = points[..., 1] - bbox[..., 1] + right = bbox[..., 2] - points[..., 0] + bottom = bbox[..., 3] - points[..., 1] + + ... + + return torch.stack([left, top, right, bottom], -1) +``` + +MMDetection 解码的核心源码: + +```python +def distance2bbox(points: Tensor, distance: Tensor, ...) -> Tensor: + """ + 通过距离反算 bbox 的 xyxy + points (Tensor): 正方形的预测 anchor 的 anchor point [x, y],Shape (B, N, 2) or (N, 2). + distance (Tensor): 距离四边的距离。(left, top, right, bottom). Shape (B, N, 4) or (N, 4) + """ + + # 反算 bbox xyxy + x1 = points[..., 0] - distance[..., 0] + y1 = points[..., 1] - distance[..., 1] + x2 = points[..., 0] + distance[..., 2] + y2 = points[..., 1] + distance[..., 3] + + bboxes = torch.stack([x1, y1, x2, y2], -1) + + ... + + return bboxes +``` + +#### 1.3.2 匹配策略 + +`RTMDet` 提出了 `Dynamic Soft Label Assigner` 来实现标签的动态匹配策略, 该方法主要包括使用 **位置先验信息损失** , **样本回归损失** , **样本分类损失** , 同时对三个损失进行了 `Soft` 处理进行参数调优, 以达到最佳的动态匹配效果。 + +该方法 Matching Cost 矩阵由如下损失构成: + +```python +cost_matrix = soft_cls_cost + iou_cost + soft_center_prior +``` + +1. Soft_Center_Prior + +```{math} +C\_{center} = \\alpha^{|x\_{pred}-x\_{gt}|-\\beta} +``` + +```python +# valid_prior Tensor[N,4] 表示anchor point +# 4分别表示 x, y, 以及对应的特征层的 stride, stride +gt_center = (gt_bboxes[:, :2] + gt_bboxes[:, 2:]) / 2.0 +valid_prior = priors[valid_mask] +strides = valid_prior[:, 2] +# 计算gt与anchor point的中心距离并转换到特征图尺度 +distance = (valid_prior[:, None, :2] - gt_center[None, :, :] + ).pow(2).sum(-1).sqrt() / strides[:, None] +# 以10为底计算位置的软化损失,限定在gt的6个单元格以内 +soft_center_prior = torch.pow(10, distance - 3) +``` + +2. 
IOU_Cost + +```{math} +C\_{reg} = -log(IOU) +``` + +```python +# 计算回归 bboxes 和 gts 的 iou +pairwise_ious = self.iou_calculator(valid_decoded_bbox, gt_bboxes) +# 将 iou 使用 log 进行 soft , iou 越小 cost 更小 +iou_cost = -torch.log(pairwise_ious + EPS) * 3 +``` + +3. Soft_Cls_Cost + +```{math} +C\_{cls} = CE(P,Y\_{soft}) \*(Y\_{soft}-P)^2 +``` + +```python +# 生成分类标签 + gt_onehot_label = ( + F.one_hot(gt_labels.to(torch.int64), + pred_scores.shape[-1]).float().unsqueeze(0).repeat( + num_valid, 1, 1)) +valid_pred_scores = valid_pred_scores.unsqueeze(1).repeat(1, num_gt, 1) +# 不单单将分类标签为01,而是换成与 gt 的 iou +soft_label = gt_onehot_label * pairwise_ious[..., None] +# 使用 quality focal loss 计算分类损失 cost ,与实际的分类损失计算保持一致 +scale_factor = soft_label - valid_pred_scores.sigmoid() +soft_cls_cost = F.binary_cross_entropy_with_logits( + valid_pred_scores, soft_label, + reduction='none') * scale_factor.abs().pow(2.0) +soft_cls_cost = soft_cls_cost.sum(dim=-1) +``` + +通过计算上述三个损失的和得到最终的 `cost_matrix` 后, 再使用 `SimOTA` 决定每一个 `GT` 匹配的样本的个数并决定最终的样本。具体操作如下所示: + +1. 首先通过自适应计算每一个 `gt` 要选取的样本数量: 取每一个 `gt` 与所有 `bboxes` 前 `13` 大的 `iou`, 得到它们的和取整后作为这个 `gt` 的 `样本数目` , 最少为 `1` 个, 记为 `dynamic_ks`。 +2. 对于每一个 `gt` , 将其 `cost_matrix` 矩阵前 `dynamic_ks` 小的位置作为该 `gt` 的正样本。 +3. 对于某一个 `bbox`, 如果被匹配到多个 `gt` 就将与这些 `gts` 的 `cost_marix` 中最小的那个作为其 `label`。 + +在网络训练初期,因参数初始化,回归和分类的损失值 `Cost` 往往较大, 这时候 `IOU` 比较小, 选取的样本较少,主要起作用的是 `Soft_center_prior` 也就是位置信息,优先选取位置距离 `GT` 比较近的样本作为正样本,这也符合人们的理解,在网络前期给少量并且有足够质量的样本,以达到冷启动。 +当网络进行训练一段时间过后,分类分支和回归分支都进行了一定的优化后,这时 `IOU` 变大, 选取的样本也逐渐增多,这时网络也有能力学习到更多的样本,同时因为 `IOU_Cost` 以及 `Soft_Cls_Cost` 变小,网络也会动态的找到更有利优化分类以及回归的样本点。 + +在 `Resnet50-1x` 的三种损失的消融实验: + +| Soft_cls_cost | Soft_center_prior | Log_IoU_cost | mAP | +| :------------ | :---------------- | :----------- | :--- | +| × | × | × | 39.9 | +| √ | × | × | 40.3 | +| √ | √ | × | 40.8 | +| √ | √ | √ | 41.3 | + +与其他主流 `Assign` 方法在 `Resnet50-1x` 的对比实验: + +| method | mAP | +| :-----------: | :--- | +| ATSS | 39.2 | +| PAA | 40.4 | +| OTA | 40.7 | +| TOOD(w/o TAH) | 40.7 | +| Ours | 41.3 | + +无论是 `Resnet50-1x` 还是标准的设置下,还是在`300epoch` + `havy augmentation`, 相比于 `SimOTA` 、 `OTA` 以及 `TOOD` 中的 `TAL` 均有提升。 + +| 300e + Mosaic & MixUP | mAP | +| :-------------------- | :--- | +| RTMDet-s + SimOTA | 43.2 | +| RTMDet-s + DSLA | 44.5 | + +### 1.4 Loss 设计 + +参与 Loss 计算的共有两个值:`loss_cls` 和 `loss_bbox`,其各自使用的 Loss 方法如下: + +- `loss_cls`:`mmdet.QualityFocalLoss` +- `loss_bbox`:`mmdet.GIoULoss` + +权重比例是:`loss_cls` : `loss_bbox` = `1 : 2` + +#### QualityFocalLoss + +Quality Focal Loss (QFL) 是 [Generalized Focal Loss: Learning Qualified and Distributed Bounding Boxes for Dense Object Detection](https://arxiv.org/abs/2006.04388) 的一部分。 + +
+(图:Quality Focal Loss 示意)
+ +普通的 Focal Loss 公式: + +```{math} +{FL}(p) = -(1-p_t)^\gamma\log(p_t),p_t = \begin{cases} +p, & {when} \ y = 1 \\ +1 - p, & {when} \ y = 0 +\end{cases} +``` + +其中 {math}`y\in{1,0}` 指定真实类,{math}`p\in[0,1]` 表示标签 {math}`y = 1` 的类估计概率。{math}`\gamma` 是可调聚焦参数。具体来说,FL 由标准交叉熵部分 {math}`-\log(p_t)` 和动态比例因子部分 {math}`-(1-p_t)^\gamma` 组成,其中比例因子 {math}`-(1-p_t)^\gamma` 在训练期间自动降低简单类对于 loss 的比重,并且迅速将模型集中在困难类上。 + +首先 {math}`y = 0` 表示质量得分为 0 的负样本,{math}`0 < y \leq 1` 表示目标 IoU 得分为 y 的正样本。为了针对连续的标签,扩展 FL 的两个部分: + +1. 交叉熵部分 {math}`-\log(p_t)` 扩展为完整版本 {math}`-((1-y)\log(1-\sigma)+y\log(\sigma))` +2. 比例因子部分 {math}`-(1-p_t)^\gamma` 被泛化为估计 {math}`\gamma` 与其连续标签 {math}`y` 的绝对距离,即 {math}`|y-\sigma|^\beta (\beta \geq 0)` 。 + +结合上面两个部分之后,我们得出 QFL 的公式: + +```{math} +{QFL}(\sigma) = -|y-\sigma|^\beta((1-y)\log(1-\sigma)+y\log(\sigma)) +``` + +具体作用是:可以将离散标签的 `focal loss` 泛化到连续标签上,将 bboxes 与 gt 的 IoU 的作为分类分数的标签,使得分类分数为表征回归质量的分数。 + +MMDetection 实现源码的核心部分: + +```python +@weighted_loss +def quality_focal_loss(pred, target, beta=2.0): + """ + pred (torch.Tensor): 用形状(N,C)联合表示预测分类和质量(IoU),C是类的数量。 + target (tuple([torch.Tensor])): 目标类别标签的形状为(N,),目标质量标签的形状是(N,,)。 + beta (float): 计算比例因子的 β 参数. + """ + ... + + # label表示类别id,score表示质量分数 + label, score = target + + # 负样本质量分数0来进行监督 + pred_sigmoid = pred.sigmoid() + scale_factor = pred_sigmoid + zerolabel = scale_factor.new_zeros(pred.shape) + + # 计算交叉熵部分的值 + loss = F.binary_cross_entropy_with_logits( + pred, zerolabel, reduction='none') * scale_factor.pow(beta) + + # 得出 IoU 在区间 (0,1] 的 bbox + # FG cat_id: [0, num_classes -1], BG cat_id: num_classes + bg_class_ind = pred.size(1) + pos = ((label >= 0) & (label < bg_class_ind)).nonzero().squeeze(1) + pos_label = label[pos].long() + + # 正样本由 IoU 范围在 (0,1] 的 bbox 来监督 + # 计算动态比例因子 + scale_factor = score[pos] - pred_sigmoid[pos, pos_label] + + # 计算两部分的 loss + loss[pos, pos_label] = F.binary_cross_entropy_with_logits( + pred[pos, pos_label], score[pos], + reduction='none') * scale_factor.abs().pow(beta) + + # 得出最终 loss + loss = loss.sum(dim=1, keepdim=False) + return loss +``` + +#### GIoULoss + +论文:[Generalized Intersection over Union: A Metric and A Loss for Bounding Box Regression](https://arxiv.org/abs/1902.09630) + +GIoU Loss 用于计算两个框重叠区域的关系,重叠区域越大,损失越小,反之越大。而且 GIoU 是在 \[0,2\] 之间,因为其值被限制在了一个较小的范围内,所以网络不会出现剧烈的波动,证明了其具有比较好的稳定性。 + +下图是基本的实现流程图: + +
+(图:GIoU 计算流程图)
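+
+在阅读源码之前,先用一组假设的示例框把 GIoU 的计算过程走一遍(坐标格式为 xyxy):
+
+```python
+# 示例:预测框 A = (0, 0, 2, 2),gt 框 B = (1, 1, 3, 3),面积均为 4
+inter = 1.0                # 交集 (1, 1, 2, 2) 的面积
+union = 4.0 + 4.0 - inter  # 并集面积 = 7
+iou = inter / union        # ≈ 0.143
+enclose = 9.0              # 最小凸闭合框 (0, 0, 3, 3) 的面积
+giou = iou - (enclose - union) / enclose  # ≈ -0.079
+loss = 1 - giou                           # ≈ 1.079,落在 [0, 2] 区间内
+```
+
+可以看到:两框重叠很小时 GIoU 为负、loss 大于 1;两框完全重合时 GIoU 为 1、loss 为 0。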
+ +MMDetection 实现源码的核心部分: + +```python +def bbox_overlaps(bboxes1, bboxes2, mode='iou', is_aligned=False, eps=1e-6): + ... + + # 求两个区域的面积 + area1 = (bboxes1[..., 2] - bboxes1[..., 0]) * ( + bboxes1[..., 3] - bboxes1[..., 1]) + area2 = (bboxes2[..., 2] - bboxes2[..., 0]) * ( + bboxes2[..., 3] - bboxes2[..., 1]) + + if is_aligned: + # 得出两个 bbox 重合的左上角 lt 和右下角 rb + lt = torch.max(bboxes1[..., :2], bboxes2[..., :2]) # [B, rows, 2] + rb = torch.min(bboxes1[..., 2:], bboxes2[..., 2:]) # [B, rows, 2] + + # 求重合面积 + wh = fp16_clamp(rb - lt, min=0) + overlap = wh[..., 0] * wh[..., 1] + + if mode in ['iou', 'giou']: + ... + else: + union = area1 + if mode == 'giou': + # 得出两个 bbox 最小凸闭合框的左上角 lt 和右下角 rb + enclosed_lt = torch.min(bboxes1[..., :2], bboxes2[..., :2]) + enclosed_rb = torch.max(bboxes1[..., 2:], bboxes2[..., 2:]) + else: + ... + + # 求重合面积 / gt bbox 面积 的比率,即 IoU + eps = union.new_tensor([eps]) + union = torch.max(union, eps) + ious = overlap / union + + ... + + # 求最小凸闭合框面积 + enclose_wh = fp16_clamp(enclosed_rb - enclosed_lt, min=0) + enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1] + enclose_area = torch.max(enclose_area, eps) + + # 计算 giou + gious = ious - (enclose_area - union) / enclose_area + return gious + +@weighted_loss +def giou_loss(pred, target, eps=1e-7): + gious = bbox_overlaps(pred, target, mode='giou', is_aligned=True, eps=eps) + loss = 1 - gious + return loss +``` + +### 1.5 优化策略和训练过程 + +
+ +
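+
+针对前文消融实验表中提到的 “AdamW + Flat Cosine” 学习率策略,下面给出一个 Flat-Cosine 学习率曲线的最小示意实现:前半程保持学习率不变,后半程按余弦退火衰减。其中 base_lr、flat 比例与最小学习率比例均为示例取值,实际数值请以 MMYOLO `configs/rtmdet` 下的配置文件为准:
+
+```python
+import math
+
+
+def flat_cosine_lr(step: int, total_steps: int, base_lr: float = 0.004,
+                   flat_ratio: float = 0.5, min_lr_ratio: float = 0.05) -> float:
+    """Flat-Cosine 学习率示意:前 flat_ratio 比例保持 base_lr,之后余弦退火到 base_lr * min_lr_ratio。"""
+    flat_steps = int(total_steps * flat_ratio)
+    if step < flat_steps:
+        return base_lr
+    progress = (step - flat_steps) / max(1, total_steps - flat_steps)
+    min_lr = base_lr * min_lr_ratio
+    return min_lr + 0.5 * (base_lr - min_lr) * (1 + math.cos(math.pi * progress))
+```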
+ +### 1.6 推理和后处理过程 + +
+ +
+ +**(1) 特征图输入** + +预测的图片输入大小为 640 x 640, 通道数为 3 ,经过 CSPNeXt, CSPNeXtPAFPN 层的 8 倍、16 倍、32 倍下采样得到 80 x 80, 40 x 40, 20 x 20 三个尺寸的特征图。以 rtmdet-l 模型为例,此时三层通道数都为 256,经过 `bbox_head` 层得到两个分支,分别为 `rtm_cls` 类别预测分支,将通道数从 256 变为 80,80 对应所有类别数量; `rtm_reg` 边框回归分支将通道数从 256 变为 4,4 代表框的坐标。 + +**(2) 初始化网格** + +根据特征图尺寸初始化三个网格,大小分别为 6400 (80 x 80)、1600 (40 x 40)、400 (20 x 20),如第一个层 shape 为 torch.Size(\[ 6400, 2 \]),最后一个维度是 2,为网格点的横纵坐标,而 6400 表示当前特征层的网格点数量。 + +**(3) 维度变换** + +经过 `_predict_by_feat_single` 函数,将从 head 提取的单一图像的特征转换为 bbox 结果输入,得到三个列表 `cls_score_list`,`bbox_pred_list`,`mlvl_priors`,详细大小如图所示。之后分别遍历三个特征层,分别对 class 类别预测分支、bbox 回归分支进行处理。以第一层为例,对 bbox 预测分支 \[ 4,80,80 \] 维度变换为 \[ 6400,4 \],对类别预测分支 \[ 80,80,80 \] 变化为 \[ 6400,80 \],并对其做归一化,确保类别置信度在 0 - 1 之间。 + +**(4) 阈值过滤** + +先使用一个 `nms_pre` 操作,先过滤大部分置信度比较低的预测结果(比如 `score_thr` 阈值设置为 0.05,则去除当前预测置信度低于 0.05 的结果),然后得到 bbox 坐标、所在网格的坐标、置信度、标签的信息。经过三个特征层遍历之后,分别整合这三个层得到的的四个信息放入 results 列表中。 + +**(5) 还原到原图尺度** + +最后将网络的预测结果映射到整图当中,得到 bbox 在整图中的坐标值 + +**(6) NMS** + +进行 nms 操作,最终预测得到的返回值为经过后处理的每张图片的检测结果,包含分类置信度,框的 labels,框的四个坐标 + +## 2 总结 + +本文对 RTMDet 原理和在 MMYOLO 实现进行了详细解析,希望能帮助用户理解算法实现过程。同时请注意:由于 RTMDet 本身也在不断更新, +本开源库也会不断迭代,请及时阅读和同步最新版本。 diff --git a/third_party/mmyolo/docs/zh_cn/recommended_topics/algorithm_descriptions/yolov5_description.md b/third_party/mmyolo/docs/zh_cn/recommended_topics/algorithm_descriptions/yolov5_description.md new file mode 100644 index 0000000000000000000000000000000000000000..2ef01111b3ae7a3305ded0293bce9e9387510996 --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/recommended_topics/algorithm_descriptions/yolov5_description.md @@ -0,0 +1,652 @@ +# YOLOv5 原理和实现全解析 + +## 0 简介 + +
+图 1:YOLOv5-l-P5 模型结构
+ +
+图 2:YOLOv5-l-P6 模型结构
+ +以上结构图由 RangeKing@github 绘制。 + +YOLOv5 是一个面向实时工业应用而开源的目标检测算法,受到了广泛关注。我们认为让 YOLOv5 爆火的原因不单纯在于 YOLOv5 算法本身的优异性,更多的在于开源库的实用和鲁棒性。简单来说 YOLOv5 开源库的主要特点为: + +1. **友好和完善的部署支持** +2. **算法训练速度极快**,在 300 epoch 情况下训练时长和大部分 one-stage 算法如 RetinaNet、ATSS 和 two-stage 算法如 Faster R-CNN 在 12 epoch 的训练时间接近 +3. 框架进行了**非常多的 corner case 优化**,功能和文档也比较丰富 + +如图 1 和 2 所示,YOLOv5 的 P5 和 P6 版本主要差异在于网络结构和图片输入分辨率。其他区别,如 anchors 个数和 loss 权重可详见[配置文件](https://github.com/open-mmlab/mmyolo/blob/main/configs/yolov5/yolov5_s-p6-v62_syncbn_fast_8xb16-300e_coco.py)。本文将从 YOLOv5 算法本身原理讲起,然后重点分析 MMYOLO 中的实现。关于 YOLOv5 的使用指南和速度等对比请阅读本文的后续内容。 + +```{hint} +没有特殊说明情况下,本文默认描述的是 P5 模型。 +``` + +希望本文能够成为你入门和掌握 YOLOv5 的核心文档。由于 YOLOv5 本身也在不断迭代更新,我们也会不断的更新本文档。请注意阅读最新版本。 + +MMYOLO 实现配置:https://github.com/open-mmlab/mmyolo/blob/main/configs/yolov5/ + +YOLOv5 官方开源库地址:https://github.com/ultralytics/yolov5 + +## 1 v6.1 算法原理和 MMYOLO 实现解析 + +YOLOv5 官方 release 地址:https://github.com/ultralytics/yolov5/releases/tag/v6.1 + +
+YOLOv5精度图 +
+ +
+YOLOv5精度速度图 +
+ +性能如上表所示。YOLOv5 有 P5 和 P6 两个不同训练输入尺度的模型,P6 即为 1280x1280 输入的大模型,通常用的是 P5 常规模型,输入尺寸是 640x640 。本文解读的也是 P5 模型结构。 + +通常来说,目标检测算法都可以分成数据增强、模型结构、loss 计算等组件,YOLOv5 也一样,如下所示: + +
+训练测试策略 +
+ +下面将从原理和结合 MMYOLO 的具体实现方面进行简要分析。 + +### 1.1 数据增强模块 + +YOLOv5 目标检测算法中使用的数据增强比较多,包括: + +- **Mosaic 马赛克** +- **RandomAffine 随机仿射变换** +- **MixUp** +- **图像模糊等采用 Albu 库实现的变换** +- **HSV 颜色空间增强** +- **随机水平翻转** + +其中 Mosaic 数据增强概率为 1,表示一定会触发,而对于 small 和 nano 两个版本的模型不使用 MixUp,其他的 l/m/x 系列模型则采用了 0.1 的概率触发 MixUp。小模型能力有限,一般不会采用 MixUp 等强数据增强策略。 + +其核心的 Mosaic + RandomAffine + MixUp 过程简要绘制如下: + +
+image +
+ +下面对其进行简要分析。 + +#### 1.1.1 Mosaic 马赛克 + +
+image +
+ +Mosaic 属于混合类数据增强,因为它在运行时候需要 4 张图片拼接,变相的相当于增加了训练的 batch size。其运行过程简要概况为: + +1. 随机生成拼接后 4 张图的交接中心点坐标,此时就相当于确定了 4 张拼接图片的交接点 +2. 随机选出另外 3 张图片的索引以及读取对应的标注 +3. 对每张图片采用保持宽高比的 resize 操作将其缩放到指定大小 +4. 按照上下左右规则,计算每张图片在待输出图片中应该放置的位置,因为图片可能出界故还需要计算裁剪坐标 +5. 利用裁剪坐标将缩放后的图片裁剪,然后贴到前面计算出的位置,其余位置全部补 114 像素值 +6. 对每张图片的标注也进行相应处理 + +注意:由于拼接了 4 张图,所以输出图片面积会扩大 4 倍,从 640x640 变成 1280x1280,因此要想恢复为 640x640, +必须要再接一个 **RandomAffine 随机仿射变换,否则图片面积就一直是扩大 4 倍的**。 + +#### 1.1.2 RandomAffine 随机仿射变换 + +
+image +
+ +随机仿射变换有两个目的: + +1. 对图片进行随机几何仿射变换 +2. 将 Mosaic 输出的扩大 4 倍的图片还原为 640x640 尺寸 + +随机仿射变换包括平移、旋转、缩放、错切等几何增强操作,同时由于 Mosaic 和 RandomAffine 属于比较强的增强操作,会引入较大噪声,因此需要对增强后的标注进行处理,过滤规则为: + +1. 增强后的 gt bbox 宽高要大于 wh_thr +2. 增强后的 gt bbox 面积和增强前的 gt bbox 面积比要大于 ar_thr,防止增强太严重 +3. 最大宽高比要小于 area_thr,防止宽高比改变太多 + +由于旋转后标注框会变大导致不准确,因此目标检测里面很少会使用旋转数据增强。 + +#### 1.1.3 MixUp + +
+image +
+ +MixUp 和 Mosaic 类似也属于混合图片类增强方法。随机选出另外一张图后将两图再随机混合。具体实现方法有多种,常见的做法是要么将 label 直接拼接起来,要么将 label 也采用 alpha 方法混合。原作者的做法非常简单,对 label 即直接拼接,而图片通过分布采样混合。 + +需要特别注意的是: +**YOLOv5 实现的 MixUp 中,随机出来的另一张图也需要经过 Mosaic 马赛克 + RandomAffine 随机仿射变换 的增强后才能混合。这个和其他开源库实现可能不太一样**。 + +#### 1.1.4 图像模糊和其他数据增强策略 + +
+image +
+ +剩下的数据增强包括 + +- **图像模糊等采用 Albu 库实现的变换** +- **HSV 颜色空间增强** +- **随机水平翻转** + +MMDetection 开源库中已经对 Albu 第三方数据增强库进行了封装,使用户可以简单的通过配置即可使用 Albu 库中提供的任何数据增强功能。而 HSV 颜色空间增强和随机水平翻转都是属于比较常规的数据增强,不需要特殊介绍。 + +#### 1.1.5 MMYOLO 实现解析 + +常规的单图数据增强例如随机翻转等比较容易实现,而 Mosaic 类的混合数据增强则不太容易。在 MMDetection 复现的 YOLOX 算法中提出了 MultiImageMixDataset 数据集包装器的概念,其实现过程如下: + +
+image +
+ +对于 Mosaic 等混合类数据增强策略,会需要额外实现一个 `get_indexes` 方法来获取其他图片索引,然后用得到的 4 张图片信息就可以进行 Mosaic 增强了。 +以 MMDetection 中实现的 YOLOX 为例,其配置文件写法如下所示: + +```python +train_pipeline = [ + dict(type='Mosaic', img_scale=img_scale, pad_val=114.0), + dict( + type='RandomAffine', + scaling_ratio_range=(0.1, 2), + border=(-img_scale[0] // 2, -img_scale[1] // 2)), + dict( + type='MixUp', + img_scale=img_scale, + ratio_range=(0.8, 1.6), + pad_val=114.0), + ... +] + +train_dataset = dict( + # use MultiImageMixDataset wrapper to support mosaic and mixup + type='MultiImageMixDataset', + dataset=dict( + type='CocoDataset', + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True) + ]), + pipeline=train_pipeline) +``` + +MultiImageMixDataset 数据集包装器传入一个包括 Mosaic 和 RandAffine 等数据增强,而 CocoDataset 中也需要传入一个包括图片和标注加载的 pipeline。通过这种方式就可以快速的实现混合类数据增强。 + +但是上述实现有一个缺点: +**对于不熟悉 MMDetection 的用户来说,其经常会忘记 Mosaic 必须要和 MultiImageMixDataset 配合使用,否则会报错,而且这样会加大复杂度和理解难度**。 + +为了解决这个问题,在 MMYOLO 中我们进一步进行了简化。直接让 pipeline 能够获取到 dataset 对象,此时就可以将 Mosaic 等混合类数据增强的实现和使用变成和随机翻转一样。 +此时在 MMYOLO 中 YOLOX 的配置写法变成如下所示: + +```python +pre_transform = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True) +] + +train_pipeline = [ + *pre_transform, + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='mmdet.RandomAffine', + scaling_ratio_range=(0.1, 2), + border=(-img_scale[0] // 2, -img_scale[1] // 2)), + dict( + type='YOLOXMixUp', + img_scale=img_scale, + ratio_range=(0.8, 1.6), + pad_val=114.0, + pre_transform=pre_transform), + ... +] +``` + +这样就不再需要 MultiImageMixDataset 了,使用和理解上会更加简单。 + +回到 YOLOv5 配置上,因为 YOLOv5 实现的 MixUp 中,随机选出来的另一张图也需要经过 Mosaic 马赛克+RandomAffine 随机仿射变换 增强后才能混合,故YOLOv5-m 数据增强配置如下所示: + +```python +pre_transform = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True) +] + +mosaic_transform= [ + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(0.1, 1.9), # scale = 0.9 + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)) +] + +train_pipeline = [ + *pre_transform, + *mosaic_transform, + dict( + type='YOLOv5MixUp', + prob=0.1, + pre_transform=[ + *pre_transform, + *mosaic_transform + ]), + ... +] +``` + +### 1.2 网络结构 + +本小结由 RangeKing@github 撰写,非常感谢!!! 
+ +YOLOv5 网络结构是标准的 `CSPDarknet` + `PAFPN` + `非解耦 Head`。 + +YOLOv5 网络结构大小由 `deepen_factor` 和 `widen_factor` 两个参数决定。其中 `deepen_factor` 控制网络结构深度,即 `CSPLayer` 中 `DarknetBottleneck` 模块堆叠的数量;`widen_factor` 控制网络结构宽度,即模块输出特征图的通道数。以 YOLOv5-l 为例,其 `deepen_factor = widen_factor = 1.0` 。P5 和 P6 的模型整体结构分别如图 1 和图 2 所示。 + +图的上半部分为模型总览;下半部分为具体网络结构,其中的模块均标有序号,方便用户与 YOLOv5 官方仓库的配置文件对应;中间部分为各子模块的具体构成。 + +如果想使用 netron 可视化网络结构图细节,可以直接在 netron 中将 MMDeploy 导出的 ONNX 文件格式文件打开。 + +```{hint} +1.2 小节涉及的特征维度(shape)都为 (B, C, H, W)。 +``` + +#### 1.2.1 Backbone + +在 MMYOLO 中 `CSPDarknet` 继承自 `BaseBackbone`,整体结构和 `ResNet` 类似。P5 模型共 5 层结构,包含 1 个 `Stem Layer` 和 4 个 `Stage Layer`: + +- `Stem Layer` 是 1 个 6x6 kernel 的 `ConvModule`,相较于 v6.1 版本之前的 `Focus` 模块更加高效。 +- 除了最后一个 `Stage Layer`,其他均由 1 个 `ConvModule` 和 1 个 `CSPLayer` 组成。如上图 Details 部分所示。 + 其中 `ConvModule` 为 3x3的 `Conv2d` + `BatchNorm` + `SiLU 激活函数`。`CSPLayer` 即 YOLOv5 官方仓库中的 C3 模块,由 3 个 `ConvModule` + n 个 `DarknetBottleneck`(带残差连接) 组成。 +- 最后一个 `Stage Layer` 在最后增加了 `SPPF` 模块。`SPPF` 模块是将输入串行通过多个 5x5 大小的 `MaxPool2d` 层,与 `SPP` 模块效果相同,但速度更快。 +- P5 模型会在 `Stage Layer` 2-4 之后分别输出一个特征图进入 `Neck` 结构。以 640x640 输入图片为例,其输出特征为 (B,256,80,80)、(B,512,40,40) 和 (B,1024,20,20),对应的 stride 分别为 8/16/32。 +- P6 模型会在 `Stage Layer` 2-5 之后分别输出一个特征图进入 `Neck` 结构。以 1280x1280 输入图片为例,其输出特征为 (B,256,160,160)、(B,512,80,80)、(B,768,40,40) 和 (B,1024,20,20),对应的 stride 分别为 8/16/32/64。 + +#### 1.2.2 Neck + +YOLOv5 官方仓库的配置文件中并没有 Neck 部分,为方便用户与其他目标检测网络结构相对应,我们将官方仓库的 `Head` 拆分成 `PAFPN` 和 `Head` 两部分。 + +基于 `BaseYOLONeck` 结构,YOLOv5 `Neck` 也是遵循同一套构建流程,对于不存在的模块,我们采用 `nn.Identity` 代替。 + +Neck 模块输出的特征图和 Backbone 完全一致。即 P5 模型为 (B,256,80,80)、 (B,512,40,40) 和 (B,1024,20,20);P6 模型为 (B,256,160,160)、(B,512,80,80)、(B,768,40,40) 和 (B,1024,20,20)。 + +#### 1.2.3 Head + +YOLOv5 Head 结构和 YOLOv3 完全一样,为 `非解耦 Head`。Head 模块只包括 3 个不共享权重的卷积,用于将输入特征图进行变换而已。 + +前面的 PAFPN 依然是输出 3 个不同尺度的特征图,shape 为 (B,256,80,80)、 (B,512,40,40) 和 (B,1024,20,20)。 +由于 YOLOv5 是非解耦输出,即分类和 bbox 检测等都是在同一个卷积的不同通道中完成。以 COCO 80 类为例: + +- P5 模型在输入为 640x640 分辨率情况下,其 Head 模块输出的 shape 分别为 `(B, 3x(4+1+80),80,80)`, `(B, 3x(4+1+80),40,40)` 和 `(B, 3x(4+1+80),20,20)`。 +- P6 模型在输入为 1280x1280 分辨率情况下,其 Head 模块输出的 shape 分别为 `(B, 3x(4+1+80),160,160)`, `(B, 3x(4+1+80),80,80)`, `(B, 3x(4+1+80),40,40)` 和 `(B, 3x(4+1+80),20,20)`。 + 其中 3 表示 3 个 anchor,4 表示 bbox 预测分支,1 表示 obj 预测分支,80 表示 COCO 数据集类别预测分支。 + +### 1.3 正负样本匹配策略 + +正负样本匹配策略的核心是确定预测特征图的所有位置中哪些位置应该是正样本,哪些是负样本,甚至有些是忽略样本。 +匹配策略是目标检测算法的核心,一个好的匹配策略可以显著提升算法性能。 + +YOLOV5 的匹配策略简单总结为:**采用了 anchor 和 gt_bbox 的 shape 匹配度作为划分规则,同时引入跨邻域网格策略来增加正样本**。 +其主要包括如下两个核心步骤: + +1. 对于任何一个输出层,抛弃了常用的基于 Max IoU 匹配的规则,而是直接采用 shape 规则匹配,也就是该 GT Bbox 和当前层的 Anchor 计算宽高比,如果宽高比例大于设定阈值,则说明该 GT Bbox 和 Anchor 匹配度不够,将该 GT Bbox 暂时丢掉,在该层预测中该 GT Bbox 对应的网格内的预测位置认为是负样本 +2. 
对于剩下的 GT Bbox(也就是匹配上的 GT Bbox),计算其落在哪个网格内,同时利用四舍五入规则,找出最近的两个网格,将这三个网格都认为是负责预测该 GT Bbox 的,可以粗略估计正样本数相比之前的 YOLO 系列,至少增加了三倍 + +下面会对每个部分进行详细说明,部分描述和图示直接或间接参考自官方 [Repo](https://github.com/ultralytics/YOLOv5/issues/6998#44)。 + +#### 1.3.1 Anchor 设置 + +YOLOv5 是 Anchor-based 的目标检测算法,其 Anchor size 的获取方式与 YOLOv3 类似,也是使用聚类获得,其不同之处在于聚类使用的标准不再是基于 IoU 的,而是使用形状上的宽高比作为聚类准则(即 shape-match )。 + +在用户更换了数据集后,可以使用 MMYOLO 里带有的 Anchor 分析工具,对自己的数据集进行分析,确定合适的 Anchor size。 + +```shell +python tools/analysis_tools/optimize_anchors.py ${CONFIG} --algorithm v5-k-means + --input-shape ${INPUT_SHAPE [WIDTH HEIGHT]} --output-dir ${OUTPUT_DIR} +``` + +然后在 [config 文件](https://github.com/open-mmlab/mmyolo/blob/main/configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py) 里修改默认 Anchor size: + +```python +anchors = [[(10, 13), (16, 30), (33, 23)], [(30, 61), (62, 45), (59, 119)], + [(116, 90), (156, 198), (373, 326)]] +``` + +#### 1.3.2 Bbox 编解码过程 + +在 Anchor-based 算法中,预测框通常会基于 Anchor 进行变换,然后预测变换量,这对应 GT Bbox 编码过程,而在预测后需要进行 Pred Bbox 解码,还原为真实尺度的 Bbox,这对应 Pred Bbox 解码过程。 + +在 YOLOv3 中,回归公式为: + +```{math} +b_x=\sigma(t_x)+c_x \\ +b_y=\sigma(t_y)+c_y \\ +b_w=a_w\cdot e^{t_w} \\ +b_h=a_h\cdot e^{t_h} \\ +``` + +公式中, + +```{math} +a_w 代表 Anchor 的宽度 \\ +c_x 代表 Grid 所处的坐标 \\ +\sigma 代表 Sigmoid 公式。 +``` + +而在 YOLOv5 中,回归公式为: + +```{math} +b_x=(2\cdot\sigma(t_x)-0.5)+c_x \\ +b_y=(2\cdot\sigma(t_y)-0.5)+c_y \\ +b_w=a_w\cdot(2\cdot\sigma(t_w))^2 \\ +b_h=a_h\cdot(2\cdot\sigma(t_h))^2 +``` + +改进之处主要有以下两点: + +- 中心点坐标范围从 (0, 1) 调整至 (-0.5, 1.5) +- 宽高范围从 + +```{math} +(0,+\infty) +``` + +调整至 + +```{math} +(0,4a_{wh}) +``` + +这个改进具有以下好处: + +- **新的中心点设置能更好的预测到 0 和 1**。这有助于更精准回归出 box 坐标。 + +
+image +
+ +- 宽高回归公式中 exp(x) 是无界的,这会导致**梯度失去控制**,造成训练不稳定。YOLOv5 中改进后的宽高回归公式优化了此问题。 + +
+image +
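+
+结合上面的编解码公式,这里给出一段最小的解码示意代码(按上式直接书写,并非 MMYOLO 中 `YOLOv5BBoxCoder` 的实际源码,函数与变量名均为示意):
+
+```python
+import torch
+
+
+def yolov5_decode_demo(pred, grid_xy, prior_wh):
+    """按上述公式解码的最小示例,所有量均在特征图(网格)尺度。
+
+    pred:     (N, 4) 网络原始输出 (tx, ty, tw, th)
+    grid_xy:  (N, 2) 对应网格左上角坐标 (cx, cy)
+    prior_wh: (N, 2) 对应 Anchor 宽高 (aw, ah)
+    """
+    # 中心点:2*sigmoid-0.5,范围 (-0.5, 1.5),再加上网格坐标
+    xy = (2 * pred[:, :2].sigmoid() - 0.5) + grid_xy
+    # 宽高:(2*sigmoid)^2,范围 (0, 4*prior_wh)
+    wh = prior_wh * (2 * pred[:, 2:].sigmoid()) ** 2
+    return torch.cat([xy, wh], dim=-1)  # (cx, cy, w, h)
+```
+
+若需要还原到输入图片尺度,再将结果乘以对应特征层的 stride 即可。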
+ +#### 1.3.3 匹配策略 + +在 MMYOLO 设计中,无论网络是 Anchor-based 还是 Anchor-free,**我们统一使用 prior 称呼 Anchor**。 + +正样本匹配包含以下两步: + +**(1) “比例”比较** + +将 GT Bbox 的 WH 与 Prior 的 WH 进行“比例”比较。 + +比较流程: + +```{math} +r_w = w\_{gt} / w\_{pt} \\ +r_h = h\_{gt} / h\_{pt} \\ +r_w^{max}=max(r_w, 1/r_w) \\ +r_h^{max}=max(r_h, 1/r_h) \\ +r^{max}=max(r_w^{max}, r_h^{max}) \\ +if\ \ r_{max} < prior\_match\_thr: match! +``` + +此处我们用一个 GT Bbox 与 P3 特征图的 Prior 进行匹配的案例进行讲解和图示: + +
+image +
+ +prior1 匹配失败的原因是 + +```{math} +h\_{gt}\ /\ h\_{prior}\ =\ 4.8\ >\ prior\_match\_thr +``` + +**(2) 为步骤 1 中 match 的 GT 分配对应的正样本** + +依然沿用上面的例子: + +GT Bbox (cx, cy, w, h) 值为 (26, 37, 36, 24), + +Prior WH 值为 \[(15, 5), (24, 16), (16, 24)\],在 P3 特征图上,stride 为 8。通过计算,prior2 和 prior3 能够 match。 + +计算过程如下: + +**(2.1) 将 GT Bbox 的中心点坐标对应到 P3 的 grid 上** + +```{math} +GT_x^{center_grid}=26/8=3.25 \\ +GT_y^{center_grid}=37/8=4.625 +``` + +
+image +
+ +**(2.2)** 将 GT Bbox 中心点所在的 grid 分成四个象限,**由于中心点落在了左下角的象限当中,那么会将物体的左、下两个 grid 也认为是正样本** + +
+image +
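+
+结合上图,“四舍五入找最近两个网格”的过程可以用下面的示意代码理解(仅为简化示意,非 MMYOLO 实际源码,未考虑网格越界等边界情况):
+
+```python
+import torch
+
+
+def extra_grids_demo(gt_center_grid):
+    """根据中心点在网格内的偏移,取出额外两个相邻网格。
+
+    gt_center_grid: 中心点在特征图网格坐标系下的坐标,例如 (3.25, 4.625)
+    """
+    xy = torch.as_tensor(gt_center_grid, dtype=torch.float32)
+    grid = xy.floor()   # 中心点所在网格,例子中为 (3, 4)
+    frac = xy - grid    # 网格内偏移,例子中为 (0.25, 0.625)
+    # x 方向:偏移 < 0.5 说明更靠近左边网格,否则更靠近右边网格
+    dx = torch.tensor([-1., 0.]) if frac[0] < 0.5 else torch.tensor([1., 0.])
+    # y 方向:偏移 < 0.5 说明更靠近上边网格,否则更靠近下边网格
+    dy = torch.tensor([0., -1.]) if frac[1] < 0.5 else torch.tensor([0., 1.])
+    return torch.stack([grid, grid + dx, grid + dy])
+
+
+# 上文例子 (3.25, 4.625) 会得到 (3, 4) 以及左、下两个网格 (2, 4) 和 (3, 5)
+print(extra_grids_demo((3.25, 4.625)))
+```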
+ +下图展示中心点落到不同位置时的正样本分配情况: + +
+image +
+ +那么 YOLOv5 的 Assign 方式具体带来了哪些改进? + +- 一个 GT Bbox 能够匹配多个 Prior + +- 一个 GT Bbox 和一个Prior 匹配时,能分配 1-3 个正样本 + +- 以上策略能**适度缓解目标检测中常见的正负样本不均衡问题**。 + +而 YOLOv5 中的回归方式,和 Assign 方式是相互呼应的: + +1. 中心点回归方式: + +
+image +
+ +2. WH 回归方式: + +
+image +
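+
+在进入 Loss 设计之前,这里再用一段示意代码(非实际源码)回顾步骤 (1) 的 shape 匹配规则,并复算上文 P3 特征图的例子(以 prior_match_thr=4.0 为例):
+
+```python
+import torch
+
+
+def shape_match_demo(gt_wh, prior_wh, prior_match_thr=4.0):
+    """返回 (M, A) 的 bool 矩阵,True 表示该 GT 与该 Prior 匹配成功。"""
+    ratio = gt_wh[:, None] / prior_wh[None]                 # (M, A, 2) 宽高比 r_w, r_h
+    ratio_max = torch.max(ratio, 1 / ratio).max(dim=-1)[0]  # 先取 max(r, 1/r),再取宽高中较大者
+    return ratio_max < prior_match_thr
+
+
+gt_wh = torch.tensor([[36., 24.]])                            # GT Bbox 宽高
+prior_wh = torch.tensor([[15., 5.], [24., 16.], [16., 24.]])  # P3 层三个 Prior 宽高
+print(shape_match_demo(gt_wh, prior_wh))  # tensor([[False, True, True]]),即 prior2、prior3 匹配
+```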
+ +### 1.4 Loss 设计 + +YOLOv5 中总共包含 3 个 Loss,分别为: + +- Classes loss:使用的是 BCE loss +- Objectness loss:使用的是 BCE loss +- Location loss:使用的是 CIoU loss + +三个 loss 按照一定比例汇总: + +```{math} +Loss=\lambda_1L_{cls}+\lambda_2L_{obj}+\lambda_3L_{loc} +``` + +P3、P4、P5 层对应的 Objectness loss 按照不同权重进行相加,默认的设置是 + +```python +obj_level_weights=[4., 1., 0.4] +``` + +```{math} +L_{obj}=4.0\cdot L_{obj}^{small}+1.0\cdot L_{obj}^{medium}+0.4\cdot L_{obj}^{large} +``` + +在复现中我们发现 YOLOv5 中使用的 CIoU 与目前最新官方 CIoU 存在一定的差距,差距体现在 alpha 参数的计算。 + +官方版本: + +参考资料:https://github.com/Zzh-tju/CIoU/blob/master/layers/modules/multibox_loss.py#L53-L55 + +```python +alpha = (ious > 0.5).float() * v / (1 - ious + v) +``` + +YOLOv5 版本: + +```python +alpha = v / (v - ious + (1 + eps)) +``` + +这是一个有趣的细节,后续需要测试不同 alpha 计算方式情况下带来的精度差距。 + +### 1.5 优化策略和训练过程 + +YOLOv5 对每个优化器的参数组进行非常精细的控制,简单来说包括如下部分。 + +#### 1.5.1 优化器分组 + +将优化参数分成 Conv/Bias/BN 三组,在 WarmUp 阶段,不同组采用不同的 lr 以及 momentum 更新曲线。 +同时在 WarmUp 阶段采用的是 iter-based 更新策略,而在非 WarmUp 阶段则变成 epoch-based 更新策略,可谓是 trick 十足。 + +MMYOLO 中是采用 YOLOv5OptimizerConstructor 优化器构造器实现优化器参数分组。优化器构造器的作用就是对一些特殊的参数组初始化过程进行精细化控制,因此可以很好的满足需求。 + +而不同的参数组采用不同的调度曲线功能则是通过 YOLOv5ParamSchedulerHook 实现。而不同的参数组采用不同的调度曲线功能则是通过 YOLOv5ParamSchedulerHook 实现。 + +#### 1.5.2 weight decay 参数自适应 + +作者针对不同的 batch size 采用了不同的 weight decay 策略,具体来说为: + +1. 当训练 batch size \<= 64 时,weight decay 不变 +2. 当训练 batch size > 64 时,weight decay 会根据总 batch size 进行线性缩放 + +MMYOLO 也是通过 YOLOv5OptimizerConstructor 实现。 + +#### 1.5.3 梯度累加 + +为了最大化不同 batch size 情况下的性能,作者设置总 batch size 小于 64 时候会自动开启梯度累加功能。 + +训练过程和大部分 YOLO 类似,包括如下策略: + +1. 没有使用预训练权重 +2. 没有采用多尺度训练策略,同时可以开启 cudnn.benchmark 进一步加速训练 +3. 使用了 EMA 策略平滑模型 +4. 默认采用 AMP 自动混合精度训练 + +需要特意说明的是:YOLOv5 官方对于 small 模型是采用单卡 v100 训练,bs 为 128,而 m/l/x 等是采用不同数目的多卡实现的, +这种训练策略不太规范,**为此在 MMYOLO 中全部采用了 8 卡,每卡 16 bs 的设置,同时为了避免性能差异,训练时候开启了 SyncBN**。 + +### 1.6 推理和后处理过程 + +YOLOv5 后处理过程和 YOLOv3 非常类似,实际上 YOLO 系列的后处理逻辑都是类似的。 + +#### 1.6.1 核心控制参数 + +1. **multi_label** + +对于多类别预测来说需要考虑是否是多标签任务,也就是同一个预测位置会预测的多个类别概率,和是否当作单类处理。因为 YOLOv5 采用 sigmoid 预测模式,在考虑多标签情况下可能会出现一个物体检测出两个不同类别的框,这有助于评估指标 mAP,但是不利于实际应用。 +因此在需要算评估指标时候 multi_label 是 True,而推理或者实际应用时候是 False + +2. **score_thr 和 nms_thr** + +score_thr 阈值用于过滤类别分值,低于分值的检测框当做背景处理,nms_thr 是 nms 时阈值。同样的,在计算评估指标 mAP 阶段可以将 score_thr 设置的非常低,这通常能够提高召回率,从而提升 mAP,但是对于实际应用来说没有意义,且会导致推理过程极慢。为此在测试和推理阶段会设置不同的阈值 + +3. **nms_pre 和 max_per_img** + +nms_pre 表示 nms 前的最大保留检测框数目,这通常是为了防止 nms 运行时候输入框过多导致速度过慢问题,默认值是 30000。 +max_per_img 表示最终保留的最大检测框数目,通常设置为 300。 + +以 COCO 80 类为例,假设输入图片大小为 640x640 + +
+image +
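+
+图中涉及的预测框数目与输出通道数可以用几行代码快速核对(仅为数值推算示意):
+
+```python
+# 以 640x640 输入、COCO 80 类为例
+num_classes, num_anchors = 80, 3
+featmap_sizes = [(80, 80), (40, 40), (20, 20)]  # 对应 stride 8/16/32
+
+num_priors = num_anchors * sum(h * w for h, w in featmap_sizes)
+print(num_priors)  # 25200 = 3 x (6400 + 1600 + 400)
+
+out_channels = num_anchors * (4 + 1 + num_classes)
+print(out_channels)  # 255,即非解耦 Head 每个输出层的通道数
+```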
+ +其推理和后处理过程为: + +**(1) 维度变换** + +YOLOv5 输出特征图尺度为 80x80、40x40 和 20x20 的三个特征图,每个位置共 3 个 anchor,因此输出特征图通道为 3x(5+80)=255。 +YOLOv5 是非解耦输出头,而其他大部分算法都是解耦输出头,为了统一后处理逻辑,我们提前将其进行解耦,分成了类别预测分支、bbox 预测分支和 obj 预测分支。 + +将三个不同尺度的类别预测分支、bbox 预测分支和 obj 预测分支进行拼接,并进行维度变换。为了后续方便处理,会将原先的通道维度置换到最后,类别预测分支、bbox 预测分支和 obj 预测分支的 shape 分别为 (b, 3x80x80+3x40x40+3x20x20, 80)=(b,25200,80),(b,25200,4),(b,25200,1)。 + +**(2) 解码还原到原图尺度** + +分类预测分支和 obj 分支需要进行 sigmoid 计算,而 bbox 预测分支需要进行解码,还原为真实的原图解码后 xyxy 格式 + +**(3) 第一次阈值过滤** + +遍历 batch 中的每张图,然后用 score_thr 对类别预测分值进行阈值过滤,去掉低于 score_thr 的预测结果 + +**(4) 第二次阈值过滤** + +将 obj 预测分值和过滤后的类别预测分值相乘,然后依然采用 score_thr 进行阈值过滤。 +在这过程中还需要考虑 **multi_label 和 nms_pre,确保过滤后的检测框数目不会多于 nms_pre**。 + +**(5) 还原到原图尺度和 nms** + +基于前处理过程,将剩下的检测框还原到网络输出前的原图尺度,然后进行 nms 即可。最终输出的检测框不能多于 **max_per_img**。 + +#### 1.6.2 batch shape 策略 + +为了加速验证集的推理过程,作者提出了 batch shape 策略,其核心原则是:**确保在 batch 推理过程中同一个 batch 内的图片 pad 像素最少,不要求整个验证过程中所有 batch 的图片尺度一样**。 + +其大概流程是:将整个测试或者验证数据的宽高比进行排序,然后依据 batch 设置将排序后的图片组成一个 batch, +同时计算这个 batch 内最佳的 batch shape,防止 pad 像素过多。最佳 batch shape 计算原则为在保持宽高比的情况下进行 pad,不追求正方形图片输出。 + +```python + image_shapes = [] + for data_info in data_list: + image_shapes.append((data_info['width'], data_info['height'])) + + image_shapes = np.array(image_shapes, dtype=np.float64) + + n = len(image_shapes) # number of images + batch_index = np.floor(np.arange(n) / self.batch_size).astype( + np.int64) # batch index + number_of_batches = batch_index[-1] + 1 # number of batches + + aspect_ratio = image_shapes[:, 1] / image_shapes[:, 0] # aspect ratio + irect = aspect_ratio.argsort() + + data_list = [data_list[i] for i in irect] + + aspect_ratio = aspect_ratio[irect] + # Set training image shapes + shapes = [[1, 1]] * number_of_batches + for i in range(number_of_batches): + aspect_ratio_index = aspect_ratio[batch_index == i] + min_index, max_index = aspect_ratio_index.min( + ), aspect_ratio_index.max() + if max_index < 1: + shapes[i] = [max_index, 1] + elif min_index > 1: + shapes[i] = [1, 1 / min_index] + + batch_shapes = np.ceil( + np.array(shapes) * self.img_size / self.size_divisor + + self.pad).astype(np.int64) * self.size_divisor + + for i, data_info in enumerate(data_list): + data_info['batch_shape'] = batch_shapes[batch_index[i]] +``` + +## 2 总结 + +本文对 YOLOv5 原理和在 MMYOLO 实现进行了详细解析,希望能帮助用户理解算法实现过程。同时请注意:由于 YOLOv5 本身也在不断更新,本开源库也会不断迭代,请及时阅读和同步最新版本。 diff --git a/third_party/mmyolo/docs/zh_cn/recommended_topics/algorithm_descriptions/yolov6_description.md b/third_party/mmyolo/docs/zh_cn/recommended_topics/algorithm_descriptions/yolov6_description.md new file mode 100644 index 0000000000000000000000000000000000000000..29bede362ba31137cb0a78839ebbeb79de822ef3 --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/recommended_topics/algorithm_descriptions/yolov6_description.md @@ -0,0 +1,452 @@ +# YOLOv6 原理和实现全解析 + +## 0 简介 + +
+YOLOv6-S +图 1:YOLOv6-S 模型结构 +
+ +
+YOLOv6-L +图 2:YOLOv6-L 模型结构 +
+ +以上结构图由 wzr-skn@github 绘制。 + +YOLOv6 提出了一系列适用于各种工业场景的模型,包括 N/T/S/M/L,考虑到模型的大小,其架构有所不同,以获得更好的精度-速度权衡。本算法专注于检测的精度和推理效率,并在网络结构、训练策略等算法层面进行了多项改进和优化。 + +简单来说 YOLOv6 开源库的主要特点为: + +1. 统一设计了更高效的 Backbone 和 Neck:受到硬件感知神经网络设计思想的启发,基于 RepVGG style 设计了可重参数化、更高效的骨干网络 EfficientRep Backbone 和 Rep-PAN Neck。 +2. 相比于 YOLOX 的 Decoupled Head,进一步优化设计了简洁有效的 Efficient Decoupled Head,在维持精度的同时,降低了一般解耦头带来的额外延时开销。 +3. 在训练策略上,采用 Anchor-free 的策略,同时辅以 SimOTA 标签分配策略以及 SIoU 边界框回归损失来进一步提高检测精度。 + +本文将从 YOLOv6 算法本身原理讲起,然后重点分析 MMYOLO 中的实现。关于 YOLOv6 的使用指南和速度等对比请阅读本文的后续内容。 + +希望本文能够成为你入门和掌握 YOLOv6 的核心文档。由于 YOLOv6 本身也在不断迭代更新,我们也会不断的更新本文档。请注意阅读最新版本。 + +MMYOLO 实现配置:https://github.com/open-mmlab/mmyolo/blob/main/configs/yolov6/ + +YOLOv6 官方开源库地址:https://github.com/meituan/YOLOv6 + +## 1 YOLOv6 2.0 算法原理和 MMYOLO 实现解析 + +YOLOv6 2.0 官方 release 地址:https://github.com/meituan/YOLOv6/releases/tag/0.2.0 + +
+YOLOv6精度图 +
+ +
+YOLOv6精度速度图 +
+ +YOLOv6 和 YOLOv5 一样也可以分成数据增强、模型结构、loss 计算等组件,如下所示: + +
+训练测试策略 +
+ +下面将从原理和结合 MMYOLO 的具体实现方面进行简要分析。 + +### 1.1 数据增强模块 + +YOLOv6 目标检测算法中使用的数据增强与 YOLOv5 基本一致,唯独不一样的是没有使用 Albu 的数据增强方式: + +- **Mosaic 马赛克** +- **RandomAffine 随机仿射变换** +- **MixUp** +- ~~**图像模糊等采用 Albu 库实现的变换**~~ +- **HSV 颜色空间增强** +- **随机水平翻转** + +关于每一个增强的详细解释,详情请看 [YOLOv5 数据增强模块](yolov5_description.md) + +另外,YOLOv6 参考了 YOLOX 的数据增强方式,分为 2 种增强方法组,一开始和 YOLOv5 一致,但是在最后 15 个 epoch 的时候将 `Mosaic` 使用 `YOLOv5KeepRatioResize` + `LetterResize` 替代了,个人感觉是为了拟合真实情况。 + +### 1.2 网络结构 + +YOLOv6 N/T/S 模型的网络结构由 `EfficientRep` + `Rep-PAN` + `Efficient decoupled Head` 构成,M/L 模型的网络结构则由 `CSPBep` + `CSPRepPAFPN` + `Efficient decoupled Head` 构成。其中,Backbone 和 Neck 部分的结构与 YOLOv5 较为相似,但不同的是其采用了重参数化结构 `RepVGG Block` 替换掉了原本的 `ConvModule`,在此基础上,将 `CSPLayer` 改进为了多个 `RepVGG` 堆叠的 `RepStageBlock`(N/T/S 模型)或 `BepC3StageBlock`(M/L 模型);Head 部分则参考了 FCOS 和 YOLOX 的检测头,将回归与分类分支解耦成两个分支进行预测。YOLOv6-S 和 YOLOv6-L 整体结构分别如图 1 和图 2 所示。 + +#### 1.2.1 Backbone + +已有研究表明,多分支的网络结构通常比单分支网络性能更加优异,例如 YOLOv5 的 `CSPDarknet`,但是这种结构会导致并行度降低进而增加推理延时;相反,类似于 `VGG` 的单分支网络则具有并行度高、内存占用小的优点,因此推理效率更高。而 `RepVGG` 则同时具备上述两种结构的优点,在训练时可解耦成多分支拓扑结构提升模型精度,实际部署时可等效融合为单个 3×3 卷积提升推理速度,`RepVGG` 示意图如下。因此,YOLOv6 基于 `RepVGG` 重参数化结构设计了高效的骨干网络 `EfficientRep` 和 `CSPBep`,其可以充分利用硬件算力,提升模型表征能力的同时降低推理延时。 + +image + +在 N/T/S 模型中,YOLOv6 使用了 `EfficientRep` 作为骨干网络,其包含 1 个 `Stem Layer` 和 4 个 `Stage Layer`,具体细节如下: + +- `Stem Layer` 中采用 stride=2 的 `RepVGGBlock` 替换了 stride=2 的 6×6 `ConvModule`。 +- `Stage Layer` 结构与 YOLOv5 基本相似,将每个 `Stage layer` 的 1 个 `ConvModule` 和 1 个 `CSPLayer` 分别替换为 1 个 `RepVGGBlock` 和 1 个 `RepStageBlock`,如上图 Details 部分所示。其中,第一个 `RepVGGBlock` 会做下采样和 `Channel` 维度变换,而每个 `RepStageBlock` 则由 n 个 `RepVGGBlock` 组成。此外,仍然在第 4 个 `Stage Layer` 最后增加 `SPPF` 模块后输出。 + +在 M/L 模型中,由于模型容量进一步增大,直接使用多个 `RepVGGBlock` 堆叠的 `RepStageBlock` 结构计算量和参数量呈现指数增长。因此,为了权衡计算负担和模型精度,在 M/L 模型中使用了 `CSPBep` 骨干网络,其采用 `BepC3StageBlock` 替换了小模型中的 `RepStageBlock` 。如下图所示,`BepC3StageBlock` 由 3 个 1×1 的 `ConvModule` 和多个子块(每个子块由两个 `RepVGGBlock` 残差连接)组成。 + +image + +#### 1.2.2 Neck + +Neck 部分结构仍然在 YOLOv5 基础上进行了模块的改动,同样采用 `RepStageBlock` 或 `BepC3StageBlock` 对原本的 `CSPLayer` 进行了替换,需要注意的是,Neck 中 `Down Sample` 部分仍然使用了 stride=2 的 3×3 `ConvModule`,而不是像 Backbone 一样替换为 `RepVGGBlock`。 + +#### 1.2.3 Head + +不同于传统的 YOLO 系列检测头,YOLOv6 参考了 FCOS 和 YOLOX 中的做法,将分类和回归分支解耦成两个分支进行预测并且去掉了 obj 分支。同时,采用了 hybrid-channel 策略构建了更高效的解耦检测头,将中间 3×3 的 `ConvModule` 减少为 1 个,在维持精度的同时进一步减少了模型耗费,降低了推理延时。此外,需要说明的是,YOLOv6 在 Backobone 和 Neck 部分使用的激活函数是 `ReLU`,而在 Head 部分则使用的是 `SiLU`。 + +由于 YOLOv6 是解耦输出,分类和 bbox 检测通过不同卷积完成。以 COCO 80 类为例: + +- P5 模型在输入为 640x640 分辨率情况下,其 Head 模块输出的 shape 分别为 `(B,4,80,80)`, `(B,80,80,80)`, `(B,4,40,40)`, `(B,80,40,40)`, `(B,4,20,20)`, `(B,80,20,20)`。 + +### 1.3 正负样本匹配策略 + +YOLOv6 采用的标签匹配策略与 [TOOD](https://arxiv.org/abs/2108.07755) +相同, 前 4 个 epoch 采用 `ATSSAssigner` 作为标签匹配策略的 `warm-up` , +后续使用 `TaskAlignedAssigner` 算法选择正负样本, 基于官方开源代码, `MMYOLO` 中也对两个 assigner 算法进行了优化, 改进为 `Batch` 维度进行计算, +能够一定程度的加快速度。 下面会对每个部分进行详细说明。 + +#### 1.3.1 Anchor 设置 + +YOLOv6 采用与 YOLOX 一样的 Anchor-free 无锚范式,省略了聚类和繁琐的 Anchor 超参设定,泛化能力强,解码逻辑简单。在训练的过程中会根据 feature size 去自动生成先验框。 + +使用 `mmdet.MlvlPointGenerator` 生成 anchor points。 + +```python +prior_generator: ConfigType = dict( + type='mmdet.MlvlPointGenerator', + offset=0.5, # 网格中心点 + strides=[8, 16, 32]) , + +# 调用生成多层 anchor points: list[torch.Tensor] +# 每一层都是 (featrue_h*feature_w,4), 4 表示 (x,y,stride_h,stride_w) +self.mlvl_priors = self.prior_generator.grid_priors( + self.featmap_sizes, + with_stride=True) +``` + +#### 1.3.2 Bbox 编解码过程 + +YOLOv6 的 BBox Coder 采用的是 `DistancePointBBoxCoder`。 + +网络 bbox 预测的值为 
(top, bottom, left, right),解码器将 `anchor point` 通过四个距离解码到坐标 (x1,y1,x2,y2)。 + +MMYOLO 中解码的核心源码: + +```python +def decode(points: torch.Tensor, pred_bboxes: torch.Tensor, stride: torch.Tensor) -> torch.Tensor: + """ + 将预测值解码转化 bbox 的 xyxy + points (Tensor): 生成的 anchor point [x, y],Shape (B, N, 2) or (N, 2). + pred_bboxes (Tensor): 预测距离四边的距离。(left, top, right, bottom). Shape (B, N, 4) or (N, 4) + stride (Tensor): 特征图下采样倍率. + """ + # 首先将预测值转化为原图尺度 + distance = pred_bboxes * stride[None, :, None] + # 根据点以及到四条边距离转为 bbox 的 x1y1x2y2 + x1 = points[..., 0] - distance[..., 0] + y1 = points[..., 1] - distance[..., 1] + x2 = points[..., 0] + distance[..., 2] + y2 = points[..., 1] + distance[..., 3] + + bboxes = torch.stack([x1, y1, x2, y2], -1) + + return bboxes +``` + +#### 1.3.3 匹配策略 + +- 0 \<= epoch \< 4,使用 `BatchATSSAssigner` +- epoch >= 4,使用 `BatchTaskAlignedAssigner` + +#### ATSSAssigner + +ATSSAssigner 是 [ATSS](https://arxiv.org/abs/1912.02424) 中提出的标签匹配策略。 +ATSS 的匹配策略简单总结为:**通过中心点距离先验对样本进行初筛,然后自适应生成 IoU 阈值筛选正样本。** +YOLOv6 的实现种主要包括如下三个核心步骤: + +1. 因为 YOLOv6 是 Anchor-free,所以首先将 `anchor point` 转化为大小为 `5*strdie` 的 `anchor`。 +2. 对于每一个 `GT`,在 `FPN` 的每一个特征层上, 计算与该层所有 `anchor` 中心点距离(位置先验), + 然后优先选取距离 `topK` 近的样本,作为 **初筛样本**。 +3. 对于每一个 `GT`,计算其 **初筛样本** 的 `IoU` 的均值 `mean`与标准差 `std`,将 `mean + std` + 作为该 `GT` 的正样本的 **自适应 IoU 阈值** ,大于该 **自适应阈值** 且中心点在 `GT` 内部的 `anchor` + 才作为正样本,使得样本能够被 `assign` 到合适的 `FPN` 特征层上。 + +下图中,(a) 所示中等大小物体被 assign 到 FPN 的中层,(b) 所示偏大的物体被 assign 到 FPN 中检测大物体和偏大物体的两个层。 + +
+image +
+ +```python +# 1. 首先将anchor points 转化为 anchors +# priors为(point_x,point_y,stride_w,stride_h), shape 为(N,4) +cell_half_size = priors[:, 2:] * 2.5 +priors_gen = torch.zeros_like(priors) +priors_gen[:, :2] = priors[:, :2] - cell_half_size +priors_gen[:, 2:] = priors[:, :2] + cell_half_size +priors = priors_gen +# 2. 计算 anchors 与 GT 的 IoU +overlaps = self.iou_calculator(gt_bboxes.reshape([-1, 4]), priors) +# 3. 计算 anchor 与 GT 的中心距离 +distances, priors_points = bbox_center_distance( + gt_bboxes.reshape([-1, 4]), priors) +# 4. 根据中心点距离,在 FPN 的每一层选取 TopK 临近的样本作为初筛样本 +is_in_candidate, candidate_idxs = self.select_topk_candidates( + distances, num_level_priors, pad_bbox_flag) +# 5. 对于每一个 GT 计算其对应初筛样本的均值与标准差的和, 作为该GT的样本阈值 +overlaps_thr_per_gt, iou_candidates = self.threshold_calculator( + is_in_candidate, candidate_idxs, overlaps, num_priors, batch_size, + num_gt) +# 6. 筛选大于阈值的样本作为正样本 +is_pos = torch.where( + iou_candidates > overlaps_thr_per_gt.repeat([1, 1, num_priors]), + is_in_candidate, torch.zeros_like(is_in_candidate)) +# 6. 保证样本中心点在 GT 内部且不超图像边界 +pos_mask = is_pos * is_in_gts * pad_bbox_flag +``` + +#### TaskAlignedAssigner + +TaskAlignedAssigner 是 [TOOD](https://arxiv.org/abs/2108.07755) 中提出的一种动态样本匹配策略。 +由于 `ATSSAssigner` 是属于静态标签匹配策略,其选取正样本的策略主要根据 `anchor` 的位置进行挑选, +并不会随着网络的优化而选取到更好的样本。在目标检测中,分类和回归的任务最终作用于同一个目标,所以 +`TaskAlignedAssigner` 认为样本的选取应该更加关注到对分类以及回归都友好的样本点。 + +`TaskAlignedAssigner` 的匹配策略简单总结为: **根据分类与回归的分数加权的分数选择正样本**。 + +1. 对于每一个 `GT`,对所有的 `预测框` 基于 **GT类别对应分类分数** 与 **预测框与 GT 的 IoU** 的加权得到一个关联分类以及回归的对齐分数 `alignment_metrics`。 +2. 对于每一个 `GT`,直接基于 `alignment_metrics` 对齐分数选取 `topK` 大的作为正样本。 + +因为在网络初期参数随机, `分类分数` 和 `预测框与 GT 的 IoU` 都不准确,所以需要经过前 4 个 `epoch` 的 `ATSSAssigner` +的 `warm-up`。经过预热之后的 `TaskAlignedAssigner` 标签匹配策略就不使用中心距离的先验, +而是直接对每一个`GT` 选取 `alignment_metrics` 中 `topK` 大的样本作为正样本。 + +```python +# 1. 基于分类分数与回归的 IoU 计算对齐分数 alignment_metrics +alignment_metrics = bbox_scores.pow(self.alpha) * overlaps.pow( + self.beta) +# 2. 保证中心点在 GT 内部的 mask +is_in_gts = select_candidates_in_gts(priors, gt_bboxes) +# 3. 选取 TopK 大的对齐分数的样本 +topk_metric = self.select_topk_candidates( + alignment_metrics * is_in_gts, + topk_mask=pad_bbox_flag.repeat([1, 1, self.topk]).bool()) +``` + +### 1.4 Loss 设计 + +参与 Loss 计算的共有两个值:loss_cls 和 loss_bbox,其各自使用的 Loss 方法如下: + +- Classes loss:使用的是 `mmdet.VarifocalLoss` +- BBox loss:l/m/s使用的是 `GIoULoss`, t/n 用的是 `SIoULoss` + +权重比例是:`loss_cls` : `loss_bbox` = `1 : 2.5` + +#### 分类损失函数 VarifocalLoss + +Varifocal Loss (VFL) 是 [VarifocalNet: An IoU-aware Dense Object Detector](https://arxiv.org/abs/2008.13367) +中的损失函数。 + +
+image +
+ +`VFL` 是在 `GFL` 的基础上做的改进,`GFL`详情请看 [GFL详解](rtmdet_description.md) + +在上述标签匹配策略中提到过选择样本应该优先考虑分类回归都友好的样本点, +这是由于目标检测包含的分类与回归两个子任务都是作用于同一个物体。 +与 `GFL` 思想相同,都是将 **预测框与 `GT` 的 `IoU` 软化作为分类的标签**,使得分类分数关联回归质量, +使其在后处理 NMS 阶段有**分类回归一致性很强的分值排序策略**,以达到选取优秀预测框的目的。 + +Varifocal Loss 原本的公式: + +```{math} +{VFL}(p,q)= \begin{cases} +-q(qlog(p) +(1-q)log(1-p)), & q > 0 \\ +-\alpha p^\gamma log(1-p), & q = 0 +\end{cases} +``` + +其中 {math}`q` 是预测 `bboxes` 与 `GT` 的 `IoU`,使用软标签的形式作为分类的标签。 +{math}`p\in[0,1]` 表示分类分数。 + +1. 对于负样本,即当 {math}`q = 0` 时,标准交叉熵部分为 {math}`-\log(p)`,负样本权重使用 {math}`\alpha p^\gamma` 作为 `focal weight` + 使样本聚焦与困难样本上,这与 `Focal Loss` 基本一致。 +2. 对于正样本,即当 {math}`q > 0` 时,首先计算标准二值交叉熵部分 {math}`-(qlog(p) +(1-q)log(1-p))`, + 但是针对正样本的权重设置,`Varifocal Loss` 中并没有采用类似 {math}`\alpha p^\gamma`的方式降权, + 而是认为在网络的学习过程中正样本相对于负样本的学习信号来说更为重要,所以使用了分类的标签 {math}`q`, + 即 `IoU` 作为 `focal weight`, 使得聚焦到具有高质量的样本上。 + +但是 YOLOv6 中的 Varifocal Loss 公式采用 `TOOD` 中的 `Task ALignment Learning (TAL)`, +将预测的 `IoU` 根据之前标签匹配策略中的分类对齐度 `alignment_metrics` 进行了归一化, +得到归一化 {math}`\hat{t}`。 +具体实现方式为: + +对于每一个 `Gt`,找到所有样本中与 `Gt` 最大的 `IoU`,具有最大 `alignment_metrics` 的样本位置的 {math}`\hat{t} = max(Iou)` + +```{math} +\hat{t} = AlignmentMetrics / max(AlignmentMetrics) * max(IoU) +``` + +最终 YOLOv6 分类损失损失函数为: + +```{math} +{VFL}(p,\hat{t})= \begin{cases} +-\hat{t}(\hat{t}log(p) +(1-\hat{t})log(1-p)), & \hat{t} > 0 \\ +-\alpha p^\gamma log(1-p), & \hat{t} = 0 +\end{cases} +``` + +MMDetection 实现源码的核心部分: + +```python +def varifocal_loss(pred, target, alpha=0.75, gamma=2.0, iou_weighted=True): + """ + pred (torch.Tensor): 预测的分类分数,形状为 (B,N,C) , N 表示 anchor 数量, C 表示类别数 + target (torch.Tensor): 经过对齐度归一化后的 IoU 分数,形状为 (B,N,C),数值范围为 0~1 + alpha (float, optional): 调节正负样本之间的平衡因子,默认 0.75. + gamma (float, optional): 负样本 focal 权重因子, 默认 2.0. + iou_weighted (bool, optional): 正样本是否用 IoU 加权 + """ + pred_sigmoid = pred.sigmoid() + target = target.type_as(pred) + if iou_weighted: + # 计算权重,正样本(target > 0)中权重为 target, + # 负样本权重为 alpha*pred_simogid^2 + focal_weight = target * (target > 0.0).float() + \ + alpha * (pred_sigmoid - target).abs().pow(gamma) * \ + (target <= 0.0).float() + else: + focal_weight = (target > 0.0).float() + \ + alpha * (pred_sigmoid - target).abs().pow(gamma) * \ + (target <= 0.0).float() + # 计算二值交叉熵后乘以权重 + loss = F.binary_cross_entropy_with_logits( + pred, target, reduction='none') * focal_weight + loss = weight_reduce_loss(loss, weight, reduction, avg_factor) + return loss +``` + +#### 回归损失函数 GIoU Loss / SIoU Loss + +在 YOLOv6 中,针对不同大小的模型采用了不同的回归损失函数,其中 l/m/s使用的是 `GIoULoss`, t/n 用的是 `SIoULoss`。 + +其中` GIoULoss` 详情请看 [GIoU详解](rtmdet_description.md)。 + +##### SIou Loss + +SIoU 损失函数是 [SIoU Loss: More Powerful Learning for Bounding Box Regression](https://arxiv.org/pdf/2205.12740.pdf) +中提出的度量预测框与 `GT` 的匹配度的指标,由于之前的`GIoU`, `CIoU`, `DIoU` 都没有考虑预测框向 `GT` +框回归的角度,然而角度也确实是回归中一个重要的影响因素,因此提出了全新的`SIoU`。 + +SIoU 损失主要由四个度量方面组成: + +- IoU成本 +- 角度成本 +- 距离成本 +- 形状成本 + +如下图所示,**角度成本** 就是指图中预测框 {math}`B` 向 {math}`B^{GT}` 的回归过程中, +尽可能去使得优化过程中的不确定性因素减少,比如现将图中的角度 {math}`\alpha` 或者 {math}`\beta` +变为 0 ,再去沿着 `x` 轴或者 `y` 轴去回归边界。 + +
+image +
+ +MMYOLO 实现源码的核心部分: + +```python + +def bbox_overlaps(bboxes1, bboxes2, mode='siou', is_aligned=False, eps=1e-6): + # 两个box的顶点x1,y1,x2,y2 + bbox1_x1, bbox1_y1 = pred[:, 0], pred[:, 1] + bbox1_x2, bbox1_y2 = pred[:, 2], pred[:, 3] + bbox2_x1, bbox2_y1 = target[:, 0], target[:, 1] + bbox2_x2, bbox2_y2 = target[:, 2], target[:, 3] + # 交集 + overlap = (torch.min(bbox1_x2, bbox2_x2) - + torch.max(bbox1_x1, bbox2_x1)).clamp(0) * \ + (torch.min(bbox1_y2, bbox2_y2) - + torch.max(bbox1_y1, bbox2_y1)).clamp(0) + # 并集 + w1, h1 = bbox1_x2 - bbox1_x1, bbox1_y2 - bbox1_y1 + w2, h2 = bbox2_x2 - bbox2_x1, bbox2_y2 - bbox2_y1 + union = (w1 * h1) + (w2 * h2) - overlap + eps + # IoU = 交集/并集 + ious = overlap / union + # 最小外界矩的宽高 + enclose_x1y1 = torch.min(pred[:, :2], target[:, :2]) + enclose_x2y2 = torch.max(pred[:, 2:], target[:, 2:]) + enclose_wh = (enclose_x2y2 - enclose_x1y1).clamp(min=0) + enclose_w = enclose_wh[:, 0] # enclose_w + enclose_h = enclose_wh[:, 1] # enclose_h + elif iou_mode == 'siou': + # 1.计算 σ (两个box中心点距离): + # sigma_cw,sigma_ch:上图中cw,ch + sigma_cw = (bbox2_x1 + bbox2_x2) / 2 - (bbox1_x1 + bbox1_x2) / 2 + eps + sigma_ch = (bbox2_y1 + bbox2_y2) / 2 - (bbox1_y1 + bbox1_y2) / 2 + eps + sigma = torch.pow(sigma_cw**2 + sigma_ch**2, 0.5) + + # 2. 在 α 和 β 中选择一个小的角度(小于π/4)去优化 + sin_alpha = torch.abs(sigma_ch) / sigma + sin_beta = torch.abs(sigma_cw) / sigma + sin_alpha = torch.where(sin_alpha <= math.sin(math.pi / 4), sin_alpha, + sin_beta) + + # 角度损失 = 1 - 2 * ( sin^2 ( arcsin(x) - (π / 4) ) ) = cos(2α-π/2) = sin(2α) + # 这里就是角度损失,当 α=0 或者 α=90° 时损失为 0, 当 α=45° 损失为 1 + angle_cost = torch.cos(torch.arcsin(sin_alpha) * 2 - math.pi / 2) + + # 3.这里将角度损失与距离损失进行融合 + # Distance cost = Σ_(t=x,y) (1 - e ^ (- γ ρ_t)) + rho_x = (sigma_cw / enclose_w)**2 # ρ_x:x轴中心点距离距离损失 + rho_y = (sigma_ch / enclose_h)**2 # ρ_y:y轴中心点距离距离损失 + gamma = 2 - angle_cost # γ + # 当 α=0, angle_cost=0, gamma=2, dis_cost_x = 1 - e ^ (-2 p_x),因为 ρ_x>0, 主要优化距离 + # 当 α=45°,angle_cost=1, gamma=1, dis_cost_x = 1 - e ^ (-1* p_x),因为 ρ_x<1, 主要优化角度 + distance_cost = (1 - torch.exp(-1 * gamma * rho_x)) + ( + 1 - torch.exp(-1 * gamma * rho_y)) + + # 4.形状损失 就是两个box之间的宽高比 + # Shape cost = Ω = Σ_(t=w,h) ( ( 1 - ( e ^ (-ω_t) ) ) ^ θ ) + omiga_w = torch.abs(w1 - w2) / torch.max(w1, w2) # ω_w + omiga_h = torch.abs(h1 - h2) / torch.max(h1, h2) # ω_h + shape_cost = torch.pow(1 - torch.exp(-1 * omiga_w), + siou_theta) + torch.pow( + 1 - torch.exp(-1 * omiga_h), siou_theta) + + # 5.综合 IoU、角度、距离以及形状信息 + # SIoU = IoU - ( (Distance Cost + Shape Cost) / 2 ) + ious = ious - ((distance_cost + shape_cost) * 0.5) + + return ious.clamp(min=-1.0, max=1.0) + +@weighted_loss +def siou_loss(pred, target, eps=1e-7): + sious = bbox_overlaps(pred, target, mode='siou', is_aligned=True, eps=eps) + loss = 1 - sious + return loss +``` + +#### Object Loss + +在 YOLOv6 中,由于额外的置信度预测头可能与 `Aligned Head` 有所冲突,经实验验证在不同大小的模型上也都有掉点, +所以最后选择弃用 `Objectness` 分支。 + +### 1.5 优化策略和训练过程 + +#### 1.5.1 优化器分组 + +与 YOLOv5 一致,详情请看 [YOLOv5 优化器分组](yolov5_description.md) + +#### 1.5.2 weight decay 参数自适应 + +与 YOLOv5 一致,详情请看 [YOLOv5 weight decay 参数自适应](yolov5_description.md) + +### 1.6 推理和后处理过程 + +YOLOv6 后处理过程和 YOLOv5 高度类似,实际上 YOLO 系列的后处理逻辑都是类似的。 +详情请看 [YOLOv5 推理和后处理过程](yolov5_description.md) + +## 2 总结 + +本文对 YOLOv6 原理和在 MMYOLO 实现进行了详细解析,希望能帮助用户理解算法实现过程。同时请注意:由于 YOLOv6 本身也在不断更新,本开源库也会不断迭代,请及时阅读和同步最新版本。 diff --git a/third_party/mmyolo/docs/zh_cn/recommended_topics/algorithm_descriptions/yolov8_description.md 
b/third_party/mmyolo/docs/zh_cn/recommended_topics/algorithm_descriptions/yolov8_description.md new file mode 100644 index 0000000000000000000000000000000000000000..fb5e218db61442b302400a6df1c024ad00c77ff1 --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/recommended_topics/algorithm_descriptions/yolov8_description.md @@ -0,0 +1,244 @@ +# YOLOv8 原理和实现全解析 + +## 0 简介 + +
+YOLOv8-P5_structure +图 1:YOLOv8-P5 模型结构 +
+ +以上结构图由 RangeKing@github 绘制。 + +YOLOv8 是 Ultralytics 公司在 2023 年 1月 10 号开源的 YOLOv5 的下一个重大更新版本,目前支持图像分类、物体检测和实例分割任务,在还没有开源时就收到了用户的广泛关注。 + +按照官方描述,YOLOv8 是一个 SOTA 模型,它建立在以前 YOLO 版本的成功基础上,并引入了新的功能和改进,以进一步提升性能和灵活性。具体创新包括一个新的骨干网络、一个新的 Ancher-Free 检测头和一个新的损失函数,可以在从 CPU 到 GPU 的各种硬件平台上运行。 +不过 Ultralytics 并没有直接将开源库命名为 YOLOv8,而是直接使用 Ultralytics 这个词,原因是 Ultralytics 将这个库定位为算法框架,而非某一个特定算法,一个主要特点是可扩展性。其希望这个库不仅仅能够用于 YOLO 系列模型,而是能够支持非 YOLO 模型以及分类分割姿态估计等各类任务。 +总而言之,Ultralytics 开源库的两个主要优点是: + +- **融合众多当前 SOTA 技术于一体** +- **未来将支持其他 YOLO 系列以及 YOLO 之外的更多算法** + +
+YOLOv8-table +图 2:YOLOv8 性能曲线 +
+ +下表为官方在 COCO Val 2017 数据集上测试的 mAP、参数量和 FLOPs 结果。可以看出 YOLOv8 相比 YOLOv5 精度提升非常多,但是 N/S/M 模型相应的参数量和 FLOPs 都增加了不少,从上图也可以看出相比 YOLOV5 大部分模型推理速度变慢了。 + +| **模型** | **YOLOv5** | **params(M)** | **FLOPs@640 (B)** | **YOLOv8** | **params(M)** | **FLOPs@640 (B)** | +| -------- | ----------- | ------------- | ----------------- | ----------- | ------------- | ----------------- | +| n | 28.0(300e) | 1.9 | 4.5 | 37.3 (500e) | 3.2 | 8.7 | +| s | 37.4 (300e) | 7.2 | 16.5 | 44.9 (500e) | 11.2 | 28.6 | +| m | 45.4 (300e) | 21.2 | 49.0 | 50.2 (500e) | 25.9 | 78.9 | +| l | 49.0 (300e) | 46.5 | 109.1 | 52.9 (500e) | 43.7 | 165.2 | +| x | 50.7 (300e) | 86.7 | 205.7 | 53.9 (500e) | 68.2 | 257.8 | + +额外提一句,现在各个 YOLO 系列改进算法都在 COCO 上面有明显性能提升,但是在自定义数据集上面的泛化性还没有得到广泛验证,至今依然听到不少关于 YOLOv5 泛化性能较优异的说法。**对各系列 YOLO 泛化性验证也是 MMYOLO 中一个特别关心和重点发力的方向。** + +阅读本文前,如果你对 YOLOv5、YOLOv6 和 RTMDet 不熟悉,可以先看下如下文档: + +1. [YOLOv5 原理和实现全解析](https://mmyolo.readthedocs.io/zh_CN/latest/algorithm_descriptions/yolov5_description.html) +2. [YOLOv6 原理和实现全解析](https://mmyolo.readthedocs.io/zh_CN/latest/algorithm_descriptions/yolov6_description.html) +3. [RTMDet 原理和实现全解析](https://mmyolo.readthedocs.io/zh_CN/latest/algorithm_descriptions/rtmdet_description.html) + +## 1 YOLOv8 概述 + +YOLOv8 算法的核心特性和改动可以归结为如下: + +1. **提供了一个全新的 SOTA 模型,包括 P5 640 和 P6 1280 分辨率的目标检测网络和基于 YOLACT 的实例分割模型。和 YOLOv5 一样,基于缩放系数也提供了 N/S/M/L/X 尺度的不同大小模型,用于满足不同场景需求** +2. **骨干网络和 Neck 部分可能参考了 YOLOv7 ELAN 设计思想,将 YOLOv5 的 C3 结构换成了梯度流更丰富的 C2f 结构,并对不同尺度模型调整了不同的通道数,属于对模型结构精心微调,不再是无脑一套参数应用所有模型,大幅提升了模型性能。不过这个 C2f 模块中存在 Split 等操作对特定硬件部署没有之前那么友好了** +3. **Head 部分相比 YOLOv5 改动较大,换成了目前主流的解耦头结构,将分类和检测头分离,同时也从 Anchor-Based 换成了 Anchor-Free** +4. **Loss 计算方面采用了 TaskAlignedAssigner 正样本分配策略,并引入了 Distribution Focal Loss** +5. **训练的数据增强部分引入了 YOLOX 中的最后 10 epoch 关闭 Mosiac 增强的操作,可以有效地提升精度** + +从上面可以看出,YOLOv8 主要参考了最近提出的诸如 YOLOX、YOLOv6、YOLOv7 和 PPYOLOE 等算法的相关设计,本身的创新点不多,偏向工程实践,主推的还是 ultralytics 这个框架本身。 + +下面将按照模型结构设计、Loss 计算、训练数据增强、训练策略和模型推理过程共 5 个部分详细介绍 YOLOv8 目标检测的各种改进,实例分割部分暂时不进行描述。 + +## 2 模型结构设计 + +模型完整图示可以看图 1。 + +在暂时不考虑 Head 情况下,对比 YOLOv5 和 YOLOv8 的 yaml 配置文件可以发现改动较小。 + +
+yaml +图 3:YOLOv5 和 YOLOv8 YAML 文件对比 +
+ +左侧为 YOLOv5-s,右侧为 YOLOv8-s + +骨干网络和 Neck 的具体变化为: + +- 第一个卷积层的 kernel 从 6x6 变成了 3x3 +- 所有的 C3 模块换成 C2f,结构如下所示,可以发现多了更多的跳层连接和额外的 Split 操作 + +
+module +图 4:YOLOv5 和 YOLOv8 模块对比 +
+ +- 去掉了 Neck 模块中的 2 个卷积连接层 +- Backbone 中 C2f 的 block 数从 3-6-9-3 改成了 3-6-6-3 +- 查看 N/S/M/L/X 等不同大小模型,可以发现 N/S 和 L/X 两组模型只是改了缩放系数,但是 S/M/L 等骨干网络的通道数设置不一样,没有遵循同一套缩放系数。如此设计的原因应该是同一套缩放系数下的通道设置不是最优设计,YOLOv7 网络设计时也没有遵循一套缩放系数作用于所有模型 + +Head 部分变化最大,从原先的耦合头变成了解耦头,并且从 YOLOv5 的 Anchor-Based 变成了 Anchor-Free。其结构如下所示: + +
+head +图 5:YOLOv8 Head 结构 +
+ +可以看出,不再有之前的 objectness 分支,只有解耦的分类和回归分支,并且其回归分支使用了 Distribution Focal Loss 中提出的积分形式表示法。 + +## 3 Loss 计算 + +Loss 计算过程包括 2 个部分: 正负样本分配策略和 Loss 计算。 +现代目标检测器大部分都会在正负样本分配策略上面做文章,典型的如 YOLOX 的 simOTA、TOOD 的 TaskAlignedAssigner 和 RTMDet 的 DynamicSoftLabelAssigner,这类 Assigner 大都是动态分配策略,而 YOLOv5 采用的依然是静态分配策略。考虑到动态分配策略的优异性,YOLOv8 算法中则直接引用了 TOOD 的 TaskAlignedAssigner。 +TaskAlignedAssigner 的匹配策略简单总结为: 根据分类与回归的分数加权的分数选择正样本。 + +```{math} +t=s^\alpha+u^\beta +``` + +`s` 是标注类别对应的预测分值,`u` 是预测框和 gt 框的 iou,两者相乘就可以衡量对齐程度。 + +1. 对于每一个 GT,对所有的预测框基于 GT 类别对应分类分数,预测框与 GT 的 IoU 的加权得到一个关联分类以及回归的对齐分数 `alignment_metrics` +2. 对于每一个 GT,直接基于 `alignment_metrics` 对齐分数选取 topK 大的作为正样本 + +Loss 计算包括 2 个分支: **分类和回归分支,没有了之前的 objectness 分支**。 + +- 分类分支依然采用 BCE Loss +- 回归分支需要和 Distribution Focal Loss 中提出的积分形式表示法绑定,因此使用了 Distribution Focal Loss, 同时还使用了 CIoU Loss + +3 个 Loss 采用一定权重比例加权即可。 + +## 4 训练数据增强 + +数据增强方面和 YOLOv5 差距不大,只不过引入了 YOLOX 中提出的最后 10 个 epoch 关闭 Mosaic 的操作。假设训练 epoch 是 500,其示意图如下所示: + +
+head +图 6:pipeline +
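+
+在 MMYOLO / MMDetection 体系中,“最后若干个 epoch 关闭 Mosaic”通常可以借助 PipelineSwitchHook 在指定 epoch 切换训练流水线来实现。下面是一个示意配置(字段取值与第二阶段流水线内容仅为示例,具体写法请以所用版本的官方配置为准):
+
+```python
+# 第二阶段流水线:去掉 Mosaic/MixUp,仅保留基础加载与弱增强(内容仅为示意)
+train_pipeline_stage2 = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    # ... 此处省略 Resize、随机翻转、打包等常规步骤
+]
+
+max_epochs = 500
+close_mosaic_epochs = 10
+
+custom_hooks = [
+    dict(
+        type='mmdet.PipelineSwitchHook',
+        # 最后 10 个 epoch 切换到不含 Mosaic 的流水线
+        switch_epoch=max_epochs - close_mosaic_epochs,
+        switch_pipeline=train_pipeline_stage2)
+]
+```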
+ +考虑到不同模型应该采用的数据增强强度不一样,因此对于不同大小模型,有部分超参会进行修改,典型的如大模型会开启 MixUp 和 CopyPaste。数据增强后典型效果如下所示: + +
+head +图 7:results +
+ +上述效果可以运行 [browse_dataset](https://github.com/open-mmlab/mmyolo/blob/dev/tools/analysis_tools/browse_dataset.py) 脚本得到。由于每个 pipeline 都是比较常规的操作,本文不再赘述。如果想了解每个 pipeline 的细节,可以查看 MMYOLO 中 [YOLOv5 的算法解析文档](https://mmyolo.readthedocs.io/zh_CN/latest/algorithm_descriptions/yolov5_description.html#id2) 。 + +## 5 训练策略 + +YOLOv8 的训练策略和 YOLOv5 没有啥区别,最大区别就是**模型的训练总 epoch 数从 300 提升到了 500**,这也导致训练时间急剧增加。以 YOLOv8-S 为例,其训练策略汇总如下: + +| 配置 | YOLOv8-s P5 参数 | +| ---------------------- | ------------------------------- | +| optimizer | SGD | +| base learning rate | 0.01 | +| Base weight decay | 0.0005 | +| optimizer momentum | 0.937 | +| batch size | 128 | +| learning rate schedule | linear | +| training epochs | **500** | +| warmup iterations | max(1000,3 * iters_per_epochs) | +| input size | 640x640 | +| EMA decay | 0.9999 | + +## 6 模型推理过程 + +YOLOv8 的推理过程和 YOLOv5 几乎一样,唯一差别在于前面需要对 Distribution Focal Loss 中的积分表示 bbox 形式进行解码,变成常规的 4 维度 bbox,后续计算过程就和 YOLOv5 一样了。 + +以 COCO 80 类为例,假设输入图片大小为 640x640,MMYOLO 中实现的推理过程示意图如下所示: + +
+head +图 8:results +
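+
+上文提到的“对积分形式的 bbox 表示进行解码”可以用下面的示意代码理解(假设每条边离散为 reg_max=16 个 bin,非 MMYOLO 实际源码;实现上等价于文中所说的 Softmax + 固定权重 Conv 计算):
+
+```python
+import torch
+import torch.nn.functional as F
+
+
+def dfl_integral_demo(reg_pred, reg_max=16):
+    """把 DFL 的离散分布预测转换为 4 条边的距离(特征图尺度)。
+
+    reg_pred: (N, 4 * reg_max) 回归分支原始输出
+    返回:    (N, 4) 每个位置到四条边的预测距离
+    """
+    n = reg_pred.shape[0]
+    prob = F.softmax(reg_pred.reshape(n, 4, reg_max), dim=-1)  # 每条边一个离散分布
+    bins = torch.arange(reg_max, dtype=prob.dtype)             # 0, 1, ..., reg_max-1
+    return (prob * bins).sum(dim=-1)                           # 分布的期望即预测距离
+```
+
+得到 4 个距离后,再结合 anchor point 解码为 xyxy 形式,后续流程与 YOLOv5 一致。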
+ +其推理和后处理过程为: + +**(1) bbox 积分形式转换为 4d bbox 格式** + +对 Head 输出的 bbox 分支进行转换,利用 Softmax 和 Conv 计算将积分形式转换为 4 维 bbox 格式 + +**(2) 维度变换** + +YOLOv8 输出特征图尺度为 `80x80`、`40x40` 和 `20x20` 的三个特征图。Head 部分输出分类和回归共 6 个尺度的特征图。 +将 3 个不同尺度的类别预测分支、bbox 预测分支进行拼接,并进行维度变换。为了后续方便处理,会将原先的通道维度置换到最后,类别预测分支 和 bbox 预测分支 shape 分别为 (b, 80x80+40x40+20x20, 80)=(b,8400,80),(b,8400,4)。 + +**(3) 解码还原到原图尺度** + +分类预测分支进行 Sigmoid 计算,而 bbox 预测分支需要进行解码,还原为真实的原图解码后 xyxy 格式。 + +**(4) 阈值过滤** + +遍历 batch 中的每张图,采用 `score_thr` 进行阈值过滤。在这过程中还需要考虑 **multi_label 和 nms_pre,确保过滤后的检测框数目不会多于 nms_pre。** + +**(5) 还原到原图尺度和 nms** + +基于前处理过程,将剩下的检测框还原到网络输出前的原图尺度,然后进行 nms 即可。最终输出的检测框不能多于 **max_per_img。** + +有一个特别注意的点:**YOLOv5 中采用的 Batch shape 推理策略,在 YOLOv8 推理中暂时没有开启,不清楚后面是否会开启,在 MMYOLO 中快速测试了下,如果开启 Batch shape 会涨大概 0.1~0.2。** + +## 7 特征图可视化 + +MMYOLO 中提供了一套完善的特征图可视化工具,可以帮助用户可视化特征的分布情况。 为了和官方性能对齐,此处依然采用官方权重进行可视化。 + +以 YOLOv8-s 模型为例,第一步需要下载官方权重,然后将该权重通过 [yolov8_to_mmyolo](https://github.com/open-mmlab/mmyolo/blob/dev/tools/model_converters/yolov8_to_mmyolo.py) 脚本将去转换到 MMYOLO 中,注意必须要将脚本置于官方仓库下才能正确运行,假设得到的权重名字为 mmyolov8s.pth。 + +假设想可视化 backbone 输出的 3 个特征图效果,则只需要 + +```bash +cd mmyolo +python demo/featmap_vis_demo.py demo/demo.jpg configs/yolov8/yolov8_s_syncbn_fast_8xb16-500e_coco.py mmyolov8s.pth --channel-reductio squeeze_mean +``` + +需要特别注意,为了确保特征图和图片叠加显示能对齐效果,需要先将原先的 `test_pipeline` 替换为如下: + +```Python +test_pipeline = [ + dict( + type='LoadImageFromFile', + backend_args=_base_.backend_args), + dict(type='mmdet.Resize', scale=img_scale, keep_ratio=False), # 这里将 LetterResize 修改成 mmdet.Resize + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +``` + +
+head +图 9:featmap +
+ +从上图可以看出**不同输出特征图层主要负责预测不同尺度的物体**。 + +我们也可以可视化 Neck 层的 3 个输出层特征图: + +```bash +cd mmyolo +python demo/featmap_vis_demo.py demo/demo.jpg configs/yolov8/yolov8_s_syncbn_fast_8xb16-500e_coco.py mmyolov8s.pth --channel-reductio squeeze_mean --target-layers neck +``` + +
+head +图 10:featmap +
+ +**从上图可以发现物体处的特征更加聚焦。** + +## 总结 + +本文详细分析和总结了最新的 YOLOv8 算法,从整体设计到模型结构、Loss 计算、训练数据增强、训练策略和推理过程进行了详细的说明,并提供了大量的示意图供大家方便理解。 +简单来说 YOLOv8 是一个包括了图像分类、Anchor-Free 物体检测和实例分割的高效算法,检测部分设计参考了目前大量优异的最新的 YOLO 改进算法,实现了新的 SOTA。不仅如此还推出了一个全新的框架。不过这个框架还处于早期阶段,还需要不断完善。 + +MMYOLO 开源地址: https://github.com/open-mmlab/mmyolo/blob/dev/configs/yolov8/README.md + +MMYOLO 算法解析教程:https://mmyolo.readthedocs.io/zh_CN/latest/algorithm_descriptions/index.html#id2 diff --git a/third_party/mmyolo/docs/zh_cn/recommended_topics/application_examples/index.rst b/third_party/mmyolo/docs/zh_cn/recommended_topics/application_examples/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..f552dbe5a5c56aca0edc524cecc1b20786e9793e --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/recommended_topics/application_examples/index.rst @@ -0,0 +1,7 @@ +MMYOLO 应用范例介绍 +******************** + +.. toctree:: + :maxdepth: 1 + + ionogram_detection.md diff --git a/third_party/mmyolo/docs/zh_cn/recommended_topics/application_examples/ionogram_detection.md b/third_party/mmyolo/docs/zh_cn/recommended_topics/application_examples/ionogram_detection.md new file mode 100644 index 0000000000000000000000000000000000000000..84e6daf00431b9fc620a36aee8f34f41f042c84f --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/recommended_topics/application_examples/ionogram_detection.md @@ -0,0 +1,306 @@ +# 基于 MMYOLO 的频高图实时目标检测 benchmark + +## 数据集构建 + +数字频高图是获取电离层实时信息最重要的途径。电离层结构检测对精准提取电离层关键参数,具有非常重要的研究意义。 + +利用中国科学院在海南、武汉、怀来获取的不同季节的 4311 张频高图建立数据集,使用 [labelme](https://github.com/wkentaro/labelme) 人工标注出 E 层、Es-c 层、Es-l 层、F1 层、F2 层、Spread F 层共 6 种结构。[数据集下载](https://github.com/VoyagerXvoyagerx/Ionogram_detection/releases/download/Dataset/Iono4311.zip) + +
+ + +使用 labelme 标注的图像预览 + +
+ +1. 数据集准备 + +下载数据后,放置在 MMYOLO 仓库的根目录下,使用 `unzip test.zip` 命令(linux)解压至当前文件夹。解压后的文件夹结构为: + +```shell +Iono4311/ +├── images +| ├── 20130401005200.png +| └── ... +└── labels + ├── 20130401005200.json + └── ... +``` + +其中,`images` 目录下存放输入图片,`labels` 目录下存放使用 labelme 标注得到的 json 文件。 + +2. 数据集格式转换 + +使用MMYOLO提供的 `tools/dataset_converters/labelme2coco.py` 脚本将 labelme 格式的标注文件转换为 COCO 格式的标注文件。 + +```shell +python tools/dataset_converters/labelme2coco.py --img-dir ./Iono4311/images \ + --labels-dir ./Iono4311/labels \ + --out ./Iono4311/annotations/annotations_all.json +``` + +3. 浏览数据集 + +使用下面的命令可以将 COCO 的 label 在图片上进行显示,这一步可以验证刚刚转换是否有问题。 + +```shell +python tools/analysis_tools/browse_coco_json.py --img-dir ./Iono4311/images \ + --ann-file ./Iono4311/annotations/annotations_all.json +``` + +4. 划分训练集、验证集、测试集 + +设置 70% 的图片为训练集,15% 作为验证集,15% 为测试集。 + +```shell +python tools/misc/coco_split.py --json ./Iono4311/annotations/annotations_all.json \ + --out-dir ./Iono4311/annotations \ + --ratios 0.7 0.15 0.15 \ + --shuffle \ + --seed 14 +``` + +划分后的文件夹结构: + +```shell +Iono4311/ +├── annotations +│ ├── annotations_all.json +│ ├── class_with_id.txt +│ ├── test.json +│ ├── train.json +│ └── val.json +├── classes_with_id.txt +├── images +├── labels +├── test_images +├── train_images +└── val_images +``` + +## 配置文件 + +配置文件存放在目录 `/projects/misc/ionogram_detection/` 下。 + +1. 数据集分析 + +使用 `tools/analysis_tools/dataset_analysis.py` 从数据集中采样 200 张图片进行可视化分析: + +```shell +python tools/analysis_tools/dataset_analysis.py projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram.py \ + --out-dir output +``` + +得到以下输出: + +```shell +The information obtained is as follows: ++------------------------------+ +| Information of dataset class | ++---------------+--------------+ +| Class name | Bbox num | ++---------------+--------------+ +| E | 98 | +| Es-l | 27 | +| Es-c | 46 | +| F1 | 100 | +| F2 | 194 | +| Spread-F | 6 | ++---------------+--------------+ +``` + +说明本数据集存在样本不均衡的现象。 + +
+ + +各类别目标大小统计 + +
+ +根据统计结果,E、Es-l、Esc、F1 类别以小目标居多,F2、Spread F 类主要是中等大小目标。 + +2. 可视化 config 中的数据处理部分 + +以 YOLOv5-s 为例,根据配置文件中的 `train_pipeline`,训练时采用的数据增强策略包括: + +- 马赛克增强 +- 随机仿射变换 +- Albumentations 数据增强工具包(包括多种数字图像处理方法) +- HSV 随机增强图像 +- 随机水平翻转 + +使用 `tools/analysis_tools/browse_dataset.py` 脚本的 **'pipeline'** 模式,可以可视化每个 pipeline 的输出效果: + +```shell +python tools/analysis_tools/browse_dataset.py projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram.py \ + -m pipeline \ + --out-dir output +``` + +
+ + +pipeline 输出可视化 + +
+ +3. 优化 Anchor 尺寸 + +使用分析工具中的 `tools/analysis_tools/optimize_anchors.py` 脚本得到适用于本数据集的先验锚框尺寸。 + +```shell +python tools/analysis_tools/optimize_anchors.py projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram.py \ + --algorithm v5-k-means \ + --input-shape 640 640 \ + --prior-match-thr 4.0 \ + --out-dir work_dirs/dataset_analysis_5_s +``` + +4. 模型复杂度分析 + +根据配置文件,使用分析工具中的 `tools/analysis_tools/get_flops.py` 脚本可以得到模型的参数量、浮点计算量等信息。以 YOLOv5-s 为例: + +```shell +python tools/analysis_tools/get_flops.py projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram.py +``` + +得到如下输出,表示模型的浮点运算量为 7.947G,一共有 7.036M 个可学习参数。 + +```shell +============================== +Input shape: torch.Size([640, 640]) +Model Flops: 7.947G +Model Parameters: 7.036M +============================== +``` + +## 训练和测试 + +1. 训练 + +训练可视化:本范例按照[标注+训练+测试+部署全流程](https://mmyolo.readthedocs.io/zh_CN/dev/recommended_topics/labeling_to_deployment_tutorials.html#id11)中的步骤安装和配置 [wandb](https://wandb.ai/site)。 + +调试技巧:在调试代码的过程中,有时需要训练几个 epoch,例如调试验证过程或者权重的保存是否符合期望。对于继承自 `BaseDataset` 的数据集(如本范例中的 `YOLOv5CocoDataset`),在 `train_dataloader` 中的 `dataset` 字段增加 `indices` 参数,即可指定每个 epoch 迭代的样本数,减少迭代时间。 + +```python +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + _delete_=True, + type='RepeatDataset', + times=1, + dataset=dict( + type=_base_.dataset_type, + indices=200, # 设置 indices=200,表示每个 epoch 只迭代 200 个样本 + data_root=data_root, + metainfo=metainfo, + ann_file=train_ann_file, + data_prefix=dict(img=train_data_prefix), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=_base_.train_pipeline))) +``` + +启动训练: + +```shell +python tools/train.py projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram.py +``` + +2. 测试 + +指定配置文件和模型的路径以启动测试: + +```shell +python tools/test.py projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram.py \ + work_dirs/yolov5_s-v61_fast_1xb96-100e_ionogram/xxx +``` + +## 实验与结果分析 + +### 选择合适的 batch size + +- Batch size 主导了训练速度。通常,理想的 batch size 是是硬件能支持的最大 batch size。 +- 当显存占用没有达到饱和时,如果 batch size 翻倍,训练吞吐量也应该翻倍(或接近翻倍),训练时间应该减半或接近减半。 +- 使用**混合精度训练**可以加快训练速度、减小显存。在执行 `train.py` 脚本时添加 `--amp` 参数即可开启。 + +硬件信息: + +- GPU:V100,显存 32G +- CPU:10核,内存 40G + +实验结果: + +| Model | Epoch(best) | AMP | Batchsize | Num workers | Memory Allocated | Training Time | Val mAP | +| -------- | ----------- | ----- | --------- | ----------- | ---------------- | ------------- | ------- | +| YOLOv5-s | 100(82) | False | 32 | 6 | 35.07% | 54 min | 0.575 | +| YOLOv5-s | 100(96) | True | 32 | 6 | 24.93% | 49 min | 0.578 | +| YOLOv5-s | 100(100) | False | 96 | 6 | 96.64% | 48 min | 0.571 | +| YOLOv5-s | 100(100) | True | 96 | 6 | 54.66% | **37** min | 0.575 | +| YOLOv5-s | 100(90) | True | 144 | 6 | 77.06% | 39 min | 0.573 | +| YOLOv5-s | 200(148) | True | 96 | 6 | 54.66% | 72 min | 0.575 | +| YOLOv5-s | 200(188) | True | 96 | **8** | 54.66% | 67 min | 0.576 | + +
+ + +不同 batch size 的训练过程中,数据加载时间 `data_time` 占每步总时长的比例 + +
+ +分析结果,可以得出以下结论: + +- 混合精度训练对模型的精度几乎没有影响,并且可以明显减少显存占用。 +- Batch size 增加 3 倍,和训练时长并没有相应地减小 3 倍。根据训练过程中 `data_time` 的记录,batch size 越大,`data_time` 也越大,说明数据加载成为了限制训练速度的瓶颈。增大加载数据的进程数 `num_workers` 可以加快数据加载。 + +### 消融实验 + +为了得到适用于本数据集的训练流水线,以 YOLOv5-s 模型为例,进行以下消融实验。 + +#### 不同数据增强方法 + +| Aug Method | [config](/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram_aug0.py) | [config](/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb32-100e_ionogram_mosaic.py) | [config](/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram_mosaic_affine.py) | [config](/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram_mosaic_affine_albu_hsv.py) | [config](/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram.py) | +| ---------- | ------------------------------------------------------------------------------------------------ | -------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------- | +| Mosaic | | √ | √ | √ | √ | +| Affine | | | √ | √ | √ | +| Albu | | | | √ | √ | +| HSV | | | | √ | √ | +| Flip | | | | | √ | +| Val mAP | 0.507 | 0.550 | 0.572 | 0.567 | 0.575 | + +结果表明,马赛克增强和随机仿射变换可以对验证集表现带来明显的提升。 + +#### 是否使用预训练权重 + +在配置文件中,修改 `load_from = None` 即可不使用预训练权重。对不使用预训练权重的实验,将基础学习率增大四倍,训练轮数增加至 200 轮,使模型得到较为充分的训练。 + +| Model | Epoch(best) | FLOPs(G) | Params(M) | Pretrain | Val mAP | Config | +| -------- | ----------- | -------- | --------- | -------- | ------- | ------------------------------------------------------------------------------------------------ | +| YOLOv5-s | 100(82) | 7.95 | 7.04 | Coco | 0.575 | [config](/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram.py) | +| YOLOv5-s | 200(145) | 7.95 | 7.04 | None | 0.565 | [config](/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-200e_ionogram_pre0.py) | +| YOLOv6-s | 100(54) | 24.2 | 18.84 | Coco | 0.584 | [config](/projects/misc/ionogram_detection/yolov6/yolov6_s_fast_1xb32-100e_ionogram.py) | +| YOLOv6-s | 200(188) | 24.2 | 18.84 | None | 0.557 | [config](/projects/misc/ionogram_detection/yolov6/yolov6_s_fast_1xb32-200e_ionogram_pre0.py) | + +
+ + +训练过程中的损失下降对比图 + +
+ +损失下降曲线表明,使用预训练权重时,loss 下降得更快。可见即使是自然图像数据集上预训练的模型,在雷达图像数据集上微调时,也可以加快模型收敛。 + +### 频高图结构检测 benchmark + +| Model | epoch(best) | FLOPs(G) | Params(M) | pretrain | val mAP | test mAP | Config | Log | +| ----------- | ----------- | -------- | --------- | -------- | ------- | -------- | ------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------- | +| YOLOv5-s | 100(82) | 7.95 | 7.04 | Coco | 0.575 | 0.584 | [config](/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram.py) | [log](https://github.com/VoyagerXvoyagerx/Ionogram_detection/blob/main/logs/yolov5_s_20230105_213510.json) | +| YOLOv5-m | 100(70) | 24.05 | 20.89 | Coco | 0.587 | 0.586 | [config](/projects/misc/ionogram_detection/yolov5/yolov5_m-v61_fast_1xb32-100e_ionogram.py) | [log](https://github.com/VoyagerXvoyagerx/Ionogram_detection/blob/main/logs/yolov5_m_20230106_004642.json) | +| YOLOv6-s | 100(54) | 24.2 | 18.84 | Coco | 0.584 | 0.594 | [config](/projects/misc/ionogram_detection/yolov6/yolov6_s_fast_1xb32-100e_ionogram.py) | [log](https://github.com/VoyagerXvoyagerx/Ionogram_detection/blob/main/logs/yolov6_s_20230107_003207.json) | +| YOLOv6-m | 100(76) | 37.08 | 44.42 | Coco | 0.590 | 0.590 | [config](/projects/misc/ionogram_detection/yolov6/yolov6_m_fast_1xb32-100e_ionogram.py) | [log](https://github.com/VoyagerXvoyagerx/Ionogram_detection/blob/main/logs/yolov6_m_20230107_201029.json) | +| YOLOv6-l | 100(76) | 71.33 | 58.47 | Coco | 0.605 | 0.597 | [config](/projects/misc/ionogram_detection/yolov6/yolov6_l_fast_1xb32-100e_ionogram.py) | [log](https://github.com/VoyagerXvoyagerx/Ionogram_detection/blob/main/logs/yolov6_l_20230108_005634.json) | +| YOLOv7-tiny | 100(78) | 6.57 | 6.02 | Coco | 0.549 | 0.568 | [config](/projects/misc/ionogram_detection/yolov7/yolov7_tiny_fast_1xb16-100e_ionogram.py) | [log](https://github.com/VoyagerXvoyagerx/Ionogram_detection/blob/main/logs/yolov7_tiny_20230215_202837.json) | +| YOLOv7-x | 100(58) | 94.27 | 70.85 | Coco | 0.602 | 0.595 | [config](/projects/misc/ionogram_detection/yolov7/yolov7_x_fast_1xb16-100e_ionogram.py) | [log](https://github.com/VoyagerXvoyagerx/Ionogram_detection/blob/main/logs/yolov7_x_20230110_165832.json) | +| rtmdet-tiny | 100(100) | 8.03 | 4.88 | Coco | 0.582 | 0.589 | [config](/projects/misc/ionogram_detection/rtmdet/rtmdet_tiny_fast_1xb32-100e_ionogram.py) | [log](https://github.com/VoyagerXvoyagerx/Ionogram_detection/blob/main/logs/rtmdet_tiny_20230310_125440.json) | +| rtmdet-s | 100(92) | 14.76 | 8.86 | Coco | 0.588 | 0.585 | [config](/projects/misc/ionogram_detection/rtmdet/rtmdet_s_fast_1xb32-100e_ionogram.py) | [log](https://github.com/VoyagerXvoyagerx/Ionogram_detection/blob/main/logs/rtmdet_s_20230310_163853.json) | diff --git a/third_party/mmyolo/docs/zh_cn/recommended_topics/complexity_analysis.md b/third_party/mmyolo/docs/zh_cn/recommended_topics/complexity_analysis.md new file mode 100644 index 0000000000000000000000000000000000000000..362a3315c888ef772ed22dd64508356208235fdc --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/recommended_topics/complexity_analysis.md @@ -0,0 +1,117 @@ +# 模型复杂度分析 + +我们提供了 `tools/analysis_tools/get_flops.py` 脚本来帮助进行 MMYOLO 系列中所有模型的复杂度分析。目前支持计算并输出给定模型的 parameters, activation 以及 flops;同时支持以网络结构或表格的形式打印输出每一层网络的复杂度信息。 + +调用命令如下: + +```shell +python tools/analysis_tools/get_flops.py + ${CONFIG_FILE} \ # 配置文件路径 + [--shape ${IMAGE_SIZE}] \ # 输入图像大小(int),默认取 
640*640 + [--show-arch ${ARCH_DISPLAY}] \ # 以网络结构形式逐层展示复杂度信息 + [--not-show-table ${TABLE_DISPLAY}] \ # 以表格形式逐层展示复杂度信息 + [--cfg-options ${CFG_OPTIONS}] # 配置文件参数修改选项 +# [] 代表可选参数,实际输入命令行时,不用输入 [] +``` + +接下来以 RTMDet 中的 `rtmdet_s_syncbn_fast_8xb32-300e_coco.py` 配置文件为例,详细展示该脚本的几种使用情形: + +## 样例 1: 打印模型的 Flops 和 Parameters,并以表格形式展示每层网络复杂度 + +```shell +python tools/analysis_tools/get_flops.py configs/rtmdet/rtmdet_s_syncbn_fast_8xb32-300e_coco.py +``` + +输出如下: + +```python +============================== +Input shape: torch.Size([640, 640]) +Model Flops: 14.835G +Model Parameters: 8.887M +============================== +``` + +| module | #parameters or shape | #flops | #activations | +| :-------------------------------- | :------------------- | :------ | :----------: | +| model | 8.887M | 14.835G | 35.676M | +| backbone | 4.378M | 5.416G | 22.529M | +| backbone.stem | 7.472K | 0.765G | 6.554M | +| backbone.stem.0 | 0.464K | 47.514M | 1.638M | +| backbone.stem.1 | 2.336K | 0.239G | 1.638M | +| backbone.stem.2 | 4.672K | 0.478G | 3.277M | +| backbone.stage1 | 42.4K | 0.981G | 7.373M | +| backbone.stage1.0 | 18.56K | 0.475G | 1.638M | +| backbone.stage1.1 | 23.84K | 0.505G | 5.734M | +| backbone.stage2 | 0.21M | 1.237G | 4.915M | +| backbone.stage2.0 | 73.984K | 0.473G | 0.819M | +| backbone.stage2.1 | 0.136M | 0.764G | 4.096M | +| backbone.stage3 | 0.829M | 1.221G | 2.458M | +| backbone.stage3.0 | 0.295M | 0.473G | 0.41M | +| backbone.stage3.1 | 0.534M | 0.749G | 2.048M | +| backbone.stage4 | 3.29M | 1.211G | 1.229M | +| backbone.stage4.0 | 1.181M | 0.472G | 0.205M | +| backbone.stage4.1 | 0.657M | 0.263G | 0.307M | +| backbone.stage4.2 | 1.452M | 0.476G | 0.717M | +| neck | 3.883M | 4.366G | 8.141M | +| neck.reduce_layers.2 | 0.132M | 52.634M | 0.102M | +| neck.reduce_layers.2.conv | 0.131M | 52.429M | 0.102M | +| neck.reduce_layers.2.bn | 0.512K | 0.205M | 0 | +| neck.top_down_layers | 0.491M | 1.23G | 4.506M | +| neck.top_down_layers.0 | 0.398M | 0.638G | 1.638M | +| neck.top_down_layers.1 | 92.608K | 0.593G | 2.867M | +| neck.downsample_layers | 0.738M | 0.472G | 0.307M | +| neck.downsample_layers.0 | 0.148M | 0.236G | 0.205M | +| neck.downsample_layers.1 | 0.59M | 0.236G | 0.102M | +| neck.bottom_up_layers | 1.49M | 0.956G | 2.15M | +| neck.bottom_up_layers.0 | 0.3M | 0.48G | 1.434M | +| neck.bottom_up_layers.1 | 1.19M | 0.476G | 0.717M | +| neck.out_layers | 1.033M | 1.654G | 1.075M | +| neck.out_layers.0 | 0.148M | 0.945G | 0.819M | +| neck.out_layers.1 | 0.295M | 0.472G | 0.205M | +| neck.out_layers.2 | 0.59M | 0.236G | 51.2K | +| neck.upsample_layers | | 1.229M | 0 | +| neck.upsample_layers.0 | | 0.41M | 0 | +| neck.upsample_layers.1 | | 0.819M | 0 | +| bbox_head.head_module | 0.625M | 5.053G | 5.006M | +| bbox_head.head_module.cls_convs | 0.296M | 2.482G | 2.15M | +| bbox_head.head_module.cls_convs.0 | 0.295M | 2.481G | 2.15M | +| bbox_head.head_module.cls_convs.1 | 0.512K | 0.819M | 0 | +| bbox_head.head_module.cls_convs.2 | 0.512K | 0.205M | 0 | +| bbox_head.head_module.reg_convs | 0.296M | 2.482G | 2.15M | +| bbox_head.head_module.reg_convs.0 | 0.295M | 2.481G | 2.15M | +| bbox_head.head_module.reg_convs.1 | 0.512K | 0.819M | 0 | +| bbox_head.head_module.reg_convs.2 | 0.512K | 0.205M | 0 | +| bbox_head.head_module.rtm_cls | 30.96K | 86.016M | 0.672M | +| bbox_head.head_module.rtm_cls.0 | 10.32K | 65.536M | 0.512M | +| bbox_head.head_module.rtm_cls.1 | 10.32K | 16.384M | 0.128M | +| bbox_head.head_module.rtm_cls.2 | 10.32K | 4.096M | 32K | +| bbox_head.head_module.rtm_reg | 1.548K | 
4.301M | 33.6K | +| bbox_head.head_module.rtm_reg.0 | 0.516K | 3.277M | 25.6K | +| bbox_head.head_module.rtm_reg.1 | 0.516K | 0.819M | 6.4K | +| bbox_head.head_module.rtm_reg.2 | 0.516K | 0.205M | 1.6K | + +## 样例 2:以网络结构形式逐层展示模型复杂度信息 + +```shell +python tools/analysis_tools/get_flops.py configs/rtmdet/rtmdet_s_syncbn_fast_8xb32-300e_coco.py --show-arch +``` + +由于该网络结构复杂,输出较长。以下仅展示 bbox_head.head_module.rtm_reg 部分的输出: + +```python +(rtm_reg): ModuleList( + #params: 1.55K, #flops: 4.3M, #acts: 33.6K + (0): Conv2d( + 128, 4, kernel_size=(1, 1), stride=(1, 1) + #params: 0.52K, #flops: 3.28M, #acts: 25.6K + ) + (1): Conv2d( + 128, 4, kernel_size=(1, 1), stride=(1, 1) + #params: 0.52K, #flops: 0.82M, #acts: 6.4K + ) + (2): Conv2d( + 128, 4, kernel_size=(1, 1), stride=(1, 1) + #params: 0.52K, #flops: 0.2M, #acts: 1.6K + ) +``` diff --git a/third_party/mmyolo/docs/zh_cn/recommended_topics/contributing.md b/third_party/mmyolo/docs/zh_cn/recommended_topics/contributing.md new file mode 100644 index 0000000000000000000000000000000000000000..a950f1d2caef7ddd7b0873187b59cc5270e8b169 --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/recommended_topics/contributing.md @@ -0,0 +1,325 @@ +# 如何给 MMYOLO 贡献代码 + +欢迎加入 MMYOLO 社区,我们致力于打造最前沿的计算机视觉基础库,我们欢迎任何类型的贡献,包括但不限于 + +**修复错误** + +修复代码实现错误的步骤如下: + +1. 如果提交的代码改动较大,建议先提交 issue,并正确描述 issue 的现象、原因和复现方式,讨论后确认修复方案。 +2. 修复错误并补充相应的单元测试,提交拉取请求。 + +**新增功能或组件** + +1. 如果新功能或模块涉及较大的代码改动,建议先提交 issue,确认功能的必要性。 +2. 实现新增功能并添单元测试,提交拉取请求。 + +**文档补充** + +修复文档可以直接提交拉取请求 + +添加文档或将文档翻译成其他语言步骤如下 + +1. 提交 issue,确认添加文档的必要性。 +2. 添加文档,提交拉取请求。 + +## 准备工作 + +拉取请求工作的命令都是用 Git 去实现的,该章节详细描述 `Git 配置` 以及与 `GitHub 绑定` + +### 1. Git 配置 + +首先,确认电脑是否安装了 Git。Linux 系统和 macOS 系统,一般默认安装 Git,如未安装可在 [Git-Downloads](https://git-scm.com/downloads) 下载。 + +```shell +# 在命令提示符(cmd)或终端下输入以下命令,查看 Git 版本 +git --version +``` + +其次,检测自己 `Git Config` 是否配置 + +```shell +# 在命令提示符(cmd)或终端下输入以下命令,查看 Git Config 是否配置 +git config --global --list +``` + +若 `user.name` 和 `user.email` 为空,则输入以下命令进行配置。 + +```shell +git config --global user.name "这里换上你的用户名" +git config --global user.email "这里换上你的邮箱" +``` + +最后,在 `git bash` 或者 `终端` 中,输入以下命令生成密钥文件。生成成功后,会在用户目录下出现 `.ssh` 文件,其中 `id_rsa.pub` 是公钥文件。 + +```shell +# useremail 是 GitHub 的邮箱 +ssh-keygen -t rsa -C "useremail" +``` + +### 2. GitHub 绑定 + +首先,用记事本打开 `id_rsa.pub` 公钥文件,并复制里面全部内容。 + +其次,登录 GitHub 账户找到下图位置进行设置。 + + + +点击 `New SSH key` 新增一个 SSH keys,将刚才复制的内容粘贴到下图所示的 Key 中,Title 可以写设备名称,最后确认即可。 + + + +最后,在 `git bash` 或者 `终端` 中输入以下命令,验证 SSH 是否与 GitHub 账户匹配。如果匹配,输入 `yes` 就成功啦~ + +```shell +ssh -T git@github.com +``` + + + +## 拉取请求工作流 + +如果你对拉取请求不了解,没关系,接下来的内容将会从零开始,一步一步地指引你如何创建一个拉取请求。如果你想深入了解拉取请求的开发模式,可以参考 github [官方文档](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/about-pull-requests) + +### 1. 
复刻仓库 + +当你第一次提交拉取请求时,先复刻 OpenMMLab 原代码库,点击 GitHub 页面右上角的 **Fork** 按钮,复刻后的代码库将会出现在你的 GitHub 个人主页下。 + + + +将代码克隆到本地 + +```shell +git clone git@github.com:{username}/mmyolo.git +``` + +进入项目并添加原代码库为上游代码库 + +```bash +cd mmyolo +git remote add upstream git@github.com:open-mmlab/mmyolo +``` + +检查 remote 是否添加成功,在终端输入 `git remote -v` + +```bash +origin git@github.com:{username}/mmyolo.git (fetch) +origin git@github.com:{username}/mmyolo.git (push) +upstream git@github.com:open-mmlab/mmyolo (fetch) +upstream git@github.com:open-mmlab/mmyolo (push) +``` + +```{note} +这里对 origin 和 upstream 进行一个简单的介绍,当我们使用 git clone 来克隆代码时,会默认创建一个 origin 的 remote,它指向我们克隆的代码库地址,而 upstream 则是我们自己添加的,用来指向原始代码库地址。当然如果你不喜欢他叫 upstream,也可以自己修改,比如叫 open-mmlab。我们通常向 origin 提交代码(即 fork 下来的远程仓库),然后向 upstream 提交一个 pull request。如果提交的代码和最新的代码发生冲突,再从 upstream 拉取最新的代码,和本地分支解决冲突,再提交到 origin。 +``` + +### 2. 配置 pre-commit + +在本地开发环境中,我们使用 [pre-commit](https://pre-commit.com/#intro) 来检查代码风格,以确保代码风格的统一。在提交代码,需要先安装 pre-commit(需要在 MMYOLO 目录下执行): + +```shell +pip install -U pre-commit +pre-commit install +``` + +检查 pre-commit 是否配置成功,并安装 `.pre-commit-config.yaml` 中的钩子: + +```shell +pre-commit run --all-files +``` + + + + + +```{note} +如果你是中国用户,由于网络原因,可能会出现安装失败的情况,这时可以使用国内源 + +pre-commit install -c .pre-commit-config-zh-cn.yaml + +pre-commit run --all-files -c .pre-commit-config-zh-cn.yaml +``` + +如果安装过程被中断,可以重复执行 `pre-commit run ...` 继续安装。 + +如果提交的代码不符合代码风格规范,pre-commit 会发出警告,并自动修复部分错误。 + + + +如果我们想临时绕开 pre-commit 的检查提交一次代码,可以在 `git commit` 时加上 `--no-verify`(需要保证最后推送至远程仓库的代码能够通过 pre-commit 检查)。 + +```shell +git commit -m "xxx" --no-verify +``` + +### 3. 创建开发分支 + +安装完 pre-commit 之后,我们需要基于 dev 创建开发分支,建议的分支命名规则为 `username/pr_name`。 + +```shell +git checkout -b yhc/refactor_contributing_doc +``` + +在后续的开发中,如果本地仓库的 dev 分支落后于 upstream 的 dev 分支,我们需要先拉取 upstream 的代码进行同步,再执行上面的命令 + +```shell +git pull upstream dev +``` + +### 4. 提交代码并在本地通过单元测试 + +- MMYOLO 引入了 mypy 来做静态类型检查,以增加代码的鲁棒性。因此我们在提交代码时,需要补充 Type Hints。具体规则可以参考[教程](https://zhuanlan.zhihu.com/p/519335398)。 + +- 提交的代码同样需要通过单元测试 + + ```shell + # 通过全量单元测试 + pytest tests + + # 我们需要保证提交的代码能够通过修改模块的单元测试,以 yolov5_coco dataset 为例 + pytest tests/test_datasets/test_yolov5_coco.py + ``` + + 如果你由于缺少依赖无法运行修改模块的单元测试,可以参考[指引-单元测试](#单元测试) + +- 如果修改/添加了文档,参考[指引](#文档渲染)确认文档渲染正常。 + +### 5. 推送代码到远程 + +代码通过单元测试和 pre-commit 检查后,将代码推送到远程仓库,如果是第一次推送,可以在 `git push` 后加上 `-u` 参数以关联远程分支 + +```shell +git push -u origin {branch_name} +``` + +这样下次就可以直接使用 `git push` 命令推送代码了,而无需指定分支和远程仓库。 + +### 6. 提交拉取请求(PR) + +(1) 在 GitHub 的 Pull request 界面创建拉取请求 + + +(2) 根据指引修改 PR 描述,以便于其他开发者更好地理解你的修改 + +```{note} +注意在 PR branch 左侧的 base 需要修改为 dev 分支 +``` + + + +描述规范详见[拉取请求规范](#拉取请求规范) + +  + +**注意事项** + +(a) PR 描述应该包含修改理由、修改内容以及修改后带来的影响,并关联相关 Issue(具体方式见[文档](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue)) + +(b) 如果是第一次为 OpenMMLab 做贡献,需要签署 CLA + + + +(c) 检查提交的 PR 是否通过 CI(集成测试) + + + +MMYOLO 会在 Linux 上,基于不同版本的 Python、PyTorch 对提交的代码进行单元测试,以保证代码的正确性,如果有任何一个没有通过,我们可点击上图中的 `Details` 来查看具体的测试信息,以便于我们修改代码。 + +(3) 如果 PR 通过了 CI,那么就可以等待其他开发者的 review,并根据 reviewer 的意见,修改代码,并重复 [4](#4-提交代码并本地通过单元测试)-[5](#5-推送代码到远程) 步骤,直到 reviewer 同意合入 PR。 + + + +所有 reviewer 同意合入 PR 后,我们会尽快将 PR 合并到 dev 分支。 + +### 7. 
解决冲突 + +随着时间的推移,我们的代码库会不断更新,这时候,如果你的 PR 与 dev 分支存在冲突,你需要解决冲突,解决冲突的方式有两种: + +```shell +git fetch --all --prune +git rebase upstream/dev +``` + +或者 + +```shell +git fetch --all --prune +git merge upstream/dev +``` + +如果你非常善于处理冲突,那么可以使用 rebase 的方式来解决冲突,因为这能够保证你的 commit log 的整洁。如果你不太熟悉 `rebase` 的使用,那么可以使用 `merge` 的方式来解决冲突。 + +## 指引 + +### 单元测试 + +在提交修复代码错误或新增特性的拉取请求时,我们应该尽可能的让单元测试覆盖所有提交的代码,计算单元测试覆盖率的方法如下 + +```shell +python -m coverage run -m pytest /path/to/test_file +python -m coverage html +# check file in htmlcov/index.html +``` + +### 文档渲染 + +在提交修复代码错误或新增特性的拉取请求时,可能会需要修改/新增模块的 docstring。我们需要确认渲染后的文档样式是正确的。 +本地生成渲染后的文档的方法如下 + +```shell +pip install -r requirements/docs.txt +cd docs/zh_cn/ +# or docs/en +make html +# check file in ./docs/zh_cn/_build/html/index.html +``` + +## 代码风格 + +### Python + +[PEP8](https://www.python.org/dev/peps/pep-0008/) 作为 OpenMMLab 算法库首选的代码规范,我们使用以下工具检查和格式化代码 + +- [flake8](https://github.com/PyCQA/flake8):Python 官方发布的代码规范检查工具,是多个检查工具的封装 +- [isort](https://github.com/timothycrosley/isort):自动调整模块导入顺序的工具 +- [yapf](https://github.com/google/yapf):Google 发布的代码规范检查工具 +- [codespell](https://github.com/codespell-project/codespell):检查单词拼写是否有误 +- [mdformat](https://github.com/executablebooks/mdformat):检查 markdown 文件的工具 +- [docformatter](https://github.com/myint/docformatter):格式化 docstring 的工具 + +yapf 和 isort 的配置可以在 [setup.cfg](../../../setup.cfg) 找到 + +通过配置 [pre-commit hook](https://pre-commit.com/) ,我们可以在提交代码时自动检查和格式化 `flake8`、`yapf`、`isort`、`trailing whitespaces`、`markdown files`, +修复 `end-of-files`、`double-quoted-strings`、`python-encoding-pragma`、`mixed-line-ending`,调整 `requirments.txt` 的包顺序。 +pre-commit 钩子的配置可以在 [.pre-commit-config](../../../.pre-commit-config.yaml) 找到。 + +pre-commit 具体的安装使用方式见[拉取请求](#2-配置-pre-commit)。 + +更具体的规范请参考 [OpenMMLab 代码规范](../notes/code_style.md)。 + +### C++ and CUDA + +C++ 和 CUDA 的代码规范遵从 [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html) + +## 拉取请求规范 + +1. 使用 [pre-commit hook](https://pre-commit.com),尽量减少代码风格相关问题 + +2. 一个`拉取请求`对应一个短期分支 + +3. 粒度要细,一个`拉取请求`只做一件事情,避免超大的`拉取请求` + + - Bad:实现 Faster R-CNN + - Acceptable:给 Faster R-CNN 添加一个 box head + - Good:给 box head 增加一个参数来支持自定义的 conv 层数 + +4. 每次 Commit 时需要提供清晰且有意义 commit 信息 + +5. 提供清晰且有意义的`拉取请求`描述 + + - 标题写明白任务名称,一般格式:\[Prefix\] Short description of the pull request (Suffix) + - prefix:新增功能 \[Feature\], 修 bug \[Fix\], 文档相关 \[Docs\], 开发中 \[WIP\] (暂时不会被 review) + - 描述里介绍`拉取请求`的主要修改内容,结果,以及对其他部分的影响, 参考`拉取请求`模板 + - 关联相关的`议题` (issue) 和其他`拉取请求` + +6. 如果引入了其他三方库,或借鉴了三方库的代码,请确认他们的许可证和 mmyolo 兼容,并在借鉴的代码上补充 `This code is inspired from http://` diff --git a/third_party/mmyolo/docs/zh_cn/recommended_topics/dataset_preparation.md b/third_party/mmyolo/docs/zh_cn/recommended_topics/dataset_preparation.md new file mode 100644 index 0000000000000000000000000000000000000000..25603a5c1859b736b6b0602607b3902db27f18d8 --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/recommended_topics/dataset_preparation.md @@ -0,0 +1,144 @@ +# 数据集格式准备和说明 + +## DOTA 数据集 + +### 下载 DOTA 数据集 + +数据集可以从 DOTA 数据集的主页 [DOTA](https://captain-whu.github.io/DOTA/dataset.html) +或 [OpenDataLab](https://opendatalab.org.cn/DOTA_V1.0) 下载。 + +我们推荐使用 [OpenDataLab](https://opendatalab.org.cn/DOTA_V1.0) 下载,其中的文件夹结构已经按照需要排列好了,只需要解压即可,不需要费心去调整文件夹结构。 + +下载后解压数据集,并按如下文件夹结构存放。 + +```none +${DATA_ROOT} +├── train +│ ├── images +│ │ ├── P0000.png +│ │ ├── ... +│ ├── labelTxt-v1.0 +│ │ ├── labelTxt +│ │ │ ├── P0000.txt +│ │ │ ├── ... +│ │ ├── trainset_reclabelTxt +│ │ │ ├── P0000.txt +│ │ │ ├── ... 
+├── val
+│   ├── images
+│   │   ├── P0003.png
+│   │   ├── ...
+│   ├── labelTxt-v1.0
+│   │   ├── labelTxt
+│   │   │   ├── P0003.txt
+│   │   │   ├── ...
+│   │   ├── valset_reclabelTxt
+│   │   │   ├── P0003.txt
+│   │   │   ├── ...
+├── test
+│   ├── images
+│   │   ├── P0006.png
+│   │   ├── ...
+
+```
+
+其中,以 `reclabelTxt` 为结尾的文件夹存放了水平检测框的标注,目前仅使用了 `labelTxt-v1.0` 中旋转框的标注。
+
+### 数据集切片
+
+我们提供了 `tools/dataset_converters/dota/dota_split.py` 脚本用于 DOTA 数据集的准备和切片。
+
+```shell
+python tools/dataset_converters/dota/dota_split.py \
+    [--split-config ${SPLIT_CONFIG}] \
+    [--data-root ${DATA_ROOT}] \
+    [--out-dir ${OUT_DIR}] \
+    [--ann-subdir ${ANN_SUBDIR}] \
+    [--phase ${DATASET_PHASE}] \
+    [--nproc ${NPROC}] \
+    [--save-ext ${SAVE_EXT}] \
+    [--overwrite]
+```
+
+脚本依赖于 shapely 包,请先通过 `pip install shapely` 安装 shapely。
+
+**参数说明**:
+
+- `--split-config`: 切片参数的配置文件。
+- `--data-root`: DOTA 数据集的存放位置。
+- `--out-dir`: 切片后的输出位置。
+- `--ann-subdir`: 标注文件夹的名字。默认为 `labelTxt-v1.0`。
+- `--phase`: 数据集的阶段。默认为 `trainval test`。
+- `--nproc`: 进程数量。默认为 8。
+- `--save-ext`: 输出图像的扩展名,如置空则与原图保持一致。默认为 `None`。
+- `--overwrite`: 如果目标文件夹已存在,是否允许覆盖。
+
+基于 DOTA 数据集论文中提供的配置,我们提供了两种切片配置:
+
+`./split_config/single_scale.json` 用于单尺度 `single-scale` 切片
+`./split_config/multi_scale.json` 用于多尺度 `multi-scale` 切片
+
+DOTA 数据集通常使用 `trainval` 集进行训练,然后使用 `test` 集进行在线验证,大多数论文提供的也是在线验证的精度。
+如果你需要进行本地验证,可以准备 `train` 集和 `val` 集进行训练和测试。
+
+示例:
+
+使用单尺度切片配置准备 `trainval` 和 `test` 集:
+
+```shell
+python tools/dataset_converters/dota/dota_split.py \
+    --split-config 'tools/dataset_converters/dota/split_config/single_scale.json' \
+    --data-root ${DATA_ROOT} \
+    --out-dir ${OUT_DIR}
+```
+
+准备 DOTA-v1.5 数据集,它的标注文件夹名字是 `labelTxt-v1.5`:
+
+```shell
+python tools/dataset_converters/dota/dota_split.py \
+    --split-config 'tools/dataset_converters/dota/split_config/single_scale.json' \
+    --data-root ${DATA_ROOT} \
+    --out-dir ${OUT_DIR} \
+    --ann-subdir 'labelTxt-v1.5'
+```
+
+使用单尺度切片配置准备 `train` 和 `val` 集:
+
+```shell
+python tools/dataset_converters/dota/dota_split.py \
+    --split-config 'tools/dataset_converters/dota/split_config/single_scale.json' \
+    --data-root ${DATA_ROOT} \
+    --phase train val \
+    --out-dir ${OUT_DIR}
+```
+
+使用多尺度切片配置准备 `trainval` 和 `test` 集:
+
+```shell
+python tools/dataset_converters/dota/dota_split.py \
+    --split-config 'tools/dataset_converters/dota/split_config/multi_scale.json' \
+    --data-root ${DATA_ROOT} \
+    --out-dir ${OUT_DIR}
+```
+
+在运行完成后,输出的结构如下:
+
+```none
+${OUT_DIR}
+├── trainval
+│   ├── images
+│   │   ├── P0000__1024__0___0.png
+│   │   ├── ...
+│   ├── annfiles
+│   │   ├── P0000__1024__0___0.txt
+│   │   ├── ...
+├── test
+│   ├── images
+│   │   ├── P0006__1024__0___0.png
+│   │   ├── ...
+│   ├── annfiles
+│   │   ├── P0006__1024__0___0.txt
+│   │   ├── ...
+``` + +此时将配置文件中的 `data_root` 修改为 ${OUT_DIR} 即可开始使用 DOTA 数据集训练。 diff --git a/third_party/mmyolo/docs/zh_cn/recommended_topics/deploy/easydeploy_guide.md b/third_party/mmyolo/docs/zh_cn/recommended_topics/deploy/easydeploy_guide.md new file mode 100644 index 0000000000000000000000000000000000000000..8f337e6cfc2ed719a4761d5fa58bc1f0af210028 --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/recommended_topics/deploy/easydeploy_guide.md @@ -0,0 +1,5 @@ +# EasyDeploy 部署 + +本项目作为 MMYOLO 的部署 project 单独存在,意图剥离 MMDeploy 当前的体系,独自支持用户完成模型训练后的转换和部署功能,使用户的学习和工程成本下降。 + +当前支持对 ONNX 格式和 TensorRT 格式的转换,后续对其他推理平台也会支持起来。 diff --git a/third_party/mmyolo/docs/zh_cn/recommended_topics/deploy/index.rst b/third_party/mmyolo/docs/zh_cn/recommended_topics/deploy/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..3d5f08bc92c0e7037932ecfcd91462fe2db9738c --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/recommended_topics/deploy/index.rst @@ -0,0 +1,16 @@ +MMDeploy 部署必备教程 +************************ + +.. toctree:: + :maxdepth: 1 + + mmdeploy_guide.md + mmdeploy_yolov5.md + +EasyDeploy 部署必备教程 +************************ + +.. toctree:: + :maxdepth: 1 + + easydeploy_guide.md diff --git a/third_party/mmyolo/docs/zh_cn/recommended_topics/deploy/mmdeploy_guide.md b/third_party/mmyolo/docs/zh_cn/recommended_topics/deploy/mmdeploy_guide.md new file mode 100644 index 0000000000000000000000000000000000000000..e935d36e99ee38bb743a1efa5d275d04d0be320d --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/recommended_topics/deploy/mmdeploy_guide.md @@ -0,0 +1,415 @@ +# MMDeploy 部署 + +## MMDeploy 介绍 + +MMDeploy 是 [OpenMMLab](https://openmmlab.com/) 模型部署工具箱,**为各算法库提供统一的部署体验**。基于 MMDeploy,开发者可以轻松从训练 repo 生成指定硬件所需 SDK,省去大量适配时间。 + +更多介绍和使用指南见 https://mmdeploy.readthedocs.io/zh_CN/latest/get_started.html + +## 算法支持列表 + +目前支持的 model-backend 组合: + +| Model | Task | OnnxRuntime | TensorRT | Model config | +| :----- | :-------------- | :---------: | :------: | :---------------------------------------------------------------------: | +| YOLOv5 | ObjectDetection | Y | Y | [config](https://github.com/open-mmlab/mmyolo/tree/main/configs/yolov5) | +| YOLOv6 | ObjectDetection | Y | Y | [config](https://github.com/open-mmlab/mmyolo/tree/main/configs/yolov6) | +| YOLOX | ObjectDetection | Y | Y | [config](https://github.com/open-mmlab/mmyolo/tree/main/configs/yolox) | +| RTMDet | ObjectDetection | Y | Y | [config](https://github.com/open-mmlab/mmyolo/tree/main/configs/rtmdet) | + +ncnn 和其他后端的支持会在后续支持。 + +## 安装 + +按照[说明](https://mmdeploy.readthedocs.io/zh_CN/latest/get_started.html)安装 mmdeploy。 + +```{note} +如果安装的是 mmdeploy 预编译包,那么也请通过 ‘git clone https://github.com/open-mmlab/mmdeploy.git –depth=1’ 下载 mmdeploy 源码。因为它包含了部署时所需的 tools 文件夹。 +``` + +## MMYOLO 中部署相关配置说明 + +所有部署配置文件在 [`configs/deploy`](../../../configs/deploy/) 目录下。 + +您可以部署静态输入或者动态输入的模型,因此您需要修改模型配置文件中与此相关的数据处理流程。 + +MMDeploy 将后处理整合到自定义的算子中,因此您可以修改 `codebase_config` 中的 `post_processing` 参数来调整后处理策略,参数描述如下: + +```python +codebase_config = dict( + type='mmyolo', + task='ObjectDetection', + model_type='end2end', + post_processing=dict( + score_threshold=0.05, + confidence_threshold=0.005, + iou_threshold=0.5, + max_output_boxes_per_class=200, + pre_top_k=5000, + keep_top_k=100, + background_label_id=-1), + module=['mmyolo.deploy']) +``` + +- `score_threshold`:在 `nms` 之前筛选候选框的类别分数阈值。 +- `confidence_threshold`:在 `nms` 之前筛选候选框的置信度分数阈值。 +- `iou_threshold`:在 `nms` 中去除重复框的 `iou` 阈值。 +- `max_output_boxes_per_class`:每个类别最大的输出框数量。 +- `pre_top_k`:在 `nms` 之前对候选框分数排序然后固定候选框的个数。 +- 
`keep_top_k`:`nms` 算法最终输出的候选框个数。 +- `background_label_id`:MMYOLO 算法中没有背景类别信息,置为 `-1` 即可。 + +### 静态输入配置 + +#### (1) 模型配置文件介绍 + +以 MMYOLO 中的 `YOLOv5` 模型配置为例,下面是对部署时使用的模型配置文件参数说明介绍。 + +```python +_base_ = '../../yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py' + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict( + type='LetterResize', + scale=_base_.img_scale, + allow_scale_up=False, + use_mini_pad=False, + ), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] + +test_dataloader = dict( + dataset=dict(pipeline=test_pipeline, batch_shapes_cfg=None)) +``` + +`_base_ = '../../yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py'` 继承了训练时构建模型的配置。 + +`test_pipeline` 为部署时对输入图像进行处理的流程,`LetterResize` 控制了输入图像的尺寸,同时限制了导出模型所能接受的输入尺寸。 + +`test_dataloader` 为部署时构建数据加载器配置,`batch_shapes_cfg` 控制了是否启用 `batch_shapes` 策略,详细内容可以参考 [yolov5 配置文件说明](../../tutorials/config.md) 。 + +#### (2) 部署配置文件介绍 + +以 `MMYOLO` 中的 `YOLOv5` 部署配置为例,下面是对配置文件参数说明介绍。 + +`ONNXRuntime` 部署 `YOLOv5` 可以使用 [`detection_onnxruntime_static.py`](https://github.com/open-mmlab/mmyolo/blob/main/configs/deploy/detection_onnxruntime_static.py) 配置。 + +```python +_base_ = ['./base_static.py'] +codebase_config = dict( + type='mmyolo', + task='ObjectDetection', + model_type='end2end', + post_processing=dict( + score_threshold=0.05, + confidence_threshold=0.005, + iou_threshold=0.5, + max_output_boxes_per_class=200, + pre_top_k=5000, + keep_top_k=100, + background_label_id=-1), + module=['mmyolo.deploy']) +backend_config = dict(type='onnxruntime') +``` + +`backend_config` 中指定了部署后端 `type='onnxruntime'`,其他信息可参考第三小节。 + +`TensorRT` 部署 `YOLOv5` 可以使用 [`detection_tensorrt_static-640x640.py`](https://github.com/open-mmlab/mmyolo/blob/main/configs/deploy/detection_tensorrt_static-640x640.py) 配置。 + +```python +_base_ = ['./base_static.py'] +onnx_config = dict(input_shape=(640, 640)) +backend_config = dict( + type='tensorrt', + common_config=dict(fp16_mode=False, max_workspace_size=1 << 30), + model_inputs=[ + dict( + input_shapes=dict( + input=dict( + min_shape=[1, 3, 640, 640], + opt_shape=[1, 3, 640, 640], + max_shape=[1, 3, 640, 640]))) + ]) +use_efficientnms = False +``` + +`backend_config` 中指定了后端 `type=‘tensorrt’`。 + +与 `ONNXRuntime` 部署配置不同的是,`TensorRT` 需要指定输入图片尺寸和构建引擎文件需要的参数,包括: + +- `onnx_config` 中指定 `input_shape=(640, 640)` +- `backend_config['common_config']` 中包括 `fp16_mode=False` 和 `max_workspace_size=1 << 30`, `fp16_mode` 表示是否以 `fp16` 的参数格式构建引擎,`max_workspace_size` 表示当前 `gpu` 设备最大显存, 单位为 `GB`。`fp16` 的详细配置可以参考 [`detection_tensorrt-fp16_static-640x640.py`](https://github.com/open-mmlab/mmyolo/blob/main/configs/deploy/detection_tensorrt-fp16_static-640x640.py) +- `backend_config['model_inputs']['input_shapes']['input']` 中 `min_shape` /`opt_shape`/`max_shape` 对应的值在静态输入下应该保持相同,即默认均为 `[1, 3, 640, 640]`。 + +`use_efficientnms` 是 `MMYOLO` 系列新引入的配置,表示在导出 `onnx` 时是否启用`Efficient NMS Plugin`来替换 `MMDeploy` 中的 `TRTBatchedNMS plugin` 。 + +可以参考 `TensorRT` 官方实现的 [Efficient NMS Plugin](https://github.com/NVIDIA/TensorRT/blob/main/plugin/efficientNMSPlugin/README.md) 获取更多详细信息。 + +**注意**,这个功能仅仅在 TensorRT >= 8.0 版本才能使用,无需编译开箱即用。 + +### 动态输入配置 + +#### (1) 模型配置文件介绍 + +当您部署动态输入模型时,您无需修改任何模型配置文件,仅需要修改部署配置文件即可。 + +#### (2) 部署配置文件介绍 + +`ONNXRuntime` 部署 `YOLOv5` 可以使用 [`detection_onnxruntime_dynamic.py`](https://github.com/open-mmlab/mmyolo/blob/main/configs/deploy/detection_onnxruntime_dynamic.py) 配置。 + 
+```python +_base_ = ['./base_dynamic.py'] +codebase_config = dict( + type='mmyolo', + task='ObjectDetection', + model_type='end2end', + post_processing=dict( + score_threshold=0.05, + confidence_threshold=0.005, + iou_threshold=0.5, + max_output_boxes_per_class=200, + pre_top_k=5000, + keep_top_k=100, + background_label_id=-1), + module=['mmyolo.deploy']) +backend_config = dict(type='onnxruntime') +``` + +`backend_config` 中指定了后端 `type='onnxruntime'`,其他配置与上一节在 ONNXRuntime 部署静态输入模型相同。 + +`TensorRT` 部署 `YOLOv5` 可以使用 [`detection_tensorrt_dynamic-192x192-960x960.py`](https://github.com/open-mmlab/mmyolo/blob/main/configs/deploy/detection_tensorrt_dynamic-192x192-960x960.py) 配置。 + +```python +_base_ = ['./base_dynamic.py'] +backend_config = dict( + type='tensorrt', + common_config=dict(fp16_mode=False, max_workspace_size=1 << 30), + model_inputs=[ + dict( + input_shapes=dict( + input=dict( + min_shape=[1, 3, 192, 192], + opt_shape=[1, 3, 640, 640], + max_shape=[1, 3, 960, 960]))) + ]) +use_efficientnms = False +``` + +`backend_config` 中指定了后端 `type='tensorrt'`,由于 `TensorRT` 动态输入与静态输入有所不同,您可以了解更多动态输入相关信息通过访问 [TensorRT dynamic input official introduction](https://docs.nvidia.com/deeplearning/tensorrt/archives/tensorrt-843/developer-guide/index.html#work_dynamic_shapes)。 + +`TensorRT` 部署需要配置 `min_shape`, `opt_shape`, `max_shape` ,`TensorRT` 限制输入图片的尺寸在 `min_shape` 和 ` max_shape` 之间。 + +`min_shape` 为输入图片的最小尺寸,`opt_shape` 为输入图片常见尺寸, 在这个尺寸下推理性能最好,`max_shape` 为输入图片的最大尺寸。 + +`use_efficientnms` 配置与上节 `TensorRT` 静态输入配置相同。 + +### INT8 量化配置 + +!!! 部署 TensorRT INT8 模型教程即将发布 !!! + +## 模型转换 + +### 使用方法 + +#### 从源码安装的 MMDeploy + +设置 `MMDeploy` 根目录为环境变量 `MMDEPLOY_DIR` ,例如 `export MMDEPLOY_DIR=/the/root/path/of/MMDeploy` + +```shell +python3 ${MMDEPLOY_DIR}/tools/deploy.py \ + ${DEPLOY_CFG_PATH} \ + ${MODEL_CFG_PATH} \ + ${MODEL_CHECKPOINT_PATH} \ + ${INPUT_IMG} \ + --test-img ${TEST_IMG} \ + --work-dir ${WORK_DIR} \ + --calib-dataset-cfg ${CALIB_DATA_CFG} \ + --device ${DEVICE} \ + --log-level INFO \ + --show \ + --dump-info +``` + +### 参数描述 + +- `deploy_cfg` : mmdeploy 针对此模型的部署配置,包含推理框架类型、是否量化、输入 shape 是否动态等。配置文件之间可能有引用关系,`configs/deploy/detection_onnxruntime_static.py` 是一个示例。 +- `model_cfg` : MMYOLO 算法库的模型配置,例如 `configs/deploy/model/yolov5_s-deploy.py`, 与 mmdeploy 的路径无关。 +- `checkpoint` : torch 模型路径。可以 http/https 开头,详见 `mmengine.fileio` 的实现。 +- `img` : 模型转换时,用做测试的图像文件路径。 +- `--test-img` : 用于测试模型的图像文件路径。默认设置成`None`。 +- `--work-dir` : 工作目录,用来保存日志和模型文件。 +- `--calib-dataset-cfg` : 此参数只有int8模式下生效,用于校准数据集配置文件。若在int8模式下未传入参数,则会自动使用模型配置文件中的'val'数据集进行校准。 +- `--device` : 用于模型转换的设备。 默认是`cpu`,对于 trt 可使用 `cuda:0` 这种形式。 +- `--log-level` : 设置日记的等级,选项包括`'CRITICAL', 'FATAL', 'ERROR', 'WARN', 'WARNING', 'INFO', 'DEBUG', 'NOTSET'`。 默认是`INFO`。 +- `--show` : 是否显示检测的结果。 +- `--dump-info` : 是否输出 SDK 信息。 + +#### 通过 pip install 安装的 MMDeploy + +假设当前的工作目录为 mmyolo 的根目录, 那么以 [YoloV5](https://github.com/open-mmlab/mmyolo/blob/main/configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py) 模型为例,你可以从[此处](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth)下载对应的 checkpoint,并使用以下代码将之转换为 onnx 模型: + +```python +from mmdeploy.apis import torch2onnx +from mmdeploy.backend.sdk.export_info import export2SDK + +img = 'demo/demo.jpg' +work_dir = 'mmdeploy_models/mmyolo/onnx' +save_file = 'end2end.onnx' +deploy_cfg = 'configs/deploy/detection_onnxruntime_dynamic.py' +model_cfg = 'configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py' +model_checkpoint = 
'checkpoints/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth' +device = 'cpu' + +# 1. convert model to onnx +torch2onnx(img, work_dir, save_file, deploy_cfg, model_cfg, + model_checkpoint, device) + +# 2. extract pipeline info for inference by MMDeploy SDK +export2SDK(deploy_cfg, model_cfg, work_dir, pth=model_checkpoint, + device=device) +``` + +## 模型规范 + +在使用转换后的模型进行推理之前,有必要了解转换结果的结构。 它存放在 `--work-dir` 指定的路路径下。 + +上例中的`mmdeploy_models/mmyolo/onnx`,结构如下: + +``` +mmdeploy_models/mmyolo/onnx +├── deploy.json +├── detail.json +├── end2end.onnx +└── pipeline.json +``` + +重要的是: + +- **end2end.onnx**: 推理引擎文件。可用 ONNX Runtime 推理 +- ***xxx*.json**: mmdeploy SDK 推理所需的 meta 信息 + +整个文件夹被定义为**mmdeploy SDK model**。换言之,**mmdeploy SDK model**既包括推理引擎,也包括推理 meta 信息。 + +## 模型推理 + +### 后端模型推理 + +以上述模型转换后的 `end2end.onnx` 为例,你可以使用如下代码进行推理: + +```python +from mmdeploy.apis.utils import build_task_processor +from mmdeploy.utils import get_input_shape, load_config +import torch + +deploy_cfg = 'configs/deploy/detection_onnxruntime_dynamic.py' +model_cfg = 'configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py' +device = 'cpu' +backend_model = ['mmdeploy_models/mmyolo/onnx/end2end.onnx'] +image = 'demo/demo.jpg' + +# read deploy_cfg and model_cfg +deploy_cfg, model_cfg = load_config(deploy_cfg, model_cfg) + +# build task and backend model +task_processor = build_task_processor(model_cfg, deploy_cfg, device) +model = task_processor.build_backend_model(backend_model) + +# process input image +input_shape = get_input_shape(deploy_cfg) +model_inputs, _ = task_processor.create_input(image, input_shape) + +# do model inference +with torch.no_grad(): + result = model.test_step(model_inputs) + +# visualize results +task_processor.visualize( + image=image, + model=model, + result=result[0], + window_name='visualize', + output_file='work_dir/output_detection.png') +``` + +运行上述代码后,你可以在 `work_dir` 中看到推理的结果图片 `output_detection.png`。 + +### SDK模型推理 + +你也可以参考如下代码,对 SDK model 进行推理: + +```python +from mmdeploy_runtime import Detector +import cv2 + +img = cv2.imread('demo/demo.jpg') + +# create a detector +detector = Detector(model_path='mmdeploy_models/mmyolo/onnx', + device_name='cpu', device_id=0) +# perform inference +bboxes, labels, masks = detector(img) + +# visualize inference result +indices = [i for i in range(len(bboxes))] +for index, bbox, label_id in zip(indices, bboxes, labels): + [left, top, right, bottom], score = bbox[0:4].astype(int), bbox[4] + if score < 0.3: + continue + + cv2.rectangle(img, (left, top), (right, bottom), (0, 255, 0)) + +cv2.imwrite('work_dir/output_detection.png', img) +``` + +除了python API,mmdeploy SDK 还提供了诸如 C、C++、C#、Java等多语言接口。 +你可以参考[样例](https://github.com/open-mmlab/mmdeploy/tree/main/demo)学习其他语言接口的使用方法。 + +## 模型评测 + +当您将 PyTorch 模型转换为后端支持的模型后,您可能需要验证模型的精度,使用 `${MMDEPLOY_DIR}/tools/test.py` + +```shell +python3 ${MMDEPLOY_DIR}/tools/test.py \ + ${DEPLOY_CFG} \ + ${MODEL_CFG} \ + --model ${BACKEND_MODEL_FILES} \ + --device ${DEVICE} \ + --work-dir ${WORK_DIR} \ + [--cfg-options ${CFG_OPTIONS}] \ + [--show] \ + [--show-dir ${OUTPUT_IMAGE_DIR}] \ + [--interval ${INTERVAL}] \ + [--wait-time ${WAIT_TIME}] \ + [--log2file work_dirs/output.txt] + [--speed-test] \ + [--warmup ${WARM_UP}] \ + [--log-interval ${LOG_INTERVERL}] \ + [--batch-size ${BATCH_SIZE}] \ + [--uri ${URI}] +``` + +### 参数描述 + +- `deploy_cfg`: 部署配置文件。 +- `model_cfg`: MMYOLO 模型配置文件。 +- `--model`: 导出的后端模型。 例如, 如果我们导出了 TensorRT 模型,我们需要传入后缀为 ".engine" 文件路径。 +- `--device`: 运行模型的设备。请注意,某些后端会限制设备。例如,TensorRT 必须在 cuda 
上运行。 +- `--work-dir`: 模型转换、报告生成的路径。 +- `--cfg-options`: 传入额外的配置,将会覆盖当前部署配置。 +- `--show`: 是否在屏幕上显示评估结果。 +- `--show-dir`: 保存评估结果的目录。(只有给出这个参数才会保存结果)。 +- `--interval`: 屏幕上显示评估结果的间隔。 +- `--wait-time`: 每个窗口的显示时间 +- `--log2file`: 将评估结果(和速度)记录到文件中。 +- `--speed-test`: 是否开启速度测试。 +- `--warmup`: 在计算推理时间之前进行预热,需要先开启 `speed-test`。 +- `--log-interval`: 每个日志之间的间隔,需要先设置 `speed-test`。 +- `--batch-size`: 推理的批量大小,它将覆盖数据配置中的 `samples_per_gpu`。默认为 `1`。请注意,并非所有模型都支持 `batch_size > 1`。 +- `--uri`: 在边缘设备上推理时的 ipv4 或 ipv6 端口号。 + +注意:`${MMDEPLOY_DIR}/tools/test.py` 中的其他参数用于速度测试。他们不影响评估。 diff --git a/third_party/mmyolo/docs/zh_cn/recommended_topics/deploy/mmdeploy_yolov5.md b/third_party/mmyolo/docs/zh_cn/recommended_topics/deploy/mmdeploy_yolov5.md new file mode 100644 index 0000000000000000000000000000000000000000..e035e17641559853ae44c59b16c2c151f7073abd --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/recommended_topics/deploy/mmdeploy_yolov5.md @@ -0,0 +1,572 @@ +# YOLOv5 部署全流程说明 + +请先参考 [`部署必备指南`](./mmdeploy_guide.md) 了解部署配置文件等相关信息。 + +## 模型训练和测试 + +模型训练和测试请参考 [YOLOv5 从入门到部署全流程](./mmdeploy_yolov5.md) 。 + +## 准备 MMDeploy 运行环境 + +安装 `MMDeploy` 请参考 [`源码手动安装`](https://github.com/open-mmlab/mmdeploy/blob/dev-1.x/docs/zh_cn/01-how-to-build/build_from_source.md) ,选择您所使用的平台编译 `MMDeploy` 和自定义算子。 + +*注意!* 如果环境安装有问题,可以查看 [`MMDeploy FAQ`](https://github.com/open-mmlab/mmdeploy/blob/dev-1.x/docs/zh_cn/faq.md) 或者在 `issuse` 中提出您的问题。 + +## 准备模型配置文件 + +本例将以基于 `coco` 数据集预训练的 `YOLOv5` 配置和权重进行部署的全流程讲解,包括静态/动态输入模型导出和推理,`TensorRT` / `ONNXRuntime` 两种后端部署和测试。 + +### 静态输入配置 + +#### (1) 模型配置文件 + +当您需要部署静态输入模型时,您应该确保模型的输入尺寸是固定的,比如在测试流程或测试数据集加载时输入尺寸为 `640x640`。 + +您可以查看 [`yolov5_s-static.py`](https://github.com/open-mmlab/mmyolo/tree/main/configs/deploy/model/yolov5_s-static.py) 中测试流程或测试数据集加载部分,如下所示: + +```python +_base_ = '../../yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py' + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict( + type='LetterResize', + scale=_base_.img_scale, + allow_scale_up=False, + use_mini_pad=False, + ), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] + +test_dataloader = dict( + dataset=dict(pipeline=test_pipeline, batch_shapes_cfg=None)) +``` + +由于 `yolov5` 在测试时会开启 `allow_scale_up` 和 `use_mini_pad` 改变输入图像的尺寸来取得更高的精度,但是会给部署静态输入模型造成输入尺寸不匹配的问题。 + +该配置相比与原始配置文件进行了如下修改: + +- 关闭 `test_pipline` 中改变尺寸相关的配置,如 `LetterResize` 中 `allow_scale_up=False` 和 `use_mini_pad=False` 。 +- 关闭 `test_dataloader` 中 `batch shapes` 策略,即 `batch_shapes_cfg=None` 。 + +#### (2) 部署配置文件 + +当您部署在 `ONNXRuntime` 时,您可以查看 [`detection_onnxruntime_static.py`](https://github.com/open-mmlab/mmyolo/tree/main/configs/deploy/detection_onnxruntime_static.py) ,如下所示: + +```python +_base_ = ['./base_static.py'] +codebase_config = dict( + type='mmyolo', + task='ObjectDetection', + model_type='end2end', + post_processing=dict( + score_threshold=0.05, + confidence_threshold=0.005, + iou_threshold=0.5, + max_output_boxes_per_class=200, + pre_top_k=5000, + keep_top_k=100, + background_label_id=-1), + module=['mmyolo.deploy']) +backend_config = dict(type='onnxruntime') +``` + +默认配置中的 `post_processing` 后处理参数是当前模型与 `pytorch` 模型精度对齐的配置,若您需要修改相关参数,可以参考 [`部署必备指南`](./mmdeploy_guide.md) 的详细介绍。 + +当您部署在 `TensorRT` 时,您可以查看 [`detection_tensorrt_static-640x640.py`](https://github.com/open-mmlab/mmyolo/tree/main/configs/deploy/detection_tensorrt_static-640x640.py) ,如下所示: + +```python 
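+# 以下配置摘自 detection_tensorrt_static-640x640.py,用于 TensorRT 静态输入部署
+# input_shape、fp16_mode、max_workspace_size 等参数的含义见本段代码后的说明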
+_base_ = ['./base_static.py'] +onnx_config = dict(input_shape=(640, 640)) +backend_config = dict( + type='tensorrt', + common_config=dict(fp16_mode=False, max_workspace_size=1 << 30), + model_inputs=[ + dict( + input_shapes=dict( + input=dict( + min_shape=[1, 3, 640, 640], + opt_shape=[1, 3, 640, 640], + max_shape=[1, 3, 640, 640]))) + ]) +use_efficientnms = False +``` + +本例使用了默认的输入尺寸 `input_shape=(640, 640)` ,构建网络以 `fp32` 模式即 `fp16_mode=False`,并且默认构建 `TensorRT` 构建引擎所使用的显存 `max_workspace_size=1 << 30` 即最大为 `1GB` 显存。 + +### 动态输入配置 + +#### (1) 模型配置文件 + +当您需要部署动态输入模型时,模型的输入可以为任意尺寸(`TensorRT` 会限制最小和最大输入尺寸),因此使用默认的 [`yolov5_s-v61_syncbn_8xb16-300e_coco.py`](https://github.com/open-mmlab/mmyolo/tree/main/configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py) 模型配置文件即可,其中数据处理和数据集加载器部分如下所示: + +```python +batch_shapes_cfg = dict( + type='BatchShapePolicy', + batch_size=val_batch_size_per_gpu, + img_size=img_scale[0], + size_divisor=32, + extra_pad_ratio=0.5) + +test_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=False, + pad_val=dict(img=114)), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] + +val_dataloader = dict( + batch_size=val_batch_size_per_gpu, + num_workers=val_num_workers, + persistent_workers=persistent_workers, + pin_memory=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + test_mode=True, + data_prefix=dict(img='val2017/'), + ann_file='annotations/instances_val2017.json', + pipeline=test_pipeline, + batch_shapes_cfg=batch_shapes_cfg)) +``` + +其中 `LetterResize` 类初始化传入了 `allow_scale_up=False` 控制输入的小图像是否上采样,同时默认 `use_mini_pad=False` 关闭了图片最小填充策略,`val_dataloader['dataset']`中传入了 `batch_shapes_cfg=batch_shapes_cfg`,即按照 `batch` 内的输入尺寸进行最小填充。上述策略会改变输入图像的尺寸,因此动态输入模型在测试时会按照上述数据集加载器动态输入。 + +#### (2) 部署配置文件 + +当您部署在 `ONNXRuntime` 时,您可以查看 [`detection_onnxruntime_dynamic.py`](https://github.com/open-mmlab/mmyolo/blob/main/configs/deploy/detection_onnxruntime_dynamic.py) ,如下所示: + +```python +_base_ = ['./base_dynamic.py'] +codebase_config = dict( + type='mmyolo', + task='ObjectDetection', + model_type='end2end', + post_processing=dict( + score_threshold=0.05, + confidence_threshold=0.005, + iou_threshold=0.5, + max_output_boxes_per_class=200, + pre_top_k=5000, + keep_top_k=100, + background_label_id=-1), + module=['mmyolo.deploy']) +backend_config = dict(type='onnxruntime') +``` + +与静态输入配置仅有 `_base_ = ['./base_dynamic.py']` 不同,动态输入会额外继承 `dynamic_axes` 属性。其他配置与静态输入配置相同。 + +当您部署在 `TensorRT` 时,您可以查看 [`detection_tensorrt_dynamic-192x192-960x960.py`](https://github.com/open-mmlab/mmyolo/tree/main/configs/deploy/detection_tensorrt_dynamic-192x192-960x960.py) ,如下所示: + +```python +_base_ = ['./base_dynamic.py'] +backend_config = dict( + type='tensorrt', + common_config=dict(fp16_mode=False, max_workspace_size=1 << 30), + model_inputs=[ + dict( + input_shapes=dict( + input=dict( + min_shape=[1, 3, 192, 192], + opt_shape=[1, 3, 640, 640], + max_shape=[1, 3, 960, 960]))) + ]) +use_efficientnms = False +``` + +本例构建网络以 `fp32` 模式即 `fp16_mode=False`,构建 `TensorRT` 构建引擎所使用的显存 `max_workspace_size=1 << 30` 即最大为 `1GB` 显存。 + +同时默认配置 `min_shape=[1, 3, 192, 192]`,`opt_shape=[1, 3, 640, 640]` ,`max_shape=[1, 3, 960, 960]` 
,意为该模型所能接受的输入尺寸最小为 `192x192` ,最大为 `960x960`,最常见尺寸为 `640x640`。 + +当您部署自己的模型时,需要根据您的输入图像尺寸进行调整。 + +## 模型转换 + +本教程所使用的 `MMDeploy` 根目录为 `/home/openmmlab/dev/mmdeploy`,请注意修改为您的 `MMDeploy` 目录。 +预训练权重下载于 [yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth](https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth) ,保存在本地的 `/home/openmmlab/dev/mmdeploy/yolov5s.pth`。 + +```shell +wget https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth -O /home/openmmlab/dev/mmdeploy/yolov5s.pth +``` + +命令行执行以下命令配置相关路径: + +```shell +export MMDEPLOY_DIR=/home/openmmlab/dev/mmdeploy +export PATH_TO_CHECKPOINTS=/home/openmmlab/dev/mmdeploy/yolov5s.pth +``` + +### YOLOv5 静态输入模型导出 + +#### ONNXRuntime + +```shell +python3 ${MMDEPLOY_DIR}/tools/deploy.py \ + configs/deploy/detection_onnxruntime_static.py \ + configs/deploy/model/yolov5_s-static.py \ + ${PATH_TO_CHECKPOINTS} \ + demo/demo.jpg \ + --work-dir work_dir \ + --show \ + --device cpu +``` + +#### TensorRT + +```shell +python3 ${MMDEPLOY_DIR}/tools/deploy.py \ + configs/deploy/detection_tensorrt_static-640x640.py \ + configs/deploy/model/yolov5_s-static.py \ + ${PATH_TO_CHECKPOINTS} \ + demo/demo.jpg \ + --work-dir work_dir \ + --show \ + --device cuda:0 +``` + +### YOLOv5 动态输入模型导出 + +#### ONNXRuntime + +```shell +python3 ${MMDEPLOY_DIR}/tools/deploy.py \ + configs/deploy/detection_onnxruntime_dynamic.py \ + configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py \ + ${PATH_TO_CHECKPOINTS} \ + demo/demo.jpg \ + --work-dir work_dir \ + --show \ + --device cpu + --dump-info +``` + +#### TensorRT + +```shell +python3 ${MMDEPLOY_DIR}/tools/deploy.py \ + configs/deploy/detection_tensorrt_dynamic-192x192-960x960.py \ + configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py \ + ${PATH_TO_CHECKPOINTS} \ + demo/demo.jpg \ + --work-dir work_dir \ + --show \ + --device cuda:0 + --dump-info +``` + +当您使用上述命令转换模型时,您将会在 `work_dir` 文件夹下发现以下文件: + +![image](https://github.com/open-mmlab/mmdeploy/assets/110151316/760f3f7f-aa23-46cf-987c-717d3490246f) + +或者 + +![image](https://github.com/open-mmlab/mmdeploy/assets/110151316/732bcd9a-fca0-40ba-b5af-540a47eb9c35) + +在导出 `onnxruntime`模型后,您将得到图1的六个文件,其中 `end2end.onnx` 表示导出的`onnxruntime`模型,`xxx.json` 表示 `MMDeploy SDK` 推理所需要的 meta 信息。 + +在导出 `TensorRT`模型后,您将得到图2的七个文件,其中 `end2end.onnx` 表示导出的中间模型,`MMDeploy`利用该模型自动继续转换获得 `end2end.engine` 模型用于 `TensorRT `部署,`xxx.json` 表示 `MMDeploy SDK` 推理所需要的 meta 信息。 + +## 模型评测 + +当您转换模型成功后,可以使用 `${MMDEPLOY_DIR}/tools/test.py` 工具对转换后的模型进行评测。下面是对 `ONNXRuntime` 和 `TensorRT` 静态模型的评测,动态模型评测修改传入模型配置即可。 + +### ONNXRuntime + +```shell +python3 ${MMDEPLOY_DIR}/tools/test.py \ + configs/deploy/detection_onnxruntime_static.py \ + configs/deploy/model/yolov5_s-static.py \ + --model work_dir/end2end.onnx \ + --device cpu \ + --work-dir work_dir +``` + +执行完成您将看到命令行输出检测结果指标如下: + +![image](https://user-images.githubusercontent.com/92794867/199380483-cf8d867b-7309-4994-938a-f743f4cada77.png) + +### TensorRT + +**注意**: TensorRT 需要执行设备是 `cuda` + +```shell +python3 ${MMDEPLOY_DIR}/tools/test.py \ + configs/deploy/detection_tensorrt_static-640x640.py \ + configs/deploy/model/yolov5_s-static.py \ + --model work_dir/end2end.engine \ + --device cuda:0 \ + --work-dir work_dir +``` + +执行完成您将看到命令行输出检测结果指标如下: + 
+![image](https://user-images.githubusercontent.com/92794867/199380370-da15cfca-2723-4e5b-b6cf-0afb5f44a66a.png) + +**未来我们将会支持模型测速等更加实用的脚本** + +# 使用 Docker 部署测试 + +`MMYOLO` 提供了一个 [`Dockerfile`](https://github.com/open-mmlab/mmyolo/blob/main/docker/Dockerfile_deployment) 用于构建镜像。请确保您的 `docker` 版本大于等于 `19.03`。 + +温馨提示;国内用户建议取消掉 [`Dockerfile`](https://github.com/open-mmlab/mmyolo/blob/main/docker/Dockerfile_deployment) 里面 `Optional` 后两行的注释,可以获得火箭一般的下载提速: + +```dockerfile +# (Optional) +RUN sed -i 's/http:\/\/archive.ubuntu.com\/ubuntu\//http:\/\/mirrors.aliyun.com\/ubuntu\//g' /etc/apt/sources.list && \ + pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple +``` + +构建命令: + +```bash +# build an image with PyTorch 1.12, CUDA 11.6, TensorRT 8.2.4 ONNXRuntime 1.8.1 +docker build -f docker/Dockerfile_deployment -t mmyolo:v1 . +``` + +用以下命令运行 Docker 镜像: + +```bash +export DATA_DIR=/path/to/your/dataset +docker run --gpus all --shm-size=8g -it --name mmyolo -v ${DATA_DIR}:/openmmlab/mmyolo/data/coco mmyolo:v1 +``` + +`DATA_DIR` 是 COCO 数据的路径。 + +复制以下脚本到 `docker` 容器 `/openmmlab/mmyolo/script.sh`: + +```bash +#!/bin/bash +wget -q https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth \ + -O yolov5s.pth +export MMDEPLOY_DIR=/openmmlab/mmdeploy +export PATH_TO_CHECKPOINTS=/openmmlab/mmyolo/yolov5s.pth + +python3 ${MMDEPLOY_DIR}/tools/deploy.py \ + configs/deploy/detection_tensorrt_static-640x640.py \ + configs/deploy/model/yolov5_s-static.py \ + ${PATH_TO_CHECKPOINTS} \ + demo/demo.jpg \ + --work-dir work_dir_trt \ + --device cuda:0 + +python3 ${MMDEPLOY_DIR}/tools/test.py \ + configs/deploy/detection_tensorrt_static-640x640.py \ + configs/deploy/model/yolov5_s-static.py \ + --model work_dir_trt/end2end.engine \ + --device cuda:0 \ + --work-dir work_dir_trt + +python3 ${MMDEPLOY_DIR}/tools/deploy.py \ + configs/deploy/detection_onnxruntime_static.py \ + configs/deploy/model/yolov5_s-static.py \ + ${PATH_TO_CHECKPOINTS} \ + demo/demo.jpg \ + --work-dir work_dir_ort \ + --device cpu + +python3 ${MMDEPLOY_DIR}/tools/test.py \ + configs/deploy/detection_onnxruntime_static.py \ + configs/deploy/model/yolov5_s-static.py \ + --model work_dir_ort/end2end.onnx \ + --device cpu \ + --work-dir work_dir_ort +``` + +在 `/openmmlab/mmyolo` 下运行: + +```bash +sh script.sh +``` + +脚本会自动下载 `MMYOLO` 的 `YOLOv5` 预训练权重并使用 `MMDeploy` 进行模型转换和测试。您将会看到以下输出: + +- TensorRT: + + ![image](https://user-images.githubusercontent.com/92794867/199657349-1bad9196-c00b-4a65-84f5-80f51e65a2bd.png) + +- ONNXRuntime: + + ![image](https://user-images.githubusercontent.com/92794867/199657283-95412e84-3ba4-463f-b4b2-4bf52ec4acbd.png) + +可以看到,经过 `MMDeploy` 部署的模型与 [MMYOLO-YOLOv5](https://github.com/open-mmlab/mmyolo/tree/main/configs/yolov5#results-and-models) 的 mAP-37.7 差距在 1% 以内。 + +如果您需要测试您的模型推理速度,可以使用以下命令: + +- TensorRT + +```shell +python3 ${MMDEPLOY_DIR}/tools/profiler.py \ + configs/deploy/detection_tensorrt_static-640x640.py \ + configs/deploy/model/yolov5_s-static.py \ + data/coco/val2017 \ + --model work_dir_trt/end2end.engine \ + --device cuda:0 +``` + +- ONNXRuntime + +```shell +python3 ${MMDEPLOY_DIR}/tools/profiler.py \ + configs/deploy/detection_onnxruntime_static.py \ + configs/deploy/model/yolov5_s-static.py \ + data/coco/val2017 \ + --model work_dir_ort/end2end.onnx \ + --device cpu +``` + +## 模型推理 + +### 后端模型推理 + +#### ONNXRuntime + +以上述模型转换后的 `end2end.onnx` 为例,您可以使用如下代码进行推理: + +```python +from 
mmdeploy.apis.utils import build_task_processor +from mmdeploy.utils import get_input_shape, load_config +import torch + +deploy_cfg = './configs/deploy/detection_onnxruntime_dynamic.py' +model_cfg = '../mmyolo/configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py' +device = 'cpu' +backend_model = ['./work_dir/end2end.onnx'] +image = '../mmyolo/demo/demo.jpg' + +# read deploy_cfg and model_cfg +deploy_cfg, model_cfg = load_config(deploy_cfg, model_cfg) + +# build task and backend model +task_processor = build_task_processor(model_cfg, deploy_cfg, device) +model = task_processor.build_backend_model(backend_model) + +# process input image +input_shape = get_input_shape(deploy_cfg) +model_inputs, _ = task_processor.create_input(image, input_shape) + +# do model inference +with torch.no_grad(): + result = model.test_step(model_inputs) + +# visualize results +task_processor.visualize( + image=image, + model=model, + result=result[0], + window_name='visualize', + output_file='work_dir/output_detection.png') +``` + +#### TensorRT + +以上述模型转换后的 `end2end.engine` 为例,您可以使用如下代码进行推理: + +```python +from mmdeploy.apis.utils import build_task_processor +from mmdeploy.utils import get_input_shape, load_config +import torch + +deploy_cfg = './configs/deploy/detection_tensorrt_dynamic-192x192-960x960.py' +model_cfg = '../mmyolo/configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py' +device = 'cuda:0' +backend_model = ['./work_dir/end2end.engine'] +image = '../mmyolo/demo/demo.jpg' + +# read deploy_cfg and model_cfg +deploy_cfg, model_cfg = load_config(deploy_cfg, model_cfg) + +# build task and backend model +task_processor = build_task_processor(model_cfg, deploy_cfg, device) +model = task_processor.build_backend_model(backend_model) + +# process input image +input_shape = get_input_shape(deploy_cfg) +model_inputs, _ = task_processor.create_input(image, input_shape) + +# do model inference +with torch.no_grad(): + result = model.test_step(model_inputs) + +# visualize results +task_processor.visualize( + image=image, + model=model, + result=result[0], + window_name='visualize', + output_file='work_dir/output_detection.png') +``` + +### SDK 模型推理 + +#### ONNXRuntime + +以上述模型转换后的 `end2end.onnx` 为例,您可以使用如下代码进行 `SDK` 推理: + +```python +from mmdeploy_runtime import Detector +import cv2 + +img = cv2.imread('../mmyolo/demo/demo.jpg') + +# create a detector +detector = Detector(model_path='work_dir', + device_name='cpu', device_id=0) +# perform inference +bboxes, labels, masks = detector(img) + +# visualize inference result +indices = [i for i in range(len(bboxes))] +for index, bbox, label_id in zip(indices, bboxes, labels): + [left, top, right, bottom], score = bbox[0:4].astype(int), bbox[4] + if score < 0.3: + continue + + cv2.rectangle(img, (left, top), (right, bottom), (0, 255, 0)) + +cv2.imwrite('work_dir/output_detection.png', img) +``` + +#### TensorRT + +以上述模型转换后的 `end2end.engine` 为例,您可以使用如下代码进行 `SDK` 推理: + +```python +from mmdeploy_runtime import Detector +import cv2 + +img = cv2.imread('../mmyolo/demo/demo.jpg') + +# create a detector +detector = Detector(model_path='work_dir', + device_name='cuda', device_id=0) +# perform inference +bboxes, labels, masks = detector(img) + +# visualize inference result +indices = [i for i in range(len(bboxes))] +for index, bbox, label_id in zip(indices, bboxes, labels): + [left, top, right, bottom], score = bbox[0:4].astype(int), bbox[4] + if score < 0.3: + continue + + cv2.rectangle(img, (left, top), (right, bottom), (0, 255, 0)) + 
+cv2.imwrite('work_dir/output_detection.png', img) +``` + +除了python API,mmdeploy SDK 还提供了诸如 C、C++、C#、Java等多语言接口。 +你可以参考[样例](https://github.com/open-mmlab/mmdeploy/tree/main/demo)学习其他语言接口的使用方法。 diff --git a/third_party/mmyolo/docs/zh_cn/recommended_topics/labeling_to_deployment_tutorials.md b/third_party/mmyolo/docs/zh_cn/recommended_topics/labeling_to_deployment_tutorials.md new file mode 100644 index 0000000000000000000000000000000000000000..d4e3ddf8f6a107d19cf48f5677d75f47a7b7351c --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/recommended_topics/labeling_to_deployment_tutorials.md @@ -0,0 +1,1326 @@ +# 标注+训练+测试+部署全流程 + +在平时的工作学习中,我们经常会遇到一些任务需要训练自定义的私有数据集,开源数据集去作为上线模型的场景比较少,这就需要我们对自己的私有数据集进行一系列的操作,以确保模型能够上线生产服务于客户。 + +```{SeeAlso} +本文档配套的视频已发布在 B 站,可前去查看: [自定义数据集从标注到部署保姆级教程](https://www.bilibili.com/video/BV1RG4y137i5) +``` + +```{Note} +本教程所有指令是在 Linux 上面完成,Windows 也是完全可用的,但是命令和操作稍有不同。 +``` + +本教程默认您已经完成 MMYOLO 的安装,如果未安装,请参考文档 [开始你的第一步](https://mmyolo.readthedocs.io/zh_CN/latest/get_started.html#id1) 进行安装。 + +本教程涵盖从 用户自定义图片数据集标注 到 最终进行训练和部署 的整体流程。步骤概览如下: + +01. 数据集准备:`tools/misc/download_dataset.py` +02. 使用 [labelme](https://github.com/wkentaro/labelme) 和算法进行辅助和优化数据集标注:`demo/image_demo.py` + labelme +03. 使用脚本转换成 COCO 数据集格式:`tools/dataset_converters/labelme2coco.py` +04. 数据集划分为训练集、验证集和测试集:`tools/misc/coco_split.py` +05. 根据数据集内容新建 config 文件 +06. 数据集可视化分析:`tools/analysis_tools/dataset_analysis.py` +07. 优化 Anchor 尺寸:`tools/analysis_tools/optimize_anchors.py` +08. 可视化 config 配置中数据处理部分: `tools/analysis_tools/browse_dataset.py` +09. 训练:`tools/train.py` +10. 推理:`demo/image_demo.py` +11. 部署 + +```{Note} +在训练得到模型权重和验证集的 mAP 后,用户需要对预测错误的 bad case 进行深入分析,以便优化模型,MMYOLO 在后续会增加这个功能,敬请期待。 +``` + +下面详细介绍每一步。 + +## 1. 数据集准备 + +- 如果您现在暂时没有自己的数据集,亦或者想尝试用一个小型数据集来跑通我们的整体流程,可以使用本教程提供的一个 144 张图片的 `cat` 数据集(本 `cat` 数据集由 @RangeKing 提供原始图片,由 @PeterH0323 进行数据清洗)。本教程的剩余部分都将以此 `cat` 数据集为例进行讲解。 + +
+cat dataset +
+ +下载也非常简单,只需要一条命令即可完成(数据集压缩包大小 `217 MB`): + +```shell +python tools/misc/download_dataset.py --dataset-name cat --save-dir ./data/cat --unzip --delete +``` + +该命令会自动下载数据集到 `./data/cat` 文件夹中,该文件的目录结构是: + +```shell +. +└── ./data/cat + ├── images # 图片文件 + │ ├── image1.jpg + │ ├── image2.png + │ └── ... + ├── labels # labelme 标注文件 + │ ├── image1.json + │ ├── image2.json + │ └── ... + ├── annotations # 数据集划分的 COCO 文件 + │ ├── annotations_all.json # 全量数据的 COCO label 文件 + │ ├── trainval.json # 划分比例 80% 的数据 + │ └── test.json # 划分比例 20% 的数据 + └── class_with_id.txt # id + class_name 文件 +``` + +这个数据集可以直接训练,如果您想体验整个流程的话,可以将 `images` 文件夹**以外的**其余文件都删除。 + +- 如您已经有数据,可以将其组成下面的结构: + +```shell +. +└── $DATA_ROOT + └── images + ├── image1.jpg + ├── image2.png + └── ... +``` + +## 2. 使用 labelme 和算法进行辅助和优化数据集标注 + +通常,标注有 2 种方法: + +- 软件或者算法辅助 + 人工修正 label(推荐,降本提速) +- 仅人工标注 + +```{Note} +目前我们也在考虑接入第三方库来支持通过 GUI 界面调用 MMYOLO 推理接口实现算法辅助标注和人工优化标注一体功能。 +如果您有兴趣或者想法可以在 issue 留言或直接联系我们! +``` + +### 2.1 软件或者算法辅助 + 人工修正 label + +辅助标注的原理是用已有模型进行推理,将得出的推理信息保存为标注软件 label 文件格式。然后人工操作标注软件加载生成好的 label 文件,只需要检查每张图片的目标是否标准,以及是否有漏掉、错标的目标。【软件或者算法辅助 + 人工修正 label】这种方式可以节省很多时间和精力,达到**降本提速**的目的。 + +```{Note} +如果已有模型(典型的如 COCO 预训练模型)没有您自定义新数据集的类别,建议先人工打 100 张左右的图片 label,训练个初始模型,然后再进行辅助标注。 +``` + +下面会分别介绍其过程: + +#### 2.1.1 软件或者算法辅助 + +使用 MMYOLO 提供的模型推理脚本 `demo/image_demo.py`,并设置 `--to-labelme` 则可以将推理结果生成 labelme 格式的 label 文件,具体用法如下: + +```shell +python demo/image_demo.py img \ + config \ + checkpoint + [--out-dir OUT_DIR] \ + [--device DEVICE] \ + [--show] \ + [--deploy] \ + [--score-thr SCORE_THR] \ + [--class-name CLASS_NAME] + [--to-labelme] +``` + +其中: + +- `img`: 图片的路径,支持文件夹、文件、URL; +- `config`:用到的模型 config 文件路径; +- `checkpoint`:用到的模型权重文件路径; +- `--out-dir`:推理结果输出到指定目录下,默认为 `./output`,当 `--show` 参数存在时,不保存检测结果; +- `--device`:使用的计算资源,包括 `CUDA`, `CPU` 等,默认为 `cuda:0`; +- `--show`:使用该参数表示在屏幕上显示检测结果,默认为 `False`; +- `--deploy`:是否切换成 deploy 模式; +- `--score-thr`:置信度阈值,默认为 `0.3`; +- `--to-labelme`:是否导出 `labelme` 格式的 label 文件,不可以与 `--show` 参数同时存在 + +例子: + +这里使用 YOLOv5-s 作为例子来进行辅助标注刚刚下载的 `cat` 数据集,先下载 YOLOv5-s 的权重: + +```shell +mkdir work_dirs +wget https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth -P ./work_dirs +``` + +由于 COCO 80 类数据集中已经包括了 `cat` 这一类,因此我们可以直接加载 COCO 预训练权重进行辅助标注。 + +```shell +python demo/image_demo.py ./data/cat/images \ + ./configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py \ + ./work_dirs/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth \ + --out-dir ./data/cat/labels \ + --class-name cat \ + --to-labelme +``` + +```{Tip} +- 如果您的数据集需要标注多类,可以采用类似 `--class-name class1 class2` 格式输入; +- 如果全部输出,则删掉 `--class-name` 这个 flag 即可全部类都输出。 +``` + +生成的 label 文件会在 `--out-dir` 中: + +```shell +. +└── $OUT_DIR + ├── image1.json + ├── image1.json + └── ... +``` + +这是一张原图及其生成的 json 例子: + +
+ 图片 + 图片 +
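+
+在进行人工修正之前,也可以先用一小段脚本粗略检查生成的 label 文件是否符合预期,例如每张图片的目标个数和类别名称。下面是一个简单的示意脚本(假设生成的 JSON 遵循 labelme 的通用字段 `shapes`、`label`,目录请按实际情况替换):
+
+```python
+import json
+from pathlib import Path
+
+# 假设为上文 --out-dir 指定的输出目录
+label_dir = Path('./data/cat/labels')
+
+for json_file in sorted(label_dir.glob('*.json')):
+    with open(json_file, encoding='utf-8') as f:
+        data = json.load(f)
+    # labelme 格式中,每个目标对应 shapes 列表中的一个元素
+    labels = [shape['label'] for shape in data.get('shapes', [])]
+    print(f'{json_file.name}: {len(labels)} 个目标,类别: {sorted(set(labels))}')
+```
+
+如果发现某张图片目标个数为 0 或类别异常,可以在下一步人工修正时重点检查这些图片。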
+ +#### 2.1.2 人工标注 + +本教程使用的标注软件是 [labelme](https://github.com/wkentaro/labelme) + +- 安装 labelme + +```shell +conda create -n labelme python=3.8 +conda activate labelme +pip install labelme==5.1.1 +``` + +- 启动 labelme + +```shell +labelme ${图片文件夹路径(即上一步的图片文件夹)} \ + --output ${label文件所处的文件夹路径(即上一步的 --out-dir)} \ + --autosave \ + --nodata +``` + +其中: + +- `--output`:labelme 标注文件保存路径,如果该路径下已经存在部分图片的标注文件,则会进行加载; +- `--autosave`:标注文件自动保存,会略去一些繁琐的保存步骤; +- `--nodata`:每张图片的标注文件中不保存图片的 base64 编码,设置了这个 flag 会大大减少标注文件的大小。 + +例子: + +```shell +cd /path/to/mmyolo +labelme ./data/cat/images --output ./data/cat/labels --autosave --nodata +``` + +输入命令之后 labelme 就会启动,然后进行 label 检查即可。如果 labelme 启动失败,命令行输入 `export QT_DEBUG_PLUGINS=1` 查看具体缺少什么库,安装一下即可。 + +
+label UI +
+ +```{warning} +标注的时候务必使用 `rectangle`,快捷键 `Ctrl + R`(如下图) + +
+rectangle +
+``` + +### 2.2 仅人工标注 + +步骤和 【2.1.2 人工标注】 相同,只是这里是直接标注,没有预先生成的 label 。 + +## 3. 使用脚本转换成 COCO 数据集格式 + +### 3.1 使用脚本转换 + +MMYOLO 提供脚本将 labelme 的 label 转换为 COCO label + +```shell +python tools/dataset_converters/labelme2coco.py --img-dir ${图片文件夹路径} \ + --labels-dir ${label 文件夹位置} \ + --out ${输出 COCO label json 路径} \ + [--class-id-txt ${class_with_id.txt 路径}] +``` + +其中: +`--class-id-txt`:是数据集 `id class_name` 的 `.txt` 文件: + +- 如果不指定,则脚本会自动生成,生成在 `--out` 同级的目录中,保存文件名为 `class_with_id.txt`; +- 如果指定,脚本仅会进行读取但不会新增或者覆盖,同时,脚本里面还会判断是否存在 `.txt` 中其他的类,如果出现了会报错提示,届时,请用户检查 `.txt` 文件并加入新的类及其 `id`。 + +`.txt` 文件的例子如下( `id` 可以和 COCO 一样,从 `1` 开始): + +```text +1 cat +2 dog +3 bicycle +4 motorcycle + +``` + +例子: + +以本教程的 `cat` 数据集为例: + +```shell +python tools/dataset_converters/labelme2coco.py --img-dir ./data/cat/images \ + --labels-dir ./data/cat/labels \ + --out ./data/cat/annotations/annotations_all.json +``` + +本次演示的 `cat` 数据集(注意不需要包括背景类),可以看到生成的 `class_with_id.txt` 中只有 `1` 类: + +```text +1 cat + +``` + +### 3.2 检查转换的 COCO label + +使用下面的命令可以将 COCO 的 label 在图片上进行显示,这一步可以验证刚刚转换是否有问题: + +```shell +python tools/analysis_tools/browse_coco_json.py --img-dir ${图片文件夹路径} \ + --ann-file ${COCO label json 路径} +``` + +例子: + +```shell +python tools/analysis_tools/browse_coco_json.py --img-dir ./data/cat/images \ + --ann-file ./data/cat/annotations/annotations_all.json +``` + +
+Image +
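+
+除了可视化检查外,也可以用 pycocotools 对转换后的 COCO label 做一个简单的数量统计,辅助确认类别和标注没有漏转。以下为示意脚本(假设已通过 `pip install pycocotools` 安装依赖,路径请按实际情况替换):
+
+```python
+from pycocotools.coco import COCO
+
+coco = COCO('./data/cat/annotations/annotations_all.json')
+
+# 打印类别信息,确认与 class_with_id.txt 一致
+for cat in coco.loadCats(coco.getCatIds()):
+    print(f"id={cat['id']}, name={cat['name']}")
+
+# 打印图片和标注数量,确认与数据集规模一致
+print(f'图片数量: {len(coco.getImgIds())}')
+print(f'标注数量: {len(coco.getAnnIds())}')
+```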
+ +```{SeeAlso} +关于 `tools/analysis_tools/browse_coco_json.py` 的更多用法请参考 [可视化 COCO label](https://mmyolo.readthedocs.io/zh_CN/latest/user_guides/useful_tools.html#coco)。 +``` + +## 4. 数据集划分为训练集、验证集和测试集 + +通常,自定义图片都是一个大文件夹,里面全部都是图片,需要我们自己去对图片进行训练集、验证集、测试集的划分,如果数据量比较少,可以不划分验证集。下面是划分脚本的具体用法: + +```shell +python tools/misc/coco_split.py --json ${COCO label json 路径} \ + --out-dir ${划分 label json 保存根路径} \ + --ratios ${划分比例} \ + [--shuffle] \ + [--seed ${划分的随机种子}] +``` + +其中: + +- `--ratios`:划分的比例,如果只设置了 2 个,则划分为 `trainval + test`,如果设置为 3 个,则划分为 `train + val + test`。支持两种格式 —— 整数、小数: + - 整数:按比例进行划分,代码中会进行归一化之后划分数据集。例子: `--ratio 2 1 1`(代码里面会转换成 `0.5 0.25 0.25`) or `--ratio 3 1`(代码里面会转换成 `0.75 0.25`) + - 小数:划分为比例。**如果加起来不为 1 ,则脚本会进行自动归一化修正**。例子: `--ratio 0.8 0.1 0.1` or `--ratio 0.8 0.2` +- `--shuffle`: 是否打乱数据集再进行划分; +- `--seed`:设定划分的随机种子,不设置的话自动生成随机种子。 + +例子: + +```shell +python tools/misc/coco_split.py --json ./data/cat/annotations/annotations_all.json \ + --out-dir ./data/cat/annotations \ + --ratios 0.8 0.2 \ + --shuffle \ + --seed 10 +``` + +
+Image +
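+
+划分完成后,可以用下面的示意脚本快速确认各子集的图片和标注数量是否与设定的划分比例大致一致(这里假设按上文命令划分出 `trainval.json` 和 `test.json` 两个文件):
+
+```python
+import json
+
+for name in ['trainval.json', 'test.json']:
+    with open(f'./data/cat/annotations/{name}', encoding='utf-8') as f:
+        data = json.load(f)
+    # COCO 格式中 images 和 annotations 分别保存图片和标注信息
+    print(f"{name}: {len(data['images'])} 张图片,{len(data['annotations'])} 个标注")
+```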
+ +## 5. 根据数据集内容新建 config 文件 + +确保数据集目录是这样的: + +```shell +. +└── $DATA_ROOT + ├── annotations + │ ├── trainval.json # 根据上面的指令只划分 trainval + test,如果您使用 3 组划分比例的话,这里是 train.json、val.json、test.json + │ └── test.json + ├── images + │ ├── image1.jpg + │ ├── image1.png + │ └── ... + └── ... +``` + +因为是我们自定义的数据集,所以我们需要自己新建一个 config 并加入需要修改的部分信息。 + +关于新的 config 的命名: + +- 这个 config 继承的是 `yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py`; +- 训练的类以本教程提供的数据集中的类 `cat` 为例(如果是自己的数据集,可以自定义类型的总称); +- 本教程测试的显卡型号是 1 x 3080Ti 12G 显存,电脑内存 32G,可以训练 YOLOv5-s 最大批次是 `batch size = 32`(详细机器资料可见附录); +- 训练轮次是 `100 epoch`。 + +综上所述:可以将其命名为 `yolov5_s-v61_syncbn_fast_1xb32-100e_cat.py`,并将其放置在文件夹 `configs/custom_dataset` 中。 + +我们可以在 configs 目录下新建一个新的目录 `custom_dataset`,同时在里面新建该 config 文件,并添加以下内容: + +
+Image +
+ +```python +_base_ = '../yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py' + +max_epochs = 100 # 训练的最大 epoch +data_root = './data/cat/' # 数据集目录的绝对路径 +# data_root = '/root/workspace/mmyolo/data/cat/' # Docker 容器里面数据集目录的绝对路径 + +# 结果保存的路径,可以省略,省略保存的文件名位于 work_dirs 下 config 同名的文件夹中 +# 如果某个 config 只是修改了部分参数,修改这个变量就可以将新的训练文件保存到其他地方 +work_dir = './work_dirs/yolov5_s-v61_syncbn_fast_1xb32-100e_cat' + +# load_from 可以指定本地路径或者 URL,设置了 URL 会自动进行下载,因为上面已经下载过,我们这里设置本地路径 +# 因为本教程是在 cat 数据集上微调,故这里需要使用 `load_from` 来加载 MMYOLO 中的预训练模型,这样可以在加快收敛速度的同时保证精度 +load_from = './work_dirs/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth' # noqa + +# 根据自己的 GPU 情况,修改 batch size,YOLOv5-s 默认为 8卡 x 16bs +train_batch_size_per_gpu = 32 +train_num_workers = 4 # 推荐使用 train_num_workers = nGPU x 4 + +save_epoch_intervals = 2 # 每 interval 轮迭代进行一次保存一次权重 + +# 根据自己的 GPU 情况,修改 base_lr,修改的比例是 base_lr_default * (your_bs / default_bs) +base_lr = _base_.base_lr / 4 + +anchors = [ # 此处已经根据数据集特点更新了 anchor,关于 anchor 的生成,后面小节会讲解 + [(68, 69), (154, 91), (143, 162)], # P3/8 + [(242, 160), (189, 287), (391, 207)], # P4/16 + [(353, 337), (539, 341), (443, 432)] # P5/32 +] + +class_name = ('cat', ) # 根据 class_with_id.txt 类别信息,设置 class_name +num_classes = len(class_name) +metainfo = dict( + classes=class_name, + palette=[(220, 20, 60)] # 画图时候的颜色,随便设置即可 +) + +train_cfg = dict( + max_epochs=max_epochs, + val_begin=20, # 第几个 epoch 后验证,这里设置 20 是因为前 20 个 epoch 精度不高,测试意义不大,故跳过 + val_interval=save_epoch_intervals # 每 val_interval 轮迭代进行一次测试评估 +) + +model = dict( + bbox_head=dict( + head_module=dict(num_classes=num_classes), + prior_generator=dict(base_sizes=anchors), + + # loss_cls 会根据 num_classes 动态调整,但是 num_classes = 1 的时候,loss_cls 恒为 0 + loss_cls=dict(loss_weight=0.5 * + (num_classes / 80 * 3 / _base_.num_det_layers)))) + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + _delete_=True, + type='RepeatDataset', + # 数据量太少的话,可以使用 RepeatDataset ,在每个 epoch 内重复当前数据集 n 次,这里设置 5 是重复 5 次 + times=5, + dataset=dict( + type=_base_.dataset_type, + data_root=data_root, + metainfo=metainfo, + ann_file='annotations/trainval.json', + data_prefix=dict(img='images/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=_base_.train_pipeline))) + +val_dataloader = dict( + dataset=dict( + metainfo=metainfo, + data_root=data_root, + ann_file='annotations/trainval.json', + data_prefix=dict(img='images/'))) + +test_dataloader = val_dataloader + +val_evaluator = dict(ann_file=data_root + 'annotations/trainval.json') +test_evaluator = val_evaluator + +optim_wrapper = dict(optimizer=dict(lr=base_lr)) + +default_hooks = dict( + # 设置间隔多少个 epoch 保存模型,以及保存模型最多几个,`save_best` 是另外保存最佳模型(推荐) + checkpoint=dict( + type='CheckpointHook', + interval=save_epoch_intervals, + max_keep_ckpts=5, + save_best='auto'), + param_scheduler=dict(max_epochs=max_epochs), + # logger 输出的间隔 + logger=dict(type='LoggerHook', interval=10)) + +``` + +```{Note} +我们在 `projects/misc/custom_dataset/yolov5_s-v61_syncbn_fast_1xb32-100e_cat.py` 放了一份相同的 config 文件,用户可以选择复制到 `configs/custom_dataset/yolov5_s-v61_syncbn_fast_1xb32-100e_cat.py` 路径直接开始训练。 +``` + +## 6. 
数据集可视化分析 + +脚本 `tools/analysis_tools/dataset_analysis.py` 能够帮助用户得到数据集的分析图。该脚本可以生成 4 种分析图: + +- 显示类别和 bbox 实例个数的分布图:`show_bbox_num` +- 显示类别和 bbox 实例宽、高的分布图:`show_bbox_wh` +- 显示类别和 bbox 实例宽/高比例的分布图:`show_bbox_wh_ratio` +- 基于面积规则下,显示类别和 bbox 实例面积的分布图:`show_bbox_area` + +脚本使用方式如下: + +```shell +python tools/analysis_tools/dataset_analysis.py ${CONFIG} \ + [--val-dataset ${TYPE}] \ + [--class-name ${CLASS_NAME}] \ + [--area-rule ${AREA_RULE}] \ + [--func ${FUNC}] \ + [--out-dir ${OUT_DIR}] +``` + +例子: + +以本教程 `cat` 数据集 的 config 为例: + +查看训练集数据分布情况: + +```shell +python tools/analysis_tools/dataset_analysis.py configs/custom_dataset/yolov5_s-v61_syncbn_fast_1xb32-100e_cat.py \ + --out-dir work_dirs/dataset_analysis_cat/train_dataset +``` + +查看验证集数据分布情况: + +```shell +python tools/analysis_tools/dataset_analysis.py configs/custom_dataset/yolov5_s-v61_syncbn_fast_1xb32-100e_cat.py \ + --out-dir work_dirs/dataset_analysis_cat/val_dataset \ + --val-dataset +``` + +效果(点击图片可查看大图): + + + + + + + + + + + + + + + + + + + + +
+ 基于面积规则下,显示类别和 bbox 实例面积的分布图 + + 显示类别和 bbox 实例宽、高的分布图 +
+ YOLOv5CocoDataset_bbox_area + + YOLOv5CocoDataset_bbox_wh +
+ 显示类别和 bbox 实例个数的分布图 + + 显示类别和 bbox 实例宽/高比例的分布图 +
+ YOLOv5CocoDataset_bbox_num + + YOLOv5CocoDataset_bbox_ratio +
+ +```{Note} +因为本教程使用的 cat 数据集数量比较少,故 config 里面用了 RepeatDataset,显示的数目实际上都是重复了 5 次。如果您想得到无重复的分析结果,可以暂时将 RepeatDataset 下面的 `times` 参数从 `5` 改成 `1`。 +``` + +经过输出的图片分析可以得出,本教程使用的 `cat` 数据集的训练集具有以下情况: + +- 图片全部是 `large object`; +- 类别 cat 的数量是 `655`; +- bbox 的宽高比例大部分集中在 `1.0 ~ 1.11`,比例最小值是 `0.36`,最大值是 `2.9`; +- bbox 的宽大部分是 `500 ~ 600` 左右,高大部分是 `500 ~ 600` 左右。 + +```{SeeAlso} +关于 `tools/analysis_tools/dataset_analysis.py` 的更多用法请参考 [可视化数据集分析](https://mmyolo.readthedocs.io/zh_CN/latest/user_guides/useful_tools.html#id4)。 +``` + +## 7. 优化 Anchor 尺寸 + +```{Warning} +该步骤仅适用于 anchor-base 的模型,例如 YOLOv5; + +Anchor-free 的模型可以跳过此步骤,例如 YOLOv6、YOLOX。 +``` + +脚本 `tools/analysis_tools/optimize_anchors.py` 支持 YOLO 系列中三种锚框生成方式,分别是 `k-means`、`Differential Evolution`、`v5-k-means`. + +本示例使用的是 YOLOv5 进行训练,使用的是 `640 x 640` 的输入大小,使用 `v5-k-means` 进行描框的优化: + +```shell +python tools/analysis_tools/optimize_anchors.py configs/custom_dataset/yolov5_s-v61_syncbn_fast_1xb32-100e_cat.py \ + --algorithm v5-k-means \ + --input-shape 640 640 \ + --prior-match-thr 4.0 \ + --out-dir work_dirs/dataset_analysis_cat +``` + +```{Note} +因为该命令使用的是 k-means 聚类算法,存在一定的随机性,这与初始化有关。故每次执行得到的 Anchor 都会有些不一样,但是都是基于传递进去的数据集来进行生成的,故不会有什么不良影响。 +``` + +经过计算的 Anchor 如下: + +
+Anchor +
+ +修改 config 文件里面的 `anchors` 变量: + +```python +anchors = [ + [(68, 69), (154, 91), (143, 162)], # P3/8 + [(242, 160), (189, 287), (391, 207)], # P4/16 + [(353, 337), (539, 341), (443, 432)] # P5/32 +] +``` + +```{SeeAlso} +关于 `tools/analysis_tools/optimize_anchors.py` 的更多用法请参考 [优化锚框尺寸](https://mmyolo.readthedocs.io/zh_CN/latest/user_guides/useful_tools.html#id8)。 +``` + +## 8. 可视化 config 配置中数据处理部分 + +脚本 `tools/analysis_tools/browse_dataset.py` 能够帮助用户去直接窗口可视化 config 配置中数据处理部分,同时可以选择保存可视化图片到指定文件夹内。 + +下面演示使用我们刚刚新建的 config 文件 `configs/custom_dataset/yolov5_s-v61_syncbn_fast_1xb32-100e_cat.py` 来可视化图片,该命令会使得图片直接弹出显示,每张图片持续 `3` 秒,图片不进行保存: + +```shell +python tools/analysis_tools/browse_dataset.py configs/custom_dataset/yolov5_s-v61_syncbn_fast_1xb32-100e_cat.py \ + --show-interval 3 +``` + +
+image +
+ +
+image +
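+
+如果只想快速确认 config 中数据处理部分的内容(而不逐张查看可视化图片),也可以用 mmengine 直接加载 config 并打印训练 pipeline,以下为示意(由于本教程的 config 中训练集使用了 `RepeatDataset` 包装,实际的 pipeline 位于 `dataset.dataset` 下):
+
+```python
+from mmengine.config import Config
+
+cfg = Config.fromfile(
+    'configs/custom_dataset/yolov5_s-v61_syncbn_fast_1xb32-100e_cat.py')
+
+# 逐个打印训练数据处理流程中的 transform 类型
+for transform in cfg.train_dataloader.dataset.dataset.pipeline:
+    print(transform['type'])
+```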
+ +```{SeeAlso} +关于 `tools/analysis_tools/browse_dataset.py` 的更多用法请参考 [可视化数据集](https://mmyolo.readthedocs.io/zh_CN/latest/user_guides/useful_tools.html#id3)。 +``` + +## 9. 训练 + +下面会从以下 3 点来进行讲解: + +1. 训练可视化 +2. YOLOv5 模型训练 +3. 切换 YOLO 模型训练 + +### 9.1 训练可视化 + +如果需要采用浏览器对训练过程可视化,MMYOLO 目前提供 2 种方式 [wandb](https://wandb.ai/site) 和 [TensorBoard](https://tensorflow.google.cn/tensorboard),根据自己的情况选择其一即可(后续会扩展更多可视化后端支持)。 + +#### 9.1.1 wandb + +wandb 可视化需要在[官网](https://wandb.ai/site)注册,并在 https://wandb.ai/settings 获取到 wandb 的 API Keys。 + +
+image +
+
+然后在命令行进行安装:
+
+```shell
+pip install wandb
+# 运行 wandb login 后,输入上文中获取到的 API Keys,即可登录成功
+wandb login
+```
+
+
+推理图片 +
+
+在我们刚刚新建的 config 文件 `configs/custom_dataset/yolov5_s-v61_syncbn_fast_1xb32-100e_cat.py` 的最后添加 wandb 配置:
+
+```python
+visualizer = dict(vis_backends=[dict(type='LocalVisBackend'), dict(type='WandbVisBackend')])
+```
+
+#### 9.1.2 TensorBoard
+
+安装 TensorBoard 环境:
+
+```shell
+pip install tensorboard
+```
+
+在我们刚刚新建的 config 文件 `configs/custom_dataset/yolov5_s-v61_syncbn_fast_1xb32-100e_cat.py` 中添加 `tensorboard` 配置:
+
+```python
+visualizer = dict(vis_backends=[dict(type='LocalVisBackend'), dict(type='TensorboardVisBackend')])
+```
+
+运行训练命令后,TensorBoard 文件会生成在可视化文件夹 `work_dirs/yolov5_s-v61_syncbn_fast_1xb32-100e_cat/${TIMESTAMP}/vis_data` 下,
+运行下面的命令便可以在浏览器中使用 TensorBoard 查看 loss、学习率和 coco/bbox_mAP 等可视化数据了:
+
+```shell
+tensorboard --logdir=work_dirs/yolov5_s-v61_syncbn_fast_1xb32-100e_cat
+```
+
+### 9.2 执行训练
+
+使用下面的命令启动训练(训练大约需要 2.5 个小时):
+
+```shell
+python tools/train.py configs/custom_dataset/yolov5_s-v61_syncbn_fast_1xb32-100e_cat.py
+```
+
+如果您开启了 wandb 的话,可以登录到自己的账户,在 wandb 中查看本次训练的详细信息了:
+
+
+Image +
+ +
+Image +
+ +下面是 `1 x 3080Ti`、`batch size = 32`,训练 `100 epoch` 最佳精度权重 `work_dirs/yolov5_s-v61_syncbn_fast_1xb32-100e_cat/best_coco/bbox_mAP_epoch_98.pth` 得出来的精度(详细机器资料可见附录): + +```shell + Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.968 + Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 1.000 + Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 1.000 + Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = -1.000 + Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = -1.000 + Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.968 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.886 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.977 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.977 + Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = -1.000 + Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = -1.000 + Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.977 + +bbox_mAP_copypaste: 0.968 1.000 1.000 -1.000 -1.000 0.968 +Epoch(val) [98][116/116] coco/bbox_mAP: 0.9680 coco/bbox_mAP_50: 1.0000 coco/bbox_mAP_75: 1.0000 coco/bbox_mAP_s: -1.0000 coco/bbox_mAP_m: -1.0000 coco/bbox_mAP_l: 0.9680 +``` + +```{Tip} +在一般的 finetune 最佳实践中都会推荐将 backbone 固定不参与训练,并且学习率 lr 也进行相应缩放,但是在本教程中发现这种做法会出现一定程度掉点。猜测可能原因是 cat 类别已经在 COCO 数据集中,而本教程使用的 cat 数据集数量比较小导致的。 +``` + +下表是采用 MMYOLO YOLOv5 预训练模型 `yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth` 在没对 cat 数据集进行 finetune 的测试精度,可以看到 `cat` 类别的 mAP 只有 `0.866`,经过我们 finetune `mAP` 提升到了 `0.968`,提升了 `10.2 %`,可以证明训练是非常成功的: + +```shell ++---------------+-------+--------------+-----+----------------+------+ +| category | AP | category | AP | category | AP | ++---------------+-------+--------------+-----+----------------+------+ +| person | nan | bicycle | nan | car | nan | +| motorcycle | nan | airplane | nan | bus | nan | +| train | nan | truck | nan | boat | nan | +| traffic light | nan | fire hydrant | nan | stop sign | nan | +| parking meter | nan | bench | nan | bird | nan | +| cat | 0.866 | dog | nan | horse | nan | +| sheep | nan | cow | nan | elephant | nan | +| bear | nan | zebra | nan | giraffe | nan | +| backpack | nan | umbrella | nan | handbag | nan | +| tie | nan | suitcase | nan | frisbee | nan | +| skis | nan | snowboard | nan | sports ball | nan | +| kite | nan | baseball bat | nan | baseball glove | nan | +| skateboard | nan | surfboard | nan | tennis racket | nan | +| bottle | nan | wine glass | nan | cup | nan | +| fork | nan | knife | nan | spoon | nan | +| bowl | nan | banana | nan | apple | nan | +| sandwich | nan | orange | nan | broccoli | nan | +| carrot | nan | hot dog | nan | pizza | nan | +| donut | nan | cake | nan | chair | nan | +| couch | nan | potted plant | nan | bed | nan | +| dining table | nan | toilet | nan | tv | nan | +| laptop | nan | mouse | nan | remote | nan | +| keyboard | nan | cell phone | nan | microwave | nan | +| oven | nan | toaster | nan | sink | nan | +| refrigerator | nan | book | nan | clock | nan | +| vase | nan | scissors | nan | teddy bear | nan | +| hair drier | nan | toothbrush | nan | None | None | ++---------------+-------+--------------+-----+----------------+------+ +``` + +```{SeeAlso} +关于如何得到预训练权重的精度,可以详见附录【2. 如何测试数据集在预训练权重的精度】 +``` + +### 9.3 尝试 MMYOLO 其他模型 + +MMYOLO 集成了多种 YOLO 算法,切换非常方便,无需重新熟悉一个新的 repo,直接切换 config 文件就可以轻松切换 YOLO 模型,只需简单 3 步即可切换模型: + +1. 
新建 config 文件 +2. 下载预训练权重 +3. 启动训练 + +下面以 YOLOv6-s 为例,进行讲解。 + +1. 搭建一个新的 config: + +```python +_base_ = '../yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco.py' + +max_epochs = 100 # 训练的最大 epoch +data_root = './data/cat/' # 数据集目录的绝对路径 + +# 结果保存的路径,可以省略,省略保存的文件名位于 work_dirs 下 config 同名的文件夹中 +# 如果某个 config 只是修改了部分参数,修改这个变量就可以将新的训练文件保存到其他地方 +work_dir = './work_dirs/yolov6_s_syncbn_fast_1xb32-100e_cat' + +# load_from 可以指定本地路径或者 URL,设置了 URL 会自动进行下载,因为上面已经下载过,我们这里设置本地路径 +# 因为本教程是在 cat 数据集上微调,故这里需要使用 `load_from` 来加载 MMYOLO 中的预训练模型,这样可以在加快收敛速度的同时保证精度 +load_from = './work_dirs/yolov6_s_syncbn_fast_8xb32-400e_coco_20221102_203035-932e1d91.pth' # noqa + +# 根据自己的 GPU 情况,修改 batch size,YOLOv6-s 默认为 8卡 x 32bs +train_batch_size_per_gpu = 32 +train_num_workers = 4 # 推荐使用 train_num_workers = nGPU x 4 + +save_epoch_intervals = 2 # 每 interval 轮迭代进行一次保存一次权重 + +# 根据自己的 GPU 情况,修改 base_lr,修改的比例是 base_lr_default * (your_bs / default_bs) +base_lr = _base_.base_lr / 8 + +class_name = ('cat', ) # 根据 class_with_id.txt 类别信息,设置 class_name +num_classes = len(class_name) +metainfo = dict( + classes=class_name, + palette=[(220, 20, 60)] # 画图时候的颜色,随便设置即可 +) + +train_cfg = dict( + max_epochs=max_epochs, + val_begin=20, # 第几个 epoch 后验证,这里设置 20 是因为前 20 个 epoch 精度不高,测试意义不大,故跳过 + val_interval=save_epoch_intervals, # 每 val_interval 轮迭代进行一次测试评估 + dynamic_intervals=[(max_epochs - _base_.num_last_epochs, 1)] +) + +model = dict( + bbox_head=dict( + head_module=dict(num_classes=num_classes)), + train_cfg=dict( + initial_assigner=dict(num_classes=num_classes), + assigner=dict(num_classes=num_classes)) +) + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + _delete_=True, + type='RepeatDataset', + # 数据量太少的话,可以使用 RepeatDataset ,在每个 epoch 内重复当前数据集 n 次,这里设置 5 是重复 5 次 + times=5, + dataset=dict( + type=_base_.dataset_type, + data_root=data_root, + metainfo=metainfo, + ann_file='annotations/trainval.json', + data_prefix=dict(img='images/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=_base_.train_pipeline))) + +val_dataloader = dict( + dataset=dict( + metainfo=metainfo, + data_root=data_root, + ann_file='annotations/trainval.json', + data_prefix=dict(img='images/'))) + +test_dataloader = val_dataloader + +val_evaluator = dict(ann_file=data_root + 'annotations/trainval.json') +test_evaluator = val_evaluator + +optim_wrapper = dict(optimizer=dict(lr=base_lr)) + +default_hooks = dict( + # 设置间隔多少个 epoch 保存模型,以及保存模型最多几个,`save_best` 是另外保存最佳模型(推荐) + checkpoint=dict( + type='CheckpointHook', + interval=save_epoch_intervals, + max_keep_ckpts=5, + save_best='auto'), + param_scheduler=dict(max_epochs=max_epochs), + # logger 输出的间隔 + logger=dict(type='LoggerHook', interval=10)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - _base_.num_last_epochs, + switch_pipeline=_base_.train_pipeline_stage2) +] + +``` + +```{Note} +同样,我们在 `projects/misc/custom_dataset/yolov6_s_syncbn_fast_1xb32-100e_cat.py` 放了一份相同的 config 文件,用户可以选择复制到 `configs/custom_dataset/yolov6_s_syncbn_fast_1xb32-100e_cat.py` 路径直接开始训练。 + +虽然新的 config 看上去好像很多东西,其实很多都是重复的,用户可以用对比软件对比一下即可看出大部分的配置都是和 `yolov5_s-v61_syncbn_fast_1xb32-100e_cat.py` 相同的。因为这 2 个 config 文件需要继承不同的 config,所以还是要添加一些必要的配置。 +``` + +2. 
下载 YOLOv6-s 的预训练权重 + +```bash +wget https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco/yolov6_s_syncbn_fast_8xb32-400e_coco_20221102_203035-932e1d91.pth -P work_dirs/ +``` + +3. 训练 + +```shell +python tools/train.py configs/custom_dataset/yolov6_s_syncbn_fast_1xb32-100e_cat.py +``` + +在我的实验中,最佳模型是 `work_dirs/yolov6_s_syncbn_fast_1xb32-100e_cat/best_coco/bbox_mAP_epoch_96.pth`,其精度如下: + +```bash + Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.987 + Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 1.000 + Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 1.000 + Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = -1.000 + Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = -1.000 + Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.987 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.895 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.989 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.989 + Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = -1.000 + Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = -1.000 + Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.989 + +bbox_mAP_copypaste: 0.987 1.000 1.000 -1.000 -1.000 0.987 +Epoch(val) [96][116/116] coco/bbox_mAP: 0.9870 coco/bbox_mAP_50: 1.0000 coco/bbox_mAP_75: 1.0000 coco/bbox_mAP_s: -1.0000 coco/bbox_mAP_m: -1.0000 coco/bbox_mAP_l: 0.9870 +``` + +以上演示的是如何在 MMYOLO 中切换模型,可以快速对不同模型进行精度对比,精度高的模型可以上线生产。在我的实验中,YOLOv6 最佳精度 `0.9870` 比 YOLOv5 最佳精度 `0.9680` 高出 `1.9 %`,故后续我们使用 YOLOv6 来进行讲解。 + +## 10. 推理 + +使用最佳的模型进行推理,下面命令中的最佳模型路径是 `./work_dirs/yolov6_s_syncbn_fast_1xb32-100e_cat/best_coco/bbox_mAP_epoch_96.pth`,请用户自行修改为自己训练的最佳模型路径。 + +```shell +python demo/image_demo.py ./data/cat/images \ + ./configs/custom_dataset/yolov6_s_syncbn_fast_1xb32-100e_cat.py \ + ./work_dirs/yolov6_s_syncbn_fast_1xb32-100e_cat/best_coco/bbox_mAP_epoch_96.pth \ + --out-dir ./data/cat/pred_images +``` + +
+推理图片 +
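+
+除了 `demo/image_demo.py` 脚本,也可以在自己的 Python 脚本中直接调用训练好的模型进行推理。下面是一个简单的示意写法(基于 MMDetection 3.x 的 `init_detector` / `inference_detector` 接口,路径沿用上文,实际接口以您安装的 MMDetection/MMYOLO 版本为准):
+
+```python
+from mmdet.apis import inference_detector, init_detector
+
+from mmyolo.utils import register_all_modules
+
+# 注册 MMYOLO 中的所有模块,否则无法解析 YOLO 系列相关的配置
+register_all_modules()
+
+config_file = './configs/custom_dataset/yolov6_s_syncbn_fast_1xb32-100e_cat.py'
+checkpoint_file = './work_dirs/yolov6_s_syncbn_fast_1xb32-100e_cat/best_coco/bbox_mAP_epoch_96.pth'
+
+# 构建模型并加载训练好的权重
+model = init_detector(config_file, checkpoint_file, device='cuda:0')
+
+# 对单张图片推理,返回的结果中包含预测框、得分和类别
+result = inference_detector(model, './data/cat/images/mmexport1633684751291.jpg')
+print(result.pred_instances)
+```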
+
+```{Tip}
+如果推理结果不理想,这里举例 2 种情况:
+
+1. 模型欠拟合:
+   需要先判断是不是训练 epoch 不够导致的欠拟合,如果是训练不够,则修改 config 文件里面的 `max_epochs` 和 `work_dir` 参数,或者根据上面的命名方式新建一个 config 文件,重新进行训练。
+
+2. 数据集需优化:
+   如果 epoch 加上去了还是不行,可以增加数据集数量,同时可以重新检查并优化数据集的标注,然后重新进行训练。
+```
+
+## 11. 部署
+
+MMYOLO 提供两种部署方式:
+
+1. 使用 [MMDeploy](https://github.com/open-mmlab/mmdeploy) 框架进行部署
+2. 使用 `projects/easydeploy` 进行部署
+
+### 11.1 MMDeploy 框架进行部署
+
+考虑到部署的机器环境千差万别,很多时候在本地机器上可以运行,但在生产环境中则不一定,这里推荐使用 Docker,做到环境一次部署,终身使用,节省运维搭建环境和部署生产的时间。
+
+本小节会从以下几个小点展开讲解:
+
+1. 构建 Docker 镜像
+2. 创建 Docker 容器
+3. 转换 TensorRT 模型
+4. 部署模型执行推理
+
+```{SeeAlso}
+如果是对 Docker 不熟悉的用户,可以参考 MMDeploy 的 [源码手动安装](https://mmdeploy.readthedocs.io/zh_CN/latest/01-how-to-build/build_from_source.html) 文档直接在本地编译。安装完之后,可以直接跳到 【11.1.3 转换 TensorRT 模型】小节。
+```
+
+#### 11.1.1 构建 Docker 镜像
+
+```shell
+git clone -b dev-1.x https://github.com/open-mmlab/mmdeploy.git
+cd mmdeploy
+docker build docker/GPU/ -t mmdeploy:gpu --build-arg USE_SRC_INSIDE=true
+```
+
+其中 `USE_SRC_INSIDE=true` 表示拉取基础镜像之后在内部切换国内源,构建速度会快一些。
+
+执行脚本后,会开始构建镜像,此时需要等待一段时间:
+
+
+Image +
+ +#### 11.1.2 创建 Docker 容器 + +```shell +export MMYOLO_PATH=/path/to/local/mmyolo # 先将您机器上 MMYOLO 的路径写入环境变量 +docker run --gpus all --name mmyolo-deploy -v ${MMYOLO_PATH}:/root/workspace/mmyolo -it mmdeploy:gpu /bin/bash +``` + +
+Image +
+ +可以看到本地的 MMYOLO 环境已经挂载到容器里面了 + +
+Image +
+ +```{SeeAlso} +有关这部分的详细介绍可以看 MMDeploy 官方文档 [使用 Docker 镜像](https://mmdeploy.readthedocs.io/zh_CN/latest/01-how-to-build/build_from_docker.html#docker) +``` + +#### 11.1.3 转换 TensorRT 模型 + +首先需要在 Docker 容器里面安装 MMYOLO 和 `pycuda`: + +```shell +export MMYOLO_PATH=/root/workspace/mmyolo # 镜像中的路径,这里不需要修改 +cd ${MMYOLO_PATH} +export MMYOLO_VERSION=$(python -c "import mmyolo.version as v; print(v.__version__)") # 查看训练使用的 MMYOLO 版本号 +echo "Using MMYOLO ${MMYOLO_VERSION}" +mim install --no-cache-dir mmyolo==${MMYOLO_VERSION} +pip install --no-cache-dir pycuda==2022.2 +``` + +进行模型转换 + +```shell +cd /root/workspace/mmdeploy +python ./tools/deploy.py \ + ${MMYOLO_PATH}/configs/deploy/detection_tensorrt-fp16_dynamic-192x192-960x960.py \ + ${MMYOLO_PATH}/configs/custom_dataset/yolov6_s_syncbn_fast_1xb32-100e_cat.py \ + ${MMYOLO_PATH}/work_dirs/yolov6_s_syncbn_fast_1xb32-100e_cat/best_coco/bbox_mAP_epoch_96.pth \ + ${MMYOLO_PATH}/data/cat/images/mmexport1633684751291.jpg \ + --test-img ${MMYOLO_PATH}/data/cat/images/mmexport1633684751291.jpg \ + --work-dir ./work_dir/yolov6_s_syncbn_fast_1xb32-100e_cat_deploy_dynamic_fp16 \ + --device cuda:0 \ + --log-level INFO \ + --show \ + --dump-info +``` + +
+Image +
+ +等待一段时间,出现了 `All process success.` 即为成功: + +
+Image +
+ +查看导出的路径,可以看到如下图所示的文件结构: + +```shell +$WORK_DIR + ├── deploy.json + ├── detail.json + ├── end2end.engine + ├── end2end.onnx + └── pipeline.json +``` + +```{SeeAlso} +关于转换模型的详细介绍,请参考 [如何转换模型](https://mmdeploy.readthedocs.io/zh_CN/latest/02-how-to-run/convert_model.html) +``` + +#### 11.1.4 部署模型执行推理 + +需要将 `${MMYOLO_PATH}/configs/custom_dataset/yolov6_s_syncbn_fast_1xb32-100e_cat.py` 里面的 `data_root` 修改为 Docker 容器里面的路径: + +```python +data_root = '/root/workspace/mmyolo/data/cat/' # Docker 容器里面数据集目录的绝对路径 +``` + +执行速度和精度测试: + +```shell +python tools/test.py \ + ${MMYOLO_PATH}/configs/deploy/detection_tensorrt-fp16_dynamic-192x192-960x960.py \ + ${MMYOLO_PATH}/configs/custom_dataset/yolov6_s_syncbn_fast_1xb32-100e_cat.py \ + --model ./work_dir/yolov6_s_syncbn_fast_1xb32-100e_cat_deploy_dynamic_fp16/end2end.engine \ + --speed-test \ + --device cuda +``` + +速度测试如下,可见平均推理速度是 `24.10 ms`,对比 PyTorch 推理有速度提升,同时显存也下降了很多: + +```shell +Epoch(test) [ 10/116] eta: 0:00:20 time: 0.1919 data_time: 0.1330 memory: 12 +Epoch(test) [ 20/116] eta: 0:00:15 time: 0.1220 data_time: 0.0939 memory: 12 +Epoch(test) [ 30/116] eta: 0:00:12 time: 0.1168 data_time: 0.0850 memory: 12 +Epoch(test) [ 40/116] eta: 0:00:10 time: 0.1241 data_time: 0.0940 memory: 12 +Epoch(test) [ 50/116] eta: 0:00:08 time: 0.0974 data_time: 0.0696 memory: 12 +Epoch(test) [ 60/116] eta: 0:00:06 time: 0.0865 data_time: 0.0547 memory: 16 +Epoch(test) [ 70/116] eta: 0:00:05 time: 0.1521 data_time: 0.1226 memory: 16 +Epoch(test) [ 80/116] eta: 0:00:04 time: 0.1364 data_time: 0.1056 memory: 12 +Epoch(test) [ 90/116] eta: 0:00:03 time: 0.0923 data_time: 0.0627 memory: 12 +Epoch(test) [100/116] eta: 0:00:01 time: 0.0844 data_time: 0.0583 memory: 12 +[tensorrt]-110 times per count: 24.10 ms, 41.50 FPS +Epoch(test) [110/116] eta: 0:00:00 time: 0.1085 data_time: 0.0832 memory: 12 +``` + +精度测试如下。此配置采用 FP16 格式推理,会有一定程度掉点,但是推理速度更快、显存占比更小: + +```shell + Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.954 + Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 1.000 + Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.975 + Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = -1.000 + Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = -1.000 + Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.954 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.860 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.965 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.965 + Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = -1.000 + Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = -1.000 + Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.965 + +INFO - bbox_mAP_copypaste: 0.954 1.000 0.975 -1.000 -1.000 0.954 +INFO - Epoch(test) [116/116] coco/bbox_mAP: 0.9540 coco/bbox_mAP_50: 1.0000 coco/bbox_mAP_75: 0.9750 coco/bbox_mAP_s: -1.0000 coco/bbox_mAP_m: -1.0000 coco/bbox_mAP_l: 0.9540 +``` + +部署模型图片推理演示: + +```{Note} +用户可以参考 MMDeploy 的 SDK 部署方式,使用 C++ 来进行部署,进而进一步提升推理速度。 +``` + +```shell +cd ${MMYOLO_PATH}/demo +python deploy_demo.py \ + ${MMYOLO_PATH}/data/cat/images/mmexport1633684900217.jpg \ + ${MMYOLO_PATH}/configs/custom_dataset/yolov6_s_syncbn_fast_1xb32-100e_cat.py \ + /root/workspace/mmdeploy/work_dir/yolov6_s_syncbn_fast_1xb32-100e_cat_deploy_dynamic_fp16/end2end.engine \ + --deploy-cfg 
${MMYOLO_PATH}/configs/deploy/detection_tensorrt-fp16_dynamic-192x192-960x960.py \ + --out-dir ${MMYOLO_PATH}/work_dirs/deploy_predict_out \ + --device cuda:0 \ + --score-thr 0.5 +``` + +```{Warning} +该脚本 `deploy_demo.py` 暂时没有做批量推理的处理,而且代码前处理还需要完善,暂时不能完全展现出推理的速度,只能演示推理的结果,后续会优化,敬请期待。 +``` + +执行之后,可以看到在 `--out-dir` 下面的推理图片结果: + +
+Image +
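+
+除了 `deploy_demo.py`,也可以在 Python 中直接调用 MMDeploy 的推理接口,对转换后的 engine 做一次快速验证。下面是一个示意写法(假设当前 MMDeploy 版本提供 `mmdeploy.apis.inference_model` 接口,路径沿用上文 Docker 容器内的路径):
+
+```python
+from mmdeploy.apis import inference_model
+
+model_cfg = '/root/workspace/mmyolo/configs/custom_dataset/yolov6_s_syncbn_fast_1xb32-100e_cat.py'
+deploy_cfg = '/root/workspace/mmyolo/configs/deploy/detection_tensorrt-fp16_dynamic-192x192-960x960.py'
+backend_files = ['./work_dir/yolov6_s_syncbn_fast_1xb32-100e_cat_deploy_dynamic_fp16/end2end.engine']
+img = '/root/workspace/mmyolo/data/cat/images/mmexport1633684900217.jpg'
+
+# 使用转换后的 TensorRT engine 对单张图片进行推理
+result = inference_model(model_cfg, deploy_cfg, backend_files, img, device='cuda:0')
+print(result)
+```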
+ +```{Note} +您也可以做其他优化调整,例如增大 batch,量化 int8 等等。 +``` + +#### 11.1.4 保存和加载 Docker 容器 + +因为如果每次都进行 docker 镜像的构建,特别费时间,此时您可以考虑使用 docker 自带的打包 api 进行打包和加载。 + +```shell +# 保存,得到的 tar 包可以放到移动硬盘 +docker save mmyolo-deploy > mmyolo-deploy.tar + +# 加载镜像到系统 +docker load < /path/to/mmyolo-deploy.tar +``` + +### 11.2 使用 `projects/easydeploy` 进行部署 + +```{SeeAlso} +详见[部署文档](https://github.com/open-mmlab/mmyolo/blob/dev/projects/easydeploy/README_zh-CN.md) +``` + +TODO: 下个版本会完善这个部分... + +## 附录 + +### 1. 本教程训练机器的详细环境的资料如下: + +```shell +sys.platform: linux +Python: 3.9.13 | packaged by conda-forge | (main, May 27 2022, 16:58:50) [GCC 10.3.0] +CUDA available: True +numpy_random_seed: 2147483648 +GPU 0: NVIDIA GeForce RTX 3080 Ti +CUDA_HOME: /usr/local/cuda +NVCC: Cuda compilation tools, release 11.5, V11.5.119 +GCC: gcc (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0 +PyTorch: 1.10.0 +PyTorch compiling details: PyTorch built with: + - GCC 7.3 + - C++ Version: 201402 + - Intel(R) oneAPI Math Kernel Library Version 2021.4-Product Build 20210904 for Intel(R) 64 architecture applications + - Intel(R) MKL-DNN v2.2.3 (Git Hash 7336ca9f055cf1bfa13efb658fe15dc9b41f0740) + - OpenMP 201511 (a.k.a. OpenMP 4.5) + - LAPACK is enabled (usually provided by MKL) + - NNPACK is enabled + - CPU capability usage: AVX2 + - CUDA Runtime 11.3 + - NVCC architecture flags: -gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode; + arch=compute_60,code=sm_60;-gencode;arch=compute_61,code=sm_61;-gencode;arch=compute_70,code=sm_70; + -gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode; + arch=compute_86,code=sm_86;-gencode;arch=compute_37,code=compute_37 + - CuDNN 8.2 + - Magma 2.5.2 + - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=11.3, CUDNN_VERSION=8.2.0, + CXX_COMPILER=/opt/rh/devtoolset-7/root/usr/bin/c++, CXX_FLAGS= -Wno-deprecated -fvisibility-inlines-hidden + -DUSE_PTHREADPOOL -fopenmp -DNDEBUG -DUSE_KINETO -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK + -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -DEDGE_PROFILER_USE_KINETO -O2 -fPIC -Wno-narrowing -Wall -Wextra + -Werror=return-type -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas + -Wno-sign-compare -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic + -Wno-error=redundant-decls -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new + -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format + -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, + TORCH_VERSION=1.10.0, USE_CUDA=ON, USE_CUDNN=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, + USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=ON, USE_NNPACK=ON, USE_OPENMP=ON, + +TorchVision: 0.11.0 +OpenCV: 4.6.0 +MMEngine: 0.3.1 +MMCV: 2.0.0rc3 +MMDetection: 3.0.0rc3 +MMYOLO: 0.2.0+cf279a5 +``` + +### 2. 如何测试数据集在预训练权重的精度: + +```{Warning} +前提:该类在 COCO 80 类中! +``` + +本小节以 `cat` 数据集为例进行讲解,使用的是: + +- config 文件:`configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py` +- 权重 `yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth` + +1. 
修改 config 文件中的路径 + +因为 `configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py` 是继承于 `configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py`,故主要修改 `configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py` 文件即可。 + +| 修改前 | 修改后 | +| --------------------------------------------------------------------------------- | ------------------------------------------------------------------------------ | +| `data_root = 'data/coco/'` | `data_root = './data/cat/'` | +| `ann_file='annotations/instances_train2017.json'` | `ann_file='annotations/trainval.json'` | +| data_prefix=dict(img='train2017/')\` | `data_prefix=dict(img='images/')` | +| `val_evaluator` 中的 `ann_file=data_root + 'annotations/instances_val2017.json'` | `val_evaluator` 中的 `dict(ann_file=data_root + 'annotations/trainval.json')` | + +2. 修改标签 + +```{note} +建议直接复制一份标签,防止弄坏好的标签 +``` + +将 `trainval.json` 里面的 "categories" 字段改为 COCO 原本的: + +```json + "categories": [{"supercategory": "person","id": 1,"name": "person"},{"supercategory": "vehicle","id": 2,"name": "bicycle"},{"supercategory": "vehicle","id": 3,"name": "car"},{"supercategory": "vehicle","id": 4,"name": "motorcycle"},{"supercategory": "vehicle","id": 5,"name": "airplane"},{"supercategory": "vehicle","id": 6,"name": "bus"},{"supercategory": "vehicle","id": 7,"name": "train"},{"supercategory": "vehicle","id": 8,"name": "truck"},{"supercategory": "vehicle","id": 9,"name": "boat"},{"supercategory": "outdoor","id": 10,"name": "traffic light"},{"supercategory": "outdoor","id": 11,"name": "fire hydrant"},{"supercategory": "outdoor","id": 13,"name": "stop sign"},{"supercategory": "outdoor","id": 14,"name": "parking meter"},{"supercategory": "outdoor","id": 15,"name": "bench"},{"supercategory": "animal","id": 16,"name": "bird"},{"supercategory": "animal","id": 17,"name": "cat"},{"supercategory": "animal","id": 18,"name": "dog"},{"supercategory": "animal","id": 19,"name": "horse"},{"supercategory": "animal","id": 20,"name": "sheep"},{"supercategory": "animal","id": 21,"name": "cow"},{"supercategory": "animal","id": 22,"name": "elephant"},{"supercategory": "animal","id": 23,"name": "bear"},{"supercategory": "animal","id": 24,"name": "zebra"},{"supercategory": "animal","id": 25,"name": "giraffe"},{"supercategory": "accessory","id": 27,"name": "backpack"},{"supercategory": "accessory","id": 28,"name": "umbrella"},{"supercategory": "accessory","id": 31,"name": "handbag"},{"supercategory": "accessory","id": 32,"name": "tie"},{"supercategory": "accessory","id": 33,"name": "suitcase"},{"supercategory": "sports","id": 34,"name": "frisbee"},{"supercategory": "sports","id": 35,"name": "skis"},{"supercategory": "sports","id": 36,"name": "snowboard"},{"supercategory": "sports","id": 37,"name": "sports ball"},{"supercategory": "sports","id": 38,"name": "kite"},{"supercategory": "sports","id": 39,"name": "baseball bat"},{"supercategory": "sports","id": 40,"name": "baseball glove"},{"supercategory": "sports","id": 41,"name": "skateboard"},{"supercategory": "sports","id": 42,"name": "surfboard"},{"supercategory": "sports","id": 43,"name": "tennis racket"},{"supercategory": "kitchen","id": 44,"name": "bottle"},{"supercategory": "kitchen","id": 46,"name": "wine glass"},{"supercategory": "kitchen","id": 47,"name": "cup"},{"supercategory": "kitchen","id": 48,"name": "fork"},{"supercategory": "kitchen","id": 49,"name": "knife"},{"supercategory": "kitchen","id": 50,"name": "spoon"},{"supercategory": "kitchen","id": 51,"name": "bowl"},{"supercategory": "food","id": 52,"name": "banana"},{"supercategory": 
"food","id": 53,"name": "apple"},{"supercategory": "food","id": 54,"name": "sandwich"},{"supercategory": "food","id": 55,"name": "orange"},{"supercategory": "food","id": 56,"name": "broccoli"},{"supercategory": "food","id": 57,"name": "carrot"},{"supercategory": "food","id": 58,"name": "hot dog"},{"supercategory": "food","id": 59,"name": "pizza"},{"supercategory": "food","id": 60,"name": "donut"},{"supercategory": "food","id": 61,"name": "cake"},{"supercategory": "furniture","id": 62,"name": "chair"},{"supercategory": "furniture","id": 63,"name": "couch"},{"supercategory": "furniture","id": 64,"name": "potted plant"},{"supercategory": "furniture","id": 65,"name": "bed"},{"supercategory": "furniture","id": 67,"name": "dining table"},{"supercategory": "furniture","id": 70,"name": "toilet"},{"supercategory": "electronic","id": 72,"name": "tv"},{"supercategory": "electronic","id": 73,"name": "laptop"},{"supercategory": "electronic","id": 74,"name": "mouse"},{"supercategory": "electronic","id": 75,"name": "remote"},{"supercategory": "electronic","id": 76,"name": "keyboard"},{"supercategory": "electronic","id": 77,"name": "cell phone"},{"supercategory": "appliance","id": 78,"name": "microwave"},{"supercategory": "appliance","id": 79,"name": "oven"},{"supercategory": "appliance","id": 80,"name": "toaster"},{"supercategory": "appliance","id": 81,"name": "sink"},{"supercategory": "appliance","id": 82,"name": "refrigerator"},{"supercategory": "indoor","id": 84,"name": "book"},{"supercategory": "indoor","id": 85,"name": "clock"},{"supercategory": "indoor","id": 86,"name": "vase"},{"supercategory": "indoor","id": 87,"name": "scissors"},{"supercategory": "indoor","id": 88,"name": "teddy bear"},{"supercategory": "indoor","id": 89,"name": "hair drier"},{"supercategory": "indoor","id": 90,"name": "toothbrush"}], +``` + +同时,将 `"annotations"` 字段里面的 `"category_id"` 改为 COCO 对应的 `id` ,例如本例子的 `cat` 是 `17`,下面展示部分修改结果: + +```json + "annotations": [ + { + "iscrowd": 0, + "category_id": 17, # 这个 "category_id" 改为 COCO 对应的 id,例如本例子的 cat 是 17 + "id": 32, + "image_id": 32, + "bbox": [ + 822.49072265625, + 958.3897094726562, + 1513.693115234375, + 988.3231811523438 + ], + "area": 1496017.9949368387, + "segmentation": [ + [ + 822.49072265625, + 958.3897094726562, + 822.49072265625, + 1946.712890625, + 2336.183837890625, + 1946.712890625, + 2336.183837890625, + 958.3897094726562 + ] + ] + } + ] +``` + +3. 
执行命令
+
+```shell
+python tools/test.py configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py \
+      work_dirs/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth \
+      --cfg-options test_evaluator.classwise=True
+```
+
+执行之后就可以看到测试后的指标了:
+
+```shell
++---------------+-------+--------------+-----+----------------+------+
+| category      | AP    | category     | AP  | category       | AP   |
++---------------+-------+--------------+-----+----------------+------+
+| person        | nan   | bicycle      | nan | car            | nan  |
+| motorcycle    | nan   | airplane     | nan | bus            | nan  |
+| train         | nan   | truck        | nan | boat           | nan  |
+| traffic light | nan   | fire hydrant | nan | stop sign      | nan  |
+| parking meter | nan   | bench        | nan | bird           | nan  |
+| cat           | 0.866 | dog          | nan | horse          | nan  |
+| sheep         | nan   | cow          | nan | elephant       | nan  |
+| bear          | nan   | zebra        | nan | giraffe        | nan  |
+| backpack      | nan   | umbrella     | nan | handbag        | nan  |
+| tie           | nan   | suitcase     | nan | frisbee        | nan  |
+| skis          | nan   | snowboard    | nan | sports ball    | nan  |
+| kite          | nan   | baseball bat | nan | baseball glove | nan  |
+| skateboard    | nan   | surfboard    | nan | tennis racket  | nan  |
+| bottle        | nan   | wine glass   | nan | cup            | nan  |
+| fork          | nan   | knife        | nan | spoon          | nan  |
+| bowl          | nan   | banana       | nan | apple          | nan  |
+| sandwich      | nan   | orange       | nan | broccoli       | nan  |
+| carrot        | nan   | hot dog      | nan | pizza          | nan  |
+| donut         | nan   | cake         | nan | chair          | nan  |
+| couch         | nan   | potted plant | nan | bed            | nan  |
+| dining table  | nan   | toilet       | nan | tv             | nan  |
+| laptop        | nan   | mouse        | nan | remote         | nan  |
+| keyboard      | nan   | cell phone   | nan | microwave      | nan  |
+| oven          | nan   | toaster      | nan | sink           | nan  |
+| refrigerator  | nan   | book         | nan | clock          | nan  |
+| vase          | nan   | scissors     | nan | teddy bear     | nan  |
+| hair drier    | nan   | toothbrush   | nan | None           | None |
++---------------+-------+--------------+-----+----------------+------+
+```
diff --git a/third_party/mmyolo/docs/zh_cn/recommended_topics/mm_basics.md b/third_party/mmyolo/docs/zh_cn/recommended_topics/mm_basics.md
new file mode 100644
index 0000000000000000000000000000000000000000..2d8098b162d9d7659ff4d71c260720de435ac993
--- /dev/null
+++ b/third_party/mmyolo/docs/zh_cn/recommended_topics/mm_basics.md
@@ -0,0 +1 @@
+# MM 系列仓库必备基础
diff --git a/third_party/mmyolo/docs/zh_cn/recommended_topics/model_design.md b/third_party/mmyolo/docs/zh_cn/recommended_topics/model_design.md
new file mode 100644
index 0000000000000000000000000000000000000000..287cf032ef14dc2d2b2473f83f2c78b24db0ec7e
--- /dev/null
+++ b/third_party/mmyolo/docs/zh_cn/recommended_topics/model_design.md
@@ -0,0 +1,107 @@
+# MMYOLO 模型设计相关说明
+
+## YOLO 系列模型基类
+
+下图为 RangeKing@GitHub 提供,非常感谢!
+
+
+基类 P5 +图 1:P5 模型结构 +
+ +
+基类 P6 +图 2:P6 模型结构 +
+ +YOLO 系列算法大部分采用了统一的算法搭建结构,典型的如 Darknet + PAFPN。为了让用户快速理解 YOLO 系列算法架构,我们特意设计了如上图中的 BaseBackbone + BaseYOLONeck 结构。 + +抽象 BaseBackbone 的好处包括: + +1. 子类不需要关心 forward 过程,只要类似建造者模式一样构建模型即可。 +2. 可以通过配置实现定制插件功能,用户可以很方便的插入一些类似注意力模块。 +3. 所有子类自动支持 frozen 某些 stage 和 frozen bn 功能。 + +抽象 BaseYOLONeck 也有同样好处。 + +### BaseBackbone + +- 如图 1 所示,对于 P5 而言,BaseBackbone 为包含 1 个 stem 层 + 4 个 stage 层的类似 ResNet 的基础结构。 +- 如图 2 所示,对于 P6 而言,BaseBackbone 为包含 1 个 stem 层 + 5 个 stage 层的结构。 + +不同算法的主干网络继承 BaseBackbone,用户可以通过实现内部的 `build_xx` 方法,使用自定义的基础模块来构建每一层的内部结构。 + +### BaseYOLONeck + +与 `BaseBackbone` 的设计类似,我们为 MMYOLO 系列的 Neck 层进行了重构,主要分为 `Reduce 层`, `UpSample 层`,`TopDown 层`,`DownSample 层`,`BottomUP 层`以及`输出卷积层`,每一层结构都可以通过继承重写 `build_xx` 方法来实现自定义的内部结构。 + +### BaseDenseHead + +MMYOLO 系列沿用 MMDetection 中设计的 `BaseDenseHead` 作为其 Head 结构的基类,但是进一步拆分了 HeadModule. 以 YOLOv5 为例,其 [HeadModule](https://github.com/open-mmlab/mmyolo/blob/main/mmyolo/models/dense_heads/yolov5_head.py#L24) 中的 forward 实现代替了原有的 forward 实现。 + +## HeadModule 说明 + +
+HeadModule +
+ +如上图所示,虚线部分为 [MMDetection](https://github.com/open-mmlab/mmdetection/blob/3.x/mmdet/models/dense_heads/base_dense_head.py) 中的实现,实线部分为 [MMYOLO](https://github.com/open-mmlab/mmyolo/blob/main/mmyolo/models/dense_heads/yolov5_head.py) 中的实现。MMYOLO版本与原实现相比具备具有以下优势: + +1. MMDetection 中将 `bbox_head` 拆分为 `assigner` + `box coder` + `sampler` 三个大的组件,但由于 3 个组件之间的传递为了通用性,需要封装额外的对象来处理,统一之后用户可以不用进行拆分。不刻意强求划分三大组件的好处为:不再需要对内部数据进行数据封装,简化了代码逻辑,减轻了社区使用难度和算法复现难度。 +2. 速度更快,用户在自定义实现算法时候,可以不依赖于原有框架,对部分代码进行深度优化。 + +总的来说,在 MMYOLO 中只需要做到将 `model` + `loss_by_feat` 部分解耦,用户就可以通过修改配置实现任意模型配合任意的 `loss_by_feat` 计算过程。例如将 YOLOv5 模型应用 YOLOX 的 `loss_by_feat` 等。 + +以 MMDetection 中 YOLOX 配置为例,其 Head 模块配置写法为: + +```python +bbox_head=dict( + type='YOLOXHead', + num_classes=80, + in_channels=128, + feat_channels=128, + stacked_convs=2, + strides=(8, 16, 32), + use_depthwise=False, + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='Swish'), + ... + loss_obj=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + reduction='sum', + loss_weight=1.0), + loss_l1=dict(type='L1Loss', reduction='sum', loss_weight=1.0)), +train_cfg=dict(assigner=dict(type='SimOTAAssigner', center_radius=2.5)), +``` + +在 MMYOLO 中抽取 `head_module` 后,新的配置写法为: + +```python +bbox_head=dict( + type='YOLOXHead', + head_module=dict( + type='YOLOXHeadModule', + num_classes=80, + in_channels=256, + feat_channels=256, + widen_factor=widen_factor, + stacked_convs=2, + featmap_strides=(8, 16, 32), + use_depthwise=False, + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='SiLU', inplace=True), + ), + ... + loss_obj=dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='sum', + loss_weight=1.0), + loss_bbox_aux=dict(type='mmdet.L1Loss', reduction='sum', loss_weight=1.0)), +train_cfg=dict( + assigner=dict( + type='mmdet.SimOTAAssigner', + center_radius=2.5, + iou_calculator=dict(type='mmdet.BboxOverlaps2D'))), +``` diff --git a/third_party/mmyolo/docs/zh_cn/recommended_topics/replace_backbone.md b/third_party/mmyolo/docs/zh_cn/recommended_topics/replace_backbone.md new file mode 100644 index 0000000000000000000000000000000000000000..d78a25206418795457144e1dfd5461f4ac0193e6 --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/recommended_topics/replace_backbone.md @@ -0,0 +1,307 @@ +# 轻松更换主干网络 + +```{note} +1. 使用其他主干网络时,你需要保证主干网络的输出通道与 Neck 的输入通道相匹配。 +2. 下面给出的配置文件,仅能确保训练可以正确运行,直接训练性能可能不是最优的。因为某些 backbone 需要配套特定的学习率、优化器等超参数。后续会在“训练技巧章节”补充训练调优相关内容。 +``` + +## 使用 MMYOLO 中注册的主干网络 + +假设想将 `YOLOv6EfficientRep` 作为 `YOLOv5` 的主干网络,则配置文件如下: + +```python +_base_ = './yolov5_s-v61_syncbn_8xb16-300e_coco.py' + +model = dict( + backbone=dict( + type='YOLOv6EfficientRep', + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='ReLU', inplace=True)) +) +``` + +## 跨库使用主干网络 + +OpenMMLab 2.0 体系中 MMYOLO、MMDetection、MMClassification、MMSelfsup 中的模型注册表都继承自 MMEngine 中的根注册表,允许这些 OpenMMLab 开源库直接使用彼此已经实现的模块。 因此用户可以在 MMYOLO 中使用来自 MMDetection、MMClassification、MMSelfsup 的主干网络,而无需重新实现。 + +### 使用在 MMDetection 中实现的主干网络 + +1. 
假设想将 `ResNet-50` 作为 `YOLOv5` 的主干网络,则配置文件如下: + +```python +_base_ = './yolov5_s-v61_syncbn_8xb16-300e_coco.py' + +deepen_factor = _base_.deepen_factor +widen_factor = 1.0 +channels = [512, 1024, 2048] + +model = dict( + backbone=dict( + _delete_=True, # 将 _base_ 中关于 backbone 的字段删除 + type='mmdet.ResNet', # 使用 mmdet 中的 ResNet + depth=50, + num_stages=4, + out_indices=(1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='YOLOv5PAFPN', + widen_factor=widen_factor, + in_channels=channels, # 注意:ResNet-50 输出的3个通道是 [512, 1024, 2048],和原先的 yolov5-s neck 不匹配,需要更改 + out_channels=channels), + bbox_head=dict( + type='YOLOv5Head', + head_module=dict( + type='YOLOv5HeadModule', + in_channels=channels, # head 部分输入通道也要做相应更改 + widen_factor=widen_factor)) +) +``` + +2. 假设想将 `SwinTransformer-Tiny` 作为 `YOLOv5` 的主干网络,则配置文件如下: + +```python +_base_ = './yolov5_s-v61_syncbn_8xb16-300e_coco.py' + +deepen_factor = _base_.deepen_factor +widen_factor = 1.0 +channels = [192, 384, 768] +checkpoint_file = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth' # noqa + +model = dict( + backbone=dict( + _delete_=True, # 将 _base_ 中关于 backbone 的字段删除 + type='mmdet.SwinTransformer', # 使用 mmdet 中的 SwinTransformer + embed_dims=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=7, + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.2, + patch_norm=True, + out_indices=(1, 2, 3), + with_cp=False, + convert_weights=True, + init_cfg=dict(type='Pretrained', checkpoint=checkpoint_file)), + neck=dict( + type='YOLOv5PAFPN', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + in_channels=channels, # 注意:SwinTransformer-Tiny 输出的3个通道是 [192, 384, 768],和原先的 yolov5-s neck 不匹配,需要更改 + out_channels=channels), + bbox_head=dict( + type='YOLOv5Head', + head_module=dict( + type='YOLOv5HeadModule', + in_channels=channels, # head 部分输入通道也要做相应更改 + widen_factor=widen_factor)) +) +``` + +### 使用在 MMClassification 中实现的主干网络 + +1. 假设想将 `ConvNeXt-Tiny` 作为 `YOLOv5` 的主干网络,则配置文件如下: + +```python +_base_ = './yolov5_s-v61_syncbn_8xb16-300e_coco.py' + +# 请先使用命令: mim install "mmcls>=1.0.0rc2",安装 mmcls +# 导入 mmcls.models 使得可以调用 mmcls 中注册的模块 +custom_imports = dict(imports=['mmcls.models'], allow_failed_imports=False) +checkpoint_file = 'https://download.openmmlab.com/mmclassification/v0/convnext/downstream/convnext-tiny_3rdparty_32xb128-noema_in1k_20220301-795e9634.pth' # noqa +deepen_factor = _base_.deepen_factor +widen_factor = 1.0 +channels = [192, 384, 768] + +model = dict( + backbone=dict( + _delete_=True, # 将 _base_ 中关于 backbone 的字段删除 + type='mmcls.ConvNeXt', # 使用 mmcls 中的 ConvNeXt + arch='tiny', + out_indices=(1, 2, 3), + drop_path_rate=0.4, + layer_scale_init_value=1.0, + gap_before_final_norm=False, + init_cfg=dict( + type='Pretrained', checkpoint=checkpoint_file, + prefix='backbone.')), # MMCls 中主干网络的预训练权重含义 prefix='backbone.',为了正常加载权重,需要把这个 prefix 去掉。 + neck=dict( + type='YOLOv5PAFPN', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + in_channels=channels, # 注意:ConvNeXt-Tiny 输出的3个通道是 [192, 384, 768],和原先的 yolov5-s neck 不匹配,需要更改 + out_channels=channels), + bbox_head=dict( + type='YOLOv5Head', + head_module=dict( + type='YOLOv5HeadModule', + in_channels=channels, # head 部分输入通道也要做相应更改 + widen_factor=widen_factor)) +) +``` + +2. 
假设想将 `MobileNetV3-small` 作为 `YOLOv5` 的主干网络,则配置文件如下: + +```python +_base_ = './yolov5_s-v61_syncbn_8xb16-300e_coco.py' + +# 请先使用命令: mim install "mmcls>=1.0.0rc2",安装 mmcls +# 导入 mmcls.models 使得可以调用 mmcls 中注册的模块 +custom_imports = dict(imports=['mmcls.models'], allow_failed_imports=False) +checkpoint_file = 'https://download.openmmlab.com/mmclassification/v0/mobilenet_v3/convert/mobilenet_v3_small-8427ecf0.pth' # noqa +deepen_factor = _base_.deepen_factor +widen_factor = 1.0 +channels = [24, 48, 96] + +model = dict( + backbone=dict( + _delete_=True, # 将 _base_ 中关于 backbone 的字段删除 + type='mmcls.MobileNetV3', # 使用 mmcls 中的 MobileNetV3 + arch='small', + out_indices=(3, 8, 11), # 修改 out_indices + init_cfg=dict( + type='Pretrained', + checkpoint=checkpoint_file, + prefix='backbone.')), # MMCls 中主干网络的预训练权重含义 prefix='backbone.',为了正常加载权重,需要把这个 prefix 去掉。 + neck=dict( + type='YOLOv5PAFPN', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + in_channels=channels, # 注意:MobileNetV3-small 输出的3个通道是 [24, 48, 96],和原先的 yolov5-s neck 不匹配,需要更改 + out_channels=channels), + bbox_head=dict( + type='YOLOv5Head', + head_module=dict( + type='YOLOv5HeadModule', + in_channels=channels, # head 部分输入通道也要做相应更改 + widen_factor=widen_factor)) +) +``` + +### 通过 MMClassification 使用 `timm` 中实现的主干网络 + +由于 MMClassification 提供了 Py**T**orch **Im**age **M**odels (`timm`) 主干网络的封装,用户也可以通过 MMClassification 直接使用 `timm` 中的主干网络。假设想将 `EfficientNet-B1`作为 `YOLOv5` 的主干网络,则配置文件如下: + +```python +_base_ = './yolov5_s-v61_syncbn_8xb16-300e_coco.py' + +# 请先使用命令: mim install "mmcls>=1.0.0rc2",安装 mmcls +# 以及: pip install timm,安装 timm +# 导入 mmcls.models 使得可以调用 mmcls 中注册的模块 +custom_imports = dict(imports=['mmcls.models'], allow_failed_imports=False) + +deepen_factor = _base_.deepen_factor +widen_factor = 1.0 +channels = [40, 112, 320] + +model = dict( + backbone=dict( + _delete_=True, # 将 _base_ 中关于 backbone 的字段删除 + type='mmcls.TIMMBackbone', # 使用 mmcls 中的 timm 主干网络 + model_name='efficientnet_b1', # 使用 TIMM 中的 efficientnet_b1 + features_only=True, + pretrained=True, + out_indices=(2, 3, 4)), + neck=dict( + type='YOLOv5PAFPN', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + in_channels=channels, # 注意:EfficientNet-B1 输出的3个通道是 [40, 112, 320],和原先的 yolov5-s neck 不匹配,需要更改 + out_channels=channels), + bbox_head=dict( + type='YOLOv5Head', + head_module=dict( + type='YOLOv5HeadModule', + in_channels=channels, # head 部分输入通道也要做相应更改 + widen_factor=widen_factor)) +) +``` + +### 使用在 MMSelfSup 中实现的主干网络 + +假设想将 MMSelfSup 中 `MoCo v3` 自监督训练的 `ResNet-50` 作为 `YOLOv5` 的主干网络,则配置文件如下: + +```python +_base_ = './yolov5_s-v61_syncbn_8xb16-300e_coco.py' + +# 请先使用命令: mim install "mmselfsup>=1.0.0rc3",安装 mmselfsup +# 导入 mmselfsup.models 使得可以调用 mmselfsup 中注册的模块 +custom_imports = dict(imports=['mmselfsup.models'], allow_failed_imports=False) +checkpoint_file = 'https://download.openmmlab.com/mmselfsup/1.x/mocov3/mocov3_resnet50_8xb512-amp-coslr-800e_in1k/mocov3_resnet50_8xb512-amp-coslr-800e_in1k_20220927-e043f51a.pth' # noqa +deepen_factor = _base_.deepen_factor +widen_factor = 1.0 +channels = [512, 1024, 2048] + +model = dict( + backbone=dict( + _delete_=True, # 将 _base_ 中关于 backbone 的字段删除 + type='mmselfsup.ResNet', + depth=50, + num_stages=4, + out_indices=(2, 3, 4), # 注意:MMSelfSup 中 ResNet 的 out_indices 比 MMdet 和 MMCls 的要大 1 + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint=checkpoint_file)), + neck=dict( + type='YOLOv5PAFPN', + deepen_factor=deepen_factor, + 
widen_factor=widen_factor, + in_channels=channels, # 注意:ResNet-50 输出的3个通道是 [512, 1024, 2048],和原先的 yolov5-s neck 不匹配,需要更改 + out_channels=channels), + bbox_head=dict( + type='YOLOv5Head', + head_module=dict( + type='YOLOv5HeadModule', + in_channels=channels, # head 部分输入通道也要做相应更改 + widen_factor=widen_factor)) +) +``` + +### 不使用预训练权重 + +通常情况下,骨干网络初始化都是优先选择预训练权重。如果你不想使用预训练权重,而是想从头开始训练时模型时, +我们可以将 `backbone` 中的 `init_cfg` 设置为 `None`,此时骨干网络将会以默认的初始化方法进行初始化, +而不会使用训练好的预训练权重进行初始。以下是以 `YOLOv5` 使用 resnet 作为主干网络为例子,其余算法也是同样的处理: + +```python +_base_ = './yolov5_s-v61_syncbn_8xb16-300e_coco.py' + +deepen_factor = _base_.deepen_factor +widen_factor = 1.0 +channels = [512, 1024, 2048] + +model = dict( + backbone=dict( + _delete_=True, # 将 _base_ 中关于 backbone 的字段删除 + type='mmdet.ResNet', # 使用 mmdet 中的 ResNet + depth=50, + num_stages=4, + out_indices=(1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=None # init_cfg 设置为 None,则 backbone 将不会使用预训练好的权重进行初始化了 + ), + neck=dict( + type='YOLOv5PAFPN', + widen_factor=widen_factor, + in_channels=channels, # 注意:ResNet-50 输出的 3 个通道是 [512, 1024, 2048],和原先的 yolov5-s neck 不匹配,需要更改 + out_channels=channels), + bbox_head=dict( + type='YOLOv5Head', + head_module=dict( + type='YOLOv5HeadModule', + in_channels=channels, # head 部分输入通道也要做相应更改 + widen_factor=widen_factor)) +) +``` diff --git a/third_party/mmyolo/docs/zh_cn/recommended_topics/training_testing_tricks.md b/third_party/mmyolo/docs/zh_cn/recommended_topics/training_testing_tricks.md new file mode 100644 index 0000000000000000000000000000000000000000..ba67063f02c0ac609e573263003047a4919521ae --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/recommended_topics/training_testing_tricks.md @@ -0,0 +1,303 @@ +# 训练和测试技巧 + +MMYOLO 中已经支持了大部分 YOLO 系列目标检测相关算法。不同算法可能涉及到一些实用技巧。本章节将基于所实现的目标检测算法,详细描述 MMYOLO 中已经支持的常用的训练和测试技巧。 + +## 训练技巧 + +### 提升检测性能 + +#### 1 开启多尺度训练 + +在目标检测领域,多尺度训练是一个非常常用的技巧,但是在 YOLO 中大部分模型的训练输入都是单尺度的 640x640,原因有两个方面: + +1. 单尺度训练速度快。当训练 epoch 在 300 或者 500 的时候训练效率是用户非常关注的,多尺度训练会比较慢 +2. 训练 pipeline 中隐含了多尺度增强,等价于应用了多尺度训练,典型的如 `Mosaic`、`RandomAffine` 和 `Resize` 等,故没有必要再次引入模型输入的多尺度训练 + +在 COCO 数据集上进行了简单实验,如果直接在 YOLOv5 的 DataLoader 输出后再次引入多尺度训练增强实际性能提升非常小,但是这不代表用户自定义数据集微调模式下没有明显增益。如果想在 MMYOLO 中对 YOLO 系列算法开启多尺度训练,可以参考 [多尺度训练文档](../common_usage/ms_training_testing.md) + +#### 2 使用 Mask 标注优化目标检测性能 + +在数据集标注完备例如同时存在边界框和实例分割标注但任务只需要其中部分标注情况下,可以借助完备的数据标注训练单一任务从而提升性能。在目标检测中同样可以借鉴实例分割标注来提升目标检测性能。 以下是 YOLOv8 额外引入实例分割标注优化目标检测结果。 性能增益如下所示: + +
+ +
+ +从上述曲线图可以看出,不同尺度模型都有了不同程度性能提升。需要注意的是 `Mask Refine` 仅仅的是作用在数据增强阶段,对模型其他训练部分不需要任何改动,且不会影响训练速度。具体如下所示: + +
+ +
+ +上述的 Mask 表示实例分割标注发挥关键作用的数据增强变换,将该技巧应用到其他 YOLO 系列中均有不同程度涨点。 + +#### 3 训练后期关闭强增强提升检测性能 + +该策略是在 YOLOX 算法中第一次被提出可以极大的提升检测性能。 论文中指出虽然 Mosaic+MixUp 可以极大的提升目标检测性能,但是它生成的训练图片远远脱离自然图片的真实分布,并且 Mosaic 大量的裁剪操作会带来很多不准确的标注框,所以 YOLOX 提出在最后 15 个 epoch 关掉强增强,转而使用较弱的增强,从而为了让检测器避开不准确标注框的影响,在自然图片的数据分布下完成最终的收敛。 + +该策略已经被应用到了大部分 YOLO 算法中,以 YOLOv8 为例其数据增强 pipeline 如下所示: + +
+ +
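+
+在 MMYOLO 的配置中,"训练后期切换为弱增强"一般通过 `mmdet.PipelineSwitchHook` 实现:在指定的 epoch 把 `train_pipeline` 切换为弱增强版本的 `train_pipeline_stage2`。下面是一个最简示意(写在继承了 base 配置的 config 文件中,具体字段以各算法的官方配置为准,数值仅为假设):
+
+```python
+max_epochs = 300
+num_last_epochs = 15  # 假设最后 15 个 epoch 关闭 Mosaic/MixUp 等强增强
+
+custom_hooks = [
+    dict(
+        type='mmdet.PipelineSwitchHook',
+        # 在倒数第 num_last_epochs 个 epoch 切换数据增强 pipeline
+        switch_epoch=max_epochs - num_last_epochs,
+        # base 配置中预先定义好的弱增强 pipeline
+        switch_pipeline=_base_.train_pipeline_stage2)
+]
+```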
+ +不过在何时关闭强增强是一个超参,如果关闭太早则可能没有充分发挥 Mosaic 等强增强效果,如果关闭太晚则由于之前已经过拟合,此时再关闭则没有任何增益。 在 YOLOv8 实验中可以观察到该现象 + +| Backbone | Mask Refine | box AP | Epoch of best mAP | +| :------: | :---------: | :---------: | :---------------: | +| YOLOv8-n | No | 37.2 | 500 | +| YOLOv8-n | Yes | 37.4 (+0.2) | 499 | +| YOLOv8-s | No | 44.2 | 430 | +| YOLOv8-s | Yes | 45.1 (+0.9) | 460 | +| YOLOv8-m | No | 49.8 | 460 | +| YOLOv8-m | Yes | 50.6 (+0.8) | 480 | +| YOLOv8-l | No | 52.1 | 460 | +| YOLOv8-l | Yes | 53.0 (+0.9) | 491 | +| YOLOv8-x | No | 52.7 | 450 | +| YOLOv8-x | Yes | 54.0 (+1.3) | 460 | + +从上表可以看出: + +- 大模型在 COCO 数据集训练 500 epoch 会过拟合,在过拟合情况下再关闭 Mosaic 等强增强效果没有效果 +- 使用 Mask 标注可以缓解过拟合,并且提升性能 + +#### 4 加入纯背景图片抑制误报率 + +对于非开放世界数据集目标检测而言,训练和测试都是在固定类别上进行,一旦应用到没有训练过的类别图片上有可能会产生误报,一个常见的缓解策略是加入一定比例的纯背景图片。 在大部分 YOLO 系列中都是默认开启了加入纯背景图片抑制误报率功能,用户只需要设置 `train_dataloader.dataset.filter_cfg.filter_empty_gt` 为 False 即可,表示将纯背景图片不过滤掉加入训练。 + +#### 5 试试 AdamW 也许效果显著 + +YOLOv5,YOLOv6,YOLOv7 和 YOLOv8 等都是采用了 SGD 优化器,该参数器对参数的设置比较严格,而 AdamW 则正好相反,其对学习率设置等没有那么敏感。因此如果用户在自定义数据集微调可以尝试选择 AdamW 优化器。我们在 YOLOX 中进行了简单尝试,发现在 tiny、s 和 m 尺度模型上将其优化器替换为 AdamW 均有一定程度涨点。 + +| Backbone | Size | Batch Size | RTMDet-Hyp | Box AP | +| :--------: | :--: | :--------: | :--------: | :---------: | +| YOLOX-tiny | 416 | 8xb8 | No | 32.7 | +| YOLOX-tiny | 416 | 8xb32 | Yes | 34.3 (+1.6) | +| YOLOX-s | 640 | 8xb8 | No | 40.7 | +| YOLOX-s | 640 | 8xb32 | Yes | 41.9 (+1.2) | +| YOLOX-m | 640 | 8xb8 | No | 46.9 | +| YOLOX-m | 640 | 8xb32 | Yes | 47.5 (+0.6) | + +具体见 [configs/yolox/README.md](https://github.com/open-mmlab/mmyolo/blob/main/configs/yolox/README.md#--results-and-models)。 + +#### 6 考虑 ignore 场景避免不确定性标注 + +以 CrowdHuman 为例,其是一个拥挤行人检测数据集,下面是一张典型图片: + +
+ +
+ +图片来自 [detectron2 issue](https://github.com/facebookresearch/detectron2/issues/1909)。黄色打叉的区域表示 `iscrowd` 标注。原因有两个方面: + +- 这个区域不是真的人,例如海报上的人 +- 该区域过于拥挤,很难标注 + +在该场景下,你不能简单的将这类标注删掉,因为你一旦删掉就表示当做背景区域来训练了,但是其和背景是不一样的,首先海报上的人和真人很像,并且拥挤区域确实有人只是不好标注。如果你简单的将其当做背景训练,那么会造成漏报。最合适的做法应该是把拥挤区域当做忽略区域即该区域的任何输出都直接忽略,不计算任何 Loss,不强迫模型拟合。 + +MMYOLO 在 YOLOv5 上简单快速的验证了 `iscrowd` 标注的作用,性能如下所示: + +| Backbone | ignore_iof_thr | box AP50(CrowDHuman Metric) | MR | JI | +| :------: | :------------: | :-------------------------: | :--: | :---: | +| YOLOv5-s | -1 | 85.79 | 48.7 | 75.33 | +| YOLOv5-s | 0.5 | 86.17 | 48.8 | 75.87 | + +`ignore_iof_thr`为 -1 表示不考虑忽略标签,可以看出性能有一定程度的提升,具体见 [CrowdHuman 结果](https://github.com/open-mmlab/mmyolo/blob/main/configs/yolov5/README.md#crowdhuman)。 如果你的自定义数据集上也有上述情况,则建议你考虑 ignore 场景避免不确定性标注。 + +#### 7 使用知识蒸馏 + +知识蒸馏是一个被广泛使用的技巧,可以将大模型性能转移到小模型上从而提升小模型检测性能。 目前 MMYOLO 和 MMRazor 已支持了该功能,并在 RTMDet 上进行了初步验证。 + +| Model | box AP | +| :------------: | :---------: | +| RTMDet-tiny | 41.0 | +| RTMDet-tiny \* | 41.8 (+0.8) | +| RTMDet-s | 44.6 | +| RTMDet-s \* | 45.7 (+1.1) | +| RTMDet-m | 49.3 | +| RTMDet-m \* | 50.2 (+0.9) | +| RTMDet-l | 51.4 | +| RTMDet-l \* | 52.3 (+0.9) | + +星号即为采用了大模型蒸馏的结果,详情见 [Distill RTMDet](https://github.com/open-mmlab/mmyolo/tree/main/configs/rtmdet/distillation)。 + +#### 8 更大的模型用更强的增强参数 + +如果你基于默认配置修改了模型或者替换了骨干网络,那么建议你基于此刻模型大小来缩放数据增强参数。 一般来说更大的模型需要使用更强的增强参数,否则可能无法发挥大模型的效果,反之如果小模型应用了较强的增强则可能会欠拟合。 以 RTMDet 为例,我们可以观察其不同模型大小的数据增强参数 + +
+ +
+ +其中 `random_resize_ratio_range` 表示 `RandomResize` 的随机缩放范围,`mosaic_max_cached_images/mixup_max_cached_images`表示 `Mosaic/MixUp` 增强时候缓存的图片个数,可以用于调整增强的强度。 YOLO 系列模型都是遵循同一套参数设置原则。 + +### 加快训练速度 + +#### 1 单尺度训练开启 cudnn_benchmark + +YOLO 系列算法中大部分网络输入图片大小都是固定的即单尺度,此时可以开启 `cudnn_benchmark` 来加快训练速度。该参数主要针对 PyTorch 的 cuDNN 底层库进行设置, 设置这个标志可以让内置的 cuDNN 自动寻找最适合当前配置的高效算法来优化运行效率。如果是多尺度模式开启该标志则会不断的寻找最优算法,反而会拖慢训练速度。 + +在 MMYOLO 中开启 `cudnn_benchmark`,只需要在配置中设置 `env_cfg = dict(cudnn_benchmark=True)` + +#### 2 使用带缓存的 Mosaic 和 MixUp + +如果你的数据增强中应用了 Mosaic 和 MixUp,并且经过排查训练瓶颈来自图片的随机读取,那么建议将常规的 Mosaic 和 MixUp 替换为 RTMDet 中提出的带缓存的版本。 + +| Data Aug | Use cache | ms/100 imgs | +| :------: | :-------: | :---------: | +| Mosaic | No | 87.1 | +| Mosaic | Yes | 24.0 | +| MixUp | No | 19.3 | +| MixUp | Yes | 12.4 | + +Mosaic 和 MixUp 涉及到多张图片的混合,它们的耗时会是普通数据增强的 K 倍(K 为混入图片的数量)。 如在 YOLOv5 中每次做 Mosaic 时, 4 张图片的信息都需要从硬盘中重新加载。 而带缓存的 Mosaic 和 MixUp 只需要重新载入当前的一张图片,其余参与混合增强的图片则从缓存队列中获取,通过牺牲一定内存空间的方式大幅提升了效率。 + +
+data cache +
+ +如图所示,cache 队列中预先储存了 N 张已加载的图像与标签数据,每一个训练 step 中只需加载一张新的图片及其标签数据并更新到 cache 队列中(cache 队列中的图像可重复,如图中出现两次 img3),同时如果 cache 队列长度超过预设长度,则随机 pop 一张图,当需要进行混合数据增强时,只需要从 cache 中随机选择需要的图像进行拼接等处理,而不需要全部从硬盘中加载,节省了图像加载的时间。 + +### 减少超参 + +YOLOv5 中通过实践提供了一些减少超参数的方法,下面详细说明。 + +#### 1 Loss 权重自适应,少 1 个超参 + +一般来说,对于不同的任务或者不同的类别,可能需要针对性的设置超参,而这通常比较难。YOLOv5 中根据实践提出了一些根据类别数和检测输出层个数来自适应缩放 Loss 权重的方法,如下所示: + +```python +# scaled based on number of detection layers +loss_cls=dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='mean', + loss_weight=loss_cls_weight * + (num_classes / 80 * 3 / num_det_layers)), +loss_bbox=dict( + type='IoULoss', + iou_mode='ciou', + bbox_format='xywh', + eps=1e-7, + reduction='mean', + loss_weight=loss_bbox_weight * (3 / num_det_layer + return_iou=True), +loss_obj=dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='mean', + loss_weight=loss_obj_weight * + ((img_scale[0] / 640)**2 * 3 / num_det_layers)), +``` + +`loss_cls` 可以根据自定义类别数和检测层数对 `loss_weight` 进行自适应缩放,`loss_bbox` 可以根据检测层数进行自适应计算,而 `loss_obj` 可以根据输入图片大小和检测层数进行自适应缩放。这种策略可以让用户不用去设置 Loss 权重超参。 +需要说明的是:这个只是经验规则,并不是说是最佳设置组合,只是作为一个参考。 + +#### 2 Weight Decay 和 Loss 输出值基于 Batch Size 自适应,少 2 个超参 + +一般来说,在不同的 `Batch Size` 上进行训练,需要遵循学习率自动缩放规则。但是在各个数据集上验证表明 YOLOv5 实际上在改变 `Batch Size` 时候不缩放学习率也可以取得不错的效果,甚至有时候你缩放了效果还更差。原因就在于代码中存在 `Weight Decay` 和 Loss 输出值基于 `Batch Size` 自适应的技巧。在 YOLOv5 中会基于当前训练的总 `Batch Size` 来缩放 `Weight Decay` 和 Loss 输出值。对应代码为: + +```python +# https://github.com/open-mmlab/mmyolo/blob/dev/mmyolo/engine/optimizers/yolov5_optim_constructor.py#L86 +if 'batch_size_per_gpu' in optimizer_cfg: + batch_size_per_gpu = optimizer_cfg.pop('batch_size_per_gpu') + # No scaling if total_batch_size is less than + # base_total_batch_size, otherwise linear scaling. + total_batch_size = get_world_size() * batch_size_per_gpu + accumulate = max( + round(self.base_total_batch_size / total_batch_size), 1) + scale_factor = total_batch_size * \ + accumulate / self.base_total_batch_size + if scale_factor != 1: + weight_decay *= scale_factor + print_log(f'Scaled weight_decay to {weight_decay}', 'current') +``` + +```python +# https://github.com/open-mmlab/mmyolo/blob/dev/mmyolo/models/dense_heads/yolov5_head.py#L635 + _, world_size = get_dist_info() + return dict( + loss_cls=loss_cls * batch_size * world_size, + loss_obj=loss_obj * batch_size * world_size, + loss_bbox=loss_box * batch_size * world_size) +``` + +在不同的 Batch Size 下 Loss 的权重是不一样大的,Batch Size 越大,Loss 就越大,梯度就越大,我个人猜测这可以等价于 Batch Size 增大时候,学习率线性增加的场合。 +实际上从 YOLOv5 的 [YOLOv5 Study: mAP vs Batch-Size](https://github.com/ultralytics/yolov5/discussions/2452) 中可以发现确实是希望用户在修改 Batch Size 时不需要修改其他参数也可以相近的性能。上述两个策略是一个非常不错的训练技巧。 + +### 减少训练显存 + +如何减少训练显存是一个经常谈论的问题,所涉及的技术也非常多。 MMYOLO 的训练执行器来自 MMEngine,因此如何减少训练显存可以查阅 MMEngine 的文档。 MMEngine 目前支持梯度累加、梯度检查点和大模型训练技术,详情见 +[节省显存](https://mmengine.readthedocs.io/zh_CN/latest/common_usage/save_gpu_memory.html)。 + +## 测试技巧 + +### 推理速度和测试精度的平衡 + +在模型性能测试时候,我们一般是要求 mAP 越高越好,但是在实际应用或者推理时候我们希望在保证低误报率和漏报率情况下模型推理越快越好,或者说测试只关注 mAP 而忽略了后处理和评估速度,而实际落地应用时候会追求速度和精度的平衡。 +在 YOLO 系列中可以通过控制某些参数实现速度和精度平衡,下面以 YOLOv5 为例对其进行详细描述。 + +#### 1 推理时避免一个检测框输出多个类别 + +YOLOv5 在训练分类分支时候采用的是 BCE Loss 即 `use_sigmoid=True`。假设物体类别数是 4,那么分类分支输出的类别数是 4 而不是 5,并且由于使用的是 sigmoid 而非 softmax 预测模式,很可能在某个位置预测出多个满足过滤阈值的检测框,也就是会出现一个预测 bbox 对应多个预测 label 的情况。如下图所示 + +
+multi-label +
+ +一般在计算 mAP 时候过滤阈值为 0.001,由于 sigmoid 非竞争性预测模式会导致一个框对应多个 label。这种计算方式可以提高 mAP 计算时候的召回率,但是实际落地应用会不方便。 + +一个常用的办法就是提高过滤阈值,但是如果你不需要出现较多漏报,此时推荐你修改 `multi_label` 参数为 False,其位于配置的 `mode.test_cfg.multi_label` 中,默认值是 True 表示允许一个检测框对应多个 label。 + +#### 2 简化 test pipeline + +注意到 YOLOv5 的 test pipeline 为如下: + +```python +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=False, + pad_val=dict(img=114)), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] +``` + +其使用了两个不同功能的 Resize,目的依然是提高评估时候的 mAP 值。在实际落地应用时候你可以简化该 pipeline,如下所示: + +```python +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='LetterResize', + scale=_base_.img_scale, + allow_scale_up=True, + use_mini_pad=True), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] +``` + +实际上 YOLOv5 算法在实际应用时候是采用简化的 pipeline,并将 multi_label 设为 False, score_thr 提高为 0.25, iou_threshold 降低为 0.45。 +在 YOLOv5 配置中我们提供了一套 detect 落地时候的配置参数,具体见 [yolov5_s-v61_syncbn-detect_8xb16-300e_coco.py](https://github.com/open-mmlab/mmyolo/blob/main/configs/yolov5/yolov5_s-v61_syncbn-detect_8xb16-300e_coco.py)。 + +#### 3 Batch Shape 策略加快测试速度 + +Batch Shape 是 YOLOv5 中提出的可以加快推理的一个测试技巧,其思路是不再强制要求整个测试过程图片都是 640x640,而是可以变尺度测试,只需要保证当前 batch 内的 shape 是一样的就行。这种方式可以减少额外的图片像素填充,从而实现加速推理过程。 +Batch Shape 的具体实现可以参考 [链接](https://github.com/open-mmlab/mmyolo/blob/main/mmyolo/datasets/utils.py#L55)。MMYOLO 中几乎所有算法在测试时候都是默认开启了 Batch Shape 策略。 如果用户想关闭该功能,可以设置 `val_dataloader.dataset.batch_shapes_cfg=None`。 + +在实际落地场景下,因为动态 shape 没有固定 shape 快且高效,所以一般会不采用这个策略。 + +### TTA 提升测试精度 + +TTA 测试时增强是一个万能的涨点技巧,在打比赛时候非常有用。MMYOLO 已经支持了 TTA,只需要在测试时候输入 `--tta` 即可开启。详情见 [TTA 说明](https://github.com/open-mmlab/mmyolo/blob/dev/docs/zh_cn/common_usage/tta.md)。 diff --git a/third_party/mmyolo/docs/zh_cn/recommended_topics/troubleshooting_steps.md b/third_party/mmyolo/docs/zh_cn/recommended_topics/troubleshooting_steps.md new file mode 100644 index 0000000000000000000000000000000000000000..cc4fc2e21af949e988dc97e80cb3736ded32f6ee --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/recommended_topics/troubleshooting_steps.md @@ -0,0 +1,111 @@ +# 常见错误排除步骤 + +本文档收集用户经常碰到的常见错误情况,并提供详细的排查步骤。如果你发现阅读本文你没有找到正确的解决方案,请联系我们或者提 PR 进行更新。提 PR 请参考 [如何给 MMYOLO 贡献代码](../recommended_topics/contributing.md) + +## xxx is not in the model registry + +这个错误信息是指某个模块没有被注册到 model 中。 这个错误出现的原因非常多,典型的情况有: + +1. 你新增的模块没有在类别前面加上注册器装饰器 @MODELS.register_module() +2. 虽然注册了,但是注册错了位置,例如你实际想注册到 MMYOLO 中,但是你导入的 MODELS 是 MMDet 包里面的 +3. 你注册了且注册正确了,但是没有在对应的 `__init__.py` 中加入导致没有被导入 +4. 以上 3 个步骤都确认没问题,但是你是新增 py 文件来自定义模块的却没有重新安装 MMYOLO 导致没有生效,此时你可以重新安装一遍,即使你是 -e 模式安装也需要重新安装 +5. 如果你是在 mmyolo 包路径下新增了一个 package, 除上述步骤外,你还需要在 [register_all_modules](https://github.com/open-mmlab/mmyolo/blob/main/mmyolo/utils/setup_env.py#L8) 函数中增加其导包代码,否则该 package 不会被自动触发 +6. 你的环境中有多个版本 MMYOLO,你注册的和实际运行的实际上不是同一套代码,导致没有生效。此时你可以在程序运行前输入 `PYTHONPATH="$(dirname $0)/..":$PYTHONPATH` 强行使用当前代码 + +## loss_bbox 始终为 0 + +该原因出现主要有两个原因: + +1. 训练过程中没有 GT 标注数据 +2. 
参数设置不合理导致训练中没有正样本 + +第一种情况出现的概率更大。 `loss_bbox` 通常是只考虑正样本的 loss,如果训练中没有正样本则始终为 0。如果是第一种原因照成的 `loss_bbox` 始终为 0,那么通常意味着你配置不对,特别是 dataset 部分的配置不正确。 +一个非常典型的情况是用户的 `dataset` 中 `metainfo` 设置不正确或者设置了但是没有传给 dataset 导致加载后没有找到对应类别的 GT Bbox 标注。 这种情况请仔细阅读我们提供的 [示例配置](https://github.com/open-mmlab/mmyolo/blob/main/projects/misc/custom_dataset/yolov5_s-v61_syncbn_fast_1xb32-100e_cat.py#L27) 。 +验证 dataset 配置是否正确的一个最直接的途径是运行 [browse_dataset 脚本](https://github.com/open-mmlab/mmyolo/blob/main/tools/analysis_tools/browse_dataset.py),如果可视化效果正确则说明是正确的。 + +## MMCV 安装时间非常久 + +这通常意味着你在自己编译 MMCV 而不是直接下载使用我们提供的预编译包。 MMCV 中包括了大量的自定义的 CUDA 算子,如果从源码安装则需要非常久的时间去编译,并且由于其安装成功依赖于严格的底层环境信息,需要多个库的版本一致才可以。如果用户自己编译大概率会失败。 +我们不推荐用户自己去编译 MMCV 而应该优先选择预编译包。如果你当前的环境中我们没有提供对应的预编译包,那么建议你可以快速换一个 Conda 环境,并安装有预编译包的 Torch。 以 torch1.8.0+cu102 为例,如果你想查看目前查看所有的预编译包,可以查看 https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html。 + +## 基于官方配置继承新建的配置出现 unexpected keyword argument + +这通常是由于你没有删除 base 配置中的额外参数。 可以在你新建配置所修改的字典中增加 `_delete_=True` 删掉 base 中该类之前的所有参数。 + +## The testing results of the whole dataset is empty + +这通常说明训练效果太差导致网络没有预测出任何符合阈值要求的检测框。 出现这种现象有多个原因,典型的为: + +1. 当前为前几个 epoch,网络当前训练效果还较差,等后续训练久一点后可能就不会出现该警告了 +2. 配置设置不正确,网络虽然正常训练但是实际上无效训练,例如前面的 `loss_bbox` 始终为 0 就会导致上述警告 +3. 超参设置不合理 + +## ValueError: not enough values to unpack(expected 2, got 0) + +这个错误通常是在 epoch 切换时候出现。这是 PyTorch 1.7 的已知问题,在 PyTorch 1.8+ 中已经修复。如果在 PyTorch 1.7 中想修复这个问题,可以简单的设置 dataloader 参数 `persistent_workers` 为 False。 + +## ValueError: need at least one array to concatenate + +这个是一个非常常见的错误,可能出现在训练一开始或者训练正常但是评估时候。不管出现在何阶段,均说明你的配置不对: + +1. 最常见的错误就是 `num_classes` 参数设置不对。在 MMYOLO 或者 MMDet 中大部分配置都是以 COCO 数据为例,因此配置中默认的 `num_classes` 是 80, 如果用户自定义数据集没有正确修改这个字段则会出现上述错误。 + MMYOLO 中有些算法配置会在多个模块中都需要 `num_classes` 参数,用户经常出现的错误就是仅仅修改了某一个地方的 `num_classes` 而没有将所有的 `num_classes` 都修改。想快速解决这个问题,可以使用 [print_config](https://github.com/open-mmlab/mmyolo/blob/main/tools/misc/print_config.py) + 脚本打印下全配置,然后全局搜索 `num_classes` 确认是否有没有修改的模块。 +2. 该错误还可能会出现在对 `dataset.metainfo.classes` 参数设置不对造成的。当用户希望训练自己的数据集但是未能正确的修改 `dataset.metainfo.classes` 参数,而默认的使用 `COCO` 数据集中的类别时,且用户自定义数据集的所有类别不在 `COCO` 数据集的类别里就会出现该错误。这时需要用户核对并修改正确的 `dataset.metainfo.classes` 信息。 + +## 评估时候 IndexError: list index out of range + +具体输出信息是 + +```text + File "site-packages/mmdet/evaluation/metrics/coco_metric.py", line 216, in results2json + data['category_id'] = self.cat_ids[label] +IndexError: list index out of range +``` + +可以看出是评估时候类别索引越界,这个通常的原因是配置中的 `num_classes` 设置不正确,默认的 `num_classes` 是 80,如果你自定义类别小于 80,那么就有可能出现类别越界。注意算法配置的 `num_classes` 一般会用到多个模块,你可能只改了某几个而漏掉了一些。想快速解决这个问题,可以使用 [print_config](https://github.com/open-mmlab/mmyolo/blob/main/tools/misc/print_config.py) +脚本打印下全配置,然后全局搜索 `num_classes` 确认是否有没有修改的模块。 + +## 训练中不打印 loss,但是程序依然正常训练和评估 + +这通常是因为一个训练 epoch 没有超过 50 个迭代,而 MMYOLO 中默认的打印间隔是 50。你可以修改 `default_hooks.logger.interval` 参数。 + +## GPU out of memory + +1. 存在大量 ground truth boxes 或者大量 anchor 的场景,可能在 assigner 会 OOM。 +2. 使用 --amp 来开启混合精度训练。 +3. 你也可以尝试使用 MMDet 中的 AvoidCUDAOOM 来避免该问题。首先它将尝试调用 torch.cuda.empty_cache()。如果失败,将会尝试把输入类型转换到 FP16。如果仍然失败,将会把输入从 GPUs 转换到 CPUs 进行计算。这里提供了两个使用的例子: + +```python +from mmdet.utils import AvoidCUDAOOM + +output = AvoidCUDAOOM.retry_if_cuda_oom(some_function)(input1, input2) +``` + +你也可也使用 AvoidCUDAOOM 作为装饰器让代码遇到 OOM 的时候继续运行: + +```python +from mmdet.utils import AvoidCUDAOOM + +@AvoidCUDAOOM.retry_if_cuda_oom +def function(*args, **kwargs): + ... + return xxx +``` + +## Loss goes Nan + +1. 
检查数据的标注是否正常, 长或宽为 0 的框可能会导致回归 loss 变为 nan,一些小尺寸(宽度或高度小于 1)的框在数据增强后也会导致此问题。 因此,可以检查标注并过滤掉那些特别小甚至面积为 0 的框,并关闭一些可能会导致 0 面积框出现数据增强。 +2. 降低学习率:由于某些原因,例如 batch size 大小的变化, 导致当前学习率可能太大。 您可以降低为可以稳定训练模型的值。 +3. 延长 warm up 的时间:一些模型在训练初始时对学习率很敏感。 +4. 添加 gradient clipping: 一些模型需要梯度裁剪来稳定训练过程。 你可以在 config 设置 `optim_wrapper.clip_grad=dict(max_norm=xx)` + +## 训练中其他不符合预期或者错误 + +如果训练或者评估中出现了不属于上述描述的问题,由于原因不明,现提供常用的排除流程: + +1. 首先确认配置是否正确,可以使用 [print_config](https://github.com/open-mmlab/mmyolo/blob/main/tools/misc/print_config.py) 脚本打印全部配置,如果运行成功则说明配置语法没有错误 +2. 确认 COCO 格式的 json 标注是否正确,可以使用 [browse_coco_json.py](https://github.com/open-mmlab/mmyolo/blob/main/tools/misc/browse_coco_json.py) 脚本确认 +3. 确认 dataset 部分配置是否正确,这一步骤几乎是必须要提前运行的,可以提前排查很多问题,可以使用 [browse_dataset.py](https://github.com/open-mmlab/mmyolo/blob/main/tools/misc/browse_dataset.py) 脚本确认 +4. 如果以上 3 步都没有问题,那么出问题可能在 model 部分了。这个部分的排除没有特别的办法,你可以单独写一个脚本来仅运行 model 部分并通过调试来确认,如果对于 model 中多个模块的输入构建存在困惑,可以参考对应模块的单元测试写法 diff --git a/third_party/mmyolo/docs/zh_cn/recommended_topics/visualization.md b/third_party/mmyolo/docs/zh_cn/recommended_topics/visualization.md new file mode 100644 index 0000000000000000000000000000000000000000..ed4bbf94d72128ebeb91282bf7ebc6fcecf5c96f --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/recommended_topics/visualization.md @@ -0,0 +1,542 @@ +# 关于可视化的一切 + +本文包括特征图可视化和 Grad-Based 和 Grad-Free CAM 可视化 + +## 特征图可视化 + +
+image +
+可视化可以给深度学习的模型训练和测试过程提供直观解释。 + +MMYOLO 中,将使用 MMEngine 提供的 `Visualizer` 可视化器进行特征图可视化,其具备如下功能: + +- 支持基础绘图接口以及特征图可视化。 +- 支持选择模型中的不同层来得到特征图,包含 `squeeze_mean` , `select_max` , `topk` 三种显示方式,用户还可以使用 `arrangement` 自定义特征图显示的布局方式。 + +### 特征图绘制 + +你可以调用 `demo/featmap_vis_demo.py` 来简单快捷地得到可视化结果,为了方便理解,将其主要参数的功能梳理如下: + +- `img`:选择要用于特征图可视化的图片,支持单张图片或者图片路径列表。 + +- `config`:选择算法的配置文件。 + +- `checkpoint`:选择对应算法的权重文件。 + +- `--out-file`:将得到的特征图保存到本地,并指定路径和文件名。 + +- `--device`:指定用于推理图片的硬件,`--device cuda:0` 表示使用第 1 张 GPU 推理,`--device cpu` 表示用 CPU 推理。 + +- `--score-thr`:设置检测框的置信度阈值,只有置信度高于这个值的框才会显示。 + +- `--preview-model`:可以预览模型,方便用户理解模型的特征层结构。 + +- `--target-layers`:对指定层获取可视化的特征图。 + + - 可以单独输出某个层的特征图,例如: `--target-layers backbone` , `--target-layers neck` , `--target-layers backbone.stage4` 等。 + - 参数为列表时,也可以同时输出多个层的特征图,例如: `--target-layers backbone.stage4 neck` 表示同时输出 backbone 的 stage4 层和 neck 的三层一共四层特征图。 + +- `--channel-reduction`:输入的 Tensor 一般是包括多个通道的,`channel_reduction` 参数可以将多个通道压缩为单通道,然后和图片进行叠加显示,有以下三个参数可以设置: + + - `squeeze_mean`:将输入的 C 维度采用 mean 函数压缩为一个通道,输出维度变成 (1, H, W)。 + - `select_max`:将输入先在空间维度 sum,维度变成 (C, ),然后选择值最大的通道。 + - `None`:表示不需要压缩,此时可以通过 `topk` 参数可选择激活度最高的 `topk` 个特征图显示。 + +- `--topk`:只有在 `channel_reduction` 参数为 `None` 的情况下, `topk` 参数才会生效,其会按照激活度排序选择 `topk` 个通道,然后和图片进行叠加显示,并且此时会通过 `--arrangement` 参数指定显示的布局,该参数表示为一个数组,两个数字需要以空格分开,例如: `--topk 5 --arrangement 2 3` 表示以 `2行 3列` 显示激活度排序最高的 5 张特征图, `--topk 7 --arrangement 3 3` 表示以 `3行 3列` 显示激活度排序最高的 7 张特征图。 + + - 如果 topk 不是 -1,则会按照激活度排序选择 topk 个通道显示。 + - 如果 topk = -1,此时通道 C 必须是 1 或者 3 表示输入数据是图片,否则报错提示用户应该设置 `channel_reduction` 来压缩通道。 + +- 考虑到输入的特征图通常非常小,函数默认将特征图进行上采样后方便进行可视化。 + +**注意:当图片和特征图尺度不一样时候,`draw_featmap` 函数会自动进行上采样对齐。如果你的图片在推理过程中前处理存在类似 Pad 的操作此时得到的特征图也是 Pad 过的,那么直接上采样就可能会出现不对齐问题。** + +### 用法示例 + +以预训练好的 YOLOv5-s 模型为例: + +请提前下载 YOLOv5-s 模型权重到本仓库根路径下: + +```shell +cd mmyolo +wget https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth +``` + +(1) 将多通道特征图采用 `select_max` 参数压缩为单通道并显示, 通过提取 `backbone` 层输出进行特征图可视化,将得到 `backbone` 三个输出层的特征图: + +```shell +python demo/featmap_vis_demo.py demo/dog.jpg \ + configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py \ + yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth \ + --target-layers backbone \ + --channel-reduction select_max +``` + +
+image +
+
+实际上上述代码存在图片和特征图不对齐问题,解决办法有两个:
+
+1. 修改 YOLOv5 配置,让前处理只是简单的 Resize,这对可视化结果没有影响
+
+2. 可视化时图片应该用前处理后的,而不能用前处理前的原图
+
+**为了简单,目前这里采用第一种解决办法,后续会采用第二种方案修复,让大家不修改配置即可使用**。具体来说是将原先的 `test_pipeline` 替换为仅仅 Resize 的版本。
+
+旧的 `test_pipeline` 为:
+
+```python
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='YOLOv5KeepRatioResize', scale=img_scale),
+    dict(
+        type='LetterResize',
+        scale=img_scale,
+        allow_scale_up=False,
+        pad_val=dict(img=114)),
+    dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'),
+    dict(
+        type='mmdet.PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor', 'pad_param'))
+]
+```
+
+修改为如下配置:
+
+```python
+test_pipeline = [
+    dict(
+        type='LoadImageFromFile',
+        backend_args=_base_.backend_args),
+    dict(type='mmdet.Resize', scale=img_scale, keep_ratio=False),  # 这里将 LetterResize 修改成 mmdet.Resize
+    dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'),
+    dict(
+        type='mmdet.PackDetInputs',
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+```
+
+正确效果如下:
+
+image +
+ +(2) 将多通道特征图采用 `squeeze_mean` 参数压缩为单通道并显示, 通过提取 `neck` 层输出进行特征图可视化,将得到 `neck` 三个输出层的特征图: + +```shell +python demo/featmap_vis_demo.py demo/dog.jpg \ + configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py \ + yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth \ + --target-layers neck \ + --channel-reduction squeeze_mean +``` + +
+image +
+ +(3) 将多通道特征图采用 `squeeze_mean` 参数压缩为单通道并显示, 通过提取 `backbone.stage4` 和 `backbone.stage3` 层输出进行特征图可视化,将得到两个输出层的特征图: + +```shell +python demo/featmap_vis_demo.py demo/dog.jpg \ + configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py \ + yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth \ + --target-layers backbone.stage4 backbone.stage3 \ + --channel-reduction squeeze_mean +``` + +
+image +
+ +(4) 利用 `--topk 3 --arrangement 2 2` 参数选择多通道特征图中激活度最高的 3 个通道并采用 `2x2` 布局显示, 用户可以通过 `arrangement` 参数选择自己想要的布局,特征图将自动布局,先按每个层中的 `top3` 特征图按 `2x2` 的格式布局,再将每个层按 `2x2` 布局: + +```shell +python demo/featmap_vis_demo.py demo/dog.jpg \ + configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py \ + yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth \ + --target-layers backbone.stage3 backbone.stage4 \ + --channel-reduction None \ + --topk 3 \ + --arrangement 2 2 +``` + +
+image +
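+
+上面几个例子中的 `--channel-reduction`、`--topk` 和 `--arrangement` 参数,大体对应 MMEngine 中 `Visualizer.draw_featmap` 接口的同名参数。如果想在自己的脚本里直接绘制特征图,可以参考下面的示意代码(其中 `feat` 与 `image` 均为随机构造的示例数据,接口细节请以你安装的 MMEngine 版本为准):
+
+```python
+import mmcv
+import numpy as np
+import torch
+from mmengine.visualization import Visualizer
+
+# 假设 feat 是通过 hook 等方式从模型某一层拿到的特征图,形状为 (C, H, W)
+feat = torch.rand(256, 20, 20)
+# 假设 image 是对应的输入图像,形状为 (H, W, 3),RGB 顺序
+image = np.random.randint(0, 255, (640, 640, 3), dtype=np.uint8)
+
+visualizer = Visualizer()
+# channel_reduction=None 时,由 topk 和 arrangement 决定显示的通道数与布局
+drawn_img = visualizer.draw_featmap(
+    feat,
+    overlaid_image=image,
+    channel_reduction=None,
+    topk=3,
+    arrangement=(2, 2))
+# draw_featmap 返回叠加后的 RGB 图像,可以直接保存或显示
+mmcv.imwrite(mmcv.rgb2bgr(drawn_img), 'featmap.jpg')
+```
+
+如果将 `channel_reduction` 设置为 `squeeze_mean` 或 `select_max`,则无需再指定 `topk` 和 `arrangement`。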
+ +(5) 存储绘制后的图片,在绘制完成后,可以选择本地窗口显示,也可以存储到本地,只需要加入参数 `--out-file xxx.jpg`: + +```shell +python demo/featmap_vis_demo.py demo/dog.jpg \ + configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py \ + yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth \ + --target-layers backbone \ + --channel-reduction select_max \ + --out-file featmap_backbone.jpg +``` + +## Grad-Based 和 Grad-Free CAM 可视化 + +目标检测 CAM 可视化相比于分类 CAM 复杂很多且差异很大。本文只是简要说明用法,后续会单独开文档详细描述实现原理和注意事项。 + +你可以调用 `demo/boxmap_vis_demo.py` 来简单快捷地得到 Box 级别的 AM 可视化结果,目前已经支持 `YOLOv5/YOLOv6/YOLOX/RTMDet`。 + +以 YOLOv5 为例,和特征图可视化绘制一样,你需要先修改 `test_pipeline`,否则会出现特征图和原图不对齐问题。 + +旧的 `test_pipeline` 为: + +```python +test_pipeline = [ + dict( + type='LoadImageFromFile', + backend_args=_base_.backend_args), + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=False, + pad_val=dict(img=114)), + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] +``` + +修改为如下配置: + +```python +test_pipeline = [ + dict( + type='LoadImageFromFile', + backend_args=_base_.backend_args), + dict(type='mmdet.Resize', scale=img_scale, keep_ratio=False), # 这里将 LetterResize 修改成 mmdet.Resize + dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +``` + +(1) 使用 `GradCAM` 方法可视化 neck 模块的最后一个输出层的 AM 图 + +```shell +python demo/boxam_vis_demo.py \ + demo/dog.jpg \ + configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py \ + yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth + +``` + +
+image +
+ +相对应的特征图 AM 图如下: + +
+image +
+
+可以看出 `GradCAM` 可以突出 box 级别的 AM 信息。
+
+你可以通过 `--topk` 参数选择仅可视化预测分值最高的前几个预测框:
+
+```shell
+python demo/boxam_vis_demo.py \
+        demo/dog.jpg \
+        configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py \
+        yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth \
+        --topk 2
+```
+
+image +
+ +(2) 使用 `AblationCAM` 方法可视化 neck 模块的最后一个输出层的 AM 图 + +```shell +python demo/boxam_vis_demo.py \ + demo/dog.jpg \ + configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py \ + yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth \ + --method ablationcam +``` + +
+image +
+ +由于 `AblationCAM` 是通过每个通道对分值的贡献程度来加权,因此无法实现类似 `GradCAM` 的仅仅可视化 box 级别的 AM 信息, 但是你可以使用 `--norm-in-bbox` 来仅仅显示 bbox 内部 AM + +```shell +python demo/boxam_vis_demo.py \ + demo/dog.jpg \ + configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py \ + yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth \ + --method ablationcam \ + --norm-in-bbox +``` + +
+image +
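+
+另外补充一点:`demo/boxam_vis_demo.py` 的 CAM 计算依赖第三方 Grad-CAM 库(这里假设其 PyPI 包名为 `grad-cam`)。如果运行脚本时提示缺少 `pytorch_grad_cam` 相关模块,可以先尝试安装:
+
+```shell
+pip install "grad-cam"
+```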
+ +## 可视化 COCO 标签 + +脚本 `tools/analysis_tools/browse_coco_json.py` 能够使用可视化显示 COCO 标签在图片的情况。 + +```shell +python tools/analysis_tools/browse_coco_json.py [--data-root ${DATA_ROOT}] \ + [--img-dir ${IMG_DIR}] \ + [--ann-file ${ANN_FILE}] \ + [--wait-time ${WAIT_TIME}] \ + [--disp-all] [--category-names CATEGORY_NAMES [CATEGORY_NAMES ...]] \ + [--shuffle] +``` + +其中,如果图片、标签都在同一个文件夹下的话,可以指定 `--data-root` 到该文件夹,然后 `--img-dir` 和 `--ann-file` 指定该文件夹的相对路径,代码会自动拼接。 +如果图片、标签文件不在同一个文件夹下的话,则无需指定 `--data-root` ,直接指定绝对路径的 `--img-dir` 和 `--ann-file` 即可。 + +例子: + +1. 查看 `COCO` 全部类别,同时展示 `bbox`、`mask` 等所有类型的标注: + +```shell +python tools/analysis_tools/browse_coco_json.py --data-root './data/coco' \ + --img-dir 'train2017' \ + --ann-file 'annotations/instances_train2017.json' \ + --disp-all +``` + +如果图片、标签不在同一个文件夹下的话,可以使用绝对路径: + +```shell +python tools/analysis_tools/browse_coco_json.py --img-dir '/dataset/image/coco/train2017' \ + --ann-file '/label/instances_train2017.json' \ + --disp-all +``` + +2. 查看 `COCO` 全部类别,同时仅展示 `bbox` 类型的标注,并打乱显示: + +```shell +python tools/analysis_tools/browse_coco_json.py --data-root './data/coco' \ + --img-dir 'train2017' \ + --ann-file 'annotations/instances_train2017.json' \ + --shuffle +``` + +3. 只查看 `bicycle` 和 `person` 类别,同时仅展示 `bbox` 类型的标注: + +```shell +python tools/analysis_tools/browse_coco_json.py --data-root './data/coco' \ + --img-dir 'train2017' \ + --ann-file 'annotations/instances_train2017.json' \ + --category-names 'bicycle' 'person' +``` + +4. 查看 `COCO` 全部类别,同时展示 `bbox`、`mask` 等所有类型的标注,并打乱显示: + +```shell +python tools/analysis_tools/browse_coco_json.py --data-root './data/coco' \ + --img-dir 'train2017' \ + --ann-file 'annotations/instances_train2017.json' \ + --disp-all \ + --shuffle +``` + +## 可视化数据集 + +```shell +python tools/analysis_tools/browse_dataset.py \ + ${CONFIG_FILE} \ + [-o, --output-dir ${OUTPUT_DIR}] \ + [-p, --phase ${DATASET_PHASE}] \ + [-n, --show-number ${NUMBER_IMAGES_DISPLAY}] \ + [-i, --show-interval ${SHOW_INTERRVAL}] \ + [-m, --mode ${DISPLAY_MODE}] \ + [--cfg-options ${CFG_OPTIONS}] +``` + +**所有参数的说明**: + +- `config` : 模型配置文件的路径。 +- `-o, --output-dir`: 保存图片文件夹,如果没有指定,默认为 `'./output'`。 +- **`-p, --phase`**: 可视化数据集的阶段,只能为 `['train', 'val', 'test']` 之一,默认为 `'train'`。 +- **`-n, --show-number`**: 可视化样本数量。如果没有指定,默认展示数据集的所有图片。 +- **`-m, --mode`**: 可视化的模式,只能为 `['original', 'transformed', 'pipeline']` 之一。 默认为 `'transformed'`。 +- `--cfg-options` : 对配置文件的修改,参考[学习配置文件](../../tutorials/config.md)。 + +```shell +`-m, --mode` 用于设置可视化的模式,默认设置为 'transformed'。 +- 如果 `--mode` 设置为 'original',则获取原始图片; +- 如果 `--mode` 设置为 'transformed',则获取预处理后的图片; +- 如果 `--mode` 设置为 'pipeline',则获得数据流水线所有中间过程图片。 +``` + +**示例**: + +1. **'original'** 模式 : + +```shell +python ./tools/analysis_tools/browse_dataset.py configs/yolov5/yolov5_balloon.py --phase val --output-dir tmp --mode original +``` + +- `--phase val`: 可视化验证集, 可简化为 `-p val`; +- `--output-dir tmp`: 可视化结果保存在 "tmp" 文件夹, 可简化为 `-o tmp`; +- `--mode original`: 可视化原图, 可简化为 `-m original`; +- `--show-number 100`: 可视化100张图,可简化为 `-n 100`; + +2.**'transformed'** 模式 : + +```shell +python ./tools/analysis_tools/browse_dataset.py configs/yolov5/yolov5_balloon.py +``` + +3.**'pipeline'** 模式 : + +```shell +python ./tools/analysis_tools/browse_dataset.py configs/yolov5/yolov5_balloon.py -m pipeline +``` + +
+Image +
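+
+上述参数可以组合使用,例如下面的命令表示以 `pipeline` 模式可视化验证集的前 20 张图片,并将结果保存到 `work_dirs/browse` 文件夹(图片数量和保存路径仅为示例):
+
+```shell
+python ./tools/analysis_tools/browse_dataset.py configs/yolov5/yolov5_balloon.py \
+    -p val \
+    -m pipeline \
+    -n 20 \
+    -o work_dirs/browse
+```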
+ +## 可视化数据集分析 + +脚本 `tools/analysis_tools/dataset_analysis.py` 能够帮助用户得到四种功能的结果图,并将图片保存到当前运行目录下的 `dataset_analysis` 文件夹中。 + +关于该脚本的功能的说明: + +通过 `main()` 的数据准备,得到每个子函数所需要的数据。 + +功能一:显示类别和 bbox 实例个数的分布图,通过子函数 `show_bbox_num` 生成。 + + + +功能二:显示类别和 bbox 实例宽、高的分布图,通过子函数 `show_bbox_wh` 生成。 + + + +功能三:显示类别和 bbox 实例宽/高比例的分布图,通过子函数 `show_bbox_wh_ratio` 生成。 + + + +功能四:基于面积规则下,显示类别和 bbox 实例面积的分布图,通过子函数 `show_bbox_area` 生成。 + + + +打印列表显示,通过脚本中子函数 `show_class_list` 和 `show_data_list` 生成。 + + + +```shell +python tools/analysis_tools/dataset_analysis.py ${CONFIG} \ + [-h] \ + [--val-dataset ${TYPE}] \ + [--class-name ${CLASS_NAME}] \ + [--area-rule ${AREA_RULE}] \ + [--func ${FUNC}] \ + [--out-dir ${OUT_DIR}] +``` + +例子: + +1. 使用 `config` 文件 `configs/yolov5/voc/yolov5_s-v61_fast_1xb64-50e_voc.py` 分析数据集,其中默认设置:数据加载类型为 `train_dataset` ,面积规则设置为 `[0,32,96,1e5]` ,生成包含所有类的结果图并将图片保存到当前运行目录下 `./dataset_analysis` 文件夹中: + +```shell +python tools/analysis_tools/dataset_analysis.py configs/yolov5/voc/yolov5_s-v61_fast_1xb64-50e_voc.py +``` + +2. 使用 `config` 文件 `configs/yolov5/voc/yolov5_s-v61_fast_1xb64-50e_voc.py` 分析数据集,通过 `--val-dataset` 设置将数据加载类型由默认的 `train_dataset` 改为 `val_dataset`: + +```shell +python tools/analysis_tools/dataset_analysis.py configs/yolov5/voc/yolov5_s-v61_fast_1xb64-50e_voc.py \ + --val-dataset +``` + +3. 使用 `config` 文件 `configs/yolov5/voc/yolov5_s-v61_fast_1xb64-50e_voc.py` 分析数据集,通过 `--class-name` 设置将生成所有类改为特定类显示,以显示 `person` 为例: + +```shell +python tools/analysis_tools/dataset_analysis.py configs/yolov5/voc/yolov5_s-v61_fast_1xb64-50e_voc.py \ + --class-name person +``` + +4. 使用 `config` 文件 `configs/yolov5/voc/yolov5_s-v61_fast_1xb64-50e_voc.py` 分析数据集,通过 `--area-rule` 重新定义面积规则,以 `30 70 125` 为例,面积规则变为 `[0,30,70,125,1e5]`: + +```shell +python tools/analysis_tools/dataset_analysis.py configs/yolov5/voc/yolov5_s-v61_fast_1xb64-50e_voc.py \ + --area-rule 30 70 125 +``` + +5. 使用 `config` 文件 `configs/yolov5/voc/yolov5_s-v61_fast_1xb64-50e_voc.py` 分析数据集,通过 `--func` 设置,将显示四个功能效果图改为只显示 `功能一` 为例: + +```shell +python tools/analysis_tools/dataset_analysis.py configs/yolov5/voc/yolov5_s-v61_fast_1xb64-50e_voc.py \ + --func show_bbox_num +``` + +6. 使用 `config` 文件 `configs/yolov5/voc/yolov5_s-v61_fast_1xb64-50e_voc.py` 分析数据集,通过 `--out-dir` 设置修改图片保存地址,以 `work_dirs/dataset_analysis` 地址为例: + +```shell +python tools/analysis_tools/dataset_analysis.py configs/yolov5/voc/yolov5_s-v61_fast_1xb64-50e_voc.py \ + --out-dir work_dirs/dataset_analysis +``` + +## 优化器参数策略可视化 + +`tools/analysis_tools/vis_scheduler.py` 旨在帮助用户检查优化器的超参数调度器(无需训练),支持学习率(learning rate)、动量(momentum)和权值衰减(weight decay)。 + +```shell +python tools/analysis_tools/vis_scheduler.py \ + ${CONFIG_FILE} \ + [-p, --parameter ${PARAMETER_NAME}] \ + [-d, --dataset-size ${DATASET_SIZE}] \ + [-n, --ngpus ${NUM_GPUs}] \ + [-o, --out-dir ${OUT_DIR}] \ + [--title ${TITLE}] \ + [--style ${STYLE}] \ + [--window-size ${WINDOW_SIZE}] \ + [--cfg-options] +``` + +**所有参数的说明**: + +- `config` : 模型配置文件的路径。 +- **`-p, parameter`**: 可视化参数名,只能为 `["lr", "momentum", "wd"]` 之一, 默认为 `"lr"`. 
+- **`-d, --dataset-size`**: 数据集的大小。如果指定,`DATASETS.build` 将被跳过并使用这个数值作为数据集大小,默认使用 `DATASETS.build` 所得数据集的大小。 +- **`-n, --ngpus`**: 使用 GPU 的数量, 默认为1。 +- **`-o, --out-dir`**: 保存的可视化图片的文件夹路径,默认不保存。 +- `--title`: 可视化图片的标题,默认为配置文件名。 +- `--style`: 可视化图片的风格,默认为 `whitegrid`。 +- `--window-size`: 可视化窗口大小,如果没有指定,默认为 `12*7`。如果需要指定,按照格式 `'W*H'`。 +- `--cfg-options`: 对配置文件的修改,参考[学习配置文件](../tutorials/config.md)。 + +```{note} +部分数据集在解析标注阶段比较耗时,推荐直接将 `-d, dataset-size` 指定数据集的大小,以节约时间。 +``` + +你可以使用如下命令来绘制配置文件 `configs/rtmdet/rtmdet_s_syncbn_fast_8xb32-300e_coco.py` 将会使用的学习率变化曲线: + +```shell +python tools/analysis_tools/vis_scheduler.py \ + configs/rtmdet/rtmdet_s_syncbn_fast_8xb32-300e_coco.py \ + --dataset-size 118287 \ + --ngpus 8 \ + --out-dir ./output +``` + +
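+
+同理,如果想查看动量(momentum)或权值衰减(weight decay)的变化曲线,只需要把 `-p` 参数改为 `momentum` 或 `wd`,例如(以下命令仅作示意):
+
+```shell
+python tools/analysis_tools/vis_scheduler.py \
+    configs/rtmdet/rtmdet_s_syncbn_fast_8xb32-300e_coco.py \
+    -p momentum \
+    --dataset-size 118287 \
+    --ngpus 8 \
+    --out-dir ./output
+```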
+ +## 大图推理 (TODO) diff --git a/third_party/mmyolo/docs/zh_cn/stat.py b/third_party/mmyolo/docs/zh_cn/stat.py new file mode 100755 index 0000000000000000000000000000000000000000..44505546c751e0dafbc4c4713ef374933c605ca8 --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/stat.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python +import functools as func +import glob +import os.path as osp +import re + +import numpy as np + +url_prefix = 'https://github.com/open-mmlab/mmyolo/blob/main/' + +files = sorted(glob.glob('../configs/*/README.md')) + +stats = [] +titles = [] +num_ckpts = 0 + +for f in files: + url = osp.dirname(f.replace('../', url_prefix)) + + with open(f) as content_file: + content = content_file.read() + + title = content.split('\n')[0].replace('# ', '').strip() + ckpts = { + x.lower().strip() + for x in re.findall(r'\[model\]\((https?.*)\)', content) + } + + if len(ckpts) == 0: + continue + + _papertype = [x for x in re.findall(r'\[([A-Z]+)\]', content)] + assert len(_papertype) > 0 + papertype = _papertype[0] + + paper = {(papertype, title)} + + titles.append(title) + num_ckpts += len(ckpts) + + statsmsg = f""" +\t* [{papertype}] [{title}]({url}) ({len(ckpts)} ckpts) +""" + stats.append((paper, ckpts, statsmsg)) + +allpapers = func.reduce(lambda a, b: a.union(b), [p for p, _, _ in stats]) +msglist = '\n'.join(x for _, _, x in stats) + +papertypes, papercounts = np.unique([t for t, _ in allpapers], + return_counts=True) +countstr = '\n'.join( + [f' - {t}: {c}' for t, c in zip(papertypes, papercounts)]) + +modelzoo = f""" +# Model Zoo Statistics + +* Number of papers: {len(set(titles))} +{countstr} + +* Number of checkpoints: {num_ckpts} + +{msglist} +""" + +with open('modelzoo_statistics.md', 'w') as f: + f.write(modelzoo) diff --git a/third_party/mmyolo/docs/zh_cn/switch_language.md b/third_party/mmyolo/docs/zh_cn/switch_language.md new file mode 100644 index 0000000000000000000000000000000000000000..57b71ebfe41843c8bc8ad29d01d4657f0770465e --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/switch_language.md @@ -0,0 +1,3 @@ +## English + +## 简体中文 diff --git a/third_party/mmyolo/docs/zh_cn/tutorials/config.md b/third_party/mmyolo/docs/zh_cn/tutorials/config.md new file mode 100644 index 0000000000000000000000000000000000000000..d43a4fceb71b4580403b12b58d8ef8447633632f --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/tutorials/config.md @@ -0,0 +1,553 @@ +# 学习 YOLOv5 配置文件 + +MMYOLO 和其他 OpenMMLab 仓库使用 [MMEngine 的配置文件系统](https://mmengine.readthedocs.io/zh_cn/latest/tutorials/config.md)。 配置文件使用了模块化和继承设计,以便于进行各类实验。 + +## 配置文件的内容 + +MMYOLO 采用模块化设计,所有功能的模块都可以通过配置文件进行配置。 以 [yolov5_s-v61_syncbn_8xb16-300e_coco.py](https://github.com/open-mmlab/mmyolo/blob/main/configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py) 为例,我们将根据不同的功能模块介绍配置文件中的各个字段: + +### 重要参数 + +如下参数是修改训练配置时经常需要修改的参数。例如缩放因子 `deepen_factor` 和 `widen_factor`,MMYOLO 中的网络基本都使用它们来控制模型的大小。所以我们推荐在配置文件中单独定义这些参数。 + +```python +img_scale = (640, 640) # 高度,宽度 +deepen_factor = 0.33 # 控制网络结构深度的缩放因子,YOLOv5-s 为 0.33 +widen_factor = 0.5 # 控制网络结构宽度的缩放因子,YOLOv5-s 为 0.5 +max_epochs = 300 # 最大训练轮次 300 轮 +save_epoch_intervals = 10 # 验证间隔,每 10 个 epoch 验证一次 +train_batch_size_per_gpu = 16 # 训练时单个 GPU 的 Batch size +train_num_workers = 8 # 训练时单个 GPU 分配的数据加载线程数 +val_batch_size_per_gpu = 1 # 验证时单个 GPU 的 Batch size +val_num_workers = 2 # 验证时单个 GPU 分配的数据加载线程数 +``` + +### 模型配置 + +在 MMYOLO 的配置中,我们使用 `model` 字段来配置检测算法的组件。 除了 `backbone`、`neck` 等神经网络组件外,还需要 `data_preprocessor`、`train_cfg` 和 `test_cfg`。 `data_preprocessor` 负责对 dataloader 输出的每一批数据进行预处理。 模型配置中的 `train_cfg` 
和 `test_cfg` 用于设置训练和测试组件的超参数。 + +```python +anchors = [[(10, 13), (16, 30), (33, 23)], # 多尺度的先验框基本尺寸 + [(30, 61), (62, 45), (59, 119)], + [(116, 90), (156, 198), (373, 326)]] +strides = [8, 16, 32] # 先验框生成器的步幅 + +model = dict( + type='YOLODetector', #检测器名 + data_preprocessor=dict( # 数据预处理器的配置,通常包括图像归一化和 padding + type='mmdet.DetDataPreprocessor', # 数据预处理器的类型,还可以选择 'YOLOv5DetDataPreprocessor' 训练速度更快 + mean=[0., 0., 0.], # 用于预训练骨干网络的图像归一化通道均值,按 R、G、B 排序 + std=[255., 255., 255.], # 用于预训练骨干网络的图像归一化通道标准差,按 R、G、B 排序 + bgr_to_rgb=True), # 是否将图像通道从 BGR 转为 RGB + backbone=dict( # 主干网络的配置文件 + type='YOLOv5CSPDarknet', # 主干网络的类别,目前可选用 'YOLOv5CSPDarknet', 'YOLOv6EfficientRep', 'YOLOXCSPDarknet' 3种 + deepen_factor=deepen_factor, # 控制网络结构深度的缩放因子 + widen_factor=widen_factor, # 控制网络结构宽度的缩放因子 + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), # 归一化层(norm layer)的配置项 + act_cfg=dict(type='SiLU', inplace=True)), # 激活函数(activation function)的配置项 + neck=dict( + type='YOLOv5PAFPN', # 检测器的 neck 是 YOLOv5FPN,我们同样支持 'YOLOv6RepPAFPN', 'YOLOXPAFPN' + deepen_factor=deepen_factor, # 控制网络结构深度的缩放因子 + widen_factor=widen_factor, # 控制网络结构宽度的缩放因子 + in_channels=[256, 512, 1024], # 输入通道数,与 Backbone 的输出通道一致 + out_channels=[256, 512, 1024], # 输出通道数,与 Head 的输入通道一致 + num_csp_blocks=3, # CSPLayer 中 bottlenecks 的数量 + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), # 归一化层(norm layer)的配置项 + act_cfg=dict(type='SiLU', inplace=True)), # 激活函数(activation function)的配置项 + bbox_head=dict( + type='YOLOv5Head', # bbox_head 的类型是 'YOLOv5Head', 我们目前也支持 'YOLOv6Head', 'YOLOXHead' + head_module=dict( + type='YOLOv5HeadModule', # head_module 的类型是 'YOLOv5HeadModule', 我们目前也支持 'YOLOv6HeadModule', 'YOLOXHeadModule' + num_classes=80, # 分类的类别数量 + in_channels=[256, 512, 1024], # 输入通道数,与 Neck 的输出通道一致 + widen_factor=widen_factor, # 控制网络结构宽度的缩放因子 + featmap_strides=[8, 16, 32], # 多尺度特征图的步幅 + num_base_priors=3), # 在一个点上,先验框的数量 + prior_generator=dict( # 先验框(prior)生成器的配置 + type='mmdet.YOLOAnchorGenerator', # 先验框生成器的类型是 mmdet 中的 'YOLOAnchorGenerator' + base_sizes=anchors, # 多尺度的先验框基本尺寸 + strides=strides), # 先验框生成器的步幅, 与 FPN 特征步幅一致。如果未设置 base_sizes,则当前步幅值将被视为 base_sizes。 + ), + test_cfg=dict( + multi_label=True, # 对于多类别预测来说是否考虑多标签,默认设置为 True + nms_pre=30000, # NMS 前保留的最大检测框数目 + score_thr=0.001, # 过滤类别的分值,低于 score_thr 的检测框当做背景处理 + nms=dict(type='nms', # NMS 的类型 + iou_threshold=0.65), # NMS 的阈值 + max_per_img=300)) # 每张图像 NMS 后保留的最大检测框数目 +``` + +### 数据集和评测器配置 + +在使用 [执行器](https://mmengine.readthedocs.io/zh_CN/latest/tutorials/runner.html) 进行训练、测试、验证时,我们需要配置 [Dataloader](https://pytorch.org/docs/stable/data.html?highlight=data%20loader#torch.utils.data.DataLoader) 。构建数据 dataloader 需要设置数据集(dataset)和数据处理流程(data pipeline)。 由于这部分的配置较为复杂,我们使用中间变量来简化 dataloader 配置的编写。由于 MMYOLO 中各类轻量目标检测算法使用了更加复杂的数据增强方法,因此会比 MMDetection 中的其他模型拥有更多样的数据集配置。 + +YOLOv5 的训练与测试的数据流存在一定差异,这里我们分别进行介绍。 + +```python +dataset_type = 'CocoDataset' # 数据集类型,这将被用来定义数据集 +data_root = 'data/coco/' # 数据的根路径 + +pre_transform = [ # 训练数据读取流程 + dict( + type='LoadImageFromFile'), # 第 1 个流程,从文件路径里加载图像 + dict(type='LoadAnnotations', # 第 2 个流程,对于当前图像,加载它的注释信息 + with_bbox=True) # 是否使用标注框(bounding box),目标检测需要设置为 True +] + +albu_train_transforms = [ # YOLOv5-v6.1 仓库中,引入了 Albumentation 代码库进行图像的数据增广, 请确保其版本为 1.0.+ + dict(type='Blur', p=0.01), # 图像模糊,模糊概率 0.01 + dict(type='MedianBlur', p=0.01), # 均值模糊,模糊概率 0.01 + dict(type='ToGray', p=0.01), # 随机转换为灰度图像,转灰度概率 0.01 + dict(type='CLAHE', p=0.01) # CLAHE(限制对比度自适应直方图均衡化) 图像增强方法,直方图均衡化概率 0.01 +] +train_pipeline = [ # 训练数据处理流程 + *pre_transform, # 引入前述定义的训练数据读取流程 + dict( + 
type='Mosaic', # Mosaic 数据增强方法 + img_scale=img_scale, # Mosaic 数据增强后的图像尺寸 + pad_val=114.0, # 空区域填充像素值 + pre_transform=pre_transform), # 之前创建的 pre_transform 训练数据读取流程 + dict( + type='YOLOv5RandomAffine', # YOLOv5 的随机仿射变换 + max_rotate_degree=0.0, # 最大旋转角度 + max_shear_degree=0.0, # 最大错切角度 + scaling_ratio_range=(0.5, 1.5), # 图像缩放系数的范围 + border=(-img_scale[0] // 2, -img_scale[1] // 2), # 从输入图像的高度和宽度两侧调整输出形状的距离 + border_val=(114, 114, 114)), # 边界区域填充像素值 + dict( + type='mmdet.Albu', # mmdet 中的 Albumentation 数据增强 + transforms=albu_train_transforms, # 之前创建的 albu_train_transforms 数据增强流程 + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes' + }), + dict(type='YOLOv5HSVRandomAug'), # HSV通道随机增强 + dict(type='mmdet.RandomFlip', prob=0.5), # 随机翻转,翻转概率 0.5 + dict( + type='mmdet.PackDetInputs', # 将数据转换为检测器输入格式的流程 + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] +train_dataloader = dict( # 训练 dataloader 配置 + batch_size=train_batch_size_per_gpu, # 训练时单个 GPU 的 Batch size + num_workers=train_num_workers, # 训练时单个 GPU 分配的数据加载线程数 + persistent_workers=True, # 如果设置为 True,dataloader 在迭代完一轮之后不会关闭数据读取的子进程,可以加速训练 + pin_memory=True, # 开启锁页内存,节省 CPU 内存拷贝时间 + sampler=dict( # 训练数据的采样器 + type='DefaultSampler', # 默认的采样器,同时支持分布式和非分布式训练。请参考 https://github.com/open-mmlab/mmengine/blob/main/mmengine/dataset/sampler.py + shuffle=True), # 随机打乱每个轮次训练数据的顺序 + dataset=dict( # 训练数据集的配置 + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_train2017.json', # 标注文件路径 + data_prefix=dict(img='train2017/'), # 图像路径前缀 + filter_cfg=dict(filter_empty_gt=False, min_size=32), # 图像和标注的过滤配置 + pipeline=train_pipeline)) # 这是由之前创建的 train_pipeline 定义的数据处理流程 +``` + +YOLOv5 测试阶段采用 [Letter Resize](https://github.com/open-mmlab/mmyolo/blob/main/mmyolo/datasets/transforms/transforms.py#L116) 的方法来将所有的测试图像统一到相同尺度,进而有效保留了图像的长宽比。因此我们在验证和评测时,都采用相同的数据流进行推理。 + +```python +test_pipeline = [ # 测试数据处理流程 + dict( + type='LoadImageFromFile'), # 第 1 个流程,从文件路径里加载图像 + dict(type='YOLOv5KeepRatioResize', # 第 2 个流程,保持长宽比的图像大小缩放 + scale=img_scale), # 图像缩放的目标尺寸 + dict( + type='LetterResize', # 第 3 个流程,满足多种步幅要求的图像大小缩放 + scale=img_scale, # 图像缩放的目标尺寸 + allow_scale_up=False, # 当 ratio > 1 时,是否允许放大图像, + pad_val=dict(img=114)), # 空区域填充像素值 + dict(type='LoadAnnotations', with_bbox=True), # 第 4 个流程,对于当前图像,加载它的注释信息 + dict( + type='mmdet.PackDetInputs', # 将数据转换为检测器输入格式的流程 + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] + +val_dataloader = dict( + batch_size=val_batch_size_per_gpu, # 验证时单个 GPU 的 Batch size + num_workers=val_num_workers, # 验证时单个 GPU 分配的数据加载线程数 + persistent_workers=True, # 如果设置为 True,dataloader 在迭代完一轮之后不会关闭数据读取的子进程,可以加速训练 + pin_memory=True, # 开启锁页内存,节省 CPU 内存拷贝时间 + drop_last=False, # 是否丢弃最后未能组成一个批次的数据 + sampler=dict( + type='DefaultSampler', # 默认的采样器,同时支持分布式和非分布式训练 + shuffle=False), # 验证和测试时不打乱数据顺序 + dataset=dict( + type=dataset_type, + data_root=data_root, + test_mode=True, # 开启测试模式,避免数据集过滤图像和标注 + data_prefix=dict(img='val2017/'), # 图像路径前缀 + ann_file='annotations/instances_val2017.json', # 标注文件路径 + pipeline=test_pipeline, # 这是由之前创建的 test_pipeline 定义的数据处理流程 + batch_shapes_cfg=dict( # batch shapes 配置 + type='BatchShapePolicy', # 确保在 batch 推理过程中同一个 batch 内的图像 pad 像素最少,不要求整个验证过程中所有 batch 的图像尺度一样 + batch_size=val_batch_size_per_gpu, # batch shapes 策略的 batch size,等于验证时单个 GPU 的 Batch size + img_size=img_scale[0], # 图像的尺寸 + size_divisor=32, # padding 
后的图像的大小应该可以被 pad_size_divisor 整除 + extra_pad_ratio=0.5))) # 额外需要 pad 的像素比例 + +test_dataloader = val_dataloader +``` + +[评测器](https://mmengine.readthedocs.io/zh_CN/latest/tutorials/evaluation.html) 用于计算训练模型在验证和测试数据集上的指标。评测器的配置由一个或一组评价指标(Metric)配置组成: + +```python +val_evaluator = dict( # 验证过程使用的评测器 + type='mmdet.CocoMetric', # 用于评估检测的 AR、AP 和 mAP 的 coco 评价指标 + proposal_nums=(100, 1, 10), # 用于评估检测任务时,选取的Proposal数量 + ann_file=data_root + 'annotations/instances_val2017.json', # 标注文件路径 + metric='bbox', # 需要计算的评价指标,`bbox` 用于检测 +) +test_evaluator = val_evaluator # 测试过程使用的评测器 +``` + +由于测试数据集没有标注文件,因此 MMYOLO 中的 `test_dataloader` 和 `test_evaluator` 配置通常等于 `val`。 如果要保存在测试数据集上的检测结果,则可以像这样编写配置: + +```python +# 在测试集上推理, +# 并将检测结果转换格式以用于提交结果 +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'annotations/image_info_test-dev2017.json', + data_prefix=dict(img='test2017/'), + test_mode=True, + pipeline=test_pipeline)) +test_evaluator = dict( + type='mmdet.CocoMetric', + ann_file=data_root + 'annotations/image_info_test-dev2017.json', + metric='bbox', + format_only=True, # 只将模型输出转换为coco的 JSON 格式并保存 + outfile_prefix='./work_dirs/coco_detection/test') # 要保存的 JSON 文件的前缀 +``` + +### 训练和测试的配置 + +MMEngine 的 Runner 使用 Loop 来控制训练,验证和测试过程。 +用户可以使用这些字段设置最大训练轮次和验证间隔。 + +```python +max_epochs = 300 # 最大训练轮次 300 轮 +save_epoch_intervals = 10 # 验证间隔,每 10 轮验证一次 + +train_cfg = dict( + type='EpochBasedTrainLoop', # 训练循环的类型,请参考 https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/loops.py + max_epochs=max_epochs, # 最大训练轮次 300 轮 + val_interval=save_epoch_intervals) # 验证间隔,每 10 个 epoch 验证一次 +val_cfg = dict(type='ValLoop') # 验证循环的类型 +test_cfg = dict(type='TestLoop') # 测试循环的类型 +``` + +MMEngine 也支持动态评估间隔,例如你可以在前面 280 epoch 训练阶段中,每间隔 10 个 epoch 验证一次,到最后 20 epoch 训练中每隔 1 个 epoch 验证一次,则配置写法为: + +```python +max_epochs = 300 # 最大训练轮次 300 轮 +save_epoch_intervals = 10 # 验证间隔,每 10 轮验证一次 + +train_cfg = dict( + type='EpochBasedTrainLoop', # 训练循环的类型,请参考 https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/loops.py + max_epochs=max_epochs, # 最大训练轮次 300 轮 + val_interval=save_epoch_intervals, # 验证间隔,每 10 个 epoch 验证一次 + dynamic_intervals=[(280, 1)]) # 到 280 epoch 开始切换为间隔 1 的评估方式 +val_cfg = dict(type='ValLoop') # 验证循环的类型 +test_cfg = dict(type='TestLoop') # 测试循环的类型 +``` + +### 优化相关配置 + +`optim_wrapper` 是配置优化相关设置的字段。优化器封装(OptimWrapper)不仅提供了优化器的功能,还支持梯度裁剪、混合精度训练等功能。更多内容请看[优化器封装教程](https://mmengine.readthedocs.io/zh_CN/latest/tutorials/optim_wrapper.html). 
+ +```python +optim_wrapper = dict( # 优化器封装的配置 + type='OptimWrapper', # 优化器封装的类型。可以切换至 AmpOptimWrapper 来启用混合精度训练 + optimizer=dict( # 优化器配置。支持 PyTorch 的各种优化器。请参考 https://pytorch.org/docs/stable/optim.html#algorithms + type='SGD', # 随机梯度下降优化器 + lr=0.01, # 基础学习率 + momentum=0.937, # 带动量的随机梯度下降 + weight_decay=0.0005, # 权重衰减 + nesterov=True, # 开启Nesterov momentum,公式详见 http://www.cs.toronto.edu/~hinton/absps/momentum.pdf + batch_size_per_gpu=train_batch_size_per_gpu), # 该选项实现了自动权重衰减系数缩放 + clip_grad=None, # 梯度裁剪的配置,设置为 None 关闭梯度裁剪。使用方法请见 https://mmengine.readthedocs.io/zh_CN/latest/tutorials/optim_wrapper.html + constructor='YOLOv5OptimizerConstructor') # YOLOv5 优化器构建器 + +``` + +`param_scheduler` 字段用于配置参数调度器(Parameter Scheduler)来调整优化器的超参数(例如学习率和动量)。 用户可以组合多个调度器来创建所需的参数调整策略。 在[参数调度器教程](https://mmengine.readthedocs.io/zh_CN/latest/tutorials/param_scheduler.html) 和参数调度器 API 文档 中查找更多信息。在 YOLOv5 中,参数调度实现比较复杂,难以通过 `param_scheduler` 实现。所以我们采用了 `YOLOv5ParamSchedulerHook` 来实现(见下节),这样做更简单但是通用性较差。 + +```python +param_scheduler = None +``` + +### 钩子配置 + +用户可以在训练、验证和测试循环上添加钩子,以便在运行期间插入一些操作。配置中有两种不同的钩子字段,一种是 `default_hooks`,另一种是 `custom_hooks`。 + +`default_hooks` 是一个字典,用于配置运行时必须使用的钩子。这些钩子具有默认优先级,如果未设置,runner 将使用默认值。如果要禁用默认钩子,用户可以将其配置设置为 `None`。 + +```python +default_hooks = dict( + param_scheduler=dict( + type='YOLOv5ParamSchedulerHook', # MMYOLO 中默认采用 Hook 方式进行优化器超参数的调节 + scheduler_type='linear', + lr_factor=0.01, + max_epochs=max_epochs), + checkpoint=dict( + type='CheckpointHook', # 按照给定间隔保存模型的权重的 Hook + interval=save_epoch_intervals, # 每 10 轮保存 1 次权重文件 + max_keep_ckpts=3)) # 最多保存 3 个权重文件 +``` + +`custom_hooks` 是一个列表。用户可以在这个字段中加入自定义的钩子,例如 `EMAHook`。 + +```python +custom_hooks = [ + dict( + type='EMAHook', # 实现权重 EMA(指数移动平均) 更新的 Hook + ema_type='ExpMomentumEMA', # YOLO 中使用的带动量 EMA + momentum=0.0001, # EMA 的动量参数 + update_buffers=True, # 是否计算模型的参数和缓冲的 running averages + priority=49) # 优先级略高于 NORMAL(50) +] +``` + +### 运行相关配置 + +```python +default_scope = 'mmyolo' # 默认的注册器域名,默认从此注册器域中寻找模块。请参考 https://mmengine.readthedocs.io/zh_CN/latest/tutorials/registry.html + +env_cfg = dict( + cudnn_benchmark=True, # 是否启用 cudnn benchmark, 推荐单尺度训练时开启,可加速训练 + mp_cfg=dict( # 多进程设置 + mp_start_method='fork', # 使用 fork 来启动多进程。‘fork’ 通常比 ‘spawn’ 更快,但可能存在隐患。请参考 https://github.com/pytorch/pytorch/issues/1355 + opencv_num_threads=0), # 关闭 opencv 的多线程以避免系统超负荷 + dist_cfg=dict(backend='nccl'), # 分布式相关设置 +) + +vis_backends = [dict(type='LocalVisBackend')] # 可视化后端,请参考 https://mmengine.readthedocs.io/zh_CN/latest/advanced_tutorials/visualization.html +visualizer = dict( + type='mmdet.DetLocalVisualizer', vis_backends=vis_backends, name='visualizer') +log_processor = dict( + type='LogProcessor', # 日志处理器用于处理运行时日志 + window_size=50, # 日志数值的平滑窗口 + by_epoch=True) # 是否使用 epoch 格式的日志。需要与训练循环的类型保存一致。 + +log_level = 'INFO' # 日志等级 +load_from = None # 从给定路径加载模型检查点作为预训练模型。这不会恢复训练。 +resume = False # 是否从 `load_from` 中定义的检查点恢复。 如果 `load_from` 为 None,它将恢复 `work_dir` 中的最新检查点。 +``` + +## 配置文件继承 + +在 `config/_base_` 文件夹目前有运行时的默认设置(default runtime)。由 `_base_` 下的组件组成的配置,被我们称为 _原始配置(primitive)_。 + +对于同一文件夹下的所有配置,推荐**只有一个**对应的**原始配置**文件。所有其他的配置文件都应该继承自这个**原始配置**文件。这样就能保证配置文件的最大继承深度为 3。 + +为了便于理解,我们建议贡献者继承现有方法。例如,如果在 YOLOv5s 的基础上做了一些修改,比如修改网络深度,用户首先可以通过指定 `_base_ = ./yolov5_s-v61_syncbn_8xb16-300e_coco.py` 来集成基础的 YOLOv5 结构,然后修改配置文件中的必要参数以完成继承。 + +如果你在构建一个与任何现有方法不共享结构的全新方法,那么可以在 `configs` 文件夹下创建一个新的例如 `yolov100` 文件夹。 + +更多细节请参考 [MMEngine 配置文件教程](https://mmengine.readthedocs.io/zh_CN/latest/tutorials/config.html)。 + +通过设置 `_base_` 字段,我们可以设置当前配置文件继承自哪些文件。 + +当 `_base_` 
为文件路径字符串时,表示继承一个配置文件的内容。 + +```python +_base_ = '../_base_/default_runtime.py' +``` + +当 `_base_` 是多个文件路径的列表时,表示继承多个文件。 + +```python +_base_ = [ + './yolov5_s-v61_syncbn_8xb16-300e_coco.py', + '../_base_/default_runtime.py' +] +``` + +如果需要检查配置文件,可以通过运行 `mim run mmdet print_config /PATH/TO/CONFIG` 来查看完整的配置。 + +### 忽略基础配置文件里的部分内容 + +有时,您也许会设置 `_delete_=True` 去忽略基础配置文件里的一些域内容。 您也许可以参照 [MMEngine 配置文件教程](https://mmengine.readthedocs.io/zh_CN/latest/tutorials/config.html) 来获得一些简单的指导。 + +在 MMYOLO 里,例如为了改变 RTMDet 的主干网络的某些内容: + +```python +model = dict( + type='YOLODetector', + data_preprocessor=dict(...), + backbone=dict( + type='CSPNeXt', + arch='P5', + expand_ratio=0.5, + deepen_factor=deepen_factor, + widen_factor=widen_factor, + channel_attention=True, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='SiLU', inplace=True)), + neck=dict(...), + bbox_head=dict(...)) +``` + +如果想把 RTMDet 主干网络的 `CSPNeXt` 改成 `YOLOv6EfficientRep`,因为 `CSPNeXt` 和 `YOLOv6EfficientRep` 中有不同的字段(`channel_attention` 和 `expand_ratio`),这时候就需要使用 `_delete_=True` 将新的键去替换 `backbone` 域内所有老的键。 + +```python +_base_ = '../rtmdet/rtmdet_l_syncbn_8xb32-300e_coco.py' +model = dict( + backbone=dict( + _delete_=True, + type='YOLOv6EfficientRep', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='ReLU', inplace=True)), + neck=dict(...), + bbox_head=dict(...)) +``` + +### 使用配置文件里的中间变量 + +配置文件里会使用一些中间变量,例如数据集里的 `train_pipeline`/`test_pipeline`。我们在定义新的 `train_pipeline`/`test_pipeline` 之后,需要将它们传递到 `data` 里。例如,我们想在训练或测试时,改变 YOLOv5 网络的 `img_scale` 训练尺度并在训练时添加 `YOLOv5MixUp` 数据增强,`img_scale/train_pipeline/test_pipeline` 是我们想要修改的中间变量。 + +**注**:使用 `YOLOv5MixUp` 数据增强时,需要将 `YOLOv5MixUp` 之前的训练数据处理流程定义在其 `pre_transform` 中。详细过程和图解可参见 [YOLOv5 原理和实现全解析](../recommended_topics/algorithm_descriptions/yolov5_description.md)。 + +```python +_base_ = './yolov5_s-v61_syncbn_8xb16-300e_coco.py' + +img_scale = (1280, 1280) # 高度,宽度 +affine_scale = 0.9 # 仿射变换尺度 + +mosaic_affine_pipeline = [ + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)) +] + +train_pipeline = [ + *pre_transform, *mosaic_affine_pipeline, + dict( + type='YOLOv5MixUp', # YOLOv5 的 MixUp (图像混合) 数据增强 + prob=0.1, # MixUp 概率 + pre_transform=[*pre_transform,*mosaic_affine_pipeline]), # MixUp 之前的训练数据处理流程,包含 数据预处理流程、 'Mosaic' 和 'YOLOv5RandomAffine' + dict( + type='mmdet.Albu', + transforms=albu_train_transforms, + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes' + }), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +test_pipeline = [ + dict( + type='LoadImageFromFile'), + dict(type='YOLOv5KeepRatioResize', scale=img_scale), + dict( + type='LetterResize', + scale=img_scale, + allow_scale_up=False, + pad_val=dict(img=114)), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) 
+val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +``` + +我们首先定义新的 `train_pipeline`/`test_pipeline` 然后传递到 `data` 里。 + +同样的,如果我们想从 `SyncBN` 切换到 `BN` 或者 `MMSyncBN`,我们需要修改配置文件里的每一个 `norm_cfg`。 + +```python +_base_ = './yolov5_s-v61_syncbn_8xb16-300e_coco.py' +norm_cfg = dict(type='BN', requires_grad=True) +model = dict( + backbone=dict(norm_cfg=norm_cfg), + neck=dict(norm_cfg=norm_cfg), + ...) +``` + +### 复用 \_base\_ 文件中的变量 + +如果用户希望在当前配置中复用 `_base_` 文件中的变量,则可以通过使用 `{{_base_.xxx}}` 的方式来获取对应变量的拷贝。而在新版 MMEngine 中,还支持省略 `{{}}` 的写法。例如: + +```python +_base_ = '../_base_/default_runtime.py' + +pre_transform = _base_.pre_transform # 变量 pre_transform 等于 _base_ 中定义的 pre_transform +``` + +## 通过脚本参数修改配置 + +当运行 `tools/train.py` 和 `tools/test.py` 时,可以通过 `--cfg-options` 来修改配置文件。 + +- 更新字典链中的配置 + + 可以按照原始配置文件中的 dict 键顺序地指定配置预选项。例如,使用 `--cfg-options model.backbone.norm_eval=False` 将模型主干网络中的所有 BN 模块都改为 `train` 模式。 + +- 更新配置列表中的键 + + 在配置文件里,一些字典型的配置被包含在列表中。例如,数据训练流程 `data.train.pipeline` 通常是一个列表,比如 `[dict(type='LoadImageFromFile'), ...]`。如果需要将 `'LoadImageFromFile'` 改成 `'LoadImageFromNDArray'`,需要写成下述形式:`--cfg-options data.train.pipeline.0.type=LoadImageFromNDArray`. + +- 更新列表或元组的值 + + 如果要更新的值是列表或元组。例如,配置文件通常设置 `model.data_preprocessor.mean=[123.675, 116.28, 103.53]`。如果需要改变这个键,可以通过 `--cfg-options model.data_preprocessor.mean="[127,127,127]"` 来重新设置。需要注意,引号 `"` 是支持列表或元组数据类型所必需的,并且在指定值的引号内**不允许**有空格。 + +## 配置文件名称风格 + +我们遵循以下样式来命名配置文件。建议贡献者遵循相同的风格。 + +``` +{algorithm name}_{model component names [component1]_[component2]_[...]}-[version id]_[norm setting]_[data preprocessor type]_{training settings}_{training dataset information}_[testing dataset information].py +``` + +文件名分为 8 个部分,其中 4 个必填部分、4 个可选部分。 每个部分用 `_` 连接,每个部分内的单词应该用 `-` 连接。`{}` 表示必填部分,`[]` 表示选填部分。 + +- `{algorithm name}`:算法的名称。 它可以是检测器名称,例如 `yolov5`, `yolov6`, `yolox` 等。 +- `{component names}`:算法中使用的组件名称,如 backbone、neck 等。例如 yolov5_s代表其深度缩放因子`deepen_factor=0.33` 以及其宽度缩放因子 `widen_factor=0.5`。 +- `[version_id]` (可选):由于 YOLO 系列算法迭代速度远快于传统目标检测算法,因此采用 `version id` 来区分不同子版本之间的差异。例如 YOLOv5 的 3.0 版本采用 `Focus` 层作为第一个下采样层,而 6.0 以后的版本采用 `Conv` 层作为第一个下采样层。 +- `[norm_setting]` (可选):`bn` 表示 `Batch Normalization`, `syncbn` 表示 `Synchronized Batch Normalization`。 +- `[data preprocessor type]` (可选):`fast` 表示调用 [YOLOv5DetDataPreprocessor](https://github.com/open-mmlab/mmyolo/blob/main/mmyolo/models/data_preprocessors/data_preprocessor.py#L9) 并配合 [yolov5_collate](https://github.com/open-mmlab/mmyolo/blob/main/mmyolo/datasets/utils.py#L12) 进行数据预处理,训练速度比默认的 `mmdet.DetDataPreprocessor` 更快,但是对多任务处理的灵活性较低。 +- `{training settings}`:训练设置的信息,例如 batch 大小、数据增强、损失、参数调度方式和训练最大轮次/迭代。 例如:`8xb16-300e_coco` 表示使用 8 个 GPU 每个 GPU 16 张图,并训练 300 个 epoch。 + 缩写介绍: + - `{gpu x batch_per_gpu}`:GPU 数和每个 GPU 的样本数。例如 `4x4b` 是 4 个 GPU 每个 GPU 4 张图的缩写。 + - `{schedule}`:训练方案,MMYOLO 中默认为 300 个 epoch。 +- `{training dataset information}`:训练数据集,例如 `coco`, `cityscapes`, `voc-0712`, `wider-face`, `balloon`。 +- `[testing dataset information]` (可选):测试数据集,用于训练和测试在不同数据集上的模型配置。 如果没有注明,则表示训练和测试的数据集类型相同。 diff --git a/third_party/mmyolo/docs/zh_cn/tutorials/custom_installation.md b/third_party/mmyolo/docs/zh_cn/tutorials/custom_installation.md new file mode 100644 index 0000000000000000000000000000000000000000..d20d659f6b3847f51b034aa7b4f295ef7502c1c0 --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/tutorials/custom_installation.md @@ -0,0 +1,111 @@ +# 自定义安装 + +## CUDA 版本 + +在安装 PyTorch 时,你需要指定 CUDA 的版本。如果你不清楚应该选择哪一个,请遵循我们的建议。 + +- 对于 Ampere 架构的 
NVIDIA GPU,例如 GeForce 30 系列 以及 NVIDIA A100,CUDA 11 是必需的。 +- 对于更早的 NVIDIA GPU,CUDA 11 是向后兼容 (backward compatible) 的,但 CUDA 10.2 能够提供更好的兼容性,也更加轻量。 + +请确保你的 GPU 驱动版本满足最低的版本需求,参阅 NVIDIA 官方的 [CUDA 工具箱和相应的驱动版本关系表](https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cuda-major-component-versions__table-cuda-toolkit-driver-versions)。 + +```{note} +如果按照我们的最佳实践进行安装,CUDA 运行时库就足够了,因为我们提供相关 CUDA 代码的预编译,不需要进行本地编译。 +但如果你希望从源码进行 MMCV 的编译,或是进行其他 CUDA 算子的开发,那么就必须安装完整的 CUDA 工具链,参见 +[NVIDIA 官网](https://developer.nvidia.com/cuda-downloads) ,另外还需要确保该 CUDA 工具链的版本与 PyTorch 安装时 +的配置相匹配(如用 `conda install` 安装 PyTorch 时指定的 cudatoolkit 版本)。 +``` + +## 不使用 MIM 安装 MMEngine + +要使用 pip 而不是 MIM 来安装 MMEngine,请遵照 [MMEngine 安装指南](https://mmengine.readthedocs.io/en/latest/get_started/installation.html)。 + +例如,你可以通过以下命令安装 MMEngine: + +```shell +pip install "mmengine>=0.6.0" +``` + +## 不使用 MIM 安装 MMCV + +MMCV 包含 C++ 和 CUDA 扩展,因此其对 PyTorch 的依赖比较复杂。MIM 会自动解析这些 依赖,选择合适的 MMCV 预编译包,使安装更简单,但它并不是必需的。 + +要使用 pip 而不是 MIM 来安装 MMCV,请遵照 [MMCV 安装指南](https://mmcv.readthedocs.io/zh_CN/2.x/get_started/installation.html)。 +它需要您用指定 URL 的形式手动指定对应的 PyTorch 和 CUDA 版本。 + +例如,下述命令将会安装基于 PyTorch 1.12.x 和 CUDA 11.6 编译的 mmcv: + +```shell +pip install "mmcv>=2.0.0rc4" -f https://download.openmmlab.com/mmcv/dist/cu116/torch1.12.0/index.html +``` + +## 在 CPU 环境中安装 + +我们的代码能够建立在只使用 CPU 的环境(CUDA 不可用)。 + +在 CPU 模式下,可以进行模型训练(需要 MMCV 版本 >= `2.0.0rc1`)、测试或者推理,然而以下功能将在 CPU 模式下不能使用: + +- Deformable Convolution +- Modulated Deformable Convolution +- ROI pooling +- Deformable ROI pooling +- CARAFE: Content-Aware ReAssembly of FEatures +- SyncBatchNorm +- CrissCrossAttention: Criss-Cross Attention +- MaskedConv2d +- Temporal Interlace Shift +- nms_cuda +- sigmoid_focal_loss_cuda +- bbox_overlaps + +因此,如果尝试使用包含上述操作的模型进行训练/测试/推理,将会报错。下表列出了由于依赖上述算子而无法在 CPU 上运行的相关模型: + +| 操作 | 模型 | +| :-----------------------------------------------------: | :--------------------------------------------------------------------------------------: | +| Deformable Convolution/Modulated Deformable Convolution | DCN、Guided Anchoring、RepPoints、CentripetalNet、VFNet、CascadeRPN、NAS-FCOS、DetectoRS | +| MaskedConv2d | Guided Anchoring | +| CARAFE | CARAFE | +| SyncBatchNorm | ResNeSt | + +## 在 Google Colab 中安装 + +[Google Colab](https://colab.research.google.com/) 通常已经包含了 PyTorch 环境,因此我们只需要安装 MMEngine、MMCV、MMDetection 和 MMYOLO 即可,命令如下: + +**步骤 1.** 使用 [MIM](https://github.com/open-mmlab/mim) 安装 [MMEngine](https://github.com/open-mmlab/mmengine) 、 [MMCV](https://github.com/open-mmlab/mmcv) 和 [MMDetection](https://github.com/open-mmlab/mmdetection) 。 + +```shell +!pip3 install openmim +!mim install "mmengine>=0.6.0" +!mim install "mmcv>=2.0.0rc4,<2.1.0" +!mim install "mmdet>=3.0.0,<4.0.0" +``` + +**步骤 2.** 使用源码安装 MMYOLO: + +```shell +!git clone https://github.com/open-mmlab/mmyolo.git +%cd mmyolo +!pip install -e . 
+``` + +**步骤 3.** 验证安装是否成功: + +```python +import mmyolo +print(mmyolo.__version__) +# 预期输出: 0.1.0 或其他版本号 +``` + +```{note} +在 Jupyter 中,感叹号 `!` 用于执行外部命令,而 `%cd` 是一个[魔术命令](https://ipython.readthedocs.io/en/stable/interactive/magics.html#magic-cd),用于切换 Python 的工作路径。 +``` + +## 使用多个 MMYOLO 版本进行开发 + +训练和测试的脚本已经在 `PYTHONPATH` 中进行了修改,以确保脚本使用当前目录中的 MMYOLO。 + +要使环境中安装默认的 MMYOLO 而不是当前正在在使用的,可以删除出现在相关脚本中的如下代码: + +```shell +PYTHONPATH="$(dirname $0)/..":$PYTHONPATH +``` diff --git a/third_party/mmyolo/docs/zh_cn/tutorials/data_flow.md b/third_party/mmyolo/docs/zh_cn/tutorials/data_flow.md new file mode 100644 index 0000000000000000000000000000000000000000..804004dea2ee07ed462dde0fc04b75f814791528 --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/tutorials/data_flow.md @@ -0,0 +1,121 @@ +# 混合类图片数据增强更新 + +混合类图片数据增强是指类似 Mosaic 和 MixUp 一样,在运行过程中需要获取多张图片的标注信息进行融合。 在 OpenMMLab 数据增强 pipeline 中一般是获取不到数据集其他索引的。 为了实现上述功能,在 MMDetection 复现的 YOLOX 中提出了 [MultiImageMixDataset](https://github.com/open-mmlab/mmdetection/blob/master/mmdet/datasets/dataset_wrappers.py#L338) 数据集包装器的概念。 + +`MultiImageMixDataset` 数据集包装器会传入一个包括 `Mosaic` 和 `RandAffine` 等数据增强,而 `CocoDataset` 中也需要传入一个包括图片和标注加载的 `pipeline` 。通过这种方式就可以快速的实现混合类数据增强。其配置用法如下所示: + +```python +train_pipeline = [ + dict(type='Mosaic', img_scale=img_scale, pad_val=114.0), + dict( + type='RandomAffine', + scaling_ratio_range=(0.1, 2), + border=(-img_scale[0] // 2, -img_scale[1] // 2)), + dict( + type='MixUp', + img_scale=img_scale, + ratio_range=(0.8, 1.6), + pad_val=114.0), + ... +] +train_dataset = dict( + # use MultiImageMixDataset wrapper to support mosaic and mixup + type='MultiImageMixDataset', + dataset=dict( + type='CocoDataset', + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True) + ]), + pipeline=train_pipeline) + +``` + +但是上述实现起来会有一个缺点:对于不熟悉 MMDetection 的用户来说,其经常会忘记 Mosaic 必须要和 `MultiImageMixDataset` 配合使用,而且这样会加大复杂度和理解难度。 + +为了解决这个问题,在 MMYOLO 中进一步进行了简化。直接让 `pipeline` 获取到 `dataset` 对象,此时就可以将 `Mosaic` 等混合类数据增强的实现和使用随机翻转的操作一样,不再需要数据集包装器。新的配置写法为: + +```python +pre_transform = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True) +] +train_pipeline = [ + *pre_transform, + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='mmdet.RandomAffine', + scaling_ratio_range=(0.1, 2), + border=(-img_scale[0] // 2, -img_scale[1] // 2)), + dict( + type='YOLOXMixUp', + img_scale=img_scale, + ratio_range=(0.8, 1.6), + pad_val=114.0, + pre_transform=pre_transform), + ... 
+] +``` + +一个稍微复杂点的包括 MixUp 的 YOLOv5-m 配置如下所示: + +```python +mosaic_affine_pipeline = [ + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)) +] + +# enable mixup +train_pipeline = [ + *pre_transform, *mosaic_affine_pipeline, + dict( + type='YOLOv5MixUp', + prob=0.1, + pre_transform=[*pre_transform, *mosaic_affine_pipeline]), + dict( + type='mmdet.Albu', + transforms=albu_train_transforms, + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes' + }), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] +``` + +其实现过程非常简单,只需要在 Dataset 中将本身对象传给 pipeline 即可,具体代码如下: + +```python +def prepare_data(self, idx) -> Any: + """Pass the dataset to the pipeline during training to support mixed + data augmentation, such as Mosaic and MixUp.""" + if self.test_mode is False: + data_info = self.get_data_info(idx) + data_info['dataset'] = self + return self.pipeline(data_info) + else: + return super().prepare_data(idx) +``` diff --git a/third_party/mmyolo/docs/zh_cn/tutorials/faq.md b/third_party/mmyolo/docs/zh_cn/tutorials/faq.md new file mode 100644 index 0000000000000000000000000000000000000000..71ee01d47498717fa2bea21ae9da3f9548e0a73e --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/tutorials/faq.md @@ -0,0 +1,111 @@ +# 常见问题解答 + +我们在这里列出了使用时的一些常见问题及其相应的解决方案。 如果您发现有一些问题被遗漏,请随时提 PR 丰富这个列表。 如果您无法在此获得帮助,请创建 [issue](https://github.com/open-mmlab/mmyolo/issues/new/choose) 提问,但是请在模板中填写所有必填信息,这有助于我们更快定位问题。 + +## 为什么要推出 MMYOLO? + +为什么要推出 MMYOLO? 为何要单独开一个仓库而不是直接放到 MMDetection 中? 自从开源后,不断收到社区小伙伴们类似的疑问,答案可以归纳为以下三点: + +**(1) 统一运行和推理平台** + +目前目标检测领域出现了非常多 YOLO 的改进算法,并且非常受大家欢迎,但是这类算法基于不同框架不同后端实现,存在较大差异,缺少统一便捷的从训练到部署的公平评测流程。 + +**(2) 协议限制** + +众所周知,YOLOv5 以及其衍生的 YOLOv6 和 YOLOv7 等算法都是 GPL 3.0 协议,不同于 MMDetection 的 Apache 协议。由于协议问题,无法将 MMYOLO 直接并入 MMDetection 中。 + +**(3) 多任务支持** + +还有一层深远的原因: **MMYOLO 任务不局限于 MMDetection**,后续会支持更多任务例如基于 MMPose 实现关键点相关的应用,基于 MMTracking 实现追踪相关的应用,因此不太适合直接并入 MMDetection 中。 + +## projects 文件夹是用来干什么的? + +projects 文件夹是 OpenMMLab 2.0 中引入的一个全新文件夹。其初衷有如下 3 点: + +1. 便于社区贡献。由于 OpenMMLab 系列代码库对于代码合入有一套规范严谨的流程,这不可避免的会导致算法复现周期很长,不利于社区贡献 +2. 便于快速支持新算法。算法开发周期过长同样会导致用户无法尽快体验最新算法 +3. 便于快速支持新方向和新特性。新发展方向或者一些新的特性可能和现如今代码库中的设计有些不兼容,没法快速合入到代码库中 + +综上所述,projects 文件夹的引入主要是解决算法复现周期过长导致的新算法支持速度较慢,新特性支持较复杂等多个问题。 projects 中每个文件夹属于一个完全独立的工程,社区用户可以通过 +projects 快速支持一些在当前版本中较难支持或者想快速支持的新算法和新特性。等后续设计稳定或者代码符合合入规范,则会考虑合入到主分支中。 + +## YOLOv5 backbone 替换为 Swin 后效果很差 + +在 [轻松更换主干网络](../recommended_topics/replace_backbone.md) 一文中我们提供了大量替换 backbone 的教程,但是该文档只是教用户如何替换 backbone,直接训练不一定能得到比较优异的结果。原因是 +不同 backbone 所需要的训练超参是不一样的,以 Swin 和 YOLOv5 backbone 为例两者差异较大,Swin 属于 transformer 系列算法,而 YOLOv5 backbone 属于卷积系列算法,其训练的优化器、学习率以及其他超参差异较大。 +如果强行将 Swin 作为 YOLOv5 backbone 且想取得不错的效果,需要同时调整诸多参数。 + +## MM 系列开源库中有很多组件,如何在 MMYOLO 中使用? + +在 OpenMMLab 2.0 中对多个 MM 系列开源库之间的模块跨库调用功能进行增强。目前在 MMYOLO 中可以在配置文件中通过 `MM 算法库 A.模块名` 来之间调用 MM 算法库 A 中已经被注册的任意模块。 具体例子可以参考 +[轻松更换主干网络](../recommended_topics/replace_backbone.md) 中使用在 MMClassification 中实现的主干网络章节,其他模块调用也是相同的用法。 + +## MMYOLO 中是否可以加入纯背景图片进行训练? 
+ +将纯背景图片加入训练大部分情况可以抑制误报率,是否将纯背景图片加入训练功能已经大部分数据集上支持了。以 `YOLOv5CocoDataset` 为例,核心控制参数是 `train_dataloader.dataset.filter_cfg.filter_empty_gt`,如果 `filter_empty_gt` 为 True 表示将纯背景图片过滤掉不加入训练, +反之将纯背景图片加入到训练中。 目前 MMYOLO 中大部分算法都是默认将纯背景图片加入训练中。 + +## MMYOLO 是否有计算模型推理 FPS 脚本? + +MMYOLO 是基于 MMDet 3.x 来开发的,在 MMDet 3.x 中提供了计算模型推理 FPS 的脚本。 具体脚本为 [benchmark](https://github.com/open-mmlab/mmdetection/blob/3.x/tools/analysis_tools/benchmark.py)。我们推荐大家使用 mim 直接跨库启动 MMDet 中的脚本而不是直接复制到 MMYOLO 中。 +关于如果通过 mim 启动 MMDet 中脚本,可以查看 [使用 mim 跨库调用其他 OpenMMLab 仓库的脚本](../common_usage/mim_usage.md)。 + +## MMDeploy 和 EasyDeploy 有啥区别? + +MMDeploy 是由 OpenMMLab 中部署团队开发的针对 OpenMMLab 系列算法库提供部署支持的开源库,支持各种后端和自定义等等强大功能。 EasyDeploy 是由社区小伙伴提供的一个相比 MMDeploy 更加简单易用的部署 projects。 +EasyDeploy 支持的功能目前没有 MMDeploy 多,但是使用上更加简单。 MMYOLO 中同时提供对 MMDeploy 和 EasyDeploy 的支持,用户可以根据自己需求选择。 + +## COCOMetric 中如何查看每个类的 AP + +只需要在配置中设置 `test_evaluator.classwise` 为 True,或者在 test.py 运行时候增加 `--cfg-options test_evaluator.classwise=True` 即可。 + +## MMYOLO 中为何没有支持 MMDet 类似的自动学习率缩放功能? + +原因是实验发现 YOLO 系列算法不是非常满足线性缩放功能。在多个数据集上验证发现会出现不基于 batch size 自动学习率缩放效果好于缩放的情形。因此暂时 MMYOLO 还没有支持自动学习率缩放功能。 + +## 自己训练的模型权重尺寸为啥比官方发布的大? + +原因是用户自己训练的权重通常包括 `optimizer`、`ema_state_dict` 和 `message_hub` 等额外数据,这部分数据我们会在模型发布时候自动删掉,而用户直接基于框架跑的模型权重是全部保留的,所以用户自己训练的模型权重尺寸会比官方发布的大。 +你可以使用 [publish_model.py](https://github.com/open-mmlab/mmyolo/blob/main/tools/misc/publish_model.py) 脚本删掉额外字段。 + +## RTMDet 为何训练所占显存比 YOLOv5 多很多? + +训练显存较多的原因主要是 assigner 部分的差异。YOLOv5 采用的是非常简单且高效的 shape 匹配 assigner,而 RTMDet 中采用的是动态的全 batch 计算的 dynamic soft label assigner,其内部的 Cost 矩阵需要消耗比较多的显存,特别是当前 batch 中标注框过多时候。 +后续我们会考虑解决这个问题。 + +## 修改一些代码后是否需要重新安装 MMYOLO + +在不新增 py 代码情况下, 如果你遵循最佳实践,即使用 `mim install -v -e .` 安装的 MMYOLO,则对本地代码所作的任何修改都会生效,无需重新安装。但是如果你是新增了 py 文件然后在里面新增的代码,则依然需要重新安装即运行 `mim install -v -e .`。 + +## 如何使用多个 MMYOLO 版本进行开发 + +若你拥有多个 MMYOLO 工程文件夹,例如 mmyolo-v1, mmyolo-v2。 在使用不同版本 MMYOLO 时候,你可以在终端运行前设置 + +```shell +PYTHONPATH="$(dirname $0)/..":$PYTHONPATH +``` + +使得当前环境生效。如果要使用环境中安装默认的 MMYOLO 而不是当前正在在使用的,可以删除出现上述命令或者通过如下命令重置 + +```shell +unset PYTHONPATH +``` + +## 训练中保存最好模型 + +用户可以通过在配置中设置 `default_hooks.checkpoint.save_best` 参数来选择根据什么指标来筛选最优模型。以 `COCO` 数据集检测任务为例, +`default_hooks.checkpoint.save_best` 可以选择输入的参数有: + +1. `auto` 将会根据验证集中的第一个评价指标作为筛选条件。 +2. `coco/bbox_mAP` 将会根据 `bbox_mAP` 作为筛选条件。 +3. `coco/bbox_mAP_50` 将会根据 `bbox_mAP_50` 作为筛选条件。 +4. `coco/bbox_mAP_75` 将会根据 `bbox_mAP_75` 作为筛选条件。 +5. `coco/bbox_mAP_s` 将会根据 `bbox_mAP_s` 作为筛选条件。 +6. `coco/bbox_mAP_m` 将会根据 `bbox_mAP_m` 作为筛选条件。 +7. `coco/bbox_mAP_l` 将会根据 `bbox_mAP_l` 作为筛选条件。 + +此外用户还可以选择筛选的逻辑,通过设置配置中的 `default_hooks.checkpoint.rule` 来选择判断逻辑,如:`default_hooks.checkpoint.rule=greater` 表示指标越大越好。更详细的使用可以参考 [checkpoint_hook](https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/checkpoint_hook.py) 来修改 + +## 如何进行非正方形输入尺寸训练和测试? 
+ +在 YOLO 系列算法中默认配置基本上都是 640x640 或者 1280x1280 正方形尺度输入训练的。用户如果想进行非正方形尺度训练,你可以修改配置中 `image_scale` 参数,并将其他对应位置进行修改即可。用户可以参考我们提供的 [yolov5_s-v61_fast_1xb12-40e_608x352_cat.py](https://github.com/open-mmlab/mmyolo/tree/dev/configs/yolov5/yolov5_s-v61_fast_1xb12-40e_608x352_cat.py) 配置。 diff --git a/third_party/mmyolo/docs/zh_cn/tutorials/rotated_detection.md b/third_party/mmyolo/docs/zh_cn/tutorials/rotated_detection.md new file mode 100644 index 0000000000000000000000000000000000000000..1ee974b104c975e72c8ce6485245d44ed5c4c915 --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/tutorials/rotated_detection.md @@ -0,0 +1,329 @@ +# 旋转目标检测 + +旋转目标检测(Rotated Object Detection),又称为有向目标检测(Oriented Object Detection),试图在检测出目标位置的同时得到目标的方向信息。它通过重新定义目标表示形式,以及增加回归自由度数量的操作,实现旋转矩形、四边形甚至任意形状的目标检测。旋转目标检测在人脸识别、场景文字、遥感影像、自动驾驶、医学图像、机器人抓取等领域都有广泛应用。 + +关于旋转目标检测的详细介绍请参考文档 [MMRotate 基础知识](https://mmrotate.readthedocs.io/zh_CN/1.x/overview.html) + +MMYOLO 中的旋转目标检测依赖于 MMRotate 1.x,请参考文档 [开始你的第一步](https://mmrotate.readthedocs.io/zh_CN/1.x/get_started.html) 安装 MMRotate 1.x。 + +本教程将介绍如何在 MMYOLO 中训练和使用旋转目标检测模型,目前支持了 RTMDet-R。 + +## 数据集准备 + +对于旋转目标检测数据集,目前最常用的数据集是 DOTA 数据集,由于DOTA数据集中的图像分辨率较大,因此需要进行切片处理,数据集准备请参考 [Preparing DOTA Dataset](https://github.com/open-mmlab/mmyolo/tools/dataset_converters/dota_split). + +处理后的数据集结构如下: + +```none +mmyolo +├── data +│ ├── split_ss_dota +│ │ ├── trainval +│ │ │ ├── images +│ │ │ ├── annfiles +│ │ ├── test +│ │ │ ├── images +│ │ │ ├── annfiles +│ ├── split_ms_dota +│ │ ├── trainval +│ │ │ ├── images +│ │ │ ├── annfiles +│ │ ├── test +│ │ │ ├── images +│ │ │ ├── annfiles +``` + +其中 `split_ss_dota` 是单尺度切片,`split_ms_dota` 是多尺度切片,可以根据需要选择。 + +对于自定义数据集,我们建议将数据转换为 DOTA 格式并离线进行转换,如此您只需在数据转换后修改 config 的数据标注路径和类别即可。 + +为了方便使用,我们同样提供了基于 COCO 格式的旋转标注格式,将多边形检测框储存在 COCO 标注的 segmentation 标签中,示例如下: + +```json +{ + "id": 131, + "image_id": 72, + "bbox": [123, 167, 11, 37], + "area": 271.5, + "category_id": 1, + "segmentation": [[123, 167, 128, 204, 134, 201, 132, 167]], + "iscrowd": 0, +} +``` + +## 配置文件 + +这里以 RTMDet-R 为例介绍旋转目标检测的配置文件,其中大部分和水平检测模型相同,主要介绍它们的差异,包括数据集和评测器配置、检测头、可视化等。 + +得益于 MMEngine 的配置文件系统,大部分模块都可以调用 MMRotate 中的模块。 + +### 数据集和评测器配置 + +关于配置文件的基础请先阅读 [学习 YOLOV5 配置文件](./config.md). 
下面介绍旋转目标检测的一些必要设置。 + +```python +dataset_type = 'YOLOv5DOTADataset' # 数据集类型,这将被用来定义数据集 +data_root = 'data/split_ss_dota/' # 数据的根路径 + +angle_version = 'le90' # 角度范围的定义,目前支持 oc, le90 和 le135 + +train_pipeline = [ + # 训练数据读取流程 + dict( + type='LoadImageFromFile'), # 第 1 个流程,从文件路径里加载图像 + dict(type='LoadAnnotations', # 第 2 个流程,对于当前图像,加载它的注释信息 + with_bbox=True, # 是否使用标注框 (bounding box),目标检测需要设置为 True + box_type='qbox'), # 指定读取的标注格式,旋转框数据集默认的数据格式为四边形 + dict(type='mmrotate.ConvertBoxType', # 第 3 个流程,转换标注格式 + box_type_mapping=dict(gt_bboxes='rbox')), # 将四边形标注转化为旋转框标注 + + # 训练数据处理流程 + dict(type='mmdet.Resize', scale=(1024, 1024), keep_ratio=True), + dict(type='mmdet.RandomFlip', + prob=0.75, + direction=['horizontal', 'vertical', 'diagonal']), + dict(type='mmrotate.RandomRotate', # 旋转数据增强 + prob=0.5, # 旋转概率 0.5 + angle_range=180, # 旋转范围 180 + rotate_type='mmrotate.Rotate', # 旋转方法 + rect_obj_labels=[9, 11]), # 由于 DOTA 数据集中标号为 9 的 'storage-tank' 和标号 11 的 'roundabout' 两类为正方形标注,无需角度信息,旋转中将这两类保持为水平 + dict(type='mmdet.Pad', size=img_scale, pad_val=dict(img=(114, 114, 114))), + dict(type='RegularizeRotatedBox', # 统一旋转框表示形式 + angle_version=angle_version), # 根据角度的定义方式进行 + dict(type='mmdet.PackDetInputs') +] + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + persistent_workers=persistent_workers, + pin_memory=True, + collate_fn=dict(type='yolov5_collate'), + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( # 训练数据集的配置 + type=dataset_type, + data_root=data_root, + ann_file='trainval/annfiles/', # 标注文件夹路径 + data_prefix=dict(img_path='trainval/images/'), # 图像路径前缀 + img_shape=(1024, 1024), # 图像大小 + filter_cfg=dict(filter_empty_gt=True), # 标注的过滤配置 + pipeline=train_pipeline)) # 这是由之前创建的 train_pipeline 定义的数据处理流程 +``` + +RTMDet-R 保持论文内的配置,默认仅采用随机旋转增强,得益于 BoxType 设计,在数据增强阶段,大部分增强无需改动代码即可直接支持,例如 MixUp 和 Mosaic 等,可以直接在 pipeline 中使用。 + +```{Warning} +目前已知 Albu 数据增强仅支持水平框,在使用其他的数据增强时建议先使用 可视化数据集脚本 `browse_dataset.py` 验证数据增强是否正确。 +``` + +RTMDet-R 测试阶段仅采用 Resize 和 Pad,在验证和评测时,都采用相同的数据流进行推理。 + +```python +val_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='mmdet.Resize', scale=(1024, 1024), keep_ratio=True), + dict( + type='mmdet.Pad', size=(1024, 1024), + pad_val=dict(img=(114, 114, 114))), + # 和训练时一致,先读取标注再转换标注格式 + dict( + type='LoadAnnotations', + with_bbox=True, + box_type='qbox', + _scope_='mmdet'), + dict( + type='mmrotate.ConvertBoxType', + box_type_mapping=dict(gt_bboxes='rbox')), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +val_dataloader = dict( + batch_size=val_batch_size_per_gpu, + num_workers=val_num_workers, + persistent_workers=persistent_workers, + pin_memory=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='trainval/annfiles/', + data_prefix=dict(img_path='trainval/images/'), + img_shape=(1024, 1024), + test_mode=True, + batch_shapes_cfg=batch_shapes_cfg, + pipeline=val_pipeline)) +``` + +[评测器](https://mmengine.readthedocs.io/zh_CN/latest/tutorials/evaluation.html) 用于计算训练模型在验证和测试数据集上的指标。评测器的配置由一个或一组评价指标(Metric)配置组成: + +```python +val_evaluator = dict( # 验证过程使用的评测器 + type='mmrotate.DOTAMetric', # 用于评估旋转目标检测的 mAP 的 dota 评价指标 + metric='mAP' # 需要计算的评价指标 +) +test_evaluator = val_evaluator # 测试过程使用的评测器 +``` + +由于 DOTA 测试数据集没有标注文件, 如果要保存在测试数据集上的检测结果,则可以像这样编写配置: + +```python +# 在测试集上推理, +# 并将检测结果转换格式以用于提交结果 +test_dataloader = 
dict( + batch_size=val_batch_size_per_gpu, + num_workers=val_num_workers, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict(img_path='test/images/'), + img_shape=(1024, 1024), + test_mode=True, + batch_shapes_cfg=batch_shapes_cfg, + pipeline=test_pipeline)) +test_evaluator = dict( + type='mmrotate.DOTAMetric', + format_only=True, # 只将模型输出转换为 DOTA 的 txt 提交格式并压缩成 zip + merge_patches=True, # 将切片结果合并成大图检测结果 + outfile_prefix='./work_dirs/dota_detection/submission') # 输出测试文件夹的路径 +``` + +如果使用基于 COCO 格式的旋转框标注,只需要修改 pipeline 中数据读取流程和训练数据集的配置,以训练数据为例: + +```python + +dataset_type='YOLOv5CocoDataset' + +train_pipeline = [ + # 训练数据读取流程 + dict( + type='LoadImageFromFile'), # 第 1 个流程,从文件路径里加载图像 + dict(type='LoadAnnotations', # 第 2 个流程,对于当前图像,加载它的注释信息 + with_bbox=True, # 是否使用标注框 (bounding box),目标检测需要设置为 True + with_mask=True, # 读取储存在 segmentation 标注中的多边形标注 + poly2mask=False) # 不执行 poly2mask,后续会将 poly 转化成检测框 + dict(type='ConvertMask2BoxType', # 第 3 个流程,将 mask 标注转化为 boxtype + box_type='rbox'), # 目标类型是 rbox 旋转框 + # 剩余的其他 pipeline + ... +] + +metainfo = dict( # DOTA 数据集的 metainfo + classes=('plane', 'baseball-diamond', 'bridge', 'ground-track-field', + 'small-vehicle', 'large-vehicle', 'ship', 'tennis-court', + 'basketball-court', 'storage-tank', 'soccer-ball-field', + 'roundabout', 'harbor', 'swimming-pool', 'helicopter')) + +train_dataloader = dict( + dataset=dict( # 训练数据集的配置 + type=dataset_type, + metainfo=metainfo, + data_root=data_root, + ann_file='train/train.json', # 标注文件路径 + data_prefix=dict(img='train/images/'), # 图像路径前缀 + filter_cfg=dict(filter_empty_gt=True), # 标注的过滤配置 + pipeline=train_pipeline), # 数据处理流程 +) +``` + +### 模型配置 + +对于旋转目标检测器,在模型配置中 backbone 和 neck 的配置和其他模型是一致的,主要差异在检测头上。目前仅支持 RTMDet-R 旋转目标检测,下面介绍新增的参数: + +1. `angle_version` 角度范围,用于在训练时限制角度的范围,可选的角度范围有 `le90`, `le135` 和 `oc`。 + +2. `angle_coder` 角度编码器,和 bbox coder 类似,用于编码和解码角度。 + + 默认使用的角度编码器是 `PseudoAngleCoder`,即”伪角度编码器“,并不进行编解码,直接回归角度参数。这样设计的目标是能更好的自定义角度编码方式,而无需重写代码,例如 CSL,DCL,PSC 等方法。 + +3. `use_hbbox_loss` 是否使用水平框 loss。考虑到部分角度编码解码过程不可导,直接使用旋转框的损失函数无法学习角度,因此引入该参数用于将框和角度分开训练。 + +4. `loss_angle` 角度损失函数。在设定`use_hbbox_loss=True` 时必须设定,而使用旋转框损失时可选,此时可以作为回归损失的辅助。 + +通过组合 `use_hbbox_loss` 和 `loss_angle` 可以控制旋转框训练时的回归损失计算方式,共有三种组合方式: + +- `use_hbbox_loss=False` 且 `loss_angle` 为 None. + + 此时框预测和角度预测进行合并,直接对旋转框预测进行回归,此时 `loss_bbox` 应当设定为旋转框损失,例如 `RotatedIoULoss`。 + 这种方案和水平检测模型的回归方式基本一致,只是多了额外的角度编解码过程。 + + ``` + bbox_pred────(tblr)───┐ + ▼ + angle_pred decode──►rbox_pred──(xywha)─►loss_bbox + │ ▲ + └────►decode──(a)─┘ + ``` + +- `use_hbbox_loss=False`,同时设定 `loss_angle`. + + 此时会增加额外的角度回归和分类损失,具体的角度损失类型需要根据角度编码器 `angle_code` 进行选择。 + + ``` + bbox_pred────(tblr)───┐ + ▼ + angle_pred decode──►rbox_pred──(xywha)─►loss_bbox + │ ▲ + ├────►decode──(a)─┘ + │ + └───────────────────────────────────────────►loss_angle + ``` + +- `use_hbbox_loss=True` 且 `loss_angle` 为 None. 
+ + 此时框预测和角度预测完全分离,将两个分支视作两个任务进行训练。 + 此时 `loss_bbox` 要设定为水平框的损失函数,例如 `IoULoss` 。 + + ``` + bbox_pred──(tblr)──►decode──►hbox_pred──(xyxy)──►loss_bbox + + angle_pred──────────────────────────────────────►loss_angle + ``` + +除了检测头中的参数,在test_cfg中还增加了 `decoded_with_angle` 参数用来控制推理时角度的处理逻辑,默认设定为 True 。 +设计这个参数的目标是让训练过程和推理过程的逻辑对齐,该参数会影响最终的精度。 + +当 `decoded_with_angle=True` 时,将框和角度同时送入 `bbox_coder` 中。 +此时要使用旋转框的编解码器,例如`DistanceAnglePointCoder`。 + +``` +bbox_pred────(tblr)───┐ + ▼ +angle_pred decode──(xywha)──►rbox_pred + │ ▲ + └────►decode──(a)─┘ +``` + +当 `decoded_with_angle=False` 时,首先解码出水平检测框,之后将角度 concat 到检测框。 +此时要使用水平框的编解码器,例如`DistancePointBBoxCoder`。 + +``` +bbox_pred──(tblr)─►decode + │ (xyxy) + ▼ + format───(xywh)──►concat──(xywha)──►rbox_pred + ▲ +angle_pred────────►decode────(a)───────┘ +``` + +### 可视化器 + +由于旋转框和水平框的差异,旋转目标检测模型需要使用 MMRotate 中的 `RotLocalVisualizer`,配置如下: + +```python +vis_backends = [dict(type='LocalVisBackend')] # 可视化后端,请参考 https://mmengine.readthedocs.io/zh_CN/latest/advanced_tutorials/visualization.html +visualizer = dict( + type='mmrotate.RotLocalVisualizer', vis_backends=vis_backends, name='visualizer') +``` + +## 实用工具 + +目前测试可用的工具包括: + +[可视化数据集](../useful_tools/browse_dataset.md) diff --git a/third_party/mmyolo/docs/zh_cn/tutorials/warning_notes.md b/third_party/mmyolo/docs/zh_cn/tutorials/warning_notes.md new file mode 100644 index 0000000000000000000000000000000000000000..38b65c983a30da8ed4a57724101a2a3c60cd6bf0 --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/tutorials/warning_notes.md @@ -0,0 +1,22 @@ +# 常见警告说明 + +本文档收集用户经常疑惑的警告信息说明,方便大家理解。 + +## xxx registry in mmyolo did not set import location + +完整信息为 The xxx registry in mmyolo did not set import location. Fallback to call `mmyolo.utils.register_all_modules` instead.。 +这个警告的含义说某个模块在导入时候发现没有设置导入的 location,导致无法确定其位置,因此会自动调用 `mmyolo.utils.register_all_modules` 触发包的导入。这个警告属于 MMEngine 中非常底层的模块警告, +用户理解起来可能比较困难,不过对大家使用没有任何影响,可以直接忽略。 + +## save_param_schedulers is true but self.param_schedulers is None + +以 YOLOv5 算法为例,这是因为 YOLOv5 中重新写了参数调度器策略 `YOLOv5ParamSchedulerHook`,因此 MMEngine 中设计的 ParamScheduler 是没有使用的,但是 YOLOv5 配置中也没有设置 `save_param_schedulers` 为 False。 +首先这个警告对性能和恢复训练没有任何影响,用户如果觉得这个警告会影响体验,可以设置 `default_hooks.checkpoint.save_param_scheduler` 为 False 或者训练时候通过命令行设置 `--cfg-options default_hooks.checkpoint.save_param_scheduler=False` 即可。 + +## The loss_cls will be 0. This is a normal phenomenon. 
+ +这个和具体算法有关。以 YOLOv5 为例,其分类 loss 是只考虑正样本的,如果类别是 1,那么分类 loss 和 obj loss 就是功能重复的了,因此在设计上当类别是 1 的时候 loss_cls 是不计算的,因此始终是 0,这是正常现象。 + +## The model and loaded state dict do not match exactly + +这个警告是否会影响性能要根据进一步的打印信息来确定。如果是在微调模式下,由于用户自定义类别不一样无法加载 Head 模块的 COCO 预训练权重,这是一个正常现象,不会影响性能。 diff --git a/third_party/mmyolo/docs/zh_cn/useful_tools/browse_coco_json.md b/third_party/mmyolo/docs/zh_cn/useful_tools/browse_coco_json.md new file mode 100644 index 0000000000000000000000000000000000000000..3e33f538fc06e92e2b2a25c8814d10fa7e233bff --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/useful_tools/browse_coco_json.md @@ -0,0 +1,62 @@ +# 可视化 COCO 标签 + +脚本 `tools/analysis_tools/browse_coco_json.py` 能够使用可视化显示 COCO 标签在图片的情况。 + +```shell +python tools/analysis_tools/browse_coco_json.py [--data-root ${DATA_ROOT}] \ + [--img-dir ${IMG_DIR}] \ + [--ann-file ${ANN_FILE}] \ + [--wait-time ${WAIT_TIME}] \ + [--disp-all] [--category-names CATEGORY_NAMES [CATEGORY_NAMES ...]] \ + [--shuffle] +``` + +其中,如果图片、标签都在同一个文件夹下的话,可以指定 `--data-root` 到该文件夹,然后 `--img-dir` 和 `--ann-file` 指定该文件夹的相对路径,代码会自动拼接。 +如果图片、标签文件不在同一个文件夹下的话,则无需指定 `--data-root` ,直接指定绝对路径的 `--img-dir` 和 `--ann-file` 即可。 + +例子: + +1. 查看 `COCO` 全部类别,同时展示 `bbox`、`mask` 等所有类型的标注: + +```shell +python tools/analysis_tools/browse_coco_json.py --data-root './data/coco' \ + --img-dir 'train2017' \ + --ann-file 'annotations/instances_train2017.json' \ + --disp-all +``` + +如果图片、标签不在同一个文件夹下的话,可以使用绝对路径: + +```shell +python tools/analysis_tools/browse_coco_json.py --img-dir '/dataset/image/coco/train2017' \ + --ann-file '/label/instances_train2017.json' \ + --disp-all +``` + +2. 查看 `COCO` 全部类别,同时仅展示 `bbox` 类型的标注,并打乱显示: + +```shell +python tools/analysis_tools/browse_coco_json.py --data-root './data/coco' \ + --img-dir 'train2017' \ + --ann-file 'annotations/instances_train2017.json' \ + --shuffle +``` + +3. 只查看 `bicycle` 和 `person` 类别,同时仅展示 `bbox` 类型的标注: + +```shell +python tools/analysis_tools/browse_coco_json.py --data-root './data/coco' \ + --img-dir 'train2017' \ + --ann-file 'annotations/instances_train2017.json' \ + --category-names 'bicycle' 'person' +``` + +4. 
查看 `COCO` 全部类别,同时展示 `bbox`、`mask` 等所有类型的标注,并打乱显示: + +```shell +python tools/analysis_tools/browse_coco_json.py --data-root './data/coco' \ + --img-dir 'train2017' \ + --ann-file 'annotations/instances_train2017.json' \ + --disp-all \ + --shuffle +``` diff --git a/third_party/mmyolo/docs/zh_cn/useful_tools/browse_dataset.md b/third_party/mmyolo/docs/zh_cn/useful_tools/browse_dataset.md new file mode 100644 index 0000000000000000000000000000000000000000..5d6f08723cd775b7411a6bd883380dd97e2b9368 --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/useful_tools/browse_dataset.md @@ -0,0 +1,57 @@ +# 可视化数据集 + +```shell +python tools/analysis_tools/browse_dataset.py \ + ${CONFIG_FILE} \ + [-o, --out-dir ${OUTPUT_DIR}] \ + [-p, --phase ${DATASET_PHASE}] \ + [-n, --show-number ${NUMBER_IMAGES_DISPLAY}] \ + [-i, --show-interval ${SHOW_INTERRVAL}] \ + [-m, --mode ${DISPLAY_MODE}] \ + [--cfg-options ${CFG_OPTIONS}] +``` + +**所有参数的说明**: + +- `config` : 模型配置文件的路径。 +- `-o, --out-dir`: 保存图片文件夹,如果没有指定,默认为 `'./output'`。 +- **`-p, --phase`**: 可视化数据集的阶段,只能为 `['train', 'val', 'test']` 之一,默认为 `'train'`。 +- **`-n, --show-number`**: 可视化样本数量。如果没有指定,默认展示数据集的所有图片。 +- **`-m, --mode`**: 可视化的模式,只能为 `['original', 'transformed', 'pipeline']` 之一。 默认为 `'transformed'`。 +- `--cfg-options` : 对配置文件的修改,参考[学习配置文件](../tutorials/config.md)。 + +```shell +`-m, --mode` 用于设置可视化的模式,默认设置为 'transformed'。 +- 如果 `--mode` 设置为 'original',则获取原始图片; +- 如果 `--mode` 设置为 'transformed',则获取预处理后的图片; +- 如果 `--mode` 设置为 'pipeline',则获得数据流水线所有中间过程图片。 +``` + +**示例**: + +1. **'original'** 模式 : + +```shell +python ./tools/analysis_tools/browse_dataset.py configs/yolov5/yolov5_s-v61_syncbn_fast_1xb4-300e_balloon.py --phase val --out-dir tmp --mode original +``` + +- `--phase val`: 可视化验证集, 可简化为 `-p val`; +- `--out-dir tmp`: 可视化结果保存在 "tmp" 文件夹, 可简化为 `-o tmp`; +- `--mode original`: 可视化原图, 可简化为 `-m original`; +- `--show-number 100`: 可视化100张图,可简化为 `-n 100`; + +2. **'transformed'** 模式 : + +```shell +python ./tools/analysis_tools/browse_dataset.py configs/yolov5/yolov5_s-v61_syncbn_fast_1xb4-300e_balloon.py +``` + +3. **'pipeline'** 模式 : + +```shell +python ./tools/analysis_tools/browse_dataset.py configs/yolov5/yolov5_s-v61_syncbn_fast_1xb4-300e_balloon.py -m pipeline +``` + +
+
+<!-- 此处为 'pipeline' 模式可视化效果示例图 -->
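+
+上述参数可以组合使用。下面给出一个组合示例(仅作演示,配置文件与输出路径请按实际情况替换):
+
+```shell
+# 可视化验证集的前 20 张图片在完整数据流水线中的中间结果,并保存到 tmp 文件夹
+python ./tools/analysis_tools/browse_dataset.py \
+    configs/yolov5/yolov5_s-v61_syncbn_fast_1xb4-300e_balloon.py \
+    -p val \
+    -n 20 \
+    -o tmp \
+    -m pipeline
+```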
diff --git a/third_party/mmyolo/docs/zh_cn/useful_tools/dataset_analysis.md b/third_party/mmyolo/docs/zh_cn/useful_tools/dataset_analysis.md new file mode 100644 index 0000000000000000000000000000000000000000..121128c9ecea1e35792ca0901bbbfc7c27c20e5e --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/useful_tools/dataset_analysis.md @@ -0,0 +1,80 @@ +# 可视化数据集分析结果 + +脚本 `tools/analysis_tools/dataset_analysis.py` 能够帮助用户得到四种功能的结果图,并将图片保存到当前运行目录下的 `dataset_analysis` 文件夹中。 + +关于该脚本的功能的说明: + +通过 `main()` 的数据准备,得到每个子函数所需要的数据。 + +功能一:显示类别和 bbox 实例个数的分布图,通过子函数 `show_bbox_num` 生成。 + + + +功能二:显示类别和 bbox 实例宽、高的分布图,通过子函数 `show_bbox_wh` 生成。 + + + +功能三:显示类别和 bbox 实例宽/高比例的分布图,通过子函数 `show_bbox_wh_ratio` 生成。 + + + +功能四:基于面积规则下,显示类别和 bbox 实例面积的分布图,通过子函数 `show_bbox_area` 生成。 + + + +打印列表显示,通过脚本中子函数 `show_class_list` 和 `show_data_list` 生成。 + + + +```shell +python tools/analysis_tools/dataset_analysis.py ${CONFIG} \ + [-h] \ + [--val-dataset ${TYPE}] \ + [--class-name ${CLASS_NAME}] \ + [--area-rule ${AREA_RULE}] \ + [--func ${FUNC}] \ + [--out-dir ${OUT_DIR}] +``` + +例子: + +1. 使用 `config` 文件 `configs/yolov5/voc/yolov5_s-v61_fast_1xb64-50e_voc.py` 分析数据集,其中默认设置:数据加载类型为 `train_dataset` ,面积规则设置为 `[0,32,96,1e5]` ,生成包含所有类的结果图并将图片保存到当前运行目录下 `./dataset_analysis` 文件夹中: + +```shell +python tools/analysis_tools/dataset_analysis.py configs/yolov5/voc/yolov5_s-v61_fast_1xb64-50e_voc.py +``` + +2. 使用 `config` 文件 `configs/yolov5/voc/yolov5_s-v61_fast_1xb64-50e_voc.py` 分析数据集,通过 `--val-dataset` 设置将数据加载类型由默认的 `train_dataset` 改为 `val_dataset`: + +```shell +python tools/analysis_tools/dataset_analysis.py configs/yolov5/voc/yolov5_s-v61_fast_1xb64-50e_voc.py \ + --val-dataset +``` + +3. 使用 `config` 文件 `configs/yolov5/voc/yolov5_s-v61_fast_1xb64-50e_voc.py` 分析数据集,通过 `--class-name` 设置将生成所有类改为特定类显示,以显示 `person` 为例: + +```shell +python tools/analysis_tools/dataset_analysis.py configs/yolov5/voc/yolov5_s-v61_fast_1xb64-50e_voc.py \ + --class-name person +``` + +4. 使用 `config` 文件 `configs/yolov5/voc/yolov5_s-v61_fast_1xb64-50e_voc.py` 分析数据集,通过 `--area-rule` 重新定义面积规则,以 `30 70 125` 为例,面积规则变为 `[0,30,70,125,1e5]`: + +```shell +python tools/analysis_tools/dataset_analysis.py configs/yolov5/voc/yolov5_s-v61_fast_1xb64-50e_voc.py \ + --area-rule 30 70 125 +``` + +5. 使用 `config` 文件 `configs/yolov5/voc/yolov5_s-v61_fast_1xb64-50e_voc.py` 分析数据集,通过 `--func` 设置,将显示四个功能效果图改为只显示 `功能一` 为例: + +```shell +python tools/analysis_tools/dataset_analysis.py configs/yolov5/voc/yolov5_s-v61_fast_1xb64-50e_voc.py \ + --func show_bbox_num +``` + +6. 
使用 `config` 文件 `configs/yolov5/voc/yolov5_s-v61_fast_1xb64-50e_voc.py` 分析数据集,通过 `--out-dir` 设置修改图片保存地址,以 `work_dirs/dataset_analysis` 地址为例: + +```shell +python tools/analysis_tools/dataset_analysis.py configs/yolov5/voc/yolov5_s-v61_fast_1xb64-50e_voc.py \ + --out-dir work_dirs/dataset_analysis +``` diff --git a/third_party/mmyolo/docs/zh_cn/useful_tools/dataset_converters.md b/third_party/mmyolo/docs/zh_cn/useful_tools/dataset_converters.md new file mode 100644 index 0000000000000000000000000000000000000000..38da7fc7e5c7f0ab098acfdb7f25b6d6d773975d --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/useful_tools/dataset_converters.md @@ -0,0 +1,56 @@ +# 数据集转换 + +文件夹 `tools/data_converters/` 目前包含 `ballon2coco.py`、`yolo2coco.py` 和 `labelme2coco.py` 三个数据集转换工具。 + +- `ballon2coco.py` 将 `balloon` 数据集(该小型数据集仅作为入门使用)转换成 COCO 的格式。 + +```shell +python tools/dataset_converters/balloon2coco.py +``` + +- `yolo2coco.py` 将 `yolo-style` **.txt** 格式的数据集转换成 COCO 的格式,请按如下方式使用: + +```shell +python tools/dataset_converters/yolo2coco.py /path/to/the/root/dir/of/your_dataset +``` + +使用说明: + +1. `image_dir` 是需要你传入的待转换的 yolo 格式数据集的根目录,内应包含 `images` 、 `labels` 和 `classes.txt` 文件, `classes.txt` 是当前 dataset 对应的类的声明,一行一个类别。 + `image_dir` 结构如下例所示: + +```bash +. +└── $ROOT_PATH + ├── classes.txt + ├── labels + │ ├── a.txt + │ ├── b.txt + │ └── ... + ├── images + │ ├── a.jpg + │ ├── b.png + │ └── ... + └── ... +``` + +2. 脚本会检测 `image_dir` 下是否已有的 `train.txt` 、 `val.txt` 和 `test.txt` 。若检测到文件,则会按照类别进行整理, 否则默认不需要分类。故请确保对应的 `train.txt` 、 `val.txt` 和 `test.txt` 要在 `image_dir` 内。文件内的图片路径必须是**绝对路径**。 +3. 脚本会默认在 `image_dir` 目录下创建 `annotations` 文件夹并将转换结果存在这里。如果在 `image_dir` 下没找到分类文件,输出文件即为一个 `result.json`,反之则会生成需要的 `train.json` 、 `val.json`、 `test.json`,脚本完成后 `annotations` 结构可如下例所示: + +```bash +. +└── $ROOT_PATH + ├── annotations + │ ├── result.json + │ └── ... + ├── classes.txt + ├── labels + │ ├── a.txt + │ ├── b.txt + │ └── ... + ├── images + │ ├── a.jpg + │ ├── b.png + │ └── ... + └── ... +``` diff --git a/third_party/mmyolo/docs/zh_cn/useful_tools/download_dataset.md b/third_party/mmyolo/docs/zh_cn/useful_tools/download_dataset.md new file mode 100644 index 0000000000000000000000000000000000000000..a4ad6f4132a6f8d33264a2d2118db524a0cae824 --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/useful_tools/download_dataset.md @@ -0,0 +1,11 @@ +# 数据集下载 + +脚本 `tools/misc/download_dataset.py` 支持下载数据集,例如 `COCO`、`VOC`、`LVIS` 和 `Balloon`. 
+ +```shell +python tools/misc/download_dataset.py --dataset-name coco2017 +python tools/misc/download_dataset.py --dataset-name voc2007 +python tools/misc/download_dataset.py --dataset-name voc2012 +python tools/misc/download_dataset.py --dataset-name lvis +python tools/misc/download_dataset.py --dataset-name balloon [--save-dir ${SAVE_DIR}] [--unzip] +``` diff --git a/third_party/mmyolo/docs/zh_cn/useful_tools/extract_subcoco.md b/third_party/mmyolo/docs/zh_cn/useful_tools/extract_subcoco.md new file mode 100644 index 0000000000000000000000000000000000000000..6093533091795ab75bad7ebfa0d245bb49597a1e --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/useful_tools/extract_subcoco.md @@ -0,0 +1,60 @@ +# 提取 COCO 子集 + +COCO2017 数据集训练数据集包括 118K 张图片,验证集包括 5K 张图片,数据集比较大。在调试或者快速验证程序是否正确的场景下加载 json 会需要消耗较多资源和带来较慢的启动速度,这会导致程序体验不好。 + +`extract_subcoco.py` 脚本提供了按指定图片数量、类别、锚框尺寸来切分图片的功能,用户可以通过 `--num-img`, `--classes`, `--area-size` 参数来得到指定条件的 COCO 子集,从而满足上述需求。 + +例如通过以下脚本切分图片: + +```shell +python tools/misc/extract_subcoco.py \ + ${ROOT} \ + ${OUT_DIR} \ + --num-img 20 \ + --classes cat dog person \ + --area-size small +``` + +会切分出 20 张图片,且这 20 张图片只会保留同时满足类别条件和锚框尺寸条件的标注信息, 没有满足条件的标注信息的图片不会被选择,保证了这 20 张图都是有 annotation info 的。 + +注意: 本脚本目前仅仅支持 COCO2017 数据集,未来会支持更加通用的 COCO JSON 格式数据集 + +输入 root 根路径文件夹格式如下所示: + +```text +├── root +│ ├── annotations +│ ├── train2017 +│ ├── val2017 +│ ├── test2017 +``` + +1. 仅仅使用 5K 张验证集切分出 10 张训练图片和 10 张验证图片 + +```shell +python tools/misc/extract_subcoco.py ${ROOT} ${OUT_DIR} --num-img 10 +``` + +2. 使用训练集切分出 20 张训练图片,使用验证集切分出 20 张验证图片 + +```shell +python tools/misc/extract_subcoco.py ${ROOT} ${OUT_DIR} --num-img 20 --use-training-set +``` + +3. 设置全局种子,默认不设置 + +```shell +python tools/misc/extract_subcoco.py ${ROOT} ${OUT_DIR} --num-img 20 --use-training-set --seed 1 +``` + +4. 按指定类别切分图片 + +```shell +python tools/misc/extract_subcoco.py ${ROOT} ${OUT_DIR} --classes cat dog person +``` + +5. 
按指定锚框尺寸切分图片 + +```shell +python tools/misc/extract_subcoco.py ${ROOT} ${OUT_DIR} --area-size small +``` diff --git a/third_party/mmyolo/docs/zh_cn/useful_tools/log_analysis.md b/third_party/mmyolo/docs/zh_cn/useful_tools/log_analysis.md new file mode 100644 index 0000000000000000000000000000000000000000..6d0c57398cec5cec9e76a834b414b88928151a45 --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/useful_tools/log_analysis.md @@ -0,0 +1,82 @@ +# 日志分析 + +## 曲线图绘制 + +MMDetection 中的 `tools/analysis_tools/analyze_logs.py` 可利用指定的训练 log 文件绘制 loss/mAP 曲线图, 第一次运行前请先运行 `pip install seaborn` 安装必要依赖。 + +```shell +mim run mmdet analyze_logs plot_curve \ + ${LOG} \ # 日志文件路径 + [--keys ${KEYS}] \ # 需要绘制的指标,默认为 'bbox_mAP' + [--start-epoch ${START_EPOCH}] # 起始的 epoch,默认为 1 + [--eval-interval ${EVALUATION_INTERVAL}] \ # 评估间隔,默认为 1 + [--title ${TITLE}] \ # 图片标题,无默认值 + [--legend ${LEGEND}] \ # 图例,默认为 None + [--backend ${BACKEND}] \ # 绘制后端,默认为 None + [--style ${STYLE}] \ # 绘制风格,默认为 'dark' + [--out ${OUT_FILE}] # 输出文件路径 +# [] 代表可选参数,实际输入命令行时,不用输入 [] +``` + +样例: + +- 绘制分类损失曲线图 + + ```shell + mim run mmdet analyze_logs plot_curve \ + yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700.log.json \ + --keys loss_cls \ + --legend loss_cls + ``` + + + +- 绘制分类损失、回归损失曲线图,保存图片为对应的 pdf 文件 + + ```shell + mim run mmdet analyze_logs plot_curve \ + yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700.log.json \ + --keys loss_cls loss_bbox \ + --legend loss_cls loss_bbox \ + --out losses_yolov5_s.pdf + ``` + + + +- 在同一图像中比较两次运行结果的 bbox mAP + + ```shell + mim run mmdet analyze_logs plot_curve \ + yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700.log.json \ + yolov5_n-v61_syncbn_fast_8xb16-300e_coco_20220919_090739.log.json \ + --keys bbox_mAP \ + --legend yolov5_s yolov5_n \ + --eval-interval 10 # 注意评估间隔必须和训练时设置的一致,否则会报错 + ``` + + + +## 计算平均训练速度 + +```shell +mim run mmdet analyze_logs cal_train_time \ + ${LOG} \ # 日志文件路径 + [--include-outliers] # 计算时包含每个 epoch 的第一个数据 +``` + +样例: + +```shell +mim run mmdet analyze_logs cal_train_time \ + yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700.log.json +``` + +输出以如下形式展示: + +```text +-----Analyze train time of yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700.log.json----- +slowest epoch 278, average time is 0.1705 s/iter +fastest epoch 300, average time is 0.1510 s/iter +time std over epochs is 0.0026 +average iter time: 0.1556 s/iter +``` diff --git a/third_party/mmyolo/docs/zh_cn/useful_tools/model_converters.md b/third_party/mmyolo/docs/zh_cn/useful_tools/model_converters.md new file mode 100644 index 0000000000000000000000000000000000000000..b5e7392f3bc13922cb8705484744691d4a7f4035 --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/useful_tools/model_converters.md @@ -0,0 +1,52 @@ +# 模型转换 + +文件夹 `tools/model_converters/` 下的六个脚本能够帮助用户将对应YOLO官方的预训练模型中的键转换成 `MMYOLO` 格式,并使用 `MMYOLO` 对模型进行微调。 + +## YOLOv5 + +下面以转换 `yolov5s.pt` 为例: + +1. 将 YOLOv5 官方代码克隆到本地(目前支持的最高版本为 `v6.1` ): + +```shell +git clone -b v6.1 https://github.com/ultralytics/yolov5.git +cd yolov5 +``` + +2. 下载官方权重: + +```shell +wget https://github.com/ultralytics/yolov5/releases/download/v6.1/yolov5s.pt +``` + +3. 将 `tools/model_converters/yolov5_to_mmyolo.py` 文件复制到 YOLOv5 官方代码克隆的路径: + +```shell +cp ${MMDET_YOLO_PATH}/tools/model_converters/yolov5_to_mmyolo.py yolov5_to_mmyolo.py +``` + +4. 
执行转换: + +```shell +python yolov5_to_mmyolo.py --src ${WEIGHT_FILE_PATH} --dst mmyolov5.pt +``` + +转换好的 `mmyolov5.pt` 即可以为 MMYOLO 所用。 YOLOv6 官方权重转化也是采用一样的使用方式。 + +## YOLOX + +YOLOX 模型的转换不需要下载 YOLOX 官方代码,只需要下载权重即可。下面以转换 `yolox_s.pth` 为例: + +1. 下载权重: + +```shell +wget https://github.com/Megvii-BaseDetection/YOLOX/releases/download/0.1.1rc0/yolox_s.pth +``` + +2. 执行转换: + +```shell +python tools/model_converters/yolox_to_mmyolo.py --src yolox_s.pth --dst mmyolox.pt +``` + +转换好的 `mmyolox.pt` 即可以在 MMYOLO 中使用。 diff --git a/third_party/mmyolo/docs/zh_cn/useful_tools/optimize_anchors.md b/third_party/mmyolo/docs/zh_cn/useful_tools/optimize_anchors.md new file mode 100644 index 0000000000000000000000000000000000000000..5ce98371f2bb9ff8491f8ec4c675ca23ad50bfbf --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/useful_tools/optimize_anchors.md @@ -0,0 +1,37 @@ +# 优化锚框尺寸 + +脚本 `tools/analysis_tools/optimize_anchors.py` 支持 YOLO 系列中三种锚框生成方式,分别是 `k-means`、`Differential Evolution`、`v5-k-means`. + +## k-means + +在 k-means 方法中,使用的是基于 IoU 表示距离的聚类方法,具体使用命令如下: + +```shell +python tools/analysis_tools/optimize_anchors.py ${CONFIG} \ + --algorithm k-means \ + --input-shape ${INPUT_SHAPE [WIDTH HEIGHT]} \ + --out-dir ${OUT_DIR} +``` + +## Differential Evolution + +在 `Differential Evolution` 方法中,使用的是基于差分进化算法(简称 DE 算法)的聚类方式,其最小化目标函数为 `avg_iou_cost`,具体使用命令如下: + +```shell +python tools/analysis_tools/optimize_anchors.py ${CONFIG} \ + --algorithm DE \ + --input-shape ${INPUT_SHAPE [WIDTH HEIGHT]} \ + --out-dir ${OUT_DIR} +``` + +## v5-k-means + +在 v5-k-means 方法中,使用的是 YOLOv5 中基于 `shape-match` 的聚类方式,具体使用命令如下: + +```shell +python tools/analysis_tools/optimize_anchors.py ${CONFIG} \ + --algorithm v5-k-means \ + --input-shape ${INPUT_SHAPE [WIDTH HEIGHT]} \ + --prior-match-thr ${PRIOR_MATCH_THR} \ + --out-dir ${OUT_DIR} +``` diff --git a/third_party/mmyolo/docs/zh_cn/useful_tools/print_config.md b/third_party/mmyolo/docs/zh_cn/useful_tools/print_config.md new file mode 100644 index 0000000000000000000000000000000000000000..904fbd5f3bb82d826752b48c060aed8afb072723 --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/useful_tools/print_config.md @@ -0,0 +1,20 @@ +# 打印完整配置文件 + +MMDetection 中的 `tools/misc/print_config.py` 脚本可将所有配置继承关系展开,打印相应的完整配置文件。调用命令如下: + +```shell +mim run mmdet print_config \ + ${CONFIG} \ # 需要打印的配置文件路径 + [--save-path] \ # 保存文件路径,必须以 .py, .json 或者 .yml 结尾 + [--cfg-options ${OPTIONS [OPTIONS...]}] # 通过命令行参数修改配置文件 +``` + +样例: + +```shell +mim run mmdet print_config \ + configs/yolov5/yolov5_s-v61_syncbn_fast_1xb4-300e_balloon.py \ + --save-path ./work_dirs/yolov5_s-v61_syncbn_fast_1xb4-300e_balloon_whole.py +``` + +运行以上命令,会将 `yolov5_s-v61_syncbn_fast_1xb4-300e_balloon.py` 继承关系展开后的配置文件保存到 `./work_dirs` 文件夹内的 `yolov5_s-v61_syncbn_fast_1xb4-300e_balloon_whole.py` 文件中。 diff --git a/third_party/mmyolo/docs/zh_cn/useful_tools/vis_scheduler.md b/third_party/mmyolo/docs/zh_cn/useful_tools/vis_scheduler.md new file mode 100644 index 0000000000000000000000000000000000000000..f0d772aebda23efb07721e5e94ef98321ebe6e96 --- /dev/null +++ b/third_party/mmyolo/docs/zh_cn/useful_tools/vis_scheduler.md @@ -0,0 +1,44 @@ +# 可视化优化器参数策略 + +`tools/analysis_tools/vis_scheduler.py` 旨在帮助用户检查优化器的超参数调度器(无需训练),支持学习率(learning rate)、动量(momentum)和权值衰减(weight decay)。 + +```shell +python tools/analysis_tools/vis_scheduler.py \ + ${CONFIG_FILE} \ + [-p, --parameter ${PARAMETER_NAME}] \ + [-d, --dataset-size ${DATASET_SIZE}] \ + [-n, --ngpus ${NUM_GPUs}] \ + [-o, --out-dir ${OUT_DIR}] \ + [--title ${TITLE}] \ + [--style ${STYLE}] \ + [--window-size 
${WINDOW_SIZE}] \ + [--cfg-options] +``` + +**所有参数的说明**: + +- `config` : 模型配置文件的路径。 +- **`-p, parameter`**: 可视化参数名,只能为 `["lr", "momentum", "wd"]` 之一, 默认为 `"lr"`. +- **`-d, --dataset-size`**: 数据集的大小。如果指定,`DATASETS.build` 将被跳过并使用这个数值作为数据集大小,默认使用 `DATASETS.build` 所得数据集的大小。 +- **`-n, --ngpus`**: 使用 GPU 的数量, 默认为1。 +- **`-o, --out-dir`**: 保存的可视化图片的文件夹路径,默认不保存。 +- `--title`: 可视化图片的标题,默认为配置文件名。 +- `--style`: 可视化图片的风格,默认为 `whitegrid`。 +- `--window-size`: 可视化窗口大小,如果没有指定,默认为 `12*7`。如果需要指定,按照格式 `'W*H'`。 +- `--cfg-options`: 对配置文件的修改,参考[学习配置文件](../tutorials/config.md)。 + +```{note} +部分数据集在解析标注阶段比较耗时,推荐直接将 `-d, dataset-size` 指定数据集的大小,以节约时间。 +``` + +你可以使用如下命令来绘制配置文件 `configs/rtmdet/rtmdet_s_syncbn_fast_8xb32-300e_coco.py` 将会使用的学习率变化曲线: + +```shell +python tools/analysis_tools/vis_scheduler.py \ + configs/rtmdet/rtmdet_s_syncbn_fast_8xb32-300e_coco.py \ + --dataset-size 118287 \ + --ngpus 8 \ + --out-dir ./output +``` + +
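+
+类似地,将 `-p, --parameter` 指定为 `momentum` 或 `wd`,即可查看动量或权值衰减的调度曲线。下面给出一个查看动量曲线的示例(仅作演示,其余参数含义与上例相同):
+
+```shell
+python tools/analysis_tools/vis_scheduler.py \
+    configs/rtmdet/rtmdet_s_syncbn_fast_8xb32-300e_coco.py \
+    --parameter momentum \
+    --dataset-size 118287 \
+    --ngpus 8 \
+    --out-dir ./output
+```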
diff --git a/third_party/mmyolo/mmyolo/__init__.py b/third_party/mmyolo/mmyolo/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6a0bd5d30f4a02ae1752dbbf729ef5dfbbc82789 --- /dev/null +++ b/third_party/mmyolo/mmyolo/__init__.py @@ -0,0 +1,39 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import mmcv +import mmdet +import mmengine +from mmengine.utils import digit_version + +from .version import __version__, version_info + +mmcv_minimum_version = '2.0.0rc4' +mmcv_maximum_version = '2.1.0' +mmcv_version = digit_version(mmcv.__version__) + +mmengine_minimum_version = '0.7.1' +mmengine_maximum_version = '1.0.0' +mmengine_version = digit_version(mmengine.__version__) + +mmdet_minimum_version = '3.0.0' +mmdet_maximum_version = '4.0.0' +mmdet_version = digit_version(mmdet.__version__) + + +assert (mmcv_version >= digit_version(mmcv_minimum_version) + and mmcv_version < digit_version(mmcv_maximum_version)), \ + f'MMCV=={mmcv.__version__} is used but incompatible. ' \ + f'Please install mmcv>={mmcv_minimum_version}, <{mmcv_maximum_version}.' + +assert (mmengine_version >= digit_version(mmengine_minimum_version) + and mmengine_version < digit_version(mmengine_maximum_version)), \ + f'MMEngine=={mmengine.__version__} is used but incompatible. ' \ + f'Please install mmengine>={mmengine_minimum_version}, ' \ + f'<{mmengine_maximum_version}.' + +assert (mmdet_version >= digit_version(mmdet_minimum_version) + and mmdet_version < digit_version(mmdet_maximum_version)), \ + f'MMDetection=={mmdet.__version__} is used but incompatible. ' \ + f'Please install mmdet>={mmdet_minimum_version}, ' \ + f'<{mmdet_maximum_version}.' + +__all__ = ['__version__', 'version_info', 'digit_version'] diff --git a/third_party/mmyolo/mmyolo/datasets/__init__.py b/third_party/mmyolo/mmyolo/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9db4390457119feaf13b1d2279c8c8bdf2abcf71 --- /dev/null +++ b/third_party/mmyolo/mmyolo/datasets/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .pose_coco import PoseCocoDataset +from .transforms import * # noqa: F401,F403 +from .utils import BatchShapePolicy, yolov5_collate +from .yolov5_coco import YOLOv5CocoDataset +from .yolov5_crowdhuman import YOLOv5CrowdHumanDataset +from .yolov5_dota import YOLOv5DOTADataset +from .yolov5_voc import YOLOv5VOCDataset + +__all__ = [ + 'YOLOv5CocoDataset', 'YOLOv5VOCDataset', 'BatchShapePolicy', + 'yolov5_collate', 'YOLOv5CrowdHumanDataset', 'YOLOv5DOTADataset', + 'PoseCocoDataset' +] diff --git a/third_party/mmyolo/mmyolo/datasets/pose_coco.py b/third_party/mmyolo/mmyolo/datasets/pose_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..b17f9836aea469f09679d01d605f3629771a1801 --- /dev/null +++ b/third_party/mmyolo/mmyolo/datasets/pose_coco.py @@ -0,0 +1,30 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
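+# PoseCocoDataset defined below wraps mmpose's CocoDataset. mmpose is an
+# optional dependency: if it is not installed, the base class falls back to
+# ``object`` and instantiating the dataset raises an ImportError asking the
+# user to install mmpose first.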
+from typing import Any + +from mmengine.dataset import force_full_init + +try: + from mmpose.datasets import CocoDataset as MMPoseCocoDataset +except ImportError: + MMPoseCocoDataset = object + +from ..registry import DATASETS + + +@DATASETS.register_module() +class PoseCocoDataset(MMPoseCocoDataset): + + METAINFO: dict = dict(from_file='configs/_base_/pose/coco.py') + + def __init__(self, *args, **kwargs): + if MMPoseCocoDataset is object: + raise ImportError( + 'Please run "mim install -r requirements/mmpose.txt" ' + 'to install mmpose first for PoseCocoDataset.') + super().__init__(*args, **kwargs) + + @force_full_init + def prepare_data(self, idx) -> Any: + data_info = self.get_data_info(idx) + data_info['dataset'] = self + return self.pipeline(data_info) diff --git a/third_party/mmyolo/mmyolo/datasets/transforms/__init__.py b/third_party/mmyolo/mmyolo/datasets/transforms/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7cdcf8625173e05ef884cf1afe17a9a1c992b6cd --- /dev/null +++ b/third_party/mmyolo/mmyolo/datasets/transforms/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .formatting import PackDetInputs +from .mix_img_transforms import Mosaic, Mosaic9, YOLOv5MixUp, YOLOXMixUp +from .transforms import (FilterAnnotations, LetterResize, LoadAnnotations, + Polygon2Mask, PPYOLOERandomCrop, PPYOLOERandomDistort, + RandomAffine, RandomFlip, RegularizeRotatedBox, + RemoveDataElement, Resize, YOLOv5CopyPaste, + YOLOv5HSVRandomAug, YOLOv5KeepRatioResize, + YOLOv5RandomAffine) + +__all__ = [ + 'YOLOv5KeepRatioResize', 'LetterResize', 'Mosaic', 'YOLOXMixUp', + 'YOLOv5MixUp', 'YOLOv5HSVRandomAug', 'LoadAnnotations', + 'YOLOv5RandomAffine', 'PPYOLOERandomDistort', 'PPYOLOERandomCrop', + 'Mosaic9', 'YOLOv5CopyPaste', 'RemoveDataElement', 'RegularizeRotatedBox', + 'Polygon2Mask', 'PackDetInputs', 'RandomAffine', 'RandomFlip', 'Resize', + 'FilterAnnotations' +] diff --git a/third_party/mmyolo/mmyolo/datasets/transforms/formatting.py b/third_party/mmyolo/mmyolo/datasets/transforms/formatting.py new file mode 100644 index 0000000000000000000000000000000000000000..07eb0121eefdeece052695eeb46599a71a62efe3 --- /dev/null +++ b/third_party/mmyolo/mmyolo/datasets/transforms/formatting.py @@ -0,0 +1,113 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +from mmcv.transforms import to_tensor +from mmdet.datasets.transforms import PackDetInputs as MMDET_PackDetInputs +from mmdet.structures import DetDataSample +from mmdet.structures.bbox import BaseBoxes +from mmengine.structures import InstanceData, PixelData + +from mmyolo.registry import TRANSFORMS + + +@TRANSFORMS.register_module() +class PackDetInputs(MMDET_PackDetInputs): + """Pack the inputs data for the detection / semantic segmentation / + panoptic segmentation. + + Compared to mmdet, we just add the `gt_panoptic_seg` field and logic. + """ + mapping_table = { + 'gt_bboxes': 'bboxes', + 'gt_bboxes_labels': 'labels', + 'gt_masks': 'masks', + 'gt_keypoints': 'keypoints', + 'gt_keypoints_visible': 'keypoints_visible' + } + + def transform(self, results: dict) -> dict: + """Method to pack the input data. + Args: + results (dict): Result dict from the data pipeline. + Returns: + dict: + - 'inputs' (obj:`torch.Tensor`): The forward data of models. + - 'data_sample' (obj:`DetDataSample`): The annotation info of the + sample. 
+ """ + packed_results = dict() + if 'img' in results: + img = results['img'] + if len(img.shape) < 3: + img = np.expand_dims(img, -1) + # To improve the computational speed by by 3-5 times, apply: + # If image is not contiguous, use + # `numpy.transpose()` followed by `numpy.ascontiguousarray()` + # If image is already contiguous, use + # `torch.permute()` followed by `torch.contiguous()` + # Refer to https://github.com/open-mmlab/mmdetection/pull/9533 + # for more details + if not img.flags.c_contiguous: + img = np.ascontiguousarray(img.transpose(2, 0, 1)) + img = to_tensor(img) + else: + img = to_tensor(img).permute(2, 0, 1).contiguous() + + packed_results['inputs'] = img + + if 'gt_ignore_flags' in results: + valid_idx = np.where(results['gt_ignore_flags'] == 0)[0] + ignore_idx = np.where(results['gt_ignore_flags'] == 1)[0] + if 'gt_keypoints' in results: + results['gt_keypoints_visible'] = results[ + 'gt_keypoints'].keypoints_visible + results['gt_keypoints'] = results['gt_keypoints'].keypoints + + data_sample = DetDataSample() + instance_data = InstanceData() + ignore_instance_data = InstanceData() + + for key in self.mapping_table.keys(): + if key not in results: + continue + if key == 'gt_masks' or isinstance(results[key], BaseBoxes): + if 'gt_ignore_flags' in results: + instance_data[ + self.mapping_table[key]] = results[key][valid_idx] + ignore_instance_data[ + self.mapping_table[key]] = results[key][ignore_idx] + else: + instance_data[self.mapping_table[key]] = results[key] + else: + if 'gt_ignore_flags' in results: + instance_data[self.mapping_table[key]] = to_tensor( + results[key][valid_idx]) + ignore_instance_data[self.mapping_table[key]] = to_tensor( + results[key][ignore_idx]) + else: + instance_data[self.mapping_table[key]] = to_tensor( + results[key]) + data_sample.gt_instances = instance_data + data_sample.ignored_instances = ignore_instance_data + + if 'gt_seg_map' in results: + gt_sem_seg_data = dict( + sem_seg=to_tensor(results['gt_seg_map'][None, ...].copy())) + data_sample.gt_sem_seg = PixelData(**gt_sem_seg_data) + + # In order to unify the support for the overlap mask annotations + # i.e. mask overlap annotations in (h,w) format, + # we use the gt_panoptic_seg field to unify the modeling + if 'gt_panoptic_seg' in results: + data_sample.gt_panoptic_seg = PixelData( + pan_seg=results['gt_panoptic_seg']) + + img_meta = {} + for key in self.meta_keys: + assert key in results, f'`{key}` is not found in `results`, ' \ + f'the valid keys are {list(results)}.' + img_meta[key] = results[key] + + data_sample.set_metainfo(img_meta) + packed_results['data_samples'] = data_sample + + return packed_results diff --git a/third_party/mmyolo/mmyolo/datasets/transforms/keypoint_structure.py b/third_party/mmyolo/mmyolo/datasets/transforms/keypoint_structure.py new file mode 100644 index 0000000000000000000000000000000000000000..7b8402be9950bc2a635f5269e7959719e8d87ac9 --- /dev/null +++ b/third_party/mmyolo/mmyolo/datasets/transforms/keypoint_structure.py @@ -0,0 +1,248 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
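+# The Keypoints structure defined below stores per-instance keypoint
+# coordinates and visibility, and provides in-place geometric operations
+# (flip, translate, rescale, clip, project) so that keypoints can be
+# transformed together with bounding boxes during data augmentation.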
+from abc import ABCMeta +from copy import deepcopy +from typing import List, Optional, Sequence, Tuple, Type, TypeVar, Union + +import numpy as np +import torch +from torch import Tensor + +DeviceType = Union[str, torch.device] +T = TypeVar('T') +IndexType = Union[slice, int, list, torch.LongTensor, torch.cuda.LongTensor, + torch.BoolTensor, torch.cuda.BoolTensor, np.ndarray] + + +class Keypoints(metaclass=ABCMeta): + """The Keypoints class is for keypoints representation. + + Args: + keypoints (Tensor or np.ndarray): The keypoint data with shape of + (N, K, 2). + keypoints_visible (Tensor or np.ndarray): The visibility of keypoints + with shape of (N, K). + device (str or torch.device, Optional): device of keypoints. + Default to None. + clone (bool): Whether clone ``keypoints`` or not. Defaults to True. + flip_indices (list, Optional): The indices of keypoints when the + images is flipped. Defaults to None. + + Notes: + N: the number of instances. + K: the number of keypoints. + """ + + def __init__(self, + keypoints: Union[Tensor, np.ndarray], + keypoints_visible: Union[Tensor, np.ndarray], + device: Optional[DeviceType] = None, + clone: bool = True, + flip_indices: Optional[List] = None) -> None: + + assert len(keypoints_visible) == len(keypoints) + assert keypoints.ndim == 3 + assert keypoints_visible.ndim == 2 + + keypoints = torch.as_tensor(keypoints) + keypoints_visible = torch.as_tensor(keypoints_visible) + + if device is not None: + keypoints = keypoints.to(device=device) + keypoints_visible = keypoints_visible.to(device=device) + + if clone: + keypoints = keypoints.clone() + keypoints_visible = keypoints_visible.clone() + + self.keypoints = keypoints + self.keypoints_visible = keypoints_visible + self.flip_indices = flip_indices + + def flip_(self, + img_shape: Tuple[int, int], + direction: str = 'horizontal') -> None: + """Flip boxes & kpts horizontally in-place. + + Args: + img_shape (Tuple[int, int]): A tuple of image height and width. + direction (str): Flip direction, options are "horizontal", + "vertical" and "diagonal". Defaults to "horizontal" + """ + assert direction == 'horizontal' + self.keypoints[..., 0] = img_shape[1] - self.keypoints[..., 0] + self.keypoints = self.keypoints[:, self.flip_indices] + self.keypoints_visible = self.keypoints_visible[:, self.flip_indices] + + def translate_(self, distances: Tuple[float, float]) -> None: + """Translate boxes and keypoints in-place. + + Args: + distances (Tuple[float, float]): translate distances. The first + is horizontal distance and the second is vertical distance. + """ + assert len(distances) == 2 + distances = self.keypoints.new_tensor(distances).reshape(1, 1, 2) + self.keypoints = self.keypoints + distances + + def rescale_(self, scale_factor: Tuple[float, float]) -> None: + """Rescale boxes & keypoints w.r.t. rescale_factor in-place. + + Note: + Both ``rescale_`` and ``resize_`` will enlarge or shrink boxes + w.r.t ``scale_facotr``. The difference is that ``resize_`` only + changes the width and the height of boxes, but ``rescale_`` also + rescales the box centers simultaneously. + + Args: + scale_factor (Tuple[float, float]): factors for scaling boxes. + The length should be 2. + """ + assert len(scale_factor) == 2 + + scale_factor = self.keypoints.new_tensor(scale_factor).reshape(1, 1, 2) + self.keypoints = self.keypoints * scale_factor + + def clip_(self, img_shape: Tuple[int, int]) -> None: + """Clip bounding boxes and set invisible keypoints outside the image + boundary in-place. 
+ + Args: + img_shape (Tuple[int, int]): A tuple of image height and width. + """ + + kpt_outside = torch.logical_or( + torch.logical_or(self.keypoints[..., 0] < 0, + self.keypoints[..., 1] < 0), + torch.logical_or(self.keypoints[..., 0] > img_shape[1], + self.keypoints[..., 1] > img_shape[0])) + self.keypoints_visible[kpt_outside] *= 0 + + def project_(self, homography_matrix: Union[Tensor, np.ndarray]) -> None: + """Geometrically transform bounding boxes and keypoints in-place using + a homography matrix. + + Args: + homography_matrix (Tensor or np.ndarray): A 3x3 tensor or ndarray + representing the homography matrix for the transformation. + """ + keypoints = self.keypoints + if isinstance(homography_matrix, np.ndarray): + homography_matrix = keypoints.new_tensor(homography_matrix) + + # Convert keypoints to homogeneous coordinates + keypoints = torch.cat([ + self.keypoints, + self.keypoints.new_ones(*self.keypoints.shape[:-1], 1) + ], + dim=-1) + + # Transpose keypoints for matrix multiplication + keypoints_T = torch.transpose(keypoints, -1, 0).contiguous().flatten(1) + + # Apply homography matrix to corners and keypoints + keypoints_T = torch.matmul(homography_matrix, keypoints_T) + + # Transpose back to original shape + keypoints_T = keypoints_T.reshape(3, self.keypoints.shape[1], -1) + keypoints = torch.transpose(keypoints_T, -1, 0).contiguous() + + # Convert corners and keypoints back to non-homogeneous coordinates + keypoints = keypoints[..., :2] / keypoints[..., 2:3] + + # Convert corners back to bounding boxes and update object attributes + self.keypoints = keypoints + + @classmethod + def cat(cls: Type[T], kps_list: Sequence[T], dim: int = 0) -> T: + """Cancatenates an instance list into one single instance. Similar to + ``torch.cat``. + + Args: + box_list (Sequence[T]): A sequence of instances. + dim (int): The dimension over which the box and keypoint are + concatenated. Defaults to 0. + + Returns: + T: Concatenated instance. + """ + assert isinstance(kps_list, Sequence) + if len(kps_list) == 0: + raise ValueError('kps_list should not be a empty list.') + + assert dim == 0 + assert all(isinstance(keypoints, cls) for keypoints in kps_list) + + th_kpt_list = torch.cat( + [keypoints.keypoints for keypoints in kps_list], dim=dim) + th_kpt_vis_list = torch.cat( + [keypoints.keypoints_visible for keypoints in kps_list], dim=dim) + flip_indices = kps_list[0].flip_indices + return cls( + th_kpt_list, + th_kpt_vis_list, + clone=False, + flip_indices=flip_indices) + + def __getitem__(self: T, index: IndexType) -> T: + """Rewrite getitem to protect the last dimension shape.""" + if isinstance(index, np.ndarray): + index = torch.as_tensor(index, device=self.device) + if isinstance(index, Tensor) and index.dtype == torch.bool: + assert index.dim() < self.keypoints.dim() - 1 + elif isinstance(index, tuple): + assert len(index) < self.keypoints.dim() - 1 + # `Ellipsis`(...) is commonly used in index like [None, ...]. + # When `Ellipsis` is in index, it must be the last item. 
+ if Ellipsis in index: + assert index[-1] is Ellipsis + + keypoints = self.keypoints[index] + keypoints_visible = self.keypoints_visible[index] + if self.keypoints.dim() == 2: + keypoints = keypoints.reshape(1, -1, 2) + keypoints_visible = keypoints_visible.reshape(1, -1) + return type(self)( + keypoints, + keypoints_visible, + flip_indices=self.flip_indices, + clone=False) + + def __repr__(self) -> str: + """Return a strings that describes the object.""" + return self.__class__.__name__ + '(\n' + str(self.keypoints) + ')' + + @property + def num_keypoints(self) -> Tensor: + """Compute the number of visible keypoints for each object.""" + return self.keypoints_visible.sum(dim=1).int() + + def __deepcopy__(self, memo): + """Only clone the tensors when applying deepcopy.""" + cls = self.__class__ + other = cls.__new__(cls) + memo[id(self)] = other + other.keypoints = self.keypoints.clone() + other.keypoints_visible = self.keypoints_visible.clone() + other.flip_indices = deepcopy(self.flip_indices) + return other + + def clone(self: T) -> T: + """Reload ``clone`` for tensors.""" + return type(self)( + self.keypoints, + self.keypoints_visible, + flip_indices=self.flip_indices, + clone=True) + + def to(self: T, *args, **kwargs) -> T: + """Reload ``to`` for tensors.""" + return type(self)( + self.keypoints.to(*args, **kwargs), + self.keypoints_visible.to(*args, **kwargs), + flip_indices=self.flip_indices, + clone=False) + + @property + def device(self) -> torch.device: + """Reload ``device`` from self.tensor.""" + return self.keypoints.device diff --git a/third_party/mmyolo/mmyolo/datasets/transforms/mix_img_transforms.py b/third_party/mmyolo/mmyolo/datasets/transforms/mix_img_transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..29e4a4057366374dbdd72fa106b5a3f7ac484d24 --- /dev/null +++ b/third_party/mmyolo/mmyolo/datasets/transforms/mix_img_transforms.py @@ -0,0 +1,1191 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import collections +import copy +from abc import ABCMeta, abstractmethod +from typing import Optional, Sequence, Tuple, Union + +import mmcv +import numpy as np +from mmcv.transforms import BaseTransform +from mmdet.structures.bbox import autocast_box_type +from mmengine.dataset import BaseDataset +from mmengine.dataset.base_dataset import Compose +from numpy import random + +from mmyolo.registry import TRANSFORMS + + +class BaseMixImageTransform(BaseTransform, metaclass=ABCMeta): + """A Base Transform of multiple images mixed. + + Suitable for training on multiple images mixed data augmentation like + mosaic and mixup. + + Cached mosaic transform will random select images from the cache + and combine them into one output image if use_cached is True. + + Args: + pre_transform(Sequence[str]): Sequence of transform object or + config dict to be composed. Defaults to None. + prob(float): The transformation probability. Defaults to 1.0. + use_cached (bool): Whether to use cache. Defaults to False. + max_cached_images (int): The maximum length of the cache. The larger + the cache, the stronger the randomness of this transform. As a + rule of thumb, providing 10 caches for each image suffices for + randomness. Defaults to 40. + random_pop (bool): Whether to randomly pop a result from the cache + when the cache is full. If set to False, use FIFO popping method. + Defaults to True. + max_refetch (int): The maximum number of retry iterations for getting + valid results from the pipeline. 
If the number of iterations is + greater than `max_refetch`, but results is still None, then the + iteration is terminated and raise the error. Defaults to 15. + """ + + def __init__(self, + pre_transform: Optional[Sequence[str]] = None, + prob: float = 1.0, + use_cached: bool = False, + max_cached_images: int = 40, + random_pop: bool = True, + max_refetch: int = 15): + + self.max_refetch = max_refetch + self.prob = prob + + self.use_cached = use_cached + self.max_cached_images = max_cached_images + self.random_pop = random_pop + self.results_cache = [] + + if pre_transform is None: + self.pre_transform = None + else: + self.pre_transform = Compose(pre_transform) + + @abstractmethod + def get_indexes(self, dataset: Union[BaseDataset, + list]) -> Union[list, int]: + """Call function to collect indexes. + + Args: + dataset (:obj:`Dataset` or list): The dataset or cached list. + + Returns: + list or int: indexes. + """ + pass + + @abstractmethod + def mix_img_transform(self, results: dict) -> dict: + """Mixed image data transformation. + + Args: + results (dict): Result dict. + + Returns: + results (dict): Updated result dict. + """ + pass + + @autocast_box_type() + def transform(self, results: dict) -> dict: + """Data augmentation function. + + The transform steps are as follows: + 1. Randomly generate index list of other images. + 2. Before Mosaic or MixUp need to go through the necessary + pre_transform, such as MixUp' pre_transform pipeline + include: 'LoadImageFromFile','LoadAnnotations', + 'Mosaic' and 'RandomAffine'. + 3. Use mix_img_transform function to implement specific + mix operations. + + Args: + results (dict): Result dict. + + Returns: + results (dict): Updated result dict. + """ + + if random.uniform(0, 1) > self.prob: + return results + + if self.use_cached: + # Be careful: deep copying can be very time-consuming + # if results includes dataset. + dataset = results.pop('dataset', None) + self.results_cache.append(copy.deepcopy(results)) + if len(self.results_cache) > self.max_cached_images: + if self.random_pop: + index = random.randint(0, len(self.results_cache) - 1) + else: + index = 0 + self.results_cache.pop(index) + + if len(self.results_cache) <= 4: + return results + else: + assert 'dataset' in results + # Be careful: deep copying can be very time-consuming + # if results includes dataset. + dataset = results.pop('dataset', None) + + for _ in range(self.max_refetch): + # get index of one or three other images + if self.use_cached: + indexes = self.get_indexes(self.results_cache) + else: + indexes = self.get_indexes(dataset) + + if not isinstance(indexes, collections.abc.Sequence): + indexes = [indexes] + + if self.use_cached: + mix_results = [ + copy.deepcopy(self.results_cache[i]) for i in indexes + ] + else: + # get images information will be used for Mosaic or MixUp + mix_results = [ + copy.deepcopy(dataset.get_data_info(index)) + for index in indexes + ] + + if self.pre_transform is not None: + for i, data in enumerate(mix_results): + # pre_transform may also require dataset + data.update({'dataset': dataset}) + # before Mosaic or MixUp need to go through + # the necessary pre_transform + _results = self.pre_transform(data) + _results.pop('dataset') + mix_results[i] = _results + + if None not in mix_results: + results['mix_results'] = mix_results + break + print('Repeated calculation') + else: + raise RuntimeError( + 'The loading pipeline of the original dataset' + ' always return None. 
Please check the correctness ' + 'of the dataset and its pipeline.') + + # Mosaic or MixUp + results = self.mix_img_transform(results) + + if 'mix_results' in results: + results.pop('mix_results') + results['dataset'] = dataset + + return results + + +@TRANSFORMS.register_module() +class Mosaic(BaseMixImageTransform): + """Mosaic augmentation. + + Given 4 images, mosaic transform combines them into + one output image. The output image is composed of the parts from each sub- + image. + + .. code:: text + + mosaic transform + center_x + +------------------------------+ + | pad | | + | +-----------+ pad | + | | | | + | | image1 +-----------+ + | | | | + | | | image2 | + center_y |----+-+-----------+-----------+ + | | cropped | | + |pad | image3 | image4 | + | | | | + +----|-------------+-----------+ + | | + +-------------+ + + The mosaic transform steps are as follows: + + 1. Choose the mosaic center as the intersections of 4 images + 2. Get the left top image according to the index, and randomly + sample another 3 images from the custom dataset. + 3. Sub image will be cropped if image is larger than mosaic patch + + Required Keys: + + - img + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_bboxes_labels (np.int64) (optional) + - gt_ignore_flags (bool) (optional) + - mix_results (List[dict]) + + Modified Keys: + + - img + - img_shape + - gt_bboxes (optional) + - gt_bboxes_labels (optional) + - gt_ignore_flags (optional) + + Args: + img_scale (Sequence[int]): Image size after mosaic pipeline of single + image. The shape order should be (width, height). + Defaults to (640, 640). + center_ratio_range (Sequence[float]): Center ratio range of mosaic + output. Defaults to (0.5, 1.5). + bbox_clip_border (bool, optional): Whether to clip the objects outside + the border of the image. In some dataset like MOT17, the gt bboxes + are allowed to cross the border of images. Therefore, we don't + need to clip the gt bboxes in these cases. Defaults to True. + pad_val (int): Pad value. Defaults to 114. + pre_transform(Sequence[dict]): Sequence of transform object or + config dict to be composed. + prob (float): Probability of applying this transformation. + Defaults to 1.0. + use_cached (bool): Whether to use cache. Defaults to False. + max_cached_images (int): The maximum length of the cache. The larger + the cache, the stronger the randomness of this transform. As a + rule of thumb, providing 10 caches for each image suffices for + randomness. Defaults to 40. + random_pop (bool): Whether to randomly pop a result from the cache + when the cache is full. If set to False, use FIFO popping method. + Defaults to True. + max_refetch (int): The maximum number of retry iterations for getting + valid results from the pipeline. If the number of iterations is + greater than `max_refetch`, but results is still None, then the + iteration is terminated and raise the error. Defaults to 15. + """ + + def __init__(self, + img_scale: Tuple[int, int] = (640, 640), + center_ratio_range: Tuple[float, float] = (0.5, 1.5), + bbox_clip_border: bool = True, + pad_val: float = 114.0, + pre_transform: Sequence[dict] = None, + prob: float = 1.0, + use_cached: bool = False, + max_cached_images: int = 40, + random_pop: bool = True, + max_refetch: int = 15): + assert isinstance(img_scale, tuple) + assert 0 <= prob <= 1.0, 'The probability should be in range [0,1]. ' \ + f'got {prob}.' + if use_cached: + assert max_cached_images >= 4, 'The length of cache must >= 4, ' \ + f'but got {max_cached_images}.' 
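+            # A cached Mosaic stitches the current image with 3 images sampled
+            # from the cache, so the cache length must be at least 4.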
+ + super().__init__( + pre_transform=pre_transform, + prob=prob, + use_cached=use_cached, + max_cached_images=max_cached_images, + random_pop=random_pop, + max_refetch=max_refetch) + + self.img_scale = img_scale + self.center_ratio_range = center_ratio_range + self.bbox_clip_border = bbox_clip_border + self.pad_val = pad_val + + def get_indexes(self, dataset: Union[BaseDataset, list]) -> list: + """Call function to collect indexes. + + Args: + dataset (:obj:`Dataset` or list): The dataset or cached list. + + Returns: + list: indexes. + """ + indexes = [random.randint(0, len(dataset)) for _ in range(3)] + return indexes + + def mix_img_transform(self, results: dict) -> dict: + """Mixed image data transformation. + + Args: + results (dict): Result dict. + + Returns: + results (dict): Updated result dict. + """ + assert 'mix_results' in results + mosaic_bboxes = [] + mosaic_bboxes_labels = [] + mosaic_ignore_flags = [] + mosaic_masks = [] + mosaic_kps = [] + with_mask = True if 'gt_masks' in results else False + with_kps = True if 'gt_keypoints' in results else False + # self.img_scale is wh format + img_scale_w, img_scale_h = self.img_scale + + if len(results['img'].shape) == 3: + mosaic_img = np.full( + (int(img_scale_h * 2), int(img_scale_w * 2), 3), + self.pad_val, + dtype=results['img'].dtype) + else: + mosaic_img = np.full((int(img_scale_h * 2), int(img_scale_w * 2)), + self.pad_val, + dtype=results['img'].dtype) + + # mosaic center x, y + center_x = int(random.uniform(*self.center_ratio_range) * img_scale_w) + center_y = int(random.uniform(*self.center_ratio_range) * img_scale_h) + center_position = (center_x, center_y) + + loc_strs = ('top_left', 'top_right', 'bottom_left', 'bottom_right') + for i, loc in enumerate(loc_strs): + if loc == 'top_left': + results_patch = results + else: + results_patch = results['mix_results'][i - 1] + + img_i = results_patch['img'] + h_i, w_i = img_i.shape[:2] + # keep_ratio resize + scale_ratio_i = min(img_scale_h / h_i, img_scale_w / w_i) + img_i = mmcv.imresize( + img_i, (int(w_i * scale_ratio_i), int(h_i * scale_ratio_i))) + + # compute the combine parameters + paste_coord, crop_coord = self._mosaic_combine( + loc, center_position, img_i.shape[:2][::-1]) + x1_p, y1_p, x2_p, y2_p = paste_coord + x1_c, y1_c, x2_c, y2_c = crop_coord + + # crop and paste image + mosaic_img[y1_p:y2_p, x1_p:x2_p] = img_i[y1_c:y2_c, x1_c:x2_c] + + # adjust coordinate + gt_bboxes_i = results_patch['gt_bboxes'] + gt_bboxes_labels_i = results_patch['gt_bboxes_labels'] + gt_ignore_flags_i = results_patch['gt_ignore_flags'] + + padw = x1_p - x1_c + padh = y1_p - y1_c + gt_bboxes_i.rescale_([scale_ratio_i, scale_ratio_i]) + gt_bboxes_i.translate_([padw, padh]) + mosaic_bboxes.append(gt_bboxes_i) + mosaic_bboxes_labels.append(gt_bboxes_labels_i) + mosaic_ignore_flags.append(gt_ignore_flags_i) + if with_mask and results_patch.get('gt_masks', None) is not None: + gt_masks_i = results_patch['gt_masks'] + gt_masks_i = gt_masks_i.resize(img_i.shape[:2]) + gt_masks_i = gt_masks_i.translate( + out_shape=(int(self.img_scale[0] * 2), + int(self.img_scale[1] * 2)), + offset=padw, + direction='horizontal') + gt_masks_i = gt_masks_i.translate( + out_shape=(int(self.img_scale[0] * 2), + int(self.img_scale[1] * 2)), + offset=padh, + direction='vertical') + mosaic_masks.append(gt_masks_i) + if with_kps and results_patch.get('gt_keypoints', + None) is not None: + gt_kps_i = results_patch['gt_keypoints'] + gt_kps_i.rescale_([scale_ratio_i, scale_ratio_i]) + gt_kps_i.translate_([padw, padh]) + 
mosaic_kps.append(gt_kps_i) + + mosaic_bboxes = mosaic_bboxes[0].cat(mosaic_bboxes, 0) + mosaic_bboxes_labels = np.concatenate(mosaic_bboxes_labels, 0) + mosaic_ignore_flags = np.concatenate(mosaic_ignore_flags, 0) + + if self.bbox_clip_border: + mosaic_bboxes.clip_([2 * img_scale_h, 2 * img_scale_w]) + if with_mask: + mosaic_masks = mosaic_masks[0].cat(mosaic_masks) + results['gt_masks'] = mosaic_masks + if with_kps: + mosaic_kps = mosaic_kps[0].cat(mosaic_kps, 0) + mosaic_kps.clip_([2 * img_scale_h, 2 * img_scale_w]) + results['gt_keypoints'] = mosaic_kps + else: + # remove outside bboxes + inside_inds = mosaic_bboxes.is_inside( + [2 * img_scale_h, 2 * img_scale_w]).numpy() + mosaic_bboxes = mosaic_bboxes[inside_inds] + mosaic_bboxes_labels = mosaic_bboxes_labels[inside_inds] + mosaic_ignore_flags = mosaic_ignore_flags[inside_inds] + if with_mask: + mosaic_masks = mosaic_masks[0].cat(mosaic_masks)[inside_inds] + results['gt_masks'] = mosaic_masks + if with_kps: + mosaic_kps = mosaic_kps[0].cat(mosaic_kps, 0) + mosaic_kps = mosaic_kps[inside_inds] + results['gt_keypoints'] = mosaic_kps + + results['img'] = mosaic_img + results['img_shape'] = mosaic_img.shape + results['gt_bboxes'] = mosaic_bboxes + results['gt_bboxes_labels'] = mosaic_bboxes_labels + results['gt_ignore_flags'] = mosaic_ignore_flags + + return results + + def _mosaic_combine( + self, loc: str, center_position_xy: Sequence[float], + img_shape_wh: Sequence[int]) -> Tuple[Tuple[int], Tuple[int]]: + """Calculate global coordinate of mosaic image and local coordinate of + cropped sub-image. + + Args: + loc (str): Index for the sub-image, loc in ('top_left', + 'top_right', 'bottom_left', 'bottom_right'). + center_position_xy (Sequence[float]): Mixing center for 4 images, + (x, y). + img_shape_wh (Sequence[int]): Width and height of sub-image + + Returns: + tuple[tuple[float]]: Corresponding coordinate of pasting and + cropping + - paste_coord (tuple): paste corner coordinate in mosaic image. + - crop_coord (tuple): crop corner coordinate in mosaic image. 
+ """ + assert loc in ('top_left', 'top_right', 'bottom_left', 'bottom_right') + if loc == 'top_left': + # index0 to top left part of image + x1, y1, x2, y2 = max(center_position_xy[0] - img_shape_wh[0], 0), \ + max(center_position_xy[1] - img_shape_wh[1], 0), \ + center_position_xy[0], \ + center_position_xy[1] + crop_coord = img_shape_wh[0] - (x2 - x1), img_shape_wh[1] - ( + y2 - y1), img_shape_wh[0], img_shape_wh[1] + + elif loc == 'top_right': + # index1 to top right part of image + x1, y1, x2, y2 = center_position_xy[0], \ + max(center_position_xy[1] - img_shape_wh[1], 0), \ + min(center_position_xy[0] + img_shape_wh[0], + self.img_scale[0] * 2), \ + center_position_xy[1] + crop_coord = 0, img_shape_wh[1] - (y2 - y1), min( + img_shape_wh[0], x2 - x1), img_shape_wh[1] + + elif loc == 'bottom_left': + # index2 to bottom left part of image + x1, y1, x2, y2 = max(center_position_xy[0] - img_shape_wh[0], 0), \ + center_position_xy[1], \ + center_position_xy[0], \ + min(self.img_scale[1] * 2, center_position_xy[1] + + img_shape_wh[1]) + crop_coord = img_shape_wh[0] - (x2 - x1), 0, img_shape_wh[0], min( + y2 - y1, img_shape_wh[1]) + + else: + # index3 to bottom right part of image + x1, y1, x2, y2 = center_position_xy[0], \ + center_position_xy[1], \ + min(center_position_xy[0] + img_shape_wh[0], + self.img_scale[0] * 2), \ + min(self.img_scale[1] * 2, center_position_xy[1] + + img_shape_wh[1]) + crop_coord = 0, 0, min(img_shape_wh[0], + x2 - x1), min(y2 - y1, img_shape_wh[1]) + + paste_coord = x1, y1, x2, y2 + return paste_coord, crop_coord + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(img_scale={self.img_scale}, ' + repr_str += f'center_ratio_range={self.center_ratio_range}, ' + repr_str += f'pad_val={self.pad_val}, ' + repr_str += f'prob={self.prob})' + return repr_str + + +@TRANSFORMS.register_module() +class Mosaic9(BaseMixImageTransform): + """Mosaic9 augmentation. + + Given 9 images, mosaic transform combines them into + one output image. The output image is composed of the parts from each sub- + image. + + .. code:: text + + +-------------------------------+------------+ + | pad | pad | | + | +----------+ | | + | | +---------------+ top_right | + | | | top | image2 | + | | top_left | image1 | | + | | image8 o--------+------+--------+---+ + | | | | | | + +----+----------+ | right |pad| + | | center | image3 | | + | left | image0 +---------------+---| + | image7 | | | | + +---+-----------+---+--------+ | | + | | cropped | | bottom_right |pad| + | |bottom_left| | image4 | | + | | image6 | bottom | | | + +---|-----------+ image5 +---------------+---| + | pad | | pad | + +-----------+------------+-------------------+ + + The mosaic transform steps are as follows: + + 1. Get the center image according to the index, and randomly + sample another 8 images from the custom dataset. + 2. Randomly offset the image after Mosaic + + Required Keys: + + - img + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_bboxes_labels (np.int64) (optional) + - gt_ignore_flags (bool) (optional) + - mix_results (List[dict]) + + Modified Keys: + + - img + - img_shape + - gt_bboxes (optional) + - gt_bboxes_labels (optional) + - gt_ignore_flags (optional) + + Args: + img_scale (Sequence[int]): Image size after mosaic pipeline of single + image. The shape order should be (width, height). + Defaults to (640, 640). + bbox_clip_border (bool, optional): Whether to clip the objects outside + the border of the image. 
In some dataset like MOT17, the gt bboxes + are allowed to cross the border of images. Therefore, we don't + need to clip the gt bboxes in these cases. Defaults to True. + pad_val (int): Pad value. Defaults to 114. + pre_transform(Sequence[dict]): Sequence of transform object or + config dict to be composed. + prob (float): Probability of applying this transformation. + Defaults to 1.0. + use_cached (bool): Whether to use cache. Defaults to False. + max_cached_images (int): The maximum length of the cache. The larger + the cache, the stronger the randomness of this transform. As a + rule of thumb, providing 5 caches for each image suffices for + randomness. Defaults to 50. + random_pop (bool): Whether to randomly pop a result from the cache + when the cache is full. If set to False, use FIFO popping method. + Defaults to True. + max_refetch (int): The maximum number of retry iterations for getting + valid results from the pipeline. If the number of iterations is + greater than `max_refetch`, but results is still None, then the + iteration is terminated and raise the error. Defaults to 15. + """ + + def __init__(self, + img_scale: Tuple[int, int] = (640, 640), + bbox_clip_border: bool = True, + pad_val: Union[float, int] = 114.0, + pre_transform: Sequence[dict] = None, + prob: float = 1.0, + use_cached: bool = False, + max_cached_images: int = 50, + random_pop: bool = True, + max_refetch: int = 15): + assert isinstance(img_scale, tuple) + assert 0 <= prob <= 1.0, 'The probability should be in range [0,1]. ' \ + f'got {prob}.' + if use_cached: + assert max_cached_images >= 9, 'The length of cache must >= 9, ' \ + f'but got {max_cached_images}.' + + super().__init__( + pre_transform=pre_transform, + prob=prob, + use_cached=use_cached, + max_cached_images=max_cached_images, + random_pop=random_pop, + max_refetch=max_refetch) + + self.img_scale = img_scale + self.bbox_clip_border = bbox_clip_border + self.pad_val = pad_val + + # intermediate variables + self._current_img_shape = [0, 0] + self._center_img_shape = [0, 0] + self._previous_img_shape = [0, 0] + + def get_indexes(self, dataset: Union[BaseDataset, list]) -> list: + """Call function to collect indexes. + + Args: + dataset (:obj:`Dataset` or list): The dataset or cached list. + + Returns: + list: indexes. + """ + indexes = [random.randint(0, len(dataset)) for _ in range(8)] + return indexes + + def mix_img_transform(self, results: dict) -> dict: + """Mixed image data transformation. + + Args: + results (dict): Result dict. + + Returns: + results (dict): Updated result dict. 
+ """ + assert 'mix_results' in results + + mosaic_bboxes = [] + mosaic_bboxes_labels = [] + mosaic_ignore_flags = [] + + img_scale_w, img_scale_h = self.img_scale + + if len(results['img'].shape) == 3: + mosaic_img = np.full( + (int(img_scale_h * 3), int(img_scale_w * 3), 3), + self.pad_val, + dtype=results['img'].dtype) + else: + mosaic_img = np.full((int(img_scale_h * 3), int(img_scale_w * 3)), + self.pad_val, + dtype=results['img'].dtype) + + # index = 0 is mean original image + # len(results['mix_results']) = 8 + loc_strs = ('center', 'top', 'top_right', 'right', 'bottom_right', + 'bottom', 'bottom_left', 'left', 'top_left') + + results_all = [results, *results['mix_results']] + for index, results_patch in enumerate(results_all): + img_i = results_patch['img'] + # keep_ratio resize + img_i_h, img_i_w = img_i.shape[:2] + scale_ratio_i = min(img_scale_h / img_i_h, img_scale_w / img_i_w) + img_i = mmcv.imresize( + img_i, + (int(img_i_w * scale_ratio_i), int(img_i_h * scale_ratio_i))) + + paste_coord = self._mosaic_combine(loc_strs[index], + img_i.shape[:2]) + + padw, padh = paste_coord[:2] + x1, y1, x2, y2 = (max(x, 0) for x in paste_coord) + mosaic_img[y1:y2, x1:x2] = img_i[y1 - padh:, x1 - padw:] + + gt_bboxes_i = results_patch['gt_bboxes'] + gt_bboxes_labels_i = results_patch['gt_bboxes_labels'] + gt_ignore_flags_i = results_patch['gt_ignore_flags'] + gt_bboxes_i.rescale_([scale_ratio_i, scale_ratio_i]) + gt_bboxes_i.translate_([padw, padh]) + + mosaic_bboxes.append(gt_bboxes_i) + mosaic_bboxes_labels.append(gt_bboxes_labels_i) + mosaic_ignore_flags.append(gt_ignore_flags_i) + + # Offset + offset_x = int(random.uniform(0, img_scale_w)) + offset_y = int(random.uniform(0, img_scale_h)) + mosaic_img = mosaic_img[offset_y:offset_y + 2 * img_scale_h, + offset_x:offset_x + 2 * img_scale_w] + + mosaic_bboxes = mosaic_bboxes[0].cat(mosaic_bboxes, 0) + mosaic_bboxes.translate_([-offset_x, -offset_y]) + mosaic_bboxes_labels = np.concatenate(mosaic_bboxes_labels, 0) + mosaic_ignore_flags = np.concatenate(mosaic_ignore_flags, 0) + + if self.bbox_clip_border: + mosaic_bboxes.clip_([2 * img_scale_h, 2 * img_scale_w]) + else: + # remove outside bboxes + inside_inds = mosaic_bboxes.is_inside( + [2 * img_scale_h, 2 * img_scale_w]).numpy() + mosaic_bboxes = mosaic_bboxes[inside_inds] + mosaic_bboxes_labels = mosaic_bboxes_labels[inside_inds] + mosaic_ignore_flags = mosaic_ignore_flags[inside_inds] + + results['img'] = mosaic_img + results['img_shape'] = mosaic_img.shape + results['gt_bboxes'] = mosaic_bboxes + results['gt_bboxes_labels'] = mosaic_bboxes_labels + results['gt_ignore_flags'] = mosaic_ignore_flags + return results + + def _mosaic_combine(self, loc: str, + img_shape_hw: Tuple[int, int]) -> Tuple[int, ...]: + """Calculate global coordinate of mosaic image. + + Args: + loc (str): Index for the sub-image. + img_shape_hw (Sequence[int]): Height and width of sub-image + + Returns: + paste_coord (tuple): paste corner coordinate in mosaic image. 
+ """ + assert loc in ('center', 'top', 'top_right', 'right', 'bottom_right', + 'bottom', 'bottom_left', 'left', 'top_left') + + img_scale_w, img_scale_h = self.img_scale + + self._current_img_shape = img_shape_hw + current_img_h, current_img_w = self._current_img_shape + previous_img_h, previous_img_w = self._previous_img_shape + center_img_h, center_img_w = self._center_img_shape + + if loc == 'center': + self._center_img_shape = self._current_img_shape + # xmin, ymin, xmax, ymax + paste_coord = img_scale_w, \ + img_scale_h, \ + img_scale_w + current_img_w, \ + img_scale_h + current_img_h + elif loc == 'top': + paste_coord = img_scale_w, \ + img_scale_h - current_img_h, \ + img_scale_w + current_img_w, \ + img_scale_h + elif loc == 'top_right': + paste_coord = img_scale_w + previous_img_w, \ + img_scale_h - current_img_h, \ + img_scale_w + previous_img_w + current_img_w, \ + img_scale_h + elif loc == 'right': + paste_coord = img_scale_w + center_img_w, \ + img_scale_h, \ + img_scale_w + center_img_w + current_img_w, \ + img_scale_h + current_img_h + elif loc == 'bottom_right': + paste_coord = img_scale_w + center_img_w, \ + img_scale_h + previous_img_h, \ + img_scale_w + center_img_w + current_img_w, \ + img_scale_h + previous_img_h + current_img_h + elif loc == 'bottom': + paste_coord = img_scale_w + center_img_w - current_img_w, \ + img_scale_h + center_img_h, \ + img_scale_w + center_img_w, \ + img_scale_h + center_img_h + current_img_h + elif loc == 'bottom_left': + paste_coord = img_scale_w + center_img_w - \ + previous_img_w - current_img_w, \ + img_scale_h + center_img_h, \ + img_scale_w + center_img_w - previous_img_w, \ + img_scale_h + center_img_h + current_img_h + elif loc == 'left': + paste_coord = img_scale_w - current_img_w, \ + img_scale_h + center_img_h - current_img_h, \ + img_scale_w, \ + img_scale_h + center_img_h + elif loc == 'top_left': + paste_coord = img_scale_w - current_img_w, \ + img_scale_h + center_img_h - \ + previous_img_h - current_img_h, \ + img_scale_w, \ + img_scale_h + center_img_h - previous_img_h + + self._previous_img_shape = self._current_img_shape + # xmin, ymin, xmax, ymax + return paste_coord + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(img_scale={self.img_scale}, ' + repr_str += f'pad_val={self.pad_val}, ' + repr_str += f'prob={self.prob})' + return repr_str + + +@TRANSFORMS.register_module() +class YOLOv5MixUp(BaseMixImageTransform): + """MixUp data augmentation for YOLOv5. + + .. code:: text + + The mixup transform steps are as follows: + + 1. Another random image is picked by dataset. + 2. Randomly obtain the fusion ratio from the beta distribution, + then fuse the target + of the original image and mixup image through this ratio. + + Required Keys: + + - img + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_bboxes_labels (np.int64) (optional) + - gt_ignore_flags (bool) (optional) + - mix_results (List[dict]) + + + Modified Keys: + + - img + - img_shape + - gt_bboxes (optional) + - gt_bboxes_labels (optional) + - gt_ignore_flags (optional) + + + Args: + alpha (float): parameter of beta distribution to get mixup ratio. + Defaults to 32. + beta (float): parameter of beta distribution to get mixup ratio. + Defaults to 32. + pre_transform (Sequence[dict]): Sequence of transform object or + config dict to be composed. + prob (float): Probability of applying this transformation. + Defaults to 1.0. + use_cached (bool): Whether to use cache. Defaults to False. 
+ max_cached_images (int): The maximum length of the cache. The larger + the cache, the stronger the randomness of this transform. As a + rule of thumb, providing 10 caches for each image suffices for + randomness. Defaults to 20. + random_pop (bool): Whether to randomly pop a result from the cache + when the cache is full. If set to False, use FIFO popping method. + Defaults to True. + max_refetch (int): The maximum number of iterations. If the number of + iterations is greater than `max_refetch`, but gt_bbox is still + empty, then the iteration is terminated. Defaults to 15. + """ + + def __init__(self, + alpha: float = 32.0, + beta: float = 32.0, + pre_transform: Sequence[dict] = None, + prob: float = 1.0, + use_cached: bool = False, + max_cached_images: int = 20, + random_pop: bool = True, + max_refetch: int = 15): + if use_cached: + assert max_cached_images >= 2, 'The length of cache must >= 2, ' \ + f'but got {max_cached_images}.' + super().__init__( + pre_transform=pre_transform, + prob=prob, + use_cached=use_cached, + max_cached_images=max_cached_images, + random_pop=random_pop, + max_refetch=max_refetch) + self.alpha = alpha + self.beta = beta + + def get_indexes(self, dataset: Union[BaseDataset, list]) -> int: + """Call function to collect indexes. + + Args: + dataset (:obj:`Dataset` or list): The dataset or cached list. + + Returns: + int: indexes. + """ + return random.randint(0, len(dataset)) + + def mix_img_transform(self, results: dict) -> dict: + """YOLOv5 MixUp transform function. + + Args: + results (dict): Result dict + + Returns: + results (dict): Updated result dict. + """ + assert 'mix_results' in results + + retrieve_results = results['mix_results'][0] + retrieve_img = retrieve_results['img'] + ori_img = results['img'] + assert ori_img.shape == retrieve_img.shape + + # Randomly obtain the fusion ratio from the beta distribution, + # which is around 0.5 + ratio = np.random.beta(self.alpha, self.beta) + mixup_img = (ori_img * ratio + retrieve_img * (1 - ratio)) + + retrieve_gt_bboxes = retrieve_results['gt_bboxes'] + retrieve_gt_bboxes_labels = retrieve_results['gt_bboxes_labels'] + retrieve_gt_ignore_flags = retrieve_results['gt_ignore_flags'] + + mixup_gt_bboxes = retrieve_gt_bboxes.cat( + (results['gt_bboxes'], retrieve_gt_bboxes), dim=0) + mixup_gt_bboxes_labels = np.concatenate( + (results['gt_bboxes_labels'], retrieve_gt_bboxes_labels), axis=0) + mixup_gt_ignore_flags = np.concatenate( + (results['gt_ignore_flags'], retrieve_gt_ignore_flags), axis=0) + if 'gt_masks' in results: + assert 'gt_masks' in retrieve_results + mixup_gt_masks = results['gt_masks'].cat( + [results['gt_masks'], retrieve_results['gt_masks']]) + results['gt_masks'] = mixup_gt_masks + + results['img'] = mixup_img.astype(np.uint8) + results['img_shape'] = mixup_img.shape + results['gt_bboxes'] = mixup_gt_bboxes + results['gt_bboxes_labels'] = mixup_gt_bboxes_labels + results['gt_ignore_flags'] = mixup_gt_ignore_flags + + return results + + +@TRANSFORMS.register_module() +class YOLOXMixUp(BaseMixImageTransform): + """MixUp data augmentation for YOLOX. + + .. code:: text + + mixup transform + +---------------+--------------+ + | mixup image | | + | +--------|--------+ | + | | | | | + +---------------+ | | + | | | | + | | image | | + | | | | + | | | | + | +-----------------+ | + | pad | + +------------------------------+ + + The mixup transform steps are as follows: + + 1. Another random image is picked by dataset and embedded in + the top left patch(after padding and resizing) + 2. 
The target of mixup transform is the weighted average of mixup + image and origin image. + + Required Keys: + + - img + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_bboxes_labels (np.int64) (optional) + - gt_ignore_flags (bool) (optional) + - mix_results (List[dict]) + + + Modified Keys: + + - img + - img_shape + - gt_bboxes (optional) + - gt_bboxes_labels (optional) + - gt_ignore_flags (optional) + + + Args: + img_scale (Sequence[int]): Image output size after mixup pipeline. + The shape order should be (width, height). Defaults to (640, 640). + ratio_range (Sequence[float]): Scale ratio of mixup image. + Defaults to (0.5, 1.5). + flip_ratio (float): Horizontal flip ratio of mixup image. + Defaults to 0.5. + pad_val (int): Pad value. Defaults to 114. + bbox_clip_border (bool, optional): Whether to clip the objects outside + the border of the image. In some dataset like MOT17, the gt bboxes + are allowed to cross the border of images. Therefore, we don't + need to clip the gt bboxes in these cases. Defaults to True. + pre_transform(Sequence[dict]): Sequence of transform object or + config dict to be composed. + prob (float): Probability of applying this transformation. + Defaults to 1.0. + use_cached (bool): Whether to use cache. Defaults to False. + max_cached_images (int): The maximum length of the cache. The larger + the cache, the stronger the randomness of this transform. As a + rule of thumb, providing 10 caches for each image suffices for + randomness. Defaults to 20. + random_pop (bool): Whether to randomly pop a result from the cache + when the cache is full. If set to False, use FIFO popping method. + Defaults to True. + max_refetch (int): The maximum number of iterations. If the number of + iterations is greater than `max_refetch`, but gt_bbox is still + empty, then the iteration is terminated. Defaults to 15. + """ + + def __init__(self, + img_scale: Tuple[int, int] = (640, 640), + ratio_range: Tuple[float, float] = (0.5, 1.5), + flip_ratio: float = 0.5, + pad_val: float = 114.0, + bbox_clip_border: bool = True, + pre_transform: Sequence[dict] = None, + prob: float = 1.0, + use_cached: bool = False, + max_cached_images: int = 20, + random_pop: bool = True, + max_refetch: int = 15): + assert isinstance(img_scale, tuple) + if use_cached: + assert max_cached_images >= 2, 'The length of cache must >= 2, ' \ + f'but got {max_cached_images}.' + super().__init__( + pre_transform=pre_transform, + prob=prob, + use_cached=use_cached, + max_cached_images=max_cached_images, + random_pop=random_pop, + max_refetch=max_refetch) + self.img_scale = img_scale + self.ratio_range = ratio_range + self.flip_ratio = flip_ratio + self.pad_val = pad_val + self.bbox_clip_border = bbox_clip_border + + def get_indexes(self, dataset: Union[BaseDataset, list]) -> int: + """Call function to collect indexes. + + Args: + dataset (:obj:`Dataset` or list): The dataset or cached list. + + Returns: + int: indexes. + """ + return random.randint(0, len(dataset)) + + def mix_img_transform(self, results: dict) -> dict: + """YOLOX MixUp transform function. + + Args: + results (dict): Result dict. + + Returns: + results (dict): Updated result dict. + """ + assert 'mix_results' in results + assert len( + results['mix_results']) == 1, 'MixUp only support 2 images now !' 
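+        # Sketch of the steps that follow: the retrieved image is letterbox
+        # resized, scale-jittered, optionally flipped and randomly cropped to
+        # the shape of the original image, after which the two images are
+        # fused as 0.5 * ori_img + 0.5 * padded_cropped_img (steps 1-8 below).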
+ + if results['mix_results'][0]['gt_bboxes'].shape[0] == 0: + # empty bbox + return results + + retrieve_results = results['mix_results'][0] + retrieve_img = retrieve_results['img'] + + jit_factor = random.uniform(*self.ratio_range) + is_filp = random.uniform(0, 1) > self.flip_ratio + + if len(retrieve_img.shape) == 3: + out_img = np.ones((self.img_scale[1], self.img_scale[0], 3), + dtype=retrieve_img.dtype) * self.pad_val + else: + out_img = np.ones( + self.img_scale[::-1], dtype=retrieve_img.dtype) * self.pad_val + + # 1. keep_ratio resize + scale_ratio = min(self.img_scale[1] / retrieve_img.shape[0], + self.img_scale[0] / retrieve_img.shape[1]) + retrieve_img = mmcv.imresize( + retrieve_img, (int(retrieve_img.shape[1] * scale_ratio), + int(retrieve_img.shape[0] * scale_ratio))) + + # 2. paste + out_img[:retrieve_img.shape[0], :retrieve_img.shape[1]] = retrieve_img + + # 3. scale jit + scale_ratio *= jit_factor + out_img = mmcv.imresize(out_img, (int(out_img.shape[1] * jit_factor), + int(out_img.shape[0] * jit_factor))) + + # 4. flip + if is_filp: + out_img = out_img[:, ::-1, :] + + # 5. random crop + ori_img = results['img'] + origin_h, origin_w = out_img.shape[:2] + target_h, target_w = ori_img.shape[:2] + padded_img = np.ones((max(origin_h, target_h), max( + origin_w, target_w), 3)) * self.pad_val + padded_img = padded_img.astype(np.uint8) + padded_img[:origin_h, :origin_w] = out_img + + x_offset, y_offset = 0, 0 + if padded_img.shape[0] > target_h: + y_offset = random.randint(0, padded_img.shape[0] - target_h) + if padded_img.shape[1] > target_w: + x_offset = random.randint(0, padded_img.shape[1] - target_w) + padded_cropped_img = padded_img[y_offset:y_offset + target_h, + x_offset:x_offset + target_w] + + # 6. adjust bbox + retrieve_gt_bboxes = retrieve_results['gt_bboxes'] + retrieve_gt_bboxes.rescale_([scale_ratio, scale_ratio]) + if self.bbox_clip_border: + retrieve_gt_bboxes.clip_([origin_h, origin_w]) + + if is_filp: + retrieve_gt_bboxes.flip_([origin_h, origin_w], + direction='horizontal') + + # 7. filter + cp_retrieve_gt_bboxes = retrieve_gt_bboxes.clone() + cp_retrieve_gt_bboxes.translate_([-x_offset, -y_offset]) + if self.bbox_clip_border: + cp_retrieve_gt_bboxes.clip_([target_h, target_w]) + + # 8. 
mix up + mixup_img = 0.5 * ori_img + 0.5 * padded_cropped_img + + retrieve_gt_bboxes_labels = retrieve_results['gt_bboxes_labels'] + retrieve_gt_ignore_flags = retrieve_results['gt_ignore_flags'] + + mixup_gt_bboxes = cp_retrieve_gt_bboxes.cat( + (results['gt_bboxes'], cp_retrieve_gt_bboxes), dim=0) + mixup_gt_bboxes_labels = np.concatenate( + (results['gt_bboxes_labels'], retrieve_gt_bboxes_labels), axis=0) + mixup_gt_ignore_flags = np.concatenate( + (results['gt_ignore_flags'], retrieve_gt_ignore_flags), axis=0) + + if not self.bbox_clip_border: + # remove outside bbox + inside_inds = mixup_gt_bboxes.is_inside([target_h, + target_w]).numpy() + mixup_gt_bboxes = mixup_gt_bboxes[inside_inds] + mixup_gt_bboxes_labels = mixup_gt_bboxes_labels[inside_inds] + mixup_gt_ignore_flags = mixup_gt_ignore_flags[inside_inds] + + if 'gt_keypoints' in results: + # adjust kps + retrieve_gt_keypoints = retrieve_results['gt_keypoints'] + retrieve_gt_keypoints.rescale_([scale_ratio, scale_ratio]) + if self.bbox_clip_border: + retrieve_gt_keypoints.clip_([origin_h, origin_w]) + + if is_filp: + retrieve_gt_keypoints.flip_([origin_h, origin_w], + direction='horizontal') + + # filter + cp_retrieve_gt_keypoints = retrieve_gt_keypoints.clone() + cp_retrieve_gt_keypoints.translate_([-x_offset, -y_offset]) + if self.bbox_clip_border: + cp_retrieve_gt_keypoints.clip_([target_h, target_w]) + + # mixup + mixup_gt_keypoints = cp_retrieve_gt_keypoints.cat( + (results['gt_keypoints'], cp_retrieve_gt_keypoints), dim=0) + if not self.bbox_clip_border: + # remove outside bbox + mixup_gt_keypoints = mixup_gt_keypoints[inside_inds] + results['gt_keypoints'] = mixup_gt_keypoints + + results['img'] = mixup_img.astype(np.uint8) + results['img_shape'] = mixup_img.shape + results['gt_bboxes'] = mixup_gt_bboxes + results['gt_bboxes_labels'] = mixup_gt_bboxes_labels + results['gt_ignore_flags'] = mixup_gt_ignore_flags + + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(img_scale={self.img_scale}, ' + repr_str += f'ratio_range={self.ratio_range}, ' + repr_str += f'flip_ratio={self.flip_ratio}, ' + repr_str += f'pad_val={self.pad_val}, ' + repr_str += f'max_refetch={self.max_refetch}, ' + repr_str += f'bbox_clip_border={self.bbox_clip_border})' + return repr_str diff --git a/third_party/mmyolo/mmyolo/datasets/transforms/transforms.py b/third_party/mmyolo/mmyolo/datasets/transforms/transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..8060e9c727b95ba4cfcef865385f9e40491e26da --- /dev/null +++ b/third_party/mmyolo/mmyolo/datasets/transforms/transforms.py @@ -0,0 +1,2102 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
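+# NOTE (illustrative comment, not part of the upstream file): the transforms
+# below are composed through dataset pipeline configs. A typical test-time
+# resize pair, for example, is
+#   dict(type='YOLOv5KeepRatioResize', scale=(640, 640)),
+#   dict(type='LetterResize', scale=(640, 640), pad_val=dict(img=114)),
+# while training pipelines additionally chain the random augmentations
+# defined here (HSV jitter, random affine, random crop, etc.).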
+import math +from copy import deepcopy +from typing import List, Sequence, Tuple, Union + +import cv2 +import mmcv +import numpy as np +import torch +from mmcv.image.geometric import _scale_size +from mmcv.transforms import BaseTransform, Compose +from mmcv.transforms.utils import cache_randomness +from mmdet.datasets.transforms import FilterAnnotations as FilterDetAnnotations +from mmdet.datasets.transforms import LoadAnnotations as MMDET_LoadAnnotations +from mmdet.datasets.transforms import RandomAffine as MMDET_RandomAffine +from mmdet.datasets.transforms import RandomFlip as MMDET_RandomFlip +from mmdet.datasets.transforms import Resize as MMDET_Resize +from mmdet.structures.bbox import (HorizontalBoxes, autocast_box_type, + get_box_type) +from mmdet.structures.mask import PolygonMasks, polygon_to_bitmap +from numpy import random + +from mmyolo.registry import TRANSFORMS +from .keypoint_structure import Keypoints + +# TODO: Waiting for MMCV support +TRANSFORMS.register_module(module=Compose, force=True) + + +@TRANSFORMS.register_module() +class YOLOv5KeepRatioResize(MMDET_Resize): + """Resize images & bbox(if existed). + + This transform resizes the input image according to ``scale``. + Bboxes (if existed) are then resized with the same scale factor. + + Required Keys: + + - img (np.uint8) + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + + Modified Keys: + + - img (np.uint8) + - img_shape (tuple) + - gt_bboxes (optional) + - scale (float) + + Added Keys: + + - scale_factor (np.float32) + + Args: + scale (Union[int, Tuple[int, int]]): Images scales for resizing. + """ + + def __init__(self, + scale: Union[int, Tuple[int, int]], + keep_ratio: bool = True, + **kwargs): + assert keep_ratio is True + super().__init__(scale=scale, keep_ratio=True, **kwargs) + + @staticmethod + def _get_rescale_ratio(old_size: Tuple[int, int], + scale: Union[float, Tuple[int]]) -> float: + """Calculate the ratio for rescaling. + + Args: + old_size (tuple[int]): The old size (w, h) of image. + scale (float | tuple[int]): The scaling factor or maximum size. + If it is a float number, then the image will be rescaled by + this factor, else if it is a tuple of 2 integers, then + the image will be rescaled as large as possible within + the scale. + + Returns: + float: The resize ratio. + """ + w, h = old_size + if isinstance(scale, (float, int)): + if scale <= 0: + raise ValueError(f'Invalid scale {scale}, must be positive.') + scale_factor = scale + elif isinstance(scale, tuple): + max_long_edge = max(scale) + max_short_edge = min(scale) + scale_factor = min(max_long_edge / max(h, w), + max_short_edge / min(h, w)) + else: + raise TypeError('Scale must be a number or tuple of int, ' + f'but got {type(scale)}') + + return scale_factor + + def _resize_img(self, results: dict): + """Resize images with ``results['scale']``.""" + assert self.keep_ratio is True + + if results.get('img', None) is not None: + image = results['img'] + original_h, original_w = image.shape[:2] + ratio = self._get_rescale_ratio((original_h, original_w), + self.scale) + + if ratio != 1: + # resize image according to the shape + # NOTE: We are currently testing on COCO that modifying + # this code will not affect the results. + # If you find that it has an effect on your results, + # please feel free to contact us. 
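+                # Downscaling (ratio < 1) uses 'area' interpolation, which
+                # avoids aliasing when shrinking; upscaling uses 'bilinear'.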
+ image = mmcv.imresize( + img=image, + size=(int(original_w * ratio), int(original_h * ratio)), + interpolation='area' if ratio < 1 else 'bilinear', + backend=self.backend) + + resized_h, resized_w = image.shape[:2] + scale_ratio_h = resized_h / original_h + scale_ratio_w = resized_w / original_w + scale_factor = (scale_ratio_w, scale_ratio_h) + + results['img'] = image + results['img_shape'] = image.shape[:2] + results['scale_factor'] = scale_factor + + +@TRANSFORMS.register_module() +class LetterResize(MMDET_Resize): + """Resize and pad image while meeting stride-multiple constraints. + + Required Keys: + + - img (np.uint8) + - batch_shape (np.int64) (optional) + + Modified Keys: + + - img (np.uint8) + - img_shape (tuple) + - gt_bboxes (optional) + + Added Keys: + - pad_param (np.float32) + + Args: + scale (Union[int, Tuple[int, int]]): Images scales for resizing. + pad_val (dict): Padding value. Defaults to dict(img=0, seg=255). + use_mini_pad (bool): Whether using minimum rectangle padding. + Defaults to True + stretch_only (bool): Whether stretch to the specified size directly. + Defaults to False + allow_scale_up (bool): Allow scale up when ratio > 1. Defaults to True + half_pad_param (bool): If set to True, left and right pad_param will + be given by dividing padding_h by 2. If set to False, pad_param is + in int format. We recommend setting this to False for object + detection tasks, and True for instance segmentation tasks. + Default to False. + """ + + def __init__(self, + scale: Union[int, Tuple[int, int]], + pad_val: dict = dict(img=0, mask=0, seg=255), + use_mini_pad: bool = False, + stretch_only: bool = False, + allow_scale_up: bool = True, + half_pad_param: bool = False, + **kwargs): + super().__init__(scale=scale, keep_ratio=True, **kwargs) + + self.pad_val = pad_val + if isinstance(pad_val, (int, float)): + pad_val = dict(img=pad_val, seg=255) + assert isinstance( + pad_val, dict), f'pad_val must be dict, but got {type(pad_val)}' + + self.use_mini_pad = use_mini_pad + self.stretch_only = stretch_only + self.allow_scale_up = allow_scale_up + self.half_pad_param = half_pad_param + + def _resize_img(self, results: dict): + """Resize images with ``results['scale']``.""" + image = results.get('img', None) + if image is None: + return + + # Use batch_shape if a batch_shape policy is configured + if 'batch_shape' in results: + scale = tuple(results['batch_shape']) # hw + else: + scale = self.scale[::-1] # wh -> hw + + image_shape = image.shape[:2] # height, width + + # Scale ratio (new / old) + ratio = min(scale[0] / image_shape[0], scale[1] / image_shape[1]) + + # only scale down, do not scale up (for better test mAP) + if not self.allow_scale_up: + ratio = min(ratio, 1.0) + + ratio = [ratio, ratio] # float -> (float, float) for (height, width) + + # compute the best size of the image + no_pad_shape = (int(round(image_shape[0] * ratio[0])), + int(round(image_shape[1] * ratio[1]))) + + # padding height & width + padding_h, padding_w = [ + scale[0] - no_pad_shape[0], scale[1] - no_pad_shape[1] + ] + if self.use_mini_pad: + # minimum rectangle padding + padding_w, padding_h = np.mod(padding_w, 32), np.mod(padding_h, 32) + + elif self.stretch_only: + # stretch to the specified size directly + padding_h, padding_w = 0.0, 0.0 + no_pad_shape = (scale[0], scale[1]) + ratio = [scale[0] / image_shape[0], + scale[1] / image_shape[1]] # height, width ratios + + if image_shape != no_pad_shape: + # compare with no resize and padding size + image = mmcv.imresize( + image, (no_pad_shape[1], 
no_pad_shape[0]), + interpolation=self.interpolation, + backend=self.backend) + + scale_factor = (no_pad_shape[1] / image_shape[1], + no_pad_shape[0] / image_shape[0]) + + if 'scale_factor' in results: + results['scale_factor_origin'] = results['scale_factor'] + results['scale_factor'] = scale_factor + + # padding + top_padding, left_padding = int(round(padding_h // 2 - 0.1)), int( + round(padding_w // 2 - 0.1)) + bottom_padding = padding_h - top_padding + right_padding = padding_w - left_padding + + padding_list = [ + top_padding, bottom_padding, left_padding, right_padding + ] + if top_padding != 0 or bottom_padding != 0 or \ + left_padding != 0 or right_padding != 0: + + pad_val = self.pad_val.get('img', 0) + if isinstance(pad_val, int) and image.ndim == 3: + pad_val = tuple(pad_val for _ in range(image.shape[2])) + + image = mmcv.impad( + img=image, + padding=(padding_list[2], padding_list[0], padding_list[3], + padding_list[1]), + pad_val=pad_val, + padding_mode='constant') + + results['img'] = image + results['img_shape'] = image.shape + if 'pad_param' in results: + results['pad_param_origin'] = results['pad_param'] * \ + np.repeat(ratio, 2) + + if self.half_pad_param: + results['pad_param'] = np.array( + [padding_h / 2, padding_h / 2, padding_w / 2, padding_w / 2], + dtype=np.float32) + else: + # We found in object detection, using padding list with + # int type can get higher mAP. + results['pad_param'] = np.array(padding_list, dtype=np.float32) + + def _resize_masks(self, results: dict): + """Resize masks with ``results['scale']``""" + if results.get('gt_masks', None) is None: + return + + gt_masks = results['gt_masks'] + assert isinstance( + gt_masks, PolygonMasks + ), f'Only supports PolygonMasks, but got {type(gt_masks)}' + + # resize the gt_masks + gt_mask_h = results['gt_masks'].height * results['scale_factor'][1] + gt_mask_w = results['gt_masks'].width * results['scale_factor'][0] + gt_masks = results['gt_masks'].resize( + (int(round(gt_mask_h)), int(round(gt_mask_w)))) + + top_padding, _, left_padding, _ = results['pad_param'] + if int(left_padding) != 0: + gt_masks = gt_masks.translate( + out_shape=results['img_shape'][:2], + offset=int(left_padding), + direction='horizontal') + if int(top_padding) != 0: + gt_masks = gt_masks.translate( + out_shape=results['img_shape'][:2], + offset=int(top_padding), + direction='vertical') + results['gt_masks'] = gt_masks + + def _resize_bboxes(self, results: dict): + """Resize bounding boxes with ``results['scale_factor']``.""" + if results.get('gt_bboxes', None) is None: + return + results['gt_bboxes'].rescale_(results['scale_factor']) + + if len(results['pad_param']) != 4: + return + results['gt_bboxes'].translate_( + (results['pad_param'][2], results['pad_param'][0])) + + if self.clip_object_border: + results['gt_bboxes'].clip_(results['img_shape']) + + def transform(self, results: dict) -> dict: + results = super().transform(results) + if 'scale_factor_origin' in results: + scale_factor_origin = results.pop('scale_factor_origin') + results['scale_factor'] = (results['scale_factor'][0] * + scale_factor_origin[0], + results['scale_factor'][1] * + scale_factor_origin[1]) + if 'pad_param_origin' in results: + pad_param_origin = results.pop('pad_param_origin') + results['pad_param'] += pad_param_origin + return results + + +# TODO: Check if it can be merged with mmdet.YOLOXHSVRandomAug +@TRANSFORMS.register_module() +class YOLOv5HSVRandomAug(BaseTransform): + """Apply HSV augmentation to image sequentially. 
+ + Required Keys: + + - img + + Modified Keys: + + - img + + Args: + hue_delta ([int, float]): delta of hue. Defaults to 0.015. + saturation_delta ([int, float]): delta of saturation. Defaults to 0.7. + value_delta ([int, float]): delta of value. Defaults to 0.4. + """ + + def __init__(self, + hue_delta: Union[int, float] = 0.015, + saturation_delta: Union[int, float] = 0.7, + value_delta: Union[int, float] = 0.4): + self.hue_delta = hue_delta + self.saturation_delta = saturation_delta + self.value_delta = value_delta + + def transform(self, results: dict) -> dict: + """The HSV augmentation transform function. + + Args: + results (dict): The result dict. + + Returns: + dict: The result dict. + """ + hsv_gains = \ + random.uniform(-1, 1, 3) * \ + [self.hue_delta, self.saturation_delta, self.value_delta] + 1 + hue, sat, val = cv2.split( + cv2.cvtColor(results['img'], cv2.COLOR_BGR2HSV)) + + table_list = np.arange(0, 256, dtype=hsv_gains.dtype) + lut_hue = ((table_list * hsv_gains[0]) % 180).astype(np.uint8) + lut_sat = np.clip(table_list * hsv_gains[1], 0, 255).astype(np.uint8) + lut_val = np.clip(table_list * hsv_gains[2], 0, 255).astype(np.uint8) + + im_hsv = cv2.merge( + (cv2.LUT(hue, lut_hue), cv2.LUT(sat, + lut_sat), cv2.LUT(val, lut_val))) + results['img'] = cv2.cvtColor(im_hsv, cv2.COLOR_HSV2BGR) + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(hue_delta={self.hue_delta}, ' + repr_str += f'saturation_delta={self.saturation_delta}, ' + repr_str += f'value_delta={self.value_delta})' + return repr_str + + +@TRANSFORMS.register_module() +class LoadAnnotations(MMDET_LoadAnnotations): + """Because the yolo series does not need to consider ignore bboxes for the + time being, in order to speed up the pipeline, it can be excluded in + advance. + + Args: + mask2bbox (bool): Whether to use mask annotation to get bbox. + Defaults to False. + poly2mask (bool): Whether to transform the polygons to bitmaps. + Defaults to False. + merge_polygons (bool): Whether to merge polygons into one polygon. + If merged, the storage structure is simpler and training is more + effcient, especially if the mask inside a bbox is divided into + multiple polygons. Defaults to True. + """ + + def __init__(self, + mask2bbox: bool = False, + poly2mask: bool = False, + merge_polygons: bool = True, + **kwargs): + self.mask2bbox = mask2bbox + self.merge_polygons = merge_polygons + assert not poly2mask, 'Does not support BitmapMasks considering ' \ + 'that bitmap consumes more memory.' + super().__init__(poly2mask=poly2mask, **kwargs) + if self.mask2bbox: + assert self.with_mask, 'Using mask2bbox requires ' \ + 'with_mask is True.' + self._mask_ignore_flag = None + + def transform(self, results: dict) -> dict: + """Function to load multiple types annotations. + + Args: + results (dict): Result dict from :obj:``mmengine.BaseDataset``. + + Returns: + dict: The dict contains loaded bounding box, label and + semantic segmentation. 
+ """ + if self.mask2bbox: + self._load_masks(results) + if self.with_label: + self._load_labels(results) + self._update_mask_ignore_data(results) + gt_bboxes = results['gt_masks'].get_bboxes(dst_type='hbox') + results['gt_bboxes'] = gt_bboxes + elif self.with_keypoints: + self._load_kps(results) + _, box_type_cls = get_box_type(self.box_type) + results['gt_bboxes'] = box_type_cls( + results.get('bbox', []), dtype=torch.float32) + else: + results = super().transform(results) + self._update_mask_ignore_data(results) + return results + + def _update_mask_ignore_data(self, results: dict) -> None: + if 'gt_masks' not in results: + return + + if 'gt_bboxes_labels' in results and len( + results['gt_bboxes_labels']) != len(results['gt_masks']): + assert len(results['gt_bboxes_labels']) == len( + self._mask_ignore_flag) + results['gt_bboxes_labels'] = results['gt_bboxes_labels'][ + self._mask_ignore_flag] + + if 'gt_bboxes' in results and len(results['gt_bboxes']) != len( + results['gt_masks']): + assert len(results['gt_bboxes']) == len(self._mask_ignore_flag) + results['gt_bboxes'] = results['gt_bboxes'][self._mask_ignore_flag] + + def _load_bboxes(self, results: dict): + """Private function to load bounding box annotations. + Note: BBoxes with ignore_flag of 1 is not considered. + Args: + results (dict): Result dict from :obj:``mmengine.BaseDataset``. + + Returns: + dict: The dict contains loaded bounding box annotations. + """ + gt_bboxes = [] + gt_ignore_flags = [] + for instance in results.get('instances', []): + if instance['ignore_flag'] == 0: + gt_bboxes.append(instance['bbox']) + gt_ignore_flags.append(instance['ignore_flag']) + results['gt_ignore_flags'] = np.array(gt_ignore_flags, dtype=bool) + + if self.box_type is None: + results['gt_bboxes'] = np.array( + gt_bboxes, dtype=np.float32).reshape((-1, 4)) + else: + _, box_type_cls = get_box_type(self.box_type) + results['gt_bboxes'] = box_type_cls(gt_bboxes, dtype=torch.float32) + + def _load_labels(self, results: dict): + """Private function to load label annotations. + + Note: BBoxes with ignore_flag of 1 is not considered. + Args: + results (dict): Result dict from :obj:``mmengine.BaseDataset``. + Returns: + dict: The dict contains loaded label annotations. + """ + gt_bboxes_labels = [] + for instance in results.get('instances', []): + if instance['ignore_flag'] == 0: + gt_bboxes_labels.append(instance['bbox_label']) + results['gt_bboxes_labels'] = np.array( + gt_bboxes_labels, dtype=np.int64) + + def _load_masks(self, results: dict) -> None: + """Private function to load mask annotations. + + Args: + results (dict): Result dict from :obj:``mmengine.BaseDataset``. 
+ """ + gt_masks = [] + gt_ignore_flags = [] + self._mask_ignore_flag = [] + for instance in results.get('instances', []): + if instance['ignore_flag'] == 0: + if 'mask' in instance: + gt_mask = instance['mask'] + if isinstance(gt_mask, list): + gt_mask = [ + np.array(polygon) for polygon in gt_mask + if len(polygon) % 2 == 0 and len(polygon) >= 6 + ] + if len(gt_mask) == 0: + # ignore + self._mask_ignore_flag.append(0) + else: + if len(gt_mask) > 1 and self.merge_polygons: + gt_mask = self.merge_multi_segment(gt_mask) + gt_masks.append(gt_mask) + gt_ignore_flags.append(instance['ignore_flag']) + self._mask_ignore_flag.append(1) + else: + raise NotImplementedError( + 'Only supports mask annotations in polygon ' + 'format currently') + else: + # TODO: Actually, gt with bbox and without mask needs + # to be retained + self._mask_ignore_flag.append(0) + self._mask_ignore_flag = np.array(self._mask_ignore_flag, dtype=bool) + results['gt_ignore_flags'] = np.array(gt_ignore_flags, dtype=bool) + + h, w = results['ori_shape'] + gt_masks = PolygonMasks([mask for mask in gt_masks], h, w) + results['gt_masks'] = gt_masks + + def merge_multi_segment(self, + gt_masks: List[np.ndarray]) -> List[np.ndarray]: + """Merge multi segments to one list. + + Find the coordinates with min distance between each segment, + then connect these coordinates with one thin line to merge all + segments into one. + Args: + gt_masks(List(np.array)): + original segmentations in coco's json file. + like [segmentation1, segmentation2,...], + each segmentation is a list of coordinates. + Return: + gt_masks(List(np.array)): merged gt_masks + """ + s = [] + segments = [np.array(i).reshape(-1, 2) for i in gt_masks] + idx_list = [[] for _ in range(len(gt_masks))] + + # record the indexes with min distance between each segment + for i in range(1, len(segments)): + idx1, idx2 = self.min_index(segments[i - 1], segments[i]) + idx_list[i - 1].append(idx1) + idx_list[i].append(idx2) + + # use two round to connect all the segments + # first round: first to end, i.e. A->B(partial)->C + # second round: end to first, i.e. C->B(remaining)-A + for k in range(2): + # forward first round + if k == 0: + for i, idx in enumerate(idx_list): + # middle segments have two indexes + # reverse the index of middle segments + if len(idx) == 2 and idx[0] > idx[1]: + idx = idx[::-1] + segments[i] = segments[i][::-1, :] + # add the idx[0] point for connect next segment + segments[i] = np.roll(segments[i], -idx[0], axis=0) + segments[i] = np.concatenate( + [segments[i], segments[i][:1]]) + # deal with the first segment and the last one + if i in [0, len(idx_list) - 1]: + s.append(segments[i]) + # deal with the middle segment + # Note that in the first round, only partial segment + # are appended. + else: + idx = [0, idx[1] - idx[0]] + s.append(segments[i][idx[0]:idx[1] + 1]) + # forward second round + else: + for i in range(len(idx_list) - 1, -1, -1): + # deal with the middle segment + # append the remaining points + if i not in [0, len(idx_list) - 1]: + idx = idx_list[i] + nidx = abs(idx[1] - idx[0]) + s.append(segments[i][nidx:]) + return [np.concatenate(s).reshape(-1, )] + + def min_index(self, arr1: np.ndarray, arr2: np.ndarray) -> Tuple[int, int]: + """Find a pair of indexes with the shortest distance. + + Args: + arr1: (N, 2). + arr2: (M, 2). + Return: + tuple: a pair of indexes. 
+ """ + dis = ((arr1[:, None, :] - arr2[None, :, :])**2).sum(-1) + return np.unravel_index(np.argmin(dis, axis=None), dis.shape) + + def _load_kps(self, results: dict) -> None: + """Private function to load keypoints annotations. + + Args: + results (dict): Result dict from + :class:`mmengine.dataset.BaseDataset`. + + Returns: + dict: The dict contains loaded keypoints annotations. + """ + results['height'] = results['img_shape'][0] + results['width'] = results['img_shape'][1] + num_instances = len(results.get('bbox', [])) + + if num_instances == 0: + results['keypoints'] = np.empty( + (0, len(results['flip_indices']), 2), dtype=np.float32) + results['keypoints_visible'] = np.empty( + (0, len(results['flip_indices'])), dtype=np.int32) + results['category_id'] = [] + + results['gt_keypoints'] = Keypoints( + keypoints=results['keypoints'], + keypoints_visible=results['keypoints_visible'], + flip_indices=results['flip_indices'], + ) + + results['gt_ignore_flags'] = np.array([False] * num_instances) + results['gt_bboxes_labels'] = np.array(results['category_id']) - 1 + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(with_bbox={self.with_bbox}, ' + repr_str += f'with_label={self.with_label}, ' + repr_str += f'with_mask={self.with_mask}, ' + repr_str += f'with_seg={self.with_seg}, ' + repr_str += f'mask2bbox={self.mask2bbox}, ' + repr_str += f'poly2mask={self.poly2mask}, ' + repr_str += f"imdecode_backend='{self.imdecode_backend}', " + repr_str += f'backend_args={self.backend_args})' + return repr_str + + +@TRANSFORMS.register_module() +class YOLOv5RandomAffine(BaseTransform): + """Random affine transform data augmentation in YOLOv5 and YOLOv8. It is + different from the implementation in YOLOX. + + This operation randomly generates affine transform matrix which including + rotation, translation, shear and scaling transforms. + If you set use_mask_refine == True, the code will use the masks + annotation to refine the bbox. + Our implementation is slightly different from the official. In COCO + dataset, a gt may have multiple mask tags. The official YOLOv5 + annotation file already combines the masks that an object has, + but our code takes into account the fact that an object has multiple masks. + + Required Keys: + + - img + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_bboxes_labels (np.int64) (optional) + - gt_ignore_flags (bool) (optional) + - gt_masks (PolygonMasks) (optional) + + Modified Keys: + + - img + - img_shape + - gt_bboxes (optional) + - gt_bboxes_labels (optional) + - gt_ignore_flags (optional) + - gt_masks (PolygonMasks) (optional) + + Args: + max_rotate_degree (float): Maximum degrees of rotation transform. + Defaults to 10. + max_translate_ratio (float): Maximum ratio of translation. + Defaults to 0.1. + scaling_ratio_range (tuple[float]): Min and max ratio of + scaling transform. Defaults to (0.5, 1.5). + max_shear_degree (float): Maximum degrees of shear + transform. Defaults to 2. + border (tuple[int]): Distance from width and height sides of input + image to adjust output shape. Only used in mosaic dataset. + Defaults to (0, 0). + border_val (tuple[int]): Border padding values of 3 channels. + Defaults to (114, 114, 114). + bbox_clip_border (bool, optional): Whether to clip the objects outside + the border of the image. In some dataset like MOT17, the gt bboxes + are allowed to cross the border of images. Therefore, we don't + need to clip the gt bboxes in these cases. Defaults to True. 
+ min_bbox_size (float): Width and height threshold to filter bboxes. + If the height or width of a box is smaller than this value, it + will be removed. Defaults to 2. + min_area_ratio (float): Threshold of area ratio between + original bboxes and wrapped bboxes. If smaller than this value, + the box will be removed. Defaults to 0.1. + use_mask_refine (bool): Whether to refine bbox by mask. Deprecated. + max_aspect_ratio (float): Aspect ratio of width and height + threshold to filter bboxes. If max(h/w, w/h) larger than this + value, the box will be removed. Defaults to 20. + resample_num (int): Number of poly to resample to. + """ + + def __init__(self, + max_rotate_degree: float = 10.0, + max_translate_ratio: float = 0.1, + scaling_ratio_range: Tuple[float, float] = (0.5, 1.5), + max_shear_degree: float = 2.0, + border: Tuple[int, int] = (0, 0), + border_val: Tuple[int, int, int] = (114, 114, 114), + bbox_clip_border: bool = True, + min_bbox_size: int = 2, + min_area_ratio: float = 0.1, + use_mask_refine: bool = False, + max_aspect_ratio: float = 20., + resample_num: int = 1000): + assert 0 <= max_translate_ratio <= 1 + assert scaling_ratio_range[0] <= scaling_ratio_range[1] + assert scaling_ratio_range[0] > 0 + self.max_rotate_degree = max_rotate_degree + self.max_translate_ratio = max_translate_ratio + self.scaling_ratio_range = scaling_ratio_range + self.max_shear_degree = max_shear_degree + self.border = border + self.border_val = border_val + self.bbox_clip_border = bbox_clip_border + self.min_bbox_size = min_bbox_size + self.min_area_ratio = min_area_ratio + # The use_mask_refine parameter has been deprecated. + self.use_mask_refine = use_mask_refine + self.max_aspect_ratio = max_aspect_ratio + self.resample_num = resample_num + + @autocast_box_type() + def transform(self, results: dict) -> dict: + """The YOLOv5 random affine transform function. + + Args: + results (dict): The result dict. + + Returns: + dict: The result dict. + """ + img = results['img'] + # self.border is wh format + height = img.shape[0] + self.border[1] * 2 + width = img.shape[1] + self.border[0] * 2 + + # Note: Different from YOLOX + center_matrix = np.eye(3, dtype=np.float32) + center_matrix[0, 2] = -img.shape[1] / 2 + center_matrix[1, 2] = -img.shape[0] / 2 + + warp_matrix, scaling_ratio = self._get_random_homography_matrix( + height, width) + warp_matrix = warp_matrix @ center_matrix + + img = cv2.warpPerspective( + img, + warp_matrix, + dsize=(width, height), + borderValue=self.border_val) + results['img'] = img + results['img_shape'] = img.shape + img_h, img_w = img.shape[:2] + + bboxes = results['gt_bboxes'] + num_bboxes = len(bboxes) + if num_bboxes: + orig_bboxes = bboxes.clone() + orig_bboxes.rescale_([scaling_ratio, scaling_ratio]) + if 'gt_masks' in results: + # If the dataset has annotations of mask, + # the mask will be used to refine bbox. 
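+                # The polygons are densely resampled, warped with the same
+                # affine matrix as the image, and then converted back to
+                # horizontal boxes via ``segment2box``.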
+ gt_masks = results['gt_masks'] + + gt_masks_resample = self.resample_masks(gt_masks) + gt_masks = self.warp_mask(gt_masks_resample, warp_matrix, + img_h, img_w) + + # refine bboxes by masks + bboxes = self.segment2box(gt_masks, height, width) + # filter bboxes outside image + valid_index = self.filter_gt_bboxes(orig_bboxes, + bboxes).numpy() + if self.bbox_clip_border: + bboxes.clip_([height - 1e-3, width - 1e-3]) + gt_masks = self.clip_polygons(gt_masks, height, width) + results['gt_masks'] = gt_masks[valid_index] + else: + bboxes.project_(warp_matrix) + if self.bbox_clip_border: + bboxes.clip_([height, width]) + + # filter bboxes + # Be careful: valid_index must convert to numpy, + # otherwise it will raise out of bounds when len(valid_index)=1 + valid_index = self.filter_gt_bboxes(orig_bboxes, + bboxes).numpy() + + results['gt_bboxes'] = bboxes[valid_index] + results['gt_bboxes_labels'] = results['gt_bboxes_labels'][ + valid_index] + results['gt_ignore_flags'] = results['gt_ignore_flags'][ + valid_index] + else: + if 'gt_masks' in results: + results['gt_masks'] = PolygonMasks([], img_h, img_w) + + return results + + def segment2box(self, gt_masks: PolygonMasks, height: int, + width: int) -> HorizontalBoxes: + """ + Convert 1 segment label to 1 box label, applying inside-image + constraint i.e. (xy1, xy2, ...) to (xyxy) + Args: + gt_masks (torch.Tensor): the segment label + width (int): the width of the image. Defaults to 640 + height (int): The height of the image. Defaults to 640 + Returns: + HorizontalBoxes: the clip bboxes from gt_masks. + """ + bboxes = [] + for _, poly_per_obj in enumerate(gt_masks): + # simply use a number that is big enough for comparison with + # coordinates + xy_min = np.array([width * 2, height * 2], dtype=np.float32) + xy_max = np.zeros(2, dtype=np.float32) - 1 + + for p in poly_per_obj: + xy = np.array(p).reshape(-1, 2).astype(np.float32) + x, y = xy.T + inside = (x >= 0) & (y >= 0) & (x <= width) & (y <= height) + x, y = x[inside], y[inside] + if not any(x): + continue + xy = np.stack([x, y], axis=0).T + + xy_min = np.minimum(xy_min, np.min(xy, axis=0)) + xy_max = np.maximum(xy_max, np.max(xy, axis=0)) + if xy_max[0] == -1: + bbox = np.zeros(4, dtype=np.float32) + else: + bbox = np.concatenate([xy_min, xy_max], axis=0) + bboxes.append(bbox) + + return HorizontalBoxes(np.stack(bboxes, axis=0)) + + # TODO: Move to mmdet + def clip_polygons(self, gt_masks: PolygonMasks, height: int, + width: int) -> PolygonMasks: + """Function to clip points of polygons with height and width. + + Args: + gt_masks (PolygonMasks): Annotations of instance segmentation. + height (int): height of clip border. + width (int): width of clip border. + Return: + clipped_masks (PolygonMasks): + Clip annotations of instance segmentation. + """ + if len(gt_masks) == 0: + clipped_masks = PolygonMasks([], height, width) + else: + clipped_masks = [] + for poly_per_obj in gt_masks: + clipped_poly_per_obj = [] + for p in poly_per_obj: + p = p.copy() + p[0::2] = p[0::2].clip(0, width) + p[1::2] = p[1::2].clip(0, height) + clipped_poly_per_obj.append(p) + clipped_masks.append(clipped_poly_per_obj) + clipped_masks = PolygonMasks(clipped_masks, height, width) + return clipped_masks + + @staticmethod + def warp_poly(poly: np.ndarray, warp_matrix: np.ndarray, img_w: int, + img_h: int) -> np.ndarray: + """Function to warp one mask and filter points outside image. + + Args: + poly (np.ndarray): Segmentation annotation with shape (n, ) and + with format (x1, y1, x2, y2, ...). 
+ warp_matrix (np.ndarray): Affine transformation matrix. + Shape: (3, 3). + img_w (int): Width of output image. + img_h (int): Height of output image. + """ + # TODO: Current logic may cause retained masks unusable for + # semantic segmentation training, which is same as official + # implementation. + poly = poly.reshape((-1, 2)) + poly = np.concatenate((poly, np.ones( + (len(poly), 1), dtype=poly.dtype)), + axis=-1) + # transform poly + poly = poly @ warp_matrix.T + poly = poly[:, :2] / poly[:, 2:3] + + return poly.reshape(-1) + + def warp_mask(self, gt_masks: PolygonMasks, warp_matrix: np.ndarray, + img_w: int, img_h: int) -> PolygonMasks: + """Warp masks by warp_matrix and retain masks inside image after + warping. + + Args: + gt_masks (PolygonMasks): Annotations of semantic segmentation. + warp_matrix (np.ndarray): Affine transformation matrix. + Shape: (3, 3). + img_w (int): Width of output image. + img_h (int): Height of output image. + + Returns: + PolygonMasks: Masks after warping. + """ + masks = gt_masks.masks + + new_masks = [] + for poly_per_obj in masks: + warpped_poly_per_obj = [] + # One gt may have multiple masks. + for poly in poly_per_obj: + valid_poly = self.warp_poly(poly, warp_matrix, img_w, img_h) + if len(valid_poly): + warpped_poly_per_obj.append(valid_poly.reshape(-1)) + # If all the masks are invalid, + # add [0, 0, 0, 0, 0, 0,] here. + if not warpped_poly_per_obj: + # This will be filtered in function `filter_gt_bboxes`. + warpped_poly_per_obj = [ + np.zeros(6, dtype=poly_per_obj[0].dtype) + ] + new_masks.append(warpped_poly_per_obj) + + gt_masks = PolygonMasks(new_masks, img_h, img_w) + return gt_masks + + def resample_masks(self, gt_masks: PolygonMasks) -> PolygonMasks: + """Function to resample each mask annotation with shape (2 * n, ) to + shape (resample_num * 2, ). + + Args: + gt_masks (PolygonMasks): Annotations of semantic segmentation. + """ + masks = gt_masks.masks + new_masks = [] + for poly_per_obj in masks: + resample_poly_per_obj = [] + for poly in poly_per_obj: + poly = poly.reshape((-1, 2)) # xy + poly = np.concatenate((poly, poly[0:1, :]), axis=0) + x = np.linspace(0, len(poly) - 1, self.resample_num) + xp = np.arange(len(poly)) + poly = np.concatenate([ + np.interp(x, xp, poly[:, i]) for i in range(2) + ]).reshape(2, -1).T.reshape(-1) + resample_poly_per_obj.append(poly) + new_masks.append(resample_poly_per_obj) + return PolygonMasks(new_masks, gt_masks.height, gt_masks.width) + + def filter_gt_bboxes(self, origin_bboxes: HorizontalBoxes, + wrapped_bboxes: HorizontalBoxes) -> torch.Tensor: + """Filter gt bboxes. + + Args: + origin_bboxes (HorizontalBoxes): Origin bboxes. + wrapped_bboxes (HorizontalBoxes): Wrapped bboxes + + Returns: + dict: The result dict. + """ + origin_w = origin_bboxes.widths + origin_h = origin_bboxes.heights + wrapped_w = wrapped_bboxes.widths + wrapped_h = wrapped_bboxes.heights + aspect_ratio = np.maximum(wrapped_w / (wrapped_h + 1e-16), + wrapped_h / (wrapped_w + 1e-16)) + + wh_valid_idx = (wrapped_w > self.min_bbox_size) & \ + (wrapped_h > self.min_bbox_size) + area_valid_idx = wrapped_w * wrapped_h / (origin_w * origin_h + + 1e-16) > self.min_area_ratio + aspect_ratio_valid_idx = aspect_ratio < self.max_aspect_ratio + return wh_valid_idx & area_valid_idx & aspect_ratio_valid_idx + + @cache_randomness + def _get_random_homography_matrix(self, height: int, + width: int) -> Tuple[np.ndarray, float]: + """Get random homography matrix. + + Args: + height (int): Image height. + width (int): Image width. 
+ + Returns: + Tuple[np.ndarray, float]: The result of warp_matrix and + scaling_ratio. + """ + # Rotation + rotation_degree = random.uniform(-self.max_rotate_degree, + self.max_rotate_degree) + rotation_matrix = self._get_rotation_matrix(rotation_degree) + + # Scaling + scaling_ratio = random.uniform(self.scaling_ratio_range[0], + self.scaling_ratio_range[1]) + scaling_matrix = self._get_scaling_matrix(scaling_ratio) + + # Shear + x_degree = random.uniform(-self.max_shear_degree, + self.max_shear_degree) + y_degree = random.uniform(-self.max_shear_degree, + self.max_shear_degree) + shear_matrix = self._get_shear_matrix(x_degree, y_degree) + + # Translation + trans_x = random.uniform(0.5 - self.max_translate_ratio, + 0.5 + self.max_translate_ratio) * width + trans_y = random.uniform(0.5 - self.max_translate_ratio, + 0.5 + self.max_translate_ratio) * height + translate_matrix = self._get_translation_matrix(trans_x, trans_y) + warp_matrix = ( + translate_matrix @ shear_matrix @ rotation_matrix @ scaling_matrix) + return warp_matrix, scaling_ratio + + @staticmethod + def _get_rotation_matrix(rotate_degrees: float) -> np.ndarray: + """Get rotation matrix. + + Args: + rotate_degrees (float): Rotate degrees. + + Returns: + np.ndarray: The rotation matrix. + """ + radian = math.radians(rotate_degrees) + rotation_matrix = np.array( + [[np.cos(radian), -np.sin(radian), 0.], + [np.sin(radian), np.cos(radian), 0.], [0., 0., 1.]], + dtype=np.float32) + return rotation_matrix + + @staticmethod + def _get_scaling_matrix(scale_ratio: float) -> np.ndarray: + """Get scaling matrix. + + Args: + scale_ratio (float): Scale ratio. + + Returns: + np.ndarray: The scaling matrix. + """ + scaling_matrix = np.array( + [[scale_ratio, 0., 0.], [0., scale_ratio, 0.], [0., 0., 1.]], + dtype=np.float32) + return scaling_matrix + + @staticmethod + def _get_shear_matrix(x_shear_degrees: float, + y_shear_degrees: float) -> np.ndarray: + """Get shear matrix. + + Args: + x_shear_degrees (float): X shear degrees. + y_shear_degrees (float): Y shear degrees. + + Returns: + np.ndarray: The shear matrix. + """ + x_radian = math.radians(x_shear_degrees) + y_radian = math.radians(y_shear_degrees) + shear_matrix = np.array([[1, np.tan(x_radian), 0.], + [np.tan(y_radian), 1, 0.], [0., 0., 1.]], + dtype=np.float32) + return shear_matrix + + @staticmethod + def _get_translation_matrix(x: float, y: float) -> np.ndarray: + """Get translation matrix. + + Args: + x (float): X translation. + y (float): Y translation. + + Returns: + np.ndarray: The translation matrix. + """ + translation_matrix = np.array([[1, 0., x], [0., 1, y], [0., 0., 1.]], + dtype=np.float32) + return translation_matrix + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(max_rotate_degree={self.max_rotate_degree}, ' + repr_str += f'max_translate_ratio={self.max_translate_ratio}, ' + repr_str += f'scaling_ratio_range={self.scaling_ratio_range}, ' + repr_str += f'max_shear_degree={self.max_shear_degree}, ' + repr_str += f'border={self.border}, ' + repr_str += f'border_val={self.border_val}, ' + repr_str += f'bbox_clip_border={self.bbox_clip_border})' + return repr_str + + +@TRANSFORMS.register_module() +class PPYOLOERandomDistort(BaseTransform): + """Random hue, saturation, contrast and brightness distortion. + + Required Keys: + + - img + + Modified Keys: + + - img (np.float32) + + Args: + hue_cfg (dict): Hue settings. Defaults to dict(min=-18, + max=18, prob=0.5). + saturation_cfg (dict): Saturation settings. 
Defaults to dict( + min=0.5, max=1.5, prob=0.5). + contrast_cfg (dict): Contrast settings. Defaults to dict( + min=0.5, max=1.5, prob=0.5). + brightness_cfg (dict): Brightness settings. Defaults to dict( + min=0.5, max=1.5, prob=0.5). + num_distort_func (int): The number of distort function. Defaults + to 4. + """ + + def __init__(self, + hue_cfg: dict = dict(min=-18, max=18, prob=0.5), + saturation_cfg: dict = dict(min=0.5, max=1.5, prob=0.5), + contrast_cfg: dict = dict(min=0.5, max=1.5, prob=0.5), + brightness_cfg: dict = dict(min=0.5, max=1.5, prob=0.5), + num_distort_func: int = 4): + self.hue_cfg = hue_cfg + self.saturation_cfg = saturation_cfg + self.contrast_cfg = contrast_cfg + self.brightness_cfg = brightness_cfg + self.num_distort_func = num_distort_func + assert 0 < self.num_distort_func <= 4, \ + 'num_distort_func must > 0 and <= 4' + for cfg in [ + self.hue_cfg, self.saturation_cfg, self.contrast_cfg, + self.brightness_cfg + ]: + assert 0. <= cfg['prob'] <= 1., 'prob must >=0 and <=1' + + def transform_hue(self, results): + """Transform hue randomly.""" + if random.uniform(0., 1.) >= self.hue_cfg['prob']: + return results + img = results['img'] + delta = random.uniform(self.hue_cfg['min'], self.hue_cfg['max']) + u = np.cos(delta * np.pi) + w = np.sin(delta * np.pi) + delta_iq = np.array([[1.0, 0.0, 0.0], [0.0, u, -w], [0.0, w, u]]) + rgb2yiq_matrix = np.array([[0.114, 0.587, 0.299], + [-0.321, -0.274, 0.596], + [0.311, -0.523, 0.211]]) + yiq2rgb_matric = np.array([[1.0, -1.107, 1.705], [1.0, -0.272, -0.647], + [1.0, 0.956, 0.621]]) + t = np.dot(np.dot(yiq2rgb_matric, delta_iq), rgb2yiq_matrix).T + img = np.dot(img, t) + results['img'] = img + return results + + def transform_saturation(self, results): + """Transform saturation randomly.""" + if random.uniform(0., 1.) >= self.saturation_cfg['prob']: + return results + img = results['img'] + delta = random.uniform(self.saturation_cfg['min'], + self.saturation_cfg['max']) + + # convert bgr img to gray img + gray = img * np.array([[[0.114, 0.587, 0.299]]], dtype=np.float32) + gray = gray.sum(axis=2, keepdims=True) + gray *= (1.0 - delta) + img *= delta + img += gray + results['img'] = img + return results + + def transform_contrast(self, results): + """Transform contrast randomly.""" + if random.uniform(0., 1.) >= self.contrast_cfg['prob']: + return results + img = results['img'] + delta = random.uniform(self.contrast_cfg['min'], + self.contrast_cfg['max']) + img *= delta + results['img'] = img + return results + + def transform_brightness(self, results): + """Transform brightness randomly.""" + if random.uniform(0., 1.) >= self.brightness_cfg['prob']: + return results + img = results['img'] + delta = random.uniform(self.brightness_cfg['min'], + self.brightness_cfg['max']) + img += delta + results['img'] = img + return results + + def transform(self, results: dict) -> dict: + """The hue, saturation, contrast and brightness distortion function. + + Args: + results (dict): The result dict. + + Returns: + dict: The result dict. 
+ """ + results['img'] = results['img'].astype(np.float32) + + functions = [ + self.transform_brightness, self.transform_contrast, + self.transform_saturation, self.transform_hue + ] + distortions = random.permutation(functions)[:self.num_distort_func] + for func in distortions: + results = func(results) + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(hue_cfg={self.hue_cfg}, ' + repr_str += f'saturation_cfg={self.saturation_cfg}, ' + repr_str += f'contrast_cfg={self.contrast_cfg}, ' + repr_str += f'brightness_cfg={self.brightness_cfg}, ' + repr_str += f'num_distort_func={self.num_distort_func})' + return repr_str + + +@TRANSFORMS.register_module() +class PPYOLOERandomCrop(BaseTransform): + """Random crop the img and bboxes. Different thresholds are used in PPYOLOE + to judge whether the clipped image meets the requirements. This + implementation is different from the implementation of RandomCrop in mmdet. + + Required Keys: + + - img + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_bboxes_labels (np.int64) (optional) + - gt_ignore_flags (bool) (optional) + + Modified Keys: + + - img + - img_shape + - gt_bboxes (optional) + - gt_bboxes_labels (optional) + - gt_ignore_flags (optional) + + Added Keys: + - pad_param (np.float32) + + Args: + aspect_ratio (List[float]): Aspect ratio of cropped region. Default to + [.5, 2]. + thresholds (List[float]): Iou thresholds for deciding a valid bbox crop + in [min, max] format. Defaults to [.0, .1, .3, .5, .7, .9]. + scaling (List[float]): Ratio between a cropped region and the original + image in [min, max] format. Default to [.3, 1.]. + num_attempts (int): Number of tries for each threshold before + giving up. Default to 50. + allow_no_crop (bool): Allow return without actually cropping them. + Default to True. + cover_all_box (bool): Ensure all bboxes are covered in the final crop. + Default to False. + """ + + def __init__(self, + aspect_ratio: List[float] = [.5, 2.], + thresholds: List[float] = [.0, .1, .3, .5, .7, .9], + scaling: List[float] = [.3, 1.], + num_attempts: int = 50, + allow_no_crop: bool = True, + cover_all_box: bool = False): + self.aspect_ratio = aspect_ratio + self.thresholds = thresholds + self.scaling = scaling + self.num_attempts = num_attempts + self.allow_no_crop = allow_no_crop + self.cover_all_box = cover_all_box + + def _crop_data(self, results: dict, crop_box: Tuple[int, int, int, int], + valid_inds: np.ndarray) -> Union[dict, None]: + """Function to randomly crop images, bounding boxes, masks, semantic + segmentation maps. + + Args: + results (dict): Result dict from loading pipeline. + crop_box (Tuple[int, int, int, int]): Expected absolute coordinates + for cropping, (x1, y1, x2, y2). + valid_inds (np.ndarray): The indexes of gt that needs to be + retained. + + Returns: + results (Union[dict, None]): Randomly cropped results, 'img_shape' + key in result dict is updated according to crop size. None will + be returned when there is no valid bbox after cropping. + """ + # crop the image + img = results['img'] + crop_x1, crop_y1, crop_x2, crop_y2 = crop_box + img = img[crop_y1:crop_y2, crop_x1:crop_x2, ...] 
+ results['img'] = img + img_shape = img.shape + results['img_shape'] = img.shape + + # crop bboxes accordingly and clip to the image boundary + if results.get('gt_bboxes', None) is not None: + bboxes = results['gt_bboxes'] + bboxes.translate_([-crop_x1, -crop_y1]) + bboxes.clip_(img_shape[:2]) + + results['gt_bboxes'] = bboxes[valid_inds] + + if results.get('gt_ignore_flags', None) is not None: + results['gt_ignore_flags'] = \ + results['gt_ignore_flags'][valid_inds] + + if results.get('gt_bboxes_labels', None) is not None: + results['gt_bboxes_labels'] = \ + results['gt_bboxes_labels'][valid_inds] + + if results.get('gt_masks', None) is not None: + results['gt_masks'] = results['gt_masks'][ + valid_inds.nonzero()[0]].crop( + np.asarray([crop_x1, crop_y1, crop_x2, crop_y2])) + + # crop semantic seg + if results.get('gt_seg_map', None) is not None: + results['gt_seg_map'] = results['gt_seg_map'][crop_y1:crop_y2, + crop_x1:crop_x2] + + return results + + @autocast_box_type() + def transform(self, results: dict) -> Union[dict, None]: + """The random crop transform function. + + Args: + results (dict): The result dict. + + Returns: + dict: The result dict. + """ + if results.get('gt_bboxes', None) is None or len( + results['gt_bboxes']) == 0: + return results + + orig_img_h, orig_img_w = results['img'].shape[:2] + gt_bboxes = results['gt_bboxes'] + + thresholds = list(self.thresholds) + if self.allow_no_crop: + thresholds.append('no_crop') + random.shuffle(thresholds) + + for thresh in thresholds: + # Determine the coordinates for cropping + if thresh == 'no_crop': + return results + + found = False + for i in range(self.num_attempts): + crop_h, crop_w = self._get_crop_size((orig_img_h, orig_img_w)) + if self.aspect_ratio is None: + if crop_h / crop_w < 0.5 or crop_h / crop_w > 2.0: + continue + + # get image crop_box + margin_h = max(orig_img_h - crop_h, 0) + margin_w = max(orig_img_w - crop_w, 0) + offset_h, offset_w = self._rand_offset((margin_h, margin_w)) + crop_y1, crop_y2 = offset_h, offset_h + crop_h + crop_x1, crop_x2 = offset_w, offset_w + crop_w + + crop_box = [crop_x1, crop_y1, crop_x2, crop_y2] + # Calculate the iou between gt_bboxes and crop_boxes + iou = self._iou_matrix(gt_bboxes, + np.array([crop_box], dtype=np.float32)) + # If the maximum value of the iou is less than thresh, + # the current crop_box is considered invalid. + if iou.max() < thresh: + continue + + # If cover_all_box == True and the minimum value of + # the iou is less than thresh, the current crop_box + # is considered invalid. + if self.cover_all_box and iou.min() < thresh: + continue + + # Get which gt_bboxes to keep after cropping. + valid_inds = self._get_valid_inds( + gt_bboxes, np.array(crop_box, dtype=np.float32)) + if valid_inds.size > 0: + found = True + break + + if found: + results = self._crop_data(results, crop_box, valid_inds) + return results + return results + + @cache_randomness + def _rand_offset(self, margin: Tuple[int, int]) -> Tuple[int, int]: + """Randomly generate crop offset. + + Args: + margin (Tuple[int, int]): The upper bound for the offset generated + randomly. + + Returns: + Tuple[int, int]: The random offset for the crop. + """ + margin_h, margin_w = margin + offset_h = np.random.randint(0, margin_h + 1) + offset_w = np.random.randint(0, margin_w + 1) + + return (offset_h, offset_w) + + @cache_randomness + def _get_crop_size(self, image_size: Tuple[int, int]) -> Tuple[int, int]: + """Randomly generates the crop size based on `image_size`. 
+ + Args: + image_size (Tuple[int, int]): (h, w). + + Returns: + crop_size (Tuple[int, int]): (crop_h, crop_w) in absolute pixels. + """ + h, w = image_size + scale = random.uniform(*self.scaling) + if self.aspect_ratio is not None: + min_ar, max_ar = self.aspect_ratio + aspect_ratio = random.uniform( + max(min_ar, scale**2), min(max_ar, scale**-2)) + h_scale = scale / np.sqrt(aspect_ratio) + w_scale = scale * np.sqrt(aspect_ratio) + else: + h_scale = random.uniform(*self.scaling) + w_scale = random.uniform(*self.scaling) + crop_h = h * h_scale + crop_w = w * w_scale + return int(crop_h), int(crop_w) + + def _iou_matrix(self, + gt_bbox: HorizontalBoxes, + crop_bbox: np.ndarray, + eps: float = 1e-10) -> np.ndarray: + """Calculate iou between gt and image crop box. + + Args: + gt_bbox (HorizontalBoxes): Ground truth bounding boxes. + crop_bbox (np.ndarray): Image crop coordinates in + [x1, y1, x2, y2] format. + eps (float): Default to 1e-10. + Return: + (np.ndarray): IoU. + """ + gt_bbox = gt_bbox.tensor.numpy() + lefttop = np.maximum(gt_bbox[:, np.newaxis, :2], crop_bbox[:, :2]) + rightbottom = np.minimum(gt_bbox[:, np.newaxis, 2:], crop_bbox[:, 2:]) + + overlap = np.prod( + rightbottom - lefttop, + axis=2) * (lefttop < rightbottom).all(axis=2) + area_gt_bbox = np.prod(gt_bbox[:, 2:] - gt_bbox[:, :2], axis=1) + area_crop_bbox = np.prod(crop_bbox[:, 2:] - crop_bbox[:, :2], axis=1) + area_o = (area_gt_bbox[:, np.newaxis] + area_crop_bbox - overlap) + return overlap / (area_o + eps) + + def _get_valid_inds(self, gt_bbox: HorizontalBoxes, + img_crop_bbox: np.ndarray) -> np.ndarray: + """Get which Bboxes to keep at the current cropping coordinates. + + Args: + gt_bbox (HorizontalBoxes): Ground truth bounding boxes. + img_crop_bbox (np.ndarray): Image crop coordinates in + [x1, y1, x2, y2] format. + + Returns: + (np.ndarray): Valid indexes. + """ + cropped_box = gt_bbox.tensor.numpy().copy() + gt_bbox = gt_bbox.tensor.numpy().copy() + + cropped_box[:, :2] = np.maximum(gt_bbox[:, :2], img_crop_bbox[:2]) + cropped_box[:, 2:] = np.minimum(gt_bbox[:, 2:], img_crop_bbox[2:]) + cropped_box[:, :2] -= img_crop_bbox[:2] + cropped_box[:, 2:] -= img_crop_bbox[:2] + + centers = (gt_bbox[:, :2] + gt_bbox[:, 2:]) / 2 + valid = np.logical_and(img_crop_bbox[:2] <= centers, + centers < img_crop_bbox[2:]).all(axis=1) + valid = np.logical_and( + valid, (cropped_box[:, :2] < cropped_box[:, 2:]).all(axis=1)) + + return np.where(valid)[0] + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(aspect_ratio={self.aspect_ratio}, ' + repr_str += f'thresholds={self.thresholds}, ' + repr_str += f'scaling={self.scaling}, ' + repr_str += f'num_attempts={self.num_attempts}, ' + repr_str += f'allow_no_crop={self.allow_no_crop}, ' + repr_str += f'cover_all_box={self.cover_all_box})' + return repr_str + + +@TRANSFORMS.register_module() +class YOLOv5CopyPaste(BaseTransform): + """Copy-Paste used in YOLOv5 and YOLOv8. + + This transform randomly copy some objects in the image to the mirror + position of the image.It is different from the `CopyPaste` in mmdet. + + Required Keys: + + - img (np.uint8) + - gt_bboxes (BaseBoxes[torch.float32]) + - gt_bboxes_labels (np.int64) (optional) + - gt_ignore_flags (bool) (optional) + - gt_masks (PolygonMasks) (optional) + + Modified Keys: + + - img + - gt_bboxes + - gt_bboxes_labels (np.int64) (optional) + - gt_ignore_flags (optional) + - gt_masks (optional) + + Args: + ioa_thresh (float): Ioa thresholds for deciding valid bbox. 
+ prob (float): Probability of choosing objects. + Defaults to 0.5. + """ + + def __init__(self, ioa_thresh: float = 0.3, prob: float = 0.5): + self.ioa_thresh = ioa_thresh + self.prob = prob + + @autocast_box_type() + def transform(self, results: dict) -> Union[dict, None]: + """The YOLOv5 and YOLOv8 Copy-Paste transform function. + + Args: + results (dict): The result dict. + + Returns: + dict: The result dict. + """ + if len(results.get('gt_masks', [])) == 0: + return results + gt_masks = results['gt_masks'] + assert isinstance(gt_masks, PolygonMasks), \ + 'only support type of PolygonMasks,' \ + ' but get type: %s' % type(gt_masks) + gt_bboxes = results['gt_bboxes'] + gt_bboxes_labels = results.get('gt_bboxes_labels', None) + img = results['img'] + img_h, img_w = img.shape[:2] + + # calculate ioa + gt_bboxes_flip = deepcopy(gt_bboxes) + gt_bboxes_flip.flip_(img.shape) + + ioa = self.bbox_ioa(gt_bboxes_flip, gt_bboxes) + indexes = torch.nonzero((ioa < self.ioa_thresh).all(1))[:, 0] + n = len(indexes) + valid_inds = random.choice( + indexes, size=round(self.prob * n), replace=False) + if len(valid_inds) == 0: + return results + + if gt_bboxes_labels is not None: + # prepare labels + gt_bboxes_labels = np.concatenate( + (gt_bboxes_labels, gt_bboxes_labels[valid_inds]), axis=0) + + # prepare bboxes + copypaste_bboxes = gt_bboxes_flip[valid_inds] + gt_bboxes = gt_bboxes.cat([gt_bboxes, copypaste_bboxes]) + + # prepare images + copypaste_gt_masks = gt_masks[valid_inds] + copypaste_gt_masks_flip = copypaste_gt_masks.flip() + # convert poly format to bitmap format + # example: poly: [[array(0.0, 0.0, 10.0, 0.0, 10.0, 10.0, 0.0, 10.0]] + # -> bitmap: a mask with shape equal to (1, img_h, img_w) + # # type1 low speed + # copypaste_gt_masks_bitmap = copypaste_gt_masks.to_ndarray() + # copypaste_mask = np.sum(copypaste_gt_masks_bitmap, axis=0) > 0 + + # type2 + copypaste_mask = np.zeros((img_h, img_w), dtype=np.uint8) + for poly in copypaste_gt_masks.masks: + poly = [i.reshape((-1, 1, 2)).astype(np.int32) for i in poly] + cv2.drawContours(copypaste_mask, poly, -1, (1, ), cv2.FILLED) + + copypaste_mask = copypaste_mask.astype(bool) + + # copy objects, and paste to the mirror position of the image + copypaste_mask_flip = mmcv.imflip( + copypaste_mask, direction='horizontal') + copypaste_img = mmcv.imflip(img, direction='horizontal') + img[copypaste_mask_flip] = copypaste_img[copypaste_mask_flip] + + # prepare masks + gt_masks = copypaste_gt_masks.cat([gt_masks, copypaste_gt_masks_flip]) + + if 'gt_ignore_flags' in results: + # prepare gt_ignore_flags + gt_ignore_flags = results['gt_ignore_flags'] + gt_ignore_flags = np.concatenate( + [gt_ignore_flags, gt_ignore_flags[valid_inds]], axis=0) + results['gt_ignore_flags'] = gt_ignore_flags + + results['img'] = img + results['gt_bboxes'] = gt_bboxes + if gt_bboxes_labels is not None: + results['gt_bboxes_labels'] = gt_bboxes_labels + results['gt_masks'] = gt_masks + + return results + + @staticmethod + def bbox_ioa(gt_bboxes_flip: HorizontalBoxes, + gt_bboxes: HorizontalBoxes, + eps: float = 1e-7) -> np.ndarray: + """Calculate ioa between gt_bboxes_flip and gt_bboxes. + + Args: + gt_bboxes_flip (HorizontalBoxes): Flipped ground truth + bounding boxes. + gt_bboxes (HorizontalBoxes): Ground truth bounding boxes. + eps (float): Default to 1e-10. + Return: + (Tensor): Ioa. 
+ """ + gt_bboxes_flip = gt_bboxes_flip.tensor + gt_bboxes = gt_bboxes.tensor + + # Get the coordinates of bounding boxes + b1_x1, b1_y1, b1_x2, b1_y2 = gt_bboxes_flip.T + b2_x1, b2_y1, b2_x2, b2_y2 = gt_bboxes.T + + # Intersection area + inter_area = (torch.minimum(b1_x2[:, None], + b2_x2) - torch.maximum(b1_x1[:, None], + b2_x1)).clip(0) * \ + (torch.minimum(b1_y2[:, None], + b2_y2) - torch.maximum(b1_y1[:, None], + b2_y1)).clip(0) + + # box2 area + box2_area = (b2_x2 - b2_x1) * (b2_y2 - b2_y1) + eps + + # Intersection over box2 area + return inter_area / box2_area + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(ioa_thresh={self.ioa_thresh},' + repr_str += f'prob={self.prob})' + return repr_str + + +@TRANSFORMS.register_module() +class RemoveDataElement(BaseTransform): + """Remove unnecessary data element in results. + + Args: + keys (Union[str, Sequence[str]]): Keys need to be removed. + """ + + def __init__(self, keys: Union[str, Sequence[str]]): + self.keys = [keys] if isinstance(keys, str) else keys + + def transform(self, results: dict) -> dict: + for key in self.keys: + results.pop(key, None) + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(keys={self.keys})' + return repr_str + + +@TRANSFORMS.register_module() +class RegularizeRotatedBox(BaseTransform): + """Regularize rotated boxes. + + Due to the angle periodicity, one rotated box can be represented in + many different (x, y, w, h, t). To make each rotated box unique, + ``regularize_boxes`` will take the remainder of the angle divided by + 180 degrees. + + For convenience, three angle_version can be used here: + + - 'oc': OpenCV Definition. Has the same box representation as + ``cv2.minAreaRect`` the angle ranges in [-90, 0). + - 'le90': Long Edge Definition (90). the angle ranges in [-90, 90). + The width is always longer than the height. + - 'le135': Long Edge Definition (135). the angle ranges in [-45, 135). + The width is always longer than the height. + + Required Keys: + + - gt_bboxes (RotatedBoxes[torch.float32]) + + Modified Keys: + + - gt_bboxes + + Args: + angle_version (str): Angle version. Can only be 'oc', + 'le90', or 'le135'. Defaults to 'le90. + """ + + def __init__(self, angle_version='le90') -> None: + self.angle_version = angle_version + try: + from mmrotate.structures.bbox import RotatedBoxes + self.box_type = RotatedBoxes + except ImportError: + raise ImportError( + 'Please run "mim install -r requirements/mmrotate.txt" ' + 'to install mmrotate first for rotated detection.') + + def transform(self, results: dict) -> dict: + assert isinstance(results['gt_bboxes'], self.box_type) + results['gt_bboxes'] = self.box_type( + results['gt_bboxes'].regularize_boxes(self.angle_version)) + return results + + +@TRANSFORMS.register_module() +class Polygon2Mask(BaseTransform): + """Polygons to bitmaps in YOLOv5. + + Args: + downsample_ratio (int): Downsample ratio of mask. + mask_overlap (bool): Whether to use maskoverlap in mask process. + When set to True, the implementation here is the same as the + official, with higher training speed. If set to True, all gt masks + will compress into one overlap mask, the value of mask indicates + the index of gt masks. If set to False, one mask is a binary mask. + Default to True. + coco_style (bool): Whether to use coco_style to convert the polygons to + bitmaps. Note that this option is only used to test if there is an + improvement in training speed and we recommend setting it to False. 
+ """ + + def __init__(self, + downsample_ratio: int = 4, + mask_overlap: bool = True, + coco_style: bool = False): + self.downsample_ratio = downsample_ratio + self.mask_overlap = mask_overlap + self.coco_style = coco_style + + def polygon2mask(self, + img_shape: Tuple[int, int], + polygons: np.ndarray, + color: int = 1) -> np.ndarray: + """ + Args: + img_shape (tuple): The image size. + polygons (np.ndarray): [N, M], N is the number of polygons, + M is the number of points(Be divided by 2). + color (int): color in fillPoly. + Return: + np.ndarray: the overlap mask. + """ + nh, nw = (img_shape[0] // self.downsample_ratio, + img_shape[1] // self.downsample_ratio) + if self.coco_style: + # This practice can lead to the loss of small objects + # polygons = polygons.resize((nh, nw)).masks + # polygons = np.asarray(polygons).reshape(-1) + # mask = polygon_to_bitmap([polygons], nh, nw) + + polygons = np.asarray(polygons).reshape(-1) + mask = polygon_to_bitmap([polygons], img_shape[0], + img_shape[1]).astype(np.uint8) + mask = mmcv.imresize(mask, (nw, nh)) + else: + mask = np.zeros(img_shape, dtype=np.uint8) + polygons = np.asarray(polygons) + polygons = polygons.astype(np.int32) + shape = polygons.shape + polygons = polygons.reshape(shape[0], -1, 2) + cv2.fillPoly(mask, polygons, color=color) + # NOTE: fillPoly firstly then resize is trying the keep the same + # way of loss calculation when mask-ratio=1. + mask = mmcv.imresize(mask, (nw, nh)) + return mask + + def polygons2masks(self, + img_shape: Tuple[int, int], + polygons: PolygonMasks, + color: int = 1) -> np.ndarray: + """Return a list of bitmap masks. + + Args: + img_shape (tuple): The image size. + polygons (PolygonMasks): The mask annotations. + color (int): color in fillPoly. + Return: + List[np.ndarray]: the list of masks in bitmaps. + """ + if self.coco_style: + nh, nw = (img_shape[0] // self.downsample_ratio, + img_shape[1] // self.downsample_ratio) + masks = polygons.resize((nh, nw)).to_ndarray() + return masks + else: + masks = [] + for si in range(len(polygons)): + mask = self.polygon2mask(img_shape, polygons[si], color) + masks.append(mask) + return np.array(masks) + + def polygons2masks_overlap( + self, img_shape: Tuple[int, int], + polygons: PolygonMasks) -> Tuple[np.ndarray, np.ndarray]: + """Return a overlap mask and the sorted idx of area. + + Args: + img_shape (tuple): The image size. + polygons (PolygonMasks): The mask annotations. + color (int): color in fillPoly. + Return: + Tuple[np.ndarray, np.ndarray]: + the overlap mask and the sorted idx of area. 
+ """ + masks = np.zeros((img_shape[0] // self.downsample_ratio, + img_shape[1] // self.downsample_ratio), + dtype=np.int32 if len(polygons) > 255 else np.uint8) + areas = [] + ms = [] + for si in range(len(polygons)): + mask = self.polygon2mask(img_shape, polygons[si], color=1) + ms.append(mask) + areas.append(mask.sum()) + areas = np.asarray(areas) + index = np.argsort(-areas) + ms = np.array(ms)[index] + for i in range(len(polygons)): + mask = ms[i] * (i + 1) + masks = masks + mask + masks = np.clip(masks, a_min=0, a_max=i + 1) + return masks, index + + def transform(self, results: dict) -> dict: + gt_masks = results['gt_masks'] + assert isinstance(gt_masks, PolygonMasks) + + if self.mask_overlap: + masks, sorted_idx = self.polygons2masks_overlap( + (gt_masks.height, gt_masks.width), gt_masks) + results['gt_bboxes'] = results['gt_bboxes'][sorted_idx] + results['gt_bboxes_labels'] = results['gt_bboxes_labels'][ + sorted_idx] + + # In this case we put gt_masks in gt_panoptic_seg + results.pop('gt_masks') + results['gt_panoptic_seg'] = torch.from_numpy(masks[None]) + else: + masks = self.polygons2masks((gt_masks.height, gt_masks.width), + gt_masks, + color=1) + masks = torch.from_numpy(masks) + # Consistent logic with mmdet + results['gt_masks'] = masks + return results + + +@TRANSFORMS.register_module() +class FilterAnnotations(FilterDetAnnotations): + """Filter invalid annotations. + + In addition to the conditions checked by ``FilterDetAnnotations``, this + filter adds a new condition requiring instances to have at least one + visible keypoints. + """ + + def __init__(self, by_keypoints: bool = False, **kwargs) -> None: + # TODO: add more filter options + super().__init__(**kwargs) + self.by_keypoints = by_keypoints + + @autocast_box_type() + def transform(self, results: dict) -> Union[dict, None]: + """Transform function to filter annotations. + + Args: + results (dict): Result dict. + Returns: + dict: Updated result dict. 
+ """ + assert 'gt_bboxes' in results + gt_bboxes = results['gt_bboxes'] + if gt_bboxes.shape[0] == 0: + return results + + tests = [] + if self.by_box: + tests.append( + ((gt_bboxes.widths > self.min_gt_bbox_wh[0]) & + (gt_bboxes.heights > self.min_gt_bbox_wh[1])).numpy()) + + if self.by_mask: + assert 'gt_masks' in results + gt_masks = results['gt_masks'] + tests.append(gt_masks.areas >= self.min_gt_mask_area) + + if self.by_keypoints: + assert 'gt_keypoints' in results + num_keypoints = results['gt_keypoints'].num_keypoints + tests.append((num_keypoints > 0).numpy()) + + keep = tests[0] + for t in tests[1:]: + keep = keep & t + + if not keep.any(): + if self.keep_empty: + return None + + keys = ('gt_bboxes', 'gt_bboxes_labels', 'gt_masks', 'gt_ignore_flags', + 'gt_keypoints') + for key in keys: + if key in results: + results[key] = results[key][keep] + + return results + + +# TODO: Check if it can be merged with mmdet.YOLOXHSVRandomAug +@TRANSFORMS.register_module() +class RandomAffine(MMDET_RandomAffine): + + def __init__(self, **kwargs) -> None: + super().__init__(**kwargs) + + @autocast_box_type() + def transform(self, results: dict) -> dict: + img = results['img'] + height = img.shape[0] + self.border[1] * 2 + width = img.shape[1] + self.border[0] * 2 + + warp_matrix = self._get_random_homography_matrix(height, width) + + img = cv2.warpPerspective( + img, + warp_matrix, + dsize=(width, height), + borderValue=self.border_val) + results['img'] = img + results['img_shape'] = img.shape + + bboxes = results['gt_bboxes'] + num_bboxes = len(bboxes) + if num_bboxes: + bboxes.project_(warp_matrix) + if self.bbox_clip_border: + bboxes.clip_([height, width]) + # remove outside bbox + valid_index = bboxes.is_inside([height, width]).numpy() + results['gt_bboxes'] = bboxes[valid_index] + results['gt_bboxes_labels'] = results['gt_bboxes_labels'][ + valid_index] + results['gt_ignore_flags'] = results['gt_ignore_flags'][ + valid_index] + + if 'gt_masks' in results: + raise NotImplementedError('RandomAffine only supports bbox.') + + if 'gt_keypoints' in results: + keypoints = results['gt_keypoints'] + keypoints.project_(warp_matrix) + if self.bbox_clip_border: + keypoints.clip_([height, width]) + results['gt_keypoints'] = keypoints[valid_index] + + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(hue_delta={self.hue_delta}, ' + repr_str += f'saturation_delta={self.saturation_delta}, ' + repr_str += f'value_delta={self.value_delta})' + return repr_str + + +# TODO: Check if it can be merged with mmdet.YOLOXHSVRandomAug +@TRANSFORMS.register_module() +class RandomFlip(MMDET_RandomFlip): + + @autocast_box_type() + def _flip(self, results: dict) -> None: + """Flip images, bounding boxes, and semantic segmentation map.""" + # flip image + results['img'] = mmcv.imflip( + results['img'], direction=results['flip_direction']) + + img_shape = results['img'].shape[:2] + + # flip bboxes + if results.get('gt_bboxes', None) is not None: + results['gt_bboxes'].flip_(img_shape, results['flip_direction']) + + # flip keypoints + if results.get('gt_keypoints', None) is not None: + results['gt_keypoints'].flip_(img_shape, results['flip_direction']) + + # flip masks + if results.get('gt_masks', None) is not None: + results['gt_masks'] = results['gt_masks'].flip( + results['flip_direction']) + + # flip segs + if results.get('gt_seg_map', None) is not None: + results['gt_seg_map'] = mmcv.imflip( + results['gt_seg_map'], direction=results['flip_direction']) + + # record 
homography matrix for flip + self._record_homography_matrix(results) + + +@TRANSFORMS.register_module() +class Resize(MMDET_Resize): + + def _resize_keypoints(self, results: dict) -> None: + """Resize bounding boxes with ``results['scale_factor']``.""" + if results.get('gt_keypoints', None) is not None: + results['gt_keypoints'].rescale_(results['scale_factor']) + if self.clip_object_border: + results['gt_keypoints'].clip_(results['img_shape']) + + @autocast_box_type() + def transform(self, results: dict) -> dict: + """Transform function to resize images, bounding boxes and semantic + segmentation map. + + Args: + results (dict): Result dict from loading pipeline. + Returns: + dict: Resized results, 'img', 'gt_bboxes', 'gt_seg_map', + 'scale', 'scale_factor', 'height', 'width', and 'keep_ratio' keys + are updated in result dict. + """ + if self.scale: + results['scale'] = self.scale + else: + img_shape = results['img'].shape[:2] + results['scale'] = _scale_size(img_shape[::-1], self.scale_factor) + self._resize_img(results) + self._resize_bboxes(results) + self._resize_keypoints(results) + self._resize_masks(results) + self._resize_seg(results) + self._record_homography_matrix(results) + return results diff --git a/third_party/mmyolo/mmyolo/datasets/utils.py b/third_party/mmyolo/mmyolo/datasets/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..efa2ff5ef07d73e82c258474db7b0e49edc4825a --- /dev/null +++ b/third_party/mmyolo/mmyolo/datasets/utils.py @@ -0,0 +1,133 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Sequence + +import numpy as np +import torch +from mmengine.dataset import COLLATE_FUNCTIONS +from mmengine.dist import get_dist_info + +from ..registry import TASK_UTILS + + +@COLLATE_FUNCTIONS.register_module() +def yolov5_collate(data_batch: Sequence, + use_ms_training: bool = False) -> dict: + """Rewrite collate_fn to get faster training speed. + + Args: + data_batch (Sequence): Batch of data. + use_ms_training (bool): Whether to use multi-scale training. 
+    """
+    batch_imgs = []
+    batch_bboxes_labels = []
+    batch_masks = []
+    batch_keypoints = []
+    batch_keypoints_visible = []
+    for i in range(len(data_batch)):
+        datasamples = data_batch[i]['data_samples']
+        inputs = data_batch[i]['inputs']
+        batch_imgs.append(inputs)
+
+        gt_bboxes = datasamples.gt_instances.bboxes.tensor
+        gt_labels = datasamples.gt_instances.labels
+        if 'masks' in datasamples.gt_instances:
+            masks = datasamples.gt_instances.masks
+            batch_masks.append(masks)
+        if 'gt_panoptic_seg' in datasamples:
+            batch_masks.append(datasamples.gt_panoptic_seg.pan_seg)
+        if 'keypoints' in datasamples.gt_instances:
+            keypoints = datasamples.gt_instances.keypoints
+            keypoints_visible = datasamples.gt_instances.keypoints_visible
+            batch_keypoints.append(keypoints)
+            batch_keypoints_visible.append(keypoints_visible)
+
+        batch_idx = gt_labels.new_full((len(gt_labels), 1), i)
+        bboxes_labels = torch.cat((batch_idx, gt_labels[:, None], gt_bboxes),
+                                  dim=1)
+        batch_bboxes_labels.append(bboxes_labels)
+    collated_results = {
+        'data_samples': {
+            'bboxes_labels': torch.cat(batch_bboxes_labels, 0)
+        }
+    }
+    if len(batch_masks) > 0:
+        collated_results['data_samples']['masks'] = torch.cat(batch_masks, 0)
+
+    if len(batch_keypoints) > 0:
+        collated_results['data_samples']['keypoints'] = torch.cat(
+            batch_keypoints, 0)
+        collated_results['data_samples']['keypoints_visible'] = torch.cat(
+            batch_keypoints_visible, 0)
+
+    if use_ms_training:
+        collated_results['inputs'] = batch_imgs
+    else:
+        collated_results['inputs'] = torch.stack(batch_imgs, 0)
+    return collated_results
+
+
+@TASK_UTILS.register_module()
+class BatchShapePolicy:
+    """BatchShapePolicy is only used in the testing phase; it reduces the
+    number of padded pixels during batch inference.
+
+    Args:
+        batch_size (int): Single GPU batch size during batch inference.
+            Defaults to 32.
+        img_size (int): Expected output image size. Defaults to 640.
+        size_divisor (int): The minimum size that is divisible
+            by size_divisor. Defaults to 32.
+        extra_pad_ratio (float): Extra pad ratio. Defaults to 0.5.
+    """
+
+    def __init__(self,
+                 batch_size: int = 32,
+                 img_size: int = 640,
+                 size_divisor: int = 32,
+                 extra_pad_ratio: float = 0.5):
+        self.img_size = img_size
+        self.size_divisor = size_divisor
+        self.extra_pad_ratio = extra_pad_ratio
+        _, world_size = get_dist_info()
+        # During multi-GPU testing, the batch size should be multiplied by
+        # the world size so that the number of batches can be calculated
+        # correctly. The index of batches will affect the calculation of
+        # batch shape.
+ self.batch_size = batch_size * world_size + + def __call__(self, data_list: List[dict]) -> List[dict]: + image_shapes = [] + for data_info in data_list: + image_shapes.append((data_info['width'], data_info['height'])) + + image_shapes = np.array(image_shapes, dtype=np.float64) + + n = len(image_shapes) # number of images + batch_index = np.floor(np.arange(n) / self.batch_size).astype( + np.int64) # batch index + number_of_batches = batch_index[-1] + 1 # number of batches + + aspect_ratio = image_shapes[:, 1] / image_shapes[:, 0] # aspect ratio + irect = aspect_ratio.argsort() + + data_list = [data_list[i] for i in irect] + + aspect_ratio = aspect_ratio[irect] + # Set training image shapes + shapes = [[1, 1]] * number_of_batches + for i in range(number_of_batches): + aspect_ratio_index = aspect_ratio[batch_index == i] + min_index, max_index = aspect_ratio_index.min( + ), aspect_ratio_index.max() + if max_index < 1: + shapes[i] = [max_index, 1] + elif min_index > 1: + shapes[i] = [1, 1 / min_index] + + batch_shapes = np.ceil( + np.array(shapes) * self.img_size / self.size_divisor + + self.extra_pad_ratio).astype(np.int64) * self.size_divisor + + for i, data_info in enumerate(data_list): + data_info['batch_shape'] = batch_shapes[batch_index[i]] + + return data_list diff --git a/third_party/mmyolo/mmyolo/datasets/yolov5_coco.py b/third_party/mmyolo/mmyolo/datasets/yolov5_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..55bc899abfcceebfdadf7549e56336725d891dcb --- /dev/null +++ b/third_party/mmyolo/mmyolo/datasets/yolov5_coco.py @@ -0,0 +1,65 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Any, Optional + +from mmdet.datasets import BaseDetDataset, CocoDataset + +from ..registry import DATASETS, TASK_UTILS + + +class BatchShapePolicyDataset(BaseDetDataset): + """Dataset with the batch shape policy that makes paddings with least + pixels during batch inference process, which does not require the image + scales of all batches to be the same throughout validation.""" + + def __init__(self, + *args, + batch_shapes_cfg: Optional[dict] = None, + **kwargs): + self.batch_shapes_cfg = batch_shapes_cfg + super().__init__(*args, **kwargs) + + def full_init(self): + """rewrite full_init() to be compatible with serialize_data in + BatchShapePolicy.""" + if self._fully_initialized: + return + # load data information + self.data_list = self.load_data_list() + + # batch_shapes_cfg + if self.batch_shapes_cfg: + batch_shapes_policy = TASK_UTILS.build(self.batch_shapes_cfg) + self.data_list = batch_shapes_policy(self.data_list) + del batch_shapes_policy + + # filter illegal data, such as data that has no annotations. + self.data_list = self.filter_data() + # Get subset data according to indices. + if self._indices is not None: + self.data_list = self._get_unserialized_subset(self._indices) + + # serialize data_list + if self.serialize_data: + self.data_bytes, self.data_address = self._serialize_data() + + self._fully_initialized = True + + def prepare_data(self, idx: int) -> Any: + """Pass the dataset to the pipeline during training to support mixed + data augmentation, such as Mosaic and MixUp.""" + if self.test_mode is False: + data_info = self.get_data_info(idx) + data_info['dataset'] = self + return self.pipeline(data_info) + else: + return super().prepare_data(idx) + + +@DATASETS.register_module() +class YOLOv5CocoDataset(BatchShapePolicyDataset, CocoDataset): + """Dataset for YOLOv5 COCO Dataset. 
+ + We only add `BatchShapePolicy` function compared with CocoDataset. See + `mmyolo/datasets/utils.py#BatchShapePolicy` for details + """ + pass diff --git a/third_party/mmyolo/mmyolo/datasets/yolov5_crowdhuman.py b/third_party/mmyolo/mmyolo/datasets/yolov5_crowdhuman.py new file mode 100644 index 0000000000000000000000000000000000000000..486a8324fb4c7d8a34bf885f1818d2e6f974f6e7 --- /dev/null +++ b/third_party/mmyolo/mmyolo/datasets/yolov5_crowdhuman.py @@ -0,0 +1,15 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.datasets import CrowdHumanDataset + +from ..registry import DATASETS +from .yolov5_coco import BatchShapePolicyDataset + + +@DATASETS.register_module() +class YOLOv5CrowdHumanDataset(BatchShapePolicyDataset, CrowdHumanDataset): + """Dataset for YOLOv5 CrowdHuman Dataset. + + We only add `BatchShapePolicy` function compared with CrowdHumanDataset. + See `mmyolo/datasets/utils.py#BatchShapePolicy` for details + """ + pass diff --git a/third_party/mmyolo/mmyolo/datasets/yolov5_dota.py b/third_party/mmyolo/mmyolo/datasets/yolov5_dota.py new file mode 100644 index 0000000000000000000000000000000000000000..a9647981333ed725a568a293279873ab9e20db47 --- /dev/null +++ b/third_party/mmyolo/mmyolo/datasets/yolov5_dota.py @@ -0,0 +1,29 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +from mmyolo.datasets.yolov5_coco import BatchShapePolicyDataset +from ..registry import DATASETS + +try: + from mmrotate.datasets import DOTADataset + MMROTATE_AVAILABLE = True +except ImportError: + from mmengine.dataset import BaseDataset + DOTADataset = BaseDataset + MMROTATE_AVAILABLE = False + + +@DATASETS.register_module() +class YOLOv5DOTADataset(BatchShapePolicyDataset, DOTADataset): + """Dataset for YOLOv5 DOTA Dataset. + + We only add `BatchShapePolicy` function compared with DOTADataset. See + `mmyolo/datasets/utils.py#BatchShapePolicy` for details + """ + + def __init__(self, *args, **kwargs): + if not MMROTATE_AVAILABLE: + raise ImportError( + 'Please run "mim install -r requirements/mmrotate.txt" ' + 'to install mmrotate first for rotated detection.') + + super().__init__(*args, **kwargs) diff --git a/third_party/mmyolo/mmyolo/datasets/yolov5_voc.py b/third_party/mmyolo/mmyolo/datasets/yolov5_voc.py new file mode 100644 index 0000000000000000000000000000000000000000..5be764f1db3097645ae1be387e45cafb1b460731 --- /dev/null +++ b/third_party/mmyolo/mmyolo/datasets/yolov5_voc.py @@ -0,0 +1,15 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.datasets import VOCDataset + +from mmyolo.datasets.yolov5_coco import BatchShapePolicyDataset +from ..registry import DATASETS + + +@DATASETS.register_module() +class YOLOv5VOCDataset(BatchShapePolicyDataset, VOCDataset): + """Dataset for YOLOv5 VOC Dataset. + + We only add `BatchShapePolicy` function compared with VOCDataset. See + `mmyolo/datasets/utils.py#BatchShapePolicy` for details + """ + pass diff --git a/third_party/mmyolo/mmyolo/deploy/__init__.py b/third_party/mmyolo/mmyolo/deploy/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4904a9058b41526d9719994ed718ae58336d290e --- /dev/null +++ b/third_party/mmyolo/mmyolo/deploy/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
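
As a usage note for the dataset wrappers above: the batch shape policy is attached to a test-mode dataset through its `batch_shapes_cfg` field (passing `None` disables it). Below is a minimal, hedged config sketch; the parameter values simply mirror the defaults of `BatchShapePolicy` above, and the data paths and (empty) pipeline are placeholders rather than settings taken from this repository.

```python
# Hypothetical config snippet: wire BatchShapePolicy into a val dataset.
batch_shapes_cfg = dict(
    type='BatchShapePolicy',
    batch_size=32,        # single-GPU batch size used at test time
    img_size=640,
    size_divisor=32,
    extra_pad_ratio=0.5)

val_dataloader = dict(
    batch_size=1,
    dataset=dict(
        type='YOLOv5CocoDataset',
        data_root='data/coco/',                              # placeholder path
        ann_file='annotations/instances_val2017.json',       # placeholder path
        data_prefix=dict(img='val2017/'),
        test_mode=True,
        batch_shapes_cfg=batch_shapes_cfg,
        pipeline=[]))  # test pipeline omitted for brevity
```
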
+from mmdeploy.codebase.base import MMCodebase + +from .models import * # noqa: F401,F403 +from .object_detection import MMYOLO, YOLOObjectDetection + +__all__ = ['MMCodebase', 'MMYOLO', 'YOLOObjectDetection'] diff --git a/third_party/mmyolo/mmyolo/deploy/models/__init__.py b/third_party/mmyolo/mmyolo/deploy/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4b999a0161543d6a9d2ab56d797af740dc7261e4 --- /dev/null +++ b/third_party/mmyolo/mmyolo/deploy/models/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from . import dense_heads # noqa: F401,F403 diff --git a/third_party/mmyolo/mmyolo/deploy/models/dense_heads/__init__.py b/third_party/mmyolo/mmyolo/deploy/models/dense_heads/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..cc423af3ec374cabe2b9f46d2fe4f4dc9755b8e3 --- /dev/null +++ b/third_party/mmyolo/mmyolo/deploy/models/dense_heads/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from . import yolov5_head # noqa: F401,F403 + +__all__ = ['yolov5_head'] diff --git a/third_party/mmyolo/mmyolo/deploy/models/dense_heads/yolov5_head.py b/third_party/mmyolo/mmyolo/deploy/models/dense_heads/yolov5_head.py new file mode 100644 index 0000000000000000000000000000000000000000..ac996ba41336243ef091e3e952430382be9ff978 --- /dev/null +++ b/third_party/mmyolo/mmyolo/deploy/models/dense_heads/yolov5_head.py @@ -0,0 +1,189 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from functools import partial +from typing import List, Optional, Tuple + +import torch +from mmdeploy.codebase.mmdet import get_post_processing_params +from mmdeploy.codebase.mmdet.models.layers import multiclass_nms +from mmdeploy.core import FUNCTION_REWRITER +from mmengine.config import ConfigDict +from mmengine.structures import InstanceData +from torch import Tensor + +from mmyolo.deploy.models.layers import efficient_nms +from mmyolo.models.dense_heads import YOLOv5Head + + +def yolov5_bbox_decoder(priors: Tensor, bbox_preds: Tensor, + stride: int) -> Tensor: + """Decode YOLOv5 bounding boxes. + + Args: + priors (Tensor): Prior boxes in center-offset form. + bbox_preds (Tensor): Predicted bounding boxes. + stride (int): Stride of the feature map. + + Returns: + Tensor: Decoded bounding boxes. + """ + bbox_preds = bbox_preds.sigmoid() + + x_center = (priors[..., 0] + priors[..., 2]) * 0.5 + y_center = (priors[..., 1] + priors[..., 3]) * 0.5 + w = priors[..., 2] - priors[..., 0] + h = priors[..., 3] - priors[..., 1] + + x_center_pred = (bbox_preds[..., 0] - 0.5) * 2 * stride + x_center + y_center_pred = (bbox_preds[..., 1] - 0.5) * 2 * stride + y_center + w_pred = (bbox_preds[..., 2] * 2)**2 * w + h_pred = (bbox_preds[..., 3] * 2)**2 * h + + decoded_bboxes = torch.stack( + [x_center_pred, y_center_pred, w_pred, h_pred], dim=-1) + + return decoded_bboxes + + +@FUNCTION_REWRITER.register_rewriter( + func_name='mmyolo.models.dense_heads.yolov5_head.' + 'YOLOv5Head.predict_by_feat') +def yolov5_head__predict_by_feat(self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + objectnesses: Optional[List[Tensor]] = None, + batch_img_metas: Optional[List[dict]] = None, + cfg: Optional[ConfigDict] = None, + rescale: bool = False, + with_nms: bool = True) -> Tuple[InstanceData]: + """Transform a batch of output features extracted by the head into + bbox results. 
+ Args: + cls_scores (list[Tensor]): Classification scores for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * 4, H, W). + objectnesses (list[Tensor], Optional): Score factor for + all scale level, each is a 4D-tensor, has shape + (batch_size, 1, H, W). + batch_img_metas (list[dict], Optional): Batch image meta info. + Defaults to None. + cfg (ConfigDict, optional): Test / postprocessing + configuration, if None, test_cfg would be used. + Defaults to None. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + with_nms (bool): If True, do nms before return boxes. + Defaults to True. + Returns: + tuple[Tensor, Tensor]: The first item is an (N, num_box, 5) tensor, + where 5 represent (tl_x, tl_y, br_x, br_y, score), N is batch + size and the score between 0 and 1. The shape of the second + tensor in the tuple is (N, num_box), and each element + represents the class label of the corresponding box. + """ + ctx = FUNCTION_REWRITER.get_context() + detector_type = type(self) + deploy_cfg = ctx.cfg + use_efficientnms = deploy_cfg.get('use_efficientnms', False) + dtype = cls_scores[0].dtype + device = cls_scores[0].device + bbox_decoder = self.bbox_coder.decode + nms_func = multiclass_nms + if use_efficientnms: + if detector_type is YOLOv5Head: + nms_func = partial(efficient_nms, box_coding=0) + bbox_decoder = yolov5_bbox_decoder + else: + nms_func = efficient_nms + + assert len(cls_scores) == len(bbox_preds) + cfg = self.test_cfg if cfg is None else cfg + cfg = copy.deepcopy(cfg) + + num_imgs = cls_scores[0].shape[0] + featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores] + + mlvl_priors = self.prior_generator.grid_priors( + featmap_sizes, dtype=dtype, device=device) + + flatten_priors = torch.cat(mlvl_priors) + + mlvl_strides = [ + flatten_priors.new_full( + (featmap_size[0] * featmap_size[1] * self.num_base_priors, ), + stride) + for featmap_size, stride in zip(featmap_sizes, self.featmap_strides) + ] + flatten_stride = torch.cat(mlvl_strides) + + # flatten cls_scores, bbox_preds and objectness + flatten_cls_scores = [ + cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1, self.num_classes) + for cls_score in cls_scores + ] + cls_scores = torch.cat(flatten_cls_scores, dim=1).sigmoid() + + flatten_bbox_preds = [ + bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) + for bbox_pred in bbox_preds + ] + flatten_bbox_preds = torch.cat(flatten_bbox_preds, dim=1) + + if objectnesses is not None: + flatten_objectness = [ + objectness.permute(0, 2, 3, 1).reshape(num_imgs, -1) + for objectness in objectnesses + ] + flatten_objectness = torch.cat(flatten_objectness, dim=1).sigmoid() + cls_scores = cls_scores * (flatten_objectness.unsqueeze(-1)) + + scores = cls_scores + + bboxes = bbox_decoder(flatten_priors[None], flatten_bbox_preds, + flatten_stride) + + if not with_nms: + return bboxes, scores + + post_params = get_post_processing_params(deploy_cfg) + max_output_boxes_per_class = post_params.max_output_boxes_per_class + iou_threshold = cfg.nms.get('iou_threshold', post_params.iou_threshold) + score_threshold = cfg.get('score_thr', post_params.score_threshold) + pre_top_k = post_params.pre_top_k + keep_top_k = cfg.get('max_per_img', post_params.keep_top_k) + + return nms_func(bboxes, scores, max_output_boxes_per_class, iou_threshold, + score_threshold, pre_top_k, keep_top_k) + 
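
For readers unfamiliar with the YOLOv5 box parameterization used by `yolov5_bbox_decoder` above, the following toy snippet (plain PyTorch, not part of the library; the prior and the logits are made-up numbers) reproduces the decode formula for a single prior. With zero logits, the sigmoid outputs are 0.5, so the centre is unshifted and the prior's width and height are kept.

```python
import torch

# Toy check of the decode formula: one (x1, y1, x2, y2) prior, one raw prediction.
prior = torch.tensor([8., 8., 24., 24.])   # 16x16 prior centred at (16, 16)
raw = torch.tensor([0., 0., 0., 0.])       # logits before sigmoid
stride = 8

pred = raw.sigmoid()                        # -> [0.5, 0.5, 0.5, 0.5]
x_c = (prior[0] + prior[2]) * 0.5
y_c = (prior[1] + prior[3]) * 0.5
w, h = prior[2] - prior[0], prior[3] - prior[1]

x_c_pred = (pred[0] - 0.5) * 2 * stride + x_c   # zero logit -> no centre shift
y_c_pred = (pred[1] - 0.5) * 2 * stride + y_c
w_pred = (pred[2] * 2) ** 2 * w                 # zero logit -> 1x scale
h_pred = (pred[3] * 2) ** 2 * h

print(x_c_pred.item(), y_c_pred.item(), w_pred.item(), h_pred.item())
# -> 16.0 16.0 16.0 16.0 (cx, cy, w, h), matching the prior
```
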
+ +@FUNCTION_REWRITER.register_rewriter( + func_name='mmyolo.models.dense_heads.yolov5_head.' + 'YOLOv5Head.predict', + backend='rknn') +def yolov5_head__predict__rknn(self, x: Tuple[Tensor], *args, + **kwargs) -> Tuple[Tensor, Tensor, Tensor]: + """Perform forward propagation of the detection head and predict detection + results on the features of the upstream network. + + Args: + x (tuple[Tensor]): Multi-level features from the + upstream network, each is a 4D-tensor. + """ + outs = self(x) + return outs + + +@FUNCTION_REWRITER.register_rewriter( + func_name='mmyolo.models.dense_heads.yolov5_head.' + 'YOLOv5HeadModule.forward', + backend='rknn') +def yolov5_head_module__forward__rknn( + self, x: Tensor, *args, **kwargs) -> Tuple[Tensor, Tensor, Tensor]: + """Forward feature of a single scale level.""" + out = [] + for i, feat in enumerate(x): + out.append(self.convs_pred[i](feat)) + return out diff --git a/third_party/mmyolo/mmyolo/deploy/models/layers/__init__.py b/third_party/mmyolo/mmyolo/deploy/models/layers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6017cf83425b640eb788a8abf6b253f29d759afb --- /dev/null +++ b/third_party/mmyolo/mmyolo/deploy/models/layers/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .bbox_nms import efficient_nms + +__all__ = ['efficient_nms'] diff --git a/third_party/mmyolo/mmyolo/deploy/models/layers/bbox_nms.py b/third_party/mmyolo/mmyolo/deploy/models/layers/bbox_nms.py new file mode 100644 index 0000000000000000000000000000000000000000..4db81c0227a36e0315855082dcd8125e1f9be70a --- /dev/null +++ b/third_party/mmyolo/mmyolo/deploy/models/layers/bbox_nms.py @@ -0,0 +1,113 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from mmdeploy.core import mark +from torch import Tensor + + +def _efficient_nms( + boxes: Tensor, + scores: Tensor, + max_output_boxes_per_class: int = 1000, + iou_threshold: float = 0.5, + score_threshold: float = 0.05, + pre_top_k: int = -1, + keep_top_k: int = 100, + box_coding: int = 0, +): + """Wrapper for `efficient_nms` with TensorRT. + + Args: + boxes (Tensor): The bounding boxes of shape [N, num_boxes, 4]. + scores (Tensor): The detection scores of shape + [N, num_boxes, num_classes]. + max_output_boxes_per_class (int): Maximum number of output + boxes per class of nms. Defaults to 1000. + iou_threshold (float): IOU threshold of nms. Defaults to 0.5. + score_threshold (float): score threshold of nms. + Defaults to 0.05. + pre_top_k (int): Number of top K boxes to keep before nms. + Defaults to -1. + keep_top_k (int): Number of top K boxes to keep after nms. + Defaults to -1. + box_coding (int): Bounding boxes format for nms. + Defaults to 0 means [x, y, w, h]. + Set to 1 means [x1, y1 ,x2, y2]. + + Returns: + tuple[Tensor, Tensor]: (dets, labels), `dets` of shape [N, num_det, 5] + and `labels` of shape [N, num_det]. 
+ """ + boxes = boxes if boxes.dim() == 4 else boxes.unsqueeze(2) + _, det_boxes, det_scores, labels = TRTEfficientNMSop.apply( + boxes, scores, -1, box_coding, iou_threshold, keep_top_k, '1', 0, + score_threshold) + dets = torch.cat([det_boxes, det_scores.unsqueeze(2)], -1) + + # retain shape info + batch_size = boxes.size(0) + + dets_shape = dets.shape + label_shape = labels.shape + dets = dets.reshape([batch_size, *dets_shape[1:]]) + labels = labels.reshape([batch_size, *label_shape[1:]]) + return dets, labels + + +@mark('efficient_nms', inputs=['boxes', 'scores'], outputs=['dets', 'labels']) +def efficient_nms(*args, **kwargs): + """Wrapper function for `_efficient_nms`.""" + return _efficient_nms(*args, **kwargs) + + +class TRTEfficientNMSop(torch.autograd.Function): + """Efficient NMS op for TensorRT.""" + + @staticmethod + def forward( + ctx, + boxes, + scores, + background_class=-1, + box_coding=0, + iou_threshold=0.45, + max_output_boxes=100, + plugin_version='1', + score_activation=0, + score_threshold=0.25, + ): + """Forward function of TRTEfficientNMSop.""" + batch_size, num_boxes, num_classes = scores.shape + num_det = torch.randint( + 0, max_output_boxes, (batch_size, 1), dtype=torch.int32) + det_boxes = torch.randn(batch_size, max_output_boxes, 4) + det_scores = torch.randn(batch_size, max_output_boxes) + det_classes = torch.randint( + 0, num_classes, (batch_size, max_output_boxes), dtype=torch.int32) + return num_det, det_boxes, det_scores, det_classes + + @staticmethod + def symbolic(g, + boxes, + scores, + background_class=-1, + box_coding=0, + iou_threshold=0.45, + max_output_boxes=100, + plugin_version='1', + score_activation=0, + score_threshold=0.25): + """Symbolic function of TRTEfficientNMSop.""" + out = g.op( + 'TRT::EfficientNMS_TRT', + boxes, + scores, + background_class_i=background_class, + box_coding_i=box_coding, + iou_threshold_f=iou_threshold, + max_output_boxes_i=max_output_boxes, + plugin_version_s=plugin_version, + score_activation_i=score_activation, + score_threshold_f=score_threshold, + outputs=4) + nums, boxes, scores, classes = out + return nums, boxes, scores, classes diff --git a/third_party/mmyolo/mmyolo/deploy/object_detection.py b/third_party/mmyolo/mmyolo/deploy/object_detection.py new file mode 100644 index 0000000000000000000000000000000000000000..7efdfcfb7a46c8bc6b90e76bd06d9065410e55f0 --- /dev/null +++ b/third_party/mmyolo/mmyolo/deploy/object_detection.py @@ -0,0 +1,132 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
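
To make the tensor layout of this NMS path concrete, here is a small standalone sketch with dummy values in plain PyTorch. It only mirrors the placeholder `forward` above; the real detections are produced by the `EfficientNMS_TRT` plugin at TensorRT runtime, and the path is selected by setting `use_efficientnms=True` in the deploy config, which `yolov5_head__predict_by_feat` reads.

```python
import torch

# Illustrative only: emulate the output layout of the EfficientNMS_TRT path.
batch_size, keep_top_k, num_classes = 2, 100, 80

det_boxes = torch.randn(batch_size, keep_top_k, 4)        # (x1, y1, x2, y2)
det_scores = torch.rand(batch_size, keep_top_k)
labels = torch.randint(0, num_classes, (batch_size, keep_top_k))

# `_efficient_nms` packs boxes and scores into a single (N, num_det, 5) tensor.
dets = torch.cat([det_boxes, det_scores.unsqueeze(2)], -1)
print(dets.shape, labels.shape)  # torch.Size([2, 100, 5]) torch.Size([2, 100])
```
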
+from typing import Callable, Dict, Optional + +import torch +from mmdeploy.codebase.base import CODEBASE, MMCodebase +from mmdeploy.codebase.mmdet.deploy import ObjectDetection +from mmdeploy.utils import Codebase, Task +from mmengine import Config +from mmengine.registry import Registry + +MMYOLO_TASK = Registry('mmyolo_tasks') + + +@CODEBASE.register_module(Codebase.MMYOLO.value) +class MMYOLO(MMCodebase): + """MMYOLO codebase class.""" + + task_registry = MMYOLO_TASK + + @classmethod + def register_deploy_modules(cls): + """register all rewriters for mmdet.""" + import mmdeploy.codebase.mmdet.models # noqa: F401 + import mmdeploy.codebase.mmdet.ops # noqa: F401 + import mmdeploy.codebase.mmdet.structures # noqa: F401 + + @classmethod + def register_all_modules(cls): + """register all modules.""" + from mmdet.utils.setup_env import \ + register_all_modules as register_all_modules_mmdet + + from mmyolo.utils.setup_env import \ + register_all_modules as register_all_modules_mmyolo + + cls.register_deploy_modules() + register_all_modules_mmyolo(True) + register_all_modules_mmdet(False) + + +def _get_dataset_metainfo(model_cfg: Config): + """Get metainfo of dataset. + + Args: + model_cfg Config: Input model Config object. + + Returns: + list[str]: A list of string specifying names of different class. + """ + from mmyolo import datasets # noqa + from mmyolo.registry import DATASETS + + module_dict = DATASETS.module_dict + for dataloader_name in [ + 'test_dataloader', 'val_dataloader', 'train_dataloader' + ]: + if dataloader_name not in model_cfg: + continue + dataloader_cfg = model_cfg[dataloader_name] + dataset_cfg = dataloader_cfg.dataset + dataset_cls = module_dict.get(dataset_cfg.type, None) + if dataset_cls is None: + continue + if hasattr(dataset_cls, '_load_metainfo') and isinstance( + dataset_cls._load_metainfo, Callable): + meta = dataset_cls._load_metainfo( + dataset_cfg.get('metainfo', None)) + if meta is not None: + return meta + if hasattr(dataset_cls, 'METAINFO'): + return dataset_cls.METAINFO + + return None + + +@MMYOLO_TASK.register_module(Task.OBJECT_DETECTION.value) +class YOLOObjectDetection(ObjectDetection): + """YOLO Object Detection task.""" + + def get_visualizer(self, name: str, save_dir: str): + """Get visualizer. + + Args: + name (str): Name of visualizer. + save_dir (str): Directory to save visualization results. + + Returns: + Visualizer: A visualizer instance. + """ + from mmdet.visualization import DetLocalVisualizer # noqa: F401,F403 + metainfo = _get_dataset_metainfo(self.model_cfg) + visualizer = super().get_visualizer(name, save_dir) + if metainfo is not None: + visualizer.dataset_meta = metainfo + return visualizer + + def build_pytorch_model(self, + model_checkpoint: Optional[str] = None, + cfg_options: Optional[Dict] = None, + **kwargs) -> torch.nn.Module: + """Initialize torch model. + + Args: + model_checkpoint (str): The checkpoint file of torch model, + defaults to `None`. + cfg_options (dict): Optional config key-pair parameters. + Returns: + nn.Module: An initialized torch model generated by other OpenMMLab + codebases. 
+ """ + from copy import deepcopy + + from mmengine.model import revert_sync_batchnorm + from mmengine.registry import MODELS + + from mmyolo.utils import switch_to_deploy + + model = deepcopy(self.model_cfg.model) + preprocess_cfg = deepcopy(self.model_cfg.get('preprocess_cfg', {})) + preprocess_cfg.update( + deepcopy(self.model_cfg.get('data_preprocessor', {}))) + model.setdefault('data_preprocessor', preprocess_cfg) + model = MODELS.build(model) + if model_checkpoint is not None: + from mmengine.runner.checkpoint import load_checkpoint + load_checkpoint(model, model_checkpoint, map_location=self.device) + + model = revert_sync_batchnorm(model) + switch_to_deploy(model) + model = model.to(self.device) + model.eval() + return model diff --git a/third_party/mmyolo/mmyolo/engine/__init__.py b/third_party/mmyolo/mmyolo/engine/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b2e0a126c09797b327f7309d6e980245b7e44773 --- /dev/null +++ b/third_party/mmyolo/mmyolo/engine/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .hooks import * # noqa: F401,F403 +from .optimizers import * # noqa: F401,F403 diff --git a/third_party/mmyolo/mmyolo/engine/hooks/__init__.py b/third_party/mmyolo/mmyolo/engine/hooks/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0b8deebc8827da5b9a3f8c92a2fffe70e42d0bfa --- /dev/null +++ b/third_party/mmyolo/mmyolo/engine/hooks/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .ppyoloe_param_scheduler_hook import PPYOLOEParamSchedulerHook +from .switch_to_deploy_hook import SwitchToDeployHook +from .yolov5_param_scheduler_hook import YOLOv5ParamSchedulerHook +from .yolox_mode_switch_hook import YOLOXModeSwitchHook + +__all__ = [ + 'YOLOv5ParamSchedulerHook', 'YOLOXModeSwitchHook', 'SwitchToDeployHook', + 'PPYOLOEParamSchedulerHook' +] diff --git a/third_party/mmyolo/mmyolo/engine/hooks/ppyoloe_param_scheduler_hook.py b/third_party/mmyolo/mmyolo/engine/hooks/ppyoloe_param_scheduler_hook.py new file mode 100644 index 0000000000000000000000000000000000000000..26dfe6ef2d5cf590ea381efb3e42cdc1c5492361 --- /dev/null +++ b/third_party/mmyolo/mmyolo/engine/hooks/ppyoloe_param_scheduler_hook.py @@ -0,0 +1,96 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from typing import Optional + +from mmengine.hooks import ParamSchedulerHook +from mmengine.runner import Runner + +from mmyolo.registry import HOOKS + + +@HOOKS.register_module() +class PPYOLOEParamSchedulerHook(ParamSchedulerHook): + """A hook to update learning rate and momentum in optimizer of PPYOLOE. We + use this hook to implement adaptive computation for `warmup_total_iters`, + which is not possible with the built-in ParamScheduler in mmyolo. + + Args: + warmup_min_iter (int): Minimum warmup iters. Defaults to 1000. + start_factor (float): The number we multiply learning rate in the + first epoch. The multiplication factor changes towards end_factor + in the following epochs. Defaults to 0. + warmup_epochs (int): Epochs for warmup. Defaults to 5. + min_lr_ratio (float): Minimum learning rate ratio. + total_epochs (int): In PPYOLOE, `total_epochs` is set to + training_epochs x 1.2. Defaults to 360. 
+ """ + priority = 9 + + def __init__(self, + warmup_min_iter: int = 1000, + start_factor: float = 0., + warmup_epochs: int = 5, + min_lr_ratio: float = 0.0, + total_epochs: int = 360): + + self.warmup_min_iter = warmup_min_iter + self.start_factor = start_factor + self.warmup_epochs = warmup_epochs + self.min_lr_ratio = min_lr_ratio + self.total_epochs = total_epochs + + self._warmup_end = False + self._base_lr = None + + def before_train(self, runner: Runner): + """Operations before train. + + Args: + runner (Runner): The runner of the training process. + """ + optimizer = runner.optim_wrapper.optimizer + for group in optimizer.param_groups: + # If the param is never be scheduled, record the current value + # as the initial value. + group.setdefault('initial_lr', group['lr']) + + self._base_lr = [ + group['initial_lr'] for group in optimizer.param_groups + ] + self._min_lr = [i * self.min_lr_ratio for i in self._base_lr] + + def before_train_iter(self, + runner: Runner, + batch_idx: int, + data_batch: Optional[dict] = None): + """Operations before each training iteration. + + Args: + runner (Runner): The runner of the training process. + batch_idx (int): The index of the current batch in the train loop. + data_batch (dict or tuple or list, optional): Data from dataloader. + """ + cur_iters = runner.iter + optimizer = runner.optim_wrapper.optimizer + dataloader_len = len(runner.train_dataloader) + + # The minimum warmup is self.warmup_min_iter + warmup_total_iters = max( + round(self.warmup_epochs * dataloader_len), self.warmup_min_iter) + + if cur_iters <= warmup_total_iters: + # warm up + alpha = cur_iters / warmup_total_iters + factor = self.start_factor * (1 - alpha) + alpha + + for group_idx, param in enumerate(optimizer.param_groups): + param['lr'] = self._base_lr[group_idx] * factor + else: + for group_idx, param in enumerate(optimizer.param_groups): + total_iters = self.total_epochs * dataloader_len + lr = self._min_lr[group_idx] + ( + self._base_lr[group_idx] - + self._min_lr[group_idx]) * 0.5 * ( + math.cos((cur_iters - warmup_total_iters) * math.pi / + (total_iters - warmup_total_iters)) + 1.0) + param['lr'] = lr diff --git a/third_party/mmyolo/mmyolo/engine/hooks/switch_to_deploy_hook.py b/third_party/mmyolo/mmyolo/engine/hooks/switch_to_deploy_hook.py new file mode 100644 index 0000000000000000000000000000000000000000..28ac345f40c44c974fb33b7bf9756a61fcabf820 --- /dev/null +++ b/third_party/mmyolo/mmyolo/engine/hooks/switch_to_deploy_hook.py @@ -0,0 +1,21 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +from mmengine.hooks import Hook +from mmengine.runner import Runner + +from mmyolo.registry import HOOKS +from mmyolo.utils import switch_to_deploy + + +@HOOKS.register_module() +class SwitchToDeployHook(Hook): + """Switch to deploy mode before testing. + + This hook converts the multi-channel structure of the training network + (high performance) to the one-way structure of the testing network (fast + speed and memory saving). + """ + + def before_test_epoch(self, runner: Runner): + """Switch to deploy mode before testing.""" + switch_to_deploy(runner.model) diff --git a/third_party/mmyolo/mmyolo/engine/hooks/yolov5_param_scheduler_hook.py b/third_party/mmyolo/mmyolo/engine/hooks/yolov5_param_scheduler_hook.py new file mode 100644 index 0000000000000000000000000000000000000000..777bb49d7abd7fc37385370546d05e70c274b3b7 --- /dev/null +++ b/third_party/mmyolo/mmyolo/engine/hooks/yolov5_param_scheduler_hook.py @@ -0,0 +1,130 @@ +# Copyright (c) OpenMMLab. 
All rights reserved. +import math +from typing import Optional + +import numpy as np +from mmengine.hooks import ParamSchedulerHook +from mmengine.runner import Runner + +from mmyolo.registry import HOOKS + + +def linear_fn(lr_factor: float, max_epochs: int): + """Generate linear function.""" + return lambda x: (1 - x / max_epochs) * (1.0 - lr_factor) + lr_factor + + +def cosine_fn(lr_factor: float, max_epochs: int): + """Generate cosine function.""" + return lambda x: ( + (1 - math.cos(x * math.pi / max_epochs)) / 2) * (lr_factor - 1) + 1 + + +@HOOKS.register_module() +class YOLOv5ParamSchedulerHook(ParamSchedulerHook): + """A hook to update learning rate and momentum in optimizer of YOLOv5.""" + priority = 9 + + scheduler_maps = {'linear': linear_fn, 'cosine': cosine_fn} + + def __init__(self, + scheduler_type: str = 'linear', + lr_factor: float = 0.01, + max_epochs: int = 300, + warmup_epochs: int = 3, + warmup_bias_lr: float = 0.1, + warmup_momentum: float = 0.8, + warmup_mim_iter: int = 1000, + **kwargs): + + assert scheduler_type in self.scheduler_maps + + self.warmup_epochs = warmup_epochs + self.warmup_bias_lr = warmup_bias_lr + self.warmup_momentum = warmup_momentum + self.warmup_mim_iter = warmup_mim_iter + + kwargs.update({'lr_factor': lr_factor, 'max_epochs': max_epochs}) + self.scheduler_fn = self.scheduler_maps[scheduler_type](**kwargs) + + self._warmup_end = False + self._base_lr = None + self._base_momentum = None + + def before_train(self, runner: Runner): + """Operations before train. + + Args: + runner (Runner): The runner of the training process. + """ + optimizer = runner.optim_wrapper.optimizer + for group in optimizer.param_groups: + # If the param is never be scheduled, record the current value + # as the initial value. + group.setdefault('initial_lr', group['lr']) + group.setdefault('initial_momentum', group.get('momentum', -1)) + + self._base_lr = [ + group['initial_lr'] for group in optimizer.param_groups + ] + self._base_momentum = [ + group['initial_momentum'] for group in optimizer.param_groups + ] + + def before_train_iter(self, + runner: Runner, + batch_idx: int, + data_batch: Optional[dict] = None): + """Operations before each training iteration. + + Args: + runner (Runner): The runner of the training process. + batch_idx (int): The index of the current batch in the train loop. + data_batch (dict or tuple or list, optional): Data from dataloader. + """ + cur_iters = runner.iter + cur_epoch = runner.epoch + optimizer = runner.optim_wrapper.optimizer + + # The minimum warmup is self.warmup_mim_iter + warmup_total_iters = max( + round(self.warmup_epochs * len(runner.train_dataloader)), + self.warmup_mim_iter) + + if cur_iters <= warmup_total_iters: + xp = [0, warmup_total_iters] + for group_idx, param in enumerate(optimizer.param_groups): + if group_idx == 2: + # bias learning rate will be handled specially + yp = [ + self.warmup_bias_lr, + self._base_lr[group_idx] * self.scheduler_fn(cur_epoch) + ] + else: + yp = [ + 0.0, + self._base_lr[group_idx] * self.scheduler_fn(cur_epoch) + ] + param['lr'] = np.interp(cur_iters, xp, yp) + + if 'momentum' in param: + param['momentum'] = np.interp( + cur_iters, xp, + [self.warmup_momentum, self._base_momentum[group_idx]]) + else: + self._warmup_end = True + + def after_train_epoch(self, runner: Runner): + """Operations after each training epoch. + + Args: + runner (Runner): The runner of the training process. 
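+
+        Note:
+            This is a no-op until warmup has finished; afterwards each
+            parameter group's lr is reset once per epoch to
+            ``initial_lr * scheduler_fn(epoch)``.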
+ """ + if not self._warmup_end: + return + + cur_epoch = runner.epoch + optimizer = runner.optim_wrapper.optimizer + for group_idx, param in enumerate(optimizer.param_groups): + param['lr'] = self._base_lr[group_idx] * self.scheduler_fn( + cur_epoch) diff --git a/third_party/mmyolo/mmyolo/engine/hooks/yolox_mode_switch_hook.py b/third_party/mmyolo/mmyolo/engine/hooks/yolox_mode_switch_hook.py new file mode 100644 index 0000000000000000000000000000000000000000..27711768c3f89b26410ae1373bc920d0bfded603 --- /dev/null +++ b/third_party/mmyolo/mmyolo/engine/hooks/yolox_mode_switch_hook.py @@ -0,0 +1,54 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from typing import Sequence + +from mmengine.hooks import Hook +from mmengine.model import is_model_wrapper +from mmengine.runner import Runner + +from mmyolo.registry import HOOKS + + +@HOOKS.register_module() +class YOLOXModeSwitchHook(Hook): + """Switch the mode of YOLOX during training. + + This hook turns off the mosaic and mixup data augmentation and switches + to use L1 loss in bbox_head. + + Args: + num_last_epochs (int): The number of latter epochs in the end of the + training to close the data augmentation and switch to L1 loss. + Defaults to 15. + """ + + def __init__(self, + num_last_epochs: int = 15, + new_train_pipeline: Sequence[dict] = None): + self.num_last_epochs = num_last_epochs + self.new_train_pipeline_cfg = new_train_pipeline + + def before_train_epoch(self, runner: Runner): + """Close mosaic and mixup augmentation and switches to use L1 loss.""" + epoch = runner.epoch + model = runner.model + if is_model_wrapper(model): + model = model.module + + if (epoch + 1) == runner.max_epochs - self.num_last_epochs: + runner.logger.info(f'New Pipeline: {self.new_train_pipeline_cfg}') + + train_dataloader_cfg = copy.deepcopy(runner.cfg.train_dataloader) + train_dataloader_cfg.dataset.pipeline = self.new_train_pipeline_cfg + # Note: Why rebuild the dataset? + # When build_dataloader will make a deep copy of the dataset, + # it will lead to potential risks, such as the global instance + # object FileClient data is disordered. + # This problem needs to be solved in the future. + new_train_dataloader = Runner.build_dataloader( + train_dataloader_cfg) + runner.train_loop.dataloader = new_train_dataloader + + runner.logger.info('recreate the dataloader!') + runner.logger.info('Add additional bbox reg loss now!') + model.bbox_head.use_bbox_aux = True diff --git a/third_party/mmyolo/mmyolo/engine/optimizers/__init__.py b/third_party/mmyolo/mmyolo/engine/optimizers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b598020d05db54cdc1d803d39ebd2c91026a6112 --- /dev/null +++ b/third_party/mmyolo/mmyolo/engine/optimizers/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .yolov5_optim_constructor import YOLOv5OptimizerConstructor +from .yolov7_optim_wrapper_constructor import YOLOv7OptimWrapperConstructor + +__all__ = ['YOLOv5OptimizerConstructor', 'YOLOv7OptimWrapperConstructor'] diff --git a/third_party/mmyolo/mmyolo/engine/optimizers/yolov5_optim_constructor.py b/third_party/mmyolo/mmyolo/engine/optimizers/yolov5_optim_constructor.py new file mode 100644 index 0000000000000000000000000000000000000000..5e5f42cb5c2c18962f989288b45011c742845c2f --- /dev/null +++ b/third_party/mmyolo/mmyolo/engine/optimizers/yolov5_optim_constructor.py @@ -0,0 +1,132 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Optional + +import torch.nn as nn +from mmengine.dist import get_world_size +from mmengine.logging import print_log +from mmengine.model import is_model_wrapper +from mmengine.optim import OptimWrapper + +from mmyolo.registry import (OPTIM_WRAPPER_CONSTRUCTORS, OPTIM_WRAPPERS, + OPTIMIZERS) + + +@OPTIM_WRAPPER_CONSTRUCTORS.register_module() +class YOLOv5OptimizerConstructor: + """YOLOv5 constructor for optimizers. + + It has the following functions: + + - divides the optimizer parameters into 3 groups: + Conv, Bias and BN + + - support `weight_decay` parameter adaption based on + `batch_size_per_gpu` + + Args: + optim_wrapper_cfg (dict): The config dict of the optimizer wrapper. + Positional fields are + + - ``type``: class name of the OptimizerWrapper + - ``optimizer``: The configuration of optimizer. + + Optional fields are + + - any arguments of the corresponding optimizer wrapper type, + e.g., accumulative_counts, clip_grad, etc. + + The positional fields of ``optimizer`` are + + - `type`: class name of the optimizer. + + Optional fields are + + - any arguments of the corresponding optimizer type, e.g., + lr, weight_decay, momentum, etc. + + paramwise_cfg (dict, optional): Parameter-wise options. Must include + `base_total_batch_size` if not None. If the total input batch + is smaller than `base_total_batch_size`, the `weight_decay` + parameter will be kept unchanged, otherwise linear scaling. + + Example: + >>> model = torch.nn.modules.Conv1d(1, 1, 1) + >>> optim_wrapper_cfg = dict( + >>> dict(type='OptimWrapper', optimizer=dict(type='SGD', lr=0.01, + >>> momentum=0.9, weight_decay=0.0001, batch_size_per_gpu=16)) + >>> paramwise_cfg = dict(base_total_batch_size=64) + >>> optim_wrapper_builder = YOLOv5OptimizerConstructor( + >>> optim_wrapper_cfg, paramwise_cfg) + >>> optim_wrapper = optim_wrapper_builder(model) + """ + + def __init__(self, + optim_wrapper_cfg: dict, + paramwise_cfg: Optional[dict] = None): + if paramwise_cfg is None: + paramwise_cfg = {'base_total_batch_size': 64} + assert 'base_total_batch_size' in paramwise_cfg + + if not isinstance(optim_wrapper_cfg, dict): + raise TypeError('optimizer_cfg should be a dict', + f'but got {type(optim_wrapper_cfg)}') + assert 'optimizer' in optim_wrapper_cfg, ( + '`optim_wrapper_cfg` must contain "optimizer" config') + + self.optim_wrapper_cfg = optim_wrapper_cfg + self.optimizer_cfg = self.optim_wrapper_cfg.pop('optimizer') + self.base_total_batch_size = paramwise_cfg['base_total_batch_size'] + + def __call__(self, model: nn.Module) -> OptimWrapper: + if is_model_wrapper(model): + model = model.module + optimizer_cfg = self.optimizer_cfg.copy() + weight_decay = optimizer_cfg.pop('weight_decay', 0) + + if 'batch_size_per_gpu' in optimizer_cfg: + batch_size_per_gpu = optimizer_cfg.pop('batch_size_per_gpu') + # No scaling if total_batch_size is less than + # base_total_batch_size, otherwise linear scaling. 
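+            # Worked example (hypothetical setup): with
+            # base_total_batch_size=64, 8 GPUs and batch_size_per_gpu=16,
+            # total_batch_size = 128, accumulate = max(round(64 / 128), 1) = 1
+            # and scale_factor = 128 * 1 / 64 = 2, so weight_decay is doubled.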
+ total_batch_size = get_world_size() * batch_size_per_gpu + accumulate = max( + round(self.base_total_batch_size / total_batch_size), 1) + scale_factor = total_batch_size * \ + accumulate / self.base_total_batch_size + + if scale_factor != 1: + weight_decay *= scale_factor + print_log(f'Scaled weight_decay to {weight_decay}', 'current') + + params_groups = [], [], [] + + for v in model.modules(): + if hasattr(v, 'bias') and isinstance(v.bias, nn.Parameter): + params_groups[2].append(v.bias) + # Includes SyncBatchNorm + if isinstance(v, nn.modules.batchnorm._NormBase): + params_groups[1].append(v.weight) + elif hasattr(v, 'weight') and isinstance(v.weight, nn.Parameter): + params_groups[0].append(v.weight) + + # Note: Make sure bias is in the last parameter group + optimizer_cfg['params'] = [] + # conv + optimizer_cfg['params'].append({ + 'params': params_groups[0], + 'weight_decay': weight_decay + }) + # bn + optimizer_cfg['params'].append({'params': params_groups[1]}) + # bias + optimizer_cfg['params'].append({'params': params_groups[2]}) + + print_log( + 'Optimizer groups: %g .bias, %g conv.weight, %g other' % + (len(params_groups[2]), len(params_groups[0]), len( + params_groups[1])), 'current') + del params_groups + + optimizer = OPTIMIZERS.build(optimizer_cfg) + optim_wrapper = OPTIM_WRAPPERS.build( + self.optim_wrapper_cfg, default_args=dict(optimizer=optimizer)) + return optim_wrapper diff --git a/third_party/mmyolo/mmyolo/engine/optimizers/yolov7_optim_wrapper_constructor.py b/third_party/mmyolo/mmyolo/engine/optimizers/yolov7_optim_wrapper_constructor.py new file mode 100644 index 0000000000000000000000000000000000000000..79ea8b69976760c0e45e35f8420d0cc69b13331a --- /dev/null +++ b/third_party/mmyolo/mmyolo/engine/optimizers/yolov7_optim_wrapper_constructor.py @@ -0,0 +1,139 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional + +import torch.nn as nn +from mmengine.dist import get_world_size +from mmengine.logging import print_log +from mmengine.model import is_model_wrapper +from mmengine.optim import OptimWrapper + +from mmyolo.models.dense_heads.yolov7_head import ImplicitA, ImplicitM +from mmyolo.registry import (OPTIM_WRAPPER_CONSTRUCTORS, OPTIM_WRAPPERS, + OPTIMIZERS) + + +# TODO: Consider merging into YOLOv5OptimizerConstructor +@OPTIM_WRAPPER_CONSTRUCTORS.register_module() +class YOLOv7OptimWrapperConstructor: + """YOLOv7 constructor for optimizer wrappers. + + It has the following functions: + + - divides the optimizer parameters into 3 groups: + Conv, Bias and BN/ImplicitA/ImplicitM + + - support `weight_decay` parameter adaption based on + `batch_size_per_gpu` + + Args: + optim_wrapper_cfg (dict): The config dict of the optimizer wrapper. + Positional fields are + + - ``type``: class name of the OptimizerWrapper + - ``optimizer``: The configuration of optimizer. + + Optional fields are + + - any arguments of the corresponding optimizer wrapper type, + e.g., accumulative_counts, clip_grad, etc. + + The positional fields of ``optimizer`` are + + - `type`: class name of the optimizer. + + Optional fields are + + - any arguments of the corresponding optimizer type, e.g., + lr, weight_decay, momentum, etc. + + paramwise_cfg (dict, optional): Parameter-wise options. Must include + `base_total_batch_size` if not None. If the total input batch + is smaller than `base_total_batch_size`, the `weight_decay` + parameter will be kept unchanged, otherwise linear scaling. 
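+
+    Note:
+        Unlike ``YOLOv5OptimizerConstructor``, the ``ImplicitA`` and
+        ``ImplicitM`` parameters of the YOLOv7 head are placed in the
+        no-decay group together with the normalization weights.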
+ + Example: + >>> model = torch.nn.modules.Conv1d(1, 1, 1) + >>> optim_wrapper_cfg = dict( + >>> dict(type='OptimWrapper', optimizer=dict(type='SGD', lr=0.01, + >>> momentum=0.9, weight_decay=0.0001, batch_size_per_gpu=16)) + >>> paramwise_cfg = dict(base_total_batch_size=64) + >>> optim_wrapper_builder = YOLOv7OptimWrapperConstructor( + >>> optim_wrapper_cfg, paramwise_cfg) + >>> optim_wrapper = optim_wrapper_builder(model) + """ + + def __init__(self, + optim_wrapper_cfg: dict, + paramwise_cfg: Optional[dict] = None): + if paramwise_cfg is None: + paramwise_cfg = {'base_total_batch_size': 64} + assert 'base_total_batch_size' in paramwise_cfg + + if not isinstance(optim_wrapper_cfg, dict): + raise TypeError('optimizer_cfg should be a dict', + f'but got {type(optim_wrapper_cfg)}') + assert 'optimizer' in optim_wrapper_cfg, ( + '`optim_wrapper_cfg` must contain "optimizer" config') + + self.optim_wrapper_cfg = optim_wrapper_cfg + self.optimizer_cfg = self.optim_wrapper_cfg.pop('optimizer') + self.base_total_batch_size = paramwise_cfg['base_total_batch_size'] + + def __call__(self, model: nn.Module) -> OptimWrapper: + if is_model_wrapper(model): + model = model.module + optimizer_cfg = self.optimizer_cfg.copy() + weight_decay = optimizer_cfg.pop('weight_decay', 0) + + if 'batch_size_per_gpu' in optimizer_cfg: + batch_size_per_gpu = optimizer_cfg.pop('batch_size_per_gpu') + # No scaling if total_batch_size is less than + # base_total_batch_size, otherwise linear scaling. + total_batch_size = get_world_size() * batch_size_per_gpu + accumulate = max( + round(self.base_total_batch_size / total_batch_size), 1) + scale_factor = total_batch_size * \ + accumulate / self.base_total_batch_size + + if scale_factor != 1: + weight_decay *= scale_factor + print_log(f'Scaled weight_decay to {weight_decay}', 'current') + + params_groups = [], [], [] + for v in model.modules(): + # no decay + # Caution: Coupling with model + if isinstance(v, (ImplicitA, ImplicitM)): + params_groups[0].append(v.implicit) + elif isinstance(v, nn.modules.batchnorm._NormBase): + params_groups[0].append(v.weight) + # apply decay + elif hasattr(v, 'weight') and isinstance(v.weight, nn.Parameter): + params_groups[1].append(v.weight) # apply decay + + # biases, no decay + if hasattr(v, 'bias') and isinstance(v.bias, nn.Parameter): + params_groups[2].append(v.bias) + + # Note: Make sure bias is in the last parameter group + optimizer_cfg['params'] = [] + # conv + optimizer_cfg['params'].append({ + 'params': params_groups[1], + 'weight_decay': weight_decay + }) + # bn ... + optimizer_cfg['params'].append({'params': params_groups[0]}) + # bias + optimizer_cfg['params'].append({'params': params_groups[2]}) + + print_log( + 'Optimizer groups: %g .bias, %g conv.weight, %g other' % + (len(params_groups[2]), len(params_groups[1]), len( + params_groups[0])), 'current') + del params_groups + + optimizer = OPTIMIZERS.build(optimizer_cfg) + optim_wrapper = OPTIM_WRAPPERS.build( + self.optim_wrapper_cfg, default_args=dict(optimizer=optimizer)) + return optim_wrapper diff --git a/third_party/mmyolo/mmyolo/models/__init__.py b/third_party/mmyolo/mmyolo/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..51c37f0436f131dcd26b9a8115e58fe49d59207e --- /dev/null +++ b/third_party/mmyolo/mmyolo/models/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .backbones import * # noqa: F401,F403 +from .data_preprocessors import * # noqa: F401,F403 +from .dense_heads import * # noqa: F401,F403 +from .detectors import * # noqa: F401,F403 +from .layers import * # noqa: F401,F403 +from .losses import * # noqa: F401,F403 +from .necks import * # noqa: F401,F403 +from .plugins import * # noqa: F401,F403 +from .task_modules import * # noqa: F401,F403 diff --git a/third_party/mmyolo/mmyolo/models/backbones/__init__.py b/third_party/mmyolo/mmyolo/models/backbones/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..48c8e28b1e7eb97e3f7cb064c75af0dc79b4cc8d --- /dev/null +++ b/third_party/mmyolo/mmyolo/models/backbones/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .base_backbone import BaseBackbone +from .csp_darknet import YOLOv5CSPDarknet, YOLOv8CSPDarknet, YOLOXCSPDarknet +from .csp_resnet import PPYOLOECSPResNet +from .cspnext import CSPNeXt +from .efficient_rep import YOLOv6CSPBep, YOLOv6EfficientRep +from .yolov7_backbone import YOLOv7Backbone + +__all__ = [ + 'YOLOv5CSPDarknet', 'BaseBackbone', 'YOLOv6EfficientRep', 'YOLOv6CSPBep', + 'YOLOXCSPDarknet', 'CSPNeXt', 'YOLOv7Backbone', 'PPYOLOECSPResNet', + 'YOLOv8CSPDarknet' +] diff --git a/third_party/mmyolo/mmyolo/models/backbones/base_backbone.py b/third_party/mmyolo/mmyolo/models/backbones/base_backbone.py new file mode 100644 index 0000000000000000000000000000000000000000..730c7095eccf66b0d563fad96122454c98dff0ac --- /dev/null +++ b/third_party/mmyolo/mmyolo/models/backbones/base_backbone.py @@ -0,0 +1,225 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta, abstractmethod +from typing import List, Sequence, Union + +import torch +import torch.nn as nn +from mmcv.cnn import build_plugin_layer +from mmdet.utils import ConfigType, OptMultiConfig +from mmengine.model import BaseModule +from torch.nn.modules.batchnorm import _BatchNorm + +from mmyolo.registry import MODELS + + +@MODELS.register_module() +class BaseBackbone(BaseModule, metaclass=ABCMeta): + """BaseBackbone backbone used in YOLO series. + + .. code:: text + + Backbone model structure diagram + +-----------+ + | input | + +-----------+ + v + +-----------+ + | stem | + | layer | + +-----------+ + v + +-----------+ + | stage | + | layer 1 | + +-----------+ + v + +-----------+ + | stage | + | layer 2 | + +-----------+ + v + ...... + v + +-----------+ + | stage | + | layer n | + +-----------+ + In P5 model, n=4 + In P6 model, n=5 + + Args: + arch_setting (list): Architecture of BaseBackbone. + plugins (list[dict]): List of plugins for stages, each dict contains: + + - cfg (dict, required): Cfg dict to build plugin. + - stages (tuple[bool], optional): Stages to apply plugin, length + should be same as 'num_stages'. + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + input_channels: Number of input image channels. Defaults to 3. + out_indices (Sequence[int]): Output from which stages. + Defaults to (2, 3, 4). + frozen_stages (int): Stages to be frozen (stop grad and set eval + mode). -1 means not freezing any parameters. Defaults to -1. + norm_cfg (dict): Dictionary to construct and config norm layer. + Defaults to None. + act_cfg (dict): Config dict for activation layer. + Defaults to None. 
+ norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Defaults to False. + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + arch_setting: list, + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + input_channels: int = 3, + out_indices: Sequence[int] = (2, 3, 4), + frozen_stages: int = -1, + plugins: Union[dict, List[dict]] = None, + norm_cfg: ConfigType = None, + act_cfg: ConfigType = None, + norm_eval: bool = False, + init_cfg: OptMultiConfig = None): + super().__init__(init_cfg) + self.num_stages = len(arch_setting) + self.arch_setting = arch_setting + + assert set(out_indices).issubset( + i for i in range(len(arch_setting) + 1)) + + if frozen_stages not in range(-1, len(arch_setting) + 1): + raise ValueError('"frozen_stages" must be in range(-1, ' + 'len(arch_setting) + 1). But received ' + f'{frozen_stages}') + + self.input_channels = input_channels + self.out_indices = out_indices + self.frozen_stages = frozen_stages + self.widen_factor = widen_factor + self.deepen_factor = deepen_factor + self.norm_eval = norm_eval + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.plugins = plugins + + self.stem = self.build_stem_layer() + self.layers = ['stem'] + + for idx, setting in enumerate(arch_setting): + stage = [] + stage += self.build_stage_layer(idx, setting) + if plugins is not None: + stage += self.make_stage_plugins(plugins, idx, setting) + self.add_module(f'stage{idx + 1}', nn.Sequential(*stage)) + self.layers.append(f'stage{idx + 1}') + + @abstractmethod + def build_stem_layer(self): + """Build a stem layer.""" + pass + + @abstractmethod + def build_stage_layer(self, stage_idx: int, setting: list): + """Build a stage layer. + + Args: + stage_idx (int): The index of a stage layer. + setting (list): The architecture setting of a stage layer. + """ + pass + + def make_stage_plugins(self, plugins, stage_idx, setting): + """Make plugins for backbone ``stage_idx`` th stage. + + Currently we support to insert ``context_block``, + ``empirical_attention_block``, ``nonlocal_block``, ``dropout_block`` + into the backbone. + + + An example of plugins format could be: + + Examples: + >>> plugins=[ + ... dict(cfg=dict(type='xxx', arg1='xxx'), + ... stages=(False, True, True, True)), + ... dict(cfg=dict(type='yyy'), + ... stages=(True, True, True, True)), + ... ] + >>> model = YOLOv5CSPDarknet() + >>> stage_plugins = model.make_stage_plugins(plugins, 0, setting) + >>> assert len(stage_plugins) == 1 + + Suppose ``stage_idx=0``, the structure of blocks in the stage would be: + + .. code-block:: none + + conv1 -> conv2 -> conv3 -> yyy + + Suppose ``stage_idx=1``, the structure of blocks in the stage would be: + + .. code-block:: none + + conv1 -> conv2 -> conv3 -> xxx -> yyy + + + Args: + plugins (list[dict]): List of plugins cfg to build. The postfix is + required if multiple same type plugins are inserted. + stage_idx (int): Index of stage to build + If stages is missing, the plugin would be applied to all + stages. + setting (list): The architecture setting of a stage layer. 
+ + Returns: + list[nn.Module]: Plugins for current stage + """ + # TODO: It is not general enough to support any channel and needs + # to be refactored + in_channels = int(setting[1] * self.widen_factor) + plugin_layers = [] + for plugin in plugins: + plugin = plugin.copy() + stages = plugin.pop('stages', None) + assert stages is None or len(stages) == self.num_stages + if stages is None or stages[stage_idx]: + name, layer = build_plugin_layer( + plugin['cfg'], in_channels=in_channels) + plugin_layers.append(layer) + return plugin_layers + + def _freeze_stages(self): + """Freeze the parameters of the specified stage so that they are no + longer updated.""" + if self.frozen_stages >= 0: + for i in range(self.frozen_stages + 1): + m = getattr(self, self.layers[i]) + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def train(self, mode: bool = True): + """Convert the model into training mode while keep normalization layer + frozen.""" + super().train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() + + def forward(self, x: torch.Tensor) -> tuple: + """Forward batch_inputs from the data_preprocessor.""" + outs = [] + for i, layer_name in enumerate(self.layers): + layer = getattr(self, layer_name) + x = layer(x) + if i in self.out_indices: + outs.append(x) + + return tuple(outs) diff --git a/third_party/mmyolo/mmyolo/models/backbones/csp_darknet.py b/third_party/mmyolo/mmyolo/models/backbones/csp_darknet.py new file mode 100644 index 0000000000000000000000000000000000000000..92bd69a5a9378a37ed8fb50c52dfba0de6879083 --- /dev/null +++ b/third_party/mmyolo/mmyolo/models/backbones/csp_darknet.py @@ -0,0 +1,427 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple, Union + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule +from mmdet.models.backbones.csp_darknet import CSPLayer, Focus +from mmdet.utils import ConfigType, OptMultiConfig + +from mmyolo.registry import MODELS +from ..layers import CSPLayerWithTwoConv, SPPFBottleneck +from ..utils import make_divisible, make_round +from .base_backbone import BaseBackbone + + +@MODELS.register_module() +class YOLOv5CSPDarknet(BaseBackbone): + """CSP-Darknet backbone used in YOLOv5. + Args: + arch (str): Architecture of CSP-Darknet, from {P5, P6}. + Defaults to P5. + plugins (list[dict]): List of plugins for stages, each dict contains: + - cfg (dict, required): Cfg dict to build plugin. + - stages (tuple[bool], optional): Stages to apply plugin, length + should be same as 'num_stages'. + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + input_channels (int): Number of input image channels. Defaults to: 3. + out_indices (Tuple[int]): Output from which stages. + Defaults to (2, 3, 4). + frozen_stages (int): Stages to be frozen (stop grad and set eval + mode). -1 means not freezing any parameters. Defaults to -1. + norm_cfg (dict): Dictionary to construct and config norm layer. + Defaults to dict(type='BN', requires_grad=True). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. 
Defaults to False. + init_cfg (Union[dict,list[dict]], optional): Initialization config + dict. Defaults to None. + Example: + >>> from mmyolo.models import YOLOv5CSPDarknet + >>> import torch + >>> model = YOLOv5CSPDarknet() + >>> model.eval() + >>> inputs = torch.rand(1, 3, 416, 416) + >>> level_outputs = model(inputs) + >>> for level_out in level_outputs: + ... print(tuple(level_out.shape)) + ... + (1, 256, 52, 52) + (1, 512, 26, 26) + (1, 1024, 13, 13) + """ + # From left to right: + # in_channels, out_channels, num_blocks, add_identity, use_spp + arch_settings = { + 'P5': [[64, 128, 3, True, False], [128, 256, 6, True, False], + [256, 512, 9, True, False], [512, 1024, 3, True, True]], + 'P6': [[64, 128, 3, True, False], [128, 256, 6, True, False], + [256, 512, 9, True, False], [512, 768, 3, True, False], + [768, 1024, 3, True, True]] + } + + def __init__(self, + arch: str = 'P5', + plugins: Union[dict, List[dict]] = None, + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + input_channels: int = 3, + out_indices: Tuple[int] = (2, 3, 4), + frozen_stages: int = -1, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + norm_eval: bool = False, + init_cfg: OptMultiConfig = None): + super().__init__( + self.arch_settings[arch], + deepen_factor, + widen_factor, + input_channels=input_channels, + out_indices=out_indices, + plugins=plugins, + frozen_stages=frozen_stages, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + norm_eval=norm_eval, + init_cfg=init_cfg) + + def build_stem_layer(self) -> nn.Module: + """Build a stem layer.""" + return ConvModule( + self.input_channels, + make_divisible(self.arch_setting[0][0], self.widen_factor), + kernel_size=6, + stride=2, + padding=2, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def build_stage_layer(self, stage_idx: int, setting: list) -> list: + """Build a stage layer. + + Args: + stage_idx (int): The index of a stage layer. + setting (list): The architecture setting of a stage layer. + """ + in_channels, out_channels, num_blocks, add_identity, use_spp = setting + + in_channels = make_divisible(in_channels, self.widen_factor) + out_channels = make_divisible(out_channels, self.widen_factor) + num_blocks = make_round(num_blocks, self.deepen_factor) + stage = [] + conv_layer = ConvModule( + in_channels, + out_channels, + kernel_size=3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + stage.append(conv_layer) + csp_layer = CSPLayer( + out_channels, + out_channels, + num_blocks=num_blocks, + add_identity=add_identity, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + stage.append(csp_layer) + if use_spp: + spp = SPPFBottleneck( + out_channels, + out_channels, + kernel_sizes=5, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + stage.append(spp) + return stage + + def init_weights(self): + """Initialize the parameters.""" + if self.init_cfg is None: + for m in self.modules(): + if isinstance(m, torch.nn.Conv2d): + # In order to be consistent with the source code, + # reset the Conv2d initialization parameters + m.reset_parameters() + else: + super().init_weights() + + +@MODELS.register_module() +class YOLOv8CSPDarknet(BaseBackbone): + """CSP-Darknet backbone used in YOLOv8. + + Args: + arch (str): Architecture of CSP-Darknet, from {P5}. + Defaults to P5. + last_stage_out_channels (int): Final layer output channel. + Defaults to 1024. 
+ plugins (list[dict]): List of plugins for stages, each dict contains: + - cfg (dict, required): Cfg dict to build plugin. + - stages (tuple[bool], optional): Stages to apply plugin, length + should be same as 'num_stages'. + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + input_channels (int): Number of input image channels. Defaults to: 3. + out_indices (Tuple[int]): Output from which stages. + Defaults to (2, 3, 4). + frozen_stages (int): Stages to be frozen (stop grad and set eval + mode). -1 means not freezing any parameters. Defaults to -1. + norm_cfg (dict): Dictionary to construct and config norm layer. + Defaults to dict(type='BN', requires_grad=True). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Defaults to False. + init_cfg (Union[dict,list[dict]], optional): Initialization config + dict. Defaults to None. + + Example: + >>> from mmyolo.models import YOLOv8CSPDarknet + >>> import torch + >>> model = YOLOv8CSPDarknet() + >>> model.eval() + >>> inputs = torch.rand(1, 3, 416, 416) + >>> level_outputs = model(inputs) + >>> for level_out in level_outputs: + ... print(tuple(level_out.shape)) + ... + (1, 256, 52, 52) + (1, 512, 26, 26) + (1, 1024, 13, 13) + """ + # From left to right: + # in_channels, out_channels, num_blocks, add_identity, use_spp + # the final out_channels will be set according to the param. + arch_settings = { + 'P5': [[64, 128, 3, True, False], [128, 256, 6, True, False], + [256, 512, 6, True, False], [512, None, 3, True, True]], + } + + def __init__(self, + arch: str = 'P5', + last_stage_out_channels: int = 1024, + plugins: Union[dict, List[dict]] = None, + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + input_channels: int = 3, + out_indices: Tuple[int] = (2, 3, 4), + frozen_stages: int = -1, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + norm_eval: bool = False, + init_cfg: OptMultiConfig = None): + self.arch_settings[arch][-1][1] = last_stage_out_channels + super().__init__( + self.arch_settings[arch], + deepen_factor, + widen_factor, + input_channels=input_channels, + out_indices=out_indices, + plugins=plugins, + frozen_stages=frozen_stages, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + norm_eval=norm_eval, + init_cfg=init_cfg) + + def build_stem_layer(self) -> nn.Module: + """Build a stem layer.""" + return ConvModule( + self.input_channels, + make_divisible(self.arch_setting[0][0], self.widen_factor), + kernel_size=3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def build_stage_layer(self, stage_idx: int, setting: list) -> list: + """Build a stage layer. + + Args: + stage_idx (int): The index of a stage layer. + setting (list): The architecture setting of a stage layer. 
+ """ + in_channels, out_channels, num_blocks, add_identity, use_spp = setting + + in_channels = make_divisible(in_channels, self.widen_factor) + out_channels = make_divisible(out_channels, self.widen_factor) + num_blocks = make_round(num_blocks, self.deepen_factor) + stage = [] + conv_layer = ConvModule( + in_channels, + out_channels, + kernel_size=3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + stage.append(conv_layer) + csp_layer = CSPLayerWithTwoConv( + out_channels, + out_channels, + num_blocks=num_blocks, + add_identity=add_identity, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + stage.append(csp_layer) + if use_spp: + spp = SPPFBottleneck( + out_channels, + out_channels, + kernel_sizes=5, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + stage.append(spp) + return stage + + def init_weights(self): + """Initialize the parameters.""" + if self.init_cfg is None: + for m in self.modules(): + if isinstance(m, torch.nn.Conv2d): + # In order to be consistent with the source code, + # reset the Conv2d initialization parameters + m.reset_parameters() + else: + super().init_weights() + + +@MODELS.register_module() +class YOLOXCSPDarknet(BaseBackbone): + """CSP-Darknet backbone used in YOLOX. + + Args: + arch (str): Architecture of CSP-Darknet, from {P5, P6}. + Defaults to P5. + plugins (list[dict]): List of plugins for stages, each dict contains: + + - cfg (dict, required): Cfg dict to build plugin. + - stages (tuple[bool], optional): Stages to apply plugin, length + should be same as 'num_stages'. + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + input_channels (int): Number of input image channels. Defaults to 3. + out_indices (Tuple[int]): Output from which stages. + Defaults to (2, 3, 4). + frozen_stages (int): Stages to be frozen (stop grad and set eval + mode). -1 means not freezing any parameters. Defaults to -1. + use_depthwise (bool): Whether to use depthwise separable convolution. + Defaults to False. + spp_kernal_sizes: (tuple[int]): Sequential of kernel sizes of SPP + layers. Defaults to (5, 9, 13). + norm_cfg (dict): Dictionary to construct and config norm layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. + init_cfg (Union[dict,list[dict]], optional): Initialization config + dict. Defaults to None. + Example: + >>> from mmyolo.models import YOLOXCSPDarknet + >>> import torch + >>> model = YOLOXCSPDarknet() + >>> model.eval() + >>> inputs = torch.rand(1, 3, 416, 416) + >>> level_outputs = model(inputs) + >>> for level_out in level_outputs: + ... print(tuple(level_out.shape)) + ... 
+ (1, 256, 52, 52) + (1, 512, 26, 26) + (1, 1024, 13, 13) + """ + # From left to right: + # in_channels, out_channels, num_blocks, add_identity, use_spp + arch_settings = { + 'P5': [[64, 128, 3, True, False], [128, 256, 9, True, False], + [256, 512, 9, True, False], [512, 1024, 3, False, True]], + } + + def __init__(self, + arch: str = 'P5', + plugins: Union[dict, List[dict]] = None, + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + input_channels: int = 3, + out_indices: Tuple[int] = (2, 3, 4), + frozen_stages: int = -1, + use_depthwise: bool = False, + spp_kernal_sizes: Tuple[int] = (5, 9, 13), + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + norm_eval: bool = False, + init_cfg: OptMultiConfig = None): + self.use_depthwise = use_depthwise + self.spp_kernal_sizes = spp_kernal_sizes + super().__init__(self.arch_settings[arch], deepen_factor, widen_factor, + input_channels, out_indices, frozen_stages, plugins, + norm_cfg, act_cfg, norm_eval, init_cfg) + + def build_stem_layer(self) -> nn.Module: + """Build a stem layer.""" + return Focus( + 3, + make_divisible(64, self.widen_factor), + kernel_size=3, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def build_stage_layer(self, stage_idx: int, setting: list) -> list: + """Build a stage layer. + + Args: + stage_idx (int): The index of a stage layer. + setting (list): The architecture setting of a stage layer. + """ + in_channels, out_channels, num_blocks, add_identity, use_spp = setting + + in_channels = make_divisible(in_channels, self.widen_factor) + out_channels = make_divisible(out_channels, self.widen_factor) + num_blocks = make_round(num_blocks, self.deepen_factor) + stage = [] + conv = DepthwiseSeparableConvModule \ + if self.use_depthwise else ConvModule + conv_layer = conv( + in_channels, + out_channels, + kernel_size=3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + stage.append(conv_layer) + if use_spp: + spp = SPPFBottleneck( + out_channels, + out_channels, + kernel_sizes=self.spp_kernal_sizes, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + stage.append(spp) + csp_layer = CSPLayer( + out_channels, + out_channels, + num_blocks=num_blocks, + add_identity=add_identity, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + stage.append(csp_layer) + return stage diff --git a/third_party/mmyolo/mmyolo/models/backbones/csp_resnet.py b/third_party/mmyolo/mmyolo/models/backbones/csp_resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..a42ed489d8872913f4aacce08497c8e48fdace49 --- /dev/null +++ b/third_party/mmyolo/mmyolo/models/backbones/csp_resnet.py @@ -0,0 +1,169 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple, Union + +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmdet.utils import ConfigType, OptMultiConfig + +from mmyolo.models.backbones import BaseBackbone +from mmyolo.models.layers.yolo_bricks import CSPResLayer +from mmyolo.registry import MODELS + + +@MODELS.register_module() +class PPYOLOECSPResNet(BaseBackbone): + """CSP-ResNet backbone used in PPYOLOE. + + Args: + arch (str): Architecture of CSPNeXt, from {P5, P6}. + Defaults to P5. + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + out_indices (Sequence[int]): Output from which stages. 
+ Defaults to (2, 3, 4). + frozen_stages (int): Stages to be frozen (stop grad and set eval + mode). -1 means not freezing any parameters. Defaults to -1. + plugins (list[dict]): List of plugins for stages, each dict contains: + - cfg (dict, required): Cfg dict to build plugin. + - stages (tuple[bool], optional): Stages to apply plugin, length + should be same as 'num_stages'. + arch_ovewrite (list): Overwrite default arch settings. + Defaults to None. + block_cfg (dict): Config dict for block. Defaults to + dict(type='PPYOLOEBasicBlock', shortcut=True, use_alpha=True) + norm_cfg (:obj:`ConfigDict` or dict): Dictionary to construct and + config norm layer. Defaults to dict(type='BN', momentum=0.1, + eps=1e-5). + act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + attention_cfg (dict): Config dict for `EffectiveSELayer`. + Defaults to dict(type='EffectiveSELayer', + act_cfg=dict(type='HSigmoid')). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. + init_cfg (:obj:`ConfigDict` or dict or list[dict] or + list[:obj:`ConfigDict`]): Initialization config dict. + use_large_stem (bool): Whether to use large stem layer. + Defaults to False. + """ + # From left to right: + # in_channels, out_channels, num_blocks + arch_settings = { + 'P5': [[64, 128, 3], [128, 256, 6], [256, 512, 6], [512, 1024, 3]] + } + + def __init__(self, + arch: str = 'P5', + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + input_channels: int = 3, + out_indices: Tuple[int] = (2, 3, 4), + frozen_stages: int = -1, + plugins: Union[dict, List[dict]] = None, + arch_ovewrite: dict = None, + block_cfg: ConfigType = dict( + type='PPYOLOEBasicBlock', shortcut=True, use_alpha=True), + norm_cfg: ConfigType = dict( + type='BN', momentum=0.1, eps=1e-5), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + attention_cfg: ConfigType = dict( + type='EffectiveSELayer', act_cfg=dict(type='HSigmoid')), + norm_eval: bool = False, + init_cfg: OptMultiConfig = None, + use_large_stem: bool = False): + arch_setting = self.arch_settings[arch] + if arch_ovewrite: + arch_setting = arch_ovewrite + arch_setting = [[ + int(in_channels * widen_factor), + int(out_channels * widen_factor), + round(num_blocks * deepen_factor) + ] for in_channels, out_channels, num_blocks in arch_setting] + self.block_cfg = block_cfg + self.use_large_stem = use_large_stem + self.attention_cfg = attention_cfg + + super().__init__( + arch_setting, + deepen_factor, + widen_factor, + input_channels=input_channels, + out_indices=out_indices, + plugins=plugins, + frozen_stages=frozen_stages, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + norm_eval=norm_eval, + init_cfg=init_cfg) + + def build_stem_layer(self) -> nn.Module: + """Build a stem layer.""" + if self.use_large_stem: + stem = nn.Sequential( + ConvModule( + self.input_channels, + self.arch_setting[0][0] // 2, + 3, + stride=2, + padding=1, + act_cfg=self.act_cfg, + norm_cfg=self.norm_cfg), + ConvModule( + self.arch_setting[0][0] // 2, + self.arch_setting[0][0] // 2, + 3, + stride=1, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + ConvModule( + self.arch_setting[0][0] // 2, + self.arch_setting[0][0], + 3, + stride=1, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + else: + stem = nn.Sequential( + ConvModule( + self.input_channels, + self.arch_setting[0][0] // 2, + 3, + stride=2, + padding=1, + 
norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + ConvModule( + self.arch_setting[0][0] // 2, + self.arch_setting[0][0], + 3, + stride=1, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + return stem + + def build_stage_layer(self, stage_idx: int, setting: list) -> list: + """Build a stage layer. + + Args: + stage_idx (int): The index of a stage layer. + setting (list): The architecture setting of a stage layer. + """ + in_channels, out_channels, num_blocks = setting + + cspres_layer = CSPResLayer( + in_channels=in_channels, + out_channels=out_channels, + num_block=num_blocks, + block_cfg=self.block_cfg, + stride=2, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + attention_cfg=self.attention_cfg, + use_spp=False) + return [cspres_layer] diff --git a/third_party/mmyolo/mmyolo/models/backbones/cspnext.py b/third_party/mmyolo/mmyolo/models/backbones/cspnext.py new file mode 100644 index 0000000000000000000000000000000000000000..adca9dd9d11baecefda90a99a4188e78c2ca8188 --- /dev/null +++ b/third_party/mmyolo/mmyolo/models/backbones/cspnext.py @@ -0,0 +1,187 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from typing import List, Sequence, Union + +import torch.nn as nn +from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule +from mmdet.models.backbones.csp_darknet import CSPLayer +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig + +from mmyolo.registry import MODELS +from ..layers import SPPFBottleneck +from .base_backbone import BaseBackbone + + +@MODELS.register_module() +class CSPNeXt(BaseBackbone): + """CSPNeXt backbone used in RTMDet. + + Args: + arch (str): Architecture of CSPNeXt, from {P5, P6}. + Defaults to P5. + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + out_indices (Sequence[int]): Output from which stages. + Defaults to (2, 3, 4). + frozen_stages (int): Stages to be frozen (stop grad and set eval + mode). -1 means not freezing any parameters. Defaults to -1. + plugins (list[dict]): List of plugins for stages, each dict contains: + - cfg (dict, required): Cfg dict to build plugin.Defaults to + - stages (tuple[bool], optional): Stages to apply plugin, length + should be same as 'num_stages'. + use_depthwise (bool): Whether to use depthwise separable convolution. + Defaults to False. + expand_ratio (float): Ratio to adjust the number of channels of the + hidden layer. Defaults to 0.5. + arch_ovewrite (list): Overwrite default arch settings. + Defaults to None. + channel_attention (bool): Whether to add channel attention in each + stage. Defaults to True. + conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + convolution layer. Defaults to None. + norm_cfg (:obj:`ConfigDict` or dict): Dictionary to construct and + config norm layer. Defaults to dict(type='BN', requires_grad=True). + act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. + init_cfg (:obj:`ConfigDict` or dict or list[dict] or + list[:obj:`ConfigDict`]): Initialization config dict. 
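+
+    Example:
+        >>> # Usage sketch mirroring the sibling backbone docstrings; the
+        >>> # printed shapes assume the default arch='P5', widen_factor=1.0
+        >>> # and out_indices=(2, 3, 4).
+        >>> from mmyolo.models import CSPNeXt
+        >>> import torch
+        >>> model = CSPNeXt()
+        >>> model.eval()
+        >>> inputs = torch.rand(1, 3, 416, 416)
+        >>> level_outputs = model(inputs)
+        >>> for level_out in level_outputs:
+        ...     print(tuple(level_out.shape))
+        ...
+        (1, 256, 52, 52)
+        (1, 512, 26, 26)
+        (1, 1024, 13, 13)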
+ """ + # From left to right: + # in_channels, out_channels, num_blocks, add_identity, use_spp + arch_settings = { + 'P5': [[64, 128, 3, True, False], [128, 256, 6, True, False], + [256, 512, 6, True, False], [512, 1024, 3, False, True]], + 'P6': [[64, 128, 3, True, False], [128, 256, 6, True, False], + [256, 512, 6, True, False], [512, 768, 3, True, False], + [768, 1024, 3, False, True]] + } + + def __init__( + self, + arch: str = 'P5', + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + input_channels: int = 3, + out_indices: Sequence[int] = (2, 3, 4), + frozen_stages: int = -1, + plugins: Union[dict, List[dict]] = None, + use_depthwise: bool = False, + expand_ratio: float = 0.5, + arch_ovewrite: dict = None, + channel_attention: bool = True, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict(type='BN'), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + norm_eval: bool = False, + init_cfg: OptMultiConfig = dict( + type='Kaiming', + layer='Conv2d', + a=math.sqrt(5), + distribution='uniform', + mode='fan_in', + nonlinearity='leaky_relu') + ) -> None: + arch_setting = self.arch_settings[arch] + if arch_ovewrite: + arch_setting = arch_ovewrite + self.channel_attention = channel_attention + self.use_depthwise = use_depthwise + self.conv = DepthwiseSeparableConvModule \ + if use_depthwise else ConvModule + self.expand_ratio = expand_ratio + self.conv_cfg = conv_cfg + + super().__init__( + arch_setting, + deepen_factor, + widen_factor, + input_channels, + out_indices, + frozen_stages=frozen_stages, + plugins=plugins, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + norm_eval=norm_eval, + init_cfg=init_cfg) + + def build_stem_layer(self) -> nn.Module: + """Build a stem layer.""" + stem = nn.Sequential( + ConvModule( + 3, + int(self.arch_setting[0][0] * self.widen_factor // 2), + 3, + padding=1, + stride=2, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + ConvModule( + int(self.arch_setting[0][0] * self.widen_factor // 2), + int(self.arch_setting[0][0] * self.widen_factor // 2), + 3, + padding=1, + stride=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + ConvModule( + int(self.arch_setting[0][0] * self.widen_factor // 2), + int(self.arch_setting[0][0] * self.widen_factor), + 3, + padding=1, + stride=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + return stem + + def build_stage_layer(self, stage_idx: int, setting: list) -> list: + """Build a stage layer. + + Args: + stage_idx (int): The index of a stage layer. + setting (list): The architecture setting of a stage layer. 
+ """ + in_channels, out_channels, num_blocks, add_identity, use_spp = setting + + in_channels = int(in_channels * self.widen_factor) + out_channels = int(out_channels * self.widen_factor) + num_blocks = max(round(num_blocks * self.deepen_factor), 1) + + stage = [] + conv_layer = self.conv( + in_channels, + out_channels, + 3, + stride=2, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + stage.append(conv_layer) + if use_spp: + spp = SPPFBottleneck( + out_channels, + out_channels, + kernel_sizes=5, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + stage.append(spp) + csp_layer = CSPLayer( + out_channels, + out_channels, + num_blocks=num_blocks, + add_identity=add_identity, + use_depthwise=self.use_depthwise, + use_cspnext_block=True, + expand_ratio=self.expand_ratio, + channel_attention=self.channel_attention, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + stage.append(csp_layer) + return stage diff --git a/third_party/mmyolo/mmyolo/models/backbones/efficient_rep.py b/third_party/mmyolo/mmyolo/models/backbones/efficient_rep.py new file mode 100644 index 0000000000000000000000000000000000000000..32e455f06972af148fa56bba1c4178b0e2d540bd --- /dev/null +++ b/third_party/mmyolo/mmyolo/models/backbones/efficient_rep.py @@ -0,0 +1,305 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +from typing import List, Tuple, Union + +import torch +import torch.nn as nn +from mmdet.utils import ConfigType, OptMultiConfig + +from mmyolo.models.layers.yolo_bricks import CSPSPPFBottleneck, SPPFBottleneck +from mmyolo.registry import MODELS +from ..layers import BepC3StageBlock, RepStageBlock +from ..utils import make_round +from .base_backbone import BaseBackbone + + +@MODELS.register_module() +class YOLOv6EfficientRep(BaseBackbone): + """EfficientRep backbone used in YOLOv6. + Args: + arch (str): Architecture of BaseDarknet, from {P5, P6}. + Defaults to P5. + plugins (list[dict]): List of plugins for stages, each dict contains: + - cfg (dict, required): Cfg dict to build plugin. + - stages (tuple[bool], optional): Stages to apply plugin, length + should be same as 'num_stages'. + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + input_channels (int): Number of input image channels. Defaults to 3. + out_indices (Tuple[int]): Output from which stages. + Defaults to (2, 3, 4). + frozen_stages (int): Stages to be frozen (stop grad and set eval + mode). -1 means not freezing any parameters. Defaults to -1. + norm_cfg (dict): Dictionary to construct and config norm layer. + Defaults to dict(type='BN', requires_grad=True). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='LeakyReLU', negative_slope=0.1). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Defaults to False. + block_cfg (dict): Config dict for the block used to build each + layer. Defaults to dict(type='RepVGGBlock'). + init_cfg (Union[dict, list[dict]], optional): Initialization config + dict. Defaults to None. 
+ Example: + >>> from mmyolo.models import YOLOv6EfficientRep + >>> import torch + >>> model = YOLOv6EfficientRep() + >>> model.eval() + >>> inputs = torch.rand(1, 3, 416, 416) + >>> level_outputs = model(inputs) + >>> for level_out in level_outputs: + ... print(tuple(level_out.shape)) + ... + (1, 256, 52, 52) + (1, 512, 26, 26) + (1, 1024, 13, 13) + """ + # From left to right: + # in_channels, out_channels, num_blocks, use_spp + arch_settings = { + 'P5': [[64, 128, 6, False], [128, 256, 12, False], + [256, 512, 18, False], [512, 1024, 6, True]] + } + + def __init__(self, + arch: str = 'P5', + plugins: Union[dict, List[dict]] = None, + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + input_channels: int = 3, + out_indices: Tuple[int] = (2, 3, 4), + frozen_stages: int = -1, + use_cspsppf: bool = False, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='ReLU', inplace=True), + norm_eval: bool = False, + block_cfg: ConfigType = dict(type='RepVGGBlock'), + init_cfg: OptMultiConfig = None): + self.block_cfg = block_cfg + self.use_cspsppf = use_cspsppf + super().__init__( + self.arch_settings[arch], + deepen_factor, + widen_factor, + input_channels=input_channels, + out_indices=out_indices, + plugins=plugins, + frozen_stages=frozen_stages, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + norm_eval=norm_eval, + init_cfg=init_cfg) + + def build_stem_layer(self) -> nn.Module: + """Build a stem layer.""" + + block_cfg = self.block_cfg.copy() + block_cfg.update( + dict( + in_channels=self.input_channels, + out_channels=int(self.arch_setting[0][0] * self.widen_factor), + kernel_size=3, + stride=2, + )) + return MODELS.build(block_cfg) + + def build_stage_layer(self, stage_idx: int, setting: list) -> list: + """Build a stage layer. + + Args: + stage_idx (int): The index of a stage layer. + setting (list): The architecture setting of a stage layer. + """ + in_channels, out_channels, num_blocks, use_spp = setting + + in_channels = int(in_channels * self.widen_factor) + out_channels = int(out_channels * self.widen_factor) + num_blocks = make_round(num_blocks, self.deepen_factor) + + rep_stage_block = RepStageBlock( + in_channels=out_channels, + out_channels=out_channels, + num_blocks=num_blocks, + block_cfg=self.block_cfg, + ) + + block_cfg = self.block_cfg.copy() + block_cfg.update( + dict( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=3, + stride=2)) + stage = [] + + ef_block = nn.Sequential(MODELS.build(block_cfg), rep_stage_block) + + stage.append(ef_block) + + if use_spp: + spp = SPPFBottleneck( + in_channels=out_channels, + out_channels=out_channels, + kernel_sizes=5, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + if self.use_cspsppf: + spp = CSPSPPFBottleneck( + in_channels=out_channels, + out_channels=out_channels, + kernel_sizes=5, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + stage.append(spp) + return stage + + def init_weights(self): + if self.init_cfg is None: + """Initialize the parameters.""" + for m in self.modules(): + if isinstance(m, torch.nn.Conv2d): + # In order to be consistent with the source code, + # reset the Conv2d initialization parameters + m.reset_parameters() + else: + super().init_weights() + + +@MODELS.register_module() +class YOLOv6CSPBep(YOLOv6EfficientRep): + """CSPBep backbone used in YOLOv6. + Args: + arch (str): Architecture of BaseDarknet, from {P5, P6}. + Defaults to P5. 
+ plugins (list[dict]): List of plugins for stages, each dict contains: + - cfg (dict, required): Cfg dict to build plugin. + - stages (tuple[bool], optional): Stages to apply plugin, length + should be same as 'num_stages'. + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + input_channels (int): Number of input image channels. Defaults to 3. + out_indices (Tuple[int]): Output from which stages. + Defaults to (2, 3, 4). + frozen_stages (int): Stages to be frozen (stop grad and set eval + mode). -1 means not freezing any parameters. Defaults to -1. + norm_cfg (dict): Dictionary to construct and config norm layer. + Defaults to dict(type='BN', requires_grad=True). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='LeakyReLU', negative_slope=0.1). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Defaults to False. + block_cfg (dict): Config dict for the block used to build each + layer. Defaults to dict(type='RepVGGBlock'). + block_act_cfg (dict): Config dict for activation layer used in each + stage. Defaults to dict(type='SiLU', inplace=True). + init_cfg (Union[dict, list[dict]], optional): Initialization config + dict. Defaults to None. + Example: + >>> from mmyolo.models import YOLOv6CSPBep + >>> import torch + >>> model = YOLOv6CSPBep() + >>> model.eval() + >>> inputs = torch.rand(1, 3, 416, 416) + >>> level_outputs = model(inputs) + >>> for level_out in level_outputs: + ... print(tuple(level_out.shape)) + ... + (1, 256, 52, 52) + (1, 512, 26, 26) + (1, 1024, 13, 13) + """ + # From left to right: + # in_channels, out_channels, num_blocks, use_spp + arch_settings = { + 'P5': [[64, 128, 6, False], [128, 256, 12, False], + [256, 512, 18, False], [512, 1024, 6, True]] + } + + def __init__(self, + arch: str = 'P5', + plugins: Union[dict, List[dict]] = None, + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + input_channels: int = 3, + hidden_ratio: float = 0.5, + out_indices: Tuple[int] = (2, 3, 4), + frozen_stages: int = -1, + use_cspsppf: bool = False, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + norm_eval: bool = False, + block_cfg: ConfigType = dict(type='ConvWrapper'), + init_cfg: OptMultiConfig = None): + self.hidden_ratio = hidden_ratio + self.use_cspsppf = use_cspsppf + super().__init__( + arch=arch, + deepen_factor=deepen_factor, + widen_factor=widen_factor, + input_channels=input_channels, + out_indices=out_indices, + plugins=plugins, + frozen_stages=frozen_stages, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + norm_eval=norm_eval, + block_cfg=block_cfg, + init_cfg=init_cfg) + + def build_stage_layer(self, stage_idx: int, setting: list) -> list: + """Build a stage layer. + + Args: + stage_idx (int): The index of a stage layer. + setting (list): The architecture setting of a stage layer. 
+ """ + in_channels, out_channels, num_blocks, use_spp = setting + in_channels = int(in_channels * self.widen_factor) + out_channels = int(out_channels * self.widen_factor) + num_blocks = make_round(num_blocks, self.deepen_factor) + + rep_stage_block = BepC3StageBlock( + in_channels=out_channels, + out_channels=out_channels, + num_blocks=num_blocks, + hidden_ratio=self.hidden_ratio, + block_cfg=self.block_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + block_cfg = self.block_cfg.copy() + block_cfg.update( + dict( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=3, + stride=2)) + stage = [] + + ef_block = nn.Sequential(MODELS.build(block_cfg), rep_stage_block) + + stage.append(ef_block) + + if use_spp: + spp = SPPFBottleneck( + in_channels=out_channels, + out_channels=out_channels, + kernel_sizes=5, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + if self.use_cspsppf: + spp = CSPSPPFBottleneck( + in_channels=out_channels, + out_channels=out_channels, + kernel_sizes=5, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + stage.append(spp) + return stage diff --git a/third_party/mmyolo/mmyolo/models/backbones/yolov7_backbone.py b/third_party/mmyolo/mmyolo/models/backbones/yolov7_backbone.py new file mode 100644 index 0000000000000000000000000000000000000000..bb9a5eed85ca1ee6884f7348ef3745a9ceaba032 --- /dev/null +++ b/third_party/mmyolo/mmyolo/models/backbones/yolov7_backbone.py @@ -0,0 +1,285 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Tuple, Union + +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmdet.models.backbones.csp_darknet import Focus +from mmdet.utils import ConfigType, OptMultiConfig + +from mmyolo.registry import MODELS +from ..layers import MaxPoolAndStrideConvBlock +from .base_backbone import BaseBackbone + + +@MODELS.register_module() +class YOLOv7Backbone(BaseBackbone): + """Backbone used in YOLOv7. + + Args: + arch (str): Architecture of YOLOv7Defaults to L. + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + out_indices (Sequence[int]): Output from which stages. + Defaults to (2, 3, 4). + frozen_stages (int): Stages to be frozen (stop grad and set eval + mode). -1 means not freezing any parameters. Defaults to -1. + plugins (list[dict]): List of plugins for stages, each dict contains: + + - cfg (dict, required): Cfg dict to build plugin. + - stages (tuple[bool], optional): Stages to apply plugin, length + should be same as 'num_stages'. + norm_cfg (:obj:`ConfigDict` or dict): Dictionary to construct and + config norm layer. Defaults to dict(type='BN', requires_grad=True). + act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. + init_cfg (:obj:`ConfigDict` or dict or list[dict] or + list[:obj:`ConfigDict`]): Initialization config dict. 
+ """ + _tiny_stage1_cfg = dict(type='TinyDownSampleBlock', middle_ratio=0.5) + _tiny_stage2_4_cfg = dict(type='TinyDownSampleBlock', middle_ratio=1.0) + _l_expand_channel_2x = dict( + type='ELANBlock', + middle_ratio=0.5, + block_ratio=0.5, + num_blocks=2, + num_convs_in_block=2) + _l_no_change_channel = dict( + type='ELANBlock', + middle_ratio=0.25, + block_ratio=0.25, + num_blocks=2, + num_convs_in_block=2) + _x_expand_channel_2x = dict( + type='ELANBlock', + middle_ratio=0.4, + block_ratio=0.4, + num_blocks=3, + num_convs_in_block=2) + _x_no_change_channel = dict( + type='ELANBlock', + middle_ratio=0.2, + block_ratio=0.2, + num_blocks=3, + num_convs_in_block=2) + _w_no_change_channel = dict( + type='ELANBlock', + middle_ratio=0.5, + block_ratio=0.5, + num_blocks=2, + num_convs_in_block=2) + _e_no_change_channel = dict( + type='ELANBlock', + middle_ratio=0.4, + block_ratio=0.4, + num_blocks=3, + num_convs_in_block=2) + _d_no_change_channel = dict( + type='ELANBlock', + middle_ratio=1 / 3, + block_ratio=1 / 3, + num_blocks=4, + num_convs_in_block=2) + _e2e_no_change_channel = dict( + type='EELANBlock', + num_elan_block=2, + middle_ratio=0.4, + block_ratio=0.4, + num_blocks=3, + num_convs_in_block=2) + + # From left to right: + # in_channels, out_channels, Block_params + arch_settings = { + 'Tiny': [[64, 64, _tiny_stage1_cfg], [64, 128, _tiny_stage2_4_cfg], + [128, 256, _tiny_stage2_4_cfg], + [256, 512, _tiny_stage2_4_cfg]], + 'L': [[64, 256, _l_expand_channel_2x], + [256, 512, _l_expand_channel_2x], + [512, 1024, _l_expand_channel_2x], + [1024, 1024, _l_no_change_channel]], + 'X': [[80, 320, _x_expand_channel_2x], + [320, 640, _x_expand_channel_2x], + [640, 1280, _x_expand_channel_2x], + [1280, 1280, _x_no_change_channel]], + 'W': + [[64, 128, _w_no_change_channel], [128, 256, _w_no_change_channel], + [256, 512, _w_no_change_channel], [512, 768, _w_no_change_channel], + [768, 1024, _w_no_change_channel]], + 'E': + [[80, 160, _e_no_change_channel], [160, 320, _e_no_change_channel], + [320, 640, _e_no_change_channel], [640, 960, _e_no_change_channel], + [960, 1280, _e_no_change_channel]], + 'D': [[96, 192, + _d_no_change_channel], [192, 384, _d_no_change_channel], + [384, 768, _d_no_change_channel], + [768, 1152, _d_no_change_channel], + [1152, 1536, _d_no_change_channel]], + 'E2E': [[80, 160, _e2e_no_change_channel], + [160, 320, _e2e_no_change_channel], + [320, 640, _e2e_no_change_channel], + [640, 960, _e2e_no_change_channel], + [960, 1280, _e2e_no_change_channel]], + } + + def __init__(self, + arch: str = 'L', + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + input_channels: int = 3, + out_indices: Tuple[int] = (2, 3, 4), + frozen_stages: int = -1, + plugins: Union[dict, List[dict]] = None, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + norm_eval: bool = False, + init_cfg: OptMultiConfig = None): + assert arch in self.arch_settings.keys() + self.arch = arch + super().__init__( + self.arch_settings[arch], + deepen_factor, + widen_factor, + input_channels=input_channels, + out_indices=out_indices, + plugins=plugins, + frozen_stages=frozen_stages, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + norm_eval=norm_eval, + init_cfg=init_cfg) + + def build_stem_layer(self) -> nn.Module: + """Build a stem layer.""" + if self.arch in ['L', 'X']: + stem = nn.Sequential( + ConvModule( + 3, + int(self.arch_setting[0][0] * self.widen_factor // 2), + 3, + padding=1, + stride=1, + norm_cfg=self.norm_cfg, + 
act_cfg=self.act_cfg), + ConvModule( + int(self.arch_setting[0][0] * self.widen_factor // 2), + int(self.arch_setting[0][0] * self.widen_factor), + 3, + padding=1, + stride=2, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + ConvModule( + int(self.arch_setting[0][0] * self.widen_factor), + int(self.arch_setting[0][0] * self.widen_factor), + 3, + padding=1, + stride=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + elif self.arch == 'Tiny': + stem = nn.Sequential( + ConvModule( + 3, + int(self.arch_setting[0][0] * self.widen_factor // 2), + 3, + padding=1, + stride=2, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + ConvModule( + int(self.arch_setting[0][0] * self.widen_factor // 2), + int(self.arch_setting[0][0] * self.widen_factor), + 3, + padding=1, + stride=2, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + elif self.arch in ['W', 'E', 'D', 'E2E']: + stem = Focus( + 3, + int(self.arch_setting[0][0] * self.widen_factor), + kernel_size=3, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + return stem + + def build_stage_layer(self, stage_idx: int, setting: list) -> list: + """Build a stage layer. + + Args: + stage_idx (int): The index of a stage layer. + setting (list): The architecture setting of a stage layer. + """ + in_channels, out_channels, stage_block_cfg = setting + in_channels = int(in_channels * self.widen_factor) + out_channels = int(out_channels * self.widen_factor) + + stage_block_cfg = stage_block_cfg.copy() + stage_block_cfg.setdefault('norm_cfg', self.norm_cfg) + stage_block_cfg.setdefault('act_cfg', self.act_cfg) + + stage_block_cfg['in_channels'] = in_channels + stage_block_cfg['out_channels'] = out_channels + + stage = [] + if self.arch in ['W', 'E', 'D', 'E2E']: + stage_block_cfg['in_channels'] = out_channels + elif self.arch in ['L', 'X']: + if stage_idx == 0: + stage_block_cfg['in_channels'] = out_channels // 2 + + downsample_layer = self._build_downsample_layer( + stage_idx, in_channels, out_channels) + stage.append(MODELS.build(stage_block_cfg)) + if downsample_layer is not None: + stage.insert(0, downsample_layer) + return stage + + def _build_downsample_layer(self, stage_idx: int, in_channels: int, + out_channels: int) -> Optional[nn.Module]: + """Build a downsample layer pre stage.""" + if self.arch in ['E', 'D', 'E2E']: + downsample_layer = MaxPoolAndStrideConvBlock( + in_channels, + out_channels, + use_in_channels_of_middle=True, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + elif self.arch == 'W': + downsample_layer = ConvModule( + in_channels, + out_channels, + 3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + elif self.arch == 'Tiny': + if stage_idx != 0: + downsample_layer = nn.MaxPool2d(2, 2) + else: + downsample_layer = None + elif self.arch in ['L', 'X']: + if stage_idx == 0: + downsample_layer = ConvModule( + in_channels, + out_channels // 2, + 3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + else: + downsample_layer = MaxPoolAndStrideConvBlock( + in_channels, + in_channels, + use_in_channels_of_middle=False, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + return downsample_layer diff --git a/third_party/mmyolo/mmyolo/models/data_preprocessors/__init__.py b/third_party/mmyolo/mmyolo/models/data_preprocessors/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3ef4f6d7d801cb8150ebca645ddb3cbf5d1b9599 --- /dev/null +++ b/third_party/mmyolo/mmyolo/models/data_preprocessors/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) OpenMMLab. 
All rights reserved. +from .data_preprocessor import (PPYOLOEBatchRandomResize, + PPYOLOEDetDataPreprocessor, + YOLOv5DetDataPreprocessor, + YOLOXBatchSyncRandomResize) + +__all__ = [ + 'YOLOv5DetDataPreprocessor', 'PPYOLOEDetDataPreprocessor', + 'PPYOLOEBatchRandomResize', 'YOLOXBatchSyncRandomResize' +] diff --git a/third_party/mmyolo/mmyolo/models/data_preprocessors/data_preprocessor.py b/third_party/mmyolo/mmyolo/models/data_preprocessors/data_preprocessor.py new file mode 100644 index 0000000000000000000000000000000000000000..a29b90844323836e0264f827edf27aa20dca2507 --- /dev/null +++ b/third_party/mmyolo/mmyolo/models/data_preprocessors/data_preprocessor.py @@ -0,0 +1,310 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import random +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +from mmdet.models import BatchSyncRandomResize +from mmdet.models.data_preprocessors import DetDataPreprocessor +from mmengine import MessageHub, is_list_of +from mmengine.structures import BaseDataElement +from torch import Tensor + +from mmyolo.registry import MODELS + +CastData = Union[tuple, dict, BaseDataElement, torch.Tensor, list, bytes, str, + None] + + +@MODELS.register_module() +class YOLOXBatchSyncRandomResize(BatchSyncRandomResize): + """YOLOX batch random resize. + + Args: + random_size_range (tuple): The multi-scale random range during + multi-scale training. + interval (int): The iter interval of change + image size. Defaults to 10. + size_divisor (int): Image size divisible factor. + Defaults to 32. + """ + + def forward(self, inputs: Tensor, data_samples: dict) -> Tensor and dict: + """resize a batch of images and bboxes to shape ``self._input_size``""" + h, w = inputs.shape[-2:] + inputs = inputs.float() + assert isinstance(data_samples, dict) + + if self._input_size is None: + self._input_size = (h, w) + scale_y = self._input_size[0] / h + scale_x = self._input_size[1] / w + if scale_x != 1 or scale_y != 1: + inputs = F.interpolate( + inputs, + size=self._input_size, + mode='bilinear', + align_corners=False) + + data_samples['bboxes_labels'][:, 2::2] *= scale_x + data_samples['bboxes_labels'][:, 3::2] *= scale_y + + if 'keypoints' in data_samples: + data_samples['keypoints'][..., 0] *= scale_x + data_samples['keypoints'][..., 1] *= scale_y + + message_hub = MessageHub.get_current_instance() + if (message_hub.get_info('iter') + 1) % self._interval == 0: + self._input_size = self._get_random_size( + aspect_ratio=float(w / h), device=inputs.device) + + return inputs, data_samples + + +@MODELS.register_module() +class YOLOv5DetDataPreprocessor(DetDataPreprocessor): + """Rewrite collate_fn to get faster training speed. + + Note: It must be used together with `mmyolo.datasets.utils.yolov5_collate` + """ + + def __init__(self, *args, non_blocking: Optional[bool] = True, **kwargs): + super().__init__(*args, non_blocking=non_blocking, **kwargs) + + def forward(self, data: dict, training: bool = False) -> dict: + """Perform normalization, padding and bgr2rgb conversion based on + ``DetDataPreprocessorr``. + + Args: + data (dict): Data sampled from dataloader. + training (bool): Whether to enable training time augmentation. + + Returns: + dict: Data in the same format as the model input. 
+ """ + if not training: + return super().forward(data, training) + + data = self.cast_data(data) + inputs, data_samples = data['inputs'], data['data_samples'] + assert isinstance(data['data_samples'], dict) + + # TODO: Supports multi-scale training + if self._channel_conversion and inputs.shape[1] == 3: + inputs = inputs[:, [2, 1, 0], ...] + if self._enable_normalize: + inputs = (inputs - self.mean) / self.std + + if self.batch_augments is not None: + for batch_aug in self.batch_augments: + inputs, data_samples = batch_aug(inputs, data_samples) + + img_metas = [{'batch_input_shape': inputs.shape[2:]}] * len(inputs) + data_samples_output = { + 'bboxes_labels': data_samples['bboxes_labels'], + 'img_metas': img_metas + } + if 'masks' in data_samples: + data_samples_output['masks'] = data_samples['masks'] + if 'keypoints' in data_samples: + data_samples_output['keypoints'] = data_samples['keypoints'] + data_samples_output['keypoints_visible'] = data_samples[ + 'keypoints_visible'] + + return {'inputs': inputs, 'data_samples': data_samples_output} + + +@MODELS.register_module() +class PPYOLOEDetDataPreprocessor(DetDataPreprocessor): + """Image pre-processor for detection tasks. + + The main difference between PPYOLOEDetDataPreprocessor and + DetDataPreprocessor is the normalization order. The official + PPYOLOE resize image first, and then normalize image. + In DetDataPreprocessor, the order is reversed. + + Note: It must be used together with + `mmyolo.datasets.utils.yolov5_collate` + """ + + def forward(self, data: dict, training: bool = False) -> dict: + """Perform normalization、padding and bgr2rgb conversion based on + ``BaseDataPreprocessor``. This class use batch_augments first, and then + normalize the image, which is different from the `DetDataPreprocessor` + . + + Args: + data (dict): Data sampled from dataloader. + training (bool): Whether to enable training time augmentation. + + Returns: + dict: Data in the same format as the model input. + """ + if not training: + return super().forward(data, training) + + assert isinstance(data['inputs'], list) and is_list_of( + data['inputs'], torch.Tensor), \ + '"inputs" should be a list of Tensor, but got ' \ + f'{type(data["inputs"])}. The possible reason for this ' \ + 'is that you are not using it with ' \ + '"mmyolo.datasets.utils.yolov5_collate". Please refer to ' \ + '"cconfigs/ppyoloe/ppyoloe_plus_s_fast_8xb8-80e_coco.py".' + + data = self.cast_data(data) + inputs, data_samples = data['inputs'], data['data_samples'] + assert isinstance(data['data_samples'], dict) + + # Process data. + batch_inputs = [] + for _input in inputs: + # channel transform + if self._channel_conversion: + _input = _input[[2, 1, 0], ...] + # Convert to float after channel conversion to ensure + # efficiency + _input = _input.float() + batch_inputs.append(_input) + + # Batch random resize image. + if self.batch_augments is not None: + for batch_aug in self.batch_augments: + inputs, data_samples = batch_aug(batch_inputs, data_samples) + + if self._enable_normalize: + inputs = (inputs - self.mean) / self.std + + img_metas = [{'batch_input_shape': inputs.shape[2:]}] * len(inputs) + data_samples = { + 'bboxes_labels': data_samples['bboxes_labels'], + 'img_metas': img_metas + } + + return {'inputs': inputs, 'data_samples': data_samples} + + +# TODO: No generality. Its input data format is different +# mmdet's batch aug, and it must be compatible in the future. 
+@MODELS.register_module() +class PPYOLOEBatchRandomResize(BatchSyncRandomResize): + """PPYOLOE batch random resize. + + Args: + random_size_range (tuple): The multi-scale random range during + multi-scale training. + interval (int): The iter interval of change + image size. Defaults to 10. + size_divisor (int): Image size divisible factor. + Defaults to 32. + random_interp (bool): Whether to choose interp_mode randomly. + If set to True, the type of `interp_mode` must be list. + If set to False, the type of `interp_mode` must be str. + Defaults to True. + interp_mode (Union[List, str]): The modes available for resizing + are ('nearest', 'bilinear', 'bicubic', 'area'). + keep_ratio (bool): Whether to keep the aspect ratio when resizing + the image. Now we only support keep_ratio=False. + Defaults to False. + """ + + def __init__(self, + random_size_range: Tuple[int, int], + interval: int = 1, + size_divisor: int = 32, + random_interp=True, + interp_mode: Union[List[str], str] = [ + 'nearest', 'bilinear', 'bicubic', 'area' + ], + keep_ratio: bool = False) -> None: + super().__init__(random_size_range, interval, size_divisor) + self.random_interp = random_interp + self.keep_ratio = keep_ratio + # TODO: need to support keep_ratio==True + assert not self.keep_ratio, 'We do not yet support keep_ratio=True' + + if self.random_interp: + assert isinstance(interp_mode, list) and len(interp_mode) > 1,\ + 'While random_interp==True, the type of `interp_mode`' \ + ' must be list and len(interp_mode) must large than 1' + self.interp_mode_list = interp_mode + self.interp_mode = None + else: + assert isinstance(interp_mode, str),\ + 'While random_interp==False, the type of ' \ + '`interp_mode` must be str' + assert interp_mode in ['nearest', 'bilinear', 'bicubic', 'area'] + self.interp_mode_list = None + self.interp_mode = interp_mode + + def forward(self, inputs: list, + data_samples: dict) -> Tuple[Tensor, Tensor]: + """Resize a batch of images and bboxes to shape ``self._input_size``. + + The inputs and data_samples should be list, and + ``PPYOLOEBatchRandomResize`` must be used with + ``PPYOLOEDetDataPreprocessor`` and ``yolov5_collate`` with + ``use_ms_training == True``. + """ + assert isinstance(inputs, list),\ + 'The type of inputs must be list. The possible reason for this ' \ + 'is that you are not using it with `PPYOLOEDetDataPreprocessor` ' \ + 'and `yolov5_collate` with use_ms_training == True.' + + bboxes_labels = data_samples['bboxes_labels'] + + message_hub = MessageHub.get_current_instance() + if (message_hub.get_info('iter') + 1) % self._interval == 0: + # get current input size + self._input_size, interp_mode = self._get_random_size_and_interp() + if self.random_interp: + self.interp_mode = interp_mode + + # TODO: need to support type(inputs)==Tensor + if isinstance(inputs, list): + outputs = [] + for i in range(len(inputs)): + _batch_input = inputs[i] + h, w = _batch_input.shape[-2:] + scale_y = self._input_size[0] / h + scale_x = self._input_size[1] / w + if scale_x != 1. 
or scale_y != 1.: + if self.interp_mode in ('nearest', 'area'): + align_corners = None + else: + align_corners = False + _batch_input = F.interpolate( + _batch_input.unsqueeze(0), + size=self._input_size, + mode=self.interp_mode, + align_corners=align_corners) + + # rescale boxes + indexes = bboxes_labels[:, 0] == i + bboxes_labels[indexes, 2] *= scale_x + bboxes_labels[indexes, 3] *= scale_y + bboxes_labels[indexes, 4] *= scale_x + bboxes_labels[indexes, 5] *= scale_y + + data_samples['bboxes_labels'] = bboxes_labels + else: + _batch_input = _batch_input.unsqueeze(0) + + outputs.append(_batch_input) + + # convert to Tensor + return torch.cat(outputs, dim=0), data_samples + else: + raise NotImplementedError('Not implemented yet!') + + def _get_random_size_and_interp(self) -> Tuple[int, int]: + """Randomly generate a shape in ``_random_size_range`` and a + interp_mode in interp_mode_list.""" + size = random.randint(*self._random_size_range) + input_size = (self._size_divisor * size, self._size_divisor * size) + + if self.random_interp: + interp_ind = random.randint(0, len(self.interp_mode_list) - 1) + interp_mode = self.interp_mode_list[interp_ind] + else: + interp_mode = None + return input_size, interp_mode diff --git a/third_party/mmyolo/mmyolo/models/dense_heads/__init__.py b/third_party/mmyolo/mmyolo/models/dense_heads/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..90587c3fbb280082262d48b031a64ea7c69b3dec --- /dev/null +++ b/third_party/mmyolo/mmyolo/models/dense_heads/__init__.py @@ -0,0 +1,23 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .ppyoloe_head import PPYOLOEHead, PPYOLOEHeadModule +from .rtmdet_head import RTMDetHead, RTMDetSepBNHeadModule +from .rtmdet_ins_head import RTMDetInsSepBNHead, RTMDetInsSepBNHeadModule +from .rtmdet_rotated_head import (RTMDetRotatedHead, + RTMDetRotatedSepBNHeadModule) +from .yolov5_head import YOLOv5Head, YOLOv5HeadModule +from .yolov5_ins_head import YOLOv5InsHead, YOLOv5InsHeadModule +from .yolov6_head import YOLOv6Head, YOLOv6HeadModule +from .yolov7_head import YOLOv7Head, YOLOv7HeadModule, YOLOv7p6HeadModule +from .yolov8_head import YOLOv8Head, YOLOv8HeadModule +from .yolox_head import YOLOXHead, YOLOXHeadModule +from .yolox_pose_head import YOLOXPoseHead, YOLOXPoseHeadModule + +__all__ = [ + 'YOLOv5Head', 'YOLOv6Head', 'YOLOXHead', 'YOLOv5HeadModule', + 'YOLOv6HeadModule', 'YOLOXHeadModule', 'RTMDetHead', + 'RTMDetSepBNHeadModule', 'YOLOv7Head', 'PPYOLOEHead', 'PPYOLOEHeadModule', + 'YOLOv7HeadModule', 'YOLOv7p6HeadModule', 'YOLOv8Head', 'YOLOv8HeadModule', + 'RTMDetRotatedHead', 'RTMDetRotatedSepBNHeadModule', 'RTMDetInsSepBNHead', + 'RTMDetInsSepBNHeadModule', 'YOLOv5InsHead', 'YOLOv5InsHeadModule', + 'YOLOXPoseHead', 'YOLOXPoseHeadModule' +] diff --git a/third_party/mmyolo/mmyolo/models/dense_heads/ppyoloe_head.py b/third_party/mmyolo/mmyolo/models/dense_heads/ppyoloe_head.py new file mode 100644 index 0000000000000000000000000000000000000000..f4689876785c40cbd7449cab8f378c8f6d1c1b89 --- /dev/null +++ b/third_party/mmyolo/mmyolo/models/dense_heads/ppyoloe_head.py @@ -0,0 +1,374 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
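+"""PPYOLOE detection head.
+
+A minimal usage sketch for the bare head module (illustrative; the channel
+and feature-map sizes below are example values, not a specific PPYOLOE
+config):
+
+    >>> import torch
+    >>> from mmyolo.models.dense_heads import PPYOLOEHeadModule
+    >>> head_module = PPYOLOEHeadModule(
+    ...     num_classes=80, in_channels=[256, 512, 1024])
+    >>> feats = [torch.rand(1, 256, 80, 80), torch.rand(1, 512, 40, 40),
+    ...          torch.rand(1, 1024, 20, 20)]
+    >>> # in train mode the module also returns the distribution logits
+    >>> cls_scores, bbox_preds, bbox_dist_preds = head_module(feats)
+    >>> # each cls_score is (1, num_classes, H, W), each bbox_pred (1, 4, H, W)
+"""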
+from typing import Sequence, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmdet.models.utils import multi_apply +from mmdet.utils import (ConfigType, OptConfigType, OptInstanceList, + OptMultiConfig, reduce_mean) +from mmengine import MessageHub +from mmengine.model import BaseModule, bias_init_with_prob +from mmengine.structures import InstanceData +from torch import Tensor + +from mmyolo.registry import MODELS +from ..layers.yolo_bricks import PPYOLOESELayer +from ..utils import gt_instances_preprocess +from .yolov6_head import YOLOv6Head + + +@MODELS.register_module() +class PPYOLOEHeadModule(BaseModule): + """PPYOLOEHead head module used in `PPYOLOE. + + `_. + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + num_base_priors (int): The number of priors (points) at a point + on the feature grid. + featmap_strides (Sequence[int]): Downsample factor of each feature map. + Defaults to (8, 16, 32). + reg_max (int): Max value of integral set :math: ``{0, ..., reg_max}`` + in QFL setting. Defaults to 16. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + num_classes: int, + in_channels: Union[int, Sequence], + widen_factor: float = 1.0, + num_base_priors: int = 1, + featmap_strides: Sequence[int] = (8, 16, 32), + reg_max: int = 16, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.1, eps=1e-5), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None): + super().__init__(init_cfg=init_cfg) + + self.num_classes = num_classes + self.featmap_strides = featmap_strides + self.num_levels = len(self.featmap_strides) + self.num_base_priors = num_base_priors + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.reg_max = reg_max + + if isinstance(in_channels, int): + self.in_channels = [int(in_channels * widen_factor) + ] * self.num_levels + else: + self.in_channels = [int(i * widen_factor) for i in in_channels] + + self._init_layers() + + def init_weights(self, prior_prob=0.01): + """Initialize the weight and bias of PPYOLOE head.""" + super().init_weights() + for conv in self.cls_preds: + conv.bias.data.fill_(bias_init_with_prob(prior_prob)) + conv.weight.data.fill_(0.) + + for conv in self.reg_preds: + conv.bias.data.fill_(1.0) + conv.weight.data.fill_(0.) 
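+        # Note: ``bias_init_with_prob(p)`` equals ``-log((1 - p) / p)``, so the
+        # default ``prior_prob=0.01`` sets the classification bias to roughly
+        # -4.6, i.e. initial foreground scores of about 0.01 after sigmoid
+        # (the usual focal-loss style prior initialization).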
+ + def _init_layers(self): + """initialize conv layers in PPYOLOE head.""" + self.cls_preds = nn.ModuleList() + self.reg_preds = nn.ModuleList() + self.cls_stems = nn.ModuleList() + self.reg_stems = nn.ModuleList() + + for in_channel in self.in_channels: + self.cls_stems.append( + PPYOLOESELayer( + in_channel, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg)) + self.reg_stems.append( + PPYOLOESELayer( + in_channel, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg)) + + for in_channel in self.in_channels: + self.cls_preds.append( + nn.Conv2d(in_channel, self.num_classes, 3, padding=1)) + self.reg_preds.append( + nn.Conv2d(in_channel, 4 * (self.reg_max + 1), 3, padding=1)) + + # init proj + proj = torch.arange(self.reg_max + 1, dtype=torch.float) + self.register_buffer('proj', proj, persistent=False) + + def forward(self, x: Tuple[Tensor]) -> Tensor: + """Forward features from the upstream network. + + Args: + x (Tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + Returns: + Tuple[List]: A tuple of multi-level classification scores, bbox + predictions. + """ + assert len(x) == self.num_levels + + return multi_apply(self.forward_single, x, self.cls_stems, + self.cls_preds, self.reg_stems, self.reg_preds) + + def forward_single(self, x: Tensor, cls_stem: nn.ModuleList, + cls_pred: nn.ModuleList, reg_stem: nn.ModuleList, + reg_pred: nn.ModuleList) -> Tensor: + """Forward feature of a single scale level.""" + b, _, h, w = x.shape + avg_feat = F.adaptive_avg_pool2d(x, (1, 1)) + cls_logit = cls_pred(cls_stem(x, avg_feat) + x) + bbox_dist_preds = reg_pred(reg_stem(x, avg_feat)) + if self.reg_max > 1: + bbox_dist_preds = bbox_dist_preds.reshape( + [-1, 4, self.reg_max + 1, h * w]).permute(0, 3, 1, 2) + bbox_preds = bbox_dist_preds.softmax(3).matmul( + self.proj.view([-1, 1])).squeeze(-1) + bbox_preds = bbox_preds.transpose(1, 2).reshape(b, -1, h, w) + else: + bbox_preds = bbox_dist_preds + if self.training: + return cls_logit, bbox_preds, bbox_dist_preds + else: + return cls_logit, bbox_preds + + +@MODELS.register_module() +class PPYOLOEHead(YOLOv6Head): + """PPYOLOEHead head used in `PPYOLOE `_. + The YOLOv6 head and the PPYOLOE head are only slightly different. + Distribution focal loss is extra used in PPYOLOE, but not in YOLOv6. + + Args: + head_module(ConfigType): Base module used for YOLOv5Head + prior_generator(dict): Points generator feature maps in + 2D points-based detectors. + bbox_coder (:obj:`ConfigDict` or dict): Config of bbox coder. + loss_cls (:obj:`ConfigDict` or dict): Config of classification loss. + loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss. + loss_dfl (:obj:`ConfigDict` or dict): Config of distribution focal + loss. + train_cfg (:obj:`ConfigDict` or dict, optional): Training config of + anchor head. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of + anchor head. Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. 
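+
+    Example:
+        A head config sketch as it might appear in a model config (values are
+        illustrative; see ``configs/ppyoloe`` for the real configs):
+
+        >>> head = dict(
+        ...     type='PPYOLOEHead',
+        ...     head_module=dict(
+        ...         type='PPYOLOEHeadModule',
+        ...         num_classes=80,
+        ...         in_channels=[256, 512, 1024],
+        ...         featmap_strides=[8, 16, 32],
+        ...         reg_max=16))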
+ """ + + def __init__(self, + head_module: ConfigType, + prior_generator: ConfigType = dict( + type='mmdet.MlvlPointGenerator', + offset=0.5, + strides=[8, 16, 32]), + bbox_coder: ConfigType = dict(type='DistancePointBBoxCoder'), + loss_cls: ConfigType = dict( + type='mmdet.VarifocalLoss', + use_sigmoid=True, + alpha=0.75, + gamma=2.0, + iou_weighted=True, + reduction='sum', + loss_weight=1.0), + loss_bbox: ConfigType = dict( + type='IoULoss', + iou_mode='giou', + bbox_format='xyxy', + reduction='mean', + loss_weight=2.5, + return_iou=False), + loss_dfl: ConfigType = dict( + type='mmdet.DistributionFocalLoss', + reduction='mean', + loss_weight=0.5 / 4), + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + init_cfg: OptMultiConfig = None): + super().__init__( + head_module=head_module, + prior_generator=prior_generator, + bbox_coder=bbox_coder, + loss_cls=loss_cls, + loss_bbox=loss_bbox, + train_cfg=train_cfg, + test_cfg=test_cfg, + init_cfg=init_cfg) + self.loss_dfl = MODELS.build(loss_dfl) + # ppyoloe doesn't need loss_obj + self.loss_obj = None + + def loss_by_feat( + self, + cls_scores: Sequence[Tensor], + bbox_preds: Sequence[Tensor], + bbox_dist_preds: Sequence[Tensor], + batch_gt_instances: Sequence[InstanceData], + batch_img_metas: Sequence[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (Sequence[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_priors * num_classes. + bbox_preds (Sequence[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_priors * 4. + bbox_dist_preds (Sequence[Tensor]): Box distribution logits for + each scale level with shape (bs, reg_max + 1, H*W, 4). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + Returns: + dict[str, Tensor]: A dictionary of losses. 
+ """ + + # get epoch information from message hub + message_hub = MessageHub.get_current_instance() + current_epoch = message_hub.get_info('epoch') + + num_imgs = len(batch_img_metas) + + current_featmap_sizes = [ + cls_score.shape[2:] for cls_score in cls_scores + ] + # If the shape does not equal, generate new one + if current_featmap_sizes != self.featmap_sizes_train: + self.featmap_sizes_train = current_featmap_sizes + + mlvl_priors_with_stride = self.prior_generator.grid_priors( + self.featmap_sizes_train, + dtype=cls_scores[0].dtype, + device=cls_scores[0].device, + with_stride=True) + + self.num_level_priors = [len(n) for n in mlvl_priors_with_stride] + self.flatten_priors_train = torch.cat( + mlvl_priors_with_stride, dim=0) + self.stride_tensor = self.flatten_priors_train[..., [2]] + + # gt info + gt_info = gt_instances_preprocess(batch_gt_instances, num_imgs) + gt_labels = gt_info[:, :, :1] + gt_bboxes = gt_info[:, :, 1:] # xyxy + pad_bbox_flag = (gt_bboxes.sum(-1, keepdim=True) > 0).float() + + # pred info + flatten_cls_preds = [ + cls_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.num_classes) + for cls_pred in cls_scores + ] + flatten_pred_bboxes = [ + bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) + for bbox_pred in bbox_preds + ] + # (bs, reg_max+1, n, 4) -> (bs, n, 4, reg_max+1) + flatten_pred_dists = [ + bbox_pred_org.permute(0, 2, 3, 1).reshape( + num_imgs, -1, (self.head_module.reg_max + 1) * 4) + for bbox_pred_org in bbox_dist_preds + ] + + flatten_dist_preds = torch.cat(flatten_pred_dists, dim=1) + flatten_cls_preds = torch.cat(flatten_cls_preds, dim=1) + flatten_pred_bboxes = torch.cat(flatten_pred_bboxes, dim=1) + flatten_pred_bboxes = self.bbox_coder.decode( + self.flatten_priors_train[..., :2], flatten_pred_bboxes, + self.stride_tensor[..., 0]) + pred_scores = torch.sigmoid(flatten_cls_preds) + + if current_epoch < self.initial_epoch: + assigned_result = self.initial_assigner( + flatten_pred_bboxes.detach(), self.flatten_priors_train, + self.num_level_priors, gt_labels, gt_bboxes, pad_bbox_flag) + else: + assigned_result = self.assigner(flatten_pred_bboxes.detach(), + pred_scores.detach(), + self.flatten_priors_train, + gt_labels, gt_bboxes, + pad_bbox_flag) + + assigned_bboxes = assigned_result['assigned_bboxes'] + assigned_scores = assigned_result['assigned_scores'] + fg_mask_pre_prior = assigned_result['fg_mask_pre_prior'] + + # cls loss + with torch.cuda.amp.autocast(enabled=False): + loss_cls = self.loss_cls(flatten_cls_preds, assigned_scores) + + # rescale bbox + assigned_bboxes /= self.stride_tensor + flatten_pred_bboxes /= self.stride_tensor + + assigned_scores_sum = assigned_scores.sum() + # reduce_mean between all gpus + assigned_scores_sum = torch.clamp( + reduce_mean(assigned_scores_sum), min=1) + loss_cls /= assigned_scores_sum + + # select positive samples mask + num_pos = fg_mask_pre_prior.sum() + if num_pos > 0: + # when num_pos > 0, assigned_scores_sum will >0, so the loss_bbox + # will not report an error + # iou loss + prior_bbox_mask = fg_mask_pre_prior.unsqueeze(-1).repeat([1, 1, 4]) + pred_bboxes_pos = torch.masked_select( + flatten_pred_bboxes, prior_bbox_mask).reshape([-1, 4]) + assigned_bboxes_pos = torch.masked_select( + assigned_bboxes, prior_bbox_mask).reshape([-1, 4]) + bbox_weight = torch.masked_select( + assigned_scores.sum(-1), fg_mask_pre_prior).unsqueeze(-1) + loss_bbox = self.loss_bbox( + pred_bboxes_pos, + assigned_bboxes_pos, + weight=bbox_weight, + avg_factor=assigned_scores_sum) + + # dfl loss + dist_mask = 
fg_mask_pre_prior.unsqueeze(-1).repeat( + [1, 1, (self.head_module.reg_max + 1) * 4]) + + pred_dist_pos = torch.masked_select( + flatten_dist_preds, + dist_mask).reshape([-1, 4, self.head_module.reg_max + 1]) + assigned_ltrb = self.bbox_coder.encode( + self.flatten_priors_train[..., :2] / self.stride_tensor, + assigned_bboxes, + max_dis=self.head_module.reg_max, + eps=0.01) + assigned_ltrb_pos = torch.masked_select( + assigned_ltrb, prior_bbox_mask).reshape([-1, 4]) + loss_dfl = self.loss_dfl( + pred_dist_pos.reshape(-1, self.head_module.reg_max + 1), + assigned_ltrb_pos.reshape(-1), + weight=bbox_weight.expand(-1, 4).reshape(-1), + avg_factor=assigned_scores_sum) + else: + loss_bbox = flatten_pred_bboxes.sum() * 0 + loss_dfl = flatten_pred_bboxes.sum() * 0 + + return dict(loss_cls=loss_cls, loss_bbox=loss_bbox, loss_dfl=loss_dfl) diff --git a/third_party/mmyolo/mmyolo/models/dense_heads/rtmdet_head.py b/third_party/mmyolo/mmyolo/models/dense_heads/rtmdet_head.py new file mode 100644 index 0000000000000000000000000000000000000000..54245a97f404b66eba47e41f03302110c8894134 --- /dev/null +++ b/third_party/mmyolo/mmyolo/models/dense_heads/rtmdet_head.py @@ -0,0 +1,368 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Sequence, Tuple + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule, is_norm +from mmdet.models.task_modules.samplers import PseudoSampler +from mmdet.structures.bbox import distance2bbox +from mmdet.utils import (ConfigType, InstanceList, OptConfigType, + OptInstanceList, OptMultiConfig, reduce_mean) +from mmengine.model import (BaseModule, bias_init_with_prob, constant_init, + normal_init) +from torch import Tensor + +from mmyolo.registry import MODELS, TASK_UTILS +from ..utils import gt_instances_preprocess +from .yolov5_head import YOLOv5Head + + +@MODELS.register_module() +class RTMDetSepBNHeadModule(BaseModule): + """Detection Head of RTMDet. + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + num_base_priors (int): The number of priors (points) at a point + on the feature grid. Defaults to 1. + feat_channels (int): Number of hidden channels. Used in child classes. + Defaults to 256 + stacked_convs (int): Number of stacking convs of the head. + Defaults to 2. + featmap_strides (Sequence[int]): Downsample factor of each feature map. + Defaults to (8, 16, 32). + share_conv (bool): Whether to share conv layers between stages. + Defaults to True. + pred_kernel_size (int): Kernel size of ``nn.Conv2d``. Defaults to 1. + conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + convolution layer. Defaults to None. + norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization + layer. Defaults to ``dict(type='BN')``. + act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer. + Default: dict(type='SiLU', inplace=True). + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. 
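+
+    Example:
+        A shape sketch with the default strides and ``widen_factor`` (the
+        feature-map sizes are illustrative):
+
+        >>> import torch
+        >>> from mmyolo.models.dense_heads import RTMDetSepBNHeadModule
+        >>> head_module = RTMDetSepBNHeadModule(num_classes=80, in_channels=256)
+        >>> feats = [torch.rand(1, 256, s, s) for s in (80, 40, 20)]
+        >>> cls_scores, bbox_preds = head_module(feats)
+        >>> [tuple(p.shape) for p in bbox_preds]
+        [(1, 4, 80, 80), (1, 4, 40, 40), (1, 4, 20, 20)]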
+ """ + + def __init__( + self, + num_classes: int, + in_channels: int, + widen_factor: float = 1.0, + num_base_priors: int = 1, + feat_channels: int = 256, + stacked_convs: int = 2, + featmap_strides: Sequence[int] = [8, 16, 32], + share_conv: bool = True, + pred_kernel_size: int = 1, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict(type='BN'), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None, + ): + super().__init__(init_cfg=init_cfg) + self.share_conv = share_conv + self.num_classes = num_classes + self.pred_kernel_size = pred_kernel_size + self.feat_channels = int(feat_channels * widen_factor) + self.stacked_convs = stacked_convs + self.num_base_priors = num_base_priors + + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.featmap_strides = featmap_strides + + self.in_channels = int(in_channels * widen_factor) + + self._init_layers() + + def _init_layers(self): + """Initialize layers of the head.""" + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + + self.rtm_cls = nn.ModuleList() + self.rtm_reg = nn.ModuleList() + for n in range(len(self.featmap_strides)): + cls_convs = nn.ModuleList() + reg_convs = nn.ModuleList() + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + cls_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + reg_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + self.cls_convs.append(cls_convs) + self.reg_convs.append(reg_convs) + + self.rtm_cls.append( + nn.Conv2d( + self.feat_channels, + self.num_base_priors * self.num_classes, + self.pred_kernel_size, + padding=self.pred_kernel_size // 2)) + self.rtm_reg.append( + nn.Conv2d( + self.feat_channels, + self.num_base_priors * 4, + self.pred_kernel_size, + padding=self.pred_kernel_size // 2)) + + if self.share_conv: + for n in range(len(self.featmap_strides)): + for i in range(self.stacked_convs): + self.cls_convs[n][i].conv = self.cls_convs[0][i].conv + self.reg_convs[n][i].conv = self.reg_convs[0][i].conv + + def init_weights(self) -> None: + """Initialize weights of the head.""" + # Use prior in model initialization to improve stability + super().init_weights() + for m in self.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, mean=0, std=0.01) + if is_norm(m): + constant_init(m, 1) + bias_cls = bias_init_with_prob(0.01) + for rtm_cls, rtm_reg in zip(self.rtm_cls, self.rtm_reg): + normal_init(rtm_cls, std=0.01, bias=bias_cls) + normal_init(rtm_reg, std=0.01) + + def forward(self, feats: Tuple[Tensor, ...]) -> tuple: + """Forward features from the upstream network. + + Args: + feats (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: Usually a tuple of classification scores and bbox prediction + - cls_scores (list[Tensor]): Classification scores for all scale + levels, each is a 4D-tensor, the channels number is + num_base_priors * num_classes. + - bbox_preds (list[Tensor]): Box energies / deltas for all scale + levels, each is a 4D-tensor, the channels number is + num_base_priors * 4. 
+ """ + + cls_scores = [] + bbox_preds = [] + for idx, x in enumerate(feats): + cls_feat = x + reg_feat = x + + for cls_layer in self.cls_convs[idx]: + cls_feat = cls_layer(cls_feat) + cls_score = self.rtm_cls[idx](cls_feat) + + for reg_layer in self.reg_convs[idx]: + reg_feat = reg_layer(reg_feat) + + reg_dist = self.rtm_reg[idx](reg_feat) + cls_scores.append(cls_score) + bbox_preds.append(reg_dist) + return tuple(cls_scores), tuple(bbox_preds) + + +@MODELS.register_module() +class RTMDetHead(YOLOv5Head): + """RTMDet head. + + Args: + head_module(ConfigType): Base module used for RTMDetHead + prior_generator: Points generator feature maps in + 2D points-based detectors. + bbox_coder (:obj:`ConfigDict` or dict): Config of bbox coder. + loss_cls (:obj:`ConfigDict` or dict): Config of classification loss. + loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss. + train_cfg (:obj:`ConfigDict` or dict, optional): Training config of + anchor head. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of + anchor head. Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + head_module: ConfigType, + prior_generator: ConfigType = dict( + type='mmdet.MlvlPointGenerator', + offset=0, + strides=[8, 16, 32]), + bbox_coder: ConfigType = dict(type='DistancePointBBoxCoder'), + loss_cls: ConfigType = dict( + type='mmdet.QualityFocalLoss', + use_sigmoid=True, + beta=2.0, + loss_weight=1.0), + loss_bbox: ConfigType = dict( + type='mmdet.GIoULoss', loss_weight=2.0), + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + init_cfg: OptMultiConfig = None): + + super().__init__( + head_module=head_module, + prior_generator=prior_generator, + bbox_coder=bbox_coder, + loss_cls=loss_cls, + loss_bbox=loss_bbox, + train_cfg=train_cfg, + test_cfg=test_cfg, + init_cfg=init_cfg) + + self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False) + if self.use_sigmoid_cls: + self.cls_out_channels = self.num_classes + else: + self.cls_out_channels = self.num_classes + 1 + # rtmdet doesn't need loss_obj + self.loss_obj = None + + def special_init(self): + """Since YOLO series algorithms will inherit from YOLOv5Head, but + different algorithms have special initialization process. + + The special_init function is designed to deal with this situation. + """ + if self.train_cfg: + self.assigner = TASK_UTILS.build(self.train_cfg.assigner) + if self.train_cfg.get('sampler', None) is not None: + self.sampler = TASK_UTILS.build( + self.train_cfg.sampler, default_args=dict(context=self)) + else: + self.sampler = PseudoSampler(context=self) + + self.featmap_sizes_train = None + self.flatten_priors_train = None + + def forward(self, x: Tuple[Tensor]) -> Tuple[List]: + """Forward features from the upstream network. + + Args: + x (Tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + Returns: + Tuple[List]: A tuple of multi-level classification scores, bbox + predictions, and objectnesses. + """ + return self.head_module(x) + + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Compute losses of the head. 
+ + Args: + cls_scores (list[Tensor]): Box scores for each scale level + Has shape (N, num_anchors * num_classes, H, W) + bbox_preds (list[Tensor]): Decoded box for each scale + level with shape (N, num_anchors * 4, H, W) in + [tl_x, tl_y, br_x, br_y] format. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + num_imgs = len(batch_img_metas) + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.prior_generator.num_levels + + gt_info = gt_instances_preprocess(batch_gt_instances, num_imgs) + gt_labels = gt_info[:, :, :1] + gt_bboxes = gt_info[:, :, 1:] # xyxy + pad_bbox_flag = (gt_bboxes.sum(-1, keepdim=True) > 0).float() + + device = cls_scores[0].device + + # If the shape does not equal, generate new one + if featmap_sizes != self.featmap_sizes_train: + self.featmap_sizes_train = featmap_sizes + mlvl_priors_with_stride = self.prior_generator.grid_priors( + featmap_sizes, device=device, with_stride=True) + self.flatten_priors_train = torch.cat( + mlvl_priors_with_stride, dim=0) + + flatten_cls_scores = torch.cat([ + cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.cls_out_channels) + for cls_score in cls_scores + ], 1).contiguous() + + flatten_bboxes = torch.cat([ + bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) + for bbox_pred in bbox_preds + ], 1) + flatten_bboxes = flatten_bboxes * self.flatten_priors_train[..., -1, + None] + flatten_bboxes = distance2bbox(self.flatten_priors_train[..., :2], + flatten_bboxes) + + assigned_result = self.assigner(flatten_bboxes.detach(), + flatten_cls_scores.detach(), + self.flatten_priors_train, gt_labels, + gt_bboxes, pad_bbox_flag) + + labels = assigned_result['assigned_labels'].reshape(-1) + label_weights = assigned_result['assigned_labels_weights'].reshape(-1) + bbox_targets = assigned_result['assigned_bboxes'].reshape(-1, 4) + assign_metrics = assigned_result['assign_metrics'].reshape(-1) + cls_preds = flatten_cls_scores.reshape(-1, self.num_classes) + bbox_preds = flatten_bboxes.reshape(-1, 4) + + # FG cat_id: [0, num_classes -1], BG cat_id: num_classes + bg_class_ind = self.num_classes + pos_inds = ((labels >= 0) + & (labels < bg_class_ind)).nonzero().squeeze(1) + avg_factor = reduce_mean(assign_metrics.sum()).clamp_(min=1).item() + + loss_cls = self.loss_cls( + cls_preds, (labels, assign_metrics), + label_weights, + avg_factor=avg_factor) + + if len(pos_inds) > 0: + loss_bbox = self.loss_bbox( + bbox_preds[pos_inds], + bbox_targets[pos_inds], + weight=assign_metrics[pos_inds], + avg_factor=avg_factor) + else: + loss_bbox = bbox_preds.sum() * 0 + + return dict(loss_cls=loss_cls, loss_bbox=loss_bbox) diff --git a/third_party/mmyolo/mmyolo/models/dense_heads/rtmdet_ins_head.py b/third_party/mmyolo/mmyolo/models/dense_heads/rtmdet_ins_head.py new file mode 100644 index 0000000000000000000000000000000000000000..1d0562aad6fb977516924ef9cd72cdef54ff0016 --- /dev/null +++ b/third_party/mmyolo/mmyolo/models/dense_heads/rtmdet_ins_head.py @@ -0,0 +1,725 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import copy +from typing import List, Optional, Tuple + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule, is_norm +from mmcv.ops import batched_nms +from mmdet.models.utils import filter_scores_and_topk +from mmdet.structures.bbox import get_box_tensor, get_box_wh, scale_boxes +from mmdet.utils import (ConfigType, InstanceList, OptConfigType, + OptInstanceList, OptMultiConfig) +from mmengine import ConfigDict +from mmengine.model import (BaseModule, bias_init_with_prob, constant_init, + normal_init) +from mmengine.structures import InstanceData +from torch import Tensor + +from mmyolo.registry import MODELS +from .rtmdet_head import RTMDetHead, RTMDetSepBNHeadModule + + +class MaskFeatModule(BaseModule): + """Mask feature head used in RTMDet-Ins. Copy from mmdet. + + Args: + in_channels (int): Number of channels in the input feature map. + feat_channels (int): Number of hidden channels of the mask feature + map branch. + stacked_convs (int): Number of convs in mask feature branch. + num_levels (int): The starting feature map level from RPN that + will be used to predict the mask feature map. + num_prototypes (int): Number of output channel of the mask feature + map branch. This is the channel count of the mask + feature map that to be dynamically convolved with the predicted + kernel. + act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer. + Default: dict(type='ReLU', inplace=True) + norm_cfg (dict): Config dict for normalization layer. Default: None. + """ + + def __init__( + self, + in_channels: int, + feat_channels: int = 256, + stacked_convs: int = 4, + num_levels: int = 3, + num_prototypes: int = 8, + act_cfg: ConfigType = dict(type='ReLU', inplace=True), + norm_cfg: ConfigType = dict(type='BN') + ) -> None: + super().__init__(init_cfg=None) + self.num_levels = num_levels + self.fusion_conv = nn.Conv2d(num_levels * in_channels, in_channels, 1) + convs = [] + for i in range(stacked_convs): + in_c = in_channels if i == 0 else feat_channels + convs.append( + ConvModule( + in_c, + feat_channels, + 3, + padding=1, + act_cfg=act_cfg, + norm_cfg=norm_cfg)) + self.stacked_convs = nn.Sequential(*convs) + self.projection = nn.Conv2d( + feat_channels, num_prototypes, kernel_size=1) + + def forward(self, features: Tuple[Tensor, ...]) -> Tensor: + # multi-level feature fusion + fusion_feats = [features[0]] + size = features[0].shape[-2:] + for i in range(1, self.num_levels): + f = F.interpolate(features[i], size=size, mode='bilinear') + fusion_feats.append(f) + fusion_feats = torch.cat(fusion_feats, dim=1) + fusion_feats = self.fusion_conv(fusion_feats) + # pred mask feats + mask_features = self.stacked_convs(fusion_feats) + mask_features = self.projection(mask_features) + return mask_features + + +@MODELS.register_module() +class RTMDetInsSepBNHeadModule(RTMDetSepBNHeadModule): + """Detection and Instance Segmentation Head of RTMDet. + + Args: + num_classes (int): Number of categories excluding the background + category. + num_prototypes (int): Number of mask prototype features extracted + from the mask head. Defaults to 8. + dyconv_channels (int): Channel of the dynamic conv layers. + Defaults to 8. + num_dyconvs (int): Number of the dynamic convolution layers. + Defaults to 3. + use_sigmoid_cls (bool): Use sigmoid for class prediction. + Defaults to True. 
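+
+    Example:
+        A shape sketch with the default prototype settings (the feature-map
+        sizes are illustrative):
+
+        >>> import torch
+        >>> from mmyolo.models.dense_heads import RTMDetInsSepBNHeadModule
+        >>> head_module = RTMDetInsSepBNHeadModule(
+        ...     num_classes=80, in_channels=256)
+        >>> feats = [torch.rand(1, 256, s, s) for s in (80, 40, 20)]
+        >>> cls_scores, bbox_preds, kernel_preds, mask_feat = head_module(feats)
+        >>> tuple(mask_feat.shape)  # num_prototypes channels, largest scale
+        (1, 8, 80, 80)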
+ """ + + def __init__(self, + num_classes: int, + *args, + num_prototypes: int = 8, + dyconv_channels: int = 8, + num_dyconvs: int = 3, + use_sigmoid_cls: bool = True, + **kwargs): + self.num_prototypes = num_prototypes + self.num_dyconvs = num_dyconvs + self.dyconv_channels = dyconv_channels + self.use_sigmoid_cls = use_sigmoid_cls + if self.use_sigmoid_cls: + self.cls_out_channels = num_classes + else: + self.cls_out_channels = num_classes + 1 + super().__init__(num_classes=num_classes, *args, **kwargs) + + def _init_layers(self): + """Initialize layers of the head.""" + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + self.kernel_convs = nn.ModuleList() + + self.rtm_cls = nn.ModuleList() + self.rtm_reg = nn.ModuleList() + self.rtm_kernel = nn.ModuleList() + self.rtm_obj = nn.ModuleList() + + # calculate num dynamic parameters + weight_nums, bias_nums = [], [] + for i in range(self.num_dyconvs): + if i == 0: + weight_nums.append( + (self.num_prototypes + 2) * self.dyconv_channels) + bias_nums.append(self.dyconv_channels) + elif i == self.num_dyconvs - 1: + weight_nums.append(self.dyconv_channels) + bias_nums.append(1) + else: + weight_nums.append(self.dyconv_channels * self.dyconv_channels) + bias_nums.append(self.dyconv_channels) + self.weight_nums = weight_nums + self.bias_nums = bias_nums + self.num_gen_params = sum(weight_nums) + sum(bias_nums) + pred_pad_size = self.pred_kernel_size // 2 + + for n in range(len(self.featmap_strides)): + cls_convs = nn.ModuleList() + reg_convs = nn.ModuleList() + kernel_convs = nn.ModuleList() + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + cls_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + reg_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + kernel_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + self.cls_convs.append(cls_convs) + self.reg_convs.append(cls_convs) + self.kernel_convs.append(kernel_convs) + + self.rtm_cls.append( + nn.Conv2d( + self.feat_channels, + self.num_base_priors * self.cls_out_channels, + self.pred_kernel_size, + padding=pred_pad_size)) + self.rtm_reg.append( + nn.Conv2d( + self.feat_channels, + self.num_base_priors * 4, + self.pred_kernel_size, + padding=pred_pad_size)) + self.rtm_kernel.append( + nn.Conv2d( + self.feat_channels, + self.num_gen_params, + self.pred_kernel_size, + padding=pred_pad_size)) + + if self.share_conv: + for n in range(len(self.featmap_strides)): + for i in range(self.stacked_convs): + self.cls_convs[n][i].conv = self.cls_convs[0][i].conv + self.reg_convs[n][i].conv = self.reg_convs[0][i].conv + + self.mask_head = MaskFeatModule( + in_channels=self.in_channels, + feat_channels=self.feat_channels, + stacked_convs=4, + num_levels=len(self.featmap_strides), + num_prototypes=self.num_prototypes, + act_cfg=self.act_cfg, + norm_cfg=self.norm_cfg) + + def init_weights(self) -> None: + """Initialize weights of the head.""" + for m in self.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, mean=0, std=0.01) + if is_norm(m): + constant_init(m, 1) + bias_cls = bias_init_with_prob(0.01) + for rtm_cls, rtm_reg, rtm_kernel in zip(self.rtm_cls, self.rtm_reg, + self.rtm_kernel): + 
normal_init(rtm_cls, std=0.01, bias=bias_cls) + normal_init(rtm_reg, std=0.01, bias=1) + + def forward(self, feats: Tuple[Tensor, ...]) -> tuple: + """Forward features from the upstream network. + + Args: + feats (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: Usually a tuple of classification scores and bbox prediction + - cls_scores (list[Tensor]): Classification scores for all scale + levels, each is a 4D-tensor, the channels number is + num_base_priors * num_classes. + - bbox_preds (list[Tensor]): Box energies / deltas for all scale + levels, each is a 4D-tensor, the channels number is + num_base_priors * 4. + - kernel_preds (list[Tensor]): Dynamic conv kernels for all scale + levels, each is a 4D-tensor, the channels number is + num_gen_params. + - mask_feat (Tensor): Mask prototype features. + Has shape (batch_size, num_prototypes, H, W). + """ + mask_feat = self.mask_head(feats) + + cls_scores = [] + bbox_preds = [] + kernel_preds = [] + for idx, (x, stride) in enumerate(zip(feats, self.featmap_strides)): + cls_feat = x + reg_feat = x + kernel_feat = x + + for cls_layer in self.cls_convs[idx]: + cls_feat = cls_layer(cls_feat) + cls_score = self.rtm_cls[idx](cls_feat) + + for kernel_layer in self.kernel_convs[idx]: + kernel_feat = kernel_layer(kernel_feat) + kernel_pred = self.rtm_kernel[idx](kernel_feat) + + for reg_layer in self.reg_convs[idx]: + reg_feat = reg_layer(reg_feat) + reg_dist = self.rtm_reg[idx](reg_feat) + + cls_scores.append(cls_score) + bbox_preds.append(reg_dist) + kernel_preds.append(kernel_pred) + return tuple(cls_scores), tuple(bbox_preds), tuple( + kernel_preds), mask_feat + + +@MODELS.register_module() +class RTMDetInsSepBNHead(RTMDetHead): + """RTMDet Instance Segmentation head. + + Args: + head_module(ConfigType): Base module used for RTMDetInsSepBNHead + prior_generator: Points generator feature maps in + 2D points-based detectors. + bbox_coder (:obj:`ConfigDict` or dict): Config of bbox coder. + loss_cls (:obj:`ConfigDict` or dict): Config of classification loss. + loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss. + loss_mask (:obj:`ConfigDict` or dict): Config of mask loss. + train_cfg (:obj:`ConfigDict` or dict, optional): Training config of + anchor head. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of + anchor head. Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. 
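+
+    Example:
+        A minimal build sketch (the class/channel numbers are placeholders,
+        not recommended settings):
+
+        >>> from mmyolo.registry import MODELS
+        >>> head = MODELS.build(
+        ...     dict(type='RTMDetInsSepBNHead',
+        ...          head_module=dict(
+        ...              type='RTMDetInsSepBNHeadModule',
+        ...              num_classes=80,
+        ...              in_channels=256,
+        ...              num_prototypes=8)))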
+ """ + + def __init__(self, + head_module: ConfigType, + prior_generator: ConfigType = dict( + type='mmdet.MlvlPointGenerator', + offset=0, + strides=[8, 16, 32]), + bbox_coder: ConfigType = dict(type='DistancePointBBoxCoder'), + loss_cls: ConfigType = dict( + type='mmdet.QualityFocalLoss', + use_sigmoid=True, + beta=2.0, + loss_weight=1.0), + loss_bbox: ConfigType = dict( + type='mmdet.GIoULoss', loss_weight=2.0), + loss_mask=dict( + type='mmdet.DiceLoss', + loss_weight=2.0, + eps=5e-6, + reduction='mean'), + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + init_cfg: OptMultiConfig = None): + + super().__init__( + head_module=head_module, + prior_generator=prior_generator, + bbox_coder=bbox_coder, + loss_cls=loss_cls, + loss_bbox=loss_bbox, + train_cfg=train_cfg, + test_cfg=test_cfg, + init_cfg=init_cfg) + + self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False) + if isinstance(self.head_module, RTMDetInsSepBNHeadModule): + assert self.use_sigmoid_cls == self.head_module.use_sigmoid_cls + self.loss_mask = MODELS.build(loss_mask) + + def predict_by_feat(self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + kernel_preds: List[Tensor], + mask_feats: Tensor, + score_factors: Optional[List[Tensor]] = None, + batch_img_metas: Optional[List[dict]] = None, + cfg: Optional[ConfigDict] = None, + rescale: bool = True, + with_nms: bool = True) -> List[InstanceData]: + """Transform a batch of output features extracted from the head into + bbox results. + + Note: When score_factors is not None, the cls_scores are + usually multiplied by it then obtain the real score used in NMS. + + Args: + cls_scores (list[Tensor]): Classification scores for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * 4, H, W). + kernel_preds (list[Tensor]): Kernel predictions of dynamic + convs for all scale levels, each is a 4D-tensor, has shape + (batch_size, num_params, H, W). + mask_feats (Tensor): Mask prototype features extracted from the + mask head, has shape (batch_size, num_prototypes, H, W). + score_factors (list[Tensor], optional): Score factor for + all scale level, each is a 4D-tensor, has shape + (batch_size, num_priors * 1, H, W). Defaults to None. + batch_img_metas (list[dict], Optional): Batch image meta info. + Defaults to None. + cfg (ConfigDict, optional): Test / postprocessing + configuration, if None, test_cfg would be used. + Defaults to None. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + with_nms (bool): If True, do nms before return boxes. + Defaults to True. + + Returns: + list[:obj:`InstanceData`]: Object detection and instance + segmentation results of each image after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + - masks (Tensor): Has a shape (num_instances, h, w). 
+ """ + cfg = self.test_cfg if cfg is None else cfg + cfg = copy.deepcopy(cfg) + + multi_label = cfg.multi_label + multi_label &= self.num_classes > 1 + cfg.multi_label = multi_label + + num_imgs = len(batch_img_metas) + featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores] + + # If the shape does not change, use the previous mlvl_priors + if featmap_sizes != self.featmap_sizes: + self.mlvl_priors = self.prior_generator.grid_priors( + featmap_sizes, + dtype=cls_scores[0].dtype, + device=cls_scores[0].device, + with_stride=True) + self.featmap_sizes = featmap_sizes + flatten_priors = torch.cat(self.mlvl_priors) + + mlvl_strides = [ + flatten_priors.new_full( + (featmap_size.numel() * self.num_base_priors, ), stride) for + featmap_size, stride in zip(featmap_sizes, self.featmap_strides) + ] + flatten_stride = torch.cat(mlvl_strides) + + # flatten cls_scores, bbox_preds + flatten_cls_scores = [ + cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.num_classes) + for cls_score in cls_scores + ] + flatten_bbox_preds = [ + bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) + for bbox_pred in bbox_preds + ] + flatten_kernel_preds = [ + kernel_pred.permute(0, 2, 3, + 1).reshape(num_imgs, -1, + self.head_module.num_gen_params) + for kernel_pred in kernel_preds + ] + + flatten_cls_scores = torch.cat(flatten_cls_scores, dim=1).sigmoid() + flatten_bbox_preds = torch.cat(flatten_bbox_preds, dim=1) + flatten_decoded_bboxes = self.bbox_coder.decode( + flatten_priors[..., :2].unsqueeze(0), flatten_bbox_preds, + flatten_stride) + + flatten_kernel_preds = torch.cat(flatten_kernel_preds, dim=1) + + results_list = [] + for (bboxes, scores, kernel_pred, mask_feat, + img_meta) in zip(flatten_decoded_bboxes, flatten_cls_scores, + flatten_kernel_preds, mask_feats, + batch_img_metas): + ori_shape = img_meta['ori_shape'] + scale_factor = img_meta['scale_factor'] + if 'pad_param' in img_meta: + pad_param = img_meta['pad_param'] + else: + pad_param = None + + score_thr = cfg.get('score_thr', -1) + if scores.shape[0] == 0: + empty_results = InstanceData() + empty_results.bboxes = bboxes + empty_results.scores = scores[:, 0] + empty_results.labels = scores[:, 0].int() + h, w = ori_shape[:2] if rescale else img_meta['img_shape'][:2] + empty_results.masks = torch.zeros( + size=(0, h, w), dtype=torch.bool, device=bboxes.device) + results_list.append(empty_results) + continue + + nms_pre = cfg.get('nms_pre', 100000) + if cfg.multi_label is False: + scores, labels = scores.max(1, keepdim=True) + scores, _, keep_idxs, results = filter_scores_and_topk( + scores, + score_thr, + nms_pre, + results=dict( + labels=labels[:, 0], + kernel_pred=kernel_pred, + priors=flatten_priors)) + labels = results['labels'] + kernel_pred = results['kernel_pred'] + priors = results['priors'] + else: + out = filter_scores_and_topk( + scores, + score_thr, + nms_pre, + results=dict( + kernel_pred=kernel_pred, priors=flatten_priors)) + scores, labels, keep_idxs, filtered_results = out + kernel_pred = filtered_results['kernel_pred'] + priors = filtered_results['priors'] + + results = InstanceData( + scores=scores, + labels=labels, + bboxes=bboxes[keep_idxs], + kernels=kernel_pred, + priors=priors) + + if rescale: + if pad_param is not None: + results.bboxes -= results.bboxes.new_tensor([ + pad_param[2], pad_param[0], pad_param[2], pad_param[0] + ]) + results.bboxes /= results.bboxes.new_tensor( + scale_factor).repeat((1, 2)) + + if cfg.get('yolox_style', False): + # do not need max_per_img + cfg.max_per_img = len(results) + + 
results = self._bbox_mask_post_process( + results=results, + mask_feat=mask_feat, + cfg=cfg, + rescale_bbox=False, + rescale_mask=rescale, + with_nms=with_nms, + pad_param=pad_param, + img_meta=img_meta) + results.bboxes[:, 0::2].clamp_(0, ori_shape[1]) + results.bboxes[:, 1::2].clamp_(0, ori_shape[0]) + + results_list.append(results) + return results_list + + def _bbox_mask_post_process( + self, + results: InstanceData, + mask_feat: Tensor, + cfg: ConfigDict, + rescale_bbox: bool = False, + rescale_mask: bool = True, + with_nms: bool = True, + pad_param: Optional[np.ndarray] = None, + img_meta: Optional[dict] = None) -> InstanceData: + """bbox and mask post-processing method. + + The boxes would be rescaled to the original image scale and do + the nms operation. Usually `with_nms` is False is used for aug test. + + Args: + results (:obj:`InstaceData`): Detection instance results, + each item has shape (num_bboxes, ). + mask_feat (Tensor): Mask prototype features extracted from the + mask head, has shape (batch_size, num_prototypes, H, W). + cfg (ConfigDict): Test / postprocessing configuration, + if None, test_cfg would be used. + rescale_bbox (bool): If True, return boxes in original image space. + Default to False. + rescale_mask (bool): If True, return masks in original image space. + Default to True. + with_nms (bool): If True, do nms before return boxes. + Default to True. + img_meta (dict, optional): Image meta info. Defaults to None. + + Returns: + :obj:`InstanceData`: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + - masks (Tensor): Has a shape (num_instances, h, w). + """ + if rescale_bbox: + assert img_meta.get('scale_factor') is not None + scale_factor = [1 / s for s in img_meta['scale_factor']] + results.bboxes = scale_boxes(results.bboxes, scale_factor) + + if hasattr(results, 'score_factors'): + # TODO: Add sqrt operation in order to be consistent with + # the paper. 
+ score_factors = results.pop('score_factors') + results.scores = results.scores * score_factors + + # filter small size bboxes + if cfg.get('min_bbox_size', -1) >= 0: + w, h = get_box_wh(results.bboxes) + valid_mask = (w > cfg.min_bbox_size) & (h > cfg.min_bbox_size) + if not valid_mask.all(): + results = results[valid_mask] + + # TODO: deal with `with_nms` and `nms_cfg=None` in test_cfg + assert with_nms, 'with_nms must be True for RTMDet-Ins' + if results.bboxes.numel() > 0: + bboxes = get_box_tensor(results.bboxes) + det_bboxes, keep_idxs = batched_nms(bboxes, results.scores, + results.labels, cfg.nms) + results = results[keep_idxs] + # some nms would reweight the score, such as softnms + results.scores = det_bboxes[:, -1] + results = results[:cfg.max_per_img] + + # process masks + mask_logits = self._mask_predict_by_feat(mask_feat, + results.kernels, + results.priors) + + stride = self.prior_generator.strides[0][0] + mask_logits = F.interpolate( + mask_logits.unsqueeze(0), scale_factor=stride, mode='bilinear') + if rescale_mask: + # TODO: When use mmdet.Resize or mmdet.Pad, will meet bug + # Use img_meta to crop and resize + ori_h, ori_w = img_meta['ori_shape'][:2] + if isinstance(pad_param, np.ndarray): + pad_param = pad_param.astype(np.int32) + crop_y1, crop_y2 = pad_param[ + 0], mask_logits.shape[-2] - pad_param[1] + crop_x1, crop_x2 = pad_param[ + 2], mask_logits.shape[-1] - pad_param[3] + mask_logits = mask_logits[..., crop_y1:crop_y2, + crop_x1:crop_x2] + mask_logits = F.interpolate( + mask_logits, + size=[ori_h, ori_w], + mode='bilinear', + align_corners=False) + + masks = mask_logits.sigmoid().squeeze(0) + masks = masks > cfg.mask_thr_binary + results.masks = masks + else: + h, w = img_meta['ori_shape'][:2] if rescale_mask else img_meta[ + 'img_shape'][:2] + results.masks = torch.zeros( + size=(results.bboxes.shape[0], h, w), + dtype=torch.bool, + device=results.bboxes.device) + return results + + def _mask_predict_by_feat(self, mask_feat: Tensor, kernels: Tensor, + priors: Tensor) -> Tensor: + """Generate mask logits from mask features with dynamic convs. + + Args: + mask_feat (Tensor): Mask prototype features. + Has shape (num_prototypes, H, W). + kernels (Tensor): Kernel parameters for each instance. + Has shape (num_instance, num_params) + priors (Tensor): Center priors for each instance. + Has shape (num_instance, 4). + Returns: + Tensor: Instance segmentation masks for each instance. + Has shape (num_instance, H, W). 
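+
+        Note:
+            A rough shape walk-through, assuming the default module settings
+            (``num_prototypes=8``, ``dyconv_channels=8``, ``num_dyconvs=3``):
+            the ``(num_instance, 169)`` kernels are split by
+            ``parse_dynamic_params`` into weights of sizes ``[80, 64, 8]``
+            and biases of sizes ``[8, 8, 1]``; the per-instance input is the
+            8 mask prototypes concatenated with 2 relative-coordinate
+            channels, and the three 1x1 grouped convolutions (ReLU in
+            between) produce the ``(num_instance, H, W)`` mask logits.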
+ """ + num_inst = kernels.shape[0] + h, w = mask_feat.size()[-2:] + if num_inst < 1: + return torch.empty( + size=(num_inst, h, w), + dtype=mask_feat.dtype, + device=mask_feat.device) + if len(mask_feat.shape) < 4: + mask_feat.unsqueeze(0) + + coord = self.prior_generator.single_level_grid_priors( + (h, w), level_idx=0, device=mask_feat.device).reshape(1, -1, 2) + num_inst = priors.shape[0] + points = priors[:, :2].reshape(-1, 1, 2) + strides = priors[:, 2:].reshape(-1, 1, 2) + relative_coord = (points - coord).permute(0, 2, 1) / ( + strides[..., 0].reshape(-1, 1, 1) * 8) + relative_coord = relative_coord.reshape(num_inst, 2, h, w) + + mask_feat = torch.cat( + [relative_coord, + mask_feat.repeat(num_inst, 1, 1, 1)], dim=1) + weights, biases = self.parse_dynamic_params(kernels) + + n_layers = len(weights) + x = mask_feat.reshape(1, -1, h, w) + for i, (weight, bias) in enumerate(zip(weights, biases)): + x = F.conv2d( + x, weight, bias=bias, stride=1, padding=0, groups=num_inst) + if i < n_layers - 1: + x = F.relu(x) + x = x.reshape(num_inst, h, w) + return x + + def parse_dynamic_params(self, flatten_kernels: Tensor) -> tuple: + """split kernel head prediction to conv weight and bias.""" + n_inst = flatten_kernels.size(0) + n_layers = len(self.head_module.weight_nums) + params_splits = list( + torch.split_with_sizes( + flatten_kernels, + self.head_module.weight_nums + self.head_module.bias_nums, + dim=1)) + weight_splits = params_splits[:n_layers] + bias_splits = params_splits[n_layers:] + for i in range(n_layers): + if i < n_layers - 1: + weight_splits[i] = weight_splits[i].reshape( + n_inst * self.head_module.dyconv_channels, -1, 1, 1) + bias_splits[i] = bias_splits[i].reshape( + n_inst * self.head_module.dyconv_channels) + else: + weight_splits[i] = weight_splits[i].reshape(n_inst, -1, 1, 1) + bias_splits[i] = bias_splits[i].reshape(n_inst) + + return weight_splits, bias_splits + + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + raise NotImplementedError diff --git a/third_party/mmyolo/mmyolo/models/dense_heads/rtmdet_rotated_head.py b/third_party/mmyolo/mmyolo/models/dense_heads/rtmdet_rotated_head.py new file mode 100644 index 0000000000000000000000000000000000000000..1428b4fd05065e3dba764313febc46d6125408ac --- /dev/null +++ b/third_party/mmyolo/mmyolo/models/dense_heads/rtmdet_rotated_head.py @@ -0,0 +1,641 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import copy +import warnings +from typing import List, Optional, Sequence, Tuple + +import torch +import torch.nn as nn +from mmdet.models.utils import filter_scores_and_topk +from mmdet.structures.bbox import HorizontalBoxes, distance2bbox +from mmdet.structures.bbox.transforms import bbox_cxcywh_to_xyxy, scale_boxes +from mmdet.utils import (ConfigType, InstanceList, OptConfigType, + OptInstanceList, OptMultiConfig, reduce_mean) +from mmengine.config import ConfigDict +from mmengine.model import normal_init +from mmengine.structures import InstanceData +from torch import Tensor + +from mmyolo.registry import MODELS, TASK_UTILS +from ..utils import gt_instances_preprocess +from .rtmdet_head import RTMDetHead, RTMDetSepBNHeadModule + +try: + from mmrotate.structures.bbox import RotatedBoxes, distance2obb + MMROTATE_AVAILABLE = True +except ImportError: + RotatedBoxes = None + distance2obb = None + MMROTATE_AVAILABLE = False + + +@MODELS.register_module() +class RTMDetRotatedSepBNHeadModule(RTMDetSepBNHeadModule): + """Detection Head Module of RTMDet-R. + + Compared with RTMDet Detection Head Module, RTMDet-R adds + a conv for angle prediction. + An `angle_out_dim` arg is added, which is generated by the + angle_coder module and controls the angle pred dim. + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + num_base_priors (int): The number of priors (points) at a point + on the feature grid. Defaults to 1. + feat_channels (int): Number of hidden channels. Used in child classes. + Defaults to 256 + stacked_convs (int): Number of stacking convs of the head. + Defaults to 2. + featmap_strides (Sequence[int]): Downsample factor of each feature map. + Defaults to (8, 16, 32). + share_conv (bool): Whether to share conv layers between stages. + Defaults to True. + pred_kernel_size (int): Kernel size of ``nn.Conv2d``. Defaults to 1. + angle_out_dim (int): Encoded length of angle, will passed by head. + Defaults to 1. + conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + convolution layer. Defaults to None. + norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization + layer. Defaults to ``dict(type='BN')``. + act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer. + Default: dict(type='SiLU', inplace=True). + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. 
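+
+    Note:
+        ``angle_out_dim`` is normally not set by hand: ``RTMDetRotatedHead``
+        overwrites it with ``angle_coder.encode_size`` when building this
+        module (1 for the default ``mmrotate.PseudoAngleCoder``), and
+        ``rtm_ang`` then predicts ``num_base_priors * angle_out_dim``
+        channels per scale level.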
+ """ + + def __init__( + self, + num_classes: int, + in_channels: int, + widen_factor: float = 1.0, + num_base_priors: int = 1, + feat_channels: int = 256, + stacked_convs: int = 2, + featmap_strides: Sequence[int] = [8, 16, 32], + share_conv: bool = True, + pred_kernel_size: int = 1, + angle_out_dim: int = 1, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict(type='BN'), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None, + ): + self.angle_out_dim = angle_out_dim + super().__init__( + num_classes=num_classes, + in_channels=in_channels, + widen_factor=widen_factor, + num_base_priors=num_base_priors, + feat_channels=feat_channels, + stacked_convs=stacked_convs, + featmap_strides=featmap_strides, + share_conv=share_conv, + pred_kernel_size=pred_kernel_size, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + init_cfg=init_cfg) + + def _init_layers(self): + """Initialize layers of the head.""" + super()._init_layers() + self.rtm_ang = nn.ModuleList() + for _ in range(len(self.featmap_strides)): + self.rtm_ang.append( + nn.Conv2d( + self.feat_channels, + self.num_base_priors * self.angle_out_dim, + self.pred_kernel_size, + padding=self.pred_kernel_size // 2)) + + def init_weights(self) -> None: + """Initialize weights of the head.""" + # Use prior in model initialization to improve stability + super().init_weights() + for rtm_ang in self.rtm_ang: + normal_init(rtm_ang, std=0.01) + + def forward(self, feats: Tuple[Tensor, ...]) -> tuple: + """Forward features from the upstream network. + + Args: + feats (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: Usually a tuple of classification scores and bbox prediction + - cls_scores (list[Tensor]): Classification scores for all scale + levels, each is a 4D-tensor, the channels number is + num_base_priors * num_classes. + - bbox_preds (list[Tensor]): Box energies / deltas for all scale + levels, each is a 4D-tensor, the channels number is + num_base_priors * 4. + - angle_preds (list[Tensor]): Angle prediction for all scale + levels, each is a 4D-tensor, the channels number is + num_base_priors * angle_out_dim. + """ + + cls_scores = [] + bbox_preds = [] + angle_preds = [] + for idx, x in enumerate(feats): + cls_feat = x + reg_feat = x + + for cls_layer in self.cls_convs[idx]: + cls_feat = cls_layer(cls_feat) + cls_score = self.rtm_cls[idx](cls_feat) + + for reg_layer in self.reg_convs[idx]: + reg_feat = reg_layer(reg_feat) + + reg_dist = self.rtm_reg[idx](reg_feat) + angle_pred = self.rtm_ang[idx](reg_feat) + + cls_scores.append(cls_score) + bbox_preds.append(reg_dist) + angle_preds.append(angle_pred) + return tuple(cls_scores), tuple(bbox_preds), tuple(angle_preds) + + +@MODELS.register_module() +class RTMDetRotatedHead(RTMDetHead): + """RTMDet-R head. + + Compared with RTMDetHead, RTMDetRotatedHead add some args to support + rotated object detection. + + - `angle_version` used to limit angle_range during training. + - `angle_coder` used to encode and decode angle, which is similar + to bbox_coder. + - `use_hbbox_loss` and `loss_angle` allow custom regression loss + calculation for rotated box. + + There are three combination options for regression: + + 1. `use_hbbox_loss=False` and loss_angle is None. + + .. code:: text + + bbox_pred────(tblr)───┐ + ▼ + angle_pred decode──►rbox_pred──(xywha)─►loss_bbox + │ ▲ + └────►decode──(a)─┘ + + 2. `use_hbbox_loss=False` and loss_angle is specified. + A angle loss is added on angle_pred. + + .. 
code:: text + + bbox_pred────(tblr)───┐ + ▼ + angle_pred decode──►rbox_pred──(xywha)─►loss_bbox + │ ▲ + ├────►decode──(a)─┘ + │ + └───────────────────────────────────────────►loss_angle + + 3. `use_hbbox_loss=True` and loss_angle is specified. + In this case the loss_angle must be set. + + .. code:: text + + bbox_pred──(tblr)──►decode──►hbox_pred──(xyxy)──►loss_bbox + + angle_pred──────────────────────────────────────►loss_angle + + - There's a `decoded_with_angle` flag in test_cfg, which is similar + to training process. + + When `decoded_with_angle=True`: + + .. code:: text + + bbox_pred────(tblr)───┐ + ▼ + angle_pred decode──(xywha)──►rbox_pred + │ ▲ + └────►decode──(a)─┘ + + When `decoded_with_angle=False`: + + .. code:: text + + bbox_pred──(tblr)─►decode + │ (xyxy) + ▼ + format───(xywh)──►concat──(xywha)──►rbox_pred + ▲ + angle_pred────────►decode────(a)───────┘ + + Args: + head_module(ConfigType): Base module used for RTMDetRotatedHead. + prior_generator: Points generator feature maps in + 2D points-based detectors. + bbox_coder (:obj:`ConfigDict` or dict): Config of bbox coder. + loss_cls (:obj:`ConfigDict` or dict): Config of classification loss. + loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss. + angle_version (str): Angle representations. Defaults to 'le90'. + use_hbbox_loss (bool): If true, use horizontal bbox loss and + loss_angle should not be None. Default to False. + angle_coder (:obj:`ConfigDict` or dict): Config of angle coder. + loss_angle (:obj:`ConfigDict` or dict, optional): Config of angle loss. + train_cfg (:obj:`ConfigDict` or dict, optional): Training config of + anchor head. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of + anchor head. Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. 
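+
+    Example:
+        A minimal configuration sketch for option 1 above, relying on the
+        defaults of this class (the class/channel numbers are placeholders):
+
+        >>> head_cfg = dict(
+        ...     type='RTMDetRotatedHead',
+        ...     head_module=dict(
+        ...         type='RTMDetRotatedSepBNHeadModule',
+        ...         num_classes=15,
+        ...         in_channels=256),
+        ...     angle_version='le90',
+        ...     use_hbbox_loss=False,
+        ...     loss_angle=None)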
+ """ + + def __init__( + self, + head_module: ConfigType, + prior_generator: ConfigType = dict( + type='mmdet.MlvlPointGenerator', strides=[8, 16, 32], + offset=0), + bbox_coder: ConfigType = dict(type='DistanceAnglePointCoder'), + loss_cls: ConfigType = dict( + type='mmdet.QualityFocalLoss', + use_sigmoid=True, + beta=2.0, + loss_weight=1.0), + loss_bbox: ConfigType = dict( + type='mmrotate.RotatedIoULoss', mode='linear', + loss_weight=2.0), + angle_version: str = 'le90', + use_hbbox_loss: bool = False, + angle_coder: ConfigType = dict(type='mmrotate.PseudoAngleCoder'), + loss_angle: OptConfigType = None, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + init_cfg: OptMultiConfig = None): + if not MMROTATE_AVAILABLE: + raise ImportError( + 'Please run "mim install -r requirements/mmrotate.txt" ' + 'to install mmrotate first for rotated detection.') + + self.angle_version = angle_version + self.use_hbbox_loss = use_hbbox_loss + if self.use_hbbox_loss: + assert loss_angle is not None, \ + ('When use hbbox loss, loss_angle needs to be specified') + self.angle_coder = TASK_UTILS.build(angle_coder) + self.angle_out_dim = self.angle_coder.encode_size + if head_module.get('angle_out_dim') is not None: + warnings.warn('angle_out_dim will be overridden by angle_coder ' + 'and does not need to be set manually') + + head_module['angle_out_dim'] = self.angle_out_dim + super().__init__( + head_module=head_module, + prior_generator=prior_generator, + bbox_coder=bbox_coder, + loss_cls=loss_cls, + loss_bbox=loss_bbox, + train_cfg=train_cfg, + test_cfg=test_cfg, + init_cfg=init_cfg) + + if loss_angle is not None: + self.loss_angle = MODELS.build(loss_angle) + else: + self.loss_angle = None + + def predict_by_feat(self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + angle_preds: List[Tensor], + objectnesses: Optional[List[Tensor]] = None, + batch_img_metas: Optional[List[dict]] = None, + cfg: Optional[ConfigDict] = None, + rescale: bool = True, + with_nms: bool = True) -> List[InstanceData]: + """Transform a batch of output features extracted by the head into bbox + results. + + Args: + cls_scores (list[Tensor]): Classification scores for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * 4, H, W). + angle_preds (list[Tensor]): Box angle for each scale level + with shape (N, num_points * angle_dim, H, W) + objectnesses (list[Tensor], Optional): Score factor for + all scale level, each is a 4D-tensor, has shape + (batch_size, 1, H, W). + batch_img_metas (list[dict], Optional): Batch image meta info. + Defaults to None. + cfg (ConfigDict, optional): Test / postprocessing + configuration, if None, test_cfg would be used. + Defaults to None. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + with_nms (bool): If True, do nms before return boxes. + Defaults to True. + + Returns: + list[:obj:`InstanceData`]: Object detection results of each image + after the post process. Each item usually contains following keys. + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 5), + the last dimension 4 arrange as (x, y, w, h, angle). 
+ """ + assert len(cls_scores) == len(bbox_preds) + if objectnesses is None: + with_objectnesses = False + else: + with_objectnesses = True + assert len(cls_scores) == len(objectnesses) + + cfg = self.test_cfg if cfg is None else cfg + cfg = copy.deepcopy(cfg) + + multi_label = cfg.multi_label + multi_label &= self.num_classes > 1 + cfg.multi_label = multi_label + + # Whether to decode rbox with angle. + # different setting lead to different final results. + # Defaults to True. + decode_with_angle = cfg.get('decode_with_angle', True) + + num_imgs = len(batch_img_metas) + featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores] + + # If the shape does not change, use the previous mlvl_priors + if featmap_sizes != self.featmap_sizes: + self.mlvl_priors = self.prior_generator.grid_priors( + featmap_sizes, + dtype=cls_scores[0].dtype, + device=cls_scores[0].device) + self.featmap_sizes = featmap_sizes + flatten_priors = torch.cat(self.mlvl_priors) + + mlvl_strides = [ + flatten_priors.new_full( + (featmap_size.numel() * self.num_base_priors, ), stride) for + featmap_size, stride in zip(featmap_sizes, self.featmap_strides) + ] + flatten_stride = torch.cat(mlvl_strides) + + # flatten cls_scores, bbox_preds and objectness + flatten_cls_scores = [ + cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.num_classes) + for cls_score in cls_scores + ] + flatten_bbox_preds = [ + bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) + for bbox_pred in bbox_preds + ] + flatten_angle_preds = [ + angle_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.angle_out_dim) + for angle_pred in angle_preds + ] + + flatten_cls_scores = torch.cat(flatten_cls_scores, dim=1).sigmoid() + flatten_bbox_preds = torch.cat(flatten_bbox_preds, dim=1) + flatten_angle_preds = torch.cat(flatten_angle_preds, dim=1) + flatten_angle_preds = self.angle_coder.decode( + flatten_angle_preds, keepdim=True) + + if decode_with_angle: + flatten_rbbox_preds = torch.cat( + [flatten_bbox_preds, flatten_angle_preds], dim=-1) + flatten_decoded_bboxes = self.bbox_coder.decode( + flatten_priors[None], flatten_rbbox_preds, flatten_stride) + else: + flatten_decoded_hbboxes = self.bbox_coder.decode( + flatten_priors[None], flatten_bbox_preds, flatten_stride) + flatten_decoded_hbboxes = HorizontalBoxes.xyxy_to_cxcywh( + flatten_decoded_hbboxes) + flatten_decoded_bboxes = torch.cat( + [flatten_decoded_hbboxes, flatten_angle_preds], dim=-1) + + if with_objectnesses: + flatten_objectness = [ + objectness.permute(0, 2, 3, 1).reshape(num_imgs, -1) + for objectness in objectnesses + ] + flatten_objectness = torch.cat(flatten_objectness, dim=1).sigmoid() + else: + flatten_objectness = [None for _ in range(num_imgs)] + + results_list = [] + for (bboxes, scores, objectness, + img_meta) in zip(flatten_decoded_bboxes, flatten_cls_scores, + flatten_objectness, batch_img_metas): + scale_factor = img_meta['scale_factor'] + if 'pad_param' in img_meta: + pad_param = img_meta['pad_param'] + else: + pad_param = None + + score_thr = cfg.get('score_thr', -1) + # yolox_style does not require the following operations + if objectness is not None and score_thr > 0 and not cfg.get( + 'yolox_style', False): + conf_inds = objectness > score_thr + bboxes = bboxes[conf_inds, :] + scores = scores[conf_inds, :] + objectness = objectness[conf_inds] + + if objectness is not None: + # conf = obj_conf * cls_conf + scores *= objectness[:, None] + + if scores.shape[0] == 0: + empty_results = InstanceData() + empty_results.bboxes = RotatedBoxes(bboxes) + 
empty_results.scores = scores[:, 0] + empty_results.labels = scores[:, 0].int() + results_list.append(empty_results) + continue + + nms_pre = cfg.get('nms_pre', 100000) + if cfg.multi_label is False: + scores, labels = scores.max(1, keepdim=True) + scores, _, keep_idxs, results = filter_scores_and_topk( + scores, + score_thr, + nms_pre, + results=dict(labels=labels[:, 0])) + labels = results['labels'] + else: + scores, labels, keep_idxs, _ = filter_scores_and_topk( + scores, score_thr, nms_pre) + + results = InstanceData( + scores=scores, + labels=labels, + bboxes=RotatedBoxes(bboxes[keep_idxs])) + + if rescale: + if pad_param is not None: + results.bboxes.translate_([-pad_param[2], -pad_param[0]]) + + scale_factor = [1 / s for s in img_meta['scale_factor']] + results.bboxes = scale_boxes(results.bboxes, scale_factor) + + if cfg.get('yolox_style', False): + # do not need max_per_img + cfg.max_per_img = len(results) + + results = self._bbox_post_process( + results=results, + cfg=cfg, + rescale=False, + with_nms=with_nms, + img_meta=img_meta) + + results_list.append(results) + return results_list + + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + angle_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Compute losses of the head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level + Has shape (N, num_anchors * num_classes, H, W) + bbox_preds (list[Tensor]): Decoded box for each scale + level with shape (N, num_anchors * 4, H, W) in + [tl_x, tl_y, br_x, br_y] format. + angle_preds (list[Tensor]): Angle prediction for each scale + level with shape (N, num_anchors * angle_out_dim, H, W). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. 
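+
+        Note:
+            The returned dict contains ``loss_cls`` and ``loss_bbox``;
+            ``loss_angle`` is added only when an angle loss is configured
+            (i.e. ``self.loss_angle is not None``).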
+ """ + num_imgs = len(batch_img_metas) + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.prior_generator.num_levels + + gt_info = gt_instances_preprocess(batch_gt_instances, num_imgs) + gt_labels = gt_info[:, :, :1] + gt_bboxes = gt_info[:, :, 1:] # xywha + pad_bbox_flag = (gt_bboxes.sum(-1, keepdim=True) > 0).float() + + device = cls_scores[0].device + + # If the shape does not equal, generate new one + if featmap_sizes != self.featmap_sizes_train: + self.featmap_sizes_train = featmap_sizes + mlvl_priors_with_stride = self.prior_generator.grid_priors( + featmap_sizes, device=device, with_stride=True) + self.flatten_priors_train = torch.cat( + mlvl_priors_with_stride, dim=0) + + flatten_cls_scores = torch.cat([ + cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.cls_out_channels) + for cls_score in cls_scores + ], 1).contiguous() + + flatten_tblrs = torch.cat([ + bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) + for bbox_pred in bbox_preds + ], 1) + flatten_tblrs = flatten_tblrs * self.flatten_priors_train[..., -1, + None] + flatten_angles = torch.cat([ + angle_pred.permute(0, 2, 3, 1).reshape( + num_imgs, -1, self.angle_out_dim) for angle_pred in angle_preds + ], 1) + flatten_decoded_angle = self.angle_coder.decode( + flatten_angles, keepdim=True) + flatten_tblra = torch.cat([flatten_tblrs, flatten_decoded_angle], + dim=-1) + flatten_rbboxes = distance2obb( + self.flatten_priors_train[..., :2], + flatten_tblra, + angle_version=self.angle_version) + if self.use_hbbox_loss: + flatten_hbboxes = distance2bbox(self.flatten_priors_train[..., :2], + flatten_tblrs) + + assigned_result = self.assigner(flatten_rbboxes.detach(), + flatten_cls_scores.detach(), + self.flatten_priors_train, gt_labels, + gt_bboxes, pad_bbox_flag) + + labels = assigned_result['assigned_labels'].reshape(-1) + label_weights = assigned_result['assigned_labels_weights'].reshape(-1) + bbox_targets = assigned_result['assigned_bboxes'].reshape(-1, 5) + assign_metrics = assigned_result['assign_metrics'].reshape(-1) + cls_preds = flatten_cls_scores.reshape(-1, self.num_classes) + + # FG cat_id: [0, num_classes -1], BG cat_id: num_classes + bg_class_ind = self.num_classes + pos_inds = ((labels >= 0) + & (labels < bg_class_ind)).nonzero().squeeze(1) + avg_factor = reduce_mean(assign_metrics.sum()).clamp_(min=1).item() + + loss_cls = self.loss_cls( + cls_preds, (labels, assign_metrics), + label_weights, + avg_factor=avg_factor) + + pos_bbox_targets = bbox_targets[pos_inds] + + if self.use_hbbox_loss: + bbox_preds = flatten_hbboxes.reshape(-1, 4) + pos_bbox_targets = bbox_cxcywh_to_xyxy(pos_bbox_targets[:, :4]) + else: + bbox_preds = flatten_rbboxes.reshape(-1, 5) + angle_preds = flatten_angles.reshape(-1, self.angle_out_dim) + + if len(pos_inds) > 0: + loss_bbox = self.loss_bbox( + bbox_preds[pos_inds], + pos_bbox_targets, + weight=assign_metrics[pos_inds], + avg_factor=avg_factor) + loss_angle = angle_preds.sum() * 0 + if self.loss_angle is not None: + pos_angle_targets = bbox_targets[pos_inds][:, 4:5] + pos_angle_targets = self.angle_coder.encode(pos_angle_targets) + loss_angle = self.loss_angle( + angle_preds[pos_inds], + pos_angle_targets, + weight=assign_metrics[pos_inds], + avg_factor=avg_factor) + else: + loss_bbox = bbox_preds.sum() * 0 + loss_angle = angle_preds.sum() * 0 + + losses = dict() + losses['loss_cls'] = loss_cls + losses['loss_bbox'] = loss_bbox + if self.loss_angle is not None: + losses['loss_angle'] = loss_angle + + return losses diff --git 
a/third_party/mmyolo/mmyolo/models/dense_heads/yolov5_head.py b/third_party/mmyolo/mmyolo/models/dense_heads/yolov5_head.py new file mode 100644 index 0000000000000000000000000000000000000000..fb24617fc17c2861ea150b0fb9ceb3d8a145bb9d --- /dev/null +++ b/third_party/mmyolo/mmyolo/models/dense_heads/yolov5_head.py @@ -0,0 +1,895 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import math +from typing import List, Optional, Sequence, Tuple, Union + +import torch +import torch.nn as nn +from mmdet.models.dense_heads.base_dense_head import BaseDenseHead +from mmdet.models.utils import filter_scores_and_topk, multi_apply +from mmdet.structures.bbox import bbox_overlaps +from mmdet.utils import (ConfigType, OptConfigType, OptInstanceList, + OptMultiConfig) +from mmengine.config import ConfigDict +from mmengine.dist import get_dist_info +from mmengine.logging import print_log +from mmengine.model import BaseModule +from mmengine.structures import InstanceData +from torch import Tensor + +from mmyolo.registry import MODELS, TASK_UTILS +from ..utils import make_divisible + + +def get_prior_xy_info(index: int, num_base_priors: int, + featmap_sizes: int) -> Tuple[int, int, int]: + """Get prior index and xy index in feature map by flatten index.""" + _, featmap_w = featmap_sizes + priors = index % num_base_priors + xy_index = index // num_base_priors + grid_y = xy_index // featmap_w + grid_x = xy_index % featmap_w + return priors, grid_x, grid_y + + +@MODELS.register_module() +class YOLOv5HeadModule(BaseModule): + """YOLOv5Head head module used in `YOLOv5`. + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (Union[int, Sequence]): Number of channels in the input + feature map. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + num_base_priors (int): The number of priors (points) at a point + on the feature grid. + featmap_strides (Sequence[int]): Downsample factor of each feature map. + Defaults to (8, 16, 32). + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. 
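+
+    Example:
+        A channel-count sketch with ``num_classes=80`` and
+        ``num_base_priors=3`` (the numbers are only illustrative): each
+        prediction conv outputs ``3 * (5 + 80) = 255`` channels, which
+        ``forward_single`` splits into ``3 * 80 = 240`` classification,
+        ``3 * 4 = 12`` bbox and ``3 * 1 = 3`` objectness channels per
+        scale level.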
+ """ + + def __init__(self, + num_classes: int, + in_channels: Union[int, Sequence], + widen_factor: float = 1.0, + num_base_priors: int = 3, + featmap_strides: Sequence[int] = (8, 16, 32), + init_cfg: OptMultiConfig = None): + super().__init__(init_cfg=init_cfg) + self.num_classes = num_classes + self.widen_factor = widen_factor + + self.featmap_strides = featmap_strides + self.num_out_attrib = 5 + self.num_classes + self.num_levels = len(self.featmap_strides) + self.num_base_priors = num_base_priors + + if isinstance(in_channels, int): + self.in_channels = [make_divisible(in_channels, widen_factor) + ] * self.num_levels + else: + self.in_channels = [ + make_divisible(i, widen_factor) for i in in_channels + ] + + self._init_layers() + + def _init_layers(self): + """initialize conv layers in YOLOv5 head.""" + self.convs_pred = nn.ModuleList() + for i in range(self.num_levels): + conv_pred = nn.Conv2d(self.in_channels[i], + self.num_base_priors * self.num_out_attrib, + 1) + + self.convs_pred.append(conv_pred) + + def init_weights(self): + """Initialize the bias of YOLOv5 head.""" + super().init_weights() + for mi, s in zip(self.convs_pred, self.featmap_strides): # from + b = mi.bias.data.view(self.num_base_priors, -1) + # obj (8 objects per 640 image) + b.data[:, 4] += math.log(8 / (640 / s)**2) + # NOTE: The following initialization can only be performed on the + # bias of the category, if the following initialization is + # performed on the bias of mask coefficient, + # there will be a significant decrease in mask AP. + b.data[:, 5:5 + self.num_classes] += math.log( + 0.6 / (self.num_classes - 0.999999)) + + mi.bias.data = b.view(-1) + + def forward(self, x: Tuple[Tensor]) -> Tuple[List]: + """Forward features from the upstream network. + + Args: + x (Tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + Returns: + Tuple[List]: A tuple of multi-level classification scores, bbox + predictions, and objectnesses. + """ + assert len(x) == self.num_levels + return multi_apply(self.forward_single, x, self.convs_pred) + + def forward_single(self, x: Tensor, + convs: nn.Module) -> Tuple[Tensor, Tensor, Tensor]: + """Forward feature of a single scale level.""" + + pred_map = convs(x) + bs, _, ny, nx = pred_map.shape + pred_map = pred_map.view(bs, self.num_base_priors, self.num_out_attrib, + ny, nx) + + cls_score = pred_map[:, :, 5:, ...].reshape(bs, -1, ny, nx) + bbox_pred = pred_map[:, :, :4, ...].reshape(bs, -1, ny, nx) + objectness = pred_map[:, :, 4:5, ...].reshape(bs, -1, ny, nx) + + return cls_score, bbox_pred, objectness + + +@MODELS.register_module() +class YOLOv5Head(BaseDenseHead): + """YOLOv5Head head used in `YOLOv5`. + + Args: + head_module(ConfigType): Base module used for YOLOv5Head + prior_generator(dict): Points generator feature maps in + 2D points-based detectors. + bbox_coder (:obj:`ConfigDict` or dict): Config of bbox coder. + loss_cls (:obj:`ConfigDict` or dict): Config of classification loss. + loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss. + loss_obj (:obj:`ConfigDict` or dict): Config of objectness loss. + prior_match_thr (float): Defaults to 4.0. + ignore_iof_thr (float): Defaults to -1.0. + obj_level_weights (List[float]): Defaults to [4.0, 1.0, 0.4]. + train_cfg (:obj:`ConfigDict` or dict, optional): Training config of + anchor head. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of + anchor head. Defaults to None. 
+ init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + head_module: ConfigType, + prior_generator: ConfigType = dict( + type='mmdet.YOLOAnchorGenerator', + base_sizes=[[(10, 13), (16, 30), (33, 23)], + [(30, 61), (62, 45), (59, 119)], + [(116, 90), (156, 198), (373, 326)]], + strides=[8, 16, 32]), + bbox_coder: ConfigType = dict(type='YOLOv5BBoxCoder'), + loss_cls: ConfigType = dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='mean', + loss_weight=0.5), + loss_bbox: ConfigType = dict( + type='IoULoss', + iou_mode='ciou', + bbox_format='xywh', + eps=1e-7, + reduction='mean', + loss_weight=0.05, + return_iou=True), + loss_obj: ConfigType = dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='mean', + loss_weight=1.0), + prior_match_thr: float = 4.0, + near_neighbor_thr: float = 0.5, + ignore_iof_thr: float = -1.0, + obj_level_weights: List[float] = [4.0, 1.0, 0.4], + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + init_cfg: OptMultiConfig = None): + super().__init__(init_cfg=init_cfg) + + self.head_module = MODELS.build(head_module) + self.num_classes = self.head_module.num_classes + self.featmap_strides = self.head_module.featmap_strides + self.num_levels = len(self.featmap_strides) + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + self.loss_cls: nn.Module = MODELS.build(loss_cls) + self.loss_bbox: nn.Module = MODELS.build(loss_bbox) + self.loss_obj: nn.Module = MODELS.build(loss_obj) + + self.prior_generator = TASK_UTILS.build(prior_generator) + self.bbox_coder = TASK_UTILS.build(bbox_coder) + self.num_base_priors = self.prior_generator.num_base_priors[0] + + self.featmap_sizes = [torch.empty(1)] * self.num_levels + + self.prior_match_thr = prior_match_thr + self.near_neighbor_thr = near_neighbor_thr + self.obj_level_weights = obj_level_weights + self.ignore_iof_thr = ignore_iof_thr + + self.special_init() + + def special_init(self): + """Since YOLO series algorithms will inherit from YOLOv5Head, but + different algorithms have special initialization process. + + The special_init function is designed to deal with this situation. + """ + assert len(self.obj_level_weights) == len( + self.featmap_strides) == self.num_levels + if self.prior_match_thr != 4.0: + print_log( + "!!!Now, you've changed the prior_match_thr " + 'parameter to something other than 4.0. Please make sure ' + 'that you have modified both the regression formula in ' + 'bbox_coder and before loss_box computation, ' + 'otherwise the accuracy may be degraded!!!') + + if self.num_classes == 1: + print_log('!!!You are using `YOLOv5Head` with num_classes == 1.' + ' The loss_cls will be 0. 
This is a normal phenomenon.') + + priors_base_sizes = torch.tensor( + self.prior_generator.base_sizes, dtype=torch.float) + featmap_strides = torch.tensor( + self.featmap_strides, dtype=torch.float)[:, None, None] + self.register_buffer( + 'priors_base_sizes', + priors_base_sizes / featmap_strides, + persistent=False) + + grid_offset = torch.tensor([ + [0, 0], # center + [1, 0], # left + [0, 1], # up + [-1, 0], # right + [0, -1], # bottom + ]).float() + self.register_buffer( + 'grid_offset', grid_offset[:, None], persistent=False) + + prior_inds = torch.arange(self.num_base_priors).float().view( + self.num_base_priors, 1) + self.register_buffer('prior_inds', prior_inds, persistent=False) + + def forward(self, x: Tuple[Tensor]) -> Tuple[List]: + """Forward features from the upstream network. + + Args: + x (Tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + Returns: + Tuple[List]: A tuple of multi-level classification scores, bbox + predictions, and objectnesses. + """ + return self.head_module(x) + + def predict_by_feat(self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + objectnesses: Optional[List[Tensor]] = None, + batch_img_metas: Optional[List[dict]] = None, + cfg: Optional[ConfigDict] = None, + rescale: bool = True, + with_nms: bool = True) -> List[InstanceData]: + """Transform a batch of output features extracted by the head into + bbox results. + Args: + cls_scores (list[Tensor]): Classification scores for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * 4, H, W). + objectnesses (list[Tensor], Optional): Score factor for + all scale level, each is a 4D-tensor, has shape + (batch_size, 1, H, W). + batch_img_metas (list[dict], Optional): Batch image meta info. + Defaults to None. + cfg (ConfigDict, optional): Test / postprocessing + configuration, if None, test_cfg would be used. + Defaults to None. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + with_nms (bool): If True, do nms before return boxes. + Defaults to True. + + Returns: + list[:obj:`InstanceData`]: Object detection results of each image + after the post process. Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). 
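+
+        Example:
+            A post-processing config sketch (the thresholds are illustrative,
+            not recommended values):
+
+            >>> test_cfg = dict(
+            ...     multi_label=True,
+            ...     nms_pre=30000,
+            ...     score_thr=0.001,
+            ...     nms=dict(type='nms', iou_threshold=0.65),
+            ...     max_per_img=300)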
+ """ + assert len(cls_scores) == len(bbox_preds) + if objectnesses is None: + with_objectnesses = False + else: + with_objectnesses = True + assert len(cls_scores) == len(objectnesses) + + cfg = self.test_cfg if cfg is None else cfg + cfg = copy.deepcopy(cfg) + + multi_label = cfg.multi_label + multi_label &= self.num_classes > 1 + cfg.multi_label = multi_label + + num_imgs = len(batch_img_metas) + featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores] + + # If the shape does not change, use the previous mlvl_priors + if featmap_sizes != self.featmap_sizes: + self.mlvl_priors = self.prior_generator.grid_priors( + featmap_sizes, + dtype=cls_scores[0].dtype, + device=cls_scores[0].device) + self.featmap_sizes = featmap_sizes + flatten_priors = torch.cat(self.mlvl_priors) + + mlvl_strides = [ + flatten_priors.new_full( + (featmap_size.numel() * self.num_base_priors, ), stride) for + featmap_size, stride in zip(featmap_sizes, self.featmap_strides) + ] + flatten_stride = torch.cat(mlvl_strides) + + # flatten cls_scores, bbox_preds and objectness + flatten_cls_scores = [ + cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.num_classes) + for cls_score in cls_scores + ] + flatten_bbox_preds = [ + bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) + for bbox_pred in bbox_preds + ] + + flatten_cls_scores = torch.cat(flatten_cls_scores, dim=1).sigmoid() + flatten_bbox_preds = torch.cat(flatten_bbox_preds, dim=1) + flatten_decoded_bboxes = self.bbox_coder.decode( + flatten_priors[None], flatten_bbox_preds, flatten_stride) + + if with_objectnesses: + flatten_objectness = [ + objectness.permute(0, 2, 3, 1).reshape(num_imgs, -1) + for objectness in objectnesses + ] + flatten_objectness = torch.cat(flatten_objectness, dim=1).sigmoid() + else: + flatten_objectness = [None for _ in range(num_imgs)] + + results_list = [] + for (bboxes, scores, objectness, + img_meta) in zip(flatten_decoded_bboxes, flatten_cls_scores, + flatten_objectness, batch_img_metas): + ori_shape = img_meta['ori_shape'] + scale_factor = img_meta['scale_factor'] + if 'pad_param' in img_meta: + pad_param = img_meta['pad_param'] + else: + pad_param = None + + score_thr = cfg.get('score_thr', -1) + # yolox_style does not require the following operations + if objectness is not None and score_thr > 0 and not cfg.get( + 'yolox_style', False): + conf_inds = objectness > score_thr + bboxes = bboxes[conf_inds, :] + scores = scores[conf_inds, :] + objectness = objectness[conf_inds] + + if objectness is not None: + # conf = obj_conf * cls_conf + scores *= objectness[:, None] + + if scores.shape[0] == 0: + empty_results = InstanceData() + empty_results.bboxes = bboxes + empty_results.scores = scores[:, 0] + empty_results.labels = scores[:, 0].int() + results_list.append(empty_results) + continue + + nms_pre = cfg.get('nms_pre', 100000) + if cfg.multi_label is False: + scores, labels = scores.max(1, keepdim=True) + scores, _, keep_idxs, results = filter_scores_and_topk( + scores, + score_thr, + nms_pre, + results=dict(labels=labels[:, 0])) + labels = results['labels'] + else: + scores, labels, keep_idxs, _ = filter_scores_and_topk( + scores, score_thr, nms_pre) + + results = InstanceData( + scores=scores, labels=labels, bboxes=bboxes[keep_idxs]) + + if rescale: + if pad_param is not None: + results.bboxes -= results.bboxes.new_tensor([ + pad_param[2], pad_param[0], pad_param[2], pad_param[0] + ]) + results.bboxes /= results.bboxes.new_tensor( + scale_factor).repeat((1, 2)) + + if cfg.get('yolox_style', False): + # do not need 
max_per_img + cfg.max_per_img = len(results) + + results = self._bbox_post_process( + results=results, + cfg=cfg, + rescale=False, + with_nms=with_nms, + img_meta=img_meta) + results.bboxes[:, 0::2].clamp_(0, ori_shape[1]) + results.bboxes[:, 1::2].clamp_(0, ori_shape[0]) + + results_list.append(results) + return results_list + + def loss(self, x: Tuple[Tensor], batch_data_samples: Union[list, + dict]) -> dict: + """Perform forward propagation and loss calculation of the detection + head on the features of the upstream network. + + Args: + x (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + batch_data_samples (List[:obj:`DetDataSample`], dict): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + + Returns: + dict: A dictionary of loss components. + """ + + if isinstance(batch_data_samples, list): + losses = super().loss(x, batch_data_samples) + else: + outs = self(x) + # Fast version + loss_inputs = outs + (batch_data_samples['bboxes_labels'], + batch_data_samples['img_metas']) + losses = self.loss_by_feat(*loss_inputs) + + return losses + + def loss_by_feat( + self, + cls_scores: Sequence[Tensor], + bbox_preds: Sequence[Tensor], + objectnesses: Sequence[Tensor], + batch_gt_instances: Sequence[InstanceData], + batch_img_metas: Sequence[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (Sequence[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_priors * num_classes. + bbox_preds (Sequence[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_priors * 4. + objectnesses (Sequence[Tensor]): Score factor for + all scale level, each is a 4D-tensor, has shape + (batch_size, 1, H, W). + batch_gt_instances (Sequence[InstanceData]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (Sequence[dict]): Meta information of each image, + e.g., image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + Returns: + dict[str, Tensor]: A dictionary of losses. + """ + if self.ignore_iof_thr != -1: + # TODO: Support fast version + # convert ignore gt + batch_target_ignore_list = [] + for i, gt_instances_ignore in enumerate(batch_gt_instances_ignore): + bboxes = gt_instances_ignore.bboxes + labels = gt_instances_ignore.labels + index = bboxes.new_full((len(bboxes), 1), i) + # (batch_idx, label, bboxes) + target = torch.cat((index, labels[:, None].float(), bboxes), + dim=1) + batch_target_ignore_list.append(target) + + # (num_bboxes, 6) + batch_gt_targets_ignore = torch.cat( + batch_target_ignore_list, dim=0) + if batch_gt_targets_ignore.shape[0] != 0: + # Consider regions with ignore in annotations + return self._loss_by_feat_with_ignore( + cls_scores, + bbox_preds, + objectnesses, + batch_gt_instances=batch_gt_instances, + batch_img_metas=batch_img_metas, + batch_gt_instances_ignore=batch_gt_targets_ignore) + + # 1. 
Convert gt to norm format + batch_targets_normed = self._convert_gt_to_norm_format( + batch_gt_instances, batch_img_metas) + + device = cls_scores[0].device + loss_cls = torch.zeros(1, device=device) + loss_box = torch.zeros(1, device=device) + loss_obj = torch.zeros(1, device=device) + scaled_factor = torch.ones(7, device=device) + + for i in range(self.num_levels): + batch_size, _, h, w = bbox_preds[i].shape + target_obj = torch.zeros_like(objectnesses[i]) + + # empty gt bboxes + if batch_targets_normed.shape[1] == 0: + loss_box += bbox_preds[i].sum() * 0 + loss_cls += cls_scores[i].sum() * 0 + loss_obj += self.loss_obj( + objectnesses[i], target_obj) * self.obj_level_weights[i] + continue + + priors_base_sizes_i = self.priors_base_sizes[i] + # feature map scale whwh + scaled_factor[2:6] = torch.tensor( + bbox_preds[i].shape)[[3, 2, 3, 2]] + # Scale batch_targets from range 0-1 to range 0-features_maps size. + # (num_base_priors, num_bboxes, 7) + batch_targets_scaled = batch_targets_normed * scaled_factor + + # 2. Shape match + wh_ratio = batch_targets_scaled[..., + 4:6] / priors_base_sizes_i[:, None] + match_inds = torch.max( + wh_ratio, 1 / wh_ratio).max(2)[0] < self.prior_match_thr + batch_targets_scaled = batch_targets_scaled[match_inds] + + # no gt bbox matches anchor + if batch_targets_scaled.shape[0] == 0: + loss_box += bbox_preds[i].sum() * 0 + loss_cls += cls_scores[i].sum() * 0 + loss_obj += self.loss_obj( + objectnesses[i], target_obj) * self.obj_level_weights[i] + continue + + # 3. Positive samples with additional neighbors + + # check the left, up, right, bottom sides of the + # targets grid, and determine whether assigned + # them as positive samples as well. + batch_targets_cxcy = batch_targets_scaled[:, 2:4] + grid_xy = scaled_factor[[2, 3]] - batch_targets_cxcy + left, up = ((batch_targets_cxcy % 1 < self.near_neighbor_thr) & + (batch_targets_cxcy > 1)).T + right, bottom = ((grid_xy % 1 < self.near_neighbor_thr) & + (grid_xy > 1)).T + offset_inds = torch.stack( + (torch.ones_like(left), left, up, right, bottom)) + + batch_targets_scaled = batch_targets_scaled.repeat( + (5, 1, 1))[offset_inds] + retained_offsets = self.grid_offset.repeat(1, offset_inds.shape[1], + 1)[offset_inds] + + # prepare pred results and positive sample indexes to + # calculate class loss and bbox lo + _chunk_targets = batch_targets_scaled.chunk(4, 1) + img_class_inds, grid_xy, grid_wh, priors_inds = _chunk_targets + priors_inds, (img_inds, class_inds) = priors_inds.long().view( + -1), img_class_inds.long().T + + grid_xy_long = (grid_xy - + retained_offsets * self.near_neighbor_thr).long() + grid_x_inds, grid_y_inds = grid_xy_long.T + bboxes_targets = torch.cat((grid_xy - grid_xy_long, grid_wh), 1) + + # 4. 
Calculate loss + # bbox loss + retained_bbox_pred = bbox_preds[i].reshape( + batch_size, self.num_base_priors, -1, h, + w)[img_inds, priors_inds, :, grid_y_inds, grid_x_inds] + priors_base_sizes_i = priors_base_sizes_i[priors_inds] + decoded_bbox_pred = self._decode_bbox_to_xywh( + retained_bbox_pred, priors_base_sizes_i) + loss_box_i, iou = self.loss_bbox(decoded_bbox_pred, bboxes_targets) + loss_box += loss_box_i + + # obj loss + iou = iou.detach().clamp(0) + target_obj[img_inds, priors_inds, grid_y_inds, + grid_x_inds] = iou.type(target_obj.dtype) + loss_obj += self.loss_obj(objectnesses[i], + target_obj) * self.obj_level_weights[i] + + # cls loss + if self.num_classes > 1: + pred_cls_scores = cls_scores[i].reshape( + batch_size, self.num_base_priors, -1, h, + w)[img_inds, priors_inds, :, grid_y_inds, grid_x_inds] + + target_class = torch.full_like(pred_cls_scores, 0.) + target_class[range(batch_targets_scaled.shape[0]), + class_inds] = 1. + loss_cls += self.loss_cls(pred_cls_scores, target_class) + else: + loss_cls += cls_scores[i].sum() * 0 + + _, world_size = get_dist_info() + return dict( + loss_cls=loss_cls * batch_size * world_size, + loss_obj=loss_obj * batch_size * world_size, + loss_bbox=loss_box * batch_size * world_size) + + def _convert_gt_to_norm_format(self, + batch_gt_instances: Sequence[InstanceData], + batch_img_metas: Sequence[dict]) -> Tensor: + if isinstance(batch_gt_instances, torch.Tensor): + # fast version + img_shape = batch_img_metas[0]['batch_input_shape'] + gt_bboxes_xyxy = batch_gt_instances[:, 2:] + xy1, xy2 = gt_bboxes_xyxy.split((2, 2), dim=-1) + gt_bboxes_xywh = torch.cat([(xy2 + xy1) / 2, (xy2 - xy1)], dim=-1) + gt_bboxes_xywh[:, 1::2] /= img_shape[0] + gt_bboxes_xywh[:, 0::2] /= img_shape[1] + batch_gt_instances[:, 2:] = gt_bboxes_xywh + + # (num_base_priors, num_bboxes, 6) + batch_targets_normed = batch_gt_instances.repeat( + self.num_base_priors, 1, 1) + else: + batch_target_list = [] + # Convert xyxy bbox to yolo format. 
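+            # Illustrative example (not part of the original patch): on a
+            # 640x640 batch input, an xyxy box (10, 20, 50, 60) becomes
+            # cxcywh = (30, 40, 40, 40), which normalises to
+            # (30/640, 40/640, 40/640, 40/640) in the loop below.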
+ for i, gt_instances in enumerate(batch_gt_instances): + img_shape = batch_img_metas[i]['batch_input_shape'] + bboxes = gt_instances.bboxes + labels = gt_instances.labels + + xy1, xy2 = bboxes.split((2, 2), dim=-1) + bboxes = torch.cat([(xy2 + xy1) / 2, (xy2 - xy1)], dim=-1) + # normalized to 0-1 + bboxes[:, 1::2] /= img_shape[0] + bboxes[:, 0::2] /= img_shape[1] + + index = bboxes.new_full((len(bboxes), 1), i) + # (batch_idx, label, normed_bbox) + target = torch.cat((index, labels[:, None].float(), bboxes), + dim=1) + batch_target_list.append(target) + + # (num_base_priors, num_bboxes, 6) + batch_targets_normed = torch.cat( + batch_target_list, dim=0).repeat(self.num_base_priors, 1, 1) + + # (num_base_priors, num_bboxes, 1) + batch_targets_prior_inds = self.prior_inds.repeat( + 1, batch_targets_normed.shape[1])[..., None] + # (num_base_priors, num_bboxes, 7) + # (img_ind, labels, bbox_cx, bbox_cy, bbox_w, bbox_h, prior_ind) + batch_targets_normed = torch.cat( + (batch_targets_normed, batch_targets_prior_inds), 2) + return batch_targets_normed + + def _decode_bbox_to_xywh(self, bbox_pred, priors_base_sizes) -> Tensor: + bbox_pred = bbox_pred.sigmoid() + pred_xy = bbox_pred[:, :2] * 2 - 0.5 + pred_wh = (bbox_pred[:, 2:] * 2)**2 * priors_base_sizes + decoded_bbox_pred = torch.cat((pred_xy, pred_wh), dim=-1) + return decoded_bbox_pred + + def _loss_by_feat_with_ignore( + self, cls_scores: Sequence[Tensor], bbox_preds: Sequence[Tensor], + objectnesses: Sequence[Tensor], + batch_gt_instances: Sequence[InstanceData], + batch_img_metas: Sequence[dict], + batch_gt_instances_ignore: Sequence[Tensor]) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (Sequence[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_priors * num_classes. + bbox_preds (Sequence[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_priors * 4. + objectnesses (Sequence[Tensor]): Score factor for + all scale level, each is a 4D-tensor, has shape + (batch_size, 1, H, W). + batch_gt_instances (Sequence[InstanceData]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (Sequence[dict]): Meta information of each image, + e.g., image size, scaling factor, etc. + batch_gt_instances_ignore (Sequence[Tensor]): Ignore boxes with + batch_ids and labels, each is a 2D-tensor, the channel number + is 6, means that (batch_id, label, xmin, ymin, xmax, ymax). + Returns: + dict[str, Tensor]: A dictionary of losses. + """ + # 1. 
Convert gt to norm format + batch_targets_normed = self._convert_gt_to_norm_format( + batch_gt_instances, batch_img_metas) + + featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores] + if featmap_sizes != self.featmap_sizes: + self.mlvl_priors = self.prior_generator.grid_priors( + featmap_sizes, + dtype=cls_scores[0].dtype, + device=cls_scores[0].device) + self.featmap_sizes = featmap_sizes + + device = cls_scores[0].device + loss_cls = torch.zeros(1, device=device) + loss_box = torch.zeros(1, device=device) + loss_obj = torch.zeros(1, device=device) + scaled_factor = torch.ones(7, device=device) + + for i in range(self.num_levels): + batch_size, _, h, w = bbox_preds[i].shape + target_obj = torch.zeros_like(objectnesses[i]) + + not_ignore_flags = bbox_preds[i].new_ones(batch_size, + self.num_base_priors, h, + w) + + ignore_overlaps = bbox_overlaps(self.mlvl_priors[i], + batch_gt_instances_ignore[..., 2:], + 'iof') + ignore_max_overlaps, ignore_max_ignore_index = ignore_overlaps.max( + dim=1) + + batch_inds = batch_gt_instances_ignore[:, + 0][ignore_max_ignore_index] + ignore_inds = (ignore_max_overlaps > self.ignore_iof_thr).nonzero( + as_tuple=True)[0] + batch_inds = batch_inds[ignore_inds].long() + ignore_priors, ignore_grid_xs, ignore_grid_ys = get_prior_xy_info( + ignore_inds, self.num_base_priors, self.featmap_sizes[i]) + not_ignore_flags[batch_inds, ignore_priors, ignore_grid_ys, + ignore_grid_xs] = 0 + + # empty gt bboxes + if batch_targets_normed.shape[1] == 0: + loss_box += bbox_preds[i].sum() * 0 + loss_cls += cls_scores[i].sum() * 0 + loss_obj += self.loss_obj( + objectnesses[i], + target_obj, + weight=not_ignore_flags, + avg_factor=max(not_ignore_flags.sum(), + 1)) * self.obj_level_weights[i] + continue + + priors_base_sizes_i = self.priors_base_sizes[i] + # feature map scale whwh + scaled_factor[2:6] = torch.tensor( + bbox_preds[i].shape)[[3, 2, 3, 2]] + # Scale batch_targets from range 0-1 to range 0-features_maps size. + # (num_base_priors, num_bboxes, 7) + batch_targets_scaled = batch_targets_normed * scaled_factor + + # 2. Shape match + wh_ratio = batch_targets_scaled[..., + 4:6] / priors_base_sizes_i[:, None] + match_inds = torch.max( + wh_ratio, 1 / wh_ratio).max(2)[0] < self.prior_match_thr + batch_targets_scaled = batch_targets_scaled[match_inds] + + # no gt bbox matches anchor + if batch_targets_scaled.shape[0] == 0: + loss_box += bbox_preds[i].sum() * 0 + loss_cls += cls_scores[i].sum() * 0 + loss_obj += self.loss_obj( + objectnesses[i], + target_obj, + weight=not_ignore_flags, + avg_factor=max(not_ignore_flags.sum(), + 1)) * self.obj_level_weights[i] + continue + + # 3. Positive samples with additional neighbors + + # check the left, up, right, bottom sides of the + # targets grid, and determine whether assigned + # them as positive samples as well. 
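+            # Illustrative example (not part of the original patch): with
+            # near_neighbor_thr = 0.5 and a target centre at (3.3, 7.8) on a
+            # 20x20 grid, cx % 1 = 0.3 < 0.5 selects the left cell and
+            # (20 - 7.8) % 1 = 0.2 < 0.5 selects the bottom cell, so the
+            # target is assigned to its own cell plus those two neighbours.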
+ batch_targets_cxcy = batch_targets_scaled[:, 2:4] + grid_xy = scaled_factor[[2, 3]] - batch_targets_cxcy + left, up = ((batch_targets_cxcy % 1 < self.near_neighbor_thr) & + (batch_targets_cxcy > 1)).T + right, bottom = ((grid_xy % 1 < self.near_neighbor_thr) & + (grid_xy > 1)).T + offset_inds = torch.stack( + (torch.ones_like(left), left, up, right, bottom)) + + batch_targets_scaled = batch_targets_scaled.repeat( + (5, 1, 1))[offset_inds] + retained_offsets = self.grid_offset.repeat(1, offset_inds.shape[1], + 1)[offset_inds] + + # prepare pred results and positive sample indexes to + # calculate class loss and bbox lo + _chunk_targets = batch_targets_scaled.chunk(4, 1) + img_class_inds, grid_xy, grid_wh, priors_inds = _chunk_targets + priors_inds, (img_inds, class_inds) = priors_inds.long().view( + -1), img_class_inds.long().T + + grid_xy_long = (grid_xy - + retained_offsets * self.near_neighbor_thr).long() + grid_x_inds, grid_y_inds = grid_xy_long.T + bboxes_targets = torch.cat((grid_xy - grid_xy_long, grid_wh), 1) + + # 4. Calculate loss + # bbox loss + retained_bbox_pred = bbox_preds[i].reshape( + batch_size, self.num_base_priors, -1, h, + w)[img_inds, priors_inds, :, grid_y_inds, grid_x_inds] + priors_base_sizes_i = priors_base_sizes_i[priors_inds] + decoded_bbox_pred = self._decode_bbox_to_xywh( + retained_bbox_pred, priors_base_sizes_i) + + not_ignore_weights = not_ignore_flags[img_inds, priors_inds, + grid_y_inds, grid_x_inds] + loss_box_i, iou = self.loss_bbox( + decoded_bbox_pred, + bboxes_targets, + weight=not_ignore_weights, + avg_factor=max(not_ignore_weights.sum(), 1)) + loss_box += loss_box_i + + # obj loss + iou = iou.detach().clamp(0) + target_obj[img_inds, priors_inds, grid_y_inds, + grid_x_inds] = iou.type(target_obj.dtype) + loss_obj += self.loss_obj( + objectnesses[i], + target_obj, + weight=not_ignore_flags, + avg_factor=max(not_ignore_flags.sum(), + 1)) * self.obj_level_weights[i] + + # cls loss + if self.num_classes > 1: + pred_cls_scores = cls_scores[i].reshape( + batch_size, self.num_base_priors, -1, h, + w)[img_inds, priors_inds, :, grid_y_inds, grid_x_inds] + + target_class = torch.full_like(pred_cls_scores, 0.) + target_class[range(batch_targets_scaled.shape[0]), + class_inds] = 1. + loss_cls += self.loss_cls( + pred_cls_scores, + target_class, + weight=not_ignore_weights[:, None].repeat( + 1, self.num_classes), + avg_factor=max(not_ignore_weights.sum(), 1)) + else: + loss_cls += cls_scores[i].sum() * 0 + + _, world_size = get_dist_info() + return dict( + loss_cls=loss_cls * batch_size * world_size, + loss_obj=loss_obj * batch_size * world_size, + loss_bbox=loss_box * batch_size * world_size) diff --git a/third_party/mmyolo/mmyolo/models/dense_heads/yolov5_ins_head.py b/third_party/mmyolo/mmyolo/models/dense_heads/yolov5_ins_head.py new file mode 100644 index 0000000000000000000000000000000000000000..df94f422e904791252067e22ea8e3a643a77a8d0 --- /dev/null +++ b/third_party/mmyolo/mmyolo/models/dense_heads/yolov5_ins_head.py @@ -0,0 +1,740 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
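+# Illustrative sketch (assumed shapes; mirrors `process_mask` defined below):
+# instance masks in this head are assembled from per-instance coefficients and
+# shared prototype maps, e.g. protos (32, 160, 160) and coeffs (num_inst, 32):
+#
+#     c, mh, mw = protos.shape
+#     masks = (coeffs @ protos.view(c, -1)).sigmoid().view(-1, mh, mw)
+#     masks = self.crop_mask(masks[None], bboxes)  # zero outside the boxes
+#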
+import copy +from typing import List, Optional, Sequence, Tuple, Union + +import mmcv +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule +from mmdet.models.utils import filter_scores_and_topk, multi_apply +from mmdet.structures.bbox import bbox_cxcywh_to_xyxy +from mmdet.utils import ConfigType, OptInstanceList +from mmengine.config import ConfigDict +from mmengine.dist import get_dist_info +from mmengine.model import BaseModule +from mmengine.structures import InstanceData +from torch import Tensor + +from mmyolo.registry import MODELS +from ..utils import make_divisible +from .yolov5_head import YOLOv5Head, YOLOv5HeadModule + + +class ProtoModule(BaseModule): + """Mask Proto module for segmentation models of YOLOv5. + + Args: + in_channels (int): Number of channels in the input feature map. + middle_channels (int): Number of channels in the middle feature map. + mask_channels (int): Number of channels in the output mask feature + map. This is the channel count of the mask. + norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization + layer. Defaults to ``dict(type='BN', momentum=0.03, eps=0.001)``. + act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer. + Default: dict(type='SiLU', inplace=True). + """ + + def __init__(self, + *args, + in_channels: int = 32, + middle_channels: int = 256, + mask_channels: int = 32, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + **kwargs): + super().__init__(*args, **kwargs) + self.conv1 = ConvModule( + in_channels, + middle_channels, + kernel_size=3, + padding=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.upsample = nn.Upsample(scale_factor=2, mode='nearest') + self.conv2 = ConvModule( + middle_channels, + middle_channels, + kernel_size=3, + padding=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.conv3 = ConvModule( + middle_channels, + mask_channels, + kernel_size=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + def forward(self, x: Tensor) -> Tensor: + return self.conv3(self.conv2(self.upsample(self.conv1(x)))) + + +@MODELS.register_module() +class YOLOv5InsHeadModule(YOLOv5HeadModule): + """Detection and Instance Segmentation Head of YOLOv5. + + Args: + num_classes (int): Number of categories excluding the background + category. + mask_channels (int): Number of channels in the mask feature map. + This is the channel count of the mask. + proto_channels (int): Number of channels in the proto feature map. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization + layer. Defaults to ``dict(type='BN', momentum=0.03, eps=0.001)``. + act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer. + Default: dict(type='SiLU', inplace=True). 
+ """ + + def __init__(self, + *args, + num_classes: int, + mask_channels: int = 32, + proto_channels: int = 256, + widen_factor: float = 1.0, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + **kwargs): + self.mask_channels = mask_channels + self.num_out_attrib_with_proto = 5 + num_classes + mask_channels + self.proto_channels = make_divisible(proto_channels, widen_factor) + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + super().__init__( + *args, + num_classes=num_classes, + widen_factor=widen_factor, + **kwargs) + + def _init_layers(self): + """initialize conv layers in YOLOv5 Ins head.""" + self.convs_pred = nn.ModuleList() + for i in range(self.num_levels): + conv_pred = nn.Conv2d( + self.in_channels[i], + self.num_base_priors * self.num_out_attrib_with_proto, 1) + self.convs_pred.append(conv_pred) + + self.proto_pred = ProtoModule( + in_channels=self.in_channels[0], + middle_channels=self.proto_channels, + mask_channels=self.mask_channels, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def forward(self, x: Tuple[Tensor]) -> Tuple[List]: + """Forward features from the upstream network. + + Args: + x (Tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + Returns: + Tuple[List]: A tuple of multi-level classification scores, bbox + predictions, objectnesses, and mask predictions. + """ + assert len(x) == self.num_levels + cls_scores, bbox_preds, objectnesses, coeff_preds = multi_apply( + self.forward_single, x, self.convs_pred) + mask_protos = self.proto_pred(x[0]) + return cls_scores, bbox_preds, objectnesses, coeff_preds, mask_protos + + def forward_single( + self, x: Tensor, + convs_pred: nn.Module) -> Tuple[Tensor, Tensor, Tensor, Tensor]: + """Forward feature of a single scale level.""" + + pred_map = convs_pred(x) + bs, _, ny, nx = pred_map.shape + pred_map = pred_map.view(bs, self.num_base_priors, + self.num_out_attrib_with_proto, ny, nx) + + cls_score = pred_map[:, :, 5:self.num_classes + 5, + ...].reshape(bs, -1, ny, nx) + bbox_pred = pred_map[:, :, :4, ...].reshape(bs, -1, ny, nx) + objectness = pred_map[:, :, 4:5, ...].reshape(bs, -1, ny, nx) + coeff_pred = pred_map[:, :, self.num_classes + 5:, + ...].reshape(bs, -1, ny, nx) + + return cls_score, bbox_pred, objectness, coeff_pred + + +@MODELS.register_module() +class YOLOv5InsHead(YOLOv5Head): + """YOLOv5 Instance Segmentation and Detection head. + + Args: + mask_overlap(bool): Defaults to True. + loss_mask (:obj:`ConfigDict` or dict): Config of mask loss. + loss_mask_weight (float): The weight of mask loss. + """ + + def __init__(self, + *args, + mask_overlap: bool = True, + loss_mask: ConfigType = dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='none'), + loss_mask_weight=0.05, + **kwargs): + super().__init__(*args, **kwargs) + self.mask_overlap = mask_overlap + self.loss_mask: nn.Module = MODELS.build(loss_mask) + self.loss_mask_weight = loss_mask_weight + + def loss(self, x: Tuple[Tensor], batch_data_samples: Union[list, + dict]) -> dict: + """Perform forward propagation and loss calculation of the detection + head on the features of the upstream network. + + Args: + x (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + batch_data_samples (List[:obj:`DetDataSample`], dict): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + + Returns: + dict: A dictionary of loss components. 
+ """ + + if isinstance(batch_data_samples, list): + # TODO: support non-fast version ins segmention + raise NotImplementedError + else: + outs = self(x) + # Fast version + loss_inputs = outs + (batch_data_samples['bboxes_labels'], + batch_data_samples['masks'], + batch_data_samples['img_metas']) + losses = self.loss_by_feat(*loss_inputs) + + return losses + + def loss_by_feat( + self, + cls_scores: Sequence[Tensor], + bbox_preds: Sequence[Tensor], + objectnesses: Sequence[Tensor], + coeff_preds: Sequence[Tensor], + proto_preds: Tensor, + batch_gt_instances: Sequence[InstanceData], + batch_gt_masks: Sequence[Tensor], + batch_img_metas: Sequence[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (Sequence[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_priors * num_classes. + bbox_preds (Sequence[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_priors * 4. + objectnesses (Sequence[Tensor]): Score factor for + all scale level, each is a 4D-tensor, has shape + (batch_size, 1, H, W). + coeff_preds (Sequence[Tensor]): Mask coefficient for each scale + level, each is a 4D-tensor, the channel number is + num_priors * mask_channels. + proto_preds (Tensor): Mask prototype features extracted from the + mask head, has shape (batch_size, mask_channels, H, W). + batch_gt_instances (Sequence[InstanceData]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_gt_masks (Sequence[Tensor]): Batch of gt_mask. + batch_img_metas (Sequence[dict]): Meta information of each image, + e.g., image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + Returns: + dict[str, Tensor]: A dictionary of losses. + """ + # 1. Convert gt to norm format + batch_targets_normed = self._convert_gt_to_norm_format( + batch_gt_instances, batch_img_metas) + + device = cls_scores[0].device + loss_cls = torch.zeros(1, device=device) + loss_box = torch.zeros(1, device=device) + loss_obj = torch.zeros(1, device=device) + loss_mask = torch.zeros(1, device=device) + scaled_factor = torch.ones(8, device=device) + + for i in range(self.num_levels): + batch_size, _, h, w = bbox_preds[i].shape + target_obj = torch.zeros_like(objectnesses[i]) + + # empty gt bboxes + if batch_targets_normed.shape[1] == 0: + loss_box += bbox_preds[i].sum() * 0 + loss_cls += cls_scores[i].sum() * 0 + loss_obj += self.loss_obj( + objectnesses[i], target_obj) * self.obj_level_weights[i] + loss_mask += coeff_preds[i].sum() * 0 + continue + + priors_base_sizes_i = self.priors_base_sizes[i] + # feature map scale whwh + scaled_factor[2:6] = torch.tensor( + bbox_preds[i].shape)[[3, 2, 3, 2]] + # Scale batch_targets from range 0-1 to range 0-features_maps size. + # (num_base_priors, num_bboxes, 8) + batch_targets_scaled = batch_targets_normed * scaled_factor + + # 2. 
Shape match + wh_ratio = batch_targets_scaled[..., + 4:6] / priors_base_sizes_i[:, None] + match_inds = torch.max( + wh_ratio, 1 / wh_ratio).max(2)[0] < self.prior_match_thr + batch_targets_scaled = batch_targets_scaled[match_inds] + + # no gt bbox matches anchor + if batch_targets_scaled.shape[0] == 0: + loss_box += bbox_preds[i].sum() * 0 + loss_cls += cls_scores[i].sum() * 0 + loss_obj += self.loss_obj( + objectnesses[i], target_obj) * self.obj_level_weights[i] + loss_mask += coeff_preds[i].sum() * 0 + continue + + # 3. Positive samples with additional neighbors + + # check the left, up, right, bottom sides of the + # targets grid, and determine whether assigned + # them as positive samples as well. + batch_targets_cxcy = batch_targets_scaled[:, 2:4] + grid_xy = scaled_factor[[2, 3]] - batch_targets_cxcy + left, up = ((batch_targets_cxcy % 1 < self.near_neighbor_thr) & + (batch_targets_cxcy > 1)).T + right, bottom = ((grid_xy % 1 < self.near_neighbor_thr) & + (grid_xy > 1)).T + offset_inds = torch.stack( + (torch.ones_like(left), left, up, right, bottom)) + + batch_targets_scaled = batch_targets_scaled.repeat( + (5, 1, 1))[offset_inds] + retained_offsets = self.grid_offset.repeat(1, offset_inds.shape[1], + 1)[offset_inds] + + # prepare pred results and positive sample indexes to + # calculate class loss and bbox lo + _chunk_targets = batch_targets_scaled.chunk(4, 1) + img_class_inds, grid_xy, grid_wh,\ + priors_targets_inds = _chunk_targets + (priors_inds, targets_inds) = priors_targets_inds.long().T + (img_inds, class_inds) = img_class_inds.long().T + + grid_xy_long = (grid_xy - + retained_offsets * self.near_neighbor_thr).long() + grid_x_inds, grid_y_inds = grid_xy_long.T + bboxes_targets = torch.cat((grid_xy - grid_xy_long, grid_wh), 1) + + # 4. Calculate loss + # bbox loss + retained_bbox_pred = bbox_preds[i].reshape( + batch_size, self.num_base_priors, -1, h, + w)[img_inds, priors_inds, :, grid_y_inds, grid_x_inds] + priors_base_sizes_i = priors_base_sizes_i[priors_inds] + decoded_bbox_pred = self._decode_bbox_to_xywh( + retained_bbox_pred, priors_base_sizes_i) + loss_box_i, iou = self.loss_bbox(decoded_bbox_pred, bboxes_targets) + loss_box += loss_box_i + + # obj loss + iou = iou.detach().clamp(0) + target_obj[img_inds, priors_inds, grid_y_inds, + grid_x_inds] = iou.type(target_obj.dtype) + loss_obj += self.loss_obj(objectnesses[i], + target_obj) * self.obj_level_weights[i] + + # cls loss + if self.num_classes > 1: + pred_cls_scores = cls_scores[i].reshape( + batch_size, self.num_base_priors, -1, h, + w)[img_inds, priors_inds, :, grid_y_inds, grid_x_inds] + + target_class = torch.full_like(pred_cls_scores, 0.) + target_class[range(batch_targets_scaled.shape[0]), + class_inds] = 1. 
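+                # `target_class` is now a one-hot matrix over the matched
+                # positives, e.g. 3 positives and 80 classes give a (3, 80)
+                # tensor with a single 1. per row.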
+ loss_cls += self.loss_cls(pred_cls_scores, target_class) + else: + loss_cls += cls_scores[i].sum() * 0 + + # mask regression + retained_coeff_preds = coeff_preds[i].reshape( + batch_size, self.num_base_priors, -1, h, + w)[img_inds, priors_inds, :, grid_y_inds, grid_x_inds] + + _, c, mask_h, mask_w = proto_preds.shape + if batch_gt_masks.shape[-2:] != (mask_h, mask_w): + batch_gt_masks = F.interpolate( + batch_gt_masks[None], (mask_h, mask_w), mode='nearest')[0] + + xywh_normed = batch_targets_scaled[:, 2:6] / scaled_factor[2:6] + area_normed = xywh_normed[:, 2:].prod(1) + xywh_scaled = xywh_normed * torch.tensor( + proto_preds.shape, device=device)[[3, 2, 3, 2]] + xyxy_scaled = bbox_cxcywh_to_xyxy(xywh_scaled) + + for bs in range(batch_size): + match_inds = (img_inds == bs) # matching index + if not match_inds.any(): + continue + + if self.mask_overlap: + mask_gti = torch.where( + batch_gt_masks[bs][None] == + targets_inds[match_inds].view(-1, 1, 1), 1.0, 0.0) + else: + mask_gti = batch_gt_masks[targets_inds][match_inds] + + mask_preds = (retained_coeff_preds[match_inds] + @ proto_preds[bs].view(c, -1)).view( + -1, mask_h, mask_w) + loss_mask_full = self.loss_mask(mask_preds, mask_gti) + loss_mask += ( + self.crop_mask(loss_mask_full[None], + xyxy_scaled[match_inds]).mean(dim=(2, 3)) / + area_normed[match_inds]).mean() + + _, world_size = get_dist_info() + return dict( + loss_cls=loss_cls * batch_size * world_size, + loss_obj=loss_obj * batch_size * world_size, + loss_bbox=loss_box * batch_size * world_size, + loss_mask=loss_mask * self.loss_mask_weight * world_size) + + def _convert_gt_to_norm_format(self, + batch_gt_instances: Sequence[InstanceData], + batch_img_metas: Sequence[dict]) -> Tensor: + """Add target_inds for instance segmentation.""" + batch_targets_normed = super()._convert_gt_to_norm_format( + batch_gt_instances, batch_img_metas) + + if self.mask_overlap: + batch_size = len(batch_img_metas) + target_inds = [] + for i in range(batch_size): + # find number of targets of each image + num_gts = (batch_gt_instances[:, 0] == i).sum() + # (num_anchor, num_gts) + target_inds.append( + torch.arange(num_gts, device=batch_gt_instances.device). + float().view(1, num_gts).repeat(self.num_base_priors, 1) + + 1) + target_inds = torch.cat(target_inds, 1) + else: + num_gts = batch_gt_instances.shape[0] + target_inds = torch.arange( + num_gts, device=batch_gt_instances.device).float().view( + 1, num_gts).repeat(self.num_base_priors, 1) + batch_targets_normed = torch.cat( + [batch_targets_normed, target_inds[..., None]], 2) + return batch_targets_normed + + def predict_by_feat(self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + objectnesses: Optional[List[Tensor]] = None, + coeff_preds: Optional[List[Tensor]] = None, + proto_preds: Optional[Tensor] = None, + batch_img_metas: Optional[List[dict]] = None, + cfg: Optional[ConfigDict] = None, + rescale: bool = True, + with_nms: bool = True) -> List[InstanceData]: + """Transform a batch of output features extracted from the head into + bbox results. + Note: When score_factors is not None, the cls_scores are + usually multiplied by it then obtain the real score used in NMS. + Args: + cls_scores (list[Tensor]): Classification scores for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * 4, H, W). 
+ objectnesses (list[Tensor], Optional): Score factor for + all scale level, each is a 4D-tensor, has shape + (batch_size, 1, H, W). + coeff_preds (list[Tensor]): Mask coefficients predictions + for all scale levels, each is a 4D-tensor, has shape + (batch_size, mask_channels, H, W). + proto_preds (Tensor): Mask prototype features extracted from the + mask head, has shape (batch_size, mask_channels, H, W). + batch_img_metas (list[dict], Optional): Batch image meta info. + Defaults to None. + cfg (ConfigDict, optional): Test / postprocessing + configuration, if None, test_cfg would be used. + Defaults to None. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + with_nms (bool): If True, do nms before return boxes. + Defaults to True. + Returns: + list[:obj:`InstanceData`]: Object detection and instance + segmentation results of each image after the post process. + Each item usually contains following keys. + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + - masks (Tensor): Has a shape (num_instances, h, w). + """ + assert len(cls_scores) == len(bbox_preds) == len(coeff_preds) + if objectnesses is None: + with_objectnesses = False + else: + with_objectnesses = True + assert len(cls_scores) == len(objectnesses) + + cfg = self.test_cfg if cfg is None else cfg + cfg = copy.deepcopy(cfg) + + multi_label = cfg.multi_label + multi_label &= self.num_classes > 1 + cfg.multi_label = multi_label + + num_imgs = len(batch_img_metas) + featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores] + + # If the shape does not change, use the previous mlvl_priors + if featmap_sizes != self.featmap_sizes: + self.mlvl_priors = self.prior_generator.grid_priors( + featmap_sizes, + dtype=cls_scores[0].dtype, + device=cls_scores[0].device) + self.featmap_sizes = featmap_sizes + flatten_priors = torch.cat(self.mlvl_priors) + + mlvl_strides = [ + flatten_priors.new_full( + (featmap_size.numel() * self.num_base_priors, ), stride) for + featmap_size, stride in zip(featmap_sizes, self.featmap_strides) + ] + flatten_stride = torch.cat(mlvl_strides) + + # flatten cls_scores, bbox_preds and objectness + flatten_cls_scores = [ + cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.num_classes) + for cls_score in cls_scores + ] + flatten_bbox_preds = [ + bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) + for bbox_pred in bbox_preds + ] + flatten_coeff_preds = [ + coeff_pred.permute(0, 2, 3, + 1).reshape(num_imgs, -1, + self.head_module.mask_channels) + for coeff_pred in coeff_preds + ] + + flatten_cls_scores = torch.cat(flatten_cls_scores, dim=1).sigmoid() + flatten_bbox_preds = torch.cat(flatten_bbox_preds, dim=1) + flatten_decoded_bboxes = self.bbox_coder.decode( + flatten_priors.unsqueeze(0), flatten_bbox_preds, flatten_stride) + + flatten_coeff_preds = torch.cat(flatten_coeff_preds, dim=1) + + if with_objectnesses: + flatten_objectness = [ + objectness.permute(0, 2, 3, 1).reshape(num_imgs, -1) + for objectness in objectnesses + ] + flatten_objectness = torch.cat(flatten_objectness, dim=1).sigmoid() + else: + flatten_objectness = [None for _ in range(len(featmap_sizes))] + + results_list = [] + for (bboxes, scores, objectness, coeffs, mask_proto, + img_meta) in zip(flatten_decoded_bboxes, flatten_cls_scores, + flatten_objectness, flatten_coeff_preds, + proto_preds, 
batch_img_metas): + ori_shape = img_meta['ori_shape'] + batch_input_shape = img_meta['batch_input_shape'] + input_shape_h, input_shape_w = batch_input_shape + if 'pad_param' in img_meta: + pad_param = img_meta['pad_param'] + input_shape_withoutpad = (input_shape_h - pad_param[0] - + pad_param[1], input_shape_w - + pad_param[2] - pad_param[3]) + else: + pad_param = None + input_shape_withoutpad = batch_input_shape + scale_factor = (input_shape_withoutpad[1] / ori_shape[1], + input_shape_withoutpad[0] / ori_shape[0]) + + score_thr = cfg.get('score_thr', -1) + # yolox_style does not require the following operations + if objectness is not None and score_thr > 0 and not cfg.get( + 'yolox_style', False): + conf_inds = objectness > score_thr + bboxes = bboxes[conf_inds, :] + scores = scores[conf_inds, :] + objectness = objectness[conf_inds] + coeffs = coeffs[conf_inds] + + if objectness is not None: + # conf = obj_conf * cls_conf + scores *= objectness[:, None] + # NOTE: Important + coeffs *= objectness[:, None] + + if scores.shape[0] == 0: + empty_results = InstanceData() + empty_results.bboxes = bboxes + empty_results.scores = scores[:, 0] + empty_results.labels = scores[:, 0].int() + h, w = ori_shape[:2] if rescale else img_meta['img_shape'][:2] + empty_results.masks = torch.zeros( + size=(0, h, w), dtype=torch.bool, device=bboxes.device) + results_list.append(empty_results) + continue + + nms_pre = cfg.get('nms_pre', 100000) + if cfg.multi_label is False: + scores, labels = scores.max(1, keepdim=True) + scores, _, keep_idxs, results = filter_scores_and_topk( + scores, + score_thr, + nms_pre, + results=dict(labels=labels[:, 0], coeffs=coeffs)) + labels = results['labels'] + coeffs = results['coeffs'] + else: + out = filter_scores_and_topk( + scores, score_thr, nms_pre, results=dict(coeffs=coeffs)) + scores, labels, keep_idxs, filtered_results = out + coeffs = filtered_results['coeffs'] + + results = InstanceData( + scores=scores, + labels=labels, + bboxes=bboxes[keep_idxs], + coeffs=coeffs) + + if cfg.get('yolox_style', False): + # do not need max_per_img + cfg.max_per_img = len(results) + + results = self._bbox_post_process( + results=results, + cfg=cfg, + rescale=False, + with_nms=with_nms, + img_meta=img_meta) + + if len(results.bboxes): + masks = self.process_mask(mask_proto, results.coeffs, + results.bboxes, + (input_shape_h, input_shape_w), True) + if rescale: + if pad_param is not None: + # bbox minus pad param + top_pad, _, left_pad, _ = pad_param + results.bboxes -= results.bboxes.new_tensor( + [left_pad, top_pad, left_pad, top_pad]) + # mask crop pad param + top, left = int(top_pad), int(left_pad) + bottom, right = int(input_shape_h - + top_pad), int(input_shape_w - + left_pad) + masks = masks[:, :, top:bottom, left:right] + results.bboxes /= results.bboxes.new_tensor( + scale_factor).repeat((1, 2)) + + fast_test = cfg.get('fast_test', False) + if fast_test: + masks = F.interpolate( + masks, + size=ori_shape, + mode='bilinear', + align_corners=False) + masks = masks.squeeze(0) + masks = masks > cfg.mask_thr_binary + else: + masks.gt_(cfg.mask_thr_binary) + masks = torch.as_tensor(masks, dtype=torch.uint8) + masks = masks[0].permute(1, 2, + 0).contiguous().cpu().numpy() + masks = mmcv.imresize(masks, + (ori_shape[1], ori_shape[0])) + + if len(masks.shape) == 2: + masks = masks[:, :, None] + masks = torch.from_numpy(masks).permute(2, 0, 1) + + results.bboxes[:, 0::2].clamp_(0, ori_shape[1]) + results.bboxes[:, 1::2].clamp_(0, ori_shape[0]) + + results.masks = masks.bool() + 
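+                # `results.masks` now holds one boolean (ori_h, ori_w) map per
+                # kept instance, aligned with the clamped `results.bboxes`.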
results_list.append(results) + else: + h, w = ori_shape[:2] if rescale else img_meta['img_shape'][:2] + results.masks = torch.zeros( + size=(0, h, w), dtype=torch.bool, device=bboxes.device) + results_list.append(results) + return results_list + + def process_mask(self, + mask_proto: Tensor, + mask_coeff_pred: Tensor, + bboxes: Tensor, + shape: Tuple[int, int], + upsample: bool = False) -> Tensor: + """Generate mask logits results. + + Args: + mask_proto (Tensor): Mask prototype features. + Has shape (num_instance, mask_channels). + mask_coeff_pred (Tensor): Mask coefficients prediction for + single image. Has shape (mask_channels, H, W) + bboxes (Tensor): Tensor of the bbox. Has shape (num_instance, 4). + shape (Tuple): Batch input shape of image. + upsample (bool): Whether upsample masks results to batch input + shape. Default to False. + Return: + Tensor: Instance segmentation masks for each instance. + Has shape (num_instance, H, W). + """ + c, mh, mw = mask_proto.shape # CHW + masks = ( + mask_coeff_pred @ mask_proto.float().view(c, -1)).sigmoid().view( + -1, mh, mw)[None] + if upsample: + masks = F.interpolate( + masks, shape, mode='bilinear', align_corners=False) # 1CHW + masks = self.crop_mask(masks, bboxes) + return masks + + def crop_mask(self, masks: Tensor, boxes: Tensor) -> Tensor: + """Crop mask by the bounding box. + + Args: + masks (Tensor): Predicted mask results. Has shape + (1, num_instance, H, W). + boxes (Tensor): Tensor of the bbox. Has shape (num_instance, 4). + Returns: + (torch.Tensor): The masks are being cropped to the bounding box. + """ + _, n, h, w = masks.shape + x1, y1, x2, y2 = torch.chunk(boxes[:, :, None], 4, 1) + r = torch.arange( + w, device=masks.device, + dtype=x1.dtype)[None, None, None, :] # rows shape(1, 1, w, 1) + c = torch.arange( + h, device=masks.device, + dtype=x1.dtype)[None, None, :, None] # cols shape(1, h, 1, 1) + + return masks * ((r >= x1) * (r < x2) * (c >= y1) * (c < y2)) diff --git a/third_party/mmyolo/mmyolo/models/dense_heads/yolov6_head.py b/third_party/mmyolo/mmyolo/models/dense_heads/yolov6_head.py new file mode 100644 index 0000000000000000000000000000000000000000..3b01133f04f467de9beab08ac9bae602d4588a96 --- /dev/null +++ b/third_party/mmyolo/mmyolo/models/dense_heads/yolov6_head.py @@ -0,0 +1,396 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Sequence, Tuple, Union + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmdet.models.utils import multi_apply +from mmdet.utils import (ConfigType, OptConfigType, OptInstanceList, + OptMultiConfig) +from mmengine import MessageHub +from mmengine.dist import get_dist_info +from mmengine.model import BaseModule, bias_init_with_prob +from mmengine.structures import InstanceData +from torch import Tensor + +from mmyolo.registry import MODELS, TASK_UTILS +from ..utils import gt_instances_preprocess +from .yolov5_head import YOLOv5Head + + +@MODELS.register_module() +class YOLOv6HeadModule(BaseModule): + """YOLOv6Head head module used in `YOLOv6. + + `_. + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (Union[int, Sequence]): Number of channels in the input + feature map. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + num_base_priors: (int): The number of priors (points) at a point + on the feature grid. + featmap_strides (Sequence[int]): Downsample factor of each feature map. + Defaults to [8, 16, 32]. 
+ None, otherwise False. Defaults to "auto". + norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization + layer. Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer. + Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + num_classes: int, + in_channels: Union[int, Sequence], + widen_factor: float = 1.0, + num_base_priors: int = 1, + reg_max=0, + featmap_strides: Sequence[int] = (8, 16, 32), + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None): + super().__init__(init_cfg=init_cfg) + + self.num_classes = num_classes + self.featmap_strides = featmap_strides + self.num_levels = len(self.featmap_strides) + self.num_base_priors = num_base_priors + self.reg_max = reg_max + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + + if isinstance(in_channels, int): + self.in_channels = [int(in_channels * widen_factor) + ] * self.num_levels + else: + self.in_channels = [int(i * widen_factor) for i in in_channels] + + self._init_layers() + + def _init_layers(self): + """initialize conv layers in YOLOv6 head.""" + # Init decouple head + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + self.cls_preds = nn.ModuleList() + self.reg_preds = nn.ModuleList() + self.stems = nn.ModuleList() + + if self.reg_max > 1: + proj = torch.arange( + self.reg_max + self.num_base_priors, dtype=torch.float) + self.register_buffer('proj', proj, persistent=False) + + for i in range(self.num_levels): + self.stems.append( + ConvModule( + in_channels=self.in_channels[i], + out_channels=self.in_channels[i], + kernel_size=1, + stride=1, + padding=1 // 2, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + self.cls_convs.append( + ConvModule( + in_channels=self.in_channels[i], + out_channels=self.in_channels[i], + kernel_size=3, + stride=1, + padding=3 // 2, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + self.reg_convs.append( + ConvModule( + in_channels=self.in_channels[i], + out_channels=self.in_channels[i], + kernel_size=3, + stride=1, + padding=3 // 2, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + self.cls_preds.append( + nn.Conv2d( + in_channels=self.in_channels[i], + out_channels=self.num_base_priors * self.num_classes, + kernel_size=1)) + self.reg_preds.append( + nn.Conv2d( + in_channels=self.in_channels[i], + out_channels=(self.num_base_priors + self.reg_max) * 4, + kernel_size=1)) + + def init_weights(self): + super().init_weights() + bias_init = bias_init_with_prob(0.01) + for conv in self.cls_preds: + conv.bias.data.fill_(bias_init) + conv.weight.data.fill_(0.) + + for conv in self.reg_preds: + conv.bias.data.fill_(1.0) + conv.weight.data.fill_(0.) + + def forward(self, x: Tuple[Tensor]) -> Tuple[List]: + """Forward features from the upstream network. + + Args: + x (Tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + Returns: + Tuple[List]: A tuple of multi-level classification scores, bbox + predictions. 
+ """ + assert len(x) == self.num_levels + return multi_apply(self.forward_single, x, self.stems, self.cls_convs, + self.cls_preds, self.reg_convs, self.reg_preds) + + def forward_single(self, x: Tensor, stem: nn.Module, cls_conv: nn.Module, + cls_pred: nn.Module, reg_conv: nn.Module, + reg_pred: nn.Module) -> Tuple[Tensor, Tensor]: + """Forward feature of a single scale level.""" + b, _, h, w = x.shape + y = stem(x) + cls_x = y + reg_x = y + cls_feat = cls_conv(cls_x) + reg_feat = reg_conv(reg_x) + + cls_score = cls_pred(cls_feat) + bbox_dist_preds = reg_pred(reg_feat) + + if self.reg_max > 1: + bbox_dist_preds = bbox_dist_preds.reshape( + [-1, 4, self.reg_max + self.num_base_priors, + h * w]).permute(0, 3, 1, 2) + + # TODO: The get_flops script cannot handle the situation of + # matmul, and needs to be fixed later + # bbox_preds = bbox_dist_preds.softmax(3).matmul(self.proj) + bbox_preds = bbox_dist_preds.softmax(3).matmul( + self.proj.view([-1, 1])).squeeze(-1) + bbox_preds = bbox_preds.transpose(1, 2).reshape(b, -1, h, w) + else: + bbox_preds = bbox_dist_preds + + if self.training: + return cls_score, bbox_preds, bbox_dist_preds + else: + return cls_score, bbox_preds + + +@MODELS.register_module() +class YOLOv6Head(YOLOv5Head): + """YOLOv6Head head used in `YOLOv6 `_. + + Args: + head_module(ConfigType): Base module used for YOLOv6Head + prior_generator(dict): Points generator feature maps + in 2D points-based detectors. + loss_cls (:obj:`ConfigDict` or dict): Config of classification loss. + loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss. + train_cfg (:obj:`ConfigDict` or dict, optional): Training config of + anchor head. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of + anchor head. Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + head_module: ConfigType, + prior_generator: ConfigType = dict( + type='mmdet.MlvlPointGenerator', + offset=0.5, + strides=[8, 16, 32]), + bbox_coder: ConfigType = dict(type='DistancePointBBoxCoder'), + loss_cls: ConfigType = dict( + type='mmdet.VarifocalLoss', + use_sigmoid=True, + alpha=0.75, + gamma=2.0, + iou_weighted=True, + reduction='sum', + loss_weight=1.0), + loss_bbox: ConfigType = dict( + type='IoULoss', + iou_mode='giou', + bbox_format='xyxy', + reduction='mean', + loss_weight=2.5, + return_iou=False), + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + init_cfg: OptMultiConfig = None): + super().__init__( + head_module=head_module, + prior_generator=prior_generator, + bbox_coder=bbox_coder, + loss_cls=loss_cls, + loss_bbox=loss_bbox, + train_cfg=train_cfg, + test_cfg=test_cfg, + init_cfg=init_cfg) + # yolov6 doesn't need loss_obj + self.loss_obj = None + + def special_init(self): + """Since YOLO series algorithms will inherit from YOLOv5Head, but + different algorithms have special initialization process. + + The special_init function is designed to deal with this situation. 
+ """ + if self.train_cfg: + self.initial_epoch = self.train_cfg['initial_epoch'] + self.initial_assigner = TASK_UTILS.build( + self.train_cfg.initial_assigner) + self.assigner = TASK_UTILS.build(self.train_cfg.assigner) + + # Add common attributes to reduce calculation + self.featmap_sizes_train = None + self.num_level_priors = None + self.flatten_priors_train = None + self.stride_tensor = None + + def loss_by_feat( + self, + cls_scores: Sequence[Tensor], + bbox_preds: Sequence[Tensor], + bbox_dist_preds: Sequence[Tensor], + batch_gt_instances: Sequence[InstanceData], + batch_img_metas: Sequence[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (Sequence[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_priors * num_classes. + bbox_preds (Sequence[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_priors * 4. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + Returns: + dict[str, Tensor]: A dictionary of losses. + """ + + # get epoch information from message hub + message_hub = MessageHub.get_current_instance() + current_epoch = message_hub.get_info('epoch') + + num_imgs = len(batch_img_metas) + if batch_gt_instances_ignore is None: + batch_gt_instances_ignore = [None] * num_imgs + + current_featmap_sizes = [ + cls_score.shape[2:] for cls_score in cls_scores + ] + # If the shape does not equal, generate new one + if current_featmap_sizes != self.featmap_sizes_train: + self.featmap_sizes_train = current_featmap_sizes + + mlvl_priors_with_stride = self.prior_generator.grid_priors( + self.featmap_sizes_train, + dtype=cls_scores[0].dtype, + device=cls_scores[0].device, + with_stride=True) + + self.num_level_priors = [len(n) for n in mlvl_priors_with_stride] + self.flatten_priors_train = torch.cat( + mlvl_priors_with_stride, dim=0) + self.stride_tensor = self.flatten_priors_train[..., [2]] + + # gt info + gt_info = gt_instances_preprocess(batch_gt_instances, num_imgs) + gt_labels = gt_info[:, :, :1] + gt_bboxes = gt_info[:, :, 1:] # xyxy + pad_bbox_flag = (gt_bboxes.sum(-1, keepdim=True) > 0).float() + + # pred info + flatten_cls_preds = [ + cls_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.num_classes) + for cls_pred in cls_scores + ] + + flatten_pred_bboxes = [ + bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) + for bbox_pred in bbox_preds + ] + + flatten_cls_preds = torch.cat(flatten_cls_preds, dim=1) + flatten_pred_bboxes = torch.cat(flatten_pred_bboxes, dim=1) + flatten_pred_bboxes = self.bbox_coder.decode( + self.flatten_priors_train[..., :2], flatten_pred_bboxes, + self.stride_tensor[:, 0]) + pred_scores = torch.sigmoid(flatten_cls_preds) + + if current_epoch < self.initial_epoch: + assigned_result = self.initial_assigner( + flatten_pred_bboxes.detach(), self.flatten_priors_train, + self.num_level_priors, gt_labels, gt_bboxes, pad_bbox_flag) + else: + assigned_result = self.assigner(flatten_pred_bboxes.detach(), + pred_scores.detach(), + self.flatten_priors_train, + 
gt_labels, gt_bboxes, + pad_bbox_flag) + + assigned_bboxes = assigned_result['assigned_bboxes'] + assigned_scores = assigned_result['assigned_scores'] + fg_mask_pre_prior = assigned_result['fg_mask_pre_prior'] + + # cls loss + with torch.cuda.amp.autocast(enabled=False): + loss_cls = self.loss_cls(flatten_cls_preds, assigned_scores) + + # rescale bbox + assigned_bboxes /= self.stride_tensor + flatten_pred_bboxes /= self.stride_tensor + + # TODO: Add all_reduce makes training more stable + assigned_scores_sum = assigned_scores.sum() + if assigned_scores_sum > 0: + loss_cls /= assigned_scores_sum + + # select positive samples mask + num_pos = fg_mask_pre_prior.sum() + if num_pos > 0: + # when num_pos > 0, assigned_scores_sum will >0, so the loss_bbox + # will not report an error + # iou loss + prior_bbox_mask = fg_mask_pre_prior.unsqueeze(-1).repeat([1, 1, 4]) + pred_bboxes_pos = torch.masked_select( + flatten_pred_bboxes, prior_bbox_mask).reshape([-1, 4]) + assigned_bboxes_pos = torch.masked_select( + assigned_bboxes, prior_bbox_mask).reshape([-1, 4]) + bbox_weight = torch.masked_select( + assigned_scores.sum(-1), fg_mask_pre_prior).unsqueeze(-1) + loss_bbox = self.loss_bbox( + pred_bboxes_pos, + assigned_bboxes_pos, + weight=bbox_weight, + avg_factor=assigned_scores_sum) + else: + loss_bbox = flatten_pred_bboxes.sum() * 0 + + _, world_size = get_dist_info() + return dict( + loss_cls=loss_cls * world_size, loss_bbox=loss_bbox * world_size) diff --git a/third_party/mmyolo/mmyolo/models/dense_heads/yolov7_head.py b/third_party/mmyolo/mmyolo/models/dense_heads/yolov7_head.py new file mode 100644 index 0000000000000000000000000000000000000000..124883cf4b4c5b51d6643edc7c2f813178d80c78 --- /dev/null +++ b/third_party/mmyolo/mmyolo/models/dense_heads/yolov7_head.py @@ -0,0 +1,404 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
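+# Illustrative note on the bias initialisation used below (assumed numbers):
+# the objectness bias is shifted by log(8 / (640 / s) ** 2), i.e. it assumes
+# roughly 8 objects per 640x640 image. For stride s = 8 this gives
+# log(8 / 6400) ~= -6.68, an initial objectness of sigmoid(-6.68) ~= 0.00125;
+# the class bias log(0.6 / (num_classes - 0.99)) similarly starts each class
+# score near 0.0075 for 80 classes.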
+import math +from typing import List, Optional, Sequence, Tuple, Union + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmdet.models.utils import multi_apply +from mmdet.utils import ConfigType, OptInstanceList +from mmengine.dist import get_dist_info +from mmengine.structures import InstanceData +from torch import Tensor + +from mmyolo.registry import MODELS +from ..layers import ImplicitA, ImplicitM +from ..task_modules.assigners.batch_yolov7_assigner import BatchYOLOv7Assigner +from .yolov5_head import YOLOv5Head, YOLOv5HeadModule + + +@MODELS.register_module() +class YOLOv7HeadModule(YOLOv5HeadModule): + """YOLOv7Head head module used in YOLOv7.""" + + def _init_layers(self): + """initialize conv layers in YOLOv7 head.""" + self.convs_pred = nn.ModuleList() + for i in range(self.num_levels): + conv_pred = nn.Sequential( + ImplicitA(self.in_channels[i]), + nn.Conv2d(self.in_channels[i], + self.num_base_priors * self.num_out_attrib, 1), + ImplicitM(self.num_base_priors * self.num_out_attrib), + ) + self.convs_pred.append(conv_pred) + + def init_weights(self): + """Initialize the bias of YOLOv7 head.""" + super(YOLOv5HeadModule, self).init_weights() + for mi, s in zip(self.convs_pred, self.featmap_strides): # from + mi = mi[1] # nn.Conv2d + + b = mi.bias.data.view(self.num_base_priors, -1) + # obj (8 objects per 640 image) + b.data[:, 4] += math.log(8 / (640 / s)**2) + b.data[:, 5:] += math.log(0.6 / (self.num_classes - 0.99)) + + mi.bias.data = b.view(-1) + + +@MODELS.register_module() +class YOLOv7p6HeadModule(YOLOv5HeadModule): + """YOLOv7Head head module used in YOLOv7.""" + + def __init__(self, + *args, + main_out_channels: Sequence[int] = [256, 512, 768, 1024], + aux_out_channels: Sequence[int] = [320, 640, 960, 1280], + use_aux: bool = True, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + **kwargs): + self.main_out_channels = main_out_channels + self.aux_out_channels = aux_out_channels + self.use_aux = use_aux + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + super().__init__(*args, **kwargs) + + def _init_layers(self): + """initialize conv layers in YOLOv7 head.""" + self.main_convs_pred = nn.ModuleList() + for i in range(self.num_levels): + conv_pred = nn.Sequential( + ConvModule( + self.in_channels[i], + self.main_out_channels[i], + 3, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + ImplicitA(self.main_out_channels[i]), + nn.Conv2d(self.main_out_channels[i], + self.num_base_priors * self.num_out_attrib, 1), + ImplicitM(self.num_base_priors * self.num_out_attrib), + ) + self.main_convs_pred.append(conv_pred) + + if self.use_aux: + self.aux_convs_pred = nn.ModuleList() + for i in range(self.num_levels): + aux_pred = nn.Sequential( + ConvModule( + self.in_channels[i], + self.aux_out_channels[i], + 3, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + nn.Conv2d(self.aux_out_channels[i], + self.num_base_priors * self.num_out_attrib, 1)) + self.aux_convs_pred.append(aux_pred) + else: + self.aux_convs_pred = [None] * len(self.main_convs_pred) + + def init_weights(self): + """Initialize the bias of YOLOv5 head.""" + super(YOLOv5HeadModule, self).init_weights() + for mi, aux, s in zip(self.main_convs_pred, self.aux_convs_pred, + self.featmap_strides): # from + mi = mi[2] # nn.Conv2d + b = mi.bias.data.view(3, -1) + # obj (8 objects per 640 image) + b.data[:, 4] += math.log(8 / (640 / s)**2) + b.data[:, 5:] += math.log(0.6 / (self.num_classes 
- 0.99)) + mi.bias.data = b.view(-1) + + if self.use_aux: + aux = aux[1] # nn.Conv2d + b = aux.bias.data.view(3, -1) + # obj (8 objects per 640 image) + b.data[:, 4] += math.log(8 / (640 / s)**2) + b.data[:, 5:] += math.log(0.6 / (self.num_classes - 0.99)) + mi.bias.data = b.view(-1) + + def forward(self, x: Tuple[Tensor]) -> Tuple[List]: + """Forward features from the upstream network. + + Args: + x (Tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + Returns: + Tuple[List]: A tuple of multi-level classification scores, bbox + predictions, and objectnesses. + """ + assert len(x) == self.num_levels + return multi_apply(self.forward_single, x, self.main_convs_pred, + self.aux_convs_pred) + + def forward_single(self, x: Tensor, convs: nn.Module, + aux_convs: Optional[nn.Module]) \ + -> Tuple[Union[Tensor, List], Union[Tensor, List], + Union[Tensor, List]]: + """Forward feature of a single scale level.""" + + pred_map = convs(x) + bs, _, ny, nx = pred_map.shape + pred_map = pred_map.view(bs, self.num_base_priors, self.num_out_attrib, + ny, nx) + + cls_score = pred_map[:, :, 5:, ...].reshape(bs, -1, ny, nx) + bbox_pred = pred_map[:, :, :4, ...].reshape(bs, -1, ny, nx) + objectness = pred_map[:, :, 4:5, ...].reshape(bs, -1, ny, nx) + + if not self.training or not self.use_aux: + return cls_score, bbox_pred, objectness + else: + aux_pred_map = aux_convs(x) + aux_pred_map = aux_pred_map.view(bs, self.num_base_priors, + self.num_out_attrib, ny, nx) + aux_cls_score = aux_pred_map[:, :, 5:, ...].reshape(bs, -1, ny, nx) + aux_bbox_pred = aux_pred_map[:, :, :4, ...].reshape(bs, -1, ny, nx) + aux_objectness = aux_pred_map[:, :, 4:5, + ...].reshape(bs, -1, ny, nx) + + return [cls_score, + aux_cls_score], [bbox_pred, aux_bbox_pred + ], [objectness, aux_objectness] + + +@MODELS.register_module() +class YOLOv7Head(YOLOv5Head): + """YOLOv7Head head used in `YOLOv7 `_. + + Args: + simota_candidate_topk (int): The candidate top-k which used to + get top-k ious to calculate dynamic-k in BatchYOLOv7Assigner. + Defaults to 10. + simota_iou_weight (float): The scale factor for regression + iou cost in BatchYOLOv7Assigner. Defaults to 3.0. + simota_cls_weight (float): The scale factor for classification + cost in BatchYOLOv7Assigner. Defaults to 1.0. + """ + + def __init__(self, + *args, + simota_candidate_topk: int = 20, + simota_iou_weight: float = 3.0, + simota_cls_weight: float = 1.0, + aux_loss_weights: float = 0.25, + **kwargs): + super().__init__(*args, **kwargs) + self.aux_loss_weights = aux_loss_weights + self.assigner = BatchYOLOv7Assigner( + num_classes=self.num_classes, + num_base_priors=self.num_base_priors, + featmap_strides=self.featmap_strides, + prior_match_thr=self.prior_match_thr, + candidate_topk=simota_candidate_topk, + iou_weight=simota_iou_weight, + cls_weight=simota_cls_weight) + + def loss_by_feat( + self, + cls_scores: Sequence[Union[Tensor, List]], + bbox_preds: Sequence[Union[Tensor, List]], + objectnesses: Sequence[Union[Tensor, List]], + batch_gt_instances: Sequence[InstanceData], + batch_img_metas: Sequence[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (Sequence[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_priors * num_classes. + bbox_preds (Sequence[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_priors * 4. 
+ objectnesses (Sequence[Tensor]): Score factor for + all scale level, each is a 4D-tensor, has shape + (batch_size, 1, H, W). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + Returns: + dict[str, Tensor]: A dictionary of losses. + """ + + if isinstance(cls_scores[0], Sequence): + with_aux = True + batch_size = cls_scores[0][0].shape[0] + device = cls_scores[0][0].device + + bbox_preds_main, bbox_preds_aux = zip(*bbox_preds) + objectnesses_main, objectnesses_aux = zip(*objectnesses) + cls_scores_main, cls_scores_aux = zip(*cls_scores) + + head_preds = self._merge_predict_results(bbox_preds_main, + objectnesses_main, + cls_scores_main) + head_preds_aux = self._merge_predict_results( + bbox_preds_aux, objectnesses_aux, cls_scores_aux) + else: + with_aux = False + batch_size = cls_scores[0].shape[0] + device = cls_scores[0].device + + head_preds = self._merge_predict_results(bbox_preds, objectnesses, + cls_scores) + + # Convert gt to norm xywh format + # (num_base_priors, num_batch_gt, 7) + # 7 is mean (batch_idx, cls_id, x_norm, y_norm, + # w_norm, h_norm, prior_idx) + batch_targets_normed = self._convert_gt_to_norm_format( + batch_gt_instances, batch_img_metas) + + scaled_factors = [ + torch.tensor(head_pred.shape, device=device)[[3, 2, 3, 2]] + for head_pred in head_preds + ] + + loss_cls, loss_obj, loss_box = self._calc_loss( + head_preds=head_preds, + head_preds_aux=None, + batch_targets_normed=batch_targets_normed, + near_neighbor_thr=self.near_neighbor_thr, + scaled_factors=scaled_factors, + batch_img_metas=batch_img_metas, + device=device) + + if with_aux: + loss_cls_aux, loss_obj_aux, loss_box_aux = self._calc_loss( + head_preds=head_preds, + head_preds_aux=head_preds_aux, + batch_targets_normed=batch_targets_normed, + near_neighbor_thr=self.near_neighbor_thr * 2, + scaled_factors=scaled_factors, + batch_img_metas=batch_img_metas, + device=device) + loss_cls += self.aux_loss_weights * loss_cls_aux + loss_obj += self.aux_loss_weights * loss_obj_aux + loss_box += self.aux_loss_weights * loss_box_aux + + _, world_size = get_dist_info() + return dict( + loss_cls=loss_cls * batch_size * world_size, + loss_obj=loss_obj * batch_size * world_size, + loss_bbox=loss_box * batch_size * world_size) + + def _calc_loss(self, head_preds, head_preds_aux, batch_targets_normed, + near_neighbor_thr, scaled_factors, batch_img_metas, device): + loss_cls = torch.zeros(1, device=device) + loss_box = torch.zeros(1, device=device) + loss_obj = torch.zeros(1, device=device) + + assigner_results = self.assigner( + head_preds, + batch_targets_normed, + batch_img_metas[0]['batch_input_shape'], + self.priors_base_sizes, + self.grid_offset, + near_neighbor_thr=near_neighbor_thr) + # mlvl is mean multi_level + mlvl_positive_infos = assigner_results['mlvl_positive_infos'] + mlvl_priors = assigner_results['mlvl_priors'] + mlvl_targets_normed = assigner_results['mlvl_targets_normed'] + + if head_preds_aux is not None: + # This is mean calc aux branch loss + head_preds = head_preds_aux + + for i, head_pred in enumerate(head_preds): + batch_inds, proir_idx, grid_x, grid_y = mlvl_positive_infos[i].T + num_pred_positive = 
batch_inds.shape[0] + target_obj = torch.zeros_like(head_pred[..., 0]) + # empty positive sampler + if num_pred_positive == 0: + loss_box += head_pred[..., :4].sum() * 0 + loss_cls += head_pred[..., 5:].sum() * 0 + loss_obj += self.loss_obj( + head_pred[..., 4], target_obj) * self.obj_level_weights[i] + continue + + priors = mlvl_priors[i] + targets_normed = mlvl_targets_normed[i] + + head_pred_positive = head_pred[batch_inds, proir_idx, grid_y, + grid_x] + + # calc bbox loss + grid_xy = torch.stack([grid_x, grid_y], dim=1) + decoded_pred_bbox = self._decode_bbox_to_xywh( + head_pred_positive[:, :4], priors, grid_xy) + target_bbox_scaled = targets_normed[:, 2:6] * scaled_factors[i] + + loss_box_i, iou = self.loss_bbox(decoded_pred_bbox, + target_bbox_scaled) + loss_box += loss_box_i + + # calc obj loss + target_obj[batch_inds, proir_idx, grid_y, + grid_x] = iou.detach().clamp(0).type(target_obj.dtype) + loss_obj += self.loss_obj(head_pred[..., 4], + target_obj) * self.obj_level_weights[i] + + # calc cls loss + if self.num_classes > 1: + pred_cls_scores = targets_normed[:, 1].long() + target_class = torch.full_like( + head_pred_positive[:, 5:], 0., device=device) + target_class[range(num_pred_positive), pred_cls_scores] = 1. + loss_cls += self.loss_cls(head_pred_positive[:, 5:], + target_class) + else: + loss_cls += head_pred_positive[:, 5:].sum() * 0 + return loss_cls, loss_obj, loss_box + + def _merge_predict_results(self, bbox_preds: Sequence[Tensor], + objectnesses: Sequence[Tensor], + cls_scores: Sequence[Tensor]) -> List[Tensor]: + """Merge predict output from 3 heads. + + Args: + cls_scores (Sequence[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_priors * num_classes. + bbox_preds (Sequence[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_priors * 4. + objectnesses (Sequence[Tensor]): Score factor for + all scale level, each is a 4D-tensor, has shape + (batch_size, 1, H, W). + + Returns: + List[Tensor]: Merged output. + """ + head_preds = [] + for bbox_pred, objectness, cls_score in zip(bbox_preds, objectnesses, + cls_scores): + b, _, h, w = bbox_pred.shape + bbox_pred = bbox_pred.reshape(b, self.num_base_priors, -1, h, w) + objectness = objectness.reshape(b, self.num_base_priors, -1, h, w) + cls_score = cls_score.reshape(b, self.num_base_priors, -1, h, w) + head_pred = torch.cat([bbox_pred, objectness, cls_score], + dim=2).permute(0, 1, 3, 4, 2).contiguous() + head_preds.append(head_pred) + return head_preds + + def _decode_bbox_to_xywh(self, bbox_pred, priors_base_sizes, + grid_xy) -> Tensor: + bbox_pred = bbox_pred.sigmoid() + pred_xy = bbox_pred[:, :2] * 2 - 0.5 + grid_xy + pred_wh = (bbox_pred[:, 2:] * 2)**2 * priors_base_sizes + decoded_bbox_pred = torch.cat((pred_xy, pred_wh), dim=-1) + return decoded_bbox_pred diff --git a/third_party/mmyolo/mmyolo/models/dense_heads/yolov8_head.py b/third_party/mmyolo/mmyolo/models/dense_heads/yolov8_head.py new file mode 100644 index 0000000000000000000000000000000000000000..292024178ce2c249f63c9ce1168da767d9718fcf --- /dev/null +++ b/third_party/mmyolo/mmyolo/models/dense_heads/yolov8_head.py @@ -0,0 +1,396 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
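+# This file defines the YOLOv8 detection head in two parts:
+# - `YOLOv8HeadModule`: decoupled per-level conv branches that output class
+#   logits and a (4 * reg_max)-channel box distribution; the distribution is
+#   turned into l/t/r/b distances by a softmax-weighted integral over the
+#   `proj` buffer.
+# - `YOLOv8Head`: decodes those distances with `DistancePointBBoxCoder` and is
+#   trained with BCE classification, CIoU box and Distribution Focal losses;
+#   there is no separate objectness branch (`loss_obj` is set to None).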
+import math +from typing import List, Sequence, Tuple, Union + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmdet.models.utils import multi_apply +from mmdet.utils import (ConfigType, OptConfigType, OptInstanceList, + OptMultiConfig) +from mmengine.dist import get_dist_info +from mmengine.model import BaseModule +from mmengine.structures import InstanceData +from torch import Tensor + +from mmyolo.registry import MODELS, TASK_UTILS +from ..utils import gt_instances_preprocess, make_divisible +from .yolov5_head import YOLOv5Head + + +@MODELS.register_module() +class YOLOv8HeadModule(BaseModule): + """YOLOv8HeadModule head module used in `YOLOv8`. + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (Union[int, Sequence]): Number of channels in the input + feature map. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + num_base_priors (int): The number of priors (points) at a point + on the feature grid. + featmap_strides (Sequence[int]): Downsample factor of each feature map. + Defaults to [8, 16, 32]. + reg_max (int): Max value of integral set :math: ``{0, ..., reg_max-1}`` + in QFL setting. Defaults to 16. + norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization + layer. Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer. + Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + num_classes: int, + in_channels: Union[int, Sequence], + widen_factor: float = 1.0, + num_base_priors: int = 1, + featmap_strides: Sequence[int] = (8, 16, 32), + reg_max: int = 16, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None): + super().__init__(init_cfg=init_cfg) + self.num_classes = num_classes + self.featmap_strides = featmap_strides + self.num_levels = len(self.featmap_strides) + self.num_base_priors = num_base_priors + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.in_channels = in_channels + self.reg_max = reg_max + + in_channels = [] + for channel in self.in_channels: + channel = make_divisible(channel, widen_factor) + in_channels.append(channel) + self.in_channels = in_channels + + self._init_layers() + + def init_weights(self, prior_prob=0.01): + """Initialize the weight and bias of PPYOLOE head.""" + super().init_weights() + for reg_pred, cls_pred, stride in zip(self.reg_preds, self.cls_preds, + self.featmap_strides): + reg_pred[-1].bias.data[:] = 1.0 # box + # cls (.01 objects, 80 classes, 640 img) + cls_pred[-1].bias.data[:self.num_classes] = math.log( + 5 / self.num_classes / (640 / stride)**2) + + def _init_layers(self): + """initialize conv layers in YOLOv8 head.""" + # Init decouple head + self.cls_preds = nn.ModuleList() + self.reg_preds = nn.ModuleList() + + reg_out_channels = max( + (16, self.in_channels[0] // 4, self.reg_max * 4)) + cls_out_channels = max(self.in_channels[0], self.num_classes) + + for i in range(self.num_levels): + self.reg_preds.append( + nn.Sequential( + ConvModule( + in_channels=self.in_channels[i], + out_channels=reg_out_channels, + kernel_size=3, + stride=1, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + ConvModule( + in_channels=reg_out_channels, + 
out_channels=reg_out_channels, + kernel_size=3, + stride=1, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + nn.Conv2d( + in_channels=reg_out_channels, + out_channels=4 * self.reg_max, + kernel_size=1))) + self.cls_preds.append( + nn.Sequential( + ConvModule( + in_channels=self.in_channels[i], + out_channels=cls_out_channels, + kernel_size=3, + stride=1, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + ConvModule( + in_channels=cls_out_channels, + out_channels=cls_out_channels, + kernel_size=3, + stride=1, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + nn.Conv2d( + in_channels=cls_out_channels, + out_channels=self.num_classes, + kernel_size=1))) + + proj = torch.arange(self.reg_max, dtype=torch.float) + self.register_buffer('proj', proj, persistent=False) + + def forward(self, x: Tuple[Tensor]) -> Tuple[List]: + """Forward features from the upstream network. + + Args: + x (Tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + Returns: + Tuple[List]: A tuple of multi-level classification scores, bbox + predictions + """ + assert len(x) == self.num_levels + return multi_apply(self.forward_single, x, self.cls_preds, + self.reg_preds) + + def forward_single(self, x: torch.Tensor, cls_pred: nn.ModuleList, + reg_pred: nn.ModuleList) -> Tuple: + """Forward feature of a single scale level.""" + b, _, h, w = x.shape + cls_logit = cls_pred(x) + bbox_dist_preds = reg_pred(x) + if self.reg_max > 1: + bbox_dist_preds = bbox_dist_preds.reshape( + [-1, 4, self.reg_max, h * w]).permute(0, 3, 1, 2) + + # TODO: The get_flops script cannot handle the situation of + # matmul, and needs to be fixed later + # bbox_preds = bbox_dist_preds.softmax(3).matmul(self.proj) + bbox_preds = bbox_dist_preds.softmax(3).matmul( + self.proj.view([-1, 1])).squeeze(-1) + bbox_preds = bbox_preds.transpose(1, 2).reshape(b, -1, h, w) + else: + bbox_preds = bbox_dist_preds + if self.training: + return cls_logit, bbox_preds, bbox_dist_preds + else: + return cls_logit, bbox_preds + + +@MODELS.register_module() +class YOLOv8Head(YOLOv5Head): + """YOLOv8Head head used in `YOLOv8`. + + Args: + head_module(:obj:`ConfigDict` or dict): Base module used for YOLOv8Head + prior_generator(dict): Points generator feature maps + in 2D points-based detectors. + bbox_coder (:obj:`ConfigDict` or dict): Config of bbox coder. + loss_cls (:obj:`ConfigDict` or dict): Config of classification loss. + loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss. + loss_dfl (:obj:`ConfigDict` or dict): Config of Distribution Focal + Loss. + train_cfg (:obj:`ConfigDict` or dict, optional): Training config of + anchor head. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of + anchor head. Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. 
+ """ + + def __init__(self, + head_module: ConfigType, + prior_generator: ConfigType = dict( + type='mmdet.MlvlPointGenerator', + offset=0.5, + strides=[8, 16, 32]), + bbox_coder: ConfigType = dict(type='DistancePointBBoxCoder'), + loss_cls: ConfigType = dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='none', + loss_weight=0.5), + loss_bbox: ConfigType = dict( + type='IoULoss', + iou_mode='ciou', + bbox_format='xyxy', + reduction='sum', + loss_weight=7.5, + return_iou=False), + loss_dfl=dict( + type='mmdet.DistributionFocalLoss', + reduction='mean', + loss_weight=1.5 / 4), + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + init_cfg: OptMultiConfig = None): + super().__init__( + head_module=head_module, + prior_generator=prior_generator, + bbox_coder=bbox_coder, + loss_cls=loss_cls, + loss_bbox=loss_bbox, + train_cfg=train_cfg, + test_cfg=test_cfg, + init_cfg=init_cfg) + self.loss_dfl = MODELS.build(loss_dfl) + # YOLOv8 doesn't need loss_obj + self.loss_obj = None + + def special_init(self): + """Since YOLO series algorithms will inherit from YOLOv5Head, but + different algorithms have special initialization process. + + The special_init function is designed to deal with this situation. + """ + if self.train_cfg: + self.assigner = TASK_UTILS.build(self.train_cfg.assigner) + + # Add common attributes to reduce calculation + self.featmap_sizes_train = None + self.num_level_priors = None + self.flatten_priors_train = None + self.stride_tensor = None + + def loss_by_feat( + self, + cls_scores: Sequence[Tensor], + bbox_preds: Sequence[Tensor], + bbox_dist_preds: Sequence[Tensor], + batch_gt_instances: Sequence[InstanceData], + batch_img_metas: Sequence[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (Sequence[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_priors * num_classes. + bbox_preds (Sequence[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_priors * 4. + bbox_dist_preds (Sequence[Tensor]): Box distribution logits for + each scale level with shape (bs, reg_max + 1, H*W, 4). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + Returns: + dict[str, Tensor]: A dictionary of losses. 
+ """ + num_imgs = len(batch_img_metas) + + current_featmap_sizes = [ + cls_score.shape[2:] for cls_score in cls_scores + ] + # If the shape does not equal, generate new one + if current_featmap_sizes != self.featmap_sizes_train: + self.featmap_sizes_train = current_featmap_sizes + + mlvl_priors_with_stride = self.prior_generator.grid_priors( + self.featmap_sizes_train, + dtype=cls_scores[0].dtype, + device=cls_scores[0].device, + with_stride=True) + + self.num_level_priors = [len(n) for n in mlvl_priors_with_stride] + self.flatten_priors_train = torch.cat( + mlvl_priors_with_stride, dim=0) + self.stride_tensor = self.flatten_priors_train[..., [2]] + + # gt info + gt_info = gt_instances_preprocess(batch_gt_instances, num_imgs) + gt_labels = gt_info[:, :, :1] + gt_bboxes = gt_info[:, :, 1:] # xyxy + pad_bbox_flag = (gt_bboxes.sum(-1, keepdim=True) > 0).float() + + # pred info + flatten_cls_preds = [ + cls_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.num_classes) + for cls_pred in cls_scores + ] + flatten_pred_bboxes = [ + bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) + for bbox_pred in bbox_preds + ] + # (bs, n, 4 * reg_max) + flatten_pred_dists = [ + bbox_pred_org.reshape(num_imgs, -1, self.head_module.reg_max * 4) + for bbox_pred_org in bbox_dist_preds + ] + + flatten_dist_preds = torch.cat(flatten_pred_dists, dim=1) + flatten_cls_preds = torch.cat(flatten_cls_preds, dim=1) + flatten_pred_bboxes = torch.cat(flatten_pred_bboxes, dim=1) + flatten_pred_bboxes = self.bbox_coder.decode( + self.flatten_priors_train[..., :2], flatten_pred_bboxes, + self.stride_tensor[..., 0]) + + assigned_result = self.assigner( + (flatten_pred_bboxes.detach()).type(gt_bboxes.dtype), + flatten_cls_preds.detach().sigmoid(), self.flatten_priors_train, + gt_labels, gt_bboxes, pad_bbox_flag) + + assigned_bboxes = assigned_result['assigned_bboxes'] + assigned_scores = assigned_result['assigned_scores'] + fg_mask_pre_prior = assigned_result['fg_mask_pre_prior'] + + assigned_scores_sum = assigned_scores.sum().clamp(min=1) + + loss_cls = self.loss_cls(flatten_cls_preds, assigned_scores).sum() + loss_cls /= assigned_scores_sum + + # rescale bbox + assigned_bboxes /= self.stride_tensor + flatten_pred_bboxes /= self.stride_tensor + + # select positive samples mask + num_pos = fg_mask_pre_prior.sum() + if num_pos > 0: + # when num_pos > 0, assigned_scores_sum will >0, so the loss_bbox + # will not report an error + # iou loss + prior_bbox_mask = fg_mask_pre_prior.unsqueeze(-1).repeat([1, 1, 4]) + pred_bboxes_pos = torch.masked_select( + flatten_pred_bboxes, prior_bbox_mask).reshape([-1, 4]) + assigned_bboxes_pos = torch.masked_select( + assigned_bboxes, prior_bbox_mask).reshape([-1, 4]) + bbox_weight = torch.masked_select( + assigned_scores.sum(-1), fg_mask_pre_prior).unsqueeze(-1) + loss_bbox = self.loss_bbox( + pred_bboxes_pos, assigned_bboxes_pos, + weight=bbox_weight) / assigned_scores_sum + + # dfl loss + pred_dist_pos = flatten_dist_preds[fg_mask_pre_prior] + assigned_ltrb = self.bbox_coder.encode( + self.flatten_priors_train[..., :2] / self.stride_tensor, + assigned_bboxes, + max_dis=self.head_module.reg_max - 1, + eps=0.01) + assigned_ltrb_pos = torch.masked_select( + assigned_ltrb, prior_bbox_mask).reshape([-1, 4]) + loss_dfl = self.loss_dfl( + pred_dist_pos.reshape(-1, self.head_module.reg_max), + assigned_ltrb_pos.reshape(-1), + weight=bbox_weight.expand(-1, 4).reshape(-1), + avg_factor=assigned_scores_sum) + else: + loss_bbox = flatten_pred_bboxes.sum() * 0 + loss_dfl = 
flatten_pred_bboxes.sum() * 0 + _, world_size = get_dist_info() + return dict( + loss_cls=loss_cls * num_imgs * world_size, + loss_bbox=loss_bbox * num_imgs * world_size, + loss_dfl=loss_dfl * num_imgs * world_size) diff --git a/third_party/mmyolo/mmyolo/models/dense_heads/yolox_head.py b/third_party/mmyolo/mmyolo/models/dense_heads/yolox_head.py new file mode 100644 index 0000000000000000000000000000000000000000..a203298d8536148a7022711eabeee7f04fea8ab4 --- /dev/null +++ b/third_party/mmyolo/mmyolo/models/dense_heads/yolox_head.py @@ -0,0 +1,514 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Sequence, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule +from mmdet.models.task_modules.samplers import PseudoSampler +from mmdet.models.utils import multi_apply +from mmdet.structures.bbox import bbox_xyxy_to_cxcywh +from mmdet.utils import (ConfigType, OptConfigType, OptInstanceList, + OptMultiConfig, reduce_mean) +from mmengine.model import BaseModule, bias_init_with_prob +from mmengine.structures import InstanceData +from torch import Tensor + +from mmyolo.registry import MODELS, TASK_UTILS +from .yolov5_head import YOLOv5Head + + +@MODELS.register_module() +class YOLOXHeadModule(BaseModule): + """YOLOXHead head module used in `YOLOX. + + ``_ + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (Union[int, Sequence]): Number of channels in the input + feature map. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + num_base_priors (int): The number of priors (points) at a point + on the feature grid + stacked_convs (int): Number of stacking convs of the head. + Defaults to 2. + featmap_strides (Sequence[int]): Downsample factor of each feature map. + Defaults to [8, 16, 32]. + use_depthwise (bool): Whether to depthwise separable convolution in + blocks. Defaults to False. + dcn_on_last_conv (bool): If true, use dcn in the last layer of + towers. Defaults to False. + conv_bias (bool or str): If specified as `auto`, it will be decided by + the norm_cfg. Bias of conv will be set as True if `norm_cfg` is + None, otherwise False. Defaults to "auto". + conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + convolution layer. Defaults to None. + norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization + layer. Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer. + Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. 
+ """ + + def __init__( + self, + num_classes: int, + in_channels: Union[int, Sequence], + widen_factor: float = 1.0, + num_base_priors: int = 1, + feat_channels: int = 256, + stacked_convs: int = 2, + featmap_strides: Sequence[int] = [8, 16, 32], + use_depthwise: bool = False, + dcn_on_last_conv: bool = False, + conv_bias: Union[bool, str] = 'auto', + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict(type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None, + ): + super().__init__(init_cfg=init_cfg) + self.num_classes = num_classes + self.feat_channels = int(feat_channels * widen_factor) + self.stacked_convs = stacked_convs + self.use_depthwise = use_depthwise + self.dcn_on_last_conv = dcn_on_last_conv + assert conv_bias == 'auto' or isinstance(conv_bias, bool) + self.conv_bias = conv_bias + self.num_base_priors = num_base_priors + + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.featmap_strides = featmap_strides + + if isinstance(in_channels, int): + in_channels = int(in_channels * widen_factor) + self.in_channels = in_channels + + self._init_layers() + + def _init_layers(self): + """Initialize heads for all level feature maps.""" + self.multi_level_cls_convs = nn.ModuleList() + self.multi_level_reg_convs = nn.ModuleList() + self.multi_level_conv_cls = nn.ModuleList() + self.multi_level_conv_reg = nn.ModuleList() + self.multi_level_conv_obj = nn.ModuleList() + for _ in self.featmap_strides: + self.multi_level_cls_convs.append(self._build_stacked_convs()) + self.multi_level_reg_convs.append(self._build_stacked_convs()) + conv_cls, conv_reg, conv_obj = self._build_predictor() + self.multi_level_conv_cls.append(conv_cls) + self.multi_level_conv_reg.append(conv_reg) + self.multi_level_conv_obj.append(conv_obj) + + def _build_stacked_convs(self) -> nn.Sequential: + """Initialize conv layers of a single level head.""" + conv = DepthwiseSeparableConvModule \ + if self.use_depthwise else ConvModule + stacked_convs = [] + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + if self.dcn_on_last_conv and i == self.stacked_convs - 1: + conv_cfg = dict(type='DCNv2') + else: + conv_cfg = self.conv_cfg + stacked_convs.append( + conv( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + bias=self.conv_bias)) + return nn.Sequential(*stacked_convs) + + def _build_predictor(self) -> Tuple[nn.Module, nn.Module, nn.Module]: + """Initialize predictor layers of a single level head.""" + conv_cls = nn.Conv2d(self.feat_channels, self.num_classes, 1) + conv_reg = nn.Conv2d(self.feat_channels, 4, 1) + conv_obj = nn.Conv2d(self.feat_channels, 1, 1) + return conv_cls, conv_reg, conv_obj + + def init_weights(self): + """Initialize weights of the head.""" + # Use prior in model initialization to improve stability + super().init_weights() + bias_init = bias_init_with_prob(0.01) + for conv_cls, conv_obj in zip(self.multi_level_conv_cls, + self.multi_level_conv_obj): + conv_cls.bias.data.fill_(bias_init) + conv_obj.bias.data.fill_(bias_init) + + def forward(self, x: Tuple[Tensor]) -> Tuple[List]: + """Forward features from the upstream network. + + Args: + x (Tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + Returns: + Tuple[List]: A tuple of multi-level classification scores, bbox + predictions, and objectnesses. 
+ """ + + return multi_apply(self.forward_single, x, self.multi_level_cls_convs, + self.multi_level_reg_convs, + self.multi_level_conv_cls, + self.multi_level_conv_reg, + self.multi_level_conv_obj) + + def forward_single(self, x: Tensor, cls_convs: nn.Module, + reg_convs: nn.Module, conv_cls: nn.Module, + conv_reg: nn.Module, + conv_obj: nn.Module) -> Tuple[Tensor, Tensor, Tensor]: + """Forward feature of a single scale level.""" + + cls_feat = cls_convs(x) + reg_feat = reg_convs(x) + + cls_score = conv_cls(cls_feat) + bbox_pred = conv_reg(reg_feat) + objectness = conv_obj(reg_feat) + + return cls_score, bbox_pred, objectness + + +@MODELS.register_module() +class YOLOXHead(YOLOv5Head): + """YOLOXHead head used in `YOLOX `_. + + Args: + head_module(ConfigType): Base module used for YOLOXHead + prior_generator: Points generator feature maps in + 2D points-based detectors. + loss_cls (:obj:`ConfigDict` or dict): Config of classification loss. + loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss. + loss_obj (:obj:`ConfigDict` or dict): Config of objectness loss. + loss_bbox_aux (:obj:`ConfigDict` or dict): Config of bbox aux loss. + train_cfg (:obj:`ConfigDict` or dict, optional): Training config of + anchor head. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of + anchor head. Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + head_module: ConfigType, + prior_generator: ConfigType = dict( + type='mmdet.MlvlPointGenerator', + offset=0, + strides=[8, 16, 32]), + bbox_coder: ConfigType = dict(type='YOLOXBBoxCoder'), + loss_cls: ConfigType = dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='sum', + loss_weight=1.0), + loss_bbox: ConfigType = dict( + type='mmdet.IoULoss', + mode='square', + eps=1e-16, + reduction='sum', + loss_weight=5.0), + loss_obj: ConfigType = dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='sum', + loss_weight=1.0), + loss_bbox_aux: ConfigType = dict( + type='mmdet.L1Loss', reduction='sum', loss_weight=1.0), + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + init_cfg: OptMultiConfig = None): + self.use_bbox_aux = False + self.loss_bbox_aux = loss_bbox_aux + + super().__init__( + head_module=head_module, + prior_generator=prior_generator, + bbox_coder=bbox_coder, + loss_cls=loss_cls, + loss_bbox=loss_bbox, + loss_obj=loss_obj, + train_cfg=train_cfg, + test_cfg=test_cfg, + init_cfg=init_cfg) + + def special_init(self): + """Since YOLO series algorithms will inherit from YOLOv5Head, but + different algorithms have special initialization process. + + The special_init function is designed to deal with this situation. + """ + self.loss_bbox_aux: nn.Module = MODELS.build(self.loss_bbox_aux) + if self.train_cfg: + self.assigner = TASK_UTILS.build(self.train_cfg.assigner) + # YOLOX does not support sampling + self.sampler = PseudoSampler() + + def forward(self, x: Tuple[Tensor]) -> Tuple[List]: + return self.head_module(x) + + def loss_by_feat( + self, + cls_scores: Sequence[Tensor], + bbox_preds: Sequence[Tensor], + objectnesses: Sequence[Tensor], + batch_gt_instances: Tensor, + batch_img_metas: Sequence[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. 
+ + Args: + cls_scores (Sequence[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_priors * num_classes. + bbox_preds (Sequence[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_priors * 4. + objectnesses (Sequence[Tensor]): Score factor for + all scale level, each is a 4D-tensor, has shape + (batch_size, 1, H, W). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + Returns: + dict[str, Tensor]: A dictionary of losses. + """ + num_imgs = len(batch_img_metas) + if batch_gt_instances_ignore is None: + batch_gt_instances_ignore = [None] * num_imgs + + batch_gt_instances = self.gt_instances_preprocess( + batch_gt_instances, len(batch_img_metas)) + + featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores] + mlvl_priors = self.prior_generator.grid_priors( + featmap_sizes, + dtype=cls_scores[0].dtype, + device=cls_scores[0].device, + with_stride=True) + + flatten_cls_preds = [ + cls_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.num_classes) + for cls_pred in cls_scores + ] + flatten_bbox_preds = [ + bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) + for bbox_pred in bbox_preds + ] + flatten_objectness = [ + objectness.permute(0, 2, 3, 1).reshape(num_imgs, -1) + for objectness in objectnesses + ] + + flatten_cls_preds = torch.cat(flatten_cls_preds, dim=1) + flatten_bbox_preds = torch.cat(flatten_bbox_preds, dim=1) + flatten_objectness = torch.cat(flatten_objectness, dim=1) + flatten_priors = torch.cat(mlvl_priors) + flatten_bboxes = self.bbox_coder.decode(flatten_priors[..., :2], + flatten_bbox_preds, + flatten_priors[..., 2]) + + (pos_masks, cls_targets, obj_targets, bbox_targets, bbox_aux_target, + num_fg_imgs) = multi_apply( + self._get_targets_single, + flatten_priors.unsqueeze(0).repeat(num_imgs, 1, 1), + flatten_cls_preds.detach(), flatten_bboxes.detach(), + flatten_objectness.detach(), batch_gt_instances, batch_img_metas, + batch_gt_instances_ignore) + + # The experimental results show that 'reduce_mean' can improve + # performance on the COCO dataset. + num_pos = torch.tensor( + sum(num_fg_imgs), + dtype=torch.float, + device=flatten_cls_preds.device) + num_total_samples = max(reduce_mean(num_pos), 1.0) + + pos_masks = torch.cat(pos_masks, 0) + cls_targets = torch.cat(cls_targets, 0) + obj_targets = torch.cat(obj_targets, 0) + bbox_targets = torch.cat(bbox_targets, 0) + if self.use_bbox_aux: + bbox_aux_target = torch.cat(bbox_aux_target, 0) + + loss_obj = self.loss_obj(flatten_objectness.view(-1, 1), + obj_targets) / num_total_samples + if num_pos > 0: + loss_cls = self.loss_cls( + flatten_cls_preds.view(-1, self.num_classes)[pos_masks], + cls_targets) / num_total_samples + loss_bbox = self.loss_bbox( + flatten_bboxes.view(-1, 4)[pos_masks], + bbox_targets) / num_total_samples + else: + # Avoid cls and reg branch not participating in the gradient + # propagation when there is no ground-truth in the images. 
+ # For more details, please refer to + # https://github.com/open-mmlab/mmdetection/issues/7298 + loss_cls = flatten_cls_preds.sum() * 0 + loss_bbox = flatten_bboxes.sum() * 0 + + loss_dict = dict( + loss_cls=loss_cls, loss_bbox=loss_bbox, loss_obj=loss_obj) + + if self.use_bbox_aux: + if num_pos > 0: + loss_bbox_aux = self.loss_bbox_aux( + flatten_bbox_preds.view(-1, 4)[pos_masks], + bbox_aux_target) / num_total_samples + else: + # Avoid cls and reg branch not participating in the gradient + # propagation when there is no ground-truth in the images. + # For more details, please refer to + # https://github.com/open-mmlab/mmdetection/issues/7298 + loss_bbox_aux = flatten_bbox_preds.sum() * 0 + loss_dict.update(loss_bbox_aux=loss_bbox_aux) + + return loss_dict + + @torch.no_grad() + def _get_targets_single( + self, + priors: Tensor, + cls_preds: Tensor, + decoded_bboxes: Tensor, + objectness: Tensor, + gt_instances: InstanceData, + img_meta: dict, + gt_instances_ignore: Optional[InstanceData] = None) -> tuple: + """Compute classification, regression, and objectness targets for + priors in a single image. + + Args: + priors (Tensor): All priors of one image, a 2D-Tensor with shape + [num_priors, 4] in [cx, xy, stride_w, stride_y] format. + cls_preds (Tensor): Classification predictions of one image, + a 2D-Tensor with shape [num_priors, num_classes] + decoded_bboxes (Tensor): Decoded bboxes predictions of one image, + a 2D-Tensor with shape [num_priors, 4] in [tl_x, tl_y, + br_x, br_y] format. + objectness (Tensor): Objectness predictions of one image, + a 1D-Tensor with shape [num_priors] + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It should includes ``bboxes`` and ``labels`` + attributes. + img_meta (dict): Meta information for current image. + gt_instances_ignore (:obj:`InstanceData`, optional): Instances + to be ignored during training. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + Returns: + tuple: + foreground_mask (list[Tensor]): Binary mask of foreground + targets. + cls_target (list[Tensor]): Classification targets of an image. + obj_target (list[Tensor]): Objectness targets of an image. + bbox_target (list[Tensor]): BBox targets of an image. + bbox_aux_target (int): BBox aux targets of an image. + num_pos_per_img (int): Number of positive samples in an image. + """ + + num_priors = priors.size(0) + num_gts = len(gt_instances) + # No target + if num_gts == 0: + cls_target = cls_preds.new_zeros((0, self.num_classes)) + bbox_target = cls_preds.new_zeros((0, 4)) + bbox_aux_target = cls_preds.new_zeros((0, 4)) + obj_target = cls_preds.new_zeros((num_priors, 1)) + foreground_mask = cls_preds.new_zeros(num_priors).bool() + return (foreground_mask, cls_target, obj_target, bbox_target, + bbox_aux_target, 0) + + # YOLOX uses center priors with 0.5 offset to assign targets, + # but use center priors without offset to regress bboxes. 
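+        # Concretely: the prior generator uses offset=0, so a stride-8 grid
+        # cell at index (0, 0) yields the prior (0, 0, 8, 8). Adding half the
+        # stride gives (4, 4, 8, 8), the cell centre handed to the assigner
+        # below, while the decoded bboxes passed into this method were still
+        # regressed from the un-shifted (0, 0) prior.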
+ offset_priors = torch.cat( + [priors[:, :2] + priors[:, 2:] * 0.5, priors[:, 2:]], dim=-1) + + scores = cls_preds.sigmoid() * objectness.unsqueeze(1).sigmoid() + pred_instances = InstanceData( + bboxes=decoded_bboxes, scores=scores.sqrt_(), priors=offset_priors) + assign_result = self.assigner.assign( + pred_instances=pred_instances, + gt_instances=gt_instances, + gt_instances_ignore=gt_instances_ignore) + + sampling_result = self.sampler.sample(assign_result, pred_instances, + gt_instances) + pos_inds = sampling_result.pos_inds + num_pos_per_img = pos_inds.size(0) + + pos_ious = assign_result.max_overlaps[pos_inds] + # IOU aware classification score + cls_target = F.one_hot(sampling_result.pos_gt_labels, + self.num_classes) * pos_ious.unsqueeze(-1) + obj_target = torch.zeros_like(objectness).unsqueeze(-1) + obj_target[pos_inds] = 1 + bbox_target = sampling_result.pos_gt_bboxes + bbox_aux_target = cls_preds.new_zeros((num_pos_per_img, 4)) + if self.use_bbox_aux: + bbox_aux_target = self._get_bbox_aux_target( + bbox_aux_target, bbox_target, priors[pos_inds]) + foreground_mask = torch.zeros_like(objectness).to(torch.bool) + foreground_mask[pos_inds] = 1 + return (foreground_mask, cls_target, obj_target, bbox_target, + bbox_aux_target, num_pos_per_img) + + def _get_bbox_aux_target(self, + bbox_aux_target: Tensor, + gt_bboxes: Tensor, + priors: Tensor, + eps: float = 1e-8) -> Tensor: + """Convert gt bboxes to center offset and log width height.""" + gt_cxcywh = bbox_xyxy_to_cxcywh(gt_bboxes) + bbox_aux_target[:, :2] = (gt_cxcywh[:, :2] - + priors[:, :2]) / priors[:, 2:] + bbox_aux_target[:, + 2:] = torch.log(gt_cxcywh[:, 2:] / priors[:, 2:] + eps) + return bbox_aux_target + + @staticmethod + def gt_instances_preprocess(batch_gt_instances: Tensor, + batch_size: int) -> List[InstanceData]: + """Split batch_gt_instances with batch size. + + Args: + batch_gt_instances (Tensor): Ground truth + a 2D-Tensor for whole batch, shape [all_gt_bboxes, 6] + batch_size (int): Batch size. + + Returns: + List: batch gt instances data, shape [batch_size, InstanceData] + """ + # faster version + batch_instance_list = [] + for i in range(batch_size): + batch_gt_instance_ = InstanceData() + single_batch_instance = \ + batch_gt_instances[batch_gt_instances[:, 0] == i, :] + batch_gt_instance_.bboxes = single_batch_instance[:, 2:] + batch_gt_instance_.labels = single_batch_instance[:, 1] + batch_instance_list.append(batch_gt_instance_) + + return batch_instance_list diff --git a/third_party/mmyolo/mmyolo/models/dense_heads/yolox_pose_head.py b/third_party/mmyolo/mmyolo/models/dense_heads/yolox_pose_head.py new file mode 100644 index 0000000000000000000000000000000000000000..96264e55299676239ce5a4c9b698941d0356bcea --- /dev/null +++ b/third_party/mmyolo/mmyolo/models/dense_heads/yolox_pose_head.py @@ -0,0 +1,409 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
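+# This file adds pose estimation on top of the YOLOX head:
+# - `YOLOXPoseHeadModule`: extra per-level branches predicting
+#   2 * num_keypoints offset channels and num_keypoints visibility channels.
+# - `YOLOXPoseHead`: keeps the YOLOX detection losses, decodes keypoints from
+#   the same grid priors and strides, and adds a keypoint loss built from the
+#   `loss_pose` config (an OKS-style loss) plus a BCE visibility loss that is
+#   computed only on positive samples.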
+from collections import defaultdict +from typing import List, Optional, Sequence, Tuple, Union + +import torch +import torch.nn as nn +from mmcv.ops import batched_nms +from mmdet.models.utils import filter_scores_and_topk +from mmdet.utils import ConfigType, OptInstanceList +from mmengine.config import ConfigDict +from mmengine.model import ModuleList, bias_init_with_prob +from mmengine.structures import InstanceData +from torch import Tensor + +from mmyolo.registry import MODELS +from ..utils import OutputSaveFunctionWrapper, OutputSaveObjectWrapper +from .yolox_head import YOLOXHead, YOLOXHeadModule + + +@MODELS.register_module() +class YOLOXPoseHeadModule(YOLOXHeadModule): + """YOLOXPoseHeadModule serves as a head module for `YOLOX-Pose`. + + In comparison to `YOLOXHeadModule`, this module introduces branches for + keypoint prediction. + """ + + def __init__(self, num_keypoints: int, *args, **kwargs): + self.num_keypoints = num_keypoints + super().__init__(*args, **kwargs) + + def _init_layers(self): + """Initializes the layers in the head module.""" + super()._init_layers() + + # The pose branch requires additional layers for precise regression + self.stacked_convs *= 2 + + # Create separate layers for each level of feature maps + pose_convs, offsets_preds, vis_preds = [], [], [] + for _ in self.featmap_strides: + pose_convs.append(self._build_stacked_convs()) + offsets_preds.append( + nn.Conv2d(self.feat_channels, self.num_keypoints * 2, 1)) + vis_preds.append( + nn.Conv2d(self.feat_channels, self.num_keypoints, 1)) + + self.multi_level_pose_convs = ModuleList(pose_convs) + self.multi_level_conv_offsets = ModuleList(offsets_preds) + self.multi_level_conv_vis = ModuleList(vis_preds) + + def init_weights(self): + """Initialize weights of the head.""" + super().init_weights() + + # Use prior in model initialization to improve stability + bias_init = bias_init_with_prob(0.01) + for conv_vis in self.multi_level_conv_vis: + conv_vis.bias.data.fill_(bias_init) + + def forward(self, x: Tuple[Tensor]) -> Tuple[List]: + """Forward features from the upstream network.""" + offsets_pred, vis_pred = [], [] + for i in range(len(x)): + pose_feat = self.multi_level_pose_convs[i](x[i]) + offsets_pred.append(self.multi_level_conv_offsets[i](pose_feat)) + vis_pred.append(self.multi_level_conv_vis[i](pose_feat)) + return (*super().forward(x), offsets_pred, vis_pred) + + +@MODELS.register_module() +class YOLOXPoseHead(YOLOXHead): + """YOLOXPoseHead head used in `YOLO-Pose. + + `_. + Args: + loss_pose (ConfigDict, optional): Config of keypoint OKS loss. + """ + + def __init__( + self, + loss_pose: Optional[ConfigType] = None, + *args, + **kwargs, + ): + super().__init__(*args, **kwargs) + self.loss_pose = MODELS.build(loss_pose) + self.num_keypoints = self.head_module.num_keypoints + + # set up buffers to save variables generated in methods of + # the class's base class. 
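+        # `_log` holds per-image intermediate results (predicted keypoints and
+        # visibilities, foreground masks, bbox targets) written inside the
+        # overridden `_get_targets_single`, and the wrapped sampler records its
+        # sampling results, so that `loss_by_feat` can later gather keypoint
+        # targets for exactly the same positive samples without redoing the
+        # assignment.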
+ self._log = defaultdict(list) + self.sampler = OutputSaveObjectWrapper(self.sampler) + + # ensure that the `sigmas` in self.assigner.oks_calculator + # is on the same device as the model + if hasattr(self.assigner, 'oks_calculator'): + self.add_module('assigner_oks_calculator', + self.assigner.oks_calculator) + + def _clear(self): + """Clear variable buffers.""" + self.sampler.clear() + self._log.clear() + + def loss(self, x: Tuple[Tensor], batch_data_samples: Union[list, + dict]) -> dict: + + if isinstance(batch_data_samples, list): + losses = super().loss(x, batch_data_samples) + else: + outs = self(x) + # Fast version + loss_inputs = outs + (batch_data_samples['bboxes_labels'], + batch_data_samples['keypoints'], + batch_data_samples['keypoints_visible'], + batch_data_samples['img_metas']) + losses = self.loss_by_feat(*loss_inputs) + + return losses + + def loss_by_feat( + self, + cls_scores: Sequence[Tensor], + bbox_preds: Sequence[Tensor], + objectnesses: Sequence[Tensor], + kpt_preds: Sequence[Tensor], + vis_preds: Sequence[Tensor], + batch_gt_instances: Tensor, + batch_gt_keypoints: Tensor, + batch_gt_keypoints_visible: Tensor, + batch_img_metas: Sequence[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + In addition to the base class method, keypoint losses are also + calculated in this method. + """ + + self._clear() + batch_gt_instances = self.gt_kps_instances_preprocess( + batch_gt_instances, batch_gt_keypoints, batch_gt_keypoints_visible, + len(batch_img_metas)) + + # collect keypoints coordinates and visibility from model predictions + kpt_preds = torch.cat([ + kpt_pred.flatten(2).permute(0, 2, 1).contiguous() + for kpt_pred in kpt_preds + ], + dim=1) + + featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores] + mlvl_priors = self.prior_generator.grid_priors( + featmap_sizes, + dtype=cls_scores[0].dtype, + device=cls_scores[0].device, + with_stride=True) + grid_priors = torch.cat(mlvl_priors) + + flatten_kpts = self.decode_pose(grid_priors[..., :2], kpt_preds, + grid_priors[..., 2]) + + vis_preds = torch.cat([ + vis_pred.flatten(2).permute(0, 2, 1).contiguous() + for vis_pred in vis_preds + ], + dim=1) + + # compute detection losses and collect targets for keypoints + # predictions simultaneously + self._log['pred_keypoints'] = list(flatten_kpts.detach().split( + 1, dim=0)) + self._log['pred_keypoints_vis'] = list(vis_preds.detach().split( + 1, dim=0)) + + losses = super().loss_by_feat(cls_scores, bbox_preds, objectnesses, + batch_gt_instances, batch_img_metas, + batch_gt_instances_ignore) + + kpt_targets, vis_targets = [], [] + sampling_results = self.sampler.log['sample'] + sampling_result_idx = 0 + for gt_instances in batch_gt_instances: + if len(gt_instances) > 0: + sampling_result = sampling_results[sampling_result_idx] + kpt_target = gt_instances['keypoints'][ + sampling_result.pos_assigned_gt_inds] + vis_target = gt_instances['keypoints_visible'][ + sampling_result.pos_assigned_gt_inds] + sampling_result_idx += 1 + kpt_targets.append(kpt_target) + vis_targets.append(vis_target) + + if len(kpt_targets) > 0: + kpt_targets = torch.cat(kpt_targets, 0) + vis_targets = torch.cat(vis_targets, 0) + + # compute keypoint losses + if len(kpt_targets) > 0: + vis_targets = (vis_targets > 0).float() + pos_masks = torch.cat(self._log['foreground_mask'], 0) + bbox_targets = torch.cat(self._log['bbox_target'], 0) + loss_kpt = self.loss_pose( + flatten_kpts.view(-1, 
self.num_keypoints, 2)[pos_masks], + kpt_targets, vis_targets, bbox_targets) + loss_vis = self.loss_cls( + vis_preds.view(-1, self.num_keypoints)[pos_masks], + vis_targets) / vis_targets.sum() + else: + loss_kpt = kpt_preds.sum() * 0 + loss_vis = vis_preds.sum() * 0 + + losses.update(dict(loss_kpt=loss_kpt, loss_vis=loss_vis)) + + self._clear() + return losses + + @torch.no_grad() + def _get_targets_single( + self, + priors: Tensor, + cls_preds: Tensor, + decoded_bboxes: Tensor, + objectness: Tensor, + gt_instances: InstanceData, + img_meta: dict, + gt_instances_ignore: Optional[InstanceData] = None) -> tuple: + """Calculates targets for a single image, and saves them to the log. + + This method is similar to the _get_targets_single method in the base + class, but additionally saves the foreground mask and bbox targets to + the log. + """ + + # Construct a combined representation of bboxes and keypoints to + # ensure keypoints are also involved in the positive sample + # assignment process + kpt = self._log['pred_keypoints'].pop(0).squeeze(0) + kpt_vis = self._log['pred_keypoints_vis'].pop(0).squeeze(0) + kpt = torch.cat((kpt, kpt_vis.unsqueeze(-1)), dim=-1) + decoded_bboxes = torch.cat((decoded_bboxes, kpt.flatten(1)), dim=1) + + targets = super()._get_targets_single(priors, cls_preds, + decoded_bboxes, objectness, + gt_instances, img_meta, + gt_instances_ignore) + self._log['foreground_mask'].append(targets[0]) + self._log['bbox_target'].append(targets[3]) + return targets + + def predict_by_feat(self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + objectnesses: Optional[List[Tensor]] = None, + kpt_preds: Optional[List[Tensor]] = None, + vis_preds: Optional[List[Tensor]] = None, + batch_img_metas: Optional[List[dict]] = None, + cfg: Optional[ConfigDict] = None, + rescale: bool = True, + with_nms: bool = True) -> List[InstanceData]: + """Transform a batch of output features extracted by the head into bbox + and keypoint results. + + In addition to the base class method, keypoint predictions are also + calculated in this method. + """ + """calculate predicted bboxes and get the kept instances indices. 
+ + use OutputSaveFunctionWrapper as context manager to obtain + intermediate output from a parent class without copying a + arge block of code + """ + with OutputSaveFunctionWrapper( + filter_scores_and_topk, + super().predict_by_feat.__globals__) as outputs_1: + with OutputSaveFunctionWrapper( + batched_nms, + super()._bbox_post_process.__globals__) as outputs_2: + results_list = super().predict_by_feat(cls_scores, bbox_preds, + objectnesses, + batch_img_metas, cfg, + rescale, with_nms) + keep_indices_topk = [ + out[2][:cfg.max_per_img] for out in outputs_1 + ] + keep_indices_nms = [ + out[1][:cfg.max_per_img] for out in outputs_2 + ] + + num_imgs = len(batch_img_metas) + + # recover keypoints coordinates from model predictions + featmap_sizes = [vis_pred.shape[2:] for vis_pred in vis_preds] + priors = torch.cat(self.mlvl_priors) + strides = [ + priors.new_full((featmap_size.numel() * self.num_base_priors, ), + stride) for featmap_size, stride in zip( + featmap_sizes, self.featmap_strides) + ] + strides = torch.cat(strides) + kpt_preds = torch.cat([ + kpt_pred.permute(0, 2, 3, 1).reshape( + num_imgs, -1, self.num_keypoints * 2) for kpt_pred in kpt_preds + ], + dim=1) + flatten_decoded_kpts = self.decode_pose(priors, kpt_preds, strides) + + vis_preds = torch.cat([ + vis_pred.permute(0, 2, 3, 1).reshape( + num_imgs, -1, self.num_keypoints) for vis_pred in vis_preds + ], + dim=1).sigmoid() + + # select keypoints predictions according to bbox scores and nms result + keep_indices_nms_idx = 0 + for pred_instances, kpts, kpts_vis, img_meta, keep_idxs \ + in zip( + results_list, flatten_decoded_kpts, vis_preds, + batch_img_metas, keep_indices_topk): + + pred_instances.bbox_scores = pred_instances.scores + + if len(pred_instances) == 0: + pred_instances.keypoints = kpts[:0] + pred_instances.keypoint_scores = kpts_vis[:0] + continue + + kpts = kpts[keep_idxs] + kpts_vis = kpts_vis[keep_idxs] + + if rescale: + pad_param = img_meta.get('img_meta', None) + scale_factor = img_meta['scale_factor'] + if pad_param is not None: + kpts -= kpts.new_tensor([pad_param[2], pad_param[0]]) + kpts /= kpts.new_tensor(scale_factor).repeat( + (1, self.num_keypoints, 1)) + + keep_idxs_nms = keep_indices_nms[keep_indices_nms_idx] + kpts = kpts[keep_idxs_nms] + kpts_vis = kpts_vis[keep_idxs_nms] + keep_indices_nms_idx += 1 + + pred_instances.keypoints = kpts + pred_instances.keypoint_scores = kpts_vis + + results_list = [r.numpy() for r in results_list] + return results_list + + def decode_pose(self, grids: torch.Tensor, offsets: torch.Tensor, + strides: Union[torch.Tensor, int]) -> torch.Tensor: + """Decode regression offsets to keypoints. + + Args: + grids (torch.Tensor): The coordinates of the feature map grids. + offsets (torch.Tensor): The predicted offset of each keypoint + relative to its corresponding grid. + strides (torch.Tensor | int): The stride of the feature map for + each instance. + Returns: + torch.Tensor: The decoded keypoints coordinates. + """ + + if isinstance(strides, int): + strides = torch.tensor([strides]).to(offsets) + + strides = strides.reshape(1, -1, 1, 1) + offsets = offsets.reshape(*offsets.shape[:2], -1, 2) + xy_coordinates = (offsets[..., :2] * strides) + grids.unsqueeze(1) + return xy_coordinates + + @staticmethod + def gt_kps_instances_preprocess(batch_gt_instances: Tensor, + batch_gt_keypoints, + batch_gt_keypoints_visible, + batch_size: int) -> List[InstanceData]: + """Split batch_gt_instances with batch size. 
+ + Args: + batch_gt_instances (Tensor): Ground truth + a 2D-Tensor for whole batch, shape [all_gt_bboxes, 6] + batch_size (int): Batch size. + + Returns: + List: batch gt instances data, shape [batch_size, InstanceData] + """ + # faster version + batch_instance_list = [] + for i in range(batch_size): + batch_gt_instance_ = InstanceData() + single_batch_instance = \ + batch_gt_instances[batch_gt_instances[:, 0] == i, :] + keypoints = \ + batch_gt_keypoints[batch_gt_instances[:, 0] == i, :] + keypoints_visible = \ + batch_gt_keypoints_visible[batch_gt_instances[:, 0] == i, :] + batch_gt_instance_.bboxes = single_batch_instance[:, 2:] + batch_gt_instance_.labels = single_batch_instance[:, 1] + batch_gt_instance_.keypoints = keypoints + batch_gt_instance_.keypoints_visible = keypoints_visible + batch_instance_list.append(batch_gt_instance_) + + return batch_instance_list + + @staticmethod + def gt_instances_preprocess(batch_gt_instances: List[InstanceData], *args, + **kwargs) -> List[InstanceData]: + return batch_gt_instances diff --git a/third_party/mmyolo/mmyolo/models/detectors/__init__.py b/third_party/mmyolo/mmyolo/models/detectors/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..74fb1c6c21c5840a5cd3f45a1a9f827c0e670604 --- /dev/null +++ b/third_party/mmyolo/mmyolo/models/detectors/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .yolo_detector import YOLODetector + +__all__ = ['YOLODetector'] diff --git a/third_party/mmyolo/mmyolo/models/detectors/yolo_detector.py b/third_party/mmyolo/mmyolo/models/detectors/yolo_detector.py new file mode 100644 index 0000000000000000000000000000000000000000..e6783fbab41287df54f136ea121e827d0603414f --- /dev/null +++ b/third_party/mmyolo/mmyolo/models/detectors/yolo_detector.py @@ -0,0 +1,53 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from mmdet.models.detectors.single_stage import SingleStageDetector +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from mmengine.dist import get_world_size +from mmengine.logging import print_log + +from mmyolo.registry import MODELS + + +@MODELS.register_module() +class YOLODetector(SingleStageDetector): + r"""Implementation of YOLO Series + + Args: + backbone (:obj:`ConfigDict` or dict): The backbone config. + neck (:obj:`ConfigDict` or dict): The neck config. + bbox_head (:obj:`ConfigDict` or dict): The bbox head config. + train_cfg (:obj:`ConfigDict` or dict, optional): The training config + of YOLO. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): The testing config + of YOLO. Defaults to None. + data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of + :class:`DetDataPreprocessor` to process the input data. + Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. + use_syncbn (bool): whether to use SyncBatchNorm. Defaults to True. 
+ """ + + def __init__(self, + backbone: ConfigType, + neck: ConfigType, + bbox_head: ConfigType, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None, + use_syncbn: bool = True): + super().__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) + + # TODO: Waiting for mmengine support + if use_syncbn and get_world_size() > 1: + torch.nn.SyncBatchNorm.convert_sync_batchnorm(self) + print_log('Using SyncBatchNorm()', 'current') diff --git a/third_party/mmyolo/mmyolo/models/layers/__init__.py b/third_party/mmyolo/mmyolo/models/layers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..02753057f2ddf51b0688f4f65ebc52e12be9fa7a --- /dev/null +++ b/third_party/mmyolo/mmyolo/models/layers/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .ema import ExpMomentumEMA +from .yolo_bricks import (BepC3StageBlock, BiFusion, CSPLayerWithTwoConv, + DarknetBottleneck, EELANBlock, EffectiveSELayer, + ELANBlock, ImplicitA, ImplicitM, + MaxPoolAndStrideConvBlock, PPYOLOEBasicBlock, + RepStageBlock, RepVGGBlock, SPPFBottleneck, + SPPFCSPBlock, TinyDownSampleBlock) + +__all__ = [ + 'SPPFBottleneck', 'RepVGGBlock', 'RepStageBlock', 'ExpMomentumEMA', + 'ELANBlock', 'MaxPoolAndStrideConvBlock', 'SPPFCSPBlock', + 'PPYOLOEBasicBlock', 'EffectiveSELayer', 'TinyDownSampleBlock', + 'EELANBlock', 'ImplicitA', 'ImplicitM', 'BepC3StageBlock', + 'CSPLayerWithTwoConv', 'DarknetBottleneck', 'BiFusion' +] diff --git a/third_party/mmyolo/mmyolo/models/layers/ema.py b/third_party/mmyolo/mmyolo/models/layers/ema.py new file mode 100644 index 0000000000000000000000000000000000000000..02ed204190ee4a5ab9395eddce5866545caac2c0 --- /dev/null +++ b/third_party/mmyolo/mmyolo/models/layers/ema.py @@ -0,0 +1,96 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from typing import Optional + +import torch +import torch.nn as nn +from mmdet.models.layers import ExpMomentumEMA as MMDET_ExpMomentumEMA +from torch import Tensor + +from mmyolo.registry import MODELS + + +@MODELS.register_module() +class ExpMomentumEMA(MMDET_ExpMomentumEMA): + """Exponential moving average (EMA) with exponential momentum strategy, + which is used in YOLO. + + Args: + model (nn.Module): The model to be averaged. + momentum (float): The momentum used for updating ema parameter. + Ema's parameters are updated with the formula: + `averaged_param = (1-momentum) * averaged_param + momentum * + source_param`. Defaults to 0.0002. + gamma (int): Use a larger momentum early in training and gradually + annealing to a smaller value to update the ema model smoothly. The + momentum is calculated as + `(1 - momentum) * exp(-(1 + steps) / gamma) + momentum`. + Defaults to 2000. + interval (int): Interval between two updates. Defaults to 1. + device (torch.device, optional): If provided, the averaged model will + be stored on the :attr:`device`. Defaults to None. + update_buffers (bool): if True, it will compute running averages for + both the parameters and the buffers of the model. Defaults to + False. 
+ """ + + def __init__(self, + model: nn.Module, + momentum: float = 0.0002, + gamma: int = 2000, + interval=1, + device: Optional[torch.device] = None, + update_buffers: bool = False): + super().__init__( + model=model, + momentum=momentum, + interval=interval, + device=device, + update_buffers=update_buffers) + assert gamma > 0, f'gamma must be greater than 0, but got {gamma}' + self.gamma = gamma + + # Note: There is no need to re-fetch every update, + # as most models do not change their structure + # during the training process. + self.src_parameters = ( + model.state_dict() + if self.update_buffers else dict(model.named_parameters())) + if not self.update_buffers: + self.src_buffers = model.buffers() + + def avg_func(self, averaged_param: Tensor, source_param: Tensor, + steps: int): + """Compute the moving average of the parameters using the exponential + momentum strategy. + + Args: + averaged_param (Tensor): The averaged parameters. + source_param (Tensor): The source parameters. + steps (int): The number of times the parameters have been + updated. + """ + momentum = (1 - self.momentum) * math.exp( + -float(1 + steps) / self.gamma) + self.momentum + averaged_param.lerp_(source_param, momentum) + + def update_parameters(self, model: nn.Module): + """Update the parameters after each training step. + + Args: + model (nn.Module): The model of the parameter needs to be updated. + """ + if self.steps == 0: + for k, p_avg in self.avg_parameters.items(): + p_avg.data.copy_(self.src_parameters[k].data) + elif self.steps % self.interval == 0: + for k, p_avg in self.avg_parameters.items(): + if p_avg.dtype.is_floating_point: + self.avg_func(p_avg.data, self.src_parameters[k].data, + self.steps) + if not self.update_buffers: + # If not update the buffers, + # keep the buffers in sync with the source model. + for b_avg, b_src in zip(self.module.buffers(), self.src_buffers): + b_avg.data.copy_(b_src.data) + self.steps += 1 diff --git a/third_party/mmyolo/mmyolo/models/layers/yolo_bricks.py b/third_party/mmyolo/mmyolo/models/layers/yolo_bricks.py new file mode 100644 index 0000000000000000000000000000000000000000..19175be1a0e88f5bb7fb87b6810c52050293d890 --- /dev/null +++ b/third_party/mmyolo/mmyolo/models/layers/yolo_bricks.py @@ -0,0 +1,1728 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Sequence, Tuple, Union + +import numpy as np +import torch +import torch.nn as nn +from mmcv.cnn import (ConvModule, DepthwiseSeparableConvModule, MaxPool2d, + build_norm_layer) +from mmdet.models.layers.csp_layer import \ + DarknetBottleneck as MMDET_DarknetBottleneck +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from mmengine.model import BaseModule +from mmengine.utils import digit_version +from torch import Tensor + +from mmyolo.registry import MODELS + +if digit_version(torch.__version__) >= digit_version('1.7.0'): + MODELS.register_module(module=nn.SiLU, name='SiLU') +else: + + class SiLU(nn.Module): + """Sigmoid Weighted Liner Unit.""" + + def __init__(self, inplace=True): + super().__init__() + + def forward(self, inputs) -> Tensor: + return inputs * torch.sigmoid(inputs) + + MODELS.register_module(module=SiLU, name='SiLU') + + +class SPPFBottleneck(BaseModule): + """Spatial pyramid pooling - Fast (SPPF) layer for + YOLOv5, YOLOX and PPYOLOE by Glenn Jocher + + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The output channels of this Module. 
+ kernel_sizes (int, tuple[int]): Sequential or number of kernel + sizes of pooling layers. Defaults to 5. + use_conv_first (bool): Whether to use conv before pooling layer. + In YOLOv5 and YOLOX, the para set to True. + In PPYOLOE, the para set to False. + Defaults to True. + mid_channels_scale (float): Channel multiplier, multiply in_channels + by this amount to get mid_channels. This parameter is valid only + when use_conv_fist=True.Defaults to 0.5. + conv_cfg (dict): Config dict for convolution layer. Defaults to None. + which means using conv2d. Defaults to None. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + in_channels: int, + out_channels: int, + kernel_sizes: Union[int, Sequence[int]] = 5, + use_conv_first: bool = True, + mid_channels_scale: float = 0.5, + conv_cfg: ConfigType = None, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None): + super().__init__(init_cfg) + + if use_conv_first: + mid_channels = int(in_channels * mid_channels_scale) + self.conv1 = ConvModule( + in_channels, + mid_channels, + 1, + stride=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + else: + mid_channels = in_channels + self.conv1 = None + self.kernel_sizes = kernel_sizes + if isinstance(kernel_sizes, int): + self.poolings = nn.MaxPool2d( + kernel_size=kernel_sizes, stride=1, padding=kernel_sizes // 2) + conv2_in_channels = mid_channels * 4 + else: + self.poolings = nn.ModuleList([ + nn.MaxPool2d(kernel_size=ks, stride=1, padding=ks // 2) + for ks in kernel_sizes + ]) + conv2_in_channels = mid_channels * (len(kernel_sizes) + 1) + + self.conv2 = ConvModule( + conv2_in_channels, + out_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + def forward(self, x: Tensor) -> Tensor: + """Forward process + Args: + x (Tensor): The input tensor. + """ + if self.conv1: + x = self.conv1(x) + if isinstance(self.kernel_sizes, int): + y1 = self.poolings(x) + y2 = self.poolings(y1) + x = torch.cat([x, y1, y2, self.poolings(y2)], dim=1) + else: + x = torch.cat( + [x] + [pooling(x) for pooling in self.poolings], dim=1) + x = self.conv2(x) + return x + + +@MODELS.register_module() +class RepVGGBlock(nn.Module): + """RepVGGBlock is a basic rep-style block, including training and deploy + status This code is based on + https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py. + + Args: + in_channels (int): Number of channels in the input image + out_channels (int): Number of channels produced by the convolution + kernel_size (int or tuple): Size of the convolving kernel + stride (int or tuple): Stride of the convolution. Default: 1 + padding (int, tuple): Padding added to all four sides of + the input. Default: 1 + dilation (int or tuple): Spacing between kernel elements. Default: 1 + groups (int, optional): Number of blocked connections from input + channels to output channels. Default: 1 + padding_mode (string, optional): Default: 'zeros' + use_se (bool): Whether to use se. Default: False + use_alpha (bool): Whether to use `alpha` parameter at 1x1 conv. + In PPYOLOE+ model backbone, `use_alpha` will be set to True. + Default: False. + use_bn_first (bool): Whether to use bn layer before conv. 
+ In YOLOv6 and YOLOv7, this will be set to True. + In PPYOLOE, this will be set to False. + Default: True. + deploy (bool): Whether in deploy mode. Default: False + """ + + def __init__(self, + in_channels: int, + out_channels: int, + kernel_size: Union[int, Tuple[int]] = 3, + stride: Union[int, Tuple[int]] = 1, + padding: Union[int, Tuple[int]] = 1, + dilation: Union[int, Tuple[int]] = 1, + groups: Optional[int] = 1, + padding_mode: Optional[str] = 'zeros', + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='ReLU', inplace=True), + use_se: bool = False, + use_alpha: bool = False, + use_bn_first=True, + deploy: bool = False): + super().__init__() + self.deploy = deploy + self.groups = groups + self.in_channels = in_channels + self.out_channels = out_channels + + assert kernel_size == 3 + assert padding == 1 + + padding_11 = padding - kernel_size // 2 + + self.nonlinearity = MODELS.build(act_cfg) + + if use_se: + raise NotImplementedError('se block not supported yet') + else: + self.se = nn.Identity() + + if use_alpha: + alpha = torch.ones([ + 1, + ], dtype=torch.float32, requires_grad=True) + self.alpha = nn.Parameter(alpha, requires_grad=True) + else: + self.alpha = None + + if deploy: + self.rbr_reparam = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias=True, + padding_mode=padding_mode) + + else: + if use_bn_first and (out_channels == in_channels) and stride == 1: + self.rbr_identity = build_norm_layer( + norm_cfg, num_features=in_channels)[1] + else: + self.rbr_identity = None + + self.rbr_dense = ConvModule( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + bias=False, + norm_cfg=norm_cfg, + act_cfg=None) + self.rbr_1x1 = ConvModule( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=stride, + padding=padding_11, + groups=groups, + bias=False, + norm_cfg=norm_cfg, + act_cfg=None) + + def forward(self, inputs: Tensor) -> Tensor: + """Forward process. + Args: + inputs (Tensor): The input tensor. + + Returns: + Tensor: The output tensor. + """ + if hasattr(self, 'rbr_reparam'): + return self.nonlinearity(self.se(self.rbr_reparam(inputs))) + + if self.rbr_identity is None: + id_out = 0 + else: + id_out = self.rbr_identity(inputs) + if self.alpha: + return self.nonlinearity( + self.se( + self.rbr_dense(inputs) + + self.alpha * self.rbr_1x1(inputs) + id_out)) + else: + return self.nonlinearity( + self.se( + self.rbr_dense(inputs) + self.rbr_1x1(inputs) + id_out)) + + def get_equivalent_kernel_bias(self): + """Derives the equivalent kernel and bias in a differentiable way. + + Returns: + tuple: Equivalent kernel and bias + """ + kernel3x3, bias3x3 = self._fuse_bn_tensor(self.rbr_dense) + kernel1x1, bias1x1 = self._fuse_bn_tensor(self.rbr_1x1) + kernelid, biasid = self._fuse_bn_tensor(self.rbr_identity) + if self.alpha: + return kernel3x3 + self.alpha * self._pad_1x1_to_3x3_tensor( + kernel1x1) + kernelid, bias3x3 + self.alpha * bias1x1 + biasid + else: + return kernel3x3 + self._pad_1x1_to_3x3_tensor( + kernel1x1) + kernelid, bias3x3 + bias1x1 + biasid + + def _pad_1x1_to_3x3_tensor(self, kernel1x1): + """Pad 1x1 tensor to 3x3. + Args: + kernel1x1 (Tensor): The input 1x1 kernel need to be padded. + + Returns: + Tensor: 3x3 kernel after padded. 
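+
+        Example:
+            Illustrative shapes only: the 1x1 kernel is zero-padded on all
+            four sides so it can be summed with the 3x3 branch kernel.
+
+            >>> import torch
+            >>> k1x1 = torch.randn(8, 8, 1, 1)
+            >>> torch.nn.functional.pad(k1x1, [1, 1, 1, 1]).shape
+            torch.Size([8, 8, 3, 3])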
+ """ + if kernel1x1 is None: + return 0 + else: + return torch.nn.functional.pad(kernel1x1, [1, 1, 1, 1]) + + def _fuse_bn_tensor(self, branch: nn.Module) -> Tuple[np.ndarray, Tensor]: + """Derives the equivalent kernel and bias of a specific branch layer. + + Args: + branch (nn.Module): The layer that needs to be equivalently + transformed, which can be nn.Sequential or nn.Batchnorm2d + + Returns: + tuple: Equivalent kernel and bias + """ + if branch is None: + return 0, 0 + if isinstance(branch, ConvModule): + kernel = branch.conv.weight + running_mean = branch.bn.running_mean + running_var = branch.bn.running_var + gamma = branch.bn.weight + beta = branch.bn.bias + eps = branch.bn.eps + else: + assert isinstance(branch, (nn.SyncBatchNorm, nn.BatchNorm2d)) + if not hasattr(self, 'id_tensor'): + input_dim = self.in_channels // self.groups + kernel_value = np.zeros((self.in_channels, input_dim, 3, 3), + dtype=np.float32) + for i in range(self.in_channels): + kernel_value[i, i % input_dim, 1, 1] = 1 + self.id_tensor = torch.from_numpy(kernel_value).to( + branch.weight.device) + kernel = self.id_tensor + running_mean = branch.running_mean + running_var = branch.running_var + gamma = branch.weight + beta = branch.bias + eps = branch.eps + std = (running_var + eps).sqrt() + t = (gamma / std).reshape(-1, 1, 1, 1) + return kernel * t, beta - running_mean * gamma / std + + def switch_to_deploy(self): + """Switch to deploy mode.""" + if hasattr(self, 'rbr_reparam'): + return + kernel, bias = self.get_equivalent_kernel_bias() + self.rbr_reparam = nn.Conv2d( + in_channels=self.rbr_dense.conv.in_channels, + out_channels=self.rbr_dense.conv.out_channels, + kernel_size=self.rbr_dense.conv.kernel_size, + stride=self.rbr_dense.conv.stride, + padding=self.rbr_dense.conv.padding, + dilation=self.rbr_dense.conv.dilation, + groups=self.rbr_dense.conv.groups, + bias=True) + self.rbr_reparam.weight.data = kernel + self.rbr_reparam.bias.data = bias + for para in self.parameters(): + para.detach_() + self.__delattr__('rbr_dense') + self.__delattr__('rbr_1x1') + if hasattr(self, 'rbr_identity'): + self.__delattr__('rbr_identity') + if hasattr(self, 'id_tensor'): + self.__delattr__('id_tensor') + self.deploy = True + + +@MODELS.register_module() +class BepC3StageBlock(nn.Module): + """Beer-mug RepC3 Block. + + Args: + in_channels (int): Number of channels in the input image + out_channels (int): Number of channels produced by the convolution + num_blocks (int): Number of blocks. Defaults to 1 + hidden_ratio (float): Hidden channel expansion. + Default: 0.5 + concat_all_layer (bool): Concat all layer when forward calculate. + Default: True + block_cfg (dict): Config dict for the block used to build each + layer. Defaults to dict(type='RepVGGBlock'). + norm_cfg (ConfigType): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (ConfigType): Config dict for activation layer. + Defaults to dict(type='ReLU', inplace=True). 
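+
+    Example:
+        A minimal sketch; channel sizes and spatial shapes are illustrative.
+
+        >>> import torch
+        >>> block = BepC3StageBlock(64, 128, num_blocks=2)
+        >>> block(torch.randn(1, 64, 40, 40)).shape
+        torch.Size([1, 128, 40, 40])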
+ """ + + def __init__(self, + in_channels: int, + out_channels: int, + num_blocks: int = 1, + hidden_ratio: float = 0.5, + concat_all_layer: bool = True, + block_cfg: ConfigType = dict(type='RepVGGBlock'), + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='ReLU', inplace=True)): + super().__init__() + hidden_channels = int(out_channels * hidden_ratio) + + self.conv1 = ConvModule( + in_channels, + hidden_channels, + kernel_size=1, + stride=1, + groups=1, + bias=False, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.conv2 = ConvModule( + in_channels, + hidden_channels, + kernel_size=1, + stride=1, + groups=1, + bias=False, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.conv3 = ConvModule( + 2 * hidden_channels, + out_channels, + kernel_size=1, + stride=1, + groups=1, + bias=False, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.block = RepStageBlock( + in_channels=hidden_channels, + out_channels=hidden_channels, + num_blocks=num_blocks, + block_cfg=block_cfg, + bottle_block=BottleRep) + self.concat_all_layer = concat_all_layer + if not concat_all_layer: + self.conv3 = ConvModule( + hidden_channels, + out_channels, + kernel_size=1, + stride=1, + groups=1, + bias=False, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + def forward(self, x): + if self.concat_all_layer is True: + return self.conv3( + torch.cat((self.block(self.conv1(x)), self.conv2(x)), dim=1)) + else: + return self.conv3(self.block(self.conv1(x))) + + +class BottleRep(nn.Module): + """Bottle Rep Block. + + Args: + in_channels (int): Number of channels in the input image + out_channels (int): Number of channels produced by the convolution + block_cfg (dict): Config dict for the block used to build each + layer. Defaults to dict(type='RepVGGBlock'). + adaptive_weight (bool): Add adaptive_weight when forward calculate. + Defaults False. + """ + + def __init__(self, + in_channels: int, + out_channels: int, + block_cfg: ConfigType = dict(type='RepVGGBlock'), + adaptive_weight: bool = False): + super().__init__() + conv1_cfg = block_cfg.copy() + conv2_cfg = block_cfg.copy() + + conv1_cfg.update( + dict(in_channels=in_channels, out_channels=out_channels)) + conv2_cfg.update( + dict(in_channels=out_channels, out_channels=out_channels)) + + self.conv1 = MODELS.build(conv1_cfg) + self.conv2 = MODELS.build(conv2_cfg) + + if in_channels != out_channels: + self.shortcut = False + else: + self.shortcut = True + if adaptive_weight: + self.alpha = nn.Parameter(torch.ones(1)) + else: + self.alpha = 1.0 + + def forward(self, x: Tensor) -> Tensor: + outputs = self.conv1(x) + outputs = self.conv2(outputs) + return outputs + self.alpha * x if self.shortcut else outputs + + +@MODELS.register_module() +class ConvWrapper(nn.Module): + """Wrapper for normal Conv with SiLU activation. + + Args: + in_channels (int): Number of channels in the input image + out_channels (int): Number of channels produced by the convolution + kernel_size (int or tuple): Size of the convolving kernel + stride (int or tuple): Stride of the convolution. Default: 1 + groups (int, optional): Number of blocked connections from input + channels to output channels. Default: 1 + bias (bool, optional): Conv bias. Default: True. + norm_cfg (ConfigType): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (ConfigType): Config dict for activation layer. + Defaults to dict(type='ReLU', inplace=True). 
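+
+    Example:
+        A minimal sketch with arbitrary shapes; the default 3x3 kernel with
+        stride 1 keeps the spatial resolution.
+
+        >>> import torch
+        >>> conv = ConvWrapper(32, 64)
+        >>> conv(torch.randn(1, 32, 20, 20)).shape
+        torch.Size([1, 64, 20, 20])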
+ """ + + def __init__(self, + in_channels: int, + out_channels: int, + kernel_size: int = 3, + stride: int = 1, + groups: int = 1, + bias: bool = True, + norm_cfg: ConfigType = None, + act_cfg: ConfigType = dict(type='SiLU')): + super().__init__() + self.block = ConvModule( + in_channels, + out_channels, + kernel_size, + stride, + padding=kernel_size // 2, + groups=groups, + bias=bias, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + def forward(self, x: Tensor) -> Tensor: + return self.block(x) + + +@MODELS.register_module() +class EffectiveSELayer(nn.Module): + """Effective Squeeze-Excitation. + + From `CenterMask : Real-Time Anchor-Free Instance Segmentation` + arxiv (https://arxiv.org/abs/1911.06667) + This code referenced to + https://github.com/youngwanLEE/CenterMask/blob/72147e8aae673fcaf4103ee90a6a6b73863e7fa1/maskrcnn_benchmark/modeling/backbone/vovnet.py#L108-L121 # noqa + + Args: + channels (int): The input and output channels of this Module. + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='HSigmoid'). + """ + + def __init__(self, + channels: int, + act_cfg: ConfigType = dict(type='HSigmoid')): + super().__init__() + assert isinstance(act_cfg, dict) + self.fc = ConvModule(channels, channels, 1, act_cfg=None) + + act_cfg_ = act_cfg.copy() # type: ignore + self.activate = MODELS.build(act_cfg_) + + def forward(self, x: Tensor) -> Tensor: + """Forward process + Args: + x (Tensor): The input tensor. + """ + x_se = x.mean((2, 3), keepdim=True) + x_se = self.fc(x_se) + return x * self.activate(x_se) + + +class PPYOLOESELayer(nn.Module): + """Squeeze-and-Excitation Attention Module for PPYOLOE. + There are some differences between the current implementation and + SELayer in mmdet: + 1. For fast speed and avoiding double inference in ppyoloe, + use `F.adaptive_avg_pool2d` before PPYOLOESELayer. + 2. Special ways to init weights. + 3. Different convolution order. + + Args: + feat_channels (int): The input (and output) channels of the SE layer. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.1, eps=1e-5). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + """ + + def __init__(self, + feat_channels: int, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.1, eps=1e-5), + act_cfg: ConfigType = dict(type='SiLU', inplace=True)): + super().__init__() + self.fc = nn.Conv2d(feat_channels, feat_channels, 1) + self.sig = nn.Sigmoid() + self.conv = ConvModule( + feat_channels, + feat_channels, + 1, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self._init_weights() + + def _init_weights(self): + """Init weights.""" + nn.init.normal_(self.fc.weight, mean=0, std=0.001) + + def forward(self, feat: Tensor, avg_feat: Tensor) -> Tensor: + """Forward process + Args: + feat (Tensor): The input tensor. + avg_feat (Tensor): Average pooling feature tensor. + """ + weight = self.sig(self.fc(avg_feat)) + return self.conv(feat * weight) + + +@MODELS.register_module() +class ELANBlock(BaseModule): + """Efficient layer aggregation networks for YOLOv7. + + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The out channels of this Module. + middle_ratio (float): The scaling ratio of the middle layer + based on the in_channels. + block_ratio (float): The scaling ratio of the block layer + based on the in_channels. + num_blocks (int): The number of blocks in the main branch. + Defaults to 2. + num_convs_in_block (int): The number of convs pre block. 
+ Defaults to 1. + conv_cfg (dict): Config dict for convolution layer. Defaults to None. + which means using conv2d. Defaults to None. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + in_channels: int, + out_channels: int, + middle_ratio: float, + block_ratio: float, + num_blocks: int = 2, + num_convs_in_block: int = 1, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None): + super().__init__(init_cfg=init_cfg) + assert num_blocks >= 1 + assert num_convs_in_block >= 1 + + middle_channels = int(in_channels * middle_ratio) + block_channels = int(in_channels * block_ratio) + final_conv_in_channels = int( + num_blocks * block_channels) + 2 * middle_channels + + self.main_conv = ConvModule( + in_channels, + middle_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self.short_conv = ConvModule( + in_channels, + middle_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self.blocks = nn.ModuleList() + for _ in range(num_blocks): + if num_convs_in_block == 1: + internal_block = ConvModule( + middle_channels, + block_channels, + 3, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + else: + internal_block = [] + for _ in range(num_convs_in_block): + internal_block.append( + ConvModule( + middle_channels, + block_channels, + 3, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + middle_channels = block_channels + internal_block = nn.Sequential(*internal_block) + + middle_channels = block_channels + self.blocks.append(internal_block) + + self.final_conv = ConvModule( + final_conv_in_channels, + out_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + def forward(self, x: Tensor) -> Tensor: + """Forward process + Args: + x (Tensor): The input tensor. + """ + x_short = self.short_conv(x) + x_main = self.main_conv(x) + block_outs = [] + x_block = x_main + for block in self.blocks: + x_block = block(x_block) + block_outs.append(x_block) + x_final = torch.cat((*block_outs[::-1], x_main, x_short), dim=1) + return self.final_conv(x_final) + + +@MODELS.register_module() +class EELANBlock(BaseModule): + """Expand efficient layer aggregation networks for YOLOv7. + + Args: + num_elan_block (int): The number of ELANBlock. + """ + + def __init__(self, num_elan_block: int, **kwargs): + super().__init__() + assert num_elan_block >= 1 + self.e_elan_blocks = nn.ModuleList() + for _ in range(num_elan_block): + self.e_elan_blocks.append(ELANBlock(**kwargs)) + + def forward(self, x: Tensor) -> Tensor: + outs = [] + for elan_blocks in self.e_elan_blocks: + outs.append(elan_blocks(x)) + return sum(outs) + + +class MaxPoolAndStrideConvBlock(BaseModule): + """Max pooling and stride conv layer for YOLOv7. + + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The out channels of this Module. + maxpool_kernel_sizes (int): kernel sizes of pooling layers. + Defaults to 2. + use_in_channels_of_middle (bool): Whether to calculate middle channels + based on in_channels. Defaults to False. + conv_cfg (dict): Config dict for convolution layer. 
Defaults to None. + which means using conv2d. Defaults to None. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + in_channels: int, + out_channels: int, + maxpool_kernel_sizes: int = 2, + use_in_channels_of_middle: bool = False, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None): + super().__init__(init_cfg=init_cfg) + + middle_channels = in_channels if use_in_channels_of_middle \ + else out_channels // 2 + + self.maxpool_branches = nn.Sequential( + MaxPool2d( + kernel_size=maxpool_kernel_sizes, stride=maxpool_kernel_sizes), + ConvModule( + in_channels, + out_channels // 2, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + self.stride_conv_branches = nn.Sequential( + ConvModule( + in_channels, + middle_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg), + ConvModule( + middle_channels, + out_channels // 2, + 3, + stride=2, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + def forward(self, x: Tensor) -> Tensor: + """Forward process + Args: + x (Tensor): The input tensor. + """ + maxpool_out = self.maxpool_branches(x) + stride_conv_out = self.stride_conv_branches(x) + return torch.cat([stride_conv_out, maxpool_out], dim=1) + + +@MODELS.register_module() +class TinyDownSampleBlock(BaseModule): + """Down sample layer for YOLOv7-tiny. + + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The out channels of this Module. + middle_ratio (float): The scaling ratio of the middle layer + based on the in_channels. Defaults to 1.0. + kernel_sizes (int, tuple[int]): Sequential or number of kernel + sizes of pooling layers. Defaults to 3. + conv_cfg (dict): Config dict for convolution layer. Defaults to None. + which means using conv2d. Defaults to None. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='LeakyReLU', negative_slope=0.1). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. 
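+
+    Example:
+        A minimal sketch with arbitrary shapes; with the default settings the
+        block only changes the channel dimension and keeps the spatial size.
+
+        >>> import torch
+        >>> block = TinyDownSampleBlock(64, 128)
+        >>> block(torch.randn(1, 64, 40, 40)).shape
+        torch.Size([1, 128, 40, 40])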
+ """ + + def __init__( + self, + in_channels: int, + out_channels: int, + middle_ratio: float = 1.0, + kernel_sizes: Union[int, Sequence[int]] = 3, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict(type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='LeakyReLU', negative_slope=0.1), + init_cfg: OptMultiConfig = None): + super().__init__(init_cfg) + + middle_channels = int(in_channels * middle_ratio) + + self.short_conv = ConvModule( + in_channels, + middle_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self.main_convs = nn.ModuleList() + for i in range(3): + if i == 0: + self.main_convs.append( + ConvModule( + in_channels, + middle_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + else: + self.main_convs.append( + ConvModule( + middle_channels, + middle_channels, + kernel_sizes, + padding=(kernel_sizes - 1) // 2, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + self.final_conv = ConvModule( + middle_channels * 4, + out_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + def forward(self, x) -> Tensor: + short_out = self.short_conv(x) + + main_outs = [] + for main_conv in self.main_convs: + main_out = main_conv(x) + main_outs.append(main_out) + x = main_out + + return self.final_conv(torch.cat([*main_outs[::-1], short_out], dim=1)) + + +@MODELS.register_module() +class SPPFCSPBlock(BaseModule): + """Spatial pyramid pooling - Fast (SPPF) layer with CSP for + YOLOv7 + + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The output channels of this Module. + expand_ratio (float): Expand ratio of SPPCSPBlock. + Defaults to 0.5. + kernel_sizes (int, tuple[int]): Sequential or number of kernel + sizes of pooling layers. Defaults to 5. + is_tiny_version (bool): Is tiny version of SPPFCSPBlock. If True, + it means it is a yolov7 tiny model. Defaults to False. + conv_cfg (dict): Config dict for convolution layer. Defaults to None. + which means using conv2d. Defaults to None. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. 
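+
+    Example:
+        A minimal sketch with arbitrary shapes.
+
+        >>> import torch
+        >>> block = SPPFCSPBlock(512, 256)
+        >>> block(torch.randn(1, 512, 20, 20)).shape
+        torch.Size([1, 256, 20, 20])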
+ """ + + def __init__(self, + in_channels: int, + out_channels: int, + expand_ratio: float = 0.5, + kernel_sizes: Union[int, Sequence[int]] = 5, + is_tiny_version: bool = False, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None): + super().__init__(init_cfg=init_cfg) + self.is_tiny_version = is_tiny_version + + mid_channels = int(2 * out_channels * expand_ratio) + + if is_tiny_version: + self.main_layers = ConvModule( + in_channels, + mid_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + else: + self.main_layers = nn.Sequential( + ConvModule( + in_channels, + mid_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg), + ConvModule( + mid_channels, + mid_channels, + 3, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg), + ConvModule( + mid_channels, + mid_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg), + ) + + self.kernel_sizes = kernel_sizes + if isinstance(kernel_sizes, int): + self.poolings = nn.MaxPool2d( + kernel_size=kernel_sizes, stride=1, padding=kernel_sizes // 2) + else: + self.poolings = nn.ModuleList([ + nn.MaxPool2d(kernel_size=ks, stride=1, padding=ks // 2) + for ks in kernel_sizes + ]) + + if is_tiny_version: + self.fuse_layers = ConvModule( + 4 * mid_channels, + mid_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + else: + self.fuse_layers = nn.Sequential( + ConvModule( + 4 * mid_channels, + mid_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg), + ConvModule( + mid_channels, + mid_channels, + 3, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + self.short_layer = ConvModule( + in_channels, + mid_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self.final_conv = ConvModule( + 2 * mid_channels, + out_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + def forward(self, x) -> Tensor: + """Forward process + Args: + x (Tensor): The input tensor. + """ + x1 = self.main_layers(x) + if isinstance(self.kernel_sizes, int): + y1 = self.poolings(x1) + y2 = self.poolings(y1) + concat_list = [x1] + [y1, y2, self.poolings(y2)] + if self.is_tiny_version: + x1 = self.fuse_layers(torch.cat(concat_list[::-1], 1)) + else: + x1 = self.fuse_layers(torch.cat(concat_list, 1)) + else: + concat_list = [x1] + [m(x1) for m in self.poolings] + if self.is_tiny_version: + x1 = self.fuse_layers(torch.cat(concat_list[::-1], 1)) + else: + x1 = self.fuse_layers(torch.cat(concat_list, 1)) + + x2 = self.short_layer(x) + return self.final_conv(torch.cat((x1, x2), dim=1)) + + +class ImplicitA(nn.Module): + """Implicit add layer in YOLOv7. + + Args: + in_channels (int): The input channels of this Module. + mean (float): Mean value of implicit module. Defaults to 0. + std (float): Std value of implicit module. Defaults to 0.02 + """ + + def __init__(self, in_channels: int, mean: float = 0., std: float = .02): + super().__init__() + self.implicit = nn.Parameter(torch.zeros(1, in_channels, 1, 1)) + nn.init.normal_(self.implicit, mean=mean, std=std) + + def forward(self, x): + """Forward process + Args: + x (Tensor): The input tensor. + """ + return self.implicit + x + + +class ImplicitM(nn.Module): + """Implicit multiplier layer in YOLOv7. + + Args: + in_channels (int): The input channels of this Module. 
+ mean (float): Mean value of implicit module. Defaults to 1. + std (float): Std value of implicit module. Defaults to 0.02. + """ + + def __init__(self, in_channels: int, mean: float = 1., std: float = .02): + super().__init__() + self.implicit = nn.Parameter(torch.ones(1, in_channels, 1, 1)) + nn.init.normal_(self.implicit, mean=mean, std=std) + + def forward(self, x): + """Forward process + Args: + x (Tensor): The input tensor. + """ + return self.implicit * x + + +@MODELS.register_module() +class PPYOLOEBasicBlock(nn.Module): + """PPYOLOE Backbone BasicBlock. + + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The output channels of this Module. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.1, eps=1e-5). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + shortcut (bool): Whether to add inputs and outputs together + at the end of this layer. Defaults to True. + use_alpha (bool): Whether to use `alpha` parameter at 1x1 conv. + """ + + def __init__(self, + in_channels: int, + out_channels: int, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.1, eps=1e-5), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + shortcut: bool = True, + use_alpha: bool = False): + super().__init__() + assert act_cfg is None or isinstance(act_cfg, dict) + self.conv1 = ConvModule( + in_channels, + out_channels, + 3, + stride=1, + padding=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self.conv2 = RepVGGBlock( + out_channels, + out_channels, + use_alpha=use_alpha, + act_cfg=act_cfg, + norm_cfg=norm_cfg, + use_bn_first=False) + self.shortcut = shortcut + + def forward(self, x: Tensor) -> Tensor: + """Forward process. + Args: + inputs (Tensor): The input tensor. + + Returns: + Tensor: The output tensor. + """ + y = self.conv1(x) + y = self.conv2(y) + if self.shortcut: + return x + y + else: + return y + + +class CSPResLayer(nn.Module): + """PPYOLOE Backbone Stage. + + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The output channels of this Module. + num_block (int): Number of blocks in this stage. + block_cfg (dict): Config dict for block. Default config is + suitable for PPYOLOE+ backbone. And in PPYOLOE neck, + block_cfg is set to dict(type='PPYOLOEBasicBlock', + shortcut=False, use_alpha=False). Defaults to + dict(type='PPYOLOEBasicBlock', shortcut=True, use_alpha=True). + stride (int): Stride of the convolution. In backbone, the stride + must be set to 2. In neck, the stride must be set to 1. + Defaults to 1. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.1, eps=1e-5). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + attention_cfg (dict, optional): Config dict for `EffectiveSELayer`. + Defaults to dict(type='EffectiveSELayer', + act_cfg=dict(type='HSigmoid')). + use_spp (bool): Whether to use `SPPFBottleneck` layer. + Defaults to False. 
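+
+    Example:
+        A minimal sketch with arbitrary shapes; with ``stride=2`` the stage
+        halves the spatial resolution through ``conv_down``.
+
+        >>> import torch
+        >>> layer = CSPResLayer(64, 128, num_block=1, stride=2)
+        >>> layer(torch.randn(1, 64, 80, 80)).shape
+        torch.Size([1, 128, 40, 40])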
+ """ + + def __init__(self, + in_channels: int, + out_channels: int, + num_block: int, + block_cfg: ConfigType = dict( + type='PPYOLOEBasicBlock', shortcut=True, use_alpha=True), + stride: int = 1, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.1, eps=1e-5), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + attention_cfg: OptMultiConfig = dict( + type='EffectiveSELayer', act_cfg=dict(type='HSigmoid')), + use_spp: bool = False): + super().__init__() + + self.num_block = num_block + self.block_cfg = block_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.use_spp = use_spp + assert attention_cfg is None or isinstance(attention_cfg, dict) + + if stride == 2: + conv1_in_channels = conv2_in_channels = conv3_in_channels = ( + in_channels + out_channels) // 2 + blocks_channels = conv1_in_channels // 2 + self.conv_down = ConvModule( + in_channels, + conv1_in_channels, + 3, + stride=2, + padding=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + else: + conv1_in_channels = conv2_in_channels = in_channels + conv3_in_channels = out_channels + blocks_channels = out_channels // 2 + self.conv_down = None + + self.conv1 = ConvModule( + conv1_in_channels, + blocks_channels, + 1, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self.conv2 = ConvModule( + conv2_in_channels, + blocks_channels, + 1, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self.blocks = self.build_blocks_layer(blocks_channels) + + self.conv3 = ConvModule( + conv3_in_channels, + out_channels, + 1, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + if attention_cfg: + attention_cfg = attention_cfg.copy() + attention_cfg['channels'] = blocks_channels * 2 + self.attn = MODELS.build(attention_cfg) + else: + self.attn = None + + def build_blocks_layer(self, blocks_channels: int) -> nn.Module: + """Build blocks layer. + + Args: + blocks_channels: The channels of this Module. + """ + blocks = nn.Sequential() + block_cfg = self.block_cfg.copy() + block_cfg.update( + dict(in_channels=blocks_channels, out_channels=blocks_channels)) + block_cfg.setdefault('norm_cfg', self.norm_cfg) + block_cfg.setdefault('act_cfg', self.act_cfg) + + for i in range(self.num_block): + blocks.add_module(str(i), MODELS.build(block_cfg)) + + if i == (self.num_block - 1) // 2 and self.use_spp: + blocks.add_module( + 'spp', + SPPFBottleneck( + blocks_channels, + blocks_channels, + kernel_sizes=[5, 9, 13], + use_conv_first=False, + conv_cfg=None, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + + return blocks + + def forward(self, x: Tensor) -> Tensor: + """Forward process + Args: + x (Tensor): The input tensor. + """ + if self.conv_down is not None: + x = self.conv_down(x) + y1 = self.conv1(x) + y2 = self.blocks(self.conv2(x)) + y = torch.cat([y1, y2], axis=1) + if self.attn is not None: + y = self.attn(y) + y = self.conv3(y) + return y + + +@MODELS.register_module() +class RepStageBlock(nn.Module): + """RepStageBlock is a stage block with rep-style basic block. + + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The output channels of this Module. + num_blocks (int, tuple[int]): Number of blocks. Defaults to 1. + bottle_block (nn.Module): Basic unit of RepStage. + Defaults to RepVGGBlock. + block_cfg (ConfigType): Config of RepStage. + Defaults to 'RepVGGBlock'. 
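+
+    Example:
+        A minimal sketch with arbitrary shapes.
+
+        >>> import torch
+        >>> stage = RepStageBlock(64, 128, num_blocks=3)
+        >>> stage(torch.randn(1, 64, 40, 40)).shape
+        torch.Size([1, 128, 40, 40])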
+ """ + + def __init__(self, + in_channels: int, + out_channels: int, + num_blocks: int = 1, + bottle_block: nn.Module = RepVGGBlock, + block_cfg: ConfigType = dict(type='RepVGGBlock')): + super().__init__() + block_cfg = block_cfg.copy() + + block_cfg.update( + dict(in_channels=in_channels, out_channels=out_channels)) + + self.conv1 = MODELS.build(block_cfg) + + block_cfg.update( + dict(in_channels=out_channels, out_channels=out_channels)) + + self.block = None + if num_blocks > 1: + self.block = nn.Sequential(*(MODELS.build(block_cfg) + for _ in range(num_blocks - 1))) + + if bottle_block == BottleRep: + self.conv1 = BottleRep( + in_channels, + out_channels, + block_cfg=block_cfg, + adaptive_weight=True) + num_blocks = num_blocks // 2 + self.block = None + if num_blocks > 1: + self.block = nn.Sequential(*(BottleRep( + out_channels, + out_channels, + block_cfg=block_cfg, + adaptive_weight=True) for _ in range(num_blocks - 1))) + + def forward(self, x: Tensor) -> Tensor: + """Forward process. + + Args: + x (Tensor): The input tensor. + + Returns: + Tensor: The output tensor. + """ + x = self.conv1(x) + if self.block is not None: + x = self.block(x) + return x + + +class DarknetBottleneck(MMDET_DarknetBottleneck): + """The basic bottleneck block used in Darknet. + + Each ResBlock consists of two ConvModules and the input is added to the + final output. Each ConvModule is composed of Conv, BN, and LeakyReLU. + The first convLayer has filter size of k1Xk1 and the second one has the + filter size of k2Xk2. + + Note: + This DarknetBottleneck is little different from MMDet's, we can + change the kernel size and padding for each conv. + + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The output channels of this Module. + expansion (float): The kernel size for hidden channel. + Defaults to 0.5. + kernel_size (Sequence[int]): The kernel size of the convolution. + Defaults to (1, 3). + padding (Sequence[int]): The padding size of the convolution. + Defaults to (0, 1). + add_identity (bool): Whether to add identity to the out. + Defaults to True + use_depthwise (bool): Whether to use depthwise separable convolution. + Defaults to False + conv_cfg (dict): Config dict for convolution layer. Default: None, + which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='Swish'). 
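+
+    Example:
+        A minimal sketch with arbitrary shapes; with equal input and output
+        channels the shortcut connection is enabled.
+
+        >>> import torch
+        >>> block = DarknetBottleneck(64, 64)
+        >>> block(torch.randn(1, 64, 40, 40)).shape
+        torch.Size([1, 64, 40, 40])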
+ """ + + def __init__(self, + in_channels: int, + out_channels: int, + expansion: float = 0.5, + kernel_size: Sequence[int] = (1, 3), + padding: Sequence[int] = (0, 1), + add_identity: bool = True, + use_depthwise: bool = False, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None) -> None: + super().__init__(in_channels, out_channels, init_cfg=init_cfg) + hidden_channels = int(out_channels * expansion) + conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule + assert isinstance(kernel_size, Sequence) and len(kernel_size) == 2 + + self.conv1 = ConvModule( + in_channels, + hidden_channels, + kernel_size[0], + padding=padding[0], + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.conv2 = conv( + hidden_channels, + out_channels, + kernel_size[1], + stride=1, + padding=padding[1], + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.add_identity = \ + add_identity and in_channels == out_channels + + +class CSPLayerWithTwoConv(BaseModule): + """Cross Stage Partial Layer with 2 convolutions. + + Args: + in_channels (int): The input channels of the CSP layer. + out_channels (int): The output channels of the CSP layer. + expand_ratio (float): Ratio to adjust the number of channels of the + hidden layer. Defaults to 0.5. + num_blocks (int): Number of blocks. Defaults to 1 + add_identity (bool): Whether to add identity in blocks. + Defaults to True. + conv_cfg (dict, optional): Config dict for convolution layer. + Defaults to None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + init_cfg (:obj:`ConfigDict` or dict or list[dict] or + list[:obj:`ConfigDict`], optional): Initialization config dict. + Defaults to None. + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + expand_ratio: float = 0.5, + num_blocks: int = 1, + add_identity: bool = True, # shortcut + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict(type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None) -> None: + super().__init__(init_cfg=init_cfg) + + self.mid_channels = int(out_channels * expand_ratio) + self.main_conv = ConvModule( + in_channels, + 2 * self.mid_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.final_conv = ConvModule( + (2 + num_blocks) * self.mid_channels, + out_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self.blocks = nn.ModuleList( + DarknetBottleneck( + self.mid_channels, + self.mid_channels, + expansion=1, + kernel_size=(3, 3), + padding=(1, 1), + add_identity=add_identity, + use_depthwise=False, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) for _ in range(num_blocks)) + + def forward(self, x: Tensor) -> Tensor: + """Forward process.""" + x_main = self.main_conv(x) + x_main = list(x_main.split((self.mid_channels, self.mid_channels), 1)) + x_main.extend(blocks(x_main[-1]) for blocks in self.blocks) + return self.final_conv(torch.cat(x_main, 1)) + + +class BiFusion(nn.Module): + """BiFusion Block in YOLOv6. + + BiFusion fuses current-, high- and low-level features. + Compared with concatenation in PAN, it fuses an extra low-level feature. 
+ + Args: + in_channels0 (int): The channels of current-level feature. + in_channels1 (int): The input channels of lower-level feature. + out_channels (int): The out channels of the BiFusion module. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + in_channels0: int, + in_channels1: int, + out_channels: int, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='ReLU', inplace=True)): + super().__init__() + self.conv1 = ConvModule( + in_channels0, + out_channels, + kernel_size=1, + stride=1, + padding=0, + bias=False, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.conv2 = ConvModule( + in_channels1, + out_channels, + kernel_size=1, + stride=1, + padding=0, + bias=False, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.conv3 = ConvModule( + out_channels * 3, + out_channels, + kernel_size=1, + stride=1, + padding=0, + bias=False, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.upsample = nn.ConvTranspose2d( + out_channels, out_channels, kernel_size=2, stride=2, bias=True) + self.downsample = ConvModule( + out_channels, + out_channels, + kernel_size=3, + stride=2, + padding=1, + bias=False, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + def forward(self, x: List[torch.Tensor]) -> Tensor: + """Forward process + Args: + x (List[torch.Tensor]): The tensor list of length 3. + x[0]: The high-level feature. + x[1]: The current-level feature. + x[2]: The low-level feature. + """ + x0 = self.upsample(x[0]) + x1 = self.conv1(x[1]) + x2 = self.downsample(self.conv2(x[2])) + return self.conv3(torch.cat((x0, x1, x2), dim=1)) + + +class CSPSPPFBottleneck(BaseModule): + """The SPPF block having a CSP-like version in YOLOv6 3.0. + + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The output channels of this Module. + kernel_sizes (int, tuple[int]): Sequential or number of kernel + sizes of pooling layers. Defaults to 5. + use_conv_first (bool): Whether to use conv before pooling layer. + In YOLOv5 and YOLOX, the para set to True. + In PPYOLOE, the para set to False. + Defaults to True. + mid_channels_scale (float): Channel multiplier, multiply in_channels + by this amount to get mid_channels. This parameter is valid only + when use_conv_fist=True.Defaults to 0.5. + conv_cfg (dict): Config dict for convolution layer. Defaults to None. + which means using conv2d. Defaults to None. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. 
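+
+    Example:
+        A minimal sketch with arbitrary shapes.
+
+        >>> import torch
+        >>> block = CSPSPPFBottleneck(512, 512)
+        >>> block(torch.randn(1, 512, 20, 20)).shape
+        torch.Size([1, 512, 20, 20])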
+ """ + + def __init__(self, + in_channels: int, + out_channels: int, + kernel_sizes: Union[int, Sequence[int]] = 5, + use_conv_first: bool = True, + mid_channels_scale: float = 0.5, + conv_cfg: ConfigType = None, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None): + super().__init__(init_cfg) + + if use_conv_first: + mid_channels = int(in_channels * mid_channels_scale) + self.conv1 = ConvModule( + in_channels, + mid_channels, + 1, + stride=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.conv3 = ConvModule( + mid_channels, + mid_channels, + 3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.conv4 = ConvModule( + mid_channels, + mid_channels, + 1, + stride=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + else: + mid_channels = in_channels + self.conv1 = None + self.conv3 = None + self.conv4 = None + + self.conv2 = ConvModule( + in_channels, + mid_channels, + 1, + stride=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self.kernel_sizes = kernel_sizes + + if isinstance(kernel_sizes, int): + self.poolings = nn.MaxPool2d( + kernel_size=kernel_sizes, stride=1, padding=kernel_sizes // 2) + conv2_in_channels = mid_channels * 4 + else: + self.poolings = nn.ModuleList([ + nn.MaxPool2d(kernel_size=ks, stride=1, padding=ks // 2) + for ks in kernel_sizes + ]) + conv2_in_channels = mid_channels * (len(kernel_sizes) + 1) + + self.conv5 = ConvModule( + conv2_in_channels, + mid_channels, + 1, + stride=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.conv6 = ConvModule( + mid_channels, + mid_channels, + 3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.conv7 = ConvModule( + mid_channels * 2, + out_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + def forward(self, x: Tensor) -> Tensor: + """Forward process + Args: + x (Tensor): The input tensor. + """ + x0 = self.conv4(self.conv3(self.conv1(x))) if self.conv1 else x + y = self.conv2(x) + + if isinstance(self.kernel_sizes, int): + x1 = self.poolings(x0) + x2 = self.poolings(x1) + x3 = torch.cat([x0, x1, x2, self.poolings(x2)], dim=1) + else: + x3 = torch.cat( + [x0] + [pooling(x0) for pooling in self.poolings], dim=1) + + x3 = self.conv6(self.conv5(x3)) + x = self.conv7(torch.cat([y, x3], dim=1)) + return x diff --git a/third_party/mmyolo/mmyolo/models/losses/__init__.py b/third_party/mmyolo/mmyolo/models/losses/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c89fe4dc45ace2583241cff11542d1fbf8bdc73a --- /dev/null +++ b/third_party/mmyolo/mmyolo/models/losses/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .iou_loss import IoULoss, bbox_overlaps +from .oks_loss import OksLoss + +__all__ = ['IoULoss', 'bbox_overlaps', 'OksLoss'] diff --git a/third_party/mmyolo/mmyolo/models/losses/iou_loss.py b/third_party/mmyolo/mmyolo/models/losses/iou_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..e3d3dc40ef3e678989db85ee8cfd0035a26a9f19 --- /dev/null +++ b/third_party/mmyolo/mmyolo/models/losses/iou_loss.py @@ -0,0 +1,232 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
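+# Usage note (illustrative): ``IoULoss`` below is registered in ``MODELS`` and
+# is typically built from a config dict such as
+# ``dict(type='IoULoss', iou_mode='ciou', bbox_format='xywh')``; with the
+# default ``return_iou=True`` its forward pass returns ``(loss, iou)``.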
+import math +from typing import Optional, Tuple, Union + +import torch +import torch.nn as nn +from mmdet.models.losses.utils import weight_reduce_loss +from mmdet.structures.bbox import HorizontalBoxes + +from mmyolo.registry import MODELS + + +def bbox_overlaps(pred: torch.Tensor, + target: torch.Tensor, + iou_mode: str = 'ciou', + bbox_format: str = 'xywh', + siou_theta: float = 4.0, + eps: float = 1e-7) -> torch.Tensor: + r"""Calculate overlap between two set of bboxes. + `Implementation of paper `Enhancing Geometric Factors into + Model Learning and Inference for Object Detection and Instance + Segmentation `_. + + In the CIoU implementation of YOLOv5 and MMDetection, there is a slight + difference in the way the alpha parameter is computed. + + mmdet version: + alpha = (ious > 0.5).float() * v / (1 - ious + v) + YOLOv5 version: + alpha = v / (v - ious + (1 + eps) + + Args: + pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2) + or (x, y, w, h),shape (n, 4). + target (Tensor): Corresponding gt bboxes, shape (n, 4). + iou_mode (str): Options are ('iou', 'ciou', 'giou', 'siou'). + Defaults to "ciou". + bbox_format (str): Options are "xywh" and "xyxy". + Defaults to "xywh". + siou_theta (float): siou_theta for SIoU when calculate shape cost. + Defaults to 4.0. + eps (float): Eps to avoid log(0). + + Returns: + Tensor: shape (n, ). + """ + assert iou_mode in ('iou', 'ciou', 'giou', 'siou') + assert bbox_format in ('xyxy', 'xywh') + if bbox_format == 'xywh': + pred = HorizontalBoxes.cxcywh_to_xyxy(pred) + target = HorizontalBoxes.cxcywh_to_xyxy(target) + + bbox1_x1, bbox1_y1 = pred[..., 0], pred[..., 1] + bbox1_x2, bbox1_y2 = pred[..., 2], pred[..., 3] + bbox2_x1, bbox2_y1 = target[..., 0], target[..., 1] + bbox2_x2, bbox2_y2 = target[..., 2], target[..., 3] + + # Overlap + overlap = (torch.min(bbox1_x2, bbox2_x2) - + torch.max(bbox1_x1, bbox2_x1)).clamp(0) * \ + (torch.min(bbox1_y2, bbox2_y2) - + torch.max(bbox1_y1, bbox2_y1)).clamp(0) + + # Union + w1, h1 = bbox1_x2 - bbox1_x1, bbox1_y2 - bbox1_y1 + w2, h2 = bbox2_x2 - bbox2_x1, bbox2_y2 - bbox2_y1 + union = (w1 * h1) + (w2 * h2) - overlap + eps + + h1 = bbox1_y2 - bbox1_y1 + eps + h2 = bbox2_y2 - bbox2_y1 + eps + + # IoU + ious = overlap / union + + # enclose area + enclose_x1y1 = torch.min(pred[..., :2], target[..., :2]) + enclose_x2y2 = torch.max(pred[..., 2:], target[..., 2:]) + enclose_wh = (enclose_x2y2 - enclose_x1y1).clamp(min=0) + + enclose_w = enclose_wh[..., 0] # cw + enclose_h = enclose_wh[..., 1] # ch + + if iou_mode == 'ciou': + # CIoU = IoU - ( (ρ^2(b_pred,b_gt) / c^2) + (alpha x v) ) + + # calculate enclose area (c^2) + enclose_area = enclose_w**2 + enclose_h**2 + eps + + # calculate ρ^2(b_pred,b_gt): + # euclidean distance between b_pred(bbox2) and b_gt(bbox1) + # center point, because bbox format is xyxy -> left-top xy and + # right-bottom xy, so need to / 4 to get center point. 
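+        # i.e. with center points cx = (x1 + x2) / 2 and cy = (y1 + y2) / 2,
+        # (cx_gt - cx_pred)^2 = ((x1 + x2)_gt - (x1 + x2)_pred)^2 / 4,
+        # which is why the squared sums below are divided by 4.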
+ rho2_left_item = ((bbox2_x1 + bbox2_x2) - (bbox1_x1 + bbox1_x2))**2 / 4 + rho2_right_item = ((bbox2_y1 + bbox2_y2) - + (bbox1_y1 + bbox1_y2))**2 / 4 + rho2 = rho2_left_item + rho2_right_item # rho^2 (ρ^2) + + # Width and height ratio (v) + wh_ratio = (4 / (math.pi**2)) * torch.pow( + torch.atan(w2 / h2) - torch.atan(w1 / h1), 2) + + with torch.no_grad(): + alpha = wh_ratio / (wh_ratio - ious + (1 + eps)) + + # CIoU + ious = ious - ((rho2 / enclose_area) + (alpha * wh_ratio)) + + elif iou_mode == 'giou': + # GIoU = IoU - ( (A_c - union) / A_c ) + convex_area = enclose_w * enclose_h + eps # convex area (A_c) + ious = ious - (convex_area - union) / convex_area + + elif iou_mode == 'siou': + # SIoU: https://arxiv.org/pdf/2205.12740.pdf + # SIoU = IoU - ( (Distance Cost + Shape Cost) / 2 ) + + # calculate sigma (σ): + # euclidean distance between bbox2(pred) and bbox1(gt) center point, + # sigma_cw = b_cx_gt - b_cx + sigma_cw = (bbox2_x1 + bbox2_x2) / 2 - (bbox1_x1 + bbox1_x2) / 2 + eps + # sigma_ch = b_cy_gt - b_cy + sigma_ch = (bbox2_y1 + bbox2_y2) / 2 - (bbox1_y1 + bbox1_y2) / 2 + eps + # sigma = √( (sigma_cw ** 2) - (sigma_ch ** 2) ) + sigma = torch.pow(sigma_cw**2 + sigma_ch**2, 0.5) + + # choose minimize alpha, sin(alpha) + sin_alpha = torch.abs(sigma_ch) / sigma + sin_beta = torch.abs(sigma_cw) / sigma + sin_alpha = torch.where(sin_alpha <= math.sin(math.pi / 4), sin_alpha, + sin_beta) + + # Angle cost = 1 - 2 * ( sin^2 ( arcsin(x) - (pi / 4) ) ) + angle_cost = torch.cos(torch.arcsin(sin_alpha) * 2 - math.pi / 2) + + # Distance cost = Σ_(t=x,y) (1 - e ^ (- γ ρ_t)) + rho_x = (sigma_cw / enclose_w)**2 # ρ_x + rho_y = (sigma_ch / enclose_h)**2 # ρ_y + gamma = 2 - angle_cost # γ + distance_cost = (1 - torch.exp(-1 * gamma * rho_x)) + ( + 1 - torch.exp(-1 * gamma * rho_y)) + + # Shape cost = Ω = Σ_(t=w,h) ( ( 1 - ( e ^ (-ω_t) ) ) ^ θ ) + omiga_w = torch.abs(w1 - w2) / torch.max(w1, w2) # ω_w + omiga_h = torch.abs(h1 - h2) / torch.max(h1, h2) # ω_h + shape_cost = torch.pow(1 - torch.exp(-1 * omiga_w), + siou_theta) + torch.pow( + 1 - torch.exp(-1 * omiga_h), siou_theta) + + ious = ious - ((distance_cost + shape_cost) * 0.5) + + return ious.clamp(min=-1.0, max=1.0) + + +@MODELS.register_module() +class IoULoss(nn.Module): + """IoULoss. + + Computing the IoU loss between a set of predicted bboxes and target bboxes. + Args: + iou_mode (str): Options are "ciou". + Defaults to "ciou". + bbox_format (str): Options are "xywh" and "xyxy". + Defaults to "xywh". + eps (float): Eps to avoid log(0). + reduction (str): Options are "none", "mean" and "sum". + loss_weight (float): Weight of loss. + return_iou (bool): If True, return loss and iou. + """ + + def __init__(self, + iou_mode: str = 'ciou', + bbox_format: str = 'xywh', + eps: float = 1e-7, + reduction: str = 'mean', + loss_weight: float = 1.0, + return_iou: bool = True): + super().__init__() + assert bbox_format in ('xywh', 'xyxy') + assert iou_mode in ('ciou', 'siou', 'giou') + self.iou_mode = iou_mode + self.bbox_format = bbox_format + self.eps = eps + self.reduction = reduction + self.loss_weight = loss_weight + self.return_iou = return_iou + + def forward( + self, + pred: torch.Tensor, + target: torch.Tensor, + weight: Optional[torch.Tensor] = None, + avg_factor: Optional[float] = None, + reduction_override: Optional[Union[str, bool]] = None + ) -> Tuple[Union[torch.Tensor, torch.Tensor], torch.Tensor]: + """Forward function. + + Args: + pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2) + or (x, y, w, h),shape (n, 4). 
+ target (Tensor): Corresponding gt bboxes, shape (n, 4). + weight (Tensor, optional): Element-wise weights. + avg_factor (float, optional): Average factor when computing the + mean of losses. + reduction_override (str, bool, optional): Same as built-in losses + of PyTorch. Defaults to None. + Returns: + loss or tuple(loss, iou): + """ + if weight is not None and not torch.any(weight > 0): + if pred.dim() == weight.dim() + 1: + weight = weight.unsqueeze(1) + return (pred * weight).sum() # 0 + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + + if weight is not None and weight.dim() > 1: + weight = weight.mean(-1) + + iou = bbox_overlaps( + pred, + target, + iou_mode=self.iou_mode, + bbox_format=self.bbox_format, + eps=self.eps) + loss = self.loss_weight * weight_reduce_loss(1.0 - iou, weight, + reduction, avg_factor) + + if self.return_iou: + return loss, iou + else: + return loss diff --git a/third_party/mmyolo/mmyolo/models/losses/oks_loss.py b/third_party/mmyolo/mmyolo/models/losses/oks_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..62c63422b3d13ade5164f23a9537a01847ff358d --- /dev/null +++ b/third_party/mmyolo/mmyolo/models/losses/oks_loss.py @@ -0,0 +1,91 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional + +import torch +import torch.nn as nn +from torch import Tensor + +from mmyolo.registry import MODELS + +try: + from mmpose.datasets.datasets.utils import parse_pose_metainfo +except ImportError: + parse_pose_metainfo = None + + +@MODELS.register_module() +class OksLoss(nn.Module): + """A PyTorch implementation of the Object Keypoint Similarity (OKS) loss as + described in the paper "YOLO-Pose: Enhancing YOLO for Multi Person Pose + Estimation Using Object Keypoint Similarity Loss" by Debapriya et al. + + (2022). + The OKS loss is used for keypoint-based object recognition and consists + of a measure of the similarity between predicted and ground truth + keypoint locations, adjusted by the size of the object in the image. + The loss function takes as input the predicted keypoint locations, the + ground truth keypoint locations, a mask indicating which keypoints are + valid, and bounding boxes for the objects. + Args: + metainfo (Optional[str]): Path to a JSON file containing information + about the dataset's annotations. + loss_weight (float): Weight for the loss. + """ + + def __init__(self, + metainfo: Optional[str] = None, + loss_weight: float = 1.0): + super().__init__() + + if metainfo is not None: + if parse_pose_metainfo is None: + raise ImportError( + 'Please run "mim install -r requirements/mmpose.txt" ' + 'to install mmpose first for OksLossn.') + metainfo = parse_pose_metainfo(dict(from_file=metainfo)) + sigmas = metainfo.get('sigmas', None) + if sigmas is not None: + self.register_buffer('sigmas', torch.as_tensor(sigmas)) + self.loss_weight = loss_weight + + def forward(self, + output: Tensor, + target: Tensor, + target_weights: Tensor, + bboxes: Optional[Tensor] = None) -> Tensor: + oks = self.compute_oks(output, target, target_weights, bboxes) + loss = 1 - oks + return loss * self.loss_weight + + def compute_oks(self, + output: Tensor, + target: Tensor, + target_weights: Tensor, + bboxes: Optional[Tensor] = None) -> Tensor: + """Calculates the OKS loss. + + Args: + output (Tensor): Predicted keypoints in shape N x k x 2, where N + is batch size, k is the number of keypoints, and 2 are the + xy coordinates. 
+ target (Tensor): Ground truth keypoints in the same shape as + output. + target_weights (Tensor): Mask of valid keypoints in shape N x k, + with 1 for valid and 0 for invalid. + bboxes (Optional[Tensor]): Bounding boxes in shape N x 4, + where 4 are the xyxy coordinates. + Returns: + Tensor: The calculated OKS loss. + """ + + dist = torch.norm(output - target, dim=-1) + + if hasattr(self, 'sigmas'): + sigmas = self.sigmas.reshape(*((1, ) * (dist.ndim - 1)), -1) + dist = dist / sigmas + if bboxes is not None: + area = torch.norm(bboxes[..., 2:] - bboxes[..., :2], dim=-1) + dist = dist / area.clip(min=1e-8).unsqueeze(-1) + + return (torch.exp(-dist.pow(2) / 2) * target_weights).sum( + dim=-1) / target_weights.sum(dim=-1).clip(min=1e-8) diff --git a/third_party/mmyolo/mmyolo/models/necks/__init__.py b/third_party/mmyolo/mmyolo/models/necks/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..159fae8d6e248330e49919420bf82154d905ad6c --- /dev/null +++ b/third_party/mmyolo/mmyolo/models/necks/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .base_yolo_neck import BaseYOLONeck +from .cspnext_pafpn import CSPNeXtPAFPN +from .ppyoloe_csppan import PPYOLOECSPPAFPN +from .yolov5_pafpn import YOLOv5PAFPN +from .yolov6_pafpn import (YOLOv6CSPRepBiPAFPN, YOLOv6CSPRepPAFPN, + YOLOv6RepBiPAFPN, YOLOv6RepPAFPN) +from .yolov7_pafpn import YOLOv7PAFPN +from .yolov8_pafpn import YOLOv8PAFPN +from .yolox_pafpn import YOLOXPAFPN + +__all__ = [ + 'YOLOv5PAFPN', 'BaseYOLONeck', 'YOLOv6RepPAFPN', 'YOLOXPAFPN', + 'CSPNeXtPAFPN', 'YOLOv7PAFPN', 'PPYOLOECSPPAFPN', 'YOLOv6CSPRepPAFPN', + 'YOLOv8PAFPN', 'YOLOv6RepBiPAFPN', 'YOLOv6CSPRepBiPAFPN' +] diff --git a/third_party/mmyolo/mmyolo/models/necks/base_yolo_neck.py b/third_party/mmyolo/mmyolo/models/necks/base_yolo_neck.py new file mode 100644 index 0000000000000000000000000000000000000000..8825b7634f54df624f56d0cd0beef4d0e4658788 --- /dev/null +++ b/third_party/mmyolo/mmyolo/models/necks/base_yolo_neck.py @@ -0,0 +1,261 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta, abstractmethod +from typing import List, Union + +import torch +import torch.nn as nn +from mmdet.utils import ConfigType, OptMultiConfig +from mmengine.model import BaseModule +from torch.nn.modules.batchnorm import _BatchNorm + +from mmyolo.registry import MODELS + + +@MODELS.register_module() +class BaseYOLONeck(BaseModule, metaclass=ABCMeta): + """Base neck used in YOLO series. + + .. 
code:: text + + P5 neck model structure diagram + +--------+ +-------+ + |top_down|----------+--------->| out |---> output0 + | layer1 | | | layer0| + +--------+ | +-------+ + stride=8 ^ | + idx=0 +------+ +--------+ | + -----> |reduce|--->| cat | | + |layer0| +--------+ | + +------+ ^ v + +--------+ +-----------+ + |upsample| |downsample | + | layer1 | | layer0 | + +--------+ +-----------+ + ^ | + +--------+ v + |top_down| +-----------+ + | layer2 |--->| cat | + +--------+ +-----------+ + stride=16 ^ v + idx=1 +------+ +--------+ +-----------+ +-------+ + -----> |reduce|--->| cat | | bottom_up |--->| out |---> output1 + |layer1| +--------+ | layer0 | | layer1| + +------+ ^ +-----------+ +-------+ + | v + +--------+ +-----------+ + |upsample| |downsample | + | layer2 | | layer1 | + stride=32 +--------+ +-----------+ + idx=2 +------+ ^ v + -----> |reduce| | +-----------+ + |layer2|---------+------->| cat | + +------+ +-----------+ + v + +-----------+ +-------+ + | bottom_up |--->| out |---> output2 + | layer1 | | layer2| + +-----------+ +-------+ + + .. code:: text + + P6 neck model structure diagram + +--------+ +-------+ + |top_down|----------+--------->| out |---> output0 + | layer1 | | | layer0| + +--------+ | +-------+ + stride=8 ^ | + idx=0 +------+ +--------+ | + -----> |reduce|--->| cat | | + |layer0| +--------+ | + +------+ ^ v + +--------+ +-----------+ + |upsample| |downsample | + | layer1 | | layer0 | + +--------+ +-----------+ + ^ | + +--------+ v + |top_down| +-----------+ + | layer2 |--->| cat | + +--------+ +-----------+ + stride=16 ^ v + idx=1 +------+ +--------+ +-----------+ +-------+ + -----> |reduce|--->| cat | | bottom_up |--->| out |---> output1 + |layer1| +--------+ | layer0 | | layer1| + +------+ ^ +-----------+ +-------+ + | v + +--------+ +-----------+ + |upsample| |downsample | + | layer2 | | layer1 | + +--------+ +-----------+ + ^ | + +--------+ v + |top_down| +-----------+ + | layer3 |--->| cat | + +--------+ +-----------+ + stride=32 ^ v + idx=2 +------+ +--------+ +-----------+ +-------+ + -----> |reduce|--->| cat | | bottom_up |--->| out |---> output2 + |layer2| +--------+ | layer1 | | layer2| + +------+ ^ +-----------+ +-------+ + | v + +--------+ +-----------+ + |upsample| |downsample | + | layer3 | | layer2 | + +--------+ +-----------+ + stride=64 ^ v + idx=3 +------+ | +-----------+ + -----> |reduce|---------+------->| cat | + |layer3| +-----------+ + +------+ v + +-----------+ +-------+ + | bottom_up |--->| out |---> output3 + | layer2 | | layer3| + +-----------+ +-------+ + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale) + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + upsample_feats_cat_first (bool): Whether the output features are + concat first after upsampling in the topdown module. + Defaults to True. Currently only YOLOv7 is false. + freeze_all(bool): Whether to freeze the model. Defaults to False + norm_cfg (dict): Config dict for normalization layer. + Defaults to None. + act_cfg (dict): Config dict for activation layer. + Defaults to None. + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. 
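The diagrams above are hard to parse in a diff view. The sketch below traces the same top-down/bottom-up dataflow with stand-in blocks, so only the concatenations and resolution changes remain; it is a shape walkthrough, not the `BaseYOLONeck.forward` defined later in this file, which additionally applies reduce/top-down/bottom-up/out layers that bring the channel counts back down.

```python
import torch
import torch.nn as nn


def pafpn_dataflow_sketch(feats):
    """Dataflow of the P5 diagram above with placeholder blocks (shapes only).

    feats: [C3, C4, C5] feature maps at strides 8/16/32 with equal channels.
    """
    upsample = nn.Upsample(scale_factor=2, mode='nearest')
    downsample = nn.MaxPool2d(kernel_size=2, stride=2)  # stand-in for a stride-2 conv

    # top-down path: start from the deepest level, fuse towards stride 8
    inner = [feats[-1]]
    for idx in range(len(feats) - 1, 0, -1):
        up = upsample(inner[0])
        inner.insert(0, torch.cat([up, feats[idx - 1]], dim=1))  # "cat" boxes in the diagram

    # bottom-up path: start from the shallowest fused level, fuse towards stride 32
    outs = [inner[0]]
    for idx in range(len(feats) - 1):
        down = downsample(outs[-1])
        outs.append(torch.cat([down, inner[idx + 1]], dim=1))
    return tuple(outs)


# hypothetical 64-channel pyramid for a 256x256 input; channels grow here only
# because the placeholder blocks do not restore widths like the real CSP layers do
feats = [torch.randn(1, 64, 32, 32), torch.randn(1, 64, 16, 16), torch.randn(1, 64, 8, 8)]
for out in pafpn_dataflow_sketch(feats):
    print(out.shape)
```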
+ """ + + def __init__(self, + in_channels: List[int], + out_channels: Union[int, List[int]], + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + upsample_feats_cat_first: bool = True, + freeze_all: bool = False, + norm_cfg: ConfigType = None, + act_cfg: ConfigType = None, + init_cfg: OptMultiConfig = None, + **kwargs): + super().__init__(init_cfg) + self.in_channels = in_channels + self.out_channels = out_channels + self.deepen_factor = deepen_factor + self.widen_factor = widen_factor + self.upsample_feats_cat_first = upsample_feats_cat_first + self.freeze_all = freeze_all + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + + self.reduce_layers = nn.ModuleList() + for idx in range(len(in_channels)): + self.reduce_layers.append(self.build_reduce_layer(idx)) + + # build top-down blocks + self.upsample_layers = nn.ModuleList() + self.top_down_layers = nn.ModuleList() + for idx in range(len(in_channels) - 1, 0, -1): + self.upsample_layers.append(self.build_upsample_layer(idx)) + self.top_down_layers.append(self.build_top_down_layer(idx)) + + # build bottom-up blocks + self.downsample_layers = nn.ModuleList() + self.bottom_up_layers = nn.ModuleList() + for idx in range(len(in_channels) - 1): + self.downsample_layers.append(self.build_downsample_layer(idx)) + self.bottom_up_layers.append(self.build_bottom_up_layer(idx)) + + self.out_layers = nn.ModuleList() + for idx in range(len(in_channels)): + self.out_layers.append(self.build_out_layer(idx)) + + @abstractmethod + def build_reduce_layer(self, idx: int): + """build reduce layer.""" + pass + + @abstractmethod + def build_upsample_layer(self, idx: int): + """build upsample layer.""" + pass + + @abstractmethod + def build_top_down_layer(self, idx: int): + """build top down layer.""" + pass + + @abstractmethod + def build_downsample_layer(self, idx: int): + """build downsample layer.""" + pass + + @abstractmethod + def build_bottom_up_layer(self, idx: int): + """build bottom up layer.""" + pass + + @abstractmethod + def build_out_layer(self, idx: int): + """build out layer.""" + pass + + def _freeze_all(self): + """Freeze the model.""" + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def train(self, mode=True): + """Convert the model into training mode while keep the normalization + layer freezed.""" + super().train(mode) + if self.freeze_all: + self._freeze_all() + + def forward(self, inputs: List[torch.Tensor]) -> tuple: + """Forward function.""" + assert len(inputs) == len(self.in_channels) + # reduce layers + reduce_outs = [] + for idx in range(len(self.in_channels)): + reduce_outs.append(self.reduce_layers[idx](inputs[idx])) + + # top-down path + inner_outs = [reduce_outs[-1]] + for idx in range(len(self.in_channels) - 1, 0, -1): + feat_high = inner_outs[0] + feat_low = reduce_outs[idx - 1] + upsample_feat = self.upsample_layers[len(self.in_channels) - 1 - + idx]( + feat_high) + if self.upsample_feats_cat_first: + top_down_layer_inputs = torch.cat([upsample_feat, feat_low], 1) + else: + top_down_layer_inputs = torch.cat([feat_low, upsample_feat], 1) + inner_out = self.top_down_layers[len(self.in_channels) - 1 - idx]( + top_down_layer_inputs) + inner_outs.insert(0, inner_out) + + # bottom-up path + outs = [inner_outs[0]] + for idx in range(len(self.in_channels) - 1): + feat_low = outs[-1] + feat_high = inner_outs[idx + 1] + downsample_feat = self.downsample_layers[idx](feat_low) + out = self.bottom_up_layers[idx]( + torch.cat([downsample_feat, 
feat_high], 1)) + outs.append(out) + + # out_layers + results = [] + for idx in range(len(self.in_channels)): + results.append(self.out_layers[idx](outs[idx])) + + return tuple(results) diff --git a/third_party/mmyolo/mmyolo/models/necks/cspnext_pafpn.py b/third_party/mmyolo/mmyolo/models/necks/cspnext_pafpn.py new file mode 100644 index 0000000000000000000000000000000000000000..310126f63e12f888daac50ca30674484f7b3a6ec --- /dev/null +++ b/third_party/mmyolo/mmyolo/models/necks/cspnext_pafpn.py @@ -0,0 +1,201 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from typing import Sequence + +import torch.nn as nn +from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule +from mmdet.models.backbones.csp_darknet import CSPLayer +from mmdet.utils import ConfigType, OptMultiConfig + +from mmyolo.registry import MODELS +from .base_yolo_neck import BaseYOLONeck + + +@MODELS.register_module() +class CSPNeXtPAFPN(BaseYOLONeck): + """Path Aggregation Network with CSPNeXt blocks. + + Args: + in_channels (Sequence[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale) + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + num_csp_blocks (int): Number of bottlenecks in CSPLayer. + Defaults to 3. + use_depthwise (bool): Whether to use depthwise separable convolution in + blocks. Defaults to False. + expand_ratio (float): Ratio to adjust the number of channels of the + hidden layer. Defaults to 0.5. + upsample_cfg (dict): Config dict for interpolate layer. + Default: `dict(scale_factor=2, mode='nearest')` + conv_cfg (dict, optional): Config dict for convolution layer. + Default: None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN') + act_cfg (dict): Config dict for activation layer. + Default: dict(type='SiLU', inplace=True) + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None. + """ + + def __init__( + self, + in_channels: Sequence[int], + out_channels: int, + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + num_csp_blocks: int = 3, + freeze_all: bool = False, + use_depthwise: bool = False, + expand_ratio: float = 0.5, + upsample_cfg: ConfigType = dict(scale_factor=2, mode='nearest'), + conv_cfg: bool = None, + norm_cfg: ConfigType = dict(type='BN'), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = dict( + type='Kaiming', + layer='Conv2d', + a=math.sqrt(5), + distribution='uniform', + mode='fan_in', + nonlinearity='leaky_relu') + ) -> None: + self.num_csp_blocks = round(num_csp_blocks * deepen_factor) + self.conv = DepthwiseSeparableConvModule \ + if use_depthwise else ConvModule + self.upsample_cfg = upsample_cfg + self.expand_ratio = expand_ratio + self.conv_cfg = conv_cfg + + super().__init__( + in_channels=[ + int(channel * widen_factor) for channel in in_channels + ], + out_channels=int(out_channels * widen_factor), + deepen_factor=deepen_factor, + widen_factor=widen_factor, + freeze_all=freeze_all, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + init_cfg=init_cfg) + + def build_reduce_layer(self, idx: int) -> nn.Module: + """build reduce layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The reduce layer. 
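Like the other necks in this diff, `CSPNeXtPAFPN` is normally instantiated from a config dict through the registry rather than constructed directly. The snippet below is a hypothetical config fragment assembled only from the constructor signature shown above; the concrete channel numbers are assumptions, not values taken from any shipped config.

```python
# hypothetical neck config; channel widths are illustrative only
neck = dict(
    type='CSPNeXtPAFPN',
    in_channels=[256, 512, 1024],   # per-scale backbone channels (assumed)
    out_channels=256,
    deepen_factor=1.0,
    widen_factor=1.0,
    num_csp_blocks=3,
    use_depthwise=False,
    expand_ratio=0.5,
    norm_cfg=dict(type='BN'),
    act_cfg=dict(type='SiLU', inplace=True))

# with mmyolo installed and registered, the module would be built roughly as:
# from mmyolo.registry import MODELS
# neck_module = MODELS.build(neck)
```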
+ """ + if idx == len(self.in_channels) - 1: + layer = self.conv( + self.in_channels[idx], + self.in_channels[idx - 1], + 1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + else: + layer = nn.Identity() + + return layer + + def build_upsample_layer(self, *args, **kwargs) -> nn.Module: + """build upsample layer.""" + return nn.Upsample(**self.upsample_cfg) + + def build_top_down_layer(self, idx: int) -> nn.Module: + """build top down layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The top down layer. + """ + if idx == 1: + return CSPLayer( + self.in_channels[idx - 1] * 2, + self.in_channels[idx - 1], + num_blocks=self.num_csp_blocks, + add_identity=False, + use_cspnext_block=True, + expand_ratio=self.expand_ratio, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + else: + return nn.Sequential( + CSPLayer( + self.in_channels[idx - 1] * 2, + self.in_channels[idx - 1], + num_blocks=self.num_csp_blocks, + add_identity=False, + use_cspnext_block=True, + expand_ratio=self.expand_ratio, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + self.conv( + self.in_channels[idx - 1], + self.in_channels[idx - 2], + kernel_size=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + + def build_downsample_layer(self, idx: int) -> nn.Module: + """build downsample layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The downsample layer. + """ + return self.conv( + self.in_channels[idx], + self.in_channels[idx], + kernel_size=3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def build_bottom_up_layer(self, idx: int) -> nn.Module: + """build bottom up layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The bottom up layer. + """ + return CSPLayer( + self.in_channels[idx] * 2, + self.in_channels[idx + 1], + num_blocks=self.num_csp_blocks, + add_identity=False, + use_cspnext_block=True, + expand_ratio=self.expand_ratio, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def build_out_layer(self, idx: int) -> nn.Module: + """build out layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The out layer. + """ + return self.conv( + self.in_channels[idx], + self.out_channels, + 3, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) diff --git a/third_party/mmyolo/mmyolo/models/necks/ppyoloe_csppan.py b/third_party/mmyolo/mmyolo/models/necks/ppyoloe_csppan.py new file mode 100644 index 0000000000000000000000000000000000000000..4e4ef7200bfc6784a7ce8d92bcfbc46314e518e9 --- /dev/null +++ b/third_party/mmyolo/mmyolo/models/necks/ppyoloe_csppan.py @@ -0,0 +1,216 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List + +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmdet.utils import ConfigType, OptMultiConfig + +from mmyolo.models.backbones.csp_resnet import CSPResLayer +from mmyolo.models.necks import BaseYOLONeck +from mmyolo.registry import MODELS + + +@MODELS.register_module() +class PPYOLOECSPPAFPN(BaseYOLONeck): + """CSPPAN in PPYOLOE. + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (List[int]): Number of output channels + (used at each scale). + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + freeze_all(bool): Whether to freeze the model. 
+ num_csplayer (int): Number of `CSPResLayer` in per layer. + Defaults to 1. + num_blocks_per_layer (int): Number of blocks per `CSPResLayer`. + Defaults to 3. + block_cfg (dict): Config dict for block. Defaults to + dict(type='PPYOLOEBasicBlock', shortcut=True, use_alpha=False) + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.1, eps=1e-5). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + drop_block_cfg (dict, optional): Drop block config. + Defaults to None. If you want to use Drop block after + `CSPResLayer`, you can set this para as + dict(type='mmdet.DropBlock', drop_prob=0.1, + block_size=3, warm_iters=0). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. + use_spp (bool): Whether to use `SPP` in reduce layer. + Defaults to False. + """ + + def __init__(self, + in_channels: List[int] = [256, 512, 1024], + out_channels: List[int] = [256, 512, 1024], + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + freeze_all: bool = False, + num_csplayer: int = 1, + num_blocks_per_layer: int = 3, + block_cfg: ConfigType = dict( + type='PPYOLOEBasicBlock', shortcut=False, + use_alpha=False), + norm_cfg: ConfigType = dict( + type='BN', momentum=0.1, eps=1e-5), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + drop_block_cfg: ConfigType = None, + init_cfg: OptMultiConfig = None, + use_spp: bool = False): + self.block_cfg = block_cfg + self.num_csplayer = num_csplayer + self.num_blocks_per_layer = round(num_blocks_per_layer * deepen_factor) + # Only use spp in last reduce_layer, if use_spp=True. + self.use_spp = use_spp + self.drop_block_cfg = drop_block_cfg + assert drop_block_cfg is None or isinstance(drop_block_cfg, dict) + + super().__init__( + in_channels=[ + int(channel * widen_factor) for channel in in_channels + ], + out_channels=[ + int(channel * widen_factor) for channel in out_channels + ], + deepen_factor=deepen_factor, + widen_factor=widen_factor, + freeze_all=freeze_all, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + init_cfg=init_cfg) + + def build_reduce_layer(self, idx: int): + """build reduce layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The reduce layer. + """ + if idx == len(self.in_channels) - 1: + # fpn_stage + in_channels = self.in_channels[idx] + out_channels = self.out_channels[idx] + + layer = [ + CSPResLayer( + in_channels=in_channels if i == 0 else out_channels, + out_channels=out_channels, + num_block=self.num_blocks_per_layer, + block_cfg=self.block_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + attention_cfg=None, + use_spp=self.use_spp) for i in range(self.num_csplayer) + ] + + if self.drop_block_cfg: + layer.append(MODELS.build(self.drop_block_cfg)) + layer = nn.Sequential(*layer) + else: + layer = nn.Identity() + + return layer + + def build_upsample_layer(self, idx: int) -> nn.Module: + """build upsample layer.""" + # fpn_route + in_channels = self.out_channels[idx] + return nn.Sequential( + ConvModule( + in_channels=in_channels, + out_channels=in_channels // 2, + kernel_size=1, + stride=1, + padding=0, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + nn.Upsample(scale_factor=2, mode='nearest')) + + def build_top_down_layer(self, idx: int) -> nn.Module: + """build top down layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The top down layer. 
+ """ + # fpn_stage + in_channels = self.in_channels[idx - 1] + self.out_channels[idx] // 2 + out_channels = self.out_channels[idx - 1] + + layer = [ + CSPResLayer( + in_channels=in_channels if i == 0 else out_channels, + out_channels=out_channels, + num_block=self.num_blocks_per_layer, + block_cfg=self.block_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + attention_cfg=None, + use_spp=False) for i in range(self.num_csplayer) + ] + + if self.drop_block_cfg: + layer.append(MODELS.build(self.drop_block_cfg)) + + return nn.Sequential(*layer) + + def build_downsample_layer(self, idx: int) -> nn.Module: + """build downsample layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The downsample layer. + """ + # pan_route + return ConvModule( + in_channels=self.out_channels[idx], + out_channels=self.out_channels[idx], + kernel_size=3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def build_bottom_up_layer(self, idx: int) -> nn.Module: + """build bottom up layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The bottom up layer. + """ + # pan_stage + in_channels = self.out_channels[idx + 1] + self.out_channels[idx] + out_channels = self.out_channels[idx + 1] + + layer = [ + CSPResLayer( + in_channels=in_channels if i == 0 else out_channels, + out_channels=out_channels, + num_block=self.num_blocks_per_layer, + block_cfg=self.block_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + attention_cfg=None, + use_spp=False) for i in range(self.num_csplayer) + ] + + if self.drop_block_cfg: + layer.append(MODELS.build(self.drop_block_cfg)) + + return nn.Sequential(*layer) + + def build_out_layer(self, *args, **kwargs) -> nn.Module: + """build out layer.""" + return nn.Identity() diff --git a/third_party/mmyolo/mmyolo/models/necks/yolov5_pafpn.py b/third_party/mmyolo/mmyolo/models/necks/yolov5_pafpn.py new file mode 100644 index 0000000000000000000000000000000000000000..b95147fc512359442aeb1bbc88aadd07031bdadf --- /dev/null +++ b/third_party/mmyolo/mmyolo/models/necks/yolov5_pafpn.py @@ -0,0 +1,171 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Union + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmdet.models.backbones.csp_darknet import CSPLayer +from mmdet.utils import ConfigType, OptMultiConfig + +from mmyolo.registry import MODELS +from ..utils import make_divisible, make_round +from .base_yolo_neck import BaseYOLONeck + + +@MODELS.register_module() +class YOLOv5PAFPN(BaseYOLONeck): + """Path Aggregation Network used in YOLOv5. + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale) + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + num_csp_blocks (int): Number of bottlenecks in CSPLayer. Defaults to 1. + freeze_all(bool): Whether to freeze the model + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. 
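Stepping back to `PPYOLOECSPPAFPN` above: its channel bookkeeping is easy to lose track of because the top-down path concatenates a halved, upsampled output with the next backbone level. With the class defaults (`in_channels = out_channels = [256, 512, 1024]`, `widen_factor = 1.0`) the concatenation widths work out as in the short check below.

```python
in_channels = out_channels = [256, 512, 1024]  # PPYOLOECSPPAFPN defaults, widen_factor=1.0

# top-down: cat(upsampled out[idx] // 2, backbone in[idx-1]) -> out[idx-1]
for idx in (2, 1):
    cat_ch = in_channels[idx - 1] + out_channels[idx] // 2
    print(f'top-down idx={idx}: {cat_ch} -> {out_channels[idx - 1]}')
# top-down idx=2: 1024 -> 512
# top-down idx=1: 512 -> 256

# bottom-up: cat(downsampled out[idx], inner out[idx+1]) -> out[idx+1]
for idx in (0, 1):
    cat_ch = out_channels[idx + 1] + out_channels[idx]
    print(f'bottom-up idx={idx}: {cat_ch} -> {out_channels[idx + 1]}')
# bottom-up idx=0: 768 -> 512
# bottom-up idx=1: 1536 -> 1024
```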
+ """ + + def __init__(self, + in_channels: List[int], + out_channels: Union[List[int], int], + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + num_csp_blocks: int = 1, + freeze_all: bool = False, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None): + self.num_csp_blocks = num_csp_blocks + super().__init__( + in_channels=in_channels, + out_channels=out_channels, + deepen_factor=deepen_factor, + widen_factor=widen_factor, + freeze_all=freeze_all, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + init_cfg=init_cfg) + + def init_weights(self): + if self.init_cfg is None: + """Initialize the parameters.""" + for m in self.modules(): + if isinstance(m, torch.nn.Conv2d): + # In order to be consistent with the source code, + # reset the Conv2d initialization parameters + m.reset_parameters() + else: + super().init_weights() + + def build_reduce_layer(self, idx: int) -> nn.Module: + """build reduce layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The reduce layer. + """ + if idx == len(self.in_channels) - 1: + layer = ConvModule( + make_divisible(self.in_channels[idx], self.widen_factor), + make_divisible(self.in_channels[idx - 1], self.widen_factor), + 1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + else: + layer = nn.Identity() + + return layer + + def build_upsample_layer(self, *args, **kwargs) -> nn.Module: + """build upsample layer.""" + return nn.Upsample(scale_factor=2, mode='nearest') + + def build_top_down_layer(self, idx: int): + """build top down layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The top down layer. + """ + + if idx == 1: + return CSPLayer( + make_divisible(self.in_channels[idx - 1] * 2, + self.widen_factor), + make_divisible(self.in_channels[idx - 1], self.widen_factor), + num_blocks=make_round(self.num_csp_blocks, self.deepen_factor), + add_identity=False, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + else: + return nn.Sequential( + CSPLayer( + make_divisible(self.in_channels[idx - 1] * 2, + self.widen_factor), + make_divisible(self.in_channels[idx - 1], + self.widen_factor), + num_blocks=make_round(self.num_csp_blocks, + self.deepen_factor), + add_identity=False, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + ConvModule( + make_divisible(self.in_channels[idx - 1], + self.widen_factor), + make_divisible(self.in_channels[idx - 2], + self.widen_factor), + kernel_size=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + + def build_downsample_layer(self, idx: int) -> nn.Module: + """build downsample layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The downsample layer. + """ + return ConvModule( + make_divisible(self.in_channels[idx], self.widen_factor), + make_divisible(self.in_channels[idx], self.widen_factor), + kernel_size=3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def build_bottom_up_layer(self, idx: int) -> nn.Module: + """build bottom up layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The bottom up layer. 
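`YOLOv5PAFPN` (and `YOLOv8PAFPN` below) scale widths and depths through the `make_divisible` and `make_round` helpers imported from `..utils`, whose implementations are not part of this diff. The sketch below shows commonly used semantics for such helpers (round channels up to a multiple of 8 after applying `widen_factor`; round block counts with a floor of one block) and should be read as an assumption, not a copy of the repo functions.

```python
import math


def make_divisible_sketch(x: int, widen_factor: float = 1.0, divisor: int = 8) -> int:
    # assumed behaviour: scale channels, then round up to a multiple of `divisor`
    return math.ceil(x * widen_factor / divisor) * divisor


def make_round_sketch(x: int, deepen_factor: float = 1.0) -> int:
    # assumed behaviour: scale block counts, never dropping below one block
    return max(round(x * deepen_factor), 1) if x > 1 else x


print(make_divisible_sketch(512, widen_factor=0.5))    # 256
print(make_divisible_sketch(1024, widen_factor=0.25))  # 256
print(make_round_sketch(3, deepen_factor=0.33))        # 1
```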
+ """ + return CSPLayer( + make_divisible(self.in_channels[idx] * 2, self.widen_factor), + make_divisible(self.in_channels[idx + 1], self.widen_factor), + num_blocks=make_round(self.num_csp_blocks, self.deepen_factor), + add_identity=False, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def build_out_layer(self, *args, **kwargs) -> nn.Module: + """build out layer.""" + return nn.Identity() diff --git a/third_party/mmyolo/mmyolo/models/necks/yolov6_pafpn.py b/third_party/mmyolo/mmyolo/models/necks/yolov6_pafpn.py new file mode 100644 index 0000000000000000000000000000000000000000..87782712352e269f159cc56da6ba6715840c87c7 --- /dev/null +++ b/third_party/mmyolo/mmyolo/models/necks/yolov6_pafpn.py @@ -0,0 +1,527 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmdet.utils import ConfigType, OptMultiConfig + +from mmyolo.registry import MODELS +from ..layers import BepC3StageBlock, BiFusion, RepStageBlock +from ..utils import make_round +from .base_yolo_neck import BaseYOLONeck + + +@MODELS.register_module() +class YOLOv6RepPAFPN(BaseYOLONeck): + """Path Aggregation Network used in YOLOv6. + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale) + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + num_csp_blocks (int): Number of bottlenecks in CSPLayer. Defaults to 1. + freeze_all(bool): Whether to freeze the model. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='ReLU', inplace=True). + block_cfg (dict): Config dict for the block used to build each + layer. Defaults to dict(type='RepVGGBlock'). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + in_channels: List[int], + out_channels: int, + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + num_csp_blocks: int = 12, + freeze_all: bool = False, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='ReLU', inplace=True), + block_cfg: ConfigType = dict(type='RepVGGBlock'), + init_cfg: OptMultiConfig = None): + self.num_csp_blocks = num_csp_blocks + self.block_cfg = block_cfg + super().__init__( + in_channels=in_channels, + out_channels=out_channels, + deepen_factor=deepen_factor, + widen_factor=widen_factor, + freeze_all=freeze_all, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + init_cfg=init_cfg) + + def build_reduce_layer(self, idx: int) -> nn.Module: + """build reduce layer. + + Args: + idx (int): layer idx. + Returns: + nn.Module: The reduce layer. + """ + if idx == 2: + layer = ConvModule( + in_channels=int(self.in_channels[idx] * self.widen_factor), + out_channels=int(self.out_channels[idx - 1] * + self.widen_factor), + kernel_size=1, + stride=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + else: + layer = nn.Identity() + + return layer + + def build_upsample_layer(self, idx: int) -> nn.Module: + """build upsample layer. + + Args: + idx (int): layer idx. + Returns: + nn.Module: The upsample layer. 
+ """ + return nn.ConvTranspose2d( + in_channels=int(self.out_channels[idx - 1] * self.widen_factor), + out_channels=int(self.out_channels[idx - 1] * self.widen_factor), + kernel_size=2, + stride=2, + bias=True) + + def build_top_down_layer(self, idx: int) -> nn.Module: + """build top down layer. + + Args: + idx (int): layer idx. + Returns: + nn.Module: The top down layer. + """ + block_cfg = self.block_cfg.copy() + + layer0 = RepStageBlock( + in_channels=int( + (self.out_channels[idx - 1] + self.in_channels[idx - 1]) * + self.widen_factor), + out_channels=int(self.out_channels[idx - 1] * self.widen_factor), + num_blocks=make_round(self.num_csp_blocks, self.deepen_factor), + block_cfg=block_cfg) + + if idx == 1: + return layer0 + elif idx == 2: + layer1 = ConvModule( + in_channels=int(self.out_channels[idx - 1] * + self.widen_factor), + out_channels=int(self.out_channels[idx - 2] * + self.widen_factor), + kernel_size=1, + stride=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + return nn.Sequential(layer0, layer1) + + def build_downsample_layer(self, idx: int) -> nn.Module: + """build downsample layer. + + Args: + idx (int): layer idx. + Returns: + nn.Module: The downsample layer. + """ + return ConvModule( + in_channels=int(self.out_channels[idx] * self.widen_factor), + out_channels=int(self.out_channels[idx] * self.widen_factor), + kernel_size=3, + stride=2, + padding=3 // 2, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def build_bottom_up_layer(self, idx: int) -> nn.Module: + """build bottom up layer. + + Args: + idx (int): layer idx. + Returns: + nn.Module: The bottom up layer. + """ + block_cfg = self.block_cfg.copy() + + return RepStageBlock( + in_channels=int(self.out_channels[idx] * 2 * self.widen_factor), + out_channels=int(self.out_channels[idx + 1] * self.widen_factor), + num_blocks=make_round(self.num_csp_blocks, self.deepen_factor), + block_cfg=block_cfg) + + def build_out_layer(self, *args, **kwargs) -> nn.Module: + """build out layer.""" + return nn.Identity() + + def init_weights(self): + if self.init_cfg is None: + """Initialize the parameters.""" + for m in self.modules(): + if isinstance(m, torch.nn.Conv2d): + # In order to be consistent with the source code, + # reset the Conv2d initialization parameters + m.reset_parameters() + else: + super().init_weights() + + +@MODELS.register_module() +class YOLOv6CSPRepPAFPN(YOLOv6RepPAFPN): + """Path Aggregation Network used in YOLOv6. + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale) + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + num_csp_blocks (int): Number of bottlenecks in CSPLayer. Defaults to 1. + freeze_all(bool): Whether to freeze the model. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='ReLU', inplace=True). + block_cfg (dict): Config dict for the block used to build each + layer. Defaults to dict(type='RepVGGBlock'). + block_act_cfg (dict): Config dict for activation layer used in each + stage. Defaults to dict(type='SiLU', inplace=True). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. 
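One detail worth calling out in `YOLOv6RepPAFPN` above: unlike the YOLOv5/YOLOX necks, its upsample layer is a learnable `ConvTranspose2d(kernel_size=2, stride=2)` rather than nearest-neighbour interpolation. Both double the spatial resolution, as the quick check below shows; the channel count here is arbitrary.

```python
import torch
import torch.nn as nn

x = torch.randn(1, 64, 16, 16)
print(nn.Upsample(scale_factor=2, mode='nearest')(x).shape)          # parameter-free
print(nn.ConvTranspose2d(64, 64, kernel_size=2, stride=2)(x).shape)  # learnable weights
# both print torch.Size([1, 64, 32, 32])
```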
+ """ + + def __init__(self, + in_channels: List[int], + out_channels: int, + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + hidden_ratio: float = 0.5, + num_csp_blocks: int = 12, + freeze_all: bool = False, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='ReLU', inplace=True), + block_act_cfg: ConfigType = dict(type='SiLU', inplace=True), + block_cfg: ConfigType = dict(type='RepVGGBlock'), + init_cfg: OptMultiConfig = None): + self.hidden_ratio = hidden_ratio + self.block_act_cfg = block_act_cfg + super().__init__( + in_channels=in_channels, + out_channels=out_channels, + deepen_factor=deepen_factor, + widen_factor=widen_factor, + num_csp_blocks=num_csp_blocks, + freeze_all=freeze_all, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + block_cfg=block_cfg, + init_cfg=init_cfg) + + def build_top_down_layer(self, idx: int) -> nn.Module: + """build top down layer. + + Args: + idx (int): layer idx. + Returns: + nn.Module: The top down layer. + """ + block_cfg = self.block_cfg.copy() + + layer0 = BepC3StageBlock( + in_channels=int( + (self.out_channels[idx - 1] + self.in_channels[idx - 1]) * + self.widen_factor), + out_channels=int(self.out_channels[idx - 1] * self.widen_factor), + num_blocks=make_round(self.num_csp_blocks, self.deepen_factor), + block_cfg=block_cfg, + hidden_ratio=self.hidden_ratio, + norm_cfg=self.norm_cfg, + act_cfg=self.block_act_cfg) + + if idx == 1: + return layer0 + elif idx == 2: + layer1 = ConvModule( + in_channels=int(self.out_channels[idx - 1] * + self.widen_factor), + out_channels=int(self.out_channels[idx - 2] * + self.widen_factor), + kernel_size=1, + stride=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + return nn.Sequential(layer0, layer1) + + def build_bottom_up_layer(self, idx: int) -> nn.Module: + """build bottom up layer. + + Args: + idx (int): layer idx. + Returns: + nn.Module: The bottom up layer. + """ + block_cfg = self.block_cfg.copy() + + return BepC3StageBlock( + in_channels=int(self.out_channels[idx] * 2 * self.widen_factor), + out_channels=int(self.out_channels[idx + 1] * self.widen_factor), + num_blocks=make_round(self.num_csp_blocks, self.deepen_factor), + block_cfg=block_cfg, + hidden_ratio=self.hidden_ratio, + norm_cfg=self.norm_cfg, + act_cfg=self.block_act_cfg) + + +@MODELS.register_module() +class YOLOv6RepBiPAFPN(YOLOv6RepPAFPN): + """Path Aggregation Network used in YOLOv6 3.0. + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale) + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + num_csp_blocks (int): Number of bottlenecks in CSPLayer. Defaults to 1. + freeze_all(bool): Whether to freeze the model. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='ReLU', inplace=True). + block_cfg (dict): Config dict for the block used to build each + layer. Defaults to dict(type='RepVGGBlock'). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. 
+ """ + + def __init__(self, + in_channels: List[int], + out_channels: int, + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + num_csp_blocks: int = 12, + freeze_all: bool = False, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='ReLU', inplace=True), + block_cfg: ConfigType = dict(type='RepVGGBlock'), + init_cfg: OptMultiConfig = None): + self.extra_in_channel = in_channels[0] + super().__init__( + in_channels=in_channels[1:], + out_channels=out_channels, + deepen_factor=deepen_factor, + widen_factor=widen_factor, + num_csp_blocks=num_csp_blocks, + freeze_all=freeze_all, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + block_cfg=block_cfg, + init_cfg=init_cfg) + + def build_top_down_layer(self, idx: int) -> nn.Module: + """build top down layer. + + Args: + idx (int): layer idx. + Returns: + nn.Module: The top down layer. + """ + block_cfg = self.block_cfg.copy() + + layer0 = RepStageBlock( + in_channels=int(self.out_channels[idx - 1] * self.widen_factor), + out_channels=int(self.out_channels[idx - 1] * self.widen_factor), + num_blocks=make_round(self.num_csp_blocks, self.deepen_factor), + block_cfg=block_cfg) + + if idx == 1: + return layer0 + elif idx == 2: + layer1 = ConvModule( + in_channels=int(self.out_channels[idx - 1] * + self.widen_factor), + out_channels=int(self.out_channels[idx - 2] * + self.widen_factor), + kernel_size=1, + stride=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + return nn.Sequential(layer0, layer1) + + def build_upsample_layer(self, idx: int) -> nn.Module: + """build upsample layer. + + Args: + idx (int): layer idx. + Returns: + nn.Module: The upsample layer. + """ + in_channels1 = self.in_channels[ + idx - 2] if idx > 1 else self.extra_in_channel + return BiFusion( + in_channels0=int(self.in_channels[idx - 1] * self.widen_factor), + in_channels1=int(in_channels1 * self.widen_factor), + out_channels=int(self.out_channels[idx - 1] * self.widen_factor), + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def forward(self, inputs: List[torch.Tensor]) -> tuple: + """Forward function.""" + assert len(inputs) == len(self.in_channels) + 1 + # reduce layers + reduce_outs = [inputs[0]] + for idx in range(len(self.in_channels)): + reduce_outs.append(self.reduce_layers[idx](inputs[idx + 1])) + + # top-down path + inner_outs = [reduce_outs[-1]] + for idx in range(len(self.in_channels) - 1, 0, -1): + feat_high = inner_outs[0] + feat_cur = reduce_outs[idx] + feat_low = reduce_outs[idx - 1] + top_down_layer_inputs = self.upsample_layers[len(self.in_channels) + - 1 - idx]([ + feat_high, + feat_cur, feat_low + ]) + inner_out = self.top_down_layers[len(self.in_channels) - 1 - idx]( + top_down_layer_inputs) + inner_outs.insert(0, inner_out) + + # bottom-up path + outs = [inner_outs[0]] + for idx in range(len(self.in_channels) - 1): + feat_low = outs[-1] + feat_high = inner_outs[idx + 1] + downsample_feat = self.downsample_layers[idx](feat_low) + out = self.bottom_up_layers[idx]( + torch.cat([downsample_feat, feat_high], 1)) + outs.append(out) + + # out_layers + results = [] + for idx in range(len(self.in_channels)): + results.append(self.out_layers[idx](outs[idx])) + + return tuple(results) + + +@MODELS.register_module() +class YOLOv6CSPRepBiPAFPN(YOLOv6RepBiPAFPN): + """Path Aggregation Network used in YOLOv6 3.0. + + Args: + in_channels (List[int]): Number of input channels per scale. 
+ out_channels (int): Number of output channels (used at each scale) + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + num_csp_blocks (int): Number of bottlenecks in CSPLayer. Defaults to 1. + freeze_all(bool): Whether to freeze the model. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='ReLU', inplace=True). + block_cfg (dict): Config dict for the block used to build each + layer. Defaults to dict(type='RepVGGBlock'). + block_act_cfg (dict): Config dict for activation layer used in each + stage. Defaults to dict(type='SiLU', inplace=True). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + in_channels: List[int], + out_channels: int, + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + hidden_ratio: float = 0.5, + num_csp_blocks: int = 12, + freeze_all: bool = False, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='ReLU', inplace=True), + block_act_cfg: ConfigType = dict(type='SiLU', inplace=True), + block_cfg: ConfigType = dict(type='RepVGGBlock'), + init_cfg: OptMultiConfig = None): + self.hidden_ratio = hidden_ratio + self.block_act_cfg = block_act_cfg + super().__init__( + in_channels=in_channels, + out_channels=out_channels, + deepen_factor=deepen_factor, + widen_factor=widen_factor, + num_csp_blocks=num_csp_blocks, + freeze_all=freeze_all, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + block_cfg=block_cfg, + init_cfg=init_cfg) + + def build_top_down_layer(self, idx: int) -> nn.Module: + """build top down layer. + + Args: + idx (int): layer idx. + Returns: + nn.Module: The top down layer. + """ + block_cfg = self.block_cfg.copy() + + layer0 = BepC3StageBlock( + in_channels=int(self.out_channels[idx - 1] * self.widen_factor), + out_channels=int(self.out_channels[idx - 1] * self.widen_factor), + num_blocks=make_round(self.num_csp_blocks, self.deepen_factor), + block_cfg=block_cfg, + hidden_ratio=self.hidden_ratio, + norm_cfg=self.norm_cfg, + act_cfg=self.block_act_cfg) + + if idx == 1: + return layer0 + elif idx == 2: + layer1 = ConvModule( + in_channels=int(self.out_channels[idx - 1] * + self.widen_factor), + out_channels=int(self.out_channels[idx - 2] * + self.widen_factor), + kernel_size=1, + stride=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + return nn.Sequential(layer0, layer1) + + def build_bottom_up_layer(self, idx: int) -> nn.Module: + """build bottom up layer. + + Args: + idx (int): layer idx. + Returns: + nn.Module: The bottom up layer. 
+ """ + block_cfg = self.block_cfg.copy() + + return BepC3StageBlock( + in_channels=int(self.out_channels[idx] * 2 * self.widen_factor), + out_channels=int(self.out_channels[idx + 1] * self.widen_factor), + num_blocks=make_round(self.num_csp_blocks, self.deepen_factor), + block_cfg=block_cfg, + hidden_ratio=self.hidden_ratio, + norm_cfg=self.norm_cfg, + act_cfg=self.block_act_cfg) diff --git a/third_party/mmyolo/mmyolo/models/necks/yolov7_pafpn.py b/third_party/mmyolo/mmyolo/models/necks/yolov7_pafpn.py new file mode 100644 index 0000000000000000000000000000000000000000..1d31f4623b50083ff820e6b20229b33ad0f41860 --- /dev/null +++ b/third_party/mmyolo/mmyolo/models/necks/yolov7_pafpn.py @@ -0,0 +1,216 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List + +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmdet.utils import ConfigType, OptMultiConfig + +from mmyolo.registry import MODELS +from ..layers import MaxPoolAndStrideConvBlock, RepVGGBlock, SPPFCSPBlock +from .base_yolo_neck import BaseYOLONeck + + +@MODELS.register_module() +class YOLOv7PAFPN(BaseYOLONeck): + """Path Aggregation Network used in YOLOv7. + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale). + block_cfg (dict): Config dict for block. + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + spp_expand_ratio (float): Expand ratio of SPPCSPBlock. + Defaults to 0.5. + is_tiny_version (bool): Is tiny version of neck. If True, + it means it is a yolov7 tiny model. Defaults to False. + use_maxpool_in_downsample (bool): Whether maxpooling is + used in downsample layers. Defaults to True. + use_in_channels_in_downsample (bool): MaxPoolAndStrideConvBlock + module input parameters. Defaults to False. + use_repconv_outs (bool): Whether to use `repconv` in the output + layer. Defaults to True. + upsample_feats_cat_first (bool): Whether the output features are + concat first after upsampling in the topdown module. + Defaults to True. Currently only YOLOv7 is false. + freeze_all(bool): Whether to freeze the model. Defaults to False. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. 
+ """ + + def __init__(self, + in_channels: List[int], + out_channels: List[int], + block_cfg: dict = dict( + type='ELANBlock', + middle_ratio=0.5, + block_ratio=0.25, + num_blocks=4, + num_convs_in_block=1), + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + spp_expand_ratio: float = 0.5, + is_tiny_version: bool = False, + use_maxpool_in_downsample: bool = True, + use_in_channels_in_downsample: bool = False, + use_repconv_outs: bool = True, + upsample_feats_cat_first: bool = False, + freeze_all: bool = False, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None): + + self.is_tiny_version = is_tiny_version + self.use_maxpool_in_downsample = use_maxpool_in_downsample + self.use_in_channels_in_downsample = use_in_channels_in_downsample + self.spp_expand_ratio = spp_expand_ratio + self.use_repconv_outs = use_repconv_outs + self.block_cfg = block_cfg + self.block_cfg.setdefault('norm_cfg', norm_cfg) + self.block_cfg.setdefault('act_cfg', act_cfg) + + super().__init__( + in_channels=[ + int(channel * widen_factor) for channel in in_channels + ], + out_channels=[ + int(channel * widen_factor) for channel in out_channels + ], + deepen_factor=deepen_factor, + widen_factor=widen_factor, + upsample_feats_cat_first=upsample_feats_cat_first, + freeze_all=freeze_all, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + init_cfg=init_cfg) + + def build_reduce_layer(self, idx: int) -> nn.Module: + """build reduce layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The reduce layer. + """ + if idx == len(self.in_channels) - 1: + layer = SPPFCSPBlock( + self.in_channels[idx], + self.out_channels[idx], + expand_ratio=self.spp_expand_ratio, + is_tiny_version=self.is_tiny_version, + kernel_sizes=5, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + else: + layer = ConvModule( + self.in_channels[idx], + self.out_channels[idx], + 1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + return layer + + def build_upsample_layer(self, idx: int) -> nn.Module: + """build upsample layer.""" + return nn.Sequential( + ConvModule( + self.out_channels[idx], + self.out_channels[idx - 1], + 1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + nn.Upsample(scale_factor=2, mode='nearest')) + + def build_top_down_layer(self, idx: int) -> nn.Module: + """build top down layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The top down layer. + """ + block_cfg = self.block_cfg.copy() + block_cfg['in_channels'] = self.out_channels[idx - 1] * 2 + block_cfg['out_channels'] = self.out_channels[idx - 1] + return MODELS.build(block_cfg) + + def build_downsample_layer(self, idx: int) -> nn.Module: + """build downsample layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The downsample layer. + """ + if self.use_maxpool_in_downsample and not self.is_tiny_version: + return MaxPoolAndStrideConvBlock( + self.out_channels[idx], + self.out_channels[idx + 1], + use_in_channels_of_middle=self.use_in_channels_in_downsample, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + else: + return ConvModule( + self.out_channels[idx], + self.out_channels[idx + 1], + 3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def build_bottom_up_layer(self, idx: int) -> nn.Module: + """build bottom up layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The bottom up layer. 
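`YOLOv7PAFPN` builds its top-down and bottom-up stages by copying the shared `block_cfg`, filling in the concatenated channel counts, and handing the dict to the model registry. A hypothetical illustration of that pattern follows; the channel numbers are made up, and the final `MODELS.build` call requires mmyolo to be installed and its registry populated.

```python
block_cfg = dict(type='ELANBlock', middle_ratio=0.5, block_ratio=0.25,
                 num_blocks=4, num_convs_in_block=1)

cfg = block_cfg.copy()
cfg['in_channels'] = 256 * 2   # cat of upsampled and lateral features (assumed width)
cfg['out_channels'] = 256
print(cfg)

# with mmyolo registered:
# from mmyolo.registry import MODELS
# layer = MODELS.build(cfg)
```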
+ """ + block_cfg = self.block_cfg.copy() + block_cfg['in_channels'] = self.out_channels[idx + 1] * 2 + block_cfg['out_channels'] = self.out_channels[idx + 1] + return MODELS.build(block_cfg) + + def build_out_layer(self, idx: int) -> nn.Module: + """build out layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The out layer. + """ + if len(self.in_channels) == 4: + # P6 + return nn.Identity() + + out_channels = self.out_channels[idx] * 2 + + if self.use_repconv_outs: + return RepVGGBlock( + self.out_channels[idx], + out_channels, + 3, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + else: + return ConvModule( + self.out_channels[idx], + out_channels, + 3, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) diff --git a/third_party/mmyolo/mmyolo/models/necks/yolov8_pafpn.py b/third_party/mmyolo/mmyolo/models/necks/yolov8_pafpn.py new file mode 100644 index 0000000000000000000000000000000000000000..e26698bcc191b0141d89c1e965de811494a96539 --- /dev/null +++ b/third_party/mmyolo/mmyolo/models/necks/yolov8_pafpn.py @@ -0,0 +1,102 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Union + +import torch.nn as nn +from mmdet.utils import ConfigType, OptMultiConfig + +from mmyolo.registry import MODELS +from .. import CSPLayerWithTwoConv +from ..utils import make_divisible, make_round +from .yolov5_pafpn import YOLOv5PAFPN + + +@MODELS.register_module() +class YOLOv8PAFPN(YOLOv5PAFPN): + """Path Aggregation Network used in YOLOv8. + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale) + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + num_csp_blocks (int): Number of bottlenecks in CSPLayer. Defaults to 1. + freeze_all(bool): Whether to freeze the model + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + in_channels: List[int], + out_channels: Union[List[int], int], + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + num_csp_blocks: int = 3, + freeze_all: bool = False, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None): + super().__init__( + in_channels=in_channels, + out_channels=out_channels, + deepen_factor=deepen_factor, + widen_factor=widen_factor, + num_csp_blocks=num_csp_blocks, + freeze_all=freeze_all, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + init_cfg=init_cfg) + + def build_reduce_layer(self, idx: int) -> nn.Module: + """build reduce layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The reduce layer. + """ + return nn.Identity() + + def build_top_down_layer(self, idx: int) -> nn.Module: + """build top down layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The top down layer. 
+ """ + return CSPLayerWithTwoConv( + make_divisible((self.in_channels[idx - 1] + self.in_channels[idx]), + self.widen_factor), + make_divisible(self.out_channels[idx - 1], self.widen_factor), + num_blocks=make_round(self.num_csp_blocks, self.deepen_factor), + add_identity=False, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def build_bottom_up_layer(self, idx: int) -> nn.Module: + """build bottom up layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The bottom up layer. + """ + return CSPLayerWithTwoConv( + make_divisible( + (self.out_channels[idx] + self.out_channels[idx + 1]), + self.widen_factor), + make_divisible(self.out_channels[idx + 1], self.widen_factor), + num_blocks=make_round(self.num_csp_blocks, self.deepen_factor), + add_identity=False, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) diff --git a/third_party/mmyolo/mmyolo/models/necks/yolox_pafpn.py b/third_party/mmyolo/mmyolo/models/necks/yolox_pafpn.py new file mode 100644 index 0000000000000000000000000000000000000000..bd2595e70fe47e38e68ebd0d878deb6f264bf2d1 --- /dev/null +++ b/third_party/mmyolo/mmyolo/models/necks/yolox_pafpn.py @@ -0,0 +1,172 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List + +import torch.nn as nn +from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule +from mmdet.models.backbones.csp_darknet import CSPLayer +from mmdet.utils import ConfigType, OptMultiConfig + +from mmyolo.registry import MODELS +from .base_yolo_neck import BaseYOLONeck + + +@MODELS.register_module() +class YOLOXPAFPN(BaseYOLONeck): + """Path Aggregation Network used in YOLOX. + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale). + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + num_csp_blocks (int): Number of bottlenecks in CSPLayer. Defaults to 1. + use_depthwise (bool): Whether to use depthwise separable convolution. + Defaults to False. + freeze_all(bool): Whether to freeze the model. Defaults to False. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + in_channels: List[int], + out_channels: int, + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + num_csp_blocks: int = 3, + use_depthwise: bool = False, + freeze_all: bool = False, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None): + self.num_csp_blocks = round(num_csp_blocks * deepen_factor) + self.use_depthwise = use_depthwise + + super().__init__( + in_channels=[ + int(channel * widen_factor) for channel in in_channels + ], + out_channels=int(out_channels * widen_factor), + deepen_factor=deepen_factor, + widen_factor=widen_factor, + freeze_all=freeze_all, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + init_cfg=init_cfg) + + def build_reduce_layer(self, idx: int) -> nn.Module: + """build reduce layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The reduce layer. 
+ """ + if idx == 2: + layer = ConvModule( + self.in_channels[idx], + self.in_channels[idx - 1], + 1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + else: + layer = nn.Identity() + + return layer + + def build_upsample_layer(self, *args, **kwargs) -> nn.Module: + """build upsample layer.""" + return nn.Upsample(scale_factor=2, mode='nearest') + + def build_top_down_layer(self, idx: int) -> nn.Module: + """build top down layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The top down layer. + """ + if idx == 1: + return CSPLayer( + self.in_channels[idx - 1] * 2, + self.in_channels[idx - 1], + num_blocks=self.num_csp_blocks, + add_identity=False, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + elif idx == 2: + return nn.Sequential( + CSPLayer( + self.in_channels[idx - 1] * 2, + self.in_channels[idx - 1], + num_blocks=self.num_csp_blocks, + add_identity=False, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + ConvModule( + self.in_channels[idx - 1], + self.in_channels[idx - 2], + kernel_size=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + + def build_downsample_layer(self, idx: int) -> nn.Module: + """build downsample layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The downsample layer. + """ + conv = DepthwiseSeparableConvModule \ + if self.use_depthwise else ConvModule + return conv( + self.in_channels[idx], + self.in_channels[idx], + kernel_size=3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def build_bottom_up_layer(self, idx: int) -> nn.Module: + """build bottom up layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The bottom up layer. + """ + return CSPLayer( + self.in_channels[idx] * 2, + self.in_channels[idx + 1], + num_blocks=self.num_csp_blocks, + add_identity=False, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def build_out_layer(self, idx: int) -> nn.Module: + """build out layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The out layer. + """ + return ConvModule( + self.in_channels[idx], + self.out_channels, + 1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) diff --git a/third_party/mmyolo/mmyolo/models/plugins/__init__.py b/third_party/mmyolo/mmyolo/models/plugins/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..497233ac21a4dd1a6a2a3127c09435d8146eb553 --- /dev/null +++ b/third_party/mmyolo/mmyolo/models/plugins/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .cbam import CBAM + +__all__ = ['CBAM'] diff --git a/third_party/mmyolo/mmyolo/models/plugins/cbam.py b/third_party/mmyolo/mmyolo/models/plugins/cbam.py new file mode 100644 index 0000000000000000000000000000000000000000..e9559f2e2db951a5681ec9af5864928ed480361b --- /dev/null +++ b/third_party/mmyolo/mmyolo/models/plugins/cbam.py @@ -0,0 +1,119 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmdet.utils import OptMultiConfig +from mmengine.model import BaseModule + +from mmyolo.registry import MODELS + + +class ChannelAttention(BaseModule): + """ChannelAttention. + + Args: + channels (int): The input (and output) channels of the + ChannelAttention. + reduce_ratio (int): Squeeze ratio in ChannelAttention, the intermediate + channel will be ``int(channels/ratio)``. Defaults to 16. + act_cfg (dict): Config dict for activation layer + Defaults to dict(type='ReLU'). 
+ """ + + def __init__(self, + channels: int, + reduce_ratio: int = 16, + act_cfg: dict = dict(type='ReLU')): + super().__init__() + + self.avg_pool = nn.AdaptiveAvgPool2d(1) + self.max_pool = nn.AdaptiveMaxPool2d(1) + + self.fc = nn.Sequential( + ConvModule( + in_channels=channels, + out_channels=int(channels / reduce_ratio), + kernel_size=1, + stride=1, + conv_cfg=None, + act_cfg=act_cfg), + ConvModule( + in_channels=int(channels / reduce_ratio), + out_channels=channels, + kernel_size=1, + stride=1, + conv_cfg=None, + act_cfg=None)) + self.sigmoid = nn.Sigmoid() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Forward function.""" + avgpool_out = self.fc(self.avg_pool(x)) + maxpool_out = self.fc(self.max_pool(x)) + out = self.sigmoid(avgpool_out + maxpool_out) + return out + + +class SpatialAttention(BaseModule): + """SpatialAttention + Args: + kernel_size (int): The size of the convolution kernel in + SpatialAttention. Defaults to 7. + """ + + def __init__(self, kernel_size: int = 7): + super().__init__() + + self.conv = ConvModule( + in_channels=2, + out_channels=1, + kernel_size=kernel_size, + stride=1, + padding=kernel_size // 2, + conv_cfg=None, + act_cfg=dict(type='Sigmoid')) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Forward function.""" + avg_out = torch.mean(x, dim=1, keepdim=True) + max_out, _ = torch.max(x, dim=1, keepdim=True) + out = torch.cat([avg_out, max_out], dim=1) + out = self.conv(out) + return out + + +@MODELS.register_module() +class CBAM(BaseModule): + """Convolutional Block Attention Module. arxiv link: + https://arxiv.org/abs/1807.06521v2. + + Args: + in_channels (int): The input (and output) channels of the CBAM. + reduce_ratio (int): Squeeze ratio in ChannelAttention, the intermediate + channel will be ``int(channels/ratio)``. Defaults to 16. + kernel_size (int): The size of the convolution kernel in + SpatialAttention. Defaults to 7. + act_cfg (dict): Config dict for activation layer in ChannelAttention + Defaults to dict(type='ReLU'). + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + in_channels: int, + reduce_ratio: int = 16, + kernel_size: int = 7, + act_cfg: dict = dict(type='ReLU'), + init_cfg: OptMultiConfig = None): + super().__init__(init_cfg) + self.channel_attention = ChannelAttention( + channels=in_channels, reduce_ratio=reduce_ratio, act_cfg=act_cfg) + + self.spatial_attention = SpatialAttention(kernel_size) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Forward function.""" + out = self.channel_attention(x) * x + out = self.spatial_attention(out) * out + return out diff --git a/third_party/mmyolo/mmyolo/models/task_modules/__init__.py b/third_party/mmyolo/mmyolo/models/task_modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7dbdc25fa3cf16e85e0e99e7d302a98f2b4f13ce --- /dev/null +++ b/third_party/mmyolo/mmyolo/models/task_modules/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .assigners import BatchATSSAssigner, BatchTaskAlignedAssigner +from .coders import YOLOv5BBoxCoder, YOLOXBBoxCoder + +__all__ = [ + 'YOLOv5BBoxCoder', 'YOLOXBBoxCoder', 'BatchATSSAssigner', + 'BatchTaskAlignedAssigner' +] diff --git a/third_party/mmyolo/mmyolo/models/task_modules/assigners/__init__.py b/third_party/mmyolo/mmyolo/models/task_modules/assigners/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7b2e2e69c921367083e21abce799e3ef5b8d47e1 --- /dev/null +++ b/third_party/mmyolo/mmyolo/models/task_modules/assigners/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .batch_atss_assigner import BatchATSSAssigner +from .batch_dsl_assigner import BatchDynamicSoftLabelAssigner +from .batch_task_aligned_assigner import BatchTaskAlignedAssigner +from .pose_sim_ota_assigner import PoseSimOTAAssigner +from .utils import (select_candidates_in_gts, select_highest_overlaps, + yolov6_iou_calculator) + +__all__ = [ + 'BatchATSSAssigner', 'BatchTaskAlignedAssigner', + 'select_candidates_in_gts', 'select_highest_overlaps', + 'yolov6_iou_calculator', 'BatchDynamicSoftLabelAssigner', + 'PoseSimOTAAssigner' +] diff --git a/third_party/mmyolo/mmyolo/models/task_modules/assigners/batch_atss_assigner.py b/third_party/mmyolo/mmyolo/models/task_modules/assigners/batch_atss_assigner.py new file mode 100644 index 0000000000000000000000000000000000000000..45b3069afde73e240890273c58e3860da59ad854 --- /dev/null +++ b/third_party/mmyolo/mmyolo/models/task_modules/assigners/batch_atss_assigner.py @@ -0,0 +1,339 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmdet.utils import ConfigType +from torch import Tensor + +from mmyolo.registry import TASK_UTILS +from .utils import (select_candidates_in_gts, select_highest_overlaps, + yolov6_iou_calculator) + + +def bbox_center_distance(bboxes: Tensor, + priors: Tensor) -> Tuple[Tensor, Tensor]: + """Compute the center distance between bboxes and priors. + + Args: + bboxes (Tensor): Shape (n, 4) for bbox, "xyxy" format. + priors (Tensor): Shape (num_priors, 4) for priors, "xyxy" format. + + Returns: + distances (Tensor): Center distances between bboxes and priors, + shape (num_priors, n). + priors_points (Tensor): Priors cx cy points, + shape (num_priors, 2). + """ + bbox_cx = (bboxes[:, 0] + bboxes[:, 2]) / 2.0 + bbox_cy = (bboxes[:, 1] + bboxes[:, 3]) / 2.0 + bbox_points = torch.stack((bbox_cx, bbox_cy), dim=1) + + priors_cx = (priors[:, 0] + priors[:, 2]) / 2.0 + priors_cy = (priors[:, 1] + priors[:, 3]) / 2.0 + priors_points = torch.stack((priors_cx, priors_cy), dim=1) + + distances = (bbox_points[:, None, :] - + priors_points[None, :, :]).pow(2).sum(-1).sqrt() + + return distances, priors_points + + +@TASK_UTILS.register_module() +class BatchATSSAssigner(nn.Module): + """Assign a batch of corresponding gt bboxes or background to each prior. + + This code is based on + https://github.com/meituan/YOLOv6/blob/main/yolov6/assigners/atss_assigner.py + + Each proposal will be assigned with `0` or a positive integer + indicating the ground truth index. + + - 0: negative sample, no assigned gt + - positive integer: positive sample, index (1-based) of assigned gt + + Args: + num_classes (int): number of class + iou_calculator (:obj:`ConfigDict` or dict): Config dict for iou + calculator. 
Defaults to ``dict(type='BboxOverlaps2D')`` + topk (int): number of priors selected in each level + """ + + def __init__( + self, + num_classes: int, + iou_calculator: ConfigType = dict(type='mmdet.BboxOverlaps2D'), + topk: int = 9): + super().__init__() + self.num_classes = num_classes + self.iou_calculator = TASK_UTILS.build(iou_calculator) + self.topk = topk + + @torch.no_grad() + def forward(self, pred_bboxes: Tensor, priors: Tensor, + num_level_priors: List, gt_labels: Tensor, gt_bboxes: Tensor, + pad_bbox_flag: Tensor) -> dict: + """Assign gt to priors. + + The assignment is done in following steps + + 1. compute iou between all prior (prior of all pyramid levels) and gt + 2. compute center distance between all prior and gt + 3. on each pyramid level, for each gt, select k prior whose center + are closest to the gt center, so we total select k*l prior as + candidates for each gt + 4. get corresponding iou for the these candidates, and compute the + mean and std, set mean + std as the iou threshold + 5. select these candidates whose iou are greater than or equal to + the threshold as positive + 6. limit the positive sample's center in gt + + Args: + pred_bboxes (Tensor): Predicted bounding boxes, + shape(batch_size, num_priors, 4) + priors (Tensor): Model priors with stride, shape(num_priors, 4) + num_level_priors (List): Number of bboxes in each level, len(3) + gt_labels (Tensor): Ground truth label, + shape(batch_size, num_gt, 1) + gt_bboxes (Tensor): Ground truth bbox, + shape(batch_size, num_gt, 4) + pad_bbox_flag (Tensor): Ground truth bbox mask, + 1 means bbox, 0 means no bbox, + shape(batch_size, num_gt, 1) + Returns: + assigned_result (dict): Assigned result + 'assigned_labels' (Tensor): shape(batch_size, num_gt) + 'assigned_bboxes' (Tensor): shape(batch_size, num_gt, 4) + 'assigned_scores' (Tensor): + shape(batch_size, num_gt, number_classes) + 'fg_mask_pre_prior' (Tensor): shape(bs, num_gt) + """ + # generate priors + cell_half_size = priors[:, 2:] * 2.5 + priors_gen = torch.zeros_like(priors) + priors_gen[:, :2] = priors[:, :2] - cell_half_size + priors_gen[:, 2:] = priors[:, :2] + cell_half_size + priors = priors_gen + + batch_size = gt_bboxes.size(0) + num_gt, num_priors = gt_bboxes.size(1), priors.size(0) + + assigned_result = { + 'assigned_labels': + gt_bboxes.new_full([batch_size, num_priors], self.num_classes), + 'assigned_bboxes': + gt_bboxes.new_full([batch_size, num_priors, 4], 0), + 'assigned_scores': + gt_bboxes.new_full([batch_size, num_priors, self.num_classes], 0), + 'fg_mask_pre_prior': + gt_bboxes.new_full([batch_size, num_priors], 0) + } + + if num_gt == 0: + return assigned_result + + # compute iou between all prior (prior of all pyramid levels) and gt + overlaps = self.iou_calculator(gt_bboxes.reshape([-1, 4]), priors) + overlaps = overlaps.reshape([batch_size, -1, num_priors]) + + # compute center distance between all prior and gt + distances, priors_points = bbox_center_distance( + gt_bboxes.reshape([-1, 4]), priors) + distances = distances.reshape([batch_size, -1, num_priors]) + + # Selecting candidates based on the center distance + is_in_candidate, candidate_idxs = self.select_topk_candidates( + distances, num_level_priors, pad_bbox_flag) + + # get corresponding iou for the these candidates, and compute the + # mean and std, set mean + std as the iou threshold + overlaps_thr_per_gt, iou_candidates = self.threshold_calculator( + is_in_candidate, candidate_idxs, overlaps, num_priors, batch_size, + num_gt) + + # select candidates iou >= threshold as 
positive + is_pos = torch.where( + iou_candidates > overlaps_thr_per_gt.repeat([1, 1, num_priors]), + is_in_candidate, torch.zeros_like(is_in_candidate)) + + is_in_gts = select_candidates_in_gts(priors_points, gt_bboxes) + pos_mask = is_pos * is_in_gts * pad_bbox_flag + + # if an anchor box is assigned to multiple gts, + # the one with the highest IoU will be selected. + gt_idx_pre_prior, fg_mask_pre_prior, pos_mask = \ + select_highest_overlaps(pos_mask, overlaps, num_gt) + + # assigned target + assigned_labels, assigned_bboxes, assigned_scores = self.get_targets( + gt_labels, gt_bboxes, gt_idx_pre_prior, fg_mask_pre_prior, + num_priors, batch_size, num_gt) + + # soft label with iou + if pred_bboxes is not None: + ious = yolov6_iou_calculator(gt_bboxes, pred_bboxes) * pos_mask + ious = ious.max(axis=-2)[0].unsqueeze(-1) + assigned_scores *= ious + + assigned_result['assigned_labels'] = assigned_labels.long() + assigned_result['assigned_bboxes'] = assigned_bboxes + assigned_result['assigned_scores'] = assigned_scores + assigned_result['fg_mask_pre_prior'] = fg_mask_pre_prior.bool() + return assigned_result + + def select_topk_candidates(self, distances: Tensor, + num_level_priors: List[int], + pad_bbox_flag: Tensor) -> Tuple[Tensor, Tensor]: + """Selecting candidates based on the center distance. + + Args: + distances (Tensor): Distance between all bbox and gt, + shape(batch_size, num_gt, num_priors) + num_level_priors (List[int]): Number of bboxes in each level, + len(3) + pad_bbox_flag (Tensor): Ground truth bbox mask, + shape(batch_size, num_gt, 1) + + Return: + is_in_candidate_list (Tensor): Flag show that each level have + topk candidates or not, shape(batch_size, num_gt, num_priors) + candidate_idxs (Tensor): Candidates index, + shape(batch_size, num_gt, num_gt) + """ + is_in_candidate_list = [] + candidate_idxs = [] + start_idx = 0 + + distances_dtype = distances.dtype + distances = torch.split(distances, num_level_priors, dim=-1) + pad_bbox_flag = pad_bbox_flag.repeat(1, 1, self.topk).bool() + + for distances_per_level, priors_per_level in zip( + distances, num_level_priors): + # on each pyramid level, for each gt, + # select k bbox whose center are closest to the gt center + end_index = start_idx + priors_per_level + selected_k = min(self.topk, priors_per_level) + + _, topk_idxs_per_level = distances_per_level.topk( + selected_k, dim=-1, largest=False) + candidate_idxs.append(topk_idxs_per_level + start_idx) + + topk_idxs_per_level = torch.where( + pad_bbox_flag, topk_idxs_per_level, + torch.zeros_like(topk_idxs_per_level)) + + is_in_candidate = F.one_hot(topk_idxs_per_level, + priors_per_level).sum(dim=-2) + is_in_candidate = torch.where(is_in_candidate > 1, + torch.zeros_like(is_in_candidate), + is_in_candidate) + is_in_candidate_list.append(is_in_candidate.to(distances_dtype)) + + start_idx = end_index + + is_in_candidate_list = torch.cat(is_in_candidate_list, dim=-1) + candidate_idxs = torch.cat(candidate_idxs, dim=-1) + + return is_in_candidate_list, candidate_idxs + + @staticmethod + def threshold_calculator(is_in_candidate: List, candidate_idxs: Tensor, + overlaps: Tensor, num_priors: int, + batch_size: int, + num_gt: int) -> Tuple[Tensor, Tensor]: + """Get corresponding iou for the these candidates, and compute the mean + and std, set mean + std as the iou threshold. + + Args: + is_in_candidate (Tensor): Flag show that each level have + topk candidates or not, shape(batch_size, num_gt, num_priors). 
+ candidate_idxs (Tensor): Candidates index, + shape(batch_size, num_gt, num_gt) + overlaps (Tensor): Overlaps area, + shape(batch_size, num_gt, num_priors). + num_priors (int): Number of priors. + batch_size (int): Batch size. + num_gt (int): Number of ground truth. + + Return: + overlaps_thr_per_gt (Tensor): Overlap threshold of + per ground truth, shape(batch_size, num_gt, 1). + candidate_overlaps (Tensor): Candidate overlaps, + shape(batch_size, num_gt, num_priors). + """ + + batch_size_num_gt = batch_size * num_gt + candidate_overlaps = torch.where(is_in_candidate > 0, overlaps, + torch.zeros_like(overlaps)) + candidate_idxs = candidate_idxs.reshape([batch_size_num_gt, -1]) + + assist_indexes = num_priors * torch.arange( + batch_size_num_gt, device=candidate_idxs.device) + assist_indexes = assist_indexes[:, None] + flatten_indexes = candidate_idxs + assist_indexes + + candidate_overlaps_reshape = candidate_overlaps.reshape( + -1)[flatten_indexes] + candidate_overlaps_reshape = candidate_overlaps_reshape.reshape( + [batch_size, num_gt, -1]) + + overlaps_mean_per_gt = candidate_overlaps_reshape.mean( + axis=-1, keepdim=True) + overlaps_std_per_gt = candidate_overlaps_reshape.std( + axis=-1, keepdim=True) + overlaps_thr_per_gt = overlaps_mean_per_gt + overlaps_std_per_gt + + return overlaps_thr_per_gt, candidate_overlaps + + def get_targets(self, gt_labels: Tensor, gt_bboxes: Tensor, + assigned_gt_inds: Tensor, fg_mask_pre_prior: Tensor, + num_priors: int, batch_size: int, + num_gt: int) -> Tuple[Tensor, Tensor, Tensor]: + """Get target info. + + Args: + gt_labels (Tensor): Ground true labels, + shape(batch_size, num_gt, 1) + gt_bboxes (Tensor): Ground true bboxes, + shape(batch_size, num_gt, 4) + assigned_gt_inds (Tensor): Assigned ground truth indexes, + shape(batch_size, num_priors) + fg_mask_pre_prior (Tensor): Force ground truth matching mask, + shape(batch_size, num_priors) + num_priors (int): Number of priors. + batch_size (int): Batch size. + num_gt (int): Number of ground truth. 
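# Illustrative sketch (not part of the patch): the ATSS rule implemented by
# threshold_calculator above, on toy numbers. The IoU threshold for a gt is the
# mean + std of its candidate IoUs, so only its strongest candidates survive
# (the real assigner additionally requires the prior centre to lie inside the gt).
import torch

candidate_ious = torch.tensor([0.70, 0.65, 0.30, 0.25, 0.20, 0.10])
iou_thr = candidate_ious.mean() + candidate_ious.std()   # ~0.37 + ~0.25 = ~0.62
positives = candidate_ious > iou_thr                      # only the two high-IoU candidates pass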
+ + Return: + assigned_labels (Tensor): Assigned labels, + shape(batch_size, num_priors) + assigned_bboxes (Tensor): Assigned bboxes, + shape(batch_size, num_priors) + assigned_scores (Tensor): Assigned scores, + shape(batch_size, num_priors) + """ + + # assigned target labels + batch_index = torch.arange( + batch_size, dtype=gt_labels.dtype, device=gt_labels.device) + batch_index = batch_index[..., None] + assigned_gt_inds = (assigned_gt_inds + batch_index * num_gt).long() + assigned_labels = gt_labels.flatten()[assigned_gt_inds.flatten()] + assigned_labels = assigned_labels.reshape([batch_size, num_priors]) + assigned_labels = torch.where( + fg_mask_pre_prior > 0, assigned_labels, + torch.full_like(assigned_labels, self.num_classes)) + + # assigned target boxes + assigned_bboxes = gt_bboxes.reshape([-1, + 4])[assigned_gt_inds.flatten()] + assigned_bboxes = assigned_bboxes.reshape([batch_size, num_priors, 4]) + + # assigned target scores + assigned_scores = F.one_hot(assigned_labels.long(), + self.num_classes + 1).float() + assigned_scores = assigned_scores[:, :, :self.num_classes] + + return assigned_labels, assigned_bboxes, assigned_scores diff --git a/third_party/mmyolo/mmyolo/models/task_modules/assigners/batch_dsl_assigner.py b/third_party/mmyolo/mmyolo/models/task_modules/assigners/batch_dsl_assigner.py new file mode 100644 index 0000000000000000000000000000000000000000..5ae0f80239590f9c906778e6e4c7c6b4bd10c488 --- /dev/null +++ b/third_party/mmyolo/mmyolo/models/task_modules/assigners/batch_dsl_assigner.py @@ -0,0 +1,272 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmdet.structures.bbox import BaseBoxes +from mmdet.utils import ConfigType +from torch import Tensor + +from mmyolo.registry import TASK_UTILS + +INF = 100000000 +EPS = 1.0e-7 + + +def find_inside_points(boxes: Tensor, + points: Tensor, + box_dim: int = 4, + eps: float = 0.01) -> Tensor: + """Find inside box points in batches. Boxes dimension must be 3. + + Args: + boxes (Tensor): Boxes tensor. Must be batch input. + Has shape of (batch_size, n_boxes, box_dim). + points (Tensor): Points coordinates. Has shape of (n_points, 2). + box_dim (int): The dimension of box. 4 means horizontal box and + 5 means rotated box. Defaults to 4. + eps (float): Make sure the points are inside not on the boundary. + Only use in rotated boxes. Defaults to 0.01. + + Returns: + Tensor: A BoolTensor indicating whether a point is inside + boxes. The index has shape of (n_points, batch_size, n_boxes). 
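# Illustrative sketch (not part of the patch): the index-flattening trick used
# by get_targets above. Per-image gt indices are offset by batch_idx * num_gt so
# a single flat gather over gt_labels serves the whole batch. Toy shapes, with
# distinct labels to make the mapping visible.
import torch

batch_size, num_gt, num_priors = 2, 3, 4
gt_labels = torch.arange(batch_size * num_gt).reshape(batch_size, num_gt)   # labels 0..5
assigned_gt_inds = torch.tensor([[0, 2, 1, 0],
                                 [1, 1, 0, 2]])
flat_inds = assigned_gt_inds + torch.arange(batch_size)[:, None] * num_gt
assigned_labels = gt_labels.flatten()[flat_inds.flatten()].reshape(batch_size, num_priors)
# image 0 gathers labels [0, 2, 1, 0]; image 1 gathers labels [4, 4, 3, 5]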
+ """ + if box_dim == 4: + # Horizontal Boxes + lt_ = points[:, None, None] - boxes[..., :2] + rb_ = boxes[..., 2:] - points[:, None, None] + + deltas = torch.cat([lt_, rb_], dim=-1) + is_in_gts = deltas.min(dim=-1).values > 0 + + elif box_dim == 5: + # Rotated Boxes + points = points[:, None, None] + ctrs, wh, t = torch.split(boxes, [2, 2, 1], dim=-1) + cos_value, sin_value = torch.cos(t), torch.sin(t) + matrix = torch.cat([cos_value, sin_value, -sin_value, cos_value], + dim=-1).reshape(*boxes.shape[:-1], 2, 2) + + offset = points - ctrs + offset = torch.matmul(matrix, offset[..., None]) + offset = offset.squeeze(-1) + offset_x, offset_y = offset[..., 0], offset[..., 1] + w, h = wh[..., 0], wh[..., 1] + is_in_gts = (offset_x <= w / 2 - eps) & (offset_x >= - w / 2 + eps) & \ + (offset_y <= h / 2 - eps) & (offset_y >= - h / 2 + eps) + else: + raise NotImplementedError(f'Unsupport box_dim:{box_dim}') + + return is_in_gts + + +def get_box_center(boxes: Tensor, box_dim: int = 4) -> Tensor: + """Return a tensor representing the centers of boxes. + + Args: + boxes (Tensor): Boxes tensor. Has shape of (b, n, box_dim) + box_dim (int): The dimension of box. 4 means horizontal box and + 5 means rotated box. Defaults to 4. + + Returns: + Tensor: Centers have shape of (b, n, 2) + """ + if box_dim == 4: + # Horizontal Boxes, (x1, y1, x2, y2) + return (boxes[..., :2] + boxes[..., 2:]) / 2.0 + elif box_dim == 5: + # Rotated Boxes, (x, y, w, h, a) + return boxes[..., :2] + else: + raise NotImplementedError(f'Unsupported box_dim:{box_dim}') + + +@TASK_UTILS.register_module() +class BatchDynamicSoftLabelAssigner(nn.Module): + """Computes matching between predictions and ground truth with dynamic soft + label assignment. + + Args: + num_classes (int): number of class + soft_center_radius (float): Radius of the soft center prior. + Defaults to 3.0. + topk (int): Select top-k predictions to calculate dynamic k + best matches for each gt. Defaults to 13. + iou_weight (float): The scale factor of iou cost. Defaults to 3.0. + iou_calculator (ConfigType): Config of overlaps Calculator. + Defaults to dict(type='BboxOverlaps2D'). + batch_iou (bool): Use batch input when calculate IoU. + If set to False use loop instead. Defaults to True. 
+ """ + + def __init__( + self, + num_classes, + soft_center_radius: float = 3.0, + topk: int = 13, + iou_weight: float = 3.0, + iou_calculator: ConfigType = dict(type='mmdet.BboxOverlaps2D'), + batch_iou: bool = True, + ) -> None: + super().__init__() + self.num_classes = num_classes + self.soft_center_radius = soft_center_radius + self.topk = topk + self.iou_weight = iou_weight + self.iou_calculator = TASK_UTILS.build(iou_calculator) + self.batch_iou = batch_iou + + @torch.no_grad() + def forward(self, pred_bboxes: Tensor, pred_scores: Tensor, priors: Tensor, + gt_labels: Tensor, gt_bboxes: Tensor, + pad_bbox_flag: Tensor) -> dict: + num_gt = gt_bboxes.size(1) + decoded_bboxes = pred_bboxes + batch_size, num_bboxes, box_dim = decoded_bboxes.size() + + if num_gt == 0 or num_bboxes == 0: + return { + 'assigned_labels': + gt_labels.new_full( + pred_scores[..., 0].shape, + self.num_classes, + dtype=torch.long), + 'assigned_labels_weights': + gt_bboxes.new_full(pred_scores[..., 0].shape, 1), + 'assigned_bboxes': + gt_bboxes.new_full(pred_bboxes.shape, 0), + 'assign_metrics': + gt_bboxes.new_full(pred_scores[..., 0].shape, 0) + } + + prior_center = priors[:, :2] + if isinstance(gt_bboxes, BaseBoxes): + raise NotImplementedError( + f'type of {type(gt_bboxes)} are not implemented !') + else: + is_in_gts = find_inside_points(gt_bboxes, prior_center, box_dim) + + # (N_points, B, N_boxes) + is_in_gts = is_in_gts * pad_bbox_flag[..., 0][None] + # (N_points, B, N_boxes) -> (B, N_points, N_boxes) + is_in_gts = is_in_gts.permute(1, 0, 2) + # (B, N_points) + valid_mask = is_in_gts.sum(dim=-1) > 0 + + gt_center = get_box_center(gt_bboxes, box_dim) + + strides = priors[..., 2] + distance = (priors[None].unsqueeze(2)[..., :2] - + gt_center[:, None, :, :] + ).pow(2).sum(-1).sqrt() / strides[None, :, None] + + # prevent overflow + distance = distance * valid_mask.unsqueeze(-1) + soft_center_prior = torch.pow(10, distance - self.soft_center_radius) + + if self.batch_iou: + pairwise_ious = self.iou_calculator(decoded_bboxes, gt_bboxes) + else: + ious = [] + for box, gt in zip(decoded_bboxes, gt_bboxes): + iou = self.iou_calculator(box, gt) + ious.append(iou) + pairwise_ious = torch.stack(ious, dim=0) + + iou_cost = -torch.log(pairwise_ious + EPS) * self.iou_weight + + # select the predicted scores corresponded to the gt_labels + pairwise_pred_scores = pred_scores.permute(0, 2, 1) + idx = torch.zeros([2, batch_size, num_gt], dtype=torch.long) + idx[0] = torch.arange(end=batch_size).view(-1, 1).repeat(1, num_gt) + idx[1] = gt_labels.long().squeeze(-1) + pairwise_pred_scores = pairwise_pred_scores[idx[0], + idx[1]].permute(0, 2, 1) + # classification cost + scale_factor = pairwise_ious - pairwise_pred_scores.sigmoid() + pairwise_cls_cost = F.binary_cross_entropy_with_logits( + pairwise_pred_scores, pairwise_ious, + reduction='none') * scale_factor.abs().pow(2.0) + + cost_matrix = pairwise_cls_cost + iou_cost + soft_center_prior + + max_pad_value = torch.ones_like(cost_matrix) * INF + cost_matrix = torch.where(valid_mask[..., None].repeat(1, 1, num_gt), + cost_matrix, max_pad_value) + + (matched_pred_ious, matched_gt_inds, + fg_mask_inboxes) = self.dynamic_k_matching(cost_matrix, pairwise_ious, + pad_bbox_flag) + + del pairwise_ious, cost_matrix + + batch_index = (fg_mask_inboxes > 0).nonzero(as_tuple=True)[0] + + assigned_labels = gt_labels.new_full(pred_scores[..., 0].shape, + self.num_classes) + assigned_labels[fg_mask_inboxes] = gt_labels[ + batch_index, matched_gt_inds].squeeze(-1) + assigned_labels = 
assigned_labels.long() + + assigned_labels_weights = gt_bboxes.new_full(pred_scores[..., 0].shape, + 1) + + assigned_bboxes = gt_bboxes.new_full(pred_bboxes.shape, 0) + assigned_bboxes[fg_mask_inboxes] = gt_bboxes[batch_index, + matched_gt_inds] + + assign_metrics = gt_bboxes.new_full(pred_scores[..., 0].shape, 0) + assign_metrics[fg_mask_inboxes] = matched_pred_ious + + return dict( + assigned_labels=assigned_labels, + assigned_labels_weights=assigned_labels_weights, + assigned_bboxes=assigned_bboxes, + assign_metrics=assign_metrics) + + def dynamic_k_matching( + self, cost_matrix: Tensor, pairwise_ious: Tensor, + pad_bbox_flag: int) -> Tuple[Tensor, Tensor, Tensor]: + """Use IoU and matching cost to calculate the dynamic top-k positive + targets. + + Args: + cost_matrix (Tensor): Cost matrix. + pairwise_ious (Tensor): Pairwise iou matrix. + num_gt (int): Number of gt. + valid_mask (Tensor): Mask for valid bboxes. + Returns: + tuple: matched ious and gt indexes. + """ + matching_matrix = torch.zeros_like(cost_matrix, dtype=torch.uint8) + # select candidate topk ious for dynamic-k calculation + candidate_topk = min(self.topk, pairwise_ious.size(1)) + topk_ious, _ = torch.topk(pairwise_ious, candidate_topk, dim=1) + # calculate dynamic k for each gt + dynamic_ks = torch.clamp(topk_ious.sum(1).int(), min=1) + + num_gts = pad_bbox_flag.sum((1, 2)).int() + # sorting the batch cost matirx is faster than topk + _, sorted_indices = torch.sort(cost_matrix, dim=1) + for b in range(pad_bbox_flag.shape[0]): + for gt_idx in range(num_gts[b]): + topk_ids = sorted_indices[b, :dynamic_ks[b, gt_idx], gt_idx] + matching_matrix[b, :, gt_idx][topk_ids] = 1 + + del topk_ious, dynamic_ks + + prior_match_gt_mask = matching_matrix.sum(2) > 1 + if prior_match_gt_mask.sum() > 0: + cost_min, cost_argmin = torch.min( + cost_matrix[prior_match_gt_mask, :], dim=1) + matching_matrix[prior_match_gt_mask, :] *= 0 + matching_matrix[prior_match_gt_mask, cost_argmin] = 1 + + # get foreground mask inside box and center prior + fg_mask_inboxes = matching_matrix.sum(2) > 0 + matched_pred_ious = (matching_matrix * + pairwise_ious).sum(2)[fg_mask_inboxes] + matched_gt_inds = matching_matrix[fg_mask_inboxes, :].argmax(1) + return matched_pred_ious, matched_gt_inds, fg_mask_inboxes diff --git a/third_party/mmyolo/mmyolo/models/task_modules/assigners/batch_task_aligned_assigner.py b/third_party/mmyolo/mmyolo/models/task_modules/assigners/batch_task_aligned_assigner.py new file mode 100644 index 0000000000000000000000000000000000000000..202d678986c3a398de63675c004592b98ea092e0 --- /dev/null +++ b/third_party/mmyolo/mmyolo/models/task_modules/assigners/batch_task_aligned_assigner.py @@ -0,0 +1,311 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor + +from mmyolo.models.losses import bbox_overlaps +from mmyolo.registry import TASK_UTILS +from .utils import (select_candidates_in_gts, select_highest_overlaps, + yolov6_iou_calculator) + + +@TASK_UTILS.register_module() +class BatchTaskAlignedAssigner(nn.Module): + """This code referenced to + https://github.com/meituan/YOLOv6/blob/main/yolov6/ + assigners/tal_assigner.py. + Batch Task aligned assigner base on the paper: + `TOOD: Task-aligned One-stage Object Detection. + `_. + Assign a corresponding gt bboxes or background to a batch of + predicted bboxes. Each bbox will be assigned with `0` or a + positive integer indicating the ground truth index. 
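# Illustrative sketch (not part of the patch): the dynamic-k rule used by
# dynamic_k_matching above. Each gt keeps roughly as many lowest-cost priors as
# its summed top-k IoU suggests, and always at least one.
import torch

ious = torch.tensor([[0.90, 0.10],     # 5 candidate priors (rows) x 2 gts (cols)
                     [0.80, 0.08],
                     [0.70, 0.05],
                     [0.20, 0.02],
                     [0.05, 0.01]])
topk_ious, _ = torch.topk(ious, k=3, dim=0)                # best 3 priors per gt
dynamic_ks = torch.clamp(topk_ious.sum(0).int(), min=1)    # -> tensor([2, 1])
# gt 0 keeps its 2 lowest-cost priors, gt 1 keeps only 1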
+ - 0: negative sample, no assigned gt + - positive integer: positive sample, index (1-based) of assigned gt + Args: + num_classes (int): number of class + topk (int): number of bbox selected in each level + alpha (float): Hyper-parameters related to alignment_metrics. + Defaults to 1.0 + beta (float): Hyper-parameters related to alignment_metrics. + Defaults to 6. + eps (float): Eps to avoid log(0). Default set to 1e-9 + use_ciou (bool): Whether to use ciou while calculating iou. + Defaults to False. + """ + + def __init__(self, + num_classes: int, + topk: int = 13, + alpha: float = 1.0, + beta: float = 6.0, + eps: float = 1e-7, + use_ciou: bool = False): + super().__init__() + self.num_classes = num_classes + self.topk = topk + self.alpha = alpha + self.beta = beta + self.eps = eps + self.use_ciou = use_ciou + + @torch.no_grad() + def forward( + self, + pred_bboxes: Tensor, + pred_scores: Tensor, + priors: Tensor, + gt_labels: Tensor, + gt_bboxes: Tensor, + pad_bbox_flag: Tensor, + ) -> dict: + """Assign gt to bboxes. + + The assignment is done in following steps + 1. compute alignment metric between all bbox (bbox of all pyramid + levels) and gt + 2. select top-k bbox as candidates for each gt + 3. limit the positive sample's center in gt (because the anchor-free + detector only can predict positive distance) + Args: + pred_bboxes (Tensor): Predict bboxes, + shape(batch_size, num_priors, 4) + pred_scores (Tensor): Scores of predict bboxes, + shape(batch_size, num_priors, num_classes) + priors (Tensor): Model priors, shape (num_priors, 4) + gt_labels (Tensor): Ground true labels, + shape(batch_size, num_gt, 1) + gt_bboxes (Tensor): Ground true bboxes, + shape(batch_size, num_gt, 4) + pad_bbox_flag (Tensor): Ground truth bbox mask, + 1 means bbox, 0 means no bbox, + shape(batch_size, num_gt, 1) + Returns: + assigned_result (dict) Assigned result: + assigned_labels (Tensor): Assigned labels, + shape(batch_size, num_priors) + assigned_bboxes (Tensor): Assigned boxes, + shape(batch_size, num_priors, 4) + assigned_scores (Tensor): Assigned scores, + shape(batch_size, num_priors, num_classes) + fg_mask_pre_prior (Tensor): Force ground truth matching mask, + shape(batch_size, num_priors) + """ + # (num_priors, 4) -> (num_priors, 2) + priors = priors[:, :2] + + batch_size = pred_scores.size(0) + num_gt = gt_bboxes.size(1) + + assigned_result = { + 'assigned_labels': + gt_bboxes.new_full(pred_scores[..., 0].shape, self.num_classes), + 'assigned_bboxes': + gt_bboxes.new_full(pred_bboxes.shape, 0), + 'assigned_scores': + gt_bboxes.new_full(pred_scores.shape, 0), + 'fg_mask_pre_prior': + gt_bboxes.new_full(pred_scores[..., 0].shape, 0) + } + + if num_gt == 0: + return assigned_result + + pos_mask, alignment_metrics, overlaps = self.get_pos_mask( + pred_bboxes, pred_scores, priors, gt_labels, gt_bboxes, + pad_bbox_flag, batch_size, num_gt) + + (assigned_gt_idxs, fg_mask_pre_prior, + pos_mask) = select_highest_overlaps(pos_mask, overlaps, num_gt) + + # assigned target + assigned_labels, assigned_bboxes, assigned_scores = self.get_targets( + gt_labels, gt_bboxes, assigned_gt_idxs, fg_mask_pre_prior, + batch_size, num_gt) + + # normalize + alignment_metrics *= pos_mask + pos_align_metrics = alignment_metrics.max(axis=-1, keepdim=True)[0] + pos_overlaps = (overlaps * pos_mask).max(axis=-1, keepdim=True)[0] + norm_align_metric = ( + alignment_metrics * pos_overlaps / + (pos_align_metrics + self.eps)).max(-2)[0].unsqueeze(-1) + assigned_scores = assigned_scores * norm_align_metric + + 
assigned_result['assigned_labels'] = assigned_labels + assigned_result['assigned_bboxes'] = assigned_bboxes + assigned_result['assigned_scores'] = assigned_scores + assigned_result['fg_mask_pre_prior'] = fg_mask_pre_prior.bool() + return assigned_result + + def get_pos_mask(self, pred_bboxes: Tensor, pred_scores: Tensor, + priors: Tensor, gt_labels: Tensor, gt_bboxes: Tensor, + pad_bbox_flag: Tensor, batch_size: int, + num_gt: int) -> Tuple[Tensor, Tensor, Tensor]: + """Get possible mask. + + Args: + pred_bboxes (Tensor): Predict bboxes, + shape(batch_size, num_priors, 4) + pred_scores (Tensor): Scores of predict bbox, + shape(batch_size, num_priors, num_classes) + priors (Tensor): Model priors, shape (num_priors, 2) + gt_labels (Tensor): Ground true labels, + shape(batch_size, num_gt, 1) + gt_bboxes (Tensor): Ground true bboxes, + shape(batch_size, num_gt, 4) + pad_bbox_flag (Tensor): Ground truth bbox mask, + 1 means bbox, 0 means no bbox, + shape(batch_size, num_gt, 1) + batch_size (int): Batch size. + num_gt (int): Number of ground truth. + Returns: + pos_mask (Tensor): Possible mask, + shape(batch_size, num_gt, num_priors) + alignment_metrics (Tensor): Alignment metrics, + shape(batch_size, num_gt, num_priors) + overlaps (Tensor): Overlaps of gt_bboxes and pred_bboxes, + shape(batch_size, num_gt, num_priors) + """ + + # Compute alignment metric between all bbox and gt + alignment_metrics, overlaps = \ + self.get_box_metrics(pred_bboxes, pred_scores, gt_labels, + gt_bboxes, batch_size, num_gt) + + # get is_in_gts mask + is_in_gts = select_candidates_in_gts(priors, gt_bboxes) + + # get topk_metric mask + topk_metric = self.select_topk_candidates( + alignment_metrics * is_in_gts, + topk_mask=pad_bbox_flag.repeat([1, 1, self.topk]).bool()) + + # merge all mask to a final mask + pos_mask = topk_metric * is_in_gts * pad_bbox_flag + + return pos_mask, alignment_metrics, overlaps + + def get_box_metrics(self, pred_bboxes: Tensor, pred_scores: Tensor, + gt_labels: Tensor, gt_bboxes: Tensor, batch_size: int, + num_gt: int) -> Tuple[Tensor, Tensor]: + """Compute alignment metric between all bbox and gt. + + Args: + pred_bboxes (Tensor): Predict bboxes, + shape(batch_size, num_priors, 4) + pred_scores (Tensor): Scores of predict bbox, + shape(batch_size, num_priors, num_classes) + gt_labels (Tensor): Ground true labels, + shape(batch_size, num_gt, 1) + gt_bboxes (Tensor): Ground true bboxes, + shape(batch_size, num_gt, 4) + batch_size (int): Batch size. + num_gt (int): Number of ground truth. 
+ Returns: + alignment_metrics (Tensor): Align metric, + shape(batch_size, num_gt, num_priors) + overlaps (Tensor): Overlaps, shape(batch_size, num_gt, num_priors) + """ + pred_scores = pred_scores.permute(0, 2, 1) + gt_labels = gt_labels.to(torch.long) + idx = torch.zeros([2, batch_size, num_gt], dtype=torch.long) + idx[0] = torch.arange(end=batch_size).view(-1, 1).repeat(1, num_gt) + idx[1] = gt_labels.squeeze(-1) + bbox_scores = pred_scores[idx[0], idx[1]] + # TODO: need to replace the yolov6_iou_calculator function + if self.use_ciou: + overlaps = bbox_overlaps( + pred_bboxes.unsqueeze(1), + gt_bboxes.unsqueeze(2), + iou_mode='ciou', + bbox_format='xyxy').clamp(0) + else: + overlaps = yolov6_iou_calculator(gt_bboxes, pred_bboxes) + + alignment_metrics = bbox_scores.pow(self.alpha) * overlaps.pow( + self.beta) + + return alignment_metrics, overlaps + + def select_topk_candidates(self, + alignment_gt_metrics: Tensor, + using_largest_topk: bool = True, + topk_mask: Optional[Tensor] = None) -> Tensor: + """Compute alignment metric between all bbox and gt. + + Args: + alignment_gt_metrics (Tensor): Alignment metric of gt candidates, + shape(batch_size, num_gt, num_priors) + using_largest_topk (bool): Controls whether to using largest or + smallest elements. + topk_mask (Tensor): Topk mask, + shape(batch_size, num_gt, self.topk) + Returns: + Tensor: Topk candidates mask, + shape(batch_size, num_gt, num_priors) + """ + num_priors = alignment_gt_metrics.shape[-1] + topk_metrics, topk_idxs = torch.topk( + alignment_gt_metrics, + self.topk, + axis=-1, + largest=using_largest_topk) + if topk_mask is None: + topk_mask = (topk_metrics.max(axis=-1, keepdim=True) > + self.eps).tile([1, 1, self.topk]) + topk_idxs = torch.where(topk_mask, topk_idxs, + torch.zeros_like(topk_idxs)) + is_in_topk = F.one_hot(topk_idxs, num_priors).sum(axis=-2) + is_in_topk = torch.where(is_in_topk > 1, torch.zeros_like(is_in_topk), + is_in_topk) + return is_in_topk.to(alignment_gt_metrics.dtype) + + def get_targets(self, gt_labels: Tensor, gt_bboxes: Tensor, + assigned_gt_idxs: Tensor, fg_mask_pre_prior: Tensor, + batch_size: int, + num_gt: int) -> Tuple[Tensor, Tensor, Tensor]: + """Get assigner info. + + Args: + gt_labels (Tensor): Ground true labels, + shape(batch_size, num_gt, 1) + gt_bboxes (Tensor): Ground true bboxes, + shape(batch_size, num_gt, 4) + assigned_gt_idxs (Tensor): Assigned ground truth indexes, + shape(batch_size, num_priors) + fg_mask_pre_prior (Tensor): Force ground truth matching mask, + shape(batch_size, num_priors) + batch_size (int): Batch size. + num_gt (int): Number of ground truth. 
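# Illustrative sketch (not part of the patch): the task-aligned metric computed
# in get_box_metrics above, alignment = cls_score**alpha * iou**beta, with the
# default alpha=1.0 and beta=6.0.
import torch

cls_score = torch.tensor([0.9, 0.9, 0.3])
iou = torch.tensor([0.8, 0.4, 0.9])
alignment = cls_score.pow(1.0) * iou.pow(6.0)   # ~[0.236, 0.004, 0.159]
# A prior must do well on *both* classification and localization to rank highly;
# the middle prior (high score but poor IoU) is heavily down-weighted.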
+ Returns: + assigned_labels (Tensor): Assigned labels, + shape(batch_size, num_priors) + assigned_bboxes (Tensor): Assigned bboxes, + shape(batch_size, num_priors) + assigned_scores (Tensor): Assigned scores, + shape(batch_size, num_priors) + """ + # assigned target labels + batch_ind = torch.arange( + end=batch_size, dtype=torch.int64, device=gt_labels.device)[..., + None] + assigned_gt_idxs = assigned_gt_idxs + batch_ind * num_gt + assigned_labels = gt_labels.long().flatten()[assigned_gt_idxs] + + # assigned target boxes + assigned_bboxes = gt_bboxes.reshape([-1, 4])[assigned_gt_idxs] + + # assigned target scores + assigned_labels[assigned_labels < 0] = 0 + assigned_scores = F.one_hot(assigned_labels, self.num_classes) + force_gt_scores_mask = fg_mask_pre_prior[:, :, None].repeat( + 1, 1, self.num_classes) + assigned_scores = torch.where(force_gt_scores_mask > 0, + assigned_scores, + torch.full_like(assigned_scores, 0)) + + return assigned_labels, assigned_bboxes, assigned_scores diff --git a/third_party/mmyolo/mmyolo/models/task_modules/assigners/batch_yolov7_assigner.py b/third_party/mmyolo/mmyolo/models/task_modules/assigners/batch_yolov7_assigner.py new file mode 100644 index 0000000000000000000000000000000000000000..6709968eeb1768fc4e6124f1f7a344f581dd43a7 --- /dev/null +++ b/third_party/mmyolo/mmyolo/models/task_modules/assigners/batch_yolov7_assigner.py @@ -0,0 +1,344 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Sequence + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmdet.structures.bbox import bbox_cxcywh_to_xyxy, bbox_overlaps + + +def _cat_multi_level_tensor_in_place(*multi_level_tensor, place_hold_var): + """concat multi-level tensor in place.""" + for level_tensor in multi_level_tensor: + for i, var in enumerate(level_tensor): + if len(var) > 0: + level_tensor[i] = torch.cat(var, dim=0) + else: + level_tensor[i] = place_hold_var + + +class BatchYOLOv7Assigner(nn.Module): + """Batch YOLOv7 Assigner. + + It consists of two assigning steps: + + 1. YOLOv5 cross-grid sample assigning + 2. SimOTA assigning + + This code referenced to + https://github.com/WongKinYiu/yolov7/blob/main/utils/loss.py. + + Args: + num_classes (int): Number of classes. + num_base_priors (int): Number of base priors. + featmap_strides (Sequence[int]): Feature map strides. + prior_match_thr (float): Threshold to match priors. + Defaults to 4.0. + candidate_topk (int): Number of topk candidates to + assign. Defaults to 10. + iou_weight (float): IOU weight. Defaults to 3.0. + cls_weight (float): Class weight. Defaults to 1.0. 
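# Illustrative sketch (not part of the patch): step 1 of the assigner above
# (YOLOv5 cross-grid sampling). With near_neighbor_thr = 0.5, besides its own
# grid cell a gt also marks the two adjacent cells its centre is closest to as
# positive; the real code additionally guards against the feature-map border.
cx, cy = 3.3, 5.8                    # gt centre in feature-map units
left = (cx % 1) < 0.5                # True  -> the cell to the left is also positive
up = (cy % 1) < 0.5                  # False
right = (1 - cx % 1) < 0.5           # False
bottom = (1 - cy % 1) < 0.5          # True  -> the cell below is also positive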
+ """ + + def __init__(self, + num_classes: int, + num_base_priors: int, + featmap_strides: Sequence[int], + prior_match_thr: float = 4.0, + candidate_topk: int = 10, + iou_weight: float = 3.0, + cls_weight: float = 1.0): + super().__init__() + self.num_classes = num_classes + self.num_base_priors = num_base_priors + self.featmap_strides = featmap_strides + # yolov5 param + self.prior_match_thr = prior_match_thr + # simota param + self.candidate_topk = candidate_topk + self.iou_weight = iou_weight + self.cls_weight = cls_weight + + @torch.no_grad() + def forward(self, + pred_results, + batch_targets_normed, + batch_input_shape, + priors_base_sizes, + grid_offset, + near_neighbor_thr=0.5) -> dict: + """Forward function.""" + # (num_base_priors, num_batch_gt, 7) + # 7 is mean (batch_idx, cls_id, x_norm, y_norm, + # w_norm, h_norm, prior_idx) + + # mlvl is mean multi_level + if batch_targets_normed.shape[1] == 0: + # empty gt of batch + num_levels = len(pred_results) + return dict( + mlvl_positive_infos=[pred_results[0].new_empty( + (0, 4))] * num_levels, + mlvl_priors=[] * num_levels, + mlvl_targets_normed=[] * num_levels) + + # if near_neighbor_thr = 0.5 are mean the nearest + # 3 neighbors are also considered positive samples. + # if near_neighbor_thr = 1.0 are mean the nearest + # 5 neighbors are also considered positive samples. + mlvl_positive_infos, mlvl_priors = self.yolov5_assigner( + pred_results, + batch_targets_normed, + priors_base_sizes, + grid_offset, + near_neighbor_thr=near_neighbor_thr) + + mlvl_positive_infos, mlvl_priors, \ + mlvl_targets_normed = self.simota_assigner( + pred_results, batch_targets_normed, mlvl_positive_infos, + mlvl_priors, batch_input_shape) + + place_hold_var = batch_targets_normed.new_empty((0, 4)) + _cat_multi_level_tensor_in_place( + mlvl_positive_infos, + mlvl_priors, + mlvl_targets_normed, + place_hold_var=place_hold_var) + + return dict( + mlvl_positive_infos=mlvl_positive_infos, + mlvl_priors=mlvl_priors, + mlvl_targets_normed=mlvl_targets_normed) + + def yolov5_assigner(self, + pred_results, + batch_targets_normed, + priors_base_sizes, + grid_offset, + near_neighbor_thr=0.5): + """YOLOv5 cross-grid sample assigner.""" + num_batch_gts = batch_targets_normed.shape[1] + assert num_batch_gts > 0 + + mlvl_positive_infos, mlvl_priors = [], [] + + scaled_factor = torch.ones(7, device=pred_results[0].device) + for i in range(len(pred_results)): # lever + priors_base_sizes_i = priors_base_sizes[i] + # (1, 1, feat_shape_w, feat_shape_h, feat_shape_w, feat_shape_h) + scaled_factor[2:6] = torch.tensor( + pred_results[i].shape)[[3, 2, 3, 2]] + + # Scale batch_targets from range 0-1 to range 0-features_maps size. + # (num_base_priors, num_batch_gts, 7) + batch_targets_scaled = batch_targets_normed * scaled_factor + + # Shape match + wh_ratio = batch_targets_scaled[..., + 4:6] / priors_base_sizes_i[:, None] + match_inds = torch.max( + wh_ratio, 1. 
/ wh_ratio).max(2)[0] < self.prior_match_thr + batch_targets_scaled = batch_targets_scaled[ + match_inds] # (num_matched_target, 7) + + # no gt bbox matches anchor + if batch_targets_scaled.shape[0] == 0: + mlvl_positive_infos.append( + batch_targets_scaled.new_empty((0, 4))) + mlvl_priors.append([]) + continue + + # Positive samples with additional neighbors + batch_targets_cxcy = batch_targets_scaled[:, 2:4] + grid_xy = scaled_factor[[2, 3]] - batch_targets_cxcy + left, up = ((batch_targets_cxcy % 1 < near_neighbor_thr) & + (batch_targets_cxcy > 1)).T + right, bottom = ((grid_xy % 1 < near_neighbor_thr) & + (grid_xy > 1)).T + offset_inds = torch.stack( + (torch.ones_like(left), left, up, right, bottom)) + batch_targets_scaled = batch_targets_scaled.repeat( + (5, 1, 1))[offset_inds] # () + retained_offsets = grid_offset.repeat(1, offset_inds.shape[1], + 1)[offset_inds] + + # batch_targets_scaled: (num_matched_target, 7) + # 7 is mean (batch_idx, cls_id, x_scaled, + # y_scaled, w_scaled, h_scaled, prior_idx) + + # mlvl_positive_info: (num_matched_target, 4) + # 4 is mean (batch_idx, prior_idx, x_scaled, y_scaled) + mlvl_positive_info = batch_targets_scaled[:, [0, 6, 2, 3]] + retained_offsets = retained_offsets * near_neighbor_thr + mlvl_positive_info[:, + 2:] = mlvl_positive_info[:, + 2:] - retained_offsets + mlvl_positive_info[:, 2].clamp_(0, scaled_factor[2] - 1) + mlvl_positive_info[:, 3].clamp_(0, scaled_factor[3] - 1) + mlvl_positive_info = mlvl_positive_info.long() + priors_inds = mlvl_positive_info[:, 1] + + mlvl_positive_infos.append(mlvl_positive_info) + mlvl_priors.append(priors_base_sizes_i[priors_inds]) + + return mlvl_positive_infos, mlvl_priors + + def simota_assigner(self, pred_results, batch_targets_normed, + mlvl_positive_infos, mlvl_priors, batch_input_shape): + """SimOTA assigner.""" + num_batch_gts = batch_targets_normed.shape[1] + assert num_batch_gts > 0 + num_levels = len(mlvl_positive_infos) + + mlvl_positive_infos_matched = [[] for _ in range(num_levels)] + mlvl_priors_matched = [[] for _ in range(num_levels)] + mlvl_targets_normed_matched = [[] for _ in range(num_levels)] + + for batch_idx in range(pred_results[0].shape[0]): + # (num_batch_gt, 7) + # 7 is mean (batch_idx, cls_id, x_norm, y_norm, + # w_norm, h_norm, prior_idx) + targets_normed = batch_targets_normed[0] + # (num_gt, 7) + targets_normed = targets_normed[targets_normed[:, 0] == batch_idx] + num_gts = targets_normed.shape[0] + + if num_gts == 0: + continue + + _mlvl_decoderd_bboxes = [] + _mlvl_obj_cls = [] + _mlvl_priors = [] + _mlvl_positive_infos = [] + _from_which_layer = [] + + for i, head_pred in enumerate(pred_results): + # (num_matched_target, 4) + # 4 is mean (batch_idx, prior_idx, grid_x, grid_y) + _mlvl_positive_info = mlvl_positive_infos[i] + if _mlvl_positive_info.shape[0] == 0: + continue + + idx = (_mlvl_positive_info[:, 0] == batch_idx) + _mlvl_positive_info = _mlvl_positive_info[idx] + _mlvl_positive_infos.append(_mlvl_positive_info) + + priors = mlvl_priors[i][idx] + _mlvl_priors.append(priors) + + _from_which_layer.append( + _mlvl_positive_info.new_full( + size=(_mlvl_positive_info.shape[0], ), fill_value=i)) + + # (n,85) + level_batch_idx, prior_ind, \ + grid_x, grid_y = _mlvl_positive_info.T + pred_positive = head_pred[level_batch_idx, prior_ind, grid_y, + grid_x] + _mlvl_obj_cls.append(pred_positive[:, 4:]) + + # decoded + grid = torch.stack([grid_x, grid_y], dim=1) + pred_positive_cxcy = (pred_positive[:, :2].sigmoid() * 2. 
- + 0.5 + grid) * self.featmap_strides[i] + pred_positive_wh = (pred_positive[:, 2:4].sigmoid() * 2) ** 2 \ + * priors * self.featmap_strides[i] + pred_positive_xywh = torch.cat( + [pred_positive_cxcy, pred_positive_wh], dim=-1) + _mlvl_decoderd_bboxes.append(pred_positive_xywh) + + if len(_mlvl_decoderd_bboxes) == 0: + continue + + # 1 calc pair_wise_iou_loss + _mlvl_decoderd_bboxes = torch.cat(_mlvl_decoderd_bboxes, dim=0) + num_pred_positive = _mlvl_decoderd_bboxes.shape[0] + + if num_pred_positive == 0: + continue + + # scaled xywh + batch_input_shape_wh = pred_results[0].new_tensor( + batch_input_shape[::-1]).repeat((1, 2)) + targets_scaled_bbox = targets_normed[:, 2:6] * batch_input_shape_wh + + targets_scaled_bbox = bbox_cxcywh_to_xyxy(targets_scaled_bbox) + _mlvl_decoderd_bboxes = bbox_cxcywh_to_xyxy(_mlvl_decoderd_bboxes) + pair_wise_iou = bbox_overlaps(targets_scaled_bbox, + _mlvl_decoderd_bboxes) + pair_wise_iou_loss = -torch.log(pair_wise_iou + 1e-8) + + # 2 calc pair_wise_cls_loss + _mlvl_obj_cls = torch.cat(_mlvl_obj_cls, dim=0).float().sigmoid() + _mlvl_positive_infos = torch.cat(_mlvl_positive_infos, dim=0) + _from_which_layer = torch.cat(_from_which_layer, dim=0) + _mlvl_priors = torch.cat(_mlvl_priors, dim=0) + + gt_cls_per_image = ( + F.one_hot(targets_normed[:, 1].to(torch.int64), + self.num_classes).float().unsqueeze(1).repeat( + 1, num_pred_positive, 1)) + # cls_score * obj + cls_preds_ = _mlvl_obj_cls[:, 1:]\ + .unsqueeze(0)\ + .repeat(num_gts, 1, 1) \ + * _mlvl_obj_cls[:, 0:1]\ + .unsqueeze(0).repeat(num_gts, 1, 1) + y = cls_preds_.sqrt_() + pair_wise_cls_loss = F.binary_cross_entropy_with_logits( + torch.log(y / (1 - y)), gt_cls_per_image, + reduction='none').sum(-1) + del cls_preds_ + + # calc cost + cost = ( + self.cls_weight * pair_wise_cls_loss + + self.iou_weight * pair_wise_iou_loss) + + # num_gt, num_match_pred + matching_matrix = torch.zeros_like(cost) + + top_k, _ = torch.topk( + pair_wise_iou, + min(self.candidate_topk, pair_wise_iou.shape[1]), + dim=1) + dynamic_ks = torch.clamp(top_k.sum(1).int(), min=1) + + # Select only topk matches per gt + for gt_idx in range(num_gts): + _, pos_idx = torch.topk( + cost[gt_idx], k=dynamic_ks[gt_idx].item(), largest=False) + matching_matrix[gt_idx][pos_idx] = 1.0 + del top_k, dynamic_ks + + # Each prediction box can match at most one gt box, + # and if there are more than one, + # only the least costly one can be taken + anchor_matching_gt = matching_matrix.sum(0) + if (anchor_matching_gt > 1).sum() > 0: + _, cost_argmin = torch.min( + cost[:, anchor_matching_gt > 1], dim=0) + matching_matrix[:, anchor_matching_gt > 1] *= 0.0 + matching_matrix[cost_argmin, anchor_matching_gt > 1] = 1.0 + fg_mask_inboxes = matching_matrix.sum(0) > 0.0 + matched_gt_inds = matching_matrix[:, fg_mask_inboxes].argmax(0) + + targets_normed = targets_normed[matched_gt_inds] + _mlvl_positive_infos = _mlvl_positive_infos[fg_mask_inboxes] + _from_which_layer = _from_which_layer[fg_mask_inboxes] + _mlvl_priors = _mlvl_priors[fg_mask_inboxes] + + # Rearranged in the order of the prediction layers + # to facilitate loss + for i in range(num_levels): + layer_idx = _from_which_layer == i + mlvl_positive_infos_matched[i].append( + _mlvl_positive_infos[layer_idx]) + mlvl_priors_matched[i].append(_mlvl_priors[layer_idx]) + mlvl_targets_normed_matched[i].append( + targets_normed[layer_idx]) + + results = mlvl_positive_infos_matched, \ + mlvl_priors_matched, \ + mlvl_targets_normed_matched + return results diff --git 
a/third_party/mmyolo/mmyolo/models/task_modules/assigners/pose_sim_ota_assigner.py b/third_party/mmyolo/mmyolo/models/task_modules/assigners/pose_sim_ota_assigner.py new file mode 100644 index 0000000000000000000000000000000000000000..e66a9bf157aceceadb2f228cbbcb3ff1ddc00196 --- /dev/null +++ b/third_party/mmyolo/mmyolo/models/task_modules/assigners/pose_sim_ota_assigner.py @@ -0,0 +1,210 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Tuple + +import torch +import torch.nn.functional as F +from mmdet.models.task_modules.assigners import AssignResult, SimOTAAssigner +from mmdet.utils import ConfigType +from mmengine.structures import InstanceData +from torch import Tensor + +from mmyolo.registry import MODELS, TASK_UTILS + +INF = 100000.0 +EPS = 1.0e-7 + + +@TASK_UTILS.register_module() +class PoseSimOTAAssigner(SimOTAAssigner): + + def __init__(self, + center_radius: float = 2.5, + candidate_topk: int = 10, + iou_weight: float = 3.0, + cls_weight: float = 1.0, + oks_weight: float = 0.0, + vis_weight: float = 0.0, + iou_calculator: ConfigType = dict(type='BboxOverlaps2D'), + oks_calculator: ConfigType = dict(type='OksLoss')): + + self.center_radius = center_radius + self.candidate_topk = candidate_topk + self.iou_weight = iou_weight + self.cls_weight = cls_weight + self.oks_weight = oks_weight + self.vis_weight = vis_weight + + self.iou_calculator = TASK_UTILS.build(iou_calculator) + self.oks_calculator = MODELS.build(oks_calculator) + + def assign(self, + pred_instances: InstanceData, + gt_instances: InstanceData, + gt_instances_ignore: Optional[InstanceData] = None, + **kwargs) -> AssignResult: + """Assign gt to priors using SimOTA. + + Args: + pred_instances (:obj:`InstanceData`): Instances of model + predictions. It includes ``priors``, and the priors can + be anchors or points, or the bboxes predicted by the + previous stage, has shape (n, 4). The bboxes predicted by + the current model or stage will be named ``bboxes``, + ``labels``, and ``scores``, the same as the ``InstanceData`` + in other places. + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It usually includes ``bboxes``, with shape (k, 4), + and ``labels``, with shape (k, ). + gt_instances_ignore (:obj:`InstanceData`, optional): Instances + to be ignored during training. It includes ``bboxes`` + attribute data that is ignored during training and testing. + Defaults to None. + Returns: + obj:`AssignResult`: The assigned result. 
+ """ + gt_bboxes = gt_instances.bboxes + gt_labels = gt_instances.labels + gt_keypoints = gt_instances.keypoints + gt_keypoints_visible = gt_instances.keypoints_visible + num_gt = gt_bboxes.size(0) + + decoded_bboxes = pred_instances.bboxes[..., :4] + pred_kpts = pred_instances.bboxes[..., 4:] + pred_kpts = pred_kpts.reshape(*pred_kpts.shape[:-1], -1, 3) + pred_kpts_vis = pred_kpts[..., -1] + pred_kpts = pred_kpts[..., :2] + pred_scores = pred_instances.scores + priors = pred_instances.priors + num_bboxes = decoded_bboxes.size(0) + + # assign 0 by default + assigned_gt_inds = decoded_bboxes.new_full((num_bboxes, ), + 0, + dtype=torch.long) + if num_gt == 0 or num_bboxes == 0: + # No ground truth or boxes, return empty assignment + max_overlaps = decoded_bboxes.new_zeros((num_bboxes, )) + assigned_labels = decoded_bboxes.new_full((num_bboxes, ), + -1, + dtype=torch.long) + return AssignResult( + num_gt, assigned_gt_inds, max_overlaps, labels=assigned_labels) + + valid_mask, is_in_boxes_and_center = self.get_in_gt_and_in_center_info( + priors, gt_bboxes) + valid_decoded_bbox = decoded_bboxes[valid_mask] + valid_pred_scores = pred_scores[valid_mask] + valid_pred_kpts = pred_kpts[valid_mask] + valid_pred_kpts_vis = pred_kpts_vis[valid_mask] + num_valid = valid_decoded_bbox.size(0) + if num_valid == 0: + # No valid bboxes, return empty assignment + max_overlaps = decoded_bboxes.new_zeros((num_bboxes, )) + assigned_labels = decoded_bboxes.new_full((num_bboxes, ), + -1, + dtype=torch.long) + return AssignResult( + num_gt, assigned_gt_inds, max_overlaps, labels=assigned_labels) + + cost_matrix = (~is_in_boxes_and_center) * INF + + # calculate iou + pairwise_ious = self.iou_calculator(valid_decoded_bbox, gt_bboxes) + if self.iou_weight > 0: + iou_cost = -torch.log(pairwise_ious + EPS) + cost_matrix = cost_matrix + iou_cost * self.iou_weight + + # calculate oks + pairwise_oks = self.oks_calculator.compute_oks( + valid_pred_kpts.unsqueeze(1), # [num_valid, -1, k, 2] + gt_keypoints.unsqueeze(0), # [1, num_gt, k, 2] + gt_keypoints_visible.unsqueeze(0), # [1, num_gt, k] + bboxes=gt_bboxes.unsqueeze(0), # [1, num_gt, 4] + ) # -> [num_valid, num_gt] + if self.oks_weight > 0: + oks_cost = -torch.log(pairwise_oks + EPS) + cost_matrix = cost_matrix + oks_cost * self.oks_weight + + # calculate cls + if self.cls_weight > 0: + gt_onehot_label = ( + F.one_hot(gt_labels.to(torch.int64), + pred_scores.shape[-1]).float().unsqueeze(0).repeat( + num_valid, 1, 1)) + + valid_pred_scores = valid_pred_scores.unsqueeze(1).repeat( + 1, num_gt, 1) + # disable AMP autocast to avoid overflow + with torch.cuda.amp.autocast(enabled=False): + cls_cost = ( + F.binary_cross_entropy( + valid_pred_scores.to(dtype=torch.float32), + gt_onehot_label, + reduction='none', + ).sum(-1).to(dtype=valid_pred_scores.dtype)) + cost_matrix = cost_matrix + cls_cost * self.cls_weight + + # calculate vis + if self.vis_weight > 0: + valid_pred_kpts_vis = valid_pred_kpts_vis.sigmoid().unsqueeze( + 1).repeat(1, num_gt, 1) # [num_valid, 1, k] + gt_kpt_vis = gt_keypoints_visible.unsqueeze( + 0).float() # [1, num_gt, k] + with torch.cuda.amp.autocast(enabled=False): + vis_cost = ( + F.binary_cross_entropy( + valid_pred_kpts_vis.to(dtype=torch.float32), + gt_kpt_vis.repeat(num_valid, 1, 1), + reduction='none', + ).sum(-1).to(dtype=valid_pred_kpts_vis.dtype)) + cost_matrix = cost_matrix + vis_cost * self.vis_weight + + # mixed metric + pairwise_oks = pairwise_oks.pow(0.5) + matched_pred_oks, matched_gt_inds = \ + self.dynamic_k_matching( + cost_matrix, 
pairwise_ious, pairwise_oks, num_gt, valid_mask) + + # convert to AssignResult format + assigned_gt_inds[valid_mask] = matched_gt_inds + 1 + assigned_labels = assigned_gt_inds.new_full((num_bboxes, ), -1) + assigned_labels[valid_mask] = gt_labels[matched_gt_inds].long() + max_overlaps = assigned_gt_inds.new_full((num_bboxes, ), + -INF, + dtype=torch.float32) + max_overlaps[valid_mask] = matched_pred_oks + return AssignResult( + num_gt, assigned_gt_inds, max_overlaps, labels=assigned_labels) + + def dynamic_k_matching(self, cost: Tensor, pairwise_ious: Tensor, + pairwise_oks: Tensor, num_gt: int, + valid_mask: Tensor) -> Tuple[Tensor, Tensor]: + """Use IoU and matching cost to calculate the dynamic top-k positive + targets.""" + matching_matrix = torch.zeros_like(cost, dtype=torch.uint8) + # select candidate topk ious for dynamic-k calculation + candidate_topk = min(self.candidate_topk, pairwise_ious.size(0)) + topk_ious, _ = torch.topk(pairwise_ious, candidate_topk, dim=0) + # calculate dynamic k for each gt + dynamic_ks = torch.clamp(topk_ious.sum(0).int(), min=1) + for gt_idx in range(num_gt): + _, pos_idx = torch.topk( + cost[:, gt_idx], k=dynamic_ks[gt_idx], largest=False) + matching_matrix[:, gt_idx][pos_idx] = 1 + + del topk_ious, dynamic_ks, pos_idx + + prior_match_gt_mask = matching_matrix.sum(1) > 1 + if prior_match_gt_mask.sum() > 0: + cost_min, cost_argmin = torch.min( + cost[prior_match_gt_mask, :], dim=1) + matching_matrix[prior_match_gt_mask, :] *= 0 + matching_matrix[prior_match_gt_mask, cost_argmin] = 1 + # get foreground mask inside box and center prior + fg_mask_inboxes = matching_matrix.sum(1) > 0 + valid_mask[valid_mask.clone()] = fg_mask_inboxes + + matched_gt_inds = matching_matrix[fg_mask_inboxes, :].argmax(1) + matched_pred_oks = (matching_matrix * + pairwise_oks).sum(1)[fg_mask_inboxes] + return matched_pred_oks, matched_gt_inds diff --git a/third_party/mmyolo/mmyolo/models/task_modules/assigners/utils.py b/third_party/mmyolo/mmyolo/models/task_modules/assigners/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..5843200936ef7a269109517e6d2952cceea02059 --- /dev/null +++ b/third_party/mmyolo/mmyolo/models/task_modules/assigners/utils.py @@ -0,0 +1,110 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +from typing import Tuple + +import torch +import torch.nn.functional as F +from torch import Tensor + + +def select_candidates_in_gts(priors_points: Tensor, + gt_bboxes: Tensor, + eps: float = 1e-9) -> Tensor: + """Select the positive priors' center in gt. + + Args: + priors_points (Tensor): Model priors points, + shape(num_priors, 2) + gt_bboxes (Tensor): Ground true bboxes, + shape(batch_size, num_gt, 4) + eps (float): Default to 1e-9. 
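# Illustrative sketch (not part of the patch): how PoseSimOTAAssigner.assign
# above assembles its matching cost per (prior, gt) pair, using default-style
# weights (cls 1.0, iou 3.0, oks 0.0, vis 0.0); toy numbers for two candidate
# priors against one gt.
import torch

EPS = 1.0e-7
iou_cost = -torch.log(torch.tensor([0.7, 0.3]) + EPS)
oks_cost = -torch.log(torch.tensor([0.6, 0.2]) + EPS)
cls_cost = torch.tensor([0.4, 1.2])
cost = 1.0 * cls_cost + 3.0 * iou_cost + 0.0 * oks_cost
# Priors outside the gt box-and-centre region get an extra +INF, so the
# dynamic-k matching step never selects them.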
+ Return: + (Tensor): shape(batch_size, num_gt, num_priors) + """ + batch_size, num_gt, _ = gt_bboxes.size() + gt_bboxes = gt_bboxes.reshape([-1, 4]) + + priors_number = priors_points.size(0) + priors_points = priors_points.unsqueeze(0).repeat(batch_size * num_gt, 1, + 1) + + # calculate the left, top, right, bottom distance between positive + # prior center and gt side + gt_bboxes_lt = gt_bboxes[:, 0:2].unsqueeze(1).repeat(1, priors_number, 1) + gt_bboxes_rb = gt_bboxes[:, 2:4].unsqueeze(1).repeat(1, priors_number, 1) + bbox_deltas = torch.cat( + [priors_points - gt_bboxes_lt, gt_bboxes_rb - priors_points], dim=-1) + bbox_deltas = bbox_deltas.reshape([batch_size, num_gt, priors_number, -1]) + + return (bbox_deltas.min(axis=-1)[0] > eps).to(gt_bboxes.dtype) + + +def select_highest_overlaps(pos_mask: Tensor, overlaps: Tensor, + num_gt: int) -> Tuple[Tensor, Tensor, Tensor]: + """If an anchor box is assigned to multiple gts, the one with the highest + iou will be selected. + + Args: + pos_mask (Tensor): The assigned positive sample mask, + shape(batch_size, num_gt, num_priors) + overlaps (Tensor): IoU between all bbox and ground truth, + shape(batch_size, num_gt, num_priors) + num_gt (int): Number of ground truth. + Return: + gt_idx_pre_prior (Tensor): Target ground truth index, + shape(batch_size, num_priors) + fg_mask_pre_prior (Tensor): Force matching ground truth, + shape(batch_size, num_priors) + pos_mask (Tensor): The assigned positive sample mask, + shape(batch_size, num_gt, num_priors) + """ + fg_mask_pre_prior = pos_mask.sum(axis=-2) + + # Make sure the positive sample matches the only one and is the largest IoU + if fg_mask_pre_prior.max() > 1: + mask_multi_gts = (fg_mask_pre_prior.unsqueeze(1) > 1).repeat( + [1, num_gt, 1]) + index = overlaps.argmax(axis=1) + is_max_overlaps = F.one_hot(index, num_gt) + is_max_overlaps = \ + is_max_overlaps.permute(0, 2, 1).to(overlaps.dtype) + + pos_mask = torch.where(mask_multi_gts, is_max_overlaps, pos_mask) + fg_mask_pre_prior = pos_mask.sum(axis=-2) + + gt_idx_pre_prior = pos_mask.argmax(axis=-2) + return gt_idx_pre_prior, fg_mask_pre_prior, pos_mask + + +# TODO:'mmdet.BboxOverlaps2D' will cause gradient inconsistency, +# which will be found and solved in a later version. +def yolov6_iou_calculator(bbox1: Tensor, + bbox2: Tensor, + eps: float = 1e-9) -> Tensor: + """Calculate iou for batch. + + Args: + bbox1 (Tensor): shape(batch size, num_gt, 4) + bbox2 (Tensor): shape(batch size, num_priors, 4) + eps (float): Default to 1e-9. 
+ Return: + (Tensor): IoU, shape(size, num_gt, num_priors) + """ + bbox1 = bbox1.unsqueeze(2) # [N, M1, 4] -> [N, M1, 1, 4] + bbox2 = bbox2.unsqueeze(1) # [N, M2, 4] -> [N, 1, M2, 4] + + # calculate xy info of predict and gt bbox + bbox1_x1y1, bbox1_x2y2 = bbox1[:, :, :, 0:2], bbox1[:, :, :, 2:4] + bbox2_x1y1, bbox2_x2y2 = bbox2[:, :, :, 0:2], bbox2[:, :, :, 2:4] + + # calculate overlap area + overlap = (torch.minimum(bbox1_x2y2, bbox2_x2y2) - + torch.maximum(bbox1_x1y1, bbox2_x1y1)).clip(0).prod(-1) + + # calculate bbox area + bbox1_area = (bbox1_x2y2 - bbox1_x1y1).clip(0).prod(-1) + bbox2_area = (bbox2_x2y2 - bbox2_x1y1).clip(0).prod(-1) + + union = bbox1_area + bbox2_area - overlap + eps + + return overlap / union diff --git a/third_party/mmyolo/mmyolo/models/task_modules/coders/__init__.py b/third_party/mmyolo/mmyolo/models/task_modules/coders/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..75b6e7d6b30afd3de21c738dfc8e75df2eae7120 --- /dev/null +++ b/third_party/mmyolo/mmyolo/models/task_modules/coders/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .distance_angle_point_coder import DistanceAnglePointCoder +from .distance_point_bbox_coder import DistancePointBBoxCoder +from .yolov5_bbox_coder import YOLOv5BBoxCoder +from .yolox_bbox_coder import YOLOXBBoxCoder + +__all__ = [ + 'YOLOv5BBoxCoder', 'YOLOXBBoxCoder', 'DistancePointBBoxCoder', + 'DistanceAnglePointCoder' +] diff --git a/third_party/mmyolo/mmyolo/models/task_modules/coders/distance_angle_point_coder.py b/third_party/mmyolo/mmyolo/models/task_modules/coders/distance_angle_point_coder.py new file mode 100644 index 0000000000000000000000000000000000000000..a7e322f94725ee548c9b261be6f5bae2f3d9b4d9 --- /dev/null +++ b/third_party/mmyolo/mmyolo/models/task_modules/coders/distance_angle_point_coder.py @@ -0,0 +1,94 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Sequence, Union + +import torch + +from mmyolo.registry import TASK_UTILS + +try: + from mmrotate.models.task_modules.coders import \ + DistanceAnglePointCoder as MMROTATE_DistanceAnglePointCoder + MMROTATE_AVAILABLE = True +except ImportError: + from mmdet.models.task_modules.coders import BaseBBoxCoder + MMROTATE_DistanceAnglePointCoder = BaseBBoxCoder + MMROTATE_AVAILABLE = False + + +@TASK_UTILS.register_module() +class DistanceAnglePointCoder(MMROTATE_DistanceAnglePointCoder): + """Distance Angle Point BBox coder. + + This coder encodes gt bboxes (x, y, w, h, theta) into (top, bottom, left, + right, theta) and decode it back to the original. + """ + + def __init__(self, clip_border=True, angle_version='oc'): + if not MMROTATE_AVAILABLE: + raise ImportError( + 'Please run "mim install -r requirements/mmrotate.txt" ' + 'to install mmrotate first for rotated detection.') + + super().__init__(clip_border=clip_border, angle_version=angle_version) + + def decode( + self, + points: torch.Tensor, + pred_bboxes: torch.Tensor, + stride: torch.Tensor, + max_shape: Optional[Union[Sequence[int], torch.Tensor, + Sequence[Sequence[int]]]] = None, + ) -> torch.Tensor: + """Decode distance prediction to bounding box. + + Args: + points (Tensor): Shape (B, N, 2) or (N, 2). + pred_bboxes (Tensor): Distance from the given point to 4 + boundaries and angle (left, top, right, bottom, angle). + Shape (B, N, 5) or (N, 5) + max_shape (Sequence[int] or torch.Tensor or Sequence[ + Sequence[int]],optional): Maximum bounds for boxes, specifies + (H, W, C) or (H, W). 
If priors shape is (B, N, 4), then + the max_shape should be a Sequence[Sequence[int]], + and the length of max_shape should also be B. + Default None. + Returns: + Tensor: Boxes with shape (N, 5) or (B, N, 5) + """ + assert points.size(-2) == pred_bboxes.size(-2) + assert points.size(-1) == 2 + assert pred_bboxes.size(-1) == 5 + if self.clip_border is False: + max_shape = None + + if pred_bboxes.dim() == 2: + stride = stride[:, None] + else: + stride = stride[None, :, None] + pred_bboxes[..., :4] = pred_bboxes[..., :4] * stride + + return self.distance2obb(points, pred_bboxes, max_shape, + self.angle_version) + + def encode(self, + points: torch.Tensor, + gt_bboxes: torch.Tensor, + max_dis: float = 16., + eps: float = 0.01) -> torch.Tensor: + """Encode bounding box to distances. + + Args: + points (Tensor): Shape (N, 2), The format is [x, y]. + gt_bboxes (Tensor): Shape (N, 5), The format is "xywha" + max_dis (float): Upper bound of the distance. Default None. + eps (float): a small value to ensure target < max_dis, instead <=. + Default 0.1. + + Returns: + Tensor: Box transformation deltas. The shape is (N, 5). + """ + + assert points.size(-2) == gt_bboxes.size(-2) + assert points.size(-1) == 2 + assert gt_bboxes.size(-1) == 5 + return self.obb2distance(points, gt_bboxes, max_dis, eps) diff --git a/third_party/mmyolo/mmyolo/models/task_modules/coders/distance_point_bbox_coder.py b/third_party/mmyolo/mmyolo/models/task_modules/coders/distance_point_bbox_coder.py new file mode 100644 index 0000000000000000000000000000000000000000..16417b8ab209c57880cfcfe0ba2a955e78c0a3f0 --- /dev/null +++ b/third_party/mmyolo/mmyolo/models/task_modules/coders/distance_point_bbox_coder.py @@ -0,0 +1,79 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Sequence, Union + +import torch +from mmdet.models.task_modules.coders import \ + DistancePointBBoxCoder as MMDET_DistancePointBBoxCoder +from mmdet.structures.bbox import bbox2distance, distance2bbox + +from mmyolo.registry import TASK_UTILS + + +@TASK_UTILS.register_module() +class DistancePointBBoxCoder(MMDET_DistancePointBBoxCoder): + """Distance Point BBox coder. + + This coder encodes gt bboxes (x1, y1, x2, y2) into (top, bottom, left, + right) and decode it back to the original. + """ + + def decode( + self, + points: torch.Tensor, + pred_bboxes: torch.Tensor, + stride: torch.Tensor, + max_shape: Optional[Union[Sequence[int], torch.Tensor, + Sequence[Sequence[int]]]] = None + ) -> torch.Tensor: + """Decode distance prediction to bounding box. + + Args: + points (Tensor): Shape (B, N, 2) or (N, 2). + pred_bboxes (Tensor): Distance from the given point to 4 + boundaries (left, top, right, bottom). Shape (B, N, 4) + or (N, 4) + stride (Tensor): Featmap stride. + max_shape (Sequence[int] or torch.Tensor or Sequence[ + Sequence[int]],optional): Maximum bounds for boxes, specifies + (H, W, C) or (H, W). If priors shape is (B, N, 4), then + the max_shape should be a Sequence[Sequence[int]], + and the length of max_shape should also be B. + Default None. 
+ Returns: + Tensor: Boxes with shape (N, 4) or (B, N, 4) + """ + assert points.size(-2) == pred_bboxes.size(-2) + assert points.size(-1) == 2 + assert pred_bboxes.size(-1) == 4 + if self.clip_border is False: + max_shape = None + + pred_bboxes = pred_bboxes * stride[None, :, None] + + return distance2bbox(points, pred_bboxes, max_shape) + + def encode(self, + points: torch.Tensor, + gt_bboxes: torch.Tensor, + max_dis: float = 16., + eps: float = 0.01) -> torch.Tensor: + """Encode bounding box to distances. The rewrite is to support batch + operations. + + Args: + points (Tensor): Shape (B, N, 2) or (N, 2), The format is [x, y]. + gt_bboxes (Tensor or :obj:`BaseBoxes`): Shape (N, 4), The format + is "xyxy" + max_dis (float): Upper bound of the distance. Default to 16.. + eps (float): a small value to ensure target < max_dis, instead <=. + Default 0.01. + + Returns: + Tensor: Box transformation deltas. The shape is (N, 4) or + (B, N, 4). + """ + + assert points.size(-2) == gt_bboxes.size(-2) + assert points.size(-1) == 2 + assert gt_bboxes.size(-1) == 4 + return bbox2distance(points, gt_bboxes, max_dis, eps) diff --git a/third_party/mmyolo/mmyolo/models/task_modules/coders/yolov5_bbox_coder.py b/third_party/mmyolo/mmyolo/models/task_modules/coders/yolov5_bbox_coder.py new file mode 100644 index 0000000000000000000000000000000000000000..bab5f0e0fe06c1930497bdece7c7a06636fe9c37 --- /dev/null +++ b/third_party/mmyolo/mmyolo/models/task_modules/coders/yolov5_bbox_coder.py @@ -0,0 +1,55 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Union + +import torch +from mmdet.models.task_modules.coders.base_bbox_coder import BaseBBoxCoder + +from mmyolo.registry import TASK_UTILS + + +@TASK_UTILS.register_module() +class YOLOv5BBoxCoder(BaseBBoxCoder): + """YOLOv5 BBox coder. + + This decoder decodes pred bboxes (delta_x, delta_x, w, h) to bboxes (tl_x, + tl_y, br_x, br_y). + """ + + def encode(self, **kwargs): + """Encode deltas between bboxes and ground truth boxes.""" + pass + + def decode(self, priors: torch.Tensor, pred_bboxes: torch.Tensor, + stride: Union[torch.Tensor, int]) -> torch.Tensor: + """Decode regression results (delta_x, delta_x, w, h) to bboxes (tl_x, + tl_y, br_x, br_y). + + Args: + priors (torch.Tensor): Basic boxes or points, e.g. anchors. + pred_bboxes (torch.Tensor): Encoded boxes with shape + stride (torch.Tensor | int): Strides of bboxes. + + Returns: + torch.Tensor: Decoded boxes. 
+ """ + assert pred_bboxes.size(-1) == priors.size(-1) == 4 + + pred_bboxes = pred_bboxes.sigmoid() + + x_center = (priors[..., 0] + priors[..., 2]) * 0.5 + y_center = (priors[..., 1] + priors[..., 3]) * 0.5 + w = priors[..., 2] - priors[..., 0] + h = priors[..., 3] - priors[..., 1] + + # The anchor of mmdet has been offset by 0.5 + x_center_pred = (pred_bboxes[..., 0] - 0.5) * 2 * stride + x_center + y_center_pred = (pred_bboxes[..., 1] - 0.5) * 2 * stride + y_center + w_pred = (pred_bboxes[..., 2] * 2)**2 * w + h_pred = (pred_bboxes[..., 3] * 2)**2 * h + + decoded_bboxes = torch.stack( + (x_center_pred - w_pred / 2, y_center_pred - h_pred / 2, + x_center_pred + w_pred / 2, y_center_pred + h_pred / 2), + dim=-1) + + return decoded_bboxes diff --git a/third_party/mmyolo/mmyolo/models/task_modules/coders/yolox_bbox_coder.py b/third_party/mmyolo/mmyolo/models/task_modules/coders/yolox_bbox_coder.py new file mode 100644 index 0000000000000000000000000000000000000000..02c898d814e89e5c8ef4db792831a7ba80c7c0cc --- /dev/null +++ b/third_party/mmyolo/mmyolo/models/task_modules/coders/yolox_bbox_coder.py @@ -0,0 +1,45 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Union + +import torch +from mmdet.models.task_modules.coders.base_bbox_coder import BaseBBoxCoder + +from mmyolo.registry import TASK_UTILS + + +@TASK_UTILS.register_module() +class YOLOXBBoxCoder(BaseBBoxCoder): + """YOLOX BBox coder. + + This decoder decodes pred bboxes (delta_x, delta_x, w, h) to bboxes (tl_x, + tl_y, br_x, br_y). + """ + + def encode(self, **kwargs): + """Encode deltas between bboxes and ground truth boxes.""" + pass + + def decode(self, priors: torch.Tensor, pred_bboxes: torch.Tensor, + stride: Union[torch.Tensor, int]) -> torch.Tensor: + """Decode regression results (delta_x, delta_x, w, h) to bboxes (tl_x, + tl_y, br_x, br_y). + + Args: + priors (torch.Tensor): Basic boxes or points, e.g. anchors. + pred_bboxes (torch.Tensor): Encoded boxes with shape + stride (torch.Tensor | int): Strides of bboxes. + + Returns: + torch.Tensor: Decoded boxes. + """ + stride = stride[None, :, None] + xys = (pred_bboxes[..., :2] * stride) + priors + whs = pred_bboxes[..., 2:].exp() * stride + + tl_x = (xys[..., 0] - whs[..., 0] / 2) + tl_y = (xys[..., 1] - whs[..., 1] / 2) + br_x = (xys[..., 0] + whs[..., 0] / 2) + br_y = (xys[..., 1] + whs[..., 1] / 2) + + decoded_bboxes = torch.stack([tl_x, tl_y, br_x, br_y], -1) + return decoded_bboxes diff --git a/third_party/mmyolo/mmyolo/models/utils/__init__.py b/third_party/mmyolo/mmyolo/models/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d62ff80e25ea5adad8524fd6f756f1db5e4de4d5 --- /dev/null +++ b/third_party/mmyolo/mmyolo/models/utils/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .misc import (OutputSaveFunctionWrapper, OutputSaveObjectWrapper, + gt_instances_preprocess, make_divisible, make_round) + +__all__ = [ + 'make_divisible', 'make_round', 'gt_instances_preprocess', + 'OutputSaveFunctionWrapper', 'OutputSaveObjectWrapper' +] diff --git a/third_party/mmyolo/mmyolo/models/utils/misc.py b/third_party/mmyolo/mmyolo/models/utils/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..96cd1195aefb2fbf5db7535be785dae2fab4add9 --- /dev/null +++ b/third_party/mmyolo/mmyolo/models/utils/misc.py @@ -0,0 +1,186 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
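+# NOTE (editorial, illustrative only): the scaling helpers defined below are
+# used to rescale channel and block counts when building YOLO models of
+# different sizes. Assuming the default divisor of 8:
+#   make_divisible(128, widen_factor=0.5) -> 64   (64 is already a multiple of 8)
+#   make_divisible(100, widen_factor=0.5) -> 56   (50 rounded up to the next multiple of 8)
+#   make_round(3, deepen_factor=0.33)     -> 1    (never below 1 when x > 1)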
+import math +from collections import defaultdict +from copy import deepcopy +from typing import Any, Callable, Dict, Optional, Sequence, Tuple, Union + +import torch +from mmdet.structures.bbox.transforms import get_box_tensor +from torch import Tensor + + +def make_divisible(x: float, + widen_factor: float = 1.0, + divisor: int = 8) -> int: + """Make sure that x*widen_factor is divisible by divisor.""" + return math.ceil(x * widen_factor / divisor) * divisor + + +def make_round(x: float, deepen_factor: float = 1.0) -> int: + """Make sure that x*deepen_factor becomes an integer not less than 1.""" + return max(round(x * deepen_factor), 1) if x > 1 else x + + +def gt_instances_preprocess(batch_gt_instances: Union[Tensor, Sequence], + batch_size: int) -> Tensor: + """Split batch_gt_instances with batch size. + + From [all_gt_bboxes, box_dim+2] to [batch_size, number_gt, box_dim+1]. + For horizontal box, box_dim=4, for rotated box, box_dim=5 + + If some shape of single batch smaller than + gt bbox len, then using zeros to fill. + + Args: + batch_gt_instances (Sequence[Tensor]): Ground truth + instances for whole batch, shape [all_gt_bboxes, box_dim+2] + batch_size (int): Batch size. + + Returns: + Tensor: batch gt instances data, shape + [batch_size, number_gt, box_dim+1] + """ + if isinstance(batch_gt_instances, Sequence): + max_gt_bbox_len = max( + [len(gt_instances) for gt_instances in batch_gt_instances]) + # fill zeros with length box_dim+1 if some shape of + # single batch not equal max_gt_bbox_len + batch_instance_list = [] + for index, gt_instance in enumerate(batch_gt_instances): + bboxes = gt_instance.bboxes + labels = gt_instance.labels + box_dim = get_box_tensor(bboxes).size(-1) + batch_instance_list.append( + torch.cat((labels[:, None], bboxes), dim=-1)) + + if bboxes.shape[0] >= max_gt_bbox_len: + continue + + fill_tensor = bboxes.new_full( + [max_gt_bbox_len - bboxes.shape[0], box_dim + 1], 0) + batch_instance_list[index] = torch.cat( + (batch_instance_list[index], fill_tensor), dim=0) + + return torch.stack(batch_instance_list) + else: + # faster version + # format of batch_gt_instances: [img_ind, cls_ind, (box)] + # For example horizontal box should be: + # [img_ind, cls_ind, x1, y1, x2, y2] + # Rotated box should be + # [img_ind, cls_ind, x, y, w, h, a] + + # sqlit batch gt instance [all_gt_bboxes, box_dim+2] -> + # [batch_size, max_gt_bbox_len, box_dim+1] + assert isinstance(batch_gt_instances, Tensor) + box_dim = batch_gt_instances.size(-1) - 2 + if len(batch_gt_instances) > 0: + gt_images_indexes = batch_gt_instances[:, 0] + max_gt_bbox_len = gt_images_indexes.unique( + return_counts=True)[1].max() + # fill zeros with length box_dim+1 if some shape of + # single batch not equal max_gt_bbox_len + batch_instance = torch.zeros( + (batch_size, max_gt_bbox_len, box_dim + 1), + dtype=batch_gt_instances.dtype, + device=batch_gt_instances.device) + + for i in range(batch_size): + match_indexes = gt_images_indexes == i + gt_num = match_indexes.sum() + if gt_num: + batch_instance[i, :gt_num] = batch_gt_instances[ + match_indexes, 1:] + else: + batch_instance = torch.zeros((batch_size, 0, box_dim + 1), + dtype=batch_gt_instances.dtype, + device=batch_gt_instances.device) + + return batch_instance + + +class OutputSaveObjectWrapper: + """A wrapper class that saves the output of function calls on an object.""" + + def __init__(self, obj: Any) -> None: + self.obj = obj + self.log = defaultdict(list) + + def __getattr__(self, attr: str) -> Any: + """Overrides the default behavior when 
an attribute is accessed. + + - If the attribute is callable, hooks the attribute and saves the + returned value of the function call to the log. + - If the attribute is not callable, saves the attribute's value to the + log and returns the value. + """ + orig_attr = getattr(self.obj, attr) + + if not callable(orig_attr): + self.log[attr].append(orig_attr) + return orig_attr + + def hooked(*args: Tuple, **kwargs: Dict) -> Any: + """The hooked function that logs the return value of the original + function.""" + result = orig_attr(*args, **kwargs) + self.log[attr].append(result) + return result + + return hooked + + def clear(self): + """Clears the log of function call outputs.""" + self.log.clear() + + def __deepcopy__(self, memo): + """Only copy the object when applying deepcopy.""" + other = type(self)(deepcopy(self.obj)) + memo[id(self)] = other + return other + + +class OutputSaveFunctionWrapper: + """A class that wraps a function and saves its outputs. + + This class can be used to decorate a function to save its outputs. It wraps + the function with a `__call__` method that calls the original function and + saves the results in a log attribute. + Args: + func (Callable): A function to wrap. + spec (Optional[Dict]): A dictionary of global variables to use as the + namespace for the wrapper. If `None`, the global namespace of the + original function is used. + """ + + def __init__(self, func: Callable, spec: Optional[Dict]) -> None: + """Initializes the OutputSaveFunctionWrapper instance.""" + assert callable(func) + self.log = [] + self.func = func + self.func_name = func.__name__ + + if isinstance(spec, dict): + self.spec = spec + elif hasattr(func, '__globals__'): + self.spec = func.__globals__ + else: + raise ValueError + + def __call__(self, *args, **kwargs) -> Any: + """Calls the wrapped function with the given arguments and saves the + results in the `log` attribute.""" + results = self.func(*args, **kwargs) + self.log.append(results) + return results + + def __enter__(self) -> None: + """Enters the context and sets the wrapped function to be a global + variable in the specified namespace.""" + self.spec[self.func_name] = self + return self.log + + def __exit__(self, exc_type, exc_val, exc_tb) -> None: + """Exits the context and resets the wrapped function to its original + value in the specified namespace.""" + self.spec[self.func_name] = self.func diff --git a/third_party/mmyolo/mmyolo/registry.py b/third_party/mmyolo/mmyolo/registry.py new file mode 100644 index 0000000000000000000000000000000000000000..71f43e6cf53d92917b7aea6175ae0540613ff720 --- /dev/null +++ b/third_party/mmyolo/mmyolo/registry.py @@ -0,0 +1,103 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""MMYOLO provides 17 registry nodes to support using modules across projects. +Each node is a child of the root registry in MMEngine. + +More details can be found at +https://mmengine.readthedocs.io/en/latest/tutorials/registry.html. 
+""" + +from mmengine.registry import DATA_SAMPLERS as MMENGINE_DATA_SAMPLERS +from mmengine.registry import DATASETS as MMENGINE_DATASETS +from mmengine.registry import HOOKS as MMENGINE_HOOKS +from mmengine.registry import LOOPS as MMENGINE_LOOPS +from mmengine.registry import METRICS as MMENGINE_METRICS +from mmengine.registry import MODEL_WRAPPERS as MMENGINE_MODEL_WRAPPERS +from mmengine.registry import MODELS as MMENGINE_MODELS +from mmengine.registry import \ + OPTIM_WRAPPER_CONSTRUCTORS as MMENGINE_OPTIM_WRAPPER_CONSTRUCTORS +from mmengine.registry import OPTIM_WRAPPERS as MMENGINE_OPTIM_WRAPPERS +from mmengine.registry import OPTIMIZERS as MMENGINE_OPTIMIZERS +from mmengine.registry import PARAM_SCHEDULERS as MMENGINE_PARAM_SCHEDULERS +from mmengine.registry import \ + RUNNER_CONSTRUCTORS as MMENGINE_RUNNER_CONSTRUCTORS +from mmengine.registry import RUNNERS as MMENGINE_RUNNERS +from mmengine.registry import TASK_UTILS as MMENGINE_TASK_UTILS +from mmengine.registry import TRANSFORMS as MMENGINE_TRANSFORMS +from mmengine.registry import VISBACKENDS as MMENGINE_VISBACKENDS +from mmengine.registry import VISUALIZERS as MMENGINE_VISUALIZERS +from mmengine.registry import \ + WEIGHT_INITIALIZERS as MMENGINE_WEIGHT_INITIALIZERS +from mmengine.registry import Registry + +# manage all kinds of runners like `EpochBasedRunner` and `IterBasedRunner` +RUNNERS = Registry( + 'runner', parent=MMENGINE_RUNNERS, locations=['mmyolo.engine']) +# manage runner constructors that define how to initialize runners +RUNNER_CONSTRUCTORS = Registry( + 'runner constructor', + parent=MMENGINE_RUNNER_CONSTRUCTORS, + locations=['mmyolo.engine']) +# manage all kinds of loops like `EpochBasedTrainLoop` +LOOPS = Registry('loop', parent=MMENGINE_LOOPS, locations=['mmyolo.engine']) +# manage all kinds of hooks like `CheckpointHook` +HOOKS = Registry( + 'hook', parent=MMENGINE_HOOKS, locations=['mmyolo.engine.hooks']) + +# manage data-related modules +DATASETS = Registry( + 'dataset', parent=MMENGINE_DATASETS, locations=['mmyolo.datasets']) +DATA_SAMPLERS = Registry( + 'data sampler', + parent=MMENGINE_DATA_SAMPLERS, + locations=['mmyolo.datasets']) +TRANSFORMS = Registry( + 'transform', + parent=MMENGINE_TRANSFORMS, + locations=['mmyolo.datasets.transforms']) + +# manage all kinds of modules inheriting `nn.Module` +MODELS = Registry('model', parent=MMENGINE_MODELS, locations=['mmyolo.models']) +# manage all kinds of model wrappers like 'MMDistributedDataParallel' +MODEL_WRAPPERS = Registry( + 'model_wrapper', + parent=MMENGINE_MODEL_WRAPPERS, + locations=['mmyolo.models']) +# manage all kinds of weight initialization modules like `Uniform` +WEIGHT_INITIALIZERS = Registry( + 'weight initializer', + parent=MMENGINE_WEIGHT_INITIALIZERS, + locations=['mmyolo.models']) + +# manage all kinds of optimizers like `SGD` and `Adam` +OPTIMIZERS = Registry( + 'optimizer', + parent=MMENGINE_OPTIMIZERS, + locations=['mmyolo.engine.optimizers']) +OPTIM_WRAPPERS = Registry( + 'optim_wrapper', + parent=MMENGINE_OPTIM_WRAPPERS, + locations=['mmyolo.engine.optimizers']) +# manage constructors that customize the optimization hyperparameters. 
+OPTIM_WRAPPER_CONSTRUCTORS = Registry( + 'optimizer constructor', + parent=MMENGINE_OPTIM_WRAPPER_CONSTRUCTORS, + locations=['mmyolo.engine.optimizers']) +# manage all kinds of parameter schedulers like `MultiStepLR` +PARAM_SCHEDULERS = Registry( + 'parameter scheduler', + parent=MMENGINE_PARAM_SCHEDULERS, + locations=['mmyolo.engine.optimizers']) +# manage all kinds of metrics +METRICS = Registry( + 'metric', parent=MMENGINE_METRICS, locations=['mmyolo.engine']) + +# manage task-specific modules like anchor generators and box coders +TASK_UTILS = Registry( + 'task util', parent=MMENGINE_TASK_UTILS, locations=['mmyolo.models']) + +# manage visualizer +VISUALIZERS = Registry( + 'visualizer', parent=MMENGINE_VISUALIZERS, locations=['mmyolo.utils']) +# manage visualizer backend +VISBACKENDS = Registry( + 'vis_backend', parent=MMENGINE_VISBACKENDS, locations=['mmyolo.utils']) diff --git a/third_party/mmyolo/mmyolo/testing/__init__.py b/third_party/mmyolo/mmyolo/testing/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b6d7a010ee27b2822d44ad099f46f65bf6f0c00a --- /dev/null +++ b/third_party/mmyolo/mmyolo/testing/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from ._utils import get_detector_cfg + +__all__ = ['get_detector_cfg'] diff --git a/third_party/mmyolo/mmyolo/testing/_utils.py b/third_party/mmyolo/mmyolo/testing/_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..9ccf2fe0cfd7baa3aeb7f3793c3db025d8889d5f --- /dev/null +++ b/third_party/mmyolo/mmyolo/testing/_utils.py @@ -0,0 +1,53 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from os.path import dirname, exists, join + +import numpy as np +from mmengine.config import Config + + +def _get_config_directory(): + """Find the predefined detector config directory.""" + try: + # Assume we are running in the source mmyolo repo + repo_dpath = dirname(dirname(dirname(__file__))) + except NameError: + # For IPython development when this __file__ is not defined + import mmyolo + repo_dpath = dirname(dirname(mmyolo.__file__)) + config_dpath = join(repo_dpath, 'configs') + if not exists(config_dpath): + raise Exception('Cannot find config path') + return config_dpath + + +def _get_config_module(fname): + """Load a configuration as a python module.""" + config_dpath = _get_config_directory() + config_fpath = join(config_dpath, fname) + config_mod = Config.fromfile(config_fpath) + return config_mod + + +def get_detector_cfg(fname): + """Grab configs necessary to create a detector. + + These are deep copied to allow for safe modification of parameters without + influencing other tests. + """ + config = _get_config_module(fname) + model = copy.deepcopy(config.model) + return model + + +def _rand_bboxes(rng, num_boxes, w, h): + """Randomly generate a specified number of bboxes.""" + cx, cy, bw, bh = rng.rand(num_boxes, 4).T + + tl_x = ((cx * w) - (w * bw / 2)).clip(0, w) + tl_y = ((cy * h) - (h * bh / 2)).clip(0, h) + br_x = ((cx * w) + (w * bw / 2)).clip(0, w) + br_y = ((cy * h) + (h * bh / 2)).clip(0, h) + + bboxes = np.vstack([tl_x, tl_y, br_x, br_y]).T + return bboxes diff --git a/third_party/mmyolo/mmyolo/utils/__init__.py b/third_party/mmyolo/mmyolo/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f4e968494892ccefb60d0c7b713c131ddc6fb869 --- /dev/null +++ b/third_party/mmyolo/mmyolo/utils/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .collect_env import collect_env +from .misc import is_metainfo_lower, switch_to_deploy +from .setup_env import register_all_modules + +__all__ = [ + 'register_all_modules', 'collect_env', 'switch_to_deploy', + 'is_metainfo_lower' +] diff --git a/third_party/mmyolo/mmyolo/utils/boxam_utils.py b/third_party/mmyolo/mmyolo/utils/boxam_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..50d6c09ecd309abe11777b4bc5307db0bbec2735 --- /dev/null +++ b/third_party/mmyolo/mmyolo/utils/boxam_utils.py @@ -0,0 +1,517 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import bisect +import copy +import warnings +from pathlib import Path +from typing import Callable, List, Optional, Tuple, Union + +import cv2 +import numpy as np +import torch +import torch.nn as nn +import torchvision +from mmcv.transforms import Compose +from mmdet.evaluation import get_classes +from mmdet.utils import ConfigType +from mmengine.config import Config +from mmengine.registry import init_default_scope +from mmengine.runner import load_checkpoint +from mmengine.structures import InstanceData +from torch import Tensor + +from mmyolo.registry import MODELS + +try: + from pytorch_grad_cam import (AblationCAM, AblationLayer, + ActivationsAndGradients) + from pytorch_grad_cam import GradCAM as Base_GradCAM + from pytorch_grad_cam import GradCAMPlusPlus as Base_GradCAMPlusPlus + from pytorch_grad_cam.base_cam import BaseCAM + from pytorch_grad_cam.utils.image import scale_cam_image, show_cam_on_image + from pytorch_grad_cam.utils.svd_on_activations import get_2d_projection +except ImportError: + pass + + +def init_detector( + config: Union[str, Path, Config], + checkpoint: Optional[str] = None, + palette: str = 'coco', + device: str = 'cuda:0', + cfg_options: Optional[dict] = None, +) -> nn.Module: + """Initialize a detector from config file. + + Args: + config (str, :obj:`Path`, or :obj:`mmengine.Config`): Config file path, + :obj:`Path`, or the config object. + checkpoint (str, optional): Checkpoint path. If left as None, the model + will not load any weights. + palette (str): Color palette used for visualization. If palette + is stored in checkpoint, use checkpoint's palette first, otherwise + use externally passed palette. Currently, supports 'coco', 'voc', + 'citys' and 'random'. Defaults to coco. + device (str): The device where the anchors will be put on. + Defaults to cuda:0. + cfg_options (dict, optional): Options to override some settings in + the used config. + + Returns: + nn.Module: The constructed detector. + """ + if isinstance(config, (str, Path)): + config = Config.fromfile(config) + elif not isinstance(config, Config): + raise TypeError('config must be a filename or Config object, ' + f'but got {type(config)}') + if cfg_options is not None: + config.merge_from_dict(cfg_options) + elif 'init_cfg' in config.model.backbone: + config.model.backbone.init_cfg = None + + # only change this + # grad based method requires train_cfg + # config.model.train_cfg = None + init_default_scope(config.get('default_scope', 'mmyolo')) + + model = MODELS.build(config.model) + if checkpoint is not None: + checkpoint = load_checkpoint(model, checkpoint, map_location='cpu') + # Weights converted from elsewhere may not have meta fields. 
+ checkpoint_meta = checkpoint.get('meta', {}) + # save the dataset_meta in the model for convenience + if 'dataset_meta' in checkpoint_meta: + # mmdet 3.x, all keys should be lowercase + model.dataset_meta = { + k.lower(): v + for k, v in checkpoint_meta['dataset_meta'].items() + } + elif 'CLASSES' in checkpoint_meta: + # < mmdet 3.x + classes = checkpoint_meta['CLASSES'] + model.dataset_meta = {'classes': classes, 'palette': palette} + else: + warnings.simplefilter('once') + warnings.warn( + 'dataset_meta or class names are not saved in the ' + 'checkpoint\'s meta data, use COCO classes by default.') + model.dataset_meta = { + 'classes': get_classes('coco'), + 'palette': palette + } + + model.cfg = config # save the config in the model for convenience + model.to(device) + model.eval() + return model + + +def reshape_transform(feats: Union[Tensor, List[Tensor]], + max_shape: Tuple[int, int] = (20, 20), + is_need_grad: bool = False): + """Reshape and aggregate feature maps when the input is a multi-layer + feature map. + + Takes these tensors with different sizes, resizes them to a common shape, + and concatenates them. + """ + if len(max_shape) == 1: + max_shape = max_shape * 2 + + if isinstance(feats, torch.Tensor): + feats = [feats] + else: + if is_need_grad: + raise NotImplementedError('The `grad_base` method does not ' + 'support output multi-activation layers') + + max_h = max([im.shape[-2] for im in feats]) + max_w = max([im.shape[-1] for im in feats]) + if -1 in max_shape: + max_shape = (max_h, max_w) + else: + max_shape = (min(max_h, max_shape[0]), min(max_w, max_shape[1])) + + activations = [] + for feat in feats: + activations.append( + torch.nn.functional.interpolate( + torch.abs(feat), max_shape, mode='bilinear')) + + activations = torch.cat(activations, axis=1) + return activations + + +class BoxAMDetectorWrapper(nn.Module): + """Wrap the mmdet model class to facilitate handling of non-tensor + situations during inference.""" + + def __init__(self, + cfg: ConfigType, + checkpoint: str, + score_thr: float, + device: str = 'cuda:0'): + super().__init__() + self.cfg = cfg + self.device = device + self.score_thr = score_thr + self.checkpoint = checkpoint + self.detector = init_detector(self.cfg, self.checkpoint, device=device) + + pipeline_cfg = copy.deepcopy(self.cfg.test_dataloader.dataset.pipeline) + pipeline_cfg[0].type = 'mmdet.LoadImageFromNDArray' + + new_test_pipeline = [] + for pipeline in pipeline_cfg: + if not pipeline['type'].endswith('LoadAnnotations'): + new_test_pipeline.append(pipeline) + self.test_pipeline = Compose(new_test_pipeline) + + self.is_need_loss = False + self.input_data = None + self.image = None + + def need_loss(self, is_need_loss: bool): + """Grad-based methods require loss.""" + self.is_need_loss = is_need_loss + + def set_input_data(self, + image: np.ndarray, + pred_instances: Optional[InstanceData] = None): + """Set the input data to be used in the next step.""" + self.image = image + + if self.is_need_loss: + assert pred_instances is not None + pred_instances = pred_instances.numpy() + data = dict( + img=self.image, + img_id=0, + gt_bboxes=pred_instances.bboxes, + gt_bboxes_labels=pred_instances.labels) + data = self.test_pipeline(data) + else: + data = dict(img=self.image, img_id=0) + data = self.test_pipeline(data) + data['inputs'] = [data['inputs']] + data['data_samples'] = [data['data_samples']] + self.input_data = data + + def __call__(self, *args, **kwargs): + assert self.input_data is not None + if self.is_need_loss: + # Maybe this is a 
direction that can be optimized + # self.detector.init_weights() + if hasattr(self.detector.bbox_head, 'head_module'): + self.detector.bbox_head.head_module.training = True + else: + self.detector.bbox_head.training = True + if hasattr(self.detector.bbox_head, 'featmap_sizes'): + # Prevent the model algorithm error when calculating loss + self.detector.bbox_head.featmap_sizes = None + + data_ = {} + data_['inputs'] = [self.input_data['inputs']] + data_['data_samples'] = [self.input_data['data_samples']] + data = self.detector.data_preprocessor(data_, training=False) + loss = self.detector._run_forward(data, mode='loss') + + if hasattr(self.detector.bbox_head, 'featmap_sizes'): + self.detector.bbox_head.featmap_sizes = None + + return [loss] + else: + if hasattr(self.detector.bbox_head, 'head_module'): + self.detector.bbox_head.head_module.training = False + else: + self.detector.bbox_head.training = False + with torch.no_grad(): + results = self.detector.test_step(self.input_data) + return results + + +class BoxAMDetectorVisualizer: + """Box AM visualization class.""" + + def __init__(self, + method_class, + model: nn.Module, + target_layers: List, + reshape_transform: Optional[Callable] = None, + is_need_grad: bool = False, + extra_params: Optional[dict] = None): + self.target_layers = target_layers + self.reshape_transform = reshape_transform + self.is_need_grad = is_need_grad + + if method_class.__name__ == 'AblationCAM': + batch_size = extra_params.get('batch_size', 1) + ratio_channels_to_ablate = extra_params.get( + 'ratio_channels_to_ablate', 1.) + self.cam = AblationCAM( + model, + target_layers, + use_cuda=True if 'cuda' in model.device else False, + reshape_transform=reshape_transform, + batch_size=batch_size, + ablation_layer=extra_params['ablation_layer'], + ratio_channels_to_ablate=ratio_channels_to_ablate) + else: + self.cam = method_class( + model, + target_layers, + use_cuda=True if 'cuda' in model.device else False, + reshape_transform=reshape_transform, + ) + if self.is_need_grad: + self.cam.activations_and_grads.release() + + self.classes = model.detector.dataset_meta['classes'] + self.COLORS = np.random.uniform(0, 255, size=(len(self.classes), 3)) + + def switch_activations_and_grads(self, model) -> None: + """In the grad-based method, we need to switch + ``ActivationsAndGradients`` layer, otherwise an error will occur.""" + self.cam.model = model + + if self.is_need_grad is True: + self.cam.activations_and_grads = ActivationsAndGradients( + model, self.target_layers, self.reshape_transform) + self.is_need_grad = False + else: + self.cam.activations_and_grads.release() + self.is_need_grad = True + + def __call__(self, img, targets, aug_smooth=False, eigen_smooth=False): + img = torch.from_numpy(img)[None].permute(0, 3, 1, 2) + return self.cam(img, targets, aug_smooth, eigen_smooth)[0, :] + + def show_am(self, + image: np.ndarray, + pred_instance: InstanceData, + grayscale_am: np.ndarray, + with_norm_in_bboxes: bool = False): + """Normalize the AM to be in the range [0, 1] inside every bounding + boxes, and zero outside of the bounding boxes.""" + + boxes = pred_instance.bboxes + labels = pred_instance.labels + + if with_norm_in_bboxes is True: + boxes = boxes.astype(np.int32) + renormalized_am = np.zeros(grayscale_am.shape, dtype=np.float32) + images = [] + for x1, y1, x2, y2 in boxes: + img = renormalized_am * 0 + img[y1:y2, x1:x2] = scale_cam_image( + [grayscale_am[y1:y2, x1:x2].copy()])[0] + images.append(img) + + renormalized_am = np.max(np.float32(images), axis=0) + 
renormalized_am = scale_cam_image([renormalized_am])[0] + else: + renormalized_am = grayscale_am + + am_image_renormalized = show_cam_on_image( + image / 255, renormalized_am, use_rgb=False) + + image_with_bounding_boxes = self._draw_boxes( + boxes, labels, am_image_renormalized, pred_instance.get('scores')) + return image_with_bounding_boxes + + def _draw_boxes(self, + boxes: List, + labels: List, + image: np.ndarray, + scores: Optional[List] = None): + """draw boxes on image.""" + for i, box in enumerate(boxes): + label = labels[i] + color = self.COLORS[label] + cv2.rectangle(image, (int(box[0]), int(box[1])), + (int(box[2]), int(box[3])), color, 2) + if scores is not None: + score = scores[i] + text = str(self.classes[label]) + ': ' + str( + round(score * 100, 1)) + else: + text = self.classes[label] + + cv2.putText( + image, + text, (int(box[0]), int(box[1] - 5)), + cv2.FONT_HERSHEY_SIMPLEX, + 0.5, + color, + 1, + lineType=cv2.LINE_AA) + return image + + +class DetAblationLayer(AblationLayer): + """Det AblationLayer.""" + + def __init__(self): + super().__init__() + self.activations = None + + def set_next_batch(self, input_batch_index, activations, + num_channels_to_ablate): + """Extract the next batch member from activations, and repeat it + num_channels_to_ablate times.""" + if isinstance(activations, torch.Tensor): + return super().set_next_batch(input_batch_index, activations, + num_channels_to_ablate) + + self.activations = [] + for activation in activations: + activation = activation[ + input_batch_index, :, :, :].clone().unsqueeze(0) + self.activations.append( + activation.repeat(num_channels_to_ablate, 1, 1, 1)) + + def __call__(self, x): + """Go over the activation indices to be ablated, stored in + self.indices.""" + result = self.activations + + if isinstance(result, torch.Tensor): + return super().__call__(x) + + channel_cumsum = np.cumsum([r.shape[1] for r in result]) + num_channels_to_ablate = result[0].size(0) # batch + for i in range(num_channels_to_ablate): + pyramid_layer = bisect.bisect_right(channel_cumsum, + self.indices[i]) + if pyramid_layer > 0: + index_in_pyramid_layer = self.indices[i] - channel_cumsum[ + pyramid_layer - 1] + else: + index_in_pyramid_layer = self.indices[i] + result[pyramid_layer][i, index_in_pyramid_layer, :, :] = -1000 + return result + + +class DetBoxScoreTarget: + """Det Score calculation class. + + In the case of the grad-free method, the calculation method is that + for every original detected bounding box specified in "bboxes", + assign a score on how the current bounding boxes match it, + + 1. In Bbox IoU + 2. In the classification score. + 3. In Mask IoU if ``segms`` exist. + + If there is not a large enough overlap, or the category changed, + assign a score of 0. The total score is the sum of all the box scores. + + In the case of the grad-based method, the calculation method is + the sum of losses after excluding a specific key. 
+ """ + + def __init__(self, + pred_instance: InstanceData, + match_iou_thr: float = 0.5, + device: str = 'cuda:0', + ignore_loss_params: Optional[List] = None): + self.focal_bboxes = pred_instance.bboxes + self.focal_labels = pred_instance.labels + self.match_iou_thr = match_iou_thr + self.device = device + self.ignore_loss_params = ignore_loss_params + if ignore_loss_params is not None: + assert isinstance(self.ignore_loss_params, list) + + def __call__(self, results): + output = torch.tensor([0.], device=self.device) + + if 'loss_cls' in results: + # grad-based method + # results is dict + for loss_key, loss_value in results.items(): + if 'loss' not in loss_key or \ + loss_key in self.ignore_loss_params: + continue + if isinstance(loss_value, list): + output += sum(loss_value) + else: + output += loss_value + return output + else: + # grad-free method + # results is DetDataSample + pred_instances = results.pred_instances + if len(pred_instances) == 0: + return output + + pred_bboxes = pred_instances.bboxes + pred_scores = pred_instances.scores + pred_labels = pred_instances.labels + + for focal_box, focal_label in zip(self.focal_bboxes, + self.focal_labels): + ious = torchvision.ops.box_iou(focal_box[None], + pred_bboxes[..., :4]) + index = ious.argmax() + if ious[0, index] > self.match_iou_thr and pred_labels[ + index] == focal_label: + # TODO: Adaptive adjustment of weights based on algorithms + score = ious[0, index] + pred_scores[index] + output = output + score + return output + + +class SpatialBaseCAM(BaseCAM): + """CAM that maintains spatial information. + + Gradients are often averaged over the spatial dimension in CAM + visualization for classification, but this is unreasonable in detection + tasks. There is no need to average the gradients in the detection task. 
+ """ + + def get_cam_image(self, + input_tensor: torch.Tensor, + target_layer: torch.nn.Module, + targets: List[torch.nn.Module], + activations: torch.Tensor, + grads: torch.Tensor, + eigen_smooth: bool = False) -> np.ndarray: + + weights = self.get_cam_weights(input_tensor, target_layer, targets, + activations, grads) + weighted_activations = weights * activations + if eigen_smooth: + cam = get_2d_projection(weighted_activations) + else: + cam = weighted_activations.sum(axis=1) + return cam + + +class GradCAM(SpatialBaseCAM, Base_GradCAM): + """Gradients are no longer averaged over the spatial dimension.""" + + def get_cam_weights(self, input_tensor, target_layer, target_category, + activations, grads): + return grads + + +class GradCAMPlusPlus(SpatialBaseCAM, Base_GradCAMPlusPlus): + """Gradients are no longer averaged over the spatial dimension.""" + + def get_cam_weights(self, input_tensor, target_layers, target_category, + activations, grads): + grads_power_2 = grads**2 + grads_power_3 = grads_power_2 * grads + # Equation 19 in https://arxiv.org/abs/1710.11063 + sum_activations = np.sum(activations, axis=(2, 3)) + eps = 0.000001 + aij = grads_power_2 / ( + 2 * grads_power_2 + + sum_activations[:, :, None, None] * grads_power_3 + eps) + # Now bring back the ReLU from eq.7 in the paper, + # And zero out aijs where the activations are 0 + aij = np.where(grads != 0, aij, 0) + + weights = np.maximum(grads, 0) * aij + return weights diff --git a/third_party/mmyolo/mmyolo/utils/collect_env.py b/third_party/mmyolo/mmyolo/utils/collect_env.py new file mode 100644 index 0000000000000000000000000000000000000000..89bad658cb7d4f1b602690d8d888a309166283ee --- /dev/null +++ b/third_party/mmyolo/mmyolo/utils/collect_env.py @@ -0,0 +1,21 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import mmcv +import mmdet +from mmengine.utils import get_git_hash +from mmengine.utils.dl_utils import collect_env as collect_base_env + +import mmyolo + + +def collect_env() -> dict: + """Collect the information of the running environments.""" + env_info = collect_base_env() + env_info['MMCV'] = mmcv.__version__ + env_info['MMDetection'] = mmdet.__version__ + env_info['MMYOLO'] = mmyolo.__version__ + '+' + get_git_hash()[:7] + return env_info + + +if __name__ == '__main__': + for name, val in collect_env().items(): + print(f'{name}: {val}') diff --git a/third_party/mmyolo/mmyolo/utils/labelme_utils.py b/third_party/mmyolo/mmyolo/utils/labelme_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..0981919771a617ca79b29c3ddf96ea14c82fccc6 --- /dev/null +++ b/third_party/mmyolo/mmyolo/utils/labelme_utils.py @@ -0,0 +1,92 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import json +import os.path + +from mmengine.structures import InstanceData + + +class LabelmeFormat: + """Predict results save into labelme file. + + Base on https://github.com/wkentaro/labelme/blob/main/labelme/label_file.py + + Args: + classes (tuple): Model classes name. + """ + + def __init__(self, classes: tuple): + super().__init__() + self.classes = classes + + def __call__(self, pred_instances: InstanceData, metainfo: dict, + output_path: str, selected_classes: list): + """Get image data field for labelme. + + Args: + pred_instances (InstanceData): Candidate prediction info. + metainfo (dict): Meta info of prediction. + output_path (str): Image file path. + selected_classes (list): Selected class name. + + Labelme file eg. 
+ { + "version": "5.1.1", + "flags": {}, + "imagePath": "/data/cat/1.jpg", + "imageData": null, + "imageHeight": 3000, + "imageWidth": 4000, + "shapes": [ + { + "label": "cat", + "points": [ + [ + 1148.076923076923, + 1188.4615384615383 + ], + [ + 2471.1538461538457, + 2176.923076923077 + ] + ], + "group_id": null, + "shape_type": "rectangle", + "flags": {} + }, + {...} + ] + } + """ + + image_path = os.path.abspath(metainfo['img_path']) + + json_info = { + 'version': '5.1.1', + 'flags': {}, + 'imagePath': image_path, + 'imageData': None, + 'imageHeight': metainfo['ori_shape'][0], + 'imageWidth': metainfo['ori_shape'][1], + 'shapes': [] + } + + for pred_instance in pred_instances: + pred_bbox = pred_instance.bboxes.cpu().numpy().tolist()[0] + pred_label = self.classes[pred_instance.labels] + + if selected_classes is not None and \ + pred_label not in selected_classes: + # filter class name + continue + + sub_dict = { + 'label': pred_label, + 'points': [pred_bbox[:2], pred_bbox[2:]], + 'group_id': None, + 'shape_type': 'rectangle', + 'flags': {} + } + json_info['shapes'].append(sub_dict) + + with open(output_path, 'w', encoding='utf-8') as f_json: + json.dump(json_info, f_json, ensure_ascii=False, indent=2) diff --git a/third_party/mmyolo/mmyolo/utils/large_image.py b/third_party/mmyolo/mmyolo/utils/large_image.py new file mode 100644 index 0000000000000000000000000000000000000000..8670804684f6dcdc6dc1846cf85260d900b3474e --- /dev/null +++ b/third_party/mmyolo/mmyolo/utils/large_image.py @@ -0,0 +1,103 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Sequence, Tuple + +import torch +from mmcv.ops import batched_nms +from mmdet.structures import DetDataSample, SampleList +from mmengine.structures import InstanceData + + +def shift_rbboxes(bboxes: torch.Tensor, offset: Sequence[int]): + """Shift rotated bboxes with offset. + + Args: + bboxes (Tensor): The rotated bboxes need to be translated. + With shape (n, 5), which means (x, y, w, h, a). + offset (Sequence[int]): The translation offsets with shape of (2, ). + Returns: + Tensor: Shifted rotated bboxes. + """ + offset_tensor = bboxes.new_tensor(offset) + shifted_bboxes = bboxes.clone() + shifted_bboxes[:, 0:2] = shifted_bboxes[:, 0:2] + offset_tensor + return shifted_bboxes + + +def shift_predictions(det_data_samples: SampleList, + offsets: Sequence[Tuple[int, int]], + src_image_shape: Tuple[int, int]) -> SampleList: + """Shift predictions to the original image. + + Args: + det_data_samples (List[:obj:`DetDataSample`]): A list of patch results. + offsets (Sequence[Tuple[int, int]]): Positions of the left top points + of patches. + src_image_shape (Tuple[int, int]): A (height, width) tuple of the large + image's width and height. + Returns: + (List[:obj:`DetDataSample`]): shifted results. + """ + try: + from sahi.slicing import shift_bboxes, shift_masks + except ImportError: + raise ImportError('Please run "pip install -U sahi" ' + 'to install sahi first for large image inference.') + + assert len(det_data_samples) == len( + offsets), 'The `results` should has the ' 'same length with `offsets`.' 
+ shifted_predictions = [] + for det_data_sample, offset in zip(det_data_samples, offsets): + pred_inst = det_data_sample.pred_instances.clone() + + # Check bbox type + if pred_inst.bboxes.size(-1) == 4: + # Horizontal bboxes + shifted_bboxes = shift_bboxes(pred_inst.bboxes, offset) + elif pred_inst.bboxes.size(-1) == 5: + # Rotated bboxes + shifted_bboxes = shift_rbboxes(pred_inst.bboxes, offset) + else: + raise NotImplementedError + + # shift bboxes and masks + pred_inst.bboxes = shifted_bboxes + if 'masks' in det_data_sample: + pred_inst.masks = shift_masks(pred_inst.masks, offset, + src_image_shape) + + shifted_predictions.append(pred_inst.clone()) + + shifted_predictions = InstanceData.cat(shifted_predictions) + + return shifted_predictions + + +def merge_results_by_nms(results: SampleList, offsets: Sequence[Tuple[int, + int]], + src_image_shape: Tuple[int, int], + nms_cfg: dict) -> DetDataSample: + """Merge patch results by nms. + + Args: + results (List[:obj:`DetDataSample`]): A list of patch results. + offsets (Sequence[Tuple[int, int]]): Positions of the left top points + of patches. + src_image_shape (Tuple[int, int]): A (height, width) tuple of the large + image's width and height. + nms_cfg (dict): it should specify nms type and other parameters + like `iou_threshold`. + Returns: + :obj:`DetDataSample`: merged results. + """ + shifted_instances = shift_predictions(results, offsets, src_image_shape) + + _, keeps = batched_nms( + boxes=shifted_instances.bboxes, + scores=shifted_instances.scores, + idxs=shifted_instances.labels, + nms_cfg=nms_cfg) + merged_instances = shifted_instances[keeps] + + merged_result = results[0].clone() + merged_result.pred_instances = merged_instances + return merged_result diff --git a/third_party/mmyolo/mmyolo/utils/misc.py b/third_party/mmyolo/mmyolo/utils/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..f5d366d75d4821753354c54629d2929661993578 --- /dev/null +++ b/third_party/mmyolo/mmyolo/utils/misc.py @@ -0,0 +1,135 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +import urllib + +import numpy as np +import torch +from mmengine.utils import scandir +from prettytable import PrettyTable + +from mmyolo.models import RepVGGBlock + +IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif', + '.tiff', '.webp') + + +def switch_to_deploy(model): + """Model switch to deploy status.""" + for layer in model.modules(): + if isinstance(layer, RepVGGBlock): + layer.switch_to_deploy() + + print('Switch model to deploy modality.') + + +def auto_arrange_images(image_list: list, image_column: int = 2) -> np.ndarray: + """Auto arrange image to image_column x N row. + + Args: + image_list (list): cv2 image list. + image_column (int): Arrange to N column. Default: 2. 
+ Return: + (np.ndarray): image_column x N row merge image + """ + img_count = len(image_list) + if img_count <= image_column: + # no need to arrange + image_show = np.concatenate(image_list, axis=1) + else: + # arrange image according to image_column + image_row = round(img_count / image_column) + fill_img_list = [np.ones(image_list[0].shape, dtype=np.uint8) * 255 + ] * ( + image_row * image_column - img_count) + image_list.extend(fill_img_list) + merge_imgs_col = [] + for i in range(image_row): + start_col = image_column * i + end_col = image_column * (i + 1) + merge_col = np.hstack(image_list[start_col:end_col]) + merge_imgs_col.append(merge_col) + + # merge to one image + image_show = np.vstack(merge_imgs_col) + + return image_show + + +def get_file_list(source_root: str) -> [list, dict]: + """Get file list. + + Args: + source_root (str): image or video source path + + Return: + source_file_path_list (list): A list for all source file. + source_type (dict): Source type: file or url or dir. + """ + is_dir = os.path.isdir(source_root) + is_url = source_root.startswith(('http:/', 'https:/')) + is_file = os.path.splitext(source_root)[-1].lower() in IMG_EXTENSIONS + + source_file_path_list = [] + if is_dir: + # when input source is dir + for file in scandir( + source_root, IMG_EXTENSIONS, recursive=True, + case_sensitive=False): + source_file_path_list.append(os.path.join(source_root, file)) + elif is_url: + # when input source is url + filename = os.path.basename( + urllib.parse.unquote(source_root).split('?')[0]) + file_save_path = os.path.join(os.getcwd(), filename) + print(f'Downloading source file to {file_save_path}') + torch.hub.download_url_to_file(source_root, file_save_path) + source_file_path_list = [file_save_path] + elif is_file: + # when input source is single image + source_file_path_list = [source_root] + else: + print('Cannot find image file.') + + source_type = dict(is_dir=is_dir, is_url=is_url, is_file=is_file) + + return source_file_path_list, source_type + + +def show_data_classes(data_classes): + """When printing an error, all class names of the dataset.""" + print('\n\nThe name of the class contained in the dataset:') + data_classes_info = PrettyTable() + data_classes_info.title = 'Information of dataset class' + # List Print Settings + # If the quantity is too large, 25 rows will be displayed in each column + if len(data_classes) < 25: + data_classes_info.add_column('Class name', data_classes) + elif len(data_classes) % 25 != 0 and len(data_classes) > 25: + col_num = int(len(data_classes) / 25) + 1 + data_name_list = list(data_classes) + for i in range(0, (col_num * 25) - len(data_classes)): + data_name_list.append('') + for i in range(0, len(data_name_list), 25): + data_classes_info.add_column('Class name', + data_name_list[i:i + 25]) + + # Align display data to the left + data_classes_info.align['Class name'] = 'l' + print(data_classes_info) + + +def is_metainfo_lower(cfg): + """Determine whether the custom metainfo fields are all lowercase.""" + + def judge_keys(dataloader_cfg): + while 'dataset' in dataloader_cfg: + dataloader_cfg = dataloader_cfg['dataset'] + if 'metainfo' in dataloader_cfg: + all_keys = dataloader_cfg['metainfo'].keys() + all_is_lower = all([str(k).islower() for k in all_keys]) + assert all_is_lower, f'The keys in dataset metainfo must be all lowercase, but got {all_keys}. 
' \ + f'Please refer to https://github.com/open-mmlab/mmyolo/blob/e62c8c4593/configs/yolov5/yolov5_s-v61_syncbn_fast_1xb4-300e_balloon.py#L8' # noqa + + judge_keys(cfg.get('train_dataloader', {})) + judge_keys(cfg.get('val_dataloader', {})) + judge_keys(cfg.get('test_dataloader', {})) diff --git a/third_party/mmyolo/mmyolo/utils/setup_env.py b/third_party/mmyolo/mmyolo/utils/setup_env.py new file mode 100644 index 0000000000000000000000000000000000000000..f51ed928cbddb98c2274e09b5acea1d70dfd1abd --- /dev/null +++ b/third_party/mmyolo/mmyolo/utils/setup_env.py @@ -0,0 +1,41 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import datetime +import warnings + +from mmengine import DefaultScope + + +def register_all_modules(init_default_scope: bool = True): + """Register all modules in mmdet into the registries. + + Args: + init_default_scope (bool): Whether initialize the mmdet default scope. + When `init_default_scope=True`, the global default scope will be + set to `mmyolo`, and all registries will build modules from mmdet's + registry node. To understand more about the registry, please refer + to https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/registry.md + Defaults to True. + """ # noqa + import mmdet.engine # noqa: F401,F403 + import mmdet.visualization # noqa: F401,F403 + + import mmyolo.datasets # noqa: F401,F403 + import mmyolo.engine # noqa: F401,F403 + import mmyolo.models # noqa: F401,F403 + + if init_default_scope: + never_created = DefaultScope.get_current_instance() is None \ + or not DefaultScope.check_instance_created('mmyolo') + if never_created: + DefaultScope.get_instance('mmyolo', scope_name='mmyolo') + return + current_scope = DefaultScope.get_current_instance() + if current_scope.scope_name != 'mmyolo': + warnings.warn('The current default scope ' + f'"{current_scope.scope_name}" is not "mmyolo", ' + '`register_all_modules` will force the current' + 'default scope to be "mmyolo". If this is not ' + 'expected, please set `init_default_scope=False`.') + # avoid name conflict + new_instance_name = f'mmyolo-{datetime.datetime.now()}' + DefaultScope.get_instance(new_instance_name, scope_name='mmyolo') diff --git a/third_party/mmyolo/mmyolo/version.py b/third_party/mmyolo/mmyolo/version.py new file mode 100644 index 0000000000000000000000000000000000000000..6e4f0e8e3747eeb71d72d53d0e2daf6ea203c596 --- /dev/null +++ b/third_party/mmyolo/mmyolo/version.py @@ -0,0 +1,23 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
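+# NOTE (editorial, illustrative only): expected behaviour of the
+# parse_version_info() helper defined below:
+#   parse_version_info('0.6.0')    -> (0, 6, 0)
+#   parse_version_info('0.1.0rc1') -> (0, 1, 0, 'rc1')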
+
+__version__ = '0.6.0'
+
+from typing import Tuple
+
+short_version = __version__
+
+
+def parse_version_info(version_str: str) -> Tuple:
+    """Parse version info of MMYOLO."""
+    version_info = []
+    for x in version_str.split('.'):
+        if x.isdigit():
+            version_info.append(int(x))
+        elif x.find('rc') != -1:
+            patch_version = x.split('rc')
+            version_info.append(int(patch_version[0]))
+            version_info.append(f'rc{patch_version[1]}')
+    return tuple(version_info)
+
+
+version_info = parse_version_info(__version__)
diff --git a/third_party/mmyolo/model-index.yml b/third_party/mmyolo/model-index.yml
new file mode 100644
index 0000000000000000000000000000000000000000..9aa0288cc4b79f5b367c159ad9b29ccd62a0b74c
--- /dev/null
+++ b/third_party/mmyolo/model-index.yml
@@ -0,0 +1,8 @@
+Import:
+  - configs/yolov5/metafile.yml
+  - configs/yolov6/metafile.yml
+  - configs/yolox/metafile.yml
+  - configs/rtmdet/metafile.yml
+  - configs/yolov7/metafile.yml
+  - configs/ppyoloe/metafile.yml
+  - configs/yolov8/metafile.yml
diff --git a/third_party/mmyolo/projects/assigner_visualization/README.md b/third_party/mmyolo/projects/assigner_visualization/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..918589f228af70f3338b2e6ea065ea72f245ebc1
--- /dev/null
+++ b/third_party/mmyolo/projects/assigner_visualization/README.md
@@ -0,0 +1,43 @@
+# MMYOLO Model Assigner Visualization
+
+
+
+## Introduction
+
+This project is developed to make it easy to visualize label assignment results. The script allows users to analyze where, and how many, positive samples each gt box is assigned in the image.
+
+Now, the script supports `YOLOv5`, `YOLOv7`, `YOLOv8` and `RTMDet`.
+
+## Usage
+
+### Command
+
+YOLOv5 assigner visualization command:
+
+```shell
+python projects/assigner_visualization/assigner_visualization.py projects/assigner_visualization/configs/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_assignervisualization.py
+```
+
+Note: `YOLOv5` does not need to load the trained weights.
+
+YOLOv7 assigner visualization command:
+
+```shell
+python projects/assigner_visualization/assigner_visualization.py projects/assigner_visualization/configs/yolov7_tiny_syncbn_fast_8xb16-300e_coco_assignervisualization.py -c ${checkpoint}
+```
+
+YOLOv8 assigner visualization command:
+
+```shell
+python projects/assigner_visualization/assigner_visualization.py projects/assigner_visualization/configs/yolov8_s_syncbn_fast_8xb16-500e_coco_assignervisualization.py -c ${checkpoint}
+```
+
+RTMDet assigner visualization command:
+
+```shell
+python projects/assigner_visualization/assigner_visualization.py projects/assigner_visualization/configs/rtmdet_s_syncbn_fast_8xb32-300e_coco_assignervisualization.py -c ${checkpoint}
+```
+
+${checkpoint} is the checkpoint file path. Dynamic label assignment is used in `YOLOv7`, `YOLOv8` and `RTMDet`, so the model weights affect the positive sample allocation results; it is therefore recommended to load trained model weights.
+
+If you want to know more details about label assignment, you can check the [RTMDet description](https://mmyolo.readthedocs.io/zh_CN/latest/algorithm_descriptions/rtmdet_description.html#id5).
diff --git a/third_party/mmyolo/projects/assigner_visualization/assigner_visualization.py b/third_party/mmyolo/projects/assigner_visualization/assigner_visualization.py
new file mode 100644
index 0000000000000000000000000000000000000000..e290d26b6d6fbb2f703faf3ebcd0474da871aea8
--- /dev/null
+++ b/third_party/mmyolo/projects/assigner_visualization/assigner_visualization.py
@@ -0,0 +1,177 @@
+# Copyright (c) OpenMMLab.
All rights reserved. +import argparse +import os +import os.path as osp +import sys +import warnings + +import mmcv +import numpy as np +import torch +from mmengine import ProgressBar +from mmengine.config import Config, DictAction +from mmengine.dataset import COLLATE_FUNCTIONS +from mmengine.runner.checkpoint import load_checkpoint +from numpy import random + +from mmyolo.registry import DATASETS, MODELS +from mmyolo.utils import register_all_modules +from projects.assigner_visualization.dense_heads import (RTMHeadAssigner, + YOLOv5HeadAssigner, + YOLOv7HeadAssigner, + YOLOv8HeadAssigner) +from projects.assigner_visualization.visualization import \ + YOLOAssignerVisualizer + + +def parse_args(): + parser = argparse.ArgumentParser( + description='MMYOLO show the positive sample assigning' + ' results.') + parser.add_argument('config', help='config file path') + parser.add_argument('--checkpoint', '-c', type=str, help='checkpoint file') + parser.add_argument( + '--show-number', + '-n', + type=int, + default=sys.maxsize, + help='number of images selected to save, ' + 'must bigger than 0. if the number is bigger than length ' + 'of dataset, show all the images in dataset; ' + 'default "sys.maxsize", show all images in dataset') + parser.add_argument( + '--output-dir', + default='assigned_results', + type=str, + help='The name of the folder where the image is saved.') + parser.add_argument( + '--device', default='cuda:0', help='Device used for inference.') + parser.add_argument( + '--show-prior', + default=False, + action='store_true', + help='Whether to show prior on image.') + parser.add_argument( + '--not-show-label', + default=False, + action='store_true', + help='Whether to show label on image.') + parser.add_argument('--seed', default=-1, type=int, help='random seed') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + register_all_modules() + + # set random seed + seed = int(args.seed) + if seed != -1: + print(f'Set the global seed: {seed}') + random.seed(int(args.seed)) + + cfg = Config.fromfile(args.config) + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + + # build model + model = MODELS.build(cfg.model) + if args.checkpoint is not None: + load_checkpoint(model, args.checkpoint) + elif isinstance(model.bbox_head, (YOLOv7HeadAssigner, RTMHeadAssigner)): + warnings.warn( + 'if you use dynamic_assignment methods such as YOLOv7 or ' + 'YOLOv8 or RTMDet assigner, please load the checkpoint.') + assert isinstance(model.bbox_head, (YOLOv5HeadAssigner, + YOLOv7HeadAssigner, + YOLOv8HeadAssigner, + RTMHeadAssigner)), \ + 'Now, this script only support YOLOv5, YOLOv7, YOLOv8 and RTMdet, ' \ + 'and bbox_head must use ' \ + '`YOLOv5HeadAssigner or YOLOv7HeadAssigne or YOLOv8HeadAssigner ' \ + 'or RTMHeadAssigner`. 
Please use `' \ + 'yolov5_s-v61_syncbn_fast_8xb16-300e_coco_assignervisualization.py' \ + 'or yolov7_tiny_syncbn_fast_8x16b-300e_coco_assignervisualization.py' \ + 'or yolov8_s_syncbn_fast_8xb16-500e_coco_assignervisualization.py' \ + 'or rtmdet_s_syncbn_fast_8xb32-300e_coco_assignervisualization.py' \ + """` as config file.""" + model.eval() + model.to(args.device) + + # build dataset + dataset_cfg = cfg.get('train_dataloader').get('dataset') + dataset = DATASETS.build(dataset_cfg) + + # get collate_fn + collate_fn_cfg = cfg.get('train_dataloader').pop( + 'collate_fn', dict(type='pseudo_collate')) + collate_fn_type = collate_fn_cfg.pop('type') + collate_fn = COLLATE_FUNCTIONS.get(collate_fn_type) + + # init visualizer + visualizer = YOLOAssignerVisualizer( + vis_backends=[{ + 'type': 'LocalVisBackend' + }], name='visualizer') + visualizer.dataset_meta = dataset.metainfo + # need priors size to draw priors + + if hasattr(model.bbox_head.prior_generator, 'base_anchors'): + visualizer.priors_size = model.bbox_head.prior_generator.base_anchors + + # make output dir + os.makedirs(args.output_dir, exist_ok=True) + print('Results will save to ', args.output_dir) + + # init visualization image number + assert args.show_number > 0 + display_number = min(args.show_number, len(dataset)) + + progress_bar = ProgressBar(display_number) + for ind_img in range(display_number): + data = dataset.prepare_data(ind_img) + if data is None: + print('Unable to visualize {} due to strong data augmentations'. + format(dataset[ind_img]['data_samples'].img_path)) + continue + # convert data to batch format + batch_data = collate_fn([data]) + with torch.no_grad(): + assign_results = model.assign(batch_data) + + img = data['inputs'].cpu().numpy().astype(np.uint8).transpose( + (1, 2, 0)) + # bgr2rgb + img = mmcv.bgr2rgb(img) + + gt_instances = data['data_samples'].gt_instances + + img_show = visualizer.draw_assign(img, assign_results, gt_instances, + args.show_prior, args.not_show_label) + + if hasattr(data['data_samples'], 'img_path'): + filename = osp.basename(data['data_samples'].img_path) + else: + # some dataset have not image path + filename = f'{ind_img}.jpg' + out_file = osp.join(args.output_dir, filename) + + # convert rgb 2 bgr and save img + mmcv.imwrite(mmcv.rgb2bgr(img_show), out_file) + progress_bar.update() + + +if __name__ == '__main__': + main() diff --git a/third_party/mmyolo/projects/assigner_visualization/configs/rtmdet_s_syncbn_fast_8xb32-300e_coco_assignervisualization.py b/third_party/mmyolo/projects/assigner_visualization/configs/rtmdet_s_syncbn_fast_8xb32-300e_coco_assignervisualization.py new file mode 100644 index 0000000000000000000000000000000000000000..006502eb45af9ece927b68359525cc6c2de30788 --- /dev/null +++ b/third_party/mmyolo/projects/assigner_visualization/configs/rtmdet_s_syncbn_fast_8xb32-300e_coco_assignervisualization.py @@ -0,0 +1,9 @@ +_base_ = ['../../../configs/rtmdet/rtmdet_s_syncbn_fast_8xb32-300e_coco.py'] + +custom_imports = dict(imports=[ + 'projects.assigner_visualization.detectors', + 'projects.assigner_visualization.dense_heads' +]) + +model = dict( + type='YOLODetectorAssigner', bbox_head=dict(type='RTMHeadAssigner')) diff --git a/third_party/mmyolo/projects/assigner_visualization/configs/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_assignervisualization.py b/third_party/mmyolo/projects/assigner_visualization/configs/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_assignervisualization.py new file mode 100644 index 
0000000000000000000000000000000000000000..1db799b5142375c86bd5a018764017c9d3170a07 --- /dev/null +++ b/third_party/mmyolo/projects/assigner_visualization/configs/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_assignervisualization.py @@ -0,0 +1,11 @@ +_base_ = [ + '../../../configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py' +] + +custom_imports = dict(imports=[ + 'projects.assigner_visualization.detectors', + 'projects.assigner_visualization.dense_heads' +]) + +model = dict( + type='YOLODetectorAssigner', bbox_head=dict(type='YOLOv5HeadAssigner')) diff --git a/third_party/mmyolo/projects/assigner_visualization/configs/yolov7_tiny_syncbn_fast_8xb16-300e_coco_assignervisualization.py b/third_party/mmyolo/projects/assigner_visualization/configs/yolov7_tiny_syncbn_fast_8xb16-300e_coco_assignervisualization.py new file mode 100644 index 0000000000000000000000000000000000000000..626dc18b59df3b9ced0781347989b65f64de5042 --- /dev/null +++ b/third_party/mmyolo/projects/assigner_visualization/configs/yolov7_tiny_syncbn_fast_8xb16-300e_coco_assignervisualization.py @@ -0,0 +1,9 @@ +_base_ = ['../../../configs/yolov7/yolov7_tiny_syncbn_fast_8x16b-300e_coco.py'] + +custom_imports = dict(imports=[ + 'projects.assigner_visualization.detectors', + 'projects.assigner_visualization.dense_heads' +]) + +model = dict( + type='YOLODetectorAssigner', bbox_head=dict(type='YOLOv7HeadAssigner')) diff --git a/third_party/mmyolo/projects/assigner_visualization/configs/yolov8_s_syncbn_fast_8xb16-500e_coco_assignervisualization.py b/third_party/mmyolo/projects/assigner_visualization/configs/yolov8_s_syncbn_fast_8xb16-500e_coco_assignervisualization.py new file mode 100644 index 0000000000000000000000000000000000000000..03dcae8c39a09c0200dc52123efc1bc0a348dea3 --- /dev/null +++ b/third_party/mmyolo/projects/assigner_visualization/configs/yolov8_s_syncbn_fast_8xb16-500e_coco_assignervisualization.py @@ -0,0 +1,9 @@ +_base_ = ['../../../configs/yolov8/yolov8_s_syncbn_fast_8xb16-500e_coco.py'] + +custom_imports = dict(imports=[ + 'projects.assigner_visualization.detectors', + 'projects.assigner_visualization.dense_heads' +]) + +model = dict( + type='YOLODetectorAssigner', bbox_head=dict(type='YOLOv8HeadAssigner')) diff --git a/third_party/mmyolo/projects/assigner_visualization/dense_heads/__init__.py b/third_party/mmyolo/projects/assigner_visualization/dense_heads/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..82adaaba8ebe3510895ebc3d5ed5ac7c573b41b2 --- /dev/null +++ b/third_party/mmyolo/projects/assigner_visualization/dense_heads/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .rtmdet_head_assigner import RTMHeadAssigner +from .yolov5_head_assigner import YOLOv5HeadAssigner +from .yolov7_head_assigner import YOLOv7HeadAssigner +from .yolov8_head_assigner import YOLOv8HeadAssigner + +__all__ = [ + 'YOLOv5HeadAssigner', 'YOLOv7HeadAssigner', 'YOLOv8HeadAssigner', + 'RTMHeadAssigner' +] diff --git a/third_party/mmyolo/projects/assigner_visualization/dense_heads/rtmdet_head_assigner.py b/third_party/mmyolo/projects/assigner_visualization/dense_heads/rtmdet_head_assigner.py new file mode 100644 index 0000000000000000000000000000000000000000..d3ae1c86d054d02a7a8537ee91251c0cca39edc6 --- /dev/null +++ b/third_party/mmyolo/projects/assigner_visualization/dense_heads/rtmdet_head_assigner.py @@ -0,0 +1,175 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import List, Union + +import torch +from mmdet.structures.bbox import distance2bbox +from mmdet.utils import InstanceList +from torch import Tensor + +from mmyolo.models import RTMDetHead +from mmyolo.models.utils import gt_instances_preprocess +from mmyolo.registry import MODELS + + +@MODELS.register_module() +class RTMHeadAssigner(RTMDetHead): + + def assign_by_gt_and_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + inputs_hw: Union[Tensor, tuple] = (640, 640) + ) -> dict: + """Calculate the assigning results based on the gt and features + extracted by the detection head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level + Has shape (N, num_anchors * num_classes, H, W) + bbox_preds (list[Tensor]): Decoded box for each scale + level with shape (N, num_anchors * 4, H, W) in + [tl_x, tl_y, br_x, br_y] format. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + inputs_hw (Union[Tensor, tuple]): Height and width of inputs size. + Returns: + dict[str, Tensor]: A dictionary of assigning results. + """ + num_imgs = len(batch_img_metas) + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.prior_generator.num_levels + # rtmdet's prior offset differs from others + prior_offset = self.prior_generator.offset + + gt_info = gt_instances_preprocess(batch_gt_instances, num_imgs) + gt_labels = gt_info[:, :, :1] + gt_bboxes = gt_info[:, :, 1:] # xyxy + pad_bbox_flag = (gt_bboxes.sum(-1, keepdim=True) > 0).float() + + device = cls_scores[0].device + + # If the shape does not equal, generate new one + if featmap_sizes != self.featmap_sizes_train: + self.featmap_sizes_train = featmap_sizes + mlvl_priors_with_stride = self.prior_generator.grid_priors( + featmap_sizes, device=device, with_stride=True) + self.flatten_priors_train = torch.cat( + mlvl_priors_with_stride, dim=0) + + flatten_cls_scores = torch.cat([ + cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.cls_out_channels) + for cls_score in cls_scores + ], 1).contiguous() + + flatten_bboxes = torch.cat([ + bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) + for bbox_pred in bbox_preds + ], 1) + flatten_bboxes = flatten_bboxes * self.flatten_priors_train[..., -1, + None] + flatten_bboxes = distance2bbox(self.flatten_priors_train[..., :2], + flatten_bboxes) + + assigned_result = self.assigner(flatten_bboxes.detach(), + flatten_cls_scores.detach(), + self.flatten_priors_train, gt_labels, + gt_bboxes, pad_bbox_flag) + + labels = assigned_result['assigned_labels'].reshape(-1) + bbox_targets = assigned_result['assigned_bboxes'].reshape(-1, 4) + + # FG cat_id: [0, num_classes -1], BG cat_id: num_classes + bg_class_ind = self.num_classes + pos_inds = ((labels >= 0) + & (labels < bg_class_ind)).nonzero().squeeze(1) + targets = bbox_targets[pos_inds] + gt_bboxes = gt_bboxes.squeeze(0) + matched_gt_inds = torch.tensor( + [((t == gt_bboxes).sum(dim=1) == t.shape[0]).nonzero()[0] + for t in targets], + device=device) + + level_inds = torch.zeros_like(labels) + img_inds = torch.zeros_like(labels) + level_nums = [0] + [f[0] * f[1] for f in featmap_sizes] + for i in range(len(level_nums) - 1): + level_nums[i + 1] = level_nums[i] + level_nums[i + 1] + level_inds[level_nums[i]:level_nums[i + 1]] = i + 
level_inds_pos = level_inds[pos_inds] + + img_inds = img_inds[pos_inds] + labels = labels[pos_inds] + + inputs_hw = batch_img_metas[0]['batch_input_shape'] + assign_results = [] + for i in range(self.num_levels): + retained_inds = level_inds_pos == i + if not retained_inds.any(): + assign_results_prior = { + 'stride': + self.featmap_strides[i], + 'grid_x_inds': + torch.zeros([0], dtype=torch.int64).to(device), + 'grid_y_inds': + torch.zeros([0], dtype=torch.int64).to(device), + 'img_inds': + torch.zeros([0], dtype=torch.int64).to(device), + 'class_inds': + torch.zeros([0], dtype=torch.int64).to(device), + 'retained_gt_inds': + torch.zeros([0], dtype=torch.int64).to(device), + 'prior_ind': + 0, + 'offset': + prior_offset + } + else: + w = inputs_hw[1] // self.featmap_strides[i] + + retained_pos_inds = pos_inds[retained_inds] - level_nums[i] + grid_y_inds = retained_pos_inds // w + grid_x_inds = retained_pos_inds - retained_pos_inds // w * w + assign_results_prior = { + 'stride': self.featmap_strides[i], + 'grid_x_inds': grid_x_inds, + 'grid_y_inds': grid_y_inds, + 'img_inds': img_inds[retained_inds], + 'class_inds': labels[retained_inds], + 'retained_gt_inds': matched_gt_inds[retained_inds], + 'prior_ind': 0, + 'offset': prior_offset + } + assign_results.append([assign_results_prior]) + return assign_results + + def assign(self, batch_data_samples: Union[list, dict], + inputs_hw: Union[tuple, torch.Size]) -> dict: + """Calculate assigning results. This function is provided to the + `assigner_visualization.py` script. + + Args: + batch_data_samples (List[:obj:`DetDataSample`], dict): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + inputs_hw: Height and width of inputs size + + Returns: + dict: A dictionary of assigning components. + """ + if isinstance(batch_data_samples, list): + raise NotImplementedError( + 'assigning results_list is not implemented') + else: + # Fast version + cls_scores, bbox_preds = self(batch_data_samples['feats']) + assign_inputs = (cls_scores, bbox_preds, + batch_data_samples['bboxes_labels'], + batch_data_samples['img_metas'], inputs_hw) + assign_results = self.assign_by_gt_and_feat(*assign_inputs) + return assign_results diff --git a/third_party/mmyolo/projects/assigner_visualization/dense_heads/yolov5_head_assigner.py b/third_party/mmyolo/projects/assigner_visualization/dense_heads/yolov5_head_assigner.py new file mode 100644 index 0000000000000000000000000000000000000000..599963fede32fc02c73db8c744dfbc2946dd53fb --- /dev/null +++ b/third_party/mmyolo/projects/assigner_visualization/dense_heads/yolov5_head_assigner.py @@ -0,0 +1,188 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Sequence, Union + +import torch +from mmdet.models.utils import unpack_gt_instances +from mmengine.structures import InstanceData +from torch import Tensor + +from mmyolo.models import YOLOv5Head +from mmyolo.registry import MODELS + + +@MODELS.register_module() +class YOLOv5HeadAssigner(YOLOv5Head): + + def assign_by_gt_and_feat( + self, + batch_gt_instances: Sequence[InstanceData], + batch_img_metas: Sequence[dict], + inputs_hw: Union[Tensor, tuple] = (640, 640) + ) -> dict: + """Calculate the assigning results based on the gt and features + extracted by the detection head. + + Args: + batch_gt_instances (Sequence[InstanceData]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. 
+ batch_img_metas (Sequence[dict]): Meta information of each image, + e.g., image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + inputs_hw (Union[Tensor, tuple]): Height and width of inputs size. + Returns: + dict[str, Tensor]: A dictionary of assigning results. + """ + # 1. Convert gt to norm format + batch_targets_normed = self._convert_gt_to_norm_format( + batch_gt_instances, batch_img_metas) + + device = batch_targets_normed.device + scaled_factor = torch.ones(7, device=device) + gt_inds = torch.arange( + batch_targets_normed.shape[1], + dtype=torch.long, + device=device, + requires_grad=False).unsqueeze(0).repeat((self.num_base_priors, 1)) + + assign_results = [] + for i in range(self.num_levels): + assign_results_feat = [] + h = inputs_hw[0] // self.featmap_strides[i] + w = inputs_hw[1] // self.featmap_strides[i] + + # empty gt bboxes + if batch_targets_normed.shape[1] == 0: + for k in range(self.num_base_priors): + assign_results_feat.append({ + 'stride': + self.featmap_strides[i], + 'grid_x_inds': + torch.zeros([0], dtype=torch.int64).to(device), + 'grid_y_inds': + torch.zeros([0], dtype=torch.int64).to(device), + 'img_inds': + torch.zeros([0], dtype=torch.int64).to(device), + 'class_inds': + torch.zeros([0], dtype=torch.int64).to(device), + 'retained_gt_inds': + torch.zeros([0], dtype=torch.int64).to(device), + 'prior_ind': + k + }) + assign_results.append(assign_results_feat) + continue + + priors_base_sizes_i = self.priors_base_sizes[i] + # feature map scale whwh + scaled_factor[2:6] = torch.tensor([w, h, w, h]) + # Scale batch_targets from range 0-1 to range 0-features_maps size. + # (num_base_priors, num_bboxes, 7) + batch_targets_scaled = batch_targets_normed * scaled_factor + + # 2. Shape match + wh_ratio = batch_targets_scaled[..., + 4:6] / priors_base_sizes_i[:, None] + match_inds = torch.max( + wh_ratio, 1 / wh_ratio).max(2)[0] < self.prior_match_thr + batch_targets_scaled = batch_targets_scaled[match_inds] + match_gt_inds = gt_inds[match_inds] + + # no gt bbox matches anchor + if batch_targets_scaled.shape[0] == 0: + for k in range(self.num_base_priors): + assign_results_feat.append({ + 'stride': + self.featmap_strides[i], + 'grid_x_inds': + torch.zeros([0], dtype=torch.int64).to(device), + 'grid_y_inds': + torch.zeros([0], dtype=torch.int64).to(device), + 'img_inds': + torch.zeros([0], dtype=torch.int64).to(device), + 'class_inds': + torch.zeros([0], dtype=torch.int64).to(device), + 'retained_gt_inds': + torch.zeros([0], dtype=torch.int64).to(device), + 'prior_ind': + k + }) + assign_results.append(assign_results_feat) + continue + + # 3. Positive samples with additional neighbors + + # check the left, up, right, bottom sides of the + # targets grid, and determine whether assigned + # them as positive samples as well. 
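+            # If the fractional part of the gt center (in grid units) lies
+            # within `near_neighbor_thr` of a cell border, the adjacent cell
+            # on that side is also marked positive; the matched targets are
+            # replicated and shifted via `grid_offset` further below.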
+ batch_targets_cxcy = batch_targets_scaled[:, 2:4] + grid_xy = scaled_factor[[2, 3]] - batch_targets_cxcy + left, up = ((batch_targets_cxcy % 1 < self.near_neighbor_thr) & + (batch_targets_cxcy > 1)).T + right, bottom = ((grid_xy % 1 < self.near_neighbor_thr) & + (grid_xy > 1)).T + offset_inds = torch.stack( + (torch.ones_like(left), left, up, right, bottom)) + + batch_targets_scaled = batch_targets_scaled.repeat( + (5, 1, 1))[offset_inds] + retained_gt_inds = match_gt_inds.repeat((5, 1))[offset_inds] + retained_offsets = self.grid_offset.repeat(1, offset_inds.shape[1], + 1)[offset_inds] + + # prepare pred results and positive sample indexes to + # calculate class loss and bbox lo + _chunk_targets = batch_targets_scaled.chunk(4, 1) + img_class_inds, grid_xy, grid_wh, priors_inds = _chunk_targets + priors_inds, (img_inds, class_inds) = priors_inds.long().view( + -1), img_class_inds.long().T + + grid_xy_long = (grid_xy - + retained_offsets * self.near_neighbor_thr).long() + grid_x_inds, grid_y_inds = grid_xy_long.T + for k in range(self.num_base_priors): + retained_inds = priors_inds == k + assign_results_prior = { + 'stride': self.featmap_strides[i], + 'grid_x_inds': grid_x_inds[retained_inds], + 'grid_y_inds': grid_y_inds[retained_inds], + 'img_inds': img_inds[retained_inds], + 'class_inds': class_inds[retained_inds], + 'retained_gt_inds': retained_gt_inds[retained_inds], + 'prior_ind': k + } + assign_results_feat.append(assign_results_prior) + assign_results.append(assign_results_feat) + return assign_results + + def assign(self, batch_data_samples: Union[list, dict], + inputs_hw: Union[tuple, torch.Size]) -> dict: + """Calculate assigning results. This function is provided to the + `assigner_visualization.py` script. + + Args: + batch_data_samples (List[:obj:`DetDataSample`], dict): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + inputs_hw: Height and width of inputs size + + Returns: + dict: A dictionary of assigning components. + """ + if isinstance(batch_data_samples, list): + outputs = unpack_gt_instances(batch_data_samples) + (batch_gt_instances, batch_gt_instances_ignore, + batch_img_metas) = outputs + + assign_inputs = (batch_gt_instances, batch_img_metas, + batch_gt_instances_ignore, inputs_hw) + else: + # Fast version + assign_inputs = (batch_data_samples['bboxes_labels'], + batch_data_samples['img_metas'], inputs_hw) + assign_results = self.assign_by_gt_and_feat(*assign_inputs) + + return assign_results diff --git a/third_party/mmyolo/projects/assigner_visualization/dense_heads/yolov7_head_assigner.py b/third_party/mmyolo/projects/assigner_visualization/dense_heads/yolov7_head_assigner.py new file mode 100644 index 0000000000000000000000000000000000000000..de2a90e36b57f5ad54158ee546dac6cf513cd5a3 --- /dev/null +++ b/third_party/mmyolo/projects/assigner_visualization/dense_heads/yolov7_head_assigner.py @@ -0,0 +1,159 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import List, Union + +import torch +from mmdet.utils import InstanceList +from torch import Tensor + +from mmyolo.models import YOLOv7Head +from mmyolo.registry import MODELS + + +@MODELS.register_module() +class YOLOv7HeadAssigner(YOLOv7Head): + + def assign_by_gt_and_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + objectnesses: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + inputs_hw: Union[Tensor, tuple], + ) -> dict: + """Calculate the assigning results based on the gt and features + extracted by the detection head. + Args: + cls_scores (Sequence[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_priors * num_classes. + bbox_preds (Sequence[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_priors * 4. + objectnesses (Sequence[Tensor]): Score factor for + all scale level, each is a 4D-tensor, has shape + (batch_size, 1, H, W) + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + inputs_hw (Union[Tensor, tuple]): Height and width of inputs size. + Returns: + dict[str, Tensor]: A dictionary of assigning results. + """ + device = cls_scores[0][0].device + + head_preds = self._merge_predict_results(bbox_preds, objectnesses, + cls_scores) + + batch_targets_normed = self._convert_gt_to_norm_format( + batch_gt_instances, batch_img_metas) + + # yolov5_assign and simota_assign + assigner_results = self.assigner( + head_preds, + batch_targets_normed, + batch_img_metas[0]['batch_input_shape'], + self.priors_base_sizes, + self.grid_offset, + near_neighbor_thr=self.near_neighbor_thr) + + # multi-level positive sample position. + mlvl_positive_infos = assigner_results['mlvl_positive_infos'] + # assigned results with label and bboxes information. 
+ mlvl_targets_normed = assigner_results['mlvl_targets_normed'] + + assign_results = [] + for i in range(self.num_levels): + assign_results_feat = [] + # no gt bbox matches anchor + if mlvl_positive_infos[i].shape[0] == 0: + for k in range(self.num_base_priors): + assign_results_feat.append({ + 'stride': + self.featmap_strides[i], + 'grid_x_inds': + torch.zeros([0], dtype=torch.int64).to(device), + 'grid_y_inds': + torch.zeros([0], dtype=torch.int64).to(device), + 'img_inds': + torch.zeros([0], dtype=torch.int64).to(device), + 'class_inds': + torch.zeros([0], dtype=torch.int64).to(device), + 'retained_gt_inds': + torch.zeros([0], dtype=torch.int64).to(device), + 'prior_ind': + k + }) + assign_results.append(assign_results_feat) + continue + + # (batch_idx, prior_idx, x_scaled, y_scaled) + positive_info = mlvl_positive_infos[i] + targets_normed = mlvl_targets_normed[i] + priors_inds = positive_info[:, 1] + grid_x_inds = positive_info[:, 2] + grid_y_inds = positive_info[:, 3] + img_inds = targets_normed[:, 0] + class_inds = targets_normed[:, 1].long() + retained_gt_inds = self.get_gt_inds( + targets_normed, batch_targets_normed[0]).long() + for k in range(self.num_base_priors): + retained_inds = priors_inds == k + assign_results_prior = { + 'stride': self.featmap_strides[i], + 'grid_x_inds': grid_x_inds[retained_inds], + 'grid_y_inds': grid_y_inds[retained_inds], + 'img_inds': img_inds[retained_inds], + 'class_inds': class_inds[retained_inds], + 'retained_gt_inds': retained_gt_inds[retained_inds], + 'prior_ind': k + } + assign_results_feat.append(assign_results_prior) + assign_results.append(assign_results_feat) + return assign_results + + def get_gt_inds(self, assigned_target, gt_instance): + """Judging which one gt_ind is assigned by comparing assign_target and + origin target. + + Args: + assigned_target (Tensor(assign_nums,7)): YOLOv7 assigning results. + gt_instance (Tensor(gt_nums,7)): Normalized gt_instance, It + usually includes ``bboxes`` and ``labels`` attributes. + Returns: + gt_inds (Tensor): the index which one gt is assigned. + """ + gt_inds = torch.zeros(assigned_target.shape[0]) + for i in range(assigned_target.shape[0]): + gt_inds[i] = ((assigned_target[i] == gt_instance).sum( + dim=1) == 7).nonzero().squeeze() + return gt_inds + + def assign(self, batch_data_samples: Union[list, dict], + inputs_hw: Union[tuple, torch.Size]) -> dict: + """Calculate assigning results. + + This function is provided to the + `assigner_visualization.py` script. + Args: + batch_data_samples (List[:obj:`DetDataSample`], dict): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + inputs_hw: Height and width of inputs size + Returns: + dict: A dictionary of assigning components. 
+ """ + if isinstance(batch_data_samples, list): + raise NotImplementedError( + 'assigning results_list is not implemented') + else: + # Fast version + cls_scores, bbox_preds, objectnesses = self( + batch_data_samples['feats']) + assign_inputs = (cls_scores, bbox_preds, objectnesses, + batch_data_samples['bboxes_labels'], + batch_data_samples['img_metas'], inputs_hw) + assign_results = self.assign_by_gt_and_feat(*assign_inputs) + return assign_results diff --git a/third_party/mmyolo/projects/assigner_visualization/dense_heads/yolov8_head_assigner.py b/third_party/mmyolo/projects/assigner_visualization/dense_heads/yolov8_head_assigner.py new file mode 100644 index 0000000000000000000000000000000000000000..49d254fdf5ae1e941b5c9b906223ec47311439c3 --- /dev/null +++ b/third_party/mmyolo/projects/assigner_visualization/dense_heads/yolov8_head_assigner.py @@ -0,0 +1,180 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Union + +import torch +from mmdet.utils import InstanceList +from torch import Tensor + +from mmyolo.models import YOLOv8Head +from mmyolo.models.utils import gt_instances_preprocess +from mmyolo.registry import MODELS + + +@MODELS.register_module() +class YOLOv8HeadAssigner(YOLOv8Head): + + def assign_by_gt_and_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + inputs_hw: Union[Tensor, tuple] = (640, 640) + ) -> dict: + """Calculate the assigning results based on the gt and features + extracted by the detection head. + Args: + cls_scores (Sequence[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_priors * num_classes. + bbox_preds (Sequence[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_priors * 4. + bbox_dist_preds (Sequence[Tensor]): Box distribution logits for + each scale level with shape (bs, reg_max + 1, H*W, 4). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + inputs_hw (Union[Tensor, tuple]): Height and width of inputs size. + Returns: + dict[str, Tensor]: A dictionary of assigning results. 
+ """ + num_imgs = len(batch_img_metas) + device = cls_scores[0].device + + current_featmap_sizes = [ + cls_score.shape[2:] for cls_score in cls_scores + ] + # If the shape does not equal, generate new one + if current_featmap_sizes != self.featmap_sizes_train: + self.featmap_sizes_train = current_featmap_sizes + + mlvl_priors_with_stride = self.prior_generator.grid_priors( + self.featmap_sizes_train, + dtype=cls_scores[0].dtype, + device=device, + with_stride=True) + + self.num_level_priors = [len(n) for n in mlvl_priors_with_stride] + self.flatten_priors_train = torch.cat( + mlvl_priors_with_stride, dim=0) + self.stride_tensor = self.flatten_priors_train[..., [2]] + + # gt info + gt_info = gt_instances_preprocess(batch_gt_instances, num_imgs) + gt_labels = gt_info[:, :, :1] + gt_bboxes = gt_info[:, :, 1:] # xyxy + pad_bbox_flag = (gt_bboxes.sum(-1, keepdim=True) > 0).float() + + # pred info + flatten_cls_preds = [ + cls_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.num_classes) + for cls_pred in cls_scores + ] + flatten_pred_bboxes = [ + bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) + for bbox_pred in bbox_preds + ] + # (bs, n, 4 * reg_max) + + flatten_cls_preds = torch.cat(flatten_cls_preds, dim=1) + flatten_pred_bboxes = torch.cat(flatten_pred_bboxes, dim=1) + flatten_pred_bboxes = self.bbox_coder.decode( + self.flatten_priors_train[..., :2], flatten_pred_bboxes, + self.stride_tensor[..., 0]) + + assigned_result = self.assigner( + (flatten_pred_bboxes.detach()).type(gt_bboxes.dtype), + flatten_cls_preds.detach().sigmoid(), self.flatten_priors_train, + gt_labels, gt_bboxes, pad_bbox_flag) + + labels = assigned_result['assigned_labels'].reshape(-1) + bbox_targets = assigned_result['assigned_bboxes'].reshape(-1, 4) + fg_mask_pre_prior = assigned_result['fg_mask_pre_prior'].squeeze(0) + + pos_inds = fg_mask_pre_prior.nonzero().squeeze(1) + + targets = bbox_targets[pos_inds] + gt_bboxes = gt_bboxes.squeeze(0) + matched_gt_inds = torch.tensor( + [((t == gt_bboxes).sum(dim=1) == t.shape[0]).nonzero()[0] + for t in targets], + device=device) + + level_inds = torch.zeros_like(labels) + img_inds = torch.zeros_like(labels) + level_nums = [0] + self.num_level_priors + for i in range(len(level_nums) - 1): + level_nums[i + 1] = level_nums[i] + level_nums[i + 1] + level_inds[level_nums[i]:level_nums[i + 1]] = i + level_inds_pos = level_inds[pos_inds] + + img_inds = img_inds[pos_inds] + labels = labels[pos_inds] + + assign_results = [] + for i in range(self.num_levels): + retained_inds = level_inds_pos == i + if not retained_inds.any(): + assign_results_prior = { + 'stride': + self.featmap_strides[i], + 'grid_x_inds': + torch.zeros([0], dtype=torch.int64).to(device), + 'grid_y_inds': + torch.zeros([0], dtype=torch.int64).to(device), + 'img_inds': + torch.zeros([0], dtype=torch.int64).to(device), + 'class_inds': + torch.zeros([0], dtype=torch.int64).to(device), + 'retained_gt_inds': + torch.zeros([0], dtype=torch.int64).to(device), + 'prior_ind': + 0 + } + else: + w = inputs_hw[1] // self.featmap_strides[i] + + retained_pos_inds = pos_inds[retained_inds] - level_nums[i] + grid_y_inds = retained_pos_inds // w + grid_x_inds = retained_pos_inds - retained_pos_inds // w * w + assign_results_prior = { + 'stride': self.featmap_strides[i], + 'grid_x_inds': grid_x_inds, + 'grid_y_inds': grid_y_inds, + 'img_inds': img_inds[retained_inds], + 'class_inds': labels[retained_inds], + 'retained_gt_inds': matched_gt_inds[retained_inds], + 'prior_ind': 0 + } + 
assign_results.append([assign_results_prior]) + return assign_results + + def assign(self, batch_data_samples: Union[list, dict], + inputs_hw: Union[tuple, torch.Size]) -> dict: + """Calculate assigning results. + + This function is provided to the + `assigner_visualization.py` script. + Args: + batch_data_samples (List[:obj:`DetDataSample`], dict): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + inputs_hw: Height and width of inputs size + Returns: + dict: A dictionary of assigning components. + """ + if isinstance(batch_data_samples, list): + raise NotImplementedError( + 'assigning results_list is not implemented') + else: + # Fast version + cls_scores, bbox_preds = self(batch_data_samples['feats']) + assign_inputs = (cls_scores, bbox_preds, + batch_data_samples['bboxes_labels'], + batch_data_samples['img_metas'], inputs_hw) + assign_results = self.assign_by_gt_and_feat(*assign_inputs) + return assign_results diff --git a/third_party/mmyolo/projects/assigner_visualization/detectors/__init__.py b/third_party/mmyolo/projects/assigner_visualization/detectors/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..155606a0136ef3e93d90347773af3eb7010b27ac --- /dev/null +++ b/third_party/mmyolo/projects/assigner_visualization/detectors/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from projects.assigner_visualization.detectors.yolo_detector_assigner import \ + YOLODetectorAssigner + +__all__ = ['YOLODetectorAssigner'] diff --git a/third_party/mmyolo/projects/assigner_visualization/detectors/yolo_detector_assigner.py b/third_party/mmyolo/projects/assigner_visualization/detectors/yolo_detector_assigner.py new file mode 100644 index 0000000000000000000000000000000000000000..5b723e01f65381155aaae962415d3c70040de06b --- /dev/null +++ b/third_party/mmyolo/projects/assigner_visualization/detectors/yolo_detector_assigner.py @@ -0,0 +1,34 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Union + +from mmyolo.models import YOLODetector +from mmyolo.registry import MODELS +from projects.assigner_visualization.dense_heads import (RTMHeadAssigner, + YOLOv7HeadAssigner, + YOLOv8HeadAssigner) + + +@MODELS.register_module() +class YOLODetectorAssigner(YOLODetector): + + def assign(self, data: dict) -> Union[dict, list]: + """Calculate assigning results from a batch of inputs and data + samples.This function is provided to the `assigner_visualization.py` + script. + + Args: + data (dict or tuple or list): Data sampled from dataset. + + Returns: + dict: A dictionary of assigning components. 
+ """ + assert isinstance(data, dict) + assert len(data['inputs']) == 1, 'Only support batchsize == 1' + data = self.data_preprocessor(data, True) + available_assigners = (YOLOv7HeadAssigner, YOLOv8HeadAssigner, + RTMHeadAssigner) + if isinstance(self.bbox_head, available_assigners): + data['data_samples']['feats'] = self.extract_feat(data['inputs']) + inputs_hw = data['inputs'].shape[-2:] + assign_results = self.bbox_head.assign(data['data_samples'], inputs_hw) + return assign_results diff --git a/third_party/mmyolo/projects/assigner_visualization/visualization/__init__.py b/third_party/mmyolo/projects/assigner_visualization/visualization/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..521a25b8837cf084e78fffa9f84660a4c9ae02bb --- /dev/null +++ b/third_party/mmyolo/projects/assigner_visualization/visualization/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .assigner_visualizer import YOLOAssignerVisualizer + +__all__ = ['YOLOAssignerVisualizer'] diff --git a/third_party/mmyolo/projects/assigner_visualization/visualization/assigner_visualizer.py b/third_party/mmyolo/projects/assigner_visualization/visualization/assigner_visualizer.py new file mode 100644 index 0000000000000000000000000000000000000000..fe1f4f0b90da2bbd683e3f9845efb66c9348459e --- /dev/null +++ b/third_party/mmyolo/projects/assigner_visualization/visualization/assigner_visualizer.py @@ -0,0 +1,326 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from typing import List, Union + +import mmcv +import numpy as np +import torch +from mmdet.structures.bbox import HorizontalBoxes +from mmdet.visualization import DetLocalVisualizer +from mmdet.visualization.palette import _get_adaptive_scales, get_palette +from mmengine.structures import InstanceData +from torch import Tensor + +from mmyolo.registry import VISUALIZERS + + +@VISUALIZERS.register_module() +class YOLOAssignerVisualizer(DetLocalVisualizer): + """MMYOLO Detection Assigner Visualizer. + + This class is provided to the `assigner_visualization.py` script. + Args: + name (str): Name of the instance. Defaults to 'visualizer'. + """ + + def __init__(self, name: str = 'visualizer', *args, **kwargs): + super().__init__(name=name, *args, **kwargs) + # need priors_size from config + self.priors_size = None + + def draw_grid(self, + stride: int = 8, + line_styles: Union[str, List[str]] = ':', + colors: Union[str, tuple, List[str], + List[tuple]] = (180, 180, 180), + line_widths: Union[Union[int, float], + List[Union[int, float]]] = 1): + """Draw grids on image. + + Args: + stride (int): Downsample factor of feature map. + line_styles (Union[str, List[str]]): The linestyle + of lines. ``line_styles`` can have the same length with + texts or just single value. If ``line_styles`` is single + value, all the lines will have the same linestyle. + Reference to + https://matplotlib.org/stable/api/collections_api.html?highlight=collection#matplotlib.collections.AsteriskPolygonCollection.set_linestyle + for more details. Defaults to ':'. + colors (Union[str, tuple, List[str], List[tuple]]): The colors of + lines. ``colors`` can have the same length with lines or just + single value. If ``colors`` is single value, all the lines + will have the same colors. Reference to + https://matplotlib.org/stable/gallery/color/named_colors.html + for more details. Defaults to (180, 180, 180). + line_widths (Union[Union[int, float], List[Union[int, float]]]): + The linewidth of lines. 
``line_widths`` can have + the same length with lines or just single value. + If ``line_widths`` is single value, all the lines will + have the same linewidth. Defaults to 1. + """ + assert self._image is not None, 'Please set image using `set_image`' + # draw vertical lines + x_datas_vertical = ((np.arange(self.width // stride - 1) + 1) * + stride).reshape((-1, 1)).repeat( + 2, axis=1) + y_datas_vertical = np.array([[0, self.height - 1]]).repeat( + self.width // stride - 1, axis=0) + self.draw_lines( + x_datas_vertical, + y_datas_vertical, + colors=colors, + line_styles=line_styles, + line_widths=line_widths) + + # draw horizontal lines + x_datas_horizontal = np.array([[0, self.width - 1]]).repeat( + self.height // stride - 1, axis=0) + y_datas_horizontal = ((np.arange(self.height // stride - 1) + 1) * + stride).reshape((-1, 1)).repeat( + 2, axis=1) + self.draw_lines( + x_datas_horizontal, + y_datas_horizontal, + colors=colors, + line_styles=line_styles, + line_widths=line_widths) + + def draw_instances_assign(self, + instances: InstanceData, + retained_gt_inds: Tensor, + not_show_label: bool = False): + """Draw instances of GT. + + Args: + instances (:obj:`InstanceData`): gt_instance. It usually + includes ``bboxes`` and ``labels`` attributes. + retained_gt_inds (Tensor): The gt indexes assigned as the + positive sample in the current prior. + not_show_label (bool): Whether to show gt labels on images. + """ + assert self.dataset_meta is not None + classes = self.dataset_meta['classes'] + palette = self.dataset_meta['palette'] + if len(retained_gt_inds) == 0: + return self.get_image() + draw_gt_inds = torch.from_numpy( + np.array( + list(set(retained_gt_inds.cpu().numpy())), dtype=np.int64)) + bboxes = instances.bboxes[draw_gt_inds] + labels = instances.labels[draw_gt_inds] + + if not isinstance(bboxes, Tensor): + bboxes = bboxes.tensor + + edge_colors = [palette[i] for i in labels] + + max_label = int(max(labels) if len(labels) > 0 else 0) + text_palette = get_palette(self.text_color, max_label + 1) + text_colors = [text_palette[label] for label in labels] + + self.draw_bboxes( + bboxes, + edge_colors=edge_colors, + alpha=self.alpha, + line_widths=self.line_width) + + if not not_show_label: + positions = bboxes[:, :2] + self.line_width + areas = (bboxes[:, 3] - bboxes[:, 1]) * ( + bboxes[:, 2] - bboxes[:, 0]) + scales = _get_adaptive_scales(areas) + for i, (pos, label) in enumerate(zip(positions, labels)): + label_text = classes[ + label] if classes is not None else f'class {label}' + + self.draw_texts( + label_text, + pos, + colors=text_colors[i], + font_sizes=int(13 * scales[i]), + bboxes=[{ + 'facecolor': 'black', + 'alpha': 0.8, + 'pad': 0.7, + 'edgecolor': 'none' + }]) + + def draw_positive_assign(self, + grid_x_inds: Tensor, + grid_y_inds: Tensor, + class_inds: Tensor, + stride: int, + bboxes: Union[Tensor, HorizontalBoxes], + retained_gt_inds: Tensor, + offset: float = 0.5): + """ + + Args: + grid_x_inds (Tensor): The X-axis indexes of the positive sample + in current prior. + grid_y_inds (Tensor): The Y-axis indexes of the positive sample + in current prior. + class_inds (Tensor): The classes indexes of the positive sample + in current prior. + stride (int): Downsample factor of feature map. + bboxes (Union[Tensor, HorizontalBoxes]): Bounding boxes of GT. + retained_gt_inds (Tensor): The gt indexes assigned as the + positive sample in the current prior. + offset (float): The offset of points, the value is normalized + with corresponding stride. Defaults to 0.5. 
+ """ + if not isinstance(bboxes, Tensor): + # Convert HorizontalBoxes to Tensor + bboxes = bboxes.tensor + + # The PALETTE in the dataset_meta is required + assert self.dataset_meta is not None + palette = self.dataset_meta['palette'] + x = ((grid_x_inds + offset) * stride).long() + y = ((grid_y_inds + offset) * stride).long() + center = torch.stack((x, y), dim=-1) + + retained_bboxes = bboxes[retained_gt_inds] + bbox_wh = retained_bboxes[:, 2:] - retained_bboxes[:, :2] + bbox_area = bbox_wh[:, 0] * bbox_wh[:, 1] + radius = _get_adaptive_scales(bbox_area) * 4 + colors = [palette[i] for i in class_inds] + + self.draw_circles( + center, + radius, + colors, + line_widths=0, + face_colors=colors, + alpha=1.0) + + def draw_prior(self, + grid_x_inds: Tensor, + grid_y_inds: Tensor, + class_inds: Tensor, + stride: int, + feat_ind: int, + prior_ind: int, + offset: float = 0.5): + """Draw priors on image. + + Args: + grid_x_inds (Tensor): The X-axis indexes of the positive sample + in current prior. + grid_y_inds (Tensor): The Y-axis indexes of the positive sample + in current prior. + class_inds (Tensor): The classes indexes of the positive sample + in current prior. + stride (int): Downsample factor of feature map. + feat_ind (int): Index of featmap. + prior_ind (int): Index of prior in current featmap. + offset (float): The offset of points, the value is normalized + with corresponding stride. Defaults to 0.5. + """ + + palette = self.dataset_meta['palette'] + center_x = ((grid_x_inds + offset) * stride) + center_y = ((grid_y_inds + offset) * stride) + xyxy = torch.stack((center_x, center_y, center_x, center_y), dim=1) + device = xyxy.device + if self.priors_size is not None: + xyxy += self.priors_size[feat_ind][prior_ind].to(device) + else: + xyxy += torch.tensor( + [[-stride / 2, -stride / 2, stride / 2, stride / 2]], + device=device) + + colors = [palette[i] for i in class_inds] + self.draw_bboxes( + xyxy, + edge_colors=colors, + alpha=self.alpha, + line_styles='--', + line_widths=math.ceil(self.line_width * 0.3)) + + def draw_assign(self, + image: np.ndarray, + assign_results: List[List[dict]], + gt_instances: InstanceData, + show_prior: bool = False, + not_show_label: bool = False) -> np.ndarray: + """Draw assigning results. + + Args: + image (np.ndarray): The image to draw. + assign_results (list): The assigning results. + gt_instances (:obj:`InstanceData`): Data structure for + instance-level annotations or predictions. + show_prior (bool): Whether to show prior on image. + not_show_label (bool): Whether to show gt labels on images. + + Returns: + np.ndarray: the drawn image which channel is RGB. 
+ """ + img_show_list = [] + for feat_ind, assign_results_feat in enumerate(assign_results): + img_show_list_feat = [] + for prior_ind, assign_results_prior in enumerate( + assign_results_feat): + self.set_image(image) + h, w = image.shape[:2] + + # draw grid + stride = assign_results_prior['stride'] + self.draw_grid(stride) + + # draw prior on matched gt + grid_x_inds = assign_results_prior['grid_x_inds'] + grid_y_inds = assign_results_prior['grid_y_inds'] + class_inds = assign_results_prior['class_inds'] + prior_ind = assign_results_prior['prior_ind'] + offset = assign_results_prior.get('offset', 0.5) + + if show_prior: + self.draw_prior(grid_x_inds, grid_y_inds, class_inds, + stride, feat_ind, prior_ind, offset) + + # draw matched gt + retained_gt_inds = assign_results_prior['retained_gt_inds'] + self.draw_instances_assign(gt_instances, retained_gt_inds, + not_show_label) + + # draw positive + self.draw_positive_assign(grid_x_inds, grid_y_inds, class_inds, + stride, gt_instances.bboxes, + retained_gt_inds, offset) + + # draw title + if self.priors_size is not None: + base_prior = self.priors_size[feat_ind][prior_ind] + else: + base_prior = [stride, stride, stride * 2, stride * 2] + prior_size = (base_prior[2] - base_prior[0], + base_prior[3] - base_prior[1]) + pos = np.array((20, 20)) + text = f'feat_ind: {feat_ind} ' \ + f'prior_ind: {prior_ind} ' \ + f'prior_size: ({prior_size[0]}, {prior_size[1]})' + scales = _get_adaptive_scales(np.array([h * w / 16])) + font_sizes = int(13 * scales) + self.draw_texts( + text, + pos, + colors=self.text_color, + font_sizes=font_sizes, + bboxes=[{ + 'facecolor': 'black', + 'alpha': 0.8, + 'pad': 0.7, + 'edgecolor': 'none' + }]) + + img_show = self.get_image() + img_show = mmcv.impad(img_show, padding=(5, 5, 5, 5)) + img_show_list_feat.append(img_show) + img_show_list.append(np.concatenate(img_show_list_feat, axis=1)) + + # Merge all images into one image + # setting axis is to beautify the merged image + axis = 0 if len(assign_results[0]) > 1 else 1 + return np.concatenate(img_show_list, axis=axis) diff --git a/third_party/mmyolo/projects/easydeploy/README.md b/third_party/mmyolo/projects/easydeploy/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1816e7ed96ee34209c56af4a22eda5f1eb7e499b --- /dev/null +++ b/third_party/mmyolo/projects/easydeploy/README.md @@ -0,0 +1,11 @@ +# MMYOLO Model Easy-Deployment + +## Introduction + +This project is developed for easily converting your MMYOLO models to other inference backends without the need of MMDeploy, which reduces the cost of both time and effort on getting familiar with MMDeploy. + +Currently we support converting to `ONNX` and `TensorRT` formats, other inference backends such `ncnn` will be added to this project as well. 
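+
+Part of the conversion works by swapping training-time modules for export-friendly equivalents (for example the `DeployFocus` and `DeployC2f` rewrites under `backbone/`) before the model is traced. The snippet below is only an illustrative sketch of that idea: `rewrite_focus_modules` is a hypothetical helper and the class-name check is an assumption, not part of this project's API; the actual conversion entry points are documented in [Model Convert](docs/model_convert.md).
+
+```python
+# Illustrative sketch only: recursively replace `Focus` blocks with the
+# deploy-friendly `DeployFocus` wrapper before export. The helper name and
+# the class-name check are assumptions for illustration.
+import torch.nn as nn
+
+from projects.easydeploy.backbone import DeployFocus
+
+
+def rewrite_focus_modules(model: nn.Module) -> nn.Module:
+    for name, child in model.named_children():
+        if type(child).__name__ == 'Focus':
+            # wrap the original module so its weights are reused
+            setattr(model, name, DeployFocus(child))
+        else:
+            rewrite_focus_modules(child)
+    return model
+```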
+ +## Supported Backends + +- [Model Convert](docs/model_convert.md) diff --git a/third_party/mmyolo/projects/easydeploy/README_zh-CN.md b/third_party/mmyolo/projects/easydeploy/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..4c6bc0cf4ef91edeced04bdf15af08ae1f6f0dcd --- /dev/null +++ b/third_party/mmyolo/projects/easydeploy/README_zh-CN.md @@ -0,0 +1,11 @@ +# MMYOLO 模型转换 + +## 介绍 + +本项目作为 MMYOLO 的部署 project 单独存在,意图剥离 MMDeploy 当前的体系,独自支持用户完成模型训练后的转换和部署功能,使用户的学习和工程成本下降。 + +当前支持对 ONNX 格式和 TensorRT 格式的转换,后续对其他推理平台也会支持起来。 + +## 转换教程 + +- [Model Convert](docs/model_convert.md) diff --git a/third_party/mmyolo/projects/easydeploy/backbone/__init__.py b/third_party/mmyolo/projects/easydeploy/backbone/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..dc167f8515c66a30d884ed9655a11d45e21481c0 --- /dev/null +++ b/third_party/mmyolo/projects/easydeploy/backbone/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .common import DeployC2f +from .focus import DeployFocus, GConvFocus, NcnnFocus + +__all__ = ['DeployFocus', 'NcnnFocus', 'GConvFocus', 'DeployC2f'] diff --git a/third_party/mmyolo/projects/easydeploy/backbone/common.py b/third_party/mmyolo/projects/easydeploy/backbone/common.py new file mode 100644 index 0000000000000000000000000000000000000000..617875bd979a5b9150e476544090777118087a0b --- /dev/null +++ b/third_party/mmyolo/projects/easydeploy/backbone/common.py @@ -0,0 +1,16 @@ +import torch +import torch.nn as nn +from torch import Tensor + + +class DeployC2f(nn.Module): + + def __init__(self, *args, **kwargs): + super().__init__() + + def forward(self, x: Tensor) -> Tensor: + x_main = self.main_conv(x) + x_main = [x_main, x_main[:, self.mid_channels:, ...]] + x_main.extend(blocks(x_main[-1]) for blocks in self.blocks) + x_main.pop(1) + return self.final_conv(torch.cat(x_main, 1)) diff --git a/third_party/mmyolo/projects/easydeploy/backbone/focus.py b/third_party/mmyolo/projects/easydeploy/backbone/focus.py new file mode 100644 index 0000000000000000000000000000000000000000..2a19afcca1d9c4e27109daeebd83907cd9b7b284 --- /dev/null +++ b/third_party/mmyolo/projects/easydeploy/backbone/focus.py @@ -0,0 +1,79 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor + + +class DeployFocus(nn.Module): + + def __init__(self, orin_Focus: nn.Module): + super().__init__() + self.__dict__.update(orin_Focus.__dict__) + + def forward(self, x: Tensor) -> Tensor: + batch_size, channel, height, width = x.shape + x = x.reshape(batch_size, channel, -1, 2, width) + x = x.reshape(batch_size, channel, x.shape[2], 2, -1, 2) + half_h = x.shape[2] + half_w = x.shape[4] + x = x.permute(0, 5, 3, 1, 2, 4) + x = x.reshape(batch_size, channel * 4, half_h, half_w) + + return self.conv(x) + + +class NcnnFocus(nn.Module): + + def __init__(self, orin_Focus: nn.Module): + super().__init__() + self.__dict__.update(orin_Focus.__dict__) + + def forward(self, x: Tensor) -> Tensor: + batch_size, c, h, w = x.shape + assert h % 2 == 0 and w % 2 == 0, f'focus for yolox needs even feature\ + height and width, got {(h, w)}.' 
+ + x = x.reshape(batch_size, c * h, 1, w) + _b, _c, _h, _w = x.shape + g = _c // 2 + # fuse to ncnn's shufflechannel + x = x.view(_b, g, 2, _h, _w) + x = torch.transpose(x, 1, 2).contiguous() + x = x.view(_b, -1, _h, _w) + + x = x.reshape(_b, c * h * w, 1, 1) + + _b, _c, _h, _w = x.shape + g = _c // 2 + # fuse to ncnn's shufflechannel + x = x.view(_b, g, 2, _h, _w) + x = torch.transpose(x, 1, 2).contiguous() + x = x.view(_b, -1, _h, _w) + + x = x.reshape(_b, c * 4, h // 2, w // 2) + + return self.conv(x) + + +class GConvFocus(nn.Module): + + def __init__(self, orin_Focus: nn.Module): + super().__init__() + device = next(orin_Focus.parameters()).device + self.weight1 = torch.tensor([[1., 0], [0, 0]]).expand(3, 1, 2, + 2).to(device) + self.weight2 = torch.tensor([[0, 0], [1., 0]]).expand(3, 1, 2, + 2).to(device) + self.weight3 = torch.tensor([[0, 1.], [0, 0]]).expand(3, 1, 2, + 2).to(device) + self.weight4 = torch.tensor([[0, 0], [0, 1.]]).expand(3, 1, 2, + 2).to(device) + self.__dict__.update(orin_Focus.__dict__) + + def forward(self, x: Tensor) -> Tensor: + conv1 = F.conv2d(x, self.weight1, stride=2, groups=3) + conv2 = F.conv2d(x, self.weight2, stride=2, groups=3) + conv3 = F.conv2d(x, self.weight3, stride=2, groups=3) + conv4 = F.conv2d(x, self.weight4, stride=2, groups=3) + return self.conv(torch.cat([conv1, conv2, conv3, conv4], dim=1)) diff --git a/third_party/mmyolo/projects/easydeploy/bbox_code/__init__.py b/third_party/mmyolo/projects/easydeploy/bbox_code/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b85a815536a5749a15f0ad6aab2b028eb6a3fe0a --- /dev/null +++ b/third_party/mmyolo/projects/easydeploy/bbox_code/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .bbox_coder import (rtmdet_bbox_decoder, yolov5_bbox_decoder, + yolox_bbox_decoder) + +__all__ = ['yolov5_bbox_decoder', 'rtmdet_bbox_decoder', 'yolox_bbox_decoder'] diff --git a/third_party/mmyolo/projects/easydeploy/bbox_code/bbox_coder.py b/third_party/mmyolo/projects/easydeploy/bbox_code/bbox_coder.py new file mode 100644 index 0000000000000000000000000000000000000000..6483cf8b0328aff3d61f1fa0788337ab536d347d --- /dev/null +++ b/third_party/mmyolo/projects/easydeploy/bbox_code/bbox_coder.py @@ -0,0 +1,46 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Optional + +import torch +from torch import Tensor + + +def yolov5_bbox_decoder(priors: Tensor, bbox_preds: Tensor, + stride: Tensor) -> Tensor: + bbox_preds = bbox_preds.sigmoid() + + x_center = (priors[..., 0] + priors[..., 2]) * 0.5 + y_center = (priors[..., 1] + priors[..., 3]) * 0.5 + w = priors[..., 2] - priors[..., 0] + h = priors[..., 3] - priors[..., 1] + + x_center_pred = (bbox_preds[..., 0] - 0.5) * 2 * stride + x_center + y_center_pred = (bbox_preds[..., 1] - 0.5) * 2 * stride + y_center + w_pred = (bbox_preds[..., 2] * 2)**2 * w + h_pred = (bbox_preds[..., 3] * 2)**2 * h + + decoded_bboxes = torch.stack( + [x_center_pred, y_center_pred, w_pred, h_pred], dim=-1) + + return decoded_bboxes + + +def rtmdet_bbox_decoder(priors: Tensor, bbox_preds: Tensor, + stride: Optional[Tensor]) -> Tensor: + stride = stride[None, :, None] + bbox_preds *= stride + tl_x = (priors[..., 0] - bbox_preds[..., 0]) + tl_y = (priors[..., 1] - bbox_preds[..., 1]) + br_x = (priors[..., 0] + bbox_preds[..., 2]) + br_y = (priors[..., 1] + bbox_preds[..., 3]) + decoded_bboxes = torch.stack([tl_x, tl_y, br_x, br_y], -1) + return decoded_bboxes + + +def yolox_bbox_decoder(priors: Tensor, bbox_preds: Tensor, + stride: Optional[Tensor]) -> Tensor: + stride = stride[None, :, None] + xys = (bbox_preds[..., :2] * stride) + priors + whs = bbox_preds[..., 2:].exp() * stride + decoded_bboxes = torch.cat([xys, whs], -1) + return decoded_bboxes diff --git a/third_party/mmyolo/projects/easydeploy/deepstream/CMakeLists.txt b/third_party/mmyolo/projects/easydeploy/deepstream/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..f640bea13bacfc0f6cc2f33e598f65cf5ce0922e --- /dev/null +++ b/third_party/mmyolo/projects/easydeploy/deepstream/CMakeLists.txt @@ -0,0 +1,35 @@ +cmake_minimum_required(VERSION 2.8.12) + +set(CMAKE_CUDA_ARCHITECTURES 60 61 62 70 72 75 86) +set(CMAKE_CUDA_COMPILER /usr/local/cuda/bin/nvcc) + +project(nvdsparsebbox_mmyolo LANGUAGES CXX) + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14 -O3 -g -Wall -Werror -shared -fPIC") +set(CMAKE_CXX_STANDARD 14) +set(CMAKE_BUILD_TYPE Release) +option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) + +# CUDA +find_package(CUDA REQUIRED) + +# TensorRT +set(TensorRT_INCLUDE_DIRS "/usr/include/x86_64-linux-gnu" CACHE STRING "TensorRT headers path") +set(TensorRT_LIBRARIES "/usr/lib/x86_64-linux-gnu" CACHE STRING "TensorRT libs path") + +# DeepStream +set(DEEPSTREAM "/opt/nvidia/deepstream/deepstream" CACHE STRING "DeepStream root path") +set(DS_LIBRARIES ${DEEPSTREAM}/lib) +set(DS_INCLUDE_DIRS ${DEEPSTREAM}/sources/includes) + +include_directories( + ${CUDA_INCLUDE_DIRS} + ${TensorRT_INCLUDE_DIRS} + ${DS_INCLUDE_DIRS}) + +add_library( + ${PROJECT_NAME} + SHARED + custom_mmyolo_bbox_parser/nvdsparsebbox_mmyolo.cpp) + +target_link_libraries(${PROJECT_NAME} PRIVATE nvinfer nvinfer_plugin) diff --git a/third_party/mmyolo/projects/easydeploy/deepstream/README.md b/third_party/mmyolo/projects/easydeploy/deepstream/README.md new file mode 100644 index 0000000000000000000000000000000000000000..111f3765e41d558b64097d8a25585bd9c14acf4f --- /dev/null +++ b/third_party/mmyolo/projects/easydeploy/deepstream/README.md @@ -0,0 +1,48 @@ +# Inference MMYOLO Models with DeepStream + +This project demonstrates how to inference MMYOLO models with customized parsers in [DeepStream SDK](https://developer.nvidia.com/deepstream-sdk). + +## Pre-requisites + +### 1. 
Install Nvidia Driver and CUDA
+
+First, please follow the official documents and instructions to install the dedicated NVIDIA graphics driver and a CUDA version matched to your GPU and target NVIDIA AIoT devices.
+
+### 2. Install DeepStream SDK
+
+Second, please follow the official instructions to download and install the DeepStream SDK. The current stable version of DeepStream is v6.2.
+
+### 3. Generate TensorRT Engine
+
+As DeepStream builds on top of several NVIDIA libraries, you need to first convert your trained MMYOLO models to TensorRT engine files. We strongly recommend trying the supported TensorRT deployment solution in [EasyDeploy](../../easydeploy/).
+
+## Build and Run
+
+Please make sure that your converted TensorRT engine is already located in the `deepstream` folder as the config shows. Create your own model config files and change the `config-file` parameter in [deepstream_app_config.txt](deepstream_app_config.txt) to the model you want to run with.
+
+```bash
+mkdir build && cd build
+cmake ..
+make -j$(nproc) && make install
+```
+
+Then you can run the inference with this command.
+
+```bash
+deepstream-app -c deepstream_app_config.txt
+```
+
+## Code Structure
+
+```bash
+├── deepstream
+│   ├── configs                   # config files for MMYOLO models
+│   │   └── config_infer_rtmdet.txt
+│   ├── custom_mmyolo_bbox_parser # customized parser for MMYOLO models to DeepStream formats
+│   │   └── nvdsparsebbox_mmyolo.cpp
+|   ├── CMakeLists.txt
+│   ├── coco_labels.txt           # labels for coco detection
+│   ├── deepstream_app_config.txt # DeepStream reference app configs for MMYOLO models
+│   ├── README_zh-CN.md
+│   └── README.md
+```
diff --git a/third_party/mmyolo/projects/easydeploy/deepstream/README_zh-CN.md b/third_party/mmyolo/projects/easydeploy/deepstream/README_zh-CN.md
new file mode 100644
index 0000000000000000000000000000000000000000..13a85d5bc90159c3ff9f1a32e93d01e82ed2faa4
--- /dev/null
+++ b/third_party/mmyolo/projects/easydeploy/deepstream/README_zh-CN.md
@@ -0,0 +1,48 @@
+# 使用 DeepStream SDK 推理 MMYOLO 模型
+
+本项目演示了如何使用 [DeepStream SDK](https://developer.nvidia.com/deepstream-sdk) 配合改写的 parser 来推理 MMYOLO 的模型。
+
+## 预先准备
+
+### 1. 安装 Nvidia 驱动和 CUDA
+
+首先请根据当前使用的显卡和目标部署设备,完成对应 Nvidia 显卡驱动和 CUDA 的安装。
+
+### 2. 安装 DeepStream SDK
+
+目前 DeepStream SDK 稳定版本已经更新到 v6.2,官方推荐使用这个版本。
+
+### 3. 将 MMYOLO 模型转换为 TensorRT Engine
+
+推荐使用 EasyDeploy 中的 TensorRT 方案完成目标模型的转换部署,具体可参考 [此文档](../../easydeploy/docs/model_convert.md)。
+
+## 编译使用
+
+当前项目使用的是 MMYOLO 的 rtmdet 模型,若想使用其他的模型,请参照目录下的配置文件进行改写。然后将转换完的 TensorRT engine 放在当前目录下并执行如下命令:
+
+```bash
+mkdir build && cd build
+cmake ..
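+# 如 DeepStream 或 TensorRT 未安装在默认路径,可通过 CMakeLists.txt 中的缓存变量覆盖,
+# 例如:cmake .. -DDEEPSTREAM=<你的 DeepStream 安装路径>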
+make -j$(nproc) && make install +``` + +完成编译后可使用如下命令进行推理: + +```bash +deepstream-app -c deepstream_app_config.txt +``` + +## 项目代码结构 + +```bash +├── deepstream +│ ├── configs # MMYOLO 模型对应的 DeepStream 配置 +│ │ └── config_infer_rtmdet.txt +│ ├── custom_mmyolo_bbox_parser # 适配 DeepStream formats 的 parser +│ │ └── nvdsparsebbox_mmyolo.cpp +| ├── CMakeLists.txt +│ ├── coco_labels.txt # coco labels +│ ├── deepstream_app_config.txt # DeepStream app 配置 +│ ├── README_zh-CN.md +│ └── README.md +``` diff --git a/third_party/mmyolo/projects/easydeploy/deepstream/coco_labels.txt b/third_party/mmyolo/projects/easydeploy/deepstream/coco_labels.txt new file mode 100644 index 0000000000000000000000000000000000000000..ca76c80b5b2cd0b25047f75736656cfebc9da7aa --- /dev/null +++ b/third_party/mmyolo/projects/easydeploy/deepstream/coco_labels.txt @@ -0,0 +1,80 @@ +person +bicycle +car +motorbike +aeroplane +bus +train +truck +boat +traffic light +fire hydrant +stop sign +parking meter +bench +bird +cat +dog +horse +sheep +cow +elephant +bear +zebra +giraffe +backpack +umbrella +handbag +tie +suitcase +frisbee +skis +snowboard +sports ball +kite +baseball bat +baseball glove +skateboard +surfboard +tennis racket +bottle +wine glass +cup +fork +knife +spoon +bowl +banana +apple +sandwich +orange +broccoli +carrot +hot dog +pizza +donut +cake +chair +sofa +pottedplant +bed +diningtable +toilet +tvmonitor +laptop +mouse +remote +keyboard +cell phone +microwave +oven +toaster +sink +refrigerator +book +clock +vase +scissors +teddy bear +hair drier +toothbrush diff --git a/third_party/mmyolo/projects/easydeploy/deepstream/configs/config_infer_rtmdet.txt b/third_party/mmyolo/projects/easydeploy/deepstream/configs/config_infer_rtmdet.txt new file mode 100644 index 0000000000000000000000000000000000000000..a1e5efd2a3810730144e037ee96dfbd36124b0e6 --- /dev/null +++ b/third_party/mmyolo/projects/easydeploy/deepstream/configs/config_infer_rtmdet.txt @@ -0,0 +1,22 @@ +[property] +gpu-id=0 +net-scale-factor=0.01735207357279195 +offsets=57.375;57.12;58.395 +model-color-format=1 +model-engine-file=../end2end.engine +labelfile-path=../coco_labels.txt +batch-size=1 +network-mode=0 +num-detected-classes=80 +interval=0 +gie-unique-id=1 +process-mode=1 +network-type=0 +cluster-mode=2 +maintain-aspect-ratio=1 +parse-bbox-func-name=NvDsInferParseCustomMMYOLO +custom-lib-path=../build/libnvdsparsebbox_mmyolo.so + +[class-attrs-all] +pre-cluster-threshold=0.45 +topk=100 diff --git a/third_party/mmyolo/projects/easydeploy/deepstream/configs/config_infer_yolov5.txt b/third_party/mmyolo/projects/easydeploy/deepstream/configs/config_infer_yolov5.txt new file mode 100644 index 0000000000000000000000000000000000000000..6ad7d6429cacd0a6050821e5b2a41317478f5119 --- /dev/null +++ b/third_party/mmyolo/projects/easydeploy/deepstream/configs/config_infer_yolov5.txt @@ -0,0 +1,21 @@ +[property] +gpu-id=0 +net-scale-factor=0.0039215697906911373 +model-color-format=0 +model-engine-file=../end2end.engine +labelfile-path=../coco_labels.txt +batch-size=1 +network-mode=0 +num-detected-classes=80 +interval=0 +gie-unique-id=1 +process-mode=1 +network-type=0 +cluster-mode=2 +maintain-aspect-ratio=1 +parse-bbox-func-name=NvDsInferParseCustomMMYOLO +custom-lib-path=../build/libnvdsparsebbox_mmyolo.so + +[class-attrs-all] +pre-cluster-threshold=0.45 +topk=100 diff --git a/third_party/mmyolo/projects/easydeploy/deepstream/configs/config_infer_yolov8.txt b/third_party/mmyolo/projects/easydeploy/deepstream/configs/config_infer_yolov8.txt new file mode 100644 
index 0000000000000000000000000000000000000000..6ad7d6429cacd0a6050821e5b2a41317478f5119
--- /dev/null
+++ b/third_party/mmyolo/projects/easydeploy/deepstream/configs/config_infer_yolov8.txt
@@ -0,0 +1,21 @@
+[property]
+gpu-id=0
+net-scale-factor=0.0039215697906911373
+model-color-format=0
+model-engine-file=../end2end.engine
+labelfile-path=../coco_labels.txt
+batch-size=1
+network-mode=0
+num-detected-classes=80
+interval=0
+gie-unique-id=1
+process-mode=1
+network-type=0
+cluster-mode=2
+maintain-aspect-ratio=1
+parse-bbox-func-name=NvDsInferParseCustomMMYOLO
+custom-lib-path=../build/libnvdsparsebbox_mmyolo.so
+
+[class-attrs-all]
+pre-cluster-threshold=0.45
+topk=100
diff --git a/third_party/mmyolo/projects/easydeploy/deepstream/custom_mmyolo_bbox_parser/nvdsparsebbox_mmyolo.cpp b/third_party/mmyolo/projects/easydeploy/deepstream/custom_mmyolo_bbox_parser/nvdsparsebbox_mmyolo.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..eb780856cbd2b289cdf9dc8518438f946a2ab548
--- /dev/null
+++ b/third_party/mmyolo/projects/easydeploy/deepstream/custom_mmyolo_bbox_parser/nvdsparsebbox_mmyolo.cpp
@@ -0,0 +1,118 @@
+#include "nvdsinfer_custom_impl.h"
+#include <cassert>
+#include <iostream>
+
+/**
+ * Function expected by DeepStream for decoding the MMYOLO output.
+ *
+ * C-linkage [extern "C"] was written to prevent name-mangling. This function must return true after
+ * adding all bounding boxes to the objectList vector.
+ *
+ * @param [outputLayersInfo] std::vector of NvDsInferLayerInfo objects with information about the output layer.
+ * @param [networkInfo] NvDsInferNetworkInfo object with information about the MMYOLO network.
+ * @param [detectionParams] NvDsInferParseDetectionParams with information about some config params.
+ * @param [objectList] std::vector of NvDsInferParseObjectInfo objects to which bounding box information must
+ * be stored.
+ *
+ * @return true
+ */
+
+// This is just the function prototype. The definition is written at the end of the file.
+extern "C" bool NvDsInferParseCustomMMYOLO(
+    std::vector<NvDsInferLayerInfo> const& outputLayersInfo,
+    NvDsInferNetworkInfo const& networkInfo,
+    NvDsInferParseDetectionParams const& detectionParams,
+    std::vector<NvDsInferParseObjectInfo>& objectList);
+
+static __inline__ float clamp(float& val, float min, float max)
+{
+    return val > min ? (val < max ? val : max) : min;
+}
+
+static std::vector<NvDsInferParseObjectInfo> decodeMMYoloTensor(
+    const int* num_dets,
+    const float* bboxes,
+    const float* scores,
+    const int* labels,
+    const float& conf_thres,
+    const unsigned int& img_w,
+    const unsigned int& img_h
+)
+{
+    std::vector<NvDsInferParseObjectInfo> bboxInfo;
+    size_t nums = num_dets[0];
+    for (size_t i = 0; i < nums; i++)
+    {
+        float score = scores[i];
+        if (score < conf_thres) continue;
+        float x0 = (bboxes[i * 4]);
+        float y0 = (bboxes[i * 4 + 1]);
+        float x1 = (bboxes[i * 4 + 2]);
+        float y1 = (bboxes[i * 4 + 3]);
+        x0 = clamp(x0, 0.f, img_w);
+        y0 = clamp(y0, 0.f, img_h);
+        x1 = clamp(x1, 0.f, img_w);
+        y1 = clamp(y1, 0.f, img_h);
+        NvDsInferParseObjectInfo obj;
+        obj.left = x0;
+        obj.top = y0;
+        obj.width = x1 - x0;
+        obj.height = y1 - y0;
+        obj.detectionConfidence = score;
+        obj.classId = labels[i];
+        bboxInfo.push_back(obj);
+    }
+
+    return bboxInfo;
+}
+
+/* C-linkage to prevent name-mangling */
+extern "C" bool NvDsInferParseCustomMMYOLO(
+    std::vector<NvDsInferLayerInfo> const& outputLayersInfo,
+    NvDsInferNetworkInfo const& networkInfo,
+    NvDsInferParseDetectionParams const& detectionParams,
+    std::vector<NvDsInferParseObjectInfo>& objectList)
+{
+
+// Some assertions and error checking.
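+// The end2end engine exported by easydeploy is expected to expose exactly four
+// output bindings, in this order: num_dets (int32, [N, 1]), boxes (float, [N, K, 4]),
+// scores (float, [N, K]) and labels (int32, [N, K]), where K is the keep_top_k
+// value used at export time (100 by default).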
+    if (outputLayersInfo.empty() || outputLayersInfo.size() != 4)
+    {
+        std::cerr << "Could not find output layer in bbox parsing" << std::endl;
+        return false;
+    }
+
+// Score threshold of bboxes.
+    const float conf_thres = detectionParams.perClassThreshold[0];
+
+// Obtaining the output layer.
+    const NvDsInferLayerInfo& num_dets = outputLayersInfo[0];
+    const NvDsInferLayerInfo& bboxes = outputLayersInfo[1];
+    const NvDsInferLayerInfo& scores = outputLayersInfo[2];
+    const NvDsInferLayerInfo& labels = outputLayersInfo[3];
+
+// num_dets(int) bboxes(float) scores(float) labels(int)
+    assert (num_dets.dims.numDims == 2);
+    assert (bboxes.dims.numDims == 3);
+    assert (scores.dims.numDims == 2);
+    assert (labels.dims.numDims == 2);
+
+
+// Decoding the output tensor of MMYOLO to the NvDsInferParseObjectInfo format.
+    std::vector<NvDsInferParseObjectInfo> objects =
+        decodeMMYoloTensor(
+            (const int*)(num_dets.buffer),
+            (const float*)(bboxes.buffer),
+            (const float*)(scores.buffer),
+            (const int*)(labels.buffer),
+            conf_thres,
+            networkInfo.width,
+            networkInfo.height
+        );
+
+    objectList.clear();
+    objectList = objects;
+    return true;
+}
+
+/* Check that the custom function has been defined correctly */
+CHECK_CUSTOM_PARSE_FUNC_PROTOTYPE(NvDsInferParseCustomMMYOLO);
diff --git a/third_party/mmyolo/projects/easydeploy/deepstream/deepstream_app_config.txt b/third_party/mmyolo/projects/easydeploy/deepstream/deepstream_app_config.txt
new file mode 100644
index 0000000000000000000000000000000000000000..331776897a5e9109b9007ed1b7974f128287c4fc
--- /dev/null
+++ b/third_party/mmyolo/projects/easydeploy/deepstream/deepstream_app_config.txt
@@ -0,0 +1,62 @@
+[application]
+enable-perf-measurement=1
+perf-measurement-interval-sec=5
+
+[tiled-display]
+enable=1
+rows=1
+columns=1
+width=1280
+height=720
+gpu-id=0
+nvbuf-memory-type=0
+
+[source0]
+enable=1
+type=3
+uri=file:///opt/nvidia/deepstream/deepstream/samples/streams/sample_1080p_h264.mp4
+num-sources=1
+gpu-id=0
+cudadec-memtype=0
+
+[sink0]
+enable=1
+type=2
+sync=0
+gpu-id=0
+nvbuf-memory-type=0
+
+[osd]
+enable=1
+gpu-id=0
+border-width=5
+text-size=15
+text-color=1;1;1;1;
+text-bg-color=0.3;0.3;0.3;1
+font=Serif
+show-clock=0
+clock-x-offset=800
+clock-y-offset=820
+clock-text-size=12
+clock-color=1;0;0;0
+nvbuf-memory-type=0
+
+[streammux]
+gpu-id=0
+live-source=0
+batch-size=1
+batched-push-timeout=40000
+width=1920
+height=1080
+enable-padding=0
+nvbuf-memory-type=0
+
+[primary-gie]
+enable=1
+gpu-id=0
+gie-unique-id=1
+nvbuf-memory-type=0
+config-file=configs/config_infer_rtmdet.txt
+
+[tests]
+file-loop=0
diff --git a/third_party/mmyolo/projects/easydeploy/docs/model_convert.md b/third_party/mmyolo/projects/easydeploy/docs/model_convert.md
new file mode 100644
index 0000000000000000000000000000000000000000..9af62599dd1b56648680fc315ca88c35c7b31cb9
--- /dev/null
+++ b/third_party/mmyolo/projects/easydeploy/docs/model_convert.md
@@ -0,0 +1,156 @@
+# MMYOLO 模型 ONNX 转换
+
+## 1.
导出后端支持的 ONNX + +## 环境依赖 + +- [onnx](https://github.com/onnx/onnx) + + ```shell + pip install onnx + ``` + + [onnx-simplifier](https://github.com/daquexian/onnx-simplifier) (可选,用于简化模型) + + ```shell + pip install onnx-simplifier + ``` + +\*\*\* 请确保您在 `MMYOLO` 根目录下运行相关脚本,避免无法找到相关依赖包。\*\*\* + +## 使用方法 + +[模型导出脚本](./projects/easydeploy/tools/export_onnx.py)用于将 `MMYOLO` 模型转换为 `onnx` 。 + +### 参数介绍: + +- `config` : 构建模型使用的配置文件,如 [`yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py`](./configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py) 。 +- `checkpoint` : 训练得到的权重文件,如 `yolov5s.pth` 。 +- `--work-dir` : 转换后的模型保存路径。 +- `--img-size`: 转换模型时输入的尺寸,如 `640 640`。 +- `--batch-size`: 转换后的模型输入 `batch size` 。 +- `--device`: 转换模型使用的设备,默认为 `cuda:0`。 +- `--simplify`: 是否简化导出的 `onnx` 模型,需要安装 [onnx-simplifier](https://github.com/daquexian/onnx-simplifier),默认关闭。 +- `--opset`: 指定导出 `onnx` 的 `opset`,默认为 `11` 。 +- `--backend`: 指定导出 `onnx` 用于的后端名称,`ONNXRuntime`: `onnxruntime`, `TensorRT8`: `tensorrt8`, `TensorRT7`: `tensorrt7`,默认为`onnxruntime`即 `ONNXRuntime`。 +- `--pre-topk`: 指定导出 `onnx` 的后处理筛选候选框个数阈值,默认为 `1000`。 +- `--keep-topk`: 指定导出 `onnx` 的非极大值抑制输出的候选框个数阈值,默认为 `100`。 +- `--iou-threshold`: 非极大值抑制中过滤重复候选框的 `iou` 阈值,默认为 `0.65`。 +- `--score-threshold`: 非极大值抑制中过滤候选框得分的阈值,默认为 `0.25`。 +- `--model-only`: 指定仅导出模型 backbone + neck, 不包含后处理,默认关闭。 + +例子: + +```shell +python ./projects/easydeploy/tools/export.py \ + configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py \ + yolov5s.pth \ + --work-dir work_dir \ + --img-size 640 640 \ + --batch 1 \ + --device cpu \ + --simplify \ + --opset 11 \ + --backend 1 \ + --pre-topk 1000 \ + --keep-topk 100 \ + --iou-threshold 0.65 \ + --score-threshold 0.25 +``` + +然后利用后端支持的工具如 `TensorRT` 读取 `onnx` 再次转换为后端支持的模型格式如 `.engine/.plan` 等。 + +`MMYOLO` 目前支持 `TensorRT8`, `TensorRT7`, `ONNXRuntime` 后端的端到端模型转换,目前仅支持静态 shape 模型的导出和转换,动态 batch 或动态长宽的模型端到端转换会在未来继续支持。 + +端到端转换得到的 `onnx` 模型输入输出如图: + +
+(图:端到端导出的 ONNX 模型输入输出节点示意)
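+
+也可以用 `onnxruntime` 直接打印导出模型的输入输出名称和形状进行核对(示意代码,假设端到端模型被导出到 `work_dir/end2end.onnx`,实际路径以导出脚本的输出为准):
+
+```python
+import onnxruntime as ort
+
+# 加载端到端导出的 ONNX 模型,CPU 即可完成检查
+sess = ort.InferenceSession('work_dir/end2end.onnx', providers=['CPUExecutionProvider'])
+for t in sess.get_inputs():
+    print('input :', t.name, t.shape)
+for t in sess.get_outputs():
+    print('output:', t.name, t.shape)
+```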
+
+输入名: `images`, 尺寸 640x640
+
+输出名: `num_dets`, 尺寸 1x1,表示检测目标数量。
+
+输出名: `boxes`, 尺寸 1x100x4,表示检测框的坐标,格式为 `x1y1x2y2`。
+
+输出名: `scores`, 尺寸 1x100,表示检测框的分数。
+
+输出名: `labels`, 尺寸 1x100,表示检测框的类别 id。
+
+可以利用 `num_dets` 中的个数对 `boxes`, `scores`, `labels` 进行截断,从 100 个检测结果中抽取前 `num_dets` 个目标作为最终检测结果。
+
+## 2. 仅导出模型 Backbone + Neck
+
+当您需要部署在非 `TensorRT`, `ONNXRuntime` 等支持端到端部署的平台时,您可以考虑使用 `--model-only` 参数并且不要传递 `--backend` 参数,您将会导出仅包含 `Backbone` + `Neck` 的模型,模型的部分输出如图:
+
+(图:仅导出 Backbone + Neck 时 ONNX 模型的部分输出节点示意)
+ +这种导出方式获取的 `ONNX` 模型具有如下优点: + +- 算子简单,一般而言只包含 `Conv`,激活函数等简单算子,几乎不存在无法正确导出的情况,对于嵌入式部署更加友好。 +- 方便不同算法之间对比速度性能,由于不同的算法后处理不同,仅对比 `backbone` + `Neck` 的速度更加公平。 + +也有如下缺点: + +- 后处理逻辑需要单独完成,会有额外的 `decode` + `nms` 的操作需要实现。 +- 与 `TensorRT` 相比,由于 `TensorRT` 可以利用多核优势并行进行后处理,使用 `--model-only` 方式导出的模型性能会差很多。 + +### 使用方法 + +```shell +python ./projects/easydeploy/tools/export.py \ + configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py \ + yolov5s.pth \ + --work-dir work_dir \ + --img-size 640 640 \ + --batch 1 \ + --device cpu \ + --simplify \ + --opset 11 \ + --model-only +``` + +## 使用 `model-only` 导出的 ONNX 进行推理 + +[模型推理脚本](./projects/easydeploy/examples/main_onnxruntime.py)用于推理导出的 `ONNX` 模型,需要安装基础依赖环境: + +[`onnxruntime`](https://github.com/microsoft/onnxruntime) 和 [`opencv-python`](https://github.com/opencv/opencv-python) + +```shell +pip install onnxruntime +pip install opencv-python==4.7.0.72 # 建议使用最新的 opencv +``` + +### 参数介绍: + +- `img` : 待检测的图片路径或图片文件夹路径。 +- `onnx` : 导出的 `model-only` ONNX 模型。 +- `--type` : 模型名称,目前支持 `yolov5`, `yolox`, `yolov6`, `ppyoloe`, `ppyoloep`, `yolov7`, `rtmdet`, `yolov8`。 +- `--img-size`: 转换模型时输入的尺寸,如 `640 640`。 +- `--out-dir`: 保存检测结果的路径 。 +- `--show`: 是否可视化检测结果。 +- `--score-thr`: 模型检测后处理的置信度分数 。 +- `--iou-thr`: 模型检测后处理的 IOU 分数 。 + +## 使用方法 + +```shell +cd ./projects/easydeploy/examples +python main_onnxruntime.py \ + "image_path_to_detect" \ + yolov5_s_model-only.onnx \ + --out-dir work_dir \ + --img-size 640 640 \ + --show \ + --score-thr 0.3 \ + --iou-thr 0.7 +``` + +*注意!!!* + +当您使用自定义数据集训练得到的模型时,请修改 [`config.py`](./projects/easydeploy/examples/config.py) 中 `CLASS_NAMES` 和 `CLASS_COLORS`,如果是 `yolov5` 或者 `yolov7` 基于 `anchor` 的模型请同时修改 `YOLOv5_ANCHORS` 和 `YOLOv7_ANCHORS`。 + +[`numpy_coder.py`](./projects/easydeploy/examples/numpy_coder.py) 是目前所有算法仅使用 `numpy` 实现的 `decoder`,如果您对性能有较高的要求,可以参照相关代码改写为 `c/c++`。 diff --git a/third_party/mmyolo/projects/easydeploy/examples/config.py b/third_party/mmyolo/projects/easydeploy/examples/config.py new file mode 100644 index 0000000000000000000000000000000000000000..4a85ff34273c22a356c9d6a3eaeb048b637b5f40 --- /dev/null +++ b/third_party/mmyolo/projects/easydeploy/examples/config.py @@ -0,0 +1,64 @@ +from enum import Enum + + +class TASK_TYPE(Enum): + DET = 'det' + SEG = 'seg' + POSE = 'pose' + + +class ModelType(Enum): + YOLOV5 = 'yolov5' + YOLOX = 'yolox' + PPYOLOE = 'ppyoloe' + PPYOLOEP = 'ppyoloep' + YOLOV6 = 'yolov6' + YOLOV7 = 'yolov7' + RTMDET = 'rtmdet' + YOLOV8 = 'yolov8' + + +CLASS_NAMES = ('person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', + 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', + 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', + 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', + 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', + 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', + 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', + 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', + 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', + 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', + 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', + 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', + 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', + 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush') + +CLASS_COLORS = [(220, 20, 60), (119, 11, 32), (0, 0, 142), (0, 0, 230), + (106, 0, 228), (0, 60, 100), (0, 80, 100), (0, 0, 70), + (0, 0, 192), (250, 170, 30), 
(100, 170, 30), (220, 220, 0), + (175, 116, 175), (250, 0, 30), (165, 42, 42), (255, 77, 255), + (0, 226, 252), (182, 182, 255), (0, 82, 0), (120, 166, 157), + (110, 76, 0), (174, 57, 255), (199, 100, 0), (72, 0, 118), + (255, 179, 240), (0, 125, 92), (209, 0, 151), (188, 208, 182), + (0, 220, 176), (255, 99, 164), (92, 0, 73), (133, 129, 255), + (78, 180, 255), (0, 228, 0), (174, 255, 243), (45, 89, 255), + (134, 134, 103), (145, 148, 174), (255, 208, 186), + (197, 226, 255), (171, 134, 1), (109, 63, 54), (207, 138, 255), + (151, 0, 95), (9, 80, 61), (84, 105, 51), (74, 65, 105), + (166, 196, 102), (208, 195, 210), (255, 109, 65), + (0, 143, 149), (179, 0, 194), (209, 99, 106), (5, 121, 0), + (227, 255, 205), (147, 186, 208), (153, 69, 1), (3, 95, 161), + (163, 255, 0), (119, 0, 170), (0, 182, 199), (0, 165, 120), + (183, 130, 88), (95, 32, 0), (130, 114, 135), (110, 129, 133), + (166, 74, 118), (219, 142, 185), (79, 210, 114), (178, 90, 62), + (65, 70, 15), (127, 167, 115), (59, 105, 106), (142, 108, 45), + (196, 172, 0), (95, 54, 80), (128, 76, 255), (201, 57, 1), + (246, 0, 122), (191, 162, 208)] + +YOLOv5_ANCHORS = [[(10, 13), (16, 30), (33, 23)], + [(30, 61), (62, 45), (59, 119)], + [(116, 90), (156, 198), (373, 326)]] + +YOLOv7_ANCHORS = [[(12, 16), (19, 36), (40, 28)], + [(36, 75), (76, 55), (72, 146)], + [(142, 110), (192, 243), (459, 401)]] diff --git a/third_party/mmyolo/projects/easydeploy/examples/cv2_nms.py b/third_party/mmyolo/projects/easydeploy/examples/cv2_nms.py new file mode 100644 index 0000000000000000000000000000000000000000..79e376356b75339c796aeeb280cd8cdb52db8518 --- /dev/null +++ b/third_party/mmyolo/projects/easydeploy/examples/cv2_nms.py @@ -0,0 +1,36 @@ +from typing import List, Tuple, Union + +import cv2 +from numpy import ndarray + +MAJOR, MINOR = map(int, cv2.__version__.split('.')[:2]) +assert MAJOR == 4 + + +def non_max_suppression(boxes: Union[List[ndarray], Tuple[ndarray]], + scores: Union[List[float], Tuple[float]], + labels: Union[List[int], Tuple[int]], + conf_thres: float = 0.25, + iou_thres: float = 0.65) -> Tuple[List, List, List]: + if MINOR >= 7: + indices = cv2.dnn.NMSBoxesBatched(boxes, scores, labels, conf_thres, + iou_thres) + elif MINOR == 6: + indices = cv2.dnn.NMSBoxes(boxes, scores, conf_thres, iou_thres) + else: + indices = cv2.dnn.NMSBoxes(boxes, scores, conf_thres, + iou_thres).flatten() + + nmsd_boxes = [] + nmsd_scores = [] + nmsd_labels = [] + for idx in indices: + box = boxes[idx] + # x0y0wh -> x0y0x1y1 + box[2:] = box[:2] + box[2:] + score = scores[idx] + label = labels[idx] + nmsd_boxes.append(box) + nmsd_scores.append(score) + nmsd_labels.append(label) + return nmsd_boxes, nmsd_scores, nmsd_labels diff --git a/third_party/mmyolo/projects/easydeploy/examples/main_onnxruntime.py b/third_party/mmyolo/projects/easydeploy/examples/main_onnxruntime.py new file mode 100644 index 0000000000000000000000000000000000000000..bc0ad1b0f10ed6cbea8c8b3c0c5010ec7a760cb5 --- /dev/null +++ b/third_party/mmyolo/projects/easydeploy/examples/main_onnxruntime.py @@ -0,0 +1,110 @@ +import math +import sys +from argparse import ArgumentParser +from pathlib import Path + +import cv2 +import onnxruntime +from config import (CLASS_COLORS, CLASS_NAMES, ModelType, YOLOv5_ANCHORS, + YOLOv7_ANCHORS) +from cv2_nms import non_max_suppression +from numpy_coder import Decoder +from preprocess import Preprocess +from tqdm import tqdm + +# Add __FILE__ to sys.path +sys.path.append(str(Path(__file__).resolve().parents[0])) + +IMG_EXTENSIONS = ('.jpg', '.jpeg', 
'.png', '.ppm', '.bmp', '.pgm', '.tif', + '.tiff', '.webp') + + +def path_to_list(path: str): + path = Path(path) + if path.is_file() and path.suffix in IMG_EXTENSIONS: + res_list = [str(path.absolute())] + elif path.is_dir(): + res_list = [ + str(p.absolute()) for p in path.iterdir() + if p.suffix in IMG_EXTENSIONS + ] + else: + raise RuntimeError + return res_list + + +def parse_args(): + parser = ArgumentParser() + parser.add_argument( + 'img', help='Image path, include image file, dir and URL.') + parser.add_argument('onnx', type=str, help='Onnx file') + parser.add_argument('--type', type=str, help='Model type') + parser.add_argument( + '--img-size', + nargs='+', + type=int, + default=[640, 640], + help='Image size of height and width') + parser.add_argument( + '--out-dir', default='./output', type=str, help='Path to output file') + parser.add_argument( + '--show', action='store_true', help='Show the detection results') + parser.add_argument( + '--score-thr', type=float, default=0.3, help='Bbox score threshold') + parser.add_argument( + '--iou-thr', type=float, default=0.7, help='Bbox iou threshold') + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + out_dir = Path(args.out_dir) + model_type = ModelType(args.type.lower()) + + if not args.show: + out_dir.mkdir(parents=True, exist_ok=True) + + files = path_to_list(args.img) + session = onnxruntime.InferenceSession( + args.onnx, providers=['CPUExecutionProvider']) + preprocessor = Preprocess(model_type) + decoder = Decoder(model_type, model_only=True) + if model_type == ModelType.YOLOV5: + anchors = YOLOv5_ANCHORS + elif model_type == ModelType.YOLOV7: + anchors = YOLOv7_ANCHORS + else: + anchors = None + + for file in tqdm(files): + image = cv2.imread(file) + image_h, image_w = image.shape[:2] + img, (ratio_w, ratio_h) = preprocessor(image, args.img_size) + features = session.run(None, {'images': img}) + decoder_outputs = decoder( + features, + args.score_thr, + num_labels=len(CLASS_NAMES), + anchors=anchors) + nmsd_boxes, nmsd_scores, nmsd_labels = non_max_suppression( + *decoder_outputs, args.score_thr, args.iou_thr) + for box, score, label in zip(nmsd_boxes, nmsd_scores, nmsd_labels): + x0, y0, x1, y1 = box + x0 = math.floor(min(max(x0 / ratio_w, 1), image_w - 1)) + y0 = math.floor(min(max(y0 / ratio_h, 1), image_h - 1)) + x1 = math.ceil(min(max(x1 / ratio_w, 1), image_w - 1)) + y1 = math.ceil(min(max(y1 / ratio_h, 1), image_h - 1)) + cv2.rectangle(image, (x0, y0), (x1, y1), CLASS_COLORS[label], 2) + cv2.putText(image, f'{CLASS_NAMES[label]}: {score:.2f}', + (x0, y0 - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, + (0, 255, 255), 2) + if args.show: + cv2.imshow('result', image) + cv2.waitKey(0) + else: + cv2.imwrite(f'{out_dir / Path(file).name}', image) + + +if __name__ == '__main__': + main() diff --git a/third_party/mmyolo/projects/easydeploy/examples/numpy_coder.py b/third_party/mmyolo/projects/easydeploy/examples/numpy_coder.py new file mode 100644 index 0000000000000000000000000000000000000000..ccd3687f89ed47dbbb1d90e603eba21a760bded9 --- /dev/null +++ b/third_party/mmyolo/projects/easydeploy/examples/numpy_coder.py @@ -0,0 +1,310 @@ +from typing import List, Tuple, Union + +import numpy as np +from config import ModelType +from numpy import ndarray + + +def softmax(x: ndarray, axis: int = -1) -> ndarray: + e_x = np.exp(x - np.max(x, axis=axis, keepdims=True)) + y = e_x / e_x.sum(axis=axis, keepdims=True) + return y + + +def sigmoid(x: ndarray) -> ndarray: + return 1. / (1. 
+ np.exp(-x)) + + +class Decoder: + + def __init__(self, model_type: ModelType, model_only: bool = False): + self.model_type = model_type + self.model_only = model_only + self.boxes_pro = [] + self.scores_pro = [] + self.labels_pro = [] + self.is_logging = False + + def __call__(self, + feats: Union[List, Tuple], + conf_thres: float, + num_labels: int = 80, + **kwargs) -> Tuple: + if not self.is_logging: + print('Only support decode in batch==1') + self.is_logging = True + self.boxes_pro.clear() + self.scores_pro.clear() + self.labels_pro.clear() + + if self.model_only: + # transpose channel to last dim for easy decoding + feats = [ + np.ascontiguousarray(feat[0].transpose(1, 2, 0)) + for feat in feats + ] + else: + # ax620a horizonX3 transpose channel to last dim by default + feats = [np.ascontiguousarray(feat) for feat in feats] + if self.model_type == ModelType.YOLOV5: + self.__yolov5_decode(feats, conf_thres, num_labels, **kwargs) + elif self.model_type == ModelType.YOLOX: + self.__yolox_decode(feats, conf_thres, num_labels, **kwargs) + elif self.model_type in (ModelType.PPYOLOE, ModelType.PPYOLOEP): + self.__ppyoloe_decode(feats, conf_thres, num_labels, **kwargs) + elif self.model_type == ModelType.YOLOV6: + self.__yolov6_decode(feats, conf_thres, num_labels, **kwargs) + elif self.model_type == ModelType.YOLOV7: + self.__yolov7_decode(feats, conf_thres, num_labels, **kwargs) + elif self.model_type == ModelType.RTMDET: + self.__rtmdet_decode(feats, conf_thres, num_labels, **kwargs) + elif self.model_type == ModelType.YOLOV8: + self.__yolov8_decode(feats, conf_thres, num_labels, **kwargs) + else: + raise NotImplementedError + return self.boxes_pro, self.scores_pro, self.labels_pro + + def __yolov5_decode(self, + feats: List[ndarray], + conf_thres: float, + num_labels: int = 80, + **kwargs): + anchors: Union[List, Tuple] = kwargs.get( + 'anchors', + [[(10, 13), (16, 30), + (33, 23)], [(30, 61), (62, 45), + (59, 119)], [(116, 90), (156, 198), (373, 326)]]) + for i, feat in enumerate(feats): + stride = 8 << i + feat_h, feat_w, _ = feat.shape + anchor = anchors[i] + feat = sigmoid(feat) + feat = feat.reshape((feat_h, feat_w, len(anchor), -1)) + box_feat, conf_feat, score_feat = np.split(feat, [4, 5], -1) + + hIdx, wIdx, aIdx, _ = np.where(conf_feat > conf_thres) + + num_proposal = hIdx.size + if not num_proposal: + continue + + score_feat = score_feat[hIdx, wIdx, aIdx] * conf_feat[hIdx, wIdx, + aIdx] + boxes = box_feat[hIdx, wIdx, aIdx] + labels = score_feat.argmax(-1) + scores = score_feat.max(-1) + + indices = np.where(scores > conf_thres)[0] + if len(indices) == 0: + continue + + for idx in indices: + a_w, a_h = anchor[aIdx[idx]] + x, y, w, h = boxes[idx] + x = (x * 2.0 - 0.5 + wIdx[idx]) * stride + y = (y * 2.0 - 0.5 + hIdx[idx]) * stride + w = (w * 2.0)**2 * a_w + h = (h * 2.0)**2 * a_h + + x0 = x - w / 2 + y0 = y - h / 2 + + self.scores_pro.append(float(scores[idx])) + self.boxes_pro.append( + np.array([x0, y0, w, h], dtype=np.float32)) + self.labels_pro.append(int(labels[idx])) + + def __yolox_decode(self, + feats: List[ndarray], + conf_thres: float, + num_labels: int = 80, + **kwargs): + for i, feat in enumerate(feats): + stride = 8 << i + score_feat, box_feat, conf_feat = np.split( + feat, [num_labels, num_labels + 4], -1) + conf_feat = sigmoid(conf_feat) + + hIdx, wIdx, _ = np.where(conf_feat > conf_thres) + + num_proposal = hIdx.size + if not num_proposal: + continue + + score_feat = sigmoid(score_feat[hIdx, wIdx]) * conf_feat[hIdx, + wIdx] + boxes = box_feat[hIdx, wIdx] + labels 
= score_feat.argmax(-1) + scores = score_feat.max(-1) + indices = np.where(scores > conf_thres)[0] + + if len(indices) == 0: + continue + + for idx in indices: + score = scores[idx] + label = labels[idx] + + x, y, w, h = boxes[idx] + + x = (x + wIdx[idx]) * stride + y = (y + hIdx[idx]) * stride + w = np.exp(w) * stride + h = np.exp(h) * stride + + x0 = x - w / 2 + y0 = y - h / 2 + + self.scores_pro.append(float(score)) + self.boxes_pro.append( + np.array([x0, y0, w, h], dtype=np.float32)) + self.labels_pro.append(int(label)) + + def __ppyoloe_decode(self, + feats: List[ndarray], + conf_thres: float, + num_labels: int = 80, + **kwargs): + reg_max: int = kwargs.get('reg_max', 17) + dfl = np.arange(0, reg_max, dtype=np.float32) + for i, feat in enumerate(feats): + stride = 8 << i + score_feat, box_feat = np.split(feat, [ + num_labels, + ], -1) + score_feat = sigmoid(score_feat) + _argmax = score_feat.argmax(-1) + _max = score_feat.max(-1) + indices = np.where(_max > conf_thres) + hIdx, wIdx = indices + num_proposal = hIdx.size + if not num_proposal: + continue + + scores = _max[hIdx, wIdx] + boxes = box_feat[hIdx, wIdx].reshape(num_proposal, 4, reg_max) + boxes = softmax(boxes, -1) @ dfl + labels = _argmax[hIdx, wIdx] + + for k in range(num_proposal): + score = scores[k] + label = labels[k] + + x0, y0, x1, y1 = boxes[k] + + x0 = (wIdx[k] + 0.5 - x0) * stride + y0 = (hIdx[k] + 0.5 - y0) * stride + x1 = (wIdx[k] + 0.5 + x1) * stride + y1 = (hIdx[k] + 0.5 + y1) * stride + + w = x1 - x0 + h = y1 - y0 + + self.scores_pro.append(float(score)) + self.boxes_pro.append( + np.array([x0, y0, w, h], dtype=np.float32)) + self.labels_pro.append(int(label)) + + def __yolov6_decode(self, + feats: List[ndarray], + conf_thres: float, + num_labels: int = 80, + **kwargs): + for i, feat in enumerate(feats): + stride = 8 << i + score_feat, box_feat = np.split(feat, [ + num_labels, + ], -1) + score_feat = sigmoid(score_feat) + _argmax = score_feat.argmax(-1) + _max = score_feat.max(-1) + indices = np.where(_max > conf_thres) + hIdx, wIdx = indices + num_proposal = hIdx.size + if not num_proposal: + continue + + scores = _max[hIdx, wIdx] + boxes = box_feat[hIdx, wIdx] + labels = _argmax[hIdx, wIdx] + + for k in range(num_proposal): + score = scores[k] + label = labels[k] + + x0, y0, x1, y1 = boxes[k] + + x0 = (wIdx[k] + 0.5 - x0) * stride + y0 = (hIdx[k] + 0.5 - y0) * stride + x1 = (wIdx[k] + 0.5 + x1) * stride + y1 = (hIdx[k] + 0.5 + y1) * stride + + w = x1 - x0 + h = y1 - y0 + + self.scores_pro.append(float(score)) + self.boxes_pro.append( + np.array([x0, y0, w, h], dtype=np.float32)) + self.labels_pro.append(int(label)) + + def __yolov7_decode(self, + feats: List[ndarray], + conf_thres: float, + num_labels: int = 80, + **kwargs): + anchors: Union[List, Tuple] = kwargs.get( + 'anchors', + [[(12, 16), (19, 36), + (40, 28)], [(36, 75), (76, 55), + (72, 146)], [(142, 110), (192, 243), (459, 401)]]) + self.__yolov5_decode(feats, conf_thres, num_labels, anchors=anchors) + + def __rtmdet_decode(self, + feats: List[ndarray], + conf_thres: float, + num_labels: int = 80, + **kwargs): + for i, feat in enumerate(feats): + stride = 8 << i + score_feat, box_feat = np.split(feat, [ + num_labels, + ], -1) + score_feat = sigmoid(score_feat) + _argmax = score_feat.argmax(-1) + _max = score_feat.max(-1) + indices = np.where(_max > conf_thres) + hIdx, wIdx = indices + num_proposal = hIdx.size + if not num_proposal: + continue + + scores = _max[hIdx, wIdx] + boxes = box_feat[hIdx, wIdx] + labels = _argmax[hIdx, wIdx] + + for k in 
range(num_proposal): + score = scores[k] + label = labels[k] + + x0, y0, x1, y1 = boxes[k] + + x0 = (wIdx[k] - x0) * stride + y0 = (hIdx[k] - y0) * stride + x1 = (wIdx[k] + x1) * stride + y1 = (hIdx[k] + y1) * stride + + w = x1 - x0 + h = y1 - y0 + + self.scores_pro.append(float(score)) + self.boxes_pro.append( + np.array([x0, y0, w, h], dtype=np.float32)) + self.labels_pro.append(int(label)) + + def __yolov8_decode(self, + feats: List[ndarray], + conf_thres: float, + num_labels: int = 80, + **kwargs): + reg_max: int = kwargs.get('reg_max', 16) + self.__ppyoloe_decode(feats, conf_thres, num_labels, reg_max=reg_max) diff --git a/third_party/mmyolo/projects/easydeploy/examples/preprocess.py b/third_party/mmyolo/projects/easydeploy/examples/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..6b6fb563a16a7f40ef556b5a23f635ab4627fc4f --- /dev/null +++ b/third_party/mmyolo/projects/easydeploy/examples/preprocess.py @@ -0,0 +1,57 @@ +from typing import List, Tuple, Union + +import cv2 +import numpy as np +from config import ModelType +from numpy import ndarray + + +class Preprocess: + + def __init__(self, model_type: ModelType): + if model_type in (ModelType.YOLOV5, ModelType.YOLOV6, ModelType.YOLOV7, + ModelType.YOLOV8): + mean = np.array([0, 0, 0], dtype=np.float32) + std = np.array([255, 255, 255], dtype=np.float32) + is_rgb = True + elif model_type == ModelType.YOLOX: + mean = np.array([0, 0, 0], dtype=np.float32) + std = np.array([1, 1, 1], dtype=np.float32) + is_rgb = False + elif model_type == ModelType.PPYOLOE: + mean = np.array([123.675, 116.28, 103.53], dtype=np.float32) + std = np.array([58.395, 57.12, 57.375], dtype=np.float32) + is_rgb = True + + elif model_type == ModelType.PPYOLOEP: + mean = np.array([0, 0, 0], dtype=np.float32) + std = np.array([255, 255, 255], dtype=np.float32) + is_rgb = True + elif model_type == ModelType.RTMDET: + mean = np.array([103.53, 116.28, 123.675], dtype=np.float32) + std = np.array([57.375, 57.12, 58.3955], dtype=np.float32) + is_rgb = False + else: + raise NotImplementedError + + self.mean = mean.reshape((3, 1, 1)) + self.std = std.reshape((3, 1, 1)) + self.is_rgb = is_rgb + + def __call__(self, + image: ndarray, + new_size: Union[List[int], Tuple[int]] = (640, 640), + **kwargs) -> Tuple[ndarray, Tuple[float, float]]: + # new_size: (height, width) + height, width = image.shape[:2] + ratio_h, ratio_w = new_size[0] / height, new_size[1] / width + image = cv2.resize( + image, (0, 0), + fx=ratio_w, + fy=ratio_h, + interpolation=cv2.INTER_LINEAR) + image = np.ascontiguousarray(image.transpose(2, 0, 1)) + image = image.astype(np.float32) + image -= self.mean + image /= self.std + return image[np.newaxis], (ratio_w, ratio_h) diff --git a/third_party/mmyolo/projects/easydeploy/examples/requirements.txt b/third_party/mmyolo/projects/easydeploy/examples/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..0b761189b52fc57e4231b37df0ff42bb44404c95 --- /dev/null +++ b/third_party/mmyolo/projects/easydeploy/examples/requirements.txt @@ -0,0 +1,2 @@ +onnxruntime +opencv-python==4.7.0.72 diff --git a/third_party/mmyolo/projects/easydeploy/model/__init__.py b/third_party/mmyolo/projects/easydeploy/model/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..38af8bc322b0a8e0c870fac243a0af9c1dba7315 --- /dev/null +++ b/third_party/mmyolo/projects/easydeploy/model/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .backend import MMYOLOBackend +from .backendwrapper import ORTWrapper, TRTWrapper +from .model import DeployModel + +__all__ = ['DeployModel', 'TRTWrapper', 'ORTWrapper', 'MMYOLOBackend'] diff --git a/third_party/mmyolo/projects/easydeploy/model/backend.py b/third_party/mmyolo/projects/easydeploy/model/backend.py new file mode 100644 index 0000000000000000000000000000000000000000..64d6e3f020bcfd3c3cf7db5f5611a8f815df4cb1 --- /dev/null +++ b/third_party/mmyolo/projects/easydeploy/model/backend.py @@ -0,0 +1,23 @@ +from enum import Enum + +import torch +import torch.nn.functional as F + + +class MMYOLOBackend(Enum): + AX620A = 'ax620a' + COREML = 'coreml' + HORIZONX3 = 'horizonx3' + NCNN = 'ncnn' + ONNXRUNTIME = 'onnxruntime' + OPENVINO = 'openvino' + PPLNN = 'pplnn' + RKNN = 'rknn' + TENSORRT8 = 'tensorrt8' + TENSORRT7 = 'tensorrt7' + TORCHSCRIPT = 'torchscript' + TVM = 'tvm' + + +def HSigmoid__forward(self, x: torch.Tensor) -> torch.Tensor: + return F.hardsigmoid(x, inplace=True) diff --git a/third_party/mmyolo/projects/easydeploy/model/backendwrapper.py b/third_party/mmyolo/projects/easydeploy/model/backendwrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..2997d84ea98b3f30973cf2335ab0eb4af4edaef5 --- /dev/null +++ b/third_party/mmyolo/projects/easydeploy/model/backendwrapper.py @@ -0,0 +1,202 @@ +import warnings +from collections import namedtuple +from functools import partial +from pathlib import Path +from typing import List, Optional, Union + +import numpy as np +import onnxruntime + +try: + import tensorrt as trt +except Exception: + trt = None +import torch + +warnings.filterwarnings(action='ignore', category=DeprecationWarning) + + +class TRTWrapper(torch.nn.Module): + dtype_mapping = {} + + def __init__(self, weight: Union[str, Path], + device: Optional[torch.device]): + super().__init__() + weight = Path(weight) if isinstance(weight, str) else weight + assert weight.exists() and weight.suffix in ('.engine', '.plan') + if isinstance(device, str): + device = torch.device(device) + elif isinstance(device, int): + device = torch.device(f'cuda:{device}') + self.weight = weight + self.device = device + self.stream = torch.cuda.Stream(device=device) + self.__update_mapping() + self.__init_engine() + self.__init_bindings() + + def __update_mapping(self): + self.dtype_mapping.update({ + trt.bool: torch.bool, + trt.int8: torch.int8, + trt.int32: torch.int32, + trt.float16: torch.float16, + trt.float32: torch.float32 + }) + + def __init_engine(self): + logger = trt.Logger(trt.Logger.ERROR) + self.log = partial(logger.log, trt.Logger.ERROR) + trt.init_libnvinfer_plugins(logger, namespace='') + self.logger = logger + with trt.Runtime(logger) as runtime: + model = runtime.deserialize_cuda_engine(self.weight.read_bytes()) + + context = model.create_execution_context() + + names = [model.get_binding_name(i) for i in range(model.num_bindings)] + + num_inputs, num_outputs = 0, 0 + + for i in range(model.num_bindings): + if model.binding_is_input(i): + num_inputs += 1 + else: + num_outputs += 1 + + self.is_dynamic = -1 in model.get_binding_shape(0) + + self.model = model + self.context = context + self.input_names = names[:num_inputs] + self.output_names = names[num_inputs:] + self.num_inputs = num_inputs + self.num_outputs = num_outputs + self.num_bindings = num_inputs + num_outputs + self.bindings: List[int] = [0] * self.num_bindings + + def __init_bindings(self): + Binding = namedtuple('Binding', ('name', 'dtype', 'shape')) + inputs_info = [] + outputs_info = 
[] + + for i, name in enumerate(self.input_names): + assert self.model.get_binding_name(i) == name + dtype = self.dtype_mapping[self.model.get_binding_dtype(i)] + shape = tuple(self.model.get_binding_shape(i)) + inputs_info.append(Binding(name, dtype, shape)) + + for i, name in enumerate(self.output_names): + i += self.num_inputs + assert self.model.get_binding_name(i) == name + dtype = self.dtype_mapping[self.model.get_binding_dtype(i)] + shape = tuple(self.model.get_binding_shape(i)) + outputs_info.append(Binding(name, dtype, shape)) + self.inputs_info = inputs_info + self.outputs_info = outputs_info + if not self.is_dynamic: + self.output_tensor = [ + torch.empty(o.shape, dtype=o.dtype, device=self.device) + for o in outputs_info + ] + + def forward(self, *inputs): + + assert len(inputs) == self.num_inputs + + contiguous_inputs: List[torch.Tensor] = [ + i.contiguous() for i in inputs + ] + + for i in range(self.num_inputs): + self.bindings[i] = contiguous_inputs[i].data_ptr() + if self.is_dynamic: + self.context.set_binding_shape( + i, tuple(contiguous_inputs[i].shape)) + + # create output tensors + outputs: List[torch.Tensor] = [] + + for i in range(self.num_outputs): + j = i + self.num_inputs + if self.is_dynamic: + shape = tuple(self.context.get_binding_shape(j)) + output = torch.empty( + size=shape, + dtype=self.output_dtypes[i], + device=self.device) + + else: + output = self.output_tensor[i] + outputs.append(output) + self.bindings[j] = output.data_ptr() + + self.context.execute_async_v2(self.bindings, self.stream.cuda_stream) + self.stream.synchronize() + + return tuple(outputs) + + +class ORTWrapper(torch.nn.Module): + + def __init__(self, weight: Union[str, Path], + device: Optional[torch.device]): + super().__init__() + weight = Path(weight) if isinstance(weight, str) else weight + assert weight.exists() and weight.suffix == '.onnx' + + if isinstance(device, str): + device = torch.device(device) + elif isinstance(device, int): + device = torch.device(f'cuda:{device}') + self.weight = weight + self.device = device + self.__init_session() + self.__init_bindings() + + def __init_session(self): + providers = ['CPUExecutionProvider'] + if 'cuda' in self.device.type: + providers.insert(0, 'CUDAExecutionProvider') + + session = onnxruntime.InferenceSession( + str(self.weight), providers=providers) + self.session = session + + def __init_bindings(self): + Binding = namedtuple('Binding', ('name', 'dtype', 'shape')) + inputs_info = [] + outputs_info = [] + self.is_dynamic = False + for i, tensor in enumerate(self.session.get_inputs()): + if any(not isinstance(i, int) for i in tensor.shape): + self.is_dynamic = True + inputs_info.append( + Binding(tensor.name, tensor.type, tuple(tensor.shape))) + + for i, tensor in enumerate(self.session.get_outputs()): + outputs_info.append( + Binding(tensor.name, tensor.type, tuple(tensor.shape))) + self.inputs_info = inputs_info + self.outputs_info = outputs_info + self.num_inputs = len(inputs_info) + + def forward(self, *inputs): + + assert len(inputs) == self.num_inputs + + contiguous_inputs: List[np.ndarray] = [ + i.contiguous().cpu().numpy() for i in inputs + ] + + if not self.is_dynamic: + # make sure input shape is right for static input shape + for i in range(self.num_inputs): + assert contiguous_inputs[i].shape == self.inputs_info[i].shape + + outputs = self.session.run([o.name for o in self.outputs_info], { + j.name: contiguous_inputs[i] + for i, j in enumerate(self.inputs_info) + }) + + return tuple(torch.from_numpy(o).to(self.device) for o 
in outputs) diff --git a/third_party/mmyolo/projects/easydeploy/model/model.py b/third_party/mmyolo/projects/easydeploy/model/model.py new file mode 100644 index 0000000000000000000000000000000000000000..c67ed2872097e82d7f569a2f486b1a6463cde986 --- /dev/null +++ b/third_party/mmyolo/projects/easydeploy/model/model.py @@ -0,0 +1,205 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from copy import deepcopy +from functools import partial +from typing import List, Optional, Tuple + +import torch +import torch.nn as nn +from mmdet.models.backbones.csp_darknet import Focus +from mmdet.models.layers import ChannelAttention +from mmengine.config import ConfigDict +from torch import Tensor + +from mmyolo.models import RepVGGBlock +from mmyolo.models.dense_heads import (PPYOLOEHead, RTMDetHead, YOLOv5Head, + YOLOv7Head, YOLOv8Head, YOLOXHead) +from mmyolo.models.layers import ImplicitA, ImplicitM +from ..backbone import DeployFocus, GConvFocus, NcnnFocus +from ..bbox_code import (rtmdet_bbox_decoder, yolov5_bbox_decoder, + yolox_bbox_decoder) +from ..nms import batched_nms, efficient_nms, onnx_nms +from .backend import MMYOLOBackend + + +class DeployModel(nn.Module): + transpose = False + + def __init__(self, + baseModel: nn.Module, + backend: MMYOLOBackend, + postprocess_cfg: Optional[ConfigDict] = None): + super().__init__() + self.baseModel = baseModel + self.baseHead = baseModel.bbox_head + self.backend = backend + if postprocess_cfg is None: + self.with_postprocess = False + else: + self.with_postprocess = True + self.__init_sub_attributes() + self.detector_type = type(self.baseHead) + self.pre_top_k = postprocess_cfg.get('pre_top_k', 1000) + self.keep_top_k = postprocess_cfg.get('keep_top_k', 100) + self.iou_threshold = postprocess_cfg.get('iou_threshold', 0.65) + self.score_threshold = postprocess_cfg.get('score_threshold', 0.25) + self.__switch_deploy() + + def __init_sub_attributes(self): + self.bbox_decoder = self.baseHead.bbox_coder.decode + self.prior_generate = self.baseHead.prior_generator.grid_priors + self.num_base_priors = self.baseHead.num_base_priors + self.featmap_strides = self.baseHead.featmap_strides + self.num_classes = self.baseHead.num_classes + + def __switch_deploy(self): + headType = type(self.baseHead) + if not self.with_postprocess: + if headType in (YOLOv5Head, YOLOv7Head): + self.baseHead.head_module.forward_single = self.forward_single + elif headType in (PPYOLOEHead, YOLOv8Head): + self.baseHead.head_module.reg_max = 0 + + if self.backend in (MMYOLOBackend.HORIZONX3, MMYOLOBackend.NCNN, + MMYOLOBackend.TORCHSCRIPT): + self.transpose = True + for layer in self.baseModel.modules(): + if isinstance(layer, RepVGGBlock): + layer.switch_to_deploy() + elif isinstance(layer, ChannelAttention): + layer.global_avgpool.forward = self.forward_gvp + elif isinstance(layer, Focus): + # onnxruntime openvino tensorrt8 tensorrt7 + if self.backend in (MMYOLOBackend.ONNXRUNTIME, + MMYOLOBackend.OPENVINO, + MMYOLOBackend.TENSORRT8, + MMYOLOBackend.TENSORRT7): + self.baseModel.backbone.stem = DeployFocus(layer) + # ncnn + elif self.backend == MMYOLOBackend.NCNN: + self.baseModel.backbone.stem = NcnnFocus(layer) + # switch focus to group conv + else: + self.baseModel.backbone.stem = GConvFocus(layer) + + def pred_by_feat(self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + objectnesses: Optional[List[Tensor]] = None, + **kwargs): + assert len(cls_scores) == len(bbox_preds) + dtype = cls_scores[0].dtype + device = cls_scores[0].device + + nms_func = self.select_nms() + if 
self.detector_type in (YOLOv5Head, YOLOv7Head): + bbox_decoder = yolov5_bbox_decoder + elif self.detector_type is RTMDetHead: + bbox_decoder = rtmdet_bbox_decoder + elif self.detector_type is YOLOXHead: + bbox_decoder = yolox_bbox_decoder + else: + bbox_decoder = self.bbox_decoder + + num_imgs = cls_scores[0].shape[0] + featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores] + + mlvl_priors = self.prior_generate( + featmap_sizes, dtype=dtype, device=device) + + flatten_priors = torch.cat(mlvl_priors) + + mlvl_strides = [ + flatten_priors.new_full( + (featmap_size[0] * featmap_size[1] * self.num_base_priors, ), + stride) for featmap_size, stride in zip( + featmap_sizes, self.featmap_strides) + ] + flatten_stride = torch.cat(mlvl_strides) + + # flatten cls_scores, bbox_preds and objectness + flatten_cls_scores = [ + cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.num_classes) + for cls_score in cls_scores + ] + cls_scores = torch.cat(flatten_cls_scores, dim=1).sigmoid() + + flatten_bbox_preds = [ + bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) + for bbox_pred in bbox_preds + ] + flatten_bbox_preds = torch.cat(flatten_bbox_preds, dim=1) + + if objectnesses is not None: + flatten_objectness = [ + objectness.permute(0, 2, 3, 1).reshape(num_imgs, -1) + for objectness in objectnesses + ] + flatten_objectness = torch.cat(flatten_objectness, dim=1).sigmoid() + cls_scores = cls_scores * (flatten_objectness.unsqueeze(-1)) + + scores = cls_scores + + bboxes = bbox_decoder(flatten_priors[None], flatten_bbox_preds, + flatten_stride) + + return nms_func(bboxes, scores, self.keep_top_k, self.iou_threshold, + self.score_threshold, self.pre_top_k, self.keep_top_k) + + def select_nms(self): + if self.backend in (MMYOLOBackend.ONNXRUNTIME, MMYOLOBackend.OPENVINO): + nms_func = onnx_nms + elif self.backend == MMYOLOBackend.TENSORRT8: + nms_func = efficient_nms + elif self.backend == MMYOLOBackend.TENSORRT7: + nms_func = batched_nms + else: + raise NotImplementedError + if type(self.baseHead) in (YOLOv5Head, YOLOv7Head, YOLOXHead): + nms_func = partial(nms_func, box_coding=1) + + return nms_func + + def forward(self, inputs: Tensor): + neck_outputs = self.baseModel(inputs) + if self.with_postprocess: + return self.pred_by_feat(*neck_outputs) + else: + outputs = [] + if self.transpose: + for feats in zip(*neck_outputs): + if self.backend in (MMYOLOBackend.NCNN, + MMYOLOBackend.TORCHSCRIPT): + outputs.append( + torch.cat( + [feat.permute(0, 2, 3, 1) for feat in feats], + -1)) + else: + outputs.append(torch.cat(feats, 1).permute(0, 2, 3, 1)) + else: + for feats in zip(*neck_outputs): + outputs.append(torch.cat(feats, 1)) + return tuple(outputs) + + @staticmethod + def forward_single(x: Tensor, convs: nn.Module) -> Tuple[Tensor]: + if isinstance(convs, nn.Sequential) and any( + type(m) in (ImplicitA, ImplicitM) for m in convs): + a, c, m = convs + aw = a.implicit.clone() + mw = m.implicit.clone() + c = deepcopy(c) + nw, cw, _, _ = c.weight.shape + na, ca, _, _ = aw.shape + nm, cm, _, _ = mw.shape + c.bias = nn.Parameter(c.bias + ( + c.weight.reshape(nw, cw) @ aw.reshape(ca, na)).squeeze(1)) + c.bias = nn.Parameter(c.bias * mw.reshape(cm)) + c.weight = nn.Parameter(c.weight * mw.transpose(0, 1)) + convs = c + feat = convs(x) + return (feat, ) + + @staticmethod + def forward_gvp(x: Tensor) -> Tensor: + return torch.mean(x, [2, 3], keepdim=True) diff --git a/third_party/mmyolo/projects/easydeploy/nms/__init__.py b/third_party/mmyolo/projects/easydeploy/nms/__init__.py new file mode 100644 
index 0000000000000000000000000000000000000000..59c5cdbd2b3b195125a14f473b825f616755fd6e --- /dev/null +++ b/third_party/mmyolo/projects/easydeploy/nms/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .ort_nms import onnx_nms +from .trt_nms import batched_nms, efficient_nms + +__all__ = ['efficient_nms', 'batched_nms', 'onnx_nms'] diff --git a/third_party/mmyolo/projects/easydeploy/nms/ort_nms.py b/third_party/mmyolo/projects/easydeploy/nms/ort_nms.py new file mode 100644 index 0000000000000000000000000000000000000000..aad93cf05ac2ee9d61a85b4bf9e7b63c352859ec --- /dev/null +++ b/third_party/mmyolo/projects/easydeploy/nms/ort_nms.py @@ -0,0 +1,122 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from torch import Tensor + +_XYWH2XYXY = torch.tensor([[1.0, 0.0, 1.0, 0.0], [0.0, 1.0, 0.0, 1.0], + [-0.5, 0.0, 0.5, 0.0], [0.0, -0.5, 0.0, 0.5]], + dtype=torch.float32) + + +def select_nms_index(scores: Tensor, + boxes: Tensor, + nms_index: Tensor, + batch_size: int, + keep_top_k: int = -1): + batch_inds, cls_inds = nms_index[:, 0], nms_index[:, 1] + box_inds = nms_index[:, 2] + + scores = scores[batch_inds, cls_inds, box_inds].unsqueeze(1) + boxes = boxes[batch_inds, box_inds, ...] + dets = torch.cat([boxes, scores], dim=1) + + batched_dets = dets.unsqueeze(0).repeat(batch_size, 1, 1) + batch_template = torch.arange( + 0, batch_size, dtype=batch_inds.dtype, device=batch_inds.device) + batched_dets = batched_dets.where( + (batch_inds == batch_template.unsqueeze(1)).unsqueeze(-1), + batched_dets.new_zeros(1)) + + batched_labels = cls_inds.unsqueeze(0).repeat(batch_size, 1) + batched_labels = batched_labels.where( + (batch_inds == batch_template.unsqueeze(1)), + batched_labels.new_ones(1) * -1) + + N = batched_dets.shape[0] + + batched_dets = torch.cat((batched_dets, batched_dets.new_zeros((N, 1, 5))), + 1) + batched_labels = torch.cat((batched_labels, -batched_labels.new_ones( + (N, 1))), 1) + + _, topk_inds = batched_dets[:, :, -1].sort(dim=1, descending=True) + topk_batch_inds = torch.arange( + batch_size, dtype=topk_inds.dtype, + device=topk_inds.device).view(-1, 1) + batched_dets = batched_dets[topk_batch_inds, topk_inds, ...] + batched_labels = batched_labels[topk_batch_inds, topk_inds, ...] 
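+    # Entries that belong to other images in the batch, as well as the single
+    # padded row appended above, carry a score of 0 and a label of -1, so
+    # counting scores > 0 below recovers each image's number of valid detections.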
+ batched_dets, batched_scores = batched_dets.split([4, 1], 2) + batched_scores = batched_scores.squeeze(-1) + + num_dets = (batched_scores > 0).sum(1, keepdim=True) + return num_dets, batched_dets, batched_scores, batched_labels + + +class ONNXNMSop(torch.autograd.Function): + + @staticmethod + def forward( + ctx, + boxes: Tensor, + scores: Tensor, + max_output_boxes_per_class: Tensor = torch.tensor([100]), + iou_threshold: Tensor = torch.tensor([0.5]), + score_threshold: Tensor = torch.tensor([0.05]) + ) -> Tensor: + device = boxes.device + batch = scores.shape[0] + num_det = 20 + batches = torch.randint(0, batch, (num_det, )).sort()[0].to(device) + idxs = torch.arange(100, 100 + num_det).to(device) + zeros = torch.zeros((num_det, ), dtype=torch.int64).to(device) + selected_indices = torch.cat([batches[None], zeros[None], idxs[None]], + 0).T.contiguous() + selected_indices = selected_indices.to(torch.int64) + + return selected_indices + + @staticmethod + def symbolic( + g, + boxes: Tensor, + scores: Tensor, + max_output_boxes_per_class: Tensor = torch.tensor([100]), + iou_threshold: Tensor = torch.tensor([0.5]), + score_threshold: Tensor = torch.tensor([0.05]), + ): + return g.op( + 'NonMaxSuppression', + boxes, + scores, + max_output_boxes_per_class, + iou_threshold, + score_threshold, + outputs=1) + + +def onnx_nms( + boxes: torch.Tensor, + scores: torch.Tensor, + max_output_boxes_per_class: int = 100, + iou_threshold: float = 0.5, + score_threshold: float = 0.05, + pre_top_k: int = -1, + keep_top_k: int = 100, + box_coding: int = 0, +): + max_output_boxes_per_class = torch.tensor([max_output_boxes_per_class]) + iou_threshold = torch.tensor([iou_threshold]) + score_threshold = torch.tensor([score_threshold]) + + batch_size, _, _ = scores.shape + if box_coding == 1: + boxes = boxes @ (_XYWH2XYXY.to(boxes.device)) + scores = scores.transpose(1, 2).contiguous() + selected_indices = ONNXNMSop.apply(boxes, scores, + max_output_boxes_per_class, + iou_threshold, score_threshold) + + num_dets, batched_dets, batched_scores, batched_labels = select_nms_index( + scores, boxes, selected_indices, batch_size, keep_top_k=keep_top_k) + + return num_dets, batched_dets, batched_scores, batched_labels.to( + torch.int32) diff --git a/third_party/mmyolo/projects/easydeploy/nms/trt_nms.py b/third_party/mmyolo/projects/easydeploy/nms/trt_nms.py new file mode 100644 index 0000000000000000000000000000000000000000..e0db1e2164d4366ff9ce4f74d39ded917c39ba79 --- /dev/null +++ b/third_party/mmyolo/projects/easydeploy/nms/trt_nms.py @@ -0,0 +1,226 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
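+# The autograd Functions below are symbolic-only helpers for ONNX export:
+# forward() just returns dummy tensors with the expected shapes/dtypes so that
+# tracing succeeds, while symbolic() emits TRT::EfficientNMS_TRT /
+# TRT::BatchedNMSDynamic_TRT plugin nodes that TensorRT resolves to its own
+# NMS implementation when the engine is built.
+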
+import torch +from torch import Tensor + +_XYWH2XYXY = torch.tensor([[1.0, 0.0, 1.0, 0.0], [0.0, 1.0, 0.0, 1.0], + [-0.5, 0.0, 0.5, 0.0], [0.0, -0.5, 0.0, 0.5]], + dtype=torch.float32) + + +class TRTEfficientNMSop(torch.autograd.Function): + + @staticmethod + def forward( + ctx, + boxes: Tensor, + scores: Tensor, + background_class: int = -1, + box_coding: int = 0, + iou_threshold: float = 0.45, + max_output_boxes: int = 100, + plugin_version: str = '1', + score_activation: int = 0, + score_threshold: float = 0.25, + ): + batch_size, _, num_classes = scores.shape + num_det = torch.randint( + 0, max_output_boxes, (batch_size, 1), dtype=torch.int32) + det_boxes = torch.randn(batch_size, max_output_boxes, 4) + det_scores = torch.randn(batch_size, max_output_boxes) + det_classes = torch.randint( + 0, num_classes, (batch_size, max_output_boxes), dtype=torch.int32) + return num_det, det_boxes, det_scores, det_classes + + @staticmethod + def symbolic(g, + boxes: Tensor, + scores: Tensor, + background_class: int = -1, + box_coding: int = 0, + iou_threshold: float = 0.45, + max_output_boxes: int = 100, + plugin_version: str = '1', + score_activation: int = 0, + score_threshold: float = 0.25): + out = g.op( + 'TRT::EfficientNMS_TRT', + boxes, + scores, + background_class_i=background_class, + box_coding_i=box_coding, + iou_threshold_f=iou_threshold, + max_output_boxes_i=max_output_boxes, + plugin_version_s=plugin_version, + score_activation_i=score_activation, + score_threshold_f=score_threshold, + outputs=4) + num_det, det_boxes, det_scores, det_classes = out + return num_det, det_boxes, det_scores, det_classes + + +class TRTbatchedNMSop(torch.autograd.Function): + """TensorRT NMS operation.""" + + @staticmethod + def forward( + ctx, + boxes: Tensor, + scores: Tensor, + plugin_version: str = '1', + shareLocation: int = 1, + backgroundLabelId: int = -1, + numClasses: int = 80, + topK: int = 1000, + keepTopK: int = 100, + scoreThreshold: float = 0.25, + iouThreshold: float = 0.45, + isNormalized: int = 0, + clipBoxes: int = 0, + scoreBits: int = 16, + caffeSemantics: int = 1, + ): + batch_size, _, numClasses = scores.shape + num_det = torch.randint( + 0, keepTopK, (batch_size, 1), dtype=torch.int32) + det_boxes = torch.randn(batch_size, keepTopK, 4) + det_scores = torch.randn(batch_size, keepTopK) + det_classes = torch.randint(0, numClasses, + (batch_size, keepTopK)).float() + return num_det, det_boxes, det_scores, det_classes + + @staticmethod + def symbolic( + g, + boxes: Tensor, + scores: Tensor, + plugin_version: str = '1', + shareLocation: int = 1, + backgroundLabelId: int = -1, + numClasses: int = 80, + topK: int = 1000, + keepTopK: int = 100, + scoreThreshold: float = 0.25, + iouThreshold: float = 0.45, + isNormalized: int = 0, + clipBoxes: int = 0, + scoreBits: int = 16, + caffeSemantics: int = 1, + ): + out = g.op( + 'TRT::BatchedNMSDynamic_TRT', + boxes, + scores, + shareLocation_i=shareLocation, + plugin_version_s=plugin_version, + backgroundLabelId_i=backgroundLabelId, + numClasses_i=numClasses, + topK_i=topK, + keepTopK_i=keepTopK, + scoreThreshold_f=scoreThreshold, + iouThreshold_f=iouThreshold, + isNormalized_i=isNormalized, + clipBoxes_i=clipBoxes, + scoreBits_i=scoreBits, + caffeSemantics_i=caffeSemantics, + outputs=4) + num_det, det_boxes, det_scores, det_classes = out + return num_det, det_boxes, det_scores, det_classes + + +def _efficient_nms( + boxes: Tensor, + scores: Tensor, + max_output_boxes_per_class: int = 1000, + iou_threshold: float = 0.5, + score_threshold: float = 
0.05, + pre_top_k: int = -1, + keep_top_k: int = 100, + box_coding: int = 0, +): + """Wrapper for `efficient_nms` with TensorRT. + Args: + boxes (Tensor): The bounding boxes of shape [N, num_boxes, 4]. + scores (Tensor): The detection scores of shape + [N, num_boxes, num_classes]. + max_output_boxes_per_class (int): Maximum number of output + boxes per class of nms. Defaults to 1000. + iou_threshold (float): IOU threshold of nms. Defaults to 0.5. + score_threshold (float): score threshold of nms. + Defaults to 0.05. + pre_top_k (int): Number of top K boxes to keep before nms. + Defaults to -1. + keep_top_k (int): Number of top K boxes to keep after nms. + Defaults to -1. + box_coding (int): Bounding boxes format for nms. + Defaults to 0 means [x1, y1 ,x2, y2]. + Set to 1 means [x, y, w, h]. + Returns: + tuple[Tensor, Tensor, Tensor, Tensor]: + (num_det, det_boxes, det_scores, det_classes), + `num_det` of shape [N, 1] + `det_boxes` of shape [N, num_det, 4] + `det_scores` of shape [N, num_det] + `det_classes` of shape [N, num_det] + """ + num_det, det_boxes, det_scores, det_classes = TRTEfficientNMSop.apply( + boxes, scores, -1, box_coding, iou_threshold, keep_top_k, '1', 0, + score_threshold) + return num_det, det_boxes, det_scores, det_classes + + +def _batched_nms( + boxes: Tensor, + scores: Tensor, + max_output_boxes_per_class: int = 1000, + iou_threshold: float = 0.5, + score_threshold: float = 0.05, + pre_top_k: int = -1, + keep_top_k: int = 100, + box_coding: int = 0, +): + """Wrapper for `efficient_nms` with TensorRT. + Args: + boxes (Tensor): The bounding boxes of shape [N, num_boxes, 4]. + scores (Tensor): The detection scores of shape + [N, num_boxes, num_classes]. + max_output_boxes_per_class (int): Maximum number of output + boxes per class of nms. Defaults to 1000. + iou_threshold (float): IOU threshold of nms. Defaults to 0.5. + score_threshold (float): score threshold of nms. + Defaults to 0.05. + pre_top_k (int): Number of top K boxes to keep before nms. + Defaults to -1. + keep_top_k (int): Number of top K boxes to keep after nms. + Defaults to -1. + box_coding (int): Bounding boxes format for nms. + Defaults to 0 means [x1, y1 ,x2, y2]. + Set to 1 means [x, y, w, h]. 
+ Returns: + tuple[Tensor, Tensor, Tensor, Tensor]: + (num_det, det_boxes, det_scores, det_classes), + `num_det` of shape [N, 1] + `det_boxes` of shape [N, num_det, 4] + `det_scores` of shape [N, num_det] + `det_classes` of shape [N, num_det] + """ + if box_coding == 1: + boxes = boxes @ (_XYWH2XYXY.to(boxes.device)) + boxes = boxes if boxes.dim() == 4 else boxes.unsqueeze(2) + _, _, numClasses = scores.shape + + num_det, det_boxes, det_scores, det_classes = TRTbatchedNMSop.apply( + boxes, scores, '1', 1, -1, int(numClasses), min(pre_top_k, 4096), + keep_top_k, score_threshold, iou_threshold, 0, 0, 16, 1) + + det_classes = det_classes.int() + return num_det, det_boxes, det_scores, det_classes + + +def efficient_nms(*args, **kwargs): + """Wrapper function for `_efficient_nms`.""" + return _efficient_nms(*args, **kwargs) + + +def batched_nms(*args, **kwargs): + """Wrapper function for `_batched_nms`.""" + return _batched_nms(*args, **kwargs) diff --git a/third_party/mmyolo/projects/easydeploy/tools/build_engine.py b/third_party/mmyolo/projects/easydeploy/tools/build_engine.py new file mode 100644 index 0000000000000000000000000000000000000000..b400c9db826878a7bb0fb13f4b1dea9b793583e7 --- /dev/null +++ b/third_party/mmyolo/projects/easydeploy/tools/build_engine.py @@ -0,0 +1,136 @@ +import argparse +from pathlib import Path +from typing import List, Optional, Tuple, Union + +try: + import tensorrt as trt +except Exception: + trt = None +import warnings + +import numpy as np +import torch + +warnings.filterwarnings(action='ignore', category=DeprecationWarning) + + +class EngineBuilder: + + def __init__( + self, + checkpoint: Union[str, Path], + opt_shape: Union[Tuple, List] = (1, 3, 640, 640), + device: Optional[Union[str, int, torch.device]] = None) -> None: + checkpoint = Path(checkpoint) if isinstance(checkpoint, + str) else checkpoint + assert checkpoint.exists() and checkpoint.suffix == '.onnx' + if isinstance(device, str): + device = torch.device(device) + elif isinstance(device, int): + device = torch.device(f'cuda:{device}') + + self.checkpoint = checkpoint + self.opt_shape = np.array(opt_shape, dtype=np.float32) + self.device = device + + def __build_engine(self, + scale: Optional[List[List]] = None, + fp16: bool = True, + with_profiling: bool = True) -> None: + logger = trt.Logger(trt.Logger.WARNING) + trt.init_libnvinfer_plugins(logger, namespace='') + builder = trt.Builder(logger) + config = builder.create_builder_config() + config.max_workspace_size = torch.cuda.get_device_properties( + self.device).total_memory + flag = (1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)) + network = builder.create_network(flag) + parser = trt.OnnxParser(network, logger) + if not parser.parse_from_file(str(self.checkpoint)): + raise RuntimeError( + f'failed to load ONNX file: {str(self.checkpoint)}') + inputs = [network.get_input(i) for i in range(network.num_inputs)] + outputs = [network.get_output(i) for i in range(network.num_outputs)] + profile = None + dshape = -1 in network.get_input(0).shape + if dshape: + profile = builder.create_optimization_profile() + if scale is None: + scale = np.array( + [[1, 1, 0.5, 0.5], [1, 1, 1, 1], [4, 1, 1.5, 1.5]], + dtype=np.float32) + scale = (self.opt_shape * scale).astype(np.int32) + elif isinstance(scale, List): + scale = np.array(scale, dtype=np.int32) + assert scale.shape[0] == 3, 'Input a wrong scale list' + else: + raise NotImplementedError + + for inp in inputs: + logger.log( + trt.Logger.WARNING, + f'input "{inp.name}" with shape{inp.shape} 
{inp.dtype}') + if dshape: + profile.set_shape(inp.name, *scale) + for out in outputs: + logger.log( + trt.Logger.WARNING, + f'output "{out.name}" with shape{out.shape} {out.dtype}') + if fp16 and builder.platform_has_fast_fp16: + config.set_flag(trt.BuilderFlag.FP16) + self.weight = self.checkpoint.with_suffix('.engine') + if dshape: + config.add_optimization_profile(profile) + if with_profiling: + config.profiling_verbosity = trt.ProfilingVerbosity.DETAILED + with builder.build_engine(network, config) as engine: + self.weight.write_bytes(engine.serialize()) + logger.log( + trt.Logger.WARNING, f'Build tensorrt engine finish.\n' + f'Save in {str(self.weight.absolute())}') + + def build(self, + scale: Optional[List[List]] = None, + fp16: bool = True, + with_profiling=True): + self.__build_engine(scale, fp16, with_profiling) + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('checkpoint', help='Checkpoint file') + parser.add_argument( + '--img-size', + nargs='+', + type=int, + default=[640, 640], + help='Image size of height and width') + parser.add_argument( + '--device', type=str, default='cuda:0', help='TensorRT builder device') + parser.add_argument( + '--scales', + type=str, + default='[[1,3,640,640],[1,3,640,640],[1,3,640,640]]', + help='Input scales for build dynamic input shape engine') + parser.add_argument( + '--fp16', action='store_true', help='Build model with fp16 mode') + args = parser.parse_args() + args.img_size *= 2 if len(args.img_size) == 1 else 1 + return args + + +def main(args): + img_size = (1, 3, *args.img_size) + try: + scales = eval(args.scales) + except Exception: + print('Input scales is not a python variable') + print('Set scales default None') + scales = None + builder = EngineBuilder(args.checkpoint, img_size, args.device) + builder.build(scales, fp16=args.fp16) + + +if __name__ == '__main__': + args = parse_args() + main(args) diff --git a/third_party/mmyolo/projects/easydeploy/tools/export_onnx.py b/third_party/mmyolo/projects/easydeploy/tools/export_onnx.py new file mode 100644 index 0000000000000000000000000000000000000000..b937cc8a72b5c09d61580ddb1297213693adaf1c --- /dev/null +++ b/third_party/mmyolo/projects/easydeploy/tools/export_onnx.py @@ -0,0 +1,157 @@ +import argparse +import os +import sys +import warnings +from io import BytesIO +from pathlib import Path + +import onnx +import torch +from mmdet.apis import init_detector +from mmengine.config import ConfigDict +from mmengine.logging import print_log +from mmengine.utils.path import mkdir_or_exist + +# Add MMYOLO ROOT to sys.path +sys.path.append(str(Path(__file__).resolve().parents[3])) +from projects.easydeploy.model import DeployModel, MMYOLOBackend # noqa E402 + +warnings.filterwarnings(action='ignore', category=torch.jit.TracerWarning) +warnings.filterwarnings(action='ignore', category=torch.jit.ScriptWarning) +warnings.filterwarnings(action='ignore', category=UserWarning) +warnings.filterwarnings(action='ignore', category=FutureWarning) +warnings.filterwarnings(action='ignore', category=ResourceWarning) + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('config', help='Config file') + parser.add_argument('checkpoint', help='Checkpoint file') + parser.add_argument( + '--model-only', action='store_true', help='Export model only') + parser.add_argument( + '--work-dir', default='./work_dir', help='Path to save export model') + parser.add_argument( + '--img-size', + nargs='+', + type=int, + default=[640, 640], + help='Image size of 
height and width') + parser.add_argument('--batch-size', type=int, default=1, help='Batch size') + parser.add_argument( + '--device', default='cuda:0', help='Device used for inference') + parser.add_argument( + '--simplify', + action='store_true', + help='Simplify onnx model by onnx-sim') + parser.add_argument( + '--opset', type=int, default=11, help='ONNX opset version') + parser.add_argument( + '--backend', + type=str, + default='onnxruntime', + help='Backend for export onnx') + parser.add_argument( + '--pre-topk', + type=int, + default=1000, + help='Postprocess pre topk bboxes feed into NMS') + parser.add_argument( + '--keep-topk', + type=int, + default=100, + help='Postprocess keep topk bboxes out of NMS') + parser.add_argument( + '--iou-threshold', + type=float, + default=0.65, + help='IoU threshold for NMS') + parser.add_argument( + '--score-threshold', + type=float, + default=0.25, + help='Score threshold for NMS') + args = parser.parse_args() + args.img_size *= 2 if len(args.img_size) == 1 else 1 + return args + + +def build_model_from_cfg(config_path, checkpoint_path, device): + model = init_detector(config_path, checkpoint_path, device=device) + model.eval() + return model + + +def main(): + args = parse_args() + mkdir_or_exist(args.work_dir) + backend = MMYOLOBackend(args.backend.lower()) + if backend in (MMYOLOBackend.ONNXRUNTIME, MMYOLOBackend.OPENVINO, + MMYOLOBackend.TENSORRT8, MMYOLOBackend.TENSORRT7): + if not args.model_only: + print_log('Export ONNX with bbox decoder and NMS ...') + else: + args.model_only = True + print_log(f'Can not export postprocess for {args.backend.lower()}.\n' + f'Set "args.model_only=True" default.') + if args.model_only: + postprocess_cfg = None + output_names = None + else: + postprocess_cfg = ConfigDict( + pre_top_k=args.pre_topk, + keep_top_k=args.keep_topk, + iou_threshold=args.iou_threshold, + score_threshold=args.score_threshold) + output_names = ['num_dets', 'boxes', 'scores', 'labels'] + baseModel = build_model_from_cfg(args.config, args.checkpoint, args.device) + + deploy_model = DeployModel( + baseModel=baseModel, backend=backend, postprocess_cfg=postprocess_cfg) + deploy_model.eval() + + fake_input = torch.randn(args.batch_size, 3, + *args.img_size).to(args.device) + # dry run + deploy_model(fake_input) + + save_onnx_path = os.path.join( + args.work_dir, + os.path.basename(args.checkpoint).replace('pth', 'onnx')) + # export onnx + with BytesIO() as f: + torch.onnx.export( + deploy_model, + fake_input, + f, + input_names=['images'], + output_names=output_names, + opset_version=args.opset) + f.seek(0) + onnx_model = onnx.load(f) + onnx.checker.check_model(onnx_model) + + # Fix tensorrt onnx output shape, just for view + if not args.model_only and backend in (MMYOLOBackend.TENSORRT8, + MMYOLOBackend.TENSORRT7): + shapes = [ + args.batch_size, 1, args.batch_size, args.keep_topk, 4, + args.batch_size, args.keep_topk, args.batch_size, + args.keep_topk + ] + for i in onnx_model.graph.output: + for j in i.type.tensor_type.shape.dim: + j.dim_param = str(shapes.pop(0)) + if args.simplify: + try: + import onnxsim + onnx_model, check = onnxsim.simplify(onnx_model) + assert check, 'assert check failed' + except Exception as e: + print_log(f'Simplify failure: {e}') + onnx.save(onnx_model, save_onnx_path) + print_log(f'ONNX export success, save into {save_onnx_path}') + + +if __name__ == '__main__': + main() diff --git a/third_party/mmyolo/projects/easydeploy/tools/image-demo.py b/third_party/mmyolo/projects/easydeploy/tools/image-demo.py new file 
mode 100644 index 0000000000000000000000000000000000000000..c85f31a02beeb708e23662fe08dd0a105f112aaf --- /dev/null +++ b/third_party/mmyolo/projects/easydeploy/tools/image-demo.py @@ -0,0 +1,152 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from projects.easydeploy.model import ORTWrapper, TRTWrapper # isort:skip +import os +import random +from argparse import ArgumentParser + +import cv2 +import mmcv +import numpy as np +import torch +from mmcv.transforms import Compose +from mmdet.utils import get_test_pipeline_cfg +from mmengine.config import Config, ConfigDict +from mmengine.utils import ProgressBar, path + +from mmyolo.utils import register_all_modules +from mmyolo.utils.misc import get_file_list + + +def parse_args(): + parser = ArgumentParser() + parser.add_argument( + 'img', help='Image path, include image file, dir and URL.') + parser.add_argument('config', help='Config file') + parser.add_argument('checkpoint', help='Checkpoint file') + parser.add_argument( + '--out-dir', default='./output', help='Path to output file') + parser.add_argument( + '--device', default='cuda:0', help='Device used for inference') + parser.add_argument( + '--show', action='store_true', help='Show the detection results') + args = parser.parse_args() + return args + + +def preprocess(config): + data_preprocess = config.get('model', {}).get('data_preprocessor', {}) + mean = data_preprocess.get('mean', [0., 0., 0.]) + std = data_preprocess.get('std', [1., 1., 1.]) + mean = torch.tensor(mean, dtype=torch.float32).reshape(1, 3, 1, 1) + std = torch.tensor(std, dtype=torch.float32).reshape(1, 3, 1, 1) + + class PreProcess(torch.nn.Module): + + def __init__(self): + super().__init__() + + def forward(self, x): + x = x[None].float() + x -= mean.to(x.device) + x /= std.to(x.device) + return x + + return PreProcess().eval() + + +def main(): + args = parse_args() + + # register all modules in mmdet into the registries + register_all_modules() + + colors = [[random.randint(0, 255) for _ in range(3)] for _ in range(1000)] + + # build the model from a config file and a checkpoint file + if args.checkpoint.endswith('.onnx'): + model = ORTWrapper(args.checkpoint, args.device) + elif args.checkpoint.endswith('.engine') or args.checkpoint.endswith( + '.plan'): + model = TRTWrapper(args.checkpoint, args.device) + else: + raise NotImplementedError + + model.to(args.device) + + cfg = Config.fromfile(args.config) + class_names = cfg.get('class_name') + + test_pipeline = get_test_pipeline_cfg(cfg) + test_pipeline[0] = ConfigDict({'type': 'mmdet.LoadImageFromNDArray'}) + test_pipeline = Compose(test_pipeline) + + pre_pipeline = preprocess(cfg) + + if not args.show: + path.mkdir_or_exist(args.out_dir) + + # get file list + files, source_type = get_file_list(args.img) + + # start detector inference + progress_bar = ProgressBar(len(files)) + for i, file in enumerate(files): + bgr = mmcv.imread(file) + rgb = mmcv.imconvert(bgr, 'bgr', 'rgb') + data, samples = test_pipeline(dict(img=rgb, img_id=i)).values() + pad_param = samples.get('pad_param', + np.array([0, 0, 0, 0], dtype=np.float32)) + h, w = samples.get('ori_shape', rgb.shape[:2]) + pad_param = torch.asarray( + [pad_param[2], pad_param[0], pad_param[2], pad_param[0]], + device=args.device) + scale_factor = samples.get('scale_factor', [1., 1]) + scale_factor = torch.asarray(scale_factor * 2, device=args.device) + data = pre_pipeline(data).to(args.device) + + result = model(data) + if source_type['is_dir']: + filename = os.path.relpath(file, args.img).replace('/', '_') + 
else: + filename = os.path.basename(file) + out_file = None if args.show else os.path.join(args.out_dir, filename) + + # Get candidate predict info by num_dets + num_dets, bboxes, scores, labels = result + scores = scores[0, :num_dets] + bboxes = bboxes[0, :num_dets] + labels = labels[0, :num_dets] + bboxes -= pad_param + bboxes /= scale_factor + + bboxes[:, 0::2].clamp_(0, w) + bboxes[:, 1::2].clamp_(0, h) + bboxes = bboxes.round().int() + + for (bbox, score, label) in zip(bboxes, scores, labels): + bbox = bbox.tolist() + color = colors[label] + + if class_names is not None: + label_name = class_names[label] + name = f'cls:{label_name}_score:{score:0.4f}' + else: + name = f'cls:{label}_score:{score:0.4f}' + + cv2.rectangle(bgr, bbox[:2], bbox[2:], color, 2) + cv2.putText( + bgr, + name, (bbox[0], bbox[1] - 2), + cv2.FONT_HERSHEY_SIMPLEX, + 2.0, [225, 255, 255], + thickness=3) + + if args.show: + mmcv.imshow(bgr, 'result', 0) + else: + mmcv.imwrite(bgr, out_file) + progress_bar.update() + + +if __name__ == '__main__': + main() diff --git a/third_party/mmyolo/projects/example_project/README.md b/third_party/mmyolo/projects/example_project/README.md new file mode 100644 index 0000000000000000000000000000000000000000..24c84d9808aa4a78294aa23058083e0de80de62e --- /dev/null +++ b/third_party/mmyolo/projects/example_project/README.md @@ -0,0 +1,141 @@ +# Dummy YOLOv5CSPDarknet Wrapper + +This is an example README for community `projects/`. We have provided detailed explanations for each field in the form of html comments, which are visible when you read the source of this README file. If you wish to submit your project to our main repository, then all the fields in this README are mandatory for others to understand what you have achieved in this implementation. For more details, read our [contribution guide](https://mmyolo.readthedocs.io/en/latest/community/contributing.html) or approach us in [Discussions](https://github.com/open-mmlab/mmyolo/discussions). + +## Description + + + +This project implements a dummy YOLOv5CSPDarknet wrapper, which literally does nothing new but prints "hello world" during initialization. 
+ +## Usage + + + +### Training commands + +In MMYOLO's root directory, run the following command to train the model: + +```bash +python tools/train.py projects/example_project/configs/yolov5_s_dummy-backbone_v61_syncbn_8xb16-300e_coco.py +``` + +### Testing commands + +In MMYOLO's root directory, run the following command to test the model: + +```bash +python tools/test.py projects/example_project/configs/yolov5_s_dummy-backbone_v61_syncbn_8xb16-300e_coco.py ${CHECKPOINT_PATH} +``` + +## Results + + + +| Method | Backbone | Pretrained Model | Training set | Test set | #epoch | box AP | Download | +| :---------------------------------------------------------------------------: | :-------------------: | :--------------: | :------------: | :----------: | :----: | :----: | :----------------------: | +| [YOLOv5 dummy](configs/yolov5_s_dummy-backbone_v61_syncbn_8xb16-300e_coco.py) | DummyYOLOv5CSPDarknet | - | COCO2017 Train | COCO2017 Val | 300 | 37.7 | [model](<>) \| [log](<>) | + +## Citation + + + +```latex +@software{glenn_jocher_2022_7002879, + author = {Glenn Jocher and + Ayush Chaurasia and + Alex Stoken and + Jirka Borovec and + NanoCode012 and + Yonghye Kwon and + TaoXie and + Kalen Michael and + Jiacong Fang and + imyhxy and + Lorna and + Colin Wong and + 曾逸夫(Zeng Yifu) and + Abhiram V and + Diego Montes and + Zhiqiang Wang and + Cristi Fati and + Jebastin Nadar and + Laughing and + UnglvKitDe and + tkianai and + yxNONG and + Piotr Skalski and + Adam Hogan and + Max Strobel and + Mrinal Jain and + Lorenzo Mammana and + xylieong}, + title = {{ultralytics/yolov5: v6.2 - YOLOv5 Classification + Models, Apple M1, Reproducibility, ClearML and + Deci.ai integrations}}, + month = aug, + year = 2022, + publisher = {Zenodo}, + version = {v6.2}, + doi = {10.5281/zenodo.7002879}, + url = {https://doi.org/10.5281/zenodo.7002879} +} +``` + +## Checklist + + + +- [ ] Milestone 1: PR-ready, and acceptable to be one of the `projects/`. + + - [ ] Finish the code + + + + - [ ] Basic docstrings & proper citation + + + + - [ ] Test-time correctness + + + + - [ ] A full README + + + +- [ ] Milestone 2: Indicates a successful model implementation. + + - [ ] Training-time correctness + + + +- [ ] Milestone 3: Good to be a part of our core package! + + - [ ] Type hints and docstrings + + + + - [ ] Unit tests + + + + - [ ] Code polishing + + + + - [ ] Metafile.yml + + + +- [ ] Move your modules into the core package following the codebase's file hierarchy structure. + + + +- [ ] Refactor your modules into the core package following the codebase's file hierarchy structure. 
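As a complement to the training/testing commands in this example README, here is a minimal sketch (hypothetical, not part of the project files; paths and defaults are assumptions) of how one could verify that the dummy backbone defined below in `projects/example_project/dummy/` is registered and buildable:

```python
# Hypothetical sanity check; run from the MMYOLO root so that
# `projects.example_project.dummy` is importable.
from mmyolo.registry import MODELS
from mmyolo.utils import register_all_modules

import projects.example_project.dummy  # noqa: F401, triggers @MODELS.register_module()

register_all_modules()
# Building with default arguments should print 'Hello world!' from __init__.
backbone = MODELS.build(dict(type='DummyYOLOv5CSPDarknet'))
print(type(backbone).__name__)  # DummyYOLOv5CSPDarknet
```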
diff --git a/third_party/mmyolo/projects/example_project/configs/yolov5_s_dummy-backbone_v61_syncbn_8xb16-300e_coco.py b/third_party/mmyolo/projects/example_project/configs/yolov5_s_dummy-backbone_v61_syncbn_8xb16-300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..55b43bb3e97a20b4b9f98d5bc297bf8ef375da8e --- /dev/null +++ b/third_party/mmyolo/projects/example_project/configs/yolov5_s_dummy-backbone_v61_syncbn_8xb16-300e_coco.py @@ -0,0 +1,5 @@ +_base_ = '../../../configs/yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py' + +custom_imports = dict(imports=['projects.example_project.dummy']) + +_base_.model.backbone.type = 'DummyYOLOv5CSPDarknet' diff --git a/third_party/mmyolo/projects/example_project/dummy/__init__.py b/third_party/mmyolo/projects/example_project/dummy/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ca1028c8735be8ece5942d0ca64b69a8da16ed82 --- /dev/null +++ b/third_party/mmyolo/projects/example_project/dummy/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .dummy_yolov5cspdarknet import DummyYOLOv5CSPDarknet + +__all__ = ['DummyYOLOv5CSPDarknet'] diff --git a/third_party/mmyolo/projects/example_project/dummy/dummy_yolov5cspdarknet.py b/third_party/mmyolo/projects/example_project/dummy/dummy_yolov5cspdarknet.py new file mode 100644 index 0000000000000000000000000000000000000000..c500abb4278581af99d6a190fd7694ffdd08117c --- /dev/null +++ b/third_party/mmyolo/projects/example_project/dummy/dummy_yolov5cspdarknet.py @@ -0,0 +1,16 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +from mmyolo.models import YOLOv5CSPDarknet +from mmyolo.registry import MODELS + + +@MODELS.register_module() +class DummyYOLOv5CSPDarknet(YOLOv5CSPDarknet): + """Implements a dummy YOLOv5CSPDarknet wrapper for demonstration purpose. + Args: + **kwargs: All the arguments are passed to the parent class. + """ + + def __init__(self, **kwargs) -> None: + print('Hello world!') + super().__init__(**kwargs) diff --git a/third_party/mmyolo/projects/misc/custom_dataset/README.md b/third_party/mmyolo/projects/misc/custom_dataset/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e98fa730241aee0d54fea62fb752ab4eb901f0a0 --- /dev/null +++ b/third_party/mmyolo/projects/misc/custom_dataset/README.md @@ -0,0 +1,3 @@ +Tips: 这个是自定义数据集的 config 文件,请结合 [标注+训练+测试+部署全流程](https://github.com/open-mmlab/mmyolo/blob/main/docs/zh_cn/recommended_topics/labeling_to_deployment_tutorials.md) 来使用。 + +Tips: This is the config file of the custom dataset. Please use it in combination with [Annotation-to-deployment workflow for custom dataset](https://github.com/open-mmlab/mmyolo/blob/main/docs/en/recommended_topics/labeling_to_deployment_tutorials.md). 
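The custom-dataset configs that follow scale the learning rate with the rule noted in their comments, `base_lr_default * (your_bs / default_bs)`. A small worked illustration of that arithmetic (plain Python, not part of any config) for the 1-GPU, batch-32 cat configs below:

```python
# Learning-rate scaling used by the cat fine-tuning configs below.
default_bs = 8 * 16          # base YOLOv5/YOLOv7 configs assume 8 GPUs x 16 images
your_bs = 1 * 32             # these configs train on 1 GPU with batch size 32
print(your_bs / default_bs)  # 0.25 -> base_lr = _base_.base_lr / 4

# The YOLOv6 base config assumes 8 GPUs x 32 images, so the same rule gives 1/8.
print((1 * 32) / (8 * 32))   # 0.125 -> base_lr = _base_.base_lr / 8
```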
diff --git a/third_party/mmyolo/projects/misc/custom_dataset/yolov5_s-v61_syncbn_fast_1xb32-100e_cat.py b/third_party/mmyolo/projects/misc/custom_dataset/yolov5_s-v61_syncbn_fast_1xb32-100e_cat.py new file mode 100644 index 0000000000000000000000000000000000000000..1d6a9d3b0f5ecf9ff7a46202d50b733810d93124 --- /dev/null +++ b/third_party/mmyolo/projects/misc/custom_dataset/yolov5_s-v61_syncbn_fast_1xb32-100e_cat.py @@ -0,0 +1,76 @@ +_base_ = '../yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py' + +max_epochs = 100 +data_root = './data/cat/' +# data_root = '/root/workspace/mmyolo/data/cat/' # Docker + +work_dir = './work_dirs/yolov5_s-v61_syncbn_fast_1xb32-100e_cat' + +load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth' # noqa + +train_batch_size_per_gpu = 32 +train_num_workers = 4 + +save_epoch_intervals = 2 + +# base_lr_default * (your_bs / default_bs) +base_lr = _base_.base_lr / 4 + +anchors = [ + [(68, 69), (154, 91), (143, 162)], # P3/8 + [(242, 160), (189, 287), (391, 207)], # P4/16 + [(353, 337), (539, 341), (443, 432)] # P5/32 +] + +class_name = ('cat', ) +num_classes = len(class_name) +metainfo = dict(classes=class_name, palette=[(220, 20, 60)]) + +train_cfg = dict( + max_epochs=max_epochs, val_begin=20, val_interval=save_epoch_intervals) + +model = dict( + bbox_head=dict( + head_module=dict(num_classes=num_classes), + prior_generator=dict(base_sizes=anchors), + loss_cls=dict(loss_weight=0.5 * + (num_classes / 80 * 3 / _base_.num_det_layers)))) + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + _delete_=True, + type='RepeatDataset', + times=5, + dataset=dict( + type=_base_.dataset_type, + data_root=data_root, + metainfo=metainfo, + ann_file='annotations/trainval.json', + data_prefix=dict(img='images/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=_base_.train_pipeline))) + +val_dataloader = dict( + dataset=dict( + metainfo=metainfo, + data_root=data_root, + ann_file='annotations/trainval.json', + data_prefix=dict(img='images/'))) + +test_dataloader = val_dataloader + +val_evaluator = dict(ann_file=data_root + 'annotations/trainval.json') +test_evaluator = val_evaluator + +optim_wrapper = dict(optimizer=dict(lr=base_lr)) + +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + interval=save_epoch_intervals, + max_keep_ckpts=5, + save_best='auto'), + param_scheduler=dict(max_epochs=max_epochs), + logger=dict(type='LoggerHook', interval=10)) diff --git a/third_party/mmyolo/projects/misc/custom_dataset/yolov6_s_syncbn_fast_1xb32-100e_cat.py b/third_party/mmyolo/projects/misc/custom_dataset/yolov6_s_syncbn_fast_1xb32-100e_cat.py new file mode 100644 index 0000000000000000000000000000000000000000..67d5638aae7532efb60bd608f2a976d8991503b8 --- /dev/null +++ b/third_party/mmyolo/projects/misc/custom_dataset/yolov6_s_syncbn_fast_1xb32-100e_cat.py @@ -0,0 +1,85 @@ +_base_ = '../yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco.py' + +max_epochs = 100 +data_root = './data/cat/' + +work_dir = './work_dirs/yolov6_s_syncbn_fast_1xb32-100e_cat' + +load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco/yolov6_s_syncbn_fast_8xb32-400e_coco_20221102_203035-932e1d91.pth' # noqa + +train_batch_size_per_gpu = 32 +train_num_workers = 4 # train_num_workers = nGPU x 4 + +save_epoch_intervals = 2 + +# base_lr_default * (your_bs / default_bs) +base_lr 
= _base_.base_lr / 8 + +class_name = ('cat', ) +num_classes = len(class_name) +metainfo = dict(classes=class_name, palette=[(220, 20, 60)]) + +train_cfg = dict( + max_epochs=max_epochs, + val_begin=20, + val_interval=save_epoch_intervals, + dynamic_intervals=[(max_epochs - _base_.num_last_epochs, 1)]) + +model = dict( + bbox_head=dict(head_module=dict(num_classes=num_classes)), + train_cfg=dict( + initial_assigner=dict(num_classes=num_classes), + assigner=dict(num_classes=num_classes))) + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + _delete_=True, + type='RepeatDataset', + times=5, + dataset=dict( + type=_base_.dataset_type, + data_root=data_root, + metainfo=metainfo, + ann_file='annotations/trainval.json', + data_prefix=dict(img='images/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=_base_.train_pipeline))) + +val_dataloader = dict( + dataset=dict( + metainfo=metainfo, + data_root=data_root, + ann_file='annotations/trainval.json', + data_prefix=dict(img='images/'))) + +test_dataloader = val_dataloader + +val_evaluator = dict(ann_file=data_root + 'annotations/trainval.json') +test_evaluator = val_evaluator + +optim_wrapper = dict(optimizer=dict(lr=base_lr)) + +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + interval=save_epoch_intervals, + max_keep_ckpts=5, + save_best='auto'), + param_scheduler=dict(max_epochs=max_epochs), + logger=dict(type='LoggerHook', interval=10)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - _base_.num_last_epochs, + switch_pipeline=_base_.train_pipeline_stage2) +] diff --git a/third_party/mmyolo/projects/misc/custom_dataset/yolov7_tiny_syncbn_fast_1xb32-100e_cat.py b/third_party/mmyolo/projects/misc/custom_dataset/yolov7_tiny_syncbn_fast_1xb32-100e_cat.py new file mode 100644 index 0000000000000000000000000000000000000000..fff59cb3d31f002724b11674bb8c1550220be503 --- /dev/null +++ b/third_party/mmyolo/projects/misc/custom_dataset/yolov7_tiny_syncbn_fast_1xb32-100e_cat.py @@ -0,0 +1,78 @@ +_base_ = '../yolov7/yolov7_tiny_syncbn_fast_8x16b-300e_coco.py' + +max_epochs = 100 +data_root = './data/cat/' + +work_dir = './work_dirs/yolov7_tiny_syncbn_fast_1xb32-100e_cat' + +load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_tiny_syncbn_fast_8x16b-300e_coco/yolov7_tiny_syncbn_fast_8x16b-300e_coco_20221126_102719-0ee5bbdf.pth' # noqa + +train_batch_size_per_gpu = 32 +train_num_workers = 4 # train_num_workers = nGPU x 4 + +save_epoch_intervals = 2 + +# base_lr_default * (your_bs / default_bs) +base_lr = 0.01 / 4 + +anchors = [ + [(68, 69), (154, 91), (143, 162)], # P3/8 + [(242, 160), (189, 287), (391, 207)], # P4/16 + [(353, 337), (539, 341), (443, 432)] # P5/32 +] + +class_name = ('cat', ) +num_classes = len(class_name) +metainfo = dict(classes=class_name, palette=[(220, 20, 60)]) + +train_cfg = dict( + max_epochs=max_epochs, + val_begin=20, + val_interval=save_epoch_intervals, + dynamic_intervals=[(max_epochs - 10, 1)]) + +model = dict( + bbox_head=dict( + head_module=dict(num_classes=num_classes), + prior_generator=dict(base_sizes=anchors), + loss_cls=dict(loss_weight=0.5 * + (num_classes / 80 * 3 / _base_.num_det_layers)))) + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + _delete_=True, + 
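+        # `_delete_=True` discards the base COCO dataset settings; the small
+        # cat set is wrapped in RepeatDataset (times=5) so each epoch
+        # iterates over it five times.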
type='RepeatDataset', + times=5, + dataset=dict( + type=_base_.dataset_type, + data_root=data_root, + metainfo=metainfo, + ann_file='annotations/trainval.json', + data_prefix=dict(img='images/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=_base_.train_pipeline))) + +val_dataloader = dict( + dataset=dict( + metainfo=metainfo, + data_root=data_root, + ann_file='annotations/trainval.json', + data_prefix=dict(img='images/'))) + +test_dataloader = val_dataloader + +val_evaluator = dict(ann_file=data_root + 'annotations/trainval.json') +test_evaluator = val_evaluator + +optim_wrapper = dict(optimizer=dict(lr=base_lr)) + +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + interval=save_epoch_intervals, + max_keep_ckpts=2, + save_best='auto'), + param_scheduler=dict(max_epochs=max_epochs), + logger=dict(type='LoggerHook', interval=10)) diff --git a/third_party/mmyolo/projects/misc/ionogram_detection/README.md b/third_party/mmyolo/projects/misc/ionogram_detection/README.md new file mode 100644 index 0000000000000000000000000000000000000000..eb7ddd580fb4e2872e54b9eade49a25b83211159 --- /dev/null +++ b/third_party/mmyolo/projects/misc/ionogram_detection/README.md @@ -0,0 +1,3 @@ +Tips: 这是 MMYOLO 应用范例的配置文件,请结合 [基于 MMYOLO 的频高图实时目标检测 benchmark](/docs/zh_cn/recommended_topics/application_examples/ionogram_detection.md) 来使用。 + +Tips: This is the config file of the MMYOLO application examples. Please use it in combination with [A Benchmark for Ionogram Detection Based on MMYOLO](/docs/en/recommended_topics/application_examples/ionogram_detection.md). diff --git a/third_party/mmyolo/projects/misc/ionogram_detection/rtmdet/rtmdet_l_fast_1xb32-100e_ionogram.py b/third_party/mmyolo/projects/misc/ionogram_detection/rtmdet/rtmdet_l_fast_1xb32-100e_ionogram.py new file mode 100644 index 0000000000000000000000000000000000000000..f1829eebf93e0dd8480819ef7710b94c2f3c24f5 --- /dev/null +++ b/third_party/mmyolo/projects/misc/ionogram_detection/rtmdet/rtmdet_l_fast_1xb32-100e_ionogram.py @@ -0,0 +1,107 @@ +_base_ = 'mmyolo::rtmdet/rtmdet_l_syncbn_fast_8xb32-300e_coco.py' + +# ======================== Modified parameters ====================== +# -----data related----- +data_root = './Iono4311/' +train_ann_file = 'annotations/train.json' +train_data_prefix = 'train_images/' +val_ann_file = 'annotations/val.json' +val_data_prefix = 'val_images/' +test_ann_file = 'annotations/test.json' +test_data_prefix = 'test_images/' + +class_name = ('E', 'Es-l', 'Es-c', 'F1', 'F2', 'Spread-F') +num_classes = len(class_name) +metainfo = dict( + classes=class_name, + palette=[(250, 165, 30), (120, 69, 125), (53, 125, 34), (0, 11, 123), + (130, 20, 12), (120, 121, 80)]) + +train_batch_size_per_gpu = 32 +train_num_workers = 8 +val_batch_size_per_gpu = train_batch_size_per_gpu + +# Config of batch shapes. Only on val. 
+batch_shapes_cfg = dict(batch_size=val_batch_size_per_gpu) + +# -----train val related----- +load_from = 'https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_l_syncbn_fast_8xb32-300e_coco/rtmdet_l_syncbn_fast_8xb32-300e_coco_20230102_135928-ee3abdc4.pth' # noqa + +# default hooks +save_epoch_intervals = 10 +max_epochs = 100 +max_keep_ckpts = 1 + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', start_factor=1.0e-5, by_epoch=False, begin=0, + end=300), + dict( + # use cosine lr from 20 to 100 epoch + type='CosineAnnealingLR', + eta_min=_base_.base_lr * 0.05, + begin=max_epochs // 5, + end=max_epochs, + T_max=max_epochs * 4 // 5, + by_epoch=True, + convert_to_iter_based=True), +] + +# train_cfg +val_interval = 2 +val_begin = 20 + +tta_model = None +tta_pipeline = None + +visualizer = dict( + vis_backends=[dict(type='LocalVisBackend'), + dict(type='WandbVisBackend')]) + +# ===================== Unmodified in most cases ================== +model = dict( + bbox_head=dict(head_module=dict(num_classes=num_classes)), + train_cfg=dict(assigner=dict(num_classes=num_classes))) + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + metainfo=metainfo, + data_root=data_root, + ann_file=train_ann_file, + data_prefix=dict(img=train_data_prefix))) + +val_dataloader = dict( + batch_size=val_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + metainfo=metainfo, + data_root=data_root, + data_prefix=dict(img=val_data_prefix), + ann_file=val_ann_file)) + +test_dataloader = dict( + batch_size=val_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + metainfo=metainfo, + data_root=data_root, + data_prefix=dict(img=test_data_prefix), + ann_file=test_ann_file)) + +default_hooks = dict( + checkpoint=dict( + interval=save_epoch_intervals, + max_keep_ckpts=max_keep_ckpts, + save_best='auto')) + +val_evaluator = dict(ann_file=data_root + val_ann_file) +test_evaluator = dict(ann_file=data_root + test_ann_file) + +train_cfg = dict( + type='EpochBasedTrainLoop', + max_epochs=max_epochs, + val_begin=val_begin, + val_interval=val_interval) diff --git a/third_party/mmyolo/projects/misc/ionogram_detection/rtmdet/rtmdet_s_fast_1xb32-100e_ionogram.py b/third_party/mmyolo/projects/misc/ionogram_detection/rtmdet/rtmdet_s_fast_1xb32-100e_ionogram.py new file mode 100644 index 0000000000000000000000000000000000000000..49b284b09a0c5605d59c2e332f9894aadaf3d483 --- /dev/null +++ b/third_party/mmyolo/projects/misc/ionogram_detection/rtmdet/rtmdet_s_fast_1xb32-100e_ionogram.py @@ -0,0 +1,83 @@ +_base_ = './rtmdet_l_fast_1xb32-100e_ionogram.py' + +load_from = 'https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_s_syncbn_fast_8xb32-300e_coco/rtmdet_s_syncbn_fast_8xb32-300e_coco_20221230_182329-0a8c901a.pth' # noqa + +# ======================= Modified parameters ===================== +deepen_factor = 0.33 +widen_factor = 0.5 +img_scale = _base_.img_scale + +# ratio range for random resize +random_resize_ratio_range = (0.5, 2.0) +# Number of cached images in mosaic +mosaic_max_cached_images = 40 +# Number of cached images in mixup +mixup_max_cached_images = 20 + +# ===================== Unmodified in most cases ================== +model = dict( + backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) + +train_pipeline = [ + dict(type='LoadImageFromFile', 
backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Mosaic', + img_scale=img_scale, + use_cached=True, + max_cached_images=mosaic_max_cached_images, + pad_val=114.0), + dict( + type='mmdet.RandomResize', + # img_scale is (width, height) + scale=(img_scale[0] * 2, img_scale[1] * 2), + ratio_range=random_resize_ratio_range, # note + resize_type='mmdet.Resize', + keep_ratio=True), + dict(type='mmdet.RandomCrop', crop_size=img_scale), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict(type='mmdet.Pad', size=img_scale, pad_val=dict(img=(114, 114, 114))), + dict( + type='YOLOv5MixUp', + use_cached=True, + max_cached_images=mixup_max_cached_images), + dict(type='mmdet.PackDetInputs') +] + +train_pipeline_stage2 = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='mmdet.RandomResize', + scale=img_scale, + ratio_range=random_resize_ratio_range, # note + resize_type='mmdet.Resize', + keep_ratio=True), + dict(type='mmdet.RandomCrop', crop_size=img_scale), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict(type='mmdet.Pad', size=img_scale, pad_val=dict(img=(114, 114, 114))), + dict(type='mmdet.PackDetInputs') +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0002, + update_buffers=True, + strict_load=False, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=_base_.max_epochs - _base_.num_epochs_stage2, + switch_pipeline=train_pipeline_stage2) +] diff --git a/third_party/mmyolo/projects/misc/ionogram_detection/rtmdet/rtmdet_tiny_fast_1xb32-100e_ionogram.py b/third_party/mmyolo/projects/misc/ionogram_detection/rtmdet/rtmdet_tiny_fast_1xb32-100e_ionogram.py new file mode 100644 index 0000000000000000000000000000000000000000..acdaa0756c5df4e3aff3391651ab737c6632da44 --- /dev/null +++ b/third_party/mmyolo/projects/misc/ionogram_detection/rtmdet/rtmdet_tiny_fast_1xb32-100e_ionogram.py @@ -0,0 +1,62 @@ +_base_ = './rtmdet_s_fast_1xb32-100e_ionogram.py' + +# ======================= Modified parameters ====================== +deepen_factor = 0.167 +widen_factor = 0.375 +img_scale = _base_.img_scale + +load_from = 'https://download.openmmlab.com/mmyolo/v0/rtmdet/rtmdet_tiny_syncbn_fast_8xb32-300e_coco/rtmdet_tiny_syncbn_fast_8xb32-300e_coco_20230102_140117-dbb1dc83.pth' # noqa + +# learning rate +param_scheduler = [ + dict( + type='LinearLR', start_factor=1.0e-5, by_epoch=False, begin=0, + end=300), + dict( + # use cosine lr from 50 to 100 epoch + type='CosineAnnealingLR', + eta_min=_base_.base_lr * 0.05, + begin=_base_.max_epochs // 2, + end=_base_.max_epochs, + T_max=_base_.max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# =======================Unmodified in most cases================== +model = dict( + backbone=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + neck=dict(deepen_factor=deepen_factor, widen_factor=widen_factor), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Mosaic', + img_scale=img_scale, + use_cached=True, + max_cached_images=20, # note + random_pop=False, # note + pad_val=114.0), + dict( + type='mmdet.RandomResize', + # img_scale is (width, height) + 
scale=(img_scale[0] * 2, img_scale[1] * 2), + ratio_range=(0.5, 2.0), + resize_type='mmdet.Resize', + keep_ratio=True), + dict(type='mmdet.RandomCrop', crop_size=img_scale), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict(type='mmdet.Pad', size=img_scale, pad_val=dict(img=(114, 114, 114))), + dict( + type='YOLOv5MixUp', + use_cached=True, + random_pop=False, + max_cached_images=10, + prob=0.5), + dict(type='mmdet.PackDetInputs') +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) diff --git a/third_party/mmyolo/projects/misc/ionogram_detection/yolov5/yolov5_m-v61_fast_1xb32-100e_ionogram.py b/third_party/mmyolo/projects/misc/ionogram_detection/yolov5/yolov5_m-v61_fast_1xb32-100e_ionogram.py new file mode 100644 index 0000000000000000000000000000000000000000..737aeae9abeaee0e0024f04f4d7bfbeb9d8798a6 --- /dev/null +++ b/third_party/mmyolo/projects/misc/ionogram_detection/yolov5/yolov5_m-v61_fast_1xb32-100e_ionogram.py @@ -0,0 +1,95 @@ +_base_ = './yolov5_s-v61_fast_1xb96-100e_ionogram.py' + +# ======================= Modified parameters ===================== +# Copied from '../../yolov5/yolov5_m-v61_syncbn_fast_8xb16-300e_coco.py' +deepen_factor = 0.67 +widen_factor = 0.75 +lr_factor = 0.1 +affine_scale = 0.9 +loss_cls_weight = 0.3 +loss_obj_weight = 0.7 +mixup_prob = 0.1 + +# -----data related----- +train_batch_size_per_gpu = 32 + +# -----train val related----- +# Scale lr for SGD +base_lr = _base_.base_lr * train_batch_size_per_gpu \ + / _base_.train_batch_size_per_gpu +load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_m-v61_syncbn_fast_8xb16-300e_coco/yolov5_m-v61_syncbn_fast_8xb16-300e_coco_20220917_204944-516a710f.pth' # noqa + +# ===================== Unmodified in most cases ================== +num_classes = _base_.num_classes +num_det_layers = _base_.num_det_layers +img_scale = _base_.img_scale + +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + ), + bbox_head=dict( + head_module=dict(widen_factor=widen_factor), + loss_cls=dict(loss_weight=loss_cls_weight * + (num_classes / 80 * 3 / num_det_layers)), + loss_obj=dict(loss_weight=loss_obj_weight * + ((img_scale[0] / 640)**2 * 3 / num_det_layers)))) + +pre_transform = _base_.pre_transform +albu_train_transforms = _base_.albu_train_transforms + +mosaic_affine_pipeline = [ + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)) +] + +# enable mixup +train_pipeline = [ + *pre_transform, *mosaic_affine_pipeline, + dict( + type='YOLOv5MixUp', + prob=mixup_prob, + pre_transform=[*pre_transform, *mosaic_affine_pipeline]), + dict( + type='mmdet.Albu', + transforms=albu_train_transforms, + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap={ + 'img': 'image', + 'gt_bboxes': 'bboxes' + }), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + 
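+    # The nested dataset=dict(dataset=...) reaches through the RepeatDataset
+    # wrapper defined in the base ionogram config.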
dataset=dict(dataset=dict(pipeline=train_pipeline))) + +val_dataloader = dict(batch_size=train_batch_size_per_gpu) +test_dataloader = dict(batch_size=train_batch_size_per_gpu) +optim_wrapper = dict(optimizer=dict(lr=base_lr)) +default_hooks = dict(param_scheduler=dict(lr_factor=lr_factor)) diff --git a/third_party/mmyolo/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb32-100e_ionogram_mosaic.py b/third_party/mmyolo/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb32-100e_ionogram_mosaic.py new file mode 100644 index 0000000000000000000000000000000000000000..1252ebfca09eb21b1b96d4424c2329855e1b1a40 --- /dev/null +++ b/third_party/mmyolo/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb32-100e_ionogram_mosaic.py @@ -0,0 +1,35 @@ +_base_ = './yolov5_s-v61_fast_1xb96-100e_ionogram.py' + +# ======================= Modified parameters ===================== +# -----data related----- +train_batch_size_per_gpu = 32 + +# -----train val related----- +base_lr = _base_.base_lr * train_batch_size_per_gpu \ + / _base_.train_batch_size_per_gpu / 2 +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Mosaic', + img_scale=(640, 640), + pad_val=114.0, + pre_transform=[ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True) + ]), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape')) +] + +# ===================== Unmodified in most cases ================== +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + dataset=dict(dataset=dict(pipeline=train_pipeline))) + +val_dataloader = dict(batch_size=train_batch_size_per_gpu) + +test_dataloader = dict(batch_size=train_batch_size_per_gpu) + +optim_wrapper = dict(optimizer=dict(lr=base_lr)) diff --git a/third_party/mmyolo/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram.py b/third_party/mmyolo/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram.py new file mode 100644 index 0000000000000000000000000000000000000000..dbe1305d835e8e0a435433deb36ff0d7ce9ec77d --- /dev/null +++ b/third_party/mmyolo/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram.py @@ -0,0 +1,108 @@ +_base_ = 'mmyolo::yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py' + +# ======================= Modified parameters ===================== +# -----data related----- +data_root = './Iono4311/' +train_ann_file = 'annotations/train.json' +train_data_prefix = 'train_images/' +val_ann_file = 'annotations/val.json' +val_data_prefix = 'val_images/' +test_ann_file = 'annotations/test.json' +test_data_prefix = 'test_images/' +class_name = ('E', 'Es-l', 'Es-c', 'F1', 'F2', 'Spread-F') +num_classes = len(class_name) +metainfo = dict( + classes=class_name, + palette=[(250, 165, 30), (120, 69, 125), (53, 125, 34), (0, 11, 123), + (130, 20, 12), (120, 121, 80)]) +# Batch size of a single GPU during training +train_batch_size_per_gpu = 96 +# Worker to pre-fetch data for each single GPU during training +train_num_workers = 8 + +# -----model related----- +# Basic size of multi-scale prior box +anchors = [[[8, 6], [24, 4], [19, 9]], [[22, 19], [17, 49], [29, 45]], + [[44, 66], [96, 76], [126, 59]]] + +# -----train val related----- +# base_lr_default * (your_bs / default_bs (8x16)) for SGD +base_lr = _base_.base_lr * train_batch_size_per_gpu / (8 * 16) +max_epochs = 100 +load_from = 
'https://download.openmmlab.com/mmyolo/v0/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco/yolov5_s-v61_syncbn_fast_8xb16-300e_coco_20220918_084700-86e02187.pth' # noqa + +# default_hooks +save_epoch_intervals = 10 +logger_interval = 20 +max_keep_ckpts = 1 + +# train_cfg +val_interval = 2 +val_begin = 20 + +tta_model = None +tta_pipeline = None + +visualizer = dict( + vis_backends=[dict(type='LocalVisBackend'), + dict(type='WandbVisBackend')]) + +# ===================== Unmodified in most cases ================== +model = dict( + bbox_head=dict( + head_module=dict(num_classes=num_classes), + prior_generator=dict(base_sizes=anchors), + loss_cls=dict(loss_weight=0.5 * + (num_classes / 80 * 3 / _base_.num_det_layers)))) + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + _delete_=True, + type='RepeatDataset', + times=1, + dataset=dict( + type=_base_.dataset_type, + data_root=data_root, + metainfo=metainfo, + ann_file=train_ann_file, + data_prefix=dict(img=train_data_prefix), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=_base_.train_pipeline))) + +val_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + metainfo=metainfo, + data_root=data_root, + ann_file=val_ann_file, + data_prefix=dict(img=val_data_prefix))) + +test_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + metainfo=metainfo, + data_root=data_root, + ann_file=test_ann_file, + data_prefix=dict(img=test_data_prefix))) + +optim_wrapper = dict(optimizer=dict(lr=base_lr)) + +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + save_param_scheduler=None, # for yolov5 + interval=save_epoch_intervals, + max_keep_ckpts=max_keep_ckpts, + save_best='auto'), + param_scheduler=dict(max_epochs=max_epochs), + logger=dict(type='LoggerHook', interval=logger_interval)) + +val_evaluator = dict(ann_file=data_root + val_ann_file) +test_evaluator = dict(ann_file=data_root + test_ann_file) + +train_cfg = dict( + max_epochs=max_epochs, val_begin=val_begin, val_interval=val_interval) diff --git a/third_party/mmyolo/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram_aug0.py b/third_party/mmyolo/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram_aug0.py new file mode 100644 index 0000000000000000000000000000000000000000..39ffb6ba1e110b0ee59136414939164d8e0fe1b5 --- /dev/null +++ b/third_party/mmyolo/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram_aug0.py @@ -0,0 +1,21 @@ +_base_ = './yolov5_s-v61_fast_1xb96-100e_ionogram.py' + +# ======================= Modified parameters ===================== +# -----train val related----- +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='YOLOv5KeepRatioResize', scale=(640, 640)), + dict( + type='LetterResize', + scale=(640, 640), + allow_scale_up=False, + pad_val=dict(img=114)), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'pad_param')) +] + +# ===================== Unmodified in most cases ================== +train_dataloader = dict(dataset=dict(dataset=dict(pipeline=train_pipeline))) diff --git a/third_party/mmyolo/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram_mosaic_affine.py 
b/third_party/mmyolo/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram_mosaic_affine.py new file mode 100644 index 0000000000000000000000000000000000000000..10c114cbcc1f754d46139157eece5d59666d6649 --- /dev/null +++ b/third_party/mmyolo/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram_mosaic_affine.py @@ -0,0 +1,29 @@ +_base_ = './yolov5_s-v61_fast_1xb96-100e_ionogram.py' + +# ======================= Modified parameters ===================== +# -----train val related----- +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Mosaic', + img_scale=(640, 640), + pad_val=114.0, + pre_transform=[ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True) + ]), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(0.5, 1.5), + border=(-320, -320), + border_val=(114, 114, 114)), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape')) +] + +# ===================== Unmodified in most cases ================== +train_dataloader = dict(dataset=dict(dataset=dict(pipeline=train_pipeline))) diff --git a/third_party/mmyolo/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram_mosaic_affine_albu_hsv.py b/third_party/mmyolo/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram_mosaic_affine_albu_hsv.py new file mode 100644 index 0000000000000000000000000000000000000000..df8f6a2c561a67b275abca3cc5ca3763f1527d72 --- /dev/null +++ b/third_party/mmyolo/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-100e_ionogram_mosaic_affine_albu_hsv.py @@ -0,0 +1,44 @@ +_base_ = './yolov5_s-v61_fast_1xb96-100e_ionogram.py' + +# ======================= Modified parameters ===================== +# -----train val related----- +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='Mosaic', + img_scale=(640, 640), + pad_val=114.0, + pre_transform=[ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True) + ]), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(0.5, 1.5), + border=(-320, -320), + border_val=(114, 114, 114)), + dict( + type='mmdet.Albu', + transforms=[ + dict(type='Blur', p=0.01), + dict(type='MedianBlur', p=0.01), + dict(type='ToGray', p=0.01), + dict(type='CLAHE', p=0.01) + ], + bbox_params=dict( + type='BboxParams', + format='pascal_voc', + label_fields=['gt_bboxes_labels', 'gt_ignore_flags']), + keymap=dict(img='image', gt_bboxes='bboxes')), + dict(type='YOLOv5HSVRandomAug'), + # dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape')) +] + +# ===================== Unmodified in most cases ================== +train_dataloader = dict(dataset=dict(dataset=dict(pipeline=train_pipeline))) diff --git a/third_party/mmyolo/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-200e_ionogram_pre0.py b/third_party/mmyolo/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-200e_ionogram_pre0.py new file mode 100644 index 0000000000000000000000000000000000000000..9f62fac92864c1de2d52d3382452a84a16dfe6f8 --- /dev/null +++ b/third_party/mmyolo/projects/misc/ionogram_detection/yolov5/yolov5_s-v61_fast_1xb96-200e_ionogram_pre0.py @@ -0,0 +1,17 @@ +_base_ = 
'./yolov5_s-v61_fast_1xb96-100e_ionogram.py' + +# ======================= Modified parameters ===================== +# -----train val related----- +base_lr = _base_.base_lr * 4 +max_epochs = 200 +load_from = None +logger_interval = 50 + +train_cfg = dict(max_epochs=max_epochs, ) + +# ===================== Unmodified in most cases ================== +optim_wrapper = dict(optimizer=dict(lr=base_lr)) + +default_hooks = dict( + param_scheduler=dict(max_epochs=max_epochs), + logger=dict(type='LoggerHook', interval=logger_interval)) diff --git a/third_party/mmyolo/projects/misc/ionogram_detection/yolov6/yolov6_l_fast_1xb32-100e_ionogram.py b/third_party/mmyolo/projects/misc/ionogram_detection/yolov6/yolov6_l_fast_1xb32-100e_ionogram.py new file mode 100644 index 0000000000000000000000000000000000000000..dc5918d828ddd82ca349a307cb015b7fc29f68f1 --- /dev/null +++ b/third_party/mmyolo/projects/misc/ionogram_detection/yolov6/yolov6_l_fast_1xb32-100e_ionogram.py @@ -0,0 +1,29 @@ +_base_ = './yolov6_m_fast_1xb32-100e_ionogram.py' + +# ======================= Modified parameters ======================= +# -----model related----- +deepen_factor = 1 +widen_factor = 1 + +# -----train val related----- +load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_l_syncbn_fast_8xb32-300e_coco/yolov6_l_syncbn_fast_8xb32-300e_coco_20221109_183156-91e3c447.pth' # noqa + +# ====================== Unmodified in most cases =================== +model = dict( + backbone=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + hidden_ratio=1. / 2, + block_cfg=dict( + type='ConvWrapper', + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001)), + act_cfg=dict(type='SiLU', inplace=True)), + neck=dict( + deepen_factor=deepen_factor, + widen_factor=widen_factor, + hidden_ratio=1. / 2, + block_cfg=dict( + type='ConvWrapper', + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001)), + block_act_cfg=dict(type='SiLU', inplace=True)), + bbox_head=dict(head_module=dict(widen_factor=widen_factor))) diff --git a/third_party/mmyolo/projects/misc/ionogram_detection/yolov6/yolov6_m_fast_1xb32-100e_ionogram.py b/third_party/mmyolo/projects/misc/ionogram_detection/yolov6/yolov6_m_fast_1xb32-100e_ionogram.py new file mode 100644 index 0000000000000000000000000000000000000000..00ea8ff055efd5b2094c723cb52118f51d3ce1c6 --- /dev/null +++ b/third_party/mmyolo/projects/misc/ionogram_detection/yolov6/yolov6_m_fast_1xb32-100e_ionogram.py @@ -0,0 +1,63 @@ +_base_ = './yolov6_s_fast_1xb32-100e_ionogram.py' + +# ======================= Modified parameters ======================= +# -----model related----- +# The scaling factor that controls the depth of the network structure +deepen_factor = 0.6 +# The scaling factor that controls the width of the network structure +widen_factor = 0.75 + +# -----train val related----- +affine_scale = 0.9 # YOLOv5RandomAffine scaling ratio +load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_m_syncbn_fast_8xb32-300e_coco/yolov6_m_syncbn_fast_8xb32-300e_coco_20221109_182658-85bda3f4.pth' # noqa + +# ====================== Unmodified in most cases =================== +model = dict( + backbone=dict( + type='YOLOv6CSPBep', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + hidden_ratio=2. / 3, + block_cfg=dict(type='RepVGGBlock'), + act_cfg=dict(type='ReLU', inplace=True)), + neck=dict( + type='YOLOv6CSPRepPAFPN', + deepen_factor=deepen_factor, + widen_factor=widen_factor, + block_cfg=dict(type='RepVGGBlock'), + hidden_ratio=2. 
/ 3, + block_act_cfg=dict(type='ReLU', inplace=True)), + bbox_head=dict( + type='YOLOv6Head', head_module=dict(widen_factor=widen_factor))) + +mosaic_affine_pipeline = [ + dict( + type='Mosaic', + img_scale=_base_.img_scale, + pad_val=114.0, + pre_transform=_base_.pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + scaling_ratio_range=(1 - affine_scale, 1 + affine_scale), + # img_scale is (width, height) + border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2), + border_val=(114, 114, 114)) +] + +train_pipeline = [ + *_base_.pre_transform, *mosaic_affine_pipeline, + dict( + type='YOLOv5MixUp', + prob=0.1, + pre_transform=[*_base_.pre_transform, *mosaic_affine_pipeline]), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_dataloader = dict(dataset=dict(dataset=dict(pipeline=train_pipeline))) diff --git a/third_party/mmyolo/projects/misc/ionogram_detection/yolov6/yolov6_s_fast_1xb32-100e_ionogram.py b/third_party/mmyolo/projects/misc/ionogram_detection/yolov6/yolov6_s_fast_1xb32-100e_ionogram.py new file mode 100644 index 0000000000000000000000000000000000000000..c9748b408d7a899d96c2852e1f5a9d726187957c --- /dev/null +++ b/third_party/mmyolo/projects/misc/ionogram_detection/yolov6/yolov6_s_fast_1xb32-100e_ionogram.py @@ -0,0 +1,108 @@ +_base_ = 'mmyolo::yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco.py' + +# ======================= Modified parameters ===================== +# -----data related----- +data_root = './Iono4311/' +train_ann_file = 'annotations/train.json' +train_data_prefix = 'train_images/' +val_ann_file = 'annotations/val.json' +val_data_prefix = 'val_images/' +test_ann_file = 'annotations/test.json' +test_data_prefix = 'test_images/' + +class_name = ('E', 'Es-l', 'Es-c', 'F1', 'F2', 'Spread-F') +num_classes = len(class_name) +metainfo = dict( + classes=class_name, + palette=[(250, 165, 30), (120, 69, 125), (53, 125, 34), (0, 11, 123), + (130, 20, 12), (120, 121, 80)]) + +train_batch_size_per_gpu = 32 +train_num_workers = 8 + +tta_model = None +tta_pipeline = None + +# -----train val related----- +load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco/yolov6_s_syncbn_fast_8xb32-400e_coco_20221102_203035-932e1d91.pth' # noqa +# base_lr_default * (your_bs 32 / default_bs (8 x 32)) +base_lr = _base_.base_lr * train_batch_size_per_gpu / (8 * 32) +max_epochs = 100 +save_epoch_intervals = 10 +val_begin = 20 +max_keep_ckpts = 1 +log_interval = 50 +visualizer = dict( + vis_backends=[dict(type='LocalVisBackend'), + dict(type='WandbVisBackend')]) + +# ==================== Unmodified in most cases =================== +train_cfg = dict( + max_epochs=max_epochs, + val_begin=val_begin, + val_interval=save_epoch_intervals, + dynamic_intervals=None) + +model = dict( + bbox_head=dict(head_module=dict(num_classes=num_classes)), + train_cfg=dict( + initial_assigner=dict(num_classes=num_classes), + assigner=dict(num_classes=num_classes))) + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + _delete_=True, + type='RepeatDataset', + times=1, + dataset=dict( + type=_base_.dataset_type, + data_root=data_root, + metainfo=metainfo, + ann_file=train_ann_file, + data_prefix=dict(img=train_data_prefix), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + 
pipeline=_base_.train_pipeline))) + +val_dataloader = dict( + dataset=dict( + metainfo=metainfo, + data_root=data_root, + ann_file=val_ann_file, + data_prefix=dict(img=val_data_prefix))) + +test_dataloader = dict( + dataset=dict( + metainfo=metainfo, + data_root=data_root, + ann_file=test_ann_file, + data_prefix=dict(img=test_data_prefix))) + +val_evaluator = dict(ann_file=data_root + val_data_prefix) +test_evaluator = dict(ann_file=data_root + test_data_prefix) + +optim_wrapper = dict(optimizer=dict(lr=base_lr)) + +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + interval=save_epoch_intervals, + max_keep_ckpts=max_keep_ckpts, + save_best='auto'), + param_scheduler=dict(max_epochs=max_epochs), + logger=dict(type='LoggerHook', interval=log_interval)) + +custom_hooks = [ + dict( + type='EMAHook', + ema_type='ExpMomentumEMA', + momentum=0.0001, + update_buffers=True, + strict_load=False, + priority=49), + dict( + type='mmdet.PipelineSwitchHook', + switch_epoch=max_epochs - _base_.num_last_epochs, + switch_pipeline=_base_.train_pipeline_stage2) +] diff --git a/third_party/mmyolo/projects/misc/ionogram_detection/yolov6/yolov6_s_fast_1xb32-200e_ionogram_pre0.py b/third_party/mmyolo/projects/misc/ionogram_detection/yolov6/yolov6_s_fast_1xb32-200e_ionogram_pre0.py new file mode 100644 index 0000000000000000000000000000000000000000..cc38730f971664bb07edff2a8497e25d4376531f --- /dev/null +++ b/third_party/mmyolo/projects/misc/ionogram_detection/yolov6/yolov6_s_fast_1xb32-200e_ionogram_pre0.py @@ -0,0 +1,17 @@ +_base_ = './yolov6_s_fast_1xb32-100e_ionogram.py' + +# ======================= Modified parameters ===================== +base_lr = _base_.base_lr * 4 +optim_wrapper = dict(optimizer=dict(lr=base_lr)) +max_epochs = 200 +load_from = None + +# ==================== Unmodified in most cases =================== +train_cfg = dict( + max_epochs=max_epochs, + val_begin=20, +) + +default_hooks = dict( + param_scheduler=dict(max_epochs=max_epochs), + logger=dict(type='LoggerHook', interval=50)) diff --git a/third_party/mmyolo/projects/misc/ionogram_detection/yolov7/yolov7_l_fast_1xb16-100e_ionogram.py b/third_party/mmyolo/projects/misc/ionogram_detection/yolov7/yolov7_l_fast_1xb16-100e_ionogram.py new file mode 100644 index 0000000000000000000000000000000000000000..44d58c1f33a12b945c4fafb6f01b521a2e8c6e54 --- /dev/null +++ b/third_party/mmyolo/projects/misc/ionogram_detection/yolov7/yolov7_l_fast_1xb16-100e_ionogram.py @@ -0,0 +1,98 @@ +_base_ = 'mmyolo::yolov7/yolov7_l_syncbn_fast_8x16b-300e_coco.py' + +# ======================== Modified parameters ====================== +# -----data related----- +data_root = './Iono4311/' +train_ann_file = 'annotations/train.json' +train_data_prefix = 'train_images/' +val_ann_file = 'annotations/val.json' +val_data_prefix = 'val_images/' +test_ann_file = 'annotations/test.json' +test_data_prefix = 'test_images/' + +class_name = ('E', 'Es-l', 'Es-c', 'F1', 'F2', 'Spread-F') +num_classes = len(class_name) +metainfo = dict( + classes=class_name, + palette=[(250, 165, 30), (120, 69, 125), (53, 125, 34), (0, 11, 123), + (130, 20, 12), (120, 121, 80)]) + +train_batch_size_per_gpu = 16 +train_num_workers = 8 + +# -----model related----- +anchors = [[[14, 14], [35, 6], [32, 18]], [[32, 45], [28, 97], [52, 80]], + [[71, 122], [185, 94], [164, 134]]] + +# -----train val related----- +# base_lr_default * (your_bs 32 / default_bs (8 x 16)) +base_lr = _base_.base_lr * train_batch_size_per_gpu / (8 * 16) +load_from = 
'https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_l_syncbn_fast_8x16b-300e_coco/yolov7_l_syncbn_fast_8x16b-300e_coco_20221123_023601-8113c0eb.pth' # noqa + +# default hooks +save_epoch_intervals = 10 +max_epochs = 100 +max_keep_ckpts = 1 + +# train_cfg +val_interval = 2 +val_begin = 20 + +tta_model = None +tta_pipeline = None + +visualizer = dict( + vis_backends=[dict(type='LocalVisBackend'), + dict(type='WandbVisBackend')]) + +# ===================== Unmodified in most cases ================== +model = dict( + bbox_head=dict( + head_module=dict(num_classes=num_classes), + prior_generator=dict(base_sizes=anchors), + loss_cls=dict(loss_weight=_base_.loss_cls_weight * + (num_classes / 80 * 3 / _base_.num_det_layers)))) + +train_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + metainfo=metainfo, + data_root=data_root, + ann_file=train_ann_file, + data_prefix=dict(img=train_data_prefix))) + +val_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + metainfo=metainfo, + data_root=data_root, + data_prefix=dict(img=val_data_prefix), + ann_file=val_ann_file)) + +test_dataloader = dict( + batch_size=train_batch_size_per_gpu, + num_workers=train_num_workers, + dataset=dict( + metainfo=metainfo, + data_root=data_root, + data_prefix=dict(img=test_data_prefix), + ann_file=test_ann_file)) + +optim_wrapper = dict( + optimizer=dict(lr=base_lr, batch_size_per_gpu=train_batch_size_per_gpu)) + +default_hooks = dict( + param_scheduler=dict(max_epochs=max_epochs), + checkpoint=dict( + interval=save_epoch_intervals, max_keep_ckpts=max_keep_ckpts)) + +val_evaluator = dict(ann_file=data_root + val_ann_file) +test_evaluator = dict(ann_file=data_root + test_ann_file) + +train_cfg = dict( + type='EpochBasedTrainLoop', + max_epochs=max_epochs, + val_begin=val_begin, + val_interval=val_interval) diff --git a/third_party/mmyolo/projects/misc/ionogram_detection/yolov7/yolov7_tiny_fast_1xb16-100e_ionogram.py b/third_party/mmyolo/projects/misc/ionogram_detection/yolov7/yolov7_tiny_fast_1xb16-100e_ionogram.py new file mode 100644 index 0000000000000000000000000000000000000000..9c2d63ddeefaa50d3e180c558b1eec2e45180d46 --- /dev/null +++ b/third_party/mmyolo/projects/misc/ionogram_detection/yolov7/yolov7_tiny_fast_1xb16-100e_ionogram.py @@ -0,0 +1,101 @@ +_base_ = './yolov7_l_fast_1xb16-100e_ionogram.py' + +# ======================== Modified parameters ======================= +# pre-train +load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_tiny_syncbn_fast_8x16b-300e_coco/yolov7_tiny_syncbn_fast_8x16b-300e_coco_20221126_102719-0ee5bbdf.pth' # noqa + +# -----model related----- +# Data augmentation +max_translate_ratio = 0.1 # YOLOv5RandomAffine +scaling_ratio_range = (0.5, 1.6) # YOLOv5RandomAffine +mixup_prob = 0.05 # YOLOv5MixUp +randchoice_mosaic_prob = [0.8, 0.2] +mixup_alpha = 8.0 # YOLOv5MixUp +mixup_beta = 8.0 # YOLOv5MixUp + +# -----train val related----- +loss_cls_weight = 0.5 +loss_obj_weight = 1.0 + +lr_factor = 0.01 # Learning rate scaling factor + +# ====================== Unmodified in most cases ==================== +num_classes = _base_.num_classes +num_det_layers = _base_.num_det_layers +img_scale = _base_.img_scale +pre_transform = _base_.pre_transform +model = dict( + backbone=dict( + arch='Tiny', act_cfg=dict(type='LeakyReLU', negative_slope=0.1)), + neck=dict( + is_tiny_version=True, + in_channels=[128, 256, 512], + out_channels=[64, 128, 256], + block_cfg=dict( + 
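# `_delete_=True` below makes the MMEngine config system discard the base config's block_cfg entirely and use only the keys given here, instead of merging the two dicts. +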
_delete_=True, type='TinyDownSampleBlock', middle_ratio=0.25), + act_cfg=dict(type='LeakyReLU', negative_slope=0.1), + use_repconv_outs=False), + bbox_head=dict( + head_module=dict(in_channels=[128, 256, 512]), + loss_cls=dict(loss_weight=loss_cls_weight * + (num_classes / 80 * 3 / num_det_layers)), + loss_obj=dict(loss_weight=loss_obj_weight * + ((img_scale[0] / 640)**2 * 3 / num_det_layers)))) + +mosiac4_pipeline = [ + dict( + type='Mosaic', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_translate_ratio=max_translate_ratio, # change + scaling_ratio_range=scaling_ratio_range, # change + # img_scale is (width, height) + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)), +] + +mosiac9_pipeline = [ + dict( + type='Mosaic9', + img_scale=img_scale, + pad_val=114.0, + pre_transform=pre_transform), + dict( + type='YOLOv5RandomAffine', + max_rotate_degree=0.0, + max_shear_degree=0.0, + max_translate_ratio=max_translate_ratio, # change + scaling_ratio_range=scaling_ratio_range, # change + border=(-img_scale[0] // 2, -img_scale[1] // 2), + border_val=(114, 114, 114)), +] + +randchoice_mosaic_pipeline = dict( + type='RandomChoice', + transforms=[mosiac4_pipeline, mosiac9_pipeline], + prob=randchoice_mosaic_prob) + +train_pipeline = [ + *pre_transform, + randchoice_mosaic_pipeline, + dict( + type='YOLOv5MixUp', + alpha=mixup_alpha, + beta=mixup_beta, + prob=mixup_prob, # change + pre_transform=[*pre_transform, randchoice_mosaic_pipeline]), + dict(type='YOLOv5HSVRandomAug'), + dict(type='mmdet.RandomFlip', prob=0.5), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip', + 'flip_direction')) +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +default_hooks = dict(param_scheduler=dict(lr_factor=lr_factor)) diff --git a/third_party/mmyolo/projects/misc/ionogram_detection/yolov7/yolov7_x_fast_1xb16-100e_ionogram.py b/third_party/mmyolo/projects/misc/ionogram_detection/yolov7/yolov7_x_fast_1xb16-100e_ionogram.py new file mode 100644 index 0000000000000000000000000000000000000000..606232a6619278e9583276ee89a9c4c340e3e8db --- /dev/null +++ b/third_party/mmyolo/projects/misc/ionogram_detection/yolov7/yolov7_x_fast_1xb16-100e_ionogram.py @@ -0,0 +1,19 @@ +_base_ = './yolov7_l_fast_1xb16-100e_ionogram.py' + +# ======================== Modified parameters ======================= +load_from = 'https://download.openmmlab.com/mmyolo/v0/yolov7/yolov7_x_syncbn_fast_8x16b-300e_coco/yolov7_x_syncbn_fast_8x16b-300e_coco_20221124_215331-ef949a68.pth' # noqa + +# ===================== Unmodified in most cases ================== +model = dict( + backbone=dict(arch='X'), + neck=dict( + in_channels=[640, 1280, 1280], + out_channels=[160, 320, 640], + block_cfg=dict( + type='ELANBlock', + middle_ratio=0.4, + block_ratio=0.4, + num_blocks=3, + num_convs_in_block=2), + use_repconv_outs=False), + bbox_head=dict(head_module=dict(in_channels=[320, 640, 1280]))) diff --git a/third_party/mmyolo/pytest.ini b/third_party/mmyolo/pytest.ini new file mode 100644 index 0000000000000000000000000000000000000000..9796e871e70c7c67345b1d6bcf708c0c82377a98 --- /dev/null +++ b/third_party/mmyolo/pytest.ini @@ -0,0 +1,7 @@ +[pytest] +addopts = --xdoctest --xdoctest-style=auto +norecursedirs = .git ignore build __pycache__ data docker docs .eggs + +filterwarnings= default + ignore:.*No cfgstr given in Cacher constructor or 
call.*:Warning + ignore:.*Define the __nice__ method for.*:Warning diff --git a/third_party/mmyolo/requirements.txt b/third_party/mmyolo/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..5f50cbdc09d6389264f87e2aa1a576a81990e66a --- /dev/null +++ b/third_party/mmyolo/requirements.txt @@ -0,0 +1,3 @@ +-r requirements/build.txt +-r requirements/runtime.txt +-r requirements/tests.txt diff --git a/third_party/mmyolo/requirements/albu.txt b/third_party/mmyolo/requirements/albu.txt new file mode 100644 index 0000000000000000000000000000000000000000..2957391ba9d71f694c74257b42e194529c11879f --- /dev/null +++ b/third_party/mmyolo/requirements/albu.txt @@ -0,0 +1 @@ +albumentations --no-binary qudida,albumentations diff --git a/third_party/mmyolo/requirements/build.txt b/third_party/mmyolo/requirements/build.txt new file mode 100644 index 0000000000000000000000000000000000000000..c96c69aae6a2dfd7d8329707c7a7fe77e0b91f99 --- /dev/null +++ b/third_party/mmyolo/requirements/build.txt @@ -0,0 +1,3 @@ +# These must be installed before building mmyolo +cython +numpy diff --git a/third_party/mmyolo/requirements/docs.txt b/third_party/mmyolo/requirements/docs.txt new file mode 100644 index 0000000000000000000000000000000000000000..4933cc9e231c786a22ce41f43373d071d46242a9 --- /dev/null +++ b/third_party/mmyolo/requirements/docs.txt @@ -0,0 +1,13 @@ +docutils==0.16.0 +mmcv>=2.0.0rc4,<2.1.0 +mmdet>=3.0.0 +mmengine>=0.7.1 +myst-parser +-e git+https://github.com/open-mmlab/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme +sphinx==4.0.2 +sphinx-copybutton +sphinx_markdown_tables +sphinx_rtd_theme==0.5.2 +torch +torchvision +urllib3<2.0.0 diff --git a/third_party/mmyolo/requirements/mminstall.txt b/third_party/mmyolo/requirements/mminstall.txt new file mode 100644 index 0000000000000000000000000000000000000000..f078af14293815b3ea1e3eeee8a953763bd040f2 --- /dev/null +++ b/third_party/mmyolo/requirements/mminstall.txt @@ -0,0 +1,3 @@ +mmcv>=2.0.0rc4,<2.1.0 +mmdet>=3.0.0 +mmengine>=0.7.1 diff --git a/third_party/mmyolo/requirements/mmpose.txt b/third_party/mmyolo/requirements/mmpose.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e4726e68452ccd045940fa9df95681d9d44c2cf --- /dev/null +++ b/third_party/mmyolo/requirements/mmpose.txt @@ -0,0 +1 @@ +mmpose>=1.0.0 diff --git a/third_party/mmyolo/requirements/mmrotate.txt b/third_party/mmyolo/requirements/mmrotate.txt new file mode 100644 index 0000000000000000000000000000000000000000..15f05d38e76ce50f84535abcbe40109aadd1e1cb --- /dev/null +++ b/third_party/mmyolo/requirements/mmrotate.txt @@ -0,0 +1 @@ +mmrotate>=1.0.0rc1 diff --git a/third_party/mmyolo/requirements/runtime.txt b/third_party/mmyolo/requirements/runtime.txt new file mode 100644 index 0000000000000000000000000000000000000000..794a9cab5748caf8059c4a610e7782bef321841f --- /dev/null +++ b/third_party/mmyolo/requirements/runtime.txt @@ -0,0 +1,2 @@ +numpy +prettytable diff --git a/third_party/mmyolo/requirements/sahi.txt b/third_party/mmyolo/requirements/sahi.txt new file mode 100644 index 0000000000000000000000000000000000000000..0e7b7b842fdc0ead64ce78615c99daa7420bddb9 --- /dev/null +++ b/third_party/mmyolo/requirements/sahi.txt @@ -0,0 +1 @@ +sahi>=0.11.4 diff --git a/third_party/mmyolo/requirements/tests.txt b/third_party/mmyolo/requirements/tests.txt new file mode 100644 index 0000000000000000000000000000000000000000..285b3f3969a2137639e694b3b1652166bc43b177 --- /dev/null +++ b/third_party/mmyolo/requirements/tests.txt @@ -0,0 +1,17 @@ +flake8 
+interrogate +isort==4.3.21 +# Note: used for kwarray.group_items, this may be ported to mmcv in the future. +kwarray +memory_profiler +mmcls>=1.0.0rc4 +mmpose>=1.0.0 +mmrazor>=1.0.0rc2 +mmrotate>=1.0.0rc1 +parameterized +protobuf<=3.20.1 +psutil +pytest +ubelt +xdoctest>=0.10.0 +yapf diff --git a/third_party/mmyolo/resources/mmyolo-logo.png b/third_party/mmyolo/resources/mmyolo-logo.png new file mode 100644 index 0000000000000000000000000000000000000000..41318aec92d86749d327bc5f9b9c689632ffc735 Binary files /dev/null and b/third_party/mmyolo/resources/mmyolo-logo.png differ diff --git a/third_party/mmyolo/resources/qq_group_qrcode.jpg b/third_party/mmyolo/resources/qq_group_qrcode.jpg new file mode 100644 index 0000000000000000000000000000000000000000..95c4bd1b56367798b632133112e6392ef637debf Binary files /dev/null and b/third_party/mmyolo/resources/qq_group_qrcode.jpg differ diff --git a/third_party/mmyolo/resources/zhihu_qrcode.jpg b/third_party/mmyolo/resources/zhihu_qrcode.jpg new file mode 100644 index 0000000000000000000000000000000000000000..c745fb027f06564d41794e9a40069b06c34e2bb5 Binary files /dev/null and b/third_party/mmyolo/resources/zhihu_qrcode.jpg differ diff --git a/third_party/mmyolo/setup.cfg b/third_party/mmyolo/setup.cfg new file mode 100644 index 0000000000000000000000000000000000000000..d30673d0f6242fef3381b4171f9ec208b7f7bc3d --- /dev/null +++ b/third_party/mmyolo/setup.cfg @@ -0,0 +1,21 @@ +[isort] +line_length = 79 +multi_line_output = 0 +extra_standard_library = setuptools +known_first_party = mmyolo +known_third_party = PIL,asynctest,cityscapesscripts,cv2,gather_models,matplotlib,mmcv,numpy,onnx,onnxruntime,pycocotools,pytest,parameterized,pytorch_sphinx_theme,requests,scipy,seaborn,six,terminaltables,torch,ts,yaml,mmengine,mmdet,mmdeploy +no_lines_before = STDLIB,LOCALFOLDER +default_section = THIRDPARTY + +[yapf] +BASED_ON_STYLE = pep8 +BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF = true +SPLIT_BEFORE_EXPRESSION_AFTER_OPENING_PAREN = true + +# ignore-words-list needs to be lowercase format. For example, if we want to +# ignore word "BA", then we need to append "ba" to ignore-words-list rather +# than "BA" +[codespell] +skip = *.ipynb +quiet-level = 3 +ignore-words-list = patten,nd,ty,mot,hist,formating,winn,gool,datas,wan,confids,tood,ba,warmup,elease,dota diff --git a/third_party/mmyolo/setup.py b/third_party/mmyolo/setup.py new file mode 100755 index 0000000000000000000000000000000000000000..f37c89791fee95fb321d66a479f13420f64aa5b9 --- /dev/null +++ b/third_party/mmyolo/setup.py @@ -0,0 +1,191 @@ +#!/usr/bin/env python +# Copyright (c) OpenMMLab. All rights reserved. +import os +import os.path as osp +import platform +import shutil +import sys +import warnings +from setuptools import find_packages, setup + +from torch.utils.cpp_extension import BuildExtension + + +def readme(): + with open('README.md', encoding='utf-8') as f: + content = f.read() + return content + + +version_file = 'mmyolo/version.py' + + +def get_version(): + with open(version_file) as f: + exec(compile(f.read(), version_file, 'exec')) + return locals()['__version__'] + + +def parse_requirements(fname='requirements.txt', with_version=True): + """Parse the package dependencies listed in a requirements file but strips + specific versioning information. 
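+ For example, with ``with_version=False`` a requirements line such as ``numpy>=1.20`` is returned simply as ``numpy``.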
+ + Args: + fname (str): path to requirements file + with_version (bool, default=False): if True include version specs + + Returns: + List[str]: list of requirements items + + CommandLine: + python -c "import setup; print(setup.parse_requirements())" + """ + import re + import sys + from os.path import exists + require_fpath = fname + + def parse_line(line): + """Parse information from a line in a requirements text file.""" + if line.startswith('-r '): + # Allow specifying requirements in other files + target = line.split(' ')[1] + for info in parse_require_file(target): + yield info + else: + info = {'line': line} + if line.startswith('-e '): + info['package'] = line.split('#egg=')[1] + elif '@git+' in line: + info['package'] = line + else: + # Remove versioning from the package + pat = '(' + '|'.join(['>=', '==', '>']) + ')' + parts = re.split(pat, line, maxsplit=1) + parts = [p.strip() for p in parts] + + info['package'] = parts[0] + if len(parts) > 1: + op, rest = parts[1:] + if ';' in rest: + # Handle platform specific dependencies + # http://setuptools.readthedocs.io/en/latest/setuptools.html#declaring-platform-specific-dependencies + version, platform_deps = map(str.strip, + rest.split(';')) + info['platform_deps'] = platform_deps + else: + version = rest # NOQA + info['version'] = (op, version) + yield info + + def parse_require_file(fpath): + with open(fpath) as f: + for line in f.readlines(): + line = line.strip() + if line and not line.startswith('#'): + yield from parse_line(line) + + def gen_packages_items(): + if exists(require_fpath): + for info in parse_require_file(require_fpath): + parts = [info['package']] + if with_version and 'version' in info: + parts.extend(info['version']) + if not sys.version.startswith('3.4'): + # apparently package_deps are broken in 3.4 + platform_deps = info.get('platform_deps') + if platform_deps is not None: + parts.append(';' + platform_deps) + item = ''.join(parts) + yield item + + packages = list(gen_packages_items()) + return packages + + +def add_mim_extension(): + """Add extra files that are required to support MIM into the package. + + These files will be added by creating a symlink to the originals if the + package is installed in `editable` mode (e.g. pip install -e .), or by + copying from the originals otherwise. + """ + + # parse installment mode + if 'develop' in sys.argv: + # installed by `pip install -e .` + if platform.system() == 'Windows': + # set `copy` mode here since symlink fails on Windows. 
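+ # Creating symlinks on Windows typically requires administrator rights or Developer Mode, so copying is the more portable fallback.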
+ mode = 'copy' + else: + mode = 'symlink' + elif 'sdist' in sys.argv or 'bdist_wheel' in sys.argv: + # installed by `pip install .` + # or create source distribution by `python setup.py sdist` + mode = 'copy' + else: + return + + filenames = ['tools', 'configs', 'demo', 'model-index.yml'] + repo_path = osp.dirname(__file__) + mim_path = osp.join(repo_path, 'mmyolo', '.mim') + os.makedirs(mim_path, exist_ok=True) + + for filename in filenames: + if osp.exists(filename): + src_path = osp.join(repo_path, filename) + tar_path = osp.join(mim_path, filename) + + if osp.isfile(tar_path) or osp.islink(tar_path): + os.remove(tar_path) + elif osp.isdir(tar_path): + shutil.rmtree(tar_path) + + if mode == 'symlink': + src_relpath = osp.relpath(src_path, osp.dirname(tar_path)) + os.symlink(src_relpath, tar_path) + elif mode == 'copy': + if osp.isfile(src_path): + shutil.copyfile(src_path, tar_path) + elif osp.isdir(src_path): + shutil.copytree(src_path, tar_path) + else: + warnings.warn(f'Cannot copy file {src_path}.') + else: + raise ValueError(f'Invalid mode {mode}') + + +if __name__ == '__main__': + add_mim_extension() + setup( + name='mmyolo', + version=get_version(), + description='OpenMMLab Toolbox of YOLO', + long_description=readme(), + long_description_content_type='text/markdown', + author='MMYOLO Contributors', + author_email='openmmlab@gmail.com', + keywords='computer vision, object detection', + url='https://github.com/open-mmlab/mmyolo', + packages=find_packages(exclude=('configs', 'tools', 'demo')), + include_package_data=True, + classifiers=[ + 'Development Status :: 5 - Production/Stable', + 'License :: OSI Approved :: Apache Software License', + 'Operating System :: OS Independent', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.7', + 'Programming Language :: Python :: 3.8', + 'Programming Language :: Python :: 3.9', + ], + license='GPL License 3.0', + install_requires=parse_requirements('requirements/runtime.txt'), + extras_require={ + 'all': parse_requirements('requirements.txt'), + 'tests': parse_requirements('requirements/tests.txt'), + 'build': parse_requirements('requirements/build.txt'), + 'mim': parse_requirements('requirements/mminstall.txt'), + }, + ext_modules=[], + cmdclass={'build_ext': BuildExtension}, + zip_safe=False) diff --git a/third_party/mmyolo/tests/regression/mmyolo.yml b/third_party/mmyolo/tests/regression/mmyolo.yml new file mode 100644 index 0000000000000000000000000000000000000000..55eaec38e1d7a7d3ef524928a1896c97f39633e4 --- /dev/null +++ b/third_party/mmyolo/tests/regression/mmyolo.yml @@ -0,0 +1,81 @@ +globals: + codebase_dir: ../mmyolo + checkpoint_force_download: False + images: + input_img: &input_img ../mmyolo/demo/demo.jpg + test_img: &test_img ./tests/data/tiger.jpeg + metric_info: &metric_info + box AP: # named after metafile.Results.Metrics + metric_key: coco/bbox_mAP # eval OrderedDict key name + tolerance: 1 # metric ±n% + multi_value: 100 + convert_image: &convert_image + input_img: *input_img + test_img: *test_img + backend_test: &default_backend_test True + +onnxruntime: + pipeline_ort_static_fp32: &pipeline_ort_static_fp32 + convert_image: *convert_image + backend_test: False + deploy_config: configs/mmyolo/detection_onnxruntime_static.py + + pipeline_ort_dynamic_fp32: &pipeline_ort_dynamic_fp32 + convert_image: *convert_image + backend_test: False + deploy_config: configs/mmyolo/detection_onnxruntime_dynamic.py + +tensorrt: + pipeline_trt_static_fp32: &pipeline_trt_static_fp32_640x640 + convert_image: 
*convert_image + backend_test: False + deploy_config: configs/mmyolo/detection_tensorrt_static-640x640.py + + pipeline_trt_static_fp16: &pipeline_trt_static_fp16_640x640 + convert_image: *convert_image + backend_test: False + deploy_config: configs/mmyolo/detection_tensorrt-fp16_static-640x640.py + + pipeline_trt_dynamic_fp32: &pipeline_trt_dynamic_fp32 + convert_image: *convert_image + backend_test: *default_backend_test + deploy_config: configs/mmyolo/detection_tensorrt_dynamic-192x192-960x960.py + + pipeline_trt_dynamic_fp16: &pipeline_trt_dynamic_fp16 + convert_image: *convert_image + backend_test: *default_backend_test + deploy_config: configs/mmyolo/detection_tensorrt-fp16_dynamic-64x64-1344x1344.py + +models: + - name: YOLOv5 + metafile: configs/yolov5/metafile.yml + model_configs: + - configs/yolov5/yolov5_s-p6-v62_syncbn_fast_8xb16-300e_coco.py + pipelines: + - *pipeline_ort_dynamic_fp32 + - *pipeline_trt_dynamic_fp16 + + - name: YOLOv6 + metafile: configs/yolov6/metafile.yml + model_configs: + - configs/yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco.py + pipelines: + - *pipeline_ort_dynamic_fp32 + - *pipeline_trt_dynamic_fp16 + + - name: YOLOX + metafile: configs/yolox/metafile.yml + model_configs: + - configs/yolox/yolox_s_8xb8-300e_coco.py + pipelines: + - *pipeline_ort_dynamic_fp32 + - *pipeline_trt_dynamic_fp16 + + + - name: RTMDet + metafile: configs/rtmdet/metafile.yml + model_configs: + - configs/rtmdet/rtmdet_s_syncbn_8xb32-300e_coco.py + pipelines: + - *pipeline_ort_dynamic_fp32 + - *pipeline_trt_dynamic_fp16 diff --git a/third_party/mmyolo/tests/test_datasets/__init__.py b/third_party/mmyolo/tests/test_datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ef101fec61e72abc0eb90266d453b5b22331378d --- /dev/null +++ b/third_party/mmyolo/tests/test_datasets/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. diff --git a/third_party/mmyolo/tests/test_datasets/test_transforms/__init__.py b/third_party/mmyolo/tests/test_datasets/test_transforms/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ef101fec61e72abc0eb90266d453b5b22331378d --- /dev/null +++ b/third_party/mmyolo/tests/test_datasets/test_transforms/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. diff --git a/third_party/mmyolo/tests/test_datasets/test_transforms/test_formatting.py b/third_party/mmyolo/tests/test_datasets/test_transforms/test_formatting.py new file mode 100644 index 0000000000000000000000000000000000000000..c75475dfcfb4e32f656a194d55fc162a165107b3 --- /dev/null +++ b/third_party/mmyolo/tests/test_datasets/test_transforms/test_formatting.py @@ -0,0 +1,119 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import os.path as osp +import unittest + +import numpy as np +from mmdet.structures import DetDataSample +from mmdet.structures.mask import BitmapMasks +from mmengine.structures import InstanceData, PixelData + +from mmyolo.datasets.transforms import PackDetInputs + + +class TestPackDetInputs(unittest.TestCase): + + def setUp(self): + """Setup the model and optimizer which are used in every test method. 
+ + TestCase calls functions in this order: setUp() -> testMethod() -> + tearDown() -> cleanUp() + """ + data_prefix = osp.join(osp.dirname(__file__), '../../data') + img_path = osp.join(data_prefix, 'color.jpg') + rng = np.random.RandomState(0) + self.results1 = { + 'img_id': 1, + 'img_path': img_path, + 'ori_shape': (300, 400), + 'img_shape': (600, 800), + 'scale_factor': 2.0, + 'flip': False, + 'img': rng.rand(300, 400), + 'gt_seg_map': rng.rand(300, 400), + 'gt_masks': + BitmapMasks(rng.rand(3, 300, 400), height=300, width=400), + 'gt_bboxes_labels': rng.rand(3, ), + 'gt_ignore_flags': np.array([0, 0, 1], dtype=bool), + 'proposals': rng.rand(2, 4), + 'proposals_scores': rng.rand(2, ) + } + self.results2 = { + 'img_id': 1, + 'img_path': img_path, + 'ori_shape': (300, 400), + 'img_shape': (600, 800), + 'scale_factor': 2.0, + 'flip': False, + 'img': rng.rand(300, 400), + 'gt_seg_map': rng.rand(300, 400), + 'gt_masks': + BitmapMasks(rng.rand(3, 300, 400), height=300, width=400), + 'gt_bboxes_labels': rng.rand(3, ), + 'proposals': rng.rand(2, 4), + 'proposals_scores': rng.rand(2, ) + } + self.results3 = { + 'img_id': 1, + 'img_path': img_path, + 'ori_shape': (300, 400), + 'img_shape': (600, 800), + 'scale_factor': 2.0, + 'flip': False, + 'img': rng.rand(300, 400), + 'gt_seg_map': rng.rand(300, 400), + 'gt_masks': + BitmapMasks(rng.rand(3, 300, 400), height=300, width=400), + 'gt_panoptic_seg': rng.rand(1, 300, 400), + 'gt_bboxes_labels': rng.rand(3, ), + 'proposals': rng.rand(2, 4), + 'proposals_scores': rng.rand(2, ) + } + self.meta_keys = ('img_id', 'img_path', 'ori_shape', 'scale_factor', + 'flip') + + def test_transform(self): + transform = PackDetInputs(meta_keys=self.meta_keys) + results = transform(copy.deepcopy(self.results1)) + self.assertIn('data_samples', results) + self.assertIsInstance(results['data_samples'], DetDataSample) + self.assertIsInstance(results['data_samples'].gt_instances, + InstanceData) + self.assertIsInstance(results['data_samples'].ignored_instances, + InstanceData) + self.assertEqual(len(results['data_samples'].gt_instances), 2) + self.assertEqual(len(results['data_samples'].ignored_instances), 1) + self.assertIsInstance(results['data_samples'].gt_sem_seg, PixelData) + + def test_transform_without_ignore(self): + transform = PackDetInputs(meta_keys=self.meta_keys) + results = transform(copy.deepcopy(self.results2)) + self.assertIn('data_samples', results) + self.assertIsInstance(results['data_samples'], DetDataSample) + self.assertIsInstance(results['data_samples'].gt_instances, + InstanceData) + self.assertIsInstance(results['data_samples'].ignored_instances, + InstanceData) + self.assertEqual(len(results['data_samples'].gt_instances), 3) + self.assertEqual(len(results['data_samples'].ignored_instances), 0) + self.assertIsInstance(results['data_samples'].gt_sem_seg, PixelData) + + def test_transform_with_panoptic_seg(self): + transform = PackDetInputs(meta_keys=self.meta_keys) + results = transform(copy.deepcopy(self.results3)) + self.assertIn('data_samples', results) + self.assertIsInstance(results['data_samples'], DetDataSample) + self.assertIsInstance(results['data_samples'].gt_instances, + InstanceData) + self.assertIsInstance(results['data_samples'].ignored_instances, + InstanceData) + self.assertEqual(len(results['data_samples'].gt_instances), 3) + self.assertEqual(len(results['data_samples'].ignored_instances), 0) + self.assertIsInstance(results['data_samples'].gt_sem_seg, PixelData) + self.assertIsInstance(results['data_samples'].gt_panoptic_seg, + 
PixelData) + + def test_repr(self): + transform = PackDetInputs(meta_keys=self.meta_keys) + self.assertEqual( + repr(transform), f'PackDetInputs(meta_keys={self.meta_keys})') diff --git a/third_party/mmyolo/tests/test_datasets/test_transforms/test_mix_img_transforms.py b/third_party/mmyolo/tests/test_datasets/test_transforms/test_mix_img_transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..2e9bf20e39572c946e1b66bdf87626a0c243ac29 --- /dev/null +++ b/third_party/mmyolo/tests/test_datasets/test_transforms/test_mix_img_transforms.py @@ -0,0 +1,416 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import os.path as osp +import unittest + +import numpy as np +import torch +from mmdet.structures.bbox import HorizontalBoxes +from mmdet.structures.mask import BitmapMasks, PolygonMasks + +from mmyolo.datasets import YOLOv5CocoDataset +from mmyolo.datasets.transforms import Mosaic, Mosaic9, YOLOv5MixUp, YOLOXMixUp +from mmyolo.utils import register_all_modules + +register_all_modules() + + +class TestMosaic(unittest.TestCase): + + def setUp(self): + """Setup the data info which are used in every test method. + + TestCase calls functions in this order: setUp() -> testMethod() -> + tearDown() -> cleanUp() + """ + self.pre_transform = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True) + ] + + self.dataset = YOLOv5CocoDataset( + data_prefix=dict( + img=osp.join(osp.dirname(__file__), '../../data')), + ann_file=osp.join( + osp.dirname(__file__), '../../data/coco_sample_color.json'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[]) + self.results = { + 'img': + np.random.random((224, 224, 3)), + 'img_shape': (224, 224), + 'gt_bboxes_labels': + np.array([1, 2, 3], dtype=np.int64), + 'gt_bboxes': + np.array([[10, 10, 20, 20], [20, 20, 40, 40], [40, 40, 80, 80]], + dtype=np.float32), + 'gt_ignore_flags': + np.array([0, 0, 1], dtype=bool), + 'dataset': + self.dataset + } + + def test_transform(self): + # test assertion for invalid img_scale + with self.assertRaises(AssertionError): + transform = Mosaic(img_scale=640) + + # test assertion for invalid probability + with self.assertRaises(AssertionError): + transform = Mosaic(prob=1.5) + + # test assertion for invalid max_cached_images + with self.assertRaises(AssertionError): + transform = Mosaic(use_cached=True, max_cached_images=1) + + transform = Mosaic( + img_scale=(12, 10), pre_transform=self.pre_transform) + results = transform(copy.deepcopy(self.results)) + self.assertTrue(results['img'].shape[:2] == (20, 24)) + self.assertTrue(results['gt_bboxes_labels'].shape[0] == + results['gt_bboxes'].shape[0]) + self.assertTrue(results['gt_bboxes_labels'].dtype == np.int64) + self.assertTrue(results['gt_bboxes'].dtype == np.float32) + self.assertTrue(results['gt_ignore_flags'].dtype == bool) + + def test_transform_with_no_gt(self): + self.results['gt_bboxes'] = np.empty((0, 4), dtype=np.float32) + self.results['gt_bboxes_labels'] = np.empty((0, ), dtype=np.int64) + self.results['gt_ignore_flags'] = np.empty((0, ), dtype=bool) + transform = Mosaic( + img_scale=(12, 10), pre_transform=self.pre_transform) + results = transform(copy.deepcopy(self.results)) + self.assertIsInstance(results, dict) + self.assertTrue(results['img'].shape[:2] == (20, 24)) + self.assertTrue( + results['gt_bboxes_labels'].shape[0] == results['gt_bboxes']. 
+ shape[0] == results['gt_ignore_flags'].shape[0]) + self.assertTrue(results['gt_bboxes_labels'].dtype == np.int64) + self.assertTrue(results['gt_bboxes'].dtype == np.float32) + self.assertTrue(results['gt_ignore_flags'].dtype == bool) + + def test_transform_with_box_list(self): + transform = Mosaic( + img_scale=(12, 10), pre_transform=self.pre_transform) + results = copy.deepcopy(self.results) + results['gt_bboxes'] = HorizontalBoxes(results['gt_bboxes']) + results = transform(results) + self.assertTrue(results['img'].shape[:2] == (20, 24)) + self.assertTrue(results['gt_bboxes_labels'].shape[0] == + results['gt_bboxes'].shape[0]) + self.assertTrue(results['gt_bboxes_labels'].dtype == np.int64) + self.assertTrue(results['gt_bboxes'].dtype == torch.float32) + self.assertTrue(results['gt_ignore_flags'].dtype == bool) + + def test_transform_with_mask(self): + rng = np.random.RandomState(0) + pre_transform = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True) + ] + + dataset = YOLOv5CocoDataset( + data_prefix=dict( + img=osp.join(osp.dirname(__file__), '../../data')), + ann_file=osp.join( + osp.dirname(__file__), '../../data/coco_sample_color.json'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[]) + results = { + 'img': + np.random.random((224, 224, 3)), + 'img_shape': (224, 224), + 'gt_bboxes_labels': + np.array([1, 2, 3], dtype=np.int64), + 'gt_bboxes': + np.array([[10, 10, 20, 20], [20, 20, 40, 40], [40, 40, 80, 80]], + dtype=np.float32), + 'gt_ignore_flags': + np.array([0, 0, 1], dtype=bool), + 'gt_masks': + PolygonMasks.random(num_masks=3, height=224, width=224, rng=rng), + 'dataset': + dataset + } + transform = Mosaic(img_scale=(12, 10), pre_transform=pre_transform) + results['gt_bboxes'] = HorizontalBoxes(results['gt_bboxes']) + results = transform(results) + self.assertTrue(results['img'].shape[:2] == (20, 24)) + self.assertTrue(results['gt_bboxes_labels'].shape[0] == + results['gt_bboxes'].shape[0]) + self.assertTrue(results['gt_bboxes_labels'].dtype == np.int64) + self.assertTrue(results['gt_bboxes'].dtype == torch.float32) + self.assertTrue(results['gt_ignore_flags'].dtype == bool) + + +class TestMosaic9(unittest.TestCase): + + def setUp(self): + """Setup the data info which are used in every test method. 
+ + TestCase calls functions in this order: setUp() -> testMethod() -> + tearDown() -> cleanUp() + """ + rng = np.random.RandomState(0) + self.pre_transform = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True) + ] + + self.dataset = YOLOv5CocoDataset( + data_prefix=dict( + img=osp.join(osp.dirname(__file__), '../../data')), + ann_file=osp.join( + osp.dirname(__file__), '../../data/coco_sample_color.json'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[]) + self.results = { + 'img': + np.random.random((224, 224, 3)), + 'img_shape': (224, 224), + 'gt_bboxes_labels': + np.array([1, 2, 3], dtype=np.int64), + 'gt_bboxes': + np.array([[10, 10, 20, 20], [20, 20, 40, 40], [40, 40, 80, 80]], + dtype=np.float32), + 'gt_ignore_flags': + np.array([0, 0, 1], dtype=bool), + 'gt_masks': + BitmapMasks(rng.rand(3, 224, 224), height=224, width=224), + 'dataset': + self.dataset + } + + def test_transform(self): + # test assertion for invalid img_scale + with self.assertRaises(AssertionError): + transform = Mosaic9(img_scale=640) + + # test assertion for invalid probability + with self.assertRaises(AssertionError): + transform = Mosaic9(prob=1.5) + + # test assertion for invalid max_cached_images + with self.assertRaises(AssertionError): + transform = Mosaic9(use_cached=True, max_cached_images=1) + + transform = Mosaic9( + img_scale=(12, 10), pre_transform=self.pre_transform) + results = transform(copy.deepcopy(self.results)) + self.assertTrue(results['img'].shape[:2] == (20, 24)) + self.assertTrue(results['gt_bboxes_labels'].shape[0] == + results['gt_bboxes'].shape[0]) + self.assertTrue(results['gt_bboxes_labels'].dtype == np.int64) + self.assertTrue(results['gt_bboxes'].dtype == np.float32) + self.assertTrue(results['gt_ignore_flags'].dtype == bool) + + def test_transform_with_no_gt(self): + self.results['gt_bboxes'] = np.empty((0, 4), dtype=np.float32) + self.results['gt_bboxes_labels'] = np.empty((0, ), dtype=np.int64) + self.results['gt_ignore_flags'] = np.empty((0, ), dtype=bool) + transform = Mosaic9( + img_scale=(12, 10), pre_transform=self.pre_transform) + results = transform(copy.deepcopy(self.results)) + self.assertIsInstance(results, dict) + self.assertTrue(results['img'].shape[:2] == (20, 24)) + self.assertTrue( + results['gt_bboxes_labels'].shape[0] == results['gt_bboxes']. + shape[0] == results['gt_ignore_flags'].shape[0]) + self.assertTrue(results['gt_bboxes_labels'].dtype == np.int64) + self.assertTrue(results['gt_bboxes'].dtype == np.float32) + self.assertTrue(results['gt_ignore_flags'].dtype == bool) + + def test_transform_with_box_list(self): + transform = Mosaic9( + img_scale=(12, 10), pre_transform=self.pre_transform) + results = copy.deepcopy(self.results) + results['gt_bboxes'] = HorizontalBoxes(results['gt_bboxes']) + results = transform(results) + self.assertTrue(results['img'].shape[:2] == (20, 24)) + self.assertTrue(results['gt_bboxes_labels'].shape[0] == + results['gt_bboxes'].shape[0]) + self.assertTrue(results['gt_bboxes_labels'].dtype == np.int64) + self.assertTrue(results['gt_bboxes'].dtype == torch.float32) + self.assertTrue(results['gt_ignore_flags'].dtype == bool) + + +class TestYOLOv5MixUp(unittest.TestCase): + + def setUp(self): + """Setup the data info which are used in every test method. 
+ + TestCase calls functions in this order: setUp() -> testMethod() -> + tearDown() -> cleanUp() + """ + self.pre_transform = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True) + ] + self.dataset = YOLOv5CocoDataset( + data_prefix=dict( + img=osp.join(osp.dirname(__file__), '../../data')), + ann_file=osp.join( + osp.dirname(__file__), '../../data/coco_sample_color.json'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[]) + + self.results = { + 'img': + np.random.random((288, 512, 3)), + 'img_shape': (288, 512), + 'gt_bboxes_labels': + np.array([1, 2, 3], dtype=np.int64), + 'gt_bboxes': + np.array([[10, 10, 20, 20], [20, 20, 40, 40], [40, 40, 80, 80]], + dtype=np.float32), + 'gt_ignore_flags': + np.array([0, 0, 1], dtype=bool), + 'dataset': + self.dataset + } + + def test_transform(self): + transform = YOLOv5MixUp(pre_transform=self.pre_transform) + results = transform(copy.deepcopy(self.results)) + self.assertTrue(results['img'].shape[:2] == (288, 512)) + self.assertTrue(results['gt_bboxes_labels'].shape[0] == + results['gt_bboxes'].shape[0]) + self.assertTrue(results['gt_bboxes_labels'].dtype == np.int64) + self.assertTrue(results['gt_bboxes'].dtype == np.float32) + self.assertTrue(results['gt_ignore_flags'].dtype == bool) + + # test assertion for invalid max_cached_images + with self.assertRaises(AssertionError): + transform = YOLOv5MixUp(use_cached=True, max_cached_images=1) + + def test_transform_with_box_list(self): + results = copy.deepcopy(self.results) + results['gt_bboxes'] = HorizontalBoxes(results['gt_bboxes']) + + transform = YOLOv5MixUp(pre_transform=self.pre_transform) + results = transform(results) + self.assertTrue(results['img'].shape[:2] == (288, 512)) + self.assertTrue(results['gt_bboxes_labels'].shape[0] == + results['gt_bboxes'].shape[0]) + self.assertTrue(results['gt_bboxes_labels'].dtype == np.int64) + self.assertTrue(results['gt_bboxes'].dtype == torch.float32) + self.assertTrue(results['gt_ignore_flags'].dtype == bool) + + def test_transform_with_mask(self): + rng = np.random.RandomState(0) + pre_transform = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True) + ] + dataset = YOLOv5CocoDataset( + data_prefix=dict( + img=osp.join(osp.dirname(__file__), '../../data')), + ann_file=osp.join( + osp.dirname(__file__), '../../data/coco_sample_color.json'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[]) + + results = { + 'img': + np.random.random((288, 512, 3)), + 'img_shape': (288, 512), + 'gt_bboxes_labels': + np.array([1, 2, 3], dtype=np.int64), + 'gt_bboxes': + np.array([[10, 10, 20, 20], [20, 20, 40, 40], [40, 40, 80, 80]], + dtype=np.float32), + 'gt_ignore_flags': + np.array([0, 0, 1], dtype=bool), + 'gt_masks': + PolygonMasks.random(num_masks=3, height=288, width=512, rng=rng), + 'dataset': + dataset + } + + transform = YOLOv5MixUp(pre_transform=pre_transform) + results = transform(copy.deepcopy(results)) + self.assertTrue(results['img'].shape[:2] == (288, 512)) + self.assertTrue(results['gt_bboxes_labels'].shape[0] == + results['gt_bboxes'].shape[0]) + self.assertTrue(results['gt_bboxes_labels'].dtype == np.int64) + self.assertTrue(results['gt_bboxes'].dtype == np.float32) + self.assertTrue(results['gt_ignore_flags'].dtype == bool) + + +class TestYOLOXMixUp(unittest.TestCase): + + def setUp(self): + """Setup the data info which are used in every test method. 
+ + TestCase calls functions in this order: setUp() -> testMethod() -> + tearDown() -> cleanUp() + """ + rng = np.random.RandomState(0) + self.pre_transform = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True) + ] + self.dataset = YOLOv5CocoDataset( + data_prefix=dict( + img=osp.join(osp.dirname(__file__), '../../data')), + ann_file=osp.join( + osp.dirname(__file__), '../../data/coco_sample_color.json'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[]) + self.results = { + 'img': + np.random.random((224, 224, 3)), + 'img_shape': (224, 224), + 'gt_bboxes_labels': + np.array([1, 2, 3], dtype=np.int64), + 'gt_bboxes': + np.array([[10, 10, 20, 20], [20, 20, 40, 40], [40, 40, 80, 80]], + dtype=np.float32), + 'gt_ignore_flags': + np.array([0, 0, 1], dtype=bool), + 'gt_masks': + BitmapMasks(rng.rand(3, 224, 224), height=224, width=224), + 'dataset': + self.dataset + } + + def test_transform(self): + # test assertion for invalid img_scale + with self.assertRaises(AssertionError): + transform = YOLOXMixUp(img_scale=640) + + # test assertion for invalid max_cached_images + with self.assertRaises(AssertionError): + transform = YOLOXMixUp(use_cached=True, max_cached_images=1) + + transform = YOLOXMixUp( + img_scale=(10, 12), + ratio_range=(0.8, 1.6), + pad_val=114.0, + pre_transform=self.pre_transform) + + # self.results['mix_results'] = [copy.deepcopy(self.results)] + results = transform(copy.deepcopy(self.results)) + self.assertTrue(results['img'].shape[:2] == (224, 224)) + self.assertTrue(results['gt_bboxes_labels'].shape[0] == + results['gt_bboxes'].shape[0]) + self.assertTrue(results['gt_bboxes_labels'].dtype == np.int64) + self.assertTrue(results['gt_bboxes'].dtype == np.float32) + self.assertTrue(results['gt_ignore_flags'].dtype == bool) + + def test_transform_with_boxlist(self): + results = copy.deepcopy(self.results) + results['gt_bboxes'] = HorizontalBoxes(results['gt_bboxes']) + + transform = YOLOXMixUp( + img_scale=(10, 12), + ratio_range=(0.8, 1.6), + pad_val=114.0, + pre_transform=self.pre_transform) + results = transform(results) + self.assertTrue(results['img'].shape[:2] == (224, 224)) + self.assertTrue(results['gt_bboxes_labels'].shape[0] == + results['gt_bboxes'].shape[0]) + self.assertTrue(results['gt_bboxes_labels'].dtype == np.int64) + self.assertTrue(results['gt_bboxes'].dtype == torch.float32) + self.assertTrue(results['gt_ignore_flags'].dtype == bool) diff --git a/third_party/mmyolo/tests/test_datasets/test_transforms/test_transforms.py b/third_party/mmyolo/tests/test_datasets/test_transforms/test_transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..a8b7ea49f875582a343829ec7142ed09a61fe51e --- /dev/null +++ b/third_party/mmyolo/tests/test_datasets/test_transforms/test_transforms.py @@ -0,0 +1,493 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import os.path as osp +import unittest + +import mmcv +import numpy as np +import torch +from mmdet.structures.bbox import HorizontalBoxes +from mmdet.structures.mask import BitmapMasks, PolygonMasks + +from mmyolo.datasets.transforms import (LetterResize, LoadAnnotations, + YOLOv5HSVRandomAug, + YOLOv5KeepRatioResize, + YOLOv5RandomAffine) +from mmyolo.datasets.transforms.transforms import (PPYOLOERandomCrop, + PPYOLOERandomDistort, + YOLOv5CopyPaste) + + +class TestLetterResize(unittest.TestCase): + + def setUp(self): + """Set up the data info which are used in every test method. 
+ + TestCase calls functions in this order: setUp() -> testMethod() -> + tearDown() -> cleanUp() + """ + rng = np.random.RandomState(0) + self.data_info1 = dict( + img=np.random.random((300, 400, 3)), + gt_bboxes=np.array([[0, 0, 150, 150]], dtype=np.float32), + batch_shape=np.array([192, 672], dtype=np.int64), + gt_masks=PolygonMasks.random(1, height=300, width=400, rng=rng)) + self.data_info2 = dict( + img=np.random.random((300, 400, 3)), + gt_bboxes=np.array([[0, 0, 150, 150]], dtype=np.float32)) + self.data_info3 = dict( + img=np.random.random((300, 400, 3)), + batch_shape=np.array([192, 672], dtype=np.int64)) + self.data_info4 = dict(img=np.random.random((300, 400, 3))) + + def test_letter_resize(self): + # Test allow_scale_up + transform = LetterResize(scale=(640, 640), allow_scale_up=False) + results = transform(copy.deepcopy(self.data_info1)) + self.assertEqual(results['img_shape'], (192, 672, 3)) + self.assertTrue( + (results['gt_bboxes'] == np.array([[208., 0., 304., 96.]])).all()) + self.assertTrue((results['batch_shape'] == np.array([192, 672])).all()) + self.assertTrue((results['pad_param'] == np.array([0., 0., 208., + 208.])).all()) + self.assertTrue( + (np.array(results['scale_factor'], dtype=np.float32) <= 1.).all()) + + # Test pad_val + transform = LetterResize(scale=(640, 640), pad_val=dict(img=144)) + results = transform(copy.deepcopy(self.data_info1)) + self.assertEqual(results['img_shape'], (192, 672, 3)) + self.assertTrue( + (results['gt_bboxes'] == np.array([[208., 0., 304., 96.]])).all()) + self.assertTrue((results['batch_shape'] == np.array([192, 672])).all()) + self.assertTrue((results['pad_param'] == np.array([0., 0., 208., + 208.])).all()) + self.assertTrue( + (np.array(results['scale_factor'], dtype=np.float32) <= 1.).all()) + + # Test use_mini_pad + transform = LetterResize(scale=(640, 640), use_mini_pad=True) + results = transform(copy.deepcopy(self.data_info1)) + self.assertEqual(results['img_shape'], (192, 256, 3)) + self.assertTrue((results['gt_bboxes'] == np.array([[0., 0., 96., + 96.]])).all()) + self.assertTrue((results['batch_shape'] == np.array([192, 672])).all()) + self.assertTrue((results['pad_param'] == np.array([0., 0., 0., + 0.])).all()) + self.assertTrue( + (np.array(results['scale_factor'], dtype=np.float32) <= 1.).all()) + + # Test stretch_only + transform = LetterResize(scale=(640, 640), stretch_only=True) + results = transform(copy.deepcopy(self.data_info1)) + self.assertEqual(results['img_shape'], (192, 672, 3)) + self.assertTrue((results['gt_bboxes'] == np.array( + [[0., 0., 251.99998474121094, 96.]])).all()) + self.assertTrue((results['batch_shape'] == np.array([192, 672])).all()) + self.assertTrue((results['pad_param'] == np.array([0., 0., 0., + 0.])).all()) + + # Test + transform = LetterResize(scale=(640, 640), pad_val=dict(img=144)) + for _ in range(5): + input_h, input_w = np.random.randint(100, 700), np.random.randint( + 100, 700) + output_h, output_w = np.random.randint(100, + 700), np.random.randint( + 100, 700) + data_info = dict( + img=np.random.random((input_h, input_w, 3)), + gt_bboxes=np.array([[0, 0, 10, 10]], dtype=np.float32), + batch_shape=np.array([output_h, output_w], dtype=np.int64), + gt_masks=PolygonMasks( + [[np.array([0., 0., 0., 10., 10., 10., 10., 0.])]], + height=input_h, + width=input_w)) + results = transform(data_info) + self.assertEqual(results['img_shape'], (output_h, output_w, 3)) + self.assertTrue( + (results['batch_shape'] == np.array([output_h, + output_w])).all()) + + # Test without batchshape + 
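# When no `batch_shape` is provided, LetterResize should fall back to its configured scale, so the resized output is expected to be (640, 640, 3). +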
transform = LetterResize(scale=(640, 640), pad_val=dict(img=144)) + for _ in range(5): + input_h, input_w = np.random.randint(100, 700), np.random.randint( + 100, 700) + data_info = dict( + img=np.random.random((input_h, input_w, 3)), + gt_bboxes=np.array([[0, 0, 10, 10]], dtype=np.float32), + gt_masks=PolygonMasks( + [[np.array([0., 0., 0., 10., 10., 10., 10., 0.])]], + height=input_h, + width=input_w)) + results = transform(data_info) + self.assertEqual(results['img_shape'], (640, 640, 3)) + + # TODO: Testing the existence of multiple scale_factor and pad_param + transform = [ + YOLOv5KeepRatioResize(scale=(32, 32)), + LetterResize(scale=(64, 68), pad_val=dict(img=144)) + ] + for _ in range(5): + input_h, input_w = np.random.randint(100, 700), np.random.randint( + 100, 700) + output_h, output_w = np.random.randint(100, + 700), np.random.randint( + 100, 700) + data_info = dict( + img=np.random.random((input_h, input_w, 3)), + gt_bboxes=np.array([[0, 0, 5, 5]], dtype=np.float32), + batch_shape=np.array([output_h, output_w], dtype=np.int64)) + for t in transform: + data_info = t(data_info) + # because of the "math.round" operation, + # it is unable to strictly restore the original input shape + # we just validate the correctness of scale_factor and pad_param + self.assertIn('scale_factor', data_info) + self.assertIn('pad_param', data_info) + pad_param = data_info['pad_param'].reshape(-1, 2).sum( + 1) # (top, b, l, r) -> (h, w) + scale_factor = np.asarray(data_info['scale_factor']) # (w, h) + + max_long_edge = max((32, 32)) + max_short_edge = min((32, 32)) + scale_factor_keepratio = min( + max_long_edge / max(input_h, input_w), + max_short_edge / min(input_h, input_w)) + validate_shape = np.asarray( + (int(input_h * scale_factor_keepratio), + int(input_w * scale_factor_keepratio))) + scale_factor_keepratio = np.asarray( + (validate_shape[1] / input_w, validate_shape[0] / input_h)) + + scale_factor_letter = ((np.asarray( + (output_h, output_w)) - pad_param) / validate_shape)[::-1] + self.assertTrue(data_info['img_shape'][:2] == (output_h, output_w)) + self.assertTrue((scale_factor == (scale_factor_keepratio * + scale_factor_letter)).all()) + + +class TestYOLOv5KeepRatioResize(unittest.TestCase): + + def setUp(self): + """Set up the data info which are used in every test method. 
+ + TestCase calls functions in this order: setUp() -> testMethod() -> + tearDown() -> cleanUp() + """ + rng = np.random.RandomState(0) + self.data_info1 = dict( + img=np.random.random((300, 400, 3)), + gt_bboxes=np.array([[0, 0, 150, 150]], dtype=np.float32), + gt_masks=PolygonMasks.random( + num_masks=1, height=300, width=400, rng=rng)) + self.data_info2 = dict(img=np.random.random((300, 400, 3))) + + def test_yolov5_keep_ratio_resize(self): + # test assertion for invalid keep_ratio + with self.assertRaises(AssertionError): + transform = YOLOv5KeepRatioResize(scale=(640, 640)) + transform.keep_ratio = False + results = transform(copy.deepcopy(self.data_info1)) + + # Test with gt_bboxes + transform = YOLOv5KeepRatioResize(scale=(640, 640)) + results = transform(copy.deepcopy(self.data_info1)) + self.assertTrue(transform.keep_ratio, True) + self.assertEqual(results['img_shape'], (480, 640)) + self.assertTrue( + (results['gt_bboxes'] == np.array([[0., 0., 240., 240.]])).all()) + self.assertTrue((np.array(results['scale_factor'], + dtype=np.float32) == 1.6).all()) + + # Test only img + transform = YOLOv5KeepRatioResize(scale=(640, 640)) + results = transform(copy.deepcopy(self.data_info2)) + self.assertEqual(results['img_shape'], (480, 640)) + self.assertTrue((np.array(results['scale_factor'], + dtype=np.float32) == 1.6).all()) + + +class TestYOLOv5HSVRandomAug(unittest.TestCase): + + def setUp(self): + """Set up the data info which are used in every test method. + + TestCase calls functions in this order: setUp() -> testMethod() -> + tearDown() -> cleanUp() + """ + self.data_info = dict( + img=mmcv.imread( + osp.join(osp.dirname(__file__), '../../data/color.jpg'), + 'color')) + + def test_yolov5_hsv_random_aug(self): + # Test with gt_bboxes + transform = YOLOv5HSVRandomAug( + hue_delta=0.015, saturation_delta=0.7, value_delta=0.4) + results = transform(copy.deepcopy(self.data_info)) + self.assertTrue( + results['img'].shape[:2] == self.data_info['img'].shape[:2]) + + +class TestLoadAnnotations(unittest.TestCase): + + def setUp(self): + """Set up the data info which are used in every test method. 
+ + TestCase calls functions in this order: setUp() -> testMethod() -> + tearDown() -> cleanUp() + """ + data_prefix = osp.join(osp.dirname(__file__), '../../data') + seg_map = osp.join(data_prefix, 'gray.jpg') + self.results = { + 'ori_shape': (300, 400), + 'seg_map_path': + seg_map, + 'instances': [{ + 'bbox': [0, 0, 10, 20], + 'bbox_label': 1, + 'mask': [[0, 0, 0, 20, 10, 20, 10, 0]], + 'ignore_flag': 0 + }, { + 'bbox': [10, 10, 110, 120], + 'bbox_label': 2, + 'mask': [[10, 10, 110, 10, 110, 120, 110, 10]], + 'ignore_flag': 0 + }, { + 'bbox': [50, 50, 60, 80], + 'bbox_label': 2, + 'mask': [[50, 50, 60, 50, 60, 80, 50, 80]], + 'ignore_flag': 1 + }] + } + + def test_load_bboxes(self): + transform = LoadAnnotations( + with_bbox=True, + with_label=False, + with_seg=False, + with_mask=False, + box_type=None) + results = transform(copy.deepcopy(self.results)) + self.assertIn('gt_bboxes', results) + self.assertTrue((results['gt_bboxes'] == np.array([[0, 0, 10, 20], + [10, 10, 110, + 120]])).all()) + self.assertEqual(results['gt_bboxes'].dtype, np.float32) + self.assertTrue( + (results['gt_ignore_flags'] == np.array([False, False])).all()) + self.assertEqual(results['gt_ignore_flags'].dtype, bool) + + # test empty instance + results = transform({}) + self.assertIn('gt_bboxes', results) + self.assertTrue(results['gt_bboxes'].shape == (0, 4)) + self.assertIn('gt_ignore_flags', results) + self.assertTrue(results['gt_ignore_flags'].shape == (0, )) + + def test_load_labels(self): + transform = LoadAnnotations( + with_bbox=False, + with_label=True, + with_seg=False, + with_mask=False, + ) + results = transform(copy.deepcopy(self.results)) + self.assertIn('gt_bboxes_labels', results) + self.assertTrue((results['gt_bboxes_labels'] == np.array([1, + 2])).all()) + self.assertEqual(results['gt_bboxes_labels'].dtype, np.int64) + + # test empty instance + results = transform({}) + self.assertIn('gt_bboxes_labels', results) + self.assertTrue(results['gt_bboxes_labels'].shape == (0, )) + + +class TestYOLOv5RandomAffine(unittest.TestCase): + + def setUp(self): + """Setup the data info which are used in every test method. 
+ + TestCase calls functions in this order: setUp() -> testMethod() -> + tearDown() -> cleanUp() + """ + self.results = { + 'img': + np.random.random((224, 224, 3)), + 'img_shape': (224, 224), + 'gt_bboxes_labels': + np.array([1, 2, 3], dtype=np.int64), + 'gt_bboxes': + np.array([[10, 10, 20, 20], [20, 20, 40, 40], [40, 40, 80, 80]], + dtype=np.float32), + 'gt_ignore_flags': + np.array([0, 0, 1], dtype=bool), + } + + def test_transform(self): + # test assertion for invalid translate_ratio + with self.assertRaises(AssertionError): + transform = YOLOv5RandomAffine(max_translate_ratio=1.5) + + # test assertion for invalid scaling_ratio_range + with self.assertRaises(AssertionError): + transform = YOLOv5RandomAffine(scaling_ratio_range=(1.5, 0.5)) + + with self.assertRaises(AssertionError): + transform = YOLOv5RandomAffine(scaling_ratio_range=(0, 0.5)) + + transform = YOLOv5RandomAffine() + results = transform(copy.deepcopy(self.results)) + self.assertTrue(results['img'].shape[:2] == (224, 224)) + self.assertTrue(results['gt_bboxes_labels'].shape[0] == + results['gt_bboxes'].shape[0]) + self.assertTrue(results['gt_bboxes_labels'].dtype == np.int64) + self.assertTrue(results['gt_bboxes'].dtype == np.float32) + self.assertTrue(results['gt_ignore_flags'].dtype == bool) + + def test_transform_with_boxlist(self): + results = copy.deepcopy(self.results) + results['gt_bboxes'] = HorizontalBoxes(results['gt_bboxes']) + + transform = YOLOv5RandomAffine() + results = transform(copy.deepcopy(results)) + self.assertTrue(results['img'].shape[:2] == (224, 224)) + self.assertTrue(results['gt_bboxes_labels'].shape[0] == + results['gt_bboxes'].shape[0]) + self.assertTrue(results['gt_bboxes_labels'].dtype == np.int64) + self.assertTrue(results['gt_bboxes'].dtype == torch.float32) + self.assertTrue(results['gt_ignore_flags'].dtype == bool) + + +class TestPPYOLOERandomCrop(unittest.TestCase): + + def setUp(self): + """Setup the data info which are used in every test method. + + TestCase calls functions in this order: setUp() -> testMethod() -> + tearDown() -> cleanUp() + """ + self.results = { + 'img': + np.random.random((224, 224, 3)), + 'img_shape': (224, 224), + 'gt_bboxes_labels': + np.array([1, 2, 3], dtype=np.int64), + 'gt_bboxes': + np.array([[10, 10, 20, 20], [20, 20, 40, 40], [40, 40, 80, 80]], + dtype=np.float32), + 'gt_ignore_flags': + np.array([0, 0, 1], dtype=bool), + } + + def test_transform(self): + transform = PPYOLOERandomCrop() + results = transform(copy.deepcopy(self.results)) + self.assertTrue(results['gt_bboxes_labels'].shape[0] == + results['gt_bboxes'].shape[0]) + self.assertTrue(results['gt_bboxes_labels'].dtype == np.int64) + self.assertTrue(results['gt_bboxes'].dtype == np.float32) + self.assertTrue(results['gt_ignore_flags'].dtype == bool) + + def test_transform_with_boxlist(self): + results = copy.deepcopy(self.results) + results['gt_bboxes'] = HorizontalBoxes(results['gt_bboxes']) + + transform = PPYOLOERandomCrop() + results = transform(copy.deepcopy(results)) + self.assertTrue(results['gt_bboxes_labels'].shape[0] == + results['gt_bboxes'].shape[0]) + self.assertTrue(results['gt_bboxes_labels'].dtype == np.int64) + self.assertTrue(results['gt_bboxes'].dtype == torch.float32) + self.assertTrue(results['gt_ignore_flags'].dtype == bool) + + +class TestPPYOLOERandomDistort(unittest.TestCase): + + def setUp(self): + """Setup the data info which are used in every test method. 
+ + TestCase calls functions in this order: setUp() -> testMethod() -> + tearDown() -> cleanUp() + """ + self.results = { + 'img': + np.random.random((224, 224, 3)), + 'img_shape': (224, 224), + 'gt_bboxes_labels': + np.array([1, 2, 3], dtype=np.int64), + 'gt_bboxes': + np.array([[10, 10, 20, 20], [20, 20, 40, 40], [40, 40, 80, 80]], + dtype=np.float32), + 'gt_ignore_flags': + np.array([0, 0, 1], dtype=bool), + } + + def test_transform(self): + # test assertion for invalid prob + with self.assertRaises(AssertionError): + transform = PPYOLOERandomDistort( + hue_cfg=dict(min=-18, max=18, prob=1.5)) + + # test assertion for invalid num_distort_func + with self.assertRaises(AssertionError): + transform = PPYOLOERandomDistort(num_distort_func=5) + + transform = PPYOLOERandomDistort() + results = transform(copy.deepcopy(self.results)) + self.assertTrue(results['img'].shape[:2] == (224, 224)) + self.assertTrue(results['gt_bboxes_labels'].shape[0] == + results['gt_bboxes'].shape[0]) + self.assertTrue(results['gt_bboxes_labels'].dtype == np.int64) + self.assertTrue(results['gt_bboxes'].dtype == np.float32) + self.assertTrue(results['gt_ignore_flags'].dtype == bool) + + def test_transform_with_boxlist(self): + results = copy.deepcopy(self.results) + results['gt_bboxes'] = HorizontalBoxes(results['gt_bboxes']) + + transform = PPYOLOERandomDistort() + results = transform(copy.deepcopy(results)) + self.assertTrue(results['img'].shape[:2] == (224, 224)) + self.assertTrue(results['gt_bboxes_labels'].shape[0] == + results['gt_bboxes'].shape[0]) + self.assertTrue(results['gt_bboxes_labels'].dtype == np.int64) + self.assertTrue(results['gt_bboxes'].dtype == torch.float32) + self.assertTrue(results['gt_ignore_flags'].dtype == bool) + + +class TestYOLOv5CopyPaste(unittest.TestCase): + + def setUp(self): + """Set up the data info which are used in every test method. + + TestCase calls functions in this order: setUp() -> testMethod() -> + tearDown() -> cleanUp() + """ + self.data_info = dict( + img=np.random.random((300, 400, 3)), + gt_bboxes=np.array([[0, 0, 10, 10]], dtype=np.float32), + gt_masks=PolygonMasks( + [[np.array([0., 0., 0., 10., 10., 10., 10., 0.])]], + height=300, + width=400)) + + def test_transform(self): + # test transform + transform = YOLOv5CopyPaste(prob=1.0) + results = transform(copy.deepcopy(self.data_info)) + self.assertTrue(len(results['gt_bboxes']) == 2) + self.assertTrue(len(results['gt_masks']) == 2) + + rng = np.random.RandomState(0) + # test with bitmap + with self.assertRaises(AssertionError): + results = transform( + dict( + img=np.random.random((300, 400, 3)), + gt_bboxes=np.array([[0, 0, 10, 10]], dtype=np.float32), + gt_masks=BitmapMasks( + rng.rand(1, 300, 400), height=300, width=400))) diff --git a/third_party/mmyolo/tests/test_datasets/test_utils.py b/third_party/mmyolo/tests/test_datasets/test_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..dc7b9022650fd49ed4283858bb030852191260c8 --- /dev/null +++ b/third_party/mmyolo/tests/test_datasets/test_utils.py @@ -0,0 +1,138 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
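+# Tests for mmyolo dataset utilities: `yolov5_collate`, which stacks image
+# tensors and packs per-image gt boxes/labels into one `bboxes_labels`
+# tensor, and `BatchShapePolicy`, which assigns shared batch shapes to
+# images grouped by aspect ratio.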
+import unittest + +import numpy as np +import torch +from mmdet.structures import DetDataSample +from mmdet.structures.bbox import HorizontalBoxes +from mmengine.structures import InstanceData + +from mmyolo.datasets import BatchShapePolicy, yolov5_collate + + +def _rand_bboxes(rng, num_boxes, w, h): + cx, cy, bw, bh = rng.rand(num_boxes, 4).T + + tl_x = ((cx * w) - (w * bw / 2)).clip(0, w) + tl_y = ((cy * h) - (h * bh / 2)).clip(0, h) + br_x = ((cx * w) + (w * bw / 2)).clip(0, w) + br_y = ((cy * h) + (h * bh / 2)).clip(0, h) + + bboxes = np.vstack([tl_x, tl_y, br_x, br_y]).T + return bboxes + + +class TestYOLOv5Collate(unittest.TestCase): + + def test_yolov5_collate(self): + rng = np.random.RandomState(0) + + inputs = torch.randn((3, 10, 10)) + data_samples = DetDataSample() + gt_instances = InstanceData() + bboxes = _rand_bboxes(rng, 4, 6, 8) + gt_instances.bboxes = HorizontalBoxes(bboxes, dtype=torch.float32) + labels = rng.randint(1, 2, size=len(bboxes)) + gt_instances.labels = torch.LongTensor(labels) + data_samples.gt_instances = gt_instances + + out = yolov5_collate([dict(inputs=inputs, data_samples=data_samples)]) + self.assertIsInstance(out, dict) + self.assertTrue(out['inputs'].shape == (1, 3, 10, 10)) + self.assertTrue(out['data_samples'], dict) + self.assertTrue(out['data_samples']['bboxes_labels'].shape == (4, 6)) + + out = yolov5_collate([dict(inputs=inputs, data_samples=data_samples)] * + 2) + self.assertIsInstance(out, dict) + self.assertTrue(out['inputs'].shape == (2, 3, 10, 10)) + self.assertTrue(out['data_samples'], dict) + self.assertTrue(out['data_samples']['bboxes_labels'].shape == (8, 6)) + + def test_yolov5_collate_with_multi_scale(self): + rng = np.random.RandomState(0) + + inputs = torch.randn((3, 10, 10)) + data_samples = DetDataSample() + gt_instances = InstanceData() + bboxes = _rand_bboxes(rng, 4, 6, 8) + gt_instances.bboxes = HorizontalBoxes(bboxes, dtype=torch.float32) + labels = rng.randint(1, 2, size=len(bboxes)) + gt_instances.labels = torch.LongTensor(labels) + data_samples.gt_instances = gt_instances + + out = yolov5_collate([dict(inputs=inputs, data_samples=data_samples)], + use_ms_training=True) + self.assertIsInstance(out, dict) + self.assertTrue(out['inputs'][0].shape == (3, 10, 10)) + self.assertTrue(out['data_samples'], dict) + self.assertTrue(out['data_samples']['bboxes_labels'].shape == (4, 6)) + self.assertIsInstance(out['inputs'], list) + self.assertIsInstance(out['data_samples']['bboxes_labels'], + torch.Tensor) + + out = yolov5_collate( + [dict(inputs=inputs, data_samples=data_samples)] * 2, + use_ms_training=True) + self.assertIsInstance(out, dict) + self.assertTrue(out['inputs'][0].shape == (3, 10, 10)) + self.assertTrue(out['data_samples'], dict) + self.assertTrue(out['data_samples']['bboxes_labels'].shape == (8, 6)) + self.assertIsInstance(out['inputs'], list) + self.assertIsInstance(out['data_samples']['bboxes_labels'], + torch.Tensor) + + +class TestBatchShapePolicy(unittest.TestCase): + + def test_batch_shape_policy(self): + src_data_infos = [{ + 'height': 20, + 'width': 100, + }, { + 'height': 11, + 'width': 100, + }, { + 'height': 21, + 'width': 100, + }, { + 'height': 30, + 'width': 100, + }, { + 'height': 10, + 'width': 100, + }] + + expected_data_infos = [{ + 'height': 10, + 'width': 100, + 'batch_shape': np.array([96, 672]) + }, { + 'height': 11, + 'width': 100, + 'batch_shape': np.array([96, 672]) + }, { + 'height': 20, + 'width': 100, + 'batch_shape': np.array([160, 672]) + }, { + 'height': 21, + 'width': 100, + 
'batch_shape': np.array([160, 672])
+        }, {
+            'height': 30,
+            'width': 100,
+            'batch_shape': np.array([224, 672])
+        }]
+
+        batch_shapes_policy = BatchShapePolicy(batch_size=2)
+        out_data_infos = batch_shapes_policy(src_data_infos)
+
+        for i in range(5):
+            self.assertEqual(
+                (expected_data_infos[i]['height'],
+                 expected_data_infos[i]['width']),
+                (out_data_infos[i]['height'], out_data_infos[i]['width']))
+            self.assertTrue(
+                np.allclose(expected_data_infos[i]['batch_shape'],
+                            out_data_infos[i]['batch_shape']))
diff --git a/third_party/mmyolo/tests/test_datasets/test_yolov5_coco.py b/third_party/mmyolo/tests/test_datasets/test_yolov5_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7e1c9a43077e7e34f36b2ecda5b3235cfa9bd75
--- /dev/null
+++ b/third_party/mmyolo/tests/test_datasets/test_yolov5_coco.py
@@ -0,0 +1,71 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import unittest
+
+from mmyolo.datasets import YOLOv5CocoDataset
+
+
+class TestYOLOv5CocoDataset(unittest.TestCase):
+
+    def test_batch_shapes_cfg(self):
+        batch_shapes_cfg = dict(
+            type='BatchShapePolicy',
+            batch_size=2,
+            img_size=640,
+            size_divisor=32,
+            extra_pad_ratio=0.5)
+
+        # test serialize_data=True
+        dataset = YOLOv5CocoDataset(
+            data_prefix=dict(img='imgs'),
+            ann_file='tests/data/coco_sample.json',
+            filter_cfg=dict(filter_empty_gt=False, min_size=0),
+            pipeline=[],
+            serialize_data=True,
+            batch_shapes_cfg=batch_shapes_cfg,
+        )
+
+        expected_img_ids = [3, 0, 2, 1]
+        expected_batch_shapes = [[512, 672], [512, 672], [672, 672],
+                                 [672, 672]]
+        for i, data in enumerate(dataset):
+            assert data['img_id'] == expected_img_ids[i]
+            assert data['batch_shape'].tolist() == expected_batch_shapes[i]
+
+        # test serialize_data=False
+        dataset = YOLOv5CocoDataset(
+            data_prefix=dict(img='imgs'),
+            ann_file='tests/data/coco_sample.json',
+            filter_cfg=dict(filter_empty_gt=False, min_size=0),
+            pipeline=[],
+            serialize_data=False,
+            batch_shapes_cfg=batch_shapes_cfg,
+        )
+
+        expected_img_ids = [3, 0, 2, 1]
+        expected_batch_shapes = [[512, 672], [512, 672], [672, 672],
+                                 [672, 672]]
+        for i, data in enumerate(dataset):
+            assert data['img_id'] == expected_img_ids[i]
+            assert data['batch_shape'].tolist() == expected_batch_shapes[i]
+
+    def test_prepare_data(self):
+        dataset = YOLOv5CocoDataset(
+            data_prefix=dict(img='imgs'),
+            ann_file='tests/data/coco_sample.json',
+            filter_cfg=dict(filter_empty_gt=False, min_size=0),
+            pipeline=[],
+            serialize_data=True,
+            batch_shapes_cfg=None,
+        )
+        for data in dataset:
+            assert 'dataset' in data
+
+        # test with test_mode = True
+        dataset = YOLOv5CocoDataset(
+            data_prefix=dict(img='imgs'),
+            ann_file='tests/data/coco_sample.json',
+            test_mode=True,
+            pipeline=[])
+
+        for data in dataset:
+            assert 'dataset' not in data
diff --git a/third_party/mmyolo/tests/test_datasets/test_yolov5_voc.py b/third_party/mmyolo/tests/test_datasets/test_yolov5_voc.py
new file mode 100644
index 0000000000000000000000000000000000000000..f7e9b989c8b390624a2c1996b8ca534a0b000b56
--- /dev/null
+++ b/third_party/mmyolo/tests/test_datasets/test_yolov5_voc.py
@@ -0,0 +1,86 @@
+# Copyright (c) OpenMMLab. All rights reserved.
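+# Tests for YOLOv5VOCDataset: batch-shape computation, `prepare_data` in
+# train/test mode, and concatenating VOC2007/VOC2012 via ConcatDataset.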
+import unittest + +from mmengine.dataset import ConcatDataset + +from mmyolo.datasets import YOLOv5VOCDataset +from mmyolo.utils import register_all_modules + +register_all_modules() + + +class TestYOLOv5VocDataset(unittest.TestCase): + + def test_batch_shapes_cfg(self): + batch_shapes_cfg = dict( + type='BatchShapePolicy', + batch_size=2, + img_size=640, + size_divisor=32, + extra_pad_ratio=0.5) + + # test serialize_data=True + dataset = YOLOv5VOCDataset( + data_root='tests/data/VOCdevkit/', + ann_file='VOC2007/ImageSets/Main/trainval.txt', + data_prefix=dict(sub_data_root='VOC2007/'), + test_mode=True, + pipeline=[], + batch_shapes_cfg=batch_shapes_cfg, + ) + + expected_img_ids = ['000001'] + expected_batch_shapes = [[672, 480]] + for i, data in enumerate(dataset): + assert data['img_id'] == expected_img_ids[i] + assert data['batch_shape'].tolist() == expected_batch_shapes[i] + + def test_prepare_data(self): + dataset = YOLOv5VOCDataset( + data_root='tests/data/VOCdevkit/', + ann_file='VOC2007/ImageSets/Main/trainval.txt', + data_prefix=dict(sub_data_root='VOC2007/'), + filter_cfg=dict(filter_empty_gt=False, min_size=0), + pipeline=[], + serialize_data=True, + batch_shapes_cfg=None, + ) + for data in dataset: + assert 'dataset' in data + + # test with test_mode = True + dataset = YOLOv5VOCDataset( + data_root='tests/data/VOCdevkit/', + ann_file='VOC2007/ImageSets/Main/trainval.txt', + data_prefix=dict(sub_data_root='VOC2007/'), + filter_cfg=dict( + filter_empty_gt=True, min_size=32, bbox_min_size=None), + pipeline=[], + test_mode=True, + batch_shapes_cfg=None) + + for data in dataset: + assert 'dataset' not in data + + def test_concat_dataset(self): + dataset = ConcatDataset( + datasets=[ + dict( + type='YOLOv5VOCDataset', + data_root='tests/data/VOCdevkit/', + ann_file='VOC2007/ImageSets/Main/trainval.txt', + data_prefix=dict(sub_data_root='VOC2007/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[]), + dict( + type='YOLOv5VOCDataset', + data_root='tests/data/VOCdevkit/', + ann_file='VOC2012/ImageSets/Main/trainval.txt', + data_prefix=dict(sub_data_root='VOC2012/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=[]) + ], + ignore_keys='dataset_type') + + dataset.full_init() + self.assertEqual(len(dataset), 2) diff --git a/third_party/mmyolo/tests/test_deploy/conftest.py b/third_party/mmyolo/tests/test_deploy/conftest.py new file mode 100644 index 0000000000000000000000000000000000000000..ed1bd3d88905e39928d9bc1c1803844d59f92ad9 --- /dev/null +++ b/third_party/mmyolo/tests/test_deploy/conftest.py @@ -0,0 +1,13 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import pytest + + +@pytest.fixture(autouse=True) +def init_test(): + # init default scope + from mmdet.utils import register_all_modules as register_det + + from mmyolo.utils import register_all_modules as register_yolo + + register_yolo(True) + register_det(False) diff --git a/third_party/mmyolo/tests/test_deploy/test_mmyolo_models.py b/third_party/mmyolo/tests/test_deploy/test_mmyolo_models.py new file mode 100644 index 0000000000000000000000000000000000000000..65394e539aa5b8dca39c17012aa8b805ca69bc39 --- /dev/null +++ b/third_party/mmyolo/tests/test_deploy/test_mmyolo_models.py @@ -0,0 +1,165 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
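+# Deployment rewrite tests: outputs of `YOLOv5Head.predict_by_feat` from the
+# PyTorch model are compared against the mmdeploy ONNX Runtime rewrite.
+# The whole module is skipped when mmdeploy or `mmyolo.deploy` is missing.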
+import os +import random + +import numpy as np +import pytest +import torch +from mmengine import Config + +try: + import importlib + importlib.import_module('mmdeploy') +except ImportError: + pytest.skip('mmdeploy is not installed.', allow_module_level=True) + +from mmdeploy.codebase import import_codebase +from mmdeploy.utils import Backend +from mmdeploy.utils.config_utils import register_codebase +from mmdeploy.utils.test import (WrapModel, check_backend, get_model_outputs, + get_rewrite_outputs) + +try: + codebase = register_codebase('mmyolo') + import_codebase(codebase, ['mmyolo.deploy']) +except ImportError: + pytest.skip('mmyolo is not installed.', allow_module_level=True) + + +def seed_everything(seed=1029): + random.seed(seed) + os.environ['PYTHONHASHSEED'] = str(seed) + np.random.seed(seed) + torch.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) # if you are using multi-GPU. + torch.backends.cudnn.benchmark = False + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.enabled = False + + +def get_yolov5_head_model(): + """YOLOv5 Head Config.""" + test_cfg = Config( + dict( + multi_label=True, + nms_pre=30000, + score_thr=0.001, + nms=dict(type='nms', iou_threshold=0.65), + max_per_img=300)) + + from mmyolo.models.dense_heads import YOLOv5Head + head_module = dict( + type='YOLOv5HeadModule', + num_classes=4, + in_channels=[2, 4, 8], + featmap_strides=[8, 16, 32], + num_base_priors=1) + + model = YOLOv5Head(head_module, test_cfg=test_cfg) + + model.requires_grad_(False) + return model + + +@pytest.mark.parametrize('backend_type', [Backend.ONNXRUNTIME]) +def test_yolov5_head_predict_by_feat(backend_type: Backend): + """Test predict_by_feat rewrite of YOLOXHead.""" + check_backend(backend_type) + yolov5_head = get_yolov5_head_model() + yolov5_head.cpu().eval() + s = 256 + batch_img_metas = [{ + 'scale_factor': (1.0, 1.0), + 'pad_shape': (s, s, 3), + 'img_shape': (s, s, 3), + 'ori_shape': (s, s, 3) + }] + output_names = ['dets', 'labels'] + deploy_cfg = Config( + dict( + backend_config=dict(type=backend_type.value), + onnx_config=dict(output_names=output_names, input_shape=None), + codebase_config=dict( + type='mmyolo', + task='ObjectDetection', + post_processing=dict( + score_threshold=0.05, + iou_threshold=0.5, + max_output_boxes_per_class=20, + pre_top_k=-1, + keep_top_k=10, + background_label_id=-1, + ), + module=['mmyolo.deploy']))) + seed_everything(1234) + cls_scores = [ + torch.rand(1, yolov5_head.num_classes * yolov5_head.num_base_priors, + 4 * pow(2, i), 4 * pow(2, i)) for i in range(3, 0, -1) + ] + seed_everything(5678) + bbox_preds = [ + torch.rand(1, 4 * yolov5_head.num_base_priors, 4 * pow(2, i), + 4 * pow(2, i)) for i in range(3, 0, -1) + ] + seed_everything(9101) + objectnesses = [ + torch.rand(1, 1 * yolov5_head.num_base_priors, 4 * pow(2, i), + 4 * pow(2, i)) for i in range(3, 0, -1) + ] + + # to get outputs of pytorch model + model_inputs = { + 'cls_scores': cls_scores, + 'bbox_preds': bbox_preds, + 'objectnesses': objectnesses, + 'batch_img_metas': batch_img_metas, + 'with_nms': True + } + model_outputs = get_model_outputs(yolov5_head, 'predict_by_feat', + model_inputs) + + # to get outputs of onnx model after rewrite + wrapped_model = WrapModel( + yolov5_head, + 'predict_by_feat', + batch_img_metas=batch_img_metas, + with_nms=True) + rewrite_inputs = { + 'cls_scores': cls_scores, + 'bbox_preds': bbox_preds, + 'objectnesses': objectnesses, + } + rewrite_outputs, is_backend_output = 
get_rewrite_outputs( + wrapped_model=wrapped_model, + model_inputs=rewrite_inputs, + deploy_cfg=deploy_cfg) + + if is_backend_output: + # hard code to make two tensors with the same shape + # rewrite and original codes applied different nms strategy + min_shape = min(model_outputs[0].bboxes.shape[0], + rewrite_outputs[0].shape[1], 5) + for i in range(len(model_outputs)): + rewrite_outputs[0][i, :min_shape, 0::2] = \ + rewrite_outputs[0][i, :min_shape, 0::2].clamp_(0, s) + rewrite_outputs[0][i, :min_shape, 1::2] = \ + rewrite_outputs[0][i, :min_shape, 1::2].clamp_(0, s) + assert np.allclose( + model_outputs[i].bboxes[:min_shape], + rewrite_outputs[0][i, :min_shape, :4], + rtol=1e-03, + atol=1e-05) + assert np.allclose( + model_outputs[i].scores[:min_shape], + rewrite_outputs[0][i, :min_shape, 4], + rtol=1e-03, + atol=1e-05) + assert np.allclose( + model_outputs[i].labels[:min_shape], + rewrite_outputs[1][i, :min_shape], + rtol=1e-03, + atol=1e-05) + else: + assert rewrite_outputs is not None diff --git a/third_party/mmyolo/tests/test_deploy/test_object_detection.py b/third_party/mmyolo/tests/test_deploy/test_object_detection.py new file mode 100644 index 0000000000000000000000000000000000000000..b701e2557699de14d5e42679740e67706fa3bf6d --- /dev/null +++ b/third_party/mmyolo/tests/test_deploy/test_object_detection.py @@ -0,0 +1,96 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +from tempfile import NamedTemporaryFile, TemporaryDirectory + +import numpy as np +import pytest +import torch +from mmengine import Config + +try: + import importlib + importlib.import_module('mmdeploy') +except ImportError: + pytest.skip('mmdeploy is not installed.', allow_module_level=True) + +import mmdeploy.backend.onnxruntime as ort_apis +from mmdeploy.apis import build_task_processor +from mmdeploy.codebase import import_codebase +from mmdeploy.utils import load_config +from mmdeploy.utils.config_utils import register_codebase +from mmdeploy.utils.test import SwitchBackendWrapper + +try: + codebase = register_codebase('mmyolo') + import_codebase(codebase, ['mmyolo.deploy']) +except ImportError: + pytest.skip('mmyolo is not installed.', allow_module_level=True) + +model_cfg_path = 'tests/test_deploy/data/model.py' +model_cfg = load_config(model_cfg_path)[0] +model_cfg.test_dataloader.dataset.data_root = \ + 'tests/data' +model_cfg.test_dataloader.dataset.ann_file = 'coco_sample.json' +model_cfg.test_evaluator.ann_file = \ + 'tests/coco_sample.json' +deploy_cfg = Config( + dict( + backend_config=dict(type='onnxruntime'), + codebase_config=dict( + type='mmyolo', + task='ObjectDetection', + post_processing=dict( + score_threshold=0.05, + confidence_threshold=0.005, # for YOLOv3 + iou_threshold=0.5, + max_output_boxes_per_class=200, + pre_top_k=5000, + keep_top_k=100, + background_label_id=-1, + ), + module=['mmyolo.deploy']), + onnx_config=dict( + type='onnx', + export_params=True, + keep_initializers_as_inputs=False, + opset_version=11, + input_shape=None, + input_names=['input'], + output_names=['dets', 'labels']))) +onnx_file = NamedTemporaryFile(suffix='.onnx').name +task_processor = None +img_shape = (32, 32) +img = np.random.rand(*img_shape, 3) + + +@pytest.fixture(autouse=True) +def init_task_processor(): + global task_processor + task_processor = build_task_processor(model_cfg, deploy_cfg, 'cpu') + + +@pytest.fixture +def backend_model(): + from mmdeploy.backend.onnxruntime import ORTWrapper + ort_apis.__dict__.update({'ORTWrapper': ORTWrapper}) + wrapper = SwitchBackendWrapper(ORTWrapper) + 
wrapper.set( + outputs={ + 'dets': torch.rand(1, 10, 5).sort(2).values, + 'labels': torch.randint(0, 10, (1, 10)) + }) + + yield task_processor.build_backend_model(['']) + + wrapper.recover() + + +def test_visualize(backend_model): + img_path = 'tests/data/color.jpg' + input_dict, _ = task_processor.create_input( + img_path, input_shape=img_shape) + results = backend_model.test_step(input_dict)[0] + with TemporaryDirectory() as dir: + filename = dir + 'tmp.jpg' + task_processor.visualize(img, results, filename, 'window') + assert os.path.exists(filename) diff --git a/third_party/mmyolo/tests/test_downstream/test_mmrazor.py b/third_party/mmyolo/tests/test_downstream/test_mmrazor.py new file mode 100644 index 0000000000000000000000000000000000000000..dc3090d263853e871fb70950be0acd845e19a238 --- /dev/null +++ b/third_party/mmyolo/tests/test_downstream/test_mmrazor.py @@ -0,0 +1,21 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy + +import pytest +from mmcls.models.backbones.base_backbone import BaseBackbone + +from mmyolo.testing import get_detector_cfg + + +@pytest.mark.parametrize('cfg_file', [ + 'razor/subnets/' + 'yolov5_s_spos_shufflenetv2_syncbn_8xb16-300e_coco.py', 'razor/subnets/' + 'rtmdet_tiny_ofa_lat31_syncbn_16xb16-300e_coco.py', 'razor/subnets/' + 'yolov6_l_attentivenas_a6_d12_syncbn_fast_8xb32-300e_coco.py' +]) +def test_razor_backbone_init(cfg_file): + model = get_detector_cfg(cfg_file) + model_cfg = copy.deepcopy(model.backbone) + from mmrazor.registry import MODELS + model = MODELS.build(model_cfg) + assert isinstance(model, BaseBackbone) diff --git a/third_party/mmyolo/tests/test_engine/__init__.py b/third_party/mmyolo/tests/test_engine/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ef101fec61e72abc0eb90266d453b5b22331378d --- /dev/null +++ b/third_party/mmyolo/tests/test_engine/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. diff --git a/third_party/mmyolo/tests/test_engine/test_hooks/test_switch_to_deploy_hook.py b/third_party/mmyolo/tests/test_engine/test_hooks/test_switch_to_deploy_hook.py new file mode 100644 index 0000000000000000000000000000000000000000..52d6e9f0583923feff08cf1cc6f41c8223503d88 --- /dev/null +++ b/third_party/mmyolo/tests/test_engine/test_hooks/test_switch_to_deploy_hook.py @@ -0,0 +1,24 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from unittest import TestCase +from unittest.mock import Mock + +from mmyolo.engine.hooks import SwitchToDeployHook +from mmyolo.models import RepVGGBlock +from mmyolo.utils import register_all_modules + +register_all_modules() + + +class TestSwitchToDeployHook(TestCase): + + def test(self): + + runner = Mock() + runner.model = RepVGGBlock(256, 256) + + hook = SwitchToDeployHook() + self.assertFalse(runner.model.deploy) + + # test after change mode + hook.before_test_epoch(runner) + self.assertTrue(runner.model.deploy) diff --git a/third_party/mmyolo/tests/test_engine/test_hooks/test_yolov5_param_scheduler_hook.py b/third_party/mmyolo/tests/test_engine/test_hooks/test_yolov5_param_scheduler_hook.py new file mode 100644 index 0000000000000000000000000000000000000000..1a527333023a179d95b8cd41b82fa5fd9842c0c6 --- /dev/null +++ b/third_party/mmyolo/tests/test_engine/test_hooks/test_yolov5_param_scheduler_hook.py @@ -0,0 +1,124 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
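+# Tests for YOLOv5ParamSchedulerHook: verifies the per-iteration linear
+# warmup of learning rate and momentum, and the values used once warmup ends.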
+from unittest import TestCase +from unittest.mock import Mock + +import torch +from mmengine.config import Config +from mmengine.optim import build_optim_wrapper +from mmengine.runner import Runner +from torch import nn +from torch.utils.data import Dataset + +from mmyolo.engine.hooks import YOLOv5ParamSchedulerHook +from mmyolo.utils import register_all_modules + + +class ToyModel(nn.Module): + + def __init__(self): + super().__init__() + self.linear = nn.Linear(2, 1) + + def forward(self, inputs, data_samples, mode='tensor'): + labels = torch.stack(data_samples) + inputs = torch.stack(inputs) + outputs = self.linear(inputs) + if mode == 'tensor': + return outputs + elif mode == 'loss': + loss = (labels - outputs).sum() + outputs = dict(loss=loss) + return outputs + else: + return outputs + + +class DummyDataset(Dataset): + METAINFO = dict() # type: ignore + data = torch.randn(12, 2) + label = torch.ones(12) + + @property + def metainfo(self): + return self.METAINFO + + def __len__(self): + return self.data.size(0) + + def __getitem__(self, index): + return dict(inputs=self.data[index], data_sample=self.label[index]) + + +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict( + type='SGD', + lr=0.01, + momentum=0.937, + weight_decay=0.0005, + nesterov=True, + batch_size_per_gpu=1), + constructor='YOLOv5OptimizerConstructor') + +register_all_modules() + + +class TestYOLOv5ParamSchelerHook(TestCase): + + def test(self): + model = ToyModel() + train_dataloader = dict( + dataset=DummyDataset(), + sampler=dict(type='DefaultSampler', shuffle=True), + batch_size=3, + num_workers=0) + + runner = Mock() + runner.model = model + runner.optim_wrapper = build_optim_wrapper(model, optim_wrapper) + runner.cfg.train_dataloader = Config(train_dataloader) + runner.train_dataloader = Runner.build_dataloader(train_dataloader) + + hook = YOLOv5ParamSchedulerHook( + scheduler_type='linear', lr_factor=0.01, max_epochs=300) + + # test before train + runner.epoch = 0 + runner.iter = 0 + hook.before_train(runner) + + for group in runner.optim_wrapper.param_groups: + self.assertEqual(group['lr'], 0.01) + self.assertEqual(group['momentum'], 0.937) + + self.assertFalse(hook._warmup_end) + + # test after training 10 steps + for i in range(10): + runner.iter += 1 + hook.before_train_iter(runner, 0) + + for group_idx, group in enumerate(runner.optim_wrapper.param_groups): + if group_idx == 2: + self.assertEqual(round(group['lr'], 5), 0.0991) + self.assertEqual(group['momentum'], 0.80137) + self.assertFalse(hook._warmup_end) + + # test after warm up + runner.iter = 1000 + hook.before_train_iter(runner, 0) + self.assertFalse(hook._warmup_end) + + for group in runner.optim_wrapper.param_groups: + self.assertEqual(group['lr'], 0.01) + self.assertEqual(group['momentum'], 0.937) + + runner.iter = 1001 + hook.before_train_iter(runner, 0) + self.assertTrue(hook._warmup_end) + + # test after train_epoch + hook.after_train_epoch(runner) + for group in runner.optim_wrapper.param_groups: + self.assertEqual(group['lr'], 0.01) + self.assertEqual(group['momentum'], 0.937) diff --git a/third_party/mmyolo/tests/test_engine/test_hooks/test_yolox_mode_switch_hook.py b/third_party/mmyolo/tests/test_engine/test_hooks/test_yolox_mode_switch_hook.py new file mode 100644 index 0000000000000000000000000000000000000000..fbe13413c4c2abf6369e3e439de63044dc68444c --- /dev/null +++ b/third_party/mmyolo/tests/test_engine/test_hooks/test_yolox_mode_switch_hook.py @@ -0,0 +1,67 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
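+# Tests for YOLOXModeSwitchHook: entering the last `num_last_epochs` epochs
+# should enable the auxiliary bbox branch and swap in the new train pipeline.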
+from unittest import TestCase +from unittest.mock import Mock + +import torch +from mmengine.config import Config +from mmengine.runner import Runner +from torch.utils.data import Dataset + +from mmyolo.engine.hooks import YOLOXModeSwitchHook +from mmyolo.utils import register_all_modules + + +class DummyDataset(Dataset): + METAINFO = dict() # type: ignore + data = torch.randn(12, 2) + label = torch.ones(12) + + @property + def metainfo(self): + return self.METAINFO + + def __len__(self): + return self.data.size(0) + + def __getitem__(self, index): + return dict(inputs=self.data[index], data_sample=self.label[index]) + + +pipeline1 = [ + dict(type='mmdet.Resize'), +] + +pipeline2 = [ + dict(type='mmdet.RandomFlip'), +] +register_all_modules() + + +class TestYOLOXModeSwitchHook(TestCase): + + def test(self): + train_dataloader = dict( + dataset=DummyDataset(), + sampler=dict(type='DefaultSampler', shuffle=True), + batch_size=3, + num_workers=0) + + runner = Mock() + runner.model = Mock() + runner.model.module = Mock() + + runner.model.bbox_head.use_bbox_aux = False + runner.cfg.train_dataloader = Config(train_dataloader) + runner.train_dataloader = Runner.build_dataloader(train_dataloader) + runner.train_dataloader.dataset.pipeline = pipeline1 + + hook = YOLOXModeSwitchHook( + num_last_epochs=15, new_train_pipeline=pipeline2) + + # test after change mode + runner.epoch = 284 + runner.max_epochs = 300 + hook.before_train_epoch(runner) + self.assertTrue(runner.model.bbox_head.use_bbox_aux) + self.assertEqual(runner.train_loop.dataloader.dataset.pipeline, + pipeline2) diff --git a/third_party/mmyolo/tests/test_engine/test_optimizers/__init__.py b/third_party/mmyolo/tests/test_engine/test_optimizers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ef101fec61e72abc0eb90266d453b5b22331378d --- /dev/null +++ b/third_party/mmyolo/tests/test_engine/test_optimizers/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. diff --git a/third_party/mmyolo/tests/test_engine/test_optimizers/test_yolov5_optim_constructor.py b/third_party/mmyolo/tests/test_engine/test_optimizers/test_yolov5_optim_constructor.py new file mode 100644 index 0000000000000000000000000000000000000000..4830e5cd604f99bb40f783c4815e124a37f11c96 --- /dev/null +++ b/third_party/mmyolo/tests/test_engine/test_optimizers/test_yolov5_optim_constructor.py @@ -0,0 +1,81 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
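+# Tests for YOLOv5OptimizerConstructor: parameters are split into three
+# groups, only the first of which keeps weight decay, and the decay is
+# scaled linearly with `batch_size_per_gpu` against the base total batch.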
+ +import copy +from unittest import TestCase + +import torch +import torch.nn as nn +from mmengine.optim import build_optim_wrapper + +from mmyolo.engine import YOLOv5OptimizerConstructor +from mmyolo.utils import register_all_modules + +register_all_modules() + + +class ExampleModel(nn.Module): + + def __init__(self): + super().__init__() + self.param1 = nn.Parameter(torch.ones(1)) + self.conv1 = nn.Conv2d(3, 4, kernel_size=1, bias=False) + self.conv2 = nn.Conv2d(4, 2, kernel_size=1) + self.bn = nn.BatchNorm2d(2) + + +class TestYOLOv5OptimizerConstructor(TestCase): + + def setUp(self): + self.model = ExampleModel() + self.base_lr = 0.01 + self.weight_decay = 0.0001 + self.optim_wrapper_cfg = dict( + type='OptimWrapper', + optimizer=dict( + type='SGD', + lr=self.base_lr, + momentum=0.9, + weight_decay=self.weight_decay, + batch_size_per_gpu=16)) + + def test_init(self): + YOLOv5OptimizerConstructor(copy.deepcopy(self.optim_wrapper_cfg)) + YOLOv5OptimizerConstructor( + copy.deepcopy(self.optim_wrapper_cfg), + paramwise_cfg={'base_total_batch_size': 64}) + + # `paramwise_cfg` must include `base_total_batch_size` if not None. + with self.assertRaises(AssertionError): + YOLOv5OptimizerConstructor( + copy.deepcopy(self.optim_wrapper_cfg), paramwise_cfg={'a': 64}) + + def test_build(self): + optim_wrapper = YOLOv5OptimizerConstructor( + copy.deepcopy(self.optim_wrapper_cfg))( + self.model) + # test param_groups + assert len(optim_wrapper.optimizer.param_groups) == 3 + for i in range(3): + param_groups_i = optim_wrapper.optimizer.param_groups[i] + assert param_groups_i['lr'] == self.base_lr + if i == 0: + assert param_groups_i['weight_decay'] == self.weight_decay + else: + assert param_groups_i['weight_decay'] == 0 + + # test weight_decay linear scaling + optim_wrapper_cfg = copy.deepcopy(self.optim_wrapper_cfg) + optim_wrapper_cfg['optimizer']['batch_size_per_gpu'] = 128 + optim_wrapper = YOLOv5OptimizerConstructor(optim_wrapper_cfg)( + self.model) + assert optim_wrapper.optimizer.param_groups[0][ + 'weight_decay'] == self.weight_decay * 2 + + # test without batch_size_per_gpu + optim_wrapper_cfg = copy.deepcopy(self.optim_wrapper_cfg) + optim_wrapper_cfg['optimizer'].pop('batch_size_per_gpu') + optim_wrapper = dict( + optim_wrapper_cfg, constructor='YOLOv5OptimizerConstructor') + optim_wrapper = build_optim_wrapper(self.model, optim_wrapper) + assert optim_wrapper.optimizer.param_groups[0][ + 'weight_decay'] == self.weight_decay diff --git a/third_party/mmyolo/tests/test_engine/test_optimizers/test_yolov7_optim_wrapper_constructor.py b/third_party/mmyolo/tests/test_engine/test_optimizers/test_yolov7_optim_wrapper_constructor.py new file mode 100644 index 0000000000000000000000000000000000000000..a2f445bedd7b86ffaa00f4c74affa990eaeb663e --- /dev/null +++ b/third_party/mmyolo/tests/test_engine/test_optimizers/test_yolov7_optim_wrapper_constructor.py @@ -0,0 +1,81 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
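+# Tests for YOLOv7OptimWrapperConstructor: mirrors the YOLOv5 constructor
+# tests above (three parameter groups, batch-size-scaled weight decay).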
+ +import copy +from unittest import TestCase + +import torch +import torch.nn as nn +from mmengine.optim import build_optim_wrapper + +from mmyolo.engine import YOLOv7OptimWrapperConstructor +from mmyolo.utils import register_all_modules + +register_all_modules() + + +class ExampleModel(nn.Module): + + def __init__(self): + super().__init__() + self.param1 = nn.Parameter(torch.ones(1)) + self.conv1 = nn.Conv2d(3, 4, kernel_size=1, bias=False) + self.conv2 = nn.Conv2d(4, 2, kernel_size=1) + self.bn = nn.BatchNorm2d(2) + + +class TestYOLOv7OptimWrapperConstructor(TestCase): + + def setUp(self): + self.model = ExampleModel() + self.base_lr = 0.01 + self.weight_decay = 0.0001 + self.optim_wrapper_cfg = dict( + type='OptimWrapper', + optimizer=dict( + type='SGD', + lr=self.base_lr, + momentum=0.9, + weight_decay=self.weight_decay, + batch_size_per_gpu=16)) + + def test_init(self): + YOLOv7OptimWrapperConstructor(copy.deepcopy(self.optim_wrapper_cfg)) + YOLOv7OptimWrapperConstructor( + copy.deepcopy(self.optim_wrapper_cfg), + paramwise_cfg={'base_total_batch_size': 64}) + + # `paramwise_cfg` must include `base_total_batch_size` if not None. + with self.assertRaises(AssertionError): + YOLOv7OptimWrapperConstructor( + copy.deepcopy(self.optim_wrapper_cfg), paramwise_cfg={'a': 64}) + + def test_build(self): + optim_wrapper = YOLOv7OptimWrapperConstructor( + copy.deepcopy(self.optim_wrapper_cfg))( + self.model) + # test param_groups + assert len(optim_wrapper.optimizer.param_groups) == 3 + for i in range(3): + param_groups_i = optim_wrapper.optimizer.param_groups[i] + assert param_groups_i['lr'] == self.base_lr + if i == 0: + assert param_groups_i['weight_decay'] == self.weight_decay + else: + assert param_groups_i['weight_decay'] == 0 + + # test weight_decay linear scaling + optim_wrapper_cfg = copy.deepcopy(self.optim_wrapper_cfg) + optim_wrapper_cfg['optimizer']['batch_size_per_gpu'] = 128 + optim_wrapper = YOLOv7OptimWrapperConstructor(optim_wrapper_cfg)( + self.model) + assert optim_wrapper.optimizer.param_groups[0][ + 'weight_decay'] == self.weight_decay * 2 + + # test without batch_size_per_gpu + optim_wrapper_cfg = copy.deepcopy(self.optim_wrapper_cfg) + optim_wrapper_cfg['optimizer'].pop('batch_size_per_gpu') + optim_wrapper = dict( + optim_wrapper_cfg, constructor='YOLOv7OptimWrapperConstructor') + optim_wrapper = build_optim_wrapper(self.model, optim_wrapper) + assert optim_wrapper.optimizer.param_groups[0][ + 'weight_decay'] == self.weight_decay diff --git a/third_party/mmyolo/tests/test_models/__init__.py b/third_party/mmyolo/tests/test_models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ef101fec61e72abc0eb90266d453b5b22331378d --- /dev/null +++ b/third_party/mmyolo/tests/test_models/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. diff --git a/third_party/mmyolo/tests/test_models/test_backbone/__init__.py b/third_party/mmyolo/tests/test_models/test_backbone/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ef101fec61e72abc0eb90266d453b5b22331378d --- /dev/null +++ b/third_party/mmyolo/tests/test_models/test_backbone/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
diff --git a/third_party/mmyolo/tests/test_models/test_backbone/test_csp_darknet.py b/third_party/mmyolo/tests/test_models/test_backbone/test_csp_darknet.py new file mode 100644 index 0000000000000000000000000000000000000000..82dceb55f90558b8d6bec48254640e248e7ba772 --- /dev/null +++ b/third_party/mmyolo/tests/test_models/test_backbone/test_csp_darknet.py @@ -0,0 +1,119 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from unittest import TestCase + +import pytest +import torch +from parameterized import parameterized +from torch.nn.modules.batchnorm import _BatchNorm + +from mmyolo.models.backbones import (YOLOv5CSPDarknet, YOLOv8CSPDarknet, + YOLOXCSPDarknet) +from mmyolo.utils import register_all_modules +from .utils import check_norm_state, is_norm + +register_all_modules() + + +class TestCSPDarknet(TestCase): + + @parameterized.expand([(YOLOv5CSPDarknet, ), (YOLOXCSPDarknet, ), + (YOLOv8CSPDarknet, )]) + def test_init(self, module_class): + # out_indices in range(len(arch_setting) + 1) + with pytest.raises(AssertionError): + module_class(out_indices=(6, )) + + with pytest.raises(ValueError): + # frozen_stages must in range(-1, len(arch_setting) + 1) + module_class(frozen_stages=6) + + @parameterized.expand([(YOLOv5CSPDarknet, ), (YOLOXCSPDarknet, ), + (YOLOv8CSPDarknet, )]) + def test_forward(self, module_class): + # Test CSPDarknet with first stage frozen + frozen_stages = 1 + model = module_class(frozen_stages=frozen_stages) + model.init_weights() + model.train() + + for mod in model.stem.modules(): + for param in mod.parameters(): + assert param.requires_grad is False + for i in range(1, frozen_stages + 1): + layer = getattr(model, f'stage{i}') + for mod in layer.modules(): + if isinstance(mod, _BatchNorm): + assert mod.training is False + for param in layer.parameters(): + assert param.requires_grad is False + + # Test CSPDarknet with norm_eval=True + model = module_class(norm_eval=True) + model.train() + + assert check_norm_state(model.modules(), False) + + # Test CSPDarknet-P5 forward with widen_factor=0.25 + model = module_class( + arch='P5', widen_factor=0.25, out_indices=range(0, 5)) + model.train() + + imgs = torch.randn(1, 3, 64, 64) + feat = model(imgs) + assert len(feat) == 5 + assert feat[0].shape == torch.Size((1, 16, 32, 32)) + assert feat[1].shape == torch.Size((1, 32, 16, 16)) + assert feat[2].shape == torch.Size((1, 64, 8, 8)) + assert feat[3].shape == torch.Size((1, 128, 4, 4)) + assert feat[4].shape == torch.Size((1, 256, 2, 2)) + + # Test CSPDarknet forward with dict(type='ReLU') + model = module_class( + widen_factor=0.125, + act_cfg=dict(type='ReLU'), + out_indices=range(0, 5)) + model.train() + + imgs = torch.randn(1, 3, 64, 64) + feat = model(imgs) + assert len(feat) == 5 + assert feat[0].shape == torch.Size((1, 8, 32, 32)) + assert feat[1].shape == torch.Size((1, 16, 16, 16)) + assert feat[2].shape == torch.Size((1, 32, 8, 8)) + assert feat[3].shape == torch.Size((1, 64, 4, 4)) + assert feat[4].shape == torch.Size((1, 128, 2, 2)) + + # Test CSPDarknet with BatchNorm forward + model = module_class(widen_factor=0.125, out_indices=range(0, 5)) + for m in model.modules(): + if is_norm(m): + assert isinstance(m, _BatchNorm) + model.train() + + imgs = torch.randn(1, 3, 64, 64) + feat = model(imgs) + assert len(feat) == 5 + assert feat[0].shape == torch.Size((1, 8, 32, 32)) + assert feat[1].shape == torch.Size((1, 16, 16, 16)) + assert feat[2].shape == torch.Size((1, 32, 8, 8)) + assert feat[3].shape == torch.Size((1, 64, 4, 4)) + assert feat[4].shape == 
torch.Size((1, 128, 2, 2)) + + # Test CSPDarknet with Dropout Block + model = module_class(plugins=[ + dict( + cfg=dict(type='mmdet.DropBlock', drop_prob=0.1, block_size=3), + stages=(False, False, True, True)), + ]) + + assert len(model.stage1) == 2 + assert len(model.stage2) == 2 + assert len(model.stage3) == 3 # +DropBlock + assert len(model.stage4) == 4 # +SPPF+DropBlock + model.train() + imgs = torch.randn(1, 3, 256, 256) + feat = model(imgs) + assert len(feat) == 3 + assert feat[0].shape == torch.Size((1, 256, 32, 32)) + assert feat[1].shape == torch.Size((1, 512, 16, 16)) + assert feat[2].shape == torch.Size((1, 1024, 8, 8)) diff --git a/third_party/mmyolo/tests/test_models/test_backbone/test_csp_resnet.py b/third_party/mmyolo/tests/test_models/test_backbone/test_csp_resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..dd0f3c473a8adbf5fa139bff50a7d39006657065 --- /dev/null +++ b/third_party/mmyolo/tests/test_models/test_backbone/test_csp_resnet.py @@ -0,0 +1,113 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from unittest import TestCase + +import pytest +import torch +from torch.nn.modules.batchnorm import _BatchNorm + +from mmyolo.models import PPYOLOECSPResNet +from mmyolo.utils import register_all_modules +from .utils import check_norm_state, is_norm + +register_all_modules() + + +class TestPPYOLOECSPResNet(TestCase): + + def test_init(self): + # out_indices in range(len(arch_setting) + 1) + with pytest.raises(AssertionError): + PPYOLOECSPResNet(out_indices=(6, )) + + with pytest.raises(ValueError): + # frozen_stages must in range(-1, len(arch_setting) + 1) + PPYOLOECSPResNet(frozen_stages=6) + + def test_forward(self): + # Test PPYOLOECSPResNet with first stage frozen + frozen_stages = 1 + model = PPYOLOECSPResNet(frozen_stages=frozen_stages) + model.init_weights() + model.train() + + for mod in model.stem.modules(): + for param in mod.parameters(): + assert param.requires_grad is False + for i in range(1, frozen_stages + 1): + layer = getattr(model, f'stage{i}') + for mod in layer.modules(): + if isinstance(mod, _BatchNorm): + assert mod.training is False + for param in layer.parameters(): + assert param.requires_grad is False + + # Test PPYOLOECSPResNet with norm_eval=True + model = PPYOLOECSPResNet(norm_eval=True) + model.train() + + assert check_norm_state(model.modules(), False) + + # Test PPYOLOECSPResNet-P5 forward with widen_factor=0.25 + model = PPYOLOECSPResNet( + arch='P5', widen_factor=0.25, out_indices=range(0, 5)) + model.train() + + imgs = torch.randn(1, 3, 64, 64) + feat = model(imgs) + assert len(feat) == 5 + assert feat[0].shape == torch.Size((1, 16, 32, 32)) + assert feat[1].shape == torch.Size((1, 32, 16, 16)) + assert feat[2].shape == torch.Size((1, 64, 8, 8)) + assert feat[3].shape == torch.Size((1, 128, 4, 4)) + assert feat[4].shape == torch.Size((1, 256, 2, 2)) + + # Test PPYOLOECSPResNet forward with dict(type='ReLU') + model = PPYOLOECSPResNet( + widen_factor=0.125, + act_cfg=dict(type='ReLU'), + out_indices=range(0, 5)) + model.train() + + imgs = torch.randn(1, 3, 64, 64) + feat = model(imgs) + assert len(feat) == 5 + assert feat[0].shape == torch.Size((1, 8, 32, 32)) + assert feat[1].shape == torch.Size((1, 16, 16, 16)) + assert feat[2].shape == torch.Size((1, 32, 8, 8)) + assert feat[3].shape == torch.Size((1, 64, 4, 4)) + assert feat[4].shape == torch.Size((1, 128, 2, 2)) + + # Test PPYOLOECSPResNet with BatchNorm forward + model = PPYOLOECSPResNet(widen_factor=0.125, out_indices=range(0, 5)) + for m in 
model.modules(): + if is_norm(m): + assert isinstance(m, _BatchNorm) + model.train() + + imgs = torch.randn(1, 3, 64, 64) + feat = model(imgs) + assert len(feat) == 5 + assert feat[0].shape == torch.Size((1, 8, 32, 32)) + assert feat[1].shape == torch.Size((1, 16, 16, 16)) + assert feat[2].shape == torch.Size((1, 32, 8, 8)) + assert feat[3].shape == torch.Size((1, 64, 4, 4)) + assert feat[4].shape == torch.Size((1, 128, 2, 2)) + + # Test PPYOLOECSPResNet with BatchNorm forward + model = PPYOLOECSPResNet(plugins=[ + dict( + cfg=dict(type='mmdet.DropBlock', drop_prob=0.1, block_size=3), + stages=(False, False, True, True)), + ]) + + assert len(model.stage1) == 1 + assert len(model.stage2) == 1 + assert len(model.stage3) == 2 # +DropBlock + assert len(model.stage4) == 2 # +DropBlock + model.train() + imgs = torch.randn(1, 3, 256, 256) + feat = model(imgs) + assert len(feat) == 3 + assert feat[0].shape == torch.Size((1, 256, 32, 32)) + assert feat[1].shape == torch.Size((1, 512, 16, 16)) + assert feat[2].shape == torch.Size((1, 1024, 8, 8)) diff --git a/third_party/mmyolo/tests/test_models/test_backbone/test_efficient_rep.py b/third_party/mmyolo/tests/test_models/test_backbone/test_efficient_rep.py new file mode 100644 index 0000000000000000000000000000000000000000..53af20294137b0d29a67e4f1946fe9fd79991f80 --- /dev/null +++ b/third_party/mmyolo/tests/test_models/test_backbone/test_efficient_rep.py @@ -0,0 +1,202 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from unittest import TestCase + +import pytest +import torch +from torch.nn.modules.batchnorm import _BatchNorm + +from mmyolo.models.backbones import YOLOv6CSPBep, YOLOv6EfficientRep +from mmyolo.utils import register_all_modules +from .utils import check_norm_state, is_norm + +register_all_modules() + + +class TestYOLOv6EfficientRep(TestCase): + + def test_init(self): + # out_indices in range(len(arch_setting) + 1) + with pytest.raises(AssertionError): + YOLOv6EfficientRep(out_indices=(6, )) + + with pytest.raises(ValueError): + # frozen_stages must in range(-1, len(arch_setting) + 1) + YOLOv6EfficientRep(frozen_stages=6) + + def test_YOLOv6EfficientRep_forward(self): + # Test YOLOv6EfficientRep with first stage frozen + frozen_stages = 1 + model = YOLOv6EfficientRep(frozen_stages=frozen_stages) + model.init_weights() + model.train() + + for mod in model.stem.modules(): + for param in mod.parameters(): + assert param.requires_grad is False + for i in range(1, frozen_stages + 1): + layer = getattr(model, f'stage{i}') + for mod in layer.modules(): + if isinstance(mod, _BatchNorm): + assert mod.training is False + for param in layer.parameters(): + assert param.requires_grad is False + + # Test YOLOv6EfficientRep with norm_eval=True + model = YOLOv6EfficientRep(norm_eval=True) + model.train() + + assert check_norm_state(model.modules(), False) + + # Test YOLOv6EfficientRep-P5 forward with widen_factor=0.25 + model = YOLOv6EfficientRep( + arch='P5', widen_factor=0.25, out_indices=range(0, 5)) + model.train() + + imgs = torch.randn(1, 3, 64, 64) + feat = model(imgs) + assert len(feat) == 5 + assert feat[0].shape == torch.Size((1, 16, 32, 32)) + assert feat[1].shape == torch.Size((1, 32, 16, 16)) + assert feat[2].shape == torch.Size((1, 64, 8, 8)) + assert feat[3].shape == torch.Size((1, 128, 4, 4)) + assert feat[4].shape == torch.Size((1, 256, 2, 2)) + + # Test YOLOv6EfficientRep forward with dict(type='ReLU') + model = YOLOv6EfficientRep( + widen_factor=0.125, + act_cfg=dict(type='ReLU'), + out_indices=range(0, 5)) + model.train() + + 
imgs = torch.randn(1, 3, 64, 64) + feat = model(imgs) + assert len(feat) == 5 + assert feat[0].shape == torch.Size((1, 8, 32, 32)) + assert feat[1].shape == torch.Size((1, 16, 16, 16)) + assert feat[2].shape == torch.Size((1, 32, 8, 8)) + assert feat[3].shape == torch.Size((1, 64, 4, 4)) + assert feat[4].shape == torch.Size((1, 128, 2, 2)) + + # Test YOLOv6EfficientRep with BatchNorm forward + model = YOLOv6EfficientRep(widen_factor=0.125, out_indices=range(0, 5)) + for m in model.modules(): + if is_norm(m): + assert isinstance(m, _BatchNorm) + model.train() + + imgs = torch.randn(1, 3, 64, 64) + feat = model(imgs) + assert len(feat) == 5 + assert feat[0].shape == torch.Size((1, 8, 32, 32)) + assert feat[1].shape == torch.Size((1, 16, 16, 16)) + assert feat[2].shape == torch.Size((1, 32, 8, 8)) + assert feat[3].shape == torch.Size((1, 64, 4, 4)) + assert feat[4].shape == torch.Size((1, 128, 2, 2)) + + # Test YOLOv6EfficientRep with BatchNorm forward + model = YOLOv6EfficientRep(plugins=[ + dict( + cfg=dict(type='mmdet.DropBlock', drop_prob=0.1, block_size=3), + stages=(False, False, True, True)), + ]) + + assert len(model.stage1) == 1 + assert len(model.stage2) == 1 + assert len(model.stage3) == 2 # +DropBlock + assert len(model.stage4) == 3 # +SPPF+DropBlock + model.train() + imgs = torch.randn(1, 3, 256, 256) + feat = model(imgs) + assert len(feat) == 3 + assert feat[0].shape == torch.Size((1, 256, 32, 32)) + assert feat[1].shape == torch.Size((1, 512, 16, 16)) + assert feat[2].shape == torch.Size((1, 1024, 8, 8)) + + def test_YOLOv6CSPBep_forward(self): + # Test YOLOv6CSPBep with first stage frozen + frozen_stages = 1 + model = YOLOv6CSPBep(frozen_stages=frozen_stages) + model.init_weights() + model.train() + + for mod in model.stem.modules(): + for param in mod.parameters(): + assert param.requires_grad is False + for i in range(1, frozen_stages + 1): + layer = getattr(model, f'stage{i}') + for mod in layer.modules(): + if isinstance(mod, _BatchNorm): + assert mod.training is False + for param in layer.parameters(): + assert param.requires_grad is False + + # Test YOLOv6CSPBep with norm_eval=True + model = YOLOv6CSPBep(norm_eval=True) + model.train() + + assert check_norm_state(model.modules(), False) + + # Test YOLOv6CSPBep forward with widen_factor=0.25 + model = YOLOv6CSPBep( + arch='P5', widen_factor=0.25, out_indices=range(0, 5)) + model.train() + + imgs = torch.randn(1, 3, 64, 64) + feat = model(imgs) + assert len(feat) == 5 + assert feat[0].shape == torch.Size((1, 16, 32, 32)) + assert feat[1].shape == torch.Size((1, 32, 16, 16)) + assert feat[2].shape == torch.Size((1, 64, 8, 8)) + assert feat[3].shape == torch.Size((1, 128, 4, 4)) + assert feat[4].shape == torch.Size((1, 256, 2, 2)) + + # Test YOLOv6CSPBep forward with dict(type='ReLU') + model = YOLOv6CSPBep( + widen_factor=0.125, + act_cfg=dict(type='ReLU'), + out_indices=range(0, 5)) + model.train() + + imgs = torch.randn(1, 3, 64, 64) + feat = model(imgs) + assert len(feat) == 5 + assert feat[0].shape == torch.Size((1, 8, 32, 32)) + assert feat[1].shape == torch.Size((1, 16, 16, 16)) + assert feat[2].shape == torch.Size((1, 32, 8, 8)) + assert feat[3].shape == torch.Size((1, 64, 4, 4)) + assert feat[4].shape == torch.Size((1, 128, 2, 2)) + + # Test YOLOv6CSPBep with BatchNorm forward + model = YOLOv6CSPBep(widen_factor=0.125, out_indices=range(0, 5)) + for m in model.modules(): + if is_norm(m): + assert isinstance(m, _BatchNorm) + model.train() + + imgs = torch.randn(1, 3, 64, 64) + feat = model(imgs) + assert len(feat) == 
5 + assert feat[0].shape == torch.Size((1, 8, 32, 32)) + assert feat[1].shape == torch.Size((1, 16, 16, 16)) + assert feat[2].shape == torch.Size((1, 32, 8, 8)) + assert feat[3].shape == torch.Size((1, 64, 4, 4)) + assert feat[4].shape == torch.Size((1, 128, 2, 2)) + + # Test YOLOv6CSPBep with BatchNorm forward + model = YOLOv6CSPBep(plugins=[ + dict( + cfg=dict(type='mmdet.DropBlock', drop_prob=0.1, block_size=3), + stages=(False, False, True, True)), + ]) + + assert len(model.stage1) == 1 + assert len(model.stage2) == 1 + assert len(model.stage3) == 2 # +DropBlock + assert len(model.stage4) == 3 # +SPPF+DropBlock + model.train() + imgs = torch.randn(1, 3, 256, 256) + feat = model(imgs) + assert len(feat) == 3 + assert feat[0].shape == torch.Size((1, 256, 32, 32)) + assert feat[1].shape == torch.Size((1, 512, 16, 16)) + assert feat[2].shape == torch.Size((1, 1024, 8, 8)) diff --git a/third_party/mmyolo/tests/test_models/test_backbone/test_yolov7_backbone.py b/third_party/mmyolo/tests/test_models/test_backbone/test_yolov7_backbone.py new file mode 100644 index 0000000000000000000000000000000000000000..76b40aa44b99ea1509be6768a6c4287652961ad0 --- /dev/null +++ b/third_party/mmyolo/tests/test_models/test_backbone/test_yolov7_backbone.py @@ -0,0 +1,154 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from unittest import TestCase + +import pytest +import torch +from torch.nn.modules.batchnorm import _BatchNorm + +from mmyolo.models.backbones import YOLOv7Backbone +from mmyolo.utils import register_all_modules +from .utils import check_norm_state + +register_all_modules() + + +class TestYOLOv7Backbone(TestCase): + + def test_init(self): + # out_indices in range(len(arch_setting) + 1) + with pytest.raises(AssertionError): + YOLOv7Backbone(out_indices=(6, )) + + with pytest.raises(ValueError): + # frozen_stages must in range(-1, len(arch_setting) + 1) + YOLOv7Backbone(frozen_stages=6) + + def test_forward(self): + # Test YOLOv7Backbone-L with first stage frozen + frozen_stages = 1 + model = YOLOv7Backbone(frozen_stages=frozen_stages) + model.init_weights() + model.train() + + for mod in model.stem.modules(): + for param in mod.parameters(): + assert param.requires_grad is False + for i in range(1, frozen_stages + 1): + layer = getattr(model, f'stage{i}') + for mod in layer.modules(): + if isinstance(mod, _BatchNorm): + assert mod.training is False + for param in layer.parameters(): + assert param.requires_grad is False + + # Test YOLOv7Backbone-L with norm_eval=True + model = YOLOv7Backbone(norm_eval=True) + model.train() + + assert check_norm_state(model.modules(), False) + + # Test YOLOv7Backbone-L forward with widen_factor=0.25 + model = YOLOv7Backbone( + widen_factor=0.25, out_indices=tuple(range(0, 5))) + model.train() + + imgs = torch.randn(1, 3, 64, 64) + feat = model(imgs) + assert len(feat) == 5 + assert feat[0].shape == torch.Size((1, 16, 32, 32)) + assert feat[1].shape == torch.Size((1, 64, 16, 16)) + assert feat[2].shape == torch.Size((1, 128, 8, 8)) + assert feat[3].shape == torch.Size((1, 256, 4, 4)) + assert feat[4].shape == torch.Size((1, 256, 2, 2)) + + # Test YOLOv7Backbone-L with plugins + model = YOLOv7Backbone( + widen_factor=0.25, + plugins=[ + dict( + cfg=dict( + type='mmdet.DropBlock', drop_prob=0.1, block_size=3), + stages=(False, False, True, True)), + ]) + + assert len(model.stage1) == 2 + assert len(model.stage2) == 2 + assert len(model.stage3) == 3 # +DropBlock + assert len(model.stage4) == 3 # +DropBlock + model.train() + imgs = torch.randn(1, 3, 128, 128) + 
feat = model(imgs) + assert len(feat) == 3 + assert feat[0].shape == torch.Size((1, 128, 16, 16)) + assert feat[1].shape == torch.Size((1, 256, 8, 8)) + assert feat[2].shape == torch.Size((1, 256, 4, 4)) + + # Test YOLOv7Backbone-X forward with widen_factor=0.25 + model = YOLOv7Backbone(arch='X', widen_factor=0.25) + model.train() + + imgs = torch.randn(1, 3, 64, 64) + feat = model(imgs) + assert len(feat) == 3 + assert feat[0].shape == torch.Size((1, 160, 8, 8)) + assert feat[1].shape == torch.Size((1, 320, 4, 4)) + assert feat[2].shape == torch.Size((1, 320, 2, 2)) + + # Test YOLOv7Backbone-tiny forward with widen_factor=0.25 + model = YOLOv7Backbone(arch='Tiny', widen_factor=0.25) + model.train() + + feat = model(imgs) + assert len(feat) == 3 + assert feat[0].shape == torch.Size((1, 32, 8, 8)) + assert feat[1].shape == torch.Size((1, 64, 4, 4)) + assert feat[2].shape == torch.Size((1, 128, 2, 2)) + + # Test YOLOv7Backbone-w forward with widen_factor=0.25 + model = YOLOv7Backbone( + arch='W', widen_factor=0.25, out_indices=(2, 3, 4, 5)) + model.train() + + imgs = torch.randn(1, 3, 128, 128) + feat = model(imgs) + assert len(feat) == 4 + assert feat[0].shape == torch.Size((1, 64, 16, 16)) + assert feat[1].shape == torch.Size((1, 128, 8, 8)) + assert feat[2].shape == torch.Size((1, 192, 4, 4)) + assert feat[3].shape == torch.Size((1, 256, 2, 2)) + + # Test YOLOv7Backbone-w forward with widen_factor=0.25 + model = YOLOv7Backbone( + arch='D', widen_factor=0.25, out_indices=(2, 3, 4, 5)) + model.train() + + feat = model(imgs) + assert len(feat) == 4 + assert feat[0].shape == torch.Size((1, 96, 16, 16)) + assert feat[1].shape == torch.Size((1, 192, 8, 8)) + assert feat[2].shape == torch.Size((1, 288, 4, 4)) + assert feat[3].shape == torch.Size((1, 384, 2, 2)) + + # Test YOLOv7Backbone-w forward with widen_factor=0.25 + model = YOLOv7Backbone( + arch='E', widen_factor=0.25, out_indices=(2, 3, 4, 5)) + model.train() + + feat = model(imgs) + assert len(feat) == 4 + assert feat[0].shape == torch.Size((1, 80, 16, 16)) + assert feat[1].shape == torch.Size((1, 160, 8, 8)) + assert feat[2].shape == torch.Size((1, 240, 4, 4)) + assert feat[3].shape == torch.Size((1, 320, 2, 2)) + + # Test YOLOv7Backbone-w forward with widen_factor=0.25 + model = YOLOv7Backbone( + arch='E2E', widen_factor=0.25, out_indices=(2, 3, 4, 5)) + model.train() + + feat = model(imgs) + assert len(feat) == 4 + assert feat[0].shape == torch.Size((1, 80, 16, 16)) + assert feat[1].shape == torch.Size((1, 160, 8, 8)) + assert feat[2].shape == torch.Size((1, 240, 4, 4)) + assert feat[3].shape == torch.Size((1, 320, 2, 2)) diff --git a/third_party/mmyolo/tests/test_models/test_backbone/utils.py b/third_party/mmyolo/tests/test_models/test_backbone/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..d65db568d6f1693eb457dc74b0d8c417cef1b9ea --- /dev/null +++ b/third_party/mmyolo/tests/test_models/test_backbone/utils.py @@ -0,0 +1,31 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmdet.models.backbones.res2net import Bottle2neck +from mmdet.models.backbones.resnet import BasicBlock, Bottleneck +from mmdet.models.backbones.resnext import Bottleneck as BottleneckX +from mmdet.models.layers import SimplifiedBasicBlock +from torch.nn.modules import GroupNorm +from torch.nn.modules.batchnorm import _BatchNorm + + +def is_block(modules): + """Check if is ResNet building block.""" + if isinstance(modules, (BasicBlock, Bottleneck, BottleneckX, Bottle2neck, + SimplifiedBasicBlock)): + return True + return False + + +def is_norm(modules): + """Check if is one of the norms.""" + if isinstance(modules, (GroupNorm, _BatchNorm)): + return True + return False + + +def check_norm_state(modules, train_state): + """Check if norm layer is in correct train state.""" + for mod in modules: + if isinstance(mod, _BatchNorm): + if mod.training != train_state: + return False + return True diff --git a/third_party/mmyolo/tests/test_models/test_data_preprocessor/__init__.py b/third_party/mmyolo/tests/test_models/test_data_preprocessor/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ef101fec61e72abc0eb90266d453b5b22331378d --- /dev/null +++ b/third_party/mmyolo/tests/test_models/test_data_preprocessor/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. diff --git a/third_party/mmyolo/tests/test_models/test_data_preprocessor/test_data_preprocessor.py b/third_party/mmyolo/tests/test_models/test_data_preprocessor/test_data_preprocessor.py new file mode 100644 index 0000000000000000000000000000000000000000..2c7e4415b627afe0046bc30b3b416af9deb302b6 --- /dev/null +++ b/third_party/mmyolo/tests/test_models/test_data_preprocessor/test_data_preprocessor.py @@ -0,0 +1,156 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from unittest import TestCase + +import torch +from mmdet.structures import DetDataSample +from mmengine import MessageHub + +from mmyolo.models import PPYOLOEBatchRandomResize, PPYOLOEDetDataPreprocessor +from mmyolo.models.data_preprocessors import (YOLOv5DetDataPreprocessor, + YOLOXBatchSyncRandomResize) +from mmyolo.utils import register_all_modules + +register_all_modules() + + +class TestYOLOv5DetDataPreprocessor(TestCase): + + def test_forward(self): + processor = YOLOv5DetDataPreprocessor(mean=[0, 0, 0], std=[1, 1, 1]) + + data = { + 'inputs': [torch.randint(0, 256, (3, 11, 10))], + 'data_samples': [DetDataSample()] + } + out_data = processor(data, training=False) + batch_inputs, batch_data_samples = out_data['inputs'], out_data[ + 'data_samples'] + + self.assertEqual(batch_inputs.shape, (1, 3, 11, 10)) + self.assertEqual(len(batch_data_samples), 1) + + # test channel_conversion + processor = YOLOv5DetDataPreprocessor( + mean=[0., 0., 0.], std=[1., 1., 1.], bgr_to_rgb=True) + out_data = processor(data, training=False) + batch_inputs, batch_data_samples = out_data['inputs'], out_data[ + 'data_samples'] + self.assertEqual(batch_inputs.shape, (1, 3, 11, 10)) + self.assertEqual(len(batch_data_samples), 1) + + # test padding, training=False + data = { + 'inputs': [ + torch.randint(0, 256, (3, 10, 11)), + torch.randint(0, 256, (3, 9, 14)) + ] + } + processor = YOLOv5DetDataPreprocessor( + mean=[0., 0., 0.], std=[1., 1., 1.], bgr_to_rgb=True) + out_data = processor(data, training=False) + batch_inputs, batch_data_samples = out_data['inputs'], out_data[ + 'data_samples'] + self.assertEqual(batch_inputs.shape, (2, 3, 10, 14)) + self.assertIsNone(batch_data_samples) + + # test training + data = { + 'inputs': torch.randint(0, 256, (2, 3, 10, 11)), + 'data_samples': { + 'bboxes_labels': torch.randint(0, 11, (18, 6)) + }, + } + out_data = processor(data, training=True) + batch_inputs, batch_data_samples = out_data['inputs'], out_data[ + 'data_samples'] + self.assertIn('img_metas', batch_data_samples) + self.assertIn('bboxes_labels', batch_data_samples) + self.assertEqual(batch_inputs.shape, (2, 3, 10, 11)) + self.assertIsInstance(batch_data_samples['bboxes_labels'], + torch.Tensor) + self.assertIsInstance(batch_data_samples['img_metas'], list) + + data = { + 'inputs': [torch.randint(0, 256, (3, 11, 10))], + 'data_samples': [DetDataSample()] + } + # data_samples must be dict + with self.assertRaises(AssertionError): + processor(data, training=True) + + +class TestPPYOLOEDetDataPreprocessor(TestCase): + + def test_batch_random_resize(self): + processor = PPYOLOEDetDataPreprocessor( + pad_size_divisor=32, + batch_augments=[ + dict( + type='PPYOLOEBatchRandomResize', + random_size_range=(320, 480), + interval=1, + size_divisor=32, + random_interp=True, + keep_ratio=False) + ], + mean=[0., 0., 0.], + std=[255., 255., 255.], + bgr_to_rgb=True) + self.assertTrue( + isinstance(processor.batch_augments[0], PPYOLOEBatchRandomResize)) + message_hub = MessageHub.get_instance('test_batch_random_resize') + message_hub.update_info('iter', 0) + + # test training + data = { + 'inputs': [ + torch.randint(0, 256, (3, 10, 11)), + torch.randint(0, 256, (3, 10, 11)) + ], + 'data_samples': { + 'bboxes_labels': torch.randint(0, 11, (18, 6)).float() + }, + } + out_data = processor(data, training=True) + batch_data_samples = out_data['data_samples'] + self.assertIn('img_metas', batch_data_samples) + self.assertIn('bboxes_labels', batch_data_samples) + self.assertIsInstance(batch_data_samples['bboxes_labels'], + 
torch.Tensor) + self.assertIsInstance(batch_data_samples['img_metas'], list) + + data = { + 'inputs': [torch.randint(0, 256, (3, 11, 10))], + 'data_samples': DetDataSample() + } + # data_samples must be list + with self.assertRaises(AssertionError): + processor(data, training=True) + + +class TestYOLOXDetDataPreprocessor(TestCase): + + def test_batch_sync_random_size(self): + processor = YOLOXBatchSyncRandomResize( + random_size_range=(480, 800), size_divisor=32, interval=1) + self.assertTrue(isinstance(processor, YOLOXBatchSyncRandomResize)) + message_hub = MessageHub.get_instance( + 'test_yolox_batch_sync_random_resize') + message_hub.update_info('iter', 0) + + # test training + inputs = torch.randint(0, 256, (4, 3, 10, 11)) + data_samples = {'bboxes_labels': torch.randint(0, 11, (18, 6)).float()} + + inputs, data_samples = processor(inputs, data_samples) + + self.assertIn('bboxes_labels', data_samples) + self.assertIsInstance(data_samples['bboxes_labels'], torch.Tensor) + self.assertIsInstance(inputs, torch.Tensor) + + inputs = torch.randint(0, 256, (4, 3, 10, 11)) + data_samples = DetDataSample() + + # data_samples must be dict + with self.assertRaises(AssertionError): + processor(inputs, data_samples) diff --git a/third_party/mmyolo/tests/test_models/test_dense_heads/__init__.py b/third_party/mmyolo/tests/test_models/test_dense_heads/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ef101fec61e72abc0eb90266d453b5b22331378d --- /dev/null +++ b/third_party/mmyolo/tests/test_models/test_dense_heads/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. diff --git a/third_party/mmyolo/tests/test_models/test_dense_heads/test_ppyoloe_head.py b/third_party/mmyolo/tests/test_models/test_dense_heads/test_ppyoloe_head.py new file mode 100644 index 0000000000000000000000000000000000000000..20e0c45761454f3575856babe39fa3fc95e6d5fa --- /dev/null +++ b/third_party/mmyolo/tests/test_models/test_dense_heads/test_ppyoloe_head.py @@ -0,0 +1,205 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from unittest import TestCase + +import torch +from mmengine import ConfigDict, MessageHub +from mmengine.config import Config +from mmengine.model import bias_init_with_prob +from mmengine.testing import assert_allclose + +from mmyolo.models import PPYOLOEHead +from mmyolo.utils import register_all_modules + +register_all_modules() + + +class TestPPYOLOEHead(TestCase): + + def setUp(self): + self.head_module = dict( + type='PPYOLOEHeadModule', + num_classes=4, + in_channels=[32, 64, 128], + featmap_strides=(8, 16, 32)) + + def test_init_weights(self): + head = PPYOLOEHead(head_module=self.head_module) + head.head_module.init_weights() + bias_init = bias_init_with_prob(0.01) + for conv_cls, conv_reg in zip(head.head_module.cls_preds, + head.head_module.reg_preds): + assert_allclose(conv_cls.weight.data, + torch.zeros_like(conv_cls.weight.data)) + assert_allclose(conv_reg.weight.data, + torch.zeros_like(conv_reg.weight.data)) + + assert_allclose(conv_cls.bias.data, + torch.ones_like(conv_cls.bias.data) * bias_init) + assert_allclose(conv_reg.bias.data, + torch.ones_like(conv_reg.bias.data)) + + def test_predict_by_feat(self): + s = 256 + img_metas = [{ + 'img_shape': (s, s, 3), + 'ori_shape': (s, s, 3), + 'scale_factor': (1.0, 1.0), + }] + test_cfg = Config( + dict( + multi_label=True, + nms_pre=1000, + score_thr=0.01, + nms=dict(type='nms', iou_threshold=0.7), + max_per_img=300)) + + head = PPYOLOEHead(head_module=self.head_module, test_cfg=test_cfg) + head.eval() + feat = [ + torch.rand(1, in_channels, s // feat_size, s // feat_size) + for in_channels, feat_size in [[32, 8], [64, 16], [128, 32]] + ] + cls_scores, bbox_preds = head.forward(feat) + head.predict_by_feat( + cls_scores, + bbox_preds, + None, + img_metas, + cfg=test_cfg, + rescale=True, + with_nms=True) + head.predict_by_feat( + cls_scores, + bbox_preds, + None, + img_metas, + cfg=test_cfg, + rescale=False, + with_nms=False) + + def test_loss_by_feat(self): + message_hub = MessageHub.get_instance('test_ppyoloe_loss_by_feat') + message_hub.update_info('epoch', 1) + + s = 256 + img_metas = [{ + 'img_shape': (s, s, 3), + 'batch_input_shape': (s, s), + 'scale_factor': 1, + }] + + head = PPYOLOEHead( + head_module=self.head_module, + train_cfg=ConfigDict( + initial_epoch=31, + initial_assigner=dict( + type='BatchATSSAssigner', + num_classes=4, + topk=9, + iou_calculator=dict(type='mmdet.BboxOverlaps2D')), + assigner=dict( + type='BatchTaskAlignedAssigner', + num_classes=4, + topk=13, + alpha=1, + beta=6))) + head.train() + + feat = [] + for i in range(len(self.head_module['in_channels'])): + in_channel = self.head_module['in_channels'][i] + feat_size = self.head_module['featmap_strides'][i] + feat.append( + torch.rand(1, in_channel, s // feat_size, s // feat_size)) + + cls_scores, bbox_preds, bbox_dist_preds = head.forward(feat) + + # Test that empty ground truth encourages the network to predict + # background + gt_instances = torch.empty((0, 6), dtype=torch.float32) + + empty_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, + bbox_dist_preds, gt_instances, + img_metas) + # When there is no truth, the cls loss should be nonzero but there + # should be no box loss. 
+        empty_cls_loss = empty_gt_losses['loss_cls'].sum()
+        empty_box_loss = empty_gt_losses['loss_bbox'].sum()
+        empty_dfl_loss = empty_gt_losses['loss_dfl'].sum()
+        self.assertGreater(empty_cls_loss.item(), 0,
+                           'cls loss should be non-zero')
+        self.assertEqual(
+            empty_box_loss.item(), 0,
+            'there should be no box loss when there are no true boxes')
+        self.assertEqual(
+            empty_dfl_loss.item(), 0,
+            'there should be no dfl loss when there are no true boxes')
+
+        # When truth is non-empty then both cls and box loss should be nonzero
+        # for random inputs
+        head = PPYOLOEHead(
+            head_module=self.head_module,
+            train_cfg=ConfigDict(
+                initial_epoch=31,
+                initial_assigner=dict(
+                    type='BatchATSSAssigner',
+                    num_classes=4,
+                    topk=9,
+                    iou_calculator=dict(type='mmdet.BboxOverlaps2D')),
+                assigner=dict(
+                    type='BatchTaskAlignedAssigner',
+                    num_classes=4,
+                    topk=13,
+                    alpha=1,
+                    beta=6)))
+        head.train()
+        gt_instances = torch.Tensor(
+            [[0., 0., 23.6667, 23.8757, 238.6326, 151.8874]])
+
+        one_gt_losses = head.loss_by_feat(cls_scores, bbox_preds,
+                                          bbox_dist_preds, gt_instances,
+                                          img_metas)
+        onegt_cls_loss = one_gt_losses['loss_cls'].sum()
+        onegt_box_loss = one_gt_losses['loss_bbox'].sum()
+        onegt_loss_dfl = one_gt_losses['loss_dfl'].sum()
+        self.assertGreater(onegt_cls_loss.item(), 0,
+                           'cls loss should be non-zero')
+        self.assertGreater(onegt_box_loss.item(), 0,
+                           'box loss should be non-zero')
+        self.assertGreater(onegt_loss_dfl.item(), 0,
+                           'dfl loss should be non-zero')
+
+        # test num_class = 1
+        self.head_module['num_classes'] = 1
+        head = PPYOLOEHead(
+            head_module=self.head_module,
+            train_cfg=ConfigDict(
+                initial_epoch=31,
+                initial_assigner=dict(
+                    type='BatchATSSAssigner',
+                    num_classes=1,
+                    topk=9,
+                    iou_calculator=dict(type='mmdet.BboxOverlaps2D')),
+                assigner=dict(
+                    type='BatchTaskAlignedAssigner',
+                    num_classes=1,
+                    topk=13,
+                    alpha=1,
+                    beta=6)))
+        head.train()
+        gt_instances = torch.Tensor(
+            [[0., 0., 23.6667, 23.8757, 238.6326, 151.8874]])
+        cls_scores, bbox_preds, bbox_dist_preds = head.forward(feat)
+
+        one_gt_losses = head.loss_by_feat(cls_scores, bbox_preds,
+                                          bbox_dist_preds, gt_instances,
+                                          img_metas)
+        onegt_cls_loss = one_gt_losses['loss_cls'].sum()
+        onegt_box_loss = one_gt_losses['loss_bbox'].sum()
+        onegt_loss_dfl = one_gt_losses['loss_dfl'].sum()
+        self.assertGreater(onegt_cls_loss.item(), 0,
+                           'cls loss should be non-zero')
+        self.assertGreater(onegt_box_loss.item(), 0,
+                           'box loss should be non-zero')
+        self.assertGreater(onegt_loss_dfl.item(), 0,
+                           'dfl loss should be non-zero')
diff --git a/third_party/mmyolo/tests/test_models/test_dense_heads/test_rotated_rtmdet_head.py b/third_party/mmyolo/tests/test_models/test_dense_heads/test_rotated_rtmdet_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..21e1d4d139a2cbf2815f69ffac105100bcd62f34
--- /dev/null
+++ b/third_party/mmyolo/tests/test_models/test_dense_heads/test_rotated_rtmdet_head.py
@@ -0,0 +1,264 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from unittest import TestCase + +import pytest +import torch +from mmengine.config import Config +from mmengine.structures import InstanceData + +from mmyolo.models.dense_heads import RTMDetRotatedHead +from mmyolo.utils import register_all_modules + +register_all_modules() + + +class TestRTMDetRotatedHead(TestCase): + + def setUp(self): + self.head_module = dict( + type='RTMDetRotatedSepBNHeadModule', + num_classes=4, + in_channels=1, + stacked_convs=1, + feat_channels=64, + featmap_strides=[4, 8, 16]) + + def test_init_weights(self): + head = RTMDetRotatedHead(head_module=self.head_module) + head.head_module.init_weights() + + def test_predict_by_feat(self): + s = 256 + img_metas = [{ + 'img_shape': (s, s, 3), + 'ori_shape': (s, s, 3), + 'scale_factor': (1.0, 1.0), + }] + test_cfg = dict( + multi_label=True, + decode_with_angle=True, + nms_pre=2000, + score_thr=0.01, + nms=dict(type='nms_rotated', iou_threshold=0.1), + max_per_img=300) + test_cfg = Config(test_cfg) + + head = RTMDetRotatedHead( + head_module=self.head_module, test_cfg=test_cfg) + feat = [ + torch.rand(1, 1, s // feat_size, s // feat_size) + for feat_size in [4, 8, 16] + ] + cls_scores, bbox_preds, angle_preds = head.forward(feat) + head.predict_by_feat( + cls_scores, + bbox_preds, + angle_preds, + batch_img_metas=img_metas, + cfg=test_cfg, + rescale=True, + with_nms=True) + head.predict_by_feat( + cls_scores, + bbox_preds, + angle_preds, + batch_img_metas=img_metas, + cfg=test_cfg, + rescale=False, + with_nms=False) + + def test_loss_by_feat(self): + if not torch.cuda.is_available(): + pytest.skip('test requires GPU and torch+cuda') + + s = 256 + img_metas = [{ + 'img_shape': (s, s, 3), + 'batch_input_shape': (s, s), + 'scale_factor': 1, + }] + train_cfg = dict( + assigner=dict( + type='BatchDynamicSoftLabelAssigner', + num_classes=80, + topk=13, + iou_calculator=dict(type='mmrotate.RBboxOverlaps2D'), + batch_iou=False), + allowed_border=-1, + pos_weight=-1, + debug=False) + train_cfg = Config(train_cfg) + head = RTMDetRotatedHead( + head_module=self.head_module, train_cfg=train_cfg).cuda() + + feat = [ + torch.rand(1, 1, s // feat_size, s // feat_size).cuda() + for feat_size in [4, 8, 16] + ] + cls_scores, bbox_preds, angle_preds = head.forward(feat) + + # Test that empty ground truth encourages the network to predict + # background + gt_instances = InstanceData( + bboxes=torch.empty((0, 5)).cuda(), + labels=torch.LongTensor([]).cuda()) + + empty_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, + angle_preds, [gt_instances], + img_metas) + # When there is no truth, the cls loss should be nonzero but there + # should be no box loss. 
+ empty_cls_loss = empty_gt_losses['loss_cls'].sum() + empty_box_loss = empty_gt_losses['loss_bbox'].sum() + self.assertGreater(empty_cls_loss.item(), 0, + 'classification loss should be non-zero') + self.assertEqual( + empty_box_loss.item(), 0, + 'there should be no box loss when there are no true boxes') + + # When truth is non-empty then both cls and box loss should be nonzero + # for random inputs + head = RTMDetRotatedHead( + head_module=self.head_module, train_cfg=train_cfg).cuda() + gt_instances = InstanceData( + bboxes=torch.Tensor([[130.6667, 86.8757, 100.6326, 70.8874, + 0.2]]).cuda(), + labels=torch.LongTensor([1]).cuda()) + + one_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, angle_preds, + [gt_instances], img_metas) + onegt_cls_loss = one_gt_losses['loss_cls'].sum() + onegt_box_loss = one_gt_losses['loss_bbox'].sum() + self.assertGreater(onegt_cls_loss.item(), 0, + 'cls loss should be non-zero') + self.assertGreater(onegt_box_loss.item(), 0, + 'box loss should be non-zero') + + # test num_class = 1 + self.head_module['num_classes'] = 1 + head = RTMDetRotatedHead( + head_module=self.head_module, train_cfg=train_cfg).cuda() + gt_instances = InstanceData( + bboxes=torch.Tensor([[130.6667, 86.8757, 100.6326, 70.8874, + 0.2]]).cuda(), + labels=torch.LongTensor([0]).cuda()) + + cls_scores, bbox_preds, angle_preds = head.forward(feat) + + one_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, angle_preds, + [gt_instances], img_metas) + onegt_cls_loss = one_gt_losses['loss_cls'].sum() + onegt_box_loss = one_gt_losses['loss_bbox'].sum() + self.assertGreater(onegt_cls_loss.item(), 0, + 'cls loss should be non-zero') + self.assertGreater(onegt_box_loss.item(), 0, + 'box loss should be non-zero') + + def test_hbb_loss_by_feat(self): + + s = 256 + img_metas = [{ + 'img_shape': (s, s, 3), + 'batch_input_shape': (s, s), + 'scale_factor': 1, + }] + train_cfg = dict( + assigner=dict( + type='BatchDynamicSoftLabelAssigner', + num_classes=80, + topk=13, + iou_calculator=dict(type='mmrotate.RBboxOverlaps2D'), + batch_iou=False), + allowed_border=-1, + pos_weight=-1, + debug=False) + train_cfg = Config(train_cfg) + hbb_cfg = dict( + bbox_coder=dict( + type='DistanceAnglePointCoder', angle_version='le90'), + loss_bbox=dict(type='mmdet.GIoULoss', loss_weight=2.0), + angle_coder=dict( + type='mmrotate.CSLCoder', + angle_version='le90', + omega=1, + window='gaussian', + radius=1), + loss_angle=dict( + type='mmrotate.SmoothFocalLoss', + gamma=2.0, + alpha=0.25, + loss_weight=0.2), + use_hbbox_loss=True, + ) + head = RTMDetRotatedHead( + head_module=self.head_module, **hbb_cfg, train_cfg=train_cfg) + + feat = [ + torch.rand(1, 1, s // feat_size, s // feat_size) + for feat_size in [4, 8, 16] + ] + cls_scores, bbox_preds, angle_preds = head.forward(feat) + + # Test that empty ground truth encourages the network to predict + # background + gt_instances = InstanceData( + bboxes=torch.empty((0, 5)), labels=torch.LongTensor([])) + + empty_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, + angle_preds, [gt_instances], + img_metas) + # When there is no truth, the cls loss should be nonzero but there + # should be no box loss. 
+ empty_cls_loss = empty_gt_losses['loss_cls'].sum() + empty_box_loss = empty_gt_losses['loss_bbox'].sum() + empty_angle_loss = empty_gt_losses['loss_angle'].sum() + self.assertGreater(empty_cls_loss.item(), 0, + 'classification loss should be non-zero') + self.assertEqual( + empty_box_loss.item(), 0, + 'there should be no box loss when there are no true boxes') + self.assertEqual( + empty_angle_loss.item(), 0, + 'there should be no angle loss when there are no true boxes') + + # When truth is non-empty then both cls and box loss should be nonzero + # for random inputs + head = RTMDetRotatedHead( + head_module=self.head_module, **hbb_cfg, train_cfg=train_cfg) + gt_instances = InstanceData( + bboxes=torch.Tensor([[130.6667, 86.8757, 100.6326, 70.8874, 0.2]]), + labels=torch.LongTensor([1])) + + one_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, angle_preds, + [gt_instances], img_metas) + onegt_cls_loss = one_gt_losses['loss_cls'].sum() + onegt_box_loss = one_gt_losses['loss_bbox'].sum() + onegt_angle_loss = one_gt_losses['loss_angle'].sum() + self.assertGreater(onegt_cls_loss.item(), 0, + 'cls loss should be non-zero') + self.assertGreater(onegt_box_loss.item(), 0, + 'box loss should be non-zero') + self.assertGreater(onegt_angle_loss.item(), 0, + 'angle loss should be non-zero') + + # test num_class = 1 + self.head_module['num_classes'] = 1 + head = RTMDetRotatedHead( + head_module=self.head_module, **hbb_cfg, train_cfg=train_cfg) + gt_instances = InstanceData( + bboxes=torch.Tensor([[130.6667, 86.8757, 100.6326, 70.8874, 0.2]]), + labels=torch.LongTensor([0])) + + cls_scores, bbox_preds, angle_preds = head.forward(feat) + + one_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, angle_preds, + [gt_instances], img_metas) + onegt_cls_loss = one_gt_losses['loss_cls'].sum() + onegt_box_loss = one_gt_losses['loss_bbox'].sum() + onegt_angle_loss = one_gt_losses['loss_angle'].sum() + self.assertGreater(onegt_cls_loss.item(), 0, + 'cls loss should be non-zero') + self.assertGreater(onegt_box_loss.item(), 0, + 'box loss should be non-zero') + self.assertGreater(onegt_angle_loss.item(), 0, + 'angle loss should be non-zero') diff --git a/third_party/mmyolo/tests/test_models/test_dense_heads/test_rtmdet_head.py b/third_party/mmyolo/tests/test_models/test_dense_heads/test_rtmdet_head.py new file mode 100644 index 0000000000000000000000000000000000000000..cce5ee6ffae5c697b32430b9b13cab16127450bb --- /dev/null +++ b/third_party/mmyolo/tests/test_models/test_dense_heads/test_rtmdet_head.py @@ -0,0 +1,223 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from unittest import TestCase + +import numpy as np +import torch +from mmengine.config import Config +from mmengine.structures import InstanceData + +from mmyolo.models import RTMDetInsSepBNHead +from mmyolo.models.dense_heads import RTMDetHead +from mmyolo.utils import register_all_modules + +register_all_modules() + + +class TestRTMDetHead(TestCase): + + def setUp(self): + self.head_module = dict( + type='RTMDetSepBNHeadModule', + num_classes=4, + in_channels=1, + stacked_convs=1, + feat_channels=64, + featmap_strides=[4, 8, 16]) + + def test_init_weights(self): + head = RTMDetHead(head_module=self.head_module) + head.head_module.init_weights() + + def test_predict_by_feat(self): + s = 256 + img_metas = [{ + 'img_shape': (s, s, 3), + 'ori_shape': (s, s, 3), + 'scale_factor': (1.0, 1.0), + }] + test_cfg = dict( + multi_label=True, + nms_pre=30000, + score_thr=0.001, + nms=dict(type='nms', iou_threshold=0.65), + max_per_img=300) + test_cfg = Config(test_cfg) + + head = RTMDetHead(head_module=self.head_module, test_cfg=test_cfg) + feat = [ + torch.rand(1, 1, s // feat_size, s // feat_size) + for feat_size in [4, 8, 16] + ] + cls_scores, bbox_preds = head.forward(feat) + head.predict_by_feat( + cls_scores, + bbox_preds, + batch_img_metas=img_metas, + cfg=test_cfg, + rescale=True, + with_nms=True) + head.predict_by_feat( + cls_scores, + bbox_preds, + batch_img_metas=img_metas, + cfg=test_cfg, + rescale=False, + with_nms=False) + + def test_loss_by_feat(self): + s = 256 + img_metas = [{ + 'img_shape': (s, s, 3), + 'batch_input_shape': (s, s), + 'scale_factor': 1, + }] + train_cfg = dict( + assigner=dict( + num_classes=80, + type='BatchDynamicSoftLabelAssigner', + topk=13, + iou_calculator=dict(type='mmdet.BboxOverlaps2D')), + allowed_border=-1, + pos_weight=-1, + debug=False) + train_cfg = Config(train_cfg) + head = RTMDetHead(head_module=self.head_module, train_cfg=train_cfg) + + feat = [ + torch.rand(1, 1, s // feat_size, s // feat_size) + for feat_size in [4, 8, 16] + ] + cls_scores, bbox_preds = head.forward(feat) + + # Test that empty ground truth encourages the network to predict + # background + gt_instances = InstanceData( + bboxes=torch.empty((0, 4)), labels=torch.LongTensor([])) + + empty_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, + [gt_instances], img_metas) + # When there is no truth, the cls loss should be nonzero but there + # should be no box loss. 
+ empty_cls_loss = empty_gt_losses['loss_cls'].sum() + empty_box_loss = empty_gt_losses['loss_bbox'].sum() + self.assertGreater(empty_cls_loss.item(), 0, + 'classification loss should be non-zero') + self.assertEqual( + empty_box_loss.item(), 0, + 'there should be no box loss when there are no true boxes') + + # When truth is non-empty then both cls and box loss should be nonzero + # for random inputs + head = RTMDetHead(head_module=self.head_module, train_cfg=train_cfg) + gt_instances = InstanceData( + bboxes=torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]]), + labels=torch.LongTensor([1])) + + one_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, + [gt_instances], img_metas) + onegt_cls_loss = one_gt_losses['loss_cls'].sum() + onegt_box_loss = one_gt_losses['loss_bbox'].sum() + self.assertGreater(onegt_cls_loss.item(), 0, + 'cls loss should be non-zero') + self.assertGreater(onegt_box_loss.item(), 0, + 'box loss should be non-zero') + + # test num_class = 1 + self.head_module['num_classes'] = 1 + head = RTMDetHead(head_module=self.head_module, train_cfg=train_cfg) + gt_instances = InstanceData( + bboxes=torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]]), + labels=torch.LongTensor([0])) + + cls_scores, bbox_preds = head.forward(feat) + + one_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, + [gt_instances], img_metas) + onegt_cls_loss = one_gt_losses['loss_cls'].sum() + onegt_box_loss = one_gt_losses['loss_bbox'].sum() + self.assertGreater(onegt_cls_loss.item(), 0, + 'cls loss should be non-zero') + self.assertGreater(onegt_box_loss.item(), 0, + 'box loss should be non-zero') + + +class TestRTMDetInsHead(TestCase): + + def setUp(self): + self.head_module = dict( + type='RTMDetInsSepBNHeadModule', + num_classes=4, + in_channels=1, + stacked_convs=1, + feat_channels=64, + featmap_strides=[4, 8, 16], + num_prototypes=8, + dyconv_channels=8, + num_dyconvs=3, + share_conv=True, + use_sigmoid_cls=True) + + def test_init_weights(self): + head = RTMDetInsSepBNHead(head_module=self.head_module) + head.head_module.init_weights() + + def test_predict_by_feat(self): + s = 256 + img_metas = [{ + 'img_shape': (s, s, 3), + 'ori_shape': (s, s, 3), + 'scale_factor': (1.0, 1.0), + 'pad_param': np.array([0., 0., 0., 0.]) + }] + test_cfg = dict( + multi_label=False, + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=100, + mask_thr_binary=0.5) + test_cfg = Config(test_cfg) + + head = RTMDetInsSepBNHead( + head_module=self.head_module, test_cfg=test_cfg) + feat = [ + torch.rand(1, 1, s // feat_size, s // feat_size) + for feat_size in [4, 8, 16] + ] + cls_scores, bbox_preds, kernel_preds, mask_feat = head.forward(feat) + head.predict_by_feat( + cls_scores, + bbox_preds, + kernel_preds, + mask_feat, + batch_img_metas=img_metas, + cfg=test_cfg, + rescale=True, + with_nms=True) + + img_metas_without_pad_param = [{ + 'img_shape': (s, s, 3), + 'ori_shape': (s, s, 3), + 'scale_factor': (1.0, 1.0) + }] + head.predict_by_feat( + cls_scores, + bbox_preds, + kernel_preds, + mask_feat, + batch_img_metas=img_metas_without_pad_param, + cfg=test_cfg, + rescale=True, + with_nms=True) + + with self.assertRaises(AssertionError): + head.predict_by_feat( + cls_scores, + bbox_preds, + kernel_preds, + mask_feat, + batch_img_metas=img_metas, + cfg=test_cfg, + rescale=False, + with_nms=False) diff --git a/third_party/mmyolo/tests/test_models/test_dense_heads/test_yolov5_head.py b/third_party/mmyolo/tests/test_models/test_dense_heads/test_yolov5_head.py new 
file mode 100644 index 0000000000000000000000000000000000000000..974b9a9869dbcf39e6928cadd7399b452ba93e1d --- /dev/null +++ b/third_party/mmyolo/tests/test_models/test_dense_heads/test_yolov5_head.py @@ -0,0 +1,411 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from unittest import TestCase + +import numpy as np +import torch +from mmengine.config import Config +from mmengine.structures import InstanceData + +from mmyolo.models.dense_heads import YOLOv5Head, YOLOv5InsHead +from mmyolo.utils import register_all_modules + +register_all_modules() + + +class TestYOLOv5Head(TestCase): + + def setUp(self): + self.head_module = dict( + type='YOLOv5HeadModule', + num_classes=2, + in_channels=[32, 64, 128], + featmap_strides=[8, 16, 32], + num_base_priors=3) + + def test_predict_by_feat(self): + s = 256 + img_metas = [{ + 'img_shape': (s, s, 3), + 'ori_shape': (s, s, 3), + 'scale_factor': (1.0, 1.0), + }] + test_cfg = Config( + dict( + multi_label=True, + max_per_img=300, + score_thr=0.01, + nms=dict(type='nms', iou_threshold=0.65))) + + head = YOLOv5Head(head_module=self.head_module, test_cfg=test_cfg) + + feat = [] + for i in range(len(self.head_module['in_channels'])): + in_channel = self.head_module['in_channels'][i] + feat_size = self.head_module['featmap_strides'][i] + feat.append( + torch.rand(1, in_channel, s // feat_size, s // feat_size)) + + cls_scores, bbox_preds, objectnesses = head.forward(feat) + head.predict_by_feat( + cls_scores, + bbox_preds, + objectnesses, + img_metas, + cfg=test_cfg, + rescale=True, + with_nms=True) + head.predict_by_feat( + cls_scores, + bbox_preds, + objectnesses, + img_metas, + cfg=test_cfg, + rescale=False, + with_nms=False) + + def test_loss_by_feat(self): + s = 256 + img_metas = [{ + 'img_shape': (s, s, 3), + 'batch_input_shape': (s, s), + 'scale_factor': 1, + }] + + head = YOLOv5Head(head_module=self.head_module) + + feat = [] + for i in range(len(self.head_module['in_channels'])): + in_channel = self.head_module['in_channels'][i] + feat_size = self.head_module['featmap_strides'][i] + feat.append( + torch.rand(1, in_channel, s // feat_size, s // feat_size)) + + cls_scores, bbox_preds, objectnesses = head.forward(feat) + + # Test that empty ground truth encourages the network to predict + # background + gt_instances = InstanceData( + bboxes=torch.empty((0, 4)), labels=torch.LongTensor([])) + + empty_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, + objectnesses, [gt_instances], + img_metas) + # When there is no truth, the cls loss should be nonzero but there + # should be no box loss. 
+ empty_cls_loss = empty_gt_losses['loss_cls'].sum() + empty_box_loss = empty_gt_losses['loss_bbox'].sum() + empty_obj_loss = empty_gt_losses['loss_obj'].sum() + self.assertEqual( + empty_cls_loss.item(), 0, + 'there should be no cls loss when there are no true boxes') + self.assertEqual( + empty_box_loss.item(), 0, + 'there should be no box loss when there are no true boxes') + self.assertGreater(empty_obj_loss.item(), 0, + 'objectness loss should be non-zero') + + # When truth is non-empty then both cls and box loss should be nonzero + # for random inputs + head = YOLOv5Head(head_module=self.head_module) + gt_instances = InstanceData( + bboxes=torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]]), + labels=torch.LongTensor([1])) + + one_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, objectnesses, + [gt_instances], img_metas) + onegt_cls_loss = one_gt_losses['loss_cls'].sum() + onegt_box_loss = one_gt_losses['loss_bbox'].sum() + onegt_obj_loss = one_gt_losses['loss_obj'].sum() + self.assertGreater(onegt_cls_loss.item(), 0, + 'cls loss should be non-zero') + self.assertGreater(onegt_box_loss.item(), 0, + 'box loss should be non-zero') + self.assertGreater(onegt_obj_loss.item(), 0, + 'obj loss should be non-zero') + + # test num_class = 1 + self.head_module['num_classes'] = 1 + head = YOLOv5Head(head_module=self.head_module) + gt_instances = InstanceData( + bboxes=torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]]), + labels=torch.LongTensor([0])) + + one_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, objectnesses, + [gt_instances], img_metas) + onegt_cls_loss = one_gt_losses['loss_cls'].sum() + onegt_box_loss = one_gt_losses['loss_bbox'].sum() + onegt_obj_loss = one_gt_losses['loss_obj'].sum() + self.assertEqual(onegt_cls_loss.item(), 0, + 'cls loss should be non-zero') + self.assertGreater(onegt_box_loss.item(), 0, + 'box loss should be non-zero') + self.assertGreater(onegt_obj_loss.item(), 0, + 'obj loss should be non-zero') + + def test_loss_by_feat_with_ignore(self): + s = 256 + img_metas = [{ + 'img_shape': (s, s, 3), + 'batch_input_shape': (s, s), + 'scale_factor': 1, + }] + + head = YOLOv5Head(head_module=self.head_module, ignore_iof_thr=0.8) + + feat = [] + for i in range(len(self.head_module['in_channels'])): + in_channel = self.head_module['in_channels'][i] + feat_size = self.head_module['featmap_strides'][i] + feat.append( + torch.rand(1, in_channel, s // feat_size, s // feat_size)) + + cls_scores, bbox_preds, objectnesses = head.forward(feat) + + # Test that empty ground truth encourages the network to predict + # background + gt_instances = InstanceData( + bboxes=torch.empty((0, 4)), labels=torch.LongTensor([])) + # ignore boxes + gt_instances_ignore = torch.tensor( + [[0, 0, 69.7688, 0, 619.3611, 62.2711]], dtype=torch.float32) + + empty_gt_losses = head._loss_by_feat_with_ignore( + cls_scores, bbox_preds, objectnesses, [gt_instances], img_metas, + gt_instances_ignore) + # When there is no truth, the cls loss should be nonzero but there + # should be no box loss. 
+ empty_cls_loss = empty_gt_losses['loss_cls'].sum() + empty_box_loss = empty_gt_losses['loss_bbox'].sum() + empty_obj_loss = empty_gt_losses['loss_obj'].sum() + self.assertEqual( + empty_cls_loss.item(), 0, + 'there should be no cls loss when there are no true boxes') + self.assertEqual( + empty_box_loss.item(), 0, + 'there should be no box loss when there are no true boxes') + self.assertGreater(empty_obj_loss.item(), 0, + 'objectness loss should be non-zero') + + # When truth is non-empty then both cls and box loss should be nonzero + # for random inputs + head = YOLOv5Head(head_module=self.head_module, ignore_iof_thr=0.8) + gt_instances = InstanceData( + bboxes=torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]]), + labels=torch.LongTensor([1])) + + gt_instances_ignore = torch.tensor( + [[0, 0, 69.7688, 0, 619.3611, 62.2711]], dtype=torch.float32) + + one_gt_losses = head._loss_by_feat_with_ignore(cls_scores, bbox_preds, + objectnesses, + [gt_instances], + img_metas, + gt_instances_ignore) + onegt_cls_loss = one_gt_losses['loss_cls'].sum() + onegt_box_loss = one_gt_losses['loss_bbox'].sum() + onegt_obj_loss = one_gt_losses['loss_obj'].sum() + self.assertGreater(onegt_cls_loss.item(), 0, + 'cls loss should be non-zero') + self.assertGreater(onegt_box_loss.item(), 0, + 'box loss should be non-zero') + self.assertGreater(onegt_obj_loss.item(), 0, + 'obj loss should be non-zero') + + # test num_class = 1 + self.head_module['num_classes'] = 1 + head = YOLOv5Head(head_module=self.head_module, ignore_iof_thr=0.8) + gt_instances = InstanceData( + bboxes=torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]]), + labels=torch.LongTensor([0])) + + gt_instances_ignore = torch.tensor( + [[0, 0, 69.7688, 0, 619.3611, 62.2711]], dtype=torch.float32) + + one_gt_losses = head._loss_by_feat_with_ignore(cls_scores, bbox_preds, + objectnesses, + [gt_instances], + img_metas, + gt_instances_ignore) + onegt_cls_loss = one_gt_losses['loss_cls'].sum() + onegt_box_loss = one_gt_losses['loss_bbox'].sum() + onegt_obj_loss = one_gt_losses['loss_obj'].sum() + self.assertEqual(onegt_cls_loss.item(), 0, + 'cls loss should be non-zero') + self.assertGreater(onegt_box_loss.item(), 0, + 'box loss should be non-zero') + self.assertGreater(onegt_obj_loss.item(), 0, + 'obj loss should be non-zero') + + +class TestYOLOv5InsHead(TestCase): + + def setUp(self): + self.head_module = dict( + type='YOLOv5InsHeadModule', + num_classes=4, + in_channels=[32, 64, 128], + featmap_strides=[8, 16, 32], + mask_channels=32, + proto_channels=32, + widen_factor=1.0) + + def test_init_weights(self): + head = YOLOv5InsHead(head_module=self.head_module) + head.head_module.init_weights() + + def test_predict_by_feat(self): + s = 256 + img_metas = [{ + 'img_shape': (s, s, 3), + 'ori_shape': (s, s, 3), + 'batch_input_shape': (s, s), + 'scale_factor': (1.0, 1.0), + }] + test_cfg = Config( + dict( + multi_label=True, + nms_pre=30000, + min_bbox_size=0, + score_thr=0.001, + nms=dict(type='nms', iou_threshold=0.6), + max_per_img=300, + mask_thr_binary=0.5)) + + head = YOLOv5InsHead(head_module=self.head_module, test_cfg=test_cfg) + head.eval() + + feat = [] + for i in range(len(self.head_module['in_channels'])): + in_channel = self.head_module['in_channels'][i] + feat_size = self.head_module['featmap_strides'][i] + feat.append( + torch.rand(1, in_channel, s // feat_size, s // feat_size)) + + with torch.no_grad(): + res = head.forward(feat) + cls_scores, bbox_preds, objectnesses,\ + coeff_preds, proto_preds = res + head.predict_by_feat( + 
cls_scores, + bbox_preds, + objectnesses, + coeff_preds, + proto_preds, + img_metas, + cfg=test_cfg, + rescale=True, + with_nms=True) + + with self.assertRaises(AssertionError): + head.predict_by_feat( + cls_scores, + bbox_preds, + coeff_preds, + proto_preds, + img_metas, + cfg=test_cfg, + rescale=True, + with_nms=False) + + def test_loss_by_feat(self): + s = 256 + img_metas = [{ + 'img_shape': (s, s, 3), + 'batch_input_shape': (s, s), + 'scale_factor': 1, + }] + + head = YOLOv5InsHead(head_module=self.head_module) + rng = np.random.RandomState(0) + + feat = [] + for i in range(len(self.head_module['in_channels'])): + in_channel = self.head_module['in_channels'][i] + feat_size = self.head_module['featmap_strides'][i] + feat.append( + torch.rand(1, in_channel, s // feat_size, s // feat_size)) + + cls_scores, bbox_preds, objectnesses,\ + coeff_preds, proto_preds = head.forward(feat) + + # Test that empty ground truth encourages the network to predict + # background + gt_bboxes_labels = torch.empty((0, 6)) + gt_masks = rng.rand(0, s // 4, s // 4) + + empty_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, + objectnesses, coeff_preds, + proto_preds, gt_bboxes_labels, + gt_masks, img_metas) + # When there is no truth, the cls loss should be nonzero but there + # should be no box loss. + empty_cls_loss = empty_gt_losses['loss_cls'].sum() + empty_box_loss = empty_gt_losses['loss_bbox'].sum() + empty_obj_loss = empty_gt_losses['loss_obj'].sum() + empty_mask_loss = empty_gt_losses['loss_mask'].sum() + self.assertEqual( + empty_cls_loss.item(), 0, + 'there should be no cls loss when there are no true boxes') + self.assertEqual( + empty_box_loss.item(), 0, + 'there should be no box loss when there are no true boxes') + self.assertGreater(empty_obj_loss.item(), 0, + 'objectness loss should be non-zero') + self.assertEqual( + empty_mask_loss.item(), 0, + 'there should be no mask loss when there are no true masks') + + # When truth is non-empty then both cls and box loss should be nonzero + # for random inputs + head = YOLOv5InsHead(head_module=self.head_module) + + bboxes = torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]]) + labels = torch.Tensor([1.]) + batch_id = torch.LongTensor([0]) + gt_bboxes_labels = torch.cat([batch_id[None], labels[None], bboxes], + dim=1) + gt_masks = torch.from_numpy(rng.rand(1, s // 4, s // 4)).int() + + one_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, objectnesses, + coeff_preds, proto_preds, + gt_bboxes_labels, gt_masks, + img_metas) + onegt_cls_loss = one_gt_losses['loss_cls'].sum() + onegt_box_loss = one_gt_losses['loss_bbox'].sum() + onegt_obj_loss = one_gt_losses['loss_obj'].sum() + onegt_mask_loss = one_gt_losses['loss_mask'].sum() + self.assertGreater(onegt_cls_loss.item(), 0, + 'cls loss should be non-zero') + self.assertGreater(onegt_box_loss.item(), 0, + 'box loss should be non-zero') + self.assertGreater(onegt_obj_loss.item(), 0, + 'obj loss should be non-zero') + self.assertGreater(onegt_mask_loss.item(), 0, + 'mask loss should be non-zero') + + # test num_class = 1 + self.head_module['num_classes'] = 1 + head = YOLOv5InsHead(head_module=self.head_module) + bboxes = torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]]) + labels = torch.Tensor([1.]) + batch_id = torch.LongTensor([0]) + gt_bboxes_labels = torch.cat([batch_id[None], labels[None], bboxes], + dim=1) + gt_masks = torch.from_numpy(rng.rand(1, s // 4, s // 4)).int() + + one_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, objectnesses, + coeff_preds, proto_preds, + 
gt_bboxes_labels, gt_masks, + img_metas) + onegt_cls_loss = one_gt_losses['loss_cls'].sum() + onegt_box_loss = one_gt_losses['loss_bbox'].sum() + onegt_obj_loss = one_gt_losses['loss_obj'].sum() + onegt_mask_loss = one_gt_losses['loss_mask'].sum() + self.assertEqual(onegt_cls_loss.item(), 0, + 'cls loss should be non-zero') + self.assertGreater(onegt_box_loss.item(), 0, + 'box loss should be non-zero') + self.assertGreater(onegt_obj_loss.item(), 0, + 'obj loss should be non-zero') + self.assertGreater(onegt_mask_loss.item(), 0, + 'mask loss should be non-zero') diff --git a/third_party/mmyolo/tests/test_models/test_dense_heads/test_yolov6_head.py b/third_party/mmyolo/tests/test_models/test_dense_heads/test_yolov6_head.py new file mode 100644 index 0000000000000000000000000000000000000000..5bb951d12360614b26b5d3ccf30d1c044ab0ccdc --- /dev/null +++ b/third_party/mmyolo/tests/test_models/test_dense_heads/test_yolov6_head.py @@ -0,0 +1,62 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from unittest import TestCase + +import torch +from mmengine.config import Config + +from mmyolo.models.dense_heads import YOLOv6Head +from mmyolo.utils import register_all_modules + +register_all_modules() + + +class TestYOLOv6Head(TestCase): + + def setUp(self): + self.head_module = dict( + type='YOLOv6HeadModule', + num_classes=2, + in_channels=[32, 64, 128], + featmap_strides=[8, 16, 32]) + + def test_predict_by_feat(self): + s = 256 + img_metas = [{ + 'img_shape': (s, s, 3), + 'ori_shape': (s, s, 3), + 'scale_factor': (1.0, 1.0), + }] + test_cfg = Config( + dict( + multi_label=True, + max_per_img=300, + score_thr=0.01, + nms=dict(type='nms', iou_threshold=0.65))) + + head = YOLOv6Head(head_module=self.head_module, test_cfg=test_cfg) + head.eval() + + feat = [] + for i in range(len(self.head_module['in_channels'])): + in_channel = self.head_module['in_channels'][i] + feat_size = self.head_module['featmap_strides'][i] + feat.append( + torch.rand(1, in_channel, s // feat_size, s // feat_size)) + + cls_scores, bbox_preds = head.forward(feat) + head.predict_by_feat( + cls_scores, + bbox_preds, + None, + img_metas, + cfg=test_cfg, + rescale=True, + with_nms=True) + head.predict_by_feat( + cls_scores, + bbox_preds, + None, + img_metas, + cfg=test_cfg, + rescale=False, + with_nms=False) diff --git a/third_party/mmyolo/tests/test_models/test_dense_heads/test_yolov7_head.py b/third_party/mmyolo/tests/test_models/test_dense_heads/test_yolov7_head.py new file mode 100644 index 0000000000000000000000000000000000000000..5033f97e19673af79ab9a9c3ee2c618db3ea80e0 --- /dev/null +++ b/third_party/mmyolo/tests/test_models/test_dense_heads/test_yolov7_head.py @@ -0,0 +1,145 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from unittest import TestCase + +import torch +from mmengine.config import Config +from mmengine.structures import InstanceData + +from mmyolo.models.dense_heads import YOLOv7Head +from mmyolo.utils import register_all_modules + +register_all_modules() + + +# TODO: Test YOLOv7p6HeadModule +class TestYOLOv7Head(TestCase): + + def setUp(self): + self.head_module = dict( + type='YOLOv7HeadModule', + num_classes=2, + in_channels=[32, 64, 128], + featmap_strides=[8, 16, 32], + num_base_priors=3) + + def test_predict_by_feat(self): + s = 256 + img_metas = [{ + 'img_shape': (s, s, 3), + 'ori_shape': (s, s, 3), + 'scale_factor': (1.0, 1.0), + }] + test_cfg = Config( + dict( + multi_label=True, + max_per_img=300, + score_thr=0.01, + nms=dict(type='nms', iou_threshold=0.65))) + + head = YOLOv7Head(head_module=self.head_module, test_cfg=test_cfg) + + feat = [] + for i in range(len(self.head_module['in_channels'])): + in_channel = self.head_module['in_channels'][i] + feat_size = self.head_module['featmap_strides'][i] + feat.append( + torch.rand(1, in_channel, s // feat_size, s // feat_size)) + + cls_scores, bbox_preds, objectnesses = head.forward(feat) + head.predict_by_feat( + cls_scores, + bbox_preds, + objectnesses, + img_metas, + cfg=test_cfg, + rescale=True, + with_nms=True) + head.predict_by_feat( + cls_scores, + bbox_preds, + objectnesses, + img_metas, + cfg=test_cfg, + rescale=False, + with_nms=False) + + def test_loss_by_feat(self): + s = 256 + img_metas = [{ + 'img_shape': (s, s, 3), + 'batch_input_shape': (s, s), + 'scale_factor': 1, + }] + + head = YOLOv7Head(head_module=self.head_module) + + feat = [] + for i in range(len(self.head_module['in_channels'])): + in_channel = self.head_module['in_channels'][i] + feat_size = self.head_module['featmap_strides'][i] + feat.append( + torch.rand(1, in_channel, s // feat_size, s // feat_size)) + + cls_scores, bbox_preds, objectnesses = head.forward(feat) + + # Test that empty ground truth encourages the network to predict + # background + gt_instances = InstanceData( + bboxes=torch.empty((0, 4)), labels=torch.LongTensor([])) + + empty_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, + objectnesses, [gt_instances], + img_metas) + # When there is no truth, the cls loss should be nonzero but there + # should be no box loss. 
+ empty_cls_loss = empty_gt_losses['loss_cls'].sum() + empty_box_loss = empty_gt_losses['loss_bbox'].sum() + empty_obj_loss = empty_gt_losses['loss_obj'].sum() + self.assertEqual( + empty_cls_loss.item(), 0, + 'there should be no cls loss when there are no true boxes') + self.assertEqual( + empty_box_loss.item(), 0, + 'there should be no box loss when there are no true boxes') + self.assertGreater(empty_obj_loss.item(), 0, + 'objectness loss should be non-zero') + + # When truth is non-empty then both cls and box loss should be nonzero + # for random inputs + head = YOLOv7Head(head_module=self.head_module) + gt_instances = InstanceData( + bboxes=torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]]), + labels=torch.LongTensor([1])) + + one_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, objectnesses, + [gt_instances], img_metas) + onegt_cls_loss = one_gt_losses['loss_cls'].sum() + onegt_box_loss = one_gt_losses['loss_bbox'].sum() + onegt_obj_loss = one_gt_losses['loss_obj'].sum() + self.assertGreater(onegt_cls_loss.item(), 0, + 'cls loss should be non-zero') + self.assertGreater(onegt_box_loss.item(), 0, + 'box loss should be non-zero') + self.assertGreater(onegt_obj_loss.item(), 0, + 'obj loss should be non-zero') + + # test num_class = 1 + self.head_module['num_classes'] = 1 + head = YOLOv7Head(head_module=self.head_module) + gt_instances = InstanceData( + bboxes=torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]]), + labels=torch.LongTensor([0])) + + cls_scores, bbox_preds, objectnesses = head.forward(feat) + + one_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, objectnesses, + [gt_instances], img_metas) + onegt_cls_loss = one_gt_losses['loss_cls'].sum() + onegt_box_loss = one_gt_losses['loss_bbox'].sum() + onegt_obj_loss = one_gt_losses['loss_obj'].sum() + self.assertEqual(onegt_cls_loss.item(), 0, + 'cls loss should be non-zero') + self.assertGreater(onegt_box_loss.item(), 0, + 'box loss should be non-zero') + self.assertGreater(onegt_obj_loss.item(), 0, + 'obj loss should be non-zero') diff --git a/third_party/mmyolo/tests/test_models/test_dense_heads/test_yolov8_head.py b/third_party/mmyolo/tests/test_models/test_dense_heads/test_yolov8_head.py new file mode 100644 index 0000000000000000000000000000000000000000..8980387a75bdd4ac1d3aebacf8a364e82259a01b --- /dev/null +++ b/third_party/mmyolo/tests/test_models/test_dense_heads/test_yolov8_head.py @@ -0,0 +1,161 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from unittest import TestCase + +import torch +from mmengine import ConfigDict +from mmengine.config import Config + +from mmyolo.models import YOLOv8Head +from mmyolo.utils import register_all_modules + +register_all_modules() + + +class TestYOLOv8Head(TestCase): + + def setUp(self): + self.head_module = dict( + type='YOLOv8HeadModule', + num_classes=4, + in_channels=[32, 64, 128], + featmap_strides=[8, 16, 32]) + + def test_predict_by_feat(self): + s = 256 + img_metas = [{ + 'img_shape': (s, s, 3), + 'ori_shape': (s, s, 3), + 'scale_factor': (1.0, 1.0), + }] + test_cfg = Config( + dict( + multi_label=True, + max_per_img=300, + score_thr=0.01, + nms=dict(type='nms', iou_threshold=0.65))) + + head = YOLOv8Head(head_module=self.head_module, test_cfg=test_cfg) + head.eval() + + feat = [] + for i in range(len(self.head_module['in_channels'])): + in_channel = self.head_module['in_channels'][i] + feat_size = self.head_module['featmap_strides'][i] + feat.append( + torch.rand(1, in_channel, s // feat_size, s // feat_size)) + + cls_scores, bbox_preds = head.forward(feat) + head.predict_by_feat( + cls_scores, + bbox_preds, + None, + img_metas, + cfg=test_cfg, + rescale=True, + with_nms=True) + head.predict_by_feat( + cls_scores, + bbox_preds, + None, + img_metas, + cfg=test_cfg, + rescale=False, + with_nms=False) + + def test_loss_by_feat(self): + s = 256 + img_metas = [{ + 'img_shape': (s, s, 3), + 'batch_input_shape': (s, s), + 'scale_factor': 1, + }] + + head = YOLOv8Head( + head_module=self.head_module, + train_cfg=ConfigDict( + assigner=dict( + type='BatchTaskAlignedAssigner', + num_classes=4, + topk=10, + alpha=0.5, + beta=6))) + head.train() + + feat = [] + for i in range(len(self.head_module['in_channels'])): + in_channel = self.head_module['in_channels'][i] + feat_size = self.head_module['featmap_strides'][i] + feat.append( + torch.rand(1, in_channel, s // feat_size, s // feat_size)) + + cls_scores, bbox_preds, bbox_dist_preds = head.forward(feat) + + # Test that empty ground truth encourages the network to predict + # background + gt_instances = torch.empty((0, 6), dtype=torch.float32) + + empty_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, + bbox_dist_preds, gt_instances, + img_metas) + # When there is no truth, the cls loss should be nonzero but there + # should be no box loss. 
+        empty_cls_loss = empty_gt_losses['loss_cls'].sum()
+        empty_box_loss = empty_gt_losses['loss_bbox'].sum()
+        empty_dfl_loss = empty_gt_losses['loss_dfl'].sum()
+        self.assertGreater(empty_cls_loss.item(), 0,
+                           'cls loss should be non-zero')
+        self.assertEqual(
+            empty_box_loss.item(), 0,
+            'there should be no box loss when there are no true boxes')
+        self.assertEqual(
+            empty_dfl_loss.item(), 0,
+            'there should be no dfl loss when there are no true boxes')
+
+        # When truth is non-empty then both cls and box loss should be nonzero
+        # for random inputs
+        gt_instances = torch.Tensor(
+            [[0., 0., 23.6667, 23.8757, 238.6326, 151.8874]])
+
+        one_gt_losses = head.loss_by_feat(cls_scores, bbox_preds,
+                                          bbox_dist_preds, gt_instances,
+                                          img_metas)
+        onegt_cls_loss = one_gt_losses['loss_cls'].sum()
+        onegt_box_loss = one_gt_losses['loss_bbox'].sum()
+        onegt_loss_dfl = one_gt_losses['loss_dfl'].sum()
+        self.assertGreater(onegt_cls_loss.item(), 0,
+                           'cls loss should be non-zero')
+        self.assertGreater(onegt_box_loss.item(), 0,
+                           'box loss should be non-zero')
+        self.assertGreater(onegt_loss_dfl.item(), 0,
+                           'dfl loss should be non-zero')
+
+        # test num_classes = 1
+        self.head_module['num_classes'] = 1
+        head = YOLOv8Head(
+            head_module=self.head_module,
+            train_cfg=ConfigDict(
+                assigner=dict(
+                    type='BatchTaskAlignedAssigner',
+                    num_classes=1,
+                    topk=10,
+                    alpha=0.5,
+                    beta=6)))
+        head.train()
+
+        gt_instances = torch.Tensor(
+            [[0., 0., 23.6667, 23.8757, 238.6326, 151.8874],
+             [1., 0., 24.6667, 27.8757, 28.6326, 51.8874]])
+        cls_scores, bbox_preds, bbox_dist_preds = head.forward(feat)
+
+        one_gt_losses = head.loss_by_feat(cls_scores, bbox_preds,
+                                          bbox_dist_preds, gt_instances,
+                                          img_metas)
+        onegt_cls_loss = one_gt_losses['loss_cls'].sum()
+        onegt_box_loss = one_gt_losses['loss_bbox'].sum()
+        onegt_loss_dfl = one_gt_losses['loss_dfl'].sum()
+        self.assertGreater(onegt_cls_loss.item(), 0,
+                           'cls loss should be non-zero')
+        self.assertGreater(onegt_box_loss.item(), 0,
+                           'box loss should be non-zero')
+        self.assertGreater(onegt_loss_dfl.item(), 0,
+                           'dfl loss should be non-zero')
diff --git a/third_party/mmyolo/tests/test_models/test_dense_heads/test_yolox_head.py b/third_party/mmyolo/tests/test_models/test_dense_heads/test_yolox_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..390994417c7fc9c0b2cb4470484ee3e28248a4a5
--- /dev/null
+++ b/third_party/mmyolo/tests/test_models/test_dense_heads/test_yolox_head.py
@@ -0,0 +1,379 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from unittest import TestCase + +import torch +from mmengine.config import Config +from mmengine.model import bias_init_with_prob +from mmengine.testing import assert_allclose + +from mmyolo.models.dense_heads import YOLOXHead, YOLOXPoseHead +from mmyolo.utils import register_all_modules + +register_all_modules() + + +class TestYOLOXHead(TestCase): + + def setUp(self): + self.head_module = dict( + type='YOLOXHeadModule', + num_classes=4, + in_channels=1, + stacked_convs=1, + ) + + def test_init_weights(self): + head = YOLOXHead(head_module=self.head_module) + head.head_module.init_weights() + bias_init = bias_init_with_prob(0.01) + for conv_cls, conv_obj in zip(head.head_module.multi_level_conv_cls, + head.head_module.multi_level_conv_obj): + assert_allclose(conv_cls.bias.data, + torch.ones_like(conv_cls.bias.data) * bias_init) + assert_allclose(conv_obj.bias.data, + torch.ones_like(conv_obj.bias.data) * bias_init) + + def test_predict_by_feat(self): + s = 256 + img_metas = [{ + 'img_shape': (s, s, 3), + 'ori_shape': (s, s, 3), + 'scale_factor': (1.0, 1.0), + }] + test_cfg = Config( + dict( + multi_label=True, + max_per_img=300, + score_thr=0.01, + nms=dict(type='nms', iou_threshold=0.65))) + + head = YOLOXHead(head_module=self.head_module, test_cfg=test_cfg) + feat = [ + torch.rand(1, 1, s // feat_size, s // feat_size) + for feat_size in [4, 8, 16] + ] + cls_scores, bbox_preds, objectnesses = head.forward(feat) + head.predict_by_feat( + cls_scores, + bbox_preds, + objectnesses, + img_metas, + cfg=test_cfg, + rescale=True, + with_nms=True) + head.predict_by_feat( + cls_scores, + bbox_preds, + objectnesses, + img_metas, + cfg=test_cfg, + rescale=False, + with_nms=False) + + def test_loss_by_feat(self): + s = 256 + img_metas = [{ + 'img_shape': (s, s, 3), + 'scale_factor': 1, + }] + train_cfg = Config( + dict( + assigner=dict( + type='mmdet.SimOTAAssigner', + iou_calculator=dict(type='mmdet.BboxOverlaps2D'), + center_radius=2.5, + candidate_topk=10, + iou_weight=3.0, + cls_weight=1.0))) + + head = YOLOXHead(head_module=self.head_module, train_cfg=train_cfg) + assert not head.use_bbox_aux + + feat = [ + torch.rand(1, 1, s // feat_size, s // feat_size) + for feat_size in [4, 8, 16] + ] + cls_scores, bbox_preds, objectnesses = head.forward(feat) + + # Test that empty ground truth encourages the network to predict + # background + gt_instances = torch.empty((0, 6)) + + empty_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, + objectnesses, gt_instances, + img_metas) + # When there is no truth, the cls loss should be nonzero but there + # should be no box loss. 
+ empty_cls_loss = empty_gt_losses['loss_cls'].sum() + empty_box_loss = empty_gt_losses['loss_bbox'].sum() + empty_obj_loss = empty_gt_losses['loss_obj'].sum() + self.assertEqual( + empty_cls_loss.item(), 0, + 'there should be no cls loss when there are no true boxes') + self.assertEqual( + empty_box_loss.item(), 0, + 'there should be no box loss when there are no true boxes') + self.assertGreater(empty_obj_loss.item(), 0, + 'objectness loss should be non-zero') + + # When truth is non-empty then both cls and box loss should be nonzero + # for random inputs + head = YOLOXHead(head_module=self.head_module, train_cfg=train_cfg) + head.use_bbox_aux = True + gt_instances = torch.Tensor( + [[0, 2, 23.6667, 23.8757, 238.6326, 151.8874]]) + + one_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, objectnesses, + gt_instances, img_metas) + onegt_cls_loss = one_gt_losses['loss_cls'].sum() + onegt_box_loss = one_gt_losses['loss_bbox'].sum() + onegt_obj_loss = one_gt_losses['loss_obj'].sum() + onegt_l1_loss = one_gt_losses['loss_bbox_aux'].sum() + self.assertGreater(onegt_cls_loss.item(), 0, + 'cls loss should be non-zero') + self.assertGreater(onegt_box_loss.item(), 0, + 'box loss should be non-zero') + self.assertGreater(onegt_obj_loss.item(), 0, + 'obj loss should be non-zero') + self.assertGreater(onegt_l1_loss.item(), 0, + 'l1 loss should be non-zero') + + # Test groud truth out of bound + gt_instances = torch.Tensor( + [[0, 2, s * 4, s * 4, s * 4 + 10, s * 4 + 10]]) + empty_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, + objectnesses, gt_instances, + img_metas) + # When gt_bboxes out of bound, the assign results should be empty, + # so the cls and bbox loss should be zero. + empty_cls_loss = empty_gt_losses['loss_cls'].sum() + empty_box_loss = empty_gt_losses['loss_bbox'].sum() + empty_obj_loss = empty_gt_losses['loss_obj'].sum() + self.assertEqual( + empty_cls_loss.item(), 0, + 'there should be no cls loss when gt_bboxes out of bound') + self.assertEqual( + empty_box_loss.item(), 0, + 'there should be no box loss when gt_bboxes out of bound') + self.assertGreater(empty_obj_loss.item(), 0, + 'objectness loss should be non-zero') + + +class TestYOLOXPoseHead(TestCase): + + def setUp(self): + self.head_module = dict( + type='YOLOXPoseHeadModule', + num_classes=1, + num_keypoints=17, + in_channels=1, + stacked_convs=1, + ) + self.train_cfg = Config( + dict( + assigner=dict( + type='PoseSimOTAAssigner', + center_radius=2.5, + oks_weight=3.0, + iou_calculator=dict(type='mmdet.BboxOverlaps2D'), + oks_calculator=dict( + type='OksLoss', + metainfo='configs/_base_/pose/coco.py')))) + self.loss_pose = Config( + dict( + type='OksLoss', + metainfo='configs/_base_/pose/coco.py', + loss_weight=30.0)) + + def test_init_weights(self): + head = YOLOXPoseHead( + head_module=self.head_module, + loss_pose=self.loss_pose, + train_cfg=self.train_cfg) + head.head_module.init_weights() + bias_init = bias_init_with_prob(0.01) + for conv_cls, conv_obj, conv_vis in zip( + head.head_module.multi_level_conv_cls, + head.head_module.multi_level_conv_obj, + head.head_module.multi_level_conv_vis): + assert_allclose(conv_cls.bias.data, + torch.ones_like(conv_cls.bias.data) * bias_init) + assert_allclose(conv_obj.bias.data, + torch.ones_like(conv_obj.bias.data) * bias_init) + assert_allclose(conv_vis.bias.data, + torch.ones_like(conv_vis.bias.data) * bias_init) + + def test_predict_by_feat(self): + s = 256 + img_metas = [{ + 'img_shape': (s, s, 3), + 'ori_shape': (s, s, 3), + 'scale_factor': (1.0, 1.0), + }] + 
test_cfg = Config( + dict( + multi_label=True, + max_per_img=300, + score_thr=0.01, + nms=dict(type='nms', iou_threshold=0.65))) + + head = YOLOXPoseHead( + head_module=self.head_module, + loss_pose=self.loss_pose, + train_cfg=self.train_cfg, + test_cfg=test_cfg) + feat = [ + torch.rand(1, 1, s // feat_size, s // feat_size) + for feat_size in [4, 8, 16] + ] + cls_scores, bbox_preds, objectnesses, \ + offsets_preds, vis_preds = head.forward(feat) + head.predict_by_feat( + cls_scores, + bbox_preds, + objectnesses, + offsets_preds, + vis_preds, + img_metas, + cfg=test_cfg, + rescale=True, + with_nms=True) + + def test_loss_by_feat(self): + s = 256 + img_metas = [{ + 'img_shape': (s, s, 3), + 'scale_factor': 1, + }] + + head = YOLOXPoseHead( + head_module=self.head_module, + loss_pose=self.loss_pose, + train_cfg=self.train_cfg) + assert not head.use_bbox_aux + + feat = [ + torch.rand(1, 1, s // feat_size, s // feat_size) + for feat_size in [4, 8, 16] + ] + cls_scores, bbox_preds, objectnesses, \ + offsets_preds, vis_preds = head.forward(feat) + + # Test that empty ground truth encourages the network to predict + # background + gt_instances = torch.empty((0, 6)) + gt_keypoints = torch.empty((0, 17, 2)) + gt_keypoints_visible = torch.empty((0, 17)) + + empty_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, + objectnesses, offsets_preds, + vis_preds, gt_instances, + gt_keypoints, gt_keypoints_visible, + img_metas) + # When there is no truth, the cls loss should be nonzero but there + # should be no box loss. + empty_cls_loss = empty_gt_losses['loss_cls'].sum() + empty_box_loss = empty_gt_losses['loss_bbox'].sum() + empty_obj_loss = empty_gt_losses['loss_obj'].sum() + empty_loss_kpt = empty_gt_losses['loss_kpt'].sum() + empty_loss_vis = empty_gt_losses['loss_vis'].sum() + self.assertEqual( + empty_cls_loss.item(), 0, + 'there should be no cls loss when there are no true boxes') + self.assertEqual( + empty_box_loss.item(), 0, + 'there should be no box loss when there are no true boxes') + self.assertGreater(empty_obj_loss.item(), 0, + 'objectness loss should be non-zero') + self.assertEqual( + empty_loss_kpt.item(), 0, + 'there should be no kpt loss when there are no true keypoints') + self.assertEqual( + empty_loss_vis.item(), 0, + 'there should be no vis loss when there are no true keypoints') + # When truth is non-empty then both cls and box loss should be nonzero + # for random inputs + head = YOLOXPoseHead( + head_module=self.head_module, + loss_pose=self.loss_pose, + train_cfg=self.train_cfg) + gt_instances = torch.Tensor( + [[0, 0, 23.6667, 23.8757, 238.6326, 151.8874]]) + gt_keypoints = torch.Tensor([[[317.1519, + 429.8433], [338.3080, 416.9187], + [298.9951, + 403.8911], [102.7025, 273.1329], + [255.4321, + 404.8712], [400.0422, 554.4373], + [167.7857, + 516.7591], [397.4943, 737.4575], + [116.3247, + 674.5684], [102.7025, 273.1329], + [66.0319, + 808.6383], [102.7025, 273.1329], + [157.6150, + 819.1249], [102.7025, 273.1329], + [102.7025, + 273.1329], [102.7025, 273.1329], + [102.7025, 273.1329]]]) + gt_keypoints_visible = torch.Tensor([[ + 1., 1., 1., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0. 
+        ]])
+
+        one_gt_losses = head.loss_by_feat(cls_scores, bbox_preds, objectnesses,
+                                          offsets_preds, vis_preds,
+                                          gt_instances, gt_keypoints,
+                                          gt_keypoints_visible, img_metas)
+        onegt_cls_loss = one_gt_losses['loss_cls'].sum()
+        onegt_box_loss = one_gt_losses['loss_bbox'].sum()
+        onegt_obj_loss = one_gt_losses['loss_obj'].sum()
+        onegt_loss_kpt = one_gt_losses['loss_kpt'].sum()
+        onegt_loss_vis = one_gt_losses['loss_vis'].sum()
+
+        self.assertGreater(onegt_cls_loss.item(), 0,
+                           'cls loss should be non-zero')
+        self.assertGreater(onegt_box_loss.item(), 0,
+                           'box loss should be non-zero')
+        self.assertGreater(onegt_obj_loss.item(), 0,
+                           'obj loss should be non-zero')
+        self.assertGreater(onegt_loss_kpt.item(), 0,
+                           'kpt loss should be non-zero')
+        self.assertGreater(onegt_loss_vis.item(), 0,
+                           'vis loss should be non-zero')
+
+        # Test ground truth out of bound
+        gt_instances = torch.Tensor(
+            [[0, 2, s * 4, s * 4, s * 4 + 10, s * 4 + 10]])
+        gt_keypoints = torch.Tensor([[[s * 4, s * 4 + 10], [s * 4, s * 4 + 10],
+                                      [s * 4, s * 4 + 10], [s * 4, s * 4 + 10],
+                                      [s * 4, s * 4 + 10], [s * 4, s * 4 + 10],
+                                      [s * 4, s * 4 + 10], [s * 4, s * 4 + 10],
+                                      [s * 4, s * 4 + 10], [s * 4, s * 4 + 10],
+                                      [s * 4, s * 4 + 10], [s * 4, s * 4 + 10],
+                                      [s * 4, s * 4 + 10], [s * 4, s * 4 + 10],
+                                      [s * 4, s * 4 + 10], [s * 4, s * 4 + 10],
+                                      [s * 4, s * 4 + 10]]])
+        empty_gt_losses = head.loss_by_feat(cls_scores, bbox_preds,
+                                            objectnesses, offsets_preds,
+                                            vis_preds, gt_instances,
+                                            gt_keypoints, gt_keypoints_visible,
+                                            img_metas)
+        # When gt_bboxes out of bound, the assign results should be empty,
+        # so the cls and bbox loss should be zero.
+        empty_cls_loss = empty_gt_losses['loss_cls'].sum()
+        empty_box_loss = empty_gt_losses['loss_bbox'].sum()
+        empty_obj_loss = empty_gt_losses['loss_obj'].sum()
+        empty_kpt_loss = empty_gt_losses['loss_kpt'].sum()
+        empty_vis_loss = empty_gt_losses['loss_vis'].sum()
+        self.assertEqual(
+            empty_cls_loss.item(), 0,
+            'there should be no cls loss when gt_bboxes out of bound')
+        self.assertEqual(
+            empty_box_loss.item(), 0,
+            'there should be no box loss when gt_bboxes out of bound')
+        self.assertGreater(empty_obj_loss.item(), 0,
+                           'objectness loss should be non-zero')
+        self.assertEqual(empty_kpt_loss.item(), 0,
+                         'kpt loss should be zero when gt out of bound')
+        self.assertEqual(empty_vis_loss.item(), 0,
+                         'vis loss should be zero when gt out of bound')
diff --git a/third_party/mmyolo/tests/test_models/test_detectors/test_yolo_detector.py b/third_party/mmyolo/tests/test_models/test_detectors/test_yolo_detector.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b2952040d193781a6d042976c336485232e1a0a
--- /dev/null
+++ b/third_party/mmyolo/tests/test_models/test_detectors/test_yolo_detector.py
@@ -0,0 +1,137 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import time +import unittest +from unittest import TestCase + +import torch +from mmdet.structures import DetDataSample +from mmdet.testing import demo_mm_inputs +from mmengine.logging import MessageHub +from parameterized import parameterized + +from mmyolo.testing import get_detector_cfg +from mmyolo.utils import register_all_modules + + +class TestSingleStageDetector(TestCase): + + def setUp(self): + register_all_modules() + + @parameterized.expand([ + 'yolov5/yolov5_n-v61_syncbn_fast_8xb16-300e_coco.py', + 'yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco.py', + 'yolox/yolox_tiny_fast_8xb8-300e_coco.py', + 'rtmdet/rtmdet_tiny_syncbn_fast_8xb32-300e_coco.py', + 'yolov7/yolov7_tiny_syncbn_fast_8x16b-300e_coco.py', + 'yolov8/yolov8_n_syncbn_fast_8xb16-500e_coco.py' + ]) + def test_init(self, cfg_file): + model = get_detector_cfg(cfg_file) + model.backbone.init_cfg = None + + from mmyolo.registry import MODELS + detector = MODELS.build(model) + self.assertTrue(detector.backbone) + self.assertTrue(detector.neck) + self.assertTrue(detector.bbox_head) + + @parameterized.expand([ + ('yolov5/yolov5_s-v61_syncbn_8xb16-300e_coco.py', ('cuda', 'cpu')), + ('yolov7/yolov7_tiny_syncbn_fast_8x16b-300e_coco.py', ('cuda', 'cpu')), + ('rtmdet/rtmdet_tiny_syncbn_fast_8xb32-300e_coco.py', ('cuda', 'cpu')), + ('yolov8/yolov8_n_syncbn_fast_8xb16-500e_coco.py', ('cuda', 'cpu')) + ]) + def test_forward_loss_mode(self, cfg_file, devices): + message_hub = MessageHub.get_instance( + f'test_single_stage_forward_loss_mode-{time.time()}') + message_hub.update_info('iter', 0) + message_hub.update_info('epoch', 0) + model = get_detector_cfg(cfg_file) + model.backbone.init_cfg = None + + if 'fast' in cfg_file: + model.data_preprocessor = dict( + type='mmdet.DetDataPreprocessor', + mean=[0., 0., 0.], + std=[255., 255., 255.], + bgr_to_rgb=True) + + from mmyolo.registry import MODELS + assert all([device in ['cpu', 'cuda'] for device in devices]) + + for device in devices: + detector = MODELS.build(model) + detector.init_weights() + + if device == 'cuda': + if not torch.cuda.is_available(): + return unittest.skip('test requires GPU and torch+cuda') + detector = detector.cuda() + + packed_inputs = demo_mm_inputs(2, [[3, 320, 128], [3, 125, 320]]) + data = detector.data_preprocessor(packed_inputs, True) + losses = detector.forward(**data, mode='loss') + self.assertIsInstance(losses, dict) + + @parameterized.expand([ + ('yolov5/yolov5_n-v61_syncbn_fast_8xb16-300e_coco.py', ('cuda', + 'cpu')), + ('yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco.py', ('cuda', 'cpu')), + ('yolox/yolox_tiny_fast_8xb8-300e_coco.py', ('cuda', 'cpu')), + ('yolov7/yolov7_tiny_syncbn_fast_8x16b-300e_coco.py', ('cuda', 'cpu')), + ('rtmdet/rtmdet_tiny_syncbn_fast_8xb32-300e_coco.py', ('cuda', 'cpu')), + ('yolov8/yolov8_n_syncbn_fast_8xb16-500e_coco.py', ('cuda', 'cpu')) + ]) + def test_forward_predict_mode(self, cfg_file, devices): + model = get_detector_cfg(cfg_file) + model.backbone.init_cfg = None + + from mmyolo.registry import MODELS + assert all([device in ['cpu', 'cuda'] for device in devices]) + + for device in devices: + detector = MODELS.build(model) + + if device == 'cuda': + if not torch.cuda.is_available(): + return unittest.skip('test requires GPU and torch+cuda') + detector = detector.cuda() + + packed_inputs = demo_mm_inputs(2, [[3, 320, 128], [3, 125, 320]]) + data = detector.data_preprocessor(packed_inputs, False) + # Test forward test + detector.eval() + with torch.no_grad(): + batch_results = detector.forward(**data, mode='predict') + 
self.assertEqual(len(batch_results), 2) + self.assertIsInstance(batch_results[0], DetDataSample) + + @parameterized.expand([ + ('yolov5/yolov5_n-v61_syncbn_fast_8xb16-300e_coco.py', ('cuda', + 'cpu')), + ('yolov6/yolov6_s_syncbn_fast_8xb32-400e_coco.py', ('cuda', 'cpu')), + ('yolox/yolox_tiny_fast_8xb8-300e_coco.py', ('cuda', 'cpu')), + ('yolov7/yolov7_tiny_syncbn_fast_8x16b-300e_coco.py', ('cuda', 'cpu')), + ('rtmdet/rtmdet_tiny_syncbn_fast_8xb32-300e_coco.py', ('cuda', 'cpu')), + ('yolov8/yolov8_n_syncbn_fast_8xb16-500e_coco.py', ('cuda', 'cpu')) + ]) + def test_forward_tensor_mode(self, cfg_file, devices): + model = get_detector_cfg(cfg_file) + model.backbone.init_cfg = None + + from mmyolo.registry import MODELS + assert all([device in ['cpu', 'cuda'] for device in devices]) + + for device in devices: + detector = MODELS.build(model) + + if device == 'cuda': + if not torch.cuda.is_available(): + return unittest.skip('test requires GPU and torch+cuda') + detector = detector.cuda() + + packed_inputs = demo_mm_inputs(2, [[3, 320, 128], [3, 125, 320]]) + data = detector.data_preprocessor(packed_inputs, False) + batch_results = detector.forward(**data, mode='tensor') + self.assertIsInstance(batch_results, tuple) diff --git a/third_party/mmyolo/tests/test_models/test_layers/__init__.py b/third_party/mmyolo/tests/test_models/test_layers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ef101fec61e72abc0eb90266d453b5b22331378d --- /dev/null +++ b/third_party/mmyolo/tests/test_models/test_layers/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. diff --git a/third_party/mmyolo/tests/test_models/test_layers/test_ema.py b/third_party/mmyolo/tests/test_models/test_layers/test_ema.py new file mode 100644 index 0000000000000000000000000000000000000000..b35838280ee5bc09d7c82b451f72468b53f5583f --- /dev/null +++ b/third_party/mmyolo/tests/test_models/test_layers/test_ema.py @@ -0,0 +1,94 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import itertools +import math +from unittest import TestCase + +import torch +import torch.nn as nn +from mmengine.testing import assert_allclose + +from mmyolo.models.layers import ExpMomentumEMA + + +class TestEMA(TestCase): + + def test_exp_momentum_ema(self): + model = nn.Sequential(nn.Conv2d(1, 5, kernel_size=3), nn.Linear(5, 10)) + # Test invalid gamma + with self.assertRaisesRegex(AssertionError, + 'gamma must be greater than 0'): + ExpMomentumEMA(model, gamma=-1) + + # Test EMA + model = torch.nn.Sequential( + torch.nn.Conv2d(1, 5, kernel_size=3), torch.nn.Linear(5, 10)) + momentum = 0.1 + gamma = 4 + + ema_model = ExpMomentumEMA(model, momentum=momentum, gamma=gamma) + averaged_params = [ + torch.zeros_like(param) for param in model.parameters() + ] + n_updates = 10 + for i in range(n_updates): + updated_averaged_params = [] + for p, p_avg in zip(model.parameters(), averaged_params): + p.detach().add_(torch.randn_like(p)) + if i == 0: + updated_averaged_params.append(p.clone()) + else: + m = (1 - momentum) * math.exp(-(1 + i) / gamma) + momentum + updated_averaged_params.append( + (p_avg * (1 - m) + p * m).clone()) + ema_model.update_parameters(model) + averaged_params = updated_averaged_params + + for p_target, p_ema in zip(averaged_params, ema_model.parameters()): + assert_allclose(p_target, p_ema) + + def test_exp_momentum_ema_update_buffer(self): + model = nn.Sequential( + nn.Conv2d(1, 5, kernel_size=3), nn.BatchNorm2d(5, momentum=0.3), + nn.Linear(5, 10)) + # Test invalid gamma + with self.assertRaisesRegex(AssertionError, + 'gamma must be greater than 0'): + ExpMomentumEMA(model, gamma=-1) + + # Test EMA with momentum annealing. + momentum = 0.1 + gamma = 4 + + ema_model = ExpMomentumEMA( + model, gamma=gamma, momentum=momentum, update_buffers=True) + averaged_params = [ + torch.zeros_like(param) + for param in itertools.chain(model.parameters(), model.buffers()) + if param.size() != torch.Size([]) + ] + n_updates = 10 + for i in range(n_updates): + updated_averaged_params = [] + params = [ + param for param in itertools.chain(model.parameters(), + model.buffers()) + if param.size() != torch.Size([]) + ] + for p, p_avg in zip(params, averaged_params): + p.detach().add_(torch.randn_like(p)) + if i == 0: + updated_averaged_params.append(p.clone()) + else: + m = (1 - momentum) * math.exp(-(1 + i) / gamma) + momentum + updated_averaged_params.append( + (p_avg * (1 - m) + p * m).clone()) + ema_model.update_parameters(model) + averaged_params = updated_averaged_params + + ema_params = [ + param for param in itertools.chain(ema_model.module.parameters(), + ema_model.module.buffers()) + if param.size() != torch.Size([]) + ] + for p_target, p_ema in zip(averaged_params, ema_params): + assert_allclose(p_target, p_ema) diff --git a/third_party/mmyolo/tests/test_models/test_layers/test_yolo_bricks.py b/third_party/mmyolo/tests/test_models/test_layers/test_yolo_bricks.py new file mode 100644 index 0000000000000000000000000000000000000000..5331a4e013c797052ed003b64b477d24ad10444c --- /dev/null +++ b/third_party/mmyolo/tests/test_models/test_layers/test_yolo_bricks.py @@ -0,0 +1,34 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+ +from unittest import TestCase + +import torch + +from mmyolo.models.layers import SPPFBottleneck +from mmyolo.utils import register_all_modules + +register_all_modules() + + +class TestSPPFBottleneck(TestCase): + + def test_forward(self): + input_tensor = torch.randn((1, 3, 20, 20)) + bottleneck = SPPFBottleneck(3, 16) + out_tensor = bottleneck(input_tensor) + self.assertEqual(out_tensor.shape, (1, 16, 20, 20)) + + bottleneck = SPPFBottleneck(3, 16, kernel_sizes=[3, 5, 7]) + out_tensor = bottleneck(input_tensor) + self.assertEqual(out_tensor.shape, (1, 16, 20, 20)) + + # set len(kernel_sizes)=4 + bottleneck = SPPFBottleneck(3, 16, kernel_sizes=[3, 5, 7, 9]) + out_tensor = bottleneck(input_tensor) + self.assertEqual(out_tensor.shape, (1, 16, 20, 20)) + + # set use_conv_first=False + bottleneck = SPPFBottleneck( + 3, 16, use_conv_first=False, kernel_sizes=[3, 5, 7, 9]) + out_tensor = bottleneck(input_tensor) + self.assertEqual(out_tensor.shape, (1, 16, 20, 20)) diff --git a/third_party/mmyolo/tests/test_models/test_necks/__init__.py b/third_party/mmyolo/tests/test_models/test_necks/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ef101fec61e72abc0eb90266d453b5b22331378d --- /dev/null +++ b/third_party/mmyolo/tests/test_models/test_necks/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. diff --git a/third_party/mmyolo/tests/test_models/test_necks/test_cspnext_pafpn.py b/third_party/mmyolo/tests/test_models/test_necks/test_cspnext_pafpn.py new file mode 100644 index 0000000000000000000000000000000000000000..b26c99aa3c90c9e53be6ef7f8f28c4996c49ca2f --- /dev/null +++ b/third_party/mmyolo/tests/test_models/test_necks/test_cspnext_pafpn.py @@ -0,0 +1,37 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from unittest import TestCase + +import torch + +from mmyolo.models.necks import CSPNeXtPAFPN +from mmyolo.utils import register_all_modules + +register_all_modules() + + +class TestCSPNeXtPAFPN(TestCase): + + def test_forward(self): + s = 64 + in_channels = [8, 16, 32] + feat_sizes = [s // 2**i for i in range(4)] # [32, 16, 8] + out_channels = 24 + feats = [ + torch.rand(1, in_channels[i], feat_sizes[i], feat_sizes[i]) + for i in range(len(in_channels)) + ] + neck = CSPNeXtPAFPN(in_channels=in_channels, out_channels=out_channels) + outs = neck(feats) + assert len(outs) == len(feats) + for i in range(len(feats)): + assert outs[i].shape[1] == out_channels + assert outs[i].shape[2] == outs[i].shape[3] == s // (2**i) + + # test depth-wise + neck = CSPNeXtPAFPN( + in_channels=in_channels, + out_channels=out_channels, + use_depthwise=True) + + from mmcv.cnn.bricks import DepthwiseSeparableConvModule + self.assertTrue(neck.conv, DepthwiseSeparableConvModule) diff --git a/third_party/mmyolo/tests/test_models/test_necks/test_ppyoloe_csppan.py b/third_party/mmyolo/tests/test_models/test_necks/test_ppyoloe_csppan.py new file mode 100644 index 0000000000000000000000000000000000000000..b79c1ce5bee9f0761b6c3deedc2c8c250ad8aac7 --- /dev/null +++ b/third_party/mmyolo/tests/test_models/test_necks/test_ppyoloe_csppan.py @@ -0,0 +1,53 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from unittest import TestCase + +import torch + +from mmyolo.models import PPYOLOECSPPAFPN +from mmyolo.utils import register_all_modules + +register_all_modules() + + +class TestPPYOLOECSPPAFPN(TestCase): + + def test_forward(self): + s = 64 + in_channels = [8, 16, 32] + feat_sizes = [s // 2**i for i in range(4)] # [32, 16, 8] + out_channels = [8, 16, 32] + feats = [ + torch.rand(1, in_channels[i], feat_sizes[i], feat_sizes[i]) + for i in range(len(in_channels)) + ] + neck = PPYOLOECSPPAFPN( + in_channels=in_channels, out_channels=out_channels) + outs = neck(feats) + assert len(outs) == len(feats) + for i in range(len(feats)): + assert outs[i].shape[1] == out_channels[i] + assert outs[i].shape[2] == outs[i].shape[3] == s // (2**i) + + def test_drop_block(self): + s = 64 + in_channels = [8, 16, 32] + feat_sizes = [s // 2**i for i in range(4)] # [32, 16, 8] + out_channels = [8, 16, 32] + feats = [ + torch.rand(1, in_channels[i], feat_sizes[i], feat_sizes[i]) + for i in range(len(in_channels)) + ] + neck = PPYOLOECSPPAFPN( + in_channels=in_channels, + out_channels=out_channels, + drop_block_cfg=dict( + type='mmdet.DropBlock', + drop_prob=0.1, + block_size=3, + warm_iters=0)) + neck.train() + outs = neck(feats) + assert len(outs) == len(feats) + for i in range(len(feats)): + assert outs[i].shape[1] == out_channels[i] + assert outs[i].shape[2] == outs[i].shape[3] == s // (2**i) diff --git a/third_party/mmyolo/tests/test_models/test_necks/test_yolov5_pafpn.py b/third_party/mmyolo/tests/test_models/test_necks/test_yolov5_pafpn.py new file mode 100644 index 0000000000000000000000000000000000000000..339621ec4ba81de7c913b20dc1530289c3bd8c8c --- /dev/null +++ b/third_party/mmyolo/tests/test_models/test_necks/test_yolov5_pafpn.py @@ -0,0 +1,28 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from unittest import TestCase + +import torch + +from mmyolo.models.necks import YOLOv5PAFPN +from mmyolo.utils import register_all_modules + +register_all_modules() + + +class TestYOLOv5PAFPN(TestCase): + + def test_forward(self): + s = 64 + in_channels = [8, 16, 32] + feat_sizes = [s // 2**i for i in range(4)] # [32, 16, 8] + out_channels = [8, 16, 32] + feats = [ + torch.rand(1, in_channels[i], feat_sizes[i], feat_sizes[i]) + for i in range(len(in_channels)) + ] + neck = YOLOv5PAFPN(in_channels=in_channels, out_channels=out_channels) + outs = neck(feats) + assert len(outs) == len(feats) + for i in range(len(feats)): + assert outs[i].shape[1] == out_channels[i] + assert outs[i].shape[2] == outs[i].shape[3] == s // (2**i) diff --git a/third_party/mmyolo/tests/test_models/test_necks/test_yolov6_pafpn.py b/third_party/mmyolo/tests/test_models/test_necks/test_yolov6_pafpn.py new file mode 100644 index 0000000000000000000000000000000000000000..e766aa8700e292d13d411b3eccc4542b8ef49725 --- /dev/null +++ b/third_party/mmyolo/tests/test_models/test_necks/test_yolov6_pafpn.py @@ -0,0 +1,81 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from unittest import TestCase + +import torch + +from mmyolo.models.necks import (YOLOv6CSPRepBiPAFPN, YOLOv6CSPRepPAFPN, + YOLOv6RepBiPAFPN, YOLOv6RepPAFPN) +from mmyolo.utils import register_all_modules + +register_all_modules() + + +class TestYOLOv6PAFPN(TestCase): + + def test_YOLOv6RepPAFP_forward(self): + s = 64 + in_channels = [8, 16, 32] + feat_sizes = [s // 2**i for i in range(4)] # [32, 16, 8] + out_channels = [8, 16, 32] + feats = [ + torch.rand(1, in_channels[i], feat_sizes[i], feat_sizes[i]) + for i in range(len(in_channels)) + ] + neck = YOLOv6RepPAFPN( + in_channels=in_channels, out_channels=out_channels) + outs = neck(feats) + assert len(outs) == len(feats) + for i in range(len(feats)): + assert outs[i].shape[1] == out_channels[i] + assert outs[i].shape[2] == outs[i].shape[3] == s // (2**i) + + def test_YOLOv6CSPRepPAFPN_forward(self): + s = 64 + in_channels = [8, 16, 32] + feat_sizes = [s // 2**i for i in range(4)] # [32, 16, 8] + out_channels = [8, 16, 32] + feats = [ + torch.rand(1, in_channels[i], feat_sizes[i], feat_sizes[i]) + for i in range(len(in_channels)) + ] + neck = YOLOv6CSPRepPAFPN( + in_channels=in_channels, out_channels=out_channels) + outs = neck(feats) + assert len(outs) == len(feats) + for i in range(len(feats)): + assert outs[i].shape[1] == out_channels[i] + assert outs[i].shape[2] == outs[i].shape[3] == s // (2**i) + + def test_YOLOv6CSPRepBiPAFPN_forward(self): + s = 64 + in_channels = [4, 8, 16, 32] # includes an extra input for BiFusion + feat_sizes = [s // 2**i for i in range(4)] # [64, 32, 16, 8] + out_channels = [8, 16, 32] + feats = [ + torch.rand(1, in_channels[i], feat_sizes[i], feat_sizes[i]) + for i in range(len(in_channels)) + ] + neck = YOLOv6CSPRepBiPAFPN( + in_channels=in_channels, out_channels=out_channels) + outs = neck(feats) + assert len(outs) == len(feats) - 1 + for i in range(len(feats) - 1): + assert outs[i].shape[1] == out_channels[i] + assert outs[i].shape[2] == outs[i].shape[3] == feat_sizes[i + 1] + + def test_YOLOv6RepBiPAFPN_forward(self): + s = 64 + in_channels = [4, 8, 16, 32] # includes an extra input for BiFusion + feat_sizes = [s // 2**i for i in range(4)] # [64, 32, 16, 8] + out_channels = [8, 16, 32] + feats = [ + torch.rand(1, in_channels[i], feat_sizes[i], feat_sizes[i]) + for i in range(len(in_channels)) + ] + neck = YOLOv6RepBiPAFPN( + in_channels=in_channels, out_channels=out_channels) + outs = neck(feats) + assert len(outs) == len(feats) - 1 + for i in range(len(feats) - 1): + assert outs[i].shape[1] == out_channels[i] + assert outs[i].shape[2] == outs[i].shape[3] == feat_sizes[i + 1] diff --git a/third_party/mmyolo/tests/test_models/test_necks/test_yolov7_pafpn.py b/third_party/mmyolo/tests/test_models/test_necks/test_yolov7_pafpn.py new file mode 100644 index 0000000000000000000000000000000000000000..17bf455c12d6f75191813213d286ae9646ef2d14 --- /dev/null +++ b/third_party/mmyolo/tests/test_models/test_necks/test_yolov7_pafpn.py @@ -0,0 +1,79 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from unittest import TestCase + +import torch +from mmcv.cnn import ConvModule + +from mmyolo.models.necks import YOLOv7PAFPN +from mmyolo.utils import register_all_modules + +register_all_modules() + + +class TestYOLOv7PAFPN(TestCase): + + def test_forward(self): + # test P5 + s = 64 + in_channels = [8, 16, 32] + feat_sizes = [s // 2**i for i in range(4)] # [32, 16, 8] + out_channels = [8, 16, 32] + feats = [ + torch.rand(1, in_channels[i], feat_sizes[i], feat_sizes[i]) + for i in range(len(in_channels)) + ] + neck = YOLOv7PAFPN(in_channels=in_channels, out_channels=out_channels) + outs = neck(feats) + assert len(outs) == len(feats) + for i in range(len(feats)): + assert outs[i].shape[1] == out_channels[i] * 2 + assert outs[i].shape[2] == outs[i].shape[3] == s // (2**i) + + # test is_tiny_version + neck = YOLOv7PAFPN( + in_channels=in_channels, + out_channels=out_channels, + is_tiny_version=True) + outs = neck(feats) + assert len(outs) == len(feats) + for i in range(len(feats)): + assert outs[i].shape[1] == out_channels[i] * 2 + assert outs[i].shape[2] == outs[i].shape[3] == s // (2**i) + + # test use_in_channels_in_downsample + neck = YOLOv7PAFPN( + in_channels=in_channels, + out_channels=out_channels, + use_in_channels_in_downsample=True) + for f in feats: + print(f.shape) + outs = neck(feats) + for f in outs: + print(f.shape) + assert len(outs) == len(feats) + for i in range(len(feats)): + assert outs[i].shape[1] == out_channels[i] * 2 + assert outs[i].shape[2] == outs[i].shape[3] == s // (2**i) + + # test use_repconv_outs is False + neck = YOLOv7PAFPN( + in_channels=in_channels, + out_channels=out_channels, + use_repconv_outs=False) + self.assertIsInstance(neck.out_layers[0], ConvModule) + + # test P6 + s = 64 + in_channels = [8, 16, 32, 64] + feat_sizes = [s // 2**i for i in range(4)] + out_channels = [8, 16, 32, 64] + feats = [ + torch.rand(1, in_channels[i], feat_sizes[i], feat_sizes[i]) + for i in range(len(in_channels)) + ] + neck = YOLOv7PAFPN(in_channels=in_channels, out_channels=out_channels) + outs = neck(feats) + assert len(outs) == len(feats) + for i in range(len(feats)): + assert outs[i].shape[1] == out_channels[i] + assert outs[i].shape[2] == outs[i].shape[3] == s // (2**i) diff --git a/third_party/mmyolo/tests/test_models/test_necks/test_yolov8_pafpn.py b/third_party/mmyolo/tests/test_models/test_necks/test_yolov8_pafpn.py new file mode 100644 index 0000000000000000000000000000000000000000..66d136d0f26f68628b29c8a585bfaf4bea0b92fd --- /dev/null +++ b/third_party/mmyolo/tests/test_models/test_necks/test_yolov8_pafpn.py @@ -0,0 +1,28 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from unittest import TestCase + +import torch + +from mmyolo.models import YOLOv8PAFPN +from mmyolo.utils import register_all_modules + +register_all_modules() + + +class TestYOLOv8PAFPN(TestCase): + + def test_YOLOv8PAFPN_forward(self): + s = 64 + in_channels = [8, 16, 32] + feat_sizes = [s // 2**i for i in range(4)] # [32, 16, 8] + out_channels = [8, 16, 32] + feats = [ + torch.rand(1, in_channels[i], feat_sizes[i], feat_sizes[i]) + for i in range(len(in_channels)) + ] + neck = YOLOv8PAFPN(in_channels=in_channels, out_channels=out_channels) + outs = neck(feats) + assert len(outs) == len(feats) + for i in range(len(feats)): + assert outs[i].shape[1] == out_channels[i] + assert outs[i].shape[2] == outs[i].shape[3] == s // (2**i) diff --git a/third_party/mmyolo/tests/test_models/test_necks/test_yolox_pafpn.py b/third_party/mmyolo/tests/test_models/test_necks/test_yolox_pafpn.py new file mode 100644 index 0000000000000000000000000000000000000000..25fe67a12e969c28bfc09d66c265664c038feba5 --- /dev/null +++ b/third_party/mmyolo/tests/test_models/test_necks/test_yolox_pafpn.py @@ -0,0 +1,28 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from unittest import TestCase + +import torch + +from mmyolo.models.necks import YOLOXPAFPN +from mmyolo.utils import register_all_modules + +register_all_modules() + + +class TestYOLOXPAFPN(TestCase): + + def test_forward(self): + s = 64 + in_channels = [8, 16, 32] + feat_sizes = [s // 2**i for i in range(4)] # [32, 16, 8] + out_channels = 24 + feats = [ + torch.rand(1, in_channels[i], feat_sizes[i], feat_sizes[i]) + for i in range(len(in_channels)) + ] + neck = YOLOXPAFPN(in_channels=in_channels, out_channels=out_channels) + outs = neck(feats) + assert len(outs) == len(feats) + for i in range(len(feats)): + assert outs[i].shape[1] == out_channels + assert outs[i].shape[2] == outs[i].shape[3] == s // (2**i) diff --git a/third_party/mmyolo/tests/test_models/test_plugins/__init__.py b/third_party/mmyolo/tests/test_models/test_plugins/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ef101fec61e72abc0eb90266d453b5b22331378d --- /dev/null +++ b/third_party/mmyolo/tests/test_models/test_plugins/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. diff --git a/third_party/mmyolo/tests/test_models/test_plugins/test_cbam.py b/third_party/mmyolo/tests/test_models/test_plugins/test_cbam.py new file mode 100644 index 0000000000000000000000000000000000000000..4af547c05172a2e8de09a5d56c35fa0b383dcea0 --- /dev/null +++ b/third_party/mmyolo/tests/test_models/test_plugins/test_cbam.py @@ -0,0 +1,31 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+ +from unittest import TestCase + +import torch + +from mmyolo.models.plugins import CBAM +from mmyolo.utils import register_all_modules + +register_all_modules() + + +class TestCBAM(TestCase): + + def test_forward(self): + tensor_shape = (2, 16, 20, 20) + + images = torch.randn(*tensor_shape) + cbam = CBAM(16) + out = cbam(images) + self.assertEqual(out.shape, tensor_shape) + + # test other ratio + cbam = CBAM(16, reduce_ratio=8) + out = cbam(images) + self.assertEqual(out.shape, tensor_shape) + + # test other act_cfg in ChannelAttention + cbam = CBAM(in_channels=16, act_cfg=dict(type='Sigmoid')) + out = cbam(images) + self.assertEqual(out.shape, tensor_shape) diff --git a/third_party/mmyolo/tests/test_models/test_task_modules/__init__.py b/third_party/mmyolo/tests/test_models/test_task_modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ef101fec61e72abc0eb90266d453b5b22331378d --- /dev/null +++ b/third_party/mmyolo/tests/test_models/test_task_modules/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. diff --git a/third_party/mmyolo/tests/test_models/test_task_modules/test_assigners/__init__.py b/third_party/mmyolo/tests/test_models/test_task_modules/test_assigners/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ef101fec61e72abc0eb90266d453b5b22331378d --- /dev/null +++ b/third_party/mmyolo/tests/test_models/test_task_modules/test_assigners/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. diff --git a/third_party/mmyolo/tests/test_models/test_task_modules/test_assigners/test_batch_atss_assigner.py b/third_party/mmyolo/tests/test_models/test_task_modules/test_assigners/test_batch_atss_assigner.py new file mode 100644 index 0000000000000000000000000000000000000000..a01e4fce390965bb16a489237464c74851f09a25 --- /dev/null +++ b/third_party/mmyolo/tests/test_models/test_task_modules/test_assigners/test_batch_atss_assigner.py @@ -0,0 +1,175 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from unittest import TestCase + +import torch + +from mmyolo.models.task_modules.assigners import BatchATSSAssigner + + +class TestBatchATSSAssigner(TestCase): + + def test_batch_atss_assigner(self): + num_classes = 2 + batch_size = 2 + batch_atss_assigner = BatchATSSAssigner( + topk=3, + iou_calculator=dict(type='mmdet.BboxOverlaps2D'), + num_classes=num_classes) + priors = torch.FloatTensor([ + [4., 4., 8., 8.], + [12., 4., 8., 8.], + [20., 4., 8., 8.], + [28., 4., 8., 8.], + ]).repeat(21, 1) + gt_bboxes = torch.FloatTensor([ + [0, 0, 60, 93], + [229, 0, 532, 157], + ]).unsqueeze(0).repeat(batch_size, 1, 1) + gt_labels = torch.LongTensor([ + [0], + [11], + ]).unsqueeze(0).repeat(batch_size, 1, 1) + num_level_bboxes = [64, 16, 4] + pad_bbox_flag = torch.FloatTensor([ + [1], + [0], + ]).unsqueeze(0).repeat(batch_size, 1, 1) + pred_bboxes = torch.FloatTensor([ + [-4., -4., 12., 12.], + [4., -4., 20., 12.], + [12., -4., 28., 12.], + [20., -4., 36., 12.], + ]).unsqueeze(0).repeat(batch_size, 21, 1) + batch_assign_result = batch_atss_assigner.forward( + pred_bboxes, priors, num_level_bboxes, gt_labels, gt_bboxes, + pad_bbox_flag) + + assigned_labels = batch_assign_result['assigned_labels'] + assigned_bboxes = batch_assign_result['assigned_bboxes'] + assigned_scores = batch_assign_result['assigned_scores'] + fg_mask_pre_prior = batch_assign_result['fg_mask_pre_prior'] + + self.assertEqual(assigned_labels.shape, torch.Size([batch_size, 84])) + self.assertEqual(assigned_bboxes.shape, torch.Size([batch_size, 84, + 4])) + self.assertEqual(assigned_scores.shape, + torch.Size([batch_size, 84, num_classes])) + self.assertEqual(fg_mask_pre_prior.shape, torch.Size([batch_size, 84])) + + def test_batch_atss_assigner_with_empty_gt(self): + """Test corner case where an image might have no true detections.""" + num_classes = 2 + batch_size = 2 + batch_atss_assigner = BatchATSSAssigner( + topk=3, + iou_calculator=dict(type='mmdet.BboxOverlaps2D'), + num_classes=num_classes) + priors = torch.FloatTensor([ + [4., 4., 8., 8.], + [12., 4., 8., 8.], + [20., 4., 8., 8.], + [28., 4., 8., 8.], + ]).repeat(21, 1) + num_level_bboxes = [64, 16, 4] + pad_bbox_flag = torch.FloatTensor([ + [1], + [0], + ]).unsqueeze(0).repeat(batch_size, 1, 1) + pred_bboxes = torch.FloatTensor([ + [-4., -4., 12., 12.], + [4., -4., 20., 12.], + [12., -4., 28., 12.], + [20., -4., 36., 12.], + ]).unsqueeze(0).repeat(batch_size, 21, 1) + + gt_bboxes = torch.zeros(batch_size, 0, 4) + gt_labels = torch.zeros(batch_size, 0, 1) + + batch_assign_result = batch_atss_assigner.forward( + pred_bboxes, priors, num_level_bboxes, gt_labels, gt_bboxes, + pad_bbox_flag) + + assigned_labels = batch_assign_result['assigned_labels'] + assigned_bboxes = batch_assign_result['assigned_bboxes'] + assigned_scores = batch_assign_result['assigned_scores'] + fg_mask_pre_prior = batch_assign_result['fg_mask_pre_prior'] + + self.assertEqual(assigned_labels.shape, torch.Size([batch_size, 84])) + self.assertEqual(assigned_bboxes.shape, torch.Size([batch_size, 84, + 4])) + self.assertEqual(assigned_scores.shape, + torch.Size([batch_size, 84, num_classes])) + self.assertEqual(fg_mask_pre_prior.shape, torch.Size([batch_size, 84])) + + def test_batch_atss_assigner_with_empty_boxs(self): + """Test corner case where a network might predict no boxes.""" + num_classes = 2 + batch_size = 2 + batch_atss_assigner = BatchATSSAssigner( + topk=3, + iou_calculator=dict(type='mmdet.BboxOverlaps2D'), + num_classes=num_classes) + priors = torch.zeros(84, 4) + gt_bboxes = 
torch.FloatTensor([ + [0, 0, 60, 93], + [229, 0, 532, 157], + ]).unsqueeze(0).repeat(batch_size, 1, 1) + gt_labels = torch.LongTensor([ + [0], + [11], + ]).unsqueeze(0).repeat(batch_size, 1, 1) + num_level_bboxes = [64, 16, 4] + pad_bbox_flag = torch.FloatTensor([[1], [0]]).unsqueeze(0).repeat( + batch_size, 1, 1) + pred_bboxes = torch.FloatTensor([ + [-4., -4., 12., 12.], + [4., -4., 20., 12.], + [12., -4., 28., 12.], + [20., -4., 36., 12.], + ]).unsqueeze(0).repeat(batch_size, 21, 1) + + batch_assign_result = batch_atss_assigner.forward( + pred_bboxes, priors, num_level_bboxes, gt_labels, gt_bboxes, + pad_bbox_flag) + assigned_labels = batch_assign_result['assigned_labels'] + assigned_bboxes = batch_assign_result['assigned_bboxes'] + assigned_scores = batch_assign_result['assigned_scores'] + fg_mask_pre_prior = batch_assign_result['fg_mask_pre_prior'] + + self.assertEqual(assigned_labels.shape, torch.Size([batch_size, 84])) + self.assertEqual(assigned_bboxes.shape, torch.Size([batch_size, 84, + 4])) + self.assertEqual(assigned_scores.shape, + torch.Size([batch_size, 84, num_classes])) + self.assertEqual(fg_mask_pre_prior.shape, torch.Size([batch_size, 84])) + + def test_batch_atss_assigner_with_empty_boxes_and_gt(self): + """Test corner case where a network might predict no boxes and no + gt.""" + num_classes = 2 + batch_size = 2 + batch_atss_assigner = BatchATSSAssigner( + topk=3, + iou_calculator=dict(type='mmdet.BboxOverlaps2D'), + num_classes=num_classes) + priors = torch.zeros(84, 4) + gt_bboxes = torch.zeros(batch_size, 0, 4) + gt_labels = torch.zeros(batch_size, 0, 1) + num_level_bboxes = [64, 16, 4] + pad_bbox_flag = torch.zeros(batch_size, 0, 1) + pred_bboxes = torch.zeros(batch_size, 0, 4) + + batch_assign_result = batch_atss_assigner.forward( + pred_bboxes, priors, num_level_bboxes, gt_labels, gt_bboxes, + pad_bbox_flag) + assigned_labels = batch_assign_result['assigned_labels'] + assigned_bboxes = batch_assign_result['assigned_bboxes'] + assigned_scores = batch_assign_result['assigned_scores'] + fg_mask_pre_prior = batch_assign_result['fg_mask_pre_prior'] + + self.assertEqual(assigned_labels.shape, torch.Size([batch_size, 84])) + self.assertEqual(assigned_bboxes.shape, torch.Size([batch_size, 84, + 4])) + self.assertEqual(assigned_scores.shape, + torch.Size([batch_size, 84, num_classes])) + self.assertEqual(fg_mask_pre_prior.shape, torch.Size([batch_size, 84])) diff --git a/third_party/mmyolo/tests/test_models/test_task_modules/test_assigners/test_batch_dsl_assigner.py b/third_party/mmyolo/tests/test_models/test_task_modules/test_assigners/test_batch_dsl_assigner.py new file mode 100644 index 0000000000000000000000000000000000000000..9644896ca2b609ae161de9eb74c2a520e13b76db --- /dev/null +++ b/third_party/mmyolo/tests/test_models/test_task_modules/test_assigners/test_batch_dsl_assigner.py @@ -0,0 +1,192 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from unittest import TestCase + +import pytest +import torch + +from mmyolo.models.task_modules.assigners import BatchDynamicSoftLabelAssigner + + +class TestBatchDynamicSoftLabelAssigner(TestCase): + + def test_assign(self): + num_classes = 2 + batch_size = 2 + + assigner = BatchDynamicSoftLabelAssigner( + num_classes=num_classes, + soft_center_radius=3.0, + topk=1, + iou_weight=3.0) + + pred_bboxes = torch.FloatTensor([ + [23, 23, 43, 43], + [4, 5, 6, 7], + ]).unsqueeze(0).repeat(batch_size, 10, 1) + + pred_scores = torch.FloatTensor([ + [0.2], + [0.8], + ]).unsqueeze(0).repeat(batch_size, 10, 1) + + priors = torch.FloatTensor([[30, 30, 8, 8], [4, 5, 6, + 7]]).repeat(10, 1) + + gt_bboxes = torch.FloatTensor([[23, 23, 43, 43]]).unsqueeze(0).repeat( + batch_size, 1, 1) + + gt_labels = torch.LongTensor([[0] + ]).unsqueeze(0).repeat(batch_size, 1, 1) + pad_bbox_flag = torch.FloatTensor([[1]]).unsqueeze(0).repeat( + batch_size, 1, 1) + + assign_result = assigner.forward(pred_bboxes, pred_scores, priors, + gt_labels, gt_bboxes, pad_bbox_flag) + + assigned_labels = assign_result['assigned_labels'] + assigned_labels_weights = assign_result['assigned_labels_weights'] + assigned_bboxes = assign_result['assigned_bboxes'] + assign_metrics = assign_result['assign_metrics'] + + self.assertEqual(assigned_labels.shape, torch.Size([batch_size, 20])) + self.assertEqual(assigned_bboxes.shape, torch.Size([batch_size, 20, + 4])) + self.assertEqual(assigned_labels_weights.shape, + torch.Size([batch_size, 20])) + self.assertEqual(assign_metrics.shape, torch.Size([batch_size, 20])) + + def test_assign_with_empty_gt(self): + num_classes = 2 + batch_size = 2 + + assigner = BatchDynamicSoftLabelAssigner( + num_classes=num_classes, + soft_center_radius=3.0, + topk=1, + iou_weight=3.0) + + pred_bboxes = torch.FloatTensor([ + [23, 23, 43, 43], + [4, 5, 6, 7], + ]).unsqueeze(0).repeat(batch_size, 10, 1) + + pred_scores = torch.FloatTensor([ + [0.2], + [0.8], + ]).unsqueeze(0).repeat(batch_size, 10, 1) + + priors = torch.FloatTensor([[30, 30, 8, 8], [4, 5, 6, + 7]]).repeat(10, 1) + + gt_bboxes = torch.zeros(batch_size, 0, 4) + gt_labels = torch.zeros(batch_size, 0, 1) + pad_bbox_flag = torch.zeros(batch_size, 0, 1) + + assign_result = assigner.forward(pred_bboxes, pred_scores, priors, + gt_labels, gt_bboxes, pad_bbox_flag) + + assigned_labels = assign_result['assigned_labels'] + assigned_labels_weights = assign_result['assigned_labels_weights'] + assigned_bboxes = assign_result['assigned_bboxes'] + assign_metrics = assign_result['assign_metrics'] + + self.assertEqual(assigned_labels.shape, torch.Size([batch_size, 20])) + self.assertEqual(assigned_bboxes.shape, torch.Size([batch_size, 20, + 4])) + self.assertEqual(assigned_labels_weights.shape, + torch.Size([batch_size, 20])) + self.assertEqual(assign_metrics.shape, torch.Size([batch_size, 20])) + + def test_assign_with_empty_boxs(self): + num_classes = 2 + batch_size = 2 + + assigner = BatchDynamicSoftLabelAssigner( + num_classes=num_classes, + soft_center_radius=3.0, + topk=1, + iou_weight=3.0) + + pred_bboxes = torch.zeros(batch_size, 0, 4) + + pred_scores = torch.zeros(batch_size, 0, 4) + + priors = torch.zeros(0, 4) + gt_bboxes = torch.FloatTensor([[23, 23, 43, 43]]).unsqueeze(0).repeat( + batch_size, 1, 1) + + gt_labels = torch.LongTensor([[0] + ]).unsqueeze(0).repeat(batch_size, 1, 1) + pad_bbox_flag = torch.FloatTensor([[1]]).unsqueeze(0).repeat( + batch_size, 1, 1) + + assign_result = assigner.forward(pred_bboxes, pred_scores, priors, + gt_labels, gt_bboxes, 
pad_bbox_flag) + + assigned_labels = assign_result['assigned_labels'] + assigned_labels_weights = assign_result['assigned_labels_weights'] + assigned_bboxes = assign_result['assigned_bboxes'] + assign_metrics = assign_result['assign_metrics'] + + self.assertEqual(assigned_labels.shape, torch.Size([batch_size, 0])) + self.assertEqual(assigned_bboxes.shape, torch.Size([batch_size, 0, 4])) + self.assertEqual(assigned_labels_weights.shape, + torch.Size([batch_size, 0])) + self.assertEqual(assign_metrics.shape, torch.Size([batch_size, 0])) + + def test_assign_rotate_box(self): + try: + import importlib + importlib.import_module('mmrotate') + except ImportError: + pytest.skip('mmrotate is not installed.', allow_module_level=True) + + num_classes = 2 + batch_size = 2 + + assigner = BatchDynamicSoftLabelAssigner( + num_classes=num_classes, + soft_center_radius=3.0, + topk=1, + iou_weight=3.0, + iou_calculator=dict(type='mmrotate.RBboxOverlaps2D'), + # RBboxOverlaps2D doesn't support batch input, use loop instead. + batch_iou=False, + ) + + pred_bboxes = torch.FloatTensor([ + [23, 23, 20, 20, 0.078], + [4, 5, 2, 2, 0.078], + ]).unsqueeze(0).repeat(batch_size, 10, 1) + + pred_scores = torch.FloatTensor([ + [0.2], + [0.8], + ]).unsqueeze(0).repeat(batch_size, 10, 1) + + priors = torch.FloatTensor([[30, 30, 8, 8], [4, 5, 6, + 7]]).repeat(10, 1) + + gt_bboxes = torch.FloatTensor([[23, 23, 20, 20, + 0.078]]).unsqueeze(0).repeat( + batch_size, 1, 1) + + gt_labels = torch.LongTensor([[0] + ]).unsqueeze(0).repeat(batch_size, 1, 1) + pad_bbox_flag = torch.FloatTensor([[1]]).unsqueeze(0).repeat( + batch_size, 1, 1) + + assign_result = assigner.forward(pred_bboxes, pred_scores, priors, + gt_labels, gt_bboxes, pad_bbox_flag) + + assigned_labels = assign_result['assigned_labels'] + assigned_labels_weights = assign_result['assigned_labels_weights'] + assigned_bboxes = assign_result['assigned_bboxes'] + assign_metrics = assign_result['assign_metrics'] + + self.assertEqual(assigned_labels.shape, torch.Size([batch_size, 20])) + self.assertEqual(assigned_bboxes.shape, torch.Size([batch_size, 20, + 5])) + self.assertEqual(assigned_labels_weights.shape, + torch.Size([batch_size, 20])) + self.assertEqual(assign_metrics.shape, torch.Size([batch_size, 20])) diff --git a/third_party/mmyolo/tests/test_models/test_task_modules/test_assigners/test_batch_task_aligned_assigner.py b/third_party/mmyolo/tests/test_models/test_task_modules/test_assigners/test_batch_task_aligned_assigner.py new file mode 100644 index 0000000000000000000000000000000000000000..fe474b53122703af556ff11a3ef42fa0a3ced736 --- /dev/null +++ b/third_party/mmyolo/tests/test_models/test_task_modules/test_assigners/test_batch_task_aligned_assigner.py @@ -0,0 +1,56 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from unittest import TestCase + +import torch + +from mmyolo.models.task_modules.assigners import BatchTaskAlignedAssigner + + +class TestBatchTaskAlignedAssigner(TestCase): + + def test_batch_task_aligned_assigner(self): + batch_size = 2 + num_classes = 4 + assigner = BatchTaskAlignedAssigner( + num_classes=num_classes, alpha=1, beta=6, topk=13, eps=1e-9) + pred_scores = torch.FloatTensor([ + [0.1, 0.2], + [0.2, 0.3], + [0.3, 0.4], + [0.4, 0.5], + ]).unsqueeze(0).repeat(batch_size, 21, 1) + priors = torch.FloatTensor([ + [0, 0, 4., 4.], + [0, 0, 12., 4.], + [0, 0, 20., 4.], + [0, 0, 28., 4.], + ]).repeat(21, 1) + gt_bboxes = torch.FloatTensor([ + [0, 0, 60, 93], + [229, 0, 532, 157], + ]).unsqueeze(0).repeat(batch_size, 1, 1) + gt_labels = torch.LongTensor([[0], [1] + ]).unsqueeze(0).repeat(batch_size, 1, 1) + pad_bbox_flag = torch.FloatTensor([[1], [0]]).unsqueeze(0).repeat( + batch_size, 1, 1) + pred_bboxes = torch.FloatTensor([ + [-4., -4., 12., 12.], + [4., -4., 20., 12.], + [12., -4., 28., 12.], + [20., -4., 36., 12.], + ]).unsqueeze(0).repeat(batch_size, 21, 1) + + assign_result = assigner.forward(pred_bboxes, pred_scores, priors, + gt_labels, gt_bboxes, pad_bbox_flag) + + assigned_labels = assign_result['assigned_labels'] + assigned_bboxes = assign_result['assigned_bboxes'] + assigned_scores = assign_result['assigned_scores'] + fg_mask_pre_prior = assign_result['fg_mask_pre_prior'] + + self.assertEqual(assigned_labels.shape, torch.Size([batch_size, 84])) + self.assertEqual(assigned_bboxes.shape, torch.Size([batch_size, 84, + 4])) + self.assertEqual(assigned_scores.shape, + torch.Size([batch_size, 84, num_classes])) + self.assertEqual(fg_mask_pre_prior.shape, torch.Size([batch_size, 84])) diff --git a/third_party/mmyolo/tests/test_models/test_task_modules/test_assigners/test_pose_sim_ota_assigner.py b/third_party/mmyolo/tests/test_models/test_task_modules/test_assigners/test_pose_sim_ota_assigner.py new file mode 100644 index 0000000000000000000000000000000000000000..fb4793f7e4ed0066545e821352f0a5e263d3b9fd --- /dev/null +++ b/third_party/mmyolo/tests/test_models/test_task_modules/test_assigners/test_pose_sim_ota_assigner.py @@ -0,0 +1,85 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from unittest import TestCase + +import torch +from mmengine.structures import InstanceData +from mmengine.testing import assert_allclose + +from mmyolo.models.task_modules.assigners import PoseSimOTAAssigner + + +class TestPoseSimOTAAssigner(TestCase): + + def test_assign(self): + assigner = PoseSimOTAAssigner( + center_radius=2.5, + candidate_topk=1, + iou_weight=3.0, + cls_weight=1.0, + iou_calculator=dict(type='mmdet.BboxOverlaps2D')) + pred_instances = InstanceData( + bboxes=torch.Tensor([[23, 23, 43, 43] + [1] * 51, + [4, 5, 6, 7] + [1] * 51]), + scores=torch.FloatTensor([[0.2], [0.8]]), + priors=torch.Tensor([[30, 30, 8, 8], [4, 5, 6, 7]])) + gt_instances = InstanceData( + bboxes=torch.Tensor([[23, 23, 43, 43]]), + labels=torch.LongTensor([0]), + keypoints_visible=torch.Tensor([[ + 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 0., 0., 0., 0., 0., + 0. 
+ ]]), + keypoints=torch.Tensor([[[30, 30], [30, 30], [30, 30], [30, 30], + [30, 30], [30, 30], [30, 30], [30, 30], + [30, 30], [30, 30], [30, 30], [30, 30], + [30, 30], [30, 30], [30, 30], [30, 30], + [30, 30]]])) + assign_result = assigner.assign( + pred_instances=pred_instances, gt_instances=gt_instances) + + expected_gt_inds = torch.LongTensor([1, 0]) + assert_allclose(assign_result.gt_inds, expected_gt_inds) + + def test_assign_with_no_valid_bboxes(self): + assigner = PoseSimOTAAssigner( + center_radius=2.5, + candidate_topk=1, + iou_weight=3.0, + cls_weight=1.0, + iou_calculator=dict(type='mmdet.BboxOverlaps2D')) + pred_instances = InstanceData( + bboxes=torch.Tensor([[123, 123, 143, 143], [114, 151, 161, 171]]), + scores=torch.FloatTensor([[0.2], [0.8]]), + priors=torch.Tensor([[30, 30, 8, 8], [55, 55, 8, 8]])) + gt_instances = InstanceData( + bboxes=torch.Tensor([[0, 0, 1, 1]]), + labels=torch.LongTensor([0]), + keypoints_visible=torch.zeros((1, 17)), + keypoints=torch.zeros((1, 17, 2))) + assign_result = assigner.assign( + pred_instances=pred_instances, gt_instances=gt_instances) + + expected_gt_inds = torch.LongTensor([0, 0]) + assert_allclose(assign_result.gt_inds, expected_gt_inds) + + def test_assign_with_empty_gt(self): + assigner = PoseSimOTAAssigner( + center_radius=2.5, + candidate_topk=1, + iou_weight=3.0, + cls_weight=1.0, + iou_calculator=dict(type='mmdet.BboxOverlaps2D')) + pred_instances = InstanceData( + bboxes=torch.Tensor([[[30, 40, 50, 60]], [[4, 5, 6, 7]]]), + scores=torch.FloatTensor([[0.2], [0.8]]), + priors=torch.Tensor([[0, 12, 23, 34], [4, 5, 6, 7]])) + gt_instances = InstanceData( + bboxes=torch.empty(0, 4), + labels=torch.empty(0), + keypoints_visible=torch.empty(0, 17), + keypoints=torch.empty(0, 17, 2)) + + assign_result = assigner.assign( + pred_instances=pred_instances, gt_instances=gt_instances) + expected_gt_inds = torch.LongTensor([0, 0]) + assert_allclose(assign_result.gt_inds, expected_gt_inds) diff --git a/third_party/mmyolo/tests/test_models/test_task_modules/test_coders/__init__.py b/third_party/mmyolo/tests/test_models/test_task_modules/test_coders/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ef101fec61e72abc0eb90266d453b5b22331378d --- /dev/null +++ b/third_party/mmyolo/tests/test_models/test_task_modules/test_coders/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. diff --git a/third_party/mmyolo/tests/test_models/test_task_modules/test_coders/test_distance_point_bbox_coder.py b/third_party/mmyolo/tests/test_models/test_task_modules/test_coders/test_distance_point_bbox_coder.py new file mode 100644 index 0000000000000000000000000000000000000000..10b0215c27d7a1f88f894f459cf641555833da9e --- /dev/null +++ b/third_party/mmyolo/tests/test_models/test_task_modules/test_coders/test_distance_point_bbox_coder.py @@ -0,0 +1,29 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from unittest import TestCase + +import torch + +from mmyolo.models.task_modules.coders import DistancePointBBoxCoder + + +class TestDistancePointBBoxCoder(TestCase): + + def test_decoder(self): + coder = DistancePointBBoxCoder() + + points = torch.Tensor([[74., 61.], [-29., 106.], [138., 61.], + [29., 170.]]) + pred_bboxes = torch.Tensor([[0, -1, 3, 3], [-1, -7, -4.8, 9], + [-23, -1, 12, 1], [14.5, -13, 10, 18.3]]) + expected_distance = torch.Tensor([[74, 63, 80, 67], + [-25, 134, -48.2, 142], + [276, 67, 210, 67], + [-58, 248, 89, 279.8]]) + strides = torch.Tensor([2, 4, 6, 6]) + out_distance = coder.decode(points, pred_bboxes, strides) + assert expected_distance.allclose(out_distance) + + batch_priors = points.unsqueeze(0).repeat(2, 1, 1) + batch_pred_bboxes = pred_bboxes.unsqueeze(0).repeat(2, 1, 1) + batch_out = coder.decode(batch_priors, batch_pred_bboxes, strides)[0] + assert out_distance.allclose(batch_out) diff --git a/third_party/mmyolo/tests/test_models/test_task_modules/test_coders/test_yolov5_bbox_coder.py b/third_party/mmyolo/tests/test_models/test_task_modules/test_coders/test_yolov5_bbox_coder.py new file mode 100644 index 0000000000000000000000000000000000000000..e1d4ebe1fd9dc5263b09e8d07a456a41e61bbc3b --- /dev/null +++ b/third_party/mmyolo/tests/test_models/test_task_modules/test_coders/test_yolov5_bbox_coder.py @@ -0,0 +1,32 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from unittest import TestCase + +import torch + +from mmyolo.models.task_modules.coders import YOLOv5BBoxCoder + + +class TestYOLOv5Coder(TestCase): + + def test_decoder(self): + coder = YOLOv5BBoxCoder() + + priors = torch.Tensor([[10., 10., 20., 20.], [10., 8., 10., 10.], + [15., 8., 20., 3.], [2., 5., 5., 8.]]) + pred_bboxes = torch.Tensor([[0.0000, 0.0000, 1.0000, 1.0000], + [0.1409, 0.1409, 2.8591, 2.8591], + [0.0000, 0.3161, 4.1945, 0.6839], + [1.0000, 5.0000, 9.0000, 5.0000]]) + strides = torch.Tensor([2, 4, 8, 8]) + expected_decode_bboxes = torch.Tensor( + [[4.3111, 4.3111, 25.6889, 25.6889], + [10.2813, 5.7033, 10.2813, 12.8594], + [7.7949, 11.1710, 27.2051, 2.3369], + [1.1984, 8.4730, 13.1955, 20.3129]]) + out = coder.decode(priors, pred_bboxes, strides) + assert expected_decode_bboxes.allclose(out, atol=1e-04) + + batch_priors = priors.unsqueeze(0).repeat(2, 1, 1) + batch_pred_bboxes = pred_bboxes.unsqueeze(0).repeat(2, 1, 1) + batch_out = coder.decode(batch_priors, batch_pred_bboxes, strides)[0] + assert out.allclose(batch_out) diff --git a/third_party/mmyolo/tests/test_models/test_task_modules/test_coders/test_yolox_bbox_coder.py b/third_party/mmyolo/tests/test_models/test_task_modules/test_coders/test_yolox_bbox_coder.py new file mode 100644 index 0000000000000000000000000000000000000000..00d6c3164b840ad05fe112ff629ad74faffb2418 --- /dev/null +++ b/third_party/mmyolo/tests/test_models/test_task_modules/test_coders/test_yolox_bbox_coder.py @@ -0,0 +1,31 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
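+# In the YOLOX decode test below, box centres are the prior points shifted by
+# pred_xy * stride and box sizes are exp(pred_wh) * stride; the expected boxes
+# follow from that formula (e.g. prior (10, 10), pred (0, 0, 1, 1), stride 2
+# gives a box of side 2 * e centred at (10, 10)).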
+from unittest import TestCase + +import torch + +from mmyolo.models.task_modules.coders import YOLOXBBoxCoder + + +class TestYOLOv5Coder(TestCase): + + def test_decoder(self): + coder = YOLOXBBoxCoder() + + priors = torch.Tensor([[10., 10.], [8., 8.], [15., 8.], [2., 5.]]) + pred_bboxes = torch.Tensor([[0.0000, 0.0000, 1.0000, 1.0000], + [0.0409, 0.1409, 0.8591, 0.8591], + [0.0000, 0.3161, 0.1945, 0.6839], + [1.0000, 5.0000, 0.2000, 0.6000]]) + strides = torch.Tensor([2, 4, 6, 6]) + expected_decode_bboxes = torch.Tensor( + [[7.2817, 7.2817, 12.7183, 12.7183], + [3.4415, 3.8415, 12.8857, 13.2857], + [11.3559, 3.9518, 18.6441, 15.8414], + [4.3358, 29.5336, 11.6642, 40.4664]]) + out = coder.decode(priors, pred_bboxes, strides) + assert expected_decode_bboxes.allclose(out, atol=1e-04) + + batch_priors = priors.unsqueeze(0).repeat(2, 1, 1) + batch_pred_bboxes = pred_bboxes.unsqueeze(0).repeat(2, 1, 1) + batch_out = coder.decode(batch_priors, batch_pred_bboxes, strides)[0] + assert out.allclose(batch_out) diff --git a/third_party/mmyolo/tests/test_models/test_utils/__init__.py b/third_party/mmyolo/tests/test_models/test_utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ef101fec61e72abc0eb90266d453b5b22331378d --- /dev/null +++ b/third_party/mmyolo/tests/test_models/test_utils/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. diff --git a/third_party/mmyolo/tests/test_models/test_utils/test_misc.py b/third_party/mmyolo/tests/test_models/test_utils/test_misc.py new file mode 100644 index 0000000000000000000000000000000000000000..dce9502571e4294757ac6f2b9bb524e35c372c29 --- /dev/null +++ b/third_party/mmyolo/tests/test_models/test_utils/test_misc.py @@ -0,0 +1,35 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import pytest +import torch +from mmengine.structures import InstanceData +from torch import Tensor + +from mmyolo.models.utils import gt_instances_preprocess +from mmyolo.utils import register_all_modules + +register_all_modules() + + +class TestGtInstancesPreprocess: + + @pytest.mark.parametrize('box_dim', [4, 5]) + def test(self, box_dim): + gt_instances = InstanceData( + bboxes=torch.empty((0, box_dim)), labels=torch.LongTensor([])) + batch_size = 1 + batch_instance = gt_instances_preprocess([gt_instances], batch_size) + assert isinstance(batch_instance, Tensor) + assert len(batch_instance.shape) == 3, 'the len of result must be 3.' + assert batch_instance.size(-1) == box_dim + 1 + + @pytest.mark.parametrize('box_dim', [4, 5]) + def test_fast_version(self, box_dim: int): + gt_instances = torch.from_numpy( + np.array([[0., 1., *(0., ) * box_dim]], dtype=np.float32)) + batch_size = 1 + batch_instance = gt_instances_preprocess(gt_instances, batch_size) + assert isinstance(batch_instance, Tensor) + assert len(batch_instance.shape) == 3, 'the len of result must be 3.' + assert batch_instance.shape[1] == 1 + assert batch_instance.shape[2] == box_dim + 1 diff --git a/third_party/mmyolo/tests/test_utils/test_collect_env.py b/third_party/mmyolo/tests/test_utils/test_collect_env.py new file mode 100644 index 0000000000000000000000000000000000000000..913f46fa3c9286e9c3cbd656ad5e93def143aea0 --- /dev/null +++ b/third_party/mmyolo/tests/test_utils/test_collect_env.py @@ -0,0 +1,33 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
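+# Smoke test for mmyolo.utils.collect_env: the common environment keys must be
+# present, and the reported MMEngine / MMCV / MMDetection versions must match
+# the installed packages.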
+import sys +from unittest import TestCase + +import mmcv +import mmdet +import mmengine + +from mmyolo.utils import collect_env + + +class TestCollectEnv(TestCase): + + def test_collect_env(self): + env_info = collect_env() + print(env_info) + expected_keys = [ + 'sys.platform', 'Python', 'CUDA available', 'PyTorch', + 'PyTorch compiling details', 'OpenCV', 'MMEngine', 'GCC' + ] + for key in expected_keys: + assert key in env_info + + if env_info['CUDA available']: + for key in ['CUDA_HOME', 'NVCC']: + assert key in env_info + + assert env_info['sys.platform'] == sys.platform + assert env_info['Python'] == sys.version.replace('\n', '') + + assert env_info['MMEngine'] == mmengine.__version__ + assert env_info['MMCV'] == mmcv.__version__ + assert env_info['MMDetection'] == mmdet.__version__ diff --git a/third_party/mmyolo/tests/test_utils/test_setup_env.py b/third_party/mmyolo/tests/test_utils/test_setup_env.py new file mode 100644 index 0000000000000000000000000000000000000000..e6bd6890b31bbe9179553bd440cc0e8bc44329c2 --- /dev/null +++ b/third_party/mmyolo/tests/test_utils/test_setup_env.py @@ -0,0 +1,39 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import datetime +import sys +from unittest import TestCase + +from mmengine import DefaultScope + +from mmyolo.utils import register_all_modules + + +class TestSetupEnv(TestCase): + + def test_register_all_modules(self): + from mmyolo.registry import DATASETS + + # not init default scope + sys.modules.pop('mmyolo.datasets', None) + sys.modules.pop('mmyolo.datasets.yolov5_coco', None) + DATASETS._module_dict.pop('YOLOv5CocoDataset', None) + self.assertFalse('YOLOv5CocoDataset' in DATASETS.module_dict) + register_all_modules(init_default_scope=False) + self.assertTrue('YOLOv5CocoDataset' in DATASETS.module_dict) + + # init default scope + sys.modules.pop('mmyolo.datasets', None) + sys.modules.pop('mmyolo.datasets.yolov5_coco', None) + DATASETS._module_dict.pop('YOLOv5CocoDataset', None) + self.assertFalse('YOLOv5CocoDataset' in DATASETS.module_dict) + register_all_modules(init_default_scope=True) + self.assertTrue('YOLOv5CocoDataset' in DATASETS.module_dict) + self.assertEqual(DefaultScope.get_current_instance().scope_name, + 'mmyolo') + + # init default scope when another scope is init + name = f'test-{datetime.datetime.now()}' + DefaultScope.get_instance(name, scope_name='test') + with self.assertWarnsRegex( + Warning, 'The current default scope "test" is not "mmyolo"'): + register_all_modules(init_default_scope=True) diff --git a/third_party/mmyolo/tools/analysis_tools/benchmark.py b/third_party/mmyolo/tools/analysis_tools/benchmark.py new file mode 100644 index 0000000000000000000000000000000000000000..29f53a4768c3339d656d4bb71dae3396e5501265 --- /dev/null +++ b/third_party/mmyolo/tools/analysis_tools/benchmark.py @@ -0,0 +1,188 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
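+# Measures the pure inference speed (FPS) of a detector: the model is built
+# from the config, the checkpoint is loaded, and the test dataloader is run
+# with batch size 1, skipping the first few iterations as warm-up. A usage
+# sketch with placeholder paths:
+#
+#     python tools/analysis_tools/benchmark.py ${CONFIG} ${CHECKPOINT} --max-iter 2000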
+import argparse +import copy +import os +import time + +import torch +from mmengine import Config, DictAction +from mmengine.dist import get_world_size, init_dist +from mmengine.logging import MMLogger, print_log +from mmengine.registry import init_default_scope +from mmengine.runner import Runner, load_checkpoint +from mmengine.utils import mkdir_or_exist +from mmengine.utils.dl_utils import set_multi_processing + +from mmyolo.registry import MODELS + + +# TODO: Refactoring and improving +def parse_args(): + parser = argparse.ArgumentParser(description='MMYOLO benchmark a model') + parser.add_argument('config', help='test config file path') + parser.add_argument('checkpoint', help='checkpoint file') + parser.add_argument( + '--repeat-num', + type=int, + default=1, + help='number of repeat times of measurement for averaging the results') + parser.add_argument( + '--max-iter', type=int, default=2000, help='num of max iter') + parser.add_argument( + '--log-interval', type=int, default=50, help='interval of logging') + parser.add_argument( + '--work-dir', + help='the directory to save the file containing ' + 'benchmark metrics') + parser.add_argument( + '--fuse-conv-bn', + action='store_true', + help='Whether to fuse conv and bn, this will slightly increase' + 'the inference speed') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + parser.add_argument( + '--launcher', + choices=['none', 'pytorch', 'slurm', 'mpi'], + default='none', + help='job launcher') + parser.add_argument('--local_rank', type=int, default=0) + args = parser.parse_args() + if 'LOCAL_RANK' not in os.environ: + os.environ['LOCAL_RANK'] = str(args.local_rank) + return args + + +def measure_inference_speed(cfg, checkpoint, max_iter, log_interval, + is_fuse_conv_bn): + env_cfg = cfg.get('env_cfg') + if env_cfg.get('cudnn_benchmark'): + torch.backends.cudnn.benchmark = True + + mp_cfg: dict = env_cfg.get('mp_cfg', {}) + set_multi_processing(**mp_cfg, distributed=cfg.distributed) + + # Because multiple processes will occupy additional CPU resources, + # FPS statistics will be more unstable when num_workers is not 0. + # It is reasonable to set num_workers to 0. 
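+    # batch_size is likewise forced to 1 below so that the measured time per
+    # iteration corresponds to a single image, and persistent_workers is
+    # disabled since it cannot be combined with num_workers=0.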
+ dataloader_cfg = cfg.test_dataloader + dataloader_cfg['num_workers'] = 0 + dataloader_cfg['batch_size'] = 1 + dataloader_cfg['persistent_workers'] = False + data_loader = Runner.build_dataloader(dataloader_cfg) + + # build the model and load checkpoint + model = MODELS.build(cfg.model) + load_checkpoint(model, checkpoint, map_location='cpu') + model = model.cuda() + model.eval() + + # the first several iterations may be very slow so skip them + num_warmup = 5 + pure_inf_time = 0 + fps = 0 + + # benchmark with 2000 image and take the average + for i, data in enumerate(data_loader): + + torch.cuda.synchronize() + start_time = time.perf_counter() + + with torch.no_grad(): + model.test_step(data) + + torch.cuda.synchronize() + elapsed = time.perf_counter() - start_time + + if i >= num_warmup: + pure_inf_time += elapsed + if (i + 1) % log_interval == 0: + fps = (i + 1 - num_warmup) / pure_inf_time + print_log( + f'Done image [{i + 1:<3}/ {max_iter}], ' + f'fps: {fps:.1f} img / s, ' + f'times per image: {1000 / fps:.1f} ms / img', 'current') + + if (i + 1) == max_iter: + fps = (i + 1 - num_warmup) / pure_inf_time + print_log( + f'Overall fps: {fps:.1f} img / s, ' + f'times per image: {1000 / fps:.1f} ms / img', 'current') + break + return fps + + +def repeat_measure_inference_speed(cfg, + checkpoint, + max_iter, + log_interval, + is_fuse_conv_bn, + repeat_num=1): + assert repeat_num >= 1 + + fps_list = [] + + for _ in range(repeat_num): + cp_cfg = copy.deepcopy(cfg) + + fps_list.append( + measure_inference_speed(cp_cfg, checkpoint, max_iter, log_interval, + is_fuse_conv_bn)) + + if repeat_num > 1: + fps_list_ = [round(fps, 1) for fps in fps_list] + times_pre_image_list_ = [round(1000 / fps, 1) for fps in fps_list] + mean_fps_ = sum(fps_list_) / len(fps_list_) + mean_times_pre_image_ = sum(times_pre_image_list_) / len( + times_pre_image_list_) + print_log( + f'Overall fps: {fps_list_}[{mean_fps_:.1f}] img / s, ' + f'times per image: ' + f'{times_pre_image_list_}[{mean_times_pre_image_:.1f}] ms / img', + 'current') + return fps_list + + return fps_list[0] + + +# TODO: refactoring +def main(): + args = parse_args() + + cfg = Config.fromfile(args.config) + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + + init_default_scope(cfg.get('default_scope', 'mmyolo')) + + distributed = False + if args.launcher != 'none': + init_dist(args.launcher, **cfg.get('env_cfg', {}).get('dist_cfg', {})) + distributed = True + assert get_world_size( + ) == 1, 'Inference benchmark does not allow distributed multi-GPU' + + cfg.distributed = distributed + + log_file = None + if args.work_dir: + log_file = os.path.join(args.work_dir, 'benchmark.log') + mkdir_or_exist(args.work_dir) + + MMLogger.get_instance('mmyolo', log_file=log_file, log_level='INFO') + + repeat_measure_inference_speed(cfg, args.checkpoint, args.max_iter, + args.log_interval, args.fuse_conv_bn, + args.repeat_num) + + +if __name__ == '__main__': + main() diff --git a/third_party/mmyolo/tools/analysis_tools/browse_coco_json.py b/third_party/mmyolo/tools/analysis_tools/browse_coco_json.py new file mode 100644 index 0000000000000000000000000000000000000000..71a2fc2a942d234e1ce2e3e93901a66bacb123df --- /dev/null +++ b/third_party/mmyolo/tools/analysis_tools/browse_coco_json.py @@ -0,0 +1,147 @@ +import argparse +import os.path as osp + +import cv2 +import matplotlib.pyplot as plt +import numpy as np +from matplotlib.collections import PatchCollection +from matplotlib.patches import Polygon +from pycocotools.coco import COCO + + 
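+# Browse a COCO-format annotation file: each selected image is read with
+# OpenCV, converted from BGR to RGB and shown with matplotlib, drawing either
+# all annotation types (--disp-all) or only the bounding boxes.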
+def show_coco_json(args): + if args.data_root is not None: + coco = COCO(osp.join(args.data_root, args.ann_file)) + else: + coco = COCO(args.ann_file) + print(f'Total number of images:{len(coco.getImgIds())}') + categories = coco.loadCats(coco.getCatIds()) + category_names = [category['name'] for category in categories] + print(f'Total number of Categories : {len(category_names)}') + print('Categories: \n{}\n'.format(' '.join(category_names))) + + if args.category_names is None: + category_ids = [] + else: + assert set(category_names) > set(args.category_names) + category_ids = coco.getCatIds(args.category_names) + + image_ids = coco.getImgIds(catIds=category_ids) + + if args.shuffle: + np.random.shuffle(image_ids) + + for i in range(len(image_ids)): + image_data = coco.loadImgs(image_ids[i])[0] + if args.data_root is not None: + image_path = osp.join(args.data_root, args.img_dir, + image_data['file_name']) + else: + image_path = osp.join(args.img_dir, image_data['file_name']) + + annotation_ids = coco.getAnnIds( + imgIds=image_data['id'], catIds=category_ids, iscrowd=0) + annotations = coco.loadAnns(annotation_ids) + + image = cv2.imread(image_path) + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + + plt.figure() + plt.imshow(image) + + if args.disp_all: + coco.showAnns(annotations) + else: + show_bbox_only(coco, annotations) + + if args.wait_time == 0: + plt.show() + else: + plt.show(block=False) + plt.pause(args.wait_time) + + plt.close() + + +def show_bbox_only(coco, anns, show_label_bbox=True, is_filling=True): + """Show bounding box of annotations Only.""" + if len(anns) == 0: + return + + ax = plt.gca() + ax.set_autoscale_on(False) + + image2color = dict() + for cat in coco.getCatIds(): + image2color[cat] = (np.random.random((1, 3)) * 0.7 + 0.3).tolist()[0] + + polygons = [] + colors = [] + + for ann in anns: + color = image2color[ann['category_id']] + bbox_x, bbox_y, bbox_w, bbox_h = ann['bbox'] + poly = [[bbox_x, bbox_y], [bbox_x, bbox_y + bbox_h], + [bbox_x + bbox_w, bbox_y + bbox_h], [bbox_x + bbox_w, bbox_y]] + polygons.append(Polygon(np.array(poly).reshape((4, 2)))) + colors.append(color) + + if show_label_bbox: + label_bbox = dict(facecolor=color) + else: + label_bbox = None + + ax.text( + bbox_x, + bbox_y, + '%s' % (coco.loadCats(ann['category_id'])[0]['name']), + color='white', + bbox=label_bbox) + + if is_filling: + p = PatchCollection( + polygons, facecolor=colors, linewidths=0, alpha=0.4) + ax.add_collection(p) + p = PatchCollection( + polygons, facecolor='none', edgecolors=colors, linewidths=2) + ax.add_collection(p) + + +def parse_args(): + parser = argparse.ArgumentParser(description='Show coco json file') + parser.add_argument('--data-root', default=None, help='dataset root') + parser.add_argument( + '--img-dir', default='data/coco/train2017', help='image folder path') + parser.add_argument( + '--ann-file', + default='data/coco/annotations/instances_train2017.json', + help='ann file path') + parser.add_argument( + '--wait-time', type=float, default=2, help='the interval of show (s)') + parser.add_argument( + '--disp-all', + action='store_true', + help='Whether to display all types of data, ' + 'such as bbox and mask.' 
+ ' Default is to display only bbox') + parser.add_argument( + '--category-names', + type=str, + default=None, + nargs='+', + help='Display category-specific data, e.g., "bicycle", "person"') + parser.add_argument( + '--shuffle', + action='store_true', + help='Whether to display in disorder') + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + show_coco_json(args) + + +if __name__ == '__main__': + main() diff --git a/third_party/mmyolo/tools/analysis_tools/browse_dataset.py b/third_party/mmyolo/tools/analysis_tools/browse_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..21a1d709d3ced0e5f865748afa0a1e258a8751f9 --- /dev/null +++ b/third_party/mmyolo/tools/analysis_tools/browse_dataset.py @@ -0,0 +1,276 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import os.path as osp +import sys +from typing import Tuple + +import cv2 +import mmcv +import numpy as np +from mmdet.models.utils import mask2ndarray +from mmdet.structures.bbox import BaseBoxes +from mmengine.config import Config, DictAction +from mmengine.dataset import Compose +from mmengine.registry import init_default_scope +from mmengine.utils import ProgressBar +from mmengine.visualization import Visualizer + +from mmyolo.registry import DATASETS, VISUALIZERS + + +# TODO: Support for printing the change in key of results +# TODO: Some bug. If you meet some bug, please use the original +def parse_args(): + parser = argparse.ArgumentParser(description='Browse a dataset') + parser.add_argument('config', help='train config file path') + parser.add_argument( + '--phase', + '-p', + default='train', + type=str, + choices=['train', 'test', 'val'], + help='phase of dataset to visualize, accept "train" "test" and "val".' + ' Defaults to "train".') + parser.add_argument( + '--mode', + '-m', + default='transformed', + type=str, + choices=['original', 'transformed', 'pipeline'], + help='display mode; display original pictures or ' + 'transformed pictures or comparison pictures. "original" ' + 'means show images load from disk; "transformed" means ' + 'to show images after transformed; "pipeline" means show all ' + 'the intermediate images. Defaults to "transformed".') + parser.add_argument( + '--out-dir', + default='output', + type=str, + help='If there is no display interface, you can save it.') + parser.add_argument('--not-show', default=False, action='store_true') + parser.add_argument( + '--show-number', + '-n', + type=int, + default=sys.maxsize, + help='number of images selected to visualize, ' + 'must bigger than 0. if the number is bigger than length ' + 'of dataset, show all the images in dataset; ' + 'default "sys.maxsize", show all images in dataset') + parser.add_argument( + '--show-interval', + '-i', + type=float, + default=3, + help='the interval of show (s)') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + args = parser.parse_args() + return args + + +def _get_adaptive_scale(img_shape: Tuple[int, int], + min_scale: float = 0.3, + max_scale: float = 3.0) -> float: + """Get adaptive scale according to image shape. 
+
+    The target scale depends on the short edge length of the image. If the
+    short edge length equals 224, the output is 1.0. The output scales
+    linearly according to the short edge length. You can also specify the
+    minimum scale and the maximum scale to limit the linear scale.
+
+    Args:
+        img_shape (Tuple[int, int]): The shape of the canvas image.
+        min_scale (float): The minimum scale. Defaults to 0.3.
+        max_scale (float): The maximum scale. Defaults to 3.0.
+    Returns:
+        float: The adaptive scale.
+    """
+    short_edge_length = min(img_shape)
+    scale = short_edge_length / 224.
+    return min(max(scale, min_scale), max_scale)
+
+
+def make_grid(imgs, names):
+    """Concatenate a list of pictures into a single big picture, aligning their heights."""
+    visualizer = Visualizer.get_current_instance()
+    ori_shapes = [img.shape[:2] for img in imgs]
+    max_height = int(max(img.shape[0] for img in imgs) * 1.1)
+    min_width = min(img.shape[1] for img in imgs)
+    horizontal_gap = min_width // 10
+    img_scale = _get_adaptive_scale((max_height, min_width))
+
+    texts = []
+    text_positions = []
+    start_x = 0
+    for i, img in enumerate(imgs):
+        pad_height = (max_height - img.shape[0]) // 2
+        pad_width = horizontal_gap // 2
+        # make border
+        imgs[i] = cv2.copyMakeBorder(
+            img,
+            pad_height,
+            max_height - img.shape[0] - pad_height + int(img_scale * 30 * 2),
+            pad_width,
+            pad_width,
+            cv2.BORDER_CONSTANT,
+            value=(255, 255, 255))
+        texts.append(f'{"execution: "}{i}\n{names[i]}\n{ori_shapes[i]}')
+        text_positions.append(
+            [start_x + img.shape[1] // 2 + pad_width, max_height])
+        start_x += img.shape[1] + horizontal_gap
+
+    display_img = np.concatenate(imgs, axis=1)
+    visualizer.set_image(display_img)
+    img_scale = _get_adaptive_scale(display_img.shape[:2])
+    visualizer.draw_texts(
+        texts,
+        positions=np.array(text_positions),
+        font_sizes=img_scale * 7,
+        colors='black',
+        horizontal_alignments='center',
+        font_families='monospace')
+    return visualizer.get_image()
+
+
+def swap_pipeline_position(dataset_cfg):
+    load_ann_tfm_name = 'LoadAnnotations'
+    pipeline = dataset_cfg.get('pipeline')
+    if pipeline is None:
+        return dataset_cfg
+    all_transform_types = [tfm['type'] for tfm in pipeline]
+    if load_ann_tfm_name in all_transform_types:
+        load_ann_tfm_index = all_transform_types.index(load_ann_tfm_name)
+        load_ann_tfm = pipeline.pop(load_ann_tfm_index)
+        pipeline.insert(1, load_ann_tfm)
+
+
+class InspectCompose(Compose):
+    """Compose multiple transforms sequentially.
+
+    It also records the "img" field of each intermediate result in one list.
+ """ + + def __init__(self, transforms, intermediate_imgs): + super().__init__(transforms=transforms) + self.intermediate_imgs = intermediate_imgs + + def __call__(self, data): + if 'img' in data: + self.intermediate_imgs.append({ + 'name': 'original', + 'img': data['img'].copy() + }) + self.ptransforms = [ + self.transforms[i] for i in range(len(self.transforms) - 1) + ] + for t in self.ptransforms: + data = t(data) + # Keep the same meta_keys in the PackDetInputs + self.transforms[-1].meta_keys = [key for key in data] + data_sample = self.transforms[-1](data) + if data is None: + return None + if 'img' in data: + self.intermediate_imgs.append({ + 'name': + t.__class__.__name__, + 'dataset_sample': + data_sample['data_samples'] + }) + return data + + +def main(): + args = parse_args() + cfg = Config.fromfile(args.config) + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + + init_default_scope(cfg.get('default_scope', 'mmyolo')) + + dataset_cfg = cfg.get(args.phase + '_dataloader').get('dataset') + if (args.phase in ['test', 'val']): + swap_pipeline_position(dataset_cfg) + dataset = DATASETS.build(dataset_cfg) + visualizer = VISUALIZERS.build(cfg.visualizer) + visualizer.dataset_meta = dataset.metainfo + + intermediate_imgs = [] + + if not hasattr(dataset, 'pipeline'): + # for dataset_wrapper + dataset = dataset.dataset + + # TODO: The dataset wrapper occasion is not considered here + dataset.pipeline = InspectCompose(dataset.pipeline.transforms, + intermediate_imgs) + + # init visualization image number + assert args.show_number > 0 + display_number = min(args.show_number, len(dataset)) + + progress_bar = ProgressBar(display_number) + for i, item in zip(range(display_number), dataset): + image_i = [] + result_i = [result['dataset_sample'] for result in intermediate_imgs] + for k, datasample in enumerate(result_i): + image = datasample.img + gt_instances = datasample.gt_instances + image = image[..., [2, 1, 0]] # bgr to rgb + gt_bboxes = gt_instances.get('bboxes', None) + if gt_bboxes is not None and isinstance(gt_bboxes, BaseBoxes): + gt_instances.bboxes = gt_bboxes.tensor + gt_masks = gt_instances.get('masks', None) + if gt_masks is not None: + masks = mask2ndarray(gt_masks) + gt_instances.masks = masks.astype(bool) + datasample.gt_instances = gt_instances + # get filename from dataset or just use index as filename + visualizer.add_datasample( + 'result', + image, + datasample, + draw_pred=False, + draw_gt=True, + show=False) + image_show = visualizer.get_image() + image_i.append(image_show) + + if args.mode == 'original': + image = image_i[0] + elif args.mode == 'transformed': + image = image_i[-1] + else: + image = make_grid([result for result in image_i], + [result['name'] for result in intermediate_imgs]) + + if hasattr(datasample, 'img_path'): + filename = osp.basename(datasample.img_path) + else: + # some dataset have not image path + filename = f'{i}.jpg' + out_file = osp.join(args.out_dir, + filename) if args.out_dir is not None else None + + if out_file is not None: + mmcv.imwrite(image[..., ::-1], out_file) + + if not args.not_show: + visualizer.show( + image, win_name=filename, wait_time=args.show_interval) + + intermediate_imgs.clear() + progress_bar.update() + + +if __name__ == '__main__': + main() diff --git a/third_party/mmyolo/tools/analysis_tools/browse_dataset_simple.py b/third_party/mmyolo/tools/analysis_tools/browse_dataset_simple.py new file mode 100644 index 0000000000000000000000000000000000000000..ebacbde3a5a2e1212089e4d4038fa286d462071b 
--- /dev/null +++ b/third_party/mmyolo/tools/analysis_tools/browse_dataset_simple.py @@ -0,0 +1,89 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import os.path as osp + +from mmdet.models.utils import mask2ndarray +from mmdet.structures.bbox import BaseBoxes +from mmengine.config import Config, DictAction +from mmengine.registry import init_default_scope +from mmengine.utils import ProgressBar + +from mmyolo.registry import DATASETS, VISUALIZERS + + +def parse_args(): + parser = argparse.ArgumentParser(description='Browse a dataset') + parser.add_argument('config', help='train config file path') + parser.add_argument( + '--output-dir', + default=None, + type=str, + help='If there is no display interface, you can save it') + parser.add_argument('--not-show', default=False, action='store_true') + parser.add_argument( + '--show-interval', + type=float, + default=0, + help='the interval of show (s)') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + cfg = Config.fromfile(args.config) + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + + # register all modules in mmdet into the registries + init_default_scope(cfg.get('default_scope', 'mmyolo')) + + dataset = DATASETS.build(cfg.train_dataloader.dataset) + visualizer = VISUALIZERS.build(cfg.visualizer) + visualizer.dataset_meta = dataset.metainfo + + progress_bar = ProgressBar(len(dataset)) + for item in dataset: + img = item['inputs'].permute(1, 2, 0).numpy() + data_sample = item['data_samples'].numpy() + gt_instances = data_sample.gt_instances + img_path = osp.basename(item['data_samples'].img_path) + + out_file = osp.join( + args.output_dir, + osp.basename(img_path)) if args.output_dir is not None else None + + img = img[..., [2, 1, 0]] # bgr to rgb + gt_bboxes = gt_instances.get('bboxes', None) + if gt_bboxes is not None and isinstance(gt_bboxes, BaseBoxes): + gt_instances.bboxes = gt_bboxes.tensor + gt_masks = gt_instances.get('masks', None) + if gt_masks is not None: + masks = mask2ndarray(gt_masks) + gt_instances.masks = masks.astype(bool) + data_sample.gt_instances = gt_instances + + visualizer.add_datasample( + osp.basename(img_path), + img, + data_sample, + draw_pred=False, + show=not args.not_show, + wait_time=args.show_interval, + out_file=out_file) + + progress_bar.update() + + +if __name__ == '__main__': + main() diff --git a/third_party/mmyolo/tools/analysis_tools/confusion_matrix.py b/third_party/mmyolo/tools/analysis_tools/confusion_matrix.py new file mode 100644 index 0000000000000000000000000000000000000000..f48abdb90eadba3d50bec106c2ad0ea7709e897d --- /dev/null +++ b/third_party/mmyolo/tools/analysis_tools/confusion_matrix.py @@ -0,0 +1,273 @@ +import argparse +import os + +import matplotlib.pyplot as plt +import numpy as np +from matplotlib.ticker import MultipleLocator +from mmcv.ops import nms +from mmdet.evaluation import bbox_overlaps +from mmdet.utils import replace_cfg_vals, update_data_root +from mmengine import Config, DictAction +from mmengine.fileio import load +from mmengine.registry 
import init_default_scope +from mmengine.utils import ProgressBar + +from mmyolo.registry import DATASETS + + +def parse_args(): + parser = argparse.ArgumentParser( + description='Generate confusion matrix from detection results') + parser.add_argument('config', help='test config file path') + parser.add_argument( + 'prediction_path', help='prediction path where test .pkl result') + parser.add_argument( + 'save_dir', help='directory where confusion matrix will be saved') + parser.add_argument( + '--show', action='store_true', help='show confusion matrix') + parser.add_argument( + '--color-theme', + default='plasma', + help='theme of the matrix color map') + parser.add_argument( + '--score-thr', + type=float, + default=0.3, + help='score threshold to filter detection bboxes') + parser.add_argument( + '--tp-iou-thr', + type=float, + default=0.5, + help='IoU threshold to be considered as matched') + parser.add_argument( + '--nms-iou-thr', + type=float, + default=None, + help='nms IoU threshold, only applied when users want to change the' + 'nms IoU threshold.') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + args = parser.parse_args() + return args + + +def calculate_confusion_matrix(dataset, + results, + score_thr=0, + nms_iou_thr=None, + tp_iou_thr=0.5): + """Calculate the confusion matrix. + + Args: + dataset (Dataset): Test or val dataset. + results (list[ndarray]): A list of detection results in each image. + score_thr (float|optional): Score threshold to filter bboxes. + Default: 0. + nms_iou_thr (float|optional): nms IoU threshold, the detection results + have done nms in the detector, only applied when users want to + change the nms IoU threshold. Default: None. + tp_iou_thr (float|optional): IoU threshold to be considered as matched. + Default: 0.5. + """ + num_classes = len(dataset.metainfo['classes']) + confusion_matrix = np.zeros(shape=[num_classes + 1, num_classes + 1]) + assert len(dataset) == len(results) + prog_bar = ProgressBar(len(results)) + for idx, per_img_res in enumerate(results): + res_bboxes = per_img_res['pred_instances'] + gts = dataset.get_data_info(idx)['instances'] + analyze_per_img_dets(confusion_matrix, gts, res_bboxes, score_thr, + tp_iou_thr, nms_iou_thr) + prog_bar.update() + return confusion_matrix + + +def analyze_per_img_dets(confusion_matrix, + gts, + result, + score_thr=0, + tp_iou_thr=0.5, + nms_iou_thr=None): + """Analyze detection results on each image. + + Args: + confusion_matrix (ndarray): The confusion matrix, + has shape (num_classes + 1, num_classes + 1). + gt_bboxes (ndarray): Ground truth bboxes, has shape (num_gt, 4). + gt_labels (ndarray): Ground truth labels, has shape (num_gt). + result (ndarray): Detection results, has shape + (num_classes, num_bboxes, 5). + score_thr (float): Score threshold to filter bboxes. + Default: 0. + tp_iou_thr (float): IoU threshold to be considered as matched. + Default: 0.5. + nms_iou_thr (float|optional): nms IoU threshold, the detection results + have done nms in the detector, only applied when users want to + change the nms IoU threshold. Default: None. 
+ """ + true_positives = np.zeros(len(gts)) + gt_bboxes = [] + gt_labels = [] + for gt in gts: + gt_bboxes.append(gt['bbox']) + gt_labels.append(gt['bbox_label']) + + gt_bboxes = np.array(gt_bboxes) + gt_labels = np.array(gt_labels) + + unique_label = np.unique(result['labels'].numpy()) + + for det_label in unique_label: + mask = (result['labels'] == det_label) + det_bboxes = result['bboxes'][mask].numpy() + det_scores = result['scores'][mask].numpy() + + if nms_iou_thr: + det_bboxes, _ = nms( + det_bboxes, det_scores, nms_iou_thr, score_threshold=score_thr) + ious = bbox_overlaps(det_bboxes[:, :4], gt_bboxes) + for i, score in enumerate(det_scores): + det_match = 0 + if score >= score_thr: + for j, gt_label in enumerate(gt_labels): + if ious[i, j] >= tp_iou_thr: + det_match += 1 + if gt_label == det_label: + true_positives[j] += 1 # TP + confusion_matrix[gt_label, det_label] += 1 + if det_match == 0: # BG FP + confusion_matrix[-1, det_label] += 1 + for num_tp, gt_label in zip(true_positives, gt_labels): + if num_tp == 0: # FN + confusion_matrix[gt_label, -1] += 1 + + +def plot_confusion_matrix(confusion_matrix, + labels, + save_dir=None, + show=True, + title='Normalized Confusion Matrix', + color_theme='plasma'): + """Draw confusion matrix with matplotlib. + + Args: + confusion_matrix (ndarray): The confusion matrix. + labels (list[str]): List of class names. + save_dir (str|optional): If set, save the confusion matrix plot to the + given path. Default: None. + show (bool): Whether to show the plot. Default: True. + title (str): Title of the plot. Default: `Normalized Confusion Matrix`. + color_theme (str): Theme of the matrix color map. Default: `plasma`. + """ + # normalize the confusion matrix + per_label_sums = confusion_matrix.sum(axis=1)[:, np.newaxis] + confusion_matrix = \ + confusion_matrix.astype(np.float32) / per_label_sums * 100 + + num_classes = len(labels) + fig, ax = plt.subplots( + figsize=(0.5 * num_classes, 0.5 * num_classes * 0.8), dpi=180) + cmap = plt.get_cmap(color_theme) + im = ax.imshow(confusion_matrix, cmap=cmap) + plt.colorbar(mappable=im, ax=ax) + + title_font = {'weight': 'bold', 'size': 12} + ax.set_title(title, fontdict=title_font) + label_font = {'size': 10} + plt.ylabel('Ground Truth Label', fontdict=label_font) + plt.xlabel('Prediction Label', fontdict=label_font) + + # draw locator + xmajor_locator = MultipleLocator(1) + xminor_locator = MultipleLocator(0.5) + ax.xaxis.set_major_locator(xmajor_locator) + ax.xaxis.set_minor_locator(xminor_locator) + ymajor_locator = MultipleLocator(1) + yminor_locator = MultipleLocator(0.5) + ax.yaxis.set_major_locator(ymajor_locator) + ax.yaxis.set_minor_locator(yminor_locator) + + # draw grid + ax.grid(True, which='minor', linestyle='-') + + # draw label + ax.set_xticks(np.arange(num_classes)) + ax.set_yticks(np.arange(num_classes)) + ax.set_xticklabels(labels) + ax.set_yticklabels(labels) + + ax.tick_params( + axis='x', bottom=False, top=True, labelbottom=False, labeltop=True) + plt.setp( + ax.get_xticklabels(), rotation=45, ha='left', rotation_mode='anchor') + + # draw confution matrix value + for i in range(num_classes): + for j in range(num_classes): + ax.text( + j, + i, + '{}%'.format( + int(confusion_matrix[ + i, + j]) if not np.isnan(confusion_matrix[i, j]) else -1), + ha='center', + va='center', + color='w', + size=7) + + ax.set_ylim(len(confusion_matrix) - 0.5, -0.5) # matplotlib>3.1.1 + + fig.tight_layout() + if save_dir is not None: + plt.savefig( + os.path.join(save_dir, 'confusion_matrix.png'), 
format='png') + if show: + plt.show() + + +def main(): + args = parse_args() + + cfg = Config.fromfile(args.config) + + # replace the ${key} with the value of cfg.key + cfg = replace_cfg_vals(cfg) + + # update data root according to MMYOLO_DATASETS + update_data_root(cfg) + + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + + init_default_scope(cfg.get('default_scope', 'mmyolo')) + + results = load(args.prediction_path) + + if not os.path.exists(args.save_dir): + os.makedirs(args.save_dir) + + dataset = DATASETS.build(cfg.test_dataloader.dataset) + + confusion_matrix = calculate_confusion_matrix(dataset, results, + args.score_thr, + args.nms_iou_thr, + args.tp_iou_thr) + plot_confusion_matrix( + confusion_matrix, + dataset.metainfo['classes'] + ('background', ), + save_dir=args.save_dir, + show=args.show, + color_theme=args.color_theme) + + +if __name__ == '__main__': + main() diff --git a/third_party/mmyolo/tools/analysis_tools/dataset_analysis.py b/third_party/mmyolo/tools/analysis_tools/dataset_analysis.py new file mode 100644 index 0000000000000000000000000000000000000000..b2164e16b9809957b317b3c9406918292300707a --- /dev/null +++ b/third_party/mmyolo/tools/analysis_tools/dataset_analysis.py @@ -0,0 +1,498 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import os.path +from statistics import median + +import matplotlib.patches as mpatches +import matplotlib.pyplot as plt +import numpy as np +from mmengine.config import Config +from mmengine.registry import init_default_scope +from mmengine.utils import ProgressBar +from prettytable import PrettyTable + +from mmyolo.registry import DATASETS +from mmyolo.utils.misc import show_data_classes + + +def parse_args(): + parser = argparse.ArgumentParser( + description='Distribution of categories and bbox instances') + parser.add_argument('config', help='config file path') + parser.add_argument( + '--val-dataset', + default=False, + action='store_true', + help='The default train_dataset.' + 'To change it to val_dataset, enter "--val-dataset"') + parser.add_argument( + '--class-name', + default=None, + type=str, + help='Display specific class, e.g., "bicycle"') + parser.add_argument( + '--area-rule', + default=None, + type=int, + nargs='+', + help='Redefine area rules,but no more than three numbers.' 
+ ' e.g., 30 70 125') + parser.add_argument( + '--func', + default=None, + type=str, + choices=[ + 'show_bbox_num', 'show_bbox_wh', 'show_bbox_wh_ratio', + 'show_bbox_area' + ], + help='Dataset analysis function selection.') + parser.add_argument( + '--out-dir', + default='./dataset_analysis', + type=str, + help='Output directory of dataset analysis visualization results,' + ' Save in "./dataset_analysis/" by default') + args = parser.parse_args() + return args + + +def show_bbox_num(cfg, out_dir, fig_set, class_name, class_num): + """Display the distribution map of categories and number of bbox + instances.""" + print('\n\nDrawing bbox_num figure:') + # Draw designs + fig = plt.figure( + figsize=(fig_set['figsize'][0], fig_set['figsize'][1]), dpi=300) + plt.bar(class_name, class_num, align='center') + + # Draw titles, labels and so on + for x, y in enumerate(class_num): + plt.text(x, y, '%s' % y, ha='center', fontsize=fig_set['fontsize'] + 3) + plt.xticks(rotation=fig_set['xticks_angle']) + plt.xlabel('Category Name') + plt.ylabel('Num of instances') + plt.title(cfg.dataset_type) + + # Save figure + if not os.path.exists(out_dir): + os.makedirs(out_dir) + out_name = fig_set['out_name'] + fig.savefig( + f'{out_dir}/{out_name}_bbox_num.jpg', + bbox_inches='tight', + pad_inches=0.1) # Save Image + plt.close() + print(f'End and save in {out_dir}/{out_name}_bbox_num.jpg') + + +def show_bbox_wh(out_dir, fig_set, class_bbox_w, class_bbox_h, class_name): + """Display the width and height distribution of categories and bbox + instances.""" + print('\n\nDrawing bbox_wh figure:') + # Draw designs + fig, ax = plt.subplots( + figsize=(fig_set['figsize'][0], fig_set['figsize'][1]), dpi=300) + + # Set the position of the map and label on the x-axis + positions_w = list(range(0, 12 * len(class_name), 12)) + positions_h = list(range(6, 12 * len(class_name), 12)) + positions_x_label = list(range(3, 12 * len(class_name) + 1, 12)) + ax.violinplot( + class_bbox_w, positions_w, showmeans=True, showmedians=True, widths=4) + ax.violinplot( + class_bbox_h, positions_h, showmeans=True, showmedians=True, widths=4) + + # Draw titles, labels and so on + plt.xticks(rotation=fig_set['xticks_angle']) + plt.ylabel('The width or height of bbox') + plt.xlabel('Class name') + plt.title('Width or height distribution of classes and bbox instances') + + # Draw the max, min and median of wide data in violin chart + for i in range(len(class_bbox_w)): + plt.text( + positions_w[i], + median(class_bbox_w[i]), + f'{"%.2f" % median(class_bbox_w[i])}', + ha='center', + fontsize=fig_set['fontsize']) + plt.text( + positions_w[i], + max(class_bbox_w[i]), + f'{"%.2f" % max(class_bbox_w[i])}', + ha='center', + fontsize=fig_set['fontsize']) + plt.text( + positions_w[i], + min(class_bbox_w[i]), + f'{"%.2f" % min(class_bbox_w[i])}', + ha='center', + fontsize=fig_set['fontsize']) + + # Draw the max, min and median of height data in violin chart + for i in range(len(positions_h)): + plt.text( + positions_h[i], + median(class_bbox_h[i]), + f'{"%.2f" % median(class_bbox_h[i])}', + ha='center', + fontsize=fig_set['fontsize']) + plt.text( + positions_h[i], + max(class_bbox_h[i]), + f'{"%.2f" % max(class_bbox_h[i])}', + ha='center', + fontsize=fig_set['fontsize']) + plt.text( + positions_h[i], + min(class_bbox_h[i]), + f'{"%.2f" % min(class_bbox_h[i])}', + ha='center', + fontsize=fig_set['fontsize']) + + # Draw Legend + plt.setp(ax, xticks=positions_x_label, xticklabels=class_name) + labels = ['bbox_w', 'bbox_h'] + colors = ['steelblue', 
'darkorange'] + patches = [ + mpatches.Patch(color=colors[i], label=f'{labels[i]:s}') + for i in range(len(colors)) + ] + ax = plt.gca() + box = ax.get_position() + ax.set_position([box.x0, box.y0, box.width, box.height * 0.8]) + ax.legend(loc='upper center', handles=patches, ncol=2) + + # Save figure + if not os.path.exists(out_dir): + os.makedirs(out_dir) + out_name = fig_set['out_name'] + fig.savefig( + f'{out_dir}/{out_name}_bbox_wh.jpg', + bbox_inches='tight', + pad_inches=0.1) # Save Image + plt.close() + print(f'End and save in {out_dir}/{out_name}_bbox_wh.jpg') + + +def show_bbox_wh_ratio(out_dir, fig_set, class_name, class_bbox_ratio): + """Display the distribution map of category and bbox instance width and + height ratio.""" + print('\n\nDrawing bbox_wh_ratio figure:') + # Draw designs + fig, ax = plt.subplots( + figsize=(fig_set['figsize'][0], fig_set['figsize'][1]), dpi=300) + + # Set the position of the map and label on the x-axis + positions = list(range(0, 6 * len(class_name), 6)) + ax.violinplot( + class_bbox_ratio, + positions, + showmeans=True, + showmedians=True, + widths=5) + + # Draw titles, labels and so on + plt.xticks(rotation=fig_set['xticks_angle']) + plt.ylabel('Ratio of width to height of bbox') + plt.xlabel('Class name') + plt.title('Width to height ratio distribution of class and bbox instances') + + # Draw the max, min and median of wide data in violin chart + for i in range(len(class_bbox_ratio)): + plt.text( + positions[i], + median(class_bbox_ratio[i]), + f'{"%.2f" % median(class_bbox_ratio[i])}', + ha='center', + fontsize=fig_set['fontsize']) + plt.text( + positions[i], + max(class_bbox_ratio[i]), + f'{"%.2f" % max(class_bbox_ratio[i])}', + ha='center', + fontsize=fig_set['fontsize']) + plt.text( + positions[i], + min(class_bbox_ratio[i]), + f'{"%.2f" % min(class_bbox_ratio[i])}', + ha='center', + fontsize=fig_set['fontsize']) + + # Set the position of the map and label on the x-axis + plt.setp(ax, xticks=positions, xticklabels=class_name) + + # Save figure + if not os.path.exists(out_dir): + os.makedirs(out_dir) + out_name = fig_set['out_name'] + fig.savefig( + f'{out_dir}/{out_name}_bbox_ratio.jpg', + bbox_inches='tight', + pad_inches=0.1) # Save Image + plt.close() + print(f'End and save in {out_dir}/{out_name}_bbox_ratio.jpg') + + +def show_bbox_area(out_dir, fig_set, area_rule, class_name, bbox_area_num): + """Display the distribution map of category and bbox instance area based on + the rules of large, medium and small objects.""" + print('\n\nDrawing bbox_area figure:') + # Set the direct distance of each label and the width of each histogram + # Set the required labels and colors + positions = np.arange(0, 2 * len(class_name), 2) + width = 0.4 + labels = ['Small', 'Mediun', 'Large', 'Huge'] + colors = ['#438675', '#F7B469', '#6BA6DA', '#913221'] + + # Draw designs + fig = plt.figure( + figsize=(fig_set['figsize'][0], fig_set['figsize'][1]), dpi=300) + for i in range(len(area_rule) - 1): + area_num = [bbox_area_num[idx][i] for idx in range(len(class_name))] + plt.bar( + positions + width * i, + area_num, + width, + label=labels[i], + color=colors[i]) + for idx, (x, y) in enumerate(zip(positions.tolist(), area_num)): + plt.text( + x + width * i, + y, + y, + ha='center', + fontsize=fig_set['fontsize'] - 1) + + # Draw titles, labels and so on + plt.xticks(rotation=fig_set['xticks_angle']) + plt.xticks(positions + width * ((len(area_rule) - 2) / 2), class_name) + plt.ylabel('Class Area') + plt.xlabel('Class Name') + plt.title( + 'Area and number of 
large, medium and small objects of each class') + + # Set and Draw Legend + patches = [ + mpatches.Patch(color=colors[i], label=f'{labels[i]:s}') + for i in range(len(area_rule) - 1) + ] + ax = plt.gca() + box = ax.get_position() + ax.set_position([box.x0, box.y0, box.width, box.height * 0.8]) + ax.legend(loc='upper center', handles=patches, ncol=len(area_rule) - 1) + + # Save figure + if not os.path.exists(out_dir): + os.makedirs(out_dir) + out_name = fig_set['out_name'] + fig.savefig( + f'{out_dir}/{out_name}_bbox_area.jpg', + bbox_inches='tight', + pad_inches=0.1) # Save Image + plt.close() + print(f'End and save in {out_dir}/{out_name}_bbox_area.jpg') + + +def show_class_list(classes, class_num): + """Print the data of the class obtained by the current run.""" + print('\n\nThe information obtained is as follows:') + class_info = PrettyTable() + class_info.title = 'Information of dataset class' + # List Print Settings + # If the quantity is too large, 25 rows will be displayed in each column + if len(classes) < 25: + class_info.add_column('Class name', classes) + class_info.add_column('Bbox num', class_num) + elif len(classes) % 25 != 0 and len(classes) > 25: + col_num = int(len(classes) / 25) + 1 + class_nums = class_num.tolist() + class_name_list = list(classes) + for i in range(0, (col_num * 25) - len(classes)): + class_name_list.append('') + class_nums.append('') + for i in range(0, len(class_name_list), 25): + class_info.add_column('Class name', class_name_list[i:i + 25]) + class_info.add_column('Bbox num', class_nums[i:i + 25]) + + # Align display data to the left + class_info.align['Class name'] = 'l' + class_info.align['Bbox num'] = 'l' + print(class_info) + + +def show_data_list(args, area_rule): + """Print run setup information.""" + print('\n\nPrint current running information:') + data_info = PrettyTable() + data_info.title = 'Dataset information' + # Print the corresponding information according to the settings + if args.val_dataset is False: + data_info.add_column('Dataset type', ['train_dataset']) + elif args.val_dataset is True: + data_info.add_column('Dataset type', ['val_dataset']) + if args.class_name is None: + data_info.add_column('Class name', ['All classes']) + else: + data_info.add_column('Class name', [args.class_name]) + if args.func is None: + data_info.add_column('Function', ['All function']) + else: + data_info.add_column('Function', [args.func]) + data_info.add_column('Area rule', [area_rule]) + + print(data_info) + + +def main(): + args = parse_args() + cfg = Config.fromfile(args.config) + + init_default_scope(cfg.get('default_scope', 'mmyolo')) + + def replace_pipeline_to_none(cfg): + """Recursively iterate over all dataset(or datasets) and set their + pipelines to none.Datasets are mean ConcatDataset. 
+ + Recursively terminates only when all dataset(or datasets) have been + traversed + """ + + if cfg.get('dataset', None) is None and cfg.get('datasets', + None) is None: + return + dataset = cfg.dataset if cfg.get('dataset', None) else cfg.datasets + if isinstance(dataset, list): + for item in dataset: + item.pipeline = None + elif dataset.get('pipeline', None): + dataset.pipeline = None + else: + replace_pipeline_to_none(dataset) + + # 1.Build Dataset + if args.val_dataset is False: + replace_pipeline_to_none(cfg.train_dataloader) + dataset = DATASETS.build(cfg.train_dataloader.dataset) + else: + replace_pipeline_to_none(cfg.val_dataloader) + dataset = DATASETS.build(cfg.val_dataloader.dataset) + + # 2.Prepare data + # Drawing settings + fig_all_set = { + 'figsize': [35, 18], + 'fontsize': int(10 - 0.08 * len(dataset.metainfo['classes'])), + 'xticks_angle': 70, + 'out_name': cfg.dataset_type + } + fig_one_set = { + 'figsize': [15, 10], + 'fontsize': 10, + 'xticks_angle': 0, + 'out_name': args.class_name + } + + # Call the category name and save address + if args.class_name is None: + classes = dataset.metainfo['classes'] + classes_idx = [i for i in range(len(classes))] + fig_set = fig_all_set + elif args.class_name in dataset.metainfo['classes']: + classes = [args.class_name] + classes_idx = [dataset.metainfo['classes'].index(args.class_name)] + fig_set = fig_one_set + else: + data_classes = dataset.metainfo['classes'] + show_data_classes(data_classes) + raise RuntimeError(f'Expected args.class_name to be one of the list,' + f'but got "{args.class_name}"') + + # Building Area Rules + if args.area_rule is None: + area_rule = [0, 32, 96, 1e5] + elif args.area_rule and len(args.area_rule) <= 3: + area_rules = [0] + args.area_rule + [1e5] + area_rule = sorted(area_rules) + else: + raise RuntimeError( + f'Expected the "{args.area_rule}" to be e.g. 
30 60 120, ' + 'and no more than three numbers.') + + # Build arrays or lists to store data for each category + class_num = np.zeros((len(classes), ), dtype=np.int64) + class_bbox = [[] for _ in classes] + class_name = [] + class_bbox_w = [] + class_bbox_h = [] + class_bbox_ratio = [] + bbox_area_num = [] + + show_data_list(args, area_rule) + # Get the quantity and bbox data corresponding to each category + print('\nRead the information of each picture in the dataset:') + progress_bar = ProgressBar(len(dataset)) + for index in range(len(dataset)): + for instance in dataset[index]['instances']: + if instance[ + 'bbox_label'] in classes_idx and args.class_name is None: + class_num[instance['bbox_label']] += 1 + class_bbox[instance['bbox_label']].append(instance['bbox']) + elif instance['bbox_label'] in classes_idx and args.class_name: + class_num[0] += 1 + class_bbox[0].append(instance['bbox']) + progress_bar.update() + show_class_list(classes, class_num) + # Get the width, height and area of bbox corresponding to each category + print('\nRead bbox information in each class:') + progress_bar_classes = ProgressBar(len(classes)) + for idx, (classes, classes_idx) in enumerate(zip(classes, classes_idx)): + bbox = np.array(class_bbox[idx]) + bbox_area_nums = np.zeros((len(area_rule) - 1, ), dtype=np.int64) + if len(bbox) > 0: + bbox_wh = bbox[:, 2:4] - bbox[:, 0:2] + bbox_ratio = bbox_wh[:, 0] / bbox_wh[:, 1] + bbox_area = bbox_wh[:, 0] * bbox_wh[:, 1] + class_bbox_w.append(bbox_wh[:, 0].tolist()) + class_bbox_h.append(bbox_wh[:, 1].tolist()) + class_bbox_ratio.append(bbox_ratio.tolist()) + + # The area rule, there is an section between two numbers + for i in range(len(area_rule) - 1): + bbox_area_nums[i] = np.logical_and( + bbox_area >= area_rule[i]**2, + bbox_area < area_rule[i + 1]**2).sum() + elif len(bbox) == 0: + class_bbox_w.append([0]) + class_bbox_h.append([0]) + class_bbox_ratio.append([0]) + + class_name.append(classes) + bbox_area_num.append(bbox_area_nums.tolist()) + progress_bar_classes.update() + + # 3.draw Dataset Information + if args.func is None: + show_bbox_num(cfg, args.out_dir, fig_set, class_name, class_num) + show_bbox_wh(args.out_dir, fig_set, class_bbox_w, class_bbox_h, + class_name) + show_bbox_wh_ratio(args.out_dir, fig_set, class_name, class_bbox_ratio) + show_bbox_area(args.out_dir, fig_set, area_rule, class_name, + bbox_area_num) + elif args.func == 'show_bbox_num': + show_bbox_num(cfg, args.out_dir, fig_set, class_name, class_num) + elif args.func == 'show_bbox_wh': + show_bbox_wh(args.out_dir, fig_set, class_bbox_w, class_bbox_h, + class_name) + elif args.func == 'show_bbox_wh_ratio': + show_bbox_wh_ratio(args.out_dir, fig_set, class_name, class_bbox_ratio) + elif args.func == 'show_bbox_area': + show_bbox_area(args.out_dir, fig_set, area_rule, class_name, + bbox_area_num) + else: + raise RuntimeError( + 'Please enter the correct func name, e.g., show_bbox_num') + + +if __name__ == '__main__': + main() diff --git a/third_party/mmyolo/tools/analysis_tools/get_flops.py b/third_party/mmyolo/tools/analysis_tools/get_flops.py new file mode 100644 index 0000000000000000000000000000000000000000..965660f7194de231770537d7f80e38f41876df56 --- /dev/null +++ b/third_party/mmyolo/tools/analysis_tools/get_flops.py @@ -0,0 +1,123 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
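+# Reports the FLOPs and parameter count of a detector via
+# mmengine.analysis.get_model_complexity_info. A usage sketch with a
+# placeholder config path:
+#
+#     python tools/analysis_tools/get_flops.py ${CONFIG} --shape 640 640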
+import argparse
+import tempfile
+from pathlib import Path
+
+import torch
+from mmdet.registry import MODELS
+from mmengine.analysis import get_model_complexity_info
+from mmengine.config import Config, DictAction
+from mmengine.logging import MMLogger
+from mmengine.model import revert_sync_batchnorm
+from mmengine.registry import init_default_scope
+
+from mmyolo.utils import switch_to_deploy
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Get a detector flops')
+    parser.add_argument('config', help='train config file path')
+    parser.add_argument(
+        '--shape',
+        type=int,
+        nargs='+',
+        default=[640, 640],
+        help='input image size')
+    parser.add_argument(
+        '--show-arch',
+        action='store_true',
+        help='whether to return the statistics in the form of network layers')
+    parser.add_argument(
+        '--not-show-table',
+        action='store_true',
+        help='whether to return the statistics in the form of a table')
+    parser.add_argument(
+        '--cfg-options',
+        nargs='+',
+        action=DictAction,
+        help='override some settings in the used config, the key-value pair '
+        'in xxx=yyy format will be merged into config file. If the value to '
+        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
+        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
+        'Note that the quotation marks are necessary and that no white space '
+        'is allowed.')
+    return parser.parse_args()
+
+
+def inference(args, logger):
+    config_name = Path(args.config)
+    if not config_name.exists():
+        logger.error(f'{config_name} not found.')
+
+    cfg = Config.fromfile(args.config)
+    cfg.work_dir = tempfile.TemporaryDirectory().name
+    cfg.log_level = 'WARN'
+    if args.cfg_options is not None:
+        cfg.merge_from_dict(args.cfg_options)
+
+    init_default_scope(cfg.get('default_scope', 'mmyolo'))
+
+    if len(args.shape) == 1:
+        h = w = args.shape[0]
+    elif len(args.shape) == 2:
+        h, w = args.shape
+    else:
+        raise ValueError('invalid input shape')
+
+    # model
+    model = MODELS.build(cfg.model)
+    if torch.cuda.is_available():
+        model.cuda()
+    model = revert_sync_batchnorm(model)
+    model.eval()
+    switch_to_deploy(model)
+
+    # input tensor
+    # automatically generate an input tensor with the given input_shape.
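+    # A single random image is enough here: the complexity numbers depend only
+    # on the input shape, which the data_preprocessor may further pad to a
+    # stride-divisible size (reported below as pad_shape).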
+ data_batch = {'inputs': [torch.rand(3, h, w)], 'batch_samples': [None]} + data = model.data_preprocessor(data_batch) + result = {'ori_shape': (h, w), 'pad_shape': data['inputs'].shape[-2:]} + outputs = get_model_complexity_info( + model, + input_shape=None, + inputs=data['inputs'], # the input tensor of the model + show_table=not args.not_show_table, # show the complexity table + show_arch=args.show_arch) # show the complexity arch + + result['flops'] = outputs['flops_str'] + result['params'] = outputs['params_str'] + result['out_table'] = outputs['out_table'] + result['out_arch'] = outputs['out_arch'] + + return result + + +def main(): + args = parse_args() + logger = MMLogger.get_instance(name='MMLogger') + result = inference(args, logger) + + split_line = '=' * 30 + + ori_shape = result['ori_shape'] + pad_shape = result['pad_shape'] + flops = result['flops'] + params = result['params'] + + print(result['out_table']) # print related information by table + print(result['out_arch']) # print related information by network layers + + if pad_shape != ori_shape: + print(f'{split_line}\nUse size divisor set input shape ' + f'from {ori_shape} to {pad_shape}') + + print(f'{split_line}\n' + f'Input shape: {pad_shape}\nModel Flops: {flops}\n' + f'Model Parameters: {params}\n{split_line}') + print('!!!Please be cautious if you use the results in papers. ' + 'You may need to check if all ops are supported and verify ' + 'that the flops computation is correct.') + + +if __name__ == '__main__': + main() diff --git a/third_party/mmyolo/tools/analysis_tools/optimize_anchors.py b/third_party/mmyolo/tools/analysis_tools/optimize_anchors.py new file mode 100644 index 0000000000000000000000000000000000000000..34d4d067a6470a610b53868f18203827676892a2 --- /dev/null +++ b/third_party/mmyolo/tools/analysis_tools/optimize_anchors.py @@ -0,0 +1,647 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""Optimize anchor settings on a specific dataset. + +This script provides three methods to optimize YOLO anchors including k-means +anchor cluster, differential evolution and v5-k-means. You can use +``--algorithm k-means``, ``--algorithm differential_evolution`` and +``--algorithm v5-k-means`` to switch those methods. 
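As a self-contained illustration of what the FLOPs tool above measures, the sketch below applies the same `mmengine.analysis.get_model_complexity_info` call to a toy `nn.Sequential` model. The model and the input size are placeholders chosen for brevity and are not part of the tool itself.

```python
# Minimal sketch (illustrative): the complexity call used by get_flops.py,
# applied to a plain nn.Module so the output keys are easy to inspect.
import torch
import torch.nn as nn
from mmengine.analysis import get_model_complexity_info

model = nn.Sequential(
    nn.Conv2d(3, 16, 3, padding=1), nn.ReLU(), nn.Conv2d(16, 8, 3, padding=1))
model.eval()

outputs = get_model_complexity_info(
    model,
    input_shape=None,
    inputs=torch.rand(1, 3, 64, 64),  # plays the role of data['inputs'] above
    show_table=True,
    show_arch=False)

print(outputs['flops_str'], outputs['params_str'])  # human-readable FLOPs / params
```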
+ +Example: + Use k-means anchor cluster:: + + python tools/analysis_tools/optimize_anchors.py ${CONFIG} \ + --algorithm k-means --input-shape ${INPUT_SHAPE [WIDTH HEIGHT]} \ + --out-dir ${OUT_DIR} + + Use differential evolution to optimize anchors:: + + python tools/analysis_tools/optimize_anchors.py ${CONFIG} \ + --algorithm differential_evolution \ + --input-shape ${INPUT_SHAPE [WIDTH HEIGHT]} \ + --out-dir ${OUT_DIR} + + Use v5-k-means to optimize anchors:: + + python tools/analysis_tools/optimize_anchors.py ${CONFIG} \ + --algorithm v5-k-means \ + --input-shape ${INPUT_SHAPE [WIDTH HEIGHT]} \ + --prior_match_thr ${PRIOR_MATCH_THR} \ + --out-dir ${OUT_DIR} +""" +import argparse +import os.path as osp +import random +from typing import Tuple + +import numpy as np +import torch +from mmdet.structures.bbox import (bbox_cxcywh_to_xyxy, bbox_overlaps, + bbox_xyxy_to_cxcywh) +from mmdet.utils import replace_cfg_vals, update_data_root +from mmengine.config import Config +from mmengine.fileio import dump +from mmengine.logging import MMLogger +from mmengine.registry import init_default_scope +from mmengine.utils import ProgressBar +from scipy.optimize import differential_evolution +from torch import Tensor + +from mmyolo.registry import DATASETS + +try: + from scipy.cluster.vq import kmeans +except ImportError: + kmeans = None + + +def parse_args(): + parser = argparse.ArgumentParser(description='Optimize anchor parameters.') + parser.add_argument('config', help='Train config file path.') + parser.add_argument( + '--input-shape', + type=int, + nargs='+', + default=[640, 640], + help='input image size, represent [width, height]') + parser.add_argument( + '--algorithm', + default='DE', + help='Algorithm used for anchor optimizing.' + 'Support k-means and differential_evolution for YOLO,' + 'and v5-k-means is special for YOLOV5.') + parser.add_argument( + '--iters', + default=1000, + type=int, + help='Maximum iterations for optimizer.') + parser.add_argument( + '--prior-match-thr', + default=4.0, + type=float, + help='anchor-label `gt_filter_sizes` ratio threshold ' + 'hyperparameter used for training, default=4.0, this ' + 'parameter is unique to v5-k-means') + parser.add_argument( + '--mutation-args', + type=float, + nargs='+', + default=[0.9, 0.1], + help='paramter of anchor optimize method genetic algorithm, ' + 'represent [prob, sigma], this parameter is unique to v5-k-means') + parser.add_argument( + '--augment-args', + type=float, + nargs='+', + default=[0.9, 1.1], + help='scale factor of box size augment when metric box and anchor, ' + 'represent [min, max], this parameter is unique to v5-k-means') + parser.add_argument( + '--device', default='cuda:0', help='Device used for calculating.') + parser.add_argument( + '--out-dir', + default=None, + type=str, + help='Path to save anchor optimize result.') + + args = parser.parse_args() + return args + + +class BaseAnchorOptimizer: + """Base class for anchor optimizer. + + Args: + dataset (obj:`Dataset`): Dataset object. + input_shape (list[int]): Input image shape of the model. + Format in [width, height]. + num_anchor_per_level (list[int]) : Number of anchors for each level. + logger (obj:`logging.Logger`): The logger for logging. + device (str, optional): Device used for calculating. + Default: 'cuda:0' + out_dir (str, optional): Path to save anchor optimize result. 
+ Default: None + """ + + def __init__(self, + dataset, + input_shape, + num_anchor_per_level, + logger, + device='cuda:0', + out_dir=None): + self.dataset = dataset + self.input_shape = input_shape + self.num_anchor_per_level = num_anchor_per_level + self.num_anchors = sum(num_anchor_per_level) + self.logger = logger + self.device = device + self.out_dir = out_dir + bbox_whs, img_shapes = self.get_whs_and_shapes() + ratios = img_shapes.max(1, keepdims=True) / np.array([input_shape]) + + # resize to input shape + self.bbox_whs = bbox_whs / ratios + + def get_whs_and_shapes(self): + """Get widths and heights of bboxes and shapes of images. + + Returns: + tuple[np.ndarray]: Array of bbox shapes and array of image + shapes with shape (num_bboxes, 2) in [width, height] format. + """ + self.logger.info('Collecting bboxes from annotation...') + bbox_whs = [] + img_shapes = [] + prog_bar = ProgressBar(len(self.dataset)) + for idx in range(len(self.dataset)): + data_info = self.dataset.get_data_info(idx) + img_shape = np.array([data_info['width'], data_info['height']]) + gt_instances = data_info['instances'] + for instance in gt_instances: + bbox = np.array(instance['bbox']) + gt_filter_sizes = bbox[2:4] - bbox[0:2] + img_shapes.append(img_shape) + bbox_whs.append(gt_filter_sizes) + + prog_bar.update() + print('\n') + bbox_whs = np.array(bbox_whs) + img_shapes = np.array(img_shapes) + self.logger.info(f'Collected {bbox_whs.shape[0]} bboxes.') + return bbox_whs, img_shapes + + def get_zero_center_bbox_tensor(self): + """Get a tensor of bboxes centered at (0, 0). + + Returns: + Tensor: Tensor of bboxes with shape (num_bboxes, 4) + in [xmin, ymin, xmax, ymax] format. + """ + whs = torch.from_numpy(self.bbox_whs).to( + self.device, dtype=torch.float32) + bboxes = bbox_cxcywh_to_xyxy( + torch.cat([torch.zeros_like(whs), whs], dim=1)) + return bboxes + + def optimize(self): + raise NotImplementedError + + def save_result(self, anchors, path=None): + + anchor_results = [] + start = 0 + for num in self.num_anchor_per_level: + end = num + start + anchor_results.append([(round(w), round(h)) + for w, h in anchors[start:end]]) + start = end + + self.logger.info(f'Anchor optimize result:{anchor_results}') + if path: + json_path = osp.join(path, 'anchor_optimize_result.json') + dump(anchor_results, json_path) + self.logger.info(f'Result saved in {json_path}') + + +class YOLOKMeansAnchorOptimizer(BaseAnchorOptimizer): + r"""YOLO anchor optimizer using k-means. Code refer to `AlexeyAB/darknet. + `_. + + Args: + iters (int): Maximum iterations for k-means. 
+ """ + + def __init__(self, iters, **kwargs): + + super().__init__(**kwargs) + self.iters = iters + + def optimize(self): + anchors = self.kmeans_anchors() + self.save_result(anchors, self.out_dir) + + def kmeans_anchors(self): + self.logger.info( + f'Start cluster {self.num_anchors} YOLO anchors with K-means...') + bboxes = self.get_zero_center_bbox_tensor() + cluster_center_idx = torch.randint( + 0, bboxes.shape[0], (self.num_anchors, )).to(self.device) + + assignments = torch.zeros((bboxes.shape[0], )).to(self.device) + cluster_centers = bboxes[cluster_center_idx] + if self.num_anchors == 1: + cluster_centers = self.kmeans_maximization(bboxes, assignments, + cluster_centers) + anchors = bbox_xyxy_to_cxcywh(cluster_centers)[:, 2:].cpu().numpy() + anchors = sorted(anchors, key=lambda x: x[0] * x[1]) + return anchors + + prog_bar = ProgressBar(self.iters) + for i in range(self.iters): + converged, assignments = self.kmeans_expectation( + bboxes, assignments, cluster_centers) + if converged: + self.logger.info(f'K-means process has converged at iter {i}.') + break + cluster_centers = self.kmeans_maximization(bboxes, assignments, + cluster_centers) + prog_bar.update() + print('\n') + avg_iou = bbox_overlaps(bboxes, + cluster_centers).max(1)[0].mean().item() + + anchors = bbox_xyxy_to_cxcywh(cluster_centers)[:, 2:].cpu().numpy() + anchors = sorted(anchors, key=lambda x: x[0] * x[1]) + self.logger.info(f'Anchor cluster finish. Average IOU: {avg_iou}') + + return anchors + + def kmeans_maximization(self, bboxes, assignments, centers): + """Maximization part of EM algorithm(Expectation-Maximization)""" + new_centers = torch.zeros_like(centers) + for i in range(centers.shape[0]): + mask = (assignments == i) + if mask.sum(): + new_centers[i, :] = bboxes[mask].mean(0) + return new_centers + + def kmeans_expectation(self, bboxes, assignments, centers): + """Expectation part of EM algorithm(Expectation-Maximization)""" + ious = bbox_overlaps(bboxes, centers) + closest = ious.argmax(1) + converged = (closest == assignments).all() + return converged, closest + + +class YOLOV5KMeansAnchorOptimizer(BaseAnchorOptimizer): + r"""YOLOv5 anchor optimizer using shape k-means. + Code refer to `ultralytics/yolov5. + `_. + + Args: + iters (int): Maximum iterations for k-means. + prior_match_thr (float): anchor-label width height + ratio threshold hyperparameter. + """ + + def __init__(self, + iters, + prior_match_thr=4.0, + mutation_args=[0.9, 0.1], + augment_args=[0.9, 1.1], + **kwargs): + + super().__init__(**kwargs) + self.iters = iters + self.prior_match_thr = prior_match_thr + [self.mutation_prob, self.mutation_sigma] = mutation_args + [self.augment_min, self.augment_max] = augment_args + + def optimize(self): + self.logger.info( + f'Start cluster {self.num_anchors} YOLOv5 anchors with K-means...') + + bbox_whs = torch.from_numpy(self.bbox_whs).to( + self.device, dtype=torch.float32) + anchors = self.anchor_generate( + bbox_whs, + num=self.num_anchors, + img_size=self.input_shape[0], + prior_match_thr=self.prior_match_thr, + iters=self.iters) + best_ratio, mean_matched = self.anchor_metric(bbox_whs, anchors) + self.logger.info(f'{mean_matched:.2f} anchors/target {best_ratio:.3f} ' + 'Best Possible Recall (BPR). ') + self.save_result(anchors.tolist(), self.out_dir) + + def anchor_generate(self, + box_size: Tensor, + num: int = 9, + img_size: int = 640, + prior_match_thr: float = 4.0, + iters: int = 1000) -> Tensor: + """cluster boxes metric with anchors. 
+ + Args: + box_size (Tensor): The size of the bxes, which shape is + (box_num, 2),the number 2 means width and height. + num (int): number of anchors. + img_size (int): image size used for training + prior_match_thr (float): width/height ratio threshold + used for training + iters (int): iterations to evolve anchors using genetic algorithm + + Returns: + anchors (Tensor): kmeans evolved anchors + """ + + thr = 1 / prior_match_thr + + # step1: filter small bbox + box_size = self._filter_box(box_size) + assert num <= len(box_size) + + # step2: init anchors + if kmeans: + try: + self.logger.info( + 'beginning init anchors with scipy kmeans method') + # sigmas for whitening + sigmas = box_size.std(0).cpu().numpy() + anchors = kmeans( + box_size.cpu().numpy() / sigmas, num, iter=30)[0] * sigmas + # kmeans may return fewer points than requested + # if width/height is insufficient or too similar + assert num == len(anchors) + except Exception: + self.logger.warning( + 'scipy kmeans method cannot get enough points ' + 'because of width/height is insufficient or too similar, ' + 'now switching strategies from kmeans to random init.') + anchors = np.sort(np.random.rand(num * 2)).reshape( + num, 2) * img_size + else: + self.logger.info( + 'cannot found scipy package, switching strategies from kmeans ' + 'to random init, you can install scipy package to ' + 'get better anchor init') + anchors = np.sort(np.random.rand(num * 2)).reshape(num, + 2) * img_size + + self.logger.info('init done, beginning evolve anchors...') + # sort small to large + anchors = torch.tensor(anchors[np.argsort(anchors.prod(1))]).to( + box_size.device, dtype=torch.float32) + + # step3: evolve anchors use Genetic Algorithm + prog_bar = ProgressBar(iters) + fitness = self._anchor_fitness(box_size, anchors, thr) + cluster_shape = anchors.shape + + for _ in range(iters): + mutate_result = np.ones(cluster_shape) + # mutate until a change occurs (prevent duplicates) + while (mutate_result == 1).all(): + # mutate_result is scale factor of anchors, between 0.3 and 3 + mutate_result = ( + (np.random.random(cluster_shape) < self.mutation_prob) * + random.random() * np.random.randn(*cluster_shape) * + self.mutation_sigma + 1).clip(0.3, 3.0) + mutate_result = torch.from_numpy(mutate_result).to(box_size.device) + new_anchors = (anchors.clone() * mutate_result).clip(min=2.0) + new_fitness = self._anchor_fitness(box_size, new_anchors, thr) + if new_fitness > fitness: + fitness = new_fitness + anchors = new_anchors.clone() + + prog_bar.update() + print('\n') + # sort small to large + anchors = anchors[torch.argsort(anchors.prod(1))] + self.logger.info(f'Anchor cluster finish. fitness = {fitness:.4f}') + + return anchors + + def anchor_metric(self, + box_size: Tensor, + anchors: Tensor, + threshold: float = 4.0) -> Tuple: + """compute boxes metric with anchors. + + Args: + box_size (Tensor): The size of the bxes, which shape + is (box_num, 2), the number 2 means width and height. + anchors (Tensor): The size of the bxes, which shape + is (anchor_num, 2), the number 2 means width and height. 
+ threshold (float): the compare threshold of ratio + + Returns: + Tuple: a tuple of metric result, best_ratio_mean and mean_matched + """ + # step1: augment scale + # According to the uniform distribution,the scaling scale between + # augment_min and augment_max is randomly generated + scale = np.random.uniform( + self.augment_min, self.augment_max, size=(box_size.shape[0], 1)) + box_size = torch.tensor( + np.array( + [l[:, ] * s for s, l in zip(scale, + box_size.cpu().numpy())])).to( + box_size.device, + dtype=torch.float32) + # step2: calculate ratio + min_ratio, best_ratio = self._metric(box_size, anchors) + mean_matched = (min_ratio > 1 / threshold).float().sum(1).mean() + best_ratio_mean = (best_ratio > 1 / threshold).float().mean() + return best_ratio_mean, mean_matched + + def _filter_box(self, box_size: Tensor) -> Tensor: + small_cnt = (box_size < 3.0).any(1).sum() + if small_cnt: + self.logger.warning( + f'Extremely small objects found: {small_cnt} ' + f'of {len(box_size)} labels are <3 pixels in size') + # filter > 2 pixels + filter_sizes = box_size[(box_size >= 2.0).any(1)] + return filter_sizes + + def _anchor_fitness(self, box_size: Tensor, anchors: Tensor, thr: float): + """mutation fitness.""" + _, best = self._metric(box_size, anchors) + return (best * (best > thr).float()).mean() + + def _metric(self, box_size: Tensor, anchors: Tensor) -> Tuple: + """compute boxes metric with anchors. + + Args: + box_size (Tensor): The size of the bxes, which shape is + (box_num, 2), the number 2 means width and height. + anchors (Tensor): The size of the bxes, which shape is + (anchor_num, 2), the number 2 means width and height. + + Returns: + Tuple: a tuple of metric result, min_ratio and best_ratio + """ + + # ratio means the (width_1/width_2 and height_1/height_2) ratio of each + # box and anchor, the ratio shape is torch.Size([box_num,anchor_num,2]) + ratio = box_size[:, None] / anchors[None] + + # min_ratio records the min ratio of each box with all anchor, + # min_ratio.shape is torch.Size([box_num,anchor_num]) + # notice: + # smaller ratio means worse shape-match between boxes and anchors + min_ratio = torch.min(ratio, 1 / ratio).min(2)[0] + + # find the best shape-match ratio for each box + # box_best_ratio.shape is torch.Size([box_num]) + best_ratio = min_ratio.max(1)[0] + + return min_ratio, best_ratio + + +class YOLODEAnchorOptimizer(BaseAnchorOptimizer): + """YOLO anchor optimizer using differential evolution algorithm. + + Args: + iters (int): Maximum iterations for k-means. + strategy (str): The differential evolution strategy to use. + Should be one of: + + - 'best1bin' + - 'best1exp' + - 'rand1exp' + - 'randtobest1exp' + - 'currenttobest1exp' + - 'best2exp' + - 'rand2exp' + - 'randtobest1bin' + - 'currenttobest1bin' + - 'best2bin' + - 'rand2bin' + - 'rand1bin' + + Default: 'best1bin'. + population_size (int): Total population size of evolution algorithm. + Default: 15. + convergence_thr (float): Tolerance for convergence, the + optimizing stops when ``np.std(pop) <= abs(convergence_thr) + + convergence_thr * np.abs(np.mean(population_energies))``, + respectively. Default: 0.0001. + mutation (tuple[float]): Range of dithering randomly changes the + mutation constant. Default: (0.5, 1). + recombination (float): Recombination constant of crossover probability. + Default: 0.7. 
+ """ + + def __init__(self, + iters, + strategy='best1bin', + population_size=15, + convergence_thr=0.0001, + mutation=(0.5, 1), + recombination=0.7, + **kwargs): + + super().__init__(**kwargs) + + self.iters = iters + self.strategy = strategy + self.population_size = population_size + self.convergence_thr = convergence_thr + self.mutation = mutation + self.recombination = recombination + + def optimize(self): + anchors = self.differential_evolution() + self.save_result(anchors, self.out_dir) + + def differential_evolution(self): + bboxes = self.get_zero_center_bbox_tensor() + + bounds = [] + for i in range(self.num_anchors): + bounds.extend([(0, self.input_shape[0]), (0, self.input_shape[1])]) + + result = differential_evolution( + func=self.avg_iou_cost, + bounds=bounds, + args=(bboxes, ), + strategy=self.strategy, + maxiter=self.iters, + popsize=self.population_size, + tol=self.convergence_thr, + mutation=self.mutation, + recombination=self.recombination, + updating='immediate', + disp=True) + self.logger.info( + f'Anchor evolution finish. Average IOU: {1 - result.fun}') + anchors = [(w, h) for w, h in zip(result.x[::2], result.x[1::2])] + anchors = sorted(anchors, key=lambda x: x[0] * x[1]) + return anchors + + @staticmethod + def avg_iou_cost(anchor_params, bboxes): + assert len(anchor_params) % 2 == 0 + anchor_whs = torch.tensor( + [[w, h] + for w, h in zip(anchor_params[::2], anchor_params[1::2])]).to( + bboxes.device, dtype=bboxes.dtype) + anchor_boxes = bbox_cxcywh_to_xyxy( + torch.cat([torch.zeros_like(anchor_whs), anchor_whs], dim=1)) + ious = bbox_overlaps(bboxes, anchor_boxes) + max_ious, _ = ious.max(1) + cost = 1 - max_ious.mean().item() + return cost + + +def main(): + logger = MMLogger.get_current_instance() + args = parse_args() + cfg = args.config + cfg = Config.fromfile(cfg) + + # replace the ${key} with the value of cfg.key + cfg = replace_cfg_vals(cfg) + + # update data root according to MMDET_DATASETS + update_data_root(cfg) + + init_default_scope(cfg.get('default_scope', 'mmyolo')) + + input_shape = args.input_shape + assert len(input_shape) == 2 + + anchor_type = cfg.model.bbox_head.prior_generator.type + assert anchor_type == 'mmdet.YOLOAnchorGenerator', \ + f'Only support optimize YOLOAnchor, but get {anchor_type}.' 
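The shape-match metric behind the v5-k-means fitness above (`_metric`) can be reproduced in a few lines of PyTorch: for every box/anchor pair it keeps the worse of the width and height ratios, then keeps the best anchor per box and compares it against `1 / prior_match_thr`. The box, anchor, and threshold values below are made up for illustration.

```python
# Minimal sketch (illustrative): the shape-match metric of the v5-k-means optimizer.
import torch

box_wh = torch.tensor([[30., 60.], [100., 40.]])   # (box_num, 2) widths / heights
anchors = torch.tensor([[32., 64.], [90., 45.]])   # (anchor_num, 2)

ratio = box_wh[:, None] / anchors[None]            # (box_num, anchor_num, 2)
min_ratio = torch.min(ratio, 1 / ratio).min(2)[0]  # worst of w/h match per pair
best_ratio = min_ratio.max(1)[0]                   # best anchor per box

prior_match_thr = 4.0
matched = best_ratio > 1 / prior_match_thr         # boxes covered by some anchor
print(best_ratio, matched)                         # tensor([0.9375, 0.8889]) tensor([True, True])
```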
+ + base_sizes = cfg.model.bbox_head.prior_generator.base_sizes + num_anchor_per_level = [len(sizes) for sizes in base_sizes] + + train_data_cfg = cfg.train_dataloader + while 'dataset' in train_data_cfg: + train_data_cfg = train_data_cfg['dataset'] + dataset = DATASETS.build(train_data_cfg) + + if args.algorithm == 'k-means': + optimizer = YOLOKMeansAnchorOptimizer( + dataset=dataset, + input_shape=input_shape, + device=args.device, + num_anchor_per_level=num_anchor_per_level, + iters=args.iters, + logger=logger, + out_dir=args.out_dir) + elif args.algorithm == 'DE': + optimizer = YOLODEAnchorOptimizer( + dataset=dataset, + input_shape=input_shape, + device=args.device, + num_anchor_per_level=num_anchor_per_level, + iters=args.iters, + logger=logger, + out_dir=args.out_dir) + elif args.algorithm == 'v5-k-means': + optimizer = YOLOV5KMeansAnchorOptimizer( + dataset=dataset, + input_shape=input_shape, + device=args.device, + num_anchor_per_level=num_anchor_per_level, + iters=args.iters, + prior_match_thr=args.prior_match_thr, + mutation_args=args.mutation_args, + augment_args=args.augment_args, + logger=logger, + out_dir=args.out_dir) + else: + raise NotImplementedError( + f'Only support k-means and differential_evolution, ' + f'but get {args.algorithm}') + + optimizer.optimize() + + +if __name__ == '__main__': + main() diff --git a/third_party/mmyolo/tools/analysis_tools/vis_scheduler.py b/third_party/mmyolo/tools/analysis_tools/vis_scheduler.py new file mode 100644 index 0000000000000000000000000000000000000000..8a2922d890d68e0be54925fc18c8afd43a4451f3 --- /dev/null +++ b/third_party/mmyolo/tools/analysis_tools/vis_scheduler.py @@ -0,0 +1,295 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""Hyper-parameter Scheduler Visualization. + +This tool aims to help the user to check +the hyper-parameter scheduler of the optimizer(without training), +which support the "learning rate", "momentum", and "weight_decay". + +Example: +```shell +python tools/analysis_tools/vis_scheduler.py \ + configs/rtmdet/rtmdet_s_syncbn_fast_8xb32-300e_coco.py \ + --dataset-size 118287 \ + --ngpus 8 \ + --out-dir ./output +``` +Modified from: https://github.com/open-mmlab/mmclassification/blob/1.x/tools/visualizations/vis_scheduler.py # noqa +""" +import argparse +import json +import os.path as osp +import re +from pathlib import Path +from unittest.mock import MagicMock + +import matplotlib.pyplot as plt +import rich +import torch.nn as nn +from mmengine.config import Config, DictAction +from mmengine.hooks import Hook +from mmengine.model import BaseModel +from mmengine.registry import init_default_scope +from mmengine.runner import Runner +from mmengine.utils.path import mkdir_or_exist +from mmengine.visualization import Visualizer +from rich.progress import BarColumn, MofNCompleteColumn, Progress, TextColumn + + +def parse_args(): + parser = argparse.ArgumentParser( + description='Visualize a hyper-parameter scheduler') + parser.add_argument('config', help='config file path') + parser.add_argument( + '-p', + '--parameter', + type=str, + default='lr', + choices=['lr', 'momentum', 'wd'], + help='The parameter to visualize its change curve, choose from' + '"lr", "wd" and "momentum". Defaults to "lr".') + parser.add_argument( + '-d', + '--dataset-size', + type=int, + help='The size of the dataset. 
If specify, `DATASETS.build` will ' + 'be skipped and use this size as the dataset size.') + parser.add_argument( + '-n', + '--ngpus', + type=int, + default=1, + help='The number of GPUs used in training.') + parser.add_argument( + '-o', '--out-dir', type=Path, help='Path to output file') + parser.add_argument( + '--log-level', + default='WARNING', + help='The log level of the handler and logger. Defaults to ' + 'WARNING.') + parser.add_argument('--title', type=str, help='title of figure') + parser.add_argument( + '--style', type=str, default='whitegrid', help='style of plt') + parser.add_argument('--not-show', default=False, action='store_true') + parser.add_argument( + '--window-size', + default='12*7', + help='Size of the window to display images, in format of "$W*$H".') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + args = parser.parse_args() + if args.window_size != '': + assert re.match(r'\d+\*\d+', args.window_size), \ + "'window-size' must be in format 'W*H'." + + return args + + +class SimpleModel(BaseModel): + """simple model that do nothing in train_step.""" + + def __init__(self): + super().__init__() + self.data_preprocessor = nn.Identity() + self.conv = nn.Conv2d(1, 1, 1) + + def forward(self, inputs, data_samples, mode='tensor'): + pass + + def train_step(self, data, optim_wrapper): + pass + + +class ParamRecordHook(Hook): + + def __init__(self, by_epoch): + super().__init__() + self.by_epoch = by_epoch + self.lr_list = [] + self.momentum_list = [] + self.wd_list = [] + self.task_id = 0 + self.progress = Progress(BarColumn(), MofNCompleteColumn(), + TextColumn('{task.description}')) + + def before_train(self, runner): + if self.by_epoch: + total = runner.train_loop.max_epochs + self.task_id = self.progress.add_task( + 'epochs', start=True, total=total) + else: + total = runner.train_loop.max_iters + self.task_id = self.progress.add_task( + 'iters', start=True, total=total) + self.progress.start() + + def after_train_epoch(self, runner): + if self.by_epoch: + self.progress.update(self.task_id, advance=1) + + # TODO: Support multiple schedulers + def after_train_iter(self, runner, batch_idx, data_batch, outputs): + if not self.by_epoch: + self.progress.update(self.task_id, advance=1) + self.lr_list.append(runner.optim_wrapper.get_lr()['lr'][0]) + self.momentum_list.append( + runner.optim_wrapper.get_momentum()['momentum'][0]) + self.wd_list.append( + runner.optim_wrapper.param_groups[0]['weight_decay']) + + def after_train(self, runner): + self.progress.stop() + + +def plot_curve(lr_list, args, param_name, iters_per_epoch, by_epoch=True): + """Plot learning rate vs iter graph.""" + try: + import seaborn as sns + sns.set_style(args.style) + except ImportError: + pass + + wind_w, wind_h = args.window_size.split('*') + wind_w, wind_h = int(wind_w), int(wind_h) + plt.figure(figsize=(wind_w, wind_h)) + + ax: plt.Axes = plt.subplot() + ax.plot(lr_list, linewidth=1) + + if by_epoch: + ax.xaxis.tick_top() + ax.set_xlabel('Iters') + ax.xaxis.set_label_position('top') + sec_ax = ax.secondary_xaxis( + 'bottom', + functions=(lambda x: x / iters_per_epoch, + lambda y: y * iters_per_epoch)) 
+ sec_ax.set_xlabel('Epochs') + else: + plt.xlabel('Iters') + plt.ylabel(param_name) + + if args.title is None: + plt.title(f'{osp.basename(args.config)} {param_name} curve') + else: + plt.title(args.title) + + +def simulate_train(data_loader, cfg, by_epoch): + model = SimpleModel() + param_record_hook = ParamRecordHook(by_epoch=by_epoch) + default_hooks = dict( + param_scheduler=cfg.default_hooks['param_scheduler'], + runtime_info=None, + timer=None, + logger=None, + checkpoint=None, + sampler_seed=None, + param_record=param_record_hook) + + runner = Runner( + model=model, + work_dir=cfg.work_dir, + train_dataloader=data_loader, + train_cfg=cfg.train_cfg, + log_level=cfg.log_level, + optim_wrapper=cfg.optim_wrapper, + param_scheduler=cfg.param_scheduler, + default_scope=cfg.default_scope, + default_hooks=default_hooks, + visualizer=MagicMock(spec=Visualizer), + custom_hooks=cfg.get('custom_hooks', None)) + + runner.train() + + param_dict = dict( + lr=param_record_hook.lr_list, + momentum=param_record_hook.momentum_list, + wd=param_record_hook.wd_list) + + return param_dict + + +def main(): + args = parse_args() + cfg = Config.fromfile(args.config) + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + if cfg.get('work_dir', None) is None: + # use config filename as default work_dir if cfg.work_dir is None + cfg.work_dir = osp.join('./work_dirs', + osp.splitext(osp.basename(args.config))[0]) + + cfg.log_level = args.log_level + + init_default_scope(cfg.get('default_scope', 'mmyolo')) + + # init logger + print('Param_scheduler :') + rich.print_json(json.dumps(cfg.param_scheduler)) + + # prepare data loader + batch_size = cfg.train_dataloader.batch_size * args.ngpus + + if 'by_epoch' in cfg.train_cfg: + by_epoch = cfg.train_cfg.get('by_epoch') + elif 'type' in cfg.train_cfg: + by_epoch = cfg.train_cfg.get('type') == 'EpochBasedTrainLoop' + else: + raise ValueError('please set `train_cfg`.') + + if args.dataset_size is None and by_epoch: + from mmyolo.registry import DATASETS + dataset_size = len(DATASETS.build(cfg.train_dataloader.dataset)) + else: + dataset_size = args.dataset_size or batch_size + + class FakeDataloader(list): + dataset = MagicMock(metainfo=None) + + data_loader = FakeDataloader(range(dataset_size // batch_size)) + dataset_info = ( + f'\nDataset infos:' + f'\n - Dataset size: {dataset_size}' + f'\n - Batch size per GPU: {cfg.train_dataloader.batch_size}' + f'\n - Number of GPUs: {args.ngpus}' + f'\n - Total batch size: {batch_size}') + if by_epoch: + dataset_info += f'\n - Iterations per epoch: {len(data_loader)}' + rich.print(dataset_info + '\n') + + # simulation training process + param_dict = simulate_train(data_loader, cfg, by_epoch) + param_list = param_dict[args.parameter] + + if args.parameter == 'lr': + param_name = 'Learning Rate' + elif args.parameter == 'momentum': + param_name = 'Momentum' + else: + param_name = 'Weight Decay' + plot_curve(param_list, args, param_name, len(data_loader), by_epoch) + + if args.out_dir: + # make dir for output + mkdir_or_exist(args.out_dir) + + # save the graph + out_file = osp.join( + args.out_dir, f'{osp.basename(args.config)}-{args.parameter}.jpg') + plt.savefig(out_file) + print(f'\nThe {param_name} graph is saved at {out_file}') + + if not args.not_show: + plt.show() + + +if __name__ == '__main__': + main() diff --git a/third_party/mmyolo/tools/dataset_converters/balloon2coco.py b/third_party/mmyolo/tools/dataset_converters/balloon2coco.py new file mode 100644 index 
0000000000000000000000000000000000000000..65eb660cb09f850bafb1e743ff840b14200fa975 --- /dev/null +++ b/third_party/mmyolo/tools/dataset_converters/balloon2coco.py @@ -0,0 +1,58 @@ +import os.path as osp + +import mmcv +import mmengine + + +def convert_balloon_to_coco(ann_file, out_file, image_prefix): + + data_infos = mmengine.load(ann_file) + + annotations = [] + images = [] + obj_count = 0 + for idx, v in enumerate(mmengine.track_iter_progress(data_infos.values())): + filename = v['filename'] + img_path = osp.join(image_prefix, filename) + height, width = mmcv.imread(img_path).shape[:2] + + images.append( + dict(id=idx, file_name=filename, height=height, width=width)) + + for _, obj in v['regions'].items(): + assert not obj['region_attributes'] + obj = obj['shape_attributes'] + px = obj['all_points_x'] + py = obj['all_points_y'] + poly = [(x + 0.5, y + 0.5) for x, y in zip(px, py)] + poly = [p for x in poly for p in x] + + x_min, y_min, x_max, y_max = (min(px), min(py), max(px), max(py)) + + data_anno = dict( + image_id=idx, + id=obj_count, + category_id=0, + bbox=[x_min, y_min, x_max - x_min, y_max - y_min], + area=(x_max - x_min) * (y_max - y_min), + segmentation=[poly], + iscrowd=0) + annotations.append(data_anno) + obj_count += 1 + + coco_format_json = dict( + images=images, + annotations=annotations, + categories=[{ + 'id': 0, + 'name': 'balloon' + }]) + mmengine.dump(coco_format_json, out_file) + + +if __name__ == '__main__': + + convert_balloon_to_coco('data/balloon/train/via_region_data.json', + 'data/balloon/train.json', 'data/balloon/train/') + convert_balloon_to_coco('data/balloon/val/via_region_data.json', + 'data/balloon/val.json', 'data/balloon/val/') diff --git a/third_party/mmyolo/tools/dataset_converters/dota/README.md b/third_party/mmyolo/tools/dataset_converters/dota/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a166e2793a0aeb0d08d9e19e7bd5abfd8d8240e5 --- /dev/null +++ b/third_party/mmyolo/tools/dataset_converters/dota/README.md @@ -0,0 +1,3 @@ +# Preparing DOTA Dataset + +Please refer to [Dataset preparation and description](../../../docs/en/recommended_topics/dataset_preparation.md) diff --git a/third_party/mmyolo/tools/dataset_converters/dota/dota_split.py b/third_party/mmyolo/tools/dataset_converters/dota/dota_split.py new file mode 100644 index 0000000000000000000000000000000000000000..0418e9d3c9a7c87a04b825c152f4784f2a7150fa --- /dev/null +++ b/third_party/mmyolo/tools/dataset_converters/dota/dota_split.py @@ -0,0 +1,603 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
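As a worked example of the conversion performed by `convert_balloon_to_coco` above, the sketch below turns one hypothetical VIA polygon region into a COCO-style annotation: the bbox comes from the polygon extremes and the segmentation is the flattened polygon. All coordinate values are made up.

```python
# Minimal sketch (illustrative): one VIA polygon -> COCO annotation record.
px = [10, 50, 40]                        # all_points_x from the VIA region
py = [20, 25, 70]                        # all_points_y
poly = [(x + 0.5, y + 0.5) for x, y in zip(px, py)]
poly = [p for xy in poly for p in xy]    # flatten to [x1, y1, x2, y2, ...]

x_min, y_min, x_max, y_max = min(px), min(py), max(px), max(py)
data_anno = dict(
    image_id=0,
    id=0,
    category_id=0,                       # single 'balloon' category
    bbox=[x_min, y_min, x_max - x_min, y_max - y_min],   # [x, y, w, h]
    area=(x_max - x_min) * (y_max - y_min),
    segmentation=[poly],
    iscrowd=0)
print(data_anno['bbox'], data_anno['area'])   # [10, 20, 40, 50] 2000
```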
+# Reference: https://github.com/jbwang1997/BboxToolkit + +import argparse +import codecs +import datetime +import itertools +import os +import os.path as osp +import time +from functools import partial, reduce +from math import ceil +from multiprocessing import Manager, Pool +from typing import List, Sequence + +import cv2 +import numpy as np +from mmengine import Config, MMLogger, mkdir_or_exist, print_log +from PIL import Image + +Image.MAX_IMAGE_PIXELS = None + +try: + import shapely.geometry as shgeo +except ImportError: + raise ImportError('Please run "pip install shapely" ' + 'to install shapely first.') + +PHASE_REQUIRE_SETS = dict( + trainval=['train', 'val'], + train=[ + 'train', + ], + val=[ + 'val', + ], + test=[ + 'test', + ], +) + + +def parse_args(): + """Parse arguments.""" + parser = argparse.ArgumentParser() + parser.add_argument( + 'split_config', type=str, help='The split config for image slicing.') + parser.add_argument( + 'data_root', type=str, help='Root dir of DOTA dataset.') + parser.add_argument( + 'out_dir', type=str, help='Output dir for split result.') + parser.add_argument( + '--ann-subdir', + default='labelTxt-v1.0', + type=str, + help='output directory') + parser.add_argument( + '--phase', + '-p', + nargs='+', + default=['trainval', 'test'], + type=str, + choices=['trainval', 'train', 'val', 'test'], + help='Phase of the data set to be prepared.') + parser.add_argument( + '--nproc', default=8, type=int, help='Number of processes.') + parser.add_argument( + '--save-ext', + default=None, + type=str, + help='Extension of the saved image.') + parser.add_argument( + '--overwrite', + action='store_true', + help='Whether to allow overwrite if annotation folder exist.') + args = parser.parse_args() + + assert args.split_config is not None, "argument split_config can't be None" + split_cfg = Config.fromfile(args.split_config) + + # assert arguments + assert args.data_root is not None, "argument data_root can't be None" + if args.save_ext: + assert args.save_ext in ['png', 'jpg', 'bmp', 'tif'] + + assert len(split_cfg.patch_sizes) == len(split_cfg.patch_overlap_sizes) + assert 0 <= split_cfg.iof_thr <= 1 + if split_cfg.get('padding'): + padding_value = split_cfg.get('padding_value') + assert padding_value is not None, \ + "padding_value can't be None when padding is True." + padding_value = padding_value[0] \ + if len(padding_value) == 1 else padding_value + split_cfg.padding_value = padding_value + else: + split_cfg.padding = False + split_cfg.padding_value = None + return args, split_cfg + + +def _make_dirs(out_dir: str, phase: List[str], allow_overwrite: bool): + """Prepare folder for DOTA dataset. + + Args: + out_dir (str): The output dir for DOTA split. + phase (List[str]): The phase to prepare. + allow_overwrite (bool): Whether to allow overwrite when folder exist. 
+ """ + logger = MMLogger.get_current_instance() + for p in phase: + phase_dir = osp.join(out_dir, p) + if not allow_overwrite: + assert not osp.exists(phase_dir), \ + f'{osp.join(phase_dir)} already exists,' \ + 'If you want to ignore existing files, set --overwrite' + else: + if osp.exists(phase_dir): + logger.warning( + f'{p} set in {osp.join(phase_dir)} will be overwritten') + mkdir_or_exist(phase_dir) + mkdir_or_exist(osp.join(phase_dir, 'images')) + mkdir_or_exist(osp.join(phase_dir, 'annfiles')) + + +def load_original_annotations(data_root: str, + ann_subdir: str = 'labelTxt-v1.0', + phase: str = 'train', + nproc: int = 8): + img_dir = osp.join(data_root, phase, 'images') + assert osp.isdir(img_dir), f'The {img_dir} is not an existing dir!' + + if phase == 'test': + ann_dir = None + else: + ann_dir = osp.join(data_root, phase, ann_subdir, 'labelTxt') + assert osp.isdir(ann_dir), f'The {ann_dir} is not an existing dir!' + + _load_func = partial(_load_dota_single, img_dir=img_dir, ann_dir=ann_dir) + if nproc > 1: + pool = Pool(nproc) + contents = pool.map(_load_func, os.listdir(img_dir)) + pool.close() + else: + contents = list(map(_load_func, os.listdir(img_dir))) + infos = [c for c in contents if c is not None] + return infos + + +def _load_dota_single(imgfile: str, img_dir: str, ann_dir: str): + """Load DOTA's single image. + + Args: + imgfile (str): Filename of single image. + img_dir (str): Path of images. + ann_dir (str): Path of annotations. + + Returns: + result (dict): Information of a single image. + + - ``id``: Image id. + - ``filename``: Filename of single image. + - ``filepath``: Filepath of single image. + - ``width``: The width of image. + - ``height``: The height of image. + - ``annotations``: The annotation of single image. + - ``gsd``: The ground sampling distance. + """ + img_id, ext = osp.splitext(imgfile) + if ext not in ['.jpg', '.JPG', '.png', '.tif', '.bmp']: + return None + + imgpath = osp.join(img_dir, imgfile) + size = Image.open(imgpath).size + txtfile = None if ann_dir is None else osp.join(ann_dir, img_id + '.txt') + content = _load_dota_txt(txtfile) + + content.update( + dict( + width=size[0], + height=size[1], + filename=imgfile, + filepath=imgpath, + id=img_id)) + return content + + +def _load_dota_txt(txtfile): + """Load DOTA's txt annotation. + + Args: + txtfile (str): Filename of single Dota txt annotation. + + Returns: + result (dict): Annotation of single image. + + - ``annotations``: The annotation of single image. + - ``gsd``: The ground sampling distance. + """ + gsd, bboxes, labels, diffs = None, [], [], [] + if txtfile is None: + pass + elif not osp.isfile(txtfile): + print(f"Can't find {txtfile}, treated as empty txtfile") + else: + with open(txtfile) as f: + for line in f: + if line.startswith('gsd'): + num = line.split(':')[-1] + try: + gsd = float(num) + except ValueError: + gsd = None + continue + + items = line.split(' ') + if len(items) >= 9: + bboxes.append([float(i) for i in items[:8]]) + labels.append(items[8]) + diffs.append(int(items[9]) if len(items) == 10 else 0) + + bboxes = np.array(bboxes, dtype=np.float32) if bboxes else \ + np.zeros((0, 8), dtype=np.float32) + diffs = np.array(diffs, dtype=np.int64) if diffs else \ + np.zeros((0,), dtype=np.int64) + ann = dict(bboxes=bboxes, labels=labels, diffs=diffs) + return dict(gsd=gsd, annotations=ann) + + +def poly2hbb(polys): + """Convert polygons to horizontal bboxes. + + Args: + polys (np.array): Polygons with shape (N, 8) + + Returns: + np.array: Horizontal bboxes. 
+ """ + shape = polys.shape + polys = polys.reshape(*shape[:-1], shape[-1] // 2, 2) + lt_point = np.min(polys, axis=-2) + rb_point = np.max(polys, axis=-2) + return np.concatenate([lt_point, rb_point], axis=-1) + + +def get_sliding_window(info, patch_settings, img_rate_thr): + """Get sliding windows. + + Args: + info (dict): Dict of image's width and height. + patch_settings (list): List of patch settings, + each in format (patch_size, patch_overlap). + img_rate_thr (float): Threshold of window area divided by image area. + + Returns: + list[np.array]: Information of valid windows. + """ + eps = 0.01 + windows = [] + width, height = info['width'], info['height'] + for (size, gap) in patch_settings: + assert size > gap, f'invaild size gap pair [{size} {gap}]' + step = size - gap + + x_num = 1 if width <= size else ceil((width - size) / step + 1) + x_start = [step * i for i in range(x_num)] + if len(x_start) > 1 and x_start[-1] + size > width: + x_start[-1] = width - size + + y_num = 1 if height <= size else ceil((height - size) / step + 1) + y_start = [step * i for i in range(y_num)] + if len(y_start) > 1 and y_start[-1] + size > height: + y_start[-1] = height - size + + start = np.array( + list(itertools.product(x_start, y_start)), dtype=np.int64) + stop = start + size + windows.append(np.concatenate([start, stop], axis=1)) + windows = np.concatenate(windows, axis=0) + + img_in_wins = windows.copy() + img_in_wins[:, 0::2] = np.clip(img_in_wins[:, 0::2], 0, width) + img_in_wins[:, 1::2] = np.clip(img_in_wins[:, 1::2], 0, height) + img_areas = (img_in_wins[:, 2] - img_in_wins[:, 0]) * \ + (img_in_wins[:, 3] - img_in_wins[:, 1]) + win_areas = (windows[:, 2] - windows[:, 0]) * \ + (windows[:, 3] - windows[:, 1]) + img_rates = img_areas / win_areas + if not (img_rates > img_rate_thr).any(): + max_rate = img_rates.max() + img_rates[abs(img_rates - max_rate) < eps] = 1 + return windows[img_rates > img_rate_thr] + + +def get_window_annotation(info, windows, iof_thr): + """Get annotation by sliding windows. + + Args: + info (dict): Dict of bbox annotations. + windows (np.array): information of sliding windows. + iof_thr (float): Threshold of overlaps between bbox and window. + + Returns: + list[dict]: List of bbox annotations of every window. + """ + bboxes = info['annotations']['bboxes'] + iofs = ann_window_iof(bboxes, windows) + + window_anns = [] + for i in range(windows.shape[0]): + win_iofs = iofs[:, i] + pos_inds = np.nonzero(win_iofs >= iof_thr)[0].tolist() + + win_ann = dict() + for k, v in info['annotations'].items(): + try: + win_ann[k] = v[pos_inds] + except TypeError: + win_ann[k] = [v[i] for i in pos_inds] + win_ann['trunc'] = win_iofs[pos_inds] < 1 + window_anns.append(win_ann) + return window_anns + + +def ann_window_iof(anns, window, eps=1e-6): + """Compute overlaps (iof) between annotations (poly) and window (hbox). + + Args: + anns (np.array): quadri annotations with shape (n, 8). + window (np.array): slide windows with shape (m, 4). + eps (float, optional): Defaults to 1e-6. + + Returns: + np.array: iof between box and window. 
+ """ + rows = anns.shape[0] + cols = window.shape[0] + + if rows * cols == 0: + return np.zeros((rows, cols), dtype=np.float32) + + hbboxes_ann = poly2hbb(anns) + hbboxes_win = window + hbboxes_ann = hbboxes_ann[:, None, :] + lt = np.maximum(hbboxes_ann[..., :2], hbboxes_win[..., :2]) + rb = np.minimum(hbboxes_ann[..., 2:], hbboxes_win[..., 2:]) + wh = np.clip(rb - lt, 0, np.inf) + h_overlaps = wh[..., 0] * wh[..., 1] + + l, t, r, b = (window[..., i] for i in range(4)) + polys_win = np.stack([l, t, r, t, r, b, l, b], axis=-1) + sg_polys_ann = [shgeo.Polygon(p) for p in anns.reshape(rows, -1, 2)] + sg_polys_win = [shgeo.Polygon(p) for p in polys_win.reshape(cols, -1, 2)] + overlaps = np.zeros(h_overlaps.shape) + for p in zip(*np.nonzero(h_overlaps)): + overlaps[p] = sg_polys_ann[p[0]].intersection(sg_polys_win[p[-1]]).area + unions = np.array([p.area for p in sg_polys_ann], dtype=np.float32) + unions = unions[..., None] + + unions = np.clip(unions, eps, np.inf) + outputs = overlaps / unions + if outputs.ndim == 1: + outputs = outputs[..., None] + return outputs + + +def crop_and_save_img(info, windows, window_anns, padding, padding_value, + save_dir, anno_dir, img_ext): + """Crop the image and save. + + Args: + info (dict): Image's information. + windows (np.array): information of sliding windows. + window_anns (list[dict]): List of bbox annotations of every window. + padding (bool): If True, with padding. + padding_value (tuple[int|float]): Padding value. + save_dir (str): Save filename. + anno_dir (str): Annotation filename. + img_ext (str): Picture suffix. + + Returns: + list[dict]: Information of paths. + """ + img = cv2.imread(info['filepath']) + patch_infos = [] + for window, ann in zip(windows, window_anns): + patch_info = dict() + for k, v in info.items(): + if k not in [ + 'id', 'filename', 'filepath', 'width', 'height', + 'annotations' + ]: + patch_info[k] = v + + x_start, y_start, x_stop, y_stop = window.tolist() + patch_info['x_start'] = x_start + patch_info['y_start'] = y_start + patch_info['id'] = \ + info['id'] + '__' + str(x_stop - x_start) + \ + '__' + str(x_start) + '___' + str(y_start) + patch_info['ori_id'] = info['id'] + + ann['bboxes'] = shift_qbboxes(ann['bboxes'], [-x_start, -y_start]) + patch_info['ann'] = ann + + patch = img[y_start:y_stop, x_start:x_stop] + if padding: + height = y_stop - y_start + width = x_stop - x_start + if height > patch.shape[0] or width > patch.shape[1]: + padding_patch = np.empty((height, width, patch.shape[-1]), + dtype=np.uint8) + if not isinstance(padding_value, (int, float)): + assert len(padding_value) == patch.shape[-1] + padding_patch[...] = padding_value + padding_patch[:patch.shape[0], :patch.shape[1], ...] = patch + patch = padding_patch + patch_info['height'] = patch.shape[0] + patch_info['width'] = patch.shape[1] + + cv2.imwrite( + osp.join(save_dir, patch_info['id'] + '.' + img_ext), patch) + patch_info['filename'] = patch_info['id'] + '.' 
+ img_ext + patch_infos.append(patch_info) + + bboxes_num = patch_info['ann']['bboxes'].shape[0] + outdir = os.path.join(anno_dir, patch_info['id'] + '.txt') + + with codecs.open(outdir, 'w', 'utf-8') as f_out: + if bboxes_num == 0: + pass + else: + for idx in range(bboxes_num): + obj = patch_info['ann'] + outline = ' '.join(list(map(str, obj['bboxes'][idx]))) + diffs = str( + obj['diffs'][idx]) if not obj['trunc'][idx] else '2' + outline = outline + ' ' + obj['labels'][idx] + ' ' + diffs + f_out.write(outline + '\n') + + return patch_infos + + +def shift_qbboxes(bboxes, offset: Sequence[float]): + """Map bboxes from window coordinate back to original coordinate. TODO + Refactor and move to `mmyolo/utils/large_image.py` + + Args: + bboxes (np.array): quadrilateral boxes with window coordinate. + offset (Sequence[float]): The translation offsets with shape of (2, ). + + Returns: + np.array: bboxes with original coordinate. + """ + dim = bboxes.shape[-1] + translated = bboxes + np.array(offset * int(dim / 2), dtype=np.float32) + return translated + + +def single_split(info, patch_settings, min_img_ratio, iof_thr, padding, + padding_value, save_dir, anno_dir, img_ext, lock, prog, + total): + """Single image split. TODO Refactoring to make it more generic. + + Args: + info (dict): Image info and annotations. + patch_settings (list): List of patch settings, + each in format (patch_size, patch_overlap). + min_img_ratio (float): Threshold of window area divided by image area. + iof_thr (float): Threshold of overlaps between bbox and window. + padding (bool): If True, with padding. + padding_value (tuple[int|float]): Padding value. + save_dir (str): Save filename. + anno_dir (str): Annotation filename. + img_ext (str): Picture suffix. + lock (Lock): Lock of Manager. + prog (object): Progress of Manager. + total (int): Length of infos. + + Returns: + list[dict]: Information of paths. 
+ """ + img_ext = img_ext if img_ext is not None else info['filename'].split( + '.')[-1] + windows = get_sliding_window(info, patch_settings, min_img_ratio) + window_anns = get_window_annotation(info, windows, iof_thr) + patch_infos = crop_and_save_img(info, windows, window_anns, padding, + padding_value, save_dir, anno_dir, img_ext) + assert patch_infos + + lock.acquire() + prog.value += 1 + msg = f'({prog.value / total:3.1%} {prog.value}:{total})' + msg += ' - ' + f"Filename: {info['filename']}" + msg += ' - ' + f"width: {info['width']:<5d}" + msg += ' - ' + f"height: {info['height']:<5d}" + msg += ' - ' + f"Objects: {len(info['annotations']['bboxes']):<5d}" + msg += ' - ' + f'Patches: {len(patch_infos)}' + print_log(msg, 'current') + lock.release() + + return patch_infos + + +def main(): + args, split_cfg = parse_args() + + mkdir_or_exist(args.out_dir) + + # init logger + log_file_name = datetime.datetime.now().strftime('%Y%m%d_%H%M%S') + '.log' + logger: MMLogger = MMLogger.get_instance( + 'mmyolo', + log_file=osp.join(args.out_dir, log_file_name), + log_level='INFO') + + # print configs + arg_str = '' + for arg in args._get_kwargs(): + arg_str += arg[0] + ' = ' + str(arg[1]) + '\n' + + logger.info('Base Settings:\n' + arg_str) + logger.info('Split Settings:\n' + split_cfg.pretty_text) + + # make dirs + _make_dirs(args.out_dir, args.phase, args.overwrite) + + # Load original dota data + required_sets = [] + for p in args.phase: + required_sets.extend(PHASE_REQUIRE_SETS[p]) + required_sets = set(required_sets) + + loaded_data_set = dict() + for req_set in required_sets: + logger.info(f'Starting loading DOTA {req_set} set information.') + start_time = time.time() + + infos = load_original_annotations( + data_root=args.data_root, + ann_subdir=args.ann_subdir, + phase=req_set) + + end_time = time.time() + result_log = f'Finishing loading {req_set} set, ' + result_log += f'get {len(infos)} images, ' + result_log += f'using {end_time - start_time:.3f}s.' + logger.info(result_log) + + loaded_data_set[req_set] = infos + + # Preprocess patch settings + patch_settings = [] + for ratio in split_cfg.img_resize_ratio: + for size, gap in zip(split_cfg.patch_sizes, + split_cfg.patch_overlap_sizes): + size_gap = (int(size / ratio), int(gap / ratio)) + if size_gap not in patch_settings: + patch_settings.append(size_gap) + + # Split data + for p in args.phase: + save_imgs_dir = osp.join(args.out_dir, p, 'images') + save_anns_dir = osp.join(args.out_dir, p, 'annfiles') + + logger.info(f'Start splitting {p} set images!') + start = time.time() + manager = Manager() + + data_infos = [] + for req_set in PHASE_REQUIRE_SETS[p]: + data_infos.extend(loaded_data_set[req_set]) + + worker = partial( + single_split, + patch_settings=patch_settings, + min_img_ratio=split_cfg.min_img_ratio, + iof_thr=split_cfg.iof_thr, + padding=split_cfg.padding, + padding_value=split_cfg.padding_value, + save_dir=save_imgs_dir, + anno_dir=save_anns_dir, + img_ext=args.save_ext, + lock=manager.Lock(), + prog=manager.Value('i', 0), + total=len(data_infos)) + + if args.nproc > 1: + pool = Pool(args.nproc) + patch_infos = pool.map(worker, data_infos) + pool.close() + else: + patch_infos = list(map(worker, data_infos)) + + patch_infos = reduce(lambda x, y: x + y, patch_infos) + stop = time.time() + logger.info( + f'Finish splitting {p} set images in {int(stop - start)} second!!!' 
+ ) + logger.info(f'Total images number: {len(patch_infos)}') + + +if __name__ == '__main__': + main() diff --git a/third_party/mmyolo/tools/dataset_converters/dota/split_config/multi_scale.json b/third_party/mmyolo/tools/dataset_converters/dota/split_config/multi_scale.json new file mode 100644 index 0000000000000000000000000000000000000000..8cbdc93a4420abec7298f188a01ee71f38b94eb8 --- /dev/null +++ b/third_party/mmyolo/tools/dataset_converters/dota/split_config/multi_scale.json @@ -0,0 +1,19 @@ +{ + "patch_sizes": [ + 1024 + ], + "patch_overlap_sizes": [ + 500 + ], + "img_resize_ratio": [ + 0.5, 1.0, 1.5 + ], + "min_img_ratio": 0.6, + "iof_thr": 0.7, + "padding": true, + "padding_value": [ + 104, + 116, + 124 + ] +} diff --git a/third_party/mmyolo/tools/dataset_converters/dota/split_config/single_scale.json b/third_party/mmyolo/tools/dataset_converters/dota/split_config/single_scale.json new file mode 100644 index 0000000000000000000000000000000000000000..8c65c40ad63d522b3ab82956f6a7befdef874818 --- /dev/null +++ b/third_party/mmyolo/tools/dataset_converters/dota/split_config/single_scale.json @@ -0,0 +1,19 @@ +{ + "patch_sizes": [ + 1024 + ], + "patch_overlap_sizes": [ + 200 + ], + "img_resize_ratio": [ + 1.0 + ], + "min_img_ratio": 0.6, + "iof_thr": 0.7, + "padding": true, + "padding_value": [ + 104, + 116, + 124 + ] +} diff --git a/third_party/mmyolo/tools/dataset_converters/labelme2coco.py b/third_party/mmyolo/tools/dataset_converters/labelme2coco.py new file mode 100644 index 0000000000000000000000000000000000000000..e68b935db3236177d4c17973ef2a43159150ffc7 --- /dev/null +++ b/third_party/mmyolo/tools/dataset_converters/labelme2coco.py @@ -0,0 +1,325 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""This script helps to convert labelme-style dataset to the coco format. + +Usage: + $ python labelme2coco.py \ + --img-dir /path/to/images \ + --labels-dir /path/to/labels \ + --out /path/to/coco_instances.json \ + [--class-id-txt /path/to/class_with_id.txt] + +Note: + Labels dir file structure: + . + └── PATH_TO_LABELS + ├── image1.json + ├── image2.json + └── ... + + Images dir file structure: + . + └── PATH_TO_IMAGES + ├── image1.jpg + ├── image2.png + └── ... + + If user set `--class-id-txt` then will use it in `categories` field, + if not set, then will generate auto base on the all labelme label + files to `class_with_id.json`. + + class_with_id.txt example, each line is "id class_name": + ```text + 1 cat + 2 dog + 3 bicycle + 4 motorcycle + + ``` +""" +import argparse +import json +from pathlib import Path +from typing import Optional + +import numpy as np +from mmengine import track_iter_progress + +from mmyolo.utils.misc import IMG_EXTENSIONS + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('--img-dir', type=str, help='Dataset image directory') + parser.add_argument( + '--labels-dir', type=str, help='Dataset labels directory') + parser.add_argument('--out', type=str, help='COCO label json output path') + parser.add_argument( + '--class-id-txt', default=None, type=str, help='All class id txt path') + args = parser.parse_args() + return args + + +def format_coco_annotations(points: list, image_id: int, annotations_id: int, + category_id: int) -> dict: + """Gen COCO annotations format label from labelme format label. + + Args: + points (list): Coordinates of four vertices of rectangle bbox. + image_id (int): Image id. + annotations_id (int): Annotations id. + category_id (int): Image dir path. 
+ + Return: + annotation_info (dict): COCO annotation data. + """ + annotation_info = dict() + annotation_info['iscrowd'] = 0 + annotation_info['category_id'] = category_id + annotation_info['id'] = annotations_id + annotation_info['image_id'] = image_id + + # bbox is [x1, y1, w, h] + annotation_info['bbox'] = [ + points[0][0], points[0][1], points[1][0] - points[0][0], + points[1][1] - points[0][1] + ] + + annotation_info['area'] = annotation_info['bbox'][2] * annotation_info[ + 'bbox'][3] # bbox w * h + segmentation_points = np.asarray(points).copy() + segmentation_points[1, :] = np.asarray(points)[2, :] + segmentation_points[2, :] = np.asarray(points)[1, :] + annotation_info['segmentation'] = [list(segmentation_points.flatten())] + + return annotation_info + + +def parse_labelme_to_coco( + image_dir: str, + labels_root: str, + all_classes_id: Optional[dict] = None) -> (dict, dict): + """Gen COCO json format label from labelme format label. + + Args: + image_dir (str): Image dir path. + labels_root (str): Image label root path. + all_classes_id (Optional[dict]): All class with id. Default None. + + Return: + coco_json (dict): COCO json data. + category_to_id (dict): category id and name. + + COCO json example: + + { + "images": [ + { + "height": 3000, + "width": 4000, + "id": 1, + "file_name": "IMG_20210627_225110.jpg" + }, + ... + ], + "categories": [ + { + "id": 1, + "name": "cat" + }, + ... + ], + "annotations": [ + { + "iscrowd": 0, + "category_id": 1, + "id": 1, + "image_id": 1, + "bbox": [ + 1183.7313232421875, + 1230.0509033203125, + 1270.9998779296875, + 927.0848388671875 + ], + "area": 1178324.7170306593, + "segmentation": [ + [ + 1183.7313232421875, + 1230.0509033203125, + 1183.7313232421875, + 2157.1357421875, + 2454.731201171875, + 2157.1357421875, + 2454.731201171875, + 1230.0509033203125 + ] + ] + }, + ... 
+ ] + } + """ + + # init coco json field + coco_json = {'images': [], 'categories': [], 'annotations': []} + + image_id = 0 + annotations_id = 0 + if all_classes_id is None: + category_to_id = dict() + categories_labels = [] + else: + category_to_id = all_classes_id + categories_labels = list(all_classes_id.keys()) + + # add class_ids and class_names to the categories list in coco_json + for class_name, class_id in category_to_id.items(): + coco_json['categories'].append({ + 'id': class_id, + 'name': class_name + }) + + # filter incorrect image file + img_file_list = [ + img_file for img_file in Path(image_dir).iterdir() + if img_file.suffix.lower() in IMG_EXTENSIONS + ] + + for img_file in track_iter_progress(img_file_list): + + # get label file according to the image file name + label_path = Path(labels_root).joinpath( + img_file.stem).with_suffix('.json') + if not label_path.exists(): + print(f'Can not find label file: {label_path}, skip...') + continue + + # load labelme label + with open(label_path, encoding='utf-8') as f: + labelme_data = json.load(f) + + image_id = image_id + 1 # coco id begin from 1 + + # update coco 'images' field + coco_json['images'].append({ + 'height': + labelme_data['imageHeight'], + 'width': + labelme_data['imageWidth'], + 'id': + image_id, + 'file_name': + Path(labelme_data['imagePath']).name + }) + + for label_shapes in labelme_data['shapes']: + + # Update coco 'categories' field + class_name = label_shapes['label'] + + if (all_classes_id is None) and (class_name + not in categories_labels): + # only update when not been added before + coco_json['categories'].append({ + 'id': + len(categories_labels) + 1, # categories id start with 1 + 'name': class_name + }) + categories_labels.append(class_name) + category_to_id[class_name] = len(categories_labels) + + elif (all_classes_id is not None) and (class_name + not in categories_labels): + # check class name + raise ValueError(f'Got unexpected class name {class_name}, ' + 'which is not in your `--class-id-txt`.') + + # get shape type and convert it to coco format + shape_type = label_shapes['shape_type'] + if shape_type != 'rectangle': + print(f'not support `{shape_type}` yet, skip...') + continue + + annotations_id = annotations_id + 1 + # convert point from [xmin, ymin, xmax, ymax] to [x1, y1, w, h] + (x1, y1), (x2, y2) = label_shapes['points'] + x1, x2 = sorted([x1, x2]) # xmin, xmax + y1, y2 = sorted([y1, y2]) # ymin, ymax + points = [[x1, y1], [x2, y2], [x1, y2], [x2, y1]] + coco_annotations = format_coco_annotations( + points, image_id, annotations_id, category_to_id[class_name]) + coco_json['annotations'].append(coco_annotations) + + print(f'Total image = {image_id}') + print(f'Total annotations = {annotations_id}') + print(f'Number of categories = {len(categories_labels)}, ' + f'which is {categories_labels}') + + return coco_json, category_to_id + + +def convert_labelme_to_coco(image_dir: str, + labels_dir: str, + out_path: str, + class_id_txt: Optional[str] = None): + """Convert labelme format label to COCO json format label. + + Args: + image_dir (str): Image dir path. + labels_dir (str): Image label path. + out_path (str): COCO json file save path. + class_id_txt (Optional[str]): All class id txt file path. + Default None. 
+ """ + assert Path(out_path).suffix == '.json' + + if class_id_txt is not None: + assert Path(class_id_txt).suffix == '.txt' + + all_classes_id = dict() + with open(class_id_txt, encoding='utf-8') as f: + txt_lines = f.read().splitlines() + assert len(txt_lines) > 0 + + for txt_line in txt_lines: + class_info = txt_line.split(' ') + if len(class_info) != 2: + raise ValueError('Error parse "class_id_txt" file ' + f'{class_id_txt}, please check if some of ' + 'the class names is blank, like "1 " -> ' + '"1 blank", or class name has space between' + ' words, like "1 Big house" -> "1 ' + 'Big-house".') + v, k = class_info + all_classes_id.update({k: int(v)}) + else: + all_classes_id = None + + # convert to coco json + coco_json_data, category_to_id = parse_labelme_to_coco( + image_dir, labels_dir, all_classes_id) + + # save json result + Path(out_path).parent.mkdir(exist_ok=True, parents=True) + print(f'Saving json to {out_path}') + json.dump(coco_json_data, open(out_path, 'w'), indent=2) + + if class_id_txt is None: + category_to_id_path = Path(out_path).with_name('class_with_id.txt') + print(f'Saving class id txt to {category_to_id_path}') + with open(category_to_id_path, 'w', encoding='utf-8') as f: + for k, v in category_to_id.items(): + f.write(f'{v} {k}\n') + else: + print('Not Saving new class id txt, user should using ' + f'{class_id_txt} for training config') + + +def main(): + args = parse_args() + convert_labelme_to_coco(args.img_dir, args.labels_dir, args.out, + args.class_id_txt) + print('All done!') + + +if __name__ == '__main__': + main() diff --git a/third_party/mmyolo/tools/dataset_converters/yolo2coco.py b/third_party/mmyolo/tools/dataset_converters/yolo2coco.py new file mode 100644 index 0000000000000000000000000000000000000000..19f1366622a3305f001e6e6650ad31f98c54b7c7 --- /dev/null +++ b/third_party/mmyolo/tools/dataset_converters/yolo2coco.py @@ -0,0 +1,264 @@ +"""This script helps to convert yolo-style dataset to the coco format. + +Usage: + $ python yolo2coco.py /path/to/dataset # image_dir + +Note: + 1. Before running this script, please make sure the root directory + of your dataset is formatted in the following struction: + . + └── $ROOT_PATH + ├── classes.txt + ├── labels + │ ├── a.txt + │ ├── b.txt + │ └── ... + ├── images + │ ├── a.jpg + │ ├── b.png + │ └── ... + └── ... + 2. The script will automatically check whether the corresponding + `train.txt`, ` val.txt`, and `test.txt` exist under your `image_dir` + or not. If these files are detected, the script will organize the + dataset. The image paths in these files must be ABSOLUTE paths. + 3. Once the script finishes, the result files will be saved in the + directory named 'annotations' in the root directory of your dataset. + The default output file is result.json. The root directory folder may + look like this in the root directory after the converting: + . + └── $ROOT_PATH + ├── annotations + │ ├── result.json + │ └── ... + ├── classes.txt + ├── labels + │ ├── a.txt + │ ├── b.txt + │ └── ... + ├── images + │ ├── a.jpg + │ ├── b.png + │ └── ... + └── ... + 4. After converting to coco, you can use the + `tools/analysis_tools/browse_coco_json.py` script to visualize + whether it is correct. 
+""" +import argparse +import os +import os.path as osp + +import mmcv +import mmengine + +IMG_EXTENSIONS = ('.jpg', '.png', '.jpeg') + + +def check_existence(file_path: str): + """Check if target file is existed.""" + if not osp.exists(file_path): + raise FileNotFoundError(f'{file_path} does not exist!') + + +def get_image_info(yolo_image_dir, idx, file_name): + """Retrieve image information.""" + img_path = osp.join(yolo_image_dir, file_name) + check_existence(img_path) + + img = mmcv.imread(img_path) + height, width = img.shape[:2] + img_info_dict = { + 'file_name': file_name, + 'id': idx, + 'width': width, + 'height': height + } + return img_info_dict, height, width + + +def convert_bbox_info(label, idx, obj_count, image_height, image_width): + """Convert yolo-style bbox info to the coco format.""" + label = label.strip().split() + x = float(label[1]) + y = float(label[2]) + w = float(label[3]) + h = float(label[4]) + + # convert x,y,w,h to x1,y1,x2,y2 + x1 = (x - w / 2) * image_width + y1 = (y - h / 2) * image_height + x2 = (x + w / 2) * image_width + y2 = (y + h / 2) * image_height + + cls_id = int(label[0]) + width = max(0., x2 - x1) + height = max(0., y2 - y1) + coco_format_info = { + 'image_id': idx, + 'id': obj_count, + 'category_id': cls_id, + 'bbox': [x1, y1, width, height], + 'area': width * height, + 'segmentation': [[x1, y1, x2, y1, x2, y2, x1, y2]], + 'iscrowd': 0 + } + obj_count += 1 + return coco_format_info, obj_count + + +def organize_by_existing_files(image_dir: str, existed_categories: list): + """Format annotations by existing train/val/test files.""" + categories = ['train', 'val', 'test'] + image_list = [] + + for cat in categories: + if cat in existed_categories: + txt_file = osp.join(image_dir, f'{cat}.txt') + print(f'Start to read {cat} dataset definition') + assert osp.exists(txt_file) + + with open(txt_file) as f: + img_paths = f.readlines() + img_paths = [ + os.path.split(img_path.strip())[1] + for img_path in img_paths + ] # split the absolute path + image_list.append(img_paths) + else: + image_list.append([]) + return image_list[0], image_list[1], image_list[2] + + +def convert_yolo_to_coco(image_dir: str): + """Convert annotations from yolo style to coco style. 
+ + Args: + image_dir (str): the root directory of your datasets which contains + labels, images, classes.txt, etc + """ + print(f'Start to load existing images and annotations from {image_dir}') + check_existence(image_dir) + + # check local environment + yolo_label_dir = osp.join(image_dir, 'labels') + yolo_image_dir = osp.join(image_dir, 'images') + yolo_class_txt = osp.join(image_dir, 'classes.txt') + check_existence(yolo_label_dir) + check_existence(yolo_image_dir) + check_existence(yolo_class_txt) + print(f'All necessary files are located at {image_dir}') + + train_txt_path = osp.join(image_dir, 'train.txt') + val_txt_path = osp.join(image_dir, 'val.txt') + test_txt_path = osp.join(image_dir, 'test.txt') + existed_categories = [] + print(f'Checking if train.txt, val.txt, and test.txt are in {image_dir}') + if osp.exists(train_txt_path): + print('Found train.txt') + existed_categories.append('train') + if osp.exists(val_txt_path): + print('Found val.txt') + existed_categories.append('val') + if osp.exists(test_txt_path): + print('Found test.txt') + existed_categories.append('test') + + # prepare the output folders + output_folder = osp.join(image_dir, 'annotations') + if not osp.exists(output_folder): + os.makedirs(output_folder) + check_existence(output_folder) + + # start the convert procedure + with open(yolo_class_txt) as f: + classes = f.read().strip().split() + + indices = os.listdir(yolo_image_dir) + total = len(indices) + + dataset = {'images': [], 'annotations': [], 'categories': []} + if existed_categories == []: + print('These files are not located, no need to organize separately.') + for i, cls in enumerate(classes, 0): + dataset['categories'].append({'id': i, 'name': cls}) + else: + print('Need to organize the data accordingly.') + train_dataset = {'images': [], 'annotations': [], 'categories': []} + val_dataset = {'images': [], 'annotations': [], 'categories': []} + test_dataset = {'images': [], 'annotations': [], 'categories': []} + + # category id starts from 0 + for i, cls in enumerate(classes, 0): + train_dataset['categories'].append({'id': i, 'name': cls}) + val_dataset['categories'].append({'id': i, 'name': cls}) + test_dataset['categories'].append({'id': i, 'name': cls}) + train_img, val_img, test_img = organize_by_existing_files( + image_dir, existed_categories) + + obj_count = 0 + skipped = 0 + converted = 0 + for idx, image in enumerate(mmengine.track_iter_progress(indices)): + img_info_dict, image_height, image_width = get_image_info( + yolo_image_dir, idx, image) + + if existed_categories != []: + if image in train_img: + dataset = train_dataset + elif image in val_img: + dataset = val_dataset + elif image in test_img: + dataset = test_dataset + + dataset['images'].append(img_info_dict) + + img_name = osp.splitext(image)[0] + label_path = f'{osp.join(yolo_label_dir, img_name)}.txt' + if not osp.exists(label_path): + # if current image is not annotated or the annotation file failed + print( + f'WARNING: {label_path} does not exist. Please check the file.' 
+ ) + skipped += 1 + continue + + with open(label_path) as f: + labels = f.readlines() + for label in labels: + coco_info, obj_count = convert_bbox_info( + label, idx, obj_count, image_height, image_width) + dataset['annotations'].append(coco_info) + converted += 1 + + # saving results to result json + if existed_categories == []: + out_file = osp.join(image_dir, 'annotations/result.json') + print(f'Saving converted results to {out_file} ...') + mmengine.dump(dataset, out_file) + else: + for category in existed_categories: + out_file = osp.join(output_folder, f'{category}.json') + print(f'Saving converted results to {out_file} ...') + if category == 'train': + mmengine.dump(train_dataset, out_file) + elif category == 'val': + mmengine.dump(val_dataset, out_file) + elif category == 'test': + mmengine.dump(test_dataset, out_file) + + # simple statistics + print(f'Process finished! Please check at {output_folder} .') + print(f'Number of images found: {total}, converted: {converted},', + f'and skipped: {skipped}. Total annotation count: {obj_count}.') + print('You can use tools/analysis_tools/browse_coco_json.py to visualize!') + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + 'image_dir', + type=str, + help='dataset directory with ./images and ./labels, classes.txt, etc.') + arg = parser.parse_args() + convert_yolo_to_coco(arg.image_dir) diff --git a/third_party/mmyolo/tools/dist_test.sh b/third_party/mmyolo/tools/dist_test.sh new file mode 100755 index 0000000000000000000000000000000000000000..dea131b43ea8f1222661d20603d40c18ea7f28a1 --- /dev/null +++ b/third_party/mmyolo/tools/dist_test.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash + +CONFIG=$1 +CHECKPOINT=$2 +GPUS=$3 +NNODES=${NNODES:-1} +NODE_RANK=${NODE_RANK:-0} +PORT=${PORT:-29500} +MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} + +PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ +python -m torch.distributed.launch \ + --nnodes=$NNODES \ + --node_rank=$NODE_RANK \ + --master_addr=$MASTER_ADDR \ + --nproc_per_node=$GPUS \ + --master_port=$PORT \ + $(dirname "$0")/test.py \ + $CONFIG \ + $CHECKPOINT \ + --launcher pytorch \ + ${@:4} diff --git a/third_party/mmyolo/tools/dist_train.sh b/third_party/mmyolo/tools/dist_train.sh new file mode 100755 index 0000000000000000000000000000000000000000..3fca7641dec4090930c85991a079c28409529d4e --- /dev/null +++ b/third_party/mmyolo/tools/dist_train.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash + +CONFIG=$1 +GPUS=$2 +NNODES=${NNODES:-1} +NODE_RANK=${NODE_RANK:-0} +PORT=${PORT:-29500} +MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} + +PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ +python -m torch.distributed.launch \ + --nnodes=$NNODES \ + --node_rank=$NODE_RANK \ + --master_addr=$MASTER_ADDR \ + --nproc_per_node=$GPUS \ + --master_port=$PORT \ + $(dirname "$0")/train.py \ + $CONFIG \ + --launcher pytorch ${@:3} diff --git a/third_party/mmyolo/tools/misc/coco_split.py b/third_party/mmyolo/tools/misc/coco_split.py new file mode 100644 index 0000000000000000000000000000000000000000..8ce70349b6e85f48704e6ef5c8e5c0164bc6084e --- /dev/null +++ b/third_party/mmyolo/tools/misc/coco_split.py @@ -0,0 +1,122 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
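+# Split a COCO-format json annotation file into sub-dataset json files.
+#
+# Illustrative usage (paths and ratio values are placeholders):
+#   python tools/misc/coco_split.py --json data/annotations/all.json \
+#       --out-dir data/annotations --ratios 0.8 0.1 0.1 --shuffle --seed 10
+#
+# Two ratios produce trainval.json + test.json, three ratios produce
+# train.json + val.json + test.json (see split_coco_dataset below).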
+import argparse +import json +import random +from pathlib import Path + +import numpy as np +from pycocotools.coco import COCO + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + '--json', type=str, required=True, help='COCO json label path') + parser.add_argument( + '--out-dir', type=str, required=True, help='output path') + parser.add_argument( + '--ratios', + nargs='+', + type=float, + help='ratio for sub dataset, if set 2 number then will generate ' + 'trainval + test (eg. "0.8 0.1 0.1" or "2 1 1"), if set 3 number ' + 'then will generate train + val + test (eg. "0.85 0.15" or "2 1")') + parser.add_argument( + '--shuffle', + action='store_true', + help='Whether to display in disorder') + parser.add_argument('--seed', default=-1, type=int, help='seed') + args = parser.parse_args() + return args + + +def split_coco_dataset(coco_json_path: str, save_dir: str, ratios: list, + shuffle: bool, seed: int): + if not Path(coco_json_path).exists(): + raise FileNotFoundError(f'Can not not found {coco_json_path}') + + if not Path(save_dir).exists(): + Path(save_dir).mkdir(parents=True) + + # ratio normalize + ratios = np.array(ratios) / np.array(ratios).sum() + + if len(ratios) == 2: + ratio_train, ratio_test = ratios + ratio_val = 0 + train_type = 'trainval' + elif len(ratios) == 3: + ratio_train, ratio_val, ratio_test = ratios + train_type = 'train' + else: + raise ValueError('ratios must set 2 or 3 group!') + + # Read coco info + coco = COCO(coco_json_path) + coco_image_ids = coco.getImgIds() + + # gen image number of each dataset + val_image_num = int(len(coco_image_ids) * ratio_val) + test_image_num = int(len(coco_image_ids) * ratio_test) + train_image_num = len(coco_image_ids) - val_image_num - test_image_num + print('Split info: ====== \n' + f'Train ratio = {ratio_train}, number = {train_image_num}\n' + f'Val ratio = {ratio_val}, number = {val_image_num}\n' + f'Test ratio = {ratio_test}, number = {test_image_num}') + + seed = int(seed) + if seed != -1: + print(f'Set the global seed: {seed}') + np.random.seed(seed) + + if shuffle: + print('shuffle dataset.') + random.shuffle(coco_image_ids) + + # split each dataset + train_image_ids = coco_image_ids[:train_image_num] + if val_image_num != 0: + val_image_ids = coco_image_ids[train_image_num:train_image_num + + val_image_num] + else: + val_image_ids = None + test_image_ids = coco_image_ids[train_image_num + val_image_num:] + + # Save new json + categories = coco.loadCats(coco.getCatIds()) + for img_id_list in [train_image_ids, val_image_ids, test_image_ids]: + if img_id_list is None: + continue + + # Gen new json + img_dict = { + 'images': coco.loadImgs(ids=img_id_list), + 'categories': categories, + 'annotations': coco.loadAnns(coco.getAnnIds(imgIds=img_id_list)) + } + + # save json + if img_id_list == train_image_ids: + json_file_path = Path(save_dir, f'{train_type}.json') + elif img_id_list == val_image_ids: + json_file_path = Path(save_dir, 'val.json') + elif img_id_list == test_image_ids: + json_file_path = Path(save_dir, 'test.json') + else: + raise ValueError('img_id_list ERROR!') + + print(f'Saving json to {json_file_path}') + with open(json_file_path, 'w') as f_json: + json.dump(img_dict, f_json, ensure_ascii=False, indent=2) + + print('All done!') + + +def main(): + args = parse_args() + split_coco_dataset(args.json, args.out_dir, args.ratios, args.shuffle, + args.seed) + + +if __name__ == '__main__': + main() diff --git a/third_party/mmyolo/tools/misc/download_dataset.py 
b/third_party/mmyolo/tools/misc/download_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..7d1c64d82ec21285c348afc65a102d49452f2d4a --- /dev/null +++ b/third_party/mmyolo/tools/misc/download_dataset.py @@ -0,0 +1,112 @@ +import argparse +from itertools import repeat +from multiprocessing.pool import ThreadPool +from pathlib import Path +from tarfile import TarFile +from zipfile import ZipFile + +import torch + + +def parse_args(): + parser = argparse.ArgumentParser( + description='Download datasets for training') + parser.add_argument( + '--dataset-name', type=str, help='dataset name', default='coco2017') + parser.add_argument( + '--save-dir', + type=str, + help='the dir to save dataset', + default='data/coco') + parser.add_argument( + '--unzip', + action='store_true', + help='whether unzip dataset or not, zipped files will be saved') + parser.add_argument( + '--delete', + action='store_true', + help='delete the download zipped files') + parser.add_argument( + '--threads', type=int, help='number of threading', default=4) + args = parser.parse_args() + return args + + +def download(url, dir, unzip=True, delete=False, threads=1): + + def download_one(url, dir): + f = dir / Path(url).name + if Path(url).is_file(): + Path(url).rename(f) + elif not f.exists(): + print(f'Downloading {url} to {f}') + torch.hub.download_url_to_file(url, f, progress=True) + if unzip and f.suffix in ('.zip', '.tar'): + print(f'Unzipping {f.name}') + if f.suffix == '.zip': + ZipFile(f).extractall(path=dir) + elif f.suffix == '.tar': + TarFile(f).extractall(path=dir) + if delete: + f.unlink() + print(f'Delete {f}') + + dir = Path(dir) + if threads > 1: + pool = ThreadPool(threads) + pool.imap(lambda x: download_one(*x), zip(url, repeat(dir))) + pool.close() + pool.join() + else: + for u in [url] if isinstance(url, (str, Path)) else url: + download_one(u, dir) + + +def main(): + args = parse_args() + path = Path(args.save_dir) + if not path.exists(): + path.mkdir(parents=True, exist_ok=True) + data2url = dict( + # TODO: Support for downloading Panoptic Segmentation of COCO + coco2017=[ + 'http://images.cocodataset.org/zips/train2017.zip', + 'http://images.cocodataset.org/zips/val2017.zip', + 'http://images.cocodataset.org/zips/test2017.zip', + 'http://images.cocodataset.org/annotations/' + + 'annotations_trainval2017.zip' + ], + lvis=[ + 'https://s3-us-west-2.amazonaws.com/dl.fbaipublicfiles.com/LVIS/lvis_v1_train.json.zip', # noqa + 'https://s3-us-west-2.amazonaws.com/dl.fbaipublicfiles.com/LVIS/lvis_v1_train.json.zip', # noqa + ], + voc2007=[ + 'http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar', # noqa + 'http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar', # noqa + 'http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCdevkit_08-Jun-2007.tar', # noqa + ], + voc2012=[ + 'http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar', # noqa + ], + balloon=[ + # src link: https://github.com/matterport/Mask_RCNN/releases/download/v2.1/balloon_dataset.zip # noqa + 'https://download.openmmlab.com/mmyolo/data/balloon_dataset.zip' + ], + cat=[ + 'https://download.openmmlab.com/mmyolo/data/cat_dataset.zip' # noqa + ], + ) + url = data2url.get(args.dataset_name, None) + if url is None: + print('Only support COCO, VOC, balloon, cat and LVIS now!') + return + download( + url, + dir=path, + unzip=args.unzip, + delete=args.delete, + threads=args.threads) + + +if __name__ == '__main__': + main() diff --git 
a/third_party/mmyolo/tools/misc/extract_subcoco.py b/third_party/mmyolo/tools/misc/extract_subcoco.py new file mode 100644 index 0000000000000000000000000000000000000000..31528e0b338bf26bdf5abbca0e2254413e87e186 --- /dev/null +++ b/third_party/mmyolo/tools/misc/extract_subcoco.py @@ -0,0 +1,160 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""Extracting subsets from coco2017 dataset. + +This script is mainly used to debug and verify the correctness of the +program quickly. +The root folder format must be in the following format: + +├── root +│ ├── annotations +│ ├── train2017 +│ ├── val2017 +│ ├── test2017 + +Currently, only support COCO2017. In the future will support user-defined +datasets of standard coco JSON format. + +Example: + python tools/misc/extract_subcoco.py ${ROOT} ${OUT_DIR} --num-img ${NUM_IMG} +""" + +import argparse +import os.path as osp +import shutil + +import mmengine +import numpy as np +from pycocotools.coco import COCO + + +# TODO: Currently only supports coco2017 +def _process_data(args, + in_dataset_type: str, + out_dataset_type: str, + year: str = '2017'): + assert in_dataset_type in ('train', 'val') + assert out_dataset_type in ('train', 'val') + + int_ann_file_name = f'annotations/instances_{in_dataset_type}{year}.json' + out_ann_file_name = f'annotations/instances_{out_dataset_type}{year}.json' + + ann_path = osp.join(args.root, int_ann_file_name) + json_data = mmengine.load(ann_path) + + new_json_data = { + 'info': json_data['info'], + 'licenses': json_data['licenses'], + 'categories': json_data['categories'], + 'images': [], + 'annotations': [] + } + + area_dict = { + 'small': [0., 32 * 32], + 'medium': [32 * 32, 96 * 96], + 'large': [96 * 96, float('inf')] + } + + coco = COCO(ann_path) + + # filter annotations by category ids and area range + areaRng = area_dict[args.area_size] if args.area_size else [] + catIds = coco.getCatIds(args.classes) if args.classes else [] + ann_ids = coco.getAnnIds(catIds=catIds, areaRng=areaRng) + ann_info = coco.loadAnns(ann_ids) + + # get image ids by anns set + filter_img_ids = {ann['image_id'] for ann in ann_info} + filter_img = coco.loadImgs(filter_img_ids) + + # shuffle + np.random.shuffle(filter_img) + + num_img = args.num_img if args.num_img > 0 else len(filter_img) + if num_img > len(filter_img): + print( + f'num_img is too big, will be set to {len(filter_img)}, ' + 'because of not enough image after filter by classes and area_size' + ) + num_img = len(filter_img) + + progress_bar = mmengine.ProgressBar(num_img) + + for i in range(num_img): + file_name = filter_img[i]['file_name'] + image_path = osp.join(args.root, in_dataset_type + year, file_name) + + ann_ids = coco.getAnnIds( + imgIds=[filter_img[i]['id']], catIds=catIds, areaRng=areaRng) + img_ann_info = coco.loadAnns(ann_ids) + + new_json_data['images'].append(filter_img[i]) + new_json_data['annotations'].extend(img_ann_info) + + shutil.copy(image_path, osp.join(args.out_dir, + out_dataset_type + year)) + + progress_bar.update() + + mmengine.dump(new_json_data, osp.join(args.out_dir, out_ann_file_name)) + + +def _make_dirs(out_dir): + mmengine.mkdir_or_exist(out_dir) + mmengine.mkdir_or_exist(osp.join(out_dir, 'annotations')) + mmengine.mkdir_or_exist(osp.join(out_dir, 'train2017')) + mmengine.mkdir_or_exist(osp.join(out_dir, 'val2017')) + + +def parse_args(): + parser = argparse.ArgumentParser(description='Extract coco subset') + parser.add_argument('root', help='root path') + parser.add_argument( + 'out_dir', type=str, help='directory where subset coco 
will be saved.') + parser.add_argument( + '--num-img', + default=50, + type=int, + help='num of extract image, -1 means all images') + parser.add_argument( + '--area-size', + choices=['small', 'medium', 'large'], + help='filter ground-truth info by area size') + parser.add_argument( + '--classes', nargs='+', help='filter ground-truth by class name') + parser.add_argument( + '--use-training-set', + action='store_true', + help='Whether to use the training set when extract the training set. ' + 'The training subset is extracted from the validation set by ' + 'default which can speed up.') + parser.add_argument('--seed', default=-1, type=int, help='seed') + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + assert args.out_dir != args.root, \ + 'The file will be overwritten in place, ' \ + 'so the same folder is not allowed !' + + seed = int(args.seed) + if seed != -1: + print(f'Set the global seed: {seed}') + np.random.seed(int(args.seed)) + + _make_dirs(args.out_dir) + + print('====Start processing train dataset====') + if args.use_training_set: + _process_data(args, 'train', 'train') + else: + _process_data(args, 'val', 'train') + print('\n====Start processing val dataset====') + _process_data(args, 'val', 'val') + print(f'\n Result save to {args.out_dir}') + + +if __name__ == '__main__': + main() diff --git a/third_party/mmyolo/tools/misc/print_config.py b/third_party/mmyolo/tools/misc/print_config.py new file mode 100644 index 0000000000000000000000000000000000000000..2c2efe33d5f388638d8b9c7b21f8a2eab12bd28e --- /dev/null +++ b/third_party/mmyolo/tools/misc/print_config.py @@ -0,0 +1,59 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import os + +from mmdet.utils import replace_cfg_vals, update_data_root +from mmengine import Config, DictAction + + +def parse_args(): + parser = argparse.ArgumentParser(description='Print the whole config') + parser.add_argument('config', help='config file path') + parser.add_argument( + '--save-path', + default=None, + help='save path of whole config, suffixed with .py, .json or .yml') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. 
key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + args = parser.parse_args() + + return args + + +def main(): + args = parse_args() + + cfg = Config.fromfile(args.config) + + # replace the ${key} with the value of cfg.key + cfg = replace_cfg_vals(cfg) + + # update data root according to MMDET_DATASETS + update_data_root(cfg) + + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + print(f'Config:\n{cfg.pretty_text}') + + if args.save_path is not None: + save_path = args.save_path + + suffix = os.path.splitext(save_path)[-1] + assert suffix in ['.py', '.json', '.yml'] + + if not os.path.exists(os.path.split(save_path)[0]): + os.makedirs(os.path.split(save_path)[0]) + cfg.dump(save_path) + print(f'Config saving at {save_path}') + + +if __name__ == '__main__': + main() diff --git a/third_party/mmyolo/tools/misc/publish_model.py b/third_party/mmyolo/tools/misc/publish_model.py new file mode 100644 index 0000000000000000000000000000000000000000..a2ccbf080a4b162fe05d542409eec7d3b6441118 --- /dev/null +++ b/third_party/mmyolo/tools/misc/publish_model.py @@ -0,0 +1,57 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import subprocess + +import torch + + +def parse_args(): + parser = argparse.ArgumentParser( + description='Process a checkpoint to be published') + parser.add_argument('in_file', help='input checkpoint filename') + parser.add_argument('out_file', help='output checkpoint filename') + args = parser.parse_args() + return args + + +def process_checkpoint(in_file, out_file): + checkpoint = torch.load(in_file, map_location='cpu') + + # remove optimizer for smaller file size + if 'optimizer' in checkpoint: + del checkpoint['optimizer'] + if 'message_hub' in checkpoint: + del checkpoint['message_hub'] + if 'ema_state_dict' in checkpoint: + del checkpoint['ema_state_dict'] + + for key in list(checkpoint['state_dict']): + if key.startswith('data_preprocessor'): + checkpoint['state_dict'].pop(key) + elif 'priors_base_sizes' in key: + checkpoint['state_dict'].pop(key) + elif 'grid_offset' in key: + checkpoint['state_dict'].pop(key) + elif 'prior_inds' in key: + checkpoint['state_dict'].pop(key) + + if torch.__version__ >= '1.6': + torch.save(checkpoint, out_file, _use_new_zipfile_serialization=False) + else: + torch.save(checkpoint, out_file) + sha = subprocess.check_output(['sha256sum', out_file]).decode() + if out_file.endswith('.pth'): + out_file_name = out_file[:-4] + else: + out_file_name = out_file + final_file = out_file_name + f'-{sha[:8]}.pth' + subprocess.Popen(['mv', out_file, final_file]) + + +def main(): + args = parse_args() + process_checkpoint(args.in_file, args.out_file) + + +if __name__ == '__main__': + main() diff --git a/third_party/mmyolo/tools/model_converters/convert_kd_ckpt_to_student.py b/third_party/mmyolo/tools/model_converters/convert_kd_ckpt_to_student.py new file mode 100644 index 0000000000000000000000000000000000000000..d2f787e47584d3edbed2269760832670530c146b --- /dev/null +++ b/third_party/mmyolo/tools/model_converters/convert_kd_ckpt_to_student.py @@ -0,0 +1,54 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import argparse +import os.path as osp +from pathlib import Path + +from mmengine.runner import CheckpointLoader, save_checkpoint +from mmengine.utils import mkdir_or_exist + + +def parse_args(): + parser = argparse.ArgumentParser( + description='Convert KD checkpoint to student-only checkpoint') + parser.add_argument('checkpoint', help='input checkpoint filename') + parser.add_argument('--out-path', help='save checkpoint path') + parser.add_argument( + '--inplace', action='store_true', help='replace origin ckpt') + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + checkpoint = CheckpointLoader.load_checkpoint( + args.checkpoint, map_location='cpu') + new_state_dict = dict() + new_meta = checkpoint['meta'] + + for key, value in checkpoint['state_dict'].items(): + if key.startswith('architecture.'): + new_key = key.replace('architecture.', '') + new_state_dict[new_key] = value + + checkpoint = dict() + checkpoint['meta'] = new_meta + checkpoint['state_dict'] = new_state_dict + + if args.inplace: + assert osp.exists(args.checkpoint), \ + 'can not find the checkpoint path: {args.checkpoint}' + save_checkpoint(checkpoint, args.checkpoint) + else: + ckpt_path = Path(args.checkpoint) + ckpt_name = ckpt_path.stem + if args.out_path: + ckpt_dir = Path(args.out_path) + else: + ckpt_dir = ckpt_path.parent + mkdir_or_exist(ckpt_dir) + new_ckpt_path = osp.join(ckpt_dir, f'{ckpt_name}_student.pth') + save_checkpoint(checkpoint, new_ckpt_path) + + +if __name__ == '__main__': + main() diff --git a/third_party/mmyolo/tools/model_converters/ppyoloe_to_mmyolo.py b/third_party/mmyolo/tools/model_converters/ppyoloe_to_mmyolo.py new file mode 100644 index 0000000000000000000000000000000000000000..75c4af6963a8c58834507dd823930b1f9fcab6ac --- /dev/null +++ b/third_party/mmyolo/tools/model_converters/ppyoloe_to_mmyolo.py @@ -0,0 +1,184 @@ +import argparse +import pickle +from collections import OrderedDict + +import torch + + +def convert_bn(k: str): + name = k.replace('._mean', + '.running_mean').replace('._variance', '.running_var') + return name + + +def convert_repvgg(k: str): + if '.conv2.conv1.' in k: + name = k.replace('.conv2.conv1.', '.conv2.rbr_dense.') + return name + elif '.conv2.conv2.' in k: + name = k.replace('.conv2.conv2.', '.conv2.rbr_1x1.') + return name + else: + return k + + +def convert(src: str, dst: str, imagenet_pretrain: bool = False): + with open(src, 'rb') as f: + model = pickle.load(f) + + new_state_dict = OrderedDict() + if imagenet_pretrain: + for k, v in model.items(): + if '@@' in k: + continue + if 'stem.' in k: + # backbone.stem.conv1.conv.weight + # -> backbone.stem.0.conv.weight + org_ind = k.split('.')[1][-1] + new_ind = str(int(org_ind) - 1) + name = k.replace('stem.conv%s.' % org_ind, + 'stem.%s.' % new_ind) + else: + # backbone.stages.1.conv2.bn._variance + # -> backbone.stage2.0.conv2.bn.running_var + org_stage_ind = k.split('.')[1] + new_stage_ind = str(int(org_stage_ind) + 1) + name = k.replace('stages.%s.' % org_stage_ind, + 'stage%s.0.' % new_stage_ind) + name = convert_repvgg(name) + if '.attn.' in k: + name = name.replace('.attn.fc.', '.attn.fc.conv.') + name = convert_bn(name) + name = 'backbone.' + name + + new_state_dict[name] = torch.from_numpy(v) + else: + for k, v in model.items(): + name = k + if k.startswith('backbone.'): + if '.stem.' in k: + # backbone.stem.conv1.conv.weight + # -> backbone.stem.0.conv.weight + org_ind = k.split('.')[2][-1] + new_ind = str(int(org_ind) - 1) + name = k.replace('.stem.conv%s.' 
% org_ind, + '.stem.%s.' % new_ind) + else: + # backbone.stages.1.conv2.bn._variance + # -> backbone.stage2.0.conv2.bn.running_var + org_stage_ind = k.split('.')[2] + new_stage_ind = str(int(org_stage_ind) + 1) + name = k.replace('.stages.%s.' % org_stage_ind, + '.stage%s.0.' % new_stage_ind) + name = convert_repvgg(name) + if '.attn.' in k: + name = name.replace('.attn.fc.', '.attn.fc.conv.') + name = convert_bn(name) + elif k.startswith('neck.'): + # fpn_stages + if k.startswith('neck.fpn_stages.'): + # neck.fpn_stages.0.0.conv1.conv.weight + # -> neck.reduce_layers.2.0.conv1.conv.weight + if k.startswith('neck.fpn_stages.0.0.'): + name = k.replace('neck.fpn_stages.0.0.', + 'neck.reduce_layers.2.0.') + if '.spp.' in name: + name = name.replace('.spp.conv.', '.spp.conv2.') + # neck.fpn_stages.1.0.conv1.conv.weight + # -> neck.top_down_layers.0.0.conv1.conv.weight + elif k.startswith('neck.fpn_stages.1.0.'): + name = k.replace('neck.fpn_stages.1.0.', + 'neck.top_down_layers.0.0.') + elif k.startswith('neck.fpn_stages.2.0.'): + name = k.replace('neck.fpn_stages.2.0.', + 'neck.top_down_layers.1.0.') + else: + raise NotImplementedError('Not implemented.') + name = name.replace('.0.convs.', '.0.blocks.') + elif k.startswith('neck.fpn_routes.'): + # neck.fpn_routes.0.conv.weight + # -> neck.upsample_layers.0.0.conv.weight + index = k.split('.')[2] + name = 'neck.upsample_layers.' + index + '.0.' + '.'.join( + k.split('.')[-2:]) + name = name.replace('.0.convs.', '.0.blocks.') + elif k.startswith('neck.pan_stages.'): + # neck.pan_stages.0.0.conv1.conv.weight + # -> neck.bottom_up_layers.1.0.conv1.conv.weight + ind = k.split('.')[2] + name = k.replace( + 'neck.pan_stages.' + ind, 'neck.bottom_up_layers.' + + ('0' if ind == '1' else '1')) + name = name.replace('.0.convs.', '.0.blocks.') + elif k.startswith('neck.pan_routes.'): + # neck.pan_routes.0.conv.weight + # -> neck.downsample_layers.0.conv.weight + ind = k.split('.')[2] + name = k.replace( + 'neck.pan_routes.' + ind, 'neck.downsample_layers.' + + ('0' if ind == '1' else '1')) + name = name.replace('.0.convs.', '.0.blocks.') + + else: + raise NotImplementedError('Not implement.') + name = convert_repvgg(name) + name = convert_bn(name) + elif k.startswith('yolo_head.'): + if ('anchor_points' in k) or ('stride_tensor' in k): + continue + if 'proj_conv' in k: + name = k.replace('yolo_head.proj_conv.', + 'bbox_head.head_module.proj_conv.') + else: + for org_key, rep_key in [ + [ + 'yolo_head.stem_cls.', + 'bbox_head.head_module.cls_stems.' + ], + [ + 'yolo_head.stem_reg.', + 'bbox_head.head_module.reg_stems.' + ], + [ + 'yolo_head.pred_cls.', + 'bbox_head.head_module.cls_preds.' + ], + [ + 'yolo_head.pred_reg.', + 'bbox_head.head_module.reg_preds.' 
+ ] + ]: + name = name.replace(org_key, rep_key) + name = name.split('.') + ind = name[3] + name[3] = str(2 - int(ind)) + name = '.'.join(name) + name = convert_bn(name) + else: + continue + + new_state_dict[name] = torch.from_numpy(v) + data = {'state_dict': new_state_dict} + torch.save(data, dst) + + +def main(): + parser = argparse.ArgumentParser(description='Convert model keys') + parser.add_argument( + '--src', + default='ppyoloe_plus_crn_s_80e_coco.pdparams', + help='src ppyoloe model path') + parser.add_argument( + '--dst', default='mmppyoloe_plus_s.pt', help='save path') + parser.add_argument( + '--imagenet-pretrain', + action='store_true', + default=False, + help='Load model pretrained on imagenet dataset which only ' + 'have weight for backbone.') + args = parser.parse_args() + convert(args.src, args.dst, args.imagenet_pretrain) + + +if __name__ == '__main__': + main() diff --git a/third_party/mmyolo/tools/model_converters/rtmdet_to_mmyolo.py b/third_party/mmyolo/tools/model_converters/rtmdet_to_mmyolo.py new file mode 100644 index 0000000000000000000000000000000000000000..9c6f237d44464fdfb8882c898f332ef51ba12ae8 --- /dev/null +++ b/third_party/mmyolo/tools/model_converters/rtmdet_to_mmyolo.py @@ -0,0 +1,61 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +from collections import OrderedDict + +import torch + + +def convert(src, dst): + """Convert keys in pretrained RTMDet models to MMYOLO style.""" + blobs = torch.load(src)['state_dict'] + state_dict = OrderedDict() + + for key, weight in blobs.items(): + if 'neck.reduce_layers.0' in key: + new_key = key.replace('.0', '.2') + state_dict[new_key] = weight + elif 'neck.reduce_layers.1' in key: + new_key = key.replace('reduce_layers.1', 'top_down_layers.0.1') + state_dict[new_key] = weight + elif 'neck.top_down_blocks.0' in key: + new_key = key.replace('down_blocks', 'down_layers.0') + state_dict[new_key] = weight + elif 'neck.top_down_blocks.1' in key: + new_key = key.replace('down_blocks', 'down_layers') + state_dict[new_key] = weight + elif 'downsamples' in key: + new_key = key.replace('downsamples', 'downsample_layers') + state_dict[new_key] = weight + elif 'bottom_up_blocks' in key: + new_key = key.replace('bottom_up_blocks', 'bottom_up_layers') + state_dict[new_key] = weight + elif 'out_convs' in key: + new_key = key.replace('out_convs', 'out_layers') + state_dict[new_key] = weight + elif 'bbox_head' in key: + new_key = key.replace('bbox_head', 'bbox_head.head_module') + state_dict[new_key] = weight + elif 'data_preprocessor' in key: + continue + else: + new_key = key + state_dict[new_key] = weight + print(f'Convert {key} to {new_key}') + + # save checkpoint + checkpoint = dict() + checkpoint['state_dict'] = state_dict + checkpoint['meta'] = blobs.get('meta') + torch.save(checkpoint, dst) + + +def main(): + parser = argparse.ArgumentParser(description='Convert model keys') + parser.add_argument('src', help='src rtm model path') + parser.add_argument('dst', help='save path') + args = parser.parse_args() + convert(args.src, args.dst) + + +if __name__ == '__main__': + main() diff --git a/third_party/mmyolo/tools/model_converters/yolov5_to_mmyolo.py b/third_party/mmyolo/tools/model_converters/yolov5_to_mmyolo.py new file mode 100644 index 0000000000000000000000000000000000000000..a4e62a2f7787444862990e35d1fb20c0be9f0961 --- /dev/null +++ b/third_party/mmyolo/tools/model_converters/yolov5_to_mmyolo.py @@ -0,0 +1,128 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
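+# Convert official ultralytics/yolov5 checkpoints to MMYOLO-style state dict
+# keys. As noted at the bottom of this file, the script must be run from
+# inside the yolov5 repo so that the pickled model can be rebuilt.
+#
+# Illustrative usage (file names are placeholders):
+#   python yolov5_to_mmyolo.py --src yolov5s.pt --dst mmyolov5s.pt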
+import argparse +from collections import OrderedDict + +import torch + +convert_dict_p5 = { + 'model.0': 'backbone.stem', + 'model.1': 'backbone.stage1.0', + 'model.2': 'backbone.stage1.1', + 'model.3': 'backbone.stage2.0', + 'model.4': 'backbone.stage2.1', + 'model.5': 'backbone.stage3.0', + 'model.6': 'backbone.stage3.1', + 'model.7': 'backbone.stage4.0', + 'model.8': 'backbone.stage4.1', + 'model.9.cv1': 'backbone.stage4.2.conv1', + 'model.9.cv2': 'backbone.stage4.2.conv2', + 'model.10': 'neck.reduce_layers.2', + 'model.13': 'neck.top_down_layers.0.0', + 'model.14': 'neck.top_down_layers.0.1', + 'model.17': 'neck.top_down_layers.1', + 'model.18': 'neck.downsample_layers.0', + 'model.20': 'neck.bottom_up_layers.0', + 'model.21': 'neck.downsample_layers.1', + 'model.23': 'neck.bottom_up_layers.1', + 'model.24.m': 'bbox_head.head_module.convs_pred', + 'model.24.proto': 'bbox_head.head_module.proto_preds', +} + +convert_dict_p6 = { + 'model.0': 'backbone.stem', + 'model.1': 'backbone.stage1.0', + 'model.2': 'backbone.stage1.1', + 'model.3': 'backbone.stage2.0', + 'model.4': 'backbone.stage2.1', + 'model.5': 'backbone.stage3.0', + 'model.6': 'backbone.stage3.1', + 'model.7': 'backbone.stage4.0', + 'model.8': 'backbone.stage4.1', + 'model.9': 'backbone.stage5.0', + 'model.10': 'backbone.stage5.1', + 'model.11.cv1': 'backbone.stage5.2.conv1', + 'model.11.cv2': 'backbone.stage5.2.conv2', + 'model.12': 'neck.reduce_layers.3', + 'model.15': 'neck.top_down_layers.0.0', + 'model.16': 'neck.top_down_layers.0.1', + 'model.19': 'neck.top_down_layers.1.0', + 'model.20': 'neck.top_down_layers.1.1', + 'model.23': 'neck.top_down_layers.2', + 'model.24': 'neck.downsample_layers.0', + 'model.26': 'neck.bottom_up_layers.0', + 'model.27': 'neck.downsample_layers.1', + 'model.29': 'neck.bottom_up_layers.1', + 'model.30': 'neck.downsample_layers.2', + 'model.32': 'neck.bottom_up_layers.2', + 'model.33.m': 'bbox_head.head_module.convs_pred', + 'model.33.proto': 'bbox_head.head_module.proto_preds', +} + + +def convert(src, dst): + """Convert keys in pretrained YOLOv5 models to mmyolo style.""" + if src.endswith('6.pt'): + convert_dict = convert_dict_p6 + is_p6_model = True + print('Converting P6 model') + else: + convert_dict = convert_dict_p5 + is_p6_model = False + print('Converting P5 model') + try: + yolov5_model = torch.load(src)['model'] + blobs = yolov5_model.state_dict() + except ModuleNotFoundError: + raise RuntimeError( + 'This script must be placed under the ultralytics/yolov5 repo,' + ' because loading the official pretrained model need' + ' `model.py` to build model.') + state_dict = OrderedDict() + + for key, weight in blobs.items(): + + num, module = key.split('.')[1:3] + if (is_p6_model and + (num == '11' or num == '33')) or (not is_p6_model and + (num == '9' or num == '24')): + if module == 'anchors': + continue + prefix = f'model.{num}.{module}' + else: + prefix = f'model.{num}' + + new_key = key.replace(prefix, convert_dict[prefix]) + + if '.m.' 
in new_key: + new_key = new_key.replace('.m.', '.blocks.') + new_key = new_key.replace('.cv', '.conv') + elif 'bbox_head.head_module.proto_preds.cv' in new_key: + new_key = new_key.replace( + 'bbox_head.head_module.proto_preds.cv', + 'bbox_head.head_module.proto_preds.conv') + else: + new_key = new_key.replace('.cv1', '.main_conv') + new_key = new_key.replace('.cv2', '.short_conv') + new_key = new_key.replace('.cv3', '.final_conv') + + state_dict[new_key] = weight + print(f'Convert {key} to {new_key}') + + # save checkpoint + checkpoint = dict() + checkpoint['state_dict'] = state_dict + torch.save(checkpoint, dst) + + +# Note: This script must be placed under the yolov5 repo to run. +def main(): + parser = argparse.ArgumentParser(description='Convert model keys') + parser.add_argument( + '--src', default='yolov5s.pt', help='src yolov5 model path') + parser.add_argument('--dst', default='mmyolov5s.pt', help='save path') + args = parser.parse_args() + convert(args.src, args.dst) + + +if __name__ == '__main__': + main() diff --git a/third_party/mmyolo/tools/model_converters/yolov5u_to_mmyolo.py b/third_party/mmyolo/tools/model_converters/yolov5u_to_mmyolo.py new file mode 100644 index 0000000000000000000000000000000000000000..806c76cb47b17a3b0291f80e092e7b8d9856a0ab --- /dev/null +++ b/third_party/mmyolo/tools/model_converters/yolov5u_to_mmyolo.py @@ -0,0 +1,88 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +from collections import OrderedDict + +import torch + +convert_dict_p5 = { + 'model.0': 'backbone.stem', + 'model.1': 'backbone.stage1.0', + 'model.2': 'backbone.stage1.1', + 'model.3': 'backbone.stage2.0', + 'model.4': 'backbone.stage2.1', + 'model.5': 'backbone.stage3.0', + 'model.6': 'backbone.stage3.1', + 'model.7': 'backbone.stage4.0', + 'model.8': 'backbone.stage4.1', + 'model.9': 'backbone.stage4.2', + 'model.10': 'neck.reduce_layers.2', + 'model.13': 'neck.top_down_layers.0.0', + 'model.14': 'neck.top_down_layers.0.1', + 'model.17': 'neck.top_down_layers.1', + 'model.18': 'neck.downsample_layers.0', + 'model.20': 'neck.bottom_up_layers.0', + 'model.21': 'neck.downsample_layers.1', + 'model.23': 'neck.bottom_up_layers.1', + 'model.24': 'bbox_head.head_module', +} + + +def convert(src, dst): + """Convert keys in pretrained YOLOv5u models to mmyolo style.""" + convert_dict = convert_dict_p5 + + print('Converting P5 model') + try: + yolov5_model = torch.load(src)['model'] + blobs = yolov5_model.state_dict() + except ModuleNotFoundError: + raise RuntimeError( + 'This script must be placed under the ultralytics repo,' + ' because loading the official pretrained model need' + ' `model.py` to build model.') + state_dict = OrderedDict() + + for key, weight in blobs.items(): + + num, module = key.split('.')[1:3] + prefix = f'model.{num}' + new_key = key.replace(prefix, convert_dict[prefix]) + + if '.m.' 
in new_key: + new_key = new_key.replace('.m.', '.blocks.') + new_key = new_key.replace('.cv', '.conv') + elif 'bbox_head.head_module' in new_key: + new_key = new_key.replace('.cv2', '.reg_preds') + new_key = new_key.replace('.cv3', '.cls_preds') + elif 'backbone.stage4.2' in new_key: + new_key = new_key.replace('.cv', '.conv') + else: + new_key = new_key.replace('.cv1', '.main_conv') + new_key = new_key.replace('.cv2', '.short_conv') + new_key = new_key.replace('.cv3', '.final_conv') + + if 'bbox_head.head_module.dfl.conv.weight' == new_key: + print('Drop "bbox_head.head_module.dfl.conv.weight", ' + 'because it is useless') + continue + state_dict[new_key] = weight + print(f'Convert {key} to {new_key}') + + # save checkpoint + checkpoint = dict() + checkpoint['state_dict'] = state_dict + torch.save(checkpoint, dst) + + +# Note: This script must be placed under the ultralytics repo to run. +def main(): + parser = argparse.ArgumentParser(description='Convert model keys') + parser.add_argument( + '--src', default='yolov5su.pt', help='src yolov5u model path') + parser.add_argument('--dst', default='mmyolov5su.pth', help='save path') + args = parser.parse_args() + convert(args.src, args.dst) + + +if __name__ == '__main__': + main() diff --git a/third_party/mmyolo/tools/model_converters/yolov6_to_mmyolo.py b/third_party/mmyolo/tools/model_converters/yolov6_to_mmyolo.py new file mode 100644 index 0000000000000000000000000000000000000000..e9e86ab46d6cae30aede92ea3598291fbcd009a7 --- /dev/null +++ b/third_party/mmyolo/tools/model_converters/yolov6_to_mmyolo.py @@ -0,0 +1,115 @@ +import argparse +from collections import OrderedDict + +import torch + + +def convert(src, dst): + import sys + sys.path.append('yolov6') + try: + ckpt = torch.load(src, map_location=torch.device('cpu')) + except ModuleNotFoundError: + raise RuntimeError( + 'This script must be placed under the meituan/YOLOv6 repo,' + ' because loading the official pretrained model need' + ' some python files to build model.') + # The saved model is the model before reparameterization + model = ckpt['ema' if ckpt.get('ema') else 'model'].float() + new_state_dict = OrderedDict() + for k, v in model.state_dict().items(): + name = k + if 'detect' in k: + if 'proj' in k: + continue + name = k.replace('detect', 'bbox_head.head_module') + if k.find('anchors') >= 0 or k.find('anchor_grid') >= 0: + continue + + if 'ERBlock_2' in k: + name = k.replace('ERBlock_2', 'stage1.0') + if '.cv' in k: + name = name.replace('.cv', '.conv') + if '.m.' in k: + name = name.replace('.m.', '.block.') + elif 'ERBlock_3' in k: + name = k.replace('ERBlock_3', 'stage2.0') + if '.cv' in k: + name = name.replace('.cv', '.conv') + if '.m.' in k: + name = name.replace('.m.', '.block.') + elif 'ERBlock_4' in k: + name = k.replace('ERBlock_4', 'stage3.0') + if '.cv' in k: + name = name.replace('.cv', '.conv') + if '.m.' in k: + name = name.replace('.m.', '.block.') + elif 'ERBlock_5' in k: + name = k.replace('ERBlock_5', 'stage4.0') + if '.cv' in k: + name = name.replace('.cv', '.conv') + if '.m.' in k: + name = name.replace('.m.', '.block.') + if 'stage4.0.2' in name: + name = name.replace('stage4.0.2', 'stage4.1') + name = name.replace('cv', 'conv') + elif 'reduce_layer0' in k: + name = k.replace('reduce_layer0', 'reduce_layers.2') + elif 'Rep_p4' in k: + name = k.replace('Rep_p4', 'top_down_layers.0.0') + if '.cv' in k: + name = name.replace('.cv', '.conv') + if '.m.' 
in k: + name = name.replace('.m.', '.block.') + elif 'reduce_layer1' in k: + name = k.replace('reduce_layer1', 'top_down_layers.0.1') + if '.cv' in k: + name = name.replace('.cv', '.conv') + if '.m.' in k: + name = name.replace('.m.', '.block.') + elif 'Rep_p3' in k: + name = k.replace('Rep_p3', 'top_down_layers.1') + if '.cv' in k: + name = name.replace('.cv', '.conv') + if '.m.' in k: + name = name.replace('.m.', '.block.') + elif 'upsample0' in k: + name = k.replace('upsample0.upsample_transpose', + 'upsample_layers.0') + elif 'upsample1' in k: + name = k.replace('upsample1.upsample_transpose', + 'upsample_layers.1') + elif 'Rep_n3' in k: + name = k.replace('Rep_n3', 'bottom_up_layers.0') + if '.cv' in k: + name = name.replace('.cv', '.conv') + if '.m.' in k: + name = name.replace('.m.', '.block.') + elif 'Rep_n4' in k: + name = k.replace('Rep_n4', 'bottom_up_layers.1') + if '.cv' in k: + name = name.replace('.cv', '.conv') + if '.m.' in k: + name = name.replace('.m.', '.block.') + elif 'downsample2' in k: + name = k.replace('downsample2', 'downsample_layers.0') + elif 'downsample1' in k: + name = k.replace('downsample1', 'downsample_layers.1') + + new_state_dict[name] = v + data = {'state_dict': new_state_dict} + torch.save(data, dst) + + +# Note: This script must be placed under the yolov6 repo to run. +def main(): + parser = argparse.ArgumentParser(description='Convert model keys') + parser.add_argument( + '--src', default='yolov6s.pt', help='src yolov6 model path') + parser.add_argument('--dst', default='mmyolov6.pt', help='save path') + args = parser.parse_args() + convert(args.src, args.dst) + + +if __name__ == '__main__': + main() diff --git a/third_party/mmyolo/tools/model_converters/yolov6_v3_to_mmyolo.py b/third_party/mmyolo/tools/model_converters/yolov6_v3_to_mmyolo.py new file mode 100644 index 0000000000000000000000000000000000000000..bc87664241eb699454c165aa1d760d1da910f7dd --- /dev/null +++ b/third_party/mmyolo/tools/model_converters/yolov6_v3_to_mmyolo.py @@ -0,0 +1,145 @@ +import argparse +from collections import OrderedDict + +import torch + + +def convert(src, dst): + import sys + sys.path.append('yolov6') + try: + ckpt = torch.load(src, map_location=torch.device('cpu')) + except ModuleNotFoundError: + raise RuntimeError( + 'This script must be placed under the meituan/YOLOv6 repo,' + ' because loading the official pretrained model need' + ' some python files to build model.') + # The saved model is the model before reparameterization + model = ckpt['ema' if ckpt.get('ema') else 'model'].float() + new_state_dict = OrderedDict() + is_ns = False + for k, v in model.state_dict().items(): + name = k + if 'detect' in k: + if 'proj' in k: + continue + if 'reg_preds_lrtb' in k: + is_ns = True + name = k.replace('detect', 'bbox_head.head_module') + if k.find('anchors') >= 0 or k.find('anchor_grid') >= 0: + continue + + if 'ERBlock_2' in k: + name = k.replace('ERBlock_2', 'stage1.0') + if '.cv' in k: + name = name.replace('.cv', '.conv') + if '.m.' in k: + name = name.replace('.m.', '.block.') + elif 'ERBlock_3' in k: + name = k.replace('ERBlock_3', 'stage2.0') + if '.cv' in k: + name = name.replace('.cv', '.conv') + if '.m.' in k: + name = name.replace('.m.', '.block.') + elif 'ERBlock_4' in k: + name = k.replace('ERBlock_4', 'stage3.0') + if '.cv' in k: + name = name.replace('.cv', '.conv') + if '.m.' 
in k: + name = name.replace('.m.', '.block.') + elif 'ERBlock_5' in k: + name = k.replace('ERBlock_5', 'stage4.0') + if '.cv' in k: + name = name.replace('.cv', '.conv') + if '.m.' in k: + name = name.replace('.m.', '.block.') + if 'stage4.0.2' in name: + name = name.replace('stage4.0.2', 'stage4.1') + name = name.replace('cv', 'conv') + elif 'reduce_layer0' in k: + name = k.replace('reduce_layer0', 'reduce_layers.2') + elif 'Rep_p4' in k: + name = k.replace('Rep_p4', 'top_down_layers.0.0') + if '.cv' in k: + name = name.replace('.cv', '.conv') + if '.m.' in k: + name = name.replace('.m.', '.block.') + elif 'reduce_layer1' in k: + name = k.replace('reduce_layer1', 'top_down_layers.0.1') + if '.cv' in k: + name = name.replace('.cv', '.conv') + if '.m.' in k: + name = name.replace('.m.', '.block.') + elif 'Rep_p3' in k: + name = k.replace('Rep_p3', 'top_down_layers.1') + if '.cv' in k: + name = name.replace('.cv', '.conv') + if '.m.' in k: + name = name.replace('.m.', '.block.') + elif 'Bifusion0' in k: + name = k.replace('Bifusion0', 'upsample_layers.0') + if '.cv' in k: + name = name.replace('.cv', '.conv') + if '.m.' in k: + name = name.replace('.m.', '.block.') + if '.upsample_transpose.' in k: + name = name.replace('.upsample_transpose.', '.') + elif 'Bifusion1' in k: + name = k.replace('Bifusion1', 'upsample_layers.1') + if '.cv' in k: + name = name.replace('.cv', '.conv') + if '.m.' in k: + name = name.replace('.m.', '.block.') + if '.upsample_transpose.' in k: + name = name.replace('.upsample_transpose.', '.') + elif 'Rep_n3' in k: + name = k.replace('Rep_n3', 'bottom_up_layers.0') + if '.cv' in k: + name = name.replace('.cv', '.conv') + if '.m.' in k: + name = name.replace('.m.', '.block.') + elif 'Rep_n4' in k: + name = k.replace('Rep_n4', 'bottom_up_layers.1') + if '.cv' in k: + name = name.replace('.cv', '.conv') + if '.m.' in k: + name = name.replace('.m.', '.block.') + elif 'downsample2' in k: + name = k.replace('downsample2', 'downsample_layers.0') + elif 'downsample1' in k: + name = k.replace('downsample1', 'downsample_layers.1') + + new_state_dict[name] = v + + # The yolov6_v3_n/s has two regression heads. + # One called 'reg_preds_lrtb' is a regular anchor-free head, + # which is used for inference. + # One called 'reg_preds' is a DFL style head, which + # is only used in training. + if is_ns: + tmp_state_dict = OrderedDict() + for k, v in new_state_dict.items(): + name = k + if 'reg_preds_lrtb' in k: + name = k.replace('reg_preds_lrtb', 'reg_preds') + elif 'reg_preds' in k: + name = k.replace('reg_preds', 'distill_ns_head') + tmp_state_dict[name] = v + new_state_dict = tmp_state_dict + + data = {'state_dict': new_state_dict} + torch.save(data, dst) + + +# Note: This script must be placed under the yolov6 repo to run. +def main(): + parser = argparse.ArgumentParser(description='Convert model keys') + parser.add_argument( + '--src', default='yolov6s.pt', help='src yolov6 model path') + parser.add_argument('--dst', default='mmyolov6.pt', help='save path') + args = parser.parse_args() + convert(args.src, args.dst) + + +if __name__ == '__main__': + main() diff --git a/third_party/mmyolo/tools/model_converters/yolov7_to_mmyolo.py b/third_party/mmyolo/tools/model_converters/yolov7_to_mmyolo.py new file mode 100644 index 0000000000000000000000000000000000000000..1c1f54d0cbf0375dc026c8e6fb234ce9335d85cc --- /dev/null +++ b/third_party/mmyolo/tools/model_converters/yolov7_to_mmyolo.py @@ -0,0 +1,1093 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
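+# Key-mapping tables for converting official YOLOv7 checkpoints (tiny / l / x)
+# to MMYOLO-style module names: each 'model.N' entry of the original state
+# dict is mapped onto the corresponding backbone / neck / bbox_head submodule,
+# analogous to the other *_to_mmyolo converters in this directory.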
+import argparse +import os.path as osp +from collections import OrderedDict + +import torch + +convert_dict_tiny = { + # stem + 'model.0': 'backbone.stem.0', + 'model.1': 'backbone.stem.1', + + # stage1 TinyDownSampleBlock + 'model.2': 'backbone.stage1.0.short_conv', + 'model.3': 'backbone.stage1.0.main_convs.0', + 'model.4': 'backbone.stage1.0.main_convs.1', + 'model.5': 'backbone.stage1.0.main_convs.2', + 'model.7': 'backbone.stage1.0.final_conv', + + # stage2 TinyDownSampleBlock + 'model.9': 'backbone.stage2.1.short_conv', + 'model.10': 'backbone.stage2.1.main_convs.0', + 'model.11': 'backbone.stage2.1.main_convs.1', + 'model.12': 'backbone.stage2.1.main_convs.2', + 'model.14': 'backbone.stage2.1.final_conv', + + # stage3 TinyDownSampleBlock + 'model.16': 'backbone.stage3.1.short_conv', + 'model.17': 'backbone.stage3.1.main_convs.0', + 'model.18': 'backbone.stage3.1.main_convs.1', + 'model.19': 'backbone.stage3.1.main_convs.2', + 'model.21': 'backbone.stage3.1.final_conv', + + # stage4 TinyDownSampleBlock + 'model.23': 'backbone.stage4.1.short_conv', + 'model.24': 'backbone.stage4.1.main_convs.0', + 'model.25': 'backbone.stage4.1.main_convs.1', + 'model.26': 'backbone.stage4.1.main_convs.2', + 'model.28': 'backbone.stage4.1.final_conv', + + # neck SPPCSPBlock + 'model.29': 'neck.reduce_layers.2.short_layer', + 'model.30': 'neck.reduce_layers.2.main_layers', + 'model.35': 'neck.reduce_layers.2.fuse_layers', + 'model.37': 'neck.reduce_layers.2.final_conv', + 'model.38': 'neck.upsample_layers.0.0', + 'model.40': 'neck.reduce_layers.1', + 'model.42': 'neck.top_down_layers.0.short_conv', + 'model.43': 'neck.top_down_layers.0.main_convs.0', + 'model.44': 'neck.top_down_layers.0.main_convs.1', + 'model.45': 'neck.top_down_layers.0.main_convs.2', + 'model.47': 'neck.top_down_layers.0.final_conv', + 'model.48': 'neck.upsample_layers.1.0', + 'model.50': 'neck.reduce_layers.0', + 'model.52': 'neck.top_down_layers.1.short_conv', + 'model.53': 'neck.top_down_layers.1.main_convs.0', + 'model.54': 'neck.top_down_layers.1.main_convs.1', + 'model.55': 'neck.top_down_layers.1.main_convs.2', + 'model.57': 'neck.top_down_layers.1.final_conv', + 'model.58': 'neck.downsample_layers.0', + 'model.60': 'neck.bottom_up_layers.0.short_conv', + 'model.61': 'neck.bottom_up_layers.0.main_convs.0', + 'model.62': 'neck.bottom_up_layers.0.main_convs.1', + 'model.63': 'neck.bottom_up_layers.0.main_convs.2', + 'model.65': 'neck.bottom_up_layers.0.final_conv', + 'model.66': 'neck.downsample_layers.1', + 'model.68': 'neck.bottom_up_layers.1.short_conv', + 'model.69': 'neck.bottom_up_layers.1.main_convs.0', + 'model.70': 'neck.bottom_up_layers.1.main_convs.1', + 'model.71': 'neck.bottom_up_layers.1.main_convs.2', + 'model.73': 'neck.bottom_up_layers.1.final_conv', + 'model.74': 'neck.out_layers.0', + 'model.75': 'neck.out_layers.1', + 'model.76': 'neck.out_layers.2', + + # head + 'model.77.m.0': 'bbox_head.head_module.convs_pred.0.1', + 'model.77.m.1': 'bbox_head.head_module.convs_pred.1.1', + 'model.77.m.2': 'bbox_head.head_module.convs_pred.2.1' +} + +convert_dict_l = { + # stem + 'model.0': 'backbone.stem.0', + 'model.1': 'backbone.stem.1', + 'model.2': 'backbone.stem.2', + + # stage1 + # ConvModule + 'model.3': 'backbone.stage1.0', + # ELANBlock expand_channel_2x + 'model.4': 'backbone.stage1.1.short_conv', + 'model.5': 'backbone.stage1.1.main_conv', + 'model.6': 'backbone.stage1.1.blocks.0.0', + 'model.7': 'backbone.stage1.1.blocks.0.1', + 'model.8': 'backbone.stage1.1.blocks.1.0', + 'model.9': 
'backbone.stage1.1.blocks.1.1', + 'model.11': 'backbone.stage1.1.final_conv', + + # stage2 + # MaxPoolBlock reduce_channel_2x + 'model.13': 'backbone.stage2.0.maxpool_branches.1', + 'model.14': 'backbone.stage2.0.stride_conv_branches.0', + 'model.15': 'backbone.stage2.0.stride_conv_branches.1', + # ELANBlock expand_channel_2x + 'model.17': 'backbone.stage2.1.short_conv', + 'model.18': 'backbone.stage2.1.main_conv', + 'model.19': 'backbone.stage2.1.blocks.0.0', + 'model.20': 'backbone.stage2.1.blocks.0.1', + 'model.21': 'backbone.stage2.1.blocks.1.0', + 'model.22': 'backbone.stage2.1.blocks.1.1', + 'model.24': 'backbone.stage2.1.final_conv', + + # stage3 + # MaxPoolBlock reduce_channel_2x + 'model.26': 'backbone.stage3.0.maxpool_branches.1', + 'model.27': 'backbone.stage3.0.stride_conv_branches.0', + 'model.28': 'backbone.stage3.0.stride_conv_branches.1', + # ELANBlock expand_channel_2x + 'model.30': 'backbone.stage3.1.short_conv', + 'model.31': 'backbone.stage3.1.main_conv', + 'model.32': 'backbone.stage3.1.blocks.0.0', + 'model.33': 'backbone.stage3.1.blocks.0.1', + 'model.34': 'backbone.stage3.1.blocks.1.0', + 'model.35': 'backbone.stage3.1.blocks.1.1', + 'model.37': 'backbone.stage3.1.final_conv', + + # stage4 + # MaxPoolBlock reduce_channel_2x + 'model.39': 'backbone.stage4.0.maxpool_branches.1', + 'model.40': 'backbone.stage4.0.stride_conv_branches.0', + 'model.41': 'backbone.stage4.0.stride_conv_branches.1', + # ELANBlock no_change_channel + 'model.43': 'backbone.stage4.1.short_conv', + 'model.44': 'backbone.stage4.1.main_conv', + 'model.45': 'backbone.stage4.1.blocks.0.0', + 'model.46': 'backbone.stage4.1.blocks.0.1', + 'model.47': 'backbone.stage4.1.blocks.1.0', + 'model.48': 'backbone.stage4.1.blocks.1.1', + 'model.50': 'backbone.stage4.1.final_conv', + + # neck SPPCSPBlock + 'model.51.cv1': 'neck.reduce_layers.2.main_layers.0', + 'model.51.cv3': 'neck.reduce_layers.2.main_layers.1', + 'model.51.cv4': 'neck.reduce_layers.2.main_layers.2', + 'model.51.cv5': 'neck.reduce_layers.2.fuse_layers.0', + 'model.51.cv6': 'neck.reduce_layers.2.fuse_layers.1', + 'model.51.cv2': 'neck.reduce_layers.2.short_layer', + 'model.51.cv7': 'neck.reduce_layers.2.final_conv', + + # neck + 'model.52': 'neck.upsample_layers.0.0', + 'model.54': 'neck.reduce_layers.1', + + # neck ELANBlock reduce_channel_2x + 'model.56': 'neck.top_down_layers.0.short_conv', + 'model.57': 'neck.top_down_layers.0.main_conv', + 'model.58': 'neck.top_down_layers.0.blocks.0', + 'model.59': 'neck.top_down_layers.0.blocks.1', + 'model.60': 'neck.top_down_layers.0.blocks.2', + 'model.61': 'neck.top_down_layers.0.blocks.3', + 'model.63': 'neck.top_down_layers.0.final_conv', + 'model.64': 'neck.upsample_layers.1.0', + 'model.66': 'neck.reduce_layers.0', + + # neck ELANBlock reduce_channel_2x + 'model.68': 'neck.top_down_layers.1.short_conv', + 'model.69': 'neck.top_down_layers.1.main_conv', + 'model.70': 'neck.top_down_layers.1.blocks.0', + 'model.71': 'neck.top_down_layers.1.blocks.1', + 'model.72': 'neck.top_down_layers.1.blocks.2', + 'model.73': 'neck.top_down_layers.1.blocks.3', + 'model.75': 'neck.top_down_layers.1.final_conv', + + # neck MaxPoolBlock no_change_channel + 'model.77': 'neck.downsample_layers.0.maxpool_branches.1', + 'model.78': 'neck.downsample_layers.0.stride_conv_branches.0', + 'model.79': 'neck.downsample_layers.0.stride_conv_branches.1', + + # neck ELANBlock reduce_channel_2x + 'model.81': 'neck.bottom_up_layers.0.short_conv', + 'model.82': 'neck.bottom_up_layers.0.main_conv', + 'model.83': 
'neck.bottom_up_layers.0.blocks.0', + 'model.84': 'neck.bottom_up_layers.0.blocks.1', + 'model.85': 'neck.bottom_up_layers.0.blocks.2', + 'model.86': 'neck.bottom_up_layers.0.blocks.3', + 'model.88': 'neck.bottom_up_layers.0.final_conv', + + # neck MaxPoolBlock no_change_channel + 'model.90': 'neck.downsample_layers.1.maxpool_branches.1', + 'model.91': 'neck.downsample_layers.1.stride_conv_branches.0', + 'model.92': 'neck.downsample_layers.1.stride_conv_branches.1', + + # neck ELANBlock reduce_channel_2x + 'model.94': 'neck.bottom_up_layers.1.short_conv', + 'model.95': 'neck.bottom_up_layers.1.main_conv', + 'model.96': 'neck.bottom_up_layers.1.blocks.0', + 'model.97': 'neck.bottom_up_layers.1.blocks.1', + 'model.98': 'neck.bottom_up_layers.1.blocks.2', + 'model.99': 'neck.bottom_up_layers.1.blocks.3', + 'model.101': 'neck.bottom_up_layers.1.final_conv', + + # RepVGGBlock + 'model.102.rbr_dense.0': 'neck.out_layers.0.rbr_dense.conv', + 'model.102.rbr_dense.1': 'neck.out_layers.0.rbr_dense.bn', + 'model.102.rbr_1x1.0': 'neck.out_layers.0.rbr_1x1.conv', + 'model.102.rbr_1x1.1': 'neck.out_layers.0.rbr_1x1.bn', + 'model.103.rbr_dense.0': 'neck.out_layers.1.rbr_dense.conv', + 'model.103.rbr_dense.1': 'neck.out_layers.1.rbr_dense.bn', + 'model.103.rbr_1x1.0': 'neck.out_layers.1.rbr_1x1.conv', + 'model.103.rbr_1x1.1': 'neck.out_layers.1.rbr_1x1.bn', + 'model.104.rbr_dense.0': 'neck.out_layers.2.rbr_dense.conv', + 'model.104.rbr_dense.1': 'neck.out_layers.2.rbr_dense.bn', + 'model.104.rbr_1x1.0': 'neck.out_layers.2.rbr_1x1.conv', + 'model.104.rbr_1x1.1': 'neck.out_layers.2.rbr_1x1.bn', + + # head + 'model.105.m.0': 'bbox_head.head_module.convs_pred.0.1', + 'model.105.m.1': 'bbox_head.head_module.convs_pred.1.1', + 'model.105.m.2': 'bbox_head.head_module.convs_pred.2.1' +} + +convert_dict_x = { + # stem + 'model.0': 'backbone.stem.0', + 'model.1': 'backbone.stem.1', + 'model.2': 'backbone.stem.2', + + # stage1 + # ConvModule + 'model.3': 'backbone.stage1.0', + # ELANBlock expand_channel_2x + 'model.4': 'backbone.stage1.1.short_conv', + 'model.5': 'backbone.stage1.1.main_conv', + 'model.6': 'backbone.stage1.1.blocks.0.0', + 'model.7': 'backbone.stage1.1.blocks.0.1', + 'model.8': 'backbone.stage1.1.blocks.1.0', + 'model.9': 'backbone.stage1.1.blocks.1.1', + 'model.10': 'backbone.stage1.1.blocks.2.0', + 'model.11': 'backbone.stage1.1.blocks.2.1', + 'model.13': 'backbone.stage1.1.final_conv', + + # stage2 + # MaxPoolBlock reduce_channel_2x + 'model.15': 'backbone.stage2.0.maxpool_branches.1', + 'model.16': 'backbone.stage2.0.stride_conv_branches.0', + 'model.17': 'backbone.stage2.0.stride_conv_branches.1', + + # ELANBlock expand_channel_2x + 'model.19': 'backbone.stage2.1.short_conv', + 'model.20': 'backbone.stage2.1.main_conv', + 'model.21': 'backbone.stage2.1.blocks.0.0', + 'model.22': 'backbone.stage2.1.blocks.0.1', + 'model.23': 'backbone.stage2.1.blocks.1.0', + 'model.24': 'backbone.stage2.1.blocks.1.1', + 'model.25': 'backbone.stage2.1.blocks.2.0', + 'model.26': 'backbone.stage2.1.blocks.2.1', + 'model.28': 'backbone.stage2.1.final_conv', + + # stage3 + # MaxPoolBlock reduce_channel_2x + 'model.30': 'backbone.stage3.0.maxpool_branches.1', + 'model.31': 'backbone.stage3.0.stride_conv_branches.0', + 'model.32': 'backbone.stage3.0.stride_conv_branches.1', + # ELANBlock expand_channel_2x + 'model.34': 'backbone.stage3.1.short_conv', + 'model.35': 'backbone.stage3.1.main_conv', + 'model.36': 'backbone.stage3.1.blocks.0.0', + 'model.37': 'backbone.stage3.1.blocks.0.1', + 'model.38': 
'backbone.stage3.1.blocks.1.0', + 'model.39': 'backbone.stage3.1.blocks.1.1', + 'model.40': 'backbone.stage3.1.blocks.2.0', + 'model.41': 'backbone.stage3.1.blocks.2.1', + 'model.43': 'backbone.stage3.1.final_conv', + + # stage4 + # MaxPoolBlock reduce_channel_2x + 'model.45': 'backbone.stage4.0.maxpool_branches.1', + 'model.46': 'backbone.stage4.0.stride_conv_branches.0', + 'model.47': 'backbone.stage4.0.stride_conv_branches.1', + # ELANBlock no_change_channel + 'model.49': 'backbone.stage4.1.short_conv', + 'model.50': 'backbone.stage4.1.main_conv', + 'model.51': 'backbone.stage4.1.blocks.0.0', + 'model.52': 'backbone.stage4.1.blocks.0.1', + 'model.53': 'backbone.stage4.1.blocks.1.0', + 'model.54': 'backbone.stage4.1.blocks.1.1', + 'model.55': 'backbone.stage4.1.blocks.2.0', + 'model.56': 'backbone.stage4.1.blocks.2.1', + 'model.58': 'backbone.stage4.1.final_conv', + + # neck SPPCSPBlock + 'model.59.cv1': 'neck.reduce_layers.2.main_layers.0', + 'model.59.cv3': 'neck.reduce_layers.2.main_layers.1', + 'model.59.cv4': 'neck.reduce_layers.2.main_layers.2', + 'model.59.cv5': 'neck.reduce_layers.2.fuse_layers.0', + 'model.59.cv6': 'neck.reduce_layers.2.fuse_layers.1', + 'model.59.cv2': 'neck.reduce_layers.2.short_layer', + 'model.59.cv7': 'neck.reduce_layers.2.final_conv', + + # neck + 'model.60': 'neck.upsample_layers.0.0', + 'model.62': 'neck.reduce_layers.1', + + # neck ELANBlock reduce_channel_2x + 'model.64': 'neck.top_down_layers.0.short_conv', + 'model.65': 'neck.top_down_layers.0.main_conv', + 'model.66': 'neck.top_down_layers.0.blocks.0.0', + 'model.67': 'neck.top_down_layers.0.blocks.0.1', + 'model.68': 'neck.top_down_layers.0.blocks.1.0', + 'model.69': 'neck.top_down_layers.0.blocks.1.1', + 'model.70': 'neck.top_down_layers.0.blocks.2.0', + 'model.71': 'neck.top_down_layers.0.blocks.2.1', + 'model.73': 'neck.top_down_layers.0.final_conv', + 'model.74': 'neck.upsample_layers.1.0', + 'model.76': 'neck.reduce_layers.0', + + # neck ELANBlock reduce_channel_2x + 'model.78': 'neck.top_down_layers.1.short_conv', + 'model.79': 'neck.top_down_layers.1.main_conv', + 'model.80': 'neck.top_down_layers.1.blocks.0.0', + 'model.81': 'neck.top_down_layers.1.blocks.0.1', + 'model.82': 'neck.top_down_layers.1.blocks.1.0', + 'model.83': 'neck.top_down_layers.1.blocks.1.1', + 'model.84': 'neck.top_down_layers.1.blocks.2.0', + 'model.85': 'neck.top_down_layers.1.blocks.2.1', + 'model.87': 'neck.top_down_layers.1.final_conv', + + # neck MaxPoolBlock no_change_channel + 'model.89': 'neck.downsample_layers.0.maxpool_branches.1', + 'model.90': 'neck.downsample_layers.0.stride_conv_branches.0', + 'model.91': 'neck.downsample_layers.0.stride_conv_branches.1', + + # neck ELANBlock reduce_channel_2x + 'model.93': 'neck.bottom_up_layers.0.short_conv', + 'model.94': 'neck.bottom_up_layers.0.main_conv', + 'model.95': 'neck.bottom_up_layers.0.blocks.0.0', + 'model.96': 'neck.bottom_up_layers.0.blocks.0.1', + 'model.97': 'neck.bottom_up_layers.0.blocks.1.0', + 'model.98': 'neck.bottom_up_layers.0.blocks.1.1', + 'model.99': 'neck.bottom_up_layers.0.blocks.2.0', + 'model.100': 'neck.bottom_up_layers.0.blocks.2.1', + 'model.102': 'neck.bottom_up_layers.0.final_conv', + + # neck MaxPoolBlock no_change_channel + 'model.104': 'neck.downsample_layers.1.maxpool_branches.1', + 'model.105': 'neck.downsample_layers.1.stride_conv_branches.0', + 'model.106': 'neck.downsample_layers.1.stride_conv_branches.1', + + # neck ELANBlock reduce_channel_2x + 'model.108': 'neck.bottom_up_layers.1.short_conv', + 'model.109': 
'neck.bottom_up_layers.1.main_conv', + 'model.110': 'neck.bottom_up_layers.1.blocks.0.0', + 'model.111': 'neck.bottom_up_layers.1.blocks.0.1', + 'model.112': 'neck.bottom_up_layers.1.blocks.1.0', + 'model.113': 'neck.bottom_up_layers.1.blocks.1.1', + 'model.114': 'neck.bottom_up_layers.1.blocks.2.0', + 'model.115': 'neck.bottom_up_layers.1.blocks.2.1', + 'model.117': 'neck.bottom_up_layers.1.final_conv', + + # Conv + 'model.118': 'neck.out_layers.0', + 'model.119': 'neck.out_layers.1', + 'model.120': 'neck.out_layers.2', + + # head + 'model.121.m.0': 'bbox_head.head_module.convs_pred.0.1', + 'model.121.m.1': 'bbox_head.head_module.convs_pred.1.1', + 'model.121.m.2': 'bbox_head.head_module.convs_pred.2.1' +} + +convert_dict_w = { + # stem + 'model.1': 'backbone.stem.conv', + + # stage1 + # ConvModule + 'model.2': 'backbone.stage1.0', + # ELANBlock + 'model.3': 'backbone.stage1.1.short_conv', + 'model.4': 'backbone.stage1.1.main_conv', + 'model.5': 'backbone.stage1.1.blocks.0.0', + 'model.6': 'backbone.stage1.1.blocks.0.1', + 'model.7': 'backbone.stage1.1.blocks.1.0', + 'model.8': 'backbone.stage1.1.blocks.1.1', + 'model.10': 'backbone.stage1.1.final_conv', + + # stage2 + 'model.11': 'backbone.stage2.0', + # ELANBlock + 'model.12': 'backbone.stage2.1.short_conv', + 'model.13': 'backbone.stage2.1.main_conv', + 'model.14': 'backbone.stage2.1.blocks.0.0', + 'model.15': 'backbone.stage2.1.blocks.0.1', + 'model.16': 'backbone.stage2.1.blocks.1.0', + 'model.17': 'backbone.stage2.1.blocks.1.1', + 'model.19': 'backbone.stage2.1.final_conv', + + # stage3 + 'model.20': 'backbone.stage3.0', + # ELANBlock + 'model.21': 'backbone.stage3.1.short_conv', + 'model.22': 'backbone.stage3.1.main_conv', + 'model.23': 'backbone.stage3.1.blocks.0.0', + 'model.24': 'backbone.stage3.1.blocks.0.1', + 'model.25': 'backbone.stage3.1.blocks.1.0', + 'model.26': 'backbone.stage3.1.blocks.1.1', + 'model.28': 'backbone.stage3.1.final_conv', + + # stage4 + 'model.29': 'backbone.stage4.0', + # ELANBlock + 'model.30': 'backbone.stage4.1.short_conv', + 'model.31': 'backbone.stage4.1.main_conv', + 'model.32': 'backbone.stage4.1.blocks.0.0', + 'model.33': 'backbone.stage4.1.blocks.0.1', + 'model.34': 'backbone.stage4.1.blocks.1.0', + 'model.35': 'backbone.stage4.1.blocks.1.1', + 'model.37': 'backbone.stage4.1.final_conv', + + # stage5 + 'model.38': 'backbone.stage5.0', + # ELANBlock + 'model.39': 'backbone.stage5.1.short_conv', + 'model.40': 'backbone.stage5.1.main_conv', + 'model.41': 'backbone.stage5.1.blocks.0.0', + 'model.42': 'backbone.stage5.1.blocks.0.1', + 'model.43': 'backbone.stage5.1.blocks.1.0', + 'model.44': 'backbone.stage5.1.blocks.1.1', + 'model.46': 'backbone.stage5.1.final_conv', + + # neck SPPCSPBlock + 'model.47.cv1': 'neck.reduce_layers.3.main_layers.0', + 'model.47.cv3': 'neck.reduce_layers.3.main_layers.1', + 'model.47.cv4': 'neck.reduce_layers.3.main_layers.2', + 'model.47.cv5': 'neck.reduce_layers.3.fuse_layers.0', + 'model.47.cv6': 'neck.reduce_layers.3.fuse_layers.1', + 'model.47.cv2': 'neck.reduce_layers.3.short_layer', + 'model.47.cv7': 'neck.reduce_layers.3.final_conv', + + # neck + 'model.48': 'neck.upsample_layers.0.0', + 'model.50': 'neck.reduce_layers.2', + + # neck ELANBlock + 'model.52': 'neck.top_down_layers.0.short_conv', + 'model.53': 'neck.top_down_layers.0.main_conv', + 'model.54': 'neck.top_down_layers.0.blocks.0', + 'model.55': 'neck.top_down_layers.0.blocks.1', + 'model.56': 'neck.top_down_layers.0.blocks.2', + 'model.57': 'neck.top_down_layers.0.blocks.3', + 'model.59': 
'neck.top_down_layers.0.final_conv', + 'model.60': 'neck.upsample_layers.1.0', + 'model.62': 'neck.reduce_layers.1', + + # neck ELANBlock reduce_channel_2x + 'model.64': 'neck.top_down_layers.1.short_conv', + 'model.65': 'neck.top_down_layers.1.main_conv', + 'model.66': 'neck.top_down_layers.1.blocks.0', + 'model.67': 'neck.top_down_layers.1.blocks.1', + 'model.68': 'neck.top_down_layers.1.blocks.2', + 'model.69': 'neck.top_down_layers.1.blocks.3', + 'model.71': 'neck.top_down_layers.1.final_conv', + 'model.72': 'neck.upsample_layers.2.0', + 'model.74': 'neck.reduce_layers.0', + 'model.76': 'neck.top_down_layers.2.short_conv', + 'model.77': 'neck.top_down_layers.2.main_conv', + 'model.78': 'neck.top_down_layers.2.blocks.0', + 'model.79': 'neck.top_down_layers.2.blocks.1', + 'model.80': 'neck.top_down_layers.2.blocks.2', + 'model.81': 'neck.top_down_layers.2.blocks.3', + 'model.83': 'neck.top_down_layers.2.final_conv', + 'model.84': 'neck.downsample_layers.0', + + # neck ELANBlock + 'model.86': 'neck.bottom_up_layers.0.short_conv', + 'model.87': 'neck.bottom_up_layers.0.main_conv', + 'model.88': 'neck.bottom_up_layers.0.blocks.0', + 'model.89': 'neck.bottom_up_layers.0.blocks.1', + 'model.90': 'neck.bottom_up_layers.0.blocks.2', + 'model.91': 'neck.bottom_up_layers.0.blocks.3', + 'model.93': 'neck.bottom_up_layers.0.final_conv', + 'model.94': 'neck.downsample_layers.1', + + # neck ELANBlock reduce_channel_2x + 'model.96': 'neck.bottom_up_layers.1.short_conv', + 'model.97': 'neck.bottom_up_layers.1.main_conv', + 'model.98': 'neck.bottom_up_layers.1.blocks.0', + 'model.99': 'neck.bottom_up_layers.1.blocks.1', + 'model.100': 'neck.bottom_up_layers.1.blocks.2', + 'model.101': 'neck.bottom_up_layers.1.blocks.3', + 'model.103': 'neck.bottom_up_layers.1.final_conv', + 'model.104': 'neck.downsample_layers.2', + + # neck ELANBlock reduce_channel_2x + 'model.106': 'neck.bottom_up_layers.2.short_conv', + 'model.107': 'neck.bottom_up_layers.2.main_conv', + 'model.108': 'neck.bottom_up_layers.2.blocks.0', + 'model.109': 'neck.bottom_up_layers.2.blocks.1', + 'model.110': 'neck.bottom_up_layers.2.blocks.2', + 'model.111': 'neck.bottom_up_layers.2.blocks.3', + 'model.113': 'neck.bottom_up_layers.2.final_conv', + 'model.114': 'bbox_head.head_module.main_convs_pred.0.0', + 'model.115': 'bbox_head.head_module.main_convs_pred.1.0', + 'model.116': 'bbox_head.head_module.main_convs_pred.2.0', + 'model.117': 'bbox_head.head_module.main_convs_pred.3.0', + + # head + 'model.118.m.0': 'bbox_head.head_module.main_convs_pred.0.2', + 'model.118.m.1': 'bbox_head.head_module.main_convs_pred.1.2', + 'model.118.m.2': 'bbox_head.head_module.main_convs_pred.2.2', + 'model.118.m.3': 'bbox_head.head_module.main_convs_pred.3.2' +} + +convert_dict_e = { + # stem + 'model.1': 'backbone.stem.conv', + + # stage1 + 'model.2.cv1': 'backbone.stage1.0.stride_conv_branches.0', + 'model.2.cv2': 'backbone.stage1.0.stride_conv_branches.1', + 'model.2.cv3': 'backbone.stage1.0.maxpool_branches.1', + + # ELANBlock + 'model.3': 'backbone.stage1.1.short_conv', + 'model.4': 'backbone.stage1.1.main_conv', + 'model.5': 'backbone.stage1.1.blocks.0.0', + 'model.6': 'backbone.stage1.1.blocks.0.1', + 'model.7': 'backbone.stage1.1.blocks.1.0', + 'model.8': 'backbone.stage1.1.blocks.1.1', + 'model.9': 'backbone.stage1.1.blocks.2.0', + 'model.10': 'backbone.stage1.1.blocks.2.1', + 'model.12': 'backbone.stage1.1.final_conv', + + # stage2 + 'model.13.cv1': 'backbone.stage2.0.stride_conv_branches.0', + 'model.13.cv2': 
'backbone.stage2.0.stride_conv_branches.1', + 'model.13.cv3': 'backbone.stage2.0.maxpool_branches.1', + + # ELANBlock + 'model.14': 'backbone.stage2.1.short_conv', + 'model.15': 'backbone.stage2.1.main_conv', + 'model.16': 'backbone.stage2.1.blocks.0.0', + 'model.17': 'backbone.stage2.1.blocks.0.1', + 'model.18': 'backbone.stage2.1.blocks.1.0', + 'model.19': 'backbone.stage2.1.blocks.1.1', + 'model.20': 'backbone.stage2.1.blocks.2.0', + 'model.21': 'backbone.stage2.1.blocks.2.1', + 'model.23': 'backbone.stage2.1.final_conv', + + # stage3 + 'model.24.cv1': 'backbone.stage3.0.stride_conv_branches.0', + 'model.24.cv2': 'backbone.stage3.0.stride_conv_branches.1', + 'model.24.cv3': 'backbone.stage3.0.maxpool_branches.1', + + # ELANBlock + 'model.25': 'backbone.stage3.1.short_conv', + 'model.26': 'backbone.stage3.1.main_conv', + 'model.27': 'backbone.stage3.1.blocks.0.0', + 'model.28': 'backbone.stage3.1.blocks.0.1', + 'model.29': 'backbone.stage3.1.blocks.1.0', + 'model.30': 'backbone.stage3.1.blocks.1.1', + 'model.31': 'backbone.stage3.1.blocks.2.0', + 'model.32': 'backbone.stage3.1.blocks.2.1', + 'model.34': 'backbone.stage3.1.final_conv', + + # stage4 + 'model.35.cv1': 'backbone.stage4.0.stride_conv_branches.0', + 'model.35.cv2': 'backbone.stage4.0.stride_conv_branches.1', + 'model.35.cv3': 'backbone.stage4.0.maxpool_branches.1', + + # ELANBlock + 'model.36': 'backbone.stage4.1.short_conv', + 'model.37': 'backbone.stage4.1.main_conv', + 'model.38': 'backbone.stage4.1.blocks.0.0', + 'model.39': 'backbone.stage4.1.blocks.0.1', + 'model.40': 'backbone.stage4.1.blocks.1.0', + 'model.41': 'backbone.stage4.1.blocks.1.1', + 'model.42': 'backbone.stage4.1.blocks.2.0', + 'model.43': 'backbone.stage4.1.blocks.2.1', + 'model.45': 'backbone.stage4.1.final_conv', + + # stage5 + 'model.46.cv1': 'backbone.stage5.0.stride_conv_branches.0', + 'model.46.cv2': 'backbone.stage5.0.stride_conv_branches.1', + 'model.46.cv3': 'backbone.stage5.0.maxpool_branches.1', + + # ELANBlock + 'model.47': 'backbone.stage5.1.short_conv', + 'model.48': 'backbone.stage5.1.main_conv', + 'model.49': 'backbone.stage5.1.blocks.0.0', + 'model.50': 'backbone.stage5.1.blocks.0.1', + 'model.51': 'backbone.stage5.1.blocks.1.0', + 'model.52': 'backbone.stage5.1.blocks.1.1', + 'model.53': 'backbone.stage5.1.blocks.2.0', + 'model.54': 'backbone.stage5.1.blocks.2.1', + 'model.56': 'backbone.stage5.1.final_conv', + + # neck SPPCSPBlock + 'model.57.cv1': 'neck.reduce_layers.3.main_layers.0', + 'model.57.cv3': 'neck.reduce_layers.3.main_layers.1', + 'model.57.cv4': 'neck.reduce_layers.3.main_layers.2', + 'model.57.cv5': 'neck.reduce_layers.3.fuse_layers.0', + 'model.57.cv6': 'neck.reduce_layers.3.fuse_layers.1', + 'model.57.cv2': 'neck.reduce_layers.3.short_layer', + 'model.57.cv7': 'neck.reduce_layers.3.final_conv', + + # neck + 'model.58': 'neck.upsample_layers.0.0', + 'model.60': 'neck.reduce_layers.2', + + # neck ELANBlock + 'model.62': 'neck.top_down_layers.0.short_conv', + 'model.63': 'neck.top_down_layers.0.main_conv', + 'model.64': 'neck.top_down_layers.0.blocks.0', + 'model.65': 'neck.top_down_layers.0.blocks.1', + 'model.66': 'neck.top_down_layers.0.blocks.2', + 'model.67': 'neck.top_down_layers.0.blocks.3', + 'model.68': 'neck.top_down_layers.0.blocks.4', + 'model.69': 'neck.top_down_layers.0.blocks.5', + 'model.71': 'neck.top_down_layers.0.final_conv', + 'model.72': 'neck.upsample_layers.1.0', + 'model.74': 'neck.reduce_layers.1', + + # neck ELANBlock + 'model.76': 'neck.top_down_layers.1.short_conv', + 'model.77': 
'neck.top_down_layers.1.main_conv', + 'model.78': 'neck.top_down_layers.1.blocks.0', + 'model.79': 'neck.top_down_layers.1.blocks.1', + 'model.80': 'neck.top_down_layers.1.blocks.2', + 'model.81': 'neck.top_down_layers.1.blocks.3', + 'model.82': 'neck.top_down_layers.1.blocks.4', + 'model.83': 'neck.top_down_layers.1.blocks.5', + 'model.85': 'neck.top_down_layers.1.final_conv', + 'model.86': 'neck.upsample_layers.2.0', + 'model.88': 'neck.reduce_layers.0', + 'model.90': 'neck.top_down_layers.2.short_conv', + 'model.91': 'neck.top_down_layers.2.main_conv', + 'model.92': 'neck.top_down_layers.2.blocks.0', + 'model.93': 'neck.top_down_layers.2.blocks.1', + 'model.94': 'neck.top_down_layers.2.blocks.2', + 'model.95': 'neck.top_down_layers.2.blocks.3', + 'model.96': 'neck.top_down_layers.2.blocks.4', + 'model.97': 'neck.top_down_layers.2.blocks.5', + 'model.99': 'neck.top_down_layers.2.final_conv', + 'model.100.cv1': 'neck.downsample_layers.0.stride_conv_branches.0', + 'model.100.cv2': 'neck.downsample_layers.0.stride_conv_branches.1', + 'model.100.cv3': 'neck.downsample_layers.0.maxpool_branches.1', + + # neck ELANBlock + 'model.102': 'neck.bottom_up_layers.0.short_conv', + 'model.103': 'neck.bottom_up_layers.0.main_conv', + 'model.104': 'neck.bottom_up_layers.0.blocks.0', + 'model.105': 'neck.bottom_up_layers.0.blocks.1', + 'model.106': 'neck.bottom_up_layers.0.blocks.2', + 'model.107': 'neck.bottom_up_layers.0.blocks.3', + 'model.108': 'neck.bottom_up_layers.0.blocks.4', + 'model.109': 'neck.bottom_up_layers.0.blocks.5', + 'model.111': 'neck.bottom_up_layers.0.final_conv', + 'model.112.cv1': 'neck.downsample_layers.1.stride_conv_branches.0', + 'model.112.cv2': 'neck.downsample_layers.1.stride_conv_branches.1', + 'model.112.cv3': 'neck.downsample_layers.1.maxpool_branches.1', + + # neck ELANBlock + 'model.114': 'neck.bottom_up_layers.1.short_conv', + 'model.115': 'neck.bottom_up_layers.1.main_conv', + 'model.116': 'neck.bottom_up_layers.1.blocks.0', + 'model.117': 'neck.bottom_up_layers.1.blocks.1', + 'model.118': 'neck.bottom_up_layers.1.blocks.2', + 'model.119': 'neck.bottom_up_layers.1.blocks.3', + 'model.120': 'neck.bottom_up_layers.1.blocks.4', + 'model.121': 'neck.bottom_up_layers.1.blocks.5', + 'model.123': 'neck.bottom_up_layers.1.final_conv', + 'model.124.cv1': 'neck.downsample_layers.2.stride_conv_branches.0', + 'model.124.cv2': 'neck.downsample_layers.2.stride_conv_branches.1', + 'model.124.cv3': 'neck.downsample_layers.2.maxpool_branches.1', + + # neck ELANBlock + 'model.126': 'neck.bottom_up_layers.2.short_conv', + 'model.127': 'neck.bottom_up_layers.2.main_conv', + 'model.128': 'neck.bottom_up_layers.2.blocks.0', + 'model.129': 'neck.bottom_up_layers.2.blocks.1', + 'model.130': 'neck.bottom_up_layers.2.blocks.2', + 'model.131': 'neck.bottom_up_layers.2.blocks.3', + 'model.132': 'neck.bottom_up_layers.2.blocks.4', + 'model.133': 'neck.bottom_up_layers.2.blocks.5', + 'model.135': 'neck.bottom_up_layers.2.final_conv', + 'model.136': 'bbox_head.head_module.main_convs_pred.0.0', + 'model.137': 'bbox_head.head_module.main_convs_pred.1.0', + 'model.138': 'bbox_head.head_module.main_convs_pred.2.0', + 'model.139': 'bbox_head.head_module.main_convs_pred.3.0', + + # head + 'model.140.m.0': 'bbox_head.head_module.main_convs_pred.0.2', + 'model.140.m.1': 'bbox_head.head_module.main_convs_pred.1.2', + 'model.140.m.2': 'bbox_head.head_module.main_convs_pred.2.2', + 'model.140.m.3': 'bbox_head.head_module.main_convs_pred.3.2' +} + +convert_dict_e2e = { + # stem + 'model.1': 
'backbone.stem.conv', + + # stage1 + 'model.2.cv1': 'backbone.stage1.0.stride_conv_branches.0', + 'model.2.cv2': 'backbone.stage1.0.stride_conv_branches.1', + 'model.2.cv3': 'backbone.stage1.0.maxpool_branches.1', + + # E-ELANBlock + 'model.3': 'backbone.stage1.1.e_elan_blocks.0.short_conv', + 'model.4': 'backbone.stage1.1.e_elan_blocks.0.main_conv', + 'model.5': 'backbone.stage1.1.e_elan_blocks.0.blocks.0.0', + 'model.6': 'backbone.stage1.1.e_elan_blocks.0.blocks.0.1', + 'model.7': 'backbone.stage1.1.e_elan_blocks.0.blocks.1.0', + 'model.8': 'backbone.stage1.1.e_elan_blocks.0.blocks.1.1', + 'model.9': 'backbone.stage1.1.e_elan_blocks.0.blocks.2.0', + 'model.10': 'backbone.stage1.1.e_elan_blocks.0.blocks.2.1', + 'model.12': 'backbone.stage1.1.e_elan_blocks.0.final_conv', + 'model.13': 'backbone.stage1.1.e_elan_blocks.1.short_conv', + 'model.14': 'backbone.stage1.1.e_elan_blocks.1.main_conv', + 'model.15': 'backbone.stage1.1.e_elan_blocks.1.blocks.0.0', + 'model.16': 'backbone.stage1.1.e_elan_blocks.1.blocks.0.1', + 'model.17': 'backbone.stage1.1.e_elan_blocks.1.blocks.1.0', + 'model.18': 'backbone.stage1.1.e_elan_blocks.1.blocks.1.1', + 'model.19': 'backbone.stage1.1.e_elan_blocks.1.blocks.2.0', + 'model.20': 'backbone.stage1.1.e_elan_blocks.1.blocks.2.1', + 'model.22': 'backbone.stage1.1.e_elan_blocks.1.final_conv', + + # stage2 + 'model.24.cv1': 'backbone.stage2.0.stride_conv_branches.0', + 'model.24.cv2': 'backbone.stage2.0.stride_conv_branches.1', + 'model.24.cv3': 'backbone.stage2.0.maxpool_branches.1', + + # E-ELANBlock + 'model.25': 'backbone.stage2.1.e_elan_blocks.0.short_conv', + 'model.26': 'backbone.stage2.1.e_elan_blocks.0.main_conv', + 'model.27': 'backbone.stage2.1.e_elan_blocks.0.blocks.0.0', + 'model.28': 'backbone.stage2.1.e_elan_blocks.0.blocks.0.1', + 'model.29': 'backbone.stage2.1.e_elan_blocks.0.blocks.1.0', + 'model.30': 'backbone.stage2.1.e_elan_blocks.0.blocks.1.1', + 'model.31': 'backbone.stage2.1.e_elan_blocks.0.blocks.2.0', + 'model.32': 'backbone.stage2.1.e_elan_blocks.0.blocks.2.1', + 'model.34': 'backbone.stage2.1.e_elan_blocks.0.final_conv', + 'model.35': 'backbone.stage2.1.e_elan_blocks.1.short_conv', + 'model.36': 'backbone.stage2.1.e_elan_blocks.1.main_conv', + 'model.37': 'backbone.stage2.1.e_elan_blocks.1.blocks.0.0', + 'model.38': 'backbone.stage2.1.e_elan_blocks.1.blocks.0.1', + 'model.39': 'backbone.stage2.1.e_elan_blocks.1.blocks.1.0', + 'model.40': 'backbone.stage2.1.e_elan_blocks.1.blocks.1.1', + 'model.41': 'backbone.stage2.1.e_elan_blocks.1.blocks.2.0', + 'model.42': 'backbone.stage2.1.e_elan_blocks.1.blocks.2.1', + 'model.44': 'backbone.stage2.1.e_elan_blocks.1.final_conv', + + # stage3 + 'model.46.cv1': 'backbone.stage3.0.stride_conv_branches.0', + 'model.46.cv2': 'backbone.stage3.0.stride_conv_branches.1', + 'model.46.cv3': 'backbone.stage3.0.maxpool_branches.1', + + # E-ELANBlock + 'model.47': 'backbone.stage3.1.e_elan_blocks.0.short_conv', + 'model.48': 'backbone.stage3.1.e_elan_blocks.0.main_conv', + 'model.49': 'backbone.stage3.1.e_elan_blocks.0.blocks.0.0', + 'model.50': 'backbone.stage3.1.e_elan_blocks.0.blocks.0.1', + 'model.51': 'backbone.stage3.1.e_elan_blocks.0.blocks.1.0', + 'model.52': 'backbone.stage3.1.e_elan_blocks.0.blocks.1.1', + 'model.53': 'backbone.stage3.1.e_elan_blocks.0.blocks.2.0', + 'model.54': 'backbone.stage3.1.e_elan_blocks.0.blocks.2.1', + 'model.56': 'backbone.stage3.1.e_elan_blocks.0.final_conv', + 'model.57': 'backbone.stage3.1.e_elan_blocks.1.short_conv', + 'model.58': 
'backbone.stage3.1.e_elan_blocks.1.main_conv', + 'model.59': 'backbone.stage3.1.e_elan_blocks.1.blocks.0.0', + 'model.60': 'backbone.stage3.1.e_elan_blocks.1.blocks.0.1', + 'model.61': 'backbone.stage3.1.e_elan_blocks.1.blocks.1.0', + 'model.62': 'backbone.stage3.1.e_elan_blocks.1.blocks.1.1', + 'model.63': 'backbone.stage3.1.e_elan_blocks.1.blocks.2.0', + 'model.64': 'backbone.stage3.1.e_elan_blocks.1.blocks.2.1', + 'model.66': 'backbone.stage3.1.e_elan_blocks.1.final_conv', + + # stage4 + 'model.68.cv1': 'backbone.stage4.0.stride_conv_branches.0', + 'model.68.cv2': 'backbone.stage4.0.stride_conv_branches.1', + 'model.68.cv3': 'backbone.stage4.0.maxpool_branches.1', + + # E-ELANBlock + 'model.69': 'backbone.stage4.1.e_elan_blocks.0.short_conv', + 'model.70': 'backbone.stage4.1.e_elan_blocks.0.main_conv', + 'model.71': 'backbone.stage4.1.e_elan_blocks.0.blocks.0.0', + 'model.72': 'backbone.stage4.1.e_elan_blocks.0.blocks.0.1', + 'model.73': 'backbone.stage4.1.e_elan_blocks.0.blocks.1.0', + 'model.74': 'backbone.stage4.1.e_elan_blocks.0.blocks.1.1', + 'model.75': 'backbone.stage4.1.e_elan_blocks.0.blocks.2.0', + 'model.76': 'backbone.stage4.1.e_elan_blocks.0.blocks.2.1', + 'model.78': 'backbone.stage4.1.e_elan_blocks.0.final_conv', + 'model.79': 'backbone.stage4.1.e_elan_blocks.1.short_conv', + 'model.80': 'backbone.stage4.1.e_elan_blocks.1.main_conv', + 'model.81': 'backbone.stage4.1.e_elan_blocks.1.blocks.0.0', + 'model.82': 'backbone.stage4.1.e_elan_blocks.1.blocks.0.1', + 'model.83': 'backbone.stage4.1.e_elan_blocks.1.blocks.1.0', + 'model.84': 'backbone.stage4.1.e_elan_blocks.1.blocks.1.1', + 'model.85': 'backbone.stage4.1.e_elan_blocks.1.blocks.2.0', + 'model.86': 'backbone.stage4.1.e_elan_blocks.1.blocks.2.1', + 'model.88': 'backbone.stage4.1.e_elan_blocks.1.final_conv', + + # stage5 + 'model.90.cv1': 'backbone.stage5.0.stride_conv_branches.0', + 'model.90.cv2': 'backbone.stage5.0.stride_conv_branches.1', + 'model.90.cv3': 'backbone.stage5.0.maxpool_branches.1', + + # E-ELANBlock + 'model.91': 'backbone.stage5.1.e_elan_blocks.0.short_conv', + 'model.92': 'backbone.stage5.1.e_elan_blocks.0.main_conv', + 'model.93': 'backbone.stage5.1.e_elan_blocks.0.blocks.0.0', + 'model.94': 'backbone.stage5.1.e_elan_blocks.0.blocks.0.1', + 'model.95': 'backbone.stage5.1.e_elan_blocks.0.blocks.1.0', + 'model.96': 'backbone.stage5.1.e_elan_blocks.0.blocks.1.1', + 'model.97': 'backbone.stage5.1.e_elan_blocks.0.blocks.2.0', + 'model.98': 'backbone.stage5.1.e_elan_blocks.0.blocks.2.1', + 'model.100': 'backbone.stage5.1.e_elan_blocks.0.final_conv', + 'model.101': 'backbone.stage5.1.e_elan_blocks.1.short_conv', + 'model.102': 'backbone.stage5.1.e_elan_blocks.1.main_conv', + 'model.103': 'backbone.stage5.1.e_elan_blocks.1.blocks.0.0', + 'model.104': 'backbone.stage5.1.e_elan_blocks.1.blocks.0.1', + 'model.105': 'backbone.stage5.1.e_elan_blocks.1.blocks.1.0', + 'model.106': 'backbone.stage5.1.e_elan_blocks.1.blocks.1.1', + 'model.107': 'backbone.stage5.1.e_elan_blocks.1.blocks.2.0', + 'model.108': 'backbone.stage5.1.e_elan_blocks.1.blocks.2.1', + 'model.110': 'backbone.stage5.1.e_elan_blocks.1.final_conv', + + # neck SPPCSPBlock + 'model.112.cv1': 'neck.reduce_layers.3.main_layers.0', + 'model.112.cv3': 'neck.reduce_layers.3.main_layers.1', + 'model.112.cv4': 'neck.reduce_layers.3.main_layers.2', + 'model.112.cv5': 'neck.reduce_layers.3.fuse_layers.0', + 'model.112.cv6': 'neck.reduce_layers.3.fuse_layers.1', + 'model.112.cv2': 'neck.reduce_layers.3.short_layer', + 'model.112.cv7': 
'neck.reduce_layers.3.final_conv', + + # neck + 'model.113': 'neck.upsample_layers.0.0', + 'model.115': 'neck.reduce_layers.2', + + # neck E-ELANBlock + 'model.117': 'neck.top_down_layers.0.e_elan_blocks.0.short_conv', + 'model.118': 'neck.top_down_layers.0.e_elan_blocks.0.main_conv', + 'model.119': 'neck.top_down_layers.0.e_elan_blocks.0.blocks.0', + 'model.120': 'neck.top_down_layers.0.e_elan_blocks.0.blocks.1', + 'model.121': 'neck.top_down_layers.0.e_elan_blocks.0.blocks.2', + 'model.122': 'neck.top_down_layers.0.e_elan_blocks.0.blocks.3', + 'model.123': 'neck.top_down_layers.0.e_elan_blocks.0.blocks.4', + 'model.124': 'neck.top_down_layers.0.e_elan_blocks.0.blocks.5', + 'model.126': 'neck.top_down_layers.0.e_elan_blocks.0.final_conv', + 'model.127': 'neck.top_down_layers.0.e_elan_blocks.1.short_conv', + 'model.128': 'neck.top_down_layers.0.e_elan_blocks.1.main_conv', + 'model.129': 'neck.top_down_layers.0.e_elan_blocks.1.blocks.0', + 'model.130': 'neck.top_down_layers.0.e_elan_blocks.1.blocks.1', + 'model.131': 'neck.top_down_layers.0.e_elan_blocks.1.blocks.2', + 'model.132': 'neck.top_down_layers.0.e_elan_blocks.1.blocks.3', + 'model.133': 'neck.top_down_layers.0.e_elan_blocks.1.blocks.4', + 'model.134': 'neck.top_down_layers.0.e_elan_blocks.1.blocks.5', + 'model.136': 'neck.top_down_layers.0.e_elan_blocks.1.final_conv', + 'model.138': 'neck.upsample_layers.1.0', + 'model.140': 'neck.reduce_layers.1', + + # neck E-ELANBlock + 'model.142': 'neck.top_down_layers.1.e_elan_blocks.0.short_conv', + 'model.143': 'neck.top_down_layers.1.e_elan_blocks.0.main_conv', + 'model.144': 'neck.top_down_layers.1.e_elan_blocks.0.blocks.0', + 'model.145': 'neck.top_down_layers.1.e_elan_blocks.0.blocks.1', + 'model.146': 'neck.top_down_layers.1.e_elan_blocks.0.blocks.2', + 'model.147': 'neck.top_down_layers.1.e_elan_blocks.0.blocks.3', + 'model.148': 'neck.top_down_layers.1.e_elan_blocks.0.blocks.4', + 'model.149': 'neck.top_down_layers.1.e_elan_blocks.0.blocks.5', + 'model.151': 'neck.top_down_layers.1.e_elan_blocks.0.final_conv', + 'model.152': 'neck.top_down_layers.1.e_elan_blocks.1.short_conv', + 'model.153': 'neck.top_down_layers.1.e_elan_blocks.1.main_conv', + 'model.154': 'neck.top_down_layers.1.e_elan_blocks.1.blocks.0', + 'model.155': 'neck.top_down_layers.1.e_elan_blocks.1.blocks.1', + 'model.156': 'neck.top_down_layers.1.e_elan_blocks.1.blocks.2', + 'model.157': 'neck.top_down_layers.1.e_elan_blocks.1.blocks.3', + 'model.158': 'neck.top_down_layers.1.e_elan_blocks.1.blocks.4', + 'model.159': 'neck.top_down_layers.1.e_elan_blocks.1.blocks.5', + 'model.161': 'neck.top_down_layers.1.e_elan_blocks.1.final_conv', + 'model.163': 'neck.upsample_layers.2.0', + 'model.165': 'neck.reduce_layers.0', + 'model.167': 'neck.top_down_layers.2.e_elan_blocks.0.short_conv', + 'model.168': 'neck.top_down_layers.2.e_elan_blocks.0.main_conv', + 'model.169': 'neck.top_down_layers.2.e_elan_blocks.0.blocks.0', + 'model.170': 'neck.top_down_layers.2.e_elan_blocks.0.blocks.1', + 'model.171': 'neck.top_down_layers.2.e_elan_blocks.0.blocks.2', + 'model.172': 'neck.top_down_layers.2.e_elan_blocks.0.blocks.3', + 'model.173': 'neck.top_down_layers.2.e_elan_blocks.0.blocks.4', + 'model.174': 'neck.top_down_layers.2.e_elan_blocks.0.blocks.5', + 'model.176': 'neck.top_down_layers.2.e_elan_blocks.0.final_conv', + 'model.177': 'neck.top_down_layers.2.e_elan_blocks.1.short_conv', + 'model.178': 'neck.top_down_layers.2.e_elan_blocks.1.main_conv', + 'model.179': 'neck.top_down_layers.2.e_elan_blocks.1.blocks.0', + 'model.180': 
'neck.top_down_layers.2.e_elan_blocks.1.blocks.1', + 'model.181': 'neck.top_down_layers.2.e_elan_blocks.1.blocks.2', + 'model.182': 'neck.top_down_layers.2.e_elan_blocks.1.blocks.3', + 'model.183': 'neck.top_down_layers.2.e_elan_blocks.1.blocks.4', + 'model.184': 'neck.top_down_layers.2.e_elan_blocks.1.blocks.5', + 'model.186': 'neck.top_down_layers.2.e_elan_blocks.1.final_conv', + 'model.188.cv1': 'neck.downsample_layers.0.stride_conv_branches.0', + 'model.188.cv2': 'neck.downsample_layers.0.stride_conv_branches.1', + 'model.188.cv3': 'neck.downsample_layers.0.maxpool_branches.1', + + # neck E-ELANBlock + 'model.190': 'neck.bottom_up_layers.0.e_elan_blocks.0.short_conv', + 'model.191': 'neck.bottom_up_layers.0.e_elan_blocks.0.main_conv', + 'model.192': 'neck.bottom_up_layers.0.e_elan_blocks.0.blocks.0', + 'model.193': 'neck.bottom_up_layers.0.e_elan_blocks.0.blocks.1', + 'model.194': 'neck.bottom_up_layers.0.e_elan_blocks.0.blocks.2', + 'model.195': 'neck.bottom_up_layers.0.e_elan_blocks.0.blocks.3', + 'model.196': 'neck.bottom_up_layers.0.e_elan_blocks.0.blocks.4', + 'model.197': 'neck.bottom_up_layers.0.e_elan_blocks.0.blocks.5', + 'model.199': 'neck.bottom_up_layers.0.e_elan_blocks.0.final_conv', + 'model.200': 'neck.bottom_up_layers.0.e_elan_blocks.1.short_conv', + 'model.201': 'neck.bottom_up_layers.0.e_elan_blocks.1.main_conv', + 'model.202': 'neck.bottom_up_layers.0.e_elan_blocks.1.blocks.0', + 'model.203': 'neck.bottom_up_layers.0.e_elan_blocks.1.blocks.1', + 'model.204': 'neck.bottom_up_layers.0.e_elan_blocks.1.blocks.2', + 'model.205': 'neck.bottom_up_layers.0.e_elan_blocks.1.blocks.3', + 'model.206': 'neck.bottom_up_layers.0.e_elan_blocks.1.blocks.4', + 'model.207': 'neck.bottom_up_layers.0.e_elan_blocks.1.blocks.5', + 'model.209': 'neck.bottom_up_layers.0.e_elan_blocks.1.final_conv', + 'model.211.cv1': 'neck.downsample_layers.1.stride_conv_branches.0', + 'model.211.cv2': 'neck.downsample_layers.1.stride_conv_branches.1', + 'model.211.cv3': 'neck.downsample_layers.1.maxpool_branches.1', + 'model.213': 'neck.bottom_up_layers.1.e_elan_blocks.0.short_conv', + 'model.214': 'neck.bottom_up_layers.1.e_elan_blocks.0.main_conv', + 'model.215': 'neck.bottom_up_layers.1.e_elan_blocks.0.blocks.0', + 'model.216': 'neck.bottom_up_layers.1.e_elan_blocks.0.blocks.1', + 'model.217': 'neck.bottom_up_layers.1.e_elan_blocks.0.blocks.2', + 'model.218': 'neck.bottom_up_layers.1.e_elan_blocks.0.blocks.3', + 'model.219': 'neck.bottom_up_layers.1.e_elan_blocks.0.blocks.4', + 'model.220': 'neck.bottom_up_layers.1.e_elan_blocks.0.blocks.5', + 'model.222': 'neck.bottom_up_layers.1.e_elan_blocks.0.final_conv', + 'model.223': 'neck.bottom_up_layers.1.e_elan_blocks.1.short_conv', + 'model.224': 'neck.bottom_up_layers.1.e_elan_blocks.1.main_conv', + 'model.225': 'neck.bottom_up_layers.1.e_elan_blocks.1.blocks.0', + 'model.226': 'neck.bottom_up_layers.1.e_elan_blocks.1.blocks.1', + 'model.227': 'neck.bottom_up_layers.1.e_elan_blocks.1.blocks.2', + 'model.228': 'neck.bottom_up_layers.1.e_elan_blocks.1.blocks.3', + 'model.229': 'neck.bottom_up_layers.1.e_elan_blocks.1.blocks.4', + 'model.230': 'neck.bottom_up_layers.1.e_elan_blocks.1.blocks.5', + 'model.232': 'neck.bottom_up_layers.1.e_elan_blocks.1.final_conv', + 'model.234.cv1': 'neck.downsample_layers.2.stride_conv_branches.0', + 'model.234.cv2': 'neck.downsample_layers.2.stride_conv_branches.1', + 'model.234.cv3': 'neck.downsample_layers.2.maxpool_branches.1', + + # neck E-ELANBlock + 'model.236': 'neck.bottom_up_layers.2.e_elan_blocks.0.short_conv', + 
'model.237': 'neck.bottom_up_layers.2.e_elan_blocks.0.main_conv', + 'model.238': 'neck.bottom_up_layers.2.e_elan_blocks.0.blocks.0', + 'model.239': 'neck.bottom_up_layers.2.e_elan_blocks.0.blocks.1', + 'model.240': 'neck.bottom_up_layers.2.e_elan_blocks.0.blocks.2', + 'model.241': 'neck.bottom_up_layers.2.e_elan_blocks.0.blocks.3', + 'model.242': 'neck.bottom_up_layers.2.e_elan_blocks.0.blocks.4', + 'model.243': 'neck.bottom_up_layers.2.e_elan_blocks.0.blocks.5', + 'model.245': 'neck.bottom_up_layers.2.e_elan_blocks.0.final_conv', + 'model.246': 'neck.bottom_up_layers.2.e_elan_blocks.1.short_conv', + 'model.247': 'neck.bottom_up_layers.2.e_elan_blocks.1.main_conv', + 'model.248': 'neck.bottom_up_layers.2.e_elan_blocks.1.blocks.0', + 'model.249': 'neck.bottom_up_layers.2.e_elan_blocks.1.blocks.1', + 'model.250': 'neck.bottom_up_layers.2.e_elan_blocks.1.blocks.2', + 'model.251': 'neck.bottom_up_layers.2.e_elan_blocks.1.blocks.3', + 'model.252': 'neck.bottom_up_layers.2.e_elan_blocks.1.blocks.4', + 'model.253': 'neck.bottom_up_layers.2.e_elan_blocks.1.blocks.5', + 'model.255': 'neck.bottom_up_layers.2.e_elan_blocks.1.final_conv', + 'model.257': 'bbox_head.head_module.main_convs_pred.0.0', + 'model.258': 'bbox_head.head_module.main_convs_pred.1.0', + 'model.259': 'bbox_head.head_module.main_convs_pred.2.0', + 'model.260': 'bbox_head.head_module.main_convs_pred.3.0', + + # head + 'model.261.m.0': 'bbox_head.head_module.main_convs_pred.0.2', + 'model.261.m.1': 'bbox_head.head_module.main_convs_pred.1.2', + 'model.261.m.2': 'bbox_head.head_module.main_convs_pred.2.2', + 'model.261.m.3': 'bbox_head.head_module.main_convs_pred.3.2' +} + +convert_dicts = { + 'yolov7-tiny.pt': convert_dict_tiny, + 'yolov7-w6.pt': convert_dict_w, + 'yolov7-e6.pt': convert_dict_e, + 'yolov7-e6e.pt': convert_dict_e2e, + 'yolov7.pt': convert_dict_l, + 'yolov7x.pt': convert_dict_x +} + + +def convert(src, dst): + src_key = osp.basename(src) + convert_dict = convert_dicts[osp.basename(src)] + + num_levels = 3 + if src_key == 'yolov7.pt': + indexes = [102, 51] + in_channels = [256, 512, 1024] + elif src_key == 'yolov7x.pt': + indexes = [121, 59] + in_channels = [320, 640, 1280] + elif src_key == 'yolov7-tiny.pt': + indexes = [77, 1000] + in_channels = [128, 256, 512] + elif src_key == 'yolov7-w6.pt': + indexes = [118, 47] + in_channels = [256, 512, 768, 1024] + num_levels = 4 + elif src_key == 'yolov7-e6.pt': + indexes = [140, [2, 13, 24, 35, 46, 57, 100, 112, 124]] + in_channels = 320, 640, 960, 1280 + num_levels = 4 + elif src_key == 'yolov7-e6e.pt': + indexes = [261, [2, 24, 46, 68, 90, 112, 188, 211, 234]] + in_channels = 320, 640, 960, 1280 + num_levels = 4 + + if isinstance(indexes[1], int): + indexes[1] = [indexes[1]] + """Convert keys in detectron pretrained YOLOv7 models to mmyolo style.""" + try: + yolov7_model = torch.load(src)['model'].float() + blobs = yolov7_model.state_dict() + except ModuleNotFoundError: + raise RuntimeError( + 'This script must be placed under the WongKinYiu/yolov7 repo,' + ' because loading the official pretrained model need' + ' `model.py` to build model.') + state_dict = OrderedDict() + + for key, weight in blobs.items(): + if key.find('anchors') >= 0 or key.find('anchor_grid') >= 0: + continue + + num, module = key.split('.')[1:3] + if int(num) < indexes[0] and int(num) not in indexes[1]: + prefix = f'model.{num}' + new_key = key.replace(prefix, convert_dict[prefix]) + state_dict[new_key] = weight + print(f'Convert {key} to {new_key}') + elif int(num) in indexes[1]: + strs_key = 
key.split('.')[:3] + new_key = key.replace('.'.join(strs_key), + convert_dict['.'.join(strs_key)]) + state_dict[new_key] = weight + print(f'Convert {key} to {new_key}') + else: + strs_key = key.split('.')[:4] + new_key = key.replace('.'.join(strs_key), + convert_dict['.'.join(strs_key)]) + state_dict[new_key] = weight + print(f'Convert {key} to {new_key}') + + # Add ImplicitA and ImplicitM + for i in range(num_levels): + if num_levels == 3: + implicit_a = f'bbox_head.head_module.' \ + f'convs_pred.{i}.0.implicit' + state_dict[implicit_a] = torch.zeros((1, in_channels[i], 1, 1)) + implicit_m = f'bbox_head.head_module.' \ + f'convs_pred.{i}.2.implicit' + state_dict[implicit_m] = torch.ones((1, 3 * 85, 1, 1)) + else: + implicit_a = f'bbox_head.head_module.' \ + f'main_convs_pred.{i}.1.implicit' + state_dict[implicit_a] = torch.zeros((1, in_channels[i], 1, 1)) + implicit_m = f'bbox_head.head_module.' \ + f'main_convs_pred.{i}.3.implicit' + state_dict[implicit_m] = torch.ones((1, 3 * 85, 1, 1)) + + # save checkpoint + checkpoint = dict() + checkpoint['state_dict'] = state_dict + torch.save(checkpoint, dst) + + +# Note: This script must be placed under the yolov7 repo to run. +def main(): + parser = argparse.ArgumentParser(description='Convert model keys') + parser.add_argument( + 'src', default='yolov7.pt', help='src yolov7 model path') + parser.add_argument('dst', default='mm_yolov7l.pt', help='save path') + args = parser.parse_args() + convert(args.src, args.dst) + print('If your model weights are from P6 models, such as W6, E6, D6, \ + E6E, the auxiliary training module is not required to be loaded, \ + so it is normal for the weights of the auxiliary module \ + to be missing.') + + +if __name__ == '__main__': + main() diff --git a/third_party/mmyolo/tools/model_converters/yolov8_to_mmyolo.py b/third_party/mmyolo/tools/model_converters/yolov8_to_mmyolo.py new file mode 100644 index 0000000000000000000000000000000000000000..4ed64f2492ba0bece874c482fe704492fad4e8e9 --- /dev/null +++ b/third_party/mmyolo/tools/model_converters/yolov8_to_mmyolo.py @@ -0,0 +1,102 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +from collections import OrderedDict + +import torch + +convert_dict_s = { + # backbone + 'model.0': 'backbone.stem', + 'model.1': 'backbone.stage1.0', + 'model.2': 'backbone.stage1.1', + 'model.3': 'backbone.stage2.0', + 'model.4': 'backbone.stage2.1', + 'model.5': 'backbone.stage3.0', + 'model.6': 'backbone.stage3.1', + 'model.7': 'backbone.stage4.0', + 'model.8': 'backbone.stage4.1', + 'model.9': 'backbone.stage4.2', + + # neck + 'model.12': 'neck.top_down_layers.0', + 'model.15': 'neck.top_down_layers.1', + 'model.16': 'neck.downsample_layers.0', + 'model.18': 'neck.bottom_up_layers.0', + 'model.19': 'neck.downsample_layers.1', + 'model.21': 'neck.bottom_up_layers.1', + + # Detector + 'model.22': 'bbox_head.head_module', +} + + +def convert(src, dst): + """Convert keys in pretrained YOLOv8 models to mmyolo style.""" + convert_dict = convert_dict_s + + try: + yolov8_model = torch.load(src)['model'] + blobs = yolov8_model.state_dict() + except ModuleNotFoundError: + raise RuntimeError( + 'This script must be placed under the ultralytics repo,' + ' because loading the official pretrained model need' + ' `model.py` to build model.' 
+ 'Also need to install hydra-core>=1.2.0 and thop>=0.1.1') + state_dict = OrderedDict() + + for key, weight in blobs.items(): + num, module = key.split('.')[1:3] + prefix = f'model.{num}' + new_key = key.replace(prefix, convert_dict[prefix]) + + if '.m.' in new_key: + new_key = new_key.replace('.m.', '.blocks.') + new_key = new_key.replace('.cv', '.conv') + elif 'bbox_head.head_module.proto.cv' in new_key: + new_key = new_key.replace( + 'bbox_head.head_module.proto.cv', + 'bbox_head.head_module.proto_preds.conv') + elif 'bbox_head.head_module.proto' in new_key: + new_key = new_key.replace('bbox_head.head_module.proto', + 'bbox_head.head_module.proto_preds') + elif 'bbox_head.head_module.cv4.' in new_key: + new_key = new_key.replace( + 'bbox_head.head_module.cv4', + 'bbox_head.head_module.mask_coeff_preds') + new_key = new_key.replace('.2.weight', '.2.conv.weight') + new_key = new_key.replace('.2.bias', '.2.conv.bias') + elif 'bbox_head.head_module' in new_key: + new_key = new_key.replace('.cv2', '.reg_preds') + new_key = new_key.replace('.cv3', '.cls_preds') + elif 'backbone.stage4.2' in new_key: + new_key = new_key.replace('.cv', '.conv') + else: + new_key = new_key.replace('.cv1', '.main_conv') + new_key = new_key.replace('.cv2', '.final_conv') + + if 'bbox_head.head_module.dfl.conv.weight' == new_key: + print('Drop "bbox_head.head_module.dfl.conv.weight", ' + 'because it is useless') + continue + state_dict[new_key] = weight + print(f'Convert {key} to {new_key}') + + # save checkpoint + checkpoint = dict() + checkpoint['state_dict'] = state_dict + torch.save(checkpoint, dst) + + +# Note: This script must be placed under the ultralytics repo to run. +def main(): + parser = argparse.ArgumentParser(description='Convert model keys') + parser.add_argument( + '--src', default='yolov8s.pt', help='src YOLOv8 model path') + parser.add_argument('--dst', default='mmyolov8s.pth', help='save path') + args = parser.parse_args() + convert(args.src, args.dst) + + +if __name__ == '__main__': + main() diff --git a/third_party/mmyolo/tools/model_converters/yolox_to_mmyolo.py b/third_party/mmyolo/tools/model_converters/yolox_to_mmyolo.py new file mode 100644 index 0000000000000000000000000000000000000000..5fcc7356780444db59517c931ce1a3557ec8340a --- /dev/null +++ b/third_party/mmyolo/tools/model_converters/yolox_to_mmyolo.py @@ -0,0 +1,110 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
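The `yolox_to_mmyolo.py` script below takes a slightly different route from the YOLOv7/YOLOv8 converters: it iterates the checkpoint's stored `model` state dict directly and dispatches each key to a stem/backbone/neck/head handler, so it may not need the official YOLOX code to be importable. An illustrative invocation using the defaults from its argument parser (checkpoint paths are placeholders):

```bash
# convert an official YOLOX-S checkpoint to mmyolo-style keys (placeholder paths)
python third_party/mmyolo/tools/model_converters/yolox_to_mmyolo.py \
    --src yolox_s.pth --dst mmyoloxs.pt
```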
+import argparse +from collections import OrderedDict + +import torch + +neck_dict = { + 'backbone.lateral_conv0': 'neck.reduce_layers.2', + 'backbone.C3_p4.conv': 'neck.top_down_layers.0.0.cv', + 'backbone.C3_p4.m.0.': 'neck.top_down_layers.0.0.m.0.', + 'backbone.reduce_conv1': 'neck.top_down_layers.0.1', + 'backbone.C3_p3.conv': 'neck.top_down_layers.1.cv', + 'backbone.C3_p3.m.0.': 'neck.top_down_layers.1.m.0.', + 'backbone.bu_conv2': 'neck.downsample_layers.0', + 'backbone.C3_n3.conv': 'neck.bottom_up_layers.0.cv', + 'backbone.C3_n3.m.0.': 'neck.bottom_up_layers.0.m.0.', + 'backbone.bu_conv1': 'neck.downsample_layers.1', + 'backbone.C3_n4.conv': 'neck.bottom_up_layers.1.cv', + 'backbone.C3_n4.m.0.': 'neck.bottom_up_layers.1.m.0.', +} + + +def convert_stem(model_key, model_weight, state_dict, converted_names): + new_key = model_key[9:] + state_dict[new_key] = model_weight + converted_names.add(model_key) + print(f'Convert {model_key} to {new_key}') + + +def convert_backbone(model_key, model_weight, state_dict, converted_names): + new_key = model_key.replace('backbone.dark', 'stage') + num = int(new_key[14]) - 1 + new_key = new_key[:14] + str(num) + new_key[15:] + if '.m.' in model_key: + new_key = new_key.replace('.m.', '.blocks.') + elif not new_key[16] == '0' and 'stage4.1' not in new_key: + new_key = new_key.replace('conv1', 'main_conv') + new_key = new_key.replace('conv2', 'short_conv') + new_key = new_key.replace('conv3', 'final_conv') + state_dict[new_key] = model_weight + converted_names.add(model_key) + print(f'Convert {model_key} to {new_key}') + + +def convert_neck(model_key, model_weight, state_dict, converted_names): + for old, new in neck_dict.items(): + if old in model_key: + new_key = model_key.replace(old, new) + if '.m.' in model_key: + new_key = new_key.replace('.m.', '.blocks.') + elif '.C' in model_key: + new_key = new_key.replace('cv1', 'main_conv') + new_key = new_key.replace('cv2', 'short_conv') + new_key = new_key.replace('cv3', 'final_conv') + state_dict[new_key] = model_weight + converted_names.add(model_key) + print(f'Convert {model_key} to {new_key}') + + +def convert_head(model_key, model_weight, state_dict, converted_names): + if 'stem' in model_key: + new_key = model_key.replace('head.stem', 'neck.out_layer') + elif 'cls_convs' in model_key: + new_key = model_key.replace( + 'head.cls_convs', 'bbox_head.head_module.multi_level_cls_convs') + elif 'reg_convs' in model_key: + new_key = model_key.replace( + 'head.reg_convs', 'bbox_head.head_module.multi_level_reg_convs') + elif 'preds' in model_key: + new_key = model_key.replace('head.', + 'bbox_head.head_module.multi_level_conv_') + new_key = new_key.replace('_preds', '') + state_dict[new_key] = model_weight + converted_names.add(model_key) + print(f'Convert {model_key} to {new_key}') + + +def convert(src, dst): + """Convert keys in detectron pretrained YOLOX models to mmyolo style.""" + blobs = torch.load(src)['model'] + state_dict = OrderedDict() + converted_names = set() + + for key, weight in blobs.items(): + if 'backbone.stem' in key: + convert_stem(key, weight, state_dict, converted_names) + elif 'backbone.backbone' in key: + convert_backbone(key, weight, state_dict, converted_names) + elif 'backbone.neck' not in key and 'head' not in key: + convert_neck(key, weight, state_dict, converted_names) + elif 'head' in key: + convert_head(key, weight, state_dict, converted_names) + + # save checkpoint + checkpoint = dict() + checkpoint['state_dict'] = state_dict + torch.save(checkpoint, dst) + + +def main(): + 
parser = argparse.ArgumentParser(description='Convert model keys') + parser.add_argument( + '--src', default='yolox_s.pth', help='src yolox model path') + parser.add_argument('--dst', default='mmyoloxs.pt', help='save path') + args = parser.parse_args() + convert(args.src, args.dst) + + +if __name__ == '__main__': + main() diff --git a/third_party/mmyolo/tools/slurm_test.sh b/third_party/mmyolo/tools/slurm_test.sh new file mode 100755 index 0000000000000000000000000000000000000000..6dd67e57442b741fc30f26102eb5afe16139edb1 --- /dev/null +++ b/third_party/mmyolo/tools/slurm_test.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +set -x + +PARTITION=$1 +JOB_NAME=$2 +CONFIG=$3 +CHECKPOINT=$4 +GPUS=${GPUS:-8} +GPUS_PER_NODE=${GPUS_PER_NODE:-8} +CPUS_PER_TASK=${CPUS_PER_TASK:-5} +PY_ARGS=${@:5} +SRUN_ARGS=${SRUN_ARGS:-""} + +PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ +srun -p ${PARTITION} \ + --job-name=${JOB_NAME} \ + --gres=gpu:${GPUS_PER_NODE} \ + --ntasks=${GPUS} \ + --ntasks-per-node=${GPUS_PER_NODE} \ + --cpus-per-task=${CPUS_PER_TASK} \ + --kill-on-bad-exit=1 \ + ${SRUN_ARGS} \ + python -u tools/test.py ${CONFIG} ${CHECKPOINT} --launcher="slurm" ${PY_ARGS} diff --git a/third_party/mmyolo/tools/slurm_train.sh b/third_party/mmyolo/tools/slurm_train.sh new file mode 100755 index 0000000000000000000000000000000000000000..b3feb3d9c7a6c33d82739cdf5ee10365673aaded --- /dev/null +++ b/third_party/mmyolo/tools/slurm_train.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +set -x + +PARTITION=$1 +JOB_NAME=$2 +CONFIG=$3 +WORK_DIR=$4 +GPUS=${GPUS:-8} +GPUS_PER_NODE=${GPUS_PER_NODE:-8} +CPUS_PER_TASK=${CPUS_PER_TASK:-5} +SRUN_ARGS=${SRUN_ARGS:-""} +PY_ARGS=${@:5} + +PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ +srun -p ${PARTITION} \ + --job-name=${JOB_NAME} \ + --gres=gpu:${GPUS_PER_NODE} \ + --ntasks=${GPUS} \ + --ntasks-per-node=${GPUS_PER_NODE} \ + --cpus-per-task=${CPUS_PER_TASK} \ + --kill-on-bad-exit=1 \ + ${SRUN_ARGS} \ + python -u tools/train.py ${CONFIG} --work-dir=${WORK_DIR} --launcher="slurm" ${PY_ARGS} diff --git a/third_party/mmyolo/tools/test.py b/third_party/mmyolo/tools/test.py new file mode 100644 index 0000000000000000000000000000000000000000..f0ac8bde429c946ec18c7f29ea8d7cbad102e262 --- /dev/null +++ b/third_party/mmyolo/tools/test.py @@ -0,0 +1,158 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
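`tools/test.py` below is the mmengine-Runner-based evaluation entry point that the `slurm_test.sh` wrapper above ultimately invokes with `--launcher slurm`. A hedged sketch of both launch styles; config and checkpoint paths are placeholders, and `--out` must end in `.pkl` as the script asserts:

```bash
# local evaluation, dumping raw predictions to a pickle file (placeholder paths)
python tools/test.py path/to/config.py path/to/checkpoint.pth --out results.pkl

# Slurm cluster via the wrapper above; GPUS, GPUS_PER_NODE and CPUS_PER_TASK
# default to 8, 8 and 5 and can be overridden through the environment
GPUS=8 ./tools/slurm_test.sh my_partition eval_job \
    path/to/config.py path/to/checkpoint.pth
```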
+import argparse +import os +import os.path as osp + +from mmdet.engine.hooks.utils import trigger_visualization_hook +from mmdet.utils import setup_cache_size_limit_of_dynamo +from mmengine.config import Config, ConfigDict, DictAction +from mmengine.evaluator import DumpResults +from mmengine.runner import Runner + +from mmyolo.registry import RUNNERS +from mmyolo.utils import is_metainfo_lower + + +# TODO: support fuse_conv_bn +def parse_args(): + parser = argparse.ArgumentParser( + description='MMYOLO test (and eval) a model') + parser.add_argument('config', help='test config file path') + parser.add_argument('checkpoint', help='checkpoint file') + parser.add_argument( + '--work-dir', + help='the directory to save the file containing evaluation metrics') + parser.add_argument( + '--out', + type=str, + help='output result file (must be a .pkl file) in pickle format') + parser.add_argument( + '--json-prefix', + type=str, + help='the prefix of the output json file without perform evaluation, ' + 'which is useful when you want to format the result to a specific ' + 'format and submit it to the test server') + parser.add_argument( + '--tta', + action='store_true', + help='Whether to use test time augmentation') + parser.add_argument( + '--show', action='store_true', help='show prediction results') + parser.add_argument( + '--deploy', + action='store_true', + help='Switch model to deployment mode') + parser.add_argument( + '--show-dir', + help='directory where painted images will be saved. ' + 'If specified, it will be automatically saved ' + 'to the work_dir/timestamp/show_dir') + parser.add_argument( + '--wait-time', type=float, default=2, help='the interval of show (s)') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + parser.add_argument( + '--launcher', + choices=['none', 'pytorch', 'slurm', 'mpi'], + default='none', + help='job launcher') + # When using PyTorch version >= 2.0.0, the `torch.distributed.launch` + # will pass the `--local-rank` parameter to `tools/train.py` instead + # of `--local_rank`. + parser.add_argument('--local_rank', '--local-rank', type=int, default=0) + args = parser.parse_args() + if 'LOCAL_RANK' not in os.environ: + os.environ['LOCAL_RANK'] = str(args.local_rank) + return args + + +def main(): + args = parse_args() + + # Reduce the number of repeated compilations and improve + # training speed. 
+ setup_cache_size_limit_of_dynamo() + + # load config + cfg = Config.fromfile(args.config) + # replace the ${key} with the value of cfg.key + # cfg = replace_cfg_vals(cfg) + cfg.launcher = args.launcher + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + + # work_dir is determined in this priority: CLI > segment in file > filename + if args.work_dir is not None: + # update configs according to CLI args if args.work_dir is not None + cfg.work_dir = args.work_dir + elif cfg.get('work_dir', None) is None: + # use config filename as default work_dir if cfg.work_dir is None + cfg.work_dir = osp.join('./work_dirs', + osp.splitext(osp.basename(args.config))[0]) + + cfg.load_from = args.checkpoint + + if args.show or args.show_dir: + cfg = trigger_visualization_hook(cfg, args) + + if args.deploy: + cfg.custom_hooks.append(dict(type='SwitchToDeployHook')) + + # add `format_only` and `outfile_prefix` into cfg + if args.json_prefix is not None: + cfg_json = { + 'test_evaluator.format_only': True, + 'test_evaluator.outfile_prefix': args.json_prefix + } + cfg.merge_from_dict(cfg_json) + + # Determine whether the custom metainfo fields are all lowercase + is_metainfo_lower(cfg) + + if args.tta: + assert 'tta_model' in cfg, 'Cannot find ``tta_model`` in config.' \ + " Can't use tta !" + assert 'tta_pipeline' in cfg, 'Cannot find ``tta_pipeline`` ' \ + "in config. Can't use tta !" + + cfg.model = ConfigDict(**cfg.tta_model, module=cfg.model) + test_data_cfg = cfg.test_dataloader.dataset + while 'dataset' in test_data_cfg: + test_data_cfg = test_data_cfg['dataset'] + + # batch_shapes_cfg will force control the size of the output image, + # it is not compatible with tta. + if 'batch_shapes_cfg' in test_data_cfg: + test_data_cfg.batch_shapes_cfg = None + test_data_cfg.pipeline = cfg.tta_pipeline + + # build the runner from config + if 'runner_type' not in cfg: + # build the default runner + runner = Runner.from_cfg(cfg) + else: + # build customized runner from the registry + # if 'runner_type' is set in the cfg + runner = RUNNERS.build(cfg) + + # add `DumpResults` dummy metric + if args.out is not None: + assert args.out.endswith(('.pkl', '.pickle')), \ + 'The dump file must be a pkl file.' + runner.test_evaluator.metrics.append( + DumpResults(out_file_path=args.out)) + + # start testing + runner.test() + + +if __name__ == '__main__': + main() diff --git a/third_party/mmyolo/tools/train.py b/third_party/mmyolo/tools/train.py new file mode 100644 index 0000000000000000000000000000000000000000..61f94980d2236295c4ca317520842a53b1813f0a --- /dev/null +++ b/third_party/mmyolo/tools/train.py @@ -0,0 +1,123 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
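`tools/train.py` below is the matching training entry point: `--amp` swaps the optimizer wrapper to `AmpOptimWrapper` with a dynamic loss scale, and `--resume` without a value auto-resumes from the latest checkpoint in the work directory. An illustrative sketch with a placeholder config path, run locally and through the `slurm_train.sh` wrapper above:

```bash
# local training with mixed precision and auto-resume (config path is a placeholder)
python tools/train.py path/to/config.py --amp --resume

# Slurm cluster: partition, job name, config and work_dir are positional;
# GPUS and GPUS_PER_NODE are read from the environment (both default to 8)
GPUS=16 GPUS_PER_NODE=8 ./tools/slurm_train.sh my_partition train_job \
    path/to/config.py work_dirs/my_run
```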
+import argparse +import logging +import os +import os.path as osp + +from mmdet.utils import setup_cache_size_limit_of_dynamo +from mmengine.config import Config, DictAction +from mmengine.logging import print_log +from mmengine.runner import Runner + +from mmyolo.registry import RUNNERS +from mmyolo.utils import is_metainfo_lower + + +def parse_args(): + parser = argparse.ArgumentParser(description='Train a detector') + parser.add_argument('config', help='train config file path') + parser.add_argument('--work-dir', help='the dir to save logs and models') + parser.add_argument( + '--amp', + action='store_true', + default=False, + help='enable automatic-mixed-precision training') + parser.add_argument( + '--resume', + nargs='?', + type=str, + const='auto', + help='If specify checkpoint path, resume from it, while if not ' + 'specify, try to auto resume from the latest checkpoint ' + 'in the work directory.') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + parser.add_argument( + '--launcher', + choices=['none', 'pytorch', 'slurm', 'mpi'], + default='none', + help='job launcher') + # When using PyTorch version >= 2.0.0, the `torch.distributed.launch` + # will pass the `--local-rank` parameter to `tools/train.py` instead + # of `--local_rank`. + parser.add_argument('--local_rank', '--local-rank', type=int, default=0) + args = parser.parse_args() + if 'LOCAL_RANK' not in os.environ: + os.environ['LOCAL_RANK'] = str(args.local_rank) + + return args + + +def main(): + args = parse_args() + + # Reduce the number of repeated compilations and improve + # training speed. 
+ setup_cache_size_limit_of_dynamo() + + # load config + cfg = Config.fromfile(args.config) + # replace the ${key} with the value of cfg.key + # cfg = replace_cfg_vals(cfg) + cfg.launcher = args.launcher + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + + # work_dir is determined in this priority: CLI > segment in file > filename + if args.work_dir is not None: + # update configs according to CLI args if args.work_dir is not None + cfg.work_dir = args.work_dir + elif cfg.get('work_dir', None) is None: + # use config filename as default work_dir if cfg.work_dir is None + cfg.work_dir = osp.join('./work_dirs', + osp.splitext(osp.basename(args.config))[0]) + + # enable automatic-mixed-precision training + if args.amp is True: + optim_wrapper = cfg.optim_wrapper.type + if optim_wrapper == 'AmpOptimWrapper': + print_log( + 'AMP training is already enabled in your config.', + logger='current', + level=logging.WARNING) + else: + assert optim_wrapper == 'OptimWrapper', ( + '`--amp` is only supported when the optimizer wrapper type is ' + f'`OptimWrapper` but got {optim_wrapper}.') + cfg.optim_wrapper.type = 'AmpOptimWrapper' + cfg.optim_wrapper.loss_scale = 'dynamic' + + # resume is determined in this priority: resume from > auto_resume + if args.resume == 'auto': + cfg.resume = True + cfg.load_from = None + elif args.resume is not None: + cfg.resume = True + cfg.load_from = args.resume + + # Determine whether the custom metainfo fields are all lowercase + is_metainfo_lower(cfg) + + # build the runner from config + if 'runner_type' not in cfg: + # build the default runner + runner = Runner.from_cfg(cfg) + else: + # build customized runner from the registry + # if 'runner_type' is set in the cfg + runner = RUNNERS.build(cfg) + + # start training + runner.train() + + +if __name__ == '__main__': + main() diff --git a/tools/demo.py b/tools/demo.py new file mode 100644 index 0000000000000000000000000000000000000000..7732fd453c2db80fa18421d9bab9ff09e9cac3e3 --- /dev/null +++ b/tools/demo.py @@ -0,0 +1,224 @@ +# Copyright (c) Tencent Inc. All rights reserved. +import os +import argparse +import os.path as osp +from functools import partial +from io import BytesIO + +import onnx +import onnxsim +import torch +import gradio as gr +import numpy as np +from PIL import Image +from torchvision.ops import nms +from mmengine.config import Config, ConfigDict, DictAction +from mmengine.runner import Runner +from mmengine.runner.amp import autocast +from mmengine.dataset import Compose +from mmdet.visualization import DetLocalVisualizer +from mmdet.datasets import CocoDataset +from mmyolo.registry import RUNNERS + +from yolo_world.easydeploy.model import DeployModel, MMYOLOBackend + + +def parse_args(): + parser = argparse.ArgumentParser( + description='YOLO-World Demo') + parser.add_argument('config', help='test config file path') + parser.add_argument('checkpoint', help='checkpoint file') + parser.add_argument( + '--work-dir', + help='the directory to save the file containing evaluation metrics') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. 
key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + args = parser.parse_args() + return args + + +def run_image(runner, + image, + text, + max_num_boxes, + score_thr, + nms_thr, + image_path='./work_dirs/demo.png'): + image.save(image_path) + texts = [[t.strip()] for t in text.split(',')] + [[' ']] + data_info = dict(img_id=0, img_path=image_path, texts=texts) + data_info = runner.pipeline(data_info) + data_batch = dict(inputs=data_info['inputs'].unsqueeze(0), + data_samples=[data_info['data_samples']]) + + with autocast(enabled=False), torch.no_grad(): + output = runner.model.test_step(data_batch)[0] + pred_instances = output.pred_instances + + keep_idxs = nms(pred_instances.bboxes, + pred_instances.scores, + iou_threshold=nms_thr) + pred_instances = pred_instances[keep_idxs] + pred_instances = pred_instances[ + pred_instances.scores.float() > score_thr] + if len(pred_instances.scores) > max_num_boxes: + indices = pred_instances.scores.float().topk(max_num_boxes)[1] + pred_instances = pred_instances[indices] + output.pred_instances = pred_instances + + image = np.array(image) + visualizer = DetLocalVisualizer() + visualizer.dataset_meta['classes'] = [t[0] for t in texts] + visualizer.add_datasample('image', + np.array(image), + output, + draw_gt=False, + out_file=image_path, + pred_score_thr=score_thr) + image = Image.open(image_path) + return image + + +def export_model(runner, + checkpoint, + text, + max_num_boxes, + score_thr, + nms_thr): + backend = MMYOLOBackend.ONNXRUNTIME + postprocess_cfg = ConfigDict( + pre_top_k=10 * max_num_boxes, + keep_top_k=max_num_boxes, + iou_threshold=nms_thr, + score_threshold=score_thr) + + base_model = runner.model + texts = [[t.strip() for t in text.split(',')] + [' ']] + base_model.reparameterize(texts) + deploy_model = DeployModel( + baseModel=base_model, + backend=backend, + postprocess_cfg=postprocess_cfg) + deploy_model.eval() + + device = (next(iter(base_model.parameters()))).device + fake_input = torch.ones([1, 3, 640, 640], device=device) + # dry run + deploy_model(fake_input) + + save_onnx_path = os.path.join( + args.work_dir, + os.path.basename(args.checkpoint).replace('pth', 'onnx')) + # export onnx + with BytesIO() as f: + output_names = ['num_dets', 'boxes', 'scores', 'labels'] + torch.onnx.export( + deploy_model, + fake_input, + f, + input_names=['images'], + output_names=output_names, + opset_version=12) + f.seek(0) + onnx_model = onnx.load(f) + onnx.checker.check_model(onnx_model) + onnx_model, check = onnxsim.simplify(onnx_model) + onnx.save(onnx_model, save_onnx_path) + return gr.update(visible=True), save_onnx_path + + +def demo(runner, args): + with gr.Blocks(title="YOLO-World") as demo: + with gr.Row(): + gr.Markdown('

YOLO-World: Real-Time Open-Vocabulary ' + 'Object Detector

') + with gr.Row(): + with gr.Column(scale=0.3): + with gr.Row(): + image = gr.Image(type='pil', label='input image') + input_text = gr.Textbox( + lines=7, + label='Enter the classes to be detected, ' + 'separated by comma', + value=', '.join(CocoDataset.METAINFO['classes']), + elem_id='textbox') + with gr.Row(): + submit = gr.Button('Submit') + clear = gr.Button('Clear') + with gr.Row(): + export = gr.Button('Deploy and Export ONNX Model') + out_download = gr.File(lines=1, visible=False) + max_num_boxes = gr.Slider( + minimum=1, + maximum=300, + value=100, + step=1, + interactive=True, + label='Maximum Number Boxes') + score_thr = gr.Slider( + minimum=0, + maximum=1, + value=0.05, + step=0.001, + interactive=True, + label='Score Threshold') + nms_thr = gr.Slider( + minimum=0, + maximum=1, + value=0.7, + step=0.001, + interactive=True, + label='NMS Threshold') + with gr.Column(scale=0.7): + output_image = gr.Image( + lines=20, + type='pil', + label='output image') + + submit.click(partial(run_image, runner), + [image, input_text, max_num_boxes, + score_thr, nms_thr], + [output_image]) + clear.click(lambda: [[], '', ''], None, + [image, input_text, output_image]) + export.click(partial(export_model, runner, args.checkpoint), + [input_text, max_num_boxes, score_thr, nms_thr], + [out_download, out_download]) + demo.launch(server_name='0.0.0.0', server_port=80) + + +if __name__ == '__main__': + args = parse_args() + + # load config + cfg = Config.fromfile(args.config) + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + + if args.work_dir is not None: + cfg.work_dir = args.work_dir + elif cfg.get('work_dir', None) is None: + cfg.work_dir = osp.join('./work_dirs', + osp.splitext(osp.basename(args.config))[0]) + + cfg.load_from = args.checkpoint + + if 'runner_type' not in cfg: + runner = Runner.from_cfg(cfg) + else: + runner = RUNNERS.build(cfg) + + runner.call_hook('before_run') + runner.load_or_resume() + pipeline = cfg.test_dataloader.dataset.pipeline + runner.pipeline = Compose(pipeline) + runner.model.eval() + demo(runner, args) diff --git a/tools/deploy.py b/tools/deploy.py new file mode 100644 index 0000000000000000000000000000000000000000..0e6c9c3d95589e7af3f72e89b41dc80cd6a76dd8 --- /dev/null +++ b/tools/deploy.py @@ -0,0 +1,335 @@ +# Copyright (c) Tencent Inc. All rights reserved. 
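+# Thin wrapper over MMDeploy's conversion pipeline (PyTorch -> ONNX/TorchScript
+# -> backend engine). Illustrative invocation; the two configs and the
+# checkpoint below are placeholders:
+#   python tools/deploy.py deploy_cfg.py model_cfg.py yolo_world.pth demo.jpg \
+#       --work-dir work_dirs/deploy --device cuda:0 --dump-info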
+import argparse +import logging +import os +import os.path as osp +from functools import partial + +import mmengine +import torch.multiprocessing as mp +from torch.multiprocessing import Process, set_start_method + +from mmdeploy.apis import (create_calib_input_data, extract_model, + get_predefined_partition_cfg, torch2onnx, + torch2torchscript, visualize_model) +from mmdeploy.apis.core import PIPELINE_MANAGER +from mmdeploy.apis.utils import to_backend +from mmdeploy.backend.sdk.export_info import export2SDK +from mmdeploy.utils import (IR, Backend, get_backend, get_calib_filename, + get_ir_config, get_partition_config, + get_root_logger, load_config, target_wrapper) + + +def parse_args(): + parser = argparse.ArgumentParser(description='Export model to backends.') + parser.add_argument('deploy_cfg', help='deploy config path') + parser.add_argument('model_cfg', help='model config path') + parser.add_argument('checkpoint', help='model checkpoint path') + parser.add_argument('img', help='image used to convert model model') + parser.add_argument( + '--test-img', + default=None, + type=str, + nargs='+', + help='image used to test model') + parser.add_argument( + '--work-dir', + default=os.getcwd(), + help='the dir to save logs and models') + parser.add_argument( + '--calib-dataset-cfg', + help='dataset config path used to calibrate in int8 mode. If not \ + specified, it will use "val" dataset in model config instead.', + default=None) + parser.add_argument( + '--device', help='device used for conversion', default='cpu') + parser.add_argument( + '--log-level', + help='set log level', + default='INFO', + choices=list(logging._nameToLevel.keys())) + parser.add_argument( + '--show', action='store_true', help='Show detection outputs') + parser.add_argument( + '--dump-info', action='store_true', help='Output information for SDK') + parser.add_argument( + '--quant-image-dir', + default=None, + help='Image directory for quantize model.') + parser.add_argument( + '--quant', action='store_true', help='Quantize model to low bit.') + parser.add_argument( + '--uri', + default='192.168.1.1:60000', + help='Remote ipv4:port or ipv6:port for inference on edge device.') + args = parser.parse_args() + return args + + +def create_process(name, target, args, kwargs, ret_value=None): + logger = get_root_logger() + logger.info(f'{name} start.') + log_level = logger.level + + wrap_func = partial(target_wrapper, target, log_level, ret_value) + + process = Process(target=wrap_func, args=args, kwargs=kwargs) + process.start() + process.join() + + if ret_value is not None: + if ret_value.value != 0: + logger.error(f'{name} failed.') + exit(1) + else: + logger.info(f'{name} success.') + + +def torch2ir(ir_type: IR): + """Return the conversion function from torch to the intermediate + representation. + + Args: + ir_type (IR): The type of the intermediate representation. 
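+
+    Returns:
+        Callable: ``torch2onnx`` or ``torch2torchscript``.
+
+    Raises:
+        KeyError: If ``ir_type`` is not a supported IR.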
+ """ + if ir_type == IR.ONNX: + return torch2onnx + elif ir_type == IR.TORCHSCRIPT: + return torch2torchscript + else: + raise KeyError(f'Unexpected IR type {ir_type}') + + +def main(): + args = parse_args() + set_start_method('spawn', force=True) + logger = get_root_logger() + log_level = logging.getLevelName(args.log_level) + logger.setLevel(log_level) + + pipeline_funcs = [ + torch2onnx, torch2torchscript, extract_model, create_calib_input_data + ] + PIPELINE_MANAGER.enable_multiprocess(True, pipeline_funcs) + PIPELINE_MANAGER.set_log_level(log_level, pipeline_funcs) + + deploy_cfg_path = args.deploy_cfg + model_cfg_path = args.model_cfg + checkpoint_path = args.checkpoint + quant = args.quant + quant_image_dir = args.quant_image_dir + + # load deploy_cfg + deploy_cfg, model_cfg = load_config(deploy_cfg_path, model_cfg_path) + + # create work_dir if not + mmengine.mkdir_or_exist(osp.abspath(args.work_dir)) + + if args.dump_info: + export2SDK( + deploy_cfg, + model_cfg, + args.work_dir, + pth=checkpoint_path, + device=args.device) + + ret_value = mp.Value('d', 0, lock=False) + + # convert to IR + ir_config = get_ir_config(deploy_cfg) + ir_save_file = ir_config['save_file'] + ir_type = IR.get(ir_config['type']) + torch2ir(ir_type)( + args.img, + args.work_dir, + ir_save_file, + deploy_cfg_path, + model_cfg_path, + checkpoint_path, + device=args.device) + + # convert backend + ir_files = [osp.join(args.work_dir, ir_save_file)] + + # partition model + partition_cfgs = get_partition_config(deploy_cfg) + + if partition_cfgs is not None: + + if 'partition_cfg' in partition_cfgs: + partition_cfgs = partition_cfgs.get('partition_cfg', None) + else: + assert 'type' in partition_cfgs + partition_cfgs = get_predefined_partition_cfg( + deploy_cfg, partition_cfgs['type']) + + origin_ir_file = ir_files[0] + ir_files = [] + for partition_cfg in partition_cfgs: + save_file = partition_cfg['save_file'] + save_path = osp.join(args.work_dir, save_file) + start = partition_cfg['start'] + end = partition_cfg['end'] + dynamic_axes = partition_cfg.get('dynamic_axes', None) + + extract_model( + origin_ir_file, + start, + end, + dynamic_axes=dynamic_axes, + save_file=save_path) + + ir_files.append(save_path) + + # calib data + calib_filename = get_calib_filename(deploy_cfg) + if calib_filename is not None: + calib_path = osp.join(args.work_dir, calib_filename) + create_calib_input_data( + calib_path, + deploy_cfg_path, + model_cfg_path, + checkpoint_path, + dataset_cfg=args.calib_dataset_cfg, + dataset_type='val', + device=args.device) + + backend_files = ir_files + # convert backend + backend = get_backend(deploy_cfg) + + # preprocess deploy_cfg + if backend == Backend.RKNN: + # TODO: Add this to task_processor in the future + import tempfile + + from mmdeploy.utils import (get_common_config, get_normalization, + get_quantization_config, + get_rknn_quantization) + quantization_cfg = get_quantization_config(deploy_cfg) + common_params = get_common_config(deploy_cfg) + if get_rknn_quantization(deploy_cfg) is True: + transform = get_normalization(model_cfg) + common_params.update( + dict( + mean_values=[transform['mean']], + std_values=[transform['std']])) + + dataset_file = tempfile.NamedTemporaryFile(suffix='.txt').name + with open(dataset_file, 'w') as f: + f.writelines([osp.abspath(args.img)]) + if quantization_cfg.get('dataset', None) is None: + quantization_cfg['dataset'] = dataset_file + if backend == Backend.ASCEND: + # TODO: Add this to backend manager in the future + if args.dump_info: + from 
mmdeploy.backend.ascend import update_sdk_pipeline + update_sdk_pipeline(args.work_dir) + + if backend == Backend.VACC: + # TODO: Add this to task_processor in the future + + from onnx2vacc_quant_dataset import get_quant + + from mmdeploy.utils import get_model_inputs + + deploy_cfg, model_cfg = load_config(deploy_cfg_path, model_cfg_path) + model_inputs = get_model_inputs(deploy_cfg) + + for onnx_path, model_input in zip(ir_files, model_inputs): + + quant_mode = model_input.get('qconfig', {}).get('dtype', 'fp16') + assert quant_mode in ['int8', + 'fp16'], quant_mode + ' not support now' + shape_dict = model_input.get('shape', {}) + + if quant_mode == 'int8': + create_process( + 'vacc quant dataset', + target=get_quant, + args=(deploy_cfg, model_cfg, shape_dict, checkpoint_path, + args.work_dir, args.device), + kwargs=dict(), + ret_value=ret_value) + + # convert to backend + PIPELINE_MANAGER.set_log_level(log_level, [to_backend]) + if backend == Backend.TENSORRT: + PIPELINE_MANAGER.enable_multiprocess(True, [to_backend]) + backend_files = to_backend( + backend, + ir_files, + work_dir=args.work_dir, + deploy_cfg=deploy_cfg, + log_level=log_level, + device=args.device, + uri=args.uri) + + # ncnn quantization + if backend == Backend.NCNN and quant: + from onnx2ncnn_quant_table import get_table + + from mmdeploy.apis.ncnn import get_quant_model_file, ncnn2int8 + model_param_paths = backend_files[::2] + model_bin_paths = backend_files[1::2] + backend_files = [] + for onnx_path, model_param_path, model_bin_path in zip( + ir_files, model_param_paths, model_bin_paths): + + deploy_cfg, model_cfg = load_config(deploy_cfg_path, + model_cfg_path) + quant_onnx, quant_table, quant_param, quant_bin = get_quant_model_file( # noqa: E501 + onnx_path, args.work_dir) + + create_process( + 'ncnn quant table', + target=get_table, + args=(onnx_path, deploy_cfg, model_cfg, quant_onnx, + quant_table, quant_image_dir, args.device), + kwargs=dict(), + ret_value=ret_value) + + create_process( + 'ncnn_int8', + target=ncnn2int8, + args=(model_param_path, model_bin_path, quant_table, + quant_param, quant_bin), + kwargs=dict(), + ret_value=ret_value) + backend_files += [quant_param, quant_bin] + + if args.test_img is None: + args.test_img = args.img + + extra = dict( + backend=backend, + output_file=osp.join(args.work_dir, f'output_{backend.value}.jpg'), + show_result=args.show) + if backend == Backend.SNPE: + extra['uri'] = args.uri + + # get backend inference result, try render + create_process( + f'visualize {backend.value} model', + target=visualize_model, + args=(model_cfg_path, deploy_cfg_path, backend_files, args.test_img, + args.device), + kwargs=extra, + ret_value=ret_value) + + # get pytorch model inference result, try visualize if possible + create_process( + 'visualize pytorch model', + target=visualize_model, + args=(model_cfg_path, deploy_cfg_path, [checkpoint_path], + args.test_img, args.device), + kwargs=dict( + backend=Backend.PYTORCH, + output_file=osp.join(args.work_dir, 'output_pytorch.jpg'), + show_result=args.show), + ret_value=ret_value) + logger.info('All process success.') + + +if __name__ == '__main__': + main() diff --git a/tools/deploy_convert.py b/tools/deploy_convert.py new file mode 100644 index 0000000000000000000000000000000000000000..5d5dbe95b347d553a0b913ca640c9c21455a6f57 --- /dev/null +++ b/tools/deploy_convert.py @@ -0,0 +1,118 @@ +# Copyright (c) Lin Song. All rights reserved. 
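+# Offline re-parameterization helper: the class texts are encoded once with the
+# text encoder, and the resulting embeddings (scaled by the exponentiated
+# logit_scale of each contrastive head) are folded into 1x1 conv weights, so
+# the converted checkpoint runs without the text branch at inference time.
+# Illustrative usage (all file names are placeholders; the caption JSON is
+# expected to be a list whose entries start with the class name, see
+# get_caption_embed below):
+#   python tools/deploy_convert.py path/to/config.py yolo_world.pth \
+#       class_texts.json reparameterized.pth --prompt-template 'a photo of {}'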
+import os
+import json
+import argparse
+import os.path as osp
+
+import torch
+import torch.nn.functional as F
+from mmengine.config import Config, DictAction
+from mmengine.runner import Runner
+from mmengine.dataset import Compose
+from mmyolo.registry import RUNNERS
+
+
+def get_caption_embed(runner, caption, prompt_template):
+    captions = json.load(open(caption, 'r'))
+    captions = [[prompt_template.format(c[0])] for c in captions]
+    with torch.no_grad():
+        embed = runner.model.backbone.text_model(captions)
+        embed = F.normalize(embed[:, 0, :], dim=1, p=2)
+        embed = embed.detach().cpu()
+        embed = embed[:, :, None, None]
+    return embed
+
+
+def convert(runner, caption, checkpoint, prompt_template):
+    checkpoint = torch.load(checkpoint, map_location='cpu')
+    state_dict = checkpoint['state_dict']
+    embed = get_caption_embed(runner, caption, prompt_template)
+
+    new_state_dict = {}
+    for key in list(state_dict.keys()):
+        if key.startswith('backbone.text_model'):
+            continue
+        elif key.startswith('backbone.image_model'):
+            new_key = key.replace('backbone.image_model', 'backbone')
+            new_state_dict[new_key] = state_dict[key].clone()
+        elif key.startswith('bbox_head.head_module.cls_contrasts'):
+            module_key = '.'.join(key.split('.')[:4])
+            logit_scale = state_dict[module_key + '.logit_scale']
+            bias = state_dict[module_key + '.bias']
+            conv_weight = embed * logit_scale.exp()
+            conv_bias = bias.repeat(conv_weight.shape[0])
+            new_state_dict[module_key + '.conv.weight'] = conv_weight
+            new_state_dict[module_key + '.conv.bias'] = conv_bias
+        else:
+            new_state_dict[key] = state_dict[key].clone()
+
+    new_checkpoint = {'state_dict': new_state_dict}
+    return new_checkpoint
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('config', type=str)
+    parser.add_argument('checkpoint', type=str)
+    parser.add_argument('caption', type=str)
+    parser.add_argument('output', type=str)
+    parser.add_argument('--prompt-template', type=str,
+                        default='{}')
+    parser.add_argument(
+        '--work-dir',
+        help='the directory to save the file containing evaluation metrics')
+    parser.add_argument(
+        '--cfg-options',
+        nargs='+',
+        action=DictAction,
+        help='override some settings in the used config, the key-value pair '
+        'in xxx=yyy format will be merged into config file. If the value to '
+        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
+        'It also allows nested list/tuple values, e.g. 
key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + args = parser.parse_args() + return args + + +if __name__ == '__main__': + args = parse_args() + + # load config + cfg = Config.fromfile(args.config) + # replace the ${key} with the value of cfg.key + # cfg = replace_cfg_vals(cfg) + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + + # work_dir is determined in this priority: CLI > segment in file > filename + if args.work_dir is not None: + # update configs according to CLI args if args.work_dir is not None + cfg.work_dir = args.work_dir + elif cfg.get('work_dir', None) is None: + # use config filename as default work_dir if cfg.work_dir is None + cfg.work_dir = osp.join('./work_dirs', + osp.splitext(osp.basename(args.config))[0]) + + cfg.load_from = args.checkpoint + + # build the runner from config + if 'runner_type' not in cfg: + # build the default runner + runner = Runner.from_cfg(cfg) + else: + # build customized runner from the registry + # if 'runner_type' is set in the cfg + runner = RUNNERS.build(cfg) + + runner.call_hook('before_run') + runner.load_or_resume() + pipeline = cfg.test_dataloader.dataset.pipeline + runner.pipeline = Compose(pipeline) + runner.model.eval() + + new_checkpoint = convert(runner, args.caption, args.checkpoint, + args.prompt_template) + os.makedirs(os.path.dirname(args.output), exist_ok=True) + torch.save(new_checkpoint, args.output) diff --git a/tools/deploy_test.py b/tools/deploy_test.py new file mode 100644 index 0000000000000000000000000000000000000000..4657d517b4b062e6ad098886bda16ca5552ca4ae --- /dev/null +++ b/tools/deploy_test.py @@ -0,0 +1,159 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import os.path as osp +from copy import deepcopy + +from mmengine import DictAction + +from mmdeploy.apis import build_task_processor +from mmdeploy.utils.config_utils import load_config +from mmdeploy.utils.timer import TimeCounter + + +def parse_args(): + parser = argparse.ArgumentParser( + description='MMDeploy test (and eval) a backend.') + parser.add_argument('deploy_cfg', help='Deploy config path') + parser.add_argument('model_cfg', help='Model config path') + parser.add_argument( + '--model', type=str, nargs='+', help='Input model files.') + parser.add_argument( + '--device', help='device used for conversion', default='cpu') + parser.add_argument( + '--work-dir', + default='./work_dir', + help='the directory to save the file containing evaluation metrics') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + parser.add_argument('--show', action='store_true', help='show results') + parser.add_argument( + '--show-dir', help='directory where painted images will be saved') + parser.add_argument( + '--interval', + type=int, + default=1, + help='visualize per interval samples.') + parser.add_argument( + '--wait-time', + type=float, + default=2, + help='display time of every window. 
(second)') + parser.add_argument( + '--log2file', + type=str, + help='log evaluation results and speed to file', + default=None) + parser.add_argument( + '--speed-test', action='store_true', help='activate speed test') + parser.add_argument( + '--warmup', + type=int, + help='warmup before counting inference elapse, require setting ' + 'speed-test first', + default=10) + parser.add_argument( + '--log-interval', + type=int, + help='the interval between each log, require setting ' + 'speed-test first', + default=100) + parser.add_argument( + '--batch-size', + type=int, + default=1, + help='the batch size for test, would override `samples_per_gpu`' + 'in data config.') + parser.add_argument( + '--uri', + action='store_true', + default='192.168.1.1:60000', + help='Remote ipv4:port or ipv6:port for inference on edge device.') + + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + deploy_cfg_path = args.deploy_cfg + model_cfg_path = args.model_cfg + + # load deploy_cfg + deploy_cfg, model_cfg = load_config(deploy_cfg_path, model_cfg_path) + + # work_dir is determined in this priority: CLI > segment in file > filename + if args.work_dir is not None: + # update configs according to CLI args if args.work_dir is not None + work_dir = args.work_dir + elif model_cfg.get('work_dir', None) is None: + # use config filename as default work_dir if cfg.work_dir is None + work_dir = osp.join('./work_dirs', + osp.splitext(osp.basename(args.config))[0]) + + # merge options for model cfg + if args.cfg_options is not None: + model_cfg.merge_from_dict(args.cfg_options) + + task_processor = build_task_processor(model_cfg, deploy_cfg, args.device) + + # prepare the dataset loader + test_dataloader = deepcopy(model_cfg['test_dataloader']) + if isinstance(test_dataloader, list): + dataset = [] + for loader in test_dataloader: + ds = task_processor.build_dataset(loader['dataset']) + dataset.append(ds) + loader['dataset'] = ds + loader['batch_size'] = args.batch_size + loader = task_processor.build_dataloader(loader) + dataloader = test_dataloader + else: + test_dataloader['batch_size'] = args.batch_size + dataset = task_processor.build_dataset(test_dataloader['dataset']) + test_dataloader['dataset'] = dataset + dataloader = task_processor.build_dataloader(test_dataloader) + + # load the model of the backend + model = task_processor.build_backend_model( + args.model, + data_preprocessor_updater=task_processor.update_data_preprocessor) + destroy_model = model.destroy + is_device_cpu = (args.device == 'cpu') + + runner = task_processor.build_test_runner( + model, + work_dir, + log_file=args.log2file, + show=args.show, + show_dir=args.show_dir, + wait_time=args.wait_time, + interval=args.interval, + dataloader=dataloader) + + if args.speed_test: + with_sync = not is_device_cpu + + with TimeCounter.activate( + warmup=args.warmup, + log_interval=args.log_interval, + with_sync=with_sync, + file=args.log2file, + batch_size=args.batch_size): + runner.test() + + else: + runner.test() + # only effective when the backend requires explicit clean-up (e.g. 
Ascend) + destroy_model() + + +if __name__ == '__main__': + main() diff --git a/tools/dist_test.sh b/tools/dist_test.sh new file mode 100755 index 0000000000000000000000000000000000000000..dea131b43ea8f1222661d20603d40c18ea7f28a1 --- /dev/null +++ b/tools/dist_test.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash + +CONFIG=$1 +CHECKPOINT=$2 +GPUS=$3 +NNODES=${NNODES:-1} +NODE_RANK=${NODE_RANK:-0} +PORT=${PORT:-29500} +MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} + +PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ +python -m torch.distributed.launch \ + --nnodes=$NNODES \ + --node_rank=$NODE_RANK \ + --master_addr=$MASTER_ADDR \ + --nproc_per_node=$GPUS \ + --master_port=$PORT \ + $(dirname "$0")/test.py \ + $CONFIG \ + $CHECKPOINT \ + --launcher pytorch \ + ${@:4} diff --git a/tools/dist_train.sh b/tools/dist_train.sh new file mode 100755 index 0000000000000000000000000000000000000000..ea56f698d1f00b992eec8d481c75b273d202acf5 --- /dev/null +++ b/tools/dist_train.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash + +CONFIG=$1 +GPUS=$2 +NNODES=${NNODES:-1} +NODE_RANK=${NODE_RANK:-0} +PORT=${MASTER_PORT:-29500} +MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} + +PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ +python -m torch.distributed.launch \ + --nnodes=$NNODES \ + --node_rank=$NODE_RANK \ + --master_addr=$MASTER_ADDR \ + --nproc_per_node=$GPUS \ + --master_port=$PORT \ + $(dirname "$0")/train.py \ + $CONFIG \ + --launcher pytorch ${@:3} diff --git a/tools/dockerfiles/Dockerfile_deployment b/tools/dockerfiles/Dockerfile_deployment new file mode 100644 index 0000000000000000000000000000000000000000..17a48cf05c8bd316d89ea602c3cfcd381bd00788 --- /dev/null +++ b/tools/dockerfiles/Dockerfile_deployment @@ -0,0 +1,64 @@ +FROM nvcr.io/nvidia/pytorch:22.04-py3 + +WORKDIR /openmmlab +ARG ONNXRUNTIME_VERSION=1.8.1 +ENV DEBIAN_FRONTEND=noninteractive \ + APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=DontWarn \ + FORCE_CUDA="1" + +# Install ZSH +RUN apt-get update \ + && apt-get install -y zsh \ + && wget https://github.com/robbyrussell/oh-my-zsh/raw/master/tools/install.sh -O - | zsh || true + +# Install Prerequisites +RUN apt-get update \ + && apt-get install -y git vim wget +RUN pip install -U ipdb pip + +# Install ONNXRUNTIME +RUN wget -q https://github.com/microsoft/onnxruntime/releases/download/v${ONNXRUNTIME_VERSION}/onnxruntime-linux-x64-${ONNXRUNTIME_VERSION}.tgz \ + && tar -zxvf onnxruntime-linux-x64-${ONNXRUNTIME_VERSION}.tgz \ + && pip install --no-cache-dir onnxruntime-gpu==${ONNXRUNTIME_VERSION} \ + && pip install pycuda + +# Install OpenMMLab +RUN pip install --no-cache-dir openmim \ + && mim install --no-cache-dir "mmengine>=0.6.0" "mmdet>=3.0.0,<4.0.0" \ + && mim install --no-cache-dir opencv-python==4.5.5.64 opencv-python-headless==4.5.5.64 + +# Install MMCV +RUN git clone https://github.com/open-mmlab/mmcv.git -b 2.x mmcv \ + && cd mmcv \ + && mim install --no-cache-dir -r requirements/optional.txt \ + && MMCV_WITH_OPS=1 mim install --no-cache-dir -e . -v \ + && cd .. + +# Install MMYOLO +RUN git clone https://github.com/open-mmlab/mmyolo.git -b dev mmyolo \ + && cd mmyolo \ + && mim install --no-cache-dir -e . \ + && cd .. 
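+
+# NOTE: ONNXRUNTIME_DIR below must match the directory extracted from the
+# onnxruntime tarball above; TENSORRT_DIR/CUDNN_DIR are assumed to be provided
+# by the NGC base image.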
+ +# Install MMDEPLOY +ENV ONNXRUNTIME_DIR=/openmmlab/onnxruntime-linux-x64-${ONNXRUNTIME_VERSION} \ + TENSORRT_DIR=/usr/lib/x86_64-linux-gnu \ + CUDNN_DIR=/usr/lib/x86_64-linux-gnu + +RUN git clone https://github.com/open-mmlab/mmdeploy -b dev-1.x mmdeploy \ + && cd mmdeploy \ + && git submodule update --init --recursive \ + && mkdir -p build \ + && cd build \ + && cmake -DMMDEPLOY_TARGET_BACKENDS="ort;trt" -DONNXRUNTIME_DIR=${ONNXRUNTIME_DIR} -DTENSORRT_DIR=${TENSORRT_DIR} -DCUDNN_DIR=${CUDNN_DIR} .. \ + && make -j$(nproc) \ + && make install \ + && cd .. \ + && mim install --no-cache-dir -e . + +# RUN apt-get install libopencv-dev -y + +# Fix undefined symbol bug +RUN echo -e "\nexport LD_LIBRARY_PATH=${ONNXRUNTIME_DIR}/lib:${TENSORRT_DIR}/lib:${CUDNN_DIR}/lib64:${LD_LIBRARY_PATH}\nldconfig" >> /root/.bashrc +# RUN apt-get update -y \ +# && apt-get install -y iputils-ping \ No newline at end of file diff --git a/tools/dockerfiles/Dockerfile_runtime b/tools/dockerfiles/Dockerfile_runtime new file mode 100644 index 0000000000000000000000000000000000000000..3fc814bf78ae2344b867c8bb5197add9d3290c68 --- /dev/null +++ b/tools/dockerfiles/Dockerfile_runtime @@ -0,0 +1,33 @@ +ARG PYTORCH="1.9.0" +ARG CUDA="11.1" +ARG CUDNN="8" + +FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel + +ENV TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0 7.5 8.0 8.6+PTX" \ + TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \ + CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" \ + FORCE_CUDA="1" + +RUN rm /etc/apt/sources.list.d/cuda.list \ + && rm /etc/apt/sources.list.d/nvidia-ml.list \ + && apt-key del 7fa2af80 \ + && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub \ + && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub + +# (Optional) +# RUN sed -i 's/http:\/\/archive.ubuntu.com\/ubuntu\//http:\/\/mirrors.aliyun.com\/ubuntu\//g' /etc/apt/sources.list && \ +# pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple + +RUN apt-get update \ + && apt-get install -y ffmpeg libsm6 libxext6 git ninja-build libglib2.0-0 libxrender-dev \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +# Install MMEngine, MMCV, MMDet and MMYolo +RUN pip install --no-cache-dir openmim && \ + mim install --no-cache-dir "mmengine>=0.6.0" "mmcv>=2.0.0rc4,<2.1.0" "mmdet>=3.0.0,<4.0.0" \ + mim install --no-cache-dir "mmyolo>=0.6.0" + +# Install other requirements +RUN pip install --no-cache-dir transformers tokenizer gradio sentencepiece diff --git a/tools/test.py b/tools/test.py new file mode 100644 index 0000000000000000000000000000000000000000..c05defe3c70a4cf4b8775a98bb89a84b7faba63a --- /dev/null +++ b/tools/test.py @@ -0,0 +1,150 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
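+# Illustrative evaluation command (config/checkpoint paths are placeholders):
+#   python tools/test.py path/to/config.py yolo_world.pth \
+#       --work-dir work_dirs/eval --out results.pkl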
+import argparse +import os +import os.path as osp + +from mmdet.engine.hooks.utils import trigger_visualization_hook +from mmengine.config import Config, ConfigDict, DictAction +from mmengine.evaluator import DumpResults +from mmengine.runner import Runner + +from mmyolo.registry import RUNNERS +from mmyolo.utils import is_metainfo_lower + + +# TODO: support fuse_conv_bn +def parse_args(): + parser = argparse.ArgumentParser( + description='MMYOLO test (and eval) a model') + parser.add_argument('config', help='test config file path') + parser.add_argument('checkpoint', help='checkpoint file') + parser.add_argument( + '--work-dir', + help='the directory to save the file containing evaluation metrics') + parser.add_argument( + '--out', + type=str, + help='output result file (must be a .pkl file) in pickle format') + parser.add_argument( + '--json-prefix', + type=str, + help='the prefix of the output json file without perform evaluation, ' + 'which is useful when you want to format the result to a specific ' + 'format and submit it to the test server') + parser.add_argument( + '--tta', + action='store_true', + help='Whether to use test time augmentation') + parser.add_argument( + '--show', action='store_true', help='show prediction results') + parser.add_argument( + '--deploy', + action='store_true', + help='Switch model to deployment mode') + parser.add_argument( + '--show-dir', + help='directory where painted images will be saved. ' + 'If specified, it will be automatically saved ' + 'to the work_dir/timestamp/show_dir') + parser.add_argument( + '--wait-time', type=float, default=2, help='the interval of show (s)') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. 
key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + parser.add_argument( + '--launcher', + choices=['none', 'pytorch', 'slurm', 'mpi'], + default='none', + help='job launcher') + parser.add_argument('--local_rank', type=int, default=0) + args = parser.parse_args() + if 'LOCAL_RANK' not in os.environ: + os.environ['LOCAL_RANK'] = str(args.local_rank) + return args + + +def main(): + args = parse_args() + + # load config + cfg = Config.fromfile(args.config) + # replace the ${key} with the value of cfg.key + # cfg = replace_cfg_vals(cfg) + cfg.launcher = args.launcher + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + + # work_dir is determined in this priority: CLI > segment in file > filename + if args.work_dir is not None: + # update configs according to CLI args if args.work_dir is not None + cfg.work_dir = args.work_dir + elif cfg.get('work_dir', None) is None: + # use config filename as default work_dir if cfg.work_dir is None + cfg.work_dir = osp.join('./work_dirs', + osp.splitext(osp.basename(args.config))[0]) + + cfg.load_from = args.checkpoint + + if args.show or args.show_dir: + cfg = trigger_visualization_hook(cfg, args) + + if args.deploy: + cfg.custom_hooks.append(dict(type='SwitchToDeployHook')) + + # add `format_only` and `outfile_prefix` into cfg + if args.json_prefix is not None: + cfg_json = { + 'test_evaluator.format_only': True, + 'test_evaluator.outfile_prefix': args.json_prefix + } + cfg.merge_from_dict(cfg_json) + + # Determine whether the custom metainfo fields are all lowercase + is_metainfo_lower(cfg) + + if args.tta: + assert 'tta_model' in cfg, 'Cannot find ``tta_model`` in config.' \ + " Can't use tta !" + assert 'tta_pipeline' in cfg, 'Cannot find ``tta_pipeline`` ' \ + "in config. Can't use tta !" + + cfg.model = ConfigDict(**cfg.tta_model, module=cfg.model) + test_data_cfg = cfg.test_dataloader.dataset + while 'dataset' in test_data_cfg: + test_data_cfg = test_data_cfg['dataset'] + + # batch_shapes_cfg will force control the size of the output image, + # it is not compatible with tta. + if 'batch_shapes_cfg' in test_data_cfg: + test_data_cfg.batch_shapes_cfg = None + test_data_cfg.pipeline = cfg.tta_pipeline + + # build the runner from config + if 'runner_type' not in cfg: + # build the default runner + runner = Runner.from_cfg(cfg) + else: + # build customized runner from the registry + # if 'runner_type' is set in the cfg + runner = RUNNERS.build(cfg) + + # add `DumpResults` dummy metric + if args.out is not None: + assert args.out.endswith(('.pkl', '.pickle')), \ + 'The dump file must be a pkl file.' + runner.test_evaluator.metrics.append( + DumpResults(out_file_path=args.out)) + + # start testing + runner.test() + + +if __name__ == '__main__': + main() diff --git a/tools/train.py b/tools/train.py new file mode 100644 index 0000000000000000000000000000000000000000..f634972af714badd6c501218e4774df58275d0d1 --- /dev/null +++ b/tools/train.py @@ -0,0 +1,120 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import argparse +import logging +import os +import os.path as osp + +from mmengine.config import Config, DictAction +from mmengine.logging import print_log +from mmengine.runner import Runner + +from mmyolo.registry import RUNNERS +from mmyolo.utils import is_metainfo_lower + + +def parse_args(): + parser = argparse.ArgumentParser(description='Train a detector') + parser.add_argument('config', help='train config file path') + parser.add_argument('--work-dir', help='the dir to save logs and models') + parser.add_argument( + '--amp', + action='store_true', + default=False, + help='enable automatic-mixed-precision training') + parser.add_argument( + '--resume', + nargs='?', + type=str, + const='auto', + help='If specify checkpoint path, resume from it, while if not ' + 'specify, try to auto resume from the latest checkpoint ' + 'in the work directory.') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + parser.add_argument( + '--launcher', + choices=['none', 'pytorch', 'slurm', 'mpi'], + default='none', + help='job launcher') + parser.add_argument('--local_rank', type=int, default=0) + args = parser.parse_args() + if 'LOCAL_RANK' not in os.environ: + os.environ['LOCAL_RANK'] = str(args.local_rank) + + return args + + +def main(): + args = parse_args() + + # load config + cfg = Config.fromfile(args.config) + # replace the ${key} with the value of cfg.key + # cfg = replace_cfg_vals(cfg) + cfg.launcher = args.launcher + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + + # work_dir is determined in this priority: CLI > segment in file > filename + if args.work_dir is not None: + # update configs according to CLI args if args.work_dir is not None + cfg.work_dir = args.work_dir + elif cfg.get('work_dir', None) is None: + # use config filename as default work_dir if cfg.work_dir is None + if args.config.startswith('projects/'): + config = args.config[len('projects/'):] + config = config.replace('/configs/', '/') + cfg.work_dir = osp.join('./work_dirs', osp.splitext(config)[0]) + else: + cfg.work_dir = osp.join('./work_dirs', + osp.splitext(osp.basename(args.config))[0]) + + # enable automatic-mixed-precision training + if args.amp is True: + optim_wrapper = cfg.optim_wrapper.type + if optim_wrapper == 'AmpOptimWrapper': + print_log( + 'AMP training is already enabled in your config.', + logger='current', + level=logging.WARNING) + else: + assert optim_wrapper == 'OptimWrapper', ( + '`--amp` is only supported when the optimizer wrapper type is ' + f'`OptimWrapper` but got {optim_wrapper}.') + cfg.optim_wrapper.type = 'AmpOptimWrapper' + cfg.optim_wrapper.loss_scale = 'dynamic' + + # resume is determined in this priority: resume from > auto_resume + if args.resume == 'auto': + cfg.resume = True + cfg.load_from = None + elif args.resume is not None: + cfg.resume = True + cfg.load_from = args.resume + + # Determine whether the custom metainfo fields are all lowercase + is_metainfo_lower(cfg) + + # build the runner from config + if 'runner_type' not in cfg: + # build the default runner + runner = Runner.from_cfg(cfg) + else: + # build customized runner from the registry + # if 
'runner_type' is set in the cfg + runner = RUNNERS.build(cfg) + + # start training + runner.train() + + +if __name__ == '__main__': + main() diff --git a/yolo_world/__init__.py b/yolo_world/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a975b032784f40f824c0fb55919e2f4782a42272 --- /dev/null +++ b/yolo_world/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Tencent Inc. All rights reserved. +from .models import * # noqa +from .datasets import * # noqa +from .engine import * # noqa +from .easydeploy import * # noqa diff --git a/yolo_world/datasets/__init__.py b/yolo_world/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3fbdad0ca10bca182c7323295d898afc03bd3913 --- /dev/null +++ b/yolo_world/datasets/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) Tencent Inc. All rights reserved. +from .mm_dataset import ( + MultiModalDataset, MultiModalMixedDataset) +from .yolov5_obj365v1 import YOLOv5Objects365V1Dataset +from .yolov5_obj365v2 import YOLOv5Objects365V2Dataset +from .yolov5_mixed_grounding import YOLOv5MixedGroundingDataset +from .utils import yolow_collate +from .transformers import * # NOQA +from .yolov5_v3det import YOLOv5V3DetDataset +from .yolov5_lvis import YOLOv5LVISV1Dataset + +__all__ = [ + 'MultiModalDataset', 'YOLOv5Objects365V1Dataset', + 'YOLOv5Objects365V2Dataset', 'YOLOv5MixedGroundingDataset', + 'YOLOv5V3DetDataset', 'yolow_collate', + 'YOLOv5LVISV1Dataset', 'MultiModalMixedDataset', +] diff --git a/yolo_world/datasets/mm_dataset.py b/yolo_world/datasets/mm_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..510e8b935fc85a570247b92b2459eaf160632199 --- /dev/null +++ b/yolo_world/datasets/mm_dataset.py @@ -0,0 +1,122 @@ +# Copyright (c) Tencent Inc. All rights reserved. 
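+# MultiModalDataset wraps a plain detection dataset and attaches per-class
+# text prompts loaded from `class_text_path`. A config entry might look
+# roughly like this (dataset type and JSON path are placeholders):
+#   train_dataset = dict(
+#       type='MultiModalDataset',
+#       dataset=dict(type='YOLOv5LVISV1Dataset', ...),
+#       class_text_path='data/texts/lvis_class_texts.json',
+#       pipeline=train_pipeline)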
+import copy +import json +import logging +from typing import Callable, List, Union + +from mmengine.logging import print_log +from mmengine.dataset.base_dataset import ( + BaseDataset, Compose, force_full_init) +from mmyolo.registry import DATASETS + + +@DATASETS.register_module() +class MultiModalDataset: + """Multi-modal dataset.""" + + def __init__(self, + dataset: Union[BaseDataset, dict], + class_text_path: str = None, + test_mode: bool = True, + pipeline: List[Union[dict, Callable]] = [], + lazy_init: bool = False) -> None: + self.dataset: BaseDataset + if isinstance(dataset, dict): + self.dataset = DATASETS.build(dataset) + elif isinstance(dataset, BaseDataset): + self.dataset = dataset + else: + raise TypeError( + 'dataset must be a dict or a BaseDataset, ' + f'but got {dataset}') + + if class_text_path is not None: + self.class_texts = json.load(open(class_text_path, 'r')) + # ori_classes = self.dataset.metainfo['classes'] + # assert len(ori_classes) == len(self.class_texts), \ + # ('The number of classes in the dataset and the class text' + # 'file must be the same.') + else: + self.class_texts = None + + self.test_mode = test_mode + self._metainfo = self.dataset.metainfo + self.pipeline = Compose(pipeline) + + self._fully_initialized = False + if not lazy_init: + self.full_init() + + @property + def metainfo(self) -> dict: + return copy.deepcopy(self._metainfo) + + def full_init(self) -> None: + """``full_init`` dataset.""" + if self._fully_initialized: + return + + self.dataset.full_init() + self._ori_len = len(self.dataset) + self._fully_initialized = True + + @force_full_init + def get_data_info(self, idx: int) -> dict: + """Get annotation by index.""" + data_info = self.dataset.get_data_info(idx) + if self.class_texts is not None: + data_info.update({'texts': self.class_texts}) + return data_info + + def __getitem__(self, idx): + if not self._fully_initialized: + print_log( + 'Please call `full_init` method manually to ' + 'accelerate the speed.', + logger='current', + level=logging.WARNING) + self.full_init() + + data_info = self.get_data_info(idx) + + if hasattr(self.dataset, 'test_mode') and not self.dataset.test_mode: + data_info['dataset'] = self + elif not self.test_mode: + data_info['dataset'] = self + return self.pipeline(data_info) + + @force_full_init + def __len__(self) -> int: + return self._ori_len + + +@DATASETS.register_module() +class MultiModalMixedDataset(MultiModalDataset): + """Multi-modal Mixed dataset. 
+ mix "detection dataset" and "caption dataset" + Args: + dataset_type (str): dataset type, 'detection' or 'caption' + """ + def __init__(self, + dataset: Union[BaseDataset, dict], + class_text_path: str = None, + dataset_type: str = 'detection', + test_mode: bool = True, + pipeline: List[Union[dict, Callable]] = [], + lazy_init: bool = False) -> None: + self.dataset_type = dataset_type + super().__init__(dataset, + class_text_path, + test_mode, + pipeline, + lazy_init) + + @force_full_init + def get_data_info(self, idx: int) -> dict: + """Get annotation by index.""" + data_info = self.dataset.get_data_info(idx) + if self.class_texts is not None: + data_info.update({'texts': self.class_texts}) + data_info['is_detection'] = 1 \ + if self.dataset_type == 'detection' else 0 + return data_info diff --git a/yolo_world/datasets/transformers/__init__.py b/yolo_world/datasets/transformers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..495e981551f7ae51761a97e4e41e141c43fbc536 --- /dev/null +++ b/yolo_world/datasets/transformers/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) Tencent Inc. All rights reserved. +from .mm_transforms import RandomLoadText, LoadText +from .mm_mix_img_transforms import ( + MultiModalMosaic, MultiModalMosaic9, YOLOv5MultiModalMixUp, + YOLOXMultiModalMixUp) + +__all__ = ['RandomLoadText', 'LoadText', 'MultiModalMosaic', + 'MultiModalMosaic9', 'YOLOv5MultiModalMixUp', + 'YOLOXMultiModalMixUp'] diff --git a/yolo_world/datasets/transformers/mm_mix_img_transforms.py b/yolo_world/datasets/transformers/mm_mix_img_transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..0f4dfe084713a16438d13376ff36fd9265022a4e --- /dev/null +++ b/yolo_world/datasets/transformers/mm_mix_img_transforms.py @@ -0,0 +1,1173 @@ +# Copyright (c) Tencent Inc. All rights reserved. +import collections +import copy +from abc import ABCMeta, abstractmethod +from typing import Optional, Sequence, Tuple, Union + +import mmcv +import numpy as np +from mmcv.transforms import BaseTransform +from mmdet.structures.bbox import autocast_box_type +from mmengine.dataset import BaseDataset +from mmengine.dataset.base_dataset import Compose +from numpy import random +from mmyolo.registry import TRANSFORMS + + +class BaseMultiModalMixImageTransform(BaseTransform, metaclass=ABCMeta): + """A Base Transform of Multimodal multiple images mixed. + + Suitable for training on multiple images mixed data augmentation like + mosaic and mixup. + + Cached mosaic transform will random select images from the cache + and combine them into one output image if use_cached is True. + + Args: + pre_transform(Sequence[str]): Sequence of transform object or + config dict to be composed. Defaults to None. + prob(float): The transformation probability. Defaults to 1.0. + use_cached (bool): Whether to use cache. Defaults to False. + max_cached_images (int): The maximum length of the cache. The larger + the cache, the stronger the randomness of this transform. As a + rule of thumb, providing 10 caches for each image suffices for + randomness. Defaults to 40. + random_pop (bool): Whether to randomly pop a result from the cache + when the cache is full. If set to False, use FIFO popping method. + Defaults to True. + max_refetch (int): The maximum number of retry iterations for getting + valid results from the pipeline. If the number of iterations is + greater than `max_refetch`, but results is still None, then the + iteration is terminated and raise the error. Defaults to 15. 
+ """ + + def __init__(self, + pre_transform: Optional[Sequence[str]] = None, + prob: float = 1.0, + use_cached: bool = False, + max_cached_images: int = 40, + random_pop: bool = True, + max_refetch: int = 15): + + self.max_refetch = max_refetch + self.prob = prob + + self.use_cached = use_cached + self.max_cached_images = max_cached_images + self.random_pop = random_pop + self.results_cache = [] + + if pre_transform is None: + self.pre_transform = None + else: + self.pre_transform = Compose(pre_transform) + + @abstractmethod + def get_indexes(self, dataset: Union[BaseDataset, + list]) -> Union[list, int]: + """Call function to collect indexes. + + Args: + dataset (:obj:`Dataset` or list): The dataset or cached list. + + Returns: + list or int: indexes. + """ + pass + + @abstractmethod + def mix_img_transform(self, results: dict) -> dict: + """Mixed image data transformation. + + Args: + results (dict): Result dict. + + Returns: + results (dict): Updated result dict. + """ + pass + + def _update_label_text(self, results: dict) -> dict: + """Update label text.""" + if 'texts' not in results: + return results + + mix_texts = sum( + [results['texts']] + + [x['texts'] for x in results['mix_results']], []) + mix_texts = list({tuple(x) for x in mix_texts}) + text2id = {text: i for i, text in enumerate(mix_texts)} + + for res in [results] + results['mix_results']: + for i, label in enumerate(res['gt_bboxes_labels']): + text = res['texts'][label] + updated_id = text2id[tuple(text)] + res['gt_bboxes_labels'][i] = updated_id + res['texts'] = mix_texts + return results + + @autocast_box_type() + def transform(self, results: dict) -> dict: + """Data augmentation function. + + The transform steps are as follows: + 1. Randomly generate index list of other images. + 2. Before Mosaic or MixUp need to go through the necessary + pre_transform, such as MixUp' pre_transform pipeline + include: 'LoadImageFromFile','LoadAnnotations', + 'Mosaic' and 'RandomAffine'. + 3. Use mix_img_transform function to implement specific + mix operations. + + Args: + results (dict): Result dict. + + Returns: + results (dict): Updated result dict. + """ + + if random.uniform(0, 1) > self.prob: + return results + + if self.use_cached: + # Be careful: deep copying can be very time-consuming + # if results includes dataset. + dataset = results.pop('dataset', None) + self.results_cache.append(copy.deepcopy(results)) + if len(self.results_cache) > self.max_cached_images: + if self.random_pop: + index = random.randint(0, len(self.results_cache) - 1) + else: + index = 0 + self.results_cache.pop(index) + + if len(self.results_cache) <= 4: + return results + else: + assert 'dataset' in results + # Be careful: deep copying can be very time-consuming + # if results includes dataset. 
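+            # The handle is therefore popped here and re-attached to
+            # `results` at the end of `transform`.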
+ dataset = results.pop('dataset', None) + + for _ in range(self.max_refetch): + # get index of one or three other images + if self.use_cached: + indexes = self.get_indexes(self.results_cache) + else: + indexes = self.get_indexes(dataset) + + if not isinstance(indexes, collections.abc.Sequence): + indexes = [indexes] + + if self.use_cached: + mix_results = [ + copy.deepcopy(self.results_cache[i]) for i in indexes + ] + else: + # get images information will be used for Mosaic or MixUp + mix_results = [ + copy.deepcopy(dataset.get_data_info(index)) + for index in indexes + ] + + if self.pre_transform is not None: + for i, data in enumerate(mix_results): + # pre_transform may also require dataset + data.update({'dataset': dataset}) + # before Mosaic or MixUp need to go through + # the necessary pre_transform + _results = self.pre_transform(data) + _results.pop('dataset') + mix_results[i] = _results + + if None not in mix_results: + results['mix_results'] = mix_results + break + print('Repeated calculation') + else: + raise RuntimeError( + 'The loading pipeline of the original dataset' + ' always return None. Please check the correctness ' + 'of the dataset and its pipeline.') + + # update labels and texts + results = self._update_label_text(results) + + # Mosaic or MixUp + results = self.mix_img_transform(results) + + if 'mix_results' in results: + results.pop('mix_results') + results['dataset'] = dataset + + return results + + +@TRANSFORMS.register_module() +class MultiModalMosaic(BaseMultiModalMixImageTransform): + """Mosaic augmentation. + + Given 4 images, mosaic transform combines them into + one output image. The output image is composed of the parts from each sub- + image. + + .. code:: text + + mosaic transform + center_x + +------------------------------+ + | pad | | + | +-----------+ pad | + | | | | + | | image1 +-----------+ + | | | | + | | | image2 | + center_y |----+-+-----------+-----------+ + | | cropped | | + |pad | image3 | image4 | + | | | | + +----|-------------+-----------+ + | | + +-------------+ + + The mosaic transform steps are as follows: + + 1. Choose the mosaic center as the intersections of 4 images + 2. Get the left top image according to the index, and randomly + sample another 3 images from the custom dataset. + 3. Sub image will be cropped if image is larger than mosaic patch + + Required Keys: + + - img + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_bboxes_labels (np.int64) (optional) + - gt_ignore_flags (bool) (optional) + - mix_results (List[dict]) + + Modified Keys: + + - img + - img_shape + - gt_bboxes (optional) + - gt_bboxes_labels (optional) + - gt_ignore_flags (optional) + + Args: + img_scale (Sequence[int]): Image size after mosaic pipeline of single + image. The shape order should be (width, height). + Defaults to (640, 640). + center_ratio_range (Sequence[float]): Center ratio range of mosaic + output. Defaults to (0.5, 1.5). + bbox_clip_border (bool, optional): Whether to clip the objects outside + the border of the image. In some dataset like MOT17, the gt bboxes + are allowed to cross the border of images. Therefore, we don't + need to clip the gt bboxes in these cases. Defaults to True. + pad_val (int): Pad value. Defaults to 114. + pre_transform(Sequence[dict]): Sequence of transform object or + config dict to be composed. + prob (float): Probability of applying this transformation. + Defaults to 1.0. + use_cached (bool): Whether to use cache. Defaults to False. + max_cached_images (int): The maximum length of the cache. 
The larger + the cache, the stronger the randomness of this transform. As a + rule of thumb, providing 10 caches for each image suffices for + randomness. Defaults to 40. + random_pop (bool): Whether to randomly pop a result from the cache + when the cache is full. If set to False, use FIFO popping method. + Defaults to True. + max_refetch (int): The maximum number of retry iterations for getting + valid results from the pipeline. If the number of iterations is + greater than `max_refetch`, but results is still None, then the + iteration is terminated and raise the error. Defaults to 15. + """ + + def __init__(self, + img_scale: Tuple[int, int] = (640, 640), + center_ratio_range: Tuple[float, float] = (0.5, 1.5), + bbox_clip_border: bool = True, + pad_val: float = 114.0, + pre_transform: Sequence[dict] = None, + prob: float = 1.0, + use_cached: bool = False, + max_cached_images: int = 40, + random_pop: bool = True, + max_refetch: int = 15): + assert isinstance(img_scale, tuple) + assert 0 <= prob <= 1.0, 'The probability should be in range [0,1]. ' \ + f'got {prob}.' + if use_cached: + assert max_cached_images >= 4, 'The length of cache must >= 4, ' \ + f'but got {max_cached_images}.' + + super().__init__( + pre_transform=pre_transform, + prob=prob, + use_cached=use_cached, + max_cached_images=max_cached_images, + random_pop=random_pop, + max_refetch=max_refetch) + + self.img_scale = img_scale + self.center_ratio_range = center_ratio_range + self.bbox_clip_border = bbox_clip_border + self.pad_val = pad_val + + def get_indexes(self, dataset: Union[BaseDataset, list]) -> list: + """Call function to collect indexes. + + Args: + dataset (:obj:`Dataset` or list): The dataset or cached list. + + Returns: + list: indexes. + """ + indexes = [random.randint(0, len(dataset)) for _ in range(3)] + return indexes + + def mix_img_transform(self, results: dict) -> dict: + """Mixed image data transformation. + + Args: + results (dict): Result dict. + + Returns: + results (dict): Updated result dict. 
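+
+        Note:
+            An illustrative pipeline entry for this transform (values are
+            examples only, not a prescribed configuration) is::
+
+                dict(type='MultiModalMosaic',
+                     img_scale=(640, 640),
+                     pad_val=114.0,
+                     pre_transform=[
+                         dict(type='LoadImageFromFile'),
+                         dict(type='LoadAnnotations', with_bbox=True)
+                     ])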
+ """ + # print("use mosaic") + assert 'mix_results' in results + mosaic_bboxes = [] + mosaic_bboxes_labels = [] + mosaic_ignore_flags = [] + mosaic_masks = [] + with_mask = True if 'gt_masks' in results else False + # print("with_mask: ", with_mask) + # self.img_scale is wh format + img_scale_w, img_scale_h = self.img_scale + + if len(results['img'].shape) == 3: + mosaic_img = np.full( + (int(img_scale_h * 2), int(img_scale_w * 2), 3), + self.pad_val, + dtype=results['img'].dtype) + else: + mosaic_img = np.full((int(img_scale_h * 2), int(img_scale_w * 2)), + self.pad_val, + dtype=results['img'].dtype) + + # mosaic center x, y + center_x = int(random.uniform(*self.center_ratio_range) * img_scale_w) + center_y = int(random.uniform(*self.center_ratio_range) * img_scale_h) + center_position = (center_x, center_y) + + loc_strs = ('top_left', 'top_right', 'bottom_left', 'bottom_right') + for i, loc in enumerate(loc_strs): + if loc == 'top_left': + results_patch = results + else: + results_patch = results['mix_results'][i - 1] + + img_i = results_patch['img'] + h_i, w_i = img_i.shape[:2] + # keep_ratio resize + scale_ratio_i = min(img_scale_h / h_i, img_scale_w / w_i) + img_i = mmcv.imresize( + img_i, (int(w_i * scale_ratio_i), int(h_i * scale_ratio_i))) + + # compute the combine parameters + paste_coord, crop_coord = self._mosaic_combine( + loc, center_position, img_i.shape[:2][::-1]) + x1_p, y1_p, x2_p, y2_p = paste_coord + x1_c, y1_c, x2_c, y2_c = crop_coord + + # crop and paste image + mosaic_img[y1_p:y2_p, x1_p:x2_p] = img_i[y1_c:y2_c, x1_c:x2_c] + + # adjust coordinate + gt_bboxes_i = results_patch['gt_bboxes'] + gt_bboxes_labels_i = results_patch['gt_bboxes_labels'] + gt_ignore_flags_i = results_patch['gt_ignore_flags'] + + padw = x1_p - x1_c + padh = y1_p - y1_c + gt_bboxes_i.rescale_([scale_ratio_i, scale_ratio_i]) + gt_bboxes_i.translate_([padw, padh]) + mosaic_bboxes.append(gt_bboxes_i) + mosaic_bboxes_labels.append(gt_bboxes_labels_i) + mosaic_ignore_flags.append(gt_ignore_flags_i) + if with_mask and results_patch.get('gt_masks', None) is not None: + gt_masks_i = results_patch['gt_masks'] + gt_masks_i = gt_masks_i.rescale(float(scale_ratio_i)) + gt_masks_i = gt_masks_i.translate( + out_shape=(int(self.img_scale[0] * 2), + int(self.img_scale[1] * 2)), + offset=padw, + direction='horizontal') + gt_masks_i = gt_masks_i.translate( + out_shape=(int(self.img_scale[0] * 2), + int(self.img_scale[1] * 2)), + offset=padh, + direction='vertical') + mosaic_masks.append(gt_masks_i) + + mosaic_bboxes = mosaic_bboxes[0].cat(mosaic_bboxes, 0) + mosaic_bboxes_labels = np.concatenate(mosaic_bboxes_labels, 0) + mosaic_ignore_flags = np.concatenate(mosaic_ignore_flags, 0) + + if self.bbox_clip_border: + mosaic_bboxes.clip_([2 * img_scale_h, 2 * img_scale_w]) + if with_mask: + mosaic_masks = mosaic_masks[0].cat(mosaic_masks) + results['gt_masks'] = mosaic_masks + else: + # remove outside bboxes + inside_inds = mosaic_bboxes.is_inside( + [2 * img_scale_h, 2 * img_scale_w]).numpy() + mosaic_bboxes = mosaic_bboxes[inside_inds] + mosaic_bboxes_labels = mosaic_bboxes_labels[inside_inds] + mosaic_ignore_flags = mosaic_ignore_flags[inside_inds] + if with_mask: + mosaic_masks = mosaic_masks[0].cat(mosaic_masks)[inside_inds] + results['gt_masks'] = mosaic_masks + + results['img'] = mosaic_img + results['img_shape'] = mosaic_img.shape + results['gt_bboxes'] = mosaic_bboxes + results['gt_bboxes_labels'] = mosaic_bboxes_labels + results['gt_ignore_flags'] = mosaic_ignore_flags + + return results + + def 
_mosaic_combine( + self, loc: str, center_position_xy: Sequence[float], + img_shape_wh: Sequence[int]) -> Tuple[Tuple[int], Tuple[int]]: + """Calculate global coordinate of mosaic image and local coordinate of + cropped sub-image. + + Args: + loc (str): Index for the sub-image, loc in ('top_left', + 'top_right', 'bottom_left', 'bottom_right'). + center_position_xy (Sequence[float]): Mixing center for 4 images, + (x, y). + img_shape_wh (Sequence[int]): Width and height of sub-image + + Returns: + tuple[tuple[float]]: Corresponding coordinate of pasting and + cropping + - paste_coord (tuple): paste corner coordinate in mosaic image. + - crop_coord (tuple): crop corner coordinate in mosaic image. + """ + assert loc in ('top_left', 'top_right', 'bottom_left', 'bottom_right') + if loc == 'top_left': + # index0 to top left part of image + x1, y1, x2, y2 = max(center_position_xy[0] - img_shape_wh[0], 0), \ + max(center_position_xy[1] - img_shape_wh[1], 0), \ + center_position_xy[0], \ + center_position_xy[1] + crop_coord = img_shape_wh[0] - (x2 - x1), img_shape_wh[1] - ( + y2 - y1), img_shape_wh[0], img_shape_wh[1] + + elif loc == 'top_right': + # index1 to top right part of image + x1, y1, x2, y2 = center_position_xy[0], \ + max(center_position_xy[1] - img_shape_wh[1], 0), \ + min(center_position_xy[0] + img_shape_wh[0], + self.img_scale[0] * 2), \ + center_position_xy[1] + crop_coord = 0, img_shape_wh[1] - (y2 - y1), min( + img_shape_wh[0], x2 - x1), img_shape_wh[1] + + elif loc == 'bottom_left': + # index2 to bottom left part of image + x1, y1, x2, y2 = max(center_position_xy[0] - img_shape_wh[0], 0), \ + center_position_xy[1], \ + center_position_xy[0], \ + min(self.img_scale[1] * 2, center_position_xy[1] + + img_shape_wh[1]) + crop_coord = img_shape_wh[0] - (x2 - x1), 0, img_shape_wh[0], min( + y2 - y1, img_shape_wh[1]) + + else: + # index3 to bottom right part of image + x1, y1, x2, y2 = center_position_xy[0], \ + center_position_xy[1], \ + min(center_position_xy[0] + img_shape_wh[0], + self.img_scale[0] * 2), \ + min(self.img_scale[1] * 2, center_position_xy[1] + + img_shape_wh[1]) + crop_coord = 0, 0, min(img_shape_wh[0], + x2 - x1), min(y2 - y1, img_shape_wh[1]) + + paste_coord = x1, y1, x2, y2 + return paste_coord, crop_coord + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(img_scale={self.img_scale}, ' + repr_str += f'center_ratio_range={self.center_ratio_range}, ' + repr_str += f'pad_val={self.pad_val}, ' + repr_str += f'prob={self.prob})' + return repr_str + + +@TRANSFORMS.register_module() +class MultiModalMosaic9(BaseMultiModalMixImageTransform): + """Mosaic9 augmentation. + + Given 9 images, mosaic transform combines them into + one output image. The output image is composed of the parts from each sub- + image. + + .. code:: text + + +-------------------------------+------------+ + | pad | pad | | + | +----------+ | | + | | +---------------+ top_right | + | | | top | image2 | + | | top_left | image1 | | + | | image8 o--------+------+--------+---+ + | | | | | | + +----+----------+ | right |pad| + | | center | image3 | | + | left | image0 +---------------+---| + | image7 | | | | + +---+-----------+---+--------+ | | + | | cropped | | bottom_right |pad| + | |bottom_left| | image4 | | + | | image6 | bottom | | | + +---|-----------+ image5 +---------------+---| + | pad | | pad | + +-----------+------------+-------------------+ + + The mosaic transform steps are as follows: + + 1. 
Get the center image according to the index, and randomly + sample another 8 images from the custom dataset. + 2. Randomly offset the image after Mosaic + + Required Keys: + + - img + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_bboxes_labels (np.int64) (optional) + - gt_ignore_flags (bool) (optional) + - mix_results (List[dict]) + + Modified Keys: + + - img + - img_shape + - gt_bboxes (optional) + - gt_bboxes_labels (optional) + - gt_ignore_flags (optional) + + Args: + img_scale (Sequence[int]): Image size after mosaic pipeline of single + image. The shape order should be (width, height). + Defaults to (640, 640). + bbox_clip_border (bool, optional): Whether to clip the objects outside + the border of the image. In some dataset like MOT17, the gt bboxes + are allowed to cross the border of images. Therefore, we don't + need to clip the gt bboxes in these cases. Defaults to True. + pad_val (int): Pad value. Defaults to 114. + pre_transform(Sequence[dict]): Sequence of transform object or + config dict to be composed. + prob (float): Probability of applying this transformation. + Defaults to 1.0. + use_cached (bool): Whether to use cache. Defaults to False. + max_cached_images (int): The maximum length of the cache. The larger + the cache, the stronger the randomness of this transform. As a + rule of thumb, providing 5 caches for each image suffices for + randomness. Defaults to 50. + random_pop (bool): Whether to randomly pop a result from the cache + when the cache is full. If set to False, use FIFO popping method. + Defaults to True. + max_refetch (int): The maximum number of retry iterations for getting + valid results from the pipeline. If the number of iterations is + greater than `max_refetch`, but results is still None, then the + iteration is terminated and raise the error. Defaults to 15. + """ + + def __init__(self, + img_scale: Tuple[int, int] = (640, 640), + bbox_clip_border: bool = True, + pad_val: Union[float, int] = 114.0, + pre_transform: Sequence[dict] = None, + prob: float = 1.0, + use_cached: bool = False, + max_cached_images: int = 50, + random_pop: bool = True, + max_refetch: int = 15): + assert isinstance(img_scale, tuple) + assert 0 <= prob <= 1.0, 'The probability should be in range [0,1]. ' \ + f'got {prob}.' + if use_cached: + assert max_cached_images >= 9, 'The length of cache must >= 9, ' \ + f'but got {max_cached_images}.' + + super().__init__( + pre_transform=pre_transform, + prob=prob, + use_cached=use_cached, + max_cached_images=max_cached_images, + random_pop=random_pop, + max_refetch=max_refetch) + + self.img_scale = img_scale + self.bbox_clip_border = bbox_clip_border + self.pad_val = pad_val + + # intermediate variables + self._current_img_shape = [0, 0] + self._center_img_shape = [0, 0] + self._previous_img_shape = [0, 0] + + def get_indexes(self, dataset: Union[BaseDataset, list]) -> list: + """Call function to collect indexes. + + Args: + dataset (:obj:`Dataset` or list): The dataset or cached list. + + Returns: + list: indexes. + """ + indexes = [random.randint(0, len(dataset)) for _ in range(8)] + return indexes + + def mix_img_transform(self, results: dict) -> dict: + """Mixed image data transformation. + + Args: + results (dict): Result dict. + + Returns: + results (dict): Updated result dict. 
+ """ + assert 'mix_results' in results + + mosaic_bboxes = [] + mosaic_bboxes_labels = [] + mosaic_ignore_flags = [] + + img_scale_w, img_scale_h = self.img_scale + + if len(results['img'].shape) == 3: + mosaic_img = np.full( + (int(img_scale_h * 3), int(img_scale_w * 3), 3), + self.pad_val, + dtype=results['img'].dtype) + else: + mosaic_img = np.full((int(img_scale_h * 3), int(img_scale_w * 3)), + self.pad_val, + dtype=results['img'].dtype) + + # index = 0 is mean original image + # len(results['mix_results']) = 8 + loc_strs = ('center', 'top', 'top_right', 'right', 'bottom_right', + 'bottom', 'bottom_left', 'left', 'top_left') + + results_all = [results, *results['mix_results']] + for index, results_patch in enumerate(results_all): + img_i = results_patch['img'] + # keep_ratio resize + img_i_h, img_i_w = img_i.shape[:2] + scale_ratio_i = min(img_scale_h / img_i_h, img_scale_w / img_i_w) + img_i = mmcv.imresize( + img_i, + (int(img_i_w * scale_ratio_i), int(img_i_h * scale_ratio_i))) + + paste_coord = self._mosaic_combine(loc_strs[index], + img_i.shape[:2]) + + padw, padh = paste_coord[:2] + x1, y1, x2, y2 = (max(x, 0) for x in paste_coord) + mosaic_img[y1:y2, x1:x2] = img_i[y1 - padh:, x1 - padw:] + + gt_bboxes_i = results_patch['gt_bboxes'] + gt_bboxes_labels_i = results_patch['gt_bboxes_labels'] + gt_ignore_flags_i = results_patch['gt_ignore_flags'] + gt_bboxes_i.rescale_([scale_ratio_i, scale_ratio_i]) + gt_bboxes_i.translate_([padw, padh]) + + mosaic_bboxes.append(gt_bboxes_i) + mosaic_bboxes_labels.append(gt_bboxes_labels_i) + mosaic_ignore_flags.append(gt_ignore_flags_i) + + # Offset + offset_x = int(random.uniform(0, img_scale_w)) + offset_y = int(random.uniform(0, img_scale_h)) + mosaic_img = mosaic_img[offset_y:offset_y + 2 * img_scale_h, + offset_x:offset_x + 2 * img_scale_w] + + mosaic_bboxes = mosaic_bboxes[0].cat(mosaic_bboxes, 0) + mosaic_bboxes.translate_([-offset_x, -offset_y]) + mosaic_bboxes_labels = np.concatenate(mosaic_bboxes_labels, 0) + mosaic_ignore_flags = np.concatenate(mosaic_ignore_flags, 0) + + if self.bbox_clip_border: + mosaic_bboxes.clip_([2 * img_scale_h, 2 * img_scale_w]) + else: + # remove outside bboxes + inside_inds = mosaic_bboxes.is_inside( + [2 * img_scale_h, 2 * img_scale_w]).numpy() + mosaic_bboxes = mosaic_bboxes[inside_inds] + mosaic_bboxes_labels = mosaic_bboxes_labels[inside_inds] + mosaic_ignore_flags = mosaic_ignore_flags[inside_inds] + + results['img'] = mosaic_img + results['img_shape'] = mosaic_img.shape + results['gt_bboxes'] = mosaic_bboxes + results['gt_bboxes_labels'] = mosaic_bboxes_labels + results['gt_ignore_flags'] = mosaic_ignore_flags + return results + + def _mosaic_combine(self, loc: str, + img_shape_hw: Tuple[int, int]) -> Tuple[int, ...]: + """Calculate global coordinate of mosaic image. + + Args: + loc (str): Index for the sub-image. + img_shape_hw (Sequence[int]): Height and width of sub-image + + Returns: + paste_coord (tuple): paste corner coordinate in mosaic image. 
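+
+        Note:
+            Coordinates are laid out on a ``3 * img_scale`` canvas; the
+            caller later crops a random ``2 * img_scale`` window from it,
+            so pasted regions may fall outside the final mosaic image.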
+ """ + assert loc in ('center', 'top', 'top_right', 'right', 'bottom_right', + 'bottom', 'bottom_left', 'left', 'top_left') + + img_scale_w, img_scale_h = self.img_scale + + self._current_img_shape = img_shape_hw + current_img_h, current_img_w = self._current_img_shape + previous_img_h, previous_img_w = self._previous_img_shape + center_img_h, center_img_w = self._center_img_shape + + if loc == 'center': + self._center_img_shape = self._current_img_shape + # xmin, ymin, xmax, ymax + paste_coord = img_scale_w, \ + img_scale_h, \ + img_scale_w + current_img_w, \ + img_scale_h + current_img_h + elif loc == 'top': + paste_coord = img_scale_w, \ + img_scale_h - current_img_h, \ + img_scale_w + current_img_w, \ + img_scale_h + elif loc == 'top_right': + paste_coord = img_scale_w + previous_img_w, \ + img_scale_h - current_img_h, \ + img_scale_w + previous_img_w + current_img_w, \ + img_scale_h + elif loc == 'right': + paste_coord = img_scale_w + center_img_w, \ + img_scale_h, \ + img_scale_w + center_img_w + current_img_w, \ + img_scale_h + current_img_h + elif loc == 'bottom_right': + paste_coord = img_scale_w + center_img_w, \ + img_scale_h + previous_img_h, \ + img_scale_w + center_img_w + current_img_w, \ + img_scale_h + previous_img_h + current_img_h + elif loc == 'bottom': + paste_coord = img_scale_w + center_img_w - current_img_w, \ + img_scale_h + center_img_h, \ + img_scale_w + center_img_w, \ + img_scale_h + center_img_h + current_img_h + elif loc == 'bottom_left': + paste_coord = img_scale_w + center_img_w - \ + previous_img_w - current_img_w, \ + img_scale_h + center_img_h, \ + img_scale_w + center_img_w - previous_img_w, \ + img_scale_h + center_img_h + current_img_h + elif loc == 'left': + paste_coord = img_scale_w - current_img_w, \ + img_scale_h + center_img_h - current_img_h, \ + img_scale_w, \ + img_scale_h + center_img_h + elif loc == 'top_left': + paste_coord = img_scale_w - current_img_w, \ + img_scale_h + center_img_h - \ + previous_img_h - current_img_h, \ + img_scale_w, \ + img_scale_h + center_img_h - previous_img_h + + self._previous_img_shape = self._current_img_shape + # xmin, ymin, xmax, ymax + return paste_coord + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(img_scale={self.img_scale}, ' + repr_str += f'pad_val={self.pad_val}, ' + repr_str += f'prob={self.prob})' + return repr_str + + +@TRANSFORMS.register_module() +class YOLOv5MultiModalMixUp(BaseMultiModalMixImageTransform): + """MixUp data augmentation for YOLOv5. + + .. code:: text + + The mixup transform steps are as follows: + + 1. Another random image is picked by dataset. + 2. Randomly obtain the fusion ratio from the beta distribution, + then fuse the target + of the original image and mixup image through this ratio. + + Required Keys: + + - img + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_bboxes_labels (np.int64) (optional) + - gt_ignore_flags (bool) (optional) + - mix_results (List[dict]) + + + Modified Keys: + + - img + - img_shape + - gt_bboxes (optional) + - gt_bboxes_labels (optional) + - gt_ignore_flags (optional) + + + Args: + alpha (float): parameter of beta distribution to get mixup ratio. + Defaults to 32. + beta (float): parameter of beta distribution to get mixup ratio. + Defaults to 32. + pre_transform (Sequence[dict]): Sequence of transform object or + config dict to be composed. + prob (float): Probability of applying this transformation. + Defaults to 1.0. + use_cached (bool): Whether to use cache. Defaults to False. 
+ max_cached_images (int): The maximum length of the cache. The larger + the cache, the stronger the randomness of this transform. As a + rule of thumb, providing 10 caches for each image suffices for + randomness. Defaults to 20. + random_pop (bool): Whether to randomly pop a result from the cache + when the cache is full. If set to False, use FIFO popping method. + Defaults to True. + max_refetch (int): The maximum number of iterations. If the number of + iterations is greater than `max_refetch`, but gt_bbox is still + empty, then the iteration is terminated. Defaults to 15. + """ + + def __init__(self, + alpha: float = 32.0, + beta: float = 32.0, + pre_transform: Sequence[dict] = None, + prob: float = 1.0, + use_cached: bool = False, + max_cached_images: int = 20, + random_pop: bool = True, + max_refetch: int = 15): + if use_cached: + assert max_cached_images >= 2, 'The length of cache must >= 2, ' \ + f'but got {max_cached_images}.' + super().__init__( + pre_transform=pre_transform, + prob=prob, + use_cached=use_cached, + max_cached_images=max_cached_images, + random_pop=random_pop, + max_refetch=max_refetch) + self.alpha = alpha + self.beta = beta + + def get_indexes(self, dataset: Union[BaseDataset, list]) -> int: + """Call function to collect indexes. + + Args: + dataset (:obj:`Dataset` or list): The dataset or cached list. + + Returns: + int: indexes. + """ + return random.randint(0, len(dataset)) + + def mix_img_transform(self, results: dict) -> dict: + """YOLOv5 MixUp transform function. + + Args: + results (dict): Result dict + + Returns: + results (dict): Updated result dict. + """ + assert 'mix_results' in results + + retrieve_results = results['mix_results'][0] + retrieve_img = retrieve_results['img'] + ori_img = results['img'] + assert ori_img.shape == retrieve_img.shape + + # Randomly obtain the fusion ratio from the beta distribution, + # which is around 0.5 + ratio = np.random.beta(self.alpha, self.beta) + mixup_img = (ori_img * ratio + retrieve_img * (1 - ratio)) + + retrieve_gt_bboxes = retrieve_results['gt_bboxes'] + retrieve_gt_bboxes_labels = retrieve_results['gt_bboxes_labels'] + retrieve_gt_ignore_flags = retrieve_results['gt_ignore_flags'] + + mixup_gt_bboxes = retrieve_gt_bboxes.cat( + (results['gt_bboxes'], retrieve_gt_bboxes), dim=0) + mixup_gt_bboxes_labels = np.concatenate( + (results['gt_bboxes_labels'], retrieve_gt_bboxes_labels), axis=0) + mixup_gt_ignore_flags = np.concatenate( + (results['gt_ignore_flags'], retrieve_gt_ignore_flags), axis=0) + if 'gt_masks' in results: + assert 'gt_masks' in retrieve_results + mixup_gt_masks = results['gt_masks'].cat( + [results['gt_masks'], retrieve_results['gt_masks']]) + results['gt_masks'] = mixup_gt_masks + + results['img'] = mixup_img.astype(np.uint8) + results['img_shape'] = mixup_img.shape + results['gt_bboxes'] = mixup_gt_bboxes + results['gt_bboxes_labels'] = mixup_gt_bboxes_labels + results['gt_ignore_flags'] = mixup_gt_ignore_flags + + return results + + +@TRANSFORMS.register_module() +class YOLOXMultiModalMixUp(BaseMultiModalMixImageTransform): + """MixUp data augmentation for YOLOX. + + .. code:: text + + mixup transform + +---------------+--------------+ + | mixup image | | + | +--------|--------+ | + | | | | | + +---------------+ | | + | | | | + | | image | | + | | | | + | | | | + | +-----------------+ | + | pad | + +------------------------------+ + + The mixup transform steps are as follows: + + 1. 
Another random image is picked by dataset and embedded in + the top left patch(after padding and resizing) + 2. The target of mixup transform is the weighted average of mixup + image and origin image. + + Required Keys: + + - img + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_bboxes_labels (np.int64) (optional) + - gt_ignore_flags (bool) (optional) + - mix_results (List[dict]) + + + Modified Keys: + + - img + - img_shape + - gt_bboxes (optional) + - gt_bboxes_labels (optional) + - gt_ignore_flags (optional) + + + Args: + img_scale (Sequence[int]): Image output size after mixup pipeline. + The shape order should be (width, height). Defaults to (640, 640). + ratio_range (Sequence[float]): Scale ratio of mixup image. + Defaults to (0.5, 1.5). + flip_ratio (float): Horizontal flip ratio of mixup image. + Defaults to 0.5. + pad_val (int): Pad value. Defaults to 114. + bbox_clip_border (bool, optional): Whether to clip the objects outside + the border of the image. In some dataset like MOT17, the gt bboxes + are allowed to cross the border of images. Therefore, we don't + need to clip the gt bboxes in these cases. Defaults to True. + pre_transform(Sequence[dict]): Sequence of transform object or + config dict to be composed. + prob (float): Probability of applying this transformation. + Defaults to 1.0. + use_cached (bool): Whether to use cache. Defaults to False. + max_cached_images (int): The maximum length of the cache. The larger + the cache, the stronger the randomness of this transform. As a + rule of thumb, providing 10 caches for each image suffices for + randomness. Defaults to 20. + random_pop (bool): Whether to randomly pop a result from the cache + when the cache is full. If set to False, use FIFO popping method. + Defaults to True. + max_refetch (int): The maximum number of iterations. If the number of + iterations is greater than `max_refetch`, but gt_bbox is still + empty, then the iteration is terminated. Defaults to 15. + """ + + def __init__(self, + img_scale: Tuple[int, int] = (640, 640), + ratio_range: Tuple[float, float] = (0.5, 1.5), + flip_ratio: float = 0.5, + pad_val: float = 114.0, + bbox_clip_border: bool = True, + pre_transform: Sequence[dict] = None, + prob: float = 1.0, + use_cached: bool = False, + max_cached_images: int = 20, + random_pop: bool = True, + max_refetch: int = 15): + assert isinstance(img_scale, tuple) + if use_cached: + assert max_cached_images >= 2, 'The length of cache must >= 2, ' \ + f'but got {max_cached_images}.' + super().__init__( + pre_transform=pre_transform, + prob=prob, + use_cached=use_cached, + max_cached_images=max_cached_images, + random_pop=random_pop, + max_refetch=max_refetch) + self.img_scale = img_scale + self.ratio_range = ratio_range + self.flip_ratio = flip_ratio + self.pad_val = pad_val + self.bbox_clip_border = bbox_clip_border + + def get_indexes(self, dataset: Union[BaseDataset, list]) -> int: + """Call function to collect indexes. + + Args: + dataset (:obj:`Dataset` or list): The dataset or cached list. + + Returns: + int: indexes. + """ + return random.randint(0, len(dataset)) + + def mix_img_transform(self, results: dict) -> dict: + """YOLOX MixUp transform function. + + Args: + results (dict): Result dict. + + Returns: + results (dict): Updated result dict. + """ + assert 'mix_results' in results + assert len( + results['mix_results']) == 1, 'MixUp only support 2 images now !' 
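+
+        # Unlike the YOLOv5 variant above, which samples the blend ratio
+        # from a beta distribution, this YOLOX-style MixUp pastes the
+        # retrieved image onto a padded canvas, applies scale jitter, an
+        # optional horizontal flip and a random crop, and finally blends the
+        # two images with fixed 0.5 / 0.5 weights.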
+ + if results['mix_results'][0]['gt_bboxes'].shape[0] == 0: + # empty bbox + return results + + retrieve_results = results['mix_results'][0] + retrieve_img = retrieve_results['img'] + + jit_factor = random.uniform(*self.ratio_range) + is_filp = random.uniform(0, 1) > self.flip_ratio + + if len(retrieve_img.shape) == 3: + out_img = np.ones((self.img_scale[1], self.img_scale[0], 3), + dtype=retrieve_img.dtype) * self.pad_val + else: + out_img = np.ones( + self.img_scale[::-1], dtype=retrieve_img.dtype) * self.pad_val + + # 1. keep_ratio resize + scale_ratio = min(self.img_scale[1] / retrieve_img.shape[0], + self.img_scale[0] / retrieve_img.shape[1]) + retrieve_img = mmcv.imresize( + retrieve_img, (int(retrieve_img.shape[1] * scale_ratio), + int(retrieve_img.shape[0] * scale_ratio))) + + # 2. paste + out_img[:retrieve_img.shape[0], :retrieve_img.shape[1]] = retrieve_img + + # 3. scale jit + scale_ratio *= jit_factor + out_img = mmcv.imresize(out_img, (int(out_img.shape[1] * jit_factor), + int(out_img.shape[0] * jit_factor))) + + # 4. flip + if is_filp: + out_img = out_img[:, ::-1, :] + + # 5. random crop + ori_img = results['img'] + origin_h, origin_w = out_img.shape[:2] + target_h, target_w = ori_img.shape[:2] + padded_img = np.ones((max(origin_h, target_h), max( + origin_w, target_w), 3)) * self.pad_val + padded_img = padded_img.astype(np.uint8) + padded_img[:origin_h, :origin_w] = out_img + + x_offset, y_offset = 0, 0 + if padded_img.shape[0] > target_h: + y_offset = random.randint(0, padded_img.shape[0] - target_h) + if padded_img.shape[1] > target_w: + x_offset = random.randint(0, padded_img.shape[1] - target_w) + padded_cropped_img = padded_img[y_offset:y_offset + target_h, + x_offset:x_offset + target_w] + + # 6. adjust bbox + retrieve_gt_bboxes = retrieve_results['gt_bboxes'] + retrieve_gt_bboxes.rescale_([scale_ratio, scale_ratio]) + if self.bbox_clip_border: + retrieve_gt_bboxes.clip_([origin_h, origin_w]) + + if is_filp: + retrieve_gt_bboxes.flip_([origin_h, origin_w], + direction='horizontal') + + # 7. filter + cp_retrieve_gt_bboxes = retrieve_gt_bboxes.clone() + cp_retrieve_gt_bboxes.translate_([-x_offset, -y_offset]) + if self.bbox_clip_border: + cp_retrieve_gt_bboxes.clip_([target_h, target_w]) + + # 8. 
mix up + mixup_img = 0.5 * ori_img + 0.5 * padded_cropped_img + + retrieve_gt_bboxes_labels = retrieve_results['gt_bboxes_labels'] + retrieve_gt_ignore_flags = retrieve_results['gt_ignore_flags'] + + mixup_gt_bboxes = cp_retrieve_gt_bboxes.cat( + (results['gt_bboxes'], cp_retrieve_gt_bboxes), dim=0) + mixup_gt_bboxes_labels = np.concatenate( + (results['gt_bboxes_labels'], retrieve_gt_bboxes_labels), axis=0) + mixup_gt_ignore_flags = np.concatenate( + (results['gt_ignore_flags'], retrieve_gt_ignore_flags), axis=0) + + if not self.bbox_clip_border: + # remove outside bbox + inside_inds = mixup_gt_bboxes.is_inside([target_h, + target_w]).numpy() + mixup_gt_bboxes = mixup_gt_bboxes[inside_inds] + mixup_gt_bboxes_labels = mixup_gt_bboxes_labels[inside_inds] + mixup_gt_ignore_flags = mixup_gt_ignore_flags[inside_inds] + + results['img'] = mixup_img.astype(np.uint8) + results['img_shape'] = mixup_img.shape + results['gt_bboxes'] = mixup_gt_bboxes + results['gt_bboxes_labels'] = mixup_gt_bboxes_labels + results['gt_ignore_flags'] = mixup_gt_ignore_flags + + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(img_scale={self.img_scale}, ' + repr_str += f'ratio_range={self.ratio_range}, ' + repr_str += f'flip_ratio={self.flip_ratio}, ' + repr_str += f'pad_val={self.pad_val}, ' + repr_str += f'max_refetch={self.max_refetch}, ' + repr_str += f'bbox_clip_border={self.bbox_clip_border})' + return repr_str diff --git a/yolo_world/datasets/transformers/mm_transforms.py b/yolo_world/datasets/transformers/mm_transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..0008920b82fc29b3ccc0473e894cd718cdb21fa4 --- /dev/null +++ b/yolo_world/datasets/transformers/mm_transforms.py @@ -0,0 +1,129 @@ +# Copyright (c) Tencent Inc. All rights reserved. 
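+# This module implements the text-loading transforms used by YOLO-World:
+# ``RandomLoadText`` keeps the positive classes of an image, samples extra
+# negative classes and remaps the gt labels to the sampled vocabulary, while
+# ``LoadText`` simply takes the first caption of every class (e.g. for
+# evaluation with a fixed vocabulary). An illustrative pipeline entry is
+#     dict(type='RandomLoadText', num_neg_samples=(80, 80),
+#          max_num_samples=80, padding_to_max=False)
+# where the values are examples rather than prescribed settings.
+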
+import json +import random +from typing import Tuple + +import numpy as np +from mmyolo.registry import TRANSFORMS + + +@TRANSFORMS.register_module() +class RandomLoadText: + + def __init__(self, + text_path: str = None, + prompt_format: str = '{}', + num_neg_samples: Tuple[int, int] = (80, 80), + max_num_samples: int = 80, + padding_to_max: bool = False, + padding_value: str = '') -> None: + self.prompt_format = prompt_format + self.num_neg_samples = num_neg_samples + self.max_num_samples = max_num_samples + self.padding_to_max = padding_to_max + self.padding_value = padding_value + if text_path is not None: + with open(text_path, 'r') as f: + self.class_texts = json.load(f) + + def __call__(self, results: dict) -> dict: + assert 'texts' in results or hasattr(self, 'class_texts'), ( + 'No texts found in results.') + class_texts = results.get( + 'texts', + getattr(self, 'class_texts', None)) + + num_classes = len(class_texts) + if 'gt_labels' in results: + gt_label_tag = 'gt_labels' + elif 'gt_bboxes_labels' in results: + gt_label_tag = 'gt_bboxes_labels' + else: + raise ValueError('No valid labels found in results.') + positive_labels = set(results[gt_label_tag]) + + if len(positive_labels) > self.max_num_samples: + positive_labels = set(random.sample(list(positive_labels), + k=self.max_num_samples)) + + num_neg_samples = min( + min(num_classes, self.max_num_samples) - len(positive_labels), + random.randint(*self.num_neg_samples)) + candidate_neg_labels = [] + for idx in range(num_classes): + if idx not in positive_labels: + candidate_neg_labels.append(idx) + negative_labels = random.sample( + candidate_neg_labels, k=num_neg_samples) + + sampled_labels = list(positive_labels) + list(negative_labels) + random.shuffle(sampled_labels) + + label2ids = {label: i for i, label in enumerate(sampled_labels)} + + gt_valid_mask = np.zeros(len(results['gt_bboxes']), dtype=bool) + for idx, label in enumerate(results[gt_label_tag]): + if label in label2ids: + gt_valid_mask[idx] = True + results[gt_label_tag][idx] = label2ids[label] + results['gt_bboxes'] = results['gt_bboxes'][gt_valid_mask] + results[gt_label_tag] = results[gt_label_tag][gt_valid_mask] + + if 'instances' in results: + retaged_instances = [] + for idx, inst in enumerate(results['instances']): + label = inst['bbox_label'] + if label in label2ids: + inst['bbox_label'] = label2ids[label] + retaged_instances.append(inst) + results['instances'] = retaged_instances + + texts = [] + for label in sampled_labels: + cls_caps = class_texts[label] + assert len(cls_caps) > 0 + cap_id = random.randrange(len(cls_caps)) + sel_cls_cap = self.prompt_format.format(cls_caps[cap_id]) + texts.append(sel_cls_cap) + + if self.padding_to_max: + num_valid_labels = len(positive_labels) + len(negative_labels) + num_padding = self.max_num_samples - num_valid_labels + if num_padding > 0: + texts += [self.padding_value] * num_padding + + results['texts'] = texts + + return results + + +@TRANSFORMS.register_module() +class LoadText: + + def __init__(self, + text_path: str = None, + prompt_format: str = '{}', + multi_prompt_flag: str = '/') -> None: + self.prompt_format = prompt_format + self.multi_prompt_flag = multi_prompt_flag + if text_path is not None: + with open(text_path, 'r') as f: + self.class_texts = json.load(f) + + def __call__(self, results: dict) -> dict: + assert 'texts' in results or hasattr(self, 'class_texts'), ( + 'No texts found in results.') + class_texts = results.get( + 'texts', + getattr(self, 'class_texts', None)) + + texts = [] + for idx, 
cls_caps in enumerate(class_texts): + assert len(cls_caps) > 0 + sel_cls_cap = cls_caps[0] + sel_cls_cap = self.prompt_format.format(sel_cls_cap) + texts.append(sel_cls_cap) + + results['texts'] = texts + + return results diff --git a/yolo_world/datasets/utils.py b/yolo_world/datasets/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..fb2acf11ca53dfa3e151e942591de306c5b123a3 --- /dev/null +++ b/yolo_world/datasets/utils.py @@ -0,0 +1,91 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import random +from typing import Any, Sequence + +import torch +from mmengine.dataset import COLLATE_FUNCTIONS +from mmengine.logging import print_log +from mmyolo.datasets.yolov5_coco import BatchShapePolicyDataset + + +class RobustBatchShapePolicyDataset(BatchShapePolicyDataset): + """Dataset with the batch shape policy that makes paddings with least + pixels during batch inference process, which does not require the image + scales of all batches to be the same throughout validation.""" + + def _prepare_data(self, idx: int) -> Any: + if self.test_mode is False: + data_info = self.get_data_info(idx) + data_info['dataset'] = self + return self.pipeline(data_info) + else: + return super().prepare_data(idx) + + def prepare_data(self, idx: int, timeout=10) -> Any: + """Pass the dataset to the pipeline during training to support mixed + data augmentation, such as Mosaic and MixUp.""" + try: + return self._prepare_data(idx) + except Exception as e: + if timeout <= 0: + raise e + print_log(f'Failed to prepare data, due to {e}.' + f'Retrying {timeout} attempts.') + if not self.test_mode: + idx = random.randrange(len(self)) + return self.prepare_data(idx, timeout=timeout - 1) + + +@COLLATE_FUNCTIONS.register_module() +def yolow_collate(data_batch: Sequence, + use_ms_training: bool = False) -> dict: + """Rewrite collate_fn to get faster training speed. + + Args: + data_batch (Sequence): Batch of data. + use_ms_training (bool): Whether to use multi-scale training. 
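+
+    Returns:
+        dict: ``inputs`` holds the batched images (a stacked tensor, or a
+        list of tensors when ``use_ms_training=True``) and ``data_samples``
+        holds a ``bboxes_labels`` tensor whose rows are
+        ``[batch_idx, class_label, *bbox]`` (the bbox columns keep the box
+        format stored in ``gt_instances``), plus optional ``masks``,
+        ``texts`` and ``is_detection`` entries when present in the batch.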
+ """ + batch_imgs = [] + batch_bboxes_labels = [] + batch_masks = [] + for i in range(len(data_batch)): + datasamples = data_batch[i]['data_samples'] + inputs = data_batch[i]['inputs'] + batch_imgs.append(inputs) + + gt_bboxes = datasamples.gt_instances.bboxes.tensor + gt_labels = datasamples.gt_instances.labels + if 'masks' in datasamples.gt_instances: + masks = datasamples.gt_instances.masks.to_tensor( + dtype=torch.bool, device=gt_bboxes.device) + batch_masks.append(masks) + batch_idx = gt_labels.new_full((len(gt_labels), 1), i) + bboxes_labels = torch.cat((batch_idx, gt_labels[:, None], gt_bboxes), + dim=1) + batch_bboxes_labels.append(bboxes_labels) + + collated_results = { + 'data_samples': { + 'bboxes_labels': torch.cat(batch_bboxes_labels, 0) + } + } + if len(batch_masks) > 0: + collated_results['data_samples']['masks'] = torch.cat(batch_masks, 0) + + if use_ms_training: + collated_results['inputs'] = batch_imgs + else: + collated_results['inputs'] = torch.stack(batch_imgs, 0) + + if hasattr(data_batch[0]['data_samples'], 'texts'): + batch_texts = [meta['data_samples'].texts for meta in data_batch] + collated_results['data_samples']['texts'] = batch_texts + + if hasattr(data_batch[0]['data_samples'], 'is_detection'): + # detection flag + batch_detection = [meta['data_samples'].is_detection + for meta in data_batch] + collated_results['data_samples']['is_detection'] = torch.tensor( + batch_detection) + + return collated_results diff --git a/yolo_world/datasets/yolov5_lvis.py b/yolo_world/datasets/yolov5_lvis.py new file mode 100644 index 0000000000000000000000000000000000000000..554383952252fe028fb90ae7f3e9bb1eee97e6d1 --- /dev/null +++ b/yolo_world/datasets/yolov5_lvis.py @@ -0,0 +1,15 @@ +# Copyright (c) Tencent Inc. All rights reserved. +from mmdet.datasets import LVISV1Dataset +from mmyolo.registry import DATASETS + +from .utils import RobustBatchShapePolicyDataset + + +@DATASETS.register_module() +class YOLOv5LVISV1Dataset(RobustBatchShapePolicyDataset, LVISV1Dataset): + """Dataset for YOLOv5 LVIS Dataset. + + We only add `BatchShapePolicy` function compared with Objects365V1Dataset. + See `mmyolo/datasets/utils.py#BatchShapePolicy` for details + """ + pass diff --git a/yolo_world/datasets/yolov5_mixed_grounding.py b/yolo_world/datasets/yolov5_mixed_grounding.py new file mode 100644 index 0000000000000000000000000000000000000000..ad4348cfe8964c148bf21baab85b2d91dbc7ef70 --- /dev/null +++ b/yolo_world/datasets/yolov5_mixed_grounding.py @@ -0,0 +1,201 @@ +# Copyright (c) Tencent Inc. All rights reserved. +import os.path as osp +from typing import List, Union + +from mmengine.fileio import get_local_path, join_path +from mmengine.utils import is_abs +from mmdet.datasets.coco import CocoDataset +from mmyolo.registry import DATASETS + +from .utils import RobustBatchShapePolicyDataset + + +@DATASETS.register_module() +class YOLOv5MixedGroundingDataset(RobustBatchShapePolicyDataset, CocoDataset): + """Mixed grounding dataset.""" + + METAINFO = { + 'classes': ('object',), + 'palette': [(220, 20, 60)]} + + def load_data_list(self) -> List[dict]: + """Load annotations from an annotation file named as ``self.ann_file`` + + Returns: + List[dict]: A list of annotation. 
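+
+        Note:
+            Besides the usual image and instance fields, every entry also
+            carries per-image ``texts`` built from the caption spans
+            referenced by ``tokens_positive``, and ``is_detection`` is
+            set to 1.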
+ """ # noqa: E501 + with get_local_path( + self.ann_file, backend_args=self.backend_args) as local_path: + self.coco = self.COCOAPI(local_path) + + img_ids = self.coco.get_img_ids() + data_list = [] + total_ann_ids = [] + for img_id in img_ids: + raw_img_info = self.coco.load_imgs([img_id])[0] + raw_img_info['img_id'] = img_id + + ann_ids = self.coco.get_ann_ids(img_ids=[img_id]) + raw_ann_info = self.coco.load_anns(ann_ids) + total_ann_ids.extend(ann_ids) + + parsed_data_info = self.parse_data_info({ + 'raw_ann_info': + raw_ann_info, + 'raw_img_info': + raw_img_info + }) + data_list.append(parsed_data_info) + if self.ANN_ID_UNIQUE: + assert len(set(total_ann_ids)) == len( + total_ann_ids + ), f"Annotation ids in '{self.ann_file}' are not unique!" + + del self.coco + # print(len(data_list)) + return data_list + + def parse_data_info(self, raw_data_info: dict) -> Union[dict, List[dict]]: + """Parse raw annotation to target format. + + Args: + raw_data_info (dict): Raw data information load from ``ann_file`` + + Returns: + Union[dict, List[dict]]: Parsed annotation. + """ + img_info = raw_data_info['raw_img_info'] + ann_info = raw_data_info['raw_ann_info'] + + data_info = {} + + img_path = None + img_prefix = self.data_prefix.get('img', None) + if isinstance(img_prefix, str): + img_path = osp.join(img_prefix, img_info['file_name']) + elif isinstance(img_prefix, (list, tuple)): + for prefix in img_prefix: + candidate_img_path = osp.join(prefix, img_info['file_name']) + if osp.exists(candidate_img_path): + img_path = candidate_img_path + break + assert img_path is not None, ( + f'Image path {img_info["file_name"]} not found in' + f'{img_prefix}') + if self.data_prefix.get('seg', None): + seg_map_path = osp.join( + self.data_prefix['seg'], + img_info['file_name'].rsplit('.', 1)[0] + self.seg_map_suffix) + else: + seg_map_path = None + data_info['img_path'] = img_path + data_info['img_id'] = img_info['img_id'] + data_info['seg_map_path'] = seg_map_path + data_info['height'] = float(img_info['height']) + data_info['width'] = float(img_info['width']) + + cat2id = {} + texts = [] + for ann in ann_info: + cat_name = ' '.join([img_info['caption'][t[0]:t[1]] + for t in ann['tokens_positive']]) + if cat_name not in cat2id: + cat2id[cat_name] = len(cat2id) + texts.append([cat_name]) + data_info['texts'] = texts + + instances = [] + for i, ann in enumerate(ann_info): + instance = {} + + if ann.get('ignore', False): + continue + x1, y1, w, h = ann['bbox'] + inter_w = max(0, + min(x1 + w, float(img_info['width'])) - max(x1, 0)) + inter_h = max(0, + min(y1 + h, float(img_info['height'])) - max(y1, 0)) + if inter_w * inter_h == 0: + continue + if ann['area'] <= 0 or w < 1 or h < 1: + continue + bbox = [x1, y1, x1 + w, y1 + h] + + if ann.get('iscrowd', False): + instance['ignore_flag'] = 1 + else: + instance['ignore_flag'] = 0 + instance['bbox'] = bbox + + cat_name = ' '.join([img_info['caption'][t[0]:t[1]] + for t in ann['tokens_positive']]) + instance['bbox_label'] = cat2id[cat_name] + + if ann.get('segmentation', None): + instance['mask'] = ann['segmentation'] + + instances.append(instance) + # NOTE: for detection task, we set `is_detection` to 1 + data_info['is_detection'] = 1 + data_info['instances'] = instances + # print(data_info['texts']) + return data_info + + def filter_data(self) -> List[dict]: + """Filter annotations according to filter_cfg. + + Returns: + List[dict]: Filtered results. 
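+
+        Note:
+            Only ``filter_empty_gt`` and ``min_size`` from ``filter_cfg``
+            are honoured here; per-category filtering is skipped since
+            grounding labels are per-image phrases rather than a fixed
+            class list.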
+ """ + if self.test_mode: + return self.data_list + + if self.filter_cfg is None: + return self.data_list + + filter_empty_gt = self.filter_cfg.get('filter_empty_gt', False) + min_size = self.filter_cfg.get('min_size', 0) + + # obtain images that contain annotation + ids_with_ann = set(data_info['img_id'] for data_info in self.data_list) + + valid_data_infos = [] + for i, data_info in enumerate(self.data_list): + img_id = data_info['img_id'] + width = int(data_info['width']) + height = int(data_info['height']) + if filter_empty_gt and img_id not in ids_with_ann: + continue + if min(width, height) >= min_size: + valid_data_infos.append(data_info) + + return valid_data_infos + + def _join_prefix(self): + """Join ``self.data_root`` with ``self.data_prefix`` and + ``self.ann_file``. + """ + # Automatically join annotation file path with `self.root` if + # `self.ann_file` is not an absolute path. + if self.ann_file and not is_abs(self.ann_file) and self.data_root: + self.ann_file = join_path(self.data_root, self.ann_file) + # Automatically join data directory with `self.root` if path value in + # `self.data_prefix` is not an absolute path. + for data_key, prefix in self.data_prefix.items(): + if isinstance(prefix, (list, tuple)): + abs_prefix = [] + for p in prefix: + if not is_abs(p) and self.data_root: + abs_prefix.append(join_path(self.data_root, p)) + else: + abs_prefix.append(p) + self.data_prefix[data_key] = abs_prefix + elif isinstance(prefix, str): + if not is_abs(prefix) and self.data_root: + self.data_prefix[data_key] = join_path( + self.data_root, prefix) + else: + self.data_prefix[data_key] = prefix + else: + raise TypeError('prefix should be a string, tuple or list,' + f'but got {type(prefix)}') diff --git a/yolo_world/datasets/yolov5_obj365v1.py b/yolo_world/datasets/yolov5_obj365v1.py new file mode 100644 index 0000000000000000000000000000000000000000..79eff774d2e0ed8f3ce6f0c8d19016956b73c434 --- /dev/null +++ b/yolo_world/datasets/yolov5_obj365v1.py @@ -0,0 +1,16 @@ +# Copyright (c) Tencent Inc. All rights reserved. +from mmdet.datasets import Objects365V1Dataset +from mmyolo.registry import DATASETS + +from .utils import RobustBatchShapePolicyDataset + + +@DATASETS.register_module() +class YOLOv5Objects365V1Dataset(RobustBatchShapePolicyDataset, + Objects365V1Dataset): + """Dataset for YOLOv5 VOC Dataset. + + We only add `BatchShapePolicy` function compared with Objects365V1Dataset. + See `mmyolo/datasets/utils.py#BatchShapePolicy` for details + """ + pass diff --git a/yolo_world/datasets/yolov5_obj365v2.py b/yolo_world/datasets/yolov5_obj365v2.py new file mode 100644 index 0000000000000000000000000000000000000000..9552f2d1193cacc02dfce0899a46b8a5df16a728 --- /dev/null +++ b/yolo_world/datasets/yolov5_obj365v2.py @@ -0,0 +1,16 @@ +# Copyright (c) Tencent Inc. All rights reserved. +from mmdet.datasets import Objects365V2Dataset +from mmyolo.registry import DATASETS + +from .utils import RobustBatchShapePolicyDataset + + +@DATASETS.register_module() +class YOLOv5Objects365V2Dataset(RobustBatchShapePolicyDataset, + Objects365V2Dataset): + """Dataset for YOLOv5 VOC Dataset. + + We only add `BatchShapePolicy` function compared with Objects365V1Dataset. 
+ See `mmyolo/datasets/utils.py#BatchShapePolicy` for details + """ + pass diff --git a/yolo_world/datasets/yolov5_v3det.py b/yolo_world/datasets/yolov5_v3det.py new file mode 100644 index 0000000000000000000000000000000000000000..d947948a262f7146feaedff48ad8d2f048820ef3 --- /dev/null +++ b/yolo_world/datasets/yolov5_v3det.py @@ -0,0 +1,109 @@ +# Copyright (c) Tencent Inc. All rights reserved. +import copy +import os.path as osp +from typing import List + +from mmengine.fileio import get_local_path +from mmdet.datasets.api_wrappers import COCO +from mmdet.datasets import CocoDataset +from mmyolo.registry import DATASETS + +from .utils import RobustBatchShapePolicyDataset + +v3det_ignore_list = [ + 'a00013820/26_275_28143226914_ff3a247c53_c.jpg', + 'n03815615/12_1489_32968099046_be38fa580e_c.jpg', + 'n04550184/19_1480_2504784164_ffa3db8844_c.jpg', + 'a00008703/2_363_3576131784_dfac6fc6ce_c.jpg', + 'n02814533/28_2216_30224383848_a90697f1b3_c.jpg', + 'n12026476/29_186_15091304754_5c219872f7_c.jpg', + 'n01956764/12_2004_50133201066_72e0d9fea5_c.jpg', + 'n03785016/14_2642_518053131_d07abcb5da_c.jpg', + 'a00011156/33_250_4548479728_9ce5246596_c.jpg', + 'a00009461/19_152_2792869324_db95bebc84_c.jpg', +] + +# # ugly code here +# import json +# with open(osp.join("data/v3det/cats.json"), 'r') as f: +# _classes = json.load(f)['classes'] + + +@DATASETS.register_module() +class V3DetDataset(CocoDataset): + """Objects365 v1 dataset for detection.""" + + METAINFO = {'classes': 'classes', 'palette': None} + + COCOAPI = COCO + # ann_id is unique in coco dataset. + ANN_ID_UNIQUE = True + + def load_data_list(self) -> List[dict]: + """Load annotations from an annotation file named as ``self.ann_file`` + + Returns: + List[dict]: A list of annotation. + """ # noqa: E501 + with get_local_path(self.ann_file, + backend_args=self.backend_args) as local_path: + self.coco = self.COCOAPI(local_path) + + # 'categories' list in objects365_train.json and objects365_val.json + # is inconsistent, need sort list(or dict) before get cat_ids. + cats = self.coco.cats + sorted_cats = {i: cats[i] for i in sorted(cats)} + self.coco.cats = sorted_cats + categories = self.coco.dataset['categories'] + sorted_categories = sorted(categories, key=lambda i: i['id']) + self.coco.dataset['categories'] = sorted_categories + # The order of returned `cat_ids` will not + # change with the order of the `classes` + self.cat_ids = self.coco.get_cat_ids( + cat_names=self.metainfo['classes']) + self.cat2label = {cat_id: i for i, cat_id in enumerate(self.cat_ids)} + self.cat_img_map = copy.deepcopy(self.coco.cat_img_map) + + img_ids = self.coco.get_img_ids() + data_list = [] + total_ann_ids = [] + for img_id in img_ids: + raw_img_info = self.coco.load_imgs([img_id])[0] + raw_img_info['img_id'] = img_id + + ann_ids = self.coco.get_ann_ids(img_ids=[img_id]) + raw_ann_info = self.coco.load_anns(ann_ids) + total_ann_ids.extend(ann_ids) + + file_name = osp.join( + osp.split(osp.split(raw_img_info['file_name'])[0])[-1], + osp.split(raw_img_info['file_name'])[-1]) + + if file_name in v3det_ignore_list: + continue + + parsed_data_info = self.parse_data_info({ + 'raw_ann_info': + raw_ann_info, + 'raw_img_info': + raw_img_info + }) + data_list.append(parsed_data_info) + if self.ANN_ID_UNIQUE: + assert len(set(total_ann_ids)) == len( + total_ann_ids + ), f"Annotation ids in '{self.ann_file}' are not unique!" 
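+
+        # At this point ``data_list`` is fully built; the COCO API object
+        # and the raw annotations it holds are no longer needed.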
+ + del self.coco + + return data_list + + +@DATASETS.register_module() +class YOLOv5V3DetDataset(RobustBatchShapePolicyDataset, V3DetDataset): + """Dataset for YOLOv5 VOC Dataset. + + We only add `BatchShapePolicy` function compared with Objects365V1Dataset. + See `mmyolo/datasets/utils.py#BatchShapePolicy` for details + """ + pass diff --git a/yolo_world/easydeploy/README.md b/yolo_world/easydeploy/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1816e7ed96ee34209c56af4a22eda5f1eb7e499b --- /dev/null +++ b/yolo_world/easydeploy/README.md @@ -0,0 +1,11 @@ +# MMYOLO Model Easy-Deployment + +## Introduction + +This project is developed for easily converting your MMYOLO models to other inference backends without the need of MMDeploy, which reduces the cost of both time and effort on getting familiar with MMDeploy. + +Currently we support converting to `ONNX` and `TensorRT` formats, other inference backends such `ncnn` will be added to this project as well. + +## Supported Backends + +- [Model Convert](docs/model_convert.md) diff --git a/yolo_world/easydeploy/README_zh-CN.md b/yolo_world/easydeploy/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..4c6bc0cf4ef91edeced04bdf15af08ae1f6f0dcd --- /dev/null +++ b/yolo_world/easydeploy/README_zh-CN.md @@ -0,0 +1,11 @@ +# MMYOLO 模型转换 + +## 介绍 + +本项目作为 MMYOLO 的部署 project 单独存在,意图剥离 MMDeploy 当前的体系,独自支持用户完成模型训练后的转换和部署功能,使用户的学习和工程成本下降。 + +当前支持对 ONNX 格式和 TensorRT 格式的转换,后续对其他推理平台也会支持起来。 + +## 转换教程 + +- [Model Convert](docs/model_convert.md) diff --git a/yolo_world/easydeploy/backbone/__init__.py b/yolo_world/easydeploy/backbone/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..dc167f8515c66a30d884ed9655a11d45e21481c0 --- /dev/null +++ b/yolo_world/easydeploy/backbone/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .common import DeployC2f +from .focus import DeployFocus, GConvFocus, NcnnFocus + +__all__ = ['DeployFocus', 'NcnnFocus', 'GConvFocus', 'DeployC2f'] diff --git a/yolo_world/easydeploy/backbone/common.py b/yolo_world/easydeploy/backbone/common.py new file mode 100644 index 0000000000000000000000000000000000000000..617875bd979a5b9150e476544090777118087a0b --- /dev/null +++ b/yolo_world/easydeploy/backbone/common.py @@ -0,0 +1,16 @@ +import torch +import torch.nn as nn +from torch import Tensor + + +class DeployC2f(nn.Module): + + def __init__(self, *args, **kwargs): + super().__init__() + + def forward(self, x: Tensor) -> Tensor: + x_main = self.main_conv(x) + x_main = [x_main, x_main[:, self.mid_channels:, ...]] + x_main.extend(blocks(x_main[-1]) for blocks in self.blocks) + x_main.pop(1) + return self.final_conv(torch.cat(x_main, 1)) diff --git a/yolo_world/easydeploy/backbone/focus.py b/yolo_world/easydeploy/backbone/focus.py new file mode 100644 index 0000000000000000000000000000000000000000..2a19afcca1d9c4e27109daeebd83907cd9b7b284 --- /dev/null +++ b/yolo_world/easydeploy/backbone/focus.py @@ -0,0 +1,79 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
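+# Deploy-friendly replacements for the Focus layer used by YOLOv5-style
+# backbones. Each class copies the attributes of the original Focus module
+# and only rewrites the space-to-depth step: ``DeployFocus`` uses plain
+# reshape/permute ops, ``NcnnFocus`` expresses it with reshapes that map to
+# ncnn's shufflechannel, and ``GConvFocus`` realises it with four fixed
+# 2x2 strided grouped convolutions.
+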
+import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor + + +class DeployFocus(nn.Module): + + def __init__(self, orin_Focus: nn.Module): + super().__init__() + self.__dict__.update(orin_Focus.__dict__) + + def forward(self, x: Tensor) -> Tensor: + batch_size, channel, height, width = x.shape + x = x.reshape(batch_size, channel, -1, 2, width) + x = x.reshape(batch_size, channel, x.shape[2], 2, -1, 2) + half_h = x.shape[2] + half_w = x.shape[4] + x = x.permute(0, 5, 3, 1, 2, 4) + x = x.reshape(batch_size, channel * 4, half_h, half_w) + + return self.conv(x) + + +class NcnnFocus(nn.Module): + + def __init__(self, orin_Focus: nn.Module): + super().__init__() + self.__dict__.update(orin_Focus.__dict__) + + def forward(self, x: Tensor) -> Tensor: + batch_size, c, h, w = x.shape + assert h % 2 == 0 and w % 2 == 0, f'focus for yolox needs even feature\ + height and width, got {(h, w)}.' + + x = x.reshape(batch_size, c * h, 1, w) + _b, _c, _h, _w = x.shape + g = _c // 2 + # fuse to ncnn's shufflechannel + x = x.view(_b, g, 2, _h, _w) + x = torch.transpose(x, 1, 2).contiguous() + x = x.view(_b, -1, _h, _w) + + x = x.reshape(_b, c * h * w, 1, 1) + + _b, _c, _h, _w = x.shape + g = _c // 2 + # fuse to ncnn's shufflechannel + x = x.view(_b, g, 2, _h, _w) + x = torch.transpose(x, 1, 2).contiguous() + x = x.view(_b, -1, _h, _w) + + x = x.reshape(_b, c * 4, h // 2, w // 2) + + return self.conv(x) + + +class GConvFocus(nn.Module): + + def __init__(self, orin_Focus: nn.Module): + super().__init__() + device = next(orin_Focus.parameters()).device + self.weight1 = torch.tensor([[1., 0], [0, 0]]).expand(3, 1, 2, + 2).to(device) + self.weight2 = torch.tensor([[0, 0], [1., 0]]).expand(3, 1, 2, + 2).to(device) + self.weight3 = torch.tensor([[0, 1.], [0, 0]]).expand(3, 1, 2, + 2).to(device) + self.weight4 = torch.tensor([[0, 0], [0, 1.]]).expand(3, 1, 2, + 2).to(device) + self.__dict__.update(orin_Focus.__dict__) + + def forward(self, x: Tensor) -> Tensor: + conv1 = F.conv2d(x, self.weight1, stride=2, groups=3) + conv2 = F.conv2d(x, self.weight2, stride=2, groups=3) + conv3 = F.conv2d(x, self.weight3, stride=2, groups=3) + conv4 = F.conv2d(x, self.weight4, stride=2, groups=3) + return self.conv(torch.cat([conv1, conv2, conv3, conv4], dim=1)) diff --git a/yolo_world/easydeploy/bbox_code/__init__.py b/yolo_world/easydeploy/bbox_code/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b85a815536a5749a15f0ad6aab2b028eb6a3fe0a --- /dev/null +++ b/yolo_world/easydeploy/bbox_code/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .bbox_coder import (rtmdet_bbox_decoder, yolov5_bbox_decoder, + yolox_bbox_decoder) + +__all__ = ['yolov5_bbox_decoder', 'rtmdet_bbox_decoder', 'yolox_bbox_decoder'] diff --git a/yolo_world/easydeploy/bbox_code/bbox_coder.py b/yolo_world/easydeploy/bbox_code/bbox_coder.py new file mode 100644 index 0000000000000000000000000000000000000000..6483cf8b0328aff3d61f1fa0788337ab536d347d --- /dev/null +++ b/yolo_world/easydeploy/bbox_code/bbox_coder.py @@ -0,0 +1,46 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
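+# Box decoders used when exporting detection heads with EasyDeploy. For
+# reference, the YOLOv5 decoder below recovers boxes from sigmoid-activated
+# predictions ``t`` and corner-format priors as
+#     cx = (t_x - 0.5) * 2 * stride + prior_cx
+#     cy = (t_y - 0.5) * 2 * stride + prior_cy
+#     w  = (t_w * 2) ** 2 * prior_w
+#     h  = (t_h * 2) ** 2 * prior_h
+# and returns (cx, cy, w, h) boxes; the RTMDet decoder returns corner
+# (x1, y1, x2, y2) boxes, and the YOLOX decoder returns (cx, cy, w, h)
+# with an exponential applied to the width/height offsets.
+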
+from typing import Optional + +import torch +from torch import Tensor + + +def yolov5_bbox_decoder(priors: Tensor, bbox_preds: Tensor, + stride: Tensor) -> Tensor: + bbox_preds = bbox_preds.sigmoid() + + x_center = (priors[..., 0] + priors[..., 2]) * 0.5 + y_center = (priors[..., 1] + priors[..., 3]) * 0.5 + w = priors[..., 2] - priors[..., 0] + h = priors[..., 3] - priors[..., 1] + + x_center_pred = (bbox_preds[..., 0] - 0.5) * 2 * stride + x_center + y_center_pred = (bbox_preds[..., 1] - 0.5) * 2 * stride + y_center + w_pred = (bbox_preds[..., 2] * 2)**2 * w + h_pred = (bbox_preds[..., 3] * 2)**2 * h + + decoded_bboxes = torch.stack( + [x_center_pred, y_center_pred, w_pred, h_pred], dim=-1) + + return decoded_bboxes + + +def rtmdet_bbox_decoder(priors: Tensor, bbox_preds: Tensor, + stride: Optional[Tensor]) -> Tensor: + stride = stride[None, :, None] + bbox_preds *= stride + tl_x = (priors[..., 0] - bbox_preds[..., 0]) + tl_y = (priors[..., 1] - bbox_preds[..., 1]) + br_x = (priors[..., 0] + bbox_preds[..., 2]) + br_y = (priors[..., 1] + bbox_preds[..., 3]) + decoded_bboxes = torch.stack([tl_x, tl_y, br_x, br_y], -1) + return decoded_bboxes + + +def yolox_bbox_decoder(priors: Tensor, bbox_preds: Tensor, + stride: Optional[Tensor]) -> Tensor: + stride = stride[None, :, None] + xys = (bbox_preds[..., :2] * stride) + priors + whs = bbox_preds[..., 2:].exp() * stride + decoded_bboxes = torch.cat([xys, whs], -1) + return decoded_bboxes diff --git a/yolo_world/easydeploy/deepstream/CMakeLists.txt b/yolo_world/easydeploy/deepstream/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..f640bea13bacfc0f6cc2f33e598f65cf5ce0922e --- /dev/null +++ b/yolo_world/easydeploy/deepstream/CMakeLists.txt @@ -0,0 +1,35 @@ +cmake_minimum_required(VERSION 2.8.12) + +set(CMAKE_CUDA_ARCHITECTURES 60 61 62 70 72 75 86) +set(CMAKE_CUDA_COMPILER /usr/local/cuda/bin/nvcc) + +project(nvdsparsebbox_mmyolo LANGUAGES CXX) + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14 -O3 -g -Wall -Werror -shared -fPIC") +set(CMAKE_CXX_STANDARD 14) +set(CMAKE_BUILD_TYPE Release) +option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) + +# CUDA +find_package(CUDA REQUIRED) + +# TensorRT +set(TensorRT_INCLUDE_DIRS "/usr/include/x86_64-linux-gnu" CACHE STRING "TensorRT headers path") +set(TensorRT_LIBRARIES "/usr/lib/x86_64-linux-gnu" CACHE STRING "TensorRT libs path") + +# DeepStream +set(DEEPSTREAM "/opt/nvidia/deepstream/deepstream" CACHE STRING "DeepStream root path") +set(DS_LIBRARIES ${DEEPSTREAM}/lib) +set(DS_INCLUDE_DIRS ${DEEPSTREAM}/sources/includes) + +include_directories( + ${CUDA_INCLUDE_DIRS} + ${TensorRT_INCLUDE_DIRS} + ${DS_INCLUDE_DIRS}) + +add_library( + ${PROJECT_NAME} + SHARED + custom_mmyolo_bbox_parser/nvdsparsebbox_mmyolo.cpp) + +target_link_libraries(${PROJECT_NAME} PRIVATE nvinfer nvinfer_plugin) diff --git a/yolo_world/easydeploy/deepstream/README.md b/yolo_world/easydeploy/deepstream/README.md new file mode 100644 index 0000000000000000000000000000000000000000..111f3765e41d558b64097d8a25585bd9c14acf4f --- /dev/null +++ b/yolo_world/easydeploy/deepstream/README.md @@ -0,0 +1,48 @@ +# Inference MMYOLO Models with DeepStream + +This project demonstrates how to inference MMYOLO models with customized parsers in [DeepStream SDK](https://developer.nvidia.com/deepstream-sdk). + +## Pre-requisites + +### 1. 
Install Nvidia Driver and CUDA + +First, please follow the official documents and instructions to install dedicated Nvidia graphic driver and CUDA matched to your gpu and target Nvidia AIoT devices. + +### 2. Install DeepStream SDK + +Second, please follow the official instruction to download and install DeepStream SDK. Currently stable version of DeepStream is v6.2. + +### 3. Generate TensorRT Engine + +As DeepStream builds on top of several NVIDIA libraries, you need to first convert your trained MMYOLO models to TensorRT engine files. We strongly recommend you to try the supported TensorRT deployment solution in [EasyDeploy](../../easydeploy/). + +## Build and Run + +Please make sure that your converted TensorRT engine is already located in the `deepstream` folder as the config shows. Create your own model config files and change the `config-file` parameter in [deepstream_app_config.txt](deepstream_app_config.txt) to the model you want to run with. + +```bash +mkdir build && cd build +cmake .. +make -j$(nproc) && make install +``` + +Then you can run the inference with this command. + +```bash +deepstream-app -c deepstream_app_config.txt +``` + +## Code Structure + +```bash +├── deepstream +│ ├── configs # config file for MMYOLO models +│ │ └── config_infer_rtmdet.txt +│ ├── custom_mmyolo_bbox_parser # customized parser for MMYOLO models to DeepStream formats +│ │ └── nvdsparsebbox_mmyolo.cpp +| ├── CMakeLists.txt +│ ├── coco_labels.txt # labels for coco detection +│ ├── deepstream_app_config.txt # deepStream reference app configs for MMYOLO models +│ ├── README_zh-CN.md +│ └── README.md +``` diff --git a/yolo_world/easydeploy/deepstream/README_zh-CN.md b/yolo_world/easydeploy/deepstream/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..13a85d5bc90159c3ff9f1a32e93d01e82ed2faa4 --- /dev/null +++ b/yolo_world/easydeploy/deepstream/README_zh-CN.md @@ -0,0 +1,48 @@ +# 使用 DeepStream SDK 推理 MMYOLO 模型 + +本项目演示了如何使用 [DeepStream SDK](https://developer.nvidia.com/deepstream-sdk) 配合改写的 parser 来推理 MMYOLO 的模型。 + +## 预先准备 + +### 1. 安装 Nidia 驱动和 CUDA + +首先请根据当前的显卡驱动和目标使用设备的驱动完成显卡驱动和 CUDA 的安装。 + +### 2. 安装 DeepStream SDK + +目前 DeepStream SDK 稳定版本已经更新到 v6.2,官方推荐使用这个版本。 + +### 3. 将 MMYOLO 模型转换为 TensorRT Engine + +推荐使用 EasyDeploy 中的 TensorRT 方案完成目标模型的转换部署,具体可参考 [此文档](../../easydeploy/docs/model_convert.md) 。 + +## 编译使用 + +当前项目使用的是 MMYOLO 的 rtmdet 模型,若想使用其他的模型,请参照目录下的配置文件进行改写。然后将转换完的 TensorRT engine 放在当前目录下并执行如下命令: + +```bash +mkdir build && cd build +cmake .. 
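+# if TensorRT or DeepStream are installed in non-default locations, override the
+# cache variables declared in CMakeLists.txt when running cmake, e.g.
+# cmake -DDEEPSTREAM=/opt/nvidia/deepstream/deepstream -DTensorRT_INCLUDE_DIRS=/usr/include/x86_64-linux-gnu ..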
+make -j$(nproc) && make install +``` + +完成编译后可使用如下命令进行推理: + +```bash +deepstream-app -c deepstream_app_config.txt +``` + +## 项目代码结构 + +```bash +├── deepstream +│ ├── configs # MMYOLO 模型对应的 DeepStream 配置 +│ │ └── config_infer_rtmdet.txt +│ ├── custom_mmyolo_bbox_parser # 适配 DeepStream formats 的 parser +│ │ └── nvdsparsebbox_mmyolo.cpp +| ├── CMakeLists.txt +│ ├── coco_labels.txt # coco labels +│ ├── deepstream_app_config.txt # DeepStream app 配置 +│ ├── README_zh-CN.md +│ └── README.md +``` diff --git a/yolo_world/easydeploy/deepstream/coco_labels.txt b/yolo_world/easydeploy/deepstream/coco_labels.txt new file mode 100644 index 0000000000000000000000000000000000000000..ca76c80b5b2cd0b25047f75736656cfebc9da7aa --- /dev/null +++ b/yolo_world/easydeploy/deepstream/coco_labels.txt @@ -0,0 +1,80 @@ +person +bicycle +car +motorbike +aeroplane +bus +train +truck +boat +traffic light +fire hydrant +stop sign +parking meter +bench +bird +cat +dog +horse +sheep +cow +elephant +bear +zebra +giraffe +backpack +umbrella +handbag +tie +suitcase +frisbee +skis +snowboard +sports ball +kite +baseball bat +baseball glove +skateboard +surfboard +tennis racket +bottle +wine glass +cup +fork +knife +spoon +bowl +banana +apple +sandwich +orange +broccoli +carrot +hot dog +pizza +donut +cake +chair +sofa +pottedplant +bed +diningtable +toilet +tvmonitor +laptop +mouse +remote +keyboard +cell phone +microwave +oven +toaster +sink +refrigerator +book +clock +vase +scissors +teddy bear +hair drier +toothbrush diff --git a/yolo_world/easydeploy/deepstream/configs/config_infer_rtmdet.txt b/yolo_world/easydeploy/deepstream/configs/config_infer_rtmdet.txt new file mode 100644 index 0000000000000000000000000000000000000000..a1e5efd2a3810730144e037ee96dfbd36124b0e6 --- /dev/null +++ b/yolo_world/easydeploy/deepstream/configs/config_infer_rtmdet.txt @@ -0,0 +1,22 @@ +[property] +gpu-id=0 +net-scale-factor=0.01735207357279195 +offsets=57.375;57.12;58.395 +model-color-format=1 +model-engine-file=../end2end.engine +labelfile-path=../coco_labels.txt +batch-size=1 +network-mode=0 +num-detected-classes=80 +interval=0 +gie-unique-id=1 +process-mode=1 +network-type=0 +cluster-mode=2 +maintain-aspect-ratio=1 +parse-bbox-func-name=NvDsInferParseCustomMMYOLO +custom-lib-path=../build/libnvdsparsebbox_mmyolo.so + +[class-attrs-all] +pre-cluster-threshold=0.45 +topk=100 diff --git a/yolo_world/easydeploy/deepstream/configs/config_infer_yolov5.txt b/yolo_world/easydeploy/deepstream/configs/config_infer_yolov5.txt new file mode 100644 index 0000000000000000000000000000000000000000..6ad7d6429cacd0a6050821e5b2a41317478f5119 --- /dev/null +++ b/yolo_world/easydeploy/deepstream/configs/config_infer_yolov5.txt @@ -0,0 +1,21 @@ +[property] +gpu-id=0 +net-scale-factor=0.0039215697906911373 +model-color-format=0 +model-engine-file=../end2end.engine +labelfile-path=../coco_labels.txt +batch-size=1 +network-mode=0 +num-detected-classes=80 +interval=0 +gie-unique-id=1 +process-mode=1 +network-type=0 +cluster-mode=2 +maintain-aspect-ratio=1 +parse-bbox-func-name=NvDsInferParseCustomMMYOLO +custom-lib-path=../build/libnvdsparsebbox_mmyolo.so + +[class-attrs-all] +pre-cluster-threshold=0.45 +topk=100 diff --git a/yolo_world/easydeploy/deepstream/configs/config_infer_yolov8.txt b/yolo_world/easydeploy/deepstream/configs/config_infer_yolov8.txt new file mode 100644 index 0000000000000000000000000000000000000000..6ad7d6429cacd0a6050821e5b2a41317478f5119 --- /dev/null +++ b/yolo_world/easydeploy/deepstream/configs/config_infer_yolov8.txt @@ -0,0 +1,21 @@ 
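+# Same inference settings as config_infer_yolov5.txt: net-scale-factor is
+# approximately 1/255 with no mean offsets, matching the YOLOv8 preprocessing,
+# and the custom MMYOLO parser reads the end2end engine outputs.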
+[property] +gpu-id=0 +net-scale-factor=0.0039215697906911373 +model-color-format=0 +model-engine-file=../end2end.engine +labelfile-path=../coco_labels.txt +batch-size=1 +network-mode=0 +num-detected-classes=80 +interval=0 +gie-unique-id=1 +process-mode=1 +network-type=0 +cluster-mode=2 +maintain-aspect-ratio=1 +parse-bbox-func-name=NvDsInferParseCustomMMYOLO +custom-lib-path=../build/libnvdsparsebbox_mmyolo.so + +[class-attrs-all] +pre-cluster-threshold=0.45 +topk=100 diff --git a/yolo_world/easydeploy/deepstream/custom_mmyolo_bbox_parser/nvdsparsebbox_mmyolo.cpp b/yolo_world/easydeploy/deepstream/custom_mmyolo_bbox_parser/nvdsparsebbox_mmyolo.cpp new file mode 100644 index 0000000000000000000000000000000000000000..eb780856cbd2b289cdf9dc8518438f946a2ab548 --- /dev/null +++ b/yolo_world/easydeploy/deepstream/custom_mmyolo_bbox_parser/nvdsparsebbox_mmyolo.cpp @@ -0,0 +1,118 @@ +#include "nvdsinfer_custom_impl.h" +#include +#include + +/** + * Function expected by DeepStream for decoding the MMYOLO output. + * + * C-linkage [extern "C"] was written to prevent name-mangling. This function must return true after + * adding all bounding boxes to the objectList vector. + * + * @param [outputLayersInfo] std::vector of NvDsInferLayerInfo objects with information about the output layer. + * @param [networkInfo] NvDsInferNetworkInfo object with information about the MMYOLO network. + * @param [detectionParams] NvDsInferParseDetectionParams with information about some config params. + * @param [objectList] std::vector of NvDsInferParseObjectInfo objects to which bounding box information must + * be stored. + * + * @return true + */ + +// This is just the function prototype. The definition is written at the end of the file. +extern "C" bool NvDsInferParseCustomMMYOLO( + std::vector const& outputLayersInfo, + NvDsInferNetworkInfo const& networkInfo, + NvDsInferParseDetectionParams const& detectionParams, + std::vector& objectList); + +static __inline__ float clamp(float& val, float min, float max) +{ + return val > min ? (val < max ? val : max) : min; +} + +static std::vector decodeMMYoloTensor( + const int* num_dets, + const float* bboxes, + const float* scores, + const int* labels, + const float& conf_thres, + const unsigned int& img_w, + const unsigned int& img_h +) +{ + std::vector bboxInfo; + size_t nums = num_dets[0]; + for (size_t i = 0; i < nums; i++) + { + float score = scores[i]; + if (score < conf_thres)continue; + float x0 = (bboxes[i * 4]); + float y0 = (bboxes[i * 4 + 1]); + float x1 = (bboxes[i * 4 + 2]); + float y1 = (bboxes[i * 4 + 3]); + x0 = clamp(x0, 0.f, img_w); + y0 = clamp(y0, 0.f, img_h); + x1 = clamp(x1, 0.f, img_w); + y1 = clamp(y1, 0.f, img_h); + NvDsInferParseObjectInfo obj; + obj.left = x0; + obj.top = y0; + obj.width = x1 - x0; + obj.height = y1 - y0; + obj.detectionConfidence = score; + obj.classId = labels[i]; + bboxInfo.push_back(obj); + } + + return bboxInfo; +} + +/* C-linkage to prevent name-mangling */ +extern "C" bool NvDsInferParseCustomMMYOLO( + std::vector const& outputLayersInfo, + NvDsInferNetworkInfo const& networkInfo, + NvDsInferParseDetectionParams const& detectionParams, + std::vector& objectList) +{ + +// Some assertions and error checking. + if (outputLayersInfo.empty() || outputLayersInfo.size() != 4) + { + std::cerr << "Could not find output layer in bbox parsing" << std::endl; + return false; + } + +// Score threshold of bboxes. + const float conf_thres = detectionParams.perClassThreshold[0]; + +// Obtaining the output layer. 
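+// Layers are indexed positionally, so the exported end2end engine must keep
+// the output order: num_dets, boxes, scores, labels.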
+ const NvDsInferLayerInfo& num_dets = outputLayersInfo[0]; + const NvDsInferLayerInfo& bboxes = outputLayersInfo[1]; + const NvDsInferLayerInfo& scores = outputLayersInfo[2]; + const NvDsInferLayerInfo& labels = outputLayersInfo[3]; + +// num_dets(int) bboxes(float) scores(float) labels(int) + assert (num_dets.dims.numDims == 2); + assert (bboxes.dims.numDims == 3); + assert (scores.dims.numDims == 2); + assert (labels.dims.numDims == 2); + + +// Decoding the output tensor of MMYOLO to the NvDsInferParseObjectInfo format. + std::vector objects = + decodeMMYoloTensor( + (const int*)(num_dets.buffer), + (const float*)(bboxes.buffer), + (const float*)(scores.buffer), + (const int*)(labels.buffer), + conf_thres, + networkInfo.width, + networkInfo.height + ); + + objectList.clear(); + objectList = objects; + return true; +} + +/* Check that the custom function has been defined correctly */ +CHECK_CUSTOM_PARSE_FUNC_PROTOTYPE(NvDsInferParseCustomMMYOLO); diff --git a/yolo_world/easydeploy/deepstream/deepstream_app_config.txt b/yolo_world/easydeploy/deepstream/deepstream_app_config.txt new file mode 100644 index 0000000000000000000000000000000000000000..331776897a5e9109b9007ed1b7974f128287c4fc --- /dev/null +++ b/yolo_world/easydeploy/deepstream/deepstream_app_config.txt @@ -0,0 +1,62 @@ +[application] +enable-perf-measurement=1 +perf-measurement-interval-sec=5 + +[tiled-display] +enable=1 +rows=1 +columns=1 +width=1280 +height=720 +gpu-id=0 +nvbuf-memory-type=0 + +[source0] +enable=1 +type=3 +uri=file:///opt/nvidia/deepstream/deepstream/samples/streams/sample_1080p_h264.mp4 +num-sources=1 +gpu-id=0 +cudadec-memtype=0 + +[sink0] +enable=1 +type=2 +sync=0 +gpu-id=0 +nvbuf-memory-type=0 + +[osd] +enable=1 +gpu-id=0 +border-width=5 +text-size=15 +text-color=1;1;1;1; +text-bg-color=0.3;0.3;0.3;1 +font=Serif +show-clock=0 +clock-x-offset=800 +clock-y-offset=820 +clock-text-size=12 +clock-color=1;0;0;0 +nvbuf-memory-type=0 + +[streammux] +gpu-id=0 +live-source=0 +batch-size=1 +batched-push-timeout=40000 +width=1920 +height=1080 +enable-padding=0 +nvbuf-memory-type=0 + +[primary-gie] +enable=1 +gpu-id=0 +gie-unique-id=1 +nvbuf-memory-type=0 +config-file=configs/config_infer_rtmdet.txt + +[tests] +file-loop=0 diff --git a/yolo_world/easydeploy/docs/model_convert.md b/yolo_world/easydeploy/docs/model_convert.md new file mode 100644 index 0000000000000000000000000000000000000000..9af62599dd1b56648680fc315ca88c35c7b31cb9 --- /dev/null +++ b/yolo_world/easydeploy/docs/model_convert.md @@ -0,0 +1,156 @@ +# MMYOLO 模型 ONNX 转换 + +## 1. 
导出后端支持的 ONNX + +## 环境依赖 + +- [onnx](https://github.com/onnx/onnx) + + ```shell + pip install onnx + ``` + + [onnx-simplifier](https://github.com/daquexian/onnx-simplifier) (可选,用于简化模型) + + ```shell + pip install onnx-simplifier + ``` + +\*\*\* 请确保您在 `MMYOLO` 根目录下运行相关脚本,避免无法找到相关依赖包。\*\*\* + +## 使用方法 + +[模型导出脚本](./projects/easydeploy/tools/export_onnx.py)用于将 `MMYOLO` 模型转换为 `onnx` 。 + +### 参数介绍: + +- `config` : 构建模型使用的配置文件,如 [`yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py`](./configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py) 。 +- `checkpoint` : 训练得到的权重文件,如 `yolov5s.pth` 。 +- `--work-dir` : 转换后的模型保存路径。 +- `--img-size`: 转换模型时输入的尺寸,如 `640 640`。 +- `--batch-size`: 转换后的模型输入 `batch size` 。 +- `--device`: 转换模型使用的设备,默认为 `cuda:0`。 +- `--simplify`: 是否简化导出的 `onnx` 模型,需要安装 [onnx-simplifier](https://github.com/daquexian/onnx-simplifier),默认关闭。 +- `--opset`: 指定导出 `onnx` 的 `opset`,默认为 `11` 。 +- `--backend`: 指定导出 `onnx` 用于的后端名称,`ONNXRuntime`: `onnxruntime`, `TensorRT8`: `tensorrt8`, `TensorRT7`: `tensorrt7`,默认为`onnxruntime`即 `ONNXRuntime`。 +- `--pre-topk`: 指定导出 `onnx` 的后处理筛选候选框个数阈值,默认为 `1000`。 +- `--keep-topk`: 指定导出 `onnx` 的非极大值抑制输出的候选框个数阈值,默认为 `100`。 +- `--iou-threshold`: 非极大值抑制中过滤重复候选框的 `iou` 阈值,默认为 `0.65`。 +- `--score-threshold`: 非极大值抑制中过滤候选框得分的阈值,默认为 `0.25`。 +- `--model-only`: 指定仅导出模型 backbone + neck, 不包含后处理,默认关闭。 + +例子: + +```shell +python ./projects/easydeploy/tools/export.py \ + configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py \ + yolov5s.pth \ + --work-dir work_dir \ + --img-size 640 640 \ + --batch 1 \ + --device cpu \ + --simplify \ + --opset 11 \ + --backend 1 \ + --pre-topk 1000 \ + --keep-topk 100 \ + --iou-threshold 0.65 \ + --score-threshold 0.25 +``` + +然后利用后端支持的工具如 `TensorRT` 读取 `onnx` 再次转换为后端支持的模型格式如 `.engine/.plan` 等。 + +`MMYOLO` 目前支持 `TensorRT8`, `TensorRT7`, `ONNXRuntime` 后端的端到端模型转换,目前仅支持静态 shape 模型的导出和转换,动态 batch 或动态长宽的模型端到端转换会在未来继续支持。 + +端到端转换得到的 `onnx` 模型输入输出如图: + +
+<!-- figure: inputs and outputs of the end-to-end exported ONNX model -->
+ +输入名: `images`, 尺寸 640x640 + +输出名: `num_dets`, 尺寸 1x1,表示检测目标数量。 + +输出名: `boxes`, 尺寸 1x100x4,表示检测框的坐标,格式为 `x1y1x2y1`。 + +输出名: `scores`, 尺寸 1x100,表示检测框的分数。 + +输出名: `labels`, 尺寸 1x100,表示检测框的类别 id。 + +可以利用 `num_dets` 中的个数对 `boxes`, `scores`, `labels` 进行截断,从 100 个检测结果中抽取前 `num_dets` 个目标作为最终检测结果。 + +## 2. 仅导出模型 Backbone + Neck + +当您需要部署在非 `TensorRT`, `ONNXRuntime` 等支持端到端部署的平台时,您可以考虑使用`--model-only` 参数并且不要传递 `--backend` 参数,您将会导出仅包含 `Backbone` + `neck` 的模型,模型的部分输出如图: + +
+<!-- figure: outputs of the model-only (backbone + neck) ONNX export -->
+ +这种导出方式获取的 `ONNX` 模型具有如下优点: + +- 算子简单,一般而言只包含 `Conv`,激活函数等简单算子,几乎不存在无法正确导出的情况,对于嵌入式部署更加友好。 +- 方便不同算法之间对比速度性能,由于不同的算法后处理不同,仅对比 `backbone` + `Neck` 的速度更加公平。 + +也有如下缺点: + +- 后处理逻辑需要单独完成,会有额外的 `decode` + `nms` 的操作需要实现。 +- 与 `TensorRT` 相比,由于 `TensorRT` 可以利用多核优势并行进行后处理,使用 `--model-only` 方式导出的模型性能会差很多。 + +### 使用方法 + +```shell +python ./projects/easydeploy/tools/export.py \ + configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py \ + yolov5s.pth \ + --work-dir work_dir \ + --img-size 640 640 \ + --batch 1 \ + --device cpu \ + --simplify \ + --opset 11 \ + --model-only +``` + +## 使用 `model-only` 导出的 ONNX 进行推理 + +[模型推理脚本](./projects/easydeploy/examples/main_onnxruntime.py)用于推理导出的 `ONNX` 模型,需要安装基础依赖环境: + +[`onnxruntime`](https://github.com/microsoft/onnxruntime) 和 [`opencv-python`](https://github.com/opencv/opencv-python) + +```shell +pip install onnxruntime +pip install opencv-python==4.7.0.72 # 建议使用最新的 opencv +``` + +### 参数介绍: + +- `img` : 待检测的图片路径或图片文件夹路径。 +- `onnx` : 导出的 `model-only` ONNX 模型。 +- `--type` : 模型名称,目前支持 `yolov5`, `yolox`, `yolov6`, `ppyoloe`, `ppyoloep`, `yolov7`, `rtmdet`, `yolov8`。 +- `--img-size`: 转换模型时输入的尺寸,如 `640 640`。 +- `--out-dir`: 保存检测结果的路径 。 +- `--show`: 是否可视化检测结果。 +- `--score-thr`: 模型检测后处理的置信度分数 。 +- `--iou-thr`: 模型检测后处理的 IOU 分数 。 + +## 使用方法 + +```shell +cd ./projects/easydeploy/examples +python main_onnxruntime.py \ + "image_path_to_detect" \ + yolov5_s_model-only.onnx \ + --out-dir work_dir \ + --img-size 640 640 \ + --show \ + --score-thr 0.3 \ + --iou-thr 0.7 +``` + +*注意!!!* + +当您使用自定义数据集训练得到的模型时,请修改 [`config.py`](./projects/easydeploy/examples/config.py) 中 `CLASS_NAMES` 和 `CLASS_COLORS`,如果是 `yolov5` 或者 `yolov7` 基于 `anchor` 的模型请同时修改 `YOLOv5_ANCHORS` 和 `YOLOv7_ANCHORS`。 + +[`numpy_coder.py`](./projects/easydeploy/examples/numpy_coder.py) 是目前所有算法仅使用 `numpy` 实现的 `decoder`,如果您对性能有较高的要求,可以参照相关代码改写为 `c/c++`。 diff --git a/yolo_world/easydeploy/examples/config.py b/yolo_world/easydeploy/examples/config.py new file mode 100644 index 0000000000000000000000000000000000000000..4a85ff34273c22a356c9d6a3eaeb048b637b5f40 --- /dev/null +++ b/yolo_world/easydeploy/examples/config.py @@ -0,0 +1,64 @@ +from enum import Enum + + +class TASK_TYPE(Enum): + DET = 'det' + SEG = 'seg' + POSE = 'pose' + + +class ModelType(Enum): + YOLOV5 = 'yolov5' + YOLOX = 'yolox' + PPYOLOE = 'ppyoloe' + PPYOLOEP = 'ppyoloep' + YOLOV6 = 'yolov6' + YOLOV7 = 'yolov7' + RTMDET = 'rtmdet' + YOLOV8 = 'yolov8' + + +CLASS_NAMES = ('person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', + 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', + 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', + 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', + 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', + 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', + 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', + 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', + 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', + 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', + 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', + 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', + 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', + 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush') + +CLASS_COLORS = [(220, 20, 60), (119, 11, 32), (0, 0, 142), (0, 0, 230), + (106, 0, 228), (0, 60, 100), (0, 80, 100), (0, 0, 70), + (0, 0, 192), (250, 170, 30), (100, 170, 30), (220, 220, 0), + (175, 116, 175), 
(250, 0, 30), (165, 42, 42), (255, 77, 255), + (0, 226, 252), (182, 182, 255), (0, 82, 0), (120, 166, 157), + (110, 76, 0), (174, 57, 255), (199, 100, 0), (72, 0, 118), + (255, 179, 240), (0, 125, 92), (209, 0, 151), (188, 208, 182), + (0, 220, 176), (255, 99, 164), (92, 0, 73), (133, 129, 255), + (78, 180, 255), (0, 228, 0), (174, 255, 243), (45, 89, 255), + (134, 134, 103), (145, 148, 174), (255, 208, 186), + (197, 226, 255), (171, 134, 1), (109, 63, 54), (207, 138, 255), + (151, 0, 95), (9, 80, 61), (84, 105, 51), (74, 65, 105), + (166, 196, 102), (208, 195, 210), (255, 109, 65), + (0, 143, 149), (179, 0, 194), (209, 99, 106), (5, 121, 0), + (227, 255, 205), (147, 186, 208), (153, 69, 1), (3, 95, 161), + (163, 255, 0), (119, 0, 170), (0, 182, 199), (0, 165, 120), + (183, 130, 88), (95, 32, 0), (130, 114, 135), (110, 129, 133), + (166, 74, 118), (219, 142, 185), (79, 210, 114), (178, 90, 62), + (65, 70, 15), (127, 167, 115), (59, 105, 106), (142, 108, 45), + (196, 172, 0), (95, 54, 80), (128, 76, 255), (201, 57, 1), + (246, 0, 122), (191, 162, 208)] + +YOLOv5_ANCHORS = [[(10, 13), (16, 30), (33, 23)], + [(30, 61), (62, 45), (59, 119)], + [(116, 90), (156, 198), (373, 326)]] + +YOLOv7_ANCHORS = [[(12, 16), (19, 36), (40, 28)], + [(36, 75), (76, 55), (72, 146)], + [(142, 110), (192, 243), (459, 401)]] diff --git a/yolo_world/easydeploy/examples/cv2_nms.py b/yolo_world/easydeploy/examples/cv2_nms.py new file mode 100644 index 0000000000000000000000000000000000000000..79e376356b75339c796aeeb280cd8cdb52db8518 --- /dev/null +++ b/yolo_world/easydeploy/examples/cv2_nms.py @@ -0,0 +1,36 @@ +from typing import List, Tuple, Union + +import cv2 +from numpy import ndarray + +MAJOR, MINOR = map(int, cv2.__version__.split('.')[:2]) +assert MAJOR == 4 + + +def non_max_suppression(boxes: Union[List[ndarray], Tuple[ndarray]], + scores: Union[List[float], Tuple[float]], + labels: Union[List[int], Tuple[int]], + conf_thres: float = 0.25, + iou_thres: float = 0.65) -> Tuple[List, List, List]: + if MINOR >= 7: + indices = cv2.dnn.NMSBoxesBatched(boxes, scores, labels, conf_thres, + iou_thres) + elif MINOR == 6: + indices = cv2.dnn.NMSBoxes(boxes, scores, conf_thres, iou_thres) + else: + indices = cv2.dnn.NMSBoxes(boxes, scores, conf_thres, + iou_thres).flatten() + + nmsd_boxes = [] + nmsd_scores = [] + nmsd_labels = [] + for idx in indices: + box = boxes[idx] + # x0y0wh -> x0y0x1y1 + box[2:] = box[:2] + box[2:] + score = scores[idx] + label = labels[idx] + nmsd_boxes.append(box) + nmsd_scores.append(score) + nmsd_labels.append(label) + return nmsd_boxes, nmsd_scores, nmsd_labels diff --git a/yolo_world/easydeploy/examples/main_onnxruntime.py b/yolo_world/easydeploy/examples/main_onnxruntime.py new file mode 100644 index 0000000000000000000000000000000000000000..bc0ad1b0f10ed6cbea8c8b3c0c5010ec7a760cb5 --- /dev/null +++ b/yolo_world/easydeploy/examples/main_onnxruntime.py @@ -0,0 +1,110 @@ +import math +import sys +from argparse import ArgumentParser +from pathlib import Path + +import cv2 +import onnxruntime +from config import (CLASS_COLORS, CLASS_NAMES, ModelType, YOLOv5_ANCHORS, + YOLOv7_ANCHORS) +from cv2_nms import non_max_suppression +from numpy_coder import Decoder +from preprocess import Preprocess +from tqdm import tqdm + +# Add __FILE__ to sys.path +sys.path.append(str(Path(__file__).resolve().parents[0])) + +IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif', + '.tiff', '.webp') + + +def path_to_list(path: str): + path = Path(path) + if path.is_file() and path.suffix in 
IMG_EXTENSIONS: + res_list = [str(path.absolute())] + elif path.is_dir(): + res_list = [ + str(p.absolute()) for p in path.iterdir() + if p.suffix in IMG_EXTENSIONS + ] + else: + raise RuntimeError + return res_list + + +def parse_args(): + parser = ArgumentParser() + parser.add_argument( + 'img', help='Image path, include image file, dir and URL.') + parser.add_argument('onnx', type=str, help='Onnx file') + parser.add_argument('--type', type=str, help='Model type') + parser.add_argument( + '--img-size', + nargs='+', + type=int, + default=[640, 640], + help='Image size of height and width') + parser.add_argument( + '--out-dir', default='./output', type=str, help='Path to output file') + parser.add_argument( + '--show', action='store_true', help='Show the detection results') + parser.add_argument( + '--score-thr', type=float, default=0.3, help='Bbox score threshold') + parser.add_argument( + '--iou-thr', type=float, default=0.7, help='Bbox iou threshold') + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + out_dir = Path(args.out_dir) + model_type = ModelType(args.type.lower()) + + if not args.show: + out_dir.mkdir(parents=True, exist_ok=True) + + files = path_to_list(args.img) + session = onnxruntime.InferenceSession( + args.onnx, providers=['CPUExecutionProvider']) + preprocessor = Preprocess(model_type) + decoder = Decoder(model_type, model_only=True) + if model_type == ModelType.YOLOV5: + anchors = YOLOv5_ANCHORS + elif model_type == ModelType.YOLOV7: + anchors = YOLOv7_ANCHORS + else: + anchors = None + + for file in tqdm(files): + image = cv2.imread(file) + image_h, image_w = image.shape[:2] + img, (ratio_w, ratio_h) = preprocessor(image, args.img_size) + features = session.run(None, {'images': img}) + decoder_outputs = decoder( + features, + args.score_thr, + num_labels=len(CLASS_NAMES), + anchors=anchors) + nmsd_boxes, nmsd_scores, nmsd_labels = non_max_suppression( + *decoder_outputs, args.score_thr, args.iou_thr) + for box, score, label in zip(nmsd_boxes, nmsd_scores, nmsd_labels): + x0, y0, x1, y1 = box + x0 = math.floor(min(max(x0 / ratio_w, 1), image_w - 1)) + y0 = math.floor(min(max(y0 / ratio_h, 1), image_h - 1)) + x1 = math.ceil(min(max(x1 / ratio_w, 1), image_w - 1)) + y1 = math.ceil(min(max(y1 / ratio_h, 1), image_h - 1)) + cv2.rectangle(image, (x0, y0), (x1, y1), CLASS_COLORS[label], 2) + cv2.putText(image, f'{CLASS_NAMES[label]}: {score:.2f}', + (x0, y0 - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, + (0, 255, 255), 2) + if args.show: + cv2.imshow('result', image) + cv2.waitKey(0) + else: + cv2.imwrite(f'{out_dir / Path(file).name}', image) + + +if __name__ == '__main__': + main() diff --git a/yolo_world/easydeploy/examples/numpy_coder.py b/yolo_world/easydeploy/examples/numpy_coder.py new file mode 100644 index 0000000000000000000000000000000000000000..ccd3687f89ed47dbbb1d90e603eba21a760bded9 --- /dev/null +++ b/yolo_world/easydeploy/examples/numpy_coder.py @@ -0,0 +1,310 @@ +from typing import List, Tuple, Union + +import numpy as np +from config import ModelType +from numpy import ndarray + + +def softmax(x: ndarray, axis: int = -1) -> ndarray: + e_x = np.exp(x - np.max(x, axis=axis, keepdims=True)) + y = e_x / e_x.sum(axis=axis, keepdims=True) + return y + + +def sigmoid(x: ndarray) -> ndarray: + return 1. / (1. 
+ np.exp(-x)) + + +class Decoder: + + def __init__(self, model_type: ModelType, model_only: bool = False): + self.model_type = model_type + self.model_only = model_only + self.boxes_pro = [] + self.scores_pro = [] + self.labels_pro = [] + self.is_logging = False + + def __call__(self, + feats: Union[List, Tuple], + conf_thres: float, + num_labels: int = 80, + **kwargs) -> Tuple: + if not self.is_logging: + print('Only support decode in batch==1') + self.is_logging = True + self.boxes_pro.clear() + self.scores_pro.clear() + self.labels_pro.clear() + + if self.model_only: + # transpose channel to last dim for easy decoding + feats = [ + np.ascontiguousarray(feat[0].transpose(1, 2, 0)) + for feat in feats + ] + else: + # ax620a horizonX3 transpose channel to last dim by default + feats = [np.ascontiguousarray(feat) for feat in feats] + if self.model_type == ModelType.YOLOV5: + self.__yolov5_decode(feats, conf_thres, num_labels, **kwargs) + elif self.model_type == ModelType.YOLOX: + self.__yolox_decode(feats, conf_thres, num_labels, **kwargs) + elif self.model_type in (ModelType.PPYOLOE, ModelType.PPYOLOEP): + self.__ppyoloe_decode(feats, conf_thres, num_labels, **kwargs) + elif self.model_type == ModelType.YOLOV6: + self.__yolov6_decode(feats, conf_thres, num_labels, **kwargs) + elif self.model_type == ModelType.YOLOV7: + self.__yolov7_decode(feats, conf_thres, num_labels, **kwargs) + elif self.model_type == ModelType.RTMDET: + self.__rtmdet_decode(feats, conf_thres, num_labels, **kwargs) + elif self.model_type == ModelType.YOLOV8: + self.__yolov8_decode(feats, conf_thres, num_labels, **kwargs) + else: + raise NotImplementedError + return self.boxes_pro, self.scores_pro, self.labels_pro + + def __yolov5_decode(self, + feats: List[ndarray], + conf_thres: float, + num_labels: int = 80, + **kwargs): + anchors: Union[List, Tuple] = kwargs.get( + 'anchors', + [[(10, 13), (16, 30), + (33, 23)], [(30, 61), (62, 45), + (59, 119)], [(116, 90), (156, 198), (373, 326)]]) + for i, feat in enumerate(feats): + stride = 8 << i + feat_h, feat_w, _ = feat.shape + anchor = anchors[i] + feat = sigmoid(feat) + feat = feat.reshape((feat_h, feat_w, len(anchor), -1)) + box_feat, conf_feat, score_feat = np.split(feat, [4, 5], -1) + + hIdx, wIdx, aIdx, _ = np.where(conf_feat > conf_thres) + + num_proposal = hIdx.size + if not num_proposal: + continue + + score_feat = score_feat[hIdx, wIdx, aIdx] * conf_feat[hIdx, wIdx, + aIdx] + boxes = box_feat[hIdx, wIdx, aIdx] + labels = score_feat.argmax(-1) + scores = score_feat.max(-1) + + indices = np.where(scores > conf_thres)[0] + if len(indices) == 0: + continue + + for idx in indices: + a_w, a_h = anchor[aIdx[idx]] + x, y, w, h = boxes[idx] + x = (x * 2.0 - 0.5 + wIdx[idx]) * stride + y = (y * 2.0 - 0.5 + hIdx[idx]) * stride + w = (w * 2.0)**2 * a_w + h = (h * 2.0)**2 * a_h + + x0 = x - w / 2 + y0 = y - h / 2 + + self.scores_pro.append(float(scores[idx])) + self.boxes_pro.append( + np.array([x0, y0, w, h], dtype=np.float32)) + self.labels_pro.append(int(labels[idx])) + + def __yolox_decode(self, + feats: List[ndarray], + conf_thres: float, + num_labels: int = 80, + **kwargs): + for i, feat in enumerate(feats): + stride = 8 << i + score_feat, box_feat, conf_feat = np.split( + feat, [num_labels, num_labels + 4], -1) + conf_feat = sigmoid(conf_feat) + + hIdx, wIdx, _ = np.where(conf_feat > conf_thres) + + num_proposal = hIdx.size + if not num_proposal: + continue + + score_feat = sigmoid(score_feat[hIdx, wIdx]) * conf_feat[hIdx, + wIdx] + boxes = box_feat[hIdx, wIdx] + labels 
= score_feat.argmax(-1) + scores = score_feat.max(-1) + indices = np.where(scores > conf_thres)[0] + + if len(indices) == 0: + continue + + for idx in indices: + score = scores[idx] + label = labels[idx] + + x, y, w, h = boxes[idx] + + x = (x + wIdx[idx]) * stride + y = (y + hIdx[idx]) * stride + w = np.exp(w) * stride + h = np.exp(h) * stride + + x0 = x - w / 2 + y0 = y - h / 2 + + self.scores_pro.append(float(score)) + self.boxes_pro.append( + np.array([x0, y0, w, h], dtype=np.float32)) + self.labels_pro.append(int(label)) + + def __ppyoloe_decode(self, + feats: List[ndarray], + conf_thres: float, + num_labels: int = 80, + **kwargs): + reg_max: int = kwargs.get('reg_max', 17) + dfl = np.arange(0, reg_max, dtype=np.float32) + for i, feat in enumerate(feats): + stride = 8 << i + score_feat, box_feat = np.split(feat, [ + num_labels, + ], -1) + score_feat = sigmoid(score_feat) + _argmax = score_feat.argmax(-1) + _max = score_feat.max(-1) + indices = np.where(_max > conf_thres) + hIdx, wIdx = indices + num_proposal = hIdx.size + if not num_proposal: + continue + + scores = _max[hIdx, wIdx] + boxes = box_feat[hIdx, wIdx].reshape(num_proposal, 4, reg_max) + boxes = softmax(boxes, -1) @ dfl + labels = _argmax[hIdx, wIdx] + + for k in range(num_proposal): + score = scores[k] + label = labels[k] + + x0, y0, x1, y1 = boxes[k] + + x0 = (wIdx[k] + 0.5 - x0) * stride + y0 = (hIdx[k] + 0.5 - y0) * stride + x1 = (wIdx[k] + 0.5 + x1) * stride + y1 = (hIdx[k] + 0.5 + y1) * stride + + w = x1 - x0 + h = y1 - y0 + + self.scores_pro.append(float(score)) + self.boxes_pro.append( + np.array([x0, y0, w, h], dtype=np.float32)) + self.labels_pro.append(int(label)) + + def __yolov6_decode(self, + feats: List[ndarray], + conf_thres: float, + num_labels: int = 80, + **kwargs): + for i, feat in enumerate(feats): + stride = 8 << i + score_feat, box_feat = np.split(feat, [ + num_labels, + ], -1) + score_feat = sigmoid(score_feat) + _argmax = score_feat.argmax(-1) + _max = score_feat.max(-1) + indices = np.where(_max > conf_thres) + hIdx, wIdx = indices + num_proposal = hIdx.size + if not num_proposal: + continue + + scores = _max[hIdx, wIdx] + boxes = box_feat[hIdx, wIdx] + labels = _argmax[hIdx, wIdx] + + for k in range(num_proposal): + score = scores[k] + label = labels[k] + + x0, y0, x1, y1 = boxes[k] + + x0 = (wIdx[k] + 0.5 - x0) * stride + y0 = (hIdx[k] + 0.5 - y0) * stride + x1 = (wIdx[k] + 0.5 + x1) * stride + y1 = (hIdx[k] + 0.5 + y1) * stride + + w = x1 - x0 + h = y1 - y0 + + self.scores_pro.append(float(score)) + self.boxes_pro.append( + np.array([x0, y0, w, h], dtype=np.float32)) + self.labels_pro.append(int(label)) + + def __yolov7_decode(self, + feats: List[ndarray], + conf_thres: float, + num_labels: int = 80, + **kwargs): + anchors: Union[List, Tuple] = kwargs.get( + 'anchors', + [[(12, 16), (19, 36), + (40, 28)], [(36, 75), (76, 55), + (72, 146)], [(142, 110), (192, 243), (459, 401)]]) + self.__yolov5_decode(feats, conf_thres, num_labels, anchors=anchors) + + def __rtmdet_decode(self, + feats: List[ndarray], + conf_thres: float, + num_labels: int = 80, + **kwargs): + for i, feat in enumerate(feats): + stride = 8 << i + score_feat, box_feat = np.split(feat, [ + num_labels, + ], -1) + score_feat = sigmoid(score_feat) + _argmax = score_feat.argmax(-1) + _max = score_feat.max(-1) + indices = np.where(_max > conf_thres) + hIdx, wIdx = indices + num_proposal = hIdx.size + if not num_proposal: + continue + + scores = _max[hIdx, wIdx] + boxes = box_feat[hIdx, wIdx] + labels = _argmax[hIdx, wIdx] + + for k in 
range(num_proposal): + score = scores[k] + label = labels[k] + + x0, y0, x1, y1 = boxes[k] + + x0 = (wIdx[k] - x0) * stride + y0 = (hIdx[k] - y0) * stride + x1 = (wIdx[k] + x1) * stride + y1 = (hIdx[k] + y1) * stride + + w = x1 - x0 + h = y1 - y0 + + self.scores_pro.append(float(score)) + self.boxes_pro.append( + np.array([x0, y0, w, h], dtype=np.float32)) + self.labels_pro.append(int(label)) + + def __yolov8_decode(self, + feats: List[ndarray], + conf_thres: float, + num_labels: int = 80, + **kwargs): + reg_max: int = kwargs.get('reg_max', 16) + self.__ppyoloe_decode(feats, conf_thres, num_labels, reg_max=reg_max) diff --git a/yolo_world/easydeploy/examples/preprocess.py b/yolo_world/easydeploy/examples/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..6b6fb563a16a7f40ef556b5a23f635ab4627fc4f --- /dev/null +++ b/yolo_world/easydeploy/examples/preprocess.py @@ -0,0 +1,57 @@ +from typing import List, Tuple, Union + +import cv2 +import numpy as np +from config import ModelType +from numpy import ndarray + + +class Preprocess: + + def __init__(self, model_type: ModelType): + if model_type in (ModelType.YOLOV5, ModelType.YOLOV6, ModelType.YOLOV7, + ModelType.YOLOV8): + mean = np.array([0, 0, 0], dtype=np.float32) + std = np.array([255, 255, 255], dtype=np.float32) + is_rgb = True + elif model_type == ModelType.YOLOX: + mean = np.array([0, 0, 0], dtype=np.float32) + std = np.array([1, 1, 1], dtype=np.float32) + is_rgb = False + elif model_type == ModelType.PPYOLOE: + mean = np.array([123.675, 116.28, 103.53], dtype=np.float32) + std = np.array([58.395, 57.12, 57.375], dtype=np.float32) + is_rgb = True + + elif model_type == ModelType.PPYOLOEP: + mean = np.array([0, 0, 0], dtype=np.float32) + std = np.array([255, 255, 255], dtype=np.float32) + is_rgb = True + elif model_type == ModelType.RTMDET: + mean = np.array([103.53, 116.28, 123.675], dtype=np.float32) + std = np.array([57.375, 57.12, 58.3955], dtype=np.float32) + is_rgb = False + else: + raise NotImplementedError + + self.mean = mean.reshape((3, 1, 1)) + self.std = std.reshape((3, 1, 1)) + self.is_rgb = is_rgb + + def __call__(self, + image: ndarray, + new_size: Union[List[int], Tuple[int]] = (640, 640), + **kwargs) -> Tuple[ndarray, Tuple[float, float]]: + # new_size: (height, width) + height, width = image.shape[:2] + ratio_h, ratio_w = new_size[0] / height, new_size[1] / width + image = cv2.resize( + image, (0, 0), + fx=ratio_w, + fy=ratio_h, + interpolation=cv2.INTER_LINEAR) + image = np.ascontiguousarray(image.transpose(2, 0, 1)) + image = image.astype(np.float32) + image -= self.mean + image /= self.std + return image[np.newaxis], (ratio_w, ratio_h) diff --git a/yolo_world/easydeploy/examples/requirements.txt b/yolo_world/easydeploy/examples/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..0b761189b52fc57e4231b37df0ff42bb44404c95 --- /dev/null +++ b/yolo_world/easydeploy/examples/requirements.txt @@ -0,0 +1,2 @@ +onnxruntime +opencv-python==4.7.0.72 diff --git a/yolo_world/easydeploy/model/__init__.py b/yolo_world/easydeploy/model/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..38af8bc322b0a8e0c870fac243a0af9c1dba7315 --- /dev/null +++ b/yolo_world/easydeploy/model/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
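+# Public API of the easydeploy model package: DeployModel wraps a trained
+# detector for export, ORTWrapper / TRTWrapper run the exported ONNX / TensorRT
+# files, and MMYOLOBackend enumerates the supported target backends.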
+from .backend import MMYOLOBackend +from .backendwrapper import ORTWrapper, TRTWrapper +from .model import DeployModel + +__all__ = ['DeployModel', 'TRTWrapper', 'ORTWrapper', 'MMYOLOBackend'] diff --git a/yolo_world/easydeploy/model/backend.py b/yolo_world/easydeploy/model/backend.py new file mode 100644 index 0000000000000000000000000000000000000000..64d6e3f020bcfd3c3cf7db5f5611a8f815df4cb1 --- /dev/null +++ b/yolo_world/easydeploy/model/backend.py @@ -0,0 +1,23 @@ +from enum import Enum + +import torch +import torch.nn.functional as F + + +class MMYOLOBackend(Enum): + AX620A = 'ax620a' + COREML = 'coreml' + HORIZONX3 = 'horizonx3' + NCNN = 'ncnn' + ONNXRUNTIME = 'onnxruntime' + OPENVINO = 'openvino' + PPLNN = 'pplnn' + RKNN = 'rknn' + TENSORRT8 = 'tensorrt8' + TENSORRT7 = 'tensorrt7' + TORCHSCRIPT = 'torchscript' + TVM = 'tvm' + + +def HSigmoid__forward(self, x: torch.Tensor) -> torch.Tensor: + return F.hardsigmoid(x, inplace=True) diff --git a/yolo_world/easydeploy/model/backendwrapper.py b/yolo_world/easydeploy/model/backendwrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..2997d84ea98b3f30973cf2335ab0eb4af4edaef5 --- /dev/null +++ b/yolo_world/easydeploy/model/backendwrapper.py @@ -0,0 +1,202 @@ +import warnings +from collections import namedtuple +from functools import partial +from pathlib import Path +from typing import List, Optional, Union + +import numpy as np +import onnxruntime + +try: + import tensorrt as trt +except Exception: + trt = None +import torch + +warnings.filterwarnings(action='ignore', category=DeprecationWarning) + + +class TRTWrapper(torch.nn.Module): + dtype_mapping = {} + + def __init__(self, weight: Union[str, Path], + device: Optional[torch.device]): + super().__init__() + weight = Path(weight) if isinstance(weight, str) else weight + assert weight.exists() and weight.suffix in ('.engine', '.plan') + if isinstance(device, str): + device = torch.device(device) + elif isinstance(device, int): + device = torch.device(f'cuda:{device}') + self.weight = weight + self.device = device + self.stream = torch.cuda.Stream(device=device) + self.__update_mapping() + self.__init_engine() + self.__init_bindings() + + def __update_mapping(self): + self.dtype_mapping.update({ + trt.bool: torch.bool, + trt.int8: torch.int8, + trt.int32: torch.int32, + trt.float16: torch.float16, + trt.float32: torch.float32 + }) + + def __init_engine(self): + logger = trt.Logger(trt.Logger.ERROR) + self.log = partial(logger.log, trt.Logger.ERROR) + trt.init_libnvinfer_plugins(logger, namespace='') + self.logger = logger + with trt.Runtime(logger) as runtime: + model = runtime.deserialize_cuda_engine(self.weight.read_bytes()) + + context = model.create_execution_context() + + names = [model.get_binding_name(i) for i in range(model.num_bindings)] + + num_inputs, num_outputs = 0, 0 + + for i in range(model.num_bindings): + if model.binding_is_input(i): + num_inputs += 1 + else: + num_outputs += 1 + + self.is_dynamic = -1 in model.get_binding_shape(0) + + self.model = model + self.context = context + self.input_names = names[:num_inputs] + self.output_names = names[num_inputs:] + self.num_inputs = num_inputs + self.num_outputs = num_outputs + self.num_bindings = num_inputs + num_outputs + self.bindings: List[int] = [0] * self.num_bindings + + def __init_bindings(self): + Binding = namedtuple('Binding', ('name', 'dtype', 'shape')) + inputs_info = [] + outputs_info = [] + + for i, name in enumerate(self.input_names): + assert self.model.get_binding_name(i) == name + 
dtype = self.dtype_mapping[self.model.get_binding_dtype(i)] + shape = tuple(self.model.get_binding_shape(i)) + inputs_info.append(Binding(name, dtype, shape)) + + for i, name in enumerate(self.output_names): + i += self.num_inputs + assert self.model.get_binding_name(i) == name + dtype = self.dtype_mapping[self.model.get_binding_dtype(i)] + shape = tuple(self.model.get_binding_shape(i)) + outputs_info.append(Binding(name, dtype, shape)) + self.inputs_info = inputs_info + self.outputs_info = outputs_info + if not self.is_dynamic: + self.output_tensor = [ + torch.empty(o.shape, dtype=o.dtype, device=self.device) + for o in outputs_info + ] + + def forward(self, *inputs): + + assert len(inputs) == self.num_inputs + + contiguous_inputs: List[torch.Tensor] = [ + i.contiguous() for i in inputs + ] + + for i in range(self.num_inputs): + self.bindings[i] = contiguous_inputs[i].data_ptr() + if self.is_dynamic: + self.context.set_binding_shape( + i, tuple(contiguous_inputs[i].shape)) + + # create output tensors + outputs: List[torch.Tensor] = [] + + for i in range(self.num_outputs): + j = i + self.num_inputs + if self.is_dynamic: + shape = tuple(self.context.get_binding_shape(j)) + output = torch.empty( + size=shape, + dtype=self.output_dtypes[i], + device=self.device) + + else: + output = self.output_tensor[i] + outputs.append(output) + self.bindings[j] = output.data_ptr() + + self.context.execute_async_v2(self.bindings, self.stream.cuda_stream) + self.stream.synchronize() + + return tuple(outputs) + + +class ORTWrapper(torch.nn.Module): + + def __init__(self, weight: Union[str, Path], + device: Optional[torch.device]): + super().__init__() + weight = Path(weight) if isinstance(weight, str) else weight + assert weight.exists() and weight.suffix == '.onnx' + + if isinstance(device, str): + device = torch.device(device) + elif isinstance(device, int): + device = torch.device(f'cuda:{device}') + self.weight = weight + self.device = device + self.__init_session() + self.__init_bindings() + + def __init_session(self): + providers = ['CPUExecutionProvider'] + if 'cuda' in self.device.type: + providers.insert(0, 'CUDAExecutionProvider') + + session = onnxruntime.InferenceSession( + str(self.weight), providers=providers) + self.session = session + + def __init_bindings(self): + Binding = namedtuple('Binding', ('name', 'dtype', 'shape')) + inputs_info = [] + outputs_info = [] + self.is_dynamic = False + for i, tensor in enumerate(self.session.get_inputs()): + if any(not isinstance(i, int) for i in tensor.shape): + self.is_dynamic = True + inputs_info.append( + Binding(tensor.name, tensor.type, tuple(tensor.shape))) + + for i, tensor in enumerate(self.session.get_outputs()): + outputs_info.append( + Binding(tensor.name, tensor.type, tuple(tensor.shape))) + self.inputs_info = inputs_info + self.outputs_info = outputs_info + self.num_inputs = len(inputs_info) + + def forward(self, *inputs): + + assert len(inputs) == self.num_inputs + + contiguous_inputs: List[np.ndarray] = [ + i.contiguous().cpu().numpy() for i in inputs + ] + + if not self.is_dynamic: + # make sure input shape is right for static input shape + for i in range(self.num_inputs): + assert contiguous_inputs[i].shape == self.inputs_info[i].shape + + outputs = self.session.run([o.name for o in self.outputs_info], { + j.name: contiguous_inputs[i] + for i, j in enumerate(self.inputs_info) + }) + + return tuple(torch.from_numpy(o).to(self.device) for o in outputs) diff --git a/yolo_world/easydeploy/model/model.py b/yolo_world/easydeploy/model/model.py 
new file mode 100644 index 0000000000000000000000000000000000000000..c67ed2872097e82d7f569a2f486b1a6463cde986 --- /dev/null +++ b/yolo_world/easydeploy/model/model.py @@ -0,0 +1,205 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from copy import deepcopy +from functools import partial +from typing import List, Optional, Tuple + +import torch +import torch.nn as nn +from mmdet.models.backbones.csp_darknet import Focus +from mmdet.models.layers import ChannelAttention +from mmengine.config import ConfigDict +from torch import Tensor + +from mmyolo.models import RepVGGBlock +from mmyolo.models.dense_heads import (PPYOLOEHead, RTMDetHead, YOLOv5Head, + YOLOv7Head, YOLOv8Head, YOLOXHead) +from mmyolo.models.layers import ImplicitA, ImplicitM +from ..backbone import DeployFocus, GConvFocus, NcnnFocus +from ..bbox_code import (rtmdet_bbox_decoder, yolov5_bbox_decoder, + yolox_bbox_decoder) +from ..nms import batched_nms, efficient_nms, onnx_nms +from .backend import MMYOLOBackend + + +class DeployModel(nn.Module): + transpose = False + + def __init__(self, + baseModel: nn.Module, + backend: MMYOLOBackend, + postprocess_cfg: Optional[ConfigDict] = None): + super().__init__() + self.baseModel = baseModel + self.baseHead = baseModel.bbox_head + self.backend = backend + if postprocess_cfg is None: + self.with_postprocess = False + else: + self.with_postprocess = True + self.__init_sub_attributes() + self.detector_type = type(self.baseHead) + self.pre_top_k = postprocess_cfg.get('pre_top_k', 1000) + self.keep_top_k = postprocess_cfg.get('keep_top_k', 100) + self.iou_threshold = postprocess_cfg.get('iou_threshold', 0.65) + self.score_threshold = postprocess_cfg.get('score_threshold', 0.25) + self.__switch_deploy() + + def __init_sub_attributes(self): + self.bbox_decoder = self.baseHead.bbox_coder.decode + self.prior_generate = self.baseHead.prior_generator.grid_priors + self.num_base_priors = self.baseHead.num_base_priors + self.featmap_strides = self.baseHead.featmap_strides + self.num_classes = self.baseHead.num_classes + + def __switch_deploy(self): + headType = type(self.baseHead) + if not self.with_postprocess: + if headType in (YOLOv5Head, YOLOv7Head): + self.baseHead.head_module.forward_single = self.forward_single + elif headType in (PPYOLOEHead, YOLOv8Head): + self.baseHead.head_module.reg_max = 0 + + if self.backend in (MMYOLOBackend.HORIZONX3, MMYOLOBackend.NCNN, + MMYOLOBackend.TORCHSCRIPT): + self.transpose = True + for layer in self.baseModel.modules(): + if isinstance(layer, RepVGGBlock): + layer.switch_to_deploy() + elif isinstance(layer, ChannelAttention): + layer.global_avgpool.forward = self.forward_gvp + elif isinstance(layer, Focus): + # onnxruntime openvino tensorrt8 tensorrt7 + if self.backend in (MMYOLOBackend.ONNXRUNTIME, + MMYOLOBackend.OPENVINO, + MMYOLOBackend.TENSORRT8, + MMYOLOBackend.TENSORRT7): + self.baseModel.backbone.stem = DeployFocus(layer) + # ncnn + elif self.backend == MMYOLOBackend.NCNN: + self.baseModel.backbone.stem = NcnnFocus(layer) + # switch focus to group conv + else: + self.baseModel.backbone.stem = GConvFocus(layer) + + def pred_by_feat(self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + objectnesses: Optional[List[Tensor]] = None, + **kwargs): + assert len(cls_scores) == len(bbox_preds) + dtype = cls_scores[0].dtype + device = cls_scores[0].device + + nms_func = self.select_nms() + if self.detector_type in (YOLOv5Head, YOLOv7Head): + bbox_decoder = yolov5_bbox_decoder + elif self.detector_type is RTMDetHead: + bbox_decoder = 
rtmdet_bbox_decoder + elif self.detector_type is YOLOXHead: + bbox_decoder = yolox_bbox_decoder + else: + bbox_decoder = self.bbox_decoder + + num_imgs = cls_scores[0].shape[0] + featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores] + + mlvl_priors = self.prior_generate( + featmap_sizes, dtype=dtype, device=device) + + flatten_priors = torch.cat(mlvl_priors) + + mlvl_strides = [ + flatten_priors.new_full( + (featmap_size[0] * featmap_size[1] * self.num_base_priors, ), + stride) for featmap_size, stride in zip( + featmap_sizes, self.featmap_strides) + ] + flatten_stride = torch.cat(mlvl_strides) + + # flatten cls_scores, bbox_preds and objectness + flatten_cls_scores = [ + cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.num_classes) + for cls_score in cls_scores + ] + cls_scores = torch.cat(flatten_cls_scores, dim=1).sigmoid() + + flatten_bbox_preds = [ + bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) + for bbox_pred in bbox_preds + ] + flatten_bbox_preds = torch.cat(flatten_bbox_preds, dim=1) + + if objectnesses is not None: + flatten_objectness = [ + objectness.permute(0, 2, 3, 1).reshape(num_imgs, -1) + for objectness in objectnesses + ] + flatten_objectness = torch.cat(flatten_objectness, dim=1).sigmoid() + cls_scores = cls_scores * (flatten_objectness.unsqueeze(-1)) + + scores = cls_scores + + bboxes = bbox_decoder(flatten_priors[None], flatten_bbox_preds, + flatten_stride) + + return nms_func(bboxes, scores, self.keep_top_k, self.iou_threshold, + self.score_threshold, self.pre_top_k, self.keep_top_k) + + def select_nms(self): + if self.backend in (MMYOLOBackend.ONNXRUNTIME, MMYOLOBackend.OPENVINO): + nms_func = onnx_nms + elif self.backend == MMYOLOBackend.TENSORRT8: + nms_func = efficient_nms + elif self.backend == MMYOLOBackend.TENSORRT7: + nms_func = batched_nms + else: + raise NotImplementedError + if type(self.baseHead) in (YOLOv5Head, YOLOv7Head, YOLOXHead): + nms_func = partial(nms_func, box_coding=1) + + return nms_func + + def forward(self, inputs: Tensor): + neck_outputs = self.baseModel(inputs) + if self.with_postprocess: + return self.pred_by_feat(*neck_outputs) + else: + outputs = [] + if self.transpose: + for feats in zip(*neck_outputs): + if self.backend in (MMYOLOBackend.NCNN, + MMYOLOBackend.TORCHSCRIPT): + outputs.append( + torch.cat( + [feat.permute(0, 2, 3, 1) for feat in feats], + -1)) + else: + outputs.append(torch.cat(feats, 1).permute(0, 2, 3, 1)) + else: + for feats in zip(*neck_outputs): + outputs.append(torch.cat(feats, 1)) + return tuple(outputs) + + @staticmethod + def forward_single(x: Tensor, convs: nn.Module) -> Tuple[Tensor]: + if isinstance(convs, nn.Sequential) and any( + type(m) in (ImplicitA, ImplicitM) for m in convs): + a, c, m = convs + aw = a.implicit.clone() + mw = m.implicit.clone() + c = deepcopy(c) + nw, cw, _, _ = c.weight.shape + na, ca, _, _ = aw.shape + nm, cm, _, _ = mw.shape + c.bias = nn.Parameter(c.bias + ( + c.weight.reshape(nw, cw) @ aw.reshape(ca, na)).squeeze(1)) + c.bias = nn.Parameter(c.bias * mw.reshape(cm)) + c.weight = nn.Parameter(c.weight * mw.transpose(0, 1)) + convs = c + feat = convs(x) + return (feat, ) + + @staticmethod + def forward_gvp(x: Tensor) -> Tensor: + return torch.mean(x, [2, 3], keepdim=True) diff --git a/yolo_world/easydeploy/nms/__init__.py b/yolo_world/easydeploy/nms/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..59c5cdbd2b3b195125a14f473b825f616755fd6e --- /dev/null +++ b/yolo_world/easydeploy/nms/__init__.py @@ -0,0 +1,5 @@ +# Copyright 
(c) OpenMMLab. All rights reserved. +from .ort_nms import onnx_nms +from .trt_nms import batched_nms, efficient_nms + +__all__ = ['efficient_nms', 'batched_nms', 'onnx_nms'] diff --git a/yolo_world/easydeploy/nms/ort_nms.py b/yolo_world/easydeploy/nms/ort_nms.py new file mode 100644 index 0000000000000000000000000000000000000000..aad93cf05ac2ee9d61a85b4bf9e7b63c352859ec --- /dev/null +++ b/yolo_world/easydeploy/nms/ort_nms.py @@ -0,0 +1,122 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from torch import Tensor + +_XYWH2XYXY = torch.tensor([[1.0, 0.0, 1.0, 0.0], [0.0, 1.0, 0.0, 1.0], + [-0.5, 0.0, 0.5, 0.0], [0.0, -0.5, 0.0, 0.5]], + dtype=torch.float32) + + +def select_nms_index(scores: Tensor, + boxes: Tensor, + nms_index: Tensor, + batch_size: int, + keep_top_k: int = -1): + batch_inds, cls_inds = nms_index[:, 0], nms_index[:, 1] + box_inds = nms_index[:, 2] + + scores = scores[batch_inds, cls_inds, box_inds].unsqueeze(1) + boxes = boxes[batch_inds, box_inds, ...] + dets = torch.cat([boxes, scores], dim=1) + + batched_dets = dets.unsqueeze(0).repeat(batch_size, 1, 1) + batch_template = torch.arange( + 0, batch_size, dtype=batch_inds.dtype, device=batch_inds.device) + batched_dets = batched_dets.where( + (batch_inds == batch_template.unsqueeze(1)).unsqueeze(-1), + batched_dets.new_zeros(1)) + + batched_labels = cls_inds.unsqueeze(0).repeat(batch_size, 1) + batched_labels = batched_labels.where( + (batch_inds == batch_template.unsqueeze(1)), + batched_labels.new_ones(1) * -1) + + N = batched_dets.shape[0] + + batched_dets = torch.cat((batched_dets, batched_dets.new_zeros((N, 1, 5))), + 1) + batched_labels = torch.cat((batched_labels, -batched_labels.new_ones( + (N, 1))), 1) + + _, topk_inds = batched_dets[:, :, -1].sort(dim=1, descending=True) + topk_batch_inds = torch.arange( + batch_size, dtype=topk_inds.dtype, + device=topk_inds.device).view(-1, 1) + batched_dets = batched_dets[topk_batch_inds, topk_inds, ...] + batched_labels = batched_labels[topk_batch_inds, topk_inds, ...] 
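+    # the rows appended above are zero-score padding; after the descending sort
+    # they sink to the end, and num_dets below counts only entries with score > 0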
+ batched_dets, batched_scores = batched_dets.split([4, 1], 2) + batched_scores = batched_scores.squeeze(-1) + + num_dets = (batched_scores > 0).sum(1, keepdim=True) + return num_dets, batched_dets, batched_scores, batched_labels + + +class ONNXNMSop(torch.autograd.Function): + + @staticmethod + def forward( + ctx, + boxes: Tensor, + scores: Tensor, + max_output_boxes_per_class: Tensor = torch.tensor([100]), + iou_threshold: Tensor = torch.tensor([0.5]), + score_threshold: Tensor = torch.tensor([0.05]) + ) -> Tensor: + device = boxes.device + batch = scores.shape[0] + num_det = 20 + batches = torch.randint(0, batch, (num_det, )).sort()[0].to(device) + idxs = torch.arange(100, 100 + num_det).to(device) + zeros = torch.zeros((num_det, ), dtype=torch.int64).to(device) + selected_indices = torch.cat([batches[None], zeros[None], idxs[None]], + 0).T.contiguous() + selected_indices = selected_indices.to(torch.int64) + + return selected_indices + + @staticmethod + def symbolic( + g, + boxes: Tensor, + scores: Tensor, + max_output_boxes_per_class: Tensor = torch.tensor([100]), + iou_threshold: Tensor = torch.tensor([0.5]), + score_threshold: Tensor = torch.tensor([0.05]), + ): + return g.op( + 'NonMaxSuppression', + boxes, + scores, + max_output_boxes_per_class, + iou_threshold, + score_threshold, + outputs=1) + + +def onnx_nms( + boxes: torch.Tensor, + scores: torch.Tensor, + max_output_boxes_per_class: int = 100, + iou_threshold: float = 0.5, + score_threshold: float = 0.05, + pre_top_k: int = -1, + keep_top_k: int = 100, + box_coding: int = 0, +): + max_output_boxes_per_class = torch.tensor([max_output_boxes_per_class]) + iou_threshold = torch.tensor([iou_threshold]) + score_threshold = torch.tensor([score_threshold]) + + batch_size, _, _ = scores.shape + if box_coding == 1: + boxes = boxes @ (_XYWH2XYXY.to(boxes.device)) + scores = scores.transpose(1, 2).contiguous() + selected_indices = ONNXNMSop.apply(boxes, scores, + max_output_boxes_per_class, + iou_threshold, score_threshold) + + num_dets, batched_dets, batched_scores, batched_labels = select_nms_index( + scores, boxes, selected_indices, batch_size, keep_top_k=keep_top_k) + + return num_dets, batched_dets, batched_scores, batched_labels.to( + torch.int32) diff --git a/yolo_world/easydeploy/nms/trt_nms.py b/yolo_world/easydeploy/nms/trt_nms.py new file mode 100644 index 0000000000000000000000000000000000000000..e0db1e2164d4366ff9ce4f74d39ded917c39ba79 --- /dev/null +++ b/yolo_world/easydeploy/nms/trt_nms.py @@ -0,0 +1,226 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
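+# ONNX-exportable wrappers around the TensorRT NMS plugins. The forward()
+# methods only return dummy tensors with the correct shapes and dtypes so that
+# tracing succeeds; the symbolic() methods emit TRT::EfficientNMS_TRT /
+# TRT::BatchedNMSDynamic_TRT nodes that TensorRT replaces with its plugin
+# implementations when the engine is built.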
+import torch +from torch import Tensor + +_XYWH2XYXY = torch.tensor([[1.0, 0.0, 1.0, 0.0], [0.0, 1.0, 0.0, 1.0], + [-0.5, 0.0, 0.5, 0.0], [0.0, -0.5, 0.0, 0.5]], + dtype=torch.float32) + + +class TRTEfficientNMSop(torch.autograd.Function): + + @staticmethod + def forward( + ctx, + boxes: Tensor, + scores: Tensor, + background_class: int = -1, + box_coding: int = 0, + iou_threshold: float = 0.45, + max_output_boxes: int = 100, + plugin_version: str = '1', + score_activation: int = 0, + score_threshold: float = 0.25, + ): + batch_size, _, num_classes = scores.shape + num_det = torch.randint( + 0, max_output_boxes, (batch_size, 1), dtype=torch.int32) + det_boxes = torch.randn(batch_size, max_output_boxes, 4) + det_scores = torch.randn(batch_size, max_output_boxes) + det_classes = torch.randint( + 0, num_classes, (batch_size, max_output_boxes), dtype=torch.int32) + return num_det, det_boxes, det_scores, det_classes + + @staticmethod + def symbolic(g, + boxes: Tensor, + scores: Tensor, + background_class: int = -1, + box_coding: int = 0, + iou_threshold: float = 0.45, + max_output_boxes: int = 100, + plugin_version: str = '1', + score_activation: int = 0, + score_threshold: float = 0.25): + out = g.op( + 'TRT::EfficientNMS_TRT', + boxes, + scores, + background_class_i=background_class, + box_coding_i=box_coding, + iou_threshold_f=iou_threshold, + max_output_boxes_i=max_output_boxes, + plugin_version_s=plugin_version, + score_activation_i=score_activation, + score_threshold_f=score_threshold, + outputs=4) + num_det, det_boxes, det_scores, det_classes = out + return num_det, det_boxes, det_scores, det_classes + + +class TRTbatchedNMSop(torch.autograd.Function): + """TensorRT NMS operation.""" + + @staticmethod + def forward( + ctx, + boxes: Tensor, + scores: Tensor, + plugin_version: str = '1', + shareLocation: int = 1, + backgroundLabelId: int = -1, + numClasses: int = 80, + topK: int = 1000, + keepTopK: int = 100, + scoreThreshold: float = 0.25, + iouThreshold: float = 0.45, + isNormalized: int = 0, + clipBoxes: int = 0, + scoreBits: int = 16, + caffeSemantics: int = 1, + ): + batch_size, _, numClasses = scores.shape + num_det = torch.randint( + 0, keepTopK, (batch_size, 1), dtype=torch.int32) + det_boxes = torch.randn(batch_size, keepTopK, 4) + det_scores = torch.randn(batch_size, keepTopK) + det_classes = torch.randint(0, numClasses, + (batch_size, keepTopK)).float() + return num_det, det_boxes, det_scores, det_classes + + @staticmethod + def symbolic( + g, + boxes: Tensor, + scores: Tensor, + plugin_version: str = '1', + shareLocation: int = 1, + backgroundLabelId: int = -1, + numClasses: int = 80, + topK: int = 1000, + keepTopK: int = 100, + scoreThreshold: float = 0.25, + iouThreshold: float = 0.45, + isNormalized: int = 0, + clipBoxes: int = 0, + scoreBits: int = 16, + caffeSemantics: int = 1, + ): + out = g.op( + 'TRT::BatchedNMSDynamic_TRT', + boxes, + scores, + shareLocation_i=shareLocation, + plugin_version_s=plugin_version, + backgroundLabelId_i=backgroundLabelId, + numClasses_i=numClasses, + topK_i=topK, + keepTopK_i=keepTopK, + scoreThreshold_f=scoreThreshold, + iouThreshold_f=iouThreshold, + isNormalized_i=isNormalized, + clipBoxes_i=clipBoxes, + scoreBits_i=scoreBits, + caffeSemantics_i=caffeSemantics, + outputs=4) + num_det, det_boxes, det_scores, det_classes = out + return num_det, det_boxes, det_scores, det_classes + + +def _efficient_nms( + boxes: Tensor, + scores: Tensor, + max_output_boxes_per_class: int = 1000, + iou_threshold: float = 0.5, + score_threshold: float = 
0.05, + pre_top_k: int = -1, + keep_top_k: int = 100, + box_coding: int = 0, +): + """Wrapper for `efficient_nms` with TensorRT. + Args: + boxes (Tensor): The bounding boxes of shape [N, num_boxes, 4]. + scores (Tensor): The detection scores of shape + [N, num_boxes, num_classes]. + max_output_boxes_per_class (int): Maximum number of output + boxes per class of nms. Defaults to 1000. + iou_threshold (float): IOU threshold of nms. Defaults to 0.5. + score_threshold (float): score threshold of nms. + Defaults to 0.05. + pre_top_k (int): Number of top K boxes to keep before nms. + Defaults to -1. + keep_top_k (int): Number of top K boxes to keep after nms. + Defaults to -1. + box_coding (int): Bounding boxes format for nms. + Defaults to 0 means [x1, y1 ,x2, y2]. + Set to 1 means [x, y, w, h]. + Returns: + tuple[Tensor, Tensor, Tensor, Tensor]: + (num_det, det_boxes, det_scores, det_classes), + `num_det` of shape [N, 1] + `det_boxes` of shape [N, num_det, 4] + `det_scores` of shape [N, num_det] + `det_classes` of shape [N, num_det] + """ + num_det, det_boxes, det_scores, det_classes = TRTEfficientNMSop.apply( + boxes, scores, -1, box_coding, iou_threshold, keep_top_k, '1', 0, + score_threshold) + return num_det, det_boxes, det_scores, det_classes + + +def _batched_nms( + boxes: Tensor, + scores: Tensor, + max_output_boxes_per_class: int = 1000, + iou_threshold: float = 0.5, + score_threshold: float = 0.05, + pre_top_k: int = -1, + keep_top_k: int = 100, + box_coding: int = 0, +): + """Wrapper for `efficient_nms` with TensorRT. + Args: + boxes (Tensor): The bounding boxes of shape [N, num_boxes, 4]. + scores (Tensor): The detection scores of shape + [N, num_boxes, num_classes]. + max_output_boxes_per_class (int): Maximum number of output + boxes per class of nms. Defaults to 1000. + iou_threshold (float): IOU threshold of nms. Defaults to 0.5. + score_threshold (float): score threshold of nms. + Defaults to 0.05. + pre_top_k (int): Number of top K boxes to keep before nms. + Defaults to -1. + keep_top_k (int): Number of top K boxes to keep after nms. + Defaults to -1. + box_coding (int): Bounding boxes format for nms. + Defaults to 0 means [x1, y1 ,x2, y2]. + Set to 1 means [x, y, w, h]. 
+ Returns: + tuple[Tensor, Tensor, Tensor, Tensor]: + (num_det, det_boxes, det_scores, det_classes), + `num_det` of shape [N, 1] + `det_boxes` of shape [N, num_det, 4] + `det_scores` of shape [N, num_det] + `det_classes` of shape [N, num_det] + """ + if box_coding == 1: + boxes = boxes @ (_XYWH2XYXY.to(boxes.device)) + boxes = boxes if boxes.dim() == 4 else boxes.unsqueeze(2) + _, _, numClasses = scores.shape + + num_det, det_boxes, det_scores, det_classes = TRTbatchedNMSop.apply( + boxes, scores, '1', 1, -1, int(numClasses), min(pre_top_k, 4096), + keep_top_k, score_threshold, iou_threshold, 0, 0, 16, 1) + + det_classes = det_classes.int() + return num_det, det_boxes, det_scores, det_classes + + +def efficient_nms(*args, **kwargs): + """Wrapper function for `_efficient_nms`.""" + return _efficient_nms(*args, **kwargs) + + +def batched_nms(*args, **kwargs): + """Wrapper function for `_batched_nms`.""" + return _batched_nms(*args, **kwargs) diff --git a/yolo_world/easydeploy/tools/build_engine.py b/yolo_world/easydeploy/tools/build_engine.py new file mode 100644 index 0000000000000000000000000000000000000000..b400c9db826878a7bb0fb13f4b1dea9b793583e7 --- /dev/null +++ b/yolo_world/easydeploy/tools/build_engine.py @@ -0,0 +1,136 @@ +import argparse +from pathlib import Path +from typing import List, Optional, Tuple, Union + +try: + import tensorrt as trt +except Exception: + trt = None +import warnings + +import numpy as np +import torch + +warnings.filterwarnings(action='ignore', category=DeprecationWarning) + + +class EngineBuilder: + + def __init__( + self, + checkpoint: Union[str, Path], + opt_shape: Union[Tuple, List] = (1, 3, 640, 640), + device: Optional[Union[str, int, torch.device]] = None) -> None: + checkpoint = Path(checkpoint) if isinstance(checkpoint, + str) else checkpoint + assert checkpoint.exists() and checkpoint.suffix == '.onnx' + if isinstance(device, str): + device = torch.device(device) + elif isinstance(device, int): + device = torch.device(f'cuda:{device}') + + self.checkpoint = checkpoint + self.opt_shape = np.array(opt_shape, dtype=np.float32) + self.device = device + + def __build_engine(self, + scale: Optional[List[List]] = None, + fp16: bool = True, + with_profiling: bool = True) -> None: + logger = trt.Logger(trt.Logger.WARNING) + trt.init_libnvinfer_plugins(logger, namespace='') + builder = trt.Builder(logger) + config = builder.create_builder_config() + config.max_workspace_size = torch.cuda.get_device_properties( + self.device).total_memory + flag = (1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)) + network = builder.create_network(flag) + parser = trt.OnnxParser(network, logger) + if not parser.parse_from_file(str(self.checkpoint)): + raise RuntimeError( + f'failed to load ONNX file: {str(self.checkpoint)}') + inputs = [network.get_input(i) for i in range(network.num_inputs)] + outputs = [network.get_output(i) for i in range(network.num_outputs)] + profile = None + dshape = -1 in network.get_input(0).shape + if dshape: + profile = builder.create_optimization_profile() + if scale is None: + scale = np.array( + [[1, 1, 0.5, 0.5], [1, 1, 1, 1], [4, 1, 1.5, 1.5]], + dtype=np.float32) + scale = (self.opt_shape * scale).astype(np.int32) + elif isinstance(scale, List): + scale = np.array(scale, dtype=np.int32) + assert scale.shape[0] == 3, 'Input a wrong scale list' + else: + raise NotImplementedError + + for inp in inputs: + logger.log( + trt.Logger.WARNING, + f'input "{inp.name}" with shape{inp.shape} {inp.dtype}') + if dshape: + 
profile.set_shape(inp.name, *scale) + for out in outputs: + logger.log( + trt.Logger.WARNING, + f'output "{out.name}" with shape{out.shape} {out.dtype}') + if fp16 and builder.platform_has_fast_fp16: + config.set_flag(trt.BuilderFlag.FP16) + self.weight = self.checkpoint.with_suffix('.engine') + if dshape: + config.add_optimization_profile(profile) + if with_profiling: + config.profiling_verbosity = trt.ProfilingVerbosity.DETAILED + with builder.build_engine(network, config) as engine: + self.weight.write_bytes(engine.serialize()) + logger.log( + trt.Logger.WARNING, f'Build tensorrt engine finish.\n' + f'Save in {str(self.weight.absolute())}') + + def build(self, + scale: Optional[List[List]] = None, + fp16: bool = True, + with_profiling=True): + self.__build_engine(scale, fp16, with_profiling) + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('checkpoint', help='Checkpoint file') + parser.add_argument( + '--img-size', + nargs='+', + type=int, + default=[640, 640], + help='Image size of height and width') + parser.add_argument( + '--device', type=str, default='cuda:0', help='TensorRT builder device') + parser.add_argument( + '--scales', + type=str, + default='[[1,3,640,640],[1,3,640,640],[1,3,640,640]]', + help='Input scales for build dynamic input shape engine') + parser.add_argument( + '--fp16', action='store_true', help='Build model with fp16 mode') + args = parser.parse_args() + args.img_size *= 2 if len(args.img_size) == 1 else 1 + return args + + +def main(args): + img_size = (1, 3, *args.img_size) + try: + scales = eval(args.scales) + except Exception: + print('Input scales is not a python variable') + print('Set scales default None') + scales = None + builder = EngineBuilder(args.checkpoint, img_size, args.device) + builder.build(scales, fp16=args.fp16) + + +if __name__ == '__main__': + args = parse_args() + main(args) diff --git a/yolo_world/easydeploy/tools/export_onnx.py b/yolo_world/easydeploy/tools/export_onnx.py new file mode 100644 index 0000000000000000000000000000000000000000..b937cc8a72b5c09d61580ddb1297213693adaf1c --- /dev/null +++ b/yolo_world/easydeploy/tools/export_onnx.py @@ -0,0 +1,157 @@ +import argparse +import os +import sys +import warnings +from io import BytesIO +from pathlib import Path + +import onnx +import torch +from mmdet.apis import init_detector +from mmengine.config import ConfigDict +from mmengine.logging import print_log +from mmengine.utils.path import mkdir_or_exist + +# Add MMYOLO ROOT to sys.path +sys.path.append(str(Path(__file__).resolve().parents[3])) +from projects.easydeploy.model import DeployModel, MMYOLOBackend # noqa E402 + +warnings.filterwarnings(action='ignore', category=torch.jit.TracerWarning) +warnings.filterwarnings(action='ignore', category=torch.jit.ScriptWarning) +warnings.filterwarnings(action='ignore', category=UserWarning) +warnings.filterwarnings(action='ignore', category=FutureWarning) +warnings.filterwarnings(action='ignore', category=ResourceWarning) + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('config', help='Config file') + parser.add_argument('checkpoint', help='Checkpoint file') + parser.add_argument( + '--model-only', action='store_true', help='Export model only') + parser.add_argument( + '--work-dir', default='./work_dir', help='Path to save export model') + parser.add_argument( + '--img-size', + nargs='+', + type=int, + default=[640, 640], + help='Image size of height and width') + parser.add_argument('--batch-size', type=int, default=1, 
help='Batch size') + parser.add_argument( + '--device', default='cuda:0', help='Device used for inference') + parser.add_argument( + '--simplify', + action='store_true', + help='Simplify onnx model by onnx-sim') + parser.add_argument( + '--opset', type=int, default=11, help='ONNX opset version') + parser.add_argument( + '--backend', + type=str, + default='onnxruntime', + help='Backend for export onnx') + parser.add_argument( + '--pre-topk', + type=int, + default=1000, + help='Postprocess pre topk bboxes feed into NMS') + parser.add_argument( + '--keep-topk', + type=int, + default=100, + help='Postprocess keep topk bboxes out of NMS') + parser.add_argument( + '--iou-threshold', + type=float, + default=0.65, + help='IoU threshold for NMS') + parser.add_argument( + '--score-threshold', + type=float, + default=0.25, + help='Score threshold for NMS') + args = parser.parse_args() + args.img_size *= 2 if len(args.img_size) == 1 else 1 + return args + + +def build_model_from_cfg(config_path, checkpoint_path, device): + model = init_detector(config_path, checkpoint_path, device=device) + model.eval() + return model + + +def main(): + args = parse_args() + mkdir_or_exist(args.work_dir) + backend = MMYOLOBackend(args.backend.lower()) + if backend in (MMYOLOBackend.ONNXRUNTIME, MMYOLOBackend.OPENVINO, + MMYOLOBackend.TENSORRT8, MMYOLOBackend.TENSORRT7): + if not args.model_only: + print_log('Export ONNX with bbox decoder and NMS ...') + else: + args.model_only = True + print_log(f'Can not export postprocess for {args.backend.lower()}.\n' + f'Set "args.model_only=True" default.') + if args.model_only: + postprocess_cfg = None + output_names = None + else: + postprocess_cfg = ConfigDict( + pre_top_k=args.pre_topk, + keep_top_k=args.keep_topk, + iou_threshold=args.iou_threshold, + score_threshold=args.score_threshold) + output_names = ['num_dets', 'boxes', 'scores', 'labels'] + baseModel = build_model_from_cfg(args.config, args.checkpoint, args.device) + + deploy_model = DeployModel( + baseModel=baseModel, backend=backend, postprocess_cfg=postprocess_cfg) + deploy_model.eval() + + fake_input = torch.randn(args.batch_size, 3, + *args.img_size).to(args.device) + # dry run + deploy_model(fake_input) + + save_onnx_path = os.path.join( + args.work_dir, + os.path.basename(args.checkpoint).replace('pth', 'onnx')) + # export onnx + with BytesIO() as f: + torch.onnx.export( + deploy_model, + fake_input, + f, + input_names=['images'], + output_names=output_names, + opset_version=args.opset) + f.seek(0) + onnx_model = onnx.load(f) + onnx.checker.check_model(onnx_model) + + # Fix tensorrt onnx output shape, just for view + if not args.model_only and backend in (MMYOLOBackend.TENSORRT8, + MMYOLOBackend.TENSORRT7): + shapes = [ + args.batch_size, 1, args.batch_size, args.keep_topk, 4, + args.batch_size, args.keep_topk, args.batch_size, + args.keep_topk + ] + for i in onnx_model.graph.output: + for j in i.type.tensor_type.shape.dim: + j.dim_param = str(shapes.pop(0)) + if args.simplify: + try: + import onnxsim + onnx_model, check = onnxsim.simplify(onnx_model) + assert check, 'assert check failed' + except Exception as e: + print_log(f'Simplify failure: {e}') + onnx.save(onnx_model, save_onnx_path) + print_log(f'ONNX export success, save into {save_onnx_path}') + + +if __name__ == '__main__': + main() diff --git a/yolo_world/easydeploy/tools/image-demo.py b/yolo_world/easydeploy/tools/image-demo.py new file mode 100644 index 0000000000000000000000000000000000000000..c85f31a02beeb708e23662fe08dd0a105f112aaf --- 
/dev/null +++ b/yolo_world/easydeploy/tools/image-demo.py @@ -0,0 +1,152 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from projects.easydeploy.model import ORTWrapper, TRTWrapper # isort:skip +import os +import random +from argparse import ArgumentParser + +import cv2 +import mmcv +import numpy as np +import torch +from mmcv.transforms import Compose +from mmdet.utils import get_test_pipeline_cfg +from mmengine.config import Config, ConfigDict +from mmengine.utils import ProgressBar, path + +from mmyolo.utils import register_all_modules +from mmyolo.utils.misc import get_file_list + + +def parse_args(): + parser = ArgumentParser() + parser.add_argument( + 'img', help='Image path, include image file, dir and URL.') + parser.add_argument('config', help='Config file') + parser.add_argument('checkpoint', help='Checkpoint file') + parser.add_argument( + '--out-dir', default='./output', help='Path to output file') + parser.add_argument( + '--device', default='cuda:0', help='Device used for inference') + parser.add_argument( + '--show', action='store_true', help='Show the detection results') + args = parser.parse_args() + return args + + +def preprocess(config): + data_preprocess = config.get('model', {}).get('data_preprocessor', {}) + mean = data_preprocess.get('mean', [0., 0., 0.]) + std = data_preprocess.get('std', [1., 1., 1.]) + mean = torch.tensor(mean, dtype=torch.float32).reshape(1, 3, 1, 1) + std = torch.tensor(std, dtype=torch.float32).reshape(1, 3, 1, 1) + + class PreProcess(torch.nn.Module): + + def __init__(self): + super().__init__() + + def forward(self, x): + x = x[None].float() + x -= mean.to(x.device) + x /= std.to(x.device) + return x + + return PreProcess().eval() + + +def main(): + args = parse_args() + + # register all modules in mmdet into the registries + register_all_modules() + + colors = [[random.randint(0, 255) for _ in range(3)] for _ in range(1000)] + + # build the model from a config file and a checkpoint file + if args.checkpoint.endswith('.onnx'): + model = ORTWrapper(args.checkpoint, args.device) + elif args.checkpoint.endswith('.engine') or args.checkpoint.endswith( + '.plan'): + model = TRTWrapper(args.checkpoint, args.device) + else: + raise NotImplementedError + + model.to(args.device) + + cfg = Config.fromfile(args.config) + class_names = cfg.get('class_name') + + test_pipeline = get_test_pipeline_cfg(cfg) + test_pipeline[0] = ConfigDict({'type': 'mmdet.LoadImageFromNDArray'}) + test_pipeline = Compose(test_pipeline) + + pre_pipeline = preprocess(cfg) + + if not args.show: + path.mkdir_or_exist(args.out_dir) + + # get file list + files, source_type = get_file_list(args.img) + + # start detector inference + progress_bar = ProgressBar(len(files)) + for i, file in enumerate(files): + bgr = mmcv.imread(file) + rgb = mmcv.imconvert(bgr, 'bgr', 'rgb') + data, samples = test_pipeline(dict(img=rgb, img_id=i)).values() + pad_param = samples.get('pad_param', + np.array([0, 0, 0, 0], dtype=np.float32)) + h, w = samples.get('ori_shape', rgb.shape[:2]) + pad_param = torch.asarray( + [pad_param[2], pad_param[0], pad_param[2], pad_param[0]], + device=args.device) + scale_factor = samples.get('scale_factor', [1., 1]) + scale_factor = torch.asarray(scale_factor * 2, device=args.device) + data = pre_pipeline(data).to(args.device) + + result = model(data) + if source_type['is_dir']: + filename = os.path.relpath(file, args.img).replace('/', '_') + else: + filename = os.path.basename(file) + out_file = None if args.show else os.path.join(args.out_dir, filename) + + # Get 
candidate predict info by num_dets + num_dets, bboxes, scores, labels = result + scores = scores[0, :num_dets] + bboxes = bboxes[0, :num_dets] + labels = labels[0, :num_dets] + bboxes -= pad_param + bboxes /= scale_factor + + bboxes[:, 0::2].clamp_(0, w) + bboxes[:, 1::2].clamp_(0, h) + bboxes = bboxes.round().int() + + for (bbox, score, label) in zip(bboxes, scores, labels): + bbox = bbox.tolist() + color = colors[label] + + if class_names is not None: + label_name = class_names[label] + name = f'cls:{label_name}_score:{score:0.4f}' + else: + name = f'cls:{label}_score:{score:0.4f}' + + cv2.rectangle(bgr, bbox[:2], bbox[2:], color, 2) + cv2.putText( + bgr, + name, (bbox[0], bbox[1] - 2), + cv2.FONT_HERSHEY_SIMPLEX, + 2.0, [225, 255, 255], + thickness=3) + + if args.show: + mmcv.imshow(bgr, 'result', 0) + else: + mmcv.imwrite(bgr, out_file) + progress_bar.update() + + +if __name__ == '__main__': + main() diff --git a/yolo_world/engine/__init__.py b/yolo_world/engine/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..74177cd3c2f867cfa85c41ad6e41a75be478af80 --- /dev/null +++ b/yolo_world/engine/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Tencent Inc. All rights reserved. +from .optimizers import * # noqa diff --git a/yolo_world/engine/optimizers/__init__.py b/yolo_world/engine/optimizers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..607cefb27435590334926f1521734b1ecadc32ab --- /dev/null +++ b/yolo_world/engine/optimizers/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) Tencent Inc. All rights reserved. +from .yolow_v5_optim_constructor import YOLOWv5OptimizerConstructor + +__all__ = ['YOLOWv5OptimizerConstructor'] diff --git a/yolo_world/engine/optimizers/yolow_v5_optim_constructor.py b/yolo_world/engine/optimizers/yolow_v5_optim_constructor.py new file mode 100644 index 0000000000000000000000000000000000000000..a8b625ebc9684c4cac2a27383f592a786a5a9e00 --- /dev/null +++ b/yolo_world/engine/optimizers/yolow_v5_optim_constructor.py @@ -0,0 +1,187 @@ +# Copyright (c) Tencent Inc. All rights reserved. +import logging +from typing import List, Optional, Union + +import torch +import torch.nn as nn +from torch.nn import GroupNorm, LayerNorm +from mmengine.dist import get_world_size +from mmengine.logging import print_log +from mmengine.optim import OptimWrapper, DefaultOptimWrapperConstructor +from mmengine.utils.dl_utils import mmcv_full_available +from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm, _InstanceNorm + +from mmyolo.registry import (OPTIM_WRAPPER_CONSTRUCTORS, OPTIM_WRAPPERS, + OPTIMIZERS) + + +@OPTIM_WRAPPER_CONSTRUCTORS.register_module() +class YOLOWv5OptimizerConstructor(DefaultOptimWrapperConstructor): + """YOLO World v5 constructor for optimizers.""" + + def __init__(self, + optim_wrapper_cfg: dict, + paramwise_cfg: Optional[dict] = None) -> None: + super().__init__(optim_wrapper_cfg, paramwise_cfg) + self.base_total_batch_size = self.paramwise_cfg.pop( + 'base_total_batch_size', 64) + + def add_params(self, + params: List[dict], + module: nn.Module, + prefix: str = '', + is_dcn_module: Optional[Union[int, float]] = None) -> None: + """Add all parameters of module to the params list. + + The parameters of the given module will be added to the list of param + groups, with specific rules defined by paramwise_cfg. + + Args: + params (list[dict]): A list of param groups, it will be modified + in place. + module (nn.Module): The module to be added. 
+ prefix (str): The prefix of the module + is_dcn_module (int|float|None): If the current module is a + submodule of DCN, `is_dcn_module` will be passed to + control conv_offset layer's learning rate. Defaults to None. + """ + # get param-wise options + custom_keys = self.paramwise_cfg.get('custom_keys', {}) + # first sort with alphabet order and then sort with reversed len of str + sorted_keys = sorted(sorted(custom_keys.keys()), key=len, reverse=True) + + bias_lr_mult = self.paramwise_cfg.get('bias_lr_mult', None) + bias_decay_mult = self.paramwise_cfg.get('bias_decay_mult', None) + norm_decay_mult = self.paramwise_cfg.get('norm_decay_mult', None) + dwconv_decay_mult = self.paramwise_cfg.get('dwconv_decay_mult', None) + flat_decay_mult = self.paramwise_cfg.get('flat_decay_mult', None) + bypass_duplicate = self.paramwise_cfg.get('bypass_duplicate', False) + dcn_offset_lr_mult = self.paramwise_cfg.get('dcn_offset_lr_mult', None) + + # special rules for norm layers and depth-wise conv layers + is_norm = isinstance(module, + (_BatchNorm, _InstanceNorm, GroupNorm, LayerNorm)) + is_dwconv = ( + isinstance(module, torch.nn.Conv2d) + and module.in_channels == module.groups) + + for name, param in module.named_parameters(recurse=False): + param_group = {'params': [param]} + if bypass_duplicate and self._is_in(param_group, params): + print_log( + f'{prefix} is duplicate. It is skipped since ' + f'bypass_duplicate={bypass_duplicate}', + logger='current', + level=logging.WARNING) + continue + if not param.requires_grad: + params.append(param_group) + continue + + # if the parameter match one of the custom keys, ignore other rules + for key in sorted_keys: + if key in f'{prefix}.{name}': + lr_mult = custom_keys[key].get('lr_mult', 1.) + param_group['lr'] = self.base_lr * lr_mult + if self.base_wd is not None: + decay_mult = custom_keys[key].get('decay_mult', 1.) 
+ param_group['weight_decay'] = self.base_wd * decay_mult + # add custom settings to param_group + for k, v in custom_keys[key].items(): + param_group[k] = v + break + + # NOTE: the behavious is different from MMDetection + # bias_lr_mult affects all bias parameters + # except for norm.bias dcn.conv_offset.bias + if name == 'bias' and not ( + is_norm or is_dcn_module) and bias_lr_mult is not None: + param_group['lr'] = self.base_lr * bias_lr_mult + + if (prefix.find('conv_offset') != -1 and is_dcn_module + and dcn_offset_lr_mult is not None + and isinstance(module, torch.nn.Conv2d)): + # deal with both dcn_offset's bias & weight + param_group['lr'] = self.base_lr * dcn_offset_lr_mult + + # apply weight decay policies + if self.base_wd is not None: + # norm decay + if is_norm and norm_decay_mult is not None: + param_group[ + 'weight_decay'] = self.base_wd * norm_decay_mult + # bias lr and decay + elif (name == 'bias' and not is_dcn_module + and bias_decay_mult is not None): + param_group[ + 'weight_decay'] = self.base_wd * bias_decay_mult + # depth-wise conv + elif is_dwconv and dwconv_decay_mult is not None: + param_group[ + 'weight_decay'] = self.base_wd * dwconv_decay_mult + # flatten parameters except dcn offset + elif (param.ndim == 1 and not is_dcn_module + and flat_decay_mult is not None): + param_group[ + 'weight_decay'] = self.base_wd * flat_decay_mult + params.append(param_group) + for key, value in param_group.items(): + if key == 'params': + continue + full_name = f'{prefix}.{name}' if prefix else name + print_log( + f'paramwise_options -- {full_name}:{key}={value}', + logger='current') + + if mmcv_full_available(): + from mmcv.ops import DeformConv2d, ModulatedDeformConv2d + is_dcn_module = isinstance(module, + (DeformConv2d, ModulatedDeformConv2d)) + else: + is_dcn_module = False + for child_name, child_mod in module.named_children(): + child_prefix = f'{prefix}.{child_name}' if prefix else child_name + self.add_params( + params, + child_mod, + prefix=child_prefix, + is_dcn_module=is_dcn_module) + + def __call__(self, model: nn.Module) -> OptimWrapper: + if hasattr(model, 'module'): + model = model.module + + optim_wrapper_cfg = self.optim_wrapper_cfg.copy() + optim_wrapper_cfg.setdefault('type', 'OptimWrapper') + optimizer_cfg = self.optimizer_cfg.copy() + + # follow the original yolov5 implementation + if 'batch_size_per_gpu' in optimizer_cfg: + batch_size_per_gpu = optimizer_cfg.pop('batch_size_per_gpu') + # No scaling if total_batch_size is less than + # base_total_batch_size, otherwise linear scaling. 
+ total_batch_size = get_world_size() * batch_size_per_gpu + accumulate = max( + round(self.base_total_batch_size / total_batch_size), 1) + scale_factor = total_batch_size * \ + accumulate / self.base_total_batch_size + + if scale_factor != 1: + weight_decay = optimizer_cfg.get('weight_decay', 0) + weight_decay *= scale_factor + optimizer_cfg['weight_decay'] = weight_decay + print_log(f'Scaled weight_decay to {weight_decay}', 'current') + + # if no paramwise option is specified, just use the global setting + if not self.paramwise_cfg: + optimizer_cfg['params'] = model.parameters() + optimizer = OPTIMIZERS.build(optimizer_cfg) + else: + # set param-wise lr and weight decay recursively + params: List = [] + self.add_params(params, model) + optimizer_cfg['params'] = params + optimizer = OPTIMIZERS.build(optimizer_cfg) + optim_wrapper = OPTIM_WRAPPERS.build( + optim_wrapper_cfg, default_args=dict(optimizer=optimizer)) + return optim_wrapper diff --git a/yolo_world/models/__init__.py b/yolo_world/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..266251a562ba23081552bff4f82d56b34eba66a2 --- /dev/null +++ b/yolo_world/models/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) Tencent Inc. All rights reserved. +from .backbones import * # noqa +from .layers import * # noqa +from .detectors import * # noqa +from .losses import * # noqa +from .data_preprocessors import * # noqa +from .dense_heads import * # noqa +from .necks import * # noqa diff --git a/yolo_world/models/backbones/__init__.py b/yolo_world/models/backbones/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..67698adfe7d6efe0beef29127ce7f34e9aa573ba --- /dev/null +++ b/yolo_world/models/backbones/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) Tencent Inc. All rights reserved. +# YOLO Multi-Modal Backbone (Vision Language) +# Vision: YOLOv8 CSPDarknet +# Language: CLIP Text Encoder (12-layer transformer) +from .mm_backbone import ( + MultiModalYOLOBackbone, + HuggingVisionBackbone, + HuggingCLIPLanguageBackbone, + PseudoLanguageBackbone) + +__all__ = [ + 'MultiModalYOLOBackbone', + 'HuggingVisionBackbone', + 'HuggingCLIPLanguageBackbone', + 'PseudoLanguageBackbone' +] diff --git a/yolo_world/models/backbones/mm_backbone.py b/yolo_world/models/backbones/mm_backbone.py new file mode 100644 index 0000000000000000000000000000000000000000..2ebbe21c91386bc0c5ab312fbdb57ae0ae340b84 --- /dev/null +++ b/yolo_world/models/backbones/mm_backbone.py @@ -0,0 +1,215 @@ +# Copyright (c) Tencent Inc. All rights reserved. 
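# How the backbones below fit together (a minimal sketch; the config values
# are hypothetical and only illustrate the expected interfaces):
#
#   text_model = dict(
#       type='HuggingCLIPLanguageBackbone',
#       model_name='openai/clip-vit-base-patch32',  # any CLIP text checkpoint
#       frozen_modules=['all'])
#   image_model = dict(...)                         # an mmyolo image backbone
#   backbone = dict(type='MultiModalYOLOBackbone',
#                   image_model=image_model,
#                   text_model=text_model)
#
# Given a batch of images and per-image prompt lists, e.g.
#   texts = [['person', 'dog', 'car'], ['person', 'dog', 'car']]
# MultiModalYOLOBackbone.forward returns the multi-scale image features from
# the vision backbone together with L2-normalized text embeddings of shape
# (batch_size, num_prompts, embed_dim) produced by forward_text below.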
+import itertools +from typing import List, Sequence, Tuple +import torch +from torch import Tensor +from torch.nn.modules.batchnorm import _BatchNorm +from mmengine.model import BaseModule +from mmyolo.registry import MODELS +from mmdet.utils import OptMultiConfig, ConfigType +from transformers import ( + AutoTokenizer, + AutoModel, + CLIPTextConfig) +from transformers import CLIPTextModelWithProjection as CLIPTP + + +@MODELS.register_module() +class HuggingVisionBackbone(BaseModule): + def __init__(self, + model_name: str, + out_indices: Sequence[int] = (0, 1, 2, 3), + norm_eval: bool = True, + frozen_modules: Sequence[str] = (), + init_cfg: OptMultiConfig = None) -> None: + + super().__init__(init_cfg=init_cfg) + + self.norm_eval = norm_eval + self.frozen_modules = frozen_modules + self.model = AutoModel.from_pretrained(model_name) + + self._freeze_modules() + + def forward(self, image: Tensor) -> Tuple[Tensor]: + encoded_dict = self.image_model(pixel_values=image, + output_hidden_states=True) + hidden_states = encoded_dict.hidden_states + img_feats = encoded_dict.get('reshaped_hidden_states', hidden_states) + img_feats = [img_feats[i] for i in self.image_out_indices] + return tuple(img_feats) + + def _freeze_modules(self): + for name, module in self.model.named_modules(): + for frozen_name in self.frozen_modules: + if name.startswith(frozen_name): + module.eval() + for param in module.parameters(): + param.requires_grad = False + break + + def train(self, mode=True): + super().train(mode) + self._freeze_modules() + if mode and self.norm_eval: + for m in self.modules(): + # trick: eval have effect on BatchNorm only + if isinstance(m, _BatchNorm): + m.eval() + + +@MODELS.register_module() +class HuggingCLIPLanguageBackbone(BaseModule): + def __init__(self, + model_name: str, + frozen_modules: Sequence[str] = (), + dropout: float = 0.0, + training_use_cache: bool = False, + init_cfg: OptMultiConfig = None) -> None: + + super().__init__(init_cfg=init_cfg) + + self.frozen_modules = frozen_modules + self.training_use_cache = training_use_cache + self.tokenizer = AutoTokenizer.from_pretrained(model_name) + clip_config = CLIPTextConfig.from_pretrained(model_name, + attention_dropout=dropout) + self.model = CLIPTP.from_pretrained(model_name, + config=clip_config) + self._freeze_modules() + + def forward_cache(self, text: List[List[str]]) -> Tensor: + if not hasattr(self, "cache"): + self.cache = self.forward_text(text) + return self.cache + + def forward(self, text: List[List[str]]) -> Tensor: + if self.training: + return self.forward_text(text) + else: + return self.forward_text(text) + # return self.forward_cache(text) + + def forward_tokenizer(self, texts): + if not hasattr(self, 'text'): + text = list(itertools.chain(*texts)) + # print(text) + # # text = ['a photo of {}'.format(x) for x in text] + text = self.tokenizer(text=text, return_tensors='pt', padding=True) + # print(text) + self.text = text.to(device=self.model.device) + return self.text + + def forward_text(self, text: List[List[str]]) -> Tensor: + num_per_batch = [len(t) for t in text] + assert max(num_per_batch) == min(num_per_batch), ( + 'number of sequences not equal in batch') + # print(max([[len(t.split(' ')) for t in tt] for tt in text])) + # print(num_per_batch, max(num_per_batch)) + text = list(itertools.chain(*text)) + # print(text) + # text = ['a photo of {}'.format(x) for x in text] + # text = self.forward_tokenizer(text) + text = self.tokenizer(text=text, return_tensors='pt', padding=True) + text = 
text.to(device=self.model.device) + txt_outputs = self.model(**text) + # txt_feats = txt_outputs.last_hidden_state[:, 0, :] + txt_feats = txt_outputs.text_embeds + txt_feats = txt_feats / txt_feats.norm(p=2, dim=-1, keepdim=True) + txt_feats = txt_feats.reshape(-1, num_per_batch[0], + txt_feats.shape[-1]) + return txt_feats + + def _freeze_modules(self): + + if len(self.frozen_modules) == 0: + # not freeze + return + if self.frozen_modules[0] == "all": + self.model.eval() + for _, module in self.model.named_modules(): + module.eval() + for param in module.parameters(): + param.requires_grad = False + return + for name, module in self.model.named_modules(): + for frozen_name in self.frozen_modules: + if name.startswith(frozen_name): + module.eval() + for param in module.parameters(): + param.requires_grad = False + break + + def train(self, mode=True): + super().train(mode) + self._freeze_modules() + + +@MODELS.register_module() +class PseudoLanguageBackbone(BaseModule): + """Pseudo Language Backbone + Args: + text_embed_path (str): path to the text embedding file + """ + def __init__(self, + text_embed_path: str = "", + test_embed_path: str = None, + init_cfg: OptMultiConfig = None): + super().__init__(init_cfg) + # {text:embed} + self.text_embed = torch.load(text_embed_path, map_location='cpu') + if test_embed_path is None: + self.test_embed = self.text_embed + else: + self.test_embed = torch.load(test_embed_path) + self.register_buffer("buff", torch.zeros([ + 1, + ])) + + def forward_cache(self, text: List[List[str]]) -> Tensor: + if not hasattr(self, "cache"): + self.cache = self.forward_text(text) + return self.cache + + def forward(self, text: List[List[str]]) -> Tensor: + if self.training: + return self.forward_text(text) + else: + return self.forward_cache(text) + + def forward_text(self, text: List[List[str]]) -> Tensor: + num_per_batch = [len(t) for t in text] + assert max(num_per_batch) == min(num_per_batch), ( + 'number of sequences not equal in batch') + text = list(itertools.chain(*text)) + if self.training: + text_embed_dict = self.text_embed + else: + text_embed_dict = self.test_embed + text_embeds = torch.stack( + [text_embed_dict[x.split("/")[0]] for x in text]) + # requires no grad and force to float + text_embeds = text_embeds.to( + self.buff.device).requires_grad_(False).float() + text_embeds = text_embeds.reshape(-1, num_per_batch[0], + text_embeds.shape[-1]) + return text_embeds + + +@MODELS.register_module() +class MultiModalYOLOBackbone(BaseModule): + def __init__(self, + image_model: ConfigType, + text_model: ConfigType, + init_cfg: OptMultiConfig = None) -> None: + + super().__init__(init_cfg) + + self.image_model = MODELS.build(image_model) + self.text_model = MODELS.build(text_model) + + def forward(self, image: Tensor, + text: List[List[str]]) -> Tuple[Tuple[Tensor], Tensor]: + img_feats = self.image_model(image) + txt_feats = self.text_model(text) + return img_feats, txt_feats diff --git a/yolo_world/models/data_preprocessors/__init__.py b/yolo_world/models/data_preprocessors/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e3959ac60693349c2ecd1a659aa0ca32c00c7eae --- /dev/null +++ b/yolo_world/models/data_preprocessors/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) Tencent Inc. All rights reserved. 
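# A minimal config sketch (hypothetical values) for the preprocessor exported
# here; as its docstring notes, it must be paired with the `yolow_collate`
# collate function so that `data_samples` arrives as a dict during training:
#
#   data_preprocessor = dict(
#       type='YOLOWDetDataPreprocessor',
#       mean=[0., 0., 0.],
#       std=[255., 255., 255.],      # assumed normalization constants
#       bgr_to_rgb=True)
#   train_dataloader = dict(collate_fn=dict(type='yolow_collate'), ...)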
+from .data_preprocessor import YOLOWDetDataPreprocessor + +__all__ = ['YOLOWDetDataPreprocessor'] diff --git a/yolo_world/models/data_preprocessors/data_preprocessor.py b/yolo_world/models/data_preprocessors/data_preprocessor.py new file mode 100644 index 0000000000000000000000000000000000000000..58787063c8da3cd654c6e33eb81919a106273ab9 --- /dev/null +++ b/yolo_world/models/data_preprocessors/data_preprocessor.py @@ -0,0 +1,63 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Union + +import torch +from mmdet.models.data_preprocessors import DetDataPreprocessor +from mmengine.structures import BaseDataElement + +from mmyolo.registry import MODELS + +CastData = Union[tuple, dict, BaseDataElement, torch.Tensor, list, bytes, str, + None] + + +@MODELS.register_module() +class YOLOWDetDataPreprocessor(DetDataPreprocessor): + """Rewrite collate_fn to get faster training speed. + + Note: It must be used together with `mmyolo.datasets.utils.yolow_collate` + """ + + def __init__(self, *args, non_blocking: Optional[bool] = True, **kwargs): + super().__init__(*args, non_blocking=non_blocking, **kwargs) + + def forward(self, data: dict, training: bool = False) -> dict: + """Perform normalization, padding and bgr2rgb conversion based on + ``DetDataPreprocessorr``. + + Args: + data (dict): Data sampled from dataloader. + training (bool): Whether to enable training time augmentation. + + Returns: + dict: Data in the same format as the model input. + """ + if not training: + return super().forward(data, training) + + data = self.cast_data(data) + inputs, data_samples = data['inputs'], data['data_samples'] + assert isinstance(data['data_samples'], dict) + + # TODO: Supports multi-scale training + if self._channel_conversion and inputs.shape[1] == 3: + inputs = inputs[:, [2, 1, 0], ...] + if self._enable_normalize: + inputs = (inputs - self.mean) / self.std + + if self.batch_augments is not None: + for batch_aug in self.batch_augments: + inputs, data_samples = batch_aug(inputs, data_samples) + + img_metas = [{'batch_input_shape': inputs.shape[2:]}] * len(inputs) + data_samples_output = { + 'bboxes_labels': data_samples['bboxes_labels'], + 'texts': data_samples['texts'], + 'img_metas': img_metas + } + if 'masks' in data_samples: + data_samples_output['masks'] = data_samples['masks'] + if 'is_detection' in data_samples: + data_samples_output['is_detection'] = data_samples['is_detection'] + + return {'inputs': inputs, 'data_samples': data_samples_output} diff --git a/yolo_world/models/dense_heads/__init__.py b/yolo_world/models/dense_heads/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2c8ede3abaf0ce284748b22c3db2f5c9a22d7a43 --- /dev/null +++ b/yolo_world/models/dense_heads/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) Tencent Inc. All rights reserved. +from .yolo_world_head import YOLOWorldHead, YOLOWorldHeadModule +__all__ = ['YOLOWorldHead', 'YOLOWorldHeadModule'] diff --git a/yolo_world/models/dense_heads/yolo_world_head.py b/yolo_world/models/dense_heads/yolo_world_head.py new file mode 100644 index 0000000000000000000000000000000000000000..8f9d695b0d966d2c64b0a5d0667b8b05f458fab5 --- /dev/null +++ b/yolo_world/models/dense_heads/yolo_world_head.py @@ -0,0 +1,596 @@ +# Copyright (c) Tencent Inc. All rights reserved. 
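# Shape sketch for the contrastive heads defined below (illustrative only,
# assuming random tensors): given image features x of shape (B, C, H, W) and
# text embeddings w of shape (B, K, C), the region-text score map is computed
# per location with an einsum, then scaled by a learnable logit_scale and
# shifted by a learnable bias:
#
#   import torch
#   import torch.nn.functional as F
#   x = F.normalize(torch.randn(2, 256, 20, 20), dim=1)   # image features
#   w = F.normalize(torch.randn(2, 80, 256), dim=-1)      # 80 text prompts
#   s = torch.einsum('bchw,bkc->bkhw', x, w)               # region-text scores
#   assert s.shape == (2, 80, 20, 20)
#
# BNContrastiveHead replaces the L2 normalization of x with a BatchNorm layer,
# which the inline comment below notes is more stable when logit_scale is
# initialized to -1.0.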
+import math +import copy +from typing import List, Optional, Tuple, Union, Sequence +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np +from mmcv.cnn import ConvModule +from mmengine.config import ConfigDict +from mmengine.model import BaseModule +from torch import Tensor + +from mmengine.dist import get_dist_info +from mmengine.structures import InstanceData +from mmdet.structures import SampleList +from mmdet.utils import OptConfigType, InstanceList, OptInstanceList +from mmdet.models.utils import ( + multi_apply, + unpack_gt_instances, + filter_scores_and_topk) +from mmyolo.registry import MODELS +from mmyolo.models.dense_heads import YOLOv8HeadModule, YOLOv8Head +from mmyolo.models.utils import gt_instances_preprocess +from mmcv.cnn.bricks import build_norm_layer + + +@MODELS.register_module() +class ContrastiveHead(BaseModule): + """Contrastive Head for YOLO-World + compute the region-text scores according to the + similarity between image and text features + Args: + embed_dims (int): embed dim of text and image features + """ + def __init__(self, + embed_dims: int, + init_cfg: OptConfigType = None) -> None: + + super().__init__(init_cfg=init_cfg) + + self.bias = nn.Parameter(torch.zeros([])) + self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07)) + + def forward(self, x: Tensor, w: Tensor) -> Tensor: + """Forward function of contrastive learning.""" + x = F.normalize(x, dim=1, p=2) + w = F.normalize(w, dim=-1, p=2) + x = torch.einsum('bchw,bkc->bkhw', x, w) + x = x * self.logit_scale.exp() + self.bias + return x + + +@MODELS.register_module() +class BNContrastiveHead(BaseModule): + """ Batch Norm Contrastive Head for YOLO-World + using batch norm instead of l2-normalization + Args: + embed_dims (int): embed dim of text and image features + norm_cfg (dict): normalization params + """ + def __init__(self, + embed_dims: int, + norm_cfg: ConfigDict, + init_cfg: OptConfigType = None) -> None: + + super().__init__(init_cfg=init_cfg) + self.norm = build_norm_layer(norm_cfg, embed_dims)[1] + self.bias = nn.Parameter(torch.zeros([])) + # use -1.0 is more stable + self.logit_scale = nn.Parameter(-1.0 * torch.ones([])) + + def forward(self, x: Tensor, w: Tensor) -> Tensor: + """Forward function of contrastive learning.""" + x = self.norm(x) + w = F.normalize(w, dim=-1, p=2) + x = torch.einsum('bchw,bkc->bkhw', x, w) + x = x * self.logit_scale.exp() + self.bias + return x + + +@MODELS.register_module() +class YOLOWorldHeadModule(YOLOv8HeadModule): + """Head Module for YOLO-World + + Args: + embed_dims (int): embed dim for text feautures and image features + use_bn_head (bool): use batch normalization head + """ + + def __init__(self, + *args, + embed_dims: int, + use_bn_head: bool = False, + **kwargs) -> None: + self.embed_dims = embed_dims + self.use_bn_head = use_bn_head + super().__init__(*args, **kwargs) + + def init_weights(self, prior_prob=0.01): + """Initialize the weight and bias of PPYOLOE head.""" + super().init_weights() + for cls_pred, cls_contrast, stride in zip(self.cls_preds, + self.cls_contrasts, + self.featmap_strides): + cls_pred[-1].bias.data[:] = 0.0 # reset bias + if hasattr(cls_contrast, 'bias'): + nn.init.constant_( + cls_contrast.bias.data, + math.log(5 / self.num_classes / (640 / stride)**2)) + + def _init_layers(self) -> None: + """initialize conv layers in YOLOv8 head.""" + # Init decouple head + self.cls_preds = nn.ModuleList() + self.reg_preds = nn.ModuleList() + self.cls_contrasts = nn.ModuleList() + + 
reg_out_channels = max( + (16, self.in_channels[0] // 4, self.reg_max * 4)) + cls_out_channels = max(self.in_channels[0], self.num_classes) + + for i in range(self.num_levels): + self.reg_preds.append( + nn.Sequential( + ConvModule(in_channels=self.in_channels[i], + out_channels=reg_out_channels, + kernel_size=3, + stride=1, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + ConvModule(in_channels=reg_out_channels, + out_channels=reg_out_channels, + kernel_size=3, + stride=1, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + nn.Conv2d(in_channels=reg_out_channels, + out_channels=4 * self.reg_max, + kernel_size=1))) + self.cls_preds.append( + nn.Sequential( + ConvModule(in_channels=self.in_channels[i], + out_channels=cls_out_channels, + kernel_size=3, + stride=1, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + ConvModule(in_channels=cls_out_channels, + out_channels=cls_out_channels, + kernel_size=3, + stride=1, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + nn.Conv2d(in_channels=cls_out_channels, + out_channels=self.embed_dims, + kernel_size=1))) + if self.use_bn_head: + self.cls_contrasts.append( + BNContrastiveHead(self.embed_dims, self.norm_cfg)) + else: + self.cls_contrasts.append(ContrastiveHead(self.embed_dims)) + + proj = torch.arange(self.reg_max, dtype=torch.float) + self.register_buffer('proj', proj, persistent=False) + + def forward(self, img_feats: Tuple[Tensor], + txt_feats: Tensor) -> Tuple[List]: + """Forward features from the upstream network.""" + assert len(img_feats) == self.num_levels + txt_feats = [txt_feats for _ in range(self.num_levels)] + return multi_apply(self.forward_single, img_feats, txt_feats, + self.cls_preds, self.reg_preds, self.cls_contrasts) + + def forward_single(self, img_feat: Tensor, txt_feat: Tensor, + cls_pred: nn.ModuleList, reg_pred: nn.ModuleList, + cls_contrast: nn.ModuleList) -> Tuple: + """Forward feature of a single scale level.""" + b, _, h, w = img_feat.shape + cls_embed = cls_pred(img_feat) + cls_logit = cls_contrast(cls_embed, txt_feat) + bbox_dist_preds = reg_pred(img_feat) + if self.reg_max > 1: + bbox_dist_preds = bbox_dist_preds.reshape( + [-1, 4, self.reg_max, h * w]).permute(0, 3, 1, 2) + + # TODO: The get_flops script cannot handle the situation of + # matmul, and needs to be fixed later + # bbox_preds = bbox_dist_preds.softmax(3).matmul(self.proj) + bbox_preds = bbox_dist_preds.softmax(3).matmul( + self.proj.view([-1, 1])).squeeze(-1) + bbox_preds = bbox_preds.transpose(1, 2).reshape(b, -1, h, w) + else: + bbox_preds = bbox_dist_preds + if self.training: + return cls_logit, bbox_preds, bbox_dist_preds + else: + return cls_logit, bbox_preds + + +@MODELS.register_module() +class YOLOWorldHead(YOLOv8Head): + """YOLO-World Head + """ + def __init__(self, world_size=-1, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + self.world_size = world_size + + """YOLO World v8 head.""" + def loss(self, img_feats: Tuple[Tensor], txt_feats: Tensor, + batch_data_samples: Union[list, dict]) -> dict: + """Perform forward propagation and loss calculation of the detection + head on the features of the upstream network.""" + + outs = self(img_feats, txt_feats) + # Fast version + loss_inputs = outs + (batch_data_samples['bboxes_labels'], + batch_data_samples['img_metas']) + losses = self.loss_by_feat(*loss_inputs) + + return losses + + def loss_and_predict( + self, + img_feats: Tuple[Tensor], + txt_feats: Tensor, + batch_data_samples: SampleList, + proposal_cfg: 
Optional[ConfigDict] = None + ) -> Tuple[dict, InstanceList]: + """Perform forward propagation of the head, then calculate loss and + predictions from the features and data samples. + """ + outputs = unpack_gt_instances(batch_data_samples) + (batch_gt_instances, batch_gt_instances_ignore, + batch_img_metas) = outputs + + outs = self(img_feats, txt_feats) + + loss_inputs = outs + (batch_gt_instances, batch_img_metas, + batch_gt_instances_ignore) + losses = self.loss_by_feat(*loss_inputs) + + predictions = self.predict_by_feat(*outs, + batch_img_metas=batch_img_metas, + cfg=proposal_cfg) + return losses, predictions + + def forward(self, img_feats: Tuple[Tensor], + txt_feats: Tensor) -> Tuple[List]: + """Forward features from the upstream network.""" + self.num_classes = txt_feats.shape[1] + return self.head_module(img_feats, txt_feats) + + def predict(self, + img_feats: Tuple[Tensor], + txt_feats: Tensor, + batch_data_samples: SampleList, + rescale: bool = False) -> InstanceList: + """Perform forward propagation of the detection head and predict + detection results on the features of the upstream network. + """ + batch_img_metas = [ + data_samples.metainfo for data_samples in batch_data_samples + ] + outs = self(img_feats, txt_feats) + predictions = self.predict_by_feat(*outs, + batch_img_metas=batch_img_metas, + rescale=rescale) + return predictions + + def aug_test(self, + aug_batch_feats, + aug_batch_img_metas, + rescale=False, + with_ori_nms=False, + **kwargs): + """Test function with test time augmentation.""" + raise NotImplementedError('aug_test is not implemented yet.') + + def loss_by_feat( + self, + cls_scores: Sequence[Tensor], + bbox_preds: Sequence[Tensor], + bbox_dist_preds: Sequence[Tensor], + batch_gt_instances: Sequence[InstanceData], + batch_img_metas: Sequence[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (Sequence[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_priors * num_classes. + bbox_preds (Sequence[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_priors * 4. + bbox_dist_preds (Sequence[Tensor]): Box distribution logits for + each scale level with shape (bs, reg_max + 1, H*W, 4). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + Returns: + dict[str, Tensor]: A dictionary of losses. 
+ """ + num_imgs = len(batch_img_metas) + + current_featmap_sizes = [ + cls_score.shape[2:] for cls_score in cls_scores + ] + # If the shape does not equal, generate new one + if current_featmap_sizes != self.featmap_sizes_train: + self.featmap_sizes_train = current_featmap_sizes + + mlvl_priors_with_stride = self.prior_generator.grid_priors( + self.featmap_sizes_train, + dtype=cls_scores[0].dtype, + device=cls_scores[0].device, + with_stride=True) + + self.num_level_priors = [len(n) for n in mlvl_priors_with_stride] + self.flatten_priors_train = torch.cat( + mlvl_priors_with_stride, dim=0) + self.stride_tensor = self.flatten_priors_train[..., [2]] + + # gt info + gt_info = gt_instances_preprocess(batch_gt_instances, num_imgs) + gt_labels = gt_info[:, :, :1] + gt_bboxes = gt_info[:, :, 1:] # xyxy + pad_bbox_flag = (gt_bboxes.sum(-1, keepdim=True) > 0).float() + + # pred info + flatten_cls_preds = [ + cls_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.num_classes) + for cls_pred in cls_scores + ] + flatten_pred_bboxes = [ + bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) + for bbox_pred in bbox_preds + ] + # (bs, n, 4 * reg_max) + flatten_pred_dists = [ + bbox_pred_org.reshape(num_imgs, -1, self.head_module.reg_max * 4) + for bbox_pred_org in bbox_dist_preds + ] + + flatten_dist_preds = torch.cat(flatten_pred_dists, dim=1) + flatten_cls_preds = torch.cat(flatten_cls_preds, dim=1) + flatten_pred_bboxes = torch.cat(flatten_pred_bboxes, dim=1) + flatten_pred_bboxes = self.bbox_coder.decode( + self.flatten_priors_train[..., :2], flatten_pred_bboxes, + self.stride_tensor[..., 0]) + + assigned_result = self.assigner( + (flatten_pred_bboxes.detach()).type(gt_bboxes.dtype), + flatten_cls_preds.detach().sigmoid(), self.flatten_priors_train, + gt_labels, gt_bboxes, pad_bbox_flag) + + assigned_bboxes = assigned_result['assigned_bboxes'] + assigned_scores = assigned_result['assigned_scores'] + fg_mask_pre_prior = assigned_result['fg_mask_pre_prior'] + + assigned_scores_sum = assigned_scores.sum().clamp(min=1) + + loss_cls = self.loss_cls(flatten_cls_preds, assigned_scores).sum() + loss_cls /= assigned_scores_sum + + # rescale bbox + assigned_bboxes /= self.stride_tensor + flatten_pred_bboxes /= self.stride_tensor + + # select positive samples mask + num_pos = fg_mask_pre_prior.sum() + if num_pos > 0: + # when num_pos > 0, assigned_scores_sum will >0, so the loss_bbox + # will not report an error + # iou loss + prior_bbox_mask = fg_mask_pre_prior.unsqueeze(-1).repeat([1, 1, 4]) + pred_bboxes_pos = torch.masked_select( + flatten_pred_bboxes, prior_bbox_mask).reshape([-1, 4]) + assigned_bboxes_pos = torch.masked_select( + assigned_bboxes, prior_bbox_mask).reshape([-1, 4]) + bbox_weight = torch.masked_select( + assigned_scores.sum(-1), fg_mask_pre_prior).unsqueeze(-1) + loss_bbox = self.loss_bbox( + pred_bboxes_pos, assigned_bboxes_pos, + weight=bbox_weight) / assigned_scores_sum + + # dfl loss + pred_dist_pos = flatten_dist_preds[fg_mask_pre_prior] + assigned_ltrb = self.bbox_coder.encode( + self.flatten_priors_train[..., :2] / self.stride_tensor, + assigned_bboxes, + max_dis=self.head_module.reg_max - 1, + eps=0.01) + assigned_ltrb_pos = torch.masked_select( + assigned_ltrb, prior_bbox_mask).reshape([-1, 4]) + loss_dfl = self.loss_dfl( + pred_dist_pos.reshape(-1, self.head_module.reg_max), + assigned_ltrb_pos.reshape(-1), + weight=bbox_weight.expand(-1, 4).reshape(-1), + avg_factor=assigned_scores_sum) + else: + loss_bbox = flatten_pred_bboxes.sum() * 0 + loss_dfl = 
flatten_pred_bboxes.sum() * 0 + if self.world_size == -1: + _, world_size = get_dist_info() + else: + world_size = self.world_size + return dict( + loss_cls=loss_cls * num_imgs * world_size, + loss_bbox=loss_bbox * num_imgs * world_size, + loss_dfl=loss_dfl * num_imgs * world_size) + + def predict_by_feat(self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + objectnesses: Optional[List[Tensor]] = None, + batch_img_metas: Optional[List[dict]] = None, + cfg: Optional[ConfigDict] = None, + rescale: bool = True, + with_nms: bool = True) -> List[InstanceData]: + """Transform a batch of output features extracted by the head into + bbox results. + Args: + cls_scores (list[Tensor]): Classification scores for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * 4, H, W). + objectnesses (list[Tensor], Optional): Score factor for + all scale level, each is a 4D-tensor, has shape + (batch_size, 1, H, W). + batch_img_metas (list[dict], Optional): Batch image meta info. + Defaults to None. + cfg (ConfigDict, optional): Test / postprocessing + configuration, if None, test_cfg would be used. + Defaults to None. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + with_nms (bool): If True, do nms before return boxes. + Defaults to True. + + Returns: + list[:obj:`InstanceData`]: Object detection results of each image + after the post process. Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). 
+ """ + assert len(cls_scores) == len(bbox_preds) + if objectnesses is None: + with_objectnesses = False + else: + with_objectnesses = True + assert len(cls_scores) == len(objectnesses) + + cfg = self.test_cfg if cfg is None else cfg + cfg = copy.deepcopy(cfg) + + multi_label = cfg.multi_label + multi_label &= self.num_classes > 1 + cfg.multi_label = multi_label + + num_imgs = len(batch_img_metas) + featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores] + + # If the shape does not change, use the previous mlvl_priors + if featmap_sizes != self.featmap_sizes: + self.mlvl_priors = self.prior_generator.grid_priors( + featmap_sizes, + dtype=cls_scores[0].dtype, + device=cls_scores[0].device) + self.featmap_sizes = featmap_sizes + flatten_priors = torch.cat(self.mlvl_priors) + + mlvl_strides = [ + flatten_priors.new_full( + (featmap_size.numel() * self.num_base_priors, ), stride) for + featmap_size, stride in zip(featmap_sizes, self.featmap_strides) + ] + flatten_stride = torch.cat(mlvl_strides) + + # flatten cls_scores, bbox_preds and objectness + flatten_cls_scores = [ + cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.num_classes) + for cls_score in cls_scores + ] + flatten_bbox_preds = [ + bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) + for bbox_pred in bbox_preds + ] + + flatten_cls_scores = torch.cat(flatten_cls_scores, dim=1).sigmoid() + flatten_bbox_preds = torch.cat(flatten_bbox_preds, dim=1) + flatten_decoded_bboxes = self.bbox_coder.decode( + flatten_priors[None], flatten_bbox_preds, flatten_stride) + + if with_objectnesses: + flatten_objectness = [ + objectness.permute(0, 2, 3, 1).reshape(num_imgs, -1) + for objectness in objectnesses + ] + flatten_objectness = torch.cat(flatten_objectness, dim=1).sigmoid() + else: + flatten_objectness = [None for _ in range(num_imgs)] + # 8400 + # print(flatten_cls_scores.shape) + results_list = [] + for (bboxes, scores, objectness, + img_meta) in zip(flatten_decoded_bboxes, flatten_cls_scores, + flatten_objectness, batch_img_metas): + ori_shape = img_meta['ori_shape'] + scale_factor = img_meta['scale_factor'] + if 'pad_param' in img_meta: + pad_param = img_meta['pad_param'] + else: + pad_param = None + + score_thr = cfg.get('score_thr', -1) + # yolox_style does not require the following operations + if objectness is not None and score_thr > 0 and not cfg.get( + 'yolox_style', False): + conf_inds = objectness > score_thr + bboxes = bboxes[conf_inds, :] + scores = scores[conf_inds, :] + objectness = objectness[conf_inds] + + if objectness is not None: + # conf = obj_conf * cls_conf + scores *= objectness[:, None] + + if scores.shape[0] == 0: + empty_results = InstanceData() + empty_results.bboxes = bboxes + empty_results.scores = scores[:, 0] + empty_results.labels = scores[:, 0].int() + results_list.append(empty_results) + continue + + nms_pre = cfg.get('nms_pre', 100000) + if cfg.multi_label is False: + scores, labels = scores.max(1, keepdim=True) + scores, _, keep_idxs, results = filter_scores_and_topk( + scores, + score_thr, + nms_pre, + results=dict(labels=labels[:, 0])) + labels = results['labels'] + else: + scores, labels, keep_idxs, _ = filter_scores_and_topk( + scores, score_thr, nms_pre) + + results = InstanceData( + scores=scores, labels=labels, bboxes=bboxes[keep_idxs]) + + if rescale: + if pad_param is not None: + results.bboxes -= results.bboxes.new_tensor([ + pad_param[2], pad_param[0], pad_param[2], pad_param[0] + ]) + results.bboxes /= results.bboxes.new_tensor( + scale_factor).repeat((1, 2)) + + if 
cfg.get('yolox_style', False): + # do not need max_per_img + cfg.max_per_img = len(results) + + results = self._bbox_post_process( + results=results, + cfg=cfg, + rescale=False, + with_nms=with_nms, + img_meta=img_meta) + results.bboxes[:, 0::2].clamp_(0, ori_shape[1]) + results.bboxes[:, 1::2].clamp_(0, ori_shape[0]) + + results_list.append(results) + return results_list diff --git a/yolo_world/models/detectors/__init__.py b/yolo_world/models/detectors/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..54e83fadee0244f7bf8c4d2f12487eb40a0dae66 --- /dev/null +++ b/yolo_world/models/detectors/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) Tencent Inc. All rights reserved. +from .yolo_world import YOLOWorldDetector + +__all__ = ['YOLOWorldDetector'] diff --git a/yolo_world/models/detectors/yolo_world.py b/yolo_world/models/detectors/yolo_world.py new file mode 100644 index 0000000000000000000000000000000000000000..7cce285950a6eb9bb1d79c36225f0217a0d4bdd4 --- /dev/null +++ b/yolo_world/models/detectors/yolo_world.py @@ -0,0 +1,88 @@ +# Copyright (c) Tencent Inc. All rights reserved. +from typing import List, Tuple, Union +from torch import Tensor +from mmdet.structures import OptSampleList, SampleList +from mmyolo.models.detectors import YOLODetector +from mmyolo.registry import MODELS + + +@MODELS.register_module() +class YOLOWorldDetector(YOLODetector): + """Implementation of YOLOW Series""" + def __init__(self, + *args, + mm_neck: bool = False, + num_train_classes=80, + num_test_classes=80, + **kwargs) -> None: + self.mm_neck = mm_neck + self.num_train_classes = num_train_classes + self.num_test_classes = num_test_classes + super().__init__(*args, **kwargs) + + def loss(self, batch_inputs: Tensor, + batch_data_samples: SampleList) -> Union[dict, list]: + """Calculate losses from a batch of inputs and data samples.""" + self.bbox_head.num_classes = self.num_train_classes + img_feats, txt_feats = self.extract_feat(batch_inputs, + batch_data_samples) + losses = self.bbox_head.loss(img_feats, txt_feats, batch_data_samples) + return losses + + def predict(self, + batch_inputs: Tensor, + batch_data_samples: SampleList, + rescale: bool = True) -> SampleList: + """Predict results from a batch of inputs and data samples with post- + processing. + """ + + img_feats, txt_feats = self.extract_feat(batch_inputs, + batch_data_samples) + + # self.bbox_head.num_classes = self.num_test_classes + self.bbox_head.num_classes = txt_feats[0].shape[0] + results_list = self.bbox_head.predict(img_feats, + txt_feats, + batch_data_samples, + rescale=rescale) + + batch_data_samples = self.add_pred_to_datasample( + batch_data_samples, results_list) + return batch_data_samples + + def reparameterize(self, texts: List[List[str]]) -> None: + self.texts = texts + + def _forward( + self, + batch_inputs: Tensor, + batch_data_samples: OptSampleList = None) -> Tuple[List[Tensor]]: + """Network forward process. Usually includes backbone, neck and head + forward without any post-processing. 
+ """ + img_feats, txt_feats = self.extract_feat(batch_inputs, + batch_data_samples) + results = self.bbox_head.forward(img_feats, txt_feats) + return results + + def extract_feat( + self, batch_inputs: Tensor, + batch_data_samples: SampleList) -> Tuple[Tuple[Tensor], Tensor]: + """Extract features.""" + if batch_data_samples is None: + texts = self.texts + elif isinstance(batch_data_samples, dict): + texts = batch_data_samples['texts'] + elif isinstance(batch_data_samples, list): + texts = [data_sample.texts for data_sample in batch_data_samples] + else: + raise TypeError('batch_data_samples should be dict or list.') + + img_feats, txt_feats = self.backbone(batch_inputs, texts) + if self.with_neck: + if self.mm_neck: + img_feats = self.neck(img_feats, txt_feats) + else: + img_feats = self.neck(img_feats) + return img_feats, txt_feats diff --git a/yolo_world/models/layers/__init__.py b/yolo_world/models/layers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..df266c27335bc3a7dc32dc994067f0ddcc39f84c --- /dev/null +++ b/yolo_world/models/layers/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) Tencent Inc. All rights reserved. +# Basic brick modules for PAFPN based on CSPLayers + +from .yolo_bricks import ( + CSPLayerWithTwoConv, + MaxSigmoidAttnBlock, + MaxSigmoidCSPLayerWithTwoConv, + ImagePoolingAttentionModule, + ) + +__all__ = ['CSPLayerWithTwoConv', + 'MaxSigmoidAttnBlock', + 'MaxSigmoidCSPLayerWithTwoConv', + 'ImagePoolingAttentionModule'] diff --git a/yolo_world/models/layers/yolo_bricks.py b/yolo_world/models/layers/yolo_bricks.py new file mode 100644 index 0000000000000000000000000000000000000000..48d7dfd44531ec8a285d8f340def2bbf53f71d47 --- /dev/null +++ b/yolo_world/models/layers/yolo_bricks.py @@ -0,0 +1,301 @@ +# Copyright (c) Tencent Inc. All rights reserved. +from typing import List + +import torch +import torch.nn as nn +from torch import Tensor +import torch.nn.functional as F +from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule, Linear +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from mmengine.model import BaseModule +from mmyolo.registry import MODELS +from mmyolo.models.layers import CSPLayerWithTwoConv + + +@MODELS.register_module() +class MaxSigmoidAttnBlock(BaseModule): + """Max Sigmoid attention block.""" + def __init__(self, + in_channels: int, + out_channels: int, + guide_channels: int, + embed_channels: int, + kernel_size: int = 3, + padding: int = 1, + num_heads: int = 1, + use_depthwise: bool = False, + with_scale: bool = False, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict(type='BN', + momentum=0.03, + eps=0.001), + init_cfg: OptMultiConfig = None) -> None: + super().__init__(init_cfg=init_cfg) + conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule + + assert (out_channels % num_heads == 0 and + embed_channels % num_heads == 0), \ + 'out_channels and embed_channels should be divisible by num_heads.' 
+ self.num_heads = num_heads + self.head_channels = out_channels // num_heads + + self.embed_conv = ConvModule( + in_channels, + embed_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) if embed_channels != in_channels else None + self.guide_fc = Linear(guide_channels, embed_channels) + self.bias = nn.Parameter(torch.zeros(num_heads)) + if with_scale: + self.scale = nn.Parameter(torch.ones(1, num_heads, 1, 1)) + else: + self.scale = 1.0 + + self.project_conv = conv(in_channels, + out_channels, + kernel_size, + stride=1, + padding=padding, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + + def forward(self, x: Tensor, guide: Tensor) -> Tensor: + """Forward process.""" + B, _, H, W = x.shape + + guide = self.guide_fc(guide) + guide = guide.reshape(B, -1, self.num_heads, self.head_channels) + embed = self.embed_conv(x) if self.embed_conv is not None else x + embed = embed.reshape(B, self.num_heads, self.head_channels, H, W) + + attn_weight = torch.einsum('bmchw,bnmc->bmhwn', embed, guide) + attn_weight = attn_weight.max(dim=-1)[0] + attn_weight = attn_weight / (self.head_channels**0.5) + attn_weight = attn_weight + self.bias[None, :, None, None] + attn_weight = attn_weight.sigmoid() * self.scale + + x = self.project_conv(x) + x = x.reshape(B, self.num_heads, -1, H, W) + x = x * attn_weight.unsqueeze(2) + x = x.reshape(B, -1, H, W) + return x + + +@MODELS.register_module() +class MaxSigmoidCSPLayerWithTwoConv(CSPLayerWithTwoConv): + """Sigmoid-attention based CSP layer with two convolution layers.""" + def __init__( + self, + in_channels: int, + out_channels: int, + guide_channels: int, + embed_channels: int, + num_heads: int = 1, + expand_ratio: float = 0.5, + num_blocks: int = 1, + with_scale: bool = False, + add_identity: bool = True, # shortcut + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict(type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None) -> None: + super().__init__(in_channels=in_channels, + out_channels=out_channels, + expand_ratio=expand_ratio, + num_blocks=num_blocks, + add_identity=add_identity, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + init_cfg=init_cfg) + + self.final_conv = ConvModule((3 + num_blocks) * self.mid_channels, + out_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self.attn_block = MaxSigmoidAttnBlock(self.mid_channels, + self.mid_channels, + guide_channels=guide_channels, + embed_channels=embed_channels, + num_heads=num_heads, + with_scale=with_scale, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg) + + def forward(self, x: Tensor, guide: Tensor) -> Tensor: + """Forward process.""" + x_main = self.main_conv(x) + x_main = list(x_main.split((self.mid_channels, self.mid_channels), 1)) + x_main.extend(blocks(x_main[-1]) for blocks in self.blocks) + x_main.append(self.attn_block(x_main[-1], guide)) + return self.final_conv(torch.cat(x_main, 1)) + + +@MODELS.register_module() +class ImagePoolingAttentionModule(nn.Module): + def __init__(self, + image_channels: List[int], + text_channels: int, + embed_channels: int, + with_scale: bool = False, + num_feats: int = 3, + num_heads: int = 8, + pool_size: int = 3): + super().__init__() + + self.text_channels = text_channels + self.embed_channels = embed_channels + self.num_heads = num_heads + self.num_feats = num_feats + self.head_channels = embed_channels // num_heads + self.pool_size = pool_size + + if with_scale: + self.scale = 
nn.Parameter(torch.tensor([0.]), requires_grad=True) + else: + self.scale = 1.0 + self.projections = nn.ModuleList([ + ConvModule(in_channels, embed_channels, 1, act_cfg=None) + for in_channels in image_channels + ]) + self.query = nn.Sequential(nn.LayerNorm(text_channels), + Linear(text_channels, embed_channels)) + self.key = nn.Sequential(nn.LayerNorm(embed_channels), + Linear(embed_channels, embed_channels)) + self.value = nn.Sequential(nn.LayerNorm(embed_channels), + Linear(embed_channels, embed_channels)) + self.proj = Linear(embed_channels, text_channels) + + self.image_pools = nn.ModuleList([ + nn.AdaptiveMaxPool2d((pool_size, pool_size)) + for _ in range(num_feats) + ]) + + def forward(self, text_features, image_features): + B = image_features[0].shape[0] + assert len(image_features) == self.num_feats + num_patches = self.pool_size**2 + mlvl_image_features = [ + pool(proj(x)).view(B, -1, num_patches) + for (x, proj, pool + ) in zip(image_features, self.projections, self.image_pools) + ] + mlvl_image_features = torch.cat(mlvl_image_features, + dim=-1).transpose(1, 2) + q = self.query(text_features) + k = self.key(mlvl_image_features) + v = self.value(mlvl_image_features) + + q = q.reshape(B, -1, self.num_heads, self.head_channels) + k = k.reshape(B, -1, self.num_heads, self.head_channels) + v = v.reshape(B, -1, self.num_heads, self.head_channels) + + attn_weight = torch.einsum('bnmc,bkmc->bmnk', q, k) + attn_weight = attn_weight / (self.head_channels**0.5) + attn_weight = F.softmax(attn_weight, dim=-1) + + x = torch.einsum('bmnk,bkmc->bnmc', attn_weight, v) + x = self.proj(x.reshape(B, -1, self.embed_channels)) + return x * self.scale + text_features + + +@MODELS.register_module() +class VanillaSigmoidBlock(BaseModule): + """Sigmoid attention block.""" + def __init__(self, + in_channels: int, + out_channels: int, + guide_channels: int, + embed_channels: int, + kernel_size: int = 3, + padding: int = 1, + num_heads: int = 1, + use_depthwise: bool = False, + with_scale: bool = False, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict(type='BN', + momentum=0.03, + eps=0.001), + init_cfg: OptMultiConfig = None) -> None: + super().__init__(init_cfg=init_cfg) + conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule + + assert (out_channels % num_heads == 0 and + embed_channels % num_heads == 0), \ + 'out_channels and embed_channels should be divisible by num_heads.' 
+ self.num_heads = num_heads + self.head_channels = out_channels // num_heads + + self.project_conv = conv(in_channels, + out_channels, + kernel_size, + stride=1, + padding=padding, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + + def forward(self, x: Tensor, guide: Tensor) -> Tensor: + """Forward process.""" + x = self.project_conv(x) + x = x * x.sigmoid() + return x + + +@MODELS.register_module() +class EfficientCSPLayerWithTwoConv(CSPLayerWithTwoConv): + """Sigmoid-attention based CSP layer with two convolution layers.""" + def __init__( + self, + in_channels: int, + out_channels: int, + guide_channels: int, + embed_channels: int, + num_heads: int = 1, + expand_ratio: float = 0.5, + num_blocks: int = 1, + with_scale: bool = False, + add_identity: bool = True, # shortcut + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict(type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None) -> None: + super().__init__(in_channels=in_channels, + out_channels=out_channels, + expand_ratio=expand_ratio, + num_blocks=num_blocks, + add_identity=add_identity, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + init_cfg=init_cfg) + + self.final_conv = ConvModule((3 + num_blocks) * self.mid_channels, + out_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self.attn_block = VanillaSigmoidBlock(self.mid_channels, + self.mid_channels, + guide_channels=guide_channels, + embed_channels=embed_channels, + num_heads=num_heads, + with_scale=with_scale, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg) + + def forward(self, x: Tensor, guide: Tensor) -> Tensor: + """Forward process.""" + x_main = self.main_conv(x) + x_main = list(x_main.split((self.mid_channels, self.mid_channels), 1)) + x_main.extend(blocks(x_main[-1]) for blocks in self.blocks) + x_main.append(self.attn_block(x_main[-1], guide)) + return self.final_conv(torch.cat(x_main, 1)) diff --git a/yolo_world/models/losses/__init__.py b/yolo_world/models/losses/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8423e30498fa69a08b9d66b492261cbfdec9e4f3 --- /dev/null +++ b/yolo_world/models/losses/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) Tencent Inc. All rights reserved. +from .dynamic_loss import CoVMSELoss + +__all__ = ['CoVMSELoss'] diff --git a/yolo_world/models/losses/dynamic_loss.py b/yolo_world/models/losses/dynamic_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..30d56c3afd8ef22867bf5cba919e4a2844577688 --- /dev/null +++ b/yolo_world/models/losses/dynamic_loss.py @@ -0,0 +1,38 @@ +# Copyright (c) Tencent Inc. All rights reserved. 
+from typing import Optional + +import torch +import torch.nn as nn +from torch import Tensor +from mmdet.models.losses.mse_loss import mse_loss +from mmyolo.registry import MODELS + + +@MODELS.register_module() +class CoVMSELoss(nn.Module): + + def __init__(self, + dim: int = 0, + reduction: str = 'mean', + loss_weight: float = 1.0, + eps: float = 1e-6) -> None: + super().__init__() + self.dim = dim + self.reduction = reduction + self.loss_weight = loss_weight + self.eps = eps + + def forward(self, + pred: Tensor, + weight: Optional[Tensor] = None, + avg_factor: Optional[int] = None, + reduction_override: Optional[str] = None) -> Tensor: + """Forward function of loss.""" + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + cov = pred.std(self.dim) / pred.mean(self.dim).clamp(min=self.eps) + target = torch.zeros_like(cov) + loss = self.loss_weight * mse_loss( + cov, target, weight, reduction=reduction, avg_factor=avg_factor) + return loss diff --git a/yolo_world/models/necks/__init__.py b/yolo_world/models/necks/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5cec4289156bd90738cbfe64c2b6f3c11e1602d1 --- /dev/null +++ b/yolo_world/models/necks/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) Tencent Inc. All rights reserved. +from .yolo_world_pafpn import YOLOWorldPAFPN, YOLOWolrdDualPAFPN + +__all__ = ['YOLOWorldPAFPN', 'YOLOWolrdDualPAFPN'] diff --git a/yolo_world/models/necks/yolo_world_pafpn.py b/yolo_world/models/necks/yolo_world_pafpn.py new file mode 100644 index 0000000000000000000000000000000000000000..d8cbf11ef79a1457cb2e6d89eff37b2e1b38bd4d --- /dev/null +++ b/yolo_world/models/necks/yolo_world_pafpn.py @@ -0,0 +1,235 @@ +# Copyright (c) Tencent Inc. All rights reserved. +import copy +from typing import List, Union + +import torch +import torch.nn as nn +from torch import Tensor +from mmdet.utils import ConfigType, OptMultiConfig + +from mmyolo.registry import MODELS +from mmyolo.models.utils import make_divisible, make_round +from mmyolo.models.necks.yolov8_pafpn import YOLOv8PAFPN + + +@MODELS.register_module() +class YOLOWorldPAFPN(YOLOv8PAFPN): + """Path Aggregation Network used in YOLO World + Following YOLOv8 PAFPN, including text to image fusion + """ + def __init__(self, + in_channels: List[int], + out_channels: Union[List[int], int], + guide_channels: int, + embed_channels: List[int], + num_heads: List[int], + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + num_csp_blocks: int = 3, + freeze_all: bool = False, + block_cfg: ConfigType = dict(type='CSPLayerWithTwoConv'), + norm_cfg: ConfigType = dict(type='BN', + momentum=0.03, + eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: OptMultiConfig = None) -> None: + self.guide_channels = guide_channels + self.embed_channels = embed_channels + self.num_heads = num_heads + self.block_cfg = block_cfg + super().__init__(in_channels=in_channels, + out_channels=out_channels, + deepen_factor=deepen_factor, + widen_factor=widen_factor, + num_csp_blocks=num_csp_blocks, + freeze_all=freeze_all, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + init_cfg=init_cfg) + + def build_top_down_layer(self, idx: int) -> nn.Module: + """build top down layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The top down layer. 
+ """ + block_cfg = copy.deepcopy(self.block_cfg) + block_cfg.update( + dict(in_channels=make_divisible( + (self.in_channels[idx - 1] + self.in_channels[idx]), + self.widen_factor), + out_channels=make_divisible(self.out_channels[idx - 1], + self.widen_factor), + guide_channels=self.guide_channels, + embed_channels=make_round(self.embed_channels[idx - 1], + self.widen_factor), + num_heads=make_round(self.num_heads[idx - 1], + self.widen_factor), + num_blocks=make_round(self.num_csp_blocks, + self.deepen_factor), + add_identity=False, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + return MODELS.build(block_cfg) + + def build_bottom_up_layer(self, idx: int) -> nn.Module: + """build bottom up layer. + + Args: + idx (int): layer idx. + + Returns: + nn.Module: The bottom up layer. + """ + block_cfg = copy.deepcopy(self.block_cfg) + block_cfg.update( + dict(in_channels=make_divisible( + (self.out_channels[idx] + self.out_channels[idx + 1]), + self.widen_factor), + out_channels=make_divisible(self.out_channels[idx + 1], + self.widen_factor), + guide_channels=self.guide_channels, + embed_channels=make_round(self.embed_channels[idx + 1], + self.widen_factor), + num_heads=make_round(self.num_heads[idx + 1], + self.widen_factor), + num_blocks=make_round(self.num_csp_blocks, + self.deepen_factor), + add_identity=False, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + return MODELS.build(block_cfg) + + def forward(self, img_feats: List[Tensor], txt_feats: Tensor) -> tuple: + """Forward function. + including multi-level image features, text features: BxLxD + """ + assert len(img_feats) == len(self.in_channels) + # reduce layers + reduce_outs = [] + for idx in range(len(self.in_channels)): + reduce_outs.append(self.reduce_layers[idx](img_feats[idx])) + + # top-down path + inner_outs = [reduce_outs[-1]] + for idx in range(len(self.in_channels) - 1, 0, -1): + feat_high = inner_outs[0] + feat_low = reduce_outs[idx - 1] + upsample_feat = self.upsample_layers[len(self.in_channels) - 1 - + idx](feat_high) + if self.upsample_feats_cat_first: + top_down_layer_inputs = torch.cat([upsample_feat, feat_low], 1) + else: + top_down_layer_inputs = torch.cat([feat_low, upsample_feat], 1) + inner_out = self.top_down_layers[len(self.in_channels) - 1 - idx]( + top_down_layer_inputs, txt_feats) + inner_outs.insert(0, inner_out) + + # bottom-up path + outs = [inner_outs[0]] + for idx in range(len(self.in_channels) - 1): + feat_low = outs[-1] + feat_high = inner_outs[idx + 1] + downsample_feat = self.downsample_layers[idx](feat_low) + out = self.bottom_up_layers[idx](torch.cat( + [downsample_feat, feat_high], 1), txt_feats) + outs.append(out) + + # out_layers + results = [] + for idx in range(len(self.in_channels)): + results.append(self.out_layers[idx](outs[idx])) + + return tuple(results) + + +@MODELS.register_module() +class YOLOWolrdDualPAFPN(YOLOWorldPAFPN): + """Path Aggregation Network used in YOLO World v8.""" + def __init__(self, + in_channels: List[int], + out_channels: Union[List[int], int], + guide_channels: int, + embed_channels: List[int], + num_heads: List[int], + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + num_csp_blocks: int = 3, + freeze_all: bool = False, + text_enhancder: ConfigType = dict( + type='ImagePoolingAttentionModule', + embed_channels=256, + num_heads=8, + pool_size=3), + block_cfg: ConfigType = dict(type='CSPLayerWithTwoConv'), + norm_cfg: ConfigType = dict(type='BN', + momentum=0.03, + eps=0.001), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + init_cfg: 
OptMultiConfig = None) -> None: + super().__init__(in_channels=in_channels, + out_channels=out_channels, + guide_channels=guide_channels, + embed_channels=embed_channels, + num_heads=num_heads, + deepen_factor=deepen_factor, + widen_factor=widen_factor, + num_csp_blocks=num_csp_blocks, + freeze_all=freeze_all, + block_cfg=block_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + init_cfg=init_cfg) + + text_enhancder.update( + dict( + image_channels=[int(x * widen_factor) for x in out_channels], + text_channels=guide_channels, + num_feats=len(out_channels), + )) + print(text_enhancder) + self.text_enhancer = MODELS.build(text_enhancder) + + def forward(self, img_feats: List[Tensor], txt_feats: Tensor) -> tuple: + """Forward function.""" + assert len(img_feats) == len(self.in_channels) + # reduce layers + reduce_outs = [] + for idx in range(len(self.in_channels)): + reduce_outs.append(self.reduce_layers[idx](img_feats[idx])) + + # top-down path + inner_outs = [reduce_outs[-1]] + for idx in range(len(self.in_channels) - 1, 0, -1): + feat_high = inner_outs[0] + feat_low = reduce_outs[idx - 1] + upsample_feat = self.upsample_layers[len(self.in_channels) - 1 - + idx](feat_high) + if self.upsample_feats_cat_first: + top_down_layer_inputs = torch.cat([upsample_feat, feat_low], 1) + else: + top_down_layer_inputs = torch.cat([feat_low, upsample_feat], 1) + inner_out = self.top_down_layers[len(self.in_channels) - 1 - idx]( + top_down_layer_inputs, txt_feats) + inner_outs.insert(0, inner_out) + + txt_feats = self.text_enhancer(txt_feats, inner_outs) + # bottom-up path + outs = [inner_outs[0]] + for idx in range(len(self.in_channels) - 1): + feat_low = outs[-1] + feat_high = inner_outs[idx + 1] + downsample_feat = self.downsample_layers[idx](feat_low) + out = self.bottom_up_layers[idx](torch.cat( + [downsample_feat, feat_high], 1), txt_feats) + outs.append(out) + + # out_layers + results = [] + for idx in range(len(self.in_channels)): + results.append(self.out_layers[idx](outs[idx])) + + return tuple(results) diff --git a/yolo_world/version.py b/yolo_world/version.py new file mode 100644 index 0000000000000000000000000000000000000000..07900455d9d57a108cdc6dd0b307442874bc6f40 --- /dev/null +++ b/yolo_world/version.py @@ -0,0 +1,27 @@ +# Copyright (c) Tencent Inc. All rights reserved. +__version__ = '0.1.0' + + +def parse_version_info(version_str): + """Parse a version string into a tuple. + + Args: + version_str (str): The version string. + Returns: + tuple[int | str]: The version info, e.g., "1.3.0" is parsed into + (1, 3, 0), and "2.0.0rc1" is parsed into (2, 0, 0, 'rc1'). + """ + version_info = [] + for x in version_str.split('.'): + if x.isdigit(): + version_info.append(int(x)) + elif x.find('rc') != -1: + patch_version = x.split('rc') + version_info.append(int(patch_version[0])) + version_info.append(f'rc{patch_version[1]}') + return tuple(version_info) + + +version_info = parse_version_info(__version__) + +__all__ = ['__version__', 'version_info', 'parse_version_info']
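
The head's `predict_by_feat` above filters and NMS-processes the flattened predictions, then maps the surviving boxes back to original-image coordinates by first undoing the letterbox padding and then the resize. Below is a minimal plain-PyTorch sketch of that coordinate mapping with made-up numbers; the `(top, bottom, left, right)` layout of `pad_param` and the `(w_scale, h_scale)` order of `scale_factor` are assumptions for illustration, not taken from the repo.

```python
import torch

# Map predicted boxes from network-input coordinates back to the original image,
# mirroring the rescale step in predict_by_feat (illustrative numbers only).
bboxes = torch.tensor([[120., 80., 360., 240.]])   # xyxy boxes on the padded, resized input
pad_param = (8., 8., 16., 16.)                      # assumed (top, bottom, left, right) letterbox padding
scale_factor = (0.5, 0.5)                           # assumed (w_scale, h_scale) used during resizing

# undo the padding offset (left for x, top for y), then undo the resize
bboxes -= bboxes.new_tensor([pad_param[2], pad_param[0], pad_param[2], pad_param[0]])
bboxes /= bboxes.new_tensor(scale_factor).repeat((1, 2))
print(bboxes)  # boxes in original-image coordinates
```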
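
`YOLOWorldDetector.extract_feat` picks the class texts from one of three sources: a dict-style batch, a list of data samples carrying per-image vocabularies, or `self.texts` previously registered via `reparameterize(texts)` for the prompt-then-detect use case where the vocabulary is fixed offline. The stand-alone function below mirrors only that branching logic; `resolve_texts` is a hypothetical name introduced here, not an API of the repo.

```python
from typing import List, Optional, Union

def resolve_texts(batch_data_samples: Optional[Union[dict, list]],
                  offline_texts: Optional[List[List[str]]]) -> List[List[str]]:
    """Stand-alone mirror of the text-selection branching in extract_feat()."""
    if batch_data_samples is None:
        # offline vocabulary previously registered via reparameterize(texts)
        return offline_texts
    if isinstance(batch_data_samples, dict):
        return batch_data_samples['texts']
    if isinstance(batch_data_samples, list):
        # mmdet-style data samples, each carrying its own per-image vocabulary
        return [sample.texts for sample in batch_data_samples]
    raise TypeError('batch_data_samples should be dict or list.')

print(resolve_texts({'texts': [['person', 'bus']]}, None))    # texts from the data pipeline
print(resolve_texts(None, [['person', 'bus', 'backpack']]))   # prompt-then-detect vocabulary
```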
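
`MaxSigmoidAttnBlock` scores every spatial location of the image embedding against every text guide token per head, keeps only the best-matching token, scales by the square root of the head width, adds a learned bias, and uses the resulting sigmoid as a channel-group gate on the projected features. A minimal pure-PyTorch sketch of just that gating math follows; all sizes are illustrative and the convolutional projections are replaced by random tensors.

```python
import torch

# Illustrative sizes: batch 2, 1 head, 32 channels per head, 8x8 feature map, 5 guide tokens
B, num_heads, head_channels, H, W, N = 2, 1, 32, 8, 8, 5

embed = torch.randn(B, num_heads, head_channels, H, W)   # stands in for the embedded image features
guide = torch.randn(B, N, num_heads, head_channels)      # stands in for the projected text guides
bias = torch.zeros(num_heads)

# similarity of every spatial location with every guide token, per head
attn = torch.einsum('bmchw,bnmc->bmhwn', embed, guide)
# keep only the best-matching guide token ("max-sigmoid" attention)
attn = attn.max(dim=-1).values / head_channels**0.5
attn = (attn + bias[None, :, None, None]).sigmoid()       # (B, num_heads, H, W) gate in [0, 1]

x = torch.randn(B, num_heads * head_channels, H, W)       # stands in for project_conv(x)
x = x.reshape(B, num_heads, -1, H, W) * attn.unsqueeze(2) # gate each head's channel group
x = x.reshape(B, -1, H, W)
print(x.shape)  # torch.Size([2, 32, 8, 8])
```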
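
`ImagePoolingAttentionModule` condenses each of the `num_feats` feature levels into a `pool_size x pool_size` grid of patches, flattens them into a shared token sequence, and lets the text features cross-attend to those tokens. The sketch below shows only the token-building step and assumes the feature channels already equal `embed_channels`; the real module additionally applies a 1x1 projection per level and multi-head attention.

```python
import torch
import torch.nn as nn

# Illustrative: 3 feature levels, each pooled to 3x3 patches -> 27 image tokens per image
pool_size, embed_channels = 3, 64
feats = [torch.randn(2, embed_channels, s, s) for s in (32, 16, 8)]   # multi-level image features
pools = [nn.AdaptiveMaxPool2d((pool_size, pool_size)) for _ in feats]

tokens = torch.cat(
    [pool(x).view(2, embed_channels, pool_size**2) for x, pool in zip(feats, pools)],
    dim=-1).transpose(1, 2)                                            # (B, 27, embed_channels)
print(tokens.shape)  # torch.Size([2, 27, 64])
```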
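
`CoVMSELoss` computes the coefficient of variation of its input along `dim`, i.e. `std(pred) / clamp(mean(pred), eps)`, and penalizes it with an MSE against a zero target, so the loss is minimized when the input has no spread. A plain-torch restatement of that math (mean reduction, no sample weights) as a quick sanity check:

```python
import torch
import torch.nn.functional as F

def cov_mse_loss(pred: torch.Tensor, dim: int = 0, eps: float = 1e-6) -> torch.Tensor:
    """Plain-torch restatement of the CoVMSELoss math (mean reduction, no sample weights)."""
    cov = pred.std(dim) / pred.mean(dim).clamp(min=eps)   # coefficient of variation
    return F.mse_loss(cov, torch.zeros_like(cov))          # push the spread toward zero

print(cov_mse_loss(torch.rand(8, 4) + 1.0))  # small positive value for mildly varying input
print(cov_mse_loss(torch.ones(8, 4)))        # exactly 0 for constant input
```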
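
`YOLOWorldPAFPN` builds its top-down and bottom-up fusion layers from `block_cfg`, so selecting `MaxSigmoidCSPLayerWithTwoConv` there is what makes the neck text-guided; `guide_channels` must match the width of the text embeddings passed in as `txt_feats`. The snippet below is an illustrative mmengine-style declaration only; every channel number and head count is made up rather than copied from `configs/pretrain`.

```python
# Illustrative neck declaration (values are assumptions, not the repo's configs):
neck = dict(
    type='YOLOWorldPAFPN',
    in_channels=[256, 512, 1024],
    out_channels=[256, 512, 1024],
    guide_channels=512,                  # dimensionality of the text embeddings fed as `guide`
    embed_channels=[128, 256, 512],      # per-level attention embedding widths
    num_heads=[4, 8, 16],
    block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'))
```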
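
`parse_version_info` in `yolo_world/version.py` splits the version string on dots and peels off an `rc` suffix into its own element. A short usage check, assuming the `yolo_world` package is importable (e.g. installed or on `PYTHONPATH`):

```python
from yolo_world.version import __version__, parse_version_info

print(parse_version_info('0.1.0'))      # (0, 1, 0)
print(parse_version_info('2.0.0rc1'))   # (2, 0, 0, 'rc1')
print(parse_version_info(__version__))  # same tuple as yolo_world.version.version_info
```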