Pinwheel committed on
Commit
128757a
1 Parent(s): 75d666b
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. .gitignore +131 -0
  2. LICENSE +29 -0
  3. README.md +94 -13
  4. app.py +57 -0
  5. checkpoints/glip_tiny_model_o365_goldg_cc_sbu.pth +3 -0
  6. checkpoints/model_base_vqa_capfilt_large.pth +3 -0
  7. configs/glip_Swin_T_O365_GoldG.yaml +100 -0
  8. configs/med_config.json +21 -0
  9. configs/vqa.yaml +25 -0
  10. itm.py +77 -0
  11. maskrcnn_benchmark/__init__.py +1 -0
  12. maskrcnn_benchmark/config/__init__.py +3 -0
  13. maskrcnn_benchmark/config/defaults.py +861 -0
  14. maskrcnn_benchmark/config/paths_catalog.py +447 -0
  15. maskrcnn_benchmark/csrc/ROIAlign.h +46 -0
  16. maskrcnn_benchmark/csrc/ROIPool.h +48 -0
  17. maskrcnn_benchmark/csrc/SigmoidFocalLoss.h +41 -0
  18. maskrcnn_benchmark/csrc/cpu/ROIAlign_cpu.cpp +257 -0
  19. maskrcnn_benchmark/csrc/cpu/nms_cpu.cpp +75 -0
  20. maskrcnn_benchmark/csrc/cpu/soft_nms.cpp +117 -0
  21. maskrcnn_benchmark/csrc/cpu/vision.h +22 -0
  22. maskrcnn_benchmark/csrc/cuda/ROIAlign_cuda.cu +346 -0
  23. maskrcnn_benchmark/csrc/cuda/ROIPool_cuda.cu +202 -0
  24. maskrcnn_benchmark/csrc/cuda/SigmoidFocalLoss_cuda.cu +188 -0
  25. maskrcnn_benchmark/csrc/cuda/deform_conv_cuda.cu +691 -0
  26. maskrcnn_benchmark/csrc/cuda/deform_conv_kernel_cuda.cu +874 -0
  27. maskrcnn_benchmark/csrc/cuda/deform_pool_cuda.cu +87 -0
  28. maskrcnn_benchmark/csrc/cuda/deform_pool_kernel_cuda.cu +365 -0
  29. maskrcnn_benchmark/csrc/cuda/ml_nms.cu +136 -0
  30. maskrcnn_benchmark/csrc/cuda/nms.cu +131 -0
  31. maskrcnn_benchmark/csrc/cuda/vision.h +116 -0
  32. maskrcnn_benchmark/csrc/deform_conv.h +191 -0
  33. maskrcnn_benchmark/csrc/deform_pool.h +70 -0
  34. maskrcnn_benchmark/csrc/ml_nms.h +27 -0
  35. maskrcnn_benchmark/csrc/nms.h +45 -0
  36. maskrcnn_benchmark/csrc/vision.cpp +27 -0
  37. maskrcnn_benchmark/data/__init__.py +2 -0
  38. maskrcnn_benchmark/data/build.py +489 -0
  39. maskrcnn_benchmark/data/collate_batch.py +93 -0
  40. maskrcnn_benchmark/data/datasets/__init__.py +23 -0
  41. maskrcnn_benchmark/data/datasets/background.py +53 -0
  42. maskrcnn_benchmark/data/datasets/box_label_loader.py +251 -0
  43. maskrcnn_benchmark/data/datasets/caption.py +279 -0
  44. maskrcnn_benchmark/data/datasets/coco.py +268 -0
  45. maskrcnn_benchmark/data/datasets/coco_dt.py +154 -0
  46. maskrcnn_benchmark/data/datasets/concat_dataset.py +23 -0
  47. maskrcnn_benchmark/data/datasets/custom_distributed_sampler.py +185 -0
  48. maskrcnn_benchmark/data/datasets/duplicate_dataset.py +31 -0
  49. maskrcnn_benchmark/data/datasets/evaluation/__init__.py +56 -0
  50. maskrcnn_benchmark/data/datasets/evaluation/box_aug.py +349 -0
.gitignore ADDED
@@ -0,0 +1,131 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ pip-wheel-metadata/
24
+ share/python-wheels/
25
+ *.egg-info/
26
+ .installed.cfg
27
+ *.egg
28
+ MANIFEST
29
+
30
+ # PyInstaller
31
+ # Usually these files are written by a python script from a template
32
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
33
+ *.manifest
34
+ *.spec
35
+
36
+ # Installer logs
37
+ pip-log.txt
38
+ pip-delete-this-directory.txt
39
+
40
+ # Unit test / coverage reports
41
+ htmlcov/
42
+ .tox/
43
+ .nox/
44
+ .coverage
45
+ .coverage.*
46
+ .cache
47
+ nosetests.xml
48
+ coverage.xml
49
+ *.cover
50
+ *.py,cover
51
+ .hypothesis/
52
+ .pytest_cache/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ target/
76
+
77
+ # Jupyter Notebook
78
+ .ipynb_checkpoints
79
+
80
+ # IPython
81
+ profile_default/
82
+ ipython_config.py
83
+
84
+ # pyenv
85
+ .python-version
86
+
87
+ # pipenv
88
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
90
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
91
+ # install all needed dependencies.
92
+ #Pipfile.lock
93
+
94
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95
+ __pypackages__/
96
+
97
+ # Celery stuff
98
+ celerybeat-schedule
99
+ celerybeat.pid
100
+
101
+ # SageMath parsed files
102
+ *.sage.py
103
+
104
+ # Environments
105
+ .env
106
+ .venv
107
+ env/
108
+ venv/
109
+ ENV/
110
+ env.bak/
111
+ venv.bak/
112
+
113
+ # Spyder project settings
114
+ .spyderproject
115
+ .spyproject
116
+
117
+ # Rope project settings
118
+ .ropeproject
119
+
120
+ # mkdocs documentation
121
+ /site
122
+
123
+ # mypy
124
+ .mypy_cache/
125
+ .dmypy.json
126
+ dmypy.json
127
+
128
+ # Pyre type checker
129
+ .pyre/\
130
+
131
+ flagged/
LICENSE ADDED
@@ -0,0 +1,29 @@
1
+ BSD 3-Clause License
2
+
3
+ Copyright (c) 2022, Aastha Singh
4
+ All rights reserved.
5
+
6
+ Redistribution and use in source and binary forms, with or without
7
+ modification, are permitted provided that the following conditions are met:
8
+
9
+ 1. Redistributions of source code must retain the above copyright notice, this
10
+ list of conditions and the following disclaimer.
11
+
12
+ 2. Redistributions in binary form must reproduce the above copyright notice,
13
+ this list of conditions and the following disclaimer in the documentation
14
+ and/or other materials provided with the distribution.
15
+
16
+ 3. Neither the name of the copyright holder nor the names of its
17
+ contributors may be used to endorse or promote products derived from
18
+ this software without specific prior written permission.
19
+
20
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
README.md CHANGED
@@ -1,13 +1,94 @@
1
- ---
2
- title: GLIP BLIP Object Detection VQA
3
- emoji: 📊
4
- colorFrom: indigo
5
- colorTo: pink
6
- sdk: gradio
7
- sdk_version: 3.4.1
8
- app_file: app.py
9
- pinned: false
10
- license: bsd-3-clause
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
+ # Vision-Language Object Detection and Visual Question Answering
2
+ This repository combines Microsoft's GLIP and Salesforce's BLIP into an ensembled demo for text-prompted object detection and Visual Question Answering.
3
+
4
+ <br />
5
+
6
+ ## About GLIP: Grounded Language-Image Pre-training -
7
+ > GLIP demonstrates strong zero-shot and few-shot transferability to various object-level recognition tasks.
8
+
9
+ > The model used in this repo is GLIP-T, which was originally pre-trained on Conceptual Captions 3M and SBU captions.
10
+
11
+ <br />
12
+
13
+ ## About BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation -
14
+
15
+ > BLIP introduces a new model architecture that enables a wider range of downstream tasks than existing methods, and a new dataset bootstrapping method for learning from noisy web data.
16
+
17
+ <br />
18
+
19
+ ## Installation and Setup
20
+
21
+ ***Environment*** - Due to limitations with `maskrcnn_benchmark`, this repo requires PyTorch 1.10 and torchvision.
22
+
23
+ Use `requirements.txt` to install dependencies
24
+
25
+ ```sh
26
+ pip3 install -r requirements.txt
27
+ ```
28
+ Build `maskrcnn_benchmark`
29
+ ```sh
30
+ python setup.py build develop --user
31
+ ```
32
+
33
+ To verify a successful build, check the terminal for the message
34
+ "Finished processing dependencies for maskrcnn-benchmark==0.1"
35
+
36
+ ## Checkpoints
37
+
38
+ > Download the pre-trained models into the `checkpoints` folder.
39
+
40
+ <br />
41
+
42
+ ```sh
43
+ mkdir checkpoints
44
+ cd checkpoints
45
+ ```
46
+
47
+ Model | Weight
48
+ -- | --
49
+ **GLIP-T** | [weight](https://drive.google.com/file/d/1nlPL6PHkslarP6RiWJJu6QGKjqHG4tkc/view?usp=sharing)
50
+ **BLIP** | [weight](https://drive.google.com/file/d/1QliNGiAcyCCJLd22eNOxWvMUDzb7GzrO/view?usp=sharing)
51
+
52
+ <br />
53
+
54
+ ## If you have an NVIDIA GPU with 8 GB of VRAM, run the local demo using the Gradio interface
55
+
56
+ ```sh
57
+ python3 app.py
58
+ ```
59
+ ## Future Work
60
+
61
+ - [x] Frame-based Visual Question Answering
62
+ - [ ] Per-object Visual Question Answering
63
+
64
+
65
+ ## Citations
66
+
67
+ ```txt
68
+ @inproceedings{li2022blip,
69
+ title={BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation},
70
+ author={Junnan Li and Dongxu Li and Caiming Xiong and Steven Hoi},
71
+ year={2022},
72
+ booktitle={ICML},
73
+ }
74
+ @inproceedings{li2021grounded,
75
+ title={Grounded Language-Image Pre-training},
76
+ author={Liunian Harold Li* and Pengchuan Zhang* and Haotian Zhang* and Jianwei Yang and Chunyuan Li and Yiwu Zhong and Lijuan Wang and Lu Yuan and Lei Zhang and Jenq-Neng Hwang and Kai-Wei Chang and Jianfeng Gao},
77
+ year={2022},
78
+ booktitle={CVPR},
79
+ }
80
+ @article{zhang2022glipv2,
81
+ title={GLIPv2: Unifying Localization and Vision-Language Understanding},
82
+ author={Zhang, Haotian* and Zhang, Pengchuan* and Hu, Xiaowei and Chen, Yen-Chun and Li, Liunian Harold and Dai, Xiyang and Wang, Lijuan and Yuan, Lu and Hwang, Jenq-Neng and Gao, Jianfeng},
83
+ journal={arXiv preprint arXiv:2206.05836},
84
+ year={2022}
85
+ }
86
+ @article{li2022elevater,
87
+ title={ELEVATER: A Benchmark and Toolkit for Evaluating Language-Augmented Visual Models},
88
+ author={Li*, Chunyuan and Liu*, Haotian and Li, Liunian Harold and Zhang, Pengchuan and Aneja, Jyoti and Yang, Jianwei and Jin, Ping and Lee, Yong Jae and Hu, Houdong and Liu, Zicheng and others},
89
+ journal={arXiv preprint arXiv:2204.08790},
90
+ year={2022}
91
+ }
92
+ ```
93
+ ## Acknowledgement
94
+ The implementation of this work relies on resources from <a href="https://github.com/salesforce/BLIP">BLIP</a>, <a href="https://github.com/microsoft/GLIP">GLIP</a>, <a href="https://github.com/huggingface/transformers">Huggingface Transformers</a>, and <a href="https://github.com/rwightman/pytorch-image-models/tree/master/timm">timm</a>. We thank the original authors for open-sourcing their work.
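
The README's environment requirements (PyTorch 1.10, an NVIDIA GPU, a successful `maskrcnn_benchmark` build) can be sanity-checked with a few lines of Python. This is only a sketch, not one of the committed files; it assumes the default maskrcnn-benchmark build, which exposes its compiled extension as `maskrcnn_benchmark._C`.

```python
# Sketch: sanity-check the environment the README describes. Assumptions are noted inline.
import torch
import torchvision
from maskrcnn_benchmark import _C  # noqa: F401  # compiled extension; assumes the default build

print(torch.__version__, torchvision.__version__)    # README expects PyTorch 1.10.x
print("CUDA available:", torch.cuda.is_available())  # the Gradio demo runs both models on CUDA
print("maskrcnn_benchmark C extension loaded")
```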
app.py ADDED
@@ -0,0 +1,57 @@
1
+ import os
2
+ import gradio as gr
3
+ import warnings
4
+
5
+ warnings.filterwarnings("ignore")
6
+
7
+ os.system("python setup.py build develop --user")
8
+
9
+ from maskrcnn_benchmark.config import cfg
10
+ from maskrcnn_benchmark.engine.predictor_glip import GLIPDemo
11
+ import vqa
13
+
14
+ # Use this config to evaluate the GLIP-T model
15
+ config_file = "configs/glip_Swin_T_O365_GoldG.yaml"
16
+ weight_file = "checkpoints/glip_tiny_model_o365_goldg_cc_sbu.pth"
17
+
18
+ # manually override some options
19
+ cfg.local_rank = 0
20
+ cfg.num_gpus = 1
21
+ cfg.merge_from_file(config_file)
22
+ cfg.merge_from_list(["MODEL.WEIGHT", weight_file])
23
+ cfg.merge_from_list(["MODEL.DEVICE", "cuda"])
24
+
25
+ glip_demo = GLIPDemo(
26
+ cfg,
27
+ min_image_size=800,
28
+ confidence_threshold=0.7,
29
+ show_mask_heatmaps=False
30
+ )
31
+ blip_demo = vqa.VQA(
32
+ model_path = 'checkpoints/model_base_vqa_capfilt_large.pth'
33
+ )
34
+
35
+ def predict(image, object, question):
36
+ result, _ = glip_demo.run_on_web_image(image[:, :, [2, 1, 0]], object, 0.5)
37
+ answer = blip_demo.vqa_demo(image, question)
38
+ return result[:, :, [2, 1, 0]], answer
39
+
40
+ image = gr.inputs.Image()
41
+
42
+ gr.Interface(
43
+ description="GLIP + BLIP VQA Demo.",
44
+ fn=predict,
45
+ inputs=[
46
+ "image",
47
+ gr.Textbox(label='Objects', lines=1, placeholder="Objects here.."),
48
+ gr.Textbox(label='Question', lines=1, placeholder="Question here..")],
49
+
50
+ outputs=[
51
+ gr.outputs.Image(
52
+ type="pil",
53
+ label="grounding results"
54
+ ),
55
+ gr.Textbox(label="Answer")
56
+ ],
57
+ ).launch()
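
A note on the `[:, :, [2, 1, 0]]` indexing in `predict`: Gradio passes the callback an RGB array, while the GLIP/maskrcnn-benchmark pipeline works on BGR images (see `INPUT.TO_BGR255` in `defaults.py`), so the channels are swapped on the way in and swapped back on the result. A small NumPy sketch (not part of the commit) showing that the same index is its own inverse:

```python
# Sketch: the RGB <-> BGR swap used in app.py's predict(); the index [2, 1, 0] is its own inverse.
import numpy as np

rgb = np.zeros((2, 2, 3), dtype=np.uint8)
rgb[..., 0] = 255                       # a pure-red RGB image

bgr = rgb[:, :, [2, 1, 0]]              # reorder channels: RGB -> BGR (what GLIP consumes)
assert bgr[0, 0, 2] == 255              # red now sits in the last channel of the BGR layout

restored = bgr[:, :, [2, 1, 0]]         # the same indexing flips it back, as done on the result
assert np.array_equal(restored, rgb)
```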
checkpoints/glip_tiny_model_o365_goldg_cc_sbu.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3bec0a3dea804fcb278d7106c5438de5116ee888e49dfae46270e7ad7bc4ccbf
3
+ size 3710104213
checkpoints/model_base_vqa_capfilt_large.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a7d546209f1ccfa8b3cd3a0138c53e0d1e95e4a4bc280bef8f67e20fe4925ae
3
+ size 1446244375
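
The two checkpoint files above are Git LFS pointer stubs: the `oid sha256:` line records the checksum of the real weight file, which the README says to download separately into `checkpoints/`. A sketch (not part of the commit) for verifying a downloaded file against that recorded hash; the path and expected digest are taken from the first pointer above:

```python
# Sketch: verify a downloaded checkpoint against the sha256 recorded in the LFS pointer.
import hashlib

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for block in iter(lambda: f.read(chunk_size), b""):
            digest.update(block)
    return digest.hexdigest()

expected = "3bec0a3dea804fcb278d7106c5438de5116ee888e49dfae46270e7ad7bc4ccbf"
assert sha256_of("checkpoints/glip_tiny_model_o365_goldg_cc_sbu.pth") == expected
```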
configs/glip_Swin_T_O365_GoldG.yaml ADDED
@@ -0,0 +1,100 @@
1
+ MODEL:
2
+ META_ARCHITECTURE: "GeneralizedVLRCNN"
3
+ WEIGHT: "swin_tiny_patch4_window7_224.pth"
4
+ RPN_ONLY: True
5
+ RPN_ARCHITECTURE: "VLDYHEAD"
6
+
7
+ BACKBONE:
8
+ CONV_BODY: "SWINT-FPN-RETINANET"
9
+ OUT_CHANNELS: 256
10
+ FREEZE_CONV_BODY_AT: -1
11
+
12
+ LANGUAGE_BACKBONE:
13
+ FREEZE: False
14
+ MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip"
15
+ MASK_SPECIAL: False
16
+
17
+ RPN:
18
+ USE_FPN: True
19
+ ANCHOR_SIZES: (64, 128, 256, 512, 1024)
20
+ ANCHOR_STRIDE: (8, 16, 32, 64, 128)
21
+ ASPECT_RATIOS: (1.0,)
22
+ SCALES_PER_OCTAVE: 1
23
+
24
+ DYHEAD:
25
+ CHANNELS: 256
26
+ NUM_CONVS: 6
27
+ USE_GN: True
28
+ USE_DYRELU: True
29
+ USE_DFCONV: True
30
+ USE_DYFUSE: True
31
+ TOPK: 9 # topk for selecting candidate positive samples from each level
32
+ SCORE_AGG: "MEAN"
33
+ LOG_SCALE: 0.0
34
+
35
+ FUSE_CONFIG:
36
+ EARLY_FUSE_ON: True
37
+ TYPE: "MHA-B"
38
+ USE_CLASSIFICATION_LOSS: False
39
+ USE_TOKEN_LOSS: False
40
+ USE_CONTRASTIVE_ALIGN_LOSS: False
41
+ CONTRASTIVE_HIDDEN_DIM: 64
42
+ USE_DOT_PRODUCT_TOKEN_LOSS: True
43
+ USE_FUSED_FEATURES_DOT_PRODUCT: True
44
+ USE_LAYER_SCALE: True
45
+ CLAMP_MIN_FOR_UNDERFLOW: True
46
+ CLAMP_MAX_FOR_OVERFLOW: True
47
+ CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
48
+ CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
49
+ CLAMP_DOT_PRODUCT: True
50
+
51
+ USE_CHECKPOINT: True
52
+
53
+ TEST:
54
+ DURING_TRAINING: False
55
+ IMS_PER_BATCH: 64
56
+
57
+ # use for grounding model
58
+ DATASETS:
59
+ TRAIN: ("object365_dt_train", "mixed_train_no_coco", "flickr30k_train", )
60
+ TEST: ("coco_2017_val", )
61
+ DISABLE_SHUFFLE: False
62
+ ADD_DET_PROMPT: False
63
+ RANDOM_SAMPLE_NEG: 85
64
+ CONTROL_PROB: (0.0, 0.0, 0.5, 0.0)
65
+
66
+ SEPARATION_TOKENS: ". "
67
+
68
+ INPUT:
69
+ PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
70
+ PIXEL_STD: [ 57.375, 57.120, 58.395 ]
71
+ MIN_SIZE_TRAIN: 800
72
+ MAX_SIZE_TRAIN: 1333
73
+ MIN_SIZE_TEST: 800
74
+ MAX_SIZE_TEST: 1333
75
+
76
+ AUGMENT:
77
+ MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
78
+
79
+ DATALOADER:
80
+ SIZE_DIVISIBILITY: 32
81
+
82
+ SOLVER:
83
+ OPTIMIZER: ADAMW
84
+ BASE_LR: 0.0001
85
+ LANG_LR: 0.00001
86
+ WEIGHT_DECAY: 0.0001
87
+ STEPS: (0.67, 0.89)
88
+ MAX_EPOCH: 30
89
+ IMS_PER_BATCH: 64
90
+ WARMUP_ITERS: 2000
91
+ WARMUP_FACTOR: 0.001
92
+ USE_AMP: True
93
+ MODEL_EMA: 0.999
94
+ FIND_UNUSED_PARAMETERS: False
95
+
96
+ CLIP_GRADIENTS:
97
+ ENABLED: True
98
+ CLIP_TYPE: "full_model"
99
+ CLIP_VALUE: 1.0
100
+ NORM_TYPE: 2.0
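
This YAML is consumed through yacs: `app.py` merges it on top of the defaults defined in `maskrcnn_benchmark/config/defaults.py`, so every key here must already exist there. A short sketch (not part of the commit) of the same merge plus a list-style override, assuming it is run from the repository root:

```python
# Sketch: merge this YAML into the yacs defaults, the same way app.py does it.
from maskrcnn_benchmark.config import cfg

cfg.merge_from_file("configs/glip_Swin_T_O365_GoldG.yaml")
cfg.merge_from_list(["MODEL.WEIGHT", "checkpoints/glip_tiny_model_o365_goldg_cc_sbu.pth",
                     "MODEL.DEVICE", "cuda"])

print(cfg.MODEL.META_ARCHITECTURE)                  # "GeneralizedVLRCNN", from this file
print(cfg.MODEL.DYHEAD.FUSE_CONFIG.EARLY_FUSE_ON)   # True here, overriding the False default
cfg.freeze()                                        # optionally lock the config before use
```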
configs/med_config.json ADDED
@@ -0,0 +1,21 @@
1
+ {
2
+ "architectures": [
3
+ "BertModel"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "hidden_act": "gelu",
7
+ "hidden_dropout_prob": 0.1,
8
+ "hidden_size": 768,
9
+ "initializer_range": 0.02,
10
+ "intermediate_size": 3072,
11
+ "layer_norm_eps": 1e-12,
12
+ "max_position_embeddings": 512,
13
+ "model_type": "bert",
14
+ "num_attention_heads": 12,
15
+ "num_hidden_layers": 12,
16
+ "pad_token_id": 0,
17
+ "type_vocab_size": 2,
18
+ "vocab_size": 30524,
19
+ "encoder_width": 768,
20
+ "add_cross_attention": true
21
+ }
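
`med_config.json` is a BERT-style configuration with cross-attention enabled and a 30,524-token vocabulary; BLIP builds its text transformer from it via its own `models.med` module. As a sketch (not part of the commit), the file also loads as a plain Hugging Face `BertConfig`, which makes the key fields easy to inspect:

```python
# Sketch: inspect med_config.json as a plain Hugging Face BertConfig.
from transformers import BertConfig

config = BertConfig.from_json_file("configs/med_config.json")
print(config.hidden_size)          # 768
print(config.add_cross_attention)  # True: text layers can attend to image features
print(config.vocab_size)           # 30524 (BERT's 30522 plus BLIP's extra special tokens)
```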
configs/vqa.yaml ADDED
@@ -0,0 +1,25 @@
1
+ vqa_root: '/export/share/datasets/vision/VQA/Images/mscoco/' #followed by train2014/
2
+ vg_root: '/export/share/datasets/vision/visual-genome/' #followed by image/
3
+ train_files: ['vqa_train','vqa_val','vg_qa']
4
+ ann_root: 'annotation'
5
+
6
+ # set pretrained as a file path or an url
7
+ pretrained: 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth'
8
+
9
+ # size of vit model; base or large
10
+ vit: 'base'
11
+ batch_size_train: 16
12
+ batch_size_test: 32
13
+ vit_grad_ckpt: False
14
+ vit_ckpt_layer: 0
15
+ init_lr: 2e-5
16
+
17
+ image_size: 480
18
+
19
+ k_test: 128
20
+ inference: 'rank'
21
+
22
+ # optimizer
23
+ weight_decay: 0.05
24
+ min_lr: 0
25
+ max_epoch: 10
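
This is BLIP's VQA fine-tuning/inference configuration. A sketch (not part of the commit) of reading it with PyYAML; one common gotcha: under PyYAML's YAML 1.1 resolver, `2e-5` without a decimal point is loaded as the string `'2e-5'`, so training code typically casts learning rates to `float`:

```python
# Sketch: read the VQA config with PyYAML; cast numeric-looking strings defensively.
import yaml

with open("configs/vqa.yaml") as f:
    config = yaml.safe_load(f)

print(config["image_size"], config["batch_size_train"])  # 480 16
init_lr = float(config["init_lr"])  # '2e-5' is parsed as a string by YAML 1.1 resolvers
print(init_lr)                      # 2e-05
```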
itm.py ADDED
@@ -0,0 +1,77 @@
1
+ import sys
2
+ from PIL import Image
3
+ import torch
4
+ from torchvision import transforms
5
+ from torchvision.transforms.functional import InterpolationMode
6
+ from models.blip_vqa import blip_vqa
7
+ from models.blip_itm import blip_itm
8
+
9
+
10
+ class VQA:
11
+ def __init__(self, model_path, image_size=480):
12
+ self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
13
+ self.model = blip_vqa(pretrained=model_path, image_size=image_size, vit='base')
14
+ self.model.eval()
15
+ self.model = self.model.to(self.device)
16
+
17
+ def load_demo_image(self, image_size, img_path, device):
18
+ raw_image = Image.open(img_path).convert('RGB')
19
+ w,h = raw_image.size
20
+ transform = transforms.Compose([
21
+ transforms.Resize((image_size,image_size),interpolation=InterpolationMode.BICUBIC),
22
+ transforms.ToTensor(),
23
+ transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
24
+ ])
25
+ image = transform(raw_image).unsqueeze(0).to(device)
26
+ return raw_image, image
27
+
28
+ def vqa(self, img_path, question):
29
+ raw_image, image = self.load_demo_image(image_size=480, img_path=img_path, device=self.device)
30
+ with torch.no_grad():
31
+ answer = self.model(image, question, train=False, inference='generate')
32
+ return answer[0]
33
+ class ITM:
34
+ def __init__(self, model_path, image_size=384):
35
+ self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
36
+ self.model = blip_itm(pretrained=model_path, image_size=image_size, vit='base')
37
+ self.model.eval()
38
+ self.model = self.model.to(self.device)  # keep the model on the same device as the inputs
39
+
40
+ def load_demo_image(self, image_size, img_path, device):
41
+ raw_image = Image.open(img_path).convert('RGB')
42
+ w,h = raw_image.size
43
+ transform = transforms.Compose([
44
+ transforms.Resize((image_size,image_size),interpolation=InterpolationMode.BICUBIC),
45
+ transforms.ToTensor(),
46
+ transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
47
+ ])
48
+ image = transform(raw_image).unsqueeze(0).to(device)
49
+ return raw_image, image
50
+
51
+ def itm(self, img_path, caption):
52
+ raw_image, image = self.load_demo_image(image_size=384,img_path=img_path, device=self.device)
53
+ itm_output = self.model(image,caption,match_head='itm')
54
+ itm_score = torch.nn.functional.softmax(itm_output,dim=1)[:,1]
55
+ itc_score = self.model(image,caption,match_head='itc')
56
+ # print('The image and text is matched with a probability of %.4f'%itm_score)
57
+ # print('The image feature and text feature has a cosine similarity of %.4f'%itc_score)
58
+ return itm_score, itc_score
59
+
60
+ if __name__=="__main__":
61
+ if not len(sys.argv) == 3:
62
+ print('Format: python3 itm.py <path_to_img> <caption>')
63
+ print('Sample: python3 itm.py sample.jpg "What is the color of the horse?"')
64
+
65
+ else:
66
+ model_path = 'checkpoints/model_base_vqa_capfilt_large.pth'
67
+ model2_path = 'model_base_retrieval_coco.pth'
68
+ # vqa_object = VQA(model_path=model_path)
69
+ itm_object = ITM(model_path=model2_path)
70
+ img_path = sys.argv[1]
71
+ # question = sys.argv[2]
72
+ caption = sys.argv[2]
73
+ # answer = vqa_object.vqa(img_path, caption)
74
+ itm_score, itc_score = itm_object.itm(img_path, caption)
75
+ # print('Question: {} | Answer: {}'.format(caption, answer))
76
+ print('Caption: {} | The image and text are matched with a probability of {:.4f} | The image and text features have a cosine similarity of {:.4f}'.format(caption, itm_score.item(), itc_score.item()))
77
+
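
The `__main__` block above only exercises the `ITM` class; the `VQA` class it defines is what the Gradio app uses for question answering. A minimal usage sketch (not part of the commit), assuming BLIP's `models` package is importable and the checkpoint from the README sits under `checkpoints/`; `sample.jpg` is a placeholder path:

```python
# Sketch: frame-level VQA with the class defined above (paths are placeholders).
from itm import VQA

vqa_model = VQA(model_path="checkpoints/model_base_vqa_capfilt_large.pth")
answer = vqa_model.vqa("sample.jpg", "What is the color of the horse?")
print(answer)
```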
maskrcnn_benchmark/__init__.py ADDED
@@ -0,0 +1 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
maskrcnn_benchmark/config/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+ from .defaults import _C as cfg
3
+ from .paths_catalog import try_to_find
maskrcnn_benchmark/config/defaults.py ADDED
@@ -0,0 +1,861 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+ import os
3
+
4
+ from yacs.config import CfgNode as CN
5
+
6
+ # -----------------------------------------------------------------------------
7
+ # Convention about Training / Test specific parameters
8
+ # -----------------------------------------------------------------------------
9
+ # Whenever an argument can be either used for training or for testing, the
10
+ # corresponding name will be post-fixed by a _TRAIN for a training parameter,
11
+ # or _TEST for a test-specific parameter.
12
+ # For example, the number of images during training will be
13
+ # IMAGES_PER_BATCH_TRAIN, while the number of images for testing will be
14
+ # IMAGES_PER_BATCH_TEST
15
+
16
+ # -----------------------------------------------------------------------------
17
+ # Config definition
18
+ # -----------------------------------------------------------------------------
19
+
20
+ _C = CN()
21
+
22
+ _C.MODEL = CN()
23
+ _C.MODEL.RPN_ONLY = False
24
+ _C.MODEL.BOX_ON = True
25
+ _C.MODEL.MASK_ON = False
26
+ _C.MODEL.KEYPOINT_ON = False
27
+ _C.MODEL.DEVICE = "cuda"
28
+
29
+ _C.MODEL.META_ARCHITECTURE = "GeneralizedRCNN"
30
+
31
+ _C.MODEL.RPN_ARCHITECTURE = "RPN"
32
+ _C.MODEL.DEBUG = False # add debug flag
33
+ _C.MODEL.ONNX = False # add onnx flag
34
+
35
+ # If the WEIGHT starts with a catalog://, like :R-50, the code will look for
36
+ # the path in paths_catalog. Else, it will use it as the specified absolute
37
+ # path
38
+ _C.MODEL.WEIGHT = ""
39
+ _C.MODEL.PRETRAIN_NAME = ""
40
+
41
+ # If LINEAR_PROB = True, only the last linear layers in rpn and roi_head are trainable
42
+ _C.MODEL.LINEAR_PROB = False
43
+
44
+ # -----------------------------------------------------------------------------
45
+ # Multitask Training / Test specific parameters
46
+ # -----------------------------------------------------------------------------
47
+ _C.MODEL.MULTITASK = CN(new_allowed=True)
48
+
49
+ # -----------------------------------------------------------------------------
50
+ # INPUT
51
+ # -----------------------------------------------------------------------------
52
+ _C.INPUT = CN()
53
+ # Size of the smallest side of the image during training
54
+ _C.INPUT.MIN_SIZE_TRAIN = 800 # (800,)
55
+ # Maximum size of the side of the image during training
56
+ _C.INPUT.MAX_SIZE_TRAIN = 1333
57
+ # Size of the smallest side of the image during testing
58
+ _C.INPUT.MIN_SIZE_TEST = 800
59
+ # Maximum size of the side of the image during testing
60
+ _C.INPUT.MAX_SIZE_TEST = 1333
61
+ # Values to be used for image normalization
62
+ _C.INPUT.PIXEL_MEAN = [102.9801, 115.9465, 122.7717]
63
+ # Values to be used for image normalization
64
+ _C.INPUT.PIXEL_STD = [1., 1., 1.]
65
+ # Convert image to BGR format (for Caffe2 models), in range 0-255
66
+ _C.INPUT.TO_BGR255 = True
67
+ _C.INPUT.FORMAT = ''
68
+ _C.INPUT.FIX_RES = False
69
+
70
+ # -----------------------------------------------------------------------------
71
+ # Augmentation
72
+ # -----------------------------------------------------------------------------
73
+ _C.AUGMENT = CN()
74
+ _C.AUGMENT.USE_RA = 0
75
+ _C.AUGMENT.FLIP_PROB_TRAIN = 0.5
76
+ _C.AUGMENT.VERTICAL_FLIP_PROB_TRAIN = 0.0
77
+ _C.AUGMENT.MULT_MIN_SIZE_TRAIN = ()
78
+
79
+ _C.AUGMENT.BRIGHTNESS = 0.0
80
+ _C.AUGMENT.CONTRAST = 0.0
81
+ _C.AUGMENT.SATURATION = 0.0
82
+ _C.AUGMENT.HUE = 0.0
83
+
84
+ _C.AUGMENT.CROP_PROB = 0.5
85
+ _C.AUGMENT.CROP_MIN_IOUS = (0.1, 0.3, 0.5, 0.7, 0.9)
86
+ _C.AUGMENT.CROP_MIN_SIZE = 0.3
87
+
88
+ # -----------------------------------------------------------------------------
89
+ # Dataset
90
+ # -----------------------------------------------------------------------------
91
+ _C.DATASETS = CN()
92
+ # List of the dataset names for training, as present in paths_catalog.py
93
+ _C.DATASETS.TRAIN = ()
94
+ # List of the dataset names for testing, as present in paths_catalog.py
95
+ _C.DATASETS.TEST = ()
96
+ # Use is_crowd label
97
+ _C.DATASETS.USE_CROWD = False
98
+ _C.DATASETS.CLASS_AGNOSTIC = False
99
+ _C.DATASETS.CLASS_CONCAT = False
100
+ _C.DATASETS.MAX_BOX = -1
101
+ _C.DATASETS.SAMPLE_RATIO = 0.0
102
+ _C.DATASETS.FEW_SHOT = 0
103
+ # SHUFFLE_SEED != 0 means shuffle the dataset in the few shot setting
104
+ _C.DATASETS.SHUFFLE_SEED = 0
105
+ _C.DATASETS.PREDEFINED_TEXT = ''
106
+ _C.DATASETS.ALTERNATIVE_TRAINING = False
107
+ _C.DATASETS.MULTISTAGE_TRAINING = False
108
+ _C.DATASETS.REGISTER = CN(new_allowed=True)
109
+ _C.DATASETS.BOX_THRESHOLD = 0.1
110
+ # Duplicate Dataset
111
+ _C.DATASETS.COCO_COPY = 1
112
+ _C.DATASETS.LVIS_COPY = 1
113
+ _C.DATASETS.FLICKR_COPY = 1
114
+ _C.DATASETS.MIXED_COPY = 1
115
+ _C.DATASETS.OBJECT365_COPY = 1
116
+ _C.DATASETS.VG_COPY = 1
117
+ _C.DATASETS.OI_COPY = 1
118
+ _C.DATASETS.IN_COPY = 1
119
+
120
+ # Duplicate Dataset
121
+ _C.DATASETS.COCO_COPY = 1
122
+ _C.DATASETS.FLICKR_COPY = 1
123
+ _C.DATASETS.MIXED_COPY = 1
124
+ _C.DATASETS.OBJECT365_COPY = 1
125
+ _C.DATASETS.VG_COPY = 1
126
+ _C.DATASETS.OI_COPY = 1
127
+ _C.DATASETS.IN_COPY = 1
128
+ _C.DATASETS.GENERAL_COPY = -1
129
+ _C.DATASETS.GENERAL_COPY_TEST = -1
130
+
131
+ # OD to Grounding
132
+ _C.DATASETS.RANDOM_SAMPLE_NEG = -1
133
+ _C.DATASETS.ADD_DET_PROMPT = False
134
+ _C.DATASETS.ADD_DET_PROMPT_ADVANCED = False
135
+ _C.DATASETS.USE_OD_AUG = False
136
+ _C.DATASETS.USE_COCO_FORMAT = False
137
+ _C.DATASETS.CONTROL_PROB = ()
138
+ _C.DATASETS.DISABLE_SHUFFLE = False
139
+ _C.DATASETS.PROMPT_VERSION = ""
140
+ _C.DATASETS.PROMPT_LIMIT_NEG = -1
141
+ _C.DATASETS.POS_QUESTION_PROB = 0.6
142
+ _C.DATASETS.NEG_QUESTION_PROB = 0.8
143
+ _C.DATASETS.FULL_QUESTION_PROB = 0.5
144
+ _C.DATASETS.ONE_HOT = False
145
+ _C.DATASETS.NO_MINUS_ONE_FOR_ONE_HOT = False
146
+
147
+ _C.DATASETS.DISABLE_CLIP_TO_IMAGE = False
148
+ _C.DATASETS.SEPARATION_TOKENS = " "
149
+
150
+ # LVIS
151
+ _C.DATASETS.LVIS_USE_NORMAL_AP = False
152
+ _C.DATASETS.SPECIAL_SAFEGUARD_FOR_COCO_GROUNDING = False
153
+
154
+ # Caption
155
+ _C.DATASETS.BING_INDEX_LIST = []
156
+ _C.DATASETS.CAPTION_MIN_BOX = 1
157
+ _C.DATASETS.REPLACE_CLEAN_LABEL = False
158
+ _C.DATASETS.FURTHER_SCREEN = False
159
+ _C.DATASETS.CAPTION_CONF = 0.9
160
+ _C.DATASETS.CAPTION_NMS = 0.9
161
+ _C.DATASETS.PACK_RANDOM_CAPTION_NUMBER = 0
162
+ _C.DATASETS.INFERENCE_CAPTION = False
163
+ _C.DATASETS.SAMPLE_NEGATIVE_FOR_GROUNDING_DATA = -1.0
164
+ _C.DATASETS.RANDOM_PACK_PROB = -1.0
165
+ _C.DATASETS.NO_RANDOM_PACK_PROBABILITY = 0.0
166
+ _C.DATASETS.SAFEGUARD_POSITIVE_CAPTION = True
167
+ _C.DATASETS.CAPTION_FORMAT_VERSION = "v1"
168
+ _C.DATASETS.LOCAL_DEBUG = False
169
+
170
+
171
+ # Od in the wild
172
+ _C.DATASETS.PREDEFINED_TEXT = None
173
+ _C.DATASETS.TRAIN_DATASETNAME_SUFFIX = ""
174
+ _C.DATASETS.TEST_DATASETNAME_SUFFIX = ""
175
+ _C.DATASETS.OVERRIDE_CATEGORY = None
176
+ _C.DATASETS.USE_OVERRIDE_CATEGORY = False
177
+ _C.DATASETS.SUPRESS_QUERY = None
178
+ _C.DATASETS.USE_SUPRESS_QUERY = False
179
+ _C.DATASETS.USE_CAPTION_PROMPT = False
180
+ _C.DATASETS.CAPTION_PROMPT = None
181
+
182
+ _C.DATASETS.FLICKR_GT_TYPE = "separate"
183
+
184
+ # VQA
185
+ _C.DATASETS.DIVER_BOX_FOR_VQA = False
186
+ # -----------------------------------------------------------------------------
187
+ # DataLoader
188
+ # -----------------------------------------------------------------------------
189
+ _C.DATALOADER = CN()
190
+ # Number of data loading threads
191
+ _C.DATALOADER.NUM_WORKERS = 4
192
+ # If > 0, this enforces that each collated batch should have a size divisible
193
+ # by SIZE_DIVISIBILITY
194
+ _C.DATALOADER.SIZE_DIVISIBILITY = 0
195
+ # If True, each batch should contain only images for which the aspect ratio
196
+ # is compatible. This groups portrait images together, and landscape images
197
+ # are not batched with portrait images.
198
+ _C.DATALOADER.ASPECT_RATIO_GROUPING = True
199
+ # Define min number of keypoints required from GT, for example 10 out of 17
200
+ _C.DATALOADER.MIN_KPS_PER_IMS = 0
201
+ # Use random sampler during training
202
+ _C.DATALOADER.USE_RANDOM_SEED = False
203
+
204
+ _C.DATALOADER.DISTRIBUTE_CHUNK_AMONG_NODE = False
205
+ # ---------------------------------------------------------------------------- #
206
+ # Backbone options
207
+ # ---------------------------------------------------------------------------- #
208
+ _C.MODEL.BACKBONE = CN()
209
+
210
+ # The backbone conv body to use
211
+ # The string must match a function that is imported in modeling.model_builder
212
+ # (e.g., 'FPN.add_fpn_ResNet101_conv5_body' to specify a ResNet-101-FPN
213
+ # backbone)
214
+ _C.MODEL.BACKBONE.CONV_BODY = "R-50-C4"
215
+
216
+ # Add StopGrad at a specified stage so the bottom layers are frozen
217
+ _C.MODEL.BACKBONE.FREEZE_CONV_BODY_AT = 2
218
+ _C.MODEL.BACKBONE.FREEZE = False
219
+ _C.MODEL.BACKBONE.GROUP = 1
220
+ _C.MODEL.BACKBONE.OUT_CHANNELS = 256 * 4
221
+ # Option to reset bn running statics
222
+ _C.MODEL.BACKBONE.RESET_BN = False
223
+ # Backbone Normalization Level
224
+ _C.MODEL.BACKBONE.NORM_LEVEL = 3
225
+ # BN for backbone
226
+ _C.MODEL.BACKBONE.USE_BN = False
227
+ # Sync BN for backbone
228
+ _C.MODEL.BACKBONE.USE_SYNCBN = False
229
+ _C.MODEL.BACKBONE.USE_NSYNCBN = False
230
+ # GN for backbone
231
+ _C.MODEL.BACKBONE.USE_GN = False
232
+ # Evo Norm for backbone
233
+ _C.MODEL.BACKBONE.USE_EN = False
234
+ # Layers for backbone
235
+ _C.MODEL.BACKBONE.USE_DFCONV = False
236
+ _C.MODEL.BACKBONE.USE_DYRELU = False
237
+ _C.MODEL.BACKBONE.USE_SE = False
238
+ _C.MODEL.BACKBONE.LAYER_SETUP = (3, 4, 6, 3)
239
+ _C.MODEL.BACKBONE.LAYER_SEARCH = CN(new_allowed=True)
240
+ _C.MODEL.BACKBONE.OUT_FEATURES = ("stage2", "stage3", "stage4", "stage5")
241
+ _C.MODEL.BACKBONE.FPN_LAYER = ()
242
+ _C.MODEL.BACKBONE.USE_CHECKPOINT = False
243
+ # Add JF efficient det cfgs
244
+ _C.MODEL.BACKBONE.EFFICIENT_DET_START_FROM = 3
245
+ _C.MODEL.BACKBONE.EFFICIENT_DET_COMPOUND = 0
246
+ _C.MODEL.BACKBONE.EFFICIENT_DET_BIFPN_VERSION = 0
247
+
248
+ _C.MODEL.LANGUAGE_BACKBONE = CN()
249
+ _C.MODEL.LANGUAGE_BACKBONE.WEIGHT = ""
250
+ _C.MODEL.LANGUAGE_BACKBONE.FREEZE = False
251
+ _C.MODEL.LANGUAGE_BACKBONE.USE_CHECKPOINT = False
252
+ _C.MODEL.LANGUAGE_BACKBONE.TOKENIZER_TYPE = "bert-base-uncased"
253
+ _C.MODEL.LANGUAGE_BACKBONE.MODEL_TYPE = "bert-base-uncased"
254
+ _C.MODEL.LANGUAGE_BACKBONE.LANG_DIM = 768
255
+ _C.MODEL.LANGUAGE_BACKBONE.MAX_QUERY_LEN = 256
256
+ _C.MODEL.LANGUAGE_BACKBONE.N_LAYERS = 1
257
+ _C.MODEL.LANGUAGE_BACKBONE.UNUSED_TOKEN = 106
258
+ _C.MODEL.LANGUAGE_BACKBONE.MASK_SPECIAL = False
259
+
260
+ _C.MODEL.LANGUAGE_BACKBONE.RNN_TYPE = "lstm"
261
+ _C.MODEL.LANGUAGE_BACKBONE.VARIABLE_LENGTH = True
262
+ _C.MODEL.LANGUAGE_BACKBONE.WORD_EMBEDDING_SIZE = 512
263
+ _C.MODEL.LANGUAGE_BACKBONE.WORD_VEC_SIZE = 512
264
+ _C.MODEL.LANGUAGE_BACKBONE.HIDDEN_SIZE = 512
265
+ _C.MODEL.LANGUAGE_BACKBONE.BIDIRECTIONAL = True
266
+ _C.MODEL.LANGUAGE_BACKBONE.INPUT_DROPOUT_P = 0.5
267
+ _C.MODEL.LANGUAGE_BACKBONE.DROPOUT_P = 0.2
268
+ _C.MODEL.LANGUAGE_BACKBONE.CORPUS_PATH = ""
269
+ _C.MODEL.LANGUAGE_BACKBONE.VOCAB_SIZE = 0
270
+
271
+ _C.MODEL.LANGUAGE_BACKBONE.PAD_MAX = True
272
+ # ---------------------------------------------------------------------------- #
273
+ # FPN options
274
+ # ---------------------------------------------------------------------------- #
275
+ _C.MODEL.FPN = CN()
276
+ _C.MODEL.FPN.FREEZE = False
277
+ _C.MODEL.FPN.USE_GN = False
278
+ _C.MODEL.FPN.USE_RELU = False
279
+ _C.MODEL.FPN.USE_DYRELU = False
280
+ _C.MODEL.FPN.DROP_BLOCK = True
281
+ _C.MODEL.FPN.DROP_PROB = 0.3
282
+ _C.MODEL.FPN.DROP_SIZE = 3
283
+ _C.MODEL.FPN.USE_SPP = False
284
+ _C.MODEL.FPN.USE_PAN = False
285
+ _C.MODEL.FPN.USE_DYHEAD = False
286
+ _C.MODEL.FPN.RETURN_SWINT_FEATURE_BEFORE_FUSION = False
287
+ # ---------------------------------------------------------------------------- #
288
+ # BIFPN options
289
+ # ---------------------------------------------------------------------------- #
290
+ _C.MODEL.BIFPN = CN()
291
+ _C.MODEL.BIFPN.NUM_REPEATS = 1
292
+ _C.MODEL.BIFPN.USE_ATTENTION = True
293
+
294
+ # ---------------------------------------------------------------------------- #
295
+ # Group Norm options
296
+ # ---------------------------------------------------------------------------- #
297
+ _C.MODEL.GROUP_NORM = CN()
298
+ # Number of dimensions per group in GroupNorm (-1 if using NUM_GROUPS)
299
+ _C.MODEL.GROUP_NORM.DIM_PER_GP = -1
300
+ # Number of groups in GroupNorm (-1 if using DIM_PER_GP)
301
+ _C.MODEL.GROUP_NORM.NUM_GROUPS = 16
302
+ # GroupNorm's small constant in the denominator
303
+ _C.MODEL.GROUP_NORM.EPSILON = 1e-5
304
+
305
+ # ---------------------------------------------------------------------------- #
306
+ # Evo Norm options
307
+ # ---------------------------------------------------------------------------- #
308
+ _C.MODEL.EVO_NORM = CN()
309
+ # Number of groups in EvoNorm (-1 if using DIM_PER_GP)
310
+ _C.MODEL.EVO_NORM.NUM_GROUPS = 8
311
+ # EvoNorm's small constant in the denominator
312
+ _C.MODEL.EVO_NORM.EPSILON = 1e-5
313
+
314
+ # ---------------------------------------------------------------------------- #
315
+ # RetinaNet Options (Follow the Detectron version)
316
+ # ---------------------------------------------------------------------------- #
317
+ _C.MODEL.RETINANET = CN()
318
+ # This is the number of foreground classes and background.
319
+ _C.MODEL.RETINANET.NUM_CLASSES = 81
320
+ # Convolutions to use in the cls and bbox tower
321
+ # NOTE: this doesn't include the last conv for logits
322
+ _C.MODEL.RETINANET.NUM_CONVS = 4
323
+ # During inference, #locs to select based on cls score before NMS is performed
324
+ # per FPN level
325
+ _C.MODEL.RETINANET.PRE_NMS_TOP_N = 1000
326
+ # Prior prob for the positives at the beginning of training. This is used to set
327
+ # the bias init for the logits layer
328
+ _C.MODEL.RETINANET.PRIOR_PROB = 0.01
329
+ # Inference cls score threshold, anchors with score > INFERENCE_TH are
330
+ # considered for inference
331
+ _C.MODEL.RETINANET.INFERENCE_TH = 0.05
332
+ # NMS threshold used in RetinaNet
333
+ _C.MODEL.RETINANET.NMS_TH = 0.4
334
+ _C.MODEL.RETINANET.DETECTIONS_PER_IMG = 100
335
+
336
+ # ---------------------------------------------------------------------------- #
337
+ # Focal Loss Options (Follow the Detectron version)
338
+ # ---------------------------------------------------------------------------- #
339
+ _C.MODEL.FOCAL = CN()
340
+ # Weight for bbox_regression loss
341
+ _C.MODEL.FOCAL.BBOX_REG_WEIGHT = 4.0
342
+ # Smooth L1 loss beta for bbox regression
343
+ _C.MODEL.FOCAL.BBOX_REG_BETA = 0.11
344
+ # IoU overlap ratio for labeling an anchor as positive
345
+ # Anchors with >= iou overlap are labeled positive
346
+ _C.MODEL.FOCAL.FG_IOU_THRESHOLD = 0.5
347
+ # IoU overlap ratio for labeling an anchor as negative
348
+ # Anchors with < iou overlap are labeled negative
349
+ _C.MODEL.FOCAL.BG_IOU_THRESHOLD = 0.4
350
+ # Focal loss parameter: alpha
351
+ _C.MODEL.FOCAL.LOSS_ALPHA = 0.25
352
+ # Focal loss parameter: gamma
353
+ _C.MODEL.FOCAL.LOSS_GAMMA = 2.0
354
+
355
+ # ---------------------------------------------------------------------------- #
356
+ # FCOS Options
357
+ # ---------------------------------------------------------------------------- #
358
+ _C.MODEL.FCOS = CN()
359
+ _C.MODEL.FCOS.NUM_CLASSES = 81 # the number of classes including background
360
+ _C.MODEL.FCOS.FPN_STRIDES = [8, 16, 32, 64, 128]
361
+ _C.MODEL.FCOS.PRIOR_PROB = 0.01
362
+ _C.MODEL.FCOS.INFERENCE_TH = 0.05
363
+ _C.MODEL.FCOS.NMS_TH = 0.6
364
+ _C.MODEL.FCOS.PRE_NMS_TOP_N = 1000
365
+
366
+ # the number of convolutions used in the cls and bbox tower
367
+ _C.MODEL.FCOS.NUM_CONVS = 4
368
+ # if use deformable conv to align features
369
+ _C.MODEL.FCOS.USE_DFCONV = False
370
+
371
+ # if CENTER_SAMPLING_RADIUS <= 0, it will disable center sampling
372
+ _C.MODEL.FCOS.CENTER_SAMPLING_RADIUS = 0.0
373
+ # IOU_LOSS_TYPE can be "iou", "linear_iou" or "giou"
374
+ _C.MODEL.FCOS.IOU_LOSS_TYPE = "iou"
375
+
376
+ _C.MODEL.FCOS.NORM_REG_TARGETS = False
377
+ _C.MODEL.FCOS.CENTERNESS_ON_REG = False
378
+ _C.MODEL.FCOS.USE_GT_CENTER = False
379
+
380
+ _C.MODEL.FCOS.DETECTIONS_PER_IMG = 100
381
+ _C.MODEL.FCOS.USE_GN = False
382
+ _C.MODEL.FCOS.USE_BN = False
383
+
384
+ _C.MODEL.FCOS.INFERENCE_TH_TRAIN = 0.0
385
+ _C.MODEL.FCOS.PRE_NMS_TOP_N_TRAIN = 3000
386
+ _C.MODEL.FCOS.POST_NMS_TOP_N_TRAIN = 1000
387
+
388
+ # ---------------------------------------------------------------------------- #
389
+ # ATSS Options
390
+ # ---------------------------------------------------------------------------- #
391
+ _C.MODEL.ATSS = CN()
392
+ _C.MODEL.ATSS.NUM_CLASSES = 81 # the number of classes including background
393
+ _C.MODEL.ATSS.PRIOR_PROB = 0.01
394
+ _C.MODEL.ATSS.INFERENCE_TH = 0.05
395
+ _C.MODEL.ATSS.NMS_TH = 0.6
396
+ _C.MODEL.ATSS.PRE_NMS_TOP_N = 1000
397
+
398
+ # the number of convolutions used in the cls and bbox tower
399
+ _C.MODEL.ATSS.NUM_CONVS = 4
400
+ # the channels of convolutions used in the cls and bbox tower
401
+ _C.MODEL.ATSS.CHANNELS = 128
402
+ # if use deformable conv to align features
403
+ _C.MODEL.ATSS.USE_DFCONV = False
404
+
405
+ # topk for selecting candidate positive samples from each level
406
+ _C.MODEL.ATSS.TOPK = 9
407
+
408
+ # Weight for bbox_regression loss
409
+ _C.MODEL.ATSS.REG_LOSS_WEIGHT = 2.0
410
+
411
+ _C.MODEL.ATSS.DETECTIONS_PER_IMG = 100
412
+ _C.MODEL.ATSS.USE_GN = False
413
+ _C.MODEL.ATSS.USE_BN = False
414
+
415
+ _C.MODEL.ATSS.USE_DYRELU = False
416
+ _C.MODEL.ATSS.USE_SE = False
417
+
418
+ _C.MODEL.ATSS.INFERENCE_TH_TRAIN = 0.0
419
+ _C.MODEL.ATSS.PRE_NMS_TOP_N_TRAIN = 3000
420
+ _C.MODEL.ATSS.POST_NMS_TOP_N_TRAIN = 1000
421
+ # ---------------------------------------------------------------------------- #
422
+ # DYHEAD Options
423
+ # ---------------------------------------------------------------------------- #
424
+ _C.MODEL.DYHEAD = CN()
425
+ _C.MODEL.DYHEAD.NUM_CLASSES = 81 # the number of classes including background
426
+ _C.MODEL.DYHEAD.PRIOR_PROB = 0.01
427
+
428
+ # the number of convolutions used in the cls and bbox tower
429
+ _C.MODEL.DYHEAD.NUM_CONVS = 4
430
+ # the channels of convolutions used in the cls and bbox tower
431
+ _C.MODEL.DYHEAD.CHANNELS = 128
432
+ _C.MODEL.DYHEAD.GROUPS = 1
433
+ # if use deformable conv to align features
434
+ _C.MODEL.DYHEAD.USE_DFCONV = False
435
+
436
+ # topk for selecting candidate positive samples from each level
437
+ _C.MODEL.DYHEAD.TOPK = 9
438
+
439
+ _C.MODEL.DYHEAD.SCORE_AGG = "MEAN" # MEAN or MAX, for binary focal loss score aggregation
440
+
441
+ _C.MODEL.DYHEAD.LOG_SCALE = 0.0 # temperature (dot product)
442
+ _C.MODEL.DYHEAD.SHALLOW_LOG_SCALE = 0.0 # temperature (shallow contrastive)
443
+
444
+ _C.MODEL.DYHEAD.USE_GN = False
445
+ _C.MODEL.DYHEAD.USE_NSYNCBN = False
446
+ _C.MODEL.DYHEAD.USE_SYNCBN = False
447
+
448
+ _C.MODEL.DYHEAD.USE_DYFUSE = False
449
+ _C.MODEL.DYHEAD.USE_DYRELU = False
450
+
451
+ _C.MODEL.DYHEAD.CONV_FUNC = ''
452
+
453
+ # CosineSimOutputLayers: https://github.com/ucbdrive/few-shot-object-detection/blob/master/fsdet/modeling/roi_heads/fast_rcnn.py#L448-L464
454
+ _C.MODEL.DYHEAD.COSINE_SCALE = -1.0
455
+
456
+ _C.MODEL.DYHEAD.FUSE_CONFIG = CN()
457
+ _C.MODEL.DYHEAD.FUSE_CONFIG.EARLY_FUSE_ON = False
458
+ _C.MODEL.DYHEAD.FUSE_CONFIG.TYPE = ""
459
+ _C.MODEL.DYHEAD.FUSE_CONFIG.JOINT_EMB_SIZE = 256
460
+ _C.MODEL.DYHEAD.FUSE_CONFIG.JOINT_OUT_SIZE = 256
461
+ _C.MODEL.DYHEAD.FUSE_CONFIG.JOINT_EMB_DROPOUT = 0.1
462
+ _C.MODEL.DYHEAD.FUSE_CONFIG.JOINT_MLP_LAYERS = 2
463
+
464
+ _C.MODEL.DYHEAD.FUSE_CONFIG.USE_CLASSIFICATION_LOSS = False
465
+
466
+ _C.MODEL.DYHEAD.FUSE_CONFIG.USE_TOKEN_LOSS = False
467
+ _C.MODEL.DYHEAD.FUSE_CONFIG.TOKEN_LOSS_WEIGHT = 1.0
468
+ _C.MODEL.DYHEAD.FUSE_CONFIG.TOKEN_GAMMA = 2.0
469
+ _C.MODEL.DYHEAD.FUSE_CONFIG.TOKEN_ALPHA = 0.25
470
+
471
+ _C.MODEL.DYHEAD.FUSE_CONFIG.USE_DOT_PRODUCT_TOKEN_LOSS = False
472
+ _C.MODEL.DYHEAD.FUSE_CONFIG.USE_CONTRASTIVE_ALIGN_LOSS = False
473
+ _C.MODEL.DYHEAD.FUSE_CONFIG.CONTRASTIVE_HIDDEN_DIM = 64
474
+ _C.MODEL.DYHEAD.FUSE_CONFIG.CONTRASTIVE_ALIGN_LOSS_WEIGHT = 1.0
475
+ _C.MODEL.DYHEAD.FUSE_CONFIG.DOT_PRODUCT_TOKEN_LOSS_WEIGHT = 1.0
476
+ _C.MODEL.DYHEAD.FUSE_CONFIG.USE_LAYER_SCALE = True
477
+ _C.MODEL.DYHEAD.FUSE_CONFIG.SEPARATE_BIDIRECTIONAL = False
478
+ _C.MODEL.DYHEAD.FUSE_CONFIG.STABLE_SOFTMAX_2D = False
479
+
480
+ _C.MODEL.DYHEAD.FUSE_CONFIG.DO_LANG_PROJ_OUTSIDE_CHECKPOINT = False
481
+
482
+ _C.MODEL.DYHEAD.FUSE_CONFIG.USE_FUSED_FEATURES_DOT_PRODUCT = False
483
+
484
+ # Controls for clamping to avoid underflow/overflow
485
+ _C.MODEL.DYHEAD.FUSE_CONFIG.CLAMP_MIN_FOR_UNDERFLOW = False
486
+ _C.MODEL.DYHEAD.FUSE_CONFIG.CLAMP_MAX_FOR_OVERFLOW = False
487
+ _C.MODEL.DYHEAD.FUSE_CONFIG.CLAMP_BERTATTN_MIN_FOR_UNDERFLOW = False
488
+ _C.MODEL.DYHEAD.FUSE_CONFIG.CLAMP_BERTATTN_MAX_FOR_OVERFLOW = False
489
+ _C.MODEL.DYHEAD.FUSE_CONFIG.CLAMP_DOT_PRODUCT = False
490
+
491
+ # MLM Loss
492
+ _C.MODEL.DYHEAD.FUSE_CONFIG.MLM_LOSS = False
493
+ _C.MODEL.DYHEAD.FUSE_CONFIG.MLM_LOSS_FOR_ONLY_POSITIVES = True
494
+ _C.MODEL.DYHEAD.FUSE_CONFIG.NO_MASK_FOR_OD = False
495
+ _C.MODEL.DYHEAD.FUSE_CONFIG.NO_MASK_FOR_GOLD = False
496
+ _C.MODEL.DYHEAD.FUSE_CONFIG.MLM_LOSS_COEF = 1.0
497
+ _C.MODEL.DYHEAD.FUSE_CONFIG.MLM_OBJ_FOR_ONLY_POSITIVE = False
498
+
499
+ # Shallow Contrastive Loss (FPN)
500
+ _C.MODEL.DYHEAD.FUSE_CONFIG.USE_SHALLOW_CONTRASTIVE_LOSS = False
501
+ _C.MODEL.DYHEAD.FUSE_CONFIG.SHALLOW_MAX_POSITIVE_ANCHORS = 100
502
+ _C.MODEL.DYHEAD.FUSE_CONFIG.USE_SHALLOW_ZERO_PADS = False
503
+ _C.MODEL.DYHEAD.FUSE_CONFIG.SHALLOW_CONTRASTIVE_HIDDEN_DIM = 64
504
+ _C.MODEL.DYHEAD.FUSE_CONFIG.SHALLOW_CONTRASTIVE_LOSS_WEIGHT = 1.0
505
+
506
+ # Shallow Contrastive Loss (BACKBONE)
507
+ _C.MODEL.DYHEAD.FUSE_CONFIG.USE_BACKBONE_SHALLOW_CONTRASTIVE_LOSS = False
508
+
509
+ _C.MODEL.DYHEAD.FUSE_CONFIG.ADD_LINEAR_LAYER = False
510
+
511
+ # use checkpoint to save memory
512
+ _C.MODEL.DYHEAD.USE_CHECKPOINT = False
513
+
514
+ # ---------------------------------------------------------------------------- #
515
+ # RPN options
516
+ # ---------------------------------------------------------------------------- #
517
+ _C.MODEL.RPN = CN()
518
+ _C.MODEL.RPN.USE_FPN = False
519
+ # Base RPN anchor sizes given in absolute pixels w.r.t. the scaled network input
520
+ _C.MODEL.RPN.ANCHOR_SIZES = (32, 64, 128, 256, 512)
521
+ # Stride of the feature map that RPN is attached.
522
+ # For FPN, number of strides should match number of scales
523
+ _C.MODEL.RPN.ANCHOR_STRIDE = (16,)
524
+ # RPN anchor aspect ratios
525
+ _C.MODEL.RPN.ASPECT_RATIOS = (0.5, 1.0, 2.0)
526
+ # Anchor shift-away ratio from the center for r,t,l,d
527
+ _C.MODEL.RPN.ANCHOR_SHIFT = (0.0, 0.0, 0.0, 0.0)
528
+ # Use center to decide anchor size
529
+ _C.MODEL.RPN.USE_RELATIVE_SIZE = False
530
+ # Remove RPN anchors that go outside the image by RPN_STRADDLE_THRESH pixels
531
+ # Set to -1 or a large value, e.g. 100000, to disable pruning anchors
532
+ _C.MODEL.RPN.STRADDLE_THRESH = 0
533
+ # Anchor scales per octave for complex anchors
534
+ _C.MODEL.RPN.OCTAVE = 2.0
535
+ _C.MODEL.RPN.SCALES_PER_OCTAVE = 3
536
+ # Minimum overlap required between an anchor and ground-truth box for the
537
+ # (anchor, gt box) pair to be a positive example (IoU >= FG_IOU_THRESHOLD
538
+ # ==> positive RPN example)
539
+ _C.MODEL.RPN.FG_IOU_THRESHOLD = 0.7
540
+ # Maximum overlap allowed between an anchor and ground-truth box for the
541
+ # (anchor, gt box) pair to be a negative examples (IoU < BG_IOU_THRESHOLD
542
+ # ==> negative RPN example)
543
+ _C.MODEL.RPN.BG_IOU_THRESHOLD = 0.3
544
+ # Total number of RPN examples per image
545
+ _C.MODEL.RPN.BATCH_SIZE_PER_IMAGE = 256
546
+ # Target fraction of foreground (positive) examples per RPN minibatch
547
+ _C.MODEL.RPN.POSITIVE_FRACTION = 0.5
548
+ # Number of top scoring RPN proposals to keep before applying NMS
549
+ # When FPN is used, this is *per FPN level* (not total)
550
+ _C.MODEL.RPN.PRE_NMS_TOP_N_TRAIN = 12000
551
+ _C.MODEL.RPN.PRE_NMS_TOP_N_TEST = 6000
552
+ # Number of top scoring RPN proposals to keep after applying NMS
553
+ _C.MODEL.RPN.POST_NMS_TOP_N_TRAIN = 2000
554
+ _C.MODEL.RPN.POST_NMS_TOP_N_TEST = 1000
555
+ # NMS threshold used on RPN proposals
556
+ _C.MODEL.RPN.NMS_THRESH = 0.7
557
+ # Proposal height and width both need to be greater than RPN_MIN_SIZE
558
+ # (at the scale used during training or inference)
559
+ _C.MODEL.RPN.MIN_SIZE = 0
560
+ # Number of top scoring RPN proposals to keep after combining proposals from
561
+ # all FPN levels
562
+ _C.MODEL.RPN.FPN_POST_NMS_TOP_N_TRAIN = 2000
563
+ _C.MODEL.RPN.FPN_POST_NMS_TOP_N_TEST = 2000
564
+ # Custom rpn head, empty to use default conv or separable conv
565
+ _C.MODEL.RPN.RPN_HEAD = "SingleConvRPNHead"
566
+ _C.MODEL.RPN.FREEZE = False
567
+ _C.MODEL.RPN.FORCE_BOXES = False
568
+ _C.MODEL.RPN.RETURN_FUSED_FEATURES = False
569
+
570
+ # ---------------------------------------------------------------------------- #
571
+ # ROI HEADS options
572
+ # ---------------------------------------------------------------------------- #
573
+ _C.MODEL.ROI_HEADS = CN()
574
+ _C.MODEL.ROI_HEADS.USE_FPN = False
575
+ # Overlap threshold for an RoI to be considered foreground (if >= FG_IOU_THRESHOLD)
576
+ _C.MODEL.ROI_HEADS.FG_IOU_THRESHOLD = 0.5
577
+ # Overlap threshold for an RoI to be considered background
578
+ # (class = 0 if overlap in [0, BG_IOU_THRESHOLD))
579
+ _C.MODEL.ROI_HEADS.BG_IOU_THRESHOLD = 0.5
580
+ # Default weights on (dx, dy, dw, dh) for normalizing bbox regression targets
581
+ # These are empirically chosen to approximately lead to unit variance targets
582
+ _C.MODEL.ROI_HEADS.BBOX_REG_WEIGHTS = (10., 10., 5., 5.)
583
+ # RoI minibatch size *per image* (number of regions of interest [ROIs])
584
+ # Total number of RoIs per training minibatch =
585
+ # TRAIN.BATCH_SIZE_PER_IM * TRAIN.IMS_PER_BATCH * NUM_GPUS
586
+ # E.g., a common configuration is: 512 * 2 * 8 = 8192
587
+ _C.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 512
588
+ # Target fraction of RoI minibatch that is labeled foreground (i.e. class > 0)
589
+ _C.MODEL.ROI_HEADS.POSITIVE_FRACTION = 0.25
590
+
591
+ # Only used on test mode
592
+
593
+ # Minimum score threshold (assuming scores in a [0, 1] range); a value chosen to
594
+ # balance obtaining high recall with not having too many low precision
595
+ # detections that will slow down inference post processing steps (like NMS)
596
+ _C.MODEL.ROI_HEADS.SCORE_THRESH = 0.05
597
+ # Overlap threshold used for non-maximum suppression (suppress boxes with
598
+ # IoU >= this threshold)
599
+ _C.MODEL.ROI_HEADS.NMS = 0.5
600
+ # Maximum number of detections to return per image (100 is based on the limit
601
+ # established for the COCO dataset)
602
+ _C.MODEL.ROI_HEADS.DETECTIONS_PER_IMG = 100
603
+
604
+ _C.MODEL.ROI_BOX_HEAD = CN()
605
+ _C.MODEL.ROI_BOX_HEAD.FEATURE_EXTRACTOR = "ResNet50Conv5ROIFeatureExtractor"
606
+ _C.MODEL.ROI_BOX_HEAD.PREDICTOR = "FastRCNNPredictor"
607
+ _C.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION = 14
608
+ _C.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO = 0
609
+ _C.MODEL.ROI_BOX_HEAD.POOLER_SCALES = (1.0 / 16,)
610
+ _C.MODEL.ROI_BOX_HEAD.NUM_CLASSES = 81
611
+ # Hidden layer dimension when using an MLP for the RoI box head
612
+ _C.MODEL.ROI_BOX_HEAD.MLP_HEAD_DIM = 1024
613
+ # GN
614
+ _C.MODEL.ROI_BOX_HEAD.USE_GN = False
615
+ # Dilation
616
+ _C.MODEL.ROI_BOX_HEAD.DILATION = 1
617
+ _C.MODEL.ROI_BOX_HEAD.CONV_HEAD_DIM = 256
618
+ _C.MODEL.ROI_BOX_HEAD.NUM_STACKED_CONVS = 4
619
+ # Use D2 style ROIAlignV2
620
+ _C.MODEL.ROI_BOX_HEAD.POOLER_ALIGNED = False
621
+
622
+ _C.MODEL.ROI_MASK_HEAD = CN()
623
+ _C.MODEL.ROI_MASK_HEAD.FEATURE_EXTRACTOR = "ResNet50Conv5ROIFeatureExtractor"
624
+ _C.MODEL.ROI_MASK_HEAD.PREDICTOR = "MaskRCNNC4Predictor"
625
+ _C.MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION = 14
626
+ _C.MODEL.ROI_MASK_HEAD.POOLER_SAMPLING_RATIO = 0
627
+ _C.MODEL.ROI_MASK_HEAD.POOLER_SCALES = (1.0 / 16,)
628
+ _C.MODEL.ROI_MASK_HEAD.MLP_HEAD_DIM = 1024
629
+ _C.MODEL.ROI_MASK_HEAD.CONV_LAYERS = (256, 256, 256, 256)
630
+ _C.MODEL.ROI_MASK_HEAD.RESOLUTION = 14
631
+ _C.MODEL.ROI_MASK_HEAD.SHARE_BOX_FEATURE_EXTRACTOR = True
632
+ # Whether or not resize and translate masks to the input image.
633
+ _C.MODEL.ROI_MASK_HEAD.POSTPROCESS_MASKS = False
634
+ _C.MODEL.ROI_MASK_HEAD.POSTPROCESS_MASKS_THRESHOLD = 0.5
635
+ # Dilation
636
+ _C.MODEL.ROI_MASK_HEAD.DILATION = 1
637
+ # GN
638
+ _C.MODEL.ROI_MASK_HEAD.USE_GN = False
639
+ # HG
640
+ _C.MODEL.ROI_MASK_HEAD.HG_SCALE = 1
641
+
642
+ _C.MODEL.ROI_KEYPOINT_HEAD = CN()
643
+ _C.MODEL.ROI_KEYPOINT_HEAD.FEATURE_EXTRACTOR = "KeypointRCNNFeatureExtractor"
644
+ _C.MODEL.ROI_KEYPOINT_HEAD.PREDICTOR = "KeypointRCNNPredictor"
645
+ _C.MODEL.ROI_KEYPOINT_HEAD.POOLER_RESOLUTION = 14
646
+ _C.MODEL.ROI_KEYPOINT_HEAD.POOLER_SAMPLING_RATIO = 0
647
+ _C.MODEL.ROI_KEYPOINT_HEAD.POOLER_SCALES = (1.0 / 16,)
648
+ _C.MODEL.ROI_KEYPOINT_HEAD.MLP_HEAD_DIM = 1024
649
+ _C.MODEL.ROI_KEYPOINT_HEAD.CONV_LAYERS = tuple(512 for _ in range(8))
650
+ _C.MODEL.ROI_KEYPOINT_HEAD.RESOLUTION = 14
651
+ _C.MODEL.ROI_KEYPOINT_HEAD.NUM_CLASSES = 17
652
+ _C.MODEL.ROI_KEYPOINT_HEAD.KEYPOINT_NAME = () # If left empty, use default names
653
+ _C.MODEL.ROI_KEYPOINT_HEAD.SHARE_BOX_FEATURE_EXTRACTOR = True
654
+
655
+ # ---------------------------------------------------------------------------- #
656
+ # ResNe[X]t options (ResNets = {ResNet, ResNeXt})
657
+ # Note that parts of a resnet may be used for both the backbone and the head
658
+ # These options apply to both
659
+ # ---------------------------------------------------------------------------- #
660
+ _C.MODEL.RESNETS = CN()
661
+
662
+ _C.MODEL.RESNETS.USE_STEM3X3 = False
663
+ _C.MODEL.RESNETS.WITH_SE = False
664
+ _C.MODEL.RESNETS.USE_AVG_DOWN = False
665
+
666
+ # Number of groups to use; 1 ==> ResNet; > 1 ==> ResNeXt
667
+ _C.MODEL.RESNETS.NUM_GROUPS = 1
668
+
669
+ # Baseline width of each group
670
+ _C.MODEL.RESNETS.WIDTH_PER_GROUP = 64
671
+
672
+ # Place the stride 2 conv on the 1x1 filter
673
+ # Use True only for the original MSRA ResNet; use False for C2 and Torch models
674
+ _C.MODEL.RESNETS.STRIDE_IN_1X1 = True
675
+
676
+ # Residual transformation function
677
+ _C.MODEL.RESNETS.TRANS_FUNC = "BottleneckWithFixedBatchNorm"
678
+ # ResNet's stem function (conv1 and pool1)
679
+ _C.MODEL.RESNETS.STEM_FUNC = "StemWithFixedBatchNorm"
680
+
681
+ # Apply dilation in stage "res5"
682
+ _C.MODEL.RESNETS.RES5_DILATION = 1
683
+
684
+ _C.MODEL.RESNETS.BACKBONE_OUT_CHANNELS = 256 * 4
685
+ _C.MODEL.RESNETS.RES2_OUT_CHANNELS = 256
686
+ _C.MODEL.RESNETS.STEM_OUT_CHANNELS = 64
687
+
688
+ _C.MODEL.RESNETS.REVISION = "resnet_light"
689
+ # Deformable convolutions
690
+ _C.MODEL.RESNETS.STAGE_WITH_DCN = (False, False, False, False)
691
+ _C.MODEL.RESNETS.WITH_MODULATED_DCN = False
692
+ _C.MODEL.RESNETS.DEFORMABLE_GROUPS = 1
693
+
694
+ # ---------------------------------------------------------------------------- #
695
+ # Swin Transformer
696
+ # ---------------------------------------------------------------------------- #
697
+ _C.MODEL.SWINT = CN()
698
+ _C.MODEL.SWINT.EMBED_DIM = 96
699
+ _C.MODEL.SWINT.OUT_CHANNELS = (96, 192, 384, 768)
700
+ _C.MODEL.SWINT.DEPTHS = (2, 2, 6, 2)
701
+ _C.MODEL.SWINT.NUM_HEADS = (3, 6, 12, 24)
702
+ _C.MODEL.SWINT.WINDOW_SIZE = 7
703
+ _C.MODEL.SWINT.MLP_RATIO = 4
704
+ _C.MODEL.SWINT.DROP_PATH_RATE = 0.2
705
+ _C.MODEL.SWINT.APE = False
706
+ _C.MODEL.SWINT.VERSION = "v1"
707
+ _C.MODEL.SWINT.OUT_NORM = True
708
+ _C.MODEL.SWINT.LAYER_SCALE = 0
709
+
710
+ # ---------------------------------------------------------------------------- #
711
+ # CVT SPEC
712
+ # ---------------------------------------------------------------------------- #
713
+ _C.MODEL.SPEC = CN(new_allowed=True)
714
+
715
+ # ---------------------------------------------------------------------------- #
716
+ # CLIP SPEC
717
+ # ---------------------------------------------------------------------------- #
718
+ _C.MODEL.CLIP = CN()
719
+ _C.MODEL.CLIP.CONTEXT_LENGTH = 256 # default 77
720
+ _C.MODEL.CLIP.WIDTH = 512
721
+ _C.MODEL.CLIP.LAYERS = 12
722
+ _C.MODEL.CLIP.HEADS = 8
723
+ _C.MODEL.CLIP.DROP_PATH = 0.0
724
+ _C.MODEL.CLIP.TOKENIZER = "clip"
725
+ _C.MODEL.CLIP.VOCAB_SIZE = 49408
726
+
727
+ # ---------------------------------------------------------------------------- #
728
+ # SEARCH
729
+ # ---------------------------------------------------------------------------- #
730
+
731
+ _C.SEARCH = CN()
732
+ _C.SEARCH.MAX_EPOCH = 20
733
+ _C.SEARCH.SELECT_NUM = 20
734
+ _C.SEARCH.POPULATION_NUM = 64
735
+ _C.SEARCH.MUTATION_NUM = 24
736
+ _C.SEARCH.CROSSOVER_NUM = 24
737
+ _C.SEARCH.MUTATION_PROB = 0.1
738
+
739
+ # ---------------------------------------------------------------------------- #
740
+ # Solver
741
+ # ---------------------------------------------------------------------------- #
742
+ _C.SOLVER = CN()
743
+ _C.SOLVER.USE_AMP = False
744
+
745
+ _C.SOLVER.MAX_ITER = 40000
746
+ _C.SOLVER.MULTI_MAX_ITER = () # set a different max iteration count for each training stage
747
+ _C.SOLVER.MAX_EPOCH = 0 # any epoch number > 0 overrides MAX_ITER
748
+ _C.SOLVER.MULTI_MAX_EPOCH = () # set a different max epoch for each training stage
749
+
750
+ _C.SOLVER.OPTIMIZER = "SGD" # "ADAMW"
751
+
752
+ _C.SOLVER.BASE_LR = 0.001
753
+
754
+ _C.SOLVER.LANG_LR = 0.00001
755
+ _C.SOLVER.BACKBONE_BODY_LR_FACTOR = 1.0
756
+
757
+ _C.SOLVER.BIAS_LR_FACTOR = 2
758
+ _C.SOLVER.GRAD_CLIP = 0.0
759
+ # D2 gradient clip
760
+ _C.SOLVER.CLIP_GRADIENTS = CN()
761
+ _C.SOLVER.CLIP_GRADIENTS.ENABLED = False
762
+ _C.SOLVER.CLIP_GRADIENTS.CLIP_VALUE = 0.0
763
+ _C.SOLVER.CLIP_GRADIENTS.CLIP_TYPE = "full_model"
764
+ _C.SOLVER.CLIP_GRADIENTS.NORM_TYPE = 2.0
765
+ _C.SOLVER.MODEL_EMA = 0.0
766
+
767
+ _C.SOLVER.MOMENTUM = 0.9
768
+
769
+ _C.SOLVER.WEIGHT_DECAY = 0.0005
770
+ _C.SOLVER.WEIGHT_DECAY_BIAS = 0.0
771
+ _C.SOLVER.WEIGHT_DECAY_NORM_FACTOR = 1.0
772
+
773
+ # use a cosine LR schedule instead of the default multi-step schedule
774
+ _C.SOLVER.USE_COSINE = False
775
+ _C.SOLVER.MIN_LR = 0.000001
776
+
777
+ _C.SOLVER.GAMMA = 0.1
778
+ _C.SOLVER.STEPS = (30000,)
779
+
780
+ _C.SOLVER.USE_AUTOSTEP = False
781
+ _C.SOLVER.STEP_PATIENCE = 5
782
+
783
+ _C.SOLVER.WARMUP_FACTOR = 1.0 / 3
784
+ _C.SOLVER.WARMUP_ITERS = 500
785
+ _C.SOLVER.WARMUP_METHOD = "linear"
786
+
787
+ _C.SOLVER.CHECKPOINT_PERIOD = 2500
788
+ _C.SOLVER.CHECKPOINT_PER_EPOCH = -1.0
789
+ _C.SOLVER.TEST_WITH_INFERENCE = False
790
+ _C.SOLVER.AUTO_TERMINATE_PATIENCE = -1
791
+ # Number of images per batch
792
+ # This is global, so if we have 8 GPUs and IMS_PER_BATCH = 16, each GPU will
793
+ # see 2 images per batch
794
+ _C.SOLVER.IMS_PER_BATCH = 16
795
+ # This is the max negative ratio allowed per batch
796
+ _C.SOLVER.MAX_NEG_PER_BATCH = 0.1
797
+
798
+ _C.SOLVER.SEED = 0
799
+ _C.SOLVER.DISABLE_OUTPUT_DISTRIBUTED = False
800
+
801
+
802
+ _C.SOLVER.PROMPT_PROBING_LEVEL = -1.0
803
+ # -1 means tuning the whole model;
804
+ # 1 means tuning the whole language model; 1.5 means tuning the box head as well
805
+
806
+ _C.SOLVER.FIND_UNUSED_PARAMETERS = True
807
+ _C.SOLVER.DATASET_LENGTH = -1 # just for logging purposes
808
+ _C.SOLVER.TUNING_HIGHLEVEL_OVERRIDE = None
809
+ _C.SOLVER.USE_EMA_FOR_MONITOR = False
810
+
811
+ _C.SOLVER.WEIGHT_DECAY_SCHEDULE = False
812
+ _C.SOLVER.WEIGHT_DECAY_SCHEDULE_RATIO = 0.667
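The warmup and step settings above follow the usual maskrcnn-benchmark `WarmupMultiStepLR` behaviour. A minimal sketch of the learning-rate rule they imply, assuming the standard linear warmup formula (this is an illustration of the defaults, not the scheduler implementation itself):

```python
# Hedged sketch: LR implied by BASE_LR, WARMUP_FACTOR, WARMUP_ITERS, STEPS, GAMMA,
# assuming the standard linear-warmup multi-step rule used by maskrcnn-benchmark.
def lr_at(iteration, base_lr=0.001, warmup_factor=1.0 / 3, warmup_iters=500,
          steps=(30000,), gamma=0.1):
    if iteration < warmup_iters:
        alpha = iteration / warmup_iters
        factor = warmup_factor * (1 - alpha) + alpha   # ramps linearly from 1/3 to 1
    else:
        factor = 1.0
    decay = gamma ** sum(1 for s in steps if iteration >= s)
    return base_lr * factor * decay

print(lr_at(0), lr_at(250), lr_at(500), lr_at(35000))
# ~0.000333, ~0.000667, 0.001, 0.0001
```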
813
+
814
+ # ---------------------------------------------------------------------------- #
815
+ # Specific test options
816
+ # ---------------------------------------------------------------------------- #
817
+ _C.TEST = CN()
818
+ _C.TEST.EXPECTED_RESULTS = []
819
+ _C.TEST.EXPECTED_RESULTS_SIGMA_TOL = 4
820
+ _C.TEST.DURING_TRAINING = False
821
+ # Number of images per batch
822
+ # This is global, so if we have 8 GPUs and IMS_PER_BATCH = 16, each GPU will
823
+ # see 2 images per batch
824
+ _C.TEST.IMS_PER_BATCH = 16
825
+ # Special Test Configuration
826
+ _C.TEST.USE_MULTISCALE = False
827
+ # _C.TEST.SCALES = (400, 600, 800, 1000, 1200, 1400)
828
+ # _C.TEST.RANGES = ((96, 10000), (64, 10000), (0, 10000), (0, 10000), (0, 256), (0, 192))
829
+ _C.TEST.SCALES = (400, 500, 600, 640, 700, 900, 1000, 1100, 1200, 1300, 1400, 1800)
830
+ _C.TEST.RANGES = ((96, 10000), (96, 10000), (64, 10000), (64, 10000), (64, 10000), (0, 10000), (0, 10000), (0, 256), (0, 256), (0, 192), (0, 192), (0, 96))
831
+ _C.TEST.MAX_SIZE = 2500
832
+ _C.TEST.FLIP = True
833
+ _C.TEST.SPECIAL_NMS = 'none' # ('none', 'soft-nms', 'vote', 'soft-vote')
834
+ _C.TEST.TH = 0.6 # threshold for nms or vote
835
+ _C.TEST.PRE_NMS_TOP_N = 1000
836
+ _C.TEST.NUM_CLASSES = 81
837
+ _C.TEST.SELECT_CLASSES = ()
838
+
839
+ _C.TEST.EVAL_TASK = ""
840
+ _C.TEST.SUBSET = -1
841
+ _C.TEST.CHUNKED_EVALUATION = -1
842
+ _C.TEST.MDETR_STYLE_AGGREGATE_CLASS_NUM = -1
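TEST.SCALES and TEST.RANGES are paired element by element. A small sketch of that pairing, assuming the usual Detectron-style multiscale box augmentation (as in `evaluation/box_aug.py`) where detections from each test scale are kept only if their size falls inside the matching range; the exact filtering rule should be checked against that file:

```python
# Hedged sketch: how each test scale pairs with an object-size range.
scales = (400, 500, 600, 640, 700, 900, 1000, 1100, 1200, 1300, 1400, 1800)
ranges = ((96, 10000), (96, 10000), (64, 10000), (64, 10000), (64, 10000),
          (0, 10000), (0, 10000), (0, 256), (0, 256), (0, 192), (0, 192), (0, 96))
for scale, (lo, hi) in zip(scales, ranges):
    print(f"shorter side {scale}px -> keep detections with size in [{lo}, {hi})")
```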
843
+ # ---------------------------------------------------------------------------- #
844
+ # Misc options
845
+ # ---------------------------------------------------------------------------- #
846
+ _C.OUTPUT_DIR = "OUTPUT"
847
+
848
+ _C.PATHS_CATALOG = os.path.join(os.path.dirname(__file__), "paths_catalog.py")
849
+
850
+ # TensorBoard experiment location
851
+ _C.TENSORBOARD_EXP = "OUTPUT"
852
+
853
+
854
+ _C.GLIPKNOW = CN()
855
+ _C.GLIPKNOW.KNOWLEDGE_FILE = ""
856
+ _C.GLIPKNOW.KNOWLEDGE_TYPE = ""
857
+ _C.GLIPKNOW.MAX_NUM_CLASSES_PER_BATCH_TRAIN = -1
858
+ _C.GLIPKNOW.PARALLEL_LANGUAGE_INPUT = False
859
+ _C.GLIPKNOW.LAN_FEATURE_AGG_TYPE = "first"
860
+ _C.GLIPKNOW.GPT3_NUM = 5
861
+ _C.GLIPKNOW.WIKI_AND_GPT3 = False
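These defaults are plain yacs `CfgNode` trees. A minimal sketch of how a runtime config is usually derived from them, assuming `cfg` is re-exported from `maskrcnn_benchmark.config` (the `__init__.py` in this commit) and using the shipped `configs/glip_Swin_T_O365_GoldG.yaml`:

```python
# Hedged sketch: standard yacs workflow over the defaults defined above.
# The import path for `cfg` is assumed from the package layout in this commit.
from maskrcnn_benchmark.config import cfg

run_cfg = cfg.clone()                                  # never mutate the global defaults
run_cfg.merge_from_file("configs/glip_Swin_T_O365_GoldG.yaml")
run_cfg.merge_from_list(["SOLVER.IMS_PER_BATCH", 8,    # command-line style overrides
                         "TEST.IMS_PER_BATCH", 8])
run_cfg.freeze()                                       # read-only from here on

# SOLVER.IMS_PER_BATCH is global: with 8 GPUs and IMS_PER_BATCH = 16,
# each GPU sees 16 // 8 = 2 images per iteration.
print(run_cfg.MODEL.SWINT.EMBED_DIM, run_cfg.SOLVER.IMS_PER_BATCH)
```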
maskrcnn_benchmark/config/paths_catalog.py ADDED
@@ -0,0 +1,447 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+ """Centralized catalog of paths."""
3
+
4
+ import os
5
+
6
+
7
+ def try_to_find(file, return_dir=False, search_path=['./DATASET', './OUTPUT', './data', './MODEL']):
8
+ if not file:
9
+ return file
10
+
11
+ if file.startswith('catalog://'):
12
+ return file
13
+
14
+ DATASET_PATH = ['./']
15
+ if 'DATASET' in os.environ:
16
+ DATASET_PATH.append(os.environ['DATASET'])
17
+ DATASET_PATH += search_path
18
+
19
+ for path in DATASET_PATH:
20
+ if os.path.exists(os.path.join(path, file)):
21
+ if return_dir:
22
+ return path
23
+ else:
24
+ return os.path.join(path, file)
25
+
26
+ print('Cannot find {} in {}'.format(file, DATASET_PATH))
27
+ exit(1)
28
+
29
+
30
+ class DatasetCatalog(object):
31
+ DATASETS = {
32
+ # pretrained grounding dataset
33
+ # mixed vg and coco
34
+ "mixed_train": {
35
+ "coco_img_dir": "coco/train2014",
36
+ "vg_img_dir": "gqa/images",
37
+ "ann_file": "mdetr_annotations/final_mixed_train.json",
38
+ },
39
+ "mixed_train_no_coco": {
40
+ "coco_img_dir": "coco/train2014",
41
+ "vg_img_dir": "gqa/images",
42
+ "ann_file": "mdetr_annotations/final_mixed_train_no_coco.json",
43
+ },
44
+
45
+ # flickr30k
46
+ "flickr30k_train": {
47
+ "img_folder": "flickr30k/flickr30k_images/train",
48
+ "ann_file": "mdetr_annotations/final_flickr_separateGT_train.json",
49
+ "is_train": True
50
+ },
51
+ "flickr30k_val": {
52
+ "img_folder": "flickr30k/flickr30k_images/val",
53
+ "ann_file": "mdetr_annotations/final_flickr_separateGT_val.json",
54
+ "is_train": False
55
+ },
56
+ "flickr30k_test": {
57
+ "img_folder": "flickr30k/flickr30k_images/test",
58
+ "ann_file": "mdetr_annotations/final_flickr_separateGT_test.json",
59
+ "is_train": False
60
+ },
61
+
62
+ # refcoco
63
+ "refexp_all_val": {
64
+ "img_dir": "refcoco/train2014",
65
+ "ann_file": "mdetr_annotations/final_refexp_val.json",
66
+ "is_train": False
67
+ },
68
+
69
+ # gqa
70
+ "gqa_val": {
71
+ "img_dir": "gqa/images",
72
+ "ann_file": "mdetr_annotations/final_gqa_val.json",
73
+ "is_train": False
74
+ },
75
+
76
+ # phrasecut
77
+ "phrasecut_train": {
78
+ "img_dir": "gqa/images",
79
+ "ann_file": "mdetr_annotations/finetune_phrasecut_train.json",
80
+ "is_train": True
81
+ },
82
+
83
+
84
+ # od to grounding
85
+ # coco tsv
86
+ "coco_dt_train": {
87
+ "dataset_file": "coco_dt",
88
+ "yaml_path": "coco_tsv/coco_obj.yaml",
89
+ "is_train": True,
90
+ },
91
+ "COCO_odinw_train_8copy_dt_train": {
92
+ "dataset_file": "coco_odinw_dt",
93
+ "yaml_path": "coco_tsv/COCO_odinw_train_8copy.yaml",
94
+ "is_train": True,
95
+ },
96
+ "COCO_odinw_val_dt_train": {
97
+ "dataset_file": "coco_odinw_dt",
98
+ "yaml_path": "coco_tsv/COCO_odinw_val.yaml",
99
+ "is_train": False,
100
+ },
101
+ # lvis tsv
102
+ "lvisv1_dt_train": {
103
+ "dataset_file": "lvisv1_dt",
104
+ "yaml_path": "coco_tsv/LVIS_v1_train.yaml",
105
+ "is_train": True,
106
+ },
107
+ "LVIS_odinw_train_8copy_dt_train": {
108
+ "dataset_file": "coco_odinw_dt",
109
+ "yaml_path": "coco_tsv/LVIS_odinw_train_8copy.yaml",
110
+ "is_train": True,
111
+ },
112
+ # object365 tsv
113
+ "object365_dt_train": {
114
+ "dataset_file": "object365_dt",
115
+ "yaml_path": "Objects365/objects365_train_vgoiv6.cas2000.yaml",
116
+ "is_train": True,
117
+ },
118
+ "object365_odinw_2copy_dt_train": {
119
+ "dataset_file": "object365_odinw_dt",
120
+ "yaml_path": "Objects365/objects365_train_odinw.cas2000_2copy.yaml",
121
+ "is_train": True,
122
+ },
123
+ "objects365_odtsv_train": {
124
+ "dataset_file": "objects365_odtsv",
125
+ "yaml_path": "Objects365/train.cas2000.yaml",
126
+ "is_train": True,
127
+ },
128
+ "objects365_odtsv_val": {
129
+ "dataset_file": "objects365_odtsv",
130
+ "yaml_path": "Objects365/val.yaml",
131
+ "is_train": False,
132
+ },
133
+
134
+ # ImageNet OD
135
+ "imagenetod_train_odinw_2copy_dt": {
136
+ "dataset_file": "imagenetod_odinw_dt",
137
+ "yaml_path": "imagenet_od/imagenetod_train_odinw_2copy.yaml",
138
+ "is_train": True,
139
+ },
140
+
141
+ # OpenImage OD
142
+ "oi_train_odinw_dt": {
143
+ "dataset_file": "oi_odinw_dt",
144
+ "yaml_path": "openimages_v5c/oi_train_odinw.cas.2000.yaml",
145
+ "is_train": True,
146
+ },
147
+
148
+ # vg tsv
149
+ "vg_dt_train": {
150
+ "dataset_file": "vg_dt",
151
+ "yaml_path": "visualgenome/train_vgoi6_clipped.yaml",
152
+ "is_train": True,
153
+ },
154
+
155
+ "vg_odinw_clipped_8copy_dt_train": {
156
+ "dataset_file": "vg_odinw_clipped_8copy_dt",
157
+ "yaml_path": "visualgenome/train_odinw_clipped_8copy.yaml",
158
+ "is_train": True,
159
+ },
160
+ "vg_vgoi6_clipped_8copy_dt_train": {
161
+ "dataset_file": "vg_vgoi6_clipped_8copy_dt",
162
+ "yaml_path": "visualgenome/train_vgoi6_clipped_8copy.yaml",
163
+ "is_train": True,
164
+ },
165
+
166
+ # coco json
167
+ "coco_grounding_train": {
168
+ "img_dir": "coco/train2017",
169
+ "ann_file": "coco/annotations/instances_train2017.json",
170
+ "is_train": True,
171
+ },
172
+
173
+ "lvis_grounding_train": {
174
+ "img_dir": "coco",
175
+ "ann_file": "coco/annotations/lvis_od_train.json"
176
+ },
177
+
178
+
179
+ "lvis_val": {
180
+ "img_dir": "coco",
181
+ "ann_file": "coco/annotations/lvis_od_val.json"
182
+ },
183
+ "coco_2017_train": {
184
+ "img_dir": "coco/train2017",
185
+ "ann_file": "coco/annotations/instances_train2017.json"
186
+ },
187
+ "coco_2017_val": {
188
+ "img_dir": "coco/val2017",
189
+ "ann_file": "coco/annotations/instances_val2017.json"
190
+ },
191
+ "coco_2017_test": {
192
+ "img_dir": "coco/test2017",
193
+ "ann_file": "coco/annotations/image_info_test-dev2017.json"
194
+ },
195
+ "coco_2014_train": {
196
+ "img_dir": "coco/train2014",
197
+ "ann_file": "coco/annotations/instances_train2014.json"
198
+ },
199
+ "coco_2014_val": {
200
+ "img_dir": "coco/val2014",
201
+ "ann_file": "coco/annotations/instances_val2014.json"
202
+ },
203
+ "coco_2014_minival": {
204
+ "img_dir": "coco/val2014",
205
+ "ann_file": "coco/annotations/instances_minival2014.json"
206
+ },
207
+ }
208
+
209
+ @staticmethod
210
+ def set(name, info):
211
+ DatasetCatalog.DATASETS.update({name: info})
212
+
213
+ @staticmethod
214
+ def get(name):
215
+
216
+ if name.endswith('_bg'):
217
+ attrs = DatasetCatalog.DATASETS[name]
218
+ data_dir = try_to_find(attrs["ann_file"], return_dir=True)
219
+ args = dict(
220
+ root=os.path.join(data_dir, attrs["img_dir"]),
221
+ ann_file=os.path.join(data_dir, attrs["ann_file"]),
222
+ )
223
+ return dict(
224
+ factory="Background",
225
+ args=args,
226
+ )
227
+ else:
228
+ if "bing" in name.split("_"):
229
+ attrs = DatasetCatalog.DATASETS["bing_caption_train"]
230
+ else:
231
+ attrs = DatasetCatalog.DATASETS[name]
232
+
233
+ if "voc" in name and 'split' in attrs:
234
+ data_dir = try_to_find(attrs["data_dir"], return_dir=True)
235
+ args = dict(
236
+ data_dir=os.path.join(data_dir, attrs["data_dir"]),
237
+ split=attrs["split"],
238
+ )
239
+ return dict(
240
+ factory="PascalVOCDataset",
241
+ args=args,
242
+ )
243
+ elif "mixed" in name:
244
+ vg_img_dir = try_to_find(attrs["vg_img_dir"], return_dir=True)
245
+ coco_img_dir = try_to_find(attrs["coco_img_dir"], return_dir=True)
246
+ ann_file = try_to_find(attrs["ann_file"], return_dir=True)
247
+ args = dict(
248
+ img_folder_coco=os.path.join(coco_img_dir, attrs["coco_img_dir"]),
249
+ img_folder_vg=os.path.join(vg_img_dir, attrs["vg_img_dir"]),
250
+ ann_file=os.path.join(ann_file, attrs["ann_file"])
251
+ )
252
+ return dict(
253
+ factory="MixedDataset",
254
+ args=args,
255
+ )
256
+ elif "flickr" in name:
257
+ img_dir = try_to_find(attrs["img_folder"], return_dir=True)
258
+ ann_dir = try_to_find(attrs["ann_file"], return_dir=True)
259
+ args = dict(
260
+ img_folder=os.path.join(img_dir, attrs["img_folder"]),
261
+ ann_file=os.path.join(ann_dir, attrs["ann_file"]),
262
+ is_train=attrs["is_train"]
263
+ )
264
+ return dict(
265
+ factory="FlickrDataset",
266
+ args=args,
267
+ )
268
+ elif "refexp" in name:
269
+ img_dir = try_to_find(attrs["img_dir"], return_dir=True)
270
+ ann_dir = try_to_find(attrs["ann_file"], return_dir=True)
271
+ args = dict(
272
+ img_folder=os.path.join(img_dir, attrs["img_dir"]),
273
+ ann_file=os.path.join(ann_dir, attrs["ann_file"]),
274
+ )
275
+ return dict(
276
+ factory="RefExpDataset",
277
+ args=args,
278
+ )
279
+ elif "gqa" in name:
280
+ img_dir = try_to_find(attrs["img_dir"], return_dir=True)
281
+ ann_dir = try_to_find(attrs["ann_file"], return_dir=True)
282
+ args = dict(
283
+ img_folder=os.path.join(img_dir, attrs["img_dir"]),
284
+ ann_file=os.path.join(ann_dir, attrs["ann_file"]),
285
+ )
286
+ return dict(
287
+ factory="GQADataset",
288
+ args=args,
289
+ )
290
+ elif "phrasecut" in name:
291
+ img_dir = try_to_find(attrs["img_dir"], return_dir=True)
292
+ ann_dir = try_to_find(attrs["ann_file"], return_dir=True)
293
+ args = dict(
294
+ img_folder=os.path.join(img_dir, attrs["img_dir"]),
295
+ ann_file=os.path.join(ann_dir, attrs["ann_file"]),
296
+ )
297
+ return dict(
298
+ factory="PhrasecutDetection",
299
+ args=args,
300
+ )
301
+ elif "_caption" in name:
302
+ yaml_path = try_to_find(attrs["yaml_path"], return_dir=True)
303
+ if "no_coco" in name:
304
+ yaml_name = attrs["yaml_name_no_coco"]
305
+ else:
306
+ yaml_name = attrs["yaml_name"]
307
+ yaml_file_name = "{}.{}.yaml".format(yaml_name, name.split("_")[2])
308
+ args = dict(
309
+ yaml_file=os.path.join(yaml_path, attrs["yaml_path"], yaml_file_name)
310
+ )
311
+ return dict(
312
+ factory="CaptionTSV",
313
+ args=args,
314
+ )
315
+ elif "inferencecap" in name:
316
+ yaml_file_name = try_to_find(attrs["yaml_path"])
317
+ args = dict(
318
+ yaml_file=yaml_file_name)
319
+ return dict(
320
+ factory="CaptionTSV",
321
+ args=args,
322
+ )
323
+ elif "pseudo_data" in name:
324
+ args = dict(
325
+ yaml_file=try_to_find(attrs["yaml_path"])
326
+ )
327
+ return dict(
328
+ factory="PseudoData",
329
+ args=args,
330
+ )
331
+ elif "_dt" in name:
332
+ dataset_file = attrs["dataset_file"]
333
+ yaml_path = try_to_find(attrs["yaml_path"], return_dir=True)
334
+ args = dict(
335
+ name=dataset_file,
336
+ yaml_file=os.path.join(yaml_path, attrs["yaml_path"]),
337
+ )
338
+ return dict(
339
+ factory="CocoDetectionTSV",
340
+ args=args,
341
+ )
342
+ elif "_odtsv" in name:
343
+ dataset_file = attrs["dataset_file"]
344
+ yaml_path = try_to_find(attrs["yaml_path"], return_dir=True)
345
+ args = dict(
346
+ name=dataset_file,
347
+ yaml_file=os.path.join(yaml_path, attrs["yaml_path"]),
348
+ )
349
+ return dict(
350
+ factory="ODTSVDataset",
351
+ args=args,
352
+ )
353
+ elif "_grounding" in name:
354
+ img_dir = try_to_find(attrs["img_dir"], return_dir=True)
355
+ ann_dir = try_to_find(attrs["ann_file"], return_dir=True)
356
+ args = dict(
357
+ img_folder=os.path.join(img_dir, attrs["img_dir"]),
358
+ ann_file=os.path.join(ann_dir, attrs["ann_file"]),
359
+ )
360
+ return dict(
361
+ factory="CocoGrounding",
362
+ args=args,
363
+ )
364
+ elif "lvis_evaluation" in name:
365
+ img_dir = try_to_find(attrs["img_dir"], return_dir=True)
366
+ ann_dir = try_to_find(attrs["ann_file"], return_dir=True)
367
+ args = dict(
368
+ img_folder=os.path.join(img_dir, attrs["img_dir"]),
369
+ ann_file=os.path.join(ann_dir, attrs["ann_file"]),
370
+ )
371
+ return dict(
372
+ factory="LvisDetection",
373
+ args=args,
374
+ )
375
+ else:
376
+ ann_dir = try_to_find(attrs["ann_file"], return_dir=True)
377
+ img_dir = try_to_find(attrs["img_dir"], return_dir=True)
378
+ args = dict(
379
+ root=os.path.join(img_dir, attrs["img_dir"]),
380
+ ann_file=os.path.join(ann_dir, attrs["ann_file"]),
381
+ )
382
+ for k, v in attrs.items():
383
+ args.update({k: os.path.join(ann_dir, v)})
384
+ return dict(
385
+ factory="COCODataset",
386
+ args=args,
387
+ )
388
+
389
+ raise RuntimeError("Dataset not available: {}".format(name))
390
+
391
+
392
+ class ModelCatalog(object):
393
+ S3_C2_DETECTRON_URL = "https://dl.fbaipublicfiles.com/detectron"
394
+ C2_IMAGENET_MODELS = {
395
+ "MSRA/R-50": "ImageNetPretrained/MSRA/R-50.pkl",
396
+ "MSRA/R-50-GN": "ImageNetPretrained/47261647/R-50-GN.pkl",
397
+ "MSRA/R-101": "ImageNetPretrained/MSRA/R-101.pkl",
398
+ "MSRA/R-101-GN": "ImageNetPretrained/47592356/R-101-GN.pkl",
399
+ "FAIR/20171220/X-101-32x8d": "ImageNetPretrained/20171220/X-101-32x8d.pkl",
400
+ "FAIR/20171220/X-101-64x4d": "ImageNetPretrained/FBResNeXt/X-101-64x4d.pkl",
401
+ }
402
+
403
+ C2_DETECTRON_SUFFIX = "output/train/coco_2014_train%3Acoco_2014_valminusminival/generalized_rcnn/model_final.pkl"
404
+ C2_DETECTRON_MODELS = {
405
+ "35857197/e2e_faster_rcnn_R-50-C4_1x": "01_33_49.iAX0mXvW",
406
+ "35857345/e2e_faster_rcnn_R-50-FPN_1x": "01_36_30.cUF7QR7I",
407
+ "35857890/e2e_faster_rcnn_R-101-FPN_1x": "01_38_50.sNxI7sX7",
408
+ "36761737/e2e_faster_rcnn_X-101-32x8d-FPN_1x": "06_31_39.5MIHi1fZ",
409
+ "35858791/e2e_mask_rcnn_R-50-C4_1x": "01_45_57.ZgkA7hPB",
410
+ "35858933/e2e_mask_rcnn_R-50-FPN_1x": "01_48_14.DzEQe4wC",
411
+ "35861795/e2e_mask_rcnn_R-101-FPN_1x": "02_31_37.KqyEK4tT",
412
+ "36761843/e2e_mask_rcnn_X-101-32x8d-FPN_1x": "06_35_59.RZotkLKI",
413
+ }
414
+
415
+ @staticmethod
416
+ def get(name):
417
+ if name.startswith("Caffe2Detectron/COCO"):
418
+ return ModelCatalog.get_c2_detectron_12_2017_baselines(name)
419
+ if name.startswith("ImageNetPretrained"):
420
+ return ModelCatalog.get_c2_imagenet_pretrained(name)
421
+ raise RuntimeError("model not present in the catalog {}".format(name))
422
+
423
+ @staticmethod
424
+ def get_c2_imagenet_pretrained(name):
425
+ prefix = ModelCatalog.S3_C2_DETECTRON_URL
426
+ name = name[len("ImageNetPretrained/"):]
427
+ name = ModelCatalog.C2_IMAGENET_MODELS[name]
428
+ url = "/".join([prefix, name])
429
+ return url
430
+
431
+ @staticmethod
432
+ def get_c2_detectron_12_2017_baselines(name):
433
+ # Detectron C2 models are stored following the structure
434
+ # prefix/<model_id>/2012_2017_baselines/<model_name>.yaml.<signature>/suffix
435
+ # we use as identifiers in the catalog Caffe2Detectron/COCO/<model_id>/<model_name>
436
+ prefix = ModelCatalog.S3_C2_DETECTRON_URL
437
+ suffix = ModelCatalog.C2_DETECTRON_SUFFIX
438
+ # remove identification prefix
439
+ name = name[len("Caffe2Detectron/COCO/"):]
440
+ # split in <model_id> and <model_name>
441
+ model_id, model_name = name.split("/")
442
+ # parsing to make it match the url address from the Caffe2 models
443
+ model_name = "{}.yaml".format(model_name)
444
+ signature = ModelCatalog.C2_DETECTRON_MODELS[name]
445
+ unique_name = ".".join([model_name, signature])
446
+ url = "/".join([prefix, model_id, "12_2017_baselines", unique_name, suffix])
447
+ return url
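As a rough usage sketch (assuming the annotation files actually live under one of the `try_to_find` search paths such as `./DATASET` or `./data`), `DatasetCatalog.get` turns a dataset name into a factory name plus constructor kwargs, and extra datasets can be registered at runtime with `DatasetCatalog.set`:

```python
# Hedged sketch: resolving and registering catalog entries. The paths are only
# examples; try_to_find() exits if the files are not found on disk.
from maskrcnn_benchmark.config.paths_catalog import DatasetCatalog

info = DatasetCatalog.get("coco_2017_val")
print(info["factory"])            # "COCODataset"
print(info["args"]["ann_file"])   # resolved path to instances_val2017.json

# Register an extra split without editing paths_catalog.py
# (the annotation filename below is hypothetical):
DatasetCatalog.set("coco_2017_val_subset", {
    "img_dir": "coco/val2017",
    "ann_file": "coco/annotations/instances_val2017_subset.json",
})
```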
maskrcnn_benchmark/csrc/ROIAlign.h ADDED
@@ -0,0 +1,46 @@
1
+ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+ #pragma once
3
+
4
+ #include "cpu/vision.h"
5
+
6
+ #ifdef WITH_CUDA
7
+ #include "cuda/vision.h"
8
+ #endif
9
+
10
+ // Interface for Python
11
+ at::Tensor ROIAlign_forward(const at::Tensor& input,
12
+ const at::Tensor& rois,
13
+ const float spatial_scale,
14
+ const int pooled_height,
15
+ const int pooled_width,
16
+ const int sampling_ratio) {
17
+ if (input.device().is_cuda()) {
18
+ #ifdef WITH_CUDA
19
+ return ROIAlign_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio);
20
+ #else
21
+ AT_ERROR("Not compiled with GPU support");
22
+ #endif
23
+ }
24
+ return ROIAlign_forward_cpu(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio);
25
+ }
26
+
27
+ at::Tensor ROIAlign_backward(const at::Tensor& grad,
28
+ const at::Tensor& rois,
29
+ const float spatial_scale,
30
+ const int pooled_height,
31
+ const int pooled_width,
32
+ const int batch_size,
33
+ const int channels,
34
+ const int height,
35
+ const int width,
36
+ const int sampling_ratio) {
37
+ if (grad.device().is_cuda()) {
38
+ #ifdef WITH_CUDA
39
+ return ROIAlign_backward_cuda(grad, rois, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width, sampling_ratio);
40
+ #else
41
+ AT_ERROR("Not compiled with GPU support");
42
+ #endif
43
+ }
44
+ AT_ERROR("Not implemented on the CPU");
45
+ }
46
+
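This header (like the ROIPool and SigmoidFocalLoss headers that follow) only dispatches between the CPU and CUDA implementations; the Python entry point is the extension compiled from `csrc/vision.cpp`. A hedged sketch of calling it, where the binding name `roi_align_forward` follows the upstream maskrcnn-benchmark convention and should be checked against `vision.cpp`:

```python
# Hedged sketch: exercising ROIAlign_forward from Python through the compiled
# extension. The binding name "roi_align_forward" is an assumption.
import torch
from maskrcnn_benchmark import _C

feats = torch.randn(1, 256, 50, 50)                      # (N, C, H, W) feature map
rois = torch.tensor([[0.0, 16.0, 16.0, 180.0, 180.0]])   # (batch_idx, x1, y1, x2, y2)
out = _C.roi_align_forward(feats, rois,
                           0.25,   # spatial_scale: image coords -> feature coords
                           7, 7,   # pooled_height, pooled_width
                           2)      # sampling_ratio
print(out.shape)  # torch.Size([1, 256, 7, 7])
```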
maskrcnn_benchmark/csrc/ROIPool.h ADDED
@@ -0,0 +1,48 @@
1
+ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+ #pragma once
3
+
4
+ #include "cpu/vision.h"
5
+
6
+ #ifdef WITH_CUDA
7
+ #include "cuda/vision.h"
8
+ #endif
9
+
10
+
11
+ std::tuple<at::Tensor, at::Tensor> ROIPool_forward(const at::Tensor& input,
12
+ const at::Tensor& rois,
13
+ const float spatial_scale,
14
+ const int pooled_height,
15
+ const int pooled_width) {
16
+ if (input.device().is_cuda()) {
17
+ #ifdef WITH_CUDA
18
+ return ROIPool_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width);
19
+ #else
20
+ AT_ERROR("Not compiled with GPU support");
21
+ #endif
22
+ }
23
+ AT_ERROR("Not implemented on the CPU");
24
+ }
25
+
26
+ at::Tensor ROIPool_backward(const at::Tensor& grad,
27
+ const at::Tensor& input,
28
+ const at::Tensor& rois,
29
+ const at::Tensor& argmax,
30
+ const float spatial_scale,
31
+ const int pooled_height,
32
+ const int pooled_width,
33
+ const int batch_size,
34
+ const int channels,
35
+ const int height,
36
+ const int width) {
37
+ if (grad.device().is_cuda()) {
38
+ #ifdef WITH_CUDA
39
+ return ROIPool_backward_cuda(grad, input, rois, argmax, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width);
40
+ #else
41
+ AT_ERROR("Not compiled with GPU support");
42
+ #endif
43
+ }
44
+ AT_ERROR("Not implemented on the CPU");
45
+ }
46
+
47
+
48
+
maskrcnn_benchmark/csrc/SigmoidFocalLoss.h ADDED
@@ -0,0 +1,41 @@
1
+ #pragma once
2
+
3
+ #include "cpu/vision.h"
4
+
5
+ #ifdef WITH_CUDA
6
+ #include "cuda/vision.h"
7
+ #endif
8
+
9
+ // Interface for Python
10
+ at::Tensor SigmoidFocalLoss_forward(
11
+ const at::Tensor& logits,
12
+ const at::Tensor& targets,
13
+ const int num_classes,
14
+ const float gamma,
15
+ const float alpha) {
16
+ if (logits.device().is_cuda()) {
17
+ #ifdef WITH_CUDA
18
+ return SigmoidFocalLoss_forward_cuda(logits, targets, num_classes, gamma, alpha);
19
+ #else
20
+ AT_ERROR("Not compiled with GPU support");
21
+ #endif
22
+ }
23
+ AT_ERROR("Not implemented on the CPU");
24
+ }
25
+
26
+ at::Tensor SigmoidFocalLoss_backward(
27
+ const at::Tensor& logits,
28
+ const at::Tensor& targets,
29
+ const at::Tensor& d_losses,
30
+ const int num_classes,
31
+ const float gamma,
32
+ const float alpha) {
33
+ if (logits.device().is_cuda()) {
34
+ #ifdef WITH_CUDA
35
+ return SigmoidFocalLoss_backward_cuda(logits, targets, d_losses, num_classes, gamma, alpha);
36
+ #else
37
+ AT_ERROR("Not compiled with GPU support");
38
+ #endif
39
+ }
40
+ AT_ERROR("Not implemented on the CPU");
41
+ }
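For reference, a pure-PyTorch version of the sigmoid focal loss this header dispatches. It is a sketch of the standard RetinaNet formulation with integer class targets (0 = background, 1..C = foreground classes), which is how the CUDA kernel is usually written; confirm the exact target encoding against `SigmoidFocalLoss_cuda.cu` before relying on it.

```python
# Hedged sketch: reference sigmoid focal loss, per anchor and per class.
import torch

def sigmoid_focal_loss(logits, targets, gamma=2.0, alpha=0.25):
    # logits: (N, C) raw scores; targets: (N,) int labels, 0 = background.
    num_classes = logits.shape[1]
    class_range = torch.arange(1, num_classes + 1, device=logits.device).unsqueeze(0)
    t = targets.unsqueeze(1)
    p = torch.sigmoid(logits)
    pos = (t == class_range).float()                      # this anchor's true class
    neg = ((t != class_range) & (t >= 0)).float()         # every other class
    loss_pos = pos * (1 - p) ** gamma * torch.log(p.clamp(min=1e-8))
    loss_neg = neg * p ** gamma * torch.log((1 - p).clamp(min=1e-8))
    return -(alpha * loss_pos + (1 - alpha) * loss_neg)   # (N, C); caller sums/normalizes

print(sigmoid_focal_loss(torch.randn(4, 80), torch.tensor([0, 3, 80, 17])).shape)
```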
maskrcnn_benchmark/csrc/cpu/ROIAlign_cpu.cpp ADDED
@@ -0,0 +1,257 @@
1
+ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+ #include "cpu/vision.h"
3
+
4
+ // implementation taken from Caffe2
5
+ template <typename T>
6
+ struct PreCalc {
7
+ int pos1;
8
+ int pos2;
9
+ int pos3;
10
+ int pos4;
11
+ T w1;
12
+ T w2;
13
+ T w3;
14
+ T w4;
15
+ };
16
+
17
+ template <typename T>
18
+ void pre_calc_for_bilinear_interpolate(
19
+ const int height,
20
+ const int width,
21
+ const int pooled_height,
22
+ const int pooled_width,
23
+ const int iy_upper,
24
+ const int ix_upper,
25
+ T roi_start_h,
26
+ T roi_start_w,
27
+ T bin_size_h,
28
+ T bin_size_w,
29
+ int roi_bin_grid_h,
30
+ int roi_bin_grid_w,
31
+ std::vector<PreCalc<T>>& pre_calc) {
32
+ int pre_calc_index = 0;
33
+ for (int ph = 0; ph < pooled_height; ph++) {
34
+ for (int pw = 0; pw < pooled_width; pw++) {
35
+ for (int iy = 0; iy < iy_upper; iy++) {
36
+ const T yy = roi_start_h + ph * bin_size_h +
37
+ static_cast<T>(iy + .5f) * bin_size_h /
38
+ static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
39
+ for (int ix = 0; ix < ix_upper; ix++) {
40
+ const T xx = roi_start_w + pw * bin_size_w +
41
+ static_cast<T>(ix + .5f) * bin_size_w /
42
+ static_cast<T>(roi_bin_grid_w);
43
+
44
+ T x = xx;
45
+ T y = yy;
46
+ // deal with the case where the sampling point falls outside the feature map boundary
47
+ if (y < -1.0 || y > height || x < -1.0 || x > width) {
48
+ // empty
49
+ PreCalc<T> pc;
50
+ pc.pos1 = 0;
51
+ pc.pos2 = 0;
52
+ pc.pos3 = 0;
53
+ pc.pos4 = 0;
54
+ pc.w1 = 0;
55
+ pc.w2 = 0;
56
+ pc.w3 = 0;
57
+ pc.w4 = 0;
58
+ pre_calc[pre_calc_index] = pc;
59
+ pre_calc_index += 1;
60
+ continue;
61
+ }
62
+
63
+ if (y <= 0) {
64
+ y = 0;
65
+ }
66
+ if (x <= 0) {
67
+ x = 0;
68
+ }
69
+
70
+ int y_low = (int)y;
71
+ int x_low = (int)x;
72
+ int y_high;
73
+ int x_high;
74
+
75
+ if (y_low >= height - 1) {
76
+ y_high = y_low = height - 1;
77
+ y = (T)y_low;
78
+ } else {
79
+ y_high = y_low + 1;
80
+ }
81
+
82
+ if (x_low >= width - 1) {
83
+ x_high = x_low = width - 1;
84
+ x = (T)x_low;
85
+ } else {
86
+ x_high = x_low + 1;
87
+ }
88
+
89
+ T ly = y - y_low;
90
+ T lx = x - x_low;
91
+ T hy = 1. - ly, hx = 1. - lx;
92
+ T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
93
+
94
+ // save weights and indices
95
+ PreCalc<T> pc;
96
+ pc.pos1 = y_low * width + x_low;
97
+ pc.pos2 = y_low * width + x_high;
98
+ pc.pos3 = y_high * width + x_low;
99
+ pc.pos4 = y_high * width + x_high;
100
+ pc.w1 = w1;
101
+ pc.w2 = w2;
102
+ pc.w3 = w3;
103
+ pc.w4 = w4;
104
+ pre_calc[pre_calc_index] = pc;
105
+
106
+ pre_calc_index += 1;
107
+ }
108
+ }
109
+ }
110
+ }
111
+ }
112
+
113
+ template <typename T>
114
+ void ROIAlignForward_cpu_kernel(
115
+ const int nthreads,
116
+ const T* bottom_data,
117
+ const T& spatial_scale,
118
+ const int channels,
119
+ const int height,
120
+ const int width,
121
+ const int pooled_height,
122
+ const int pooled_width,
123
+ const int sampling_ratio,
124
+ const T* bottom_rois,
125
+ //int roi_cols,
126
+ T* top_data) {
127
+ //AT_ASSERT(roi_cols == 4 || roi_cols == 5);
128
+ int roi_cols = 5;
129
+
130
+ int n_rois = nthreads / channels / pooled_width / pooled_height;
131
+ // (n, c, ph, pw) is an element in the pooled output
132
+ // can be parallelized using omp
133
+ // #pragma omp parallel for num_threads(32)
134
+ for (int n = 0; n < n_rois; n++) {
135
+ int index_n = n * channels * pooled_width * pooled_height;
136
+
137
+ // roi could have 4 or 5 columns
138
+ const T* offset_bottom_rois = bottom_rois + n * roi_cols;
139
+ int roi_batch_ind = 0;
140
+ if (roi_cols == 5) {
141
+ roi_batch_ind = offset_bottom_rois[0];
142
+ offset_bottom_rois++;
143
+ }
144
+
145
+ // Do not use rounding; this implementation detail is critical
146
+ T roi_start_w = offset_bottom_rois[0] * spatial_scale;
147
+ T roi_start_h = offset_bottom_rois[1] * spatial_scale;
148
+ T roi_end_w = offset_bottom_rois[2] * spatial_scale;
149
+ T roi_end_h = offset_bottom_rois[3] * spatial_scale;
150
+ // T roi_start_w = round(offset_bottom_rois[0] * spatial_scale);
151
+ // T roi_start_h = round(offset_bottom_rois[1] * spatial_scale);
152
+ // T roi_end_w = round(offset_bottom_rois[2] * spatial_scale);
153
+ // T roi_end_h = round(offset_bottom_rois[3] * spatial_scale);
154
+
155
+ // Force malformed ROIs to be 1x1
156
+ T roi_width = std::max(roi_end_w - roi_start_w, (T)1.);
157
+ T roi_height = std::max(roi_end_h - roi_start_h, (T)1.);
158
+ T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
159
+ T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
160
+
161
+ // We use roi_bin_grid to sample the grid and mimic integral
162
+ int roi_bin_grid_h = (sampling_ratio > 0)
163
+ ? sampling_ratio
164
+ : ceil(roi_height / pooled_height); // e.g., = 2
165
+ int roi_bin_grid_w =
166
+ (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
167
+
168
+ // We do average (integral) pooling inside a bin
169
+ const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
170
+
171
+ // we want to precalculate indices and weights shared by all channels;
172
+ // this is the key point of the optimization
173
+ std::vector<PreCalc<T>> pre_calc(
174
+ roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height);
175
+ pre_calc_for_bilinear_interpolate(
176
+ height,
177
+ width,
178
+ pooled_height,
179
+ pooled_width,
180
+ roi_bin_grid_h,
181
+ roi_bin_grid_w,
182
+ roi_start_h,
183
+ roi_start_w,
184
+ bin_size_h,
185
+ bin_size_w,
186
+ roi_bin_grid_h,
187
+ roi_bin_grid_w,
188
+ pre_calc);
189
+
190
+ for (int c = 0; c < channels; c++) {
191
+ int index_n_c = index_n + c * pooled_width * pooled_height;
192
+ const T* offset_bottom_data =
193
+ bottom_data + (roi_batch_ind * channels + c) * height * width;
194
+ int pre_calc_index = 0;
195
+
196
+ for (int ph = 0; ph < pooled_height; ph++) {
197
+ for (int pw = 0; pw < pooled_width; pw++) {
198
+ int index = index_n_c + ph * pooled_width + pw;
199
+
200
+ T output_val = 0.;
201
+ for (int iy = 0; iy < roi_bin_grid_h; iy++) {
202
+ for (int ix = 0; ix < roi_bin_grid_w; ix++) {
203
+ PreCalc<T> pc = pre_calc[pre_calc_index];
204
+ output_val += pc.w1 * offset_bottom_data[pc.pos1] +
205
+ pc.w2 * offset_bottom_data[pc.pos2] +
206
+ pc.w3 * offset_bottom_data[pc.pos3] +
207
+ pc.w4 * offset_bottom_data[pc.pos4];
208
+
209
+ pre_calc_index += 1;
210
+ }
211
+ }
212
+ output_val /= count;
213
+
214
+ top_data[index] = output_val;
215
+ } // for pw
216
+ } // for ph
217
+ } // for c
218
+ } // for n
219
+ }
220
+
221
+ at::Tensor ROIAlign_forward_cpu(const at::Tensor& input,
222
+ const at::Tensor& rois,
223
+ const float spatial_scale,
224
+ const int pooled_height,
225
+ const int pooled_width,
226
+ const int sampling_ratio) {
227
+ AT_ASSERTM(!input.device().is_cuda(), "input must be a CPU tensor");
228
+ AT_ASSERTM(!rois.device().is_cuda(), "rois must be a CPU tensor");
229
+
230
+ auto num_rois = rois.size(0);
231
+ auto channels = input.size(1);
232
+ auto height = input.size(2);
233
+ auto width = input.size(3);
234
+
235
+ auto output = at::empty({num_rois, channels, pooled_height, pooled_width}, input.options());
236
+ auto output_size = num_rois * pooled_height * pooled_width * channels;
237
+
238
+ if (output.numel() == 0) {
239
+ return output;
240
+ }
241
+
242
+ AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "ROIAlign_forward", [&] {
243
+ ROIAlignForward_cpu_kernel<scalar_t>(
244
+ output_size,
245
+ input.data_ptr<scalar_t>(),
246
+ spatial_scale,
247
+ channels,
248
+ height,
249
+ width,
250
+ pooled_height,
251
+ pooled_width,
252
+ sampling_ratio,
253
+ rois.data_ptr<scalar_t>(),
254
+ output.data_ptr<scalar_t>());
255
+ });
256
+ return output;
257
+ }
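The heart of the CPU kernel is the precomputed bilinear interpolation: for each sampling point it caches four neighbour indices and weights, and every channel reuses them. A small Python transcription of that weight computation, kept here only as a readable companion to `pre_calc_for_bilinear_interpolate`:

```python
# Hedged sketch: bilinear weights for one sampling point on an H x W map,
# mirroring the clamping rules of the C++ code above.
def bilinear_weights(y, x, height, width):
    if y < -1.0 or y > height or x < -1.0 or x > width:
        return (0, 0, 0, 0), (0.0, 0.0, 0.0, 0.0)          # point contributes nothing
    y, x = max(y, 0.0), max(x, 0.0)
    y_low, x_low = int(y), int(x)
    if y_low >= height - 1:
        y_high = y_low = height - 1
        y = float(y_low)
    else:
        y_high = y_low + 1
    if x_low >= width - 1:
        x_high = x_low = width - 1
        x = float(x_low)
    else:
        x_high = x_low + 1
    ly, lx = y - y_low, x - x_low
    hy, hx = 1.0 - ly, 1.0 - lx
    pos = (y_low * width + x_low, y_low * width + x_high,
           y_high * width + x_low, y_high * width + x_high)
    w = (hy * hx, hy * lx, ly * hx, ly * lx)
    return pos, w

pos, w = bilinear_weights(2.3, 4.7, height=8, width=8)
# pooled value = sum(w[k] * flat_feature[pos[k]]), averaged over the bin's samples
```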
maskrcnn_benchmark/csrc/cpu/nms_cpu.cpp ADDED
@@ -0,0 +1,75 @@
1
+ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+ #include "cpu/vision.h"
3
+
4
+
5
+ template <typename scalar_t>
6
+ at::Tensor nms_cpu_kernel(const at::Tensor& dets,
7
+ const at::Tensor& scores,
8
+ const float threshold) {
9
+ AT_ASSERTM(!dets.device().is_cuda(), "dets must be a CPU tensor");
10
+ AT_ASSERTM(!scores.device().is_cuda(), "scores must be a CPU tensor");
11
+ AT_ASSERTM(dets.type() == scores.type(), "dets should have the same type as scores");
12
+
13
+ if (dets.numel() == 0) {
14
+ return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU));
15
+ }
16
+
17
+ auto x1_t = dets.select(1, 0).contiguous();
18
+ auto y1_t = dets.select(1, 1).contiguous();
19
+ auto x2_t = dets.select(1, 2).contiguous();
20
+ auto y2_t = dets.select(1, 3).contiguous();
21
+
22
+ at::Tensor areas_t = (x2_t - x1_t + 1) * (y2_t - y1_t + 1);
23
+
24
+ auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
25
+
26
+ auto ndets = dets.size(0);
27
+ at::Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte).device(at::kCPU));
28
+
29
+ auto suppressed = suppressed_t.data_ptr<uint8_t>();
30
+ auto order = order_t.data_ptr<int64_t>();
31
+ auto x1 = x1_t.data_ptr<scalar_t>();
32
+ auto y1 = y1_t.data_ptr<scalar_t>();
33
+ auto x2 = x2_t.data_ptr<scalar_t>();
34
+ auto y2 = y2_t.data_ptr<scalar_t>();
35
+ auto areas = areas_t.data_ptr<scalar_t>();
36
+
37
+ for (int64_t _i = 0; _i < ndets; _i++) {
38
+ auto i = order[_i];
39
+ if (suppressed[i] == 1)
40
+ continue;
41
+ auto ix1 = x1[i];
42
+ auto iy1 = y1[i];
43
+ auto ix2 = x2[i];
44
+ auto iy2 = y2[i];
45
+ auto iarea = areas[i];
46
+
47
+ for (int64_t _j = _i + 1; _j < ndets; _j++) {
48
+ auto j = order[_j];
49
+ if (suppressed[j] == 1)
50
+ continue;
51
+ auto xx1 = std::max(ix1, x1[j]);
52
+ auto yy1 = std::max(iy1, y1[j]);
53
+ auto xx2 = std::min(ix2, x2[j]);
54
+ auto yy2 = std::min(iy2, y2[j]);
55
+
56
+ auto w = std::max(static_cast<scalar_t>(0), xx2 - xx1 + 1);
57
+ auto h = std::max(static_cast<scalar_t>(0), yy2 - yy1 + 1);
58
+ auto inter = w * h;
59
+ auto ovr = inter / (iarea + areas[j] - inter);
60
+ if (ovr >= threshold)
61
+ suppressed[j] = 1;
62
+ }
63
+ }
64
+ return at::nonzero(suppressed_t == 0).squeeze(1);
65
+ }
66
+
67
+ at::Tensor nms_cpu(const at::Tensor& dets,
68
+ const at::Tensor& scores,
69
+ const float threshold) {
70
+ at::Tensor result;
71
+ AT_DISPATCH_FLOATING_TYPES(dets.scalar_type(), "nms", [&] {
72
+ result = nms_cpu_kernel<scalar_t>(dets, scores, threshold);
73
+ });
74
+ return result;
75
+ }
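The same greedy suppression as a NumPy sketch, for readability. Note the `+ 1` in widths and heights, matching the kernel's pixel-inclusive box convention, and that a box is suppressed when its IoU with a kept box reaches the threshold:

```python
# Hedged sketch: NumPy equivalent of nms_cpu_kernel.
import numpy as np

def nms(dets, scores, threshold):
    x1, y1, x2, y2 = dets[:, 0], dets[:, 1], dets[:, 2], dets[:, 3]
    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    order = scores.argsort()[::-1]          # highest score first
    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(i)
        xx1 = np.maximum(x1[i], x1[order[1:]])
        yy1 = np.maximum(y1[i], y1[order[1:]])
        xx2 = np.minimum(x2[i], x2[order[1:]])
        yy2 = np.minimum(y2[i], y2[order[1:]])
        inter = np.maximum(0.0, xx2 - xx1 + 1) * np.maximum(0.0, yy2 - yy1 + 1)
        iou = inter / (areas[i] + areas[order[1:]] - inter)
        order = order[1:][iou < threshold]  # keep only boxes below the overlap threshold
    return np.array(keep, dtype=np.int64)

dets = np.array([[0, 0, 10, 10], [1, 1, 11, 11], [50, 50, 60, 60]], dtype=np.float32)
print(nms(dets, np.array([0.9, 0.8, 0.7]), threshold=0.5))  # -> [0 2]
```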
maskrcnn_benchmark/csrc/cpu/soft_nms.cpp ADDED
@@ -0,0 +1,117 @@
1
+ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+ #include "cpu/vision.h"
3
+
4
+
5
+ template <typename scalar_t>
6
+ std::pair<at::Tensor, at::Tensor> soft_nms_cpu_kernel(const at::Tensor& dets,
7
+ const at::Tensor& scores,
8
+ const float threshold,
9
+ const float sigma) {
10
+ AT_ASSERTM(!dets.device().is_cuda(), "dets must be a CPU tensor");
11
+ AT_ASSERTM(!scores.device().is_cuda(), "scores must be a CPU tensor");
12
+ AT_ASSERTM(dets.type() == scores.type(), "dets should have the same type as scores");
13
+
14
+ if (dets.numel() == 0) {
15
+ return std::make_pair(at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU)),
16
+ at::empty({0}, scores.options().dtype(at::kFloat).device(at::kCPU)));
17
+ }
18
+
19
+ auto x1_t = dets.select(1, 0).contiguous();
20
+ auto y1_t = dets.select(1, 1).contiguous();
21
+ auto x2_t = dets.select(1, 2).contiguous();
22
+ auto y2_t = dets.select(1, 3).contiguous();
23
+
24
+ auto scores_t = scores.clone();
25
+
26
+ at::Tensor areas_t = (x2_t - x1_t + 1) * (y2_t - y1_t + 1);
27
+ auto ndets = dets.size(0);
28
+ auto inds_t = at::arange(ndets, dets.options().dtype(at::kLong).device(at::kCPU));
29
+
30
+ auto x1 = x1_t.data_ptr<scalar_t>();
31
+ auto y1 = y1_t.data_ptr<scalar_t>();
32
+ auto x2 = x2_t.data_ptr<scalar_t>();
33
+ auto y2 = y2_t.data_ptr<scalar_t>();
34
+ auto s = scores_t.data_ptr<scalar_t>();
35
+ auto inds = inds_t.data_ptr<int64_t>();
36
+ auto areas = areas_t.data_ptr<scalar_t>();
37
+
38
+ for (int64_t i = 0; i < ndets; i++) {
39
+
40
+ auto ix1 = x1[i];
41
+ auto iy1 = y1[i];
42
+ auto ix2 = x2[i];
43
+ auto iy2 = y2[i];
44
+ auto is = s[i];
45
+ auto ii = inds[i];
46
+ auto iarea = areas[i];
47
+
48
+ auto maxpos = scores_t.slice(0, i, ndets).argmax().item<int64_t>() + i;
49
+
50
+ // add max box as a detection
51
+ x1[i] = x1[maxpos];
52
+ y1[i] = y1[maxpos];
53
+ x2[i] = x2[maxpos];
54
+ y2[i] = y2[maxpos];
55
+ s[i] = s[maxpos];
56
+ inds[i] = inds[maxpos];
57
+ areas[i] = areas[maxpos];
58
+
59
+ // swap ith box with position of max box
60
+ x1[maxpos] = ix1;
61
+ y1[maxpos] = iy1;
62
+ x2[maxpos] = ix2;
63
+ y2[maxpos] = iy2;
64
+ s[maxpos] = is;
65
+ inds[maxpos] = ii;
66
+ areas[maxpos] = iarea;
67
+
68
+ ix1 = x1[i];
69
+ iy1 = y1[i];
70
+ ix2 = x2[i];
71
+ iy2 = y2[i];
72
+ iarea = areas[i];
73
+
74
+ // NMS iterations, note that ndets changes if detection boxes
75
+ // fall below threshold
76
+ for (int64_t j = i + 1; j < ndets; j++) {
77
+ auto xx1 = std::max(ix1, x1[j]);
78
+ auto yy1 = std::max(iy1, y1[j]);
79
+ auto xx2 = std::min(ix2, x2[j]);
80
+ auto yy2 = std::min(iy2, y2[j]);
81
+
82
+ auto w = std::max(static_cast<scalar_t>(0), xx2 - xx1 + 1);
83
+ auto h = std::max(static_cast<scalar_t>(0), yy2 - yy1 + 1);
84
+
85
+ auto inter = w * h;
86
+ auto ovr = inter / (iarea + areas[j] - inter);
87
+
88
+ s[j] = s[j] * std::exp(- std::pow(ovr, 2.0) / sigma);
89
+
90
+ // if box score falls below threshold, discard the box by
91
+ // swapping it with the last box and updating ndets
92
+ if (s[j] < threshold) {
93
+ x1[j] = x1[ndets - 1];
94
+ y1[j] = y1[ndets - 1];
95
+ x2[j] = x2[ndets - 1];
96
+ y2[j] = y2[ndets - 1];
97
+ s[j] = s[ndets - 1];
98
+ inds[j] = inds[ndets - 1];
99
+ areas[j] = areas[ndets - 1];
100
+ j--;
101
+ ndets--;
102
+ }
103
+ }
104
+ }
105
+ return std::make_pair(inds_t.slice(0, 0, ndets), scores_t.slice(0, 0, ndets));
106
+ }
107
+
108
+ std::pair<at::Tensor, at::Tensor> soft_nms_cpu(const at::Tensor& dets,
109
+ const at::Tensor& scores,
110
+ const float threshold,
111
+ const float sigma) {
112
+ std::pair<at::Tensor, at::Tensor> result;
113
+ AT_DISPATCH_FLOATING_TYPES(dets.scalar_type(), "soft_nms", [&] {
114
+ result = soft_nms_cpu_kernel<scalar_t>(dets, scores, threshold, sigma);
115
+ });
116
+ return result;
117
+ }
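Instead of hard suppression, the kernel above decays the scores of overlapping boxes with a Gaussian of the overlap and drops a box only once its score falls below the threshold. A one-step NumPy sketch of that decay rule:

```python
# Hedged sketch: the Gaussian soft-NMS decay s_j <- s_j * exp(-iou^2 / sigma)
# applied by soft_nms_cpu_kernel after each max-scoring box is selected.
import numpy as np

def soft_nms_decay(selected, boxes, scores, sigma=0.5, score_thresh=0.001):
    x1, y1, x2, y2 = selected
    area = (x2 - x1 + 1) * (y2 - y1 + 1)
    areas = (boxes[:, 2] - boxes[:, 0] + 1) * (boxes[:, 3] - boxes[:, 1] + 1)
    inter = (np.maximum(0.0, np.minimum(x2, boxes[:, 2]) - np.maximum(x1, boxes[:, 0]) + 1) *
             np.maximum(0.0, np.minimum(y2, boxes[:, 3]) - np.maximum(y1, boxes[:, 1]) + 1))
    iou = inter / (area + areas - inter)
    decayed = scores * np.exp(-(iou ** 2) / sigma)
    return decayed, decayed >= score_thresh   # boxes falling below the threshold are dropped

boxes = np.array([[1, 1, 11, 11], [50, 50, 60, 60]], dtype=np.float32)
print(soft_nms_decay(np.array([0.0, 0.0, 10.0, 10.0]), boxes, np.array([0.8, 0.7])))
```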
maskrcnn_benchmark/csrc/cpu/vision.h ADDED
@@ -0,0 +1,22 @@
1
+ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+ #pragma once
3
+ #include <torch/extension.h>
4
+
5
+
6
+ at::Tensor ROIAlign_forward_cpu(const at::Tensor& input,
7
+ const at::Tensor& rois,
8
+ const float spatial_scale,
9
+ const int pooled_height,
10
+ const int pooled_width,
11
+ const int sampling_ratio);
12
+
13
+
14
+ at::Tensor nms_cpu(const at::Tensor& dets,
15
+ const at::Tensor& scores,
16
+ const float threshold);
17
+
18
+
19
+ std::pair<at::Tensor, at::Tensor> soft_nms_cpu(const at::Tensor& dets,
20
+ const at::Tensor& scores,
21
+ const float threshold,
22
+ const float sigma);
maskrcnn_benchmark/csrc/cuda/ROIAlign_cuda.cu ADDED
@@ -0,0 +1,346 @@
1
+ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+ #include <ATen/ATen.h>
3
+ #include <ATen/cuda/CUDAContext.h>
4
+
5
+ #include <THC/THC.h>
6
+ #include <THC/THCAtomics.cuh>
7
+ #include <THC/THCDeviceUtils.cuh>
8
+
9
+ // TODO make it in a common file
10
+ #define CUDA_1D_KERNEL_LOOP(i, n) \
11
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
12
+ i += blockDim.x * gridDim.x)
13
+
14
+
15
+ template <typename T>
16
+ __device__ T bilinear_interpolate(const T* bottom_data,
17
+ const int height, const int width,
18
+ T y, T x,
19
+ const int index /* index for debug only*/) {
20
+
21
+ // deal with cases where the sampling point falls outside the feature map boundary
22
+ if (y < -1.0 || y > height || x < -1.0 || x > width) {
23
+ //empty
24
+ return 0;
25
+ }
26
+
27
+ if (y <= 0) y = 0;
28
+ if (x <= 0) x = 0;
29
+
30
+ int y_low = (int) y;
31
+ int x_low = (int) x;
32
+ int y_high;
33
+ int x_high;
34
+
35
+ if (y_low >= height - 1) {
36
+ y_high = y_low = height - 1;
37
+ y = (T) y_low;
38
+ } else {
39
+ y_high = y_low + 1;
40
+ }
41
+
42
+ if (x_low >= width - 1) {
43
+ x_high = x_low = width - 1;
44
+ x = (T) x_low;
45
+ } else {
46
+ x_high = x_low + 1;
47
+ }
48
+
49
+ T ly = y - y_low;
50
+ T lx = x - x_low;
51
+ T hy = 1. - ly, hx = 1. - lx;
52
+ // do bilinear interpolation
53
+ T v1 = bottom_data[y_low * width + x_low];
54
+ T v2 = bottom_data[y_low * width + x_high];
55
+ T v3 = bottom_data[y_high * width + x_low];
56
+ T v4 = bottom_data[y_high * width + x_high];
57
+ T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
58
+
59
+ T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
60
+
61
+ return val;
62
+ }
63
+
64
+ template <typename T>
65
+ __global__ void RoIAlignForward(const int nthreads, const T* bottom_data,
66
+ const T spatial_scale, const int channels,
67
+ const int height, const int width,
68
+ const int pooled_height, const int pooled_width,
69
+ const int sampling_ratio,
70
+ const T* bottom_rois, T* top_data) {
71
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
72
+ // (n, c, ph, pw) is an element in the pooled output
73
+ int pw = index % pooled_width;
74
+ int ph = (index / pooled_width) % pooled_height;
75
+ int c = (index / pooled_width / pooled_height) % channels;
76
+ int n = index / pooled_width / pooled_height / channels;
77
+
78
+ const T* offset_bottom_rois = bottom_rois + n * 5;
79
+ int roi_batch_ind = offset_bottom_rois[0];
80
+
81
+ // Do not use rounding; this implementation detail is critical
82
+ T roi_start_w = offset_bottom_rois[1] * spatial_scale;
83
+ T roi_start_h = offset_bottom_rois[2] * spatial_scale;
84
+ T roi_end_w = offset_bottom_rois[3] * spatial_scale;
85
+ T roi_end_h = offset_bottom_rois[4] * spatial_scale;
86
+ // T roi_start_w = round(offset_bottom_rois[1] * spatial_scale);
87
+ // T roi_start_h = round(offset_bottom_rois[2] * spatial_scale);
88
+ // T roi_end_w = round(offset_bottom_rois[3] * spatial_scale);
89
+ // T roi_end_h = round(offset_bottom_rois[4] * spatial_scale);
90
+
91
+ // Force malformed ROIs to be 1x1
92
+ T roi_width = max(roi_end_w - roi_start_w, (T)1.);
93
+ T roi_height = max(roi_end_h - roi_start_h, (T)1.);
94
+ T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
95
+ T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
96
+
97
+ const T* offset_bottom_data = bottom_data + (roi_batch_ind * channels + c) * height * width;
98
+
99
+ // We use roi_bin_grid to sample the grid and mimic integral
100
+ int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); // e.g., = 2
101
+ int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
102
+
103
+ // We do average (integral) pooling inside a bin
104
+ const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
105
+
106
+ T output_val = 0.;
107
+ for (int iy = 0; iy < roi_bin_grid_h; iy ++) // e.g., iy = 0, 1
108
+ {
109
+ const T y = roi_start_h + ph * bin_size_h + static_cast<T>(iy + .5f) * bin_size_h / static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
110
+ for (int ix = 0; ix < roi_bin_grid_w; ix ++)
111
+ {
112
+ const T x = roi_start_w + pw * bin_size_w + static_cast<T>(ix + .5f) * bin_size_w / static_cast<T>(roi_bin_grid_w);
113
+
114
+ T val = bilinear_interpolate(offset_bottom_data, height, width, y, x, index);
115
+ output_val += val;
116
+ }
117
+ }
118
+ output_val /= count;
119
+
120
+ top_data[index] = output_val;
121
+ }
122
+ }
123
+
124
+
125
+ template <typename T>
126
+ __device__ void bilinear_interpolate_gradient(
127
+ const int height, const int width,
128
+ T y, T x,
129
+ T & w1, T & w2, T & w3, T & w4,
130
+ int & x_low, int & x_high, int & y_low, int & y_high,
131
+ const int index /* index for debug only*/) {
132
+
133
+ // deal with cases where the sampling point falls outside the feature map boundary
134
+ if (y < -1.0 || y > height || x < -1.0 || x > width) {
135
+ //empty
136
+ w1 = w2 = w3 = w4 = 0.;
137
+ x_low = x_high = y_low = y_high = -1;
138
+ return;
139
+ }
140
+
141
+ if (y <= 0) y = 0;
142
+ if (x <= 0) x = 0;
143
+
144
+ y_low = (int) y;
145
+ x_low = (int) x;
146
+
147
+ if (y_low >= height - 1) {
148
+ y_high = y_low = height - 1;
149
+ y = (T) y_low;
150
+ } else {
151
+ y_high = y_low + 1;
152
+ }
153
+
154
+ if (x_low >= width - 1) {
155
+ x_high = x_low = width - 1;
156
+ x = (T) x_low;
157
+ } else {
158
+ x_high = x_low + 1;
159
+ }
160
+
161
+ T ly = y - y_low;
162
+ T lx = x - x_low;
163
+ T hy = 1. - ly, hx = 1. - lx;
164
+
165
+ // reference in forward
166
+ // T v1 = bottom_data[y_low * width + x_low];
167
+ // T v2 = bottom_data[y_low * width + x_high];
168
+ // T v3 = bottom_data[y_high * width + x_low];
169
+ // T v4 = bottom_data[y_high * width + x_high];
170
+ // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
171
+
172
+ w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
173
+
174
+ return;
175
+ }
176
+
177
+ template <typename T>
178
+ __global__ void RoIAlignBackwardFeature(const int nthreads, const T* top_diff,
179
+ const int num_rois, const T spatial_scale,
180
+ const int channels, const int height, const int width,
181
+ const int pooled_height, const int pooled_width,
182
+ const int sampling_ratio,
183
+ T* bottom_diff,
184
+ const T* bottom_rois) {
185
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
186
+ // (n, c, ph, pw) is an element in the pooled output
187
+ int pw = index % pooled_width;
188
+ int ph = (index / pooled_width) % pooled_height;
189
+ int c = (index / pooled_width / pooled_height) % channels;
190
+ int n = index / pooled_width / pooled_height / channels;
191
+
192
+ const T* offset_bottom_rois = bottom_rois + n * 5;
193
+ int roi_batch_ind = offset_bottom_rois[0];
194
+
195
+ // Do not use rounding; this implementation detail is critical
196
+ T roi_start_w = offset_bottom_rois[1] * spatial_scale;
197
+ T roi_start_h = offset_bottom_rois[2] * spatial_scale;
198
+ T roi_end_w = offset_bottom_rois[3] * spatial_scale;
199
+ T roi_end_h = offset_bottom_rois[4] * spatial_scale;
200
+ // T roi_start_w = round(offset_bottom_rois[1] * spatial_scale);
201
+ // T roi_start_h = round(offset_bottom_rois[2] * spatial_scale);
202
+ // T roi_end_w = round(offset_bottom_rois[3] * spatial_scale);
203
+ // T roi_end_h = round(offset_bottom_rois[4] * spatial_scale);
204
+
205
+ // Force malformed ROIs to be 1x1
206
+ T roi_width = max(roi_end_w - roi_start_w, (T)1.);
207
+ T roi_height = max(roi_end_h - roi_start_h, (T)1.);
208
+ T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
209
+ T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
210
+
211
+ T* offset_bottom_diff = bottom_diff + (roi_batch_ind * channels + c) * height * width;
212
+
213
+ int top_offset = (n * channels + c) * pooled_height * pooled_width;
214
+ const T* offset_top_diff = top_diff + top_offset;
215
+ const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw];
216
+
217
+ // We use roi_bin_grid to sample the grid and mimic integral
218
+ int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); // e.g., = 2
219
+ int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
220
+
221
+ // We do average (integral) pooling inside a bin
222
+ const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
223
+
224
+ for (int iy = 0; iy < roi_bin_grid_h; iy ++) // e.g., iy = 0, 1
225
+ {
226
+ const T y = roi_start_h + ph * bin_size_h + static_cast<T>(iy + .5f) * bin_size_h / static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
227
+ for (int ix = 0; ix < roi_bin_grid_w; ix ++)
228
+ {
229
+ const T x = roi_start_w + pw * bin_size_w + static_cast<T>(ix + .5f) * bin_size_w / static_cast<T>(roi_bin_grid_w);
230
+
231
+ T w1, w2, w3, w4;
232
+ int x_low, x_high, y_low, y_high;
233
+
234
+ bilinear_interpolate_gradient(height, width, y, x,
235
+ w1, w2, w3, w4,
236
+ x_low, x_high, y_low, y_high,
237
+ index);
238
+
239
+ T g1 = top_diff_this_bin * w1 / count;
240
+ T g2 = top_diff_this_bin * w2 / count;
241
+ T g3 = top_diff_this_bin * w3 / count;
242
+ T g4 = top_diff_this_bin * w4 / count;
243
+
244
+ if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0)
245
+ {
246
+ atomicAdd(offset_bottom_diff + y_low * width + x_low, static_cast<T>(g1));
247
+ atomicAdd(offset_bottom_diff + y_low * width + x_high, static_cast<T>(g2));
248
+ atomicAdd(offset_bottom_diff + y_high * width + x_low, static_cast<T>(g3));
249
+ atomicAdd(offset_bottom_diff + y_high * width + x_high, static_cast<T>(g4));
250
+ } // if
251
+ } // ix
252
+ } // iy
253
+ } // CUDA_1D_KERNEL_LOOP
254
+ } // RoIAlignBackward
255
+
256
+
257
+ at::Tensor ROIAlign_forward_cuda(const at::Tensor& input,
258
+ const at::Tensor& rois,
259
+ const float spatial_scale,
260
+ const int pooled_height,
261
+ const int pooled_width,
262
+ const int sampling_ratio) {
263
+ AT_ASSERTM(input.device().is_cuda(), "input must be a CUDA tensor");
264
+ AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
265
+
266
+ auto num_rois = rois.size(0);
267
+ auto channels = input.size(1);
268
+ auto height = input.size(2);
269
+ auto width = input.size(3);
270
+
271
+ auto output = at::empty({num_rois, channels, pooled_height, pooled_width}, input.options());
272
+ auto output_size = num_rois * pooled_height * pooled_width * channels;
273
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream();
274
+
275
+ dim3 grid(std::min(THCCeilDiv(output_size, 512L), 4096L));
276
+ dim3 block(512);
277
+
278
+ if (output.numel() == 0) {
279
+ THCudaCheck(cudaGetLastError());
280
+ return output;
281
+ }
282
+
283
+ AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "ROIAlign_forward", [&] {
284
+ RoIAlignForward<scalar_t><<<grid, block, 0, stream>>>(
285
+ output_size,
286
+ input.contiguous().data_ptr<scalar_t>(),
287
+ spatial_scale,
288
+ channels,
289
+ height,
290
+ width,
291
+ pooled_height,
292
+ pooled_width,
293
+ sampling_ratio,
294
+ rois.contiguous().data_ptr<scalar_t>(),
295
+ output.data_ptr<scalar_t>());
296
+ });
297
+ THCudaCheck(cudaGetLastError());
298
+ return output;
299
+ }
300
+
301
+ // TODO remove the dependency on input and use instead its sizes -> save memory
302
+ at::Tensor ROIAlign_backward_cuda(const at::Tensor& grad,
303
+ const at::Tensor& rois,
304
+ const float spatial_scale,
305
+ const int pooled_height,
306
+ const int pooled_width,
307
+ const int batch_size,
308
+ const int channels,
309
+ const int height,
310
+ const int width,
311
+ const int sampling_ratio) {
312
+ AT_ASSERTM(grad.device().is_cuda(), "grad must be a CUDA tensor");
313
+ AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
314
+
315
+ auto num_rois = rois.size(0);
316
+ auto grad_input = at::zeros({batch_size, channels, height, width}, grad.options());
317
+
318
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream();
319
+
320
+ dim3 grid(std::min(THCCeilDiv(grad.numel(), 512L), 4096L));
321
+ dim3 block(512);
322
+
323
+ // handle possibly empty gradients
324
+ if (grad.numel() == 0) {
325
+ THCudaCheck(cudaGetLastError());
326
+ return grad_input;
327
+ }
328
+
329
+ AT_DISPATCH_FLOATING_TYPES(grad.scalar_type(), "ROIAlign_backward", [&] {
330
+ RoIAlignBackwardFeature<scalar_t><<<grid, block, 0, stream>>>(
331
+ grad.numel(),
332
+ grad.contiguous().data_ptr<scalar_t>(),
333
+ num_rois,
334
+ spatial_scale,
335
+ channels,
336
+ height,
337
+ width,
338
+ pooled_height,
339
+ pooled_width,
340
+ sampling_ratio,
341
+ grad_input.data_ptr<scalar_t>(),
342
+ rois.contiguous().data_ptr<scalar_t>());
343
+ });
344
+ THCudaCheck(cudaGetLastError());
345
+ return grad_input;
346
+ }
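Both CUDA kernels launch one thread per pooled output element, in 512-thread blocks with the grid capped at 4096 blocks; the `CUDA_1D_KERNEL_LOOP` grid-stride loop covers whatever the cap leaves over. The launch arithmetic, written out as a small sketch:

```python
# Hedged sketch: grid/block sizing used by ROIAlign_forward_cuda
# (THCCeilDiv(output_size, 512) capped at 4096 blocks, 512 threads per block).
def roi_align_launch_dims(num_rois, channels, pooled_h, pooled_w,
                          threads_per_block=512, max_blocks=4096):
    output_size = num_rois * channels * pooled_h * pooled_w      # one thread per output value
    blocks = min((output_size + threads_per_block - 1) // threads_per_block, max_blocks)
    return blocks, threads_per_block

print(roi_align_launch_dims(num_rois=512, channels=256, pooled_h=7, pooled_w=7))
# -> (4096, 512): the cap is hit, so each thread handles several outputs
#    via the grid-stride CUDA_1D_KERNEL_LOOP.
```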
maskrcnn_benchmark/csrc/cuda/ROIPool_cuda.cu ADDED
@@ -0,0 +1,202 @@
1
+ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+ #include <ATen/ATen.h>
3
+ #include <ATen/cuda/CUDAContext.h>
4
+
5
+ #include <THC/THC.h>
6
+ #include <THC/THCAtomics.cuh>
7
+ #include <THC/THCDeviceUtils.cuh>
8
+
9
+
10
+ // TODO make it in a common file
11
+ #define CUDA_1D_KERNEL_LOOP(i, n) \
12
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
13
+ i += blockDim.x * gridDim.x)
14
+
15
+
16
+ template <typename T>
17
+ __global__ void RoIPoolFForward(const int nthreads, const T* bottom_data,
18
+ const T spatial_scale, const int channels, const int height,
19
+ const int width, const int pooled_height, const int pooled_width,
20
+ const T* bottom_rois, T* top_data, int* argmax_data) {
21
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
22
+ // (n, c, ph, pw) is an element in the pooled output
23
+ int pw = index % pooled_width;
24
+ int ph = (index / pooled_width) % pooled_height;
25
+ int c = (index / pooled_width / pooled_height) % channels;
26
+ int n = index / pooled_width / pooled_height / channels;
27
+
28
+ const T* offset_bottom_rois = bottom_rois + n * 5;
29
+ int roi_batch_ind = offset_bottom_rois[0];
30
+ int roi_start_w = round(offset_bottom_rois[1] * spatial_scale);
31
+ int roi_start_h = round(offset_bottom_rois[2] * spatial_scale);
32
+ int roi_end_w = round(offset_bottom_rois[3] * spatial_scale);
33
+ int roi_end_h = round(offset_bottom_rois[4] * spatial_scale);
34
+
35
+ // Force malformed ROIs to be 1x1
36
+ int roi_width = max(roi_end_w - roi_start_w + 1, 1);
37
+ int roi_height = max(roi_end_h - roi_start_h + 1, 1);
38
+ T bin_size_h = static_cast<T>(roi_height)
39
+ / static_cast<T>(pooled_height);
40
+ T bin_size_w = static_cast<T>(roi_width)
41
+ / static_cast<T>(pooled_width);
42
+
43
+ int hstart = static_cast<int>(floor(static_cast<T>(ph)
44
+ * bin_size_h));
45
+ int wstart = static_cast<int>(floor(static_cast<T>(pw)
46
+ * bin_size_w));
47
+ int hend = static_cast<int>(ceil(static_cast<T>(ph + 1)
48
+ * bin_size_h));
49
+ int wend = static_cast<int>(ceil(static_cast<T>(pw + 1)
50
+ * bin_size_w));
51
+
52
+ // Add roi offsets and clip to input boundaries
53
+ hstart = min(max(hstart + roi_start_h, 0), height);
54
+ hend = min(max(hend + roi_start_h, 0), height);
55
+ wstart = min(max(wstart + roi_start_w, 0), width);
56
+ wend = min(max(wend + roi_start_w, 0), width);
57
+ bool is_empty = (hend <= hstart) || (wend <= wstart);
58
+
59
+ // Define an empty pooling region to be zero
60
+ T maxval = is_empty ? 0 : -FLT_MAX;
61
+ // If nothing is pooled, argmax = -1 causes nothing to be backprop'd
62
+ int maxidx = -1;
63
+ const T* offset_bottom_data =
64
+ bottom_data + (roi_batch_ind * channels + c) * height * width;
65
+ for (int h = hstart; h < hend; ++h) {
66
+ for (int w = wstart; w < wend; ++w) {
67
+ int bottom_index = h * width + w;
68
+ if (offset_bottom_data[bottom_index] > maxval) {
69
+ maxval = offset_bottom_data[bottom_index];
70
+ maxidx = bottom_index;
71
+ }
72
+ }
73
+ }
74
+ top_data[index] = maxval;
75
+ argmax_data[index] = maxidx;
76
+ }
77
+ }
78
+
79
+ template <typename T>
80
+ __global__ void RoIPoolFBackward(const int nthreads, const T* top_diff,
81
+ const int* argmax_data, const int num_rois, const T spatial_scale,
82
+ const int channels, const int height, const int width,
83
+ const int pooled_height, const int pooled_width, T* bottom_diff,
84
+ const T* bottom_rois) {
85
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
86
+ // (n, c, ph, pw) is an element in the pooled output
87
+ int pw = index % pooled_width;
88
+ int ph = (index / pooled_width) % pooled_height;
89
+ int c = (index / pooled_width / pooled_height) % channels;
90
+ int n = index / pooled_width / pooled_height / channels;
91
+
92
+ const T* offset_bottom_rois = bottom_rois + n * 5;
93
+ int roi_batch_ind = offset_bottom_rois[0];
94
+ int bottom_offset = (roi_batch_ind * channels + c) * height * width;
95
+ int top_offset = (n * channels + c) * pooled_height * pooled_width;
96
+ const T* offset_top_diff = top_diff + top_offset;
97
+ T* offset_bottom_diff = bottom_diff + bottom_offset;
98
+ const int* offset_argmax_data = argmax_data + top_offset;
99
+
100
+ int argmax = offset_argmax_data[ph * pooled_width + pw];
101
+ if (argmax != -1) {
102
+ atomicAdd(
103
+ offset_bottom_diff + argmax,
104
+ static_cast<T>(offset_top_diff[ph * pooled_width + pw]));
105
+
106
+ }
107
+ }
108
+ }
109
+
110
+ std::tuple<at::Tensor, at::Tensor> ROIPool_forward_cuda(const at::Tensor& input,
111
+ const at::Tensor& rois,
112
+ const float spatial_scale,
113
+ const int pooled_height,
114
+ const int pooled_width) {
115
+ AT_ASSERTM(input.device().is_cuda(), "input must be a CUDA tensor");
116
+ AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
117
+
118
+ auto num_rois = rois.size(0);
119
+ auto channels = input.size(1);
120
+ auto height = input.size(2);
121
+ auto width = input.size(3);
122
+
123
+ auto output = at::empty({num_rois, channels, pooled_height, pooled_width}, input.options());
124
+ auto output_size = num_rois * pooled_height * pooled_width * channels;
125
+ auto argmax = at::zeros({num_rois, channels, pooled_height, pooled_width}, input.options().dtype(at::kInt));
126
+
127
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream();
128
+
129
+ dim3 grid(std::min(THCCeilDiv(output_size, 512L), 4096L));
130
+ dim3 block(512);
131
+
132
+ if (output.numel() == 0) {
133
+ THCudaCheck(cudaGetLastError());
134
+ return std::make_tuple(output, argmax);
135
+ }
136
+
137
+ AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "ROIPool_forward", [&] {
138
+ RoIPoolFForward<scalar_t><<<grid, block, 0, stream>>>(
139
+ output_size,
140
+ input.contiguous().data_ptr<scalar_t>(),
141
+ spatial_scale,
142
+ channels,
143
+ height,
144
+ width,
145
+ pooled_height,
146
+ pooled_width,
147
+ rois.contiguous().data_ptr<scalar_t>(),
148
+ output.data_ptr<scalar_t>(),
149
+ argmax.data_ptr<int>());
150
+ });
151
+ THCudaCheck(cudaGetLastError());
152
+ return std::make_tuple(output, argmax);
153
+ }
154
+
155
+ // TODO remove the dependency on input and use instead its sizes -> save memory
156
+ at::Tensor ROIPool_backward_cuda(const at::Tensor& grad,
157
+ const at::Tensor& input,
158
+ const at::Tensor& rois,
159
+ const at::Tensor& argmax,
160
+ const float spatial_scale,
161
+ const int pooled_height,
162
+ const int pooled_width,
163
+ const int batch_size,
164
+ const int channels,
165
+ const int height,
166
+ const int width) {
167
+ AT_ASSERTM(grad.device().is_cuda(), "grad must be a CUDA tensor");
168
+ AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
169
+ // TODO add more checks
170
+
171
+ auto num_rois = rois.size(0);
172
+ auto grad_input = at::zeros({batch_size, channels, height, width}, grad.options());
173
+
174
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream();
175
+
176
+ dim3 grid(std::min(THCCeilDiv(grad.numel(), 512L), 4096L));
177
+ dim3 block(512);
178
+
179
+ // handle possibly empty gradients
180
+ if (grad.numel() == 0) {
181
+ THCudaCheck(cudaGetLastError());
182
+ return grad_input;
183
+ }
184
+
185
+ AT_DISPATCH_FLOATING_TYPES(grad.scalar_type(), "ROIPool_backward", [&] {
186
+ RoIPoolFBackward<scalar_t><<<grid, block, 0, stream>>>(
187
+ grad.numel(),
188
+ grad.contiguous().data_ptr<scalar_t>(),
189
+ argmax.data_ptr<int>(),
190
+ num_rois,
191
+ spatial_scale,
192
+ channels,
193
+ height,
194
+ width,
195
+ pooled_height,
196
+ pooled_width,
197
+ grad_input.data_ptr<scalar_t>(),
198
+ rois.contiguous().data_ptr<scalar_t>());
199
+ });
200
+ THCudaCheck(cudaGetLastError());
201
+ return grad_input;
202
+ }
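Editor's note (not part of the diff): `RoIPoolFForward` stores, for each pooled cell, the flat index of the maximal input element in `argmax_data`; `RoIPoolFBackward` then scatters the incoming gradient to exactly that element with `atomicAdd`, and an argmax of -1 (empty bin) contributes nothing. A CPU-side sketch of the per-cell bin arithmetic under the same conventions (function and parameter names are illustrative):

```cpp
// Illustrative CPU reference for one pooled cell of ROI max pooling.
#include <algorithm>
#include <cfloat>
#include <cmath>

// feature points at one (batch, channel) plane of size height x width.
// Returns the bin maximum and writes its flat index (h * width + w) to
// *argmax; *argmax == -1 means the clipped bin was empty.
float roi_pool_cell(const float* feature, int height, int width,
                    int roi_start_h, int roi_start_w,
                    int roi_height, int roi_width,
                    int pooled_height, int pooled_width,
                    int ph, int pw, int* argmax) {
  float bin_h = static_cast<float>(roi_height) / pooled_height;
  float bin_w = static_cast<float>(roi_width) / pooled_width;
  int hstart = std::min(std::max(int(std::floor(ph * bin_h)) + roi_start_h, 0), height);
  int hend   = std::min(std::max(int(std::ceil((ph + 1) * bin_h)) + roi_start_h, 0), height);
  int wstart = std::min(std::max(int(std::floor(pw * bin_w)) + roi_start_w, 0), width);
  int wend   = std::min(std::max(int(std::ceil((pw + 1) * bin_w)) + roi_start_w, 0), width);
  float maxval = (hend <= hstart || wend <= wstart) ? 0.f : -FLT_MAX;
  *argmax = -1;
  for (int h = hstart; h < hend; ++h)
    for (int w = wstart; w < wend; ++w)
      if (feature[h * width + w] > maxval) {
        maxval = feature[h * width + w];
        *argmax = h * width + w;
      }
  return maxval;
}
```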
maskrcnn_benchmark/csrc/cuda/SigmoidFocalLoss_cuda.cu ADDED
@@ -0,0 +1,188 @@
1
+ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+ // This file is modified from https://github.com/pytorch/pytorch/blob/master/modules/detectron/sigmoid_focal_loss_op.cu
3
+ // Cheng-Yang Fu
4
+ // cyfu@cs.unc.edu
5
+ #include <ATen/ATen.h>
6
+ #include <ATen/cuda/CUDAContext.h>
7
+
8
+ #include <THC/THC.h>
9
+ #include <THC/THCAtomics.cuh>
10
+ #include <THC/THCDeviceUtils.cuh>
11
+
12
+ #include <cfloat>
13
+
14
+ // TODO make it in a common file
15
+ #define CUDA_1D_KERNEL_LOOP(i, n) \
16
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
17
+ i += blockDim.x * gridDim.x)
18
+
19
+
20
+ template <typename T>
21
+ __global__ void SigmoidFocalLossForward(const int nthreads,
22
+ const T* logits,
23
+ const int* targets,
24
+ const int num_classes,
25
+ const float gamma,
26
+ const float alpha,
27
+ const int num,
28
+ T* losses) {
29
+ CUDA_1D_KERNEL_LOOP(i, nthreads) {
30
+
31
+ int n = i / num_classes;
32
+ int d = i % num_classes; // current class[0~79];
33
+ int t = targets[n]; // target class [1~80];
34
+
35
+ // Decide it is positive or negative case.
36
+ T c1 = (t == (d+1));
37
+ T c2 = (t>=0 & t != (d+1));
38
+
39
+ T zn = (1.0 - alpha);
40
+ T zp = (alpha);
41
+
42
+ // p = 1. / (1. + expf(-x)), i.e. p = sigmoid(x)
43
+ T p = 1. / (1. + expf(-logits[i]));
44
+
45
+ // (1-p)**gamma * log(p), where p = sigmoid(x)
46
+ T term1 = powf((1. - p), gamma) * logf(max(p, FLT_MIN));
47
+
48
+ // p**gamma * log(1-p)
49
+ T term2 = powf(p, gamma) *
50
+ (-1. * logits[i] * (logits[i] >= 0) -
51
+ logf(1. + expf(logits[i] - 2. * logits[i] * (logits[i] >= 0))));
52
+
53
+ losses[i] = 0.0;
54
+ losses[i] += -c1 * term1 * zp;
55
+ losses[i] += -c2 * term2 * zn;
56
+
57
+ } // CUDA_1D_KERNEL_LOOP
58
+ } // SigmoidFocalLossForward
59
+
60
+
61
+ template <typename T>
62
+ __global__ void SigmoidFocalLossBackward(const int nthreads,
63
+ const T* logits,
64
+ const int* targets,
65
+ const T* d_losses,
66
+ const int num_classes,
67
+ const float gamma,
68
+ const float alpha,
69
+ const int num,
70
+ T* d_logits) {
71
+ CUDA_1D_KERNEL_LOOP(i, nthreads) {
72
+
73
+ int n = i / num_classes;
74
+ int d = i % num_classes; // current class[0~79];
75
+ int t = targets[n]; // target class [1~80], 0 is background;
76
+
77
+ // Decide it is positive or negative case.
78
+ T c1 = (t == (d+1));
79
+ T c2 = (t>=0 & t != (d+1));
80
+
81
+ T zn = (1.0 - alpha);
82
+ T zp = (alpha);
83
+ // p = 1. / (1. + expf(-x)), i.e. p = sigmoid(x)
84
+ T p = 1. / (1. + expf(-logits[i]));
85
+
86
+ // (1-p)**g * (1 - p - g*p*log(p))
87
+ T term1 = powf((1. - p), gamma) *
88
+ (1. - p - (p * gamma * logf(max(p, FLT_MIN))));
89
+
90
+ // (p**g) * (g*(1-p)*log(1-p) - p)
91
+ T term2 = powf(p, gamma) *
92
+ ((-1. * logits[i] * (logits[i] >= 0) -
93
+ logf(1. + expf(logits[i] - 2. * logits[i] * (logits[i] >= 0)))) *
94
+ (1. - p) * gamma - p);
95
+ d_logits[i] = 0.0;
96
+ d_logits[i] += -c1 * term1 * zp;
97
+ d_logits[i] += -c2 * term2 * zn;
98
+ d_logits[i] = d_logits[i] * d_losses[i];
99
+
100
+ } // CUDA_1D_KERNEL_LOOP
101
+ } // SigmoidFocalLossBackward
102
+
103
+
104
+ at::Tensor SigmoidFocalLoss_forward_cuda(
105
+ const at::Tensor& logits,
106
+ const at::Tensor& targets,
107
+ const int num_classes,
108
+ const float gamma,
109
+ const float alpha) {
110
+ AT_ASSERTM(logits.device().is_cuda(), "logits must be a CUDA tensor");
111
+ AT_ASSERTM(targets.device().is_cuda(), "targets must be a CUDA tensor");
112
+ AT_ASSERTM(logits.dim() == 2, "logits should be NxClass");
113
+
114
+ const int num_samples = logits.size(0);
115
+
116
+ auto losses = at::empty({num_samples, logits.size(1)}, logits.options());
117
+ auto losses_size = num_samples * logits.size(1);
118
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream();
119
+
120
+ dim3 grid(std::min(THCCeilDiv(losses_size, 512L), 4096L));
121
+ dim3 block(512);
122
+
123
+ if (losses.numel() == 0) {
124
+ THCudaCheck(cudaGetLastError());
125
+ return losses;
126
+ }
127
+
128
+ AT_DISPATCH_FLOATING_TYPES(logits.scalar_type(), "SigmoidFocalLoss_forward", [&] {
129
+ SigmoidFocalLossForward<scalar_t><<<grid, block, 0, stream>>>(
130
+ losses_size,
131
+ logits.contiguous().data_ptr<scalar_t>(),
132
+ targets.contiguous().data_ptr<int>(),
133
+ num_classes,
134
+ gamma,
135
+ alpha,
136
+ num_samples,
137
+ losses.data_ptr<scalar_t>());
138
+ });
139
+ THCudaCheck(cudaGetLastError());
140
+ return losses;
141
+ }
142
+
143
+
144
+ at::Tensor SigmoidFocalLoss_backward_cuda(
145
+ const at::Tensor& logits,
146
+ const at::Tensor& targets,
147
+ const at::Tensor& d_losses,
148
+ const int num_classes,
149
+ const float gamma,
150
+ const float alpha) {
151
+ AT_ASSERTM(logits.device().is_cuda(), "logits must be a CUDA tensor");
152
+ AT_ASSERTM(targets.device().is_cuda(), "targets must be a CUDA tensor");
153
+ AT_ASSERTM(d_losses.device().is_cuda(), "d_losses must be a CUDA tensor");
154
+
155
+ AT_ASSERTM(logits.dim() == 2, "logits should be NxClass");
156
+
157
+ const int num_samples = logits.size(0);
158
+ AT_ASSERTM(logits.size(1) == num_classes, "logits.size(1) should be num_classes");
159
+
160
+ auto d_logits = at::zeros({num_samples, num_classes}, logits.options());
161
+ auto d_logits_size = num_samples * logits.size(1);
162
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream();
163
+
164
+ dim3 grid(std::min(THCCeilDiv(d_logits_size, 512L), 4096L));
165
+ dim3 block(512);
166
+
167
+ if (d_logits.numel() == 0) {
168
+ THCudaCheck(cudaGetLastError());
169
+ return d_logits;
170
+ }
171
+
172
+ AT_DISPATCH_FLOATING_TYPES(logits.scalar_type(), "SigmoidFocalLoss_backward", [&] {
173
+ SigmoidFocalLossBackward<scalar_t><<<grid, block, 0, stream>>>(
174
+ d_logits_size,
175
+ logits.contiguous().data_ptr<scalar_t>(),
176
+ targets.contiguous().data_ptr<int>(),
177
+ d_losses.contiguous().data_ptr<scalar_t>(),
178
+ num_classes,
179
+ gamma,
180
+ alpha,
181
+ num_samples,
182
+ d_logits.data_ptr<scalar_t>());
183
+ });
184
+
185
+ THCudaCheck(cudaGetLastError());
186
+ return d_logits;
187
+ }
188
+
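Editor's note (not part of the diff): both focal-loss kernels need log(1 - p) with p = sigmoid(x) and compute it as `-x*(x>=0) - logf(1 + expf(x - 2*x*(x>=0)))`, which is the usual overflow-safe rewrite -max(x, 0) - log(1 + exp(-|x|)). A standalone check of that identity (illustrative code only):

```cpp
// Illustrative numerical check of the stable log(1 - sigmoid(x)) form.
#include <cmath>
#include <cstdio>

double log_one_minus_sigmoid(double x) {
  double m = x > 0 ? x : 0.0;                    // m = max(x, 0)
  return -m - std::log1p(std::exp(x - 2.0 * m)); // exponent is always -|x|
}

int main() {
  // Agrees with the naive form for moderate x ...
  std::printf("%f vs %f\n", log_one_minus_sigmoid(2.0),
              std::log(1.0 - 1.0 / (1.0 + std::exp(-2.0))));
  // ... and stays finite where the naive form would overflow.
  std::printf("%f\n", log_one_minus_sigmoid(100.0)); // ~ -100
  return 0;
}
```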
maskrcnn_benchmark/csrc/cuda/deform_conv_cuda.cu ADDED
@@ -0,0 +1,691 @@
1
+ // modify from
2
+ // https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda.c
3
+
4
+ #include <ATen/ATen.h>
5
+ #include <ATen/cuda/CUDAContext.h>
6
+
7
+ #include <THC/THC.h>
8
+ #include <THC/THCDeviceUtils.cuh>
9
+
10
+ #include <vector>
11
+ #include <iostream>
12
+ #include <cmath>
13
+
14
+
15
+ void deformable_im2col(const at::Tensor data_im, const at::Tensor data_offset,
16
+ const int channels, const int height, const int width,
17
+ const int ksize_h, const int ksize_w, const int pad_h,
18
+ const int pad_w, const int stride_h, const int stride_w,
19
+ const int dilation_h, const int dilation_w,
20
+ const int parallel_imgs, const int deformable_group,
21
+ at::Tensor data_col);
22
+
23
+ void deformable_col2im(const at::Tensor data_col, const at::Tensor data_offset,
24
+ const int channels, const int height, const int width,
25
+ const int ksize_h, const int ksize_w, const int pad_h,
26
+ const int pad_w, const int stride_h, const int stride_w,
27
+ const int dilation_h, const int dilation_w,
28
+ const int parallel_imgs, const int deformable_group,
29
+ at::Tensor grad_im);
30
+
31
+ void deformable_col2im_coord(
32
+ const at::Tensor data_col, const at::Tensor data_im,
33
+ const at::Tensor data_offset, const int channels, const int height,
34
+ const int width, const int ksize_h, const int ksize_w, const int pad_h,
35
+ const int pad_w, const int stride_h, const int stride_w,
36
+ const int dilation_h, const int dilation_w, const int parallel_imgs,
37
+ const int deformable_group, at::Tensor grad_offset);
38
+
39
+ void modulated_deformable_im2col_cuda(
40
+ const at::Tensor data_im, const at::Tensor data_offset,
41
+ const at::Tensor data_mask, const int batch_size, const int channels,
42
+ const int height_im, const int width_im, const int height_col,
43
+ const int width_col, const int kernel_h, const int kernel_w,
44
+ const int pad_h, const int pad_w, const int stride_h, const int stride_w,
45
+ const int dilation_h, const int dilation_w, const int deformable_group,
46
+ at::Tensor data_col);
47
+
48
+ void modulated_deformable_col2im_cuda(
49
+ const at::Tensor data_col, const at::Tensor data_offset,
50
+ const at::Tensor data_mask, const int batch_size, const int channels,
51
+ const int height_im, const int width_im, const int height_col,
52
+ const int width_col, const int kernel_h, const int kernel_w,
53
+ const int pad_h, const int pad_w, const int stride_h, const int stride_w,
54
+ const int dilation_h, const int dilation_w, const int deformable_group,
55
+ at::Tensor grad_im);
56
+
57
+ void modulated_deformable_col2im_coord_cuda(
58
+ const at::Tensor data_col, const at::Tensor data_im,
59
+ const at::Tensor data_offset, const at::Tensor data_mask,
60
+ const int batch_size, const int channels, const int height_im,
61
+ const int width_im, const int height_col, const int width_col,
62
+ const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
63
+ const int stride_h, const int stride_w, const int dilation_h,
64
+ const int dilation_w, const int deformable_group, at::Tensor grad_offset,
65
+ at::Tensor grad_mask);
66
+
67
+ void shape_check(at::Tensor input, at::Tensor offset, at::Tensor *gradOutput,
68
+ at::Tensor weight, int kH, int kW, int dH, int dW, int padH,
69
+ int padW, int dilationH, int dilationW, int group,
70
+ int deformable_group)
71
+ {
72
+ TORCH_CHECK(weight.ndimension() == 4,
73
+ "4D weight tensor (nOutputPlane,nInputPlane,kH,kW) expected, "
74
+ "but got: %s",
75
+ weight.ndimension());
76
+
77
+ TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
78
+
79
+ TORCH_CHECK(kW > 0 && kH > 0,
80
+ "kernel size should be greater than zero, but got kH: %d kW: %d", kH,
81
+ kW);
82
+
83
+ TORCH_CHECK((weight.size(2) == kH && weight.size(3) == kW),
84
+ "kernel size should be consistent with weight, ",
85
+ "but got kH: %d kW: %d weight.size(2): %d, weight.size(3): %d", kH,
86
+ kW, weight.size(2), weight.size(3));
87
+
88
+ TORCH_CHECK(dW > 0 && dH > 0,
89
+ "stride should be greater than zero, but got dH: %d dW: %d", dH, dW);
90
+
91
+ TORCH_CHECK(
92
+ dilationW > 0 && dilationH > 0,
93
+ "dilation should be greater than 0, but got dilationH: %d dilationW: %d",
94
+ dilationH, dilationW);
95
+
96
+ int ndim = input.ndimension();
97
+ int dimf = 0;
98
+ int dimh = 1;
99
+ int dimw = 2;
100
+
101
+ if (ndim == 4) {
102
+ dimf++;
103
+ dimh++;
104
+ dimw++;
105
+ }
106
+
107
+ TORCH_CHECK(ndim == 3 || ndim == 4, "3D or 4D input tensor expected but got: %s",
108
+ ndim);
109
+
110
+ long nInputPlane = weight.size(1) * group;
111
+ long inputHeight = input.size(dimh);
112
+ long inputWidth = input.size(dimw);
113
+ long nOutputPlane = weight.size(0);
114
+ long outputHeight =
115
+ (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
116
+ long outputWidth =
117
+ (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
118
+
119
+ TORCH_CHECK(nInputPlane % deformable_group == 0,
120
+ "input channels must divide deformable group size");
121
+
122
+ if (outputWidth < 1 || outputHeight < 1)
123
+ AT_ERROR(
124
+ "Given input size: (%ld x %ld x %ld). "
125
+ "Calculated output size: (%ld x %ld x %ld). Output size is too small",
126
+ nInputPlane, inputHeight, inputWidth, nOutputPlane, outputHeight,
127
+ outputWidth);
128
+
129
+ TORCH_CHECK(input.size(1) == nInputPlane,
130
+ "invalid number of input planes, expected: %d, but got: %d",
131
+ nInputPlane, input.size(1));
132
+
133
+ TORCH_CHECK((inputHeight >= kH && inputWidth >= kW),
134
+ "input image is smaller than kernel");
135
+
136
+ TORCH_CHECK((offset.size(2) == outputHeight && offset.size(3) == outputWidth),
137
+ "invalid spatial size of offset, expected height: %d width: %d, but "
138
+ "got height: %d width: %d",
139
+ outputHeight, outputWidth, offset.size(2), offset.size(3));
140
+
141
+ TORCH_CHECK((offset.size(1) == deformable_group * 2 * kH * kW),
142
+ "invalid number of channels of offset");
143
+
144
+ if (gradOutput != NULL) {
145
+ TORCH_CHECK(gradOutput->size(dimf) == nOutputPlane,
146
+ "invalid number of gradOutput planes, expected: %d, but got: %d",
147
+ nOutputPlane, gradOutput->size(dimf));
148
+
149
+ TORCH_CHECK((gradOutput->size(dimh) == outputHeight &&
150
+ gradOutput->size(dimw) == outputWidth),
151
+ "invalid size of gradOutput, expected height: %d width: %d , but "
152
+ "got height: %d width: %d",
153
+ outputHeight, outputWidth, gradOutput->size(dimh),
154
+ gradOutput->size(dimw));
155
+ }
156
+ }
157
+
158
+ int deform_conv_forward_cuda(at::Tensor input, at::Tensor weight,
159
+ at::Tensor offset, at::Tensor output,
160
+ at::Tensor columns, at::Tensor ones, int kW,
161
+ int kH, int dW, int dH, int padW, int padH,
162
+ int dilationW, int dilationH, int group,
163
+ int deformable_group, int im2col_step)
164
+ {
165
+ // todo: resize columns to include im2col: done
166
+ // todo: add im2col_step as input
167
+ // todo: add new output buffer and transpose it to output (or directly
168
+ // transpose output) todo: possibly change data indexing because of
169
+ // parallel_imgs
170
+
171
+ shape_check(input, offset, NULL, weight, kH, kW, dH, dW, padH, padW,
172
+ dilationH, dilationW, group, deformable_group);
173
+
174
+ input = input.contiguous();
175
+ offset = offset.contiguous();
176
+ weight = weight.contiguous();
177
+
178
+ int batch = 1;
179
+ if (input.ndimension() == 3) {
180
+ // Force batch
181
+ batch = 0;
182
+ input.unsqueeze_(0);
183
+ offset.unsqueeze_(0);
184
+ }
185
+
186
+ // todo: assert batchsize dividable by im2col_step
187
+
188
+ long batchSize = input.size(0);
189
+ long nInputPlane = input.size(1);
190
+ long inputHeight = input.size(2);
191
+ long inputWidth = input.size(3);
192
+
193
+ long nOutputPlane = weight.size(0);
194
+
195
+ long outputWidth =
196
+ (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
197
+ long outputHeight =
198
+ (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
199
+
200
+ TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");
201
+
202
+ output = output.view({batchSize / im2col_step, im2col_step, nOutputPlane,
203
+ outputHeight, outputWidth});
204
+ columns = at::zeros(
205
+ {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
206
+ input.options());
207
+
208
+ if (ones.ndimension() != 2 ||
209
+ ones.size(0) * ones.size(1) < outputHeight * outputWidth) {
210
+ ones = at::ones({outputHeight, outputWidth}, input.options());
211
+ }
212
+
213
+ input = input.view({batchSize / im2col_step, im2col_step, nInputPlane,
214
+ inputHeight, inputWidth});
215
+ offset =
216
+ offset.view({batchSize / im2col_step, im2col_step,
217
+ deformable_group * 2 * kH * kW, outputHeight, outputWidth});
218
+
219
+ at::Tensor output_buffer =
220
+ at::zeros({batchSize / im2col_step, nOutputPlane,
221
+ im2col_step * outputHeight, outputWidth},
222
+ output.options());
223
+
224
+ output_buffer = output_buffer.view(
225
+ {output_buffer.size(0), group, output_buffer.size(1) / group,
226
+ output_buffer.size(2), output_buffer.size(3)});
227
+
228
+ for (int elt = 0; elt < batchSize / im2col_step; elt++) {
229
+ deformable_im2col(input[elt], offset[elt], nInputPlane, inputHeight,
230
+ inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
231
+ dilationW, im2col_step, deformable_group, columns);
232
+
233
+ columns = columns.view({group, columns.size(0) / group, columns.size(1)});
234
+ weight = weight.view({group, weight.size(0) / group, weight.size(1),
235
+ weight.size(2), weight.size(3)});
236
+
237
+ for (int g = 0; g < group; g++) {
238
+ output_buffer[elt][g] = output_buffer[elt][g]
239
+ .flatten(1)
240
+ .addmm_(weight[g].flatten(1), columns[g])
241
+ .view_as(output_buffer[elt][g]);
242
+ }
243
+ }
244
+
245
+ output_buffer = output_buffer.view(
246
+ {output_buffer.size(0), output_buffer.size(1) * output_buffer.size(2),
247
+ output_buffer.size(3), output_buffer.size(4)});
248
+
249
+ output_buffer = output_buffer.view({batchSize / im2col_step, nOutputPlane,
250
+ im2col_step, outputHeight, outputWidth});
251
+ output_buffer.transpose_(1, 2);
252
+ output.copy_(output_buffer);
253
+ output = output.view({batchSize, nOutputPlane, outputHeight, outputWidth});
254
+
255
+ input = input.view({batchSize, nInputPlane, inputHeight, inputWidth});
256
+ offset = offset.view(
257
+ {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});
258
+
259
+ if (batch == 0) {
260
+ output = output.view({nOutputPlane, outputHeight, outputWidth});
261
+ input = input.view({nInputPlane, inputHeight, inputWidth});
262
+ offset = offset.view({offset.size(1), offset.size(2), offset.size(3)});
263
+ }
264
+
265
+ return 1;
266
+ }
267
+
268
+ int deform_conv_backward_input_cuda(at::Tensor input, at::Tensor offset,
269
+ at::Tensor gradOutput, at::Tensor gradInput,
270
+ at::Tensor gradOffset, at::Tensor weight,
271
+ at::Tensor columns, int kW, int kH, int dW,
272
+ int dH, int padW, int padH, int dilationW,
273
+ int dilationH, int group,
274
+ int deformable_group, int im2col_step)
275
+ {
276
+ shape_check(input, offset, &gradOutput, weight, kH, kW, dH, dW, padH, padW,
277
+ dilationH, dilationW, group, deformable_group);
278
+
279
+ input = input.contiguous();
280
+ offset = offset.contiguous();
281
+ gradOutput = gradOutput.contiguous();
282
+ weight = weight.contiguous();
283
+
284
+ int batch = 1;
285
+
286
+ if (input.ndimension() == 3) {
287
+ // Force batch
288
+ batch = 0;
289
+ input = input.view({1, input.size(0), input.size(1), input.size(2)});
290
+ offset = offset.view({1, offset.size(0), offset.size(1), offset.size(2)});
291
+ gradOutput = gradOutput.view(
292
+ {1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)});
293
+ }
294
+
295
+ long batchSize = input.size(0);
296
+ long nInputPlane = input.size(1);
297
+ long inputHeight = input.size(2);
298
+ long inputWidth = input.size(3);
299
+
300
+ long nOutputPlane = weight.size(0);
301
+
302
+ long outputWidth =
303
+ (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
304
+ long outputHeight =
305
+ (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
306
+
307
+ TORCH_CHECK((offset.size(0) == batchSize), 3, "invalid batch size of offset");
308
+ gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth});
309
+ columns = at::zeros(
310
+ {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
311
+ input.options());
312
+
313
+ // change order of grad output
314
+ gradOutput = gradOutput.view({batchSize / im2col_step, im2col_step,
315
+ nOutputPlane, outputHeight, outputWidth});
316
+ gradOutput.transpose_(1, 2);
317
+
318
+ gradInput = gradInput.view({batchSize / im2col_step, im2col_step, nInputPlane,
319
+ inputHeight, inputWidth});
320
+ input = input.view({batchSize / im2col_step, im2col_step, nInputPlane,
321
+ inputHeight, inputWidth});
322
+ gradOffset = gradOffset.view({batchSize / im2col_step, im2col_step,
323
+ deformable_group * 2 * kH * kW, outputHeight,
324
+ outputWidth});
325
+ offset =
326
+ offset.view({batchSize / im2col_step, im2col_step,
327
+ deformable_group * 2 * kH * kW, outputHeight, outputWidth});
328
+
329
+ for (int elt = 0; elt < batchSize / im2col_step; elt++) {
330
+ // divide into groups
331
+ columns = columns.view({group, columns.size(0) / group, columns.size(1)});
332
+ weight = weight.view({group, weight.size(0) / group, weight.size(1),
333
+ weight.size(2), weight.size(3)});
334
+ gradOutput = gradOutput.view(
335
+ {gradOutput.size(0), group, gradOutput.size(1) / group,
336
+ gradOutput.size(2), gradOutput.size(3), gradOutput.size(4)});
337
+
338
+ for (int g = 0; g < group; g++) {
339
+ columns[g] = columns[g].addmm_(weight[g].flatten(1).transpose(0, 1),
340
+ gradOutput[elt][g].flatten(1), 0.0f, 1.0f);
341
+ }
342
+
343
+ columns =
344
+ columns.view({columns.size(0) * columns.size(1), columns.size(2)});
345
+ gradOutput = gradOutput.view(
346
+ {gradOutput.size(0), gradOutput.size(1) * gradOutput.size(2),
347
+ gradOutput.size(3), gradOutput.size(4), gradOutput.size(5)});
348
+
349
+ deformable_col2im_coord(columns, input[elt], offset[elt], nInputPlane,
350
+ inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
351
+ dilationH, dilationW, im2col_step, deformable_group,
352
+ gradOffset[elt]);
353
+
354
+ deformable_col2im(columns, offset[elt], nInputPlane, inputHeight,
355
+ inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
356
+ dilationW, im2col_step, deformable_group, gradInput[elt]);
357
+ }
358
+
359
+ gradOutput.transpose_(1, 2);
360
+ gradOutput =
361
+ gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth});
362
+
363
+ gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth});
364
+ input = input.view({batchSize, nInputPlane, inputHeight, inputWidth});
365
+ gradOffset = gradOffset.view(
366
+ {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});
367
+ offset = offset.view(
368
+ {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});
369
+
370
+ if (batch == 0) {
371
+ gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth});
372
+ input = input.view({nInputPlane, inputHeight, inputWidth});
373
+ gradInput = gradInput.view({nInputPlane, inputHeight, inputWidth});
374
+ offset = offset.view({offset.size(1), offset.size(2), offset.size(3)});
375
+ gradOffset =
376
+ gradOffset.view({offset.size(1), offset.size(2), offset.size(3)});
377
+ }
378
+
379
+ return 1;
380
+ }
381
+
382
+ int deform_conv_backward_parameters_cuda(
383
+ at::Tensor input, at::Tensor offset, at::Tensor gradOutput,
384
+ at::Tensor gradWeight, // at::Tensor gradBias,
385
+ at::Tensor columns, at::Tensor ones, int kW, int kH, int dW, int dH,
386
+ int padW, int padH, int dilationW, int dilationH, int group,
387
+ int deformable_group, float scale, int im2col_step)
388
+ {
389
+ // todo: transpose and reshape outGrad
390
+ // todo: reshape columns
391
+ // todo: add im2col_step as input
392
+
393
+ shape_check(input, offset, &gradOutput, gradWeight, kH, kW, dH, dW, padH,
394
+ padW, dilationH, dilationW, group, deformable_group);
395
+
396
+ input = input.contiguous();
397
+ offset = offset.contiguous();
398
+ gradOutput = gradOutput.contiguous();
399
+
400
+ int batch = 1;
401
+
402
+ if (input.ndimension() == 3) {
403
+ // Force batch
404
+ batch = 0;
405
+ input = input.view(
406
+ at::IntList({1, input.size(0), input.size(1), input.size(2)}));
407
+ gradOutput = gradOutput.view(
408
+ {1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)});
409
+ }
410
+
411
+ long batchSize = input.size(0);
412
+ long nInputPlane = input.size(1);
413
+ long inputHeight = input.size(2);
414
+ long inputWidth = input.size(3);
415
+
416
+ long nOutputPlane = gradWeight.size(0);
417
+
418
+ long outputWidth =
419
+ (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
420
+ long outputHeight =
421
+ (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
422
+
423
+ TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");
424
+
425
+ columns = at::zeros(
426
+ {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
427
+ input.options());
428
+
429
+ gradOutput = gradOutput.view({batchSize / im2col_step, im2col_step,
430
+ nOutputPlane, outputHeight, outputWidth});
431
+ gradOutput.transpose_(1, 2);
432
+
433
+ at::Tensor gradOutputBuffer = at::zeros_like(gradOutput);
434
+ gradOutputBuffer =
435
+ gradOutputBuffer.view({batchSize / im2col_step, nOutputPlane, im2col_step,
436
+ outputHeight, outputWidth});
437
+ gradOutputBuffer.copy_(gradOutput);
438
+ gradOutputBuffer =
439
+ gradOutputBuffer.view({batchSize / im2col_step, nOutputPlane,
440
+ im2col_step * outputHeight, outputWidth});
441
+
442
+ gradOutput.transpose_(1, 2);
443
+ gradOutput =
444
+ gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth});
445
+
446
+ input = input.view({batchSize / im2col_step, im2col_step, nInputPlane,
447
+ inputHeight, inputWidth});
448
+ offset =
449
+ offset.view({batchSize / im2col_step, im2col_step,
450
+ deformable_group * 2 * kH * kW, outputHeight, outputWidth});
451
+
452
+ for (int elt = 0; elt < batchSize / im2col_step; elt++) {
453
+ deformable_im2col(input[elt], offset[elt], nInputPlane, inputHeight,
454
+ inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
455
+ dilationW, im2col_step, deformable_group, columns);
456
+
457
+ // divide into group
458
+ gradOutputBuffer = gradOutputBuffer.view(
459
+ {gradOutputBuffer.size(0), group, gradOutputBuffer.size(1) / group,
460
+ gradOutputBuffer.size(2), gradOutputBuffer.size(3)});
461
+ columns = columns.view({group, columns.size(0) / group, columns.size(1)});
462
+ gradWeight =
463
+ gradWeight.view({group, gradWeight.size(0) / group, gradWeight.size(1),
464
+ gradWeight.size(2), gradWeight.size(3)});
465
+
466
+ for (int g = 0; g < group; g++) {
467
+ gradWeight[g] = gradWeight[g]
468
+ .flatten(1)
469
+ .addmm_(gradOutputBuffer[elt][g].flatten(1),
470
+ columns[g].transpose(1, 0), 1.0, scale)
471
+ .view_as(gradWeight[g]);
472
+ }
473
+ gradOutputBuffer = gradOutputBuffer.view(
474
+ {gradOutputBuffer.size(0),
475
+ gradOutputBuffer.size(1) * gradOutputBuffer.size(2),
476
+ gradOutputBuffer.size(3), gradOutputBuffer.size(4)});
477
+ columns =
478
+ columns.view({columns.size(0) * columns.size(1), columns.size(2)});
479
+ gradWeight = gradWeight.view({gradWeight.size(0) * gradWeight.size(1),
480
+ gradWeight.size(2), gradWeight.size(3),
481
+ gradWeight.size(4)});
482
+ }
483
+
484
+ input = input.view({batchSize, nInputPlane, inputHeight, inputWidth});
485
+ offset = offset.view(
486
+ {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});
487
+
488
+ if (batch == 0) {
489
+ gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth});
490
+ input = input.view({nInputPlane, inputHeight, inputWidth});
491
+ }
492
+
493
+ return 1;
494
+ }
495
+
496
+ void modulated_deform_conv_cuda_forward(
497
+ at::Tensor input, at::Tensor weight, at::Tensor bias, at::Tensor ones,
498
+ at::Tensor offset, at::Tensor mask, at::Tensor output, at::Tensor columns,
499
+ int kernel_h, int kernel_w, const int stride_h, const int stride_w,
500
+ const int pad_h, const int pad_w, const int dilation_h,
501
+ const int dilation_w, const int group, const int deformable_group,
502
+ const bool with_bias)
503
+ {
504
+ TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
505
+ TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
506
+
507
+ const int batch = input.size(0);
508
+ const int channels = input.size(1);
509
+ const int height = input.size(2);
510
+ const int width = input.size(3);
511
+
512
+ const int channels_out = weight.size(0);
513
+ const int channels_kernel = weight.size(1);
514
+ const int kernel_h_ = weight.size(2);
515
+ const int kernel_w_ = weight.size(3);
516
+
517
+ if (kernel_h_ != kernel_h || kernel_w_ != kernel_w)
518
+ AT_ERROR("Input shape and kernel shape wont match: (%d x %d vs %d x %d).",
519
+ kernel_h_, kernel_w, kernel_h_, kernel_w_);
520
+ if (channels != channels_kernel * group)
521
+ AT_ERROR("Input shape and kernel channels wont match: (%d vs %d).",
522
+ channels, channels_kernel * group);
523
+
524
+ const int height_out =
525
+ (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
526
+ const int width_out =
527
+ (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
528
+
529
+ if (ones.ndimension() != 2 ||
530
+ ones.size(0) * ones.size(1) < height_out * width_out) {
531
+ // Resize plane and fill with ones...
532
+ ones = at::ones({height_out, width_out}, input.options());
533
+ }
534
+
535
+ // resize output
536
+ output = output.view({batch, channels_out, height_out, width_out}).zero_();
537
+ // resize temporary columns
538
+ columns =
539
+ at::zeros({channels * kernel_h * kernel_w, 1 * height_out * width_out},
540
+ input.options());
541
+
542
+ output = output.view({output.size(0), group, output.size(1) / group,
543
+ output.size(2), output.size(3)});
544
+
545
+ for (int b = 0; b < batch; b++) {
546
+ modulated_deformable_im2col_cuda(
547
+ input[b], offset[b], mask[b], 1, channels, height, width, height_out,
548
+ width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
549
+ dilation_h, dilation_w, deformable_group, columns);
550
+
551
+ // divide into group
552
+ weight = weight.view({group, weight.size(0) / group, weight.size(1),
553
+ weight.size(2), weight.size(3)});
554
+ columns = columns.view({group, columns.size(0) / group, columns.size(1)});
555
+
556
+ for (int g = 0; g < group; g++) {
557
+ output[b][g] = output[b][g]
558
+ .flatten(1)
559
+ .addmm_(weight[g].flatten(1), columns[g])
560
+ .view_as(output[b][g]);
561
+ }
562
+
563
+ weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),
564
+ weight.size(3), weight.size(4)});
565
+ columns =
566
+ columns.view({columns.size(0) * columns.size(1), columns.size(2)});
567
+ }
568
+
569
+ output = output.view({output.size(0), output.size(1) * output.size(2),
570
+ output.size(3), output.size(4)});
571
+
572
+ if (with_bias) {
573
+ output += bias.view({1, bias.size(0), 1, 1});
574
+ }
575
+ }
576
+
577
+ void modulated_deform_conv_cuda_backward(
578
+ at::Tensor input, at::Tensor weight, at::Tensor bias, at::Tensor ones,
579
+ at::Tensor offset, at::Tensor mask, at::Tensor columns,
580
+ at::Tensor grad_input, at::Tensor grad_weight, at::Tensor grad_bias,
581
+ at::Tensor grad_offset, at::Tensor grad_mask, at::Tensor grad_output,
582
+ int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h,
583
+ int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,
584
+ const bool with_bias)
585
+ {
586
+ TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
587
+ TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
588
+
589
+ const int batch = input.size(0);
590
+ const int channels = input.size(1);
591
+ const int height = input.size(2);
592
+ const int width = input.size(3);
593
+
594
+ const int channels_kernel = weight.size(1);
595
+ const int kernel_h_ = weight.size(2);
596
+ const int kernel_w_ = weight.size(3);
597
+ if (kernel_h_ != kernel_h || kernel_w_ != kernel_w)
598
+ AT_ERROR("Input shape and kernel shape wont match: (%d x %d vs %d x %d).",
599
+ kernel_h_, kernel_w, kernel_h_, kernel_w_);
600
+ if (channels != channels_kernel * group)
601
+ AT_ERROR("Input shape and kernel channels wont match: (%d vs %d).",
602
+ channels, channels_kernel * group);
603
+
604
+ const int height_out =
605
+ (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
606
+ const int width_out =
607
+ (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
608
+
609
+ if (ones.ndimension() != 2 ||
610
+ ones.size(0) * ones.size(1) < height_out * width_out) {
611
+ // Resize plane and fill with ones...
612
+ ones = at::ones({height_out, width_out}, input.options());
613
+ }
614
+
615
+ grad_input = grad_input.view({batch, channels, height, width});
616
+ columns = at::zeros({channels * kernel_h * kernel_w, height_out * width_out},
617
+ input.options());
618
+
619
+ grad_output =
620
+ grad_output.view({grad_output.size(0), group, grad_output.size(1) / group,
621
+ grad_output.size(2), grad_output.size(3)});
622
+
623
+ for (int b = 0; b < batch; b++) {
624
+ // divide into groups
625
+ columns = columns.view({group, columns.size(0) / group, columns.size(1)});
626
+ weight = weight.view({group, weight.size(0) / group, weight.size(1),
627
+ weight.size(2), weight.size(3)});
628
+
629
+ for (int g = 0; g < group; g++) {
630
+ columns[g].addmm_(weight[g].flatten(1).transpose(0, 1),
631
+ grad_output[b][g].flatten(1), 0.0f, 1.0f);
632
+ }
633
+
634
+ columns =
635
+ columns.view({columns.size(0) * columns.size(1), columns.size(2)});
636
+ weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),
637
+ weight.size(3), weight.size(4)});
638
+
639
+ // gradient w.r.t. input coordinate data
640
+ modulated_deformable_col2im_coord_cuda(
641
+ columns, input[b], offset[b], mask[b], 1, channels, height, width,
642
+ height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h,
643
+ stride_w, dilation_h, dilation_w, deformable_group, grad_offset[b],
644
+ grad_mask[b]);
645
+ // gradient w.r.t. input data
646
+ modulated_deformable_col2im_cuda(
647
+ columns, offset[b], mask[b], 1, channels, height, width, height_out,
648
+ width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
649
+ dilation_h, dilation_w, deformable_group, grad_input[b]);
650
+
651
+ // gradient w.r.t. weight, dWeight should accumulate across the batch and
652
+ // group
653
+ modulated_deformable_im2col_cuda(
654
+ input[b], offset[b], mask[b], 1, channels, height, width, height_out,
655
+ width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
656
+ dilation_h, dilation_w, deformable_group, columns);
657
+
658
+ columns = columns.view({group, columns.size(0) / group, columns.size(1)});
659
+ grad_weight = grad_weight.view({group, grad_weight.size(0) / group,
660
+ grad_weight.size(1), grad_weight.size(2),
661
+ grad_weight.size(3)});
662
+ if (with_bias)
663
+ grad_bias = grad_bias.view({group, grad_bias.size(0) / group});
664
+
665
+ for (int g = 0; g < group; g++) {
666
+ grad_weight[g] =
667
+ grad_weight[g]
668
+ .flatten(1)
669
+ .addmm_(grad_output[b][g].flatten(1), columns[g].transpose(0, 1))
670
+ .view_as(grad_weight[g]);
671
+ if (with_bias) {
672
+ grad_bias[g] =
673
+ grad_bias[g]
674
+ .view({-1, 1})
675
+ .addmm_(grad_output[b][g].flatten(1), ones.view({-1, 1}))
676
+ .view(-1);
677
+ }
678
+ }
679
+
680
+ columns =
681
+ columns.view({columns.size(0) * columns.size(1), columns.size(2)});
682
+ grad_weight = grad_weight.view({grad_weight.size(0) * grad_weight.size(1),
683
+ grad_weight.size(2), grad_weight.size(3),
684
+ grad_weight.size(4)});
685
+ if (with_bias)
686
+ grad_bias = grad_bias.view({grad_bias.size(0) * grad_bias.size(1)});
687
+ }
688
+ grad_output = grad_output.view({grad_output.size(0) * grad_output.size(1),
689
+ grad_output.size(2), grad_output.size(3),
690
+ grad_output.size(4)});
691
+ }
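Editor's note (not part of the diff): `shape_check` and each entry point above derive the output spatial size from the standard dilated-convolution formula, e.g. `outputHeight = (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1`; the forward pass then walks the batch in chunks of `im2col_step` images, letting `deformable_im2col` build the column matrix and a per-group `addmm_` perform the GEMM into the output buffer before it is transposed back to NCHW. A small check of the size formula (illustrative code only):

```cpp
// Illustrative check of the output-size arithmetic used above.
#include <cstdio>

// out = (in + 2*pad - (dilation*(k-1) + 1)) / stride + 1, integer division.
int conv_out_size(int in, int k, int pad, int stride, int dilation) {
  return (in + 2 * pad - (dilation * (k - 1) + 1)) / stride + 1;
}

int main() {
  std::printf("%d\n", conv_out_size(56, 3, 1, 1, 1)); // 56: 3x3, pad 1 keeps size
  std::printf("%d\n", conv_out_size(56, 3, 1, 2, 1)); // 28: stride 2 halves it
  return 0;
}
```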
maskrcnn_benchmark/csrc/cuda/deform_conv_kernel_cuda.cu ADDED
@@ -0,0 +1,874 @@
1
+ /*!
2
+ ******************* BEGIN Caffe Copyright Notice and Disclaimer ****************
3
+ *
4
+ * COPYRIGHT
5
+ *
6
+ * All contributions by the University of California:
7
+ * Copyright (c) 2014-2017 The Regents of the University of California (Regents)
8
+ * All rights reserved.
9
+ *
10
+ * All other contributions:
11
+ * Copyright (c) 2014-2017, the respective contributors
12
+ * All rights reserved.
13
+ *
14
+ * Caffe uses a shared copyright model: each contributor holds copyright over
15
+ * their contributions to Caffe. The project versioning records all such
16
+ * contribution and copyright details. If a contributor wants to further mark
17
+ * their specific copyright on a particular contribution, they should indicate
18
+ * their copyright solely in the commit message of the change when it is
19
+ * committed.
20
+ *
21
+ * LICENSE
22
+ *
23
+ * Redistribution and use in source and binary forms, with or without
24
+ * modification, are permitted provided that the following conditions are met:
25
+ *
26
+ * 1. Redistributions of source code must retain the above copyright notice, this
27
+ * list of conditions and the following disclaimer.
28
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
29
+ * this list of conditions and the following disclaimer in the documentation
30
+ * and/or other materials provided with the distribution.
31
+ *
32
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
33
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
34
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
35
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
36
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
37
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
38
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
39
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
40
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
41
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
42
+ *
43
+ * CONTRIBUTION AGREEMENT
44
+ *
45
+ * By contributing to the BVLC/caffe repository through pull-request, comment,
46
+ * or otherwise, the contributor releases their content to the
47
+ * license and copyright terms herein.
48
+ *
49
+ ***************** END Caffe Copyright Notice and Disclaimer ********************
50
+ *
51
+ * Copyright (c) 2018 Microsoft
52
+ * Licensed under The MIT License [see LICENSE for details]
53
+ * \file modulated_deformable_im2col.cuh
54
+ * \brief Function definitions of converting an image to
55
+ * column matrix based on kernel, padding, dilation, and offset.
56
+ * These functions are mainly used in deformable convolution operators.
57
+ * \ref: https://arxiv.org/abs/1703.06211
58
+ * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu, Dazhi Cheng
59
+ */
60
+
61
+ // modify from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu
62
+
63
+
64
+ #include <ATen/ATen.h>
65
+ #include <THC/THCAtomics.cuh>
66
+ #include <stdio.h>
67
+ #include <math.h>
68
+ #include <float.h>
69
+
70
+ using namespace at;
71
+
72
+ #define CUDA_KERNEL_LOOP(i, n) \
73
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
74
+ i += blockDim.x * gridDim.x)
75
+
76
+ const int CUDA_NUM_THREADS = 1024;
77
+ const int kMaxGridNum = 65535;
78
+ inline int GET_BLOCKS(const int N)
79
+ {
80
+ return std::min(kMaxGridNum, (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS);
81
+ }
82
+
83
+ /*
84
+ const int CUDA_NUM_THREADS = 1024;
85
+
86
+ inline int GET_BLOCKS(const int N)
87
+ {
88
+ return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
89
+ }*/
90
+
91
+ template <typename scalar_t>
92
+ __device__ scalar_t deformable_im2col_bilinear(const scalar_t *bottom_data, const int data_width,
93
+ const int height, const int width, scalar_t h, scalar_t w)
94
+ {
95
+
96
+ int h_low = floor(h);
97
+ int w_low = floor(w);
98
+ int h_high = h_low + 1;
99
+ int w_high = w_low + 1;
100
+
101
+ scalar_t lh = h - h_low;
102
+ scalar_t lw = w - w_low;
103
+ scalar_t hh = 1 - lh, hw = 1 - lw;
104
+
105
+ scalar_t v1 = 0;
106
+ if (h_low >= 0 && w_low >= 0)
107
+ v1 = bottom_data[h_low * data_width + w_low];
108
+ scalar_t v2 = 0;
109
+ if (h_low >= 0 && w_high <= width - 1)
110
+ v2 = bottom_data[h_low * data_width + w_high];
111
+ scalar_t v3 = 0;
112
+ if (h_high <= height - 1 && w_low >= 0)
113
+ v3 = bottom_data[h_high * data_width + w_low];
114
+ scalar_t v4 = 0;
115
+ if (h_high <= height - 1 && w_high <= width - 1)
116
+ v4 = bottom_data[h_high * data_width + w_high];
117
+
118
+ scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
119
+
120
+ scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
121
+ return val;
122
+ }
123
+
124
+ template <typename scalar_t>
125
+ __device__ scalar_t get_gradient_weight(scalar_t argmax_h, scalar_t argmax_w,
126
+ const int h, const int w, const int height, const int width)
127
+ {
128
+
129
+ if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width)
130
+ {
131
+ //empty
132
+ return 0;
133
+ }
134
+
135
+ int argmax_h_low = floor(argmax_h);
136
+ int argmax_w_low = floor(argmax_w);
137
+ int argmax_h_high = argmax_h_low + 1;
138
+ int argmax_w_high = argmax_w_low + 1;
139
+
140
+ scalar_t weight = 0;
141
+ if (h == argmax_h_low && w == argmax_w_low)
142
+ weight = (h + 1 - argmax_h) * (w + 1 - argmax_w);
143
+ if (h == argmax_h_low && w == argmax_w_high)
144
+ weight = (h + 1 - argmax_h) * (argmax_w + 1 - w);
145
+ if (h == argmax_h_high && w == argmax_w_low)
146
+ weight = (argmax_h + 1 - h) * (w + 1 - argmax_w);
147
+ if (h == argmax_h_high && w == argmax_w_high)
148
+ weight = (argmax_h + 1 - h) * (argmax_w + 1 - w);
149
+ return weight;
150
+ }
151
+
152
+ template <typename scalar_t>
153
+ __device__ scalar_t get_coordinate_weight(scalar_t argmax_h, scalar_t argmax_w,
154
+ const int height, const int width, const scalar_t *im_data,
155
+ const int data_width, const int bp_dir)
156
+ {
157
+
158
+ if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width)
159
+ {
160
+ //empty
161
+ return 0;
162
+ }
163
+
164
+ int argmax_h_low = floor(argmax_h);
165
+ int argmax_w_low = floor(argmax_w);
166
+ int argmax_h_high = argmax_h_low + 1;
167
+ int argmax_w_high = argmax_w_low + 1;
168
+
169
+ scalar_t weight = 0;
170
+
171
+ if (bp_dir == 0)
172
+ {
173
+ if (argmax_h_low >= 0 && argmax_w_low >= 0)
174
+ weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low];
175
+ if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
176
+ weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + argmax_w_high];
177
+ if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
178
+ weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low];
179
+ if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
180
+ weight += (argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high];
181
+ }
182
+ else if (bp_dir == 1)
183
+ {
184
+ if (argmax_h_low >= 0 && argmax_w_low >= 0)
185
+ weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low];
186
+ if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
187
+ weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high];
188
+ if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
189
+ weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low];
190
+ if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
191
+ weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high];
192
+ }
193
+
194
+ return weight;
195
+ }
196
+
197
+ template <typename scalar_t>
198
+ __global__ void deformable_im2col_gpu_kernel(const int n, const scalar_t *data_im, const scalar_t *data_offset,
199
+ const int height, const int width, const int kernel_h, const int kernel_w,
200
+ const int pad_h, const int pad_w, const int stride_h, const int stride_w,
201
+ const int dilation_h, const int dilation_w, const int channel_per_deformable_group,
202
+ const int batch_size, const int num_channels, const int deformable_group,
203
+ const int height_col, const int width_col,
204
+ scalar_t *data_col)
205
+ {
206
+ CUDA_KERNEL_LOOP(index, n)
207
+ {
208
+ // index index of output matrix
209
+ const int w_col = index % width_col;
210
+ const int h_col = (index / width_col) % height_col;
211
+ const int b_col = (index / width_col / height_col) % batch_size;
212
+ const int c_im = (index / width_col / height_col) / batch_size;
213
+ const int c_col = c_im * kernel_h * kernel_w;
214
+
215
+ // compute deformable group index
216
+ const int deformable_group_index = c_im / channel_per_deformable_group;
217
+
218
+ const int h_in = h_col * stride_h - pad_h;
219
+ const int w_in = w_col * stride_w - pad_w;
220
+ scalar_t *data_col_ptr = data_col + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
221
+ //const scalar_t* data_im_ptr = data_im + ((b_col * num_channels + c_im) * height + h_in) * width + w_in;
222
+ const scalar_t *data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width;
223
+ const scalar_t *data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col;
224
+
225
+ for (int i = 0; i < kernel_h; ++i)
226
+ {
227
+ for (int j = 0; j < kernel_w; ++j)
228
+ {
229
+ const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
230
+ const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col;
231
+ const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
232
+ const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
233
+ scalar_t val = static_cast<scalar_t>(0);
234
+ const scalar_t h_im = h_in + i * dilation_h + offset_h;
235
+ const scalar_t w_im = w_in + j * dilation_w + offset_w;
236
+ if (h_im > -1 && w_im > -1 && h_im < height && w_im < width)
237
+ {
238
+ //const scalar_t map_h = i * dilation_h + offset_h;
239
+ //const scalar_t map_w = j * dilation_w + offset_w;
240
+ //const int cur_height = height - h_in;
241
+ //const int cur_width = width - w_in;
242
+ //val = deformable_im2col_bilinear(data_im_ptr, width, cur_height, cur_width, map_h, map_w);
243
+ val = deformable_im2col_bilinear(data_im_ptr, width, height, width, h_im, w_im);
244
+ }
245
+ *data_col_ptr = val;
246
+ data_col_ptr += batch_size * height_col * width_col;
247
+ }
248
+ }
249
+ }
250
+ }
251
+
252
+ void deformable_im2col(
253
+ const at::Tensor data_im, const at::Tensor data_offset, const int channels,
254
+ const int height, const int width, const int ksize_h, const int ksize_w,
255
+ const int pad_h, const int pad_w, const int stride_h, const int stride_w,
256
+ const int dilation_h, const int dilation_w, const int parallel_imgs,
257
+ const int deformable_group, at::Tensor data_col)
258
+ {
259
+ // num_axes should be smaller than block size
260
+ // todo: check parallel_imgs is correctly passed in
261
+ int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
262
+ int width_col = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
263
+ int num_kernels = channels * height_col * width_col * parallel_imgs;
264
+ int channel_per_deformable_group = channels / deformable_group;
265
+
266
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(
267
+ data_im.scalar_type(), "deformable_im2col_gpu", ([&] {
268
+ const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
269
+ const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
270
+ scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
271
+
272
+ deformable_im2col_gpu_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS>>>(
273
+ num_kernels, data_im_, data_offset_, height, width, ksize_h, ksize_w,
274
+ pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
275
+ channel_per_deformable_group, parallel_imgs, channels, deformable_group,
276
+ height_col, width_col, data_col_);
277
+ }));
278
+
279
+ cudaError_t err = cudaGetLastError();
280
+ if (err != cudaSuccess)
281
+ {
282
+ printf("error in deformable_im2col: %s\n", cudaGetErrorString(err));
283
+ }
284
+ }
285
+
286
+ template <typename scalar_t>
287
+ __global__ void deformable_col2im_gpu_kernel(
288
+ const int n, const scalar_t *data_col, const scalar_t *data_offset,
289
+ const int channels, const int height, const int width,
290
+ const int kernel_h, const int kernel_w,
291
+ const int pad_h, const int pad_w,
292
+ const int stride_h, const int stride_w,
293
+ const int dilation_h, const int dilation_w,
294
+ const int channel_per_deformable_group,
295
+ const int batch_size, const int deformable_group,
296
+ const int height_col, const int width_col,
297
+ scalar_t *grad_im)
298
+ {
299
+ CUDA_KERNEL_LOOP(index, n)
300
+ {
301
+ const int j = (index / width_col / height_col / batch_size) % kernel_w;
302
+ const int i = (index / width_col / height_col / batch_size / kernel_w) % kernel_h;
303
+ const int c = index / width_col / height_col / batch_size / kernel_w / kernel_h;
304
+ // compute the start and end of the output
305
+
306
+ const int deformable_group_index = c / channel_per_deformable_group;
307
+
308
+ int w_out = index % width_col;
309
+ int h_out = (index / width_col) % height_col;
310
+ int b = (index / width_col / height_col) % batch_size;
311
+ int w_in = w_out * stride_w - pad_w;
312
+ int h_in = h_out * stride_h - pad_h;
313
+
314
+ const scalar_t *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) *
315
+ 2 * kernel_h * kernel_w * height_col * width_col;
316
+ const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
317
+ const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
318
+ const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
319
+ const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
320
+ const scalar_t cur_inv_h_data = h_in + i * dilation_h + offset_h;
321
+ const scalar_t cur_inv_w_data = w_in + j * dilation_w + offset_w;
322
+
323
+ const scalar_t cur_top_grad = data_col[index];
324
+ const int cur_h = (int)cur_inv_h_data;
325
+ const int cur_w = (int)cur_inv_w_data;
326
+ for (int dy = -2; dy <= 2; dy++)
327
+ {
328
+ for (int dx = -2; dx <= 2; dx++)
329
+ {
330
+ if (cur_h + dy >= 0 && cur_h + dy < height &&
331
+ cur_w + dx >= 0 && cur_w + dx < width &&
332
+ abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
333
+ abs(cur_inv_w_data - (cur_w + dx)) < 1)
334
+ {
335
+ int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
336
+ scalar_t weight = get_gradient_weight(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width);
337
+ atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad);
338
+ }
339
+ }
340
+ }
341
+ }
342
+ }
343
+
344
+ void deformable_col2im(
345
+ const at::Tensor data_col, const at::Tensor data_offset, const int channels,
346
+ const int height, const int width, const int ksize_h,
347
+ const int ksize_w, const int pad_h, const int pad_w,
348
+ const int stride_h, const int stride_w,
349
+ const int dilation_h, const int dilation_w,
350
+ const int parallel_imgs, const int deformable_group,
351
+ at::Tensor grad_im)
352
+ {
353
+
354
+ // todo: make sure parallel_imgs is passed in correctly
355
+ int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
356
+ int width_col = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
357
+ int num_kernels = channels * ksize_h * ksize_w * height_col * width_col * parallel_imgs;
358
+ int channel_per_deformable_group = channels / deformable_group;
359
+
360
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(
361
+ data_col.scalar_type(), "deformable_col2im_gpu", ([&] {
362
+ const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
363
+ const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
364
+ scalar_t *grad_im_ = grad_im.data_ptr<scalar_t>();
365
+
366
+ deformable_col2im_gpu_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS>>>(
367
+ num_kernels, data_col_, data_offset_, channels, height, width, ksize_h,
368
+ ksize_w, pad_h, pad_w, stride_h, stride_w,
369
+ dilation_h, dilation_w, channel_per_deformable_group,
370
+ parallel_imgs, deformable_group, height_col, width_col, grad_im_);
371
+ }));
372
+
373
+ cudaError_t err = cudaGetLastError();
374
+ if (err != cudaSuccess)
375
+ {
376
+ printf("error in deformable_col2im: %s\n", cudaGetErrorString(err));
377
+ }
378
+ }
379
+
380
+ template <typename scalar_t>
381
+ __global__ void deformable_col2im_coord_gpu_kernel(const int n, const scalar_t *data_col,
382
+ const scalar_t *data_im, const scalar_t *data_offset,
383
+ const int channels, const int height, const int width,
384
+ const int kernel_h, const int kernel_w,
385
+ const int pad_h, const int pad_w,
386
+ const int stride_h, const int stride_w,
387
+ const int dilation_h, const int dilation_w,
388
+ const int channel_per_deformable_group,
389
+ const int batch_size, const int offset_channels, const int deformable_group,
390
+ const int height_col, const int width_col, scalar_t *grad_offset)
391
+ {
392
+ CUDA_KERNEL_LOOP(index, n)
393
+ {
394
+ scalar_t val = 0;
395
+ int w = index % width_col;
396
+ int h = (index / width_col) % height_col;
397
+ int c = (index / width_col / height_col) % offset_channels;
398
+ int b = (index / width_col / height_col) / offset_channels;
399
+ // compute the start and end of the output
400
+
401
+ const int deformable_group_index = c / (2 * kernel_h * kernel_w);
402
+ const int col_step = kernel_h * kernel_w;
403
+ int cnt = 0;
404
+ const scalar_t *data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group *
405
+ batch_size * width_col * height_col;
406
+ const scalar_t *data_im_ptr = data_im + (b * deformable_group + deformable_group_index) *
407
+ channel_per_deformable_group / kernel_h / kernel_w * height * width;
408
+ const scalar_t *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 *
409
+ kernel_h * kernel_w * height_col * width_col;
410
+
411
+ const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;
412
+
413
+ for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step)
414
+ {
415
+ const int col_pos = (((col_c * batch_size + b) * height_col) + h) * width_col + w;
416
+ const int bp_dir = offset_c % 2;
417
+
418
+ int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
419
+ int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
420
+ int w_out = col_pos % width_col;
421
+ int h_out = (col_pos / width_col) % height_col;
422
+ int w_in = w_out * stride_w - pad_w;
423
+ int h_in = h_out * stride_h - pad_h;
424
+ const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
425
+ const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out);
426
+ const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
427
+ const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
428
+ scalar_t inv_h = h_in + i * dilation_h + offset_h;
429
+ scalar_t inv_w = w_in + j * dilation_w + offset_w;
430
+ if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width)
431
+ {
432
+ inv_h = inv_w = -2;
433
+ }
434
+ const scalar_t weight = get_coordinate_weight(
435
+ inv_h, inv_w,
436
+ height, width, data_im_ptr + cnt * height * width, width, bp_dir);
437
+ val += weight * data_col_ptr[col_pos];
438
+ cnt += 1;
439
+ }
440
+
441
+ grad_offset[index] = val;
442
+ }
443
+ }
444
+
445
+ void deformable_col2im_coord(
446
+ const at::Tensor data_col, const at::Tensor data_im, const at::Tensor data_offset,
447
+ const int channels, const int height, const int width, const int ksize_h,
448
+ const int ksize_w, const int pad_h, const int pad_w, const int stride_h,
449
+ const int stride_w, const int dilation_h, const int dilation_w,
450
+ const int parallel_imgs, const int deformable_group, at::Tensor grad_offset)
451
+ {
452
+
453
+ int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
454
+ int width_col = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
455
+ int num_kernels = height_col * width_col * 2 * ksize_h * ksize_w * deformable_group * parallel_imgs;
456
+ int channel_per_deformable_group = channels * ksize_h * ksize_w / deformable_group;
457
+
458
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(
459
+ data_col.scalar_type(), "deformable_col2im_coord_gpu", ([&] {
460
+ const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
461
+ const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
462
+ const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
463
+ scalar_t *grad_offset_ = grad_offset.data_ptr<scalar_t>();
464
+
465
+ deformable_col2im_coord_gpu_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS>>>(
466
+ num_kernels, data_col_, data_im_, data_offset_, channels, height, width,
467
+ ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w,
468
+ dilation_h, dilation_w, channel_per_deformable_group,
469
+ parallel_imgs, 2 * ksize_h * ksize_w * deformable_group, deformable_group,
470
+ height_col, width_col, grad_offset_);
471
+ }));
472
+ }
473
+
474
+ template <typename scalar_t>
475
+ __device__ scalar_t dmcn_im2col_bilinear(const scalar_t *bottom_data, const int data_width,
476
+ const int height, const int width, scalar_t h, scalar_t w)
477
+ {
478
+ int h_low = floor(h);
479
+ int w_low = floor(w);
480
+ int h_high = h_low + 1;
481
+ int w_high = w_low + 1;
482
+
483
+ scalar_t lh = h - h_low;
484
+ scalar_t lw = w - w_low;
485
+ scalar_t hh = 1 - lh, hw = 1 - lw;
486
+
487
+ scalar_t v1 = 0;
488
+ if (h_low >= 0 && w_low >= 0)
489
+ v1 = bottom_data[h_low * data_width + w_low];
490
+ scalar_t v2 = 0;
491
+ if (h_low >= 0 && w_high <= width - 1)
492
+ v2 = bottom_data[h_low * data_width + w_high];
493
+ scalar_t v3 = 0;
494
+ if (h_high <= height - 1 && w_low >= 0)
495
+ v3 = bottom_data[h_high * data_width + w_low];
496
+ scalar_t v4 = 0;
497
+ if (h_high <= height - 1 && w_high <= width - 1)
498
+ v4 = bottom_data[h_high * data_width + w_high];
499
+
500
+ scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
501
+
502
+ scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
503
+ return val;
504
+ }
505
+
506
+ template <typename scalar_t>
507
+ __device__ scalar_t dmcn_get_gradient_weight(scalar_t argmax_h, scalar_t argmax_w,
508
+ const int h, const int w, const int height, const int width)
509
+ {
510
+ if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width)
511
+ {
512
+ //empty
513
+ return 0;
514
+ }
515
+
516
+ int argmax_h_low = floor(argmax_h);
517
+ int argmax_w_low = floor(argmax_w);
518
+ int argmax_h_high = argmax_h_low + 1;
519
+ int argmax_w_high = argmax_w_low + 1;
520
+
521
+ scalar_t weight = 0;
522
+ if (h == argmax_h_low && w == argmax_w_low)
523
+ weight = (h + 1 - argmax_h) * (w + 1 - argmax_w);
524
+ if (h == argmax_h_low && w == argmax_w_high)
525
+ weight = (h + 1 - argmax_h) * (argmax_w + 1 - w);
526
+ if (h == argmax_h_high && w == argmax_w_low)
527
+ weight = (argmax_h + 1 - h) * (w + 1 - argmax_w);
528
+ if (h == argmax_h_high && w == argmax_w_high)
529
+ weight = (argmax_h + 1 - h) * (argmax_w + 1 - w);
530
+ return weight;
531
+ }
532
+
533
+ template <typename scalar_t>
534
+ __device__ scalar_t dmcn_get_coordinate_weight(scalar_t argmax_h, scalar_t argmax_w,
535
+ const int height, const int width, const scalar_t *im_data,
536
+ const int data_width, const int bp_dir)
537
+ {
538
+ if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width)
539
+ {
540
+ //empty
541
+ return 0;
542
+ }
543
+
544
+ int argmax_h_low = floor(argmax_h);
545
+ int argmax_w_low = floor(argmax_w);
546
+ int argmax_h_high = argmax_h_low + 1;
547
+ int argmax_w_high = argmax_w_low + 1;
548
+
549
+ scalar_t weight = 0;
550
+
551
+ if (bp_dir == 0)
552
+ {
553
+ if (argmax_h_low >= 0 && argmax_w_low >= 0)
554
+ weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low];
555
+ if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
556
+ weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + argmax_w_high];
557
+ if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
558
+ weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low];
559
+ if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
560
+ weight += (argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high];
561
+ }
562
+ else if (bp_dir == 1)
563
+ {
564
+ if (argmax_h_low >= 0 && argmax_w_low >= 0)
565
+ weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low];
566
+ if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
567
+ weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high];
568
+ if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
569
+ weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low];
570
+ if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
571
+ weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high];
572
+ }
573
+
574
+ return weight;
575
+ }
576
+
577
+ template <typename scalar_t>
578
+ __global__ void modulated_deformable_im2col_gpu_kernel(const int n,
579
+ const scalar_t *data_im, const scalar_t *data_offset, const scalar_t *data_mask,
580
+ const int height, const int width, const int kernel_h, const int kernel_w,
581
+ const int pad_h, const int pad_w,
582
+ const int stride_h, const int stride_w,
583
+ const int dilation_h, const int dilation_w,
584
+ const int channel_per_deformable_group,
585
+ const int batch_size, const int num_channels, const int deformable_group,
586
+ const int height_col, const int width_col,
587
+ scalar_t *data_col)
588
+ {
589
+ CUDA_KERNEL_LOOP(index, n)
590
+ {
591
+ // index index of output matrix
592
+ const int w_col = index % width_col;
593
+ const int h_col = (index / width_col) % height_col;
594
+ const int b_col = (index / width_col / height_col) % batch_size;
595
+ const int c_im = (index / width_col / height_col) / batch_size;
596
+ const int c_col = c_im * kernel_h * kernel_w;
597
+
598
+ // compute deformable group index
599
+ const int deformable_group_index = c_im / channel_per_deformable_group;
600
+
601
+ const int h_in = h_col * stride_h - pad_h;
602
+ const int w_in = w_col * stride_w - pad_w;
603
+
604
+ scalar_t *data_col_ptr = data_col + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
605
+ //const float* data_im_ptr = data_im + ((b_col * num_channels + c_im) * height + h_in) * width + w_in;
606
+ const scalar_t *data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width;
607
+ const scalar_t *data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col;
608
+
609
+ const scalar_t *data_mask_ptr = data_mask + (b_col * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col;
610
+
611
+ for (int i = 0; i < kernel_h; ++i)
612
+ {
613
+ for (int j = 0; j < kernel_w; ++j)
614
+ {
615
+ const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
616
+ const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col;
617
+ const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_col) * width_col + w_col;
618
+ const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
619
+ const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
620
+ const scalar_t mask = data_mask_ptr[data_mask_hw_ptr];
621
+ scalar_t val = static_cast<scalar_t>(0);
622
+ const scalar_t h_im = h_in + i * dilation_h + offset_h;
623
+ const scalar_t w_im = w_in + j * dilation_w + offset_w;
624
+ //if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) {
625
+ if (h_im > -1 && w_im > -1 && h_im < height && w_im < width)
626
+ {
627
+ //const float map_h = i * dilation_h + offset_h;
628
+ //const float map_w = j * dilation_w + offset_w;
629
+ //const int cur_height = height - h_in;
630
+ //const int cur_width = width - w_in;
631
+ //val = dmcn_im2col_bilinear(data_im_ptr, width, cur_height, cur_width, map_h, map_w);
632
+ val = dmcn_im2col_bilinear(data_im_ptr, width, height, width, h_im, w_im);
633
+ }
634
+ *data_col_ptr = val * mask;
635
+ data_col_ptr += batch_size * height_col * width_col;
636
+ //data_col_ptr += height_col * width_col;
637
+ }
638
+ }
639
+ }
640
+ }
641
+
642
+ template <typename scalar_t>
643
+ __global__ void modulated_deformable_col2im_gpu_kernel(const int n,
644
+ const scalar_t *data_col, const scalar_t *data_offset, const scalar_t *data_mask,
645
+ const int channels, const int height, const int width,
646
+ const int kernel_h, const int kernel_w,
647
+ const int pad_h, const int pad_w,
648
+ const int stride_h, const int stride_w,
649
+ const int dilation_h, const int dilation_w,
650
+ const int channel_per_deformable_group,
651
+ const int batch_size, const int deformable_group,
652
+ const int height_col, const int width_col,
653
+ scalar_t *grad_im)
654
+ {
655
+ CUDA_KERNEL_LOOP(index, n)
656
+ {
657
+ const int j = (index / width_col / height_col / batch_size) % kernel_w;
658
+ const int i = (index / width_col / height_col / batch_size / kernel_w) % kernel_h;
659
+ const int c = index / width_col / height_col / batch_size / kernel_w / kernel_h;
660
+ // compute the start and end of the output
661
+
662
+ const int deformable_group_index = c / channel_per_deformable_group;
663
+
664
+ int w_out = index % width_col;
665
+ int h_out = (index / width_col) % height_col;
666
+ int b = (index / width_col / height_col) % batch_size;
667
+ int w_in = w_out * stride_w - pad_w;
668
+ int h_in = h_out * stride_h - pad_h;
669
+
670
+ const scalar_t *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col;
671
+ const scalar_t *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col;
672
+ const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
673
+ const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
674
+ const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_out) * width_col + w_out;
675
+ const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
676
+ const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
677
+ const scalar_t mask = data_mask_ptr[data_mask_hw_ptr];
678
+ const scalar_t cur_inv_h_data = h_in + i * dilation_h + offset_h;
679
+ const scalar_t cur_inv_w_data = w_in + j * dilation_w + offset_w;
680
+
681
+ const scalar_t cur_top_grad = data_col[index] * mask;
682
+ const int cur_h = (int)cur_inv_h_data;
683
+ const int cur_w = (int)cur_inv_w_data;
684
+ for (int dy = -2; dy <= 2; dy++)
685
+ {
686
+ for (int dx = -2; dx <= 2; dx++)
687
+ {
688
+ if (cur_h + dy >= 0 && cur_h + dy < height &&
689
+ cur_w + dx >= 0 && cur_w + dx < width &&
690
+ abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
691
+ abs(cur_inv_w_data - (cur_w + dx)) < 1)
692
+ {
693
+ int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
694
+ scalar_t weight = dmcn_get_gradient_weight(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width);
695
+ atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad);
696
+ }
697
+ }
698
+ }
699
+ }
700
+ }
701
+
702
+ template <typename scalar_t>
703
+ __global__ void modulated_deformable_col2im_coord_gpu_kernel(const int n,
704
+ const scalar_t *data_col, const scalar_t *data_im,
705
+ const scalar_t *data_offset, const scalar_t *data_mask,
706
+ const int channels, const int height, const int width,
707
+ const int kernel_h, const int kernel_w,
708
+ const int pad_h, const int pad_w,
709
+ const int stride_h, const int stride_w,
710
+ const int dilation_h, const int dilation_w,
711
+ const int channel_per_deformable_group,
712
+ const int batch_size, const int offset_channels, const int deformable_group,
713
+ const int height_col, const int width_col,
714
+ scalar_t *grad_offset, scalar_t *grad_mask)
715
+ {
716
+ CUDA_KERNEL_LOOP(index, n)
717
+ {
718
+ scalar_t val = 0, mval = 0;
719
+ int w = index % width_col;
720
+ int h = (index / width_col) % height_col;
721
+ int c = (index / width_col / height_col) % offset_channels;
722
+ int b = (index / width_col / height_col) / offset_channels;
723
+ // compute the start and end of the output
724
+
725
+ const int deformable_group_index = c / (2 * kernel_h * kernel_w);
726
+ const int col_step = kernel_h * kernel_w;
727
+ int cnt = 0;
728
+ const scalar_t *data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group * batch_size * width_col * height_col;
729
+ const scalar_t *data_im_ptr = data_im + (b * deformable_group + deformable_group_index) * channel_per_deformable_group / kernel_h / kernel_w * height * width;
730
+ const scalar_t *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col;
731
+ const scalar_t *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col;
732
+
733
+ const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;
734
+
735
+ for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step)
736
+ {
737
+ const int col_pos = (((col_c * batch_size + b) * height_col) + h) * width_col + w;
738
+ const int bp_dir = offset_c % 2;
739
+
740
+ int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
741
+ int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
742
+ int w_out = col_pos % width_col;
743
+ int h_out = (col_pos / width_col) % height_col;
744
+ int w_in = w_out * stride_w - pad_w;
745
+ int h_in = h_out * stride_h - pad_h;
746
+ const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
747
+ const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out);
748
+ const int data_mask_hw_ptr = (((i * kernel_w + j) * height_col + h_out) * width_col + w_out);
749
+ const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
750
+ const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
751
+ const scalar_t mask = data_mask_ptr[data_mask_hw_ptr];
752
+ scalar_t inv_h = h_in + i * dilation_h + offset_h;
753
+ scalar_t inv_w = w_in + j * dilation_w + offset_w;
754
+ if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width)
755
+ {
756
+ inv_h = inv_w = -2;
757
+ }
758
+ else
759
+ {
760
+ mval += data_col_ptr[col_pos] * dmcn_im2col_bilinear(data_im_ptr + cnt * height * width, width, height, width, inv_h, inv_w);
761
+ }
762
+ const scalar_t weight = dmcn_get_coordinate_weight(
763
+ inv_h, inv_w,
764
+ height, width, data_im_ptr + cnt * height * width, width, bp_dir);
765
+ val += weight * data_col_ptr[col_pos] * mask;
766
+ cnt += 1;
767
+ }
768
+ // KERNEL_ASSIGN(grad_offset[index], offset_req, val);
769
+ grad_offset[index] = val;
770
+ if (offset_c % 2 == 0)
771
+ // KERNEL_ASSIGN(grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w], mask_req, mval);
772
+ grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w] = mval;
773
+ }
774
+ }
775
+
776
+ void modulated_deformable_im2col_cuda(
777
+ const at::Tensor data_im, const at::Tensor data_offset, const at::Tensor data_mask,
778
+ const int batch_size, const int channels, const int height_im, const int width_im,
779
+ const int height_col, const int width_col, const int kernel_h, const int kernel_w,
780
+ const int pad_h, const int pad_w, const int stride_h, const int stride_w,
781
+ const int dilation_h, const int dilation_w,
782
+ const int deformable_group, at::Tensor data_col)
783
+ {
784
+ // num_axes should be smaller than block size
785
+ const int channel_per_deformable_group = channels / deformable_group;
786
+ const int num_kernels = channels * batch_size * height_col * width_col;
787
+
788
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(
789
+ data_im.scalar_type(), "modulated_deformable_im2col_gpu", ([&] {
790
+ const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
791
+ const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
792
+ const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();
793
+ scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
794
+
795
+ modulated_deformable_im2col_gpu_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS>>>(
796
+ num_kernels, data_im_, data_offset_, data_mask_, height_im, width_im, kernel_h, kernel_w,
797
+ pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group,
798
+ batch_size, channels, deformable_group, height_col, width_col, data_col_);
799
+ }));
800
+
801
+ cudaError_t err = cudaGetLastError();
802
+ if (err != cudaSuccess)
803
+ {
804
+ printf("error in modulated_deformable_im2col_cuda: %s\n", cudaGetErrorString(err));
805
+ }
806
+ }
807
+
808
+ void modulated_deformable_col2im_cuda(
809
+ const at::Tensor data_col, const at::Tensor data_offset, const at::Tensor data_mask,
810
+ const int batch_size, const int channels, const int height_im, const int width_im,
811
+ const int height_col, const int width_col, const int kernel_h, const int kernel_w,
812
+ const int pad_h, const int pad_w, const int stride_h, const int stride_w,
813
+ const int dilation_h, const int dilation_w,
814
+ const int deformable_group, at::Tensor grad_im)
815
+ {
816
+
817
+ const int channel_per_deformable_group = channels / deformable_group;
818
+ const int num_kernels = channels * kernel_h * kernel_w * batch_size * height_col * width_col;
819
+
820
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(
821
+ data_col.scalar_type(), "modulated_deformable_col2im_gpu", ([&] {
822
+ const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
823
+ const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
824
+ const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();
825
+ scalar_t *grad_im_ = grad_im.data_ptr<scalar_t>();
826
+
827
+ modulated_deformable_col2im_gpu_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS>>>(
828
+ num_kernels, data_col_, data_offset_, data_mask_, channels, height_im, width_im,
829
+ kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
830
+ dilation_h, dilation_w, channel_per_deformable_group,
831
+ batch_size, deformable_group, height_col, width_col, grad_im_);
832
+ }));
833
+
834
+ cudaError_t err = cudaGetLastError();
835
+ if (err != cudaSuccess)
836
+ {
837
+ printf("error in modulated_deformable_col2im_cuda: %s\n", cudaGetErrorString(err));
838
+ }
839
+ }
840
+
841
+ void modulated_deformable_col2im_coord_cuda(
842
+ const at::Tensor data_col, const at::Tensor data_im, const at::Tensor data_offset, const at::Tensor data_mask,
843
+ const int batch_size, const int channels, const int height_im, const int width_im,
844
+ const int height_col, const int width_col, const int kernel_h, const int kernel_w,
845
+ const int pad_h, const int pad_w, const int stride_h, const int stride_w,
846
+ const int dilation_h, const int dilation_w,
847
+ const int deformable_group,
848
+ at::Tensor grad_offset, at::Tensor grad_mask)
849
+ {
850
+ const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h * kernel_w * deformable_group;
851
+ const int channel_per_deformable_group = channels * kernel_h * kernel_w / deformable_group;
852
+
853
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(
854
+ data_col.scalar_type(), "modulated_deformable_col2im_coord_gpu", ([&] {
855
+ const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
856
+ const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
857
+ const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
858
+ const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();
859
+ scalar_t *grad_offset_ = grad_offset.data_ptr<scalar_t>();
860
+ scalar_t *grad_mask_ = grad_mask.data_ptr<scalar_t>();
861
+
862
+ modulated_deformable_col2im_coord_gpu_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS>>>(
863
+ num_kernels, data_col_, data_im_, data_offset_, data_mask_, channels, height_im, width_im,
864
+ kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
865
+ dilation_h, dilation_w, channel_per_deformable_group,
866
+ batch_size, 2 * kernel_h * kernel_w * deformable_group, deformable_group, height_col, width_col,
867
+ grad_offset_, grad_mask_);
868
+ }));
869
+ cudaError_t err = cudaGetLastError();
870
+ if (err != cudaSuccess)
871
+ {
872
+ printf("error in modulated_deformable_col2im_coord_cuda: %s\n", cudaGetErrorString(err));
873
+ }
874
+ }
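Note: every host wrapper in deform_conv_kernel_cuda.cu derives the column-buffer spatial extent from the same convolution arithmetic before launching its kernel. The following is a minimal standalone sketch of that computation only; the helper name col_extent is hypothetical and is not part of this commit.

// Sketch only: mirrors the height_col / width_col arithmetic used by
// deformable_im2col, deformable_col2im and deformable_col2im_coord above.
#include <cstdio>

static int col_extent(int in, int pad, int dilation, int ksize, int stride) {
  // dilated kernel extent, then the standard convolution output-size formula
  return (in + 2 * pad - (dilation * (ksize - 1) + 1)) / stride + 1;
}

int main() {
  // e.g. a 56x56 input, 3x3 kernel, pad 1, stride 1, dilation 1 -> 56x56 columns
  printf("column buffer: %d x %d\n",
         col_extent(56, 1, 1, 3, 1), col_extent(56, 1, 1, 3, 1));
  return 0;
}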
maskrcnn_benchmark/csrc/cuda/deform_pool_cuda.cu ADDED
@@ -0,0 +1,87 @@
1
+ // modify from
2
+ // https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/modulated_dcn_cuda.c
3
+
4
+ // based on
5
+ // author: Charles Shang
6
+ // https://github.com/torch/cunn/blob/master/lib/THCUNN/generic/SpatialConvolutionMM.cu
7
+
8
+ #include <ATen/ATen.h>
9
+ #include <ATen/cuda/CUDAContext.h>
10
+
11
+ #include <THC/THC.h>
12
+ #include <THC/THCDeviceUtils.cuh>
13
+
14
+ #include <vector>
15
+ #include <iostream>
16
+ #include <cmath>
17
+
18
+
19
+ void DeformablePSROIPoolForward(
20
+ const at::Tensor data, const at::Tensor bbox, const at::Tensor trans,
21
+ at::Tensor out, at::Tensor top_count, const int batch, const int channels,
22
+ const int height, const int width, const int num_bbox,
23
+ const int channels_trans, const int no_trans, const float spatial_scale,
24
+ const int output_dim, const int group_size, const int pooled_size,
25
+ const int part_size, const int sample_per_part, const float trans_std);
26
+
27
+ void DeformablePSROIPoolBackwardAcc(
28
+ const at::Tensor out_grad, const at::Tensor data, const at::Tensor bbox,
29
+ const at::Tensor trans, const at::Tensor top_count, at::Tensor in_grad,
30
+ at::Tensor trans_grad, const int batch, const int channels,
31
+ const int height, const int width, const int num_bbox,
32
+ const int channels_trans, const int no_trans, const float spatial_scale,
33
+ const int output_dim, const int group_size, const int pooled_size,
34
+ const int part_size, const int sample_per_part, const float trans_std);
35
+
36
+ void deform_psroi_pooling_cuda_forward(
37
+ at::Tensor input, at::Tensor bbox, at::Tensor trans, at::Tensor out,
38
+ at::Tensor top_count, const int no_trans, const float spatial_scale,
39
+ const int output_dim, const int group_size, const int pooled_size,
40
+ const int part_size, const int sample_per_part, const float trans_std)
41
+ {
42
+ TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
43
+
44
+ const int batch = input.size(0);
45
+ const int channels = input.size(1);
46
+ const int height = input.size(2);
47
+ const int width = input.size(3);
48
+ const int channels_trans = no_trans ? 2 : trans.size(1);
49
+
50
+ const int num_bbox = bbox.size(0);
51
+ if (num_bbox != out.size(0))
52
+ AT_ERROR("Output shape and bbox number wont match: (%d vs %d).",
53
+ out.size(0), num_bbox);
54
+
55
+ DeformablePSROIPoolForward(
56
+ input, bbox, trans, out, top_count, batch, channels, height, width,
57
+ num_bbox, channels_trans, no_trans, spatial_scale, output_dim, group_size,
58
+ pooled_size, part_size, sample_per_part, trans_std);
59
+ }
60
+
61
+ void deform_psroi_pooling_cuda_backward(
62
+ at::Tensor out_grad, at::Tensor input, at::Tensor bbox, at::Tensor trans,
63
+ at::Tensor top_count, at::Tensor input_grad, at::Tensor trans_grad,
64
+ const int no_trans, const float spatial_scale, const int output_dim,
65
+ const int group_size, const int pooled_size, const int part_size,
66
+ const int sample_per_part, const float trans_std)
67
+ {
68
+ TORCH_CHECK(out_grad.is_contiguous(), "out_grad tensor has to be contiguous");
69
+ TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
70
+
71
+ const int batch = input.size(0);
72
+ const int channels = input.size(1);
73
+ const int height = input.size(2);
74
+ const int width = input.size(3);
75
+ const int channels_trans = no_trans ? 2 : trans.size(1);
76
+
77
+ const int num_bbox = bbox.size(0);
78
+ if (num_bbox != out_grad.size(0))
79
+ AT_ERROR("Output shape and bbox number wont match: (%d vs %d).",
80
+ out_grad.size(0), num_bbox);
81
+
82
+ DeformablePSROIPoolBackwardAcc(
83
+ out_grad, input, bbox, trans, top_count, input_grad, trans_grad, batch,
84
+ channels, height, width, num_bbox, channels_trans, no_trans,
85
+ spatial_scale, output_dim, group_size, pooled_size, part_size,
86
+ sample_per_part, trans_std);
87
+ }
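Note: deform_psroi_pooling_cuda_forward checks that out.size(0) matches the number of boxes but otherwise assumes the caller has already allocated out and top_count on the GPU. Below is a hedged caller-side sketch; the {num_bbox, output_dim, pooled_size, pooled_size} shape is inferred from the pooling kernel's (n, ctop, ph, pw) indexing in the next file, and the demo name and hyper-parameters are illustrative, not taken from this commit.

// Sketch only: caller-side allocation before the forward wrapper above.
#include <ATen/ATen.h>

// declaration mirrors the definition above
void deform_psroi_pooling_cuda_forward(
    at::Tensor input, at::Tensor bbox, at::Tensor trans, at::Tensor out,
    at::Tensor top_count, const int no_trans, const float spatial_scale,
    const int output_dim, const int group_size, const int pooled_size,
    const int part_size, const int sample_per_part, const float trans_std);

void deform_psroi_forward_demo(at::Tensor input, at::Tensor bbox, at::Tensor trans) {
  const int num_bbox = static_cast<int>(bbox.size(0));
  // illustrative hyper-parameters only
  const int output_dim = 256, group_size = 7, pooled_size = 7;
  const int part_size = 7, sample_per_part = 4, no_trans = 0;
  const float spatial_scale = 1.f / 16.f, trans_std = 0.1f;

  // output and sample-count buffers, shape inferred from the kernel's output layout
  at::Tensor out = at::zeros({num_bbox, output_dim, pooled_size, pooled_size}, input.options());
  at::Tensor top_count = at::zeros_like(out);

  deform_psroi_pooling_cuda_forward(input, bbox, trans, out, top_count, no_trans,
                                    spatial_scale, output_dim, group_size, pooled_size,
                                    part_size, sample_per_part, trans_std);
}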
maskrcnn_benchmark/csrc/cuda/deform_pool_kernel_cuda.cu ADDED
@@ -0,0 +1,365 @@
1
+ /*!
2
+ * Copyright (c) 2017 Microsoft
3
+ * Licensed under The MIT License [see LICENSE for details]
4
+ * \file deformable_psroi_pooling.cu
5
+ * \brief
6
+ * \author Yi Li, Guodong Zhang, Jifeng Dai
7
+ */
8
+ /***************** Adapted by Charles Shang *********************/
9
+ // modify from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/cuda/deform_psroi_pooling_cuda.cu
10
+
11
+
12
+ #include <ATen/ATen.h>
13
+ #include <THC/THCAtomics.cuh>
14
+ #include <stdio.h>
15
+ #include <math.h>
16
+ #include <algorithm>
17
+
18
+ using namespace at;
19
+
20
+ #define CUDA_KERNEL_LOOP(i, n) \
21
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
22
+ i < (n); \
23
+ i += blockDim.x * gridDim.x)
24
+
25
+ const int CUDA_NUM_THREADS = 1024;
26
+ inline int GET_BLOCKS(const int N)
27
+ {
28
+ return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
29
+ }
30
+
31
+ template <typename scalar_t>
32
+ __device__ scalar_t bilinear_interp(
33
+ const scalar_t *data,
34
+ const scalar_t x,
35
+ const scalar_t y,
36
+ const int width,
37
+ const int height)
38
+ {
39
+ int x1 = floor(x);
40
+ int x2 = ceil(x);
41
+ int y1 = floor(y);
42
+ int y2 = ceil(y);
43
+ scalar_t dist_x = (scalar_t)(x - x1);
44
+ scalar_t dist_y = (scalar_t)(y - y1);
45
+ scalar_t value11 = data[y1 * width + x1];
46
+ scalar_t value12 = data[y2 * width + x1];
47
+ scalar_t value21 = data[y1 * width + x2];
48
+ scalar_t value22 = data[y2 * width + x2];
49
+ scalar_t value = (1 - dist_x) * (1 - dist_y) * value11 + (1 - dist_x) * dist_y * value12 + dist_x * (1 - dist_y) * value21 + dist_x * dist_y * value22;
50
+ return value;
51
+ }
52
+
53
+ template <typename scalar_t>
54
+ __global__ void DeformablePSROIPoolForwardKernel(
55
+ const int count,
56
+ const scalar_t *bottom_data,
57
+ const scalar_t spatial_scale,
58
+ const int channels,
59
+ const int height, const int width,
60
+ const int pooled_height, const int pooled_width,
61
+ const scalar_t *bottom_rois, const scalar_t *bottom_trans,
62
+ const int no_trans,
63
+ const scalar_t trans_std,
64
+ const int sample_per_part,
65
+ const int output_dim,
66
+ const int group_size,
67
+ const int part_size,
68
+ const int num_classes,
69
+ const int channels_each_class,
70
+ scalar_t *top_data,
71
+ scalar_t *top_count)
72
+ {
73
+ CUDA_KERNEL_LOOP(index, count)
74
+ {
75
+ // The output is in order (n, ctop, ph, pw)
76
+ int pw = index % pooled_width;
77
+ int ph = (index / pooled_width) % pooled_height;
78
+ int ctop = (index / pooled_width / pooled_height) % output_dim;
79
+ int n = index / pooled_width / pooled_height / output_dim;
80
+
81
+ // [start, end) interval for spatial sampling
82
+ const scalar_t *offset_bottom_rois = bottom_rois + n * 5;
83
+ int roi_batch_ind = offset_bottom_rois[0];
84
+ scalar_t roi_start_w = (scalar_t)(round(offset_bottom_rois[1])) * spatial_scale - 0.5;
85
+ scalar_t roi_start_h = (scalar_t)(round(offset_bottom_rois[2])) * spatial_scale - 0.5;
86
+ scalar_t roi_end_w = (scalar_t)(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5;
87
+ scalar_t roi_end_h = (scalar_t)(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5;
88
+
89
+ // Force too small ROIs to be 1x1
90
+ scalar_t roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0
91
+ scalar_t roi_height = max(roi_end_h - roi_start_h, 0.1);
92
+
93
+ // Compute w and h at bottom
94
+ scalar_t bin_size_h = roi_height / (scalar_t)(pooled_height);
95
+ scalar_t bin_size_w = roi_width / (scalar_t)(pooled_width);
96
+
97
+ scalar_t sub_bin_size_h = bin_size_h / (scalar_t)(sample_per_part);
98
+ scalar_t sub_bin_size_w = bin_size_w / (scalar_t)(sample_per_part);
99
+
100
+ int part_h = floor((scalar_t)(ph) / pooled_height * part_size);
101
+ int part_w = floor((scalar_t)(pw) / pooled_width * part_size);
102
+ int class_id = ctop / channels_each_class;
103
+ scalar_t trans_x = no_trans ? (scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std;
104
+ scalar_t trans_y = no_trans ? (scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std;
105
+
106
+ scalar_t wstart = (scalar_t)(pw)*bin_size_w + roi_start_w;
107
+ wstart += trans_x * roi_width;
108
+ scalar_t hstart = (scalar_t)(ph)*bin_size_h + roi_start_h;
109
+ hstart += trans_y * roi_height;
110
+
111
+ scalar_t sum = 0;
112
+ int count = 0;
113
+ int gw = floor((scalar_t)(pw)*group_size / pooled_width);
114
+ int gh = floor((scalar_t)(ph)*group_size / pooled_height);
115
+ gw = min(max(gw, 0), group_size - 1);
116
+ gh = min(max(gh, 0), group_size - 1);
117
+
118
+ const scalar_t *offset_bottom_data = bottom_data + (roi_batch_ind * channels) * height * width;
119
+ for (int ih = 0; ih < sample_per_part; ih++)
120
+ {
121
+ for (int iw = 0; iw < sample_per_part; iw++)
122
+ {
123
+ scalar_t w = wstart + iw * sub_bin_size_w;
124
+ scalar_t h = hstart + ih * sub_bin_size_h;
125
+ // bilinear interpolation
126
+ if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5)
127
+ {
128
+ continue;
129
+ }
130
+ w = min(max(w, 0.), width - 1.);
131
+ h = min(max(h, 0.), height - 1.);
132
+ int c = (ctop * group_size + gh) * group_size + gw;
133
+ scalar_t val = bilinear_interp(offset_bottom_data + c * height * width, w, h, width, height);
134
+ sum += val;
135
+ count++;
136
+ }
137
+ }
138
+ top_data[index] = count == 0 ? (scalar_t)(0) : sum / count;
139
+ top_count[index] = count;
140
+ }
141
+ }
142
+
143
+ template <typename scalar_t>
144
+ __global__ void DeformablePSROIPoolBackwardAccKernel(
145
+ const int count,
146
+ const scalar_t *top_diff,
147
+ const scalar_t *top_count,
148
+ const int num_rois,
149
+ const scalar_t spatial_scale,
150
+ const int channels,
151
+ const int height, const int width,
152
+ const int pooled_height, const int pooled_width,
153
+ const int output_dim,
154
+ scalar_t *bottom_data_diff, scalar_t *bottom_trans_diff,
155
+ const scalar_t *bottom_data,
156
+ const scalar_t *bottom_rois,
157
+ const scalar_t *bottom_trans,
158
+ const int no_trans,
159
+ const scalar_t trans_std,
160
+ const int sample_per_part,
161
+ const int group_size,
162
+ const int part_size,
163
+ const int num_classes,
164
+ const int channels_each_class)
165
+ {
166
+ CUDA_KERNEL_LOOP(index, count)
167
+ {
168
+ // The output is in order (n, ctop, ph, pw)
169
+ int pw = index % pooled_width;
170
+ int ph = (index / pooled_width) % pooled_height;
171
+ int ctop = (index / pooled_width / pooled_height) % output_dim;
172
+ int n = index / pooled_width / pooled_height / output_dim;
173
+
174
+ // [start, end) interval for spatial sampling
175
+ const scalar_t *offset_bottom_rois = bottom_rois + n * 5;
176
+ int roi_batch_ind = offset_bottom_rois[0];
177
+ scalar_t roi_start_w = (scalar_t)(round(offset_bottom_rois[1])) * spatial_scale - 0.5;
178
+ scalar_t roi_start_h = (scalar_t)(round(offset_bottom_rois[2])) * spatial_scale - 0.5;
179
+ scalar_t roi_end_w = (scalar_t)(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5;
180
+ scalar_t roi_end_h = (scalar_t)(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5;
181
+
182
+ // Force too small ROIs to be 1x1
183
+ scalar_t roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0
184
+ scalar_t roi_height = max(roi_end_h - roi_start_h, 0.1);
185
+
186
+ // Compute w and h at bottom
187
+ scalar_t bin_size_h = roi_height / (scalar_t)(pooled_height);
188
+ scalar_t bin_size_w = roi_width / (scalar_t)(pooled_width);
189
+
190
+ scalar_t sub_bin_size_h = bin_size_h / (scalar_t)(sample_per_part);
191
+ scalar_t sub_bin_size_w = bin_size_w / (scalar_t)(sample_per_part);
192
+
193
+ int part_h = floor((scalar_t)(ph) / pooled_height * part_size);
194
+ int part_w = floor((scalar_t)(pw) / pooled_width * part_size);
195
+ int class_id = ctop / channels_each_class;
196
+ scalar_t trans_x = no_trans ? (scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std;
197
+ scalar_t trans_y = no_trans ? (scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std;
198
+
199
+ scalar_t wstart = (scalar_t)(pw)*bin_size_w + roi_start_w;
200
+ wstart += trans_x * roi_width;
201
+ scalar_t hstart = (scalar_t)(ph)*bin_size_h + roi_start_h;
202
+ hstart += trans_y * roi_height;
203
+
204
+ if (top_count[index] <= 0)
205
+ {
206
+ continue;
207
+ }
208
+ scalar_t diff_val = top_diff[index] / top_count[index];
209
+ const scalar_t *offset_bottom_data = bottom_data + roi_batch_ind * channels * height * width;
210
+ scalar_t *offset_bottom_data_diff = bottom_data_diff + roi_batch_ind * channels * height * width;
211
+ int gw = floor((scalar_t)(pw)*group_size / pooled_width);
212
+ int gh = floor((scalar_t)(ph)*group_size / pooled_height);
213
+ gw = min(max(gw, 0), group_size - 1);
214
+ gh = min(max(gh, 0), group_size - 1);
215
+
216
+ for (int ih = 0; ih < sample_per_part; ih++)
217
+ {
218
+ for (int iw = 0; iw < sample_per_part; iw++)
219
+ {
220
+ scalar_t w = wstart + iw * sub_bin_size_w;
221
+ scalar_t h = hstart + ih * sub_bin_size_h;
222
+ // bilinear interpolation
223
+ if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5)
224
+ {
225
+ continue;
226
+ }
227
+ w = min(max(w, 0.), width - 1.);
228
+ h = min(max(h, 0.), height - 1.);
229
+ int c = (ctop * group_size + gh) * group_size + gw;
230
+ // backward on feature
231
+ int x0 = floor(w);
232
+ int x1 = ceil(w);
233
+ int y0 = floor(h);
234
+ int y1 = ceil(h);
235
+ scalar_t dist_x = w - x0, dist_y = h - y0;
236
+ scalar_t q00 = (1 - dist_x) * (1 - dist_y);
237
+ scalar_t q01 = (1 - dist_x) * dist_y;
238
+ scalar_t q10 = dist_x * (1 - dist_y);
239
+ scalar_t q11 = dist_x * dist_y;
240
+ int bottom_index_base = c * height * width;
241
+ atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x0, q00 * diff_val);
242
+ atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x0, q01 * diff_val);
243
+ atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x1, q10 * diff_val);
244
+ atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x1, q11 * diff_val);
245
+
246
+ if (no_trans)
247
+ {
248
+ continue;
249
+ }
250
+ scalar_t U00 = offset_bottom_data[bottom_index_base + y0 * width + x0];
251
+ scalar_t U01 = offset_bottom_data[bottom_index_base + y1 * width + x0];
252
+ scalar_t U10 = offset_bottom_data[bottom_index_base + y0 * width + x1];
253
+ scalar_t U11 = offset_bottom_data[bottom_index_base + y1 * width + x1];
254
+ scalar_t diff_x = (U11 * dist_y + U10 * (1 - dist_y) - U01 * dist_y - U00 * (1 - dist_y)) * trans_std * diff_val;
255
+ diff_x *= roi_width;
256
+ scalar_t diff_y = (U11 * dist_x + U01 * (1 - dist_x) - U10 * dist_x - U00 * (1 - dist_x)) * trans_std * diff_val;
257
+ diff_y *= roi_height;
258
+
259
+ atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w, diff_x);
260
+ atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w, diff_y);
261
+ }
262
+ }
263
+ }
264
+ }
265
+
266
+ void DeformablePSROIPoolForward(const at::Tensor data,
267
+ const at::Tensor bbox,
268
+ const at::Tensor trans,
269
+ at::Tensor out,
270
+ at::Tensor top_count,
271
+ const int batch,
272
+ const int channels,
273
+ const int height,
274
+ const int width,
275
+ const int num_bbox,
276
+ const int channels_trans,
277
+ const int no_trans,
278
+ const float spatial_scale,
279
+ const int output_dim,
280
+ const int group_size,
281
+ const int pooled_size,
282
+ const int part_size,
283
+ const int sample_per_part,
284
+ const float trans_std)
285
+ {
286
+ const int pooled_height = pooled_size;
287
+ const int pooled_width = pooled_size;
288
+ const int count = num_bbox * output_dim * pooled_height * pooled_width;
289
+ const int num_classes = no_trans ? 1 : channels_trans / 2;
290
+ const int channels_each_class = no_trans ? output_dim : output_dim / num_classes;
291
+
292
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(
293
+ data.scalar_type(), "deformable_psroi_pool_forward", ([&] {
294
+ const scalar_t *bottom_data = data.data_ptr<scalar_t>();
295
+ const scalar_t *bottom_rois = bbox.data_ptr<scalar_t>();
296
+ const scalar_t *bottom_trans = no_trans ? NULL : trans.data_ptr<scalar_t>();
297
+ scalar_t *top_data = out.data_ptr<scalar_t>();
298
+ scalar_t *top_count_data = top_count.data_ptr<scalar_t>();
299
+
300
+ DeformablePSROIPoolForwardKernel<<<GET_BLOCKS(count), CUDA_NUM_THREADS>>>(
301
+ count, bottom_data, (scalar_t)spatial_scale, channels, height, width, pooled_height, pooled_width,
302
+ bottom_rois, bottom_trans, no_trans, (scalar_t)trans_std, sample_per_part, output_dim,
303
+ group_size, part_size, num_classes, channels_each_class, top_data, top_count_data);
304
+ }));
305
+
306
+ cudaError_t err = cudaGetLastError();
307
+ if (err != cudaSuccess)
308
+ {
309
+ printf("error in DeformablePSROIPoolForward: %s\n", cudaGetErrorString(err));
310
+ }
311
+ }
312
+
313
+ void DeformablePSROIPoolBackwardAcc(const at::Tensor out_grad,
314
+ const at::Tensor data,
315
+ const at::Tensor bbox,
316
+ const at::Tensor trans,
317
+ const at::Tensor top_count,
318
+ at::Tensor in_grad,
319
+ at::Tensor trans_grad,
320
+ const int batch,
321
+ const int channels,
322
+ const int height,
323
+ const int width,
324
+ const int num_bbox,
325
+ const int channels_trans,
326
+ const int no_trans,
327
+ const float spatial_scale,
328
+ const int output_dim,
329
+ const int group_size,
330
+ const int pooled_size,
331
+ const int part_size,
332
+ const int sample_per_part,
333
+ const float trans_std)
334
+ {
335
+ // LOG(INFO) << "DeformablePSROIPoolBackward";
336
+ const int num_rois = num_bbox;
337
+ const int pooled_height = pooled_size;
338
+ const int pooled_width = pooled_size;
339
+ const int count = num_bbox * output_dim * pooled_height * pooled_width;
340
+ const int num_classes = no_trans ? 1 : channels_trans / 2;
341
+ const int channels_each_class = no_trans ? output_dim : output_dim / num_classes;
342
+
343
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(
344
+ out_grad.scalar_type(), "deformable_psroi_pool_backward_acc", ([&] {
345
+ const scalar_t *top_diff = out_grad.data_ptr<scalar_t>();
346
+ const scalar_t *bottom_data = data.data_ptr<scalar_t>();
347
+ const scalar_t *bottom_rois = bbox.data_ptr<scalar_t>();
348
+ const scalar_t *bottom_trans = no_trans ? NULL : trans.data_ptr<scalar_t>();
349
+ scalar_t *bottom_data_diff = in_grad.data_ptr<scalar_t>();
350
+ scalar_t *bottom_trans_diff = no_trans ? NULL : trans_grad.data_ptr<scalar_t>();
351
+ const scalar_t *top_count_data = top_count.data_ptr<scalar_t>();
352
+
353
+ DeformablePSROIPoolBackwardAccKernel<<<GET_BLOCKS(count), CUDA_NUM_THREADS>>>(
354
+ count, top_diff, top_count_data, num_rois, (scalar_t)spatial_scale, channels, height, width,
355
+ pooled_height, pooled_width, output_dim, bottom_data_diff, bottom_trans_diff,
356
+ bottom_data, bottom_rois, bottom_trans, no_trans, (scalar_t)trans_std, sample_per_part,
357
+ group_size, part_size, num_classes, channels_each_class);
358
+ }));
359
+
360
+ cudaError_t err = cudaGetLastError();
361
+ if (err != cudaSuccess)
362
+ {
363
+ printf("error in DeformablePSROIPoolForward: %s\n", cudaGetErrorString(err));
364
+ }
365
+ }
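Note: every kernel in this file is launched the same way, with GET_BLOCKS(count) blocks of CUDA_NUM_THREADS threads, and CUDA_KERNEL_LOOP makes each thread stride over indices until count is exhausted, so the launch stays valid even when count exceeds the grid capacity. A minimal self-contained sketch of that grid-stride pattern (toy kernel, not part of this commit):

// Sketch only: the grid-stride launch pattern used by the kernels above.
#define CUDA_KERNEL_LOOP(i, n)                              \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x;       \
       i < (n);                                             \
       i += blockDim.x * gridDim.x)

const int CUDA_NUM_THREADS = 1024;
inline int GET_BLOCKS(const int N) { return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; }

__global__ void scale_kernel(const int n, const float s, float *data) {
  CUDA_KERNEL_LOOP(index, n) {  // each thread handles index, index + stride, ...
    data[index] *= s;
  }
}

// launch: scale_kernel<<<GET_BLOCKS(n), CUDA_NUM_THREADS>>>(n, 2.f, d_data);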
maskrcnn_benchmark/csrc/cuda/ml_nms.cu ADDED
@@ -0,0 +1,136 @@
1
+ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+ #include <ATen/ATen.h>
3
+ #include <ATen/cuda/CUDAContext.h>
4
+
5
+ #include <THC/THC.h>
6
+ #include <THC/THCDeviceUtils.cuh>
7
+
8
+ #include <vector>
9
+ #include <iostream>
10
+
11
+ int const threadsPerBlock = sizeof(unsigned long long) * 8;
12
+
13
+ __device__ inline float devIoU(float const * const a, float const * const b) {
14
+ if (a[5] != b[5]) {
15
+ return 0.0;
16
+ }
17
+ float left = max(a[0], b[0]), right = min(a[2], b[2]);
18
+ float top = max(a[1], b[1]), bottom = min(a[3], b[3]);
19
+ float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
20
+ float interS = width * height;
21
+ float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
22
+ float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
23
+ return interS / (Sa + Sb - interS);
24
+ }
25
+
26
+ __global__ void ml_nms_kernel(const int n_boxes, const float nms_overlap_thresh,
27
+ const float *dev_boxes, unsigned long long *dev_mask) {
28
+ const int row_start = blockIdx.y;
29
+ const int col_start = blockIdx.x;
30
+
31
+ // if (row_start > col_start) return;
32
+
33
+ const int row_size =
34
+ min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
35
+ const int col_size =
36
+ min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
37
+
38
+ __shared__ float block_boxes[threadsPerBlock * 6];
39
+ if (threadIdx.x < col_size) {
40
+ block_boxes[threadIdx.x * 6 + 0] =
41
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 0];
42
+ block_boxes[threadIdx.x * 6 + 1] =
43
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 1];
44
+ block_boxes[threadIdx.x * 6 + 2] =
45
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 2];
46
+ block_boxes[threadIdx.x * 6 + 3] =
47
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 3];
48
+ block_boxes[threadIdx.x * 6 + 4] =
49
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 4];
50
+ block_boxes[threadIdx.x * 6 + 5] =
51
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 5];
52
+ }
53
+ __syncthreads();
54
+
55
+ if (threadIdx.x < row_size) {
56
+ const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
57
+ const float *cur_box = dev_boxes + cur_box_idx * 6;
58
+ int i = 0;
59
+ unsigned long long t = 0;
60
+ int start = 0;
61
+ if (row_start == col_start) {
62
+ start = threadIdx.x + 1;
63
+ }
64
+ for (i = start; i < col_size; i++) {
65
+ if (devIoU(cur_box, block_boxes + i * 6) > nms_overlap_thresh) {
66
+ t |= 1ULL << i;
67
+ }
68
+ }
69
+ const int col_blocks = THCCeilDiv(n_boxes, threadsPerBlock);
70
+ dev_mask[cur_box_idx * col_blocks + col_start] = t;
71
+ }
72
+ }
73
+
74
+ // boxes is a N x 6 tensor
75
+ at::Tensor ml_nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) {
76
+ using scalar_t = float;
77
+ AT_ASSERTM(boxes.device().is_cuda(), "boxes must be a CUDA tensor");
78
+ auto scores = boxes.select(1, 4);
79
+ auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
80
+ auto boxes_sorted = boxes.index_select(0, order_t);
81
+
82
+ int boxes_num = boxes.size(0);
83
+
84
+ const int col_blocks = THCCeilDiv(boxes_num, threadsPerBlock);
85
+
86
+ scalar_t* boxes_dev = boxes_sorted.data_ptr<scalar_t>();
87
+
88
+ THCState *state = at::globalContext().lazyInitCUDA(); // TODO replace with getTHCState
89
+
90
+ unsigned long long* mask_dev = NULL;
91
+ //THCudaCheck(THCudaMalloc(state, (void**) &mask_dev,
92
+ // boxes_num * col_blocks * sizeof(unsigned long long)));
93
+
94
+ mask_dev = (unsigned long long*) THCudaMalloc(state, boxes_num * col_blocks * sizeof(unsigned long long));
95
+
96
+ dim3 blocks(THCCeilDiv(boxes_num, threadsPerBlock),
97
+ THCCeilDiv(boxes_num, threadsPerBlock));
98
+ dim3 threads(threadsPerBlock);
99
+ ml_nms_kernel<<<blocks, threads>>>(boxes_num,
100
+ nms_overlap_thresh,
101
+ boxes_dev,
102
+ mask_dev);
103
+
104
+ std::vector<unsigned long long> mask_host(boxes_num * col_blocks);
105
+ THCudaCheck(cudaMemcpy(&mask_host[0],
106
+ mask_dev,
107
+ sizeof(unsigned long long) * boxes_num * col_blocks,
108
+ cudaMemcpyDeviceToHost));
109
+
110
+ std::vector<unsigned long long> remv(col_blocks);
111
+ memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
112
+
113
+ at::Tensor keep = at::empty({boxes_num}, boxes.options().dtype(at::kLong).device(at::kCPU));
114
+ int64_t* keep_out = keep.data_ptr<int64_t>();
115
+
116
+ int num_to_keep = 0;
117
+ for (int i = 0; i < boxes_num; i++) {
118
+ int nblock = i / threadsPerBlock;
119
+ int inblock = i % threadsPerBlock;
120
+
121
+ if (!(remv[nblock] & (1ULL << inblock))) {
122
+ keep_out[num_to_keep++] = i;
123
+ unsigned long long *p = &mask_host[0] + i * col_blocks;
124
+ for (int j = nblock; j < col_blocks; j++) {
125
+ remv[j] |= p[j];
126
+ }
127
+ }
128
+ }
129
+
130
+ THCudaFree(state, mask_dev);
131
+ // TODO improve this part
132
+ return std::get<0>(order_t.index({
133
+ keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep).to(
134
+ order_t.device(), keep.scalar_type())
135
+ }).sort(0, false));
136
+ }
maskrcnn_benchmark/csrc/cuda/nms.cu ADDED
@@ -0,0 +1,131 @@
1
+ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+ #include <ATen/ATen.h>
3
+ #include <ATen/cuda/CUDAContext.h>
4
+
5
+ #include <THC/THC.h>
6
+ #include <THC/THCDeviceUtils.cuh>
7
+
8
+ #include <vector>
9
+ #include <iostream>
10
+
11
+ int const threadsPerBlock = sizeof(unsigned long long) * 8;
12
+
13
+ __device__ inline float devIoU(float const * const a, float const * const b) {
14
+ float left = max(a[0], b[0]), right = min(a[2], b[2]);
15
+ float top = max(a[1], b[1]), bottom = min(a[3], b[3]);
16
+ float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
17
+ float interS = width * height;
18
+ float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
19
+ float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
20
+ return interS / (Sa + Sb - interS);
21
+ }
22
+
23
+ __global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh,
24
+ const float *dev_boxes, unsigned long long *dev_mask) {
25
+ const int row_start = blockIdx.y;
26
+ const int col_start = blockIdx.x;
27
+
28
+ // if (row_start > col_start) return;
29
+
30
+ const int row_size =
31
+ min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
32
+ const int col_size =
33
+ min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
34
+
35
+ __shared__ float block_boxes[threadsPerBlock * 5];
36
+ if (threadIdx.x < col_size) {
37
+ block_boxes[threadIdx.x * 5 + 0] =
38
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0];
39
+ block_boxes[threadIdx.x * 5 + 1] =
40
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1];
41
+ block_boxes[threadIdx.x * 5 + 2] =
42
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2];
43
+ block_boxes[threadIdx.x * 5 + 3] =
44
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3];
45
+ block_boxes[threadIdx.x * 5 + 4] =
46
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4];
47
+ }
48
+ __syncthreads();
49
+
50
+ if (threadIdx.x < row_size) {
51
+ const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
52
+ const float *cur_box = dev_boxes + cur_box_idx * 5;
53
+ int i = 0;
54
+ unsigned long long t = 0;
55
+ int start = 0;
56
+ if (row_start == col_start) {
57
+ start = threadIdx.x + 1;
58
+ }
59
+ for (i = start; i < col_size; i++) {
60
+ if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
61
+ t |= 1ULL << i;
62
+ }
63
+ }
64
+ const int col_blocks = THCCeilDiv(n_boxes, threadsPerBlock);
65
+ dev_mask[cur_box_idx * col_blocks + col_start] = t;
66
+ }
67
+ }
68
+
69
+ // boxes is a N x 5 tensor
70
+ at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) {
71
+ using scalar_t = float;
72
+ AT_ASSERTM(boxes.device().is_cuda(), "boxes must be a CUDA tensor");
73
+ auto scores = boxes.select(1, 4);
74
+ auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
75
+ auto boxes_sorted = boxes.index_select(0, order_t);
76
+
77
+ int boxes_num = boxes.size(0);
78
+
79
+ const int col_blocks = THCCeilDiv(boxes_num, threadsPerBlock);
80
+
81
+ scalar_t* boxes_dev = boxes_sorted.data_ptr<scalar_t>();
82
+
83
+ THCState *state = at::globalContext().lazyInitCUDA(); // TODO replace with getTHCState
84
+
85
+ unsigned long long* mask_dev = NULL;
86
+ //THCudaCheck(THCudaMalloc(state, (void**) &mask_dev,
87
+ // boxes_num * col_blocks * sizeof(unsigned long long)));
88
+
89
+ mask_dev = (unsigned long long*) THCudaMalloc(state, boxes_num * col_blocks * sizeof(unsigned long long));
90
+
91
+ dim3 blocks(THCCeilDiv(boxes_num, threadsPerBlock),
92
+ THCCeilDiv(boxes_num, threadsPerBlock));
93
+ dim3 threads(threadsPerBlock);
94
+ nms_kernel<<<blocks, threads>>>(boxes_num,
95
+ nms_overlap_thresh,
96
+ boxes_dev,
97
+ mask_dev);
98
+
99
+ std::vector<unsigned long long> mask_host(boxes_num * col_blocks);
100
+ THCudaCheck(cudaMemcpy(&mask_host[0],
101
+ mask_dev,
102
+ sizeof(unsigned long long) * boxes_num * col_blocks,
103
+ cudaMemcpyDeviceToHost));
104
+
105
+ std::vector<unsigned long long> remv(col_blocks);
106
+ memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
107
+
108
+ at::Tensor keep = at::empty({boxes_num}, boxes.options().dtype(at::kLong).device(at::kCPU));
109
+ int64_t* keep_out = keep.data_ptr<int64_t>();
110
+
111
+ int num_to_keep = 0;
112
+ for (int i = 0; i < boxes_num; i++) {
113
+ int nblock = i / threadsPerBlock;
114
+ int inblock = i % threadsPerBlock;
115
+
116
+ if (!(remv[nblock] & (1ULL << inblock))) {
117
+ keep_out[num_to_keep++] = i;
118
+ unsigned long long *p = &mask_host[0] + i * col_blocks;
119
+ for (int j = nblock; j < col_blocks; j++) {
120
+ remv[j] |= p[j];
121
+ }
122
+ }
123
+ }
124
+
125
+ THCudaFree(state, mask_dev);
126
+ // TODO improve this part
127
+ return std::get<0>(order_t.index({
128
+ keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep).to(
129
+ order_t.device(), keep.scalar_type())
130
+ }).sort(0, false));
131
+ }
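
For reference, the kernel and gather above implement greedy hard NMS with the legacy +1 width/height convention. A minimal pure-PyTorch sketch, handy for sanity-checking the CUDA path on small inputs (illustrative names; the returned ordering may differ from nms_cuda, which sorts the kept original indices):

    import torch

    def nms_reference(boxes, scores, iou_thresh):
        # boxes: (N, 4) float [x1, y1, x2, y2]; scores: (N,) float.
        order = scores.sort(descending=True).indices
        areas = (boxes[:, 2] - boxes[:, 0] + 1) * (boxes[:, 3] - boxes[:, 1] + 1)
        suppressed = torch.zeros(boxes.size(0), dtype=torch.bool)
        keep = []
        for idx in order.tolist():
            if suppressed[idx]:
                continue
            keep.append(idx)
            # IoU of the kept box against every box, same +1 convention as devIoU.
            x1 = torch.max(boxes[idx, 0], boxes[:, 0])
            y1 = torch.max(boxes[idx, 1], boxes[:, 1])
            x2 = torch.min(boxes[idx, 2], boxes[:, 2])
            y2 = torch.min(boxes[idx, 3], boxes[:, 3])
            inter = (x2 - x1 + 1).clamp(min=0) * (y2 - y1 + 1).clamp(min=0)
            iou = inter / (areas[idx] + areas - inter)
            suppressed |= iou > iou_thresh
        return torch.tensor(keep, dtype=torch.long)
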
maskrcnn_benchmark/csrc/cuda/vision.h ADDED
@@ -0,0 +1,116 @@
1
+ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+ #pragma once
3
+ #include <torch/extension.h>
4
+
5
+
6
+ at::Tensor SigmoidFocalLoss_forward_cuda(
7
+ const at::Tensor& logits,
8
+ const at::Tensor& targets,
9
+ const int num_classes,
10
+ const float gamma,
11
+ const float alpha);
12
+
13
+ at::Tensor SigmoidFocalLoss_backward_cuda(
14
+ const at::Tensor& logits,
15
+ const at::Tensor& targets,
16
+ const at::Tensor& d_losses,
17
+ const int num_classes,
18
+ const float gamma,
19
+ const float alpha);
20
+
21
+ at::Tensor ROIAlign_forward_cuda(const at::Tensor& input,
22
+ const at::Tensor& rois,
23
+ const float spatial_scale,
24
+ const int pooled_height,
25
+ const int pooled_width,
26
+ const int sampling_ratio);
27
+
28
+ at::Tensor ROIAlign_backward_cuda(const at::Tensor& grad,
29
+ const at::Tensor& rois,
30
+ const float spatial_scale,
31
+ const int pooled_height,
32
+ const int pooled_width,
33
+ const int batch_size,
34
+ const int channels,
35
+ const int height,
36
+ const int width,
37
+ const int sampling_ratio);
38
+
39
+
40
+ std::tuple<at::Tensor, at::Tensor> ROIPool_forward_cuda(const at::Tensor& input,
41
+ const at::Tensor& rois,
42
+ const float spatial_scale,
43
+ const int pooled_height,
44
+ const int pooled_width);
45
+
46
+ at::Tensor ROIPool_backward_cuda(const at::Tensor& grad,
47
+ const at::Tensor& input,
48
+ const at::Tensor& rois,
49
+ const at::Tensor& argmax,
50
+ const float spatial_scale,
51
+ const int pooled_height,
52
+ const int pooled_width,
53
+ const int batch_size,
54
+ const int channels,
55
+ const int height,
56
+ const int width);
57
+
58
+ at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh);
59
+ at::Tensor ml_nms_cuda(const at::Tensor boxes, float nms_overlap_thresh);
60
+
61
+ int deform_conv_forward_cuda(at::Tensor input, at::Tensor weight,
62
+ at::Tensor offset, at::Tensor output,
63
+ at::Tensor columns, at::Tensor ones, int kW,
64
+ int kH, int dW, int dH, int padW, int padH,
65
+ int dilationW, int dilationH, int group,
66
+ int deformable_group, int im2col_step);
67
+
68
+ int deform_conv_backward_input_cuda(at::Tensor input, at::Tensor offset,
69
+ at::Tensor gradOutput, at::Tensor gradInput,
70
+ at::Tensor gradOffset, at::Tensor weight,
71
+ at::Tensor columns, int kW, int kH, int dW,
72
+ int dH, int padW, int padH, int dilationW,
73
+ int dilationH, int group,
74
+ int deformable_group, int im2col_step);
75
+
76
+ int deform_conv_backward_parameters_cuda(
77
+ at::Tensor input, at::Tensor offset, at::Tensor gradOutput,
78
+ at::Tensor gradWeight, // at::Tensor gradBias,
79
+ at::Tensor columns, at::Tensor ones, int kW, int kH, int dW, int dH,
80
+ int padW, int padH, int dilationW, int dilationH, int group,
81
+ int deformable_group, float scale, int im2col_step);
82
+
83
+ void modulated_deform_conv_cuda_forward(
84
+ at::Tensor input, at::Tensor weight, at::Tensor bias, at::Tensor ones,
85
+ at::Tensor offset, at::Tensor mask, at::Tensor output, at::Tensor columns,
86
+ int kernel_h, int kernel_w, const int stride_h, const int stride_w,
87
+ const int pad_h, const int pad_w, const int dilation_h,
88
+ const int dilation_w, const int group, const int deformable_group,
89
+ const bool with_bias);
90
+
91
+ void modulated_deform_conv_cuda_backward(
92
+ at::Tensor input, at::Tensor weight, at::Tensor bias, at::Tensor ones,
93
+ at::Tensor offset, at::Tensor mask, at::Tensor columns,
94
+ at::Tensor grad_input, at::Tensor grad_weight, at::Tensor grad_bias,
95
+ at::Tensor grad_offset, at::Tensor grad_mask, at::Tensor grad_output,
96
+ int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h,
97
+ int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,
98
+ const bool with_bias);
99
+
100
+ void deform_psroi_pooling_cuda_forward(
101
+ at::Tensor input, at::Tensor bbox, at::Tensor trans, at::Tensor out,
102
+ at::Tensor top_count, const int no_trans, const float spatial_scale,
103
+ const int output_dim, const int group_size, const int pooled_size,
104
+ const int part_size, const int sample_per_part, const float trans_std);
105
+
106
+ void deform_psroi_pooling_cuda_backward(
107
+ at::Tensor out_grad, at::Tensor input, at::Tensor bbox, at::Tensor trans,
108
+ at::Tensor top_count, at::Tensor input_grad, at::Tensor trans_grad,
109
+ const int no_trans, const float spatial_scale, const int output_dim,
110
+ const int group_size, const int pooled_size, const int part_size,
111
+ const int sample_per_part, const float trans_std);
112
+
113
+
114
+ at::Tensor compute_flow_cuda(const at::Tensor& boxes,
115
+ const int height,
116
+ const int width);
maskrcnn_benchmark/csrc/deform_conv.h ADDED
@@ -0,0 +1,191 @@
1
+ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+ #pragma once
3
+ #include "cpu/vision.h"
4
+
5
+ #ifdef WITH_CUDA
6
+ #include "cuda/vision.h"
7
+ #endif
8
+
9
+
10
+ // Interface for Python
11
+ int deform_conv_forward(
12
+ at::Tensor input,
13
+ at::Tensor weight,
14
+ at::Tensor offset,
15
+ at::Tensor output,
16
+ at::Tensor columns,
17
+ at::Tensor ones,
18
+ int kW,
19
+ int kH,
20
+ int dW,
21
+ int dH,
22
+ int padW,
23
+ int padH,
24
+ int dilationW,
25
+ int dilationH,
26
+ int group,
27
+ int deformable_group,
28
+ int im2col_step)
29
+ {
30
+ if (input.device().is_cuda()) {
31
+ #ifdef WITH_CUDA
32
+ return deform_conv_forward_cuda(
33
+ input, weight, offset, output, columns, ones,
34
+ kW, kH, dW, dH, padW, padH, dilationW, dilationH,
35
+ group, deformable_group, im2col_step
36
+ );
37
+ #else
38
+ AT_ERROR("Not compiled with GPU support");
39
+ #endif
40
+ }
41
+ AT_ERROR("Not implemented on the CPU");
42
+ }
43
+
44
+
45
+ int deform_conv_backward_input(
46
+ at::Tensor input,
47
+ at::Tensor offset,
48
+ at::Tensor gradOutput,
49
+ at::Tensor gradInput,
50
+ at::Tensor gradOffset,
51
+ at::Tensor weight,
52
+ at::Tensor columns,
53
+ int kW,
54
+ int kH,
55
+ int dW,
56
+ int dH,
57
+ int padW,
58
+ int padH,
59
+ int dilationW,
60
+ int dilationH,
61
+ int group,
62
+ int deformable_group,
63
+ int im2col_step)
64
+ {
65
+ if (input.device().is_cuda()) {
66
+ #ifdef WITH_CUDA
67
+ return deform_conv_backward_input_cuda(
68
+ input, offset, gradOutput, gradInput, gradOffset, weight, columns,
69
+ kW, kH, dW, dH, padW, padH, dilationW, dilationH,
70
+ group, deformable_group, im2col_step
71
+ );
72
+ #else
73
+ AT_ERROR("Not compiled with GPU support");
74
+ #endif
75
+ }
76
+ AT_ERROR("Not implemented on the CPU");
77
+ }
78
+
79
+
80
+ int deform_conv_backward_parameters(
81
+ at::Tensor input,
82
+ at::Tensor offset,
83
+ at::Tensor gradOutput,
84
+ at::Tensor gradWeight, // at::Tensor gradBias,
85
+ at::Tensor columns,
86
+ at::Tensor ones,
87
+ int kW,
88
+ int kH,
89
+ int dW,
90
+ int dH,
91
+ int padW,
92
+ int padH,
93
+ int dilationW,
94
+ int dilationH,
95
+ int group,
96
+ int deformable_group,
97
+ float scale,
98
+ int im2col_step)
99
+ {
100
+ if (input.device().is_cuda()) {
101
+ #ifdef WITH_CUDA
102
+ return deform_conv_backward_parameters_cuda(
103
+ input, offset, gradOutput, gradWeight, columns, ones,
104
+ kW, kH, dW, dH, padW, padH, dilationW, dilationH,
105
+ group, deformable_group, scale, im2col_step
106
+ );
107
+ #else
108
+ AT_ERROR("Not compiled with GPU support");
109
+ #endif
110
+ }
111
+ AT_ERROR("Not implemented on the CPU");
112
+ }
113
+
114
+
115
+ void modulated_deform_conv_forward(
116
+ at::Tensor input,
117
+ at::Tensor weight,
118
+ at::Tensor bias,
119
+ at::Tensor ones,
120
+ at::Tensor offset,
121
+ at::Tensor mask,
122
+ at::Tensor output,
123
+ at::Tensor columns,
124
+ int kernel_h,
125
+ int kernel_w,
126
+ const int stride_h,
127
+ const int stride_w,
128
+ const int pad_h,
129
+ const int pad_w,
130
+ const int dilation_h,
131
+ const int dilation_w,
132
+ const int group,
133
+ const int deformable_group,
134
+ const bool with_bias)
135
+ {
136
+ if (input.device().is_cuda()) {
137
+ #ifdef WITH_CUDA
138
+ return modulated_deform_conv_cuda_forward(
139
+ input, weight, bias, ones, offset, mask, output, columns,
140
+ kernel_h, kernel_w, stride_h, stride_w,
141
+ pad_h, pad_w, dilation_h, dilation_w,
142
+ group, deformable_group, with_bias
143
+ );
144
+ #else
145
+ AT_ERROR("Not compiled with GPU support");
146
+ #endif
147
+ }
148
+ AT_ERROR("Not implemented on the CPU");
149
+ }
150
+
151
+
152
+ void modulated_deform_conv_backward(
153
+ at::Tensor input,
154
+ at::Tensor weight,
155
+ at::Tensor bias,
156
+ at::Tensor ones,
157
+ at::Tensor offset,
158
+ at::Tensor mask,
159
+ at::Tensor columns,
160
+ at::Tensor grad_input,
161
+ at::Tensor grad_weight,
162
+ at::Tensor grad_bias,
163
+ at::Tensor grad_offset,
164
+ at::Tensor grad_mask,
165
+ at::Tensor grad_output,
166
+ int kernel_h,
167
+ int kernel_w,
168
+ int stride_h,
169
+ int stride_w,
170
+ int pad_h,
171
+ int pad_w,
172
+ int dilation_h,
173
+ int dilation_w,
174
+ int group,
175
+ int deformable_group,
176
+ const bool with_bias)
177
+ {
178
+ if (input.device().is_cuda()) {
179
+ #ifdef WITH_CUDA
180
+ return modulated_deform_conv_cuda_backward(
181
+ input, weight, bias, ones, offset, mask, columns,
182
+ grad_input, grad_weight, grad_bias, grad_offset, grad_mask, grad_output,
183
+ kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w,
184
+ group, deformable_group, with_bias
185
+ );
186
+ #else
187
+ AT_ERROR("Not compiled with GPU support");
188
+ #endif
189
+ }
190
+ AT_ERROR("Not implemented on the CPU");
191
+ }
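
The entry points above only dispatch to the CUDA kernels; the offset and mask tensors they expect follow the usual deformable-convolution layout. A small shape sketch (illustrative values; stride 1, padding 1, dilation 1 assumed so the output spatial size matches the input):

    import torch

    N, C_in, H, W = 2, 64, 32, 32
    kernel_h = kernel_w = 3
    deformable_group = 1

    # One (dy, dx) pair per deformable group, kernel position and output location.
    offset = torch.zeros(N, 2 * deformable_group * kernel_h * kernel_w, H, W)
    # One modulation scalar per deformable group, kernel position and output location
    # (only consumed by the modulated variant).
    mask = torch.zeros(N, deformable_group * kernel_h * kernel_w, H, W)
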
maskrcnn_benchmark/csrc/deform_pool.h ADDED
@@ -0,0 +1,70 @@
1
+ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+ #pragma once
3
+ #include "cpu/vision.h"
4
+
5
+ #ifdef WITH_CUDA
6
+ #include "cuda/vision.h"
7
+ #endif
8
+
9
+
10
+ // Interface for Python
11
+ void deform_psroi_pooling_forward(
12
+ at::Tensor input,
13
+ at::Tensor bbox,
14
+ at::Tensor trans,
15
+ at::Tensor out,
16
+ at::Tensor top_count,
17
+ const int no_trans,
18
+ const float spatial_scale,
19
+ const int output_dim,
20
+ const int group_size,
21
+ const int pooled_size,
22
+ const int part_size,
23
+ const int sample_per_part,
24
+ const float trans_std)
25
+ {
26
+ if (input.device().is_cuda()) {
27
+ #ifdef WITH_CUDA
28
+ return deform_psroi_pooling_cuda_forward(
29
+ input, bbox, trans, out, top_count,
30
+ no_trans, spatial_scale, output_dim, group_size,
31
+ pooled_size, part_size, sample_per_part, trans_std
32
+ );
33
+ #else
34
+ AT_ERROR("Not compiled with GPU support");
35
+ #endif
36
+ }
37
+ AT_ERROR("Not implemented on the CPU");
38
+ }
39
+
40
+
41
+ void deform_psroi_pooling_backward(
42
+ at::Tensor out_grad,
43
+ at::Tensor input,
44
+ at::Tensor bbox,
45
+ at::Tensor trans,
46
+ at::Tensor top_count,
47
+ at::Tensor input_grad,
48
+ at::Tensor trans_grad,
49
+ const int no_trans,
50
+ const float spatial_scale,
51
+ const int output_dim,
52
+ const int group_size,
53
+ const int pooled_size,
54
+ const int part_size,
55
+ const int sample_per_part,
56
+ const float trans_std)
57
+ {
58
+ if (input.device().is_cuda()) {
59
+ #ifdef WITH_CUDA
60
+ return deform_psroi_pooling_cuda_backward(
61
+ out_grad, input, bbox, trans, top_count, input_grad, trans_grad,
62
+ no_trans, spatial_scale, output_dim, group_size, pooled_size,
63
+ part_size, sample_per_part, trans_std
64
+ );
65
+ #else
66
+ AT_ERROR("Not compiled with GPU support");
67
+ #endif
68
+ }
69
+ AT_ERROR("Not implemented on the CPU");
70
+ }
maskrcnn_benchmark/csrc/ml_nms.h ADDED
@@ -0,0 +1,27 @@
1
+ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+ #pragma once
3
+ #include "cpu/vision.h"
4
+
5
+ #ifdef WITH_CUDA
6
+ #include "cuda/vision.h"
7
+ #endif
8
+
9
+
10
+ at::Tensor ml_nms(const at::Tensor& dets,
11
+ const at::Tensor& scores,
12
+ const at::Tensor& labels,
13
+ const float threshold) {
14
+
15
+ if (dets.device().is_cuda()) {
16
+ #ifdef WITH_CUDA
17
+ // TODO raise error if not compiled with CUDA
18
+ if (dets.numel() == 0)
19
+ return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU));
20
+ auto b = at::cat({dets, scores.unsqueeze(1), labels.unsqueeze(1)}, 1);
21
+ return ml_nms_cuda(b, threshold);
22
+ #else
23
+ AT_ERROR("Not compiled with GPU support");
24
+ #endif
25
+ }
26
+ AT_ERROR("CPU version not implemented");
27
+ }
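
ml_nms appends the label as an extra column so the kernel can skip suppression across different labels; semantically it behaves like running NMS independently per label. A rough sketch, reusing any single-class routine such as the reference sketched earlier (illustrative names):

    import torch

    def ml_nms_reference(dets, scores, labels, iou_thresh, nms_fn):
        # Boxes with different labels never suppress each other.
        keep = []
        for cls in labels.unique():
            idx = (labels == cls).nonzero(as_tuple=True)[0]
            kept = nms_fn(dets[idx], scores[idx], iou_thresh)
            keep.append(idx[kept])
        return torch.cat(keep) if keep else torch.empty(0, dtype=torch.long)
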
maskrcnn_benchmark/csrc/nms.h ADDED
@@ -0,0 +1,45 @@
1
+ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+ #pragma once
3
+ #include "cpu/vision.h"
4
+
5
+ #ifdef WITH_CUDA
6
+ #include "cuda/vision.h"
7
+ #endif
8
+
9
+
10
+ at::Tensor nms(const at::Tensor& dets,
11
+ const at::Tensor& scores,
12
+ const float threshold) {
13
+
14
+ if (dets.device().is_cuda()) {
15
+ #ifdef WITH_CUDA
16
+ // TODO raise error if not compiled with CUDA
17
+ if (dets.numel() == 0)
18
+ return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU));
19
+ auto b = at::cat({dets, scores.unsqueeze(1)}, 1);
20
+ return nms_cuda(b, threshold);
21
+ #else
22
+ AT_ERROR("Not compiled with GPU support");
23
+ #endif
24
+ }
25
+
26
+ at::Tensor result = nms_cpu(dets, scores, threshold);
27
+ return result;
28
+ }
29
+
30
+
31
+ std::pair<at::Tensor, at::Tensor> soft_nms(const at::Tensor& dets,
32
+ const at::Tensor& scores,
33
+ const float threshold,
34
+ const float sigma) {
35
+
36
+ if (dets.device().is_cuda()) {
37
+ #ifdef WITH_CUDA
38
+ AT_ERROR("Soft NMS Does Not have GPU support");
39
+ #endif
40
+ }
41
+
42
+ std::pair<at::Tensor, at::Tensor> result = soft_nms_cpu(dets, scores, threshold, sigma);
43
+
44
+ return result;
45
+ }
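
soft_nms above is CPU-only and returns both the surviving detections and their decayed scores. As a reminder of the idea, a hedged sketch of Gaussian soft-NMS (torchvision's IoU convention here, without the +1 offset; the exact re-scoring and pruning rules live in cpu/soft_nms.cpp and may differ in detail):

    import torch
    from torchvision.ops import box_iou

    def soft_nms_reference(boxes, scores, sigma=0.5, score_thresh=0.001):
        # Decay the scores of overlapping boxes by exp(-iou^2 / sigma) instead of
        # removing them outright; drop boxes whose score falls below score_thresh.
        scores = scores.clone()
        idxs = torch.arange(boxes.size(0))
        keep, kept_scores = [], []
        while idxs.numel() > 0:
            top = int(scores[idxs].argmax())
            cur = idxs[top]
            keep.append(int(cur))
            kept_scores.append(float(scores[cur]))
            idxs = torch.cat([idxs[:top], idxs[top + 1:]])
            if idxs.numel() == 0:
                break
            iou = box_iou(boxes[cur].unsqueeze(0), boxes[idxs]).squeeze(0)
            scores[idxs] *= torch.exp(-(iou ** 2) / sigma)
            idxs = idxs[scores[idxs] > score_thresh]
        return torch.tensor(keep), torch.tensor(kept_scores)
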
maskrcnn_benchmark/csrc/vision.cpp ADDED
@@ -0,0 +1,27 @@
1
+ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+ #include "nms.h"
3
+ #include "ml_nms.h"
4
+ #include "ROIAlign.h"
5
+ #include "ROIPool.h"
6
+ #include "SigmoidFocalLoss.h"
7
+ #include "deform_conv.h"
8
+ #include "deform_pool.h"
9
+
10
+ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
11
+ m.def("nms", &nms, "non-maximum suppression");
12
+ m.def("ml_nms", &ml_nms, "multi-label non-maximum suppression");
13
+ m.def("soft_nms", &soft_nms, "soft non-maximum suppression");
14
+ m.def("roi_align_forward", &ROIAlign_forward, "ROIAlign_forward");
15
+ m.def("roi_align_backward", &ROIAlign_backward, "ROIAlign_backward");
16
+ m.def("roi_pool_forward", &ROIPool_forward, "ROIPool_forward");
17
+ m.def("roi_pool_backward", &ROIPool_backward, "ROIPool_backward");
18
+ m.def("sigmoid_focalloss_forward", &SigmoidFocalLoss_forward, "SigmoidFocalLoss_forward");
19
+ m.def("sigmoid_focalloss_backward", &SigmoidFocalLoss_backward, "SigmoidFocalLoss_backward");
20
+ m.def("deform_conv_forward", &deform_conv_forward, "deform_conv_forward");
21
+ m.def("deform_conv_backward_input", &deform_conv_backward_input, "deform_conv_backward_input");
22
+ m.def("deform_conv_backward_parameters", &deform_conv_backward_parameters, "deform_conv_backward_parameters");
23
+ m.def("modulated_deform_conv_forward", &modulated_deform_conv_forward, "modulated_deform_conv_forward");
24
+ m.def("modulated_deform_conv_backward", &modulated_deform_conv_backward, "modulated_deform_conv_backward");
25
+ m.def("deform_psroi_pooling_forward", &deform_psroi_pooling_forward, "deform_psroi_pooling_forward");
26
+ m.def("deform_psroi_pooling_backward", &deform_psroi_pooling_backward, "deform_psroi_pooling_backward");
27
+ }
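
These bindings are what the Python side calls once the extension is compiled. Assuming the module is importable as maskrcnn_benchmark._C (the conventional name for this extension, but an assumption here), a minimal call sketch:

    import torch
    from maskrcnn_benchmark import _C  # compiled extension exposing the ops bound above

    boxes = torch.tensor([[10., 10., 50., 50.],
                          [12., 12., 52., 52.],
                          [100., 100., 140., 140.]])
    scores = torch.tensor([0.9, 0.8, 0.7])

    keep = _C.nms(boxes, scores, 0.5)  # CPU tensors fall back to nms_cpu

    if torch.cuda.is_available():
        labels = torch.tensor([1., 2., 1.])
        # ml_nms has no CPU path, so it needs CUDA tensors.
        keep_ml = _C.ml_nms(boxes.cuda(), scores.cuda(), labels.cuda(), 0.5)
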
maskrcnn_benchmark/data/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+ from .build import make_data_loader
maskrcnn_benchmark/data/build.py ADDED
@@ -0,0 +1,489 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+ import bisect
3
+ import copy
4
+ import logging
5
+ import os
6
+
7
+ import torch.utils.data
8
+ import torch.distributed as dist
9
+ from maskrcnn_benchmark.utils.comm import get_world_size
10
+ from maskrcnn_benchmark.utils.imports import import_file
11
+
12
+ from . import datasets as D
13
+ from . import samplers
14
+
15
+ from .collate_batch import BatchCollator, BBoxAugCollator
16
+ from .transforms import build_transforms
17
+
18
+ from transformers import AutoTokenizer
19
+ from .datasets.duplicate_dataset import create_duplicate_dataset
20
+
21
+ def build_dataset(cfg, dataset_list, transforms, dataset_catalog, is_train=True, class_concat=False, extra_args={}):
22
+ """
23
+ Arguments:
24
+ dataset_list (list[str]): Contains the names of the datasets, e.g.,
25
+ coco_2014_train, coco_2014_val, etc.
26
+ transforms (callable): transforms to apply to each (image, target) sample
27
+ dataset_catalog (DatasetCatalog): contains the information on how to
28
+ construct a dataset.
29
+ is_train (bool): whether to setup the dataset for training or testing
30
+ """
31
+ if not isinstance(dataset_list, (list, tuple)):
32
+ raise RuntimeError(
33
+ "dataset_list should be a list of strings, got {}".format(dataset_list)
34
+ )
35
+ datasets = []
36
+ num_category = 1
37
+ for dataset_id, dataset_name in enumerate(dataset_list, 1):
38
+ if is_train:
39
+ dataset_name = dataset_name + cfg.DATASETS.TRAIN_DATASETNAME_SUFFIX
40
+ else:
41
+ dataset_name = dataset_name + cfg.DATASETS.TEST_DATASETNAME_SUFFIX
42
+ data = dataset_catalog.get(dataset_name)
43
+ factory = getattr(D, data["factory"])
44
+ args = data["args"]
45
+ # for COCODataset, we want to remove images without annotations
46
+ # during training
47
+ if data["factory"] == "COCODataset":
48
+ args["remove_images_without_annotations"] = is_train
49
+
50
+ if data["factory"] == "PascalVOCDataset":
51
+ args["use_difficult"] = not is_train
52
+ if data["factory"] in ["VGTSVDataset", "CocoDetectionTSV", "ODTSVDataset"]:
53
+ args["extra_fields"] = ["class"]
54
+ if cfg.MODEL.MASK_ON:
55
+ args["extra_fields"].append("mask")
56
+
57
+ if data["factory"] in ["CocoGrounding", "CocoDetectionTSV", "CaptionTSV", "MixedDataset", "FlickrDataset", "RefExpDataset", "GQADataset", "PseudoData", "PhrasecutDetection"]:
58
+ # args["return_masks"] = False
59
+ args["return_masks"] = cfg.MODEL.MASK_ON
60
+ args["return_tokens"] = True
61
+ args["max_num_labels"] = cfg.TEST.MDETR_STYLE_AGGREGATE_CLASS_NUM
62
+ args["max_query_len"] = cfg.MODEL.LANGUAGE_BACKBONE.MAX_QUERY_LEN
63
+
64
+ args["transforms"] = transforms
65
+ args.update(extra_args)
66
+
67
+ if dataset_name == "flickr30k_train":
68
+ copy = cfg.DATASETS.FLICKR_COPY
69
+ elif dataset_name in ["mixed_train", "mixed_train_no_coco"]:
70
+ copy = cfg.DATASETS.MIXED_COPY
71
+ elif dataset_name == "COCO_odinw_train_8copy_dt_train":
72
+ copy = cfg.DATASETS.COCO_COPY
73
+ elif dataset_name == "LVIS_odinw_train_8copy_dt_train":
74
+ copy = cfg.DATASETS.LVIS_COPY
75
+ elif dataset_name == "object365_odinw_2copy_dt_train":
76
+ copy = cfg.DATASETS.OBJECT365_COPY
77
+ elif dataset_name == "vg_odinw_clipped_8copy_dt_train":
78
+ copy = cfg.DATASETS.VG_COPY
79
+ elif dataset_name == "vg_vgoi6_clipped_8copy_dt_train":
80
+ copy = cfg.DATASETS.VG_COPY
81
+ elif dataset_name == "imagenetod_train_odinw_2copy_dt":
82
+ copy = cfg.DATASETS.IN_COPY
83
+ elif dataset_name == "oi_train_odinw_dt":
84
+ copy = cfg.DATASETS.OI_COPY
85
+ elif is_train:
86
+ copy = cfg.DATASETS.GENERAL_COPY
87
+ elif not is_train:
88
+ copy = cfg.DATASETS.GENERAL_COPY_TEST
89
+ else:
90
+ copy = -1 # do not ever copy test
91
+
92
+ if copy != -1:
93
+ new_factory = create_duplicate_dataset(factory)
94
+ dataset = new_factory(copy=copy, **args)
95
+ else:
96
+ # make dataset from factory
97
+ dataset = factory(**args)
98
+
99
+ print(dataset_name, 'has {} data points'.format(len(dataset)), data["factory"])
100
+
101
+ if class_concat:
102
+ category = list(dataset.contiguous_category_id_to_json_id.values())
103
+ dataset.contiguous_category_id_to_json_id = {}
104
+ dataset.json_category_id_to_contiguous_id = {}
105
+ for id, cat in enumerate(category, start=num_category):
106
+ dataset.json_category_id_to_contiguous_id[cat] = id
107
+ dataset.contiguous_category_id_to_json_id[id] = cat
108
+ num_category += len(category)
109
+ print("Found {} #category after group {}, concating ...".format(num_category, dataset_id))
110
+ datasets.append(dataset)
111
+
112
+ # for testing, return a list of datasets
113
+ if not is_train:
114
+ return datasets
115
+
116
+ # for training, concatenate all datasets into a single one
117
+ dataset = datasets[0]
118
+ if len(datasets) > 1:
119
+ dataset = D.ConcatDataset(datasets)
120
+
121
+ return [dataset]
122
+
123
+
124
+ def build_dataset_by_group(dataset_list, transforms, dataset_catalog, is_train=True, class_by_group=True,
125
+ class_concat=False, extra_args={}):
126
+ """
127
+ Arguments:
128
+ dataset_list (list[str]): Contains the names of the datasets, e.g.,
129
+ coco_2014_train, coco_2014_val, etc.
130
+ transforms (callable): transforms to apply to each (image, target) sample
131
+ dataset_catalog (DatasetCatalog): contains the information on how to
132
+ construct a dataset.
133
+ is_train (bool): whether to setup the dataset for training or testing
134
+ """
135
+ if not isinstance(dataset_list, (list, tuple)):
136
+ raise RuntimeError(
137
+ "dataset_list should be a list of strings, got {}".format(dataset_list)
138
+ )
139
+
140
+ num_category = 1
141
+ grouped_datasets = []
142
+ for group_id, group in enumerate(dataset_list, 1):
143
+ datasets = []
144
+ for dataset_name in group:
145
+ data = dataset_catalog.get(dataset_name)
146
+ factory = getattr(D, data["factory"])
147
+ args = data["args"]
148
+ # for COCODataset, we want to remove images without annotations
149
+ # during training
150
+ if data["factory"] == "COCODataset":
151
+ args["remove_images_without_annotations"] = is_train
152
+ if data["factory"] == "PascalVOCDataset":
153
+ args["use_difficult"] = not is_train
154
+ args["transforms"] = transforms
155
+ args.update(extra_args)
156
+ # make dataset from factory
157
+ dataset = factory(**args)
158
+
159
+ # check if dataset is grouped by task, assume one class per task
160
+ if class_by_group and data["factory"] != "Background":
161
+ category = dataset.contiguous_category_id_to_json_id[1]
162
+ del dataset.contiguous_category_id_to_json_id[1]
163
+ dataset.json_category_id_to_contiguous_id[category] = group_id
164
+ dataset.contiguous_category_id_to_json_id[group_id] = category
165
+
166
+ datasets.append(dataset)
167
+
168
+ if class_concat:
169
+ for dataset in datasets:
170
+ category = list(dataset.contiguous_category_id_to_json_id.values())
171
+ dataset.contiguous_category_id_to_json_id = {}
172
+ dataset.json_category_id_to_contiguous_id = {}
173
+ for id, cat in enumerate(category, start=num_category):
174
+ dataset.json_category_id_to_contiguous_id[cat] = id
175
+ dataset.contiguous_category_id_to_json_id[id] = cat
176
+ num_category += len(category)
177
+ print("Found {} #category after group {}, concating ...".format(num_category, group_id))
178
+
179
+ if is_train:
180
+ datasets = D.ConcatDataset(datasets)
181
+
182
+ grouped_datasets.append(datasets)
183
+
184
+ # for testing, return a list of datasets
185
+ if not is_train:
186
+ datasets = [dataset for group in grouped_datasets for dataset in group]
187
+ return datasets
188
+ if class_concat:
189
+ grouped_datasets = D.ConcatDataset(grouped_datasets)
190
+ return [grouped_datasets]
191
+
192
+ # for training, concatenate all datasets into a single one
193
+ return grouped_datasets
194
+
195
+
196
+ def make_data_sampler(dataset, shuffle, distributed, num_replicas=None, rank=None, use_random_seed=True):
197
+ if distributed:
198
+ return samplers.DistributedSampler(dataset, shuffle=shuffle, num_replicas=num_replicas, rank=rank,
199
+ use_random=use_random_seed)
200
+ if shuffle:
201
+ sampler = torch.utils.data.sampler.RandomSampler(dataset)
202
+ else:
203
+ sampler = torch.utils.data.sampler.SequentialSampler(dataset)
204
+ return sampler
205
+
206
+
207
+ def _quantize(x, bins):
208
+ bins = copy.copy(bins)
209
+ bins = sorted(bins)
210
+ quantized = list(map(lambda y: bisect.bisect_right(bins, y), x))
211
+ return quantized
212
+
213
+
214
+ def _compute_aspect_ratios(dataset):
215
+ aspect_ratios = []
216
+ for i in range(len(dataset)):
217
+ img_info = dataset.get_img_info(i)
218
+ aspect_ratio = float(img_info["height"]) / float(img_info["width"])
219
+ aspect_ratios.append(aspect_ratio)
220
+ return aspect_ratios
221
+
222
+
223
+ def make_batch_data_sampler(
224
+ dataset, sampler, aspect_grouping, images_per_batch, num_iters=None, start_iter=0, drop_last=False
225
+ ):
226
+ if aspect_grouping:
227
+ if not isinstance(aspect_grouping, (list, tuple)):
228
+ aspect_grouping = [aspect_grouping]
229
+ aspect_ratios = _compute_aspect_ratios(dataset)
230
+ group_ids = _quantize(aspect_ratios, aspect_grouping)
231
+ batch_sampler = samplers.GroupedBatchSampler(
232
+ sampler, group_ids, images_per_batch, drop_uneven=drop_last
233
+ )
234
+ else:
235
+ batch_sampler = torch.utils.data.sampler.BatchSampler(
236
+ sampler, images_per_batch, drop_last=drop_last
237
+ )
238
+ if num_iters is not None:
239
+ batch_sampler = samplers.IterationBasedBatchSampler(
240
+ batch_sampler, num_iters, start_iter
241
+ )
242
+ return batch_sampler
243
+
244
+ def make_data_loader(cfg, is_train=True, is_distributed=False, num_replicas=None, rank=None, start_iter=0):
245
+ num_gpus = num_replicas or get_world_size()
246
+
247
+ if is_train:
248
+ images_per_batch = cfg.SOLVER.IMS_PER_BATCH
249
+ assert (
250
+ images_per_batch % num_gpus == 0
251
+ ), "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number "
252
+ "of GPUs ({}) used.".format(images_per_batch, num_gpus)
253
+ images_per_gpu = images_per_batch // num_gpus
254
+ shuffle = True
255
+ num_iters = cfg.SOLVER.MAX_ITER
256
+ else:
257
+ images_per_batch = cfg.TEST.IMS_PER_BATCH
258
+ assert (
259
+ images_per_batch % num_gpus == 0
260
+ ), "TEST.IMS_PER_BATCH ({}) must be divisible by the number "
261
+ "of GPUs ({}) used.".format(images_per_batch, num_gpus)
262
+ images_per_gpu = images_per_batch // num_gpus
263
+ shuffle = is_distributed
264
+ num_iters = None
265
+ start_iter = 0
266
+
267
+ if images_per_gpu > 1:
268
+ logger = logging.getLogger(__name__)
269
+ logger.warning(
270
+ "When using more than one image per GPU you may encounter "
271
+ "an out-of-memory (OOM) error if your GPU does not have "
272
+ "sufficient memory. If this happens, you can reduce "
273
+ "SOLVER.IMS_PER_BATCH (for training) or "
274
+ "TEST.IMS_PER_BATCH (for inference). For training, you must "
275
+ "also adjust the learning rate and schedule length according "
276
+ "to the linear scaling rule. See for example: "
277
+ "https://github.com/facebookresearch/Detectron/blob/master/configs/getting_started/tutorial_1gpu_e2e_faster_rcnn_R-50-FPN.yaml#L14"
278
+ )
279
+
280
+ # group images which have similar aspect ratio. In this case, we only
281
+ # group in two cases: those with width / height > 1, and the other way around,
282
+ # but the code supports more general grouping strategy
283
+ aspect_grouping = [1] if cfg.DATALOADER.ASPECT_RATIO_GROUPING else []
284
+
285
+ paths_catalog = import_file(
286
+ "maskrcnn_benchmark.config.paths_catalog", cfg.PATHS_CATALOG, True
287
+ )
288
+
289
+ DatasetCatalog = paths_catalog.DatasetCatalog
290
+ if len(cfg.DATASETS.REGISTER) > 0:
291
+ for new_dataset in cfg.DATASETS.REGISTER:
292
+ # img_dir = cfg.DATASETS.REGISTER[new_dataset]["img_dir"]
293
+ # if "ann_file" in cfg.DATASETS.REGISTER[new_dataset]:
294
+ # ann_file = cfg.DATASETS.REGISTER[new_dataset]["ann_file"]
295
+ # else:
296
+ # ann_file = None
297
+ attrs = dict(cfg.DATASETS.REGISTER[new_dataset])
298
+ if is_train:
299
+ new_dataset = new_dataset + cfg.DATASETS.TRAIN_DATASETNAME_SUFFIX
300
+ else:
301
+ new_dataset = new_dataset + cfg.DATASETS.TEST_DATASETNAME_SUFFIX
302
+ DatasetCatalog.set(new_dataset, attrs)
303
+
304
+
305
+ dataset_list = cfg.DATASETS.TRAIN if is_train else cfg.DATASETS.TEST
306
+
307
+ # Haotian: expand bing dataset
308
+ if "bing_caption_train" in dataset_list and len(cfg.DATASETS.BING_INDEX_LIST) > 0:
309
+ dataset_list = list(dataset_list)
310
+ dataset_list.remove("bing_caption_train")
311
+ for bing_index in cfg.DATASETS.BING_INDEX_LIST:
312
+ dataset_list.insert(len(dataset_list), "bing_caption_{}_train".format(bing_index))
313
+ dataset_list = tuple(dataset_list)
314
+
315
+ if "bing_caption_train_no_coco" in dataset_list and len(cfg.DATASETS.BING_INDEX_LIST) > 0:
316
+ dataset_list = list(dataset_list)
317
+ dataset_list.remove("bing_caption_train_no_coco")
318
+ for bing_index in cfg.DATASETS.BING_INDEX_LIST:
319
+ dataset_list.insert(len(dataset_list), "bing_caption_{}_train_no_coco".format(bing_index))
320
+ dataset_list = tuple(dataset_list)
321
+
322
+ print("The combined datasets are: {}.".format(dataset_list))
323
+
324
+ transforms = None if not is_train and cfg.TEST.USE_MULTISCALE else build_transforms(cfg, is_train)
325
+
326
+ extra_args = {}
327
+ if is_train and cfg.DATASETS.USE_CROWD:
328
+ extra_args['ignore_crowd'] = False
329
+ if is_train and cfg.DATASETS.MAX_BOX > 0:
330
+ extra_args['max_box'] = cfg.DATASETS.MAX_BOX
331
+ if is_train and cfg.DATASETS.FEW_SHOT>0:
332
+ extra_args['few_shot'] = cfg.DATASETS.FEW_SHOT
333
+ if is_train and cfg.DATASETS.SHUFFLE_SEED != 0:
334
+ extra_args['shuffle_seed'] = cfg.DATASETS.SHUFFLE_SEED
335
+
336
+ # od to grounding
337
+ if is_train and cfg.DATASETS.RANDOM_SAMPLE_NEG > 0:
338
+ extra_args['random_sample_negative'] = cfg.DATASETS.RANDOM_SAMPLE_NEG
339
+ if is_train and cfg.DATASETS.ADD_DET_PROMPT:
340
+ extra_args["add_detection_prompt"] = True
341
+ if is_train and cfg.DATASETS.USE_OD_AUG:
342
+ extra_args["use_od_data_aug"] = True
343
+ if is_train and cfg.DATASETS.DISABLE_SHUFFLE:
344
+ extra_args["disable_shuffle"] = True
345
+ if cfg.DATASETS.ONE_HOT:
346
+ extra_args["one_hot"] = True
347
+ if is_train and len(cfg.DATASETS.PROMPT_VERSION) > 0:
348
+ extra_args["prompt_engineer_version"] = cfg.DATASETS.PROMPT_VERSION
349
+ if is_train and len(cfg.DATASETS.CONTROL_PROB) == 4:
350
+ extra_args["control_probabilities"] = cfg.DATASETS.CONTROL_PROB
351
+ if is_train and cfg.DATASETS.DISABLE_CLIP_TO_IMAGE:
352
+ extra_args["disable_clip_to_image"] = cfg.DATASETS.DISABLE_CLIP_TO_IMAGE
353
+ if is_train and cfg.DATASETS.NO_MINUS_ONE_FOR_ONE_HOT:
354
+ extra_args["no_minus_one_for_one_hot"] = cfg.DATASETS.NO_MINUS_ONE_FOR_ONE_HOT
355
+ if is_train:
356
+ extra_args["separation_tokens"] = cfg.DATASETS.SEPARATION_TOKENS
357
+ # caption
358
+ if is_train and cfg.DATASETS.CAPTION_MIN_BOX > 0:
359
+ extra_args["caption_min_box"] = cfg.DATASETS.CAPTION_MIN_BOX
360
+ if is_train and cfg.DATASETS.REPLACE_CLEAN_LABEL:
361
+ extra_args["replace_clean_label"] = True
362
+ if is_train and cfg.DATASETS.FURTHER_SCREEN:
363
+ extra_args["further_screen"] = True
364
+ if is_train and cfg.DATASETS.CAPTION_CONF > 0.0:
365
+ extra_args["caption_conf"] = cfg.DATASETS.CAPTION_CONF
366
+ if is_train:
367
+ extra_args["caption_nms"] = cfg.DATASETS.CAPTION_NMS
368
+ if is_train and cfg.DATASETS.PACK_RANDOM_CAPTION_NUMBER > 0:
369
+ extra_args["pack_random_caption_number"] = cfg.DATASETS.PACK_RANDOM_CAPTION_NUMBER
370
+ if is_train and cfg.DATASETS.INFERENCE_CAPTION:
371
+ extra_args["inference_caption"] = True
372
+ if is_train and cfg.DATASETS.SAMPLE_NEGATIVE_FOR_GROUNDING_DATA > 0:
373
+ extra_args["sample_negative_for_grounding_data"] = cfg.DATASETS.SAMPLE_NEGATIVE_FOR_GROUNDING_DATA
374
+ if is_train and cfg.DATASETS.RANDOM_PACK_PROB > 0:
375
+ extra_args["random_pack_prob"] = cfg.DATASETS.RANDOM_PACK_PROB
376
+ if is_train and cfg.DATASETS.NO_RANDOM_PACK_PROBABILITY > 0:
377
+ extra_args["no_random_pack_probability"] = cfg.DATASETS.NO_RANDOM_PACK_PROBABILITY
378
+ if is_train:
379
+ extra_args["safeguard_positive_caption"] = cfg.DATASETS.SAFEGUARD_POSITIVE_CAPTION
380
+ if is_train:
381
+ extra_args["local_debug"] = cfg.DATASETS.LOCAL_DEBUG
382
+ if is_train:
383
+ extra_args["no_mask_for_od"] = cfg.MODEL.DYHEAD.FUSE_CONFIG.NO_MASK_FOR_OD
384
+ if is_train:
385
+ extra_args["no_mask_for_gold"] = cfg.MODEL.DYHEAD.FUSE_CONFIG.NO_MASK_FOR_GOLD
386
+ if is_train:
387
+ extra_args["mlm_obj_for_only_positive"] = cfg.MODEL.DYHEAD.FUSE_CONFIG.MLM_OBJ_FOR_ONLY_POSITIVE
388
+ if cfg.DATASETS.OVERRIDE_CATEGORY and cfg.DATASETS.USE_OVERRIDE_CATEGORY:
389
+ extra_args["override_category"] = cfg.DATASETS.OVERRIDE_CATEGORY
390
+ if is_train:
391
+ extra_args["caption_format_version"] = cfg.DATASETS.CAPTION_FORMAT_VERSION
392
+ if is_train:
393
+ extra_args["special_safeguard_for_coco_grounding"] = cfg.DATASETS.SPECIAL_SAFEGUARD_FOR_COCO_GROUNDING
394
+ if is_train:
395
+ extra_args["diver_box_for_vqa"] = cfg.DATASETS.DIVER_BOX_FOR_VQA
396
+ extra_args["caption_prompt"] = cfg.DATASETS.CAPTION_PROMPT
397
+ extra_args["use_caption_prompt"] = cfg.DATASETS.USE_CAPTION_PROMPT
398
+
399
+ # extra_args['tokenizer'] = AutoTokenizer.from_pretrained(cfg.MODEL.LANGUAGE_BACKBONE.TOKENIZER_TYPE)
400
+ if cfg.MODEL.LANGUAGE_BACKBONE.TOKENIZER_TYPE == "clip":
401
+ # extra_args['tokenizer'] = build_tokenizer("clip")
402
+ from transformers import CLIPTokenizerFast
403
+ if cfg.MODEL.DYHEAD.FUSE_CONFIG.MLM_LOSS:
404
+ extra_args["tokenizer"] = CLIPTokenizerFast.from_pretrained("openai/clip-vit-base-patch32", from_slow=True, mask_token='ðŁĴij</w>')
405
+ else:
406
+ extra_args["tokenizer"] = CLIPTokenizerFast.from_pretrained("openai/clip-vit-base-patch32", from_slow=True)
407
+ else:
408
+ extra_args['tokenizer'] = AutoTokenizer.from_pretrained(cfg.MODEL.LANGUAGE_BACKBONE.TOKENIZER_TYPE)
409
+
410
+ if isinstance(dataset_list[0], (tuple, list)):
411
+ datasets = build_dataset_by_group(dataset_list, transforms, DatasetCatalog, is_train,
412
+ class_by_group=cfg.DATASETS.ALTERNATIVE_TRAINING,
413
+ class_concat=cfg.DATASETS.CLASS_CONCAT,
414
+ extra_args=extra_args)
415
+ else:
416
+ datasets = build_dataset(cfg, dataset_list, transforms, DatasetCatalog, is_train,
417
+ class_concat=cfg.DATASETS.CLASS_CONCAT,
418
+ extra_args=extra_args)
419
+
420
+ data_loaders = []
421
+ for di, dataset in enumerate(datasets):
422
+ if is_train and cfg.SOLVER.MAX_EPOCH > 0:
423
+ num_iters = cfg.SOLVER.MAX_EPOCH * len(dataset) // cfg.SOLVER.IMS_PER_BATCH
424
+ print("Number of iterations are {}".format(num_iters))
425
+ cfg.defrost()
426
+ cfg.SOLVER.MAX_ITER = num_iters
427
+ cfg.SOLVER.DATASET_LENGTH = len(dataset)
428
+ cfg.freeze()
429
+ if is_train and cfg.SOLVER.MULTI_MAX_EPOCH:
430
+ num_iters = None
431
+ cfg.defrost()
432
+ cfg.SOLVER.MULTI_MAX_ITER += (cfg.SOLVER.MULTI_MAX_EPOCH[di] * len(dataset) // cfg.SOLVER.IMS_PER_BATCH,)
433
+ cfg.freeze()
434
+
435
+ if is_train and cfg.DATALOADER.DISTRIBUTE_CHUNK_AMONG_NODE:
436
+ from .datasets.custom_distributed_sampler import DistributedSamplerChunkByNode
437
+ chunk_or_not = []
438
+ for i in dataset_list:
439
+ if "bing_caption" in i:
440
+ chunk_or_not.append(True)
441
+ else:
442
+ chunk_or_not.append(False)
443
+ assert(len(chunk_or_not) == len(dataset.datasets))
444
+ '''
445
+ Example: training on 4 nodes, each with 8 GPUs (32 processes in total).
446
+ '''
447
+ num_nodes = int(os.getenv('NODE_COUNT', os.getenv('OMPI_COMM_WORLD_SIZE', 1)))
448
+ local_size = cfg.num_gpus//num_nodes
449
+ node_rank = int(os.getenv('NODE_RANK', os.getenv('OMPI_COMM_WORLD_RANK', 0)))
450
+ local_rank = cfg.local_rank
451
+ sampler = DistributedSamplerChunkByNode(
452
+ dataset = dataset,
453
+ all_datasets = dataset.datasets, # Assuming dataset is a ConcatDataset instance,
454
+ chunk_or_not = chunk_or_not,
455
+ num_replicas = cfg.num_gpus, # total GPU number, e.g., 32
456
+ rank = dist.get_rank(), # Global Rank, e.g., 0~31
457
+ node_rank = node_rank, # Node Rank, e.g., 0~3
458
+ node_number = num_nodes, # how many node e.g., 4
459
+ process_num_per_node = local_size, # e.g., 8
460
+ rank_within_local_node = local_rank, # e.g., 0~7
461
+ )
462
+ else:
463
+ sampler = make_data_sampler(dataset, shuffle, is_distributed, num_replicas=num_replicas, rank=rank,
464
+ use_random_seed=cfg.DATALOADER.USE_RANDOM_SEED)
465
+ batch_sampler = make_batch_data_sampler(
466
+ dataset, sampler, aspect_grouping, images_per_gpu, num_iters, start_iter, drop_last=is_train
467
+ )
468
+ collator = BBoxAugCollator() if not is_train and cfg.TEST.USE_MULTISCALE else BatchCollator(
469
+ cfg.DATALOADER.SIZE_DIVISIBILITY)
470
+ num_workers = cfg.DATALOADER.NUM_WORKERS
471
+ data_loader = torch.utils.data.DataLoader(
472
+ dataset,
473
+ num_workers=num_workers,
474
+ batch_sampler=batch_sampler,
475
+ collate_fn=collator,
476
+ )
477
+ data_loaders.append(data_loader)
478
+ if is_train and cfg.SOLVER.MULTI_MAX_EPOCH:
479
+ cfg.defrost()
480
+ cfg.SOLVER.MULTI_MAX_ITER += (
481
+ cfg.SOLVER.MULTI_MAX_EPOCH[-1] * min([len(dataset) // cfg.SOLVER.IMS_PER_BATCH for dataset in datasets]),)
482
+ cfg.freeze()
483
+
484
+ if is_train and not cfg.DATASETS.ALTERNATIVE_TRAINING and not cfg.DATASETS.MULTISTAGE_TRAINING:
485
+ # during training, a single (possibly concatenated) data_loader is returned
486
+ assert len(data_loaders) == 1
487
+ return data_loaders[0]
488
+
489
+ return data_loaders
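
A typical call site for make_data_loader, sketched under the assumption that the usual cfg object from maskrcnn_benchmark.config is available and that the datasets named by the YAML exist on disk:

    from maskrcnn_benchmark.config import cfg
    from maskrcnn_benchmark.data import make_data_loader

    cfg.merge_from_file("configs/glip_Swin_T_O365_GoldG.yaml")
    cfg.freeze()

    # Training: a single (possibly concatenated) loader is returned.
    train_loader = make_data_loader(cfg, is_train=True, is_distributed=False, start_iter=0)

    # Testing: one loader per test dataset.
    test_loaders = make_data_loader(cfg, is_train=False, is_distributed=False)

    # Each training batch follows the BatchCollator layout defined in collate_batch.py.
    images, targets, img_ids, positive_map, positive_map_eval, greenlight_map = next(iter(train_loader))
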
maskrcnn_benchmark/data/collate_batch.py ADDED
@@ -0,0 +1,93 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+ import torch
3
+ from maskrcnn_benchmark.structures.image_list import to_image_list
4
+
5
+ import pdb
6
+ class BatchCollator(object):
7
+ """
8
+ From a list of samples from the dataset,
9
+ returns the batched images and targets.
10
+ This should be passed to the DataLoader
11
+ """
12
+
13
+ def __init__(self, size_divisible=0):
14
+ self.size_divisible = size_divisible
15
+
16
+ def __call__(self, batch):
17
+ transposed_batch = list(zip(*batch))
18
+
19
+ images = to_image_list(transposed_batch[0], self.size_divisible)
20
+ targets = transposed_batch[1]
21
+ img_ids = transposed_batch[2]
22
+ positive_map = None
23
+ positive_map_eval = None
24
+ greenlight_map = None
25
+
26
+ if isinstance(targets[0], dict):
27
+ return images, targets, img_ids, positive_map, positive_map_eval
28
+
29
+ if "greenlight_map" in transposed_batch[1][0].fields():
30
+ greenlight_map = torch.stack([i.get_field("greenlight_map") for i in transposed_batch[1]], dim = 0)
31
+
32
+ if "positive_map" in transposed_batch[1][0].fields():
33
+ # we batch the positive maps here
34
+ # Since in general each batch element will have a different number of boxes,
35
+ # we collapse a single batch dimension to avoid padding. This is sufficient for our purposes.
36
+ max_len = max([v.get_field("positive_map").shape[1] for v in transposed_batch[1]])
37
+ nb_boxes = sum([v.get_field("positive_map").shape[0] for v in transposed_batch[1]])
38
+ batched_pos_map = torch.zeros((nb_boxes, max_len), dtype=torch.bool)
39
+ cur_count = 0
40
+ for v in transposed_batch[1]:
41
+ cur_pos = v.get_field("positive_map")
42
+ batched_pos_map[cur_count: cur_count + len(cur_pos), : cur_pos.shape[1]] = cur_pos
43
+ cur_count += len(cur_pos)
44
+
45
+ assert cur_count == len(batched_pos_map)
46
+ positive_map = batched_pos_map.float()
47
+
48
+
49
+ if "positive_map_eval" in transposed_batch[1][0].fields():
50
+ # we batch the positive maps here
51
+ # Since in general each batch element will have a different number of boxes,
52
+ # we collapse a single batch dimension to avoid padding. This is sufficient for our purposes.
53
+ max_len = max([v.get_field("positive_map_eval").shape[1] for v in transposed_batch[1]])
54
+ nb_boxes = sum([v.get_field("positive_map_eval").shape[0] for v in transposed_batch[1]])
55
+ batched_pos_map = torch.zeros((nb_boxes, max_len), dtype=torch.bool)
56
+ cur_count = 0
57
+ for v in transposed_batch[1]:
58
+ cur_pos = v.get_field("positive_map_eval")
59
+ batched_pos_map[cur_count: cur_count + len(cur_pos), : cur_pos.shape[1]] = cur_pos
60
+ cur_count += len(cur_pos)
61
+
62
+ assert cur_count == len(batched_pos_map)
63
+ # assert batched_pos_map.sum().item() == sum([v["positive_map"].sum().item() for v in batch[1]])
64
+ positive_map_eval = batched_pos_map.float()
65
+
66
+
67
+ return images, targets, img_ids, positive_map, positive_map_eval, greenlight_map
68
+
69
+
70
+ class BBoxAugCollator(object):
71
+ """
72
+ From a list of samples from the dataset,
73
+ returns the images and targets.
74
+ Images should be converted to batched images in `im_detect_bbox_aug`
75
+ """
76
+
77
+ def __call__(self, batch):
78
+ # return list(zip(*batch))
79
+ transposed_batch = list(zip(*batch))
80
+
81
+ images = transposed_batch[0]
82
+ targets = transposed_batch[1]
83
+ img_ids = transposed_batch[2]
84
+ positive_map = None
85
+ positive_map_eval = None
86
+
87
+ if isinstance(targets[0], dict):
88
+ return images, targets, img_ids, positive_map, positive_map_eval
89
+
90
+ return images, targets, img_ids, positive_map, positive_map_eval
91
+
92
+
93
+
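
A self-contained sketch of BatchCollator with a toy dataset, showing the 6-tuple it yields; the grounding-specific entries stay None here because the toy targets carry no positive_map or greenlight_map fields:

    import torch
    from torch.utils.data import DataLoader, Dataset
    from maskrcnn_benchmark.structures.bounding_box import BoxList
    from maskrcnn_benchmark.data.collate_batch import BatchCollator

    class ToyDataset(Dataset):
        # Yields the (image, BoxList target, index) triplets the collator expects.
        def __len__(self):
            return 4
        def __getitem__(self, idx):
            img = torch.rand(3, 60 + idx, 80)
            target = BoxList(torch.tensor([[5., 5., 30., 30.]]), (80, 60 + idx), mode="xyxy")
            target.add_field("labels", torch.tensor([1]))
            return img, target, idx

    loader = DataLoader(ToyDataset(), batch_size=2, collate_fn=BatchCollator(size_divisible=32))
    images, targets, img_ids, positive_map, positive_map_eval, greenlight_map = next(iter(loader))
    # images.tensors is a padded batch whose height and width are multiples of 32.
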
maskrcnn_benchmark/data/datasets/__init__.py ADDED
@@ -0,0 +1,23 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+ from .coco import COCODataset
3
+ from .voc import PascalVOCDataset
4
+ from .concat_dataset import ConcatDataset
5
+ from .background import Background
6
+ from .tsv import TSVDataset, ODTSVDataset
7
+
8
+ from .modulated_coco import ModulatedDataset, CocoDetection, CocoGrounding
9
+ from .flickr import FlickrDataset
10
+ from .refexp import RefExpDataset
11
+ from .mixed import MixedDataset
12
+ from .gqa import GQADataset
13
+
14
+ from .coco_dt import CocoDetectionTSV
15
+ from .caption import CaptionTSV
16
+ from .lvis import LvisDetection
17
+ from .pseudo_data import PseudoData
18
+ from .phrasecut import PhrasecutDetection
19
+
20
+ __all__ = ["COCODataset", "TSVDataset", "ODTSVDataset", "ConcatDataset", "PascalVOCDataset", "Background",
21
+ "ModulatedDataset", "MixedDataset", "CocoDetection", "FlickrDataset", "RefExpDataset", "GQADataset",
22
+ "CocoDetectionTSV", "CocoGrounding", "CaptionTSV", "LvisDetection", "PseudoData", "PhrasecutDetection"
23
+ ]
maskrcnn_benchmark/data/datasets/background.py ADDED
@@ -0,0 +1,53 @@
1
+ import os
2
+ import os.path
3
+ import json
4
+ from PIL import Image
5
+
6
+ import torch
7
+ import torchvision
8
+ import torch.utils.data as data
9
+ from maskrcnn_benchmark.structures.bounding_box import BoxList
10
+
11
+ class Background(data.Dataset):
12
+ """ Background
13
+
14
+ Args:
15
+ root (string): Root directory where images are downloaded to.
16
+ ann_file (string): Path to json annotation file.
17
+ transform (callable, optional): A function/transform that takes in a PIL image
18
+ and returns a transformed version. E.g, ``transforms.ToTensor``
19
+ """
20
+
21
+ def __init__(self, ann_file, root, remove_images_without_annotations=None, transforms=None):
22
+ self.root = root
23
+
24
+ with open(ann_file, 'r') as f:
25
+ self.ids = json.load(f)['images']
26
+ self.transform = transforms
27
+
28
+ def __getitem__(self, index):
29
+ """
30
+ Args:
31
+ index (int): Index
32
+
33
+ Returns:
34
+ tuple: Tuple (image, target, index). target is an empty BoxList, since background images carry no annotations.
35
+ """
36
+ im_info = self.ids[index]
37
+ path = im_info['file_name']
38
+ fp = os.path.join(self.root, path)
39
+
40
+ img = Image.open(fp).convert('RGB')
41
+ if self.transform is not None:
42
+ img, _ = self.transform(img, None)
43
+ null_target = BoxList(torch.zeros((0,4)), (img.shape[-1], img.shape[-2]))
44
+ null_target.add_field('labels', torch.zeros(0))
45
+
46
+ return img, null_target, index
47
+
48
+ def __len__(self):
49
+ return len(self.ids)
50
+
51
+ def get_img_info(self, index):
52
+ im_info = self.ids[index]
53
+ return im_info
maskrcnn_benchmark/data/datasets/box_label_loader.py ADDED
@@ -0,0 +1,251 @@
1
+ import torch
2
+ import numpy as np
3
+ import math
4
+ import base64
5
+ import collections
6
+ import pycocotools.mask as mask_utils
7
+
8
+ from maskrcnn_benchmark.structures.bounding_box import BoxList
9
+ from maskrcnn_benchmark.structures.segmentation_mask import SegmentationMask
10
+
11
+
12
+ class LabelLoader(object):
13
+ def __init__(self, labelmap, extra_fields=(), filter_duplicate_relations=False, ignore_attr=None, ignore_rel=None,
14
+ mask_mode="poly"):
15
+ self.labelmap = labelmap
16
+ self.extra_fields = extra_fields
17
+ self.supported_fields = ["class", "conf", "attributes", 'scores_all', 'boxes_all', 'feature', "mask"]
18
+ self.filter_duplicate_relations = filter_duplicate_relations
19
+ self.ignore_attr = set(ignore_attr) if ignore_attr is not None else set()
20
+ self.ignore_rel = set(ignore_rel) if ignore_rel is not None else set()
21
+ assert mask_mode == "poly" or mask_mode == "mask"
22
+ self.mask_mode = mask_mode
23
+
24
+ def __call__(self, annotations, img_size, remove_empty=False, load_fields=None):
25
+ boxes = [obj["rect"] for obj in annotations]
26
+ boxes = torch.as_tensor(boxes).reshape(-1, 4)
27
+ target = BoxList(boxes, img_size, mode="xyxy")
28
+
29
+ if load_fields is None:
30
+ load_fields = self.extra_fields
31
+
32
+ for field in load_fields:
33
+ assert field in self.supported_fields, "Unsupported field {}".format(field)
34
+ if field == "class":
35
+ classes = self.add_classes(annotations)
36
+ target.add_field("labels", classes)
37
+ elif field == "conf":
38
+ confidences = self.add_confidences(annotations)
39
+ target.add_field("scores", confidences)
40
+ elif field == "attributes":
41
+ attributes = self.add_attributes(annotations)
42
+ target.add_field("attributes", attributes)
43
+ elif field == "scores_all":
44
+ scores_all = self.add_scores_all(annotations)
45
+ target.add_field("scores_all", scores_all)
46
+ elif field == "boxes_all":
47
+ boxes_all = self.add_boxes_all(annotations)
48
+ target.add_field("boxes_all", boxes_all)
49
+ elif field == "feature":
50
+ features = self.add_features(annotations)
51
+ target.add_field("box_features", features)
52
+ elif field == "mask":
53
+ masks, is_box_mask = self.add_masks(annotations, img_size)
54
+ target.add_field("masks", masks)
55
+ target.add_field("is_box_mask", is_box_mask)
56
+
57
+ target = target.clip_to_image(remove_empty=remove_empty)
58
+ return target
59
+
60
+ def get_box_mask(self, rect, img_size):
61
+ x1, y1, x2, y2 = rect[0], rect[1], rect[2], rect[3]
62
+ if self.mask_mode == "poly":
63
+ return [[x1, y1, x1, y2, x2, y2, x2, y1]]
64
+ elif self.mask_mode == "mask":
65
+ # note the order of height/width order in mask is opposite to image
66
+ mask = np.zeros([img_size[1], img_size[0]], dtype=np.uint8)
67
+ mask[math.floor(y1):math.ceil(y2), math.floor(x1):math.ceil(x2)] = 255
68
+ encoded_mask = mask_utils.encode(np.asfortranarray(mask))
69
+ encoded_mask["counts"] = encoded_mask["counts"].decode("utf-8")
70
+ return encoded_mask
71
+
72
+ def add_masks(self, annotations, img_size):
73
+ masks = []
74
+ is_box_mask = []
75
+ for obj in annotations:
76
+ if "mask" in obj:
77
+ masks.append(obj["mask"])
78
+ is_box_mask.append(0)
79
+ else:
80
+ masks.append(self.get_box_mask(obj["rect"], img_size))
81
+ is_box_mask.append(1)
82
+ masks = SegmentationMask(masks, img_size, mode=self.mask_mode)
83
+ is_box_mask = torch.tensor(is_box_mask)
84
+ return masks, is_box_mask
85
+
86
+ def add_classes(self, annotations):
87
+ class_names = [obj["class"] for obj in annotations]
88
+ classes = [None] * len(class_names)
89
+ for i in range(len(class_names)):
90
+ classes[i] = self.labelmap['class_to_ind'][class_names[i]]
91
+ return torch.tensor(classes)
92
+
93
+ def add_confidences(self, annotations):
94
+ confidences = []
95
+ for obj in annotations:
96
+ if "conf" in obj:
97
+ confidences.append(obj["conf"])
98
+ else:
99
+ confidences.append(1.0)
100
+ return torch.tensor(confidences)
101
+
102
+ def add_attributes(self, annotations):
103
+ # the maximal number of attributes per object is 16
104
+ attributes = [[0] * 16 for _ in range(len(annotations))]
105
+ for i, obj in enumerate(annotations):
106
+ for j, attr in enumerate(obj["attributes"]):
107
+ attributes[i][j] = self.labelmap['attribute_to_ind'][attr]
108
+ return torch.tensor(attributes)
109
+
110
+ def add_features(self, annotations):
111
+ features = []
112
+ for obj in annotations:
113
+ features.append(np.frombuffer(base64.b64decode(obj['feature']), np.float32))
114
+ return torch.tensor(features)
115
+
116
+ def add_scores_all(self, annotations):
117
+ scores_all = []
118
+ for obj in annotations:
119
+ scores_all.append(np.frombuffer(base64.b64decode(obj['scores_all']), np.float32))
120
+ return torch.tensor(scores_all)
121
+
122
+ def add_boxes_all(self, annotations):
123
+ boxes_all = []
124
+ for obj in annotations:
125
+ boxes_all.append(np.frombuffer(base64.b64decode(obj['boxes_all']), np.float32).reshape(-1, 4))
126
+ return torch.tensor(boxes_all)
127
+
128
+ def relation_loader(self, relation_annos, target):
129
+ if self.filter_duplicate_relations:
130
+ # Filter out dupes!
131
+ all_rel_sets = collections.defaultdict(list)
132
+ for triplet in relation_annos:
133
+ all_rel_sets[(triplet['subj_id'], triplet['obj_id'])].append(triplet)
134
+ relation_annos = [np.random.choice(v) for v in all_rel_sets.values()]
135
+
136
+ # get M*M pred_labels
137
+ relation_triplets = []
138
+ relations = torch.zeros([len(target), len(target)], dtype=torch.int64)
139
+ for i in range(len(relation_annos)):
140
+ if len(self.ignore_rel) != 0 and relation_annos[i]['class'] in self.ignore_rel:
141
+ continue
142
+ subj_id = relation_annos[i]['subj_id']
143
+ obj_id = relation_annos[i]['obj_id']
144
+ predicate = self.labelmap['relation_to_ind'][relation_annos[i]['class']]
145
+ relations[subj_id, obj_id] = predicate
146
+ relation_triplets.append([subj_id, obj_id, predicate])
147
+
148
+ relation_triplets = torch.tensor(relation_triplets)
149
+ target.add_field("relation_labels", relation_triplets)
150
+ target.add_field("pred_labels", relations)
151
+ return target
152
+
153
+
154
+ class BoxLabelLoader(object):
155
+ def __init__(self, labelmap, extra_fields=(), ignore_attrs=(),
156
+ mask_mode="poly"):
157
+ self.labelmap = labelmap
158
+ self.extra_fields = extra_fields
159
+ self.ignore_attrs = ignore_attrs
160
+ assert mask_mode == "poly" or mask_mode == "mask"
161
+ self.mask_mode = mask_mode
162
+ self.all_fields = ["class", "mask", "confidence",
163
+ "attributes_encode", "IsGroupOf", "IsProposal"]
164
+
165
+ def __call__(self, annotations, img_size, remove_empty=True):
166
+ boxes = [obj["rect"] for obj in annotations]
167
+ boxes = torch.as_tensor(boxes).reshape(-1, 4)
168
+ target = BoxList(boxes, img_size, mode="xyxy")
169
+
170
+ for field in self.extra_fields:
171
+ assert field in self.all_fields, "Unsupported field {}".format(field)
172
+ if field == "class":
173
+ classes = self.add_classes_with_ignore(annotations)
174
+ target.add_field("labels", classes)
175
+ elif field == "mask":
176
+ masks, is_box_mask = self.add_masks(annotations, img_size)
177
+ target.add_field("masks", masks)
178
+ target.add_field("is_box_mask", is_box_mask)
179
+ elif field == "confidence":
180
+ confidences = self.add_confidences(annotations)
181
+ target.add_field("confidences", confidences)
182
+ elif field == "attributes_encode":
183
+ attributes = self.add_attributes(annotations)
184
+ target.add_field("attributes", attributes)
185
+ elif field == "IsGroupOf":
186
+ is_group = [1 if 'IsGroupOf' in obj and obj['IsGroupOf'] == 1 else 0
187
+ for obj in annotations]
188
+ target.add_field("IsGroupOf", torch.tensor(is_group))
189
+ elif field == "IsProposal":
190
+ is_proposal = [1 if "IsProposal" in obj and obj['IsProposal'] == 1 else 0
191
+ for obj in annotations]
192
+ target.add_field("IsProposal", torch.tensor(is_proposal))
193
+
194
+ target = target.clip_to_image(remove_empty=remove_empty)
195
+ return target
196
+
197
+ def add_classes_with_ignore(self, annotations):
198
+ class_names = [obj["class"] for obj in annotations]
199
+ classes = [None] * len(class_names)
200
+ if self.ignore_attrs:
201
+ for i, obj in enumerate(annotations):
202
+ if any([obj[attr] for attr in self.ignore_attrs if attr in obj]):
203
+ classes[i] = -1
204
+ for i, cls in enumerate(classes):
205
+ if cls != -1:
206
+ classes[i] = self.labelmap[class_names[i]] + 1 # 0 is saved for background
207
+ return torch.tensor(classes)
208
+
209
+ def add_masks(self, annotations, img_size):
210
+ masks = []
211
+ is_box_mask = []
212
+ for obj in annotations:
213
+ if "mask" in obj:
214
+ masks.append(obj["mask"])
215
+ is_box_mask.append(0)
216
+ else:
217
+ masks.append(self.get_box_mask(obj["rect"], img_size))
218
+ is_box_mask.append(1)
219
+ masks = SegmentationMask(masks, img_size, mode=self.mask_mode)
220
+ is_box_mask = torch.tensor(is_box_mask)
221
+ return masks, is_box_mask
222
+
223
+ def get_box_mask(self, rect, img_size):
224
+ x1, y1, x2, y2 = rect[0], rect[1], rect[2], rect[3]
225
+ if self.mask_mode == "poly":
226
+ return [[x1, y1, x1, y2, x2, y2, x2, y1]]
227
+ elif self.mask_mode == "mask":
228
+ # note: the height/width order in the mask is the opposite of the image size
229
+ mask = np.zeros([img_size[1], img_size[0]], dtype=np.uint8)
230
+ mask[math.floor(y1):math.ceil(y2), math.floor(x1):math.ceil(x2)] = 255
231
+ encoded_mask = mask_utils.encode(np.asfortranarray(mask))
232
+ encoded_mask["counts"] = encoded_mask["counts"].decode("utf-8")
233
+ return encoded_mask
234
+
235
+ def add_confidences(self, annotations):
236
+ confidences = []
237
+ for obj in annotations:
238
+ if "confidence" in obj:
239
+ confidences.append(obj["confidence"])
240
+ elif "conf" in obj:
241
+ confidences.append(obj["conf"])
242
+ else:
243
+ confidences.append(1.0)
244
+ return torch.tensor(confidences)
245
+
246
+ def add_attributes(self, annotations):
247
+ # we know that the maximal number of attributes per object is 16
248
+ attributes = [[0] * 16 for _ in range(len(annotations))]
249
+ for i, obj in enumerate(annotations):
250
+ attributes[i][:len(obj["attributes_encode"])] = obj["attributes_encode"]
251
+ return torch.tensor(attributes)
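A minimal standalone sketch (toy annotations, not part of the commit) of the fixed-width attribute padding that BoxLabelLoader.add_attributes performs above: each object's encoded attribute ids are copied into a 16-slot row, with 0 used as padding.

import torch

annotations = [{"attributes_encode": [3, 7]}, {"attributes_encode": [12]}]  # hypothetical objects
attributes = [[0] * 16 for _ in range(len(annotations))]  # one 16-wide row per object
for i, obj in enumerate(annotations):
    attributes[i][:len(obj["attributes_encode"])] = obj["attributes_encode"]
attributes = torch.tensor(attributes)
print(attributes.shape)  # torch.Size([2, 16])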
maskrcnn_benchmark/data/datasets/caption.py ADDED
@@ -0,0 +1,279 @@
1
+ import torch
2
+ import torch.distributed as dist
3
+ import time
4
+ from torchvision.ops import nms
5
+ import random
6
+ import numpy as np
7
+ from PIL import Image, ImageDraw
8
+ import pdb
9
+ from maskrcnn_benchmark.structures.bounding_box import BoxList
10
+ from .modulated_coco import ConvertCocoPolysToMask
11
+ from .tsv import ODTSVDataset, TSVYamlDataset
12
+ from .od_to_grounding import sanity_check_target_after_processing
13
+
14
+ class CaptionTSV(TSVYamlDataset):
15
+ def __init__(self,
16
+ yaml_file,
17
+ transforms,
18
+ return_tokens,
19
+ return_masks,
20
+ tokenizer,
21
+ caption_min_box=1,
22
+ replace_clean_label=False,
23
+ further_screen=False,
24
+ caption_conf=0.5,
25
+ caption_nms=-1,
26
+ pack_random_caption_number=0,
27
+ inference_caption=False,
28
+ sample_negative_for_grounding_data=-1,
29
+ random_pack_prob=-1.0,
30
+ no_random_pack_probability=0.0,
31
+ safeguard_positive_caption=True,
32
+ mlm_obj_for_only_positive=False,
33
+ caption_format_version="v1",
34
+ local_debug=False,
35
+ max_query_len=256,
36
+ **kwargs
37
+ ):
38
+ super(CaptionTSV, self).__init__(yaml_file, None, replace_clean_label)
39
+ self.yaml_file = yaml_file
40
+ self._transforms = transforms
41
+ self.max_query_len = max_query_len
42
+ self.prepare = ConvertCocoPolysToMask(return_masks=return_masks,
43
+ return_tokens=return_tokens,
44
+ tokenizer=tokenizer,
45
+ max_query_len=max_query_len)
46
+ self.tokenizer = tokenizer
47
+ self.caption_min_box = caption_min_box
48
+ self.replace_clean_label = replace_clean_label
49
+ self.further_screen = further_screen
50
+ self.pack_random_caption_number = pack_random_caption_number
51
+ self.caption_format_version = caption_format_version
52
+
53
+ self.caption_conf = caption_conf
54
+ self.caption_nms = caption_nms
55
+ self.inference_caption = inference_caption
56
+ self.sample_negative_for_grounding_data = sample_negative_for_grounding_data
57
+ self.random_pack_prob = random_pack_prob
58
+ self.no_random_pack_probability = no_random_pack_probability
59
+ self.safeguard_positive_caption = safeguard_positive_caption
60
+ self.mlm_obj_for_only_positive = mlm_obj_for_only_positive
61
+ try:
62
+ self.rank = dist.get_rank()
63
+ except:
64
+ self.rank = 0
65
+
66
+ def __len__(self):
67
+ return super(CaptionTSV, self).__len__()
68
+
69
+ def pack_caption(self, positive_caption, negative_captions, original_tokens_positive):
70
+ if len(negative_captions) == 0:
71
+ return positive_caption, original_tokens_positive, [(0, len(positive_caption))]
72
+ if self.safeguard_positive_caption:
73
+ length_of_each_caption = []
74
+ for caption in negative_captions + [positive_caption]:
75
+ tokenized = self.tokenizer(caption, return_tensors="pt")
76
+ length_of_each_caption.append(tokenized.input_ids.size(-1))
77
+ max_length = self.max_query_len - length_of_each_caption[-1]
78
+ indexes = list(range(len(negative_captions)))
79
+ random.shuffle(indexes)
80
+ new_caption_list = [positive_caption]
81
+ for i in indexes:
82
+ if length_of_each_caption[i] < max_length:
83
+ new_caption_list.append(negative_captions[i])
84
+ max_length -= length_of_each_caption[i]
85
+ else:
86
+ new_caption_list = [positive_caption] + negative_captions
87
+ random.shuffle(new_caption_list)
88
+
89
+ new_caption = ''
90
+
91
+ for i in new_caption_list:
92
+ if i == positive_caption:
93
+ start_position = len(new_caption)
94
+ new_caption += i
95
+ if not i.endswith("."):
96
+ new_caption += "."
97
+ new_caption += " "
98
+
99
+ # shift the token spans that the boxes are aligned to
100
+ for index, i in enumerate(original_tokens_positive):
101
+ original_tokens_positive[index] = [tuple(j) for j in i]
102
+ for i in original_tokens_positive:
103
+ for index, j in enumerate(i):
104
+ i[index] = (j[0] + start_position, j[1] + start_position)
105
+
106
+ return new_caption, original_tokens_positive, [(start_position, start_position + len(positive_caption))]
107
+
108
+ def __get_negative_captions__(self, idx, negative_size=7):
109
+ negative_captions = []
110
+ for i in range(negative_size):
111
+ img, anno, _, scale = super(CaptionTSV, self).__getitem__(np.random.choice(len(self)))
112
+ caption = anno["caption"]
113
+ negative_captions.append(caption)
114
+
115
+ return negative_captions
116
+
117
+ def __getitem__(self, idx):
118
+ try:
119
+ img, anno, _, scale = super(CaptionTSV, self).__getitem__(idx)
120
+ if self.inference_caption:
121
+ caption = None
122
+ if isinstance(anno, list):
123
+ caption = anno[0]["caption"] # inference mode for bing
124
+ anno = []
125
+ elif len(anno) == 1:
126
+ caption = anno["caption"] # inference mode for googlecc
127
+ anno = []
128
+ else:
129
+ caption = " ".join(anno["captions"])
130
+ anno = []
131
+ else:
132
+ '''
133
+ An example
134
+ {'img_h': 1154, 'img_w': 1600, 'caption': 'xxx', 'tokens_positive': [[[47, 50], [51, 53], [54, 59]], [[32, 35], [36, 41]], [[32, 35], [36, 41]], [[0, 3], [3, 6], [6, 10], [11, 16], [17, 19], [20, 23]], [[32, 35], [36, 41]], [[32, 35], [36, 41]]], 'bboxes': [[7.344961166381836, 10.479412078857422, 1592.2679443359375, 1090.0028076171875], [950.32861328125, 346.572021484375, 1333.2373046875, 679.3215942382812], [927.44140625, 342.7712707519531, 1389.833984375, 719.5758666992188], [90.48786163330078, 363.67572021484375, 1381.8631591796875, 1078.687744140625], [122.84217071533203, 422.6786193847656, 507.845703125, 667.2651977539062], [80.62384033203125, 416.500244140625, 563.1666259765625, 734.603271484375]], 'scores': [0.7966700196266174, 0.8952182531356812, 0.8186006546020508, 0.9995516538619995, 0.8021856546401978, 0.8923134803771973]}
135
+ '''
136
+ if len(anno["bboxes"]) < self.caption_min_box: # Retry triggered!
137
+ return self[np.random.choice(len(self))]
138
+
139
+ if self.caption_format_version == "v2":
140
+ anno = self.convert_anno_from_v2_to_v1(anno)
141
+
142
+ try:
143
+ if self.further_screen:
144
+ conf = self.caption_conf
145
+ nms_thre = self.caption_nms
146
+
147
+ bboxes = torch.as_tensor(anno["bboxes"]).float()
148
+ scores = torch.as_tensor(anno["scores"])
149
+ tokens_positive = anno["tokens_positive"]
150
+
151
+ # print("\n\n\n\n tokens_positive in original data", tokens_positive)
152
+
153
+ keep = scores > conf
154
+ scores = scores[keep]
155
+ bboxes = bboxes[keep]
156
+ tokens_positive = [i for index, i in enumerate(tokens_positive) if keep[index]]
157
+
158
+ assert (len(tokens_positive) == len(bboxes) == len(scores))
159
+
160
+ if len(bboxes) < self.caption_min_box: # Retry triggered!
161
+ return self[np.random.choice(len(self))]
162
+
163
+ if nms_thre > 0:
164
+ keep = nms(boxes=bboxes, scores=scores, iou_threshold=nms_thre)
165
+ scores = scores[keep]
166
+ bboxes = bboxes[keep]
167
+ tokens_positive = [tokens_positive[i] for i in keep]
168
+ assert (len(tokens_positive) == len(bboxes) == len(scores))
169
+
170
+ # Write back
171
+ anno["bboxes"] = bboxes.tolist()
172
+ anno["scores"] = scores.tolist()
173
+ anno["tokens_positive"] = tokens_positive
174
+
175
+ boxes = torch.as_tensor(anno["bboxes"])
176
+
177
+ if len(boxes) < self.caption_min_box: # Retry triggered!
178
+ return self[np.random.choice(len(self))]
179
+
180
+ target = BoxList(boxes, (anno["img_w"], anno["img_h"]), mode="xyxy")
181
+ target = target.clip_to_image(remove_empty=True)
182
+
183
+ caption = anno["caption"]
184
+ # print("original caption", caption)
185
+ empty_everything = False
186
+ if self.sample_negative_for_grounding_data != -1:
187
+ if random.random() < self.sample_negative_for_grounding_data:
188
+ empty_everything = True
189
+
190
+ if empty_everything:
191
+ caption = self.__get_negative_captions__(idx, negative_size=1)[0]
192
+
193
+ if self.pack_random_caption_number != 0:
194
+ if self.random_pack_prob != -1.0:
195
+ if random.random() < self.no_random_pack_probability:
196
+ negative_pack_number = 0
197
+ elif random.random() < self.random_pack_prob:
198
+ negative_pack_number = self.pack_random_caption_number
199
+ else:
200
+ negative_pack_number = np.random.choice(self.pack_random_caption_number)
201
+ else:
202
+ negative_pack_number = self.pack_random_caption_number
203
+
204
+ negative_captions = self.__get_negative_captions__(idx, negative_size=negative_pack_number)
205
+
206
+ caption, anno["tokens_positive"], greenlight_span_for_masked_lm_objective = self.pack_caption(
207
+ caption, negative_captions, anno["tokens_positive"])
208
+ else:
209
+ greenlight_span_for_masked_lm_objective = [(0, len(caption))]
210
+
211
+ if not self.mlm_obj_for_only_positive:
212
+ greenlight_span_for_masked_lm_objective = [(0, len(caption))]
213
+
214
+ new_anno = []
215
+ areas = target.area()
216
+ for i in range(len(target)):
217
+ new_anno_i = {}
218
+ new_anno_i["area"] = areas[i]
219
+ new_anno_i["iscrowd"] = 0
220
+ new_anno_i["image_id"] = idx
221
+ new_anno_i["category_id"] = 1 # following vg and others
222
+ new_anno_i["id"] = None
223
+ new_anno_i['bbox'] = target.bbox[i].numpy().tolist()
224
+ new_anno_i["tokens_positive"] = anno["tokens_positive"][i]
225
+ new_anno.append(new_anno_i)
226
+
227
+ except:
228
+ return self[np.random.choice(len(self))]
229
+
230
+ anno = new_anno
231
+ if empty_everything:
232
+ anno = []
233
+
234
+ annotations = {"image_id": idx, "annotations": anno, "caption": caption}
235
+ annotations["greenlight_span_for_masked_lm_objective"] = greenlight_span_for_masked_lm_objective
236
+ img, annotations = self.prepare(img, annotations, box_format="xyxy")
237
+
238
+ if self._transforms is not None:
239
+ img, target = self._transforms(img, target)
240
+
241
+ # add additional property
242
+ for ann in annotations:
243
+ target.add_field(ann, annotations[ann])
244
+ except:
245
+ print("Outter Retry triggered!!")
246
+ return self[np.random.choice(len(self))]
247
+
248
+ sanity_check_target_after_processing(target)
249
+
250
+ return img, target, idx
251
+
252
+ def convert_anno_from_v2_to_v1(self, anno):
253
+ flatterned_bboxes = []
254
+ flatterned_tokens_positive = []
255
+ flatterned_bboxes_scores = []
256
+ for i in range(len(anno["bboxes"])):
257
+ # i is the index for entity
258
+ for j in range(len(anno["bboxes"][i])):
259
+ # j is the index for each box
260
+ flatterned_bboxes.append(anno["bboxes"][i][j])
261
+ flatterned_tokens_positive.append(
262
+ anno["tokens_positive"][i]) # Assume this box corresponds to all the token_spans for this entity
263
+ flatterned_bboxes_scores.append(anno["scores"][i][j])
264
+ anno["bboxes"] = flatterned_bboxes
265
+ anno["tokens_positive"] = flatterned_tokens_positive
266
+ anno["scores"] = flatterned_bboxes_scores
267
+ return anno
268
+
269
+
270
+ def get_raw_image(self, idx):
271
+ image, *_ = super(CaptionTSV, self).__getitem__(idx)
272
+ return image
273
+
274
+ def get_img_id(self, idx):
275
+ line_no = self.get_line_no(idx)
276
+ if self.label_tsv is not None:
277
+ row = self.label_tsv.seek(line_no)
278
+ img_id = row[0]
279
+ return img_id
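A minimal sketch, with toy captions and spans, of the span shifting that pack_caption performs above: once the positive caption is placed at start_position inside the packed caption, every character span the boxes are aligned to is offset by that amount.

positive_caption = "a dog on the grass."
negative_caption = "a red car."
start_position = len(negative_caption) + 1          # positive caption packed after the negative one
tokens_positive = [[(2, 5)], [(13, 18)]]            # spans for "dog" and "grass" in the positive caption
shifted = [[(s + start_position, e + start_position) for s, e in spans] for spans in tokens_positive]
packed = negative_caption + " " + positive_caption
assert packed[shifted[0][0][0]:shifted[0][0][1]] == "dog"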
maskrcnn_benchmark/data/datasets/coco.py ADDED
@@ -0,0 +1,268 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+ import os
3
+ import os.path
4
+ import math
5
+ from PIL import Image, ImageDraw
6
+
7
+ import random
8
+ import numpy as np
9
+
10
+ import torch
11
+ import torchvision
12
+ import torch.utils.data as data
13
+
14
+ from maskrcnn_benchmark.structures.bounding_box import BoxList
15
+ from maskrcnn_benchmark.structures.segmentation_mask import SegmentationMask
16
+ from maskrcnn_benchmark.structures.keypoint import PersonKeypoints
17
+ from maskrcnn_benchmark.config import cfg
18
+ import pdb
19
+
20
+ def _count_visible_keypoints(anno):
21
+ return sum(sum(1 for v in ann["keypoints"][2::3] if v > 0) for ann in anno)
22
+
23
+
24
+ def _has_only_empty_bbox(anno):
25
+ return all(any(o <= 1 for o in obj["bbox"][2:]) for obj in anno)
26
+
27
+
28
+ def has_valid_annotation(anno):
29
+ # if it's empty, there is no annotation
30
+ if len(anno) == 0:
31
+ return False
32
+ # if all boxes have close to zero area, there is no annotation
33
+ if _has_only_empty_bbox(anno):
34
+ return False
35
+ # keypoint tasks have a slightly different criterion for considering
36
+ # if an annotation is valid
37
+ if "keypoints" not in anno[0]:
38
+ return True
39
+ # for keypoint detection tasks, only consider valid images those
40
+ # containing at least min_keypoints_per_image
41
+ if _count_visible_keypoints(anno) >= cfg.DATALOADER.MIN_KPS_PER_IMS:
42
+ return True
43
+ return False
44
+
45
+
46
+ def pil_loader(path, retry=5):
47
+ # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835)
48
+ ri = 0
49
+ while ri < retry:
50
+ try:
51
+ with open(path, 'rb') as f:
52
+ img = Image.open(f)
53
+ return img.convert('RGB')
54
+ except:
55
+ ri += 1
56
+
57
+
58
+ def rgb2id(color):
59
+ if isinstance(color, np.ndarray) and len(color.shape) == 3:
60
+ if color.dtype == np.uint8:
61
+ color = color.astype(np.int32)
62
+ return color[:, :, 0] + 256 * color[:, :, 1] + 256 * 256 * color[:, :, 2]
63
+ return int(color[0] + 256 * color[1] + 256 * 256 * color[2])
64
+
65
+
66
+ class CocoDetection(data.Dataset):
67
+ """`MS Coco Detection <http://mscoco.org/dataset/#detections-challenge2016>`_ Dataset.
68
+
69
+ Args:
70
+ root (string): Root directory where images are downloaded to.
71
+ annFile (string): Path to json annotation file.
72
+ transform (callable, optional): A function/transform that takes in an PIL image
73
+ and returns a transformed version. E.g, ``transforms.ToTensor``
74
+ target_transform (callable, optional): A function/transform that takes in the
75
+ target and transforms it.
76
+ """
77
+
78
+ def __init__(self, root, annFile, transform=None, target_transform=None):
79
+ from pycocotools.coco import COCO
80
+ self.root = root
81
+ self.coco = COCO(annFile)
82
+ self.ids = list(self.coco.imgs.keys())
83
+ self.transform = transform
84
+ self.target_transform = target_transform
85
+
86
+ def __getitem__(self, index, return_meta=False):
87
+ """
88
+ Args:
89
+ index (int): Index
90
+
91
+ Returns:
92
+ tuple: Tuple (image, target). target is the object returned by ``coco.loadAnns``.
93
+ """
94
+ coco = self.coco
95
+ img_id = self.ids[index]
96
+ if isinstance(img_id, str):
97
+ img_id = [img_id]
98
+ ann_ids = coco.getAnnIds(imgIds=img_id)
99
+ target = coco.loadAnns(ann_ids)
100
+
101
+ meta = coco.loadImgs(img_id)[0]
102
+ path = meta['file_name']
103
+ img = pil_loader(os.path.join(self.root, path))
104
+
105
+ if self.transform is not None:
106
+ img = self.transform(img)
107
+
108
+ if self.target_transform is not None:
109
+ target = self.target_transform(target)
110
+
111
+ if return_meta:
112
+ return img, target, meta
113
+ else:
114
+ return img, target
115
+
116
+ def __len__(self):
117
+ return len(self.ids)
118
+
119
+ def __repr__(self):
120
+ fmt_str = 'Dataset ' + self.__class__.__name__ + '\n'
121
+ fmt_str += ' Number of datapoints: {}\n'.format(self.__len__())
122
+ fmt_str += ' Root Location: {}\n'.format(self.root)
123
+ tmp = ' Transforms (if any): '
124
+ fmt_str += '{0}{1}\n'.format(tmp, self.transform.__repr__().replace('\n', '\n' + ' ' * len(tmp)))
125
+ tmp = ' Target Transforms (if any): '
126
+ fmt_str += '{0}{1}'.format(tmp, self.target_transform.__repr__().replace('\n', '\n' + ' ' * len(tmp)))
127
+ return fmt_str
128
+
129
+
130
+ class COCODataset(CocoDetection):
131
+ def __init__(self, ann_file, root, remove_images_without_annotations, transforms=None, ignore_crowd=True,
132
+ max_box=-1,
133
+ few_shot=0, one_hot=False, override_category=None, **kwargs
134
+ ):
135
+ super(COCODataset, self).__init__(root, ann_file)
136
+ # sort indices for reproducible results
137
+ self.ids = sorted(self.ids)
138
+
139
+ # filter images without detection annotations
140
+ if remove_images_without_annotations:
141
+ ids = []
142
+ for img_id in self.ids:
143
+ if isinstance(img_id, str):
144
+ ann_ids = self.coco.getAnnIds(imgIds=[img_id], iscrowd=None)
145
+ else:
146
+ ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=None)
147
+ anno = self.coco.loadAnns(ann_ids)
148
+ if has_valid_annotation(anno):
149
+ ids.append(img_id)
150
+ self.ids = ids
151
+
152
+ if few_shot:
153
+ ids = []
154
+ cats_freq = [few_shot]*len(self.coco.cats.keys())
155
+ if 'shuffle_seed' in kwargs and kwargs['shuffle_seed'] != 0:
156
+ import random
157
+ random.Random(kwargs['shuffle_seed']).shuffle(self.ids)
158
+ print("Shuffle the dataset with random seed: ", kwargs['shuffle_seed'])
159
+ for img_id in self.ids:
160
+ if isinstance(img_id, str):
161
+ ann_ids = self.coco.getAnnIds(imgIds=[img_id], iscrowd=None)
162
+ else:
163
+ ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=None)
164
+ anno = self.coco.loadAnns(ann_ids)
165
+ cat = set([ann['category_id'] for ann in anno])  # set/tuple corresponds to instance/image level
166
+ is_needed = sum([cats_freq[c-1]>0 for c in cat])
167
+ if is_needed:
168
+ ids.append(img_id)
169
+ for c in cat:
170
+ cats_freq[c-1] -= 1
171
+ # print(cat, cats_freq)
172
+ self.ids = ids
173
+
174
+ if override_category is not None:
175
+ self.coco.dataset["categories"] = override_category
176
+ print("Override category: ", override_category)
177
+
178
+ self.json_category_id_to_contiguous_id = {
179
+ v: i + 1 for i, v in enumerate(self.coco.getCatIds())
180
+ }
181
+ self.contiguous_category_id_to_json_id = {
182
+ v: k for k, v in self.json_category_id_to_contiguous_id.items()
183
+ }
184
+ self.id_to_img_map = {k: v for k, v in enumerate(self.ids)}
185
+ self.transforms = transforms
186
+ self.ignore_crowd = ignore_crowd
187
+ self.max_box = max_box
188
+ self.one_hot = one_hot
189
+
190
+ def categories(self, no_background=True):
191
+ categories = self.coco.dataset["categories"]
192
+ label_list = {}
193
+ for index, i in enumerate(categories):
194
+ if not no_background or (i["name"] != "__background__" and i['id'] != 0):
195
+ label_list[self.json_category_id_to_contiguous_id[i["id"]]] = i["name"]
196
+ return label_list
197
+
198
+ def __getitem__(self, idx):
199
+
200
+
201
+ img, anno = super(COCODataset, self).__getitem__(idx)
202
+
203
+ # filter crowd annotations
204
+ if self.ignore_crowd:
205
+ anno = [obj for obj in anno if obj["iscrowd"] == 0]
206
+
207
+ boxes = [obj["bbox"] for obj in anno]
208
+ boxes = torch.as_tensor(boxes).reshape(-1, 4) # guard against no boxes
209
+ if self.max_box > 0 and len(boxes) > self.max_box:
210
+ rand_idx = torch.randperm(self.max_box)
211
+ boxes = boxes[rand_idx, :]
212
+ else:
213
+ rand_idx = None
214
+ target = BoxList(boxes, img.size, mode="xywh").convert("xyxy")
215
+
216
+ classes = [obj["category_id"] for obj in anno]
217
+ classes = [self.json_category_id_to_contiguous_id[c] for c in classes]
218
+ classes = torch.tensor(classes)
219
+
220
+ if rand_idx is not None:
221
+ classes = classes[rand_idx]
222
+ if cfg.DATASETS.CLASS_AGNOSTIC:
223
+ classes = torch.ones_like(classes)
224
+ target.add_field("labels", classes)
225
+
226
+ if anno and "segmentation" in anno[0]:
227
+ masks = [obj["segmentation"] for obj in anno]
228
+ masks = SegmentationMask(masks, img.size, mode='poly')
229
+ target.add_field("masks", masks)
230
+
231
+ if anno and "cbox" in anno[0]:
232
+ cboxes = [obj["cbox"] for obj in anno]
233
+ cboxes = torch.as_tensor(cboxes).reshape(-1, 4) # guard against no boxes
234
+ cboxes = BoxList(cboxes, img.size, mode="xywh").convert("xyxy")
235
+ target.add_field("cbox", cboxes)
236
+
237
+ if anno and "keypoints" in anno[0]:
238
+ keypoints = []
239
+ gt_keypoint = self.coco.cats[1]['keypoints'] # <TODO> a better way to get keypoint description
240
+ use_keypoint = cfg.MODEL.ROI_KEYPOINT_HEAD.KEYPOINT_NAME
241
+ for obj in anno:
242
+ if len(use_keypoint) > 0:
243
+ kps = []
244
+ for name in use_keypoint:
245
+ kp_idx = slice(3 * gt_keypoint.index(name), 3 * gt_keypoint.index(name) + 3)
246
+ kps += obj["keypoints"][kp_idx]
247
+ keypoints.append(kps)
248
+ else:
249
+ keypoints.append(obj["keypoints"])
250
+ keypoints = PersonKeypoints(keypoints, img.size)
251
+ target.add_field("keypoints", keypoints)
252
+
253
+ target = target.clip_to_image(remove_empty=True)
254
+
255
+ if self.transforms is not None:
256
+ img, target = self.transforms(img, target)
257
+
258
+ if cfg.DATASETS.SAMPLE_RATIO != 0.0:
259
+ ratio = cfg.DATASETS.SAMPLE_RATIO
260
+ num_sample_target = math.ceil(len(target) * ratio) if ratio > 0 else math.ceil(-ratio)
261
+ sample_idx = torch.randperm(len(target))[:num_sample_target]
262
+ target = target[sample_idx]
263
+ return img, target, idx
264
+
265
+ def get_img_info(self, index):
266
+ img_id = self.id_to_img_map[index]
267
+ img_data = self.coco.imgs[img_id]
268
+ return img_data
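A standalone sketch (toy category ids) of the mapping COCODataset builds above between COCO json category ids and contiguous labels, reserving 0 for the background class.

json_cat_ids = [1, 3, 18, 90]  # hypothetical output of coco.getCatIds()
json_to_contiguous = {v: i + 1 for i, v in enumerate(json_cat_ids)}
contiguous_to_json = {v: k for k, v in json_to_contiguous.items()}
print(json_to_contiguous)      # {1: 1, 3: 2, 18: 3, 90: 4}
print(contiguous_to_json[3])   # 18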
maskrcnn_benchmark/data/datasets/coco_dt.py ADDED
@@ -0,0 +1,154 @@
1
+ """
2
+ COCO dataset which returns image_id for evaluation.
3
+
4
+ Mostly copy-paste from https://github.com/pytorch/vision/blob/13b35ff/references/detection/coco_utils.py
5
+ """
6
+
7
+ import torch
8
+ import json
9
+ from PIL import Image, ImageDraw
10
+
11
+ from .modulated_coco import ConvertCocoPolysToMask
12
+ from .tsv import ODTSVDataset
13
+ from pycocotools.coco import COCO
14
+ from maskrcnn_benchmark.structures.bounding_box import BoxList
15
+ import random
16
+ from .od_to_grounding import convert_object_detection_to_grounding_optimized_for_od, check_for_positive_overflow, sanity_check_target_after_processing
17
+
18
+
19
+ class CocoDetectionTSV(ODTSVDataset):
20
+ def __init__(self,
21
+ name,
22
+ yaml_file,
23
+ transforms,
24
+ return_tokens,
25
+ tokenizer,
26
+ extra_fields,
27
+ random_sample_negative=-1,
28
+ add_detection_prompt=False,
29
+ add_detection_prompt_advanced=False,
30
+ use_od_data_aug=False,
31
+ control_probabilities={},
32
+ disable_shuffle=False,
33
+ prompt_engineer_version="v2",
34
+ prompt_limit_negative=-1,
35
+ positive_question_probability=0.6,
36
+ negative_question_probability=0.8,
37
+ full_question_probability=0.5,
38
+ disable_clip_to_image=False,
39
+ separation_tokens=" ",
40
+ no_mask_for_od=False,
41
+ max_num_labels=-1,
42
+ max_query_len=256,
43
+ **kwargs
44
+ ):
45
+ super(CocoDetectionTSV, self).__init__(yaml_file, extra_fields, **kwargs)
46
+
47
+ self._transforms = transforms
48
+ self.name = name
49
+ self.max_query_len = max_query_len
50
+ self.prepare = ConvertCocoPolysToMask(
51
+ return_masks=False,
52
+ return_tokens=return_tokens,
53
+ tokenizer=tokenizer,
54
+ max_query_len=max_query_len
55
+ )
56
+ self.tokenizer = tokenizer
57
+
58
+ self.control_probabilities = control_probabilities
59
+ self.random_sample_negative = random_sample_negative
60
+ self.add_detection_prompt = add_detection_prompt
61
+ self.add_detection_prompt_advanced = add_detection_prompt_advanced
62
+ self.use_od_data_aug = use_od_data_aug
63
+
64
+ self.prompt_engineer_version = prompt_engineer_version
65
+ self.prompt_limit_negative = prompt_limit_negative
66
+ self.positive_question_probability = positive_question_probability
67
+ self.negative_question_probability = negative_question_probability
68
+ self.full_question_probability = full_question_probability
69
+ self.separation_tokens = separation_tokens
70
+ self.disable_clip_to_image = disable_clip_to_image
71
+ self.disable_shuffle = disable_shuffle
72
+ self.no_mask_for_od = no_mask_for_od
73
+ self.max_num_labels = max_num_labels
74
+
75
+ def __len__(self):
76
+ return super(CocoDetectionTSV, self).__len__()
77
+
78
+ def categories(self, no_background=True):
79
+ categories = self.coco.dataset["categories"]
80
+ label_list = {}
81
+ for index, i in enumerate(categories):
82
+ # assert(index + 1 == i["id"])
83
+ if not no_background or (i["name"] != "__background__" and i['id'] != 0):
84
+ label_list[i["id"]] = i["name"]
85
+ return label_list
86
+
87
+ def __getitem__(self, idx):
88
+ # tgt is a BoxList
89
+ img, target, _, scale = super(CocoDetectionTSV, self).__getitem__(idx)
90
+ image_id = self.get_img_id(idx)
91
+ restricted_negative_list = None
92
+
93
+ if not self.disable_clip_to_image:
94
+ target = target.clip_to_image(remove_empty=True)
95
+
96
+ original_box_num = len(target)
97
+
98
+ target, positive_caption_length = check_for_positive_overflow(target, self.ind_to_class, self.tokenizer, self.max_query_len-2) # leave some space for the special tokens
99
+
100
+ if len(target) < original_box_num:
101
+ print("WARNING: removed {} boxes due to positive caption overflow".format(original_box_num - len(target)))
102
+
103
+ annotations, caption, greenlight_span_for_masked_lm_objective, label_to_positions = convert_object_detection_to_grounding_optimized_for_od(
104
+ target=target,
105
+ image_id=image_id,
106
+ ind_to_class=self.ind_to_class,
107
+ disable_shuffle=self.disable_shuffle,
108
+ add_detection_prompt=self.add_detection_prompt,
109
+ add_detection_prompt_advanced=self.add_detection_prompt_advanced,
110
+ random_sample_negative=self.random_sample_negative,
111
+ control_probabilities=self.control_probabilities,
112
+ restricted_negative_list=restricted_negative_list,
113
+ separation_tokens=self.separation_tokens,
114
+ max_num_labels=self.max_num_labels,
115
+ positive_caption_length=positive_caption_length,
116
+ tokenizer=self.tokenizer,
117
+ max_seq_length=self.max_query_len-2
118
+ )
119
+
120
+ # assert(len(self.tokenizer.tokenize(caption)) <= self.max_query_len-2)
121
+
122
+ # print(caption)
123
+ anno = {"image_id": image_id, "annotations": annotations, "caption": caption, "label_to_positions": label_to_positions}
124
+ anno["greenlight_span_for_masked_lm_objective"] = greenlight_span_for_masked_lm_objective
125
+
126
+ if self.no_mask_for_od:
127
+ anno["greenlight_span_for_masked_lm_objective"].append((-1, -1, -1))
128
+
129
+ img, anno = self.prepare(img, anno, box_format="xyxy")
130
+
131
+ if self._transforms is not None:
132
+ img, target = self._transforms(img, target)
133
+
134
+ # add additional property
135
+ for ann in anno:
136
+ target.add_field(ann, anno[ann])
137
+
138
+ sanity_check_target_after_processing(target)
139
+
140
+ return img, target, idx
141
+
142
+ def get_raw_image(self, idx):
143
+ image, *_ = super(CocoDetectionTSV, self).__getitem__(idx)
144
+ return image
145
+
146
+ def get_img_id(self, idx):
147
+ line_no = self.get_line_no(idx)
148
+ if self.label_tsv is not None:
149
+ row = self.label_tsv.seek(line_no)
150
+ img_id = row[0]
151
+ try:
152
+ return int(img_id)
153
+ except:
154
+ return idx
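A small sketch of the query-length budgeting used in __getitem__ above: two token slots are assumed to be reserved for the tokenizer's special tokens (e.g. [CLS] and [SEP] in BERT-style tokenizers), so captions are checked against max_query_len - 2 rather than max_query_len.

max_query_len = 256
reserved_special_tokens = 2              # assumed: one leading and one trailing special token
caption_token_budget = max_query_len - reserved_special_tokens
print(caption_token_budget)              # 254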
maskrcnn_benchmark/data/datasets/concat_dataset.py ADDED
@@ -0,0 +1,23 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+ import bisect
3
+
4
+ from torch.utils.data.dataset import ConcatDataset as _ConcatDataset
5
+
6
+
7
+ class ConcatDataset(_ConcatDataset):
8
+ """
9
+ Same as torch.utils.data.dataset.ConcatDataset, but exposes an extra
10
+ method for querying the sizes of the image
11
+ """
12
+
13
+ def get_idxs(self, idx):
14
+ dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx)
15
+ if dataset_idx == 0:
16
+ sample_idx = idx
17
+ else:
18
+ sample_idx = idx - self.cumulative_sizes[dataset_idx - 1]
19
+ return dataset_idx, sample_idx
20
+
21
+ def get_img_info(self, idx):
22
+ dataset_idx, sample_idx = self.get_idxs(idx)
23
+ return self.datasets[dataset_idx].get_img_info(sample_idx)
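A standalone sketch (toy sizes) of the index arithmetic in ConcatDataset.get_idxs: a global index is mapped to a (dataset_idx, sample_idx) pair via the cumulative dataset sizes.

import bisect

cumulative_sizes = [100, 250, 400]       # three datasets of sizes 100, 150 and 150
idx = 260
dataset_idx = bisect.bisect_right(cumulative_sizes, idx)
sample_idx = idx if dataset_idx == 0 else idx - cumulative_sizes[dataset_idx - 1]
print(dataset_idx, sample_idx)           # 2 10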
maskrcnn_benchmark/data/datasets/custom_distributed_sampler.py ADDED
@@ -0,0 +1,185 @@
1
+ import math
2
+ from typing import TypeVar, Optional, Iterator
3
+
4
+ import torch
5
+ from torch.utils.data import Sampler, Dataset
6
+ import torch.distributed as dist
7
+ import random
8
+ import numpy as np
9
+ import torch
10
+
11
+
12
+ class DistributedSamplerChunkByNode(torch.utils.data.Sampler):
13
+
14
+ def __init__(self,
15
+ dataset,
16
+ all_datasets,
17
+ chunk_or_not,
18
+ num_replicas: Optional[int] = None,
19
+ rank: Optional[int] = None,
20
+ shuffle: bool = True,
21
+ seed: int = 0,
22
+ drop_last: bool = False,
23
+ node_rank=0,
24
+ node_number=1, process_num_per_node=1,
25
+ rank_within_local_node=0) -> None:
26
+ if num_replicas is None:
27
+ if not dist.is_available():
28
+ raise RuntimeError("Requires distributed package to be available")
29
+ num_replicas = dist.get_world_size()
30
+ if rank is None:
31
+ if not dist.is_available():
32
+ raise RuntimeError("Requires distributed package to be available")
33
+ rank = dist.get_rank()
34
+ if rank >= num_replicas or rank < 0:
35
+ raise ValueError(
36
+ "Invalid rank {}, rank should be in the interval"
37
+ " [0, {}]".format(rank, num_replicas - 1))
38
+ self.dataset = dataset
39
+ self.num_replicas = num_replicas
40
+ self.rank = rank
41
+ self.epoch = 0
42
+ self.node_number = node_number
43
+ self.node_rank = node_rank
44
+ self.chunk_or_not = chunk_or_not
45
+ self.process_num_per_node = process_num_per_node
46
+ self.rank_within_local_node = rank_within_local_node
47
+
48
+ assert (self.process_num_per_node * self.node_number == self.num_replicas)
49
+
50
+ # 1. divide the datasets into two parts
51
+ normal_datasets = []
52
+ chunked_datasets = []
53
+ for dataset_i, chunk_i in zip(all_datasets, chunk_or_not):
54
+ if chunk_i:
55
+ chunked_datasets.append(dataset_i)
56
+ else:
57
+ normal_datasets.append(dataset_i)
58
+
59
+ # 2. calculate dataset sizes:
60
+ self.normal_dataset_size = sum(
61
+ [len(i) for i in normal_datasets])  # for this part we follow the conventional distributed sampler
62
+
63
+ # 3. Divide
64
+ self.current_node_start_range = -1
65
+ self.current_node_end_range = -1
66
+ assert (len(chunked_datasets) >= self.node_number)
67
+ chunk_size = len(chunked_datasets) // self.node_number
68
+ current_example_num = self.normal_dataset_size
69
+
70
+ for index in range(len(chunked_datasets)):
71
+ if index == self.node_rank * chunk_size:
72
+ self.current_node_start_range = current_example_num
73
+ current_example_num += len(chunked_datasets[index])
74
+ if index == (self.node_rank + 1) * chunk_size - 1:
75
+ self.current_node_end_range = current_example_num
76
+
77
+ if self.current_node_end_range == -1: # boundary
78
+ self.current_node_end_range = current_example_num
79
+
80
+ self.drop_last = drop_last
81
+ # If the dataset length is evenly divisible by # of replicas, then there
82
+ # is no need to drop any data, since the dataset will be split equally.
83
+ if self.drop_last and len(self.dataset) % self.num_replicas != 0: # type: ignore[arg-type]
84
+ # Split to nearest available length that is evenly divisible.
85
+ # This is to ensure each rank receives the same amount of data when
86
+ # using this Sampler.
87
+ self.num_samples = math.ceil(
88
+ # `type:ignore` is required because Dataset cannot provide a default __len__
89
+ # see NOTE in pytorch/torch/utils/data/sampler.py
90
+ (len(self.dataset) - self.num_replicas) / self.num_replicas # type: ignore[arg-type]
91
+ )
92
+ else:
93
+ self.num_samples = math.ceil(len(self.dataset) / self.num_replicas) # type: ignore[arg-type]
94
+ self.total_size = self.num_samples * self.num_replicas
95
+ self.shuffle = shuffle
96
+ self.seed = seed
97
+
98
+ def __iter__(self):
99
+ indices = self.generate_indices_within_range_with_rank(
100
+ seed=self.seed,
101
+ epoch=self.epoch,
102
+
103
+ # NOTE: Distribute among all processes
104
+ process_num=self.num_replicas,
105
+ rank=self.rank,
106
+ generate_length=-1,
107
+ valid_indices=list(range(self.normal_dataset_size)),
108
+ prefix="Normal "
109
+ )
110
+
111
+ addition_indices = self.generate_indices_within_range_with_rank(
112
+ seed=self.seed,
113
+ epoch=self.epoch,
114
+
115
+ # NOTE : very important arguments, distribute among local nodes
116
+ process_num=self.process_num_per_node,
117
+ rank=self.rank_within_local_node,
118
+
119
+ generate_length=self.num_samples - len(indices),
120
+ valid_indices=list(range(self.current_node_start_range, self.current_node_end_range)),
121
+ prefix="Distribute "
122
+ )
123
+
124
+ indices.extend(addition_indices)
125
+ random.seed(self.seed + self.epoch + 10 * self.rank) # Set the seed to maximize randomness
126
+ random.shuffle(indices) # Reshuffle
127
+ assert len(indices) == self.num_samples
128
+ return iter(indices)
129
+
130
+ def generate_indices_within_range_with_rank(self, seed, epoch, process_num, generate_length, valid_indices, rank=-1,
131
+ shuffle=True, prefix=""):
132
+ '''
133
+ Use scenario : we want to sample 2500 examples from 10000 examples, while not sampling overlapping examples with other three process.
134
+ Modified from DistributedSampler
135
+ '''
136
+ dataset_size = len(valid_indices)
137
+ if shuffle:
138
+ # deterministically shuffle based on epoch and seed
139
+ g = torch.Generator()
140
+ g.manual_seed(seed + epoch)
141
+ indices = torch.randperm(dataset_size, generator=g).tolist() # type: ignore[arg-type]
142
+ else:
143
+ indices = list(range(dataset_size)) # type: ignore[arg-type]
144
+
145
+ indices = [valid_indices[i] for i in indices]
146
+
147
+ num_samples_normal = math.ceil(
148
+ (dataset_size - process_num) / process_num # type: ignore[arg-type]
149
+ )
150
+ # remove tail of data to make it evenly divisible.
151
+ indices = indices[:num_samples_normal * process_num]
152
+
153
+ print("\n")
154
+ print(prefix,
155
+ "Global Rank {} Local Rank {} generate_length {} valid_indices {} process_num {} indices_before_subsample {} {}".format(
156
+ self.rank, rank, generate_length, len(valid_indices), process_num, len(indices), indices[:10]))
157
+
158
+ # subsample
159
+ indices = indices[rank:num_samples_normal * process_num: process_num]
160
+
161
+ print(prefix,
162
+ "Global Rank {} Local Rank {} generate_length {} valid_indices {} process_num {} indices_after_subsample {} {}".format(
163
+ self.rank, rank, generate_length, len(valid_indices), process_num, len(indices), indices[:10]))
164
+ print("\n")
165
+
166
+ if generate_length != -1:
167
+ if len(indices) > generate_length:
168
+ indices = indices[:generate_length]
169
+ else:
170
+ indices.extend(np.random.choice(valid_indices, generate_length - len(indices)).tolist())
171
+ return indices
172
+
173
+ def __len__(self) -> int:
174
+ return self.num_samples
175
+
176
+ def set_epoch(self, epoch: int) -> None:
177
+ r"""
178
+ Sets the epoch for this sampler. When :attr:`shuffle=True`, this ensures all replicas
179
+ use a different random ordering for each epoch. Otherwise, the next iteration of this
180
+ sampler will yield the same ordering.
181
+
182
+ Args:
183
+ epoch (int): Epoch number.
184
+ """
185
+ self.epoch = epoch
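A standalone sketch (toy indices) of the strided subsampling inside generate_indices_within_range_with_rank: after truncating the shuffled indices to a multiple of process_num, each rank takes every process_num-th index, so no two ranks sample the same example.

import math

process_num = 4
indices = list(range(10))                                              # toy, already-shuffled indices
num_samples = math.ceil((len(indices) - process_num) / process_num)   # 2, mirroring the sampler above
indices = indices[:num_samples * process_num]                          # truncate to 8 indices
per_rank = [indices[rank:num_samples * process_num:process_num] for rank in range(process_num)]
print(per_rank)                                                        # [[0, 4], [1, 5], [2, 6], [3, 7]]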
maskrcnn_benchmark/data/datasets/duplicate_dataset.py ADDED
@@ -0,0 +1,31 @@
1
+ import math
2
+ from typing import TypeVar, Optional, Iterator
3
+
4
+ import torch
5
+ from torch.utils.data import Sampler, Dataset
6
+ import torch.distributed as dist
7
+ import random
8
+ import numpy as np
9
+
10
+
11
+ def create_duplicate_dataset(DatasetBaseClass):
12
+ class DupDataset(DatasetBaseClass):
13
+
14
+ def __init__(self, copy, **kwargs):
15
+ super(DupDataset, self).__init__(**kwargs)
16
+
17
+ self.copy = copy
18
+ self.length = super(DupDataset, self).__len__()
19
+
20
+ def __len__(self):
21
+ return self.copy * self.length
22
+
23
+ def __getitem__(self, index):
24
+ true_index = index % self.length
25
+ return super(DupDataset, self).__getitem__(true_index)
26
+
27
+ def get_img_info(self, index):
28
+ true_index = index % self.length
29
+ return super(DupDataset, self).get_img_info(true_index)
30
+
31
+ return DupDataset
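A usage sketch of create_duplicate_dataset, assuming it is imported from the module above and using a toy in-memory dataset in place of a real maskrcnn_benchmark dataset: the wrapper reports copy * len(base) items and maps each index back with a modulo.

class ToyDataset:
    def __init__(self, items):
        self.items = items
    def __len__(self):
        return len(self.items)
    def __getitem__(self, index):
        return self.items[index]
    def get_img_info(self, index):
        return {"id": index}

DupToy = create_duplicate_dataset(ToyDataset)
ds = DupToy(copy=3, items=["a", "b"])
print(len(ds), ds[4])   # 6 a   (index 4 -> 4 % 2 = 0)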
maskrcnn_benchmark/data/datasets/evaluation/__init__.py ADDED
@@ -0,0 +1,56 @@
1
+ from maskrcnn_benchmark.data import datasets
2
+
3
+ from .coco import coco_evaluation
4
+ from .voc import voc_evaluation
5
+ from .vg import vg_evaluation
6
+ from .box_aug import im_detect_bbox_aug
7
+ from .od_to_grounding import od_to_grounding_evaluation
8
+
9
+
10
+ def evaluate(dataset, predictions, output_folder, **kwargs):
11
+ """evaluate dataset using different methods based on dataset type.
12
+ Args:
13
+ dataset: Dataset object
14
+ predictions(list[BoxList]): each item in the list represents the
15
+ prediction results for one image.
16
+ output_folder: output folder, to save evaluation files or results.
17
+ **kwargs: other args.
18
+ Returns:
19
+ evaluation result
20
+ """
21
+ args = dict(
22
+ dataset=dataset, predictions=predictions, output_folder=output_folder, **kwargs
23
+ )
24
+ if isinstance(dataset, datasets.COCODataset) or isinstance(dataset, datasets.TSVDataset):
25
+ return coco_evaluation(**args)
26
+ # elif isinstance(dataset, datasets.VGTSVDataset):
27
+ # return vg_evaluation(**args)
28
+ elif isinstance(dataset, datasets.PascalVOCDataset):
29
+ return voc_evaluation(**args)
30
+ elif isinstance(dataset, datasets.CocoDetectionTSV):
31
+ return od_to_grounding_evaluation(**args)
32
+ elif isinstance(dataset, datasets.LvisDetection):
33
+ pass
34
+ else:
35
+ dataset_name = dataset.__class__.__name__
36
+ raise NotImplementedError("Unsupported dataset type {}.".format(dataset_name))
37
+
38
+
39
+ def evaluate_mdetr(dataset, predictions, output_folder, cfg):
40
+
41
+ args = dict(
42
+ dataset=dataset, predictions=predictions, output_folder=output_folder  # note: unlike evaluate(), this signature has no **kwargs
43
+ )
44
+ if isinstance(dataset, datasets.COCODataset) or isinstance(dataset, datasets.TSVDataset):
45
+ return coco_evaluation(**args)
46
+ # elif isinstance(dataset, datasets.VGTSVDataset):
47
+ # return vg_evaluation(**args)
48
+ elif isinstance(dataset, datasets.PascalVOCDataset):
49
+ return voc_evaluation(**args)
50
+ elif isinstance(dataset, datasets.CocoDetectionTSV):
51
+ return od_to_grounding_evaluation(**args)
52
+ elif isinstance(dataset, datasets.LvisDetection):
53
+ pass
54
+ else:
55
+ dataset_name = dataset.__class__.__name__
56
+ raise NotImplementedError("Unsupported dataset type {}.".format(dataset_name))
maskrcnn_benchmark/data/datasets/evaluation/box_aug.py ADDED
@@ -0,0 +1,349 @@
1
+ import torch
2
+ import numpy as np
3
+
4
+ from maskrcnn_benchmark.config import cfg
5
+ from maskrcnn_benchmark.data import transforms as T
6
+ from maskrcnn_benchmark.structures.image_list import to_image_list
7
+ from maskrcnn_benchmark.structures.bounding_box import BoxList
8
+ from maskrcnn_benchmark.structures.boxlist_ops import cat_boxlist
9
+ from maskrcnn_benchmark.layers import nms, soft_nms
10
+
11
+
12
+ def im_detect_bbox_aug(model, images, device, captions=None, positive_map_label_to_token=None):
13
+ # Collect detections computed under different transformations
14
+ boxlists_ts = []
15
+ for _ in range(len(images)):
16
+ boxlists_ts.append([])
17
+
18
+ def add_preds_t(boxlists_t):
19
+ for i, boxlist_t in enumerate(boxlists_t):
20
+ # Resize the boxlist to match the first one
21
+ boxlists_ts[i].append(boxlist_t.resize(images[i].size))
22
+
23
+ # Compute detections at different scales
24
+ if len(cfg.TEST.RANGES)==len(cfg.TEST.SCALES):
25
+ keep_ranges = cfg.TEST.RANGES
26
+ else:
27
+ keep_ranges = [None for _ in cfg.TEST.SCALES]
28
+
29
+ for scale, keep_range in zip(cfg.TEST.SCALES, keep_ranges):
30
+ max_size = cfg.TEST.MAX_SIZE
31
+ boxlists_scl = im_detect_bbox_scale(
32
+ model, images, scale, max_size, device,
33
+ captions=captions,
34
+ positive_map_label_to_token=positive_map_label_to_token,
35
+ )
36
+ if keep_range is not None:
37
+ boxlists_scl = remove_boxes(boxlists_scl, *keep_range)
38
+ add_preds_t(boxlists_scl)
39
+
40
+ if cfg.TEST.FLIP:
41
+ boxlists_scl_hf = im_detect_bbox_scale(
42
+ model, images, scale, max_size, device,
43
+ captions=captions,
44
+ positive_map_label_to_token=positive_map_label_to_token,
45
+ hflip=True
46
+ )
47
+ if keep_range is not None:
48
+ boxlists_scl_hf = remove_boxes(boxlists_scl_hf, *keep_range)
49
+ add_preds_t(boxlists_scl_hf)
50
+
51
+ # Merge boxlists detected by different bbox aug params
52
+ boxlists = []
53
+ for i, boxlist_ts in enumerate(boxlists_ts):
54
+ bbox = torch.cat([boxlist_t.bbox for boxlist_t in boxlist_ts])
55
+ scores = torch.cat([boxlist_t.get_field('scores') for boxlist_t in boxlist_ts])
56
+ labels = torch.cat([boxlist_t.get_field('labels') for boxlist_t in boxlist_ts])
57
+ boxlist = BoxList(bbox, boxlist_ts[0].size, boxlist_ts[0].mode)
58
+ boxlist.add_field('scores', scores)
59
+ boxlist.add_field('labels', labels)
60
+ boxlists.append(boxlist)
61
+ results = merge_result_from_multi_scales(boxlists)
62
+ return results
63
+
64
+
65
+ def im_detect_bbox(model, images, target_scale, target_max_size, device,
66
+ captions=None,
67
+ positive_map_label_to_token=None
68
+ ):
69
+ """
70
+ Performs bbox detection on the original image.
71
+ """
72
+ if cfg.INPUT.FORMAT != '':
73
+ input_format = cfg.INPUT.FORMAT
74
+ elif cfg.INPUT.TO_BGR255:
75
+ input_format = 'bgr255'
76
+ transform = T.Compose([
77
+ T.Resize(target_scale, target_max_size),
78
+ T.ToTensor(),
79
+ T.Normalize(
80
+ mean=cfg.INPUT.PIXEL_MEAN, std=cfg.INPUT.PIXEL_STD, format=input_format
81
+ )
82
+ ])
83
+ images = [transform(image) for image in images]
84
+ images = to_image_list(images, cfg.DATALOADER.SIZE_DIVISIBILITY)
85
+ if captions is None:
86
+ return model(images.to(device))
87
+ else:
88
+ return model(images.to(device),
89
+ captions=captions,
90
+ positive_map=positive_map_label_to_token
91
+ )
92
+
93
+
94
+ def im_detect_bbox_hflip(model, images, target_scale, target_max_size, device,
95
+ captions=None,
96
+ positive_map_label_to_token=None
97
+ ):
98
+ """
99
+ Performs bbox detection on the horizontally flipped image.
100
+ Function signature is the same as for im_detect_bbox.
101
+ """
102
+ if cfg.INPUT.FORMAT != '':
103
+ input_format = cfg.INPUT.FORMAT
104
+ elif cfg.INPUT.TO_BGR255:
105
+ input_format = 'bgr255'
106
+ transform = T.Compose([
107
+ T.Resize(target_scale, target_max_size),
108
+ T.RandomHorizontalFlip(1.0),
109
+ T.ToTensor(),
110
+ T.Normalize(
111
+ mean=cfg.INPUT.PIXEL_MEAN, std=cfg.INPUT.PIXEL_STD, format=input_format
112
+ )
113
+ ])
114
+ images = [transform(image) for image in images]
115
+ images = to_image_list(images, cfg.DATALOADER.SIZE_DIVISIBILITY)
116
+ if captions is None:
117
+ boxlists = model(images.to(device))
118
+ else:
119
+ boxlists = model(images.to(device),
120
+ captions=captions,
121
+ positive_map=positive_map_label_to_token
122
+ )
123
+
124
+ # Invert the detections computed on the flipped image
125
+ boxlists_inv = [boxlist.transpose(0) for boxlist in boxlists]
126
+ return boxlists_inv
127
+
128
+
129
+ def im_detect_bbox_scale(model, images, target_scale, target_max_size, device,
130
+ captions=None,
131
+ positive_map_label_to_token=None,
132
+ hflip=False):
133
+ """
134
+ Computes bbox detections at the given scale.
135
+ Returns predictions in the scaled image space.
136
+ """
137
+ if hflip:
138
+ boxlists_scl = im_detect_bbox_hflip(model, images, target_scale, target_max_size, device,
139
+ captions=captions,
140
+ positive_map_label_to_token=positive_map_label_to_token
141
+ )
142
+ else:
143
+ boxlists_scl = im_detect_bbox(model, images, target_scale, target_max_size, device,
144
+ captions=captions,
145
+ positive_map_label_to_token=positive_map_label_to_token
146
+ )
147
+ return boxlists_scl
148
+
149
+
150
+ def remove_boxes(boxlist_ts, min_scale, max_scale):
151
+ new_boxlist_ts = []
152
+ for _, boxlist_t in enumerate(boxlist_ts):
153
+ mode = boxlist_t.mode
154
+ boxlist_t = boxlist_t.convert("xyxy")
155
+ boxes = boxlist_t.bbox
156
+ keep = []
157
+ for j, box in enumerate(boxes):
158
+ w = box[2] - box[0] + 1
159
+ h = box[3] - box[1] + 1
160
+ if (w * h > min_scale * min_scale) and (w * h < max_scale * max_scale):
161
+ keep.append(j)
162
+ new_boxlist_ts.append(boxlist_t[keep].convert(mode))
163
+ return new_boxlist_ts
164
+
165
+
166
+ def merge_result_from_multi_scales(boxlists):
167
+ num_images = len(boxlists)
168
+ results = []
169
+ for i in range(num_images):
170
+ scores = boxlists[i].get_field("scores")
171
+ labels = boxlists[i].get_field("labels")
172
+ boxes = boxlists[i].bbox
173
+ boxlist = boxlists[i]
174
+ result = []
175
+ # test on classes
176
+ if len(cfg.TEST.SELECT_CLASSES):
177
+ class_list = cfg.TEST.SELECT_CLASSES
178
+ else:
179
+ class_list = range(1, cfg.TEST.NUM_CLASSES)
180
+ for j in class_list:
181
+ inds = (labels == j).nonzero().view(-1)
182
+
183
+ scores_j = scores[inds]
184
+ boxes_j = boxes[inds, :].view(-1, 4)
185
+ boxlist_for_class = BoxList(boxes_j, boxlist.size, mode="xyxy")
186
+ boxlist_for_class.add_field("scores", scores_j)
187
+ boxlist_for_class = boxlist_nms(boxlist_for_class, cfg.TEST.TH, score_field="scores", nms_type=cfg.TEST.SPECIAL_NMS)
188
+ num_labels = len(boxlist_for_class)
189
+ boxlist_for_class.add_field("labels", torch.full((num_labels,), j, dtype=torch.int64, device=scores.device))
190
+ result.append(boxlist_for_class)
191
+
192
+ result = cat_boxlist(result)
193
+ number_of_detections = len(result)
194
+
195
+ # Limit to max_per_image detections **over all classes**
196
+ if number_of_detections > cfg.TEST.PRE_NMS_TOP_N > 0:
197
+ cls_scores = result.get_field("scores")
198
+ image_thresh, _ = torch.kthvalue(
199
+ cls_scores.cpu(),
200
+ number_of_detections - cfg.TEST.PRE_NMS_TOP_N + 1
201
+ )
202
+ keep = cls_scores >= image_thresh.item()
203
+ keep = torch.nonzero(keep).squeeze(1)
204
+ result = result[keep]
205
+ results.append(result)
206
+ return results
207
+
208
+
209
+ def boxlist_nms(boxlist, thresh, max_proposals=-1, score_field="scores", nms_type='nms'):
210
+ if thresh <= 0:
211
+ return boxlist
212
+ mode = boxlist.mode
213
+ boxlist = boxlist.convert("xyxy")
214
+ boxes = boxlist.bbox
215
+ score = boxlist.get_field(score_field)
216
+
217
+ if nms_type == 'vote':
218
+ boxes_vote, scores_vote = bbox_vote(boxes, score, thresh)
219
+ if len(boxes_vote) > 0:
220
+ boxlist.bbox = boxes_vote
221
+ boxlist.extra_fields['scores'] = scores_vote
222
+ elif nms_type == 'soft-vote':
223
+ boxes_vote, scores_vote = soft_bbox_vote(boxes, score, thresh)
224
+ if len(boxes_vote) > 0:
225
+ boxlist.bbox = boxes_vote
226
+ boxlist.extra_fields['scores'] = scores_vote
227
+ elif nms_type == 'soft-nms':
228
+ keep, new_score = soft_nms(boxes.cpu(), score.cpu(), thresh, 0.95)
229
+ if max_proposals > 0:
230
+ keep = keep[: max_proposals]
231
+ boxlist = boxlist[keep]
232
+ boxlist.extra_fields['scores'] = new_score
233
+ else:
234
+ keep = nms(boxes, score, thresh)
235
+ if max_proposals > 0:
236
+ keep = keep[: max_proposals]
237
+ boxlist = boxlist[keep]
238
+ return boxlist.convert(mode)
239
+
240
+
241
+ def bbox_vote(boxes, scores, vote_thresh):
242
+ boxes = boxes.cpu().numpy()
243
+ scores = scores.cpu().numpy().reshape(-1, 1)
244
+ det = np.concatenate((boxes, scores), axis=1)
245
+ if det.shape[0] <= 1:
246
+ return np.zeros((0, 5)), np.zeros((0, 1))
247
+ order = det[:, 4].ravel().argsort()[::-1]
248
+ det = det[order, :]
249
+ dets = []
250
+ while det.shape[0] > 0:
251
+ # IOU
252
+ area = (det[:, 2] - det[:, 0] + 1) * (det[:, 3] - det[:, 1] + 1)
253
+ xx1 = np.maximum(det[0, 0], det[:, 0])
254
+ yy1 = np.maximum(det[0, 1], det[:, 1])
255
+ xx2 = np.minimum(det[0, 2], det[:, 2])
256
+ yy2 = np.minimum(det[0, 3], det[:, 3])
257
+ w = np.maximum(0.0, xx2 - xx1 + 1)
258
+ h = np.maximum(0.0, yy2 - yy1 + 1)
259
+ inter = w * h
260
+ o = inter / (area[0] + area[:] - inter)
261
+
262
+ # get needed merge det and delete these det
263
+ merge_index = np.where(o >= vote_thresh)[0]
264
+ det_accu = det[merge_index, :]
265
+ det = np.delete(det, merge_index, 0)
266
+
267
+ if merge_index.shape[0] <= 1:
268
+ try:
269
+ dets = np.row_stack((dets, det_accu))
270
+ except:
271
+ dets = det_accu
272
+ continue
273
+ else:
274
+ det_accu[:, 0:4] = det_accu[:, 0:4] * np.tile(det_accu[:, -1:], (1, 4))
275
+ max_score = np.max(det_accu[:, 4])
276
+ det_accu_sum = np.zeros((1, 5))
277
+ det_accu_sum[:, 0:4] = np.sum(det_accu[:, 0:4], axis=0) / np.sum(det_accu[:, -1:])
278
+ det_accu_sum[:, 4] = max_score
279
+ try:
280
+ dets = np.row_stack((dets, det_accu_sum))
281
+ except:
282
+ dets = det_accu_sum
283
+
284
+ boxes = torch.from_numpy(dets[:, :4]).float().cuda()
285
+ scores = torch.from_numpy(dets[:, 4]).float().cuda()
286
+
287
+ return boxes, scores
288
+
289
+
290
+ def soft_bbox_vote(boxes, scores, vote_thresh):
291
+ boxes = boxes.cpu().numpy()
292
+ scores = scores.cpu().numpy().reshape(-1, 1)
293
+ det = np.concatenate((boxes, scores), axis=1)
294
+ if det.shape[0] <= 1:
295
+ return np.zeros((0, 5)), np.zeros((0, 1))
296
+ order = det[:, 4].ravel().argsort()[::-1]
297
+ det = det[order, :]
298
+ dets = []
299
+ while det.shape[0] > 0:
300
+ # IOU
301
+ area = (det[:, 2] - det[:, 0] + 1) * (det[:, 3] - det[:, 1] + 1)
302
+ xx1 = np.maximum(det[0, 0], det[:, 0])
303
+ yy1 = np.maximum(det[0, 1], det[:, 1])
304
+ xx2 = np.minimum(det[0, 2], det[:, 2])
305
+ yy2 = np.minimum(det[0, 3], det[:, 3])
306
+ w = np.maximum(0.0, xx2 - xx1 + 1)
307
+ h = np.maximum(0.0, yy2 - yy1 + 1)
308
+ inter = w * h
309
+ o = inter / (area[0] + area[:] - inter)
310
+
311
+ # get needed merge det and delete these det
312
+ merge_index = np.where(o >= vote_thresh)[0]
313
+ det_accu = det[merge_index, :]
314
+ det_accu_iou = o[merge_index]
315
+ det = np.delete(det, merge_index, 0)
316
+
317
+ if merge_index.shape[0] <= 1:
318
+ try:
319
+ dets = np.row_stack((dets, det_accu))
320
+ except:
321
+ dets = det_accu
322
+ continue
323
+ else:
324
+ soft_det_accu = det_accu.copy()
325
+ soft_det_accu[:, 4] = soft_det_accu[:, 4] * (1 - det_accu_iou)
326
+ soft_index = np.where(soft_det_accu[:, 4] >= cfg.MODEL.RETINANET.INFERENCE_TH)[0]
327
+ soft_det_accu = soft_det_accu[soft_index, :]
328
+
329
+ det_accu[:, 0:4] = det_accu[:, 0:4] * np.tile(det_accu[:, -1:], (1, 4))
330
+ max_score = np.max(det_accu[:, 4])
331
+ det_accu_sum = np.zeros((1, 5))
332
+ det_accu_sum[:, 0:4] = np.sum(det_accu[:, 0:4], axis=0) / np.sum(det_accu[:, -1:])
333
+ det_accu_sum[:, 4] = max_score
334
+
335
+ if soft_det_accu.shape[0] > 0:
336
+ det_accu_sum = np.row_stack((det_accu_sum, soft_det_accu))
337
+
338
+ try:
339
+ dets = np.row_stack((dets, det_accu_sum))
340
+ except:
341
+ dets = det_accu_sum
342
+
343
+ order = dets[:, 4].ravel().argsort()[::-1]
344
+ dets = dets[order, :]
345
+
346
+ boxes = torch.from_numpy(dets[:, :4]).float().cuda()
347
+ scores = torch.from_numpy(dets[:, 4]).float().cuda()
348
+
349
+ return boxes, scores
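A standalone numpy sketch (toy boxes) of the score-weighted box averaging at the heart of bbox_vote and soft_bbox_vote above: boxes selected for merging are scaled by their scores, summed, and renormalized, while the merged score is the maximum of the group.

import numpy as np

det_accu = np.array([[10.0, 10.0, 50.0, 50.0, 0.9],
                     [12.0, 11.0, 52.0, 49.0, 0.6]])    # x1, y1, x2, y2, score
weighted = det_accu[:, 0:4] * det_accu[:, -1:]          # scale coordinates by score
merged_box = weighted.sum(axis=0) / det_accu[:, -1:].sum()
merged = np.concatenate([merged_box, [det_accu[:, 4].max()]])
print(merged)                                           # merged box followed by the max score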